line     4  col     1  Print    
 
line     4  col     6  Lparen   
 
line     4  col     7  String   
"Hello, World!\n"
line     4  col    24  Rparen   
 
line     4  col    25  Semi     
 
line     5  col     1  EOI      
 


Lexical analysis is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an identified "meaning"). A program that performs lexical analysis may be called a lexer, tokenizer, or scanner (though "scanner" is also used to refer to the first stage of a lexer).

The Task

Create a lexical analyzer for the Tiny programming language. The program should read input from a file and/or stdin, and write output to a file and/or stdout.

Specification

The various token types are denoted below.

Operators
Characters Common name Name
* multiply Mul
/ divide Div
+ plus Add
- minus and unary minus Sub and Uminus
< less than Lss
<= less than or equal Leq
> greater than Gtr
!= not equal Neq
= assign Assign
&& and And
Symbols
Characters Common name Name
( left parenthesis Lparen
) right parenthesis Rparen
{ left brace Lbrace
} right brace Rbrace
; semi colon Semi
, comma Comma
Keywords
Characters Name
if If
while While
print Print
putc Putc
Other entities
Characters Regular expression Name
integers [0-9]+ Integer
char literal 'x' Integer
identifiers [_a-zA-Z][_a-zA-Z0-9]+ Ident
string literal ".*" String

Notes: For char literals, '\n' is supported as a new line character. To represent \, use: '\\'. \n may also be used in Strings, to print a newline. No other special sequences are supported.

Comments /* ... */ (multi-line)

Complete list of token names

EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add, Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident

Program output

Output of the program should be:

  • the word line, followed by:
  • the line number where the token starts, followed by:
  • the abbreviation col, followed by:
  • the column number where the token starts, followed by:
  • the token name.
  • If the token name is one of Integer, Ident or String, the actual value of the same should follow.
Test Cases

<lang c> /*

 Hello world
*/

print("Hello, World!\n"); </lang>

Output
line 4 col 1 Print  
line 4 col 6 Lparen  
line 4 col 7 String "Hello, World!\n"
line 4 col 24 Rparen  
line 4 col 25 Semi  
line 5 col 1 EOI  

<lang c> /*

 Show Ident and Integers
*/

phoenix_number = 142857; print(phoenix_number, "\n"); </lang>

Output
line 4 col 1 Ident phoenix_number
line 4 col 16 Assign  
line 4 col 18 Integer 142857
line 4 col 24 Semi  
line 5 col 1 Print  
line 5 col 6 Lparen  
line 5 col 7 Ident phoenix_number
line 5 col 21 Comma  
line 5 col 23 String "\n"
line 5 col 27 Rparen  
line 5 col 28 Semi  
line 6 col 1 EOI  
Diagnostics

The following error conditions should be caught:

  • Empty character constant. Example: ''
  • Unknown escape sequence. Example: '\r'
  • Multi-character constant. Example: 'xx'
  • End-of-file in comment. Closing comment characters not found.
  • End-of-file while scanning string literal. Closing string character not found.
  • End-of-line while scanning string literal. Closing string character not found before end-of-line.
  • Unrecognized character. Example: |
Reference

The C and Python versions can be considered reference implementations.

Implementations

C

<lang C>

  1. include <stdlib.h>
  2. include <stdio.h>
  3. include <stdarg.h>
  4. include <ctype.h>
  5. include <string.h>
  6. include <errno.h>
  7. include <stdbool.h>
  8. include <limits.h>
  1. define NELEMS(arr) (sizeof(arr) / sizeof(arr[0]))
  1. define da_dim(name, type) type *name = NULL; \
                           int _qy_ ## name ## _p = 0;  \
                           int _qy_ ## name ## _max = 0
  1. define da_rewind(name) _qy_ ## name ## _p = 0
  2. define da_redim(name) if (_qy_ ## name ## _p >= _qy_ ## name ## _max) \
                               name = realloc(name, (_qy_ ## name ## _max += 32) * sizeof(name[0]))
  1. define da_append(name, x) do {da_redim(name); name[_qy_ ## name ## _p++] = x;} while (0)
  2. define da_len(name) _qy_ ## name ## _p

// dependancy: atr table in parse.c ordering is based on these typedef enum {

   EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add,
   Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident

} TokenType;

typedef struct {

   int tok;
   int err_ln, err_col;
   union {
       int n;                  /* value for constants */
       char *text;             /* text for idents */
   };

} tok_s;

static FILE *source_fp, *dest_fp; static int line, col, the_ch; da_dim(text, char);

tok_s gettok();

static void error(int err_line, int err_col, const char *fmt, ... ) {

   char buf[1000];
   va_list ap;
   va_start(ap, fmt);
   vsprintf(buf, fmt, ap);
   va_end(ap);
   printf("(%d,%d) error: %s\n", err_line, err_col, buf);
   exit(1);

}

static void read_ch() { /* get next char from input */

   the_ch = getc(source_fp);
   ++col;
   if (the_ch == '\n') {
       ++line;
       col = 0;
   }

}

static tok_s char_lit(int n, int err_line, int err_col) { /* 'x' */

   if (the_ch == '\)
       error(err_line, err_col, "gettok: empty character constant");
   if (the_ch == '\\') {
       read_ch();
       if (the_ch == 'n')
           n = 10;
       else if (the_ch == '\\')
           n = '\\';
       else error(err_line, err_col, "gettok: unknown escape sequence \\%c", the_ch);
   }
   read_ch();
   if (the_ch != '\) error(err_line, err_col, "multi-character constant");
   read_ch();
   return (tok_s){Integerk, err_line, err_col, {n}};

}

static tok_s div_or_cmt(int err_line, int err_col) { /* process divide or comments */

   if (the_ch != '*')
       return (tok_s){Div, err_line, err_col, {0}};
   /* comment found */
   for (;;) {
       read_ch();
       if (the_ch == '*' || the_ch == EOF) {
           read_ch();
           if (the_ch == '/' || the_ch == EOF) {
               read_ch();
               return gettok();
           }
       }
   }

}

static tok_s string_lit(int start, int err_line, int err_col) { /* "st" */

   da_rewind(text);
   for (read_ch(); the_ch != start; read_ch()) {
       if (the_ch == '\n')
           error(err_line, err_col, "EOL in string");
       if (the_ch == EOF)
           error(err_line, err_col, "EOF in string");
       da_append(text, (char)the_ch);
   }
   da_append(text, '\0');
   read_ch();
   return (tok_s){Stringk, err_line, err_col, {.text=text}};

}

static int kwd_cmp(const void *p1, const void *p2) {

   return strcmp(*(char **)p1, *(char **)p2);

}

static TokenType get_ident_type(const char *ident) {

   static struct {
       char *s;
       TokenType sym;
   } kwds[] = {
       {"if",    If},
       {"print", Print},
       {"putc",  Putc},
       {"while", While},
   }, *kwp;
   return (kwp = bsearch(&ident, kwds, NELEMS(kwds), sizeof(kwds[0]), kwd_cmp)) == NULL ? Ident : kwp->sym;

}

static tok_s ident_or_int(int err_line, int err_col) {

   int n, is_number = true;
   da_rewind(text);
   while (isalnum(the_ch) || the_ch == '_') {
       da_append(text, (char)the_ch);
       if (!isdigit(the_ch))
           is_number = false;
       read_ch();
   }
   if (da_len(text) == 0)
       error(err_line, err_col, "gettok: unrecognized character (%d) '%c'\n", the_ch, the_ch);
   da_append(text, '\0');
   if (isdigit(text[0])) {
       if (!is_number)
           error(err_line, err_col, "invalid number: %s\n", text);
       n = strtol(text, NULL, 0);
       if (n == LONG_MAX && errno == ERANGE)
           error(err_line, err_col, "Number exceeds maximum value");
       return (tok_s){Integerk, err_line, err_col, {n}};
   }
   return (tok_s){get_ident_type(text), err_line, err_col, {.text=text}};

}

static tok_s follow(int expect, TokenType ifyes, TokenType ifno, int err_line, int err_col) { /* look ahead for '>=', etc. */

   if (the_ch == expect) {
       read_ch();
       return (tok_s){ifyes, err_line, err_col, {0}};
   }
   if (ifno == EOI) error(err_line, err_col, "follow: unrecognized character '%c' (%d)\n", the_ch, the_ch);
   return (tok_s){ifno, err_line, err_col, {0}};

}

tok_s gettok() { /* return the token type */

   /* skip white space */
   while (isspace(the_ch))
       read_ch();
   int err_line = line;
   int err_col  = col;
   switch (the_ch) {
       case '{':  read_ch(); return (tok_s){Lbrace, err_line, err_col, {0}};
       case '}':  read_ch(); return (tok_s){Rbrace, err_line, err_col, {0}};
       case '(':  read_ch(); return (tok_s){Lparen, err_line, err_col, {0}};
       case ')':  read_ch(); return (tok_s){Rparen, err_line, err_col, {0}};
       case '+':  read_ch(); return (tok_s){Add,    err_line, err_col, {0}};
       case '-':  read_ch(); return (tok_s){Sub,    err_line, err_col, {0}};
       case '*':  read_ch(); return (tok_s){Mul,    err_line, err_col, {0}};
       case ';':  read_ch(); return (tok_s){Semi,   err_line, err_col, {0}};
       case ',':  read_ch(); return (tok_s){Comma,  err_line, err_col, {0}};
       case '>':  read_ch(); return (tok_s){Gtr,    err_line, err_col, {0}};
       case '=':  read_ch(); return (tok_s){Assign, err_line, err_col, {0}};
       case '/':  read_ch(); return div_or_cmt(err_line, err_col);
       case '\: read_ch(); return char_lit(the_ch, err_line, err_col);
       case '<':  read_ch(); return follow('=', Leq, Lss, err_line, err_col);
       case '!':  read_ch(); return follow('=', Neq, EOI, err_line, err_col);
       case '&':  read_ch(); return follow('&', And, EOI, err_line, err_col);
       case '"' : return string_lit(the_ch, err_line, err_col);
       default:   return ident_or_int(err_line, err_col);
       case EOF:  return (tok_s){EOI, err_line, err_col, {0}};
   }

}

void init_lex() { /* initialize the scanner */

   line = 1;
   read_ch();

}

void run() { /* tokenize the given input */

   tok_s tok;
   do {
       tok = gettok();
       fprintf(dest_fp, "line %5d  col %5d %.8s",
           tok.err_ln, tok.err_col,
           &"EOI      Print    Putc     If       While    Lbrace   Rbrace   Lparen   Rparen   "
            "Uminus   Mul      Div      Add      Sub      Lss      Gtr      Leq      Neq      "
            "And      Semi     Comma    Assign   Integer  String   Ident    "[tok.tok * 9]);
       if (tok.tok == Integerk)
           fprintf(dest_fp, "  %8d", tok.n);
       else if (tok.tok == Ident)
           fprintf(dest_fp, " %s", tok.text);
       else if (tok.tok == Stringk)
           fprintf(dest_fp, " \"%s\"", tok.text);
       fprintf(dest_fp, "\n");
   } while (tok.tok != EOI);
   if (dest_fp != stdout)
       fclose(dest_fp);

}

void init_io(FILE **fp, FILE *std, const char mode[], const char fn[]) {

   if (fn[0] == '\0')
       *fp = std;
   else if ((*fp = fopen(fn, mode)) == NULL)
       error(0, 0, "Can't open %s\n", fn);

}

int main(int argc, char *argv[]) {

   init_io(&source_fp, stdin,  "r",  argc > 1 ? argv[1] : "");
   init_io(&dest_fp,   stdout, "wb", argc > 2 ? argv[2] : "");
   init_lex();
   run();

} </lang>

Python

<lang Python> import sys

  1. following two must remain in the same order

EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add, \ Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident = range(25)

all_syms = [ 'EOI', 'Print', 'Putc', 'If', 'While', 'Lbrace', 'Rbrace', 'Lparen',

   'Rparen', 'Uminus', 'Mul', 'Div', 'Add', 'Sub', 'Lss', 'Gtr', 'Leq', 'Neq', 'And',
   'Semi', 'Comma', 'Assign', 'Integer', 'String', 'Ident' ]
  1. single character only symbols

symbols = { '{': Lbrace, '}': Rbrace, '(': Lparen, ')': Rparen, '+': Add, '-': Sub,

   '*': Mul, ';': Semi, ',': Comma, '>': Gtr, '=': Assign }

key_words = { 'if': If, 'print': Print, 'putc': Putc, 'while': While }

the_ch = " " # dummy first char - but it must be a space the_col = 0 the_line = 1 input_file = None

        • show error and exit

def error(line, col, msg):

   print(line, col, msg)
   exit(1)
        • get the next character from the input

def getc():

   global the_ch, the_col, the_line
   the_ch = input_file.read(1)
   the_col += 1
   if the_ch == '\n':
       the_line += 1
       the_col = 0
   return the_ch
        • 'x' - character constants

def char_lit(err_line, err_col):

   n = ord(getc())              # skip opening quote
   if the_ch == '\:
       error(err_line, err_col, "empty character constant")
   elif the_ch == '\\':
       getc()
       if the_ch == 'n':
           n = 10
       elif the_ch == '\\':
           n = '\\'
       else:
           error(err_line, err_col, "unknown escape sequence \\%c" % (the_ch))
   if getc() != '\:
       error(err_line, err_col, "multi-character constant")
   getc()
   return Integerk, err_line, err_col, n
        • process divide or comments

def div_or_cmt(err_line, err_col):

   if getc() != '*':
       return Div, err_line, err_col
   # comment found
   while True:
       if getc() == '*' and getc() == '/':
           getc()
           return gettok()
       elif len(the_ch) == 0:
           error(err_line, err_col, "EOF in comment")
        • "string"

def string_lit(start, err_line, err_col):

   text = ""
   while getc() != start:
       if len(the_ch) == 0:
           error(err_line, err_col, "EOF while scanning string literal")
       if the_ch == '\n':
           error(err_line, err_col, "EOL while scanning string literal")
       text += the_ch
   getc()
   return Stringk, err_line, err_col, text
        • handle identifiers and integers

def ident_or_int(err_line, err_col):

   is_number = True
   text = ""
   while the_ch.isalnum() or the_ch == '_':
       text += the_ch
       if not the_ch.isdigit():
           is_number = False
       getc()
   if len(text) == 0:
       error(err_line, err_col, "ident_or_int: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))
   if text[0].isdigit():
       if not is_number:
           error(err_line, err_col, "invalid number: %s" % (text))
       n = int(text)
       return Integerk, err_line, err_col, n
   if text in key_words:
       return key_words[text], err_line, err_col
   return Ident, err_line, err_col, text
        • look ahead for '>=', etc.

def follow(expect, ifyes, ifno, err_line, err_col):

   if getc() == expect:
       getc()
       return ifyes, err_line, err_col
   if ifno == EOI:
       error(err_line, err_col, "follow: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))
   return ifno, err_line, err_col
        • return the next token type

def gettok():

   while the_ch.isspace():
       getc()
   err_line = the_line
   err_col  = the_col
   if len(the_ch) == 0:    return EOI, err_line, err_col
   elif the_ch in symbols: sym = symbols[the_ch]; getc(); return sym, err_line, err_col
   elif the_ch == '/':     return div_or_cmt(err_line, err_col)
   elif the_ch == '\:    return char_lit(err_line, err_col)
   elif the_ch == '<':     return follow('=', Leq, Lss, err_line, err_col)
   elif the_ch == '!':     return follow('=', Neq, EOI, err_line, err_col)
   elif the_ch == '&':     return follow('&', And, EOI, err_line, err_col)
   elif the_ch == '"':     return string_lit(the_ch, err_line, err_col)
   else:                   return ident_or_int(err_line, err_col)
        • main driver

input_file = sys.stdin if len(sys.argv) > 1:

   try:
       input_file = open(sys.argv[1], "r", 4096)
   except IOError as e:
       error(0, 0, "Can't open %s" % sys.argv[1])

while True:

   t = gettok()
   tok  = t[0]
   line = t[1]
   col  = t[2]
   if tok == Integerk:
       print("line %5d  col %5d %-8s  %8d" % (line, col, all_syms[tok], t[3]))
   elif tok == Ident:
       print("line %5d  col %5d %-8s %s"   % (line, col, all_syms[tok], t[3]))
   elif tok == Stringk:
       print('line %5d  col %5d %-8s "%s"' % (line, col, all_syms[tok], t[3]))
   else:
       print("line %5d  col %5d %-8s"      % (line, col, all_syms[tok]))
   if tok == EOI:
       break

</lang>