Ed Davis

line 4 col 1 Print	line 4 col 6 Lparen	line 4 col 7 String "Hello, World!\n"	line 4 col 24 Rparen	line 4 col 25 Semi	line 5 col 1 EOI

Lexical analysis is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an identified "meaning"). A program that performs lexical analysis may be called a lexer, tokenizer, or scanner (though "scanner" is also used to refer to the first stage of a lexer).

The Task

Create a lexical analyzer for the Tiny programming language. The program should read input from a file and/or stdin, and write output to a file and/or stdout.

Specification

The various token types are denoted below.

Operators

Characters	Common name	Name
*	multiply	Mul
/	divide	Div
+	plus	Add
-	minus and unary minus	Sub and Uminus
<	less than	Lss
<=	less than or equal	Leq
>	greater than	Gtr
!=	not equal	Neq
=	assign	Assign
&&	and	And

Symbols

Characters	Common name	Name
(	left parenthesis	Lparen
)	right parenthesis	Rparen
{	left brace	Lbrace
}	right brace	Rbrace
;	semi colon	Semi
,	comma	Comma

Keywords

Characters	Name
if	If
while	While
print	Print
putc	Putc

Other entities

Characters	Regular expression	Name
integers	[0-9]+	Integer
char literal	'x'	Integer
identifiers	[_a-zA-Z][_a-zA-Z0-9]+	Ident
string literal	".*"	String

Notes: For char literals, '\n' is supported as a new line character. To represent \, use: '\\'. \n may also be used in Strings, to print a newline. No other special sequences are supported.

Comments /* ... */ (multi-line)

Complete list of token names

EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add, Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident

Program output

Output of the program should be:

the word line, followed by:
the line number where the token starts, followed by:
the abbreviation col, followed by:
the column number where the token starts, followed by:
the token name.
If the token name is one of Integer, Ident or String, the actual value of the same should follow.

Test Cases

<lang c> /*

 Hello world
*/

print("Hello, World!\n"); </lang>

Output

line	4	col	1	Print
line	4	col	6	Lparen
line	4	col	7	String	"Hello, World!\n"
line	4	col	24	Rparen
line	4	col	25	Semi
line	5	col	1	EOI

<lang c> /*

 Show Ident and Integers
*/

phoenix_number = 142857; print(phoenix_number, "\n"); </lang>

Output

line	4	col	1	Ident	phoenix_number
line	4	col	16	Assign
line	4	col	18	Integer	142857
line	4	col	24	Semi
line	5	col	1	Print
line	5	col	6	Lparen
line	5	col	7	Ident	phoenix_number
line	5	col	21	Comma
line	5	col	23	String	"\n"
line	5	col	27	Rparen
line	5	col	28	Semi
line	6	col	1	EOI

Diagnostics

The following error conditions should be caught:

Empty character constant. Example: ''
Unknown escape sequence. Example: '\r'
Multi-character constant. Example: 'xx'
End-of-file in comment. Closing comment characters not found.
End-of-file while scanning string literal. Closing string character not found.
End-of-line while scanning string literal. Closing string character not found before end-of-line.
Unrecognized character. Example: |

Reference

The C and Python versions can be considered reference implementations.

Implementations

C

include <stdlib.h>
include <stdio.h>
include <stdarg.h>
include <ctype.h>
include <string.h>
include <errno.h>
include <stdbool.h>
include <limits.h>

define NELEMS(arr) (sizeof(arr) / sizeof(arr[0]))

define da_dim(name, type) type *name = NULL; \

                           int _qy_ ## name ## _p = 0;  \
                           int _qy_ ## name ## _max = 0

define da_rewind(name) _qy_ ## name ## _p = 0
define da_redim(name) if (_qy_ ## name ## _p >= _qy_ ## name ## _max) \

                               name = realloc(name, (_qy_ ## name ## _max += 32) * sizeof(name[0]))

define da_append(name, x) do {da_redim(name); name[_qy_ ## name ## _p++] = x;} while (0)
define da_len(name) _qy_ ## name ## _p

// dependancy: atr table in parse.c ordering is based on these typedef enum {

   EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add,
   Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident

} TokenType;

typedef struct {

   int tok;
   int err_ln, err_col;
   union {
       int n;                  /* value for constants */
       char *text;             /* text for idents */
   };

} tok_s;

static FILE *source_fp, *dest_fp; static int line, col, the_ch; da_dim(text, char);

tok_s gettok();

static void error(int err_line, int err_col, const char *fmt, ... ) {

   char buf[1000];
   va_list ap;

   va_start(ap, fmt);
   vsprintf(buf, fmt, ap);
   va_end(ap);
   printf("(%d,%d) error: %s\n", err_line, err_col, buf);
   exit(1);

}

static void read_ch() { /* get next char from input */

   the_ch = getc(source_fp);
   ++col;
   if (the_ch == '\n') {
       ++line;
       col = 0;
   }

}

static tok_s char_lit(int n, int err_line, int err_col) { /* 'x' */

   if (the_ch == '\)
       error(err_line, err_col, "gettok: empty character constant");
   if (the_ch == '\\') {
       read_ch();
       if (the_ch == 'n')
           n = 10;
       else if (the_ch == '\\')
           n = '\\';
       else error(err_line, err_col, "gettok: unknown escape sequence \\%c", the_ch);
   }
   read_ch();
   if (the_ch != '\) error(err_line, err_col, "multi-character constant");
   read_ch();
   return (tok_s){Integerk, err_line, err_col, {n}};

}

static tok_s div_or_cmt(int err_line, int err_col) { /* process divide or comments */

   if (the_ch != '*')
       return (tok_s){Div, err_line, err_col, {0}};

   /* comment found */
   for (;;) {
       read_ch();
       if (the_ch == '*' || the_ch == EOF) {
           read_ch();
           if (the_ch == '/' || the_ch == EOF) {
               read_ch();
               return gettok();
           }
       }
   }

}

static tok_s string_lit(int start, int err_line, int err_col) { /* "st" */

   da_rewind(text);

   for (read_ch(); the_ch != start; read_ch()) {
       if (the_ch == '\n')
           error(err_line, err_col, "EOL in string");
       if (the_ch == EOF)
           error(err_line, err_col, "EOF in string");
       da_append(text, (char)the_ch);
   }
   da_append(text, '\0');

   read_ch();
   return (tok_s){Stringk, err_line, err_col, {.text=text}};

}

static int kwd_cmp(const void *p1, const void *p2) {

   return strcmp(*(char **)p1, *(char **)p2);

}

static TokenType get_ident_type(const char *ident) {

   static struct {
       char *s;
       TokenType sym;
   } kwds[] = {
       {"if",    If},
       {"print", Print},
       {"putc",  Putc},
       {"while", While},
   }, *kwp;

   return (kwp = bsearch(&ident, kwds, NELEMS(kwds), sizeof(kwds[0]), kwd_cmp)) == NULL ? Ident : kwp->sym;

}

static tok_s ident_or_int(int err_line, int err_col) {

   int n, is_number = true;

   da_rewind(text);
   while (isalnum(the_ch) || the_ch == '_') {
       da_append(text, (char)the_ch);
       if (!isdigit(the_ch))
           is_number = false;
       read_ch();
   }
   if (da_len(text) == 0)
       error(err_line, err_col, "gettok: unrecognized character (%d) '%c'\n", the_ch, the_ch);
   da_append(text, '\0');
   if (isdigit(text[0])) {
       if (!is_number)
           error(err_line, err_col, "invalid number: %s\n", text);
       n = strtol(text, NULL, 0);
       if (n == LONG_MAX && errno == ERANGE)
           error(err_line, err_col, "Number exceeds maximum value");
       return (tok_s){Integerk, err_line, err_col, {n}};
   }
   return (tok_s){get_ident_type(text), err_line, err_col, {.text=text}};

}

static tok_s follow(int expect, TokenType ifyes, TokenType ifno, int err_line, int err_col) { /* look ahead for '>=', etc. */

   if (the_ch == expect) {
       read_ch();
       return (tok_s){ifyes, err_line, err_col, {0}};
   }
   if (ifno == EOI) error(err_line, err_col, "follow: unrecognized character '%c' (%d)\n", the_ch, the_ch);
   return (tok_s){ifno, err_line, err_col, {0}};

}

tok_s gettok() { /* return the token type */

   /* skip white space */
   while (isspace(the_ch))
       read_ch();
   int err_line = line;
   int err_col  = col;
   switch (the_ch) {
       case '{':  read_ch(); return (tok_s){Lbrace, err_line, err_col, {0}};
       case '}':  read_ch(); return (tok_s){Rbrace, err_line, err_col, {0}};
       case '(':  read_ch(); return (tok_s){Lparen, err_line, err_col, {0}};
       case ')':  read_ch(); return (tok_s){Rparen, err_line, err_col, {0}};
       case '+':  read_ch(); return (tok_s){Add,    err_line, err_col, {0}};
       case '-':  read_ch(); return (tok_s){Sub,    err_line, err_col, {0}};
       case '*':  read_ch(); return (tok_s){Mul,    err_line, err_col, {0}};
       case ';':  read_ch(); return (tok_s){Semi,   err_line, err_col, {0}};
       case ',':  read_ch(); return (tok_s){Comma,  err_line, err_col, {0}};
       case '>':  read_ch(); return (tok_s){Gtr,    err_line, err_col, {0}};
       case '=':  read_ch(); return (tok_s){Assign, err_line, err_col, {0}};
       case '/':  read_ch(); return div_or_cmt(err_line, err_col);
       case '\: read_ch(); return char_lit(the_ch, err_line, err_col);
       case '<':  read_ch(); return follow('=', Leq, Lss, err_line, err_col);
       case '!':  read_ch(); return follow('=', Neq, EOI, err_line, err_col);
       case '&':  read_ch(); return follow('&', And, EOI, err_line, err_col);
       case '"' : return string_lit(the_ch, err_line, err_col);
       default:   return ident_or_int(err_line, err_col);
       case EOF:  return (tok_s){EOI, err_line, err_col, {0}};
   }

}

void init_lex() { /* initialize the scanner */

   line = 1;
   read_ch();

}

void run() { /* tokenize the given input */

   tok_s tok;
   do {
       tok = gettok();
       fprintf(dest_fp, "line %5d  col %5d %.8s",
           tok.err_ln, tok.err_col,
           &"EOI      Print    Putc     If       While    Lbrace   Rbrace   Lparen   Rparen   "
            "Uminus   Mul      Div      Add      Sub      Lss      Gtr      Leq      Neq      "
            "And      Semi     Comma    Assign   Integer  String   Ident    "[tok.tok * 9]);
       if (tok.tok == Integerk)
           fprintf(dest_fp, "  %8d", tok.n);
       else if (tok.tok == Ident)
           fprintf(dest_fp, " %s", tok.text);
       else if (tok.tok == Stringk)
           fprintf(dest_fp, " \"%s\"", tok.text);
       fprintf(dest_fp, "\n");
   } while (tok.tok != EOI);
   if (dest_fp != stdout)
       fclose(dest_fp);

}

void init_io(FILE **fp, FILE *std, const char mode[], const char fn[]) {

   if (fn[0] == '\0')
       *fp = std;
   else if ((*fp = fopen(fn, mode)) == NULL)
       error(0, 0, "Can't open %s\n", fn);

}

int main(int argc, char *argv[]) {

   init_io(&source_fp, stdin,  "r",  argc > 1 ? argv[1] : "");
   init_io(&dest_fp,   stdout, "wb", argc > 2 ? argv[2] : "");
   init_lex();
   run();

} </lang>

Python

<lang Python> import sys

following two must remain in the same order

EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add, \ Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident = range(25)

all_syms = [ 'EOI', 'Print', 'Putc', 'If', 'While', 'Lbrace', 'Rbrace', 'Lparen',

   'Rparen', 'Uminus', 'Mul', 'Div', 'Add', 'Sub', 'Lss', 'Gtr', 'Leq', 'Neq', 'And',
   'Semi', 'Comma', 'Assign', 'Integer', 'String', 'Ident' ]

single character only symbols

symbols = { '{': Lbrace, '}': Rbrace, '(': Lparen, ')': Rparen, '+': Add, '-': Sub,

   '*': Mul, ';': Semi, ',': Comma, '>': Gtr, '=': Assign }

key_words = { 'if': If, 'print': Print, 'putc': Putc, 'while': While }

the_ch = " " # dummy first char - but it must be a space the_col = 0 the_line = 1 input_file = None

- - - show error and exit

def error(line, col, msg):

   print(line, col, msg)
   exit(1)

- - - get the next character from the input

def getc():

   global the_ch, the_col, the_line

   the_ch = input_file.read(1)
   the_col += 1
   if the_ch == '\n':
       the_line += 1
       the_col = 0
   return the_ch

- - - 'x' - character constants

def char_lit(err_line, err_col):

   n = ord(getc())              # skip opening quote
   if the_ch == '\:
       error(err_line, err_col, "empty character constant")
   elif the_ch == '\\':
       getc()
       if the_ch == 'n':
           n = 10
       elif the_ch == '\\':
           n = '\\'
       else:
           error(err_line, err_col, "unknown escape sequence \\%c" % (the_ch))
   if getc() != '\:
       error(err_line, err_col, "multi-character constant")
   getc()
   return Integerk, err_line, err_col, n

- - - process divide or comments

def div_or_cmt(err_line, err_col):

   if getc() != '*':
       return Div, err_line, err_col

   # comment found
   while True:
       if getc() == '*' and getc() == '/':
           getc()
           return gettok()
       elif len(the_ch) == 0:
           error(err_line, err_col, "EOF in comment")

- - - "string"

def string_lit(start, err_line, err_col):

   text = ""

   while getc() != start:
       if len(the_ch) == 0:
           error(err_line, err_col, "EOF while scanning string literal")
       if the_ch == '\n':
           error(err_line, err_col, "EOL while scanning string literal")
       text += the_ch

   getc()
   return Stringk, err_line, err_col, text

- - - handle identifiers and integers

def ident_or_int(err_line, err_col):

   is_number = True
   text = ""

   while the_ch.isalnum() or the_ch == '_':
       text += the_ch
       if not the_ch.isdigit():
           is_number = False
       getc()

   if len(text) == 0:
       error(err_line, err_col, "ident_or_int: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))

   if text[0].isdigit():
       if not is_number:
           error(err_line, err_col, "invalid number: %s" % (text))
       n = int(text)
       return Integerk, err_line, err_col, n

   if text in key_words:
       return key_words[text], err_line, err_col

   return Ident, err_line, err_col, text

- - - look ahead for '>=', etc.

def follow(expect, ifyes, ifno, err_line, err_col):

   if getc() == expect:
       getc()
       return ifyes, err_line, err_col

   if ifno == EOI:
       error(err_line, err_col, "follow: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))

   return ifno, err_line, err_col

- - - return the next token type

def gettok():

   while the_ch.isspace():
       getc()

   err_line = the_line
   err_col  = the_col

   if len(the_ch) == 0:    return EOI, err_line, err_col
   elif the_ch in symbols: sym = symbols[the_ch]; getc(); return sym, err_line, err_col
   elif the_ch == '/':     return div_or_cmt(err_line, err_col)
   elif the_ch == '\:    return char_lit(err_line, err_col)
   elif the_ch == '<':     return follow('=', Leq, Lss, err_line, err_col)
   elif the_ch == '!':     return follow('=', Neq, EOI, err_line, err_col)
   elif the_ch == '&':     return follow('&', And, EOI, err_line, err_col)
   elif the_ch == '"':     return string_lit(the_ch, err_line, err_col)
   else:                   return ident_or_int(err_line, err_col)

- - - main driver

input_file = sys.stdin if len(sys.argv) > 1:

   try:
       input_file = open(sys.argv[1], "r", 4096)
   except IOError as e:
       error(0, 0, "Can't open %s" % sys.argv[1])

while True:

   t = gettok()
   tok  = t[0]
   line = t[1]
   col  = t[2]

   if tok == Integerk:
       print("line %5d  col %5d %-8s  %8d" % (line, col, all_syms[tok], t[3]))
   elif tok == Ident:
       print("line %5d  col %5d %-8s %s"   % (line, col, all_syms[tok], t[3]))
   elif tok == Stringk:
       print('line %5d  col %5d %-8s "%s"' % (line, col, all_syms[tok], t[3]))
   else:
       print("line %5d  col %5d %-8s"      % (line, col, all_syms[tok]))

   if tok == EOI:
       break

</lang>