User:Ed Davis: Difference between revisions

From Rosetta Code
Content added Content deleted
No edit summary
No edit summary
Line 163: Line 163:
<lang c>
<lang c>
/*
/*
All lexical tokens - not syntatically correct, but that will
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
have to wait until syntax analysis
*/
*/
Line 232: Line 232:
;Implementations
;Implementations


__TOC__


=={{header|C}}==
=={{header|C}}==
Line 288: Line 287:
}
}


static void read_ch() { /* get next char from input */
static int next_ch() { /* get next char from input */
the_ch = getc(source_fp);
the_ch = getc(source_fp);
++col;
++col;
Line 295: Line 294:
col = 0;
col = 0;
}
}
return the_ch;
}
}


Line 301: Line 301:
error(err_line, err_col, "gettok: empty character constant");
error(err_line, err_col, "gettok: empty character constant");
if (the_ch == '\\') {
if (the_ch == '\\') {
read_ch();
next_ch();
if (the_ch == 'n')
if (the_ch == 'n')
n = 10;
n = 10;
Line 308: Line 308:
else error(err_line, err_col, "gettok: unknown escape sequence \\%c", the_ch);
else error(err_line, err_col, "gettok: unknown escape sequence \\%c", the_ch);
}
}
if (next_ch() != '\'')
read_ch();
if (the_ch != '\'') error(err_line, err_col, "multi-character constant");
error(err_line, err_col, "multi-character constant");
read_ch();
next_ch();
return (tok_s){Integerk, err_line, err_col, {n}};
return (tok_s){Integerk, err_line, err_col, {n}};
}
}
Line 320: Line 320:
/* comment found */
/* comment found */
for (;;) {
for (;;) {
if (next_ch() == '*' && next_ch() == '/') {
read_ch();
if (the_ch == '*' || the_ch == EOF) {
next_ch();
read_ch();
return gettok();
if (the_ch == '/' || the_ch == EOF) {
} else if (the_ch == EOF)
read_ch();
error(err_line, err_col, "EOF in comment");
return gettok();
}
}
}
}
}
}
Line 334: Line 331:
da_rewind(text);
da_rewind(text);


for (read_ch(); the_ch != start; read_ch()) {
while (next_ch() != start) {
if (the_ch == '\n')
if (the_ch == '\n') error(err_line, err_col, "EOL in string");
error(err_line, err_col, "EOL in string");
if (the_ch == EOF) error(err_line, err_col, "EOF in string");
if (the_ch == EOF)
error(err_line, err_col, "EOF in string");
da_append(text, (char)the_ch);
da_append(text, (char)the_ch);
}
}
da_append(text, '\0');
da_append(text, '\0');


read_ch();
next_ch();
return (tok_s){Stringk, err_line, err_col, {.text=text}};
return (tok_s){Stringk, err_line, err_col, {.text=text}};
}
}
Line 373: Line 368:
if (!isdigit(the_ch))
if (!isdigit(the_ch))
is_number = false;
is_number = false;
read_ch();
next_ch();
}
}
if (da_len(text) == 0)
if (da_len(text) == 0)
Line 391: Line 386:
static tok_s follow(int expect, TokenType ifyes, TokenType ifno, int err_line, int err_col) { /* look ahead for '>=', etc. */
static tok_s follow(int expect, TokenType ifyes, TokenType ifno, int err_line, int err_col) { /* look ahead for '>=', etc. */
if (the_ch == expect) {
if (the_ch == expect) {
read_ch();
next_ch();
return (tok_s){ifyes, err_line, err_col, {0}};
return (tok_s){ifyes, err_line, err_col, {0}};
}
}
if (ifno == EOI)
if (ifno == EOI) error(err_line, err_col, "follow: unrecognized character '%c' (%d)\n", the_ch, the_ch);
error(err_line, err_col, "follow: unrecognized character '%c' (%d)\n", the_ch, the_ch);
return (tok_s){ifno, err_line, err_col, {0}};
return (tok_s){ifno, err_line, err_col, {0}};
}
}
Line 401: Line 397:
/* skip white space */
/* skip white space */
while (isspace(the_ch))
while (isspace(the_ch))
read_ch();
next_ch();
int err_line = line;
int err_line = line;
int err_col = col;
int err_col = col;
switch (the_ch) {
switch (the_ch) {
case '{': read_ch(); return (tok_s){Lbrace, err_line, err_col, {0}};
case '{': next_ch(); return (tok_s){Lbrace, err_line, err_col, {0}};
case '}': read_ch(); return (tok_s){Rbrace, err_line, err_col, {0}};
case '}': next_ch(); return (tok_s){Rbrace, err_line, err_col, {0}};
case '(': read_ch(); return (tok_s){Lparen, err_line, err_col, {0}};
case '(': next_ch(); return (tok_s){Lparen, err_line, err_col, {0}};
case ')': read_ch(); return (tok_s){Rparen, err_line, err_col, {0}};
case ')': next_ch(); return (tok_s){Rparen, err_line, err_col, {0}};
case '+': read_ch(); return (tok_s){Add, err_line, err_col, {0}};
case '+': next_ch(); return (tok_s){Add, err_line, err_col, {0}};
case '-': read_ch(); return (tok_s){Sub, err_line, err_col, {0}};
case '-': next_ch(); return (tok_s){Sub, err_line, err_col, {0}};
case '*': read_ch(); return (tok_s){Mul, err_line, err_col, {0}};
case '*': next_ch(); return (tok_s){Mul, err_line, err_col, {0}};
case ';': read_ch(); return (tok_s){Semi, err_line, err_col, {0}};
case ';': next_ch(); return (tok_s){Semi, err_line, err_col, {0}};
case ',': read_ch(); return (tok_s){Comma, err_line, err_col, {0}};
case ',': next_ch(); return (tok_s){Comma, err_line, err_col, {0}};
case '>': read_ch(); return (tok_s){Gtr, err_line, err_col, {0}};
case '>': next_ch(); return (tok_s){Gtr, err_line, err_col, {0}};
case '=': read_ch(); return (tok_s){Assign, err_line, err_col, {0}};
case '=': next_ch(); return (tok_s){Assign, err_line, err_col, {0}};
case '/': read_ch(); return div_or_cmt(err_line, err_col);
case '/': next_ch(); return div_or_cmt(err_line, err_col);
case '\'': read_ch(); return char_lit(the_ch, err_line, err_col);
case '\'': next_ch(); return char_lit(the_ch, err_line, err_col);
case '<': read_ch(); return follow('=', Leq, Lss, err_line, err_col);
case '<': next_ch(); return follow('=', Leq, Lss, err_line, err_col);
case '!': read_ch(); return follow('=', Neq, EOI, err_line, err_col);
case '!': next_ch(); return follow('=', Neq, EOI, err_line, err_col);
case '&': read_ch(); return follow('&', And, EOI, err_line, err_col);
case '&': next_ch(); return follow('&', And, EOI, err_line, err_col);
case '"' : return string_lit(the_ch, err_line, err_col);
case '"' : return string_lit(the_ch, err_line, err_col);
default: return ident_or_int(err_line, err_col);
default: return ident_or_int(err_line, err_col);
Line 436: Line 432:
"Uminus Mul Div Add Sub Lss Gtr Leq Neq "
"Uminus Mul Div Add Sub Lss Gtr Leq Neq "
"And Semi Comma Assign Integer String Ident "[tok.tok * 9]);
"And Semi Comma Assign Integer String Ident "[tok.tok * 9]);

if (tok.tok == Integerk)
fprintf(dest_fp, " %8d", tok.n);
if (tok.tok == Integerk) fprintf(dest_fp, " %4d", tok.n);
else if (tok.tok == Ident)
else if (tok.tok == Ident) fprintf(dest_fp, " %s", tok.text);
fprintf(dest_fp, " %s", tok.text);
else if (tok.tok == Stringk) fprintf(dest_fp, " \"%s\"", tok.text);
else if (tok.tok == Stringk)
fprintf(dest_fp, " \"%s\"", tok.text);
fprintf(dest_fp, "\n");
fprintf(dest_fp, "\n");
} while (tok.tok != EOI);
} while (tok.tok != EOI);
Line 460: Line 454:
run();
run();
}
}
</lang>

=={{header|Euphoria}}==
<lang euphoria>
include std/io.e
include std/map.e
include std/types.e
include std/convert.e

-- truth values, plus the sentinel the lexer compares getc() results against
constant
    true  = 1,
    false = 0,
    EOF   = -1

-- token kinds; the position of each name is also its index into all_syms
enum
    EOI, Printk, Putc, Ifk, Whilek,
    Lbrace, Rbrace, Lparen, Rparen,
    Uminus, Mul, Div, Add, Sub,
    Lss, Gtr, Leq, Neq, Andk,
    Semi, Comma, Assign,
    Integerk, Stringk, Ident

-- printable name of every token kind, in enum order
constant all_syms = {
    "EOI", "Print", "Putc", "If", "While",
    "Lbrace", "Rbrace", "Lparen", "Rparen",
    "Uminus", "Mul", "Div", "Add", "Sub",
    "Lss", "Gtr", "Leq", "Neq", "And",
    "Semi", "Comma", "Assign",
    "Integer", "String", "Ident"}

-- lexer state: input handle, one-character lookahead and its position
integer input_file
integer the_ch = ' '
integer the_col = 0
integer the_line = 1
sequence symbols          -- filled in by init(): character code -> token kind
map key_words = new()     -- filled in by init(): keyword text -> token kind

-- Print a printf-style message (format plus its data items) on STDOUT,
-- then terminate the program with exit status 1.
procedure error(sequence format, sequence data)
printf(STDOUT, format, data)
abort(1)
end procedure

-- Read one character from input_file into the_ch and keep the_line/the_col
-- in step with it; the character read is also returned.
function next_ch()
    the_ch = getc(input_file)
    if the_ch = '\n' then
        -- a newline starts a fresh line; columns restart at 0
        the_line += 1
        the_col = 0
    else
        the_col += 1
    end if
    return the_ch
end function

-- Scan a character constant such as 'x', '\n' or '\\'. On entry the_ch is the
-- opening quote. Returns an Integerk token holding the character code.
function char_lit(integer err_line, integer err_col)
    integer code = next_ch()    -- first character after the opening quote

    if code = '\'' then
        error("%d %d empty character constant", {err_line, err_col})
    end if

    if code = '\\' then
        -- only \n and \\ are recognised escapes
        next_ch()
        if the_ch = 'n' then
            code = 10
        elsif the_ch = '\\' then
            code = '\\'
        else
            error("%d %d unknown escape sequence \\%c", {err_line, err_col, the_ch})
        end if
    end if

    -- exactly one character is allowed before the closing quote
    if next_ch() != '\'' then
        error("%d %d multi-character constant", {err_line, err_col})
    end if

    next_ch()                   -- step past the closing quote
    return {Integerk, err_line, err_col, code}
end function

-- Process a '/' that is either the divide operator or the start of a
-- /* ... */ comment. On entry the_ch is the '/'.
-- Returns a Div token, or, after skipping a comment, the token that follows it.
-- Aborts with "EOF in comment" when the comment is never closed.
function div_or_cmt(integer err_line, integer err_col)
    if next_ch() != '*' then
        return {Div, err_line, err_col}
    end if

    -- comment found: the_ch is the opening '*', so move to the first body char
    next_ch()
    while true do
        if the_ch = '*' then
            -- Look at the character after the '*' without discarding it when it
            -- is not '/', so that a run of stars such as "**/" still closes.
            if next_ch() = '/' then
                next_ch()
                return get_tok()
            end if
        elsif the_ch = EOF then
            error("%d %d EOF in comment", {err_line, err_col})
        else
            next_ch()
        end if
    end while
end function

-- Scan a string literal. On entry the_ch is the opening delimiter, which is
-- also passed as start. Returns a Stringk token carrying the raw text
-- between the delimiters.
function string_lit(integer start, integer err_line, integer err_col)
    string text = ""

    while true do
        next_ch()
        if the_ch = start then
            exit
        end if
        if the_ch = EOF then
            error("%d %d EOF while scanning string literal", {err_line, err_col})
        end if
        if the_ch = '\n' then
            error("%d %d EOL while scanning string literal", {err_line, err_col})
        end if
        text &= the_ch
    end while

    next_ch()       -- step past the closing delimiter
    return {Stringk, err_line, err_col, text}
end function

-- Scan a run of letters, digits and underscores starting at the_ch and
-- classify it as an integer, a keyword or an identifier.
function ident_or_int(integer err_line, integer err_col)
    string text = ""
    integer all_digits = true

    while the_ch = '_' or t_alnum(the_ch) do
        if not t_digit(the_ch) then
            all_digits = false
        end if
        text &= the_ch
        next_ch()
    end while

    -- nothing consumed: the_ch cannot start any token
    if length(text) = 0 then
        error("%d %d ident_or_int: unrecognized character: (%d) '%s'", {err_line, err_col, the_ch, the_ch})
    end if

    if not t_digit(text[1]) then
        -- a word: keyword if it is in the table, otherwise an identifier
        if has(key_words, text) then
            return {get(key_words, text), err_line, err_col}
        end if
        return {Ident, err_line, err_col, text}
    end if

    -- starts with a digit, so every character must be a digit
    if not all_digits then
        error("%d %d invalid number: %s", {err_line, err_col, text})
    end if
    return {Integerk, err_line, err_col, to_integer(text)}
end function

-- One-character lookahead for two-character operators such as "<=".
-- Reads the next character: if it equals expect, consumes it and returns an
-- ifyes token; otherwise returns an ifno token. An ifno of EOI means the
-- first character is not valid by itself, which is reported as an error.
function follow(integer expect, integer ifyes, integer ifno, integer err_line, integer err_col)
    if next_ch() != expect then
        if ifno = EOI then
            error("%d %d follow: unrecognized character: (%d)", {err_line, err_col, the_ch})
        end if
        return {ifno, err_line, err_col}
    end if

    next_ch()
    return {ifyes, err_line, err_col}
end function

-- Skip white space and return the next token as a sequence
-- {kind, line, column [, value]}.
function get_tok()
    integer sym

    while t_space(the_ch) do
        next_ch()
    end while

    -- remember where the token starts
    integer err_line = the_line
    integer err_col = the_col

    if the_ch = EOF then
        return {EOI, err_line, err_col}
    elsif the_ch = '/' then
        return div_or_cmt(err_line, err_col)
    elsif the_ch = '\'' then
        return char_lit(err_line, err_col)
    elsif the_ch = '<' then
        return follow('=', Leq, Lss, err_line, err_col)
    elsif the_ch = '!' then
        return follow('=', Neq, EOI, err_line, err_col)
    elsif the_ch = '&' then
        return follow('&', Andk, EOI, err_line, err_col)
    elsif the_ch = '"' then
        return string_lit(the_ch, err_line, err_col)
    end if

    -- single-character symbols come from the table built by init()
    sym = symbols[the_ch]
    if sym != EOI then
        next_ch()
        return {sym, err_line, err_col}
    end if
    return ident_or_int(err_line, err_col)
end function

-- Fill the keyword map and the single-character symbol table.
procedure init()
    sequence words = {
        {"if", Ifk}, {"print", Printk}, {"putc", Putc}, {"while", Whilek}}
    sequence singles = {
        {'{', Lbrace}, {'}', Rbrace}, {'(', Lparen}, {')', Rparen},
        {'+', Add}, {'-', Sub}, {'*', Mul}, {';', Semi},
        {',', Comma}, {'>', Gtr}, {'=', Assign}}

    for i = 1 to length(words) do
        put(key_words, words[i][1], words[i][2])
    end for

    -- EOI marks "no single-character token for this character code"
    symbols = repeat(EOI, 256)
    for i = 1 to length(singles) do
        symbols[singles[i][1]] = singles[i][2]
    end for
end procedure

-- Driver: read from the file named by the third command-line item when
-- present (otherwise STDIN) and print one line per token until EOI.
procedure main(sequence cl)
    sequence file_name
    sequence t
    integer kind

    input_file = STDIN
    if length(cl) > 2 then
        file_name = cl[3]
        input_file = open(file_name, "r")
        if input_file = -1 then
            error("Could not open %s", {file_name})
        end if
    end if

    init()

    while true do
        t = get_tok()
        kind = t[1]
        printf(STDOUT, "line %5d col %5d %-8s", {t[2], t[3], all_syms[kind]})

        -- only these three kinds carry a value in t[4]
        if kind = Integerk then
            printf(STDOUT, " %5d\n", {t[4]})
        elsif kind = Ident then
            printf(STDOUT, " %s\n", {t[4]})
        elsif kind = Stringk then
            printf(STDOUT, " \"%s\"\n", {t[4]})
        else
            printf(STDOUT, "\n")
        end if

        if kind = EOI then
            exit
        end if
    end while
end procedure

main(command_line())
</lang>
</lang>


Line 689: Line 898:
dim tok_list(tk_eoi to tk_ident) as string
dim tok_list(tk_eoi to tk_ident) as string


tok_list(tk_eoi ) = "EOI"
tok_list(tk_eoi ) = "EOI"
tok_list(tk_print ) = "Print"
tok_list(tk_print ) = "Print"
tok_list(tk_putc ) = "Putc"
tok_list(tk_putc ) = "Putc"
tok_list(tk_if ) = "If"
tok_list(tk_if ) = "If"
tok_list(tk_while ) = "While"
tok_list(tk_while ) = "While"
tok_list(tk_lbrace ) = "Lbrace"
tok_list(tk_lbrace ) = "Lbrace"
tok_list(tk_rbrace ) = "Rbrace"
tok_list(tk_rbrace ) = "Rbrace"
tok_list(tk_lparen ) = "Lparen"
tok_list(tk_lparen ) = "Lparen"
tok_list(tk_rparen ) = "Rparen"
tok_list(tk_rparen ) = "Rparen"
tok_list(tk_uminus ) = "Uminus"
tok_list(tk_uminus ) = "Uminus"
tok_list(tk_mul ) = "Mul"
tok_list(tk_mul ) = "Mul"
tok_list(tk_div ) = "Div"
tok_list(tk_div ) = "Div"
tok_list(tk_add ) = "Add"
tok_list(tk_add ) = "Add"
tok_list(tk_sub ) = "Sub"
tok_list(tk_sub ) = "Sub"
tok_list(tk_lss ) = "Lss"
tok_list(tk_lss ) = "Lss"
tok_list(tk_gtr ) = "Gtr"
tok_list(tk_gtr ) = "Gtr"
tok_list(tk_leq ) = "Leq"
tok_list(tk_leq ) = "Leq"
tok_list(tk_neq ) = "Neq"
tok_list(tk_neq ) = "Neq"
tok_list(tk_and ) = "And"
tok_list(tk_and ) = "And"
tok_list(tk_semi ) = "Semi"
tok_list(tk_semi ) = "Semi"
tok_list(tk_comma ) = "Comma"
tok_list(tk_comma ) = "Comma"
tok_list(tk_assign ) = "Assign"
tok_list(tk_assign ) = "Assign"
tok_list(tk_integer ) = "Integer"
tok_list(tk_integer) = "Integer"
tok_list(tk_string ) = "String"
tok_list(tk_string ) = "String"
tok_list(tk_ident ) = "Ident"
tok_list(tk_ident ) = "Ident"


do
do
Line 735: Line 944:
=={{header|Python}}==
=={{header|Python}}==
<lang Python>
<lang Python>
from __future__ import print_function
import sys
import sys


Line 762: Line 972:


#*** get the next character from the input
#*** get the next character from the input
def getc():
def next_ch():
global the_ch, the_col, the_line
global the_ch, the_col, the_line


Line 774: Line 984:
#*** 'x' - character constants
#*** 'x' - character constants
def char_lit(err_line, err_col):
def char_lit(err_line, err_col):
n = ord(getc()) # skip opening quote
n = ord(next_ch()) # skip opening quote
if the_ch == '\'':
if the_ch == '\'':
error(err_line, err_col, "empty character constant")
error(err_line, err_col, "empty character constant")
elif the_ch == '\\':
elif the_ch == '\\':
getc()
next_ch()
if the_ch == 'n':
if the_ch == 'n':
n = 10
n = 10
Line 785: Line 995:
else:
else:
error(err_line, err_col, "unknown escape sequence \\%c" % (the_ch))
error(err_line, err_col, "unknown escape sequence \\%c" % (the_ch))
if getc() != '\'':
if next_ch() != '\'':
error(err_line, err_col, "multi-character constant")
error(err_line, err_col, "multi-character constant")
getc()
next_ch()
return Integerk, err_line, err_col, n
return Integerk, err_line, err_col, n


#*** process divide or comments
#*** process divide or comments
def div_or_cmt(err_line, err_col):
def div_or_cmt(err_line, err_col):
if getc() != '*':
if next_ch() != '*':
return Div, err_line, err_col
return Div, err_line, err_col


# comment found
# comment found
while True:
while True:
if getc() == '*' and getc() == '/':
if next_ch() == '*' and next_ch() == '/':
getc()
next_ch()
return gettok()
return gettok()
elif len(the_ch) == 0:
elif len(the_ch) == 0:
Line 807: Line 1,017:
text = ""
text = ""


while getc() != start:
while next_ch() != start:
if len(the_ch) == 0:
if len(the_ch) == 0:
error(err_line, err_col, "EOF while scanning string literal")
error(err_line, err_col, "EOF while scanning string literal")
Line 814: Line 1,024:
text += the_ch
text += the_ch


getc()
next_ch()
return Stringk, err_line, err_col, text
return Stringk, err_line, err_col, text


Line 826: Line 1,036:
if not the_ch.isdigit():
if not the_ch.isdigit():
is_number = False
is_number = False
getc()
next_ch()


if len(text) == 0:
if len(text) == 0:
Line 844: Line 1,054:
#*** look ahead for '>=', etc.
#*** look ahead for '>=', etc.
def follow(expect, ifyes, ifno, err_line, err_col):
def follow(expect, ifyes, ifno, err_line, err_col):
if getc() == expect:
if next_ch() == expect:
getc()
next_ch()
return ifyes, err_line, err_col
return ifyes, err_line, err_col


Line 856: Line 1,066:
def gettok():
def gettok():
while the_ch.isspace():
while the_ch.isspace():
getc()
next_ch()


err_line = the_line
err_line = the_line
Line 862: Line 1,072:


if len(the_ch) == 0: return EOI, err_line, err_col
if len(the_ch) == 0: return EOI, err_line, err_col
elif the_ch in symbols: sym = symbols[the_ch]; getc(); return sym, err_line, err_col
elif the_ch == '/': return div_or_cmt(err_line, err_col)
elif the_ch == '/': return div_or_cmt(err_line, err_col)
elif the_ch == '\'': return char_lit(err_line, err_col)
elif the_ch == '\'': return char_lit(err_line, err_col)
Line 869: Line 1,078:
elif the_ch == '&': return follow('&', And, EOI, err_line, err_col)
elif the_ch == '&': return follow('&', And, EOI, err_line, err_col)
elif the_ch == '"': return string_lit(the_ch, err_line, err_col)
elif the_ch == '"': return string_lit(the_ch, err_line, err_col)
elif the_ch in symbols:
else: return ident_or_int(err_line, err_col)
sym = symbols[the_ch]
next_ch()
return sym, err_line, err_col
else: return ident_or_int(err_line, err_col)


#*** main driver
#*** main driver
Line 885: Line 1,098:
col = t[2]
col = t[2]


print("line %5d col %5d %-8s" % (line, col, all_syms[tok]), end='')
if tok == Integerk:

print("line %5d col %5d %-8s %8d" % (line, col, all_syms[tok], t[3]))
elif tok == Ident:
if tok == Integerk: print(" %5d" % (t[3]))
print("line %5d col %5d %-8s %s" % (line, col, all_syms[tok], t[3]))
elif tok == Ident: print(" %s" % (t[3]))
elif tok == Stringk:
elif tok == Stringk: print(' "%s"' % (t[3]))
print('line %5d col %5d %-8s "%s"' % (line, col, all_syms[tok], t[3]))
else: print("")
else:
print("line %5d col %5d %-8s" % (line, col, all_syms[tok]))


if tok == EOI:
if tok == EOI:

Revision as of 15:15, 13 August 2016

Lexical analysis is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an identified "meaning"). A program that performs lexical analysis may be called a lexer, tokenizer, or scanner (though "scanner" is also used to refer to the first stage of a lexer).

The Task

Create a lexical analyzer for the Tiny programming language. The program should read input from a file and/or stdin, and write output to a file and/or stdout.

Specification

The various token types are denoted below.

Operators
Characters Common name Name
* multiply Mul
/ divide Div
+ plus Add
- minus and unary minus Sub and Uminus
< less than Lss
<= less than or equal Leq
> greater than Gtr
!= not equal Neq
= assign Assign
&& and And
Symbols
Characters Common name Name
( left parenthesis Lparen
) right parenthesis Rparen
{ left brace Lbrace
} right brace Rbrace
; semi colon Semi
, comma Comma
Keywords
Characters Name
if If
while While
print Print
putc Putc
Other entities
Characters Regular expression Name
integers [0-9]+ Integer
char literal 'x' Integer
identifiers [_a-zA-Z][_a-zA-Z0-9]* Ident
string literal ".*" String

Notes: For char literals, '\n' is supported as a new line character. To represent \, use: '\\'. \n may also be used in Strings, to print a newline. No other special sequences are supported.

Comments /* ... */ (multi-line)

Complete list of token names

EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add, Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integer, String, Ident

Program output

Output of the program should be:

  • the word line, followed by:
  • the line number where the token starts, followed by:
  • the abbreviation col, followed by:
  • the column number where the token starts, followed by:
  • the token name.
  • If the token name is one of Integer, Ident or String, the token's actual value should follow.
Test Cases

<lang c> /*

 Hello world
*/

print("Hello, World!\n"); </lang>

Output

line     4  col     1 Print
line     4  col     6 Lparen
line     4  col     7 String   "Hello, World!\n"
line     4  col    24 Rparen
line     4  col    25 Semi
line     5  col     1 EOI

<lang c> /*

 Show Ident and Integers
*/

phoenix_number = 142857; print(phoenix_number, "\n"); </lang>

Output

line     4  col     1 Ident    phoenix_number
line     4  col    16 Assign
line     4  col    18 Integer     142857
line     4  col    24 Semi
line     5  col     1 Print
line     5  col     6 Lparen
line     5  col     7 Ident    phoenix_number
line     5  col    21 Comma
line     5  col    23 String   "\n"
line     5  col    27 Rparen
line     5  col    28 Semi
line     6  col     1 EOI

<lang c> /*

 All lexical tokens - not syntactically correct, but that will
 have to wait until syntax analysis
*/

/* Print */ print /* Sub */ - /* Putc */ putc /* Lss */ < /* If */ if /* Gtr */ > /* While */ while /* Leq */ <= /* Lbrace */ { /* Neq */  != /* Rbrace */ } /* And */ && /* Lparen */ ( /* Semi */  ; /* Rparen */ ) /* Comma */ , /* Uminus */ - /* Assign */ = /* Mul */ * /* Integer */ 42 /* Div */ / /* String */ "String literal" /* Add */ + /* Ident */ variable_name /* character literal */ '\n' /* character literal */ ' ' </lang>

Output

line     5  col    15 Print
line     5  col    41 Sub
line     6  col    15 Putc
line     6  col    41 Lss
line     7  col    15 If
line     7  col    41 Gtr
line     8  col    15 While
line     8  col    41 Leq
line     9  col    15 Lbrace
line     9  col    41 Neq
line    10  col    15 Rbrace
line    10  col    41 And
line    11  col    15 Lparen
line    11  col    41 Semi
line    12  col    15 Rparen
line    12  col    41 Comma
line    13  col    15 Sub
line    13  col    41 Assign
line    14  col    15 Mul
line    14  col    41 Integer         42
line    15  col    15 Div
line    15  col    41 String   "String literal"
line    16  col    15 Add
line    16  col    41 Ident    variable_name
line    17  col    26 Integer         10
line    18  col    26 Integer         32
line    19  col     1 EOI

Diagnostics

The following error conditions should be caught:

  • Empty character constant. Example: ''
  • Unknown escape sequence. Example: '\r'
  • Multi-character constant. Example: 'xx'
  • End-of-file in comment. Closing comment characters not found.
  • End-of-file while scanning string literal. Closing string character not found.
  • End-of-line while scanning string literal. Closing string character not found before end-of-line.
  • Unrecognized character. Example: |
Reference

The C and Python versions can be considered reference implementations.

Implementations


C

<lang C>

  1. include <stdlib.h>
  2. include <stdio.h>
  3. include <stdarg.h>
  4. include <ctype.h>
  5. include <string.h>
  6. include <errno.h>
  7. include <stdbool.h>
  8. include <limits.h>
  1. define NELEMS(arr) (sizeof(arr) / sizeof(arr[0]))
  1. define da_dim(name, type) type *name = NULL; \
                           int _qy_ ## name ## _p = 0;  \
                           int _qy_ ## name ## _max = 0
  1. define da_rewind(name) _qy_ ## name ## _p = 0
  2. define da_redim(name) if (_qy_ ## name ## _p >= _qy_ ## name ## _max) \
                               name = realloc(name, (_qy_ ## name ## _max += 32) * sizeof(name[0]))
  1. define da_append(name, x) do {da_redim(name); name[_qy_ ## name ## _p++] = x;} while (0)
  2. define da_len(name) _qy_ ## name ## _p

// dependency: atr table in parse.c ordering is based on these
typedef enum {
    EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add,
    Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident
} TokenType;

/* One lexical token: its kind, where it starts, and an optional payload. */
typedef struct {
    int tok;        /* token kind */
    int err_ln;     /* line where the token starts */
    int err_col;    /* column where the token starts */
    union {
        int n;      /* value for constants */
        char *text; /* text for idents */
    };
} tok_s;

/* stream read by next_ch() and stream written by run() */
static FILE *source_fp;
static FILE *dest_fp;

/* position of the lookahead character, and the lookahead itself */
static int line = 1;
static int col = 0;
static int the_ch = ' ';

/* scratch buffer shared by string_lit() and ident_or_int() */
da_dim(text, char);

tok_s gettok();

/*
 * Report a fatal lexer error and terminate the program.
 *
 * err_line, err_col  position printed in front of the message
 * fmt, ...           printf-style message
 *
 * Prints "(line,col) error: message" on stdout and calls exit(1); it does
 * not return.
 */
static void error(int err_line, int err_col, const char *fmt, ... ) {
    char buf[1000];
    va_list ap;

    va_start(ap, fmt);
    /* bounded: a long message is truncated instead of overrunning buf */
    vsnprintf(buf, sizeof buf, fmt, ap);
    va_end(ap);
    printf("(%d,%d) error: %s\n", err_line, err_col, buf);
    exit(1);
}

/*
 * Read the next character of source_fp into the_ch, update line/col to
 * match, and return the character (EOF at end of input).
 */
static int next_ch() {
    int c = getc(source_fp);

    the_ch = c;
    if (c == '\n') {
        /* a newline starts a fresh line; columns restart at 0 */
        ++line;
        col = 0;
    } else {
        ++col;
    }
    return c;
}

static tok_s char_lit(int n, int err_line, int err_col) { /* 'x' */

   if (the_ch == '\)
       error(err_line, err_col, "gettok: empty character constant");
   if (the_ch == '\\') {
       next_ch();
       if (the_ch == 'n')
           n = 10;
       else if (the_ch == '\\')
           n = '\\';
       else error(err_line, err_col, "gettok: unknown escape sequence \\%c", the_ch);
   }
   if (next_ch() != '\)
       error(err_line, err_col, "multi-character constant");
   next_ch();
   return (tok_s){Integerk, err_line, err_col, {n}};

}

/*
 * Process a '/' that is either the divide operator or the start of a
 * comment.
 *
 * On entry the '/' has been consumed and the_ch is the character after it.
 * Returns a Div token, or, after skipping a comment, the token that follows
 * it. Reports "EOF in comment" when the comment is never closed.
 */
static tok_s div_or_cmt(int err_line, int err_col) { /* process divide or comments */
    if (the_ch != '*')
        return (tok_s){Div, err_line, err_col, {0}};

    /* comment found: step over the opening '*' to the first body character */
    next_ch();
    for (;;) {
        if (the_ch == '*') {
            /*
             * Look at the character after the '*' but do not discard it when
             * it is not '/', so a run of stars such as "**" followed by '/'
             * still closes the comment.
             */
            if (next_ch() == '/') {
                next_ch();
                return gettok();
            }
        } else if (the_ch == EOF) {
            error(err_line, err_col, "EOF in comment");
        } else {
            next_ch();
        }
    }
}

static tok_s string_lit(int start, int err_line, int err_col) { /* "st" */

   da_rewind(text);
   while (next_ch() != start) {
       if (the_ch == '\n') error(err_line, err_col, "EOL in string");
       if (the_ch == EOF)  error(err_line, err_col, "EOF in string");
       da_append(text, (char)the_ch);
   }
   da_append(text, '\0');
   next_ch();
   return (tok_s){Stringk, err_line, err_col, {.text=text}};

}

/*
 * bsearch() comparator: p1 and p2 each point at a `char *`; the strings they
 * refer to are ordered with strcmp().
 */
static int kwd_cmp(const void *p1, const void *p2) {
    const char *const *lhs = p1;
    const char *const *rhs = p2;

    return strcmp(*lhs, *rhs);
}

/*
 * Classify a word: returns the keyword's token type when ident is one of
 * the keywords, otherwise Ident.
 */
static TokenType get_ident_type(const char *ident) {
    /* must stay sorted by spelling: the table is searched with bsearch() */
    static struct keyword {
        char *s;
        TokenType sym;
    } kwds[] = {
        {"if",    If},
        {"print", Print},
        {"putc",  Putc},
        {"while", While},
    };
    /* kwd_cmp reads a `char *` at each address; s is the first member of an entry */
    struct keyword *hit = bsearch(&ident, kwds, NELEMS(kwds), sizeof(kwds[0]), kwd_cmp);

    if (hit == NULL)
        return Ident;
    return hit->sym;
}

/*
 * Scan a run of letters, digits and underscores starting at the_ch and
 * classify it.
 *
 * Returns an Integerk token for a run of decimal digits, otherwise the token
 * type given by get_ident_type() with text pointing at the shared buffer.
 * Reports an error when the_ch cannot start a token, when a word starts with
 * a digit but contains non-digits, and when a number does not fit in an int.
 */
static tok_s ident_or_int(int err_line, int err_col) {
    bool is_number = true;

    da_rewind(text);
    while (isalnum(the_ch) || the_ch == '_') {
        da_append(text, (char)the_ch);
        if (!isdigit(the_ch))
            is_number = false;
        next_ch();
    }
    if (da_len(text) == 0)
        error(err_line, err_col, "gettok: unrecognized character (%d) '%c'\n", the_ch, the_ch);
    da_append(text, '\0');

    if (isdigit(text[0])) {
        long value;

        if (!is_number)
            error(err_line, err_col, "invalid number: %s\n", text);
        /*
         * Integers are [0-9]+, i.e. always decimal: base 10 keeps a leading
         * zero from switching strtol() to octal. errno is cleared first so a
         * stale ERANGE is not mistaken for overflow, and the result is
         * range-checked against int because that is what the token stores.
         */
        errno = 0;
        value = strtol(text, NULL, 10);
        if (errno == ERANGE || value > INT_MAX)
            error(err_line, err_col, "Number exceeds maximum value");
        return (tok_s){Integerk, err_line, err_col, {(int)value}};
    }
    return (tok_s){get_ident_type(text), err_line, err_col, {.text=text}};
}

/*
 * One-character lookahead for two-character operators such as "<=".
 *
 * On entry the first character has been consumed and the_ch is the one after
 * it. When the_ch equals expect it is consumed and an ifyes token is
 * returned; otherwise an ifno token is returned. Passing EOI as ifno means
 * the first character is not valid by itself, which is reported as an error.
 */
static tok_s follow(int expect, TokenType ifyes, TokenType ifno, int err_line, int err_col) { /* look ahead for '>=', etc. */
    if (the_ch != expect) {
        if (ifno == EOI)
            error(err_line, err_col, "follow: unrecognized character '%c' (%d)\n", the_ch, the_ch);
        return (tok_s){ifno, err_line, err_col, {0}};
    }

    next_ch();
    return (tok_s){ifyes, err_line, err_col, {0}};
}

/*
 * Skip white space and return the next token. The token's position is the
 * line and column of its first character.
 */
tok_s gettok() { /* return the token type */
    /* skip white space */
    while (isspace(the_ch))
        next_ch();
    int err_line = line;
    int err_col  = col;
    switch (the_ch) {
        case '{':  next_ch(); return (tok_s){Lbrace, err_line, err_col, {0}};
        case '}':  next_ch(); return (tok_s){Rbrace, err_line, err_col, {0}};
        case '(':  next_ch(); return (tok_s){Lparen, err_line, err_col, {0}};
        case ')':  next_ch(); return (tok_s){Rparen, err_line, err_col, {0}};
        case '+':  next_ch(); return (tok_s){Add,    err_line, err_col, {0}};
        case '-':  next_ch(); return (tok_s){Sub,    err_line, err_col, {0}};
        case '*':  next_ch(); return (tok_s){Mul,    err_line, err_col, {0}};
        case ';':  next_ch(); return (tok_s){Semi,   err_line, err_col, {0}};
        case ',':  next_ch(); return (tok_s){Comma,  err_line, err_col, {0}};
        case '>':  next_ch(); return (tok_s){Gtr,    err_line, err_col, {0}};
        case '=':  next_ch(); return (tok_s){Assign, err_line, err_col, {0}};
        /* these helpers expect the first character to be consumed already */
        case '/':  next_ch(); return div_or_cmt(err_line, err_col);
        case '\'': next_ch(); return char_lit(the_ch, err_line, err_col);
        case '<':  next_ch(); return follow('=', Leq, Lss, err_line, err_col);
        case '!':  next_ch(); return follow('=', Neq, EOI, err_line, err_col);
        case '&':  next_ch(); return follow('&', And, EOI, err_line, err_col);
        /* string_lit() is entered with the_ch still on the opening quote */
        case '"' : return string_lit(the_ch, err_line, err_col);
        default:   return ident_or_int(err_line, err_col);
        case EOF:  return (tok_s){EOI, err_line, err_col, {0}};
    }
}

/*
 * Tokenize the whole input: write one line per token to dest_fp, up to and
 * including the EOI token, then close dest_fp unless it is stdout.
 */
void run() { /* tokenize the given input */
    /* token names packed as fixed 9-character fields, in TokenType order */
    static const char token_names[] =
        "EOI      Print    Putc     If       While    Lbrace   Rbrace   Lparen   Rparen   "
        "Uminus   Mul      Div      Add      Sub      Lss      Gtr      Leq      Neq      "
        "And      Semi     Comma    Assign   Integer  String   Ident    ";
    enum { NAME_FIELD = 9 };

    for (;;) {
        tok_s tok = gettok();

        fprintf(dest_fp, "line %5d  col %5d %.8s",
            tok.err_ln, tok.err_col, &token_names[tok.tok * NAME_FIELD]);

        /* only these three kinds carry a value */
        switch (tok.tok) {
            case Integerk: fprintf(dest_fp, "  %4d",   tok.n);    break;
            case Ident:    fprintf(dest_fp, " %s",     tok.text); break;
            case Stringk:  fprintf(dest_fp, " \"%s\"", tok.text); break;
            default:       break;
        }
        fprintf(dest_fp, "\n");

        if (tok.tok == EOI)
            break;
    }

    if (dest_fp != stdout)
        fclose(dest_fp);
}

/*
 * Select a stream: an empty file name fn stores the standard stream std in
 * *fp; otherwise fn is opened with the given mode, and a failure to open it
 * is reported through error().
 */
void init_io(FILE **fp, FILE *std, const char mode[], const char fn[]) {
    if (fn[0] != '\0') {
        *fp = fopen(fn, mode);
        if (*fp == NULL)
            error(0, 0, "Can't open %s\n", fn);
        return;
    }
    *fp = std;
}

/*
 * Entry point. argv[1], when given, names the source file (stdin otherwise)
 * and argv[2], when given, names the output file (stdout otherwise); the
 * input is then tokenized by run().
 */
int main(int argc, char *argv[]) {

   init_io(&source_fp, stdin,  "r",  argc > 1 ? argv[1] : "");
   init_io(&dest_fp,   stdout, "wb", argc > 2 ? argv[2] : "");
   run();

} </lang>

Euphoria

<lang euphoria> include std/io.e include std/map.e include std/types.e include std/convert.e

-- truth values, plus the sentinel the lexer compares getc() results against
constant
    true  = 1,
    false = 0,
    EOF   = -1

-- token kinds; all_syms below lists their printable names in the same order
enum
    EOI, Printk, Putc, Ifk, Whilek,
    Lbrace, Rbrace, Lparen, Rparen,
    Uminus, Mul, Div, Add, Sub,
    Lss, Gtr, Leq, Neq, Andk,
    Semi, Comma, Assign,
    Integerk, Stringk, Ident

constant all_syms = {
    "EOI", "Print", "Putc", "If", "While",
    "Lbrace", "Rbrace", "Lparen", "Rparen",
    "Uminus", "Mul", "Div", "Add", "Sub",
    "Lss", "Gtr", "Leq", "Neq", "And",
    "Semi", "Comma", "Assign",
    "Integer", "String", "Ident"}

-- lexer state: input handle, one-character lookahead and its position
integer input_file
integer the_ch = ' '
integer the_col = 0
integer the_line = 1
sequence symbols
map key_words = new()

-- Write a formatted diagnostic to standard output, then stop with exit status 1.
procedure error(sequence format, sequence data)
    puts(STDOUT, sprintf(format, data))
    abort(1)
end procedure

-- get the next character from the input
-- Reads one character from input_file into the_ch and returns it.  The column
-- counter advances on every read; a newline bumps the line counter and resets
-- the column to 0.
function next_ch()
    the_ch = getc(input_file)
    the_col += 1
    if the_ch = '\n' then
        the_line += 1
        the_col = 0
    end if
    return the_ch
end function

-- 'x' - character constants
-- Called with the_ch on the opening quote.  Returns {Integerk, line, col, n}
-- where n is the character code; only the escapes \n and \\ are accepted.
function char_lit(integer err_line, integer err_col)
    integer n = next_ch()              -- skip opening quote
    if the_ch = '\'' then
        error("%d %d empty character constant", {err_line, err_col})
    elsif the_ch = '\\' then
        next_ch()
        if the_ch = 'n' then
            n = 10
        elsif the_ch = '\\' then
            n = '\\'
        else
            error("%d %d unknown escape sequence \\%c", {err_line, err_col, the_ch})
        end if
    end if
    -- exactly one character is allowed before the closing quote
    if next_ch() != '\'' then
        error("%d %d multi-character constant", {err_line, err_col})
    end if
    next_ch()
    return {Integerk, err_line, err_col, n}
end function

-- process divide or comments
-- Called with the_ch on '/'.  Returns a Div token unless the next character is
-- '*', in which case the comment is skipped and the following token is
-- returned.  Reaching EOF inside a comment is an error.
function div_or_cmt(integer err_line, integer err_col)
    if next_ch() != '*' then
        return {Div, err_line, err_col}
    end if
    -- comment found
    next_ch()
    while true do
        if the_ch = '*' then
            -- stay on a '*' that is not followed by '/', so that a comment
            -- ending in "**/" is still terminated
            if next_ch() = '/' then
                next_ch()
                return get_tok()
            end if
        elsif the_ch = EOF then
            error("%d %d EOF in comment", {err_line, err_col})
        else
            next_ch()
        end if
    end while
end function

-- "string"
-- Called with the_ch on the opening delimiter `start`.  Collects characters up
-- to the matching delimiter and returns {Stringk, line, col, text}; EOF or a
-- newline before the closing delimiter is an error.
function string_lit(integer start, integer err_line, integer err_col)
    string text = ""
    while next_ch() != start do
        if the_ch = EOF then
            error("%d %d EOF while scanning string literal", {err_line, err_col})
        end if
        if the_ch = '\n' then
            error("%d %d EOL while scanning string literal", {err_line, err_col})
        end if
        text &= the_ch
    end while
    next_ch()   -- step past the closing delimiter
    return {Stringk, err_line, err_col, text}
end function

-- handle identifiers and integers
-- Consumes a run of alphanumerics and underscores starting at the_ch.  A run
-- beginning with a digit must be all digits and yields an Integerk token; a
-- run found in key_words yields that keyword's token; anything else is Ident.
function ident_or_int(integer err_line, integer err_col)
    integer n, is_number = true
    string text = ""
    while t_alnum(the_ch) or the_ch = '_' do
        text &= the_ch
        if not t_digit(the_ch) then
            is_number = false
        end if
        next_ch()
    end while
    -- nothing consumed: the current character cannot start any token
    if length(text) = 0 then
        error("%d %d ident_or_int: unrecognized character: (%d) '%s'", {err_line, err_col, the_ch, the_ch})
    end if
    if t_digit(text[1]) then
        if not is_number then
            error("%d %d invalid number: %s", {err_line, err_col, text})
        end if
        n = to_integer(text)
        return {Integerk, err_line, err_col, n}
    end if
    if has(key_words, text) then
        return {get(key_words, text), err_line, err_col}
    end if
    return {Ident, err_line, err_col, text}
end function

-- look ahead for '>=', etc.
-- Reads the next character: when it equals `expect` the two-character token
-- `ifyes` is returned, otherwise `ifno`.  Passing EOI as `ifno` means the
-- single character is not a token on its own, so a mismatch is an error.
function follow(integer expect, integer ifyes, integer ifno, integer err_line, integer err_col)
    if next_ch() = expect then
        next_ch()
        return {ifyes, err_line, err_col}
    end if
    if ifno = EOI then
        error("%d %d follow: unrecognized character: (%d)", {err_line, err_col, the_ch})
    end if
    return {ifno, err_line, err_col}
end function

-- return the next token type
-- Skips whitespace, records the position of the token's first character and
-- dispatches on that character.  The result is a sequence
-- {kind, line, col} with a fourth element for Integerk, Stringk and Ident.
function get_tok()
    while t_space(the_ch) do
        next_ch()
    end while
    integer err_line = the_line
    integer err_col  = the_col
    switch the_ch do
        case EOF  then return {EOI, err_line, err_col}
        case '/'  then return div_or_cmt(err_line, err_col)
        case '\'' then return char_lit(err_line, err_col)
        case '<'  then return follow('=', Leq, Lss, err_line, err_col)
        case '!'  then return follow('=', Neq, EOI, err_line, err_col)
        case '&'  then return follow('&', Andk, EOI, err_line, err_col)
        case '"'  then return string_lit(the_ch, err_line, err_col)
        case else
            -- symbols is 1-based, so only index it with an in-range code;
            -- anything else falls through to ident_or_int
            integer sym = EOI
            if the_ch >= 1 and the_ch <= length(symbols) then
                sym = symbols[the_ch]
            end if
            if sym != EOI then
                next_ch()
                return {sym, err_line, err_col}
            end if
            return ident_or_int(err_line, err_col)
    end switch
end function

-- Fill the keyword map and the single-character symbol table.
procedure init()
    sequence words = {{"if", Ifk}, {"print", Printk}, {"putc", Putc}, {"while", Whilek}}
    for i = 1 to length(words) do
        put(key_words, words[i][1], words[i][2])
    end for

    -- every character code maps to EOI unless it is a one-character token
    sequence singles = {
        {'{', Lbrace}, {'}', Rbrace}, {'(', Lparen}, {')', Rparen},
        {'+', Add},    {'-', Sub},    {'*', Mul},    {';', Semi},
        {',', Comma},  {'>', Gtr},    {'=', Assign}
    }
    symbols = repeat(EOI, 256)
    for i = 1 to length(singles) do
        symbols[singles[i][1]] = singles[i][2]
    end for
end procedure

-- Lexer driver.  Reads from the file named by the third command-line element
-- when there is one, otherwise from STDIN, and prints one line per token up to
-- and including EOI.
procedure main(sequence cl)
    input_file = STDIN
    if length(cl) > 2 then
        sequence file_name = cl[3]
        input_file = open(file_name, "r")
        if input_file = -1 then
            error("Could not open %s", {file_name})
        end if
    end if
    init()

    sequence t
    while true do
        t = get_tok()
        printf(STDOUT, "line %5d col %5d %-8s", {t[2], t[3], all_syms[t[1]]})
        -- only these three kinds carry a fourth element
        if t[1] = Integerk then
            printf(STDOUT, "  %5d\n",   {t[4]})
        elsif t[1] = Ident then
            printf(STDOUT, " %s\n",     {t[4]})
        elsif t[1] = Stringk then
            printf(STDOUT, " \"%s\"\n", {t[4]})
        else
            printf(STDOUT, "\n")
        end if
        if t[1] = EOI then
            exit
        end if
    end while
end procedure

-- program entry: pass the process command line to main()
main(command_line()) </lang>

FreeBASIC

<lang FreeBASIC> enum Token_type

   ' Token kinds produced by gettok().  scanner() dimensions its name table as
   ' tok_list(tk_eoi to tk_ident), so tk_eoi must stay first and tk_ident last.
   tk_eoi
   tk_print
   tk_putc
   tk_if
   tk_while
   tk_lbrace
   tk_rbrace
   tk_lparen
   tk_rparen
   tk_uminus
   tk_mul
   tk_div
   tk_add
   tk_sub
   tk_lss
   tk_gtr
   tk_leq
   tk_neq
   tk_and
   tk_semi
   tk_comma
   tk_assign
   tk_integer
   tk_string
   tk_ident

end enum

' character constants used by the scanner
const NewLine = chr(10)
const DoubleQuote = chr(34)

' where we store keywords and variables
type Symbol
    s_name as string        ' spelling of the identifier
    tok as Token_type       ' token kind reported for that spelling
end type

' symbol table, grown by install()
dim shared symtab() as Symbol

' scanner state
dim shared cur_line as string       ' text of the current input line
dim shared cur_ch as string         ' current character; "" means end-of-file
dim shared line_num as integer
dim shared col_num as integer

' Returns -1 when ch is non-empty and compares between "0" and "9", else 0.
function is_digit(byval ch as string) as long
    if ch = "" then return 0
    if ch < "0" then return 0
    if ch > "9" then return 0
    return -1
end function

' Returns -1 when ch is a letter (compared case-insensitively against "A".."Z")
' or a digit, and 0 otherwise, including for the empty string.
function is_alnum(byval ch as string) as long
    if ch = "" then return 0
    dim upper as string = UCase(ch)
    if upper >= "A" and upper <= "Z" then return -1
    return is_digit(ch)
end function

' Print the position (eline, ecol) followed by msg, then end the program
' with `system`.
sub error_msg(byval eline as integer, byval ecol as integer, byval msg as string)

   print "("; eline; ":"; ecol; ")"; " "; msg
   system

end sub

' add an identifier to the symbol table
' Appends (s_name, tok) to symtab and returns the index of the new entry.
function install(byval s_name as string, byval tok as Token_type) as integer
    dim n as integer
    n = ubound(symtab) + 1
    redim preserve symtab(n)
    symtab(n).s_name = s_name
    symtab(n).tok    = tok
    return n
end function

' search for an identifier in the symbol table
' Returns the index of the first entry whose name equals s_name, or -1.
function lookup(byval s_name as string) as integer
    dim i as integer
    for i = lbound(symtab) to ubound(symtab)
        if symtab(i).s_name = s_name then return i
    next
    return -1
end function

' Read the next line of the source file (file #1) into cur_line with a
' trailing NewLine appended.  At end-of-file cur_line and cur_ch are left empty.
sub next_line()
    cur_line = ""
    cur_ch = ""             ' empty cur_ch means end-of-file
    if eof(1) = 0 then
        line input #1, cur_line
        cur_line += NewLine
        line_num += 1
        col_num = 1
    end if
end sub

' Advance to the next character, pulling in a new line when the current one is
' used up.  cur_ch is "" when no character is available.
sub next_char()
    col_num += 1
    if col_num > len(cur_line) then next_line()
    if col_num <= len(cur_line) then
        cur_ch = mid(cur_line, col_num, 1)
    else
        cur_ch = ""
    end if
end sub

' Decide between a one- and a two-character token.  When cur_ch equals expect
' it is consumed and ifyes is returned; otherwise ifno is returned, after
' reporting an error if ifno is tk_eoi.
function follow(byval err_line as integer, byval err_col as integer, byval expect as string, byval ifyes as Token_type, byval ifno as Token_type) as Token_type
    if cur_ch <> expect then
        if ifno = tk_eoi then error_msg(err_line, err_col, "follow unrecognized character: " + cur_ch)
        return ifno
    end if
    next_char()
    return ifyes
end function

' Scan the next token.
'   err_line, err_col  receive the position of the token's first character
'   tok                receives the token kind
'   v                  receives the text for integers, identifiers and strings
'                      (character literals store their code as decimal text;
'                      strings keep their surrounding double quotes)
sub gettok(byref err_line as integer, byref err_col as integer, byref tok as Token_type, byref v as string)
    ' skip whitespace
    do while (cur_ch = " " or cur_ch = chr(9) or cur_ch = NewLine) and (cur_ch <> "")
        next_char()
    loop
    err_line = line_num
    err_col  = col_num
    select case cur_ch
        case "":  tok = tk_eoi: exit sub
        case "{": tok = tk_lbrace: next_char(): exit sub
        case "}": tok = tk_rbrace: next_char(): exit sub
        case "(": tok = tk_lparen: next_char(): exit sub
        case ")": tok = tk_rparen: next_char(): exit sub
        case "+": tok = tk_add:    next_char(): exit sub
        case "-": tok = tk_sub:    next_char(): exit sub
        case "*": tok = tk_mul:    next_char(): exit sub
        case ";": tok = tk_semi:   next_char(): exit sub
        case ",": tok = tk_comma:  next_char(): exit sub
        case ">": tok = tk_gtr:    next_char(): exit sub
        case "=": tok = tk_assign: next_char(): exit sub
        case "/": ' div or comment
            next_char()
            if cur_ch <> "*" then
                tok = tk_div
                exit sub
            end if
            ' skip comments
            next_char()
            do
                if cur_ch = "*" then
                    ' stay on a "*" that is not followed by "/", so that a
                    ' comment ending in "**/" is still terminated
                    next_char()
                    if cur_ch = "/" then
                        next_char()
                        gettok(err_line, err_col, tok, v)
                        exit sub
                    end if
                elseif cur_ch = "" then
                    error_msg(err_line, err_col, "EOF in comment")
                else
                    next_char()
                end if
            loop
        case "'":   ' single char literals
            next_char()
            v = str(Asc(cur_ch))
            if cur_ch = "'" then error_msg(err_line, err_col, "empty character constant")
            if cur_ch = "\" then
                next_char()
                if cur_ch = "n" then
                    v = "10"
                elseif cur_ch = "\" then
                    v = Str(Asc("\"))
                else
                    error_msg(err_line, err_col, "unknown escape sequence: " + cur_ch)
                end if
            end if
            next_char()
            if cur_ch <> "'" then error_msg(err_line, err_col, "multi-character constant")
            next_char()
            tok = tk_integer
            exit sub
        case "<": next_char(): tok = follow(err_line, err_col, "=", tk_Leq, tk_Lss): exit sub
        case "!": next_char(): tok = follow(err_line, err_col, "=", tk_Neq, tk_EOI): exit sub
        case "&": next_char(): tok = follow(err_line, err_col, "&", tk_And, tk_EOI): exit sub
        case DoubleQuote: ' string
            v = cur_ch
            next_char()
            do while cur_ch <> DoubleQuote
                if cur_ch = NewLine then error_msg(err_line, err_col, "EOL in string")
                if cur_ch = "" then error_msg(err_line, err_col, "EOF in string")
                v += cur_ch
                next_char()
            loop
            v += cur_ch
            next_char()
            tok = tk_string
            exit sub
        case else   ' integers or identifiers
            dim is_number as boolean = is_digit(cur_ch)
            v = ""
            do while is_alnum(cur_ch) orelse cur_ch = "_"
                if not is_digit(cur_ch) then is_number = false
                v += cur_ch
                next_char()
            loop
            ' nothing consumed: the current character cannot start a token
            if len(v) = 0 then error_msg(err_line, err_col, "unknown character: " + cur_ch)
            if is_digit(mid(v, 1, 1)) then
                if not is_number then error_msg(err_line, err_col, "invalid number: " + v)
                tok = tk_integer
                exit sub
            end if
            ' keywords live in symtab; anything not found is an identifier
            dim as integer index = lookup(v)
            if index = -1 then
                tok = tk_ident
            else
                tok = symtab(index).tok
            end if
            exit sub
    end select
end sub

' Prepare the scanner: register the keywords, open filein as file #1 and load
' the first character.  A file that cannot be opened is reported through
' error_msg() instead of being scanned as empty input.
sub init_lex(byval filein as string)
    install("if",    tk_if)
    install("print", tk_print)
    install("putc",  tk_putc)
    install("while", tk_while)
    ' the function form of open returns non-zero on failure
    if open(filein for input as #1) <> 0 then
        error_msg(0, 0, "cannot open " + filein)
    end if
    cur_line = ""
    line_num = 0
    col_num = 0
    next_char()
end sub

' Print one line per token (position, kind name and, for integers,
' identifiers and strings, the token text) up to and including tk_eoi.
sub scanner()
    dim tok_list(tk_eoi to tk_ident) as string
    tok_list(tk_eoi) = "EOI"
    tok_list(tk_print) = "Print"
    tok_list(tk_putc) = "Putc"
    tok_list(tk_if) = "If"
    tok_list(tk_while) = "While"
    tok_list(tk_lbrace) = "Lbrace"
    tok_list(tk_rbrace) = "Rbrace"
    tok_list(tk_lparen) = "Lparen"
    tok_list(tk_rparen) = "Rparen"
    tok_list(tk_uminus) = "Uminus"
    tok_list(tk_mul) = "Mul"
    tok_list(tk_div) = "Div"
    tok_list(tk_add) = "Add"
    tok_list(tk_sub) = "Sub"
    tok_list(tk_lss) = "Lss"
    tok_list(tk_gtr) = "Gtr"
    tok_list(tk_leq) = "Leq"
    tok_list(tk_neq) = "Neq"
    tok_list(tk_and) = "And"
    tok_list(tk_semi) = "Semi"
    tok_list(tk_comma) = "Comma"
    tok_list(tk_assign) = "Assign"
    tok_list(tk_integer) = "Integer"
    tok_list(tk_string) = "String"
    tok_list(tk_ident) = "Ident"

    dim tok as Token_type
    dim v as string
    dim err_line as integer
    dim err_col as integer
    do
        gettok(err_line, err_col, tok, v)
        print using "line ##### col ##### \       \"; err_line; err_col; tok_list(tok);
        ' only these kinds have text worth showing
        select case tok
            case tk_integer, tk_ident, tk_string
                print " " + v;
        end select
        print
        if tok = tk_eoi then exit do
    loop
end sub

' Program driver: requires a source file name as the first command-line
' argument, then initialises the lexer and prints the token stream.
sub main()
    dim src as string = command(1)
    if src = "" then
        print "filename required"
        system
    end if
    init_lex(src)
    scanner()
end sub

' run the program, then end it
main()
system
</lang>

Python

<lang Python>
from __future__ import print_function
import sys

# following two must remain in the same order

# Token kinds, numbered 0..24.  all_syms[kind] is the printable name of a kind,
# so the two lists must stay in the same order.
EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add, \
    Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident = range(25)

all_syms = [ 'EOI', 'Print', 'Putc', 'If', 'While', 'Lbrace', 'Rbrace', 'Lparen',
    'Rparen', 'Uminus', 'Mul', 'Div', 'Add', 'Sub', 'Lss', 'Gtr', 'Leq', 'Neq', 'And',
    'Semi', 'Comma', 'Assign', 'Integer', 'String', 'Ident' ]

# single character only symbols
symbols = { '{': Lbrace, '}': Rbrace, '(': Lparen, ')': Rparen, '+': Add, '-': Sub,
    '*': Mul, ';': Semi, ',': Comma, '>': Gtr, '=': Assign }

key_words = { 'if': If, 'print': Print, 'putc': Putc, 'while': While }

# lexer state
the_ch = " "    # dummy first char - but it must be a space
the_col = 0
the_line = 1
input_file = None

#*** show error and exit

def error(line, col, msg):
    """Print the position and message on one line, then exit with status 1."""
    report = (line, col, msg)
    print(*report)
    exit(1)
#*** get the next character from the input

def next_ch():
    """Read one character from input_file into the_ch and return it.

    The column advances on every read; a newline starts the next line at
    column 0.  read(1) returns an empty string at end of file.
    """
    global the_ch, the_col, the_line
    the_ch = input_file.read(1)
    if the_ch == '\n':
        the_line, the_col = the_line + 1, 0
    else:
        the_col += 1
    return the_ch
#*** 'x' - character constants

def char_lit(err_line, err_col):
    """Scan a character constant; the_ch is on the opening quote.

    Returns (Integerk, err_line, err_col, n) where n is the integer code of
    the character.  Only the escapes \\n and \\\\ are accepted.
    """
    next_ch()                       # skip opening quote
    if len(the_ch) == 0:
        # ord('') would raise TypeError, so report end of input explicitly
        error(err_line, err_col, "EOF in character constant")
    n = ord(the_ch)
    if the_ch == '\'':
        error(err_line, err_col, "empty character constant")
    elif the_ch == '\\':
        next_ch()
        if the_ch == 'n':
            n = 10
        elif the_ch == '\\':
            n = ord('\\')           # keep n an integer so it prints with %d
        else:
            error(err_line, err_col, "unknown escape sequence \\%s" % (the_ch))
    if next_ch() != '\'':
        error(err_line, err_col, "multi-character constant")
    next_ch()
    return Integerk, err_line, err_col, n
#*** process divide or comments

def div_or_cmt(err_line, err_col):
    """Scan a '/' token or skip a /* ... */ comment; the_ch is on the '/'.

    Returns the Div token unless a '*' follows, in which case the comment is
    skipped and the token after it is returned.  EOF inside a comment is an
    error.
    """
    if next_ch() != '*':
        return Div, err_line, err_col
    # comment found
    next_ch()
    while True:
        if the_ch == '*':
            # stay on a '*' that is not followed by '/', so that a comment
            # ending in "**/" is still terminated
            if next_ch() == '/':
                next_ch()
                return gettok()
        elif len(the_ch) == 0:
            error(err_line, err_col, "EOF in comment")
        else:
            next_ch()
#*** "string"

def string_lit(start, err_line, err_col):
    """Scan a string literal; the_ch is on the opening delimiter `start`.

    Returns (Stringk, err_line, err_col, text) with the delimiters removed.
    EOF or a newline before the closing delimiter is an error.
    """
    pieces = []
    while True:
        ch = next_ch()
        if ch == start:
            break
        if len(ch) == 0:
            error(err_line, err_col, "EOF while scanning string literal")
        if ch == '\n':
            error(err_line, err_col, "EOL while scanning string literal")
        pieces.append(ch)
    next_ch()                       # step past the closing delimiter
    return Stringk, err_line, err_col, "".join(pieces)
#*** handle identifiers and integers

def ident_or_int(err_line, err_col):
    """Scan a run of alphanumerics and underscores starting at the_ch.

    A run starting with a digit must be all digits and gives an Integerk
    token; a run found in key_words gives that keyword's token; anything else
    is an Ident token carrying its text.
    """
    text = ""
    all_digits = True
    while the_ch == '_' or the_ch.isalnum():
        all_digits = all_digits and the_ch.isdigit()
        text += the_ch
        next_ch()

    # nothing consumed: the current character cannot start any token
    if not text:
        error(err_line, err_col, "ident_or_int: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))

    if text[0].isdigit():
        if not all_digits:
            error(err_line, err_col, "invalid number: %s" % (text))
        return Integerk, err_line, err_col, int(text)

    keyword = key_words.get(text)
    if keyword is not None:
        return keyword, err_line, err_col
    return Ident, err_line, err_col, text
#*** look ahead for '>=', etc.

def follow(expect, ifyes, ifno, err_line, err_col):
    """Choose between a one- and a two-character token.

    Reads the next character: if it equals `expect` the token `ifyes` is
    returned, otherwise `ifno`.  Passing EOI as `ifno` means the single
    character is not a token on its own, so a mismatch is an error.
    """
    if next_ch() == expect:
        next_ch()
        return ifyes, err_line, err_col
    if ifno == EOI:
        if len(the_ch) == 0:
            # ord('') would raise TypeError; report end of input instead
            error(err_line, err_col, "follow: unexpected end of file")
        error(err_line, err_col, "follow: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))
    return ifno, err_line, err_col
#*** return the next token type

def gettok():
    """Return the next token as a tuple (kind, line, col[, value]).

    Skips whitespace, records where the token starts and dispatches on its
    first character.  An empty the_ch means end of input and yields EOI.
    """
    while the_ch.isspace():
        next_ch()
    err_line = the_line
    err_col  = the_col
    if len(the_ch) == 0:    return EOI, err_line, err_col
    elif the_ch == '/':     return div_or_cmt(err_line, err_col)
    elif the_ch == '\'':    return char_lit(err_line, err_col)
    elif the_ch == '<':     return follow('=', Leq, Lss, err_line, err_col)
    elif the_ch == '!':     return follow('=', Neq, EOI, err_line, err_col)
    elif the_ch == '&':     return follow('&', And, EOI, err_line, err_col)
    elif the_ch == '"':     return string_lit(the_ch, err_line, err_col)
    elif the_ch in symbols:
        sym = symbols[the_ch]
        next_ch()
        return sym, err_line, err_col
    else: return ident_or_int(err_line, err_col)
#*** main driver

# Read from the file named on the command line when there is one, else stdin.
input_file = sys.stdin
if len(sys.argv) > 1:
    try:
        input_file = open(sys.argv[1], "r", 4096)
    except IOError as e:
        error(0, 0, "Can't open %s" % sys.argv[1])

# Print one line per token, up to and including EOI.
while True:
    t = gettok()
    tok  = t[0]
    line = t[1]
    col  = t[2]

    # end='' keeps the optional token value on the same output line
    print("line %5d  col %5d %-8s" % (line, col, all_syms[tok]), end='')

    if tok == Integerk:  print("  %5d" % (t[3]))
    elif tok == Ident:   print(" %s" %   (t[3]))
    elif tok == Stringk: print(' "%s"' % (t[3]))
    else:                print("")

    if tok == EOI:
        break

</lang>