Compiler/lexical analyzer
For example, the following two program fragments are equivalent, and should produce the same token stream except for the line and column positions:
 
* <langsyntaxhighlight lang="c">if ( p /* meaning n is prime */ ) {
print ( n , " " ) ;
count = count + 1 ; /* number of primes found so far */
}</syntaxhighlight>
* <syntaxhighlight lang="c">if(p){print(n," ");count=count+1;}</syntaxhighlight>
 
;Complete list of token names
# the token value (only for <tt>Identifier</tt>, <tt>Integer</tt>, and <tt>String</tt> tokens)
# the number of spaces between fields is up to you. Neatly aligned is nice, but not a requirement.
<br>
 
This task is intended to be used as part of a pipeline, together with the other compiler tasks, for example:
<br><b>lex < hello.t | parse | gen | vm</b>
 
Or possibly:
<br><b>lex hello.t lex.out</b>
<br><b>parse lex.out parse.out</b>
<br><b>gen parse.out gen.out</b>
<br><b>vm gen.out</b>
 
<br>
This implies that the output of this task (the lexical analyzer) should be suitable as input to any of the [[Compiler/syntax_analyzer|Syntax Analyzer task]] programs.
 
{{task heading|Diagnostics}}
| style="vertical-align:top" |
Test Case 1:
<langsyntaxhighlight lang="c">/*
Hello world
*/
print("Hello, World!\n");</langsyntaxhighlight>
 
| style="vertical-align:top" |
| style="vertical-align:top" |
Test Case 2:
<langsyntaxhighlight lang="c">/*
Show Ident and Integers
*/
phoenix_number = 142857;
print(phoenix_number, "\n");</syntaxhighlight>
 
| style="vertical-align:top" |
| style="vertical-align:top" |
Test Case 3:
<langsyntaxhighlight lang="c">/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
/* character literal */ '\n'
/* character literal */ '\\'
/* character literal */ ' '</syntaxhighlight>
 
| style="vertical-align:top" |
| style="vertical-align:top" |
Test Case 4:
<langsyntaxhighlight lang="c">/*** test printing, embedded \n and comments with lots of '*' ***/
print(42);
print("\nHello World\nGood Bye\nok\n");
print("Print a slash n - \\n.\n");</langsyntaxhighlight>
 
| style="vertical-align:top" |
;Additional examples
Your solution should pass all the test cases above and the additional tests found '''[[Compiler/Sample_programs|Here]]'''.
 
 
{{task heading|Reference}}
<hr>
<br><br>
 
=={{header|Ada}}==
<syntaxhighlight lang="ada">with Ada.Text_IO, Ada.Streams.Stream_IO, Ada.Strings.Unbounded, Ada.Command_Line,
Ada.Exceptions;
use Ada.Strings, Ada.Strings.Unbounded, Ada.Streams, Ada.Exceptions;
 
procedure Main is
package IO renames Ada.Text_IO;
 
package Lexer is
type Token is (Op_multiply, Op_divide, Op_mod, Op_add, Op_subtract, Op_negate,
Op_less, Op_lessequal, Op_greater, Op_greaterequal, Op_equal,
Op_notequal, Op_not, Op_assign, Op_and, Op_or,
 
LeftParen, RightParen, LeftBrace, RightBrace, Semicolon, Comma,
 
Keyword_if, Keyword_else, Keyword_while, Keyword_print, Keyword_putc,
Identifier, Token_Integer, Token_String, End_of_input,
 
Empty_Char_Error, Invalid_Escape_Error, Multi_Char_Error, EOF_Comment_Error,
EOF_String_Error, EOL_String_Error, Invalid_Char_Error, Invalid_Num_Error
);
 
subtype Operator is Token range Op_multiply .. Op_or;
subtype Symbol is Token range Token'Succ(Operator'Last) .. Comma;
subtype Keyword is Token range Token'Succ(Symbol'Last) .. Keyword_putc;
subtype Error is Token range Empty_Char_Error .. Invalid_Num_Error;
subtype Operator_or_Error is Token
with Static_Predicate => Operator_or_Error in Operator | Error;
 
subtype Whitespace is Character
with Static_Predicate => Whitespace in ' ' | ASCII.HT | ASCII.CR | ASCII.LF;
 
Lexer_Error : exception;
Invalid_Escape_Code : constant Character := ASCII.NUL;
 
procedure run(input : Stream_IO.File_Type);
end Lexer;
 
package body Lexer is
use type Stream_IO.Count;
 
procedure run(input : Stream_IO.File_Type) is
type State is (State_Start, State_Identifier, State_Integer, State_Char, State_String,
State_Comment);
curr_state : State := State_Start;
curr_char : Character;
curr_col, curr_row, token_col, token_row : Positive := 1;
token_text : Unbounded_String := Unbounded.Null_Unbounded_String;
 
function look_ahead return Character is
next_char : Character := ASCII.LF;
begin
if not Stream_IO.End_Of_File(input) then
next_char := Character'Input(Stream_IO.Stream(input));
Stream_IO.Set_Index(input, Stream_IO.Index(input) - 1);
end if;
return next_char;
end look_ahead;
 
procedure next_char is
next : Character := Character'Input(Stream_IO.Stream(input));
begin
curr_col := curr_col + 1;
if curr_char = ASCII.LF then
curr_row := curr_row + 1;
curr_col := 1;
end if;
curr_char := next;
end next_char;
 
procedure print_token(tok : Token; text : String := "") is
procedure raise_error(text : String) is
begin
raise Lexer_Error with "Error: " & text;
end;
begin
IO.Put(token_row'Image & ASCII.HT & token_col'Image & ASCII.HT);
case tok is
when Operator | Symbol | Keyword | End_of_input => IO.Put_Line(tok'Image);
when Token_Integer => IO.Put_Line("INTEGER" & ASCII.HT & text);
when Token_String => IO.Put_Line("STRING" & ASCII.HT & ASCII.Quotation & text & ASCII.Quotation);
when Identifier => IO.Put_Line(tok'Image & ASCII.HT & text);
when Empty_Char_Error => raise_error("empty character constant");
when Invalid_Escape_Error => raise_error("unknown escape sequence: " & text);
when Multi_Char_Error => raise_error("multi-character constant: " & text);
when EOF_Comment_Error => raise_error("EOF in comment");
when EOF_String_Error => raise_error("EOF in string");
when EOL_String_Error => raise_error("EOL in string");
when Invalid_Char_Error => raise_error("invalid character: " & curr_char);
when Invalid_Num_Error => raise_error("invalid number: " & text);
end case;
end print_token;
 
procedure lookahead_choose(determiner : Character; a, b : Operator_or_Error) is
begin
if look_ahead = determiner then
print_token(a);
next_char;
else
print_token(b);
end if;
end lookahead_choose;
 
function to_escape_code(c : Character) return Character is
begin
case c is
when 'n' => return ASCII.LF;
when '\' => return '\';
when others =>
print_token(Invalid_Escape_Error, ASCII.Back_Slash & c);
return Invalid_Escape_Code;
end case;
end to_escape_code;
begin
curr_char := Character'Input(Stream_IO.Stream(input));
loop
case curr_state is
when State_Start =>
token_col := curr_col;
token_row := curr_row;
case curr_char is
when '*' => print_token(Op_multiply);
when '/' =>
if look_ahead = '*' then
next_char;
curr_state := State_Comment;
else
print_token(Op_divide);
end if;
when '%' => print_token(Op_mod);
when '+' => print_token(Op_add);
when '-' => print_token(Op_subtract);
when '(' => print_token(LeftParen);
when ')' => print_token(RightParen);
when '{' => print_token(LeftBrace);
when '}' => print_token(RightBrace);
when ';' => print_token(Semicolon);
when ',' => print_token(Comma);
when '<' => lookahead_choose('=', Op_lessequal, Op_less);
when '>' => lookahead_choose('=', Op_greaterequal, Op_greater);
when '!' => lookahead_choose('=', Op_notequal, Op_not);
when '=' => lookahead_choose('=', Op_equal, Op_assign);
when '&' => lookahead_choose('&', Op_and, Invalid_Char_Error);
when '|' => lookahead_choose('|', Op_or, Invalid_Char_Error);
when 'a' .. 'z' | 'A' .. 'Z' | '_' =>
Unbounded.Append(token_text, curr_char);
curr_state := State_Identifier;
when '0' .. '9' =>
Unbounded.Append(token_text, curr_char);
curr_state := State_Integer;
when ''' => curr_state := State_Char;
when ASCII.Quotation => curr_state := State_String;
when Whitespace => null;
when others => null;
end case;
next_char;
 
when State_Identifier =>
case curr_char is
when 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '_' =>
Unbounded.Append(token_text, curr_char);
next_char;
when others =>
if token_text = "if" then
print_token(Keyword_if);
elsif token_text = "else" then
print_token(Keyword_else);
elsif token_text = "while" then
print_token(Keyword_while);
elsif token_text = "print" then
print_token(Keyword_print);
elsif token_text = "putc" then
print_token(Keyword_putc);
else
print_token(Identifier, To_String(token_text));
end if;
Unbounded.Set_Unbounded_String(token_text, "");
curr_state := State_Start;
end case;
 
when State_Integer =>
case curr_char is
when '0' .. '9' =>
Unbounded.Append(token_text, curr_char);
next_char;
when 'a' .. 'z' | 'A' .. 'Z' | '_' =>
print_token(Invalid_Num_Error, To_String(token_text));
when others =>
print_token(Token_Integer, To_String(token_text));
Unbounded.Set_Unbounded_String(token_text, "");
curr_state := State_Start;
end case;
 
when State_Char =>
case curr_char is
when ''' =>
if Unbounded.Length(token_text) = 0 then
print_token(Empty_Char_Error);
elsif Unbounded.Length(token_text) = 1 then
print_token(Token_Integer, Character'Pos(Element(token_text, 1))'Image);
else
print_token(Multi_Char_Error, To_String(token_text));
end if;
Set_Unbounded_String(token_text, "");
curr_state := State_Start;
when '\' =>
Unbounded.Append(token_text, to_escape_code(look_ahead));
next_char;
when others => Unbounded.Append(token_text, curr_char);
end case;
next_char;
 
when State_String =>
case curr_char is
when ASCII.Quotation =>
print_token(Token_String, To_String(token_text));
Set_Unbounded_String(token_text, "");
curr_state := State_Start;
when '\' =>
if to_escape_code(look_ahead) /= Invalid_Escape_Code then
Unbounded.Append(token_text, curr_char);
end if;
when ASCII.LF | ASCII.CR => print_token(EOL_String_Error);
when others => Unbounded.Append(token_text, curr_char);
end case;
next_char;
 
when State_Comment =>
case curr_char is
when '*' =>
if look_ahead = '/' then
next_char;
curr_state := State_Start;
end if;
when others => null;
end case;
next_char;
end case;
end loop;
exception
when error : Stream_IO.End_Error =>
if curr_state = State_String then
print_token(EOF_String_Error);
else
print_token(End_of_input);
end if;
when error : Lexer.Lexer_Error => IO.Put_Line(Exception_Message(error));
end run;
end Lexer;
 
source_file : Stream_IO.File_Type;
begin
if Ada.Command_Line.Argument_Count < 1 then
IO.Put_Line("usage: lex [filename]");
return;
end if;
Stream_IO.Open(source_file, Stream_IO.In_File, Ada.Command_Line.Argument(1));
Lexer.run(source_file);
exception
when error : others => IO.Put_Line("Error: " & Exception_Message(error));
end Main;
</syntaxhighlight>
{{out}} Test case 3:
<pre>
5 16 KEYWORD_PRINT
5 40 OP_SUBTRACT
6 16 KEYWORD_PUTC
6 40 OP_LESS
7 16 KEYWORD_IF
7 40 OP_GREATER
8 16 KEYWORD_ELSE
8 40 OP_LESSEQUAL
9 16 KEYWORD_WHILE
9 40 OP_GREATEREQUAL
10 16 LEFTBRACE
10 40 OP_EQUAL
11 16 RIGHTBRACE
11 40 OP_NOTEQUAL
12 16 LEFTPAREN
12 40 OP_AND
13 16 RIGHTPAREN
13 40 OP_OR
14 16 OP_SUBTRACT
14 40 SEMICOLON
15 16 OP_NOT
15 40 COMMA
16 16 OP_MULTIPLY
16 40 OP_ASSIGN
17 16 OP_DIVIDE
17 40 INTEGER 42
18 16 OP_MOD
18 40 STRING "String literal"
19 16 OP_ADD
19 40 IDENTIFIER variable_name
20 26 INTEGER 10
21 26 INTEGER 92
22 26 INTEGER 32
23 1 END_OF_INPUT
</pre>
 
=={{header|ALGOL 68}}==
This is a simple ''token in, line out'' program: it does not build an internal representation of the tokens, since none is needed for this task.
 
In addition, it emits a diagnostic if an integer literal is too big to represent.
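One common way to perform such a check in plain C is sketched below (illustrative only, not part of the ALGOL 68 entry, which instead widens to <code>LENG</code> arithmetic for the comparison): before appending each digit, test whether the accumulated value would exceed the largest representable integer.

<syntaxhighlight lang="c">/* Illustrative sketch: overflow-checked accumulation of a decimal literal. */
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Append one decimal digit to n, aborting instead of overflowing. */
static int add_digit(int n, int digit) {
    if (n > (INT_MAX - digit) / 10) {
        fputs("Integer too big\n", stderr);
        exit(1);
    }
    return n * 10 + digit;
}

int main(void) {
    const char *lit = "2147483648";   /* INT_MAX + 1 on machines with 32-bit int */
    int n = 0;
    for (const char *p = lit; *p != '\0'; ++p)
        n = add_digit(n, *p - '0');
    printf("%d\n", n);
    return 0;
}</syntaxhighlight>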
<syntaxhighlight lang="algol68">BEGIN
# implement C-like getchar, where EOF and EOLn are "characters" (-1 and 10 resp.). #
INT eof = -1, eoln = 10;
BOOL eof flag := FALSE;
STRING buf := "";
INT col := 1;
INT line := 0;
on logical file end (stand in, (REF FILE f)BOOL: eof flag := TRUE);
PROC getchar = INT:
IF eof flag THEN eof
ELIF col = UPB buf THEN col +:= 1; eoln
ELIF col > UPB buf THEN IF line > 0 THEN read(newline) FI;
line +:= 1;
read(buf);
IF eof flag THEN col := 1; eof
ELSE col := 0; getchar
FI
ELSE col +:= 1; ABS buf[col]
FI;
PROC nextchar = INT: IF eof flag THEN eof ELIF col >= UPB buf THEN eoln ELSE ABS buf[col+1] FI;
 
PROC is blank = (INT ch) BOOL: ch = 0 OR ch = 9 OR ch = 10 OR ch = 13 OR ch = ABS " ";
PROC is digit = (INT ch) BOOL: ch >= ABS "0" AND ch <= ABS "9";
PROC is ident start = (INT ch) BOOL: ch >= ABS "A" AND ch <= ABS "Z" OR
ch >= ABS "a" AND ch <= ABS "z" OR
ch = ABS "_";
PROC is ident = (INT ch) BOOL: is ident start(ch) OR is digit(ch);
 
PROC ident or keyword = (INT start char) VOID:
BEGIN
STRING w := REPR start char;
INT start col = col;
WHILE is ident (next char) DO w +:= REPR getchar OD;
IF w = "if" THEN output2("Keyword_if", start col)
ELIF w = "else" THEN output2("Keyword_else", start col)
ELIF w = "while" THEN output2("Keyword_while", start col)
ELIF w = "print" THEN output2("Keyword_print", start col)
ELIF w = "putc" THEN output2("Keyword_putc", start col)
ELSE output2("Identifier " + w, start col)
FI
END;
PROC char = VOID:
BEGIN
INT start col = col;
INT ch := getchar;
IF ch = ABS "'" THEN error("Empty character constant")
ELIF ch = ABS "\" THEN ch := getchar;
IF ch = ABS "n" THEN ch := 10
ELIF ch = ABS "\" THEN SKIP
ELSE error("Unknown escape sequence. \" + REPR ch)
FI
FI;
IF nextchar /= ABS "'" THEN error("Multi-character constant.") FI;
getchar;
output2("Integer " + whole(ch, 0), start col)
END;
PROC string = VOID:
BEGIN
INT start col = col;
STRING s := """";
WHILE INT ch := getchar; ch /= ABS """"
DO
IF ch = eoln THEN error("End-of-line while scanning string literal. Closing string character not found before end-of-line.")
ELIF ch = eof THEN error("End-of-file while scanning string literal. Closing string character not found.")
ELIF ch = ABS "\" THEN s +:= REPR ch; ch := getchar;
IF ch /= ABS "\" AND ch /= ABS "n" THEN error("Unknown escape sequence. \" + REPR ch) FI;
s +:= REPR ch
ELSE s +:= REPR ch
FI
OD;
output2("String " + s + """", start col)
END;
PROC comment = VOID:
BEGIN
WHILE INT ch := getchar; NOT (ch = ABS "*" AND nextchar = ABS "/")
DO IF ch = eof THEN error("End-of-file in comment. Closing comment characters not found.") FI
OD;
getchar
END;
PROC number = (INT first digit) VOID:
BEGIN
INT start col = col;
INT n := first digit - ABS "0";
WHILE is digit (nextchar) DO
INT u := getchar - ABS "0";
IF LENG n * 10 + LENG u > max int THEN error("Integer too big") FI;
n := n * 10 + u
OD;
IF is ident start (nextchar) THEN error("Invalid number. Starts like a number, but ends in non-numeric characters.") FI;
output2("Integer " + whole(n, 0), start col)
END;
 
PROC output = (STRING s) VOID: output2(s, col);
PROC output2 = (STRING s, INT col) VOID: print((whole(line,-8), whole(col,-8), " ", s, newline));
 
PROC if follows = (CHAR second, STRING longer, shorter) VOID:
IF nextchar = ABS second
THEN output(longer); getchar
ELSE output(shorter)
FI;
PROC error = (STRING s)VOID: (put(stand error, ("At ", whole(line,0), ":", whole(col,0), " ", s, new line)); stop);
PROC unrecognized = (INT char) VOID: error("Unrecognized character " + REPR char);
PROC double char = (INT first, STRING op) VOID:
IF nextchar /= first THEN unrecognized(first)
ELSE output2(op, col-1); getchar
FI;
 
WHILE INT ch := getchar; ch /= eof
DO
IF is blank(ch) THEN SKIP
ELIF ch = ABS "(" THEN output("LeftParen")
ELIF ch = ABS ")" THEN output("RightParen")
ELIF ch = ABS "{" THEN output("LeftBrace")
ELIF ch = ABS "}" THEN output("RightBrace")
ELIF ch = ABS ";" THEN output("Semicolon")
ELIF ch = ABS "," THEN output("Comma")
ELIF ch = ABS "*" THEN output("Op_multiply")
ELIF ch = ABS "/" THEN IF next char = ABS "*" THEN comment
ELSE output("Op_divide")
FI
ELIF ch = ABS "%" THEN output("Op_mod")
ELIF ch = ABS "+" THEN output("Op_add")
ELIF ch = ABS "-" THEN output("Op_subtract")
ELIF ch = ABS "<" THEN if follows("=", "Op_lessequal", "Op_less")
ELIF ch = ABS ">" THEN if follows("=", "Op_greaterequal", "Op_greater")
ELIF ch = ABS "=" THEN if follows("=", "Op_equal", "Op_assign")
ELIF ch = ABS "!" THEN if follows("=", "Op_notequal", "Op_not")
ELIF ch = ABS "&" THEN double char(ch, "Op_and")
ELIF ch = ABS "|" THEN double char(ch, "Op_or")
ELIF is ident start (ch) THEN ident or keyword (ch)
ELIF ch = ABS """" THEN string
ELIF ch = ABS "'" THEN char
ELIF is digit(ch) THEN number(ch)
ELSE unrecognized(ch)
FI
OD;
output("End_Of_Input")
END</syntaxhighlight>
 
=={{header|ALGOL W}}==
<langsyntaxhighlight lang="algolw">begin
%lexical analyser %
% Algol W strings are limited to 256 characters in length so we limit source lines %
while nextToken not = tEnd_of_input do writeToken;
writeToken
end.</syntaxhighlight>
{{out}} Test case 3:
<pre>
23 1 End_of_input
</pre>
 
=={{header|ATS}}==
 
One interesting feature of this implementation is my liberal use of a pushback buffer for input characters. This kept the code modular and easier to write.
 
(One point of note: the C "EOF" pseudo-character is detected in the following code by testing for a negative value. The ISO C standard implies that EOF is negative and that all other character values returned from input are non-negative.)
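
The same two ideas can be shown as a small, self-contained C sketch (illustrative only; the <code>source_t</code> type and the fixed <code>PUSHBACK_MAX</code> limit are invented here and do not correspond to anything in the ATS code, which uses an unbounded list for pushback):

<syntaxhighlight lang="c">/* Illustrative sketch only, not part of the ATS entry: a character source with
   a small fixed-size pushback stack, and end of input detected as a negative
   value.  getc() returns EOF, which is negative, while every successfully read
   character is non-negative. */
#include <stdio.h>

#define PUSHBACK_MAX 16   /* invented limit, ample for a lexer's lookahead needs */

typedef struct {
    FILE *fp;
    int   stack[PUSHBACK_MAX];
    int   top;
} source_t;

/* Return the next character, taking pushed-back characters first. */
static int get_ch(source_t *s) {
    if (s->top > 0)
        return s->stack[--s->top];
    return getc(s->fp);            /* negative on end of input */
}

/* Push a character back so the next get_ch() returns it again. */
static void push_back(source_t *s, int ch) {
    if (s->top < PUSHBACK_MAX)
        s->stack[s->top++] = ch;
}

int main(void) {
    source_t src = { stdin, {0}, 0 };
    int ch = get_ch(&src);
    if (ch < 0) {                  /* end of input */
        puts("empty input");
    } else {
        push_back(&src, ch);       /* only peeked at it; put it back */
        printf("first character: %c\n", get_ch(&src));
    }
    return 0;
}</syntaxhighlight>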
 
<syntaxhighlight lang="ats">(********************************************************************)
(* Usage: lex [INPUTFILE [OUTPUTFILE]]
If INPUTFILE or OUTPUTFILE is "-" or missing, then standard input
or standard output is used, respectively. *)
 
#define ATS_DYNLOADFLAG 0
 
#include "share/atspre_staload.hats"
staload UN = "prelude/SATS/unsafe.sats"
 
#define NIL list_nil ()
#define :: list_cons
 
%{^
/* alloca(3) is needed for ATS exceptions. */
#include <alloca.h>
%}
 
(********************************************************************)
 
#define NUM_TOKENS 31
#define RESERVED_WORD_HASHTAB_SIZE 9
 
#define TOKEN_ELSE 0
#define TOKEN_IF 1
#define TOKEN_PRINT 2
#define TOKEN_PUTC 3
#define TOKEN_WHILE 4
#define TOKEN_MULTIPLY 5
#define TOKEN_DIVIDE 6
#define TOKEN_MOD 7
#define TOKEN_ADD 8
#define TOKEN_SUBTRACT 9
#define TOKEN_NEGATE 10
#define TOKEN_LESS 11
#define TOKEN_LESSEQUAL 12
#define TOKEN_GREATER 13
#define TOKEN_GREATEREQUAL 14
#define TOKEN_EQUAL 15
#define TOKEN_NOTEQUAL 16
#define TOKEN_NOT 17
#define TOKEN_ASSIGN 18
#define TOKEN_AND 19
#define TOKEN_OR 20
#define TOKEN_LEFTPAREN 21
#define TOKEN_RIGHTPAREN 22
#define TOKEN_LEFTBRACE 23
#define TOKEN_RIGHTBRACE 24
#define TOKEN_SEMICOLON 25
#define TOKEN_COMMA 26
#define TOKEN_IDENTIFIER 27
#define TOKEN_INTEGER 28
#define TOKEN_STRING 29
#define TOKEN_END_OF_INPUT 30
 
typedef token_t =
[i : int | TOKEN_ELSE <= i; i <= TOKEN_END_OF_INPUT]
int i
typedef tokentuple_t = (token_t, String, ullint, ullint)
typedef token_names_vt = @[string][NUM_TOKENS]
 
vtypedef reserved_words_vt =
@[String][RESERVED_WORD_HASHTAB_SIZE]
vtypedef reserved_word_tokens_vt =
@[token_t][RESERVED_WORD_HASHTAB_SIZE]
 
vtypedef lookups_vt =
[p_toknames : addr]
[p_wordtab : addr]
[p_toktab : addr]
@{
pf_toknames = token_names_vt @ p_toknames,
pf_wordtab = reserved_words_vt @ p_wordtab,
pf_toktab = reserved_word_tokens_vt @ p_toktab |
toknames = ptr p_toknames,
wordtab = ptr p_wordtab,
toktab = ptr p_toktab
}
 
fn
reserved_word_lookup
(s : String,
lookups : !lookups_vt,
line_no : ullint,
column_no : ullint) : tokentuple_t =
if string_length s < 2 then
(TOKEN_IDENTIFIER, s, line_no, column_no)
else
let
macdef wordtab = !(lookups.wordtab)
macdef toktab = !(lookups.toktab)
val hashval =
g1uint_mod (g1ofg0 (char2ui s[0] + char2ui s[1]),
g1i2u RESERVED_WORD_HASHTAB_SIZE)
val token = toktab[hashval]
in
if token = TOKEN_IDENTIFIER || s <> wordtab[hashval] then
(TOKEN_IDENTIFIER, s, line_no, column_no)
else
(token, s, line_no, column_no)
end
 
(********************************************************************)
(* Input allows pushback into a buffer. *)
 
typedef ch_t =
@{
ichar = int,
line_no = ullint,
column_no = ullint
}
 
typedef inp_t (n : int) =
[0 <= n]
@{
file = FILEref,
pushback = list (ch_t, n),
line_no = ullint,
column_no = ullint
}
typedef inp_t = [n : int] inp_t n
 
fn
get_ch (inp : inp_t) : (ch_t, inp_t) =
case+ (inp.pushback) of
| NIL =>
let
val c = fileref_getc (inp.file)
val ch =
@{
ichar = c,
line_no = inp.line_no,
column_no = inp.column_no
}
in
if c = char2i '\n' then
let
val inp =
@{
file = inp.file,
pushback = inp.pushback,
line_no = succ (inp.line_no),
column_no = 1ULL
}
in
(ch, inp)
end
else
let
val inp =
@{
file = inp.file,
pushback = inp.pushback,
line_no = inp.line_no,
column_no = succ (inp.column_no)
}
in
(ch, inp)
end
end
| ch :: pushback =>
let
val inp =
@{
file = inp.file,
pushback = pushback,
line_no = inp.line_no,
column_no = inp.column_no
}
in
(ch, inp)
end
 
fn
push_back_ch (ch : ch_t,
inp : inp_t) : [n : pos] inp_t n =
let
prval _ = lemma_list_param (inp.pushback)
in
@{
file = inp.file,
pushback = ch :: (inp.pushback),
line_no = inp.line_no,
column_no = inp.column_no
}
end
 
(********************************************************************)
 
exception unterminated_comment of (ullint, ullint)
exception unterminated_character_literal of (ullint, ullint)
exception multicharacter_literal of (ullint, ullint)
exception unterminated_string_literal of (ullint, ullint, bool)
exception unsupported_escape of (ullint, ullint, int)
exception invalid_integer_literal of (ullint, ullint, String)
exception unexpected_character of (ullint, ullint, int)
 
fn
scan_comment (inp : inp_t,
line_no : ullint,
column_no : ullint) : inp_t =
let
fun
loop (inp : inp_t) : inp_t =
let
val (ch, inp) = get_ch inp
in
if (ch.ichar) < 0 then
$raise unterminated_comment (line_no, column_no)
else if (ch.ichar) = char2i '*' then
let
val (ch1, inp) = get_ch inp
in
if (ch.ichar) < 0 then
$raise unterminated_comment (line_no, column_no)
else if (ch1.ichar) = char2i '/' then
inp
else
loop inp
end
else
loop inp
end
in
loop inp
end
 
fn
skip_spaces_and_comments (inp : inp_t) : [n : pos] inp_t n =
let
fun
loop (inp : inp_t) : [n : pos] inp_t n =
let
val (ch, inp) = get_ch inp
in
if isspace (ch.ichar) then
loop inp
else if (ch.ichar) = char2i '/' then
let
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) = char2i '*' then
loop (scan_comment (inp, ch.line_no, ch.column_no))
else
let
val inp = push_back_ch (ch1, inp)
val inp = push_back_ch (ch, inp)
in
inp
end
end
else
push_back_ch (ch, inp)
end
in
loop inp
end
 
fn
reverse_list_to_string
{m : int}
(lst : list (char, m)) : string m =
let
fun
fill_array {n : nat | n <= m} .<n>.
(arr : &(@[char][m + 1]),
lst : list (char, n),
n : size_t n) : void =
case+ lst of
| NIL => ()
| c :: tail =>
begin
arr[pred n] := c;
fill_array (arr, tail, pred n)
end
 
prval _ = lemma_list_param lst
val m : size_t m = i2sz (list_length lst)
val (pf, pfgc | p) = array_ptr_alloc<char> (succ m)
val _ = array_initize_elt<char> (!p, succ m, '\0')
val _ = fill_array (!p, lst, m)
in
$UN.castvwtp0 @(pf, pfgc | p)
end
 
extern fun {}
simple_scan$pred : int -> bool
fun {}
simple_scan {u : nat}
(lst : list (char, u),
inp : inp_t) :
[m : nat]
[n : pos]
(list (char, m), inp_t n) =
let
val (ch, inp) = get_ch inp
in
if simple_scan$pred (ch.ichar) then
simple_scan<> (int2char0 (ch.ichar) :: lst, inp)
else
let
val inp = push_back_ch (ch, inp)
in
(lst, inp)
end
end
 
fn
is_ident_start (c : int) :<> bool =
isalpha (c) || c = char2i '_'
 
fn
is_ident_continuation (c : int) :<> bool =
isalnum (c) || c = char2i '_'
 
fn
scan_identifier_or_reserved_word
(inp : inp_t,
lookups : !lookups_vt) :
(tokentuple_t, [n : pos] inp_t n) =
let
val (ch, inp) = get_ch inp
val _ = assertloc (is_ident_start (ch.ichar))
 
implement simple_scan$pred<> c = is_ident_continuation c
val (lst, inp) = simple_scan (int2char0 (ch.ichar) :: NIL, inp)
 
val s = reverse_list_to_string lst
val toktup =
reserved_word_lookup (s, lookups, ch.line_no, ch.column_no)
in
(toktup, inp)
end
 
fn
scan_integer_literal
(inp : inp_t,
lookups : !lookups_vt) :
(tokentuple_t, [n : pos] inp_t n) =
let
val (ch, inp) = get_ch inp
val _ = assertloc (isdigit (ch.ichar))
 
implement simple_scan$pred<> c = is_ident_continuation c
val (lst, inp) = simple_scan (int2char0 (ch.ichar) :: NIL, inp)
 
val s = reverse_list_to_string lst
 
fun
check_they_are_all_digits
{n : nat} .<n>.
(lst : list (char, n)) : void =
case+ lst of
| NIL => ()
| c :: tail =>
if isdigit c then
check_they_are_all_digits tail
else
$raise invalid_integer_literal (ch.line_no, ch.column_no, s)
 
val _ = check_they_are_all_digits lst
in
((TOKEN_INTEGER, s, ch.line_no, ch.column_no), inp)
end
 
fn
ichar2integer_literal (c : int) : String0 =
let
var buf = @[char][100] ('\0')
val _ = $extfcall (int, "snprintf", addr@ buf, i2sz 99, "%d", c)
val s = string1_copy ($UN.castvwtp0{String0} buf)
in
strnptr2string s
end
 
fn
scan_character_literal_without_checking_end (inp : inp_t) :
(tokentuple_t, inp_t) =
let
val (ch, inp) = get_ch inp
val _ = assertloc ((ch.ichar) = '\'')
 
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) < 0 then
$raise unterminated_character_literal (ch.line_no, ch.column_no)
else if (ch1.ichar) = char2i '\\' then
let
val (ch2, inp) = get_ch inp
in
if (ch2.ichar) < 0 then
$raise unterminated_character_literal (ch.line_no,
ch.column_no)
else if (ch2.ichar) = char2i 'n' then
let
val s = ichar2integer_literal (char2i '\n')
in
((TOKEN_INTEGER, s, ch.line_no, ch.column_no), inp)
end
else if (ch2.ichar) = char2i '\\' then
let
val s = ichar2integer_literal (char2i '\\')
in
((TOKEN_INTEGER, s, ch.line_no, ch.column_no), inp)
end
else
$raise unsupported_escape (ch1.line_no, ch1.column_no,
ch2.ichar)
end
else
let
val s = ichar2integer_literal (ch1.ichar)
in
((TOKEN_INTEGER, s, ch.line_no, ch.column_no), inp)
end
end
 
fn
scan_character_literal (inp : inp_t) : (tokentuple_t, inp_t) =
let
val (tok, inp) =
scan_character_literal_without_checking_end inp
val line_no = (tok.2)
val column_no = (tok.3)
 
fun
check_end (inp : inp_t) : inp_t =
let
val (ch, inp) = get_ch inp
in
if (ch.ichar) = char2i '\'' then
inp
else
let
fun
loop_to_end (ch1 : ch_t,
inp : inp_t) : inp_t =
if (ch1.ichar) < 0 then
$raise unterminated_character_literal (line_no,
column_no)
else if (ch1.ichar) = char2i '\'' then
$raise multicharacter_literal (line_no, column_no)
else
let
val (ch1, inp) = get_ch inp
in
loop_to_end (ch1, inp)
end
 
val inp = loop_to_end (ch, inp)
in
inp
end
end
 
val inp = check_end inp
in
(tok, inp)
end
 
fn
scan_string_literal (inp : inp_t) : (tokentuple_t, inp_t) =
let
val (ch, inp) = get_ch inp
val _ = assertloc ((ch.ichar) = '"')
 
fun
scan {u : pos}
(lst : list (char, u),
inp : inp_t) :
[m : pos] (list (char, m), inp_t) =
let
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) < 0 then
$raise unterminated_string_literal (ch.line_no,
ch.column_no, false)
else if (ch1.ichar) = char2i '\n' then
$raise unterminated_string_literal (ch.line_no,
ch.column_no, true)
else if (ch1.ichar) = char2i '"' then
(lst, inp)
else if (ch1.ichar) <> char2i '\\' then
scan (int2char0 (ch1.ichar) :: lst, inp)
else
let
val (ch2, inp) = get_ch inp
in
if (ch2.ichar) = char2i 'n' then
scan ('n' :: '\\' :: lst, inp)
else if (ch2.ichar) = char2i '\\' then
scan ('\\' :: '\\' :: lst, inp)
else
$raise unsupported_escape (ch1.line_no, ch1.column_no,
ch2.ichar)
end
end
 
val lst = '"' :: NIL
val (lst, inp) = scan (lst, inp)
val lst = '"' :: lst
val s = reverse_list_to_string lst
in
((TOKEN_STRING, s, ch.line_no, ch.column_no), inp)
end
 
fn
get_next_token (inp : inp_t,
lookups : !lookups_vt) : (tokentuple_t, inp_t) =
let
val inp = skip_spaces_and_comments inp
val (ch, inp) = get_ch inp
val ln = ch.line_no
val cn = ch.column_no
in
if ch.ichar < 0 then
((TOKEN_END_OF_INPUT, "", ln, cn), inp)
else
case+ int2char0 (ch.ichar) of
| ',' => ((TOKEN_COMMA, ",", ln, cn), inp)
| ';' => ((TOKEN_SEMICOLON, ";", ln, cn), inp)
| '\(' => ((TOKEN_LEFTPAREN, "(", ln, cn), inp)
| ')' => ((TOKEN_RIGHTPAREN, ")", ln, cn), inp)
| '\{' => ((TOKEN_LEFTBRACE, "{", ln, cn), inp)
| '}' => ((TOKEN_RIGHTBRACE, "}", ln, cn), inp)
| '*' => ((TOKEN_MULTIPLY, "*", ln, cn), inp)
| '/' => ((TOKEN_DIVIDE, "/", ln, cn), inp)
| '%' => ((TOKEN_MOD, "%", ln, cn), inp)
| '+' => ((TOKEN_ADD, "+", ln, cn), inp)
| '-' => ((TOKEN_SUBTRACT, "-", ln, cn), inp)
| '<' =>
let
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) = char2i '=' then
((TOKEN_LESSEQUAL, "<=", ln, cn), inp)
else
let
val inp = push_back_ch (ch1, inp)
in
((TOKEN_LESS, "<", ln, cn), inp)
end
end
| '>' =>
let
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) = char2i '=' then
((TOKEN_GREATEREQUAL, ">=", ln, cn), inp)
else
let
val inp = push_back_ch (ch1, inp)
in
((TOKEN_GREATER, ">", ln, cn), inp)
end
end
| '=' =>
let
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) = char2i '=' then
((TOKEN_EQUAL, "==", ln, cn), inp)
else
let
val inp = push_back_ch (ch1, inp)
in
((TOKEN_ASSIGN, "=", ln, cn), inp)
end
end
| '!' =>
let
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) = char2i '=' then
((TOKEN_NOTEQUAL, "!=", ln, cn), inp)
else
let
val inp = push_back_ch (ch1, inp)
in
((TOKEN_NOT, "!", ln, cn), inp)
end
end
| '&' =>
let
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) = char2i '&' then
((TOKEN_AND, "&&", ln, cn), inp)
else
$raise unexpected_character (ch.line_no, ch.column_no,
ch.ichar)
end
| '|' =>
let
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) = char2i '|' then
((TOKEN_OR, "||", ln, cn), inp)
else
$raise unexpected_character (ch.line_no, ch.column_no,
ch.ichar)
end
| '"' =>
let
val inp = push_back_ch (ch, inp)
in
scan_string_literal inp
end
| '\'' =>
let
val inp = push_back_ch (ch, inp)
in
scan_character_literal inp
end
| _ when isdigit (ch.ichar) =>
let
val inp = push_back_ch (ch, inp)
in
scan_integer_literal (inp, lookups)
end
| _ when is_ident_start (ch.ichar) =>
let
val inp = push_back_ch (ch, inp)
in
scan_identifier_or_reserved_word (inp, lookups)
end
| _ => $raise unexpected_character (ch.line_no, ch.column_no,
ch.ichar)
end
 
fn
fprint_ullint_rightjust (outf : FILEref,
num : ullint) : void =
if num < 10ULL then
fprint! (outf, " ", num)
else if num < 100ULL then
fprint! (outf, " ", num)
else if num < 1000ULL then
fprint! (outf, " ", num)
else if num < 10000ULL then
fprint! (outf, " ", num)
else
fprint! (outf, num)
 
fn
print_token (outf : FILEref,
toktup : tokentuple_t,
lookups : !lookups_vt) : void =
let
macdef toknames = !(lookups.toknames)
val name = toknames[toktup.0]
val str = (toktup.1)
val line_no = (toktup.2)
val column_no = (toktup.3)
 
val _ = fprint_ullint_rightjust (outf, line_no)
val _ = fileref_puts (outf, " ")
val _ = fprint_ullint_rightjust (outf, column_no)
val _ = fileref_puts (outf, " ")
val _ = fileref_puts (outf, name)
in
begin
case+ toktup.0 of
| TOKEN_IDENTIFIER => fprint! (outf, " ", str)
| TOKEN_INTEGER => fprint! (outf, " ", str)
| TOKEN_STRING => fprint! (outf, " ", str)
| _ => ()
end;
 
fileref_putc (outf, '\n')
end
 
fn
scan_text (outf : FILEref,
inp : inp_t,
lookups : !lookups_vt) : void =
let
fun
loop (inp : inp_t,
lookups : !lookups_vt) : void =
let
val (toktup, inp) = get_next_token (inp, lookups)
in
print_token (outf, toktup, lookups);
if toktup.0 <> TOKEN_END_OF_INPUT then
loop (inp, lookups)
end
in
loop (inp, lookups)
end
 
(********************************************************************)
 
fn
main_program (inpf : FILEref,
outf : FILEref) : int =
let
(* Using a simple Scheme program, I found the following perfect
hash for the reserved words, using the sum of the first two
characters as the hash value. *)
var reserved_words =
@[String][RESERVED_WORD_HASHTAB_SIZE]
("if", "print", "else", "", "putc", "", "", "while", "")
var reserved_word_tokens =
@[token_t][RESERVED_WORD_HASHTAB_SIZE]
(TOKEN_IF, TOKEN_PRINT, TOKEN_ELSE, TOKEN_IDENTIFIER,
TOKEN_PUTC, TOKEN_IDENTIFIER, TOKEN_IDENTIFIER, TOKEN_WHILE,
TOKEN_IDENTIFIER)
 
var token_names =
@[string][NUM_TOKENS]
("Keyword_else",
"Keyword_if",
"Keyword_print",
"Keyword_putc",
"Keyword_while",
"Op_multiply",
"Op_divide",
"Op_mod",
"Op_add",
"Op_subtract",
"Op_negate",
"Op_less",
"Op_lessequal",
"Op_greater",
"Op_greaterequal",
"Op_equal",
"Op_notequal",
"Op_not",
"Op_assign",
"Op_and",
"Op_or",
"LeftParen",
"RightParen",
"LeftBrace",
"RightBrace",
"Semicolon",
"Comma",
"Identifier",
"Integer",
"String",
"End_of_input")
 
var lookups : lookups_vt =
@{
pf_toknames = view@ token_names,
pf_wordtab = view@ reserved_words,
pf_toktab = view@ reserved_word_tokens |
toknames = addr@ token_names,
wordtab = addr@ reserved_words,
toktab = addr@ reserved_word_tokens
}
 
val inp =
@{
file = inpf,
pushback = NIL,
line_no = 1ULL,
column_no = 1ULL
}
 
val _ = scan_text (outf, inp, lookups)
 
val @{
pf_toknames = pf_toknames,
pf_wordtab = pf_wordtab,
pf_toktab = pf_toktab |
toknames = toknames,
wordtab = wordtab,
toktab = toktab
} = lookups
prval _ = view@ token_names := pf_toknames
prval _ = view@ reserved_words := pf_wordtab
prval _ = view@ reserved_word_tokens := pf_toktab
in
0
end
 
macdef lex_error = "Lexical error: "
 
implement
main (argc, argv) =
let
val inpfname =
if 2 <= argc then
$UN.cast{string} argv[1]
else
"-"
val outfname =
if 3 <= argc then
$UN.cast{string} argv[2]
else
"-"
in
try
let
val inpf =
if (inpfname : string) = "-" then
stdin_ref
else
fileref_open_exn (inpfname, file_mode_r)
 
val outf =
if (outfname : string) = "-" then
stdout_ref
else
fileref_open_exn (outfname, file_mode_w)
in
main_program (inpf, outf)
end
with
| ~ unterminated_comment (line_no, column_no) =>
begin
fprintln! (stderr_ref, lex_error,
"unterminated comment starting at ",
line_no, ":", column_no);
1
end
| ~ unterminated_character_literal (line_no, column_no) =>
begin
fprintln! (stderr_ref, lex_error,
"unterminated character literal starting at ",
line_no, ":", column_no);
1
end
| ~ multicharacter_literal (line_no, column_no) =>
begin
fprintln! (stderr_ref, lex_error,
"unsupported multicharacter literal starting at ",
line_no, ":", column_no);
1
end
| ~ unterminated_string_literal (line_no, column_no,
end_of_line) =>
let
val s =
begin
if end_of_line then
"end of line"
else
"end of input"
end : String
in
fprintln! (stderr_ref, lex_error,
"unterminated string literal (", s,
") starting at ", line_no, ":", column_no);
1
end
| ~ unsupported_escape (line_no, column_no, c) =>
begin
fprintln! (stderr_ref, lex_error,
"unsupported escape \\",
int2char0 c, " starting at ",
line_no, ":", column_no);
1
end
| ~ invalid_integer_literal (line_no, column_no, s) =>
begin
fprintln! (stderr_ref, lex_error,
"invalid integer literal ", s,
" starting at ", line_no, ":", column_no);
1
end
| ~ unexpected_character (line_no, column_no, c) =>
begin
fprintln! (stderr_ref, lex_error,
"unexpected character '", int2char0 c,
"' at ", line_no, ":", column_no);
1
end
end
 
(********************************************************************)</syntaxhighlight>
 
{{out}}
<pre>$ patscc -O2 -DATS_MEMALLOC_GCBDW -o lex lex-in-ATS.dats -lgc && ./lex compiler-tests/testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|AWK}}==
Tested with gawk 4.1.1 and mawk 1.3.4.
<syntaxhighlight lang="awk">
BEGIN {
all_syms["tk_EOI" ] = "End_of_input"
}
}
</syntaxhighlight>
{{out|case=count}}
 
=={{header|C}}==
Tested with gcc 4.8.1 and later; compiles warning-free with -Wpedantic -pedantic -Wall -Wextra.
<syntaxhighlight lang="c">#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
da_dim(text, char);
 
tok_s gettok(void);
 
static void error(int err_line, int err_col, const char *fmt, ... ) {
}
 
static int next_ch(void) { /* get next char from input */
the_ch = getc(source_fp);
++col;
static TokenType get_ident_type(const char *ident) {
static struct {
const char *s;
TokenType sym;
} kwds[] = {
}
 
tok_s gettok(void) { /* return the token type */
/* skip white space */
while (isspace(the_ch))
}
 
void run(void) { /* tokenize the given input */
tok_s tok;
do {
run();
return 0;
}</syntaxhighlight>
 
{{out|case=test case 3}}
=={{header|C sharp|C#}}==
Requires C# 6.0 because of the use of null-coalescing operators.
<langsyntaxhighlight lang="csharp">
using System;
using System.IO;
}
}
</syntaxhighlight>
 
{{out|case=test case 3}}
 
=={{header|C++}}==
Tested with GCC 9.3.0 (g++ -std=c++17)
<syntaxhighlight lang="cpp">#include <charconv> // std::from_chars
#include <fstream> // file_to_string, string_to_file
#include <functional> // std::invoke
#include <iomanip> // std::setw
#include <ios> // std::left
#include <iostream>
#include <map> // keywords
#include <sstream>
#include <string>
#include <utility> // std::forward
#include <variant> // TokenVal
 
using namespace std;
 
// =====================================================================================================================
// Machinery
// =====================================================================================================================
string file_to_string (const string& path)
{
// Open file
ifstream file {path, ios::in | ios::binary | ios::ate};
if (!file) throw (errno);
 
// Allocate string memory
string contents;
contents.resize(file.tellg());
 
// Read file contents into string
file.seekg(0);
file.read(contents.data(), contents.size());
 
return contents;
}
 
void string_to_file (const string& path, string contents)
{
ofstream file {path, ios::out | ios::binary};
if (!file) throw (errno);
 
file.write(contents.data(), contents.size());
}
 
template <class F>
void with_IO (string source, string destination, F&& f)
{
string input;
 
if (source == "stdin") getline(cin, input);
else input = file_to_string(source);
 
string output = invoke(forward<F>(f), input);
 
if (destination == "stdout") cout << output;
else string_to_file(destination, output);
}
 
// Add escaped newlines and backslashes back in for printing
string sanitize (string s)
{
for (auto i = 0u; i < s.size(); ++i)
{
if (s[i] == '\n') s.replace(i++, 1, "\\n");
else if (s[i] == '\\') s.replace(i++, 1, "\\\\");
}
 
return s;
}
 
class Scanner
{
public:
const char* pos;
int line = 1;
int column = 1;
 
Scanner (const char* source) : pos {source} {}
 
inline char peek () { return *pos; }
 
void advance ()
{
if (*pos == '\n') { ++line; column = 1; }
else ++column;
 
++pos;
}
 
char next ()
{
advance();
return peek();
}
 
void skip_whitespace ()
{
while (isspace(static_cast<unsigned char>(peek())))
advance();
}
}; // class Scanner
 
 
// =====================================================================================================================
// Tokens
// =====================================================================================================================
enum class TokenName
{
OP_MULTIPLY, OP_DIVIDE, OP_MOD, OP_ADD, OP_SUBTRACT, OP_NEGATE,
OP_LESS, OP_LESSEQUAL, OP_GREATER, OP_GREATEREQUAL, OP_EQUAL, OP_NOTEQUAL,
OP_NOT, OP_ASSIGN, OP_AND, OP_OR,
LEFTPAREN, RIGHTPAREN, LEFTBRACE, RIGHTBRACE, SEMICOLON, COMMA,
KEYWORD_IF, KEYWORD_ELSE, KEYWORD_WHILE, KEYWORD_PRINT, KEYWORD_PUTC,
IDENTIFIER, INTEGER, STRING,
END_OF_INPUT, ERROR
};
 
using TokenVal = variant<int, string>;
 
struct Token
{
TokenName name;
TokenVal value;
int line;
int column;
};
 
 
const char* to_cstring (TokenName name)
{
static const char* s[] =
{
"Op_multiply", "Op_divide", "Op_mod", "Op_add", "Op_subtract", "Op_negate",
"Op_less", "Op_lessequal", "Op_greater", "Op_greaterequal", "Op_equal", "Op_notequal",
"Op_not", "Op_assign", "Op_and", "Op_or",
"LeftParen", "RightParen", "LeftBrace", "RightBrace", "Semicolon", "Comma",
"Keyword_if", "Keyword_else", "Keyword_while", "Keyword_print", "Keyword_putc",
"Identifier", "Integer", "String",
"End_of_input", "Error"
};
 
return s[static_cast<int>(name)];
}
 
 
string to_string (Token t)
{
ostringstream out;
out << setw(2) << t.line << " " << setw(2) << t.column << " ";
 
switch (t.name)
{
case (TokenName::IDENTIFIER) : out << "Identifier " << get<string>(t.value); break;
case (TokenName::INTEGER) : out << "Integer " << left << get<int>(t.value); break;
case (TokenName::STRING) : out << "String \"" << sanitize(get<string>(t.value)) << '"'; break;
case (TokenName::END_OF_INPUT) : out << "End_of_input"; break;
case (TokenName::ERROR) : out << "Error " << get<string>(t.value); break;
default : out << to_cstring(t.name);
}
 
out << '\n';
 
return out.str();
}
 
 
// =====================================================================================================================
// Lexer
// =====================================================================================================================
class Lexer
{
public:
Lexer (const char* source) : s {source}, pre_state {s} {}
 
bool has_more () { return s.peek() != '\0'; }
 
Token next_token ()
{
s.skip_whitespace();
 
pre_state = s;
 
switch (s.peek())
{
case '*' : return simply(TokenName::OP_MULTIPLY);
case '%' : return simply(TokenName::OP_MOD);
case '+' : return simply(TokenName::OP_ADD);
case '-' : return simply(TokenName::OP_SUBTRACT);
case '{' : return simply(TokenName::LEFTBRACE);
case '}' : return simply(TokenName::RIGHTBRACE);
case '(' : return simply(TokenName::LEFTPAREN);
case ')' : return simply(TokenName::RIGHTPAREN);
case ';' : return simply(TokenName::SEMICOLON);
case ',' : return simply(TokenName::COMMA);
case '&' : return expect('&', TokenName::OP_AND);
case '|' : return expect('|', TokenName::OP_OR);
case '<' : return follow('=', TokenName::OP_LESSEQUAL, TokenName::OP_LESS);
case '>' : return follow('=', TokenName::OP_GREATEREQUAL, TokenName::OP_GREATER);
case '=' : return follow('=', TokenName::OP_EQUAL, TokenName::OP_ASSIGN);
case '!' : return follow('=', TokenName::OP_NOTEQUAL, TokenName::OP_NOT);
case '/' : return divide_or_comment();
case '\'' : return char_lit();
case '"' : return string_lit();
 
default : if (is_id_start(s.peek())) return identifier();
if (is_digit(s.peek())) return integer_lit();
return error("Unrecognized character '", s.peek(), "'");
 
case '\0' : return make_token(TokenName::END_OF_INPUT);
}
}
 
 
private:
Scanner s;
Scanner pre_state;
static const map<string, TokenName> keywords;
 
 
template <class... Args>
Token error (Args&&... ostream_args)
{
string code {pre_state.pos, (string::size_type) s.column - pre_state.column};
 
ostringstream msg;
(msg << ... << forward<Args>(ostream_args)) << '\n'
<< string(28, ' ') << "(" << s.line << ", " << s.column << "): " << code;
 
if (s.peek() != '\0') s.advance();
 
return make_token(TokenName::ERROR, msg.str());
}
 
 
inline Token make_token (TokenName name, TokenVal value = 0)
{
return {name, value, pre_state.line, pre_state.column};
}
 
 
Token simply (TokenName name)
{
s.advance();
return make_token(name);
}
 
 
Token expect (char expected, TokenName name)
{
if (s.next() == expected) return simply(name);
else return error("Unrecognized character '", s.peek(), "'");
}
 
 
Token follow (char expected, TokenName ifyes, TokenName ifno)
{
if (s.next() == expected) return simply(ifyes);
else return make_token(ifno);
}
 
 
Token divide_or_comment ()
{
if (s.next() != '*') return make_token(TokenName::OP_DIVIDE);
 
while (s.next() != '\0')
{
if (s.peek() == '*' && s.next() == '/')
{
s.advance();
return next_token();
}
}
 
return error("End-of-file in comment. Closing comment characters not found.");
}
 
 
Token char_lit ()
{
int n = s.next();
 
if (n == '\'') return error("Empty character constant");
 
if (n == '\\') switch (s.next())
{
case 'n' : n = '\n'; break;
case '\\' : n = '\\'; break;
default : return error("Unknown escape sequence \\", s.peek());
}
 
if (s.next() != '\'') return error("Multi-character constant");
 
s.advance();
return make_token(TokenName::INTEGER, n);
}
 
 
Token string_lit ()
{
string text = "";
 
while (s.next() != '"')
switch (s.peek())
{
case '\\' : switch (s.next())
{
case 'n' : text += '\n'; continue;
case '\\' : text += '\\'; continue;
default : return error("Unknown escape sequence \\", s.peek());
}
 
case '\n' : return error("End-of-line while scanning string literal."
" Closing string character not found before end-of-line.");
 
case '\0' : return error("End-of-file while scanning string literal."
" Closing string character not found.");
 
default : text += s.peek();
}
 
s.advance();
return make_token(TokenName::STRING, text);
}
 
 
static inline bool is_id_start (char c) { return isalpha(static_cast<unsigned char>(c)) || c == '_'; }
static inline bool is_id_end (char c) { return isalnum(static_cast<unsigned char>(c)) || c == '_'; }
static inline bool is_digit (char c) { return isdigit(static_cast<unsigned char>(c)); }
 
 
Token identifier ()
{
string text (1, s.peek());
 
while (is_id_end(s.next())) text += s.peek();
 
auto i = keywords.find(text);
if (i != keywords.end()) return make_token(i->second);
 
return make_token(TokenName::IDENTIFIER, text);
}
 
 
Token integer_lit ()
{
while (is_digit(s.next()));
 
if (is_id_start(s.peek()))
return error("Invalid number. Starts like a number, but ends in non-numeric characters.");
 
int n;
 
auto r = from_chars(pre_state.pos, s.pos, n);
if (r.ec == errc::result_out_of_range) return error("Number exceeds maximum value");
 
return make_token(TokenName::INTEGER, n);
}
}; // class Lexer
 
 
const map<string, TokenName> Lexer::keywords =
{
{"else", TokenName::KEYWORD_ELSE},
{"if", TokenName::KEYWORD_IF},
{"print", TokenName::KEYWORD_PRINT},
{"putc", TokenName::KEYWORD_PUTC},
{"while", TokenName::KEYWORD_WHILE}
};
 
 
int main (int argc, char* argv[])
{
string in = (argc > 1) ? argv[1] : "stdin";
string out = (argc > 2) ? argv[2] : "stdout";
 
with_IO(in, out, [](string input)
{
Lexer lexer {input.data()};
 
string s = "Location Token name Value\n"
"--------------------------------------\n";
 
while (lexer.has_more()) s += to_string(lexer.next_token());
return s;
});
}
</syntaxhighlight>
 
{{out|case=test case 3}}
<pre>
Location Token name Value
--------------------------------------
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input
</pre>
 
=={{header|COBOL}}==
Using GnuCOBOL 2. By Steve Williams (with one change to get around a Rosetta Code code highlighter problem).
 
<langsyntaxhighlight lang="cobol"> >>SOURCE FORMAT IS FREE
*> this code is dedicated to the public domain
*> (GnuCOBOL) 2.3-dev.0
end-if
.
end program lexer.</syntaxhighlight>
 
{{out|case=test case 3}}
Lisp has a built-in reader, and you can customize it by modifying its readtable. I'm also using Gray streams, an almost-standard feature of Common Lisp, to count lines and columns.
 
<langsyntaxhighlight lang="lisp">(defpackage #:lexical-analyzer
(:use #:cl #:sb-gray)
(:export #:main))
 
(defun main ()
(lex *standard-input*))</syntaxhighlight>
{{out|case=test case 3}}
<pre> 5 16 KEYWORD-PRINT
22 26 INTEGER 32
23 1 END-OF-INPUT </pre>
 
=={{header|Elixir}}==
{{works with|Elixir|1.13.3}}
{{trans|ATS}}
 
<syntaxhighlight lang="elixir">#!/bin/env elixir
# -*- elixir -*-
 
defmodule Lex do
 
def main args do
{inpf_name, outf_name, exit_status} =
case args do
[] -> {"-", "-", 0}
[name] -> {name, "-", 0}
[name1, name2] -> {name1, name2, 0}
[name1, name2 | _] -> {name1, name2, usage_error()}
end
 
{inpf, outf, exit_status} =
case {inpf_name, outf_name, exit_status} do
{"-", "-", 0} -> {:stdio, :stdio, 0}
{name1, "-", 0} ->
{inpf, exit_status} = open_file(name1, [:read])
{inpf, :stdio, exit_status}
{"-", name2, 0} ->
{outf, exit_status} = open_file(name2, [:write])
{:stdio, outf, exit_status}
{name1, name2, 0} ->
{inpf, exit_status} = open_file(name1, [:read])
if exit_status != 0 do
{inpf, name2, exit_status}
else
{outf, exit_status} = open_file(name2, [:write])
{inpf, outf, exit_status}
end
_ -> {inpf_name, outf_name, exit_status}
end
 
exit_status =
case exit_status do
0 -> main_program inpf, outf
_ -> exit_status
end
 
# Choose one.
System.halt exit_status # Fast exit.
#System.stop exit_status # Laborious cleanup.
end
 
def main_program inpf, outf do
inp = make_inp inpf
scan_text outf, inp
exit_status = 0
exit_status
end
 
def open_file name, rw do
case File.open name, rw do
{:ok, f} -> {f, 0}
_ ->
IO.write :stderr, "Cannot open "
IO.write :stderr, name
case rw do
[:read] -> IO.puts " for input"
[:write] -> IO.puts " for output"
end
{name, 1}
end
end
 
def scan_text outf, inp do
{toktup, inp} = get_next_token inp
print_token outf, toktup
case toktup do
{"End_of_input", _, _, _} -> :ok
_ -> scan_text outf, inp
end
end
 
def print_token outf, {tok, arg, line_no, column_no} do
IO.write outf, (String.pad_leading "#{line_no}", 5)
IO.write outf, " "
IO.write outf, (String.pad_leading "#{column_no}", 5)
IO.write outf, " "
IO.write outf, tok
case tok do
"Identifier" ->
IO.write outf, " "
IO.write outf, arg
"Integer" ->
IO.write outf, " "
IO.write outf, arg
"String" ->
IO.write outf, " "
IO.write outf, arg
_ -> :ok
end
IO.puts outf, ""
end
 
###-------------------------------------------------------------------
###
### The token dispatcher.
###
 
def get_next_token inp do
inp = skip_spaces_and_comments inp
{ch, inp} = get_ch inp
{chr, line_no, column_no} = ch
ln = line_no
cn = column_no
case chr do
:eof -> {{"End_of_input", "", ln, cn}, inp}
"," -> {{"Comma", ",", ln, cn}, inp}
";" -> {{"Semicolon", ";", ln, cn}, inp}
"(" -> {{"LeftParen", "(", ln, cn}, inp}
")" -> {{"RightParen", ")", ln, cn}, inp}
"{" -> {{"LeftBrace", "{", ln, cn}, inp}
"}" -> {{"RightBrace", "}", ln, cn}, inp}
"*" -> {{"Op_multiply", "*", ln, cn}, inp}
"/" -> {{"Op_divide", "/", ln, cn}, inp}
"%" -> {{"Op_mod", "%", ln, cn}, inp}
"+" -> {{"Op_add", "+", ln, cn}, inp}
"-" -> {{"Op_subtract", "-", ln, cn}, inp}
"<" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"=" -> {{"Op_lessequal", "<=", ln, cn}, inp}
_ -> {{"Op_less", "<", ln, cn}, (push_back ch1, inp)}
end
">" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"=" -> {{"Op_greaterequal", ">=", ln, cn}, inp}
_ -> {{"Op_greater", ">", ln, cn}, (push_back ch1, inp)}
end
"=" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"=" -> {{"Op_equal", "==", ln, cn}, inp}
_ -> {{"Op_assign", "=", ln, cn}, (push_back ch1, inp)}
end
"!" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"=" -> {{"Op_notequal", "!=", ln, cn}, inp}
_ -> {{"Op_not", "!", ln, cn}, (push_back ch1, inp)}
end
"&" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"&" -> {{"Op_and", "&&", ln, cn}, inp}
_ -> unexpected_character ln, cn, chr
end
"|" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"|" -> {{"Op_or", "||", ln, cn}, inp}
_ -> unexpected_character ln, cn, chr
end
"\"" ->
inp = push_back ch, inp
scan_string_literal inp
"'" ->
inp = push_back ch, inp
scan_character_literal inp
_ ->
cond do
String.match? chr, ~r/^[[:digit:]]$/u ->
inp = push_back ch, inp
scan_integer_literal inp
String.match? chr, ~r/^[[:alpha:]_]$/u ->
inp = push_back ch, inp
scan_identifier_or_reserved_word inp
true -> unexpected_character ln, cn, chr
end
end
end
 
###-------------------------------------------------------------------
###
### Skipping past spaces and /* ... */ comments.
###
### Comments are treated exactly like a bit of whitespace. They never
### make it to the dispatcher.
###
 
def skip_spaces_and_comments inp do
{ch, inp} = get_ch inp
{chr, line_no, column_no} = ch
cond do
chr == :eof -> push_back ch, inp
String.match? chr, ~r/^[[:space:]]$/u ->
skip_spaces_and_comments inp
chr == "/" ->
{ch1, inp} = get_ch inp
case ch1 do
{"*", _, _} ->
inp = scan_comment inp, line_no, column_no
skip_spaces_and_comments inp
_ -> push_back ch, (push_back ch1, inp)
end
true -> push_back ch, inp
end
end
 
def scan_comment inp, line_no, column_no do
{ch, inp} = get_ch inp
case ch do
{:eof, _, _} -> unterminated_comment line_no, column_no
{"*", _, _} ->
{ch1, inp} = get_ch inp
case ch1 do
{:eof, _, _} -> unterminated_comment line_no, column_no
{"/", _, _} -> inp
_ -> scan_comment inp, line_no, column_no
end
_ -> scan_comment inp, line_no, column_no
end
end
 
###-------------------------------------------------------------------
###
### Scanning of integer literals, identifiers, and reserved words.
###
### These three types of token are very similar to each other.
###
 
def scan_integer_literal inp do
# Scan an entire word, not just digits. This way we detect
# erroneous text such as "23skidoo".
{line_no, column_no, inp} = get_position inp
{word, inp} = scan_word inp
if String.match? word, (~r/^[[:digit:]]+$/u) do
{{"Integer", word, line_no, column_no}, inp}
else
invalid_integer_literal line_no, column_no, word
end
end
 
def scan_identifier_or_reserved_word inp do
# It is assumed that the first character is of the correct type,
# thanks to the dispatcher.
{line_no, column_no, inp} = get_position inp
{word, inp} = scan_word inp
tok =
case word do
"if" -> "Keyword_if"
"else" -> "Keyword_else"
"while" -> "Keyword_while"
"print" -> "Keyword_print"
"putc" -> "Keyword_putc"
_ -> "Identifier"
end
{{tok, word, line_no, column_no}, inp}
end
 
def scan_word inp, word\\"" do
{ch, inp} = get_ch inp
{chr, _, _} = ch
if String.match? chr, (~r/^[[:alnum:]_]$/u) do
scan_word inp, (word <> chr)
else
{word, (push_back ch, inp)}
end
end
 
def get_position inp do
{ch, inp} = get_ch inp
{_, line_no, column_no} = ch
inp = push_back ch, inp
{line_no, column_no, inp}
end
 
###-------------------------------------------------------------------
###
### Scanning of string literals.
###
### It is assumed that the first character is the opening quote, and
### that the closing quote is the same character.
###
 
def scan_string_literal inp do
{ch, inp} = get_ch inp
{quote_mark, line_no, column_no} = ch
{contents, inp} = scan_str_lit inp, ch
{{"String", quote_mark <> contents <> quote_mark,
line_no, column_no},
inp}
end
 
def scan_str_lit inp, ch, contents\\"" do
{quote_mark, line_no, column_no} = ch
{ch1, inp} = get_ch inp
{chr1, line_no1, column_no1} = ch1
if chr1 == quote_mark do
{contents, inp}
else
case chr1 do
:eof -> eoi_in_string_literal line_no, column_no
"\n" -> eoln_in_string_literal line_no, column_no
"\\" ->
{ch2, inp} = get_ch inp
{chr2, _, _} = ch2
case chr2 do
"n" -> scan_str_lit inp, ch, (contents <> "\\n")
"\\" -> scan_str_lit inp, ch, (contents <> "\\\\")
_ -> unsupported_escape line_no1, column_no1, chr2
end
_ -> scan_str_lit inp, ch, (contents <> chr1)
end
end
end
 
###-------------------------------------------------------------------
###
### Scanning of character literals.
###
### It is assumed that the first character is the opening quote, and
### that the closing quote is the same character.
###
### The tedious part of scanning a character literal is distinguishing
### between the kinds of lexical error. (One might wish to modify the
### code to detect, as a distinct kind of error, end of line within a
### character literal.)
###
 
def scan_character_literal inp do
{ch, inp} = get_ch inp
{_, line_no, column_no} = ch
{ch1, inp} = get_ch inp
{chr1, line_no1, column_no1} = ch1
{intval, inp} =
case chr1 do
:eof -> unterminated_character_literal line_no, column_no
"\\" ->
{ch2, inp} = get_ch inp
{chr2, _, _} = ch2
case chr2 do
:eof -> unterminated_character_literal line_no, column_no
"n" -> {(:binary.first "\n"), inp}
"\\" -> {(:binary.first "\\"), inp}
_ -> unsupported_escape line_no1, column_no1, chr2
end
_ -> {(:binary.first chr1), inp}
end
inp = check_character_literal_end inp, ch
{{"Integer", "#{intval}", line_no, column_no}, inp}
end
 
def check_character_literal_end inp, ch do
{chr, _, _} = ch
{{chr1, _, _}, inp} = get_ch inp
if chr1 == chr do
inp
else
# Lexical error.
find_char_lit_end inp, ch
end
end
 
def find_char_lit_end inp, ch do
{chr, line_no, column_no} = ch
{{chr1, _, _}, inp} = get_ch inp
if chr1 == chr do
multicharacter_literal line_no, column_no
else
case chr1 do
:eof -> unterminated_character_literal line_no, column_no
_ -> find_char_lit_end inp, ch
end
end
end
 
###-------------------------------------------------------------------
###
### Character-at-a-time input, with unrestricted pushback, and with
### line and column numbering.
###
 
def make_inp inpf do
{inpf, [], 1, 1}
end
 
def get_ch {inpf, pushback, line_no, column_no} do
case pushback do
[head | tail] ->
{head, {inpf, tail, line_no, column_no}}
[] ->
case IO.read(inpf, 1) do
:eof ->
{{:eof, line_no, column_no},
{inpf, pushback, line_no, column_no}}
{:error, _} ->
{{:eof, line_no, column_no},
{inpf, pushback, line_no, column_no}}
chr ->
case chr do
"\n" ->
{{chr, line_no, column_no},
{inpf, pushback, line_no + 1, 1}}
_ ->
{{chr, line_no, column_no},
{inpf, pushback, line_no, column_no + 1}}
end
end
end
end
 
def push_back ch, {inpf, pushback, line_no, column_no} do
{inpf, [ch | pushback], line_no, column_no}
end
 
###-------------------------------------------------------------------
###
### Lexical and usage errors.
###
 
def unterminated_comment line_no, column_no do
raise "#{scriptname()}: unterminated comment at #{line_no}:#{column_no}"
end
 
def invalid_integer_literal line_no, column_no, word do
raise "#{scriptname()}: invalid integer literal #{word} at #{line_no}:#{column_no}"
end
 
def unsupported_escape line_no, column_no, chr do
raise "#{scriptname()}: unsupported escape \\#{chr} at #{line_no}:#{column_no}"
end
 
def eoi_in_string_literal line_no, column_no do
raise "#{scriptname()}: end of input in string literal starting at #{line_no}:#{column_no}"
end
 
def eoln_in_string_literal line_no, column_no do
raise "#{scriptname()}: end of line in string literal starting at #{line_no}:#{column_no}"
end
 
def multicharacter_literal line_no, column_no do
raise "#{scriptname()}: unsupported multicharacter literal at #{line_no}:#{column_no}"
end
 
def unterminated_character_literal line_no, column_no do
raise "#{scriptname()}: unterminated character literal starting at #{line_no}:#{column_no}"
end
 
def unexpected_character line_no, column_no, chr do
raise "#{scriptname()}: unexpected character '#{chr}' at #{line_no}:#{column_no}"
end
 
def usage_error() do
IO.puts "Usage: #{scriptname()} [INPUTFILE [OUTPUTFILE]]"
IO.puts "If either of INPUTFILE or OUTPUTFILE is not present or is \"-\","
IO.puts "standard input or standard output is used, respectively."
exit_status = 2
exit_status
end
 
def scriptname() do
Path.basename(__ENV__.file)
end
 
#---------------------------------------------------------------------
 
end ## module Lex
 
Lex.main(System.argv)</syntaxhighlight>
 
{{out}}
<pre>$ ./lex testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
 
 
=={{header|Emacs Lisp}}==
{{works with|Emacs|GNU 27.2}}
{{trans|ATS}}
 
 
<syntaxhighlight lang="lisp">#!/usr/bin/emacs --script
;;
;; The Rosetta Code lexical analyzer in GNU Emacs Lisp.
;;
;; Migrated from the ATS. However, Emacs Lisp is not friendly to the
;; functional style of the ATS implementation; therefore the
;; differences are vast.
;;
;; (A Scheme migration could easily, on the other hand, have been
;; almost exact. It is interesting to contrast Lisp dialects and see
;; how huge the differences are.)
;;
;; The script currently takes input only from standard input and
;; writes the token stream only to standard output.
;;
 
(require 'cl-lib)
 
;;; The type of a character, consisting of its code point and where it
;;; occurred in the text.
(cl-defstruct (ch_t (:constructor make-ch (ichar line-no column-no)))
ichar line-no column-no)
 
(defun ch-ichar (ch)
(ch_t-ichar ch))
 
(defun ch-line-no (ch)
(ch_t-line-no ch))
 
(defun ch-column-no (ch)
(ch_t-column-no ch))
 
;;; The type of an "inputter", consisting of an open file for the
;;; text, a pushback buffer (which is an indefinitely deep stack of
;;; ch_t), an input buffer for the current line, and a position in the
;;; text.
(cl-defstruct inp_t file pushback line line-no column-no)
 
(defun make-inp (file)
"Initialize a new inp_t."
(make-inp_t :file file
:pushback '()
:line ""
:line-no 0
:column-no 0))
 
(defvar inp (make-inp t)
"A global inp_t.")
 
(defun get-ch ()
"Get a ch_t, either from the pushback buffer or from the input."
(pcase (inp_t-pushback inp)
(`(,ch . ,tail)
;; Emacs Lisp has only single value return, so the results come
;; back as a list rather than multiple values.
(setq inp (make-inp_t :file (inp_t-file inp)
:pushback tail
:line (inp_t-line inp)
:line-no (inp_t-line-no inp)
:column-no (inp_t-column-no inp)))
ch)
('()
(let ((line (inp_t-line inp))
(line-no (inp_t-line-no inp))
(column-no (inp_t-column-no inp)))
(when (string= line "")
;; Refill the buffer.
(let ((text
(condition-case nil (read-string "")
nil (error 'eoi))))
(if (eq text 'eoi)
(setq line 'eoi)
(setq line (format "%s%c" text ?\n)))
(setq line-no (1+ line-no))
(setq column-no 1)))
(if (eq line 'eoi)
(progn
(setq inp (make-inp_t :file (inp_t-file inp)
:pushback (inp_t-pushback inp)
:line line
:line-no line-no
:column-no column-no))
(make-ch 'eoi line-no column-no))
(let ((c (elt line 0))
(line (substring line 1)))
(setq inp (make-inp_t :file (inp_t-file inp)
:pushback (inp_t-pushback inp)
:line line
:line-no line-no
:column-no (1+ column-no)))
(make-ch c line-no column-no)))))))
 
(defun get-new-line (file)
;; Currently "file" is ignored and the input must be from stdin.
(read-from-minibuffer "" :default 'eoi))
 
(defun push-back (ch)
"Push back a ch_t."
(setq inp (make-inp_t :file (inp_t-file inp)
:pushback (cons ch (inp_t-pushback inp))
:line (inp_t-line inp)
:line-no (inp_t-line-no inp)
:column-no (inp_t-column-no inp))))
 
(defun get-position ()
"Return the line-no and column-no of the next ch_t to be
returned by get-ch, assuming there are no more pushbacks
beforehand."
(let* ((ch (get-ch))
(line-no (ch-line-no ch))
(column-no (ch-column-no ch)))
(push-back ch)
(list line-no column-no)))
 
(defun scan-text (outf)
"The main loop."
(cl-loop for toktup = (get-next-token)
do (print-token outf toktup)
until (string= (elt toktup 0) "End_of_input")))
 
(defun print-token (outf toktup)
"Print a token, along with its position and possibly an
argument."
;; Currently outf is ignored, and the output goes to stdout.
(pcase toktup
(`(,tok ,arg ,line-no ,column-no)
(princ (format "%5d %5d %s" line-no column-no tok))
(pcase tok
("Identifier" (princ (format " %s\n" arg)))
("Integer" (princ (format " %s\n" arg)))
("String" (princ (format " %s\n" arg)))
(_ (princ "\n"))))))
 
(defun get-next-token ()
"The token dispatcher. Returns the next token, as a list along
with its argument and text position."
(skip-spaces-and-comments)
(let* ((ch (get-ch))
(ln (ch-line-no ch))
(cn (ch-column-no ch)))
(pcase (ch-ichar ch)
('eoi (list "End_of_input" "" ln cn))
(?, (list "Comma" "," ln cn))
(?\N{SEMICOLON} (list "Semicolon" ";" ln cn))
(?\N{LEFT PARENTHESIS} (list "LeftParen" "(" ln cn))
(?\N{RIGHT PARENTHESIS} (list "RightParen" ")" ln cn))
(?{ (list "LeftBrace" "{" ln cn))
(?} (list "RightBrace" "}" ln cn))
(?* (list "Op_multiply" "*" ln cn))
(?/ (list "Op_divide" "/" ln cn))
(?% (list "Op_mod" "%" ln cn))
(?+ (list "Op_add" "+" ln cn))
(?- (list "Op_subtract" "-" ln cn))
(?< (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?= (list "Op_lessequal" "<=" ln cn))
(_ (push-back ch1)
(list "Op_less" "<" ln cn)))))
(?> (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?= (list "Op_greaterequal" ">=" ln cn))
(_ (push-back ch1)
(list "Op_greater" ">" ln cn)))))
(?= (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?= (list "Op_equal" "==" ln cn))
(_ (push-back ch1)
(list "Op_assign" "=" ln cn)))))
(?! (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?= (list "Op_notequal" "!=" ln cn))
(_ (push-back ch1)
(list "Op_not" "!" ln cn)))))
(?& (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?& (list "Op_and" "&&" ln cn))
(_ (unexpected-character ln cn (ch-ichar ch))))))
(?| (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?| (list "Op_or" "||" ln cn))
(_ (unexpected-character ln cn (ch-ichar ch))))))
(?\N{QUOTATION MARK} (push-back ch) (scan-string-literal))
(?\N{APOSTROPHE} (push-back ch) (scan-character-literal))
((pred digitp) (push-back ch) (scan-integer-literal))
((pred identifier-start-p)
(progn
(push-back ch)
(scan-identifier-or-reserved-word)))
(c (unexpected-character ln cn c)))))
 
(defun skip-spaces-and-comments ()
"Skip spaces and comments. A comment is treated as equivalent
to a run of spaces."
(cl-loop for ch = (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?/ (let* ((ch2 (get-ch))
(line-no (ch-line-no ch1))
(column-no (ch-column-no ch1))
(position `(,line-no ,column-no)))
(pcase (ch-ichar ch2)
(?* (scan-comment position)
(get-ch))
(_ (push-back ch2)
ch1))))
(_ ch1)))
while (spacep (ch-ichar ch))
finally do (push-back ch)))
 
(defun scan-comment (position)
(cl-loop for ch = (get-ch)
for done = (comment-done-p ch position)
until done))
 
(defun comment-done-p (ch position)
(pcase (ch-ichar ch)
('eoi (apply 'unterminated-comment position))
(?* (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
('eoi (apply 'unterminated-comment position))
(?/ t)
(_ nil))))
(_ nil)))
 
(defun scan-integer-literal ()
"Scan an integer literal, on the assumption that a digit has
been seen and pushed back."
(let* ((position (get-position))
(lst (scan-word))
(s (list-to-string lst)))
(if (all-digits-p lst)
`("Integer" ,s . ,position)
(apply 'illegal-integer-literal `(,@position , s)))))
 
(defun scan-identifier-or-reserved-word ()
"Scan an identifier or reserved word, on the assumption that a
legal first character (for an identifier) has been seen and
pushed back."
(let* ((position (get-position))
(lst (scan-word))
(s (list-to-string lst))
(tok (pcase s
("else" "Keyword_else")
("if" "Keyword_if")
("while" "Keyword_while")
("print" "Keyword_print")
("putc" "Keyword_putc")
(_ "Identifier"))))
`(,tok ,s . ,position)))
 
(defun scan-word ()
(cl-loop for ch = (get-ch)
while (identifier-continuation-p (ch-ichar ch))
collect (ch-ichar ch)
finally do (push-back ch)))
 
(defun scan-string-literal ()
"Scan a string literal, on the assumption that a double quote
has been seen and pushed back."
(let* ((ch (get-ch))
(_ (cl-assert (= (ch-ichar ch) ?\N{QUOTATION MARK})))
(line-no (ch-line-no ch))
(column-no (ch-column-no ch))
(position `(,line-no ,column-no))
(lst (scan-str-lit position))
(lst `(?\N{QUOTATION MARK} ,@lst ?\N{QUOTATION MARK})))
`("String" ,(list-to-string lst) . ,position)))
 
(defun scan-str-lit (position)
(flatten
(cl-loop for ch = (get-ch)
until (= (ch-ichar ch) ?\N{QUOTATION MARK})
collect (process-str-lit-character
(ch-ichar ch) position))))
 
(defun process-str-lit-character (c position)
;; NOTE: This script might insert a newline before any eoi, so that
;; "end-of-input-in-string-literal" never actually occurs. It is a
;; peculiarity of the script's input mechanism.
(pcase c
('eoi (apply 'end-of-input-in-string-literal position))
(?\n (apply 'end-of-line-in-string-literal position))
(?\\ (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?n '(?\\ ?n))
(?\\ '(?\\ ?\\))
(c (unsupported-escape (ch-line-no ch1)
(ch-column-no ch1)
c)))))
(c c)))
 
(defun scan-character-literal ()
"Scan a character literal, on the assumption that an ASCII
single quote (that is, a Unicode APOSTROPHE) has been seen and
pushed back."
(let* ((toktup (scan-character-literal-without-checking-end))
(line-no (elt toktup 2))
(column-no (elt toktup 3))
(position (list line-no column-no)))
(check-char-lit-end position)
toktup))
 
(defun check-char-lit-end (position)
(let ((ch (get-ch)))
(unless (and (integerp (ch-ichar ch))
(= (ch-ichar ch) ?\N{APOSTROPHE}))
(push-back ch)
(loop-to-char-lit-end position))))
 
(defun loop-to-char-lit-end (position)
(cl-loop for ch = (get-ch)
until (or (eq (ch-ichar ch) 'eoi)
(= (ch-ichar ch) ?\N{APOSTROPHE}))
finally do (if (eq (ch-ichar ch) 'eoi)
(apply 'unterminated-character-literal
position)
(apply 'multicharacter-literal position))))
 
(defun scan-character-literal-without-checking-end ()
(let* ((ch (get-ch))
(_ (cl-assert (= (ch-ichar ch) ?\N{APOSTROPHE})))
(line-no (ch-line-no ch))
(column-no (ch-column-no ch))
(position (list line-no column-no))
(ch1 (get-ch)))
(pcase (ch-ichar ch1)
('eoi (apply 'unterminated-character-literal position))
(?\\ (let ((ch2 (get-ch)))
(pcase (ch-ichar ch2)
('eoi (apply 'unterminated-character-literal position))
(?n `("Integer" ,(format "%d" ?\n) . ,position))
(?\\ `("Integer" ,(format "%d" ?\\) . ,position))
(c (unsupported-escape (ch-line-no ch1)
(ch-column-no ch1)
c)))))
(c `("Integer" ,(format "%d" c) . ,position)))))
 
(defun spacep (c)
(and (integerp c) (or (= c ?\N{SPACE})
(and (<= 9 c) (<= c 13)))))
 
(defun digitp (c)
(and (integerp c) (<= ?0 c) (<= c ?9)))
 
(defun lowerp (c)
;; Warning: in EBCDIC, this kind of test for "alphabetic" is no
;; good. The letters are not contiguous.
(and (integerp c) (<= ?a c) (<= c ?z)))
 
(defun upperp (c)
;; Warning: in EBCDIC, this kind of test for "alphabetic" is no
;; good. The letters are not contiguous.
(and (integerp c) (<= ?A c) (<= c ?Z)))
 
(defun alphap (c)
(or (lowerp c) (upperp c)))
 
(defun identifier-start-p (c)
(and (integerp c) (or (alphap c) (= c ?_))))
 
(defun identifier-continuation-p (c)
(and (integerp c) (or (alphap c) (= c ?_) (digitp c))))
 
(defun all-digits-p (thing)
(cl-loop for c in thing
if (not (digitp c)) return nil
finally return t))
 
(defun list-to-string (lst)
"Convert a list of characters to a string."
(apply 'string lst))
 
(defun flatten (lst)
"Flatten nested lists. (The implementation is recursive and not
for very long lists.)"
(pcase lst
('() '())
(`(,head . ,tail)
(if (listp head)
(append (flatten head) (flatten tail))
(cons head (flatten tail))))))
 
(defun unexpected-character (line-no column-no c)
(error (format "unexpected character '%c' at %d:%d"
c line-no column-no)))
 
(defun unsupported-escape (line-no column-no c)
(error (format "unsupported escape \\%c at %d:%d"
c line-no column-no)))
 
(defun illegal-integer-literal (line-no column-no s)
(error (format "illegal integer literal \"%s\" at %d:%d"
s line-no column-no)))
 
(defun unterminated-character-literal (line-no column-no)
(error (format "unterminated character literal starting at %d:%d"
line-no column-no)))
 
(defun multicharacter-literal (line-no column-no)
(error (format
"unsupported multicharacter literal starting at %d:%d"
line-no column-no)))
 
(defun end-of-input-in-string-literal (line-no column-no)
(error (format "end of input in string literal starting at %d:%d"
line-no column-no)))
 
(defun end-of-line-in-string-literal (line-no column-no)
(error (format "end of line in string literal starting at %d:%d"
line-no column-no)))
 
(defun unterminated-comment (line-no column-no)
(error (format "unterminated comment starting at %d:%d"
line-no column-no)))
 
(defun main ()
(setq inp (make-inp t))
(scan-text t))
 
(main)</syntaxhighlight>
 
 
{{out}}
<pre>$ ./lex-in-el < compiler-tests/testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|Erlang}}==
{{works with|Erlang|24.3.3}}
{{trans|ATS}}
{{trans|Elixir}}
 
 
<syntaxhighlight lang="erlang">#!/bin/env escript
%%%-------------------------------------------------------------------
 
-record (inp_t, {inpf, pushback, line_no, column_no}).
 
main (Args) ->
main_program (Args).
 
main_program ([]) ->
scan_from_inpf_to_outf ("-", "-"),
halt (0);
main_program ([Inpf_filename]) ->
scan_from_inpf_to_outf (Inpf_filename, "-"),
halt (0);
main_program ([Inpf_filename, Outf_filename]) ->
scan_from_inpf_to_outf (Inpf_filename, Outf_filename),
halt (0);
main_program ([_, _ | _]) ->
ProgName = escript:script_name (),
io:put_chars (standard_error, "Usage: "),
io:put_chars (standard_error, ProgName),
io:put_chars (standard_error, " [INPUTFILE [OUTPUTFILE]]\n"),
halt (1).
 
scan_from_inpf_to_outf ("-", "-") ->
scan_input (standard_io, standard_io);
scan_from_inpf_to_outf (Inpf_filename, "-") ->
case file:open (Inpf_filename, [read]) of
{ok, Inpf} -> scan_input (Inpf, standard_io);
_ -> open_failure (Inpf_filename, "input")
end;
scan_from_inpf_to_outf ("-", Outf_filename) ->
case file:open (Outf_filename, [write]) of
{ok, Outf} -> scan_input (standard_io, Outf);
_ -> open_failure (Outf_filename, "output")
end;
scan_from_inpf_to_outf (Inpf_filename, Outf_filename) ->
case file:open(Inpf_filename, [read]) of
{ok, Inpf} ->
case file:open (Outf_filename, [write]) of
{ok, Outf} -> scan_input (Inpf, Outf);
_ -> open_failure (Outf_filename, "output")
end;
_ -> open_failure (Inpf_filename, "input")
end.
 
open_failure (Filename, ForWhat) ->
ProgName = escript:script_name (),
io:put_chars (standard_error, ProgName),
io:put_chars (standard_error, ": failed to open \""),
io:put_chars (standard_error, Filename),
io:put_chars (standard_error, "\" for "),
io:put_chars (standard_error, ForWhat),
io:put_chars (standard_error, "\n"),
halt (1).
 
scan_input (Inpf, Outf) ->
scan_text (Outf, make_inp (Inpf)).
 
scan_text (Outf, Inp) ->
{TokTup, Inp1} = get_next_token (Inp),
print_token (Outf, TokTup),
case TokTup of
{"End_of_input", _, _, _} -> ok;
_ -> scan_text (Outf, Inp1)
end.
 
print_token (Outf, {Tok, Arg, Line_no, Column_no}) ->
S_line_no = erlang:integer_to_list (Line_no),
S_column_no = erlang:integer_to_list (Column_no),
io:put_chars (Outf, string:pad (S_line_no, 5, leading)),
io:put_chars (Outf, " "),
io:put_chars (Outf, string:pad (S_column_no, 5, leading)),
io:put_chars (Outf, " "),
io:put_chars (Outf, Tok),
{Padding, Arg1} =
case Tok of
"Identifier" -> {" ", Arg};
"Integer" -> {" ", Arg};
"String" -> {" ", Arg};
_ -> {"", ""}
end,
io:put_chars (Outf, Padding),
io:put_chars (Outf, Arg1),
io:put_chars ("\n").
 
%%%-------------------------------------------------------------------
%%%
%%% The token dispatcher.
%%%
 
get_next_token (Inp) ->
Inp00 = skip_spaces_and_comments (Inp),
{Ch, Inp0} = get_ch (Inp00),
{Char, Line_no, Column_no} = Ch,
Ln = Line_no,
Cn = Column_no,
case Char of
eof -> {{"End_of_input", "", Ln, Cn}, Inp0};
"," -> {{"Comma", ",", Ln, Cn}, Inp0};
";" -> {{"Semicolon", ";", Ln, Cn}, Inp0};
"(" -> {{"LeftParen", "(", Ln, Cn}, Inp0};
")" -> {{"RightParen", ")", Ln, Cn}, Inp0};
"{" -> {{"LeftBrace", "{", Ln, Cn}, Inp0};
"}" -> {{"RightBrace", "}", Ln, Cn}, Inp0};
"*" -> {{"Op_multiply", "*", Ln, Cn}, Inp0};
"/" -> {{"Op_divide", "/", Ln, Cn}, Inp0};
"%" -> {{"Op_mod", "%", Ln, Cn}, Inp0};
"+" -> {{"Op_add", "+", Ln, Cn}, Inp0};
"-" -> {{"Op_subtract", "-", Ln, Cn}, Inp0};
"<" ->
{Ch1, Inp1} = get_ch (Inp0),
{Char1, _, _} = Ch1,
case Char1 of
"=" -> {{"Op_lessequal", "<=", Ln, Cn}, Inp1};
_ -> {{"Op_less", "<", Ln, Cn}, push_back (Ch1, Inp1)}
end;
">" ->
{Ch1, Inp1} = get_ch (Inp0),
{Char1, _, _} = Ch1,
case Char1 of
"=" -> {{"Op_greaterequal", ">=", Ln, Cn}, Inp1};
_ -> {{"Op_greater", ">", Ln, Cn}, push_back (Ch1, Inp1)}
end;
"=" ->
{Ch1, Inp1} = get_ch (Inp0),
{Char1, _, _} = Ch1,
case Char1 of
"=" -> {{"Op_equal", "==", Ln, Cn}, Inp1};
_ -> {{"Op_assign", "=", Ln, Cn}, push_back (Ch1, Inp1)}
end;
"!" ->
{Ch1, Inp1} = get_ch (Inp0),
{Char1, _, _} = Ch1,
case Char1 of
"=" -> {{"Op_notequal", "!=", Ln, Cn}, Inp1};
_ -> {{"Op_not", "!", Ln, Cn}, push_back (Ch1, Inp1)}
end;
"&" ->
{Ch1, Inp1} = get_ch (Inp0),
{Char1, _, _} = Ch1,
case Char1 of
"&" -> {{"Op_and", "&&", Ln, Cn}, Inp1};
_ -> unexpected_character (Ln, Cn, Char)
end;
"|" ->
{Ch1, Inp1} = get_ch (Inp0),
{Char1, _, _} = Ch1,
case Char1 of
"|" -> {{"Op_or", "||", Ln, Cn}, Inp1};
_ -> unexpected_character (Ln, Cn, Char)
end;
"\"" ->
Inp1 = push_back (Ch, Inp0),
scan_string_literal (Inp1);
"'" ->
Inp1 = push_back (Ch, Inp0),
scan_character_literal (Inp1);
_ ->
case is_digit (Char) of
true ->
Inp1 = push_back (Ch, Inp0),
scan_integer_literal (Inp1);
false ->
case is_alpha_or_underscore (Char) of
true ->
Inp1 = push_back (Ch, Inp0),
scan_identifier_or_reserved_word (Inp1);
false ->
unexpected_character (Ln, Cn, Char)
end
end
end.
 
%%%-------------------------------------------------------------------
%%%
%%% Skipping past spaces and /* ... */ comments.
%%%
%%% Comments are treated exactly like a bit of whitespace. They never
%%% make it to the dispatcher.
%%%
 
skip_spaces_and_comments (Inp) ->
{Ch, Inp0} = get_ch (Inp),
{Char, Line_no, Column_no} = Ch,
case classify_char (Char) of
eof -> push_back (Ch, Inp0);
space -> skip_spaces_and_comments (Inp0);
slash ->
{Ch1, Inp1} = get_ch (Inp0),
case Ch1 of
{"*", _, _} ->
Inp2 = scan_comment (Inp1, Line_no, Column_no),
skip_spaces_and_comments (Inp2);
_ -> push_back (Ch, (push_back (Ch1, Inp1)))
end;
other -> push_back (Ch, Inp0)
end.
 
classify_char (Char) ->
case Char of
eof -> eof;
"/" -> slash;
_ -> case is_space (Char) of
true -> space;
false -> other
end
end.
 
scan_comment (Inp, Line_no, Column_no) ->
{Ch0, Inp0} = get_ch (Inp),
case Ch0 of
{eof, _, _} -> unterminated_comment (Line_no, Column_no);
{"*", _, _} ->
{Ch1, Inp1} = get_ch (Inp0),
case Ch1 of
{eof, _, _} ->
unterminated_comment (Line_no, Column_no);
{"/", _, _} -> Inp1;
_ -> scan_comment (Inp1, Line_no, Column_no)
end;
_ -> scan_comment (Inp0, Line_no, Column_no)
end.
 
is_space (S) ->
case re:run (S, "^[[:space:]]+$") of
{match, _} -> true;
_ -> false
end.
 
%%%-------------------------------------------------------------------
%%%
%%% Scanning of integer literals, identifiers, and reserved words.
%%%
%%% These three types of token are very similar to each other.
%%%
 
scan_integer_literal (Inp) ->
%% Scan an entire word, not just digits. This way we detect
%% erroneous text such as "23skidoo".
{Line_no, Column_no, Inp1} = get_position (Inp),
{Word, Inp2} = scan_word (Inp1),
case is_digit (Word) of
true -> {{"Integer", Word, Line_no, Column_no}, Inp2};
false -> invalid_integer_literal (Line_no, Column_no, Word)
end.
 
scan_identifier_or_reserved_word (Inp) ->
%% It is assumed that the first character is of the correct type,
%% thanks to the dispatcher.
{Line_no, Column_no, Inp1} = get_position (Inp),
{Word, Inp2} = scan_word (Inp1),
Tok =
case Word of
"if" -> "Keyword_if";
"else" -> "Keyword_else";
"while" -> "Keyword_while";
"print" -> "Keyword_print";
"putc" -> "Keyword_putc";
_ -> "Identifier"
end,
{{Tok, Word, Line_no, Column_no}, Inp2}.
 
scan_word (Inp) ->
scan_word_loop (Inp, "").
 
scan_word_loop (Inp, Word0) ->
{Ch1, Inp1} = get_ch (Inp),
{Char1, _, _} = Ch1,
case is_alnum_or_underscore (Char1) of
true -> scan_word_loop (Inp1, Word0 ++ Char1);
false -> {Word0, push_back (Ch1, Inp1)}
end.
 
get_position (Inp) ->
{Ch1, Inp1} = get_ch (Inp),
{_, Line_no, Column_no} = Ch1,
Inp2 = push_back (Ch1, Inp1),
{Line_no, Column_no, Inp2}.
 
is_digit (S) ->
case re:run (S, "^[[:digit:]]+$") of
{match, _} -> true;
_ -> false
end.
 
is_alpha_or_underscore (S) ->
case re:run (S, "^[[:alpha:]_]+$") of
{match, _} -> true;
_ -> false
end.
 
is_alnum_or_underscore (S) ->
case re:run (S, "^[[:alnum:]_]+$") of
{match, _} -> true;
_ -> false
end.
 
%%%-------------------------------------------------------------------
%%%
%%% Scanning of string literals.
%%%
%%% It is assumed that the first character is the opening quote, and
%%% that the closing quote is the same character.
%%%
 
 
scan_string_literal (Inp) ->
{Ch1, Inp1} = get_ch (Inp),
{Quote_mark, Line_no, Column_no} = Ch1,
{Contents, Inp2} = scan_str_lit (Inp1, Ch1),
Toktup = {"String", Quote_mark ++ Contents ++ Quote_mark,
Line_no, Column_no},
{Toktup, Inp2}.
 
scan_str_lit (Inp, Ch) -> scan_str_lit_loop (Inp, Ch, "").
 
scan_str_lit_loop (Inp, Ch, Contents) ->
{Quote_mark, Line_no, Column_no} = Ch,
{Ch1, Inp1} = get_ch (Inp),
{Char1, Line_no1, Column_no1} = Ch1,
case Char1 of
Quote_mark -> {Contents, Inp1};
eof -> eoi_in_string_literal (Line_no, Column_no);
"\n" -> eoln_in_string_literal (Line_no, Column_no);
"\\" ->
{Ch2, Inp2} = get_ch (Inp1),
{Char2, _, _} = Ch2,
case Char2 of
"n" ->
scan_str_lit_loop (Inp2, Ch, Contents ++ "\\n");
"\\" ->
scan_str_lit_loop (Inp2, Ch, Contents ++ "\\\\");
_ ->
unsupported_escape (Line_no1, Column_no1, Char2)
end;
_ -> scan_str_lit_loop (Inp1, Ch, Contents ++ Char1)
end.
 
%%%-------------------------------------------------------------------
%%%
%%% Scanning of character literals.
%%%
%%% It is assumed that the first character is the opening quote, and
%%% that the closing quote is the same character.
%%%
%%% The tedious part of scanning a character literal is distinguishing
%%% between the kinds of lexical error. (One might wish to modify the
%%% code to detect, as a distinct kind of error, end of line within a
%%% character literal.)
%%%
 
scan_character_literal (Inp) ->
{Ch, Inp0} = get_ch (Inp),
{_, Line_no, Column_no} = Ch,
{Ch1, Inp1} = get_ch (Inp0),
{Char1, Line_no1, Column_no1} = Ch1,
{Intval, Inp3} =
case Char1 of
eof -> unterminated_character_literal (Line_no, Column_no);
"\\" ->
{Ch2, Inp2} = get_ch (Inp1),
{Char2, _, _} = Ch2,
case Char2 of
eof -> unterminated_character_literal (Line_no,
Column_no);
"n" -> {char_to_code ("\n"), Inp2};
"\\" -> {char_to_code ("\\"), Inp2};
_ -> unsupported_escape (Line_no1, Column_no1,
Char2)
end;
_ -> {char_to_code (Char1), Inp1}
end,
Inp4 = check_character_literal_end (Inp3, Ch),
{{"Integer", Intval, Line_no, Column_no}, Inp4}.
 
char_to_code (Char) ->
%% Hat tip to https://archive.ph/BxZRS
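%% Convert a one-character string to its character code, rendered as a
%% decimal string (for example, "\n" becomes "10").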
lists:flatmap (fun erlang:integer_to_list/1, Char).
 
check_character_literal_end (Inp, Ch) ->
{Char, _, _} = Ch,
{{Char1, _, _}, Inp1} = get_ch (Inp),
case Char1 of
Char -> Inp1;
_ -> find_char_lit_end (Inp1, Ch) % Handle a lexical error.
end.
 
find_char_lit_end (Inp, Ch) ->
%% There is a lexical error. Determine which kind it fits into.
{Char, Line_no, Column_no} = Ch,
{{Char1, _, _}, Inp1} = get_ch (Inp),
case Char1 of
Char -> multicharacter_literal (Line_no, Column_no);
eof -> unterminated_character_literal (Line_no, Column_no);
_ -> find_char_lit_end (Inp1, Ch)
end.
 
%%%-------------------------------------------------------------------
%%%
%%% Character-at-a-time input, with unrestricted pushback, and with
%%% line and column numbering.
%%%
 
make_inp (Inpf) ->
#inp_t{inpf = Inpf,
pushback = [],
line_no = 1,
column_no = 1}.
 
get_ch (Inp) ->
#inp_t{inpf = Inpf,
pushback = Pushback,
line_no = Line_no,
column_no = Column_no} = Inp,
case Pushback of
[Ch | Tail] ->
Inp1 = Inp#inp_t{pushback = Tail},
{Ch, Inp1};
[] ->
case io:get_chars (Inpf, "", 1) of
eof ->
Ch = {eof, Line_no, Column_no},
{Ch, Inp};
{error, _} ->
Ch = {eof, Line_no, Column_no},
{Ch, Inp};
Char ->
case Char of
"\n" ->
Ch = {Char, Line_no, Column_no},
Inp1 = Inp#inp_t{line_no = Line_no + 1,
column_no = 1},
{Ch, Inp1};
_ ->
Ch = {Char, Line_no, Column_no},
Inp1 =
Inp#inp_t{column_no = Column_no + 1},
{Ch, Inp1}
end
end
end.
 
push_back (Ch, Inp) ->
Inp#inp_t{pushback = [Ch | Inp#inp_t.pushback]}.
 
%%%-------------------------------------------------------------------
 
invalid_integer_literal (Line_no, Column_no, Word) ->
error_abort ("invalid integer literal \"" ++
Word ++ "\" at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
unsupported_escape (Line_no, Column_no, Char) ->
error_abort ("unsupported escape \\" ++
Char ++ " at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
unexpected_character (Line_no, Column_no, Char) ->
error_abort ("unexpected character '" ++
Char ++ "' at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
eoi_in_string_literal (Line_no, Column_no) ->
error_abort ("end of input in string literal starting at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
eoln_in_string_literal (Line_no, Column_no) ->
error_abort ("end of line in string literal starting at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
unterminated_character_literal (Line_no, Column_no) ->
error_abort ("unterminated character literal starting at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
multicharacter_literal (Line_no, Column_no) ->
error_abort ("unsupported multicharacter literal starting at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
unterminated_comment (Line_no, Column_no) ->
error_abort ("unterminated comment starting at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
error_abort (Message) ->
ProgName = escript:script_name (),
io:put_chars (standard_error, ProgName),
io:put_chars (standard_error, ": "),
io:put_chars (standard_error, Message),
io:put_chars (standard_error, "\n"),
halt (1).
 
%%%-------------------------------------------------------------------
%%% Instructions to GNU Emacs --
%%% local variables:
%%% mode: erlang
%%% erlang-indent-level: 3
%%% end:
%%%-------------------------------------------------------------------</syntaxhighlight>
 
 
{{out}}
<pre>$ ./lex-in-Erlang compiler-tests/testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|Euphoria}}==
Tested with Euphoria 4.05.
<langsyntaxhighlight lang="euphoria">include std/io.e
include std/map.e
include std/types.e
Line 2,547 ⟶ 5,877:
end procedure
 
main(command_line())</langsyntaxhighlight>
 
{{out|case=test case 3}}
Line 2,591 ⟶ 5,921:
=={{header|Flex}}==
Tested with Flex 2.5.4.
<syntaxhighlight lang="c">%{
<lang C>%{
#include <stdio.h>
#include <stdlib.h>
Line 2,764 ⟶ 6,094:
} while (tok != tk_EOI);
return 0;
}</langsyntaxhighlight>
 
{{out|case=test case 3}}
Line 2,808 ⟶ 6,138:
=={{header|Forth}}==
Tested with Gforth 0.7.3.
<langsyntaxhighlight Forthlang="forth">CREATE BUF 0 , \ single-character look-ahead buffer
CREATE COLUMN# 0 ,
CREATE LINE# 1 ,
Line 2,930 ⟶ 6,260:
THEN THEN ;
: TOKENIZE BEGIN CONSUME AGAIN ;
TOKENIZE</langsyntaxhighlight>
 
{{out}}
Tested against all programs in [[Compiler/Sample programs]].
 
=={{header|Fortran}}==
{{works with|gfortran|11.2.1}}
You should name the source file ‘lex.F90’ so that gfortran knows to run the C preprocessor. I use the preprocessor to select between different ways of reading stream input from standard input.
 
(Despite the ‘.F90’ extension that I recommend, this is Fortran 2008/2018 code.)
 
There is ‘framework’ for supporting Unicode, but no actual Unicode support. To support Unicode reliably I would probably use the C interface and GNU libunistring.
 
The author has placed this Fortran code in the public domain.
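A minimal sketch of compiling and running the lexer with gfortran (assuming one of the sample test programs is available as <tt>testcase3.t</tt>; as noted in the comments around <tt>get_input_unit_char</tt>, adding <tt>-std=f2008</tt> or <tt>-std=f2018</tt> usually also requires <tt>-fall-intrinsics</tt> or <tt>-U__GFORTRAN__</tt>):
<pre>$ gfortran -o lex lex.F90
$ ./lex testcase3.t</pre>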
<syntaxhighlight lang="fortran">!!!
!!! An implementation of the Rosetta Code lexical analyzer task:
!!! https://rosettacode.org/wiki/Compiler/lexical_analyzer
!!!
!!! The C implementation was used as a reference on behavior, but was
!!! not adhered to for the implementation.
!!!
 
module string_buffers
use, intrinsic :: iso_fortran_env, only: error_unit
use, intrinsic :: iso_fortran_env, only: int64
 
implicit none
private
 
public :: strbuf_t
public :: strbuf_t_length_kind
public :: strbuf_t_character_kind
 
integer, parameter :: strbuf_t_length_kind = int64
 
! String buffers can handle Unicode.
integer, parameter :: strbuf_t_character_kind = selected_char_kind ('ISO_10646')
 
! Private abbreviations.
integer, parameter :: nk = strbuf_t_length_kind
integer, parameter :: ck = strbuf_t_character_kind
 
type :: strbuf_t
integer(kind = nk), private :: len = 0
!
! ‘chars’ is made public for efficient access to the individual
! characters.
!
character(1, kind = ck), allocatable, public :: chars(:)
contains
procedure, pass, private :: ensure_storage => strbuf_t_ensure_storage
procedure, pass :: to_unicode => strbuf_t_to_unicode
procedure, pass :: length => strbuf_t_length
procedure, pass :: set => strbuf_t_set
procedure, pass :: append => strbuf_t_append
generic :: assignment(=) => set
end type strbuf_t
 
contains
 
function strbuf_t_to_unicode (strbuf) result (s)
class(strbuf_t), intent(in) :: strbuf
character(:, kind = ck), allocatable :: s
 
!
! This does not actually ensure that the string is valid Unicode;
! any 31-bit ‘character’ is supported.
!
 
integer(kind = nk) :: i
 
allocate (character(len = strbuf%len, kind = ck) :: s)
do i = 1, strbuf%len
s(i:i) = strbuf%chars(i)
end do
end function strbuf_t_to_unicode
 
elemental function strbuf_t_length (strbuf) result (n)
class(strbuf_t), intent(in) :: strbuf
integer(kind = nk) :: n
 
n = strbuf%len
end function strbuf_t_length
 
elemental function next_power_of_two (x) result (y)
integer(kind = nk), intent(in) :: x
integer(kind = nk) :: y
 
!
! It is assumed that no more than 64 bits are used.
!
! The branch-free algorithm is that of
! https://archive.is/nKxAc#RoundUpPowerOf2
!
! Fill in bits until one less than the desired power of two is
! reached, and then add one.
!
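! For example, 5 - 1 = 4 fills in to 7, and 7 + 1 = 8; a power of two
! maps to itself.
!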
 
y = x - 1
y = ior (y, ishft (y, -1))
y = ior (y, ishft (y, -2))
y = ior (y, ishft (y, -4))
y = ior (y, ishft (y, -8))
y = ior (y, ishft (y, -16))
y = ior (y, ishft (y, -32))
y = y + 1
end function next_power_of_two
 
elemental function new_storage_size (length_needed) result (size)
integer(kind = nk), intent(in) :: length_needed
integer(kind = nk) :: size
 
! Increase storage by orders of magnitude.
 
if (2_nk**32 < length_needed) then
size = huge (1_nk)
else
size = next_power_of_two (length_needed)
end if
end function new_storage_size
 
subroutine strbuf_t_ensure_storage (strbuf, length_needed)
class(strbuf_t), intent(inout) :: strbuf
integer(kind = nk), intent(in) :: length_needed
 
integer(kind = nk) :: new_size
type(strbuf_t) :: new_strbuf
 
if (.not. allocated (strbuf%chars)) then
! Initialize a new strbuf%chars array.
new_size = new_storage_size (length_needed)
allocate (strbuf%chars(1:new_size))
else if (ubound (strbuf%chars, 1) < length_needed) then
! Allocate a new strbuf%chars array, larger than the current
! one, but containing the same characters.
new_size = new_storage_size (length_needed)
allocate (new_strbuf%chars(1:new_size))
new_strbuf%chars(1:strbuf%len) = strbuf%chars(1:strbuf%len)
call move_alloc (new_strbuf%chars, strbuf%chars)
end if
end subroutine strbuf_t_ensure_storage
 
subroutine strbuf_t_set (dst, src)
class(strbuf_t), intent(inout) :: dst
class(*), intent(in) :: src
 
integer(kind = nk) :: n
integer(kind = nk) :: i
 
select type (src)
type is (character(*, kind = ck))
n = len (src, kind = nk)
call dst%ensure_storage(n)
do i = 1, n
dst%chars(i) = src(i:i)
end do
dst%len = n
type is (character(*))
n = len (src, kind = nk)
call dst%ensure_storage(n)
do i = 1, n
dst%chars(i) = src(i:i)
end do
dst%len = n
class is (strbuf_t)
n = src%len
call dst%ensure_storage(n)
dst%chars(1:n) = src%chars(1:n)
dst%len = n
class default
error stop
end select
end subroutine strbuf_t_set
 
subroutine strbuf_t_append (dst, src)
class(strbuf_t), intent(inout) :: dst
class(*), intent(in) :: src
 
integer(kind = nk) :: n_dst, n_src, n
integer(kind = nk) :: i
 
select type (src)
type is (character(*, kind = ck))
n_dst = dst%len
n_src = len (src, kind = nk)
n = n_dst + n_src
call dst%ensure_storage(n)
do i = 1, n_src
dst%chars(n_dst + i) = src(i:i)
end do
dst%len = n
type is (character(*))
n_dst = dst%len
n_src = len (src, kind = nk)
n = n_dst + n_src
call dst%ensure_storage(n)
do i = 1, n_src
dst%chars(n_dst + i) = src(i:i)
end do
dst%len = n
class is (strbuf_t)
n_dst = dst%len
n_src = src%len
n = n_dst + n_src
call dst%ensure_storage(n)
dst%chars((n_dst + 1):n) = src%chars(1:n_src)
dst%len = n
class default
error stop
end select
end subroutine strbuf_t_append
 
end module string_buffers
 
module lexical_analysis
use, intrinsic :: iso_fortran_env, only: input_unit
use, intrinsic :: iso_fortran_env, only: output_unit
use, intrinsic :: iso_fortran_env, only: error_unit
use, intrinsic :: iso_fortran_env, only: int32
use, non_intrinsic :: string_buffers
 
implicit none
private
 
public :: lexer_input_t
public :: lexer_output_t
public :: run_lexer
 
integer, parameter :: input_file_unit_no = 100
integer, parameter :: output_file_unit_no = 101
 
! Private abbreviations.
integer, parameter :: nk = strbuf_t_length_kind
integer, parameter :: ck = strbuf_t_character_kind
 
! Integers large enough for a Unicode code point. Unicode code
! points (and UCS-4) have never been allowed to go higher than
! 7FFFFFFF, and are even further restricted now.
integer, parameter :: ichar_kind = int32
 
character(1, kind = ck), parameter :: horizontal_tab_char = char (9, kind = ck)
character(1, kind = ck), parameter :: linefeed_char = char (10, kind = ck)
character(1, kind = ck), parameter :: vertical_tab_char = char (11, kind = ck)
character(1, kind = ck), parameter :: formfeed_char = char (12, kind = ck)
character(1, kind = ck), parameter :: carriage_return_char = char (13, kind = ck)
character(1, kind = ck), parameter :: space_char = ck_' '
 
! The following is correct for Unix and its relatives.
character(1, kind = ck), parameter :: newline_char = linefeed_char
 
character(1, kind = ck), parameter :: backslash_char = char (92, kind = ck)
 
character(*, kind = ck), parameter :: newline_intstring = ck_'10'
character(*, kind = ck), parameter :: backslash_intstring = ck_'92'
 
integer, parameter :: tk_EOI = 0
integer, parameter :: tk_Mul = 1
integer, parameter :: tk_Div = 2
integer, parameter :: tk_Mod = 3
integer, parameter :: tk_Add = 4
integer, parameter :: tk_Sub = 5
integer, parameter :: tk_Negate = 6
integer, parameter :: tk_Not = 7
integer, parameter :: tk_Lss = 8
integer, parameter :: tk_Leq = 9
integer, parameter :: tk_Gtr = 10
integer, parameter :: tk_Geq = 11
integer, parameter :: tk_Eq = 12
integer, parameter :: tk_Neq = 13
integer, parameter :: tk_Assign = 14
integer, parameter :: tk_And = 15
integer, parameter :: tk_Or = 16
integer, parameter :: tk_If = 17
integer, parameter :: tk_Else = 18
integer, parameter :: tk_While = 19
integer, parameter :: tk_Print = 20
integer, parameter :: tk_Putc = 21
integer, parameter :: tk_Lparen = 22
integer, parameter :: tk_Rparen = 23
integer, parameter :: tk_Lbrace = 24
integer, parameter :: tk_Rbrace = 25
integer, parameter :: tk_Semi = 26
integer, parameter :: tk_Comma = 27
integer, parameter :: tk_Ident = 28
integer, parameter :: tk_Integer = 29
integer, parameter :: tk_String = 30
 
character(len = 16), parameter :: token_names(0:30) = &
& (/ "End_of_input ", "Op_multiply ", "Op_divide ", "Op_mod ", "Op_add ", &
& "Op_subtract ", "Op_negate ", "Op_not ", "Op_less ", "Op_lessequal ", &
& "Op_greater ", "Op_greaterequal ", "Op_equal ", "Op_notequal ", "Op_assign ", &
& "Op_and ", "Op_or ", "Keyword_if ", "Keyword_else ", "Keyword_while ", &
& "Keyword_print ", "Keyword_putc ", "LeftParen ", "RightParen ", "LeftBrace ", &
& "RightBrace ", "Semicolon ", "Comma ", "Identifier ", "Integer ", &
& "String " /)
 
type :: token_t
integer :: token_no
 
! Our implementation stores the value of a tk_Integer as a
! string. The C reference implementation stores it as an int.
character(:, kind = ck), allocatable :: val
 
integer(nk) :: line_no
integer(nk) :: column_no
end type token_t
 
type :: lexer_input_t
logical, private :: using_input_unit = .true.
integer, private :: unit_no = -(huge (1))
integer(kind = nk) :: line_no = 1
integer(kind = nk) :: column_no = 0
integer, private :: unget_count = 0
 
! The maximum lookahead is 2, although I believe we are using
! only 1. In principle, the lookahead could be any finite number.
character(1, kind = ck), private :: unget_buffer(1:2)
logical, private :: unget_eof_buffer(1:2)
 
! Using the same strbuf_t multiple times reduces the need for
! reallocations. Putting that strbuf_t in the lexer_input_t is
! simply for convenience.
type(strbuf_t), private :: strbuf
 
contains
!
! Note: There is currently no facility for closing one input and
! switching to another.
!
! Note: There is currently no facility to decode inputs into
! Unicode codepoints. Instead, what happens is raw bytes of
! input get stored as strbuf_t_character_kind values. This
! behavior is adequate for ASCII inputs.
!
procedure, pass :: use_file => lexer_input_t_use_file
procedure, pass :: get_next_ch => lexer_input_t_get_next_ch
procedure, pass :: unget_ch => lexer_input_t_unget_ch
procedure, pass :: unget_eof => lexer_input_t_unget_eof
end type lexer_input_t
 
type :: lexer_output_t
integer, private :: unit_no = output_unit
contains
procedure, pass :: use_file => lexer_output_t_use_file
procedure, pass :: output_token => lexer_output_t_output_token
end type lexer_output_t
 
contains
 
subroutine lexer_input_t_use_file (inputter, filename)
class(lexer_input_t), intent(inout) :: inputter
character(*), intent(in) :: filename
 
integer :: stat
 
inputter%using_input_unit = .false.
inputter%unit_no = input_file_unit_no
inputter%line_no = 1
inputter%column_no = 0
 
open (unit = input_file_unit_no, file = filename, status = 'old', &
& action = 'read', access = 'stream', form = 'unformatted', &
& iostat = stat)
if (stat /= 0) then
write (error_unit, '("Error: failed to open ", A, " for input")') filename
stop 1
end if
end subroutine lexer_input_t_use_file
 
!!!
!!! If you tell gfortran you want -std=f2008 or -std=f2018, you likely
!!! will need to add also -fall-intrinsics or -U__GFORTRAN__
!!!
!!! The first way, you get the FGETC intrinsic. The latter way, you
!!! get the C interface code that uses getchar(3).
!!!
#ifdef __GFORTRAN__
 
subroutine get_input_unit_char (c, stat)
!
! The following works if you are using gfortran.
!
! (FGETC is considered a feature for backwards compatibility with
! g77. However, I know of no way to reconfigure input_unit as a
! Fortran 2003 stream, for use with ordinary ‘read’.)
!
character, intent(inout) :: c
integer, intent(out) :: stat
 
call fgetc (input_unit, c, stat)
end subroutine get_input_unit_char
 
#else
 
subroutine get_input_unit_char (c, stat)
!
! An alternative implementation of get_input_unit_char. This
! actually reads input from the C standard input, which might not
! be the same as input_unit.
!
use, intrinsic :: iso_c_binding, only: c_int
character, intent(inout) :: c
integer, intent(out) :: stat
 
interface
!
! Use getchar(3) to read characters from standard input. This
! assumes there is actually such a function available, and that
! getchar(3) does not exist solely as a macro. (One could write
! one’s own getchar() if necessary, of course.)
!
function getchar () result (c) bind (c, name = 'getchar')
use, intrinsic :: iso_c_binding, only: c_int
integer(kind = c_int) :: c
end function getchar
end interface
 
integer(kind = c_int) :: i_char
 
i_char = getchar ()
!
! The C standard requires that EOF have a negative value. If the
! value returned by getchar(3) is not EOF, then it will be
! representable as an unsigned char. Therefore, to check for end
! of file, one need only test whether i_char is negative.
!
if (i_char < 0) then
stat = -1
else
stat = 0
c = char (i_char)
end if
end subroutine get_input_unit_char
 
#endif
 
subroutine lexer_input_t_get_next_ch (inputter, eof, ch)
class(lexer_input_t), intent(inout) :: inputter
logical, intent(out) :: eof
character(1, kind = ck), intent(inout) :: ch
 
integer :: stat
character(1) :: c = '*'
 
if (0 < inputter%unget_count) then
if (inputter%unget_eof_buffer(inputter%unget_count)) then
eof = .true.
else
eof = .false.
ch = inputter%unget_buffer(inputter%unget_count)
end if
inputter%unget_count = inputter%unget_count - 1
else
if (inputter%using_input_unit) then
call get_input_unit_char (c, stat)
else
read (unit = inputter%unit_no, iostat = stat) c
end if
 
ch = char (ichar (c, kind = ichar_kind), kind = ck)
 
if (0 < stat) then
write (error_unit, '("Input error with status code ", I0)') stat
stop 1
else if (stat < 0) then
eof = .true.
! The C reference code increases column number on end of file;
! therefore, so shall we.
inputter%column_no = inputter%column_no + 1
else
eof = .false.
if (ch == newline_char) then
inputter%line_no = inputter%line_no + 1
inputter%column_no = 0
else
inputter%column_no = inputter%column_no + 1
end if
end if
end if
end subroutine lexer_input_t_get_next_ch
 
subroutine lexer_input_t_unget_ch (inputter, ch)
class(lexer_input_t), intent(inout) :: inputter
character(1, kind = ck), intent(in) :: ch
 
if (ubound (inputter%unget_buffer, 1) <= inputter%unget_count) then
write (error_unit, '("class(lexer_input_t) unget buffer overflow")')
stop 1
else
inputter%unget_count = inputter%unget_count + 1
inputter%unget_buffer(inputter%unget_count) = ch
inputter%unget_eof_buffer(inputter%unget_count) = .false.
end if
end subroutine lexer_input_t_unget_ch
 
subroutine lexer_input_t_unget_eof (inputter)
class(lexer_input_t), intent(inout) :: inputter
 
if (ubound (inputter%unget_buffer, 1) <= inputter%unget_count) then
write (error_unit, '("class(lexer_input_t) unget buffer overflow")')
stop 1
else
inputter%unget_count = inputter%unget_count + 1
inputter%unget_buffer(inputter%unget_count) = ck_'*'
inputter%unget_eof_buffer(inputter%unget_count) = .true.
end if
end subroutine lexer_input_t_unget_eof
 
subroutine lexer_output_t_use_file (outputter, filename)
class(lexer_output_t), intent(inout) :: outputter
character(*), intent(in) :: filename
 
integer :: stat
 
outputter%unit_no = output_file_unit_no
open (unit = output_file_unit_no, file = filename, action = 'write', iostat = stat)
if (stat /= 0) then
write (error_unit, '("Error: failed to open ", A, " for output")') filename
stop 1
end if
end subroutine lexer_output_t_use_file
 
subroutine lexer_output_t_output_token (outputter, token)
class(lexer_output_t), intent(inout) :: outputter
class(token_t), intent(in) :: token
 
select case (token%token_no)
case (tk_Integer, tk_Ident, tk_String)
write (outputter%unit_no, '(1X, I20, 1X, I20, 1X, A, 1X, A)') &
& token%line_no, token%column_no, &
& token_names(token%token_no), token%val
case default
write (outputter%unit_no, '(1X, I20, 1X, I20, 1X, A)') &
& token%line_no, token%column_no, &
& trim (token_names(token%token_no))
end select
end subroutine lexer_output_t_output_token
 
subroutine run_lexer (inputter, outputter)
class(lexer_input_t), intent(inout) :: inputter
class(lexer_output_t), intent(inout) :: outputter
 
type(token_t) :: token
 
token = get_token (inputter)
do while (token%token_no /= tk_EOI)
call outputter%output_token (token)
token = get_token (inputter)
end do
call outputter%output_token (token)
end subroutine run_lexer
 
function get_token (inputter) result (token)
class(lexer_input_t), intent(inout) :: inputter
type(token_t) :: token
 
logical :: eof
character(1, kind = ck) :: ch
 
call skip_spaces_and_comments (inputter, eof, ch, &
& token%line_no, token%column_no)
 
if (eof) then
token%token_no = tk_EOI
else
select case (ch)
case (ck_'{')
token%token_no = tk_Lbrace
case (ck_'}')
token%token_no = tk_Rbrace
case (ck_'(')
token%token_no = tk_Lparen
case (ck_')')
token%token_no = tk_Rparen
case (ck_'+')
token%token_no = tk_Add
case (ck_'-')
token%token_no = tk_Sub
case (ck_'*')
token%token_no = tk_Mul
case (ck_'%')
token%token_no = tk_Mod
case (ck_';')
token%token_no = tk_Semi
case (ck_',')
token%token_no = tk_Comma
case (ck_'/')
token%token_no = tk_Div
 
case (ck_"'")
call read_character_literal
 
case (ck_'<')
call distinguish_operators (ch, ck_'=', tk_Leq, tk_Lss)
case (ck_'>')
call distinguish_operators (ch, ck_'=', tk_Geq, tk_Gtr)
case (ck_'=')
call distinguish_operators (ch, ck_'=', tk_Eq, tk_Assign)
case (ck_'!')
call distinguish_operators (ch, ck_'=', tk_Neq, tk_Not)
case (ck_'&')
call distinguish_operators (ch, ck_'&', tk_And, tk_EOI)
case (ck_'|')
call distinguish_operators (ch, ck_'|', tk_Or, tk_EOI)
 
case (ck_'"')
call read_string_literal (ch, ch)
 
case default
if (isdigit (ch)) then
call read_numeric_literal (ch)
else if (isalpha_or_underscore (ch)) then
call read_identifier_or_keyword (ch)
else
call start_error_message (inputter)
write (error_unit, '("unrecognized character ''", A, "''")') ch
stop 1
end if
end select
end if
contains
 
subroutine read_character_literal
character(1, kind = ck) :: ch
logical :: eof
character(20, kind = ck) :: buffer
 
token%token_no = tk_Integer
 
call inputter%get_next_ch (eof, ch)
if (eof) then
call start_error_message (inputter)
write (error_unit, '("end of input in character literal")')
stop 1
else if (ch == ck_"'") then
call start_error_message (inputter)
write (error_unit, '("empty character literal")')
stop 1
else if (ch == backslash_char) then
call inputter%get_next_ch (eof, ch)
if (eof) then
call start_error_message (inputter)
write (error_unit, '("end of input in character literal, after backslash")')
stop 1
else if (ch == ck_'n') then
allocate (token%val, source = newline_intstring)
else if (ch == backslash_char) then
allocate (token%val, source = backslash_intstring)
else
call start_error_message (inputter)
write (error_unit, '("unknown escape sequence ''", A, A, "'' in character literal")') &
& backslash_char, ch
stop 1
end if
call read_character_literal_close_quote
else
call read_character_literal_close_quote
write (buffer, '(I0)') ichar (ch, kind = ichar_kind)
allocate (token%val, source = trim (buffer))
end if
end subroutine read_character_literal
 
subroutine read_character_literal_close_quote
logical :: eof
character(1, kind = ck) :: close_quote
 
call inputter%get_next_ch (eof, close_quote)
if (eof) then
call start_error_message (inputter)
write (error_unit, '("end of input in character literal")')
stop 1
else if (close_quote /= ck_"'") then
call start_error_message (inputter)
write (error_unit, '("multi-character literal")')
stop 1
end if
end subroutine read_character_literal_close_quote
 
subroutine distinguish_operators (first_ch, second_ch, &
& token_no_if_second_ch, &
& token_no_if_no_second_ch)
character(1, kind = ck), intent(in) :: first_ch
character(1, kind = ck), intent(in) :: second_ch
integer, intent(in) :: token_no_if_second_ch
integer, intent(in) :: token_no_if_no_second_ch
 
character(1, kind = ck) :: ch
logical :: eof
 
call inputter%get_next_ch (eof, ch)
if (eof) then
call inputter%unget_eof
token%token_no = token_no_if_no_second_ch
else if (ch == second_ch) then
token%token_no = token_no_if_second_ch
else if (token_no_if_no_second_ch == tk_EOI) then
call start_error_message (inputter)
write (error_unit, '("unrecognized character ''", A, "''")') first_ch
stop 1
else
call inputter%unget_ch (ch)
token%token_no = token_no_if_no_second_ch
end if
end subroutine distinguish_operators
 
subroutine read_string_literal (opening_quote, closing_quote)
character(1, kind = ck), intent(in) :: opening_quote
character(1, kind = ck), intent(in) :: closing_quote
 
character(1, kind = ck) :: ch
logical :: done
 
inputter%strbuf = opening_quote
done = .false.
do while (.not. done)
call inputter%get_next_ch (eof, ch)
if (eof) then
call start_error_message (inputter)
write (error_unit, '("end of input in string literal")')
stop 1
else if (ch == closing_quote) then
call inputter%strbuf%append(ch)
done = .true.
else if (ch == newline_char) then
call start_error_message (inputter)
write (error_unit, '("end of line in string literal")')
stop 1
else
call inputter%strbuf%append(ch)
end if
end do
allocate (token%val, source = inputter%strbuf%to_unicode())
token%token_no = tk_String
end subroutine read_string_literal
 
subroutine read_numeric_literal (first_ch)
character(1, kind = ck), intent(in) :: first_ch
 
character(1, kind = ck) :: ch
 
token%token_no = tk_Integer
 
inputter%strbuf = first_ch
call inputter%get_next_ch (eof, ch)
do while (isdigit (ch))
call inputter%strbuf%append (ch)
call inputter%get_next_ch (eof, ch)
end do
if (isalpha_or_underscore (ch)) then
call start_error_message (inputter)
write (error_unit, '("invalid numeric literal """, A, """")') &
& inputter%strbuf%to_unicode()
stop 1
else
call inputter%unget_ch (ch)
allocate (token%val, source = inputter%strbuf%to_unicode())
end if
end subroutine read_numeric_literal
 
subroutine read_identifier_or_keyword (first_ch)
character(1, kind = ck), intent(in) :: first_ch
 
character(1, kind = ck) :: ch
 
inputter%strbuf = first_ch
call inputter%get_next_ch (eof, ch)
do while (isalnum_or_underscore (ch))
call inputter%strbuf%append (ch)
call inputter%get_next_ch (eof, ch)
end do
 
call inputter%unget_ch (ch)
 
!
! The following is a handwritten ‘implicit radix tree’ search
! for keywords, first partitioning the set of keywords according
! to their lengths.
!
! I did it this way for fun. One could, of course, write a
! program to generate code for such a search.
!
! Perfect hashes are another method one could use.
!
! The reference C implementation uses a binary search.
!
token%token_no = tk_Ident
select case (inputter%strbuf%length())
case (2)
select case (inputter%strbuf%chars(1))
case (ck_'i')
select case (inputter%strbuf%chars(2))
case (ck_'f')
token%token_no = tk_If
case default
continue
end select
case default
continue
end select
case (4)
select case (inputter%strbuf%chars(1))
case (ck_'e')
select case (inputter%strbuf%chars(2))
case (ck_'l')
select case (inputter%strbuf%chars(3))
case (ck_'s')
select case (inputter%strbuf%chars(4))
case (ck_'e')
token%token_no = tk_Else
case default
continue
end select
case default
continue
end select
case default
continue
end select
case (ck_'p')
select case (inputter%strbuf%chars(2))
case (ck_'u')
select case (inputter%strbuf%chars(3))
case (ck_'t')
select case (inputter%strbuf%chars(4))
case (ck_'c')
token%token_no = tk_Putc
case default
continue
end select
case default
continue
end select
case default
continue
end select
case default
continue
end select
case (5)
select case (inputter%strbuf%chars(1))
case (ck_'p')
select case (inputter%strbuf%chars(2))
case (ck_'r')
select case (inputter%strbuf%chars(3))
case (ck_'i')
select case (inputter%strbuf%chars(4))
case (ck_'n')
select case (inputter%strbuf%chars(5))
case (ck_'t')
token%token_no = tk_Print
case default
continue
end select
case default
continue
end select
case default
continue
end select
case default
continue
end select
case (ck_'w')
select case (inputter%strbuf%chars(2))
case (ck_'h')
select case (inputter%strbuf%chars(3))
case (ck_'i')
select case (inputter%strbuf%chars(4))
case (ck_'l')
select case (inputter%strbuf%chars(5))
case (ck_'e')
token%token_no = tk_While
case default
continue
end select
case default
continue
end select
case default
continue
end select
case default
continue
end select
case default
continue
end select
case default
continue
end select
if (token%token_no == tk_Ident) then
allocate (token%val, source = inputter%strbuf%to_unicode ())
end if
end subroutine read_identifier_or_keyword
 
end function get_token
 
subroutine skip_spaces_and_comments (inputter, eof, ch, line_no, column_no)
!
! This procedure skips spaces and comments, and also captures the
! line and column numbers at the correct moment to indicate the
! start of a token.
!
class(lexer_input_t), intent(inout) :: inputter
logical, intent(out) :: eof
character(1, kind = ck), intent(inout) :: ch
integer(kind = nk), intent(out) :: line_no
integer(kind = nk), intent(out) :: column_no
 
integer(kind = nk), parameter :: not_done = -(huge (1_nk))
 
line_no = not_done
do while (line_no == not_done)
call inputter%get_next_ch (eof, ch)
if (eof) then
line_no = inputter%line_no
column_no = inputter%column_no
else if (ch == ck_'/') then
line_no = inputter%line_no
column_no = inputter%column_no
call inputter%get_next_ch (eof, ch)
if (eof) then
call inputter%unget_eof
ch = ck_'/'
else if (ch /= ck_'*') then
call inputter%unget_ch (ch)
ch = ck_'/'
else
call read_to_end_of_comment
line_no = not_done
end if
else if (.not. isspace (ch)) then
line_no = inputter%line_no
column_no = inputter%column_no
end if
end do
 
contains
 
subroutine read_to_end_of_comment
logical :: done
 
done = .false.
do while (.not. done)
call inputter%get_next_ch (eof, ch)
if (eof) then
call end_of_input_in_comment
else if (ch == ck_'*') then
call inputter%get_next_ch (eof, ch)
if (eof) then
call end_of_input_in_comment
else if (ch == ck_'/') then
done = .true.
end if
end if
end do
end subroutine read_to_end_of_comment
 
subroutine end_of_input_in_comment
call start_error_message (inputter)
write (error_unit, '("end of input in comment")')
stop 1
end subroutine end_of_input_in_comment
 
end subroutine skip_spaces_and_comments
 
subroutine start_error_message (inputter)
class(lexer_input_t), intent(inout) :: inputter
 
write (error_unit, '("Lexical error at ", I0, ".", I0, ": ")', advance = 'no') &
& inputter%line_no, inputter%column_no
end subroutine start_error_message
 
elemental function isspace (ch) result (bool)
character(1, kind = ck), intent(in) :: ch
logical :: bool
 
bool = (ch == horizontal_tab_char) .or. &
& (ch == linefeed_char) .or. &
& (ch == vertical_tab_char) .or. &
& (ch == formfeed_char) .or. &
& (ch == carriage_return_char) .or. &
& (ch == space_char)
end function isspace
 
elemental function isupper (ch) result (bool)
character(1, kind = ck), intent(in) :: ch
logical :: bool
 
integer(kind = ichar_kind), parameter :: uppercase_A = ichar (ck_'A', kind = ichar_kind)
integer(kind = ichar_kind), parameter :: uppercase_Z = ichar (ck_'Z', kind = ichar_kind)
 
integer(kind = ichar_kind) :: i_ch
 
i_ch = ichar (ch, kind = ichar_kind)
bool = (uppercase_A <= i_ch .and. i_ch <= uppercase_Z)
end function isupper
 
elemental function islower (ch) result (bool)
character(1, kind = ck), intent(in) :: ch
logical :: bool
 
integer(kind = ichar_kind), parameter :: lowercase_a = ichar (ck_'a', kind = ichar_kind)
integer(kind = ichar_kind), parameter :: lowercase_z = ichar (ck_'z', kind = ichar_kind)
 
integer(kind = ichar_kind) :: i_ch
 
i_ch = ichar (ch, kind = ichar_kind)
bool = (lowercase_a <= i_ch .and. i_ch <= lowercase_z)
end function islower
 
elemental function isalpha (ch) result (bool)
character(1, kind = ck), intent(in) :: ch
logical :: bool
 
bool = isupper (ch) .or. islower (ch)
end function isalpha
 
elemental function isdigit (ch) result (bool)
character(1, kind = ck), intent(in) :: ch
logical :: bool
 
integer(kind = ichar_kind), parameter :: zero = ichar (ck_'0', kind = ichar_kind)
integer(kind = ichar_kind), parameter :: nine = ichar (ck_'9', kind = ichar_kind)
 
integer(kind = ichar_kind) :: i_ch
 
i_ch = ichar (ch, kind = ichar_kind)
bool = (zero <= i_ch .and. i_ch <= nine)
end function isdigit
 
elemental function isalnum (ch) result (bool)
character(1, kind = ck), intent(in) :: ch
logical :: bool
 
bool = isalpha (ch) .or. isdigit (ch)
end function isalnum
 
elemental function isalpha_or_underscore (ch) result (bool)
character(1, kind = ck), intent(in) :: ch
logical :: bool
 
bool = isalpha (ch) .or. (ch == ck_'_')
end function isalpha_or_underscore
 
elemental function isalnum_or_underscore (ch) result (bool)
character(1, kind = ck), intent(in) :: ch
logical :: bool
 
bool = isalnum (ch) .or. (ch == ck_'_')
end function isalnum_or_underscore
 
end module lexical_analysis
 
program lex
use, intrinsic :: iso_fortran_env, only: output_unit
use, non_intrinsic :: lexical_analysis
 
implicit none
 
integer :: arg_count
character(200) :: arg
type(lexer_input_t) :: inputter
type(lexer_output_t) :: outputter
 
arg_count = command_argument_count ()
if (3 <= arg_count) then
call print_usage
else if (arg_count == 0) then
call run_lexer (inputter, outputter)
else if (arg_count == 1) then
call get_command_argument (1, arg)
call inputter%use_file(trim (arg))
call run_lexer (inputter, outputter)
else if (arg_count == 2) then
call get_command_argument (1, arg)
call inputter%use_file(trim (arg))
call get_command_argument (2, arg)
call outputter%use_file(trim (arg))
call run_lexer (inputter, outputter)
end if
 
contains
 
subroutine print_usage
character(200) :: progname
 
call get_command_argument (0, progname)
write (output_unit, '("Usage: ", A, " [INPUT_FILE [OUTPUT_FILE]]")') &
& trim (progname)
end subroutine print_usage
end program lex</syntaxhighlight>
 
{{out}}
Test case 3.
<pre> 5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|FreeBASIC}}==
Tested with FreeBASIC 1.05
<syntaxhighlight lang="freebasic">enum Token_type
tk_EOI
tk_Mul
print : print "Hit any to end program"
sleep
system</syntaxhighlight>
{{out|case=test case 3}}
<b>
22 30 End_of_input</pre>
</b>
 
=={{header|Go}}==
{{trans|FreeBASIC}}
<syntaxhighlight lang="go">package main
 
import (
"bufio"
"fmt"
"log"
"os"
)
 
type TokenType int
 
const (
tkEOI TokenType = iota
tkMul
tkDiv
tkMod
tkAdd
tkSub
tkNegate
tkNot
tkLss
tkLeq
tkGtr
tkGeq
tkEq
tkNeq
tkAssign
tkAnd
tkOr
tkIf
tkElse
tkWhile
tkPrint
tkPutc
tkLparen
tkRparen
tkLbrace
tkRbrace
tkSemi
tkComma
tkIdent
tkInteger
tkString
)
 
type Symbol struct {
name string
tok TokenType
}
 
// symbol table
var symtab []Symbol
 
var scanner *bufio.Scanner
 
var (
curLine = ""
curCh byte
lineNum = 0
colNum = 0
)
 
const etx byte = 4 // used to signify EOI
 
func isDigit(ch byte) bool {
return ch >= '0' && ch <= '9'
}
 
func isAlnum(ch byte) bool {
return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || isDigit(ch)
}
 
func errorMsg(eline, ecol int, msg string) {
log.Fatalf("(%d:%d) %s", eline, ecol, msg)
}
 
// add an identifier to the symbol table
func install(name string, tok TokenType) {
sym := Symbol{name, tok}
symtab = append(symtab, sym)
}
 
// search for an identifier in the symbol table
func lookup(name string) int {
for i := 0; i < len(symtab); i++ {
if symtab[i].name == name {
return i
}
}
return -1
}
 
// read the next line of input from the source file
func nextLine() {
if scanner.Scan() {
curLine = scanner.Text()
lineNum++
colNum = 0
if curLine == "" { // skip blank lines
nextLine()
}
} else {
err := scanner.Err()
if err == nil { // EOF
curCh = etx
curLine = ""
lineNum++
colNum = 1
} else {
log.Fatal(err)
}
}
}
 
// get the next char
func nextChar() {
if colNum >= len(curLine) {
nextLine()
}
if colNum < len(curLine) {
curCh = curLine[colNum]
colNum++
}
}
 
func follow(eline, ecol int, expect byte, ifyes, ifno TokenType) TokenType {
if curCh == expect {
nextChar()
return ifyes
}
if ifno == tkEOI {
errorMsg(eline, ecol, "follow unrecognized character: "+string(curCh))
}
return ifno
}
 
func gettok() (eline, ecol int, tok TokenType, v string) {
// skip whitespace
for curCh == ' ' || curCh == '\t' || curCh == '\n' {
nextChar()
}
eline = lineNum
ecol = colNum
switch curCh {
case etx:
tok = tkEOI
return
case '{':
tok = tkLbrace
nextChar()
return
case '}':
tok = tkRbrace
nextChar()
return
case '(':
tok = tkLparen
nextChar()
return
case ')':
tok = tkRparen
nextChar()
return
case '+':
tok = tkAdd
nextChar()
return
case '-':
tok = tkSub
nextChar()
return
case '*':
tok = tkMul
nextChar()
return
case '%':
tok = tkMod
nextChar()
return
case ';':
tok = tkSemi
nextChar()
return
case ',':
tok = tkComma
nextChar()
return
case '/': // div or comment
nextChar()
if curCh != '*' {
tok = tkDiv
return
}
// skip comments
nextChar()
for {
if curCh == '*' {
nextChar()
if curCh == '/' {
nextChar()
eline, ecol, tok, v = gettok()
return
}
} else if curCh == etx {
errorMsg(eline, ecol, "EOF in comment")
} else {
nextChar()
}
}
case '\'': // single char literals
nextChar()
v = fmt.Sprintf("%d", curCh)
if curCh == '\'' {
errorMsg(eline, ecol, "Empty character constant")
}
if curCh == '\\' {
nextChar()
if curCh == 'n' {
v = "10"
} else if curCh == '\\' {
v = "92"
} else {
errorMsg(eline, ecol, "unknown escape sequence: "+string(curCh))
}
}
nextChar()
if curCh != '\'' {
errorMsg(eline, ecol, "multi-character constant")
}
nextChar()
tok = tkInteger
return
case '<':
nextChar()
tok = follow(eline, ecol, '=', tkLeq, tkLss)
return
case '>':
nextChar()
tok = follow(eline, ecol, '=', tkGeq, tkGtr)
return
case '!':
nextChar()
tok = follow(eline, ecol, '=', tkNeq, tkNot)
return
case '=':
nextChar()
tok = follow(eline, ecol, '=', tkEq, tkAssign)
return
case '&':
nextChar()
tok = follow(eline, ecol, '&', tkAnd, tkEOI)
return
case '|':
nextChar()
tok = follow(eline, ecol, '|', tkOr, tkEOI)
return
case '"': // string
v = string(curCh)
nextChar()
for curCh != '"' {
if curCh == '\n' {
errorMsg(eline, ecol, "EOL in string")
}
if curCh == etx {
errorMsg(eline, ecol, "EOF in string")
}
v += string(curCh)
nextChar()
}
v += string(curCh)
nextChar()
tok = tkString
return
default: // integers or identifiers
isNumber := isDigit(curCh)
v = ""
for isAlnum(curCh) || curCh == '_' {
if !isDigit(curCh) {
isNumber = false
}
v += string(curCh)
nextChar()
}
if len(v) == 0 {
errorMsg(eline, ecol, "unknown character: "+string(curCh))
}
if isDigit(v[0]) {
if !isNumber {
errorMsg(eline, ecol, "invalid number: "+string(curCh))
}
tok = tkInteger
return
}
index := lookup(v)
if index == -1 {
tok = tkIdent
} else {
tok = symtab[index].tok
}
return
}
}
 
func initLex() {
install("else", tkElse)
install("if", tkIf)
install("print", tkPrint)
install("putc", tkPutc)
install("while", tkWhile)
nextChar()
}
 
func process() {
tokMap := make(map[TokenType]string)
tokMap[tkEOI] = "End_of_input"
tokMap[tkMul] = "Op_multiply"
tokMap[tkDiv] = "Op_divide"
tokMap[tkMod] = "Op_mod"
tokMap[tkAdd] = "Op_add"
tokMap[tkSub] = "Op_subtract"
tokMap[tkNegate] = "Op_negate"
tokMap[tkNot] = "Op_not"
tokMap[tkLss] = "Op_less"
tokMap[tkLeq] = "Op_lessequal"
tokMap[tkGtr] = "Op_greater"
tokMap[tkGeq] = "Op_greaterequal"
tokMap[tkEq] = "Op_equal"
tokMap[tkNeq] = "Op_notequal"
tokMap[tkAssign] = "Op_assign"
tokMap[tkAnd] = "Op_and"
tokMap[tkOr] = "Op_or"
tokMap[tkIf] = "Keyword_if"
tokMap[tkElse] = "Keyword_else"
tokMap[tkWhile] = "Keyword_while"
tokMap[tkPrint] = "Keyword_print"
tokMap[tkPutc] = "Keyword_putc"
tokMap[tkLparen] = "LeftParen"
tokMap[tkRparen] = "RightParen"
tokMap[tkLbrace] = "LeftBrace"
tokMap[tkRbrace] = "RightBrace"
tokMap[tkSemi] = "Semicolon"
tokMap[tkComma] = "Comma"
tokMap[tkIdent] = "Identifier"
tokMap[tkInteger] = "Integer"
tokMap[tkString] = "String"
 
for {
eline, ecol, tok, v := gettok()
fmt.Printf("%5d %5d %-16s", eline, ecol, tokMap[tok])
if tok == tkInteger || tok == tkIdent || tok == tkString {
fmt.Println(v)
} else {
fmt.Println()
}
if tok == tkEOI {
return
}
}
}
 
func check(err error) {
if err != nil {
log.Fatal(err)
}
}
 
func main() {
if len(os.Args) < 2 {
fmt.Println("Filename required")
return
}
f, err := os.Open(os.Args[1])
check(err)
defer f.Close()
scanner = bufio.NewScanner(f)
initLex()
process()
}</syntaxhighlight>
 
{{out}}
Test Case 3:
<pre>
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input
</pre>
 
=={{header|Haskell}}==
Tested with GHC 8.0.2
<syntaxhighlight lang="haskell">import Control.Applicative hiding (many, some)
import Control.Monad.State.Lazy
import Control.Monad.Trans.Maybe (MaybeT, runMaybeT)
import Data.Char (isAsciiLower, isAsciiUpper, isDigit, ord)
import Data.Foldable (asum)
import Data.Functor (($>))
import Data.Text (Text)
import qualified Data.Text as T
import Prelude hiding (lex)
import System.Environment (getArgs)
import System.IO
import Text.Printf
 
 
-- Tokens --------------------------------------------------------------------------------------------------------------
data Val = IntVal Int -- value
| TextVal String Text -- name value
| SymbolVal String -- name
| Skip
| LexError String -- message
 
data Token = Token Val Int Int -- value line column
 
 
instance Show Val where
show (IntVal value) = printf "%-18s%d\n" "Integer" value
show (TextVal "String" value) = printf "%-18s%s\n" "String" (show $ T.unpack value) -- show escaped characters
show (TextVal name value) = printf "%-18s%s\n" name (T.unpack value)
show (SymbolVal name ) = printf "%s\n" name
show (LexError msg ) = printf "%-18s%s\n" "Error" msg
show Skip = printf ""
 
instance Show Token where
show (Token val line column) = printf "%2d %2d %s" line column (show val)
 
 
printTokens :: [Token] -> String
printTokens tokens =
"Location Token name Value\n" ++
"--------------------------------------\n" ++
(concatMap show tokens)
 
 
-- Tokenizers ----------------------------------------------------------------------------------------------------------
makeToken :: Lexer Val -> Lexer Token
makeToken lexer = do
(t, l, c) <- get
val <- lexer
 
case val of
Skip -> nextToken
 
LexError msg -> do
(_, l', c') <- get
 
let code = T.unpack $ T.take (c' - c + 1) t
let str = printf "%s\n%s(%d, %d): %s" msg (replicate 27 ' ') l' c' code
 
ch <- peek
unless (ch == '\0') $ advance 1
 
return $ Token (LexError str) l c
 
_ -> return $ Token val l c
 
 
simpleToken :: String -> String -> Lexer Val
simpleToken lexeme name = lit lexeme $> SymbolVal name
 
 
makeTokenizers :: [(String, String)] -> Lexer Val
makeTokenizers = asum . map (uncurry simpleToken)
 
 
keywords :: Lexer Val
keywords = makeTokenizers
[("if", "Keyword_if"), ("else", "Keyword_else"), ("while", "Keyword_while"),
("print", "Keyword_print"), ("putc", "Keyword_putc")]
 
 
operators :: Lexer Val
operators = makeTokenizers
[("*", "Op_multiply"), ("/", "Op_divide"), ("%", "Op_mod"), ("+", "Op_add"),
("-", "Op_subtract"), ("<=", "Op_lessequal"), ("<", "Op_less"), (">=", "Op_greaterequal"),
(">", "Op_greater"), ("==", "Op_equal"), ("!=", "Op_notequal"), ("!", "Op_not"),
("=", "Op_assign"), ("&&", "Op_and"), ("||", "Op_or")]
 
 
symbols :: Lexer Val
symbols = makeTokenizers
[("(", "LeftParen"), (")", "RightParen"),
("{", "LeftBrace"), ("}", "RightBrace"),
(";", "Semicolon"), (",", "Comma")]
 
 
isIdStart :: Char -> Bool
isIdStart ch = isAsciiLower ch || isAsciiUpper ch || ch == '_'
 
isIdEnd :: Char -> Bool
isIdEnd ch = isIdStart ch || isDigit ch
 
identifier :: Lexer Val
identifier = TextVal "Identifier" <$> lexeme
where lexeme = T.cons <$> (one isIdStart) <*> (many isIdEnd)
 
 
integer :: Lexer Val
integer = do
lexeme <- some isDigit
next_ch <- peek
 
if (isIdStart next_ch) then
return $ LexError "Invalid number. Starts like a number, but ends in non-numeric characters."
else do
let num = read (T.unpack lexeme) :: Int
return $ IntVal num
 
 
character :: Lexer Val
character = do
lit "'"
str <- lookahead 3
 
case str of
(ch : '\'' : _) -> advance 2 $> IntVal (ord ch)
"\\n'" -> advance 3 $> IntVal 10
"\\\\'" -> advance 3 $> IntVal 92
('\\' : ch : "\'") -> advance 2 $> LexError (printf "Unknown escape sequence \\%c" ch)
('\'' : _) -> return $ LexError "Empty character constant"
_ -> advance 2 $> LexError "Multi-character constant"
 
 
string :: Lexer Val
string = do
lit "\""
 
loop (T.pack "") =<< peek
where loop t ch = case ch of
'\\' -> do
next_ch <- next
 
case next_ch of
'n' -> loop (T.snoc t '\n') =<< next
'\\' -> loop (T.snoc t '\\') =<< next
_ -> return $ LexError $ printf "Unknown escape sequence \\%c" next_ch
 
'"' -> next $> TextVal "String" t
 
'\n' -> return $ LexError $ "End-of-line while scanning string literal." ++
" Closing string character not found before end-of-line."
 
'\0' -> return $ LexError $ "End-of-file while scanning string literal." ++
" Closing string character not found."
 
_ -> loop (T.snoc t ch) =<< next
 
 
skipComment :: Lexer Val
skipComment = do
lit "/*"
 
loop =<< peek
where loop ch = case ch of
'\0' -> return $ LexError "End-of-file in comment. Closing comment characters not found."
 
'*' -> do
next_ch <- next
 
case next_ch of
'/' -> next $> Skip
_ -> loop next_ch
 
_ -> loop =<< next
 
 
nextToken :: Lexer Token
nextToken = do
skipWhitespace
 
makeToken $ skipComment
<|> keywords
<|> identifier
<|> integer
<|> character
<|> string
<|> operators
<|> symbols
<|> simpleToken "\0" "End_of_input"
<|> (return $ LexError "Unrecognized character.")
 
 
main :: IO ()
main = do
args <- getArgs
(hin, hout) <- getIOHandles args
 
withHandles hin hout $ printTokens . (lex nextToken)
 
 
------------------------------------------------------------------------------------------------------------------------
-- Machinery
------------------------------------------------------------------------------------------------------------------------
 
-- File handling -------------------------------------------------------------------------------------------------------
getIOHandles :: [String] -> IO (Handle, Handle)
getIOHandles [] = return (stdin, stdout)
 
getIOHandles [infile] = do
inhandle <- openFile infile ReadMode
return (inhandle, stdout)
 
getIOHandles (infile : outfile : _) = do
inhandle <- openFile infile ReadMode
outhandle <- openFile outfile WriteMode
return (inhandle, outhandle)
 
 
withHandles :: Handle -> Handle -> (String -> String) -> IO ()
withHandles in_handle out_handle f = do
contents <- hGetContents in_handle
let contents' = contents ++ "\0" -- adding \0 simplifies treatment of EOF
 
hPutStr out_handle $ f contents'
 
unless (in_handle == stdin) $ hClose in_handle
unless (out_handle == stdout) $ hClose out_handle
 
 
-- Lexer ---------------------------------------------------------------------------------------------------------------
type LexerState = (Text, Int, Int) -- input line column
type Lexer = MaybeT (State LexerState)
 
 
lexerAdvance :: Int -> LexerState -> LexerState
lexerAdvance 0 ctx = ctx
 
lexerAdvance 1 (t, l, c)
| ch == '\n' = (rest, l + 1, 1 )
| otherwise = (rest, l, c + 1)
where
(ch, rest) = (T.head t, T.tail t)
 
lexerAdvance n ctx = lexerAdvance (n - 1) $ lexerAdvance 1 ctx
 
 
advance :: Int -> Lexer ()
advance n = modify $ lexerAdvance n
 
 
peek :: Lexer Char
peek = gets $ \(t, _, _) -> T.head t
 
 
lookahead :: Int -> Lexer String
lookahead n = gets $ \(t, _, _) -> T.unpack $ T.take n t
 
 
next :: Lexer Char
next = advance 1 >> peek
 
 
skipWhitespace :: Lexer ()
skipWhitespace = do
ch <- peek
when (ch `elem` " \n") (next >> skipWhitespace)
 
 
lit :: String -> Lexer ()
lit lexeme = do
(t, _, _) <- get
guard $ T.isPrefixOf (T.pack lexeme) t
advance $ length lexeme
 
 
one :: (Char -> Bool) -> Lexer Char
one f = do
ch <- peek
guard $ f ch
next
return ch
 
 
lexerMany :: (Char -> Bool) -> LexerState -> (Text, LexerState)
lexerMany f (t, l, c) = (lexeme, (t', l', c'))
where (lexeme, _) = T.span f t
(t', l', c') = lexerAdvance (T.length lexeme) (t, l, c)
 
 
many :: (Char -> Bool) -> Lexer Text
many f = state $ lexerMany f
 
 
some :: (Char -> Bool) -> Lexer Text
some f = T.cons <$> (one f) <*> (many f)
 
 
lex :: Lexer a -> String -> [a]
lex lexer str = loop lexer (T.pack str, 1, 1)
where loop lexer s
| T.null txt = [t]
| otherwise = t : loop lexer s'
 
where (Just t, s') = runState (runMaybeT lexer) s
(txt, _, _) = s'
</syntaxhighlight>
 
{{out|case=test case 3}}
<pre>
Location Token name Value
--------------------------------------
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input
</pre>
 
=={{header|Icon}}==
{{trans|ATS}}
{{works with|Icon|9.5.20i}}
 
This implementation was developed for Arizona Icon, but ought to work with the Unicon compiler, as well.
 
One interesting aspect is the use of co-expressions to handle "input with pushback". The main advantage of this approach is that it hides the pushback buffer from the user, without making the buffer a global variable.
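
In outline the pattern looks like this (a minimal sketch of the same setup used in <tt>main</tt> below; <tt>inputter</tt> is the procedure defined near the end of the listing):

<syntaxhighlight lang="icon">procedure sketch(inpf)
    local pushback_buffer, inp, pushback, ch
    pushback_buffer := []                            # the hidden buffer shared by the two co-expressions
    inp := create inputter(inpf, pushback_buffer)    # produces [char, line, column] triples
    pushback := create repeat push(pushback_buffer, \@&source)
    @pushback       # prime the pushback co-expression; the first activation transmits nothing
    ch := @inp      # read one triple
    ch @pushback    # push it back, so the next @inp yields it again
end</syntaxhighlight>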
 
Global variables are avoided except for some constants that require initialization.
 
<syntaxhighlight lang="icon">#
# The Rosetta Code lexical analyzer in Icon with co-expressions. Based
# upon the ATS implementation.
#
# Usage: lex [INPUTFILE [OUTPUTFILE]]
# If INPUTFILE or OUTPUTFILE is "-" or missing, then standard input
# or standard output is used, respectively.
#
 
$define EOF -1
 
$define TOKEN_ELSE 0
$define TOKEN_IF 1
$define TOKEN_PRINT 2
$define TOKEN_PUTC 3
$define TOKEN_WHILE 4
$define TOKEN_MULTIPLY 5
$define TOKEN_DIVIDE 6
$define TOKEN_MOD 7
$define TOKEN_ADD 8
$define TOKEN_SUBTRACT 9
$define TOKEN_NEGATE 10
$define TOKEN_LESS 11
$define TOKEN_LESSEQUAL 12
$define TOKEN_GREATER 13
$define TOKEN_GREATEREQUAL 14
$define TOKEN_EQUAL 15
$define TOKEN_NOTEQUAL 16
$define TOKEN_NOT 17
$define TOKEN_ASSIGN 18
$define TOKEN_AND 19
$define TOKEN_OR 20
$define TOKEN_LEFTPAREN 21
$define TOKEN_RIGHTPAREN 22
$define TOKEN_LEFTBRACE 23
$define TOKEN_RIGHTBRACE 24
$define TOKEN_SEMICOLON 25
$define TOKEN_COMMA 26
$define TOKEN_IDENTIFIER 27
$define TOKEN_INTEGER 28
$define TOKEN_STRING 29
$define TOKEN_END_OF_INPUT 30
 
global whitespace
global ident_start
global ident_continuation
 
procedure main(args)
local inpf, outf
local pushback_buffer, inp, pushback
 
initial {
whitespace := ' \t\v\f\r\n'
ident_start := '_' ++ &letters
ident_continuation := ident_start ++ &digits
}
 
inpf := &input
outf := &output
if 1 <= *args & args[1] ~== "-" then {
inpf := open(args[1], "rt") |
stop("cannot open ", args[1], " for input")
}
if 2 <= *args & args[2] ~== "-" then {
outf := open(args[2], "wt") |
stop("cannot open ", args[2], " for output")
}
 
pushback_buffer := []
inp := create inputter(inpf, pushback_buffer)
pushback := create repeat push(pushback_buffer, \@&source)
@pushback # The first invocation does nothing.
 
scan_text(outf, inp, pushback)
end
 
procedure scan_text(outf, inp, pushback)
local ch
 
while /ch | ch[1] ~=== EOF do {
skip_spaces_and_comments(inp, pushback)
ch := @inp
if ch[1] === EOF then {
print_token(outf, [TOKEN_END_OF_INPUT, "", ch[2], ch[3]])
} else {
ch @pushback
print_token(outf, get_next_token(inp, pushback))
}
}
end
 
procedure get_next_token(inp, pushback)
local ch, ch1
local ln, cn
 
skip_spaces_and_comments(inp, pushback)
ch := @inp
ln := ch[2] # line number
cn := ch[3] # column number
case ch[1] of {
"," : return [TOKEN_COMMA, ",", ln, cn]
";" : return [TOKEN_SEMICOLON, ";", ln, cn]
"(" : return [TOKEN_LEFTPAREN, "(", ln, cn]
")" : return [TOKEN_RIGHTPAREN, ")", ln, cn]
"{" : return [TOKEN_LEFTBRACE, "{", ln, cn]
"}" : return [TOKEN_RIGHTBRACE, "}", ln, cn]
"*" : return [TOKEN_MULTIPLY, "*", ln, cn]
"/" : return [TOKEN_DIVIDE, "/", ln, cn]
"%" : return [TOKEN_MOD, "%", ln, cn]
"+" : return [TOKEN_ADD, "+", ln, cn]
"-" : return [TOKEN_SUBTRACT, "-", ln, cn]
"<" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_LESSEQUAL, "<=", ln, cn]
} else {
ch1 @pushback
return [TOKEN_LESS, "<", ln, cn]
}
}
">" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_GREATEREQUAL, ">=", ln, cn]
} else {
ch1 @pushback
return [TOKEN_GREATER, ">", ln, cn]
}
}
"=" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_EQUAL, "==", ln, cn]
} else {
ch1 @pushback
return [TOKEN_ASSIGN, "=", ln, cn]
}
}
"!" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_NOTEQUAL, "!=", ln, cn]
} else {
ch1 @pushback
return [TOKEN_NOT, "!", ln, cn]
}
}
"&" : {
ch1 := @inp
if ch1[1] === "&" then {
return [TOKEN_AND, "&&", ln, cn]
} else {
unexpected_character(ln, cn, ch)
}
}
"|" : {
ch1 := @inp
if ch1[1] === "|" then {
return [TOKEN_OR, "||", ln, cn]
} else {
unexpected_character(ln, cn, ch)
}
}
"\"" : {
ch @pushback
return scan_string_literal(inp)
}
"'" : {
ch @pushback
return scan_character_literal(inp, pushback)
}
default : {
if any(&digits, ch[1]) then {
ch @pushback
return scan_integer_literal(inp, pushback)
} else if any(ident_start, ch[1]) then {
ch @pushback
return scan_identifier_or_reserved_word (inp, pushback)
} else {
unexpected_character(ln, cn, ch)
}
}
}
end
 
procedure scan_identifier_or_reserved_word(inp, pushback)
local ch
local s
local line_no, column_no
 
s := ""
ch := @inp
line_no := ch[2]
column_no := ch[3]
while EOF ~=== ch[1] & any(ident_continuation, ch[1]) do {
s ||:= ch[1]
ch := @inp
}
ch @pushback
return reserved_word_lookup (s, line_no, column_no)
end
 
procedure scan_integer_literal(inp, pushback)
local ch
local s
local line_no, column_no
 
s := ""
ch := @inp
line_no := ch[2]
column_no := ch[3]
while EOF ~=== ch[1] & any(ident_continuation, ch[1]) do {
s ||:= ch[1]
ch := @inp
}
ch @pushback
not upto(~&digits, s) | invalid_integer_literal(line_no, column_no, s)
return [TOKEN_INTEGER, s, line_no, column_no]
end
 
procedure scan_character_literal(inp, pushback)
local ch, ch1
local close_quote
local toktup
local line_no, column_no
 
ch := @inp # The opening quote.
close_quote := ch[1] # Same as the opening quote.
ch @pushback
 
line_no := ch[2]
column_no := ch[3]
 
toktup := scan_character_literal_without_checking_end(inp)
ch1 := @inp
if ch1[1] ~=== close_quote then {
repeat {
case ch1[1] of {
EOF : unterminated_character_literal(line_no, column_no)
close_quote : multicharacter_literal(line_no, column_no)
default : ch1 := @inp
}
}
}
return toktup
end
 
procedure scan_character_literal_without_checking_end(inp)
local ch, ch1, ch2
 
ch := @inp # The opening quote.
ch1 := @inp
EOF ~=== ch1[1] | unterminated_character_literal(ch[2], ch[3])
if ch1[1] == "\\" then {
ch2 := @inp
EOF ~=== ch2[1] | unterminated_character_literal(ch[2], ch[3])
case ch2[1] of {
"n" : return [TOKEN_INTEGER, string(ord("\n")), ch[2], ch[3]]
"\\" : return [TOKEN_INTEGER, string(ord("\\")), ch[2], ch[3]]
default : unsupported_escape(ch1[2], ch1[3], ch2)
}
} else {
return [TOKEN_INTEGER, string(ord(ch1[1])), ch[2], ch[3]]
}
end
 
procedure scan_string_literal(inp)
local ch, ch1, ch2
local line_no, column_no
local close_quote
local s
local retval
 
ch := @inp # The opening quote
close_quote := ch[1] # Same as the opening quote.
line_no := ch[2]
column_no := ch[3]
 
s := ch[1]
until \retval do {
ch1 := @inp
ch1[1] ~=== EOF |
unterminated_string_literal (line_no, column_no,
"end of input")
ch1[1] ~== "\n" |
unterminated_string_literal (line_no, column_no,
"end of line")
if ch1[1] == close_quote then {
retval := [TOKEN_STRING, s || close_quote, line_no, column_no]
} else if ch1[1] ~== "\\" then {
s ||:= ch1[1]
} else {
ch2 := @inp
EOF ~=== ch2[1] | unsupported_escape(line_no, column_no, ch2)
case ch2[1] of {
"n" : s ||:= "\\n"
"\\" : s ||:= "\\\\"
default : unsupported_escape(line_no, column_no, ch2)
}
}
}
return retval
end
 
procedure skip_spaces_and_comments(inp, pushback)
local ch, ch1
 
repeat {
ch := @inp
(EOF === ch[1]) & { ch @pushback; return }
if not any(whitespace, ch[1]) then {
(ch[1] == "/") | { ch @pushback; return }
(ch1 := @inp) | { ch @pushback; return }
(ch1[1] == "*") | { ch1 @pushback; ch @pushback; return }
scan_comment(inp, ch[2], ch[3])
}
}
end
 
procedure scan_comment(inp, line_no, column_no)
local ch, ch1
 
until (\ch)[1] == "*" & (\ch1)[1] == "/" do {
ch := @inp
(EOF === ch[1]) & unterminated_comment(line_no, column_no)
if ch[1] == "*" then {
ch1 := @inp
(EOF === ch1[1]) & unterminated_comment(line_no, column_no)
}
}
return
end
 
procedure reserved_word_lookup(s, line_no, column_no)
 
# Lookup is by an extremely simple perfect hash.
 
static reserved_words
static reserved_word_tokens
local hashval, token, toktup
 
initial {
reserved_words := ["if", "print", "else",
"", "putc", "",
"", "while", ""]
reserved_word_tokens :=
[TOKEN_IF, TOKEN_PRINT, TOKEN_ELSE,
TOKEN_IDENTIFIER, TOKEN_PUTC, TOKEN_IDENTIFIER,
TOKEN_IDENTIFIER, TOKEN_WHILE, TOKEN_IDENTIFIER]
}
 
if *s < 2 then {
toktup := [TOKEN_IDENTIFIER, s, line_no, column_no]
} else {
hashval := ((ord(s[1]) + ord(s[2])) % (*reserved_words)) + 1
token := reserved_word_tokens[hashval]
if token = TOKEN_IDENTIFIER | s ~== reserved_words[hashval] then {
toktup := [TOKEN_IDENTIFIER, s, line_no, column_no]
} else {
toktup := [token, s, line_no, column_no]
}
}
return toktup
end
 
procedure print_token(outf, toktup)
static token_names
local s_line, s_column
 
initial {
token_names := ["Keyword_else",
"Keyword_if",
"Keyword_print",
"Keyword_putc",
"Keyword_while",
"Op_multiply",
"Op_divide",
"Op_mod",
"Op_add",
"Op_subtract",
"Op_negate",
"Op_less",
"Op_lessequal",
"Op_greater",
"Op_greaterequal",
"Op_equal",
"Op_notequal",
"Op_not",
"Op_assign",
"Op_and",
"Op_or",
"LeftParen",
"RightParen",
"LeftBrace",
"RightBrace",
"Semicolon",
"Comma",
"Identifier",
"Integer",
"String",
"End_of_input"]
}
 
/outf := &output
s_line := string(toktup[3])
s_column := string(toktup[4])
writes(outf, right (s_line, max(5, *s_line)))
writes(outf, " ")
writes(outf, right (s_column, max(5, *s_column)))
writes(outf, " ")
writes(outf, token_names[toktup[1] + 1])
case toktup[1] of {
TOKEN_IDENTIFIER : writes(outf, " ", toktup[2])
TOKEN_INTEGER : writes(outf, " ", toktup[2])
TOKEN_STRING : writes(outf, " ", toktup[2])
}
write(outf)
return
end
 
procedure inputter(inpf, pushback_buffer)
local buffer
local line_no, column_no
local c
 
buffer := ""
line_no := 1
column_no := 1
 
repeat {
buffer? {
until *pushback_buffer = 0 & pos(0) do {
if *pushback_buffer ~= 0 then {
suspend pop(pushback_buffer)
} else {
c := move(1)
suspend [c, line_no, column_no]
if c == "\n" then {
line_no +:= 1
column_no := 1
} else {
column_no +:= 1
}
}
}
}
(buffer := reads(inpf, 2048)) |
suspend [EOF, line_no, column_no]
}
end
 
procedure unterminated_comment(line_no, column_no)
error("unterminated comment starting at ",
line_no, ":", column_no)
end
 
procedure unexpected_character(line_no, column_no, ch)
error("unexpected character '", ch[1], "' starting at ",
line_no, ":", column_no)
end
 
procedure unterminated_string_literal (line_no, column_no, cause)
error("unterminated string literal (", cause, ") starting at ",
line_no, ":", column_no)
end
 
procedure unsupported_escape (line_no, column_no, ch)
if ch[1] === EOF then {
error("unexpected \\ at end of input",
" starting at ", line_no, ":", column_no)
} else {
error("unsupported escape \\", ch[1],
" starting at ", line_no, ":", column_no)
}
end
 
procedure invalid_integer_literal(line_no, column_no, s)
error("invalid integer literal ", s,
" starting at ", line_no, ":", column_no)
end
 
procedure unterminated_character_literal(line_no, column_no)
error("unterminated character literal starting at ",
line_no, ":", column_no)
end
 
procedure multicharacter_literal(line_no, column_no)
error("unsupported multicharacter literal starting at ",
line_no, ":", column_no)
end
 
procedure error(args[])
write!([&errout] ||| args)
exit(1)
end
 
procedure max(x, y)
return (if x < y then y else x)
end</syntaxhighlight>
 
 
{{out}}
<pre>$ icont -s -u -o lex lex-in-Icon.icn && ./lex compiler-tests/testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|J}}==
Here, we first build a tokenizer state machine sufficient to recognize our mini-language. This tokenizer must not discard any characters, because we will be using cumulative character offsets to identify line numbers and column numbers.
 
Then, we refine this result: we generate those line and column numbers, discard whitespace and comments, and classify tokens based on their structure.
 
(Also, in this version, rather than building out a full state machine to recognize character literals, we treat character literals as a sequence of tokens which we must then refine. It might have been wiser to build character literals as single tokens.)
 
Implementation:
 
<syntaxhighlight lang="j">symbols=:256#0
ch=: {{1 0+x[symbols=: x (a.i.y)} symbols}}
'T0 token' =: 0 ch '%+-!(){};,<>=!|&'
'L0 letter' =: 1 ch '_',,u:65 97+/i.26
'D0 digit' =: 2 ch u:48+i.10
'S0 space' =: 3 ch ' ',LF
'C0 commen' =: 4 ch '/'
'C1 comment'=: 5 ch '*'
'q0 quote' =: 6 ch ''''
'Q0 dquote' =: 7 ch '"'
 
width=: 1+>./symbols
default=: ,:(1+i.width),every 2
states=:((1+i.width),every 1),width#default
extend=: {{
if.y>#states do.states=: y{.states,y#default
end.states
}}
pad=: {{if. 0=#y do.y=.#states end.y}}
function=: {{ NB. x: before, m: op, n: symbol, y: after
y[states=: (y,m) (<x,n)} extend 1+x>.y=.pad y
}}
{{for_op.y do.(op)=: op_index function end.0}};:'nop init start'
all=: {{y=.pad y
for_symbol.i.width do.
x symbol nop y
end.y
}}
any=: {{y=.pad y
for_symbol.i.width do.
x symbol start y
end.y
}}
 
NB. identifiers and keywords
L0 letter nop L0
L0 digit nop L0
 
NB. numbers
D0 digit nop D0
D0 letter nop D0
 
NB. white space
S0 space nop S0
 
NB. comments
C1=: C0 comment nop ''
C2=: C1 all ''
C2 all C2
C3=: C2 commen nop ''
C4=: C3 comment nop ''
 
NB. quoted characters
q1=: q0 any ''
 
NB. strings
Q1=: Q0 all ''
Q1 all Q1
Q2=: Q1 dquote nop ''
Q0 dquote nop Q2
 
tokenize=:{{
tok=. (0;states;symbols);:y
for_fix.cut'<= >= == != && ||'do.
M=.;:;fix
for_k.|.I.M E.tok do.
tok=.(fix,<'') (0 1+k)} tok
end.
end.tok-.a:
}}
 
(tknames=:;: {{)n
Op_multiply Op_divide Op_mod Op_add Op_subtract Op_less Op_lessequal
Op_greater Op_greaterequal Op_equal Op_notequal Op_not Op_and Op_or
Op_assign LeftParen RightParen Keyword_if LeftBrace Keyword_else
RightBrace Keyword_while Semicolon Keyword_print Comma Keyword_putc
}}-.LF)=: tkref=: tokenize '*/%+-<<=>>===!=!&&||=()if{else}while;print,putc'
NB. the reference tokens here were arranged to avoid whitespace tokens
NB. also, we reserve multiple token instances where a literal string
NB. appears in different syntactic productions. Here, we only use the initial
NB. instances -- the others will be used in the syntax analyzer which
NB. uses the same tkref and tknames,
 
shift=: |.!.0
numvals=: {{
ndx=. I.(0<#@>y)**/@> y e.L:0 '0123456789'
({{".y,'x'}}each ndx{y) ndx} y
}}
chrvals=: {{
q=. y=<,''''
s=. y=<,'\'
j=. I.(-.s)*(1&shift * _1&shift)q
k=. I.(y e.;:'\n')*(1 shift q)*(_2 shift q)*_1 shift s
jvals=. a.i.L:0 j{y NB. not escaped
kvals=. (k{s){<"0 a.i.LF,'\' NB. escaped
(,a:,jvals,:a:) (,_1 0 1+/j)} (,a:,a:,kvals,:a:) (,_2 _1 0 1+/k)} y
}}
 
validstring=: ((1<#)*('"'={.)*('"'={:)*('\'=])-:'\n'&E.(+._1&shift)@+.'\\'&E.) every
 
validid=: ((<,'\')~:_1&|.) * (e.&tkref) < (e.&(u:I.symbols=letter)@{. * */@(e.&(u:I.symbols e.letter,digit))@}.) every
 
lex=: {{
lineref=.I.y=LF
tokens=.(tokenize y),<,'_'
offsets=.0,}:#@;\tokens
lines=. lineref I.offsets
columns=. offsets-lines{0,lineref
keep=. -.({.@> tokens)e.u:I.space=symbols
names=. (<'End_of_input') _1} (tkref i.tokens) {(_3}.tknames),4#<'Error'
unknown=. names=<'Error'
values=. a: _1} unknown#inv numvals chrvals unknown#tokens
names=. (<'Integer') (I.(values~:a:)*tokens~:values)} names
names=. (<'String') (I.validstring tokens)} names
names=. (<'Identifier') (I.validid tokens)} names
names=. (<'End_of_input') _1} names
comments=. '*/'&-:@(_2&{.)@> tokens
whitespace=. (values=tokens) * e.&(' ',LF)@{.@> tokens
keep=. (tokens~:<,'''')*-.comments+.whitespace+.unknown*a:=values
keep&#each ((1+lines),.columns);<names,.values
}}</syntaxhighlight>
 
Test case 3:
 
<syntaxhighlight lang="j">
flex=: {{
'A B'=.y
'names values'=.|:":each B
(":A),.' ',.names,.' ',.values
}}@lex
 
testcase3=: {{)n
/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
*/
/* Print */ print /* Sub */ -
/* Putc */ putc /* Lss */ <
/* If */ if /* Gtr */ >
/* Else */ else /* Leq */ <=
/* While */ while /* Geq */ >=
/* Lbrace */ { /* Eq */ ==
/* Rbrace */ } /* Neq */ !=
/* Lparen */ ( /* And */ &&
/* Rparen */ ) /* Or */ ||
/* Uminus */ - /* Semi */ ;
/* Not */ ! /* Comma */ ,
/* Mul */ * /* Assign */ =
/* Div */ / /* Integer */ 42
/* Mod */ % /* String */ "String literal"
/* Add */ + /* Ident */ variable_name
/* character literal */ '\n'
/* character literal */ '\\'
/* character literal */ ' '
}}
 
flex testcase3
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 28 Identifier 10
21 28 Integer 92
22 27 Integer 32
23 1 End_of_input </syntaxhighlight>
 
Here, it seems expedient to retain a structured representation of the lexical result. As shown, it's straightforward to produce a "pure" textual result for a hypothetical alternative implementation of the syntax analyzer, but the structured representation will be easier to deal with.
 
=={{header|Java}}==
<langsyntaxhighlight lang="java">
// Translated from python source
 
if (text.equals("")) {
error(line, pos, String.format("identifer_or_integer unrecopgnizedunrecognized character: (%d) %c", (int)this.chr, this.chr));
}
if (Character.isDigit(text.charAt(0))) {
if (!is_number) {
error(line, pos, String.format("invaslidinvalid number: %s", text));
}
return new Token(TokenType.Integer, text, line, pos);
}
}
</syntaxhighlight>
 
=={{header|JavaScript}}==
{{incorrect|Javascript|Please show output. Code is identical to [[Compiler/syntax_analyzer]] task}}
<langsyntaxhighlight lang="javascript">
/*
Token: type, value, line, pos
l.printTokens()
})
</syntaxhighlight>
 
=={{header|Julia}}==
<langsyntaxhighlight lang="julia">struct Tokenized
startline::Int
startcol::Int
println(lpad(tok.startline, 3), lpad(tok.startcol, 5), lpad(tok.name, 18), " ", tok.value != nothing ? tok.value : "")
end
</syntaxhighlight>{{output}}<pre>
Line Col Name Value
5 16 Keyword_print
23 1 End_of_input
</pre>
=={{header|Kotlin}}==
{{trans|Java}}
<syntaxhighlight lang="kotlin">// Input: command line argument of file to process or console input. A two or
// three character console input of digits followed by a new line will be
// checked for an integer between zero and twenty-five to select a fixed test
// case to run. Any other console input will be parsed.
 
// Code based on the Java version found here:
// https://rosettacode.org/mw/index.php?title=Compiler/lexical_analyzer&action=edit&section=22
 
// Class to halt the parsing with an exception.
class ParsingFailed(message: String): Exception(message)

// Enumerate class of tokens supported by this scanner.
enum class TokenType {
Tk_End_of_input, Op_multiply, Op_divide, Op_mod, Op_add, Op_subtract,
Op_negate, Op_not, Op_less, Op_lessequal, Op_greater, Op_greaterequal,
Op_equal, Op_notequal, Op_assign, Op_and, Op_or, Kw_if,
Kw_else, Kw_while, Kw_print, Kw_putc, Sy_LeftParen, Sy_RightParen,
Sy_LeftBrace, Sy_RightBrace, Sy_Semicolon, Sy_Comma, Tk_Identifier,
Tk_Integer, Tk_String;
 
override fun toString() =
        listOf("End_of_input", "Op_multiply", "Op_divide", "Op_mod", "Op_add",
               "Op_subtract", "Op_negate", "Op_not", "Op_less", "Op_lessequal",
               "Op_greater", "Op_greaterequal", "Op_equal", "Op_notequal",
               "Op_assign", "Op_and", "Op_or", "Keyword_if", "Keyword_else",
               "Keyword_while", "Keyword_print", "Keyword_putc", "LeftParen",
               "RightParen", "LeftBrace", "RightBrace", "Semicolon", "Comma",
               "Identifier", "Integer", "String")[this.ordinal]
} // TokenType
 
// Data class of tokens returned by the scanner.
data class Token(val token: TokenType, val value: String, val line: Int,
                 val pos: Int) {

    // Overridden method to display the token.
    override fun toString() =
"%5d %5d %-15s %s".format(line, pos, this.token,
when (this.token) {
TokenType.Tk_Integer, TokenType.Tk_Identifier ->
" %s".format(this.value)
TokenType.Tk_String ->
this.value.toList().joinToString("", " \"", "\"") {
when (it) {
'\t' ->
"\\t"
'\n' ->
"\\n"
'\u000b' ->
"\\v"
'\u000c' ->
"\\f"
'\r' ->
"\\r"
'"' ->
"\\\""
'\\' ->
"\\"
in ' '..'~' ->
"$it"
else ->
"\\u%04x".format(it.code) } }
else ->
"" } )
} // Token
 
// Function to display an error message and halt the scanner.
fun error(line: Int, pos: Int, msg: String): Nothing =
throw ParsingFailed("(%d, %d) %s\n".format(line, pos, msg))
 
// Class to process the source into tokens with properties of the
// source string, the line number, the column position, the index
// within the source string, the current character being processed,
// and map of the keyword strings to the corresponding token type.
class Lexer(private val s: String) {
    private var line = 1
private var pos = 1
private var position = 0
private var chr =
if (s.isEmpty())
' '
else
s[0]
private val keywords = mapOf<String, TokenType>(
"if" to TokenType.Kw_if,
"else" to TokenType.Kw_else,
"print" to TokenType.Kw_print,
"putc" to TokenType.Kw_putc,
"while" to TokenType.Kw_while)
 
    // Method to retrieve the next character from the source. Use NUL ('\u0000') after
// the end of our source.
private fun getNextChar() =
if (++this.position >= this.s.length) {
this.pos++
this.chr = '\u0000'
this.chr
} else {
this.pos++
this.chr = this.s[this.position]
when (this.chr) {
'\n' -> {
this.line++
this.pos = 0
} // line
'\t' ->
while (this.pos%8 != 1)
this.pos++
} // when
this.chr
} // if
 
// Method to return the division token, skip the comment, or handle the
    // error.
    private fun div_or_comment(line: Int, pos: Int): Token =
if (getNextChar() != '*')
Token(TokenType.Op_divide, "", line, pos);
else {
getNextChar() // Skip comment start
outer@
while (true)
when (this.chr) {
'\u0000' ->
error(line, pos, "Lexer: EOF in comment");
'*' ->
if (getNextChar() == '/') {
getNextChar() // Skip comment end
break@outer
} // if
else ->
getNextChar()
} // when
getToken()
} // if
 
// Method to verify a character literal. Return the token or handle the
    // error.
    private fun char_lit(line: Int, pos: Int): Token {
var c = getNextChar() // skip opening quote
when (c) {
'\'' ->
error(line, pos, "Lexer: Empty character constant");
'\\' ->
c = when (getNextChar()) {
'n' ->
10.toChar()
'\\' ->
'\\'
'\'' ->
'\''
else ->
error(line, pos, "Lexer: Unknown escape sequence '\\%c'".
format(this.chr)) }
} // when
if (getNextChar() != '\'')
error(line, pos, "Lexer: Multi-character constant")
getNextChar() // Skip closing quote
return Token(TokenType.Tk_Integer, c.code.toString(), line, pos)
} // char_lit
 
// Method to check next character to see whether it belongs to the token
    // we might be in the middle of. Return the correct token or handle the
    // error.
private fun follow(expect: Char, ifyes: TokenType, ifno: TokenType,
line: Int, pos: Int): Token =
when {
getNextChar() == expect -> {
getNextChar()
Token(ifyes, "", line, pos)
} // matches
ifno == TokenType.Tk_End_of_input ->
error(line, pos,
"Lexer: %c expected: (%d) '%c'".format(expect,
this.chr.code, this.chr))
else ->
Token(ifno, "", line, pos)
} // when
 
// Method to verify a character string. Return the token or handle the
// error.
private fun string_lit(start: Char, line: Int, pos: Int): Token {
var result = ""
while (getNextChar() != start)
when (this.chr) {
'\u0000' ->
error(line, pos, "Lexer: EOF while scanning string literal")
'\n' ->
error(line, pos, "Lexer: EOL while scanning string literal")
'\\' ->
when (getNextChar()) {
'\\' ->
result += '\\'
'n' ->
result += '\n'
'"' ->
result += '"'
else ->
error(line, pos, "Lexer: Escape sequence unknown '\\%c'".
format(this.chr))
} // when
else ->
result += this.chr
} // when
getNextChar() // Toss closing quote
return Token(TokenType.Tk_String, result, line, pos)
} // string_lit
 
    // Method to retrieve an identifier or integer. Return the keyword
// token, if the string matches one. Return the integer token,
// if the string is all digits. Return the identifer token, if the
// string is valid. Otherwise, handle the error.
private fun identifier_or_integer(line: Int, pos: Int): Token {
var is_number = true
var text = ""
while (this.chr in listOf('_')+('0'..'9')+('a'..'z')+('A'..'Z')) {
text += this.chr
is_number = is_number && this.chr in '0'..'9'
getNextChar()
} // while
if (text.isEmpty())
error(line, pos, "Lexer: Unrecognized character: (%d) %c".
format(this.chr.code, this.chr))
return when {
text[0] in '0'..'9' ->
if (!is_number)
error(line, pos, "Lexer: Invalid number: %s".
format(text))
else {
val max = Int.MAX_VALUE.toString()
if (text.length > max.length || (text.length == max.length &&
max < text))
error(line, pos,
"Lexer: Number exceeds maximum value %s".
format(text))
Token(TokenType.Tk_Integer, text, line, pos)
} // if
this.keywords.containsKey(text) ->
Token(this.keywords[text]!!, "", line, pos)
else ->
Token(TokenType.Tk_Identifier, text, line, pos) }
} // identifier_or_integer
 
    // Method to skip whitespace (both C's and Unicode's) and retrieve the next
// token.
private fun getToken(): Token {
while (this.chr in listOf('\t', '\n', '\u000b', '\u000c', '\r', ' ') ||
this.chr.isWhitespace())
getNextChar()
val line = this.line
val pos = this.pos
return when (this.chr) {
'\u0000' ->
Token(TokenType.Tk_End_of_input, "", line, pos)
'/' ->
div_or_comment(line, pos)
'\'' ->
char_lit(line, pos)
'<' ->
follow('=', TokenType.Op_lessequal, TokenType.Op_less, line, pos)
'>' ->
follow('=', TokenType.Op_greaterequal, TokenType.Op_greater, line, pos)
'=' ->
follow('=', TokenType.Op_equal, TokenType.Op_assign, line, pos)
'!' ->
follow('=', TokenType.Op_notequal, TokenType.Op_not, line, pos)
'&' ->
follow('&', TokenType.Op_and, TokenType.Tk_End_of_input, line, pos)
'|' ->
follow('|', TokenType.Op_or, TokenType.Tk_End_of_input, line, pos)
'"' ->
string_lit(this.chr, line, pos)
'{' -> {
getNextChar()
Token(TokenType.Sy_LeftBrace, "", line, pos)
} // open brace
'}' -> {
getNextChar()
Token(TokenType.Sy_RightBrace, "", line, pos)
} // close brace
'(' -> {
getNextChar()
Token(TokenType.Sy_LeftParen, "", line, pos)
} // open paren
')' -> {
getNextChar()
Token(TokenType.Sy_RightParen, "", line, pos)
} // close paren
'+' -> {
getNextChar()
Token(TokenType.Op_add, "", line, pos)
} // plus
'-' -> {
getNextChar()
Token(TokenType.Op_subtract, "", line, pos)
} // dash
'*' -> {
getNextChar()
Token(TokenType.Op_multiply, "", line, pos)
} // asterisk
'%' -> {
getNextChar()
Token(TokenType.Op_mod, "", line, pos)
} // percent
';' -> {
getNextChar()
Token(TokenType.Sy_Semicolon, "", line, pos)
} // semicolon
',' -> {
getNextChar()
Token(TokenType.Sy_Comma, "", line, pos)
} // comma
else ->
identifier_or_integer(line, pos) }
} // getToken
 
// Method to parse and display tokens.
fun printTokens() {
do {
val t: Token = getToken()
println(t)
} while (t.token != TokenType.Tk_End_of_input)
} // printTokens
} // Lexer
 
// Function to test all good tests from the website and produce all of the
// error messages this program supports.
fun tests(number: Int) {

    // Function to generate test case 0 source: Hello World/Text.
    fun hello() {
        Lexer(
return -1
Hello world
*/
print("Hello, World!\n");
""").printTokens()
} // hello
 
// Function to generate test case 1 source: Phoenix Number.
fun phoenix() {
Lexer(
"""/*
Show Ident and Integers
*/
phoenix_number = 142857;
print(phoenix_number, "\n");""").printTokens()
} // phoenix
 
// Function to generate test case 2 source: All Symbols.
fun symbols() {
Lexer(
"""/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
*/
/* Print */ print /* Sub */ -
/* Putc */ putc /* Lss */ <
/* If */ if /* Gtr */ >
/* Else */ else /* Leq */ <=
/* While */ while /* Geq */ >=
/* Lbrace */ { /* Eq */ ==
/* Rbrace */ } /* Neq */ !=
/* Lparen */ ( /* And */ &&
/* Rparen */ ) /* Or */ ||
/* Uminus */ - /* Semi */ ;
/* Not */ ! /* Comma */ ,
/* Mul */ * /* Assign */ =
/* Div */ / /* Integer */ 42
/* Mod */ % /* String */ "String literal"
/* Add */ + /* Ident */ variable_name
/* character literal */ '\n'
/* character literal */ '\\'
/* character literal */ ' '""").printTokens()
} // symbols
 
// Function to generate test case 3 source: Test Case 4.
fun four() {
Lexer(
"""/*** test printing, embedded \n and comments with lots of '*' ***/
print(42);
print("\nHello World\nGood Bye\nok\n");
print("Print a slash n - \\n.\n");""").printTokens()
} // four
 
// Function to generate test case 4 source: Count.
fun count() {
Lexer(
"""count = 1;
while (count < 10) {
print("count is: ", count, "\n");
count = count + 1;
}""").printTokens()
} // count
 
// Function to generate test case 5 source: 100 Doors.
fun doors() {
Lexer(
"""/* 100 Doors */
i = 1;
while (i * i <= 100) {
print("door ", i * i, " is open\n");
i = i + 1;
}""").printTokens()
} // doors
 
// Function to generate test case 6 source: Negative Tests.
fun negative() {
Lexer(
"""a = (-1 * ((-1 * (5 * 15)) / 10));
print(a, "\n");
b = -a;
print(b, "\n");
print(-b, "\n");
print(-(1), "\n");""").printTokens()
} // negative
 
// Function to generate test case 7 source: Deep.
fun deep() {
Lexer(
"""print(---------------------------------+++5, "\n");
print(((((((((3 + 2) * ((((((2))))))))))))), "\n");
 
if (1) { if (1) { if (1) { if (1) { if (1) { print(15, "\n"); } } } } }""").printTokens()
} // deep
 
// Function to generate test case 8 source: Greatest Common Divisor.
fun gcd() {
Lexer(
"""/* Compute the gcd of 1071, 1029: 21 */
 
a = 1071;
b = 1029;
 
while (b != 0) {
new_a = b;
b = a % b;
a = new_a;
}
print(a);""").printTokens()
} // gcd
 
    // Function to generate test case 9 source: Factorial.
    fun factorial() {
        Lexer(
            """/* 12 factorial is 479001600 */
 
n = 12;
result = 1;
i = 1;
while (i <= n) {
    result = result * i;
    i = i + 1;
}
print(result);""").printTokens()
} // factorial
 
// Function to generate test case 10 source: Fibonacci Sequence.
    fun fibonacci() {
        Lexer(
            """/* fibonacci of 44 is 701408733 */

n = 44;
i = 1;
a = 0;
b = 1;
while (i < n) {
w = a + b;
a = b;
b = w;
i = i + 1;
}
print(w, "\n");""").printTokens()
} // fibonacci
 
// Function to generate test case 11 source: FizzBuzz.
fun fizzbuzz() {
Lexer(
"""/* FizzBuzz */
i = 1;
while (i <= 100) {
if (!(i % 15))
print("FizzBuzz");
else if (!(i % 3))
print("Fizz");
else if (!(i % 5))
print("Buzz");
else
print(i);
 
print("\n");
i = i + 1;
}""").printTokens()
} // fizzbuzz
 
// Function to generate test case 12 source: 99 Bottles of Beer.
    fun bottles() {
        Lexer(
            """/* 99 bottles */
bottles = 99;
while (bottles > 0) {
print(bottles, " bottles of beer on the wall\n");
print(bottles, " bottles of beer\n");
print("Take one down, pass it around\n");
bottles = bottles - 1;
print(bottles, " bottles of beer on the wall\n\n");
}""").printTokens()
} // bottles
 
// Function to generate test case 13 source: Primes.
fun primes() {
Lexer(
"""/*
Simple prime number generator
*/
count = 1;
n = 1;
limit = 100;
while (n < limit) {
k=3;
p=1;
n=n+2;
while ((k*k<=n) && (p)) {
p=n/k*k!=n;
k=k+2;
}
    if (p) {
        print(n, " is prime\n");
count = count + 1;
}
}
print("Total primes found: ", count, "\n");""").printTokens()
} // primes
 
// Function to generate test case 14 source: Ascii Mandelbrot.
    fun ascii() {
        Lexer(
            """{
/*
This is an integer ascii Mandelbrot generator
*/
    left_edge   = -420;
    right_edge  =  300;
    top_edge    =  300;
    bottom_edge = -300;
    x_step      =    7;
    y_step      =   15;
 
    max_iter    =  200;
 
    y0 = top_edge;
    while (y0 > bottom_edge) {
        x0 = left_edge;
while (x0 < right_edge) {
            y = 0;
            x = 0;
            the_char = ' ';
            i = 0;
            while (i < max_iter) {
                x_x = (x * x) / 200;
                y_y = (y * y) / 200;
if (x_x + y_y > 800 ) {
                    the_char = '0' + i;
                    if (i > 9) {
                        the_char = '@';
}
                    i = max_iter;
}
                y = x * y / 100 + y0;
                x = x_x - y_y + x0;
                i = i + 1;
putc(the_char);
x0 = x0 + x_step;
}
putc('\n');
case '\'': // single char literals
nextChar()y0 = y0 - y_step;
v = fmt.Sprintf("%d", curCh)
if curCh == '\'' {
errorMsg(eline, ecol, "Empty character constant")
}
if curCh == '\\' {
nextChar()
if curCh == 'n' {
v = "10"
} else if curCh == '\\' {
v = "92"
} else {
errorMsg(eline, ecol, "unknown escape sequence: "+string(curCh))
}
}
nextChar()
if curCh != '\'' {
errorMsg(eline, ecol, "multi-character constant")
}
nextChar()
tok = tkInteger
return
case '<':
nextChar()
tok = follow(eline, ecol, '=', tkLeq, tkLss)
return
case '>':
nextChar()
tok = follow(eline, ecol, '=', tkGeq, tkGtr)
return
case '!':
nextChar()
tok = follow(eline, ecol, '=', tkNeq, tkNot)
return
case '=':
nextChar()
tok = follow(eline, ecol, '=', tkEq, tkAssign)
return
case '&':
nextChar()
tok = follow(eline, ecol, '&', tkAnd, tkEOI)
return
case '|':
nextChar()
tok = follow(eline, ecol, '|', tkOr, tkEOI)
return
case '"': // string
v = string(curCh)
nextChar()
for curCh != '"' {
if curCh == '\n' {
errorMsg(eline, ecol, "EOL in string")
}
if curCh == etx {
errorMsg(eline, ecol, "EOF in string")
}
v += string(curCh)
nextChar()
}
v += string(curCh)
nextChar()
tok = tkString
return
default: // integers or identifiers
isNumber := isDigit(curCh)
v = ""
for isAlnum(curCh) || curCh == '_' {
if !isDigit(curCh) {
isNumber = false
}
v += string(curCh)
nextChar()
}
if len(v) == 0 {
errorMsg(eline, ecol, "unknown character: "+string(curCh))
}
if isDigit(v[0]) {
if !isNumber {
errorMsg(eline, ecol, "invalid number: "+string(curCh))
}
tok = tkInteger
return
}
index := lookup(v)
if index == -1 {
tok = tkIdent
} else {
tok = symtab[index].tok
}
return
}
}
""").printTokens()
} // ascii
 
    when (number) {
        0 ->
            hello()
        1 ->
            phoenix()
        2 ->
            symbols()
3 ->
four()
4 ->
count()
5 ->
doors()
6 ->
negative()
7 ->
deep()
8 ->
gcd()
9 ->
factorial()
10 ->
fibonacci()
11 ->
fizzbuzz()
12 ->
bottles()
13 ->
primes()
14 ->
ascii()
15 -> // Lexer: Empty character constant
Lexer("''").printTokens()
16 -> // Lexer: Unknown escape sequence
Lexer("'\\x").printTokens()
17 -> // Lexer: Multi-character constant
Lexer("' ").printTokens()
18 -> // Lexer: EOF in comment
Lexer("/*").printTokens()
19 -> // Lexer: EOL in string
Lexer("\"\n").printTokens()
20 -> // Lexer: EOF in string
Lexer("\"").printTokens()
21 -> // Lexer: Escape sequence unknown
Lexer("\"\\x").printTokens()
22 -> // Lexer: Unrecognized character
Lexer("~").printTokens()
23 -> // Lexer: invalid number
Lexer("9a9").printTokens()
24 -> // Lexer: Number exceeds maximum value
Lexer("2147483648\n9223372036854775808").printTokens()
25 -> // Lexer: Operator expected
Lexer("|.").printTokens()
else ->
println("Invalid test number %d!".format(number))
} // when
} // tests
 
// Main function to check our source and read its data before parsing it.
// With no source specified, run the test of all symbols.
fun main(args: Array<String>) {
try {
val s =
if (args.size > 0 && args[0].isNotEmpty()) // file on command line
java.util.Scanner(java.io.File(args[0]))
else // use the console
java.util.Scanner(System.`in`)
var source = ""
while (s.hasNext())
source += s.nextLine()+
if (s.hasNext())
"\n"
else
""
if (args.size > 0 && args[0].isNotEmpty()) // file on command line
Lexer(source).printTokens()
else {
val digits = source.filter { it in '0'..'9' }
when {
source.isEmpty() -> // nothing given
tests(2)
source.length in 1..2 && digits.length == source.length &&
digits.toInt() in 0..25 ->
tests(digits.toInt())
else ->
Lexer(source).printTokens()
} // when
} // if
} catch(e: Throwable) {
println(e.message)
System.exit(1)
} // try
} // main</syntaxhighlight>
{{out|case=test case 3: All Symbols}}
<b>
<pre>
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
22 29 End_of_input
</pre>
</b>
 
=={{header|Lua}}==
===Using LPeg library===
This version uses LPeg, a parsing expression grammar library developed by one of the authors of Lua.
The source is broken into several modules, in part to make it easier to present the "vanilla Lua" version afterwards.
Tested with Lua 5.3.5 and LPeg 1.0.2-1.
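For readers who have not used LPeg before, the library builds lexers from composable pattern objects: <tt>+</tt> is ordered choice, <tt>*</tt> is concatenation, <tt>^</tt> is repetition, and <tt>/</tt> post-processes a capture. The fragment below is an illustrative sketch only, not part of the solution; it captures a run of digits and converts it to a number:
<syntaxhighlight lang="lua">local lpeg = require 'lpeg'
local integer = lpeg.C(lpeg.R'09'^1) / tonumber  -- capture one or more digits, convert with tonumber
print(lpeg.match(integer, "142857"))             --> 142857</syntaxhighlight>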
 
The first module is simply a table defining the names of tokens which don't have an associated value.
<syntaxhighlight lang="lua">-- module token_name (in a file "token_name.lua")
local token_name = {
['*'] = 'Op_multiply',
['/'] = 'Op_divide',
['%'] = 'Op_mod',
['+'] = 'Op_add',
['-'] = 'Op_subtract',
['<'] = 'Op_less',
['<='] = 'Op_lessequal',
['>'] = 'Op_greater',
['>='] = 'Op_greaterequal',
['=='] = 'Op_equal',
['!='] = 'Op_notequal',
['!'] = 'Op_not',
['='] = 'Op_assign',
['&&'] = 'Op_and',
['||'] = 'Op_or',
['('] = 'LeftParen',
[')'] = 'RightParen',
['{'] = 'LeftBrace',
['}'] = 'RightBrace',
[';'] = 'Semicolon',
[','] = 'Comma',
['if'] = 'Keyword_if',
['else'] = 'Keyword_else',
['while'] = 'Keyword_while',
['print'] = 'Keyword_print',
['putc'] = 'Keyword_putc',
}
return token_name</syntaxhighlight>
 
This module exports a function <i>find_token</i>, which attempts to find the next valid token from a specified position in a source line.
<syntaxhighlight lang="lua">-- module lpeg_token_finder
local M = {} -- only items added to M will be public (via 'return M' at end)
local table, concat = table, table.concat
local error, tonumber = error, tonumber

local lpeg = require 'lpeg' -- see http://www.inf.puc-rio.br/~roberto/lpeg/
local token_name = require 'token_name'
_ENV = {}

local imports = 'P R S C Carg Cb Cc Cf Cg Cp Cs Ct Cmt V'
for w in imports:gmatch('%a+') do _ENV[w] = lpeg[w] end

------------------- Define patterns to match tokens -----------------------

alpha = R'az' + R'AZ' + P'_'
digit = R'09'
alnum = alpha + digit
space = S' \t\r\n'

function ptok(text) return {name=token_name[text]} end
op2c = C(P'<=' + P'>=' + P'==' + P'!=' + P'&&' + P'||') / ptok
op1c = C(S'*/%+-<>!=') / ptok
symbol = C(S'(){};,') / ptok

keyword_or_identifier = C(alpha * alnum^0) / function(text)
  local name = token_name[text]
  return name and {name=name} or {name='Identifier', value=text}
end

integer = C(digit^1) * -alpha / function(text)
  return {name='Integer', value=tonumber(text)}
end
 
Cline = Carg(1) -- call to 'match' sets the first extra argument to source line number
 
bad_escseq_err = Cmt(Cline, function (_,pos,line)
error{err='bad_escseq', line=line, column=pos-1}
end)
 
esc_subst = {['\\'] = '\\', ['n'] = '\n'}
escseq = P'\\' * C(S'\\n' + bad_escseq_err) / esc_subst
 
qchar = P"'" * ( C( P(1) - S"'\n\\" ) + escseq ) * P"'" / function (text)
return {name='Integer', value=text:byte()}
end
 
qstr = P'"' * ( C((P(1) - S'"\n\\')^1) + escseq )^0 * P'"' / function(...)
return {name='String', value=concat{...}}
end
 
Ctoken = symbol + op2c + op1c + keyword_or_identifier + integer + qstr + qchar
 
unfinished_comment_err = Cmt(Cline * Cb('SOC'), function (_, pos, line, socpos)
error{err='unfinished_comment', line=line, column=socpos}
end)
commentstart = Cg(Cp() * P'/*', 'SOC')
commentrest = (P(1) - P'*/')^0 * (P'*/' + unfinished_comment_err)
comment = commentstart * commentrest
morecomment = Cg(Cp(), 'SOC') * commentrest
 
ws = (space^1 + comment)^0
 
bad_token_err = Cmt(Cline, function (_, pos, line)
error{err='invalid_token', line=line, column=pos}
end)
tokenpat = ws * Cline * Cp() * (C(-1) + Ctoken + bad_token_err) * Cp() /
function (line, pos, token, nextpos)
if pos == nextpos then -- at end of line; no token
return nil
else
token.line, token.column = line, pos
return token, nextpos
end
end
 
closecomment_tokenpat = morecomment * tokenpat
 
function M.find_token(line, line_pos, line_number, in_comment)
pattern = in_comment and closecomment_tokenpat or tokenpat
return lpeg.match(pattern, line, line_pos, line_number)
end
return M</syntaxhighlight>
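As an illustration of the interface (a sketch only, not part of the solution): <i>find_token</i> returns the token table and the position just past it, so a caller can walk a single line by feeding the returned position back in, much as the <i>lexer</i> module below does:
<syntaxhighlight lang="lua">local finder = require 'lpeg_token_finder'

local line, pos, lineno = 'count = count + 1 ;', 1, 1
repeat
  local token, nextpos = finder.find_token(line, pos, lineno, nil)
  if token then
    print(token.line, token.column, token.name, token.value or '')
    pos = nextpos
  end
until token == nil</syntaxhighlight>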
 
The <i>lexer</i> module uses <i>finder.find_token</i> to produce an iterator over the tokens in a source.
<syntaxhighlight lang="lua">-- module lexer
local M = {} -- only items added to M will be publicly available (via 'return M' at end)
local string, io, coroutine, yield = string, io, coroutine, coroutine.yield
local error, pcall, type = error, pcall, type
 
local finder = require 'lpeg_token_finder'
_ENV = {}
 
-- produces a token iterator given a source line iterator
function M.tokenize_lineiter(lineiter)
local function fatal(err)
local msgtext = {
unfinished_comment = "EOF inside comment started",
invalid_token = "Invalid token",
bad_escseq = "Invalid escape sequence",
}
      local fmt = "LEX ERROR: %s at line %d, column %d"
      error(string.format(fmt, msgtext[err.err], err.line, err.column))
    end

    return coroutine.wrap(function()
local line_number = 0
local line_pos
local in_comment -- where unfinished comment started
for line in lineiter do
line_number = line_number + 1
line_pos = 1
local function scanline() -- yield current line's tokens
repeat
local token, pos =
finder.find_token(line, line_pos, line_number, in_comment)
if token then
line_pos = pos
in_comment = nil
yield(token)
end
until token == nil
end
 
if line then
        local ok, err = pcall(scanline)
        if ok then
          in_comment = nil
        elseif type(err) == 'table' and err.err=='unfinished_comment' then
if not(in_comment and err.column==1) then
in_comment = err
end
elseif type(err) == 'table' then
fatal(err)
else
error(err) -- some internal error
end
end
end
if in_comment then
fatal(in_comment)
else
yield{name='End_of_input', line=line_number+1, column=1}
end
return nil
end)
end
 
------------------- exports -----------------------------

lexer = M.tokenize_lineiter

function M.tokenize_file(filename)
  return lexer(io.lines(filename))
end

function M.tokenize_text(text)
  return lexer(text:gmatch('[^\n]+'))
end

-- M._INTERNALS = _ENV
return M
</syntaxhighlight>
 
This script uses <i>lexer.tokenize_text</i> to show the token sequence produced from a source text.
 
<syntaxhighlight lang="lua">lexer = require 'lexer'
format, gsub = string.format, string.gsub
 
function printf(fmt, ...) print(format(fmt, ...)) end
 
function stringrep(str)
local subst = {['\n'] = "\\n", ['\\'] = '\\\\'}
return format('"%s"', gsub(str, '[\n\\]', subst))
end
 
function display(text)
for t in lexer.tokenize_text(text) do
local value = (t.name=='String') and stringrep(t.value) or t.value or ''
printf("%4d %3d %-15s %s", t.line, t.column, t.name, value)
end
end
 
----------------------- test cases from Rosetta spec ------------------------
testing = true
if testing then
-- test case 1
display[[
/*
Hello world
*/
print("Hello, World!\n");]]
print()
 
-- test case 2
display[[
/*
Show Ident and Integers
*/
phoenix_number = 142857;
print(phoenix_number, "\n");]]
print()
-- etc.
end
</syntaxhighlight>
 
===Using only standard libraries===
This version replaces the <i>lpeg_token_finder</i> module of the LPeg version with this <i>basic_token_finder</i> module, altering the <i>require</i> expression near the top of the <i>lexer</i> module accordingly. Tested with Lua 5.3.5. (Note that <i>select</i> is a standard function as of Lua 5.2.)
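Concretely, the only change needed in the <i>lexer</i> module is the <i>require</i> line that selects the finder (assuming the module files keep the names used here):
<syntaxhighlight lang="lua">-- near the top of module lexer
local finder = require 'basic_token_finder'  -- was: require 'lpeg_token_finder'</syntaxhighlight>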
 
<syntaxhighlight lang="lua">-- module basic_token_finder
local M = {} -- only items added to M will be public (via 'return M' at end)
local table, string = table, string
local error, tonumber, select, assert = error, tonumber, select, assert
 
local token_name = require 'token_name'
_ENV = {}
 
function next_token(line, pos, line_num) -- match a token at line,pos
local function m(pat)
from, to, capture = line:find(pat, pos)
if from then
pos = to + 1
return capture
end
end
local function ptok(str)
return {name=token_name[str]}
end
local function op2c()
local text = m'^([<>=!]=)' or m'^(&&)' or m'^(||)'
if text then return ptok(text) end
end
 
local function op1c_or_symbol()
local char = m'^([%*/%%%+%-<>!=%(%){};,])'
if char then return ptok(char) end
end
local function keyword_or_identifier()
local text = m'^([%a_][%w_]*)'
if text then
local name = token_name[text]
return name and {name=name} or {name='Identifier', value=text}
end
end
local function integer()
local text = m'^(%d+)%f[^%w_]'
if text then return {name='Integer', value=tonumber(text)} end
end
local subst = {['\\\\'] = '\\', ['\\n'] = '\n'}
local function qchar()
local text = m"^'([^\\])'" or m"^'(\\[\\n])'"
if text then
local value = #text==1 and text:byte() or subst[text]:byte()
return {name='Integer', value=value}
end
end
local function qstr()
local text = m'^"([^"\n]*\\?)"'
if text then
local value = text:gsub('()(\\.?)', function(at, esc)
local replace = subst[esc]
if replace then
return replace
else
error{err='bad_escseq', line=line_num, column=pos+at-1}
end
end)
return {name='String', value=value}
end
end
local found = (op2c() or op1c_or_symbol() or
keyword_or_identifier() or integer() or qchar() or qstr())
if found then
return found, pos
end
end
 
function find_commentrest(line, pos, line_num, socpos)
local sfrom, sto = line:find('%*%/', pos)
if sfrom then
return socpos, sto
else
error{err='unfinished_comment', line=line_num, column=socpos}
end
end
 
function find_comment(line, pos, line_num)
local sfrom, sto = line:find('^%/%*', pos)
if sfrom then
local efrom, eto = find_commentrest(line, sto+1, line_num, sfrom)
return sfrom, eto
end
end
 
function find_morecomment(line, pos, line_num)
assert(pos==1)
return find_commentrest(line, pos, line_num, pos)
end
 
function find_whitespace(line, pos, line_num)
local spos = pos
repeat
local eto = select(2, line:find('^%s+', pos))
if not eto then
eto = select(2, find_comment(line, pos, line_num))
end
if eto then pos = eto + 1 end
until not eto
return spos, pos - 1
end
 
function M.find_token(line, pos, line_num, in_comment)
local spos = pos
if in_comment then
pos = 1 + select(2, find_morecomment(line, pos, line_num))
end
pos = 1 + select(2, find_whitespace(line, pos, line_num))
if pos > #line then
return nil
else
local token, nextpos = next_token(line, pos, line_num)
if token then
token.line, token.column = line_num, pos
return token, nextpos
else
error{err='invalid_token', line=line_num, column=pos}
end
end
end
 
-- M._ENV = _ENV
return M</syntaxhighlight>
 
{{out}}
Test Case 3:
<pre>
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input
</pre>
=={{header|M2000 Interpreter}}==
<syntaxhighlight lang="m2000 interpreter">
Module lexical_analyzer {
a$={/*
Line 4,591 ⟶ 11,247:
}
lexical_analyzer
</syntaxhighlight>
 
{{out}}
Line 4,630 ⟶ 11,286:
 
</pre >
 
=={{header|Mercury}}==
{{trans|ATS}}
{{works with|Mercury|20.06.1}}
 
 
<syntaxhighlight lang="mercury">% -*- mercury -*-
%
% Compile with maybe something like:
% mmc -O4 --intermod-opt -E --make --warn-non-tail-recursion self-and-mutual lex
%
 
:- module lex.
 
:- interface.
:- import_module io.
:- pred main(io::di, io::uo) is det.
 
:- implementation.
:- import_module char.
:- import_module exception.
:- import_module int.
:- import_module list.
:- import_module stack.
:- import_module string.
 
:- type token_t
---> token_ELSE
; token_IF
; token_PRINT
; token_PUTC
; token_WHILE
; token_MULTIPLY
; token_DIVIDE
; token_MOD
; token_ADD
; token_SUBTRACT
; token_NEGATE
; token_LESS
; token_LESSEQUAL
; token_GREATER
; token_GREATEREQUAL
; token_EQUAL
; token_NOTEQUAL
; token_NOT
; token_ASSIGN
; token_AND
; token_OR
; token_LEFTPAREN
; token_RIGHTPAREN
; token_LEFTBRACE
; token_RIGHTBRACE
; token_SEMICOLON
; token_COMMA
; token_IDENTIFIER
; token_INTEGER
; token_STRING
; token_END_OF_INPUT.
 
:- type ch_t % The type of a fetched character.
---> {int, % A character or `eof', stored as an int.
int, % The line number.
int}. % The column number.
 
:- type inp_t % The `inputter' type. Fetches one character.
---> inp_t(inpf :: text_input_stream,
line_no :: int,
column_no :: int,
pushback :: stack(ch_t)).
 
:- type toktup_t % The type of a scanned token with its argument.
---> {token_t, % The token kind.
string, % An argument. (May or may not be meaningful.)
int, % The starting line number.
int}. % The starting column number.
 
main(!IO) :-
command_line_arguments(Args, !IO),
(
if (Args = [])
then (InpF_filename = "-",
OutF_filename = "-",
main_program(InpF_filename, OutF_filename, !IO))
else if (Args = [F1])
then (InpF_filename = F1,
OutF_filename = "-",
main_program(InpF_filename, OutF_filename, !IO))
else if (Args = [F1, F2])
then (InpF_filename = F1,
OutF_filename = F2,
main_program(InpF_filename, OutF_filename, !IO))
else usage_error(!IO)
).
 
:- pred main_program(string::in, string::in, io::di, io::uo) is det.
main_program(InpF_filename, OutF_filename, !IO) :-
open_InpF(InpF, InpF_filename, !IO),
open_OutF(OutF, OutF_filename, !IO),
init(InpF, Inp0),
scan_text(OutF, Inp0, _, !IO).
 
:- pred open_InpF(text_input_stream::out, string::in,
io::di, io::uo) is det.
open_InpF(InpF, InpF_filename, !IO) :-
if (InpF_filename = "-")
then (InpF = io.stdin_stream)
else
(
open_input(InpF_filename, InpF_result, !IO),
(
if (InpF_result = ok(F))
then (InpF = F)
else throw("Error: cannot open " ++ InpF_filename ++
" for input")
)
).
 
:- pred open_OutF(text_output_stream::out, string::in,
io::di, io::uo) is det.
open_OutF(OutF, OutF_filename, !IO) :-
if (OutF_filename = "-")
then (OutF = io.stdout_stream)
else
(
open_output(OutF_filename, OutF_result, !IO),
(
if (OutF_result = ok(F))
then (OutF = F)
else throw("Error: cannot open " ++ OutF_filename ++
" for output")
)
).
 
:- pred usage_error(io::di, io::uo) is det.
usage_error(!IO) :-
progname("lex", ProgName, !IO),
(io.format("Usage: %s [INPUT_FILE [OUTPUT_FILE]]\n",
[s(ProgName)], !IO)),
(io.write_string("If INPUT_FILE is \"-\" or not present then standard input is used.\n",
!IO)),
(io.write_string("If OUTPUT_FILE is \"-\" or not present then standard output is used.\n",
!IO)),
set_exit_status(1, !IO).
 
:- pred scan_text(text_output_stream::in, inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_text(OutF, !Inp, !IO) :-
get_next_token(TokTup, !Inp, !IO),
print_token(TokTup, OutF, !IO),
{Tok, _, _, _} = TokTup,
(
if (Tok = token_END_OF_INPUT)
then true
else scan_text(OutF, !Inp, !IO)
).
 
:- pred print_token(toktup_t::in, text_output_stream::in,
io::di, io::uo) is det.
print_token(TokTup, OutF, !IO) :-
{Tok, Arg, Line_no, Column_no} = TokTup,
token_name(Tok) = TokName,
(io.format(OutF, "%5d %5d %s",
[i(Line_no), i(Column_no), s(TokName)],
!IO)),
(
if (Tok = token_IDENTIFIER)
then (io.format(OutF, " %s", [s(Arg)], !IO))
else if (Tok = token_INTEGER)
then (io.format(OutF, " %s", [s(Arg)], !IO))
else if (Tok = token_STRING)
then (io.format(OutF, " %s", [s(Arg)], !IO))
else true
),
(io.format(OutF, "\n", [], !IO)).
 
:- func token_name(token_t) = string is det.
:- pred token_name(token_t::in, string::out) is det.
token_name(Tok) = Str :- token_name(Tok, Str).
token_name(token_ELSE, "Keyword_else").
token_name(token_IF, "Keyword_if").
token_name(token_PRINT, "Keyword_print").
token_name(token_PUTC, "Keyword_putc").
token_name(token_WHILE, "Keyword_while").
token_name(token_MULTIPLY, "Op_multiply").
token_name(token_DIVIDE, "Op_divide").
token_name(token_MOD, "Op_mod").
token_name(token_ADD, "Op_add").
token_name(token_SUBTRACT, "Op_subtract").
token_name(token_NEGATE, "Op_negate").
token_name(token_LESS, "Op_less").
token_name(token_LESSEQUAL, "Op_lessequal").
token_name(token_GREATER, "Op_greater").
token_name(token_GREATEREQUAL, "Op_greaterequal").
token_name(token_EQUAL, "Op_equal").
token_name(token_NOTEQUAL, "Op_notequal").
token_name(token_NOT, "Op_not").
token_name(token_ASSIGN, "Op_assign").
token_name(token_AND, "Op_and").
token_name(token_OR, "Op_or").
token_name(token_LEFTPAREN, "LeftParen").
token_name(token_RIGHTPAREN, "RightParen").
token_name(token_LEFTBRACE, "LeftBrace").
token_name(token_RIGHTBRACE, "RightBrace").
token_name(token_SEMICOLON, "Semicolon").
token_name(token_COMMA, "Comma").
token_name(token_IDENTIFIER, "Identifier").
token_name(token_INTEGER, "Integer").
token_name(token_STRING, "String").
token_name(token_END_OF_INPUT, "End_of_input").
 
:- pred get_next_token(toktup_t::out, inp_t::in, inp_t::out,
io::di, io::uo) is det.
get_next_token(TokTup, !Inp, !IO) :-
skip_spaces_and_comments(!Inp, !IO),
get_ch(Ch, !Inp, !IO),
{IChar, Line_no, Column_no} = Ch,
LN = Line_no,
CN = Column_no,
(
if (IChar = eof)
then
(
TokTup = {token_END_OF_INPUT, "", LN, CN}
)
else
(
Char = det_from_int(IChar),
(
if (Char = (','))
then (TokTup = {token_COMMA, ",", LN, CN})
else if (Char = (';'))
then (TokTup = {token_SEMICOLON, ";", LN, CN})
else if (Char = ('('))
then (TokTup = {token_LEFTPAREN, "(", LN, CN})
else if (Char = (')'))
then (TokTup = {token_RIGHTPAREN, ")", LN, CN})
else if (Char = ('{'))
then (TokTup = {token_LEFTBRACE, "{", LN, CN})
else if (Char = ('}'))
then (TokTup = {token_RIGHTBRACE, "}", LN, CN})
else if (Char = ('*'))
then (TokTup = {token_MULTIPLY, "*", LN, CN})
else if (Char = ('/'))
then (TokTup = {token_DIVIDE, "/", LN, CN})
else if (Char = ('%'))
then (TokTup = {token_MOD, "%", LN, CN})
else if (Char = ('+'))
then (TokTup = {token_ADD, "+", LN, CN})
else if (Char = ('-'))
then (TokTup = {token_SUBTRACT, "-", LN, CN})
else if (Char = ('<'))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = to_int('='))
then
(
TokTup = {token_LESSEQUAL, "<=", LN, CN}
)
else
(
push_back(Ch1, !Inp),
TokTup = {token_LESS, "<", LN, CN}
)
)
)
else if (Char = ('>'))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = to_int('='))
then
(
TokTup = {token_GREATEREQUAL, ">=", LN, CN}
)
else
(
push_back(Ch1, !Inp),
TokTup = {token_GREATER, ">", LN, CN}
)
)
)
else if (Char = ('='))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = to_int('='))
then
(
TokTup = {token_EQUAL, "==", LN, CN}
)
else
(
push_back(Ch1, !Inp),
TokTup = {token_ASSIGN, "=", LN, CN}
)
)
)
else if (Char = ('!'))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = to_int('='))
then
(
TokTup = {token_NOTEQUAL, "!=", LN, CN}
)
else
(
push_back(Ch1, !Inp),
TokTup = {token_NOT, "!", LN, CN}
)
)
)
else if (Char = ('&'))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = to_int('&'))
then
(
TokTup = {token_AND, "&&", LN, CN}
)
else throw("Error: unexpected character '" ++
from_char(Char) ++ "' at " ++
from_int(LN) ++ ":" ++
from_int(CN))
)
)
else if (Char = ('|'))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = to_int('|'))
then
(
TokTup = {token_OR, "||", LN, CN}
)
else throw("Error: unexpected character '" ++
from_char(Char) ++ "' at " ++
from_int(LN) ++ ":" ++
from_int(CN))
)
)
else if (Char = ('"'))
then
(
push_back(Ch, !Inp),
scan_string_literal(TokTup, !Inp, !IO)
)
else if (Char = ('\''))
then
(
push_back(Ch, !Inp),
scan_character_literal(TokTup, !Inp, !IO)
)
else if (is_alpha(Char))
then
(
push_back(Ch, !Inp),
scan_identifier_or_reserved_word(
TokTup, !Inp, !IO)
)
else if (is_digit(Char))
then
(
push_back(Ch, !Inp),
scan_integer_literal(TokTup, !Inp, !IO)
)
else
(
throw("Error: unexpected character '" ++
from_char(Char) ++ "' at " ++
from_int(LN) ++ ":" ++
from_int(CN))
)
)
)
).
 
 
:- pred skip_spaces_and_comments(inp_t::in, inp_t::out,
io::di, io::uo) is det.
skip_spaces_and_comments(!Inp, !IO) :-
get_ch(Ch, !Inp, !IO),
Ch = {IChar, _, _},
(
if (IChar = eof)
then push_back(Ch, !Inp)
else
if (is_whitespace(det_from_int(IChar)))
then skip_spaces_and_comments(!Inp, !IO)
else if (IChar = to_int('/'))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, Line_no, Column_no},
(
if (IChar1 = to_int('*'))
then
(
scan_comment(Line_no, Column_no,
!Inp, !IO),
skip_spaces_and_comments(!Inp, !IO)
)
else
(
push_back(Ch1, !Inp),
push_back(Ch, !Inp)
)
)
)
else push_back(Ch, !Inp)
).
 
:- pred scan_comment(int::in, int::in, % line and column nos.
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_comment(Line_no, Column_no, !Inp, !IO) :-
get_ch(Ch, !Inp, !IO),
{IChar, _, _} = Ch,
(
if (IChar = eof)
then throw("Error: unterminated comment " ++
"starting at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else
(
det_from_int(IChar) = Char,
(
if (Char = ('*'))
then
(
get_ch(Ch1, !Inp, !IO),
{IChar1, _, _} = Ch1,
(
if (IChar1 = to_int('/'))
then true % End of comment has been reached.
else
(
push_back(Ch1, !Inp),
scan_comment(Line_no, Column_no, !Inp,
!IO)
)
)
)
else scan_comment(Line_no, Column_no, !Inp, !IO)
)
)
).
 
:- pred scan_character_literal(toktup_t::out,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_character_literal(TokTup, !Inp, !IO) :-
get_ch(Ch, !Inp, !IO),
Ch = {OpenQuote, Line_no, Column_no},
CloseQuote = OpenQuote,
scan_char_lit_contents(CodePoint, Line_no, Column_no,
!Inp, !IO),
check_char_lit_end(CloseQuote, Line_no, Column_no, !Inp, !IO),
Arg = from_int(CodePoint),
TokTup = {token_INTEGER, Arg, Line_no, Column_no}.
 
:- pred scan_char_lit_contents(int::out, int::in, int::in,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_char_lit_contents(CodePoint, Line_no, Column_no,
!Inp, !IO) :-
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, Line_no1, Column_no1},
(
if (IChar1 = eof)
then throw("Error: end of input in character literal " ++
"starting at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else if (IChar1 = to_int('\\'))
then
(
get_ch(Ch2, !Inp, !IO),
Ch2 = {IChar2, _, _},
(if (IChar2 = eof)
then throw("Error: end of input in character literal " ++
"starting at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else if (IChar2 = to_int('n'))
then (CodePoint = to_int('\n'))
else if (IChar2 = to_int('\\'))
then (CodePoint = to_int('\\'))
else throw("Error: unsupported escape \\" ++
from_char(det_from_int(IChar2)) ++
" at " ++ from_int(Line_no1) ++
":" ++ from_int(Column_no1))
)
)
else (CodePoint = IChar1)
).
 
:- pred check_char_lit_end(int::in, int::in, int::in,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
check_char_lit_end(CloseQuote, Line_no, Column_no, !Inp, !IO) :-
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = CloseQuote)
then true
else find_bad_char_lit_end(CloseQuote, Line_no, Column_no,
!Inp, !IO)
).
 
:- pred find_bad_char_lit_end(int::in, int::in, int::in,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
find_bad_char_lit_end(CloseQuote, Line_no, Column_no, !Inp, !IO) :-
get_ch(Ch2, !Inp, !IO),
Ch2 = {IChar2, _, _},
(
if (IChar2 = CloseQuote)
then throw("Error: unsupported multicharacter literal " ++
" at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else if (IChar2 = eof)
then throw("Error: end of input in character literal " ++
" at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else find_bad_char_lit_end(CloseQuote, Line_no, Column_no,
!Inp, !IO)
).
 
:- pred scan_string_literal(toktup_t::out,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_string_literal(TokTup, !Inp, !IO) :-
get_ch(Ch, !Inp, !IO),
Ch = {OpenQuote, Line_no, Column_no},
CloseQuote = OpenQuote,
scan_string_lit_contents("", Str, CloseQuote,
Line_no, Column_no,
!Inp, !IO),
Arg = from_char(det_from_int(OpenQuote)) ++
Str ++ from_char(det_from_int(CloseQuote)),
TokTup = {token_STRING, Arg, Line_no, Column_no}.
 
:- pred scan_string_lit_contents(string::in, string::out, int::in,
int::in, int::in,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_string_lit_contents(Str0, Str, CloseQuote, Line_no, Column_no,
!Inp, !IO) :-
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, Line_no1, Column_no1},
(
if (IChar1 = CloseQuote)
then (Str = Str0)
else if (IChar1 = eof)
then throw("Error: end of input in string literal " ++
"starting at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else if (IChar1 = to_int('\n'))
then throw("Error: end of line in string literal " ++
"starting at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else if (IChar1 = to_int('\\'))
then
(
get_ch(Ch2, !Inp, !IO),
Ch2 = {IChar2, _, _},
(
if (IChar2 = to_int('n'))
then
(
Str1 = Str0 ++ "\\n",
scan_string_lit_contents(Str1, Str, CloseQuote,
Line_no, Column_no,
!Inp, !IO)
)
else if (IChar2 = to_int('\\'))
then
(
Str1 = Str0 ++ "\\\\",
scan_string_lit_contents(Str1, Str, CloseQuote,
Line_no, Column_no,
!Inp, !IO)
)
else if (IChar2 = eof)
then throw("Error: end of input in string literal " ++
"starting at " ++ from_int(Line_no) ++
":" ++ from_int(Column_no))
else if (IChar2 = to_int('\n'))
then throw("Error: end of line in string literal " ++
"starting at " ++ from_int(Line_no) ++
":" ++ from_int(Column_no))
else throw("Error: unsupported escape \\" ++
from_char(det_from_int(IChar2)) ++
" at " ++ from_int(Line_no1) ++
":" ++ from_int(Column_no1))
)
)
else
(
Char1 = det_from_int(IChar1),
Str1 = Str0 ++ from_char(Char1),
scan_string_lit_contents(Str1, Str, CloseQuote,
Line_no, Column_no, !Inp, !IO)
)
).
 
:- pred scan_identifier_or_reserved_word(toktup_t::out,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_identifier_or_reserved_word(TokTup, !Inp, !IO) :-
scan_integer_or_word(Str, Line_no, Column_no, !Inp, !IO),
(
if (Str = "if")
then (TokTup = {token_IF, Str, Line_no, Column_no})
else if (Str = "else")
then (TokTup = {token_ELSE, Str, Line_no, Column_no})
else if (Str = "while")
then (TokTup = {token_WHILE, Str, Line_no, Column_no})
else if (Str = "print")
then (TokTup = {token_PRINT, Str, Line_no, Column_no})
else if (Str = "putc")
then (TokTup = {token_PUTC, Str, Line_no, Column_no})
else (TokTup = {token_IDENTIFIER, Str, Line_no, Column_no})
).
 
:- pred scan_integer_literal(toktup_t::out, inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_integer_literal(TokTup, !Inp, !IO) :-
scan_integer_or_word(Str, Line_no, Column_no, !Inp, !IO),
(
if (not is_all_digits(Str))
then throw("Error: not a valid integer literal: " ++ Str)
else (TokTup = {token_INTEGER, Str, Line_no, Column_no})
).
 
:- pred scan_integer_or_word(string::out, int::out, int::out,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_integer_or_word(Str, Line_no, Column_no, !Inp, !IO) :-
get_ch({IChar, Line_no, Column_no}, !Inp, !IO),
(
if (IChar = eof)
then throw("internal error")
else
(
Char = det_from_int(IChar),
(if (not is_alnum_or_underscore(Char))
then throw("internal error")
else scan_int_or_word(from_char(Char), Str, !Inp, !IO))
)
).
 
:- pred scan_int_or_word(string::in, string::out,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_int_or_word(Str0, Str, !Inp, !IO) :-
get_ch(CharTup, !Inp, !IO),
{IChar, _, _} = CharTup,
(
if (IChar = eof)
then
(
push_back(CharTup, !Inp),
Str = Str0
)
else
(
Char = det_from_int(IChar),
(
if (not is_alnum_or_underscore(Char))
then
(
push_back(CharTup, !Inp),
Str = Str0
)
else scan_int_or_word(Str0 ++ from_char(Char), Str,
!Inp, !IO)
)
)
).
 
:- pred init(text_input_stream::in, inp_t::out) is det.
init(Inpf, Inp) :-
Inp = inp_t(Inpf, 1, 1, init).
 
 
:- pred get_ch(ch_t::out, inp_t::in, inp_t::out,
io::di, io::uo) is det.
get_ch(Ch, Inp0, Inp, !IO) :-
if (pop(Ch1, Inp0^pushback, Pushback))
then
(
Ch = Ch1,
Inp = (Inp0^pushback := Pushback)
)
else
(
inp_t(Inpf, Line_no, Column_no, Pushback) = Inp0,
read_char_unboxed(Inpf, Result, Char, !IO),
(
if (Result = ok)
then
(
Ch = {to_int(Char), Line_no, Column_no},
Inp =
(if (Char = ('\n'))
then inp_t(Inpf, Line_no + 1, 1, Pushback)
else inp_t(Inpf, Line_no, Column_no + 1, Pushback))
)
else
(
Ch = {eof, Line_no, Column_no},
Inp = Inp0
)
)
).
 
:- pred push_back(ch_t::in, inp_t::in, inp_t::out) is det.
push_back(Ch, Inp0, Inp) :-
Inp = (Inp0^pushback := push(Inp0^pushback, Ch)).
 
:- func eof = int is det.
eof = -1.</syntaxhighlight>
 
{{out}}
<pre>$ mmc -O6 --intermod-opt -E --make --warn-non-tail-recursion self-and-mutual lex && ./lex compiler-tests/testcase3.t
Making Mercury/int3s/lex.int3
Making Mercury/ints/lex.int
Making Mercury/opts/lex.opt
Making Mercury/cs/lex.c
Making Mercury/os/lex.o
Making lex
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
 
=={{header|Nim}}==
Tested with Nim v0.19.4. Both examples are tested against all programs in [[Compiler/Sample programs]].
===Using string with regular expressions===
<langsyntaxhighlight lang="nim">
import re, strformat, strutils
 
Line 4,826 ⟶ 12,263:
 
echo input.tokenize.output
</syntaxhighlight>
===Using stream with lexer library===
<syntaxhighlight lang="nim">
import lexbase, streams
from strutils import Whitespace
Line 5,138 ⟶ 12,575:
if hasError l:
echo &"({l.lineNumber},{l.getColNumber l.bufpos + 1}) {l.error}"
main()
</syntaxhighlight>
 
===Using nothing but system and strutils===
<langsyntaxhighlight lang="nim">import strutils
 
type
 
TokenKind* = enum
tokMult = "Op_multiply", tokDiv = "Op_divide", tokMod = "Op_mod",
Line 5,160 ⟶ 12,598:
tokString = "String"
tokEnd = "End_of_input"
 
Token* = object
ln*, col*: int
Line 5,168 ⟶ 12,607:
of tokString: stringVal*: string
else: discard
 
Lexer* = object
input: string
pos: int
ln, col: int
 
LexicalError* = object of CatchableError
ln*, col*: int
Line 5,243 ⟶ 12,684:
of '>':
next()
    if current() == '=':
      result = Token(kind: tokGreaterEq)
      next()
    else:
      result = Token(kind: tokGreater)
  of '=':
    next()
    if current() == '=':
      result = Token(kind: tokEq)
      next()
    else:
      result = Token(kind: tokAssign)
  of '!':
    next()
    if current() == '=':
      result = Token(kind: tokNotEq)
      next()
    else:
      result = Token(kind: tokNot)
  of '&':
    next()
    if current() == '&':
      result = Token(kind: tokAnd)
      next()
    else:
      lexer.error("'&&' expected")
  of '|':
    next()
    if current() == '|':
      result = Token(kind: tokOr)
      next()
    else:
      lexer.error("'||' expected")
of '(': result = Token(kind: tokLPar); next()
of ')': result = Token(kind: tokRPar); next()
Line 5,275 ⟶ 12,731:
"escape sequence")
result = Token(kind: tokChar, charVal: ch)
next()
of '0'..'9':
var number = ""
Line 5,342 ⟶ 12,799:
stdout.write('\n')
if token.kind == tokEnd:
break</syntaxhighlight>
 
=={{header|ObjectIcon}}==
{{trans|Icon}}
{{trans|ATS}}
 
 
There are very few changes from the ordinary Icon version: I/O is modified to use FileStreams; and the '''max''' procedure is removed, because there is an Object Icon builtin procedure.
 
 
<syntaxhighlight lang="objecticon"># -*- ObjectIcon -*-
#
# The Rosetta Code lexical analyzer in Object Icon. Based upon the ATS
# implementation.
#
# Usage: lex [INPUTFILE [OUTPUTFILE]]
# If INPUTFILE or OUTPUTFILE is "-" or missing, then standard input
# or standard output is used, respectively. *)
#
 
import io
 
$define EOF -1
 
$define TOKEN_ELSE 0
$define TOKEN_IF 1
$define TOKEN_PRINT 2
$define TOKEN_PUTC 3
$define TOKEN_WHILE 4
$define TOKEN_MULTIPLY 5
$define TOKEN_DIVIDE 6
$define TOKEN_MOD 7
$define TOKEN_ADD 8
$define TOKEN_SUBTRACT 9
$define TOKEN_NEGATE 10
$define TOKEN_LESS 11
$define TOKEN_LESSEQUAL 12
$define TOKEN_GREATER 13
$define TOKEN_GREATEREQUAL 14
$define TOKEN_EQUAL 15
$define TOKEN_NOTEQUAL 16
$define TOKEN_NOT 17
$define TOKEN_ASSIGN 18
$define TOKEN_AND 19
$define TOKEN_OR 20
$define TOKEN_LEFTPAREN 21
$define TOKEN_RIGHTPAREN 22
$define TOKEN_LEFTBRACE 23
$define TOKEN_RIGHTBRACE 24
$define TOKEN_SEMICOLON 25
$define TOKEN_COMMA 26
$define TOKEN_IDENTIFIER 27
$define TOKEN_INTEGER 28
$define TOKEN_STRING 29
$define TOKEN_END_OF_INPUT 30
 
global whitespace
global ident_start
global ident_continuation
 
procedure main(args)
local inpf, outf
local pushback_buffer, inp, pushback
 
initial {
whitespace := ' \t\v\f\r\n'
ident_start := '_' ++ &letters
ident_continuation := ident_start ++ &digits
}
 
inpf := FileStream.stdin
outf := FileStream.stdout
if 1 <= *args & args[1] ~== "-" then {
inpf := FileStream(args[1], FileOpt.RDONLY) | stop(&why)
}
if 2 <= *args & args[2] ~== "-" then {
outf := FileStream(args[2], ior(FileOpt.WRONLY,
FileOpt.TRUNC,
FileOpt.CREAT)) | stop(&why)
}
 
pushback_buffer := []
inp := create inputter(inpf, pushback_buffer)
pushback := create repeat push(pushback_buffer, \@&source)
@pushback # The first invocation does nothing.
 
scan_text(outf, inp, pushback)
end
 
procedure scan_text(outf, inp, pushback)
local ch
 
while /ch | ch[1] ~=== EOF do {
skip_spaces_and_comments(inp, pushback)
ch := @inp
if ch[1] === EOF then {
print_token(outf, [TOKEN_END_OF_INPUT, "", ch[2], ch[3]])
} else {
ch @pushback
print_token(outf, get_next_token(inp, pushback))
}
}
end
 
procedure get_next_token(inp, pushback)
local ch, ch1
local ln, cn
 
skip_spaces_and_comments(inp, pushback)
ch := @inp
ln := ch[2] # line number
cn := ch[3] # column number
case ch[1] of {
"," : return [TOKEN_COMMA, ",", ln, cn]
";" : return [TOKEN_SEMICOLON, ";", ln, cn]
"(" : return [TOKEN_LEFTPAREN, "(", ln, cn]
")" : return [TOKEN_RIGHTPAREN, ")", ln, cn]
"{" : return [TOKEN_LEFTBRACE, "{", ln, cn]
"}" : return [TOKEN_RIGHTBRACE, "}", ln, cn]
"*" : return [TOKEN_MULTIPLY, "*", ln, cn]
"/" : return [TOKEN_DIVIDE, "/", ln, cn]
"%" : return [TOKEN_MOD, "%", ln, cn]
"+" : return [TOKEN_ADD, "+", ln, cn]
"-" : return [TOKEN_SUBTRACT, "-", ln, cn]
"<" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_LESSEQUAL, "<=", ln, cn]
} else {
ch1 @pushback
return [TOKEN_LESS, "<", ln, cn]
}
}
">" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_GREATEREQUAL, ">=", ln, cn]
} else {
ch1 @pushback
return [TOKEN_GREATER, ">", ln, cn]
}
}
"=" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_EQUAL, "==", ln, cn]
} else {
ch1 @pushback
return [TOKEN_ASSIGN, "=", ln, cn]
}
}
"!" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_NOTEQUAL, "!=", ln, cn]
} else {
ch1 @pushback
return [TOKEN_NOT, "!", ln, cn]
}
}
"&" : {
ch1 := @inp
if ch1[1] === "&" then {
return [TOKEN_AND, "&&", ln, cn]
} else {
unexpected_character(ln, cn, ch)
}
}
"|" : {
ch1 := @inp
if ch1[1] === "|" then {
return [TOKEN_OR, "||", ln, cn]
} else {
unexpected_character(ln, cn, ch)
}
}
"\"" : {
ch @pushback
return scan_string_literal(inp)
}
"'" : {
ch @pushback
return scan_character_literal(inp, pushback)
}
default : {
if any(&digits, ch[1]) then {
ch @pushback
return scan_integer_literal(inp, pushback)
} else if any(ident_start, ch[1]) then {
ch @pushback
return scan_identifier_or_reserved_word (inp, pushback)
} else {
unexpected_character(ln, cn, ch)
}
}
}
end
 
procedure scan_identifier_or_reserved_word(inp, pushback)
local ch
local s
local line_no, column_no
 
s := ""
ch := @inp
line_no := ch[2]
column_no := ch[3]
while EOF ~=== ch[1] & any(ident_continuation, ch[1]) do {
s ||:= ch[1]
ch := @inp
}
ch @pushback
return reserved_word_lookup (s, line_no, column_no)
end
 
procedure scan_integer_literal(inp, pushback)
local ch
local s
local line_no, column_no
 
s := ""
ch := @inp
line_no := ch[2]
column_no := ch[3]
while EOF ~=== ch[1] & any(ident_continuation, ch[1]) do {
s ||:= ch[1]
ch := @inp
}
ch @pushback
not upto(~&digits, s) | invalid_integer_literal(line_no, column_no, s)
return [TOKEN_INTEGER, s, line_no, column_no]
end
 
procedure scan_character_literal(inp, pushback)
local ch, ch1
local close_quote
local toktup
local line_no, column_no
 
ch := @inp # The opening quote.
close_quote := ch[1] # Same as the opening quote.
ch @pushback
 
line_no := ch[2]
column_no := ch[3]
 
toktup := scan_character_literal_without_checking_end(inp)
ch1 := @inp
if ch1[1] ~=== close_quote then {
repeat {
case ch1[1] of {
EOF : unterminated_character_literal(line_no, column_no)
close_quote : multicharacter_literal(line_no, column_no)
default : ch1 := @inp
}
}
}
return toktup
end
 
procedure scan_character_literal_without_checking_end(inp)
local ch, ch1, ch2
 
ch := @inp # The opening quote.
ch1 := @inp
EOF ~=== ch1[1] | unterminated_character_literal(ch[2], ch[3])
if ch1[1] == "\\" then {
ch2 := @inp
EOF ~=== ch2[1] | unterminated_character_literal(ch[2], ch[3])
case ch2[1] of {
"n" : return [TOKEN_INTEGER, string(ord("\n")), ch[2], ch[3]]
"\\" : return [TOKEN_INTEGER, string(ord("\\")), ch[2], ch[3]]
default : unsupported_escape(ch1[2], ch1[3], ch2)
}
} else {
return [TOKEN_INTEGER, string(ord(ch1[1])), ch[2], ch[3]]
}
end
 
procedure scan_string_literal(inp)
local ch, ch1, ch2
local line_no, column_no
local close_quote
local s
local retval
 
ch := @inp # The opening quote
close_quote := ch[1] # Same as the opening quote.
line_no := ch[2]
column_no := ch[3]
 
s := ch[1]
until \retval do {
ch1 := @inp
ch1[1] ~=== EOF |
unterminated_string_literal (line_no, column_no,
"end of input")
ch1[1] ~== "\n" |
unterminated_string_literal (line_no, column_no,
"end of line")
if ch1[1] == close_quote then {
retval := [TOKEN_STRING, s || close_quote, line_no, column_no]
} else if ch1[1] ~== "\\" then {
s ||:= ch1[1]
} else {
ch2 := @inp
EOF ~=== ch2[1] | unsupported_escape(line_no, column_no, ch2)
case ch2[1] of {
"n" : s ||:= "\\n"
"\\" : s ||:= "\\\\"
default : unsupported_escape(line_no, column_no, ch2)
}
}
}
return retval
end
 
procedure skip_spaces_and_comments(inp, pushback)
local ch, ch1
 
repeat {
ch := @inp
(EOF === ch[1]) & { ch @pushback; return }
if not any(whitespace, ch[1]) then {
(ch[1] == "/") | { ch @pushback; return }
(ch1 := @inp) | { ch @pushback; return }
(ch1[1] == "*") | { ch1 @pushback; ch @pushback; return }
scan_comment(inp, ch[2], ch[3])
}
}
end
 
procedure scan_comment(inp, line_no, column_no)
local ch, ch1
 
until (\ch)[1] == "*" & (\ch1)[1] == "/" do {
ch := @inp
(EOF === ch[1]) & unterminated_comment(line_no, column_no)
if ch[1] == "*" then {
ch1 := @inp
(EOF === ch[1]) & unterminated_comment(line_no, column_no)
}
}
return
end
 
procedure reserved_word_lookup(s, line_no, column_no)
 
# Lookup is by an extremely simple perfect hash.
 
static reserved_words
static reserved_word_tokens
local hashval, token, toktup
 
initial {
reserved_words := ["if", "print", "else",
"", "putc", "",
"", "while", ""]
reserved_word_tokens :=
[TOKEN_IF, TOKEN_PRINT, TOKEN_ELSE,
TOKEN_IDENTIFIER, TOKEN_PUTC, TOKEN_IDENTIFIER,
TOKEN_IDENTIFIER, TOKEN_WHILE, TOKEN_IDENTIFIER]
}
 
if *s < 2 then {
toktup := [TOKEN_IDENTIFIER, s, line_no, column_no]
} else {
hashval := ((ord(s[1]) + ord(s[2])) % (*reserved_words)) + 1
token := reserved_word_tokens[hashval]
if token = TOKEN_IDENTIFIER | s ~== reserved_words[hashval] then {
toktup := [TOKEN_IDENTIFIER, s, line_no, column_no]
} else {
toktup := [token, s, line_no, column_no]
}
}
return toktup
end
 
procedure print_token(outf, toktup)
static token_names
local s_line, s_column
 
initial {
token_names := ["Keyword_else",
"Keyword_if",
"Keyword_print",
"Keyword_putc",
"Keyword_while",
"Op_multiply",
"Op_divide",
"Op_mod",
"Op_add",
"Op_subtract",
"Op_negate",
"Op_less",
"Op_lessequal",
"Op_greater",
"Op_greaterequal",
"Op_equal",
"Op_notequal",
"Op_not",
"Op_assign",
"Op_and",
"Op_or",
"LeftParen",
"RightParen",
"LeftBrace",
"RightBrace",
"Semicolon",
"Comma",
"Identifier",
"Integer",
"String",
"End_of_input"]
}
 
/outf := FileStream.stdout
s_line := string(toktup[3])
s_column := string(toktup[4])
writes(outf, right (s_line, max(5, *s_line)))
writes(outf, " ")
writes(outf, right (s_column, max(5, *s_column)))
writes(outf, " ")
writes(outf, token_names[toktup[1] + 1])
case toktup[1] of {
TOKEN_IDENTIFIER : writes(outf, " ", toktup[2])
TOKEN_INTEGER : writes(outf, " ", toktup[2])
TOKEN_STRING : writes(outf, " ", toktup[2])
}
write(outf)
return
end
 
procedure inputter(inpf, pushback_buffer)
local buffer
local line_no, column_no
local c
 
buffer := ""
line_no := 1
column_no := 1
 
repeat {
buffer? {
until *pushback_buffer = 0 & pos(0) do {
if *pushback_buffer ~= 0 then {
suspend pop(pushback_buffer)
} else {
c := move(1)
suspend [c, line_no, column_no]
if c == "\n" then {
line_no +:= 1
column_no := 1
} else {
column_no +:= 1
}
}
}
}
(buffer := reads(inpf, 2048)) |
suspend [EOF, line_no, column_no]
}
end
 
procedure unterminated_comment(line_no, column_no)
error("unterminated comment starting at ",
line_no, ":", column_no)
end
 
procedure unexpected_character(line_no, column_no, ch)
error("unexpected character '", ch[1], "' starting at ",
line_no, ":", column_no)
end
 
procedure unterminated_string_literal (line_no, column_no, cause)
error("unterminated string literal (", cause, ") starting at ",
line_no, ":", column_no)
end
 
procedure unsupported_escape (line_no, column_no, ch)
if ch[1] === EOF then {
error("unexpected \\ at end of input",
" starting at ", line_no, ":", column_no)
} else {
error("unsupported escape \\", ch[1],
" starting at ", line_no, ":", column_no)
}
end
 
procedure invalid_integer_literal(line_no, column_no, s)
error("invalid integer literal ", s,
" starting at ", line_no, ":", column_no)
end
 
procedure unterminated_character_literal(line_no, column_no)
error("unterminated character literal starting at ",
line_no, ":", column_no)
end
 
procedure multicharacter_literal(line_no, column_no)
error("unsupported multicharacter literal starting at ",
line_no, ":", column_no)
end
 
procedure error(args[])
write!([FileStream.stderr] ||| args)
exit(1)
end</syntaxhighlight>
 
 
{{out}}
<pre>$ oit -s -o lex lex-in-ObjectIcon.icn && ./lex compiler-tests/testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|OCaml}}==
{{works with|OCaml|4.12.1}}
{{trans|ATS}}
 
This is a close translation of the ATS. It may interest the reader to compare the two implementations.
 
(Much of the extra complication in the ATS comes from arrays being a linear type (whose "views" need tending), and from values of linear type having to be local to any function using them. This limitation could have been worked around, and arrays more similar to OCaml arrays could have been used, but at a cost in safety and efficiency.)
 
<syntaxhighlight lang="ocaml">(*------------------------------------------------------------------*)
(* The Rosetta Code lexical analyzer, in OCaml. Based on the ATS. *)
 
(* When you compare this code to the ATS code, please keep in mind
that, although ATS has an ML-like syntax:
 
* The type system is not the same at all.
 
* Most ATS functions are not closures. Those that are will have
special notations such as "<cloref1>" associated with them. *)
 
(*------------------------------------------------------------------*)
(* The following functions are compatible with ASCII. *)
 
let is_digit ichar =
48 <= ichar && ichar <= 57
 
let is_lower ichar =
97 <= ichar && ichar <= 122
 
let is_upper ichar =
65 <= ichar && ichar <= 90
 
let is_alpha ichar =
is_lower ichar || is_upper ichar
 
let is_alnum ichar =
is_digit ichar || is_alpha ichar
 
let is_ident_start ichar =
is_alpha ichar || ichar = 95
 
let is_ident_continuation ichar =
is_alnum ichar || ichar = 95
 
let is_space ichar =
ichar = 32 || (9 <= ichar && ichar <= 13)
 
(*------------------------------------------------------------------*)
 
let reverse_list_to_string lst =
List.rev lst |> List.to_seq |> String.of_seq
 
(*------------------------------------------------------------------*)
(* Character input more like that of C. There are various advantages
and disadvantages to this method, but key points in its favor are:
(a) it is how character input is done in the original ATS code, (b)
Unicode code points are 21-bit positive integers. *)
 
let eof = (-1)
 
let input_ichar channel =
try
int_of_char (input_char channel)
with
| End_of_file -> eof
 
(*------------------------------------------------------------------*)
 
(* The type of an input character. *)
 
module Ch =
struct
type t =
{
ichar : int;
line_no : int;
column_no : int
}
end
 
(*------------------------------------------------------------------*)
(* Inputting with unlimited pushback, and with counting of lines and
columns. *)
 
module Inp =
struct
type t =
{
inpf : in_channel;
pushback : Ch.t list;
line_no : int;
column_no : int
}
 
let of_in_channel inpf =
{ inpf = inpf;
pushback = [];
line_no = 1;
column_no = 1
}
 
let get_ch inp =
match inp.pushback with
| ch :: tail ->
(ch, {inp with pushback = tail})
| [] ->
let ichar = input_ichar inp.inpf in
if ichar = int_of_char '\n' then
({ ichar = ichar;
line_no = inp.line_no;
column_no = inp.column_no },
{ inp with line_no = inp.line_no + 1;
column_no = 1 })
else
({ ichar = ichar;
line_no = inp.line_no;
column_no = inp.column_no },
{ inp with column_no = inp.column_no + 1 })
 
let push_back_ch ch inp =
{inp with pushback = ch :: inp.pushback}
end
 
(*------------------------------------------------------------------*)
(* Tokens, appearing in tuples with arguments, and with line and
column numbers. The tokens are integers, so they can be used as
array indices. *)
 
(* (token, argument, line_no, column_no) *)
type toktup_t = int * string * int * int
 
let token_ELSE = 0
let token_IF = 1
let token_PRINT = 2
let token_PUTC = 3
let token_WHILE = 4
let token_MULTIPLY = 5
let token_DIVIDE = 6
let token_MOD = 7
let token_ADD = 8
let token_SUBTRACT = 9
let token_NEGATE = 10
let token_LESS = 11
let token_LESSEQUAL = 12
let token_GREATER = 13
let token_GREATEREQUAL = 14
let token_EQUAL = 15
let token_NOTEQUAL = 16
let token_NOT = 17
let token_ASSIGN = 18
let token_AND = 19
let token_OR = 20
let token_LEFTPAREN = 21
let token_RIGHTPAREN = 22
let token_LEFTBRACE = 23
let token_RIGHTBRACE = 24
let token_SEMICOLON = 25
let token_COMMA = 26
let token_IDENTIFIER = 27
let token_INTEGER = 28
let token_STRING = 29
let token_END_OF_INPUT = 30
;;
 
(* A *very* simple perfect hash for the reserved words. (Yes, this is
overkill, except for demonstration of the principle.) *)
 
let reserved_words =
[| "if"; "print"; "else"; ""; "putc"; ""; ""; "while"; "" |]
let reserved_word_tokens =
[| token_IF; token_PRINT; token_ELSE; token_IDENTIFIER;
token_PUTC; token_IDENTIFIER; token_IDENTIFIER; token_WHILE;
token_IDENTIFIER |]
 
let reserved_word_lookup s line_no column_no =
if String.length s < 2 then
(token_IDENTIFIER, s, line_no, column_no)
else
let hashval = (int_of_char s.[0] + int_of_char s.[1]) mod 9 in
let token = reserved_word_tokens.(hashval) in
if token = token_IDENTIFIER || s <> reserved_words.(hashval) then
(token_IDENTIFIER, s, line_no, column_no)
else
(token, s, line_no, column_no)
 
(* Token to string lookup. *)
 
let token_names =
[| "Keyword_else";
"Keyword_if";
"Keyword_print";
"Keyword_putc";
"Keyword_while";
"Op_multiply";
"Op_divide";
"Op_mod";
"Op_add";
"Op_subtract";
"Op_negate";
"Op_less";
"Op_lessequal";
"Op_greater";
"Op_greaterequal";
"Op_equal";
"Op_notequal";
"Op_not";
"Op_assign";
"Op_and";
"Op_or";
"LeftParen";
"RightParen";
"LeftBrace";
"RightBrace";
"Semicolon";
"Comma";
"Identifier";
"Integer";
"String";
"End_of_input" |]
 
let token_name token =
token_names.(token)
 
(*------------------------------------------------------------------*)
 
exception Unterminated_comment of int * int
exception Unterminated_character_literal of int * int
exception Multicharacter_literal of int * int
exception End_of_input_in_string_literal of int * int
exception End_of_line_in_string_literal of int * int
exception Unsupported_escape of int * int * int
exception Invalid_integer_literal of int * int * string
exception Unexpected_character of int * int * char
 
(*------------------------------------------------------------------*)
(* Skipping past spaces and comments. (A comment in the target
language is, if you think about it, a kind of space.) *)
 
let scan_comment inp line_no column_no =
let rec loop inp =
let (ch, inp) = Inp.get_ch inp in
if ch.ichar = eof then
raise (Unterminated_comment (line_no, column_no))
else if ch.ichar = int_of_char '*' then
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = eof then
raise (Unterminated_comment (line_no, column_no))
else if ch1.ichar = int_of_char '/' then
inp
else
loop inp
else
loop inp
in
loop inp
 
let skip_spaces_and_comments inp =
let rec loop inp =
let (ch, inp) = Inp.get_ch inp in
if is_space ch.ichar then
loop inp
else if ch.ichar = int_of_char '/' then
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '*' then
scan_comment inp ch.line_no ch.column_no |> loop
else
let inp = Inp.push_back_ch ch1 inp in
let inp = Inp.push_back_ch ch inp in
inp
else
Inp.push_back_ch ch inp
in
loop inp
 
(*------------------------------------------------------------------*)
(* Integer literals, identifiers, and reserved words. *)
 
(* In ATS the predicate for simple scan was supplied by template
expansion, which (typically) produces faster code than passing a
function or closure (although passing either of those could have
been done). Here I pass the predicate as a function/closure. It is
worth contrasting the methods. *)
let rec simple_scan pred lst inp =
let (ch, inp) = Inp.get_ch inp in
if pred ch.ichar then
simple_scan pred (char_of_int ch.ichar :: lst) inp
else
(lst, Inp.push_back_ch ch inp)
 
(* Demonstration of one way to make a new closure in OCaml. (In ATS,
one might see things that look similar but are actually template
operations.) *)
let simple_scan_iic = simple_scan is_ident_continuation
 
let scan_integer_literal inp =
let (ch, inp) = Inp.get_ch inp in
let _ = assert (is_digit ch.ichar) in
let (lst, inp) = simple_scan_iic [char_of_int ch.ichar] inp in
let s = reverse_list_to_string lst in
if List.for_all (fun c -> is_digit (int_of_char c)) lst then
((token_INTEGER, s, ch.line_no, ch.column_no), inp)
else
raise (Invalid_integer_literal (ch.line_no, ch.column_no, s))
 
let scan_identifier_or_reserved_word inp =
let (ch, inp) = Inp.get_ch inp in
let _ = assert (is_ident_start ch.ichar) in
let (lst, inp) = simple_scan_iic [char_of_int ch.ichar] inp in
let s = reverse_list_to_string lst in
let toktup = reserved_word_lookup s ch.line_no ch.column_no in
(toktup, inp)
 
(*------------------------------------------------------------------*)
(* String literals. *)
 
let scan_string_literal inp =
let (ch, inp) = Inp.get_ch inp in
let _ = assert (ch.ichar = int_of_char '"') in
 
let rec scan lst inp =
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = eof then
raise (End_of_input_in_string_literal
(ch.line_no, ch.column_no))
else if ch1.ichar = int_of_char '\n' then
raise (End_of_line_in_string_literal
(ch.line_no, ch.column_no))
else if ch1.ichar = int_of_char '"' then
(lst, inp)
else if ch1.ichar <> int_of_char '\\' then
scan (char_of_int ch1.ichar :: lst) inp
else
let (ch2, inp) = Inp.get_ch inp in
if ch2.ichar = int_of_char 'n' then
scan ('n' :: '\\' :: lst) inp
else if ch2.ichar = int_of_char '\\' then
scan ('\\' :: '\\' :: lst) inp
else
raise (Unsupported_escape (ch1.line_no, ch1.column_no,
ch2.ichar))
in
let lst = '"' :: [] in
let (lst, inp) = scan lst inp in
let lst = '"' :: lst in
let s = reverse_list_to_string lst in
((token_STRING, s, ch.line_no, ch.column_no), inp)
 
(*------------------------------------------------------------------*)
(* Character literals. *)
 
let scan_character_literal_without_checking_end inp =
let (ch, inp) = Inp.get_ch inp in
let _ = assert (ch.ichar = int_of_char '\'') in
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = eof then
raise (Unterminated_character_literal
(ch.line_no, ch.column_no))
else if ch1.ichar = int_of_char '\\' then
let (ch2, inp) = Inp.get_ch inp in
if ch2.ichar = eof then
raise (Unterminated_character_literal
(ch.line_no, ch.column_no))
else if ch2.ichar = int_of_char 'n' then
let s = (int_of_char '\n' |> string_of_int) in
((token_INTEGER, s, ch.line_no, ch.column_no), inp)
else if ch2.ichar = int_of_char '\\' then
let s = (int_of_char '\\' |> string_of_int) in
((token_INTEGER, s, ch.line_no, ch.column_no), inp)
else
raise (Unsupported_escape
(ch1.line_no, ch1.column_no, ch2.ichar))
else
let s = string_of_int ch1.ichar in
((token_INTEGER, s, ch.line_no, ch.column_no), inp)
 
let scan_character_literal inp =
let (toktup, inp) =
scan_character_literal_without_checking_end inp in
let (_, _, line_no, column_no) = toktup in
 
let check_end inp =
let (ch, inp) = Inp.get_ch inp in
if ch.ichar = int_of_char '\'' then
inp
else
let rec loop_to_end (ch1 : Ch.t) inp =
if ch1.ichar = eof then
raise (Unterminated_character_literal (line_no, column_no))
else if ch1.ichar = int_of_char '\'' then
raise (Multicharacter_literal (line_no, column_no))
else
let (ch1, inp) = Inp.get_ch inp in
loop_to_end ch1 inp
in
loop_to_end ch inp
in
let inp = check_end inp in
(toktup, inp)
 
(*------------------------------------------------------------------*)
 
let get_next_token inp =
let inp = skip_spaces_and_comments inp in
let (ch, inp) = Inp.get_ch inp in
let ln = ch.line_no in
let cn = ch.column_no in
if ch.ichar = eof then
((token_END_OF_INPUT, "", ln, cn), inp)
else
match char_of_int ch.ichar with
| ',' -> ((token_COMMA, ",", ln, cn), inp)
| ';' -> ((token_SEMICOLON, ";", ln, cn), inp)
| '(' -> ((token_LEFTPAREN, "(", ln, cn), inp)
| ')' -> ((token_RIGHTPAREN, ")", ln, cn), inp)
| '{' -> ((token_LEFTBRACE, "{", ln, cn), inp)
| '}' -> ((token_RIGHTBRACE, "}", ln, cn), inp)
| '*' -> ((token_MULTIPLY, "*", ln, cn), inp)
| '/' -> ((token_DIVIDE, "/", ln, cn), inp)
| '%' -> ((token_MOD, "%", ln, cn), inp)
| '+' -> ((token_ADD, "+", ln, cn), inp)
| '-' -> ((token_SUBTRACT, "-", ln, cn), inp)
| '<' ->
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '=' then
((token_LESSEQUAL, "<=", ln, cn), inp)
else
let inp = Inp.push_back_ch ch1 inp in
((token_LESS, "<", ln, cn), inp)
| '>' ->
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '=' then
((token_GREATEREQUAL, ">=", ln, cn), inp)
else
let inp = Inp.push_back_ch ch1 inp in
((token_GREATER, ">", ln, cn), inp)
| '=' ->
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '=' then
((token_EQUAL, "==", ln, cn), inp)
else
let inp = Inp.push_back_ch ch1 inp in
((token_ASSIGN, "=", ln, cn), inp)
| '!' ->
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '=' then
((token_NOTEQUAL, "!=", ln, cn), inp)
else
let inp = Inp.push_back_ch ch1 inp in
((token_NOT, "!", ln, cn), inp)
| '&' ->
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '&' then
((token_AND, "&&", ln, cn), inp)
else
raise (Unexpected_character (ch.line_no, ch.column_no,
char_of_int ch.ichar))
| '|' ->
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '|' then
((token_OR, "||", ln, cn), inp)
else
raise (Unexpected_character (ch.line_no, ch.column_no,
char_of_int ch.ichar))
| '"' ->
let inp = Inp.push_back_ch ch inp in
scan_string_literal inp
| '\'' ->
let inp = Inp.push_back_ch ch inp in
scan_character_literal inp
| _ when is_digit ch.ichar ->
let inp = Inp.push_back_ch ch inp in
scan_integer_literal inp
| _ when is_ident_start ch.ichar ->
let inp = Inp.push_back_ch ch inp in
scan_identifier_or_reserved_word inp
| _ -> raise (Unexpected_character (ch.line_no, ch.column_no,
char_of_int ch.ichar))
 
let print_token outf toktup =
let (token, arg, line_no, column_no) = toktup in
let name = token_name token in
let (padding, str) =
match 0 with
| _ when token = token_IDENTIFIER -> (" ", arg)
| _ when token = token_INTEGER -> (" ", arg)
| _ when token = token_STRING -> (" ", arg)
| _ -> ("", "")
in
Printf.fprintf outf "%5d %5d %s%s%s\n"
line_no column_no name padding str
 
let scan_text outf inp =
let rec loop inp =
let (toktup, inp) = get_next_token inp in
begin
print_token outf toktup;
let (token, _, _, _) = toktup in
if token <> token_END_OF_INPUT then
loop inp
end
in
loop inp
 
(*------------------------------------------------------------------*)
 
let main () =
let inpf_filename =
if 2 <= Array.length Sys.argv then
Sys.argv.(1)
else
"-"
in
let outf_filename =
if 3 <= Array.length Sys.argv then
Sys.argv.(2)
else
"-"
in
let inpf =
if inpf_filename = "-" then
stdin
else
open_in inpf_filename
in
let outf =
if outf_filename = "-" then
stdout
else
open_out outf_filename
in
let inp = Inp.of_in_channel inpf in
scan_text outf inp
;;
 
main ()
 
(*------------------------------------------------------------------*)</syntaxhighlight>
 
{{out}}
<pre>$ ocamlopt -O2 lex.ml && ./a.out compiler-tests/testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|Ol}}==
==== Source ====
Note: for simplicity, we do not print the line and column position of each token.
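Should positions be wanted later, the usual fix is to make the character reader return each character together with the line and column at which it starts, much as the OCaml entry above does. A minimal sketch of such a reader follows (written in OCaml purely for illustration; the <tt>pos_char</tt> record and <tt>reader</tt> function are assumptions, not part of this Ol entry):

<syntaxhighlight lang="ocaml">
(* Illustrative only: a reader that pairs every character with the line
   and column at which it was read, so a scanner built on top of it can
   report positions in its tokens. *)
type pos_char = { ch : char option; line : int; col : int }

let reader ic =
  let line = ref 1 and col = ref 0 in
  fun () ->
    match input_char ic with
    | c ->
        incr col;
        let pc = { ch = Some c; line = !line; col = !col } in
        if c = '\n' then (incr line; col := 0);
        pc
    | exception End_of_file -> { ch = None; line = !line; col = !col + 1 }
</syntaxhighlight>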
 
<syntaxhighlight lang="scheme">
(import (owl parse))
 
(define (get-comment)
(get-either
(let-parses (
(_ (get-imm #\*))
(_ (get-imm #\/)))
#true)
(let-parses (
(_ get-byte)
(_ (get-comment)))
#true)))
 
(define get-whitespace
(get-any-of
(get-byte-if (lambda (x) (has? '(#\tab #\newline #\space #\return) x))) ; whitespace
(let-parses ( ; comment
(_ (get-imm #\/))
(_ (get-imm #\*))
(_ (get-comment)))
#true)))
 
(define get-operator
(let-parses (
(operator (get-any-of
(get-word "||" 'Op_or)
(get-word "&&" 'Op_and)
(get-word "!=" 'Op_notequal)
(get-word "==" 'Op_equal)
(get-word ">=" 'Op_greaterequal)
(get-word "<=" 'Op_lessequal)
 
(get-word "=" 'Op_assign)
(get-word "!" 'Op_nop)
(get-word ">" 'Op_greater)
(get-word "<" 'Op_less)
(get-word "-" 'Op_subtract)
(get-word "+" 'Op_add)
(get-word "%" 'Op_mod)
(get-word "/" 'Op_divide)
(get-word "*" 'Op_multiply))))
(cons 'operator operator)))
 
(define get-symbol
(let-parses (
(symbol (get-any-of
(get-word "(" 'LeftParen)
(get-word ")" 'RightParen)
(get-word "{" 'LeftBrace)
(get-word "}" 'RightBrace)
(get-word ";" 'Semicolon)
(get-word "," 'Comma))))
(cons 'symbol symbol)))
 
(define get-keyword
(let-parses (
(keyword (get-any-of
(get-word "if" 'Keyword_if)
(get-word "else" 'Keyword_else)
(get-word "while" 'Keyword_while)
(get-word "print" 'Keyword_print)
(get-word "putc" 'Keyword_putc))))
(cons 'keyword keyword)))
 
 
 
(define get-identifier
(let-parses (
(lead (get-byte-if (lambda (x) (or (<= #\a x #\z) (<= #\A x #\Z) (= x #\_)))))
(tail (get-greedy* (get-byte-if (lambda (x) (or (<= #\a x #\z) (<= #\A x #\Z) (= x #\_) (<= #\0 x #\9)))))))
(cons 'identifier (bytes->string (cons lead tail)))))
 
(define get-integer
(let-parses (
(main (get-greedy+ (get-byte-if (lambda (x) (<= #\0 x #\9))))) )
(cons 'integer (string->integer (bytes->string main)))))
 
(define get-character
(let-parses (
(_ (get-imm #\'))
(char (get-any-of
(get-word "\\n" #\newline)
(get-word "\\\\" #\\)
(get-byte-if (lambda (x) (not (or (eq? x #\') (eq? x #\newline)))))))
(_ (get-imm #\')) )
(cons 'character char)))
 
(define get-string
(let-parses (
(_ (get-imm #\")) ;"
(data (get-greedy* (get-any-of
(get-word "\\n" #\newline)
(get-word "\\\\" #\\) ;\"
(get-byte-if (lambda (x) (not (or (eq? x #\") (eq? x #\newline)))))))) ;", newline
(_ (get-imm #\")) ) ;"
(cons 'string (bytes->string data))))
 
(define get-token
(let-parses (
(_ (get-greedy* get-whitespace))
(token (get-any-of
get-symbol
get-keyword
get-identifier
get-operator
get-integer
get-character
get-string
)) )
token))
 
(define token-parser
(let-parses (
(tokens (get-greedy+ get-token))
(_ (get-greedy* get-whitespace)))
tokens))
 
 
(define (translate source)
(let ((stream (try-parse token-parser (str-iter source) #t)))
(for-each print (car stream))
(if (null? (cdr stream))
(print 'End_of_input))))
</syntaxhighlight>
 
==== Testing ====
 
Testing function:
<syntaxhighlight lang="scheme">
(define (translate source)
(let ((stream (try-parse token-parser (str-iter source) #t)))
(for-each print (car stream))
(if (null? (force (cdr stream)))
(print 'End_of_input))))
</syntaxhighlight>
 
====== Testcase 1 ======
 
<syntaxhighlight lang="scheme">
(translate "
/*
Hello world
*/
print(\"Hello, World!\\\\n\");
")</syntaxhighlight>
{{Out}}
<pre>
(keyword . Keyword_print)
(symbol . LeftParen)
(string . Hello, World!\n)
(symbol . RightParen)
(symbol . Semicolon)
End_of_input
</pre>
 
====== Testcase 2 ======
 
<syntaxhighlight lang="scheme">
(translate "
/*
Show Ident and Integers
*/
phoenix_number = 142857;
print(phoenix_number, \"\\\\n\");
")</syntaxhighlight>
{{Out}}
<pre>
(identifier . phoenix_number)
(operator . Op_assign)
(integer . 142857)
(symbol . Semicolon)
(keyword . Keyword_print)
(symbol . LeftParen)
(identifier . phoenix_number)
(symbol . Comma)
(string . \n)
(symbol . RightParen)
(symbol . Semicolon)
End_of_input
</pre>
 
====== Testcase 3 ======
 
<syntaxhighlight lang="scheme">
(translate "
/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
*/
/* Print */ print /* Sub */ -
/* Putc */ putc /* Lss */ <
/* If */ if /* Gtr */ >
/* Else */ else /* Leq */ <=
/* While */ while /* Geq */ >=
/* Lbrace */ { /* Eq */ ==
/* Rbrace */ } /* Neq */ !=
/* Lparen */ ( /* And */ &&
/* Rparen */ ) /* Or */ ||
/* Uminus */ - /* Semi */ ;
/* Not */ ! /* Comma */ ,
/* Mul */ * /* Assign */ =
/* Div */ / /* Integer */ 42
/* Mod */ % /* String */ \"String literal\"
/* Add */ + /* Ident */ variable_name
/* character literal */ '\\n'
/* character literal */ '\\\\'
/* character literal */ ' '
")</syntaxhighlight>
{{Out}}
<pre>
(keyword . Keyword_print)
(operator . Op_subtract)
(keyword . Keyword_putc)
(operator . Op_less)
(keyword . Keyword_if)
(operator . Op_greater)
(keyword . Keyword_else)
(operator . Op_lessequal)
(keyword . Keyword_while)
(operator . Op_greaterequal)
(symbol . LeftBrace)
(operator . Op_equal)
(symbol . RightBrace)
(operator . Op_notequal)
(symbol . LeftParen)
(operator . Op_and)
(symbol . RightParen)
(operator . Op_or)
(operator . Op_subtract)
(symbol . Semicolon)
(operator . Op_nop)
(symbol . Comma)
(operator . Op_multiply)
(operator . Op_assign)
(operator . Op_divide)
(integer . 42)
(operator . Op_mod)
(string . String literal)
(operator . Op_add)
(identifier . variable_name)
(character . 10)
(character . 92)
(character . 32)
End_of_input
</pre>
 
====== Testcase 4 ======
 
<syntaxhighlight lang="scheme">
(translate "
/*** test printing, embedded \\\\n and comments with lots of '*' ***/
print(42);
print(\"\\\\nHello World\\\\nGood Bye\\\\nok\\\\n\");
print(\"Print a slash n - \\\\\\\\n.\\\\n\");
")
</syntaxhighlight>
{{Out}}
<pre>
(keyword . Keyword_print)
(symbol . LeftParen)
(integer . 42)
(symbol . RightParen)
(symbol . Semicolon)
(keyword . Keyword_print)
(symbol . LeftParen)
(string . \nHello World\nGood Bye\nok\n)
(symbol . RightParen)
(symbol . Semicolon)
(keyword . Keyword_print)
(symbol . LeftParen)
(string . Print a slash n - \\n.\n)
(symbol . RightParen)
(symbol . Semicolon)
End_of_input
</pre>
 
=={{header|Perl}}==
 
<langsyntaxhighlight lang="perl">#!/usr/bin/env perl
 
use strict;
Line 5,485 ⟶ 14,342:
($line, $col)
}
}</syntaxhighlight>
 
{{out|case=test case 3}}
Line 5,528 ⟶ 14,385:
===Alternate Perl Solution===
Tested on perl v5.26.1
<syntaxhighlight lang="perl">#!/usr/bin/perl
 
use strict; # lex.pl - source to tokens
Line 5,564 ⟶ 14,421:
1 + $` =~ tr/\n//, 1 + length $` =~ s/.*\n//sr, $^R;
}
printf "%5d %7d %s\n", 1 + tr/\n//, 1, 'End_of_input';</langsyntaxhighlight>
 
=={{header|Perl 6}}==
This is more complicated than strictly necessary for this task. It is set up to be easily adapted to do syntax analysis.
 
(Note: there are several bogus comments added solely to help with syntax highlighting.)
 
{{works with|Rakudo|2016.08}}
 
<lang perl6>grammar tiny_C {
rule TOP { ^ <.whitespace>? <tokens> + % <.whitespace> <.whitespace> <eoi> }
 
rule whitespace { [ <comment> + % <ws> | <ws> ] }
 
token comment { '/*' ~ '*/' .*? }
 
token tokens {
[
| <operator> { make $/<operator>.ast }
| <keyword> { make $/<keyword>.ast }
| <symbol> { make $/<symbol>.ast }
| <identifier> { make $/<identifier>.ast }
| <integer> { make $/<integer>.ast }
| <char> { make $/<char>.ast }
| <string> { make $/<string>.ast }
| <error>
]
}
 
proto token operator {*}
token operator:sym<*> { '*' { make 'Op_multiply' } }
token operator:sym</> { '/'<!before '*'> { make 'Op_divide' } }
token operator:sym<%> { '%' { make 'Op_mod' } }
token operator:sym<+> { '+' { make 'Op_add' } }
token operator:sym<-> { '-' { make 'Op_subtract' } }
token operator:sym('<='){ '<=' { make 'Op_lessequal' } }
token operator:sym('<') { '<' { make 'Op_less' } }
token operator:sym('>='){ '>=' { make 'Op_greaterequal'} }
token operator:sym('>') { '>' { make 'Op_greater' } }
token operator:sym<==> { '==' { make 'Op_equal' } }
token operator:sym<!=> { '!=' { make 'Op_notequal' } }
token operator:sym<!> { '!' { make 'Op_not' } }
token operator:sym<=> { '=' { make 'Op_assign' } }
token operator:sym<&&> { '&&' { make 'Op_and' } }
token operator:sym<||> { '||' { make 'Op_or' } }
 
proto token keyword {*}
token keyword:sym<if> { 'if' { make 'Keyword_if' } }
token keyword:sym<else> { 'else' { make 'Keyword_else' } }
token keyword:sym<putc> { 'putc' { make 'Keyword_putc' } }
token keyword:sym<while> { 'while' { make 'Keyword_while' } }
token keyword:sym<print> { 'print' { make 'Keyword_print' } }
 
proto token symbol {*}
token symbol:sym<(> { '(' { make 'LeftParen' } }
token symbol:sym<)> { ')' { make 'RightParen' } }
token symbol:sym<{> { '{' { make 'LeftBrace' } }
token symbol:sym<}> { '}' { make 'RightBrace' } }
token symbol:sym<;> { ';' { make 'Semicolon' } }
token symbol:sym<,> { ',' { make 'Comma' } }
 
token identifier { <[_A..Za..z]><[_A..Za..z0..9]>* { make 'Identifier ' ~ $/ } }
token integer { <[0..9]>+ { make 'Integer ' ~ $/ } }
 
token char {
'\'' [<-[']> | '\n' | '\\\\'] '\''
{ make 'Char_Literal ' ~ $/.subst("\\n", "\n").substr(1, *-1).ord }
}
 
token string {
'"' <-["\n]>* '"' #'
{
make 'String ' ~ $/;
note 'Error: Unknown escape sequence.' and exit if (~$/ ~~ m:r/ <!after <[\\]>>[\\<-[n\\]>]<!before <[\\]>> /);
}
}
 
token eoi { $ { make 'End_of_input' } }
 
token error {
| '\'''\'' { note 'Error: Empty character constant.' and exit }
| '\'' <-[']> ** {2..*} '\'' { note 'Error: Multi-character constant.' and exit }
| '/*' <-[*]>* $ { note 'Error: End-of-file in comment.' and exit }
| '"' <-["]>* $ { note 'Error: End-of-file in string.' and exit }
| '"' <-["]>*? \n { note 'Error: End of line in string.' and exit } #'
}
}
 
sub parse_it ( $c_code ) {
my $l;
my @pos = gather for $c_code.lines>>.chars.kv -> $line, $v {
take [ $line + 1, $_ ] for 1 .. ($v+1); # v+1 for newline
$l = $line+2;
}
@pos.push: [ $l, 1 ]; # capture eoi
 
for flat $c_code<tokens>.list, $c_code<eoi> -> $m {
say join "\t", @pos[$m.from].fmt('%3d'), $m.ast;
}
}
 
my $tokenizer = tiny_C.parse(@*ARGS[0].IO.slurp);
parse_it( $tokenizer );</lang>
 
{{out|case=test case 3}}
<pre>
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Char_Literal 10
21 26 Char_Literal 92
22 26 Char_Literal 32
23 1 End_of_input
</pre>
 
=={{header|Phix}}==
This deviates from the task requirements in that it is written in a modular form, so that the output
of one stage can be passed directly to the next rather than being re-loaded from a human-readable
file. If required, demo\rosetta\Compiler\extra.e (below) contains some code that achieves the latter.
Code to print the human-readable forms is likewise kept separate from any re-usable parts.
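For contrast, here is a minimal sketch of the non-modular alternative, re-loading the human-readable token listing produced by the lexer (written in OCaml purely for illustration; the <tt>tok</tt> record and the function names are assumptions, not taken from the Phix sources):

<syntaxhighlight lang="ocaml">
(* Illustrative only: read back listing lines such as
     "    5     16  Keyword_print"
   into (line, column, name, optional value) records, which is what a
   free-standing parser stage would have to do. *)
type tok = { line : int; col : int; name : string; value : string option }

let parse_token_line s =
  match String.split_on_char ' ' s |> List.filter (fun f -> f <> "") with
  | line :: col :: name :: rest ->
      (* For String tokens the value may itself contain blanks, so glue
         the remaining fields back together. *)
      let value = if rest = [] then None else Some (String.concat " " rest) in
      { line = int_of_string line; col = int_of_string col; name; value }
  | _ -> failwith ("malformed token line: " ^ s)

let load_tokens ic =
  let rec loop acc =
    match input_line ic with
    | s -> loop (parse_token_line s :: acc)
    | exception End_of_file -> List.rev acc
  in
  loop []
</syntaxhighlight>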
<!--<syntaxhighlight lang="phix">(phixonline)-->
 
<span style="color: #000080;font-style:italic;">--
<lang Phix>--
-- demo\\rosetta\\Compiler\\core.e
-- ===============================
--
-- Standard declarations and routines used by lex.exw, parse.exw, cgen.exw, and interp.exw
-- (included in distribution as above, which contains some additional sanity checks)
--</span>
--
<span style="color: #008080;">with</span> <span style="color: #008080;">javascript_semantics</span>
--
<span style="color: #008080;">global</span> <span style="color: #008080;">constant</span> <span style="color: #000000;">EOF</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">STDIN</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">STDOUT</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">1</span>
global constant EOF = -1, STDIN = 0, STDOUT = 1
 
<span style="color: #008080;">global</span> <span style="color: #008080;">enum</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">=</span><span style="color: #000000;">0</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">UNARY</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">BINARY</span><span style="color: #0000FF;">=</span><span style="color: #000000;">2</span>
global enum type nary NONE=0, UNARY=1, BINARY=2 end type
<span style="color: #008080;">global</span> <span style="color: #008080;">type</span> <span style="color: #000000;">nary</span><span style="color: #0000FF;">(</span><span style="color: #004080;">integer</span> <span style="color: #000000;">n</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">return</span> <span style="color: #000000;">n</span><span style="color: #0000FF;">=</span><span style="color: #000000;">NONE</span> <span style="color: #008080;">or</span> <span style="color: #000000;">n</span><span style="color: #0000FF;">=</span><span style="color: #000000;">UNARY</span> <span style="color: #008080;">or</span> <span style="color: #000000;">n</span><span style="color: #0000FF;">=</span><span style="color: #000000;">BINARY</span> <span style="color: #008080;">end</span> <span style="color: #008080;">type</span>
 
global sequence tkNames = {} -- eg/ie {"Op_multiply","Op_divide",..}
<span style="color: #008080;">global</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">tkNames</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span> <span style="color: #000080;font-style:italic;">-- eg/ie {"Op_multiply","Op_divide",..}</span>
global sequence precedences = {}
<span style="color: #008080;">global</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">precedences</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span>
global sequence narys = {} -- NONE/UNARY/BINARY
<span style="color: #008080;">global</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">narys</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span> <span style="color: #000080;font-style:italic;">-- NONE/UNARY/BINARY</span>
global sequence operators = {} -- eg/ie {"*","/","+","-","<","<=",..}
<span style="color: #008080;">global</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">operators</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span> <span style="color: #000080;font-style:italic;">-- eg/ie {"*","/","+","-","&lt;","&lt;=",..}</span>
global sequence opcodes = {} -- idx to tkNames, matching operators
<span style="color: #008080;">global</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">opcodes</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span> <span style="color: #000080;font-style:italic;">-- idx to tkNames, matching operators</span>
 
global constant KEYWORDS = new_dict() -- eg/ie {"if"=>idx to tkNames}
<span style="color: #008080;">global</span> <span style="color: #008080;">constant</span> <span style="color: #000000;">KEYWORDS</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">new_dict</span><span style="color: #0000FF;">()</span> <span style="color: #000080;font-style:italic;">-- eg/ie {"if"=&gt;idx to tkNames}</span>
 
global enum OPERATOR=1, DIGIT, LETTER -- character classes
<span style="color: #008080;">global</span> <span style="color: #008080;">enum</span> <span style="color: #000000;">OPERATOR</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">DIGIT</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">LETTER</span> <span style="color: #000080;font-style:italic;">-- character classes</span>
 
global sequence charmap = repeat(0,255)
<span style="color: #008080;">global</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">charmap</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">repeat</span><span style="color: #0000FF;">(</span><span style="color: #000000;">0</span><span style="color: #0000FF;">,</span><span style="color: #000000;">255</span><span style="color: #0000FF;">)</span>
charmap['0'..'9'] = DIGIT
<span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #008000;">'0'</span><span style="color: #0000FF;">..</span><span style="color: #008000;">'9'</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">DIGIT</span>
charmap['A'..'Z'] = LETTER
<span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #008000;">'A'</span><span style="color: #0000FF;">..</span><span style="color: #008000;">'Z'</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">LETTER</span>
charmap['a'..'z'] = LETTER
<span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #008000;">'a'</span><span style="color: #0000FF;">..</span><span style="color: #008000;">'z'</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">LETTER</span>
charmap['_'] = LETTER
<span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #008000;">'_'</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">LETTER</span>
 
function tkName(string s, nary n = NONE, integer precedence = -1)
<span style="color: #008080;">function</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">nary</span> <span style="color: #000000;">n</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">integer</span> <span style="color: #000000;">precedence</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">)</span>
tkNames = append(tkNames,s)
<span style="color: #000000;">tkNames</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">tkNames</span><span style="color: #0000FF;">,</span><span style="color: #000000;">s</span><span style="color: #0000FF;">)</span>
narys = append(narys,n)
<span style="color: #000000;">narys</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">narys</span><span style="color: #0000FF;">,</span><span style="color: #000000;">n</span><span style="color: #0000FF;">)</span>
precedences = append(precedences,precedence)
<span style="color: #000000;">precedences</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">precedences</span><span style="color: #0000FF;">,</span><span style="color: #000000;">precedence</span><span style="color: #0000FF;">)</span>
return length(tkNames)
<span style="color: #008080;">return</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">tkNames</span><span style="color: #0000FF;">)</span>
end function
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
 
function tkOp(string s, string op, nary n, integer precedence)
<span style="color: #008080;">function</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">string</span> <span style="color: #000000;">op</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">nary</span> <span style="color: #000000;">n</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">integer</span> <span style="color: #000000;">precedence</span><span style="color: #0000FF;">)</span>
integer res = tkName(s, n, precedence)
<span style="color: #004080;">integer</span> <span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #000000;">s</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">n</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">precedence</span><span style="color: #0000FF;">)</span>
operators = append(operators,op)
<span style="color: #000000;">operators</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">operators</span><span style="color: #0000FF;">,</span><span style="color: #000000;">op</span><span style="color: #0000FF;">)</span>
opcodes = append(opcodes,res)
<span style="color: #000000;">opcodes</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">opcodes</span><span style="color: #0000FF;">,</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)</span>
for i=1 to length(op) do
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">op</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span>
charmap[op[i]] = OPERATOR
<span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #000000;">op</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">OPERATOR</span>
end for
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
return res
<span style="color: #008080;">return</span> <span style="color: #000000;">res</span>
end function
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
 
function tkKw(string s, string keyword)
<span style="color: #008080;">function</span> <span style="color: #000000;">tkKw</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">string</span> <span style="color: #000000;">keyword</span><span style="color: #0000FF;">)</span>
integer res = tkName(s)
<span style="color: #004080;">integer</span> <span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #000000;">s</span><span style="color: #0000FF;">)</span>
putd(keyword, res, KEYWORDS)
<span style="color: #7060A8;">putd</span><span style="color: #0000FF;">(</span><span style="color: #000000;">keyword</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">res</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">KEYWORDS</span><span style="color: #0000FF;">)</span>
return res
<span style="color: #008080;">return</span> <span style="color: #000000;">res</span>
end function
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
 
global constant
<span style="color: #008080;">global</span> <span style="color: #008080;">constant</span>
tk_EOI = tkName("End_of_input"), --1
<span style="color: #000000;">tk_EOI</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"End_of_input"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--1</span>
tk_mul = tkOp("Op_multiply", "*", BINARY,13), --2
<span style="color: #000000;">tk_mul</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_multiply"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"*"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">13</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--2</span>
tk_div = tkOp("Op_divide", "/", BINARY,13), --3
<span style="color: #000000;">tk_div</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_divide"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"/"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">13</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--3</span>
tk_mod = tkOp("Op_mod", "%", BINARY,13), --4
<span style="color: #000000;">tk_mod</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_mod"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"%"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">13</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--4</span>
tk_add = tkOp("Op_add", "+", BINARY,12), --5
<span style="color: #000000;">tk_add</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_add"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"+"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">12</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--5</span>
tk_sub = tkOp("Op_subtract", "-", BINARY,12), --6
<span style="color: #000000;">tk_sub</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_subtract"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"-"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">12</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--6</span>
tk_neg = tkName("Op_negate", UNARY, 14), --7
<span style="color: #000000;">tk_neg</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_negate"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">UNARY</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">14</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--7</span>
tk_not = tkOp("Op_not", "!", UNARY, 14), --8
<span style="color: #000000;">tk_not</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_not"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"!"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">UNARY</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">14</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--8</span>
tk_lt = tkOp("Op_less", "<", BINARY,10), --9
<span style="color: #000000;">tk_lt</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_less"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"&lt;"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">10</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--9</span>
tk_le = tkOp("Op_lessequal", "<=",BINARY,10), --10
<span style="color: #000000;">tk_le</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_lessequal"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"&lt;="</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">10</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--10</span>
tk_gt = tkOp("Op_greater", ">", BINARY,10), --11
<span style="color: #000000;">tk_gt</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_greater"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"&gt;"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">10</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--11</span>
tk_ge = tkOp("Op_greaterequal", ">=",BINARY,10), --12
<span style="color: #000000;">tk_ge</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_greaterequal"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"&gt;="</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">10</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--12</span>
tk_eq = tkOp("Op_equal", "==",BINARY, 9), --13
<span style="color: #000000;">tk_eq</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_equal"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"=="</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">9</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--13</span>
tk_ne = tkOp("Op_notequal", "!=",BINARY, 9), --14
<span style="color: #000000;">tk_ne</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_notequal"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"!="</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">9</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--14</span>
tk_assign = tkOp("Op_assign", "=", NONE, -1), --15
<span style="color: #000000;">tk_assign</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_assign"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"="</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--15</span>
tk_and = tkOp("Op_and", "&&",BINARY, 5), --16
<span style="color: #000000;">tk_and</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_and"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"&&"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">5</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--16</span>
tk_or = tkOp("Op_or", "||",BINARY, 4), --17
<span style="color: #000000;">tk_or</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_or"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"||"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">4</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--17</span>
tk_if = tkKw("Keyword_if", "if"), --18
<span style="color: #000000;">tk_if</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkKw</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Keyword_if"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"if"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--18</span>
tk_else = tkKw("Keyword_else", "else"), --19
<span style="color: #000000;">tk_else</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkKw</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Keyword_else"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"else"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--19</span>
tk_while = tkKw("Keyword_while","while"), --20
<span style="color: #000000;">tk_while</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkKw</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Keyword_while"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"while"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--20</span>
tk_print = tkKw("Keyword_print","print"), --21
<span style="color: #000000;">tk_print</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkKw</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Keyword_print"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"print"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--21</span>
tk_putc = tkKw("Keyword_putc", "putc"), --22
<span style="color: #000000;">tk_putc</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkKw</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Keyword_putc"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"putc"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--22</span>
tk_LeftParen = tkOp("LeftParen", "(", NONE, -1), --23
<span style="color: #000000;">tk_LeftParen</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"LeftParen"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"("</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--23</span>
tk_RightParen = tkOp("RightParen", ")", NONE, -1), --24
<span style="color: #000000;">tk_RightParen</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"RightParen"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">")"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--24</span>
tk_LeftBrace = tkOp("LeftBrace", "{", NONE, -1), --25
<span style="color: #000000;">tk_LeftBrace</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"LeftBrace"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"{"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--25</span>
tk_RightBrace = tkOp("RightBrace", "}", NONE, -1), --26
<span style="color: #000000;">tk_RightBrace</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"RightBrace"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"}"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--26</span>
tk_Semicolon = tkOp("Semicolon", ";", NONE, -1), --27
<span style="color: #000000;">tk_Semicolon</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Semicolon"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">";"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--27</span>
tk_Comma = tkOp("Comma", ",", NONE, -1), --28
<span style="color: #000000;">tk_Comma</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Comma"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">","</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--28</span>
tk_Identifier = tkName("Identifier"), --29
<span style="color: #000000;">tk_Identifier</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Identifier"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--29</span>
tk_Integer = tkName("Integer"), --30
<span style="color: #000000;">tk_Integer</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Integer"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--30</span>
tk_String = tkName("String"), --31
<span style="color: #000000;">tk_String</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"String"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--31</span>
tk_Sequence = tkName("Sequence"), --32
<span style="color: #000000;">tk_Sequence</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Sequence"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--32</span>
tk_Prints = tkName("tk_Prints"), --33
<span style="color: #000000;">tk_Prints</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"tk_Prints"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--33</span>
tk_Printi = tkName("tk_Printi") --34
<span style="color: #000000;">tk_Printi</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"tk_Printi"</span><span style="color: #0000FF;">)</span> <span style="color: #000080;font-style:italic;">--34</span>
 
global integer input_file = STDIN,
<span style="color: #008080;">global</span> <span style="color: #004080;">integer</span> <span style="color: #000000;">input_file</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">STDIN</span><span style="color: #0000FF;">,</span>
output_file = STDOUT
<span style="color: #000000;">output_file</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">STDOUT</span>
 
type strint(object o)
<span style="color: #008080;">type</span> <span style="color: #000000;">strint</span><span style="color: #0000FF;">(</span><span style="color: #004080;">object</span> <span style="color: #000000;">o</span><span style="color: #0000FF;">)</span>
return string(o) or integer(o)
<span style="color: #008080;">return</span> <span style="color: #004080;">string</span><span style="color: #0000FF;">(</span><span style="color: #000000;">o</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">or</span> <span style="color: #004080;">integer</span><span style="color: #0000FF;">(</span><span style="color: #000000;">o</span><span style="color: #0000FF;">)</span>
end type
<span style="color: #008080;">end</span> <span style="color: #008080;">type</span>
 
global strint tok_line, -- save of line/col at the start of
<span style="color: #008080;">global</span> <span style="color: #000000;">strint</span> <span style="color: #000000;">tok_line</span><span style="color: #0000FF;">,</span> <span style="color: #000080;font-style:italic;">-- save of line/col at the start of</span>
tok_col -- token/comment, for result/errors
<span style="color: #000000;">tok_col</span> <span style="color: #000080;font-style:italic;">-- token/comment, for result/errors</span>
 
global object oneline = ""
<span style="color: #008080;">global</span> <span style="color: #004080;">object</span> <span style="color: #000000;">oneline</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">""</span>
 
constant errfmt = "Line %s column %s:\n%s%s"
<span style="color: #008080;">constant</span> <span style="color: #000000;">errfmt</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">"Line %s column %s:\n%s%s"</span>
 
function errline()
<span style="color: #008080;">function</span> <span style="color: #000000;">errline</span><span style="color: #0000FF;">()</span>
oneline = substitute(trim(oneline,"\r\n"),"\t"," ")
<span style="color: #000000;">oneline</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">substitute</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">trim</span><span style="color: #0000FF;">(</span><span style="color: #000000;">oneline</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\r\n"</span><span style="color: #0000FF;">),</span><span style="color: #008000;">'\t'</span><span style="color: #0000FF;">,</span><span style="color: #008000;">' '</span><span style="color: #0000FF;">)</span>
string padding = repeat(' ',tok_col)
<span style="color: #004080;">string</span> <span style="color: #000000;">padding</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">repeat</span><span style="color: #0000FF;">(</span><span style="color: #008000;">' '</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok_col</span><span style="color: #0000FF;">)</span>
return sprintf("%s\n%s^ ",{oneline,padding})
<span style="color: #008080;">return</span> <span style="color: #7060A8;">sprintf</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"%s\n%s^ "</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">oneline</span><span style="color: #0000FF;">,</span><span style="color: #000000;">padding</span><span style="color: #0000FF;">})</span>
end function
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
 
global procedure error(sequence msg, sequence args={})
<span style="color: #008080;">global</span> <span style="color: #008080;">procedure</span> <span style="color: #000000;">error</span><span style="color: #0000FF;">(</span><span style="color: #004080;">sequence</span> <span style="color: #000000;">msg</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">args</span><span style="color: #0000FF;">={})</span>
if length(args) then
<span style="color: #008080;">if</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">args</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span>
msg = sprintf(msg,args)
<span style="color: #000000;">msg</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">sprintf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">msg</span><span style="color: #0000FF;">,</span><span style="color: #000000;">args</span><span style="color: #0000FF;">)</span>
end if
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
string el = iff(atom(oneline)?"":errline())
<span style="color: #004080;">string</span> <span style="color: #000000;">el</span> <span style="color: #0000FF;">=</span> <span style="color: #008080;">iff</span><span style="color: #0000FF;">(</span><span style="color: #004080;">atom</span><span style="color: #0000FF;">(</span><span style="color: #000000;">oneline</span><span style="color: #0000FF;">)?</span><span style="color: #008000;">""</span><span style="color: #0000FF;">:</span><span style="color: #000000;">errline</span><span style="color: #0000FF;">())</span>
if integer(tok_line) then tok_line = sprintf("%d",tok_line) end if
<span style="color: #008080;">if</span> <span style="color: #004080;">integer</span><span style="color: #0000FF;">(</span><span style="color: #000000;">tok_line</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span> <span style="color: #000000;">tok_line</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">sprintf</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"%d"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok_line</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
if integer(tok_col) then tok_col = sprintf("%d",tok_col) end if
<span style="color: #008080;">if</span> <span style="color: #004080;">integer</span><span style="color: #0000FF;">(</span><span style="color: #000000;">tok_col</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span> <span style="color: #000000;">tok_col</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">sprintf</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"%d"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok_col</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
printf(STDOUT,errfmt,{tok_line,tok_col,el,msg})
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">STDOUT</span><span style="color: #0000FF;">,</span><span style="color: #000000;">errfmt</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">tok_line</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok_col</span><span style="color: #0000FF;">,</span><span style="color: #000000;">el</span><span style="color: #0000FF;">,</span><span style="color: #000000;">msg</span><span style="color: #0000FF;">})</span>
{} = wait_key()
<span style="color: #0000FF;">{}</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">wait_key</span><span style="color: #0000FF;">()</span>
abort(1)
<span style="color: #7060A8;">abort</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">)</span>
end procedure
<span style="color: #008080;">end</span> <span style="color: #008080;">procedure</span>
 
function open_file(string file_name, string mode)
<span style="color: #008080;">include</span> <span style="color: #000000;">js_io</span><span style="color: #0000FF;">.</span><span style="color: #000000;">e</span> <span style="color: #000080;font-style:italic;">-- fake file i/o for running under pwa/p2js</span>
integer fn = open(file_name, mode)
if fn = -1 then
<span style="color: #008080;">function</span> <span style="color: #000000;">open_file</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">file_name</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">string</span> <span style="color: #000000;">mode</span><span style="color: #0000FF;">)</span>
printf(STDOUT, "Could not open %s", {file_name})
<span style="color: #004080;">integer</span> <span style="color: #000000;">fn</span> <span style="color: #0000FF;">=</span> <span style="color: #008080;">iff</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">platform</span><span style="color: #0000FF;">()=</span><span style="color: #004600;">JS</span><span style="color: #0000FF;">?</span><span style="color: #000000;">js_open</span><span style="color: #0000FF;">(</span><span style="color: #000000;">file_name</span><span style="color: #0000FF;">)</span>
{} = wait_key()
<span style="color: #0000FF;">:</span><span style="color: #7060A8;">open</span><span style="color: #0000FF;">(</span><span style="color: #000000;">file_name</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">mode</span><span style="color: #0000FF;">))</span>
abort(1)
<span style="color: #008080;">if</span> <span style="color: #000000;">fn</span><span style="color: #0000FF;"><=</span><span style="color: #000000;">0</span> <span style="color: #008080;">then</span>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">STDOUT</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"Could not open %s"</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">file_name</span><span style="color: #0000FF;">})</span>
<span style="color: #0000FF;">{}</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">wait_key</span><span style="color: #0000FF;">()</span>
<span style="color: #7060A8;">abort</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">)</span>
 
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">fn</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">procedure</span> <span style="color: #000000;">open_files</span><span style="color: #0000FF;">(</span><span style="color: #004080;">sequence</span> <span style="color: #000000;">cl</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">cl</span><span style="color: #0000FF;">)></span><span style="color: #000000;">2</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">input_file</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">open_file</span><span style="color: #0000FF;">(</span><span style="color: #000000;">cl</span><span style="color: #0000FF;">[</span><span style="color: #000000;">3</span><span style="color: #0000FF;">],</span><span style="color: #008000;">"r"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">cl</span><span style="color: #0000FF;">)></span><span style="color: #000000;">3</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">output_file</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">open_file</span><span style="color: #0000FF;">(</span><span style="color: #000000;">cl</span><span style="color: #0000FF;">[</span><span style="color: #000000;">4</span><span style="color: #0000FF;">],</span><span style="color: #008000;">"w"</span><span style="color: #0000FF;">)</span>
 
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">procedure</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">procedure</span> <span style="color: #000000;">close_files</span><span style="color: #0000FF;">()</span>
 
<span style="color: #008080;">if</span> <span style="color: #7060A8;">platform</span><span style="color: #0000FF;">()!=</span><span style="color: #004600;">JS</span> <span style="color: #008080;">then</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">input_file</span><span style="color: #0000FF;">!=</span><span style="color: #000000;">STDIN</span> <span style="color: #008080;">then</span> <span style="color: #7060A8;">close</span><span style="color: #0000FF;">(</span><span style="color: #000000;">input_file</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
return sprintf("\"%s\"",substitute(s,"\n","\\n"))
<span style="color: #008080;">if</span> <span style="color: #000000;">output_file</span><span style="color: #0000FF;">!=</span><span style="color: #000000;">STDOUT</span> <span style="color: #008080;">then</span> <span style="color: #7060A8;">close</span><span style="color: #0000FF;">(</span><span style="color: #000000;">output_file</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
 
<span style="color: #008080;">end</span> <span style="color: #008080;">procedure</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">function</span> <span style="color: #000000;">enquote</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">return</span> <span style="color: #7060A8;">sprintf</span><span style="color: #0000FF;">(</span><span style="color: #008000;">`"%s"`</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">substitute</span><span style="color: #0000FF;">(</span><span style="color: #000000;">s</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\\n"</span><span style="color: #0000FF;">))</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">function</span> <span style="color: #000000;">unquote</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">[</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]!=</span><span style="color: #008000;">'\"'</span> <span style="color: #008080;">then</span> <span style="color: #0000FF;">?</span><span style="color: #000000;">9</span><span style="color: #0000FF;">/</span><span style="color: #000000;">0</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">[$]!=</span><span style="color: #008000;">'\"'</span> <span style="color: #008080;">then</span> <span style="color: #0000FF;">?</span><span style="color: #000000;">9</span><span style="color: #0000FF;">/</span><span style="color: #000000;">0</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000000;">s</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">substitute</span><span style="color: #0000FF;">(</span><span style="color: #000000;">s</span><span style="color: #0000FF;">[</span><span style="color: #000000;">2</span><span style="color: #0000FF;">..-</span><span style="color: #000000;">2</span><span style="color: #0000FF;">],</span><span style="color: #008000;">"\\n"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">s</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<!--</syntaxhighlight>-->
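As a quick illustration (not part of the distributed sources), enquote and unquote are inverses, which is what lets string tokens survive a round trip through the human-readable intermediate files:
<syntaxhighlight lang="phix">-- illustrative only: round-tripping a string value through the helpers above
string s = "Hello, World!\n"
string q = enquote(s)                   -- q is now "Hello, World!\n" (quoted, newline escaped)
printf(1,"%s\n",{q})
printf(1,"%d\n",{equal(unquote(q),s)})  -- prints 1
</syntaxhighlight>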
For running under pwa/p2js, we also have a "fake file i/o" component:
<!--<syntaxhighlight lang="phix">(phixonline)-->
<span style="color: #000080;font-style:italic;">--
-- demo\rosetta\Compiler\js_io.e
-- =============================
--
-- Fake file i/o for running under pwa/p2js in a browser
-- Does not cover the human readable reload parts of extra.e
--</span>
<span style="color: #008080;">with</span> <span style="color: #008080;">javascript_semantics</span>
<span style="color: #004080;">sequence</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">known_files</span><span style="color: #0000FF;">,</span><span style="color: #000000;">kfc</span><span style="color: #0000FF;">}</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">columnize</span><span style="color: #0000FF;">({</span>
<span style="color: #0000FF;">{</span><span style="color: #008000;">"test3.c"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"""
/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
*/
/* Print */ print /* Sub */ -
/* Putc */ putc /* Lss */ &lt;
/* If */ if /* Gtr */ &gt;
/* Else */ else /* Leq */ &lt;=
/* While */ while /* Geq */ &gt;=
/* Lbrace */ { /* Eq */ ==
/* Rbrace */ } /* Neq */ !=
/* Lparen */ ( /* And */ &&
/* Rparen */ ) /* Or */ ||
/* Uminus */ - /* Semi */ ;
/* Not */ ! /* Comma */ ,
/* Mul */ * /* Assign */ =
/* Div */ / /* Integer */ 42
/* Mod */ % /* String */ "String literal"
/* Add */ + /* Ident */ variable_name
/* character literal */ '\n'
/* character literal */ '\\'
/* character literal */ ' '
"""</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)},</span>
<span style="color: #0000FF;">{</span><span style="color: #008000;">"test4.c"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"""
/*** test printing, embedded \n and comments with lots of '*' ***/
print(42);
print("\nHello World\nGood Bye\nok\n");
print("Print a slash n - \\n.\n");
"""</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)},</span>
<span style="color: #0000FF;">{</span><span style="color: #008000;">"primes.c"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"""
/*
Simple prime number generator
*/
count = 1;
n = 1;
limit = 100;
while (n &lt; limit) {
k=3;
p=1;
n=n+2;
while ((k*k&lt;=n) && (p)) {
p=n/k*k!=n;
k=k+2;
}
if (p) {
print(n, " is prime\n");
count = count + 1;
}
}
print("Total primes found: ", count, "\n");
"""</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)},</span>
<span style="color: #0000FF;">{</span><span style="color: #008000;">"gcd.c"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"""
/* Compute the gcd of 1071, 1029: 21 */
a = 1071;
b = 1029;
while (b != 0) {
new_a = b;
b = a % b;
a = new_a;
}
print(a);
"""</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)},</span>
<span style="color: #0000FF;">{</span><span style="color: #008000;">"Header.h"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"""
#define area(h, w) h * w
"""</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)},</span>
<span style="color: #0000FF;">{</span><span style="color: #008000;">"Source.t"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"""
#include "Header.h"
#define width 5
#define height 6
area = #area(height, width)#;
"""</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)}})</span>
<span style="color: #004080;">sequence</span> <span style="color: #000000;">linenos</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">repeat</span><span style="color: #0000FF;">(-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">known_files</span><span style="color: #0000FF;">))</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">function</span> <span style="color: #000000;">js_open</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">filename</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">fn</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">filename</span><span style="color: #0000FF;">,</span><span style="color: #000000;">known_files</span><span style="color: #0000FF;">)</span>
<span style="color: #7060A8;">assert</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">!=</span><span style="color: #000000;">0</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">linenos</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">fn</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">function</span> <span style="color: #000000;">js_gets</span><span style="color: #0000FF;">(</span><span style="color: #004080;">integer</span> <span style="color: #000000;">fn</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">lineno</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">linenos</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">]+</span><span style="color: #000000;">1</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">lineno</span><span style="color: #0000FF;"><=</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">kfc</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">])</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">linenos</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">lineno</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">kfc</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">][</span><span style="color: #000000;">lineno</span><span style="color: #0000FF;">]</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">EOF</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<!--</syntaxhighlight>-->
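The fake routines deliberately mimic open() and gets(), so callers only need a platform()=JS test to choose between them. A small usage sketch (illustrative only; any of the known_files names above would do):
<syntaxhighlight lang="phix">-- illustrative only: echo one of the built-in sources via the fake file i/o
integer fd = js_open("gcd.c")       -- must be one of the known_files
object ln = js_gets(fd)
while ln!=EOF do
    printf(1,"%s\n",{ln})           -- js_gets() returns lines without a trailing '\n'
    ln = js_gets(fd)
end while
</syntaxhighlight>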
The main lexer is also written to be reusable by later stages.
<syntaxhighlight lang="phix">--
-- demo\rosetta\Compiler\lex.e
-- ===========================
--
-- The reusable part of lex.exw
-- This is only kept separate from core.e for consistency with later modules.
--
with javascript_semantics

include core.e

integer ch = ' ',
        line = 0,
        col = 0

procedure eof(string s)
    error("%s in %s literal",{iff(ch=EOF?"EOF":"EOL"),s})
end procedure

function next_ch()
    while 1 do
        col += 1
        if oneline=EOF then
            ch = EOF
            exit
        elsif col>length(oneline) then
            line += 1
            col = 0
            oneline = iff(platform()=JS?js_gets(input_file)
                                       :gets(input_file))
        else
            ch = oneline[col]
            exit
        end if
    end while
    return ch
end function

-- for pwa/p2js (JavaScript *really* dislikes tabs in strings):
--constant whitespace = " \t\r\n\x0B\xA0"
constant whitespace = {' ','\t','\r','\n',#0B,#A0}
-- (0x0B is Vertical Tab, 0xA0 is Non-breaking space)

procedure skipspacesandcomments()
    while 1 do
        if not find(ch,whitespace) then
            if ch='/' and col<length(oneline) and oneline[col+1]='*' then
                tok_line = line     -- (in case of EOF error)
                tok_col = col
                ch = next_ch()      -- (can be EOF)
                ch = next_ch()      -- ( "" )
                while 1 do
                    if ch='*' then
                        ch = next_ch()
                        if ch='/' then exit end if
                    elsif ch=EOF then
                        error("EOF in comment")
                    else
                        ch = next_ch()
                    end if
                end while
            else
                exit
            end if
        end if
        ch = next_ch()
    end while
end procedure

function escape_char(string s)
    ch = next_ch()  -- (discard the '\\')
    if ch='n' then
        ch = '\n'
    elsif ch='\\' then
        ch = '\\'
    elsif ch=EOF
       or ch='\n' then
        eof(s)
    else
        error(`unknown escape sequence \%c`, {ch})
    end if
    return ch
end function

function char_lit()
    integer startch = ch,
            res = next_ch()     -- (skip opening quote, save res)
    if ch=startch then
        error("empty character constant")
    elsif ch='\\' then
        res = escape_char("character")
    end if
    ch = next_ch()
    if ch=EOF
    or ch='\n' then
        eof("character")
    elsif ch!=startch then
        error("multi-character constant")
    end if
    ch = next_ch()
    return {tk_Integer, res}
end function

function string_lit()
    integer startch = ch
    string text = ""
    while next_ch()!=startch do
        if ch=EOF
        or ch='\n' then
            eof("string")
        elsif ch='\\' then
            ch = escape_char("string")
        end if
        text &= ch
    end while
    ch = next_ch()
    return {tk_String, text}
end function

function get_op()
--  sequence operator &= {ch}
    string operator = ""&ch
    ch = next_ch()
    while charmap[ch]=OPERATOR
      and find(operator&ch,operators) do
        -- (^ ie/eg merge ">=", but not ");")
        operator &= ch
        ch = next_ch()
    end while
    integer k = find(operator,operators)
    if k=0 then error("unknown operator") end if
    return {opcodes[k], 0}  -- (0 unused)
end function

function get_int()
    integer i = 0
    while charmap[ch]=DIGIT do
        i = i*10 + (ch-'0')
        ch = next_ch()
    end while
    if charmap[ch]=LETTER then
        error("invalid number")
    end if
    return {tk_Integer, i}
end function

function get_ident()
    string text = ""
    while find(charmap[ch],{LETTER,DIGIT}) do
        text &= ch
        ch = next_ch()
    end while
    integer keyword = getd(text,KEYWORDS)
    if keyword!=NULL then
        return {keyword, 0}     -- (0 unused)
    end if
    return {tk_Identifier, text}
end function

function get_token()
    skipspacesandcomments()
    tok_line = line
    tok_col = col
    switch ch do
        case EOF  then return {tk_EOI, 0}   -- (0 unused)
        case '\'' then return char_lit()
        case '"'  then return string_lit()
        else
            switch charmap[ch] do
                case OPERATOR then return get_op()
                case DIGIT    then return get_int()
                case LETTER   then return get_ident()
                else error("unrecognized character: (%d)", {ch})
            end switch
    end switch
end function

global function lex()
    sequence toks = {}
    integer tok = -1
    object v
    while tok!=tk_EOI do
        {tok,v} = get_token()
        toks = append(toks,{tok_line,tok_col,tok,v})
    end while
    return toks
end function</syntaxhighlight>
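The tables charmap, operators, opcodes, KEYWORDS and tkNames used above all live in core.e, which is only partially reproduced on this page. Purely as a hypothetical sketch of their likely shape (the real definitions may differ):
<syntaxhighlight lang="phix">-- hypothetical sketch of the core.e lookup tables (assumed, not the originals)
enum OPERATOR, DIGIT, LETTER                    -- character classes for charmap
sequence charmap = repeat(0,256)
for i='0' to '9' do charmap[i] = DIGIT end for
for i='A' to 'Z' do charmap[i] = LETTER end for
for i='a' to 'z' do charmap[i] = LETTER end for
charmap['_'] = LETTER
constant opchars = "*/%+-<>=!&|(){};,"
for i=1 to length(opchars) do charmap[opchars[i]] = OPERATOR end for

constant operators = {"*","/","%","+","-","<","<=",">",">=","==","!=","!","=",
                      "&&","||","(",")","{","}",";",","},
         opcodes   = {tk_mul,tk_div,tk_mod,tk_add,tk_sub,tk_lt,tk_le,tk_gt,tk_ge,
                      tk_eq,tk_ne,tk_not,tk_assign,tk_and,tk_or,tk_lparen,tk_rparen,
                      tk_lbrace,tk_rbrace,tk_semi,tk_comma}   -- parallel to operators

integer KEYWORDS = new_dict()       -- maps "if"/"else"/"while"/"print"/"putc" to token ids
setd("if",tk_if,KEYWORDS)           -- ...and so on for the other keywords
</syntaxhighlight>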
Optional: if you need human-readable output/input at each (later) stage, so that the stages can be connected with pipes:
<!--<syntaxhighlight lang="phix">-->
<span style="color: #000080;font-style:italic;">--
-- demo\rosetta\Compiler\extra.e
-- =============================
--
-- Routines to reload human-readable files (deviation from task requirement)
--</span>
<span style="color: #008080;">without</span> <span style="color: #008080;">js</span> <span style="color: #000080;font-style:italic;">-- (file i/o)
--The following can be used to load .lex files, as created by lex.exw:
-- (in place of the existing get_tok() in parse.e)</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">get_tok</span><span style="color: #0000FF;">()</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">line</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">trim</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">gets</span><span style="color: #0000FF;">(</span><span style="color: #000000;">input_file</span><span style="color: #0000FF;">))</span>
<span style="color: #004080;">sequence</span> <span style="color: #000000;">tok</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #000000;">line</span><span style="color: #0000FF;">,</span><span style="color: #008000;">' '</span><span style="color: #0000FF;">,</span><span style="color: #000000;">limit</span><span style="color: #0000FF;">:=</span><span style="color: #000000;">4</span><span style="color: #0000FF;">,</span><span style="color: #000000;">no_empty</span><span style="color: #0000FF;">:=</span><span style="color: #000000;">1</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">k</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">tok</span><span style="color: #0000FF;">[</span><span style="color: #000000;">3</span><span style="color: #0000FF;">],</span><span style="color: #000000;">tkNames</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">k</span><span style="color: #0000FF;">=</span><span style="color: #000000;">0</span> <span style="color: #008080;">then</span> <span style="color: #0000FF;">?</span><span style="color: #000000;">9</span><span style="color: #0000FF;">/</span><span style="color: #000000;">0</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000000;">tok</span><span style="color: #0000FF;">[</span><span style="color: #000000;">3</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">k</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">tok</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #000080;font-style:italic;">--The following can be used to load .ast files, as created by parse.exw:
-- (in place of the existing lex()/parse() pairs in cgen.exw and interp.exw)</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">load_ast</span><span style="color: #0000FF;">()</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">line</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">trim</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">gets</span><span style="color: #0000FF;">(</span><span style="color: #000000;">input_file</span><span style="color: #0000FF;">))</span>
<span style="color: #000080;font-style:italic;">-- Each line has at least one token</span>
<span style="color: #004080;">sequence</span> <span style="color: #000000;">node</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #000000;">line</span><span style="color: #0000FF;">,</span><span style="color: #008000;">' '</span><span style="color: #0000FF;">,</span><span style="color: #000000;">limit</span><span style="color: #0000FF;">:=</span><span style="color: #000000;">2</span><span style="color: #0000FF;">,</span><span style="color: #000000;">no_empty</span><span style="color: #0000FF;">:=</span><span style="color: #000000;">1</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">node_type</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">node</span><span style="color: #0000FF;">[</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">node_type</span> <span style="color: #0000FF;">==</span> <span style="color: #008000;">";"</span> <span style="color: #008080;">then</span> <span style="color: #000080;font-style:italic;">-- a terminal node</span>
<span style="color: #008080;">return</span> <span style="color: #004600;">NULL</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">n_type</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">node_type</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tkNames</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">n_type</span><span style="color: #0000FF;">=</span><span style="color: #000000;">0</span> <span style="color: #008080;">then</span> <span style="color: #0000FF;">?</span><span style="color: #000000;">9</span><span style="color: #0000FF;">/</span><span style="color: #000000;">0</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000080;font-style:italic;">-- A line with two tokens is a leaf node
-- Leaf nodes are: Identifier, Integer, String
-- The 2nd token is the value</span>
<span style="color: #008080;">if</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">node</span><span style="color: #0000FF;">)></span><span style="color: #000000;">1</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">node</span><span style="color: #0000FF;">[</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">n_type</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">n_type</span><span style="color: #0000FF;">=</span><span style="color: #000000;">tk_Integer</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">node</span><span style="color: #0000FF;">[</span><span style="color: #000000;">2</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">to_integer</span><span style="color: #0000FF;">(</span><span style="color: #000000;">node</span><span style="color: #0000FF;">[</span><span style="color: #000000;">2</span><span style="color: #0000FF;">])</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">n_type</span><span style="color: #0000FF;">=</span><span style="color: #000000;">tk_String</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">node</span><span style="color: #0000FF;">[</span><span style="color: #000000;">2</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">unquote</span><span style="color: #0000FF;">(</span><span style="color: #000000;">node</span><span style="color: #0000FF;">[</span><span style="color: #000000;">2</span><span style="color: #0000FF;">])</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">node</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #004080;">object</span> <span style="color: #000000;">left</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">load_ast</span><span style="color: #0000FF;">()</span>
<span style="color: #004080;">object</span> <span style="color: #000000;">right</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">load_ast</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">return</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">n_type</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">left</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">right</span><span style="color: #0000FF;">}</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<!--</syntaxhighlight>-->
Finally, a simple test driver for the specific task:
<!--<syntaxhighlight lang="phix">(phixonline)-->
<span style="color: #000080;font-style:italic;">--
-- demo\rosetta\Compiler\lex.exw
-- =============================
--</span>

<span style="color: #008080;">with</span> <span style="color: #008080;">javascript_semantics</span>
<span style="color: #008080;">include</span> <span style="color: #000000;">lex</span><span style="color: #0000FF;">.</span><span style="color: #000000;">e</span>

<span style="color: #008080;">procedure</span> <span style="color: #000000;">main</span><span style="color: #0000FF;">(</span><span style="color: #004080;">sequence</span> <span style="color: #000000;">cl</span><span style="color: #0000FF;">)</span>
    <span style="color: #000000;">open_files</span><span style="color: #0000FF;">(</span><span style="color: #000000;">cl</span><span style="color: #0000FF;">)</span>
    <span style="color: #004080;">sequence</span> <span style="color: #000000;">toks</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">lex</span><span style="color: #0000FF;">()</span>
    <span style="color: #004080;">integer</span> <span style="color: #000000;">tok</span>
    <span style="color: #004080;">object</span> <span style="color: #000000;">v</span>
    <span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">toks</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span>
        <span style="color: #0000FF;">{</span><span style="color: #000000;">tok_line</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok_col</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok</span><span style="color: #0000FF;">,</span><span style="color: #000000;">v</span><span style="color: #0000FF;">}</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">toks</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]</span>
        <span style="color: #008080;">switch</span> <span style="color: #000000;">tok</span> <span style="color: #008080;">do</span>
            <span style="color: #008080;">case</span> <span style="color: #000000;">tk_Identifier</span><span style="color: #0000FF;">:</span> <span style="color: #000000;">v</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">sprintf</span><span style="color: #0000FF;">(</span><span style="color: #008000;">" %s"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">v</span><span style="color: #0000FF;">)</span>
            <span style="color: #008080;">case</span> <span style="color: #000000;">tk_Integer</span><span style="color: #0000FF;">:</span> <span style="color: #000000;">v</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">sprintf</span><span style="color: #0000FF;">(</span><span style="color: #008000;">" %5d"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">v</span><span style="color: #0000FF;">)</span>
            <span style="color: #008080;">case</span> <span style="color: #000000;">tk_String</span><span style="color: #0000FF;">:</span> <span style="color: #000000;">v</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">sprintf</span><span style="color: #0000FF;">(</span><span style="color: #008000;">" %s"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">enquote</span><span style="color: #0000FF;">(</span><span style="color: #000000;">v</span><span style="color: #0000FF;">))</span>
            <span style="color: #008080;">else</span> <span style="color: #000000;">v</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">""</span>
        <span style="color: #008080;">end</span> <span style="color: #008080;">switch</span>
        <span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">output_file</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"%5d %5d %-10s%s\n"</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">tok_line</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok_col</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tkNames</span><span style="color: #0000FF;">[</span><span style="color: #000000;">tok</span><span style="color: #0000FF;">],</span><span style="color: #000000;">v</span><span style="color: #0000FF;">})</span>
    <span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
    <span style="color: #000000;">close_files</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">procedure</span>

<span style="color: #000080;font-style:italic;">--main(command_line())</span>
<span style="color: #000000;">main</span><span style="color: #0000FF;">({</span><span style="color: #000000;">0</span><span style="color: #0000FF;">,</span><span style="color: #000000;">0</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"test4.c"</span><span style="color: #0000FF;">})</span>
<!--</syntaxhighlight>-->
{{out}}
<pre>
Line 6,097 ⟶ 14,989:
=={{header|Prolog}}==
 
<syntaxhighlight lang="prolog">/*
Test harness for the analyzer, not needed if we are actually using the output.
*/
Line 6,257 ⟶ 15,149:
 
% anything else is an error
tok(_,_,L,P) --> { format(atom(Error), 'Invalid token at line ~d,~d', [L,P]), throw(Error) }.</syntaxhighlight>
{{out}}
<pre>
Line 6,298 ⟶ 15,190:
=={{header|Python}}==
Tested with Python 2.7 and 3.x
<syntaxhighlight lang="python">from __future__ import print_function
import sys
 
Line 6,379 ⟶ 15,271:
#*** "string"
def string_lit(start, err_line, err_col):
global the_ch
text = ""
 
Line 6,386 ⟶ 15,279:
if the_ch == '\n':
error(err_line, err_col, "EOL while scanning string literal")
if the_ch == '\\':
next_ch()
if the_ch != 'n':
error(err_line, err_col, "escape sequence unknown \\%c" % the_ch)
the_ch = '\n'
text += the_ch
 
Line 6,473 ⟶ 15,371:
 
if tok == tk_EOI:
        break</syntaxhighlight>
 
{{out|case=test case 3}}
Line 6,513 ⟶ 15,411:
23 1 End_of_input
</pre>
</b>
 
=={{header|QB64}}==
Tested with QB64 1.5
<syntaxhighlight lang="vb">dim shared source as string, the_ch as string, tok as string, toktyp as string
dim shared line_n as integer, col_n as integer, text_p as integer, err_line as integer, err_col as integer, errors as integer
 
declare function isalnum&(s as string)
declare function isalpha&(s as string)
declare function isdigit&(s as string)
declare sub divide_or_comment
declare sub error_exit(line_n as integer, col_n as integer, msg as string)
declare sub follow(c as string, typ2 as string, typ1 as string)
declare sub nextch
declare sub nexttok
declare sub read_char_lit
declare sub read_ident
declare sub read_number
declare sub read_string
 
const c_integer = "Integer", c_ident = "Identifier", c_string = "String"
 
dim out_fn as string, out_tok as string
 
if command$(1) = "" then print "Expecting a filename": end
open command$(1) for binary as #1
source = space$(lof(1))
get #1, 1, source
close #1
 
out_fn = command$(2): if out_fn <> "" then open out_fn for output as #1
 
line_n = 1: col_n = 0: text_p = 1: the_ch = " "
 
do
call nexttok
select case toktyp
case c_integer, c_ident, c_string: out_tok = tok
case else: out_tok = ""
end select
if out_fn = "" then
print err_line, err_col, toktyp, out_tok
else
print #1, err_line, err_col, toktyp, out_tok
end if
loop until errors or tok = ""
if out_fn <> "" then close #1
end
 
' get next tok, toktyp
sub nexttok
toktyp = ""
restart: err_line = line_n: err_col = col_n: tok = the_ch
select case the_ch
case " ", chr$(9), chr$(10): call nextch: goto restart
case "/": call divide_or_comment: if tok = "" then goto restart
 
case "%": call nextch: toktyp = "Op_mod"
case "(": call nextch: toktyp = "LeftParen"
case ")": call nextch: toktyp = "RightParen"
case "*": call nextch: toktyp = "Op_multiply"
case "+": call nextch: toktyp = "Op_add"
case ",": call nextch: toktyp = "Comma"
case "-": call nextch: toktyp = "Op_subtract"
case ";": call nextch: toktyp = "Semicolon"
case "{": call nextch: toktyp = "LeftBrace"
case "}": call nextch: toktyp = "RightBrace"
 
case "&": call follow("&", "Op_and", "")
case "|": call follow("|", "Op_or", "")
case "!": call follow("=", "Op_notequal", "Op_not")
case "<": call follow("=", "Op_lessequal", "Op_less")
case "=": call follow("=", "Op_equal", "Op_assign")
case ">": call follow("=", "Op_greaterequal", "Op_greater")
 
case chr$(34): call read_string
case chr$(39): call read_char_lit
 
case "": toktyp = "End_of_input"
 
case else
if isdigit&(the_ch) then
call read_number
elseif isalpha&(the_ch) then
call read_ident
else
call nextch
end if
end select
end sub
 
sub follow(c as string, if_both as string, if_one as string)
call nextch
if the_ch = c then
tok = tok + the_ch
call nextch
toktyp = if_both
else
if if_one = "" then call error_exit(line_n, col_n, "Expecting " + c): exit sub
toktyp = if_one
end if
end sub
 
sub read_string
toktyp = c_string
call nextch
do
tok = tok + the_ch
select case the_ch
case chr$(10): call error_exit(line_n, col_n, "EOL in string"): exit sub
case "": call error_exit(line_n, col_n, "EOF in string"): exit sub
case chr$(34): call nextch: exit sub
case else: call nextch
end select
loop
end sub
 
sub read_char_lit
toktyp = c_integer
call nextch
if the_ch = chr$(39) then
call error_exit(err_line, err_col, "Empty character constant"): exit sub
end if
 
if the_ch = "\" then
call nextch
if the_ch = "n" then
tok = "10"
elseif the_ch = "\" then
tok = "92"
else
call error_exit(line_n, col_n, "Unknown escape sequence:" + the_ch): exit sub
end if
else
tok = ltrim$(str$(asc(the_ch)))
end if
 
call nextch
if the_ch <> chr$(39) then
call error_exit(line_n, col_n, "Multi-character constant"): exit sub
end if
call nextch
end sub
 
sub divide_or_comment
call nextch
if the_ch <> "*" then
toktyp = "Op_divide"
else ' skip comments
tok = ""
call nextch
do
if the_ch = "*" then
call nextch
if the_ch = "/" then
call nextch
exit sub
end if
elseif the_ch = "" then
call error_exit(line_n, col_n, "EOF in comment"): exit sub
else
call nextch
end if
loop
end if
end sub
 
sub read_ident
do
call nextch
if not isalnum&(the_ch) then exit do
tok = tok + the_ch
loop
select case tok
case "else": toktyp = "keyword_else"
case "if": toktyp = "keyword_if"
case "print": toktyp = "keyword_print"
case "putc":: toktyp = "keyword_putc"
case "while": toktyp = "keyword_while"
case else: toktyp = c_ident
end select
end sub
 
sub read_number
toktyp = c_integer
do
call nextch
if not isdigit&(the_ch) then exit do
tok = tok + the_ch
loop
 
if isalpha&(the_ch) then
call error_exit(err_line, err_col, "Bogus number: " + tok + the_ch): exit sub
end if
end sub
 
function isalpha&(s as string)
dim c as string
c = left$(s, 1)
isalpha& = c <> "" and instr("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_", c) > 0
end function
 
function isdigit&(s as string)
dim c as string
c = left$(s, 1)
isdigit& = c <> "" and instr("0123456789", c) > 0
end function
 
function isalnum&(s as string)
dim c as string
c = left$(s, 1)
isalnum& = c <> "" and instr("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_", c) > 0
end function
 
' get next char - fold cr/lf into just lf
sub nextch
the_ch = ""
col_n = col_n + 1
if text_p > len(source) then exit sub
 
the_ch = mid$(source, text_p, 1)
text_p = text_p + 1
 
if the_ch = chr$(13) then
the_ch = chr$(10)
if text_p <= len(source) then
if mid$(source, text_p, 1) = chr$(10) then
text_p = text_p + 1
end if
end if
end if
 
if the_ch = chr$(10) then
line_n = line_n + 1
col_n = 0
end if
 
end sub
 
sub error_exit(line_n as integer, col_n as integer, msg as string)
errors = -1
print line_n, col_n, msg
end
end sub
</syntaxhighlight>
{{out|case=test case 3}}
<b>
<pre> 5 16 keyword_print
5 40 Op_subtract
6 16 keyword_putc
6 40 Op_less
7 16 keyword_if
7 40 Op_greater
8 16 keyword_else
8 40 Op_lessequal
9 16 keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
</b>
 
=={{header|Racket}}==
<syntaxhighlight lang="racket">
#lang racket
(require parser-tools/lex)
Line 6,672 ⟶ 15,851:
"TEST 5"
(display-tokens (string->tokens test5))
</syntaxhighlight>
 
=={{header|Raku}}==
(formerly Perl 6)
This is more complicated than strictly necessary for this task. It is set up to be easily adapted to do syntax analysis.
 
(Note: there are several bogus comments added solely to help with syntax highlighting.)
 
{{works with|Rakudo|2016.08}}
 
<syntaxhighlight lang="raku" line>grammar tiny_C {
rule TOP { ^ <.whitespace>? <tokens> + % <.whitespace> <.whitespace> <eoi> }
 
rule whitespace { [ <comment> + % <ws> | <ws> ] }
 
token comment { '/*' ~ '*/' .*? }
 
token tokens {
[
| <operator> { make $/<operator>.ast }
| <keyword> { make $/<keyword>.ast }
| <symbol> { make $/<symbol>.ast }
| <identifier> { make $/<identifier>.ast }
| <integer> { make $/<integer>.ast }
| <char> { make $/<char>.ast }
| <string> { make $/<string>.ast }
| <error>
]
}
 
proto token operator {*}
token operator:sym<*> { '*' { make 'Op_multiply' } }
token operator:sym</> { '/'<!before '*'> { make 'Op_divide' } }
token operator:sym<%> { '%' { make 'Op_mod' } }
token operator:sym<+> { '+' { make 'Op_add' } }
token operator:sym<-> { '-' { make 'Op_subtract' } }
token operator:sym('<='){ '<=' { make 'Op_lessequal' } }
token operator:sym('<') { '<' { make 'Op_less' } }
token operator:sym('>='){ '>=' { make 'Op_greaterequal'} }
token operator:sym('>') { '>' { make 'Op_greater' } }
token operator:sym<==> { '==' { make 'Op_equal' } }
token operator:sym<!=> { '!=' { make 'Op_notequal' } }
token operator:sym<!> { '!' { make 'Op_not' } }
token operator:sym<=> { '=' { make 'Op_assign' } }
token operator:sym<&&> { '&&' { make 'Op_and' } }
token operator:sym<||> { '||' { make 'Op_or' } }
 
proto token keyword {*}
token keyword:sym<if> { 'if' { make 'Keyword_if' } }
token keyword:sym<else> { 'else' { make 'Keyword_else' } }
token keyword:sym<putc> { 'putc' { make 'Keyword_putc' } }
token keyword:sym<while> { 'while' { make 'Keyword_while' } }
token keyword:sym<print> { 'print' { make 'Keyword_print' } }
 
proto token symbol {*}
token symbol:sym<(> { '(' { make 'LeftParen' } }
token symbol:sym<)> { ')' { make 'RightParen' } }
token symbol:sym<{> { '{' { make 'LeftBrace' } }
token symbol:sym<}> { '}' { make 'RightBrace' } }
token symbol:sym<;> { ';' { make 'Semicolon' } }
token symbol:sym<,> { ',' { make 'Comma' } }
 
token identifier { <[_A..Za..z]><[_A..Za..z0..9]>* { make 'Identifier ' ~ $/ } }
token integer { <[0..9]>+ { make 'Integer ' ~ $/ } }
 
token char {
'\'' [<-[']> | '\n' | '\\\\'] '\''
{ make 'Char_Literal ' ~ $/.subst("\\n", "\n").substr(1, *-1).ord }
}
 
token string {
'"' <-["\n]>* '"' #'
{
make 'String ' ~ $/;
note 'Error: Unknown escape sequence.' and exit if (~$/ ~~ m:r/ <!after <[\\]>>[\\<-[n\\]>]<!before <[\\]>> /);
}
}
 
token eoi { $ { make 'End_of_input' } }
 
token error {
| '\'''\'' { note 'Error: Empty character constant.' and exit }
| '\'' <-[']> ** {2..*} '\'' { note 'Error: Multi-character constant.' and exit }
| '/*' <-[*]>* $ { note 'Error: End-of-file in comment.' and exit }
| '"' <-["]>* $ { note 'Error: End-of-file in string.' and exit }
| '"' <-["]>*? \n { note 'Error: End of line in string.' and exit } #'
}
}
 
sub parse_it ( $c_code ) {
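    # Build a table mapping each character offset of the source text to its
    # (line, column) pair, so match positions below can be reported by line and column.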
my $l;
my @pos = gather for $c_code.lines>>.chars.kv -> $line, $v {
take [ $line + 1, $_ ] for 1 .. ($v+1); # v+1 for newline
$l = $line+2;
}
@pos.push: [ $l, 1 ]; # capture eoi
 
for flat $c_code<tokens>.list, $c_code<eoi> -> $m {
say join "\t", @pos[$m.from].fmt('%3d'), $m.ast;
}
}
 
my $tokenizer = tiny_C.parse(@*ARGS[0].IO.slurp);
parse_it( $tokenizer );</syntaxhighlight>
 
{{out|case=test case 3}}
<pre>
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Char_Literal 10
21 26 Char_Literal 92
22 26 Char_Literal 32
23 1 End_of_input
</pre>
 
=={{header|RATFOR}}==
{{works with|ratfor77|[https://sourceforge.net/p/chemoelectric/ratfor77/ public domain 1.0]}}
{{works with|gfortran|11.2.1}}
{{works with|f2c|20100827}}
 
 
<syntaxhighlight lang="ratfor">######################################################################
#
# The Rosetta Code scanner in Ratfor 77.
#
#
# How to deal with FORTRAN 77 input is a problem. I use formatted
# input, treating each line as an array of type CHARACTER--regrettably
# of no more than some predetermined, finite length. It is a very
# simple method and presents no significant difficulties, aside from
# the restriction on line length of the input.
#
#
# On a POSIX platform, the program can be compiled with f2c and run
# somewhat as follows:
#
# ratfor77 lex-in-ratfor.r > lex-in-ratfor.f
# f2c -C -Nc40 lex-in-ratfor.f
# cc -O lex-in-ratfor.c -lf2c
# ./a.out < compiler-tests/primes.t
#
# With gfortran, a little differently:
#
# ratfor77 lex-in-ratfor.r > lex-in-ratfor.f
# gfortran -O2 -fcheck=all -std=legacy lex-in-ratfor.f
# ./a.out < compiler-tests/primes.t
#
#
# I/O is strictly from default input and to default output, which, on
# POSIX systems, usually correspond respectively to standard input and
# standard output. (I did not wish to have to deal with unit numbers;
# these are now standardized in ISO_FORTRAN_ENV, but that is not
# available in FORTRAN 77.)
#
#---------------------------------------------------------------------
 
# Some parameters you may with to modify.
 
define(LINESZ, 256) # Size of an input line.
define(OUTLSZ, 512) # Size of an output line.
define(PSHBSZ, 10) # Size of the character pushback buffer.
define(STRNSZ, 4096) # Size of the string pool.
 
#---------------------------------------------------------------------
 
define(EOF, -1)
define(NEWLIN, 10) # Unix newline (the LF control character).
define(BACKSL, 92) # ASCII backslash.
 
define(ILINNO, 1) # Line number's index.
define(ICOLNO, 2) # Column number's index.
 
define(CHRSZ, 3) # See ILINNO and ICOLNO above.
define(ICHRCD, 3) # Character code's index.
 
define(TOKSZ, 5) # See ILINNO and ICOLNO above.
define(ITOKNO, 3) # Token number's index.
define(IARGIX, 4) # Index of the string pool index.
define(IARGLN, 5) # Index of the string length.
 
define(TKELSE, 0)
define(TKIF, 1)
define(TKPRNT, 2)
define(TKPUTC, 3)
define(TKWHIL, 4)
define(TKMUL, 5)
define(TKDIV, 6)
define(TKMOD, 7)
define(TKADD, 8)
define(TKSUB, 9)
define(TKNEG, 10)
define(TKLT, 11)
define(TKLE, 12)
define(TKGT, 13)
define(TKGE, 14)
define(TKEQ, 15)
define(TKNE, 16)
define(TKNOT, 17)
define(TKASGN, 18)
define(TKAND, 19)
define(TKOR, 20)
define(TKLPAR, 21)
define(TKRPAR, 22)
define(TKLBRC, 23)
define(TKRBRC, 24)
define(TKSEMI, 25)
define(TKCMMA, 26)
define(TKID, 27)
define(TKINT, 28)
define(TKSTR, 29)
define(TKEOI, 30)
 
define(LOC10, 1) # Location of "10" in the string pool.
define(LOC92, 3) # Location of "92" in the string pool.
 
#---------------------------------------------------------------------
 
subroutine addstr (strngs, istrng, src, i0, n0, i, n)
 
# Add a string to the string pool.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character src(*) # Source string.
integer i0, n0 # Index and length in source string.
integer i, n # Index and length in string pool.
 
integer j
 
if (STRNSZ < istrng + (n0 - 1))
{
write (*, '(''string pool exhausted'')')
stop
}
for (j = 0; j < n0; j = j + 1)
strngs(istrng + j) = src(i0 + j)
i = istrng
n = n0
istrng = istrng + n0
end
 
subroutine cpystr (strngs, i, n, dst, i0)
 
# Copy a string from the string pool to an output string.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer i, n # Index and length in string pool.
character dst(OUTLSZ) # Destination string.
integer i0 # Index within destination string.
 
integer j
 
if (i0 < 1 || OUTLSZ < i0 + (n - 1))
{
write (*, '(''string boundary exceeded'')')
stop
}
for (j = 0; j < n; j = j + 1)
dst(i0 + j) = strngs(i + j)
end
 
#---------------------------------------------------------------------
 
subroutine getchr (line, linno, colno, pushbk, npshbk, chr)
 
# Get a character, with its line number and column number.
 
implicit none
 
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer chr(CHRSZ)
 
# End of file is indicated (as in C) by a negative "char code"
# called "EOF".
 
character*20 fmt
integer stat
integer chr1(CHRSZ)
 
if (0 < npshbk)
{
chr(ICHRCD) = pushbk(ICHRCD, npshbk)
chr(ILINNO) = pushbk(ILINNO, npshbk)
chr(ICOLNO) = pushbk(ICOLNO, npshbk)
npshbk = npshbk - 1
}
else if (colno <= LINESZ)
{
chr(ICHRCD) = ichar (line(colno))
chr(ILINNO) = linno
chr(ICOLNO) = colno
colno = colno + 1
}
else
{
# Return a newline character.
chr(ICHRCD) = NEWLIN
chr(ILINNO) = linno
chr(ICOLNO) = colno
 
# Fetch a new line.
linno = linno + 1
colno = 1
 
# Read a line of text as an array of characters.
write (fmt, '(''('', I10, ''A1)'')') LINESZ
read (*, fmt, iostat = stat) line
 
if (stat != 0)
{
# If end of file has been reached, push an EOF.
chr1(ICHRCD) = EOF
chr1(ILINNO) = linno
chr1(ICOLNO) = colno
call pshchr (pushbk, npshbk, chr1)
}
}
end
 
subroutine pshchr (pushbk, npshbk, chr)
 
# Push back a character.
 
implicit none
 
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer chr(CHRSZ)
 
if (PSHBSZ <= npshbk)
{
write (*, '(''pushback buffer overfull'')')
stop
}
npshbk = npshbk + 1
pushbk(ICHRCD, npshbk) = chr(ICHRCD)
pushbk(ILINNO, npshbk) = chr(ILINNO)
pushbk(ICOLNO, npshbk) = chr(ICOLNO)
end
 
subroutine getpos (line, linno, colno, pushbk, npshbk, ln, cn)
 
# Get the position of the next character.
 
implicit none
 
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer ln, cn # The line and column nos. returned.
 
integer chr(CHRSZ)
 
call getchr (line, linno, colno, pushbk, npshbk, chr)
ln = chr(ILINNO)
cn = chr(ICOLNO)
call pshchr (pushbk, npshbk, chr)
end
 
#---------------------------------------------------------------------
 
function isspc (c)
 
# Is c character code for a space?
 
implicit none
 
integer c
logical isspc
 
#
# The following is correct for ASCII: 32 is the SPACE character, and
# 9 to 13 are control characters commonly regarded as spaces.
#
# In Unicode these are all code points for spaces, but so are others
# besides.
#
isspc = (c == 32 || (9 <= c && c <= 13))
end
 
function isdgt (c)
 
# Is c character code for a digit?
 
implicit none
 
integer c
logical isdgt
 
isdgt = (ichar ('0') <= c && c <= ichar ('9'))
end
 
function isalph (c)
 
# Is c character code for a letter?
 
implicit none
 
integer c
logical isalph
 
#
# The following is correct for ASCII and Unicode, but not for
# EBCDIC.
#
isalph = (ichar ('a') <= c && c <= ichar ('z')) _
|| (ichar ('A') <= c && c <= ichar ('Z'))
end
 
function isid0 (c)
 
# Is c character code for the start of an identifier?
 
implicit none
 
integer c
logical isid0
 
logical isalph
 
isid0 = isalph (c) || c == ichar ('_')
end
 
function isid1 (c)
 
# Is c character code for the continuation of an identifier?
 
implicit none
 
integer c
logical isid1
 
logical isalph
logical isdgt
 
isid1 = isalph (c) || isdgt (c) || c == ichar ('_')
end
 
#---------------------------------------------------------------------
 
function trimlf (str, n)
 
# "Trim left" leading spaces.
 
implicit none
 
character str(*) # The string to "trim".
integer n # The length.
integer trimlf # The index of the first non-space
# character, or n + 1.
 
logical isspc
 
integer j
logical done
 
j = 1
done = .false.
while (!done)
{
if (j == n + 1)
done = .true.
else if (!isspc (ichar (str(j))))
done = .true.
else
j = j + 1
}
trimlf = j
end
 
function trimrt (str, n)
 
# "Trim right" trailing spaces.
 
implicit none
 
character str(*) # The string to "trim".
integer n # The length including trailing spaces.
integer trimrt # The length without trailing spaces.
 
logical isspc
 
integer j
logical done
 
j = n
done = .false.
while (!done)
{
if (j == 0)
done = .true.
else if (!isspc (ichar (str(j))))
done = .true.
else
j = j - 1
}
trimrt = j
end
 
#---------------------------------------------------------------------
 
subroutine toknam (tokno, str, i)
 
# Copy a token name to the character array str, starting at i.
 
implicit none
 
integer tokno
character str(*)
integer i
integer j
 
character*16 names(0:30)
character*16 nm
 
data names / "Keyword_else ", _
"Keyword_if ", _
"Keyword_print ", _
"Keyword_putc ", _
"Keyword_while ", _
"Op_multiply ", _
"Op_divide ", _
"Op_mod ", _
"Op_add ", _
"Op_subtract ", _
"Op_negate ", _
"Op_less ", _
"Op_lessequal ", _
"Op_greater ", _
"Op_greaterequal ", _
"Op_equal ", _
"Op_notequal ", _
"Op_not ", _
"Op_assign ", _
"Op_and ", _
"Op_or ", _
"LeftParen ", _
"RightParen ", _
"LeftBrace ", _
"RightBrace ", _
"Semicolon ", _
"Comma ", _
"Identifier ", _
"Integer ", _
"String ", _
"End_of_input " /
 
nm = names(tokno)
for (j = 0; j < 16; j = j + 1)
str(i + j) = nm(1 + j : 1 + j)
end
 
subroutine intstr (str, i, n, x)
 
# Convert a positive integer to a substring.
 
implicit none
 
character str(*) # Destination string.
integer i, n # Index and length of the field.
integer x # The positive integer to represent.
 
integer j
integer y
 
if (x == 0)
{
for (j = 0; j < n - 1; j = j + 1)
str(i + j) = ' '
str(i + j) = '0'
}
else
{
y = x
for (j = n - 1; 0 <= j; j = j - 1)
{
if (y == 0)
str(i + j) = ' '
else
{
str(i + j) = char (mod (y, 10) + ichar ('0'))
y = y / 10
}
}
}
end
 
subroutine prttok (strngs, tok)
 
# Print a token.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer tok(TOKSZ) # The token.
 
integer trimrt
 
character line(OUTLSZ)
character*20 fmt
integer i, n
integer tokno
 
for (i = 1; i <= OUTLSZ; i = i + 1)
line(i) = ' '
 
call intstr (line, 1, 10, tok(ILINNO))
call intstr (line, 12, 10, tok(ICOLNO))
 
tokno = tok(ITOKNO)
call toknam (tokno, line, 25)
if (tokno == TKID || tokno == TKINT || tokno == TKSTR)
{
i = tok(IARGIX)
n = tok(IARGLN)
call cpystr (strngs, i, n, line, 45)
}
 
n = trimrt (line, OUTLSZ)
write (fmt, '(''('', I10, ''A)'')') n
write (*, fmt) (line(i), i = 1, n)
end
 
#---------------------------------------------------------------------
 
subroutine wrtpos (ln, cn)
 
implicit none
 
integer ln, cn
 
write (*, 1000) ln, cn
1000 format ('At line ', I5, ', column ' I5)
end
 
#---------------------------------------------------------------------
 
subroutine utcmnt (ln, cn)
 
implicit none
 
integer ln, cn
 
call wrtpos (ln, cn)
write (*, '(''Unterminated comment'')')
stop
end
 
subroutine skpcmt (line, linno, colno, pushbk, npshbk, ln, cn)
 
# Skip to the end of a comment.
 
implicit none
 
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer ln, cn # Line and column of start of comment.
 
integer chr(CHRSZ)
logical done
 
done = .false.
while (!done)
{
call getchr (line, linno, colno, pushbk, npshbk, chr)
if (chr(ICHRCD) == ichar ('*'))
{
call getchr (line, linno, colno, pushbk, npshbk, chr)
if (chr(ICHRCD) == ichar ('/'))
done = .true.
else if (chr(ICHRCD) == EOF)
call utcmnt (ln, cn)
}
else if (chr(ICHRCD) == EOF)
call utcmnt (ln, cn)
}
end
 
subroutine skpspc (line, linno, colno, pushbk, npshbk)
 
# Skip spaces and comments.
 
implicit none
 
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
 
logical isspc
 
integer chr(CHRSZ)
integer chr1(CHRSZ)
integer ln, cn
logical done
 
done = .false.
while (!done)
{
call getchr (line, linno, colno, pushbk, npshbk, chr)
if (!isspc (chr(ICHRCD)))
{
if (chr(ICHRCD) != ichar ('/'))
{
call pshchr (pushbk, npshbk, chr)
done = .true.
}
else
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) != ichar ('*'))
{
call pshchr (pushbk, npshbk, chr1)
call pshchr (pushbk, npshbk, chr)
done = .true.
}
else
{
ln = chr(ILINNO)
cn = chr(ICOLNO)
call skpcmt (line, linno, colno, pushbk, npshbk, _
ln, cn)
}
}
}
}
end
 
#---------------------------------------------------------------------
 
subroutine rwdlkp (strngs, istrng, src, i0, n0, ln, cn, tok)
 
# Reserved word lookup
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character src(*) # The source string.
integer i0, n0 # Index and length of the substring.
integer ln, cn # Line and column number
# to associate with the token.
integer tok(TOKSZ) # The output token.
 
integer tokno
integer i, n
 
tokno = TKID
 
if (n0 == 2)
{
if (src(i0) == 'i' && src(i0 + 1) == 'f')
tokno = TKIF
}
else if (n0 == 4)
{
if (src(i0) == 'e' && src(i0 + 1) == 'l' _
&& src(i0 + 2) == 's' && src(i0 + 3) == 'e')
tokno = TKELSE
else if (src(i0) == 'p' && src(i0 + 1) == 'u' _
&& src(i0 + 2) == 't' && src(i0 + 3) == 'c')
tokno = TKPUTC
}
else if (n0 == 5)
{
if (src(i0) == 'p' && src(i0 + 1) == 'r' _
&& src(i0 + 2) == 'i' && src(i0 + 3) == 'n' _
&& src(i0 + 4) == 't')
tokno = TKPRNT
else if (src(i0) == 'w' && src(i0 + 1) == 'h' _
&& src(i0 + 2) == 'i' && src(i0 + 3) == 'l' _
&& src(i0 + 4) == 'e')
tokno = TKWHIL
}
 
i = 0
n = 0
if (tokno == TKID)
call addstr (strngs, istrng, src, i0, n0, i, n)
 
tok(ITOKNO) = tokno
tok(IARGIX) = i
tok(IARGLN) = n
tok(ILINNO) = ln
tok(ICOLNO) = cn
end
 
subroutine scnwrd (line, linno, colno, pushbk, npshbk, ln, cn, buf, n)
 
# Scan characters that may represent an identifier, reserved word,
# or integer literal.
 
implicit none
 
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer ln, cn # Line and column number of the start.
character buf(LINESZ) # The output buffer.
integer n # The length of the string collected.
 
logical isid1
 
integer chr(CHRSZ)
 
n = 0
call getchr (line, linno, colno, pushbk, npshbk, chr)
ln = chr(ILINNO)
cn = chr(ICOLNO)
while (isid1 (chr(ICHRCD)))
{
n = n + 1
buf(n) = char (chr(ICHRCD))
call getchr (line, linno, colno, pushbk, npshbk, chr)
}
call pshchr (pushbk, npshbk, chr)
end
 
subroutine scnidr (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
 
# Scan an identifier or reserved word.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer ln, cn # Line and column number of the start.
integer tok(TOKSZ) # The output token.
 
character buf(LINESZ)
integer n
call scnwrd (line, linno, colno, pushbk, npshbk, ln, cn, buf, n)
call rwdlkp (strngs, istrng, buf, 1, n, ln, cn, tok)
end
 
subroutine scnint (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
 
# Scan a positive integer literal.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer ln, cn # Line and column number of the start.
integer tok(TOKSZ) # The output token.
 
logical isdgt
 
character buf(LINESZ)
integer n0, n
integer i, j, k
character*80 fmt
call scnwrd (line, linno, colno, pushbk, npshbk, ln, cn, buf, n0)
for (j = 1; j <= n0; j = j + 1)
if (!isdgt (ichar (buf(j))))
{
call wrtpos (ln, cn)
write (fmt, 1000) n0
1000 format ('(''Not a legal word: "''', I10, 'A, ''"'')')
write (*, fmt) (buf(k), k = 1, n0)
stop
}
 
call addstr (strngs, istrng, buf, 1, n0, i, n)
 
tok(ITOKNO) = TKINT
tok(IARGIX) = i
tok(IARGLN) = n
tok(ILINNO) = ln
tok(ICOLNO) = cn
end
 
subroutine utclit (ln, cn)
 
implicit none
 
integer ln, cn
 
call wrtpos (ln, cn)
write (*, '(''Unterminated character literal'')')
stop
end
 
subroutine scnch1 (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
 
# Scan a character literal, without yet checking that the literal
# ends correctly.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer tok(TOKSZ) # The output token.
 
integer trimlf
 
integer bufsz
parameter (bufsz = 40)
 
integer chr(CHRSZ)
integer chr1(CHRSZ)
integer chr2(CHRSZ)
integer ln, cn
character buf(bufsz)
integer i, j, n
 
# Refetch the opening quote.
call getchr (line, linno, colno, pushbk, npshbk, chr)
ln = chr(ILINNO)
cn = chr(ICOLNO)
 
tok(ITOKNO) = TKINT
tok(ILINNO) = ln
tok(ICOLNO) = cn
 
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == EOF)
call utclit (ln, cn)
if (chr1(ICHRCD) == BACKSL)
{
call getchr (line, linno, colno, pushbk, npshbk, chr2)
if (chr2(ICHRCD) == EOF)
call utclit (ln, cn)
else if (chr2(ICHRCD) == ichar ('n'))
{
tok(IARGIX) = LOC10 # "10" = code for Unix newline
tok(IARGLN) = 2
}
else if (chr2(ICHRCD) == BACKSL)
{
tok(IARGIX) = LOC92 # "92" = code for backslash
tok(IARGLN) = 2
}
else
{
call wrtpos (ln, cn)
write (*, '(''Unsupported escape: '', 1A)') _
char (chr2(ICHRCD))
stop
}
}
else
{
# Character codes are non-negative, so we can use intstr.
call intstr (buf, 1, bufsz, chr1(ICHRCD))
 
j = trimlf (buf, bufsz)
call addstr (strngs, istrng, buf, j, bufsz - (j - 1), i, n)
tok(IARGIX) = i
tok(IARGLN) = n
}
end
 
subroutine scnch (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
 
# Scan a character literal.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer tok(TOKSZ) # The output token.
 
integer ln, cn
integer chr(CHRSZ)
 
call getpos (line, linno, colno, pushbk, npshbk, ln, cn)
call scnch1 (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
call getchr (line, linno, colno, pushbk, npshbk, chr)
if (chr(ICHRCD) != ichar (''''))
{
while (.true.)
{
if (chr(ICHRCD) == EOF)
{
call utclit (ln, cn)
stop
}
else if (chr(ICHRCD) == ichar (''''))
{
call wrtpos (ln, cn)
write (*, '(''Unsupported multicharacter literal'')')
stop
}
call getchr (line, linno, colno, pushbk, npshbk, chr)
}
}
end
 
subroutine scnstr (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
 
# Scan a string literal.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer tok(TOKSZ) # The output token.
 
integer ln, cn
integer chr1(CHRSZ)
integer chr2(CHRSZ)
character buf(LINESZ + 10) # Enough space, with some room to spare.
integer n0
integer i, n
 
call getchr (line, linno, colno, pushbk, npshbk, chr1)
ln = chr1(ILINNO)
cn = chr1(ICOLNO)
 
tok(ITOKNO) = TKSTR
tok(ILINNO) = ln
tok(ICOLNO) = cn
 
n0 = 1
buf(n0) = '"'
 
call getchr (line, linno, colno, pushbk, npshbk, chr1)
while (chr1(ICHRCD) != ichar ('"'))
{
# Our input method always puts a NEWLIN before EOF, and so this
# test is redundant, unless someone changes the input method.
if (chr1(ICHRCD) == EOF || chr1(ICHRCD) == NEWLIN)
{
call wrtpos (ln, cn)
write (*, '(''Unterminated string literal'')')
stop
}
if (chr1(ICHRCD) == BACKSL)
{
call getchr (line, linno, colno, pushbk, npshbk, chr2)
if (chr2(ICHRCD) == ichar ('n'))
{
n0 = n0 + 1
buf(n0) = char (BACKSL)
n0 = n0 + 1
buf(n0) = 'n'
}
else if (chr2(ICHRCD) == BACKSL)
{
n0 = n0 + 1
buf(n0) = char (BACKSL)
n0 = n0 + 1
buf(n0) = char (BACKSL)
}
else
{
call wrtpos (chr1(ILINNO), chr1(ICOLNO))
write (*, '(''Unsupported escape sequence'')')
stop
}
}
else
{
n0 = n0 + 1
buf(n0) = char (chr1(ICHRCD))
}
call getchr (line, linno, colno, pushbk, npshbk, chr1)
}
n0 = n0 + 1
buf(n0) = '"'
 
call addstr (strngs, istrng, buf, 1, n0, i, n)
tok(IARGIX) = i
tok(IARGLN) = n
end
 
subroutine unxchr (chr)
 
implicit none
 
integer chr(CHRSZ)
 
call wrtpos (chr(ILINNO), chr(ICOLNO))
write (*, 1000) char (chr(ICHRCD))
1000 format ('Unexpected character ''', A1, '''')
stop
end
 
subroutine scntok (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
 
# Scan a token.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer tok(TOKSZ) # The output token.
 
logical isdgt
logical isid0
 
integer chr(CHRSZ)
integer chr1(CHRSZ)
integer ln, cn
 
call getchr (line, linno, colno, pushbk, npshbk, chr)
ln = chr(ILINNO)
cn = chr(ICOLNO)
tok(ILINNO) = ln
tok(ICOLNO) = cn
tok(IARGIX) = 0
tok(IARGLN) = 0
if (chr(ICHRCD) == ichar (','))
tok(ITOKNO) = TKCMMA
else if (chr(ICHRCD) == ichar (';'))
tok(ITOKNO) = TKSEMI
else if (chr(ICHRCD) == ichar ('('))
tok(ITOKNO) = TKLPAR
else if (chr(ICHRCD) == ichar (')'))
tok(ITOKNO) = TKRPAR
else if (chr(ICHRCD) == ichar ('{'))
tok(ITOKNO) = TKLBRC
else if (chr(ICHRCD) == ichar ('}'))
tok(ITOKNO) = TKRBRC
else if (chr(ICHRCD) == ichar ('*'))
tok(ITOKNO) = TKMUL
else if (chr(ICHRCD) == ichar ('/'))
tok(ITOKNO) = TKDIV
else if (chr(ICHRCD) == ichar ('%'))
tok(ITOKNO) = TKMOD
else if (chr(ICHRCD) == ichar ('+'))
tok(ITOKNO) = TKADD
else if (chr(ICHRCD) == ichar ('-'))
tok(ITOKNO) = TKSUB
else if (chr(ICHRCD) == ichar ('<'))
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == ichar ('='))
tok(ITOKNO) = TKLE
else
{
call pshchr (pushbk, npshbk, chr1)
tok(ITOKNO) = TKLT
}
}
else if (chr(ICHRCD) == ichar ('>'))
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == ichar ('='))
tok(ITOKNO) = TKGE
else
{
call pshchr (pushbk, npshbk, chr1)
tok(ITOKNO) = TKGT
}
}
else if (chr(ICHRCD) == ichar ('='))
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == ichar ('='))
tok(ITOKNO) = TKEQ
else
{
call pshchr (pushbk, npshbk, chr1)
tok(ITOKNO) = TKASGN
}
}
else if (chr(ICHRCD) == ichar ('!'))
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == ichar ('='))
tok(ITOKNO) = TKNE
else
{
call pshchr (pushbk, npshbk, chr1)
tok(ITOKNO) = TKNOT
}
}
else if (chr(ICHRCD) == ichar ('&'))
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == ichar ('&'))
tok(ITOKNO) = TKAND
else
call unxchr (chr)
}
else if (chr(ICHRCD) == ichar ('|'))
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == ichar ('|'))
tok(ITOKNO) = TKOR
else
call unxchr (chr)
}
else if (chr(ICHRCD) == ichar ('"'))
{
call pshchr (pushbk, npshbk, chr)
call scnstr (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
}
else if (chr(ICHRCD) == ichar (''''))
{
call pshchr (pushbk, npshbk, chr)
call scnch (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
}
else if (isdgt (chr(ICHRCD)))
{
call pshchr (pushbk, npshbk, chr)
call scnint (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
}
else if (isid0 (chr(ICHRCD)))
{
call pshchr (pushbk, npshbk, chr)
call scnidr (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
}
else
call unxchr (chr)
end
 
subroutine scntxt (strngs, istrng, _
line, linno, colno, pushbk, npshbk)
 
# Scan the text and print the token stream.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
 
integer chr(CHRSZ)
integer tok(TOKSZ)
 
chr(ICHRCD) = ichar ('x')
while (chr(ICHRCD) != EOF)
{
call skpspc (line, linno, colno, pushbk, npshbk)
call getchr (line, linno, colno, pushbk, npshbk, chr)
if (chr(ICHRCD) != EOF)
{
call pshchr (pushbk, npshbk, chr)
call scntok (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
call prttok (strngs, tok)
}
}
tok(ITOKNO) = TKEOI
tok(ILINNO) = chr(ILINNO)
tok(ICOLNO) = chr(ICOLNO)
tok(IARGIX) = 0
tok(IARGLN) = 0
call prttok (strngs, tok)
end
 
#---------------------------------------------------------------------
 
program lex
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
 
integer i, n
 
istrng = 1
 
# Locate "10" (newline) at 1 in the string pool.
line(1) = '1'
line(2) = '0'
call addstr (strngs, istrng, line, 1, 2, i, n)
if (i != 1 && n != 2)
{
write (*, '(''internal error'')')
stop
}
 
# Locate "92" (backslash) at 3 in the string pool.
line(1) = '9'
line(2) = '2'
call addstr (strngs, istrng, line, 1, 2, i, n)
if (i != 3 && n != 2)
{
write (*, '(''internal error'')')
stop
}
 
linno = 0
colno = LINESZ + 1 # This will trigger a READ.
npshbk = 0
 
call scntxt (strngs, istrng, line, linno, colno, pushbk, npshbk)
end
 
######################################################################</syntaxhighlight>
 
 
{{out}}
<pre>$ ratfor77 lex-in-ratfor.r > lex-in-ratfor.f && gfortran -O2 -std=legacy -fcheck=all lex-in-ratfor.f && ./a.out < compiler-tests/primes.t
4 1 Identifier count
4 7 Op_assign
4 9 Integer 1
4 10 Semicolon
5 1 Identifier n
5 3 Op_assign
5 5 Integer 1
5 6 Semicolon
6 1 Identifier limit
6 7 Op_assign
6 9 Integer 100
6 12 Semicolon
7 1 Keyword_while
7 7 LeftParen
7 8 Identifier n
7 10 Op_less
7 12 Identifier limit
7 17 RightParen
7 19 LeftBrace
8 5 Identifier k
8 6 Op_assign
8 7 Integer 3
8 8 Semicolon
9 5 Identifier p
9 6 Op_assign
9 7 Integer 1
9 8 Semicolon
10 5 Identifier n
10 6 Op_assign
10 7 Identifier n
10 8 Op_add
10 9 Integer 2
10 10 Semicolon
11 5 Keyword_while
11 11 LeftParen
11 12 LeftParen
11 13 Identifier k
11 14 Op_multiply
11 15 Identifier k
11 16 Op_lessequal
11 18 Identifier n
11 19 RightParen
11 21 Op_and
11 24 LeftParen
11 25 Identifier p
11 26 RightParen
11 27 RightParen
11 29 LeftBrace
12 9 Identifier p
12 10 Op_assign
12 11 Identifier n
12 12 Op_divide
12 13 Identifier k
12 14 Op_multiply
12 15 Identifier k
12 16 Op_notequal
12 18 Identifier n
12 19 Semicolon
13 9 Identifier k
13 10 Op_assign
13 11 Identifier k
13 12 Op_add
13 13 Integer 2
13 14 Semicolon
14 5 RightBrace
15 5 Keyword_if
15 8 LeftParen
15 9 Identifier p
15 10 RightParen
15 12 LeftBrace
16 9 Keyword_print
16 14 LeftParen
16 15 Identifier n
16 16 Comma
16 18 String " is prime\n"
16 31 RightParen
16 32 Semicolon
17 9 Identifier count
17 15 Op_assign
17 17 Identifier count
17 23 Op_add
17 25 Integer 1
17 26 Semicolon
18 5 RightBrace
19 1 RightBrace
20 1 Keyword_print
20 6 LeftParen
20 7 String "Total primes found: "
20 29 Comma
20 31 Identifier count
20 36 Comma
20 38 String "\n"
20 42 RightParen
20 43 Semicolon
21 1 End_of_input</pre>
 
=={{header|Scala}}==
The complete implementation for the compiler tasks can be found in a GitHub repository at [https://github.com/edadma/rosettacodeCompiler github.com/edadma/rosettacodeCompiler] which includes full unit testing for the samples given in [[Compiler/Sample programs]].
 
The following code implements a lexical analyzer that is configurable through the symbol map and keyword map supplied to it as parameters; a short usage sketch follows the listing.
 
<syntaxhighlight lang="scala">
package xyz.hyperreal.rosettacodeCompiler
 
import scala.io.Source
import scala.util.matching.Regex
 
object LexicalAnalyzer {
private val EOT = '\u0004'
 
val symbols =
Map(
"*" -> "Op_multiply",
"/" -> "Op_divide",
"%" -> "Op_mod",
"+" -> "Op_add",
"-" -> "Op_minus",
"<" -> "Op_less",
"<=" -> "Op_lessequal",
">" -> "Op_greater",
">=" -> "Op_greaterequal",
"==" -> "Op_equal",
"!=" -> "Op_notequal",
"!" -> "Op_not",
"=" -> "Op_assign",
"&&" -> "Op_and",
"¦¦" -> "Op_or",
"(" -> "LeftParen",
")" -> "RightParen",
"{" -> "LeftBrace",
"}" -> "RightBrace",
";" -> "Semicolon",
"," -> "Comma"
)
 
val keywords =
Map(
"if" -> "Keyword_if",
"else" -> "Keyword_else",
"while" -> "Keyword_while",
"print" -> "Keyword_print",
"putc" -> "Keyword_putc"
)
val alpha = ('a' to 'z' toSet) ++ ('A' to 'Z')
val numeric = '0' to '9' toSet
val alphanumeric = alpha ++ numeric
val identifiers = StartRestToken("Identifier", alpha + '_', alphanumeric + '_')
val integers = SimpleToken("Integer", numeric, alpha, "alpha characters may not follow right after a number")
 
val characters =
DelimitedToken("Integer",
'\'',
"[^'\\n]|\\\\n|\\\\\\\\" r,
"invalid character literal",
"unclosed character literal")
 
val strings =
DelimitedToken("String", '"', "[^\"\\n]*" r, "invalid string literal", "unclosed string literal")
 
def apply =
new LexicalAnalyzer(4, symbols, keywords, "End_of_input", identifiers, integers, characters, strings)
 
abstract class Token
case class StartRestToken(name: String, start: Set[Char], rest: Set[Char]) extends Token
case class SimpleToken(name: String, chars: Set[Char], exclude: Set[Char], excludeError: String) extends Token
case class DelimitedToken(name: String, delimiter: Char, pattern: Regex, patternError: String, unclosedError: String)
extends Token
}
 
class LexicalAnalyzer(tabs: Int,
symbols: Map[String, String],
keywords: Map[String, String],
endOfInput: String,
identifier: LexicalAnalyzer.Token,
tokens: LexicalAnalyzer.Token*) {
 
import LexicalAnalyzer._
 
private val symbolStartChars = symbols.keys map (_.head) toSet
private val symbolChars = symbols.keys flatMap (_.toList) toSet
private var curline: Int = _
private var curcol: Int = _
 
def fromStdin = fromSource(Source.stdin)
 
def fromString(src: String) = fromSource(Source.fromString(src))
 
def fromSource(ast: Source) = {
curline = 1
curcol = 1
 
var s = (ast ++ Iterator(EOT)) map (new Chr(_)) toStream
 
tokenize
 
def token(name: String, first: Chr) = println(f"${first.line}%5d ${first.col}%6d $name")
 
def value(name: String, v: String, first: Chr) = println(f"${first.line}%5d ${first.col}%6d $name%-14s $v")
 
def until(c: Char) = {
val buf = new StringBuilder
 
def until: String =
if (s.head.c == EOT || s.head.c == c)
buf.toString
else {
buf += getch
until
}
 
until
}
 
def next = s = s.tail
 
def getch = {
val c = s.head.c
 
next
c
}
 
def consume(first: Char, cs: Set[Char]) = {
val buf = new StringBuilder
 
def consume: String =
if (s.head.c == EOT || !cs(s.head.c))
buf.toString
else {
buf += getch
consume
}
 
buf += first
consume
}
 
def comment(start: Chr): Unit = {
until('*')
 
if (s.head.c == EOT || s.tail.head.c == EOT)
sys.error(s"unclosed comment ${start.at}")
else if (s.tail.head.c != '/') {
next
comment(start)
} else {
next
next
}
}
 
def recognize(t: Token): Option[(String, String)] = {
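      // Try to scan one token class starting at the current position; when it does
      // not match, the input stream is restored so the next candidate can be tried.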
val first = s
 
next
 
t match {
case StartRestToken(name, start, rest) =>
if (start(first.head.c))
Some((name, consume(first.head.c, rest)))
else {
s = first
None
}
case SimpleToken(name, chars, exclude, excludeError) =>
if (chars(first.head.c)) {
val m = consume(first.head.c, chars)
 
if (exclude(s.head.c))
sys.error(s"$excludeError ${s.head.at}")
else
Some((name, m))
} else {
s = first
None
}
case DelimitedToken(name, delimiter, pattern, patternError, unclosedError) =>
if (first.head.c == delimiter) {
val m = until(delimiter)
 
if (s.head.c != delimiter)
sys.error(s"$unclosedError ${first.head.at}")
else if (pattern.pattern.matcher(m).matches) {
next
Some((name, s"$delimiter$m$delimiter"))
} else
sys.error(s"$patternError ${s.head.at}")
} else {
s = first
None
}
}
}
 
def tokenize: Unit =
if (s.head.c == EOT)
token(endOfInput, s.head)
else {
if (s.head.c.isWhitespace)
next
else if (s.head.c == '/' && s.tail.head.c == '*')
comment(s.head)
else if (symbolStartChars(s.head.c)) {
val first = s.head
val buf = new StringBuilder
 
while (!symbols.contains(buf.toString) && s.head.c != EOT && symbolChars(s.head.c)) buf += getch
 
while (symbols.contains(buf.toString :+ s.head.c) && s.head.c != EOT && symbolChars(s.head.c)) buf += getch
 
symbols get buf.toString match {
case Some(name) => token(name, first)
case None => sys.error(s"unrecognized symbol: '${buf.toString}' ${first.at}")
}
} else {
val first = s.head
 
recognize(identifier) match {
case None =>
find(0)
 
@scala.annotation.tailrec
def find(t: Int): Unit =
if (t == tokens.length)
sys.error(s"unrecognized character ${first.at}")
else
recognize(tokens(t)) match {
case None => find(t + 1)
case Some((name, v)) => value(name, v, first)
}
case Some((name, ident)) =>
keywords get ident match {
case None => value(name, ident, first)
case Some(keyword) => token(keyword, first)
}
}
}
 
tokenize
}
}
 
private class Chr(val c: Char) {
val line = curline
val col = curcol
 
if (c == '\n') {
curline += 1
curcol = 1
} else if (c == '\r')
curcol = 1
else if (c == '\t')
curcol += tabs - (curcol - 1) % tabs
else
curcol += 1
 
def at = s"[${line}, ${col}]"
 
override def toString: String = s"<$c, $line, $col>"
}
 
}
</syntaxhighlight>
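
For illustration, a minimal driver along the following lines (a sketch only: the <tt>LexDemo</tt> object and the inline program are invented here and are not part of the linked repository) feeds an in-memory program to the analyzer and prints its token stream to standard output:

<syntaxhighlight lang="scala">package xyz.hyperreal.rosettacodeCompiler

// Hypothetical demo driver; assumes the LexicalAnalyzer above is on the classpath.
object LexDemo extends App {
  // A tiny program held as a string; triple quotes keep the \n escape intact
  // so the analyzer sees it as source text.
  val program =
    """count = 42;
      |print(count, "\n");
      |""".stripMargin

  // Prints one token per line: line number, column number, token name, and
  // (for Identifier, Integer and String tokens) the token's value.
  LexicalAnalyzer.apply.fromString(program)

  // To lex a program supplied on standard input instead:
  // LexicalAnalyzer.apply.fromStdin
}</syntaxhighlight>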
 
=={{header|Scheme}}==
 
<syntaxhighlight lang="scheme">
(import (scheme base)
(scheme char)
Line 6,873 ⟶ 17,798:
(display-tokens (lexer (cadr (command-line))))
(display "Error: provide program filename\n"))
</syntaxhighlight>
 
{{out}}
Line 6,885 ⟶ 17,810:
5 1 End_of_input
</pre>
 
=={{header|Standard ML}}==
{{trans|ATS}}
{{trans|OCaml}}
 
 
<syntaxhighlight lang="sml">(*------------------------------------------------------------------*)
(* The Rosetta Code lexical analyzer, in Standard ML. Based on the ATS
and the OCaml. The intended compiler is Mlton or Poly/ML; there is
a tiny difference near the end of the file, depending on which
compiler is used. *)
 
(*------------------------------------------------------------------*)
(* The following functions are compatible with ASCII. *)
 
fun
is_digit ichar =
48 <= ichar andalso ichar <= 57
 
fun
is_lower ichar =
97 <= ichar andalso ichar <= 122
 
fun
is_upper ichar =
65 <= ichar andalso ichar <= 90
 
fun
is_alpha ichar =
is_lower ichar orelse is_upper ichar
 
fun
is_alnum ichar =
is_digit ichar orelse is_alpha ichar
 
fun
is_ident_start ichar =
is_alpha ichar orelse ichar = 95
 
fun
is_ident_continuation ichar =
is_alnum ichar orelse ichar = 95
 
fun
is_space ichar =
ichar = 32 orelse (9 <= ichar andalso ichar <= 13)
 
(*------------------------------------------------------------------*)
(* Character input more like that of C. There are various advantages
and disadvantages to this method, but key points in its favor are:
(a) it is how character input is done in the original ATS code, (b)
Unicode code points are 21-bit positive integers. *)
 
val eof = ~1
 
fun
input_ichar inpf =
case TextIO.input1 inpf of
NONE => eof
| SOME c => Char.ord c
 
(*------------------------------------------------------------------*)
 
(* The type of an input character. *)
 
structure Ch =
struct
 
type t = {
ichar : int,
line_no : int,
column_no : int
}
 
end
 
(*------------------------------------------------------------------*)
(* Inputting with unlimited pushback, and with counting of lines and
columns. *)
 
structure Inp =
struct
 
type t = {
inpf : TextIO.instream,
pushback : Ch.t list,
line_no : int,
column_no : int
}
 
fun
of_instream inpf =
{
inpf = inpf,
pushback = [],
line_no = 1,
column_no = 1
} : t
 
fun
get_ch ({ inpf = inpf,
pushback = pushback,
line_no = line_no,
column_no = column_no } : t) =
case pushback of
ch :: tail =>
let
val inp = { inpf = inpf,
pushback = tail,
line_no = line_no,
column_no = column_no }
in
(ch, inp)
end
| [] =>
let
val ichar = input_ichar inpf
val ch = { ichar = ichar,
line_no = line_no,
column_no = column_no }
in
if ichar = Char.ord #"\n" then
let
val inp = { inpf = inpf,
pushback = [],
line_no = line_no + 1,
column_no = 1 }
in
(ch, inp)
end
else
let
val inp = { inpf = inpf,
pushback = [],
line_no = line_no,
column_no = column_no + 1 }
in
(ch, inp)
end
end
 
fun
push_back_ch (ch, inp : t) =
{
inpf = #inpf inp,
pushback = ch :: #pushback inp,
line_no = #line_no inp,
column_no = #column_no inp
}
 
end
 
(*------------------------------------------------------------------*)
(* Tokens, appearing in tuples with arguments, and with line and
column numbers. The tokens are integers, so they can be used as
array indices. *)
 
val token_ELSE = 0
val token_IF = 1
val token_PRINT = 2
val token_PUTC = 3
val token_WHILE = 4
val token_MULTIPLY = 5
val token_DIVIDE = 6
val token_MOD = 7
val token_ADD = 8
val token_SUBTRACT = 9
val token_NEGATE = 10
val token_LESS = 11
val token_LESSEQUAL = 12
val token_GREATER = 13
val token_GREATEREQUAL = 14
val token_EQUAL = 15
val token_NOTEQUAL = 16
val token_NOT = 17
val token_ASSIGN = 18
val token_AND = 19
val token_OR = 20
val token_LEFTPAREN = 21
val token_RIGHTPAREN = 22
val token_LEFTBRACE = 23
val token_RIGHTBRACE = 24
val token_SEMICOLON = 25
val token_COMMA = 26
val token_IDENTIFIER = 27
val token_INTEGER = 28
val token_STRING = 29
val token_END_OF_INPUT = 30
 
(* A *very* simple perfect hash for the reserved words. (Yes, this is
overkill, except for demonstration of the principle.) *)
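(* As a quick check of the hash, using the ASCII codes of the first two
   characters of each reserved word:
     "if"    -> (105 + 102) mod 9 = 0
     "print" -> (112 + 114) mod 9 = 1
     "else"  -> (101 + 108) mod 9 = 2
     "putc"  -> (112 + 117) mod 9 = 4
     "while" -> (119 + 104) mod 9 = 7
   so each reserved word lands in its own slot of the tables below, and
   the unused slots fall through to token_IDENTIFIER. *)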
 
val reserved_words =
Vector.fromList ["if", "print", "else",
"", "putc", "",
"", "while", ""]
val reserved_word_tokens =
Vector.fromList [token_IF, token_PRINT, token_ELSE,
token_IDENTIFIER, token_PUTC, token_IDENTIFIER,
token_IDENTIFIER, token_WHILE, token_IDENTIFIER]
 
fun
reserved_word_lookup (s, line_no, column_no) =
if (String.size s) < 2 then
(token_IDENTIFIER, s, line_no, column_no)
else
let
val hashval =
(Char.ord (String.sub (s, 0)) +
Char.ord (String.sub (s, 1)))
mod 9
val token = Vector.sub (reserved_word_tokens, hashval)
in
if token = token_IDENTIFIER orelse
s <> Vector.sub (reserved_words, hashval) then
(token_IDENTIFIER, s, line_no, column_no)
else
(token, s, line_no, column_no)
end
 
(* Token to string lookup. *)
 
val token_names =
Vector.fromList
["Keyword_else",
"Keyword_if",
"Keyword_print",
"Keyword_putc",
"Keyword_while",
"Op_multiply",
"Op_divide",
"Op_mod",
"Op_add",
"Op_subtract",
"Op_negate",
"Op_less",
"Op_lessequal",
"Op_greater",
"Op_greaterequal",
"Op_equal",
"Op_notequal",
"Op_not",
"Op_assign",
"Op_and",
"Op_or",
"LeftParen",
"RightParen",
"LeftBrace",
"RightBrace",
"Semicolon",
"Comma",
"Identifier",
"Integer",
"String",
"End_of_input"]
 
fun
token_name token =
Vector.sub (token_names, token)
 
(*------------------------------------------------------------------*)
 
exception Unterminated_comment of int * int
exception Unterminated_character_literal of int * int
exception Multicharacter_literal of int * int
exception End_of_input_in_string_literal of int * int
exception End_of_line_in_string_literal of int * int
exception Unsupported_escape of int * int * char
exception Invalid_integer_literal of int * int * string
exception Unexpected_character of int * int * char
 
(*------------------------------------------------------------------*)
(* Skipping past spaces and comments. (In the Rosetta Code tiny
language, a comment, if you think about it, is a kind of space.) *)
 
fun
scan_comment (inp, line_no, column_no) =
let
fun
loop inp =
let
val (ch, inp) = Inp.get_ch inp
in
if #ichar ch = eof then
raise Unterminated_comment (line_no, column_no)
else if #ichar ch = Char.ord #"*" then
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = eof then
raise Unterminated_comment (line_no, column_no)
else if #ichar ch1 = Char.ord #"/" then
inp
else
loop inp
end
else
loop inp
end
in
loop inp
end
 
fun
skip_spaces_and_comments inp =
let
fun
loop inp =
let
val (ch, inp) = Inp.get_ch inp
in
if is_space (#ichar ch) then
loop inp
else if #ichar ch = Char.ord #"/" then
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"*" then
loop (scan_comment (inp, #line_no ch, #column_no ch))
else
let
val inp = Inp.push_back_ch (ch1, inp)
val inp = Inp.push_back_ch (ch, inp)
in
inp
end
end
else
Inp.push_back_ch (ch, inp)
end
in
loop inp
end
 
(*------------------------------------------------------------------*)
(* Integer literals, identifiers, and reserved words. *)
 
fun
scan_word (lst, inp) =
let
val (ch, inp) = Inp.get_ch inp
in
if is_ident_continuation (#ichar ch) then
scan_word (Char.chr (#ichar ch) :: lst, inp)
else
(lst, Inp.push_back_ch (ch, inp))
end
 
fun
scan_integer_literal inp =
let
val (ch, inp) = Inp.get_ch inp
val (lst, inp) = scan_word ([Char.chr (#ichar ch)], inp)
val s = String.implode (List.rev lst)
in
if List.all (fn c => is_digit (Char.ord c)) lst then
((token_INTEGER, s, #line_no ch, #column_no ch), inp)
else
raise Invalid_integer_literal (#line_no ch, #column_no ch, s)
end
 
fun
scan_identifier_or_reserved_word inp =
let
val (ch, inp) = Inp.get_ch inp
val (lst, inp) = scan_word ([Char.chr (#ichar ch)], inp)
val s = String.implode (List.rev lst)
val toktup = reserved_word_lookup (s, #line_no ch, #column_no ch)
in
(toktup, inp)
end
 
(*------------------------------------------------------------------*)
(* String literals. *)
 
fun
scan_string_literal inp =
let
val (ch, inp) = Inp.get_ch inp
 
fun
scan (lst, inp) =
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = eof then
raise End_of_input_in_string_literal
(#line_no ch, #column_no ch)
else if #ichar ch1 = Char.ord #"\n" then
raise End_of_line_in_string_literal
(#line_no ch, #column_no ch)
else if #ichar ch1 = Char.ord #"\"" then
(lst, inp)
else if #ichar ch1 <> Char.ord #"\\" then
scan (Char.chr (#ichar ch1) :: lst, inp)
else
let
val (ch2, inp) = Inp.get_ch inp
in
if #ichar ch2 = Char.ord #"n" then
scan (#"n" :: #"\\" :: lst, inp)
else if #ichar ch2 = Char.ord #"\\" then
scan (#"\\" :: #"\\" :: lst, inp)
else if #ichar ch2 = eof then
raise End_of_input_in_string_literal
(#line_no ch, #column_no ch)
else if #ichar ch2 = Char.ord #"\n" then
raise End_of_line_in_string_literal
(#line_no ch, #column_no ch)
else
raise Unsupported_escape (#line_no ch1, #column_no ch1,
Char.chr (#ichar ch2))
end
end
 
val lst = #"\"" :: []
val (lst, inp) = scan (lst, inp)
val lst = #"\"" :: lst
val s = String.implode (List.rev lst)
in
((token_STRING, s, #line_no ch, #column_no ch), inp)
end
 
(*------------------------------------------------------------------*)
(* Character literals. *)
 
fun
scan_character_literal_without_checking_end inp =
let
val (ch, inp) = Inp.get_ch inp
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = eof then
raise Unterminated_character_literal
(#line_no ch, #column_no ch)
else if #ichar ch1 = Char.ord #"\\" then
let
val (ch2, inp) = Inp.get_ch inp
in
if #ichar ch2 = eof then
raise Unterminated_character_literal
(#line_no ch, #column_no ch)
else if #ichar ch2 = Char.ord #"n" then
let
val s = Int.toString (Char.ord #"\n")
in
((token_INTEGER, s, #line_no ch, #column_no ch), inp)
end
else if #ichar ch2 = Char.ord #"\\" then
let
val s = Int.toString (Char.ord #"\\")
in
((token_INTEGER, s, #line_no ch, #column_no ch), inp)
end
else
raise Unsupported_escape (#line_no ch1, #column_no ch1,
Char.chr (#ichar ch2))
end
else
let
val s = Int.toString (#ichar ch1)
in
((token_INTEGER, s, #line_no ch, #column_no ch), inp)
end
end
 
fun
scan_character_literal inp =
let
val (toktup, inp) =
scan_character_literal_without_checking_end inp
val (_, _, line_no, column_no) = toktup
 
fun
check_end inp =
let
val (ch, inp) = Inp.get_ch inp
in
if #ichar ch = Char.ord #"'" then
inp
else
let
fun
loop_to_end (ch1 : Ch.t, inp) =
if #ichar ch1 = eof then
raise Unterminated_character_literal (line_no, column_no)
else if #ichar ch1 = Char.ord #"'" then
raise Multicharacter_literal (line_no, column_no)
else
let
val (ch1, inp) = Inp.get_ch inp
in
loop_to_end (ch1, inp)
end
in
loop_to_end (ch, inp)
end
end
 
val inp = check_end inp
in
(toktup, inp)
end
 
(*------------------------------------------------------------------*)
 
fun
get_next_token inp =
let
val inp = skip_spaces_and_comments inp
val (ch, inp) = Inp.get_ch inp
val ln = #line_no ch
val cn = #column_no ch
in
if #ichar ch = eof then
((token_END_OF_INPUT, "", ln, cn), inp)
else
case Char.chr (#ichar ch) of
#"," => ((token_COMMA, ",", ln, cn), inp)
| #";" => ((token_SEMICOLON, ";", ln, cn), inp)
| #"(" => ((token_LEFTPAREN, "(", ln, cn), inp)
| #")" => ((token_RIGHTPAREN, ")", ln, cn), inp)
| #"{" => ((token_LEFTBRACE, "{", ln, cn), inp)
| #"}" => ((token_RIGHTBRACE, "}", ln, cn), inp)
| #"*" => ((token_MULTIPLY, "*", ln, cn), inp)
| #"/" => ((token_DIVIDE, "/", ln, cn), inp)
| #"%" => ((token_MOD, "%", ln, cn), inp)
| #"+" => ((token_ADD, "+", ln, cn), inp)
| #"-" => ((token_SUBTRACT, "-", ln, cn), inp)
| #"<" =>
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"=" then
((token_LESSEQUAL, "<=", ln, cn), inp)
else
let
val inp = Inp.push_back_ch (ch1, inp)
in
((token_LESS, "<", ln, cn), inp)
end
end
| #">" =>
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"=" then
((token_GREATEREQUAL, ">=", ln, cn), inp)
else
let
val inp = Inp.push_back_ch (ch1, inp)
in
((token_GREATER, ">", ln, cn), inp)
end
end
| #"=" =>
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"=" then
((token_EQUAL, "==", ln, cn), inp)
else
let
val inp = Inp.push_back_ch (ch1, inp)
in
((token_ASSIGN, "=", ln, cn), inp)
end
end
| #"!" =>
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"=" then
((token_NOTEQUAL, "!=", ln, cn), inp)
else
let
val inp = Inp.push_back_ch (ch1, inp)
in
((token_NOT, "!", ln, cn), inp)
end
end
| #"&" =>
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"&" then
((token_AND, "&&", ln, cn), inp)
else
raise Unexpected_character (#line_no ch, #column_no ch,
Char.chr (#ichar ch))
end
| #"|" =>
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"|" then
((token_OR, "||", ln, cn), inp)
else
raise Unexpected_character (#line_no ch, #column_no ch,
Char.chr (#ichar ch))
end
| #"\"" =>
let
val inp = Inp.push_back_ch (ch, inp)
in
scan_string_literal inp
end
| #"'" =>
let
val inp = Inp.push_back_ch (ch, inp)
in
scan_character_literal inp
end
| _ =>
if is_digit (#ichar ch) then
let
val inp = Inp.push_back_ch (ch, inp)
in
scan_integer_literal inp
end
else if is_ident_start (#ichar ch) then
let
val inp = Inp.push_back_ch (ch, inp)
in
scan_identifier_or_reserved_word inp
end
else
raise Unexpected_character (#line_no ch, #column_no ch,
Char.chr (#ichar ch))
end
 
fun
output_integer_rightjust (outf, num) =
(if num < 10 then
TextIO.output (outf, " ")
else if num < 100 then
TextIO.output (outf, " ")
else if num < 1000 then
TextIO.output (outf, " ")
else if num < 10000 then
TextIO.output (outf, " ")
else
();
TextIO.output (outf, Int.toString num))
 
fun
print_token (outf, toktup) =
let
val (token, arg, line_no, column_no) = toktup
val name = token_name token
val (padding, str) =
if token = token_IDENTIFIER then
(" ", arg)
else if token = token_INTEGER then
(" ", arg)
else if token = token_STRING then
(" ", arg)
else("", "")
in
output_integer_rightjust (outf, line_no);
TextIO.output (outf, " ");
output_integer_rightjust (outf, column_no);
TextIO.output (outf, " ");
TextIO.output (outf, name);
TextIO.output (outf, padding);
TextIO.output (outf, str);
TextIO.output (outf, "\n")
end
 
fun
scan_text (outf, inp) =
let
fun
loop inp =
let
val (toktup, inp) = get_next_token inp
in
(print_token (outf, toktup);
let
val (token, _, _, _) = toktup
in
if token <> token_END_OF_INPUT then
loop inp
else
()
end)
end
in
loop inp
end
 
(*------------------------------------------------------------------*)
 
fun
main () =
let
val args = CommandLine.arguments ()
val (inpf_filename, outf_filename) =
case args of
[] => ("-", "-")
| name :: [] => (name, "-")
| name1 :: name2 :: _ => (name1, name2)
val inpf =
if inpf_filename = "-" then
TextIO.stdIn
else
TextIO.openIn inpf_filename
handle
(IO.Io _) =>
(TextIO.output (TextIO.stdErr, "Failure opening \"");
TextIO.output (TextIO.stdErr, inpf_filename);
TextIO.output (TextIO.stdErr, "\" for input\n");
OS.Process.exit OS.Process.failure)
val outf =
if outf_filename = "-" then
TextIO.stdOut
else
TextIO.openOut outf_filename
handle
(IO.Io _) =>
(TextIO.output (TextIO.stdErr, "Failure opening \"");
TextIO.output (TextIO.stdErr, outf_filename);
TextIO.output (TextIO.stdErr, "\" for output\n");
OS.Process.exit OS.Process.failure)
val inp = Inp.of_instream inpf
in
scan_text (outf, inp)
end
handle Unterminated_comment (line_no, column_no) =>
(TextIO.output (TextIO.stdErr, ": unterminated comment ");
TextIO.output (TextIO.stdErr, "starting at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| Unterminated_character_literal (line_no, column_no) =>
(TextIO.output (TextIO.stdErr, ": unterminated character ");
TextIO.output (TextIO.stdErr, "literal starting at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| Multicharacter_literal (line_no, column_no) =>
(TextIO.output (TextIO.stdErr, ": unsupported multicharacter");
TextIO.output (TextIO.stdErr, " literal starting at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| End_of_input_in_string_literal (line_no, column_no) =>
(TextIO.output (TextIO.stdErr, ": end of input in string");
TextIO.output (TextIO.stdErr, " literal starting at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| End_of_line_in_string_literal (line_no, column_no) =>
(TextIO.output (TextIO.stdErr, ": end of line in string");
TextIO.output (TextIO.stdErr, " literal starting at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| Unsupported_escape (line_no, column_no, c) =>
(TextIO.output (TextIO.stdErr, CommandLine.name ());
TextIO.output (TextIO.stdErr, ": unsupported escape \\");
TextIO.output (TextIO.stdErr, Char.toString c);
TextIO.output (TextIO.stdErr, " at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| Invalid_integer_literal (line_no, column_no, str) =>
(TextIO.output (TextIO.stdErr, CommandLine.name ());
TextIO.output (TextIO.stdErr, ": invalid integer literal ");
TextIO.output (TextIO.stdErr, str);
TextIO.output (TextIO.stdErr, " at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| Unexpected_character (line_no, column_no, c) =>
(TextIO.output (TextIO.stdErr, CommandLine.name ());
TextIO.output (TextIO.stdErr, ": unexpected character '");
TextIO.output (TextIO.stdErr, Char.toString c);
TextIO.output (TextIO.stdErr, "' at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure);
 
(*------------------------------------------------------------------*)
(* For the Mlton compiler, include the following. For Poly/ML, comment
it out. *)
main ();
 
(*------------------------------------------------------------------*)
(* Instructions for GNU Emacs. *)
 
(* local variables: *)
(* mode: sml *)
(* sml-indent-level: 2 *)
(* sml-indent-args: 2 *)
(* end: *)
(*------------------------------------------------------------------*)</syntaxhighlight>
 
 
{{out}}
For Mlton, compile with
<pre>mlton -output lex lex.sml</pre>
 
For Poly/ML, compile with
<pre>polyc -o lex lex.sml</pre>
 
Mlton is an optimizing whole-program compiler; it may take considerably longer to compile the source, but it produces much faster executables.
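Either executable reads the tiny-language source from the file named by its first argument (or from standard input when no argument is given) and writes the token listing to the file named by its second argument (or to standard output), with <tt>-</tt> standing for the corresponding standard stream. For example, assuming the third test case has been saved under the (arbitrary) name <tt>testcase3.t</tt>:
<pre>./lex testcase3.t
./lex testcase3.t lex3.out</pre>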
 
Output for testcase3:
<pre> 5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|Wren}}==
{{trans|Go}}
{{libheader|Wren-dynamic}}
{{libheader|Wren-str}}
{{libheader|Wren-fmt}}
{{libheader|Wren-ioutil}}
<syntaxhighlight lang="wren">import "./dynamic" for Enum, Struct, Tuple
import "./str" for Char
import "./fmt" for Fmt
import "./ioutil" for FileUtil
import "os" for Process
 
var tokens = [
"EOI",
"Mul",
"Div",
"Mod",
"Add",
"Sub",
"Negate",
"Not",
"Lss",
"Leq",
"Gtr",
"Geq",
"Eq",
"Neq",
"Assign",
"And",
"Or",
"If",
"Else",
"While",
"Print",
"Putc",
"Lparen",
"Rparen",
"Lbrace",
"Rbrace",
"Semi",
"Comma",
"Ident",
"Integer",
"String"
]
 
var Token = Enum.create("Token", tokens)
 
var TokData = Struct.create("TokData", ["eline", "ecol", "tok", "v"])
 
var Symbol = Tuple.create("Symbol", ["name", "tok"])
 
// symbol table
var symtab = []
 
var curLine = ""
var curCh = ""
var lineNum = 0
var colNum = 0
var etx = 4 // used to signify EOI
 
var lines = []
var lineCount = 0
 
var errorMsg = Fn.new { |eline, ecol, msg| Fiber.abort("(%(eline):%(ecol)) %(msg)") }
 
// add an identifier to the symbol table
var install = Fn.new { |name, tok|
var sym = Symbol.new(name, tok)
symtab.add(sym)
}
 
// search for an identifier in the symbol table
var lookup = Fn.new { |name|
for (i in 0...symtab.count) {
if (symtab[i].name == name) return i
}
return -1
}
 
// read the next line of input from the source file
var nextLine // recursive function
nextLine = Fn.new {
if (lineNum == lineCount) {
curCh = etx
curLine = ""
colNum = 1
return
}
curLine = lines[lineNum]
lineNum = lineNum + 1
colNum = 0
if (curLine == "") nextLine.call() // skip blank lines
}
 
// get the next char
var nextChar = Fn.new {
if (colNum >= curLine.count) nextLine.call()
if (colNum < curLine.count) {
curCh = curLine[colNum]
colNum = colNum + 1
}
}
 
var follow = Fn.new { |eline, ecol, expect, ifyes, ifno|
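    // if the next character matches `expect`, consume it and return `ifyes`
    // (e.g. Leq for "<="); otherwise return `ifno`, where an `ifno` of
    // Token.EOI means the single character is not a valid token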
if (curCh == expect) {
nextChar.call()
return ifyes
}
if (ifno == Token.EOI) {
errorMsg.call(eline, ecol, "follow unrecognized character: " + curCh)
}
return ifno
}
 
var getTok // recursive function
getTok = Fn.new {
// skip whitespace
while (curCh == " " || curCh == "\t" || curCh == "\n") nextChar.call()
var td = TokData.new(lineNum, colNum, 0, "")
if (curCh == etx) {
td.tok = Token.EOI
return td
}
if (curCh == "{") {
td.tok = Token.Lbrace
nextChar.call()
return td
}
if (curCh == "}") {
td.tok = Token.Rbrace
nextChar.call()
return td
}
if (curCh == "(") {
td.tok = Token.Lparen
nextChar.call()
return td
}
if (curCh == ")") {
td.tok = Token.Rparen
nextChar.call()
return td
}
if (curCh == "+") {
td.tok = Token.Add
nextChar.call()
return td
}
if (curCh == "-") {
td.tok = Token.Sub
nextChar.call()
return td
}
if (curCh == "*") {
td.tok = Token.Mul
nextChar.call()
return td
}
if (curCh == "\%") {
td.tok = Token.Mod
nextChar.call()
return td
}
if (curCh == ";") {
td.tok = Token.Semi
nextChar.call()
return td
}
if (curCh == ",") {
td.tok = Token.Comma
nextChar.call()
return td
}
if (curCh == "'") { // single char literals
nextChar.call()
td.v = curCh.bytes[0].toString
if (curCh == "'") {
errorMsg.call(td.eline, td.ecol, "Empty character constant")
}
if (curCh == "\\") {
nextChar.call()
if (curCh == "n") {
td.v = "10"
} else if (curCh == "\\") {
td.v = "92"
} else {
errorMsg.call(td.eline, td.ecol, "unknown escape sequence: "+ curCh)
}
}
nextChar.call()
if (curCh != "'") {
errorMsg.call(td.eline, td.ecol, "multi-character constant")
}
nextChar.call()
td.tok = Token.Integer
return td
}
if (curCh == "<") {
nextChar.call()
td.tok = follow.call(td.eline, td.ecol, "=", Token.Leq, Token.Lss)
return td
}
if (curCh == ">") {
nextChar.call()
td.tok = follow.call(td.eline, td.ecol, "=", Token.Geq, Token.Gtr)
return td
}
if (curCh == "!") {
nextChar.call()
td.tok = follow.call(td.eline, td.ecol, "=", Token.Neq, Token.Not)
return td
}
if (curCh == "=") {
nextChar.call()
td.tok = follow.call(td.eline, td.ecol, "=", Token.Eq, Token.Assign)
return td
}
if (curCh == "&") {
nextChar.call()
td.tok = follow.call(td.eline, td.ecol, "&", Token.And, Token.EOI)
return td
}
if (curCh == "|") {
nextChar.call()
td.tok = follow.call(td.eline, td.ecol, "|", Token.Or, Token.EOI)
return td
}
if (curCh == "\"") { // string
td.v = curCh
nextChar.call()
while (curCh != "\"") {
if (curCh == "\n") {
errorMsg.call(td.eline, td.ecol, "EOL in string")
}
if (curCh == etx) {
errorMsg.call(td.eline, td.ecol, "EOF in string")
}
td.v = td.v + curCh
nextChar.call()
}
td.v = td.v + curCh
nextChar.call()
td.tok = Token.String
return td
}
if (curCh == "/") { // div or comment
nextChar.call()
if (curCh != "*") {
td.tok = Token.Div
return td
}
// skip comments
nextChar.call()
while (true) {
if (curCh == "*") {
nextChar.call()
if (curCh == "/") {
nextChar.call()
return getTok.call()
}
} else if (curCh == etx) {
errorMsg.call(td.eline, td.ecol, "EOF in comment")
} else {
nextChar.call()
}
}
}
    // integers or identifiers
var isNumber = Char.isDigit(curCh)
td.v = ""
while (Char.isAsciiAlphaNum(curCh) || curCh == "_") {
if (!Char.isDigit(curCh)) isNumber = false
td.v = td.v + curCh
nextChar.call()
}
if (td.v.count == 0) {
errorMsg.call(td.eline, td.ecol, "unknown character: " + curCh)
}
if (Char.isDigit(td.v[0])) {
if (!isNumber) {
errorMsg.call(td.eline, td.ecol, "invalid number: " + curCh)
}
td.tok = Token.Integer
return td
}
var index = lookup.call(td.v)
td.tok = (index == -1) ? Token.Ident : symtab[index].tok
return td
}
 
var initLex = Fn.new {
install.call("else", Token.Else)
install.call("if", Token.If)
install.call("print", Token.Print)
install.call("putc", Token.Putc)
install.call("while", Token.While)
nextChar.call()
}
 
var process = Fn.new {
var tokMap = {}
tokMap[Token.EOI] = "End_of_input"
tokMap[Token.Mul] = "Op_multiply"
tokMap[Token.Div] = "Op_divide"
tokMap[Token.Mod] = "Op_mod"
tokMap[Token.Add] = "Op_add"
tokMap[Token.Sub] = "Op_subtract"
tokMap[Token.Negate] = "Op_negate"
tokMap[Token.Not] = "Op_not"
tokMap[Token.Lss] = "Op_less"
tokMap[Token.Leq] = "Op_lessequal"
tokMap[Token.Gtr] = "Op_greater"
tokMap[Token.Geq] = "Op_greaterequal"
tokMap[Token.Eq] = "Op_equal"
tokMap[Token.Neq] = "Op_notequal"
tokMap[Token.Assign] = "Op_assign"
tokMap[Token.And] = "Op_and"
tokMap[Token.Or] = "Op_or"
tokMap[Token.If] = "Keyword_if"
tokMap[Token.Else] = "Keyword_else"
tokMap[Token.While] = "Keyword_while"
tokMap[Token.Print] = "Keyword_print"
tokMap[Token.Putc] = "Keyword_putc"
tokMap[Token.Lparen] = "LeftParen"
tokMap[Token.Rparen] = "RightParen"
tokMap[Token.Lbrace] = "LeftBrace"
tokMap[Token.Rbrace] = "RightBrace"
tokMap[Token.Semi] = "Semicolon"
tokMap[Token.Comma] = "Comma"
tokMap[Token.Ident] = "Identifier"
tokMap[Token.Integer] = "Integer"
tokMap[Token.String] = "String"
 
while (true) {
var td = getTok.call()
Fmt.write("$5d $5d $-16s", td.eline, td.ecol, tokMap[td.tok])
if (td.tok == Token.Integer || td.tok == Token.Ident || td.tok == Token.String) {
System.print(td.v)
} else {
System.print()
}
if (td.tok == Token.EOI) return
}
}
 
var args = Process.arguments
if (args.count == 0) {
System.print("Filename required")
return
}
 
lines = FileUtil.readLines(args[0])
lineCount = lines.count
initLex.call()
process.call()</syntaxhighlight>
 
{{out}}
For test case 3:
<pre>
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input
</pre>
 
=={{header|Zig}}==
<syntaxhighlight lang="zig">
const std = @import("std");
 
pub const TokenType = enum {
unknown,
multiply,
divide,
mod,
add,
subtract,
negate,
less,
less_equal,
greater,
greater_equal,
equal,
not_equal,
not,
assign,
bool_and,
bool_or,
left_paren,
right_paren,
left_brace,
right_brace,
semicolon,
comma,
kw_if,
kw_else,
kw_while,
kw_print,
kw_putc,
identifier,
integer,
string,
eof,
 
    // A more efficient implementation could use `std.enums.directEnumArray`.
pub fn toString(self: @This()) []const u8 {
return switch (self) {
.unknown => "UNKNOWN",
.multiply => "Op_multiply",
.divide => "Op_divide",
.mod => "Op_mod",
.add => "Op_add",
.subtract => "Op_subtract",
.negate => "Op_negate",
.less => "Op_less",
.less_equal => "Op_lessequal",
.greater => "Op_greater",
.greater_equal => "Op_greaterequal",
.equal => "Op_equal",
.not_equal => "Op_notequal",
.not => "Op_not",
.assign => "Op_assign",
.bool_and => "Op_and",
.bool_or => "Op_or",
.left_paren => "LeftParen",
.right_paren => "RightParen",
.left_brace => "LeftBrace",
.right_brace => "RightBrace",
.semicolon => "Semicolon",
.comma => "Comma",
.kw_if => "Keyword_if",
.kw_else => "Keyword_else",
.kw_while => "Keyword_while",
.kw_print => "Keyword_print",
.kw_putc => "Keyword_putc",
.identifier => "Identifier",
.integer => "Integer",
.string => "String",
.eof => "End_of_input",
};
}
};
 
pub const TokenValue = union(enum) {
intlit: i32,
string: []const u8,
};
 
pub const Token = struct {
line: usize,
col: usize,
typ: TokenType = .unknown,
value: ?TokenValue = null,
};
 
// Error conditions described in the task.
pub const LexerError = error{
EmptyCharacterConstant,
UnknownEscapeSequence,
MulticharacterConstant,
EndOfFileInComment,
EndOfFileInString,
EndOfLineInString,
UnrecognizedCharacter,
InvalidNumber,
};
 
pub const Lexer = struct {
content: []const u8,
line: usize,
col: usize,
offset: usize,
start: bool,
 
const Self = @This();
 
pub fn init(content: []const u8) Lexer {
return Lexer{
.content = content,
.line = 1,
.col = 1,
.offset = 0,
.start = true,
};
}
 
pub fn buildToken(self: Self) Token {
return Token{ .line = self.line, .col = self.col };
}
 
pub fn buildTokenT(self: Self, typ: TokenType) Token {
return Token{ .line = self.line, .col = self.col, .typ = typ };
}
 
pub fn curr(self: Self) u8 {
return self.content[self.offset];
}
 
    // An alternative implementation would be to return a `Token` value from `next()`,
    // which is arguably more idiomatic.
pub fn next(self: *Self) ?u8 {
// We use `start` in order to make the very first invocation of `next()` to return
// the very first character. It should be possible to avoid this variable.
if (self.start) {
self.start = false;
} else {
const newline = self.curr() == '\n';
self.offset += 1;
if (newline) {
self.col = 1;
self.line += 1;
} else {
self.col += 1;
}
}
if (self.offset >= self.content.len) {
return null;
} else {
return self.curr();
}
}
 
pub fn peek(self: Self) ?u8 {
if (self.offset + 1 >= self.content.len) {
return null;
} else {
return self.content[self.offset + 1];
}
}
 
fn divOrComment(self: *Self) LexerError!?Token {
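        // A '/' has already been consumed: if the next character is '*', skip the
        // whole /* ... */ comment and return null; otherwise it is the division operator.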
var result = self.buildToken();
if (self.peek()) |peek_ch| {
if (peek_ch == '*') {
_ = self.next(); // peeked character
while (self.next()) |ch| {
if (ch == '*') {
if (self.peek()) |next_ch| {
if (next_ch == '/') {
_ = self.next(); // peeked character
return null;
}
}
}
}
return LexerError.EndOfFileInComment;
}
}
result.typ = .divide;
return result;
}
 
fn identifierOrKeyword(self: *Self) !Token {
var result = self.buildToken();
const init_offset = self.offset;
while (self.peek()) |ch| : (_ = self.next()) {
switch (ch) {
'_', 'a'...'z', 'A'...'Z', '0'...'9' => {},
else => break,
}
}
const final_offset = self.offset + 1;
 
if (std.mem.eql(u8, self.content[init_offset..final_offset], "if")) {
result.typ = .kw_if;
} else if (std.mem.eql(u8, self.content[init_offset..final_offset], "else")) {
result.typ = .kw_else;
} else if (std.mem.eql(u8, self.content[init_offset..final_offset], "while")) {
result.typ = .kw_while;
} else if (std.mem.eql(u8, self.content[init_offset..final_offset], "print")) {
result.typ = .kw_print;
} else if (std.mem.eql(u8, self.content[init_offset..final_offset], "putc")) {
result.typ = .kw_putc;
} else {
result.typ = .identifier;
result.value = TokenValue{ .string = self.content[init_offset..final_offset] };
}
 
return result;
}
 
fn string(self: *Self) !Token {
var result = self.buildToken();
result.typ = .string;
const init_offset = self.offset;
while (self.next()) |ch| {
switch (ch) {
'"' => break,
'\n' => return LexerError.EndOfLineInString,
'\\' => {
switch (self.peek() orelse return LexerError.EndOfFileInString) {
'n', '\\' => _ = self.next(), // peeked character
else => return LexerError.UnknownEscapeSequence,
}
},
else => {},
}
} else {
return LexerError.EndOfFileInString;
}
const final_offset = self.offset + 1;
result.value = TokenValue{ .string = self.content[init_offset..final_offset] };
return result;
}
 
/// Choose either `ifyes` or `ifno` token type depending on whether the peeked
/// character is `by`.
fn followed(self: *Self, by: u8, ifyes: TokenType, ifno: TokenType) Token {
var result = self.buildToken();
if (self.peek()) |ch| {
if (ch == by) {
_ = self.next(); // peeked character
result.typ = ifyes;
} else {
result.typ = ifno;
}
} else {
result.typ = ifno;
}
return result;
}
 
/// Raise an error if there's no next `by` character but return token with `typ` otherwise.
fn consecutive(self: *Self, by: u8, typ: TokenType) LexerError!Token {
const result = self.buildTokenT(typ);
if (self.peek()) |ch| {
if (ch == by) {
_ = self.next(); // peeked character
return result;
} else {
return LexerError.UnrecognizedCharacter;
}
} else {
return LexerError.UnrecognizedCharacter;
}
}
 
fn integerLiteral(self: *Self) LexerError!Token {
var result = self.buildTokenT(.integer);
const init_offset = self.offset;
while (self.peek()) |ch| {
switch (ch) {
'0'...'9' => _ = self.next(), // peeked character
'_', 'a'...'z', 'A'...'Z' => return LexerError.InvalidNumber,
else => break,
}
}
const final_offset = self.offset + 1;
result.value = TokenValue{
.intlit = std.fmt.parseInt(i32, self.content[init_offset..final_offset], 10) catch {
return LexerError.InvalidNumber;
},
};
return result;
}
 
    // This is a neat example of how Zig lets us remove boilerplate while still
    // preserving full error-completeness guarantees.
fn nextOrEmpty(self: *Self) LexerError!u8 {
return self.next() orelse LexerError.EmptyCharacterConstant;
}
 
fn integerChar(self: *Self) LexerError!Token {
var result = self.buildTokenT(.integer);
switch (try self.nextOrEmpty()) {
'\'', '\n' => return LexerError.EmptyCharacterConstant,
'\\' => {
switch (try self.nextOrEmpty()) {
'n' => result.value = TokenValue{ .intlit = '\n' },
'\\' => result.value = TokenValue{ .intlit = '\\' },
else => return LexerError.EmptyCharacterConstant,
}
switch (try self.nextOrEmpty()) {
'\'' => {},
else => return LexerError.MulticharacterConstant,
}
},
else => {
result.value = TokenValue{ .intlit = self.curr() };
switch (try self.nextOrEmpty()) {
'\'' => {},
else => return LexerError.MulticharacterConstant,
}
},
}
return result;
}
};
 
pub fn lex(allocator: std.mem.Allocator, content: []u8) !std.ArrayList(Token) {
var tokens = std.ArrayList(Token).init(allocator);
var lexer = Lexer.init(content);
while (lexer.next()) |ch| {
switch (ch) {
' ' => {},
'*' => try tokens.append(lexer.buildTokenT(.multiply)),
'%' => try tokens.append(lexer.buildTokenT(.mod)),
'+' => try tokens.append(lexer.buildTokenT(.add)),
'-' => try tokens.append(lexer.buildTokenT(.subtract)),
'<' => try tokens.append(lexer.followed('=', .less_equal, .less)),
'>' => try tokens.append(lexer.followed('=', .greater_equal, .greater)),
'=' => try tokens.append(lexer.followed('=', .equal, .assign)),
'!' => try tokens.append(lexer.followed('=', .not_equal, .not)),
'(' => try tokens.append(lexer.buildTokenT(.left_paren)),
')' => try tokens.append(lexer.buildTokenT(.right_paren)),
'{' => try tokens.append(lexer.buildTokenT(.left_brace)),
'}' => try tokens.append(lexer.buildTokenT(.right_brace)),
';' => try tokens.append(lexer.buildTokenT(.semicolon)),
',' => try tokens.append(lexer.buildTokenT(.comma)),
'&' => try tokens.append(try lexer.consecutive('&', .bool_and)),
'|' => try tokens.append(try lexer.consecutive('|', .bool_or)),
'/' => {
if (try lexer.divOrComment()) |token| try tokens.append(token);
},
'_', 'a'...'z', 'A'...'Z' => try tokens.append(try lexer.identifierOrKeyword()),
'"' => try tokens.append(try lexer.string()),
'0'...'9' => try tokens.append(try lexer.integerLiteral()),
'\'' => try tokens.append(try lexer.integerChar()),
else => {},
}
}
try tokens.append(lexer.buildTokenT(.eof));
 
return tokens;
}
 
pub fn main() !void {
var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
defer arena.deinit();
const allocator = arena.allocator();
 
var arg_it = std.process.args();
_ = try arg_it.next(allocator) orelse unreachable; // program name
const file_name = arg_it.next(allocator);
// We accept both files and standard input.
var file_handle = blk: {
if (file_name) |file_name_delimited| {
const fname: []const u8 = try file_name_delimited;
break :blk try std.fs.cwd().openFile(fname, .{});
} else {
break :blk std.io.getStdIn();
}
};
defer file_handle.close();
const input_content = try file_handle.readToEndAlloc(allocator, std.math.maxInt(usize));
 
const tokens = try lex(allocator, input_content);
const pretty_output = try tokenListToString(allocator, tokens);
_ = try std.io.getStdOut().write(pretty_output);
}
 
fn tokenListToString(allocator: std.mem.Allocator, token_list: std.ArrayList(Token)) ![]u8 {
var result = std.ArrayList(u8).init(allocator);
var w = result.writer();
for (token_list.items) |token| {
const common_args = .{ token.line, token.col, token.typ.toString() };
if (token.value) |value| {
const init_fmt = "{d:>5}{d:>7} {s:<15}";
switch (value) {
.string => |str| _ = try w.write(try std.fmt.allocPrint(
allocator,
init_fmt ++ "{s}\n",
common_args ++ .{str},
)),
.intlit => |i| _ = try w.write(try std.fmt.allocPrint(
allocator,
init_fmt ++ "{d}\n",
common_args ++ .{i},
)),
}
} else {
_ = try w.write(try std.fmt.allocPrint(allocator, "{d:>5}{d:>7} {s}\n", common_args));
}
}
return result.items;
}
</syntaxhighlight>