Compiler/lexical analyzer
;Operators
 
:::{| class="wikitable"
|-
! Name !! Common name !! Character sequence
;Symbols
 
:::{| class="wikitable"
|-
! Name !! Common name !! Character
;Keywords
 
:::{| class="wikitable"
|-
! Name || Character sequence
These differ from the previous tokens in that each occurrence of them has a value associated with it.
 
:::{| class="wikitable"
|-
! Name
;Zero-width tokens
 
:::{| class="wikitable"
|-
! Name || Location
For example, the following two program fragments are equivalent, and should produce the same token stream except for the line and column positions:
 
* <syntaxhighlight lang="c">if ( p /* meaning n is prime */ ) {
print ( n , " " ) ;
count = count + 1 ; /* number of primes found so far */
}</syntaxhighlight>
* <syntaxhighlight lang="c">if(p){print(n," ");count=count+1;}</syntaxhighlight>
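
Ignoring the line and column fields, both fragments should lex to a token stream along these lines (a sketch using the token names from this task's tables; the comments themselves produce no tokens):

<pre>
Keyword_if
LeftParen
Identifier      p
RightParen
LeftBrace
Keyword_print
LeftParen
Identifier      n
Comma
String          " "
RightParen
Semicolon
Identifier      count
Op_assign
Identifier      count
Op_add
Integer         1
Semicolon
RightBrace
</pre>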
 
;Complete list of token names
# the token value (only for <tt>Identifier</tt>, <tt>Integer</tt>, and <tt>String</tt> tokens)
# the number of spaces between fields is up to you. Neatly aligned is nice, but not a requirement.
<br>
 
This task is intended to be used as part of a pipeline, with the other compiler tasks - for example:
<br><b>lex < hello.t | parse | gen | vm</b>
 
Or possibly:
<br><b>lex hello.t lex.out</b>
<br><b>parse lex.out parse.out</b>
<br><b>gen parse.out gen.out</b>
<br><b>vm gen.out</b>
 
<br>
This implies that the output of this task (the lexical analyzer) should be suitable as input to any of the [[Compiler/syntax_analyzer|Syntax Analyzer task]] programs.
 
{{task heading|Diagnostics}}
The following error conditions should be caught:
 
:::{| class="wikitable"
|-
! Error
 
{{task heading|Test Cases}}
{| class="wikitable"
|-
! Input
| style="vertical-align:top" |
Test Case 1:
<syntaxhighlight lang="c">/*
Hello world
*/
print("Hello, World!\n");</syntaxhighlight>
 
| style="vertical-align:top" |
| style="vertical-align:top" |
Test Case 2:
<syntaxhighlight lang="c">/*
Show Ident and Integers
*/
phoenix_number = 142857;
print(phoenix_number, "\n");</syntaxhighlight>
 
| style="vertical-align:top" |
| style="vertical-align:top" |
Test Case 3:
<syntaxhighlight lang="c">/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
/* character literal */ '\n'
/* character literal */ '\\'
/* character literal */ ' '</syntaxhighlight>
 
| style="vertical-align:top" |
| style="vertical-align:top" |
Test Case 4:
<syntaxhighlight lang="c">/*** test printing, embedded \n and comments with lots of '*' ***/
print(42);
print("\nHello World\nGood Bye\nok\n");
print("Print a slash n - \\n.\n");</syntaxhighlight>
 
| style="vertical-align:top" |
|}
 
; Additional examples
 
Your solution should pass all the test cases above and the additional tests found '''[[Compiler/Sample_programs|Here]]'''.
 
{{task heading|Reference}}
The C and Python versions can be considered reference implementations.
 
;Related Tasks
* [[Compiler/syntax_analyzer|Syntax Analyzer task]]
* [[Compiler/code_generator|Code Generator task]]
* [[Compiler/virtual_machine_interpreter|Virtual Machine Interpreter task]]
* [[Compiler/AST_interpreter|AST Interpreter task]]
 
<hr>
<br><br>
__TOC__
 
=={{header|Ada}}==
<syntaxhighlight lang="ada">with Ada.Text_IO, Ada.Streams.Stream_IO, Ada.Strings.Unbounded, Ada.Command_Line,
Ada.Exceptions;
use Ada.Strings, Ada.Strings.Unbounded, Ada.Streams, Ada.Exceptions;
 
procedure Main is
package IO renames Ada.Text_IO;
 
package Lexer is
type Token is (Op_multiply, Op_divide, Op_mod, Op_add, Op_subtract, Op_negate,
Op_less, Op_lessequal, Op_greater, Op_greaterequal, Op_equal,
Op_notequal, Op_not, Op_assign, Op_and, Op_or,
 
LeftParen, RightParen, LeftBrace, RightBrace, Semicolon, Comma,
 
Keyword_if, Keyword_else, Keyword_while, Keyword_print, Keyword_putc,
Identifier, Token_Integer, Token_String, End_of_input,
 
Empty_Char_Error, Invalid_Escape_Error, Multi_Char_Error, EOF_Comment_Error,
EOF_String_Error, EOL_String_Error, Invalid_Char_Error, Invalid_Num_Error
);
 
subtype Operator is Token range Op_multiply .. Op_or;
subtype Symbol is Token range Token'Succ(Operator'Last) .. Comma;
subtype Keyword is Token range Token'Succ(Symbol'Last) .. Keyword_putc;
subtype Error is Token range Empty_Char_Error .. Invalid_Num_Error;
subtype Operator_or_Error is Token
with Static_Predicate => Operator_or_Error in Operator | Error;
 
subtype Whitespace is Character
with Static_Predicate => Whitespace in ' ' | ASCII.HT | ASCII.CR | ASCII.LF;
 
Lexer_Error : exception;
Invalid_Escape_Code : constant Character := ASCII.NUL;
 
procedure run(input : Stream_IO.File_Type);
end Lexer;
 
package body Lexer is
use type Stream_IO.Count;
 
procedure run(input : Stream_IO.File_Type) is
type State is (State_Start, State_Identifier, State_Integer, State_Char, State_String,
State_Comment);
curr_state : State := State_Start;
curr_char : Character;
curr_col, curr_row, token_col, token_row : Positive := 1;
token_text : Unbounded_String := Unbounded.Null_Unbounded_String;
 
function look_ahead return Character is
next_char : Character := ASCII.LF;
begin
if not Stream_IO.End_Of_File(input) then
next_char := Character'Input(Stream_IO.Stream(input));
Stream_IO.Set_Index(input, Stream_IO.Index(input) - 1);
end if;
return next_char;
end look_ahead;
 
procedure next_char is
next : Character := Character'Input(Stream_IO.Stream(input));
begin
curr_col := curr_col + 1;
if curr_char = ASCII.LF then
curr_row := curr_row + 1;
curr_col := 1;
end if;
curr_char := next;
end next_char;
 
procedure print_token(tok : Token; text : String := "") is
procedure raise_error(text : String) is
begin
raise Lexer_Error with "Error: " & text;
end;
begin
IO.Put(token_row'Image & ASCII.HT & token_col'Image & ASCII.HT);
case tok is
when Operator | Symbol | Keyword | End_of_input => IO.Put_Line(tok'Image);
when Token_Integer => IO.Put_Line("INTEGER" & ASCII.HT & text);
when Token_String => IO.Put_Line("STRING" & ASCII.HT & ASCII.Quotation & text & ASCII.Quotation);
when Identifier => IO.Put_Line(tok'Image & ASCII.HT & text);
when Empty_Char_Error => raise_error("empty character constant");
when Invalid_Escape_Error => raise_error("unknown escape sequence: " & text);
when Multi_Char_Error => raise_error("multi-character constant: " & text);
when EOF_Comment_Error => raise_error("EOF in comment");
when EOF_String_Error => raise_error("EOF in string");
when EOL_String_Error => raise_error("EOL in string");
when Invalid_Char_Error => raise_error("invalid character: " & curr_char);
when Invalid_Num_Error => raise_error("invalid number: " & text);
end case;
end print_token;
 
procedure lookahead_choose(determiner : Character; a, b : Operator_or_Error) is
begin
if look_ahead = determiner then
print_token(a);
next_char;
else
print_token(b);
end if;
end lookahead_choose;
 
function to_escape_code(c : Character) return Character is
begin
case c is
when 'n' => return ASCII.LF;
when '\' => return '\';
when others =>
print_token(Invalid_Escape_Error, ASCII.Back_Slash & c);
return Invalid_Escape_Code;
end case;
end to_escape_code;
begin
curr_char := Character'Input(Stream_IO.Stream(input));
loop
case curr_state is
when State_Start =>
token_col := curr_col;
token_row := curr_row;
case curr_char is
when '*' => print_token(Op_multiply);
when '/' =>
if look_ahead = '*' then
next_char;
curr_state := State_Comment;
else
print_token(Op_divide);
end if;
when '%' => print_token(Op_mod);
when '+' => print_token(Op_add);
when '-' => print_token(Op_subtract);
when '(' => print_token(LeftParen);
when ')' => print_token(RightParen);
when '{' => print_token(LeftBrace);
when '}' => print_token(RightBrace);
when ';' => print_token(Semicolon);
when ',' => print_token(Comma);
when '<' => lookahead_choose('=', Op_lessequal, Op_less);
when '>' => lookahead_choose('=', Op_greaterequal, Op_greater);
when '!' => lookahead_choose('=', Op_notequal, Op_not);
when '=' => lookahead_choose('=', Op_equal, Op_assign);
when '&' => lookahead_choose('&', Op_and, Invalid_Char_Error);
when '|' => lookahead_choose('|', Op_or, Invalid_Char_Error);
when 'a' .. 'z' | 'A' .. 'Z' | '_' =>
Unbounded.Append(token_text, curr_char);
curr_state := State_Identifier;
when '0' .. '9' =>
Unbounded.Append(token_text, curr_char);
curr_state := State_Integer;
when ''' => curr_state := State_Char;
when ASCII.Quotation => curr_state := State_String;
when Whitespace => null;
when others => null;
end case;
next_char;
 
when State_Identifier =>
case curr_char is
when 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '_' =>
Unbounded.Append(token_text, curr_char);
next_char;
when others =>
if token_text = "if" then
print_token(Keyword_if);
elsif token_text = "else" then
print_token(Keyword_else);
elsif token_text = "while" then
print_token(Keyword_while);
elsif token_text = "print" then
print_token(Keyword_print);
elsif token_text = "putc" then
print_token(Keyword_putc);
else
print_token(Identifier, To_String(token_text));
end if;
Unbounded.Set_Unbounded_String(token_text, "");
curr_state := State_Start;
end case;
 
when State_Integer =>
case curr_char is
when '0' .. '9' =>
Unbounded.Append(token_text, curr_char);
next_char;
when 'a' .. 'z' | 'A' .. 'Z' | '_' =>
print_token(Invalid_Num_Error, To_String(token_text));
when others =>
print_token(Token_Integer, To_String(token_text));
Unbounded.Set_Unbounded_String(token_text, "");
curr_state := State_Start;
end case;
 
when State_Char =>
case curr_char is
when ''' =>
if Unbounded.Length(token_text) = 0 then
print_token(Empty_Char_Error);
elsif Unbounded.Length(token_text) = 1 then
print_token(Token_Integer, Character'Pos(Element(token_text, 1))'Image);
else
print_token(Multi_Char_Error, To_String(token_text));
end if;
Set_Unbounded_String(token_text, "");
curr_state := State_Start;
when '\' =>
Unbounded.Append(token_text, to_escape_code(look_ahead));
next_char;
when others => Unbounded.Append(token_text, curr_char);
end case;
next_char;
 
when State_String =>
case curr_char is
when ASCII.Quotation =>
print_token(Token_String, To_String(token_text));
Set_Unbounded_String(token_text, "");
curr_state := State_Start;
when '\' =>
if to_escape_code(look_ahead) /= Invalid_Escape_Code then
Unbounded.Append(token_text, curr_char);
end if;
when ASCII.LF | ASCII.CR => print_token(EOL_String_Error);
when others => Unbounded.Append(token_text, curr_char);
end case;
next_char;
 
when State_Comment =>
case curr_char is
when '*' =>
if look_ahead = '/' then
next_char;
curr_state := State_Start;
end if;
when others => null;
end case;
next_char;
end case;
end loop;
exception
when error : Stream_IO.End_Error =>
if curr_state = State_String then
print_token(EOF_String_Error);
else
print_token(End_of_input);
end if;
when error : Lexer.Lexer_Error => IO.Put_Line(Exception_Message(error));
end run;
end Lexer;
 
source_file : Stream_IO.File_Type;
begin
if Ada.Command_Line.Argument_Count < 1 then
IO.Put_Line("usage: lex [filename]");
return;
end if;
Stream_IO.Open(source_file, Stream_IO.In_File, Ada.Command_Line.Argument(1));
Lexer.run(source_file);
exception
when error : others => IO.Put_Line("Error: " & Exception_Message(error));
end Main;
</syntaxhighlight>
{{out}} Test case 3:
<pre>
5 16 KEYWORD_PRINT
5 40 OP_SUBTRACT
6 16 KEYWORD_PUTC
6 40 OP_LESS
7 16 KEYWORD_IF
7 40 OP_GREATER
8 16 KEYWORD_ELSE
8 40 OP_LESSEQUAL
9 16 KEYWORD_WHILE
9 40 OP_GREATEREQUAL
10 16 LEFTBRACE
10 40 OP_EQUAL
11 16 RIGHTBRACE
11 40 OP_NOTEQUAL
12 16 LEFTPAREN
12 40 OP_AND
13 16 RIGHTPAREN
13 40 OP_OR
14 16 OP_SUBTRACT
14 40 SEMICOLON
15 16 OP_NOT
15 40 COMMA
16 16 OP_MULTIPLY
16 40 OP_ASSIGN
17 16 OP_DIVIDE
17 40 INTEGER 42
18 16 OP_MOD
18 40 STRING "String literal"
19 16 OP_ADD
19 40 IDENTIFIER variable_name
20 26 INTEGER 10
21 26 INTEGER 92
22 26 INTEGER 32
23 1 END_OF_INPUT
</pre>
 
=={{header|ALGOL 68}}==
This is a simple ''token in, line out'' program: it reads one token at a time and immediately writes the corresponding output line. It doesn't keep an internal representation of tokens, since none is needed for this task.
 
As an addition, it emits a diagnostic if integer literals are too big.
<syntaxhighlight lang="algol68">BEGIN
# implement C-like getchar, where EOF and EOLn are "characters" (-1 and 10 resp.). #
INT eof = -1, eoln = 10;
BOOL eof flag := FALSE;
STRING buf := "";
INT col := 1;
INT line := 0;
on logical file end (stand in, (REF FILE f)BOOL: eof flag := TRUE);
PROC getchar = INT:
IF eof flag THEN eof
ELIF col = UPB buf THEN col +:= 1; eoln
ELIF col > UPB buf THEN IF line > 0 THEN read(newline) FI;
line +:= 1;
read(buf);
IF eof flag THEN col := 1; eof
ELSE col := 0; getchar
FI
ELSE col +:= 1; ABS buf[col]
FI;
PROC nextchar = INT: IF eof flag THEN eof ELIF col >= UPB buf THEN eoln ELSE ABS buf[col+1] FI;
 
PROC is blank = (INT ch) BOOL: ch = 0 OR ch = 9 OR ch = 10 OR ch = 13 OR ch = ABS " ";
PROC is digit = (INT ch) BOOL: ch >= ABS "0" AND ch <= ABS "9";
PROC is ident start = (INT ch) BOOL: ch >= ABS "A" AND ch <= ABS "Z" OR
ch >= ABS "a" AND ch <= ABS "z" OR
ch = ABS "_";
PROC is ident = (INT ch) BOOL: is ident start(ch) OR is digit(ch);
 
PROC ident or keyword = (INT start char) VOID:
BEGIN
STRING w := REPR start char;
INT start col = col;
WHILE is ident (next char) DO w +:= REPR getchar OD;
IF w = "if" THEN output2("Keyword_if", start col)
ELIF w = "else" THEN output2("Keyword_else", start col)
ELIF w = "while" THEN output2("Keyword_while", start col)
ELIF w = "print" THEN output2("Keyword_print", start col)
ELIF w = "putc" THEN output2("Keyword_putc", start col)
ELSE output2("Identifier " + w, start col)
FI
END;
PROC char = VOID:
BEGIN
INT start col = col;
INT ch := getchar;
IF ch = ABS "'" THEN error("Empty character constant")
ELIF ch = ABS "\" THEN ch := getchar;
IF ch = ABS "n" THEN ch := 10
ELIF ch = ABS "\" THEN SKIP
ELSE error("Unknown escape sequence. \" + REPR ch)
FI
FI;
IF nextchar /= ABS "'" THEN error("Multi-character constant.") FI;
getchar;
output2("Integer " + whole(ch, 0), start col)
END;
PROC string = VOID:
BEGIN
INT start col = col;
STRING s := """";
WHILE INT ch := getchar; ch /= ABS """"
DO
IF ch = eoln THEN error("End-of-line while scanning string literal. Closing string character not found before end-of-line.")
ELIF ch = eof THEN error("End-of-file while scanning string literal. Closing string character not found.")
ELIF ch = ABS "\" THEN s +:= REPR ch; ch := getchar;
IF ch /= ABS "\" AND ch /= ABS "n" THEN error("Unknown escape sequence. \" + REPR ch) FI;
s +:= REPR ch
ELSE s +:= REPR ch
FI
OD;
output2("String " + s + """", start col)
END;
PROC comment = VOID:
BEGIN
WHILE INT ch := getchar; NOT (ch = ABS "*" AND nextchar = ABS "/")
DO IF ch = eof THEN error("End-of-file in comment. Closing comment characters not found.") FI
OD;
getchar
END;
PROC number = (INT first digit) VOID:
BEGIN
INT start col = col;
INT n := first digit - ABS "0";
WHILE is digit (nextchar) DO
INT u := getchar - ABS "0";
IF LENG n * 10 + LENG u > max int THEN error("Integer too big") FI;
n := n * 10 + u
OD;
IF is ident start (nextchar) THEN error("Invalid number. Starts like a number, but ends in non-numeric characters.") FI;
output2("Integer " + whole(n, 0), start col)
END;
 
PROC output = (STRING s) VOID: output2(s, col);
PROC output2 = (STRING s, INT col) VOID: print((whole(line,-8), whole(col,-8), " ", s, newline));
 
PROC if follows = (CHAR second, STRING longer, shorter) VOID:
IF nextchar = ABS second
THEN output(longer); getchar
ELSE output(shorter)
FI;
PROC error = (STRING s)VOID: (put(stand error, ("At ", whole(line,0), ":", whole(col,0), " ", s, new line)); stop);
PROC unrecognized = (INT char) VOID: error("Unrecognized character " + REPR char);
PROC double char = (INT first, STRING op) VOID:
IF nextchar /= first THEN unrecognized(first)
ELSE output2(op, col-1); getchar
FI;
 
WHILE INT ch := getchar; ch /= eof
DO
IF is blank(ch) THEN SKIP
ELIF ch = ABS "(" THEN output("LeftParen")
ELIF ch = ABS ")" THEN output("RightParen")
ELIF ch = ABS "{" THEN output("LeftBrace")
ELIF ch = ABS "}" THEN output("RightBrace")
ELIF ch = ABS ";" THEN output("Semicolon")
ELIF ch = ABS "," THEN output("Comma")
ELIF ch = ABS "*" THEN output("Op_multiply")
ELIF ch = ABS "/" THEN IF next char = ABS "*" THEN comment
ELSE output("Op_divide")
FI
ELIF ch = ABS "%" THEN output("Op_mod")
ELIF ch = ABS "+" THEN output("Op_add")
ELIF ch = ABS "-" THEN output("Op_subtract")
ELIF ch = ABS "<" THEN if follows("=", "Op_lessequal", "Op_less")
ELIF ch = ABS ">" THEN if follows("=", "Op_greaterequal", "Op_greater")
ELIF ch = ABS "=" THEN if follows("=", "Op_equal", "Op_assign")
ELIF ch = ABS "!" THEN if follows("=", "Op_notequal", "Op_not")
ELIF ch = ABS "&" THEN double char(ch, "Op_and")
ELIF ch = ABS "|" THEN double char(ch, "Op_or")
ELIF is ident start (ch) THEN ident or keyword (ch)
ELIF ch = ABS """" THEN string
ELIF ch = ABS "'" THEN char
ELIF is digit(ch) THEN number(ch)
ELSE unrecognized(ch)
FI
OD;
output("End_Of_Input")
END</syntaxhighlight>
 
=={{header|ALGOL W}}==
<syntaxhighlight lang="algolw">begin
%lexical analyser %
% Algol W strings are limited to 256 characters in length so we limit source lines %
while nextToken not = tEnd_of_input do writeToken;
writeToken
end.</syntaxhighlight>
{{out}} Test case 3:
<pre>
23 1 End_of_input
</pre>
 
=={{header|ATS}}==
 
One interesting feature of this implementation is my liberal use of a pushback buffer for input characters. This kept the code modular and easier to write.
 
(One point of note: the C "EOF" pseudo-character is detected in the following code by testing for a negative value. The ISO C standard implies that EOF is negative and that all successfully read characters are non-negative.)
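
As a point of comparison, here is a minimal sketch of that same idiom in plain C (not part of the ATS entry): <tt>getc</tt> returns an <tt>int</tt>, so the end-of-input test can simply ask whether the value is negative.

<syntaxhighlight lang="c">#include <stdio.h>

/* Echo stdin to stdout, detecting end of input the same way the ATS
   code below does: getc returns an int, and a negative value means EOF. */
int main(void)
{
    for (;;) {
        int c = getc(stdin);   /* int, not char, so EOF fits alongside all byte values */
        if (c < 0)             /* ISO C: EOF is negative, real characters are not */
            break;
        putchar(c);
    }
    return 0;
}</syntaxhighlight>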
 
<syntaxhighlight lang="ats">(********************************************************************)
(* Usage: lex [INPUTFILE [OUTPUTFILE]]
If INPUTFILE or OUTPUTFILE is "-" or missing, then standard input
or standard output is used, respectively. *)
 
#define ATS_DYNLOADFLAG 0
 
#include "share/atspre_staload.hats"
staload UN = "prelude/SATS/unsafe.sats"
 
#define NIL list_nil ()
#define :: list_cons
 
%{^
/* alloca(3) is needed for ATS exceptions. */
#include <alloca.h>
%}
 
(********************************************************************)
 
#define NUM_TOKENS 31
#define RESERVED_WORD_HASHTAB_SIZE 9
 
#define TOKEN_ELSE 0
#define TOKEN_IF 1
#define TOKEN_PRINT 2
#define TOKEN_PUTC 3
#define TOKEN_WHILE 4
#define TOKEN_MULTIPLY 5
#define TOKEN_DIVIDE 6
#define TOKEN_MOD 7
#define TOKEN_ADD 8
#define TOKEN_SUBTRACT 9
#define TOKEN_NEGATE 10
#define TOKEN_LESS 11
#define TOKEN_LESSEQUAL 12
#define TOKEN_GREATER 13
#define TOKEN_GREATEREQUAL 14
#define TOKEN_EQUAL 15
#define TOKEN_NOTEQUAL 16
#define TOKEN_NOT 17
#define TOKEN_ASSIGN 18
#define TOKEN_AND 19
#define TOKEN_OR 20
#define TOKEN_LEFTPAREN 21
#define TOKEN_RIGHTPAREN 22
#define TOKEN_LEFTBRACE 23
#define TOKEN_RIGHTBRACE 24
#define TOKEN_SEMICOLON 25
#define TOKEN_COMMA 26
#define TOKEN_IDENTIFIER 27
#define TOKEN_INTEGER 28
#define TOKEN_STRING 29
#define TOKEN_END_OF_INPUT 30
 
typedef token_t =
[i : int | TOKEN_ELSE <= i; i <= TOKEN_END_OF_INPUT]
int i
typedef tokentuple_t = (token_t, String, ullint, ullint)
typedef token_names_vt = @[string][NUM_TOKENS]
 
vtypedef reserved_words_vt =
@[String][RESERVED_WORD_HASHTAB_SIZE]
vtypedef reserved_word_tokens_vt =
@[token_t][RESERVED_WORD_HASHTAB_SIZE]
 
vtypedef lookups_vt =
[p_toknames : addr]
[p_wordtab : addr]
[p_toktab : addr]
@{
pf_toknames = token_names_vt @ p_toknames,
pf_wordtab = reserved_words_vt @ p_wordtab,
pf_toktab = reserved_word_tokens_vt @ p_toktab |
toknames = ptr p_toknames,
wordtab = ptr p_wordtab,
toktab = ptr p_toktab
}
 
fn
reserved_word_lookup
(s : String,
lookups : !lookups_vt,
line_no : ullint,
column_no : ullint) : tokentuple_t =
if string_length s < 2 then
(TOKEN_IDENTIFIER, s, line_no, column_no)
else
let
macdef wordtab = !(lookups.wordtab)
macdef toktab = !(lookups.toktab)
val hashval =
g1uint_mod (g1ofg0 (char2ui s[0] + char2ui s[1]),
g1i2u RESERVED_WORD_HASHTAB_SIZE)
val token = toktab[hashval]
in
if token = TOKEN_IDENTIFIER || s <> wordtab[hashval] then
(TOKEN_IDENTIFIER, s, line_no, column_no)
else
(token, s, line_no, column_no)
end
 
(********************************************************************)
(* Input allows pushback into a buffer. *)
 
typedef ch_t =
@{
ichar = int,
line_no = ullint,
column_no = ullint
}
 
typedef inp_t (n : int) =
[0 <= n]
@{
file = FILEref,
pushback = list (ch_t, n),
line_no = ullint,
column_no = ullint
}
typedef inp_t = [n : int] inp_t n
 
fn
get_ch (inp : inp_t) : (ch_t, inp_t) =
case+ (inp.pushback) of
| NIL =>
let
val c = fileref_getc (inp.file)
val ch =
@{
ichar = c,
line_no = inp.line_no,
column_no = inp.column_no
}
in
if c = char2i '\n' then
let
val inp =
@{
file = inp.file,
pushback = inp.pushback,
line_no = succ (inp.line_no),
column_no = 1ULL
}
in
(ch, inp)
end
else
let
val inp =
@{
file = inp.file,
pushback = inp.pushback,
line_no = inp.line_no,
column_no = succ (inp.column_no)
}
in
(ch, inp)
end
end
| ch :: pushback =>
let
val inp =
@{
file = inp.file,
pushback = pushback,
line_no = inp.line_no,
column_no = inp.column_no
}
in
(ch, inp)
end
 
fn
push_back_ch (ch : ch_t,
inp : inp_t) : [n : pos] inp_t n =
let
prval _ = lemma_list_param (inp.pushback)
in
@{
file = inp.file,
pushback = ch :: (inp.pushback),
line_no = inp.line_no,
column_no = inp.column_no
}
end
 
(********************************************************************)
 
exception unterminated_comment of (ullint, ullint)
exception unterminated_character_literal of (ullint, ullint)
exception multicharacter_literal of (ullint, ullint)
exception unterminated_string_literal of (ullint, ullint, bool)
exception unsupported_escape of (ullint, ullint, int)
exception invalid_integer_literal of (ullint, ullint, String)
exception unexpected_character of (ullint, ullint, int)
 
fn
scan_comment (inp : inp_t,
line_no : ullint,
column_no : ullint) : inp_t =
let
fun
loop (inp : inp_t) : inp_t =
let
val (ch, inp) = get_ch inp
in
if (ch.ichar) < 0 then
$raise unterminated_comment (line_no, column_no)
else if (ch.ichar) = char2i '*' then
let
val (ch1, inp) = get_ch inp
in
if (ch.ichar) < 0 then
$raise unterminated_comment (line_no, column_no)
else if (ch1.ichar) = char2i '/' then
inp
else
loop inp
end
else
loop inp
end
in
loop inp
end
 
fn
skip_spaces_and_comments (inp : inp_t) : [n : pos] inp_t n =
let
fun
loop (inp : inp_t) : [n : pos] inp_t n =
let
val (ch, inp) = get_ch inp
in
if isspace (ch.ichar) then
loop inp
else if (ch.ichar) = char2i '/' then
let
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) = char2i '*' then
loop (scan_comment (inp, ch.line_no, ch.column_no))
else
let
val inp = push_back_ch (ch1, inp)
val inp = push_back_ch (ch, inp)
in
inp
end
end
else
push_back_ch (ch, inp)
end
in
loop inp
end
 
fn
reverse_list_to_string
{m : int}
(lst : list (char, m)) : string m =
let
fun
fill_array {n : nat | n <= m} .<n>.
(arr : &(@[char][m + 1]),
lst : list (char, n),
n : size_t n) : void =
case+ lst of
| NIL => ()
| c :: tail =>
begin
arr[pred n] := c;
fill_array (arr, tail, pred n)
end
 
prval _ = lemma_list_param lst
val m : size_t m = i2sz (list_length lst)
val (pf, pfgc | p) = array_ptr_alloc<char> (succ m)
val _ = array_initize_elt<char> (!p, succ m, '\0')
val _ = fill_array (!p, lst, m)
in
$UN.castvwtp0 @(pf, pfgc | p)
end
 
extern fun {}
simple_scan$pred : int -> bool
fun {}
simple_scan {u : nat}
(lst : list (char, u),
inp : inp_t) :
[m : nat]
[n : pos]
(list (char, m), inp_t n) =
let
val (ch, inp) = get_ch inp
in
if simple_scan$pred (ch.ichar) then
simple_scan<> (int2char0 (ch.ichar) :: lst, inp)
else
let
val inp = push_back_ch (ch, inp)
in
(lst, inp)
end
end
 
fn
is_ident_start (c : int) :<> bool =
isalpha (c) || c = char2i '_'
 
fn
is_ident_continuation (c : int) :<> bool =
isalnum (c) || c = char2i '_'
 
fn
scan_identifier_or_reserved_word
(inp : inp_t,
lookups : !lookups_vt) :
(tokentuple_t, [n : pos] inp_t n) =
let
val (ch, inp) = get_ch inp
val _ = assertloc (is_ident_start (ch.ichar))
 
implement simple_scan$pred<> c = is_ident_continuation c
val (lst, inp) = simple_scan (int2char0 (ch.ichar) :: NIL, inp)
 
val s = reverse_list_to_string lst
val toktup =
reserved_word_lookup (s, lookups, ch.line_no, ch.column_no)
in
(toktup, inp)
end
 
fn
scan_integer_literal
(inp : inp_t,
lookups : !lookups_vt) :
(tokentuple_t, [n : pos] inp_t n) =
let
val (ch, inp) = get_ch inp
val _ = assertloc (isdigit (ch.ichar))
 
implement simple_scan$pred<> c = is_ident_continuation c
val (lst, inp) = simple_scan (int2char0 (ch.ichar) :: NIL, inp)
 
val s = reverse_list_to_string lst
 
fun
check_they_are_all_digits
{n : nat} .<n>.
(lst : list (char, n)) : void =
case+ lst of
| NIL => ()
| c :: tail =>
if isdigit c then
check_they_are_all_digits tail
else
$raise invalid_integer_literal (ch.line_no, ch.column_no, s)
 
val _ = check_they_are_all_digits lst
in
((TOKEN_INTEGER, s, ch.line_no, ch.column_no), inp)
end
 
fn
ichar2integer_literal (c : int) : String0 =
let
var buf = @[char][100] ('\0')
val _ = $extfcall (int, "snprintf", addr@ buf, i2sz 99, "%d", c)
val s = string1_copy ($UN.castvwtp0{String0} buf)
in
strnptr2string s
end
 
fn
scan_character_literal_without_checking_end (inp : inp_t) :
(tokentuple_t, inp_t) =
let
val (ch, inp) = get_ch inp
val _ = assertloc ((ch.ichar) = '\'')
 
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) < 0 then
$raise unterminated_character_literal (ch.line_no, ch.column_no)
else if (ch1.ichar) = char2i '\\' then
let
val (ch2, inp) = get_ch inp
in
if (ch2.ichar) < 0 then
$raise unterminated_character_literal (ch.line_no,
ch.column_no)
else if (ch2.ichar) = char2i 'n' then
let
val s = ichar2integer_literal (char2i '\n')
in
((TOKEN_INTEGER, s, ch.line_no, ch.column_no), inp)
end
else if (ch2.ichar) = char2i '\\' then
let
val s = ichar2integer_literal (char2i '\\')
in
((TOKEN_INTEGER, s, ch.line_no, ch.column_no), inp)
end
else
$raise unsupported_escape (ch1.line_no, ch1.column_no,
ch2.ichar)
end
else
let
val s = ichar2integer_literal (ch1.ichar)
in
((TOKEN_INTEGER, s, ch.line_no, ch.column_no), inp)
end
end
 
fn
scan_character_literal (inp : inp_t) : (tokentuple_t, inp_t) =
let
val (tok, inp) =
scan_character_literal_without_checking_end inp
val line_no = (tok.2)
val column_no = (tok.3)
 
fun
check_end (inp : inp_t) : inp_t =
let
val (ch, inp) = get_ch inp
in
if (ch.ichar) = char2i '\'' then
inp
else
let
fun
loop_to_end (ch1 : ch_t,
inp : inp_t) : inp_t =
if (ch1.ichar) < 0 then
$raise unterminated_character_literal (line_no,
column_no)
else if (ch1.ichar) = char2i '\'' then
$raise multicharacter_literal (line_no, column_no)
else
let
val (ch1, inp) = get_ch inp
in
loop_to_end (ch1, inp)
end
 
val inp = loop_to_end (ch, inp)
in
inp
end
end
 
val inp = check_end inp
in
(tok, inp)
end
 
fn
scan_string_literal (inp : inp_t) : (tokentuple_t, inp_t) =
let
val (ch, inp) = get_ch inp
val _ = assertloc ((ch.ichar) = '"')
 
fun
scan {u : pos}
(lst : list (char, u),
inp : inp_t) :
[m : pos] (list (char, m), inp_t) =
let
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) < 0 then
$raise unterminated_string_literal (ch.line_no,
ch.column_no, false)
else if (ch1.ichar) = char2i '\n' then
$raise unterminated_string_literal (ch.line_no,
ch.column_no, true)
else if (ch1.ichar) = char2i '"' then
(lst, inp)
else if (ch1.ichar) <> char2i '\\' then
scan (int2char0 (ch1.ichar) :: lst, inp)
else
let
val (ch2, inp) = get_ch inp
in
if (ch2.ichar) = char2i 'n' then
scan ('n' :: '\\' :: lst, inp)
else if (ch2.ichar) = char2i '\\' then
scan ('\\' :: '\\' :: lst, inp)
else
$raise unsupported_escape (ch1.line_no, ch1.column_no,
ch2.ichar)
end
end
 
val lst = '"' :: NIL
val (lst, inp) = scan (lst, inp)
val lst = '"' :: lst
val s = reverse_list_to_string lst
in
((TOKEN_STRING, s, ch.line_no, ch.column_no), inp)
end
 
fn
get_next_token (inp : inp_t,
lookups : !lookups_vt) : (tokentuple_t, inp_t) =
let
val inp = skip_spaces_and_comments inp
val (ch, inp) = get_ch inp
val ln = ch.line_no
val cn = ch.column_no
in
if ch.ichar < 0 then
((TOKEN_END_OF_INPUT, "", ln, cn), inp)
else
case+ int2char0 (ch.ichar) of
| ',' => ((TOKEN_COMMA, ",", ln, cn), inp)
| ';' => ((TOKEN_SEMICOLON, ";", ln, cn), inp)
| '\(' => ((TOKEN_LEFTPAREN, "(", ln, cn), inp)
| ')' => ((TOKEN_RIGHTPAREN, ")", ln, cn), inp)
| '\{' => ((TOKEN_LEFTBRACE, "{", ln, cn), inp)
| '}' => ((TOKEN_RIGHTBRACE, "}", ln, cn), inp)
| '*' => ((TOKEN_MULTIPLY, "*", ln, cn), inp)
| '/' => ((TOKEN_DIVIDE, "/", ln, cn), inp)
| '%' => ((TOKEN_MOD, "%", ln, cn), inp)
| '+' => ((TOKEN_ADD, "+", ln, cn), inp)
| '-' => ((TOKEN_SUBTRACT, "-", ln, cn), inp)
| '<' =>
let
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) = char2i '=' then
((TOKEN_LESSEQUAL, "<=", ln, cn), inp)
else
let
val inp = push_back_ch (ch1, inp)
in
((TOKEN_LESS, "<", ln, cn), inp)
end
end
| '>' =>
let
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) = char2i '=' then
((TOKEN_GREATEREQUAL, ">=", ln, cn), inp)
else
let
val inp = push_back_ch (ch1, inp)
in
((TOKEN_GREATER, ">", ln, cn), inp)
end
end
| '=' =>
let
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) = char2i '=' then
((TOKEN_EQUAL, "==", ln, cn), inp)
else
let
val inp = push_back_ch (ch1, inp)
in
((TOKEN_ASSIGN, "=", ln, cn), inp)
end
end
| '!' =>
let
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) = char2i '=' then
((TOKEN_NOTEQUAL, "!=", ln, cn), inp)
else
let
val inp = push_back_ch (ch1, inp)
in
((TOKEN_NOT, "!", ln, cn), inp)
end
end
| '&' =>
let
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) = char2i '&' then
((TOKEN_AND, "&&", ln, cn), inp)
else
$raise unexpected_character (ch.line_no, ch.column_no,
ch.ichar)
end
| '|' =>
let
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) = char2i '|' then
((TOKEN_OR, "||", ln, cn), inp)
else
$raise unexpected_character (ch.line_no, ch.column_no,
ch.ichar)
end
| '"' =>
let
val inp = push_back_ch (ch, inp)
in
scan_string_literal inp
end
| '\'' =>
let
val inp = push_back_ch (ch, inp)
in
scan_character_literal inp
end
| _ when isdigit (ch.ichar) =>
let
val inp = push_back_ch (ch, inp)
in
scan_integer_literal (inp, lookups)
end
| _ when is_ident_start (ch.ichar) =>
let
val inp = push_back_ch (ch, inp)
in
scan_identifier_or_reserved_word (inp, lookups)
end
| _ => $raise unexpected_character (ch.line_no, ch.column_no,
ch.ichar)
end
 
fn
fprint_ullint_rightjust (outf : FILEref,
num : ullint) : void =
if num < 10ULL then
fprint! (outf, " ", num)
else if num < 100ULL then
fprint! (outf, " ", num)
else if num < 1000ULL then
fprint! (outf, " ", num)
else if num < 10000ULL then
fprint! (outf, " ", num)
else
fprint! (outf, num)
 
fn
print_token (outf : FILEref,
toktup : tokentuple_t,
lookups : !lookups_vt) : void =
let
macdef toknames = !(lookups.toknames)
val name = toknames[toktup.0]
val str = (toktup.1)
val line_no = (toktup.2)
val column_no = (toktup.3)
 
val _ = fprint_ullint_rightjust (outf, line_no)
val _ = fileref_puts (outf, " ")
val _ = fprint_ullint_rightjust (outf, column_no)
val _ = fileref_puts (outf, " ")
val _ = fileref_puts (outf, name)
in
begin
case+ toktup.0 of
| TOKEN_IDENTIFIER => fprint! (outf, " ", str)
| TOKEN_INTEGER => fprint! (outf, " ", str)
| TOKEN_STRING => fprint! (outf, " ", str)
| _ => ()
end;
 
fileref_putc (outf, '\n')
end
 
fn
scan_text (outf : FILEref,
inp : inp_t,
lookups : !lookups_vt) : void =
let
fun
loop (inp : inp_t,
lookups : !lookups_vt) : void =
let
val (toktup, inp) = get_next_token (inp, lookups)
in
print_token (outf, toktup, lookups);
if toktup.0 <> TOKEN_END_OF_INPUT then
loop (inp, lookups)
end
in
loop (inp, lookups)
end
 
(********************************************************************)
 
fn
main_program (inpf : FILEref,
outf : FILEref) : int =
let
(* Using a simple Scheme program, I found the following perfect
hash for the reserved words, using the sum of the first two
characters as the hash value. *)
var reserved_words =
@[String][RESERVED_WORD_HASHTAB_SIZE]
("if", "print", "else", "", "putc", "", "", "while", "")
var reserved_word_tokens =
@[token_t][RESERVED_WORD_HASHTAB_SIZE]
(TOKEN_IF, TOKEN_PRINT, TOKEN_ELSE, TOKEN_IDENTIFIER,
TOKEN_PUTC, TOKEN_IDENTIFIER, TOKEN_IDENTIFIER, TOKEN_WHILE,
TOKEN_IDENTIFIER)
 
var token_names =
@[string][NUM_TOKENS]
("Keyword_else",
"Keyword_if",
"Keyword_print",
"Keyword_putc",
"Keyword_while",
"Op_multiply",
"Op_divide",
"Op_mod",
"Op_add",
"Op_subtract",
"Op_negate",
"Op_less",
"Op_lessequal",
"Op_greater",
"Op_greaterequal",
"Op_equal",
"Op_notequal",
"Op_not",
"Op_assign",
"Op_and",
"Op_or",
"LeftParen",
"RightParen",
"LeftBrace",
"RightBrace",
"Semicolon",
"Comma",
"Identifier",
"Integer",
"String",
"End_of_input")
 
var lookups : lookups_vt =
@{
pf_toknames = view@ token_names,
pf_wordtab = view@ reserved_words,
pf_toktab = view@ reserved_word_tokens |
toknames = addr@ token_names,
wordtab = addr@ reserved_words,
toktab = addr@ reserved_word_tokens
}
 
val inp =
@{
file = inpf,
pushback = NIL,
line_no = 1ULL,
column_no = 1ULL
}
 
val _ = scan_text (outf, inp, lookups)
 
val @{
pf_toknames = pf_toknames,
pf_wordtab = pf_wordtab,
pf_toktab = pf_toktab |
toknames = toknames,
wordtab = wordtab,
toktab = toktab
} = lookups
prval _ = view@ token_names := pf_toknames
prval _ = view@ reserved_words := pf_wordtab
prval _ = view@ reserved_word_tokens := pf_toktab
in
0
end
 
macdef lex_error = "Lexical error: "
 
implement
main (argc, argv) =
let
val inpfname =
if 2 <= argc then
$UN.cast{string} argv[1]
else
"-"
val outfname =
if 3 <= argc then
$UN.cast{string} argv[2]
else
"-"
in
try
let
val inpf =
if (inpfname : string) = "-" then
stdin_ref
else
fileref_open_exn (inpfname, file_mode_r)
 
val outf =
if (outfname : string) = "-" then
stdout_ref
else
fileref_open_exn (outfname, file_mode_w)
in
main_program (inpf, outf)
end
with
| ~ unterminated_comment (line_no, column_no) =>
begin
fprintln! (stderr_ref, lex_error,
"unterminated comment starting at ",
line_no, ":", column_no);
1
end
| ~ unterminated_character_literal (line_no, column_no) =>
begin
fprintln! (stderr_ref, lex_error,
"unterminated character literal starting at ",
line_no, ":", column_no);
1
end
| ~ multicharacter_literal (line_no, column_no) =>
begin
fprintln! (stderr_ref, lex_error,
"unsupported multicharacter literal starting at ",
line_no, ":", column_no);
1
end
| ~ unterminated_string_literal (line_no, column_no,
end_of_line) =>
let
val s =
begin
if end_of_line then
"end of line"
else
"end of input"
end : String
in
fprintln! (stderr_ref, lex_error,
"unterminated string literal (", s,
") starting at ", line_no, ":", column_no);
1
end
| ~ unsupported_escape (line_no, column_no, c) =>
begin
fprintln! (stderr_ref, lex_error,
"unsupported escape \\",
int2char0 c, " starting at ",
line_no, ":", column_no);
1
end
| ~ invalid_integer_literal (line_no, column_no, s) =>
begin
fprintln! (stderr_ref, lex_error,
"invalid integer literal ", s,
" starting at ", line_no, ":", column_no);
1
end
| ~ unexpected_character (line_no, column_no, c) =>
begin
fprintln! (stderr_ref, lex_error,
"unexpected character '", int2char0 c,
"' at ", line_no, ":", column_no);
1
end
end
 
(********************************************************************)</syntaxhighlight>
 
{{out}}
<pre>$ patscc -O2 -DATS_MEMALLOC_GCBDW -o lex lex-in-ATS.dats -lgc && ./lex compiler-tests/testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|AWK}}==
Tested with gawk 4.1.1 and mawk 1.3.4.
<syntaxhighlight lang="awk">
BEGIN {
all_syms["tk_EOI" ] = "End_of_input"
}
}
</syntaxhighlight>
{{out|case=count}}
<b>
 
=={{header|C}}==
Tested with gcc 4.8.1 and later; compiles warning-free with -Wpedantic -pedantic -Wall -Wextra.
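For example, a typical build line might be (file names are arbitrary):
<pre>gcc -Wpedantic -pedantic -Wall -Wextra -o lex lex.c</pre>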
<syntaxhighlight lang="c">#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
da_dim(text, char);
 
tok_s gettok(void);
 
static void error(int err_line, int err_col, const char *fmt, ... ) {
}
 
static int next_ch(void) { /* get next char from input */
the_ch = getc(source_fp);
++col;
static TokenType get_ident_type(const char *ident) {
static struct {
const char *s;
TokenType sym;
} kwds[] = {
}
 
tok_s gettok(void) { /* return the token type */
/* skip white space */
while (isspace(the_ch))
}
 
void run(void) { /* tokenize the given input */
tok_s tok;
do {
run();
return 0;
}</syntaxhighlight>
 
{{out|case=test case 3}}
=={{header|C sharp|C#}}==
Requires C# 6.0 because of the use of null-coalescing operators.
<syntaxhighlight lang="csharp">
using System;
using System.IO;
}
}
</syntaxhighlight>
 
{{out|case=test case 3}}
</pre>
</b>
 
=={{header|C++}}==
Tested with GCC 9.3.0 (g++ -std=c++17)
<syntaxhighlight lang="cpp">#include <charconv> // std::from_chars
#include <fstream> // file_to_string, string_to_file
#include <functional> // std::invoke
#include <iomanip> // std::setw
#include <ios> // std::left
#include <iostream>
#include <map> // keywords
#include <sstream>
#include <string>
#include <utility> // std::forward
#include <variant> // TokenVal
 
using namespace std;
 
// =====================================================================================================================
// Machinery
// =====================================================================================================================
string file_to_string (const string& path)
{
// Open file
ifstream file {path, ios::in | ios::binary | ios::ate};
if (!file) throw (errno);
 
// Allocate string memory
string contents;
contents.resize(file.tellg());
 
// Read file contents into string
file.seekg(0);
file.read(contents.data(), contents.size());
 
return contents;
}
 
void string_to_file (const string& path, string contents)
{
ofstream file {path, ios::out | ios::binary};
if (!file) throw (errno);
 
file.write(contents.data(), contents.size());
}
 
template <class F>
void with_IO (string source, string destination, F&& f)
{
string input;
 
if (source == "stdin") getline(cin, input);
else input = file_to_string(source);
 
string output = invoke(forward<F>(f), input);
 
if (destination == "stdout") cout << output;
else string_to_file(destination, output);
}
 
// Add escaped newlines and backslashes back in for printing
string sanitize (string s)
{
for (auto i = 0u; i < s.size(); ++i)
{
if (s[i] == '\n') s.replace(i++, 1, "\\n");
else if (s[i] == '\\') s.replace(i++, 1, "\\\\");
}
 
return s;
}
 
class Scanner
{
public:
const char* pos;
int line = 1;
int column = 1;
 
Scanner (const char* source) : pos {source} {}
 
inline char peek () { return *pos; }
 
void advance ()
{
if (*pos == '\n') { ++line; column = 1; }
else ++column;
 
++pos;
}
 
char next ()
{
advance();
return peek();
}
 
void skip_whitespace ()
{
while (isspace(static_cast<unsigned char>(peek())))
advance();
}
}; // class Scanner
 
 
// =====================================================================================================================
// Tokens
// =====================================================================================================================
enum class TokenName
{
OP_MULTIPLY, OP_DIVIDE, OP_MOD, OP_ADD, OP_SUBTRACT, OP_NEGATE,
OP_LESS, OP_LESSEQUAL, OP_GREATER, OP_GREATEREQUAL, OP_EQUAL, OP_NOTEQUAL,
OP_NOT, OP_ASSIGN, OP_AND, OP_OR,
LEFTPAREN, RIGHTPAREN, LEFTBRACE, RIGHTBRACE, SEMICOLON, COMMA,
KEYWORD_IF, KEYWORD_ELSE, KEYWORD_WHILE, KEYWORD_PRINT, KEYWORD_PUTC,
IDENTIFIER, INTEGER, STRING,
END_OF_INPUT, ERROR
};
 
using TokenVal = variant<int, string>;
 
struct Token
{
TokenName name;
TokenVal value;
int line;
int column;
};
 
 
const char* to_cstring (TokenName name)
{
static const char* s[] =
{
"Op_multiply", "Op_divide", "Op_mod", "Op_add", "Op_subtract", "Op_negate",
"Op_less", "Op_lessequal", "Op_greater", "Op_greaterequal", "Op_equal", "Op_notequal",
"Op_not", "Op_assign", "Op_and", "Op_or",
"LeftParen", "RightParen", "LeftBrace", "RightBrace", "Semicolon", "Comma",
"Keyword_if", "Keyword_else", "Keyword_while", "Keyword_print", "Keyword_putc",
"Identifier", "Integer", "String",
"End_of_input", "Error"
};
 
return s[static_cast<int>(name)];
}
 
 
string to_string (Token t)
{
ostringstream out;
out << setw(2) << t.line << " " << setw(2) << t.column << " ";
 
switch (t.name)
{
case (TokenName::IDENTIFIER) : out << "Identifier " << get<string>(t.value); break;
case (TokenName::INTEGER) : out << "Integer " << left << get<int>(t.value); break;
case (TokenName::STRING) : out << "String \"" << sanitize(get<string>(t.value)) << '"'; break;
case (TokenName::END_OF_INPUT) : out << "End_of_input"; break;
case (TokenName::ERROR) : out << "Error " << get<string>(t.value); break;
default : out << to_cstring(t.name);
}
 
out << '\n';
 
return out.str();
}
 
 
// =====================================================================================================================
// Lexer
// =====================================================================================================================
class Lexer
{
public:
Lexer (const char* source) : s {source}, pre_state {s} {}
 
bool has_more () { return s.peek() != '\0'; }
 
Token next_token ()
{
s.skip_whitespace();
 
pre_state = s;
 
switch (s.peek())
{
case '*' : return simply(TokenName::OP_MULTIPLY);
case '%' : return simply(TokenName::OP_MOD);
case '+' : return simply(TokenName::OP_ADD);
case '-' : return simply(TokenName::OP_SUBTRACT);
case '{' : return simply(TokenName::LEFTBRACE);
case '}' : return simply(TokenName::RIGHTBRACE);
case '(' : return simply(TokenName::LEFTPAREN);
case ')' : return simply(TokenName::RIGHTPAREN);
case ';' : return simply(TokenName::SEMICOLON);
case ',' : return simply(TokenName::COMMA);
case '&' : return expect('&', TokenName::OP_AND);
case '|' : return expect('|', TokenName::OP_OR);
case '<' : return follow('=', TokenName::OP_LESSEQUAL, TokenName::OP_LESS);
case '>' : return follow('=', TokenName::OP_GREATEREQUAL, TokenName::OP_GREATER);
case '=' : return follow('=', TokenName::OP_EQUAL, TokenName::OP_ASSIGN);
case '!' : return follow('=', TokenName::OP_NOTEQUAL, TokenName::OP_NOT);
case '/' : return divide_or_comment();
case '\'' : return char_lit();
case '"' : return string_lit();
 
default : if (is_id_start(s.peek())) return identifier();
if (is_digit(s.peek())) return integer_lit();
return error("Unrecognized character '", s.peek(), "'");
 
case '\0' : return make_token(TokenName::END_OF_INPUT);
}
}
 
 
private:
Scanner s;
Scanner pre_state;
static const map<string, TokenName> keywords;
 
 
template <class... Args>
Token error (Args&&... ostream_args)
{
string code {pre_state.pos, (string::size_type) s.column - pre_state.column};
 
ostringstream msg;
(msg << ... << forward<Args>(ostream_args)) << '\n'
<< string(28, ' ') << "(" << s.line << ", " << s.column << "): " << code;
 
if (s.peek() != '\0') s.advance();
 
return make_token(TokenName::ERROR, msg.str());
}
 
 
inline Token make_token (TokenName name, TokenVal value = 0)
{
return {name, value, pre_state.line, pre_state.column};
}
 
 
Token simply (TokenName name)
{
s.advance();
return make_token(name);
}
 
 
Token expect (char expected, TokenName name)
{
if (s.next() == expected) return simply(name);
else return error("Unrecognized character '", s.peek(), "'");
}
 
 
Token follow (char expected, TokenName ifyes, TokenName ifno)
{
if (s.next() == expected) return simply(ifyes);
else return make_token(ifno);
}
 
 
Token divide_or_comment ()
{
if (s.next() != '*') return make_token(TokenName::OP_DIVIDE);
 
while (s.next() != '\0')
{
if (s.peek() == '*' && s.next() == '/')
{
s.advance();
return next_token();
}
}
 
return error("End-of-file in comment. Closing comment characters not found.");
}
 
 
Token char_lit ()
{
int n = s.next();
 
if (n == '\'') return error("Empty character constant");
 
if (n == '\\') switch (s.next())
{
case 'n' : n = '\n'; break;
case '\\' : n = '\\'; break;
default : return error("Unknown escape sequence \\", s.peek());
}
 
if (s.next() != '\'') return error("Multi-character constant");
 
s.advance();
return make_token(TokenName::INTEGER, n);
}
 
 
Token string_lit ()
{
string text = "";
 
while (s.next() != '"')
switch (s.peek())
{
case '\\' : switch (s.next())
{
case 'n' : text += '\n'; continue;
case '\\' : text += '\\'; continue;
default : return error("Unknown escape sequence \\", s.peek());
}
 
case '\n' : return error("End-of-line while scanning string literal."
" Closing string character not found before end-of-line.");
 
case '\0' : return error("End-of-file while scanning string literal."
" Closing string character not found.");
 
default : text += s.peek();
}
 
s.advance();
return make_token(TokenName::STRING, text);
}
 
 
static inline bool is_id_start (char c) { return isalpha(static_cast<unsigned char>(c)) || c == '_'; }
static inline bool is_id_end (char c) { return isalnum(static_cast<unsigned char>(c)) || c == '_'; }
static inline bool is_digit (char c) { return isdigit(static_cast<unsigned char>(c)); }
 
 
Token identifier ()
{
string text (1, s.peek());
 
while (is_id_end(s.next())) text += s.peek();
 
auto i = keywords.find(text);
if (i != keywords.end()) return make_token(i->second);
 
return make_token(TokenName::IDENTIFIER, text);
}
 
 
Token integer_lit ()
{
while (is_digit(s.next()));
 
if (is_id_start(s.peek()))
return error("Invalid number. Starts like a number, but ends in non-numeric characters.");
 
int n;
 
auto r = from_chars(pre_state.pos, s.pos, n);
if (r.ec == errc::result_out_of_range) return error("Number exceeds maximum value");
 
return make_token(TokenName::INTEGER, n);
}
}; // class Lexer
 
 
const map<string, TokenName> Lexer::keywords =
{
{"else", TokenName::KEYWORD_ELSE},
{"if", TokenName::KEYWORD_IF},
{"print", TokenName::KEYWORD_PRINT},
{"putc", TokenName::KEYWORD_PUTC},
{"while", TokenName::KEYWORD_WHILE}
};
 
 
int main (int argc, char* argv[])
{
string in = (argc > 1) ? argv[1] : "stdin";
string out = (argc > 2) ? argv[2] : "stdout";
 
with_IO(in, out, [](string input)
{
Lexer lexer {input.data()};
 
string s = "Location Token name Value\n"
"--------------------------------------\n";
 
while (lexer.has_more()) s += to_string(lexer.next_token());
return s;
});
}
</syntaxhighlight>
 
{{out|case=test case 3}}
<pre>
Location Token name Value
--------------------------------------
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input
</pre>
 
=={{header|COBOL}}==
Using GnuCOBOL 2. By Steve Williams (with one change to get around a Rosetta Code code highlighter problem).
 
<syntaxhighlight lang="cobol"> >>SOURCE FORMAT IS FREE
*> this code is dedicated to the public domain
*> (GnuCOBOL) 2.3-dev.0
end-if
.
end program lexer.</syntaxhighlight>
 
{{out|case=test case 3}}
Lisp has a built-in reader and you can customize the reader by modifying its readtable. I'm also using the Gray stream, which is an almost standard feature of Common Lisp, for counting lines and columns.
 
<syntaxhighlight lang="lisp">(defpackage #:lexical-analyzer
(:use #:cl #:sb-gray)
(:export #:main))
 
(defun main ()
(lex *standard-input*))</syntaxhighlight>
{{out|case=test case 3}}
<pre> 5 16 KEYWORD-PRINT
22 26 INTEGER 32
23 1 END-OF-INPUT </pre>
 
=={{header|Elixir}}==
{{works with|Elixir|1.13.3}}
{{trans|ATS}}
 
<syntaxhighlight lang="elixir">#!/bin/env elixir
# -*- elixir -*-
 
defmodule Lex do
 
def main args do
{inpf_name, outf_name, exit_status} =
case args do
[] -> {"-", "-", 0}
[name] -> {name, "-", 0}
[name1, name2] -> {name1, name2, 0}
[name1, name2 | _] -> {name1, name2, usage_error()}
end
 
{inpf, outf, exit_status} =
case {inpf_name, outf_name, exit_status} do
{"-", "-", 0} -> {:stdio, :stdio, 0}
{name1, "-", 0} ->
{inpf, exit_status} = open_file(name1, [:read])
{inpf, :stdio, exit_status}
{"-", name2, 0} ->
{outf, exit_status} = open_file(name2, [:write])
{:stdio, outf, exit_status}
{name1, name2, 0} ->
{inpf, exit_status} = open_file(name1, [:read])
if exit_status != 0 do
{inpf, name2, exit_status}
else
{outf, exit_status} = open_file(name2, [:write])
{inpf, outf, exit_status}
end
_ -> {inpf_name, outf_name, exit_status}
end
 
exit_status =
case exit_status do
0 -> main_program inpf, outf
_ -> exit_status
end
 
# Choose one.
System.halt exit_status # Fast exit.
#System.stop exit_status # Laborious cleanup.
end
 
def main_program inpf, outf do
inp = make_inp inpf
scan_text outf, inp
exit_status = 0
exit_status
end
 
def open_file name, rw do
case File.open name, rw do
{:ok, f} -> {f, 0}
_ ->
IO.write :stderr, "Cannot open "
IO.write :stderr, name
case rw do
[:read] -> IO.puts " for input"
[:write] -> IO.puts " for output"
end
{name, 1}
end
end
 
def scan_text outf, inp do
{toktup, inp} = get_next_token inp
print_token outf, toktup
case toktup do
{"End_of_input", _, _, _} -> :ok
_ -> scan_text outf, inp
end
end
 
def print_token outf, {tok, arg, line_no, column_no} do
IO.write outf, (String.pad_leading "#{line_no}", 5)
IO.write outf, " "
IO.write outf, (String.pad_leading "#{column_no}", 5)
IO.write outf, " "
IO.write outf, tok
case tok do
"Identifier" ->
IO.write outf, " "
IO.write outf, arg
"Integer" ->
IO.write outf, " "
IO.write outf, arg
"String" ->
IO.write outf, " "
IO.write outf, arg
_ -> :ok
end
IO.puts outf, ""
end
 
###-------------------------------------------------------------------
###
### The token dispatcher.
###
 
def get_next_token inp do
inp = skip_spaces_and_comments inp
{ch, inp} = get_ch inp
{chr, line_no, column_no} = ch
ln = line_no
cn = column_no
case chr do
:eof -> {{"End_of_input", "", ln, cn}, inp}
"," -> {{"Comma", ",", ln, cn}, inp}
";" -> {{"Semicolon", ";", ln, cn}, inp}
"(" -> {{"LeftParen", "(", ln, cn}, inp}
")" -> {{"RightParen", ")", ln, cn}, inp}
"{" -> {{"LeftBrace", "{", ln, cn}, inp}
"}" -> {{"RightBrace", "}", ln, cn}, inp}
"*" -> {{"Op_multiply", "*", ln, cn}, inp}
"/" -> {{"Op_divide", "/", ln, cn}, inp}
"%" -> {{"Op_mod", "%", ln, cn}, inp}
"+" -> {{"Op_add", "+", ln, cn}, inp}
"-" -> {{"Op_subtract", "-", ln, cn}, inp}
"<" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"=" -> {{"Op_lessequal", "<=", ln, cn}, inp}
_ -> {{"Op_less", "<", ln, cn}, (push_back ch1, inp)}
end
">" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"=" -> {{"Op_greaterequal", ">=", ln, cn}, inp}
_ -> {{"Op_greater", ">", ln, cn}, (push_back ch1, inp)}
end
"=" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"=" -> {{"Op_equal", "==", ln, cn}, inp}
_ -> {{"Op_assign", "=", ln, cn}, (push_back ch1, inp)}
end
"!" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"=" -> {{"Op_notequal", "!=", ln, cn}, inp}
_ -> {{"Op_not", "!", ln, cn}, (push_back ch1, inp)}
end
"&" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"&" -> {{"Op_and", "&&", ln, cn}, inp}
_ -> unexpected_character ln, cn, chr
end
"|" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"|" -> {{"Op_or", "||", ln, cn}, inp}
_ -> unexpected_character ln, cn, chr
end
"\"" ->
inp = push_back ch, inp
scan_string_literal inp
"'" ->
inp = push_back ch, inp
scan_character_literal inp
_ ->
cond do
String.match? chr, ~r/^[[:digit:]]$/u ->
inp = push_back ch, inp
scan_integer_literal inp
String.match? chr, ~r/^[[:alpha:]_]$/u ->
inp = push_back ch, inp
scan_identifier_or_reserved_word inp
true -> unexpected_character ln, cn, chr
end
end
end
 
###-------------------------------------------------------------------
###
### Skipping past spaces and /* ... */ comments.
###
### Comments are treated exactly like a bit of whitespace. They never
### make it to the dispatcher.
###
 
def skip_spaces_and_comments inp do
{ch, inp} = get_ch inp
{chr, line_no, column_no} = ch
cond do
chr == :eof -> push_back ch, inp
String.match? chr, ~r/^[[:space:]]$/u ->
skip_spaces_and_comments inp
chr == "/" ->
{ch1, inp} = get_ch inp
case ch1 do
{"*", _, _} ->
inp = scan_comment inp, line_no, column_no
skip_spaces_and_comments inp
_ -> push_back ch, (push_back ch1, inp)
end
true -> push_back ch, inp
end
end
 
def scan_comment inp, line_no, column_no do
{ch, inp} = get_ch inp
case ch do
{:eof, _, _} -> unterminated_comment line_no, column_no
{"*", _, _} ->
{ch1, inp} = get_ch inp
case ch1 do
{:eof, _, _} -> unterminated_comment line_no, column_no
{"/", _, _} -> inp
_ -> scan_comment inp, line_no, column_no
end
_ -> scan_comment inp, line_no, column_no
end
end
 
###-------------------------------------------------------------------
###
### Scanning of integer literals, identifiers, and reserved words.
###
### These three types of token are very similar to each other.
###
 
def scan_integer_literal inp do
# Scan an entire word, not just digits. This way we detect
# erroneous text such as "23skidoo".
{line_no, column_no, inp} = get_position inp
{word, inp} = scan_word inp
if String.match? word, (~r/^[[:digit:]]+$/u) do
{{"Integer", word, line_no, column_no}, inp}
else
invalid_integer_literal line_no, column_no, word
end
end
 
def scan_identifier_or_reserved_word inp do
# It is assumed that the first character is of the correct type,
# thanks to the dispatcher.
{line_no, column_no, inp} = get_position inp
{word, inp} = scan_word inp
tok =
case word do
"if" -> "Keyword_if"
"else" -> "Keyword_else"
"while" -> "Keyword_while"
"print" -> "Keyword_print"
"putc" -> "Keyword_putc"
_ -> "Identifier"
end
{{tok, word, line_no, column_no}, inp}
end
 
def scan_word inp, word\\"" do
{ch, inp} = get_ch inp
{chr, _, _} = ch
if String.match? chr, (~r/^[[:alnum:]_]$/u) do
scan_word inp, (word <> chr)
else
{word, (push_back ch, inp)}
end
end
 
def get_position inp do
{ch, inp} = get_ch inp
{_, line_no, column_no} = ch
inp = push_back ch, inp
{line_no, column_no, inp}
end
 
###-------------------------------------------------------------------
###
### Scanning of string literals.
###
### It is assumed that the first character is the opening quote, and
### that the closing quote is the same character.
###
 
def scan_string_literal inp do
{ch, inp} = get_ch inp
{quote_mark, line_no, column_no} = ch
{contents, inp} = scan_str_lit inp, ch
{{"String", quote_mark <> contents <> quote_mark,
line_no, column_no},
inp}
end
 
def scan_str_lit inp, ch, contents\\"" do
{quote_mark, line_no, column_no} = ch
{ch1, inp} = get_ch inp
{chr1, line_no1, column_no1} = ch1
if chr1 == quote_mark do
{contents, inp}
else
case chr1 do
:eof -> eoi_in_string_literal line_no, column_no
"\n" -> eoln_in_string_literal line_no, column_no
"\\" ->
{ch2, inp} = get_ch inp
{chr2, _, _} = ch2
case chr2 do
"n" -> scan_str_lit inp, ch, (contents <> "\\n")
"\\" -> scan_str_lit inp, ch, (contents <> "\\\\")
_ -> unsupported_escape line_no1, column_no1, chr2
end
_ -> scan_str_lit inp, ch, (contents <> chr1)
end
end
end
 
###-------------------------------------------------------------------
###
### Scanning of character literals.
###
### It is assumed that the first character is the opening quote, and
### that the closing quote is the same character.
###
### The tedious part of scanning a character literal is distinguishing
### between the kinds of lexical error. (One might wish to modify the
### code to detect, as a distinct kind of error, end of line within a
### character literal.)
###
 
def scan_character_literal inp do
{ch, inp} = get_ch inp
{_, line_no, column_no} = ch
{ch1, inp} = get_ch inp
{chr1, line_no1, column_no1} = ch1
{intval, inp} =
case chr1 do
:eof -> unterminated_character_literal line_no, column_no
"\\" ->
{ch2, inp} = get_ch inp
{chr2, _, _} = ch2
case chr2 do
:eof -> unterminated_character_literal line_no, column_no
"n" -> {(:binary.first "\n"), inp}
"\\" -> {(:binary.first "\\"), inp}
_ -> unsupported_escape line_no1, column_no1, chr2
end
_ -> {(:binary.first chr1), inp}
end
inp = check_character_literal_end inp, ch
{{"Integer", "#{intval}", line_no, column_no}, inp}
end
 
def check_character_literal_end inp, ch do
{chr, _, _} = ch
{{chr1, _, _}, inp} = get_ch inp
if chr1 == chr do
inp
else
# Lexical error.
find_char_lit_end inp, ch
end
end
 
def find_char_lit_end inp, ch do
{chr, line_no, column_no} = ch
{{chr1, _, _}, inp} = get_ch inp
if chr1 == chr do
multicharacter_literal line_no, column_no
else
case chr1 do
:eof -> unterminated_character_literal line_no, column_no
_ -> find_char_lit_end inp, ch
end
end
end
 
###-------------------------------------------------------------------
###
### Character-at-a-time input, with unrestricted pushback, and with
### line and column numbering.
###
 
def make_inp inpf do
{inpf, [], 1, 1}
end
 
def get_ch {inpf, pushback, line_no, column_no} do
case pushback do
[head | tail] ->
{head, {inpf, tail, line_no, column_no}}
[] ->
case IO.read(inpf, 1) do
:eof ->
{{:eof, line_no, column_no},
{inpf, pushback, line_no, column_no}}
{:error, _} ->
{{:eof, line_no, column_no},
{inpf, pushback, line_no, column_no}}
chr ->
case chr do
"\n" ->
{{chr, line_no, column_no},
{inpf, pushback, line_no + 1, 1}}
_ ->
{{chr, line_no, column_no},
{inpf, pushback, line_no, column_no + 1}}
end
end
end
end
 
def push_back ch, {inpf, pushback, line_no, column_no} do
{inpf, [ch | pushback], line_no, column_no}
end
 
###-------------------------------------------------------------------
###
### Lexical and usage errors.
###
 
def unterminated_comment line_no, column_no do
raise "#{scriptname()}: unterminated comment at #{line_no}:#{column_no}"
end
 
def invalid_integer_literal line_no, column_no, word do
raise "#{scriptname()}: invalid integer literal #{word} at #{line_no}:#{column_no}"
end
 
def unsupported_escape line_no, column_no, chr do
raise "#{scriptname()}: unsupported escape \\#{chr} at #{line_no}:#{column_no}"
end
 
def eoi_in_string_literal line_no, column_no do
raise "#{scriptname()}: end of input in string literal starting at #{line_no}:#{column_no}"
end
 
def eoln_in_string_literal line_no, column_no do
raise "#{scriptname()}: end of line in string literal starting at #{line_no}:#{column_no}"
end
 
def multicharacter_literal line_no, column_no do
raise "#{scriptname()}: unsupported multicharacter literal at #{line_no}:#{column_no}"
end
 
def unterminated_character_literal line_no, column_no do
raise "#{scriptname()}: unterminated character literal starting at #{line_no}:#{column_no}"
end
 
def unexpected_character line_no, column_no, chr do
raise "#{scriptname()}: unexpected character '#{chr}' at #{line_no}:#{column_no}"
end
 
def usage_error() do
IO.puts "Usage: #{scriptname()} [INPUTFILE [OUTPUTFILE]]"
IO.puts "If either of INPUTFILE or OUTPUTFILE is not present or is \"-\","
IO.puts "standard input or standard output is used, respectively."
exit_status = 2
exit_status
end
 
def scriptname() do
Path.basename(__ENV__.file)
end
 
#---------------------------------------------------------------------
 
end ## module Lex
 
Lex.main(System.argv)</syntaxhighlight>
 
{{out}}
<pre>$ ./lex testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
 
 
=={{header|Emacs Lisp}}==
{{works with|Emacs|GNU 27.2}}
{{trans|ATS}}
 
 
<syntaxhighlight lang="lisp">#!/usr/bin/emacs --script
;;
;; The Rosetta Code lexical analyzer in GNU Emacs Lisp.
;;
;; Migrated from the ATS. However, Emacs Lisp is not friendly to the
;; functional style of the ATS implementation; therefore the
;; differences are vast.
;;
;; (A Scheme migration could easily, on the other hand, have been
;; almost exact. It is interesting to contrast Lisp dialects and see
;; how huge the differences are.)
;;
;; The script currently takes input only from standard input and
;; writes the token stream only to standard output.
;;
 
(require 'cl-lib)
 
;;; The type of a character, consisting of its code point and where it
;;; occurred in the text.
(cl-defstruct (ch_t (:constructor make-ch (ichar line-no column-no)))
ichar line-no column-no)
 
(defun ch-ichar (ch)
(ch_t-ichar ch))
 
(defun ch-line-no (ch)
(ch_t-line-no ch))
 
(defun ch-column-no (ch)
(ch_t-column-no ch))
 
;;; The type of an "inputter", consisting of an open file for the
;;; text, a pushback buffer (which is an indefinitely deep stack of
;;; ch_t), an input buffer for the current line, and a position in the
;;; text.
(cl-defstruct inp_t file pushback line line-no column-no)
 
(defun make-inp (file)
"Initialize a new inp_t."
(make-inp_t :file file
:pushback '()
:line ""
:line-no 0
:column-no 0))
 
(defvar inp (make-inp t)
"A global inp_t.")
 
(defun get-ch ()
"Get a ch_t, either from the pushback buffer or from the input."
(pcase (inp_t-pushback inp)
(`(,ch . ,tail)
;; Emacs Lisp has only single value return, so the results come
;; back as a list rather than multiple values.
(setq inp (make-inp_t :file (inp_t-file inp)
:pushback tail
:line (inp_t-line inp)
:line-no (inp_t-line-no inp)
:column-no (inp_t-column-no inp)))
ch)
('()
(let ((line (inp_t-line inp))
(line-no (inp_t-line-no inp))
(column-no (inp_t-column-no inp)))
(when (string= line "")
;; Refill the buffer.
(let ((text
(condition-case nil (read-string "")
nil (error 'eoi))))
(if (eq text 'eoi)
(setq line 'eoi)
(setq line (format "%s%c" text ?\n)))
(setq line-no (1+ line-no))
(setq column-no 1)))
(if (eq line 'eoi)
(progn
(setq inp (make-inp_t :file (inp_t-file inp)
:pushback (inp_t-pushback inp)
:line line
:line-no line-no
:column-no column-no))
(make-ch 'eoi line-no column-no))
(let ((c (elt line 0))
(line (substring line 1)))
(setq inp (make-inp_t :file (inp_t-file inp)
:pushback (inp_t-pushback inp)
:line line
:line-no line-no
:column-no (1+ column-no)))
(make-ch c line-no column-no)))))))
 
(defun get-new-line (file)
;; Currently "file" is ignored and the input must be from stdin.
(read-from-minibuffer "" :default 'eoi))
 
(defun push-back (ch)
"Push back a ch_t."
(setq inp (make-inp_t :file (inp_t-file inp)
:pushback (cons ch (inp_t-pushback inp))
:line (inp_t-line inp)
:line-no (inp_t-line-no inp)
:column-no (inp_t-column-no inp))))
 
(defun get-position ()
"Return the line-no and column-no of the next ch_t to be
returned by get-ch, assuming there are no more pushbacks
beforehand."
(let* ((ch (get-ch))
(line-no (ch-line-no ch))
(column-no (ch-column-no ch)))
(push-back ch)
(list line-no column-no)))
 
(defun scan-text (outf)
"The main loop."
(cl-loop for toktup = (get-next-token)
do (print-token outf toktup)
until (string= (elt toktup 0) "End_of_input")))
 
(defun print-token (outf toktup)
"Print a token, along with its position and possibly an
argument."
;; Currently outf is ignored, and the output goes to stdout.
(pcase toktup
(`(,tok ,arg ,line-no ,column-no)
(princ (format "%5d %5d %s" line-no column-no tok))
(pcase tok
("Identifier" (princ (format " %s\n" arg)))
("Integer" (princ (format " %s\n" arg)))
("String" (princ (format " %s\n" arg)))
(_ (princ "\n"))))))
 
(defun get-next-token ()
"The token dispatcher. Returns the next token, as a list along
with its argument and text position."
(skip-spaces-and-comments)
(let* ((ch (get-ch))
(ln (ch-line-no ch))
(cn (ch-column-no ch)))
(pcase (ch-ichar ch)
('eoi (list "End_of_input" "" ln cn))
(?, (list "Comma" "," ln cn))
(?\N{SEMICOLON} (list "Semicolon" ";" ln cn))
(?\N{LEFT PARENTHESIS} (list "LeftParen" "(" ln cn))
(?\N{RIGHT PARENTHESIS} (list "RightParen" ")" ln cn))
(?{ (list "LeftBrace" "{" ln cn))
(?} (list "RightBrace" "}" ln cn))
(?* (list "Op_multiply" "*" ln cn))
(?/ (list "Op_divide" "/" ln cn))
(?% (list "Op_mod" "%" ln cn))
(?+ (list "Op_add" "+" ln cn))
(?- (list "Op_subtract" "-" ln cn))
(?< (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?= (list "Op_lessequal" "<=" ln cn))
(_ (push-back ch1)
(list "Op_less" "<" ln cn)))))
(?> (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?= (list "Op_greaterequal" ">=" ln cn))
(_ (push-back ch1)
(list "Op_greater" ">" ln cn)))))
(?= (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?= (list "Op_equal" "==" ln cn))
(_ (push-back ch1)
(list "Op_assign" "=" ln cn)))))
(?! (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?= (list "Op_notequal" "!=" ln cn))
(_ (push-back ch1)
(list "Op_not" "!" ln cn)))))
(?& (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?& (list "Op_and" "&&" ln cn))
(_ (unexpected-character ln cn (ch-ichar ch))))))
(?| (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?| (list "Op_or" "||" ln cn))
(_ (unexpected-character ln cn (ch-ichar ch))))))
(?\N{QUOTATION MARK} (push-back ch) (scan-string-literal))
(?\N{APOSTROPHE} (push-back ch) (scan-character-literal))
((pred digitp) (push-back ch) (scan-integer-literal))
((pred identifier-start-p)
(progn
(push-back ch)
(scan-identifier-or-reserved-word)))
(c (unexpected-character ln cn c)))))
 
(defun skip-spaces-and-comments ()
"Skip spaces and comments. A comment is treated as equivalent
to a run of spaces."
(cl-loop for ch = (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?/ (let* ((ch2 (get-ch))
(line-no (ch-line-no ch1))
(column-no (ch-column-no ch1))
(position `(,line-no ,column-no)))
(pcase (ch-ichar ch2)
(?* (scan-comment position)
(get-ch))
(_ (push-back ch2)
ch1))))
(_ ch1)))
while (spacep (ch-ichar ch))
finally do (push-back ch)))
 
(defun scan-comment (position)
(cl-loop for ch = (get-ch)
for done = (comment-done-p ch position)
until done))
 
(defun comment-done-p (ch position)
(pcase (ch-ichar ch)
('eoi (apply 'unterminated-comment position))
(?* (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
('eoi (apply 'unterminated-comment position))
(?/ t)
(_ nil))))
(_ nil)))
 
(defun scan-integer-literal ()
"Scan an integer literal, on the assumption that a digit has
been seen and pushed back."
(let* ((position (get-position))
(lst (scan-word))
(s (list-to-string lst)))
(if (all-digits-p lst)
`("Integer" ,s . ,position)
(apply 'illegal-integer-literal `(,@position , s)))))
 
(defun scan-identifier-or-reserved-word ()
"Scan an identifier or reserved word, on the assumption that a
legal first character (for an identifier) has been seen and
pushed back."
(let* ((position (get-position))
(lst (scan-word))
(s (list-to-string lst))
(tok (pcase s
("else" "Keyword_else")
("if" "Keyword_if")
("while" "Keyword_while")
("print" "Keyword_print")
("putc" "Keyword_putc")
(_ "Identifier"))))
`(,tok ,s . ,position)))
 
(defun scan-word ()
(cl-loop for ch = (get-ch)
while (identifier-continuation-p (ch-ichar ch))
collect (ch-ichar ch)
finally do (push-back ch)))
 
(defun scan-string-literal ()
"Scan a string literal, on the assumption that a double quote
has been seen and pushed back."
(let* ((ch (get-ch))
(_ (cl-assert (= (ch-ichar ch) ?\N{QUOTATION MARK})))
(line-no (ch-line-no ch))
(column-no (ch-column-no ch))
(position `(,line-no ,column-no))
(lst (scan-str-lit position))
(lst `(?\N{QUOTATION MARK} ,@lst ?\N{QUOTATION MARK})))
`("String" ,(list-to-string lst) . ,position)))
 
(defun scan-str-lit (position)
(flatten
(cl-loop for ch = (get-ch)
until (= (ch-ichar ch) ?\N{QUOTATION MARK})
collect (process-str-lit-character
(ch-ichar ch) position))))
 
(defun process-str-lit-character (c position)
;; NOTE: This script might insert a newline before any eoi, so that
;; "end-of-input-in-string-literal" never actually occurs. It is a
;; peculiarity of the script's input mechanism.
(pcase c
('eoi (apply 'end-of-input-in-string-literal position))
(?\n (apply 'end-of-line-in-string-literal position))
(?\\ (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?n '(?\\ ?n))
(?\\ '(?\\ ?\\))
(c (unsupported-escape (ch-line-no ch1)
(ch-column-no ch1)
c)))))
(c c)))
 
(defun scan-character-literal ()
"Scan a character literal, on the assumption that an ASCII
single quote (that is, a Unicode APOSTROPHE) has been seen and
pushed back."
(let* ((toktup (scan-character-literal-without-checking-end))
(line-no (elt toktup 2))
(column-no (elt toktup 3))
(position (list line-no column-no)))
(check-char-lit-end position)
toktup))
 
(defun check-char-lit-end (position)
(let ((ch (get-ch)))
(unless (and (integerp (ch-ichar ch))
(= (ch-ichar ch) ?\N{APOSTROPHE}))
(push-back ch)
(loop-to-char-lit-end position))))
 
(defun loop-to-char-lit-end (position)
(cl-loop for ch = (get-ch)
until (or (eq (ch-ichar ch) 'eoi)
(= (ch-ichar ch) ?\N{APOSTROPHE}))
finally do (if (eq (ch-ichar ch) 'eoi)
(apply 'unterminated-character-literal
position)
(apply 'multicharacter-literal position))))
 
(defun scan-character-literal-without-checking-end ()
(let* ((ch (get-ch))
(_ (cl-assert (= (ch-ichar ch) ?\N{APOSTROPHE})))
(line-no (ch-line-no ch))
(column-no (ch-column-no ch))
(position (list line-no column-no))
(ch1 (get-ch)))
(pcase (ch-ichar ch1)
('eoi (apply 'unterminated-character-literal position))
(?\\ (let ((ch2 (get-ch)))
(pcase (ch-ichar ch2)
('eoi (apply 'unterminated-character-literal position))
(?n `("Integer" ,(format "%d" ?\n) . ,position))
(?\\ `("Integer" ,(format "%d" ?\\) . ,position))
(c (unsupported-escape (ch-line-no ch1)
(ch-column-no ch1)
c)))))
(c `("Integer" ,(format "%d" c) . ,position)))))
 
(defun spacep (c)
(and (integerp c) (or (= c ?\N{SPACE})
(and (<= 9 c) (<= c 13)))))
 
(defun digitp (c)
(and (integerp c) (<= ?0 c) (<= c ?9)))
 
(defun lowerp (c)
;; Warning: in EBCDIC, this kind of test for "alphabetic" is no
;; good. The letters are not contiguous.
(and (integerp c) (<= ?a c) (<= c ?z)))
 
(defun upperp (c)
;; Warning: in EBCDIC, this kind of test for "alphabetic" is no
;; good. The letters are not contiguous.
(and (integerp c) (<= ?A c) (<= c ?Z)))
 
(defun alphap (c)
(or (lowerp c) (upperp c)))
 
(defun identifier-start-p (c)
(and (integerp c) (or (alphap c) (= c ?_))))
 
(defun identifier-continuation-p (c)
(and (integerp c) (or (alphap c) (= c ?_) (digitp c))))
 
(defun all-digits-p (thing)
(cl-loop for c in thing
if (not (digitp c)) return nil
finally return t))
 
(defun list-to-string (lst)
"Convert a list of characters to a string."
(apply 'string lst))
 
(defun flatten (lst)
"Flatten nested lists. (The implementation is recursive and not
for very long lists.)"
(pcase lst
('() '())
(`(,head . ,tail)
(if (listp head)
(append (flatten head) (flatten tail))
(cons head (flatten tail))))))
 
(defun unexpected-character (line-no column-no c)
(error (format "unexpected character '%c' at %d:%d"
c line-no column-no)))
 
(defun unsupported-escape (line-no column-no c)
(error (format "unsupported escape \\%c at %d:%d"
c line-no column-no)))
 
(defun illegal-integer-literal (line-no column-no s)
(error (format "illegal integer literal \"%s\" at %d:%d"
s line-no column-no)))
 
(defun unterminated-character-literal (line-no column-no)
(error (format "unterminated character literal starting at %d:%d"
line-no column-no)))
 
(defun multicharacter-literal (line-no column-no)
(error (format
"unsupported multicharacter literal starting at %d:%d"
line-no column-no)))
 
(defun end-of-input-in-string-literal (line-no column-no)
(error (format "end of input in string literal starting at %d:%d"
line-no column-no)))
 
(defun end-of-line-in-string-literal (line-no column-no)
(error (format "end of line in string literal starting at %d:%d"
line-no column-no)))
 
(defun unterminated-comment (line-no column-no)
(error (format "unterminated comment starting at %d:%d"
line-no column-no)))
 
(defun main ()
(setq inp (make-inp t))
(scan-text t))
 
(main)</syntaxhighlight>
 
 
{{out}}
<pre>$ ./lex-in-el < compiler-tests/testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|Erlang}}==
{{works with|Erlang|24.3.3}}
{{trans|ATS}}
{{trans|Elixir}}
 
 
<syntaxhighlight lang="erlang">#!/bin/env escript
%%%-------------------------------------------------------------------
 
-record (inp_t, {inpf, pushback, line_no, column_no}).
 
main (Args) ->
main_program (Args).
 
main_program ([]) ->
scan_from_inpf_to_outf ("-", "-"),
halt (0);
main_program ([Inpf_filename]) ->
scan_from_inpf_to_outf (Inpf_filename, "-"),
halt (0);
main_program ([Inpf_filename, Outf_filename]) ->
scan_from_inpf_to_outf (Inpf_filename, Outf_filename),
halt (0);
main_program ([_, _ | _]) ->
ProgName = escript:script_name (),
io:put_chars (standard_error, "Usage: "),
io:put_chars (standard_error, ProgName),
io:put_chars (standard_error, " [INPUTFILE [OUTPUTFILE]]\n"),
halt (1).
 
scan_from_inpf_to_outf ("-", "-") ->
scan_input (standard_io, standard_io);
scan_from_inpf_to_outf (Inpf_filename, "-") ->
case file:open (Inpf_filename, [read]) of
{ok, Inpf} -> scan_input (Inpf, standard_io);
_ -> open_failure (Inpf_filename, "input")
end;
scan_from_inpf_to_outf ("-", Outf_filename) ->
case file:open (Outf_filename, [write]) of
{ok, Outf} -> scan_input (standard_io, Outf);
_ -> open_failure (Outf_filename, "output")
end;
scan_from_inpf_to_outf (Inpf_filename, Outf_filename) ->
case file:open(Inpf_filename, [read]) of
{ok, Inpf} ->
case file:open (Outf_filename, [write]) of
{ok, Outf} -> scan_input (Inpf, Outf);
_ -> open_failure (Outf_filename, "output")
end;
_ -> open_failure (Inpf_filename, "input")
end.
 
open_failure (Filename, ForWhat) ->
ProgName = escript:script_name (),
io:put_chars (standard_error, ProgName),
io:put_chars (standard_error, ": failed to open \""),
io:put_chars (standard_error, Filename),
io:put_chars (standard_error, "\" for "),
io:put_chars (standard_error, ForWhat),
io:put_chars (standard_error, "\n"),
halt (1).
 
scan_input (Inpf, Outf) ->
scan_text (Outf, make_inp (Inpf)).
 
scan_text (Outf, Inp) ->
{TokTup, Inp1} = get_next_token (Inp),
print_token (Outf, TokTup),
case TokTup of
{"End_of_input", _, _, _} -> ok;
_ -> scan_text (Outf, Inp1)
end.
 
print_token (Outf, {Tok, Arg, Line_no, Column_no}) ->
S_line_no = erlang:integer_to_list (Line_no),
S_column_no = erlang:integer_to_list (Column_no),
io:put_chars (Outf, string:pad (S_line_no, 5, leading)),
io:put_chars (Outf, " "),
io:put_chars (Outf, string:pad (S_column_no, 5, leading)),
io:put_chars (Outf, " "),
io:put_chars (Outf, Tok),
{Padding, Arg1} =
case Tok of
"Identifier" -> {" ", Arg};
"Integer" -> {" ", Arg};
"String" -> {" ", Arg};
_ -> {"", ""}
end,
io:put_chars (Outf, Padding),
io:put_chars (Outf, Arg1),
io:put_chars ("\n").
 
%%%-------------------------------------------------------------------
%%%
%%% The token dispatcher.
%%%
 
get_next_token (Inp) ->
Inp00 = skip_spaces_and_comments (Inp),
{Ch, Inp0} = get_ch (Inp00),
{Char, Line_no, Column_no} = Ch,
Ln = Line_no,
Cn = Column_no,
case Char of
eof -> {{"End_of_input", "", Ln, Cn}, Inp0};
"," -> {{"Comma", ",", Ln, Cn}, Inp0};
";" -> {{"Semicolon", ";", Ln, Cn}, Inp0};
"(" -> {{"LeftParen", "(", Ln, Cn}, Inp0};
")" -> {{"RightParen", ")", Ln, Cn}, Inp0};
"{" -> {{"LeftBrace", "{", Ln, Cn}, Inp0};
"}" -> {{"RightBrace", "}", Ln, Cn}, Inp0};
"*" -> {{"Op_multiply", "*", Ln, Cn}, Inp0};
"/" -> {{"Op_divide", "/", Ln, Cn}, Inp0};
"%" -> {{"Op_mod", "%", Ln, Cn}, Inp0};
"+" -> {{"Op_add", "+", Ln, Cn}, Inp0};
"-" -> {{"Op_subtract", "-", Ln, Cn}, Inp0};
"<" ->
{Ch1, Inp1} = get_ch (Inp0),
{Char1, _, _} = Ch1,
case Char1 of
"=" -> {{"Op_lessequal", "<=", Ln, Cn}, Inp1};
_ -> {{"Op_less", "<", Ln, Cn}, push_back (Ch1, Inp1)}
end;
">" ->
{Ch1, Inp1} = get_ch (Inp0),
{Char1, _, _} = Ch1,
case Char1 of
"=" -> {{"Op_greaterequal", ">=", Ln, Cn}, Inp1};
_ -> {{"Op_greater", ">", Ln, Cn}, push_back (Ch1, Inp1)}
end;
"=" ->
{Ch1, Inp1} = get_ch (Inp0),
{Char1, _, _} = Ch1,
case Char1 of
"=" -> {{"Op_equal", "==", Ln, Cn}, Inp1};
_ -> {{"Op_assign", "=", Ln, Cn}, push_back (Ch1, Inp1)}
end;
"!" ->
{Ch1, Inp1} = get_ch (Inp0),
{Char1, _, _} = Ch1,
case Char1 of
"=" -> {{"Op_notequal", "!=", Ln, Cn}, Inp1};
_ -> {{"Op_not", "!", Ln, Cn}, push_back (Ch1, Inp1)}
end;
"&" ->
{Ch1, Inp1} = get_ch (Inp0),
{Char1, _, _} = Ch1,
case Char1 of
"&" -> {{"Op_and", "&&", Ln, Cn}, Inp1};
_ -> unexpected_character (Ln, Cn, Char)
end;
"|" ->
{Ch1, Inp1} = get_ch (Inp0),
{Char1, _, _} = Ch1,
case Char1 of
"|" -> {{"Op_or", "||", Ln, Cn}, Inp1};
_ -> unexpected_character (Ln, Cn, Char)
end;
"\"" ->
Inp1 = push_back (Ch, Inp0),
scan_string_literal (Inp1);
"'" ->
Inp1 = push_back (Ch, Inp0),
scan_character_literal (Inp1);
_ ->
case is_digit (Char) of
true ->
Inp1 = push_back (Ch, Inp0),
scan_integer_literal (Inp1);
false ->
case is_alpha_or_underscore (Char) of
true ->
Inp1 = push_back (Ch, Inp0),
scan_identifier_or_reserved_word (Inp1);
false ->
unexpected_character (Ln, Cn, Char)
end
end
end.
 
%%%-------------------------------------------------------------------
%%%
%%% Skipping past spaces and /* ... */ comments.
%%%
%%% Comments are treated exactly like a bit of whitespace. They never
%%% make it to the dispatcher.
%%%
 
skip_spaces_and_comments (Inp) ->
{Ch, Inp0} = get_ch (Inp),
{Char, Line_no, Column_no} = Ch,
case classify_char (Char) of
eof -> push_back (Ch, Inp0);
space -> skip_spaces_and_comments (Inp0);
slash ->
{Ch1, Inp1} = get_ch (Inp0),
case Ch1 of
{"*", _, _} ->
Inp2 = scan_comment (Inp1, Line_no, Column_no),
skip_spaces_and_comments (Inp2);
_ -> push_back (Ch, (push_back (Ch1, Inp1)))
end;
other -> push_back (Ch, Inp0)
end.
 
classify_char (Char) ->
case Char of
eof -> eof;
"/" -> slash;
_ -> case is_space (Char) of
true -> space;
false -> other
end
end.
 
scan_comment (Inp, Line_no, Column_no) ->
{Ch0, Inp0} = get_ch (Inp),
case Ch0 of
{eof, _, _} -> unterminated_comment (Line_no, Column_no);
{"*", _, _} ->
{Ch1, Inp1} = get_ch (Inp0),
case Ch1 of
{eof, _, _} ->
unterminated_comment (Line_no, Column_no);
{"/", _, _} -> Inp1;
_ -> scan_comment (Inp1, Line_no, Column_no)
end;
_ -> scan_comment (Inp0, Line_no, Column_no)
end.
 
is_space (S) ->
case re:run (S, "^[[:space:]]+$") of
{match, _} -> true;
_ -> false
end.
 
%%%-------------------------------------------------------------------
%%%
%%% Scanning of integer literals, identifiers, and reserved words.
%%%
%%% These three types of token are very similar to each other.
%%%
 
scan_integer_literal (Inp) ->
%% Scan an entire word, not just digits. This way we detect
%% erroneous text such as "23skidoo".
{Line_no, Column_no, Inp1} = get_position (Inp),
{Word, Inp2} = scan_word (Inp1),
case is_digit (Word) of
true -> {{"Integer", Word, Line_no, Column_no}, Inp2};
false -> invalid_integer_literal (Line_no, Column_no, Word)
end.
 
scan_identifier_or_reserved_word (Inp) ->
%% It is assumed that the first character is of the correct type,
%% thanks to the dispatcher.
{Line_no, Column_no, Inp1} = get_position (Inp),
{Word, Inp2} = scan_word (Inp1),
Tok =
case Word of
"if" -> "Keyword_if";
"else" -> "Keyword_else";
"while" -> "Keyword_while";
"print" -> "Keyword_print";
"putc" -> "Keyword_putc";
_ -> "Identifier"
end,
{{Tok, Word, Line_no, Column_no}, Inp2}.
 
scan_word (Inp) ->
scan_word_loop (Inp, "").
 
scan_word_loop (Inp, Word0) ->
{Ch1, Inp1} = get_ch (Inp),
{Char1, _, _} = Ch1,
case is_alnum_or_underscore (Char1) of
true -> scan_word_loop (Inp1, Word0 ++ Char1);
false -> {Word0, push_back (Ch1, Inp1)}
end.
 
get_position (Inp) ->
{Ch1, Inp1} = get_ch (Inp),
{_, Line_no, Column_no} = Ch1,
Inp2 = push_back (Ch1, Inp1),
{Line_no, Column_no, Inp2}.
 
is_digit (S) ->
case re:run (S, "^[[:digit:]]+$") of
{match, _} -> true;
_ -> false
end.
 
is_alpha_or_underscore (S) ->
case re:run (S, "^[[:alpha:]_]+$") of
{match, _} -> true;
_ -> false
end.
 
is_alnum_or_underscore (S) ->
case re:run (S, "^[[:alnum:]_]+$") of
{match, _} -> true;
_ -> false
end.
 
%%%-------------------------------------------------------------------
%%%
%%% Scanning of string literals.
%%%
%%% It is assumed that the first character is the opening quote, and
%%% that the closing quote is the same character.
%%%
 
 
scan_string_literal (Inp) ->
{Ch1, Inp1} = get_ch (Inp),
{Quote_mark, Line_no, Column_no} = Ch1,
{Contents, Inp2} = scan_str_lit (Inp1, Ch1),
Toktup = {"String", Quote_mark ++ Contents ++ Quote_mark,
Line_no, Column_no},
{Toktup, Inp2}.
 
scan_str_lit (Inp, Ch) -> scan_str_lit_loop (Inp, Ch, "").
 
scan_str_lit_loop (Inp, Ch, Contents) ->
{Quote_mark, Line_no, Column_no} = Ch,
{Ch1, Inp1} = get_ch (Inp),
{Char1, Line_no1, Column_no1} = Ch1,
case Char1 of
Quote_mark -> {Contents, Inp1};
eof -> eoi_in_string_literal (Line_no, Column_no);
"\n" -> eoln_in_string_literal (Line_no, Column_no);
"\\" ->
{Ch2, Inp2} = get_ch (Inp1),
{Char2, _, _} = Ch2,
case Char2 of
"n" ->
scan_str_lit_loop (Inp2, Ch, Contents ++ "\\n");
"\\" ->
scan_str_lit_loop (Inp2, Ch, Contents ++ "\\\\");
_ ->
unsupported_escape (Line_no1, Column_no1, Char2)
end;
_ -> scan_str_lit_loop (Inp1, Ch, Contents ++ Char1)
end.
 
%%%-------------------------------------------------------------------
%%%
%%% Scanning of character literals.
%%%
%%% It is assumed that the first character is the opening quote, and
%%% that the closing quote is the same character.
%%%
%%% The tedious part of scanning a character literal is distinguishing
%%% between the kinds of lexical error. (One might wish to modify the
%%% code to detect, as a distinct kind of error, end of line within a
%%% character literal.)
%%%
 
scan_character_literal (Inp) ->
{Ch, Inp0} = get_ch (Inp),
{_, Line_no, Column_no} = Ch,
{Ch1, Inp1} = get_ch (Inp0),
{Char1, Line_no1, Column_no1} = Ch1,
{Intval, Inp3} =
case Char1 of
eof -> unterminated_character_literal (Line_no, Column_no);
"\\" ->
{Ch2, Inp2} = get_ch (Inp1),
{Char2, _, _} = Ch2,
case Char2 of
eof -> unterminated_character_literal (Line_no,
Column_no);
"n" -> {char_to_code ("\n"), Inp2};
"\\" -> {char_to_code ("\\"), Inp2};
_ -> unsupported_escape (Line_no1, Column_no1,
Char2)
end;
_ -> {char_to_code (Char1), Inp1}
end,
Inp4 = check_character_literal_end (Inp3, Ch),
{{"Integer", Intval, Line_no, Column_no}, Inp4}.
 
char_to_code (Char) ->
%% Hat tip to https://archive.ph/BxZRS
lists:flatmap (fun erlang:integer_to_list/1, Char).
 
check_character_literal_end (Inp, Ch) ->
{Char, _, _} = Ch,
{{Char1, _, _}, Inp1} = get_ch (Inp),
case Char1 of
Char -> Inp1;
_ -> find_char_lit_end (Inp1, Ch) % Handle a lexical error.
end.
 
find_char_lit_end (Inp, Ch) ->
%% There is a lexical error. Determine which kind it fits into.
{Char, Line_no, Column_no} = Ch,
{{Char1, _, _}, Inp1} = get_ch (Inp),
case Char1 of
Char -> multicharacter_literal (Line_no, Column_no);
eof -> unterminated_character_literal (Line_no, Column_no);
_ -> find_char_lit_end (Inp1, Ch)
end.
 
%%%-------------------------------------------------------------------
%%%
%%% Character-at-a-time input, with unrestricted pushback, and with
%%% line and column numbering.
%%%
 
make_inp (Inpf) ->
#inp_t{inpf = Inpf,
pushback = [],
line_no = 1,
column_no = 1}.
 
get_ch (Inp) ->
#inp_t{inpf = Inpf,
pushback = Pushback,
line_no = Line_no,
column_no = Column_no} = Inp,
case Pushback of
[Ch | Tail] ->
Inp1 = Inp#inp_t{pushback = Tail},
{Ch, Inp1};
[] ->
case io:get_chars (Inpf, "", 1) of
eof ->
Ch = {eof, Line_no, Column_no},
{Ch, Inp};
{error, _} ->
Ch = {eof, Line_no, Column_no},
{Ch, Inp};
Char ->
case Char of
"\n" ->
Ch = {Char, Line_no, Column_no},
Inp1 = Inp#inp_t{line_no = Line_no + 1,
column_no = 1},
{Ch, Inp1};
_ ->
Ch = {Char, Line_no, Column_no},
Inp1 =
Inp#inp_t{column_no = Column_no + 1},
{Ch, Inp1}
end
end
end.
 
push_back (Ch, Inp) ->
Inp#inp_t{pushback = [Ch | Inp#inp_t.pushback]}.
 
%%%-------------------------------------------------------------------
 
invalid_integer_literal (Line_no, Column_no, Word) ->
error_abort ("invalid integer literal \"" ++
Word ++ "\" at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
unsupported_escape (Line_no, Column_no, Char) ->
error_abort ("unsupported escape \\" ++
Char ++ " at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
unexpected_character (Line_no, Column_no, Char) ->
error_abort ("unexpected character '" ++
Char ++ "' at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
eoi_in_string_literal (Line_no, Column_no) ->
error_abort ("end of input in string literal starting at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
eoln_in_string_literal (Line_no, Column_no) ->
error_abort ("end of line in string literal starting at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
unterminated_character_literal (Line_no, Column_no) ->
error_abort ("unterminated character literal starting at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
multicharacter_literal (Line_no, Column_no) ->
error_abort ("unsupported multicharacter literal starting at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
unterminated_comment (Line_no, Column_no) ->
error_abort ("unterminated comment starting at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
error_abort (Message) ->
ProgName = escript:script_name (),
io:put_chars (standard_error, ProgName),
io:put_chars (standard_error, ": "),
io:put_chars (standard_error, Message),
io:put_chars (standard_error, "\n"),
halt (1).
 
%%%-------------------------------------------------------------------
%%% Instructions to GNU Emacs --
%%% local variables:
%%% mode: erlang
%%% erlang-indent-level: 3
%%% end:
%%%-------------------------------------------------------------------</syntaxhighlight>
 
 
{{out}}
<pre>$ ./lex-in-Erlang compiler-tests/testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|Euphoria}}==
Tested with Euphoria 4.05.
<langsyntaxhighlight lang="euphoria">include std/io.e
include std/map.e
include std/types.e
Line 2,551 ⟶ 5,877:
end procedure
 
main(command_line())</langsyntaxhighlight>
 
{{out|case=test case 3}}
Line 2,595 ⟶ 5,921:
=={{header|Flex}}==
Tested with Flex 2.5.4.
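A possible build-and-run sequence (a sketch, not part of the original entry; the file name <code>lex.l</code> and the test-case name are assumptions, and <code>-lfl</code> supplies the default <code>yywrap</code> if the scanner does not define its own):
<pre>$ flex lex.l
$ cc lex.yy.c -o lex -lfl
$ ./lex testcase3.t</pre>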
<syntaxhighlight lang="c">%{
<lang C>%{
#include <stdio.h>
#include <stdlib.h>
Line 2,768 ⟶ 6,094:
} while (tok != tk_EOI);
return 0;
}</langsyntaxhighlight>
 
{{out|case=test case 3}}
Line 2,812 ⟶ 6,138:
=={{header|Forth}}==
Tested with Gforth 0.7.3.
<langsyntaxhighlight Forthlang="forth">CREATE BUF 0 , \ single-character look-ahead buffer
CREATE COLUMN# 0 ,
CREATE LINE# 1 ,
Line 2,934 ⟶ 6,260:
THEN THEN ;
: TOKENIZE BEGIN CONSUME AGAIN ;
TOKENIZE</langsyntaxhighlight>
 
{{out}}
Tested against all programs in [[Compiler/Sample programs]].
 
=={{header|Fortran}}==
{{works with|gfortran|11.2.1}}
You should call the source file ‘lex.F90’, so that gfortran will know to use the C preprocessor. I use the preprocessor to select between different ways of reading stream input from standard input.
 
(Despite the ‘.F90’ extension that I recommend, this is Fortran 2008/2018 code.)
 
There is a ‘framework’ for supporting Unicode, but no actual Unicode support. To support Unicode reliably, I would probably use the C interface and GNU libunistring.
 
The author has placed this Fortran code in the public domain.
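A possible compile-and-run sequence (a sketch; the test-case name is illustrative, and the run line assumes input arrives on standard input, which is the inputter's default). Without an explicit <code>-std=</code> option, gfortran accepts the GNU <code>FGETC</code> extension used here for reading standard input; as the comments in the code note, requesting <code>-std=f2008</code> or <code>-std=f2018</code> would also call for <code>-fall-intrinsics</code> or <code>-U__GFORTRAN__</code>.
<pre>$ gfortran lex.F90 -o lex
$ ./lex < testcase3.t</pre>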
<syntaxhighlight lang="fortran">!!!
!!! An implementation of the Rosetta Code lexical analyzer task:
!!! https://rosettacode.org/wiki/Compiler/lexical_analyzer
!!!
!!! The C implementation was used as a reference on behavior, but was
!!! not adhered to for the implementation.
!!!
 
module string_buffers
use, intrinsic :: iso_fortran_env, only: error_unit
use, intrinsic :: iso_fortran_env, only: int64
 
implicit none
private
 
public :: strbuf_t
public :: strbuf_t_length_kind
public :: strbuf_t_character_kind
 
integer, parameter :: strbuf_t_length_kind = int64
 
! String buffers can handle Unicode.
integer, parameter :: strbuf_t_character_kind = selected_char_kind ('ISO_10646')
 
! Private abbreviations.
integer, parameter :: nk = strbuf_t_length_kind
integer, parameter :: ck = strbuf_t_character_kind
 
type :: strbuf_t
integer(kind = nk), private :: len = 0
!
! ‘chars’ is made public for efficient access to the individual
! characters.
!
character(1, kind = ck), allocatable, public :: chars(:)
contains
procedure, pass, private :: ensure_storage => strbuf_t_ensure_storage
procedure, pass :: to_unicode => strbuf_t_to_unicode
procedure, pass :: length => strbuf_t_length
procedure, pass :: set => strbuf_t_set
procedure, pass :: append => strbuf_t_append
generic :: assignment(=) => set
end type strbuf_t
 
contains
 
function strbuf_t_to_unicode (strbuf) result (s)
class(strbuf_t), intent(in) :: strbuf
character(:, kind = ck), allocatable :: s
 
!
! This does not actually ensure that the string is valid Unicode;
! any 31-bit ‘character’ is supported.
!
 
integer(kind = nk) :: i
 
allocate (character(len = strbuf%len, kind = ck) :: s)
do i = 1, strbuf%len
s(i:i) = strbuf%chars(i)
end do
end function strbuf_t_to_unicode
 
elemental function strbuf_t_length (strbuf) result (n)
class(strbuf_t), intent(in) :: strbuf
integer(kind = nk) :: n
 
n = strbuf%len
end function strbuf_t_length
 
elemental function next_power_of_two (x) result (y)
integer(kind = nk), intent(in) :: x
integer(kind = nk) :: y
 
!
! It is assumed that no more than 64 bits are used.
!
! The branch-free algorithm is that of
! https://archive.is/nKxAc#RoundUpPowerOf2
!
! Fill in bits until one less than the desired power of two is
! reached, and then add one.
!
 
y = x - 1
y = ior (y, ishft (y, -1))
y = ior (y, ishft (y, -2))
y = ior (y, ishft (y, -4))
y = ior (y, ishft (y, -8))
y = ior (y, ishft (y, -16))
y = ior (y, ishft (y, -32))
y = y + 1
end function next_power_of_two
 
elemental function new_storage_size (length_needed) result (size)
integer(kind = nk), intent(in) :: length_needed
integer(kind = nk) :: size
 
! Increase storage by orders of magnitude.
 
if (2_nk**32 < length_needed) then
size = huge (1_nk)
else
size = next_power_of_two (length_needed)
end if
end function new_storage_size
 
subroutine strbuf_t_ensure_storage (strbuf, length_needed)
class(strbuf_t), intent(inout) :: strbuf
integer(kind = nk), intent(in) :: length_needed
 
integer(kind = nk) :: new_size
type(strbuf_t) :: new_strbuf
 
if (.not. allocated (strbuf%chars)) then
! Initialize a new strbuf%chars array.
new_size = new_storage_size (length_needed)
allocate (strbuf%chars(1:new_size))
else if (ubound (strbuf%chars, 1) < length_needed) then
! Allocate a new strbuf%chars array, larger than the current
! one, but containing the same characters.
new_size = new_storage_size (length_needed)
allocate (new_strbuf%chars(1:new_size))
new_strbuf%chars(1:strbuf%len) = strbuf%chars(1:strbuf%len)
call move_alloc (new_strbuf%chars, strbuf%chars)
end if
end subroutine strbuf_t_ensure_storage
 
subroutine strbuf_t_set (dst, src)
class(strbuf_t), intent(inout) :: dst
class(*), intent(in) :: src
 
integer(kind = nk) :: n
integer(kind = nk) :: i
 
select type (src)
type is (character(*, kind = ck))
n = len (src, kind = nk)
call dst%ensure_storage(n)
do i = 1, n
dst%chars(i) = src(i:i)
end do
dst%len = n
type is (character(*))
n = len (src, kind = nk)
call dst%ensure_storage(n)
do i = 1, n
dst%chars(i) = src(i:i)
end do
dst%len = n
class is (strbuf_t)
n = src%len
call dst%ensure_storage(n)
dst%chars(1:n) = src%chars(1:n)
dst%len = n
class default
error stop
end select
end subroutine strbuf_t_set
 
subroutine strbuf_t_append (dst, src)
class(strbuf_t), intent(inout) :: dst
class(*), intent(in) :: src
 
integer(kind = nk) :: n_dst, n_src, n
integer(kind = nk) :: i
 
select type (src)
type is (character(*, kind = ck))
n_dst = dst%len
n_src = len (src, kind = nk)
n = n_dst + n_src
call dst%ensure_storage(n)
do i = 1, n_src
dst%chars(n_dst + i) = src(i:i)
end do
dst%len = n
type is (character(*))
n_dst = dst%len
n_src = len (src, kind = nk)
n = n_dst + n_src
call dst%ensure_storage(n)
do i = 1, n_src
dst%chars(n_dst + i) = src(i:i)
end do
dst%len = n
class is (strbuf_t)
n_dst = dst%len
n_src = src%len
n = n_dst + n_src
call dst%ensure_storage(n)
dst%chars((n_dst + 1):n) = src%chars(1:n_src)
dst%len = n
class default
error stop
end select
end subroutine strbuf_t_append
 
end module string_buffers
 
module lexical_analysis
use, intrinsic :: iso_fortran_env, only: input_unit
use, intrinsic :: iso_fortran_env, only: output_unit
use, intrinsic :: iso_fortran_env, only: error_unit
use, intrinsic :: iso_fortran_env, only: int32
use, non_intrinsic :: string_buffers
 
implicit none
private
 
public :: lexer_input_t
public :: lexer_output_t
public :: run_lexer
 
integer, parameter :: input_file_unit_no = 100
integer, parameter :: output_file_unit_no = 101
 
! Private abbreviations.
integer, parameter :: nk = strbuf_t_length_kind
integer, parameter :: ck = strbuf_t_character_kind
 
! Integers large enough for a Unicode code point. Unicode code
! points (and UCS-4) have never been allowed to go higher than
! 7FFFFFFF, and are even further restricted now.
integer, parameter :: ichar_kind = int32
 
character(1, kind = ck), parameter :: horizontal_tab_char = char (9, kind = ck)
character(1, kind = ck), parameter :: linefeed_char = char (10, kind = ck)
character(1, kind = ck), parameter :: vertical_tab_char = char (11, kind = ck)
character(1, kind = ck), parameter :: formfeed_char = char (12, kind = ck)
character(1, kind = ck), parameter :: carriage_return_char = char (13, kind = ck)
character(1, kind = ck), parameter :: space_char = ck_' '
 
! The following is correct for Unix and its relatives.
character(1, kind = ck), parameter :: newline_char = linefeed_char
 
character(1, kind = ck), parameter :: backslash_char = char (92, kind = ck)
 
character(*, kind = ck), parameter :: newline_intstring = ck_'10'
character(*, kind = ck), parameter :: backslash_intstring = ck_'92'
 
integer, parameter :: tk_EOI = 0
integer, parameter :: tk_Mul = 1
integer, parameter :: tk_Div = 2
integer, parameter :: tk_Mod = 3
integer, parameter :: tk_Add = 4
integer, parameter :: tk_Sub = 5
integer, parameter :: tk_Negate = 6
integer, parameter :: tk_Not = 7
integer, parameter :: tk_Lss = 8
integer, parameter :: tk_Leq = 9
integer, parameter :: tk_Gtr = 10
integer, parameter :: tk_Geq = 11
integer, parameter :: tk_Eq = 12
integer, parameter :: tk_Neq = 13
integer, parameter :: tk_Assign = 14
integer, parameter :: tk_And = 15
integer, parameter :: tk_Or = 16
integer, parameter :: tk_If = 17
integer, parameter :: tk_Else = 18
integer, parameter :: tk_While = 19
integer, parameter :: tk_Print = 20
integer, parameter :: tk_Putc = 21
integer, parameter :: tk_Lparen = 22
integer, parameter :: tk_Rparen = 23
integer, parameter :: tk_Lbrace = 24
integer, parameter :: tk_Rbrace = 25
integer, parameter :: tk_Semi = 26
integer, parameter :: tk_Comma = 27
integer, parameter :: tk_Ident = 28
integer, parameter :: tk_Integer = 29
integer, parameter :: tk_String = 30
 
character(len = 16), parameter :: token_names(0:30) = &
& (/ "End_of_input ", "Op_multiply ", "Op_divide ", "Op_mod ", "Op_add ", &
& "Op_subtract ", "Op_negate ", "Op_not ", "Op_less ", "Op_lessequal ", &
& "Op_greater ", "Op_greaterequal ", "Op_equal ", "Op_notequal ", "Op_assign ", &
& "Op_and ", "Op_or ", "Keyword_if ", "Keyword_else ", "Keyword_while ", &
& "Keyword_print ", "Keyword_putc ", "LeftParen ", "RightParen ", "LeftBrace ", &
& "RightBrace ", "Semicolon ", "Comma ", "Identifier ", "Integer ", &
& "String " /)
 
type :: token_t
integer :: token_no
 
! Our implementation stores the value of a tk_Integer as a
! string. The C reference implementation stores it as an int.
character(:, kind = ck), allocatable :: val
 
integer(nk) :: line_no
integer(nk) :: column_no
end type token_t
 
type :: lexer_input_t
logical, private :: using_input_unit = .true.
integer, private :: unit_no = -(huge (1))
integer(kind = nk) :: line_no = 1
integer(kind = nk) :: column_no = 0
integer, private :: unget_count = 0
 
! The maximum lookahead is 2, although I believe we are using
! only 1. In principle, the lookahead could be any finite number.
character(1, kind = ck), private :: unget_buffer(1:2)
logical, private :: unget_eof_buffer(1:2)
 
! Using the same strbuf_t multiple times reduces the need for
! reallocations. Putting that strbuf_t in the lexer_input_t is
! simply for convenience.
type(strbuf_t), private :: strbuf
 
contains
!
! Note: There is currently no facility for closing one input and
! switching to another.
!
! Note: There is currently no facility to decode inputs into
! Unicode codepoints. Instead, what happens is raw bytes of
! input get stored as strbuf_t_character_kind values. This
! behavior is adequate for ASCII inputs.
!
procedure, pass :: use_file => lexer_input_t_use_file
procedure, pass :: get_next_ch => lexer_input_t_get_next_ch
procedure, pass :: unget_ch => lexer_input_t_unget_ch
procedure, pass :: unget_eof => lexer_input_t_unget_eof
end type lexer_input_t
 
type :: lexer_output_t
integer, private :: unit_no = output_unit
contains
procedure, pass :: use_file => lexer_output_t_use_file
procedure, pass :: output_token => lexer_output_t_output_token
end type lexer_output_t
 
contains
 
subroutine lexer_input_t_use_file (inputter, filename)
class(lexer_input_t), intent(inout) :: inputter
character(*), intent(in) :: filename
 
integer :: stat
 
inputter%using_input_unit = .false.
inputter%unit_no = input_file_unit_no
inputter%line_no = 1
inputter%column_no = 0
 
open (unit = input_file_unit_no, file = filename, status = 'old', &
& action = 'read', access = 'stream', form = 'unformatted', &
& iostat = stat)
if (stat /= 0) then
write (error_unit, '("Error: failed to open ", A, " for input")') filename
stop 1
end if
end subroutine lexer_input_t_use_file
 
!!!
!!! If you tell gfortran you want -std=f2008 or -std=f2018, you likely
!!! will need to add also -fall-intrinsics or -U__GFORTRAN__
!!!
!!! The first way, you get the FGETC intrinsic. The latter way, you
!!! get the C interface code that uses getchar(3).
!!!
#ifdef __GFORTRAN__
 
subroutine get_input_unit_char (c, stat)
!
! The following works if you are using gfortran.
!
! (FGETC is considered a feature for backwards compatibility with
! g77. However, I know of no way to reconfigure input_unit as a
! Fortran 2003 stream, for use with ordinary ‘read’.)
!
character, intent(inout) :: c
integer, intent(out) :: stat
 
call fgetc (input_unit, c, stat)
end subroutine get_input_unit_char
 
#else
 
subroutine get_input_unit_char (c, stat)
!
! An alternative implementation of get_input_unit_char. This
! actually reads input from the C standard input, which might not
! be the same as input_unit.
!
use, intrinsic :: iso_c_binding, only: c_int
character, intent(inout) :: c
integer, intent(out) :: stat
 
interface
!
! Use getchar(3) to read characters from standard input. This
! assumes there is actually such a function available, and that
! getchar(3) does not exist solely as a macro. (One could write
! one’s own getchar() if necessary, of course.)
!
function getchar () result (c) bind (c, name = 'getchar')
use, intrinsic :: iso_c_binding, only: c_int
integer(kind = c_int) :: c
end function getchar
end interface
 
integer(kind = c_int) :: i_char
 
i_char = getchar ()
!
! The C standard requires that EOF have a negative value. If the
! value returned by getchar(3) is not EOF, then it will be
! representable as an unsigned char. Therefore, to check for end
! of file, one need only test whether i_char is negative.
!
if (i_char < 0) then
stat = -1
else
stat = 0
c = char (i_char)
end if
end subroutine get_input_unit_char
 
#endif
 
subroutine lexer_input_t_get_next_ch (inputter, eof, ch)
class(lexer_input_t), intent(inout) :: inputter
logical, intent(out) :: eof
character(1, kind = ck), intent(inout) :: ch
 
integer :: stat
character(1) :: c = '*'
 
if (0 < inputter%unget_count) then
if (inputter%unget_eof_buffer(inputter%unget_count)) then
eof = .true.
else
eof = .false.
ch = inputter%unget_buffer(inputter%unget_count)
end if
inputter%unget_count = inputter%unget_count - 1
else
if (inputter%using_input_unit) then
call get_input_unit_char (c, stat)
else
read (unit = inputter%unit_no, iostat = stat) c
end if
 
ch = char (ichar (c, kind = ichar_kind), kind = ck)
 
if (0 < stat) then
write (error_unit, '("Input error with status code ", I0)') stat
stop 1
else if (stat < 0) then
eof = .true.
! The C reference code increases column number on end of file;
! therefore, so shall we.
inputter%column_no = inputter%column_no + 1
else
eof = .false.
if (ch == newline_char) then
inputter%line_no = inputter%line_no + 1
inputter%column_no = 0
else
inputter%column_no = inputter%column_no + 1
end if
end if
end if
end subroutine lexer_input_t_get_next_ch
 
subroutine lexer_input_t_unget_ch (inputter, ch)
class(lexer_input_t), intent(inout) :: inputter
character(1, kind = ck), intent(in) :: ch
 
if (ubound (inputter%unget_buffer, 1) <= inputter%unget_count) then
write (error_unit, '("class(lexer_input_t) unget buffer overflow")')
stop 1
else
inputter%unget_count = inputter%unget_count + 1
inputter%unget_buffer(inputter%unget_count) = ch
inputter%unget_eof_buffer(inputter%unget_count) = .false.
end if
end subroutine lexer_input_t_unget_ch
 
subroutine lexer_input_t_unget_eof (inputter)
class(lexer_input_t), intent(inout) :: inputter
 
if (ubound (inputter%unget_buffer, 1) <= inputter%unget_count) then
write (error_unit, '("class(lexer_input_t) unget buffer overflow")')
stop 1
else
inputter%unget_count = inputter%unget_count + 1
inputter%unget_buffer(inputter%unget_count) = ck_'*'
inputter%unget_eof_buffer(inputter%unget_count) = .true.
end if
end subroutine lexer_input_t_unget_eof
 
subroutine lexer_output_t_use_file (outputter, filename)
class(lexer_output_t), intent(inout) :: outputter
character(*), intent(in) :: filename
 
integer :: stat
 
outputter%unit_no = output_file_unit_no
open (unit = output_file_unit_no, file = filename, action = 'write', iostat = stat)
if (stat /= 0) then
write (error_unit, '("Error: failed to open ", A, " for output")') filename
stop 1
end if
end subroutine lexer_output_t_use_file
 
subroutine lexer_output_t_output_token (outputter, token)
class(lexer_output_t), intent(inout) :: outputter
class(token_t), intent(in) :: token
 
select case (token%token_no)
case (tk_Integer, tk_Ident, tk_String)
write (outputter%unit_no, '(1X, I20, 1X, I20, 1X, A, 1X, A)') &
& token%line_no, token%column_no, &
& token_names(token%token_no), token%val
case default
write (outputter%unit_no, '(1X, I20, 1X, I20, 1X, A)') &
& token%line_no, token%column_no, &
& trim (token_names(token%token_no))
end select
end subroutine lexer_output_t_output_token
 
subroutine run_lexer (inputter, outputter)
class(lexer_input_t), intent(inout) :: inputter
class(lexer_output_t), intent(inout) :: outputter
 
type(token_t) :: token
 
token = get_token (inputter)
do while (token%token_no /= tk_EOI)
call outputter%output_token (token)
token = get_token (inputter)
end do
call outputter%output_token (token)
end subroutine run_lexer
 
function get_token (inputter) result (token)
class(lexer_input_t), intent(inout) :: inputter
type(token_t) :: token
 
logical :: eof
character(1, kind = ck) :: ch
 
call skip_spaces_and_comments (inputter, eof, ch, &
& token%line_no, token%column_no)
 
if (eof) then
token%token_no = tk_EOI
else
select case (ch)
case (ck_'{')
token%token_no = tk_Lbrace
case (ck_'}')
token%token_no = tk_Rbrace
case (ck_'(')
token%token_no = tk_Lparen
case (ck_')')
token%token_no = tk_Rparen
case (ck_'+')
token%token_no = tk_Add
case (ck_'-')
token%token_no = tk_Sub
case (ck_'*')
token%token_no = tk_Mul
case (ck_'%')
token%token_no = tk_Mod
case (ck_';')
token%token_no = tk_Semi
case (ck_',')
token%token_no = tk_Comma
case (ck_'/')
token%token_no = tk_Div
 
case (ck_"'")
call read_character_literal
 
case (ck_'<')
call distinguish_operators (ch, ck_'=', tk_Leq, tk_Lss)
case (ck_'>')
call distinguish_operators (ch, ck_'=', tk_Geq, tk_Gtr)
case (ck_'=')
call distinguish_operators (ch, ck_'=', tk_Eq, tk_Assign)
case (ck_'!')
call distinguish_operators (ch, ck_'=', tk_Neq, tk_Not)
case (ck_'&')
call distinguish_operators (ch, ck_'&', tk_And, tk_EOI)
case (ck_'|')
call distinguish_operators (ch, ck_'|', tk_Or, tk_EOI)
 
case (ck_'"')
call read_string_literal (ch, ch)
 
case default
if (isdigit (ch)) then
call read_numeric_literal (ch)
else if (isalpha_or_underscore (ch)) then
call read_identifier_or_keyword (ch)
else
call start_error_message (inputter)
write (error_unit, '("unrecognized character ''", A, "''")') ch
stop 1
end if
end select
end if
contains
 
subroutine read_character_literal
character(1, kind = ck) :: ch
logical :: eof
character(20, kind = ck) :: buffer
 
token%token_no = tk_Integer
 
call inputter%get_next_ch (eof, ch)
if (eof) then
call start_error_message (inputter)
write (error_unit, '("end of input in character literal")')
stop 1
else if (ch == ck_"'") then
call start_error_message (inputter)
write (error_unit, '("empty character literal")')
stop 1
else if (ch == backslash_char) then
call inputter%get_next_ch (eof, ch)
if (eof) then
call start_error_message (inputter)
write (error_unit, '("end of input in character literal, after backslash")')
stop 1
else if (ch == ck_'n') then
allocate (token%val, source = newline_intstring)
else if (ch == backslash_char) then
allocate (token%val, source = backslash_intstring)
else
call start_error_message (inputter)
write (error_unit, '("unknown escape sequence ''", A, A, "'' in character literal")') &
& backslash_char, ch
stop 1
end if
call read_character_literal_close_quote
else
call read_character_literal_close_quote
write (buffer, '(I0)') ichar (ch, kind = ichar_kind)
allocate (token%val, source = trim (buffer))
end if
end subroutine read_character_literal
 
subroutine read_character_literal_close_quote
logical :: eof
character(1, kind = ck) :: close_quote
 
call inputter%get_next_ch (eof, close_quote)
if (eof) then
call start_error_message (inputter)
write (error_unit, '("end of input in character literal")')
stop 1
else if (close_quote /= ck_"'") then
call start_error_message (inputter)
write (error_unit, '("multi-character literal")')
stop 1
end if
end subroutine read_character_literal_close_quote
 
subroutine distinguish_operators (first_ch, second_ch, &
& token_no_if_second_ch, &
& token_no_if_no_second_ch)
character(1, kind = ck), intent(in) :: first_ch
character(1, kind = ck), intent(in) :: second_ch
integer, intent(in) :: token_no_if_second_ch
integer, intent(in) :: token_no_if_no_second_ch
 
character(1, kind = ck) :: ch
logical :: eof
 
call inputter%get_next_ch (eof, ch)
if (eof) then
call inputter%unget_eof
token%token_no = token_no_if_no_second_ch
else if (ch == second_ch) then
token%token_no = token_no_if_second_ch
else if (token_no_if_no_second_ch == tk_EOI) then
call start_error_message (inputter)
write (error_unit, '("unrecognized character ''", A, "''")') first_ch
stop 1
else
call inputter%unget_ch (ch)
token%token_no = token_no_if_no_second_ch
end if
end subroutine distinguish_operators
 
subroutine read_string_literal (opening_quote, closing_quote)
character(1, kind = ck), intent(in) :: opening_quote
character(1, kind = ck), intent(in) :: closing_quote
 
character(1, kind = ck) :: ch
logical :: done
 
inputter%strbuf = opening_quote
done = .false.
do while (.not. done)
call inputter%get_next_ch (eof, ch)
if (eof) then
call start_error_message (inputter)
write (error_unit, '("end of input in string literal")')
stop 1
else if (ch == closing_quote) then
call inputter%strbuf%append(ch)
done = .true.
else if (ch == newline_char) then
call start_error_message (inputter)
write (error_unit, '("end of line in string literal")')
stop 1
else
call inputter%strbuf%append(ch)
end if
end do
allocate (token%val, source = inputter%strbuf%to_unicode())
token%token_no = tk_String
end subroutine read_string_literal
 
subroutine read_numeric_literal (first_ch)
character(1, kind = ck), intent(in) :: first_ch
 
character(1, kind = ck) :: ch
 
token%token_no = tk_Integer
 
inputter%strbuf = first_ch
call inputter%get_next_ch (eof, ch)
do while (isdigit (ch))
call inputter%strbuf%append (ch)
call inputter%get_next_ch (eof, ch)
end do
if (isalpha_or_underscore (ch)) then
call start_error_message (inputter)
write (error_unit, '("invalid numeric literal """, A, """")') &
& inputter%strbuf%to_unicode()
stop 1
else
call inputter%unget_ch (ch)
allocate (token%val, source = inputter%strbuf%to_unicode())
end if
end subroutine read_numeric_literal
 
subroutine read_identifier_or_keyword (first_ch)
character(1, kind = ck), intent(in) :: first_ch
 
character(1, kind = ck) :: ch
 
inputter%strbuf = first_ch
call inputter%get_next_ch (eof, ch)
do while (isalnum_or_underscore (ch))
call inputter%strbuf%append (ch)
call inputter%get_next_ch (eof, ch)
end do
 
call inputter%unget_ch (ch)
 
!
! The following is a handwritten ‘implicit radix tree’ search
! for keywords, first partitioning the set of keywords according
! to their lengths.
!
! I did it this way for fun. One could, of course, write a
! program to generate code for such a search.
!
! Perfect hashes are another method one could use.
!
! The reference C implementation uses a binary search.
!
token%token_no = tk_Ident
select case (inputter%strbuf%length())
case (2)
select case (inputter%strbuf%chars(1))
case (ck_'i')
select case (inputter%strbuf%chars(2))
case (ck_'f')
token%token_no = tk_If
case default
continue
end select
case default
continue
end select
case (4)
select case (inputter%strbuf%chars(1))
case (ck_'e')
select case (inputter%strbuf%chars(2))
case (ck_'l')
select case (inputter%strbuf%chars(3))
case (ck_'s')
select case (inputter%strbuf%chars(4))
case (ck_'e')
token%token_no = tk_Else
case default
continue
end select
case default
continue
end select
case default
continue
end select
case (ck_'p')
select case (inputter%strbuf%chars(2))
case (ck_'u')
select case (inputter%strbuf%chars(3))
case (ck_'t')
select case (inputter%strbuf%chars(4))
case (ck_'c')
token%token_no = tk_Putc
case default
continue
end select
case default
continue
end select
case default
continue
end select
case default
continue
end select
case (5)
select case (inputter%strbuf%chars(1))
case (ck_'p')
select case (inputter%strbuf%chars(2))
case (ck_'r')
select case (inputter%strbuf%chars(3))
case (ck_'i')
select case (inputter%strbuf%chars(4))
case (ck_'n')
select case (inputter%strbuf%chars(5))
case (ck_'t')
token%token_no = tk_Print
case default
continue
end select
case default
continue
end select
case default
continue
end select
case default
continue
end select
case (ck_'w')
select case (inputter%strbuf%chars(2))
case (ck_'h')
select case (inputter%strbuf%chars(3))
case (ck_'i')
select case (inputter%strbuf%chars(4))
case (ck_'l')
select case (inputter%strbuf%chars(5))
case (ck_'e')
token%token_no = tk_While
case default
continue
end select
case default
continue
end select
case default
continue
end select
case default
continue
end select
case default
continue
end select
case default
continue
end select
if (token%token_no == tk_Ident) then
allocate (token%val, source = inputter%strbuf%to_unicode ())
end if
end subroutine read_identifier_or_keyword
 
end function get_token
 
subroutine skip_spaces_and_comments (inputter, eof, ch, line_no, column_no)
!
! This procedure skips spaces and comments, and also captures the
! line and column numbers at the correct moment to indicate the
! start of a token.
!
class(lexer_input_t), intent(inout) :: inputter
logical, intent(out) :: eof
character(1, kind = ck), intent(inout) :: ch
integer(kind = nk), intent(out) :: line_no
integer(kind = nk), intent(out) :: column_no
 
integer(kind = nk), parameter :: not_done = -(huge (1_nk))
 
line_no = not_done
do while (line_no == not_done)
call inputter%get_next_ch (eof, ch)
if (eof) then
line_no = inputter%line_no
column_no = inputter%column_no
else if (ch == ck_'/') then
line_no = inputter%line_no
column_no = inputter%column_no
call inputter%get_next_ch (eof, ch)
if (eof) then
call inputter%unget_eof
ch = ck_'/'
else if (ch /= ck_'*') then
call inputter%unget_ch (ch)
ch = ck_'/'
else
call read_to_end_of_comment
line_no = not_done
end if
else if (.not. isspace (ch)) then
line_no = inputter%line_no
column_no = inputter%column_no
end if
end do
 
contains
 
subroutine read_to_end_of_comment
logical :: done
 
done = .false.
do while (.not. done)
call inputter%get_next_ch (eof, ch)
if (eof) then
call end_of_input_in_comment
else if (ch == ck_'*') then
call inputter%get_next_ch (eof, ch)
if (eof) then
call end_of_input_in_comment
else if (ch == ck_'/') then
done = .true.
end if
end if
end do
end subroutine read_to_end_of_comment
 
subroutine end_of_input_in_comment
call start_error_message (inputter)
write (error_unit, '("end of input in comment")')
stop 1
end subroutine end_of_input_in_comment
 
end subroutine skip_spaces_and_comments
 
subroutine start_error_message (inputter)
class(lexer_input_t), intent(inout) :: inputter
 
write (error_unit, '("Lexical error at ", I0, ".", I0, ": ")', advance = 'no') &
& inputter%line_no, inputter%column_no
end subroutine start_error_message
 
elemental function isspace (ch) result (bool)
character(1, kind = ck), intent(in) :: ch
logical :: bool
 
bool = (ch == horizontal_tab_char) .or. &
& (ch == linefeed_char) .or. &
& (ch == vertical_tab_char) .or. &
& (ch == formfeed_char) .or. &
& (ch == carriage_return_char) .or. &
& (ch == space_char)
end function isspace
 
elemental function isupper (ch) result (bool)
character(1, kind = ck), intent(in) :: ch
logical :: bool
 
integer(kind = ichar_kind), parameter :: uppercase_A = ichar (ck_'A', kind = ichar_kind)
integer(kind = ichar_kind), parameter :: uppercase_Z = ichar (ck_'Z', kind = ichar_kind)
 
integer(kind = ichar_kind) :: i_ch
 
i_ch = ichar (ch, kind = ichar_kind)
bool = (uppercase_A <= i_ch .and. i_ch <= uppercase_Z)
end function isupper
 
elemental function islower (ch) result (bool)
character(1, kind = ck), intent(in) :: ch
logical :: bool
 
integer(kind = ichar_kind), parameter :: lowercase_a = ichar (ck_'a', kind = ichar_kind)
integer(kind = ichar_kind), parameter :: lowercase_z = ichar (ck_'z', kind = ichar_kind)
 
integer(kind = ichar_kind) :: i_ch
 
i_ch = ichar (ch, kind = ichar_kind)
bool = (lowercase_a <= i_ch .and. i_ch <= lowercase_z)
end function islower
 
elemental function isalpha (ch) result (bool)
character(1, kind = ck), intent(in) :: ch
logical :: bool
 
bool = isupper (ch) .or. islower (ch)
end function isalpha
 
elemental function isdigit (ch) result (bool)
character(1, kind = ck), intent(in) :: ch
logical :: bool
 
integer(kind = ichar_kind), parameter :: zero = ichar (ck_'0', kind = ichar_kind)
integer(kind = ichar_kind), parameter :: nine = ichar (ck_'9', kind = ichar_kind)
 
integer(kind = ichar_kind) :: i_ch
 
i_ch = ichar (ch, kind = ichar_kind)
bool = (zero <= i_ch .and. i_ch <= nine)
end function isdigit
 
elemental function isalnum (ch) result (bool)
character(1, kind = ck), intent(in) :: ch
logical :: bool
 
bool = isalpha (ch) .or. isdigit (ch)
end function isalnum
 
elemental function isalpha_or_underscore (ch) result (bool)
character(1, kind = ck), intent(in) :: ch
logical :: bool
 
bool = isalpha (ch) .or. (ch == ck_'_')
end function isalpha_or_underscore
 
elemental function isalnum_or_underscore (ch) result (bool)
character(1, kind = ck), intent(in) :: ch
logical :: bool
 
bool = isalnum (ch) .or. (ch == ck_'_')
end function isalnum_or_underscore
 
end module lexical_analysis
 
program lex
use, intrinsic :: iso_fortran_env, only: output_unit
use, non_intrinsic :: lexical_analysis
 
implicit none
 
integer :: arg_count
character(200) :: arg
type(lexer_input_t) :: inputter
type(lexer_output_t) :: outputter
 
arg_count = command_argument_count ()
if (3 <= arg_count) then
call print_usage
else if (arg_count == 0) then
call run_lexer (inputter, outputter)
else if (arg_count == 1) then
call get_command_argument (1, arg)
call inputter%use_file(trim (arg))
call run_lexer (inputter, outputter)
else if (arg_count == 2) then
call get_command_argument (1, arg)
call inputter%use_file(trim (arg))
call get_command_argument (2, arg)
call outputter%use_file(trim (arg))
call run_lexer (inputter, outputter)
end if
 
contains
 
subroutine print_usage
character(200) :: progname
 
call get_command_argument (0, progname)
write (output_unit, '("Usage: ", A, " [INPUT_FILE [OUTPUT_FILE]]")') &
& trim (progname)
end subroutine print_usage
end program lex</syntaxhighlight>
 
{{out}}
Test case 3.
<pre> 5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|FreeBASIC}}==
Tested with FreeBASIC 1.05
<syntaxhighlight lang="freebasic">enum Token_type
tk_EOI
tk_Mul
Line 3,227 ⟶ 7,679:
print : print "Hit any to end program"
sleep
system</syntaxhighlight>
{{out|case=test case 3}}
<b>
Line 3,268 ⟶ 7,720:
=={{header|Go}}==
{{trans|FreeBASIC}}
<langsyntaxhighlight lang="go">package main
 
import (
Line 3,645 ⟶ 8,097:
initLex()
process()
}</syntaxhighlight>
 
{{out}}
Line 3,684 ⟶ 8,136:
22 26 Integer 32
23 1 End_of_input
</pre>
 
=={{header|Haskell}}==
Tested with GHC 8.0.2
<syntaxhighlight lang="haskell">import Control.Applicative hiding (many, some)
import Control.Monad.State.Lazy
import Control.Monad.Trans.Maybe (MaybeT, runMaybeT)
import Data.Char (isAsciiLower, isAsciiUpper, isDigit, ord)
import Data.Foldable (asum)
import Data.Functor (($>))
import Data.Text (Text)
import qualified Data.Text as T
import Prelude hiding (lex)
import System.Environment (getArgs)
import System.IO
import Text.Printf
 
 
-- Tokens --------------------------------------------------------------------------------------------------------------
data Val = IntVal Int -- value
| TextVal String Text -- name value
| SymbolVal String -- name
| Skip
| LexError String -- message
 
data Token = Token Val Int Int -- value line column
 
 
instance Show Val where
show (IntVal value) = printf "%-18s%d\n" "Integer" value
show (TextVal "String" value) = printf "%-18s%s\n" "String" (show $ T.unpack value) -- show escaped characters
show (TextVal name value) = printf "%-18s%s\n" name (T.unpack value)
show (SymbolVal name ) = printf "%s\n" name
show (LexError msg ) = printf "%-18s%s\n" "Error" msg
show Skip = printf ""
 
instance Show Token where
show (Token val line column) = printf "%2d %2d %s" line column (show val)
 
 
printTokens :: [Token] -> String
printTokens tokens =
"Location Token name Value\n" ++
"--------------------------------------\n" ++
(concatMap show tokens)
 
 
-- Tokenizers ----------------------------------------------------------------------------------------------------------
makeToken :: Lexer Val -> Lexer Token
makeToken lexer = do
(t, l, c) <- get
val <- lexer
 
case val of
Skip -> nextToken
 
LexError msg -> do
(_, l', c') <- get
 
let code = T.unpack $ T.take (c' - c + 1) t
let str = printf "%s\n%s(%d, %d): %s" msg (replicate 27 ' ') l' c' code
 
ch <- peek
unless (ch == '\0') $ advance 1
 
return $ Token (LexError str) l c
 
_ -> return $ Token val l c
 
 
simpleToken :: String -> String -> Lexer Val
simpleToken lexeme name = lit lexeme $> SymbolVal name
 
 
makeTokenizers :: [(String, String)] -> Lexer Val
makeTokenizers = asum . map (uncurry simpleToken)
 
 
keywords :: Lexer Val
keywords = makeTokenizers
[("if", "Keyword_if"), ("else", "Keyword_else"), ("while", "Keyword_while"),
("print", "Keyword_print"), ("putc", "Keyword_putc")]
 
 
operators :: Lexer Val
operators = makeTokenizers
[("*", "Op_multiply"), ("/", "Op_divide"), ("%", "Op_mod"), ("+", "Op_add"),
("-", "Op_subtract"), ("<=", "Op_lessequal"), ("<", "Op_less"), (">=", "Op_greaterequal"),
(">", "Op_greater"), ("==", "Op_equal"), ("!=", "Op_notequal"), ("!", "Op_not"),
("=", "Op_assign"), ("&&", "Op_and"), ("||", "Op_or")]
 
 
symbols :: Lexer Val
symbols = makeTokenizers
[("(", "LeftParen"), (")", "RightParen"),
("{", "LeftBrace"), ("}", "RightBrace"),
(";", "Semicolon"), (",", "Comma")]
 
 
isIdStart :: Char -> Bool
isIdStart ch = isAsciiLower ch || isAsciiUpper ch || ch == '_'
 
isIdEnd :: Char -> Bool
isIdEnd ch = isIdStart ch || isDigit ch
 
identifier :: Lexer Val
identifier = TextVal "Identifier" <$> lexeme
where lexeme = T.cons <$> (one isIdStart) <*> (many isIdEnd)
 
 
integer :: Lexer Val
integer = do
lexeme <- some isDigit
next_ch <- peek
 
if (isIdStart next_ch) then
return $ LexError "Invalid number. Starts like a number, but ends in non-numeric characters."
else do
let num = read (T.unpack lexeme) :: Int
return $ IntVal num
 
 
character :: Lexer Val
character = do
lit "'"
str <- lookahead 3
 
case str of
(ch : '\'' : _) -> advance 2 $> IntVal (ord ch)
"\\n'" -> advance 3 $> IntVal 10
"\\\\'" -> advance 3 $> IntVal 92
('\\' : ch : "\'") -> advance 2 $> LexError (printf "Unknown escape sequence \\%c" ch)
('\'' : _) -> return $ LexError "Empty character constant"
_ -> advance 2 $> LexError "Multi-character constant"
 
 
string :: Lexer Val
string = do
lit "\""
 
loop (T.pack "") =<< peek
where loop t ch = case ch of
'\\' -> do
next_ch <- next
 
case next_ch of
'n' -> loop (T.snoc t '\n') =<< next
'\\' -> loop (T.snoc t '\\') =<< next
_ -> return $ LexError $ printf "Unknown escape sequence \\%c" next_ch
 
'"' -> next $> TextVal "String" t
 
'\n' -> return $ LexError $ "End-of-line while scanning string literal." ++
" Closing string character not found before end-of-line."
 
'\0' -> return $ LexError $ "End-of-file while scanning string literal." ++
" Closing string character not found."
 
_ -> loop (T.snoc t ch) =<< next
 
 
skipComment :: Lexer Val
skipComment = do
lit "/*"
 
loop =<< peek
where loop ch = case ch of
'\0' -> return $ LexError "End-of-file in comment. Closing comment characters not found."
 
'*' -> do
next_ch <- next
 
case next_ch of
'/' -> next $> Skip
_ -> loop next_ch
 
_ -> loop =<< next
 
 
nextToken :: Lexer Token
nextToken = do
skipWhitespace
 
makeToken $ skipComment
<|> keywords
<|> identifier
<|> integer
<|> character
<|> string
<|> operators
<|> symbols
<|> simpleToken "\0" "End_of_input"
<|> (return $ LexError "Unrecognized character.")
 
 
main :: IO ()
main = do
args <- getArgs
(hin, hout) <- getIOHandles args
 
withHandles hin hout $ printTokens . (lex nextToken)
 
 
------------------------------------------------------------------------------------------------------------------------
-- Machinery
------------------------------------------------------------------------------------------------------------------------
 
-- File handling -------------------------------------------------------------------------------------------------------
getIOHandles :: [String] -> IO (Handle, Handle)
getIOHandles [] = return (stdin, stdout)
 
getIOHandles [infile] = do
inhandle <- openFile infile ReadMode
return (inhandle, stdout)
 
getIOHandles (infile : outfile : _) = do
inhandle <- openFile infile ReadMode
outhandle <- openFile outfile WriteMode
return (inhandle, outhandle)
 
 
withHandles :: Handle -> Handle -> (String -> String) -> IO ()
withHandles in_handle out_handle f = do
contents <- hGetContents in_handle
let contents' = contents ++ "\0" -- adding \0 simplifies treatment of EOF
 
hPutStr out_handle $ f contents'
 
unless (in_handle == stdin) $ hClose in_handle
unless (out_handle == stdout) $ hClose out_handle
 
 
-- Lexer ---------------------------------------------------------------------------------------------------------------
type LexerState = (Text, Int, Int) -- input line column
type Lexer = MaybeT (State LexerState)
 
 
lexerAdvance :: Int -> LexerState -> LexerState
lexerAdvance 0 ctx = ctx
 
lexerAdvance 1 (t, l, c)
| ch == '\n' = (rest, l + 1, 1 )
| otherwise = (rest, l, c + 1)
where
(ch, rest) = (T.head t, T.tail t)
 
lexerAdvance n ctx = lexerAdvance (n - 1) $ lexerAdvance 1 ctx
 
 
advance :: Int -> Lexer ()
advance n = modify $ lexerAdvance n
 
 
peek :: Lexer Char
peek = gets $ \(t, _, _) -> T.head t
 
 
lookahead :: Int -> Lexer String
lookahead n = gets $ \(t, _, _) -> T.unpack $ T.take n t
 
 
next :: Lexer Char
next = advance 1 >> peek
 
 
skipWhitespace :: Lexer ()
skipWhitespace = do
ch <- peek
when (ch `elem` " \n") (next >> skipWhitespace)
 
 
lit :: String -> Lexer ()
lit lexeme = do
(t, _, _) <- get
guard $ T.isPrefixOf (T.pack lexeme) t
advance $ length lexeme
 
 
one :: (Char -> Bool) -> Lexer Char
one f = do
ch <- peek
guard $ f ch
next
return ch
 
 
lexerMany :: (Char -> Bool) -> LexerState -> (Text, LexerState)
lexerMany f (t, l, c) = (lexeme, (t', l', c'))
where (lexeme, _) = T.span f t
(t', l', c') = lexerAdvance (T.length lexeme) (t, l, c)
 
 
many :: (Char -> Bool) -> Lexer Text
many f = state $ lexerMany f
 
 
some :: (Char -> Bool) -> Lexer Text
some f = T.cons <$> (one f) <*> (many f)
 
 
lex :: Lexer a -> String -> [a]
lex lexer str = loop lexer (T.pack str, 1, 1)
where loop lexer s
| T.null txt = [t]
| otherwise = t : loop lexer s'
 
where (Just t, s') = runState (runMaybeT lexer) s
(txt, _, _) = s'
</syntaxhighlight>
 
{{out|case=test case 3}}
<pre>
Location Token name Value
--------------------------------------
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input
</pre>
 
=={{header|Icon}}==
{{trans|ATS}}
{{works with|Icon|9.5.20i}}
 
This implementation was developed for Arizona Icon, but ought to work with the Unicon compiler, as well.
 
One interesting aspect is the use of co-expressions to handle "input with pushback". The main advantage of this approach is that it hides the pushback buffer from the user without making the buffer a global variable.
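 
For comparison, the following is a minimal Python sketch of the same idea: an input reader that hides its pushback buffer behind get/unget operations. It is an illustration only; the names and structure are hypothetical and are not taken from the Icon entry, which uses co-expressions rather than a class.
 
<syntaxhighlight lang="python"># Hypothetical illustration: a character reader whose pushback buffer is
# private to the object instead of being a global variable.
class PushbackReader:
    def __init__(self, text):
        self._it = iter(text)
        self._pushback = []           # characters that were "unread"

    def get(self):
        # Return the next character, or None at end of input.
        if self._pushback:
            return self._pushback.pop()
        return next(self._it, None)

    def unget(self, ch):
        # Push a character back so the next get() returns it again.
        self._pushback.append(ch)

r = PushbackReader("a=b")
first = r.get()            # 'a'
nxt = r.get()              # '=' -- suppose we only wanted to peek at it
r.unget(nxt)
assert r.get() == '='      # the pushed-back character is seen again</syntaxhighlight>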
 
Global variables are avoided except for some constants that require initialization.
 
<syntaxhighlight lang="icon">#
# The Rosetta Code lexical analyzer in Icon with co-expressions. Based
# upon the ATS implementation.
#
# Usage: lex [INPUTFILE [OUTPUTFILE]]
# If INPUTFILE or OUTPUTFILE is "-" or missing, then standard input
# or standard output is used, respectively.
#
 
$define EOF -1
 
$define TOKEN_ELSE 0
$define TOKEN_IF 1
$define TOKEN_PRINT 2
$define TOKEN_PUTC 3
$define TOKEN_WHILE 4
$define TOKEN_MULTIPLY 5
$define TOKEN_DIVIDE 6
$define TOKEN_MOD 7
$define TOKEN_ADD 8
$define TOKEN_SUBTRACT 9
$define TOKEN_NEGATE 10
$define TOKEN_LESS 11
$define TOKEN_LESSEQUAL 12
$define TOKEN_GREATER 13
$define TOKEN_GREATEREQUAL 14
$define TOKEN_EQUAL 15
$define TOKEN_NOTEQUAL 16
$define TOKEN_NOT 17
$define TOKEN_ASSIGN 18
$define TOKEN_AND 19
$define TOKEN_OR 20
$define TOKEN_LEFTPAREN 21
$define TOKEN_RIGHTPAREN 22
$define TOKEN_LEFTBRACE 23
$define TOKEN_RIGHTBRACE 24
$define TOKEN_SEMICOLON 25
$define TOKEN_COMMA 26
$define TOKEN_IDENTIFIER 27
$define TOKEN_INTEGER 28
$define TOKEN_STRING 29
$define TOKEN_END_OF_INPUT 30
 
global whitespace
global ident_start
global ident_continuation
 
procedure main(args)
local inpf, outf
local pushback_buffer, inp, pushback
 
initial {
whitespace := ' \t\v\f\r\n'
ident_start := '_' ++ &letters
ident_continuation := ident_start ++ &digits
}
 
inpf := &input
outf := &output
if 1 <= *args & args[1] ~== "-" then {
inpf := open(args[1], "rt") |
stop("cannot open ", args[1], " for input")
}
if 2 <= *args & args[2] ~== "-" then {
outf := open(args[2], "wt") |
stop("cannot open ", args[2], " for output")
}
 
pushback_buffer := []
inp := create inputter(inpf, pushback_buffer)
pushback := create repeat push(pushback_buffer, \@&source)
@pushback # The first invocation does nothing.
 
scan_text(outf, inp, pushback)
end
 
procedure scan_text(outf, inp, pushback)
local ch
 
while /ch | ch[1] ~=== EOF do {
skip_spaces_and_comments(inp, pushback)
ch := @inp
if ch[1] === EOF then {
print_token(outf, [TOKEN_END_OF_INPUT, "", ch[2], ch[3]])
} else {
ch @pushback
print_token(outf, get_next_token(inp, pushback))
}
}
end
 
procedure get_next_token(inp, pushback)
local ch, ch1
local ln, cn
 
skip_spaces_and_comments(inp, pushback)
ch := @inp
ln := ch[2] # line number
cn := ch[3] # column number
case ch[1] of {
"," : return [TOKEN_COMMA, ",", ln, cn]
";" : return [TOKEN_SEMICOLON, ";", ln, cn]
"(" : return [TOKEN_LEFTPAREN, "(", ln, cn]
")" : return [TOKEN_RIGHTPAREN, ")", ln, cn]
"{" : return [TOKEN_LEFTBRACE, "{", ln, cn]
"}" : return [TOKEN_RIGHTBRACE, "}", ln, cn]
"*" : return [TOKEN_MULTIPLY, "*", ln, cn]
"/" : return [TOKEN_DIVIDE, "/", ln, cn]
"%" : return [TOKEN_MOD, "%", ln, cn]
"+" : return [TOKEN_ADD, "+", ln, cn]
"-" : return [TOKEN_SUBTRACT, "-", ln, cn]
"<" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_LESSEQUAL, "<=", ln, cn]
} else {
ch1 @pushback
return [TOKEN_LESS, "<", ln, cn]
}
}
">" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_GREATEREQUAL, ">=", ln, cn]
} else {
ch1 @pushback
return [TOKEN_GREATER, ">", ln, cn]
}
}
"=" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_EQUAL, "==", ln, cn]
} else {
ch1 @pushback
return [TOKEN_ASSIGN, "=", ln, cn]
}
}
"!" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_NOTEQUAL, "!=", ln, cn]
} else {
ch1 @pushback
return [TOKEN_NOT, "!", ln, cn]
}
}
"&" : {
ch1 := @inp
if ch1[1] === "&" then {
return [TOKEN_AND, "&&", ln, cn]
} else {
unexpected_character(ln, cn, ch)
}
}
"|" : {
ch1 := @inp
if ch1[1] === "|" then {
return [TOKEN_OR, "||", ln, cn]
} else {
unexpected_character(ln, cn, ch)
}
}
"\"" : {
ch @pushback
return scan_string_literal(inp)
}
"'" : {
ch @pushback
return scan_character_literal(inp, pushback)
}
default : {
if any(&digits, ch[1]) then {
ch @pushback
return scan_integer_literal(inp, pushback)
} else if any(ident_start, ch[1]) then {
ch @pushback
return scan_identifier_or_reserved_word (inp, pushback)
} else {
unexpected_character(ln, cn, ch)
}
}
}
end
 
procedure scan_identifier_or_reserved_word(inp, pushback)
local ch
local s
local line_no, column_no
 
s := ""
ch := @inp
line_no := ch[2]
column_no := ch[3]
while EOF ~=== ch[1] & any(ident_continuation, ch[1]) do {
s ||:= ch[1]
ch := @inp
}
ch @pushback
return reserved_word_lookup (s, line_no, column_no)
end
 
procedure scan_integer_literal(inp, pushback)
local ch
local s
local line_no, column_no
 
s := ""
ch := @inp
line_no := ch[2]
column_no := ch[3]
while EOF ~=== ch[1] & any(ident_continuation, ch[1]) do {
s ||:= ch[1]
ch := @inp
}
ch @pushback
not upto(~&digits, s) | invalid_integer_literal(line_no, column_no, s)
return [TOKEN_INTEGER, s, line_no, column_no]
end
 
procedure scan_character_literal(inp, pushback)
local ch, ch1
local close_quote
local toktup
local line_no, column_no
 
ch := @inp # The opening quote.
close_quote := ch[1] # Same as the opening quote.
ch @pushback
 
line_no := ch[2]
column_no := ch[3]
 
toktup := scan_character_literal_without_checking_end(inp)
ch1 := @inp
if ch1[1] ~=== close_quote then {
repeat {
case ch1[1] of {
EOF : unterminated_character_literal(line_no, column_no)
close_quote : multicharacter_literal(line_no, column_no)
default : ch1 := @inp
}
}
}
return toktup
end
 
procedure scan_character_literal_without_checking_end(inp)
local ch, ch1, ch2
 
ch := @inp # The opening quote.
ch1 := @inp
EOF ~=== ch1[1] | unterminated_character_literal(ch[2], ch[3])
if ch1[1] == "\\" then {
ch2 := @inp
EOF ~=== ch2[1] | unterminated_character_literal(ch[2], ch[3])
case ch2[1] of {
"n" : return [TOKEN_INTEGER, string(ord("\n")), ch[2], ch[3]]
"\\" : return [TOKEN_INTEGER, string(ord("\\")), ch[2], ch[3]]
default : unsupported_escape(ch1[2], ch1[3], ch2)
}
} else {
return [TOKEN_INTEGER, string(ord(ch1[1])), ch[2], ch[3]]
}
end
 
procedure scan_string_literal(inp)
local ch, ch1, ch2
local line_no, column_no
local close_quote
local s
local retval
 
ch := @inp # The opening quote
close_quote := ch[1] # Same as the opening quote.
line_no := ch[2]
column_no := ch[3]
 
s := ch[1]
until \retval do {
ch1 := @inp
ch1[1] ~=== EOF |
unterminated_string_literal (line_no, column_no,
"end of input")
ch1[1] ~== "\n" |
unterminated_string_literal (line_no, column_no,
"end of line")
if ch1[1] == close_quote then {
retval := [TOKEN_STRING, s || close_quote, line_no, column_no]
} else if ch1[1] ~== "\\" then {
s ||:= ch1[1]
} else {
ch2 := @inp
EOF ~=== ch2[1] | unsupported_escape(line_no, column_no, ch2)
case ch2[1] of {
"n" : s ||:= "\\n"
"\\" : s ||:= "\\\\"
default : unsupported_escape(line_no, column_no, ch2)
}
}
}
return retval
end
 
procedure skip_spaces_and_comments(inp, pushback)
local ch, ch1
 
repeat {
ch := @inp
(EOF === ch[1]) & { ch @pushback; return }
if not any(whitespace, ch[1]) then {
(ch[1] == "/") | { ch @pushback; return }
(ch1 := @inp) | { ch @pushback; return }
(ch1[1] == "*") | { ch1 @pushback; ch @pushback; return }
scan_comment(inp, ch[2], ch[3])
}
}
end
 
procedure scan_comment(inp, line_no, column_no)
local ch, ch1
 
until (\ch)[1] == "*" & (\ch1)[1] == "/" do {
ch := @inp
(EOF === ch[1]) & unterminated_comment(line_no, column_no)
if ch[1] == "*" then {
ch1 := @inp
(EOF === ch1[1]) & unterminated_comment(line_no, column_no)
}
}
return
end
 
procedure reserved_word_lookup(s, line_no, column_no)
 
# Lookup is by an extremely simple perfect hash.
 
static reserved_words
static reserved_word_tokens
local hashval, token, toktup
 
initial {
reserved_words := ["if", "print", "else",
"", "putc", "",
"", "while", ""]
reserved_word_tokens :=
[TOKEN_IF, TOKEN_PRINT, TOKEN_ELSE,
TOKEN_IDENTIFIER, TOKEN_PUTC, TOKEN_IDENTIFIER,
TOKEN_IDENTIFIER, TOKEN_WHILE, TOKEN_IDENTIFIER]
}
 
if *s < 2 then {
toktup := [TOKEN_IDENTIFIER, s, line_no, column_no]
} else {
hashval := ((ord(s[1]) + ord(s[2])) % (*reserved_words)) + 1
token := reserved_word_tokens[hashval]
if token = TOKEN_IDENTIFIER | s ~== reserved_words[hashval] then {
toktup := [TOKEN_IDENTIFIER, s, line_no, column_no]
} else {
toktup := [token, s, line_no, column_no]
}
}
return toktup
end
 
procedure print_token(outf, toktup)
static token_names
local s_line, s_column
 
initial {
token_names := ["Keyword_else",
"Keyword_if",
"Keyword_print",
"Keyword_putc",
"Keyword_while",
"Op_multiply",
"Op_divide",
"Op_mod",
"Op_add",
"Op_subtract",
"Op_negate",
"Op_less",
"Op_lessequal",
"Op_greater",
"Op_greaterequal",
"Op_equal",
"Op_notequal",
"Op_not",
"Op_assign",
"Op_and",
"Op_or",
"LeftParen",
"RightParen",
"LeftBrace",
"RightBrace",
"Semicolon",
"Comma",
"Identifier",
"Integer",
"String",
"End_of_input"]
}
 
/outf := &output
s_line := string(toktup[3])
s_column := string(toktup[4])
writes(outf, right (s_line, max(5, *s_line)))
writes(outf, " ")
writes(outf, right (s_column, max(5, *s_column)))
writes(outf, " ")
writes(outf, token_names[toktup[1] + 1])
case toktup[1] of {
TOKEN_IDENTIFIER : writes(outf, " ", toktup[2])
TOKEN_INTEGER : writes(outf, " ", toktup[2])
TOKEN_STRING : writes(outf, " ", toktup[2])
}
write(outf)
return
end
 
procedure inputter(inpf, pushback_buffer)
local buffer
local line_no, column_no
local c
 
buffer := ""
line_no := 1
column_no := 1
 
repeat {
buffer? {
until *pushback_buffer = 0 & pos(0) do {
if *pushback_buffer ~= 0 then {
suspend pop(pushback_buffer)
} else {
c := move(1)
suspend [c, line_no, column_no]
if c == "\n" then {
line_no +:= 1
column_no := 1
} else {
column_no +:= 1
}
}
}
}
(buffer := reads(inpf, 2048)) |
suspend [EOF, line_no, column_no]
}
end
 
procedure unterminated_comment(line_no, column_no)
error("unterminated comment starting at ",
line_no, ":", column_no)
end
 
procedure unexpected_character(line_no, column_no, ch)
error("unexpected character '", ch[1], "' starting at ",
line_no, ":", column_no)
end
 
procedure unterminated_string_literal (line_no, column_no, cause)
error("unterminated string literal (", cause, ") starting at ",
line_no, ":", column_no)
end
 
procedure unsupported_escape (line_no, column_no, ch)
if ch[1] === EOF then {
error("unexpected \\ at end of input",
" starting at ", line_no, ":", column_no)
} else {
error("unsupported escape \\", ch[1],
" starting at ", line_no, ":", column_no)
}
end
 
procedure invalid_integer_literal(line_no, column_no, s)
error("invalid integer literal ", s,
" starting at ", line_no, ":", column_no)
end
 
procedure unterminated_character_literal(line_no, column_no)
error("unterminated character literal starting at ",
line_no, ":", column_no)
end
 
procedure multicharacter_literal(line_no, column_no)
error("unsupported multicharacter literal starting at ",
line_no, ":", column_no)
end
 
procedure error(args[])
write!([&errout] ||| args)
exit(1)
end
 
procedure max(x, y)
return (if x < y then y else x)
end</syntaxhighlight>
 
 
{{out}}
<pre>$ icont -s -u -o lex lex-in-Icon.icn && ./lex compiler-tests/testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|J}}==
Here, we first build a tokenizer state machine sufficient to recognize our mini-language. This tokenizer must not discard any characters, because we will be using cumulative character offsets to identify line numbers and column numbers.
 
Then, we refine this result: we generate those line and column numbers, discard whitespace and comments, and classify tokens based on their structure.
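 
As a side note, recovering a line and column from a cumulative character offset is a small, self-contained computation. The following Python sketch (with hypothetical helper names, not part of the J entry) shows one way to do it: record the offsets of the newlines and locate a token's offset among them.
 
<syntaxhighlight lang="python"># Hypothetical illustration: map a 0-based character offset in the source
# text to a 1-based (line, column) pair.
import bisect

def line_and_column(source, offset):
    newlines = [i for i, ch in enumerate(source) if ch == '\n']
    line = bisect.bisect_right(newlines, offset - 1)   # newlines strictly before offset
    start_of_line = newlines[line - 1] + 1 if line else 0
    return line + 1, offset - start_of_line + 1

src = "print(\n  42\n);"
print(line_and_column(src, src.index("42")))   # -> (2, 3)</syntaxhighlight>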
 
(Also, in this version, rather than building out a full state machine to recognize character literals, we treat character literals as a sequence of tokens which we must then refine. It might have been wiser to build character literals as single tokens.)
 
Implementation:
 
<syntaxhighlight lang="j">symbols=:256#0
ch=: {{1 0+x[symbols=: x (a.i.y)} symbols}}
'T0 token' =: 0 ch '%+-!(){};,<>=!|&'
'L0 letter' =: 1 ch '_',,u:65 97+/i.26
'D0 digit' =: 2 ch u:48+i.10
'S0 space' =: 3 ch ' ',LF
'C0 commen' =: 4 ch '/'
'C1 comment'=: 5 ch '*'
'q0 quote' =: 6 ch ''''
'Q0 dquote' =: 7 ch '"'
 
width=: 1+>./symbols
default=: ,:(1+i.width),every 2
states=:((1+i.width),every 1),width#default
extend=: {{
if.y>#states do.states=: y{.states,y#default
end.states
}}
pad=: {{if. 0=#y do.y=.#states end.y}}
function=: {{ NB. x: before, m: op, n: symbol, y: after
y[states=: (y,m) (<x,n)} extend 1+x>.y=.pad y
}}
{{for_op.y do.(op)=: op_index function end.0}};:'nop init start'
all=: {{y=.pad y
for_symbol.i.width do.
x symbol nop y
end.y
}}
any=: {{y=.pad y
for_symbol.i.width do.
x symbol start y
end.y
}}
 
NB. identifiers and keywords
L0 letter nop L0
L0 digit nop L0
 
NB. numbers
D0 digit nop D0
D0 letter nop D0
 
NB. white space
S0 space nop S0
 
NB. comments
C1=: C0 comment nop ''
C2=: C1 all ''
C2 all C2
C3=: C2 commen nop ''
C4=: C3 comment nop ''
 
NB. quoted characters
q1=: q0 any ''
 
NB. strings
Q1=: Q0 all ''
Q1 all Q1
Q2=: Q1 dquote nop ''
Q0 dquote nop Q2
 
tokenize=:{{
tok=. (0;states;symbols);:y
for_fix.cut'<= >= == != && ||'do.
M=.;:;fix
for_k.|.I.M E.tok do.
tok=.(fix,<'') (0 1+k)} tok
end.
end.tok-.a:
}}
 
(tknames=:;: {{)n
Op_multiply Op_divide Op_mod Op_add Op_subtract Op_less Op_lessequal
Op_greater Op_greaterequal Op_equal Op_notequal Op_not Op_and Op_or
Op_assign LeftParen RightParen Keyword_if LeftBrace Keyword_else
RightBrace Keyword_while Semicolon Keyword_print Comma Keyword_putc
}}-.LF)=: tkref=: tokenize '*/%+-<<=>>===!=!&&||=()if{else}while;print,putc'
NB. the reference tokens here were arranged to avoid whitespace tokens
NB. also, we reserve multiple token instances where a literal string
NB. appears in different syntactic productions. Here, we only use the initial
NB. instances -- the others will be used in the syntax analyzer which
NB. uses the same tkref and tknames.
 
shift=: |.!.0
numvals=: {{
ndx=. I.(0<#@>y)**/@> y e.L:0 '0123456789'
({{".y,'x'}}each ndx{y) ndx} y
}}
chrvals=: {{
q=. y=<,''''
s=. y=<,'\'
j=. I.(-.s)*(1&shift * _1&shift)q
k=. I.(y e.;:'\n')*(1 shift q)*(_2 shift q)*_1 shift s
jvals=. a.i.L:0 j{y NB. not escaped
kvals=. (k{s){<"0 a.i.LF,'\' NB. escaped
(,a:,jvals,:a:) (,_1 0 1+/j)} (,a:,a:,kvals,:a:) (,_2 _1 0 1+/k)} y
}}
 
validstring=: ((1<#)*('"'={.)*('"'={:)*('\'=])-:'\n'&E.(+._1&shift)@+.'\\'&E.) every
 
validid=: ((<,'\')~:_1&|.) * (e.&tkref) < (e.&(u:I.symbols=letter)@{. * */@(e.&(u:I.symbols e.letter,digit))@}.) every
 
lex=: {{
lineref=.I.y=LF
tokens=.(tokenize y),<,'_'
offsets=.0,}:#@;\tokens
lines=. lineref I.offsets
columns=. offsets-lines{0,lineref
keep=. -.({.@> tokens)e.u:I.space=symbols
names=. (<'End_of_input') _1} (tkref i.tokens) {(_3}.tknames),4#<'Error'
unknown=. names=<'Error'
values=. a: _1} unknown#inv numvals chrvals unknown#tokens
names=. (<'Integer') (I.(values~:a:)*tokens~:values)} names
names=. (<'String') (I.validstring tokens)} names
names=. (<'Identifier') (I.validid tokens)} names
names=. (<'End_of_input') _1} names
comments=. '*/'&-:@(_2&{.)@> tokens
whitespace=. (values=tokens) * e.&(' ',LF)@{.@> tokens
keep=. (tokens~:<,'''')*-.comments+.whitespace+.unknown*a:=values
keep&#each ((1+lines),.columns);<names,.values
}}</syntaxhighlight>
 
Test case 3:
 
<syntaxhighlight lang="j">
flex=: {{
'A B'=.y
'names values'=.|:":each B
(":A),.' ',.names,.' ',.values
}}@lex
 
testcase3=: {{)n
/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
*/
/* Print */ print /* Sub */ -
/* Putc */ putc /* Lss */ <
/* If */ if /* Gtr */ >
/* Else */ else /* Leq */ <=
/* While */ while /* Geq */ >=
/* Lbrace */ { /* Eq */ ==
/* Rbrace */ } /* Neq */ !=
/* Lparen */ ( /* And */ &&
/* Rparen */ ) /* Or */ ||
/* Uminus */ - /* Semi */ ;
/* Not */ ! /* Comma */ ,
/* Mul */ * /* Assign */ =
/* Div */ / /* Integer */ 42
/* Mod */ % /* String */ "String literal"
/* Add */ + /* Ident */ variable_name
/* character literal */ '\n'
/* character literal */ '\\'
/* character literal */ ' '
}}
 
flex testcase3
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 28 Identifier 10
21 28 Integer 92
22 27 Integer 32
23 1 End_of_input </syntaxhighlight>
 
Here, it seems expedient to retain a structured representation of the lexical result. As shown, it's straightforward to produce a "pure" textual result for a hypothetical alternative implementation of the syntax analyzer, but the structured representation will be easier to deal with.
 
=={{header|Java}}==
<syntaxhighlight lang="java">
// Translated from the Python source
 
import java.io.File;
import java.io.FileNotFoundException;
import java.util.HashMap;
import java.util.Map;
import java.util.Scanner;
 
public class Lexer {
private int line;
private int pos;
private int position;
private char chr;
private String s;
Map<String, TokenType> keywords = new HashMap<>();
static class Token {
public TokenType tokentype;
public String value;
public int line;
public int pos;
Token(TokenType token, String value, int line, int pos) {
this.tokentype = token; this.value = value; this.line = line; this.pos = pos;
}
@Override
public String toString() {
String result = String.format("%5d %5d %-15s", this.line, this.pos, this.tokentype);
switch (this.tokentype) {
case Integer:
result += String.format(" %4s", value);
break;
case Identifier:
result += String.format(" %s", value);
break;
case String:
result += String.format(" \"%s\"", value);
break;
}
return result;
}
}
static enum TokenType {
End_of_input, Op_multiply, Op_divide, Op_mod, Op_add, Op_subtract,
Op_negate, Op_not, Op_less, Op_lessequal, Op_greater, Op_greaterequal,
Op_equal, Op_notequal, Op_assign, Op_and, Op_or, Keyword_if,
Keyword_else, Keyword_while, Keyword_print, Keyword_putc, LeftParen, RightParen,
LeftBrace, RightBrace, Semicolon, Comma, Identifier, Integer, String
}
static void error(int line, int pos, String msg) {
if (line > 0 && pos > 0) {
System.out.printf("%s in line %d, pos %d\n", msg, line, pos);
} else {
System.out.println(msg);
}
System.exit(1);
}
 
Lexer(String source) {
this.line = 1;
this.pos = 0;
this.position = 0;
this.s = source;
this.chr = this.s.charAt(0);
this.keywords.put("if", TokenType.Keyword_if);
this.keywords.put("else", TokenType.Keyword_else);
this.keywords.put("print", TokenType.Keyword_print);
this.keywords.put("putc", TokenType.Keyword_putc);
this.keywords.put("while", TokenType.Keyword_while);
}
Token follow(char expect, TokenType ifyes, TokenType ifno, int line, int pos) {
if (getNextChar() == expect) {
getNextChar();
return new Token(ifyes, "", line, pos);
}
if (ifno == TokenType.End_of_input) {
error(line, pos, String.format("follow: unrecognized character: (%d) '%c'", (int)this.chr, this.chr));
}
return new Token(ifno, "", line, pos);
}
Token char_lit(int line, int pos) {
char c = getNextChar(); // skip opening quote
int n = (int)c;
if (c == '\'') {
error(line, pos, "empty character constant");
} else if (c == '\\') {
c = getNextChar();
if (c == 'n') {
n = 10;
} else if (c == '\\') {
n = '\\';
} else {
error(line, pos, String.format("unknown escape sequence \\%c", c));
}
}
if (getNextChar() != '\'') {
error(line, pos, "multi-character constant");
}
getNextChar();
return new Token(TokenType.Integer, "" + n, line, pos);
}
Token string_lit(char start, int line, int pos) {
String result = "";
while (getNextChar() != start) {
if (this.chr == '\u0000') {
error(line, pos, "EOF while scanning string literal");
}
if (this.chr == '\n') {
error(line, pos, "EOL while scanning string literal");
}
result += this.chr;
}
getNextChar();
return new Token(TokenType.String, result, line, pos);
}
Token div_or_comment(int line, int pos) {
if (getNextChar() != '*') {
return new Token(TokenType.Op_divide, "", line, pos);
}
getNextChar();
while (true) {
if (this.chr == '\u0000') {
error(line, pos, "EOF in comment");
} else if (this.chr == '*') {
if (getNextChar() == '/') {
getNextChar();
return getToken();
}
} else {
getNextChar();
}
}
}
Token identifier_or_integer(int line, int pos) {
boolean is_number = true;
String text = "";
while (Character.isAlphabetic(this.chr) || Character.isDigit(this.chr) || this.chr == '_') {
text += this.chr;
if (!Character.isDigit(this.chr)) {
is_number = false;
}
getNextChar();
}
if (text.equals("")) {
error(line, pos, String.format("identifer_or_integer unrecognized character: (%d) %c", (int)this.chr, this.chr));
}
if (Character.isDigit(text.charAt(0))) {
if (!is_number) {
error(line, pos, String.format("invalid number: %s", text));
}
return new Token(TokenType.Integer, text, line, pos);
}
if (this.keywords.containsKey(text)) {
return new Token(this.keywords.get(text), "", line, pos);
}
return new Token(TokenType.Identifier, text, line, pos);
}
Token getToken() {
int line, pos;
while (Character.isWhitespace(this.chr)) {
getNextChar();
}
line = this.line;
pos = this.pos;
switch (this.chr) {
case '\u0000': return new Token(TokenType.End_of_input, "", this.line, this.pos);
case '/': return div_or_comment(line, pos);
case '\'': return char_lit(line, pos);
case '<': return follow('=', TokenType.Op_lessequal, TokenType.Op_less, line, pos);
case '>': return follow('=', TokenType.Op_greaterequal, TokenType.Op_greater, line, pos);
case '=': return follow('=', TokenType.Op_equal, TokenType.Op_assign, line, pos);
case '!': return follow('=', TokenType.Op_notequal, TokenType.Op_not, line, pos);
case '&': return follow('&', TokenType.Op_and, TokenType.End_of_input, line, pos);
case '|': return follow('|', TokenType.Op_or, TokenType.End_of_input, line, pos);
case '"': return string_lit(this.chr, line, pos);
case '{': getNextChar(); return new Token(TokenType.LeftBrace, "", line, pos);
case '}': getNextChar(); return new Token(TokenType.RightBrace, "", line, pos);
case '(': getNextChar(); return new Token(TokenType.LeftParen, "", line, pos);
case ')': getNextChar(); return new Token(TokenType.RightParen, "", line, pos);
case '+': getNextChar(); return new Token(TokenType.Op_add, "", line, pos);
case '-': getNextChar(); return new Token(TokenType.Op_subtract, "", line, pos);
case '*': getNextChar(); return new Token(TokenType.Op_multiply, "", line, pos);
case '%': getNextChar(); return new Token(TokenType.Op_mod, "", line, pos);
case ';': getNextChar(); return new Token(TokenType.Semicolon, "", line, pos);
case ',': getNextChar(); return new Token(TokenType.Comma, "", line, pos);
default: return identifier_or_integer(line, pos);
}
}
char getNextChar() {
this.pos++;
this.position++;
if (this.position >= this.s.length()) {
this.chr = '\u0000';
return this.chr;
}
this.chr = this.s.charAt(this.position);
if (this.chr == '\n') {
this.line++;
this.pos = 0;
}
return this.chr;
}
 
void printTokens() {
Token t;
while ((t = getToken()).tokentype != TokenType.End_of_input) {
System.out.println(t);
}
System.out.println(t);
}
public static void main(String[] args) {
if (args.length > 0) {
try {
File f = new File(args[0]);
Scanner s = new Scanner(f);
String source = " ";
while (s.hasNext()) {
source += s.nextLine() + "\n";
}
Lexer l = new Lexer(source);
l.printTokens();
} catch(FileNotFoundException e) {
error(-1, -1, "Exception: " + e.getMessage());
}
} else {
error(-1, -1, "No args");
}
}
}
</syntaxhighlight>
 
=={{header|JavaScript}}==
{{incorrect|Javascript|Please show output. Code is identical to [[Compiler/syntax_analyzer]] task}}
<syntaxhighlight lang="javascript">
/*
Token: type, value, line, pos
*/
 
const TokenType = {
Keyword_if: 1, Keyword_else: 2, Keyword_print: 3, Keyword_putc: 4, Keyword_while: 5,
Op_add: 6, Op_and: 7, Op_assign: 8, Op_divide: 9, Op_equal: 10, Op_greater: 11,
Op_greaterequal: 12, Op_less: 13, Op_Lessequal: 14, Op_mod: 15, Op_multiply: 16, Op_not: 17,
Op_notequal: 18, Op_or: 19, Op_subtract: 20,
Integer: 21, String: 22, Identifier: 23,
Semicolon: 24, Comma: 25,
LeftBrace: 26, RightBrace: 27,
LeftParen: 28, RightParen: 29,
End_of_input: 99
}
 
class Lexer {
constructor(source) {
this.source = source
this.pos = 1 // position in line
this.position = 0 // position in source
this.line = 1
this.chr = this.source.charAt(0)
this.keywords = {
"if": TokenType.Keyword_if,
"else": TokenType.Keyword_else,
"print": TokenType.Keyword_print,
"putc": TokenType.Keyword_putc,
"while": TokenType.Keyword_while
}
}
getNextChar() {
this.pos++
this.position++
if (this.position >= this.source.length) {
this.chr = undefined
return this.chr
}
this.chr = this.source.charAt(this.position)
if (this.chr === '\n') {
this.line++
this.pos = 0
}
return this.chr
}
error(line, pos, message) {
if (line > 0 && pos > 0) {
console.log(message + " in line " + line + ", pos " + pos + "\n")
} else {
console.log(message)
}
process.exit(1)
}
follow(expect, ifyes, ifno, line, pos) {
if (this.getNextChar() === expect) {
this.getNextChar()
return { type: ifyes, value: "", line, pos }
}
if (ifno === TokenType.End_of_input) {
this.error(line, pos, "follow: unrecognized character: (" + this.chr.charCodeAt(0) + ") '" + this.chr + "'")
}
return { type: ifno, value: "", line, pos }
}
div_or_comment(line, pos) {
if (this.getNextChar() !== '*') {
return { type: TokenType.Op_divide, value: "/", line, pos }
}
this.getNextChar()
while (true) {
if (this.chr === '\u0000') {
this.error(line, pos, "EOF in comment")
} else if (this.chr === '*') {
if (this.getNextChar() === '/') {
this.getNextChar()
return this.getToken()
}
} else {
this.getNextChar()
}
}
}
char_lit(line, pos) {
let c = this.getNextChar() // skip opening quote
let n = c.charCodeAt(0)
if (c === "\'") {
this.error(line, pos, "empty character constant")
} else if (c === "\\") {
c = this.getNextChar()
if (c == "n") {
n = 10
} else if (c === "\\") {
n = 92
} else {
this.error(line, pos, "unknown escape sequence \\" + c)
}
}
if (this.getNextChar() !== "\'") {
this.error(line, pos, "multi-character constant")
}
this.getNextChar()
return { type: TokenType.Integer, value: n, line, pos }
}
string_lit(start, line, pos) {
let value = ""
while (this.getNextChar() !== start) {
if (this.chr === undefined) {
this.error(line, pos, "EOF while scanning string literal")
}
if (this.chr === "\n") {
this.error(line, pos, "EOL while scanning string literal")
}
value += this.chr
}
this.getNextChar()
return { type: TokenType.String, value, line, pos }
}
identifier_or_integer(line, pos) {
let is_number = true
let text = ""
while (/\w/.test(this.chr) || this.chr === '_') {
text += this.chr
if (!/\d/.test(this.chr)) {
is_number = false
}
this.getNextChar()
}
if (text === "") {
this.error(line, pos, "identifer_or_integer unrecopgnized character: follow: unrecognized character: (" + this.chr.charCodeAt(0) + ") '" + this.chr + "'")
}
if (/\d/.test(text.charAt(0))) {
if (!is_number) {
this.error(line, pos, "invaslid number: " + text)
}
return { type: TokenType.Integer, value: text, line, pos }
}
if (text in this.keywords) {
return { type: this.keywords[text], value: "", line, pos }
}
return { type: TokenType.Identifier, value: text, line, pos }
}
getToken() {
let pos, line
// Ignore whitespaces
while (/\s/.test(this.chr)) { this.getNextChar() }
line = this.line; pos = this.pos
switch (this.chr) {
case undefined: return { type: TokenType.End_of_input, value: "", line: this.line, pos: this.pos }
case "/": return this.div_or_comment(line, pos)
case "\'": return this.char_lit(line, pos)
case "\"": return this.string_lit(this.chr, line, pos)
 
case "<": return this.follow("=", TokenType.Op_lessequal, TokenType.Op_less, line, pos)
case ">": return this.follow("=", TokenType.Op_greaterequal, TokenType.Op_greater, line, pos)
case "=": return this.follow("=", TokenType.Op_equal, TokenType.Op_assign, line, pos)
case "!": return this.follow("=", TokenType.Op_notequal, TokenType.Op_not, line, pos)
case "&": return this.follow("&", TokenType.Op_and, TokenType.End_of_input, line, pos)
case "|": return this.follow("|", TokenType.Op_or, TokenType.End_of_input, line, pos)
 
case "{": this.getNextChar(); return { type: TokenType.LeftBrace, value: "{", line, pos }
case "}": this.getNextChar(); return { type: TokenType.RightBrace, value: "}", line, pos }
case "(": this.getNextChar(); return { type: TokenType.LeftParen, value: "(", line, pos }
case ")": this.getNextChar(); return { type: TokenType.RightParen, value: ")", line, pos }
case "+": this.getNextChar(); return { type: TokenType.Op_add, value: "+", line, pos }
case "-": this.getNextChar(); return { type: TokenType.Op_subtract, value: "-", line, pos }
case "*": this.getNextChar(); return { type: TokenType.Op_multiply, value: "*", line, pos }
case "%": this.getNextChar(); return { type: TokenType.Op_mod, value: "%", line, pos }
case ";": this.getNextChar(); return { type: TokenType.Semicolon, value: ";", line, pos }
case ",": this.getNextChar(); return { type: TokenType.Comma, value: ",", line, pos }
 
default: return this.identifier_or_integer(line, pos)
}
}
/*
https://stackoverflow.com/questions/9907419/how-to-get-a-key-in-a-javascript-object-by-its-value
*/
getTokenType(value) {
return Object.keys(TokenType).find(key => TokenType[key] === value)
}
printToken(t) {
let result = (" " + t.line).substr(t.line.toString().length)
result += (" " + t.pos).substr(t.pos.toString().length)
result += (" " + this.getTokenType(t.type) + " ").substr(0, 16)
switch (t.type) {
case TokenType.Integer:
result += " " + t.value
break;
case TokenType.Identifier:
result += " " + t.value
break;
case TokenType.String:
result += " \""+ t.value + "\""
break;
}
console.log(result)
}
printTokens() {
let t
while ((t = this.getToken()).type !== TokenType.End_of_input) {
this.printToken(t)
}
this.printToken(t)
}
}
const fs = require("fs")
fs.readFile(process.argv[2], "utf8", (err, data) => {
l = new Lexer(data)
l.printTokens()
})
</syntaxhighlight>
 
=={{header|Julia}}==
<syntaxhighlight lang="julia">struct Tokenized
startline::Int
startcol::Int
name::String
value::Union{Nothing, Int, String}
end
 
const optokens = Dict("*" => "Op_multiply", "/" => "Op_divide", "%" => "Op_mod", "+" => "Op_add",
"-" => "Op_subtract", "!" => "Op_not", "<" => "Op_less", "<=" => "Op_lessequal",
">" => "Op_greater", ">=" => "Op_greaterequal", "==" => "Op_equal", "!=" => "Op_notequal",
"!" => "Op_not", "=" => "Op_assign", "&&" => "Op_and", "||" => "Op_or")
 
const keywordtokens = Dict("if" => "Keyword_if", "else" => "Keyword_else", "while" => "Keyword_while",
"print" => "Keyword_print", "putc" => "Keyword_putc")
 
const symboltokens = Dict("(" => "LeftParen", ")" => "RightParen", "{" => "LeftBrace",
"}" => "RightBrace", ";" => "Semicolon", "," => "Comma")
 
const errors = ["Empty character constant.", "Unknown escape sequence.", "Multi-character constant.",
"End-of-file in comment. Closing comment characters not found.",
"End-of-file while scanning string literal. Closing string character not found.",
"End-of-line while scanning string literal. Closing string character not found before end-of-line.",
"Unrecognized character.", "Invalid number. Starts like a number, but ends in non-numeric characters."]
 
asws(s) = (nnl = length(findall(x->x=='\n', s)); " " ^ (length(s) - nnl) * "\n" ^ nnl)
comment2ws(t) = (while occursin("/*", t) t = replace(t, r"\/\* .+? (?: \*\/)"xs => asws; count = 1) end; t)
hasinvalidescapes(t) = ((m = match(r"\\.", t)) != nothing && m.match != "\\\\" && m.match != "\\n")
hasemptycharconstant(t) = (match(r"\'\'", t) != nothing)
hasmulticharconstant(t) = ((m = match(r"\'[^\'][^\']+\'", t)) != nothing && m.match != "\'\\\\\'" && m.match != "\'\\n\'")
hasunbalancedquotes(t) = isodd(length(findall(x -> x == '\"', t)))
hasunrecognizedchar(t) = match(r"[^\w\s\d\*\/\%\+\-\<\>\=\!\&\|\(\)\{\}\;\,\"\'\\]", t) != nothing
 
function throwiferror(line, n)
if hasemptycharconstant(line)
throw("Tokenizer error line $n: " * errors[1])
end
if hasinvalidescapes(line)
throw("Tokenizer error line $n: " * errors[2])
end
if hasmulticharconstant(line)
println("error at ", match(r"\'[^\'][^\']+\'", line).match)
throw("Tokenizer error line $n: " * errors[3])
end
if occursin("/*", line)
throw("Tokenizer error line $n: " * errors[4])
end
if hasunrecognizedchar(line)
throw("Tokenizer error line $n: " * errors[7])
end
end
 
function tokenize(txt)
tokens = Vector{Tokenized}()
txt = comment2ws(txt)
lines = split(txt, "\n")
if hasunbalancedquotes(txt)
throw("Tokenizer error: $(errors[5])")
end
for (startline, line) in enumerate(lines)
if strip(line) == ""
continue
end
throwiferror(line, startline)
lastc = Char(0)
withintoken = 0
for (startcol, c) in enumerate(line)
if withintoken > 0
withintoken -= 1
continue
elseif isspace(c[1])
continue
elseif (c == '=') && (startcol > 1) && ((c2 = line[startcol - 1]) in ['<', '>', '=', '!'])
tokens[end] = Tokenized(startline, startcol - 1, optokens[c2 * c], nothing)
elseif (c == '&') || (c == '|')
if length(line) > startcol && line[startcol + 1] == c
push!(tokens, Tokenized(startline, startcol, optokens[c * c], nothing))
withintoken = 1
else
throw("Tokenizer error line $startline: $(errors[7])")
end
elseif haskey(optokens, string(c))
push!(tokens, Tokenized(startline, startcol, optokens[string(c)], nothing))
elseif haskey(symboltokens, string(c))
push!(tokens, Tokenized(startline, startcol, symboltokens[string(c)], nothing))
elseif isdigit(c)
integerstring = match(r"^\d+", line[startcol:end]).match
pastnumposition = startcol + length(integerstring)
if (pastnumposition <= length(line)) && isletter(line[pastnumposition])
throw("Tokenizer error line $startline: " * errors[8])
end
i = parse(Int, integerstring)
push!(tokens, Tokenized(startline, startcol, "Integer", i))
withintoken = length(integerstring) - 1
elseif c == Char(39) # single quote
if (m = match(r"([^\\\'\n]|\\n|\\\\)\'", line[startcol+1:end])) != nothing
chs = m.captures[1]
i = (chs == "\\n") ? Int('\n') : (chs == "\\\\" ? Int('\\') : Int(chs[1]))
push!(tokens, Tokenized(startline, startcol, "Integer", i))
withintoken = length(chs) + 1
else
println("line $startline: bad match with ", line[startcol+1:end])
end
elseif c == Char(34) # double quote
if (m = match(r"([^\"\n]+)\"", line[startcol+1:end])) == nothing
throw("Tokenizer error line $startline: $(errors[6])")
end
litstring = m.captures[1]
push!(tokens, Tokenized(startline, startcol, "String", "\"$litstring\""))
withintoken = length(litstring) + 1
elseif (cols = findfirst(r"[a-zA-Z]+", line[startcol:end])) != nothing
litstring = line[cols .+ startcol .- 1]
if haskey(keywordtokens, string(litstring))
push!(tokens, Tokenized(startline, startcol, keywordtokens[litstring], nothing))
else
litstring = match(r"[_a-zA-Z0-9]+", line[startcol:end]).match
push!(tokens, Tokenized(startline, startcol, "Identifier", string(litstring)))
end
withintoken = length(litstring) - 1
end
lastc = c
end
end
push!(tokens, Tokenized(length(lines), length(lines[end]) + 1, "End_of_input", nothing))
tokens
end
 
const test3txt = raw"""
/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
*/
/* Print */ print /* Sub */ -
/* Putc */ putc /* Lss */ <
/* If */ if /* Gtr */ >
/* Else */ else /* Leq */ <=
/* While */ while /* Geq */ >=
/* Lbrace */ { /* Eq */ ==
/* Rbrace */ } /* Neq */ !=
/* Lparen */ ( /* And */ &&
/* Rparen */ ) /* Or */ ||
/* Uminus */ - /* Semi */ ;
/* Not */ ! /* Comma */ ,
/* Mul */ * /* Assign */ =
/* Div */ / /* Integer */ 42
/* Mod */ % /* String */ "String literal"
/* Add */ + /* Ident */ variable_name
/* character literal */ '\n'
/* character literal */ '\\'
/* character literal */ ' '
"""
 
println("Line Col Name Value")
for tok in tokenize(test3txt)
println(lpad(tok.startline, 3), lpad(tok.startcol, 5), lpad(tok.name, 18), " ", tok.value != nothing ? tok.value : "")
end
</syntaxhighlight>{{output}}<pre>
Line Col Name Value
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input
</pre>
=={{header|Kotlin}}==
{{trans|Java}}
<syntaxhighlight lang="kotlin">// Input: command line argument of file to process or console input. A two or
// three character console input of digits followed by a new line will be
// checked for an integer between zero and twenty-five to select a fixed test
// case to run. Any other console input will be parsed.
 
// Code based on the Java version found here:
// https://rosettacode.org/mw/index.php?title=Compiler/lexical_analyzer&action=edit&section=22
 
// Class to halt the parsing with an exception.
class ParsingFailed(message: String): Exception(message)
 
// Enumerate class of tokens supported by this scanner.
enum class TokenType {
Tk_End_of_input, Op_multiply, Op_divide, Op_mod, Op_add, Op_subtract,
Op_negate, Op_not, Op_less, Op_lessequal, Op_greater, Op_greaterequal,
Op_equal, Op_notequal, Op_assign, Op_and, Op_or, Kw_if,
Kw_else, Kw_while, Kw_print, Kw_putc, Sy_LeftParen, Sy_RightParen,
Sy_LeftBrace, Sy_RightBrace, Sy_Semicolon, Sy_Comma, Tk_Identifier,
Tk_Integer, Tk_String;
 
override fun toString() =
listOf("End_of_input", "Op_multiply", "Op_divide", "Op_mod", "Op_add",
"Op_subtract", "Op_negate", "Op_not", "Op_less", "Op_lessequal",
"Op_greater", "Op_greaterequal", "Op_equal", "Op_notequal",
"Op_assign", "Op_and", "Op_or", "Keyword_if", "Keyword_else",
"Keyword_while", "Keyword_print", "Keyword_putc", "LeftParen",
"RightParen", "LeftBrace", "RightBrace", "Semicolon", "Comma",
"Identifier", "Integer", "String")[this.ordinal]
} // TokenType
 
// Data class of tokens returned by the scanner.
data class Token(val token: TokenType, val value: String, val line: Int,
val pos: Int) {
 
// Overridden method to display the token.
override fun toString() =
"%5d %5d %-15s %s".format(line, pos, this.token,
when (this.token) {
TokenType.Tk_Integer, TokenType.Tk_Identifier ->
" %s".format(this.value)
TokenType.Tk_String ->
this.value.toList().joinToString("", " \"", "\"") {
when (it) {
'\t' ->
"\\t"
'\n' ->
"\\n"
'\u000b' ->
"\\v"
'\u000c' ->
"\\f"
'\r' ->
"\\r"
'"' ->
"\\\""
'\\' ->
"\\"
in ' '..'~' ->
"$it"
else ->
"\\u%04x".format(it.code) } }
else ->
"" } )
} // Token
 
// Function to display an error message and halt the scanner.
fun error(line: Int, pos: Int, msg: String): Nothing =
throw ParsingFailed("(%d, %d) %s\n".format(line, pos, msg))
 
// Class to process the source into tokens with properties of the
// source string, the line number, the column position, the index
// within the source string, the current character being processed,
// and map of the keyword strings to the corresponding token type.
class Lexer(private val s: String) {
private var line = 1
private var pos = 1
private var position = 0
private var chr =
if (s.isEmpty())
' '
else
s[0]
private val keywords = mapOf<String, TokenType>(
"if" to TokenType.Kw_if,
"else" to TokenType.Kw_else,
"print" to TokenType.Kw_print,
"putc" to TokenType.Kw_putc,
"while" to TokenType.Kw_while)
 
    // Method to retrieve the next character from the source. Returns '\u0000'
    // after the end of our source.
private fun getNextChar() =
if (++this.position >= this.s.length) {
this.pos++
this.chr = '\u0000'
this.chr
} else {
this.pos++
this.chr = this.s[this.position]
when (this.chr) {
'\n' -> {
this.line++
this.pos = 0
} // line
'\t' ->
while (this.pos%8 != 1)
this.pos++
} // when
this.chr
} // if
 
// Method to return the division token, skip the comment, or handle the
// error.
private fun div_or_comment(line: Int, pos: Int): Token =
if (getNextChar() != '*')
Token(TokenType.Op_divide, "", line, pos);
else {
getNextChar() // Skip comment start
outer@
while (true)
when (this.chr) {
'\u0000' ->
error(line, pos, "Lexer: EOF in comment");
'*' ->
if (getNextChar() == '/') {
getNextChar() // Skip comment end
break@outer
} // if
else ->
getNextChar()
} // when
getToken()
} // if
 
// Method to verify a character literal. Return the token or handle the
// error.
private fun char_lit(line: Int, pos: Int): Token {
var c = getNextChar() // skip opening quote
when (c) {
'\'' ->
error(line, pos, "Lexer: Empty character constant");
'\\' ->
c = when (getNextChar()) {
'n' ->
10.toChar()
'\\' ->
'\\'
'\'' ->
'\''
else ->
error(line, pos, "Lexer: Unknown escape sequence '\\%c'".
format(this.chr)) }
} // when
if (getNextChar() != '\'')
error(line, pos, "Lexer: Multi-character constant")
getNextChar() // Skip closing quote
return Token(TokenType.Tk_Integer, c.code.toString(), line, pos)
} // char_lit
 
// Method to check next character to see whether it belongs to the token
// we might be in the middle of. Return the correct token or handle the
// error.
private fun follow(expect: Char, ifyes: TokenType, ifno: TokenType,
line: Int, pos: Int): Token =
when {
getNextChar() == expect -> {
getNextChar()
Token(ifyes, "", line, pos)
} // matches
ifno == TokenType.Tk_End_of_input ->
error(line, pos,
"Lexer: %c expected: (%d) '%c'".format(expect,
this.chr.code, this.chr))
else ->
Token(ifno, "", line, pos)
} // when
 
// Method to verify a character string. Return the token or handle the
// error.
private fun string_lit(start: Char, line: Int, pos: Int): Token {
var result = ""
while (getNextChar() != start)
when (this.chr) {
'\u0000' ->
error(line, pos, "Lexer: EOF while scanning string literal")
'\n' ->
error(line, pos, "Lexer: EOL while scanning string literal")
'\\' ->
when (getNextChar()) {
'\\' ->
result += '\\'
'n' ->
result += '\n'
'"' ->
result += '"'
else ->
error(line, pos, "Lexer: Escape sequence unknown '\\%c'".
format(this.chr))
} // when
else ->
result += this.chr
} // when
getNextChar() // Toss closing quote
return Token(TokenType.Tk_String, result, line, pos)
} // string_lit
 
    // Method to retrieve an identifier or integer. Return the keyword
    // token if the text matches a keyword, the integer token if the text
    // is all digits, or the identifier token otherwise. Handle the error
    // cases of an empty text, an invalid number, and integer overflow.
private fun identifier_or_integer(line: Int, pos: Int): Token {
var is_number = true
var text = ""
while (this.chr in listOf('_')+('0'..'9')+('a'..'z')+('A'..'Z')) {
text += this.chr
is_number = is_number && this.chr in '0'..'9'
getNextChar()
} // while
if (text.isEmpty())
error(line, pos, "Lexer: Unrecognized character: (%d) %c".
format(this.chr.code, this.chr))
return when {
text[0] in '0'..'9' ->
if (!is_number)
error(line, pos, "Lexer: Invalid number: %s".
format(text))
else {
val max = Int.MAX_VALUE.toString()
if (text.length > max.length || (text.length == max.length &&
max < text))
error(line, pos,
"Lexer: Number exceeds maximum value %s".
format(text))
Token(TokenType.Tk_Integer, text, line, pos)
} // if
this.keywords.containsKey(text) ->
Token(this.keywords[text]!!, "", line, pos)
else ->
Token(TokenType.Tk_Identifier, text, line, pos) }
} // identifier_or_integer
 
    // Method to skip whitespace (both the C whitespace characters and any
    // other Unicode whitespace) and retrieve the next token.
private fun getToken(): Token {
while (this.chr in listOf('\t', '\n', '\u000b', '\u000c', '\r', ' ') ||
this.chr.isWhitespace())
getNextChar()
val line = this.line
val pos = this.pos
return when (this.chr) {
'\u0000' ->
Token(TokenType.Tk_End_of_input, "", line, pos)
'/' ->
div_or_comment(line, pos)
'\'' ->
char_lit(line, pos)
'<' ->
follow('=', TokenType.Op_lessequal, TokenType.Op_less, line, pos)
'>' ->
follow('=', TokenType.Op_greaterequal, TokenType.Op_greater, line, pos)
'=' ->
follow('=', TokenType.Op_equal, TokenType.Op_assign, line, pos)
'!' ->
follow('=', TokenType.Op_notequal, TokenType.Op_not, line, pos)
'&' ->
follow('&', TokenType.Op_and, TokenType.Tk_End_of_input, line, pos)
'|' ->
follow('|', TokenType.Op_or, TokenType.Tk_End_of_input, line, pos)
'"' ->
string_lit(this.chr, line, pos)
'{' -> {
getNextChar()
Token(TokenType.Sy_LeftBrace, "", line, pos)
} // open brace
'}' -> {
getNextChar()
Token(TokenType.Sy_RightBrace, "", line, pos)
} // close brace
'(' -> {
getNextChar()
Token(TokenType.Sy_LeftParen, "", line, pos)
} // open paren
')' -> {
getNextChar()
Token(TokenType.Sy_RightParen, "", line, pos)
} // close paren
'+' -> {
getNextChar()
Token(TokenType.Op_add, "", line, pos)
} // plus
'-' -> {
getNextChar()
Token(TokenType.Op_subtract, "", line, pos)
} // dash
'*' -> {
getNextChar()
Token(TokenType.Op_multiply, "", line, pos)
} // asterisk
'%' -> {
getNextChar()
Token(TokenType.Op_mod, "", line, pos)
} // percent
';' -> {
getNextChar()
Token(TokenType.Sy_Semicolon, "", line, pos)
} // semicolon
',' -> {
getNextChar()
Token(TokenType.Sy_Comma, "", line, pos)
} // comma
else ->
identifier_or_integer(line, pos) }
} // getToken
 
// Method to parse and display tokens.
fun printTokens() {
do {
val t: Token = getToken()
println(t)
} while (t.token != TokenType.Tk_End_of_input)
} // printTokens
} // Lexer
 
 
// Function to run the lexer over each of the sample programs from the task
// and, for the higher test numbers, over inputs that trigger each of the
// error messages this program supports.
fun tests(number: Int) {
 
// Function to generate test case 0 source: Hello World/Text.
fun hello() {
Lexer(
"""/*
Hello world
*/
print("Hello, World!\n");
""").printTokens()
} // hello
 
// Function to generate test case 1 source: Phoenix Number.
fun phoenix() {
Lexer(
"""/*
Show Ident and Integers
*/
phoenix_number = 142857;
print(phoenix_number, "\n");""").printTokens()
} // phoenix
 
// Function to generate test case 2 source: All Symbols.
fun symbols() {
Lexer(
"""/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
*/
/* Print */ print /* Sub */ -
/* Putc */ putc /* Lss */ <
/* If */ if /* Gtr */ >
/* Else */ else /* Leq */ <=
/* While */ while /* Geq */ >=
/* Lbrace */ { /* Eq */ ==
/* Rbrace */ } /* Neq */ !=
/* Lparen */ ( /* And */ &&
/* Rparen */ ) /* Or */ ||
/* Uminus */ - /* Semi */ ;
/* Not */ ! /* Comma */ ,
/* Mul */ * /* Assign */ =
/* Div */ / /* Integer */ 42
/* Mod */ % /* String */ "String literal"
/* Add */ + /* Ident */ variable_name
/* character literal */ '\n'
/* character literal */ '\\'
/* character literal */ ' '""").printTokens()
} // symbols
 
// Function to generate test case 3 source: Test Case 4.
fun four() {
Lexer(
"""/*** test printing, embedded \n and comments with lots of '*' ***/
print(42);
print("\nHello World\nGood Bye\nok\n");
print("Print a slash n - \\n.\n");""").printTokens()
} // four
 
// Function to generate test case 4 source: Count.
fun count() {
Lexer(
"""count = 1;
while (count < 10) {
print("count is: ", count, "\n");
count = count + 1;
}""").printTokens()
} // count
 
// Function to generate test case 5 source: 100 Doors.
fun doors() {
Lexer(
"""/* 100 Doors */
i = 1;
while (i * i <= 100) {
print("door ", i * i, " is open\n");
i = i + 1;
}""").printTokens()
} // doors
 
// Function to generate test case 6 source: Negative Tests.
fun negative() {
Lexer(
"""a = (-1 * ((-1 * (5 * 15)) / 10));
print(a, "\n");
b = -a;
print(b, "\n");
print(-b, "\n");
print(-(1), "\n");""").printTokens()
} // negative
 
// Function to generate test case 7 source: Deep.
fun deep() {
Lexer(
"""print(---------------------------------+++5, "\n");
print(((((((((3 + 2) * ((((((2))))))))))))), "\n");
 
if (1) { if (1) { if (1) { if (1) { if (1) { print(15, "\n"); } } } } }""").printTokens()
} // deep
 
// Function to generate test case 8 source: Greatest Common Divisor.
fun gcd() {
Lexer(
"""/* Compute the gcd of 1071, 1029: 21 */
 
a = 1071;
b = 1029;
 
while (b != 0) {
new_a = b;
b = a % b;
a = new_a;
}
print(a);""").printTokens()
} // gcd
 
// Function to generate test case 9 source: Factorial.
fun factorial() {
Lexer(
"""/* 12 factorial is 479001600 */
 
n = 12;
result = 1;
i = 1;
while (i <= n) {
result = result * i;
i = i + 1;
}
print(result);""").printTokens()
} // factorial
 
// Function to generate test case 10 source: Fibonacci Sequence.
fun fibonacci() {
Lexer(
"""/* fibonacci of 44 is 701408733 */
 
n = 44;
i = 1;
a = 0;
b = 1;
while (i < n) {
w = a + b;
a = b;
b = w;
i = i + 1;
}
print(w, "\n");""").printTokens()
} // fibonacci
 
// Function to generate test case 11 source: FizzBuzz.
fun fizzbuzz() {
Lexer(
"""/* FizzBuzz */
i = 1;
while (i <= 100) {
if (!(i % 15))
print("FizzBuzz");
else if (!(i % 3))
print("Fizz");
else if (!(i % 5))
print("Buzz");
else
print(i);
 
print("\n");
i = i + 1;
}""").printTokens()
} // fizzbuzz
 
// Function to generate test case 12 source: 99 Bottles of Beer.
fun bottles() {
Lexer(
"""/* 99 bottles */
bottles = 99;
while (bottles > 0) {
print(bottles, " bottles of beer on the wall\n");
print(bottles, " bottles of beer\n");
print("Take one down, pass it around\n");
bottles = bottles - 1;
print(bottles, " bottles of beer on the wall\n\n");
}""").printTokens()
} // bottles
 
// Function to generate test case 13 source: Primes.
fun primes() {
Lexer(
"""/*
Simple prime number generator
*/
count = 1;
n = 1;
limit = 100;
while (n < limit) {
k=3;
p=1;
n=n+2;
while ((k*k<=n) && (p)) {
p=n/k*k!=n;
k=k+2;
}
if (p) {
print(n, " is prime\n");
count = count + 1;
}
}
print("Total primes found: ", count, "\n");""").printTokens()
} // primes
 
// Function to generate test case 14 source: Ascii Mandelbrot.
fun ascii() {
Lexer(
"""{
/*
This is an integer ascii Mandelbrot generator
*/
left_edge = -420;
right_edge = 300;
top_edge = 300;
bottom_edge = -300;
x_step = 7;
y_step = 15;
 
max_iter = 200;
 
y0 = top_edge;
while (y0 > bottom_edge) {
x0 = left_edge;
while (x0 < right_edge) {
y = 0;
x = 0;
the_char = ' ';
i = 0;
while (i < max_iter) {
x_x = (x * x) / 200;
y_y = (y * y) / 200;
if (x_x + y_y > 800 ) {
the_char = '0' + i;
if (i > 9) {
the_char = '@';
}
i = max_iter;
}
y = x * y / 100 + y0;
x = x_x - y_y + x0;
i = i + 1;
}
putc(the_char);
x0 = x0 + x_step;
}
putc('\n');
y0 = y0 - y_step;
}
}
""").printTokens()
} // ascii
 
when (number) {
0 ->
hello()
1 ->
phoenix()
2 ->
symbols()
3 ->
four()
4 ->
count()
5 ->
doors()
6 ->
negative()
7 ->
deep()
8 ->
gcd()
9 ->
factorial()
10 ->
fibonacci()
11 ->
fizzbuzz()
12 ->
bottles()
13 ->
primes()
14 ->
ascii()
15 -> // Lexer: Empty character constant
Lexer("''").printTokens()
16 -> // Lexer: Unknown escape sequence
Lexer("'\\x").printTokens()
17 -> // Lexer: Multi-character constant
Lexer("' ").printTokens()
18 -> // Lexer: EOF in comment
Lexer("/*").printTokens()
19 -> // Lexer: EOL in string
Lexer("\"\n").printTokens()
20 -> // Lexer: EOF in string
Lexer("\"").printTokens()
21 -> // Lexer: Escape sequence unknown
Lexer("\"\\x").printTokens()
22 -> // Lexer: Unrecognized character
Lexer("~").printTokens()
23 -> // Lexer: invalid number
Lexer("9a9").printTokens()
24 -> // Lexer: Number exceeds maximum value
Lexer("2147483648\n9223372036854775808").printTokens()
25 -> // Lexer: Operator expected
Lexer("|.").printTokens()
else ->
println("Invalid test number %d!".format(number))
} // when
} // tests
 
// Main function to read the source, either from a file named on the command
// line or from standard input, and run the lexer over it. When reading from
// standard input: empty input runs the All Symbols test, a one- or two-digit
// test number runs the corresponding test, and anything else is tokenized.
fun main(args: Array<String>) {
try {
val s =
if (args.size > 0 && args[0].isNotEmpty()) // file on command line
java.util.Scanner(java.io.File(args[0]))
else // use the console
java.util.Scanner(System.`in`)
var source = ""
while (s.hasNext())
source += s.nextLine()+
if (s.hasNext())
"\n"
else
""
if (args.size > 0 && args[0].isNotEmpty()) // file on command line
Lexer(source).printTokens()
else {
val digits = source.filter { it in '0'..'9' }
when {
source.isEmpty() -> // nothing given
tests(2)
source.length in 1..2 && digits.length == source.length &&
digits.toInt() in 0..25 ->
tests(digits.toInt())
else ->
Lexer(source).printTokens()
} // when
} // if
} catch(e: Throwable) {
println(e.message)
System.exit(1)
} // try
} // main</syntaxhighlight>
{{out|case=test case 3: All Symbols}}
<b>
<pre>
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
22 29 End_of_input
</pre>
</b>
 
=={{header|Lua}}==
===Using LPeg library===
This version uses LPeg, a parsing expression grammar library developed by one of the authors of Lua.
The source is broken into several modules, in part to make it easier to present the "vanilla Lua" version afterwards.
Tested with Lua 5.3.5 and LPeg 1.0.2-1.
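
For readers new to LPeg: patterns are first-class objects built with operators such as <tt>*</tt> (sequence), <tt>+</tt> (ordered choice) and <tt>^</tt> (repetition), and captures transform the matched text. A small illustration, separate from the lexer itself:
<syntaxhighlight lang="lua">local lpeg = require 'lpeg'
local digit   = lpeg.R'09'                 -- any character in the range 0-9
local integer = lpeg.C(digit^1) / tonumber -- capture a digit run, convert it to a number
print(lpeg.match(integer, "142857;"))      --> 142857
print(lpeg.match(integer, "abc"))          --> nil (no match)</syntaxhighlight>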
 
The first module is simply a table defining the names of tokens which don't have an associated value.
<syntaxhighlight lang="lua">-- module token_name (in a file "token_name.lua")
local token_name = {
['*'] = 'Op_multiply',
['/'] = 'Op_divide',
['%'] = 'Op_mod',
['+'] = 'Op_add',
['-'] = 'Op_subtract',
['<'] = 'Op_less',
['<='] = 'Op_lessequal',
['>'] = 'Op_greater',
['>='] = 'Op_greaterequal',
['=='] = 'Op_equal',
['!='] = 'Op_notequal',
['!'] = 'Op_not',
['='] = 'Op_assign',
['&&'] = 'Op_and',
['||'] = 'Op_or',
['('] = 'LeftParen',
[')'] = 'RightParen',
['{'] = 'LeftBrace',
['}'] = 'RightBrace',
[';'] = 'Semicolon',
[','] = 'Comma',
['if'] = 'Keyword_if',
['else'] = 'Keyword_else',
['while'] = 'Keyword_while',
['print'] = 'Keyword_print',
['putc'] = 'Keyword_putc',
}
return token_name</syntaxhighlight>
 
This module exports a function <i>find_token</i>, which attempts to find the next valid token from a specified position in a source line.
<syntaxhighlight lang="lua">-- module lpeg_token_finder
local M = {} -- only items added to M will be public (via 'return M' at end)
local table, concat = table, table.concat
local error, tonumber = error, tonumber
 
local lpeg = require 'lpeg' -- see http://www.inf.puc-rio.br/~roberto/lpeg/
local token_name = require 'token_name'
_ENV = {}
 
local imports = 'P R S C Carg Cb Cc Cf Cg Cp Cs Ct Cmt V'
for w in imports:gmatch('%a+') do _ENV[w] = lpeg[w] end
 
------------------- Define patterns to match tokens -----------------------
 
alpha = R'az' + R'AZ' + P'_'
digit = R'09'
alnum = alpha + digit
space = S' \t\r\n'
 
function ptok(text) return {name=token_name[text]} end
op2c = C(P'<=' + P'>=' + P'==' + P'!=' + P'&&' + P'||') / ptok
op1c = C(S'*/%+-<>!=') / ptok
symbol = C(S'(){};,') / ptok
 
keyword_or_identifier = C(alpha * alnum^0) / function(text)
local name = token_name[text]
return name and {name=name} or {name='Identifier', value=text}
end
 
integer = C(digit^1) * -alpha / function(text)
return {name='Integer', value=tonumber(text)}
end
 
Cline = Carg(1) -- call to 'match' sets the first extra argument to source line number
 
bad_escseq_err = Cmt(Cline, function (_,pos,line)
error{err='bad_escseq', line=line, column=pos-1}
end)
 
esc_subst = {['\\'] = '\\', ['n'] = '\n'}
escseq = P'\\' * C(S'\\n' + bad_escseq_err) / esc_subst
 
qchar = P"'" * ( C( P(1) - S"'\n\\" ) + escseq ) * P"'" / function (text)
return {name='Integer', value=text:byte()}
end
 
qstr = P'"' * ( C((P(1) - S'"\n\\')^1) + escseq )^0 * P'"' / function(...)
return {name='String', value=concat{...}}
end
 
Ctoken = symbol + op2c + op1c + keyword_or_identifier + integer + qstr + qchar
 
unfinished_comment_err = Cmt(Cline * Cb('SOC'), function (_, pos, line, socpos)
error{err='unfinished_comment', line=line, column=socpos}
end)
commentstart = Cg(Cp() * P'/*', 'SOC')
commentrest = (P(1) - P'*/')^0 * (P'*/' + unfinished_comment_err)
comment = commentstart * commentrest
morecomment = Cg(Cp(), 'SOC') * commentrest
 
ws = (space^1 + comment)^0
 
bad_token_err = Cmt(Cline, function (_, pos, line)
error{err='invalid_token', line=line, column=pos}
end)
tokenpat = ws * Cline * Cp() * (C(-1) + Ctoken + bad_token_err) * Cp() /
function (line, pos, token, nextpos)
if pos == nextpos then -- at end of line; no token
return nil
else
token.line, token.column = line, pos
return token, nextpos
end
end
 
closecomment_tokenpat = morecomment * tokenpat
 
function M.find_token(line, line_pos, line_number, in_comment)
pattern = in_comment and closecomment_tokenpat or tokenpat
return lpeg.match(pattern, line, line_pos, line_number)
end
return M</syntaxhighlight>
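
Used directly, <i>find_token</i> returns a token table (fields <tt>name</tt>, optional <tt>value</tt>, <tt>line</tt> and <tt>column</tt>) together with the position at which scanning should resume, or nil at the end of the line. A minimal sketch, assuming the module above is saved as <tt>lpeg_token_finder.lua</tt> somewhere on the package path:
<syntaxhighlight lang="lua">local finder = require 'lpeg_token_finder'

local source = 'count = 1;'
local tok1, pos1 = finder.find_token(source, 1, 1, false)
-- tok1 is {name='Identifier', value='count', line=1, column=1}; pos1 is 6
local tok2, pos2 = finder.find_token(source, pos1, 1, false)
-- tok2 is {name='Op_assign', line=1, column=7}; pos2 is 8</syntaxhighlight>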
 
The <i>lexer</i> module uses <i>finder.find_token</i> to produce an iterator over the tokens in a source.
<syntaxhighlight lang="lua">-- module lexer
local M = {} -- only items added to M will be publicly available (via 'return M' at end)
local string, io, coroutine, yield = string, io, coroutine, coroutine.yield
local error, pcall, type = error, pcall, type
 
local finder = require 'lpeg_token_finder'
_ENV = {}
 
-- produces a token iterator given a source line iterator
function M.tokenize_lineiter(lineiter)
local function fatal(err)
local msgtext = {
unfinished_comment = "EOF inside comment started",
invalid_token = "Invalid token",
bad_escseq = "Invalid escape sequence",
}
local fmt = "LEX ERROR: %s at line %d, column %d"
error(string.format(fmt, msgtext[err.err], err.line, err.column))
end
return coroutine.wrap(function()
local line_number = 0
local line_pos
local in_comment -- where unfinished comment started
for line in lineiter do
line_number = line_number + 1
line_pos = 1
local function scanline() -- yield current line's tokens
repeat
local token, pos =
finder.find_token(line, line_pos, line_number, in_comment)
if token then
line_pos = pos
in_comment = nil
yield(token)
end
until token == nil
end
 
if line then
local ok, err = pcall(scanline)
if ok then
in_comment = nil
elseif type(err) == 'table' and err.err=='unfinished_comment' then
if not(in_comment and err.column==1) then
in_comment = err
end
elseif type(err) == 'table' then
fatal(err)
else
error(err) -- some internal error
end
end
end
if in_comment then
fatal(in_comment)
else
yield{name='End_of_input', line=line_number+1, column=1}
end
return nil
end)
end
 
------------------- exports -----------------------------
 
lexer = M.tokenize_lineiter
 
function M.tokenize_file(filename)
return lexer(io.lines(filename))
end
 
function M.tokenize_text(text)
return lexer(text:gmatch('[^\n]+'))
end
 
-- M._INTERNALS = _ENV
return M
</syntaxhighlight>
 
This script uses <i>lexer.tokenize_text</i> to show the token sequence produced from a source text.
 
<syntaxhighlight lang="lua">lexer = require 'lexer'
format, gsub = string.format, string.gsub
 
function printf(fmt, ...) print(format(fmt, ...)) end
 
function stringrep(str)
local subst = {['\n'] = "\\n", ['\\'] = '\\\\'}
return format('"%s"', gsub(str, '[\n\\]', subst))
end
 
function display(text)
for t in lexer.tokenize_text(text) do
local value = (t.name=='String') and stringrep(t.value) or t.value or ''
printf("%4d %3d %-15s %s", t.line, t.column, t.name, value)
end
end
 
----------------------- test cases from Rosetta spec ------------------------
testing = true
if testing then
-- test case 1
display[[
/*
Hello world
*/
print("Hello, World!\n");]]
print()
 
  -- test case 2
display[[
/*
Show Ident and Integers
*/
phoenix_number = 142857;
print(phoenix_number, "\n");]]
print()
-- etc.
end
</syntaxhighlight>
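
The same modules can also be used as a stand-alone filter that reads a source file on standard input and writes the token listing to standard output. A minimal sketch, assuming the three modules above are saved under their module names somewhere on <tt>package.path</tt>:
<syntaxhighlight lang="lua">-- minimal filter-style driver
local lexer = require 'lexer'

local function stringrep(str) -- re-escape string values for display
  return ('"%s"'):format((str:gsub('[\n\\]', {['\n'] = '\\n', ['\\'] = '\\\\'})))
end

for t in lexer.tokenize_lineiter(io.lines()) do
  local value = (t.name == 'String') and stringrep(t.value) or t.value or ''
  print(('%4d %3d %-15s %s'):format(t.line, t.column, t.name, value))
end</syntaxhighlight>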
 
===Using only standard libraries===
This version replaces the <i>lpeg_token_finder</i> module of the LPeg version with this <i>basic_token_finder</i> module, altering the <i>require</i> expression near the top of the <i>lexer</i> module accordingly. Tested with Lua 5.3.5. (Note that <i>select</i> is a standard function as of Lua 5.2.)
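
The only source change needed to switch the <i>lexer</i> module over is its require line, something like:
<syntaxhighlight lang="lua">-- in module lexer, replacing the LPeg-based finder
local finder = require 'basic_token_finder'</syntaxhighlight>
The replacement module itself follows.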
 
<syntaxhighlight lang="lua">-- module basic_token_finder
local M = {} -- only items added to M will be public (via 'return M' at end)
local table, string = table, string
local error, tonumber, select, assert = error, tonumber, select, assert
 
local token_name = require 'token_name'
_ENV = {}
 
function next_token(line, pos, line_num) -- match a token at line,pos
local function m(pat)
from, to, capture = line:find(pat, pos)
if from then
pos = to + 1
return capture
end
end
local function ptok(str)
return {name=token_name[str]}
end
local function op2c()
local text = m'^([<>=!]=)' or m'^(&&)' or m'^(||)'
if text then return ptok(text) end
end
 
local function op1c_or_symbol()
local char = m'^([%*/%%%+%-<>!=%(%){};,])'
if char then return ptok(char) end
end
local function keyword_or_identifier()
local text = m'^([%a_][%w_]*)'
if text then
local name = token_name[text]
return name and {name=name} or {name='Identifier', value=text}
end
end
local function integer()
local text = m'^(%d+)%f[^%w_]'
if text then return {name='Integer', value=tonumber(text)} end
end
local subst = {['\\\\'] = '\\', ['\\n'] = '\n'}
local function qchar()
local text = m"^'([^\\])'" or m"^'(\\[\\n])'"
if text then
local value = #text==1 and text:byte() or subst[text]:byte()
return {name='Integer', value=value}
end
end
local function qstr()
local text = m'^"([^"\n]*\\?)"'
if text then
local value = text:gsub('()(\\.?)', function(at, esc)
local replace = subst[esc]
if replace then
return replace
else
error{err='bad_escseq', line=line_num, column=pos+at-1}
end
end)
return {name='String', value=value}
end
end
local found = (op2c() or op1c_or_symbol() or
keyword_or_identifier() or integer() or qchar() or qstr())
if found then
return found, pos
end
end
 
function find_commentrest(line, pos, line_num, socpos)
local sfrom, sto = line:find('%*%/', pos)
if sfrom then
return socpos, sto
else
error{err='unfinished_comment', line=line_num, column=socpos}
end
end
 
function find_comment(line, pos, line_num)
local sfrom, sto = line:find('^%/%*', pos)
if sfrom then
local efrom, eto = find_commentrest(line, sto+1, line_num, sfrom)
return sfrom, eto
end
end
 
function find_morecomment(line, pos, line_num)
assert(pos==1)
return find_commentrest(line, pos, line_num, pos)
end
 
function find_whitespace(line, pos, line_num)
local spos = pos
repeat
local eto = select(2, line:find('^%s+', pos))
if not eto then
eto = select(2, find_comment(line, pos, line_num))
end
if eto then pos = eto + 1 end
until not eto
return spos, pos - 1
end
 
function M.find_token(line, pos, line_num, in_comment)
local spos = pos
if in_comment then
pos = 1 + select(2, find_morecomment(line, pos, line_num))
end
pos = 1 + select(2, find_whitespace(line, pos, line_num))
if pos > #line then
return nil
else
local token, nextpos = next_token(line, pos, line_num)
if token then
token.line, token.column = line_num, pos
return token, nextpos
else
error{err='invalid_token', line=line_num, column=pos}
end
end
end
 
-- M._ENV = _ENV
return M</syntaxhighlight>
 
=={{header|M2000 Interpreter}}==
<syntaxhighlight lang="m2000 interpreter">
Module lexical_analyzer {
a$={/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
*/
/* Print */ print /* Sub */ -
/* Putc */ putc /* Lss */ <
/* If */ if /* Gtr */ >
/* Else */ else /* Leq */ <=
/* While */ while /* Geq */ >=
/* Lbrace */ { /* Eq */ ==
/* Rbrace */ } /* Neq */ !=
/* Lparen */ ( /* And */ &&
/* Rparen */ ) /* Or */ ||
/* Uminus */ - /* Semi */ ;
/* Not */ ! /* Comma */ ,
/* Mul */ * /* Assign */ =
/* Div */ / /* Integer */ 42
/* Mod */ % /* String */ "String literal"
/* Add */ + /* Ident */ variable_name
/* character literal */ '\n'
/* character literal */ '\\'
/* character literal */ ' '
}
lim=Len(a$)
LineNo=1
ColumnNo=1
Document Output$
Buffer Scanner as Integer*lim
Return Scanner, 0:=a$
offset=0
buffer1$=""
flag_rem=true
Ahead=lambda Scanner (a$, offset)->{
=false
Try {
\\ second parameter is the offset in buffer units
\\ third parameter is length in bytes
=Eval$(Scanner, offset,2*len(a$))=a$
}
}
Ahead2=lambda Scanner (a$, offset)->{
=false
Try {
=Eval$(Scanner, offset,2) ~ a$
}
}
const nl$=chr$(13)+chr$(10), quo$="""", er$="@", Ansi=3
Try {
Do
If Ahead("/*", offset) Then {
offset+=2 : ColumnNo+=2
While not Ahead("*/", offset)
If Ahead(nl$, offset) Then
lineNo++: ColumnNo=1 : offset+=2
Else
offset++ : ColumnNo++
End If
if offset>lim then
Error "End-of-file in comment. Closing comment characters not found"+er$
End if
End While
offset+=2 : ColumnNo+=2
} Else.if Ahead(nl$, offset) Then{
LineNo++: ColumnNo=1
offset+=2
} Else.if Ahead(quo$, offset) Then {
Output$=format$("{0::-10}{1::-10} ", LineNo, ColumnNo)
offset++ : ColumnNo++
strin=offset
While not Ahead(quo$, offset)
If Ahead("/", offset) Then
offset+=2 : ColumnNo+=2
else
offset++ : ColumnNo++
End if
checkerror()
End While
Output$="String "+quote$(Eval$(Scanner, strin, (offset-strin)*2))+nl$
offset++ : ColumnNo++
} Else.if Ahead("'", offset) Then {
Output$=format$("{0::-10}{1::-10} ", LineNo, ColumnNo)
offset++ : ColumnNo++
strin=offset
While not Ahead("'", offset)
If Ahead("/", offset) Then
offset+=2 : ColumnNo+=2
else
offset++ : ColumnNo++
End if
checkerror()
End While
lit$=format$(Eval$(Scanner, strin, (offset-strin)*2))
select case len(lit$)
case 1
Output$="Integer "+str$(asc(lit$),0)+nl$
case >1
{Error "Multi-character constant."+er$}
case 0
{Error "Empty character constant."+er$}
end select
offset++ : ColumnNo++
} Else.if Ahead2("[a-z]", offset) Then {
strin=offset
Output$=format$("{0::-10}{1::-10} ", LineNo, ColumnNo)
offset++ : ColumnNo++
While Ahead2("[a-zA-Z0-9_]", offset)
offset++ : ColumnNo++
End While
Keywords(Eval$(Scanner, strin, (offset-strin)*2))
} Else.if Ahead2("[0-9]", offset) Then {
strin=offset
Output$=format$("{0::-10}{1::-10} Integer ", LineNo, ColumnNo)
offset++ : ColumnNo++
While Ahead2("[0-9]", offset)
offset++ : ColumnNo++
End While
if Ahead2("[a-zA-Z_]", offset) then
{Error " Invalid number. Starts like a number, but ends in non-numeric characters."+er$}
else
Output$=Eval$(Scanner, strin, (offset-strin)*2)+nl$
end if
} Else {
Symbols(Eval$(Scanner, Offset, 2))
offset++ : ColumnNo++
}
Until offset>=lim
}
er1$=leftpart$(error$,er$)
if er1$<>"" then
Print
Report "Error:"+er1$
Output$="(Error)"+nl$+"Error:"+er1$
else
Output$=format$("{0::-10}{1::-10}", LineNo, ColumnNo)+" End_of_Input"+nl$
end if
Clipboard Output$
Save.Doc Output$, "lex.t", Ansi
document lex$
Load.Doc lex$,"lex.t", Ansi
Report lex$
Sub Keywords(a$)
select case a$
case "if"
a$="Keyword_if"
case "else"
a$="Keyword_else"
case "while"
a$="Keyword_while"
case "print"
a$="Keyword_print"
case "putc"
a$="Keyword_putc"
else case
a$="Identifier "+a$
end select
Output$=a$+nl$
End sub
Sub Symbols(a$)
select case a$
case " ", chr$(9)
a$=""
case "("
a$="LeftParen"
case ")"
a$="RightParen"
case "{"
a$="LeftBrace"
case "}"
a$="RightBrace"
case ";"
a$="Semicolon"
case ","
a$="Comma"
case "*"
a$="Op_multiply"
case "/"
a$="Op_divide"
case "+"
a$="Op_add"
case "-"
a$="Op_subtract"
case "%"
a$="Op_mod"
case "<"
{ if Ahead("=", offset+1) Then
offset++
a$="Op_lessequal"
ColumnNo++
else
a$="Op_less"
end if
}
case ">"
{ if Ahead("=", offset+1) Then
offset++
ColumnNo++
a$="Op_greaterequal"
else
a$="Op_greater"
end if
}
case "="
{ if Ahead("=", offset+1) Then
offset++
ColumnNo++
a$="Op_equal"
else
a$="Op_assign"
end if
}
case "!"
{ if Ahead("=", offset+1) Then
offset++
ColumnNo++
a$="Op_notequal"
else
a$="Op_not"
end if
}
case "&"
{ if Ahead("&", offset+1) Then
offset++
ColumnNo++
a$="Op_and"
else
a$=""
end if
}
case "|"
{ if Ahead("|", offset+1) Then
offset++
ColumnNo++
a$="Op_or"
else
a$=""
end if
}
else case
{Error "Unrecognized character."+er$}
end select
if a$<>"" then
Output$=format$("{0::-10}{1::-10} ", LineNo, ColumnNo)+a$+nl$
end if
End Sub
Sub checkerror()
          if offset>lim then {
                Error "End-of-file while scanning string literal. Closing string character not found."+er$
          } else.if Ahead(nl$,offset) then {
                Error "End-of-line while scanning string literal. Closing string character not found before end-of-line."+er$
          }
End Sub
}
lexical_analyzer
</syntaxhighlight>
 
{{out}}
<pre style="height:30ex;overflow:scroll">
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 41 Op_lessequal
9 16 Keyword_while
9 41 Op_greaterequal
10 16 LeftBrace
10 41 Op_equal
11 16 RightBrace
11 41 Op_notequal
12 16 LeftParen
12 41 Op_and
13 16 RightParen
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_Input
 
</pre>
 
=={{header|Mercury}}==
{{trans|ATS}}
{{works with|Mercury|20.06.1}}
 
 
<syntaxhighlight lang="mercury">% -*- mercury -*-
%
% Compile with maybe something like:
% mmc -O4 --intermod-opt -E --make --warn-non-tail-recursion self-and-mutual lex
%
 
:- module lex.
 
:- interface.
:- import_module io.
:- pred main(io::di, io::uo) is det.
 
:- implementation.
:- import_module char.
:- import_module exception.
:- import_module int.
:- import_module list.
:- import_module stack.
:- import_module string.
 
:- type token_t
---> token_ELSE
; token_IF
; token_PRINT
; token_PUTC
; token_WHILE
; token_MULTIPLY
; token_DIVIDE
; token_MOD
; token_ADD
; token_SUBTRACT
; token_NEGATE
; token_LESS
; token_LESSEQUAL
; token_GREATER
; token_GREATEREQUAL
; token_EQUAL
; token_NOTEQUAL
; token_NOT
; token_ASSIGN
; token_AND
; token_OR
; token_LEFTPAREN
; token_RIGHTPAREN
; token_LEFTBRACE
; token_RIGHTBRACE
; token_SEMICOLON
; token_COMMA
; token_IDENTIFIER
; token_INTEGER
; token_STRING
; token_END_OF_INPUT.
 
:- type ch_t % The type of a fetched character.
---> {int, % A character or `eof', stored as an int.
int, % The line number.
int}. % The column number.
 
:- type inp_t % The `inputter' type. Fetches one character.
---> inp_t(inpf :: text_input_stream,
line_no :: int,
column_no :: int,
pushback :: stack(ch_t)).
 
:- type toktup_t % The type of a scanned token with its argument.
---> {token_t, % The token kind.
string, % An argument. (May or may not be meaningful.)
int, % The starting line number.
int}. % The starting column number.
 
main(!IO) :-
command_line_arguments(Args, !IO),
(
if (Args = [])
then (InpF_filename = "-",
OutF_filename = "-",
main_program(InpF_filename, OutF_filename, !IO))
else if (Args = [F1])
then (InpF_filename = F1,
OutF_filename = "-",
main_program(InpF_filename, OutF_filename, !IO))
else if (Args = [F1, F2])
then (InpF_filename = F1,
OutF_filename = F2,
main_program(InpF_filename, OutF_filename, !IO))
else usage_error(!IO)
).
 
:- pred main_program(string::in, string::in, io::di, io::uo) is det.
main_program(InpF_filename, OutF_filename, !IO) :-
open_InpF(InpF, InpF_filename, !IO),
open_OutF(OutF, OutF_filename, !IO),
init(InpF, Inp0),
scan_text(OutF, Inp0, _, !IO).
 
:- pred open_InpF(text_input_stream::out, string::in,
io::di, io::uo) is det.
open_InpF(InpF, InpF_filename, !IO) :-
if (InpF_filename = "-")
then (InpF = io.stdin_stream)
else
(
open_input(InpF_filename, InpF_result, !IO),
(
if (InpF_result = ok(F))
then (InpF = F)
else throw("Error: cannot open " ++ InpF_filename ++
" for input")
)
).
 
:- pred open_OutF(text_output_stream::out, string::in,
io::di, io::uo) is det.
open_OutF(OutF, OutF_filename, !IO) :-
if (OutF_filename = "-")
then (OutF = io.stdout_stream)
else
(
open_output(OutF_filename, OutF_result, !IO),
(
if (OutF_result = ok(F))
then (OutF = F)
else throw("Error: cannot open " ++ OutF_filename ++
" for output")
)
).
 
:- pred usage_error(io::di, io::uo) is det.
usage_error(!IO) :-
progname("lex", ProgName, !IO),
(io.format("Usage: %s [INPUT_FILE [OUTPUT_FILE]]\n",
[s(ProgName)], !IO)),
(io.write_string("If INPUT_FILE is \"-\" or not present then standard input is used.\n",
!IO)),
(io.write_string("If OUTPUT_FILE is \"-\" or not present then standard output is used.\n",
!IO)),
set_exit_status(1, !IO).
 
:- pred scan_text(text_output_stream::in, inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_text(OutF, !Inp, !IO) :-
get_next_token(TokTup, !Inp, !IO),
print_token(TokTup, OutF, !IO),
{Tok, _, _, _} = TokTup,
(
if (Tok = token_END_OF_INPUT)
then true
else scan_text(OutF, !Inp, !IO)
).
 
:- pred print_token(toktup_t::in, text_output_stream::in,
io::di, io::uo) is det.
print_token(TokTup, OutF, !IO) :-
{Tok, Arg, Line_no, Column_no} = TokTup,
token_name(Tok) = TokName,
(io.format(OutF, "%5d %5d %s",
[i(Line_no), i(Column_no), s(TokName)],
!IO)),
(
if (Tok = token_IDENTIFIER)
then (io.format(OutF, " %s", [s(Arg)], !IO))
else if (Tok = token_INTEGER)
then (io.format(OutF, " %s", [s(Arg)], !IO))
else if (Tok = token_STRING)
then (io.format(OutF, " %s", [s(Arg)], !IO))
else true
),
(io.format(OutF, "\n", [], !IO)).
 
:- func token_name(token_t) = string is det.
:- pred token_name(token_t::in, string::out) is det.
token_name(Tok) = Str :- token_name(Tok, Str).
token_name(token_ELSE, "Keyword_else").
token_name(token_IF, "Keyword_if").
token_name(token_PRINT, "Keyword_print").
token_name(token_PUTC, "Keyword_putc").
token_name(token_WHILE, "Keyword_while").
token_name(token_MULTIPLY, "Op_multiply").
token_name(token_DIVIDE, "Op_divide").
token_name(token_MOD, "Op_mod").
token_name(token_ADD, "Op_add").
token_name(token_SUBTRACT, "Op_subtract").
token_name(token_NEGATE, "Op_negate").
token_name(token_LESS, "Op_less").
token_name(token_LESSEQUAL, "Op_lessequal").
token_name(token_GREATER, "Op_greater").
token_name(token_GREATEREQUAL, "Op_greaterequal").
token_name(token_EQUAL, "Op_equal").
token_name(token_NOTEQUAL, "Op_notequal").
token_name(token_NOT, "Op_not").
token_name(token_ASSIGN, "Op_assign").
token_name(token_AND, "Op_and").
token_name(token_OR, "Op_or").
token_name(token_LEFTPAREN, "LeftParen").
token_name(token_RIGHTPAREN, "RightParen").
token_name(token_LEFTBRACE, "LeftBrace").
token_name(token_RIGHTBRACE, "RightBrace").
token_name(token_SEMICOLON, "Semicolon").
token_name(token_COMMA, "Comma").
token_name(token_IDENTIFIER, "Identifier").
token_name(token_INTEGER, "Integer").
token_name(token_STRING, "String").
token_name(token_END_OF_INPUT, "End_of_input").
 
:- pred get_next_token(toktup_t::out, inp_t::in, inp_t::out,
io::di, io::uo) is det.
get_next_token(TokTup, !Inp, !IO) :-
skip_spaces_and_comments(!Inp, !IO),
get_ch(Ch, !Inp, !IO),
{IChar, Line_no, Column_no} = Ch,
LN = Line_no,
CN = Column_no,
(
if (IChar = eof)
then
(
TokTup = {token_END_OF_INPUT, "", LN, CN}
)
else
(
Char = det_from_int(IChar),
(
if (Char = (','))
then (TokTup = {token_COMMA, ",", LN, CN})
else if (Char = (';'))
then (TokTup = {token_SEMICOLON, ";", LN, CN})
else if (Char = ('('))
then (TokTup = {token_LEFTPAREN, "(", LN, CN})
else if (Char = (')'))
then (TokTup = {token_RIGHTPAREN, ")", LN, CN})
else if (Char = ('{'))
then (TokTup = {token_LEFTBRACE, "{", LN, CN})
else if (Char = ('}'))
then (TokTup = {token_RIGHTBRACE, "}", LN, CN})
else if (Char = ('*'))
then (TokTup = {token_MULTIPLY, "*", LN, CN})
else if (Char = ('/'))
then (TokTup = {token_DIVIDE, "/", LN, CN})
else if (Char = ('%'))
then (TokTup = {token_MOD, "%", LN, CN})
else if (Char = ('+'))
then (TokTup = {token_ADD, "+", LN, CN})
else if (Char = ('-'))
then (TokTup = {token_SUBTRACT, "-", LN, CN})
else if (Char = ('<'))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = to_int('='))
then
(
TokTup = {token_LESSEQUAL, "<=", LN, CN}
)
else
(
push_back(Ch1, !Inp),
TokTup = {token_LESS, "<", LN, CN}
)
)
)
else if (Char = ('>'))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = to_int('='))
then
(
TokTup = {token_GREATEREQUAL, ">=", LN, CN}
)
else
(
push_back(Ch1, !Inp),
TokTup = {token_GREATER, ">", LN, CN}
)
)
)
else if (Char = ('='))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = to_int('='))
then
(
TokTup = {token_EQUAL, "==", LN, CN}
)
else
(
push_back(Ch1, !Inp),
TokTup = {token_ASSIGN, "=", LN, CN}
)
)
)
else if (Char = ('!'))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = to_int('='))
then
(
TokTup = {token_NOTEQUAL, "!=", LN, CN}
)
else
(
push_back(Ch1, !Inp),
TokTup = {token_NOT, "!", LN, CN}
)
)
)
else if (Char = ('&'))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = to_int('&'))
then
(
TokTup = {token_AND, "&&", LN, CN}
)
else throw("Error: unexpected character '" ++
from_char(Char) ++ "' at " ++
from_int(LN) ++ ":" ++
from_int(CN))
)
)
else if (Char = ('|'))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = to_int('|'))
then
(
TokTup = {token_OR, "||", LN, CN}
)
else throw("Error: unexpected character '" ++
from_char(Char) ++ "' at " ++
from_int(LN) ++ ":" ++
from_int(CN))
)
)
else if (Char = ('"'))
then
(
push_back(Ch, !Inp),
scan_string_literal(TokTup, !Inp, !IO)
)
else if (Char = ('\''))
then
(
push_back(Ch, !Inp),
scan_character_literal(TokTup, !Inp, !IO)
)
else if (is_alpha(Char))
then
(
push_back(Ch, !Inp),
scan_identifier_or_reserved_word(
TokTup, !Inp, !IO)
)
else if (is_digit(Char))
then
(
push_back(Ch, !Inp),
scan_integer_literal(TokTup, !Inp, !IO)
)
else
(
throw("Error: unexpected character '" ++
from_char(Char) ++ "' at " ++
from_int(LN) ++ ":" ++
from_int(CN))
)
)
)
).
 
 
:- pred skip_spaces_and_comments(inp_t::in, inp_t::out,
io::di, io::uo) is det.
skip_spaces_and_comments(!Inp, !IO) :-
get_ch(Ch, !Inp, !IO),
Ch = {IChar, _, _},
(
if (IChar = eof)
then push_back(Ch, !Inp)
else
if (is_whitespace(det_from_int(IChar)))
then skip_spaces_and_comments(!Inp, !IO)
else if (IChar = to_int('/'))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, Line_no, Column_no},
(
if (IChar1 = to_int('*'))
then
(
scan_comment(Line_no, Column_no,
!Inp, !IO),
skip_spaces_and_comments(!Inp, !IO)
)
else
(
push_back(Ch1, !Inp),
push_back(Ch, !Inp)
)
)
)
else push_back(Ch, !Inp)
).
 
:- pred scan_comment(int::in, int::in, % line and column nos.
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_comment(Line_no, Column_no, !Inp, !IO) :-
get_ch(Ch, !Inp, !IO),
{IChar, _, _} = Ch,
(
if (IChar = eof)
then throw("Error: unterminated comment " ++
"starting at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else
(
det_from_int(IChar) = Char,
(
if (Char = ('*'))
then
(
get_ch(Ch1, !Inp, !IO),
{IChar1, _, _} = Ch1,
(
if (IChar1 = to_int('/'))
then true % End of comment has been reached.
else
(
push_back(Ch1, !Inp),
scan_comment(Line_no, Column_no, !Inp,
!IO)
)
)
)
else scan_comment(Line_no, Column_no, !Inp, !IO)
)
)
).
 
:- pred scan_character_literal(toktup_t::out,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_character_literal(TokTup, !Inp, !IO) :-
get_ch(Ch, !Inp, !IO),
Ch = {OpenQuote, Line_no, Column_no},
CloseQuote = OpenQuote,
scan_char_lit_contents(CodePoint, Line_no, Column_no,
!Inp, !IO),
check_char_lit_end(CloseQuote, Line_no, Column_no, !Inp, !IO),
Arg = from_int(CodePoint),
TokTup = {token_INTEGER, Arg, Line_no, Column_no}.
 
:- pred scan_char_lit_contents(int::out, int::in, int::in,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_char_lit_contents(CodePoint, Line_no, Column_no,
!Inp, !IO) :-
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, Line_no1, Column_no1},
(
if (IChar1 = eof)
then throw("Error: end of input in character literal " ++
"starting at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else if (IChar1 = to_int('\\'))
then
(
get_ch(Ch2, !Inp, !IO),
Ch2 = {IChar2, _, _},
(if (IChar2 = eof)
then throw("Error: end of input in character literal " ++
"starting at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else if (IChar2 = to_int('n'))
then (CodePoint = to_int('\n'))
else if (IChar2 = to_int('\\'))
then (CodePoint = to_int('\\'))
else throw("Error: unsupported escape \\" ++
from_char(det_from_int(IChar2)) ++
" at " ++ from_int(Line_no1) ++
":" ++ from_int(Column_no1))
)
)
else (CodePoint = IChar1)
).
 
:- pred check_char_lit_end(int::in, int::in, int::in,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
check_char_lit_end(CloseQuote, Line_no, Column_no, !Inp, !IO) :-
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = CloseQuote)
then true
else find_bad_char_lit_end(CloseQuote, Line_no, Column_no,
!Inp, !IO)
).
 
:- pred find_bad_char_lit_end(int::in, int::in, int::in,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
find_bad_char_lit_end(CloseQuote, Line_no, Column_no, !Inp, !IO) :-
get_ch(Ch2, !Inp, !IO),
Ch2 = {IChar2, _, _},
(
if (IChar2 = CloseQuote)
then throw("Error: unsupported multicharacter literal " ++
" at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else if (IChar2 = eof)
then throw("Error: end of input in character literal " ++
" at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else find_bad_char_lit_end(CloseQuote, Line_no, Column_no,
!Inp, !IO)
).
 
:- pred scan_string_literal(toktup_t::out,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_string_literal(TokTup, !Inp, !IO) :-
get_ch(Ch, !Inp, !IO),
Ch = {OpenQuote, Line_no, Column_no},
CloseQuote = OpenQuote,
scan_string_lit_contents("", Str, CloseQuote,
Line_no, Column_no,
!Inp, !IO),
Arg = from_char(det_from_int(OpenQuote)) ++
Str ++ from_char(det_from_int(CloseQuote)),
TokTup = {token_STRING, Arg, Line_no, Column_no}.
 
:- pred scan_string_lit_contents(string::in, string::out, int::in,
int::in, int::in,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_string_lit_contents(Str0, Str, CloseQuote, Line_no, Column_no,
!Inp, !IO) :-
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, Line_no1, Column_no1},
(
if (IChar1 = CloseQuote)
then (Str = Str0)
else if (IChar1 = eof)
then throw("Error: end of input in string literal " ++
"starting at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else if (IChar1 = to_int('\n'))
then throw("Error: end of line in string literal " ++
"starting at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else if (IChar1 = to_int('\\'))
then
(
get_ch(Ch2, !Inp, !IO),
Ch2 = {IChar2, _, _},
(
if (IChar2 = to_int('n'))
then
(
Str1 = Str0 ++ "\\n",
scan_string_lit_contents(Str1, Str, CloseQuote,
Line_no, Column_no,
!Inp, !IO)
)
else if (IChar2 = to_int('\\'))
then
(
Str1 = Str0 ++ "\\\\",
scan_string_lit_contents(Str1, Str, CloseQuote,
Line_no, Column_no,
!Inp, !IO)
)
else if (IChar2 = eof)
then throw("Error: end of input in string literal " ++
"starting at " ++ from_int(Line_no) ++
":" ++ from_int(Column_no))
else if (IChar2 = to_int('\n'))
then throw("Error: end of line in string literal " ++
"starting at " ++ from_int(Line_no) ++
":" ++ from_int(Column_no))
else throw("Error: unsupported escape \\" ++
from_char(det_from_int(IChar2)) ++
" at " ++ from_int(Line_no1) ++
":" ++ from_int(Column_no1))
)
)
else
(
Char1 = det_from_int(IChar1),
Str1 = Str0 ++ from_char(Char1),
scan_string_lit_contents(Str1, Str, CloseQuote,
Line_no, Column_no, !Inp, !IO)
)
).
 
:- pred scan_identifier_or_reserved_word(toktup_t::out,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_identifier_or_reserved_word(TokTup, !Inp, !IO) :-
scan_integer_or_word(Str, Line_no, Column_no, !Inp, !IO),
(
if (Str = "if")
then (TokTup = {token_IF, Str, Line_no, Column_no})
else if (Str = "else")
then (TokTup = {token_ELSE, Str, Line_no, Column_no})
else if (Str = "while")
then (TokTup = {token_WHILE, Str, Line_no, Column_no})
else if (Str = "print")
then (TokTup = {token_PRINT, Str, Line_no, Column_no})
else if (Str = "putc")
then (TokTup = {token_PUTC, Str, Line_no, Column_no})
else (TokTup = {token_IDENTIFIER, Str, Line_no, Column_no})
).
 
:- pred scan_integer_literal(toktup_t::out, inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_integer_literal(TokTup, !Inp, !IO) :-
scan_integer_or_word(Str, Line_no, Column_no, !Inp, !IO),
(
if (not is_all_digits(Str))
then throw("Error: not a valid integer literal: " ++ Str)
else (TokTup = {token_INTEGER, Str, Line_no, Column_no})
).
 
:- pred scan_integer_or_word(string::out, int::out, int::out,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_integer_or_word(Str, Line_no, Column_no, !Inp, !IO) :-
get_ch({IChar, Line_no, Column_no}, !Inp, !IO),
(
if (IChar = eof)
then throw("internal error")
else
(
Char = det_from_int(IChar),
(if (not is_alnum_or_underscore(Char))
then throw("internal error")
else scan_int_or_word(from_char(Char), Str, !Inp, !IO))
)
).
 
:- pred scan_int_or_word(string::in, string::out,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_int_or_word(Str0, Str, !Inp, !IO) :-
get_ch(CharTup, !Inp, !IO),
{IChar, _, _} = CharTup,
(
if (IChar = eof)
then
(
push_back(CharTup, !Inp),
Str = Str0
)
else
(
Char = det_from_int(IChar),
(
if (not is_alnum_or_underscore(Char))
then
(
push_back(CharTup, !Inp),
Str = Str0
)
else scan_int_or_word(Str0 ++ from_char(Char), Str,
!Inp, !IO)
)
)
).
 
:- pred init(text_input_stream::in, inp_t::out) is det.
init(Inpf, Inp) :-
Inp = inp_t(Inpf, 1, 1, init).
 
 
:- pred get_ch(ch_t::out, inp_t::in, inp_t::out,
io::di, io::uo) is det.
get_ch(Ch, Inp0, Inp, !IO) :-
if (pop(Ch1, Inp0^pushback, Pushback))
then
(
Ch = Ch1,
Inp = (Inp0^pushback := Pushback)
)
else
(
inp_t(Inpf, Line_no, Column_no, Pushback) = Inp0,
read_char_unboxed(Inpf, Result, Char, !IO),
(
if (Result = ok)
then
(
Ch = {to_int(Char), Line_no, Column_no},
Inp =
(if (Char = ('\n'))
then inp_t(Inpf, Line_no + 1, 1, Pushback)
else inp_t(Inpf, Line_no, Column_no + 1, Pushback))
)
else
(
Ch = {eof, Line_no, Column_no},
Inp = Inp0
)
)
).
 
:- pred push_back(ch_t::in, inp_t::in, inp_t::out) is det.
push_back(Ch, Inp0, Inp) :-
Inp = (Inp0^pushback := push(Inp0^pushback, Ch)).
 
:- func eof = int is det.
eof = -1.</syntaxhighlight>
 
{{out}}
<pre>$ mmc -O6 --intermod-opt -E --make --warn-non-tail-recursion self-and-mutual lex && ./lex compiler-tests/testcase3.t
Making Mercury/int3s/lex.int3
Making Mercury/ints/lex.int
Making Mercury/opts/lex.opt
Making Mercury/cs/lex.c
Making Mercury/os/lex.o
Making lex
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
 
=={{header|Nim}}==
Tested with Nim v0.19.4. The first two examples are tested against all programs in [[Compiler/Sample programs]].
===Using string with regular expressions===
<syntaxhighlight lang="nim">
import re, strformat, strutils
 
type
TokenKind = enum
tkUnknown = "UNKNOWN_TOKEN",
tkMul = "Op_multiply",
tkDiv = "Op_divide",
tkMod = "Op_mod",
tkAdd = "Op_add",
tkSub = "Op_subtract",
tkNeg = "Op_negate",
tkLt = "Op_less",
tkLte = "Op_lessequal",
tkGt = "Op_greater",
tkGte = "Op_greaterequal",
tkEq = "Op_equal",
tkNeq = "Op_notequal",
tkNot = "Op_not",
tkAsgn = "Op_assign",
tkAnd = "Op_and",
tkOr = "Op_or",
tkLpar = "LeftParen",
tkRpar = "RightParen",
tkLbra = "LeftBrace",
tkRbra = "RightBrace",
tkSmc = "Semicolon",
tkCom = "Comma",
tkIf = "Keyword_if",
tkElse = "Keyword_else",
tkWhile = "Keyword_while",
tkPrint = "Keyword_print",
tkPutc = "Keyword_putc",
tkId = "Identifier",
tkInt = "Integer",
tkChar = "Integer",
tkStr = "String",
tkEof = "End_of_input"
 
Token = object
kind: TokenKind
value: string
 
TokenAnn = object
    ## Token annotated with its source position, for use by later compiler stages
token: Token
line, column: int
 
proc getSymbols(table: openArray[(char, TokenKind)]): seq[char] =
result = newSeq[char]()
for ch, tokenKind in items(table):
result.add ch
 
const
tkSymbols = { # single-char tokens
'*': tkMul,
'%': tkMod,
'+': tkAdd,
'-': tkSub,
'(': tkLpar,
')': tkRpar,
'{': tkLbra,
'}': tkRbra,
';': tkSmc,
',': tkCom,
'/': tkDiv, # the comment case /* ... */ is handled in `stripUnimportant`
}
symbols = getSymbols(tkSymbols)
 
proc findTokenKind(table: openArray[(char, TokenKind)]; needle: char):
TokenKind =
for ch, tokenKind in items(table):
if ch == needle: return tokenKind
tkUnknown
 
proc stripComment(text: var string, lineNo, colNo: var int) =
var matches: array[1, string]
 
if match(text, re"\A(/\*[\s\S]*?\*/)", matches):
text = text[matches[0].len..^1]
for s in matches[0]:
if s == '\n':
inc lineNo
colNo = 1
else:
inc colNo
 
proc stripUnimportant(text: var string; lineNo, colNo: var int) =
while true:
if text.len == 0: return
elif text[0] == '\n':
inc lineNo
colNo = 1
text = text[1..^1]
elif text[0] == ' ':
inc colNo
text = text[1..^1]
elif text.len >= 2 and text[0] == '/' and text[1] == '*':
stripComment(text, lineNo, colNo)
else: return
 
proc lookAhead(ch1, ch2: char, tk1, tk2: TokenKind): (TokenKind, int) =
if ch1 == ch2: (tk1, 2)
else: (tk2, 1)
 
proc consumeToken(text: var string; tkl: var int): Token =
  ## Return the next token, removing it from `text`, and write its length
  ## to `tkl`. If the token cannot be recognized, return a `tkUnknown`
  ## token, shrink `text` by one character and write 1 to `tkl`.
 
var
matches: array[1, string]
tKind: TokenKind
val: string
 
if text.len == 0:
(tKind, tkl) = (tkEof, 0)
 
# Simple characters
elif text[0] in symbols: (tKind, tkl) = (tkSymbols.findTokenKind(text[0]), 1)
elif text[0] == '<': (tKind, tkl) = lookAhead(text[1], '=', tkLte, tkLt)
elif text[0] == '>': (tKind, tkl) = lookAhead(text[1], '=', tkGte, tkGt)
elif text[0] == '=': (tKind, tkl) = lookAhead(text[1], '=', tkEq, tkAsgn)
elif text[0] == '!': (tKind, tkl) = lookAhead(text[1], '=', tkNeq, tkNot)
elif text[0] == '&': (tKind, tkl) = lookAhead(text[1], '&', tkAnd, tkUnknown)
elif text[0] == '|': (tKind, tkl) = lookAhead(text[1], '|', tkOr, tkUnknown)
 
# Keywords
elif match(text, re"\Aif\b"): (tKind, tkl) = (tkIf, 2)
elif match(text, re"\Aelse\b"): (tKind, tkl) = (tkElse, 4)
elif match(text, re"\Awhile\b"): (tKind, tkl) = (tkWhile, 5)
elif match(text, re"\Aprint\b"): (tKind, tkl) = (tkPrint, 5)
elif match(text, re"\Aputc\b"): (tKind, tkl) = (tkPutc, 4)
 
# Literals and identifiers
elif match(text, re"\A([0-9]+)", matches):
(tKind, tkl) = (tkInt, matches[0].len)
val = matches[0]
elif match(text, re"\A([_a-zA-Z][_a-zA-Z0-9]*)", matches):
(tKind, tkl) = (tkId, matches[0].len)
val = matches[0]
elif match(text, re"\A('(?:[^'\n]|\\\\|\\n)')", matches):
(tKind, tkl) = (tkChar, matches[0].len)
val = case matches[0]
of r"' '": $ord(' ')
of r"'\n'": $ord('\n')
of r"'\\'": $ord('\\')
else: $ord(matches[0][1]) # "'a'"[1] == 'a'
elif match(text, re"\A(""[^""\n]*"")", matches):
(tKind, tkl) = (tkStr, matches[0].len)
val = matches[0]
else: (tKind, tkl) = (tkUnknown, 1)
 
text = text[tkl..^1]
Token(kind: tKind, value: val)
 
proc tokenize*(text: string): seq[TokenAnn] =
result = newSeq[TokenAnn]()
var
lineNo, colNo: int = 1
text = text
token: Token
tokenLength: int
 
while text.len > 0:
stripUnimportant(text, lineNo, colNo)
token = consumeToken(text, tokenLength)
result.add TokenAnn(token: token, line: lineNo, column: colNo)
inc colNo, tokenLength
 
proc output*(s: seq[TokenAnn]): string =
var
tokenKind: TokenKind
value: string
line, column: int
 
for tokenAnn in items(s):
line = tokenAnn.line
column = tokenAnn.column
tokenKind = tokenAnn.token.kind
value = tokenAnn.token.value
result.add(
fmt"{line:>5}{column:>7} {tokenKind:<15}{value}"
.strip(leading = false) & "\n")
 
when isMainModule:
import os
 
let input = if paramCount() > 0: readFile paramStr(1)
else: readAll stdin
 
echo input.tokenize.output
</syntaxhighlight>
===Using stream with lexer library===
<syntaxhighlight lang="nim">
import lexbase, streams
from strutils import Whitespace
 
type
TokenKind = enum
tkInvalid = "Invalid",
tkOpMultiply = "Op_multiply",
tkOpDivide = "Op_divide",
tkOpMod = "Op_mod",
tkOpAdd = "Op_add",
tkOpSubtract = "Op_subtract",
tkOpLess = "Op_less",
tkOpLessEqual = "Op_lessequal",
tkOpGreater = "Op_greater",
tkOpGreaterEqual = "Op_greaterequal",
tkOpEqual = "Op_equal",
tkOpNotEqual = "Op_notequal",
tkOpNot = "Op_not",
tkOpAssign = "Op_assign",
tkOpAnd = "Op_and",
tkOpOr = "Op_or",
tkLeftParen = "LeftParen",
tkRightParen = "RightParen",
tkLeftBrace = "LeftBrace",
tkRightBrace = "RightBrace",
tkSemicolon = "Semicolon",
tkComma = "Comma",
tkKeywordIf = "Keyword_if",
tkKeywordElse = "Keyword_else",
tkKeywordWhile = "Keyword_while",
tkKeywordPrint = "Keyword_print",
tkKeywordPutc = "Keyword_putc",
tkIdentifier = "Identifier",
tkInteger = "Integer",
tkString = "String",
tkEndOfInput = "End_of_input"
 
Lexer = object of BaseLexer
kind: TokenKind
token, error: string
startPos: int
 
template setError(l: var Lexer; err: string): untyped =
l.kind = tkInvalid
if l.error.len == 0:
l.error = err
 
proc hasError(l: Lexer): bool {.inline.} =
l.error.len > 0
 
proc open(l: var Lexer; input: Stream) {.inline.} =
lexbase.open(l, input)
l.startPos = 0
l.kind = tkInvalid
l.token = ""
l.error = ""
 
proc handleNewLine(l: var Lexer) =
case l.buf[l.bufpos]
of '\c': l.bufpos = l.handleCR l.bufpos
of '\n': l.bufpos = l.handleLF l.bufpos
else: discard
 
proc skip(l: var Lexer) =
while true:
case l.buf[l.bufpos]
of Whitespace:
if l.buf[l.bufpos] notin NewLines:
inc l.bufpos
else:
handleNewLine l
of '/':
if l.buf[l.bufpos + 1] == '*':
inc l.bufpos, 2
while true:
case l.buf[l.bufpos]
of '*':
if l.buf[l.bufpos + 1] == '/':
inc l.bufpos, 2
break
else: inc l.bufpos
of NewLines:
handleNewLine l
of EndOfFile:
setError l, "EOF reached in comment"
return
else:
inc l.bufpos
else: break
else: break
 
proc handleSpecial(l: var Lexer): char =
assert l.buf[l.bufpos] == '\\'
inc l.bufpos
case l.buf[l.bufpos]
of 'n':
l.token.add "\\n"
result = '\n'
inc l.bufpos
of '\\':
l.token.add "\\\\"
result = '\\'
inc l.bufpos
else:
setError l, "Unknown escape sequence: '\\" & l.buf[l.bufpos] & "'"
result = '\0'
 
proc handleChar(l: var Lexer) =
assert l.buf[l.bufpos] == '\''
l.startPos = l.getColNumber l.bufpos
l.kind = tkInvalid
inc l.bufpos
if l.buf[l.bufpos] == '\\':
l.token = $ord(handleSpecial l)
if hasError l: return
elif l.buf[l.bufpos] == '\'':
setError l, "Empty character constant"
return
else:
l.token = $ord(l.buf[l.bufpos])
inc l.bufpos
if l.buf[l.bufpos] == '\'':
l.kind = tkInteger
inc l.bufpos
else:
setError l, "Multi-character constant"
 
proc handleString(l: var Lexer) =
assert l.buf[l.bufpos] == '"'
l.startPos = l.getColNumber l.bufpos
l.token = "\""
inc l.bufpos
while true:
case l.buf[l.bufpos]
of '\\':
discard handleSpecial l
if hasError l: return
of '"':
l.kind = tkString
add l.token, '"'
inc l.bufpos
break
of NewLines:
setError l, "EOL reached before end-of-string"
return
of EndOfFile:
setError l, "EOF reached before end-of-string"
return
else:
add l.token, l.buf[l.bufpos]
inc l.bufpos
 
proc handleNumber(l: var Lexer) =
assert l.buf[l.bufpos] in {'0'..'9'}
l.startPos = l.getColNumber l.bufpos
l.token = "0"
while l.buf[l.bufpos] == '0': inc l.bufpos
while true:
case l.buf[l.bufpos]
of '0'..'9':
if l.token == "0":
setLen l.token, 0
add l.token, l.buf[l.bufpos]
inc l.bufpos
of 'a'..'z', 'A'..'Z', '_':
setError l, "Invalid number"
return
else:
l.kind = tkInteger
break
 
proc handleIdent(l: var Lexer) =
assert l.buf[l.bufpos] in {'a'..'z'}
l.startPos = l.getColNumber l.bufpos
setLen l.token, 0
while true:
if l.buf[l.bufpos] in {'a'..'z', 'A'..'Z', '0'..'9', '_'}:
add l.token, l.buf[l.bufpos]
inc l.bufpos
else:
break
l.kind = case l.token
of "if": tkKeywordIf
of "else": tkKeywordElse
of "while": tkKeywordWhile
of "print": tkKeywordPrint
of "putc": tkKeywordPutc
else: tkIdentifier
 
proc getToken(l: var Lexer): TokenKind =
l.kind = tkInvalid
setLen l.token, 0
skip l
 
case l.buf[l.bufpos]
of '*':
l.kind = tkOpMultiply
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
of '/':
l.kind = tkOpDivide
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
of '%':
l.kind = tkOpMod
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
of '+':
l.kind = tkOpAdd
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
of '-':
l.kind = tkOpSubtract
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
of '<':
l.kind = tkOpLess
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
if l.buf[l.bufpos] == '=':
l.kind = tkOpLessEqual
inc l.bufpos
of '>':
l.kind = tkOpGreater
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
if l.buf[l.bufpos] == '=':
l.kind = tkOpGreaterEqual
inc l.bufpos
of '=':
l.kind = tkOpAssign
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
if l.buf[l.bufpos] == '=':
l.kind = tkOpEqual
inc l.bufpos
of '!':
l.kind = tkOpNot
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
if l.buf[l.bufpos] == '=':
l.kind = tkOpNotEqual
inc l.bufpos
of '&':
if l.buf[l.bufpos + 1] == '&':
l.kind = tkOpAnd
l.startPos = l.getColNumber l.bufpos
inc l.bufpos, 2
else:
setError l, "Unrecognized character"
of '|':
if l.buf[l.bufpos + 1] == '|':
l.kind = tkOpOr
l.startPos = l.getColNumber l.bufpos
inc l.bufpos, 2
else:
setError l, "Unrecognized character"
of '(':
l.kind = tkLeftParen
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
of ')':
l.kind = tkRightParen
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
of '{':
l.kind = tkLeftBrace
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
of '}':
l.kind = tkRightBrace
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
of ';':
l.kind = tkSemicolon
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
of ',':
l.kind = tkComma
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
of '\'': handleChar l
of '"': handleString l
of '0'..'9': handleNumber l
of 'a'..'z', 'A'..'Z': handleIdent l
of EndOfFile:
l.startPos = l.getColNumber l.bufpos
l.kind = tkEndOfInput
else:
setError l, "Unrecognized character"
result = l.kind
 
when isMainModule:
import os, strformat
proc main() =
var l: Lexer
if paramCount() < 1:
open l, newFileStream stdin
else:
open l, newFileStream paramStr(1)
while l.getToken notin {tkInvalid}:
stdout.write &"{l.lineNumber:5} {l.startPos + 1:5} {l.kind:<14}"
if l.kind in {tkIdentifier, tkInteger, tkString}:
stdout.write &" {l.token}"
stdout.write '\n'
if l.kind == tkEndOfInput:
break
if hasError l:
echo &"({l.lineNumber},{l.getColNumber l.bufpos + 1}) {l.error}"
main()
</syntaxhighlight>
 
===Using nothing but system and strutils===
<syntaxhighlight lang="nim">import strutils
 
type
 
TokenKind* = enum
tokMult = "Op_multiply", tokDiv = "Op_divide", tokMod = "Op_mod",
tokAdd = "Op_add", tokSub = "Op_subtract", tokLess = "Op_less",
tokLessEq = "Op_lessequal", tokGreater = "Op_greater",
tokGreaterEq = "Op_greaterequal", tokEq = "Op_equal",
tokNotEq = "Op_notequal", tokNot = "Op_not", tokAssign = "Op_assign",
tokAnd = "Op_and", tokOr = "Op_or"
tokLPar = "LeftParen", tokRPar = "RightParen"
tokLBrace = "LeftBrace", tokRBrace = "RightBrace"
tokSemi = "Semicolon", tokComma = "Comma"
tokIf = "Keyword_if", tokElse = "Keyword_else", tokWhile = "Keyword_while",
tokPrint = "Keyword_print", tokPutc = "Keyword_putc"
tokIdent = "Identifier", tokInt = "Integer", tokChar = "Integer",
tokString = "String"
tokEnd = "End_of_input"
 
Token* = object
ln*, col*: int
case kind*: TokenKind
of tokIdent: ident*: string
of tokInt: intVal*: int
of tokChar: charVal*: char
of tokString: stringVal*: string
else: discard
 
Lexer* = object
input: string
pos: int
ln, col: int
 
LexicalError* = object of CatchableError
ln*, col*: int
 
proc error(lexer: var Lexer, message: string) =
  var err = newException(LexicalError, message)
  err.ln = lexer.ln
  err.col = lexer.col
  raise err
 
template current: char =
if lexer.pos < lexer.input.len: lexer.input[lexer.pos]
else: '\x00'
template get(n: int): string =
if lexer.pos < lexer.input.len:
    lexer.input[min(lexer.pos, lexer.input.len)..
                min(lexer.pos + n - 1, lexer.input.high)]
else: ""
 
template next() =
inc(lexer.pos); inc(lexer.col)
if current() == '\n':
inc(lexer.ln)
lexer.col = 0
elif current() == '\r':
lexer.col = 0
 
proc skip(lexer: var Lexer) =
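  ## Skip any run of whitespace and /* ... */ block comments.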
while true:
if current() in Whitespace:
while current() in Whitespace:
next()
continue
elif get(2) == "/*":
next(); next()
while get(2) != "*/":
if current() == '\x00':
lexer.error("Unterminated comment")
next()
next(); next()
continue
else: discard
break
 
proc charOrEscape(lexer: var Lexer): char =
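  ## Read a single character, translating the two supported escape
  ## sequences, \n and \\.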
if current() != '\\':
result = current()
next()
else:
next()
case current()
of 'n': result = '\n'
of '\\': result = '\\'
else: lexer.error("Unknown escape sequence '\\" & current() & "'")
next()
 
proc next*(lexer: var Lexer): Token =
let
ln = lexer.ln
col = lexer.col
 
case current()
of '*': result = Token(kind: tokMult); next()
of '/': result = Token(kind: tokDiv); next()
of '%': result = Token(kind: tokMod); next()
of '+': result = Token(kind: tokAdd); next()
of '-': result = Token(kind: tokSub); next()
  of '<':
    next()
    if current() == '=':
      result = Token(kind: tokLessEq)
      next()
    else:
      result = Token(kind: tokLess)
of '>':
next()
if current() == '=':
result = Token(kind: tokGreaterEq)
next()
else:
result = Token(kind: tokGreater)
of '=':
next()
if current() == '=':
result = Token(kind: tokEq)
next()
else:
result = Token(kind: tokAssign)
of '!':
next()
if current() == '=':
result = Token(kind: tokNotEq)
next()
else:
result = Token(kind: tokNot)
of '&':
next()
if current() == '&':
result = Token(kind: tokAnd)
next()
else:
lexer.error("'&&' expected")
of '|':
next()
if current() == '|':
result = Token(kind: tokOr)
next()
else:
lexer.error("'||' expected")
of '(': result = Token(kind: tokLPar); next()
of ')': result = Token(kind: tokRPar); next()
of '{': result = Token(kind: tokLBrace); next()
of '}': result = Token(kind: tokRBrace); next()
of ';': result = Token(kind: tokSemi); next()
of ',': result = Token(kind: tokComma); next()
of '\'':
next()
if current() == '\'': lexer.error("Empty character literal")
let ch = lexer.charOrEscape()
if current() != '\'':
lexer.error("Character literal must contain a single character or " &
"escape sequence")
result = Token(kind: tokChar, charVal: ch)
next()
of '0'..'9':
var number = ""
while current() in Digits:
number.add(current())
next()
if current() in IdentStartChars:
lexer.error("Integer literal ends in non-digit characters")
result = Token(kind: tokInt, intVal: parseInt(number))
of '"':
next()
var str = ""
while current() notin {'"', '\x00', '\n'}:
str.add(lexer.charOrEscape())
if current() == '\x00':
lexer.error("Unterminated string literal")
elif current() == '\n':
lexer.error("Line feed in string literal")
else:
next()
result = Token(kind: tokString, stringVal: str)
of IdentStartChars:
var ident = $current()
next()
while current() in IdentChars:
ident.add(current())
next()
case ident
of "if": result = Token(kind: tokIf)
of "else": result = Token(kind: tokElse)
of "while": result = Token(kind: tokWhile)
of "print": result = Token(kind: tokPrint)
of "putc": result = Token(kind: tokPutc)
else: result = Token(kind: tokIdent, ident: ident)
of '\x00':
result = Token(kind: tokEnd)
else:
lexer.error("Unexpected character: '" & current() & "'")
 
result.ln = ln
result.col = col
lexer.skip()
 
proc peek*(lexer: var Lexer): Token =
discard
 
proc initLexer*(input: string): Lexer =
result = Lexer(input: input, pos: 0, ln: 1, col: 1)
result.skip()
 
when isMainModule:
let code = readAll(stdin)
var
lexer = initLexer(code)
token: Token
while true:
token = lexer.next()
stdout.write(token.ln, ' ', token.col, ' ', token.kind)
case token.kind
of tokInt: stdout.write(' ', token.intVal)
of tokChar: stdout.write(' ', token.charVal.ord)
of tokString: stdout.write(" \"", token.stringVal
.replace("\\", "\\\\")
.replace("\n", "\\n"), '"')
of tokIdent: stdout.write(' ', token.ident)
else: discard
stdout.write('\n')
if token.kind == tokEnd:
break</syntaxhighlight>
 
=={{header|ObjectIcon}}==
{{trans|Icon}}
{{trans|ATS}}
 
 
There are very few changes from the ordinary Icon version: I/O is modified to use FileStreams, and the '''max''' procedure is removed because Object Icon provides it as a builtin.
 
 
<syntaxhighlight lang="objecticon"># -*- ObjectIcon -*-
#
# The Rosetta Code lexical analyzer in Object Icon. Based upon the ATS
# implementation.
#
# Usage: lex [INPUTFILE [OUTPUTFILE]]
# If INPUTFILE or OUTPUTFILE is "-" or missing, then standard input
# or standard output is used, respectively.
#
 
import io
 
$define EOF -1
 
$define TOKEN_ELSE 0
$define TOKEN_IF 1
$define TOKEN_PRINT 2
$define TOKEN_PUTC 3
$define TOKEN_WHILE 4
$define TOKEN_MULTIPLY 5
$define TOKEN_DIVIDE 6
$define TOKEN_MOD 7
$define TOKEN_ADD 8
$define TOKEN_SUBTRACT 9
$define TOKEN_NEGATE 10
$define TOKEN_LESS 11
$define TOKEN_LESSEQUAL 12
$define TOKEN_GREATER 13
$define TOKEN_GREATEREQUAL 14
$define TOKEN_EQUAL 15
$define TOKEN_NOTEQUAL 16
$define TOKEN_NOT 17
$define TOKEN_ASSIGN 18
$define TOKEN_AND 19
$define TOKEN_OR 20
$define TOKEN_LEFTPAREN 21
$define TOKEN_RIGHTPAREN 22
$define TOKEN_LEFTBRACE 23
$define TOKEN_RIGHTBRACE 24
$define TOKEN_SEMICOLON 25
$define TOKEN_COMMA 26
$define TOKEN_IDENTIFIER 27
$define TOKEN_INTEGER 28
$define TOKEN_STRING 29
$define TOKEN_END_OF_INPUT 30
 
global whitespace
global ident_start
global ident_continuation
 
procedure main(args)
local inpf, outf
local pushback_buffer, inp, pushback
 
initial {
whitespace := ' \t\v\f\r\n'
ident_start := '_' ++ &letters
ident_continuation := ident_start ++ &digits
}
 
inpf := FileStream.stdin
outf := FileStream.stdout
if 1 <= *args & args[1] ~== "-" then {
inpf := FileStream(args[1], FileOpt.RDONLY) | stop(&why)
}
if 2 <= *args & args[2] ~== "-" then {
outf := FileStream(args[2], ior(FileOpt.WRONLY,
FileOpt.TRUNC,
FileOpt.CREAT)) | stop(&why)
}
 
pushback_buffer := []
inp := create inputter(inpf, pushback_buffer)
pushback := create repeat push(pushback_buffer, \@&source)
@pushback # The first invocation does nothing.
 
scan_text(outf, inp, pushback)
end
 
procedure scan_text(outf, inp, pushback)
local ch
 
while /ch | ch[1] ~=== EOF do {
skip_spaces_and_comments(inp, pushback)
ch := @inp
if ch[1] === EOF then {
print_token(outf, [TOKEN_END_OF_INPUT, "", ch[2], ch[3]])
} else {
ch @pushback
print_token(outf, get_next_token(inp, pushback))
}
}
end
 
procedure get_next_token(inp, pushback)
local ch, ch1
local ln, cn
 
skip_spaces_and_comments(inp, pushback)
ch := @inp
ln := ch[2] # line number
cn := ch[3] # column number
case ch[1] of {
"," : return [TOKEN_COMMA, ",", ln, cn]
";" : return [TOKEN_SEMICOLON, ";", ln, cn]
"(" : return [TOKEN_LEFTPAREN, "(", ln, cn]
")" : return [TOKEN_RIGHTPAREN, ")", ln, cn]
"{" : return [TOKEN_LEFTBRACE, "{", ln, cn]
"}" : return [TOKEN_RIGHTBRACE, "}", ln, cn]
"*" : return [TOKEN_MULTIPLY, "*", ln, cn]
"/" : return [TOKEN_DIVIDE, "/", ln, cn]
"%" : return [TOKEN_MOD, "%", ln, cn]
"+" : return [TOKEN_ADD, "+", ln, cn]
"-" : return [TOKEN_SUBTRACT, "-", ln, cn]
"<" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_LESSEQUAL, "<=", ln, cn]
} else {
ch1 @pushback
return [TOKEN_LESS, "<", ln, cn]
}
}
">" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_GREATEREQUAL, ">=", ln, cn]
} else {
ch1 @pushback
return [TOKEN_GREATER, ">", ln, cn]
}
}
"=" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_EQUAL, "==", ln, cn]
} else {
ch1 @pushback
return [TOKEN_ASSIGN, "=", ln, cn]
}
}
"!" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_NOTEQUAL, "!=", ln, cn]
} else {
ch1 @pushback
return [TOKEN_NOT, "!", ln, cn]
}
}
"&" : {
ch1 := @inp
if ch1[1] === "&" then {
return [TOKEN_AND, "&&", ln, cn]
} else {
unexpected_character(ln, cn, ch)
}
}
"|" : {
ch1 := @inp
if ch1[1] === "|" then {
return [TOKEN_OR, "||", ln, cn]
} else {
unexpected_character(ln, cn, ch)
}
}
"\"" : {
ch @pushback
return scan_string_literal(inp)
}
"'" : {
ch @pushback
return scan_character_literal(inp, pushback)
}
default : {
if any(&digits, ch[1]) then {
ch @pushback
return scan_integer_literal(inp, pushback)
} else if any(ident_start, ch[1]) then {
ch @pushback
return scan_identifier_or_reserved_word (inp, pushback)
} else {
unexpected_character(ln, cn, ch)
}
}
}
end
 
procedure scan_identifier_or_reserved_word(inp, pushback)
local ch
local s
local line_no, column_no
 
s := ""
ch := @inp
line_no := ch[2]
column_no := ch[3]
while EOF ~=== ch[1] & any(ident_continuation, ch[1]) do {
s ||:= ch[1]
ch := @inp
}
ch @pushback
return reserved_word_lookup (s, line_no, column_no)
end
 
procedure scan_integer_literal(inp, pushback)
local ch
local s
local line_no, column_no
 
s := ""
ch := @inp
line_no := ch[2]
column_no := ch[3]
while EOF ~=== ch[1] & any(ident_continuation, ch[1]) do {
s ||:= ch[1]
ch := @inp
}
ch @pushback
not upto(~&digits, s) | invalid_integer_literal(line_no, column_no, s)
return [TOKEN_INTEGER, s, line_no, column_no]
end
 
procedure scan_character_literal(inp, pushback)
local ch, ch1
local close_quote
local toktup
local line_no, column_no
 
ch := @inp # The opening quote.
close_quote := ch[1] # Same as the opening quote.
ch @pushback
 
line_no := ch[2]
column_no := ch[3]
 
toktup := scan_character_literal_without_checking_end(inp)
ch1 := @inp
if ch1[1] ~=== close_quote then {
repeat {
case ch1[1] of {
EOF : unterminated_character_literal(line_no, column_no)
close_quote : multicharacter_literal(line_no, column_no)
default : ch1 := @inp
}
}
}
return toktup
end
 
procedure scan_character_literal_without_checking_end(inp)
local ch, ch1, ch2
 
ch := @inp # The opening quote.
ch1 := @inp
EOF ~=== ch1[1] | unterminated_character_literal(ch[2], ch[3])
if ch1[1] == "\\" then {
ch2 := @inp
EOF ~=== ch2[1] | unterminated_character_literal(ch[2], ch[3])
case ch2[1] of {
"n" : return [TOKEN_INTEGER, string(ord("\n")), ch[2], ch[3]]
"\\" : return [TOKEN_INTEGER, string(ord("\\")), ch[2], ch[3]]
default : unsupported_escape(ch1[2], ch1[3], ch2)
}
} else {
return [TOKEN_INTEGER, string(ord(ch1[1])), ch[2], ch[3]]
}
end
 
procedure scan_string_literal(inp)
local ch, ch1, ch2
local line_no, column_no
local close_quote
local s
local retval
 
ch := @inp # The opening quote
close_quote := ch[1] # Same as the opening quote.
line_no := ch[2]
column_no := ch[3]
 
s := ch[1]
until \retval do {
ch1 := @inp
ch1[1] ~=== EOF |
unterminated_string_literal (line_no, column_no,
"end of input")
ch1[1] ~== "\n" |
unterminated_string_literal (line_no, column_no,
"end of line")
if ch1[1] == close_quote then {
retval := [TOKEN_STRING, s || close_quote, line_no, column_no]
} else if ch1[1] ~== "\\" then {
s ||:= ch1[1]
} else {
ch2 := @inp
EOF ~=== ch2[1] | unsupported_escape(line_no, column_no, ch2)
case ch2[1] of {
"n" : s ||:= "\\n"
"\\" : s ||:= "\\\\"
default : unsupported_escape(line_no, column_no, ch2)
}
}
}
return retval
end
 
procedure skip_spaces_and_comments(inp, pushback)
local ch, ch1
 
repeat {
ch := @inp
(EOF === ch[1]) & { ch @pushback; return }
if not any(whitespace, ch[1]) then {
(ch[1] == "/") | { ch @pushback; return }
(ch1 := @inp) | { ch @pushback; return }
(ch1[1] == "*") | { ch1 @pushback; ch @pushback; return }
scan_comment(inp, ch[2], ch[3])
}
}
end
 
procedure scan_comment(inp, line_no, column_no)
  local ch

  ch := @inp
  repeat {
    (EOF === ch[1]) & unterminated_comment(line_no, column_no)
    if ch[1] == "*" then {
      ch := @inp
      (EOF === ch[1]) & unterminated_comment(line_no, column_no)
      if ch[1] == "/" then return
      # A "*" read here may itself start the "*/" terminator, so it is
      # re-examined on the next pass rather than discarded.
    } else {
      ch := @inp
    }
  }
end
 
procedure reserved_word_lookup(s, line_no, column_no)
 
# Lookup is by an extremely simple perfect hash.
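  # The hash ((ord(s[1]) + ord(s[2])) % 9) + 1 places each keyword in its
  # own slot: "if"->1, "print"->2, "else"->3, "putc"->5, "while"->8.
  # The unused slots hold "" and resolve to TOKEN_IDENTIFIER.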
 
static reserved_words
static reserved_word_tokens
local hashval, token, toktup
 
initial {
reserved_words := ["if", "print", "else",
"", "putc", "",
"", "while", ""]
reserved_word_tokens :=
[TOKEN_IF, TOKEN_PRINT, TOKEN_ELSE,
TOKEN_IDENTIFIER, TOKEN_PUTC, TOKEN_IDENTIFIER,
TOKEN_IDENTIFIER, TOKEN_WHILE, TOKEN_IDENTIFIER]
}
 
if *s < 2 then {
toktup := [TOKEN_IDENTIFIER, s, line_no, column_no]
} else {
hashval := ((ord(s[1]) + ord(s[2])) % (*reserved_words)) + 1
token := reserved_word_tokens[hashval]
if token = TOKEN_IDENTIFIER | s ~== reserved_words[hashval] then {
toktup := [TOKEN_IDENTIFIER, s, line_no, column_no]
} else {
toktup := [token, s, line_no, column_no]
}
}
return toktup
end
 
procedure print_token(outf, toktup)
static token_names
local s_line, s_column
 
initial {
token_names := ["Keyword_else",
"Keyword_if",
"Keyword_print",
"Keyword_putc",
"Keyword_while",
"Op_multiply",
"Op_divide",
"Op_mod",
"Op_add",
"Op_subtract",
"Op_negate",
"Op_less",
"Op_lessequal",
"Op_greater",
"Op_greaterequal",
"Op_equal",
"Op_notequal",
"Op_not",
"Op_assign",
"Op_and",
"Op_or",
"LeftParen",
"RightParen",
"LeftBrace",
"RightBrace",
"Semicolon",
"Comma",
"Identifier",
"Integer",
"String",
"End_of_input"]
}
 
/outf := FileStream.stdout
s_line := string(toktup[3])
s_column := string(toktup[4])
writes(outf, right (s_line, max(5, *s_line)))
writes(outf, " ")
writes(outf, right (s_column, max(5, *s_column)))
writes(outf, " ")
writes(outf, token_names[toktup[1] + 1])
case toktup[1] of {
TOKEN_IDENTIFIER : writes(outf, " ", toktup[2])
TOKEN_INTEGER : writes(outf, " ", toktup[2])
TOKEN_STRING : writes(outf, " ", toktup[2])
}
write(outf)
return
end
 
procedure inputter(inpf, pushback_buffer)
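  # A generator, driven as a co-expression, that yields [character, line,
  # column] triples. Characters waiting in pushback_buffer are delivered
  # first; the input is read in 2048-byte chunks, and [EOF, line, column]
  # is produced once the stream is exhausted.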
local buffer
local line_no, column_no
local c
 
buffer := ""
line_no := 1
column_no := 1
 
repeat {
buffer? {
until *pushback_buffer = 0 & pos(0) do {
if *pushback_buffer ~= 0 then {
suspend pop(pushback_buffer)
} else {
c := move(1)
suspend [c, line_no, column_no]
if c == "\n" then {
line_no +:= 1
column_no := 1
} else {
column_no +:= 1
}
}
}
}
(buffer := reads(inpf, 2048)) |
suspend [EOF, line_no, column_no]
}
end
 
procedure unterminated_comment(line_no, column_no)
error("unterminated comment starting at ",
line_no, ":", column_no)
end
 
procedure unexpected_character(line_no, column_no, ch)
error("unexpected character '", ch[1], "' starting at ",
line_no, ":", column_no)
end
 
procedure unterminated_string_literal (line_no, column_no, cause)
error("unterminated string literal (", cause, ") starting at ",
line_no, ":", column_no)
end
 
procedure unsupported_escape (line_no, column_no, ch)
if ch[1] === EOF then {
error("unexpected \\ at end of input",
" starting at ", line_no, ":", column_no)
} else {
error("unsupported escape \\", ch[1],
" starting at ", line_no, ":", column_no)
}
end
 
procedure invalid_integer_literal(line_no, column_no, s)
error("invalid integer literal ", s,
" starting at ", line_no, ":", column_no)
end
 
procedure unterminated_character_literal(line_no, column_no)
error("unterminated character literal starting at ",
line_no, ":", column_no)
end
 
procedure multicharacter_literal(line_no, column_no)
error("unsupported multicharacter literal starting at ",
line_no, ":", column_no)
end
 
procedure error(args[])
write!([FileStream.stderr] ||| args)
exit(1)
end</syntaxhighlight>
 
 
{{out}}
<pre>$ oit -s -o lex lex-in-ObjectIcon.icn && ./lex compiler-tests/testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|OCaml}}==
{{works with|OCaml|4.12.1}}
{{trans|ATS}}
 
This is a close translation of the ATS. It may interest the reader to compare the two implementations.
 
(Much of the extra complication in the ATS comes from arrays being a linear type (whose "views" need tending), and from values of linear type having to be local to any function using them. This limitation could have been worked around, and arrays more similar to OCaml arrays could have been used, but at a cost in safety and efficiency.)
 
<syntaxhighlight lang="ocaml">(*------------------------------------------------------------------*)
(* The Rosetta Code lexical analyzer, in OCaml. Based on the ATS. *)
 
(* When you compare this code to the ATS code, please keep in mind
that, although ATS has an ML-like syntax:
 
* The type system is not the same at all.
 
* Most ATS functions are not closures. Those that are will have
special notations such as "<cloref1>" associated with them. *)
 
(*------------------------------------------------------------------*)
(* The following functions are compatible with ASCII. *)
 
let is_digit ichar =
48 <= ichar && ichar <= 57
 
let is_lower ichar =
97 <= ichar && ichar <= 122
 
let is_upper ichar =
65 <= ichar && ichar <= 90
 
let is_alpha ichar =
is_lower ichar || is_upper ichar
 
let is_alnum ichar =
is_digit ichar || is_alpha ichar
 
let is_ident_start ichar =
is_alpha ichar || ichar = 95
 
let is_ident_continuation ichar =
is_alnum ichar || ichar = 95
 
let is_space ichar =
ichar = 32 || (9 <= ichar && ichar <= 13)
 
(*------------------------------------------------------------------*)
 
let reverse_list_to_string lst =
List.rev lst |> List.to_seq |> String.of_seq
 
(*------------------------------------------------------------------*)
(* Character input more like that of C. There are various advantages
and disadvantages to this method, but key points in its favor are:
(a) it is how character input is done in the original ATS code, (b)
Unicode code points are 21-bit positive integers. *)
 
let eof = (-1)
 
let input_ichar channel =
try
int_of_char (input_char channel)
with
| End_of_file -> eof
 
(*------------------------------------------------------------------*)
 
(* The type of an input character. *)
 
module Ch =
struct
type t =
{
ichar : int;
line_no : int;
column_no : int
}
end
 
(*------------------------------------------------------------------*)
(* Inputting with unlimited pushback, and with counting of lines and
columns. *)
 
module Inp =
struct
type t =
{
inpf : in_channel;
pushback : Ch.t list;
line_no : int;
column_no : int
}
 
let of_in_channel inpf =
{ inpf = inpf;
pushback = [];
line_no = 1;
column_no = 1
}
 
let get_ch inp =
match inp.pushback with
| ch :: tail ->
(ch, {inp with pushback = tail})
| [] ->
let ichar = input_ichar inp.inpf in
if ichar = int_of_char '\n' then
({ ichar = ichar;
line_no = inp.line_no;
column_no = inp.column_no },
{ inp with line_no = inp.line_no + 1;
column_no = 1 })
else
({ ichar = ichar;
line_no = inp.line_no;
column_no = inp.column_no },
{ inp with column_no = inp.column_no + 1 })
 
let push_back_ch ch inp =
{inp with pushback = ch :: inp.pushback}
end
 
(*------------------------------------------------------------------*)
(* Tokens, appearing in tuples with arguments, and with line and
column numbers. The tokens are integers, so they can be used as
array indices. *)
 
(* (token, argument, line_no, column_no) *)
type toktup_t = int * string * int * int
 
let token_ELSE = 0
let token_IF = 1
let token_PRINT = 2
let token_PUTC = 3
let token_WHILE = 4
let token_MULTIPLY = 5
let token_DIVIDE = 6
let token_MOD = 7
let token_ADD = 8
let token_SUBTRACT = 9
let token_NEGATE = 10
let token_LESS = 11
let token_LESSEQUAL = 12
let token_GREATER = 13
let token_GREATEREQUAL = 14
let token_EQUAL = 15
let token_NOTEQUAL = 16
let token_NOT = 17
let token_ASSIGN = 18
let token_AND = 19
let token_OR = 20
let token_LEFTPAREN = 21
let token_RIGHTPAREN = 22
let token_LEFTBRACE = 23
let token_RIGHTBRACE = 24
let token_SEMICOLON = 25
let token_COMMA = 26
let token_IDENTIFIER = 27
let token_INTEGER = 28
let token_STRING = 29
let token_END_OF_INPUT = 30
;;
 
(* A *very* simple perfect hash for the reserved words. (Yes, this is
overkill, except for demonstration of the principle.) *)
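(* Concretely, (s.[0] + s.[1]) mod 9 gives each keyword its own slot:
   "if" -> 0, "print" -> 1, "else" -> 2, "putc" -> 4, "while" -> 7.
   The remaining slots hold "" and resolve to token_IDENTIFIER. *)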
 
let reserved_words =
[| "if"; "print"; "else"; ""; "putc"; ""; ""; "while"; "" |]
let reserved_word_tokens =
[| token_IF; token_PRINT; token_ELSE; token_IDENTIFIER;
token_PUTC; token_IDENTIFIER; token_IDENTIFIER; token_WHILE;
token_IDENTIFIER |]
 
let reserved_word_lookup s line_no column_no =
if String.length s < 2 then
(token_IDENTIFIER, s, line_no, column_no)
else
let hashval = (int_of_char s.[0] + int_of_char s.[1]) mod 9 in
let token = reserved_word_tokens.(hashval) in
if token = token_IDENTIFIER || s <> reserved_words.(hashval) then
(token_IDENTIFIER, s, line_no, column_no)
else
(token, s, line_no, column_no)
 
(* Token to string lookup. *)
 
let token_names =
[| "Keyword_else";
"Keyword_if";
"Keyword_print";
"Keyword_putc";
"Keyword_while";
"Op_multiply";
"Op_divide";
"Op_mod";
"Op_add";
"Op_subtract";
"Op_negate";
"Op_less";
"Op_lessequal";
"Op_greater";
"Op_greaterequal";
"Op_equal";
"Op_notequal";
"Op_not";
"Op_assign";
"Op_and";
"Op_or";
"LeftParen";
"RightParen";
"LeftBrace";
"RightBrace";
"Semicolon";
"Comma";
"Identifier";
"Integer";
"String";
"End_of_input" |]
 
let token_name token =
token_names.(token)
 
(*------------------------------------------------------------------*)
 
exception Unterminated_comment of int * int
exception Unterminated_character_literal of int * int
exception Multicharacter_literal of int * int
exception End_of_input_in_string_literal of int * int
exception End_of_line_in_string_literal of int * int
exception Unsupported_escape of int * int * int
exception Invalid_integer_literal of int * int * string
exception Unexpected_character of int * int * char
 
(*------------------------------------------------------------------*)
(* Skipping past spaces and comments. (A comment in the target
language is, if you think about it, a kind of space.) *)
 
let scan_comment inp line_no column_no =
let rec loop inp =
let (ch, inp) = Inp.get_ch inp in
if ch.ichar = eof then
raise (Unterminated_comment (line_no, column_no))
else if ch.ichar = int_of_char '*' then
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = eof then
raise (Unterminated_comment (line_no, column_no))
else if ch1.ichar = int_of_char '/' then
inp
else
loop inp
else
loop inp
in
loop inp
 
let skip_spaces_and_comments inp =
let rec loop inp =
let (ch, inp) = Inp.get_ch inp in
if is_space ch.ichar then
loop inp
else if ch.ichar = int_of_char '/' then
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '*' then
scan_comment inp ch.line_no ch.column_no |> loop
else
let inp = Inp.push_back_ch ch1 inp in
let inp = Inp.push_back_ch ch inp in
inp
else
Inp.push_back_ch ch inp
in
loop inp
 
(*------------------------------------------------------------------*)
(* Integer literals, identifiers, and reserved words. *)
 
(* In ATS the predicate for simple scan was supplied by template
expansion, which (typically) produces faster code than passing a
function or closure (although passing either of those could have
been done). Here I pass the predicate as a function/closure. It is
worth contrasting the methods. *)
let rec simple_scan pred lst inp =
let (ch, inp) = Inp.get_ch inp in
if pred ch.ichar then
simple_scan pred (char_of_int ch.ichar :: lst) inp
else
(lst, Inp.push_back_ch ch inp)
 
(* Demonstration of one way to make a new closure in OCaml. (In ATS,
one might see things that look similar but are actually template
operations.) *)
let simple_scan_iic = simple_scan is_ident_continuation
 
let scan_integer_literal inp =
let (ch, inp) = Inp.get_ch inp in
let _ = assert (is_digit ch.ichar) in
let (lst, inp) = simple_scan_iic [char_of_int ch.ichar] inp in
let s = reverse_list_to_string lst in
if List.for_all (fun c -> is_digit (int_of_char c)) lst then
((token_INTEGER, s, ch.line_no, ch.column_no), inp)
else
raise (Invalid_integer_literal (ch.line_no, ch.column_no, s))
 
let scan_identifier_or_reserved_word inp =
let (ch, inp) = Inp.get_ch inp in
let _ = assert (is_ident_start ch.ichar) in
let (lst, inp) = simple_scan_iic [char_of_int ch.ichar] inp in
let s = reverse_list_to_string lst in
let toktup = reserved_word_lookup s ch.line_no ch.column_no in
(toktup, inp)
 
(*------------------------------------------------------------------*)
(* String literals. *)
 
let scan_string_literal inp =
let (ch, inp) = Inp.get_ch inp in
let _ = assert (ch.ichar = int_of_char '"') in
 
let rec scan lst inp =
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = eof then
raise (End_of_input_in_string_literal
(ch.line_no, ch.column_no))
else if ch1.ichar = int_of_char '\n' then
raise (End_of_line_in_string_literal
(ch.line_no, ch.column_no))
else if ch1.ichar = int_of_char '"' then
(lst, inp)
else if ch1.ichar <> int_of_char '\\' then
scan (char_of_int ch1.ichar :: lst) inp
else
let (ch2, inp) = Inp.get_ch inp in
if ch2.ichar = int_of_char 'n' then
scan ('n' :: '\\' :: lst) inp
else if ch2.ichar = int_of_char '\\' then
scan ('\\' :: '\\' :: lst) inp
else
raise (Unsupported_escape (ch1.line_no, ch1.column_no,
ch2.ichar))
in
let lst = '"' :: [] in
let (lst, inp) = scan lst inp in
let lst = '"' :: lst in
let s = reverse_list_to_string lst in
((token_STRING, s, ch.line_no, ch.column_no), inp)
 
(*------------------------------------------------------------------*)
(* Character literals. *)
 
let scan_character_literal_without_checking_end inp =
let (ch, inp) = Inp.get_ch inp in
let _ = assert (ch.ichar = int_of_char '\'') in
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = eof then
raise (Unterminated_character_literal
(ch.line_no, ch.column_no))
else if ch1.ichar = int_of_char '\\' then
let (ch2, inp) = Inp.get_ch inp in
if ch2.ichar = eof then
raise (Unterminated_character_literal
(ch.line_no, ch.column_no))
else if ch2.ichar = int_of_char 'n' then
let s = (int_of_char '\n' |> string_of_int) in
((token_INTEGER, s, ch.line_no, ch.column_no), inp)
else if ch2.ichar = int_of_char '\\' then
let s = (int_of_char '\\' |> string_of_int) in
((token_INTEGER, s, ch.line_no, ch.column_no), inp)
else
raise (Unsupported_escape
(ch1.line_no, ch1.column_no, ch2.ichar))
else
let s = string_of_int ch1.ichar in
((token_INTEGER, s, ch.line_no, ch.column_no), inp)
 
let scan_character_literal inp =
let (toktup, inp) =
scan_character_literal_without_checking_end inp in
let (_, _, line_no, column_no) = toktup in
 
let check_end inp =
let (ch, inp) = Inp.get_ch inp in
if ch.ichar = int_of_char '\'' then
inp
else
let rec loop_to_end (ch1 : Ch.t) inp =
if ch1.ichar = eof then
raise (Unterminated_character_literal (line_no, column_no))
else if ch1.ichar = int_of_char '\'' then
raise (Multicharacter_literal (line_no, column_no))
else
let (ch1, inp) = Inp.get_ch inp in
loop_to_end ch1 inp
in
loop_to_end ch inp
in
let inp = check_end inp in
(toktup, inp)
 
(*------------------------------------------------------------------*)
 
let get_next_token inp =
let inp = skip_spaces_and_comments inp in
let (ch, inp) = Inp.get_ch inp in
let ln = ch.line_no in
let cn = ch.column_no in
if ch.ichar = eof then
((token_END_OF_INPUT, "", ln, cn), inp)
else
match char_of_int ch.ichar with
| ',' -> ((token_COMMA, ",", ln, cn), inp)
| ';' -> ((token_SEMICOLON, ";", ln, cn), inp)
| '(' -> ((token_LEFTPAREN, "(", ln, cn), inp)
| ')' -> ((token_RIGHTPAREN, ")", ln, cn), inp)
| '{' -> ((token_LEFTBRACE, "{", ln, cn), inp)
| '}' -> ((token_RIGHTBRACE, "}", ln, cn), inp)
| '*' -> ((token_MULTIPLY, "*", ln, cn), inp)
| '/' -> ((token_DIVIDE, "/", ln, cn), inp)
| '%' -> ((token_MOD, "%", ln, cn), inp)
| '+' -> ((token_ADD, "+", ln, cn), inp)
| '-' -> ((token_SUBTRACT, "-", ln, cn), inp)
| '<' ->
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '=' then
((token_LESSEQUAL, "<=", ln, cn), inp)
else
let inp = Inp.push_back_ch ch1 inp in
((token_LESS, "<", ln, cn), inp)
| '>' ->
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '=' then
((token_GREATEREQUAL, ">=", ln, cn), inp)
else
let inp = Inp.push_back_ch ch1 inp in
((token_GREATER, ">", ln, cn), inp)
| '=' ->
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '=' then
((token_EQUAL, "==", ln, cn), inp)
else
let inp = Inp.push_back_ch ch1 inp in
((token_ASSIGN, "=", ln, cn), inp)
| '!' ->
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '=' then
((token_NOTEQUAL, "!=", ln, cn), inp)
else
let inp = Inp.push_back_ch ch1 inp in
((token_NOT, "!", ln, cn), inp)
| '&' ->
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '&' then
((token_AND, "&&", ln, cn), inp)
else
raise (Unexpected_character (ch.line_no, ch.column_no,
char_of_int ch.ichar))
| '|' ->
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '|' then
((token_OR, "||", ln, cn), inp)
else
raise (Unexpected_character (ch.line_no, ch.column_no,
char_of_int ch.ichar))
| '"' ->
let inp = Inp.push_back_ch ch inp in
scan_string_literal inp
| '\'' ->
let inp = Inp.push_back_ch ch inp in
scan_character_literal inp
| _ when is_digit ch.ichar ->
let inp = Inp.push_back_ch ch inp in
scan_integer_literal inp
| _ when is_ident_start ch.ichar ->
let inp = Inp.push_back_ch ch inp in
scan_identifier_or_reserved_word inp
| _ -> raise (Unexpected_character (ch.line_no, ch.column_no,
char_of_int ch.ichar))
 
let print_token outf toktup =
let (token, arg, line_no, column_no) = toktup in
let name = token_name token in
let (padding, str) =
match 0 with
| _ when token = token_IDENTIFIER -> (" ", arg)
| _ when token = token_INTEGER -> (" ", arg)
| _ when token = token_STRING -> (" ", arg)
| _ -> ("", "")
in
Printf.fprintf outf "%5d %5d %s%s%s\n"
line_no column_no name padding str
 
let scan_text outf inp =
let rec loop inp =
let (toktup, inp) = get_next_token inp in
begin
print_token outf toktup;
let (token, _, _, _) = toktup in
if token <> token_END_OF_INPUT then
loop inp
end
in
loop inp
 
(*------------------------------------------------------------------*)
 
let main () =
let inpf_filename =
if 2 <= Array.length Sys.argv then
Sys.argv.(1)
else
"-"
in
let outf_filename =
if 3 <= Array.length Sys.argv then
Sys.argv.(2)
else
"-"
in
let inpf =
if inpf_filename = "-" then
stdin
else
open_in inpf_filename
in
let outf =
if outf_filename = "-" then
stdout
else
open_out outf_filename
in
let inp = Inp.of_in_channel inpf in
scan_text outf inp
;;
 
main ()
 
(*------------------------------------------------------------------*)</syntaxhighlight>
 
{{out}}
<pre>$ ocamlopt -O2 lex.ml && ./a.out compiler-tests/testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|Ol}}==
==== Source ====
Note: for simplicity, we do not print the line and column position of each token.
 
<syntaxhighlight lang="scheme">
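; A tokenizer built from the (owl parse) combinators: each get-* parser
; below recognizes one class of lexeme and yields a (type . value) pair,
; with whitespace and comments consumed between tokens.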
(import (owl parse))
 
(define (get-comment)
(get-either
(let-parses (
(_ (get-imm #\*))
(_ (get-imm #\/)))
#true)
(let-parses (
(_ get-byte)
(_ (get-comment)))
#true)))
 
(define get-whitespace
(get-any-of
(get-byte-if (lambda (x) (has? '(#\tab #\newline #\space #\return) x))) ; whitespace
(let-parses ( ; comment
(_ (get-imm #\/))
(_ (get-imm #\*))
(_ (get-comment)))
#true)))
 
(define get-operator
(let-parses (
(operator (get-any-of
(get-word "||" 'Op_or)
(get-word "&&" 'Op_and)
(get-word "!=" 'Op_notequal)
(get-word "==" 'Op_equal)
(get-word ">=" 'Op_greaterequal)
(get-word "<=" 'Op_lessequal)
 
(get-word "=" 'Op_assign)
            (get-word "!" 'Op_not)
(get-word ">" 'Op_greater)
(get-word "<" 'Op_less)
(get-word "-" 'Op_subtract)
(get-word "+" 'Op_add)
(get-word "%" 'Op_mod)
(get-word "/" 'Op_divide)
(get-word "*" 'Op_multiply))))
(cons 'operator operator)))
 
(define get-symbol
(let-parses (
(symbol (get-any-of
(get-word "(" 'LeftParen)
(get-word ")" 'RightParen)
(get-word "{" 'LeftBrace)
(get-word "}" 'RightBrace)
(get-word ";" 'Semicolon)
(get-word "," 'Comma))))
(cons 'symbol symbol)))
 
(define get-keyword
(let-parses (
(keyword (get-any-of
(get-word "if" 'Keyword_if)
(get-word "else" 'Keyword_else)
(get-word "while" 'Keyword_while)
(get-word "print" 'Keyword_print)
(get-word "putc" 'Keyword_putc))))
(cons 'keyword keyword)))
 
 
 
(define get-identifier
(let-parses (
(lead (get-byte-if (lambda (x) (or (<= #\a x #\z) (<= #\A x #\Z) (= x #\_)))))
(tail (get-greedy* (get-byte-if (lambda (x) (or (<= #\a x #\z) (<= #\A x #\Z) (= x #\_) (<= #\0 x #\9)))))))
(cons 'identifier (bytes->string (cons lead tail)))))
 
(define get-integer
(let-parses (
(main (get-greedy+ (get-byte-if (lambda (x) (<= #\0 x #\9))))) )
(cons 'integer (string->integer (bytes->string main)))))
 
(define get-character
(let-parses (
(_ (get-imm #\'))
(char (get-any-of
(get-word "\\n" #\newline)
(get-word "\\\\" #\\)
(get-byte-if (lambda (x) (not (or (eq? x #\') (eq? x #\newline)))))))
(_ (get-imm #\')) )
(cons 'character char)))
 
(define get-string
(let-parses (
(_ (get-imm #\")) ;"
(data (get-greedy* (get-any-of
(get-word "\\n" #\newline)
(get-word "\\\\" #\\) ;\"
(get-byte-if (lambda (x) (not (or (eq? x #\") (eq? x #\newline)))))))) ;", newline
(_ (get-imm #\")) ) ;"
(cons 'string (bytes->string data))))
 
(define get-token
(let-parses (
(_ (get-greedy* get-whitespace))
(token (get-any-of
get-symbol
get-keyword
get-identifier
get-operator
get-integer
get-character
get-string
)) )
token))
 
(define token-parser
(let-parses (
(tokens (get-greedy+ get-token))
(_ (get-greedy* get-whitespace)))
tokens))
 
 
(define (translate source)
(let ((stream (try-parse token-parser (str-iter source) #t)))
(for-each print (car stream))
(if (null? (cdr stream))
(print 'End_of_input))))
</syntaxhighlight>
 
==== Testing ====
 
Testing function:
<syntaxhighlight lang="scheme">
(define (translate source)
(let ((stream (try-parse token-parser (str-iter source) #t)))
(for-each print (car stream))
(if (null? (force (cdr stream)))
(print 'End_of_input))))
</syntaxhighlight>
 
====== Testcase 1 ======
 
<syntaxhighlight lang="scheme">
(translate "
/*
Hello world
*/
print(\"Hello, World!\\\\n\");
")</syntaxhighlight>
{{Out}}
<pre>
(keyword . Keyword_print)
(symbol . LeftParen)
(string . Hello, World!\n)
(symbol . RightParen)
(symbol . Semicolon)
End_of_input
</pre>
 
====== Testcase 2 ======
 
<syntaxhighlight lang="scheme">
(translate "
/*
Show Ident and Integers
*/
phoenix_number = 142857;
print(phoenix_number, \"\\\\n\");
")</syntaxhighlight>
{{Out}}
<pre>
(identifier . phoenix_number)
(operator . Op_assign)
(integer . 142857)
(symbol . Semicolon)
(keyword . Keyword_print)
(symbol . LeftParen)
(identifier . phoenix_number)
(symbol . Comma)
(string . \n)
(symbol . RightParen)
(symbol . Semicolon)
End_of_input
</pre>
 
====== Testcase 3 ======
 
<syntaxhighlight lang="scheme">
(translate "
/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
*/
/* Print */ print /* Sub */ -
/* Putc */ putc /* Lss */ <
/* If */ if /* Gtr */ >
/* Else */ else /* Leq */ <=
/* While */ while /* Geq */ >=
/* Lbrace */ { /* Eq */ ==
/* Rbrace */ } /* Neq */ !=
/* Lparen */ ( /* And */ &&
/* Rparen */ ) /* Or */ ||
/* Uminus */ - /* Semi */ ;
/* Not */ ! /* Comma */ ,
/* Mul */ * /* Assign */ =
/* Div */ / /* Integer */ 42
/* Mod */ % /* String */ \"String literal\"
/* Add */ + /* Ident */ variable_name
/* character literal */ '\\n'
/* character literal */ '\\\\'
/* character literal */ ' '
")</syntaxhighlight>
{{Out}}
<pre>
(keyword . Keyword_print)
(operator . Op_subtract)
(keyword . Keyword_putc)
(operator . Op_less)
(keyword . Keyword_if)
(operator . Op_greater)
(keyword . Keyword_else)
(operator . Op_lessequal)
(keyword . Keyword_while)
(operator . Op_greaterequal)
(symbol . LeftBrace)
(operator . Op_equal)
(symbol . RightBrace)
(operator . Op_notequal)
(symbol . LeftParen)
(operator . Op_and)
(symbol . RightParen)
(operator . Op_or)
(operator . Op_subtract)
(symbol . Semicolon)
(operator . Op_not)
(symbol . Comma)
(operator . Op_multiply)
(operator . Op_assign)
(operator . Op_divide)
(integer . 42)
(operator . Op_mod)
(string . String literal)
(operator . Op_add)
(identifier . variable_name)
(character . 10)
(character . 92)
(character . 32)
End_of_input
</pre>
 
====== Testcase 4 ======
 
<syntaxhighlight lang="scheme">
(translate "
/*** test printing, embedded \\\\n and comments with lots of '*' ***/
print(42);
print(\"\\\\nHello World\\\\nGood Bye\\\\nok\\\\n\");
print(\"Print a slash n - \\\\\\\\n.\\\\n\");
")
</syntaxhighlight>
{{Out}}
<pre>
(keyword . Keyword_print)
(symbol . LeftParen)
(integer . 42)
(symbol . RightParen)
(symbol . Semicolon)
(keyword . Keyword_print)
(symbol . LeftParen)
(string . \nHello World\nGood Bye\nok\n)
(symbol . RightParen)
(symbol . Semicolon)
(keyword . Keyword_print)
(symbol . LeftParen)
(string . Print a slash n - \\n.\n)
(symbol . RightParen)
(symbol . Semicolon)
End_of_input
</pre>
 
=={{header|Perl}}==
 
<syntaxhighlight lang="perl">#!/usr/bin/env perl
 
use strict;
Line 3,827 ⟶ 14,342:
($line, $col)
}
}</syntaxhighlight>
 
{{out|case=test case 3}}
Line 3,870 ⟶ 14,385:
===Alternate Perl Solution===
Tested on perl v5.26.1
<syntaxhighlight lang="perl">#!/usr/bin/perl
 
use strict; # lex.pl - source to tokens
Line 3,906 ⟶ 14,421:
1 + $` =~ tr/\n//, 1 + length $` =~ s/.*\n//sr, $^R;
}
printf "%5d %7d %s\n", 1 + tr/\n//, 1, 'End_of_input';</syntaxhighlight>
 
=={{header|Perl 6}}==
This is more complicated than strictly necessary for this task. It is set up to be easily adapted to do syntax analysis.
 
(Note: there are several bogus comments added solely to help with syntax highlighting.)
 
{{works with|Rakudo|2016.08}}
 
<syntaxhighlight lang="perl6">grammar tiny_C {
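    # TOP: optional leading whitespace, then whitespace-separated tokens,
    # then trailing whitespace and end-of-input.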
rule TOP { ^ <.whitespace>? <tokens> + % <.whitespace> <.whitespace> <eoi> }
 
rule whitespace { [ <comment> + % <ws> | <ws> ] }
 
token comment { '/*' ~ '*/' .*? }
 
token tokens {
[
| <operator> { make $/<operator>.ast }
| <keyword> { make $/<keyword>.ast }
| <symbol> { make $/<symbol>.ast }
| <identifier> { make $/<identifier>.ast }
| <integer> { make $/<integer>.ast }
| <char> { make $/<char>.ast }
| <string> { make $/<string>.ast }
| <error>
]
}
 
proto token operator {*}
token operator:sym<*> { '*' { make 'Op_multiply' } }
token operator:sym</> { '/'<!before '*'> { make 'Op_divide' } }
token operator:sym<%> { '%' { make 'Op_mod' } }
token operator:sym<+> { '+' { make 'Op_add' } }
token operator:sym<-> { '-' { make 'Op_subtract' } }
token operator:sym('<='){ '<=' { make 'Op_lessequal' } }
token operator:sym('<') { '<' { make 'Op_less' } }
token operator:sym('>='){ '>=' { make 'Op_greaterequal'} }
token operator:sym('>') { '>' { make 'Op_greater' } }
token operator:sym<==> { '==' { make 'Op_equal' } }
token operator:sym<!=> { '!=' { make 'Op_notequal' } }
token operator:sym<!> { '!' { make 'Op_not' } }
token operator:sym<=> { '=' { make 'Op_assign' } }
token operator:sym<&&> { '&&' { make 'Op_and' } }
token operator:sym<||> { '||' { make 'Op_or' } }
 
proto token keyword {*}
token keyword:sym<if> { 'if' { make 'Keyword_if' } }
token keyword:sym<else> { 'else' { make 'Keyword_else' } }
token keyword:sym<putc> { 'putc' { make 'Keyword_putc' } }
token keyword:sym<while> { 'while' { make 'Keyword_while' } }
token keyword:sym<print> { 'print' { make 'Keyword_print' } }
 
proto token symbol {*}
token symbol:sym<(> { '(' { make 'LeftParen' } }
token symbol:sym<)> { ')' { make 'RightParen' } }
token symbol:sym<{> { '{' { make 'LeftBrace' } }
token symbol:sym<}> { '}' { make 'RightBrace' } }
token symbol:sym<;> { ';' { make 'Semicolon' } }
token symbol:sym<,> { ',' { make 'Comma' } }
 
token identifier { <[_A..Za..z]><[_A..Za..z0..9]>* { make 'Identifier ' ~ $/ } }
token integer { <[0..9]>+ { make 'Integer ' ~ $/ } }
 
token char {
'\'' [<-[']> | '\n' | '\\\\'] '\''
{ make 'Char_Literal ' ~ $/.subst("\\n", "\n").substr(1, *-1).ord }
}
 
token string {
'"' <-["\n]>* '"' #'
{
make 'String ' ~ $/;
note 'Error: Unknown escape sequence.' and exit if (~$/ ~~ m:r/ <!after <[\\]>>[\\<-[n\\]>]<!before <[\\]>> /);
}
}
 
token eoi { $ { make 'End_of_input' } }
 
token error {
| '\'''\'' { note 'Error: Empty character constant.' and exit }
| '\'' <-[']> ** {2..*} '\'' { note 'Error: Multi-character constant.' and exit }
| '/*' <-[*]>* $ { note 'Error: End-of-file in comment.' and exit }
| '"' <-["]>* $ { note 'Error: End-of-file in string.' and exit }
| '"' <-["]>*? \n { note 'Error: End of line in string.' and exit } #'
}
}
 
sub parse_it ( $c_code ) {
my $l;
my @pos = gather for $c_code.lines>>.chars.kv -> $line, $v {
take [ $line + 1, $_ ] for 1 .. ($v+1); # v+1 for newline
$l = $line+2;
}
@pos.push: [ $l, 1 ]; # capture eoi
 
for flat $c_code<tokens>.list, $c_code<eoi> -> $m {
say join "\t", @pos[$m.from].fmt('%3d'), $m.ast;
}
}
 
my $tokenizer = tiny_C.parse(@*ARGS[0].IO.slurp);
parse_it( $tokenizer );</syntaxhighlight>
 
{{out|case=test case 3}}
<pre>
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Char_Literal 10
21 26 Char_Literal 92
22 26 Char_Literal 32
23 1 End_of_input
</pre>
 
=={{header|Phix}}==
Deviates from the task requirements in that it is written in a modular form so that the output
from one stage can be used directly in the next, rather than re-loading from a human-readable
form. If required, demo\rosetta\Compiler\extra.e (below) contains some code that achieves the latter.
Code to print the human-readable forms is likewise kept separate from any re-usable parts.
<!--<syntaxhighlight lang="phix">(phixonline)-->
<span style="color: #000080;font-style:italic;">--
-- demo\rosetta\Compiler\core.e
-- ============================
--
-- Standard declarations and routines used by lex.exw, parse.exw, cgen.exw, and interp.exw
-- (included in distribution as above, which contains some additional sanity checks)
--</span>
<span style="color: #008080;">with</span> <span style="color: #008080;">javascript_semantics</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">constant</span> <span style="color: #000000;">EOF</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">STDIN</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">STDOUT</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">1</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">enum</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">=</span><span style="color: #000000;">0</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">UNARY</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">BINARY</span><span style="color: #0000FF;">=</span><span style="color: #000000;">2</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">type</span> <span style="color: #000000;">nary</span><span style="color: #0000FF;">(</span><span style="color: #004080;">integer</span> <span style="color: #000000;">n</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">return</span> <span style="color: #000000;">n</span><span style="color: #0000FF;">=</span><span style="color: #000000;">NONE</span> <span style="color: #008080;">or</span> <span style="color: #000000;">n</span><span style="color: #0000FF;">=</span><span style="color: #000000;">UNARY</span> <span style="color: #008080;">or</span> <span style="color: #000000;">n</span><span style="color: #0000FF;">=</span><span style="color: #000000;">BINARY</span> <span style="color: #008080;">end</span> <span style="color: #008080;">type</span>
<span style="color: #008080;">global</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">tkNames</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span> <span style="color: #000080;font-style:italic;">-- eg/ie {"Op_multiply","Op_divide",..}</span>
<span style="color: #008080;">global</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">precedences</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span>
<span style="color: #008080;">global</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">narys</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span> <span style="color: #000080;font-style:italic;">-- NONE/UNARY/BINARY</span>
<span style="color: #008080;">global</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">operators</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span> <span style="color: #000080;font-style:italic;">-- eg/ie {"*","/","+","-","&lt;","&lt;=",..}</span>
<span style="color: #008080;">global</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">opcodes</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span> <span style="color: #000080;font-style:italic;">-- idx to tkNames, matching operators</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">constant</span> <span style="color: #000000;">KEYWORDS</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">new_dict</span><span style="color: #0000FF;">()</span> <span style="color: #000080;font-style:italic;">-- eg/ie {"if"=&gt;idx to tkNames}</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">enum</span> <span style="color: #000000;">OPERATOR</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">DIGIT</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">LETTER</span> <span style="color: #000080;font-style:italic;">-- character classes</span>
<span style="color: #008080;">global</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">charmap</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">repeat</span><span style="color: #0000FF;">(</span><span style="color: #000000;">0</span><span style="color: #0000FF;">,</span><span style="color: #000000;">255</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #008000;">'0'</span><span style="color: #0000FF;">..</span><span style="color: #008000;">'9'</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">DIGIT</span>
<span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #008000;">'A'</span><span style="color: #0000FF;">..</span><span style="color: #008000;">'Z'</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">LETTER</span>
<span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #008000;">'a'</span><span style="color: #0000FF;">..</span><span style="color: #008000;">'z'</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">LETTER</span>
<span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #008000;">'_'</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">LETTER</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">nary</span> <span style="color: #000000;">n</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">integer</span> <span style="color: #000000;">precedence</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">tkNames</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">tkNames</span><span style="color: #0000FF;">,</span><span style="color: #000000;">s</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">narys</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">narys</span><span style="color: #0000FF;">,</span><span style="color: #000000;">n</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">precedences</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">precedences</span><span style="color: #0000FF;">,</span><span style="color: #000000;">precedence</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">return</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">tkNames</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">string</span> <span style="color: #000000;">op</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">nary</span> <span style="color: #000000;">n</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">integer</span> <span style="color: #000000;">precedence</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #000000;">s</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">n</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">precedence</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">operators</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">operators</span><span style="color: #0000FF;">,</span><span style="color: #000000;">op</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">opcodes</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">opcodes</span><span style="color: #0000FF;">,</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">op</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span>
<span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #000000;">op</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">OPERATOR</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">res</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">tkKw</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">string</span> <span style="color: #000000;">keyword</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #000000;">s</span><span style="color: #0000FF;">)</span>
<span style="color: #7060A8;">putd</span><span style="color: #0000FF;">(</span><span style="color: #000000;">keyword</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">res</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">KEYWORDS</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">res</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">constant</span>
<span style="color: #000000;">tk_EOI</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"End_of_input"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--1</span>
<span style="color: #000000;">tk_mul</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_multiply"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"*"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">13</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--2</span>
<span style="color: #000000;">tk_div</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_divide"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"/"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">13</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--3</span>
<span style="color: #000000;">tk_mod</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_mod"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"%"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">13</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--4</span>
<span style="color: #000000;">tk_add</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_add"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"+"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">12</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--5</span>
<span style="color: #000000;">tk_sub</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_subtract"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"-"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">12</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--6</span>
<span style="color: #000000;">tk_neg</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_negate"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">UNARY</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">14</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--7</span>
<span style="color: #000000;">tk_not</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_not"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"!"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">UNARY</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">14</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--8</span>
<span style="color: #000000;">tk_lt</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_less"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"&lt;"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">10</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--9</span>
<span style="color: #000000;">tk_le</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_lessequal"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"&lt;="</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">10</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--10</span>
<span style="color: #000000;">tk_gt</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_greater"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"&gt;"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">10</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--11</span>
<span style="color: #000000;">tk_ge</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_greaterequal"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"&gt;="</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">10</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--12</span>
<span style="color: #000000;">tk_eq</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_equal"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"=="</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">9</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--13</span>
<span style="color: #000000;">tk_ne</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_notequal"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"!="</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">9</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--14</span>
<span style="color: #000000;">tk_assign</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_assign"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"="</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--15</span>
<span style="color: #000000;">tk_and</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_and"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"&&"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">5</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--16</span>
<span style="color: #000000;">tk_or</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_or"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"||"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">4</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--17</span>
<span style="color: #000000;">tk_if</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkKw</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Keyword_if"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"if"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--18</span>
<span style="color: #000000;">tk_else</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkKw</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Keyword_else"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"else"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--19</span>
<span style="color: #000000;">tk_while</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkKw</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Keyword_while"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"while"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--20</span>
<span style="color: #000000;">tk_print</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkKw</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Keyword_print"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"print"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--21</span>
<span style="color: #000000;">tk_putc</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkKw</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Keyword_putc"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"putc"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--22</span>
<span style="color: #000000;">tk_LeftParen</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"LeftParen"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"("</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--23</span>
<span style="color: #000000;">tk_RightParen</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"RightParen"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">")"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--24</span>
<span style="color: #000000;">tk_LeftBrace</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"LeftBrace"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"{"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--25</span>
<span style="color: #000000;">tk_RightBrace</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"RightBrace"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"}"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--26</span>
<span style="color: #000000;">tk_Semicolon</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Semicolon"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">";"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--27</span>
<span style="color: #000000;">tk_Comma</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Comma"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">","</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--28</span>
<span style="color: #000000;">tk_Identifier</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Identifier"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--29</span>
<span style="color: #000000;">tk_Integer</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Integer"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--30</span>
<span style="color: #000000;">tk_String</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"String"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--31</span>
<span style="color: #000000;">tk_Sequence</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Sequence"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--32</span>
<span style="color: #000000;">tk_Prints</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"tk_Prints"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--33</span>
<span style="color: #000000;">tk_Printi</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"tk_Printi"</span><span style="color: #0000FF;">)</span> <span style="color: #000080;font-style:italic;">--34</span>
<span style="color: #008080;">global</span> <span style="color: #004080;">integer</span> <span style="color: #000000;">input_file</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">STDIN</span><span style="color: #0000FF;">,</span>
<span style="color: #000000;">output_file</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">STDOUT</span>
<span style="color: #008080;">type</span> <span style="color: #000000;">strint</span><span style="color: #0000FF;">(</span><span style="color: #004080;">object</span> <span style="color: #000000;">o</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">return</span> <span style="color: #004080;">string</span><span style="color: #0000FF;">(</span><span style="color: #000000;">o</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">or</span> <span style="color: #004080;">integer</span><span style="color: #0000FF;">(</span><span style="color: #000000;">o</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">type</span>
<span style="color: #008080;">global</span> <span style="color: #000000;">strint</span> <span style="color: #000000;">tok_line</span><span style="color: #0000FF;">,</span> <span style="color: #000080;font-style:italic;">-- save of line/col at the start of</span>
<span style="color: #000000;">tok_col</span> <span style="color: #000080;font-style:italic;">-- token/comment, for result/errors</span>
<span style="color: #008080;">global</span> <span style="color: #004080;">object</span> <span style="color: #000000;">oneline</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">""</span>
<span style="color: #008080;">constant</span> <span style="color: #000000;">errfmt</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">"Line %s column %s:\n%s%s"</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">errline</span><span style="color: #0000FF;">()</span>
<span style="color: #000000;">oneline</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">substitute</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">trim</span><span style="color: #0000FF;">(</span><span style="color: #000000;">oneline</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\r\n"</span><span style="color: #0000FF;">),</span><span style="color: #008000;">'\t'</span><span style="color: #0000FF;">,</span><span style="color: #008000;">' '</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">padding</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">repeat</span><span style="color: #0000FF;">(</span><span style="color: #008000;">' '</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok_col</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">return</span> <span style="color: #7060A8;">sprintf</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"%s\n%s^ "</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">oneline</span><span style="color: #0000FF;">,</span><span style="color: #000000;">padding</span><span style="color: #0000FF;">})</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">procedure</span> <span style="color: #000000;">error</span><span style="color: #0000FF;">(</span><span style="color: #004080;">sequence</span> <span style="color: #000000;">msg</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">args</span><span style="color: #0000FF;">={})</span>
<span style="color: #008080;">if</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">args</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">msg</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">sprintf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">msg</span><span style="color: #0000FF;">,</span><span style="color: #000000;">args</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">el</span> <span style="color: #0000FF;">=</span> <span style="color: #008080;">iff</span><span style="color: #0000FF;">(</span><span style="color: #004080;">atom</span><span style="color: #0000FF;">(</span><span style="color: #000000;">oneline</span><span style="color: #0000FF;">)?</span><span style="color: #008000;">""</span><span style="color: #0000FF;">:</span><span style="color: #000000;">errline</span><span style="color: #0000FF;">())</span>
<span style="color: #008080;">if</span> <span style="color: #004080;">integer</span><span style="color: #0000FF;">(</span><span style="color: #000000;">tok_line</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span> <span style="color: #000000;">tok_line</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">sprintf</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"%d"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok_line</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">if</span> <span style="color: #004080;">integer</span><span style="color: #0000FF;">(</span><span style="color: #000000;">tok_col</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span> <span style="color: #000000;">tok_col</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">sprintf</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"%d"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok_col</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">STDOUT</span><span style="color: #0000FF;">,</span><span style="color: #000000;">errfmt</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">tok_line</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok_col</span><span style="color: #0000FF;">,</span><span style="color: #000000;">el</span><span style="color: #0000FF;">,</span><span style="color: #000000;">msg</span><span style="color: #0000FF;">})</span>
<span style="color: #0000FF;">{}</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">wait_key</span><span style="color: #0000FF;">()</span>
<span style="color: #7060A8;">abort</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">procedure</span>
<span style="color: #008080;">include</span> <span style="color: #000000;">js_io</span><span style="color: #0000FF;">.</span><span style="color: #000000;">e</span> <span style="color: #000080;font-style:italic;">-- fake file i/o for running under pwa/p2js</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">open_file</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">file_name</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">string</span> <span style="color: #000000;">mode</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">fn</span> <span style="color: #0000FF;">=</span> <span style="color: #008080;">iff</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">platform</span><span style="color: #0000FF;">()=</span><span style="color: #004600;">JS</span><span style="color: #0000FF;">?</span><span style="color: #000000;">js_open</span><span style="color: #0000FF;">(</span><span style="color: #000000;">file_name</span><span style="color: #0000FF;">)</span>
<span style="color: #0000FF;">:</span><span style="color: #7060A8;">open</span><span style="color: #0000FF;">(</span><span style="color: #000000;">file_name</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">mode</span><span style="color: #0000FF;">))</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">fn</span><span style="color: #0000FF;"><=</span><span style="color: #000000;">0</span> <span style="color: #008080;">then</span>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">STDOUT</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"Could not open %s"</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">file_name</span><span style="color: #0000FF;">})</span>
<span style="color: #0000FF;">{}</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">wait_key</span><span style="color: #0000FF;">()</span>
<span style="color: #7060A8;">abort</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">fn</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">procedure</span> <span style="color: #000000;">open_files</span><span style="color: #0000FF;">(</span><span style="color: #004080;">sequence</span> <span style="color: #000000;">cl</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">cl</span><span style="color: #0000FF;">)></span><span style="color: #000000;">2</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">input_file</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">open_file</span><span style="color: #0000FF;">(</span><span style="color: #000000;">cl</span><span style="color: #0000FF;">[</span><span style="color: #000000;">3</span><span style="color: #0000FF;">],</span><span style="color: #008000;">"r"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">cl</span><span style="color: #0000FF;">)></span><span style="color: #000000;">3</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">output_file</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">open_file</span><span style="color: #0000FF;">(</span><span style="color: #000000;">cl</span><span style="color: #0000FF;">[</span><span style="color: #000000;">4</span><span style="color: #0000FF;">],</span><span style="color: #008000;">"w"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">procedure</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">procedure</span> <span style="color: #000000;">close_files</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">if</span> <span style="color: #7060A8;">platform</span><span style="color: #0000FF;">()!=</span><span style="color: #004600;">JS</span> <span style="color: #008080;">then</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">input_file</span><span style="color: #0000FF;">!=</span><span style="color: #000000;">STDIN</span> <span style="color: #008080;">then</span> <span style="color: #7060A8;">close</span><span style="color: #0000FF;">(</span><span style="color: #000000;">input_file</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">output_file</span><span style="color: #0000FF;">!=</span><span style="color: #000000;">STDOUT</span> <span style="color: #008080;">then</span> <span style="color: #7060A8;">close</span><span style="color: #0000FF;">(</span><span style="color: #000000;">output_file</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">procedure</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">function</span> <span style="color: #000000;">enquote</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">return</span> <span style="color: #7060A8;">sprintf</span><span style="color: #0000FF;">(</span><span style="color: #008000;">`"%s"`</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">substitute</span><span style="color: #0000FF;">(</span><span style="color: #000000;">s</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\\n"</span><span style="color: #0000FF;">))</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">function</span> <span style="color: #000000;">unquote</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">[</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]!=</span><span style="color: #008000;">'\"'</span> <span style="color: #008080;">then</span> <span style="color: #0000FF;">?</span><span style="color: #000000;">9</span><span style="color: #0000FF;">/</span><span style="color: #000000;">0</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">[$]!=</span><span style="color: #008000;">'\"'</span> <span style="color: #008080;">then</span> <span style="color: #0000FF;">?</span><span style="color: #000000;">9</span><span style="color: #0000FF;">/</span><span style="color: #000000;">0</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000000;">s</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">substitute</span><span style="color: #0000FF;">(</span><span style="color: #000000;">s</span><span style="color: #0000FF;">[</span><span style="color: #000000;">2</span><span style="color: #0000FF;">..-</span><span style="color: #000000;">2</span><span style="color: #0000FF;">],</span><span style="color: #008000;">"\\n"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">s</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<!--</syntaxhighlight>-->
For running under pwa/p2js, we also have a "fake file i/o" component:

<!--<syntaxhighlight lang="phix">(phixonline)-->
<span style="color: #000080;font-style:italic;">--
-- demo\rosetta\Compiler\js_io.e
-- =============================
--
-- Fake file i/o for running under pwa/p2js in a browser
-- Does not cover the human readable reload parts of extra.e
--</span>
<span style="color: #008080;">with</span> <span style="color: #008080;">javascript_semantics</span>
<span style="color: #004080;">sequence</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">known_files</span><span style="color: #0000FF;">,</span><span style="color: #000000;">kfc</span><span style="color: #0000FF;">}</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">columnize</span><span style="color: #0000FF;">({</span>
<span style="color: #0000FF;">{</span><span style="color: #008000;">"test3.c"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"""
/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
*/
/* Print */ print /* Sub */ -
/* Putc */ putc /* Lss */ &lt;
/* If */ if /* Gtr */ &gt;
/* Else */ else /* Leq */ &lt;=
/* While */ while /* Geq */ &gt;=
/* Lbrace */ { /* Eq */ ==
/* Rbrace */ } /* Neq */ !=
/* Lparen */ ( /* And */ &&
/* Rparen */ ) /* Or */ ||
/* Uminus */ - /* Semi */ ;
/* Not */ ! /* Comma */ ,
/* Mul */ * /* Assign */ =
/* Div */ / /* Integer */ 42
/* Mod */ % /* String */ "String literal"
/* Add */ + /* Ident */ variable_name
/* character literal */ '\n'
/* character literal */ '\\'
/* character literal */ ' '
"""</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)},</span>
<span style="color: #0000FF;">{</span><span style="color: #008000;">"test4.c"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"""
/*** test printing, embedded \n and comments with lots of '*' ***/
print(42);
print("\nHello World\nGood Bye\nok\n");
print("Print a slash n - \\n.\n");
"""</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)},</span>
<span style="color: #0000FF;">{</span><span style="color: #008000;">"primes.c"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"""
/*
Simple prime number generator
*/
count = 1;
n = 1;
limit = 100;
while (n &lt; limit) {
k=3;
p=1;
n=n+2;
while ((k*k&lt;=n) && (p)) {
p=n/k*k!=n;
k=k+2;
}
if (p) {
print(n, " is prime\n");
count = count + 1;
}
}
print("Total primes found: ", count, "\n");
"""</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)},</span>
<span style="color: #0000FF;">{</span><span style="color: #008000;">"gcd.c"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"""
/* Compute the gcd of 1071, 1029: 21 */
a = 1071;
b = 1029;
while (b != 0) {
new_a = b;
b = a % b;
a = new_a;
}
print(a);
"""</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)},</span>
<span style="color: #0000FF;">{</span><span style="color: #008000;">"Header.h"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"""
#define area(h, w) h * w
"""</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)},</span>
<span style="color: #0000FF;">{</span><span style="color: #008000;">"Source.t"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"""
#include "Header.h"
#define width 5
#define height 6
area = #area(height, width)#;
"""</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)}})</span>
<span style="color: #004080;">sequence</span> <span style="color: #000000;">linenos</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">repeat</span><span style="color: #0000FF;">(-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">known_files</span><span style="color: #0000FF;">))</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">function</span> <span style="color: #000000;">js_open</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">filename</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">fn</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">filename</span><span style="color: #0000FF;">,</span><span style="color: #000000;">known_files</span><span style="color: #0000FF;">)</span>
<span style="color: #7060A8;">assert</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">!=</span><span style="color: #000000;">0</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">linenos</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">fn</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">function</span> <span style="color: #000000;">js_gets</span><span style="color: #0000FF;">(</span><span style="color: #004080;">integer</span> <span style="color: #000000;">fn</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">lineno</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">linenos</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">]+</span><span style="color: #000000;">1</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">lineno</span><span style="color: #0000FF;"><=</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">kfc</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">])</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">linenos</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">lineno</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">kfc</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">][</span><span style="color: #000000;">lineno</span><span style="color: #0000FF;">]</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">EOF</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<!--</syntaxhighlight>-->
The main lexer is also written to be reusable by later stages.
<!--<syntaxhighlight lang="phix">(phixonline)-->
<span style="color: #000080;font-style:italic;">--
-- demo\rosetta\Compiler\lex.e
-- ==============================
--
-- The reusable part of lex.exw
-- This is only kept separate from core.e for consistency with later modules.</span>
<span style="color: #008080;">with</span> <span style="color: #008080;">javascript_semantics</span>
<span style="color: #008080;">include</span> <span style="color: #000000;">core</span><span style="color: #0000FF;">.</span><span style="color: #000000;">e</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">' '</span><span style="color: #0000FF;">,</span>
<span style="color: #000000;">line</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span><span style="color: #0000FF;">,</span>
<span style="color: #000000;">col</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span>
<span style="color: #008080;">procedure</span> <span style="color: #000000;">eof</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">error</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"%s in %s literal"</span><span style="color: #0000FF;">,{</span><span style="color: #008080;">iff</span><span style="color: #0000FF;">(</span><span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #000000;">EOF</span><span style="color: #0000FF;">?</span><span style="color: #008000;">"EOF"</span><span style="color: #0000FF;">:</span><span style="color: #008000;">"EOL"</span><span style="color: #0000FF;">),</span><span style="color: #000000;">s</span><span style="color: #0000FF;">})</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">procedure</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">while</span> <span style="color: #000000;">1</span> <span style="color: #008080;">do</span>
<span style="color: #000000;">col</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">oneline</span><span style="color: #0000FF;">=</span><span style="color: #000000;">EOF</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">EOF</span>
<span style="color: #008080;">exit</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">col</span><span style="color: #0000FF;">></span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">oneline</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">line</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span>
<span style="color: #000000;">col</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span>
<span style="color: #000000;">oneline</span> <span style="color: #0000FF;">=</span> <span style="color: #008080;">iff</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">platform</span><span style="color: #0000FF;">()=</span><span style="color: #004600;">JS</span><span style="color: #0000FF;">?</span><span style="color: #000000;">js_gets</span><span style="color: #0000FF;">(</span><span style="color: #000000;">input_file</span><span style="color: #0000FF;">)</span>
<span style="color: #0000FF;">:</span><span style="color: #7060A8;">gets</span><span style="color: #0000FF;">(</span><span style="color: #000000;">input_file</span><span style="color: #0000FF;">))</span>
<span style="color: #008080;">else</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">oneline</span><span style="color: #0000FF;">[</span><span style="color: #000000;">col</span><span style="color: #0000FF;">]</span>
<span style="color: #008080;">exit</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">while</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">ch</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #000080;font-style:italic;">-- for pwa/p2js (JavaScript *really* dislikes tabs in strings):
--constant whitespace = " \t\r\n\x0B\xA0"</span>
<span style="color: #008080;">constant</span> <span style="color: #000000;">whitespace</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{</span><span style="color: #008000;">' '</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'\t'</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'\r'</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'\n'</span><span style="color: #0000FF;">,</span><span style="color: #000000;">#0B</span><span style="color: #0000FF;">,</span><span style="color: #000000;">#A0</span><span style="color: #0000FF;">}</span>
<span style="color: #000080;font-style:italic;">-- (0x0B is Vertical Tab, 0xA0 is Non-breaking space)</span>
<span style="color: #008080;">procedure</span> <span style="color: #000000;">skipspacesandcomments</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">while</span> <span style="color: #000000;">1</span> <span style="color: #008080;">do</span>
<span style="color: #008080;">if</span> <span style="color: #008080;">not</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">ch</span><span style="color: #0000FF;">,</span><span style="color: #000000;">whitespace</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #008000;">'/'</span> <span style="color: #008080;">and</span> <span style="color: #000000;">col</span><span style="color: #0000FF;"><</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">oneline</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">and</span> <span style="color: #000000;">oneline</span><span style="color: #0000FF;">[</span><span style="color: #000000;">col</span><span style="color: #0000FF;">+</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]=</span><span style="color: #008000;">'*'</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">tok_line</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">line</span> <span style="color: #000080;font-style:italic;">-- (in case of EOF error)</span>
<span style="color: #000000;">tok_col</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">col</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span> <span style="color: #000080;font-style:italic;">-- (can be EOF)</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span> <span style="color: #000080;font-style:italic;">-- ( "" )</span>
<span style="color: #008080;">while</span> <span style="color: #000000;">1</span> <span style="color: #008080;">do</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #008000;">'*'</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #008000;">'/'</span> <span style="color: #008080;">then</span> <span style="color: #008080;">exit</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #000000;">EOF</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">error</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"EOF in comment"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">else</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">while</span>
<span style="color: #008080;">else</span>
<span style="color: #008080;">exit</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">while</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">procedure</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">escape_char</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span> <span style="color: #000080;font-style:italic;">-- (discard the '\\')</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #008000;">'n'</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">'\n'</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #008000;">'\\'</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">'\\'</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #000000;">EOF</span>
<span style="color: #008080;">or</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #008000;">'\n'</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">eof</span><span style="color: #0000FF;">(</span><span style="color: #000000;">s</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">else</span>
<span style="color: #000000;">error</span><span style="color: #0000FF;">(</span><span style="color: #008000;">`unknown escape sequence \%c`</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">ch</span><span style="color: #0000FF;">})</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">ch</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">char_lit</span><span style="color: #0000FF;">()</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">startch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">,</span>
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span> <span style="color: #000080;font-style:italic;">-- (skip opening quote, save res)</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #000000;">startch</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">error</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"empty character constant"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #008000;">'\\'</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">escape_char</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"character"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #000000;">EOF</span>
<span style="color: #008080;">or</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #008000;">'\n'</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">eof</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"character"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">!=</span><span style="color: #000000;">startch</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">error</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"multi-character constant"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">return</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">tk_Integer</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">res</span><span style="color: #0000FF;">}</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">string_lit</span><span style="color: #0000FF;">()</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">startch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">ch</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">text</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">""</span>
<span style="color: #008080;">while</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()!=</span><span style="color: #000000;">startch</span> <span style="color: #008080;">do</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #000000;">EOF</span>
<span style="color: #008080;">or</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #008000;">'\n'</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">eof</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"string"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #008000;">'\\'</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">escape_char</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"string"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000000;">text</span> <span style="color: #0000FF;">&=</span> <span style="color: #000000;">ch</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">while</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">return</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">tk_String</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">text</span><span style="color: #0000FF;">}</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">get_op</span><span style="color: #0000FF;">()</span>
<span style="color: #000080;font-style:italic;">-- sequence operator = {ch}</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">operator</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">""</span><span style="color: #0000FF;">&</span><span style="color: #000000;">ch</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">while</span> <span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #000000;">ch</span><span style="color: #0000FF;">]=</span><span style="color: #000000;">OPERATOR</span>
<span style="color: #008080;">and</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">operator</span><span style="color: #0000FF;">&</span><span style="color: #000000;">ch</span><span style="color: #0000FF;">,</span><span style="color: #000000;">operators</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span>
<span style="color: #000080;font-style:italic;">-- (^ ie/eg merge "&gt;=", but not ");")</span>
<span style="color: #000000;">operator</span> <span style="color: #0000FF;">&=</span> <span style="color: #000000;">ch</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">while</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">k</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">operator</span><span style="color: #0000FF;">,</span><span style="color: #000000;">operators</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">k</span><span style="color: #0000FF;">=</span><span style="color: #000000;">0</span> <span style="color: #008080;">then</span> <span style="color: #000000;">error</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"unknown operator"</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">opcodes</span><span style="color: #0000FF;">[</span><span style="color: #000000;">k</span><span style="color: #0000FF;">],</span> <span style="color: #000000;">0</span><span style="color: #0000FF;">}</span> <span style="color: #000080;font-style:italic;">-- (0 unused)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">get_int</span><span style="color: #0000FF;">()</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">i</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span>
<span style="color: #008080;">while</span> <span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #000000;">ch</span><span style="color: #0000FF;">]=</span><span style="color: #000000;">DIGIT</span> <span style="color: #008080;">do</span>
<span style="color: #000000;">i</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">*</span><span style="color: #000000;">10</span> <span style="color: #0000FF;">+</span> <span style="color: #0000FF;">(</span><span style="color: #000000;">ch</span><span style="color: #0000FF;">-</span><span style="color: #008000;">'0'</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">while</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #000000;">ch</span><span style="color: #0000FF;">]=</span><span style="color: #000000;">LETTER</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">error</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"invalid number"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">tk_Integer</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">}</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">get_ident</span><span style="color: #0000FF;">()</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">text</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">""</span>
<span style="color: #008080;">while</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #000000;">ch</span><span style="color: #0000FF;">],{</span><span style="color: #000000;">LETTER</span><span style="color: #0000FF;">,</span><span style="color: #000000;">DIGIT</span><span style="color: #0000FF;">})</span> <span style="color: #008080;">do</span>
<span style="color: #000000;">text</span> <span style="color: #0000FF;">&=</span> <span style="color: #000000;">ch</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">while</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">keyword</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">getd</span><span style="color: #0000FF;">(</span><span style="color: #000000;">text</span><span style="color: #0000FF;">,</span><span style="color: #000000;">KEYWORDS</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">keyword</span><span style="color: #0000FF;">!=</span><span style="color: #004600;">NULL</span> <span style="color: #008080;">then</span>
<span style="color: #008080;">return</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">keyword</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">0</span><span style="color: #0000FF;">}</span> <span style="color: #000080;font-style:italic;">-- (0 unused)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">tk_Identifier</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">text</span><span style="color: #0000FF;">}</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">get_token</span><span style="color: #0000FF;">()</span>
<span style="color: #000000;">skipspacesandcomments</span><span style="color: #0000FF;">()</span>
<span style="color: #000000;">tok_line</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">line</span>
<span style="color: #000000;">tok_col</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">col</span>
<span style="color: #008080;">switch</span> <span style="color: #000000;">ch</span> <span style="color: #008080;">do</span>
<span style="color: #008080;">case</span> <span style="color: #000000;">EOF</span> <span style="color: #008080;">then</span> <span style="color: #008080;">return</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">tk_EOI</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">0</span><span style="color: #0000FF;">}</span> <span style="color: #000080;font-style:italic;">-- (0 unused)</span>
<span style="color: #008080;">case</span> <span style="color: #008000;"><nowiki>'\''</nowiki></span> <span style="color: #008080;">then</span> <span style="color: #008080;">return</span> <span style="color: #000000;">char_lit</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">case</span> <span style="color: #008000;">'"'</span> <span style="color: #008080;">then</span> <span style="color: #008080;">return</span> <span style="color: #000000;">string_lit</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">else</span>
<span style="color: #008080;">switch</span> <span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #000000;">ch</span><span style="color: #0000FF;">]</span> <span style="color: #008080;">do</span>
<span style="color: #008080;">case</span> <span style="color: #000000;">OPERATOR</span> <span style="color: #008080;">then</span> <span style="color: #008080;">return</span> <span style="color: #000000;">get_op</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">case</span> <span style="color: #000000;">DIGIT</span> <span style="color: #008080;">then</span> <span style="color: #008080;">return</span> <span style="color: #000000;">get_int</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">case</span> <span style="color: #000000;">LETTER</span> <span style="color: #008080;">then</span> <span style="color: #008080;">return</span> <span style="color: #000000;">get_ident</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">else</span> <span style="color: #000000;">error</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"unrecognized character: (%d)"</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">ch</span><span style="color: #0000FF;">})</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">switch</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">switch</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">function</span> <span style="color: #000000;">lex</span><span style="color: #0000FF;">()</span>
<span style="color: #004080;">sequence</span> <span style="color: #000000;">toks</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">tok</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span>
<span style="color: #004080;">object</span> <span style="color: #000000;">v</span>
<span style="color: #008080;">while</span> <span style="color: #000000;">tok</span><span style="color: #0000FF;">!=</span><span style="color: #000000;">tk_EOI</span> <span style="color: #008080;">do</span>
<span style="color: #0000FF;">{</span><span style="color: #000000;">tok</span><span style="color: #0000FF;">,</span><span style="color: #000000;">v</span><span style="color: #0000FF;">}</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">get_token</span><span style="color: #0000FF;">()</span>
<span style="color: #000000;">toks</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">toks</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">tok_line</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok_col</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok</span><span style="color: #0000FF;">,</span><span style="color: #000000;">v</span><span style="color: #0000FF;">})</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">while</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">toks</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<!--</syntaxhighlight>-->
Optional: if you need human-readable output/input at each (later) stage, so that you can chain the stages via pipes or intermediate files:
<!--<syntaxhighlight lang="phix">-->
<span style="color: #000080;font-style:italic;">--
-- demo\rosetta\Compiler\extra.e
-- =============================
--
-- Routines to reload human-readable files (deviation from task requirement)
--</span>
<span style="color: #008080;">without</span> <span style="color: #008080;">js</span> <span style="color: #000080;font-style:italic;">-- (file i/o)
--The following can be used to load .lex files, as created by lex.exw:
-- (in place of the existing get_tok() in parse.e)</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">get_tok</span><span style="color: #0000FF;">()</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">line</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">trim</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">gets</span><span style="color: #0000FF;">(</span><span style="color: #000000;">input_file</span><span style="color: #0000FF;">))</span>
<span style="color: #004080;">sequence</span> <span style="color: #000000;">tok</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #000000;">line</span><span style="color: #0000FF;">,</span><span style="color: #008000;">' '</span><span style="color: #0000FF;">,</span><span style="color: #000000;">limit</span><span style="color: #0000FF;">:=</span><span style="color: #000000;">4</span><span style="color: #0000FF;">,</span><span style="color: #000000;">no_empty</span><span style="color: #0000FF;">:=</span><span style="color: #000000;">1</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">k</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">tok</span><span style="color: #0000FF;">[</span><span style="color: #000000;">3</span><span style="color: #0000FF;">],</span><span style="color: #000000;">tkNames</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">k</span><span style="color: #0000FF;">=</span><span style="color: #000000;">0</span> <span style="color: #008080;">then</span> <span style="color: #0000FF;">?</span><span style="color: #000000;">9</span><span style="color: #0000FF;">/</span><span style="color: #000000;">0</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000000;">tok</span><span style="color: #0000FF;">[</span><span style="color: #000000;">3</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">k</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">tok</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #000080;font-style:italic;">--The following can be used to load .ast files, as created by parse.exw:
-- (in place of the existing lex()/parse() pairs in cgen.exw and interp.exw)</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">load_ast</span><span style="color: #0000FF;">()</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">line</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">trim</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">gets</span><span style="color: #0000FF;">(</span><span style="color: #000000;">input_file</span><span style="color: #0000FF;">))</span>
<span style="color: #000080;font-style:italic;">-- Each line has at least one token</span>
<span style="color: #004080;">sequence</span> <span style="color: #000000;">node</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #000000;">line</span><span style="color: #0000FF;">,</span><span style="color: #008000;">' '</span><span style="color: #0000FF;">,</span><span style="color: #000000;">limit</span><span style="color: #0000FF;">:=</span><span style="color: #000000;">2</span><span style="color: #0000FF;">,</span><span style="color: #000000;">no_empty</span><span style="color: #0000FF;">:=</span><span style="color: #000000;">1</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">node_type</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">node</span><span style="color: #0000FF;">[</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">node_type</span> <span style="color: #0000FF;">==</span> <span style="color: #008000;">";"</span> <span style="color: #008080;">then</span> <span style="color: #000080;font-style:italic;">-- a terminal node</span>
<span style="color: #008080;">return</span> <span style="color: #004600;">NULL</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">n_type</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">node_type</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tkNames</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">n_type</span><span style="color: #0000FF;">=</span><span style="color: #000000;">0</span> <span style="color: #008080;">then</span> <span style="color: #0000FF;">?</span><span style="color: #000000;">9</span><span style="color: #0000FF;">/</span><span style="color: #000000;">0</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000080;font-style:italic;">-- A line with two tokens is a leaf node
-- Leaf nodes are: Identifier, Integer, String
-- The 2nd token is the value</span>
<span style="color: #008080;">if</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">node</span><span style="color: #0000FF;">)></span><span style="color: #000000;">1</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">node</span><span style="color: #0000FF;">[</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">n_type</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">n_type</span><span style="color: #0000FF;">=</span><span style="color: #000000;">tk_Integer</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">node</span><span style="color: #0000FF;">[</span><span style="color: #000000;">2</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">to_integer</span><span style="color: #0000FF;">(</span><span style="color: #000000;">node</span><span style="color: #0000FF;">[</span><span style="color: #000000;">2</span><span style="color: #0000FF;">])</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">n_type</span><span style="color: #0000FF;">=</span><span style="color: #000000;">tk_String</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">node</span><span style="color: #0000FF;">[</span><span style="color: #000000;">2</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">unquote</span><span style="color: #0000FF;">(</span><span style="color: #000000;">node</span><span style="color: #0000FF;">[</span><span style="color: #000000;">2</span><span style="color: #0000FF;">])</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">node</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #004080;">object</span> <span style="color: #000000;">left</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">load_ast</span><span style="color: #0000FF;">()</span>
<span style="color: #004080;">object</span> <span style="color: #000000;">right</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">load_ast</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">return</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">n_type</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">left</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">right</span><span style="color: #0000FF;">}</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<!--</syntaxhighlight>-->
Finally, a simple test driver for the specific task:
<!--<syntaxhighlight lang="phix">(phixonline)-->
<span style="color: #000080;font-style:italic;">--
-- demo\rosetta\Compiler\lex.exw
-- =============================
--</span>
<span style="color: #008080;">with</span> <span style="color: #008080;">javascript_semantics</span>
<span style="color: #008080;">include</span> <span style="color: #000000;">lex</span><span style="color: #0000FF;">.</span><span style="color: #000000;">e</span>
<span style="color: #008080;">procedure</span> <span style="color: #000000;">main</span><span style="color: #0000FF;">(</span><span style="color: #004080;">sequence</span> <span style="color: #000000;">cl</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">open_files</span><span style="color: #0000FF;">(</span><span style="color: #000000;">cl</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">sequence</span> <span style="color: #000000;">toks</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">lex</span><span style="color: #0000FF;">()</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">tok</span>
<span style="color: #004080;">object</span> <span style="color: #000000;">v</span>
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">toks</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span>
<span style="color: #0000FF;">{</span><span style="color: #000000;">tok_line</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok_col</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok</span><span style="color: #0000FF;">,</span><span style="color: #000000;">v</span><span style="color: #0000FF;">}</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">toks</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]</span>
<span style="color: #008080;">switch</span> <span style="color: #000000;">tok</span> <span style="color: #008080;">do</span>
<span style="color: #008080;">case</span> <span style="color: #000000;">tk_Identifier</span><span style="color: #0000FF;">:</span> <span style="color: #000000;">v</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">sprintf</span><span style="color: #0000FF;">(</span><span style="color: #008000;">" %s"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">v</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">case</span> <span style="color: #000000;">tk_Integer</span><span style="color: #0000FF;">:</span> <span style="color: #000000;">v</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">sprintf</span><span style="color: #0000FF;">(</span><span style="color: #008000;">" %5d"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">v</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">case</span> <span style="color: #000000;">tk_String</span><span style="color: #0000FF;">:</span> <span style="color: #000000;">v</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">sprintf</span><span style="color: #0000FF;">(</span><span style="color: #008000;">" %s"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">enquote</span><span style="color: #0000FF;">(</span><span style="color: #000000;">v</span><span style="color: #0000FF;">))</span>
<span style="color: #008080;">else</span> <span style="color: #000000;">v</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">""</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">switch</span>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">output_file</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"%5d %5d %-10s%s\n"</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">tok_line</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok_col</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tkNames</span><span style="color: #0000FF;">[</span><span style="color: #000000;">tok</span><span style="color: #0000FF;">],</span><span style="color: #000000;">v</span><span style="color: #0000FF;">})</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
<span style="color: #000000;">close_files</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">procedure</span>
<span style="color: #000080;font-style:italic;">--main(command_line())</span>
<span style="color: #000000;">main</span><span style="color: #0000FF;">({</span><span style="color: #000000;">0</span><span style="color: #0000FF;">,</span><span style="color: #000000;">0</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"test4.c"</span><span style="color: #0000FF;">})</span>
<!--</syntaxhighlight>-->
{{out}}
<pre>
2 1 Keyword_print
2 6 LeftParen
2 7 Integer 42
2 9 RightParen
2 10 Semicolon
3 1 Keyword_print
3 6 LeftParen
3 7 String "\nHello World\nGood Bye\nok\n"
3 38 RightParen
3 39 Semicolon
4 1 Keyword_print
4 6 LeftParen
4 7 String "Print a slash n - \n.\n"
4 33 RightParen
4 34 Semicolon
5 1 End_of_input
</pre>
 
=={{header|Prolog}}==
<syntaxhighlight lang="prolog">/*
	Test harness for the analyzer, not needed if we are actually using the output.
*/
load_file(File, Input) :-
	read_file_to_codes(File, Codes, []),
	maplist(char_code, Chars, Codes),
	atom_chars(Input,Chars).

test_file(File) :-
	load_file(File, Input),
	tester(Input).

tester(S) :-
	atom_chars(S,Chars),
	tokenize(Chars,L),
	maplist(print_tok, L),
	!.

print_tok(L) :-
	L =.. [Op,Line,Pos],
	format('~d\t~d\t~p~n', [Line,Pos,Op]).
print_tok(string(Value,Line,Pos)) :-
	format('~d\t~d\tstring\t\t"~w"~n', [Line,Pos,Value]).
print_tok(identifier(Value,Line,Pos)) :-
	format('~d\t~d\tidentifier\t~p~n', [Line,Pos,Value]).
print_tok(integer(Value,Line,Pos)) :-
	format('~d\t~d\tinteger\t\t~p~n', [Line,Pos,Value]).

/*
	Tokenize

	run the input over a DCG to get out the tokens.

	In - a list of chars to tokenize
	Tokens = a list of tokens (excluding spaces).
*/
tokenize(In,RelTokens) :-
	newline_positions(In,1,NewLines),
	tokenize(In,[0|NewLines],1,1,Tokens),
	check_for_exceptions(Tokens),
	exclude(token_name(space),Tokens,RelTokens).

tokenize([],NewLines,Pos,LineNo,[end_of_input(LineNo,Offset)]) :-
	position_offset(NewLines,Pos,Offset).
tokenize(In,NewLines,Pos,LineNo,Out) :-
	position_offset(NewLines,Pos,Offset),
	phrase(tok(Tok,TokLen,LineNo,Offset),In,T),
	(
		Tok = [] -> Out = Toks
	;	Out = [Tok|Toks]
	),
	Pos1 is Pos + TokLen,
	update_line_no(LineNo,NewLines,Pos1,NewLineNo,NewNewLines),
	tokenize(T,NewNewLines,Pos1,NewLineNo,Toks).

update_line_no(LNo,[L],_,LNo,[L]).
update_line_no(LNo,[L,Nl|T],Pos,LNo,[L,Nl|T]) :-
	Pos =< Nl.
update_line_no(LNo,[_,Nl|T],Pos,LNo2,Nlines) :-
	Pos > Nl,
	succ(LNo,LNo1),
	update_line_no(LNo1,[Nl|T],Pos,LNo2,Nlines).

position_offset([Line|_],Pos,Offset) :- Offset is Pos - Line.

token_name(Name,Tok) :- functor(Tok,Name,_).

% Get a list of all the newlines and their position in the data
% This is used to create accurate row/column numbers.
newline_positions([],N,[N]).
newline_positions(['\n'|T],N,[N|Nt]) :- succ(N,N1), newline_positions(T,N1,Nt).
newline_positions([C|T],N,Nt) :- dif(C,'\n'), succ(N,N1), newline_positions(T,N1,Nt).

% The tokenizer can tokenize some things that it shouldn't, deal with them here.
check_for_exceptions([]). % all ok
check_for_exceptions([op_divide(L,P),op_multiply(_,_)|_]) :-
	format(atom(Error), 'Unclosed comment at line ~d,~d', [L,P]),
	throw(Error).
check_for_exceptions([integer(_,L,P),identifier(_,_,_)|_]) :-
	format(atom(Error), 'Invalid identifier at line ~d,~d', [L,P]),
	throw(Error).
check_for_exceptions([_|T]) :- check_for_exceptions(T).

/*
	A set of helper DCGs for the more complicated token types
*/
:- set_prolog_flag(double_quotes, chars).

identifier(I) --> c_types(I,csym).
identifier(['_']) --> ['_'].
identifier([]) --> [].

integer_(I,L) --> c_types(N,digit), { number_codes(I,N), length(N,L) }.

% get a sequence of characters of the same type (https://www.swi-prolog.org/pldoc/doc_for?object=char_type/2)
c_types([C|T],Type) --> c_type(C,Type), c_types(T,Type).
c_types([C],Type) --> c_type(C,Type).
c_type(C,Type) --> [C],{ char_type(C,Type) }.

anything([]) --> [].
anything([A|T]) --> [A], anything(T).

string_([]) --> [].
string_([A|T]) --> [A], { dif(A,'\n') }, string_(T).

/*
	The token types are all handled by the tok DCG, order of predicates is important here.
*/
% comment
tok([],CLen,_,_) --> "/*", anything(A), "*/", { length(A,Len), CLen is Len + 4 }.

% toks
tok(op_and(L,P),2,L,P) --> "&&".
tok(op_or(L,P),2,L,P) --> "||".
tok(op_lessequal(L,P),2,L,P) --> "<=".
tok(op_greaterequal(L,P),2,L,P) --> ">=".
tok(op_equal(L,P),2,L,P) --> "==".
tok(op_notequal(L,P),2,L,P) --> "!=".
tok(op_assign(L,P),1,L,P) --> "=".
tok(op_multiply(L,P),1,L,P) --> "*".
tok(op_divide(L,P),1,L,P) --> "/".
tok(op_mod(L,P),1,L,P) --> "%".
tok(op_add(L,P),1,L,P) --> "+".
tok(op_subtract(L,P),1,L,P) --> "-".
tok(op_negate(L,P),1,L,P) --> "-".
tok(op_less(L,P),1,L,P) --> "<".
tok(op_greater(L,P),1,L,P) --> ">".
tok(op_not(L,P),1,L,P) --> "!".

% symbols
tok(left_paren(L,P),1,L,P) --> "(".
tok(right_paren(L,P),1,L,P) --> ")".
tok(left_brace(L,P),1,L,P) --> "{".
tok(right_brace(L,P),1,L,P) --> "}".
tok(semicolon(L,P),1,L,P) --> ";".
tok(comma(L,P),1,L,P) --> ",".

% keywords
tok(keyword_if(L,P),2,L,P) --> "if".
tok(keyword_else(L,P),4,L,P) --> "else".
tok(keyword_while(L,P),5,L,P) --> "while".
tok(keyword_print(L,P),5,L,P) --> "print".
tok(keyword_putc(L,P),4,L,P) --> "putc".

% identifier and literals
tok(identifier(I,L,P),Len,L,P) --> c_type(S,csymf), identifier(T), { atom_chars(I,[S|T]), length([S|T],Len) }.
tok(integer(V,L,P),Len,L,P) --> integer_(V,Len).
tok(integer(I,L,P),4,L,P) --> "'\\\\'", { char_code('\\', I) }.
tok(integer(I,L,P),4,L,P) --> "'\\n'", { char_code('\n', I) }.
tok(integer(I,L,P),3,L,P) --> ['\''], [C], ['\''], { dif(C,'\n'), dif(C,'\''), char_code(C,I) }.
tok(string(S,L,P),SLen,L,P) --> ['"'], string_(A),['"'], { atom_chars(S,A), length(A,Len), SLen is Len + 2 }.

% spaces
tok(space(L,P),Len,L,P) --> c_types(S,space), { length(S,Len) }.

% anything else is an error
tok(_,_,L,P) --> { format(atom(Error), 'Invalid token at line ~d,~d', [L,P]), throw(Error) }.</syntaxhighlight>
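A minimal interactive run of the harness from SWI-Prolog might look like the sketch below; the file names <tt>lexer.pl</tt> and <tt>test3.t</tt> are illustrative placeholders, not part of the task.
<syntaxhighlight lang="prolog">% Usage sketch (assumed file names, not from the task):
% consult the analyzer source above, then lex a test program;
% test_file/1 loads the file and prints one token per line.
?- [lexer].
?- test_file('test3.t').
</syntaxhighlight>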
{{out}}
<pre>
5       16      keyword_print
5       40      op_subtract
6       16      keyword_putc
6       40      op_less
7       16      keyword_if
7       40      op_greater
8       16      keyword_else
8       40      op_lessequal
9       16      keyword_while
9       40      op_greaterequal
10      16      left_brace
10      40      op_equal
11      16      right_brace
11      40      op_notequal
12      16      left_paren
12      40      op_and
13      16      right_paren
13      40      op_or
14      16      op_subtract
14      40      semicolon
15      16      op_not
15      40      comma
16      16      op_multiply
16      40      op_assign
17      16      op_divide
17      40      integer         42
18      16      op_mod
18      40      string          "String literal"
19      16      op_add
19      40      identifier      variable_name
20      26      integer         10
21      26      integer         92
22      26      integer         32
22      29      end_of_input
</pre>
 
=={{header|Python}}==
Tested with Python 2.7 and 3.x
<syntaxhighlight lang="python">from __future__ import print_function
import sys
 
Line 4,520 ⟶ 15,271:
#*** "string"
def string_lit(start, err_line, err_col):
global the_ch
text = ""
 
Line 4,527 ⟶ 15,279:
if the_ch == '\n':
error(err_line, err_col, "EOL while scanning string literal")
if the_ch == '\\':
next_ch()
if the_ch != 'n':
error(err_line, err_col, "escape sequence unknown \\%c" % the_ch)
the_ch = '\n'
text += the_ch
 
Line 4,614 ⟶ 15,371:
 
if tok == tk_EOI:
break</syntaxhighlight>
 
{{out|case=test case 3}}
Line 4,655 ⟶ 15,412:
</pre>
</b>
 
=={{header|QB64}}==
Tested with QB64 1.5
<syntaxhighlight lang="vb">dim shared source as string, the_ch as string, tok as string, toktyp as string
dim shared line_n as integer, col_n as integer, text_p as integer, err_line as integer, err_col as integer, errors as integer
 
declare function isalnum&(s as string)
declare function isalpha&(s as string)
declare function isdigit&(s as string)
declare sub divide_or_comment
declare sub error_exit(line_n as integer, col_n as integer, msg as string)
declare sub follow(c as string, typ2 as string, typ1 as string)
declare sub nextch
declare sub nexttok
declare sub read_char_lit
declare sub read_ident
declare sub read_number
declare sub read_string
 
const c_integer = "Integer", c_ident = "Identifier", c_string = "String"
 
dim out_fn as string, out_tok as string
 
if command$(1) = "" then print "Expecting a filename": end
open command$(1) for binary as #1
source = space$(lof(1))
get #1, 1, source
close #1
 
out_fn = command$(2): if out_fn <> "" then open out_fn for output as #1
 
line_n = 1: col_n = 0: text_p = 1: the_ch = " "
 
do
call nexttok
select case toktyp
case c_integer, c_ident, c_string: out_tok = tok
case else: out_tok = ""
end select
if out_fn = "" then
print err_line, err_col, toktyp, out_tok
else
print #1, err_line, err_col, toktyp, out_tok
end if
loop until errors or tok = ""
if out_fn <> "" then close #1
end
 
' get next tok, toktyp
sub nexttok
toktyp = ""
restart: err_line = line_n: err_col = col_n: tok = the_ch
select case the_ch
case " ", chr$(9), chr$(10): call nextch: goto restart
case "/": call divide_or_comment: if tok = "" then goto restart
 
case "%": call nextch: toktyp = "Op_mod"
case "(": call nextch: toktyp = "LeftParen"
case ")": call nextch: toktyp = "RightParen"
case "*": call nextch: toktyp = "Op_multiply"
case "+": call nextch: toktyp = "Op_add"
case ",": call nextch: toktyp = "Comma"
case "-": call nextch: toktyp = "Op_subtract"
case ";": call nextch: toktyp = "Semicolon"
case "{": call nextch: toktyp = "LeftBrace"
case "}": call nextch: toktyp = "RightBrace"
 
case "&": call follow("&", "Op_and", "")
case "|": call follow("|", "Op_or", "")
case "!": call follow("=", "Op_notequal", "Op_not")
case "<": call follow("=", "Op_lessequal", "Op_less")
case "=": call follow("=", "Op_equal", "Op_assign")
case ">": call follow("=", "Op_greaterequal", "Op_greater")
 
case chr$(34): call read_string
case chr$(39): call read_char_lit
 
case "": toktyp = "End_of_input"
 
case else
if isdigit&(the_ch) then
call read_number
elseif isalpha&(the_ch) then
call read_ident
else
call nextch
end if
end select
end sub
 
sub follow(c as string, if_both as string, if_one as string)
call nextch
if the_ch = c then
tok = tok + the_ch
call nextch
toktyp = if_both
else
if if_one = "" then call error_exit(line_n, col_n, "Expecting " + c): exit sub
toktyp = if_one
end if
end sub
 
sub read_string
toktyp = c_string
call nextch
do
tok = tok + the_ch
select case the_ch
case chr$(10): call error_exit(line_n, col_n, "EOL in string"): exit sub
case "": call error_exit(line_n, col_n, "EOF in string"): exit sub
case chr$(34): call nextch: exit sub
case else: call nextch
end select
loop
end sub
 
sub read_char_lit
toktyp = c_integer
call nextch
if the_ch = chr$(39) then
call error_exit(err_line, err_col, "Empty character constant"): exit sub
end if
 
if the_ch = "\" then
call nextch
if the_ch = "n" then
tok = "10"
elseif the_ch = "\" then
tok = "92"
else
call error_exit(line_n, col_n, "Unknown escape sequence:" + the_ch): exit sub
end if
else
tok = ltrim$(str$(asc(the_ch)))
end if
 
call nextch
if the_ch <> chr$(39) then
call error_exit(line_n, col_n, "Multi-character constant"): exit sub
end if
call nextch
end sub
 
sub divide_or_comment
call nextch
if the_ch <> "*" then
toktyp = "Op_divide"
else ' skip comments
tok = ""
call nextch
do
if the_ch = "*" then
call nextch
if the_ch = "/" then
call nextch
exit sub
end if
elseif the_ch = "" then
call error_exit(line_n, col_n, "EOF in comment"): exit sub
else
call nextch
end if
loop
end if
end sub
 
sub read_ident
do
call nextch
if not isalnum&(the_ch) then exit do
tok = tok + the_ch
loop
select case tok
case "else": toktyp = "keyword_else"
case "if": toktyp = "keyword_if"
case "print": toktyp = "keyword_print"
case "putc":  toktyp = "keyword_putc"
case "while": toktyp = "keyword_while"
case else: toktyp = c_ident
end select
end sub
 
sub read_number
toktyp = c_integer
do
call nextch
if not isdigit&(the_ch) then exit do
tok = tok + the_ch
loop
 
if isalpha&(the_ch) then
call error_exit(err_line, err_col, "Bogus number: " + tok + the_ch): exit sub
end if
end sub
 
function isalpha&(s as string)
dim c as string
c = left$(s, 1)
isalpha& = c <> "" and instr("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_", c) > 0
end function
 
function isdigit&(s as string)
dim c as string
c = left$(s, 1)
isdigit& = c <> "" and instr("0123456789", c) > 0
end function
 
function isalnum&(s as string)
dim c as string
c = left$(s, 1)
isalnum& = c <> "" and instr("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_", c) > 0
end function
 
' get next char - fold cr/lf into just lf
sub nextch
the_ch = ""
col_n = col_n + 1
if text_p > len(source) then exit sub
 
the_ch = mid$(source, text_p, 1)
text_p = text_p + 1
 
if the_ch = chr$(13) then
the_ch = chr$(10)
if text_p <= len(source) then
if mid$(source, text_p, 1) = chr$(10) then
text_p = text_p + 1
end if
end if
end if
 
if the_ch = chr$(10) then
line_n = line_n + 1
col_n = 0
end if
 
end sub
 
sub error_exit(line_n as integer, col_n as integer, msg as string)
errors = -1
print line_n, col_n, msg
end
end sub
</syntaxhighlight>
{{out|case=test case 3}}
<b>
<pre> 5 16 keyword_print
5 40 Op_subtract
6 16 keyword_putc
6 40 Op_less
7 16 keyword_if
7 40 Op_greater
8 16 keyword_else
8 40 Op_lessequal
9 16 keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
</b>
 
=={{header|Racket}}==
<syntaxhighlight lang="racket">
#lang racket
(require parser-tools/lex)
 
(define-lex-abbrevs
[letter (union (char-range #\a #\z) (char-range #\A #\Z))]
[digit (char-range #\0 #\9)]
[underscore #\_]
[identifier (concatenation (union letter underscore)
(repetition 0 +inf.0 (union letter digit underscore)))]
[integer (repetition 1 +inf.0 digit)]
[char-content (char-complement (char-set "'\n"))]
[char-literal (union (concatenation #\' char-content #\')
"'\\n'" "'\\\\'")]
[string-content (union (char-complement (char-set "\"\n")))]
[string-literal (union (concatenation #\" (repetition 0 +inf.0 string-content) #\")
"\"\\n\"" "\"\\\\\"")]
[keyword (union "if" "else" "while" "print" "putc")]
[operator (union "*" "/" "%" "+" "-" "-"
"<" "<=" ">" ">=" "==" "!="
"!" "=" "&&" "||")]
[symbol (union "(" ")" "{" "}" ";" ",")]
[comment (concatenation "/*" (complement (concatenation any-string "*/" any-string)) "*/")])
 
(define operators-ht
(hash "*" 'Op_multiply "/" 'Op_divide "%" 'Op_mod "+" 'Op_add "-" 'Op_subtract
"<" 'Op_less "<=" 'Op_lessequal ">" 'Op_greater ">=" 'Op_greaterequal "==" 'Op_equal
"!=" 'Op_notequal "!" 'Op_not "=" 'Op_assign "&&" 'Op_and "||" 'Op_or))
 
(define symbols-ht
(hash "(" 'LeftParen ")" 'RightParen
"{" 'LeftBrace "}" 'RightBrace
";" 'Semicolon "," 'Comma))
 
(define (lexeme->keyword l) (string->symbol (~a "Keyword_" l)))
(define (lexeme->operator l) (hash-ref operators-ht l))
(define (lexeme->symbol l) (hash-ref symbols-ht l))
(define (lexeme->char l) (match l
["'\\\\'" #\\]
["'\\n'" #\newline]
[_ (string-ref l 1)]))
 
(define (token name [value #f])
(cons name (if value (list value) '())))
 
(define (lex ip)
(port-count-lines! ip)
(define my-lexer
(lexer-src-pos
[integer (token 'Integer (string->number lexeme))]
[char-literal (token 'Integer (char->integer (lexeme->char lexeme)))]
[string-literal (token 'String lexeme)]
[keyword (token (lexeme->keyword lexeme))]
[operator (token (lexeme->operator lexeme))]
[symbol (token (lexeme->symbol lexeme))]
[comment #f]
[whitespace #f]
[identifier (token 'Identifier lexeme)]
[(eof) (token 'End_of_input)]))
(define (next-token) (my-lexer ip))
next-token)
 
(define (string->tokens s)
(port->tokens (open-input-string s)))
 
(define (port->tokens ip)
(define next-token (lex ip))
(let loop ()
(match (next-token)
[(position-token t (position offset line col) _)
(set! col (+ col 1)) ; output is 1-based
(match t
[#f (loop)] ; skip whitespace/comments
[(list 'End_of_input) (list (list line col 'End_of_input))]
[(list name value) (cons (list line col name value) (loop))]
[(list name) (cons (list line col name) (loop))]
[_ (error)])])))
 
(define test1 #<<TEST
/*
Hello world
*/
print("Hello, World!\n");
 
TEST
)
 
(define test2 #<<TEST
/*
Show Ident and Integers
*/
phoenix_number = 142857;
print(phoenix_number, "\n");
 
TEST
)
 
(define test3 #<<TEST
/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
*/
/* Print */ print /* Sub */ -
/* Putc */ putc /* Lss */ <
/* If */ if /* Gtr */ >
/* Else */ else /* Leq */ <=
/* While */ while /* Geq */ >=
/* Lbrace */ { /* Eq */ ==
/* Rbrace */ } /* Neq */ !=
/* Lparen */ ( /* And */ &&
/* Rparen */ ) /* Or */ ||
/* Uminus */ - /* Semi */ ;
/* Not */ ! /* Comma */ ,
/* Mul */ * /* Assign */ =
/* Div */ / /* Integer */ 42
/* Mod */ % /* String */ "String literal"
/* Add */ + /* Ident */ variable_name
/* character literal */ '\n'
/* character literal */ '\\'
/* character literal */ ' '
TEST
)
 
(define test4 #<<TEST
/*** test printing, embedded \n and comments with lots of '*' ***/
print(42);
print("\nHello World\nGood Bye\nok\n");
print("Print a slash n - \\n.\n");
TEST
)
 
(define test5 #<<TEST
count = 1;
while (count < 10) {
print("count is: ", count, "\n");
count = count + 1;
}
TEST
)
 
(define (display-tokens ts)
(for ([t ts])
(for ([x t])
(display x) (display "\t\t"))
(newline)))
 
"TEST 1"
(display-tokens (string->tokens test1))
"TEST 2"
(display-tokens (string->tokens test2))
"TEST 3"
(display-tokens (string->tokens test3))
"TEST 4"
(display-tokens (string->tokens test4))
"TEST 5"
(display-tokens (string->tokens test5))
</syntaxhighlight>
 
=={{header|Raku}}==
(formerly Perl 6)
This is more complicated than strictly necessary for this task. It is set up to be easily adapted to do syntax analysis.
 
(Note: there are several bogus comments added solely to help with syntax highlighting.)
 
{{works with|Rakudo|2016.08}}
 
<syntaxhighlight lang="raku" line>grammar tiny_C {
rule TOP { ^ <.whitespace>? <tokens> + % <.whitespace> <.whitespace> <eoi> }
 
rule whitespace { [ <comment> + % <ws> | <ws> ] }
 
token comment { '/*' ~ '*/' .*? }
 
token tokens {
[
| <operator> { make $/<operator>.ast }
| <keyword> { make $/<keyword>.ast }
| <symbol> { make $/<symbol>.ast }
| <identifier> { make $/<identifier>.ast }
| <integer> { make $/<integer>.ast }
| <char> { make $/<char>.ast }
| <string> { make $/<string>.ast }
| <error>
]
}
 
proto token operator {*}
token operator:sym<*> { '*' { make 'Op_multiply' } }
token operator:sym</> { '/'<!before '*'> { make 'Op_divide' } }
token operator:sym<%> { '%' { make 'Op_mod' } }
token operator:sym<+> { '+' { make 'Op_add' } }
token operator:sym<-> { '-' { make 'Op_subtract' } }
token operator:sym('<='){ '<=' { make 'Op_lessequal' } }
token operator:sym('<') { '<' { make 'Op_less' } }
token operator:sym('>='){ '>=' { make 'Op_greaterequal'} }
token operator:sym('>') { '>' { make 'Op_greater' } }
token operator:sym<==> { '==' { make 'Op_equal' } }
token operator:sym<!=> { '!=' { make 'Op_notequal' } }
token operator:sym<!> { '!' { make 'Op_not' } }
token operator:sym<=> { '=' { make 'Op_assign' } }
token operator:sym<&&> { '&&' { make 'Op_and' } }
token operator:sym<||> { '||' { make 'Op_or' } }
 
proto token keyword {*}
token keyword:sym<if> { 'if' { make 'Keyword_if' } }
token keyword:sym<else> { 'else' { make 'Keyword_else' } }
token keyword:sym<putc> { 'putc' { make 'Keyword_putc' } }
token keyword:sym<while> { 'while' { make 'Keyword_while' } }
token keyword:sym<print> { 'print' { make 'Keyword_print' } }
 
proto token symbol {*}
token symbol:sym<(> { '(' { make 'LeftParen' } }
token symbol:sym<)> { ')' { make 'RightParen' } }
token symbol:sym<{> { '{' { make 'LeftBrace' } }
token symbol:sym<}> { '}' { make 'RightBrace' } }
token symbol:sym<;> { ';' { make 'Semicolon' } }
token symbol:sym<,> { ',' { make 'Comma' } }
 
token identifier { <[_A..Za..z]><[_A..Za..z0..9]>* { make 'Identifier ' ~ $/ } }
token integer { <[0..9]>+ { make 'Integer ' ~ $/ } }
 
token char {
'\'' [<-[']> | '\n' | '\\\\'] '\''
{ make 'Char_Literal ' ~ $/.subst("\\n", "\n").substr(1, *-1).ord }
}
 
token string {
'"' <-["\n]>* '"' #'
{
make 'String ' ~ $/;
note 'Error: Unknown escape sequence.' and exit if (~$/ ~~ m:r/ <!after <[\\]>>[\\<-[n\\]>]<!before <[\\]>> /);
}
}
 
token eoi { $ { make 'End_of_input' } }
 
token error {
| '\'''\'' { note 'Error: Empty character constant.' and exit }
| '\'' <-[']> ** {2..*} '\'' { note 'Error: Multi-character constant.' and exit }
| '/*' <-[*]>* $ { note 'Error: End-of-file in comment.' and exit }
| '"' <-["]>* $ { note 'Error: End-of-file in string.' and exit }
| '"' <-["]>*? \n { note 'Error: End of line in string.' and exit } #'
}
}
 
sub parse_it ( $c_code ) {
my $l;
my @pos = gather for $c_code.lines>>.chars.kv -> $line, $v {
take [ $line + 1, $_ ] for 1 .. ($v+1); # v+1 for newline
$l = $line+2;
}
@pos.push: [ $l, 1 ]; # capture eoi
 
for flat $c_code<tokens>.list, $c_code<eoi> -> $m {
say join "\t", @pos[$m.from].fmt('%3d'), $m.ast;
}
}
 
my $tokenizer = tiny_C.parse(@*ARGS[0].IO.slurp);
parse_it( $tokenizer );</syntaxhighlight>
 
{{out|case=test case 3}}
<pre>
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Char_Literal 10
21 26 Char_Literal 92
22 26 Char_Literal 32
23 1 End_of_input
</pre>
 
=={{header|RATFOR}}==
{{works with|ratfor77|[https://sourceforge.net/p/chemoelectric/ratfor77/ public domain 1.0]}}
{{works with|gfortran|11.2.1}}
{{works with|f2c|20100827}}
 
 
<syntaxhighlight lang="ratfor">######################################################################
#
# The Rosetta Code scanner in Ratfor 77.
#
#
# How to deal with FORTRAN 77 input is a problem. I use formatted
# input, treating each line as an array of type CHARACTER--regrettably
# of no more than some predetermined, finite length. It is a very
# simple method and presents no significant difficulties, aside from
# the restriction on line length of the input.
#
#
# On a POSIX platform, the program can be compiled with f2c and run
# somewhat as follows:
#
# ratfor77 lex-in-ratfor.r > lex-in-ratfor.f
# f2c -C -Nc40 lex-in-ratfor.f
# cc -O lex-in-ratfor.c -lf2c
# ./a.out < compiler-tests/primes.t
#
# With gfortran, a little differently:
#
# ratfor77 lex-in-ratfor.r > lex-in-ratfor.f
# gfortran -O2 -fcheck=all -std=legacy lex-in-ratfor.f
# ./a.out < compiler-tests/primes.t
#
#
# I/O is strictly from default input and to default output, which, on
# POSIX systems, usually correspond respectively to standard input and
# standard output. (I did not wish to have to deal with unit numbers;
# these are now standardized in ISO_FORTRAN_ENV, but that is not
# available in FORTRAN 77.)
#
#---------------------------------------------------------------------
 
# Some parameters you may wish to modify.
 
define(LINESZ, 256) # Size of an input line.
define(OUTLSZ, 512) # Size of an output line.
define(PSHBSZ, 10) # Size of the character pushback buffer.
define(STRNSZ, 4096) # Size of the string pool.
 
#---------------------------------------------------------------------
 
define(EOF, -1)
define(NEWLIN, 10) # Unix newline (the LF control character).
define(BACKSL, 92) # ASCII backslash.
 
define(ILINNO, 1) # Line number's index.
define(ICOLNO, 2) # Column number's index.
 
define(CHRSZ, 3) # See ILINNO and ICOLNO above.
define(ICHRCD, 3) # Character code's index.
 
define(TOKSZ, 5) # See ILINNO and ICOLNO above.
define(ITOKNO, 3) # Token number's index.
define(IARGIX, 4) # Index of the string pool index.
define(IARGLN, 5) # Index of the string length.
 
define(TKELSE, 0)
define(TKIF, 1)
define(TKPRNT, 2)
define(TKPUTC, 3)
define(TKWHIL, 4)
define(TKMUL, 5)
define(TKDIV, 6)
define(TKMOD, 7)
define(TKADD, 8)
define(TKSUB, 9)
define(TKNEG, 10)
define(TKLT, 11)
define(TKLE, 12)
define(TKGT, 13)
define(TKGE, 14)
define(TKEQ, 15)
define(TKNE, 16)
define(TKNOT, 17)
define(TKASGN, 18)
define(TKAND, 19)
define(TKOR, 20)
define(TKLPAR, 21)
define(TKRPAR, 22)
define(TKLBRC, 23)
define(TKRBRC, 24)
define(TKSEMI, 25)
define(TKCMMA, 26)
define(TKID, 27)
define(TKINT, 28)
define(TKSTR, 29)
define(TKEOI, 30)
 
define(LOC10, 1) # Location of "10" in the string pool.
define(LOC92, 3) # Location of "92" in the string pool.
 
#---------------------------------------------------------------------
 
subroutine addstr (strngs, istrng, src, i0, n0, i, n)
 
# Add a string to the string pool.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character src(*) # Source string.
integer i0, n0 # Index and length in source string.
integer i, n # Index and length in string pool.
 
integer j
 
if (STRNSZ < istrng + (n0 - 1))
{
write (*, '(''string pool exhausted'')')
stop
}
for (j = 0; j < n0; j = j + 1)
strngs(istrng + j) = src(i0 + j)
i = istrng
n = n0
istrng = istrng + n0
end
 
subroutine cpystr (strngs, i, n, dst, i0)
 
# Copy a string from the string pool to an output string.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer i, n # Index and length in string pool.
character dst(OUTLSZ) # Destination string.
integer i0 # Index within destination string.
 
integer j
 
if (i0 < 1 || OUTLSZ < i0 + (n - 1))
{
write (*, '(''string boundary exceeded'')')
stop
}
for (j = 0; j < n; j = j + 1)
dst(i0 + j) = strngs(i + j)
end
 
#---------------------------------------------------------------------
 
subroutine getchr (line, linno, colno, pushbk, npshbk, chr)
 
# Get a character, with its line number and column number.
 
implicit none
 
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer chr(CHRSZ)
 
# End of file is indicated (as in C) by a negative "char code"
# called "EOF".
 
character*20 fmt
integer stat
integer chr1(CHRSZ)
 
if (0 < npshbk)
{
chr(ICHRCD) = pushbk(ICHRCD, npshbk)
chr(ILINNO) = pushbk(ILINNO, npshbk)
chr(ICOLNO) = pushbk(ICOLNO, npshbk)
npshbk = npshbk - 1
}
else if (colno <= LINESZ)
{
chr(ICHRCD) = ichar (line(colno))
chr(ILINNO) = linno
chr(ICOLNO) = colno
colno = colno + 1
}
else
{
# Return a newline character.
chr(ICHRCD) = NEWLIN
chr(ILINNO) = linno
chr(ICOLNO) = colno
 
# Fetch a new line.
linno = linno + 1
colno = 1
 
# Read a line of text as an array of characters.
write (fmt, '(''('', I10, ''A1)'')') LINESZ
read (*, fmt, iostat = stat) line
 
if (stat != 0)
{
# If end of file has been reached, push an EOF.
chr1(ICHRCD) = EOF
chr1(ILINNO) = linno
chr1(ICOLNO) = colno
call pshchr (pushbk, npshbk, chr1)
}
}
end
 
subroutine pshchr (pushbk, npshbk, chr)
 
# Push back a character.
 
implicit none
 
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer chr(CHRSZ)
 
if (PSHBSZ <= npshbk)
{
write (*, '(''pushback buffer overfull'')')
stop
}
npshbk = npshbk + 1
pushbk(ICHRCD, npshbk) = chr(ICHRCD)
pushbk(ILINNO, npshbk) = chr(ILINNO)
pushbk(ICOLNO, npshbk) = chr(ICOLNO)
end
 
subroutine getpos (line, linno, colno, pushbk, npshbk, ln, cn)
 
# Get the position of the next character.
 
implicit none
 
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer ln, cn # The line and column nos. returned.
 
integer chr(CHRSZ)
 
call getchr (line, linno, colno, pushbk, npshbk, chr)
ln = chr(ILINNO)
cn = chr(ICOLNO)
call pshchr (pushbk, npshbk, chr)
end
 
#---------------------------------------------------------------------
 
function isspc (c)
 
# Is c character code for a space?
 
implicit none
 
integer c
logical isspc
 
#
# The following is correct for ASCII: 32 is the SPACE character, and
# 9 to 13 are control characters commonly regarded as spaces.
#
# In Unicode these are all code points for spaces, but so are others
# besides.
#
isspc = (c == 32 || (9 <= c && c <= 13))
end
 
function isdgt (c)
 
# Is c character code for a digit?
 
implicit none
 
integer c
logical isdgt
 
isdgt = (ichar ('0') <= c && c <= ichar ('9'))
end
 
function isalph (c)
 
# Is c character code for a letter?
 
implicit none
 
integer c
logical isalph
 
#
# The following is correct for ASCII and Unicode, but not for
# EBCDIC.
#
isalph = (ichar ('a') <= c && c <= ichar ('z')) _
|| (ichar ('A') <= c && c <= ichar ('Z'))
end
 
function isid0 (c)
 
# Is c character code for the start of an identifier?
 
implicit none
 
integer c
logical isid0
 
logical isalph
 
isid0 = isalph (c) || c == ichar ('_')
end
 
function isid1 (c)
 
# Is c character code for the continuation of an identifier?
 
implicit none
 
integer c
logical isid1
 
logical isalph
logical isdgt
 
isid1 = isalph (c) || isdgt (c) || c == ichar ('_')
end
 
#---------------------------------------------------------------------
 
function trimlf (str, n)
 
# "Trim left" leading spaces.
 
implicit none
 
character str(*) # The string to "trim".
integer n # The length.
integer trimlf # The index of the first non-space
# character, or n + 1.
 
logical isspc
 
integer j
logical done
 
j = 1
done = .false.
while (!done)
{
if (j == n + 1)
done = .true.
else if (!isspc (ichar (str(j))))
done = .true.
else
j = j + 1
}
trimlf = j
end
 
function trimrt (str, n)
 
# "Trim right" trailing spaces.
 
implicit none
 
character str(*) # The string to "trim".
integer n # The length including trailing spaces.
integer trimrt # The length without trailing spaces.
 
logical isspc
 
integer j
logical done
 
j = n
done = .false.
while (!done)
{
if (j == 0)
done = .true.
else if (!isspc (ichar (str(j))))
done = .true.
else
j = j - 1
}
trimrt = j
end
 
#---------------------------------------------------------------------
 
subroutine toknam (tokno, str, i)
 
# Copy a token name to the character array str, starting at i.
 
implicit none
 
integer tokno
character str(*)
integer i
integer j
 
character*16 names(0:30)
character*16 nm
 
data names / "Keyword_else ", _
"Keyword_if ", _
"Keyword_print ", _
"Keyword_putc ", _
"Keyword_while ", _
"Op_multiply ", _
"Op_divide ", _
"Op_mod ", _
"Op_add ", _
"Op_subtract ", _
"Op_negate ", _
"Op_less ", _
"Op_lessequal ", _
"Op_greater ", _
"Op_greaterequal ", _
"Op_equal ", _
"Op_notequal ", _
"Op_not ", _
"Op_assign ", _
"Op_and ", _
"Op_or ", _
"LeftParen ", _
"RightParen ", _
"LeftBrace ", _
"RightBrace ", _
"Semicolon ", _
"Comma ", _
"Identifier ", _
"Integer ", _
"String ", _
"End_of_input " /
 
nm = names(tokno)
for (j = 0; j < 16; j = j + 1)
str(i + j) = nm(1 + j : 1 + j)
end
 
subroutine intstr (str, i, n, x)
 
# Convert a non-negative integer to a substring.
 
implicit none
 
character str(*) # Destination string.
integer i, n # Index and length of the field.
integer x # The positive integer to represent.
 
integer j
integer y
 
if (x == 0)
{
for (j = 0; j < n - 1; j = j + 1)
str(i + j) = ' '
str(i + j) = '0'
}
else
{
y = x
for (j = n - 1; 0 <= j; j = j - 1)
{
if (y == 0)
str(i + j) = ' '
else
{
str(i + j) = char (mod (y, 10) + ichar ('0'))
y = y / 10
}
}
}
end
 
subroutine prttok (strngs, tok)
 
# Print a token.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer tok(TOKSZ) # The token.
 
integer trimrt
 
character line(OUTLSZ)
character*20 fmt
integer i, n
integer tokno
 
for (i = 1; i <= OUTLSZ; i = i + 1)
line(i) = ' '
 
call intstr (line, 1, 10, tok(ILINNO))
call intstr (line, 12, 10, tok(ICOLNO))
 
tokno = tok(ITOKNO)
call toknam (tokno, line, 25)
if (tokno == TKID || tokno == TKINT || tokno == TKSTR)
{
i = tok(IARGIX)
n = tok(IARGLN)
call cpystr (strngs, i, n, line, 45)
}
 
n = trimrt (line, OUTLSZ)
write (fmt, '(''('', I10, ''A)'')') n
write (*, fmt) (line(i), i = 1, n)
end
 
#---------------------------------------------------------------------
 
subroutine wrtpos (ln, cn)
 
implicit none
 
integer ln, cn
 
write (*, 1000) ln, cn
1000  format ('At line ', I5, ', column ', I5)
end
 
#---------------------------------------------------------------------
 
subroutine utcmnt (ln, cn)
 
implicit none
 
integer ln, cn
 
call wrtpos (ln, cn)
write (*, '(''Unterminated comment'')')
stop
end
 
subroutine skpcmt (line, linno, colno, pushbk, npshbk, ln, cn)
 
# Skip to the end of a comment.
 
implicit none
 
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer ln, cn # Line and column of start of comment.
 
integer chr(CHRSZ)
logical done
 
done = .false.
while (!done)
{
call getchr (line, linno, colno, pushbk, npshbk, chr)
if (chr(ICHRCD) == ichar ('*'))
{
call getchr (line, linno, colno, pushbk, npshbk, chr)
if (chr(ICHRCD) == ichar ('/'))
done = .true.
else if (chr(ICHRCD) == EOF)
call utcmnt (ln, cn)
}
else if (chr(ICHRCD) == EOF)
call utcmnt (ln, cn)
}
end
 
subroutine skpspc (line, linno, colno, pushbk, npshbk)
 
# Skip spaces and comments.
 
implicit none
 
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
 
logical isspc
 
integer chr(CHRSZ)
integer chr1(CHRSZ)
integer ln, cn
logical done
 
done = .false.
while (!done)
{
call getchr (line, linno, colno, pushbk, npshbk, chr)
if (!isspc (chr(ICHRCD)))
{
if (chr(ICHRCD) != ichar ('/'))
{
call pshchr (pushbk, npshbk, chr)
done = .true.
}
else
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) != ichar ('*'))
{
call pshchr (pushbk, npshbk, chr1)
call pshchr (pushbk, npshbk, chr)
done = .true.
}
else
{
ln = chr(ILINNO)
cn = chr(ICOLNO)
call skpcmt (line, linno, colno, pushbk, npshbk, _
ln, cn)
}
}
}
}
end
 
#---------------------------------------------------------------------
 
subroutine rwdlkp (strngs, istrng, src, i0, n0, ln, cn, tok)
 
# Reserved word lookup
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character src(*) # The source string.
integer i0, n0 # Index and length of the substring.
integer ln, cn # Line and column number
# to associate with the token.
integer tok(TOKSZ) # The output token.
 
integer tokno
integer i, n
 
tokno = TKID
 
if (n0 == 2)
{
if (src(i0) == 'i' && src(i0 + 1) == 'f')
tokno = TKIF
}
else if (n0 == 4)
{
if (src(i0) == 'e' && src(i0 + 1) == 'l' _
&& src(i0 + 2) == 's' && src(i0 + 3) == 'e')
tokno = TKELSE
else if (src(i0) == 'p' && src(i0 + 1) == 'u' _
&& src(i0 + 2) == 't' && src(i0 + 3) == 'c')
tokno = TKPUTC
}
else if (n0 == 5)
{
if (src(i0) == 'p' && src(i0 + 1) == 'r' _
&& src(i0 + 2) == 'i' && src(i0 + 3) == 'n' _
&& src(i0 + 4) == 't')
tokno = TKPRNT
else if (src(i0) == 'w' && src(i0 + 1) == 'h' _
&& src(i0 + 2) == 'i' && src(i0 + 3) == 'l' _
&& src(i0 + 4) == 'e')
tokno = TKWHIL
}
 
i = 0
n = 0
if (tokno == TKID)
call addstr (strngs, istrng, src, i0, n0, i, n)
 
tok(ITOKNO) = tokno
tok(IARGIX) = i
tok(IARGLN) = n
tok(ILINNO) = ln
tok(ICOLNO) = cn
end
 
subroutine scnwrd (line, linno, colno, pushbk, npshbk, ln, cn, buf, n)
 
# Scan characters that may represent an identifier, reserved word,
# or integer literal.
 
implicit none
 
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer ln, cn # Line and column number of the start.
character buf(LINESZ) # The output buffer.
integer n # The length of the string collected.
 
logical isid1
 
integer chr(CHRSZ)
 
n = 0
call getchr (line, linno, colno, pushbk, npshbk, chr)
ln = chr(ILINNO)
cn = chr(ICOLNO)
while (isid1 (chr(ICHRCD)))
{
n = n + 1
buf(n) = char (chr(ICHRCD))
call getchr (line, linno, colno, pushbk, npshbk, chr)
}
call pshchr (pushbk, npshbk, chr)
end
 
subroutine scnidr (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
 
# Scan an identifier or reserved word.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer ln, cn # Line and column number of the start.
integer tok(TOKSZ) # The output token.
 
character buf(LINESZ)
integer n
call scnwrd (line, linno, colno, pushbk, npshbk, ln, cn, buf, n)
call rwdlkp (strngs, istrng, buf, 1, n, ln, cn, tok)
end
 
subroutine scnint (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
 
# Scan a positive integer literal.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer ln, cn # Line and column number of the start.
integer tok(TOKSZ) # The output token.
 
logical isdgt
 
character buf(LINESZ)
integer n0, n
integer i, j, k
character*80 fmt
call scnwrd (line, linno, colno, pushbk, npshbk, ln, cn, buf, n0)
for (j = 1; j <= n0; j = j + 1)
if (!isdgt (ichar (buf(j))))
{
call wrtpos (ln, cn)
write (fmt, 1000) n0
1000 format ('(''Not a legal word: "''', I10, 'A, ''"'')')
write (*, fmt) (buf(k), k = 1, n0)
stop
}
 
call addstr (strngs, istrng, buf, 1, n0, i, n)
 
tok(ITOKNO) = TKINT
tok(IARGIX) = i
tok(IARGLN) = n
tok(ILINNO) = ln
tok(ICOLNO) = cn
end
 
subroutine utclit (ln, cn)
 
implicit none
 
integer ln, cn
 
call wrtpos (ln, cn)
write (*, '(''Unterminated character literal'')')
stop
end
 
subroutine scnch1 (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
 
# Scan a character literal, without yet checking that the literal
# ends correctly.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer tok(TOKSZ) # The output token.
 
integer trimlf
 
integer bufsz
parameter (bufsz = 40)
 
integer chr(CHRSZ)
integer chr1(CHRSZ)
integer chr2(CHRSZ)
integer ln, cn
character buf(bufsz)
integer i, j, n
 
# Refetch the opening quote.
call getchr (line, linno, colno, pushbk, npshbk, chr)
ln = chr(ILINNO)
cn = chr(ICOLNO)
 
tok(ITOKNO) = TKINT
tok(ILINNO) = ln
tok(ICOLNO) = cn
 
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == EOF)
call utclit (ln, cn)
if (chr1(ICHRCD) == BACKSL)
{
call getchr (line, linno, colno, pushbk, npshbk, chr2)
if (chr2(ICHRCD) == EOF)
call utclit (ln, cn)
else if (chr2(ICHRCD) == ichar ('n'))
{
tok(IARGIX) = LOC10 # "10" = code for Unix newline
tok(IARGLN) = 2
}
else if (chr2(ICHRCD) == BACKSL)
{
tok(IARGIX) = LOC92 # "92" = code for backslash
tok(IARGLN) = 2
}
else
{
call wrtpos (ln, cn)
write (*, '(''Unsupported escape: '', 1A)') _
char (chr2(ICHRCD))
stop
}
}
else
{
# Character codes are non-negative, so we can use intstr.
call intstr (buf, 1, bufsz, chr1(ICHRCD))
 
j = trimlf (buf, bufsz)
call addstr (strngs, istrng, buf, j, bufsz - (j - 1), i, n)
tok(IARGIX) = i
tok(IARGLN) = n
}
end
 
subroutine scnch (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
 
# Scan a character literal.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer tok(TOKSZ) # The output token.
 
integer ln, cn
integer chr(CHRSZ)
 
call getpos (line, linno, colno, pushbk, npshbk, ln, cn)
call scnch1 (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
call getchr (line, linno, colno, pushbk, npshbk, chr)
if (chr(ICHRCD) != ichar (''''))
{
while (.true.)
{
if (chr(ICHRCD) == EOF)
{
call utclit (ln, cn)
stop
}
else if (chr(ICHRCD) == ichar (''''))
{
call wrtpos (ln, cn)
write (*, '(''Unsupported multicharacter literal'')')
stop
}
call getchr (line, linno, colno, pushbk, npshbk, chr)
}
}
end
 
subroutine scnstr (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
 
# Scan a string literal.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer tok(TOKSZ) # The output token.
 
integer ln, cn
integer chr1(CHRSZ)
integer chr2(CHRSZ)
character buf(LINESZ + 10) # Enough space, with some room to spare.
integer n0
integer i, n
 
call getchr (line, linno, colno, pushbk, npshbk, chr1)
ln = chr1(ILINNO)
cn = chr1(ICOLNO)
 
tok(ITOKNO) = TKSTR
tok(ILINNO) = ln
tok(ICOLNO) = cn
 
n0 = 1
buf(n0) = '"'
 
call getchr (line, linno, colno, pushbk, npshbk, chr1)
while (chr1(ICHRCD) != ichar ('"'))
{
# Our input method always puts a NEWLIN before EOF, and so this
# test is redundant, unless someone changes the input method.
if (chr1(ICHRCD) == EOF || chr1(ICHRCD) == NEWLIN)
{
call wrtpos (ln, cn)
write (*, '(''Unterminated string literal'')')
stop
}
if (chr1(ICHRCD) == BACKSL)
{
call getchr (line, linno, colno, pushbk, npshbk, chr2)
if (chr2(ICHRCD) == ichar ('n'))
{
n0 = n0 + 1
buf(n0) = char (BACKSL)
n0 = n0 + 1
buf(n0) = 'n'
}
else if (chr2(ICHRCD) == BACKSL)
{
n0 = n0 + 1
buf(n0) = char (BACKSL)
n0 = n0 + 1
buf(n0) = char (BACKSL)
}
else
{
call wrtpos (chr1(ILINNO), chr1(ICOLNO))
write (*, '(''Unsupported escape sequence'')')
stop
}
}
else
{
n0 = n0 + 1
buf(n0) = char (chr1(ICHRCD))
}
call getchr (line, linno, colno, pushbk, npshbk, chr1)
}
n0 = n0 + 1
buf(n0) = '"'
 
call addstr (strngs, istrng, buf, 1, n0, i, n)
tok(IARGIX) = i
tok(IARGLN) = n
end
 
subroutine unxchr (chr)
 
implicit none
 
integer chr(CHRSZ)
 
call wrtpos (chr(ILINNO), chr(ICOLNO))
write (*, 1000) char (chr(ICHRCD))
1000 format ('Unexpected character ''', A1, '''')
stop
end
 
subroutine scntok (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
 
# Scan a token.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer tok(TOKSZ) # The output token.
 
logical isdgt
logical isid0
 
integer chr(CHRSZ)
integer chr1(CHRSZ)
integer ln, cn
 
call getchr (line, linno, colno, pushbk, npshbk, chr)
ln = chr(ILINNO)
cn = chr(ICOLNO)
tok(ILINNO) = ln
tok(ICOLNO) = cn
tok(IARGIX) = 0
tok(IARGLN) = 0
if (chr(ICHRCD) == ichar (','))
tok(ITOKNO) = TKCMMA
else if (chr(ICHRCD) == ichar (';'))
tok(ITOKNO) = TKSEMI
else if (chr(ICHRCD) == ichar ('('))
tok(ITOKNO) = TKLPAR
else if (chr(ICHRCD) == ichar (')'))
tok(ITOKNO) = TKRPAR
else if (chr(ICHRCD) == ichar ('{'))
tok(ITOKNO) = TKLBRC
else if (chr(ICHRCD) == ichar ('}'))
tok(ITOKNO) = TKRBRC
else if (chr(ICHRCD) == ichar ('*'))
tok(ITOKNO) = TKMUL
else if (chr(ICHRCD) == ichar ('/'))
tok(ITOKNO) = TKDIV
else if (chr(ICHRCD) == ichar ('%'))
tok(ITOKNO) = TKMOD
else if (chr(ICHRCD) == ichar ('+'))
tok(ITOKNO) = TKADD
else if (chr(ICHRCD) == ichar ('-'))
tok(ITOKNO) = TKSUB
else if (chr(ICHRCD) == ichar ('<'))
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == ichar ('='))
tok(ITOKNO) = TKLE
else
{
call pshchr (pushbk, npshbk, chr1)
tok(ITOKNO) = TKLT
}
}
else if (chr(ICHRCD) == ichar ('>'))
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == ichar ('='))
tok(ITOKNO) = TKGE
else
{
call pshchr (pushbk, npshbk, chr1)
tok(ITOKNO) = TKGT
}
}
else if (chr(ICHRCD) == ichar ('='))
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == ichar ('='))
tok(ITOKNO) = TKEQ
else
{
call pshchr (pushbk, npshbk, chr1)
tok(ITOKNO) = TKASGN
}
}
else if (chr(ICHRCD) == ichar ('!'))
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == ichar ('='))
tok(ITOKNO) = TKNE
else
{
call pshchr (pushbk, npshbk, chr1)
tok(ITOKNO) = TKNOT
}
}
else if (chr(ICHRCD) == ichar ('&'))
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == ichar ('&'))
tok(ITOKNO) = TKAND
else
call unxchr (chr)
}
else if (chr(ICHRCD) == ichar ('|'))
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == ichar ('|'))
tok(ITOKNO) = TKOR
else
call unxchr (chr)
}
else if (chr(ICHRCD) == ichar ('"'))
{
call pshchr (pushbk, npshbk, chr)
call scnstr (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
}
else if (chr(ICHRCD) == ichar (''''))
{
call pshchr (pushbk, npshbk, chr)
call scnch (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
}
else if (isdgt (chr(ICHRCD)))
{
call pshchr (pushbk, npshbk, chr)
call scnint (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
}
else if (isid0 (chr(ICHRCD)))
{
call pshchr (pushbk, npshbk, chr)
call scnidr (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
}
else
call unxchr (chr)
end
 
subroutine scntxt (strngs, istrng, _
line, linno, colno, pushbk, npshbk)
 
# Scan the text and print the token stream.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
 
integer chr(CHRSZ)
integer tok(TOKSZ)
 
chr(ICHRCD) = ichar ('x')
while (chr(ICHRCD) != EOF)
{
call skpspc (line, linno, colno, pushbk, npshbk)
call getchr (line, linno, colno, pushbk, npshbk, chr)
if (chr(ICHRCD) != EOF)
{
call pshchr (pushbk, npshbk, chr)
call scntok (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
call prttok (strngs, tok)
}
}
tok(ITOKNO) = TKEOI
tok(ILINNO) = chr(ILINNO)
tok(ICOLNO) = chr(ICOLNO)
tok(IARGIX) = 0
tok(IARGLN) = 0
call prttok (strngs, tok)
end
 
#---------------------------------------------------------------------
 
program lex
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
 
integer i, n
 
istrng = 1
 
# Locate "10" (newline) at 1 in the string pool.
line(1) = '1'
line(2) = '0'
call addstr (strngs, istrng, line, 1, 2, i, n)
if (i != 1 && n != 2)
{
write (*, '(''internal error'')')
stop
}
 
# Locate "92" (backslash) at 3 in the string pool.
line(1) = '9'
line(2) = '2'
call addstr (strngs, istrng, line, 1, 2, i, n)
if (i != 3 && n != 2)
{
write (*, '(''internal error'')')
stop
}
 
linno = 0
colno = LINESZ + 1 # This will trigger a READ.
npshbk = 0
 
call scntxt (strngs, istrng, line, linno, colno, pushbk, npshbk)
end
 
######################################################################</syntaxhighlight>
 
 
{{out}}
<pre>$ ratfor77 lex-in-ratfor.r > lex-in-ratfor.f && gfortran -O2 -std=legacy -fcheck=all lex-in-ratfor.f && ./a.out < compiler-tests/primes.t
4 1 Identifier count
4 7 Op_assign
4 9 Integer 1
4 10 Semicolon
5 1 Identifier n
5 3 Op_assign
5 5 Integer 1
5 6 Semicolon
6 1 Identifier limit
6 7 Op_assign
6 9 Integer 100
6 12 Semicolon
7 1 Keyword_while
7 7 LeftParen
7 8 Identifier n
7 10 Op_less
7 12 Identifier limit
7 17 RightParen
7 19 LeftBrace
8 5 Identifier k
8 6 Op_assign
8 7 Integer 3
8 8 Semicolon
9 5 Identifier p
9 6 Op_assign
9 7 Integer 1
9 8 Semicolon
10 5 Identifier n
10 6 Op_assign
10 7 Identifier n
10 8 Op_add
10 9 Integer 2
10 10 Semicolon
11 5 Keyword_while
11 11 LeftParen
11 12 LeftParen
11 13 Identifier k
11 14 Op_multiply
11 15 Identifier k
11 16 Op_lessequal
11 18 Identifier n
11 19 RightParen
11 21 Op_and
11 24 LeftParen
11 25 Identifier p
11 26 RightParen
11 27 RightParen
11 29 LeftBrace
12 9 Identifier p
12 10 Op_assign
12 11 Identifier n
12 12 Op_divide
12 13 Identifier k
12 14 Op_multiply
12 15 Identifier k
12 16 Op_notequal
12 18 Identifier n
12 19 Semicolon
13 9 Identifier k
13 10 Op_assign
13 11 Identifier k
13 12 Op_add
13 13 Integer 2
13 14 Semicolon
14 5 RightBrace
15 5 Keyword_if
15 8 LeftParen
15 9 Identifier p
15 10 RightParen
15 12 LeftBrace
16 9 Keyword_print
16 14 LeftParen
16 15 Identifier n
16 16 Comma
16 18 String " is prime\n"
16 31 RightParen
16 32 Semicolon
17 9 Identifier count
17 15 Op_assign
17 17 Identifier count
17 23 Op_add
17 25 Integer 1
17 26 Semicolon
18 5 RightBrace
19 1 RightBrace
20 1 Keyword_print
20 6 LeftParen
20 7 String "Total primes found: "
20 29 Comma
20 31 Identifier count
20 36 Comma
20 38 String "\n"
20 42 RightParen
20 43 Semicolon
21 1 End_of_input</pre>
 
=={{header|Scala}}==
The complete implementation for the compiler tasks can be found in a GitHub repository at [https://github.com/edadma/rosettacodeCompiler github.com/edadma/rosettacodeCompiler], which includes full unit testing for the samples given in [[Compiler/Sample programs]].

The following code implements a lexical analyzer that is configurable via the symbol map and keyword map supplied to it as parameters.
 
<syntaxhighlight lang="scala">
package xyz.hyperreal.rosettacodeCompiler
 
import scala.io.Source
import scala.util.matching.Regex
 
object LexicalAnalyzer {
private val EOT = '\u0004'
 
val symbols =
    Map(
      "*"  -> "Op_multiply",
      "/"  -> "Op_divide",
      "%"  -> "Op_mod",
      "+"  -> "Op_add",
      "-"  -> "Op_subtract",
      "<"  -> "Op_less",
      "<=" -> "Op_lessequal",
      ">"  -> "Op_greater",
      ">=" -> "Op_greaterequal",
      "==" -> "Op_equal",
      "!=" -> "Op_notequal",
      "!"  -> "Op_not",
      "="  -> "Op_assign",
      "&&" -> "Op_and",
      "||" -> "Op_or",
"(" -> "LeftParen",
")" -> "RightParen",
"{" -> "LeftBrace",
"}" -> "RightBrace",
";" -> "Semicolon",
"," -> "Comma"
)
 
val keywords =
Map(
"if" -> "Keyword_if",
"else" -> "Keyword_else",
"while" -> "Keyword_while",
"print" -> "Keyword_print",
"putc" -> "Keyword_putc"
)
val alpha = ('a' to 'z' toSet) ++ ('A' to 'Z')
val numeric = '0' to '9' toSet
val alphanumeric = alpha ++ numeric
val identifiers = StartRestToken("Identifier", alpha + '_', alphanumeric + '_')
val integers = SimpleToken("Integer", numeric, alpha, "alpha characters may not follow right after a number")
 
val characters =
DelimitedToken("Integer",
'\'',
"[^'\\n]|\\\\n|\\\\\\\\" r,
"invalid character literal",
"unclosed character literal")
 
val strings =
DelimitedToken("String", '"', "[^\"\\n]*" r, "invalid string literal", "unclosed string literal")
 
def apply =
new LexicalAnalyzer(4, symbols, keywords, "End_of_input", identifiers, integers, characters, strings)
 
abstract class Token
case class StartRestToken(name: String, start: Set[Char], rest: Set[Char]) extends Token
case class SimpleToken(name: String, chars: Set[Char], exclude: Set[Char], excludeError: String) extends Token
case class DelimitedToken(name: String, delimiter: Char, pattern: Regex, patternError: String, unclosedError: String)
extends Token
}
 
class LexicalAnalyzer(tabs: Int,
symbols: Map[String, String],
keywords: Map[String, String],
endOfInput: String,
identifier: LexicalAnalyzer.Token,
tokens: LexicalAnalyzer.Token*) {
 
import LexicalAnalyzer._
 
private val symbolStartChars = symbols.keys map (_.head) toSet
private val symbolChars = symbols.keys flatMap (_.toList) toSet
private var curline: Int = _
private var curcol: Int = _
 
def fromStdin = fromSource(Source.stdin)
 
def fromString(src: String) = fromSource(Source.fromString(src))
 
def fromSource(ast: Source) = {
curline = 1
curcol = 1
 
var s = (ast ++ Iterator(EOT)) map (new Chr(_)) toStream
 
tokenize
 
def token(name: String, first: Chr) = println(f"${first.line}%5d ${first.col}%6d $name")
 
def value(name: String, v: String, first: Chr) = println(f"${first.line}%5d ${first.col}%6d $name%-14s $v")
 
def until(c: Char) = {
val buf = new StringBuilder
 
def until: String =
if (s.head.c == EOT || s.head.c == c)
buf.toString
else {
buf += getch
until
}
 
until
}
 
def next = s = s.tail
 
def getch = {
val c = s.head.c
 
next
c
}
 
def consume(first: Char, cs: Set[Char]) = {
val buf = new StringBuilder
 
def consume: String =
if (s.head.c == EOT || !cs(s.head.c))
buf.toString
else {
buf += getch
consume
}
 
buf += first
consume
}
 
def comment(start: Chr): Unit = {
until('*')
 
if (s.head.c == EOT || s.tail.head.c == EOT)
sys.error(s"unclosed comment ${start.at}")
else if (s.tail.head.c != '/') {
next
comment(start)
} else {
next
next
}
}
 
def recognize(t: Token): Option[(String, String)] = {
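      // Attempt to match token definition `t` at the current position; on
      // failure, restore the stream to its saved state and return None.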
val first = s
 
next
 
t match {
case StartRestToken(name, start, rest) =>
if (start(first.head.c))
Some((name, consume(first.head.c, rest)))
else {
s = first
None
}
case SimpleToken(name, chars, exclude, excludeError) =>
if (chars(first.head.c)) {
val m = consume(first.head.c, chars)
 
if (exclude(s.head.c))
sys.error(s"$excludeError ${s.head.at}")
else
Some((name, m))
} else {
s = first
None
}
case DelimitedToken(name, delimiter, pattern, patternError, unclosedError) =>
if (first.head.c == delimiter) {
val m = until(delimiter)
 
if (s.head.c != delimiter)
sys.error(s"$unclosedError ${first.head.at}")
else if (pattern.pattern.matcher(m).matches) {
next
Some((name, s"$delimiter$m$delimiter"))
} else
sys.error(s"$patternError ${s.head.at}")
} else {
s = first
None
}
}
}
 
def tokenize: Unit =
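      // Emits one token (or skips whitespace/a comment) per call and
      // recurses until the EOT sentinel is reached.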
if (s.head.c == EOT)
token(endOfInput, s.head)
else {
if (s.head.c.isWhitespace)
next
else if (s.head.c == '/' && s.tail.head.c == '*')
comment(s.head)
else if (symbolStartChars(s.head.c)) {
val first = s.head
val buf = new StringBuilder
 
while (!symbols.contains(buf.toString) && s.head.c != EOT && symbolChars(s.head.c)) buf += getch
 
while (symbols.contains(buf.toString :+ s.head.c) && s.head.c != EOT && symbolChars(s.head.c)) buf += getch
 
symbols get buf.toString match {
case Some(name) => token(name, first)
case None => sys.error(s"unrecognized symbol: '${buf.toString}' ${first.at}")
}
} else {
val first = s.head
 
recognize(identifier) match {
case None =>
find(0)
 
@scala.annotation.tailrec
def find(t: Int): Unit =
if (t == tokens.length)
sys.error(s"unrecognized character ${first.at}")
else
recognize(tokens(t)) match {
case None => find(t + 1)
case Some((name, v)) => value(name, v, first)
}
case Some((name, ident)) =>
keywords get ident match {
case None => value(name, ident, first)
case Some(keyword) => token(keyword, first)
}
}
}
 
tokenize
}
}
 
private class Chr(val c: Char) {
val line = curline
val col = curcol
 
if (c == '\n') {
curline += 1
curcol = 1
} else if (c == '\r')
curcol = 1
else if (c == '\t')
curcol += tabs - (curcol - 1) % tabs
else
curcol += 1
 
def at = s"[${line}, ${col}]"
 
override def toString: String = s"<$c, $line, $col>"
}
 
}
</syntaxhighlight>
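
A minimal driver is sketched below (the <tt>LexMain</tt> object name is illustrative only and is not part of the linked repository); it runs the pre-configured analyzer over standard input:

<syntaxhighlight lang="scala">package xyz.hyperreal.rosettacodeCompiler

// Illustrative driver: read the tiny-language source from standard input
// and print the token stream produced by the analyzer defined above.
object LexMain extends App {
  LexicalAnalyzer.apply.fromStdin
}</syntaxhighlight>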
 
=={{header|Scheme}}==

<syntaxhighlight lang="scheme">
(import (scheme base)
(scheme char)
(display-tokens (lexer (cadr (command-line))))
    (display "Error: provide program filename\n"))
</syntaxhighlight>
 
{{out}}
<pre>
5 1 End_of_input
</pre>
 
=={{header|Standard ML}}==
{{trans|ATS}}
{{trans|OCaml}}
 
 
<syntaxhighlight lang="sml">(*------------------------------------------------------------------*)
(* The Rosetta Code lexical analyzer, in Standard ML. Based on the ATS
and the OCaml. The intended compiler is Mlton or Poly/ML; there is
a tiny difference near the end of the file, depending on which
compiler is used. *)
 
(*------------------------------------------------------------------*)
(* The following functions are compatible with ASCII. *)
 
fun
is_digit ichar =
48 <= ichar andalso ichar <= 57
 
fun
is_lower ichar =
97 <= ichar andalso ichar <= 122
 
fun
is_upper ichar =
65 <= ichar andalso ichar <= 90
 
fun
is_alpha ichar =
is_lower ichar orelse is_upper ichar
 
fun
is_alnum ichar =
is_digit ichar orelse is_alpha ichar
 
fun
is_ident_start ichar =
is_alpha ichar orelse ichar = 95
 
fun
is_ident_continuation ichar =
is_alnum ichar orelse ichar = 95
 
fun
is_space ichar =
ichar = 32 orelse (9 <= ichar andalso ichar <= 13)
 
(*------------------------------------------------------------------*)
(* Character input more like that of C. There are various advantages
and disadvantages to this method, but key points in its favor are:
(a) it is how character input is done in the original ATS code, (b)
Unicode code points are 21-bit positive integers. *)
 
val eof = ~1
 
fun
input_ichar inpf =
case TextIO.input1 inpf of
NONE => eof
| SOME c => Char.ord c
 
(*------------------------------------------------------------------*)
 
(* The type of an input character. *)
 
structure Ch =
struct
 
type t = {
ichar : int,
line_no : int,
column_no : int
}
 
end
 
(*------------------------------------------------------------------*)
(* Inputting with unlimited pushback, and with counting of lines and
columns. *)
 
structure Inp =
struct
 
type t = {
inpf : TextIO.instream,
pushback : Ch.t list,
line_no : int,
column_no : int
}
 
fun
of_instream inpf =
{
inpf = inpf,
pushback = [],
line_no = 1,
column_no = 1
} : t
 
fun
get_ch ({ inpf = inpf,
pushback = pushback,
line_no = line_no,
column_no = column_no } : t) =
case pushback of
ch :: tail =>
let
val inp = { inpf = inpf,
pushback = tail,
line_no = line_no,
column_no = column_no }
in
(ch, inp)
end
| [] =>
let
val ichar = input_ichar inpf
val ch = { ichar = ichar,
line_no = line_no,
column_no = column_no }
in
if ichar = Char.ord #"\n" then
let
val inp = { inpf = inpf,
pushback = [],
line_no = line_no + 1,
column_no = 1 }
in
(ch, inp)
end
else
let
val inp = { inpf = inpf,
pushback = [],
line_no = line_no,
column_no = column_no + 1 }
in
(ch, inp)
end
end
 
fun
push_back_ch (ch, inp : t) =
{
inpf = #inpf inp,
pushback = ch :: #pushback inp,
line_no = #line_no inp,
column_no = #column_no inp
}
 
end
 
(*------------------------------------------------------------------*)
(* Tokens, appearing in tuples with arguments, and with line and
column numbers. The tokens are integers, so they can be used as
array indices. *)
 
val token_ELSE = 0
val token_IF = 1
val token_PRINT = 2
val token_PUTC = 3
val token_WHILE = 4
val token_MULTIPLY = 5
val token_DIVIDE = 6
val token_MOD = 7
val token_ADD = 8
val token_SUBTRACT = 9
val token_NEGATE = 10
val token_LESS = 11
val token_LESSEQUAL = 12
val token_GREATER = 13
val token_GREATEREQUAL = 14
val token_EQUAL = 15
val token_NOTEQUAL = 16
val token_NOT = 17
val token_ASSIGN = 18
val token_AND = 19
val token_OR = 20
val token_LEFTPAREN = 21
val token_RIGHTPAREN = 22
val token_LEFTBRACE = 23
val token_RIGHTBRACE = 24
val token_SEMICOLON = 25
val token_COMMA = 26
val token_IDENTIFIER = 27
val token_INTEGER = 28
val token_STRING = 29
val token_END_OF_INPUT = 30
 
(* A *very* simple perfect hash for the reserved words. (Yes, this is
overkill, except for demonstration of the principle.) *)
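
(* A worked example of the hash, using ASCII codes of the first two
   characters:  "if"    -> (105 + 102) mod 9 = 0
                "print" -> (112 + 114) mod 9 = 1
                "else"  -> (101 + 108) mod 9 = 2
                "putc"  -> (112 + 117) mod 9 = 4
                "while" -> (119 + 104) mod 9 = 7
   The unused slots hold "" and therefore fall back to
   token_IDENTIFIER. *)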
 
val reserved_words =
Vector.fromList ["if", "print", "else",
"", "putc", "",
"", "while", ""]
val reserved_word_tokens =
Vector.fromList [token_IF, token_PRINT, token_ELSE,
token_IDENTIFIER, token_PUTC, token_IDENTIFIER,
token_IDENTIFIER, token_WHILE, token_IDENTIFIER]
 
fun
reserved_word_lookup (s, line_no, column_no) =
if (String.size s) < 2 then
(token_IDENTIFIER, s, line_no, column_no)
else
let
val hashval =
(Char.ord (String.sub (s, 0)) +
Char.ord (String.sub (s, 1)))
mod 9
val token = Vector.sub (reserved_word_tokens, hashval)
in
if token = token_IDENTIFIER orelse
s <> Vector.sub (reserved_words, hashval) then
(token_IDENTIFIER, s, line_no, column_no)
else
(token, s, line_no, column_no)
end
 
(* Token to string lookup. *)
 
val token_names =
Vector.fromList
["Keyword_else",
"Keyword_if",
"Keyword_print",
"Keyword_putc",
"Keyword_while",
"Op_multiply",
"Op_divide",
"Op_mod",
"Op_add",
"Op_subtract",
"Op_negate",
"Op_less",
"Op_lessequal",
"Op_greater",
"Op_greaterequal",
"Op_equal",
"Op_notequal",
"Op_not",
"Op_assign",
"Op_and",
"Op_or",
"LeftParen",
"RightParen",
"LeftBrace",
"RightBrace",
"Semicolon",
"Comma",
"Identifier",
"Integer",
"String",
"End_of_input"]
 
fun
token_name token =
Vector.sub (token_names, token)
 
(*------------------------------------------------------------------*)
 
exception Unterminated_comment of int * int
exception Unterminated_character_literal of int * int
exception Multicharacter_literal of int * int
exception End_of_input_in_string_literal of int * int
exception End_of_line_in_string_literal of int * int
exception Unsupported_escape of int * int * char
exception Invalid_integer_literal of int * int * string
exception Unexpected_character of int * int * char
 
(*------------------------------------------------------------------*)
(* Skipping past spaces and comments. (In the Rosetta Code tiny
language, a comment, if you think about it, is a kind of space.) *)
 
fun
scan_comment (inp, line_no, column_no) =
let
fun
loop inp =
let
val (ch, inp) = Inp.get_ch inp
in
if #ichar ch = eof then
raise Unterminated_comment (line_no, column_no)
else if #ichar ch = Char.ord #"*" then
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = eof then
raise Unterminated_comment (line_no, column_no)
else if #ichar ch1 = Char.ord #"/" then
inp
else
loop inp
end
else
loop inp
end
in
loop inp
end
 
fun
skip_spaces_and_comments inp =
let
fun
loop inp =
let
val (ch, inp) = Inp.get_ch inp
in
if is_space (#ichar ch) then
loop inp
else if #ichar ch = Char.ord #"/" then
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"*" then
loop (scan_comment (inp, #line_no ch, #column_no ch))
else
let
val inp = Inp.push_back_ch (ch1, inp)
val inp = Inp.push_back_ch (ch, inp)
in
inp
end
end
else
Inp.push_back_ch (ch, inp)
end
in
loop inp
end
 
(*------------------------------------------------------------------*)
(* Integer literals, identifiers, and reserved words. *)
 
fun
scan_word (lst, inp) =
let
val (ch, inp) = Inp.get_ch inp
in
if is_ident_continuation (#ichar ch) then
scan_word (Char.chr (#ichar ch) :: lst, inp)
else
(lst, Inp.push_back_ch (ch, inp))
end
 
fun
scan_integer_literal inp =
let
val (ch, inp) = Inp.get_ch inp
val (lst, inp) = scan_word ([Char.chr (#ichar ch)], inp)
val s = String.implode (List.rev lst)
in
if List.all (fn c => is_digit (Char.ord c)) lst then
((token_INTEGER, s, #line_no ch, #column_no ch), inp)
else
raise Invalid_integer_literal (#line_no ch, #column_no ch, s)
end
 
fun
scan_identifier_or_reserved_word inp =
let
val (ch, inp) = Inp.get_ch inp
val (lst, inp) = scan_word ([Char.chr (#ichar ch)], inp)
val s = String.implode (List.rev lst)
val toktup = reserved_word_lookup (s, #line_no ch, #column_no ch)
in
(toktup, inp)
end
 
(*------------------------------------------------------------------*)
(* String literals. *)
 
fun
scan_string_literal inp =
let
val (ch, inp) = Inp.get_ch inp
 
fun
scan (lst, inp) =
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = eof then
raise End_of_input_in_string_literal
(#line_no ch, #column_no ch)
else if #ichar ch1 = Char.ord #"\n" then
raise End_of_line_in_string_literal
(#line_no ch, #column_no ch)
else if #ichar ch1 = Char.ord #"\"" then
(lst, inp)
else if #ichar ch1 <> Char.ord #"\\" then
scan (Char.chr (#ichar ch1) :: lst, inp)
else
let
val (ch2, inp) = Inp.get_ch inp
in
if #ichar ch2 = Char.ord #"n" then
scan (#"n" :: #"\\" :: lst, inp)
else if #ichar ch2 = Char.ord #"\\" then
scan (#"\\" :: #"\\" :: lst, inp)
else if #ichar ch2 = eof then
raise End_of_input_in_string_literal
(#line_no ch, #column_no ch)
else if #ichar ch2 = Char.ord #"\n" then
raise End_of_line_in_string_literal
(#line_no ch, #column_no ch)
else
raise Unsupported_escape (#line_no ch1, #column_no ch1,
Char.chr (#ichar ch2))
end
end
 
val lst = #"\"" :: []
val (lst, inp) = scan (lst, inp)
val lst = #"\"" :: lst
val s = String.implode (List.rev lst)
in
((token_STRING, s, #line_no ch, #column_no ch), inp)
end
 
(*------------------------------------------------------------------*)
(* Character literals. *)
 
fun
scan_character_literal_without_checking_end inp =
let
val (ch, inp) = Inp.get_ch inp
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = eof then
raise Unterminated_character_literal
(#line_no ch, #column_no ch)
else if #ichar ch1 = Char.ord #"\\" then
let
val (ch2, inp) = Inp.get_ch inp
in
if #ichar ch2 = eof then
raise Unterminated_character_literal
(#line_no ch, #column_no ch)
else if #ichar ch2 = Char.ord #"n" then
let
val s = Int.toString (Char.ord #"\n")
in
((token_INTEGER, s, #line_no ch, #column_no ch), inp)
end
else if #ichar ch2 = Char.ord #"\\" then
let
val s = Int.toString (Char.ord #"\\")
in
((token_INTEGER, s, #line_no ch, #column_no ch), inp)
end
else
raise Unsupported_escape (#line_no ch1, #column_no ch1,
Char.chr (#ichar ch2))
end
else
let
val s = Int.toString (#ichar ch1)
in
((token_INTEGER, s, #line_no ch, #column_no ch), inp)
end
end
 
fun
scan_character_literal inp =
let
val (toktup, inp) =
scan_character_literal_without_checking_end inp
val (_, _, line_no, column_no) = toktup
 
fun
check_end inp =
let
val (ch, inp) = Inp.get_ch inp
in
if #ichar ch = Char.ord #"'" then
inp
else
let
fun
loop_to_end (ch1 : Ch.t, inp) =
if #ichar ch1 = eof then
raise Unterminated_character_literal (line_no, column_no)
else if #ichar ch1 = Char.ord #"'" then
raise Multicharacter_literal (line_no, column_no)
else
let
val (ch1, inp) = Inp.get_ch inp
in
loop_to_end (ch1, inp)
end
in
loop_to_end (ch, inp)
end
end
 
val inp = check_end inp
in
(toktup, inp)
end
 
(*------------------------------------------------------------------*)
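(* Dispatch on the first character after any spaces and comments, and
   return the next token together with the updated input state. *)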
 
fun
get_next_token inp =
let
val inp = skip_spaces_and_comments inp
val (ch, inp) = Inp.get_ch inp
val ln = #line_no ch
val cn = #column_no ch
in
if #ichar ch = eof then
((token_END_OF_INPUT, "", ln, cn), inp)
else
case Char.chr (#ichar ch) of
#"," => ((token_COMMA, ",", ln, cn), inp)
| #";" => ((token_SEMICOLON, ";", ln, cn), inp)
| #"(" => ((token_LEFTPAREN, "(", ln, cn), inp)
| #")" => ((token_RIGHTPAREN, ")", ln, cn), inp)
| #"{" => ((token_LEFTBRACE, "{", ln, cn), inp)
| #"}" => ((token_RIGHTBRACE, "}", ln, cn), inp)
| #"*" => ((token_MULTIPLY, "*", ln, cn), inp)
| #"/" => ((token_DIVIDE, "/", ln, cn), inp)
| #"%" => ((token_MOD, "%", ln, cn), inp)
| #"+" => ((token_ADD, "+", ln, cn), inp)
| #"-" => ((token_SUBTRACT, "-", ln, cn), inp)
| #"<" =>
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"=" then
((token_LESSEQUAL, "<=", ln, cn), inp)
else
let
val inp = Inp.push_back_ch (ch1, inp)
in
((token_LESS, "<", ln, cn), inp)
end
end
| #">" =>
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"=" then
((token_GREATEREQUAL, ">=", ln, cn), inp)
else
let
val inp = Inp.push_back_ch (ch1, inp)
in
((token_GREATER, ">", ln, cn), inp)
end
end
| #"=" =>
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"=" then
((token_EQUAL, "==", ln, cn), inp)
else
let
val inp = Inp.push_back_ch (ch1, inp)
in
((token_ASSIGN, "=", ln, cn), inp)
end
end
| #"!" =>
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"=" then
((token_NOTEQUAL, "!=", ln, cn), inp)
else
let
val inp = Inp.push_back_ch (ch1, inp)
in
((token_NOT, "!", ln, cn), inp)
end
end
| #"&" =>
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"&" then
((token_AND, "&&", ln, cn), inp)
else
raise Unexpected_character (#line_no ch, #column_no ch,
Char.chr (#ichar ch))
end
| #"|" =>
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"|" then
((token_OR, "||", ln, cn), inp)
else
raise Unexpected_character (#line_no ch, #column_no ch,
Char.chr (#ichar ch))
end
| #"\"" =>
let
val inp = Inp.push_back_ch (ch, inp)
in
scan_string_literal inp
end
| #"'" =>
let
val inp = Inp.push_back_ch (ch, inp)
in
scan_character_literal inp
end
| _ =>
if is_digit (#ichar ch) then
let
val inp = Inp.push_back_ch (ch, inp)
in
scan_integer_literal inp
end
else if is_ident_start (#ichar ch) then
let
val inp = Inp.push_back_ch (ch, inp)
in
scan_identifier_or_reserved_word inp
end
else
raise Unexpected_character (#line_no ch, #column_no ch,
Char.chr (#ichar ch))
end
 
fun
output_integer_rightjust (outf, num) =
(if num < 10 then
TextIO.output (outf, " ")
else if num < 100 then
TextIO.output (outf, " ")
else if num < 1000 then
TextIO.output (outf, " ")
else if num < 10000 then
TextIO.output (outf, " ")
else
();
TextIO.output (outf, Int.toString num))
 
fun
print_token (outf, toktup) =
let
val (token, arg, line_no, column_no) = toktup
val name = token_name token
val (padding, str) =
if token = token_IDENTIFIER then
(" ", arg)
else if token = token_INTEGER then
(" ", arg)
else if token = token_STRING then
(" ", arg)
else("", "")
in
output_integer_rightjust (outf, line_no);
TextIO.output (outf, " ");
output_integer_rightjust (outf, column_no);
TextIO.output (outf, " ");
TextIO.output (outf, name);
TextIO.output (outf, padding);
TextIO.output (outf, str);
TextIO.output (outf, "\n")
end
 
fun
scan_text (outf, inp) =
let
fun
loop inp =
let
val (toktup, inp) = get_next_token inp
in
(print_token (outf, toktup);
let
val (token, _, _, _) = toktup
in
if token <> token_END_OF_INPUT then
loop inp
else
()
end)
end
in
loop inp
end
 
(*------------------------------------------------------------------*)
 
fun
main () =
let
val args = CommandLine.arguments ()
val (inpf_filename, outf_filename) =
case args of
[] => ("-", "-")
| name :: [] => (name, "-")
| name1 :: name2 :: _ => (name1, name2)
val inpf =
if inpf_filename = "-" then
TextIO.stdIn
else
TextIO.openIn inpf_filename
handle
(IO.Io _) =>
(TextIO.output (TextIO.stdErr, "Failure opening \"");
TextIO.output (TextIO.stdErr, inpf_filename);
TextIO.output (TextIO.stdErr, "\" for input\n");
OS.Process.exit OS.Process.failure)
val outf =
if outf_filename = "-" then
TextIO.stdOut
else
TextIO.openOut outf_filename
handle
(IO.Io _) =>
(TextIO.output (TextIO.stdErr, "Failure opening \"");
TextIO.output (TextIO.stdErr, outf_filename);
TextIO.output (TextIO.stdErr, "\" for output\n");
OS.Process.exit OS.Process.failure)
val inp = Inp.of_instream inpf
in
scan_text (outf, inp)
end
handle Unterminated_comment (line_no, column_no) =>
(TextIO.output (TextIO.stdErr, ": unterminated comment ");
TextIO.output (TextIO.stdErr, "starting at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| Unterminated_character_literal (line_no, column_no) =>
(TextIO.output (TextIO.stdErr, ": unterminated character ");
TextIO.output (TextIO.stdErr, "literal starting at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| Multicharacter_literal (line_no, column_no) =>
(TextIO.output (TextIO.stdErr, ": unsupported multicharacter");
TextIO.output (TextIO.stdErr, " literal starting at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| End_of_input_in_string_literal (line_no, column_no) =>
(TextIO.output (TextIO.stdErr, ": end of input in string");
TextIO.output (TextIO.stdErr, " literal starting at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| End_of_line_in_string_literal (line_no, column_no) =>
(TextIO.output (TextIO.stdErr, ": end of line in string");
TextIO.output (TextIO.stdErr, " literal starting at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| Unsupported_escape (line_no, column_no, c) =>
(TextIO.output (TextIO.stdErr, CommandLine.name ());
TextIO.output (TextIO.stdErr, ": unsupported escape \\");
TextIO.output (TextIO.stdErr, Char.toString c);
TextIO.output (TextIO.stdErr, " at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| Invalid_integer_literal (line_no, column_no, str) =>
(TextIO.output (TextIO.stdErr, CommandLine.name ());
TextIO.output (TextIO.stdErr, ": invalid integer literal ");
TextIO.output (TextIO.stdErr, str);
TextIO.output (TextIO.stdErr, " at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| Unexpected_character (line_no, column_no, c) =>
(TextIO.output (TextIO.stdErr, CommandLine.name ());
TextIO.output (TextIO.stdErr, ": unexpected character '");
TextIO.output (TextIO.stdErr, Char.toString c);
TextIO.output (TextIO.stdErr, "' at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure);
 
(*------------------------------------------------------------------*)
(* For the Mlton compiler, include the following. For Poly/ML, comment
it out. *)
main ();
 
(*------------------------------------------------------------------*)
(* Instructions for GNU Emacs. *)
 
(* local variables: *)
(* mode: sml *)
(* sml-indent-level: 2 *)
(* sml-indent-args: 2 *)
(* end: *)
(*------------------------------------------------------------------*)</syntaxhighlight>
 
 
{{out}}
For Mlton, compile with
<pre>mlton -output lex lex.sml</pre>
 
For Poly/ML, compile with
<pre>polyc -o lex lex.sml</pre>
 
Mlton is an optimizing whole-program compiler. It might take much longer to compile the source but produce much faster executables.
 
Output for testcase3:
<pre> 5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|Wren}}==
{{trans|Go}}
{{libheader|Wren-dynamic}}
{{libheader|Wren-str}}
{{libheader|Wren-fmt}}
{{libheader|Wren-ioutil}}
<syntaxhighlight lang="wren">import "./dynamic" for Enum, Struct, Tuple
import "./str" for Char
import "./fmt" for Fmt
import "./ioutil" for FileUtil
import "os" for Process
 
var tokens = [
"EOI",
"Mul",
"Div",
"Mod",
"Add",
"Sub",
"Negate",
"Not",
"Lss",
"Leq",
"Gtr",
"Geq",
"Eq",
"Neq",
"Assign",
"And",
"Or",
"If",
"Else",
"While",
"Print",
"Putc",
"Lparen",
"Rparen",
"Lbrace",
"Rbrace",
"Semi",
"Comma",
"Ident",
"Integer",
"String"
]
 
var Token = Enum.create("Token", tokens)
 
var TokData = Struct.create("TokData", ["eline", "ecol", "tok", "v"])
 
var Symbol = Tuple.create("Symbol", ["name", "tok"])
 
// symbol table
var symtab = []
 
var curLine = ""
var curCh = ""
var lineNum = 0
var colNum = 0
var etx = 4 // used to signify EOI
 
var lines = []
var lineCount = 0
 
var errorMsg = Fn.new { |eline, ecol, msg| Fiber.abort("(%(eline):%(ecol)) %(msg)") }
 
// add an identifier to the symbol table
var install = Fn.new { |name, tok|
var sym = Symbol.new(name, tok)
symtab.add(sym)
}
 
// search for an identifier in the symbol table
var lookup = Fn.new { |name|
for (i in 0...symtab.count) {
if (symtab[i].name == name) return i
}
return -1
}
 
// read the next line of input from the source file
var nextLine // recursive function
nextLine = Fn.new {
if (lineNum == lineCount) {
curCh = etx
curLine = ""
colNum = 1
return
}
curLine = lines[lineNum]
lineNum = lineNum + 1
colNum = 0
if (curLine == "") nextLine.call() // skip blank lines
}
 
// get the next char
var nextChar = Fn.new {
if (colNum >= curLine.count) nextLine.call()
if (colNum < curLine.count) {
curCh = curLine[colNum]
colNum = colNum + 1
}
}
 
var follow = Fn.new { |eline, ecol, expect, ifyes, ifno|
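    // if the next character equals 'expect', consume it and return 'ifyes';
    // otherwise return 'ifno' (passing Token.EOI as 'ifno' turns a mismatch into an error)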
if (curCh == expect) {
nextChar.call()
return ifyes
}
if (ifno == Token.EOI) {
errorMsg.call(eline, ecol, "follow unrecognized character: " + curCh)
}
return ifno
}
 
var getTok // recursive function
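// scans and returns the next token; the recursion lets it restart cleanly after a comment is skipped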
getTok = Fn.new {
// skip whitespace
while (curCh == " " || curCh == "\t" || curCh == "\n") nextChar.call()
var td = TokData.new(lineNum, colNum, 0, "")
if (curCh == etx) {
td.tok = Token.EOI
return td
}
if (curCh == "{") {
td.tok = Token.Lbrace
nextChar.call()
return td
}
if (curCh == "}") {
td.tok = Token.Rbrace
nextChar.call()
return td
}
if (curCh == "(") {
td.tok = Token.Lparen
nextChar.call()
return td
}
if (curCh == ")") {
td.tok = Token.Rparen
nextChar.call()
return td
}
if (curCh == "+") {
td.tok = Token.Add
nextChar.call()
return td
}
if (curCh == "-") {
td.tok = Token.Sub
nextChar.call()
return td
}
if (curCh == "*") {
td.tok = Token.Mul
nextChar.call()
return td
}
if (curCh == "\%") {
td.tok = Token.Mod
nextChar.call()
return td
}
if (curCh == ";") {
td.tok = Token.Semi
nextChar.call()
return td
}
if (curCh == ",") {
td.tok = Token.Comma
nextChar.call()
return td
}
if (curCh == "'") { // single char literals
nextChar.call()
td.v = curCh.bytes[0].toString
if (curCh == "'") {
errorMsg.call(td.eline, td.ecol, "Empty character constant")
}
if (curCh == "\\") {
nextChar.call()
if (curCh == "n") {
td.v = "10"
} else if (curCh == "\\") {
td.v = "92"
} else {
errorMsg.call(td.eline, td.ecol, "unknown escape sequence: "+ curCh)
}
}
nextChar.call()
if (curCh != "'") {
errorMsg.call(td.eline, td.ecol, "multi-character constant")
}
nextChar.call()
td.tok = Token.Integer
return td
}
if (curCh == "<") {
nextChar.call()
td.tok = follow.call(td.eline, td.ecol, "=", Token.Leq, Token.Lss)
return td
}
if (curCh == ">") {
nextChar.call()
td.tok = follow.call(td.eline, td.ecol, "=", Token.Geq, Token.Gtr)
return td
}
if (curCh == "!") {
nextChar.call()
td.tok = follow.call(td.eline, td.ecol, "=", Token.Neq, Token.Not)
return td
}
if (curCh == "=") {
nextChar.call()
td.tok = follow.call(td.eline, td.ecol, "=", Token.Eq, Token.Assign)
return td
}
if (curCh == "&") {
nextChar.call()
td.tok = follow.call(td.eline, td.ecol, "&", Token.And, Token.EOI)
return td
}
if (curCh == "|") {
nextChar.call()
td.tok = follow.call(td.eline, td.ecol, "|", Token.Or, Token.EOI)
return td
}
if (curCh == "\"") { // string
td.v = curCh
nextChar.call()
while (curCh != "\"") {
if (curCh == "\n") {
errorMsg.call(td.eline, td.ecol, "EOL in string")
}
if (curCh == etx) {
errorMsg.call(td.eline, td.ecol, "EOF in string")
}
td.v = td.v + curCh
nextChar.call()
}
td.v = td.v + curCh
nextChar.call()
td.tok = Token.String
return td
}
if (curCh == "/") { // div or comment
nextChar.call()
if (curCh != "*") {
td.tok = Token.Div
return td
}
// skip comments
nextChar.call()
while (true) {
if (curCh == "*") {
nextChar.call()
if (curCh == "/") {
nextChar.call()
return getTok.call()
}
} else if (curCh == etx) {
errorMsg.call(td.eline, td.ecol, "EOF in comment")
} else {
nextChar.call()
}
}
}
// integers or identifiers
var isNumber = Char.isDigit(curCh)
td.v = ""
while (Char.isAsciiAlphaNum(curCh) || curCh == "_") {
if (!Char.isDigit(curCh)) isNumber = false
td.v = td.v + curCh
nextChar.call()
}
if (td.v.count == 0) {
errorMsg.call(td.eline, td.ecol, "unknown character: " + curCh)
}
if (Char.isDigit(td.v[0])) {
if (!isNumber) {
errorMsg.call(td.eline, td.ecol, "invalid number: " + curCh)
}
td.tok = Token.Integer
return td
}
var index = lookup.call(td.v)
td.tok = (index == -1) ? Token.Ident : symtab[index].tok
return td
}
 
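// install the keywords in the symbol table and read the first character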
var initLex = Fn.new {
install.call("else", Token.Else)
install.call("if", Token.If)
install.call("print", Token.Print)
install.call("putc", Token.Putc)
install.call("while", Token.While)
nextChar.call()
}
 
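// print the token stream, one token per line, in the format required by the task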
var process = Fn.new {
var tokMap = {}
tokMap[Token.EOI] = "End_of_input"
tokMap[Token.Mul] = "Op_multiply"
tokMap[Token.Div] = "Op_divide"
tokMap[Token.Mod] = "Op_mod"
tokMap[Token.Add] = "Op_add"
tokMap[Token.Sub] = "Op_subtract"
tokMap[Token.Negate] = "Op_negate"
tokMap[Token.Not] = "Op_not"
tokMap[Token.Lss] = "Op_less"
tokMap[Token.Leq] = "Op_lessequal"
tokMap[Token.Gtr] = "Op_greater"
tokMap[Token.Geq] = "Op_greaterequal"
tokMap[Token.Eq] = "Op_equal"
tokMap[Token.Neq] = "Op_notequal"
tokMap[Token.Assign] = "Op_assign"
tokMap[Token.And] = "Op_and"
tokMap[Token.Or] = "Op_or"
tokMap[Token.If] = "Keyword_if"
tokMap[Token.Else] = "Keyword_else"
tokMap[Token.While] = "Keyword_while"
tokMap[Token.Print] = "Keyword_print"
tokMap[Token.Putc] = "Keyword_putc"
tokMap[Token.Lparen] = "LeftParen"
tokMap[Token.Rparen] = "RightParen"
tokMap[Token.Lbrace] = "LeftBrace"
tokMap[Token.Rbrace] = "RightBrace"
tokMap[Token.Semi] = "Semicolon"
tokMap[Token.Comma] = "Comma"
tokMap[Token.Ident] = "Identifier"
tokMap[Token.Integer] = "Integer"
tokMap[Token.String] = "String"
 
while (true) {
var td = getTok.call()
Fmt.write("$5d $5d $-16s", td.eline, td.ecol, tokMap[td.tok])
if (td.tok == Token.Integer || td.tok == Token.Ident || td.tok == Token.String) {
System.print(td.v)
} else {
System.print()
}
if (td.tok == Token.EOI) return
}
}
 
var args = Process.arguments
if (args.count == 0) {
System.print("Filename required")
return
}
 
lines = FileUtil.readLines(args[0])
lineCount = lines.count
initLex.call()
process.call()</syntaxhighlight>
 
{{out}}
For test case 3:
<pre>
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input
</pre>
 
=={{header|Zig}}==
<syntaxhighlight lang="zig">
const std = @import("std");
 
pub const TokenType = enum {
unknown,
multiply,
divide,
mod,
add,
subtract,
negate,
less,
less_equal,
greater,
greater_equal,
equal,
not_equal,
not,
assign,
bool_and,
bool_or,
left_paren,
right_paren,
left_brace,
right_brace,
semicolon,
comma,
kw_if,
kw_else,
kw_while,
kw_print,
kw_putc,
identifier,
integer,
string,
eof,
 
// A more efficient implementation could use `std.enums.directEnumArray`.
pub fn toString(self: @This()) []const u8 {
return switch (self) {
.unknown => "UNKNOWN",
.multiply => "Op_multiply",
.divide => "Op_divide",
.mod => "Op_mod",
.add => "Op_add",
.subtract => "Op_subtract",
.negate => "Op_negate",
.less => "Op_less",
.less_equal => "Op_lessequal",
.greater => "Op_greater",
.greater_equal => "Op_greaterequal",
.equal => "Op_equal",
.not_equal => "Op_notequal",
.not => "Op_not",
.assign => "Op_assign",
.bool_and => "Op_and",
.bool_or => "Op_or",
.left_paren => "LeftParen",
.right_paren => "RightParen",
.left_brace => "LeftBrace",
.right_brace => "RightBrace",
.semicolon => "Semicolon",
.comma => "Comma",
.kw_if => "Keyword_if",
.kw_else => "Keyword_else",
.kw_while => "Keyword_while",
.kw_print => "Keyword_print",
.kw_putc => "Keyword_putc",
.identifier => "Identifier",
.integer => "Integer",
.string => "String",
.eof => "End_of_input",
};
}
};
 
pub const TokenValue = union(enum) {
intlit: i32,
string: []const u8,
};
 
pub const Token = struct {
line: usize,
col: usize,
typ: TokenType = .unknown,
value: ?TokenValue = null,
};
 
// Error conditions described in the task.
pub const LexerError = error{
EmptyCharacterConstant,
UnknownEscapeSequence,
MulticharacterConstant,
EndOfFileInComment,
EndOfFileInString,
EndOfLineInString,
UnrecognizedCharacter,
InvalidNumber,
};
 
pub const Lexer = struct {
content: []const u8,
line: usize,
col: usize,
offset: usize,
start: bool,
 
const Self = @This();
 
pub fn init(content: []const u8) Lexer {
return Lexer{
.content = content,
.line = 1,
.col = 1,
.offset = 0,
.start = true,
};
}
 
pub fn buildToken(self: Self) Token {
return Token{ .line = self.line, .col = self.col };
}
 
pub fn buildTokenT(self: Self, typ: TokenType) Token {
return Token{ .line = self.line, .col = self.col, .typ = typ };
}
 
pub fn curr(self: Self) u8 {
return self.content[self.offset];
}
 
// An alternative implementation would return a `Token` value from `next()`,
// which is arguably more idiomatic.
pub fn next(self: *Self) ?u8 {
// We use `start` so that the very first invocation of `next()` returns
// the very first character. It should be possible to avoid this variable.
if (self.start) {
self.start = false;
} else {
const newline = self.curr() == '\n';
self.offset += 1;
if (newline) {
self.col = 1;
self.line += 1;
} else {
self.col += 1;
}
}
if (self.offset >= self.content.len) {
return null;
} else {
return self.curr();
}
}
 
pub fn peek(self: Self) ?u8 {
if (self.offset + 1 >= self.content.len) {
return null;
} else {
return self.content[self.offset + 1];
}
}
 
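/// Distinguish a division operator from a comment; comments are skipped and null is returned for them.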
fn divOrComment(self: *Self) LexerError!?Token {
var result = self.buildToken();
if (self.peek()) |peek_ch| {
if (peek_ch == '*') {
_ = self.next(); // peeked character
while (self.next()) |ch| {
if (ch == '*') {
if (self.peek()) |next_ch| {
if (next_ch == '/') {
_ = self.next(); // peeked character
return null;
}
}
}
}
return LexerError.EndOfFileInComment;
}
}
result.typ = .divide;
return result;
}
 
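/// Scan an identifier and classify it as a keyword when it matches one of the reserved words.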
fn identifierOrKeyword(self: *Self) !Token {
var result = self.buildToken();
const init_offset = self.offset;
while (self.peek()) |ch| : (_ = self.next()) {
switch (ch) {
'_', 'a'...'z', 'A'...'Z', '0'...'9' => {},
else => break,
}
}
const final_offset = self.offset + 1;
 
if (std.mem.eql(u8, self.content[init_offset..final_offset], "if")) {
result.typ = .kw_if;
} else if (std.mem.eql(u8, self.content[init_offset..final_offset], "else")) {
result.typ = .kw_else;
} else if (std.mem.eql(u8, self.content[init_offset..final_offset], "while")) {
result.typ = .kw_while;
} else if (std.mem.eql(u8, self.content[init_offset..final_offset], "print")) {
result.typ = .kw_print;
} else if (std.mem.eql(u8, self.content[init_offset..final_offset], "putc")) {
result.typ = .kw_putc;
} else {
result.typ = .identifier;
result.value = TokenValue{ .string = self.content[init_offset..final_offset] };
}
 
return result;
}
 
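/// Scan a string literal (the surrounding quotes are kept in the token value), validating escape sequences.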
fn string(self: *Self) !Token {
var result = self.buildToken();
result.typ = .string;
const init_offset = self.offset;
while (self.next()) |ch| {
switch (ch) {
'"' => break,
'\n' => return LexerError.EndOfLineInString,
'\\' => {
switch (self.peek() orelse return LexerError.EndOfFileInString) {
'n', '\\' => _ = self.next(), // peeked character
else => return LexerError.UnknownEscapeSequence,
}
},
else => {},
}
} else {
return LexerError.EndOfFileInString;
}
const final_offset = self.offset + 1;
result.value = TokenValue{ .string = self.content[init_offset..final_offset] };
return result;
}
 
/// Choose either `ifyes` or `ifno` token type depending on whether the peeked
/// character is `by`.
fn followed(self: *Self, by: u8, ifyes: TokenType, ifno: TokenType) Token {
var result = self.buildToken();
if (self.peek()) |ch| {
if (ch == by) {
_ = self.next(); // peeked character
result.typ = ifyes;
} else {
result.typ = ifno;
}
} else {
result.typ = ifno;
}
return result;
}
 
/// Raise an error if there's no next `by` character but return token with `typ` otherwise.
fn consecutive(self: *Self, by: u8, typ: TokenType) LexerError!Token {
const result = self.buildTokenT(typ);
if (self.peek()) |ch| {
if (ch == by) {
_ = self.next(); // peeked character
return result;
} else {
return LexerError.UnrecognizedCharacter;
}
} else {
return LexerError.UnrecognizedCharacter;
}
}
 
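/// Scan a decimal integer literal; a letter or underscore immediately after the digits is an error.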
fn integerLiteral(self: *Self) LexerError!Token {
var result = self.buildTokenT(.integer);
const init_offset = self.offset;
while (self.peek()) |ch| {
switch (ch) {
'0'...'9' => _ = self.next(), // peeked character
'_', 'a'...'z', 'A'...'Z' => return LexerError.InvalidNumber,
else => break,
}
}
const final_offset = self.offset + 1;
result.value = TokenValue{
.intlit = std.fmt.parseInt(i32, self.content[init_offset..final_offset], 10) catch {
return LexerError.InvalidNumber;
},
};
return result;
}
 
// This shows how Zig lets us remove boilerplate while still keeping
// exhaustive error handling.
fn nextOrEmpty(self: *Self) LexerError!u8 {
return self.next() orelse LexerError.EmptyCharacterConstant;
}
 
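/// Scan a character literal such as 'x', '\n' or '\\' and return it as an integer token.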
fn integerChar(self: *Self) LexerError!Token {
var result = self.buildTokenT(.integer);
switch (try self.nextOrEmpty()) {
'\'', '\n' => return LexerError.EmptyCharacterConstant,
'\\' => {
switch (try self.nextOrEmpty()) {
'n' => result.value = TokenValue{ .intlit = '\n' },
'\\' => result.value = TokenValue{ .intlit = '\\' },
else => return LexerError.UnknownEscapeSequence, // only \n and \\ are supported
}
switch (try self.nextOrEmpty()) {
'\'' => {},
else => return LexerError.MulticharacterConstant,
}
},
else => {
result.value = TokenValue{ .intlit = self.curr() };
switch (try self.nextOrEmpty()) {
'\'' => {},
else => return LexerError.MulticharacterConstant,
}
},
}
return result;
}
};
 
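/// Tokenize the whole input, returning the list of tokens terminated by an end-of-input token.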
pub fn lex(allocator: std.mem.Allocator, content: []u8) !std.ArrayList(Token) {
var tokens = std.ArrayList(Token).init(allocator);
var lexer = Lexer.init(content);
while (lexer.next()) |ch| {
switch (ch) {
' ' => {},
'*' => try tokens.append(lexer.buildTokenT(.multiply)),
'%' => try tokens.append(lexer.buildTokenT(.mod)),
'+' => try tokens.append(lexer.buildTokenT(.add)),
'-' => try tokens.append(lexer.buildTokenT(.subtract)),
'<' => try tokens.append(lexer.followed('=', .less_equal, .less)),
'>' => try tokens.append(lexer.followed('=', .greater_equal, .greater)),
'=' => try tokens.append(lexer.followed('=', .equal, .assign)),
'!' => try tokens.append(lexer.followed('=', .not_equal, .not)),
'(' => try tokens.append(lexer.buildTokenT(.left_paren)),
')' => try tokens.append(lexer.buildTokenT(.right_paren)),
'{' => try tokens.append(lexer.buildTokenT(.left_brace)),
'}' => try tokens.append(lexer.buildTokenT(.right_brace)),
';' => try tokens.append(lexer.buildTokenT(.semicolon)),
',' => try tokens.append(lexer.buildTokenT(.comma)),
'&' => try tokens.append(try lexer.consecutive('&', .bool_and)),
'|' => try tokens.append(try lexer.consecutive('|', .bool_or)),
'/' => {
if (try lexer.divOrComment()) |token| try tokens.append(token);
},
'_', 'a'...'z', 'A'...'Z' => try tokens.append(try lexer.identifierOrKeyword()),
'"' => try tokens.append(try lexer.string()),
'0'...'9' => try tokens.append(try lexer.integerLiteral()),
'\'' => try tokens.append(try lexer.integerChar()),
else => {},
}
}
try tokens.append(lexer.buildTokenT(.eof));
 
return tokens;
}
 
pub fn main() !void {
var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
defer arena.deinit();
const allocator = arena.allocator();
 
var arg_it = std.process.args();
_ = try arg_it.next(allocator) orelse unreachable; // program name
const file_name = arg_it.next(allocator);
// We accept both files and standard input.
var file_handle = blk: {
if (file_name) |file_name_delimited| {
const fname: []const u8 = try file_name_delimited;
break :blk try std.fs.cwd().openFile(fname, .{});
} else {
break :blk std.io.getStdIn();
}
};
defer file_handle.close();
const input_content = try file_handle.readToEndAlloc(allocator, std.math.maxInt(usize));
 
const tokens = try lex(allocator, input_content);
const pretty_output = try tokenListToString(allocator, tokens);
_ = try std.io.getStdOut().write(pretty_output);
}
 
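/// Render the token list as the line/column/name/value text expected by the task.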
fn tokenListToString(allocator: std.mem.Allocator, token_list: std.ArrayList(Token)) ![]u8 {
var result = std.ArrayList(u8).init(allocator);
var w = result.writer();
for (token_list.items) |token| {
const common_args = .{ token.line, token.col, token.typ.toString() };
if (token.value) |value| {
const init_fmt = "{d:>5}{d:>7} {s:<15}";
switch (value) {
.string => |str| _ = try w.write(try std.fmt.allocPrint(
allocator,
init_fmt ++ "{s}\n",
common_args ++ .{str},
)),
.intlit => |i| _ = try w.write(try std.fmt.allocPrint(
allocator,
init_fmt ++ "{d}\n",
common_args ++ .{i},
)),
}
} else {
_ = try w.write(try std.fmt.allocPrint(allocator, "{d:>5}{d:>7} {s}\n", common_args));
}
}
return result.items;
}
</syntaxhighlight>