Compiler/lexical analyzer

{{draft task}}Lexical Analyzer
{{clarify task}}
 
Definition from [https://en.wikipedia.org/wiki/Lexical_analysis Wikipedia]:
: ''Lexical analysis is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an identified "meaning"). A program that performs lexical analysis may be called a lexer, tokenizer, or scanner (though "scanner" is also used to refer to the first stage of a lexer).''
 
{{task heading}}
 
Create a lexical analyzer for the simple programming language specified below. The
program should read input from a file and/or stdin, and write output to a file and/or stdout.
If the language being used has a lexer module/library/class, it would be great
if two versions of the solution are provided: One without the lexer module, and one with.
 
{{task heading|Input Specification}}
 
The simple programming language to be analyzed is more or less a subset of [[C]]. It supports the following tokens:
;Operators
 
:::{| class="wikitable"
|-
! Name !! Common name !! Character sequence
|-
| <tt>Op_multiply</tt> || multiply || <tt>*</tt>
|-
| <tt>Op_divide</tt> || divide || <tt>/</tt>
|-
| <tt>Op_mod</tt> || mod || <tt>%</tt>
|-
| <tt>Op_add</tt> || plus || <tt>+</tt>
|-
| <tt>Op_subtract</tt> || minus || <tt>-</tt>
|-
| <tt>Op_negate</tt> || unary minus || <tt>-</tt>
|-
| <tt>Op_less</tt> || less than || <tt><</tt>
|-
| <tt>Op_lessequal</tt> || less than or equal || <tt><=</tt>
|-
| <tt>Op_greater</tt> || greater than || <tt>&gt;</tt>
|-
| <tt>Op_greaterequal</tt> || greater than or equal || <tt>&gt;=</tt>
|-
| <tt>Op_equal</tt> || equal || <tt>==</tt>
|-
| <tt>Op_notequal</tt> || not equal || <tt>&#33;=</tt>
|-
| <tt>Op_not</tt> || unary not || <tt>&#33;</tt>
|-
| <tt>Op_assign</tt> || assignment || <tt>=</tt>
|-
| <tt>Op_and</tt> || logical and || <tt>&amp;&amp;</tt>
|-
| <tt>Op_or</tt> || logical or || <tt>&#124;&#124;</tt>
|}
 
* The <code>-</code> token should always be interpreted as <tt>Op_subtract</tt> by the lexer. Turning some <tt>Op_subtract</tt> into <tt>Op_negate</tt> will be the job of the syntax analyzer, which is not part of this task.
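As an illustration only (not part of the specification), the operator table, together with the symbol tokens listed in the next table, maps directly onto two small lookup tables in which the two-character forms are always tried first. The dictionary layout and the helper name <code>match_operator</code> below are our own choices, sketched in Python:

<syntaxhighlight lang="python"># Illustrative sketch only; the table layout and function name are ours.
TWO_CHAR_OPS = {"<=": "Op_lessequal", ">=": "Op_greaterequal", "==": "Op_equal",
                "!=": "Op_notequal", "&&": "Op_and", "||": "Op_or"}
ONE_CHAR_TOKENS = {"*": "Op_multiply", "/": "Op_divide", "%": "Op_mod",
                   "+": "Op_add", "-": "Op_subtract", "<": "Op_less",
                   ">": "Op_greater", "=": "Op_assign", "!": "Op_not",
                   "(": "LeftParen", ")": "RightParen", "{": "LeftBrace",
                   "}": "RightBrace", ";": "Semicolon", ",": "Comma"}

def match_operator(src, i):
    """Return (token_name, next_index) for the operator or symbol at src[i],
    preferring the two-character form so that '<=' is never split into '<' '='."""
    pair = src[i:i + 2]
    if pair in TWO_CHAR_OPS:
        return TWO_CHAR_OPS[pair], i + 2
    if src[i] in ONE_CHAR_TOKENS:
        return ONE_CHAR_TOKENS[src[i]], i + 1
    # A lone '&' or '|' (or any other stray character) is a lexical error.
    raise SyntaxError(f"unrecognized character {src[i]!r}")

assert match_operator("<=1", 0) == ("Op_lessequal", 2)
assert match_operator("-5", 0) == ("Op_subtract", 1)   # always Op_subtract, never Op_negate</syntaxhighlight>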
 
;Symbols
 
:::{| class="wikitable"
|-
! Name !! Common name !! Character
|-
| <tt>LeftParen</tt> || left parenthesis || <tt>(</tt>
|-
| <tt>RightParen</tt> || right parenthesis || <tt>)</tt>
|-
| <tt>LeftBrace</tt> || left brace || <tt>{</tt>
|-
| <tt>RightBrace</tt> || right brace || <tt>}</tt>
|-
| <tt>Semicolon</tt> || semi-colon || <tt>&#59;</tt>
|-
| <tt>Comma</tt> || comma || <tt>,</tt>
|}
 
;Keywords
 
:::{| class="wikitable"
|-
! Name !! Character sequence
|-
| <tt>Keyword_if</tt> || <tt>if</tt>
|-
| <tt>Keyword_else</tt> || <tt>else</tt>
|-
| <tt>Keyword_while</tt> || <tt>while</tt>
|-
| <tt>Keyword_print</tt> || <tt>print</tt>
|-
| <tt>Keyword_putc</tt> || <tt>putc</tt>
|}
 
;Identifiers and literals
 
These differ from the previous tokens, in that each occurrence of them has a value associated with it.
 
:::{| class="wikitable"
|-
! Name
! Common name
! Format description
! Format regex
! Value
|-
| <tt>Identifier</tt>
| identifier
| one or more letter/number/underscore characters, but not starting with a number
| <code style="white-space:nowrap">[_a-zA-Z][_a-zA-Z0-9]*</code>
| as is
|-
| <tt>Integer</tt>
| integer literal
| one or more digits
| <code>[0-9]+</code>
| as is, interpreted as a number
|-
| <tt>Integer</tt>
| char literal
| exactly one character (anything except newline or single quote) or one of the allowed escape sequences, enclosed by single quotes
| <code><nowiki>'([^'\n]|\\n|\\\\)'</nowiki></code>
| the ASCII code point number of the character, e.g. 65 for <code>'A'</code> and 10 for <code>'\n'</code>
|-
| <tt>String</tt>
| string literal
| zero or more characters (anything except newline or double quote), enclosed by double quotes
| <code>"[^"\n]*"</code>
| the characters without the double quotes and with escape sequences converted
|}
 
Notes:
* For char and string literals, the <code>\n</code> escape sequence is supported to represent a new-line character.
* For char and string literals, to represent a backslash, use <code>\\</code>.
* No other special sequences are supported. This means that:
** Char literals cannot represent a single quote character (value 39).
** String literals cannot represent strings containing double quote characters.
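For illustration only, the four patterns above translate almost verbatim into code. The sketch below (Python, with helper names of our own choosing) classifies a scanned word as a keyword or <tt>Identifier</tt> and converts a char literal to its integer value following the rules in these notes:

<syntaxhighlight lang="python">import re

# The five keywords; anything else matching the identifier pattern is an
# Identifier token whose value is the lexeme itself.
KEYWORDS = {"if": "Keyword_if", "else": "Keyword_else", "while": "Keyword_while",
            "print": "Keyword_print", "putc": "Keyword_putc"}

IDENT_RE   = re.compile(r"[_a-zA-Z][_a-zA-Z0-9]*")
INTEGER_RE = re.compile(r"[0-9]+")
CHAR_RE    = re.compile(r"'([^'\n]|\\n|\\\\)'")
STRING_RE  = re.compile(r'"[^"\n]*"')

def classify_word(lexeme):
    if lexeme in KEYWORDS:
        return KEYWORDS[lexeme], None
    return "Identifier", lexeme

def char_value(lexeme):
    # lexeme still carries its surrounding single quotes, e.g. "'\\n'".
    body = lexeme[1:-1]
    if body == "\\n":
        return 10                  # the only escapes allowed are \n ...
    if body == "\\\\":
        return 92                  # ... and \\
    return ord(body)               # otherwise exactly one literal character

assert classify_word("ifprint") == ("Identifier", "ifprint")
assert char_value("'A'") == 65 and char_value(r"'\n'") == 10</syntaxhighlight>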
 
;Zero-width tokens

:::{| class="wikitable"
|-
! Name !! Location
|-
| <tt>End_of_input</tt> || when the end of the input stream is reached
|}

;White space

* Zero or more whitespace characters, or comments enclosed in <code>/* ... */</code>, are allowed between any two tokens, with the exceptions noted below.
* "Longest token matching" is used to resolve conflicts (e.g., in order to match '''<=''' as a single token rather than the two tokens '''<''' and '''=''').
* Whitespace is ''required'' between two tokens that have an alphanumeric character or underscore at the edge.
** This means: keywords, identifiers, and integer literals.
** e.g. <code>ifprint</code> is recognized as an identifier, instead of the keywords <tt>if</tt> and <tt>print</tt>.
** e.g. <code>42fred</code> is invalid, and neither recognized as a number nor an identifier.
* Whitespace is ''not allowed'' inside of tokens (except for chars and strings where they are part of the value).
** e.g. <code>& &</code> is invalid, and not interpreted as the <tt>&&</tt> operator.

For example, the following two program fragments are equivalent, and should produce the same token stream, except for the line and column positions:

* <syntaxhighlight lang="c">if ( p /* meaning n is prime */ ) {
    print ( n , " " ) ;
    count = count + 1 ; /* number of primes found so far */
}</syntaxhighlight>
* <syntaxhighlight lang="c">if(p){print(n," ");count=count+1;}</syntaxhighlight>
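One way to honour these rules, sketched here purely as an illustration (the function name and signature below are ours, not part of the task), is a helper that advances past blanks and <code>/* ... */</code> comments while maintaining the line and column counters that the output format needs:

<syntaxhighlight lang="python">def skip_whitespace_and_comments(src, i, line, col):
    """Advance past blanks and /* ... */ comments, keeping line/column
    counters up to date. A best-effort sketch; a real solution also has
    to report an unterminated comment as a diagnostic."""
    while i < len(src):
        ch = src[i]
        if ch == "\n":
            i, line, col = i + 1, line + 1, 1
        elif ch in " \t\r":
            i, col = i + 1, col + 1
        elif src.startswith("/*", i):
            end = src.find("*/", i + 2)
            if end < 0:
                raise SyntaxError(f"({line}, {col}) error: end-of-file in comment")
            # Re-count lines/columns across the skipped comment text.
            skipped = src[i:end + 2]
            if "\n" in skipped:
                line += skipped.count("\n")
                col = len(skipped) - skipped.rfind("\n")
            else:
                col += len(skipped)
            i = end + 2
        else:
            break
    return i, line, col</syntaxhighlight>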
 
;Complete list of token names
 
<pre>
End_of_input    Op_multiply     Op_divide       Op_mod          Op_add          Op_subtract
Op_negate       Op_not          Op_less         Op_lessequal    Op_greater      Op_greaterequal
Op_equal        Op_notequal     Op_assign       Op_and          Op_or           Keyword_if
Keyword_else    Keyword_while   Keyword_print   Keyword_putc    LeftParen       RightParen
LeftBrace       RightBrace      Semicolon       Comma           Identifier      Integer
String
</pre>
 
{{task heading|Output Format}}
 
The program output should be a sequence of lines, each consisting of the following whitespace-separated fields:
 
# the line number where the token starts
# the column number where the token starts
# the token name
# the token value (only for <tt>Identifier</tt>, <tt>Integer</tt>, and <tt>String</tt> tokens)
# the number of spaces between fields is up to you. Neatly aligned is nice, but not a requirement.
<br>
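Purely as an illustration of this format (the field widths below are arbitrary, as the task allows any spacing), a token could be rendered like this; the helper name is ours, and the value is passed in its source form so that strings print with their quotes and escape sequences:

<syntaxhighlight lang="python">def format_token(line, col, name, value=None):
    # Only Identifier, Integer and String tokens carry a value.
    head = f"{line:5} {col:7} {name}"
    return head if value is None else f"{head:30} {value}"

print(format_token(4, 1, "Keyword_print"))
print(format_token(4, 7, "String", '"Hello, World!\\n"'))</syntaxhighlight>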
 
This task is intended to be used as part of a pipeline, with the other compiler tasks - for example:
<br><b>lex < hello.t | parse | gen | vm</b>
 
Or possibly:
<br><b>lex hello.t lex.out</b>
<br><b>parse lex.out parse.out</b>
<br><b>gen parse.out gen.out</b>
<br><b>vm gen.out</b>
 
<br>
This implies that the output of this task (the lexical analyzer) should be suitable as input to any of the [[Compiler/syntax_analyzer|Syntax Analyzer task]] programs.
 
{{task heading|Diagnostics}}
 
The following error conditions should be caught:
 
:::{| class="wikitable"
|-
! Error
! Example
|-
| Unrecognized character.
| <code>&#124;</code>
|-
| Invalid number. Starts like a number, but ends in non-numeric characters.
| <code>123abc</code>
|}
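For example (an illustrative helper only, with a message format of our own choosing), the ''invalid number'' case can be caught as soon as a run of digits is followed directly by a letter or underscore:

<syntaxhighlight lang="python">def scan_integer(src, i, line, col):
    # Collect a run of digits; if it runs straight into a letter or '_',
    # report the "invalid number" diagnostic instead of returning a token.
    j = i
    while j < len(src) and src[j].isdigit():
        j += 1
    if j < len(src) and (src[j].isalpha() or src[j] == "_"):
        raise SyntaxError(f"({line}, {col}) error: invalid number")
    return ("Integer", src[i:j], line, col), j

assert scan_integer("42;", 0, 1, 1)[0] == ("Integer", "42", 1, 1)</syntaxhighlight>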
 
{{task heading|Test Cases}}

{| class="wikitable"
|-
! Input
! Output
|-
| style="vertical-align:top" |
Test Case 1:
<syntaxhighlight lang="c">/*
Hello world
*/
print("Hello, World!\n");</syntaxhighlight>
 
| style="vertical-align:top" |
<b><pre>
4 1 Keyword_print
4 6 LeftParen
4 7 String "Hello, World!\n"
4 24 RightParen
4 25 Semicolon
5 1 End_of_input
</pre></b>
 
|-
| style="vertical-align:top" |
Test Case 2:
<syntaxhighlight lang="c">/*
Show Ident and Integers
*/
phoenix_number = 142857;
print(phoenix_number, "\n");</syntaxhighlight>
 
| style="vertical-align:top" |
<b><pre>
4 1 Identifier phoenix_number
4 16 Op_assign
4 18 Integer 142857
4 24 Semicolon
5 1 Keyword_print
5 6 LeftParen
5 7 Identifier phoenix_number
5 21 Comma
5 23 String "\n"
5 27 RightParen
5 28 Semicolon
6 1 End_of_input
</pre></b>
 
|-
| style="vertical-align:top" |
Test Case 3:
<syntaxhighlight lang="c">/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
*/
/* Print   */  print    /* Sub     */  -
/* Putc    */  putc     /* Lss     */  <
/* If      */  if       /* Gtr     */  >
/* Else    */  else     /* Leq     */  <=
/* While   */  while    /* Geq     */  >=
/* Lbrace  */  {        /* Eq      */  ==
/* Rbrace  */  }        /* Neq     */  !=
/* Lparen  */  (        /* And     */  &&
/* Rparen  */  )        /* Or      */  ||
/* Uminus  */  -        /* Semi    */  ;
/* Not     */  !        /* Comma   */  ,
/* Mul     */  *        /* Assign  */  =
/* Div     */  /        /* Integer */  42
/* Mod     */  %        /* String  */  "String literal"
/* Add     */  +        /* Ident   */  variable_name
/* character literal */  '\n'
/* character literal */  '\\'
/* character literal */  ' '</syntaxhighlight>
 
| style="vertical-align:top" |
<b><pre>
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input
</pre></b>
 
|-
| style="vertical-align:top" |
Test Case 4:
<syntaxhighlight lang="c">/*** test printing, embedded \n and comments with lots of '*' ***/
print(42);
print("\nHello World\nGood Bye\nok\n");
print("Print a slash n - \\n.\n");</syntaxhighlight>
 
| style="vertical-align:top" |
<b><pre>
2 1 Keyword_print
2 6 LeftParen
2 7 Integer 42
2 9 RightParen
2 10 Semicolon
3 1 Keyword_print
3 6 LeftParen
3 7 String "\nHello World\nGood Bye\nok\n"
3 38 RightParen
3 39 Semicolon
4 1 Keyword_print
4 6 LeftParen
4 7 String "Print a slash n - \\n.\n"
4 33 RightParen
4 34 Semicolon
5 1 End_of_input
</pre></b>
 
|}
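Since a solution is judged by comparing its output with the listings above, a tiny driver along the following lines can help. This is only a sketch: the <code>./lex</code> command name and the file layout are placeholders, not something prescribed by the task, and the comparison ignores the amount of padding between fields:

<syntaxhighlight lang="python">import subprocess, sys

def run_case(lexer_cmd, source_path, expected_path):
    # Run the lexer on one test case and compare whitespace-normalised
    # token listings line by line.
    result = subprocess.run(lexer_cmd, stdin=open(source_path),
                            capture_output=True, text=True, check=True)
    got      = [line.split() for line in result.stdout.splitlines() if line.strip()]
    expected = [line.split() for line in open(expected_path) if line.strip()]
    return got == expected

if __name__ == "__main__":
    print("PASS" if run_case(["./lex"], sys.argv[1], sys.argv[2]) else "FAIL")</syntaxhighlight>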
 
;Additional examples
Your solution should pass all the test cases above and the additional tests found '''[[Compiler/Sample_programs|Here]]'''.

{{task heading|Reference}}
The C and Python versions can be considered reference implementations.
 
 
;Related Tasks
* [[Compiler/syntax_analyzer|Syntax Analyzer task]]
* [[Compiler/code_generator|Code Generator task]]
* [[Compiler/virtual_machine_interpreter|Virtual Machine Interpreter task]]
* [[Compiler/AST_interpreter|AST Interpreter task]]
<hr>
<br><br>
__TOC__
 
=={{header|Ada}}==
<syntaxhighlight lang="ada">with Ada.Text_IO, Ada.Streams.Stream_IO, Ada.Strings.Unbounded, Ada.Command_Line,
Ada.Exceptions;
use Ada.Strings, Ada.Strings.Unbounded, Ada.Streams, Ada.Exceptions;
 
procedure Main is
package IO renames Ada.Text_IO;
 
package Lexer is
type Token is (Op_multiply, Op_divide, Op_mod, Op_add, Op_subtract, Op_negate,
Op_less, Op_lessequal, Op_greater, Op_greaterequal, Op_equal,
Op_notequal, Op_not, Op_assign, Op_and, Op_or,
 
LeftParen, RightParen, LeftBrace, RightBrace, Semicolon, Comma,
 
Keyword_if, Keyword_else, Keyword_while, Keyword_print, Keyword_putc,
Identifier, Token_Integer, Token_String, End_of_input,
 
Empty_Char_Error, Invalid_Escape_Error, Multi_Char_Error, EOF_Comment_Error,
EOF_String_Error, EOL_String_Error, Invalid_Char_Error, Invalid_Num_Error
);
 
subtype Operator is Token range Op_multiply .. Op_or;
subtype Symbol is Token range Token'Succ(Operator'Last) .. Comma;
subtype Keyword is Token range Token'Succ(Symbol'Last) .. Keyword_putc;
subtype Error is Token range Empty_Char_Error .. Invalid_Num_Error;
subtype Operator_or_Error is Token
with Static_Predicate => Operator_or_Error in Operator | Error;
 
subtype Whitespace is Character
with Static_Predicate => Whitespace in ' ' | ASCII.HT | ASCII.CR | ASCII.LF;
 
Lexer_Error : exception;
Invalid_Escape_Code : constant Character := ASCII.NUL;
 
procedure run(input : Stream_IO.File_Type);
end Lexer;
 
package body Lexer is
use type Stream_IO.Count;
 
procedure run(input : Stream_IO.File_Type) is
type State is (State_Start, State_Identifier, State_Integer, State_Char, State_String,
State_Comment);
curr_state : State := State_Start;
curr_char : Character;
curr_col, curr_row, token_col, token_row : Positive := 1;
token_text : Unbounded_String := Unbounded.Null_Unbounded_String;
 
function look_ahead return Character is
next_char : Character := ASCII.LF;
begin
if not Stream_IO.End_Of_File(input) then
next_char := Character'Input(Stream_IO.Stream(input));
Stream_IO.Set_Index(input, Stream_IO.Index(input) - 1);
end if;
return next_char;
end look_ahead;
 
procedure next_char is
next : Character := Character'Input(Stream_IO.Stream(input));
begin
curr_col := curr_col + 1;
if curr_char = ASCII.LF then
curr_row := curr_row + 1;
curr_col := 1;
end if;
curr_char := next;
end next_char;
 
procedure print_token(tok : Token; text : String := "") is
procedure raise_error(text : String) is
begin
raise Lexer_Error with "Error: " & text;
end;
begin
IO.Put(token_row'Image & ASCII.HT & token_col'Image & ASCII.HT);
case tok is
when Operator | Symbol | Keyword | End_of_input => IO.Put_Line(tok'Image);
when Token_Integer => IO.Put_Line("INTEGER" & ASCII.HT & text);
when Token_String => IO.Put_Line("STRING" & ASCII.HT & ASCII.Quotation & text & ASCII.Quotation);
when Identifier => IO.Put_Line(tok'Image & ASCII.HT & text);
when Empty_Char_Error => raise_error("empty character constant");
when Invalid_Escape_Error => raise_error("unknown escape sequence: " & text);
when Multi_Char_Error => raise_error("multi-character constant: " & text);
when EOF_Comment_Error => raise_error("EOF in comment");
when EOF_String_Error => raise_error("EOF in string");
when EOL_String_Error => raise_error("EOL in string");
when Invalid_Char_Error => raise_error("invalid character: " & curr_char);
when Invalid_Num_Error => raise_error("invalid number: " & text);
end case;
end print_token;
 
procedure lookahead_choose(determiner : Character; a, b : Operator_or_Error) is
begin
if look_ahead = determiner then
print_token(a);
next_char;
else
print_token(b);
end if;
end lookahead_choose;
 
function to_escape_code(c : Character) return Character is
begin
case c is
when 'n' => return ASCII.LF;
when '\' => return '\';
when others =>
print_token(Invalid_Escape_Error, ASCII.Back_Slash & c);
return Invalid_Escape_Code;
end case;
end to_escape_code;
begin
curr_char := Character'Input(Stream_IO.Stream(input));
loop
case curr_state is
when State_Start =>
token_col := curr_col;
token_row := curr_row;
case curr_char is
when '*' => print_token(Op_multiply);
when '/' =>
if look_ahead = '*' then
next_char;
curr_state := State_Comment;
else
print_token(Op_divide);
end if;
when '%' => print_token(Op_mod);
when '+' => print_token(Op_add);
when '-' => print_token(Op_subtract);
when '(' => print_token(LeftParen);
when ')' => print_token(RightParen);
when '{' => print_token(LeftBrace);
when '}' => print_token(RightBrace);
when ';' => print_token(Semicolon);
when ',' => print_token(Comma);
when '<' => lookahead_choose('=', Op_lessequal, Op_less);
when '>' => lookahead_choose('=', Op_greaterequal, Op_greater);
when '!' => lookahead_choose('=', Op_notequal, Op_not);
when '=' => lookahead_choose('=', Op_equal, Op_assign);
when '&' => lookahead_choose('&', Op_and, Invalid_Char_Error);
when '|' => lookahead_choose('|', Op_or, Invalid_Char_Error);
when 'a' .. 'z' | 'A' .. 'Z' | '_' =>
Unbounded.Append(token_text, curr_char);
curr_state := State_Identifier;
when '0' .. '9' =>
Unbounded.Append(token_text, curr_char);
curr_state := State_Integer;
when ''' => curr_state := State_Char;
when ASCII.Quotation => curr_state := State_String;
when Whitespace => null;
when others => null;
end case;
next_char;
 
when State_Identifier =>
case curr_char is
when 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '_' =>
Unbounded.Append(token_text, curr_char);
next_char;
when others =>
if token_text = "if" then
print_token(Keyword_if);
elsif token_text = "else" then
print_token(Keyword_else);
elsif token_text = "while" then
print_token(Keyword_while);
elsif token_text = "print" then
print_token(Keyword_print);
elsif token_text = "putc" then
print_token(Keyword_putc);
else
print_token(Identifier, To_String(token_text));
end if;
Unbounded.Set_Unbounded_String(token_text, "");
curr_state := State_Start;
end case;
 
when State_Integer =>
case curr_char is
when '0' .. '9' =>
Unbounded.Append(token_text, curr_char);
next_char;
when 'a' .. 'z' | 'A' .. 'Z' | '_' =>
print_token(Invalid_Num_Error, To_String(token_text));
when others =>
print_token(Token_Integer, To_String(token_text));
Unbounded.Set_Unbounded_String(token_text, "");
curr_state := State_Start;
end case;
 
when State_Char =>
case curr_char is
when ''' =>
if Unbounded.Length(token_text) = 0 then
print_token(Empty_Char_Error);
elsif Unbounded.Length(token_text) = 1 then
print_token(Token_Integer, Character'Pos(Element(token_text, 1))'Image);
else
print_token(Multi_Char_Error, To_String(token_text));
end if;
Set_Unbounded_String(token_text, "");
curr_state := State_Start;
when '\' =>
Unbounded.Append(token_text, to_escape_code(look_ahead));
next_char;
when others => Unbounded.Append(token_text, curr_char);
end case;
next_char;
 
when State_String =>
case curr_char is
when ASCII.Quotation =>
print_token(Token_String, To_String(token_text));
Set_Unbounded_String(token_text, "");
curr_state := State_Start;
when '\' =>
if to_escape_code(look_ahead) /= Invalid_Escape_Code then
Unbounded.Append(token_text, curr_char);
end if;
when ASCII.LF | ASCII.CR => print_token(EOL_String_Error);
when others => Unbounded.Append(token_text, curr_char);
end case;
next_char;
 
when State_Comment =>
case curr_char is
when '*' =>
if look_ahead = '/' then
next_char;
curr_state := State_Start;
end if;
when others => null;
end case;
next_char;
end case;
end loop;
exception
when error : Stream_IO.End_Error =>
if curr_state = State_String then
print_token(EOF_String_Error);
else
print_token(End_of_input);
end if;
when error : Lexer.Lexer_Error => IO.Put_Line(Exception_Message(error));
end run;
end Lexer;
 
source_file : Stream_IO.File_Type;
begin
if Ada.Command_Line.Argument_Count < 1 then
IO.Put_Line("usage: lex [filename]");
return;
end if;
Stream_IO.Open(source_file, Stream_IO.In_File, Ada.Command_Line.Argument(1));
Lexer.run(source_file);
exception
when error : others => IO.Put_Line("Error: " & Exception_Message(error));
end Main;
</syntaxhighlight>
{{out}} Test case 3:
<pre>
5 16 KEYWORD_PRINT
5 40 OP_SUBTRACT
6 16 KEYWORD_PUTC
6 40 OP_LESS
7 16 KEYWORD_IF
7 40 OP_GREATER
8 16 KEYWORD_ELSE
8 40 OP_LESSEQUAL
9 16 KEYWORD_WHILE
9 40 OP_GREATEREQUAL
10 16 LEFTBRACE
10 40 OP_EQUAL
11 16 RIGHTBRACE
11 40 OP_NOTEQUAL
12 16 LEFTPAREN
12 40 OP_AND
13 16 RIGHTPAREN
13 40 OP_OR
14 16 OP_SUBTRACT
14 40 SEMICOLON
15 16 OP_NOT
15 40 COMMA
16 16 OP_MULTIPLY
16 40 OP_ASSIGN
17 16 OP_DIVIDE
17 40 INTEGER 42
18 16 OP_MOD
18 40 STRING "String literal"
19 16 OP_ADD
19 40 IDENTIFIER variable_name
20 26 INTEGER 10
21 26 INTEGER 92
22 26 INTEGER 32
23 1 END_OF_INPUT
</pre>
 
=={{header|ALGOL 68}}==
This is a simple ''token in, line out'' program. It doesn't keep an internal representation of tokens or anything like that, since that's not needed at all.
 
As an addition, it emits a diagnostic if integer literals are too big.
<syntaxhighlight lang="algol68">BEGIN
# implement C-like getchar, where EOF and EOLn are "characters" (-1 and 10 resp.). #
INT eof = -1, eoln = 10;
BOOL eof flag := FALSE;
STRING buf := "";
INT col := 1;
INT line := 0;
on logical file end (stand in, (REF FILE f)BOOL: eof flag := TRUE);
PROC getchar = INT:
IF eof flag THEN eof
ELIF col = UPB buf THEN col +:= 1; eoln
ELIF col > UPB buf THEN IF line > 0 THEN read(newline) FI;
line +:= 1;
read(buf);
IF eof flag THEN col := 1; eof
ELSE col := 0; getchar
FI
ELSE col +:= 1; ABS buf[col]
FI;
PROC nextchar = INT: IF eof flag THEN eof ELIF col >= UPB buf THEN eoln ELSE ABS buf[col+1] FI;
 
PROC is blank = (INT ch) BOOL: ch = 0 OR ch = 9 OR ch = 10 OR ch = 13 OR ch = ABS " ";
PROC is digit = (INT ch) BOOL: ch >= ABS "0" AND ch <= ABS "9";
PROC is ident start = (INT ch) BOOL: ch >= ABS "A" AND ch <= ABS "Z" OR
ch >= ABS "a" AND ch <= ABS "z" OR
ch = ABS "_";
PROC is ident = (INT ch) BOOL: is ident start(ch) OR is digit(ch);
 
PROC ident or keyword = (INT start char) VOID:
BEGIN
STRING w := REPR start char;
INT start col = col;
WHILE is ident (next char) DO w +:= REPR getchar OD;
IF w = "if" THEN output2("Keyword_if", start col)
ELIF w = "else" THEN output2("Keyword_else", start col)
ELIF w = "while" THEN output2("Keyword_while", start col)
ELIF w = "print" THEN output2("Keyword_print", start col)
ELIF w = "putc" THEN output2("Keyword_putc", start col)
ELSE output2("Identifier " + w, start col)
FI
END;
PROC char = VOID:
BEGIN
INT start col = col;
INT ch := getchar;
IF ch = ABS "'" THEN error("Empty character constant")
ELIF ch = ABS "\" THEN ch := getchar;
IF ch = ABS "n" THEN ch := 10
ELIF ch = ABS "\" THEN SKIP
ELSE error("Unknown escape sequence. \" + REPR ch)
FI
FI;
IF nextchar /= ABS "'" THEN error("Multi-character constant.") FI;
getchar;
output2("Integer " + whole(ch, 0), start col)
END;
PROC string = VOID:
BEGIN
INT start col = col;
STRING s := """";
WHILE INT ch := getchar; ch /= ABS """"
DO
IF ch = eoln THEN error("End-of-line while scanning string literal. Closing string character not found before end-of-line.")
ELIF ch = eof THEN error("End-of-file while scanning string literal. Closing string character not found.")
ELIF ch = ABS "\" THEN s +:= REPR ch; ch := getchar;
IF ch /= ABS "\" AND ch /= ABS "n" THEN error("Unknown escape sequence. \" + REPR ch) FI;
s +:= REPR ch
ELSE s +:= REPR ch
FI
OD;
output2("String " + s + """", start col)
END;
PROC comment = VOID:
BEGIN
WHILE INT ch := getchar; NOT (ch = ABS "*" AND nextchar = ABS "/")
DO IF ch = eof THEN error("End-of-file in comment. Closing comment characters not found.") FI
OD;
getchar
END;
PROC number = (INT first digit) VOID:
BEGIN
INT start col = col;
INT n := first digit - ABS "0";
WHILE is digit (nextchar) DO
INT u := getchar - ABS "0";
IF LENG n * 10 + LENG u > max int THEN error("Integer too big") FI;
n := n * 10 + u
OD;
IF is ident start (nextchar) THEN error("Invalid number. Starts like a number, but ends in non-numeric characters.") FI;
output2("Integer " + whole(n, 0), start col)
END;
 
PROC output = (STRING s) VOID: output2(s, col);
PROC output2 = (STRING s, INT col) VOID: print((whole(line,-8), whole(col,-8), " ", s, newline));
 
PROC if follows = (CHAR second, STRING longer, shorter) VOID:
IF nextchar = ABS second
THEN output(longer); getchar
ELSE output(shorter)
FI;
PROC error = (STRING s)VOID: (put(stand error, ("At ", whole(line,0), ":", whole(col,0), " ", s, new line)); stop);
PROC unrecognized = (INT char) VOID: error("Unrecognized character " + REPR char);
PROC double char = (INT first, STRING op) VOID:
IF nextchar /= first THEN unrecognized(first)
ELSE output2(op, col-1); getchar
FI;
 
WHILE INT ch := getchar; ch /= eof
DO
IF is blank(ch) THEN SKIP
ELIF ch = ABS "(" THEN output("LeftParen")
ELIF ch = ABS ")" THEN output("RightParen")
ELIF ch = ABS "{" THEN output("LeftBrace")
ELIF ch = ABS "}" THEN output("RightBrace")
ELIF ch = ABS ";" THEN output("Semicolon")
ELIF ch = ABS "," THEN output("Comma")
ELIF ch = ABS "*" THEN output("Op_multiply")
ELIF ch = ABS "/" THEN IF next char = ABS "*" THEN comment
ELSE output("Op_divide")
FI
ELIF ch = ABS "%" THEN output("Op_mod")
ELIF ch = ABS "+" THEN output("Op_add")
ELIF ch = ABS "-" THEN output("Op_subtract")
ELIF ch = ABS "<" THEN if follows("=", "Op_lessequal", "Op_less")
ELIF ch = ABS ">" THEN if follows("=", "Op_greaterequal", "Op_greater")
ELIF ch = ABS "=" THEN if follows("=", "Op_equal", "Op_assign")
ELIF ch = ABS "!" THEN if follows("=", "Op_notequal", "Op_not")
ELIF ch = ABS "&" THEN double char(ch, "Op_and")
ELIF ch = ABS "|" THEN double char(ch, "Op_or")
ELIF is ident start (ch) THEN ident or keyword (ch)
ELIF ch = ABS """" THEN string
ELIF ch = ABS "'" THEN char
ELIF is digit(ch) THEN number(ch)
ELSE unrecognized(ch)
FI
OD;
output("End_Of_Input")
END</syntaxhighlight>
 
=={{header|ALGOL W}}==
<syntaxhighlight lang="algolw">begin
%lexical analyser %
% Algol W strings are limited to 256 characters in length so we limit source lines %
% and tokens to 256 characters %
 
integer lineNumber, columnNumber;
string(256) line;
string(256) tkValue;
integer tkType, tkLine, tkColumn, tkLength, tkIntegerValue;
logical tkTooLong;
string(1) currChar;
string(1) newlineChar;
 
integer LINE_WIDTH, MAX_TOKEN_LENGTH, MAXINTEGER_OVER_10, MAXINTEGER_MOD_10;
integer tOp_multiply , tOp_divide , tOp_mod , tOp_add
, tOp_subtract , tOp_negate , tOp_less , tOp_lessequal
, tOp_greater , tOp_greaterequal , tOp_equal , tOp_notequal
, tOp_not , tOp_assign , tOp_and , tOp_or
, tLeftParen , tRightParen , tLeftBrace , tRightBrace
, tSemicolon , tComma , tKeyword_if , tKeyword_else
, tKeyword_while , tKeyword_print , tKeyword_putc , tIdentifier
, tInteger , tString , tEnd_of_input , tComment
;
 
string(16) array tkName ( 1 :: 32 );
 
% reports an error %
procedure lexError( string(80) value message ); begin
integer errorPos;
write( i_w := 1, s_w := 0, "**** Error at(", lineNumber, ",", columnNumber, "): " );
errorPos := 0;
while errorPos < 80 and message( errorPos // 1 ) not = "." do begin
writeon( s_w := 0, message( errorPos // 1 ) );
errorPos := errorPos + 1
end while_not_at_end_of_message ;
writeon( s_w := 0, "." )
end lexError ;
 
% gets the next source character %
procedure nextChar ; begin
if columnNumber = LINE_WIDTH then begin
currChar := newlineChar;
columnNumber := columnNumber + 1
end
else if columnNumber > LINE_WIDTH then begin
readcard( line );
columnNumber := 1;
if not XCPNOTED(ENDFILE) then lineNumber := lineNumber + 1;
currChar := line( 0 // 1 )
end
else begin
currChar := line( columnNumber // 1 );
columnNumber := columnNumber + 1
end
end nextChar ;
 
% gets the next token, returns the token type %
integer procedure nextToken ; begin
 
% returns true if currChar is in the inclusive range lowerValue to upperValue %
% false otherwise %
logical procedure range( string(1) value lowerValue, upperValue ) ; begin
currChar >= lowerValue and currChar <= upperValue
end range ;
 
% returns true if the current character can start an identifier, false otherwise %
logical procedure identifierStartChar ; begin
currChar = "_" or range( "a", "z" ) or range( "A", "Z" )
end identifierStartChar ;
 
% add the current character to the token and get the next %
procedure addAndNextChar ; begin
if tkLength >= MAX_TOKEN_LENGTH then tkTooLong := true
else begin
tkValue( tkLength // 1 ) := currChar;
tkLength := tkLength + 1
end if_symbol_not_too_long ;
nextChar
end % addAndNextChar % ;
 
% handle a single character token %
procedure singleCharToken( integer value tokenType ) ; begin
tkType := tokenType;
nextChar
end singleCharToken ;
 
% handle a doubled character token: && or || %
procedure doubleCharToken( integer value tokenType ) ; begin
string(1) firstChar;
firstChar := currChar;
tkType := tokenType;
nextChar;
if currChar = firstChar then nextChar
else % the character wasn't doubled % lexError( "Unrecognised character." );
end doubleCharToken ;
 
% handle an operator or operator= token %
procedure opOrOpEqual( integer value opToken, opEqualToken ) ; begin
tkType := opToken;
nextChar;
if currChar = "=" then begin
% have operator= %
tkType := opEqualToken;
nextChar
end if_currChar_is_equal ;
end opOrOpEqual ;
 
% handle a / operator or /* comment %
procedure divideOrComment ; begin
tkType := tOp_divide;
nextChar;
if currChar = "*" then begin
% have a comment %
logical moreComment;
tkType := tComment;
moreComment := true;
while moreComment do begin
nextChar;
while currChar not = "*" and not XCPNOTED(ENDFILE) do nextChar;
while currChar = "*" and not XCPNOTED(ENDFILE) do nextChar;
moreComment := ( currChar not = "/" and not XCPNOTED(ENDFILE) )
end while_more_comment ;
if not XCPNOTED(ENDFILE)
then nextChar
else lexError( "End-of-file in comment." )
end if_currChar_is_star ;
end divideOrComment ;
 
% handle an identifier or keyword %
procedure identifierOrKeyword ; begin
tkType := tIdentifier;
while identifierStartChar or range( "0", "9" ) do addAndNextChar;
% there are only 5 keywords, so we just test each in turn here %
if tkValue = "if" then tkType := tKeyword_if
else if tkValue = "else" then tkType := tKeyword_else
else if tkValue = "while" then tkType := tKeyword_while
else if tkValue = "print" then tkType := tKeyword_print
else if tkValue = "putc" then tkType := tKeyword_putc;
if tkType not = tIdentifier then tkValue := "";
end identifierOrKeyword ;
 
% handle an integer literal %
procedure integerLiteral ; begin
logical overflowed;
integer digit;
overflowed := false;
tkType := tInteger;
while range( "0", "9" ) do begin
digit := ( decode( currChar ) - decode( "0" ) );
if tkIntegerValue > MAXINTEGER_OVER_10 then overflowed := true
else if tkIntegerValue = MAXINTEGER_OVER_10
and digit > MAXINTEGER_MOD_10 then overflowed := true
else begin
tkIntegerValue := tkIntegerValue * 10;
tkIntegerValue := tkIntegerValue + digit;
end;
nextChar
end while_have_a_digit ;
if overflowed then lexError( "Number too large." );
if identifierStartChar then lexError( "Number followed by letter or underscore." );
end integerLiteral ;
 
% handle a char literal %
procedure charLiteral ; begin
nextChar;
if currChar = "'" or currChar = newlineChar then lexError( "Invalid character constant." )
else if currChar = "\" then begin
% have an escape %
nextChar;
if currChar = "n" then currChar := newlineChar
else if currChar not = "\" then lexError( "Unknown escape sequence." )
end;
tkType := tInteger;
tkIntegerValue := decode( currChar );
% should have a closing quote next %
nextChar;
if currChar not = "'"
then lexError( "Multi-character constant." )
else nextChar
end charLiteral ;
 
% handle a string literal %
procedure stringLiteral ; begin
tkType := tString;
tkValue( 0 // 1 ) := currChar;
tkLength := 1;
nextChar;
while currChar not = """" and currChar not = newlineChar and not XCPNOTED(ENDFILE) do addAndNextChar;
if currChar = newlineChar then lexError( "End-of-line while scanning string literal." )
else if XCPNOTED(ENDFILE) then lexError( "End-of-file while scanning string literal." )
else % currChar must be """" % addAndNextChar
end stringLiteral ;
 
while begin
% skip white space %
while ( currChar = " " or currChar = newlineChar ) and not XCPNOTED(ENDFILE) do nextChar;
% get the token %
tkLine := lineNumber;
tkColumn := columnNumber;
tkValue := "";
tkLength := 0;
tkIntegerValue := 0;
tkTooLong := false;
if XCPNOTED(ENDFILE) then tkType := tEnd_of_input
else if currChar = "*" then singleCharToken( tOp_multiply )
else if currChar = "/" then divideOrComment
else if currChar = "%" then singleCharToken( tOp_mod )
else if currChar = "+" then singleCharToken( tOp_add )
else if currChar = "-" then singleCharToken( tOp_subtract )
else if currChar = "<" then opOrOpEqual( tOp_less, tOp_lessequal )
else if currChar = ">" then opOrOpEqual( tOp_greater, tOp_greaterequal )
else if currChar = "=" then opOrOpEqual( tOp_assign, tOp_equal )
else if currChar = "!" then opOrOpEqual( tOp_not, tOp_notequal )
else if currChar = "&" then doubleCharToken( tOp_and )
else if currChar = "|" then doubleCharToken( tOp_or )
else if currChar = "(" then singleCharToken( tLeftParen )
else if currChar = ")" then singleCharToken( tRightParen )
else if currChar = "{" then singleCharToken( tLeftBrace )
else if currChar = "}" then singleCharToken( tRightBrace )
else if currChar = ";" then singleCharToken( tSemicolon )
else if currChar = "," then singleCharToken( tComma )
else if identifierStartChar then identifierOrKeyword
else if range( "0", "9" ) then integerLiteral
else if currChar = "'" then charLiteral
else if currChar = """" then stringLiteral
else begin
lexError( "Unrecognised character." );
singleCharToken( tComment )
end ;
% continue until we get something other than a comment %
tkType = tComment
end do begin end;
if tkTooLong then if tkType = tString
then lexError( "String literal too long." )
else lexError( "Identifier too long." );
tkType
end nextToken ;
 
% outputs the current token %
procedure writeToken ; begin
write( i_w := 5, s_w := 2, tkLine, tkColumn, tkName( tkType ) );
if tkType = tInteger then writeon( i_w := 11, tkIntegerValue )
else if tkLength > 0 then begin
writeon( " " );
for tkPos := 0 until tkLength - 1 do writeon( s_w := 0, tkValue( tkPos // 1 ) );
end
end writeToken ;
 
LINE_WIDTH := 256; MAXINTEGER_MOD_10 := MAXINTEGER rem 10;
MAX_TOKEN_LENGTH := 256; MAXINTEGER_OVER_10 := MAXINTEGER div 10;
newlineChar := code( 10 );
tOp_multiply := 1; tkName( tOp_multiply ) := "Op_multiply";
tOp_divide := 2; tkName( tOp_divide ) := "Op_divide";
tOp_mod := 3; tkName( tOp_mod ) := "Op_mod";
tOp_add := 4; tkName( tOp_add ) := "Op_add";
tOp_subtract := 5; tkName( tOp_subtract ) := "Op_subtract";
tOp_negate := 6; tkName( tOp_negate ) := "Op_negate";
tOp_less := 7; tkName( tOp_less ) := "Op_less";
tOp_lessequal := 8; tkName( tOp_lessequal ) := "Op_lessequal";
tOp_greater := 9; tkName( tOp_greater ) := "Op_greater";
tOp_greaterequal := 10; tkName( tOp_greaterequal ) := "Op_greaterequal";
tOp_equal := 11; tkName( tOp_equal ) := "Op_equal";
tOp_notequal := 12; tkName( tOp_notequal ) := "Op_notequal";
tOp_not := 13; tkName( tOp_not ) := "Op_not";
tOp_assign := 14; tkName( tOp_assign ) := "Op_assign";
tOp_and := 15; tkName( tOp_and ) := "Op_and";
tOp_or := 16; tkName( tOp_or ) := "Op_or";
tLeftParen := 17; tkName( tLeftParen ) := "LeftParen";
tRightParen := 18; tkName( tRightParen ) := "RightParen";
tLeftBrace := 19; tkName( tLeftBrace ) := "LeftBrace";
tRightBrace := 20; tkName( tRightBrace ) := "RightBrace";
tSemicolon := 21; tkName( tSemicolon ) := "Semicolon";
tComma := 22; tkName( tComma ) := "Comma";
tKeyword_if := 23; tkName( tKeyword_if ) := "Keyword_if";
tKeyword_else := 24; tkName( tKeyword_else ) := "Keyword_else";
tKeyword_while := 25; tkName( tKeyword_while ) := "Keyword_while";
tKeyword_print := 26; tkName( tKeyword_print ) := "Keyword_print";
tKeyword_putc := 27; tkName( tKeyword_putc ) := "Keyword_putc";
tIdentifier := 28; tkName( tIdentifier ) := "Identifier";
tInteger := 29; tkName( tInteger ) := "Integer";
tString := 30; tkName( tString ) := "String";
tEnd_of_input := 31; tkName( tEnd_of_input ) := "End_of_input";
tComment := 32; tkName( tComment ) := "Comment";
 
% allow the program to continue after reaching end-of-file %
ENDFILE := EXCEPTION( false, 1, 0, false, "EOF" );
% ensure the first call to nextToken reads the first line %
lineNumber := 0;
columnNumber := LINE_WIDTH + 1;
currChar := " ";
% get and print all tokens from standard input %
while nextToken not = tEnd_of_input do writeToken;
writeToken
end.</syntaxhighlight>
{{out}} Test case 3:
<pre>
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input
</pre>
 
=={{header|ATS}}==
 
One interesting feature of this implementation is my liberal use of a pushback buffer for input characters. This kept the code modular and easier to write.
 
(One point of note: the C "EOF" pseudo-character is detected in the following code by looking for a negative number. That EOF has to be negative and the other characters non-negative is implied by the ISO C standard.)
 
<syntaxhighlight lang="ats">(********************************************************************)
(* Usage: lex [INPUTFILE [OUTPUTFILE]]
If INPUTFILE or OUTPUTFILE is "-" or missing, then standard input
or standard output is used, respectively. *)
 
#define ATS_DYNLOADFLAG 0
 
#include "share/atspre_staload.hats"
staload UN = "prelude/SATS/unsafe.sats"
 
#define NIL list_nil ()
#define :: list_cons
 
%{^
/* alloca(3) is needed for ATS exceptions. */
#include <alloca.h>
%}
 
(********************************************************************)
 
#define NUM_TOKENS 31
#define RESERVED_WORD_HASHTAB_SIZE 9
 
#define TOKEN_ELSE 0
#define TOKEN_IF 1
#define TOKEN_PRINT 2
#define TOKEN_PUTC 3
#define TOKEN_WHILE 4
#define TOKEN_MULTIPLY 5
#define TOKEN_DIVIDE 6
#define TOKEN_MOD 7
#define TOKEN_ADD 8
#define TOKEN_SUBTRACT 9
#define TOKEN_NEGATE 10
#define TOKEN_LESS 11
#define TOKEN_LESSEQUAL 12
#define TOKEN_GREATER 13
#define TOKEN_GREATEREQUAL 14
#define TOKEN_EQUAL 15
#define TOKEN_NOTEQUAL 16
#define TOKEN_NOT 17
#define TOKEN_ASSIGN 18
#define TOKEN_AND 19
#define TOKEN_OR 20
#define TOKEN_LEFTPAREN 21
#define TOKEN_RIGHTPAREN 22
#define TOKEN_LEFTBRACE 23
#define TOKEN_RIGHTBRACE 24
#define TOKEN_SEMICOLON 25
#define TOKEN_COMMA 26
#define TOKEN_IDENTIFIER 27
#define TOKEN_INTEGER 28
#define TOKEN_STRING 29
#define TOKEN_END_OF_INPUT 30
 
typedef token_t =
[i : int | TOKEN_ELSE <= i; i <= TOKEN_END_OF_INPUT]
int i
typedef tokentuple_t = (token_t, String, ullint, ullint)
typedef token_names_vt = @[string][NUM_TOKENS]
 
vtypedef reserved_words_vt =
@[String][RESERVED_WORD_HASHTAB_SIZE]
vtypedef reserved_word_tokens_vt =
@[token_t][RESERVED_WORD_HASHTAB_SIZE]
 
vtypedef lookups_vt =
[p_toknames : addr]
[p_wordtab : addr]
[p_toktab : addr]
@{
pf_toknames = token_names_vt @ p_toknames,
pf_wordtab = reserved_words_vt @ p_wordtab,
pf_toktab = reserved_word_tokens_vt @ p_toktab |
toknames = ptr p_toknames,
wordtab = ptr p_wordtab,
toktab = ptr p_toktab
}
 
fn
reserved_word_lookup
(s : String,
lookups : !lookups_vt,
line_no : ullint,
column_no : ullint) : tokentuple_t =
if string_length s < 2 then
(TOKEN_IDENTIFIER, s, line_no, column_no)
else
let
macdef wordtab = !(lookups.wordtab)
macdef toktab = !(lookups.toktab)
val hashval =
g1uint_mod (g1ofg0 (char2ui s[0] + char2ui s[1]),
g1i2u RESERVED_WORD_HASHTAB_SIZE)
val token = toktab[hashval]
in
if token = TOKEN_IDENTIFIER || s <> wordtab[hashval] then
(TOKEN_IDENTIFIER, s, line_no, column_no)
else
(token, s, line_no, column_no)
end
 
(********************************************************************)
(* Input allows pushback into a buffer. *)
 
typedef ch_t =
@{
ichar = int,
line_no = ullint,
column_no = ullint
}
 
typedef inp_t (n : int) =
[0 <= n]
@{
file = FILEref,
pushback = list (ch_t, n),
line_no = ullint,
column_no = ullint
}
typedef inp_t = [n : int] inp_t n
 
fn
get_ch (inp : inp_t) : (ch_t, inp_t) =
case+ (inp.pushback) of
| NIL =>
let
val c = fileref_getc (inp.file)
val ch =
@{
ichar = c,
line_no = inp.line_no,
column_no = inp.column_no
}
in
if c = char2i '\n' then
let
val inp =
@{
file = inp.file,
pushback = inp.pushback,
line_no = succ (inp.line_no),
column_no = 1ULL
}
in
(ch, inp)
end
else
let
val inp =
@{
file = inp.file,
pushback = inp.pushback,
line_no = inp.line_no,
column_no = succ (inp.column_no)
}
in
(ch, inp)
end
end
| ch :: pushback =>
let
val inp =
@{
file = inp.file,
pushback = pushback,
line_no = inp.line_no,
column_no = inp.column_no
}
in
(ch, inp)
end
 
fn
push_back_ch (ch : ch_t,
inp : inp_t) : [n : pos] inp_t n =
let
prval _ = lemma_list_param (inp.pushback)
in
@{
file = inp.file,
pushback = ch :: (inp.pushback),
line_no = inp.line_no,
column_no = inp.column_no
}
end
 
(********************************************************************)
 
exception unterminated_comment of (ullint, ullint)
exception unterminated_character_literal of (ullint, ullint)
exception multicharacter_literal of (ullint, ullint)
exception unterminated_string_literal of (ullint, ullint, bool)
exception unsupported_escape of (ullint, ullint, int)
exception invalid_integer_literal of (ullint, ullint, String)
exception unexpected_character of (ullint, ullint, int)
 
fn
scan_comment (inp : inp_t,
line_no : ullint,
column_no : ullint) : inp_t =
let
fun
loop (inp : inp_t) : inp_t =
let
val (ch, inp) = get_ch inp
in
if (ch.ichar) < 0 then
$raise unterminated_comment (line_no, column_no)
else if (ch.ichar) = char2i '*' then
let
val (ch1, inp) = get_ch inp
in
if (ch.ichar) < 0 then
$raise unterminated_comment (line_no, column_no)
else if (ch1.ichar) = char2i '/' then
inp
else
loop inp
end
else
loop inp
end
in
loop inp
end
 
fn
skip_spaces_and_comments (inp : inp_t) : [n : pos] inp_t n =
let
fun
loop (inp : inp_t) : [n : pos] inp_t n =
let
val (ch, inp) = get_ch inp
in
if isspace (ch.ichar) then
loop inp
else if (ch.ichar) = char2i '/' then
let
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) = char2i '*' then
loop (scan_comment (inp, ch.line_no, ch.column_no))
else
let
val inp = push_back_ch (ch1, inp)
val inp = push_back_ch (ch, inp)
in
inp
end
end
else
push_back_ch (ch, inp)
end
in
loop inp
end
 
fn
reverse_list_to_string
{m : int}
(lst : list (char, m)) : string m =
let
fun
fill_array {n : nat | n <= m} .<n>.
(arr : &(@[char][m + 1]),
lst : list (char, n),
n : size_t n) : void =
case+ lst of
| NIL => ()
| c :: tail =>
begin
arr[pred n] := c;
fill_array (arr, tail, pred n)
end
 
prval _ = lemma_list_param lst
val m : size_t m = i2sz (list_length lst)
val (pf, pfgc | p) = array_ptr_alloc<char> (succ m)
val _ = array_initize_elt<char> (!p, succ m, '\0')
val _ = fill_array (!p, lst, m)
in
$UN.castvwtp0 @(pf, pfgc | p)
end
 
extern fun {}
simple_scan$pred : int -> bool
fun {}
simple_scan {u : nat}
(lst : list (char, u),
inp : inp_t) :
[m : nat]
[n : pos]
(list (char, m), inp_t n) =
let
val (ch, inp) = get_ch inp
in
if simple_scan$pred (ch.ichar) then
simple_scan<> (int2char0 (ch.ichar) :: lst, inp)
else
let
val inp = push_back_ch (ch, inp)
in
(lst, inp)
end
end
 
fn
is_ident_start (c : int) :<> bool =
isalpha (c) || c = char2i '_'
 
fn
is_ident_continuation (c : int) :<> bool =
isalnum (c) || c = char2i '_'
 
fn
scan_identifier_or_reserved_word
(inp : inp_t,
lookups : !lookups_vt) :
(tokentuple_t, [n : pos] inp_t n) =
let
val (ch, inp) = get_ch inp
val _ = assertloc (is_ident_start (ch.ichar))
 
implement simple_scan$pred<> c = is_ident_continuation c
val (lst, inp) = simple_scan (int2char0 (ch.ichar) :: NIL, inp)
 
val s = reverse_list_to_string lst
val toktup =
reserved_word_lookup (s, lookups, ch.line_no, ch.column_no)
in
(toktup, inp)
end
 
fn
scan_integer_literal
(inp : inp_t,
lookups : !lookups_vt) :
(tokentuple_t, [n : pos] inp_t n) =
let
val (ch, inp) = get_ch inp
val _ = assertloc (isdigit (ch.ichar))
 
implement simple_scan$pred<> c = is_ident_continuation c
val (lst, inp) = simple_scan (int2char0 (ch.ichar) :: NIL, inp)
 
val s = reverse_list_to_string lst
 
fun
check_they_are_all_digits
{n : nat} .<n>.
(lst : list (char, n)) : void =
case+ lst of
| NIL => ()
| c :: tail =>
if isdigit c then
check_they_are_all_digits tail
else
$raise invalid_integer_literal (ch.line_no, ch.column_no, s)
 
val _ = check_they_are_all_digits lst
in
((TOKEN_INTEGER, s, ch.line_no, ch.column_no), inp)
end
 
fn
ichar2integer_literal (c : int) : String0 =
let
var buf = @[char][100] ('\0')
val _ = $extfcall (int, "snprintf", addr@ buf, i2sz 99, "%d", c)
val s = string1_copy ($UN.castvwtp0{String0} buf)
in
strnptr2string s
end
 
fn
scan_character_literal_without_checking_end (inp : inp_t) :
(tokentuple_t, inp_t) =
let
val (ch, inp) = get_ch inp
val _ = assertloc ((ch.ichar) = '\'')
 
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) < 0 then
$raise unterminated_character_literal (ch.line_no, ch.column_no)
else if (ch1.ichar) = char2i '\\' then
let
val (ch2, inp) = get_ch inp
in
if (ch2.ichar) < 0 then
$raise unterminated_character_literal (ch.line_no,
ch.column_no)
else if (ch2.ichar) = char2i 'n' then
let
val s = ichar2integer_literal (char2i '\n')
in
((TOKEN_INTEGER, s, ch.line_no, ch.column_no), inp)
end
else if (ch2.ichar) = char2i '\\' then
let
val s = ichar2integer_literal (char2i '\\')
in
((TOKEN_INTEGER, s, ch.line_no, ch.column_no), inp)
end
else
$raise unsupported_escape (ch1.line_no, ch1.column_no,
ch2.ichar)
end
else
let
val s = ichar2integer_literal (ch1.ichar)
in
((TOKEN_INTEGER, s, ch.line_no, ch.column_no), inp)
end
end
 
fn
scan_character_literal (inp : inp_t) : (tokentuple_t, inp_t) =
let
val (tok, inp) =
scan_character_literal_without_checking_end inp
val line_no = (tok.2)
val column_no = (tok.3)
 
fun
check_end (inp : inp_t) : inp_t =
let
val (ch, inp) = get_ch inp
in
if (ch.ichar) = char2i '\'' then
inp
else
let
fun
loop_to_end (ch1 : ch_t,
inp : inp_t) : inp_t =
if (ch1.ichar) < 0 then
$raise unterminated_character_literal (line_no,
column_no)
else if (ch1.ichar) = char2i '\'' then
$raise multicharacter_literal (line_no, column_no)
else
let
val (ch1, inp) = get_ch inp
in
loop_to_end (ch1, inp)
end
 
val inp = loop_to_end (ch, inp)
in
inp
end
end
 
val inp = check_end inp
in
(tok, inp)
end
 
fn
scan_string_literal (inp : inp_t) : (tokentuple_t, inp_t) =
let
val (ch, inp) = get_ch inp
val _ = assertloc ((ch.ichar) = '"')
 
fun
scan {u : pos}
(lst : list (char, u),
inp : inp_t) :
[m : pos] (list (char, m), inp_t) =
let
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) < 0 then
$raise unterminated_string_literal (ch.line_no,
ch.column_no, false)
else if (ch1.ichar) = char2i '\n' then
$raise unterminated_string_literal (ch.line_no,
ch.column_no, true)
else if (ch1.ichar) = char2i '"' then
(lst, inp)
else if (ch1.ichar) <> char2i '\\' then
scan (int2char0 (ch1.ichar) :: lst, inp)
else
let
val (ch2, inp) = get_ch inp
in
if (ch2.ichar) = char2i 'n' then
scan ('n' :: '\\' :: lst, inp)
else if (ch2.ichar) = char2i '\\' then
scan ('\\' :: '\\' :: lst, inp)
else
$raise unsupported_escape (ch1.line_no, ch1.column_no,
ch2.ichar)
end
end
 
val lst = '"' :: NIL
val (lst, inp) = scan (lst, inp)
val lst = '"' :: lst
val s = reverse_list_to_string lst
in
((TOKEN_STRING, s, ch.line_no, ch.column_no), inp)
end
 
fn
get_next_token (inp : inp_t,
lookups : !lookups_vt) : (tokentuple_t, inp_t) =
let
val inp = skip_spaces_and_comments inp
val (ch, inp) = get_ch inp
val ln = ch.line_no
val cn = ch.column_no
in
if ch.ichar < 0 then
((TOKEN_END_OF_INPUT, "", ln, cn), inp)
else
case+ int2char0 (ch.ichar) of
| ',' => ((TOKEN_COMMA, ",", ln, cn), inp)
| ';' => ((TOKEN_SEMICOLON, ";", ln, cn), inp)
| '\(' => ((TOKEN_LEFTPAREN, "(", ln, cn), inp)
| ')' => ((TOKEN_RIGHTPAREN, ")", ln, cn), inp)
| '\{' => ((TOKEN_LEFTBRACE, "{", ln, cn), inp)
| '}' => ((TOKEN_RIGHTBRACE, "}", ln, cn), inp)
| '*' => ((TOKEN_MULTIPLY, "*", ln, cn), inp)
| '/' => ((TOKEN_DIVIDE, "/", ln, cn), inp)
| '%' => ((TOKEN_MOD, "%", ln, cn), inp)
| '+' => ((TOKEN_ADD, "+", ln, cn), inp)
| '-' => ((TOKEN_SUBTRACT, "-", ln, cn), inp)
| '<' =>
let
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) = char2i '=' then
((TOKEN_LESSEQUAL, "<=", ln, cn), inp)
else
let
val inp = push_back_ch (ch1, inp)
in
((TOKEN_LESS, "<", ln, cn), inp)
end
end
| '>' =>
let
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) = char2i '=' then
((TOKEN_GREATEREQUAL, ">=", ln, cn), inp)
else
let
val inp = push_back_ch (ch1, inp)
in
((TOKEN_GREATER, ">", ln, cn), inp)
end
end
| '=' =>
let
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) = char2i '=' then
((TOKEN_EQUAL, "==", ln, cn), inp)
else
let
val inp = push_back_ch (ch1, inp)
in
((TOKEN_ASSIGN, "=", ln, cn), inp)
end
end
| '!' =>
let
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) = char2i '=' then
((TOKEN_NOTEQUAL, "!=", ln, cn), inp)
else
let
val inp = push_back_ch (ch1, inp)
in
((TOKEN_NOT, "!", ln, cn), inp)
end
end
| '&' =>
let
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) = char2i '&' then
((TOKEN_AND, "&&", ln, cn), inp)
else
$raise unexpected_character (ch.line_no, ch.column_no,
ch.ichar)
end
| '|' =>
let
val (ch1, inp) = get_ch inp
in
if (ch1.ichar) = char2i '|' then
((TOKEN_OR, "||", ln, cn), inp)
else
$raise unexpected_character (ch.line_no, ch.column_no,
ch.ichar)
end
| '"' =>
let
val inp = push_back_ch (ch, inp)
in
scan_string_literal inp
end
| '\'' =>
let
val inp = push_back_ch (ch, inp)
in
scan_character_literal inp
end
| _ when isdigit (ch.ichar) =>
let
val inp = push_back_ch (ch, inp)
in
scan_integer_literal (inp, lookups)
end
| _ when is_ident_start (ch.ichar) =>
let
val inp = push_back_ch (ch, inp)
in
scan_identifier_or_reserved_word (inp, lookups)
end
| _ => $raise unexpected_character (ch.line_no, ch.column_no,
ch.ichar)
end
 
fn
fprint_ullint_rightjust (outf : FILEref,
num : ullint) : void =
if num < 10ULL then
fprint! (outf, " ", num)
else if num < 100ULL then
fprint! (outf, " ", num)
else if num < 1000ULL then
fprint! (outf, " ", num)
else if num < 10000ULL then
fprint! (outf, " ", num)
else
fprint! (outf, num)
 
fn
print_token (outf : FILEref,
toktup : tokentuple_t,
lookups : !lookups_vt) : void =
let
macdef toknames = !(lookups.toknames)
val name = toknames[toktup.0]
val str = (toktup.1)
val line_no = (toktup.2)
val column_no = (toktup.3)
 
val _ = fprint_ullint_rightjust (outf, line_no)
val _ = fileref_puts (outf, " ")
val _ = fprint_ullint_rightjust (outf, column_no)
val _ = fileref_puts (outf, " ")
val _ = fileref_puts (outf, name)
in
begin
case+ toktup.0 of
| TOKEN_IDENTIFIER => fprint! (outf, " ", str)
| TOKEN_INTEGER => fprint! (outf, " ", str)
| TOKEN_STRING => fprint! (outf, " ", str)
| _ => ()
end;
 
fileref_putc (outf, '\n')
end
 
fn
scan_text (outf : FILEref,
inp : inp_t,
lookups : !lookups_vt) : void =
let
fun
loop (inp : inp_t,
lookups : !lookups_vt) : void =
let
val (toktup, inp) = get_next_token (inp, lookups)
in
print_token (outf, toktup, lookups);
if toktup.0 <> TOKEN_END_OF_INPUT then
loop (inp, lookups)
end
in
loop (inp, lookups)
end
 
(********************************************************************)
 
fn
main_program (inpf : FILEref,
outf : FILEref) : int =
let
(* Using a simple Scheme program, I found the following perfect
hash for the reserved words, using the sum of the first two
characters as the hash value. *)
var reserved_words =
@[String][RESERVED_WORD_HASHTAB_SIZE]
("if", "print", "else", "", "putc", "", "", "while", "")
var reserved_word_tokens =
@[token_t][RESERVED_WORD_HASHTAB_SIZE]
(TOKEN_IF, TOKEN_PRINT, TOKEN_ELSE, TOKEN_IDENTIFIER,
TOKEN_PUTC, TOKEN_IDENTIFIER, TOKEN_IDENTIFIER, TOKEN_WHILE,
TOKEN_IDENTIFIER)
 
var token_names =
@[string][NUM_TOKENS]
("Keyword_else",
"Keyword_if",
"Keyword_print",
"Keyword_putc",
"Keyword_while",
"Op_multiply",
"Op_divide",
"Op_mod",
"Op_add",
"Op_subtract",
"Op_negate",
"Op_less",
"Op_lessequal",
"Op_greater",
"Op_greaterequal",
"Op_equal",
"Op_notequal",
"Op_not",
"Op_assign",
"Op_and",
"Op_or",
"LeftParen",
"RightParen",
"LeftBrace",
"RightBrace",
"Semicolon",
"Comma",
"Identifier",
"Integer",
"String",
"End_of_input")
 
var lookups : lookups_vt =
@{
pf_toknames = view@ token_names,
pf_wordtab = view@ reserved_words,
pf_toktab = view@ reserved_word_tokens |
toknames = addr@ token_names,
wordtab = addr@ reserved_words,
toktab = addr@ reserved_word_tokens
}
 
val inp =
@{
file = inpf,
pushback = NIL,
line_no = 1ULL,
column_no = 1ULL
}
 
val _ = scan_text (outf, inp, lookups)
 
val @{
pf_toknames = pf_toknames,
pf_wordtab = pf_wordtab,
pf_toktab = pf_toktab |
toknames = toknames,
wordtab = wordtab,
toktab = toktab
} = lookups
prval _ = view@ token_names := pf_toknames
prval _ = view@ reserved_words := pf_wordtab
prval _ = view@ reserved_word_tokens := pf_toktab
in
0
end
 
macdef lex_error = "Lexical error: "
 
implement
main (argc, argv) =
let
val inpfname =
if 2 <= argc then
$UN.cast{string} argv[1]
else
"-"
val outfname =
if 3 <= argc then
$UN.cast{string} argv[2]
else
"-"
in
try
let
val inpf =
if (inpfname : string) = "-" then
stdin_ref
else
fileref_open_exn (inpfname, file_mode_r)
 
val outf =
if (outfname : string) = "-" then
stdout_ref
else
fileref_open_exn (outfname, file_mode_w)
in
main_program (inpf, outf)
end
with
| ~ unterminated_comment (line_no, column_no) =>
begin
fprintln! (stderr_ref, lex_error,
"unterminated comment starting at ",
line_no, ":", column_no);
1
end
| ~ unterminated_character_literal (line_no, column_no) =>
begin
fprintln! (stderr_ref, lex_error,
"unterminated character literal starting at ",
line_no, ":", column_no);
1
end
| ~ multicharacter_literal (line_no, column_no) =>
begin
fprintln! (stderr_ref, lex_error,
"unsupported multicharacter literal starting at ",
line_no, ":", column_no);
1
end
| ~ unterminated_string_literal (line_no, column_no,
end_of_line) =>
let
val s =
begin
if end_of_line then
"end of line"
else
"end of input"
end : String
in
fprintln! (stderr_ref, lex_error,
"unterminated string literal (", s,
") starting at ", line_no, ":", column_no);
1
end
| ~ unsupported_escape (line_no, column_no, c) =>
begin
fprintln! (stderr_ref, lex_error,
"unsupported escape \\",
int2char0 c, " starting at ",
line_no, ":", column_no);
1
end
| ~ invalid_integer_literal (line_no, column_no, s) =>
begin
fprintln! (stderr_ref, lex_error,
"invalid integer literal ", s,
" starting at ", line_no, ":", column_no);
1
end
| ~ unexpected_character (line_no, column_no, c) =>
begin
fprintln! (stderr_ref, lex_error,
"unexpected character '", int2char0 c,
"' at ", line_no, ":", column_no);
1
end
end
 
(********************************************************************)</syntaxhighlight>
 
{{out}}
<pre>$ patscc -O2 -DATS_MEMALLOC_GCBDW -o lex lex-in-ATS.dats -lgc && ./lex compiler-tests/testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|AWK}}==
Tested with gawk 4.1.1 and mawk 1.3.4.
<syntaxhighlight lang="awk">
BEGIN {
all_syms["tk_EOI" ] = "End_of_input"
all_syms["tk_Mul" ] = "Op_multiply"
all_syms["tk_Div" ] = "Op_divide"
all_syms["tk_Mod" ] = "Op_mod"
all_syms["tk_Add" ] = "Op_add"
all_syms["tk_Sub" ] = "Op_subtract"
all_syms["tk_Negate" ] = "Op_negate"
all_syms["tk_Not" ] = "Op_not"
all_syms["tk_Lss" ] = "Op_less"
all_syms["tk_Leq" ] = "Op_lessequal"
all_syms["tk_Gtr" ] = "Op_greater"
all_syms["tk_Geq" ] = "Op_greaterequal"
all_syms["tk_Eq" ] = "Op_equal"
all_syms["tk_Neq" ] = "Op_notequal"
all_syms["tk_Assign" ] = "Op_assign"
all_syms["tk_And" ] = "Op_and"
all_syms["tk_Or" ] = "Op_or"
all_syms["tk_If" ] = "Keyword_if"
all_syms["tk_Else" ] = "Keyword_else"
all_syms["tk_While" ] = "Keyword_while"
all_syms["tk_Print" ] = "Keyword_print"
all_syms["tk_Putc" ] = "Keyword_putc"
all_syms["tk_Lparen" ] = "LeftParen"
all_syms["tk_Rparen" ] = "RightParen"
all_syms["tk_Lbrace" ] = "LeftBrace"
all_syms["tk_Rbrace" ] = "RightBrace"
all_syms["tk_Semi" ] = "Semicolon"
all_syms["tk_Comma" ] = "Comma"
all_syms["tk_Ident" ] = "Identifier"
all_syms["tk_Integer"] = "Integer"
all_syms["tk_String" ] = "String"
 
## single character only symbols
symbols["{" ] = "tk_Lbrace"
symbols["}" ] = "tk_Rbrace"
symbols["(" ] = "tk_Lparen"
symbols[")" ] = "tk_Rparen"
symbols["+" ] = "tk_Add"
symbols["-" ] = "tk_Sub"
symbols["*" ] = "tk_Mul"
symbols["%" ] = "tk_Mod"
symbols[";" ] = "tk_Semi"
symbols["," ] = "tk_Comma"
 
key_words["if" ] = "tk_If"
key_words["else" ] = "tk_Else"
key_words["print"] = "tk_Print"
key_words["putc" ] = "tk_Putc"
key_words["while"] = "tk_While"
 
# Set up an array that emulates the ord() function.
for(n=0;n<256;n++)
ord[sprintf("%c",n)]=n
 
input_file = "-"
if (ARGC > 1)
input_file = ARGV[1]
RS=FS="" # read complete file into one line $0
getline < input_file
the_ch = " " # dummy first char - but it must be a space
the_col = 0 # always points to the current character
the_line = 1
for (the_nf=1; ; ) {
split(gettok(), t, SUBSEP)
printf("%5s %5s %-14s", t[2], t[3], all_syms[t[1]])
if (t[1] == "tk_Integer") printf(" %5s\n", t[4])
else if (t[1] == "tk_Ident" ) printf(" %s\n", t[4])
else if (t[1] == "tk_String" ) printf(" \"%s\"\n", t[4])
else print("")
if (t[1] == "tk_EOI")
break
}
}
 
#*** show error and exit
function error(line, col, msg) {
print(line, col, msg)
exit(1)
}
 
# get the next character from the input
function next_ch() {
the_ch = $the_nf
the_nf ++
the_col ++
if (the_ch == "\n") {
the_line ++
the_col = 0
}
return the_ch
}
 
#*** 'x' - character constants
function char_lit(err_line, err_col) {
n = ord[next_ch()] # skip opening quote
if (the_ch == "'") {
error(err_line, err_col, "empty character constant")
} else if (the_ch == "\\") {
next_ch()
if (the_ch == "n")
n = 10
else if (the_ch == "\\")
n = ord["\\"]
else
error(err_line, err_col, "unknown escape sequence " the_ch)
}
if (next_ch() != "'")
error(err_line, err_col, "multi-character constant")
next_ch()
return "tk_Integer" SUBSEP err_line SUBSEP err_col SUBSEP n
}
 
#*** process divide or comments
function div_or_cmt(err_line, err_col) {
if (next_ch() != "*")
return "tk_Div" SUBSEP err_line SUBSEP err_col
# comment found
next_ch()
while (1) {
if (the_ch == "*") {
if (next_ch() == "/") {
next_ch()
return gettok()
} else if (the_ch == "") {
error(err_line, err_col, "EOF in comment")
}
} else {
next_ch()
}
}
}
 
#*** "string"
function string_lit(start, err_line, err_col) {
text = ""
while (next_ch() != start) {
if (the_ch == "")
error(err_line, err_col, "EOF while scanning string literal")
if (the_ch == "\n")
error(err_line, err_col, "EOL while scanning string literal")
text = text the_ch
}
next_ch()
return "tk_String" SUBSEP err_line SUBSEP err_col SUBSEP text
}
 
#*** handle identifiers and integers
function ident_or_int(err_line, err_col) {
is_number = 1
text = ""
while ((the_ch ~ /^[0-9a-zA-Z]+$/) || (the_ch == "_")) {
text = text the_ch
if (! (the_ch ~ /^[0-9]+$/))
is_number = 0
next_ch()
}
if (text == "")
error(err_line, err_col, "ident_or_int: unrecognized character: " the_ch)
if (text ~ /^[0-9]/) {
if (! is_number)
error(err_line, err_col, "invalid number: " text)
n = text + 0
return "tk_Integer" SUBSEP err_line SUBSEP err_col SUBSEP n
}
if (text in key_words)
return key_words[text] SUBSEP err_line SUBSEP err_col
return "tk_Ident" SUBSEP err_line SUBSEP err_col SUBSEP text
}
 
#*** look ahead for '>=', etc.
function follow(expect, ifyes, ifno, err_line, err_col) {
if (next_ch() == expect) {
next_ch()
return ifyes SUBSEP err_line SUBSEP err_col
}
  if (ifno == "tk_EOI")
error(err_line, err_col, "follow: unrecognized character: " the_ch)
return ifno SUBSEP err_line SUBSEP err_col
}
 
#*** return the next token type
function gettok() {
while (the_ch == " " || the_ch == "\n" || the_ch == "\r")
next_ch()
err_line = the_line
err_col = the_col
if (the_ch == "" ) return "tk_EOI" SUBSEP err_line SUBSEP err_col
else if (the_ch == "/") return div_or_cmt(err_line, err_col)
else if (the_ch == "'") return char_lit(err_line, err_col)
else if (the_ch == "<") return follow("=", "tk_Leq", "tk_Lss", err_line, err_col)
else if (the_ch == ">") return follow("=", "tk_Geq", "tk_Gtr", err_line, err_col)
else if (the_ch == "=") return follow("=", "tk_Eq", "tk_Assign", err_line, err_col)
else if (the_ch == "!") return follow("=", "tk_Neq", "tk_Not", err_line, err_col)
else if (the_ch == "&") return follow("&", "tk_And", "tk_EOI", err_line, err_col)
else if (the_ch == "|") return follow("|", "tk_Or", "tk_EOI", err_line, err_col)
else if (the_ch =="\"") return string_lit(the_ch, err_line, err_col)
else if (the_ch in symbols) {
sym = symbols[the_ch]
next_ch()
return sym SUBSEP err_line SUBSEP err_col
} else {
return ident_or_int(err_line, err_col)
}
}
</syntaxhighlight>
{{out|case=count}}
<b>
<pre>
1 1 Identifier count
1 7 Op_assign
1 9 Integer 1
1 10 Semicolon
2 1 Keyword_while
2 7 LeftParen
2 8 Identifier count
2 14 Op_less
2 16 Integer 10
2 18 RightParen
2 20 LeftBrace
3 5 Keyword_print
3 10 LeftParen
3 11 String "count is: "
3 23 Comma
3 25 Identifier count
3 30 Comma
3 32 String "\n"
3 36 RightParen
3 37 Semicolon
4 5 Identifier count
4 11 Op_assign
4 13 Identifier count
4 19 Op_add
4 21 Integer 1
4 22 Semicolon
5 1 RightBrace
5 3 End_of_input
</pre>
</b>
 
=={{header|C}}==
Tested with gcc 4.8.1 and later; compiles warning-free with -Wpedantic -pedantic -Wall -Wextra
<syntaxhighlight lang="c">#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
Line 316 ⟶ 2,340:
int _qy_ ## name ## _max = 0
#define da_rewind(name) _qy_ ## name ## _p = 0
#define da_redim(name) do {if (_qy_ ## name ## _p >= _qy_ ## name ## _max) \
name = realloc(name, (_qy_ ## name ## _max += 32) * sizeof(name[0]));} while (0)
#define da_append(name, x) do {da_redim(name); name[_qy_ ## name ## _p++] = x;} while (0)
#define da_len(name) _qy_ ## name ## _p
 
// dependency: ordering of the atr table in parse.c is based on these
typedef enum {
    tk_EOI, tk_Mul, tk_Div, tk_Mod, tk_Add, tk_Sub, tk_Negate, tk_Not, tk_Lss, tk_Leq,
    tk_Gtr, tk_Geq, tk_Eq, tk_Neq, tk_Assign, tk_And, tk_Or, tk_If, tk_Else, tk_While,
tk_Print, tk_Putc, tk_Lparen, tk_Rparen, tk_Lbrace, tk_Rbrace, tk_Semi, tk_Comma,
tk_Ident, tk_Integer, tk_String
} TokenType;
 
typedef struct {
    TokenType tok;
int err_ln, err_col;
union {
Line 340 ⟶ 2,365:
da_dim(text, char);
 
tok_s gettok(void);
 
static void error(int err_line, int err_col, const char *fmt, ... ) {
Line 353 ⟶ 2,378:
}
 
static int next_ch(void) { /* get next char from input */
the_ch = getc(source_fp);
++col;
Line 377 ⟶ 2,402:
error(err_line, err_col, "multi-character constant");
next_ch();
    return (tok_s){tk_Integer, err_line, err_col, {n}};
}
 
static tok_s div_or_cmt(int err_line, int err_col) { /* process divide or comments */
if (the_ch != '*')
        return (tok_s){tk_Div, err_line, err_col, {0}};
 
/* comment found */
next_ch();
for (;;) {
        if (the_ch == '*') {
            if (next_ch() == '/') {
                next_ch();
                return gettok();
            }
        } else if (the_ch == EOF)
            error(err_line, err_col, "EOF in comment");
        else
            next_ch();
}
}
Line 405 ⟶ 2,435:
 
next_ch();
    return (tok_s){tk_String, err_line, err_col, {.text=text}};
}
 
Line 414 ⟶ 2,444:
static TokenType get_ident_type(const char *ident) {
static struct {
const char *s;
TokenType sym;
} kwds[] = {
        {"else",  tk_Else},
        {"if",    tk_If},
        {"print", tk_Print},
        {"putc",  tk_Putc},
        {"while", tk_While},
}, *kwp;
 
    return (kwp = bsearch(&ident, kwds, NELEMS(kwds), sizeof(kwds[0]), kwd_cmp)) == NULL ? tk_Ident : kwp->sym;
}
 
Line 445 ⟶ 2,476:
if (n == LONG_MAX && errno == ERANGE)
error(err_line, err_col, "Number exceeds maximum value");
        return (tok_s){tk_Integer, err_line, err_col, {n}};
}
return (tok_s){get_ident_type(text), err_line, err_col, {.text=text}};
Line 455 ⟶ 2,486:
return (tok_s){ifyes, err_line, err_col, {0}};
}
    if (ifno == tk_EOI)
error(err_line, err_col, "follow: unrecognized character '%c' (%d)\n", the_ch, the_ch);
return (tok_s){ifno, err_line, err_col, {0}};
}
 
tok_s gettok(void) { /* return the token type */
/* skip white space */
while (isspace(the_ch))
Line 467 ⟶ 2,498:
int err_col = col;
switch (the_ch) {
        case '{':  next_ch(); return (tok_s){tk_Lbrace, err_line, err_col, {0}};
        case '}':  next_ch(); return (tok_s){tk_Rbrace, err_line, err_col, {0}};
        case '(':  next_ch(); return (tok_s){tk_Lparen, err_line, err_col, {0}};
        case ')':  next_ch(); return (tok_s){tk_Rparen, err_line, err_col, {0}};
        case '+':  next_ch(); return (tok_s){tk_Add, err_line, err_col, {0}};
        case '-':  next_ch(); return (tok_s){tk_Sub, err_line, err_col, {0}};
        case '*':  next_ch(); return (tok_s){tk_Mul, err_line, err_col, {0}};
        case '%':  next_ch(); return (tok_s){tk_Mod, err_line, err_col, {0}};
        case ';':  next_ch(); return (tok_s){tk_Semi, err_line, err_col, {0}};
        case ',':  next_ch(); return (tok_s){tk_Comma, err_line, err_col, {0}};
        case '/':  next_ch(); return div_or_cmt(err_line, err_col);
        case '\'': next_ch(); return char_lit(the_ch, err_line, err_col);
        case '<':  next_ch(); return follow('=', tk_Leq, tk_Lss, err_line, err_col);
        case '>':  next_ch(); return follow('=', tk_Geq, tk_Gtr, err_line, err_col);
        case '=':  next_ch(); return follow('=', tk_Eq, tk_Assign, err_line, err_col);
        case '!':  next_ch(); return follow('=', tk_Neq, tk_Not, err_line, err_col);
        case '&':  next_ch(); return follow('&', tk_And, tk_EOI, err_line, err_col);
        case '|':  next_ch(); return follow('|', tk_Or, tk_EOI, err_line, err_col);
        case '"' : return string_lit(the_ch, err_line, err_col);
        default:   return ident_or_int(err_line, err_col);
        case EOF:  return (tok_s){tk_EOI, err_line, err_col, {0}};
}
}
 
void run(void) { /* tokenize the given input */
tok_s tok;
do {
tok = gettok();
        fprintf(dest_fp, "%5d  %5d %.15s",
            tok.err_ln, tok.err_col,
            &"End_of_input    Op_multiply     Op_divide       Op_mod          Op_add          "
             "Op_subtract     Op_negate       Op_not          Op_less         Op_lessequal    "
             "Op_greater      Op_greaterequal Op_equal        Op_notequal     Op_assign       "
             "Op_and          Op_or           Keyword_if      Keyword_else    Keyword_while   "
             "Keyword_print   Keyword_putc    LeftParen       RightParen      LeftBrace       "
             "RightBrace      Semicolon       Comma           Identifier      Integer         "
             "String          "[tok.tok * 16]);
        if (tok.tok == tk_Integer)     fprintf(dest_fp, " %4d", tok.n);
        else if (tok.tok == tk_Ident)  fprintf(dest_fp, " %s", tok.text);
        else if (tok.tok == tk_String) fprintf(dest_fp, " \"%s\"", tok.text);
        fprintf(dest_fp, "\n");
    } while (tok.tok != tk_EOI);
if (dest_fp != stdout)
fclose(dest_fp);
Line 520 ⟶ 2,557:
run();
return 0;
}</syntaxhighlight>
 
{{out|case=test case 3}}
<b>
<pre>
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input
</pre>
</b>
 
=={{header|C sharp|C#}}==
Requires C# 6.0 because of the use of null-coalescing operators.
<syntaxhighlight lang="csharp">
using System;
using System.IO;
using System.Linq;
using System.Collections.Generic;
 
 
namespace Rosetta {
 
public enum TokenType {
End_of_input, Op_multiply, Op_divide, Op_mod, Op_add, Op_subtract,
Op_negate, Op_not, Op_less, Op_lessequal, Op_greater, Op_greaterequal,
Op_equal, Op_notequal, Op_assign, Op_and, Op_or, Keyword_if,
Keyword_else, Keyword_while, Keyword_print, Keyword_putc, LeftParen, RightParen,
LeftBrace, RightBrace, Semicolon, Comma, Identifier, Integer, String, None
}
 
/// <summary>
/// Storage class for tokens
/// </summary>
public class Token {
public TokenType Type { get; set; }
public int Line { get; set; }
public int Position { get; set; }
public string Value { get; set; }
public override string ToString() {
if (Type == TokenType.Integer || Type == TokenType.Identifier) {
return String.Format("{0,-5} {1,-5} {2,-14} {3}", Line, Position, Type.ToString(), Value);
} else if (Type == TokenType.String) {
return String.Format("{0,-5} {1,-5} {2,-14} \"{3}\"", Line, Position, Type.ToString(), Value.Replace("\n", "\\n"));
}
return String.Format("{0,-5} {1,-5} {2,-14}", Line, Position, Type.ToString());
}
}
 
/// <summary>
/// C# Example of Lexical scanner for Rosetta Compiler
/// </summary>
public class LexicalScanner {
 
// character classes
private const string _letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_";
private const string _numbers = "0123456789";
private const string _identifier = _letters + _numbers + "_";
private const string _whitespace = " \t\n\r";
// mappings from string keywords to token type
private Dictionary<string, TokenType> _keywordTokenTypeMap = new Dictionary<string, TokenType>() {
{ "if", TokenType.Keyword_if },
{ "else", TokenType.Keyword_else },
{ "while", TokenType.Keyword_while },
{ "print", TokenType.Keyword_print },
{ "putc", TokenType.Keyword_putc }
};
 
// mappings from simple operators to token type
private Dictionary<string, TokenType> _operatorTokenTypeMap = new Dictionary<string, TokenType>() {
{ "+", TokenType.Op_add },
{ "-", TokenType.Op_subtract },
{ "*", TokenType.Op_multiply },
{ "/", TokenType.Op_divide },
{ "%", TokenType.Op_mod },
{ "=", TokenType.Op_assign },
{ "<", TokenType.Op_less },
{ ">", TokenType.Op_greater },
{ "!", TokenType.Op_not },
};
 
private List<string> _keywords;
private string _operators = "+-*/%=<>!%";
 
private string _code;
private List<Token> tokens = new List<Token>();
 
private int _line = 1;
private int _position = 1;
 
public string CurrentCharacter {
get {
try {
return _code.Substring(0, 1);
} catch (ArgumentOutOfRangeException) {
return "";
}
}
}
 
/// <summary>
/// Lexical scanner initialiser
/// </summary>
/// <param name="code">Code to be tokenised</param>
public LexicalScanner (string code) {
_code = code;
_keywords = _keywordTokenTypeMap.Keys.ToList();
}
 
/// <summary>
/// Advance the cursor forward given number of characters
/// </summary>
/// <param name="characters">Number of characters to advance</param>
private void advance(int characters=1) {
try {
// reset position when there is a newline
if (CurrentCharacter == "\n") {
_position = 0;
_line++;
}
_code = _code.Substring(characters, _code.Length - characters);
_position += characters;
} catch (ArgumentOutOfRangeException) {
_code = "";
}
}
 
/// <summary>
/// Outputs error message to the console and exits
/// </summary>
/// <param name="message">Error message to display to user</param>
/// <param name="line">Line error occurred on</param>
/// <param name="position">Line column that the error occurred at</param>
public void error(string message, int line, int position) {
// output error to the console and exit
Console.WriteLine(String.Format("{0} @ {1}:{2}", message, line, position));
Environment.Exit(1);
}
 
/// <summary>
/// Pattern matching using first & follow matching
/// </summary>
/// <param name="recogniseClass">String of characters that identifies the token type
/// or the exact match the be made if exact:true</param>
/// <param name="matchClass">String of characters to match against remaining target characters</param>
/// <param name="tokenType">Type of token the match represents.</param>
/// <param name="notNextClass">Optional class of characters that cannot follow the match</param>
/// <param name="maxLen">Optional maximum length of token value</param>
/// <param name="exact">Denotes whether recogniseClass represents an exact match or class match.
/// Default: false</param>
/// <param name="discard">Denotes whether the token is kept or discarded. Default: false</param>
    /// <param name="offset">Optional line position offset to account for discarded tokens</param>
/// <returns>Boolean indicating if a match was made </returns>
public bool match(string recogniseClass, string matchClass, TokenType tokenType,
string notNextClass=null, int maxLen=Int32.MaxValue, bool exact=false,
bool discard=false, int offset=0) {
 
// if we've hit the end of the file, there's no more matching to be done
if (CurrentCharacter == "")
return false;
 
// store _current_ line and position so that our vectors point at the start
// of each token
int line = _line;
int position = _position;
 
// special case exact tokens to avoid needing to worry about backtracking
if (exact) {
if (_code.StartsWith(recogniseClass)) {
if (!discard)
tokens.Add(new Token() { Type = tokenType, Value = recogniseClass, Line = line, Position = position - offset});
advance(recogniseClass.Length);
return true;
}
return false;
}
 
// first match - denotes the token type usually
if (!recogniseClass.Contains(CurrentCharacter))
return false;
 
string tokenValue = CurrentCharacter;
advance();
 
// follow match while we haven't exceeded maxLen and there are still characters
// in the code stream
while ((matchClass ?? "").Contains(CurrentCharacter) && tokenValue.Length <= maxLen && CurrentCharacter != "") {
tokenValue += CurrentCharacter;
advance();
}
 
// ensure that any incompatible characters are not next to the token
    // e.g. 42fred is invalid, and is recognized neither as a number nor as an identifier.
// _letters would be the notNextClass
if (notNextClass != null && notNextClass.Contains(CurrentCharacter))
error("Unrecognised character: " + CurrentCharacter, _line, _position);
 
    // only add tokens to the stack that aren't marked as discard - don't want
// things like open and close quotes/comments
if (!discard) {
Token token = new Token() { Type = tokenType, Value = tokenValue, Line = line, Position = position - offset };
tokens.Add(token);
}
 
return true;
}
 
/// <summary>
/// Tokenise the input code
/// </summary>
/// <returns>List of Tokens</returns>
public List<Token> scan() {
 
while (CurrentCharacter != "") {
// match whitespace
match(_whitespace, _whitespace, TokenType.None, discard: true);
 
// match integers
match(_numbers, _numbers, TokenType.Integer, notNextClass:_letters);
// match identifiers and keywords
if (match(_letters, _identifier, TokenType.Identifier)) {
Token match = tokens.Last();
if (_keywords.Contains(match.Value))
match.Type = _keywordTokenTypeMap[match.Value];
}
 
// match string similarly to comments without allowing newlines
// this token doesn't get discarded though
if (match("\"", null, TokenType.String, discard:true)) {
string value = "";
int position = _position;
while (!match("\"", null, TokenType.String, discard:true)) {
// not allowed newlines in strings
if (CurrentCharacter == "\n")
error("End-of-line while scanning string literal. Closing string character not found before end-of-line", _line, _position);
// end of file reached before finding end of string
if (CurrentCharacter == "")
error("End-of-file while scanning string literal. Closing string character not found", _line, _position);
 
value += CurrentCharacter;
 
// deal with escape sequences - we only accept newline (\n)
if (value.Length >= 2) {
string lastCharacters = value.Substring(value.Length - 2, 2);
if (lastCharacters[0] == '\\') {
if (lastCharacters[1] != 'n') {
error("Unknown escape sequence. ", _line, position);
}
value = value.Substring(0, value.Length - 2).ToString() + "\n";
}
}
 
advance();
}
tokens.Add(new Token() { Type = TokenType.String, Value = value, Line = _line, Position = position - 1});
}
 
// match string literals
if (match("'", null, TokenType.Integer, discard:true)) {
int value;
int position = _position;
value = CurrentCharacter.ToCharArray()[0];
advance();
 
// deal with empty literals ''
if (value == '\'')
error("Empty character literal", _line, _position);
 
// deal with escaped characters, only need to worry about \n and \\
    // throw an error on any other
if (value == '\\') {
if (CurrentCharacter == "n") {
value = '\n';
} else if (CurrentCharacter == "\\") {
value = '\\';
} else {
error("Unknown escape sequence. ", _line, _position - 1);
}
advance();
}
 
    // if we haven't hit a closing ' here, there are too many characters
// in the literal
if (!match("'", null, TokenType.Integer, discard: true))
error("Multi-character constant", _line, _position);
 
tokens.Add(new Rosetta.Token() { Type = TokenType.Integer, Value = value.ToString(), Line = _line, Position = position - 1 });
}
 
// match comments by checking for starting token, then advancing
// until closing token is matched
if (match("/*", null, TokenType.None, exact: true, discard: true)) {
while (!match("*/", null, TokenType.None, exact: true, discard: true)) {
// reached the end of the file without closing comment!
if (CurrentCharacter == "")
error("End-of-file in comment. Closing comment characters not found.", _line, _position);
advance();
}
continue;
}
 
// match complex operators
match("<=", null, TokenType.Op_lessequal, exact: true);
match(">=", null, TokenType.Op_greaterequal, exact: true);
match("==", null, TokenType.Op_equal, exact: true);
match("!=", null, TokenType.Op_notequal, exact: true);
match("&&", null, TokenType.Op_and, exact: true);
match("||", null, TokenType.Op_or, exact: true);
 
// match simple operators
if (match(_operators, null, TokenType.None, maxLen:1)) {
Token match = tokens.Last();
match.Type = _operatorTokenTypeMap[match.Value];
}
 
// brackets, braces and separators
match("(", null, TokenType.LeftParen, exact: true);
match(")", null, TokenType.RightParen, exact: true);
match("{", null, TokenType.LeftBrace, exact: true);
match("}", null, TokenType.RightBrace, exact: true);
match(";", null, TokenType.Semicolon, exact: true);
match(",", null, TokenType.Comma, exact: true);
 
}
 
// end of file token
tokens.Add(new Rosetta.Token() { Type = TokenType.End_of_input, Line = _line, Position = _position });
return tokens;
}
 
static void Main (string[] args) {
StreamReader inputFile;
 
// if we passed in a filename, read code from that, else
// read code from stdin
if (args.Length > 0) {
string path = args[0];
try {
inputFile = new StreamReader(path);
} catch (IOException) {
inputFile = new StreamReader(Console.OpenStandardInput(8192));
}
} else {
inputFile = new StreamReader(Console.OpenStandardInput(8192));
}
 
string code = inputFile.ReadToEnd();
 
// strip windows line endings out
code = code.Replace("\r", "");
 
LexicalScanner scanner = new LexicalScanner(code);
List<Token> tokens = scanner.scan();
 
foreach(Token token in tokens) {
Console.WriteLine(token.ToString());
}
}
}
}
</syntaxhighlight>
 
{{out|case=test case 3}}
<b>
<pre>
5     16    Keyword_print
5     40    Op_subtract
6     16    Keyword_putc
6     40    Op_less
7     16    Keyword_if
7     40    Op_greater
8     16    Keyword_else
8     40    Op_lessequal
9     16    Keyword_while
9     40    Op_greaterequal
10    16    LeftBrace
10    40    Op_equal
11    16    RightBrace
11    40    Op_notequal
12    16    LeftParen
12    40    Op_and
13    16    RightParen
13    40    Op_or
14    16    Op_subtract
14    40    Semicolon
15    16    Op_not
15    40    Comma
16    16    Op_multiply
16    40    Op_assign
17    16    Op_divide
17    40    Integer        42
18    16    Op_mod
18    40    String         "String literal"
19    16    Op_add
19    40    Identifier     variable_name
20    26    Integer        10
21    26    Integer        92
22    26    Integer        32
23    1     End_of_input
</pre>
</b>
 
=={{header|C++}}==
Tested with GCC 9.3.0 (g++ -std=c++17)
<syntaxhighlight lang="cpp">#include <charconv> // std::from_chars
#include <fstream> // file_to_string, string_to_file
#include <functional> // std::invoke
#include <iomanip> // std::setw
#include <ios> // std::left
#include <iostream>
#include <map> // keywords
#include <sstream>
#include <string>
#include <utility> // std::forward
#include <variant> // TokenVal
 
using namespace std;
 
// =====================================================================================================================
// Machinery
// =====================================================================================================================
string file_to_string (const string& path)
{
// Open file
ifstream file {path, ios::in | ios::binary | ios::ate};
if (!file) throw (errno);
 
// Allocate string memory
string contents;
contents.resize(file.tellg());
 
// Read file contents into string
file.seekg(0);
file.read(contents.data(), contents.size());
 
return contents;
}
 
void string_to_file (const string& path, string contents)
{
ofstream file {path, ios::out | ios::binary};
if (!file) throw (errno);
 
file.write(contents.data(), contents.size());
}
 
template <class F>
void with_IO (string source, string destination, F&& f)
{
string input;
 
if (source == "stdin") getline(cin, input);
else input = file_to_string(source);
 
string output = invoke(forward<F>(f), input);
 
if (destination == "stdout") cout << output;
else string_to_file(destination, output);
}
 
// Add escaped newlines and backslashes back in for printing
string sanitize (string s)
{
for (auto i = 0u; i < s.size(); ++i)
{
if (s[i] == '\n') s.replace(i++, 1, "\\n");
else if (s[i] == '\\') s.replace(i++, 1, "\\\\");
}
 
return s;
}
 
class Scanner
{
public:
const char* pos;
int line = 1;
int column = 1;
 
Scanner (const char* source) : pos {source} {}
 
inline char peek () { return *pos; }
 
void advance ()
{
if (*pos == '\n') { ++line; column = 1; }
else ++column;
 
++pos;
}
 
char next ()
{
advance();
return peek();
}
 
void skip_whitespace ()
{
while (isspace(static_cast<unsigned char>(peek())))
advance();
}
}; // class Scanner
 
 
// =====================================================================================================================
// Tokens
// =====================================================================================================================
enum class TokenName
{
OP_MULTIPLY, OP_DIVIDE, OP_MOD, OP_ADD, OP_SUBTRACT, OP_NEGATE,
OP_LESS, OP_LESSEQUAL, OP_GREATER, OP_GREATEREQUAL, OP_EQUAL, OP_NOTEQUAL,
OP_NOT, OP_ASSIGN, OP_AND, OP_OR,
LEFTPAREN, RIGHTPAREN, LEFTBRACE, RIGHTBRACE, SEMICOLON, COMMA,
KEYWORD_IF, KEYWORD_ELSE, KEYWORD_WHILE, KEYWORD_PRINT, KEYWORD_PUTC,
IDENTIFIER, INTEGER, STRING,
END_OF_INPUT, ERROR
};
 
using TokenVal = variant<int, string>;
 
struct Token
{
TokenName name;
TokenVal value;
int line;
int column;
};
 
 
const char* to_cstring (TokenName name)
{
static const char* s[] =
{
"Op_multiply", "Op_divide", "Op_mod", "Op_add", "Op_subtract", "Op_negate",
"Op_less", "Op_lessequal", "Op_greater", "Op_greaterequal", "Op_equal", "Op_notequal",
"Op_not", "Op_assign", "Op_and", "Op_or",
"LeftParen", "RightParen", "LeftBrace", "RightBrace", "Semicolon", "Comma",
"Keyword_if", "Keyword_else", "Keyword_while", "Keyword_print", "Keyword_putc",
"Identifier", "Integer", "String",
"End_of_input", "Error"
};
 
return s[static_cast<int>(name)];
}
 
 
string to_string (Token t)
{
ostringstream out;
out << setw(2) << t.line << " " << setw(2) << t.column << " ";
 
switch (t.name)
{
case (TokenName::IDENTIFIER) : out << "Identifier " << get<string>(t.value); break;
case (TokenName::INTEGER) : out << "Integer " << left << get<int>(t.value); break;
case (TokenName::STRING) : out << "String \"" << sanitize(get<string>(t.value)) << '"'; break;
case (TokenName::END_OF_INPUT) : out << "End_of_input"; break;
case (TokenName::ERROR) : out << "Error " << get<string>(t.value); break;
default : out << to_cstring(t.name);
}
 
out << '\n';
 
return out.str();
}
 
 
// =====================================================================================================================
// Lexer
// =====================================================================================================================
class Lexer
{
public:
Lexer (const char* source) : s {source}, pre_state {s} {}
 
bool has_more () { return s.peek() != '\0'; }
 
Token next_token ()
{
s.skip_whitespace();
 
pre_state = s;
 
switch (s.peek())
{
case '*' : return simply(TokenName::OP_MULTIPLY);
case '%' : return simply(TokenName::OP_MOD);
case '+' : return simply(TokenName::OP_ADD);
case '-' : return simply(TokenName::OP_SUBTRACT);
case '{' : return simply(TokenName::LEFTBRACE);
case '}' : return simply(TokenName::RIGHTBRACE);
case '(' : return simply(TokenName::LEFTPAREN);
case ')' : return simply(TokenName::RIGHTPAREN);
case ';' : return simply(TokenName::SEMICOLON);
case ',' : return simply(TokenName::COMMA);
case '&' : return expect('&', TokenName::OP_AND);
case '|' : return expect('|', TokenName::OP_OR);
case '<' : return follow('=', TokenName::OP_LESSEQUAL, TokenName::OP_LESS);
case '>' : return follow('=', TokenName::OP_GREATEREQUAL, TokenName::OP_GREATER);
case '=' : return follow('=', TokenName::OP_EQUAL, TokenName::OP_ASSIGN);
case '!' : return follow('=', TokenName::OP_NOTEQUAL, TokenName::OP_NOT);
case '/' : return divide_or_comment();
case '\'' : return char_lit();
case '"' : return string_lit();
 
default : if (is_id_start(s.peek())) return identifier();
if (is_digit(s.peek())) return integer_lit();
return error("Unrecognized character '", s.peek(), "'");
 
case '\0' : return make_token(TokenName::END_OF_INPUT);
}
}
 
 
private:
Scanner s;
Scanner pre_state;
static const map<string, TokenName> keywords;
 
 
template <class... Args>
Token error (Args&&... ostream_args)
{
string code {pre_state.pos, (string::size_type) s.column - pre_state.column};
 
ostringstream msg;
(msg << ... << forward<Args>(ostream_args)) << '\n'
<< string(28, ' ') << "(" << s.line << ", " << s.column << "): " << code;
 
if (s.peek() != '\0') s.advance();
 
return make_token(TokenName::ERROR, msg.str());
}
 
 
inline Token make_token (TokenName name, TokenVal value = 0)
{
return {name, value, pre_state.line, pre_state.column};
}
 
 
Token simply (TokenName name)
{
s.advance();
return make_token(name);
}
 
 
Token expect (char expected, TokenName name)
{
if (s.next() == expected) return simply(name);
else return error("Unrecognized character '", s.peek(), "'");
}
 
 
Token follow (char expected, TokenName ifyes, TokenName ifno)
{
if (s.next() == expected) return simply(ifyes);
else return make_token(ifno);
}
 
 
Token divide_or_comment ()
{
if (s.next() != '*') return make_token(TokenName::OP_DIVIDE);
 
while (s.next() != '\0')
{
if (s.peek() == '*' && s.next() == '/')
{
s.advance();
return next_token();
}
}
 
return error("End-of-file in comment. Closing comment characters not found.");
}
 
 
Token char_lit ()
{
int n = s.next();
 
if (n == '\'') return error("Empty character constant");
 
if (n == '\\') switch (s.next())
{
case 'n' : n = '\n'; break;
case '\\' : n = '\\'; break;
default : return error("Unknown escape sequence \\", s.peek());
}
 
if (s.next() != '\'') return error("Multi-character constant");
 
s.advance();
return make_token(TokenName::INTEGER, n);
}
 
 
Token string_lit ()
{
string text = "";
 
while (s.next() != '"')
switch (s.peek())
{
case '\\' : switch (s.next())
{
case 'n' : text += '\n'; continue;
case '\\' : text += '\\'; continue;
default : return error("Unknown escape sequence \\", s.peek());
}
 
case '\n' : return error("End-of-line while scanning string literal."
" Closing string character not found before end-of-line.");
 
case '\0' : return error("End-of-file while scanning string literal."
" Closing string character not found.");
 
default : text += s.peek();
}
 
s.advance();
return make_token(TokenName::STRING, text);
}
 
 
static inline bool is_id_start (char c) { return isalpha(static_cast<unsigned char>(c)) || c == '_'; }
static inline bool is_id_end (char c) { return isalnum(static_cast<unsigned char>(c)) || c == '_'; }
static inline bool is_digit (char c) { return isdigit(static_cast<unsigned char>(c)); }
 
 
Token identifier ()
{
string text (1, s.peek());
 
while (is_id_end(s.next())) text += s.peek();
 
auto i = keywords.find(text);
if (i != keywords.end()) return make_token(i->second);
 
return make_token(TokenName::IDENTIFIER, text);
}
 
 
Token integer_lit ()
{
while (is_digit(s.next()));
 
if (is_id_start(s.peek()))
return error("Invalid number. Starts like a number, but ends in non-numeric characters.");
 
int n;
 
auto r = from_chars(pre_state.pos, s.pos, n);
if (r.ec == errc::result_out_of_range) return error("Number exceeds maximum value");
 
return make_token(TokenName::INTEGER, n);
}
}; // class Lexer
 
 
const map<string, TokenName> Lexer::keywords =
{
{"else", TokenName::KEYWORD_ELSE},
{"if", TokenName::KEYWORD_IF},
{"print", TokenName::KEYWORD_PRINT},
{"putc", TokenName::KEYWORD_PUTC},
{"while", TokenName::KEYWORD_WHILE}
};
 
 
int main (int argc, char* argv[])
{
string in = (argc > 1) ? argv[1] : "stdin";
string out = (argc > 2) ? argv[2] : "stdout";
 
with_IO(in, out, [](string input)
{
Lexer lexer {input.data()};
 
string s = "Location Token name Value\n"
"--------------------------------------\n";
 
while (lexer.has_more()) s += to_string(lexer.next_token());
return s;
});
}
</syntaxhighlight>
 
{{out|case=test case 3}}
<pre>
Location Token name Value
--------------------------------------
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input
</pre>
 
=={{header|COBOL}}==
Using GnuCOBOL 2. By Steve Williams (with one change to get around a Rosetta Code code highlighter problem).
 
<syntaxhighlight lang="cobol"> >>SOURCE FORMAT IS FREE
*> this code is dedicated to the public domain
*> (GnuCOBOL) 2.3-dev.0
identification division.
program-id. lexer.
environment division.
configuration section.
repository. function all intrinsic.
input-output section.
file-control.
select input-file assign using input-name
status input-status
organization line sequential.
data division.
 
file section.
fd input-file.
01 input-record pic x(98).
 
working-storage section.
01 input-name pic x(32).
01 input-status pic xx.
01 input-length pic 99.
 
01 output-name pic x(32) value spaces.
01 output-status pic xx.
01 output-record pic x(64).
 
01 line-no pic 999 value 0.
01 col-no pic 99.
01 col-no-max pic 99.
01 col-increment pic 9 value 1.
01 start-col pic 99.
01 outx pic 99.
01 out-lim pic 99 value 48.
 
01 output-line value spaces.
03 out-line pic zzzz9.
03 out-column pic zzzzzz9.
03 message-area.
05 filler pic xxx.
05 token pic x(16).
05 out-value pic x(48).
05 out-integer redefines out-value pic zzzzz9.
05 out-integer1 redefines out-value pic zzzzzz9. *> to match the python lexer
 
01 error-record.
03 error-line pic zzzz9 value 0.
03 error-col pic zzzzzz9 value 0.
03 error-message pic x(68) value spaces.
 
01 scan-state pic x(16) value spaces.
01 current-character pic x.
01 previous-character pic x.
 
procedure division chaining input-name.
start-lexer.
if input-name <> spaces
open input input-file
if input-status = '35'
string 'in lexer ' trim(input-name) ' not found' into error-message
perform report-error
end-if
end-if
perform read-input-file
perform until input-status <> '00'
add 1 to line-no
move line-no to out-line
move length(trim(input-record,trailing)) to col-no-max
move 1 to col-no
move space to previous-character
perform until col-no > col-no-max
move col-no to out-column
move input-record(col-no:1) to current-character
evaluate scan-state
 
when 'identifier'
if current-character >= 'A' and <= 'Z'
or (current-character >= 'a' and <= 'z')
or (current-character >= '0' and <= '9')
or current-character = '_'
perform increment-outx
move current-character to out-value(outx:1)
if col-no = col-no-max
perform process-identifier
end-if
else
perform process-identifier
if current-character <> space
move 0 to col-increment
end-if
end-if
 
when 'integer'
evaluate true
when current-character >= '0' and <= '9'
perform increment-outx
move current-character to out-value(outx:1)
if col-no = col-no-max
move numval(out-value) to out-integer
move 'Integer' to token
end-if
when current-character >= 'A' and <= 'Z'
when current-character >= 'a' and <= 'z'
move 'in lexer invalid integer' to error-message
perform report-error
when other
if outx > 5
move numval(out-value) to out-integer1 *> to match the python lexer
else
move numval(out-value) to out-integer
end-if
move 'Integer' to token
if current-character <> space
move 0 to col-increment
end-if
end-evaluate
when 'comment'
if previous-character = '*' and current-character = '/'
move 'comment' to token
end-if
 
when 'quote'
evaluate current-character also outx
when '"' also 0
string 'in lexer empty string' into error-message
perform report-error
when '"' also any
perform increment-outx
move current-character to out-value(outx:1)
move 'String' to token
when other
if col-no = col-no-max
string 'in lexer missing close quote' into error-message
perform report-error
else
perform increment-outx
move current-character to out-value(outx:1)
end-if
end-evaluate
 
when 'character'
evaluate current-character also outx
when "'" also 0
string 'in lexer empty character constant' into error-message
perform report-error
when "'" also 1
subtract 1 from ord(out-value(1:1)) giving out-integer
move 'Integer' to token
when "'" also 2
evaluate true
when out-value(1:2) = '\n'
move 10 to out-integer
when out-value(1:2) = '\\'
subtract 1 from ord('\') giving out-integer *> ' (workaround a Rosetta Code highlighter problem)
when other
string 'in lexer unknown escape sequence ' out-value(1:2)
into error-message
perform report-error
end-evaluate
move 'Integer' to token
when "'" also any
string 'in lexer multicharacter constant' into error-message
perform report-error
when other
if col-no = col-no-max
string 'in lexer missing close quote' into error-message
perform report-error
end-if
perform increment-outx
move current-character to out-value(outx:1)
end-evaluate
 
when 'and'
evaluate previous-character also current-character
when '&' also '&'
move 'Op_and' to token
when other
string 'in lexer AND error' into error-message
perform report-error
end-evaluate
 
when 'or'
evaluate previous-character also current-character
when '|' also '|'
move 'Op_or' to token
when other
string 'in lexer OR error' into error-message
perform report-error
end-evaluate
 
when 'ambiguous'
evaluate previous-character also current-character
when '/' also '*'
move 'comment' to scan-state
subtract 1 from col-no giving start-col
when '/' also any
move 'Op_divide' to token
move 0 to col-increment
 
when '=' also '='
move 'Op_equal' to token
when '=' also any
move 'Op_assign' to token
move 0 to col-increment
 
when '<' also '='
move 'Op_lessequal' to token
when '<' also any
move 'Op_less' to token
move 0 to col-increment
 
when '>' also '='
move 'Op_greaterequal' to token
                        when '>' also any
move 'Op_greater' to token
move 0 to col-increment
 
when '!' also '='
move 'Op_notequal' to token
when '!' also any
move 'Op_not' to token
move 0 to col-increment
 
when other
display input-record
string 'in lexer ' trim(scan-state)
' unknown character "' current-character '"'
' with previous character "' previous-character '"'
into error-message
perform report-error
end-evaluate
 
when other
move col-no to start-col
evaluate current-character
when space
continue
when >= 'A' and <= 'Z'
when >= 'a' and <= 'z'
move 'identifier' to scan-state
move 1 to outx
move current-character to out-value
when >= '0' and <= '9'
move 'integer' to scan-state
move 1 to outx
move current-character to out-value
when '&'
move 'and' to scan-state
when '|'
move 'or' to scan-state
when '"'
move 'quote' to scan-state
move 1 to outx
move current-character to out-value
when "'"
move 'character' to scan-state
move 0 to outx
when '{'
move 'LeftBrace' to token
when '}'
move 'RightBrace' to token
when '('
move 'LeftParen' to token
when ')'
move 'RightParen' to token
when '+'
move 'Op_add' to token
when '-'
move 'Op_subtract' to token
when '*'
move 'Op_multiply' to token
when '%'
move 'Op_mod' to token
when ';'
move 'Semicolon' to token
when ','
move 'Comma' to token
when '/'
when '<'
when '>'
                        when '='
when '!'
move 'ambiguous' to scan-state
when other
string 'in lexer unknown character "' current-character '"'
into error-message
perform report-error
end-evaluate
end-evaluate
 
if token <> spaces
perform process-token
end-if
 
move current-character to previous-character
add col-increment to col-no
move 1 to col-increment
end-perform
if scan-state = 'ambiguous'
evaluate previous-character
when '/'
move 'Op_divide' to token
perform process-token
 
when '='
move 'Op_assign' to token
perform process-token
 
when '<'
move 'Op_less' to token
perform process-token
 
when '>'
move 'Op_greater' to token
perform process-token
 
when '!'
move 'Op_not' to token
perform process-token
 
when other
                    string 'in lexer unresolved ambiguous "' previous-character '" at end of line'
into error-message
perform report-error
end-evaluate
end-if
perform read-input-file
end-perform
 
evaluate true
when input-status <> '10'
string 'in lexer ' trim(input-name) ' invalid input status ' input-status
into error-message
perform report-error
when scan-state = 'comment'
string 'in lexer unclosed comment at end of input' into error-message
perform report-error
end-evaluate
move 'End_of_input' to token
move 1 to out-column
move 1 to start-col
add 1 to line-no
perform process-token
 
close input-file
stop run
.
process-identifier.
evaluate true
when out-value = 'print'
move 'Keyword_print' to token
move spaces to out-value
when out-value = 'while'
move 'Keyword_while' to token
move spaces to out-value
when out-value = 'if'
move 'Keyword_if' to token
move spaces to out-value
when out-value = 'else'
move 'Keyword_else' to token
move spaces to out-value
when out-value = 'putc'
move 'Keyword_putc' to token
move spaces to out-value
when other
move 'Identifier' to token
end-evaluate
.
increment-outx.
if outx >= out-lim
string 'in lexer token value length exceeds ' out-lim into error-message
perform report-error
end-if
add 1 to outx
.
process-token.
if token <> 'comment'
move start-col to out-column
move line-no to out-line
display output-line
end-if
move 0 to start-col
move spaces to scan-state message-area
.
report-error.
move line-no to error-line
move start-col to error-col
display error-record
close input-file
stop run with error status -1
.
read-input-file.
if input-name = spaces
move '00' to input-status
accept input-record on exception move '10' to input-status end-accept
else
read input-file
end-if
.
end program lexer.</syntaxhighlight>
 
{{out|case=test case 3}}
<pre>prompt$ ./lexer <testcase3
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|Common Lisp}}==
Lisp has a built-in reader, and you can customize it by modifying its readtable (a minimal sketch of the idea follows). I'm also using Gray streams, an almost-standard feature of Common Lisp, for counting lines and columns.
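
As a small illustration of that readtable idea (a sketch only, separate from the full solution below; the helper <code>demo-read-op-and</code> and the keyword <code>:op-and</code> are stand-ins for the real token objects), binding a single reader macro lets the standard <code>read</code> loop do the token dispatch:

<syntaxhighlight lang="lisp">;; Sketch: bind '&' to a reader macro in a fresh readtable, so that
;; READ applied to "&&" yields a token-like value instead of a symbol.
(defun demo-read-op-and (text)
  (let ((*readtable* (copy-readtable nil)))
    (set-macro-character
     #\& (lambda (stream char)
           (declare (ignore char))
           (if (eql (peek-char nil stream nil nil) #\&)
               (progn (read-char stream) :op-and)   ; consume the second '&'
               (error "Unrecognized character '&'"))))
    (with-input-from-string (s text)
      (read s))))

;; (demo-read-op-and "&&")  =>  :OP-AND</syntaxhighlight>

The solution below does this kind of dispatch for every token class, and wraps the input in a Gray stream so each handler can record the line and column where its token starts.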
 
<syntaxhighlight lang="lisp">(defpackage #:lexical-analyzer
(:use #:cl #:sb-gray)
(:export #:main))
 
(in-package #:lexical-analyzer)
 
(defconstant +lex-symbols-package+ (or (find-package :lex-symbols)
(make-package :lex-symbols)))
 
(defclass counting-character-input-stream (fundamental-character-input-stream)
((stream :type stream :initarg :stream :reader stream-of)
(line :type fixnum :initform 1 :accessor line-of)
(column :type fixnum :initform 0 :accessor column-of)
(prev-column :type (or null fixnum) :initform nil :accessor prev-column-of))
(:documentation "Character input stream that counts lines and columns."))
 
(defmethod stream-read-char ((stream counting-character-input-stream))
(let ((ch (read-char (stream-of stream) nil :eof)))
(case ch
(#\Newline
(incf (line-of stream))
(setf (prev-column-of stream) (column-of stream)
(column-of stream) 0))
(t
(incf (column-of stream))))
ch))
 
(defmethod stream-unread-char ((stream counting-character-input-stream) char)
(unread-char char (stream-of stream))
(case char
(#\Newline
(decf (line-of stream))
(setf (column-of stream) (prev-column-of stream)))
(t
(decf (column-of stream)))))
 
(defstruct token
(name nil :type symbol)
(value nil :type t)
(line nil :type fixnum)
(column nil :type fixnum))
 
(defun lexer-error (format-control &rest args)
(apply #'error format-control args))
 
(defun handle-divide-or-comment (stream char)
(declare (ignore char))
(case (peek-char nil stream t nil t)
(#\* (loop with may-end = nil
initially (read-char stream t nil t)
for ch = (read-char stream t nil t)
until (and may-end (char= ch #\/))
do (setf may-end (char= ch #\*))
finally (return (read stream t nil t))))
(t (make-token :name :op-divide :line (line-of stream) :column (column-of stream)))))
 
(defun make-constant-handler (token-name)
(lambda (stream char)
(declare (ignore char))
(make-token :name token-name :line (line-of stream) :column (column-of stream))))
 
(defun make-this-or-that-handler (expect then &optional else)
(lambda (stream char)
(declare (ignore char))
(let ((line (line-of stream))
(column (column-of stream))
(next (peek-char nil stream nil nil t)))
(cond ((and expect (char= next expect))
(read-char stream nil nil t)
(make-token :name then :line line :column column))
(else
(make-token :name else :line line :column column))
(t
(lexer-error "Unrecognized character '~A'" next))))))
 
(defun identifier? (symbol)
(and (symbolp symbol)
(not (keywordp symbol))
(let ((name (symbol-name symbol)))
(and (find (char name 0) "_abcdefghijklmnopqrstuvwxyz" :test #'char-equal)
(or (< (length name) 2)
(not (find-if-not (lambda (ch)
(find ch "_abcdefghijklmnopqrstuvwxyz0123456789"
:test #'char-equal))
name :start 1)))))))
 
(defun id->keyword (id line column)
(case id
(lex-symbols::|if| (make-token :name :keyword-if :line line :column column))
(lex-symbols::|else| (make-token :name :keyword-else :line line :column column))
(lex-symbols::|while| (make-token :name :keyword-while :line line :column column))
(lex-symbols::|print| (make-token :name :keyword-print :line line :column column))
(lex-symbols::|putc| (make-token :name :keyword-putc :line line :column column))
(t nil)))
 
(defun handle-identifier (stream char)
(let ((*readtable* (copy-readtable)))
(set-syntax-from-char char #\z)
(let ((line (line-of stream))
(column (column-of stream)))
(unread-char char stream)
(let ((obj (read stream t nil t)))
(if (identifier? obj)
(or (id->keyword obj line column)
(make-token :name :identifier :value obj :line line :column column))
(lexer-error "Invalid identifier name: ~A" obj))))))
 
(defun handle-integer (stream char)
(let ((*readtable* (copy-readtable)))
(set-syntax-from-char char #\z)
(let ((line (line-of stream))
(column (column-of stream)))
(unread-char char stream)
(let ((obj (read stream t nil t)))
(if (integerp obj)
(make-token :name :integer :value obj :line line :column column)
(lexer-error "Invalid integer: ~A" obj))))))
 
(defun handle-char-literal (stream char)
(declare (ignore char))
(let* ((line (line-of stream))
(column (column-of stream))
(ch (read-char stream t nil t))
(parsed (case ch
(#\' (lexer-error "Empty character constant"))
(#\Newline (lexer-error "New line in character literal"))
(#\\ (let ((next-ch (read-char stream t nil t)))
(case next-ch
(#\n #\Newline)
(#\\ #\\)
(t (lexer-error "Unknown escape sequence: \\~A" next-ch)))))
(t ch))))
(if (char= #\' (read-char stream t nil t))
(make-token :name :integer :value (char-code parsed) :line line :column column)
(lexer-error "Only one character is allowed in character literal"))))
 
(defun handle-string (stream char)
(declare (ignore char))
(loop with result = (make-array 0 :element-type 'character :adjustable t :fill-pointer t)
with line = (line-of stream)
with column = (column-of stream)
for ch = (read-char stream t nil t)
until (char= ch #\")
do (setf ch (case ch
(#\Newline (lexer-error "New line in string"))
(#\\ (let ((next-ch (read-char stream t nil t)))
(case next-ch
(#\n #\Newline)
(#\\ #\\)
(t (lexer-error "Unknown escape sequence: \\~A" next-ch)))))
(t ch)))
(vector-push-extend ch result)
finally (return (make-token :name :string :value result :line line :column column))))
 
(defun make-lexer-readtable ()
(let ((*readtable* (copy-readtable nil)))
(setf (readtable-case *readtable*) :preserve)
(set-syntax-from-char #\\ #\z)
(set-syntax-from-char #\# #\z)
(set-syntax-from-char #\` #\z)
 
;; operators
(set-macro-character #\* (make-constant-handler :op-multiply))
(set-macro-character #\/ #'handle-divide-or-comment)
(set-macro-character #\% (make-constant-handler :op-mod))
(set-macro-character #\+ (make-constant-handler :op-add))
(set-macro-character #\- (make-constant-handler :op-subtract))
(set-macro-character #\< (make-this-or-that-handler #\= :op-lessequal :op-less))
(set-macro-character #\> (make-this-or-that-handler #\= :op-greaterequal :op-greater))
(set-macro-character #\= (make-this-or-that-handler #\= :op-equal :op-assign))
(set-macro-character #\! (make-this-or-that-handler #\= :op-notequal :op-not))
(set-macro-character #\& (make-this-or-that-handler #\& :op-and))
(set-macro-character #\| (make-this-or-that-handler #\| :op-or))
 
;; symbols
(set-macro-character #\( (make-constant-handler :leftparen))
(set-macro-character #\) (make-constant-handler :rightparen))
(set-macro-character #\{ (make-constant-handler :leftbrace))
(set-macro-character #\} (make-constant-handler :rightbrace))
(set-macro-character #\; (make-constant-handler :semicolon))
(set-macro-character #\, (make-constant-handler :comma))
 
;; identifiers & keywords
(set-macro-character #\_ #'handle-identifier t)
(loop for ch across "abcdefghijklmnopqrstuvwxyz"
do (set-macro-character ch #'handle-identifier t))
(loop for ch across "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
do (set-macro-character ch #'handle-identifier t))
 
;; integers
(loop for ch across "0123456789"
do (set-macro-character ch #'handle-integer t))
(set-macro-character #\' #'handle-char-literal)
 
;; strings
(set-macro-character #\" #'handle-string)
 
*readtable*))
 
(defun lex (stream)
(loop with *readtable* = (make-lexer-readtable)
with *package* = +lex-symbols-package+
with eof = (gensym)
with counting-stream = (make-instance 'counting-character-input-stream :stream stream)
for token = (read counting-stream nil eof)
until (eq token eof)
do (format t "~5D ~5D ~15A~@[ ~S~]~%"
(token-line token) (token-column token) (token-name token) (token-value token))
finally (format t "~5D ~5D ~15A~%"
(line-of counting-stream) (column-of counting-stream) :end-of-input)
(close counting-stream)))
 
(defun main ()
(lex *standard-input*))</syntaxhighlight>
{{out|case=test case 3}}
<pre> 5 16 KEYWORD-PRINT
5 40 OP-SUBTRACT
6 16 KEYWORD-PUTC
6 40 OP-LESS
7 16 KEYWORD-IF
7 40 OP-GREATER
8 16 KEYWORD-ELSE
8 40 OP-LESSEQUAL
9 16 KEYWORD-WHILE
9 40 OP-GREATEREQUAL
10 16 LEFTBRACE
10 40 OP-EQUAL
11 16 RIGHTBRACE
11 40 OP-NOTEQUAL
12 16 LEFTPAREN
12 40 OP-AND
13 16 RIGHTPAREN
13 40 OP-OR
14 16 OP-SUBTRACT
14 40 SEMICOLON
15 16 OP-NOT
15 40 COMMA
16 16 OP-MULTIPLY
16 40 OP-ASSIGN
17 16 OP-DIVIDE
17 40 INTEGER 42
18 16 OP-MOD
18 40 STRING "String literal"
19 16 OP-ADD
19 40 IDENTIFIER variable_name
20 26 INTEGER 10
21 26 INTEGER 92
22 26 INTEGER 32
23 1 END-OF-INPUT </pre>
 
=={{header|Elixir}}==
{{works with|Elixir|1.13.3}}
{{trans|ATS}}
 
<syntaxhighlight lang="elixir">#!/bin/env elixir
# -*- elixir -*-
 
defmodule Lex do
 
def main args do
{inpf_name, outf_name, exit_status} =
case args do
[] -> {"-", "-", 0}
[name] -> {name, "-", 0}
[name1, name2] -> {name1, name2, 0}
[name1, name2 | _] -> {name1, name2, usage_error()}
end
 
{inpf, outf, exit_status} =
case {inpf_name, outf_name, exit_status} do
{"-", "-", 0} -> {:stdio, :stdio, 0}
{name1, "-", 0} ->
{inpf, exit_status} = open_file(name1, [:read])
{inpf, :stdio, exit_status}
{"-", name2, 0} ->
{outf, exit_status} = open_file(name2, [:write])
{:stdio, outf, exit_status}
{name1, name2, 0} ->
{inpf, exit_status} = open_file(name1, [:read])
if exit_status != 0 do
{inpf, name2, exit_status}
else
{outf, exit_status} = open_file(name2, [:write])
{inpf, outf, exit_status}
end
_ -> {inpf_name, outf_name, exit_status}
end
 
exit_status =
case exit_status do
0 -> main_program inpf, outf
_ -> exit_status
end
 
# Choose one.
System.halt exit_status # Fast exit.
#System.stop exit_status # Laborious cleanup.
end
 
def main_program inpf, outf do
inp = make_inp inpf
scan_text outf, inp
exit_status = 0
exit_status
end
 
def open_file name, rw do
case File.open name, rw do
{:ok, f} -> {f, 0}
_ ->
IO.write :stderr, "Cannot open "
IO.write :stderr, name
case rw do
[:read] -> IO.puts " for input"
[:write] -> IO.puts " for output"
end
{name, 1}
end
end
 
def scan_text outf, inp do
{toktup, inp} = get_next_token inp
print_token outf, toktup
case toktup do
{"End_of_input", _, _, _} -> :ok
_ -> scan_text outf, inp
end
end
 
def print_token outf, {tok, arg, line_no, column_no} do
IO.write outf, (String.pad_leading "#{line_no}", 5)
IO.write outf, " "
IO.write outf, (String.pad_leading "#{column_no}", 5)
IO.write outf, " "
IO.write outf, tok
case tok do
"Identifier" ->
IO.write outf, " "
IO.write outf, arg
"Integer" ->
IO.write outf, " "
IO.write outf, arg
"String" ->
IO.write outf, " "
IO.write outf, arg
_ -> :ok
end
IO.puts outf, ""
end
 
###-------------------------------------------------------------------
###
### The token dispatcher.
###
 
def get_next_token inp do
inp = skip_spaces_and_comments inp
{ch, inp} = get_ch inp
{chr, line_no, column_no} = ch
ln = line_no
cn = column_no
case chr do
:eof -> {{"End_of_input", "", ln, cn}, inp}
"," -> {{"Comma", ",", ln, cn}, inp}
";" -> {{"Semicolon", ";", ln, cn}, inp}
"(" -> {{"LeftParen", "(", ln, cn}, inp}
")" -> {{"RightParen", ")", ln, cn}, inp}
"{" -> {{"LeftBrace", "{", ln, cn}, inp}
"}" -> {{"RightBrace", "}", ln, cn}, inp}
"*" -> {{"Op_multiply", "*", ln, cn}, inp}
"/" -> {{"Op_divide", "/", ln, cn}, inp}
"%" -> {{"Op_mod", "%", ln, cn}, inp}
"+" -> {{"Op_add", "+", ln, cn}, inp}
"-" -> {{"Op_subtract", "-", ln, cn}, inp}
"<" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"=" -> {{"Op_lessequal", "<=", ln, cn}, inp}
_ -> {{"Op_less", "<", ln, cn}, (push_back ch1, inp)}
end
">" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"=" -> {{"Op_greaterequal", ">=", ln, cn}, inp}
_ -> {{"Op_greater", ">", ln, cn}, (push_back ch1, inp)}
end
"=" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"=" -> {{"Op_equal", "==", ln, cn}, inp}
_ -> {{"Op_assign", "=", ln, cn}, (push_back ch1, inp)}
end
"!" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"=" -> {{"Op_notequal", "!=", ln, cn}, inp}
_ -> {{"Op_not", "!", ln, cn}, (push_back ch1, inp)}
end
"&" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"&" -> {{"Op_and", "&&", ln, cn}, inp}
_ -> unexpected_character ln, cn, chr
end
"|" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"|" -> {{"Op_or", "||", ln, cn}, inp}
_ -> unexpected_character ln, cn, chr
end
"\"" ->
inp = push_back ch, inp
scan_string_literal inp
"'" ->
inp = push_back ch, inp
scan_character_literal inp
_ ->
cond do
String.match? chr, ~r/^[[:digit:]]$/u ->
inp = push_back ch, inp
scan_integer_literal inp
String.match? chr, ~r/^[[:alpha:]_]$/u ->
inp = push_back ch, inp
scan_identifier_or_reserved_word inp
true -> unexpected_character ln, cn, chr
end
end
end
 
###-------------------------------------------------------------------
###
### Skipping past spaces and /* ... */ comments.
###
### Comments are treated exactly like a bit of whitespace. They never
### make it to the dispatcher.
###
 
def skip_spaces_and_comments inp do
{ch, inp} = get_ch inp
{chr, line_no, column_no} = ch
cond do
chr == :eof -> push_back ch, inp
String.match? chr, ~r/^[[:space:]]$/u ->
skip_spaces_and_comments inp
chr == "/" ->
{ch1, inp} = get_ch inp
case ch1 do
{"*", _, _} ->
inp = scan_comment inp, line_no, column_no
skip_spaces_and_comments inp
_ -> push_back ch, (push_back ch1, inp)
end
true -> push_back ch, inp
end
end
 
def scan_comment inp, line_no, column_no do
{ch, inp} = get_ch inp
case ch do
{:eof, _, _} -> unterminated_comment line_no, column_no
{"*", _, _} ->
{ch1, inp} = get_ch inp
case ch1 do
{:eof, _, _} -> unterminated_comment line_no, column_no
{"/", _, _} -> inp
_ -> scan_comment inp, line_no, column_no
end
_ -> scan_comment inp, line_no, column_no
end
end
 
###-------------------------------------------------------------------
###
### Scanning of integer literals, identifiers, and reserved words.
###
### These three types of token are very similar to each other.
###
 
def scan_integer_literal inp do
# Scan an entire word, not just digits. This way we detect
# erroneous text such as "23skidoo".
{line_no, column_no, inp} = get_position inp
{word, inp} = scan_word inp
if String.match? word, (~r/^[[:digit:]]+$/u) do
{{"Integer", word, line_no, column_no}, inp}
else
invalid_integer_literal line_no, column_no, word
end
end
 
def scan_identifier_or_reserved_word inp do
# It is assumed that the first character is of the correct type,
# thanks to the dispatcher.
{line_no, column_no, inp} = get_position inp
{word, inp} = scan_word inp
tok =
case word do
"if" -> "Keyword_if"
"else" -> "Keyword_else"
"while" -> "Keyword_while"
"print" -> "Keyword_print"
"putc" -> "Keyword_putc"
_ -> "Identifier"
end
{{tok, word, line_no, column_no}, inp}
end
 
def scan_word inp, word\\"" do
{ch, inp} = get_ch inp
{chr, _, _} = ch
if String.match? chr, (~r/^[[:alnum:]_]$/u) do
scan_word inp, (word <> chr)
else
{word, (push_back ch, inp)}
end
end
 
def get_position inp do
{ch, inp} = get_ch inp
{_, line_no, column_no} = ch
inp = push_back ch, inp
{line_no, column_no, inp}
end
 
###-------------------------------------------------------------------
###
### Scanning of string literals.
###
### It is assumed that the first character is the opening quote, and
### that the closing quote is the same character.
###
 
def scan_string_literal inp do
{ch, inp} = get_ch inp
{quote_mark, line_no, column_no} = ch
{contents, inp} = scan_str_lit inp, ch
{{"String", quote_mark <> contents <> quote_mark,
line_no, column_no},
inp}
end
 
def scan_str_lit inp, ch, contents\\"" do
{quote_mark, line_no, column_no} = ch
{ch1, inp} = get_ch inp
{chr1, line_no1, column_no1} = ch1
if chr1 == quote_mark do
{contents, inp}
else
case chr1 do
:eof -> eoi_in_string_literal line_no, column_no
"\n" -> eoln_in_string_literal line_no, column_no
"\\" ->
{ch2, inp} = get_ch inp
{chr2, _, _} = ch2
case chr2 do
"n" -> scan_str_lit inp, ch, (contents <> "\\n")
"\\" -> scan_str_lit inp, ch, (contents <> "\\\\")
_ -> unsupported_escape line_no1, column_no1, chr2
end
_ -> scan_str_lit inp, ch, (contents <> chr1)
end
end
end
 
###-------------------------------------------------------------------
###
### Scanning of character literals.
###
### It is assumed that the first character is the opening quote, and
### that the closing quote is the same character.
###
### The tedious part of scanning a character literal is distinguishing
### between the kinds of lexical error. (One might wish to modify the
### code to detect, as a distinct kind of error, end of line within a
### character literal.)
###
 
def scan_character_literal inp do
{ch, inp} = get_ch inp
{_, line_no, column_no} = ch
{ch1, inp} = get_ch inp
{chr1, line_no1, column_no1} = ch1
{intval, inp} =
case chr1 do
:eof -> unterminated_character_literal line_no, column_no
"\\" ->
{ch2, inp} = get_ch inp
{chr2, _, _} = ch2
case chr2 do
:eof -> unterminated_character_literal line_no, column_no
"n" -> {(:binary.first "\n"), inp}
"\\" -> {(:binary.first "\\"), inp}
_ -> unsupported_escape line_no1, column_no1, chr2
end
_ -> {(:binary.first chr1), inp}
end
inp = check_character_literal_end inp, ch
{{"Integer", "#{intval}", line_no, column_no}, inp}
end
 
def check_character_literal_end inp, ch do
{chr, _, _} = ch
{{chr1, _, _}, inp} = get_ch inp
if chr1 == chr do
inp
else
# Lexical error.
find_char_lit_end inp, ch
end
end
 
def find_char_lit_end inp, ch do
{chr, line_no, column_no} = ch
{{chr1, _, _}, inp} = get_ch inp
if chr1 == chr do
multicharacter_literal line_no, column_no
else
case chr1 do
:eof -> unterminated_character_literal line_no, column_no
_ -> find_char_lit_end inp, ch
end
end
end
 
###-------------------------------------------------------------------
###
### Character-at-a-time input, with unrestricted pushback, and with
### line and column numbering.
###
 
def make_inp inpf do
{inpf, [], 1, 1}
end
 
def get_ch {inpf, pushback, line_no, column_no} do
case pushback do
[head | tail] ->
{head, {inpf, tail, line_no, column_no}}
[] ->
case IO.read(inpf, 1) do
:eof ->
{{:eof, line_no, column_no},
{inpf, pushback, line_no, column_no}}
{:error, _} ->
{{:eof, line_no, column_no},
{inpf, pushback, line_no, column_no}}
chr ->
case chr do
"\n" ->
{{chr, line_no, column_no},
{inpf, pushback, line_no + 1, 1}}
_ ->
{{chr, line_no, column_no},
{inpf, pushback, line_no, column_no + 1}}
end
end
end
end
 
def push_back ch, {inpf, pushback, line_no, column_no} do
{inpf, [ch | pushback], line_no, column_no}
end
 
###-------------------------------------------------------------------
###
### Lexical and usage errors.
###
 
def unterminated_comment line_no, column_no do
raise "#{scriptname()}: unterminated comment at #{line_no}:#{column_no}"
end
 
def invalid_integer_literal line_no, column_no, word do
raise "#{scriptname()}: invalid integer literal #{word} at #{line_no}:#{column_no}"
end
 
def unsupported_escape line_no, column_no, chr do
raise "#{scriptname()}: unsupported escape \\#{chr} at #{line_no}:#{column_no}"
end
 
def eoi_in_string_literal line_no, column_no do
raise "#{scriptname()}: end of input in string literal starting at #{line_no}:#{column_no}"
end
 
def eoln_in_string_literal line_no, column_no do
raise "#{scriptname()}: end of line in string literal starting at #{line_no}:#{column_no}"
end
 
def multicharacter_literal line_no, column_no do
raise "#{scriptname()}: unsupported multicharacter literal at #{line_no}:#{column_no}"
end
 
def unterminated_character_literal line_no, column_no do
raise "#{scriptname()}: unterminated character literal starting at #{line_no}:#{column_no}"
end
 
def unexpected_character line_no, column_no, chr do
raise "#{scriptname()}: unexpected character '#{chr}' at #{line_no}:#{column_no}"
end
 
def usage_error() do
IO.puts "Usage: #{scriptname()} [INPUTFILE [OUTPUTFILE]]"
IO.puts "If either of INPUTFILE or OUTPUTFILE is not present or is \"-\","
IO.puts "standard input or standard output is used, respectively."
exit_status = 2
exit_status
end
 
def scriptname() do
Path.basename(__ENV__.file)
end
 
#---------------------------------------------------------------------
 
end ## module Lex
 
Lex.main(System.argv)</syntaxhighlight>
 
{{out}}
<pre>$ ./lex testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
 
 
=={{header|Emacs Lisp}}==
{{works with|Emacs|GNU 27.2}}
{{trans|ATS}}
 
 
<syntaxhighlight lang="lisp">#!/usr/bin/emacs --script
;;
;; The Rosetta Code lexical analyzer in GNU Emacs Lisp.
;;
;; Migrated from the ATS. However, Emacs Lisp is not friendly to the
;; functional style of the ATS implementation; therefore the
;; differences are vast.
;;
;; (A Scheme migration could easily, on the other hand, have been
;; almost exact. It is interesting to contrast Lisp dialects and see
;; how huge the differences are.)
;;
;; The script currently takes input only from standard input and
;; writes the token stream only to standard output.
;;
 
(require 'cl-lib)
 
;;; The type of a character, consisting of its code point and where it
;;; occurred in the text.
(cl-defstruct (ch_t (:constructor make-ch (ichar line-no column-no)))
ichar line-no column-no)
 
(defun ch-ichar (ch)
(ch_t-ichar ch))
 
(defun ch-line-no (ch)
(ch_t-line-no ch))
 
(defun ch-column-no (ch)
(ch_t-column-no ch))
 
;;; The type of an "inputter", consisting of an open file for the
;;; text, a pushback buffer (which is an indefinitely deep stack of
;;; ch_t), an input buffer for the current line, and a position in the
;;; text.
(cl-defstruct inp_t file pushback line line-no column-no)
 
(defun make-inp (file)
"Initialize a new inp_t."
(make-inp_t :file file
:pushback '()
:line ""
:line-no 0
:column-no 0))
 
(defvar inp (make-inp t)
"A global inp_t.")
 
(defun get-ch ()
"Get a ch_t, either from the pushback buffer or from the input."
(pcase (inp_t-pushback inp)
(`(,ch . ,tail)
;; Emacs Lisp has only single value return, so the results come
;; back as a list rather than multiple values.
(setq inp (make-inp_t :file (inp_t-file inp)
:pushback tail
:line (inp_t-line inp)
:line-no (inp_t-line-no inp)
:column-no (inp_t-column-no inp)))
ch)
('()
(let ((line (inp_t-line inp))
(line-no (inp_t-line-no inp))
(column-no (inp_t-column-no inp)))
(when (string= line "")
;; Refill the buffer.
(let ((text
(condition-case nil (read-string "")
nil (error 'eoi))))
(if (eq text 'eoi)
(setq line 'eoi)
(setq line (format "%s%c" text ?\n)))
(setq line-no (1+ line-no))
(setq column-no 1)))
(if (eq line 'eoi)
(progn
(setq inp (make-inp_t :file (inp_t-file inp)
:pushback (inp_t-pushback inp)
:line line
:line-no line-no
:column-no column-no))
(make-ch 'eoi line-no column-no))
(let ((c (elt line 0))
(line (substring line 1)))
(setq inp (make-inp_t :file (inp_t-file inp)
:pushback (inp_t-pushback inp)
:line line
:line-no line-no
:column-no (1+ column-no)))
(make-ch c line-no column-no)))))))
 
(defun get-new-line (file)
;; Currently "file" is ignored and the input must be from stdin.
(read-from-minibuffer "" :default 'eoi))
 
(defun push-back (ch)
"Push back a ch_t."
(setq inp (make-inp_t :file (inp_t-file inp)
:pushback (cons ch (inp_t-pushback inp))
:line (inp_t-line inp)
:line-no (inp_t-line-no inp)
:column-no (inp_t-column-no inp))))
 
(defun get-position ()
"Return the line-no and column-no of the next ch_t to be
returned by get-ch, assuming there are no more pushbacks
beforehand."
(let* ((ch (get-ch))
(line-no (ch-line-no ch))
(column-no (ch-column-no ch)))
(push-back ch)
(list line-no column-no)))
 
(defun scan-text (outf)
"The main loop."
(cl-loop for toktup = (get-next-token)
do (print-token outf toktup)
until (string= (elt toktup 0) "End_of_input")))
 
(defun print-token (outf toktup)
"Print a token, along with its position and possibly an
argument."
;; Currently outf is ignored, and the output goes to stdout.
(pcase toktup
(`(,tok ,arg ,line-no ,column-no)
(princ (format "%5d %5d %s" line-no column-no tok))
(pcase tok
("Identifier" (princ (format " %s\n" arg)))
("Integer" (princ (format " %s\n" arg)))
("String" (princ (format " %s\n" arg)))
(_ (princ "\n"))))))
 
(defun get-next-token ()
"The token dispatcher. Returns the next token, as a list along
with its argument and text position."
(skip-spaces-and-comments)
(let* ((ch (get-ch))
(ln (ch-line-no ch))
(cn (ch-column-no ch)))
(pcase (ch-ichar ch)
('eoi (list "End_of_input" "" ln cn))
(?, (list "Comma" "," ln cn))
(?\N{SEMICOLON} (list "Semicolon" ";" ln cn))
(?\N{LEFT PARENTHESIS} (list "LeftParen" "(" ln cn))
(?\N{RIGHT PARENTHESIS} (list "RightParen" ")" ln cn))
(?{ (list "LeftBrace" "{" ln cn))
(?} (list "RightBrace" "}" ln cn))
(?* (list "Op_multiply" "*" ln cn))
(?/ (list "Op_divide" "/" ln cn))
(?% (list "Op_mod" "%" ln cn))
(?+ (list "Op_add" "+" ln cn))
(?- (list "Op_subtract" "-" ln cn))
(?< (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?= (list "Op_lessequal" "<=" ln cn))
(_ (push-back ch1)
(list "Op_less" "<" ln cn)))))
(?> (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?= (list "Op_greaterequal" ">=" ln cn))
(_ (push-back ch1)
(list "Op_greater" ">" ln cn)))))
(?= (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?= (list "Op_equal" "==" ln cn))
(_ (push-back ch1)
(list "Op_assign" "=" ln cn)))))
(?! (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?= (list "Op_notequal" "!=" ln cn))
(_ (push-back ch1)
(list "Op_not" "!" ln cn)))))
(?& (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?& (list "Op_and" "&&" ln cn))
(_ (unexpected-character ln cn (ch-ichar ch))))))
(?| (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?| (list "Op_or" "||" ln cn))
(_ (unexpected-character ln cn (ch-ichar ch))))))
(?\N{QUOTATION MARK} (push-back ch) (scan-string-literal))
(?\N{APOSTROPHE} (push-back ch) (scan-character-literal))
((pred digitp) (push-back ch) (scan-integer-literal))
((pred identifier-start-p)
(progn
(push-back ch)
(scan-identifier-or-reserved-word)))
(c (unexpected-character ln cn c)))))
 
(defun skip-spaces-and-comments ()
"Skip spaces and comments. A comment is treated as equivalent
to a run of spaces."
(cl-loop for ch = (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?/ (let* ((ch2 (get-ch))
(line-no (ch-line-no ch1))
(column-no (ch-column-no ch1))
(position `(,line-no ,column-no)))
(pcase (ch-ichar ch2)
(?* (scan-comment position)
(get-ch))
(_ (push-back ch2)
ch1))))
(_ ch1)))
while (spacep (ch-ichar ch))
finally do (push-back ch)))
 
(defun scan-comment (position)
(cl-loop for ch = (get-ch)
for done = (comment-done-p ch position)
until done))
 
(defun comment-done-p (ch position)
(pcase (ch-ichar ch)
('eoi (apply 'unterminated-comment position))
(?* (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
('eoi (apply 'unterminated-comment position))
(?/ t)
(_ nil))))
(_ nil)))
 
(defun scan-integer-literal ()
"Scan an integer literal, on the assumption that a digit has
been seen and pushed back."
(let* ((position (get-position))
(lst (scan-word))
(s (list-to-string lst)))
(if (all-digits-p lst)
`("Integer" ,s . ,position)
(apply 'illegal-integer-literal `(,@position , s)))))
 
(defun scan-identifier-or-reserved-word ()
"Scan an identifier or reserved word, on the assumption that a
legal first character (for an identifier) has been seen and
pushed back."
(let* ((position (get-position))
(lst (scan-word))
(s (list-to-string lst))
(tok (pcase s
("else" "Keyword_else")
("if" "Keyword_if")
("while" "Keyword_while")
("print" "Keyword_print")
("putc" "Keyword_putc")
(_ "Identifier"))))
`(,tok ,s . ,position)))
 
(defun scan-word ()
(cl-loop for ch = (get-ch)
while (identifier-continuation-p (ch-ichar ch))
collect (ch-ichar ch)
finally do (push-back ch)))
 
(defun scan-string-literal ()
"Scan a string literal, on the assumption that a double quote
has been seen and pushed back."
(let* ((ch (get-ch))
(_ (cl-assert (= (ch-ichar ch) ?\N{QUOTATION MARK})))
(line-no (ch-line-no ch))
(column-no (ch-column-no ch))
(position `(,line-no ,column-no))
(lst (scan-str-lit position))
(lst `(?\N{QUOTATION MARK} ,@lst ?\N{QUOTATION MARK})))
`("String" ,(list-to-string lst) . ,position)))
 
(defun scan-str-lit (position)
(flatten
(cl-loop for ch = (get-ch)
until (= (ch-ichar ch) ?\N{QUOTATION MARK})
collect (process-str-lit-character
(ch-ichar ch) position))))
 
(defun process-str-lit-character (c position)
;; NOTE: This script might insert a newline before any eoi, so that
;; "end-of-input-in-string-literal" never actually occurs. It is a
;; peculiarity of the script's input mechanism.
(pcase c
('eoi (apply 'end-of-input-in-string-literal position))
(?\n (apply 'end-of-line-in-string-literal position))
(?\\ (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?n '(?\\ ?n))
(?\\ '(?\\ ?\\))
(c (unsupported-escape (ch-line-no ch1)
(ch-column-no ch1)
c)))))
(c c)))
 
(defun scan-character-literal ()
"Scan a character literal, on the assumption that an ASCII
single quote (that is, a Unicode APOSTROPHE) has been seen and
pushed back."
(let* ((toktup (scan-character-literal-without-checking-end))
(line-no (elt toktup 2))
(column-no (elt toktup 3))
(position (list line-no column-no)))
(check-char-lit-end position)
toktup))
 
(defun check-char-lit-end (position)
(let ((ch (get-ch)))
(unless (and (integerp (ch-ichar ch))
(= (ch-ichar ch) ?\N{APOSTROPHE}))
(push-back ch)
(loop-to-char-lit-end position))))
 
(defun loop-to-char-lit-end (position)
(cl-loop for ch = (get-ch)
until (or (eq (ch-ichar ch) 'eoi)
(= (ch-ichar ch) ?\N{APOSTROPHE}))
finally do (if (eq (ch-ichar ch) 'eoi)
(apply 'unterminated-character-literal
position)
(apply 'multicharacter-literal position))))
 
(defun scan-character-literal-without-checking-end ()
(let* ((ch (get-ch))
(_ (cl-assert (= (ch-ichar ch) ?\N{APOSTROPHE})))
(line-no (ch-line-no ch))
(column-no (ch-column-no ch))
(position (list line-no column-no))
(ch1 (get-ch)))
(pcase (ch-ichar ch1)
('eoi (apply 'unterminated-character-literal position))
(?\\ (let ((ch2 (get-ch)))
(pcase (ch-ichar ch2)
('eoi (apply 'unterminated-character-literal position))
(?n `("Integer" ,(format "%d" ?\n) . ,position))
(?\\ `("Integer" ,(format "%d" ?\\) . ,position))
(c (unsupported-escape (ch-line-no ch1)
(ch-column-no ch1)
c)))))
(c `("Integer" ,(format "%d" c) . ,position)))))
 
(defun spacep (c)
(and (integerp c) (or (= c ?\N{SPACE})
(and (<= 9 c) (<= c 13)))))
 
(defun digitp (c)
(and (integerp c) (<= ?0 c) (<= c ?9)))
 
(defun lowerp (c)
;; Warning: in EBCDIC, this kind of test for "alphabetic" is no
;; good. The letters are not contiguous.
(and (integerp c) (<= ?a c) (<= c ?z)))
 
(defun upperp (c)
;; Warning: in EBCDIC, this kind of test for "alphabetic" is no
;; good. The letters are not contiguous.
(and (integerp c) (<= ?A c) (<= c ?Z)))
 
(defun alphap (c)
(or (lowerp c) (upperp c)))
 
(defun identifier-start-p (c)
(and (integerp c) (or (alphap c) (= c ?_))))
 
(defun identifier-continuation-p (c)
(and (integerp c) (or (alphap c) (= c ?_) (digitp c))))
 
(defun all-digits-p (thing)
(cl-loop for c in thing
if (not (digitp c)) return nil
finally return t))
 
(defun list-to-string (lst)
"Convert a list of characters to a string."
(apply 'string lst))
 
(defun flatten (lst)
"Flatten nested lists. (The implementation is recursive and not
for very long lists.)"
(pcase lst
('() '())
(`(,head . ,tail)
(if (listp head)
(append (flatten head) (flatten tail))
(cons head (flatten tail))))))
 
(defun unexpected-character (line-no column-no c)
(error (format "unexpected character '%c' at %d:%d"
c line-no column-no)))
 
(defun unsupported-escape (line-no column-no c)
(error (format "unsupported escape \\%c at %d:%d"
c line-no column-no)))
 
(defun illegal-integer-literal (line-no column-no s)
(error (format "illegal integer literal \"%s\" at %d:%d"
s line-no column-no)))
 
(defun unterminated-character-literal (line-no column-no)
(error (format "unterminated character literal starting at %d:%d"
line-no column-no)))
 
(defun multicharacter-literal (line-no column-no)
(error (format
"unsupported multicharacter literal starting at %d:%d"
line-no column-no)))
 
(defun end-of-input-in-string-literal (line-no column-no)
(error (format "end of input in string literal starting at %d:%d"
line-no column-no)))
 
(defun end-of-line-in-string-literal (line-no column-no)
(error (format "end of line in string literal starting at %d:%d"
line-no column-no)))
 
(defun unterminated-comment (line-no column-no)
(error (format "unterminated comment starting at %d:%d"
line-no column-no)))
 
(defun main ()
(setq inp (make-inp t))
(scan-text t))
 
(main)</syntaxhighlight>
 
 
{{out}}
<pre>$ ./lex-in-el < compiler-tests/testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|Erlang}}==
{{works with|Erlang|24.3.3}}
{{trans|ATS}}
{{trans|Elixir}}
 
 
<syntaxhighlight lang="erlang">#!/bin/env escript
%%%-------------------------------------------------------------------
 
-record (inp_t, {inpf, pushback, line_no, column_no}).
 
main (Args) ->
main_program (Args).
 
main_program ([]) ->
scan_from_inpf_to_outf ("-", "-"),
halt (0);
main_program ([Inpf_filename]) ->
scan_from_inpf_to_outf (Inpf_filename, "-"),
halt (0);
main_program ([Inpf_filename, Outf_filename]) ->
scan_from_inpf_to_outf (Inpf_filename, Outf_filename),
halt (0);
main_program ([_, _ | _]) ->
ProgName = escript:script_name (),
io:put_chars (standard_error, "Usage: "),
io:put_chars (standard_error, ProgName),
io:put_chars (standard_error, " [INPUTFILE [OUTPUTFILE]]\n"),
halt (1).
 
scan_from_inpf_to_outf ("-", "-") ->
scan_input (standard_io, standard_io);
scan_from_inpf_to_outf (Inpf_filename, "-") ->
case file:open (Inpf_filename, [read]) of
{ok, Inpf} -> scan_input (Inpf, standard_io);
_ -> open_failure (Inpf_filename, "input")
end;
scan_from_inpf_to_outf ("-", Outf_filename) ->
case file:open (Outf_filename, [write]) of
{ok, Outf} -> scan_input (standard_io, Outf);
_ -> open_failure (Outf_filename, "output")
end;
scan_from_inpf_to_outf (Inpf_filename, Outf_filename) ->
case file:open(Inpf_filename, [read]) of
{ok, Inpf} ->
case file:open (Outf_filename, [write]) of
{ok, Outf} -> scan_input (Inpf, Outf);
_ -> open_failure (Outf_filename, "output")
end;
_ -> open_failure (Inpf_filename, "input")
end.
 
open_failure (Filename, ForWhat) ->
ProgName = escript:script_name (),
io:put_chars (standard_error, ProgName),
io:put_chars (standard_error, ": failed to open \""),
io:put_chars (standard_error, Filename),
io:put_chars (standard_error, "\" for "),
io:put_chars (standard_error, ForWhat),
io:put_chars (standard_error, "\n"),
halt (1).
 
scan_input (Inpf, Outf) ->
scan_text (Outf, make_inp (Inpf)).
 
scan_text (Outf, Inp) ->
{TokTup, Inp1} = get_next_token (Inp),
print_token (Outf, TokTup),
case TokTup of
{"End_of_input", _, _, _} -> ok;
_ -> scan_text (Outf, Inp1)
end.
 
print_token (Outf, {Tok, Arg, Line_no, Column_no}) ->
S_line_no = erlang:integer_to_list (Line_no),
S_column_no = erlang:integer_to_list (Column_no),
io:put_chars (Outf, string:pad (S_line_no, 5, leading)),
io:put_chars (Outf, " "),
io:put_chars (Outf, string:pad (S_column_no, 5, leading)),
io:put_chars (Outf, " "),
io:put_chars (Outf, Tok),
{Padding, Arg1} =
case Tok of
"Identifier" -> {" ", Arg};
"Integer" -> {" ", Arg};
"String" -> {" ", Arg};
_ -> {"", ""}
end,
io:put_chars (Outf, Padding),
io:put_chars (Outf, Arg1),
io:put_chars ("\n").
 
%%%-------------------------------------------------------------------
%%%
%%% The token dispatcher.
%%%
 
get_next_token (Inp) ->
Inp00 = skip_spaces_and_comments (Inp),
{Ch, Inp0} = get_ch (Inp00),
{Char, Line_no, Column_no} = Ch,
Ln = Line_no,
Cn = Column_no,
case Char of
eof -> {{"End_of_input", "", Ln, Cn}, Inp0};
"," -> {{"Comma", ",", Ln, Cn}, Inp0};
";" -> {{"Semicolon", ";", Ln, Cn}, Inp0};
"(" -> {{"LeftParen", "(", Ln, Cn}, Inp0};
")" -> {{"RightParen", ")", Ln, Cn}, Inp0};
"{" -> {{"LeftBrace", "{", Ln, Cn}, Inp0};
"}" -> {{"RightBrace", "}", Ln, Cn}, Inp0};
"*" -> {{"Op_multiply", "*", Ln, Cn}, Inp0};
"/" -> {{"Op_divide", "/", Ln, Cn}, Inp0};
"%" -> {{"Op_mod", "%", Ln, Cn}, Inp0};
"+" -> {{"Op_add", "+", Ln, Cn}, Inp0};
"-" -> {{"Op_subtract", "-", Ln, Cn}, Inp0};
"<" ->
{Ch1, Inp1} = get_ch (Inp0),
{Char1, _, _} = Ch1,
case Char1 of
"=" -> {{"Op_lessequal", "<=", Ln, Cn}, Inp1};
_ -> {{"Op_less", "<", Ln, Cn}, push_back (Ch1, Inp1)}
end;
">" ->
{Ch1, Inp1} = get_ch (Inp0),
{Char1, _, _} = Ch1,
case Char1 of
"=" -> {{"Op_greaterequal", ">=", Ln, Cn}, Inp1};
_ -> {{"Op_greater", ">", Ln, Cn}, push_back (Ch1, Inp1)}
end;
"=" ->
{Ch1, Inp1} = get_ch (Inp0),
{Char1, _, _} = Ch1,
case Char1 of
"=" -> {{"Op_equal", "==", Ln, Cn}, Inp1};
_ -> {{"Op_assign", "=", Ln, Cn}, push_back (Ch1, Inp1)}
end;
"!" ->
{Ch1, Inp1} = get_ch (Inp0),
{Char1, _, _} = Ch1,
case Char1 of
"=" -> {{"Op_notequal", "!=", Ln, Cn}, Inp1};
_ -> {{"Op_not", "!", Ln, Cn}, push_back (Ch1, Inp1)}
end;
"&" ->
{Ch1, Inp1} = get_ch (Inp0),
{Char1, _, _} = Ch1,
case Char1 of
"&" -> {{"Op_and", "&&", Ln, Cn}, Inp1};
_ -> unexpected_character (Ln, Cn, Char)
end;
"|" ->
{Ch1, Inp1} = get_ch (Inp0),
{Char1, _, _} = Ch1,
case Char1 of
"|" -> {{"Op_or", "||", Ln, Cn}, Inp1};
_ -> unexpected_character (Ln, Cn, Char)
end;
"\"" ->
Inp1 = push_back (Ch, Inp0),
scan_string_literal (Inp1);
"'" ->
Inp1 = push_back (Ch, Inp0),
scan_character_literal (Inp1);
_ ->
case is_digit (Char) of
true ->
Inp1 = push_back (Ch, Inp0),
scan_integer_literal (Inp1);
false ->
case is_alpha_or_underscore (Char) of
true ->
Inp1 = push_back (Ch, Inp0),
scan_identifier_or_reserved_word (Inp1);
false ->
unexpected_character (Ln, Cn, Char)
end
end
end.
 
%%%-------------------------------------------------------------------
%%%
%%% Skipping past spaces and /* ... */ comments.
%%%
%%% Comments are treated exactly like a bit of whitespace. They never
%%% make it to the dispatcher.
%%%
 
skip_spaces_and_comments (Inp) ->
{Ch, Inp0} = get_ch (Inp),
{Char, Line_no, Column_no} = Ch,
case classify_char (Char) of
eof -> push_back (Ch, Inp0);
space -> skip_spaces_and_comments (Inp0);
slash ->
{Ch1, Inp1} = get_ch (Inp0),
case Ch1 of
{"*", _, _} ->
Inp2 = scan_comment (Inp1, Line_no, Column_no),
skip_spaces_and_comments (Inp2);
_ -> push_back (Ch, (push_back (Ch1, Inp1)))
end;
other -> push_back (Ch, Inp0)
end.
 
classify_char (Char) ->
case Char of
eof -> eof;
"/" -> slash;
_ -> case is_space (Char) of
true -> space;
false -> other
end
end.
 
scan_comment (Inp, Line_no, Column_no) ->
{Ch0, Inp0} = get_ch (Inp),
case Ch0 of
{eof, _, _} -> unterminated_comment (Line_no, Column_no);
{"*", _, _} ->
{Ch1, Inp1} = get_ch (Inp0),
case Ch1 of
{eof, _, _} ->
unterminated_comment (Line_no, Column_no);
{"/", _, _} -> Inp1;
_ -> scan_comment (Inp1, Line_no, Column_no)
end;
_ -> scan_comment (Inp0, Line_no, Column_no)
end.
 
is_space (S) ->
case re:run (S, "^[[:space:]]+$") of
{match, _} -> true;
_ -> false
end.
 
%%%-------------------------------------------------------------------
%%%
%%% Scanning of integer literals, identifiers, and reserved words.
%%%
%%% These three types of token are very similar to each other.
%%%
 
scan_integer_literal (Inp) ->
%% Scan an entire word, not just digits. This way we detect
%% erroneous text such as "23skidoo".
{Line_no, Column_no, Inp1} = get_position (Inp),
{Word, Inp2} = scan_word (Inp1),
case is_digit (Word) of
true -> {{"Integer", Word, Line_no, Column_no}, Inp2};
false -> invalid_integer_literal (Line_no, Column_no, Word)
end.
 
scan_identifier_or_reserved_word (Inp) ->
%% It is assumed that the first character is of the correct type,
%% thanks to the dispatcher.
{Line_no, Column_no, Inp1} = get_position (Inp),
{Word, Inp2} = scan_word (Inp1),
Tok =
case Word of
"if" -> "Keyword_if";
"else" -> "Keyword_else";
"while" -> "Keyword_while";
"print" -> "Keyword_print";
"putc" -> "Keyword_putc";
_ -> "Identifier"
end,
{{Tok, Word, Line_no, Column_no}, Inp2}.
 
scan_word (Inp) ->
scan_word_loop (Inp, "").
 
scan_word_loop (Inp, Word0) ->
{Ch1, Inp1} = get_ch (Inp),
{Char1, _, _} = Ch1,
case is_alnum_or_underscore (Char1) of
true -> scan_word_loop (Inp1, Word0 ++ Char1);
false -> {Word0, push_back (Ch1, Inp1)}
end.
 
get_position (Inp) ->
{Ch1, Inp1} = get_ch (Inp),
{_, Line_no, Column_no} = Ch1,
Inp2 = push_back (Ch1, Inp1),
{Line_no, Column_no, Inp2}.
 
is_digit (S) ->
case re:run (S, "^[[:digit:]]+$") of
{match, _} -> true;
_ -> false
end.
 
is_alpha_or_underscore (S) ->
case re:run (S, "^[[:alpha:]_]+$") of
{match, _} -> true;
_ -> false
end.
 
is_alnum_or_underscore (S) ->
case re:run (S, "^[[:alnum:]_]+$") of
{match, _} -> true;
_ -> false
end.
 
%%%-------------------------------------------------------------------
%%%
%%% Scanning of string literals.
%%%
%%% It is assumed that the first character is the opening quote, and
%%% that the closing quote is the same character.
%%%
 
 
scan_string_literal (Inp) ->
{Ch1, Inp1} = get_ch (Inp),
{Quote_mark, Line_no, Column_no} = Ch1,
{Contents, Inp2} = scan_str_lit (Inp1, Ch1),
Toktup = {"String", Quote_mark ++ Contents ++ Quote_mark,
Line_no, Column_no},
{Toktup, Inp2}.
 
scan_str_lit (Inp, Ch) -> scan_str_lit_loop (Inp, Ch, "").
 
scan_str_lit_loop (Inp, Ch, Contents) ->
{Quote_mark, Line_no, Column_no} = Ch,
{Ch1, Inp1} = get_ch (Inp),
{Char1, Line_no1, Column_no1} = Ch1,
case Char1 of
Quote_mark -> {Contents, Inp1};
eof -> eoi_in_string_literal (Line_no, Column_no);
"\n" -> eoln_in_string_literal (Line_no, Column_no);
"\\" ->
{Ch2, Inp2} = get_ch (Inp1),
{Char2, _, _} = Ch2,
case Char2 of
"n" ->
scan_str_lit_loop (Inp2, Ch, Contents ++ "\\n");
"\\" ->
scan_str_lit_loop (Inp2, Ch, Contents ++ "\\\\");
_ ->
unsupported_escape (Line_no1, Column_no1, Char2)
end;
_ -> scan_str_lit_loop (Inp1, Ch, Contents ++ Char1)
end.
 
%%%-------------------------------------------------------------------
%%%
%%% Scanning of character literals.
%%%
%%% It is assumed that the first character is the opening quote, and
%%% that the closing quote is the same character.
%%%
%%% The tedious part of scanning a character literal is distinguishing
%%% between the kinds of lexical error. (One might wish to modify the
%%% code to detect, as a distinct kind of error, end of line within a
%%% character literal.)
%%%
 
scan_character_literal (Inp) ->
{Ch, Inp0} = get_ch (Inp),
{_, Line_no, Column_no} = Ch,
{Ch1, Inp1} = get_ch (Inp0),
{Char1, Line_no1, Column_no1} = Ch1,
{Intval, Inp3} =
case Char1 of
eof -> unterminated_character_literal (Line_no, Column_no);
"\\" ->
{Ch2, Inp2} = get_ch (Inp1),
{Char2, _, _} = Ch2,
case Char2 of
eof -> unterminated_character_literal (Line_no,
Column_no);
"n" -> {char_to_code ("\n"), Inp2};
"\\" -> {char_to_code ("\\"), Inp2};
_ -> unsupported_escape (Line_no1, Column_no1,
Char2)
end;
_ -> {char_to_code (Char1), Inp1}
end,
Inp4 = check_character_literal_end (Inp3, Ch),
{{"Integer", Intval, Line_no, Column_no}, Inp4}.
 
char_to_code (Char) ->
%% Hat tip to https://archive.ph/BxZRS
lists:flatmap (fun erlang:integer_to_list/1, Char).
 
check_character_literal_end (Inp, Ch) ->
{Char, _, _} = Ch,
{{Char1, _, _}, Inp1} = get_ch (Inp),
case Char1 of
Char -> Inp1;
_ -> find_char_lit_end (Inp1, Ch) % Handle a lexical error.
end.
 
find_char_lit_end (Inp, Ch) ->
%% There is a lexical error. Determine which kind it fits into.
{Char, Line_no, Column_no} = Ch,
{{Char1, _, _}, Inp1} = get_ch (Inp),
case Char1 of
Char -> multicharacter_literal (Line_no, Column_no);
eof -> unterminated_character_literal (Line_no, Column_no);
_ -> find_char_lit_end (Inp1, Ch)
end.
 
%%%-------------------------------------------------------------------
%%%
%%% Character-at-a-time input, with unrestricted pushback, and with
%%% line and column numbering.
%%%
 
make_inp (Inpf) ->
#inp_t{inpf = Inpf,
pushback = [],
line_no = 1,
column_no = 1}.
 
get_ch (Inp) ->
#inp_t{inpf = Inpf,
pushback = Pushback,
line_no = Line_no,
column_no = Column_no} = Inp,
case Pushback of
[Ch | Tail] ->
Inp1 = Inp#inp_t{pushback = Tail},
{Ch, Inp1};
[] ->
case io:get_chars (Inpf, "", 1) of
eof ->
Ch = {eof, Line_no, Column_no},
{Ch, Inp};
{error, _} ->
Ch = {eof, Line_no, Column_no},
{Ch, Inp};
Char ->
case Char of
"\n" ->
Ch = {Char, Line_no, Column_no},
Inp1 = Inp#inp_t{line_no = Line_no + 1,
column_no = 1},
{Ch, Inp1};
_ ->
Ch = {Char, Line_no, Column_no},
Inp1 =
Inp#inp_t{column_no = Column_no + 1},
{Ch, Inp1}
end
end
end.
 
push_back (Ch, Inp) ->
Inp#inp_t{pushback = [Ch | Inp#inp_t.pushback]}.
 
%%%-------------------------------------------------------------------
 
invalid_integer_literal (Line_no, Column_no, Word) ->
error_abort ("invalid integer literal \"" ++
Word ++ "\" at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
unsupported_escape (Line_no, Column_no, Char) ->
error_abort ("unsupported escape \\" ++
Char ++ " at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
unexpected_character (Line_no, Column_no, Char) ->
error_abort ("unexpected character '" ++
Char ++ "' at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
eoi_in_string_literal (Line_no, Column_no) ->
error_abort ("end of input in string literal starting at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
eoln_in_string_literal (Line_no, Column_no) ->
error_abort ("end of line in string literal starting at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
unterminated_character_literal (Line_no, Column_no) ->
error_abort ("unterminated character literal starting at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
multicharacter_literal (Line_no, Column_no) ->
error_abort ("unsupported multicharacter literal starting at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
unterminated_comment (Line_no, Column_no) ->
error_abort ("unterminated comment starting at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
error_abort (Message) ->
ProgName = escript:script_name (),
io:put_chars (standard_error, ProgName),
io:put_chars (standard_error, ": "),
io:put_chars (standard_error, Message),
io:put_chars (standard_error, "\n"),
halt (1).
 
%%%-------------------------------------------------------------------
%%% Instructions to GNU Emacs --
%%% local variables:
%%% mode: erlang
%%% erlang-indent-level: 3
%%% end:
%%%-------------------------------------------------------------------</syntaxhighlight>
 
 
{{out}}
<pre>$ ./lex-in-Erlang compiler-tests/testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|Euphoria}}==
Tested with Euphoria 4.05.
<langsyntaxhighlight lang="euphoria">include std/io.e
include std/io.e
include std/map.e
include std/types.e
-- ...
constant true = 1, false = 0, EOF = -1
 
enum tk_EOI, tk_Mul, tk_Div, tk_Mod, tk_Add, tk_Sub, tk_Negate, tk_Not, tk_Lss, tk_Leq,
tk_Gtr, tk_Geq, tk_Eq, tk_Neq, tk_Assign, tk_And, tk_Or, tk_If, tk_Else, tk_While,
tk_Print, tk_Putc, tk_Lparen, tk_Rparen, tk_Lbrace, tk_Rbrace, tk_Semi, tk_Comma,
tk_Ident, tk_Integer, tk_String
 
constant all_syms = { "EOI", "Print", "Putc", "IfEnd_of_input", "WhileOp_multiply", "LbraceOp_divide", "RbraceOp_mod", "LparenOp_add",
"RparenOp_subtract", "UminusOp_negate", "MulOp_not", "DivOp_less", "AddOp_lessequal", "Sub", "Lss", "Gtr", "Leq", "Neq", "AndOp_greater",
"SemiOp_greaterequal", "CommaOp_equal", "AssignOp_notequal", "IntegerOp_assign", "StringOp_and", "IdentOp_or"},
"Keyword_if", "Keyword_else", "Keyword_while", "Keyword_print", "Keyword_putc",
"LeftParen", "RightParen", "LeftBrace", "RightBrace", "Semicolon", "Comma",
"Identifier", "Integer", "String"}
 
integer input_file, the_ch = ' ', the_col = 0, the_line = 1
-- ...
end if
next_ch()
return {tk_Integer, err_line, err_col, n}
end function
 
-- ...
function div_or_cmt(integer err_line, integer err_col)
if next_ch() != '*' then
return {tk_Div, err_line, err_col}
end if
 
-- comment found
next_ch()
while true do
if the_ch = '*' then
if next_ch() = '/' then
next_ch()
return get_tok()
end if
elsif the_ch = EOF then
error("%d %d EOF in comment", {err_line, err_col})
else
next_ch()
end if
end while
-- ...
 
next_ch()
return {tk_String, err_line, err_col, text}
end function
 
-- ...
end if
n = to_integer(text)
return {tk_Integer, err_line, err_col, n}
end if
 
-- ...
end if
 
return {tk_Ident, err_line, err_col, text}
end function
 
-- ...
end if
 
if ifno = tk_EOI then
error("%d %d follow: unrecognized character: (%d)", {err_line, err_col, the_ch})
end if
-- ...
 
switch the_ch do
case EOF then return {tk_EOI, err_line, err_col}
case '/' then return div_or_cmt(err_line, err_col)
case '\'' then return char_lit(err_line, err_col)
 
case '<' then return follow('=', tk_Leq, tk_Lss, err_line, err_col)
case '>' then return follow('=', tk_Geq, tk_Gtr, err_line, err_col)
case '=' then return follow('=', tk_Eq, tk_Assign, err_line, err_col)
case '!' then return follow('=', tk_Neq, tk_Not, err_line, err_col)
case '&' then return follow('&', tk_And, tk_EOI, err_line, err_col)
case '|' then return follow('|', tk_Or, tk_EOI, err_line, err_col)
 
case '"' then return string_lit(the_ch, err_line, err_col)
case else
integer sym = symbols[the_ch]
if sym != tk_EOI then
next_ch()
return {sym, err_line, err_col}
-- ...
 
procedure init()
put(key_words, "ifelse", Ifktk_Else)
put(key_words, "printif", Printk tk_If)
put(key_words, "putcprint", Putctk_Print)
put(key_words, "whileputc", Whilek tk_Putc)
put(key_words, "while", tk_While)
 
symbols = repeat(tk_EOI, 256)
symbols['{'] = tk_Lbrace
symbols['}'] = tk_Rbrace
symbols['('] = tk_Lparen
symbols[')'] = tk_Rparen
symbols['+'] = tk_Add
symbols['-'] = tk_Sub
symbols['*'] = tk_Mul
symbols['%'] = tk_Mod
symbols[';'] = tk_Semi
symbols[','] = tk_Comma
end procedure
 
-- ...
loop do
t = get_tok()
printf(STDOUT, "line %5d col %5d %-8s", {t[2], t[3], all_syms[t[1]]})
switch t[1] do
case tk_Integer then printf(STDOUT, " %5d\n", {t[4]})
case tk_Ident then printf(STDOUT, " %s\n", {t[4]})
case tk_String then printf(STDOUT, " \"%s\"\n", {t[4]})
case else printf(STDOUT, "\n")
end switch
until t[1] = tk_EOI
end loop
end procedure
 
main(command_line())</syntaxhighlight>
 
{{out|case=test case 3}}
<b>
<pre>
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input
</pre>
</b>
=={{header|Flex}}==
Tested with Flex 2.5.4.
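A build-and-run sketch follows; the file names, the <tt>-lfl</tt> link flag (for the default <tt>yywrap</tt>), and reading the test case from standard input are assumptions rather than part of this entry:
<pre>$ flex lexer.l
$ cc -o lexer lex.yy.c -lfl
$ ./lexer < testcase3.t</pre>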
<syntaxhighlight lang="c">%{
#include <stdio.h>
#include <stdlib.h>
/* ... */
 
typedef enum {
tk_EOI, tk_Mul, tk_Div, tk_Mod, tk_Add, tk_Sub, tk_Negate, tk_Not, tk_Lss, tk_Leq,
tk_Gtr, tk_Geq, tk_Eq, tk_Neq, tk_Assign, tk_And, tk_Or, tk_If, tk_Else, tk_While,
tk_Print, tk_Putc, tk_Lparen, tk_Rparen, tk_Lbrace, tk_Rbrace, tk_Semi, tk_Comma,
tk_Ident, tk_Integer, tk_String
} TokenType;
 
/* ... */
TokenType sym;
} kwds[] = {
{"ifelse", Iftk_Else},
{"printif", Print tk_If},
{"putcprint", Putctk_Print},
{"whileputc", While tk_Putc},
{"while", tk_While},
}, *kwp;
 
return (kwp = bsearch(&ident, kwds, NELEMS(kwds), sizeof(kwds[0]), kwd_cmp)) == NULL ? tk_Ident : kwp->sym;
}
 
/* ... */
ident [a-zA-Z_][a-zA-Z_0-9]*
 
ascii_char [^\"\n]
escaped_char \\n|\\\"
number {digit}+
string \"[^"\n]*\"
string_char {ascii_char}|{escaped_char}
string \"({string_char})*\"
char_const \'{string_char}\'
 
%%
/* ... */
"/*" BEGIN COMMENT2;
 
"{" {return Lbracetk_Lbrace;}
"}" {return Rbracetk_Rbrace;}
"(" {return Lparentk_Lparen;}
")" {return Rparentk_Rparen;}
"*" {return Multk_Mul;}
"/" {return Divtk_Div;}
"+%" {return Addtk_Mod;}
"-+" {return Subtk_Add;}
"<-" {return Lsstk_Sub;}
"><" {return Gtrtk_Lss;}
"<=>" {return Leqtk_Gtr;}
"!<=" {return Neqtk_Leq;}
"&&>=" {return Andtk_Geq;}
";!=" {return Semitk_Neq;}
",!" {return Commatk_Not;}
"=&&" {return Assigntk_And;}
"||" {return tk_Or;}
";" {return tk_Semi;}
"," {return tk_Comma;}
"==" {return tk_Eq;}
"=" {return tk_Assign;}
{ident} {return get_ident_type(yytext);}
{string} {return tk_String;}
 
[ \t\n]+ ; /* ignore whitespace */
/* ... */
yyerror("Number exceeds maximum value");
 
return tk_Integer;
}
 
/* ... */
}
yynval = n;
return tk_Integer;
}
 
/* ... */
do {
tok = yylex();
printf("line %5d col %5d %.8s15s", yylloc.first_line, yylloc.first_col,
&"EOIEnd_of_input Op_multiply Print Op_divide Putc If Op_mod While LbraceOp_add Rbrace Lparen Rparen "
"UminusOp_subtract Mul Op_negate Div Op_not Add Sub Op_less Lss Gtr Leq Neq Op_lessequal "
"AndOp_greater SemiOp_greaterequal Op_equal Comma AssignOp_notequal Integer StringOp_assign Ident "[tok * 9]);
"Op_and Op_or Keyword_if Keyword_else Keyword_while "
"Keyword_print Keyword_putc LeftParen RightParen LeftBrace "
"RightBrace Semicolon Comma Identifier Integer "
"String "
[tok * 16]);
 
if (tok == tk_Integer) printf(" %5d", yynval);
else if (tok == tk_Ident) printf(" %s", yytext);
else if (tok == tk_String) printf(" %s", yytext);
printf("\n");
} while (tok != tk_EOI);
return 0;
}</syntaxhighlight>
 
{{out|case=test case 3}}
<b>
<pre>
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
22 29 End_of_input
</pre>
</b>
 
=={{header|Forth}}==
Tested with Gforth 0.7.3.
<syntaxhighlight lang="forth">CREATE BUF 0 , \ single-character look-ahead buffer
CREATE COLUMN# 0 ,
CREATE LINE# 1 ,
 
: NEWLINE? ( c -- t|f) DUP 10 = SWAP 13 = OR ;
: +IN ( c --)
1 SWAP NEWLINE?
IF 0 COLUMN# ! LINE# ELSE COLUMN# THEN
+! 0 BUF ! ;
: PEEK BUF @ 0= IF STDIN KEY-FILE BUF ! THEN BUF @ ;
: GETC PEEK DUP +IN ;
: SKIP GETC DROP ;
: .LOCATION 7 .R 4 .R SPACE ;
: WHERE COLUMN# @ LINE# @ ;
: .WHERE WHERE .LOCATION ;
: .WHERE+ WHERE SWAP 1+ SWAP .LOCATION ;
 
: EXPECT GETC OVER OVER =
IF 2DROP
ELSE CR ." stdin:" COLUMN# @ 0 LINE# @ 0
<# #s #> TYPE ." :" <# #s #> TYPE ." : "
." unexpected `" EMIT ." ', expecting `" EMIT ." '" CR
BYE
THEN ;
: EQ PEEK [CHAR] = = IF SKIP 2SWAP THEN
." Op_" TYPE CR 2DROP ;
 
CREATE ESC 4 C, CHAR $ C, CHAR $ C, CHAR \ C, 0 C,
: ?ESC? CR ." Unknown escape sequence `\" EMIT ." '" CR BYE ;
: >ESC ESC 4 + C! ESC ;
: $$\n 10 ;
: $$\\ [CHAR] \ ;
: ESCAPE DUP >ESC FIND IF NIP EXECUTE ELSE DROP ?ESC? THEN ;
: ?ESCAPE DUP [CHAR] \ = IF DROP GETC ESCAPE THEN ;
: ?EOF DUP 4 = IF CR ." End-of-file in string" CR BYE THEN ;
: ?EOL DUP NEWLINE?
IF CR ." End-of-line in string" CR BYE THEN ;
: STRING PAD
BEGIN GETC ?EOF ?EOL DUP [CHAR] " <>
WHILE OVER C! CHAR+
REPEAT DROP PAD TUCK - ;
: "TYPE" [CHAR] " EMIT TYPE [CHAR] " EMIT ;
 
CREATE TOKEN 4 C, CHAR $ C, CHAR $ C, 0 C, 0 C,
: >HEX DUP 9 > IF 7 + THEN [CHAR] 0 + ;
: HI! $F0 AND 2/ 2/ 2/ 2/ >HEX TOKEN 3 + C! ;
: LO! $0F AND >HEX TOKEN 4 + C! ;
: >TOKEN DUP HI! LO! TOKEN ;
 
: ?EOF DUP 4 = IF CR ." End-of-file in comment" CR BYE THEN ;
: $$2F PEEK [CHAR] * =
IF SKIP
BEGIN
GETC ?EOF [CHAR] * =
PEEK [CHAR] / = AND
UNTIL SKIP
ELSE .WHERE ." Op_divide" CR THEN ;
: $$22 .WHERE ." String " STRING "TYPE" CR ;
: $$27 .WHERE GETC ?ESCAPE ." Integer " . [CHAR] ' EXPECT CR ;
: $$04 .WHERE ." End_of_input" CR BYE ;
: $$2D .WHERE ." Op_subtract" CR ;
: $$2B .WHERE ." Op_add" CR ;
: $$25 .WHERE ." Op_mod" CR ;
: $$2A .WHERE ." Op_multiply" CR ;
: $$7B .WHERE ." LeftBrace" CR ;
: $$7D .WHERE ." RightBrace" CR ;
: $$2C .WHERE ." Comma" CR ;
: $$29 .WHERE ." RightParen" CR ;
: $$28 .WHERE ." LeftParen" CR ;
: $$3B .WHERE ." Semicolon" CR ;
: $$3D .WHERE s" equal" s" assign" EQ ;
: $$21 .WHERE s" notequal" s" not" EQ ;
: $$3C .WHERE s" lessequal" s" less" EQ ;
: $$3E .WHERE s" greaterequal" s" greater" EQ ;
: $$26 .WHERE [CHAR] & EXPECT ." Op_and" CR ;
: $$7C .WHERE [CHAR] | EXPECT ." Op_or" CR ;
: $$20 ; \ space
 
CREATE KEYWORD 0 C, CHAR $ C, CHAR $ C, 5 CHARS ALLOT
: >KEYWORD DUP 2 + KEYWORD C!
KEYWORD 3 + SWAP CMOVE KEYWORD ;
: FIND-KW DUP 5 <=
IF 2DUP >KEYWORD FIND
IF TRUE 2SWAP 2DROP ELSE DROP FALSE THEN
ELSE FALSE THEN ;
 
: $$if ." Keyword_if" ;
: $$else ." Keyword_else" ;
: $$while ." Keyword_while" ;
: $$print ." Keyword_print" ;
: $$putc ." Keyword_putc" ;
 
: DIGIT? 48 58 WITHIN ;
: ALPHA? DUP 95 = SWAP \ underscore?
DUP 97 123 WITHIN SWAP \ lower?
65 91 WITHIN OR OR ; \ upper?
: ALNUM? DUP DIGIT? SWAP ALPHA? OR ;
: INTEGER 0
BEGIN PEEK DIGIT?
WHILE GETC [CHAR] 0 - SWAP 10 * +
REPEAT ;
: ?INTEGER? CR ." Invalid number" CR BYE ;
: ?INTEGER PEEK ALPHA? IF ?INTEGER? THEN ;
: DIGIT .WHERE+ ." Integer " INTEGER ?INTEGER . CR ;
: NAME PAD
BEGIN PEEK ALNUM?
WHILE GETC OVER C! CHAR+
REPEAT PAD TUCK - ;
: IDENT ." Identifier " TYPE ;
: ALPHA .WHERE+ NAME FIND-KW
IF EXECUTE ELSE IDENT THEN CR ;
: ?CHAR? CR ." Character '" EMIT ." ' not recognized" CR BYE ;
: SPACE? DUP BL = SWAP 9 14 WITHIN OR ;
: SKIP-SPACE BEGIN PEEK SPACE? WHILE SKIP REPEAT ;
: CONSUME
SKIP-SPACE
PEEK DIGIT? IF DIGIT ELSE
PEEK ALPHA? IF ALPHA ELSE
PEEK >TOKEN FIND
IF SKIP EXECUTE ELSE GETC ?CHAR? BYE THEN
THEN THEN ;
: TOKENIZE BEGIN CONSUME AGAIN ;
TOKENIZE</syntaxhighlight>
 
{{out}}
Tested against all programs in [[Compiler/Sample programs]].
 
=={{header|Fortran}}==
{{works with|gfortran|11.2.1}}
You should call the source file ‘lex.F90’, so gfortran will know to use the C preprocessor. I use the preprocessor to select between different ways to read stream input from the standard input.
 
(Despite the ‘.F90’ extension that I recommend, this is Fortran 2008/2018 code.)
 
There is ‘framework’ for supporting Unicode, but no actual Unicode support. To support Unicode reliably I would probably use the C interface and GNU libunistring.
 
The author has placed this Fortran code in the public domain.
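A build-and-run sketch (the executable name and the test-case path are assumptions; the program reads its source text from standard input, as noted above):
<pre>$ gfortran -o lex lex.F90
$ ./lex < compiler-tests/testcase3.t</pre>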
<syntaxhighlight lang="fortran">!!!
!!! An implementation of the Rosetta Code lexical analyzer task:
!!! https://rosettacode.org/wiki/Compiler/lexical_analyzer
!!!
!!! The C implementation was used as a reference on behavior, but was
!!! not adhered to for the implementation.
!!!
 
module string_buffers
use, intrinsic :: iso_fortran_env, only: error_unit
use, intrinsic :: iso_fortran_env, only: int64
 
implicit none
private
 
public :: strbuf_t
public :: strbuf_t_length_kind
public :: strbuf_t_character_kind
 
integer, parameter :: strbuf_t_length_kind = int64
 
! String buffers can handle Unicode.
integer, parameter :: strbuf_t_character_kind = selected_char_kind ('ISO_10646')
 
! Private abbreviations.
integer, parameter :: nk = strbuf_t_length_kind
integer, parameter :: ck = strbuf_t_character_kind
 
type :: strbuf_t
integer(kind = nk), private :: len = 0
!
! ‘chars’ is made public for efficient access to the individual
! characters.
!
character(1, kind = ck), allocatable, public :: chars(:)
contains
procedure, pass, private :: ensure_storage => strbuf_t_ensure_storage
procedure, pass :: to_unicode => strbuf_t_to_unicode
procedure, pass :: length => strbuf_t_length
procedure, pass :: set => strbuf_t_set
procedure, pass :: append => strbuf_t_append
generic :: assignment(=) => set
end type strbuf_t
 
contains
 
function strbuf_t_to_unicode (strbuf) result (s)
class(strbuf_t), intent(in) :: strbuf
character(:, kind = ck), allocatable :: s
 
!
! This does not actually ensure that the string is valid Unicode;
! any 31-bit ‘character’ is supported.
!
 
integer(kind = nk) :: i
 
allocate (character(len = strbuf%len, kind = ck) :: s)
do i = 1, strbuf%len
s(i:i) = strbuf%chars(i)
end do
end function strbuf_t_to_unicode
 
elemental function strbuf_t_length (strbuf) result (n)
class(strbuf_t), intent(in) :: strbuf
integer(kind = nk) :: n
 
n = strbuf%len
end function strbuf_t_length
 
elemental function next_power_of_two (x) result (y)
integer(kind = nk), intent(in) :: x
integer(kind = nk) :: y
 
!
! It is assumed that no more than 64 bits are used.
!
! The branch-free algorithm is that of
! https://archive.is/nKxAc#RoundUpPowerOf2
!
! Fill in bits until one less than the desired power of two is
! reached, and then add one.
!
 
y = x - 1
y = ior (y, ishft (y, -1))
y = ior (y, ishft (y, -2))
y = ior (y, ishft (y, -4))
y = ior (y, ishft (y, -8))
y = ior (y, ishft (y, -16))
y = ior (y, ishft (y, -32))
y = y + 1
end function next_power_of_two
 
elemental function new_storage_size (length_needed) result (size)
integer(kind = nk), intent(in) :: length_needed
integer(kind = nk) :: size
 
! Increase storage by orders of magnitude.
 
if (2_nk**32 < length_needed) then
size = huge (1_nk)
else
size = next_power_of_two (length_needed)
end if
end function new_storage_size
 
subroutine strbuf_t_ensure_storage (strbuf, length_needed)
class(strbuf_t), intent(inout) :: strbuf
integer(kind = nk), intent(in) :: length_needed
 
integer(kind = nk) :: new_size
type(strbuf_t) :: new_strbuf
 
if (.not. allocated (strbuf%chars)) then
! Initialize a new strbuf%chars array.
new_size = new_storage_size (length_needed)
allocate (strbuf%chars(1:new_size))
else if (ubound (strbuf%chars, 1) < length_needed) then
! Allocate a new strbuf%chars array, larger than the current
! one, but containing the same characters.
new_size = new_storage_size (length_needed)
allocate (new_strbuf%chars(1:new_size))
new_strbuf%chars(1:strbuf%len) = strbuf%chars(1:strbuf%len)
call move_alloc (new_strbuf%chars, strbuf%chars)
end if
end subroutine strbuf_t_ensure_storage
 
subroutine strbuf_t_set (dst, src)
class(strbuf_t), intent(inout) :: dst
class(*), intent(in) :: src
 
integer(kind = nk) :: n
integer(kind = nk) :: i
 
select type (src)
type is (character(*, kind = ck))
n = len (src, kind = nk)
call dst%ensure_storage(n)
do i = 1, n
dst%chars(i) = src(i:i)
end do
dst%len = n
type is (character(*))
n = len (src, kind = nk)
call dst%ensure_storage(n)
do i = 1, n
dst%chars(i) = src(i:i)
end do
dst%len = n
class is (strbuf_t)
n = src%len
call dst%ensure_storage(n)
dst%chars(1:n) = src%chars(1:n)
dst%len = n
class default
error stop
end select
end subroutine strbuf_t_set
 
subroutine strbuf_t_append (dst, src)
class(strbuf_t), intent(inout) :: dst
class(*), intent(in) :: src
 
integer(kind = nk) :: n_dst, n_src, n
integer(kind = nk) :: i
 
select type (src)
type is (character(*, kind = ck))
n_dst = dst%len
n_src = len (src, kind = nk)
n = n_dst + n_src
call dst%ensure_storage(n)
do i = 1, n_src
dst%chars(n_dst + i) = src(i:i)
end do
dst%len = n
type is (character(*))
n_dst = dst%len
n_src = len (src, kind = nk)
n = n_dst + n_src
call dst%ensure_storage(n)
do i = 1, n_src
dst%chars(n_dst + i) = src(i:i)
end do
dst%len = n
class is (strbuf_t)
n_dst = dst%len
n_src = src%len
n = n_dst + n_src
call dst%ensure_storage(n)
dst%chars((n_dst + 1):n) = src%chars(1:n_src)
dst%len = n
class default
error stop
end select
end subroutine strbuf_t_append
 
end module string_buffers
 
module lexical_analysis
use, intrinsic :: iso_fortran_env, only: input_unit
use, intrinsic :: iso_fortran_env, only: output_unit
use, intrinsic :: iso_fortran_env, only: error_unit
use, intrinsic :: iso_fortran_env, only: int32
use, non_intrinsic :: string_buffers
 
implicit none
private
 
public :: lexer_input_t
public :: lexer_output_t
public :: run_lexer
 
integer, parameter :: input_file_unit_no = 100
integer, parameter :: output_file_unit_no = 101
 
! Private abbreviations.
integer, parameter :: nk = strbuf_t_length_kind
integer, parameter :: ck = strbuf_t_character_kind
 
! Integers large enough for a Unicode code point. Unicode code
! points (and UCS-4) have never been allowed to go higher than
! 7FFFFFFF, and are even further restricted now.
integer, parameter :: ichar_kind = int32
 
character(1, kind = ck), parameter :: horizontal_tab_char = char (9, kind = ck)
character(1, kind = ck), parameter :: linefeed_char = char (10, kind = ck)
character(1, kind = ck), parameter :: vertical_tab_char = char (11, kind = ck)
character(1, kind = ck), parameter :: formfeed_char = char (12, kind = ck)
character(1, kind = ck), parameter :: carriage_return_char = char (13, kind = ck)
character(1, kind = ck), parameter :: space_char = ck_' '
 
! The following is correct for Unix and its relatives.
character(1, kind = ck), parameter :: newline_char = linefeed_char
 
character(1, kind = ck), parameter :: backslash_char = char (92, kind = ck)
 
character(*, kind = ck), parameter :: newline_intstring = ck_'10'
character(*, kind = ck), parameter :: backslash_intstring = ck_'92'
 
integer, parameter :: tk_EOI = 0
integer, parameter :: tk_Mul = 1
integer, parameter :: tk_Div = 2
integer, parameter :: tk_Mod = 3
integer, parameter :: tk_Add = 4
integer, parameter :: tk_Sub = 5
integer, parameter :: tk_Negate = 6
integer, parameter :: tk_Not = 7
integer, parameter :: tk_Lss = 8
integer, parameter :: tk_Leq = 9
integer, parameter :: tk_Gtr = 10
integer, parameter :: tk_Geq = 11
integer, parameter :: tk_Eq = 12
integer, parameter :: tk_Neq = 13
integer, parameter :: tk_Assign = 14
integer, parameter :: tk_And = 15
integer, parameter :: tk_Or = 16
integer, parameter :: tk_If = 17
integer, parameter :: tk_Else = 18
integer, parameter :: tk_While = 19
integer, parameter :: tk_Print = 20
integer, parameter :: tk_Putc = 21
integer, parameter :: tk_Lparen = 22
integer, parameter :: tk_Rparen = 23
integer, parameter :: tk_Lbrace = 24
integer, parameter :: tk_Rbrace = 25
integer, parameter :: tk_Semi = 26
integer, parameter :: tk_Comma = 27
integer, parameter :: tk_Ident = 28
integer, parameter :: tk_Integer = 29
integer, parameter :: tk_String = 30
 
character(len = 16), parameter :: token_names(0:30) = &
& (/ "End_of_input ", "Op_multiply ", "Op_divide ", "Op_mod ", "Op_add ", &
& "Op_subtract ", "Op_negate ", "Op_not ", "Op_less ", "Op_lessequal ", &
& "Op_greater ", "Op_greaterequal ", "Op_equal ", "Op_notequal ", "Op_assign ", &
& "Op_and ", "Op_or ", "Keyword_if ", "Keyword_else ", "Keyword_while ", &
& "Keyword_print ", "Keyword_putc ", "LeftParen ", "RightParen ", "LeftBrace ", &
& "RightBrace ", "Semicolon ", "Comma ", "Identifier ", "Integer ", &
& "String " /)
 
type :: token_t
integer :: token_no
 
! Our implementation stores the value of a tk_Integer as a
! string. The C reference implementation stores it as an int.
character(:, kind = ck), allocatable :: val
 
integer(nk) :: line_no
integer(nk) :: column_no
end type token_t
 
type :: lexer_input_t
logical, private :: using_input_unit = .true.
integer, private :: unit_no = -(huge (1))
integer(kind = nk) :: line_no = 1
integer(kind = nk) :: column_no = 0
integer, private :: unget_count = 0
 
! The maximum lookahead is 2, although I believe we are using
! only 1. In principle, the lookahead could be any finite number.
character(1, kind = ck), private :: unget_buffer(1:2)
logical, private :: unget_eof_buffer(1:2)
 
! Using the same strbuf_t multiple times reduces the need for
! reallocations. Putting that strbuf_t in the lexer_input_t is
! simply for convenience.
type(strbuf_t), private :: strbuf
 
contains
!
! Note: There is currently no facility for closing one input and
! switching to another.
!
! Note: There is currently no facility to decode inputs into
! Unicode codepoints. Instead, what happens is raw bytes of
! input get stored as strbuf_t_character_kind values. This
! behavior is adequate for ASCII inputs.
!
procedure, pass :: use_file => lexer_input_t_use_file
procedure, pass :: get_next_ch => lexer_input_t_get_next_ch
procedure, pass :: unget_ch => lexer_input_t_unget_ch
procedure, pass :: unget_eof => lexer_input_t_unget_eof
end type lexer_input_t
 
type :: lexer_output_t
integer, private :: unit_no = output_unit
contains
procedure, pass :: use_file => lexer_output_t_use_file
procedure, pass :: output_token => lexer_output_t_output_token
end type lexer_output_t
 
contains
 
subroutine lexer_input_t_use_file (inputter, filename)
class(lexer_input_t), intent(inout) :: inputter
character(*), intent(in) :: filename
 
integer :: stat
 
inputter%using_input_unit = .false.
inputter%unit_no = input_file_unit_no
inputter%line_no = 1
inputter%column_no = 0
 
open (unit = input_file_unit_no, file = filename, status = 'old', &
& action = 'read', access = 'stream', form = 'unformatted', &
& iostat = stat)
if (stat /= 0) then
write (error_unit, '("Error: failed to open ", A, " for input")') filename
stop 1
end if
end subroutine lexer_input_t_use_file
 
!!!
!!! If you tell gfortran you want -std=f2008 or -std=f2018, you will
!!! likely also need to add -fall-intrinsics or -U__GFORTRAN__
!!!
!!! The first way, you get the FGETC intrinsic. The latter way, you
!!! get the C interface code that uses getchar(3).
!!!
#ifdef __GFORTRAN__
 
subroutine get_input_unit_char (c, stat)
!
! The following works if you are using gfortran.
!
! (FGETC is considered a feature for backwards compatibility with
! g77. However, I know of no way to reconfigure input_unit as a
! Fortran 2003 stream, for use with ordinary ‘read’.)
!
character, intent(inout) :: c
integer, intent(out) :: stat
 
call fgetc (input_unit, c, stat)
end subroutine get_input_unit_char
 
#else
 
subroutine get_input_unit_char (c, stat)
!
! An alternative implementation of get_input_unit_char. This
! actually reads input from the C standard input, which might not
! be the same as input_unit.
!
use, intrinsic :: iso_c_binding, only: c_int
character, intent(inout) :: c
integer, intent(out) :: stat
 
interface
!
! Use getchar(3) to read characters from standard input. This
! assumes there is actually such a function available, and that
! getchar(3) does not exist solely as a macro. (One could write
! one’s own getchar() if necessary, of course.)
!
function getchar () result (c) bind (c, name = 'getchar')
use, intrinsic :: iso_c_binding, only: c_int
integer(kind = c_int) :: c
end function getchar
end interface
 
integer(kind = c_int) :: i_char
 
i_char = getchar ()
!
! The C standard requires that EOF have a negative value. If the
! value returned by getchar(3) is not EOF, then it will be
! representable as an unsigned char. Therefore, to check for end
! of file, one need only test whether i_char is negative.
!
if (i_char < 0) then
stat = -1
else
stat = 0
c = char (i_char)
end if
end subroutine get_input_unit_char
 
#endif
 
subroutine lexer_input_t_get_next_ch (inputter, eof, ch)
class(lexer_input_t), intent(inout) :: inputter
logical, intent(out) :: eof
character(1, kind = ck), intent(inout) :: ch
 
integer :: stat
character(1) :: c = '*'
 
if (0 < inputter%unget_count) then
if (inputter%unget_eof_buffer(inputter%unget_count)) then
eof = .true.
else
eof = .false.
ch = inputter%unget_buffer(inputter%unget_count)
end if
inputter%unget_count = inputter%unget_count - 1
else
if (inputter%using_input_unit) then
call get_input_unit_char (c, stat)
else
read (unit = inputter%unit_no, iostat = stat) c
end if
 
ch = char (ichar (c, kind = ichar_kind), kind = ck)
 
if (0 < stat) then
write (error_unit, '("Input error with status code ", I0)') stat
stop 1
else if (stat < 0) then
eof = .true.
! The C reference code increases column number on end of file;
! therefore, so shall we.
inputter%column_no = inputter%column_no + 1
else
eof = .false.
if (ch == newline_char) then
inputter%line_no = inputter%line_no + 1
inputter%column_no = 0
else
inputter%column_no = inputter%column_no + 1
end if
end if
end if
end subroutine lexer_input_t_get_next_ch
 
subroutine lexer_input_t_unget_ch (inputter, ch)
class(lexer_input_t), intent(inout) :: inputter
character(1, kind = ck), intent(in) :: ch
 
if (ubound (inputter%unget_buffer, 1) <= inputter%unget_count) then
write (error_unit, '("class(lexer_input_t) unget buffer overflow")')
stop 1
else
inputter%unget_count = inputter%unget_count + 1
inputter%unget_buffer(inputter%unget_count) = ch
inputter%unget_eof_buffer(inputter%unget_count) = .false.
end if
end subroutine lexer_input_t_unget_ch
 
subroutine lexer_input_t_unget_eof (inputter)
class(lexer_input_t), intent(inout) :: inputter
 
if (ubound (inputter%unget_buffer, 1) <= inputter%unget_count) then
write (error_unit, '("class(lexer_input_t) unget buffer overflow")')
stop 1
else
inputter%unget_count = inputter%unget_count + 1
inputter%unget_buffer(inputter%unget_count) = ck_'*'
inputter%unget_eof_buffer(inputter%unget_count) = .true.
end if
end subroutine lexer_input_t_unget_eof
 
subroutine lexer_output_t_use_file (outputter, filename)
class(lexer_output_t), intent(inout) :: outputter
character(*), intent(in) :: filename
 
integer :: stat
 
outputter%unit_no = output_file_unit_no
open (unit = output_file_unit_no, file = filename, action = 'write', iostat = stat)
if (stat /= 0) then
write (error_unit, '("Error: failed to open ", A, " for output")') filename
stop 1
end if
end subroutine lexer_output_t_use_file
 
subroutine lexer_output_t_output_token (outputter, token)
class(lexer_output_t), intent(inout) :: outputter
class(token_t), intent(in) :: token
 
select case (token%token_no)
case (tk_Integer, tk_Ident, tk_String)
write (outputter%unit_no, '(1X, I20, 1X, I20, 1X, A, 1X, A)') &
& token%line_no, token%column_no, &
& token_names(token%token_no), token%val
case default
write (outputter%unit_no, '(1X, I20, 1X, I20, 1X, A)') &
& token%line_no, token%column_no, &
& trim (token_names(token%token_no))
end select
end subroutine lexer_output_t_output_token
 
subroutine run_lexer (inputter, outputter)
class(lexer_input_t), intent(inout) :: inputter
class(lexer_output_t), intent(inout) :: outputter
 
type(token_t) :: token
 
token = get_token (inputter)
do while (token%token_no /= tk_EOI)
call outputter%output_token (token)
token = get_token (inputter)
end do
call outputter%output_token (token)
end subroutine run_lexer
 
function get_token (inputter) result (token)
class(lexer_input_t), intent(inout) :: inputter
type(token_t) :: token
 
logical :: eof
character(1, kind = ck) :: ch
 
call skip_spaces_and_comments (inputter, eof, ch, &
& token%line_no, token%column_no)
 
if (eof) then
token%token_no = tk_EOI
else
select case (ch)
case (ck_'{')
token%token_no = tk_Lbrace
case (ck_'}')
token%token_no = tk_Rbrace
case (ck_'(')
token%token_no = tk_Lparen
case (ck_')')
token%token_no = tk_Rparen
case (ck_'+')
token%token_no = tk_Add
case (ck_'-')
token%token_no = tk_Sub
case (ck_'*')
token%token_no = tk_Mul
case (ck_'%')
token%token_no = tk_Mod
case (ck_';')
token%token_no = tk_Semi
case (ck_',')
token%token_no = tk_Comma
case (ck_'/')
token%token_no = tk_Div
 
case (ck_"'")
call read_character_literal
 
case (ck_'<')
call distinguish_operators (ch, ck_'=', tk_Leq, tk_Lss)
case (ck_'>')
call distinguish_operators (ch, ck_'=', tk_Geq, tk_Gtr)
case (ck_'=')
call distinguish_operators (ch, ck_'=', tk_Eq, tk_Assign)
case (ck_'!')
call distinguish_operators (ch, ck_'=', tk_Neq, tk_Not)
case (ck_'&')
call distinguish_operators (ch, ck_'&', tk_And, tk_EOI)
case (ck_'|')
call distinguish_operators (ch, ck_'|', tk_Or, tk_EOI)
 
case (ck_'"')
call read_string_literal (ch, ch)
 
case default
if (isdigit (ch)) then
call read_numeric_literal (ch)
else if (isalpha_or_underscore (ch)) then
call read_identifier_or_keyword (ch)
else
call start_error_message (inputter)
write (error_unit, '("unrecognized character ''", A, "''")') ch
stop 1
end if
end select
end if
contains
 
subroutine read_character_literal
character(1, kind = ck) :: ch
logical :: eof
character(20, kind = ck) :: buffer
 
token%token_no = tk_Integer
 
call inputter%get_next_ch (eof, ch)
if (eof) then
call start_error_message (inputter)
write (error_unit, '("end of input in character literal")')
stop 1
else if (ch == ck_"'") then
call start_error_message (inputter)
write (error_unit, '("empty character literal")')
stop 1
else if (ch == backslash_char) then
call inputter%get_next_ch (eof, ch)
if (eof) then
call start_error_message (inputter)
write (error_unit, '("end of input in character literal, after backslash")')
stop 1
else if (ch == ck_'n') then
allocate (token%val, source = newline_intstring)
else if (ch == backslash_char) then
allocate (token%val, source = backslash_intstring)
else
call start_error_message (inputter)
write (error_unit, '("unknown escape sequence ''", A, A, "'' in character literal")') &
& backslash_char, ch
stop 1
end if
call read_character_literal_close_quote
else
call read_character_literal_close_quote
write (buffer, '(I0)') ichar (ch, kind = ichar_kind)
allocate (token%val, source = trim (buffer))
end if
end subroutine read_character_literal
 
subroutine read_character_literal_close_quote
logical :: eof
character(1, kind = ck) :: close_quote
 
call inputter%get_next_ch (eof, close_quote)
if (eof) then
call start_error_message (inputter)
write (error_unit, '("end of input in character literal")')
stop 1
else if (close_quote /= ck_"'") then
call start_error_message (inputter)
write (error_unit, '("multi-character literal")')
stop 1
end if
end subroutine read_character_literal_close_quote
 
subroutine distinguish_operators (first_ch, second_ch, &
& token_no_if_second_ch, &
& token_no_if_no_second_ch)
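! Decide between a two-character operator and its one-character prefix by
! reading one more character; token_no_if_no_second_ch == tk_EOI marks
! operators (&& and ||) that have no one-character form, in which case a
! lone first character is reported as an error.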
character(1, kind = ck), intent(in) :: first_ch
character(1, kind = ck), intent(in) :: second_ch
integer, intent(in) :: token_no_if_second_ch
integer, intent(in) :: token_no_if_no_second_ch
 
character(1, kind = ck) :: ch
logical :: eof
 
call inputter%get_next_ch (eof, ch)
if (eof) then
call inputter%unget_eof
token%token_no = token_no_if_no_second_ch
else if (ch == second_ch) then
token%token_no = token_no_if_second_ch
else if (token_no_if_no_second_ch == tk_EOI) then
call start_error_message (inputter)
write (error_unit, '("unrecognized character ''", A, "''")') first_ch
stop 1
else
call inputter%unget_ch (ch)
token%token_no = token_no_if_no_second_ch
end if
end subroutine distinguish_operators
 
subroutine read_string_literal (opening_quote, closing_quote)
character(1, kind = ck), intent(in) :: opening_quote
character(1, kind = ck), intent(in) :: closing_quote
 
character(1, kind = ck) :: ch
logical :: done
 
inputter%strbuf = opening_quote
done = .false.
do while (.not. done)
call inputter%get_next_ch (eof, ch)
if (eof) then
call start_error_message (inputter)
write (error_unit, '("end of input in string literal")')
stop 1
else if (ch == closing_quote) then
call inputter%strbuf%append(ch)
done = .true.
else if (ch == newline_char) then
call start_error_message (inputter)
write (error_unit, '("end of line in string literal")')
stop 1
else
call inputter%strbuf%append(ch)
end if
end do
allocate (token%val, source = inputter%strbuf%to_unicode())
token%token_no = tk_String
end subroutine read_string_literal
 
subroutine read_numeric_literal (first_ch)
character(1, kind = ck), intent(in) :: first_ch
 
character(1, kind = ck) :: ch
 
token%token_no = tk_Integer
 
inputter%strbuf = first_ch
call inputter%get_next_ch (eof, ch)
do while (isdigit (ch))
call inputter%strbuf%append (ch)
call inputter%get_next_ch (eof, ch)
end do
if (isalpha_or_underscore (ch)) then
call start_error_message (inputter)
write (error_unit, '("invalid numeric literal """, A, """")') &
& inputter%strbuf%to_unicode()
stop 1
else
call inputter%unget_ch (ch)
allocate (token%val, source = inputter%strbuf%to_unicode())
end if
end subroutine read_numeric_literal
 
subroutine read_identifier_or_keyword (first_ch)
character(1, kind = ck), intent(in) :: first_ch
 
character(1, kind = ck) :: ch
 
inputter%strbuf = first_ch
call inputter%get_next_ch (eof, ch)
do while (isalnum_or_underscore (ch))
call inputter%strbuf%append (ch)
call inputter%get_next_ch (eof, ch)
end do
 
call inputter%unget_ch (ch)
 
!
! The following is a handwritten ‘implicit radix tree’ search
! for keywords, first partitioning the set of keywords according
! to their lengths.
!
! I did it this way for fun. One could, of course, write a
! program to generate code for such a search.
!
! Perfect hashes are another method one could use.
!
! The reference C implementation uses a binary search.
!
token%token_no = tk_Ident
select case (inputter%strbuf%length())
case (2)
select case (inputter%strbuf%chars(1))
case (ck_'i')
select case (inputter%strbuf%chars(2))
case (ck_'f')
token%token_no = tk_If
case default
continue
end select
case default
continue
end select
case (4)
select case (inputter%strbuf%chars(1))
case (ck_'e')
select case (inputter%strbuf%chars(2))
case (ck_'l')
select case (inputter%strbuf%chars(3))
case (ck_'s')
select case (inputter%strbuf%chars(4))
case (ck_'e')
token%token_no = tk_Else
case default
continue
end select
case default
continue
end select
case default
continue
end select
case (ck_'p')
select case (inputter%strbuf%chars(2))
case (ck_'u')
select case (inputter%strbuf%chars(3))
case (ck_'t')
select case (inputter%strbuf%chars(4))
case (ck_'c')
token%token_no = tk_Putc
case default
continue
end select
case default
continue
end select
case default
continue
end select
case default
continue
end select
case (5)
select case (inputter%strbuf%chars(1))
case (ck_'p')
select case (inputter%strbuf%chars(2))
case (ck_'r')
select case (inputter%strbuf%chars(3))
case (ck_'i')
select case (inputter%strbuf%chars(4))
case (ck_'n')
select case (inputter%strbuf%chars(5))
case (ck_'t')
token%token_no = tk_Print
case default
continue
end select
case default
continue
end select
case default
continue
end select
case default
continue
end select
case (ck_'w')
select case (inputter%strbuf%chars(2))
case (ck_'h')
select case (inputter%strbuf%chars(3))
case (ck_'i')
select case (inputter%strbuf%chars(4))
case (ck_'l')
select case (inputter%strbuf%chars(5))
case (ck_'e')
token%token_no = tk_While
case default
continue
end select
case default
continue
end select
case default
continue
end select
case default
continue
end select
case default
continue
end select
case default
continue
end select
if (token%token_no == tk_Ident) then
allocate (token%val, source = inputter%strbuf%to_unicode ())
end if
end subroutine read_identifier_or_keyword
 
end function get_token
 
subroutine skip_spaces_and_comments (inputter, eof, ch, line_no, column_no)
!
! This procedure skips spaces and comments, and also captures the
! line and column numbers at the correct moment to indicate the
! start of a token.
!
class(lexer_input_t), intent(inout) :: inputter
logical, intent(out) :: eof
character(1, kind = ck), intent(inout) :: ch
integer(kind = nk), intent(out) :: line_no
integer(kind = nk), intent(out) :: column_no
 
integer(kind = nk), parameter :: not_done = -(huge (1_nk))
 
line_no = not_done
do while (line_no == not_done)
call inputter%get_next_ch (eof, ch)
if (eof) then
line_no = inputter%line_no
column_no = inputter%column_no
else if (ch == ck_'/') then
line_no = inputter%line_no
column_no = inputter%column_no
call inputter%get_next_ch (eof, ch)
if (eof) then
call inputter%unget_eof
ch = ck_'/'
else if (ch /= ck_'*') then
call inputter%unget_ch (ch)
ch = ck_'/'
else
call read_to_end_of_comment
line_no = not_done
end if
else if (.not. isspace (ch)) then
line_no = inputter%line_no
column_no = inputter%column_no
end if
end do
 
contains
 
subroutine read_to_end_of_comment
logical :: done
 
done = .false.
do while (.not. done)
call inputter%get_next_ch (eof, ch)
if (eof) then
call end_of_input_in_comment
else if (ch == ck_'*') then
call inputter%get_next_ch (eof, ch)
if (eof) then
call end_of_input_in_comment
else if (ch == ck_'/') then
done = .true.
end if
end if
end do
end subroutine read_to_end_of_comment
 
subroutine end_of_input_in_comment
call start_error_message (inputter)
write (error_unit, '("end of input in comment")')
stop 1
end subroutine end_of_input_in_comment
 
end subroutine skip_spaces_and_comments
 
subroutine start_error_message (inputter)
class(lexer_input_t), intent(inout) :: inputter
 
write (error_unit, '("Lexical error at ", I0, ".", I0, ": ")', advance = 'no') &
& inputter%line_no, inputter%column_no
end subroutine start_error_message
 
elemental function isspace (ch) result (bool)
character(1, kind = ck), intent(in) :: ch
logical :: bool
 
bool = (ch == horizontal_tab_char) .or. &
& (ch == linefeed_char) .or. &
& (ch == vertical_tab_char) .or. &
& (ch == formfeed_char) .or. &
& (ch == carriage_return_char) .or. &
& (ch == space_char)
end function isspace
 
elemental function isupper (ch) result (bool)
character(1, kind = ck), intent(in) :: ch
logical :: bool
 
integer(kind = ichar_kind), parameter :: uppercase_A = ichar (ck_'A', kind = ichar_kind)
integer(kind = ichar_kind), parameter :: uppercase_Z = ichar (ck_'Z', kind = ichar_kind)
 
integer(kind = ichar_kind) :: i_ch
 
i_ch = ichar (ch, kind = ichar_kind)
bool = (uppercase_A <= i_ch .and. i_ch <= uppercase_Z)
end function isupper
 
elemental function islower (ch) result (bool)
character(1, kind = ck), intent(in) :: ch
logical :: bool
 
integer(kind = ichar_kind), parameter :: lowercase_a = ichar (ck_'a', kind = ichar_kind)
integer(kind = ichar_kind), parameter :: lowercase_z = ichar (ck_'z', kind = ichar_kind)
 
integer(kind = ichar_kind) :: i_ch
 
i_ch = ichar (ch, kind = ichar_kind)
bool = (lowercase_a <= i_ch .and. i_ch <= lowercase_z)
end function islower
 
elemental function isalpha (ch) result (bool)
character(1, kind = ck), intent(in) :: ch
logical :: bool
 
bool = isupper (ch) .or. islower (ch)
end function isalpha
 
elemental function isdigit (ch) result (bool)
character(1, kind = ck), intent(in) :: ch
logical :: bool
 
integer(kind = ichar_kind), parameter :: zero = ichar (ck_'0', kind = ichar_kind)
integer(kind = ichar_kind), parameter :: nine = ichar (ck_'9', kind = ichar_kind)
 
integer(kind = ichar_kind) :: i_ch
 
i_ch = ichar (ch, kind = ichar_kind)
bool = (zero <= i_ch .and. i_ch <= nine)
end function isdigit
 
elemental function isalnum (ch) result (bool)
character(1, kind = ck), intent(in) :: ch
logical :: bool
 
bool = isalpha (ch) .or. isdigit (ch)
end function isalnum
 
elemental function isalpha_or_underscore (ch) result (bool)
character(1, kind = ck), intent(in) :: ch
logical :: bool
 
bool = isalpha (ch) .or. (ch == ck_'_')
end function isalpha_or_underscore
 
elemental function isalnum_or_underscore (ch) result (bool)
character(1, kind = ck), intent(in) :: ch
logical :: bool
 
bool = isalnum (ch) .or. (ch == ck_'_')
end function isalnum_or_underscore
 
end module lexical_analysis
 
program lex
use, intrinsic :: iso_fortran_env, only: output_unit
use, non_intrinsic :: lexical_analysis
 
implicit none
 
integer :: arg_count
character(200) :: arg
type(lexer_input_t) :: inputter
type(lexer_output_t) :: outputter
 
arg_count = command_argument_count ()
if (3 <= arg_count) then
call print_usage
else if (arg_count == 0) then
call run_lexer (inputter, outputter)
else if (arg_count == 1) then
call get_command_argument (1, arg)
call inputter%use_file(trim (arg))
call run_lexer (inputter, outputter)
else if (arg_count == 2) then
call get_command_argument (1, arg)
call inputter%use_file(trim (arg))
call get_command_argument (2, arg)
call outputter%use_file(trim (arg))
call run_lexer (inputter, outputter)
end if
 
contains
 
subroutine print_usage
character(200) :: progname
 
call get_command_argument (0, progname)
write (output_unit, '("Usage: ", A, " [INPUT_FILE [OUTPUT_FILE]]")') &
& trim (progname)
end subroutine print_usage
end program lex</syntaxhighlight>
 
{{out}}
Test case 3.
<pre> 5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|FreeBASIC}}==
Tested with FreeBASIC 1.05
<syntaxhighlight lang="freebasic">enum Token_type
tk_EOI
tk_Mul
tk_Div
tk_Mod
tk_Add
tk_Sub
tk_Negate
tk_Not
tk_Lss
tk_Leq
tk_Gtr
tk_Geq
tk_Eq
tk_Neq
tk_Assign
tk_And
tk_Or
tk_If
tk_Else
tk_While
tk_Print
tk_Putc
tk_Lparen
tk_Rparen
tk_Lbrace
tk_Rbrace
tk_Semi
tk_Comma
tk_Ident
tk_Integer
tk_String
end enum
 
const NewLine = chr(10)
const DoubleQuote = chr(34)
const BackSlash = chr(92)
 
' where we store keywords and variables
Line 1,058 ⟶ 7,445:
 
function is_digit(byval ch as string) as long
is_digit = (ch <> "") and ch >= "0" AndAlso ch <= "9"
end function
 
function is_alnum(byval ch as string) as long
is_alnum = (ch <> "") and ((ucase(ch) >= "A" AndAlso ucase(ch) <= "Z") OrElse (is_digit(ch)))
end function
 
sub error_msg(byval eline as integer, byval ecol as integer, byval msg as string)
print "("; eline; ":"; ecol; ") "; msg
print : print "Hit any to end program"
sleep
system
end sub
Line 1,072 ⟶ 7,461:
' add an identifier to the symbol table
function install(byval s_name as string, byval tok as Token_type) as integer
dim n as integer = ubound(symtab) + 1
redim preserve symtab(n)
 
symtab(n).s_name = s_name
Line 1,137 ⟶ 7,523:
case "-": tok = tk_sub: next_char(): exit sub
case "*": tok = tk_mul: next_char(): exit sub
case "%": tok = tk_Mod: next_char(): exit sub
case ";": tok = tk_semi: next_char(): exit sub
case ",": tok = tk_comma: next_char(): exit sub
case "/": ' div or comment
next_char()
Line 1,148 ⟶ 7,533:
end if
' skip comments
next_char()
do
if cur_ch = "*" then
next_char()
if cur_ch = "/" then
next_char()
gettok(err_line, err_col, tok, v)
exit sub
end if
elseif cur_ch = "" then error_msg(err_line, err_col, "EOF in comment")
else
next_char()
end if
loop
case "'": ' single char literals
next_char()
v = str(asc(cur_ch))
if cur_ch = "'" then error_msg(err_line, err_col, "empty character constant")
if cur_ch = BackSlash then
next_char()
if cur_ch = "n" then
v = "10"
elseif cur_ch = BackSlash then
v = "92"
else error_msg(err_line, err_col, "unknown escape sequence: " + cur_ch)
end if
Line 1,178 ⟶ 7,566:
exit sub
case "<": next_char(): tok = follow(err_line, err_col, "=", tk_Leq, tk_Lss): exit sub
case ">": next_char(): tok = follow(err_line, err_col, "=", tk_Geq, tk_Gtr): exit sub
case "!": next_char(): tok = follow(err_line, err_col, "=", tk_Neq, tk_Not): exit sub
case "=": next_char(): tok = follow(err_line, err_col, "=", tk_Eq, tk_Assign): exit sub
case "&": next_char(): tok = follow(err_line, err_col, "&", tk_And, tk_EOI): exit sub
case "|": next_char(): tok = follow(err_line, err_col, "|", tk_Or, tk_EOI): exit sub
case DoubleQuote: ' string
v = cur_ch
Line 1,218 ⟶ 7,609:
 
sub init_lex(byval filein as string)
install("else", tk_else)
install("if", tk_if)
install("print", tk_print)
Line 1,236 ⟶ 7,628:
dim tok as Token_type
dim v as string
dim tok_list(tk_EOI to tk_String) as string
 
tok_list(tk_EOI    ) = "End_of_input"
tok_list(tk_Mul    ) = "Op_multiply"
tok_list(tk_Div    ) = "Op_divide"
tok_list(tk_Mod    ) = "Op_mod"
tok_list(tk_Add    ) = "Op_add"
tok_list(tk_Sub    ) = "Op_subtract"
tok_list(tk_Negate ) = "Op_negate"
tok_list(tk_Not    ) = "Op_not"
tok_list(tk_Lss    ) = "Op_less"
tok_list(tk_Leq    ) = "Op_lessequal"
tok_list(tk_Gtr    ) = "Op_greater"
tok_list(tk_Geq    ) = "Op_greaterequal"
tok_list(tk_Eq     ) = "Op_equal"
tok_list(tk_Neq    ) = "Op_notequal"
tok_list(tk_Assign ) = "Op_assign"
tok_list(tk_And    ) = "Op_and"
tok_list(tk_Or     ) = "Op_or"
tok_list(tk_If     ) = "Keyword_if"
tok_list(tk_Else   ) = "Keyword_else"
tok_list(tk_While  ) = "Keyword_while"
tok_list(tk_Print  ) = "Keyword_print"
tok_list(tk_Putc   ) = "Keyword_putc"
tok_list(tk_Lparen ) = "LeftParen"
tok_list(tk_Rparen ) = "RightParen"
tok_list(tk_Lbrace ) = "LeftBrace"
tok_list(tk_Rbrace ) = "RightBrace"
tok_list(tk_Semi   ) = "Semicolon"
tok_list(tk_Comma  ) = "Comma"
tok_list(tk_Ident  ) = "Identifier"
tok_list(tk_Integer) = "Integer"
tok_list(tk_String ) = "String"
 
do
gettok(err_line, err_col, tok, v)
print using "##### ##### \              \ "; err_line; err_col; tok_list(tok);
if tok = tk_integer orelse tok = tk_ident orelse tok = tk_string then print " " + v;
print
Line 1,273 ⟶ 7,671:
 
sub main()
if command(1) = "" then print "filename required" : exit sub
init_lex(command(1))
scanner()
Line 1,279 ⟶ 7,677:
 
main()
print : print "Hit any to end program"
sleep
system</syntaxhighlight>
{{out|case=test case 3}}
<b>
<pre> 5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
22 30 End_of_input</pre>
</b>
 
=={{header|Go}}==
{{trans|FreeBASIC}}
<syntaxhighlight lang="go">package main
 
import (
"bufio"
"fmt"
"log"
"os"
)
 
type TokenType int
 
const (
tkEOI TokenType = iota
tkMul
tkDiv
tkMod
tkAdd
tkSub
tkNegate
tkNot
tkLss
tkLeq
tkGtr
tkGeq
tkEq
tkNeq
tkAssign
tkAnd
tkOr
tkIf
tkElse
tkWhile
tkPrint
tkPutc
tkLparen
tkRparen
tkLbrace
tkRbrace
tkSemi
tkComma
tkIdent
tkInteger
tkString
)
 
type Symbol struct {
name string
tok TokenType
}
 
// symbol table
var symtab []Symbol
 
var scanner *bufio.Scanner
 
var (
curLine = ""
curCh byte
lineNum = 0
colNum = 0
)
 
const etx byte = 4 // used to signify EOI
 
func isDigit(ch byte) bool {
return ch >= '0' && ch <= '9'
}
 
func isAlnum(ch byte) bool {
return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || isDigit(ch)
}
 
func errorMsg(eline, ecol int, msg string) {
log.Fatalf("(%d:%d) %s", eline, ecol, msg)
}
 
// add an identifier to the symbol table
func install(name string, tok TokenType) {
sym := Symbol{name, tok}
symtab = append(symtab, sym)
}
 
// search for an identifier in the symbol table
func lookup(name string) int {
for i := 0; i < len(symtab); i++ {
if symtab[i].name == name {
return i
}
}
return -1
}
 
// read the next line of input from the source file
func nextLine() {
if scanner.Scan() {
curLine = scanner.Text()
lineNum++
colNum = 0
if curLine == "" { // skip blank lines
nextLine()
}
} else {
err := scanner.Err()
if err == nil { // EOF
curCh = etx
curLine = ""
lineNum++
colNum = 1
} else {
log.Fatal(err)
}
}
}
 
// get the next char
func nextChar() {
if colNum >= len(curLine) {
nextLine()
}
if colNum < len(curLine) {
curCh = curLine[colNum]
colNum++
}
}
 
func follow(eline, ecol int, expect byte, ifyes, ifno TokenType) TokenType {
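// Distinguish a two-character operator from its one-character prefix: consume
// the expected second character and return ifyes, otherwise return ifno; when
// ifno is tkEOI the operator has no one-character form and the lone character
// is reported as an error.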
if curCh == expect {
nextChar()
return ifyes
}
if ifno == tkEOI {
errorMsg(eline, ecol, "follow unrecognized character: "+string(curCh))
}
return ifno
}
 
func gettok() (eline, ecol int, tok TokenType, v string) {
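// Skip leading whitespace and return the next token together with the line
// and column where it starts; literal values are returned in v.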
// skip whitespace
for curCh == ' ' || curCh == '\t' || curCh == '\n' {
nextChar()
}
eline = lineNum
ecol = colNum
switch curCh {
case etx:
tok = tkEOI
return
case '{':
tok = tkLbrace
nextChar()
return
case '}':
tok = tkRbrace
nextChar()
return
case '(':
tok = tkLparen
nextChar()
return
case ')':
tok = tkRparen
nextChar()
return
case '+':
tok = tkAdd
nextChar()
return
case '-':
tok = tkSub
nextChar()
return
case '*':
tok = tkMul
nextChar()
return
case '%':
tok = tkMod
nextChar()
return
case ';':
tok = tkSemi
nextChar()
return
case ',':
tok = tkComma
nextChar()
return
case '/': // div or comment
nextChar()
if curCh != '*' {
tok = tkDiv
return
}
// skip comments
nextChar()
for {
if curCh == '*' {
nextChar()
if curCh == '/' {
nextChar()
eline, ecol, tok, v = gettok()
return
}
} else if curCh == etx {
errorMsg(eline, ecol, "EOF in comment")
} else {
nextChar()
}
}
case '\'': // single char literals
nextChar()
v = fmt.Sprintf("%d", curCh)
if curCh == '\'' {
errorMsg(eline, ecol, "Empty character constant")
}
if curCh == '\\' {
nextChar()
if curCh == 'n' {
v = "10"
} else if curCh == '\\' {
v = "92"
} else {
errorMsg(eline, ecol, "unknown escape sequence: "+string(curCh))
}
}
nextChar()
if curCh != '\'' {
errorMsg(eline, ecol, "multi-character constant")
}
nextChar()
tok = tkInteger
return
case '<':
nextChar()
tok = follow(eline, ecol, '=', tkLeq, tkLss)
return
case '>':
nextChar()
tok = follow(eline, ecol, '=', tkGeq, tkGtr)
return
case '!':
nextChar()
tok = follow(eline, ecol, '=', tkNeq, tkNot)
return
case '=':
nextChar()
tok = follow(eline, ecol, '=', tkEq, tkAssign)
return
case '&':
nextChar()
tok = follow(eline, ecol, '&', tkAnd, tkEOI)
return
case '|':
nextChar()
tok = follow(eline, ecol, '|', tkOr, tkEOI)
return
case '"': // string
v = string(curCh)
nextChar()
for curCh != '"' {
if curCh == '\n' {
errorMsg(eline, ecol, "EOL in string")
}
if curCh == etx {
errorMsg(eline, ecol, "EOF in string")
}
v += string(curCh)
nextChar()
}
v += string(curCh)
nextChar()
tok = tkString
return
default: // integers or identifiers
isNumber := isDigit(curCh)
v = ""
for isAlnum(curCh) || curCh == '_' {
if !isDigit(curCh) {
isNumber = false
}
v += string(curCh)
nextChar()
}
if len(v) == 0 {
errorMsg(eline, ecol, "unknown character: "+string(curCh))
}
if isDigit(v[0]) {
if !isNumber {
errorMsg(eline, ecol, "invalid number: "+string(curCh))
}
tok = tkInteger
return
}
index := lookup(v)
if index == -1 {
tok = tkIdent
} else {
tok = symtab[index].tok
}
return
}
}
 
func initLex() {
install("else", tkElse)
install("if", tkIf)
install("print", tkPrint)
install("putc", tkPutc)
install("while", tkWhile)
nextChar()
}
 
func process() {
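// Map each token type to its display name, then read and print tokens until
// End_of_input is reached.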
tokMap := make(map[TokenType]string)
tokMap[tkEOI] = "End_of_input"
tokMap[tkMul] = "Op_multiply"
tokMap[tkDiv] = "Op_divide"
tokMap[tkMod] = "Op_mod"
tokMap[tkAdd] = "Op_add"
tokMap[tkSub] = "Op_subtract"
tokMap[tkNegate] = "Op_negate"
tokMap[tkNot] = "Op_not"
tokMap[tkLss] = "Op_less"
tokMap[tkLeq] = "Op_lessequal"
tokMap[tkGtr] = "Op_greater"
tokMap[tkGeq] = "Op_greaterequal"
tokMap[tkEq] = "Op_equal"
tokMap[tkNeq] = "Op_notequal"
tokMap[tkAssign] = "Op_assign"
tokMap[tkAnd] = "Op_and"
tokMap[tkOr] = "Op_or"
tokMap[tkIf] = "Keyword_if"
tokMap[tkElse] = "Keyword_else"
tokMap[tkWhile] = "Keyword_while"
tokMap[tkPrint] = "Keyword_print"
tokMap[tkPutc] = "Keyword_putc"
tokMap[tkLparen] = "LeftParen"
tokMap[tkRparen] = "RightParen"
tokMap[tkLbrace] = "LeftBrace"
tokMap[tkRbrace] = "RightBrace"
tokMap[tkSemi] = "Semicolon"
tokMap[tkComma] = "Comma"
tokMap[tkIdent] = "Identifier"
tokMap[tkInteger] = "Integer"
tokMap[tkString] = "String"
 
for {
eline, ecol, tok, v := gettok()
fmt.Printf("%5d %5d %-16s", eline, ecol, tokMap[tok])
if tok == tkInteger || tok == tkIdent || tok == tkString {
fmt.Println(v)
} else {
fmt.Println()
}
if tok == tkEOI {
return
}
}
}
 
func check(err error) {
if err != nil {
log.Fatal(err)
}
}
 
func main() {
if len(os.Args) < 2 {
fmt.Println("Filename required")
return
}
f, err := os.Open(os.Args[1])
check(err)
defer f.Close()
scanner = bufio.NewScanner(f)
initLex()
process()
}</syntaxhighlight>
 
{{out}}
Test Case 3:
<pre>
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input
</pre>
 
=={{header|Haskell}}==
Tested with GHC 8.0.2
<syntaxhighlight lang="haskell">import Control.Applicative hiding (many, some)
import Control.Monad.State.Lazy
import Control.Monad.Trans.Maybe (MaybeT, runMaybeT)
import Data.Char (isAsciiLower, isAsciiUpper, isDigit, ord)
import Data.Foldable (asum)
import Data.Functor (($>))
import Data.Text (Text)
import qualified Data.Text as T
import Prelude hiding (lex)
import System.Environment (getArgs)
import System.IO
import Text.Printf
 
 
-- Tokens --------------------------------------------------------------------------------------------------------------
data Val = IntVal Int -- value
| TextVal String Text -- name value
| SymbolVal String -- name
| Skip
| LexError String -- message
 
data Token = Token Val Int Int -- value line column
 
 
instance Show Val where
show (IntVal value) = printf "%-18s%d\n" "Integer" value
show (TextVal "String" value) = printf "%-18s%s\n" "String" (show $ T.unpack value) -- show escaped characters
show (TextVal name value) = printf "%-18s%s\n" name (T.unpack value)
show (SymbolVal name ) = printf "%s\n" name
show (LexError msg ) = printf "%-18s%s\n" "Error" msg
show Skip = printf ""
 
instance Show Token where
show (Token val line column) = printf "%2d %2d %s" line column (show val)
 
 
printTokens :: [Token] -> String
printTokens tokens =
"Location Token name Value\n" ++
"--------------------------------------\n" ++
(concatMap show tokens)
 
 
-- Tokenizers ----------------------------------------------------------------------------------------------------------
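-- makeToken records the start position, runs a value-level lexer and wraps the
-- result in a Token; a Skip result (a comment) restarts the scan, and a LexError
-- is annotated with the offending source text and its position.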
makeToken :: Lexer Val -> Lexer Token
makeToken lexer = do
(t, l, c) <- get
val <- lexer
 
case val of
Skip -> nextToken
 
LexError msg -> do
(_, l', c') <- get
 
let code = T.unpack $ T.take (c' - c + 1) t
let str = printf "%s\n%s(%d, %d): %s" msg (replicate 27 ' ') l' c' code
 
ch <- peek
unless (ch == '\0') $ advance 1
 
return $ Token (LexError str) l c
 
_ -> return $ Token val l c
 
 
simpleToken :: String -> String -> Lexer Val
simpleToken lexeme name = lit lexeme $> SymbolVal name
 
 
makeTokenizers :: [(String, String)] -> Lexer Val
makeTokenizers = asum . map (uncurry simpleToken)
 
 
keywords :: Lexer Val
keywords = makeTokenizers
[("if", "Keyword_if"), ("else", "Keyword_else"), ("while", "Keyword_while"),
("print", "Keyword_print"), ("putc", "Keyword_putc")]
 
 
operators :: Lexer Val
operators = makeTokenizers
[("*", "Op_multiply"), ("/", "Op_divide"), ("%", "Op_mod"), ("+", "Op_add"),
("-", "Op_subtract"), ("<=", "Op_lessequal"), ("<", "Op_less"), (">=", "Op_greaterequal"),
(">", "Op_greater"), ("==", "Op_equal"), ("!=", "Op_notequal"), ("!", "Op_not"),
("=", "Op_assign"), ("&&", "Op_and"), ("||", "Op_or")]
 
 
symbols :: Lexer Val
symbols = makeTokenizers
[("(", "LeftParen"), (")", "RightParen"),
("{", "LeftBrace"), ("}", "RightBrace"),
(";", "Semicolon"), (",", "Comma")]
 
 
isIdStart :: Char -> Bool
isIdStart ch = isAsciiLower ch || isAsciiUpper ch || ch == '_'
 
isIdEnd :: Char -> Bool
isIdEnd ch = isIdStart ch || isDigit ch
 
identifier :: Lexer Val
identifier = TextVal "Identifier" <$> lexeme
where lexeme = T.cons <$> (one isIdStart) <*> (many isIdEnd)
 
 
integer :: Lexer Val
integer = do
lexeme <- some isDigit
next_ch <- peek
 
if (isIdStart next_ch) then
return $ LexError "Invalid number. Starts like a number, but ends in non-numeric characters."
else do
let num = read (T.unpack lexeme) :: Int
return $ IntVal num
 
 
character :: Lexer Val
character = do
lit "'"
str <- lookahead 3
 
case str of
(ch : '\'' : _) -> advance 2 $> IntVal (ord ch)
"\\n'" -> advance 3 $> IntVal 10
"\\\\'" -> advance 3 $> IntVal 92
('\\' : ch : "\'") -> advance 2 $> LexError (printf "Unknown escape sequence \\%c" ch)
('\'' : _) -> return $ LexError "Empty character constant"
_ -> advance 2 $> LexError "Multi-character constant"
 
 
string :: Lexer Val
string = do
lit "\""
 
loop (T.pack "") =<< peek
where loop t ch = case ch of
'\\' -> do
next_ch <- next
 
case next_ch of
'n' -> loop (T.snoc t '\n') =<< next
'\\' -> loop (T.snoc t '\\') =<< next
_ -> return $ LexError $ printf "Unknown escape sequence \\%c" next_ch
 
'"' -> next $> TextVal "String" t
 
'\n' -> return $ LexError $ "End-of-line while scanning string literal." ++
" Closing string character not found before end-of-line."
 
'\0' -> return $ LexError $ "End-of-file while scanning string literal." ++
" Closing string character not found."
 
_ -> loop (T.snoc t ch) =<< next
 
 
skipComment :: Lexer Val
skipComment = do
lit "/*"
 
loop =<< peek
where loop ch = case ch of
'\0' -> return $ LexError "End-of-file in comment. Closing comment characters not found."
 
'*' -> do
next_ch <- next
 
case next_ch of
'/' -> next $> Skip
_ -> loop next_ch
 
_ -> loop =<< next
 
 
nextToken :: Lexer Token
nextToken = do
skipWhitespace
 
makeToken $ skipComment
<|> keywords
<|> identifier
<|> integer
<|> character
<|> string
<|> operators
<|> symbols
<|> simpleToken "\0" "End_of_input"
<|> (return $ LexError "Unrecognized character.")
 
 
main :: IO ()
main = do
args <- getArgs
(hin, hout) <- getIOHandles args
 
withHandles hin hout $ printTokens . (lex nextToken)
 
 
------------------------------------------------------------------------------------------------------------------------
-- Machinery
------------------------------------------------------------------------------------------------------------------------
 
-- File handling -------------------------------------------------------------------------------------------------------
getIOHandles :: [String] -> IO (Handle, Handle)
getIOHandles [] = return (stdin, stdout)
 
getIOHandles [infile] = do
inhandle <- openFile infile ReadMode
return (inhandle, stdout)
 
getIOHandles (infile : outfile : _) = do
inhandle <- openFile infile ReadMode
outhandle <- openFile outfile WriteMode
return (inhandle, outhandle)
 
 
withHandles :: Handle -> Handle -> (String -> String) -> IO ()
withHandles in_handle out_handle f = do
contents <- hGetContents in_handle
let contents' = contents ++ "\0" -- adding \0 simplifies treatment of EOF
 
hPutStr out_handle $ f contents'
 
unless (in_handle == stdin) $ hClose in_handle
unless (out_handle == stdout) $ hClose out_handle
 
 
-- Lexer ---------------------------------------------------------------------------------------------------------------
type LexerState = (Text, Int, Int) -- input line column
type Lexer = MaybeT (State LexerState)
 
 
lexerAdvance :: Int -> LexerState -> LexerState
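-- Consume n characters, updating the line and column counters (a newline
-- resets the column to 1 and increments the line number).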
lexerAdvance 0 ctx = ctx
 
lexerAdvance 1 (t, l, c)
| ch == '\n' = (rest, l + 1, 1 )
| otherwise = (rest, l, c + 1)
where
(ch, rest) = (T.head t, T.tail t)
 
lexerAdvance n ctx = lexerAdvance (n - 1) $ lexerAdvance 1 ctx
 
 
advance :: Int -> Lexer ()
advance n = modify $ lexerAdvance n
 
 
peek :: Lexer Char
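-- Return the current character without consuming it.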
peek = gets $ \(t, _, _) -> T.head t
 
 
lookahead :: Int -> Lexer String
lookahead n = gets $ \(t, _, _) -> T.unpack $ T.take n t
 
 
next :: Lexer Char
next = advance 1 >> peek
 
 
skipWhitespace :: Lexer ()
skipWhitespace = do
ch <- peek
when (ch `elem` " \n") (next >> skipWhitespace)
 
 
lit :: String -> Lexer ()
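-- Match an exact lexeme at the current position; the guard fails (so <|> can
-- try the next tokenizer) when it is not a prefix of the remaining input.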
lit lexeme = do
(t, _, _) <- get
guard $ T.isPrefixOf (T.pack lexeme) t
advance $ length lexeme
 
 
one :: (Char -> Bool) -> Lexer Char
one f = do
ch <- peek
guard $ f ch
next
return ch
 
 
lexerMany :: (Char -> Bool) -> LexerState -> (Text, LexerState)
lexerMany f (t, l, c) = (lexeme, (t', l', c'))
where (lexeme, _) = T.span f t
(t', l', c') = lexerAdvance (T.length lexeme) (t, l, c)
 
 
many :: (Char -> Bool) -> Lexer Text
many f = state $ lexerMany f
 
 
some :: (Char -> Bool) -> Lexer Text
some f = T.cons <$> (one f) <*> (many f)
 
 
lex :: Lexer a -> String -> [a]
lex lexer str = loop lexer (T.pack str, 1, 1)
where loop lexer s
| T.null txt = [t]
| otherwise = t : loop lexer s'
 
where (Just t, s') = runState (runMaybeT lexer) s
(txt, _, _) = s'
</syntaxhighlight>
 
{{out|case=test case 3}}
<pre>
Location Token name Value
--------------------------------------
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input
</pre>
 
=={{header|Icon}}==
{{trans|ATS}}
{{works with|Icon|9.5.20i}}
 
This implementation was developed for Arizona Icon, but ought to work with the Unicon compiler, as well.
 
One interesting aspect is the use of co-expressions to handle "input with pushback". The main advantage of this approach is that it hides the pushback buffer from the user, rather than exposing it as a global variable.
 
Global variables are avoided except for some constants that require initialization.
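
The essence of that co-expression pattern is sketched below. It is purely illustrative (the helper <code>pushback_demo</code> is not part of the program that follows) and assumes the same <code>inputter</code> generator defined in the full listing:

<syntaxhighlight lang="icon"># Consumer side of "input with pushback": the buffer is shared only by the
# two co-expressions, not by the rest of the program.
procedure pushback_demo(inpf)
   local buf, inp, pushback, ch
   buf := []
   inp := create inputter(inpf, buf)              # yields [char, line, column]
   pushback := create repeat push(buf, \@&source)
   @pushback                                      # first activation only primes it
   ch := @inp                                     # read one item
   ch @pushback                                   # push it back ...
   return @inp                                    # ... and read the same item again
end</syntaxhighlight>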
 
<syntaxhighlight lang="icon">#
# The Rosetta Code lexical analyzer in Icon with co-expressions. Based
# upon the ATS implementation.
#
# Usage: lex [INPUTFILE [OUTPUTFILE]]
# If INPUTFILE or OUTPUTFILE is "-" or missing, then standard input
# or standard output is used, respectively.
#
 
$define EOF -1
 
$define TOKEN_ELSE 0
$define TOKEN_IF 1
$define TOKEN_PRINT 2
$define TOKEN_PUTC 3
$define TOKEN_WHILE 4
$define TOKEN_MULTIPLY 5
$define TOKEN_DIVIDE 6
$define TOKEN_MOD 7
$define TOKEN_ADD 8
$define TOKEN_SUBTRACT 9
$define TOKEN_NEGATE 10
$define TOKEN_LESS 11
$define TOKEN_LESSEQUAL 12
$define TOKEN_GREATER 13
$define TOKEN_GREATEREQUAL 14
$define TOKEN_EQUAL 15
$define TOKEN_NOTEQUAL 16
$define TOKEN_NOT 17
$define TOKEN_ASSIGN 18
$define TOKEN_AND 19
$define TOKEN_OR 20
$define TOKEN_LEFTPAREN 21
$define TOKEN_RIGHTPAREN 22
$define TOKEN_LEFTBRACE 23
$define TOKEN_RIGHTBRACE 24
$define TOKEN_SEMICOLON 25
$define TOKEN_COMMA 26
$define TOKEN_IDENTIFIER 27
$define TOKEN_INTEGER 28
$define TOKEN_STRING 29
$define TOKEN_END_OF_INPUT 30
 
global whitespace
global ident_start
global ident_continuation
 
procedure main(args)
local inpf, outf
local pushback_buffer, inp, pushback
 
initial {
whitespace := ' \t\v\f\r\n'
ident_start := '_' ++ &letters
ident_continuation := ident_start ++ &digits
}
 
inpf := &input
outf := &output
if 1 <= *args & args[1] ~== "-" then {
inpf := open(args[1], "rt") |
stop("cannot open ", args[1], " for input")
}
if 2 <= *args & args[2] ~== "-" then {
outf := open(args[2], "wt") |
stop("cannot open ", args[2], " for output")
}
 
pushback_buffer := []
inp := create inputter(inpf, pushback_buffer)
pushback := create repeat push(pushback_buffer, \@&source)
@pushback # The first invocation does nothing.
 
scan_text(outf, inp, pushback)
end
 
procedure scan_text(outf, inp, pushback)
local ch
 
while /ch | ch[1] ~=== EOF do {
skip_spaces_and_comments(inp, pushback)
ch := @inp
if ch[1] === EOF then {
print_token(outf, [TOKEN_END_OF_INPUT, "", ch[2], ch[3]])
} else {
ch @pushback
print_token(outf, get_next_token(inp, pushback))
}
}
end
 
procedure get_next_token(inp, pushback)
local ch, ch1
local ln, cn
 
skip_spaces_and_comments(inp, pushback)
ch := @inp
ln := ch[2] # line number
cn := ch[3] # column number
case ch[1] of {
"," : return [TOKEN_COMMA, ",", ln, cn]
";" : return [TOKEN_SEMICOLON, ";", ln, cn]
"(" : return [TOKEN_LEFTPAREN, "(", ln, cn]
")" : return [TOKEN_RIGHTPAREN, ")", ln, cn]
"{" : return [TOKEN_LEFTBRACE, "{", ln, cn]
"}" : return [TOKEN_RIGHTBRACE, "}", ln, cn]
"*" : return [TOKEN_MULTIPLY, "*", ln, cn]
"/" : return [TOKEN_DIVIDE, "/", ln, cn]
"%" : return [TOKEN_MOD, "%", ln, cn]
"+" : return [TOKEN_ADD, "+", ln, cn]
"-" : return [TOKEN_SUBTRACT, "-", ln, cn]
"<" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_LESSEQUAL, "<=", ln, cn]
} else {
ch1 @pushback
return [TOKEN_LESS, "<", ln, cn]
}
}
">" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_GREATEREQUAL, ">=", ln, cn]
} else {
ch1 @pushback
return [TOKEN_GREATER, ">", ln, cn]
}
}
"=" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_EQUAL, "==", ln, cn]
} else {
ch1 @pushback
return [TOKEN_ASSIGN, "=", ln, cn]
}
}
"!" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_NOTEQUAL, "!=", ln, cn]
} else {
ch1 @pushback
return [TOKEN_NOT, "!", ln, cn]
}
}
"&" : {
ch1 := @inp
if ch1[1] === "&" then {
return [TOKEN_AND, "&&", ln, cn]
} else {
unexpected_character(ln, cn, ch)
}
}
"|" : {
ch1 := @inp
if ch1[1] === "|" then {
return [TOKEN_OR, "||", ln, cn]
} else {
unexpected_character(ln, cn, ch)
}
}
"\"" : {
ch @pushback
return scan_string_literal(inp)
}
"'" : {
ch @pushback
return scan_character_literal(inp, pushback)
}
default : {
if any(&digits, ch[1]) then {
ch @pushback
return scan_integer_literal(inp, pushback)
} else if any(ident_start, ch[1]) then {
ch @pushback
return scan_identifier_or_reserved_word (inp, pushback)
} else {
unexpected_character(ln, cn, ch)
}
}
}
end
 
procedure scan_identifier_or_reserved_word(inp, pushback)
local ch
local s
local line_no, column_no
 
s := ""
ch := @inp
line_no := ch[2]
column_no := ch[3]
while EOF ~=== ch[1] & any(ident_continuation, ch[1]) do {
s ||:= ch[1]
ch := @inp
}
ch @pushback
return reserved_word_lookup (s, line_no, column_no)
end
 
procedure scan_integer_literal(inp, pushback)
local ch
local s
local line_no, column_no
 
s := ""
ch := @inp
line_no := ch[2]
column_no := ch[3]
while EOF ~=== ch[1] & any(ident_continuation, ch[1]) do {
s ||:= ch[1]
ch := @inp
}
ch @pushback
not upto(~&digits, s) | invalid_integer_literal(line_no, column_no, s)
return [TOKEN_INTEGER, s, line_no, column_no]
end
 
procedure scan_character_literal(inp, pushback)
local ch, ch1
local close_quote
local toktup
local line_no, column_no
 
ch := @inp # The opening quote.
close_quote := ch[1] # Same as the opening quote.
ch @pushback
 
line_no := ch[2]
column_no := ch[3]
 
toktup := scan_character_literal_without_checking_end(inp)
ch1 := @inp
if ch1[1] ~=== close_quote then {
repeat {
case ch1[1] of {
EOF : unterminated_character_literal(line_no, column_no)
close_quote : multicharacter_literal(line_no, column_no)
default : ch1 := @inp
}
}
}
return toktup
end
 
procedure scan_character_literal_without_checking_end(inp)
local ch, ch1, ch2
 
ch := @inp # The opening quote.
ch1 := @inp
EOF ~=== ch1[1] | unterminated_character_literal(ch[2], ch[3])
if ch1[1] == "\\" then {
ch2 := @inp
EOF ~=== ch2[1] | unterminated_character_literal(ch[2], ch[3])
case ch2[1] of {
"n" : return [TOKEN_INTEGER, string(ord("\n")), ch[2], ch[3]]
"\\" : return [TOKEN_INTEGER, string(ord("\\")), ch[2], ch[3]]
default : unsupported_escape(ch1[2], ch1[3], ch2)
}
} else {
return [TOKEN_INTEGER, string(ord(ch1[1])), ch[2], ch[3]]
}
end
 
procedure scan_string_literal(inp)
local ch, ch1, ch2
local line_no, column_no
local close_quote
local s
local retval
 
ch := @inp # The opening quote
close_quote := ch[1] # Same as the opening quote.
line_no := ch[2]
column_no := ch[3]
 
s := ch[1]
until \retval do {
ch1 := @inp
ch1[1] ~=== EOF |
unterminated_string_literal (line_no, column_no,
"end of input")
ch1[1] ~== "\n" |
unterminated_string_literal (line_no, column_no,
"end of line")
if ch1[1] == close_quote then {
retval := [TOKEN_STRING, s || close_quote, line_no, column_no]
} else if ch1[1] ~== "\\" then {
s ||:= ch1[1]
} else {
ch2 := @inp
EOF ~=== ch2[1] | unsupported_escape(line_no, column_no, ch2)
case ch2[1] of {
"n" : s ||:= "\\n"
"\\" : s ||:= "\\\\"
default : unsupported_escape(line_no, column_no, ch2)
}
}
}
return retval
end
 
procedure skip_spaces_and_comments(inp, pushback)
local ch, ch1
 
repeat {
ch := @inp
(EOF === ch[1]) & { ch @pushback; return }
if not any(whitespace, ch[1]) then {
(ch[1] == "/") | { ch @pushback; return }
(ch1 := @inp) | { ch @pushback; return }
(ch1[1] == "*") | { ch1 @pushback; ch @pushback; return }
scan_comment(inp, ch[2], ch[3])
}
}
end
 
procedure scan_comment(inp, line_no, column_no)
local ch, ch1
 
until (\ch)[1] == "*" & (\ch1)[1] == "/" do {
ch := @inp
(EOF === ch[1]) & unterminated_comment(line_no, column_no)
if ch[1] == "*" then {
ch1 := @inp
(EOF === ch1[1]) & unterminated_comment(line_no, column_no)
}
}
return
end
 
procedure reserved_word_lookup(s, line_no, column_no)
 
# Lookup is by an extremely simple perfect hash.
 
static reserved_words
static reserved_word_tokens
local hashval, token, toktup
 
initial {
reserved_words := ["if", "print", "else",
"", "putc", "",
"", "while", ""]
reserved_word_tokens :=
[TOKEN_IF, TOKEN_PRINT, TOKEN_ELSE,
TOKEN_IDENTIFIER, TOKEN_PUTC, TOKEN_IDENTIFIER,
TOKEN_IDENTIFIER, TOKEN_WHILE, TOKEN_IDENTIFIER]
}
 
if *s < 2 then {
toktup := [TOKEN_IDENTIFIER, s, line_no, column_no]
} else {
hashval := ((ord(s[1]) + ord(s[2])) % (*reserved_words)) + 1
token := reserved_word_tokens[hashval]
if token = TOKEN_IDENTIFIER | s ~== reserved_words[hashval] then {
toktup := [TOKEN_IDENTIFIER, s, line_no, column_no]
} else {
toktup := [token, s, line_no, column_no]
}
}
return toktup
end
 
procedure print_token(outf, toktup)
static token_names
local s_line, s_column
 
initial {
token_names := ["Keyword_else",
"Keyword_if",
"Keyword_print",
"Keyword_putc",
"Keyword_while",
"Op_multiply",
"Op_divide",
"Op_mod",
"Op_add",
"Op_subtract",
"Op_negate",
"Op_less",
"Op_lessequal",
"Op_greater",
"Op_greaterequal",
"Op_equal",
"Op_notequal",
"Op_not",
"Op_assign",
"Op_and",
"Op_or",
"LeftParen",
"RightParen",
"LeftBrace",
"RightBrace",
"Semicolon",
"Comma",
"Identifier",
"Integer",
"String",
"End_of_input"]
}
 
/outf := &output
s_line := string(toktup[3])
s_column := string(toktup[4])
writes(outf, right (s_line, max(5, *s_line)))
writes(outf, " ")
writes(outf, right (s_column, max(5, *s_column)))
writes(outf, " ")
writes(outf, token_names[toktup[1] + 1])
case toktup[1] of {
TOKEN_IDENTIFIER : writes(outf, " ", toktup[2])
TOKEN_INTEGER : writes(outf, " ", toktup[2])
TOKEN_STRING : writes(outf, " ", toktup[2])
}
write(outf)
return
end
 
procedure inputter(inpf, pushback_buffer)
local buffer
local line_no, column_no
local c
 
buffer := ""
line_no := 1
column_no := 1
 
repeat {
buffer? {
until *pushback_buffer = 0 & pos(0) do {
if *pushback_buffer ~= 0 then {
suspend pop(pushback_buffer)
} else {
c := move(1)
suspend [c, line_no, column_no]
if c == "\n" then {
line_no +:= 1
column_no := 1
} else {
column_no +:= 1
}
}
}
}
(buffer := reads(inpf, 2048)) |
suspend [EOF, line_no, column_no]
}
end
 
procedure unterminated_comment(line_no, column_no)
error("unterminated comment starting at ",
line_no, ":", column_no)
end
 
procedure unexpected_character(line_no, column_no, ch)
error("unexpected character '", ch[1], "' starting at ",
line_no, ":", column_no)
end
 
procedure unterminated_string_literal (line_no, column_no, cause)
error("unterminated string literal (", cause, ") starting at ",
line_no, ":", column_no)
end
 
procedure unsupported_escape (line_no, column_no, ch)
if ch[1] === EOF then {
error("unexpected \\ at end of input",
" starting at ", line_no, ":", column_no)
} else {
error("unsupported escape \\", ch[1],
" starting at ", line_no, ":", column_no)
}
end
 
procedure invalid_integer_literal(line_no, column_no, s)
error("invalid integer literal ", s,
" starting at ", line_no, ":", column_no)
end
 
procedure unterminated_character_literal(line_no, column_no)
error("unterminated character literal starting at ",
line_no, ":", column_no)
end
 
procedure multicharacter_literal(line_no, column_no)
error("unsupported multicharacter literal starting at ",
line_no, ":", column_no)
end
 
procedure error(args[])
write!([&errout] ||| args)
exit(1)
end
 
procedure max(x, y)
return (if x < y then y else x)
end</syntaxhighlight>
 
 
{{out}}
<pre>$ icont -s -u -o lex lex-in-Icon.icn && ./lex compiler-tests/testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|J}}==
Here, we first build a tokenizer state machine sufficient to recognize our mini-language. This tokenizer must not discard any characters, because we will be using cumulative character offsets to identify line numbers and column numbers.
 
Then, we refine this result: we generate those line and column numbers, discard whitespace and comments, and classify tokens based on their structure.
 
(Also, in this version, rather than building out a full state machine to recognize character literals, we treat character literals as a sequence of tokens which we must then refine. It might have been wiser to build character literals as single tokens.)
 
Implementation:
 
<syntaxhighlight lang="j">symbols=:256#0
ch=: {{1 0+x[symbols=: x (a.i.y)} symbols}}
'T0 token' =: 0 ch '%+-!(){};,<>=!|&'
'L0 letter' =: 1 ch '_',,u:65 97+/i.26
'D0 digit' =: 2 ch u:48+i.10
'S0 space' =: 3 ch ' ',LF
'C0 commen' =: 4 ch '/'
'C1 comment'=: 5 ch '*'
'q0 quote' =: 6 ch ''''
'Q0 dquote' =: 7 ch '"'
 
width=: 1+>./symbols
default=: ,:(1+i.width),every 2
states=:((1+i.width),every 1),width#default
extend=: {{
if.y>#states do.states=: y{.states,y#default
end.states
}}
pad=: {{if. 0=#y do.y=.#states end.y}}
function=: {{ NB. x: before, m: op, n: symbol, y: after
y[states=: (y,m) (<x,n)} extend 1+x>.y=.pad y
}}
{{for_op.y do.(op)=: op_index function end.0}};:'nop init start'
all=: {{y=.pad y
for_symbol.i.width do.
x symbol nop y
end.y
}}
any=: {{y=.pad y
for_symbol.i.width do.
x symbol start y
end.y
}}
 
NB. identifiers and keywords
L0 letter nop L0
L0 digit nop L0
 
NB. numbers
D0 digit nop D0
D0 letter nop D0
 
NB. white space
S0 space nop S0
 
NB. comments
C1=: C0 comment nop ''
C2=: C1 all ''
C2 all C2
C3=: C2 commen nop ''
C4=: C3 comment nop ''
 
NB. quoted characters
q1=: q0 any ''
 
NB. strings
Q1=: Q0 all ''
Q1 all Q1
Q2=: Q1 dquote nop ''
Q0 dquote nop Q2
 
tokenize=:{{
tok=. (0;states;symbols);:y
for_fix.cut'<= >= == != && ||'do.
M=.;:;fix
for_k.|.I.M E.tok do.
tok=.(fix,<'') (0 1+k)} tok
end.
end.tok-.a:
}}
 
(tknames=:;: {{)n
Op_multiply Op_divide Op_mod Op_add Op_subtract Op_less Op_lessequal
Op_greater Op_greaterequal Op_equal Op_notequal Op_not Op_and Op_or
Op_assign LeftParen RightParen Keyword_if LeftBrace Keyword_else
RightBrace Keyword_while Semicolon Keyword_print Comma Keyword_putc
}}-.LF)=: tkref=: tokenize '*/%+-<<=>>===!=!&&||=()if{else}while;print,putc'
NB. the reference tokens here were arranged to avoid whitespace tokens
NB. also, we reserve multiple token instances where a literal string
NB. appears in different syntactic productions. Here, we only use the initial
NB. instances -- the others will be used in the syntax analyzer which
NB. uses the same tkref and tknames.
 
shift=: |.!.0
numvals=: {{
ndx=. I.(0<#@>y)**/@> y e.L:0 '0123456789'
({{".y,'x'}}each ndx{y) ndx} y
}}
chrvals=: {{
q=. y=<,''''
s=. y=<,'\'
j=. I.(-.s)*(1&shift * _1&shift)q
k=. I.(y e.;:'\n')*(1 shift q)*(_2 shift q)*_1 shift s
jvals=. a.i.L:0 j{y NB. not escaped
kvals=. (k{s){<"0 a.i.LF,'\' NB. escaped
(,a:,jvals,:a:) (,_1 0 1+/j)} (,a:,a:,kvals,:a:) (,_2 _1 0 1+/k)} y
}}
 
validstring=: ((1<#)*('"'={.)*('"'={:)*('\'=])-:'\n'&E.(+._1&shift)@+.'\\'&E.) every
 
validid=: ((<,'\')~:_1&|.) * (e.&tkref) < (e.&(u:I.symbols=letter)@{. * */@(e.&(u:I.symbols e.letter,digit))@}.) every
 
lex=: {{
lineref=.I.y=LF
tokens=.(tokenize y),<,'_'
offsets=.0,}:#@;\tokens
lines=. lineref I.offsets
columns=. offsets-lines{0,lineref
keep=. -.({.@> tokens)e.u:I.space=symbols
names=. (<'End_of_input') _1} (tkref i.tokens) {(_3}.tknames),4#<'Error'
unknown=. names=<'Error'
values=. a: _1} unknown#inv numvals chrvals unknown#tokens
names=. (<'Integer') (I.(values~:a:)*tokens~:values)} names
names=. (<'String') (I.validstring tokens)} names
names=. (<'Identifier') (I.validid tokens)} names
names=. (<'End_of_input') _1} names
comments=. '*/'&-:@(_2&{.)@> tokens
whitespace=. (values=tokens) * e.&(' ',LF)@{.@> tokens
keep=. (tokens~:<,'''')*-.comments+.whitespace+.unknown*a:=values
keep&#each ((1+lines),.columns);<names,.values
}}</syntaxhighlight>
 
Test case 3:
 
<syntaxhighlight lang="j">
flex=: {{
'A B'=.y
'names values'=.|:":each B
(":A),.' ',.names,.' ',.values
}}@lex
 
testcase3=: {{)n
/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
*/
/* Print */ print /* Sub */ -
/* Putc */ putc /* Lss */ <
/* If */ if /* Gtr */ >
/* Else */ else /* Leq */ <=
/* While */ while /* Geq */ >=
/* Lbrace */ { /* Eq */ ==
/* Rbrace */ } /* Neq */ !=
/* Lparen */ ( /* And */ &&
/* Rparen */ ) /* Or */ ||
/* Uminus */ - /* Semi */ ;
/* Not */ ! /* Comma */ ,
/* Mul */ * /* Assign */ =
/* Div */ / /* Integer */ 42
/* Mod */ % /* String */ "String literal"
/* Add */ + /* Ident */ variable_name
/* character literal */ '\n'
/* character literal */ '\\'
/* character literal */ ' '
}}
 
flex testcase3
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 28 Identifier 10
21 28 Integer 92
22 27 Integer 32
23 1 End_of_input </syntaxhighlight>
 
Here, it seems expedient to retain a structured representation of the lexical result. As shown, it's straightforward to produce a "pure" textual result for a hypothetical alternative implementation of the syntax analyzer, but the structured representation will be easier to deal with.
 
=={{header|Java}}==
<syntaxhighlight lang="java">
// Translated from python source
 
import java.io.File;
import java.io.FileNotFoundException;
import java.util.HashMap;
import java.util.Map;
import java.util.Scanner;
 
public class Lexer {
private int line;
private int pos;
private int position;
private char chr;
private String s;
Map<String, TokenType> keywords = new HashMap<>();
static class Token {
public TokenType tokentype;
public String value;
public int line;
public int pos;
Token(TokenType token, String value, int line, int pos) {
this.tokentype = token; this.value = value; this.line = line; this.pos = pos;
}
@Override
public String toString() {
String result = String.format("%5d %5d %-15s", this.line, this.pos, this.tokentype);
switch (this.tokentype) {
case Integer:
result += String.format(" %4s", value);
break;
case Identifier:
result += String.format(" %s", value);
break;
case String:
result += String.format(" \"%s\"", value);
break;
}
return result;
}
}
static enum TokenType {
End_of_input, Op_multiply, Op_divide, Op_mod, Op_add, Op_subtract,
Op_negate, Op_not, Op_less, Op_lessequal, Op_greater, Op_greaterequal,
Op_equal, Op_notequal, Op_assign, Op_and, Op_or, Keyword_if,
Keyword_else, Keyword_while, Keyword_print, Keyword_putc, LeftParen, RightParen,
LeftBrace, RightBrace, Semicolon, Comma, Identifier, Integer, String
}
static void error(int line, int pos, String msg) {
if (line > 0 && pos > 0) {
System.out.printf("%s in line %d, pos %d\n", msg, line, pos);
} else {
System.out.println(msg);
}
System.exit(1);
}
 
Lexer(String source) {
this.line = 1;
this.pos = 0;
this.position = 0;
this.s = source;
this.chr = this.s.charAt(0);
this.keywords.put("if", TokenType.Keyword_if);
this.keywords.put("else", TokenType.Keyword_else);
this.keywords.put("print", TokenType.Keyword_print);
this.keywords.put("putc", TokenType.Keyword_putc);
this.keywords.put("while", TokenType.Keyword_while);
}
Token follow(char expect, TokenType ifyes, TokenType ifno, int line, int pos) {
if (getNextChar() == expect) {
getNextChar();
return new Token(ifyes, "", line, pos);
}
if (ifno == TokenType.End_of_input) {
error(line, pos, String.format("follow: unrecognized character: (%d) '%c'", (int)this.chr, this.chr));
}
return new Token(ifno, "", line, pos);
}
Token char_lit(int line, int pos) {
char c = getNextChar(); // skip opening quote
int n = (int)c;
if (c == '\'') {
error(line, pos, "empty character constant");
} else if (c == '\\') {
c = getNextChar();
if (c == 'n') {
n = 10;
} else if (c == '\\') {
n = '\\';
} else {
error(line, pos, String.format("unknown escape sequence \\%c", c));
}
}
if (getNextChar() != '\'') {
error(line, pos, "multi-character constant");
}
getNextChar();
return new Token(TokenType.Integer, "" + n, line, pos);
}
Token string_lit(char start, int line, int pos) {
String result = "";
while (getNextChar() != start) {
if (this.chr == '\u0000') {
error(line, pos, "EOF while scanning string literal");
}
if (this.chr == '\n') {
error(line, pos, "EOL while scanning string literal");
}
result += this.chr;
}
getNextChar();
return new Token(TokenType.String, result, line, pos);
}
Token div_or_comment(int line, int pos) {
if (getNextChar() != '*') {
return new Token(TokenType.Op_divide, "", line, pos);
}
getNextChar();
while (true) {
if (this.chr == '\u0000') {
error(line, pos, "EOF in comment");
} else if (this.chr == '*') {
if (getNextChar() == '/') {
getNextChar();
return getToken();
}
} else {
getNextChar();
}
}
}
Token identifier_or_integer(int line, int pos) {
boolean is_number = true;
String text = "";
while (Character.isAlphabetic(this.chr) || Character.isDigit(this.chr) || this.chr == '_') {
text += this.chr;
if (!Character.isDigit(this.chr)) {
is_number = false;
}
getNextChar();
}
if (text.equals("")) {
error(line, pos, String.format("identifier_or_integer unrecognized character: (%d) %c", (int)this.chr, this.chr));
}
if (Character.isDigit(text.charAt(0))) {
if (!is_number) {
error(line, pos, String.format("invalid number: %s", text));
}
return new Token(TokenType.Integer, text, line, pos);
}
if (this.keywords.containsKey(text)) {
return new Token(this.keywords.get(text), "", line, pos);
}
return new Token(TokenType.Identifier, text, line, pos);
}
Token getToken() {
int line, pos;
while (Character.isWhitespace(this.chr)) {
getNextChar();
}
line = this.line;
pos = this.pos;
switch (this.chr) {
case '\u0000': return new Token(TokenType.End_of_input, "", this.line, this.pos);
case '/': return div_or_comment(line, pos);
case '\'': return char_lit(line, pos);
case '<': return follow('=', TokenType.Op_lessequal, TokenType.Op_less, line, pos);
case '>': return follow('=', TokenType.Op_greaterequal, TokenType.Op_greater, line, pos);
case '=': return follow('=', TokenType.Op_equal, TokenType.Op_assign, line, pos);
case '!': return follow('=', TokenType.Op_notequal, TokenType.Op_not, line, pos);
case '&': return follow('&', TokenType.Op_and, TokenType.End_of_input, line, pos);
case '|': return follow('|', TokenType.Op_or, TokenType.End_of_input, line, pos);
case '"': return string_lit(this.chr, line, pos);
case '{': getNextChar(); return new Token(TokenType.LeftBrace, "", line, pos);
case '}': getNextChar(); return new Token(TokenType.RightBrace, "", line, pos);
case '(': getNextChar(); return new Token(TokenType.LeftParen, "", line, pos);
case ')': getNextChar(); return new Token(TokenType.RightParen, "", line, pos);
case '+': getNextChar(); return new Token(TokenType.Op_add, "", line, pos);
case '-': getNextChar(); return new Token(TokenType.Op_subtract, "", line, pos);
case '*': getNextChar(); return new Token(TokenType.Op_multiply, "", line, pos);
case '%': getNextChar(); return new Token(TokenType.Op_mod, "", line, pos);
case ';': getNextChar(); return new Token(TokenType.Semicolon, "", line, pos);
case ',': getNextChar(); return new Token(TokenType.Comma, "", line, pos);
default: return identifier_or_integer(line, pos);
}
}
char getNextChar() {
this.pos++;
this.position++;
if (this.position >= this.s.length()) {
this.chr = '\u0000';
return this.chr;
}
this.chr = this.s.charAt(this.position);
if (this.chr == '\n') {
this.line++;
this.pos = 0;
}
return this.chr;
}
 
void printTokens() {
Token t;
while ((t = getToken()).tokentype != TokenType.End_of_input) {
System.out.println(t);
}
System.out.println(t);
}
public static void main(String[] args) {
if (args.length > 0) {
try {
File f = new File(args[0]);
Scanner s = new Scanner(f);
String source = " ";
while (s.hasNext()) {
source += s.nextLine() + "\n";
}
Lexer l = new Lexer(source);
l.printTokens();
} catch(FileNotFoundException e) {
error(-1, -1, "Exception: " + e.getMessage());
}
} else {
error(-1, -1, "No args");
}
}
}
</syntaxhighlight>
 
=={{header|JavaScript}}==
{{incorrect|Javascript|Please show output. Code is identical to [[Compiler/syntax_analyzer]] task}}
<syntaxhighlight lang="javascript">
/*
Token: type, value, line, pos
*/
 
const TokenType = {
Keyword_if: 1, Keyword_else: 2, Keyword_print: 3, Keyword_putc: 4, Keyword_while: 5,
Op_add: 6, Op_and: 7, Op_assign: 8, Op_divide: 9, Op_equal: 10, Op_greater: 11,
Op_greaterequal: 12, Op_less: 13, Op_Lessequal: 14, Op_mod: 15, Op_multiply: 16, Op_not: 17,
Op_notequal: 18, Op_or: 19, Op_subtract: 20,
Integer: 21, String: 22, Identifier: 23,
Semicolon: 24, Comma: 25,
LeftBrace: 26, RightBrace: 27,
LeftParen: 28, RightParen: 29,
End_of_input: 99
}
 
class Lexer {
constructor(source) {
this.source = source
this.pos = 1 // position in line
this.position = 0 // position in source
this.line = 1
this.chr = this.source.charAt(0)
this.keywords = {
"if": TokenType.Keyword_if,
"else": TokenType.Keyword_else,
"print": TokenType.Keyword_print,
"putc": TokenType.Keyword_putc,
"while": TokenType.Keyword_while
}
}
getNextChar() {
this.pos++
this.position++
if (this.position >= this.source.length) {
this.chr = undefined
return this.chr
}
this.chr = this.source.charAt(this.position)
if (this.chr === '\n') {
this.line++
this.pos = 0
}
return this.chr
}
error(line, pos, message) {
if (line > 0 && pos > 0) {
console.log(message + " in line " + line + ", pos " + pos + "\n")
} else {
console.log(message)
}
process.exit(1)
}
follow(expect, ifyes, ifno, line, pos) {
if (this.getNextChar() === expect) {
this.getNextChar()
return { type: ifyes, value: "", line, pos }
}
if (ifno === TokenType.End_of_input) {
this.error(line, pos, "follow: unrecognized character: (" + this.chr.charCodeAt(0) + ") '" + this.chr + "'")
}
return { type: ifno, value: "", line, pos }
}
div_or_comment(line, pos) {
if (this.getNextChar() !== '*') {
return { type: TokenType.Op_divide, value: "/", line, pos }
}
this.getNextChar()
while (true) {
if (this.chr === undefined) {
this.error(line, pos, "EOF in comment")
} else if (this.chr === '*') {
if (this.getNextChar() === '/') {
this.getNextChar()
return this.getToken()
}
} else {
this.getNextChar()
}
}
}
char_lit(line, pos) {
let c = this.getNextChar() // skip opening quote
let n = c.charCodeAt(0)
if (c === "\'") {
this.error(line, pos, "empty character constant")
} else if (c === "\\") {
c = this.getNextChar()
if (c == "n") {
n = 10
} else if (c === "\\") {
n = 92
} else {
this.error(line, pos, "unknown escape sequence \\" + c)
}
}
if (this.getNextChar() !== "\'") {
this.error(line, pos, "multi-character constant")
}
this.getNextChar()
return { type: TokenType.Integer, value: n, line, pos }
}
string_lit(start, line, pos) {
let value = ""
while (this.getNextChar() !== start) {
if (this.chr === undefined) {
this.error(line, pos, "EOF while scanning string literal")
}
if (this.chr === "\n") {
this.error(line, pos, "EOL while scanning string literal")
}
value += this.chr
}
this.getNextChar()
return { type: TokenType.String, value, line, pos }
}
identifier_or_integer(line, pos) {
let is_number = true
let text = ""
while (/\w/.test(this.chr) || this.chr === '_') {
text += this.chr
if (!/\d/.test(this.chr)) {
is_number = false
}
this.getNextChar()
}
if (text === "") {
this.error(line, pos, "identifier_or_integer unrecognized character: (" + this.chr.charCodeAt(0) + ") '" + this.chr + "'")
}
if (/\d/.test(text.charAt(0))) {
if (!is_number) {
this.error(line, pos, "invalid number: " + text)
}
return { type: TokenType.Integer, value: text, line, pos }
}
if (text in this.keywords) {
return { type: this.keywords[text], value: "", line, pos }
}
return { type: TokenType.Identifier, value: text, line, pos }
}
getToken() {
let pos, line
// Ignore whitespaces
while (/\s/.test(this.chr)) { this.getNextChar() }
line = this.line; pos = this.pos
switch (this.chr) {
case undefined: return { type: TokenType.End_of_input, value: "", line: this.line, pos: this.pos }
case "/": return this.div_or_comment(line, pos)
case "\'": return this.char_lit(line, pos)
case "\"": return this.string_lit(this.chr, line, pos)
 
case "<": return this.follow("=", TokenType.Op_lessequal, TokenType.Op_less, line, pos)
case ">": return this.follow("=", TokenType.Op_greaterequal, TokenType.Op_greater, line, pos)
case "=": return this.follow("=", TokenType.Op_equal, TokenType.Op_assign, line, pos)
case "!": return this.follow("=", TokenType.Op_notequal, TokenType.Op_not, line, pos)
case "&": return this.follow("&", TokenType.Op_and, TokenType.End_of_input, line, pos)
case "|": return this.follow("|", TokenType.Op_or, TokenType.End_of_input, line, pos)
 
case "{": this.getNextChar(); return { type: TokenType.LeftBrace, value: "{", line, pos }
case "}": this.getNextChar(); return { type: TokenType.RightBrace, value: "}", line, pos }
case "(": this.getNextChar(); return { type: TokenType.LeftParen, value: "(", line, pos }
case ")": this.getNextChar(); return { type: TokenType.RightParen, value: ")", line, pos }
case "+": this.getNextChar(); return { type: TokenType.Op_add, value: "+", line, pos }
case "-": this.getNextChar(); return { type: TokenType.Op_subtract, value: "-", line, pos }
case "*": this.getNextChar(); return { type: TokenType.Op_multiply, value: "*", line, pos }
case "%": this.getNextChar(); return { type: TokenType.Op_mod, value: "%", line, pos }
case ";": this.getNextChar(); return { type: TokenType.Semicolon, value: ";", line, pos }
case ",": this.getNextChar(); return { type: TokenType.Comma, value: ",", line, pos }
 
default: return this.identifier_or_integer(line, pos)
}
}
/*
https://stackoverflow.com/questions/9907419/how-to-get-a-key-in-a-javascript-object-by-its-value
*/
getTokenType(value) {
return Object.keys(TokenType).find(key => TokenType[key] === value)
}
printToken(t) {
let result = (" " + t.line).substr(t.line.toString().length)
result += (" " + t.pos).substr(t.pos.toString().length)
result += (" " + this.getTokenType(t.type) + " ").substr(0, 16)
switch (t.type) {
case TokenType.Integer:
result += " " + t.value
break;
case TokenType.Identifier:
result += " " + t.value
break;
case TokenType.String:
result += " \""+ t.value + "\""
break;
}
console.log(result)
}
printTokens() {
let t
while ((t = this.getToken()).type !== TokenType.End_of_input) {
this.printToken(t)
}
this.printToken(t)
}
}
const fs = require("fs")
fs.readFile(process.argv[2], "utf8", (err, data) => {
l = new Lexer(data)
l.printTokens()
})
</syntaxhighlight>
 
=={{header|Julia}}==
<syntaxhighlight lang="julia">struct Tokenized
startline::Int
startcol::Int
name::String
value::Union{Nothing, Int, String}
end
 
const optokens = Dict("*" => "Op_multiply", "/" => "Op_divide", "%" => "Op_mod", "+" => "Op_add",
"-" => "Op_subtract", "!" => "Op_not", "<" => "Op_less", "<=" => "Op_lessequal",
">" => "Op_greater", ">=" => "Op_greaterequal", "==" => "Op_equal", "!=" => "Op_notequal",
"!" => "Op_not", "=" => "Op_assign", "&&" => "Op_and", "||" => "Op_or")
 
const keywordtokens = Dict("if" => "Keyword_if", "else" => "Keyword_else", "while" => "Keyword_while",
"print" => "Keyword_print", "putc" => "Keyword_putc")
 
const symboltokens = Dict("(" => "LeftParen", ")" => "RightParen", "{" => "LeftBrace",
"}" => "RightBrace", ";" => "Semicolon", "," => "Comma")
 
const errors = ["Empty character constant.", "Unknown escape sequence.", "Multi-character constant.",
"End-of-file in comment. Closing comment characters not found.",
"End-of-file while scanning string literal. Closing string character not found.",
"End-of-line while scanning string literal. Closing string character not found before end-of-line.",
"Unrecognized character.", "Invalid number. Starts like a number, but ends in non-numeric characters."]
 
asws(s) = (nnl = length(findall(x->x=='\n', s)); " " ^ (length(s) - nnl) * "\n" ^ nnl)
comment2ws(t) = (while occursin("/*", t) t = replace(t, r"\/\* .+? (?: \*\/)"xs => asws; count = 1) end; t)
hasinvalidescapes(t) = ((m = match(r"\\.", t)) != nothing && m.match != "\\\\" && m.match != "\\n")
hasemptycharconstant(t) = (match(r"\'\'", t) != nothing)
hasmulticharconstant(t) = ((m = match(r"\'[^\'][^\']+\'", t)) != nothing && m.match != "\'\\\\\'" && m.match != "\'\\n\'")
hasunbalancedquotes(t) = isodd(length(findall(x -> x == '\"', t)))
hasunrecognizedchar(t) = match(r"[^\w\s\d\*\/\%\+\-\<\>\=\!\&\|\(\)\{\}\;\,\"\'\\]", t) != nothing
 
function throwiferror(line, n)
if hasemptycharconstant(line)
throw("Tokenizer error line $n: " * errors[1])
end
if hasinvalidescapes(line)
throw("Tokenizer error line $n: " * errors[2])
end
if hasmulticharconstant(line)
println("error at ", match(r"\'[^\'][^\']+\'", line).match)
throw("Tokenizer error line $n: " * errors[3])
end
if occursin("/*", line)
throw("Tokenizer error line $n: " * errors[4])
end
if hasunrecognizedchar(line)
throw("Tokenizer error line $n: " * errors[7])
end
end
 
function tokenize(txt)
tokens = Vector{Tokenized}()
txt = comment2ws(txt)
lines = split(txt, "\n")
if hasunbalancedquotes(txt)
throw("Tokenizer error: $(errors[5])")
end
for (startline, line) in enumerate(lines)
if strip(line) == ""
continue
end
throwiferror(line, startline)
lastc = Char(0)
withintoken = 0
for (startcol, c) in enumerate(line)
if withintoken > 0
withintoken -= 1
continue
elseif isspace(c[1])
continue
elseif (c == '=') && (startcol > 1) && ((c2 = line[startcol - 1]) in ['<', '>', '=', '!'])
tokens[end] = Tokenized(startline, startcol - 1, optokens[c2 * c], nothing)
elseif (c == '&') || (c == '|')
if length(line) > startcol && line[startcol + 1] == c
push!(tokens, Tokenized(startline, startcol, optokens[c * c], nothing))
withintoken = 1
else
throw("Tokenizer error line $startline: $(errors[7])")
end
elseif haskey(optokens, string(c))
push!(tokens, Tokenized(startline, startcol, optokens[string(c)], nothing))
elseif haskey(symboltokens, string(c))
push!(tokens, Tokenized(startline, startcol, symboltokens[string(c)], nothing))
elseif isdigit(c)
integerstring = match(r"^\d+", line[startcol:end]).match
pastnumposition = startcol + length(integerstring)
if (pastnumposition <= length(line)) && isletter(line[pastnumposition])
throw("Tokenizer error line $startline: " * errors[8])
end
i = parse(Int, integerstring)
push!(tokens, Tokenized(startline, startcol, "Integer", i))
withintoken = length(integerstring) - 1
elseif c == Char(39) # single quote
if (m = match(r"([^\\\'\n]|\\n|\\\\)\'", line[startcol+1:end])) != nothing
chs = m.captures[1]
i = (chs == "\\n") ? Int('\n') : (chs == "\\\\" ? Int('\\') : Int(chs[1]))
push!(tokens, Tokenized(startline, startcol, "Integer", i))
withintoken = length(chs) + 1
else
println("line $startline: bad match with ", line[startcol+1:end])
end
elseif c == Char(34) # double quote
if (m = match(r"([^\"\n]+)\"", line[startcol+1:end])) == nothing
throw("Tokenizer error line $startline: $(errors[6])")
end
litstring = m.captures[1]
push!(tokens, Tokenized(startline, startcol, "String", "\"$litstring\""))
withintoken = length(litstring) + 1
elseif (cols = findfirst(r"[a-zA-Z]+", line[startcol:end])) != nothing
litstring = line[cols .+ startcol .- 1]
if haskey(keywordtokens, string(litstring))
push!(tokens, Tokenized(startline, startcol, keywordtokens[litstring], nothing))
else
litstring = match(r"[_a-zA-Z0-9]+", line[startcol:end]).match
push!(tokens, Tokenized(startline, startcol, "Identifier", string(litstring)))
end
withintoken = length(litstring) - 1
end
lastc = c
end
end
push!(tokens, Tokenized(length(lines), length(lines[end]) + 1, "End_of_input", nothing))
tokens
end
 
const test3txt = raw"""
/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
*/
/* Print */ print /* Sub */ -
/* Putc */ putc /* Lss */ <
/* If */ if /* Gtr */ >
/* Else */ else /* Leq */ <=
/* While */ while /* Geq */ >=
/* Lbrace */ { /* Eq */ ==
/* Rbrace */ } /* Neq */ !=
/* Lparen */ ( /* And */ &&
/* Rparen */ ) /* Or */ ||
/* Uminus */ - /* Semi */ ;
/* Not */ ! /* Comma */ ,
/* Mul */ * /* Assign */ =
/* Div */ / /* Integer */ 42
/* Mod */ % /* String */ "String literal"
/* Add */ + /* Ident */ variable_name
/* character literal */ '\n'
/* character literal */ '\\'
/* character literal */ ' '
"""
 
println("Line Col Name Value")
for tok in tokenize(test3txt)
println(lpad(tok.startline, 3), lpad(tok.startcol, 5), lpad(tok.name, 18), " ", tok.value != nothing ? tok.value : "")
end
</syntaxhighlight>{{output}}<pre>
Line Col Name Value
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input
</pre>
=={{header|Kotlin}}==
{{trans|Java}}
<syntaxhighlight lang="kotlin">// Input: command line argument of file to process or console input. A two or
// three character console input of digits followed by a new line will be
// checked for an integer between zero and twenty-five to select a fixed test
// case to run. Any other console input will be parsed.
 
// Code based on the Java version found here:
// https://rosettacode.org/mw/index.php?title=Compiler/lexical_analyzer&action=edit&section=22
 
// Class to halt the parsing with an exception.
class ParsingFailed(message: String): Exception(message)
 
// Enum class of the tokens supported by this scanner.
enum class TokenType {
Tk_End_of_input, Op_multiply, Op_divide, Op_mod, Op_add, Op_subtract,
Op_negate, Op_not, Op_less, Op_lessequal, Op_greater, Op_greaterequal,
Op_equal, Op_notequal, Op_assign, Op_and, Op_or, Kw_if,
Kw_else, Kw_while, Kw_print, Kw_putc, Sy_LeftParen, Sy_RightParen,
Sy_LeftBrace, Sy_RightBrace, Sy_Semicolon, Sy_Comma, Tk_Identifier,
Tk_Integer, Tk_String;
 
override fun toString() =
listOf("End_of_input", "Op_multiply", "Op_divide", "Op_mod", "Op_add",
"Op_subtract", "Op_negate", "Op_not", "Op_less", "Op_lessequal",
"Op_greater", "Op_greaterequal", "Op_equal", "Op_notequal",
"Op_assign", "Op_and", "Op_or", "Keyword_if", "Keyword_else",
"Keyword_while", "Keyword_print", "Keyword_putc", "LeftParen",
"RightParen", "LeftBrace", "RightBrace", "Semicolon", "Comma",
"Identifier", "Integer", "String")[this.ordinal]
} // TokenType
 
// Data class of tokens returned by the scanner.
data class Token(val token: TokenType, val value: String, val line: Int,
val pos: Int) {
 
// Overridden method to display the token.
override fun toString() =
"%5d %5d %-15s %s".format(line, pos, this.token,
when (this.token) {
TokenType.Tk_Integer, TokenType.Tk_Identifier ->
" %s".format(this.value)
TokenType.Tk_String ->
this.value.toList().joinToString("", " \"", "\"") {
when (it) {
'\t' ->
"\\t"
'\n' ->
"\\n"
'\u000b' ->
"\\v"
'\u000c' ->
"\\f"
'\r' ->
"\\r"
'"' ->
"\\\""
'\\' ->
"\\"
in ' '..'~' ->
"$it"
else ->
"\\u%04x".format(it.code) } }
else ->
"" } )
} // Token
 
// Function to display an error message and halt the scanner.
fun error(line: Int, pos: Int, msg: String): Nothing =
throw ParsingFailed("(%d, %d) %s\n".format(line, pos, msg))
 
// Class to process the source into tokens with properties of the
// source string, the line number, the column position, the index
// within the source string, the current character being processed,
// and a map of the keyword strings to the corresponding token types.
class Lexer(private val s: String) {
private var line = 1
private var pos = 1
private var position = 0
private var chr =
if (s.isEmpty())
' '
else
s[0]
private val keywords = mapOf<String, TokenType>(
"if" to TokenType.Kw_if,
"else" to TokenType.Kw_else,
"print" to TokenType.Kw_print,
"putc" to TokenType.Kw_putc,
"while" to TokenType.Kw_while)
 
// Method to retrieve the next character from the source. Returns the NUL
// character ('\u0000') once the end of the source has been reached.
private fun getNextChar() =
if (++this.position >= this.s.length) {
this.pos++
this.chr = '\u0000'
this.chr
} else {
this.pos++
this.chr = this.s[this.position]
when (this.chr) {
'\n' -> {
this.line++
this.pos = 0
} // line
'\t' ->
while (this.pos%8 != 1)
this.pos++
} // when
this.chr
} // if
 
// Method to return the division token, skip the comment, or handle the
// error.
private fun div_or_comment(line: Int, pos: Int): Token =
if (getNextChar() != '*')
Token(TokenType.Op_divide, "", line, pos);
else {
getNextChar() // Skip comment start
outer@
while (true)
when (this.chr) {
'\u0000' ->
error(line, pos, "Lexer: EOF in comment");
'*' ->
if (getNextChar() == '/') {
getNextChar() // Skip comment end
break@outer
} // if
else ->
getNextChar()
} // when
getToken()
} // if
 
// Method to verify a character literal. Return the token or handle the
// error.
private fun char_lit(line: Int, pos: Int): Token {
var c = getNextChar() // skip opening quote
when (c) {
'\'' ->
error(line, pos, "Lexer: Empty character constant");
'\\' ->
c = when (getNextChar()) {
'n' ->
10.toChar()
'\\' ->
'\\'
'\'' ->
'\''
else ->
error(line, pos, "Lexer: Unknown escape sequence '\\%c'".
format(this.chr)) }
} // when
if (getNextChar() != '\'')
error(line, pos, "Lexer: Multi-character constant")
getNextChar() // Skip closing quote
return Token(TokenType.Tk_Integer, c.code.toString(), line, pos)
} // char_lit
 
// Method to check next character to see whether it belongs to the token
// we might be in the middle of. Return the correct token or handle the
// error.
private fun follow(expect: Char, ifyes: TokenType, ifno: TokenType,
line: Int, pos: Int): Token =
when {
getNextChar() == expect -> {
getNextChar()
Token(ifyes, "", line, pos)
} // matches
ifno == TokenType.Tk_End_of_input ->
error(line, pos,
"Lexer: %c expected: (%d) '%c'".format(expect,
this.chr.code, this.chr))
else ->
Token(ifno, "", line, pos)
} // when
 
// Method to verify a character string. Return the token or handle the
// error.
private fun string_lit(start: Char, line: Int, pos: Int): Token {
var result = ""
while (getNextChar() != start)
when (this.chr) {
'\u0000' ->
error(line, pos, "Lexer: EOF while scanning string literal")
'\n' ->
error(line, pos, "Lexer: EOL while scanning string literal")
'\\' ->
when (getNextChar()) {
'\\' ->
result += '\\'
'n' ->
result += '\n'
'"' ->
result += '"'
else ->
error(line, pos, "Lexer: Escape sequence unknown '\\%c'".
format(this.chr))
} // when
else ->
result += this.chr
} // when
getNextChar() // Toss closing quote
return Token(TokenType.Tk_String, result, line, pos)
} // string_lit
 
// Method to retrieve an identifier or integer. Return the keyword
// token if the string matches a keyword, the integer token if the
// string is all digits, or the identifier token if the string is a
// valid identifier. Otherwise, handle the error.
private fun identifier_or_integer(line: Int, pos: Int): Token {
var is_number = true
var text = ""
while (this.chr in listOf('_')+('0'..'9')+('a'..'z')+('A'..'Z')) {
text += this.chr
is_number = is_number && this.chr in '0'..'9'
getNextChar()
} // while
if (text.isEmpty())
error(line, pos, "Lexer: Unrecognized character: (%d) %c".
format(this.chr.code, this.chr))
return when {
text[0] in '0'..'9' ->
if (!is_number)
error(line, pos, "Lexer: Invalid number: %s".
format(text))
else {
val max = Int.MAX_VALUE.toString()
if (text.length > max.length || (text.length == max.length &&
max < text))
error(line, pos,
"Lexer: Number exceeds maximum value %s".
format(text))
Token(TokenType.Tk_Integer, text, line, pos)
} // if
this.keywords.containsKey(text) ->
Token(this.keywords[text]!!, "", line, pos)
else ->
Token(TokenType.Tk_Identifier, text, line, pos) }
} // identifier_or_integer
 
// Method to skip whitespace (both C's whitespace characters and other
// Unicode whitespace) and retrieve the next token.
private fun getToken(): Token {
while (this.chr in listOf('\t', '\n', '\u000b', '\u000c', '\r', ' ') ||
this.chr.isWhitespace())
getNextChar()
val line = this.line
val pos = this.pos
return when (this.chr) {
'\u0000' ->
Token(TokenType.Tk_End_of_input, "", line, pos)
'/' ->
div_or_comment(line, pos)
'\'' ->
char_lit(line, pos)
'<' ->
follow('=', TokenType.Op_lessequal, TokenType.Op_less, line, pos)
'>' ->
follow('=', TokenType.Op_greaterequal, TokenType.Op_greater, line, pos)
'=' ->
follow('=', TokenType.Op_equal, TokenType.Op_assign, line, pos)
'!' ->
follow('=', TokenType.Op_notequal, TokenType.Op_not, line, pos)
'&' ->
follow('&', TokenType.Op_and, TokenType.Tk_End_of_input, line, pos)
'|' ->
follow('|', TokenType.Op_or, TokenType.Tk_End_of_input, line, pos)
'"' ->
string_lit(this.chr, line, pos)
'{' -> {
getNextChar()
Token(TokenType.Sy_LeftBrace, "", line, pos)
} // open brace
'}' -> {
getNextChar()
Token(TokenType.Sy_RightBrace, "", line, pos)
} // close brace
'(' -> {
getNextChar()
Token(TokenType.Sy_LeftParen, "", line, pos)
} // open paren
')' -> {
getNextChar()
Token(TokenType.Sy_RightParen, "", line, pos)
} // close paren
'+' -> {
getNextChar()
Token(TokenType.Op_add, "", line, pos)
} // plus
'-' -> {
getNextChar()
Token(TokenType.Op_subtract, "", line, pos)
} // dash
'*' -> {
getNextChar()
Token(TokenType.Op_multiply, "", line, pos)
} // asterisk
'%' -> {
getNextChar()
Token(TokenType.Op_mod, "", line, pos)
} // percent
';' -> {
getNextChar()
Token(TokenType.Sy_Semicolon, "", line, pos)
} // semicolon
',' -> {
getNextChar()
Token(TokenType.Sy_Comma, "", line, pos)
} // comma
else ->
identifier_or_integer(line, pos) }
} // getToken
 
// Method to parse and display tokens.
fun printTokens() {
do {
val t: Token = getToken()
println(t)
} while (t.token != TokenType.Tk_End_of_input)
} // printTokens
} // Lexer
 
 
// Function to run all of the working test cases from the website and to
// exercise each of the error messages this program supports.
fun tests(number: Int) {
 
// Function to generate test case 0 source: Hello World/Text.
fun hello() {
Lexer(
"""/*
Hello world
*/
print("Hello, World!\n");
""").printTokens()
} // hello
 
// Function to generate test case 1 source: Phoenix Number.
fun phoenix() {
Lexer(
"""/*
Show Ident and Integers
*/
phoenix_number = 142857;
print(phoenix_number, "\n");""").printTokens()
} // phoenix
 
// Function to generate test case 2 source: All Symbols.
fun symbols() {
Lexer(
"""/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
*/
/* Print */ print /* Sub */ -
/* Putc */ putc /* Lss */ <
/* If */ if /* Gtr */ >
/* Else */ else /* Leq */ <=
/* While */ while /* Geq */ >=
/* Lbrace */ { /* Eq */ ==
/* Rbrace */ } /* Neq */ !=
/* Lparen */ ( /* And */ &&
/* Rparen */ ) /* Or */ ||
/* Uminus */ - /* Semi */ ;
/* Not */ ! /* Comma */ ,
/* Mul */ * /* Assign */ =
/* Div */ / /* Integer */ 42
/* Mod */ % /* String */ "String literal"
/* Add */ + /* Ident */ variable_name
/* character literal */ '\n'
/* character literal */ '\\'
/* character literal */ ' '""").printTokens()
} // symbols
 
// Function to generate test case 3 source: Test Case 4.
fun four() {
Lexer(
"""/*** test printing, embedded \n and comments with lots of '*' ***/
print(42);
print("\nHello World\nGood Bye\nok\n");
print("Print a slash n - \\n.\n");""").printTokens()
} // four
 
// Function to generate test case 4 source: Count.
fun count() {
Lexer(
"""count = 1;
while (count < 10) {
print("count is: ", count, "\n");
count = count + 1;
}""").printTokens()
} // count
 
// Function to generate test case 5 source: 100 Doors.
fun doors() {
Lexer(
"""/* 100 Doors */
i = 1;
while (i * i <= 100) {
print("door ", i * i, " is open\n");
i = i + 1;
}""").printTokens()
} // doors
 
// Function to generate test case 6 source: Negative Tests.
fun negative() {
Lexer(
"""a = (-1 * ((-1 * (5 * 15)) / 10));
print(a, "\n");
b = -a;
print(b, "\n");
print(-b, "\n");
print(-(1), "\n");""").printTokens()
} // negative
 
// Function to generate test case 7 source: Deep.
fun deep() {
Lexer(
"""print(---------------------------------+++5, "\n");
print(((((((((3 + 2) * ((((((2))))))))))))), "\n");
 
if (1) { if (1) { if (1) { if (1) { if (1) { print(15, "\n"); } } } } }""").printTokens()
} // deep
 
// Function to generate test case 8 source: Greatest Common Divisor.
fun gcd() {
Lexer(
"""/* Compute the gcd of 1071, 1029: 21 */
 
a = 1071;
b = 1029;
 
while (b != 0) {
new_a = b;
b = a % b;
a = new_a;
}
print(a);""").printTokens()
} // gcd
 
// Function to generate test case 9 source: Factorial.
fun factorial() {
Lexer(
"""/* 12 factorial is 479001600 */
 
n = 12;
result = 1;
i = 1;
while (i <= n) {
result = result * i;
i = i + 1;
}
print(result);""").printTokens()
} // factorial
 
// Function to generate test case 10 source: Fibonacci Sequence.
fun fibonacci() {
Lexer(
"""/* fibonacci of 44 is 701408733 */
 
n = 44;
i = 1;
a = 0;
b = 1;
while (i < n) {
w = a + b;
a = b;
b = w;
i = i + 1;
}
print(w, "\n");""").printTokens()
} // fibonacci
 
// Function to generate test case 11 source: FizzBuzz.
fun fizzbuzz() {
Lexer(
"""/* FizzBuzz */
i = 1;
while (i <= 100) {
if (!(i % 15))
print("FizzBuzz");
else if (!(i % 3))
print("Fizz");
else if (!(i % 5))
print("Buzz");
else
print(i);
 
print("\n");
i = i + 1;
}""").printTokens()
} // fizzbuzz
 
// Function to generate test case 12 source: 99 Bottles of Beer.
fun bottles() {
Lexer(
"""/* 99 bottles */
bottles = 99;
while (bottles > 0) {
print(bottles, " bottles of beer on the wall\n");
print(bottles, " bottles of beer\n");
print("Take one down, pass it around\n");
bottles = bottles - 1;
print(bottles, " bottles of beer on the wall\n\n");
}""").printTokens()
} // bottles
 
// Function to generate test case 13 source: Primes.
fun primes() {
Lexer(
"""/*
Simple prime number generator
*/
count = 1;
n = 1;
limit = 100;
while (n < limit) {
k=3;
p=1;
n=n+2;
while ((k*k<=n) && (p)) {
p=n/k*k!=n;
k=k+2;
}
if (p) {
print(n, " is prime\n");
count = count + 1;
}
}
print("Total primes found: ", count, "\n");""").printTokens()
} // primes
 
// Function to generate test case 14 source: Ascii Mandelbrot.
fun ascii() {
Lexer(
"""{
/*
This is an integer ascii Mandelbrot generator
*/
left_edge = -420;
right_edge = 300;
top_edge = 300;
bottom_edge = -300;
x_step = 7;
y_step = 15;
 
max_iter = 200;
 
y0 = top_edge;
while (y0 > bottom_edge) {
x0 = left_edge;
while (x0 < right_edge) {
y = 0;
x = 0;
the_char = ' ';
i = 0;
while (i < max_iter) {
x_x = (x * x) / 200;
y_y = (y * y) / 200;
if (x_x + y_y > 800 ) {
the_char = '0' + i;
if (i > 9) {
the_char = '@';
}
i = max_iter;
}
y = x * y / 100 + y0;
x = x_x - y_y + x0;
i = i + 1;
}
putc(the_char);
x0 = x0 + x_step;
}
putc('\n');
y0 = y0 - y_step;
}
}
""").printTokens()
} // ascii
 
when (number) {
0 ->
hello()
1 ->
phoenix()
2 ->
symbols()
3 ->
four()
4 ->
count()
5 ->
doors()
6 ->
negative()
7 ->
deep()
8 ->
gcd()
9 ->
factorial()
10 ->
fibonacci()
11 ->
fizzbuzz()
12 ->
bottles()
13 ->
primes()
14 ->
ascii()
15 -> // Lexer: Empty character constant
Lexer("''").printTokens()
16 -> // Lexer: Unknown escape sequence
Lexer("'\\x").printTokens()
17 -> // Lexer: Multi-character constant
Lexer("' ").printTokens()
18 -> // Lexer: EOF in comment
Lexer("/*").printTokens()
19 -> // Lexer: EOL in string
Lexer("\"\n").printTokens()
20 -> // Lexer: EOF in string
Lexer("\"").printTokens()
21 -> // Lexer: Escape sequence unknown
Lexer("\"\\x").printTokens()
22 -> // Lexer: Unrecognized character
Lexer("~").printTokens()
23 -> // Lexer: invalid number
Lexer("9a9").printTokens()
24 -> // Lexer: Number exceeds maximum value
Lexer("2147483648\n9223372036854775808").printTokens()
25 -> // Lexer: Operator expected
Lexer("|.").printTokens()
else ->
println("Invalid test number %d!".format(number))
} // when
} // tests
 
// Main function to check our source and read its data before parsing it.
// With no source specified, run the test of all symbols.
fun main(args: Array<String>) {
try {
val s =
if (args.size > 0 && args[0].isNotEmpty()) // file on command line
java.util.Scanner(java.io.File(args[0]))
else // use the console
java.util.Scanner(System.`in`)
var source = ""
while (s.hasNext())
source += s.nextLine()+
if (s.hasNext())
"\n"
else
""
if (args.size > 0 && args[0].isNotEmpty()) // file on command line
Lexer(source).printTokens()
else {
val digits = source.filter { it in '0'..'9' }
when {
source.isEmpty() -> // nothing given
tests(2)
source.length in 1..2 && digits.length == source.length &&
digits.toInt() in 0..25 ->
tests(digits.toInt())
else ->
Lexer(source).printTokens()
} // when
} // if
} catch(e: Throwable) {
println(e.message)
System.exit(1)
} // try
} // main</syntaxhighlight>
{{out|case=test case 3: All Symbols}}
<b>
<pre>
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
22 29 End_of_input
</pre>
</b>
 
=={{header|Lua}}==
===Using LPeg library===
This version uses LPeg, a parsing expression grammar library developed by one of the authors of Lua.
The source is broken into several modules, in part to make it easier to present the "vanilla Lua" version afterwards.
Tested with Lua 5.3.5 and LPeg 1.0.2-1.
 
The first module is simply a table defining the names of tokens which don't have an associated value.
<syntaxhighlight lang="lua">-- module token_name (in a file "token_name.lua")
local token_name = {
['*'] = 'Op_multiply',
['/'] = 'Op_divide',
['%'] = 'Op_mod',
['+'] = 'Op_add',
['-'] = 'Op_subtract',
['<'] = 'Op_less',
['<='] = 'Op_lessequal',
['>'] = 'Op_greater',
['>='] = 'Op_greaterequal',
['=='] = 'Op_equal',
['!='] = 'Op_notequal',
['!'] = 'Op_not',
['='] = 'Op_assign',
['&&'] = 'Op_and',
['||'] = 'Op_or',
['('] = 'LeftParen',
[')'] = 'RightParen',
['{'] = 'LeftBrace',
['}'] = 'RightBrace',
[';'] = 'Semicolon',
[','] = 'Comma',
['if'] = 'Keyword_if',
['else'] = 'Keyword_else',
['while'] = 'Keyword_while',
['print'] = 'Keyword_print',
['putc'] = 'Keyword_putc',
}
return token_name</syntaxhighlight>
 
This module exports a function <i>find_token</i>, which attempts to find the next valid token from a specified position in a source line.
<syntaxhighlight lang="lua">-- module lpeg_token_finder
local M = {} -- only items added to M will be public (via 'return M' at end)
local table, concat = table, table.concat
local error, tonumber = error, tonumber
 
local lpeg = require 'lpeg' -- see http://www.inf.puc-rio.br/~roberto/lpeg/
local token_name = require 'token_name'
_ENV = {}
 
local imports = 'P R S C Carg Cb Cc Cf Cg Cp Cs Ct Cmt V'
for w in imports:gmatch('%a+') do _ENV[w] = lpeg[w] end
 
------------------- Define patterns to match tokens -----------------------
 
alpha = R'az' + R'AZ' + P'_'
digit = R'09'
alnum = alpha + digit
space = S' \t\r\n'
 
function ptok(text) return {name=token_name[text]} end
op2c = C(P'<=' + P'>=' + P'==' + P'!=' + P'&&' + P'||') / ptok
op1c = C(S'*/%+-<>!=') / ptok
symbol = C(S'(){};,') / ptok
 
keyword_or_identifier = C(alpha * alnum^0) / function(text)
local name = token_name[text]
return name and {name=name} or {name='Identifier', value=text}
end
 
integer = C(digit^1) * -alpha / function(text)
return {name='Integer', value=tonumber(text)}
end
 
Cline = Carg(1) -- call to 'match' sets the first extra argument to source line number
 
bad_escseq_err = Cmt(Cline, function (_,pos,line)
error{err='bad_escseq', line=line, column=pos-1}
end)
 
esc_subst = {['\\'] = '\\', ['n'] = '\n'}
escseq = P'\\' * C(S'\\n' + bad_escseq_err) / esc_subst
 
qchar = P"'" * ( C( P(1) - S"'\n\\" ) + escseq ) * P"'" / function (text)
return {name='Integer', value=text:byte()}
end
 
qstr = P'"' * ( C((P(1) - S'"\n\\')^1) + escseq )^0 * P'"' / function(...)
return {name='String', value=concat{...}}
end
 
Ctoken = symbol + op2c + op1c + keyword_or_identifier + integer + qstr + qchar
 
unfinished_comment_err = Cmt(Cline * Cb('SOC'), function (_, pos, line, socpos)
error{err='unfinished_comment', line=line, column=socpos}
end)
commentstart = Cg(Cp() * P'/*', 'SOC')
commentrest = (P(1) - P'*/')^0 * (P'*/' + unfinished_comment_err)
comment = commentstart * commentrest
morecomment = Cg(Cp(), 'SOC') * commentrest
 
ws = (space^1 + comment)^0
 
bad_token_err = Cmt(Cline, function (_, pos, line)
error{err='invalid_token', line=line, column=pos}
end)
tokenpat = ws * Cline * Cp() * (C(-1) + Ctoken + bad_token_err) * Cp() /
function (line, pos, token, nextpos)
if pos == nextpos then -- at end of line; no token
return nil
else
token.line, token.column = line, pos
return token, nextpos
end
end
 
closecomment_tokenpat = morecomment * tokenpat
 
function M.find_token(line, line_pos, line_number, in_comment)
pattern = in_comment and closecomment_tokenpat or tokenpat
return lpeg.match(pattern, line, line_pos, line_number)
end
return M</syntaxhighlight>
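
As a rough usage sketch (not part of the module itself), <i>find_token</i> can be exercised directly, assuming the two modules above have been saved as <code>token_name.lua</code> and <code>lpeg_token_finder.lua</code> on the Lua package path:

<syntaxhighlight lang="lua">local finder = require 'lpeg_token_finder'

-- scan "count = 1;" starting at column 1 of (nominal) line 1, not inside a comment
local tok, nextpos = finder.find_token('count = 1;', 1, 1, nil)
-- tok is a table such as {name='Identifier', value='count', line=1, column=1}
-- and nextpos is 6, the column at which scanning should resume

tok, nextpos = finder.find_token('count = 1;', nextpos, 1, nil)
-- tok is now {name='Op_assign', line=1, column=7} and nextpos is 8</syntaxhighlight>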
 
The <i>lexer</i> module uses <i>finder.find_token</i> to produce an iterator over the tokens in a source.
<syntaxhighlight lang="lua">-- module lexer
local M = {} -- only items added to M will be publicly available (via 'return M' at end)
local string, io, coroutine, yield = string, io, coroutine, coroutine.yield
local error, pcall, type = error, pcall, type
 
local finder = require 'lpeg_token_finder'
_ENV = {}
 
-- produces a token iterator given a source line iterator
function M.tokenize_lineiter(lineiter)
local function fatal(err)
local msgtext = {
unfinished_comment = "EOF inside comment started",
invalid_token = "Invalid token",
bad_escseq = "Invalid escape sequence",
}
local fmt = "LEX ERROR: %s at line %d, column %d"
error(string.format(fmt, msgtext[err.err], err.line, err.column))
end
return coroutine.wrap(function()
local line_number = 0
local line_pos
local in_comment -- where unfinished comment started
for line in lineiter do
line_number = line_number + 1
line_pos = 1
local function scanline() -- yield current line's tokens
repeat
local token, pos =
finder.find_token(line, line_pos, line_number, in_comment)
if token then
line_pos = pos
in_comment = nil
yield(token)
end
until token == nil
end
 
if line then
local ok, err = pcall(scanline)
if ok then
in_comment = nil
elseif type(err) == 'table' and err.err=='unfinished_comment' then
if not(in_comment and err.column==1) then
in_comment = err
end
elseif type(err) == 'table' then
fatal(err)
else
error(err) -- some internal error
end
end
end
if in_comment then
fatal(in_comment)
else
yield{name='End_of_input', line=line_number+1, column=1}
end
return nil
end)
end
 
------------------- exports -----------------------------
 
lexer = M.tokenize_lineiter
 
function M.tokenize_file(filename)
return lexer(io.lines(filename))
end
 
function M.tokenize_text(text)
return lexer(text:gmatch('[^\n]+'))
end
 
-- M._INTERNALS = _ENV
return M
</syntaxhighlight>
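
For a quick end-to-end check (a sketch only, assuming the modules above are saved as <code>token_name.lua</code>, <code>lpeg_token_finder.lua</code> and <code>lexer.lua</code>, and that a hypothetical source file <code>testcase3.t</code> exists), the file-oriented entry point can be driven like this:

<syntaxhighlight lang="lua">local lexer = require 'lexer'

-- print one line per token: line, column, token name and (where present) its value
for token in lexer.tokenize_file('testcase3.t') do
  print(token.line, token.column, token.name, token.value or '')
end</syntaxhighlight>

The display script that follows does the same through <i>tokenize_text</i>, formatting the output to match the other entries on this page.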
 
This script uses <i>lexer.tokenize_text</i> to show the token sequence produced from a source text.
 
<syntaxhighlight lang="lua">lexer = require 'lexer'
format, gsub = string.format, string.gsub
 
function printf(fmt, ...) print(format(fmt, ...)) end
 
function stringrep(str)
local subst = {['\n'] = "\\n", ['\\'] = '\\\\'}
return format('"%s"', gsub(str, '[\n\\]', subst))
end
 
function display(text)
for t in lexer.tokenize_text(text) do
local value = (t.name=='String') and stringrep(t.value) or t.value or ''
printf("%4d %3d %-15s %s", t.line, t.column, t.name, value)
end
end
 
----------------------- test cases from Rosetta spec ------------------------
testing = true
if testing then
-- test case 1
display[[
/*
Hello world
*/
print("Hello, World!\n");]]
print()
 
-- test case 2
display[[
/*
Show Ident and Integers
*/
phoenix_number = 142857;
print(phoenix_number, "\n");]]
print()
-- etc.
end
</syntaxhighlight>
 
===Using only standard libraries===
This version replaces the <i>lpeg_token_finder</i> module of the LPeg version with this <i>basic_token_finder</i> module, altering the <i>require</i> expression near the top of the <i>lexer</i> module accordingly. Tested with Lua 5.3.5. (Note that <i>select</i> is a standard function as of Lua 5.2.)
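
Concretely, the only edit needed in the <i>lexer</i> module is its <code>require</code> line (shown here as a sketch):

<syntaxhighlight lang="lua">-- in module lexer, replace
local finder = require 'lpeg_token_finder'
-- with
local finder = require 'basic_token_finder'</syntaxhighlight>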
 
<syntaxhighlight lang="lua">-- module basic_token_finder
local M = {} -- only items added to M will be public (via 'return M' at end)
local table, string = table, string
local error, tonumber, select, assert = error, tonumber, select, assert
 
local token_name = require 'token_name'
_ENV = {}
 
function next_token(line, pos, line_num) -- match a token at line,pos
local function m(pat)
from, to, capture = line:find(pat, pos)
if from then
pos = to + 1
return capture
end
end
local function ptok(str)
return {name=token_name[str]}
end
local function op2c()
local text = m'^([<>=!]=)' or m'^(&&)' or m'^(||)'
if text then return ptok(text) end
end
 
local function op1c_or_symbol()
local char = m'^([%*/%%%+%-<>!=%(%){};,])'
if char then return ptok(char) end
end
local function keyword_or_identifier()
local text = m'^([%a_][%w_]*)'
if text then
local name = token_name[text]
return name and {name=name} or {name='Identifier', value=text}
end
end
local function integer()
local text = m'^(%d+)%f[^%w_]'
if text then return {name='Integer', value=tonumber(text)} end
end
local subst = {['\\\\'] = '\\', ['\\n'] = '\n'}
local function qchar()
local text = m"^'([^\\])'" or m"^'(\\[\\n])'"
if text then
local value = #text==1 and text:byte() or subst[text]:byte()
return {name='Integer', value=value}
end
end
local function qstr()
local text = m'^"([^"\n]*\\?)"'
if text then
local value = text:gsub('()(\\.?)', function(at, esc)
local replace = subst[esc]
if replace then
return replace
else
error{err='bad_escseq', line=line_num, column=pos+at-1}
end
end)
return {name='String', value=value}
end
end
local found = (op2c() or op1c_or_symbol() or
keyword_or_identifier() or integer() or qchar() or qstr())
if found then
return found, pos
end
end
 
function find_commentrest(line, pos, line_num, socpos)
local sfrom, sto = line:find('%*%/', pos)
if sfrom then
return socpos, sto
else
error{err='unfinished_comment', line=line_num, column=socpos}
end
end
 
function find_comment(line, pos, line_num)
local sfrom, sto = line:find('^%/%*', pos)
if sfrom then
local efrom, eto = find_commentrest(line, sto+1, line_num, sfrom)
return sfrom, eto
end
end
 
function find_morecomment(line, pos, line_num)
assert(pos==1)
return find_commentrest(line, pos, line_num, pos)
end
 
function find_whitespace(line, pos, line_num)
local spos = pos
repeat
local eto = select(2, line:find('^%s+', pos))
if not eto then
eto = select(2, find_comment(line, pos, line_num))
end
if eto then pos = eto + 1 end
until not eto
return spos, pos - 1
end
 
function M.find_token(line, pos, line_num, in_comment)
local spos = pos
if in_comment then
pos = 1 + select(2, find_morecomment(line, pos, line_num))
end
pos = 1 + select(2, find_whitespace(line, pos, line_num))
if pos > #line then
return nil
else
local token, nextpos = next_token(line, pos, line_num)
if token then
token.line, token.column = line_num, pos
return token, nextpos
else
error{err='invalid_token', line=line_num, column=pos}
end
end
end
 
-- M._ENV = _ENV
return M</syntaxhighlight>
 
=={{header|M2000 Interpreter}}==
<syntaxhighlight lang="m2000 interpreter">
Module lexical_analyzer {
a$={/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
*/
/* Print */ print /* Sub */ -
/* Putc */ putc /* Lss */ <
/* If */ if /* Gtr */ >
/* Else */ else /* Leq */ <=
/* While */ while /* Geq */ >=
/* Lbrace */ { /* Eq */ ==
/* Rbrace */ } /* Neq */ !=
/* Lparen */ ( /* And */ &&
/* Rparen */ ) /* Or */ ||
/* Uminus */ - /* Semi */ ;
/* Not */ ! /* Comma */ ,
/* Mul */ * /* Assign */ =
/* Div */ / /* Integer */ 42
/* Mod */ % /* String */ "String literal"
/* Add */ + /* Ident */ variable_name
/* character literal */ '\n'
/* character literal */ '\\'
/* character literal */ ' '
}
lim=Len(a$)
LineNo=1
ColumnNo=1
Document Output$
Buffer Scanner as Integer*lim
Return Scanner, 0:=a$
offset=0
buffer1$=""
flag_rem=true
Ahead=lambda Scanner (a$, offset)->{
=false
Try {
\\ second parameter is the offset in buffer units
\\ third parameter is length in bytes
=Eval$(Scanner, offset,2*len(a$))=a$
}
}
Ahead2=lambda Scanner (a$, offset)->{
=false
Try {
=Eval$(Scanner, offset,2) ~ a$
}
}
const nl$=chr$(13)+chr$(10), quo$="""", er$="@", Ansi=3
Try {
Do
If Ahead("/*", offset) Then {
offset+=2 : ColumnNo+=2
While not Ahead("*/", offset)
If Ahead(nl$, offset) Then
lineNo++: ColumnNo=1 : offset+=2
Else
offset++ : ColumnNo++
End If
if offset>lim then
Error "End-of-file in comment. Closing comment characters not found"+er$
End if
End While
offset+=2 : ColumnNo+=2
} Else.if Ahead(nl$, offset) Then{
LineNo++: ColumnNo=1
offset+=2
} Else.if Ahead(quo$, offset) Then {
Output$=format$("{0::-10}{1::-10} ", LineNo, ColumnNo)
offset++ : ColumnNo++
strin=offset
While not Ahead(quo$, offset)
If Ahead("/", offset) Then
offset+=2 : ColumnNo+=2
else
offset++ : ColumnNo++
End if
checkerror()
End While
Output$="String "+quote$(Eval$(Scanner, strin, (offset-strin)*2))+nl$
offset++ : ColumnNo++
} Else.if Ahead("'", offset) Then {
Output$=format$("{0::-10}{1::-10} ", LineNo, ColumnNo)
offset++ : ColumnNo++
strin=offset
While not Ahead("'", offset)
If Ahead("/", offset) Then
offset+=2 : ColumnNo+=2
else
offset++ : ColumnNo++
End if
checkerror()
End While
lit$=format$(Eval$(Scanner, strin, (offset-strin)*2))
select case len(lit$)
case 1
Output$="Integer "+str$(asc(lit$),0)+nl$
case >1
{Error "Multi-character constant."+er$}
case 0
{Error "Empty character constant."+er$}
end select
offset++ : ColumnNo++
} Else.if Ahead2("[a-z]", offset) Then {
strin=offset
Output$=format$("{0::-10}{1::-10} ", LineNo, ColumnNo)
offset++ : ColumnNo++
While Ahead2("[a-zA-Z0-9_]", offset)
offset++ : ColumnNo++
End While
Keywords(Eval$(Scanner, strin, (offset-strin)*2))
} Else.if Ahead2("[0-9]", offset) Then {
strin=offset
Output$=format$("{0::-10}{1::-10} Integer ", LineNo, ColumnNo)
offset++ : ColumnNo++
While Ahead2("[0-9]", offset)
offset++ : ColumnNo++
End While
if Ahead2("[a-zA-Z_]", offset) then
{Error " Invalid number. Starts like a number, but ends in non-numeric characters."+er$}
else
Output$=Eval$(Scanner, strin, (offset-strin)*2)+nl$
end if
} Else {
Symbols(Eval$(Scanner, Offset, 2))
offset++ : ColumnNo++
}
Until offset>=lim
}
er1$=leftpart$(error$,er$)
if er1$<>"" then
Print
Report "Error:"+er1$
Output$="(Error)"+nl$+"Error:"+er1$
else
Output$=format$("{0::-10}{1::-10}", LineNo, ColumnNo)+" End_of_Input"+nl$
end if
Clipboard Output$
Save.Doc Output$, "lex.t", Ansi
document lex$
Load.Doc lex$,"lex.t", Ansi
Report lex$
Sub Keywords(a$)
select case a$
case "if"
a$="Keyword_if"
case "else"
a$="Keyword_else"
case "while"
a$="Keyword_while"
case "print"
a$="Keyword_print"
case "putc"
a$="Keyword_putc"
else case
a$="Identifier "+a$
end select
Output$=a$+nl$
End sub
Sub Symbols(a$)
select case a$
case " ", chr$(9)
a$=""
case "("
a$="LeftParen"
case ")"
a$="RightParen"
case "{"
a$="LeftBrace"
case "}"
a$="RightBrace"
case ";"
a$="Semicolon"
case ","
a$="Comma"
case "*"
a$="Op_multiply"
case "/"
a$="Op_divide"
case "+"
a$="Op_add"
case "-"
a$="Op_subtract"
case "%"
a$="Op_mod"
case "<"
{ if Ahead("=", offset+1) Then
offset++
a$="Op_lessequal"
ColumnNo++
else
a$="Op_less"
end if
}
case ">"
{ if Ahead("=", offset+1) Then
offset++
ColumnNo++
a$="Op_greaterequal"
else
a$="Op_greater"
end if
}
case "="
{ if Ahead("=", offset+1) Then
offset++
ColumnNo++
a$="Op_equal"
else
a$="Op_assign"
end if
}
case "!"
{ if Ahead("=", offset+1) Then
offset++
ColumnNo++
a$="Op_notequal"
else
a$="Op_not"
end if
}
case "&"
{ if Ahead("&", offset+1) Then
offset++
ColumnNo++
a$="Op_and"
else
a$=""
end if
}
case "|"
{ if Ahead("|", offset+1) Then
offset++
ColumnNo++
a$="Op_or"
else
a$=""
end if
}
else case
{Error "Unrecognized character."+er$}
end select
if a$<>"" then
Output$=format$("{0::-10}{1::-10} ", LineNo, ColumnNo)+a$+nl$
end if
End Sub
Sub checkerror()
if offset>lim then {
Error "End-of-file while scanning string literal. Closing string character not found."+er$
} else.if Ahead(nl$,offset) then {
Error "End-of-line while scanning string literal. Closing string character not found before end-of-line."+er$
}
End Sub
}
lexical_analyzer
</syntaxhighlight>
 
{{out}}
<pre style="height:30ex;overflow:scroll">
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 41 Op_lessequal
9 16 Keyword_while
9 41 Op_greaterequal
10 16 LeftBrace
10 41 Op_equal
11 16 RightBrace
11 41 Op_notequal
12 16 LeftParen
12 41 Op_and
13 16 RightParen
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_Input
 
</pre>
 
=={{header|Mercury}}==
{{trans|ATS}}
{{works with|Mercury|20.06.1}}
 
 
<syntaxhighlight lang="mercury">% -*- mercury -*-
%
% Compile with something like:
% mmc -O4 --intermod-opt -E --make --warn-non-tail-recursion self-and-mutual lex
%
 
:- module lex.
 
:- interface.
:- import_module io.
:- pred main(io::di, io::uo) is det.
 
:- implementation.
:- import_module char.
:- import_module exception.
:- import_module int.
:- import_module list.
:- import_module stack.
:- import_module string.
 
:- type token_t
---> token_ELSE
; token_IF
; token_PRINT
; token_PUTC
; token_WHILE
; token_MULTIPLY
; token_DIVIDE
; token_MOD
; token_ADD
; token_SUBTRACT
; token_NEGATE
; token_LESS
; token_LESSEQUAL
; token_GREATER
; token_GREATEREQUAL
; token_EQUAL
; token_NOTEQUAL
; token_NOT
; token_ASSIGN
; token_AND
; token_OR
; token_LEFTPAREN
; token_RIGHTPAREN
; token_LEFTBRACE
; token_RIGHTBRACE
; token_SEMICOLON
; token_COMMA
; token_IDENTIFIER
; token_INTEGER
; token_STRING
; token_END_OF_INPUT.
 
:- type ch_t % The type of a fetched character.
---> {int, % A character or `eof', stored as an int.
int, % The line number.
int}. % The column number.
 
:- type inp_t % The `inputter' type. Fetches one character.
---> inp_t(inpf :: text_input_stream,
line_no :: int,
column_no :: int,
pushback :: stack(ch_t)).
 
:- type toktup_t % The type of a scanned token with its argument.
---> {token_t, % The token kind.
string, % An argument. (May or may not be meaningful.)
int, % The starting line number.
int}. % The starting column number.
 
main(!IO) :-
command_line_arguments(Args, !IO),
(
if (Args = [])
then (InpF_filename = "-",
OutF_filename = "-",
main_program(InpF_filename, OutF_filename, !IO))
else if (Args = [F1])
then (InpF_filename = F1,
OutF_filename = "-",
main_program(InpF_filename, OutF_filename, !IO))
else if (Args = [F1, F2])
then (InpF_filename = F1,
OutF_filename = F2,
main_program(InpF_filename, OutF_filename, !IO))
else usage_error(!IO)
).
 
:- pred main_program(string::in, string::in, io::di, io::uo) is det.
main_program(InpF_filename, OutF_filename, !IO) :-
open_InpF(InpF, InpF_filename, !IO),
open_OutF(OutF, OutF_filename, !IO),
init(InpF, Inp0),
scan_text(OutF, Inp0, _, !IO).
 
:- pred open_InpF(text_input_stream::out, string::in,
io::di, io::uo) is det.
open_InpF(InpF, InpF_filename, !IO) :-
if (InpF_filename = "-")
then (InpF = io.stdin_stream)
else
(
open_input(InpF_filename, InpF_result, !IO),
(
if (InpF_result = ok(F))
then (InpF = F)
else throw("Error: cannot open " ++ InpF_filename ++
" for input")
)
).
 
:- pred open_OutF(text_output_stream::out, string::in,
io::di, io::uo) is det.
open_OutF(OutF, OutF_filename, !IO) :-
if (OutF_filename = "-")
then (OutF = io.stdout_stream)
else
(
open_output(OutF_filename, OutF_result, !IO),
(
if (OutF_result = ok(F))
then (OutF = F)
else throw("Error: cannot open " ++ OutF_filename ++
" for output")
)
).
 
:- pred usage_error(io::di, io::uo) is det.
usage_error(!IO) :-
progname("lex", ProgName, !IO),
(io.format("Usage: %s [INPUT_FILE [OUTPUT_FILE]]\n",
[s(ProgName)], !IO)),
(io.write_string("If INPUT_FILE is \"-\" or not present then standard input is used.\n",
!IO)),
(io.write_string("If OUTPUT_FILE is \"-\" or not present then standard output is used.\n",
!IO)),
set_exit_status(1, !IO).
 
:- pred scan_text(text_output_stream::in, inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_text(OutF, !Inp, !IO) :-
get_next_token(TokTup, !Inp, !IO),
print_token(TokTup, OutF, !IO),
{Tok, _, _, _} = TokTup,
(
if (Tok = token_END_OF_INPUT)
then true
else scan_text(OutF, !Inp, !IO)
).
 
:- pred print_token(toktup_t::in, text_output_stream::in,
io::di, io::uo) is det.
print_token(TokTup, OutF, !IO) :-
{Tok, Arg, Line_no, Column_no} = TokTup,
token_name(Tok) = TokName,
(io.format(OutF, "%5d %5d %s",
[i(Line_no), i(Column_no), s(TokName)],
!IO)),
(
if (Tok = token_IDENTIFIER)
then (io.format(OutF, " %s", [s(Arg)], !IO))
else if (Tok = token_INTEGER)
then (io.format(OutF, " %s", [s(Arg)], !IO))
else if (Tok = token_STRING)
then (io.format(OutF, " %s", [s(Arg)], !IO))
else true
),
(io.format(OutF, "\n", [], !IO)).
 
:- func token_name(token_t) = string is det.
:- pred token_name(token_t::in, string::out) is det.
token_name(Tok) = Str :- token_name(Tok, Str).
token_name(token_ELSE, "Keyword_else").
token_name(token_IF, "Keyword_if").
token_name(token_PRINT, "Keyword_print").
token_name(token_PUTC, "Keyword_putc").
token_name(token_WHILE, "Keyword_while").
token_name(token_MULTIPLY, "Op_multiply").
token_name(token_DIVIDE, "Op_divide").
token_name(token_MOD, "Op_mod").
token_name(token_ADD, "Op_add").
token_name(token_SUBTRACT, "Op_subtract").
token_name(token_NEGATE, "Op_negate").
token_name(token_LESS, "Op_less").
token_name(token_LESSEQUAL, "Op_lessequal").
token_name(token_GREATER, "Op_greater").
token_name(token_GREATEREQUAL, "Op_greaterequal").
token_name(token_EQUAL, "Op_equal").
token_name(token_NOTEQUAL, "Op_notequal").
token_name(token_NOT, "Op_not").
token_name(token_ASSIGN, "Op_assign").
token_name(token_AND, "Op_and").
token_name(token_OR, "Op_or").
token_name(token_LEFTPAREN, "LeftParen").
token_name(token_RIGHTPAREN, "RightParen").
token_name(token_LEFTBRACE, "LeftBrace").
token_name(token_RIGHTBRACE, "RightBrace").
token_name(token_SEMICOLON, "Semicolon").
token_name(token_COMMA, "Comma").
token_name(token_IDENTIFIER, "Identifier").
token_name(token_INTEGER, "Integer").
token_name(token_STRING, "String").
token_name(token_END_OF_INPUT, "End_of_input").
 
:- pred get_next_token(toktup_t::out, inp_t::in, inp_t::out,
io::di, io::uo) is det.
get_next_token(TokTup, !Inp, !IO) :-
skip_spaces_and_comments(!Inp, !IO),
get_ch(Ch, !Inp, !IO),
{IChar, Line_no, Column_no} = Ch,
LN = Line_no,
CN = Column_no,
(
if (IChar = eof)
then
(
TokTup = {token_END_OF_INPUT, "", LN, CN}
)
else
(
Char = det_from_int(IChar),
(
if (Char = (','))
then (TokTup = {token_COMMA, ",", LN, CN})
else if (Char = (';'))
then (TokTup = {token_SEMICOLON, ";", LN, CN})
else if (Char = ('('))
then (TokTup = {token_LEFTPAREN, "(", LN, CN})
else if (Char = (')'))
then (TokTup = {token_RIGHTPAREN, ")", LN, CN})
else if (Char = ('{'))
then (TokTup = {token_LEFTBRACE, "{", LN, CN})
else if (Char = ('}'))
then (TokTup = {token_RIGHTBRACE, "}", LN, CN})
else if (Char = ('*'))
then (TokTup = {token_MULTIPLY, "*", LN, CN})
else if (Char = ('/'))
then (TokTup = {token_DIVIDE, "/", LN, CN})
else if (Char = ('%'))
then (TokTup = {token_MOD, "%", LN, CN})
else if (Char = ('+'))
then (TokTup = {token_ADD, "+", LN, CN})
else if (Char = ('-'))
then (TokTup = {token_SUBTRACT, "-", LN, CN})
else if (Char = ('<'))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = to_int('='))
then
(
TokTup = {token_LESSEQUAL, "<=", LN, CN}
)
else
(
push_back(Ch1, !Inp),
TokTup = {token_LESS, "<", LN, CN}
)
)
)
else if (Char = ('>'))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = to_int('='))
then
(
TokTup = {token_GREATEREQUAL, ">=", LN, CN}
)
else
(
push_back(Ch1, !Inp),
TokTup = {token_GREATER, ">", LN, CN}
)
)
)
else if (Char = ('='))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = to_int('='))
then
(
TokTup = {token_EQUAL, "==", LN, CN}
)
else
(
push_back(Ch1, !Inp),
TokTup = {token_ASSIGN, "=", LN, CN}
)
)
)
else if (Char = ('!'))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = to_int('='))
then
(
TokTup = {token_NOTEQUAL, "!=", LN, CN}
)
else
(
push_back(Ch1, !Inp),
TokTup = {token_NOT, "!", LN, CN}
)
)
)
else if (Char = ('&'))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = to_int('&'))
then
(
TokTup = {token_AND, "&&", LN, CN}
)
else throw("Error: unexpected character '" ++
from_char(Char) ++ "' at " ++
from_int(LN) ++ ":" ++
from_int(CN))
)
)
else if (Char = ('|'))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = to_int('|'))
then
(
TokTup = {token_OR, "||", LN, CN}
)
else throw("Error: unexpected character '" ++
from_char(Char) ++ "' at " ++
from_int(LN) ++ ":" ++
from_int(CN))
)
)
else if (Char = ('"'))
then
(
push_back(Ch, !Inp),
scan_string_literal(TokTup, !Inp, !IO)
)
else if (Char = ('\''))
then
(
push_back(Ch, !Inp),
scan_character_literal(TokTup, !Inp, !IO)
)
else if (is_alpha(Char))
then
(
push_back(Ch, !Inp),
scan_identifier_or_reserved_word(
TokTup, !Inp, !IO)
)
else if (is_digit(Char))
then
(
push_back(Ch, !Inp),
scan_integer_literal(TokTup, !Inp, !IO)
)
else
(
throw("Error: unexpected character '" ++
from_char(Char) ++ "' at " ++
from_int(LN) ++ ":" ++
from_int(CN))
)
)
)
).
 
 
:- pred skip_spaces_and_comments(inp_t::in, inp_t::out,
                                       io::di, io::uo) is det.
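% Consume whitespace and /* ... */ comments. A '/' that turns out not to
% begin a comment is pushed back, together with the character after it.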
skip_spaces_and_comments(!Inp, !IO) :-
get_ch(Ch, !Inp, !IO),
Ch = {IChar, _, _},
(
if (IChar = eof)
then push_back(Ch, !Inp)
else
if (is_whitespace(det_from_int(IChar)))
then skip_spaces_and_comments(!Inp, !IO)
else if (IChar = to_int('/'))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, Line_no, Column_no},
(
if (IChar1 = to_int('*'))
then
(
scan_comment(Line_no, Column_no,
!Inp, !IO),
skip_spaces_and_comments(!Inp, !IO)
)
else
(
push_back(Ch1, !Inp),
push_back(Ch, !Inp)
)
)
)
else push_back(Ch, !Inp)
).
 
:- pred scan_comment(int::in, int::in, % line and column nos.
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_comment(Line_no, Column_no, !Inp, !IO) :-
get_ch(Ch, !Inp, !IO),
{IChar, _, _} = Ch,
(
if (IChar = eof)
then throw("Error: unterminated comment " ++
"starting at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else
(
det_from_int(IChar) = Char,
(
if (Char = ('*'))
then
(
get_ch(Ch1, !Inp, !IO),
{IChar1, _, _} = Ch1,
(
if (IChar1 = to_int('/'))
then true % End of comment has been reached.
else
(
push_back(Ch1, !Inp),
scan_comment(Line_no, Column_no, !Inp,
!IO)
)
)
)
else scan_comment(Line_no, Column_no, !Inp, !IO)
)
)
).
 
:- pred scan_character_literal(toktup_t::out,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_character_literal(TokTup, !Inp, !IO) :-
get_ch(Ch, !Inp, !IO),
Ch = {OpenQuote, Line_no, Column_no},
CloseQuote = OpenQuote,
scan_char_lit_contents(CodePoint, Line_no, Column_no,
!Inp, !IO),
check_char_lit_end(CloseQuote, Line_no, Column_no, !Inp, !IO),
Arg = from_int(CodePoint),
TokTup = {token_INTEGER, Arg, Line_no, Column_no}.
 
:- pred scan_char_lit_contents(int::out, int::in, int::in,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_char_lit_contents(CodePoint, Line_no, Column_no,
!Inp, !IO) :-
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, Line_no1, Column_no1},
(
if (IChar1 = eof)
then throw("Error: end of input in character literal " ++
"starting at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else if (IChar1 = to_int('\\'))
then
(
get_ch(Ch2, !Inp, !IO),
Ch2 = {IChar2, _, _},
(if (IChar2 = eof)
then throw("Error: end of input in character literal " ++
"starting at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else if (IChar2 = to_int('n'))
then (CodePoint = to_int('\n'))
else if (IChar2 = to_int('\\'))
then (CodePoint = to_int('\\'))
else throw("Error: unsupported escape \\" ++
from_char(det_from_int(IChar2)) ++
" at " ++ from_int(Line_no1) ++
":" ++ from_int(Column_no1))
)
)
else (CodePoint = IChar1)
).
 
:- pred check_char_lit_end(int::in, int::in, int::in,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
check_char_lit_end(CloseQuote, Line_no, Column_no, !Inp, !IO) :-
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = CloseQuote)
then true
else find_bad_char_lit_end(CloseQuote, Line_no, Column_no,
!Inp, !IO)
).
 
:- pred find_bad_char_lit_end(int::in, int::in, int::in,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
find_bad_char_lit_end(CloseQuote, Line_no, Column_no, !Inp, !IO) :-
get_ch(Ch2, !Inp, !IO),
Ch2 = {IChar2, _, _},
(
if (IChar2 = CloseQuote)
then throw("Error: unsupported multicharacter literal " ++
" at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else if (IChar2 = eof)
then throw("Error: end of input in character literal " ++
" at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else find_bad_char_lit_end(CloseQuote, Line_no, Column_no,
!Inp, !IO)
).
 
:- pred scan_string_literal(toktup_t::out,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_string_literal(TokTup, !Inp, !IO) :-
get_ch(Ch, !Inp, !IO),
Ch = {OpenQuote, Line_no, Column_no},
CloseQuote = OpenQuote,
scan_string_lit_contents("", Str, CloseQuote,
Line_no, Column_no,
!Inp, !IO),
Arg = from_char(det_from_int(OpenQuote)) ++
Str ++ from_char(det_from_int(CloseQuote)),
TokTup = {token_STRING, Arg, Line_no, Column_no}.
 
:- pred scan_string_lit_contents(string::in, string::out, int::in,
int::in, int::in,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_string_lit_contents(Str0, Str, CloseQuote, Line_no, Column_no,
!Inp, !IO) :-
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, Line_no1, Column_no1},
(
if (IChar1 = CloseQuote)
then (Str = Str0)
else if (IChar1 = eof)
then throw("Error: end of input in string literal " ++
"starting at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else if (IChar1 = to_int('\n'))
then throw("Error: end of line in string literal " ++
"starting at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else if (IChar1 = to_int('\\'))
then
(
get_ch(Ch2, !Inp, !IO),
Ch2 = {IChar2, _, _},
(
if (IChar2 = to_int('n'))
then
(
Str1 = Str0 ++ "\\n",
scan_string_lit_contents(Str1, Str, CloseQuote,
Line_no, Column_no,
!Inp, !IO)
)
else if (IChar2 = to_int('\\'))
then
(
Str1 = Str0 ++ "\\\\",
scan_string_lit_contents(Str1, Str, CloseQuote,
Line_no, Column_no,
!Inp, !IO)
)
else if (IChar2 = eof)
then throw("Error: end of input in string literal " ++
"starting at " ++ from_int(Line_no) ++
":" ++ from_int(Column_no))
else if (IChar2 = to_int('\n'))
then throw("Error: end of line in string literal " ++
"starting at " ++ from_int(Line_no) ++
":" ++ from_int(Column_no))
else throw("Error: unsupported escape \\" ++
from_char(det_from_int(IChar2)) ++
" at " ++ from_int(Line_no1) ++
":" ++ from_int(Column_no1))
)
)
else
(
Char1 = det_from_int(IChar1),
Str1 = Str0 ++ from_char(Char1),
scan_string_lit_contents(Str1, Str, CloseQuote,
Line_no, Column_no, !Inp, !IO)
)
).
 
:- pred scan_identifier_or_reserved_word(toktup_t::out,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_identifier_or_reserved_word(TokTup, !Inp, !IO) :-
scan_integer_or_word(Str, Line_no, Column_no, !Inp, !IO),
(
if (Str = "if")
then (TokTup = {token_IF, Str, Line_no, Column_no})
else if (Str = "else")
then (TokTup = {token_ELSE, Str, Line_no, Column_no})
else if (Str = "while")
then (TokTup = {token_WHILE, Str, Line_no, Column_no})
else if (Str = "print")
then (TokTup = {token_PRINT, Str, Line_no, Column_no})
else if (Str = "putc")
then (TokTup = {token_PUTC, Str, Line_no, Column_no})
else (TokTup = {token_IDENTIFIER, Str, Line_no, Column_no})
).
 
:- pred scan_integer_literal(toktup_t::out, inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_integer_literal(TokTup, !Inp, !IO) :-
scan_integer_or_word(Str, Line_no, Column_no, !Inp, !IO),
(
if (not is_all_digits(Str))
then throw("Error: not a valid integer literal: " ++ Str)
else (TokTup = {token_INTEGER, Str, Line_no, Column_no})
).
 
:- pred scan_integer_or_word(string::out, int::out, int::out,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_integer_or_word(Str, Line_no, Column_no, !Inp, !IO) :-
get_ch({IChar, Line_no, Column_no}, !Inp, !IO),
(
if (IChar = eof)
then throw("internal error")
else
(
Char = det_from_int(IChar),
(if (not is_alnum_or_underscore(Char))
then throw("internal error")
else scan_int_or_word(from_char(Char), Str, !Inp, !IO))
)
).
 
:- pred scan_int_or_word(string::in, string::out,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_int_or_word(Str0, Str, !Inp, !IO) :-
get_ch(CharTup, !Inp, !IO),
{IChar, _, _} = CharTup,
(
if (IChar = eof)
then
(
push_back(CharTup, !Inp),
Str = Str0
)
else
(
Char = det_from_int(IChar),
(
if (not is_alnum_or_underscore(Char))
then
(
push_back(CharTup, !Inp),
Str = Str0
)
else scan_int_or_word(Str0 ++ from_char(Char), Str,
!Inp, !IO)
)
)
).
 
:- pred init(text_input_stream::in, inp_t::out) is det.
init(Inpf, Inp) :-
Inp = inp_t(Inpf, 1, 1, init).
 
 
:- pred get_ch(ch_t::out, inp_t::in, inp_t::out,
io::di, io::uo) is det.
get_ch(Ch, Inp0, Inp, !IO) :-
if (pop(Ch1, Inp0^pushback, Pushback))
then
(
Ch = Ch1,
Inp = (Inp0^pushback := Pushback)
)
else
(
inp_t(Inpf, Line_no, Column_no, Pushback) = Inp0,
read_char_unboxed(Inpf, Result, Char, !IO),
(
if (Result = ok)
then
(
Ch = {to_int(Char), Line_no, Column_no},
Inp =
(if (Char = ('\n'))
then inp_t(Inpf, Line_no + 1, 1, Pushback)
else inp_t(Inpf, Line_no, Column_no + 1, Pushback))
)
else
(
Ch = {eof, Line_no, Column_no},
Inp = Inp0
)
)
).
 
:- pred push_back(ch_t::in, inp_t::in, inp_t::out) is det.
push_back(Ch, Inp0, Inp) :-
Inp = (Inp0^pushback := push(Inp0^pushback, Ch)).
 
:- func eof = int is det.
eof = -1.</syntaxhighlight>
 
{{out}}
<pre>$ mmc -O6 --intermod-opt -E --make --warn-non-tail-recursion self-and-mutual lex && ./lex compiler-tests/testcase3.t
Making Mercury/int3s/lex.int3
Making Mercury/ints/lex.int
Making Mercury/opts/lex.opt
Making Mercury/cs/lex.c
Making Mercury/os/lex.o
Making lex
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
 
=={{header|Nim}}==
Tested with Nim v0.19.4. The first two examples are tested against all programs in [[Compiler/Sample programs]].
===Using string with regular expressions===
<syntaxhighlight lang="nim">
import re, strformat, strutils
 
type
TokenKind = enum
tkUnknown = "UNKNOWN_TOKEN",
tkMul = "Op_multiply",
tkDiv = "Op_divide",
tkMod = "Op_mod",
tkAdd = "Op_add",
tkSub = "Op_subtract",
tkNeg = "Op_negate",
tkLt = "Op_less",
tkLte = "Op_lessequal",
tkGt = "Op_greater",
tkGte = "Op_greaterequal",
tkEq = "Op_equal",
tkNeq = "Op_notequal",
tkNot = "Op_not",
tkAsgn = "Op_assign",
tkAnd = "Op_and",
tkOr = "Op_or",
tkLpar = "LeftParen",
tkRpar = "RightParen",
tkLbra = "LeftBrace",
tkRbra = "RightBrace",
tkSmc = "Semicolon",
tkCom = "Comma",
tkIf = "Keyword_if",
tkElse = "Keyword_else",
tkWhile = "Keyword_while",
tkPrint = "Keyword_print",
tkPutc = "Keyword_putc",
tkId = "Identifier",
tkInt = "Integer",
tkChar = "Integer",
tkStr = "String",
tkEof = "End_of_input"
 
Token = object
kind: TokenKind
value: string
 
TokenAnn = object
    ## Token annotated with its line and column position
token: Token
line, column: int
 
proc getSymbols(table: openArray[(char, TokenKind)]): seq[char] =
result = newSeq[char]()
for ch, tokenKind in items(table):
result.add ch
 
const
tkSymbols = { # single-char tokens
'*': tkMul,
'%': tkMod,
'+': tkAdd,
'-': tkSub,
'(': tkLpar,
')': tkRpar,
'{': tkLbra,
'}': tkRbra,
';': tkSmc,
',': tkCom,
'/': tkDiv, # the comment case /* ... */ is handled in `stripUnimportant`
}
symbols = getSymbols(tkSymbols)
 
proc findTokenKind(table: openArray[(char, TokenKind)]; needle: char):
TokenKind =
for ch, tokenKind in items(table):
if ch == needle: return tokenKind
tkUnknown
 
proc stripComment(text: var string, lineNo, colNo: var int) =
var matches: array[1, string]
 
if match(text, re"\A(/\*[\s\S]*?\*/)", matches):
text = text[matches[0].len..^1]
for s in matches[0]:
if s == '\n':
inc lineNo
colNo = 1
else:
inc colNo
 
proc stripUnimportant(text: var string; lineNo, colNo: var int) =
while true:
if text.len == 0: return
elif text[0] == '\n':
inc lineNo
colNo = 1
text = text[1..^1]
elif text[0] == ' ':
inc colNo
text = text[1..^1]
elif text.len >= 2 and text[0] == '/' and text[1] == '*':
stripComment(text, lineNo, colNo)
else: return
 
proc lookAhead(ch1, ch2: char, tk1, tk2: TokenKind): (TokenKind, int) =
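  ## If the lookahead character `ch1` equals the expected `ch2`, report the
  ## two-character token `tk1` (length 2); otherwise report `tk2` (length 1).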
if ch1 == ch2: (tk1, 2)
else: (tk2, 1)
 
proc consumeToken(text: var string; tkl: var int): Token =
  ## Return the next token, removing it from `text`, and write its
  ## length to `tkl`. If the token cannot be recognized, return
  ## `tkUnknown`, shrink `text` by one character and set `tkl` to 1.
 
var
matches: array[1, string]
tKind: TokenKind
val: string
 
if text.len == 0:
(tKind, tkl) = (tkEof, 0)
 
# Simple characters
elif text[0] in symbols: (tKind, tkl) = (tkSymbols.findTokenKind(text[0]), 1)
elif text[0] == '<': (tKind, tkl) = lookAhead(text[1], '=', tkLte, tkLt)
elif text[0] == '>': (tKind, tkl) = lookAhead(text[1], '=', tkGte, tkGt)
elif text[0] == '=': (tKind, tkl) = lookAhead(text[1], '=', tkEq, tkAsgn)
elif text[0] == '!': (tKind, tkl) = lookAhead(text[1], '=', tkNeq, tkNot)
elif text[0] == '&': (tKind, tkl) = lookAhead(text[1], '&', tkAnd, tkUnknown)
elif text[0] == '|': (tKind, tkl) = lookAhead(text[1], '|', tkOr, tkUnknown)
 
# Keywords
elif match(text, re"\Aif\b"): (tKind, tkl) = (tkIf, 2)
elif match(text, re"\Aelse\b"): (tKind, tkl) = (tkElse, 4)
elif match(text, re"\Awhile\b"): (tKind, tkl) = (tkWhile, 5)
elif match(text, re"\Aprint\b"): (tKind, tkl) = (tkPrint, 5)
elif match(text, re"\Aputc\b"): (tKind, tkl) = (tkPutc, 4)
 
# Literals and identifiers
elif match(text, re"\A([0-9]+)", matches):
(tKind, tkl) = (tkInt, matches[0].len)
val = matches[0]
elif match(text, re"\A([_a-zA-Z][_a-zA-Z0-9]*)", matches):
(tKind, tkl) = (tkId, matches[0].len)
val = matches[0]
elif match(text, re"\A('(?:[^'\n]|\\\\|\\n)')", matches):
(tKind, tkl) = (tkChar, matches[0].len)
val = case matches[0]
of r"' '": $ord(' ')
of r"'\n'": $ord('\n')
of r"'\\'": $ord('\\')
else: $ord(matches[0][1]) # "'a'"[1] == 'a'
elif match(text, re"\A(""[^""\n]*"")", matches):
(tKind, tkl) = (tkStr, matches[0].len)
val = matches[0]
else: (tKind, tkl) = (tkUnknown, 1)
 
text = text[tkl..^1]
Token(kind: tKind, value: val)
 
proc tokenize*(text: string): seq[TokenAnn] =
result = newSeq[TokenAnn]()
var
lineNo, colNo: int = 1
text = text
token: Token
tokenLength: int
 
while text.len > 0:
stripUnimportant(text, lineNo, colNo)
token = consumeToken(text, tokenLength)
result.add TokenAnn(token: token, line: lineNo, column: colNo)
inc colNo, tokenLength
 
proc output*(s: seq[TokenAnn]): string =
var
tokenKind: TokenKind
value: string
line, column: int
 
for tokenAnn in items(s):
line = tokenAnn.line
column = tokenAnn.column
tokenKind = tokenAnn.token.kind
value = tokenAnn.token.value
result.add(
fmt"{line:>5}{column:>7} {tokenKind:<15}{value}"
.strip(leading = false) & "\n")
 
when isMainModule:
import os
 
let input = if paramCount() > 0: readFile paramStr(1)
else: readAll stdin
 
echo input.tokenize.output
</syntaxhighlight>
===Using stream with lexer library===
<syntaxhighlight lang="nim">
import lexbase, streams
from strutils import Whitespace
 
type
TokenKind = enum
tkInvalid = "Invalid",
tkOpMultiply = "Op_multiply",
tkOpDivide = "Op_divide",
tkOpMod = "Op_mod",
tkOpAdd = "Op_add",
tkOpSubtract = "Op_subtract",
tkOpLess = "Op_less",
tkOpLessEqual = "Op_lessequal",
tkOpGreater = "Op_greater",
tkOpGreaterEqual = "Op_greaterequal",
tkOpEqual = "Op_equal",
tkOpNotEqual = "Op_notequal",
tkOpNot = "Op_not",
tkOpAssign = "Op_assign",
tkOpAnd = "Op_and",
tkOpOr = "Op_or",
tkLeftParen = "LeftParen",
tkRightParen = "RightParen",
tkLeftBrace = "LeftBrace",
tkRightBrace = "RightBrace",
tkSemicolon = "Semicolon",
tkComma = "Comma",
tkKeywordIf = "Keyword_if",
tkKeywordElse = "Keyword_else",
tkKeywordWhile = "Keyword_while",
tkKeywordPrint = "Keyword_print",
tkKeywordPutc = "Keyword_putc",
tkIdentifier = "Identifier",
tkInteger = "Integer",
tkString = "String",
tkEndOfInput = "End_of_input"
 
Lexer = object of BaseLexer
kind: TokenKind
token, error: string
startPos: int
 
template setError(l: var Lexer; err: string): untyped =
l.kind = tkInvalid
if l.error.len == 0:
l.error = err
 
proc hasError(l: Lexer): bool {.inline.} =
l.error.len > 0
 
proc open(l: var Lexer; input: Stream) {.inline.} =
lexbase.open(l, input)
l.startPos = 0
l.kind = tkInvalid
l.token = ""
l.error = ""
 
proc handleNewLine(l: var Lexer) =
case l.buf[l.bufpos]
of '\c': l.bufpos = l.handleCR l.bufpos
of '\n': l.bufpos = l.handleLF l.bufpos
else: discard
 
proc skip(l: var Lexer) =
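  ## Skip whitespace and /* ... */ block comments, keeping line
  ## bookkeeping up to date via handleNewLine.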
while true:
case l.buf[l.bufpos]
of Whitespace:
if l.buf[l.bufpos] notin NewLines:
inc l.bufpos
else:
handleNewLine l
of '/':
if l.buf[l.bufpos + 1] == '*':
inc l.bufpos, 2
while true:
case l.buf[l.bufpos]
of '*':
if l.buf[l.bufpos + 1] == '/':
inc l.bufpos, 2
break
else: inc l.bufpos
of NewLines:
handleNewLine l
of EndOfFile:
setError l, "EOF reached in comment"
return
else:
inc l.bufpos
else: break
else: break
 
proc handleSpecial(l: var Lexer): char =
assert l.buf[l.bufpos] == '\\'
inc l.bufpos
case l.buf[l.bufpos]
of 'n':
l.token.add "\\n"
result = '\n'
inc l.bufpos
of '\\':
l.token.add "\\\\"
result = '\\'
inc l.bufpos
else:
setError l, "Unknown escape sequence: '\\" & l.buf[l.bufpos] & "'"
result = '\0'
 
proc handleChar(l: var Lexer) =
assert l.buf[l.bufpos] == '\''
l.startPos = l.getColNumber l.bufpos
l.kind = tkInvalid
inc l.bufpos
if l.buf[l.bufpos] == '\\':
l.token = $ord(handleSpecial l)
if hasError l: return
elif l.buf[l.bufpos] == '\'':
setError l, "Empty character constant"
return
else:
l.token = $ord(l.buf[l.bufpos])
inc l.bufpos
if l.buf[l.bufpos] == '\'':
l.kind = tkInteger
inc l.bufpos
else:
setError l, "Multi-character constant"
 
proc handleString(l: var Lexer) =
assert l.buf[l.bufpos] == '"'
l.startPos = l.getColNumber l.bufpos
l.token = "\""
inc l.bufpos
while true:
case l.buf[l.bufpos]
of '\\':
discard handleSpecial l
if hasError l: return
of '"':
l.kind = tkString
add l.token, '"'
inc l.bufpos
break
of NewLines:
setError l, "EOL reached before end-of-string"
return
of EndOfFile:
setError l, "EOF reached before end-of-string"
return
else:
add l.token, l.buf[l.bufpos]
inc l.bufpos
 
proc handleNumber(l: var Lexer) =
assert l.buf[l.bufpos] in {'0'..'9'}
l.startPos = l.getColNumber l.bufpos
l.token = "0"
while l.buf[l.bufpos] == '0': inc l.bufpos
while true:
case l.buf[l.bufpos]
of '0'..'9':
if l.token == "0":
setLen l.token, 0
add l.token, l.buf[l.bufpos]
inc l.bufpos
of 'a'..'z', 'A'..'Z', '_':
setError l, "Invalid number"
return
else:
l.kind = tkInteger
break
 
proc handleIdent(l: var Lexer) =
assert l.buf[l.bufpos] in {'a'..'z'}
l.startPos = l.getColNumber l.bufpos
setLen l.token, 0
while true:
if l.buf[l.bufpos] in {'a'..'z', 'A'..'Z', '0'..'9', '_'}:
add l.token, l.buf[l.bufpos]
inc l.bufpos
else:
break
l.kind = case l.token
of "if": tkKeywordIf
of "else": tkKeywordElse
of "while": tkKeywordWhile
of "print": tkKeywordPrint
of "putc": tkKeywordPutc
else: tkIdentifier
 
proc getToken(l: var Lexer): TokenKind =
l.kind = tkInvalid
setLen l.token, 0
skip l
 
case l.buf[l.bufpos]
of '*':
l.kind = tkOpMultiply
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
of '/':
l.kind = tkOpDivide
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
of '%':
l.kind = tkOpMod
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
of '+':
l.kind = tkOpAdd
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
of '-':
l.kind = tkOpSubtract
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
of '<':
l.kind = tkOpLess
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
if l.buf[l.bufpos] == '=':
l.kind = tkOpLessEqual
inc l.bufpos
of '>':
l.kind = tkOpGreater
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
if l.buf[l.bufpos] == '=':
l.kind = tkOpGreaterEqual
inc l.bufpos
of '=':
l.kind = tkOpAssign
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
if l.buf[l.bufpos] == '=':
l.kind = tkOpEqual
inc l.bufpos
of '!':
l.kind = tkOpNot
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
if l.buf[l.bufpos] == '=':
l.kind = tkOpNotEqual
inc l.bufpos
of '&':
if l.buf[l.bufpos + 1] == '&':
l.kind = tkOpAnd
l.startPos = l.getColNumber l.bufpos
inc l.bufpos, 2
else:
setError l, "Unrecognized character"
of '|':
if l.buf[l.bufpos + 1] == '|':
l.kind = tkOpOr
l.startPos = l.getColNumber l.bufpos
inc l.bufpos, 2
else:
setError l, "Unrecognized character"
of '(':
l.kind = tkLeftParen
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
of ')':
l.kind = tkRightParen
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
of '{':
l.kind = tkLeftBrace
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
of '}':
l.kind = tkRightBrace
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
of ';':
l.kind = tkSemicolon
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
of ',':
l.kind = tkComma
l.startPos = l.getColNumber l.bufpos
inc l.bufpos
of '\'': handleChar l
of '"': handleString l
of '0'..'9': handleNumber l
of 'a'..'z', 'A'..'Z': handleIdent l
of EndOfFile:
l.startPos = l.getColNumber l.bufpos
l.kind = tkEndOfInput
else:
setError l, "Unrecognized character"
result = l.kind
 
when isMainModule:
import os, strformat
proc main() =
var l: Lexer
if paramCount() < 1:
open l, newFileStream stdin
else:
open l, newFileStream paramStr(1)
while l.getToken notin {tkInvalid}:
stdout.write &"{l.lineNumber:5} {l.startPos + 1:5} {l.kind:<14}"
if l.kind in {tkIdentifier, tkInteger, tkString}:
stdout.write &" {l.token}"
stdout.write '\n'
if l.kind == tkEndOfInput:
break
if hasError l:
echo &"({l.lineNumber},{l.getColNumber l.bufpos + 1}) {l.error}"
main()
</syntaxhighlight>
 
===Using nothing but system and strutils===
<syntaxhighlight lang="nim">import strutils
 
type
 
TokenKind* = enum
tokMult = "Op_multiply", tokDiv = "Op_divide", tokMod = "Op_mod",
tokAdd = "Op_add", tokSub = "Op_subtract", tokLess = "Op_less",
tokLessEq = "Op_lessequal", tokGreater = "Op_greater",
tokGreaterEq = "Op_greaterequal", tokEq = "Op_equal",
tokNotEq = "Op_notequal", tokNot = "Op_not", tokAssign = "Op_assign",
tokAnd = "Op_and", tokOr = "Op_or"
tokLPar = "LeftParen", tokRPar = "RightParen"
tokLBrace = "LeftBrace", tokRBrace = "RightBrace"
tokSemi = "Semicolon", tokComma = "Comma"
tokIf = "Keyword_if", tokElse = "Keyword_else", tokWhile = "Keyword_while",
tokPrint = "Keyword_print", tokPutc = "Keyword_putc"
tokIdent = "Identifier", tokInt = "Integer", tokChar = "Integer",
tokString = "String"
tokEnd = "End_of_input"
 
Token* = object
ln*, col*: int
case kind*: TokenKind
of tokIdent: ident*: string
of tokInt: intVal*: int
of tokChar: charVal*: char
of tokString: stringVal*: string
else: discard
 
Lexer* = object
input: string
pos: int
ln, col: int
 
LexicalError* = object of CatchableError
ln*, col*: int
 
proc error(lexer: var Lexer, message: string) =
  var err = newException(LexicalError, message)
  err.ln = lexer.ln
  err.col = lexer.col
  raise err
 
template current: char =
if lexer.pos < lexer.input.len: lexer.input[lexer.pos]
else: '\x00'
template get(n: int): string =
  if lexer.pos < lexer.input.len:
    lexer.input[lexer.pos ..
                min(lexer.pos + n - 1, lexer.input.high)]
  else: ""
 
template next() =
inc(lexer.pos); inc(lexer.col)
if current() == '\n':
inc(lexer.ln)
lexer.col = 0
elif current() == '\r':
lexer.col = 0
 
proc skip(lexer: var Lexer) =
while true:
if current() in Whitespace:
while current() in Whitespace:
next()
continue
elif get(2) == "/*":
next(); next()
while get(2) != "*/":
if current() == '\x00':
lexer.error("Unterminated comment")
next()
next(); next()
continue
else: discard
break
 
proc charOrEscape(lexer: var Lexer): char =
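  ## Consume one character, decoding the two supported escape sequences
  ## (\n and \\); report a lexical error for any other escape.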
if current() != '\\':
result = current()
next()
else:
next()
case current()
of 'n': result = '\n'
of '\\': result = '\\'
else: lexer.error("Unknown escape sequence '\\" & current() & "'")
next()
 
proc next*(lexer: var Lexer): Token =
let
ln = lexer.ln
col = lexer.col
 
case current()
of '*': result = Token(kind: tokMult); next()
of '/': result = Token(kind: tokDiv); next()
of '%': result = Token(kind: tokMod); next()
of '+': result = Token(kind: tokAdd); next()
of '-': result = Token(kind: tokSub); next()
  of '<':
    next()
    if current() == '=':
      result = Token(kind: tokLessEq)
      next()
    else:
      result = Token(kind: tokLess)
of '>':
next()
if current() == '=':
result = Token(kind: tokGreaterEq)
next()
else:
result = Token(kind: tokGreater)
of '=':
next()
if current() == '=':
result = Token(kind: tokEq)
next()
else:
result = Token(kind: tokAssign)
of '!':
next()
if current() == '=':
result = Token(kind: tokNotEq)
next()
else:
result = Token(kind: tokNot)
of '&':
next()
if current() == '&':
result = Token(kind: tokAnd)
next()
else:
lexer.error("'&&' expected")
of '|':
next()
if current() == '|':
result = Token(kind: tokOr)
next()
else:
lexer.error("'||' expected")
of '(': result = Token(kind: tokLPar); next()
of ')': result = Token(kind: tokRPar); next()
of '{': result = Token(kind: tokLBrace); next()
of '}': result = Token(kind: tokRBrace); next()
of ';': result = Token(kind: tokSemi); next()
of ',': result = Token(kind: tokComma); next()
of '\'':
next()
if current() == '\'': lexer.error("Empty character literal")
let ch = lexer.charOrEscape()
if current() != '\'':
lexer.error("Character literal must contain a single character or " &
"escape sequence")
result = Token(kind: tokChar, charVal: ch)
next()
of '0'..'9':
var number = ""
while current() in Digits:
number.add(current())
next()
if current() in IdentStartChars:
lexer.error("Integer literal ends in non-digit characters")
result = Token(kind: tokInt, intVal: parseInt(number))
of '"':
next()
var str = ""
while current() notin {'"', '\x00', '\n'}:
str.add(lexer.charOrEscape())
if current() == '\x00':
lexer.error("Unterminated string literal")
elif current() == '\n':
lexer.error("Line feed in string literal")
else:
next()
result = Token(kind: tokString, stringVal: str)
of IdentStartChars:
var ident = $current()
next()
while current() in IdentChars:
ident.add(current())
next()
case ident
of "if": result = Token(kind: tokIf)
of "else": result = Token(kind: tokElse)
of "while": result = Token(kind: tokWhile)
of "print": result = Token(kind: tokPrint)
of "putc": result = Token(kind: tokPutc)
else: result = Token(kind: tokIdent, ident: ident)
of '\x00':
result = Token(kind: tokEnd)
else:
lexer.error("Unexpected character: '" & current() & "'")
 
result.ln = ln
result.col = col
lexer.skip()
 
proc peek*(lexer: var Lexer): Token =
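  ## Left unimplemented; the command-line driver below only uses `next`.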
discard
 
proc initLexer*(input: string): Lexer =
result = Lexer(input: input, pos: 0, ln: 1, col: 1)
result.skip()
 
when isMainModule:
let code = readAll(stdin)
var
lexer = initLexer(code)
token: Token
while true:
token = lexer.next()
stdout.write(token.ln, ' ', token.col, ' ', token.kind)
case token.kind
of tokInt: stdout.write(' ', token.intVal)
of tokChar: stdout.write(' ', token.charVal.ord)
of tokString: stdout.write(" \"", token.stringVal
.replace("\\", "\\\\")
.replace("\n", "\\n"), '"')
of tokIdent: stdout.write(' ', token.ident)
else: discard
stdout.write('\n')
if token.kind == tokEnd:
break</syntaxhighlight>
 
=={{header|ObjectIcon}}==
{{trans|Icon}}
{{trans|ATS}}
 
 
There are very few changes from the ordinary Icon version: I/O is modified to use FileStreams, and the '''max''' procedure is removed because Object Icon provides '''max''' as a builtin.
 
 
<syntaxhighlight lang="objecticon"># -*- ObjectIcon -*-
#
# The Rosetta Code lexical analyzer in Object Icon. Based upon the ATS
# implementation.
#
# Usage: lex [INPUTFILE [OUTPUTFILE]]
# If INPUTFILE or OUTPUTFILE is "-" or missing, then standard input
# or standard output is used, respectively.
#
 
import io
 
$define EOF -1
 
$define TOKEN_ELSE 0
$define TOKEN_IF 1
$define TOKEN_PRINT 2
$define TOKEN_PUTC 3
$define TOKEN_WHILE 4
$define TOKEN_MULTIPLY 5
$define TOKEN_DIVIDE 6
$define TOKEN_MOD 7
$define TOKEN_ADD 8
$define TOKEN_SUBTRACT 9
$define TOKEN_NEGATE 10
$define TOKEN_LESS 11
$define TOKEN_LESSEQUAL 12
$define TOKEN_GREATER 13
$define TOKEN_GREATEREQUAL 14
$define TOKEN_EQUAL 15
$define TOKEN_NOTEQUAL 16
$define TOKEN_NOT 17
$define TOKEN_ASSIGN 18
$define TOKEN_AND 19
$define TOKEN_OR 20
$define TOKEN_LEFTPAREN 21
$define TOKEN_RIGHTPAREN 22
$define TOKEN_LEFTBRACE 23
$define TOKEN_RIGHTBRACE 24
$define TOKEN_SEMICOLON 25
$define TOKEN_COMMA 26
$define TOKEN_IDENTIFIER 27
$define TOKEN_INTEGER 28
$define TOKEN_STRING 29
$define TOKEN_END_OF_INPUT 30
 
global whitespace
global ident_start
global ident_continuation
 
procedure main(args)
local inpf, outf
local pushback_buffer, inp, pushback
 
initial {
whitespace := ' \t\v\f\r\n'
ident_start := '_' ++ &letters
ident_continuation := ident_start ++ &digits
}
 
inpf := FileStream.stdin
outf := FileStream.stdout
if 1 <= *args & args[1] ~== "-" then {
inpf := FileStream(args[1], FileOpt.RDONLY) | stop(&why)
}
if 2 <= *args & args[2] ~== "-" then {
outf := FileStream(args[2], ior(FileOpt.WRONLY,
FileOpt.TRUNC,
FileOpt.CREAT)) | stop(&why)
}
 
pushback_buffer := []
inp := create inputter(inpf, pushback_buffer)
pushback := create repeat push(pushback_buffer, \@&source)
@pushback # The first invocation does nothing.
 
scan_text(outf, inp, pushback)
end
 
procedure scan_text(outf, inp, pushback)
local ch
 
while /ch | ch[1] ~=== EOF do {
skip_spaces_and_comments(inp, pushback)
ch := @inp
if ch[1] === EOF then {
print_token(outf, [TOKEN_END_OF_INPUT, "", ch[2], ch[3]])
} else {
ch @pushback
print_token(outf, get_next_token(inp, pushback))
}
}
end
 
procedure get_next_token(inp, pushback)
local ch, ch1
local ln, cn
 
skip_spaces_and_comments(inp, pushback)
ch := @inp
ln := ch[2] # line number
cn := ch[3] # column number
case ch[1] of {
"," : return [TOKEN_COMMA, ",", ln, cn]
";" : return [TOKEN_SEMICOLON, ";", ln, cn]
"(" : return [TOKEN_LEFTPAREN, "(", ln, cn]
")" : return [TOKEN_RIGHTPAREN, ")", ln, cn]
"{" : return [TOKEN_LEFTBRACE, "{", ln, cn]
"}" : return [TOKEN_RIGHTBRACE, "}", ln, cn]
"*" : return [TOKEN_MULTIPLY, "*", ln, cn]
"/" : return [TOKEN_DIVIDE, "/", ln, cn]
"%" : return [TOKEN_MOD, "%", ln, cn]
"+" : return [TOKEN_ADD, "+", ln, cn]
"-" : return [TOKEN_SUBTRACT, "-", ln, cn]
"<" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_LESSEQUAL, "<=", ln, cn]
} else {
ch1 @pushback
return [TOKEN_LESS, "<", ln, cn]
}
}
">" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_GREATEREQUAL, ">=", ln, cn]
} else {
ch1 @pushback
return [TOKEN_GREATER, ">", ln, cn]
}
}
"=" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_EQUAL, "==", ln, cn]
} else {
ch1 @pushback
return [TOKEN_ASSIGN, "=", ln, cn]
}
}
"!" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_NOTEQUAL, "!=", ln, cn]
} else {
ch1 @pushback
return [TOKEN_NOT, "!", ln, cn]
}
}
"&" : {
ch1 := @inp
if ch1[1] === "&" then {
return [TOKEN_AND, "&&", ln, cn]
} else {
unexpected_character(ln, cn, ch)
}
}
"|" : {
ch1 := @inp
if ch1[1] === "|" then {
return [TOKEN_OR, "||", ln, cn]
} else {
unexpected_character(ln, cn, ch)
}
}
"\"" : {
ch @pushback
return scan_string_literal(inp)
}
"'" : {
ch @pushback
return scan_character_literal(inp, pushback)
}
default : {
if any(&digits, ch[1]) then {
ch @pushback
return scan_integer_literal(inp, pushback)
} else if any(ident_start, ch[1]) then {
ch @pushback
return scan_identifier_or_reserved_word (inp, pushback)
} else {
unexpected_character(ln, cn, ch)
}
}
}
end
 
procedure scan_identifier_or_reserved_word(inp, pushback)
local ch
local s
local line_no, column_no
 
s := ""
ch := @inp
line_no := ch[2]
column_no := ch[3]
while EOF ~=== ch[1] & any(ident_continuation, ch[1]) do {
s ||:= ch[1]
ch := @inp
}
ch @pushback
return reserved_word_lookup (s, line_no, column_no)
end
 
procedure scan_integer_literal(inp, pushback)
local ch
local s
local line_no, column_no
 
s := ""
ch := @inp
line_no := ch[2]
column_no := ch[3]
while EOF ~=== ch[1] & any(ident_continuation, ch[1]) do {
s ||:= ch[1]
ch := @inp
}
ch @pushback
not upto(~&digits, s) | invalid_integer_literal(line_no, column_no, s)
return [TOKEN_INTEGER, s, line_no, column_no]
end
 
procedure scan_character_literal(inp, pushback)
local ch, ch1
local close_quote
local toktup
local line_no, column_no
 
ch := @inp # The opening quote.
close_quote := ch[1] # Same as the opening quote.
ch @pushback
 
line_no := ch[2]
column_no := ch[3]
 
toktup := scan_character_literal_without_checking_end(inp)
ch1 := @inp
if ch1[1] ~=== close_quote then {
repeat {
case ch1[1] of {
EOF : unterminated_character_literal(line_no, column_no)
close_quote : multicharacter_literal(line_no, column_no)
default : ch1 := @inp
}
}
}
return toktup
end
 
procedure scan_character_literal_without_checking_end(inp)
local ch, ch1, ch2
 
ch := @inp # The opening quote.
ch1 := @inp
EOF ~=== ch1[1] | unterminated_character_literal(ch[2], ch[3])
if ch1[1] == "\\" then {
ch2 := @inp
EOF ~=== ch2[1] | unterminated_character_literal(ch[2], ch[3])
case ch2[1] of {
"n" : return [TOKEN_INTEGER, string(ord("\n")), ch[2], ch[3]]
"\\" : return [TOKEN_INTEGER, string(ord("\\")), ch[2], ch[3]]
default : unsupported_escape(ch1[2], ch1[3], ch2)
}
} else {
return [TOKEN_INTEGER, string(ord(ch1[1])), ch[2], ch[3]]
}
end
 
procedure scan_string_literal(inp)
local ch, ch1, ch2
local line_no, column_no
local close_quote
local s
local retval
 
ch := @inp # The opening quote
close_quote := ch[1] # Same as the opening quote.
line_no := ch[2]
column_no := ch[3]
 
s := ch[1]
until \retval do {
ch1 := @inp
ch1[1] ~=== EOF |
unterminated_string_literal (line_no, column_no,
"end of input")
ch1[1] ~== "\n" |
unterminated_string_literal (line_no, column_no,
"end of line")
if ch1[1] == close_quote then {
retval := [TOKEN_STRING, s || close_quote, line_no, column_no]
} else if ch1[1] ~== "\\" then {
s ||:= ch1[1]
} else {
ch2 := @inp
EOF ~=== ch2[1] | unsupported_escape(line_no, column_no, ch2)
case ch2[1] of {
"n" : s ||:= "\\n"
"\\" : s ||:= "\\\\"
default : unsupported_escape(line_no, column_no, ch2)
}
}
}
return retval
end
 
procedure skip_spaces_and_comments(inp, pushback)
local ch, ch1
 
repeat {
ch := @inp
(EOF === ch[1]) & { ch @pushback; return }
if not any(whitespace, ch[1]) then {
(ch[1] == "/") | { ch @pushback; return }
(ch1 := @inp) | { ch @pushback; return }
(ch1[1] == "*") | { ch1 @pushback; ch @pushback; return }
scan_comment(inp, ch[2], ch[3])
}
}
end
 
procedure scan_comment(inp, line_no, column_no)
local ch, ch1
 
until (\ch)[1] == "*" & (\ch1)[1] == "/" do {
ch := @inp
(EOF === ch[1]) & unterminated_comment(line_no, column_no)
if ch[1] == "*" then {
ch1 := @inp
         (EOF === ch1[1]) & unterminated_comment(line_no, column_no)
}
}
return
end
 
procedure reserved_word_lookup(s, line_no, column_no)
 
# Lookup is by an extremely simple perfect hash.
 
static reserved_words
static reserved_word_tokens
local hashval, token, toktup
 
initial {
reserved_words := ["if", "print", "else",
"", "putc", "",
"", "while", ""]
reserved_word_tokens :=
[TOKEN_IF, TOKEN_PRINT, TOKEN_ELSE,
TOKEN_IDENTIFIER, TOKEN_PUTC, TOKEN_IDENTIFIER,
TOKEN_IDENTIFIER, TOKEN_WHILE, TOKEN_IDENTIFIER]
}
 
if *s < 2 then {
toktup := [TOKEN_IDENTIFIER, s, line_no, column_no]
} else {
hashval := ((ord(s[1]) + ord(s[2])) % (*reserved_words)) + 1
token := reserved_word_tokens[hashval]
if token = TOKEN_IDENTIFIER | s ~== reserved_words[hashval] then {
toktup := [TOKEN_IDENTIFIER, s, line_no, column_no]
} else {
toktup := [token, s, line_no, column_no]
}
}
return toktup
end
 
procedure print_token(outf, toktup)
static token_names
local s_line, s_column
 
initial {
token_names := ["Keyword_else",
"Keyword_if",
"Keyword_print",
"Keyword_putc",
"Keyword_while",
"Op_multiply",
"Op_divide",
"Op_mod",
"Op_add",
"Op_subtract",
"Op_negate",
"Op_less",
"Op_lessequal",
"Op_greater",
"Op_greaterequal",
"Op_equal",
"Op_notequal",
"Op_not",
"Op_assign",
"Op_and",
"Op_or",
"LeftParen",
"RightParen",
"LeftBrace",
"RightBrace",
"Semicolon",
"Comma",
"Identifier",
"Integer",
"String",
"End_of_input"]
}
 
/outf := FileStream.stdout
s_line := string(toktup[3])
s_column := string(toktup[4])
writes(outf, right (s_line, max(5, *s_line)))
writes(outf, " ")
writes(outf, right (s_column, max(5, *s_column)))
writes(outf, " ")
writes(outf, token_names[toktup[1] + 1])
case toktup[1] of {
TOKEN_IDENTIFIER : writes(outf, " ", toktup[2])
TOKEN_INTEGER : writes(outf, " ", toktup[2])
TOKEN_STRING : writes(outf, " ", toktup[2])
}
write(outf)
return
end
 
procedure inputter(inpf, pushback_buffer)
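   # Generate {character-or-EOF, line, column} triples, serving any
   # entries waiting in pushback_buffer first. Input is read in
   # 2048-byte chunks; once it is exhausted, EOF triples are produced.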
local buffer
local line_no, column_no
local c
 
buffer := ""
line_no := 1
column_no := 1
 
repeat {
buffer? {
until *pushback_buffer = 0 & pos(0) do {
if *pushback_buffer ~= 0 then {
suspend pop(pushback_buffer)
} else {
c := move(1)
suspend [c, line_no, column_no]
if c == "\n" then {
line_no +:= 1
column_no := 1
} else {
column_no +:= 1
}
}
}
}
(buffer := reads(inpf, 2048)) |
suspend [EOF, line_no, column_no]
}
end
 
procedure unterminated_comment(line_no, column_no)
error("unterminated comment starting at ",
line_no, ":", column_no)
end
 
procedure unexpected_character(line_no, column_no, ch)
error("unexpected character '", ch[1], "' starting at ",
line_no, ":", column_no)
end
 
procedure unterminated_string_literal (line_no, column_no, cause)
error("unterminated string literal (", cause, ") starting at ",
line_no, ":", column_no)
end
 
procedure unsupported_escape (line_no, column_no, ch)
if ch[1] === EOF then {
error("unexpected \\ at end of input",
" starting at ", line_no, ":", column_no)
} else {
error("unsupported escape \\", ch[1],
" starting at ", line_no, ":", column_no)
}
end
 
procedure invalid_integer_literal(line_no, column_no, s)
error("invalid integer literal ", s,
" starting at ", line_no, ":", column_no)
end
 
procedure unterminated_character_literal(line_no, column_no)
error("unterminated character literal starting at ",
line_no, ":", column_no)
end
 
procedure multicharacter_literal(line_no, column_no)
error("unsupported multicharacter literal starting at ",
line_no, ":", column_no)
end
 
procedure error(args[])
write!([FileStream.stderr] ||| args)
exit(1)
end</syntaxhighlight>
 
 
{{out}}
<pre>$ oit -s -o lex lex-in-ObjectIcon.icn && ./lex compiler-tests/testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|OCaml}}==
{{works with|OCaml|4.12.1}}
{{trans|ATS}}
 
This is a close translation of the ATS. It may interest the reader to compare the two implementations.
 
(Much of the extra complication in the ATS comes from arrays being a linear type (whose "views" need tending), and from values of linear type having to be local to any function using them. This limitation could have been worked around, and arrays more similar to OCaml arrays could have been used, but at a cost in safety and efficiency.)
 
<syntaxhighlight lang="ocaml">(*------------------------------------------------------------------*)
(* The Rosetta Code lexical analyzer, in OCaml. Based on the ATS. *)
 
(* When you compare this code to the ATS code, please keep in mind
that, although ATS has an ML-like syntax:
 
* The type system is not the same at all.
 
* Most ATS functions are not closures. Those that are will have
special notations such as "<cloref1>" associated with them. *)
 
(*------------------------------------------------------------------*)
(* The following functions are compatible with ASCII. *)
 
let is_digit ichar =
48 <= ichar && ichar <= 57
 
let is_lower ichar =
97 <= ichar && ichar <= 122
 
let is_upper ichar =
65 <= ichar && ichar <= 90
 
let is_alpha ichar =
is_lower ichar || is_upper ichar
 
let is_alnum ichar =
is_digit ichar || is_alpha ichar
 
let is_ident_start ichar =
is_alpha ichar || ichar = 95
 
let is_ident_continuation ichar =
is_alnum ichar || ichar = 95
 
let is_space ichar =
ichar = 32 || (9 <= ichar && ichar <= 13)
 
(*------------------------------------------------------------------*)
 
let reverse_list_to_string lst =
List.rev lst |> List.to_seq |> String.of_seq
 
(*------------------------------------------------------------------*)
(* Character input more like that of C: a character is returned as an
   int, with a negative value for end of input. There are various
   advantages and disadvantages to this method, but key points in its
   favor are: (a) it is how character input is done in the original
   ATS code, (b) Unicode code points are 21-bit positive integers, so
   they and the EOF sentinel all fit comfortably in an int. *)
 
let eof = (-1)
 
let input_ichar channel =
try
int_of_char (input_char channel)
with
| End_of_file -> eof
 
(*------------------------------------------------------------------*)
 
(* The type of an input character. *)
 
module Ch =
struct
type t =
{
ichar : int;
line_no : int;
column_no : int
}
end
 
(*------------------------------------------------------------------*)
(* Inputting with unlimited pushback, and with counting of lines and
columns. *)
 
module Inp =
struct
type t =
{
inpf : in_channel;
pushback : Ch.t list;
line_no : int;
column_no : int
}
 
let of_in_channel inpf =
{ inpf = inpf;
pushback = [];
line_no = 1;
column_no = 1
}
 
let get_ch inp =
match inp.pushback with
| ch :: tail ->
(ch, {inp with pushback = tail})
| [] ->
let ichar = input_ichar inp.inpf in
if ichar = int_of_char '\n' then
({ ichar = ichar;
line_no = inp.line_no;
column_no = inp.column_no },
{ inp with line_no = inp.line_no + 1;
column_no = 1 })
else
({ ichar = ichar;
line_no = inp.line_no;
column_no = inp.column_no },
{ inp with column_no = inp.column_no + 1 })
 
let push_back_ch ch inp =
{inp with pushback = ch :: inp.pushback}
end
 
(*------------------------------------------------------------------*)
(* Tokens, appearing in tuples with arguments, and with line and
column numbers. The tokens are integers, so they can be used as
array indices. *)
 
(* (token, argument, line_no, column_no) *)
type toktup_t = int * string * int * int
 
let token_ELSE = 0
let token_IF = 1
let token_PRINT = 2
let token_PUTC = 3
let token_WHILE = 4
let token_MULTIPLY = 5
let token_DIVIDE = 6
let token_MOD = 7
let token_ADD = 8
let token_SUBTRACT = 9
let token_NEGATE = 10
let token_LESS = 11
let token_LESSEQUAL = 12
let token_GREATER = 13
let token_GREATEREQUAL = 14
let token_EQUAL = 15
let token_NOTEQUAL = 16
let token_NOT = 17
let token_ASSIGN = 18
let token_AND = 19
let token_OR = 20
let token_LEFTPAREN = 21
let token_RIGHTPAREN = 22
let token_LEFTBRACE = 23
let token_RIGHTBRACE = 24
let token_SEMICOLON = 25
let token_COMMA = 26
let token_IDENTIFIER = 27
let token_INTEGER = 28
let token_STRING = 29
let token_END_OF_INPUT = 30
;;
 
(* A *very* simple perfect hash for the reserved words. (Yes, this is
overkill, except for demonstration of the principle.) *)
 
let reserved_words =
[| "if"; "print"; "else"; ""; "putc"; ""; ""; "while"; "" |]
let reserved_word_tokens =
[| token_IF; token_PRINT; token_ELSE; token_IDENTIFIER;
token_PUTC; token_IDENTIFIER; token_IDENTIFIER; token_WHILE;
token_IDENTIFIER |]
 
let reserved_word_lookup s line_no column_no =
if String.length s < 2 then
(token_IDENTIFIER, s, line_no, column_no)
else
let hashval = (int_of_char s.[0] + int_of_char s.[1]) mod 9 in
let token = reserved_word_tokens.(hashval) in
if token = token_IDENTIFIER || s <> reserved_words.(hashval) then
(token_IDENTIFIER, s, line_no, column_no)
else
(token, s, line_no, column_no)
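
(* A small, optional sanity check of the simple perfect hash above
   (added purely as an illustration; it is not part of the original
   scheme): every reserved word must land in its own slot. For
   example, for "if", (105 + 102) mod 9 = 0 and reserved_words.(0)
   is "if". *)
let () =
  List.iter
    (fun s ->
       let hashval = (int_of_char s.[0] + int_of_char s.[1]) mod 9 in
       assert (String.equal reserved_words.(hashval) s))
    ["if"; "print"; "else"; "putc"; "while"]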
 
(* Token to string lookup. *)
 
let token_names =
[| "Keyword_else";
"Keyword_if";
"Keyword_print";
"Keyword_putc";
"Keyword_while";
"Op_multiply";
"Op_divide";
"Op_mod";
"Op_add";
"Op_subtract";
"Op_negate";
"Op_less";
"Op_lessequal";
"Op_greater";
"Op_greaterequal";
"Op_equal";
"Op_notequal";
"Op_not";
"Op_assign";
"Op_and";
"Op_or";
"LeftParen";
"RightParen";
"LeftBrace";
"RightBrace";
"Semicolon";
"Comma";
"Identifier";
"Integer";
"String";
"End_of_input" |]
 
let token_name token =
token_names.(token)
 
(*------------------------------------------------------------------*)
 
exception Unterminated_comment of int * int
exception Unterminated_character_literal of int * int
exception Multicharacter_literal of int * int
exception End_of_input_in_string_literal of int * int
exception End_of_line_in_string_literal of int * int
exception Unsupported_escape of int * int * int
exception Invalid_integer_literal of int * int * string
exception Unexpected_character of int * int * char
 
(*------------------------------------------------------------------*)
(* Skipping past spaces and comments. (A comment in the target
language is, if you think about it, a kind of space.) *)
 
let scan_comment inp line_no column_no =
let rec loop inp =
let (ch, inp) = Inp.get_ch inp in
if ch.ichar = eof then
raise (Unterminated_comment (line_no, column_no))
else if ch.ichar = int_of_char '*' then
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = eof then
raise (Unterminated_comment (line_no, column_no))
else if ch1.ichar = int_of_char '/' then
inp
else
loop inp
else
loop inp
in
loop inp
 
let skip_spaces_and_comments inp =
let rec loop inp =
let (ch, inp) = Inp.get_ch inp in
if is_space ch.ichar then
loop inp
else if ch.ichar = int_of_char '/' then
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '*' then
scan_comment inp ch.line_no ch.column_no |> loop
else
let inp = Inp.push_back_ch ch1 inp in
let inp = Inp.push_back_ch ch inp in
inp
else
Inp.push_back_ch ch inp
in
loop inp
 
(*------------------------------------------------------------------*)
(* Integer literals, identifiers, and reserved words. *)
 
(* In ATS the predicate for simple scan was supplied by template
expansion, which (typically) produces faster code than passing a
function or closure (although passing either of those could have
been done). Here I pass the predicate as a function/closure. It is
worth contrasting the methods. *)
let rec simple_scan pred lst inp =
let (ch, inp) = Inp.get_ch inp in
if pred ch.ichar then
simple_scan pred (char_of_int ch.ichar :: lst) inp
else
(lst, Inp.push_back_ch ch inp)
 
(* Demonstration of one way to make a new closure in OCaml. (In ATS,
one might see things that look similar but are actually template
operations.) *)
let simple_scan_iic = simple_scan is_ident_continuation
 
let scan_integer_literal inp =
let (ch, inp) = Inp.get_ch inp in
let _ = assert (is_digit ch.ichar) in
let (lst, inp) = simple_scan_iic [char_of_int ch.ichar] inp in
let s = reverse_list_to_string lst in
if List.for_all (fun c -> is_digit (int_of_char c)) lst then
((token_INTEGER, s, ch.line_no, ch.column_no), inp)
else
raise (Invalid_integer_literal (ch.line_no, ch.column_no, s))
 
let scan_identifier_or_reserved_word inp =
let (ch, inp) = Inp.get_ch inp in
let _ = assert (is_ident_start ch.ichar) in
let (lst, inp) = simple_scan_iic [char_of_int ch.ichar] inp in
let s = reverse_list_to_string lst in
let toktup = reserved_word_lookup s ch.line_no ch.column_no in
(toktup, inp)
 
(*------------------------------------------------------------------*)
(* String literals. *)
 
let scan_string_literal inp =
let (ch, inp) = Inp.get_ch inp in
let _ = assert (ch.ichar = int_of_char '"') in
 
let rec scan lst inp =
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = eof then
raise (End_of_input_in_string_literal
(ch.line_no, ch.column_no))
else if ch1.ichar = int_of_char '\n' then
raise (End_of_line_in_string_literal
(ch.line_no, ch.column_no))
else if ch1.ichar = int_of_char '"' then
(lst, inp)
else if ch1.ichar <> int_of_char '\\' then
scan (char_of_int ch1.ichar :: lst) inp
else
let (ch2, inp) = Inp.get_ch inp in
if ch2.ichar = int_of_char 'n' then
scan ('n' :: '\\' :: lst) inp
else if ch2.ichar = int_of_char '\\' then
scan ('\\' :: '\\' :: lst) inp
else
raise (Unsupported_escape (ch1.line_no, ch1.column_no,
ch2.ichar))
in
let lst = '"' :: [] in
let (lst, inp) = scan lst inp in
let lst = '"' :: lst in
let s = reverse_list_to_string lst in
((token_STRING, s, ch.line_no, ch.column_no), inp)
 
(*------------------------------------------------------------------*)
(* Character literals. *)
 
let scan_character_literal_without_checking_end inp =
let (ch, inp) = Inp.get_ch inp in
let _ = assert (ch.ichar = int_of_char '\'') in
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = eof then
raise (Unterminated_character_literal
(ch.line_no, ch.column_no))
else if ch1.ichar = int_of_char '\\' then
let (ch2, inp) = Inp.get_ch inp in
if ch2.ichar = eof then
raise (Unterminated_character_literal
(ch.line_no, ch.column_no))
else if ch2.ichar = int_of_char 'n' then
let s = (int_of_char '\n' |> string_of_int) in
((token_INTEGER, s, ch.line_no, ch.column_no), inp)
else if ch2.ichar = int_of_char '\\' then
let s = (int_of_char '\\' |> string_of_int) in
((token_INTEGER, s, ch.line_no, ch.column_no), inp)
else
raise (Unsupported_escape
(ch1.line_no, ch1.column_no, ch2.ichar))
else
let s = string_of_int ch1.ichar in
((token_INTEGER, s, ch.line_no, ch.column_no), inp)
 
let scan_character_literal inp =
let (toktup, inp) =
scan_character_literal_without_checking_end inp in
let (_, _, line_no, column_no) = toktup in
 
let check_end inp =
let (ch, inp) = Inp.get_ch inp in
if ch.ichar = int_of_char '\'' then
inp
else
let rec loop_to_end (ch1 : Ch.t) inp =
if ch1.ichar = eof then
raise (Unterminated_character_literal (line_no, column_no))
else if ch1.ichar = int_of_char '\'' then
raise (Multicharacter_literal (line_no, column_no))
else
let (ch1, inp) = Inp.get_ch inp in
loop_to_end ch1 inp
in
loop_to_end ch inp
in
let inp = check_end inp in
(toktup, inp)
 
(*------------------------------------------------------------------*)
 
let get_next_token inp =
let inp = skip_spaces_and_comments inp in
let (ch, inp) = Inp.get_ch inp in
let ln = ch.line_no in
let cn = ch.column_no in
if ch.ichar = eof then
((token_END_OF_INPUT, "", ln, cn), inp)
else
match char_of_int ch.ichar with
| ',' -> ((token_COMMA, ",", ln, cn), inp)
| ';' -> ((token_SEMICOLON, ";", ln, cn), inp)
| '(' -> ((token_LEFTPAREN, "(", ln, cn), inp)
| ')' -> ((token_RIGHTPAREN, ")", ln, cn), inp)
| '{' -> ((token_LEFTBRACE, "{", ln, cn), inp)
| '}' -> ((token_RIGHTBRACE, "}", ln, cn), inp)
| '*' -> ((token_MULTIPLY, "*", ln, cn), inp)
| '/' -> ((token_DIVIDE, "/", ln, cn), inp)
| '%' -> ((token_MOD, "%", ln, cn), inp)
| '+' -> ((token_ADD, "+", ln, cn), inp)
| '-' -> ((token_SUBTRACT, "-", ln, cn), inp)
| '<' ->
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '=' then
((token_LESSEQUAL, "<=", ln, cn), inp)
else
let inp = Inp.push_back_ch ch1 inp in
((token_LESS, "<", ln, cn), inp)
| '>' ->
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '=' then
((token_GREATEREQUAL, ">=", ln, cn), inp)
else
let inp = Inp.push_back_ch ch1 inp in
((token_GREATER, ">", ln, cn), inp)
| '=' ->
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '=' then
((token_EQUAL, "==", ln, cn), inp)
else
let inp = Inp.push_back_ch ch1 inp in
((token_ASSIGN, "=", ln, cn), inp)
| '!' ->
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '=' then
((token_NOTEQUAL, "!=", ln, cn), inp)
else
let inp = Inp.push_back_ch ch1 inp in
((token_NOT, "!", ln, cn), inp)
| '&' ->
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '&' then
((token_AND, "&&", ln, cn), inp)
else
raise (Unexpected_character (ch.line_no, ch.column_no,
char_of_int ch.ichar))
| '|' ->
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '|' then
((token_OR, "||", ln, cn), inp)
else
raise (Unexpected_character (ch.line_no, ch.column_no,
char_of_int ch.ichar))
| '"' ->
let inp = Inp.push_back_ch ch inp in
scan_string_literal inp
| '\'' ->
let inp = Inp.push_back_ch ch inp in
scan_character_literal inp
| _ when is_digit ch.ichar ->
let inp = Inp.push_back_ch ch inp in
scan_integer_literal inp
| _ when is_ident_start ch.ichar ->
let inp = Inp.push_back_ch ch inp in
scan_identifier_or_reserved_word inp
| _ -> raise (Unexpected_character (ch.line_no, ch.column_no,
char_of_int ch.ichar))
 
let print_token outf toktup =
let (token, arg, line_no, column_no) = toktup in
let name = token_name token in
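(* The "match 0 with" below is just a guard cascade: tokens that carry a
   value (identifiers, integers, strings) get a separating space and the
   value text appended; every other token prints its name alone. *)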
let (padding, str) =
match 0 with
| _ when token = token_IDENTIFIER -> (" ", arg)
| _ when token = token_INTEGER -> (" ", arg)
| _ when token = token_STRING -> (" ", arg)
| _ -> ("", "")
in
Printf.fprintf outf "%5d %5d %s%s%s\n"
line_no column_no name padding str
 
let scan_text outf inp =
let rec loop inp =
let (toktup, inp) = get_next_token inp in
begin
print_token outf toktup;
let (token, _, _, _) = toktup in
if token <> token_END_OF_INPUT then
loop inp
end
in
loop inp
 
(*------------------------------------------------------------------*)
 
let main () =
let inpf_filename =
if 2 <= Array.length Sys.argv then
Sys.argv.(1)
else
"-"
in
let outf_filename =
if 3 <= Array.length Sys.argv then
Sys.argv.(2)
else
"-"
in
let inpf =
if inpf_filename = "-" then
stdin
else
open_in inpf_filename
in
let outf =
if outf_filename = "-" then
stdout
else
open_out outf_filename
in
let inp = Inp.of_in_channel inpf in
scan_text outf inp
;;
 
main ()
 
(*------------------------------------------------------------------*)</syntaxhighlight>
 
{{out}}
<pre>$ ocamlopt -O2 lex.ml && ./a.out compiler-tests/testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|Ol}}==
==== Source ====
Note: for simplicity, we do not print the line and column position of each token.
 
<syntaxhighlight lang="scheme">
(import (owl parse))
 
(define (get-comment)
(get-either
(let-parses (
(_ (get-imm #\*))
(_ (get-imm #\/)))
#true)
(let-parses (
(_ get-byte)
(_ (get-comment)))
#true)))
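; get-comment (above) succeeds once the closing "*/" has been consumed;
; otherwise it eats one byte and recurses. get-whitespace below treats
; comments as just another kind of skippable whitespace.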
 
(define get-whitespace
(get-any-of
(get-byte-if (lambda (x) (has? '(#\tab #\newline #\space #\return) x))) ; whitespace
(let-parses ( ; comment
(_ (get-imm #\/))
(_ (get-imm #\*))
(_ (get-comment)))
#true)))
 
(define get-operator
(let-parses (
(operator (get-any-of
(get-word "||" 'Op_or)
(get-word "&&" 'Op_and)
(get-word "!=" 'Op_notequal)
(get-word "==" 'Op_equal)
(get-word ">=" 'Op_greaterequal)
(get-word "<=" 'Op_lessequal)
 
(get-word "=" 'Op_assign)
(get-word "!" 'Op_not)
(get-word ">" 'Op_greater)
(get-word "<" 'Op_less)
(get-word "-" 'Op_subtract)
(get-word "+" 'Op_add)
(get-word "%" 'Op_mod)
(get-word "/" 'Op_divide)
(get-word "*" 'Op_multiply))))
(cons 'operator operator)))
 
(define get-symbol
(let-parses (
(symbol (get-any-of
(get-word "(" 'LeftParen)
(get-word ")" 'RightParen)
(get-word "{" 'LeftBrace)
(get-word "}" 'RightBrace)
(get-word ";" 'Semicolon)
(get-word "," 'Comma))))
(cons 'symbol symbol)))
 
(define get-keyword
(let-parses (
(keyword (get-any-of
(get-word "if" 'Keyword_if)
(get-word "else" 'Keyword_else)
(get-word "while" 'Keyword_while)
(get-word "print" 'Keyword_print)
(get-word "putc" 'Keyword_putc))))
(cons 'keyword keyword)))
 
 
 
(define get-identifier
(let-parses (
(lead (get-byte-if (lambda (x) (or (<= #\a x #\z) (<= #\A x #\Z) (= x #\_)))))
(tail (get-greedy* (get-byte-if (lambda (x) (or (<= #\a x #\z) (<= #\A x #\Z) (= x #\_) (<= #\0 x #\9)))))))
(cons 'identifier (bytes->string (cons lead tail)))))
 
(define get-integer
(let-parses (
(main (get-greedy+ (get-byte-if (lambda (x) (<= #\0 x #\9))))) )
(cons 'integer (string->integer (bytes->string main)))))
 
(define get-character
(let-parses (
(_ (get-imm #\'))
(char (get-any-of
(get-word "\\n" #\newline)
(get-word "\\\\" #\\)
(get-byte-if (lambda (x) (not (or (eq? x #\') (eq? x #\newline)))))))
(_ (get-imm #\')) )
(cons 'character char)))
 
(define get-string
(let-parses (
(_ (get-imm #\")) ;"
(data (get-greedy* (get-any-of
(get-word "\\n" #\newline)
(get-word "\\\\" #\\) ;\"
(get-byte-if (lambda (x) (not (or (eq? x #\") (eq? x #\newline)))))))) ;", newline
(_ (get-imm #\")) ) ;"
(cons 'string (bytes->string data))))
 
(define get-token
(let-parses (
(_ (get-greedy* get-whitespace))
(token (get-any-of
get-symbol
get-keyword
get-identifier
get-operator
get-integer
get-character
get-string
)) )
token))
 
(define token-parser
(let-parses (
(tokens (get-greedy+ get-token))
(_ (get-greedy* get-whitespace)))
tokens))
 
 
(define (translate source)
(let ((stream (try-parse token-parser (str-iter source) #t)))
(for-each print (car stream))
(if (null? (cdr stream))
(print 'End_of_input))))
</syntaxhighlight>
 
==== Testing ====
 
Testing function (identical to the one above, except that it forces the tail of the result stream):
<syntaxhighlight lang="scheme">
(define (translate source)
(let ((stream (try-parse token-parser (str-iter source) #t)))
(for-each print (car stream))
(if (null? (force (cdr stream)))
(print 'End_of_input))))
</syntaxhighlight>
 
====== Testcase 1 ======
 
<syntaxhighlight lang="scheme">
(translate "
/*
Hello world
*/
print(\"Hello, World!\\\\n\");
")</syntaxhighlight>
{{Out}}
<pre>
(keyword . Keyword_print)
(symbol . LeftParen)
(string . Hello, World!\n)
(symbol . RightParen)
(symbol . Semicolon)
End_of_input
</pre>
 
====== Testcase 2 ======
 
<syntaxhighlight lang="scheme">
(translate "
/*
Show Ident and Integers
*/
phoenix_number = 142857;
print(phoenix_number, \"\\\\n\");
")</syntaxhighlight>
{{Out}}
<pre>
(identifier . phoenix_number)
(operator . Op_assign)
(integer . 142857)
(symbol . Semicolon)
(keyword . Keyword_print)
(symbol . LeftParen)
(identifier . phoenix_number)
(symbol . Comma)
(string . \n)
(symbol . RightParen)
(symbol . Semicolon)
End_of_input
</pre>
 
====== Testcase 3 ======
 
<syntaxhighlight lang="scheme">
(translate "
/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
*/
/* Print */ print /* Sub */ -
/* Putc */ putc /* Lss */ <
/* If */ if /* Gtr */ >
/* Else */ else /* Leq */ <=
/* While */ while /* Geq */ >=
/* Lbrace */ { /* Eq */ ==
/* Rbrace */ } /* Neq */ !=
/* Lparen */ ( /* And */ &&
/* Rparen */ ) /* Or */ ||
/* Uminus */ - /* Semi */ ;
/* Not */ ! /* Comma */ ,
/* Mul */ * /* Assign */ =
/* Div */ / /* Integer */ 42
/* Mod */ % /* String */ \"String literal\"
/* Add */ + /* Ident */ variable_name
/* character literal */ '\\n'
/* character literal */ '\\\\'
/* character literal */ ' '
")</syntaxhighlight>
{{Out}}
<pre>
(keyword . Keyword_print)
(operator . Op_subtract)
(keyword . Keyword_putc)
(operator . Op_less)
(keyword . Keyword_if)
(operator . Op_greater)
(keyword . Keyword_else)
(operator . Op_lessequal)
(keyword . Keyword_while)
(operator . Op_greaterequal)
(symbol . LeftBrace)
(operator . Op_equal)
(symbol . RightBrace)
(operator . Op_notequal)
(symbol . LeftParen)
(operator . Op_and)
(symbol . RightParen)
(operator . Op_or)
(operator . Op_subtract)
(symbol . Semicolon)
(operator . Op_not)
(symbol . Comma)
(operator . Op_multiply)
(operator . Op_assign)
(operator . Op_divide)
(integer . 42)
(operator . Op_mod)
(string . String literal)
(operator . Op_add)
(identifier . variable_name)
(character . 10)
(character . 92)
(character . 32)
End_of_input
</pre>
 
====== Testcase 4 ======
 
<syntaxhighlight lang="scheme">
(translate "
/*** test printing, embedded \\\\n and comments with lots of '*' ***/
print(42);
print(\"\\\\nHello World\\\\nGood Bye\\\\nok\\\\n\");
print(\"Print a slash n - \\\\\\\\n.\\\\n\");
")
</syntaxhighlight>
{{Out}}
<pre>
(keyword . Keyword_print)
(symbol . LeftParen)
(integer . 42)
(symbol . RightParen)
(symbol . Semicolon)
(keyword . Keyword_print)
(symbol . LeftParen)
(string . \nHello World\nGood Bye\nok\n)
(symbol . RightParen)
(symbol . Semicolon)
(keyword . Keyword_print)
(symbol . LeftParen)
(string . Print a slash n - \\n.\n)
(symbol . RightParen)
(symbol . Semicolon)
End_of_input
</pre>
 
=={{header|Perl}}==
 
<syntaxhighlight lang="perl">#!/usr/bin/env perl
 
use strict;
use warnings;
no warnings 'once';
 
 
#----- Definition of the language to be lexed -----#
 
my @tokens = (
# Name | Format | Value #
# -------------- |----------------------|-------------#
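# Where one operator is a prefix of another (e.g. < and <=, ! and !=), the
# longer spelling must be listed first, because the scanner regex built
# below tries these alternatives in the order given here.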
['Op_multiply' , '*' , ],
['Op_divide' , '/' , ],
['Op_mod' , '%' , ],
['Op_add' , '+' , ],
['Op_subtract' , '-' , ],
['Op_lessequal' , '<=' , ],
['Op_less' , '<' , ],
['Op_greaterequal', '>=' , ],
['Op_greater' , '>' , ],
['Op_equal' , '==' , ],
['Op_assign' , '=' , ],
['Op_notequal' , '!=' , ],
['Op_not' , '!' , ],
['Op_and' , '&&' , ],
['Op_or' , '||' , ],
['Keyword_else' , qr/else\b/ , ],
['Keyword_if' , qr/if\b/ , ],
['Keyword_while' , qr/while\b/ , ],
['Keyword_print' , qr/print\b/ , ],
['Keyword_putc' , qr/putc\b/ , ],
 
['LeftParen' , '(' , ],
['RightParen' , ')' , ],
['LeftBrace' , '{' , ],
['RightBrace' , '}' , ],
['Semicolon' , ';' , ],
['Comma' , ',' , ],
 
['Identifier' , qr/[_a-z][_a-z0-9]*/i, \&raw ],
['Integer' , qr/[0-9]+\b/ , \&raw ],
['Integer' , qr/'([^']*)(')?/ , \&char_val ],
['String' , qr/"([^"]*)(")?/ , \&string_raw],
 
['End_of_input' , qr/$/ , ],
);
 
my $comment = qr/\/\* .+? (?: \*\/ | $ (?{die "End-of-file in comment\n"}) )/xs;
my $whitespace = qr/(?: \s | $comment)*/x;
my $unrecognized = qr/\w+ | ./x;
 
#| Returns the value of a matched char literal, or dies if it is invalid
sub char_val {
my $str = string_val();
die "Multiple characters\n" if length $str > 1;
die "No character\n" if length $str == 0;
ord $str;
}
 
#| Returns the value of a matched string literal, or dies if it is invalid
sub string_val {
my ($str, $end) = ($1, $2);
die "End-of-file\n" if not defined $end;
die "End-of-line\n" if $str =~ /\n/;
$str =~ s/\\(.)/
$1 eq 'n' ? "\n"
: $1 eq '\\' ? $1
: $1 eq $end ? $1
: die "Unknown escape sequence \\$1\n"
/rge;
}
 
#| Returns the source string of a matched literal
sub raw { $& }
 
#| Returns the source string of a matched string literal, or dies if invalid
sub string_raw {
string_val(); # Just for the error handling side-effects
$&;
}
 
 
#----- Lexer "engine" -----#
 
# Construct the scanner regex:
 
my $tokens =
join "|",
map {
my $format = $tokens[$_][1];
"\n".(ref $format ? $format : quotemeta $format)." (*MARK:$_) ";
} 0..$#tokens;
 
my $regex = qr/
\G (?| $whitespace \K (?| $tokens )
| $whitespace? \K ($unrecognized) (*MARK:!) )
/x;
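# In the regex above, \G anchors each attempt where the previous match
# ended, \K keeps the skipped whitespace/comments out of $&, and each
# (*MARK:n) records which alternative matched, so $REGMARK later tells us
# the token type (or '!' for an unrecognized token).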
 
 
# Run the lexer:
 
my $input = do { local $/ = undef; <STDIN> };
my $pos = 0;
my $linecol = linecol_accumulator();
 
while ($input =~ /$regex/g) {
# Get the line and column number
my ($line, $col) = $linecol->(substr $input, $pos, $-[0] - $pos);
$pos = $-[0];
 
# Get the token type that was identified by the scanner regex
my $type = $main::REGMARK;
die "Unrecognized token $1 at line $line, col $col\n" if $type eq '!';
my ($name, $evaluator) = @{$tokens[$type]}[0, 2];
 
# Get the token value
my $value;
if ($evaluator) {
eval { $value = $evaluator->() };
if ($@) { chomp $@; die "$@ in $name at line $line, col $col\n" }
}
 
# Print the output line
print "$line\t$col\t$name".($value ? "\t$value" : '')."\n";
}
 
#| Returns a closure, which can be fed a string one piece at a time and gives
#| back the cumulative line and column number each time
sub linecol_accumulator {
my ($line, $col) = (1, 1);
sub {
my $str = shift;
my @lines = split "\n", $str, -1;
my ($l, $c) = @lines ? (@lines - 1, length $lines[-1]) : (0, 0);
if ($l) { $line += $l; $col = 1 + $c }
else { $col += $c }
($line, $col)
}
}</syntaxhighlight>
 
{{out|case=test case 3}}
<pre>
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input
</pre>
 
===Alternate Perl Solution===
Tested on perl v5.26.1
<syntaxhighlight lang="perl">#!/usr/bin/perl
 
use strict; # lex.pl - source to tokens
use warnings; # http://www.rosettacode.org/wiki/Compiler/lexical_analyzer
no warnings qw(qw);
 
my %keywords = map { $_, "Keyword_$_" } qw( while print if else putc );
my %tokens = qw[ ; Semicolon ( LeftParen ) RightParen { LeftBrace } RightBrace
+ Op_add - Op_subtract * Op_multiply % Op_mod = Op_assign >= Op_greaterequal
!= Op_notequal == Op_equal ! Op_not < Op_less <= Op_lessequal > Op_greater
, Comma && Op_and || Op_or ];
 
local $_ = join '', <>;
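# Each alternative below ends in a (?{ ... }) block whose value is left in
# $^R: a token description, or undef for whitespace and comments. The /gcx
# match loop walks the input, and the line and column are recovered from
# the prematch string $` after each hit.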
 
while( /\G (?|
\s+ (?{ undef })
| \d+[_a-zA-Z]\w* (?{ die "invalid mixed number $&\n" })
| \d+ (?{ "Integer $&" })
| \w+ (?{ $keywords{$&} || "Identifier $&" })
| ( [-;(){}+*%,] | [=!<>]=? | && | \|\| )
(?{ $tokens{$1} })
| \/ (?{ 'Op_divide' }) (?: \* (?: [\s\S]*?\*\/ (?{ undef }) |
(?{ die "End-of-file in comment\n" }) ) )?
| "[^"\n]*" (?{ "String $&" })
| " (?{ die "unterminated string\n" })
| '' (?{ die "empty character constant\n" })
| '([^\n\\])' (?{ 'Integer ' . ord $1 })
| '\\n' (?{ 'Integer 10' })
| '\\\\' (?{ 'Integer 92' })
| ' (?{ die "unterminated or bad character constant\n" }) #'
| . (?{ die "invalid character $&\n" })
) /gcx )
{
defined $^R and printf "%5d %7d %s\n",
1 + $` =~ tr/\n//, 1 + length $` =~ s/.*\n//sr, $^R;
}
printf "%5d %7d %s\n", 1 + tr/\n//, 1, 'End_of_input';</syntaxhighlight>
 
=={{header|Phix}}==
Deviates from the task requirements in that it is written in modular form, so that the output from
one stage can be fed directly into the next rather than being re-loaded from a human-readable
form. If required, demo\rosetta\Compiler\extra.e (below) contains some code that achieves the latter.
Code to print the human-readable forms is likewise kept separate from any re-usable parts.
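As a rough illustration of that modular arrangement (sketched below in OCaml rather than Phix, with entirely hypothetical stage names and types; the real interfaces live in the files named above), each stage simply hands its in-memory result to the next rather than printing it and re-reading it:
<syntaxhighlight lang="ocaml">
(* Hypothetical stage signatures only; illustrative, not part of the Phix
   sources that follow. *)
type token = { line : int; col : int; name : string; value : string option }
type ast = Leaf of token | Node of string * ast list

(* Each stage consumes the previous stage's in-memory result directly, so
   nothing needs to be re-read from the human-readable token listing. *)
let compile ~(lex : string -> token list)
            ~(parse : token list -> ast)
            ~(codegen : ast -> string)
            (source : string) : string =
  source |> lex |> parse |> codegen
</syntaxhighlight>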
<!--<syntaxhighlight lang="phix">(phixonline)-->
<span style="color: #000080;font-style:italic;">--
-- demo\rosetta\Compiler\core.e
-- ============================
--
-- Standard declarations and routines used by lex.exw, parse.exw, cgen.exw, and interp.exw
-- (included in distribution as above, which contains some additional sanity checks)
--</span>
<span style="color: #008080;">with</span> <span style="color: #008080;">javascript_semantics</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">constant</span> <span style="color: #000000;">EOF</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">STDIN</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">STDOUT</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">1</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">enum</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">=</span><span style="color: #000000;">0</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">UNARY</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">BINARY</span><span style="color: #0000FF;">=</span><span style="color: #000000;">2</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">type</span> <span style="color: #000000;">nary</span><span style="color: #0000FF;">(</span><span style="color: #004080;">integer</span> <span style="color: #000000;">n</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">return</span> <span style="color: #000000;">n</span><span style="color: #0000FF;">=</span><span style="color: #000000;">NONE</span> <span style="color: #008080;">or</span> <span style="color: #000000;">n</span><span style="color: #0000FF;">=</span><span style="color: #000000;">UNARY</span> <span style="color: #008080;">or</span> <span style="color: #000000;">n</span><span style="color: #0000FF;">=</span><span style="color: #000000;">BINARY</span> <span style="color: #008080;">end</span> <span style="color: #008080;">type</span>
<span style="color: #008080;">global</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">tkNames</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span> <span style="color: #000080;font-style:italic;">-- eg/ie {"Op_multiply","Op_divide",..}</span>
<span style="color: #008080;">global</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">precedences</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span>
<span style="color: #008080;">global</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">narys</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span> <span style="color: #000080;font-style:italic;">-- NONE/UNARY/BINARY</span>
<span style="color: #008080;">global</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">operators</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span> <span style="color: #000080;font-style:italic;">-- eg/ie {"*","/","+","-","&lt;","&lt;=",..}</span>
<span style="color: #008080;">global</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">opcodes</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span> <span style="color: #000080;font-style:italic;">-- idx to tkNames, matching operators</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">constant</span> <span style="color: #000000;">KEYWORDS</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">new_dict</span><span style="color: #0000FF;">()</span> <span style="color: #000080;font-style:italic;">-- eg/ie {"if"=&gt;idx to tkNames}</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">enum</span> <span style="color: #000000;">OPERATOR</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">DIGIT</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">LETTER</span> <span style="color: #000080;font-style:italic;">-- character classes</span>
<span style="color: #008080;">global</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">charmap</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">repeat</span><span style="color: #0000FF;">(</span><span style="color: #000000;">0</span><span style="color: #0000FF;">,</span><span style="color: #000000;">255</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #008000;">'0'</span><span style="color: #0000FF;">..</span><span style="color: #008000;">'9'</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">DIGIT</span>
<span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #008000;">'A'</span><span style="color: #0000FF;">..</span><span style="color: #008000;">'Z'</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">LETTER</span>
<span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #008000;">'a'</span><span style="color: #0000FF;">..</span><span style="color: #008000;">'z'</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">LETTER</span>
<span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #008000;">'_'</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">LETTER</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">nary</span> <span style="color: #000000;">n</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">integer</span> <span style="color: #000000;">precedence</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">tkNames</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">tkNames</span><span style="color: #0000FF;">,</span><span style="color: #000000;">s</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">narys</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">narys</span><span style="color: #0000FF;">,</span><span style="color: #000000;">n</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">precedences</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">precedences</span><span style="color: #0000FF;">,</span><span style="color: #000000;">precedence</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">return</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">tkNames</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">string</span> <span style="color: #000000;">op</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">nary</span> <span style="color: #000000;">n</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">integer</span> <span style="color: #000000;">precedence</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #000000;">s</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">n</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">precedence</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">operators</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">operators</span><span style="color: #0000FF;">,</span><span style="color: #000000;">op</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">opcodes</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">opcodes</span><span style="color: #0000FF;">,</span><span style="color: #000000;">res</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">op</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span>
<span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #000000;">op</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">OPERATOR</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">res</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">tkKw</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">string</span> <span style="color: #000000;">keyword</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #000000;">s</span><span style="color: #0000FF;">)</span>
<span style="color: #7060A8;">putd</span><span style="color: #0000FF;">(</span><span style="color: #000000;">keyword</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">res</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">KEYWORDS</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">res</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">constant</span>
<span style="color: #000000;">tk_EOI</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"End_of_input"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--1</span>
<span style="color: #000000;">tk_mul</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_multiply"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"*"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">13</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--2</span>
<span style="color: #000000;">tk_div</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_divide"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"/"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">13</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--3</span>
<span style="color: #000000;">tk_mod</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_mod"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"%"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">13</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--4</span>
<span style="color: #000000;">tk_add</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_add"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"+"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">12</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--5</span>
<span style="color: #000000;">tk_sub</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_subtract"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"-"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">12</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--6</span>
<span style="color: #000000;">tk_neg</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_negate"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">UNARY</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">14</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--7</span>
<span style="color: #000000;">tk_not</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_not"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"!"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">UNARY</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">14</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--8</span>
<span style="color: #000000;">tk_lt</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_less"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"&lt;"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">10</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--9</span>
<span style="color: #000000;">tk_le</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_lessequal"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"&lt;="</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">10</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--10</span>
<span style="color: #000000;">tk_gt</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_greater"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"&gt;"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">10</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--11</span>
<span style="color: #000000;">tk_ge</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_greaterequal"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"&gt;="</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span><span style="color: #000000;">10</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--12</span>
<span style="color: #000000;">tk_eq</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_equal"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"=="</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">9</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--13</span>
<span style="color: #000000;">tk_ne</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_notequal"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"!="</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">9</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--14</span>
<span style="color: #000000;">tk_assign</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_assign"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"="</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--15</span>
<span style="color: #000000;">tk_and</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_and"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"&&"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">5</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--16</span>
<span style="color: #000000;">tk_or</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Op_or"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"||"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">BINARY</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">4</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--17</span>
<span style="color: #000000;">tk_if</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkKw</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Keyword_if"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"if"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--18</span>
<span style="color: #000000;">tk_else</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkKw</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Keyword_else"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"else"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--19</span>
<span style="color: #000000;">tk_while</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkKw</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Keyword_while"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"while"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--20</span>
<span style="color: #000000;">tk_print</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkKw</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Keyword_print"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"print"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--21</span>
<span style="color: #000000;">tk_putc</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkKw</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Keyword_putc"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"putc"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--22</span>
<span style="color: #000000;">tk_LeftParen</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"LeftParen"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"("</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--23</span>
<span style="color: #000000;">tk_RightParen</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"RightParen"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">")"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--24</span>
<span style="color: #000000;">tk_LeftBrace</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"LeftBrace"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"{"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--25</span>
<span style="color: #000000;">tk_RightBrace</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"RightBrace"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"}"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--26</span>
<span style="color: #000000;">tk_Semicolon</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Semicolon"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">";"</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--27</span>
<span style="color: #000000;">tk_Comma</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkOp</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Comma"</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">","</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">NONE</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--28</span>
<span style="color: #000000;">tk_Identifier</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Identifier"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--29</span>
<span style="color: #000000;">tk_Integer</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Integer"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--30</span>
<span style="color: #000000;">tk_String</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"String"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--31</span>
<span style="color: #000000;">tk_Sequence</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"Sequence"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--32</span>
<span style="color: #000000;">tk_Prints</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"tk_Prints"</span><span style="color: #0000FF;">),</span> <span style="color: #000080;font-style:italic;">--33</span>
<span style="color: #000000;">tk_Printi</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">tkName</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"tk_Printi"</span><span style="color: #0000FF;">)</span> <span style="color: #000080;font-style:italic;">--34</span>
<span style="color: #008080;">global</span> <span style="color: #004080;">integer</span> <span style="color: #000000;">input_file</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">STDIN</span><span style="color: #0000FF;">,</span>
<span style="color: #000000;">output_file</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">STDOUT</span>
<span style="color: #008080;">type</span> <span style="color: #000000;">strint</span><span style="color: #0000FF;">(</span><span style="color: #004080;">object</span> <span style="color: #000000;">o</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">return</span> <span style="color: #004080;">string</span><span style="color: #0000FF;">(</span><span style="color: #000000;">o</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">or</span> <span style="color: #004080;">integer</span><span style="color: #0000FF;">(</span><span style="color: #000000;">o</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">type</span>
<span style="color: #008080;">global</span> <span style="color: #000000;">strint</span> <span style="color: #000000;">tok_line</span><span style="color: #0000FF;">,</span> <span style="color: #000080;font-style:italic;">-- save of line/col at the start of</span>
<span style="color: #000000;">tok_col</span> <span style="color: #000080;font-style:italic;">-- token/comment, for result/errors</span>
<span style="color: #008080;">global</span> <span style="color: #004080;">object</span> <span style="color: #000000;">oneline</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">""</span>
<span style="color: #008080;">constant</span> <span style="color: #000000;">errfmt</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">"Line %s column %s:\n%s%s"</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">errline</span><span style="color: #0000FF;">()</span>
<span style="color: #000000;">oneline</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">substitute</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">trim</span><span style="color: #0000FF;">(</span><span style="color: #000000;">oneline</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\r\n"</span><span style="color: #0000FF;">),</span><span style="color: #008000;">'\t'</span><span style="color: #0000FF;">,</span><span style="color: #008000;">' '</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">padding</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">repeat</span><span style="color: #0000FF;">(</span><span style="color: #008000;">' '</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok_col</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">return</span> <span style="color: #7060A8;">sprintf</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"%s\n%s^ "</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">oneline</span><span style="color: #0000FF;">,</span><span style="color: #000000;">padding</span><span style="color: #0000FF;">})</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">procedure</span> <span style="color: #000000;">error</span><span style="color: #0000FF;">(</span><span style="color: #004080;">sequence</span> <span style="color: #000000;">msg</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">args</span><span style="color: #0000FF;">={})</span>
<span style="color: #008080;">if</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">args</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">msg</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">sprintf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">msg</span><span style="color: #0000FF;">,</span><span style="color: #000000;">args</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">el</span> <span style="color: #0000FF;">=</span> <span style="color: #008080;">iff</span><span style="color: #0000FF;">(</span><span style="color: #004080;">atom</span><span style="color: #0000FF;">(</span><span style="color: #000000;">oneline</span><span style="color: #0000FF;">)?</span><span style="color: #008000;">""</span><span style="color: #0000FF;">:</span><span style="color: #000000;">errline</span><span style="color: #0000FF;">())</span>
<span style="color: #008080;">if</span> <span style="color: #004080;">integer</span><span style="color: #0000FF;">(</span><span style="color: #000000;">tok_line</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span> <span style="color: #000000;">tok_line</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">sprintf</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"%d"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok_line</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">if</span> <span style="color: #004080;">integer</span><span style="color: #0000FF;">(</span><span style="color: #000000;">tok_col</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span> <span style="color: #000000;">tok_col</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">sprintf</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"%d"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok_col</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">STDOUT</span><span style="color: #0000FF;">,</span><span style="color: #000000;">errfmt</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">tok_line</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok_col</span><span style="color: #0000FF;">,</span><span style="color: #000000;">el</span><span style="color: #0000FF;">,</span><span style="color: #000000;">msg</span><span style="color: #0000FF;">})</span>
<span style="color: #0000FF;">{}</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">wait_key</span><span style="color: #0000FF;">()</span>
<span style="color: #7060A8;">abort</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">procedure</span>
<span style="color: #008080;">include</span> <span style="color: #000000;">js_io</span><span style="color: #0000FF;">.</span><span style="color: #000000;">e</span> <span style="color: #000080;font-style:italic;">-- fake file i/o for running under pwa/p2js</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">open_file</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">file_name</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">string</span> <span style="color: #000000;">mode</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">fn</span> <span style="color: #0000FF;">=</span> <span style="color: #008080;">iff</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">platform</span><span style="color: #0000FF;">()=</span><span style="color: #004600;">JS</span><span style="color: #0000FF;">?</span><span style="color: #000000;">js_open</span><span style="color: #0000FF;">(</span><span style="color: #000000;">file_name</span><span style="color: #0000FF;">)</span>
<span style="color: #0000FF;">:</span><span style="color: #7060A8;">open</span><span style="color: #0000FF;">(</span><span style="color: #000000;">file_name</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">mode</span><span style="color: #0000FF;">))</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">fn</span><span style="color: #0000FF;"><=</span><span style="color: #000000;">0</span> <span style="color: #008080;">then</span>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">STDOUT</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"Could not open %s"</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">file_name</span><span style="color: #0000FF;">})</span>
<span style="color: #0000FF;">{}</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">wait_key</span><span style="color: #0000FF;">()</span>
<span style="color: #7060A8;">abort</span><span style="color: #0000FF;">(</span><span style="color: #000000;">1</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">fn</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">procedure</span> <span style="color: #000000;">open_files</span><span style="color: #0000FF;">(</span><span style="color: #004080;">sequence</span> <span style="color: #000000;">cl</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">cl</span><span style="color: #0000FF;">)></span><span style="color: #000000;">2</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">input_file</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">open_file</span><span style="color: #0000FF;">(</span><span style="color: #000000;">cl</span><span style="color: #0000FF;">[</span><span style="color: #000000;">3</span><span style="color: #0000FF;">],</span><span style="color: #008000;">"r"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">cl</span><span style="color: #0000FF;">)></span><span style="color: #000000;">3</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">output_file</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">open_file</span><span style="color: #0000FF;">(</span><span style="color: #000000;">cl</span><span style="color: #0000FF;">[</span><span style="color: #000000;">4</span><span style="color: #0000FF;">],</span><span style="color: #008000;">"w"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">procedure</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">procedure</span> <span style="color: #000000;">close_files</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">if</span> <span style="color: #7060A8;">platform</span><span style="color: #0000FF;">()!=</span><span style="color: #004600;">JS</span> <span style="color: #008080;">then</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">input_file</span><span style="color: #0000FF;">!=</span><span style="color: #000000;">STDIN</span> <span style="color: #008080;">then</span> <span style="color: #7060A8;">close</span><span style="color: #0000FF;">(</span><span style="color: #000000;">input_file</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">output_file</span><span style="color: #0000FF;">!=</span><span style="color: #000000;">STDOUT</span> <span style="color: #008080;">then</span> <span style="color: #7060A8;">close</span><span style="color: #0000FF;">(</span><span style="color: #000000;">output_file</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">procedure</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">function</span> <span style="color: #000000;">enquote</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">return</span> <span style="color: #7060A8;">sprintf</span><span style="color: #0000FF;">(</span><span style="color: #008000;">`"%s"`</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">substitute</span><span style="color: #0000FF;">(</span><span style="color: #000000;">s</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\\n"</span><span style="color: #0000FF;">))</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">function</span> <span style="color: #000000;">unquote</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">[</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]!=</span><span style="color: #008000;">'\"'</span> <span style="color: #008080;">then</span> <span style="color: #0000FF;">?</span><span style="color: #000000;">9</span><span style="color: #0000FF;">/</span><span style="color: #000000;">0</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">[$]!=</span><span style="color: #008000;">'\"'</span> <span style="color: #008080;">then</span> <span style="color: #0000FF;">?</span><span style="color: #000000;">9</span><span style="color: #0000FF;">/</span><span style="color: #000000;">0</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000000;">s</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">substitute</span><span style="color: #0000FF;">(</span><span style="color: #000000;">s</span><span style="color: #0000FF;">[</span><span style="color: #000000;">2</span><span style="color: #0000FF;">..-</span><span style="color: #000000;">2</span><span style="color: #0000FF;">],</span><span style="color: #008000;">"\\n"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">s</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<!--</syntaxhighlight>-->
For running under pwa/p2js, we also have a "fake file i/o" component:
<!--<syntaxhighlight lang="phix">(phixonline)-->
<span style="color: #000080;font-style:italic;">--
-- demo\rosetta\Compiler\js_io.e
-- =============================
--
-- Fake file i/o for running under pwa/p2js in a browser
-- Does not cover the human readable reload parts of extra.e
--</span>
<span style="color: #008080;">with</span> <span style="color: #008080;">javascript_semantics</span>
<span style="color: #004080;">sequence</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">known_files</span><span style="color: #0000FF;">,</span><span style="color: #000000;">kfc</span><span style="color: #0000FF;">}</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">columnize</span><span style="color: #0000FF;">({</span>
<span style="color: #0000FF;">{</span><span style="color: #008000;">"test3.c"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"""
/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
*/
/* Print */ print /* Sub */ -
/* Putc */ putc /* Lss */ &lt;
/* If */ if /* Gtr */ &gt;
/* Else */ else /* Leq */ &lt;=
/* While */ while /* Geq */ &gt;=
/* Lbrace */ { /* Eq */ ==
/* Rbrace */ } /* Neq */ !=
/* Lparen */ ( /* And */ &&
/* Rparen */ ) /* Or */ ||
/* Uminus */ - /* Semi */ ;
/* Not */ ! /* Comma */ ,
/* Mul */ * /* Assign */ =
/* Div */ / /* Integer */ 42
/* Mod */ % /* String */ "String literal"
/* Add */ + /* Ident */ variable_name
/* character literal */ '\n'
/* character literal **/ '\\'
/* character literal */ ' '
"""</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)},</span>
<span style="color: #0000FF;">{</span><span style="color: #008000;">"test4.c"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"""
/*** test printing, embedded \n and comments with lots of '*' ***/
print(42);
print("\nHello World\nGood Bye\nok\n");
print("Print a slash n - \\n.\n");
"""</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)},</span>
<span style="color: #0000FF;">{</span><span style="color: #008000;">"primes.c"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"""
/*
Simple prime number generator
*/
count = 1;
n = 1;
limit = 100;
while (n &lt; limit) {
k=3;
p=1;
n=n+2;
while ((k*k&lt;=n) && (p)) {
p=n/k*k!=n;
k=k+2;
}
if (p) {
print(n, " is prime\n");
count = count + 1;
}
}
print("Total primes found: ", count, "\n");
"""</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)},</span>
<span style="color: #0000FF;">{</span><span style="color: #008000;">"gcd.c"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"""
/* Compute the gcd of 1071, 1029: 21 */
a = 1071;
b = 1029;
while (b != 0) {
new_a = b;
b = a % b;
a = new_a;
}
print(a);
"""</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)},</span>
<span style="color: #0000FF;">{</span><span style="color: #008000;">"Header.h"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"""
#define area(h, w) h * w
"""</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)},</span>
<span style="color: #0000FF;">{</span><span style="color: #008000;">"Source.t"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"""
#include "Header.h"
#define width 5
#define height 6
area = #area(height, width)#;
"""</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)}})</span>
<span style="color: #004080;">sequence</span> <span style="color: #000000;">linenos</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">repeat</span><span style="color: #0000FF;">(-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">known_files</span><span style="color: #0000FF;">))</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">function</span> <span style="color: #000000;">js_open</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">filename</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">fn</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">filename</span><span style="color: #0000FF;">,</span><span style="color: #000000;">known_files</span><span style="color: #0000FF;">)</span>
<span style="color: #7060A8;">assert</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">!=</span><span style="color: #000000;">0</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">linenos</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">fn</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">function</span> <span style="color: #000000;">js_gets</span><span style="color: #0000FF;">(</span><span style="color: #004080;">integer</span> <span style="color: #000000;">fn</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">lineno</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">linenos</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">]+</span><span style="color: #000000;">1</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">lineno</span><span style="color: #0000FF;"><=</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">kfc</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">])</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">linenos</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">lineno</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">kfc</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">][</span><span style="color: #000000;">lineno</span><span style="color: #0000FF;">]</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">EOF</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<!--</syntaxhighlight>-->
The main lexer is also written to be reusable by later stages.
<!--<syntaxhighlight lang="phix">(phixonline)-->
<span style="color: #000080;font-style:italic;">--
-- demo\rosetta\Compiler\lex.e
-- ===========================
--
-- The reusable part of lex.exw
-- This is only kept separate from core.e for consistency with later modules.</span>
<span style="color: #008080;">with</span> <span style="color: #008080;">javascript_semantics</span>
<span style="color: #008080;">include</span> <span style="color: #000000;">core</span><span style="color: #0000FF;">.</span><span style="color: #000000;">e</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">' '</span><span style="color: #0000FF;">,</span>
<span style="color: #000000;">line</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span><span style="color: #0000FF;">,</span>
<span style="color: #000000;">col</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span>
<span style="color: #008080;">procedure</span> <span style="color: #000000;">eof</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">error</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"%s in %s literal"</span><span style="color: #0000FF;">,{</span><span style="color: #008080;">iff</span><span style="color: #0000FF;">(</span><span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #000000;">EOF</span><span style="color: #0000FF;">?</span><span style="color: #008000;">"EOF"</span><span style="color: #0000FF;">:</span><span style="color: #008000;">"EOL"</span><span style="color: #0000FF;">),</span><span style="color: #000000;">s</span><span style="color: #0000FF;">})</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">procedure</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">while</span> <span style="color: #000000;">1</span> <span style="color: #008080;">do</span>
<span style="color: #000000;">col</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">oneline</span><span style="color: #0000FF;">=</span><span style="color: #000000;">EOF</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">EOF</span>
<span style="color: #008080;">exit</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">col</span><span style="color: #0000FF;">></span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">oneline</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">line</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span>
<span style="color: #000000;">col</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span>
<span style="color: #000000;">oneline</span> <span style="color: #0000FF;">=</span> <span style="color: #008080;">iff</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">platform</span><span style="color: #0000FF;">()=</span><span style="color: #004600;">JS</span><span style="color: #0000FF;">?</span><span style="color: #000000;">js_gets</span><span style="color: #0000FF;">(</span><span style="color: #000000;">input_file</span><span style="color: #0000FF;">)</span>
<span style="color: #0000FF;">:</span><span style="color: #7060A8;">gets</span><span style="color: #0000FF;">(</span><span style="color: #000000;">input_file</span><span style="color: #0000FF;">))</span>
<span style="color: #008080;">else</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">oneline</span><span style="color: #0000FF;">[</span><span style="color: #000000;">col</span><span style="color: #0000FF;">]</span>
<span style="color: #008080;">exit</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">while</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">ch</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #000080;font-style:italic;">-- for pwa/p2js (JavaScript *really* dislikes tabs in strings):
--constant whitespace = " \t\r\n\x0B\xA0"</span>
<span style="color: #008080;">constant</span> <span style="color: #000000;">whitespace</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{</span><span style="color: #008000;">' '</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'\t'</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'\r'</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'\n'</span><span style="color: #0000FF;">,</span><span style="color: #000000;">#0B</span><span style="color: #0000FF;">,</span><span style="color: #000000;">#A0</span><span style="color: #0000FF;">}</span>
<span style="color: #000080;font-style:italic;">-- (0x0B is Vertical Tab, 0xA0 is Non-breaking space)</span>
<span style="color: #008080;">procedure</span> <span style="color: #000000;">skipspacesandcomments</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">while</span> <span style="color: #000000;">1</span> <span style="color: #008080;">do</span>
<span style="color: #008080;">if</span> <span style="color: #008080;">not</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">ch</span><span style="color: #0000FF;">,</span><span style="color: #000000;">whitespace</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #008000;">'/'</span> <span style="color: #008080;">and</span> <span style="color: #000000;">col</span><span style="color: #0000FF;"><</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">oneline</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">and</span> <span style="color: #000000;">oneline</span><span style="color: #0000FF;">[</span><span style="color: #000000;">col</span><span style="color: #0000FF;">+</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]=</span><span style="color: #008000;">'*'</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">tok_line</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">line</span> <span style="color: #000080;font-style:italic;">-- (in case of EOF error)</span>
<span style="color: #000000;">tok_col</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">col</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span> <span style="color: #000080;font-style:italic;">-- (can be EOF)</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span> <span style="color: #000080;font-style:italic;">-- ( "" )</span>
<span style="color: #008080;">while</span> <span style="color: #000000;">1</span> <span style="color: #008080;">do</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #008000;">'*'</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #008000;">'/'</span> <span style="color: #008080;">then</span> <span style="color: #008080;">exit</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #000000;">EOF</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">error</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"EOF in comment"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">else</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">while</span>
<span style="color: #008080;">else</span>
<span style="color: #008080;">exit</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">while</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">procedure</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">escape_char</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span> <span style="color: #000080;font-style:italic;">-- (discard the '\\')</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #008000;">'n'</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">'\n'</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #008000;">'\\'</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">'\\'</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #000000;">EOF</span>
<span style="color: #008080;">or</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #008000;">'\n'</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">eof</span><span style="color: #0000FF;">(</span><span style="color: #000000;">s</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">else</span>
<span style="color: #000000;">error</span><span style="color: #0000FF;">(</span><span style="color: #008000;">`unknown escape sequence \%c`</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">ch</span><span style="color: #0000FF;">})</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">ch</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">char_lit</span><span style="color: #0000FF;">()</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">startch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">,</span>
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span> <span style="color: #000080;font-style:italic;">-- (skip opening quote, save res)</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #000000;">startch</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">error</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"empty character constant"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #008000;">'\\'</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">res</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">escape_char</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"character"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #000000;">EOF</span>
<span style="color: #008080;">or</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #008000;">'\n'</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">eof</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"character"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">!=</span><span style="color: #000000;">startch</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">error</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"multi-character constant"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">return</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">tk_Integer</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">res</span><span style="color: #0000FF;">}</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">string_lit</span><span style="color: #0000FF;">()</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">startch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">ch</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">text</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">""</span>
<span style="color: #008080;">while</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()!=</span><span style="color: #000000;">startch</span> <span style="color: #008080;">do</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #000000;">EOF</span>
<span style="color: #008080;">or</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #008000;">'\n'</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">eof</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"string"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #008000;">'\\'</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">escape_char</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"string"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000000;">text</span> <span style="color: #0000FF;">&=</span> <span style="color: #000000;">ch</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">while</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">return</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">tk_String</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">text</span><span style="color: #0000FF;">}</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">get_op</span><span style="color: #0000FF;">()</span>
<span style="color: #000080;font-style:italic;">-- sequence operator = {ch}</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">operator</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">""</span><span style="color: #0000FF;">&</span><span style="color: #000000;">ch</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">while</span> <span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #000000;">ch</span><span style="color: #0000FF;">]=</span><span style="color: #000000;">OPERATOR</span>
<span style="color: #008080;">and</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">operator</span><span style="color: #0000FF;">&</span><span style="color: #000000;">ch</span><span style="color: #0000FF;">,</span><span style="color: #000000;">operators</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span>
<span style="color: #000080;font-style:italic;">-- (^ ie/eg merge "&gt;=", but not ");")</span>
<span style="color: #000000;">operator</span> <span style="color: #0000FF;">&=</span> <span style="color: #000000;">ch</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">while</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">k</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">operator</span><span style="color: #0000FF;">,</span><span style="color: #000000;">operators</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">k</span><span style="color: #0000FF;">=</span><span style="color: #000000;">0</span> <span style="color: #008080;">then</span> <span style="color: #000000;">error</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"unknown operator"</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">opcodes</span><span style="color: #0000FF;">[</span><span style="color: #000000;">k</span><span style="color: #0000FF;">],</span> <span style="color: #000000;">0</span><span style="color: #0000FF;">}</span> <span style="color: #000080;font-style:italic;">-- (0 unused)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">get_int</span><span style="color: #0000FF;">()</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">i</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span>
<span style="color: #008080;">while</span> <span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #000000;">ch</span><span style="color: #0000FF;">]=</span><span style="color: #000000;">DIGIT</span> <span style="color: #008080;">do</span>
<span style="color: #000000;">i</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">*</span><span style="color: #000000;">10</span> <span style="color: #0000FF;">+</span> <span style="color: #0000FF;">(</span><span style="color: #000000;">ch</span><span style="color: #0000FF;">-</span><span style="color: #008000;">'0'</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">while</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #000000;">ch</span><span style="color: #0000FF;">]=</span><span style="color: #000000;">LETTER</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">error</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"invalid number"</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">tk_Integer</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">}</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">get_ident</span><span style="color: #0000FF;">()</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">text</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">""</span>
<span style="color: #008080;">while</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #000000;">ch</span><span style="color: #0000FF;">],{</span><span style="color: #000000;">LETTER</span><span style="color: #0000FF;">,</span><span style="color: #000000;">DIGIT</span><span style="color: #0000FF;">})</span> <span style="color: #008080;">do</span>
<span style="color: #000000;">text</span> <span style="color: #0000FF;">&=</span> <span style="color: #000000;">ch</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">while</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">keyword</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">getd</span><span style="color: #0000FF;">(</span><span style="color: #000000;">text</span><span style="color: #0000FF;">,</span><span style="color: #000000;">KEYWORDS</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">keyword</span><span style="color: #0000FF;">!=</span><span style="color: #004600;">NULL</span> <span style="color: #008080;">then</span>
<span style="color: #008080;">return</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">keyword</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">0</span><span style="color: #0000FF;">}</span> <span style="color: #000080;font-style:italic;">-- (0 unused)</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">tk_Identifier</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">text</span><span style="color: #0000FF;">}</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">get_token</span><span style="color: #0000FF;">()</span>
<span style="color: #000000;">skipspacesandcomments</span><span style="color: #0000FF;">()</span>
<span style="color: #000000;">tok_line</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">line</span>
<span style="color: #000000;">tok_col</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">col</span>
<span style="color: #008080;">switch</span> <span style="color: #000000;">ch</span> <span style="color: #008080;">do</span>
<span style="color: #008080;">case</span> <span style="color: #000000;">EOF</span> <span style="color: #008080;">then</span> <span style="color: #008080;">return</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">tk_EOI</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">0</span><span style="color: #0000FF;">}</span> <span style="color: #000080;font-style:italic;">-- (0 unused)</span>
<span style="color: #008080;">case</span> <span style="color: #008000;"><nowiki>'\''</nowiki></span> <span style="color: #008080;">then</span> <span style="color: #008080;">return</span> <span style="color: #000000;">char_lit</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">case</span> <span style="color: #008000;">'"'</span> <span style="color: #008080;">then</span> <span style="color: #008080;">return</span> <span style="color: #000000;">string_lit</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">else</span>
<span style="color: #008080;">switch</span> <span style="color: #000000;">charmap</span><span style="color: #0000FF;">[</span><span style="color: #000000;">ch</span><span style="color: #0000FF;">]</span> <span style="color: #008080;">do</span>
<span style="color: #008080;">case</span> <span style="color: #000000;">OPERATOR</span> <span style="color: #008080;">then</span> <span style="color: #008080;">return</span> <span style="color: #000000;">get_op</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">case</span> <span style="color: #000000;">DIGIT</span> <span style="color: #008080;">then</span> <span style="color: #008080;">return</span> <span style="color: #000000;">get_int</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">case</span> <span style="color: #000000;">LETTER</span> <span style="color: #008080;">then</span> <span style="color: #008080;">return</span> <span style="color: #000000;">get_ident</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">else</span> <span style="color: #000000;">error</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"unrecognized character: (%d)"</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">ch</span><span style="color: #0000FF;">})</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">switch</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">switch</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">function</span> <span style="color: #000000;">lex</span><span style="color: #0000FF;">()</span>
<span style="color: #004080;">sequence</span> <span style="color: #000000;">toks</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">tok</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span>
<span style="color: #004080;">object</span> <span style="color: #000000;">v</span>
<span style="color: #008080;">while</span> <span style="color: #000000;">tok</span><span style="color: #0000FF;">!=</span><span style="color: #000000;">tk_EOI</span> <span style="color: #008080;">do</span>
<span style="color: #0000FF;">{</span><span style="color: #000000;">tok</span><span style="color: #0000FF;">,</span><span style="color: #000000;">v</span><span style="color: #0000FF;">}</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">get_token</span><span style="color: #0000FF;">()</span>
<span style="color: #000000;">toks</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">toks</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">tok_line</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok_col</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok</span><span style="color: #0000FF;">,</span><span style="color: #000000;">v</span><span style="color: #0000FF;">})</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">while</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">toks</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<!--</syntaxhighlight>-->
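To make the reuse point concrete, a later stage only needs to include lex.e, open the input and call lex() once. The following minimal sketch (essentially a stripped-down copy of the lex.exw driver shown further below, given here purely for orientation) unpacks the {line, column, token, value} tuples that lex() returns:
<syntaxhighlight lang="phix">
include lex.e
open_files(command_line())      -- cl[3]/cl[4], if present, name the input/output files, as in lex.exw
sequence toks = lex()           -- one {line, column, token, value} tuple per token
integer tok
object v
for i=1 to length(toks) do
    {tok_line,tok_col,tok,v} = toks[i]
    -- ... hand tok (and, for Integer/String/Identifier, v) on to the next stage ...
end for
close_files()
</syntaxhighlight>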
Optional: routines that let each (later) stage re-read the human-readable output of the previous one, so that the stages can be connected with pipes:
<!--<syntaxhighlight lang="phix">-->
<span style="color: #000080;font-style:italic;">--
-- demo\rosetta\Compiler\extra.e
-- =============================
--
-- Routines to reload human-readable files (deviation from task requirement)
--</span>
<span style="color: #008080;">without</span> <span style="color: #008080;">js</span> <span style="color: #000080;font-style:italic;">-- (file i/o)
--The following can be used to load .lex files, as created by lex.exw:
-- (in place of the existing get_tok() in parse.e)</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">get_tok</span><span style="color: #0000FF;">()</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">line</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">trim</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">gets</span><span style="color: #0000FF;">(</span><span style="color: #000000;">input_file</span><span style="color: #0000FF;">))</span>
<span style="color: #004080;">sequence</span> <span style="color: #000000;">tok</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #000000;">line</span><span style="color: #0000FF;">,</span><span style="color: #008000;">' '</span><span style="color: #0000FF;">,</span><span style="color: #000000;">limit</span><span style="color: #0000FF;">:=</span><span style="color: #000000;">4</span><span style="color: #0000FF;">,</span><span style="color: #000000;">no_empty</span><span style="color: #0000FF;">:=</span><span style="color: #000000;">1</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">k</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">tok</span><span style="color: #0000FF;">[</span><span style="color: #000000;">3</span><span style="color: #0000FF;">],</span><span style="color: #000000;">tkNames</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">k</span><span style="color: #0000FF;">=</span><span style="color: #000000;">0</span> <span style="color: #008080;">then</span> <span style="color: #0000FF;">?</span><span style="color: #000000;">9</span><span style="color: #0000FF;">/</span><span style="color: #000000;">0</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000000;">tok</span><span style="color: #0000FF;">[</span><span style="color: #000000;">3</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">k</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">tok</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #000080;font-style:italic;">--The following can be used to load .ast files, as created by parse.exw:
-- (in place of the existing lex()/parse() pairs in cgen.exw and interp.exw)</span>
<span style="color: #008080;">function</span> <span style="color: #000000;">load_ast</span><span style="color: #0000FF;">()</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">line</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">trim</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">gets</span><span style="color: #0000FF;">(</span><span style="color: #000000;">input_file</span><span style="color: #0000FF;">))</span>
<span style="color: #000080;font-style:italic;">-- Each line has at least one token</span>
<span style="color: #004080;">sequence</span> <span style="color: #000000;">node</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #000000;">line</span><span style="color: #0000FF;">,</span><span style="color: #008000;">' '</span><span style="color: #0000FF;">,</span><span style="color: #000000;">limit</span><span style="color: #0000FF;">:=</span><span style="color: #000000;">2</span><span style="color: #0000FF;">,</span><span style="color: #000000;">no_empty</span><span style="color: #0000FF;">:=</span><span style="color: #000000;">1</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">node_type</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">node</span><span style="color: #0000FF;">[</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">node_type</span> <span style="color: #0000FF;">==</span> <span style="color: #008000;">";"</span> <span style="color: #008080;">then</span> <span style="color: #000080;font-style:italic;">-- a terminal node</span>
<span style="color: #008080;">return</span> <span style="color: #004600;">NULL</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">n_type</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">node_type</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tkNames</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">n_type</span><span style="color: #0000FF;">=</span><span style="color: #000000;">0</span> <span style="color: #008080;">then</span> <span style="color: #0000FF;">?</span><span style="color: #000000;">9</span><span style="color: #0000FF;">/</span><span style="color: #000000;">0</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #000080;font-style:italic;">-- A line with two tokens is a leaf node
-- Leaf nodes are: Identifier, Integer, String
-- The 2nd token is the value</span>
<span style="color: #008080;">if</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">node</span><span style="color: #0000FF;">)></span><span style="color: #000000;">1</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">node</span><span style="color: #0000FF;">[</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">n_type</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">n_type</span><span style="color: #0000FF;">=</span><span style="color: #000000;">tk_Integer</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">node</span><span style="color: #0000FF;">[</span><span style="color: #000000;">2</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">to_integer</span><span style="color: #0000FF;">(</span><span style="color: #000000;">node</span><span style="color: #0000FF;">[</span><span style="color: #000000;">2</span><span style="color: #0000FF;">])</span>
<span style="color: #008080;">elsif</span> <span style="color: #000000;">n_type</span><span style="color: #0000FF;">=</span><span style="color: #000000;">tk_String</span> <span style="color: #008080;">then</span>
<span style="color: #000000;">node</span><span style="color: #0000FF;">[</span><span style="color: #000000;">2</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">unquote</span><span style="color: #0000FF;">(</span><span style="color: #000000;">node</span><span style="color: #0000FF;">[</span><span style="color: #000000;">2</span><span style="color: #0000FF;">])</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">node</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #004080;">object</span> <span style="color: #000000;">left</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">load_ast</span><span style="color: #0000FF;">()</span>
<span style="color: #004080;">object</span> <span style="color: #000000;">right</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">load_ast</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">return</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">n_type</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">left</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">right</span><span style="color: #0000FF;">}</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<!--</syntaxhighlight>-->
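To make the round trip concrete: the .lex files that get_tok() reads back are simply the lines that lex.exw writes, in the format shown under the sample output at the end of this entry, for instance
<pre>
3 7 String "\nHello World\nGood Bye\nok\n"
</pre>
After trim() removes the trailing newline, split() with limit:=4 and no_empty:=1 yields the line number, the column, the token name, and any remaining text as the value; the value of a String literal can itself contain spaces, which is precisely why the split is limited to four parts. The token name is then replaced by its position in tkNames, i.e. its numeric token id.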
Finally, a simple test driver for the specific task:
<!--<syntaxhighlight lang="phix">(phixonline)-->
<span style="color: #000080;font-style:italic;">--
-- demo\rosetta\Compiler\lex.exw
-- =============================
--</span>
<span style="color: #008080;">with</span> <span style="color: #008080;">javascript_semantics</span>
<span style="color: #008080;">include</span> <span style="color: #000000;">lex</span><span style="color: #0000FF;">.</span><span style="color: #000000;">e</span>
<span style="color: #008080;">procedure</span> <span style="color: #000000;">main</span><span style="color: #0000FF;">(</span><span style="color: #004080;">sequence</span> <span style="color: #000000;">cl</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">open_files</span><span style="color: #0000FF;">(</span><span style="color: #000000;">cl</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">sequence</span> <span style="color: #000000;">toks</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">lex</span><span style="color: #0000FF;">()</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">tok</span>
<span style="color: #004080;">object</span> <span style="color: #000000;">v</span>
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">toks</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span>
<span style="color: #0000FF;">{</span><span style="color: #000000;">tok_line</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok_col</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok</span><span style="color: #0000FF;">,</span><span style="color: #000000;">v</span><span style="color: #0000FF;">}</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">toks</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]</span>
<span style="color: #008080;">switch</span> <span style="color: #000000;">tok</span> <span style="color: #008080;">do</span>
<span style="color: #008080;">case</span> <span style="color: #000000;">tk_Identifier</span><span style="color: #0000FF;">:</span> <span style="color: #000000;">v</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">sprintf</span><span style="color: #0000FF;">(</span><span style="color: #008000;">" %s"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">v</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">case</span> <span style="color: #000000;">tk_Integer</span><span style="color: #0000FF;">:</span> <span style="color: #000000;">v</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">sprintf</span><span style="color: #0000FF;">(</span><span style="color: #008000;">" %5d"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">v</span><span style="color: #0000FF;">)</span>
<span style="color: #008080;">case</span> <span style="color: #000000;">tk_String</span><span style="color: #0000FF;">:</span> <span style="color: #000000;">v</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">sprintf</span><span style="color: #0000FF;">(</span><span style="color: #008000;">" %s"</span><span style="color: #0000FF;">,</span><span style="color: #000000;">enquote</span><span style="color: #0000FF;">(</span><span style="color: #000000;">v</span><span style="color: #0000FF;">))</span>
<span style="color: #008080;">else</span> <span style="color: #000000;">v</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">""</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">switch</span>
<span style="color: #7060A8;">printf</span><span style="color: #0000FF;">(</span><span style="color: #000000;">output_file</span><span style="color: #0000FF;">,</span> <span style="color: #008000;">"%5d %5d %-10s%s\n"</span><span style="color: #0000FF;">,</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">tok_line</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tok_col</span><span style="color: #0000FF;">,</span><span style="color: #000000;">tkNames</span><span style="color: #0000FF;">[</span><span style="color: #000000;">tok</span><span style="color: #0000FF;">],</span><span style="color: #000000;">v</span><span style="color: #0000FF;">})</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
<span style="color: #000000;">close_files</span><span style="color: #0000FF;">()</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">procedure</span>
<span style="color: #000080;font-style:italic;">--main(command_line())</span>
<span style="color: #000000;">main</span><span style="color: #0000FF;">({</span><span style="color: #000000;">0</span><span style="color: #0000FF;">,</span><span style="color: #000000;">0</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"test4.c"</span><span style="color: #0000FF;">})</span>
<!--</syntaxhighlight>-->
{{out}}
<pre>
2 1 Keyword_print
2 6 LeftParen
2 7 Integer 42
2 9 RightParen
2 10 Semicolon
3 1 Keyword_print
3 6 LeftParen
3 7 String "\nHello World\nGood Bye\nok\n"
3 38 RightParen
3 39 Semicolon
4 1 Keyword_print
4 6 LeftParen
4 7 String "Print a slash n - \n.\n"
4 33 RightParen
4 34 Semicolon
5 1 End_of_input
</pre>
 
=={{header|Prolog}}==
 
<syntaxhighlight lang="prolog">/*
Test harness for the analyzer, not needed if we are actually using the output.
*/
load_file(File, Input) :-
read_file_to_codes(File, Codes, []),
maplist(char_code, Chars, Codes),
atom_chars(Input,Chars).
 
test_file(File) :-
load_file(File, Input),
tester(Input).
 
tester(S) :-
atom_chars(S,Chars),
tokenize(Chars,L),
maplist(print_tok, L),
!.
 
print_tok(L) :-
L =.. [Op,Line,Pos],
format('~d\t~d\t~p~n', [Line,Pos,Op]).
print_tok(string(Value,Line,Pos)) :-
format('~d\t~d\tstring\t\t"~w"~n', [Line,Pos,Value]).
print_tok(identifier(Value,Line,Pos)) :-
format('~d\t~d\tidentifier\t~p~n', [Line,Pos,Value]).
print_tok(integer(Value,Line,Pos)) :-
format('~d\t~d\tinteger\t\t~p~n', [Line,Pos,Value]).
 
 
/*
Tokenize
run the input over a DCG to get out the tokens.
In - a list of chars to tokenize
Tokens = a list of tokens (excluding spaces).
*/
tokenize(In,RelTokens) :-
newline_positions(In,1,NewLines),
tokenize(In,[0|NewLines],1,1,Tokens),
check_for_exceptions(Tokens),
exclude(token_name(space),Tokens,RelTokens).
 
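% tokenize(+Chars, +NewlinePositions, +Pos, +LineNo, -Tokens)
%  consume the input one token at a time; the absolute character position Pos
%  is converted into a column offset from the most recent newline recorded in
%  NewlinePositions, and the line counter is advanced as newlines are passed.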
tokenize([],NewLines,Pos,LineNo,[end_of_input(LineNo,Offset)]) :-
position_offset(NewLines,Pos,Offset).
tokenize(In,NewLines,Pos,LineNo,Out) :-
position_offset(NewLines,Pos,Offset),
phrase(tok(Tok,TokLen,LineNo,Offset),In,T),
(
Tok = [] -> Out = Toks
; Out = [Tok|Toks]
),
Pos1 is Pos + TokLen,
update_line_no(LineNo,NewLines,Pos1,NewLineNo,NewNewLines),
tokenize(T,NewNewLines,Pos1,NewLineNo,Toks).
update_line_no(LNo,[L],_,LNo,[L]).
update_line_no(LNo,[L,Nl|T],Pos,LNo,[L,Nl|T]) :-
Pos =< Nl.
update_line_no(LNo,[_,Nl|T],Pos,LNo2,Nlines) :-
Pos > Nl,
succ(LNo,LNo1),
update_line_no(LNo1,[Nl|T],Pos,LNo2,Nlines).
 
position_offset([Line|_],Pos,Offset) :- Offset is Pos - Line.
token_name(Name,Tok) :- functor(Tok,Name,_).
 
% Get a list of all the newlines and their position in the data
% This is used to create accurate row/column numbers.
newline_positions([],N,[N]).
newline_positions(['\n'|T],N,[N|Nt]) :- succ(N,N1), newline_positions(T,N1,Nt).
newline_positions([C|T],N,Nt) :- dif(C,'\n'), succ(N,N1), newline_positions(T,N1,Nt).
% The tokenizer can tokenize some things that it shouldn't, deal with them here.
check_for_exceptions([]). % all ok
check_for_exceptions([op_divide(L,P),op_multiply(_,_)|_]) :-
format(atom(Error), 'Unclosed comment at line ~d,~d', [L,P]),
throw(Error).
check_for_exceptions([integer(_,L,P),identifier(_,_,_)|_]) :-
format(atom(Error), 'Invalid identifier at line ~d,~d', [L,P]),
throw(Error).
check_for_exceptions([_|T]) :- check_for_exceptions(T).
 
 
/*
A set of helper DCGs for the more complicated token types
*/
:- set_prolog_flag(double_quotes, chars).
 
identifier(I) --> c_types(I,csym).
identifier(['_']) --> ['_'].
identifier([]) --> [].
 
integer_(I,L) --> c_types(N,digit), { number_codes(I,N), length(N,L) }.
 
% get a sequence of characters of the same type (https://www.swi-prolog.org/pldoc/doc_for?object=char_type/2)
c_types([C|T],Type) --> c_type(C,Type), c_types(T,Type).
c_types([C],Type) --> c_type(C,Type).
c_type(C,Type) --> [C],{ char_type(C,Type) }.
 
anything([]) --> [].
anything([A|T]) --> [A], anything(T).
 
string_([]) --> [].
string_([A|T]) --> [A], { dif(A,'\n') }, string_(T).
 
 
/*
The token types are all handled by the tok DCG, order of predicates is important here.
*/
% comment
tok([],CLen,_,_) --> "/*", anything(A), "*/", { length(A,Len), CLen is Len + 4 }.
 
% toks
tok(op_and(L,P),2,L,P) --> "&&".
tok(op_or(L,P),2,L,P) --> "||".
tok(op_lessequal(L,P),2,L,P) --> "<=".
tok(op_greaterequal(L,P),2,L,P) --> ">=".
tok(op_equal(L,P),2,L,P) --> "==".
tok(op_notequal(L,P),2,L,P) --> "!=".
tok(op_assign(L,P),1,L,P) --> "=".
tok(op_multiply(L,P),1,L,P) --> "*".
tok(op_divide(L,P),1,L,P) --> "/".
tok(op_mod(L,P),1,L,P) --> "%".
tok(op_add(L,P),1,L,P) --> "+".
tok(op_subtract(L,P),1,L,P) --> "-".
tok(op_negate(L,P),1,L,P) --> "-".
tok(op_less(L,P),1,L,P) --> "<".
tok(op_greater(L,P),1,L,P) --> ">".
tok(op_not(L,P),1,L,P) --> "!".
 
% symbols
tok(left_paren(L,P),1,L,P) --> "(".
tok(right_paren(L,P),1,L,P) --> ")".
tok(left_brace(L,P),1,L,P) --> "{".
tok(right_brace(L,P),1,L,P) --> "}".
tok(semicolon(L,P),1,L,P) --> ";".
tok(comma(L,P),1,L,P) --> ",".
 
% keywords
tok(keyword_if(L,P),2,L,P) --> "if".
tok(keyword_else(L,P),4,L,P) --> "else".
tok(keyword_while(L,P),5,L,P) --> "while".
tok(keyword_print(L,P),5,L,P) --> "print".
tok(keyword_putc(L,P),4,L,P) --> "putc".
 
% identifier and literals
tok(identifier(I,L,P),Len,L,P) --> c_type(S,csymf), identifier(T), { atom_chars(I,[S|T]), length([S|T],Len) }.
tok(integer(V,L,P),Len,L,P) --> integer_(V,Len).
tok(integer(I,L,P),4,L,P) --> "'\\\\'", { char_code('\\', I) }.
tok(integer(I,L,P),4,L,P) --> "'\\n'", { char_code('\n', I) }.
tok(integer(I,L,P),3,L,P) --> ['\''], [C], ['\''], { dif(C,'\n'), dif(C,'\''), char_code(C,I) }.
tok(string(S,L,P),SLen,L,P) --> ['"'], string_(A),['"'], { atom_chars(S,A), length(A,Len), SLen is Len + 2 }.
 
% spaces
tok(space(L,P),Len,L,P) --> c_types(S,space), { length(S,Len) }.
 
% anything else is an error
tok(_,_,L,P) --> { format(atom(Error), 'Invalid token at line ~d,~d', [L,P]), throw(Error) }.</syntaxhighlight>
{{out}}
<pre>
5 16 keyword_print
5 40 op_subtract
6 16 keyword_putc
6 40 op_less
7 16 keyword_if
7 40 op_greater
8 16 keyword_else
8 40 op_lessequal
9 16 keyword_while
9 40 op_greaterequal
10 16 left_brace
10 40 op_equal
11 16 right_brace
11 40 op_notequal
12 16 left_paren
12 40 op_and
13 16 right_paren
13 40 op_or
14 16 op_subtract
14 40 semicolon
15 16 op_not
15 40 comma
16 16 op_multiply
16 40 op_assign
17 16 op_divide
17 40 integer 42
18 16 op_mod
18 40 string "String literal"
19 16 op_add
19 40 identifier variable_name
20 26 integer 10
21 26 integer 92
22 26 integer 32
22 29 end_of_input
</pre>
 
=={{header|Python}}==
Tested with Python 2.7 and 3.x
<syntaxhighlight lang="python">from __future__ import print_function
import sys
 
# following two must remain in the same order
tk_EOI, tk_Mul, tk_Div, tk_Mod, tk_Add, tk_Sub, tk_Negate, tk_Not, tk_Lss, tk_Leq, tk_Gtr, \
tk_Geq, tk_Eq, tk_Neq, tk_Assign, tk_And, tk_Or, tk_If, tk_Else, tk_While, tk_Print, \
tk_Putc, tk_Lparen, tk_Rparen, tk_Lbrace, tk_Rbrace, tk_Semi, tk_Comma, tk_Ident, \
    tk_Integer, tk_String = range(31)
 
all_syms = ["End_of_input", "Op_multiply", "Op_divide", "Op_mod", "Op_add", "Op_subtract",
"Op_negate", "Op_not", "Op_less", "Op_lessequal", "Op_greater", "Op_greaterequal",
"Op_equal", "Op_notequal", "Op_assign", "Op_and", "Op_or", "Keyword_if",
"Keyword_else", "Keyword_while", "Keyword_print", "Keyword_putc", "LeftParen",
"RightParen", "LeftBrace", "RightBrace", "Semicolon", "Comma", "Identifier",
"Integer", "String"]
 
# single character only symbols
symbols = { '{': tk_Lbrace, '}': tk_Rbrace, '(': tk_Lparen, ')': tk_Rparen, '+': tk_Add, '-': tk_Sub,
            '*': tk_Mul, '%': tk_Mod, ';': tk_Semi, ',': tk_Comma }

key_words = {'if': tk_If, 'else': tk_Else, 'print': tk_Print, 'putc': tk_Putc, 'while': tk_While}
 
the_ch = " " # dummy first char - but it must be a space
n = 10
elif the_ch == '\\':
n = ord('\\')
else:
error(err_line, err_col, "unknown escape sequence \\%c" % (the_ch))
    else:
        n = ord(the_ch)

    if next_ch() != '\'':
error(err_line, err_col, "multi-character constant")
next_ch()
    return tk_Integer, err_line, err_col, n
 
#*** process divide or comments
def div_or_cmt(err_line, err_col):
if next_ch() != '*':
        return tk_Div, err_line, err_col
 
# comment found
next_ch()
while True:
        if the_ch == '*':
            if next_ch() == '/':
                next_ch()
                return gettok()
elif len(the_ch) == 0:
error(err_line, err_col, "EOF in comment")
else:
next_ch()
 
#*** "string"
def string_lit(start, err_line, err_col):
global the_ch
text = ""
 
    while next_ch() != start:
        if len(the_ch) == 0:
            error(err_line, err_col, "EOF while scanning string literal")
if the_ch == '\n':
error(err_line, err_col, "EOL while scanning string literal")
if the_ch == '\\':
next_ch()
if the_ch != 'n':
error(err_line, err_col, "escape sequence unknown \\%c" % the_ch)
the_ch = '\n'
text += the_ch
 
next_ch()
    return tk_String, err_line, err_col, text
 
#*** handle identifiers and integers
def ident_or_int(err_line, err_col):
    is_number = True
    text = ""

    while the_ch.isalnum() or the_ch == '_':
        text += the_ch
        if not the_ch.isdigit():
            is_number = False
        next_ch()

    if len(text) == 0:
        error(err_line, err_col, "ident_or_int: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))

    if text[0].isdigit():
        if not is_number:
error(err_line, err_col, "invalid number: %s" % (text))
n = int(text)
        return tk_Integer, err_line, err_col, n
 
if text in key_words:
return key_words[text], err_line, err_col
 
    return tk_Ident, err_line, err_col, text
 
#*** look ahead for '>=', etc.
def follow(expect, ifyes, ifno, err_line, err_col):
    if the_ch == expect:
        next_ch()
return ifyes, err_line, err_col
 
    if ifno == tk_EOI:
error(err_line, err_col, "follow: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))
 
err_col = the_col
 
    if len(the_ch) == 0:    return tk_EOI, err_line, err_col
elif the_ch == '/': return div_or_cmt(err_line, err_col)
elif the_ch == '\'': return char_lit(err_line, err_col)
    elif the_ch == '<':     return follow('=', tk_Leq, tk_Lss,    err_line, err_col)
    elif the_ch == '>':     return follow('=', tk_Geq, tk_Gtr,    err_line, err_col)
    elif the_ch == '=':     return follow('=', tk_Eq,  tk_Assign, err_line, err_col)
    elif the_ch == '!':     return follow('=', tk_Neq, tk_Not,    err_line, err_col)
    elif the_ch == '&':     return follow('&', tk_And, tk_EOI,    err_line, err_col)
    elif the_ch == '|':     return follow('|', tk_Or,  tk_EOI,    err_line, err_col)
elif the_ch == '"': return string_lit(the_ch, err_line, err_col)
elif the_ch in symbols:
col = t[2]
 
print("line %5d col %5d %-8s14s" % (line, col, all_syms[tok]), end='')
 
        if tok == tk_Integer:  print("   %5d" % (t[3]))
        elif tok == tk_Ident:  print("  %s" % (t[3]))
        elif tok == tk_String: print('  "%s"' % (t[3]))
else: print("")
 
        if tok == tk_EOI:
break</syntaxhighlight>
 
{{out|case=test case 3}}
<b>
<pre>
    5     16 Keyword_print
    5     40 Op_subtract
    6     16 Keyword_putc
    6     40 Op_less
    7     16 Keyword_if
    7     40 Op_greater
    8     16 Keyword_else
    8     40 Op_lessequal
    9     16 Keyword_while
    9     40 Op_greaterequal
   10     16 LeftBrace
   10     40 Op_equal
   11     16 RightBrace
   11     40 Op_notequal
   12     16 LeftParen
   12     40 Op_and
   13     16 RightParen
   13     40 Op_or
   14     16 Op_subtract
   14     40 Semicolon
   15     16 Op_not
   15     40 Comma
   16     16 Op_multiply
   16     40 Op_assign
   17     16 Op_divide
   17     40 Integer            42
   18     16 Op_mod
   18     40 String         "String literal"
   19     16 Op_add
   19     40 Identifier     variable_name
   20     26 Integer            10
   21     26 Integer            92
   22     26 Integer            32
   23      1 End_of_input
</pre>
</b>
 
=={{header|QB64}}==
Tested with QB64 1.5
<syntaxhighlight lang="vb">dim shared source as string, the_ch as string, tok as string, toktyp as string
dim shared line_n as integer, col_n as integer, text_p as integer, err_line as integer, err_col as integer, errors as integer
 
declare function isalnum&(s as string)
declare function isalpha&(s as string)
declare function isdigit&(s as string)
declare sub divide_or_comment
declare sub error_exit(line_n as integer, col_n as integer, msg as string)
declare sub follow(c as string, typ2 as string, typ1 as string)
declare sub nextch
declare sub nexttok
declare sub read_char_lit
declare sub read_ident
declare sub read_number
declare sub read_string
 
const c_integer = "Integer", c_ident = "Identifier", c_string = "String"
 
dim out_fn as string, out_tok as string
 
if command$(1) = "" then print "Expecting a filename": end
open command$(1) for binary as #1
source = space$(lof(1))
get #1, 1, source
close #1
 
out_fn = command$(2): if out_fn <> "" then open out_fn for output as #1
 
line_n = 1: col_n = 0: text_p = 1: the_ch = " "
 
do
call nexttok
select case toktyp
case c_integer, c_ident, c_string: out_tok = tok
case else: out_tok = ""
end select
if out_fn = "" then
print err_line, err_col, toktyp, out_tok
else
print #1, err_line, err_col, toktyp, out_tok
end if
loop until errors or tok = ""
if out_fn <> "" then close #1
end
 
' get next tok, toktyp
sub nexttok
toktyp = ""
restart: err_line = line_n: err_col = col_n: tok = the_ch
select case the_ch
case " ", chr$(9), chr$(10): call nextch: goto restart
case "/": call divide_or_comment: if tok = "" then goto restart
 
case "%": call nextch: toktyp = "Op_mod"
case "(": call nextch: toktyp = "LeftParen"
case ")": call nextch: toktyp = "RightParen"
case "*": call nextch: toktyp = "Op_multiply"
case "+": call nextch: toktyp = "Op_add"
case ",": call nextch: toktyp = "Comma"
case "-": call nextch: toktyp = "Op_subtract"
case ";": call nextch: toktyp = "Semicolon"
case "{": call nextch: toktyp = "LeftBrace"
case "}": call nextch: toktyp = "RightBrace"
 
case "&": call follow("&", "Op_and", "")
case "|": call follow("|", "Op_or", "")
case "!": call follow("=", "Op_notequal", "Op_not")
case "<": call follow("=", "Op_lessequal", "Op_less")
case "=": call follow("=", "Op_equal", "Op_assign")
case ">": call follow("=", "Op_greaterequal", "Op_greater")
 
case chr$(34): call read_string
case chr$(39): call read_char_lit
 
case "": toktyp = "End_of_input"
 
case else
if isdigit&(the_ch) then
call read_number
elseif isalpha&(the_ch) then
call read_ident
else
call nextch
end if
end select
end sub
 
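' follow: two-character operators - if the next char equals c, the token type
' is if_both; otherwise it is if_one (an empty if_one means c was mandatory).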
sub follow(c as string, if_both as string, if_one as string)
call nextch
if the_ch = c then
tok = tok + the_ch
call nextch
toktyp = if_both
else
if if_one = "" then call error_exit(line_n, col_n, "Expecting " + c): exit sub
toktyp = if_one
end if
end sub
 
sub read_string
toktyp = c_string
call nextch
do
tok = tok + the_ch
select case the_ch
case chr$(10): call error_exit(line_n, col_n, "EOL in string"): exit sub
case "": call error_exit(line_n, col_n, "EOF in string"): exit sub
case chr$(34): call nextch: exit sub
case else: call nextch
end select
loop
end sub
 
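' read_char_lit: convert a character literal to its numeric code held in tok
' as a digit string; \n and \\ are the only escapes accepted.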
sub read_char_lit
toktyp = c_integer
call nextch
if the_ch = chr$(39) then
call error_exit(err_line, err_col, "Empty character constant"): exit sub
end if
 
if the_ch = "\" then
call nextch
if the_ch = "n" then
tok = "10"
elseif the_ch = "\" then
tok = "92"
else
call error_exit(line_n, col_n, "Unknown escape sequence:" + the_ch): exit sub
end if
else
tok = ltrim$(str$(asc(the_ch)))
end if
 
call nextch
if the_ch <> chr$(39) then
call error_exit(line_n, col_n, "Multi-character constant"): exit sub
end if
call nextch
end sub
 
sub divide_or_comment
call nextch
if the_ch <> "*" then
toktyp = "Op_divide"
else ' skip comments
tok = ""
call nextch
do
if the_ch = "*" then
call nextch
if the_ch = "/" then
call nextch
exit sub
end if
elseif the_ch = "" then
call error_exit(line_n, col_n, "EOF in comment"): exit sub
else
call nextch
end if
loop
end if
end sub
 
sub read_ident
do
call nextch
if not isalnum&(the_ch) then exit do
tok = tok + the_ch
loop
select case tok
case "else": toktyp = "keyword_else"
case "if": toktyp = "keyword_if"
case "print": toktyp = "keyword_print"
case "putc":: toktyp = "keyword_putc"
case "while": toktyp = "keyword_while"
case else: toktyp = c_ident
end select
end sub
 
sub read_number
toktyp = c_integer
do
call nextch
if not isdigit&(the_ch) then exit do
tok = tok + the_ch
loop
 
if isalpha&(the_ch) then
call error_exit(err_line, err_col, "Bogus number: " + tok + the_ch): exit sub
end if
end sub
 
function isalpha&(s as string)
dim c as string
c = left$(s, 1)
isalpha& = c <> "" and instr("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_", c) > 0
end function
 
function isdigit&(s as string)
dim c as string
c = left$(s, 1)
isdigit& = c <> "" and instr("0123456789", c) > 0
end function
 
function isalnum&(s as string)
dim c as string
c = left$(s, 1)
isalnum& = c <> "" and instr("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_", c) > 0
end function
 
' get next char - fold cr/lf into just lf
sub nextch
the_ch = ""
col_n = col_n + 1
if text_p > len(source) then exit sub
 
the_ch = mid$(source, text_p, 1)
text_p = text_p + 1
 
if the_ch = chr$(13) then
the_ch = chr$(10)
if text_p <= len(source) then
if mid$(source, text_p, 1) = chr$(10) then
text_p = text_p + 1
end if
end if
end if
 
if the_ch = chr$(10) then
line_n = line_n + 1
col_n = 0
end if
 
end sub
 
sub error_exit(line_n as integer, col_n as integer, msg as string)
errors = -1
print line_n, col_n, msg
end
end sub
</syntaxhighlight>
{{out|case=test case 3}}
<b>
<pre> 5 16 keyword_print
5 40 Op_subtract
6 16 keyword_putc
6 40 Op_less
7 16 keyword_if
7 40 Op_greater
8 16 keyword_else
8 40 Op_lessequal
9 16 keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
</b>
 
=={{header|Racket}}==
<syntaxhighlight lang="racket">
#lang racket
(require parser-tools/lex)
 
(define-lex-abbrevs
[letter (union (char-range #\a #\z) (char-range #\A #\Z))]
[digit (char-range #\0 #\9)]
[underscore #\_]
[identifier (concatenation (union letter underscore)
(repetition 0 +inf.0 (union letter digit underscore)))]
[integer (repetition 1 +inf.0 digit)]
[char-content (char-complement (char-set "'\n"))]
[char-literal (union (concatenation #\' char-content #\')
"'\\n'" "'\\\\'")]
[string-content (union (char-complement (char-set "\"\n")))]
[string-literal (union (concatenation #\" (repetition 0 +inf.0 string-content) #\")
"\"\\n\"" "\"\\\\\"")]
[keyword (union "if" "else" "while" "print" "putc")]
[operator (union "*" "/" "%" "+" "-" "-"
"<" "<=" ">" ">=" "==" "!="
"!" "=" "&&" "||")]
[symbol (union "(" ")" "{" "}" ";" ",")]
[comment (concatenation "/*" (complement (concatenation any-string "*/" any-string)) "*/")])
 
(define operators-ht
(hash "*" 'Op_multiply "/" 'Op_divide "%" 'Op_mod "+" 'Op_add "-" 'Op_subtract
"<" 'Op_less "<=" 'Op_lessequal ">" 'Op_greater ">=" 'Op_greaterequal "==" 'Op_equal
"!=" 'Op_notequal "!" 'Op_not "=" 'Op_assign "&&" 'Op_and "||" 'Op_or))
 
(define symbols-ht
(hash "(" 'LeftParen ")" 'RightParen
"{" 'LeftBrace "}" 'RightBrace
";" 'Semicolon "," 'Comma))
 
(define (lexeme->keyword l) (string->symbol (~a "Keyword_" l)))
(define (lexeme->operator l) (hash-ref operators-ht l))
(define (lexeme->symbol l) (hash-ref symbols-ht l))
(define (lexeme->char l) (match l
["'\\\\'" #\\]
["'\\n'" #\newline]
[_ (string-ref l 1)]))
 
(define (token name [value #f])
(cons name (if value (list value) '())))
 
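;; lex: wrap a parser-tools lexer around the input port and return a thunk that
;; yields the next position-token on each call; whitespace and comments come
;; back with a #f token and are filtered out later in port->tokens.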
(define (lex ip)
(port-count-lines! ip)
(define my-lexer
(lexer-src-pos
[integer (token 'Integer (string->number lexeme))]
[char-literal (token 'Integer (char->integer (lexeme->char lexeme)))]
[string-literal (token 'String lexeme)]
[keyword (token (lexeme->keyword lexeme))]
[operator (token (lexeme->operator lexeme))]
[symbol (token (lexeme->symbol lexeme))]
[comment #f]
[whitespace #f]
[identifier (token 'Identifier lexeme)]
[(eof) (token 'End_of_input)]))
(define (next-token) (my-lexer ip))
next-token)
 
(define (string->tokens s)
(port->tokens (open-input-string s)))
 
(define (port->tokens ip)
(define next-token (lex ip))
(let loop ()
(match (next-token)
[(position-token t (position offset line col) _)
(set! col (+ col 1)) ; output is 1-based
(match t
[#f (loop)] ; skip whitespace/comments
[(list 'End_of_input) (list (list line col 'End_of_input))]
[(list name value) (cons (list line col name value) (loop))]
[(list name) (cons (list line col name) (loop))]
[_ (error)])])))
 
(define test1 #<<TEST
/*
Hello world
*/
print("Hello, World!\n");
 
TEST
)
 
(define test2 #<<TEST
/*
Show Ident and Integers
*/
phoenix_number = 142857;
print(phoenix_number, "\n");
 
TEST
)
 
(define test3 #<<TEST
/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
*/
/* Print */ print /* Sub */ -
/* Putc */ putc /* Lss */ <
/* If */ if /* Gtr */ >
/* Else */ else /* Leq */ <=
/* While */ while /* Geq */ >=
/* Lbrace */ { /* Eq */ ==
/* Rbrace */ } /* Neq */ !=
/* Lparen */ ( /* And */ &&
/* Rparen */ ) /* Or */ ||
/* Uminus */ - /* Semi */ ;
/* Not */ ! /* Comma */ ,
/* Mul */ * /* Assign */ =
/* Div */ / /* Integer */ 42
/* Mod */ % /* String */ "String literal"
/* Add */ + /* Ident */ variable_name
/* character literal */ '\n'
/* character literal */ '\\'
/* character literal */ ' '
TEST
)
 
(define test4 #<<TEST
/*** test printing, embedded \n and comments with lots of '*' ***/
print(42);
print("\nHello World\nGood Bye\nok\n");
print("Print a slash n - \\n.\n");
TEST
)
 
(define test5 #<<TEST
count = 1;
while (count < 10) {
print("count is: ", count, "\n");
count = count + 1;
}
TEST
)
 
(define (display-tokens ts)
(for ([t ts])
(for ([x t])
(display x) (display "\t\t"))
(newline)))
 
"TEST 1"
(display-tokens (string->tokens test1))
"TEST 2"
(display-tokens (string->tokens test2))
"TEST 3"
(display-tokens (string->tokens test3))
"TEST 4"
(display-tokens (string->tokens test4))
"TEST 5"
(display-tokens (string->tokens test5))
</syntaxhighlight>
 
=={{header|Raku}}==
(formerly Perl 6)
This is more complicated than strictly necessary for this task. It is set up to be easily adapted to do syntax analysis.
 
(Note: there are several bogus comments added solely to help with syntax highlighting.)
 
{{works with|Rakudo|2016.08}}
 
<syntaxhighlight lang="raku" line>grammar tiny_C {
rule TOP { ^ <.whitespace>? <tokens> + % <.whitespace> <.whitespace> <eoi> }
 
rule whitespace { [ <comment> + % <ws> | <ws> ] }
 
token comment { '/*' ~ '*/' .*? }
 
token tokens {
[
| <operator> { make $/<operator>.ast }
| <keyword> { make $/<keyword>.ast }
| <symbol> { make $/<symbol>.ast }
| <identifier> { make $/<identifier>.ast }
| <integer> { make $/<integer>.ast }
| <char> { make $/<char>.ast }
| <string> { make $/<string>.ast }
| <error>
]
}
 
proto token operator {*}
token operator:sym<*> { '*' { make 'Op_multiply' } }
token operator:sym</> { '/'<!before '*'> { make 'Op_divide' } }
token operator:sym<%> { '%' { make 'Op_mod' } }
token operator:sym<+> { '+' { make 'Op_add' } }
token operator:sym<-> { '-' { make 'Op_subtract' } }
token operator:sym('<='){ '<=' { make 'Op_lessequal' } }
token operator:sym('<') { '<' { make 'Op_less' } }
token operator:sym('>='){ '>=' { make 'Op_greaterequal'} }
token operator:sym('>') { '>' { make 'Op_greater' } }
token operator:sym<==> { '==' { make 'Op_equal' } }
token operator:sym<!=> { '!=' { make 'Op_notequal' } }
token operator:sym<!> { '!' { make 'Op_not' } }
token operator:sym<=> { '=' { make 'Op_assign' } }
token operator:sym<&&> { '&&' { make 'Op_and' } }
token operator:sym<||> { '||' { make 'Op_or' } }
 
proto token keyword {*}
token keyword:sym<if> { 'if' { make 'Keyword_if' } }
token keyword:sym<else> { 'else' { make 'Keyword_else' } }
token keyword:sym<putc> { 'putc' { make 'Keyword_putc' } }
token keyword:sym<while> { 'while' { make 'Keyword_while' } }
token keyword:sym<print> { 'print' { make 'Keyword_print' } }
 
proto token symbol {*}
token symbol:sym<(> { '(' { make 'LeftParen' } }
token symbol:sym<)> { ')' { make 'RightParen' } }
token symbol:sym<{> { '{' { make 'LeftBrace' } }
token symbol:sym<}> { '}' { make 'RightBrace' } }
token symbol:sym<;> { ';' { make 'Semicolon' } }
token symbol:sym<,> { ',' { make 'Comma' } }
 
token identifier { <[_A..Za..z]><[_A..Za..z0..9]>* { make 'Identifier ' ~ $/ } }
token integer { <[0..9]>+ { make 'Integer ' ~ $/ } }
 
token char {
'\'' [<-[']> | '\n' | '\\\\'] '\''
{ make 'Char_Literal ' ~ $/.subst("\\n", "\n").substr(1, *-1).ord }
}
 
token string {
'"' <-["\n]>* '"' #'
{
make 'String ' ~ $/;
note 'Error: Unknown escape sequence.' and exit if (~$/ ~~ m:r/ <!after <[\\]>>[\\<-[n\\]>]<!before <[\\]>> /);
}
}
 
token eoi { $ { make 'End_of_input' } }
 
token error {
| '\'''\'' { note 'Error: Empty character constant.' and exit }
| '\'' <-[']> ** {2..*} '\'' { note 'Error: Multi-character constant.' and exit }
| '/*' <-[*]>* $ { note 'Error: End-of-file in comment.' and exit }
| '"' <-["]>* $ { note 'Error: End-of-file in string.' and exit }
| '"' <-["]>*? \n { note 'Error: End of line in string.' and exit } #'
}
}
 
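# parse_it: build a table mapping each character offset of the source to its
# (line, column) pair, then report every matched token at that position.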
sub parse_it ( $c_code ) {
my $l;
my @pos = gather for $c_code.lines>>.chars.kv -> $line, $v {
take [ $line + 1, $_ ] for 1 .. ($v+1); # v+1 for newline
$l = $line+2;
}
@pos.push: [ $l, 1 ]; # capture eoi
 
for flat $c_code<tokens>.list, $c_code<eoi> -> $m {
say join "\t", @pos[$m.from].fmt('%3d'), $m.ast;
}
}
 
my $tokenizer = tiny_C.parse(@*ARGS[0].IO.slurp);
parse_it( $tokenizer );</syntaxhighlight>
 
{{out|case=test case 3}}
<pre>
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Char_Literal 10
21 26 Char_Literal 92
22 26 Char_Literal 32
23 1 End_of_input
</pre>
 
=={{header|RATFOR}}==
{{works with|ratfor77|[https://sourceforge.net/p/chemoelectric/ratfor77/ public domain 1.0]}}
{{works with|gfortran|11.2.1}}
{{works with|f2c|20100827}}
 
 
<syntaxhighlight lang="ratfor">######################################################################
#
# The Rosetta Code scanner in Ratfor 77.
#
#
# How to deal with FORTRAN 77 input is a problem. I use formatted
# input, treating each line as an array of type CHARACTER--regrettably
# of no more than some predetermined, finite length. It is a very
# simple method and presents no significant difficulties, aside from
# the restriction on line length of the input.
#
#
# On a POSIX platform, the program can be compiled with f2c and run
# somewhat as follows:
#
# ratfor77 lex-in-ratfor.r > lex-in-ratfor.f
# f2c -C -Nc40 lex-in-ratfor.f
# cc -O lex-in-ratfor.c -lf2c
# ./a.out < compiler-tests/primes.t
#
# With gfortran, a little differently:
#
# ratfor77 lex-in-ratfor.r > lex-in-ratfor.f
# gfortran -O2 -fcheck=all -std=legacy lex-in-ratfor.f
# ./a.out < compiler-tests/primes.t
#
#
# I/O is strictly from default input and to default output, which, on
# POSIX systems, usually correspond respectively to standard input and
# standard output. (I did not wish to have to deal with unit numbers;
# these are now standardized in ISO_FORTRAN_ENV, but that is not
# available in FORTRAN 77.)
#
#---------------------------------------------------------------------
 
# Some parameters you may wish to modify.
 
define(LINESZ, 256) # Size of an input line.
define(OUTLSZ, 512) # Size of an output line.
define(PSHBSZ, 10) # Size of the character pushback buffer.
define(STRNSZ, 4096) # Size of the string pool.
 
#---------------------------------------------------------------------
 
define(EOF, -1)
define(NEWLIN, 10) # Unix newline (the LF control character).
define(BACKSL, 92) # ASCII backslash.
 
define(ILINNO, 1) # Line number's index.
define(ICOLNO, 2) # Column number's index.
 
define(CHRSZ, 3) # See ILINNO and ICOLNO above.
define(ICHRCD, 3) # Character code's index.
 
define(TOKSZ, 5) # See ILINNO and ICOLNO above.
define(ITOKNO, 3) # Token number's index.
define(IARGIX, 4) # Index of the string pool index.
define(IARGLN, 5) # Index of the string length.
 
define(TKELSE, 0)
define(TKIF, 1)
define(TKPRNT, 2)
define(TKPUTC, 3)
define(TKWHIL, 4)
define(TKMUL, 5)
define(TKDIV, 6)
define(TKMOD, 7)
define(TKADD, 8)
define(TKSUB, 9)
define(TKNEG, 10)
define(TKLT, 11)
define(TKLE, 12)
define(TKGT, 13)
define(TKGE, 14)
define(TKEQ, 15)
define(TKNE, 16)
define(TKNOT, 17)
define(TKASGN, 18)
define(TKAND, 19)
define(TKOR, 20)
define(TKLPAR, 21)
define(TKRPAR, 22)
define(TKLBRC, 23)
define(TKRBRC, 24)
define(TKSEMI, 25)
define(TKCMMA, 26)
define(TKID, 27)
define(TKINT, 28)
define(TKSTR, 29)
define(TKEOI, 30)
 
define(LOC10, 1) # Location of "10" in the string pool.
define(LOC92, 3) # Location of "92" in the string pool.
 
#---------------------------------------------------------------------
 
subroutine addstr (strngs, istrng, src, i0, n0, i, n)
 
# Add a string to the string pool.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character src(*) # Source string.
integer i0, n0 # Index and length in source string.
integer i, n # Index and length in string pool.
 
integer j
 
if (STRNSZ < istrng + (n0 - 1))
{
write (*, '(''string pool exhausted'')')
stop
}
for (j = 0; j < n0; j = j + 1)
strngs(istrng + j) = src(i0 + j)
i = istrng
n = n0
istrng = istrng + n0
end
 
subroutine cpystr (strngs, i, n, dst, i0)
 
# Copy a string from the string pool to an output string.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer i, n # Index and length in string pool.
character dst(OUTLSZ) # Destination string.
integer i0 # Index within destination string.
 
integer j
 
if (i0 < 1 || OUTLSZ < i0 + (n - 1))
{
write (*, '(''string boundary exceeded'')')
stop
}
for (j = 0; j < n; j = j + 1)
dst(i0 + j) = strngs(i + j)
end
 
#---------------------------------------------------------------------
 
subroutine getchr (line, linno, colno, pushbk, npshbk, chr)
 
# Get a character, with its line number and column number.
 
implicit none
 
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer chr(CHRSZ)
 
# End of file is indicated (as in C) by a negative "char code"
# called "EOF".
 
character*20 fmt
integer stat
integer chr1(CHRSZ)
 
if (0 < npshbk)
{
chr(ICHRCD) = pushbk(ICHRCD, npshbk)
chr(ILINNO) = pushbk(ILINNO, npshbk)
chr(ICOLNO) = pushbk(ICOLNO, npshbk)
npshbk = npshbk - 1
}
else if (colno <= LINESZ)
{
chr(ICHRCD) = ichar (line(colno))
chr(ILINNO) = linno
chr(ICOLNO) = colno
colno = colno + 1
}
else
{
# Return a newline character.
chr(ICHRCD) = NEWLIN
chr(ILINNO) = linno
chr(ICOLNO) = colno
 
# Fetch a new line.
linno = linno + 1
colno = 1
 
# Read a line of text as an array of characters.
write (fmt, '(''('', I10, ''A1)'')') LINESZ
read (*, fmt, iostat = stat) line
 
if (stat != 0)
{
# If end of file has been reached, push an EOF.
chr1(ICHRCD) = EOF
chr1(ILINNO) = linno
chr1(ICOLNO) = colno
call pshchr (pushbk, npshbk, chr1)
}
}
end
 
subroutine pshchr (pushbk, npshbk, chr)
 
# Push back a character.
 
implicit none
 
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer chr(CHRSZ)
 
if (PSHBSZ <= npshbk)
{
write (*, '(''pushback buffer overfull'')')
stop
}
npshbk = npshbk + 1
pushbk(ICHRCD, npshbk) = chr(ICHRCD)
pushbk(ILINNO, npshbk) = chr(ILINNO)
pushbk(ICOLNO, npshbk) = chr(ICOLNO)
end
 
subroutine getpos (line, linno, colno, pushbk, npshbk, ln, cn)
 
# Get the position of the next character.
 
implicit none
 
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer ln, cn # The line and column nos. returned.
 
integer chr(CHRSZ)
 
call getchr (line, linno, colno, pushbk, npshbk, chr)
ln = chr(ILINNO)
cn = chr(ICOLNO)
call pshchr (pushbk, npshbk, chr)
end
 
#---------------------------------------------------------------------
 
function isspc (c)
 
# Is c character code for a space?
 
implicit none
 
integer c
logical isspc
 
#
# The following is correct for ASCII: 32 is the SPACE character, and
# 9 to 13 are control characters commonly regarded as spaces.
#
# In Unicode these are all code points for spaces, but so are others
# besides.
#
isspc = (c == 32 || (9 <= c && c <= 13))
end
 
function isdgt (c)
 
# Is c character code for a digit?
 
implicit none
 
integer c
logical isdgt
 
isdgt = (ichar ('0') <= c && c <= ichar ('9'))
end
 
function isalph (c)
 
# Is c character code for a letter?
 
implicit none
 
integer c
logical isalph
 
#
# The following is correct for ASCII and Unicode, but not for
# EBCDIC.
#
isalph = (ichar ('a') <= c && c <= ichar ('z')) _
|| (ichar ('A') <= c && c <= ichar ('Z'))
end
 
function isid0 (c)
 
# Is c character code for the start of an identifier?
 
implicit none
 
integer c
logical isid0
 
logical isalph
 
isid0 = isalph (c) || c == ichar ('_')
end
 
function isid1 (c)
 
# Is c character code for the continuation of an identifier?
 
implicit none
 
integer c
logical isid1
 
logical isalph
logical isdgt
 
isid1 = isalph (c) || isdgt (c) || c == ichar ('_')
end
 
#---------------------------------------------------------------------
 
function trimlf (str, n)
 
# "Trim left" leading spaces.
 
implicit none
 
character str(*) # The string to "trim".
integer n # The length.
integer trimlf # The index of the first non-space
# character, or n + 1.
 
logical isspc
 
integer j
logical done
 
j = 1
done = .false.
while (!done)
{
if (j == n + 1)
done = .true.
else if (!isspc (ichar (str(j))))
done = .true.
else
j = j + 1
}
trimlf = j
end
 
function trimrt (str, n)
 
# "Trim right" trailing spaces.
 
implicit none
 
character str(*) # The string to "trim".
integer n # The length including trailing spaces.
integer trimrt # The length without trailing spaces.
 
logical isspc
 
integer j
logical done
 
j = n
done = .false.
while (!done)
{
if (j == 0)
done = .true.
else if (!isspc (ichar (str(j))))
done = .true.
else
j = j - 1
}
trimrt = j
end
 
#---------------------------------------------------------------------
 
subroutine toknam (tokno, str, i)
 
# Copy a token name to the character array str, starting at i.
 
implicit none
 
integer tokno
character str(*)
integer i
integer j
 
character*16 names(0:30)
character*16 nm
 
data names / "Keyword_else ", _
"Keyword_if ", _
"Keyword_print ", _
"Keyword_putc ", _
"Keyword_while ", _
"Op_multiply ", _
"Op_divide ", _
"Op_mod ", _
"Op_add ", _
"Op_subtract ", _
"Op_negate ", _
"Op_less ", _
"Op_lessequal ", _
"Op_greater ", _
"Op_greaterequal ", _
"Op_equal ", _
"Op_notequal ", _
"Op_not ", _
"Op_assign ", _
"Op_and ", _
"Op_or ", _
"LeftParen ", _
"RightParen ", _
"LeftBrace ", _
"RightBrace ", _
"Semicolon ", _
"Comma ", _
"Identifier ", _
"Integer ", _
"String ", _
"End_of_input " /
 
nm = names(tokno)
for (j = 0; j < 16; j = j + 1)
str(i + j) = nm(1 + j : 1 + j)
end
 
subroutine intstr (str, i, n, x)
 
# Convert a positive integer to a substring.
 
implicit none
 
character str(*) # Destination string.
integer i, n # Index and length of the field.
integer x # The positive integer to represent.
 
integer j
integer y
 
if (x == 0)
{
for (j = 0; j < n - 1; j = j + 1)
str(i + j) = ' '
str(i + j) = '0'
}
else
{
y = x
for (j = n - 1; 0 <= j; j = j - 1)
{
if (y == 0)
str(i + j) = ' '
else
{
str(i + j) = char (mod (y, 10) + ichar ('0'))
y = y / 10
}
}
}
end
 
subroutine prttok (strngs, tok)
 
# Print a token.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer tok(TOKSZ) # The token.
 
integer trimrt
 
character line(OUTLSZ)
character*20 fmt
integer i, n
integer tokno
 
for (i = 1; i <= OUTLSZ; i = i + 1)
line(i) = ' '
 
call intstr (line, 1, 10, tok(ILINNO))
call intstr (line, 12, 10, tok(ICOLNO))
 
tokno = tok(ITOKNO)
call toknam (tokno, line, 25)
if (tokno == TKID || tokno == TKINT || tokno == TKSTR)
{
i = tok(IARGIX)
n = tok(IARGLN)
call cpystr (strngs, i, n, line, 45)
}
 
n = trimrt (line, OUTLSZ)
write (fmt, '(''('', I10, ''A)'')') n
write (*, fmt) (line(i), i = 1, n)
end
 
#---------------------------------------------------------------------
 
subroutine wrtpos (ln, cn)
 
implicit none
 
integer ln, cn
 
write (*, 1000) ln, cn
1000 format ('At line ', I5, ', column ' I5)
end
 
#---------------------------------------------------------------------
 
subroutine utcmnt (ln, cn)
 
implicit none
 
integer ln, cn
 
call wrtpos (ln, cn)
write (*, '(''Unterminated comment'')')
stop
end
 
subroutine skpcmt (line, linno, colno, pushbk, npshbk, ln, cn)
 
# Skip to the end of a comment.
 
implicit none
 
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer ln, cn # Line and column of start of comment.
 
integer chr(CHRSZ)
logical done
 
done = .false.
while (!done)
{
call getchr (line, linno, colno, pushbk, npshbk, chr)
if (chr(ICHRCD) == ichar ('*'))
{
call getchr (line, linno, colno, pushbk, npshbk, chr)
if (chr(ICHRCD) == ichar ('/'))
done = .true.
else if (chr(ICHRCD) == EOF)
call utcmnt (ln, cn)
}
else if (chr(ICHRCD) == EOF)
call utcmnt (ln, cn)
}
end
 
subroutine skpspc (line, linno, colno, pushbk, npshbk)
 
# Skip spaces and comments.
 
implicit none
 
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
 
logical isspc
 
integer chr(CHRSZ)
integer chr1(CHRSZ)
integer ln, cn
logical done
 
done = .false.
while (!done)
{
call getchr (line, linno, colno, pushbk, npshbk, chr)
if (!isspc (chr(ICHRCD)))
{
if (chr(ICHRCD) != ichar ('/'))
{
call pshchr (pushbk, npshbk, chr)
done = .true.
}
else
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) != ichar ('*'))
{
call pshchr (pushbk, npshbk, chr1)
call pshchr (pushbk, npshbk, chr)
done = .true.
}
else
{
ln = chr(ILINNO)
cn = chr(ICOLNO)
call skpcmt (line, linno, colno, pushbk, npshbk, _
ln, cn)
}
}
}
}
end
 
#---------------------------------------------------------------------
 
subroutine rwdlkp (strngs, istrng, src, i0, n0, ln, cn, tok)
 
# Reserved word lookup
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character src(*) # The source string.
integer i0, n0 # Index and length of the substring.
integer ln, cn # Line and column number
# to associate with the token.
integer tok(TOKSZ) # The output token.
 
integer tokno
integer i, n
 
tokno = TKID
 
if (n0 == 2)
{
if (src(i0) == 'i' && src(i0 + 1) == 'f')
tokno = TKIF
}
else if (n0 == 4)
{
if (src(i0) == 'e' && src(i0 + 1) == 'l' _
&& src(i0 + 2) == 's' && src(i0 + 3) == 'e')
tokno = TKELSE
else if (src(i0) == 'p' && src(i0 + 1) == 'u' _
&& src(i0 + 2) == 't' && src(i0 + 3) == 'c')
tokno = TKPUTC
}
else if (n0 == 5)
{
if (src(i0) == 'p' && src(i0 + 1) == 'r' _
&& src(i0 + 2) == 'i' && src(i0 + 3) == 'n' _
&& src(i0 + 4) == 't')
tokno = TKPRNT
else if (src(i0) == 'w' && src(i0 + 1) == 'h' _
&& src(i0 + 2) == 'i' && src(i0 + 3) == 'l' _
&& src(i0 + 4) == 'e')
tokno = TKWHIL
}
 
i = 0
n = 0
if (tokno == TKID)
call addstr (strngs, istrng, src, i0, n0, i, n)
 
tok(ITOKNO) = tokno
tok(IARGIX) = i
tok(IARGLN) = n
tok(ILINNO) = ln
tok(ICOLNO) = cn
end
 
subroutine scnwrd (line, linno, colno, pushbk, npshbk, ln, cn, buf, n)
 
# Scan characters that may represent an identifier, reserved word,
# or integer literal.
 
implicit none
 
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer ln, cn # Line and column number of the start.
character buf(LINESZ) # The output buffer.
integer n # The length of the string collected.
 
logical isid1
 
integer chr(CHRSZ)
 
n = 0
call getchr (line, linno, colno, pushbk, npshbk, chr)
ln = chr(ILINNO)
cn = chr(ICOLNO)
while (isid1 (chr(ICHRCD)))
{
n = n + 1
buf(n) = char (chr(ICHRCD))
call getchr (line, linno, colno, pushbk, npshbk, chr)
}
call pshchr (pushbk, npshbk, chr)
end
 
subroutine scnidr (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
 
# Scan an identifier or reserved word.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer ln, cn # Line and column number of the start.
integer tok(TOKSZ) # The output token.
 
character buf(LINESZ)
integer n
call scnwrd (line, linno, colno, pushbk, npshbk, ln, cn, buf, n)
call rwdlkp (strngs, istrng, buf, 1, n, ln, cn, tok)
end
 
subroutine scnint (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
 
# Scan a positive integer literal.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer ln, cn # Line and column number of the start.
integer tok(TOKSZ) # The output token.
 
logical isdgt
 
character buf(LINESZ)
integer n0, n
integer i, j, k
character*80 fmt
call scnwrd (line, linno, colno, pushbk, npshbk, ln, cn, buf, n0)
for (j = 1; j <= n0; j = j + 1)
if (!isdgt (ichar (buf(j))))
{
call wrtpos (ln, cn)
write (fmt, 1000) n0
1000 format ('(''Not a legal word: "''', I10, 'A, ''"'')')
write (*, fmt) (buf(k), k = 1, n0)
stop
}
 
call addstr (strngs, istrng, buf, 1, n0, i, n)
 
tok(ITOKNO) = TKINT
tok(IARGIX) = i
tok(IARGLN) = n
tok(ILINNO) = ln
tok(ICOLNO) = cn
end
 
subroutine utclit (ln, cn)
 
implicit none
 
integer ln, cn
 
call wrtpos (ln, cn)
write (*, '(''Unterminated character literal'')')
stop
end
 
subroutine scnch1 (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
 
# Scan a character literal, without yet checking that the literal
# ends correctly.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer tok(TOKSZ) # The output token.
 
integer trimlf
 
integer bufsz
parameter (bufsz = 40)
 
integer chr(CHRSZ)
integer chr1(CHRSZ)
integer chr2(CHRSZ)
integer ln, cn
character buf(bufsz)
integer i, j, n
 
# Refetch the opening quote.
call getchr (line, linno, colno, pushbk, npshbk, chr)
ln = chr(ILINNO)
cn = chr(ICOLNO)
 
tok(ITOKNO) = TKINT
tok(ILINNO) = ln
tok(ICOLNO) = cn
 
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == EOF)
call utclit (ln, cn)
if (chr1(ICHRCD) == BACKSL)
{
call getchr (line, linno, colno, pushbk, npshbk, chr2)
if (chr2(ICHRCD) == EOF)
call utclit (ln, cn)
else if (chr2(ICHRCD) == ichar ('n'))
{
tok(IARGIX) = LOC10 # "10" = code for Unix newline
tok(IARGLN) = 2
}
else if (chr2(ICHRCD) == BACKSL)
{
tok(IARGIX) = LOC92 # "92" = code for backslash
tok(IARGLN) = 2
}
else
{
call wrtpos (ln, cn)
write (*, '(''Unsupported escape: '', 1A)') _
char (chr2(ICHRCD))
stop
}
}
else
{
# Character codes are non-negative, so we can use intstr.
call intstr (buf, 1, bufsz, chr1(ICHRCD))
 
j = trimlf (buf, bufsz)
call addstr (strngs, istrng, buf, j, bufsz - (j - 1), i, n)
tok(IARGIX) = i
tok(IARGLN) = n
}
end
 
subroutine scnch (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
 
# Scan a character literal.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer tok(TOKSZ) # The output token.
 
integer ln, cn
integer chr(CHRSZ)
 
call getpos (line, linno, colno, pushbk, npshbk, ln, cn)
call scnch1 (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
call getchr (line, linno, colno, pushbk, npshbk, chr)
if (chr(ICHRCD) != ichar (''''))
{
while (.true.)
{
if (chr(ICHRCD) == EOF)
{
call utclit (ln, cn)
stop
}
else if (chr(ICHRCD) == ichar (''''))
{
call wrtpos (ln, cn)
write (*, '(''Unsupported multicharacter literal'')')
stop
}
call getchr (line, linno, colno, pushbk, npshbk, chr)
}
}
end
 
subroutine scnstr (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
 
# Scan a string literal.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer tok(TOKSZ) # The output token.
 
integer ln, cn
integer chr1(CHRSZ)
integer chr2(CHRSZ)
character buf(LINESZ + 10) # Enough space, with some room to spare.
integer n0
integer i, n
 
call getchr (line, linno, colno, pushbk, npshbk, chr1)
ln = chr1(ILINNO)
cn = chr1(ICOLNO)
 
tok(ITOKNO) = TKSTR
tok(ILINNO) = ln
tok(ICOLNO) = cn
 
n0 = 1
buf(n0) = '"'
 
call getchr (line, linno, colno, pushbk, npshbk, chr1)
while (chr1(ICHRCD) != ichar ('"'))
{
# Our input method always puts a NEWLIN before EOF, and so this
# test is redundant, unless someone changes the input method.
if (chr1(ICHRCD) == EOF || chr1(ICHRCD) == NEWLIN)
{
call wrtpos (ln, cn)
write (*, '(''Unterminated string literal'')')
stop
}
if (chr1(ICHRCD) == BACKSL)
{
call getchr (line, linno, colno, pushbk, npshbk, chr2)
if (chr2(ICHRCD) == ichar ('n'))
{
n0 = n0 + 1
buf(n0) = char (BACKSL)
n0 = n0 + 1
buf(n0) = 'n'
}
else if (chr2(ICHRCD) == BACKSL)
{
n0 = n0 + 1
buf(n0) = char (BACKSL)
n0 = n0 + 1
buf(n0) = char (BACKSL)
}
else
{
call wrtpos (chr1(ILINNO), chr1(ICOLNO))
write (*, '(''Unsupported escape sequence'')')
stop
}
}
else
{
n0 = n0 + 1
buf(n0) = char (chr1(ICHRCD))
}
call getchr (line, linno, colno, pushbk, npshbk, chr1)
}
n0 = n0 + 1
buf(n0) = '"'
 
call addstr (strngs, istrng, buf, 1, n0, i, n)
tok(IARGIX) = i
tok(IARGLN) = n
end
 
subroutine unxchr (chr)
 
implicit none
 
integer chr(CHRSZ)
 
call wrtpos (chr(ILINNO), chr(ICOLNO))
write (*, 1000) char (chr(ICHRCD))
1000 format ('Unexpected character ''', A1, '''')
stop
end
 
subroutine scntok (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
 
# Scan a token.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer tok(TOKSZ) # The output token.
 
logical isdgt
logical isid0
 
integer chr(CHRSZ)
integer chr1(CHRSZ)
integer ln, cn
 
call getchr (line, linno, colno, pushbk, npshbk, chr)
ln = chr(ILINNO)
cn = chr(ICOLNO)
tok(ILINNO) = ln
tok(ICOLNO) = cn
tok(IARGIX) = 0
tok(IARGLN) = 0
if (chr(ICHRCD) == ichar (','))
tok(ITOKNO) = TKCMMA
else if (chr(ICHRCD) == ichar (';'))
tok(ITOKNO) = TKSEMI
else if (chr(ICHRCD) == ichar ('('))
tok(ITOKNO) = TKLPAR
else if (chr(ICHRCD) == ichar (')'))
tok(ITOKNO) = TKRPAR
else if (chr(ICHRCD) == ichar ('{'))
tok(ITOKNO) = TKLBRC
else if (chr(ICHRCD) == ichar ('}'))
tok(ITOKNO) = TKRBRC
else if (chr(ICHRCD) == ichar ('*'))
tok(ITOKNO) = TKMUL
else if (chr(ICHRCD) == ichar ('/'))
tok(ITOKNO) = TKDIV
else if (chr(ICHRCD) == ichar ('%'))
tok(ITOKNO) = TKMOD
else if (chr(ICHRCD) == ichar ('+'))
tok(ITOKNO) = TKADD
else if (chr(ICHRCD) == ichar ('-'))
tok(ITOKNO) = TKSUB
else if (chr(ICHRCD) == ichar ('<'))
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == ichar ('='))
tok(ITOKNO) = TKLE
else
{
call pshchr (pushbk, npshbk, chr1)
tok(ITOKNO) = TKLT
}
}
else if (chr(ICHRCD) == ichar ('>'))
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == ichar ('='))
tok(ITOKNO) = TKGE
else
{
call pshchr (pushbk, npshbk, chr1)
tok(ITOKNO) = TKGT
}
}
else if (chr(ICHRCD) == ichar ('='))
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == ichar ('='))
tok(ITOKNO) = TKEQ
else
{
call pshchr (pushbk, npshbk, chr1)
tok(ITOKNO) = TKASGN
}
}
else if (chr(ICHRCD) == ichar ('!'))
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == ichar ('='))
tok(ITOKNO) = TKNE
else
{
call pshchr (pushbk, npshbk, chr1)
tok(ITOKNO) = TKNOT
}
}
else if (chr(ICHRCD) == ichar ('&'))
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == ichar ('&'))
tok(ITOKNO) = TKAND
else
call unxchr (chr)
}
else if (chr(ICHRCD) == ichar ('|'))
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == ichar ('|'))
tok(ITOKNO) = TKOR
else
call unxchr (chr)
}
else if (chr(ICHRCD) == ichar ('"'))
{
call pshchr (pushbk, npshbk, chr)
call scnstr (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
}
else if (chr(ICHRCD) == ichar (''''))
{
call pshchr (pushbk, npshbk, chr)
call scnch (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
}
else if (isdgt (chr(ICHRCD)))
{
call pshchr (pushbk, npshbk, chr)
call scnint (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
}
else if (isid0 (chr(ICHRCD)))
{
call pshchr (pushbk, npshbk, chr)
call scnidr (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
}
else
call unxchr (chr)
end
 
subroutine scntxt (strngs, istrng, _
line, linno, colno, pushbk, npshbk)
 
# Scan the text and print the token stream.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
 
integer chr(CHRSZ)
integer tok(TOKSZ)
 
chr(ICHRCD) = ichar ('x')
while (chr(ICHRCD) != EOF)
{
call skpspc (line, linno, colno, pushbk, npshbk)
call getchr (line, linno, colno, pushbk, npshbk, chr)
if (chr(ICHRCD) != EOF)
{
call pshchr (pushbk, npshbk, chr)
call scntok (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
call prttok (strngs, tok)
}
}
tok(ITOKNO) = TKEOI
tok(ILINNO) = chr(ILINNO)
tok(ICOLNO) = chr(ICOLNO)
tok(IARGIX) = 0
tok(IARGLN) = 0
call prttok (strngs, tok)
end
 
#---------------------------------------------------------------------
 
program lex
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
 
integer i, n
 
istrng = 1
 
# Locate "10" (newline) at 1 in the string pool.
line(1) = '1'
line(2) = '0'
call addstr (strngs, istrng, line, 1, 2, i, n)
if (i != 1 && n != 2)
{
write (*, '(''internal error'')')
stop
}
 
# Locate "92" (backslash) at 3 in the string pool.
line(1) = '9'
line(2) = '2'
call addstr (strngs, istrng, line, 1, 2, i, n)
if (i != 3 && n != 2)
{
write (*, '(''internal error'')')
stop
}
 
linno = 0
colno = LINESZ + 1 # This will trigger a READ.
npshbk = 0
 
call scntxt (strngs, istrng, line, linno, colno, pushbk, npshbk)
end
 
######################################################################</syntaxhighlight>
 
 
{{out}}
<pre>$ ratfor77 lex-in-ratfor.r > lex-in-ratfor.f && gfortran -O2 -std=legacy -fcheck=all lex-in-ratfor.f && ./a.out < compiler-tests/primes.t
4 1 Identifier count
4 7 Op_assign
4 9 Integer 1
4 10 Semicolon
5 1 Identifier n
5 3 Op_assign
5 5 Integer 1
5 6 Semicolon
6 1 Identifier limit
6 7 Op_assign
6 9 Integer 100
6 12 Semicolon
7 1 Keyword_while
7 7 LeftParen
7 8 Identifier n
7 10 Op_less
7 12 Identifier limit
7 17 RightParen
7 19 LeftBrace
8 5 Identifier k
8 6 Op_assign
8 7 Integer 3
8 8 Semicolon
9 5 Identifier p
9 6 Op_assign
9 7 Integer 1
9 8 Semicolon
10 5 Identifier n
10 6 Op_assign
10 7 Identifier n
10 8 Op_add
10 9 Integer 2
10 10 Semicolon
11 5 Keyword_while
11 11 LeftParen
11 12 LeftParen
11 13 Identifier k
11 14 Op_multiply
11 15 Identifier k
11 16 Op_lessequal
11 18 Identifier n
11 19 RightParen
11 21 Op_and
11 24 LeftParen
11 25 Identifier p
11 26 RightParen
11 27 RightParen
11 29 LeftBrace
12 9 Identifier p
12 10 Op_assign
12 11 Identifier n
12 12 Op_divide
12 13 Identifier k
12 14 Op_multiply
12 15 Identifier k
12 16 Op_notequal
12 18 Identifier n
12 19 Semicolon
13 9 Identifier k
13 10 Op_assign
13 11 Identifier k
13 12 Op_add
13 13 Integer 2
13 14 Semicolon
14 5 RightBrace
15 5 Keyword_if
15 8 LeftParen
15 9 Identifier p
15 10 RightParen
15 12 LeftBrace
16 9 Keyword_print
16 14 LeftParen
16 15 Identifier n
16 16 Comma
16 18 String " is prime\n"
16 31 RightParen
16 32 Semicolon
17 9 Identifier count
17 15 Op_assign
17 17 Identifier count
17 23 Op_add
17 25 Integer 1
17 26 Semicolon
18 5 RightBrace
19 1 RightBrace
20 1 Keyword_print
20 6 LeftParen
20 7 String "Total primes found: "
20 29 Comma
20 31 Identifier count
20 36 Comma
20 38 String "\n"
20 42 RightParen
20 43 Semicolon
21 1 End_of_input</pre>
 
=={{header|Scala}}==
The complete implementation for the compiler tasks can be found in a GitHub repository at [https://github.com/edadma/rosettacodeCompiler github.com/edadma/rosettacodeCompiler] which includes full unit testing for the samples given in [[Compiler/Sample programs]].
 
The following code implements a configurable (from a symbol map and keyword map provided as parameters) lexical analyzer.
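
A minimal driver for the analyzer might look like the following sketch (the <code>LexMain</code> object and the sample source string are illustrative only; <code>apply</code> and <code>fromString</code> are the entry points defined in the listing below):

<syntaxhighlight lang="scala">import xyz.hyperreal.rosettacodeCompiler.LexicalAnalyzer

object LexMain extends App {
  // Instantiate the analyzer pre-configured with the task's symbol and keyword maps,
  // then tokenize a small program supplied as a string; one token is printed per line.
  LexicalAnalyzer.apply.fromString("""print("Hello, World!\n");""")
}</syntaxhighlight>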
 
<syntaxhighlight lang="scala">
package xyz.hyperreal.rosettacodeCompiler
 
import scala.io.Source
import scala.util.matching.Regex
 
object LexicalAnalyzer {
private val EOT = '\u0004'
 
val symbols =
Map(
"*" -> "Op_multiply",
"/" -> "Op_divide",
"%" -> "Op_mod",
"+" -> "Op_add",
"-" -> "Op_minus",
"<" -> "Op_less",
"<=" -> "Op_lessequal",
">" -> "Op_greater",
">=" -> "Op_greaterequal",
"==" -> "Op_equal",
"!=" -> "Op_notequal",
"!" -> "Op_not",
"=" -> "Op_assign",
"&&" -> "Op_and",
"¦¦" -> "Op_or",
"(" -> "LeftParen",
")" -> "RightParen",
"{" -> "LeftBrace",
"}" -> "RightBrace",
";" -> "Semicolon",
"," -> "Comma"
)
 
val keywords =
Map(
"if" -> "Keyword_if",
"else" -> "Keyword_else",
"while" -> "Keyword_while",
"print" -> "Keyword_print",
"putc" -> "Keyword_putc"
)
val alpha = ('a' to 'z' toSet) ++ ('A' to 'Z')
val numeric = '0' to '9' toSet
val alphanumeric = alpha ++ numeric
val identifiers = StartRestToken("Identifier", alpha + '_', alphanumeric + '_')
val integers = SimpleToken("Integer", numeric, alpha, "alpha characters may not follow right after a number")
 
val characters =
DelimitedToken("Integer",
'\'',
"[^'\\n]|\\\\n|\\\\\\\\" r,
"invalid character literal",
"unclosed character literal")
 
val strings =
DelimitedToken("String", '"', "[^\"\\n]*" r, "invalid string literal", "unclosed string literal")
 
def apply =
new LexicalAnalyzer(4, symbols, keywords, "End_of_input", identifiers, integers, characters, strings)
 
abstract class Token
case class StartRestToken(name: String, start: Set[Char], rest: Set[Char]) extends Token
case class SimpleToken(name: String, chars: Set[Char], exclude: Set[Char], excludeError: String) extends Token
case class DelimitedToken(name: String, delimiter: Char, pattern: Regex, patternError: String, unclosedError: String)
extends Token
}
 
class LexicalAnalyzer(tabs: Int,
symbols: Map[String, String],
keywords: Map[String, String],
endOfInput: String,
identifier: LexicalAnalyzer.Token,
tokens: LexicalAnalyzer.Token*) {
 
import LexicalAnalyzer._
 
private val symbolStartChars = symbols.keys map (_.head) toSet
private val symbolChars = symbols.keys flatMap (_.toList) toSet
private var curline: Int = _
private var curcol: Int = _
 
def fromStdin = fromSource(Source.stdin)
 
def fromString(src: String) = fromSource(Source.fromString(src))
 
def fromSource(ast: Source) = {
curline = 1
curcol = 1
 
var s = (ast ++ Iterator(EOT)) map (new Chr(_)) toStream
 
tokenize
 
def token(name: String, first: Chr) = println(f"${first.line}%5d ${first.col}%6d $name")
 
def value(name: String, v: String, first: Chr) = println(f"${first.line}%5d ${first.col}%6d $name%-14s $v")
 
def until(c: Char) = {
val buf = new StringBuilder
 
def until: String =
if (s.head.c == EOT || s.head.c == c)
buf.toString
else {
buf += getch
until
}
 
until
}
 
def next = s = s.tail
 
def getch = {
val c = s.head.c
 
next
c
}
 
def consume(first: Char, cs: Set[Char]) = {
val buf = new StringBuilder
 
def consume: String =
if (s.head.c == EOT || !cs(s.head.c))
buf.toString
else {
buf += getch
consume
}
 
buf += first
consume
}
 
def comment(start: Chr): Unit = {
until('*')
 
if (s.head.c == EOT || s.tail.head.c == EOT)
sys.error(s"unclosed comment ${start.at}")
else if (s.tail.head.c != '/') {
next
comment(start)
} else {
next
next
}
}
 
def recognize(t: Token): Option[(String, String)] = {
val first = s
 
next
 
t match {
case StartRestToken(name, start, rest) =>
if (start(first.head.c))
Some((name, consume(first.head.c, rest)))
else {
s = first
None
}
case SimpleToken(name, chars, exclude, excludeError) =>
if (chars(first.head.c)) {
val m = consume(first.head.c, chars)
 
if (exclude(s.head.c))
sys.error(s"$excludeError ${s.head.at}")
else
Some((name, m))
} else {
s = first
None
}
case DelimitedToken(name, delimiter, pattern, patternError, unclosedError) =>
if (first.head.c == delimiter) {
val m = until(delimiter)
 
if (s.head.c != delimiter)
sys.error(s"$unclosedError ${first.head.at}")
else if (pattern.pattern.matcher(m).matches) {
next
Some((name, s"$delimiter$m$delimiter"))
} else
sys.error(s"$patternError ${s.head.at}")
} else {
s = first
None
}
}
}
 
def tokenize: Unit =
if (s.head.c == EOT)
token(endOfInput, s.head)
else {
if (s.head.c.isWhitespace)
next
else if (s.head.c == '/' && s.tail.head.c == '*')
comment(s.head)
else if (symbolStartChars(s.head.c)) {
val first = s.head
val buf = new StringBuilder
 
while (!symbols.contains(buf.toString) && s.head.c != EOT && symbolChars(s.head.c)) buf += getch
 
while (symbols.contains(buf.toString :+ s.head.c) && s.head.c != EOT && symbolChars(s.head.c)) buf += getch
 
symbols get buf.toString match {
case Some(name) => token(name, first)
case None => sys.error(s"unrecognized symbol: '${buf.toString}' ${first.at}")
}
} else {
val first = s.head
 
recognize(identifier) match {
case None =>
find(0)
 
@scala.annotation.tailrec
def find(t: Int): Unit =
if (t == tokens.length)
sys.error(s"unrecognized character ${first.at}")
else
recognize(tokens(t)) match {
case None => find(t + 1)
case Some((name, v)) => value(name, v, first)
}
case Some((name, ident)) =>
keywords get ident match {
case None => value(name, ident, first)
case Some(keyword) => token(keyword, first)
}
}
}
 
tokenize
}
}
 
private class Chr(val c: Char) {
val line = curline
val col = curcol
 
if (c == '\n') {
curline += 1
curcol = 1
} else if (c == '\r')
curcol = 1
else if (c == '\t')
curcol += tabs - (curcol - 1) % tabs
else
curcol += 1
 
def at = s"[${line}, ${col}]"
 
override def toString: String = s"<$c, $line, $col>"
}
 
}
</syntaxhighlight>
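 
A minimal, hypothetical usage sketch (the driver object and the sample source string below are not part of the original listing): assuming the code above is compiled as the package <code>xyz.hyperreal.rosettacodeCompiler</code>, this feeds a one-line program to the analyzer, which prints the token listing to standard output.

<syntaxhighlight lang="scala">import xyz.hyperreal.rosettacodeCompiler.LexicalAnalyzer

object LexExample extends App {
  // Build an analyzer configured with the default symbol and keyword maps,
  // then tokenize a one-line program; the tokens are printed to stdout.
  LexicalAnalyzer.apply.fromString("print(\"Hello, World!\\n\");")
}</syntaxhighlight>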
 
=={{header|Scheme}}==
 
<syntaxhighlight lang="scheme">
(import (scheme base)
(scheme char)
(scheme file)
(scheme process-context)
(scheme write))
 
(define *symbols* (list (cons #\( 'LeftParen)
(cons #\) 'RightParen)
(cons #\{ 'LeftBrace)
(cons #\} 'RightBrace)
(cons #\; 'Semicolon)
(cons #\, 'Comma)
(cons #\* 'Op_multiply)
(cons #\/ 'Op_divide)
(cons #\% 'Op_mod)
(cons #\+ 'Op_add)
(cons #\- 'Op_subtract)))
 
(define *keywords* (list (cons 'if 'Keyword_if)
(cons 'else 'Keyword_else)
(cons 'while 'Keyword_while)
(cons 'print 'Keyword_print)
(cons 'putc 'Keyword_putc)))
 
;; return list of tokens from current port
(define (read-tokens)
; information on position in input
(define line 1)
(define col 0)
(define next-char #f)
; get char, updating line/col posn
(define (get-next-char)
(if (char? next-char) ; check for returned character
(let ((c next-char))
(set! next-char #f)
c)
(let ((c (read-char)))
(cond ((and (not (eof-object? c))
(char=? c #\newline))
(set! col 0)
(set! line (+ 1 line))
(get-next-char))
(else
(set! col (+ 1 col))
c)))))
(define (push-char c)
(set! next-char c))
; step over any whitespace or comments
(define (skip-whitespace+comment)
(let loop ()
(let ((c (get-next-char)))
(cond ((eof-object? c)
'())
((char-whitespace? c) ; ignore whitespace
(loop))
((char=? c #\/) ; check for comments
(if (char=? (peek-char) #\*) ; found start of comment
(begin ; eat comment
(get-next-char)
(let m ((c (get-next-char)))
(cond ((eof-object? c)
(error "End of file in comment"))
((and (char=? c #\*)
(char=? (peek-char) #\/))
(get-next-char)) ; eat / and end
(else
(m (get-next-char)))))
(loop)) ; continue looking for whitespace / more comments
(push-char #\/))) ; not comment, so put / back and return
(else ; return to stream, as not a comment or space char
(push-char c))))))
; read next token from input
(define (next-token)
(define (read-string) ; returns string value along with " " marks
(let loop ((chars '(#\"))) ; " (needed to appease Rosetta code's highlighter)
(cond ((eof-object? (peek-char))
(error "End of file while scanning string literal."))
((char=? (peek-char) #\newline)
(error "End of line while scanning string literal."))
((char=? (peek-char) #\") ; "
(get-next-char) ; consume the final quote
(list->string (reverse (cons #\" chars)))) ; " highlighter)
(else
(loop (cons (get-next-char) chars))))))
(define (read-identifier initial-c) ; returns identifier as a Scheme symbol
(do ((chars (list initial-c) (cons c chars))
(c (get-next-char) (get-next-char)))
((or (eof-object? c) ; finish when hit end of file
(not (or (char-numeric? c) ; or a character not permitted in an identifier
(char-alphabetic? c)
(char=? c #\_))))
(push-char c) ; return last character to stream
(string->symbol (list->string (reverse chars))))))
(define (read-number initial-c) ; returns integer read as a Scheme integer
(let loop ((res (digit-value initial-c))
(c (get-next-char)))
(cond ((char-alphabetic? c)
(error "Invalid number - ends in alphabetic chars"))
((char-numeric? c)
(loop (+ (* res 10) (digit-value c))
(get-next-char)))
(else
(push-char c) ; return non-number to stream
res))))
; select op symbol based on whether there is a following = sign
(define (check-eq-extend start-line start-col opeq op)
(if (char=? (peek-char) #\=)
(begin (get-next-char) ; consume it
(list start-line start-col opeq))
(list start-line start-col op)))
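; e.g. after reading "<", a following "=" yields Op_lessequal, otherwise Op_less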
;
(let* ((start-line line) ; save start position of tokens
(start-col col)
(c (get-next-char)))
(cond ((eof-object? c)
(list start-line start-col 'End_of_input))
((char-alphabetic? c) ; read an identifier
(let ((id (read-identifier c)))
(if (assq id *keywords*) ; check if identifier is a keyword
(list start-line start-col (cdr (assq id *keywords*)))
(list start-line start-col 'Identifier id))))
((char-numeric? c) ; read a number
(list start-line start-col 'Integer (read-number c)))
(else
(case c
((#\( #\) #\{ #\} #\; #\, #\* #\/ #\% #\+ #\-)
(list start-line start-col (cdr (assq c *symbols*))))
((#\<)
(check-eq-extend start-line start-col 'Op_lessequal 'Op_less))
((#\>)
(check-eq-extend start-line start-col 'Op_greaterequal 'Op_greater))
((#\=)
(check-eq-extend start-line start-col 'Op_equal 'Op_assign))
((#\!)
(check-eq-extend start-line start-col 'Op_notequal 'Op_not))
((#\& #\|)
(if (char=? (peek-char) c) ; looks for && or ||
(begin (get-next-char) ; consume second character if valid
(list start-line start-col
(if (char=? c #\&) 'Op_and 'Op_or)))
(error "Unrecognised character - expected && or ||")))
((#\") ; "
(list start-line start-col 'String (read-string)))
((#\')
(let* ((c1 (get-next-char))
(c2 (get-next-char)))
(cond ((or (eof-object? c1)
(eof-object? c2))
(error "Incomplete character constant"))
((char=? c1 #\')
(error "Empty character constant"))
((and (char=? c2 #\') ; case of single character
(not (char=? c1 #\\)))
(list start-line start-col 'Integer (char->integer c1)))
((and (char=? c1 #\\) ; case of escaped character
(char=? (peek-char) #\'))
(get-next-char) ; consume the ending '
(cond ((char=? c2 #\n)
(list start-line start-col 'Integer 10))
((char=? c2 #\\)
(list start-line start-col 'Integer (char->integer c2)))
(else
(error "Unknown escape sequence"))))
(else
(error "Multi-character constant")))))
(else
(error "Unrecognised character")))))))
;
(let loop ((tokens '())) ; loop, ignoring space/comments, while reading tokens
(skip-whitespace+comment)
(let ((tok (next-token)))
(if (eof-object? (peek-char)) ; check if at end of input
(reverse (cons tok tokens))
(loop (cons tok tokens))))))
 
(define (lexer filename)
(with-input-from-file filename
(lambda () (read-tokens))))
 
;; output tokens to stdout, tab separated
;; line number, column number, token type, optional value
(define (display-tokens tokens)
(for-each
(lambda (token)
(display (list-ref token 0))
(display #\tab) (display (list-ref token 1))
(display #\tab) (display (list-ref token 2))
(when (= 4 (length token))
(display #\tab) (display (list-ref token 3)))
(newline))
tokens))
 
;; read from filename passed on command line
(if (= 2 (length (command-line)))
(display-tokens (lexer (cadr (command-line))))
(display "Error: provide program filename\n"))
</syntaxhighlight>
 
{{out}}
Output is shown for the "hello.c" example. Tested against all programs in [[Compiler/Sample programs]].
 
<pre>4 1 Keyword_print
4 6 LeftParen
4 7 String "Hello, World!\n"
4 24 RightParen
4 25 Semicolon
5 1 End_of_input
</pre>
 
=={{header|Standard ML}}==
{{trans|ATS}}
{{trans|OCaml}}
 
 
<syntaxhighlight lang="sml">(*------------------------------------------------------------------*)
(* The Rosetta Code lexical analyzer, in Standard ML. Based on the ATS
and the OCaml. The intended compiler is Mlton or Poly/ML; there is
a tiny difference near the end of the file, depending on which
compiler is used. *)
 
(*------------------------------------------------------------------*)
(* The following functions are compatible with ASCII. *)
 
fun
is_digit ichar =
48 <= ichar andalso ichar <= 57
 
fun
is_lower ichar =
97 <= ichar andalso ichar <= 122
 
fun
is_upper ichar =
65 <= ichar andalso ichar <= 90
 
fun
is_alpha ichar =
is_lower ichar orelse is_upper ichar
 
fun
is_alnum ichar =
is_digit ichar orelse is_alpha ichar
 
fun
is_ident_start ichar =
is_alpha ichar orelse ichar = 95
 
fun
is_ident_continuation ichar =
is_alnum ichar orelse ichar = 95
 
fun
is_space ichar =
ichar = 32 orelse (9 <= ichar andalso ichar <= 13)
 
(*------------------------------------------------------------------*)
(* Character input more like that of C. There are various advantages
and disadvantages to this method, but key points in its favor are:
(a) it is how character input is done in the original ATS code, (b)
Unicode code points are 21-bit positive integers. *)
 
val eof = ~1
 
fun
input_ichar inpf =
case TextIO.input1 inpf of
NONE => eof
| SOME c => Char.ord c
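(* Illustration: on the ASCII input "ab", successive calls to input_ichar
   return 97, then 98, then ~1 (that is, eof). *)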
 
(*------------------------------------------------------------------*)
 
(* The type of an input character. *)
 
structure Ch =
struct
 
type t = {
ichar : int,
line_no : int,
column_no : int
}
 
end
 
(*------------------------------------------------------------------*)
(* Inputting with unlimited pushback, and with counting of lines and
columns. *)
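(* Illustration: reading a character and immediately pushing it back leaves
   the stream effectively unchanged, so
     val (ch, inp') = Inp.get_ch inp
     val inp'' = Inp.push_back_ch (ch, inp')
   means the next Inp.get_ch on inp'' yields ch again, with its original
   line and column numbers. *)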
 
structure Inp =
struct
 
type t = {
inpf : TextIO.instream,
pushback : Ch.t list,
line_no : int,
column_no : int
}
 
fun
of_instream inpf =
{
inpf = inpf,
pushback = [],
line_no = 1,
column_no = 1
} : t
 
fun
get_ch ({ inpf = inpf,
pushback = pushback,
line_no = line_no,
column_no = column_no } : t) =
case pushback of
ch :: tail =>
let
val inp = { inpf = inpf,
pushback = tail,
line_no = line_no,
column_no = column_no }
in
(ch, inp)
end
| [] =>
let
val ichar = input_ichar inpf
val ch = { ichar = ichar,
line_no = line_no,
column_no = column_no }
in
if ichar = Char.ord #"\n" then
let
val inp = { inpf = inpf,
pushback = [],
line_no = line_no + 1,
column_no = 1 }
in
(ch, inp)
end
else
let
val inp = { inpf = inpf,
pushback = [],
line_no = line_no,
column_no = column_no + 1 }
in
(ch, inp)
end
end
 
fun
push_back_ch (ch, inp : t) =
{
inpf = #inpf inp,
pushback = ch :: #pushback inp,
line_no = #line_no inp,
column_no = #column_no inp
}
 
end
 
(*------------------------------------------------------------------*)
(* Tokens, appearing in tuples with arguments, and with line and
column numbers. The tokens are integers, so they can be used as
array indices. *)
 
val token_ELSE = 0
val token_IF = 1
val token_PRINT = 2
val token_PUTC = 3
val token_WHILE = 4
val token_MULTIPLY = 5
val token_DIVIDE = 6
val token_MOD = 7
val token_ADD = 8
val token_SUBTRACT = 9
val token_NEGATE = 10
val token_LESS = 11
val token_LESSEQUAL = 12
val token_GREATER = 13
val token_GREATEREQUAL = 14
val token_EQUAL = 15
val token_NOTEQUAL = 16
val token_NOT = 17
val token_ASSIGN = 18
val token_AND = 19
val token_OR = 20
val token_LEFTPAREN = 21
val token_RIGHTPAREN = 22
val token_LEFTBRACE = 23
val token_RIGHTBRACE = 24
val token_SEMICOLON = 25
val token_COMMA = 26
val token_IDENTIFIER = 27
val token_INTEGER = 28
val token_STRING = 29
val token_END_OF_INPUT = 30
 
(* A *very* simple perfect hash for the reserved words. (Yes, this is
overkill, except for demonstration of the principle.) *)
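(* As a worked example (assuming ASCII codes), the hash used below is
   (ord of first char + ord of second char) mod 9:
     "if"    -> (105 + 102) mod 9 = 0
     "print" -> (112 + 114) mod 9 = 1
     "else"  -> (101 + 108) mod 9 = 2
     "putc"  -> (112 + 117) mod 9 = 4
     "while" -> (119 + 104) mod 9 = 7
   These are exactly the occupied slots of the two vectors below; every
   other slot maps back to token_IDENTIFIER. *)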
 
val reserved_words =
Vector.fromList ["if", "print", "else",
"", "putc", "",
"", "while", ""]
val reserved_word_tokens =
Vector.fromList [token_IF, token_PRINT, token_ELSE,
token_IDENTIFIER, token_PUTC, token_IDENTIFIER,
token_IDENTIFIER, token_WHILE, token_IDENTIFIER]
 
fun
reserved_word_lookup (s, line_no, column_no) =
if (String.size s) < 2 then
(token_IDENTIFIER, s, line_no, column_no)
else
let
val hashval =
(Char.ord (String.sub (s, 0)) +
Char.ord (String.sub (s, 1)))
mod 9
val token = Vector.sub (reserved_word_tokens, hashval)
in
if token = token_IDENTIFIER orelse
s <> Vector.sub (reserved_words, hashval) then
(token_IDENTIFIER, s, line_no, column_no)
else
(token, s, line_no, column_no)
end
 
(* Token to string lookup. *)
 
val token_names =
Vector.fromList
["Keyword_else",
"Keyword_if",
"Keyword_print",
"Keyword_putc",
"Keyword_while",
"Op_multiply",
"Op_divide",
"Op_mod",
"Op_add",
"Op_subtract",
"Op_negate",
"Op_less",
"Op_lessequal",
"Op_greater",
"Op_greaterequal",
"Op_equal",
"Op_notequal",
"Op_not",
"Op_assign",
"Op_and",
"Op_or",
"LeftParen",
"RightParen",
"LeftBrace",
"RightBrace",
"Semicolon",
"Comma",
"Identifier",
"Integer",
"String",
"End_of_input"]
 
fun
token_name token =
Vector.sub (token_names, token)
 
(*------------------------------------------------------------------*)
 
exception Unterminated_comment of int * int
exception Unterminated_character_literal of int * int
exception Multicharacter_literal of int * int
exception End_of_input_in_string_literal of int * int
exception End_of_line_in_string_literal of int * int
exception Unsupported_escape of int * int * char
exception Invalid_integer_literal of int * int * string
exception Unexpected_character of int * int * char
 
(*------------------------------------------------------------------*)
(* Skipping past spaces and comments. (In the Rosetta Code tiny
language, a comment, if you think about it, is a kind of space.) *)
 
fun
scan_comment (inp, line_no, column_no) =
let
fun
loop inp =
let
val (ch, inp) = Inp.get_ch inp
in
if #ichar ch = eof then
raise Unterminated_comment (line_no, column_no)
else if #ichar ch = Char.ord #"*" then
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = eof then
raise Unterminated_comment (line_no, column_no)
else if #ichar ch1 = Char.ord #"/" then
inp
else
loop inp
end
else
loop inp
end
in
loop inp
end
 
fun
skip_spaces_and_comments inp =
let
fun
loop inp =
let
val (ch, inp) = Inp.get_ch inp
in
if is_space (#ichar ch) then
loop inp
else if #ichar ch = Char.ord #"/" then
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"*" then
loop (scan_comment (inp, #line_no ch, #column_no ch))
else
let
val inp = Inp.push_back_ch (ch1, inp)
val inp = Inp.push_back_ch (ch, inp)
in
inp
end
end
else
Inp.push_back_ch (ch, inp)
end
in
loop inp
end
 
(*------------------------------------------------------------------*)
(* Integer literals, identifiers, and reserved words. *)
 
fun
scan_word (lst, inp) =
let
val (ch, inp) = Inp.get_ch inp
in
if is_ident_continuation (#ichar ch) then
scan_word (Char.chr (#ichar ch) :: lst, inp)
else
(lst, Inp.push_back_ch (ch, inp))
end
 
fun
scan_integer_literal inp =
let
val (ch, inp) = Inp.get_ch inp
val (lst, inp) = scan_word ([Char.chr (#ichar ch)], inp)
val s = String.implode (List.rev lst)
in
if List.all (fn c => is_digit (Char.ord c)) lst then
((token_INTEGER, s, #line_no ch, #column_no ch), inp)
else
raise Invalid_integer_literal (#line_no ch, #column_no ch, s)
end
 
fun
scan_identifier_or_reserved_word inp =
let
val (ch, inp) = Inp.get_ch inp
val (lst, inp) = scan_word ([Char.chr (#ichar ch)], inp)
val s = String.implode (List.rev lst)
val toktup = reserved_word_lookup (s, #line_no ch, #column_no ch)
in
(toktup, inp)
end
 
(*------------------------------------------------------------------*)
(* String literals. *)
 
fun
scan_string_literal inp =
let
val (ch, inp) = Inp.get_ch inp
 
fun
scan (lst, inp) =
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = eof then
raise End_of_input_in_string_literal
(#line_no ch, #column_no ch)
else if #ichar ch1 = Char.ord #"\n" then
raise End_of_line_in_string_literal
(#line_no ch, #column_no ch)
else if #ichar ch1 = Char.ord #"\"" then
(lst, inp)
else if #ichar ch1 <> Char.ord #"\\" then
scan (Char.chr (#ichar ch1) :: lst, inp)
else
let
val (ch2, inp) = Inp.get_ch inp
in
if #ichar ch2 = Char.ord #"n" then
scan (#"n" :: #"\\" :: lst, inp)
else if #ichar ch2 = Char.ord #"\\" then
scan (#"\\" :: #"\\" :: lst, inp)
else if #ichar ch2 = eof then
raise End_of_input_in_string_literal
(#line_no ch, #column_no ch)
else if #ichar ch2 = Char.ord #"\n" then
raise End_of_line_in_string_literal
(#line_no ch, #column_no ch)
else
raise Unsupported_escape (#line_no ch1, #column_no ch1,
Char.chr (#ichar ch2))
end
end
 
val lst = #"\"" :: []
val (lst, inp) = scan (lst, inp)
val lst = #"\"" :: lst
val s = String.implode (List.rev lst)
in
((token_STRING, s, #line_no ch, #column_no ch), inp)
end
 
(*------------------------------------------------------------------*)
(* Character literals. *)
 
fun
scan_character_literal_without_checking_end inp =
let
val (ch, inp) = Inp.get_ch inp
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = eof then
raise Unterminated_character_literal
(#line_no ch, #column_no ch)
else if #ichar ch1 = Char.ord #"\\" then
let
val (ch2, inp) = Inp.get_ch inp
in
if #ichar ch2 = eof then
raise Unterminated_character_literal
(#line_no ch, #column_no ch)
else if #ichar ch2 = Char.ord #"n" then
let
val s = Int.toString (Char.ord #"\n")
in
((token_INTEGER, s, #line_no ch, #column_no ch), inp)
end
else if #ichar ch2 = Char.ord #"\\" then
let
val s = Int.toString (Char.ord #"\\")
in
((token_INTEGER, s, #line_no ch, #column_no ch), inp)
end
else
raise Unsupported_escape (#line_no ch1, #column_no ch1,
Char.chr (#ichar ch2))
end
else
let
val s = Int.toString (#ichar ch1)
in
((token_INTEGER, s, #line_no ch, #column_no ch), inp)
end
end
 
fun
scan_character_literal inp =
let
val (toktup, inp) =
scan_character_literal_without_checking_end inp
val (_, _, line_no, column_no) = toktup
 
fun
check_end inp =
let
val (ch, inp) = Inp.get_ch inp
in
if #ichar ch = Char.ord #"'" then
inp
else
let
fun
loop_to_end (ch1 : Ch.t, inp) =
if #ichar ch1 = eof then
raise Unterminated_character_literal (line_no, column_no)
else if #ichar ch1 = Char.ord #"'" then
raise Multicharacter_literal (line_no, column_no)
else
let
val (ch1, inp) = Inp.get_ch inp
in
loop_to_end (ch1, inp)
end
in
loop_to_end (ch, inp)
end
end
 
val inp = check_end inp
in
(toktup, inp)
end
 
(*------------------------------------------------------------------*)
 
fun
get_next_token inp =
let
val inp = skip_spaces_and_comments inp
val (ch, inp) = Inp.get_ch inp
val ln = #line_no ch
val cn = #column_no ch
in
if #ichar ch = eof then
((token_END_OF_INPUT, "", ln, cn), inp)
else
case Char.chr (#ichar ch) of
#"," => ((token_COMMA, ",", ln, cn), inp)
| #";" => ((token_SEMICOLON, ";", ln, cn), inp)
| #"(" => ((token_LEFTPAREN, "(", ln, cn), inp)
| #")" => ((token_RIGHTPAREN, ")", ln, cn), inp)
| #"{" => ((token_LEFTBRACE, "{", ln, cn), inp)
| #"}" => ((token_RIGHTBRACE, "}", ln, cn), inp)
| #"*" => ((token_MULTIPLY, "*", ln, cn), inp)
| #"/" => ((token_DIVIDE, "/", ln, cn), inp)
| #"%" => ((token_MOD, "%", ln, cn), inp)
| #"+" => ((token_ADD, "+", ln, cn), inp)
| #"-" => ((token_SUBTRACT, "-", ln, cn), inp)
| #"<" =>
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"=" then
((token_LESSEQUAL, "<=", ln, cn), inp)
else
let
val inp = Inp.push_back_ch (ch1, inp)
in
((token_LESS, "<", ln, cn), inp)
end
end
| #">" =>
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"=" then
((token_GREATEREQUAL, ">=", ln, cn), inp)
else
let
val inp = Inp.push_back_ch (ch1, inp)
in
((token_GREATER, ">", ln, cn), inp)
end
end
| #"=" =>
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"=" then
((token_EQUAL, "==", ln, cn), inp)
else
let
val inp = Inp.push_back_ch (ch1, inp)
in
((token_ASSIGN, "=", ln, cn), inp)
end
end
| #"!" =>
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"=" then
((token_NOTEQUAL, "!=", ln, cn), inp)
else
let
val inp = Inp.push_back_ch (ch1, inp)
in
((token_NOT, "!", ln, cn), inp)
end
end
| #"&" =>
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"&" then
((token_AND, "&&", ln, cn), inp)
else
raise Unexpected_character (#line_no ch, #column_no ch,
Char.chr (#ichar ch))
end
| #"|" =>
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"|" then
((token_OR, "||", ln, cn), inp)
else
raise Unexpected_character (#line_no ch, #column_no ch,
Char.chr (#ichar ch))
end
| #"\"" =>
let
val inp = Inp.push_back_ch (ch, inp)
in
scan_string_literal inp
end
| #"'" =>
let
val inp = Inp.push_back_ch (ch, inp)
in
scan_character_literal inp
end
| _ =>
if is_digit (#ichar ch) then
let
val inp = Inp.push_back_ch (ch, inp)
in
scan_integer_literal inp
end
else if is_ident_start (#ichar ch) then
let
val inp = Inp.push_back_ch (ch, inp)
in
scan_identifier_or_reserved_word inp
end
else
raise Unexpected_character (#line_no ch, #column_no ch,
Char.chr (#ichar ch))
end
 
fun
output_integer_rightjust (outf, num) =
(if num < 10 then
TextIO.output (outf, " ")
else if num < 100 then
TextIO.output (outf, " ")
else if num < 1000 then
TextIO.output (outf, " ")
else if num < 10000 then
TextIO.output (outf, " ")
else
();
TextIO.output (outf, Int.toString num))
 
fun
print_token (outf, toktup) =
let
val (token, arg, line_no, column_no) = toktup
val name = token_name token
val (padding, str) =
if token = token_IDENTIFIER then
(" ", arg)
else if token = token_INTEGER then
(" ", arg)
else if token = token_STRING then
(" ", arg)
else ("", "")
in
output_integer_rightjust (outf, line_no);
TextIO.output (outf, " ");
output_integer_rightjust (outf, column_no);
TextIO.output (outf, " ");
TextIO.output (outf, name);
TextIO.output (outf, padding);
TextIO.output (outf, str);
TextIO.output (outf, "\n")
end
 
fun
scan_text (outf, inp) =
let
fun
loop inp =
let
val (toktup, inp) = get_next_token inp
in
(print_token (outf, toktup);
let
val (token, _, _, _) = toktup
in
if token <> token_END_OF_INPUT then
loop inp
else
()
end)
end
in
loop inp
end
 
(*------------------------------------------------------------------*)
 
fun
main () =
let
val args = CommandLine.arguments ()
val (inpf_filename, outf_filename) =
case args of
[] => ("-", "-")
| name :: [] => (name, "-")
| name1 :: name2 :: _ => (name1, name2)
val inpf =
if inpf_filename = "-" then
TextIO.stdIn
else
TextIO.openIn inpf_filename
handle
(IO.Io _) =>
(TextIO.output (TextIO.stdErr, "Failure opening \"");
TextIO.output (TextIO.stdErr, inpf_filename);
TextIO.output (TextIO.stdErr, "\" for input\n");
OS.Process.exit OS.Process.failure)
val outf =
if outf_filename = "-" then
TextIO.stdOut
else
TextIO.openOut outf_filename
handle
(IO.Io _) =>
(TextIO.output (TextIO.stdErr, "Failure opening \"");
TextIO.output (TextIO.stdErr, outf_filename);
TextIO.output (TextIO.stdErr, "\" for output\n");
OS.Process.exit OS.Process.failure)
val inp = Inp.of_instream inpf
in
scan_text (outf, inp)
end
handle Unterminated_comment (line_no, column_no) =>
(TextIO.output (TextIO.stdErr, ": unterminated comment ");
TextIO.output (TextIO.stdErr, "starting at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| Unterminated_character_literal (line_no, column_no) =>
(TextIO.output (TextIO.stdErr, ": unterminated character ");
TextIO.output (TextIO.stdErr, "literal starting at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| Multicharacter_literal (line_no, column_no) =>
(TextIO.output (TextIO.stdErr, ": unsupported multicharacter");
TextIO.output (TextIO.stdErr, " literal starting at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| End_of_input_in_string_literal (line_no, column_no) =>
(TextIO.output (TextIO.stdErr, ": end of input in string");
TextIO.output (TextIO.stdErr, " literal starting at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| End_of_line_in_string_literal (line_no, column_no) =>
(TextIO.output (TextIO.stdErr, ": end of line in string");
TextIO.output (TextIO.stdErr, " literal starting at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| Unsupported_escape (line_no, column_no, c) =>
(TextIO.output (TextIO.stdErr, CommandLine.name ());
TextIO.output (TextIO.stdErr, ": unsupported escape \\");
TextIO.output (TextIO.stdErr, Char.toString c);
TextIO.output (TextIO.stdErr, " at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| Invalid_integer_literal (line_no, column_no, str) =>
(TextIO.output (TextIO.stdErr, CommandLine.name ());
TextIO.output (TextIO.stdErr, ": invalid integer literal ");
TextIO.output (TextIO.stdErr, str);
TextIO.output (TextIO.stdErr, " at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| Unexpected_character (line_no, column_no, c) =>
(TextIO.output (TextIO.stdErr, CommandLine.name ());
TextIO.output (TextIO.stdErr, ": unexpected character '");
TextIO.output (TextIO.stdErr, Char.toString c);
TextIO.output (TextIO.stdErr, "' at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure);
 
(*------------------------------------------------------------------*)
(* For the Mlton compiler, include the following. For Poly/ML, comment
it out. *)
main ();
 
(*------------------------------------------------------------------*)
(* Instructions for GNU Emacs. *)
 
(* local variables: *)
(* mode: sml *)
(* sml-indent-level: 2 *)
(* sml-indent-args: 2 *)
(* end: *)
(*------------------------------------------------------------------*)</syntaxhighlight>
 
 
{{out}}
For Mlton, compile with
<pre>mlton -output lex lex.sml</pre>
 
For Poly/ML, compile with
<pre>polyc -o lex lex.sml</pre>
 
Mlton is an optimizing whole-program compiler. It may take much longer to compile the source, but it produces much faster executables.
 
Output for testcase3:
<pre> 5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|Wren}}==
{{trans|Go}}
{{libheader|Wren-dynamic}}
{{libheader|Wren-str}}
{{libheader|Wren-fmt}}
{{libheader|Wren-ioutil}}
<syntaxhighlight lang="wren">import "./dynamic" for Enum, Struct, Tuple
import "./str" for Char
import "./fmt" for Fmt
import "./ioutil" for FileUtil
import "os" for Process
 
var tokens = [
"EOI",
"Mul",
"Div",
"Mod",
"Add",
"Sub",
"Negate",
"Not",
"Lss",
"Leq",
"Gtr",
"Geq",
"Eq",
"Neq",
"Assign",
"And",
"Or",
"If",
"Else",
"While",
"Print",
"Putc",
"Lparen",
"Rparen",
"Lbrace",
"Rbrace",
"Semi",
"Comma",
"Ident",
"Integer",
"String"
]
 
var Token = Enum.create("Token", tokens)
 
var TokData = Struct.create("TokData", ["eline", "ecol", "tok", "v"])
 
var Symbol = Tuple.create("Symbol", ["name", "tok"])
 
// symbol table
var symtab = []
 
var curLine = ""
var curCh = ""
var lineNum = 0
var colNum = 0
var etx = 4 // used to signify EOI
 
var lines = []
var lineCount = 0
 
var errorMsg = Fn.new { |eline, ecol, msg| Fiber.abort("(%(eline):%(ecol)) %(msg)") }
 
// add an identifier to the symbol table
var install = Fn.new { |name, tok|
var sym = Symbol.new(name, tok)
symtab.add(sym)
}
 
// search for an identifier in the symbol table
var lookup = Fn.new { |name|
for (i in 0...symtab.count) {
if (symtab[i].name == name) return i
}
return -1
}
 
// read the next line of input from the source file
var nextLine // recursive function
nextLine = Fn.new {
if (lineNum == lineCount) {
curCh = etx
curLine = ""
colNum = 1
return
}
curLine = lines[lineNum]
lineNum = lineNum + 1
colNum = 0
if (curLine == "") nextLine.call() // skip blank lines
}
 
// get the next char
var nextChar = Fn.new {
if (colNum >= curLine.count) nextLine.call()
if (colNum < curLine.count) {
curCh = curLine[colNum]
colNum = colNum + 1
}
}
 
var follow = Fn.new { |eline, ecol, expect, ifyes, ifno|
if (curCh == expect) {
nextChar.call()
return ifyes
}
if (ifno == Token.EOI) {
errorMsg.call(eline, ecol, "follow unrecognized character: " + curCh)
}
return ifno
}
 
var getTok // recursive function
getTok = Fn.new {
// skip whitespace
while (curCh == " " || curCh == "\t" || curCh == "\n") nextChar.call()
var td = TokData.new(lineNum, colNum, 0, "")
if (curCh == etx) {
td.tok = Token.EOI
return td
}
if (curCh == "{") {
td.tok = Token.Lbrace
nextChar.call()
return td
}
if (curCh == "}") {
td.tok = Token.Rbrace
nextChar.call()
return td
}
if (curCh == "(") {
td.tok = Token.Lparen
nextChar.call()
return td
}
if (curCh == ")") {
td.tok = Token.Rparen
nextChar.call()
return td
}
if (curCh == "+") {
td.tok = Token.Add
nextChar.call()
return td
}
if (curCh == "-") {
td.tok = Token.Sub
nextChar.call()
return td
}
if (curCh == "*") {
td.tok = Token.Mul
nextChar.call()
return td
}
if (curCh == "\%") {
td.tok = Token.Mod
nextChar.call()
return td
}
if (curCh == ";") {
td.tok = Token.Semi
nextChar.call()
return td
}
if (curCh == ",") {
td.tok = Token.Comma
nextChar.call()
return td
}
if (curCh == "'") { // single char literals
nextChar.call()
td.v = curCh.bytes[0].toString
if (curCh == "'") {
errorMsg.call(td.eline, td.ecol, "Empty character constant")
}
if (curCh == "\\") {
nextChar.call()
if (curCh == "n") {
td.v = "10"
} else if (curCh == "\\") {
td.v = "92"
} else {
errorMsg.call(td.eline, td.ecol, "unknown escape sequence: "+ curCh)
}
}
nextChar.call()
if (curCh != "'") {
errorMsg.call(td.eline, td.ecol, "multi-character constant")
}
nextChar.call()
td.tok = Token.Integer
return td
}
if (curCh == "<") {
nextChar.call()
td.tok = follow.call(td.eline, td.ecol, "=", Token.Leq, Token.Lss)
return td
}
if (curCh == ">") {
nextChar.call()
td.tok = follow.call(td.eline, td.ecol, "=", Token.Geq, Token.Gtr)
return td
}
if (curCh == "!") {
nextChar.call()
td.tok = follow.call(td.eline, td.ecol, "=", Token.Neq, Token.Not)
return td
}
if (curCh == "=") {
nextChar.call()
td.tok = follow.call(td.eline, td.ecol, "=", Token.Eq, Token.Assign)
return td
}
if (curCh == "&") {
nextChar.call()
td.tok = follow.call(td.eline, td.ecol, "&", Token.And, Token.EOI)
return td
}
if (curCh == "|") {
nextChar.call()
td.tok = follow.call(td.eline, td.ecol, "|", Token.Or, Token.EOI)
return td
}
if (curCh == "\"") { // string
td.v = curCh
nextChar.call()
while (curCh != "\"") {
if (curCh == "\n") {
errorMsg.call(td.eline, td.ecol, "EOL in string")
}
if (curCh == etx) {
errorMsg.call(td.eline, td.ecol, "EOF in string")
}
td.v = td.v + curCh
nextChar.call()
}
td.v = td.v + curCh
nextChar.call()
td.tok = Token.String
return td
}
if (curCh == "/") { // div or comment
nextChar.call()
if (curCh != "*") {
td.tok = Token.Div
return td
}
// skip comments
nextChar.call()
while (true) {
if (curCh == "*") {
nextChar.call()
if (curCh == "/") {
nextChar.call()
return getTok.call()
}
} else if (curCh == etx) {
errorMsg.call(td.eline, td.ecol, "EOF in comment")
} else {
nextChar.call()
}
}
}
// integers or identifiers
var isNumber = Char.isDigit(curCh)
td.v = ""
while (Char.isAsciiAlphaNum(curCh) || curCh == "_") {
if (!Char.isDigit(curCh)) isNumber = false
td.v = td.v + curCh
nextChar.call()
}
if (td.v.count == 0) {
errorMsg.call(td.eline, td.ecol, "unknown character: " + curCh)
}
if (Char.isDigit(td.v[0])) {
if (!isNumber) {
errorMsg.call(td.eline, td.ecol, "invalid number: " + curCh)
}
td.tok = Token.Integer
return td
}
var index = lookup.call(td.v)
td.tok = (index == -1) ? Token.Ident : symtab[index].tok
return td
}
 
var initLex = Fn.new {
install.call("else", Token.Else)
install.call("if", Token.If)
install.call("print", Token.Print)
install.call("putc", Token.Putc)
install.call("while", Token.While)
nextChar.call()
}
 
var process = Fn.new {
var tokMap = {}
tokMap[Token.EOI] = "End_of_input"
tokMap[Token.Mul] = "Op_multiply"
tokMap[Token.Div] = "Op_divide"
tokMap[Token.Mod] = "Op_mod"
tokMap[Token.Add] = "Op_add"
tokMap[Token.Sub] = "Op_subtract"
tokMap[Token.Negate] = "Op_negate"
tokMap[Token.Not] = "Op_not"
tokMap[Token.Lss] = "Op_less"
tokMap[Token.Leq] = "Op_lessequal"
tokMap[Token.Gtr] = "Op_greater"
tokMap[Token.Geq] = "Op_greaterequal"
tokMap[Token.Eq] = "Op_equal"
tokMap[Token.Neq] = "Op_notequal"
tokMap[Token.Assign] = "Op_assign"
tokMap[Token.And] = "Op_and"
tokMap[Token.Or] = "Op_or"
tokMap[Token.If] = "Keyword_if"
tokMap[Token.Else] = "Keyword_else"
tokMap[Token.While] = "Keyword_while"
tokMap[Token.Print] = "Keyword_print"
tokMap[Token.Putc] = "Keyword_putc"
tokMap[Token.Lparen] = "LeftParen"
tokMap[Token.Rparen] = "RightParen"
tokMap[Token.Lbrace] = "LeftBrace"
tokMap[Token.Rbrace] = "RightBrace"
tokMap[Token.Semi] = "Semicolon"
tokMap[Token.Comma] = "Comma"
tokMap[Token.Ident] = "Identifier"
tokMap[Token.Integer] = "Integer"
tokMap[Token.String] = "String"
 
while (true) {
var td = getTok.call()
Fmt.write("$5d $5d $-16s", td.eline, td.ecol, tokMap[td.tok])
if (td.tok == Token.Integer || td.tok == Token.Ident || td.tok == Token.String) {
System.print(td.v)
} else {
System.print()
}
if (td.tok == Token.EOI) return
}
}
 
var args = Process.arguments
if (args.count == 0) {
System.print("Filename required")
return
}
 
lines = FileUtil.readLines(args[0])
lineCount = lines.count
initLex.call()
process.call()</syntaxhighlight>
 
{{out}}
For test case 3:
<pre>
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input
</pre>
 
=={{header|Zig}}==
<syntaxhighlight lang="zig">
const std = @import("std");
 
pub const TokenType = enum {
unknown,
multiply,
divide,
mod,
add,
subtract,
negate,
less,
less_equal,
greater,
greater_equal,
equal,
not_equal,
not,
assign,
bool_and,
bool_or,
left_paren,
right_paren,
left_brace,
right_brace,
semicolon,
comma,
kw_if,
kw_else,
kw_while,
kw_print,
kw_putc,
identifier,
integer,
string,
eof,
 
// A more efficient implementation could use `std.enums.directEnumArray`.
pub fn toString(self: @This()) []const u8 {
return switch (self) {
.unknown => "UNKNOWN",
.multiply => "Op_multiply",
.divide => "Op_divide",
.mod => "Op_mod",
.add => "Op_add",
.subtract => "Op_subtract",
.negate => "Op_negate",
.less => "Op_less",
.less_equal => "Op_lessequal",
.greater => "Op_greater",
.greater_equal => "Op_greaterequal",
.equal => "Op_equal",
.not_equal => "Op_notequal",
.not => "Op_not",
.assign => "Op_assign",
.bool_and => "Op_and",
.bool_or => "Op_or",
.left_paren => "LeftParen",
.right_paren => "RightParen",
.left_brace => "LeftBrace",
.right_brace => "RightBrace",
.semicolon => "Semicolon",
.comma => "Comma",
.kw_if => "Keyword_if",
.kw_else => "Keyword_else",
.kw_while => "Keyword_while",
.kw_print => "Keyword_print",
.kw_putc => "Keyword_putc",
.identifier => "Identifier",
.integer => "Integer",
.string => "String",
.eof => "End_of_input",
};
}
};
 
pub const TokenValue = union(enum) {
intlit: i32,
string: []const u8,
};
 
pub const Token = struct {
line: usize,
col: usize,
typ: TokenType = .unknown,
value: ?TokenValue = null,
};
 
// Error conditions described in the task.
pub const LexerError = error{
EmptyCharacterConstant,
UnknownEscapeSequence,
MulticharacterConstant,
EndOfFileInComment,
EndOfFileInString,
EndOfLineInString,
UnrecognizedCharacter,
InvalidNumber,
};
 
pub const Lexer = struct {
content: []const u8,
line: usize,
col: usize,
offset: usize,
start: bool,
 
const Self = @This();
 
pub fn init(content: []const u8) Lexer {
return Lexer{
.content = content,
.line = 1,
.col = 1,
.offset = 0,
.start = true,
};
}
 
pub fn buildToken(self: Self) Token {
return Token{ .line = self.line, .col = self.col };
}
 
pub fn buildTokenT(self: Self, typ: TokenType) Token {
return Token{ .line = self.line, .col = self.col, .typ = typ };
}
 
pub fn curr(self: Self) u8 {
return self.content[self.offset];
}
 
// An alternative implementation would be to return a `Token` value from `next()`,
// which is arguably more idiomatic.
pub fn next(self: *Self) ?u8 {
// We use `start` so that the very first invocation of `next()` returns the
// very first character. It should be possible to avoid this variable.
if (self.start) {
self.start = false;
} else {
const newline = self.curr() == '\n';
self.offset += 1;
if (newline) {
self.col = 1;
self.line += 1;
} else {
self.col += 1;
}
}
if (self.offset >= self.content.len) {
return null;
} else {
return self.curr();
}
}
 
pub fn peek(self: Self) ?u8 {
if (self.offset + 1 >= self.content.len) {
return null;
} else {
return self.content[self.offset + 1];
}
}
 
fn divOrComment(self: *Self) LexerError!?Token {
var result = self.buildToken();
if (self.peek()) |peek_ch| {
if (peek_ch == '*') {
_ = self.next(); // peeked character
while (self.next()) |ch| {
if (ch == '*') {
if (self.peek()) |next_ch| {
if (next_ch == '/') {
_ = self.next(); // peeked character
return null;
}
}
}
}
return LexerError.EndOfFileInComment;
}
}
result.typ = .divide;
return result;
}
 
fn identifierOrKeyword(self: *Self) !Token {
var result = self.buildToken();
const init_offset = self.offset;
while (self.peek()) |ch| : (_ = self.next()) {
switch (ch) {
'_', 'a'...'z', 'A'...'Z', '0'...'9' => {},
else => break,
}
}
const final_offset = self.offset + 1;
 
if (std.mem.eql(u8, self.content[init_offset..final_offset], "if")) {
result.typ = .kw_if;
} else if (std.mem.eql(u8, self.content[init_offset..final_offset], "else")) {
result.typ = .kw_else;
} else if (std.mem.eql(u8, self.content[init_offset..final_offset], "while")) {
result.typ = .kw_while;
} else if (std.mem.eql(u8, self.content[init_offset..final_offset], "print")) {
result.typ = .kw_print;
} else if (std.mem.eql(u8, self.content[init_offset..final_offset], "putc")) {
result.typ = .kw_putc;
} else {
result.typ = .identifier;
result.value = TokenValue{ .string = self.content[init_offset..final_offset] };
}
 
return result;
}
 
fn string(self: *Self) !Token {
var result = self.buildToken();
result.typ = .string;
const init_offset = self.offset;
while (self.next()) |ch| {
switch (ch) {
'"' => break,
'\n' => return LexerError.EndOfLineInString,
'\\' => {
switch (self.peek() orelse return LexerError.EndOfFileInString) {
'n', '\\' => _ = self.next(), // peeked character
else => return LexerError.UnknownEscapeSequence,
}
},
else => {},
}
} else {
return LexerError.EndOfFileInString;
}
const final_offset = self.offset + 1;
result.value = TokenValue{ .string = self.content[init_offset..final_offset] };
return result;
}
 
/// Choose either `ifyes` or `ifno` token type depending on whether the peeked
/// character is `by`.
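/// For example, followed('=', .less_equal, .less) classifies "<=" as .less_equal
/// and a bare "<" as .less (this mirrors how it is called from lex below).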
fn followed(self: *Self, by: u8, ifyes: TokenType, ifno: TokenType) Token {
var result = self.buildToken();
if (self.peek()) |ch| {
if (ch == by) {
_ = self.next(); // peeked character
result.typ = ifyes;
} else {
result.typ = ifno;
}
} else {
result.typ = ifno;
}
return result;
}
 
/// Return a token of type `typ` if the next character is `by`; otherwise raise an error.
fn consecutive(self: *Self, by: u8, typ: TokenType) LexerError!Token {
const result = self.buildTokenT(typ);
if (self.peek()) |ch| {
if (ch == by) {
_ = self.next(); // peeked character
return result;
} else {
return LexerError.UnrecognizedCharacter;
}
} else {
return LexerError.UnrecognizedCharacter;
}
}
 
fn integerLiteral(self: *Self) LexerError!Token {
var result = self.buildTokenT(.integer);
const init_offset = self.offset;
while (self.peek()) |ch| {
switch (ch) {
'0'...'9' => _ = self.next(), // peeked character
'_', 'a'...'z', 'A'...'Z' => return LexerError.InvalidNumber,
else => break,
}
}
const final_offset = self.offset + 1;
result.value = TokenValue{
.intlit = std.fmt.parseInt(i32, self.content[init_offset..final_offset], 10) catch {
return LexerError.InvalidNumber;
},
};
return result;
}
 
// This illustrates nicely how Zig lets us remove boilerplate without losing
// any error-completeness guarantees.
fn nextOrEmpty(self: *Self) LexerError!u8 {
return self.next() orelse LexerError.EmptyCharacterConstant;
}
 
fn integerChar(self: *Self) LexerError!Token {
var result = self.buildTokenT(.integer);
switch (try self.nextOrEmpty()) {
'\'', '\n' => return LexerError.EmptyCharacterConstant,
'\\' => {
switch (try self.nextOrEmpty()) {
'n' => result.value = TokenValue{ .intlit = '\n' },
'\\' => result.value = TokenValue{ .intlit = '\\' },
else => return LexerError.EmptyCharacterConstant,
}
switch (try self.nextOrEmpty()) {
'\'' => {},
else => return LexerError.MulticharacterConstant,
}
},
else => {
result.value = TokenValue{ .intlit = self.curr() };
switch (try self.nextOrEmpty()) {
'\'' => {},
else => return LexerError.MulticharacterConstant,
}
},
}
return result;
}
};
 
pub fn lex(allocator: std.mem.Allocator, content: []u8) !std.ArrayList(Token) {
var tokens = std.ArrayList(Token).init(allocator);
var lexer = Lexer.init(content);
while (lexer.next()) |ch| {
switch (ch) {
' ' => {},
'*' => try tokens.append(lexer.buildTokenT(.multiply)),
'%' => try tokens.append(lexer.buildTokenT(.mod)),
'+' => try tokens.append(lexer.buildTokenT(.add)),
'-' => try tokens.append(lexer.buildTokenT(.subtract)),
'<' => try tokens.append(lexer.followed('=', .less_equal, .less)),
'>' => try tokens.append(lexer.followed('=', .greater_equal, .greater)),
'=' => try tokens.append(lexer.followed('=', .equal, .assign)),
'!' => try tokens.append(lexer.followed('=', .not_equal, .not)),
'(' => try tokens.append(lexer.buildTokenT(.left_paren)),
')' => try tokens.append(lexer.buildTokenT(.right_paren)),
'{' => try tokens.append(lexer.buildTokenT(.left_brace)),
'}' => try tokens.append(lexer.buildTokenT(.right_brace)),
';' => try tokens.append(lexer.buildTokenT(.semicolon)),
',' => try tokens.append(lexer.buildTokenT(.comma)),
'&' => try tokens.append(try lexer.consecutive('&', .bool_and)),
'|' => try tokens.append(try lexer.consecutive('|', .bool_or)),
'/' => {
if (try lexer.divOrComment()) |token| try tokens.append(token);
},
'_', 'a'...'z', 'A'...'Z' => try tokens.append(try lexer.identifierOrKeyword()),
'"' => try tokens.append(try lexer.string()),
'0'...'9' => try tokens.append(try lexer.integerLiteral()),
'\'' => try tokens.append(try lexer.integerChar()),
else => {},
}
}
try tokens.append(lexer.buildTokenT(.eof));
 
return tokens;
}
 
pub fn main() !void {
var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
defer arena.deinit();
const allocator = arena.allocator();
 
var arg_it = std.process.args();
_ = try arg_it.next(allocator) orelse unreachable; // program name
const file_name = arg_it.next(allocator);
// We accept both files and standard input.
var file_handle = blk: {
if (file_name) |file_name_delimited| {
const fname: []const u8 = try file_name_delimited;
break :blk try std.fs.cwd().openFile(fname, .{});
} else {
break :blk std.io.getStdIn();
}
};
defer file_handle.close();
const input_content = try file_handle.readToEndAlloc(allocator, std.math.maxInt(usize));
 
const tokens = try lex(allocator, input_content);
const pretty_output = try tokenListToString(allocator, tokens);
_ = try std.io.getStdOut().write(pretty_output);
}
 
fn tokenListToString(allocator: std.mem.Allocator, token_list: std.ArrayList(Token)) ![]u8 {
var result = std.ArrayList(u8).init(allocator);
var w = result.writer();
for (token_list.items) |token| {
const common_args = .{ token.line, token.col, token.typ.toString() };
if (token.value) |value| {
const init_fmt = "{d:>5}{d:>7} {s:<15}";
switch (value) {
.string => |str| _ = try w.write(try std.fmt.allocPrint(
allocator,
init_fmt ++ "{s}\n",
common_args ++ .{str},
)),
.intlit => |i| _ = try w.write(try std.fmt.allocPrint(
allocator,
init_fmt ++ "{d}\n",
common_args ++ .{i},
)),
}
} else {
_ = try w.write(try std.fmt.allocPrint(allocator, "{d:>5}{d:>7} {s}\n", common_args));
}
}
return result.items;
}
</syntaxhighlight>