Compiler/lexical analyzer: Difference between revisions

Compiler/lexical analyzer (view source)

Revision as of 15:54, 20 November 2023

36,896 bytes added, 6 months ago

m

→‎{{header|Wren}}: Minor tidy

PureFox

9,482

edits

Revision as of 17:50, 24 April 2022 (view source) Chemoelectric (talk \| contribs) (→‎{{header\|Scala}}) ← Older edit		Revision as of 15:54, 20 November 2023 (view source) PureFox (talk \| contribs) m (→‎{{header\|Wren}}: Minor tidy) Newer edit →
(14 intermediate revisions by 5 users not shown)
Line 158: For example, the following two program fragments are equivalent, and should produce the same token stream except for the line and column positions: * <~~lang~~syntaxhighlight lang="c">if ( p /* meaning n is prime / ) { print ( n , " " ) ; count = count + 1 ; / number of primes found so far / }</~~lang~~syntaxhighlight> <~~lang~~syntaxhighlight lang="c">if(p){print(n," ");count=count+1;}</~~lang~~syntaxhighlight> ;Complete list of token names Line 237: \| style="vertical-align:top" \| Test Case 1: <~~lang~~syntaxhighlight lang="c">/* Hello world / print("Hello, World!\n");</~~lang~~syntaxhighlight> \| style="vertical-align:top" \| Line 255: \| style="vertical-align:top" \| Test Case 2: <~~lang~~syntaxhighlight lang="c">/ Show Ident and Integers / phoenix_number = 142857; print(phoenix_number, "\n");</~~lang~~syntaxhighlight> \| style="vertical-align:top" \| Line 280: \| style="vertical-align:top" \| Test Case 3: <~~lang~~syntaxhighlight lang="c">/ All lexical tokens - not syntactically correct, but that will have to wait until syntax analysis Line 301: /* character literal / '\n' / character literal / '\\' / character literal / ' '</~~lang~~syntaxhighlight> \| style="vertical-align:top" \| Line 344: \| style="vertical-align:top" \| Test Case 4: <~~lang~~syntaxhighlight lang="c">/** test printing, embedded \n and comments with lots of '' */ print(42); print("\nHello World\nGood Bye\nok\n"); print("Print a slash n - \\n.\n");</~~lang~~syntaxhighlight> \| style="vertical-align:top" \| Line 388: =={{header\|Ada}}== <~~lang~~syntaxhighlight lang="ada">with Ada.Text_IO, Ada.Streams.Stream_IO, Ada.Strings.Unbounded, Ada.Command_Line, Ada.Exceptions; use Ada.Strings, Ada.Strings.Unbounded, Ada.Streams, Ada.Exceptions; Line 648: when error : others => IO.Put_Line("Error: " & Exception_Message(error)); end Main; </syntaxhighlight> ~~</lang>~~ {{out}} Test case 3: <pre> Line 686: 23 1 END_OF_INPUT </pre> =={{header\|ALGOL 68}}== This is a simple ''token in, line 
out'' program. It doesn't keep an internal representation of tokens or anything like that, since that's not needed at all. As an addition, it emits a diagnostic if integer literals are too big. <syntaxhighlight lang="algol68">BEGIN # implement C-like getchar, where EOF and EOLn are "characters" (-1 and 10 resp.). # INT eof = -1, eoln = 10; BOOL eof flag := FALSE; STRING buf := ""; INT col := 1; INT line := 0; on logical file end (stand in, (REF FILE f)BOOL: eof flag := TRUE); PROC getchar = INT: IF eof flag THEN eof ELIF col = UPB buf THEN col +:= 1; eoln ELIF col > UPB buf THEN IF line > 0 THEN read(newline) FI; line +:= 1; read(buf); IF eof flag THEN col := 1; eof ELSE col := 0; getchar FI ELSE col +:= 1; ABS buf[col] FI; PROC nextchar = INT: IF eof flag THEN eof ELIF col >= UPB buf THEN eoln ELSE ABS buf[col+1] FI; PROC is blank = (INT ch) BOOL: ch = 0 OR ch = 9 OR ch = 10 OR ch = 13 OR ch = ABS " "; PROC is digit = (INT ch) BOOL: ch >= ABS "0" AND ch <= ABS "9"; PROC is ident start = (INT ch) BOOL: ch >= ABS "A" AND ch <= ABS "Z" OR ch >= ABS "a" AND ch <= ABS "z" OR ch = ABS "_"; PROC is ident = (INT ch) BOOL: is ident start(ch) OR is digit(ch); PROC ident or keyword = (INT start char) VOID: BEGIN STRING w := REPR start char; INT start col = col; WHILE is ident (next char) DO w +:= REPR getchar OD; IF w = "if" THEN output2("Keyword_if", start col) ELIF w = "else" THEN output2("Keyword_else", start col) ELIF w = "while" THEN output2("Keyword_while", start col) ELIF w = "print" THEN output2("Keyword_print", start col) ELIF w = "putc" THEN output2("Keyword_putc", start col) ELSE output2("Identifier " + w, start col) FI END; PROC char = VOID: BEGIN INT start col = col; INT ch := getchar; IF ch = ABS "'" THEN error("Empty character constant") ELIF ch = ABS "\" THEN ch := getchar; IF ch = ABS "n" THEN ch := 10 ELIF ch = ABS "\" THEN SKIP ELSE error("Unknown escape sequence. 
\" + REPR ch) FI FI; IF nextchar /= ABS "'" THEN error("Multi-character constant.") FI; getchar; output2("Integer " + whole(ch, 0), start col) END; PROC string = VOID: BEGIN INT start col = col; STRING s := """"; WHILE INT ch := getchar; ch /= ABS """" DO IF ch = eoln THEN error("End-of-line while scanning string literal. Closing string character not found before end-of-line.") ELIF ch = eof THEN error("End-of-file while scanning string literal. Closing string character not found.") ELIF ch = ABS "\" THEN s +:= REPR ch; ch := getchar; IF ch /= ABS "\" AND ch /= ABS "n" THEN error("Unknown escape sequence. \" + REPR ch) FI; s +:= REPR ch ELSE s +:= REPR ch FI OD; output2("String " + s + """", start col) END; PROC comment = VOID: BEGIN WHILE INT ch := getchar; NOT (ch = ABS "" AND nextchar = ABS "/") DO IF ch = eof THEN error("End-of-file in comment. Closing comment characters not found.") FI OD; getchar END; PROC number = (INT first digit) VOID: BEGIN INT start col = col; INT n := first digit - ABS "0"; WHILE is digit (nextchar) DO INT u := getchar - ABS "0"; IF LENG n * 10 + LENG u > max int THEN error("Integer too big") FI; n := n * 10 + u OD; IF is ident start (nextchar) THEN error("Invalid number. 
Starts like a number, but ends in non-numeric characters.") FI; output2("Integer " + whole(n, 0), start col) END; PROC output = (STRING s) VOID: output2(s, col); PROC output2 = (STRING s, INT col) VOID: print((whole(line,-8), whole(col,-8), " ", s, newline)); PROC if follows = (CHAR second, STRING longer, shorter) VOID: IF nextchar = ABS second THEN output(longer); getchar ELSE output(shorter) FI; PROC error = (STRING s)VOID: (put(stand error, ("At ", whole(line,0), ":", whole(col,0), " ", s, new line)); stop); PROC unrecognized = (INT char) VOID: error("Unrecognized character " + REPR char); PROC double char = (INT first, STRING op) VOID: IF nextchar /= first THEN unrecognized(first) ELSE output2(op, col-1); getchar FI; WHILE INT ch := getchar; ch /= eof DO IF is blank(ch) THEN SKIP ELIF ch = ABS "(" THEN output("LeftParen") ELIF ch = ABS ")" THEN output("RightParen") ELIF ch = ABS "{" THEN output("LeftBrace") ELIF ch = ABS "}" THEN output("RightBrace") ELIF ch = ABS ";" THEN output("Semicolon") ELIF ch = ABS "," THEN output("Comma") ELIF ch = ABS "" THEN output("Op_multiply") ELIF ch = ABS "/" THEN IF next char = ABS "" THEN comment ELSE output("Op_divide") FI ELIF ch = ABS "%" THEN output("Op_mod") ELIF ch = ABS "+" THEN output("Op_add") ELIF ch = ABS "-" THEN output("Op_subtract") ELIF ch = ABS "<" THEN if follows("=", "Op_lessequal", "Op_less") ELIF ch = ABS ">" THEN if follows("=", "Op_greaterequal", "Op_greater") ELIF ch = ABS "=" THEN if follows("=", "Op_equal", "Op_assign") ELIF ch = ABS "!" 
THEN if follows("=", "Op_notequal", "Op_not") ELIF ch = ABS "&" THEN double char(ch, "Op_and") ELIF ch = ABS "\|" THEN double char(ch, "Op_or") ELIF is ident start (ch) THEN ident or keyword (ch) ELIF ch = ABS """" THEN string ELIF ch = ABS "'" THEN char ELIF is digit(ch) THEN number(ch) ELSE unrecognized(ch) FI OD; output("End_Of_Input") END</syntaxhighlight> =={{header\|ALGOL W}}== <~~lang~~syntaxhighlight lang="algolw">begin %lexical analyser % % Algol W strings are limited to 256 characters in length so we limit source lines % Line 981 ⟶ 1,124: while nextToken not = tEnd_of_input do writeToken; writeToken end.</~~lang~~syntaxhighlight> {{out}} Test case 3: <pre> Line 1,026 ⟶ 1,169: (One point of note: the C "EOF" pseudo-character is detected in the following code by looking for a negative number. That EOF has to be negative and the other characters non-negative is implied by the ISO C standard.) <~~lang~~syntaxhighlight ~~ATS~~lang="ats">(*******************************************************************) ( Usage: lex [INPUTFILE [OUTPUTFILE]] If INPUTFILE or OUTPUTFILE is "-" or missing, then standard input Line 1,898 ⟶ 2,041: end (*******************************************************************)</~~lang~~syntaxhighlight> {{out}} Line 1,939 ⟶ 2,082: =={{header\|AWK}}== Tested with gawk 4.1.1 and mawk 1.3.4. <syntaxhighlight lang="awk"> ~~<lang AWK>~~ BEGIN { all_syms["tk_EOI" ] = "End_of_input" Line 2,145 ⟶ 2,288: } } </syntaxhighlight> ~~</lang>~~ {{out\|case=count}} <b> Line 2,182 ⟶ 2,325: =={{header\|C}}== Tested with gcc 4.81 and later, compiles warning free with -Wpedantic -pedantic -Wall -Wextra <~~lang~~syntaxhighlight Clang="c">#include <stdlib.h> #include <stdio.h> #include <stdarg.h> Line 2,414 ⟶ 2,557: run(); return 0; }</~~lang~~syntaxhighlight> {{out\|case=test case 3}} Line 2,458 ⟶ 2,601: =={{header\|C sharp\|C#}}== Requires C#6.0 because of the use of null coalescing operators. 
<~~lang~~syntaxhighlight lang="csharp"> using System; using System.IO; Line 2,808 ⟶ 2,951: } } </syntaxhighlight> ~~</lang>~~ {{out\|case=test case 3}} Line 2,852 ⟶ 2,995: =={{header\|C++}}== Tested with GCC 9.3.0 (g++ -std=c++17) <~~lang~~syntaxhighlight lang="cpp">#include <charconv> // std::from_chars #include <fstream> // file_to_string, string_to_file #include <functional> // std::invoke Line 3,237 ⟶ 3,380: }); } </syntaxhighlight> ~~</lang>~~ {{out\|case=test case 3}} Line 3,282 ⟶ 3,425: Using GnuCOBOL 2. By Steve Williams (with one change to get around a Rosetta Code code highlighter problem). <~~lang~~syntaxhighlight lang="cobol"> >>SOURCE FORMAT IS FREE > this code is dedicated to the public domain > (GnuCOBOL) 2.3-dev.0 Line 3,688 ⟶ 3,831: end-if . end program lexer.</~~lang~~syntaxhighlight> {{out\|case=test case 3}} Line 3,730 ⟶ 3,873: Lisp has a built-in reader and you can customize the reader by modifying its readtable. I'm also using the Gray stream, which is an almost standard feature of Common Lisp, for counting lines and columns. <~~lang~~syntaxhighlight lang="lisp">(defpackage #:lexical-analyzer (:use #:cl #:sb-gray) (:export #:main)) Line 3,943 ⟶ 4,086: (defun main () (lex standard-input))</~~lang~~syntaxhighlight> {{out\|case=test case 3}} <pre> 5 16 KEYWORD-PRINT Line 3,984 ⟶ 4,127: {{trans\|ATS}} <~~lang~~syntaxhighlight ~~Elixir~~lang="elixir">#!/bin/env elixir # -- elixir -- Line 4,452 ⟶ 4,595: end ## module Lex Lex.main(System.argv)</~~lang~~syntaxhighlight> {{out}} Line 4,498 ⟶ 4,641: <~~lang~~syntaxhighlight lang="lisp">#!/usr/bin/emacs --script ;; ;; The Rosetta Code lexical analyzer in GNU Emacs Lisp. 
Line 4,916 ⟶ 5,059: (scan-text t)) (main)</~~lang~~syntaxhighlight> Line 4,962 ⟶ 5,105: <~~lang~~syntaxhighlight lang="erlang">#!/bin/env escript %%%------------------------------------------------------------------- Line 5,467 ⟶ 5,610: %%% erlang-indent-level: 3 %%% end: %%%-------------------------------------------------------------------</~~lang~~syntaxhighlight> Line 5,509 ⟶ 5,652: =={{header\|Euphoria}}== Tested with Euphoria 4.05. <~~lang~~syntaxhighlight lang="euphoria">include std/io.e include std/map.e include std/types.e Line 5,734 ⟶ 5,877: end procedure main(command_line())</~~lang~~syntaxhighlight> {{out\|case=test case 3}} Line 5,778 ⟶ 5,921: =={{header\|Flex}}== Tested with Flex 2.5.4. <syntaxhighlight lang="c">%{ ~~<lang C>%{~~ #include <stdio.h> #include <stdlib.h> Line 5,951 ⟶ 6,094: } while (tok != tk_EOI); return 0; }</~~lang~~syntaxhighlight> {{out\|case=test case 3}} Line 5,995 ⟶ 6,138: =={{header\|Forth}}== Tested with Gforth 0.7.3. <~~lang~~syntaxhighlight ~~Forth~~lang="forth">CREATE BUF 0 , \ single-character look-ahead buffer CREATE COLUMN# 0 , CREATE LINE# 1 , Line 6,117 ⟶ 6,260: THEN THEN ; : TOKENIZE BEGIN CONSUME AGAIN ; TOKENIZE</~~lang~~syntaxhighlight> {{out}} Line 6,131 ⟶ 6,274: The author has placed this Fortran code in the public domain. <syntaxhighlight lang="fortran">!!! ~~<lang Fortran>!!!~~ !!! An implementation of the Rosetta Code lexical analyzer task: !!! 
https://rosettacode.org/wiki/Compiler/lexical_analyzer Line 7,209 ⟶ 7,352: end subroutine print_usage end program lex</~~lang~~syntaxhighlight> {{out}} Line 7,250 ⟶ 7,393: =={{header\|FreeBASIC}}== Tested with FreeBASIC 1.05 <~~lang~~syntaxhighlight ~~FreeBASIC~~lang="freebasic">enum Token_type tk_EOI tk_Mul Line 7,536 ⟶ 7,679: print : print "Hit any to end program" sleep system</~~lang~~syntaxhighlight> {{out\|case=test case 3}} <b> Line 7,577 ⟶ 7,720: =={{header\|Go}}== {{trans\|FreeBASIC}} <~~lang~~syntaxhighlight lang="go">package main import ( Line 7,954 ⟶ 8,097: initLex() process() }</~~lang~~syntaxhighlight> {{out}} Line 7,997 ⟶ 8,140: =={{header\|Haskell}}== Tested with GHC 8.0.2 <~~lang~~syntaxhighlight lang="haskell">import Control.Applicative hiding (many, some) import Control.Monad.State.Lazy import Control.Monad.Trans.Maybe (MaybeT, runMaybeT) Line 8,301 ⟶ 8,444: where (Just t, s') = runState (runMaybeT lexer) s (txt, _, _) = s' </syntaxhighlight> ~~</lang>~~ {{out\|case=test case 3}} Line 8,353 ⟶ 8,496: Global variables are avoided except for some constants that require initialization. <syntaxhighlight lang="icon"># ~~<lang Icon>#~~ # The Rosetta Code lexical analyzer in Icon with co-expressions. Based # upon the ATS implementation. Line 8,851 ⟶ 8,994: procedure max(x, y) return (if x < y then y else x) end</~~lang~~syntaxhighlight> Line 8,900 ⟶ 9,043: Implementation: <~~lang~~syntaxhighlight Jlang="j">symbols=:256#0 ch=: {{1 0+x[symbols=: x (a.i.y)} symbols}} 'T0 token' =: 0 ch '%+-!(){};,<>=!\|&' Line 9,020 ⟶ 9,163: keep=. (tokens~:<,'''')-.comments+.whitespace+.unknowna:=values keep&#each ((1+lines),.columns);<names,.values }}</~~lang~~syntaxhighlight> Test case 3: <syntaxhighlight lang="j"> ~~<lang J>~~ flex=: {{ 'A B'=.y Line 9,090 ⟶ 9,233: 21 28 Integer 92 22 27 Integer 32 23 1 End_of_input </~~lang~~syntaxhighlight> Here, it seems expedient to retain a structured representation of the lexical result. 
As shown, it's straightforward to produce a "pure" textual result for a hypothetical alternative implementation of the syntax analyzer, but the structured representation will be easier to deal with. =={{header\|Java}}== <~~lang~~syntaxhighlight lang="java"> // Translated from python source Line 9,245 ⟶ 9,388: if (text.equals("")) { error(line, pos, String.format("identifer_or_integer ~~unrecopgnized~~unrecognized character: (%d) %c", (int)this.chr, this.chr)); } if (Character.isDigit(text.charAt(0))) { if (!is_number) { error(line, pos, String.format("~~invaslid~~invalid number: %s", text)); } return new Token(TokenType.Integer, text, line, pos); Line 9,336 ⟶ 9,479: } } </syntaxhighlight> ~~</lang>~~ =={{header\|JavaScript}}== {{incorrect\|Javascript\|Please show output. Code is identical to [[Compiler/syntax_analyzer]] task}} <~~lang~~syntaxhighlight lang="javascript"> / Token: type, value, line, pos Line 9,553 ⟶ 9,696: l.printTokens() }) </syntaxhighlight> ~~</lang>~~ =={{header\|Julia}}== <~~lang~~syntaxhighlight lang="julia">struct Tokenized startline::Int startcol::Int Line 9,711 ⟶ 9,854: println(lpad(tok.startline, 3), lpad(tok.startcol, 5), lpad(tok.name, 18), " ", tok.value != nothing ? tok.value : "") end </~~lang~~syntaxhighlight>{{output}}<pre> Line Col Name Value 5 16 Keyword_print Line 9,748 ⟶ 9,891: 23 1 End_of_input </pre> =={{header\|kotlin}}== {{trans\|Java}} <syntaxhighlight lang="kotlin">// Input: command line argument of file to process or console input. A two or // three character console input of digits followed by a new line will be // checked for an integer between zero and twenty-five to select a fixed test // case to run. Any other console input will be parsed. // Code based on the Java version found here: // https://rosettacode.org/mw/index.php?title=Compiler/lexical_analyzer&action=edit&section=22 // Class to halt the parsing with an exception. 
/** Exception used to abort scanning; its message is the text shown to the user. */
class ParsingFailed(message: String): Exception(message)

// Enumerate class of tokens supported by this scanner.
/**
 * Token kinds produced by the scanner.
 *
 * Each entry carries the name printed in the token listing, so [toString]
 * is a plain property read instead of a list rebuilt on every call.
 * Entry order is unchanged, so `ordinal` values are the same as before.
 */
enum class TokenType(private val label: String) {
    Tk_End_of_input("End_of_input"),
    Op_multiply("Op_multiply"),
    Op_divide("Op_divide"),
    Op_mod("Op_mod"),
    Op_add("Op_add"),
    Op_subtract("Op_subtract"),
    Op_negate("Op_negate"),
    Op_not("Op_not"),
    Op_less("Op_less"),
    Op_lessequal("Op_lessequal"),
    Op_greater("Op_greater"),
    Op_greaterequal("Op_greaterequal"),
    Op_equal("Op_equal"),
    Op_notequal("Op_notequal"),
    Op_assign("Op_assign"),
    Op_and("Op_and"),
    Op_or("Op_or"),
    Kw_if("Keyword_if"),
    Kw_else("Keyword_else"),
    Kw_while("Keyword_while"),
    Kw_print("Keyword_print"),
    Kw_putc("Keyword_putc"),
    Sy_LeftParen("LeftParen"),
    Sy_RightParen("RightParen"),
    Sy_LeftBrace("LeftBrace"),
    Sy_RightBrace("RightBrace"),
    Sy_Semicolon("Semicolon"),
    Sy_Comma("Comma"),
    Tk_Identifier("Identifier"),
    Tk_Integer("Integer"),
    Tk_String("String");

    /** Returns the display name used in the token listing. */
    override fun toString() = label
} // TokenType

// Data class of tokens returned by the scanner.
/**
 * One token: its kind, its text (empty for operators, keywords and symbols)
 * and the line/column where it starts.
 */
data class Token(val token: TokenType, val value: String, val line: Int,
                 val pos: Int) {

    // Overridden method to display the token.
    /**
     * Formats the token as `line column name [value]`.
     *
     * Integers and identifiers print their text as-is. Strings are printed
     * quoted with special characters re-escaped, so the listing is
     * unambiguous; every other token kind prints no value.
     */
    override fun toString() =
        "%5d %5d %-15s %s".format(line, pos, this.token,
            when (this.token) {
                TokenType.Tk_Integer, TokenType.Tk_Identifier ->
                    " %s".format(this.value)
                TokenType.Tk_String ->
                    this.value.asIterable()
                        .joinToString("", " \"", "\"") { escape(it) }
                else -> ""
            }
        )

    /** Returns the printable, escaped form of one string-literal character. */
    private fun escape(c: Char): String =
        when (c) {
            '\t' -> "\\t"
            '\n' -> "\\n"
            '\u000b' -> "\\v"
            '\u000c' -> "\\f"
            '\r' -> "\\r"
            '"' -> "\\\""
            // A backslash must print as two characters; printing it bare
            // made an escaped backslash followed by 'n' look like a newline.
            '\\' -> "\\\\"
            in ' '..'~' -> "$c"
            else -> "\\u%04x".format(c.code)
        }
} // Token

// Function to display an error message and halt the scanner.
/**
 * Reports a lexical error and halts the scanner.
 *
 * Never returns: always throws [ParsingFailed] with the message prefixed
 * by the `(line, pos)` of the offending token.
 */
fun error(line: Int, pos: Int, msg: String): Nothing =
    throw ParsingFailed("(%d, %d) %s\n".format(line, pos, msg))

// Class to process the source into tokens with properties of the
// source string, the line number, the column position, the index
// within the source string, the current character being processed,
// and map of the keyword strings to the corresponding token type.
class Lexer(private val s: String) {
    private var line = 1
    private var pos = 1
    private var position = 0
    private var chr = if (s.isEmpty()) ' ' else s[0]
    private val keywords = mapOf<String, TokenType>(
        "if" to TokenType.Kw_if,
        "else" to TokenType.Kw_else,
        "print" to TokenType.Kw_print,
        "putc" to TokenType.Kw_putc,
        "while" to TokenType.Kw_while)

    // Method to retrieve the next character from the source. Returns the
    // NUL character after the end of our source.
    private fun getNextChar(): Char {
        this.pos++
        if (++this.position >= this.s.length) {
            this.chr = '\u0000'
        } else {
            this.chr = this.s[this.position]
            when (this.chr) {
                '\n' -> {
                    this.line++
                    this.pos = 0
                } // line
                // Advance the column to the next tab stop.
                '\t' -> while (this.pos % 8 != 1) this.pos++
            } // when
        }
        return this.chr
    } // getNextChar

    // Method to return the division token, skip the comment, or handle the
    // error.
    private fun div_or_comment(line: Int, pos: Int): Token {
        if (getNextChar() != '*')
            return Token(TokenType.Op_divide, "", line, pos)
        getNextChar() // Skip comment start
        scan@ while (true) {
            when (this.chr) {
                '\u0000' -> error(line, pos, "Lexer: EOF in comment")
                '*' -> if (getNextChar() == '/') {
                    getNextChar() // Skip comment end
                    break@scan
                } // if
                else -> getNextChar()
            } // when
        }
        // A comment produces no token of its own; return whatever follows.
        return getToken()
    } // div_or_comment

    // Method to verify a character literal. Return the token or handle the
    // error.
    private fun char_lit(line: Int, pos: Int): Token {
        var c = getNextChar() // skip opening quote
        when (c) {
            '\'' -> error(line, pos, "Lexer: Empty character constant")
            '\\' -> c = when (getNextChar()) {
                'n' -> '\n'
                '\\' -> '\\'
                '\'' -> '\''
                else -> error(line, pos,
                    "Lexer: Unknown escape sequence '\\%c'".format(this.chr))
            }
            else -> {}
        } // when
        if (getNextChar() != '\'')
            error(line, pos, "Lexer: Multi-character constant")
        getNextChar() // Skip closing quote
        // Character literals are reported as their integer code.
        return Token(TokenType.Tk_Integer, c.code.toString(), line, pos)
    } // char_lit

    // Method to check next character to see whether it belongs to the token
    // we might be in the middle of. Return the correct token or handle the
    // error. Passing Tk_End_of_input as ifno means "no single-character
    // form exists", which makes a mismatch an error.
    private fun follow(expect: Char, ifyes: TokenType, ifno: TokenType,
                       line: Int, pos: Int): Token =
        when {
            getNextChar() == expect -> {
                getNextChar()
                Token(ifyes, "", line, pos)
            } // matches
            ifno == TokenType.Tk_End_of_input ->
                error(line, pos, "Lexer: %c expected: (%d) '%c'".format(
                    expect, this.chr.code, this.chr))
            else -> Token(ifno, "", line, pos)
        } // when

    // Method to verify a character string. Return the token or handle the
    // error.
    private fun string_lit(start: Char, line: Int, pos: Int): Token {
        val result = StringBuilder()
        while (getNextChar() != start)
            when (this.chr) {
                '\u0000' -> error(line, pos,
                    "Lexer: EOF while scanning string literal")
                '\n' -> error(line, pos,
                    "Lexer: EOL while scanning string literal")
                '\\' -> when (getNextChar()) {
                    '\\' -> result.append('\\')
                    'n' -> result.append('\n')
                    '"' -> result.append('"')
                    else -> error(line, pos,
                        "Lexer: Escape sequence unknown '\\%c'".format(this.chr))
                } // when
                else -> result.append(this.chr)
            } // when
        getNextChar() // Toss closing quote
        return Token(TokenType.Tk_String, result.toString(), line, pos)
    } // string_lit

    // True for the characters allowed in identifiers and integers
    // (ASCII letters, digits and underscore only).
    private fun isWordChar(c: Char): Boolean =
        c == '_' || c in '0'..'9' || c in 'a'..'z' || c in 'A'..'Z'

    // Method to retrieve an identifier or integer. Return the keyword
    // token, if the string matches one. Return the integer token,
    // if the string is all digits. Return the identifier token, if the
    // string is valid. Otherwise, handle the error.
    private fun identifier_or_integer(line: Int, pos: Int): Token {
        var is_number = true
        val word = StringBuilder()
        while (isWordChar(this.chr)) {
            word.append(this.chr)
            is_number = is_number && this.chr in '0'..'9'
            getNextChar()
        } // while
        val text = word.toString()
        if (text.isEmpty())
            error(line, pos, "Lexer: Unrecognized character: (%d) %c".format(
                this.chr.code, this.chr))
        return when {
            text[0] in '0'..'9' -> {
                if (!is_number)
                    error(line, pos, "Lexer: Invalid number: %s".format(text))
                // Compare as digit strings so arbitrarily long literals need
                // no parsing. Leading zeros are ignored so that "0000000000042"
                // is not rejected on length alone.
                val max = Int.MAX_VALUE.toString()
                val digits = text.trimStart('0')
                if (digits.length > max.length ||
                    (digits.length == max.length && max < digits))
                    error(line, pos,
                        "Lexer: Number exceeds maximum value %s".format(text))
                Token(TokenType.Tk_Integer, text, line, pos)
            }
            this.keywords.containsKey(text) ->
                Token(this.keywords[text]!!, "", line, pos)
            else -> Token(TokenType.Tk_Identifier, text, line, pos)
        }
    } // identifier_or_integer

    // Consume the current character and return a value-less token for it.
    private fun single(type: TokenType, line: Int, pos: Int): Token {
        getNextChar()
        return Token(type, "", line, pos)
    } // single

    // Method to skip whitespace both C's and Unicode ones and retrieve the
    // next token.
    private fun getToken(): Token {
        // isWhitespace() already covers tab, LF, VT, FF, CR and space.
        while (this.chr.isWhitespace())
            getNextChar()
        val line = this.line
        val pos = this.pos
        return when (this.chr) {
            '\u0000' -> Token(TokenType.Tk_End_of_input, "", line, pos)
            '/' -> div_or_comment(line, pos)
            '\'' -> char_lit(line, pos)
            '<' -> follow('=', TokenType.Op_lessequal, TokenType.Op_less,
                line, pos)
            '>' -> follow('=', TokenType.Op_greaterequal, TokenType.Op_greater,
                line, pos)
            '=' -> follow('=', TokenType.Op_equal, TokenType.Op_assign,
                line, pos)
            '!' -> follow('=', TokenType.Op_notequal, TokenType.Op_not,
                line, pos)
            '&' -> follow('&', TokenType.Op_and, TokenType.Tk_End_of_input,
                line, pos)
            '|' -> follow('|', TokenType.Op_or, TokenType.Tk_End_of_input,
                line, pos)
            '"' -> string_lit(this.chr, line, pos)
            '{' -> single(TokenType.Sy_LeftBrace, line, pos)   // open brace
            '}' -> single(TokenType.Sy_RightBrace, line, pos)  // close brace
            '(' -> single(TokenType.Sy_LeftParen, line, pos)   // open paren
            ')' -> single(TokenType.Sy_RightParen, line, pos)  // close paren
            '+' -> single(TokenType.Op_add, line, pos)         // plus
            '-' -> single(TokenType.Op_subtract, line, pos)    // dash
            '*' -> single(TokenType.Op_multiply, line, pos)    // asterisk
            '%' -> single(TokenType.Op_mod, line, pos)         // percent
            ';' -> single(TokenType.Sy_Semicolon, line, pos)   // semicolon
            ',' -> single(TokenType.Sy_Comma, line, pos)       // comma
            else -> identifier_or_integer(line, pos)
        }
    } // getToken

    // Method to parse and display tokens.
    fun printTokens() {
        do {
            val t: Token = getToken()
            println(t)
        } while (t.token != TokenType.Tk_End_of_input)
    } // printTokens
} // Lexer

// Function to test all good tests from the website and produce all of the
// error messages this program supports.
//
// NOTE(review): the sample programs below are raw strings, so their line
// breaks and spacing determine the line/column numbers that get printed.
// The copy this was restored from had line breaks collapsed and '*'
// characters missing. The "All Symbols" layout matches the published
// expected output (print at 5:16, '-' at 5:40, End_of_input at 22:29);
// the layout of the other programs is a best-effort reconstruction --
// TODO confirm against the task's sample programs.
fun tests(number: Int) {

    // Function to generate test case 0 source: Hello World/Text.
    fun hello() {
        Lexer(
"""/*
  Hello world
 */
print("Hello, World!\n");
""").printTokens()
    } // hello

    // Function to generate test case 1 source: Phoenix Number.
    fun phoenix() {
        Lexer(
"""/*
  Show Ident and Integers
 */
phoenix_number = 142857;
print(phoenix_number, "\n");""").printTokens()
    } // phoenix

    // Function to generate test case 2 source: All Symbols.
    fun symbols() {
        Lexer(
"""/*
  All lexical tokens - not syntactically correct, but that will
  have to wait until syntax analysis
 */
/* Print   */  print    /* Sub     */  -
/* Putc    */  putc     /* Lss     */  <
/* If      */  if       /* Gtr     */  >
/* Else    */  else     /* Leq     */  <=
/* While   */  while    /* Geq     */  >=
/* Lbrace  */  {        /* Eq      */  ==
/* Rbrace  */  }        /* Neq     */  !=
/* Lparen  */  (        /* And     */  &&
/* Rparen  */  )        /* Or      */  ||
/* Uminus  */  -        /* Semi    */  ;
/* Not     */  !        /* Comma   */  ,
/* Mul     */  *        /* Assign  */  =
/* Div     */  /        /* Integer */  42
/* Mod     */  %        /* String  */  "String literal"
/* Add     */  +        /* Ident   */  variable_name
/* character literal */  '\n'
/* character literal */  '\\'
/* character literal */  ' '""").printTokens()
    } // symbols

    // Function to generate test case 3 source: Test Case 4.
    fun four() {
        Lexer(
"""/*** test printing, embedded \n and comments with lots of '*' ***/
print(42);
print("\nHello World\nGood Bye\nok\n");
print("Print a slash n - \\n.\n");""").printTokens()
    } // four

    // Function to generate test case 4 source: Count.
    fun count() {
        Lexer(
"""count = 1;
while (count < 10) {
    print("count is: ", count, "\n");
    count = count + 1;
}""").printTokens()
    } // count

    // Function to generate test case 5 source: 100 Doors.
    fun doors() {
        Lexer(
"""/* 100 Doors */
i = 1;
while (i * i <= 100) {
    print("door ", i * i, " is open\n");
    i = i + 1;
}""").printTokens()
    } // doors

    // Function to generate test case 6 source: Negative Tests.
    fun negative() {
        Lexer(
"""a = (-1 * ((-1 * (5 * 15)) / 10));
print(a, "\n");
b = -a;
print(b, "\n");
print(-b, "\n");
print(-(1), "\n");""").printTokens()
    } // negative

    // Function to generate test case 7 source: Deep.
    fun deep() {
        Lexer(
"""print(---------------------------------+++5, "\n");
print(((((((((3 + 2) * ((((((2))))))))))))), "\n");

if (1) { if (1) { if (1) { if (1) { if (1) { print(15, "\n"); } } } } }""").printTokens()
    } // deep

    // Function to generate test case 8 source: Greatest Common Divisor.
    fun gcd() {
        Lexer(
"""/* Compute the gcd of 1071, 1029:  21 */

a = 1071;
b = 1029;

while (b != 0) {
    new_a = b;
    b     = a % b;
    a     = new_a;
}
print(a);""").printTokens()
    } // gcd

    // Function to generate test case 9 source: Factorial.
    fun factorial() {
        Lexer(
"""/* 12 factorial is 479001600 */

n = 12;
result = 1;
i = 1;
while (i <= n) {
    result = result * i;
    i = i + 1;
}
print(result);""").printTokens()
    } // factorial

    // Function to generate test case 10 source: Fibonacci Sequence.
    fun fibonacci() {
        Lexer(
"""/* fibonacci of 44 is 701408733 */

n = 44;
i = 1;
a = 0;
b = 1;
while (i < n) {
    w = a + b;
    a = b;
    b = w;
    i = i + 1;
}
print(w, "\n");""").printTokens()
    } // fibonacci

    // Function to generate test case 11 source: FizzBuzz.
    fun fizzbuzz() {
        Lexer(
"""/* FizzBuzz */
i = 1;
while (i <= 100) {
    if (!(i % 15))
        print("FizzBuzz");
    else if (!(i % 3))
        print("Fizz");
    else if (!(i % 5))
        print("Buzz");
    else
        print(i);

    print("\n");
    i = i + 1;
}""").printTokens()
    } // fizzbuzz

    // Function to generate test case 12 source: 99 Bottles of Beer.
    fun bottles() {
        Lexer(
"""/* 99 bottles */
bottles = 99;
while (bottles > 0) {
    print(bottles, " bottles of beer on the wall\n");
    print(bottles, " bottles of beer\n");
    print("Take one down, pass it around\n");
    bottles = bottles - 1;
    print(bottles, " bottles of beer on the wall\n\n");
}""").printTokens()
    } // bottles

    // Function to generate test case 13 source: Primes.
    fun primes() {
        Lexer(
"""/*
 Simple prime number generator
 */
count = 1;
n = 1;
limit = 100;
while (n < limit) {
    k=3;
    p=1;
    n=n+2;
    while ((k*k<=n) && (p)) {
        p=n/k*k!=n;
        k=k+2;
    }
    if (p) {
        print(n, " is prime\n");
        count = count + 1;
    }
}
print("Total primes found: ", count, "\n");""").printTokens()
    } // primes

    // Function to generate test case 14 source: Ascii Mandelbrot.
    fun ascii() {
        Lexer(
"""{
/*
 This is an integer ascii Mandelbrot generator
 */
    left_edge   = -420;
    right_edge  =  300;
    top_edge    =  300;
    bottom_edge = -300;
    x_step      =    7;
    y_step      =   15;

    max_iter    =  200;

    y0 = top_edge;
    while (y0 > bottom_edge) {
        x0 = left_edge;
        while (x0 < right_edge) {
            y = 0;
            x = 0;
            the_char = ' ';
            i = 0;
            while (i < max_iter) {
                x_x = (x * x) / 200;
                y_y = (y * y) / 200;
                if (x_x + y_y > 800 ) {
                    the_char = '0' + i;
                    if (i > 9) {
                        the_char = '@';
                    }
                    i = max_iter;
                }
                y = x * y / 100 + y0;
                x = x_x - y_y + x0;
                i = i + 1;
            }
            putc(the_char);
            x0 = x0 + x_step;
        }
        putc('\n');
        y0 = y0 - y_step;
    }
}
""").printTokens()
    } // ascii

    when (number) {
        0 -> hello()
        1 -> phoenix()
        2 -> symbols()
        3 -> four()
        4 -> count()
        5 -> doors()
        6 -> negative()
        7 -> deep()
        8 -> gcd()
        9 -> factorial()
        10 -> fibonacci()
        11 -> fizzbuzz()
        12 -> bottles()
        13 -> primes()
        14 -> ascii()
        15 -> // Lexer: Empty character constant
            Lexer("''").printTokens()
        16 -> // Lexer: Unknown escape sequence
            Lexer("'\\x").printTokens()
        17 -> // Lexer: Multi-character constant
            Lexer("' ").printTokens()
        18 -> // Lexer: EOF in comment
            Lexer("/*").printTokens()
        19 -> // Lexer: EOL in string
            Lexer("\"\n").printTokens()
        20 -> // Lexer: EOF in string
            Lexer("\"").printTokens()
        21 -> // Lexer: Escape sequence unknown
            Lexer("\"\\x").printTokens()
        22 -> // Lexer: Unrecognized character
            Lexer("~").printTokens()
        23 -> // Lexer: invalid number
            Lexer("9a9").printTokens()
        24 -> // Lexer: Number exceeds maximum value
            Lexer("2147483648\n9223372036854775808").printTokens()
        25 -> // Lexer: Operator expected
            Lexer("|.").printTokens()
        else -> println("Invalid test number %d!".format(number))
    } // when
} // tests

// Main function to check our source and read its data before parsing it.
// With no source specified, run the test of all symbols.
fun main(args: Array<String>) { try { val s = if (args.size > 0 && args[0].isNotEmpty()) // file on command line java.util.Scanner(java.io.File(args[0])) else // use the console java.util.Scanner(System.`in`) var source = "" while (s.hasNext()) source += s.nextLine()+ if (s.hasNext()) "\n" else "" if (args.size > 0 && args[0].isNotEmpty()) // file on command line Lexer(source).printTokens() else { val digits = source.filter { it in '0'..'9' } when { source.isEmpty() -> // nothing given tests(2) source.length in 1..2 && digits.length == source.length && digits.toInt() in 0..25 -> tests(digits.toInt()) else -> Lexer(source).printTokens() } // when } // if } catch(e: Throwable) { println(e.message) System.exit(1) } // try } // main</syntaxhighlight> {{out\|case=test case 3: All Symbols}} <b> <pre> 5 16 Keyword_print 5 40 Op_subtract 6 16 Keyword_putc 6 40 Op_less 7 16 Keyword_if 7 40 Op_greater 8 16 Keyword_else 8 40 Op_lessequal 9 16 Keyword_while 9 40 Op_greaterequal 10 16 LeftBrace 10 40 Op_equal 11 16 RightBrace 11 40 Op_notequal 12 16 LeftParen 12 40 Op_and 13 16 RightParen 13 40 Op_or 14 16 Op_subtract 14 40 Semicolon 15 16 Op_not 15 40 Comma 16 16 Op_multiply 16 40 Op_assign 17 16 Op_divide 17 40 Integer 42 18 16 Op_mod 18 40 String "String literal" 19 16 Op_add 19 40 Identifier variable_name 20 26 Integer 10 21 26 Integer 92 22 26 Integer 32 22 29 End_of_input </pre> </b> =={{header\|Lua}}== Line 9,756 ⟶ 10,614: The first module is simply a table defining the names of tokens which don't have an associated value. <~~lang~~syntaxhighlight ~~Lua~~lang="lua">-- module token_name (in a file "token_name.lua") local token_name = { [''] = 'Op_multiply', Line 9,785 ⟶ 10,643: ['putc'] = 'Keyword_putc', } return token_name</~~lang~~syntaxhighlight> This module exports a function <i>find_token</i>, which attempts to find the next valid token from a specified position in a source line. 
<~~lang~~syntaxhighlight ~~Lua~~lang="lua">-- module lpeg_token_finder local M = {} -- only items added to M will be public (via 'return M' at end) local table, concat = table, table.concat Line 9,871 ⟶ 10,729: end return M</~~lang~~syntaxhighlight> The <i>lexer</i> module uses <i>finder.find_token</i> to produce an iterator over the tokens in a source. <~~lang~~syntaxhighlight ~~Lua~~lang="lua">-- module lexer local M = {} -- only items added to M will publicly available (via 'return M' at end) local string, io, coroutine, yield = string, io, coroutine, coroutine.yield Line 9,953 ⟶ 10,811: -- M._INTERNALS = _ENV return M </syntaxhighlight> ~~</lang>~~ This script uses <i>lexer.tokenize_text</i> to show the token sequence produced from a source text. <~~lang~~syntaxhighlight ~~Lua~~lang="lua">lexer = require 'lexer' format, gsub = string.format, string.gsub Line 9,995 ⟶ 10,853: -- etc. end </syntaxhighlight> ~~</lang>~~ ===Using only standard libraries=== This version replaces the <i>lpeg_token_finder</i> module of the LPeg version with this <i>basic_token_finder</i> module, altering the <i>require</i> expression near the top of the <i>lexer</i> module accordingly. Tested with Lua 5.3.5. (Note that <i>select</i> is a standard function as of Lua 5.2.) <~~lang~~syntaxhighlight lang="lua">-- module basic_token_finder local M = {} -- only items added to M will be public (via 'return M' at end) local table, string = table, string Line 10,130 ⟶ 10,988: -- M._ENV = _ENV return M</~~lang~~syntaxhighlight> =={{header\|M2000 Interpreter}}== <syntaxhighlight lang="m2000 interpreter"> ~~<lang M2000 Interpreter>~~ Module lexical_analyzer { a$={/* Line 10,389 ⟶ 11,247: } lexical_analyzer </syntaxhighlight> ~~</lang>~~ {{out}} Line 10,434 ⟶ 11,292: <~~lang~~syntaxhighlight ~~Mercury~~lang="mercury">% -- mercury -- % % Compile with maybe something like: Line 11,164 ⟶ 12,022: :- func eof = int is det. 
eof = -1.</~~lang~~syntaxhighlight> {{out}} Line 11,213 ⟶ 12,071: Tested with Nim v0.19.4. Both examples are tested against all programs in [[Compiler/Sample programs]]. ===Using string with regular expressions=== <~~lang~~syntaxhighlight lang="nim"> import re, strformat, strutils Line 11,405 ⟶ 12,263: echo input.tokenize.output </syntaxhighlight> ~~</lang>~~ ===Using stream with lexer library=== <~~lang~~syntaxhighlight lang="nim"> import lexbase, streams from strutils import Whitespace Line 11,718 ⟶ 12,576: echo &"({l.lineNumber},{l.getColNumber l.bufpos + 1}) {l.error}" main() </syntaxhighlight> ~~</lang>~~ ===Using nothing but system and strutils=== <~~lang~~syntaxhighlight lang="nim">import strutils type Line 11,941 ⟶ 12,799: stdout.write('\n') if token.kind == tokEnd: break</~~lang~~syntaxhighlight> =={{header\|ObjectIcon}}== Line 11,951 ⟶ 12,809: <~~lang~~syntaxhighlight ~~ObjectIcon~~lang="objecticon"># -- ObjectIcon -- # # The Rosetta Code lexical analyzer in Object Icon. Based upon the ATS Line 12,448 ⟶ 13,306: write!([FileStream.stderr] \|\|\| args) exit(1) end</~~lang~~syntaxhighlight> Line 12,496 ⟶ 13,354: (Much of the extra complication in the ATS comes from arrays being a linear type (whose "views" need tending), and from values of linear type having to be local to any function using them. This limitation could have been worked around, and arrays more similar to OCaml arrays could have been used, but at a cost in safety and efficiency.) <~~lang~~syntaxhighlight ~~OCaml~~lang="ocaml">(------------------------------------------------------------------) (* The Rosetta Code lexical analyzer, in OCaml. Based on the ATS. ) Line 13,023 ⟶ 13,881: main () (------------------------------------------------------------------)</~~lang~~syntaxhighlight> {{out}} Line 13,066 ⟶ 13,924: Note: we do not print the line and token source code position for the simplicity. <~~lang~~syntaxhighlight lang="scheme"> (import (owl parse)) Line 13,190 ⟶ 14,048: (if (null? 
(cdr stream)) (print 'End_of_input)))) </syntaxhighlight> ~~</lang>~~ ==== Testing ==== Testing function: <~~lang~~syntaxhighlight lang="scheme"> (define (translate source) (let ((stream (try-parse token-parser (str-iter source) #t))) Line 13,201 ⟶ 14,059: (if (null? (force (cdr stream))) (print 'End_of_input)))) </syntaxhighlight> ~~</lang>~~ ====== Testcase 1 ====== <~~lang~~syntaxhighlight lang="scheme"> (translate " / Line 13,211 ⟶ 14,069: / print(\"Hello, World!\\\\n\"); ")</~~lang~~syntaxhighlight> {{Out}} <pre> Line 13,224 ⟶ 14,082: ====== Testcase 2 ====== <~~lang~~syntaxhighlight lang="scheme"> (translate " / Line 13,231 ⟶ 14,089: phoenix_number = 142857; print(phoenix_number, \"\\\\n\"); ")</~~lang~~syntaxhighlight> {{Out}} <pre> Line 13,250 ⟶ 14,108: ====== Testcase 3 ====== <~~lang~~syntaxhighlight lang="scheme"> (translate " /* Line 13,274 ⟶ 14,132: /* character literal / '\\\\' / character literal / ' ' ")</~~lang~~syntaxhighlight> {{Out}} <pre> Line 13,315 ⟶ 14,173: ====== Testcase 4 ====== <~~lang~~syntaxhighlight lang="scheme"> (translate " /** test printing, embedded \\\\n and comments with lots of '' */ Line 13,322 ⟶ 14,180: print(\"Print a slash n - \\\\\\\\n.\\\\n\"); ") </syntaxhighlight> ~~</lang>~~ {{Out}} <pre> Line 13,345 ⟶ 14,203: =={{header\|Perl}}== <~~lang~~syntaxhighlight lang="perl">#!/usr/bin/env perl use strict; Line 13,484 ⟶ 14,342: ($line, $col) } }</~~lang~~syntaxhighlight> {{out\|case=test case 3}} Line 13,527 ⟶ 14,385: ===Alternate Perl Solution=== Tested on perl v5.26.1 <~~lang~~syntaxhighlight ~~Perl~~lang="perl">#!/usr/bin/perl use strict; # lex.pl - source to tokens Line 13,563 ⟶ 14,421: 1 + $` =~ tr/\n//, 1 + length $` =~ s/.\n//sr, $^R; } printf "%5d %7d %s\n", 1 + tr/\n//, 1, 'End_of_input';</~~lang~~syntaxhighlight> =={{header\|Phix}}== Line 13,570 ⟶ 14,428: form. If required, demo\rosetta\Compiler\extra.e (below) contains some code that achieves the latter. 
Code to print the human readable forms is likewise kept separate from any re-usable parts. <!--<~~lang~~syntaxhighlight ~~Phix~~lang="phix">(phixonline)--> <span style="color: #000080;font-style:italic;">-- -- demo\rosetta\Compiler\core.e Line 13,730 ⟶ 14,588: <span style="color: #008080;">return</span> <span style="color: #000000;">s</span> <span style="color: #008080;">end</span> <span style="color: #008080;">function</span> <!--</~~lang~~syntaxhighlight>--> For running under pwa/p2js, we also have a "fake file/io" component: <!--<~~lang~~syntaxhighlight ~~Phix~~lang="phix">(phixonline)--> <span style="color: #000080;font-style:italic;">-- -- demo\rosetta\Compiler\js_io.e Line 13,741 ⟶ 14,599: --</span> <span style="color: #008080;">with</span> <span style="color: #008080;">javascript_semantics</span> <span style="color: #~~008080~~004080;">~~constant~~sequence</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">known_files</span><span style="color: #0000FF;">,</span><span style="color: #000000;">kfc</span><span style="color: #0000FF;">}</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">columnize</span><span style="color: #0000FF;">({</span> <span style="color: #0000FF;">{</span><span style="color: #008000;">"test3.c"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">""" /* Line 13,806 ⟶ 14,664: } print(a); """</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)},</span> <span style="color: #0000FF;">{</span><span style="color: #008000;">"Header.h"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">""" #define area(h, w) h * w """</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: 
#0000FF;">)},</span> <span style="color: #0000FF;">{</span><span style="color: #008000;">"Source.t"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">""" #include "Header.h" #define width 5 #define height 6 area = #area(height, width)#; """</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)}})</span> <span style="color: #004080;">~~integer~~sequence</span> <span style="color: #000000;">fnlinenos</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">repeat</span><span style="color: #0000FF;">(-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">~~lineno~~known_files</span><span style="color: #0000FF;">))</span> <span style="color: #008080;">global</span> <span style="color: #008080;">function</span> <span style="color: #000000;">js_open</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">filename</span><span style="color: #0000FF;">)</span> <span style="color: #004080;">integer</span> <span style="color: #000000;">fn</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">filename</span><span style="color: #0000FF;">,</span><span style="color: #000000;">known_files</span><span style="color: #0000FF;">)</span> <span style="color: #7060A8;">assert</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">!=</span><span style="color: #000000;">0</span><span style="color: #0000FF;">)</span> <span style="color: #000000;">~~lineno~~linenos</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span 
style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span> <span style="color: #008080;">return</span> <span style="color: #000000;">fn</span> <span style="color: #008080;">end</span> <span style="color: #008080;">function</span> <span style="color: #008080;">global</span> <span style="color: #008080;">function</span> <span style="color: #000000;">js_gets</span><span style="color: #0000FF;">(</span><span style="color: #004080;">integer</span> <span style="color: #000000;">fn</span><span style="color: #0000FF;">)</span> <span style="color: #004080;">integer</span> <span style="color: #000000;">lineno</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">linenos</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">]+</span><span style="color: #000000;">1</span> <span style="color: #008080;">if</span> <span style="color: #000000;">lineno</span><span style="color: #0000FF;">><=</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">kfc</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">])</span> <span style="color: #008080;">then~~</span> <span style="color: #008080;">return</span> <span style="color: #000000;">EOF</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if~~</span> ~~<span~~ ~~style="color:~~ ~~#008080;">return</span>~~ <span style="color: #000000;">~~kfc~~linenos</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">][</span> <span style="color: #~~000000~~0000FF;">~~lineno~~=</span> <span style="color: #~~0000FF~~000000;">]lineno</span> <span style="color: #008080;">return</span> <span style="color: #000000;">kfc</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: 
#0000FF;">][</span><span style="color: #000000;">lineno</span><span style="color: #0000FF;">]</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span> <span style="color: #008080;">return</span> <span style="color: #000000;">EOF</span> <span style="color: #008080;">end</span> <span style="color: #008080;">function</span> <!--</~~lang~~syntaxhighlight>--> The main lexer is also written to be reusable by later stages. <!--<~~lang~~syntaxhighlight ~~Phix~~lang="phix">(phixonline)--> <span style="color: #000080;font-style:italic;">-- -- demo\\rosetta\\Compiler\\lex.e Line 13,852 ⟶ 14,722: <span style="color: #000000;">line</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span> <span style="color: #000000;">col</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span> <span style="color: #000000;">oneline</span> <span style="color: #0000FF;">=</span> <span style="color: #008080;">iff</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">platform</span><span style="color: #0000FF;">()=</span><span style="color: #004600;">JS</span><span style="color: #0000FF;">?</span><span style="color: #000000;">js_gets</span><span style="color: #0000FF;">(</span><span style="color: #000000;">input_file</span><span style="color: #0000FF;">)</span> <span style="color: #0000FF;">:</span><span style="color: #7060A8;">gets</span><span style="color: #0000FF;">(</span><span style="color: #000000;">input_file</span><span style="color: #0000FF;">))</span> <span style="color: #008080;">else</span> Line 13,862 ⟶ 14,732: <span style="color: #008080;">end</span> <span style="color: #008080;">function</span> <span style="color: #000080;font-style:italic;">-- for pwa/p2js (JavaScript really dislikes tabs in strings): --constant whitespace = " \t\r\n\x0B\xA0"</span> <span style="color: #008080;">constant</span> <span style="color: #000000;">whitespace</span> <span style="color: 
#0000FF;">=</span> <span style="color: #0000FF;">{</span><span style="color: #008000;">' '</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'\t'</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'\r'</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'\n'</span><span style="color: #0000FF;">,</span><span style="color: #000000;">#0B</span><span style="color: #0000FF;">,</span><span style="color: #000000;">#A0</span><span style="color: #0000FF;">}</span> <span style="color: #000080;font-style:italic;">-- (0x0B is Vertical Tab, 0xA0 is Non-breaking space)</span> Line 13,942 ⟶ 14,814: <span style="color: #008080;">function</span> <span style="color: #000000;">get_op</span><span style="color: #0000FF;">()</span> <span style="color: #000080;font-style:italic;">-- sequence operator = {ch}</span> <span style="color: #004080;">string</span> <span style="color: #000000;">operator</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">""</span><span style="color: #0000FF;">&</span><span style="color: #000000;">ch</span> <span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span> Line 14,008 ⟶ 14,881: <span style="color: #008080;">return</span> <span style="color: #000000;">toks</span> <span style="color: #008080;">end</span> <span style="color: #008080;">function</span> <!--</~~lang~~syntaxhighlight>--> Optional: if you need human-readable output/input at each (later) stage, so you can use pipes <!--<~~lang~~syntaxhighlight ~~Phix~~lang="phix">--> <span style="color: #000080;font-style:italic;">-- -- demo\rosetta\Compiler\extra.e Line 14,063 ⟶ 14,936: <span style="color: #008080;">return</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">n_type</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">left</span><span style="color: 
#0000FF;">,</span> <span style="color: #000000;">right</span><span style="color: #0000FF;">}</span> <span style="color: #008080;">end</span> <span style="color: #008080;">function</span> <!--</~~lang~~syntaxhighlight>--> Finally, a simple test driver for the specific task: <!--<~~lang~~syntaxhighlight ~~Phix~~lang="phix">(phixonline)--> <span style="color: #000080;font-style:italic;">-- -- demo\rosetta\Compiler\lex.exw Line 14,093 ⟶ 14,966: <span style="color: #000080;font-style:italic;">--main(command_line())</span> <span style="color: #000000;">main</span><span style="color: #0000FF;">({</span><span style="color: #000000;">0</span><span style="color: #0000FF;">,</span><span style="color: #000000;">0</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"test4.c"</span><span style="color: #0000FF;">})</span> <!--</~~lang~~syntaxhighlight>--> {{out}} <pre> Line 14,116 ⟶ 14,989: =={{header\|Prolog}}== <~~lang~~syntaxhighlight lang="prolog">/* Test harness for the analyzer, not needed if we are actually using the output. 
/ Line 14,276 ⟶ 15,149: % anything else is an error tok(_,_,L,P) --> { format(atom(Error), 'Invalid token at line ~d,~d', [L,P]), throw(Error) }.</~~lang~~syntaxhighlight> {{out}} <pre> Line 14,317 ⟶ 15,190: =={{header\|Python}}== Tested with Python 2.7 and 3.x <~~lang~~syntaxhighlight ~~Python~~lang="python">from __future__ import print_function import sys Line 14,398 ⟶ 15,271: #** "string" def string_lit(start, err_line, err_col): global the_ch text = "" Line 14,405 ⟶ 15,279: if the_ch == '\n': error(err_line, err_col, "EOL while scanning string literal") if the_ch == '\\': next_ch() if the_ch != 'n': error(err_line, err_col, "escape sequence unknown \\%c" % the_ch) the_ch = '\n' text += the_ch Line 14,492 ⟶ 15,371: if tok == tk_EOI: break</~~lang~~syntaxhighlight> {{out\|case=test case 3}} Line 14,536 ⟶ 15,415: =={{header\|QB64}}== Tested with QB64 1.5 <~~lang~~syntaxhighlight lang="vb">dim shared source as string, the_ch as string, tok as string, toktyp as string dim shared line_n as integer, col_n as integer, text_p as integer, err_line as integer, err_col as integer, errors as integer Line 14,776 ⟶ 15,655: end end sub </syntaxhighlight> ~~</lang>~~ {{out\|case=test case 3}} <b> Line 14,816 ⟶ 15,695: =={{header\|Racket}}== <~~lang~~syntaxhighlight lang="racket"> #lang racket (require parser-tools/lex) Line 14,972 ⟶ 15,851: "TEST 5" (display-tokens (string->tokens test5)) </syntaxhighlight> ~~</lang>~~ =={{header\|Raku}}== Line 14,982 ⟶ 15,861: {{works with\|Rakudo\|2016.08}} <syntaxhighlight lang="raku" ~~perl6~~line>grammar tiny_C { rule TOP { ^ <.whitespace>? <tokens> + % <.whitespace> <.whitespace> <eoi> } Line 15,075 ⟶ 15,954: my $tokenizer = tiny_C.parse(@ARGS[0].IO.slurp); parse_it( $tokenizer );</~~lang~~syntaxhighlight> {{out\|case=test case 3}} Line 15,121 ⟶ 16,000: <~~lang~~syntaxhighlight lang="ratfor">###################################################################### # # The Rosetta Code scanner in Ratfor 77. 
Line 16,351 ⟶ 17,230: end ######################################################################</~~lang~~syntaxhighlight> {{out}} <pre>$ ratfor77 lex-in-ratfor.r > lex-in-ratfor.f && gfortran -O2 -std=legacy -fcheck=all lex-in-ratfor.f && ./a.out < compiler-tests/primes.t 4 1 Identifier count 4 7 Op_assign 4 9 Integer 1 4 10 Semicolon 5 1 Identifier n 5 3 Op_assign 5 5 Integer 1 5 6 Semicolon 6 1 Identifier limit 6 7 Op_assign 6 9 Integer 100 6 12 Semicolon 7 1 Keyword_while 7 7 LeftParen 7 8 Identifier n 7 10 Op_less 7 12 Identifier limit 7 17 RightParen 7 19 LeftBrace 8 5 Identifier k 8 6 Op_assign 8 7 Integer 3 8 8 Semicolon 9 5 Identifier p 9 6 Op_assign 9 7 Integer 1 9 8 Semicolon 10 5 Identifier n 10 6 Op_assign 10 7 Identifier n 10 8 Op_add 10 9 Integer 2 10 10 Semicolon 11 5 Keyword_while 11 11 LeftParen 11 12 LeftParen 11 13 Identifier k 11 14 Op_multiply 11 15 Identifier k 11 16 Op_lessequal 11 18 Identifier n 11 19 RightParen 11 21 Op_and 11 24 LeftParen 11 25 Identifier p 11 26 RightParen 11 27 RightParen 11 29 LeftBrace 12 9 Identifier p 12 10 Op_assign 12 11 Identifier n 12 12 Op_divide 12 13 Identifier k 12 14 Op_multiply 12 15 Identifier k 12 16 Op_notequal 12 18 Identifier n 12 19 Semicolon 13 9 Identifier k 13 10 Op_assign 13 11 Identifier k 13 12 Op_add 13 13 Integer 2 13 14 Semicolon 14 5 RightBrace 15 5 Keyword_if 15 8 LeftParen 15 9 Identifier p 15 10 RightParen 15 12 LeftBrace 16 9 Keyword_print 16 14 LeftParen 16 15 Identifier n 16 16 Comma 16 18 String " is prime\n" 16 31 RightParen 16 32 Semicolon 17 9 Identifier count 17 15 Op_assign 17 17 Identifier count 17 23 Op_add 17 25 Integer 1 17 26 Semicolon 18 5 RightBrace 19 1 RightBrace 20 1 Keyword_print 20 6 LeftParen 20 7 String "Total primes found: " 20 29 Comma 20 31 Identifier count 20 36 Comma 20 38 String "\n" 20 42 RightParen 20 43 Semicolon 21 1 End_of_input</pre> =={{header\|Scala}}== Line 16,360 ⟶ 17,336: The following code implements a configurable (from a symbol map and keyword 
map provided as parameters) lexical analyzer. <~~lang~~syntaxhighlight lang="scala"> package xyz.hyperreal.rosettacodeCompiler Line 16,621 ⟶ 17,597: } </syntaxhighlight> ~~</lang>~~ =={{header\|Scheme}}== <~~lang~~syntaxhighlight lang="scheme"> (import (scheme base) (scheme char) Line 16,822 ⟶ 17,798: (display-tokens (lexer (cadr (command-line)))) (display "Error: provide program filename\n")) </syntaxhighlight> ~~</lang>~~ {{out}} Line 16,840 ⟶ 17,816: <~~lang~~syntaxhighlight ~~SML~~lang="sml">(------------------------------------------------------------------) ( The Rosetta Code lexical analyzer, in Standard ML. Based on the ATS and the OCaml. The intended compiler is Mlton or Poly/ML; there is Line 17,646 ⟶ 18,622: (* sml-indent-args: 2 ) ( end: ) (------------------------------------------------------------------*)</~~lang~~syntaxhighlight> Line 17,700 ⟶ 18,676: {{libheader\|Wren-fmt}} {{libheader\|Wren-ioutil}} <~~lang~~syntaxhighlight ~~ecmascript~~lang="wren">import "./dynamic" for Enum, Struct, Tuple import "./str" for Char import "./fmt" for Fmt import "./ioutil" for FileUtil import "os" for Process Line 18,049 ⟶ 19,025: lineCount = lines.count initLex.call() process.call()</~~lang~~syntaxhighlight> {{out}} Line 18,091 ⟶ 19,067: =={{header\|Zig}}== <~~lang~~syntaxhighlight lang="zig"> const std = @import("std"); Line 18,500 ⟶ 19,476: return result.items; } </syntaxhighlight> ~~</lang>~~