Compiler/lexical analyzer: Difference between revisions

Added another Nim example, using only system and a bit of strutils
m (removed some debug stuff)
(Added another Nim example, using only system and a bit of strutils)
Line 5,142:
echo &"({l.lineNumber},{l.getColNumber l.bufpos + 1}) {l.error}"
main()
</lang>
 
===Using nothing but system and strutils===
<lang nim>
import strutils
 
type
TokenKind* = enum
tokMult = "Op_multiply", tokDiv = "Op_divide", tokMod = "Op_mod",
tokAdd = "Op_add", tokSub = "Op_subtract", tokLess = "Op_less",
tokLessEq = "Op_lessequal", tokGreater = "Op_greater",
tokGreaterEq = "Op_greaterequal", tokEq = "Op_equal",
tokNotEq = "Op_notequal", tokNot = "Op_not", tokAssign = "Op_assign",
tokAnd = "Op_and", tokOr = "Op_or"
tokLPar = "LeftParen", tokRPar = "RightParen"
tokLBrace = "LeftBrace", tokRBrace = "RightBrace"
tokSemi = "Semicolon", tokComma = "Comma"
tokIf = "Keyword_if", tokElse = "Keyword_else", tokWhile = "Keyword_while",
tokPrint = "Keyword_print", tokPutc = "Keyword_putc"
tokIdent = "Identifier", tokInt = "Integer", tokChar = "Integer",
tokString = "String"
tokEnd = "End_of_input"
Token* = object
ln*, col*: int
case kind*: TokenKind
of tokIdent: ident*: string
of tokInt: intVal*: int
of tokChar: charVal*: char
of tokString: stringVal*: string
else: discard
Lexer* = object
input: string
pos: int
ln, col: int
LexicalError* = object of CatchableError
ln*, col*: int
 
proc error(lexer: var Lexer, message: string) =
var err = newException(LexicalError, message)
err.ln = lexer.ln
err.col = lexer.col
 
template current: char =
if lexer.pos < lexer.input.len: lexer.input[lexer.pos]
else: '\x00'
template get(n: int): string =
if lexer.pos < lexer.input.len:
lexer.input[min(lexer.pos, lexer.input.len)..
min(lexer.pos + n - 1, lexer.input.len)]
else: ""
 
template next() =
inc(lexer.pos); inc(lexer.col)
if current() == '\n':
inc(lexer.ln)
lexer.col = 0
elif current() == '\r':
lexer.col = 0
 
proc skip(lexer: var Lexer) =
while true:
if current() in Whitespace:
while current() in Whitespace:
next()
continue
elif get(2) == "/*":
next(); next()
while get(2) != "*/":
if current() == '\x00':
lexer.error("Unterminated comment")
next()
next(); next()
continue
else: discard
break
 
proc charOrEscape(lexer: var Lexer): char =
if current() != '\\':
result = current()
next()
else:
next()
case current()
of 'n': result = '\n'
of '\\': result = '\\'
else: lexer.error("Unknown escape sequence '\\" & current() & "'")
next()
 
proc next*(lexer: var Lexer): Token =
let
ln = lexer.ln
col = lexer.col
 
case current()
of '*': result = Token(kind: tokMult); next()
of '/': result = Token(kind: tokDiv); next()
of '%': result = Token(kind: tokMod); next()
of '+': result = Token(kind: tokAdd); next()
of '-': result = Token(kind: tokSub); next()
of '<':
next()
if current() == '=': result = Token(kind: tokLessEq)
else: result = Token(kind: tokLess)
of '>':
next()
if current() == '=': result = Token(kind: tokGreaterEq)
else: result = Token(kind: tokGreater)
of '=':
next()
if current() == '=': result = Token(kind: tokEq)
else: result = Token(kind: tokAssign)
of '!':
next()
if current() == '=': result = Token(kind: tokNotEq)
else: result = Token(kind: tokNot)
of '&':
next()
if current() == '&': result = Token(kind: tokAnd)
else: lexer.error("'&&' expected")
of '|':
next()
if current() == '|': result = Token(kind: tokOr)
else: lexer.error("'||' expected")
of '(': result = Token(kind: tokLPar); next()
of ')': result = Token(kind: tokRPar); next()
of '{': result = Token(kind: tokLBrace); next()
of '}': result = Token(kind: tokRBrace); next()
of ';': result = Token(kind: tokSemi); next()
of ',': result = Token(kind: tokComma); next()
of '\'':
next()
if current() == '\'': lexer.error("Empty character literal")
let ch = lexer.charOrEscape()
if current() != '\'':
lexer.error("Character literal must contain a single character or " &
"escape sequence")
result = Token(kind: tokChar, charVal: ch)
of '0'..'9':
var number = ""
while current() in Digits:
number.add(current())
next()
if current() in IdentStartChars:
lexer.error("Integer literal ends in non-digit characters")
result = Token(kind: tokInt, intVal: parseInt(number))
of '"':
next()
var str = ""
while current() notin {'"', '\x00', '\n'}:
str.add(lexer.charOrEscape())
if current() == '\x00':
lexer.error("Unterminated string literal")
elif current() == '\n':
lexer.error("Line feed in string literal")
else:
next()
result = Token(kind: tokString, stringVal: str)
of IdentStartChars:
var ident = $current()
next()
while current() in IdentChars:
ident.add(current())
next()
case ident
of "if": result = Token(kind: tokIf)
of "else": result = Token(kind: tokElse)
of "while": result = Token(kind: tokWhile)
of "print": result = Token(kind: tokPrint)
of "putc": result = Token(kind: tokPutc)
else: result = Token(kind: tokIdent, ident: ident)
of '\x00':
result = Token(kind: tokEnd)
else:
lexer.error("Unexpected character: '" & current() & "'")
 
result.ln = ln
result.col = col
lexer.skip()
 
proc peek*(lexer: var Lexer): Token =
discard
 
proc initLexer*(input: string): Lexer =
result = Lexer(input: input, pos: 0, ln: 1, col: 1)
result.skip()
 
when isMainModule:
const Code = """
/*
Hello world
*/
print("Hello, World!\n");
"""
var
lexer = initLexer(Code)
token: Token
while true:
token = lexer.next()
echo token
if token.kind == tokEnd:
break
</lang>
 
Anonymous user