Compiler/lexical analyzer: Difference between revisions
Content deleted Content added
m removed some debug stuff |
Added another Nim example, using only system and a bit of strutils |
||
Line 5,142: | Line 5,142: | ||
echo &"({l.lineNumber},{l.getColNumber l.bufpos + 1}) {l.error}" |
echo &"({l.lineNumber},{l.getColNumber l.bufpos + 1}) {l.error}" |
||
main() |
main() |
||
</lang> |
|||
===Using nothing but system and strutils=== |
|||
<lang nim> |
|||
import strutils |
|||
type
  TokenKind* = enum
    ## Token categories; each member's string value is the display name
    ## printed when a token is echoed.
    tokMult = "Op_multiply", tokDiv = "Op_divide", tokMod = "Op_mod",
    tokAdd = "Op_add", tokSub = "Op_subtract", tokLess = "Op_less",
    tokLessEq = "Op_lessequal", tokGreater = "Op_greater",
    tokGreaterEq = "Op_greaterequal", tokEq = "Op_equal",
    tokNotEq = "Op_notequal", tokNot = "Op_not", tokAssign = "Op_assign",
    tokAnd = "Op_and", tokOr = "Op_or"
    tokLPar = "LeftParen", tokRPar = "RightParen"
    tokLBrace = "LeftBrace", tokRBrace = "RightBrace"
    tokSemi = "Semicolon", tokComma = "Comma"
    tokIf = "Keyword_if", tokElse = "Keyword_else", tokWhile = "Keyword_while",
    tokPrint = "Keyword_print", tokPutc = "Keyword_putc"
    # NOTE(review): tokChar deliberately shares tokInt's "Integer" display
    # name — presumably so character literals are reported as integers;
    # confirm against the task specification.
    tokIdent = "Identifier", tokInt = "Integer", tokChar = "Integer",
    tokString = "String"
    tokEnd = "End_of_input"

  Token* = object
    ## One lexed token, tagged with the source position where it started.
    ln*, col*: int
    case kind*: TokenKind
    of tokIdent: ident*: string
    of tokInt: intVal*: int
    of tokChar: charVal*: char
    of tokString: stringVal*: string
    else: discard

  Lexer* = object
    ## Lexer state: the whole input plus the current scan position.
    input: string
    pos: int
    ln, col: int

  LexicalError* = object of CatchableError
    ## Error type for malformed input; carries the offending position.
    ln*, col*: int
proc error(lexer: var Lexer, message: string) =
  ## Reports a lexical error at the lexer's current position by raising
  ## a LexicalError annotated with the line/column.
  ## BUG FIX: the exception was constructed and populated but never
  ## raised, so every lexical error was silently swallowed and callers
  ## fell through with an unset `result` token.
  var err = newException(LexicalError, message)
  err.ln = lexer.ln
  err.col = lexer.col
  raise err
template current: char =
  ## The character at the current position, or '\x00' once the input is
  ## exhausted. Relies on a `lexer` variable being in scope at the
  ## instantiation site (open symbol binding).
  if lexer.pos >= lexer.input.len: '\x00'
  else: lexer.input[lexer.pos]
template get(n: int): string =
  ## Up to `n` characters starting at the current position ("" at EOF).
  ## Relies on a `lexer` variable in scope at the instantiation site.
  ## BUG FIX: the slice's upper bound was clamped to `input.len` instead
  ## of `input.high`, raising an IndexDefect whenever fewer than `n`
  ## characters remained (e.g. a lone "/*" at the very end of the input).
  if lexer.pos < lexer.input.len:
    lexer.input[lexer.pos .. min(lexer.pos + n - 1, lexer.input.high)]
  else:
    ""
template next() =
  ## Advances one character while maintaining the line/column counters.
  ## Relies on a `lexer` variable in scope at the instantiation site.
  lexer.pos += 1
  lexer.col += 1
  case current()
  of '\n':
    # Landed on a line feed: next visible character starts a new line.
    lexer.ln += 1
    lexer.col = 0
  of '\r':
    lexer.col = 0
  else:
    discard
proc skip(lexer: var Lexer) =
  ## Consumes whitespace and /* ... */ comments until the next real
  ## character (or end of input). Reports unterminated comments.
  while true:
    if current() in Whitespace:
      while current() in Whitespace:
        next()
    elif get(2) == "/*":
      next(); next()                     # step past the opening "/*"
      while get(2) != "*/":
        if current() == '\x00':
          lexer.error("Unterminated comment")
        next()
      next(); next()                     # step past the closing "*/"
    else:
      break
proc charOrEscape(lexer: var Lexer): char =
  ## Reads a single input character, decoding the two supported escape
  ## sequences (\n and \\). Always advances past what it consumed;
  ## unknown escapes are reported via `error`.
  if current() == '\\':
    next()                               # step past the backslash
    case current()
    of 'n': result = '\n'
    of '\\': result = '\\'
    else: lexer.error("Unknown escape sequence '\\" & current() & "'")
  else:
    result = current()
  next()                                 # step past the (escaped) char
proc next*(lexer: var Lexer): Token =
  ## Scans, consumes and returns the next token; yields a tokEnd token at
  ## end of input. Malformed input is reported through `error`.
  ##
  ## BUG FIXES:
  ##  * two-character operators (<=, >=, ==, !=, &&, ||) never consumed
  ##    their second character, so e.g. "<=" lexed as Op_lessequal
  ##    followed by a spurious Op_assign;
  ##  * character literals never consumed their closing quote, which was
  ##    then re-lexed as the start of an empty character literal.
  let
    ln = lexer.ln
    col = lexer.col
  case current()
  of '*': result = Token(kind: tokMult); next()
  of '/': result = Token(kind: tokDiv); next()
  of '%': result = Token(kind: tokMod); next()
  of '+': result = Token(kind: tokAdd); next()
  of '-': result = Token(kind: tokSub); next()
  of '<':
    next()
    if current() == '=':
      result = Token(kind: tokLessEq)
      next()                             # consume the '='
    else:
      result = Token(kind: tokLess)
  of '>':
    next()
    if current() == '=':
      result = Token(kind: tokGreaterEq)
      next()                             # consume the '='
    else:
      result = Token(kind: tokGreater)
  of '=':
    next()
    if current() == '=':
      result = Token(kind: tokEq)
      next()                             # consume the second '='
    else:
      result = Token(kind: tokAssign)
  of '!':
    next()
    if current() == '=':
      result = Token(kind: tokNotEq)
      next()                             # consume the '='
    else:
      result = Token(kind: tokNot)
  of '&':
    next()
    if current() == '&':
      result = Token(kind: tokAnd)
      next()                             # consume the second '&'
    else:
      lexer.error("'&&' expected")
  of '|':
    next()
    if current() == '|':
      result = Token(kind: tokOr)
      next()                             # consume the second '|'
    else:
      lexer.error("'||' expected")
  of '(': result = Token(kind: tokLPar); next()
  of ')': result = Token(kind: tokRPar); next()
  of '{': result = Token(kind: tokLBrace); next()
  of '}': result = Token(kind: tokRBrace); next()
  of ';': result = Token(kind: tokSemi); next()
  of ',': result = Token(kind: tokComma); next()
  of '\'':
    next()                               # past the opening quote
    if current() == '\'': lexer.error("Empty character literal")
    let ch = lexer.charOrEscape()
    if current() != '\'':
      lexer.error("Character literal must contain a single character or " &
                  "escape sequence")
    next()                               # past the closing quote
    result = Token(kind: tokChar, charVal: ch)
  of '0'..'9':
    var number = ""
    while current() in Digits:
      number.add(current())
      next()
    # A digit run immediately followed by an identifier character
    # (e.g. "12abc") is one malformed literal, not two tokens.
    if current() in IdentStartChars:
      lexer.error("Integer literal ends in non-digit characters")
    result = Token(kind: tokInt, intVal: parseInt(number))
  of '"':
    next()                               # past the opening quote
    var str = ""
    while current() notin {'"', '\x00', '\n'}:
      str.add(lexer.charOrEscape())
    if current() == '\x00':
      lexer.error("Unterminated string literal")
    elif current() == '\n':
      lexer.error("Line feed in string literal")
    else:
      next()                             # past the closing quote
    result = Token(kind: tokString, stringVal: str)
  of IdentStartChars:
    var ident = $current()
    next()
    while current() in IdentChars:
      ident.add(current())
      next()
    case ident
    of "if": result = Token(kind: tokIf)
    of "else": result = Token(kind: tokElse)
    of "while": result = Token(kind: tokWhile)
    of "print": result = Token(kind: tokPrint)
    of "putc": result = Token(kind: tokPutc)
    else: result = Token(kind: tokIdent, ident: ident)
  of '\x00':
    result = Token(kind: tokEnd)
  else:
    lexer.error("Unexpected character: '" & current() & "'")
  result.ln = ln
  result.col = col
  lexer.skip()
proc peek*(lexer: var Lexer): Token =
  ## Returns the next token without consuming it.
  ## BUG FIX: the body was a stub (`discard`) that always returned a
  ## default-initialized token. Because Lexer is a value object, the
  ## state can be snapshotted, the token lexed, and the state restored.
  let saved = lexer
  result = lexer.next()
  lexer = saved
proc initLexer*(input: string): Lexer =
  ## Creates a lexer over `input`, already positioned at the first real
  ## token (leading whitespace and comments skipped).
  result = Lexer(input: input, pos: 0, ln: 1, col: 1)
  result.skip()
when isMainModule:
  # Demo driver: lex a small sample program and echo each token until
  # the end-of-input token is produced.
  const Code = """
/*
Hello world
*/
print("Hello, World!\n");
"""
  var lexer = initLexer(Code)
  while true:
    let token = lexer.next()
    echo token
    if token.kind == tokEnd:
      break
</lang> |
||