Compiler/syntax analyzer: Difference between revisions

Line 3,656:

}

</lang>

=={{header|JavaScript}}==

/*

Token: type, value, line, pos

*/

const TokenType = {

Keyword_if: 1, Keyword_else: 2, Keyword_print: 3, Keyword_putc: 4, Keyword_while: 5,

Op_add: 6, Op_and: 7, Op_assign: 8, Op_divide: 9, Op_equal: 10, Op_greater: 11,

Op_greaterequal: 12, Op_less: 13, Op_lessequal: 14, Op_mod: 15, Op_multiply: 16, Op_not: 17,

Op_notequal: 18, Op_or: 19, Op_subtract: 20,

Integer: 21, String: 22, Identifier: 23,

Semicolon: 24, Comma: 25,

LeftBrace: 26, RightBrace: 27,

LeftParen: 28, RightParen: 29,

End_of_input: 99

}

class Lexer {

constructor(source) {

this.source = source

this.pos = 1 // position in line

this.position = 0 // position in source

this.line = 1

this.chr = this.source.charAt(0)

this.keywords = {

"if": TokenType.Keyword_if,

"else": TokenType.Keyword_else,

"print": TokenType.Keyword_print,

"putc": TokenType.Keyword_putc,

"while": TokenType.Keyword_while

}

getNextChar() {

this.pos++

this.position++

if (this.position >= this.source.length) {

this.chr = undefined

return this.chr

}

this.chr = this.source.charAt(this.position)

if (this.chr === '\n') {

this.line++

this.pos = 0

}

return this.chr

}

error(line, pos, message) {

if (line > 0 && pos > 0) {

console.log(message + " in line " + line + ", pos " + pos + "\n")

} else {

console.log(message)

}

process.exit(1)

}

follow(expect, ifyes, ifno, line, pos) {

if (this.getNextChar() === expect) {

this.getNextChar()

return { type: ifyes, value: "", line, pos }

}

if (ifno === TokenType.End_of_input) {

this.error(line, pos, "follow: unrecognized character: (" + this.chr.charCodeAt(0) + ") '" + this.chr + "'")

}

return { type: ifno, value: "", line, pos }

}

div_or_comment(line, pos) {

if (this.getNextChar() !== '*') {

return { type: TokenType.Op_divide, value: "/", line, pos }

}

this.getNextChar()

while (true) {

if (this.chr === '\u0000') {

this.error(line, pos, "EOF in comment")

} else if (this.chr === '*') {

if (this.getNextChar() === '/') {

this.getNextChar()

return this.getToken()

}

} else {

this.getNextChar()

}

char_lit(line, pos) {

let c = this.getNextChar() // skip opening quote

let n = c.charCodeAt(0)

if (c === "\'") {

this.error(line, pos, "empty character constant")

} else if (c === "\\") {

c = this.getNextChar()

if (c == "n") {

n = 10

} else if (c === "\\") {

n = 92

} else {

this.error(line, pos, "unknown escape sequence \\" + c)

}

if (this.getNextChar() !== "\'") {

this.error(line, pos, "multi-character constant")

}

this.getNextChar()

return { type: TokenType.Integer, value: n, line, pos }

}

string_lit(start, line, pos) {

let value = ""

while (this.getNextChar() !== start) {

if (this.chr === undefined) {

this.error(line, pos, "EOF while scanning string literal")

}

if (this.chr === "\n") {

this.error(line, pos, "EOL while scanning string literal")

}

value += this.chr

}

this.getNextChar()

return { type: TokenType.String, value, line, pos }

}

identifier_or_integer(line, pos) {

let is_number = true

let text = ""

while (/\w/.test(this.chr) || this.chr === '_') {

text += this.chr

if (!/\d/.test(this.chr)) {

is_number = false

}

this.getNextChar()

}

if (text === "") {

this.error(line, pos, "identifer_or_integer unrecopgnized character: follow: unrecognized character: (" + this.chr.charCodeAt(0) + ") '" + this.chr + "'")

}

if (/\d/.test(text.charAt(0))) {

if (!is_number) {

this.error(line, pos, "invaslid number: " + text)

}

return { type: TokenType.Integer, value: text, line, pos }

}

if (text in this.keywords) {

return { type: this.keywords[text], value: "", line, pos }

}

return { type: TokenType.Identifier, value: text, line, pos }

}

getToken() {

let pos, line

// Ignore whitespaces

while (/\s/.test(this.chr)) { this.getNextChar() }

line = this.line; pos = this.pos

switch (this.chr) {

case undefined: return { type: TokenType.End_of_input, value: "", line: this.line, pos: this.pos }

case "/": return this.div_or_comment(line, pos)

case "\'": return this.char_lit(line, pos)

case "\"": return this.string_lit(this.chr, line, pos)

case "<": return this.follow("=", TokenType.Op_lessequal, TokenType.Op_less, line, pos)

case ">": return this.follow("=", TokenType.Op_greaterequal, TokenType.Op_greater, line, pos)

case "=": return this.follow("=", TokenType.Op_equal, TokenType.Op_assign, line, pos)

case "!": return this.follow("=", TokenType.Op_notequal, TokenType.Op_not, line, pos)

case "&": return this.follow("&", TokenType.Op_and, TokenType.End_of_input, line, pos)

case "|": return this.follow("|", TokenType.Op_or, TokenType.End_of_input, line, pos)

case "{": this.getNextChar(); return { type: TokenType.LeftBrace, value: "{", line, pos }

case "}": this.getNextChar(); return { type: TokenType.RightBrace, value: "}", line, pos }

case "(": this.getNextChar(); return { type: TokenType.LeftParen, value: "(", line, pos }

case ")": this.getNextChar(); return { type: TokenType.RightParen, value: ")", line, pos }

case "+": this.getNextChar(); return { type: TokenType.Op_add, value: "+", line, pos }

case "-": this.getNextChar(); return { type: TokenType.Op_subtract, value: "-", line, pos }

case "*": this.getNextChar(); return { type: TokenType.Op_multiply, value: "*", line, pos }

case "%": this.getNextChar(); return { type: TokenType.Op_mod, value: "%", line, pos }

case ";": this.getNextChar(); return { type: TokenType.Semicolon, value: ";", line, pos }

case ",": this.getNextChar(); return { type: TokenType.Comma, value: ",", line, pos }

default: return this.identifier_or_integer(line, pos)

}

/*

https://stackoverflow.com/questions/9907419/how-to-get-a-key-in-a-javascript-object-by-its-value

*/

getTokenType(value) {

return Object.keys(TokenType).find(key => TokenType[key] === value)

}

printToken(t) {

//console.log(t.line+" "+t.pos+" "+this.getTokenType(t.type)+" "+t.value)

let result = (" " + t.line).substr(t.line.toString().length)

result += (" " + t.pos).substr(t.pos.toString().length)

result += (" " + this.getTokenType(t.type) + " ").substr(0, 16)

//"%5d %5d %-15s", this.line, this.pos, this.tokentype);

switch (t.type) {

case TokenType.Integer:

result += " " + t.value

break;

case TokenType.Identifier:

result += " " + t.value

break;

case TokenType.String:

result += " \""+ t.value + "\""

break;

}

console.log(result)

}

printTokens() {

let t

while ((t = this.getToken()).type !== TokenType.End_of_input) {

this.printToken(t)

}

this.printToken(t)

}

const fs = require("fs")

fs.readFile(process.argv[2], "utf8", (err, data) => {

l = new Lexer(data)

l.printTokens()

})

</lang>

Line 4,775:

Line 4,992:

Print: Expecting 'LeftParen', found 'Op_subtract'

</pre>

=={{header|PHP}}==

<?php

/*

Token => type, value, line, cpos

*/

class Lexer {

private $source, $cpos, $cposition, $line, $char, $keywords;

const Undefined = -1;

const TokenType = [

'Keyword_if' => 1, 'Keyword_else' => 2, 'Keyword_print' => 3, 'Keyword_putc' => 4, 'Keyword_while' => 5,

'Op_add' => 6, 'Op_and' => 7, 'Op_assign' => 8, 'Op_divide' => 9, 'Op_equal' => 10, 'Op_greater' => 11,

'Op_greaterequal' => 12, 'Op_less' => 13, 'Op_lessequal' => 14, 'Op_mod' => 15, 'Op_multiply' => 16, 'Op_not' => 17,

'Op_notequal' => 18, 'Op_or' => 19, 'Op_subtract' => 20,

'Integer' => 21, 'String' => 22, 'Identifier' => 23,

'Semicolon' => 24, 'Comma' => 25,

'LeftBrace' => 26, 'RightBrace' => 27,

'LeftParen' => 28, 'RightParen' => 29,

'End_of_input' => 99

];

public function __construct($source) {

$this->source = $source;

$this->cpos = 1; // position in line

$this->cposition = 0; // position in source

$this->line = 1;

$this->char = substr($this->source, 0, 1);

$this->keywords = [

'if' => Lexer::TokenType['Keyword_if'],

'else' => Lexer::TokenType['Keyword_else'],

'print' => Lexer::TokenType['Keyword_print'],

'putc' => Lexer::TokenType['Keyword_putc'],

'while' => Lexer::TokenType['Keyword_while']

];

}

private function getNextChar() {

$this->cpos++;

$this->cposition++;

if ($this->cposition >= strlen($this->source)) {

$this->char = Lexer::Undefined;

return $this->char;

}

$this->char = substr($this->source, $this->cposition, 1);

if ($this->char === "\n") {

$this->line++;

$this->cpos = 0;

}

return $this->char;

}

private function error($line, $cpos, $message) {

if ($line > 0 && $cpos > 0) {

echo $message . ' in line ' . $line . ', pos ' . $cpos . '\n';

} else {

echo $message;

}

exit();

}

private function follow($expect, $ifyes, $ifno, $line, $cpos) {

if ($this->getNextChar() === $expect) {

$this->getNextChar();

return [ 'type' => $ifyes, 'value' => '', 'line' => $line, 'cpos' => $cpos ];

}

if ($ifno === Lexer::TokenType['End_of_input']) {

$this->error($line, $cpos, 'follow: unrecognized character = (' . substr($this->char, 0, 1) . ') "'. $this->char . '"');

}

return [ 'type' => $ifno, 'value' => '', 'line' => $line, 'cpos' => $cpos ];

}

private function div_or_comment($line, $cpos) {

if ($this->getNextChar() !== '*') {

return [ 'type' => Lexer::TokenType['Op_divide'], 'value' => '/', 'line' => $line, 'cpos' => $cpos ];

}

$this->getNextChar();

while (true) {

if ($this->char === Lexer::Undefined) {

$this->error($line, $cpos, 'EOF in comment');

} else if ($this->char === '*') {

if ($this->getNextChar() === '/') {

$this->getNextChar();

return $this->getToken();

}

} else {

$this->getNextChar();

}

private function char_lit($line, $cpos) {

$c = $this->getNextChar(); // skip opening quote

$n = mb_ord($c);

if ($c === "\'") {

$this->error(line, $cpos, 'empty character constant');

} else if ($c === '\\') {

$c = $this->getNextChar();

if ($c == 'n') {

$n = 10;

} else if ($c === "\\") {

$n = 92;

} else {

$this->error($line, $cpos, 'unknown escape sequence \\' . c);

}

if ($this->getNextChar() !== '\'') {

$this->error($line, $cpos, 'multi-character constant');

}

$this->getNextChar();

return [ 'type' => Lexer::TokenType['Integer'], 'value' => $n, 'line' => $line, 'cpos' => $cpos ];

}

private function String_lit($start, $line, $cpos) {

$value = '';

while ($this->getNextChar() !== $start) {;

if ($this->char === Lexer::Undefined) {

$this->error($line, $cpos, 'EOF while scanning String literal');

}

if ($this->char === "\n") {

$this->error($line, $cpos, 'EOL while scanning String literal');

}

$value .= $this->char;

}

$this->getNextChar();

return [ 'type' => Lexer::TokenType['String'], 'value' => $value, 'line' => $line, 'cpos' => $cpos ];

}

private function identifier_or_integer($line, $cpos) {

$is_number = true;

$text = '';

while (preg_match('/\w/', $this->char) || $this->char === '_') {

$text .= $this->char;

if (!preg_match('/\d/', $this->char)) {

$is_number = false;

}

$this->getNextChar();

}

if ($text === '') {

$this->error($line, $cpos, 'identifer_or_integer: unrecopgnized character: (' . mb_ord($this->char) . ') "' . $this->char . '"');

}

if (preg_match('/\d/', substr($text, 0, 1))) {

if (!$is_number) {

$this->error($line, $cpos, 'invaslid number => ' . $text);

}

return [ 'type' => Lexer::TokenType['Integer'], 'value' => $text, 'line' => $line, 'cpos' => $cpos ];

}

if (array_key_exists($text, $this->keywords)) {

return [ 'type' => $this->keywords[$text], 'value' => '', 'line' => $line, 'cpos' => $cpos ];

}

return [ 'type' => Lexer::TokenType['Identifier'], 'value' => $text, 'line' => $line, 'cpos' => $cpos ];

}

private function getToken() {

// Ignore whitespaces

while (preg_match('/\s/', $this->char)) { $this->getNextChar(); }

$line = $this->line; $cpos = $this->cpos;

switch ($this->char) {

case Lexer::Undefined: return [ 'type' => Lexer::TokenType['End_of_input'], 'value' => '', 'line' => $this->line, 'cpos' => $this->cpos ];

case '/': return $this->div_or_comment($line, $cpos);

case '\'': return $this->char_lit($line, $cpos);

case "\"": return $this->String_lit($this->char, $line, $cpos);

case '<': return $this->follow('=', Lexer::TokenType['Op_lessequal'], Lexer::TokenType['Op_less'], $line, $cpos);

case '>': return $this->follow('=', Lexer::TokenType['Op_greaterequal'], Lexer::TokenType['Op_greater'], $line, $cpos);

case '=': return $this->follow('=', Lexer::TokenType['Op_equal'], Lexer::TokenType['Op_assign'], $line, $cpos);

case '!': return $this->follow('=', Lexer::TokenType['Op_notequal'], Lexer::TokenType['Op_not'], $line, $cpos);

case '&': return $this->follow('&', Lexer::TokenType['Op_and'], Lexer::TokenType['End_of_input'], $line, $cpos);

case '|': return $this->follow('|', Lexer::TokenType['Op_or'], Lexer::TokenType['End_of_input'], $line, $cpos);

case '{': $this->getNextChar(); return [ 'type' => Lexer::TokenType['LeftBrace'], 'value' => '{', 'line' => $line, 'cpos' => $cpos ];

case '}': $this->getNextChar(); return [ 'type' => Lexer::TokenType['RightBrace'], 'value' => '}', 'line' => $line, 'cpos' => $cpos ];

case '(': $this->getNextChar(); return [ 'type' => Lexer::TokenType['LeftParen'], 'value' => '(', 'line' => $line, 'cpos' => $cpos ];

case ')': $this->getNextChar(); return [ 'type' => Lexer::TokenType['RightParen'], 'value' => ')', 'line' => $line, 'cpos' => $cpos ];

case '+': $this->getNextChar(); return [ 'type' => Lexer::TokenType['Op_add'], 'value' => '+', 'line' => $line, 'cpos' => $cpos ];

case '-': $this->getNextChar(); return [ 'type' => Lexer::TokenType['Op_subtract'], 'value' => '-', 'line' => $line, 'cpos' => $cpos ];

case '*': $this->getNextChar(); return [ 'type' => Lexer::TokenType['Op_multiply'], 'value' => '*', 'line' => $line, 'cpos' => $cpos ];

case '%': $this->getNextChar(); return [ 'type' => Lexer::TokenType['Op_mod'], 'value' => '%', 'line' => $line, 'cpos' => $cpos ];

case ';': $this->getNextChar(); return [ 'type' => Lexer::TokenType['Semicolon'], 'value' => ';', 'line' => $line, 'cpos' => $cpos ];

case ',': $this->getNextChar(); return [ 'type' => Lexer::TokenType['Comma'], 'value' => ',', 'line' => $line, 'cpos' => $cpos ];

default: return $this->identifier_or_integer($line, $cpos);

}

private function getTokenType($value) {

return array_search($value, Lexer::TokenType);

}

private function printToken($t) {

$result = substr(' ' . $t['line'], strlen(strval($t['line'])));

$result .= substr(' ' . $t['cpos'], strlen(strval($t['cpos'])));

$result .= substr(' ' . $this->getTokenType($t['type']) . ' ', 0, 16);

switch ($t['type']) {

case Lexer::TokenType['Integer']:

$result .= ' ' . $t['value'];

break;

case Lexer::TokenType['Identifier']:

$result .= ' ' . $t['value'];

break;

case Lexer::TokenType['String']:

$result .= " \"". $t['value'] . "\"";

break;

}

echo $result."\n";

}

public function printTokens() {

$t = '';

while (($t = $this->getToken())['type'] !== Lexer::TokenType['End_of_input']) {

$this->printToken($t);

}

$this->printToken($t);

}

$file = file_get_contents($argv[1]);

$l = new Lexer($file);

$l->printTokens();

</lang>

=={{header|Python}}==