Compiler/virtual machine interpreter

Revision as of 16:42, 22 October 2016 by Ed Davis (talk | contribs) (Created page with "{{draft task}} A code generator translates the output of the syntax analyzer and/or semantic analyzer into lower level code, either assembly, object, or virtual. {{task head...")
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)

A code generator translates the output of the syntax analyzer and/or semantic analyzer into lower level code, either assembly, object, or virtual.

Compiler/virtual machine interpreter is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.

Take the output of the Syntax analyzer task, and convert it to virtual machine code, than can be run by the Virtual machine interpreter.

The program should read input from a file and/or stdin, and write output to a file and/or stdout.

Input format: See Syntax analyzer output

Output format:

This is a byte-code, 32-bit word stack based virtual machine.

Registers:

sp:

   the stack pointer - points to the next top of stack.  The stack is a 32-bit integer
   array.

pc:

   the program counter - points to the current instruction to be performed.  The code is an
   array of bytes.

Data:

   data
   string pool

Instructions:

Each instruction is one byte. The following instructions also have a 32-bit integer operand:

FETCH:

   load the word immediately following the opcode to the stack.  Increment the stack
   pointer. increment pc by 1 (for the opcode) + 4 (for the size of a 32-bit word).

STORE:

   decrement the stack pointer. Get the word following the opcode, use it as an index
   into data, and store the top of stack value there. increment pc by 1 (for the opcode)
   + 4 (for the size of a 32-bit word).

PUSH:

   Get the word following the opcode, store it to the top of stack.  Increment the stack
   pointer. increment pc by 1 (for the opcode) + 4 (for the size of a 32-bit word).

JMP:

   Get the word following the opcode, and add it to pc.  The word is signed, so pc can
   become greater or lessor.

JZ:

   decrement the stack pointer.  If the stack top is 0, perform a jump, as per the JMP
   instruction.  Otherwise, increment pc by 1 (for the opcode) + 4 (for the size of a
   32-bit word).

The following instructions do not have an operand. They perform their operation directly against the stack:

For the following instructions, there are two operands on the stack. Decrement the stack pointer, and then pop ADD: SUB: MUL: DIV: LT: GT: LE: NE: AND:

NEG: PRTC: PRTS: PRTI: HALT:


C

Tested with gcc 4.81 and later, compiles warning free with -Wall -Wextra <lang C>#include <stdio.h>

  1. include <stdlib.h>
  2. include <stdarg.h>
  3. include <string.h>
  4. include <stdint.h>
  5. include <ctype.h>
  1. define NELEMS(arr) (sizeof(arr) / sizeof(arr[0]))
  1. define da_dim(name, type) type *name = NULL; \
                           int _qy_ ## name ## _p = 0;  \
                           int _qy_ ## name ## _max = 0
  1. define da_redim(name) do {if (_qy_ ## name ## _p >= _qy_ ## name ## _max) \
                               name = realloc(name, (_qy_ ## name ## _max += 32) * sizeof(name[0]));} while (0)
  1. define da_rewind(name) _qy_ ## name ## _p = 0
  1. define da_append(name, x) do {da_redim(name); name[_qy_ ## name ## _p++] = x;} while (0)

typedef unsigned char uchar; typedef uchar code;

typedef enum { FETCH, STORE, PUSH, ADD, SUB, MUL, DIV, MOD, LT, GT, LE, GE, EQ, NE, AND,

   OR, NEG, NOT, JMP, JZ, PRTC, PRTS, PRTI, HALT

} Code_t;

typedef struct Code_map {

   char    *text;
   Code_t   op;

} Code_map;

Code_map code_map[] = {

   {"fetch",  FETCH},
   {"store",  STORE},
   {"push",   PUSH },
   {"add",    ADD  },
   {"sub",    SUB  },
   {"mul",    MUL  },
   {"div",    DIV  },
   {"mod",    MOD  },
   {"lt",     LT   },
   {"gt",     GT   },
   {"le",     LE   },
   {"ge",     GE   },
   {"eq",     EQ   },
   {"ne",     NE   },
   {"and",    AND  },
   {"or",     OR   },
   {"neg",    NEG  },
   {"not",    NOT  },
   {"jmp",    JMP  },
   {"jz",     JZ   },
   {"prtc",   PRTC },
   {"prts",   PRTS },
   {"prti",   PRTI },
   {"halt",   HALT },

};

FILE *source_fp; da_dim(object, code);

void error(const char *fmt, ... ) {

   va_list ap;
   char buf[1000];
   va_start(ap, fmt);
   vsprintf(buf, fmt, ap);
   va_end(ap);
   printf("error: %s\n", buf);
   exit(1);

}

/*** Virtual Machine interpreter ***/ void run_vm(const code obj[], int32_t data[], int g_size, char **string_pool) {

   int32_t *sp = &data[g_size + 1];
   const code *pc = obj;
   again:
   switch (*pc++) {
       case FETCH: *sp++ = data[*(int32_t *)pc];  pc += sizeof(int32_t); goto again;
       case STORE: data[*(int32_t *)pc] = *--sp;  pc += sizeof(int32_t); goto again;
       case PUSH:  *sp++ = *(int32_t *)pc;        pc += sizeof(int32_t); goto again;
       case ADD:   sp[-2] += sp[-1]; --sp;                             goto again;
       case SUB:   sp[-2] -= sp[-1]; --sp;                             goto again;
       case MUL:   sp[-2] *= sp[-1]; --sp;                             goto again;
       case DIV:   sp[-2] /= sp[-1]; --sp;                             goto again;
       case MOD:   sp[-2] %= sp[-1]; --sp;                             goto again;
       case LT:    sp[-2] = sp[-2] <  sp[-1]; --sp;                    goto again;
       case GT:    sp[-2] = sp[-2] >  sp[-1]; --sp;                    goto again;
       case LE:    sp[-2] = sp[-2] <= sp[-1]; --sp;                    goto again;
       case GE:    sp[-2] = sp[-2] >= sp[-1]; --sp;                    goto again;
       case EQ:    sp[-2] = sp[-2] == sp[-1]; --sp;                    goto again;
       case NE:    sp[-2] = sp[-2] != sp[-1]; --sp;                    goto again;
       case AND:   sp[-2] = sp[-2] && sp[-1]; --sp;                    goto again;
       case OR:    sp[-2] = sp[-2] || sp[-1]; --sp;                    goto again;
       case NEG:   sp[-1] = -sp[-1];                                   goto again;
       case NOT:   sp[-1] = !sp[-1];                                   goto again;
       case JMP:   pc += *(int32_t *)pc;                               goto again;
       case JZ:    pc += (*--sp == 0) ? *(int32_t *)pc : (int32_t)sizeof(int32_t); goto again;
       case PRTC:  printf("%c", sp[-1]); --sp;                         goto again;
       case PRTS:  printf("%s", string_pool[sp[-1]]); --sp;            goto again;
       case PRTI:  printf("%d", sp[-1]); --sp;                         goto again;
       case HALT:                                                      break;
       default:    error("Unknown opcode %d\n", *(pc - 1));
   }

}

char *read_line(int *len) {

   static char *text = NULL;
   static int textmax = 0;
   for (*len = 0; ; (*len)++) {
       int ch = fgetc(source_fp);
       if (ch == EOF || ch == '\n') {
           if (*len == 0)
               return NULL;
           break;
       }
       if (*len + 1 >= textmax) {
           textmax = (textmax == 0 ? 128 : textmax * 2);
           text = realloc(text, textmax);
       }
       text[*len] = ch;
   }
   text[*len] = '\0';
   return text;

}

char *rtrim(char *text, int *len) { // remove trailing spaces

   for (; *len > 0 && isspace(text[*len - 1]); --(*len))
       ;
   text[*len] = '\0';
   return text;

}

char *translate(char *st) {

   char *p, *q;
   if (st[0] == '"')                       // skip leading " if there
       ++st;
   p = q = st;
   while ((*p++ = *q++) != '\0') {
       if (q[-1] == '\\' && q[0] == 'n') {
           p[-1] = '\n';
           ++q;
       }
       if (q[0] == '"' && q[1] == '\0')    // skip trialing " if there
           ++q;
   }
   return st;

}

/* convert an opcode string into its byte value */ int findit(const char text[], int offset) {

   for (size_t i = 0; i < sizeof(code_map) / sizeof(code_map[0]); i++) {
       if (strcmp(code_map[i].text, text) == 0)
           return code_map[i].op;
   }
   error("Unknown instruction %s at %d\n", text, offset);
   return -1;

}

void emit_byte(int c) {

   da_append(object, (uchar)c);

}

void emit_int(int32_t n) {

   union {
       int32_t n;
       unsigned char c[sizeof(int32_t)];
   } x;
   x.n = n;
   for (size_t i = 0; i < sizeof(x.n); ++i) {
       emit_byte(x.c[i]);
   }

}

/* Datasize: 5 Strings: 3 " is prime\n" "Total primes found: " "\n"

154 jmp    (-73) 82
164 jz     (32) 197
175 push  0
159 fetch [4]
149 store [3]
*/

/* Load code into global array object, return the string pool and data size */ char **load_code(int *ds) {

   int line_len, n_strings;
   char **string_pool;
   char *text = read_line(&line_len);
   text = rtrim(text, &line_len);
   strtok(text, " ");                      // skip "Datasize:"
   *ds = atoi(strtok(NULL, " "));          // get actual data_size
   strtok(NULL, " ");                      // skip "Strings:"
   n_strings = atoi(strtok(NULL, " "));    // get number of strings
   string_pool = malloc(n_strings * sizeof(char *));
   for (int i = 0; i < n_strings; ++i) {
       text = read_line(&line_len);
       text = rtrim(text, &line_len);
       text = translate(text);
       string_pool[i] = strdup(text);
   }
   for (;;) {
       int len;
       text = read_line(&line_len);
       if (text == NULL)
           break;
       text = rtrim(text, &line_len);
       int offset = atoi(strtok(text, " "));   // get the offset
       char *instr = strtok(NULL, " ");    // get the instruction
       int opcode = findit(instr, offset);
       emit_byte(opcode);
       char *operand = strtok(NULL, " ");
       switch (opcode) {
           case JMP: case JZ:
               operand++;                  // skip the '('
               len = strlen(operand);
               operand[len - 1] = '\0';    // remove the ')'
               emit_int(atoi(operand));
               break;
           case PUSH:
               emit_int(atoi(operand));
               break;
           case FETCH: case STORE:
               operand++;                  // skip the '['
               len = strlen(operand);
               operand[len - 1] = '\0';    // remove the ']'
               emit_int(atoi(operand));
               break;
       }
   }
   return string_pool;

}

void init_io(FILE **fp, FILE *std, const char mode[], const char fn[]) {

   if (fn[0] == '\0')
       *fp = std;
   else if ((*fp = fopen(fn, mode)) == NULL)
       error(0, 0, "Can't open %s\n", fn);

}

int main(int argc, char *argv[]) {

   init_io(&source_fp, stdin,  "r",  argc > 1 ? argv[1] : "");
   int data_size;
   char **string_pool = load_code(&data_size);
   int data[1000 + data_size];
   run_vm(object, data, data_size, string_pool);

}</lang>

Python

Tested with Python 2.7 and 3.x <lang Python>from __future__ import print_function import sys, struct

FETCH, STORE, PUSH, ADD, SUB, MUL, DIV, MOD, LT, GT, LE, GE, EQ, NE, AND, OR, NEG, NOT, \ JMP, JZ, PRTC, PRTS, PRTI, HALT = range(24)

code_map = {

   "fetch": FETCH,
   "store": STORE,
   "push":  PUSH,
   "add":   ADD,
   "sub":   SUB,
   "mul":   MUL,
   "div":   DIV,
   "mod":   MOD,
   "lt":    LT,
   "gt":    GT,
   "le":    LE,
   "ge":    GE,
   "eq":    EQ,
   "ne":    NE,
   "and":   AND,
   "or":    OR,
   "not":   NOT,
   "neg":   NEG,
   "jmp":   JMP,
   "jz":    JZ,
   "prtc":  PRTC,
   "prts":  PRTS,
   "prti":  PRTI,
   "halt":  HALT

}

input_file = None code = bytearray() string_pool = [] word_size = 4

        • show error and exit

def error(msg):

   print("%s" % (msg))
   exit(1)

def int_to_bytes(val):

   return struct.pack("<i", val)

def bytes_to_int(bstr):

   return struct.unpack("<i", bstr)

def emit_byte(x):

   code.append(x)

def emit_word(x):

   s = int_to_bytes(x)
   for x in s:
       code.append(x)

def run_vm(data_size):

   stack = [0 for i in range(data_size + 1)]
   pc = 0
   while True:
       op = code[pc]
       pc += 1
       if op == FETCH:
           stack.append(stack[bytes_to_int(code[pc:pc+word_size])[0]]);
           pc += word_size
       elif op == STORE:
           stack[bytes_to_int(code[pc:pc+word_size])[0]] = stack.pop();
           pc += word_size
       elif op == PUSH:
           stack.append(bytes_to_int(code[pc:pc+word_size])[0]);
           pc += word_size
       elif op == ADD:   stack[-2] += stack[-1]; stack.pop()
       elif op == SUB:   stack[-2] -= stack[-1]; stack.pop()
       elif op == MUL:   stack[-2] *= stack[-1]; stack.pop()
       # use C like division semantics
       elif op == DIV:   stack[-2] = int(float(stack[-2]) / stack[-1]); stack.pop()
       elif op == MOD:   stack[-2] = int(float(stack[-2]) % stack[-1]); stack.pop()
       elif op == LT:    stack[-2] = stack[-2] <  stack[-1]; stack.pop()
       elif op == GT:    stack[-2] = stack[-2] >  stack[-1]; stack.pop()
       elif op == LE:    stack[-2] = stack[-2] <= stack[-1]; stack.pop()
       elif op == GE:    stack[-2] = stack[-2] >= stack[-1]; stack.pop()
       elif op == EQ:    stack[-2] = stack[-2] == stack[-1]; stack.pop()
       elif op == NE:    stack[-2] = stack[-2] != stack[-1]; stack.pop()
       elif op == AND:   stack[-2] = stack[-2] and stack[-1]; stack.pop()
       elif op == OR:    stack[-2] = stack[-2] or  stack[-1]; stack.pop()
       elif op == NEG:   stack[-1] = -stack[-1]
       elif op == NOT:   stack[-1] = not stack[-1]
       elif op == JMP:   pc += bytes_to_int(code[pc:pc+word_size])[0]
       elif op == JZ:
           if stack.pop():
               pc += word_size
           else:
               pc += bytes_to_int(code[pc:pc+word_size])[0]
       elif op == PRTC:  print("%c" % (stack[-1]), end=); stack.pop()
       elif op == PRTS:  print("%s" % (string_pool[stack[-1]].replace("\\n", "\n")), end=); stack.pop()
       elif op == PRTI:  print("%d" % (stack[-1]), end=); stack.pop()
       elif op == HALT:  break

def load_code():

   global string_pool
   line = input_file.readline()
   if len(line) == 0:
       error("empty line")
   line_list = line.split()
   data_size = int(line_list[1])
   n_strings = int(line_list[3])
   for i in range(n_strings):
       string_pool.append(input_file.readline().strip('"\n'))
   while True:
       line = input_file.readline()
       if len(line) == 0:
           break
       line_list = line.split()
       offset = int(line_list[0])
       instr  = line_list[1]
       opcode = code_map.get(instr)
       if opcode == None:
           error("Unknown instruction %s at %d" % (instr, offset))
       emit_byte(opcode)
       if opcode in [JMP, JZ]:
           p = int(line_list[3])
           emit_word(p - (offset + 1))
       elif opcode == PUSH:
           value = int(line_list[2])
           emit_word(value)
       elif opcode in [FETCH, STORE]:
           value = int(line_list[2].strip('[]'))
           emit_word(value)
   return data_size
        • main driver

input_file = sys.stdin if len(sys.argv) > 1:

   try:
       input_file = open(sys.argv[1], "r", 4096)
   except IOError as e:
       error(0, 0, "Can't open %s" % sys.argv[1])

data_size = load_code() run_vm(data_size)</lang>