这次回顾第十章的内容,这一章介绍了编译器的第一部分:语法分析。

课程官网:

https://www.nand2tetris.org/

视频地址:

https://www.coursera.org/learn/build-a-computer

Part 1:课程回顾

背景介绍

这一章完成了编译器的前半部分:语法分析,输出结果为XML格式,本章重点依然为项目,背景介绍部分从略。

Part 2:项目

项目的注意点可以参考PPT的第83、110、125页,代码分为JackTokenizer、CompilationEngine、JackAnalyzer三个模块。

JackTokenizer

class JackTokenizer():
    """Tokenizer for the Jack language (nand2tetris, chapter 10).

    Reads a .jack source file, strips comments, and splits the text into
    the token stream consumed by CompilationEngine.  Tokens are served one
    at a time via advance()/hasMoreTokens(), with next_token() providing
    one token of lookahead.
    """
    # Reserved words of the Jack language.
    Keyword = {'class', 'constructor', 'function', 'method',
               'field', 'static', 'var', 'int', 'char',
               'boolean', 'void', 'true', 'false', 'null',
               'this', 'let', 'do', 'if', 'else', 'while',
               'return'}

    # Single-character symbols of the Jack language.
    Symbol = {'{', '}', '(', ')', '[', ']', '.', ',', ';', '+', '-', '*',
              '/', '&', '|', '<', '>', '=', '~'}

    # String constants are delimited by double quotes.
    StringConstant = '"'

    # XML entity escapes for characters that are special in XML output.
    Replace = {"<": "&lt;", ">": "&gt;", '"': "&quot;", '&': "&amp;"}
    # Replacement order matters: '&' must be escaped first so the '&'
    # produced by the other replacements is not escaped again.
    Replacewords = ['&', '"', "<", ">"]

    def __init__(self, filename):
        """Read *filename*, tokenize it into self.Token, and record the
        extension-free base path in self.filename."""
        self.Token = []
        # Character class of all symbols (kept for compatibility; unused here).
        self.sep = '[' + ''.join(JackTokenizer.Symbol) + ']'
        self.cnt = 0
        # Base output path: the input path without its extension.
        # Fix: the original joined directory and basename with an
        # unconditional '/', producing a leading '/' (an absolute path)
        # when the input had no directory component.
        parts = filename.split('/')
        self.filename = '/'.join(parts[:-1] + [parts[-1].split('.')[0]])
        # Read the file, skipping blank lines and comment-only lines.
        with open(filename) as f:
            for line in f.readlines():
                words = line.split()
                # Blank line or a whole-line comment ("//", "/** ...", "* ...", "*/").
                if not words or words[0] in ("//", "/**", "*", "*/"):
                    continue
                # Collapse runs of whitespace to single spaces.
                code = ' '.join(words)
                # Drop a trailing end-of-line comment.
                # NOTE: a "//" inside a string constant is also stripped
                # (limitation kept from the original implementation).
                code = code.split("//")[0]
                self.Token += self.process_sentence(code)

        self.maxlength = len(self.Token)
        self.token = ""     # current token, set by advance()

    def next_token(self):
        """Return the token *after* the current one (one-token lookahead).

        advance() leaves self.cnt pointing at the next unread token, so
        Token[cnt] is the lookahead.  Returns None at end of input.
        """
        if self.cnt < self.maxlength:
            return self.Token[self.cnt]

    def process_sentence(self, string):
        """Split one comment-free line into a list of raw tokens.

        String constants are kept as single tokens (quotes included);
        symbols become one-character tokens; spaces separate the rest.
        """
        res = []
        i = 0               # start index of the token currently being scanned
        n = len(string)
        cnt = 0             # 1 while inside a string constant, else 0
        for j in range(n):
            if string[j] in JackTokenizer.StringConstant:
                if cnt == 0:
                    # Opening quote: remember where the string starts.
                    i = j
                    cnt += 1
                else:
                    # Closing quote: emit the whole string constant.
                    res.append(string[i: (j + 1)])
                    cnt = 0
                    # Fix: restart scanning after the closing quote; the
                    # original left i stale, which could re-emit the string
                    # via the handlers below.
                    i = j + 1
                    continue
            # Fix: inside a string constant, spaces and symbols are literal
            # characters; the original split on symbols even inside strings.
            if cnt != 0:
                continue
            if string[j] == ' ':
                if j > i:
                    res.append(string[i: j])
                i = j + 1
            if string[j] in JackTokenizer.Symbol:
                if j > i:
                    res.append(string[i: j])
                res.append(string[j])
                i = j + 1
        # Fix: flush a trailing token not terminated by a space or symbol
        # (e.g. a line consisting only of "else"); the original dropped it.
        if cnt == 0 and i < n:
            res.append(string[i:])
        return res

    def hasMoreTokens(self):
        """True while advance() still has tokens left to serve."""
        return self.cnt < self.maxlength

    def advance(self):
        """Make the next token the current one (self.token)."""
        if self.hasMoreTokens():
            self.token = self.Token[self.cnt]
            self.cnt += 1

    def replace(self, word):
        """Return *word* with XML-special characters escaped."""
        for ch in JackTokenizer.Replacewords:
            if ch in word:
                word = word.replace(ch, JackTokenizer.Replace[ch])
        return word

    def tokenType(self):
        """Classify the current token into one of the five Jack token types."""
        if self.token in JackTokenizer.Keyword:
            return "KEYWORD"
        elif self.token in JackTokenizer.Symbol:
            return "SYMBOL"
        elif self.token[0] == '"' and self.token[-1] == '"':
            return "STRING_CONST"
        elif str.isdigit(self.token):
            return "INT_CONST"
        else:
            return "IDENTIFIER"

    def keyword(self):
        """Current token, assumed to be a keyword."""
        return self.token

    def symbol(self):
        """Current token as an XML-escaped symbol."""
        self.token = self.replace(self.token)
        return self.token

    def identifier(self):
        """Current token, assumed to be an identifier."""
        return self.token

    def intVal(self):
        """Current token as an integer constant."""
        return int(self.token)

    def stringVal(self):
        """Current string constant without its quotes, XML-escaped."""
        self.token = self.replace(self.token[1:-1])
        return self.token
        
def Write_XML(jackTokenizer):
    """Dump the full token stream of *jackTokenizer* as XML.

    The output goes to '<base filename>toke.xml' and consists of a
    <tokens> root element with one line per token, each wrapped in the
    tag matching its token type.
    """
    # One renderer per token type; tokenType() always returns one of these.
    render = {
        "KEYWORD": lambda t: "<keyword> %s </keyword>" % t.keyword(),
        "SYMBOL": lambda t: "<symbol> %s </symbol>" % t.symbol(),
        "IDENTIFIER": lambda t: "<identifier> %s </identifier>" % t.identifier(),
        "INT_CONST": lambda t: "<integerConstant> %s </integerConstant>" % t.intVal(),
        "STRING_CONST": lambda t: "<stringConstant> %s </stringConstant>" % t.stringVal(),
    }
    out_path = jackTokenizer.filename + "toke.xml"
    with open(out_path, "w+") as out:
        out.write("<tokens>\n")
        while jackTokenizer.hasMoreTokens():
            jackTokenizer.advance()
            out.write(render[jackTokenizer.tokenType()](jackTokenizer) + "\n")
        out.write("</tokens>\n")

CompilationEngine

from JackTokenizer import JackTokenizer

class CompilationEngine():
    """Recursive-descent parser for Jack (nand2tetris, chapter 10).

    Consumes the token stream of a JackTokenizer and writes the parse
    tree as XML.  Each CompileXxx method handles one grammar rule; the
    private _open/_close/_keyword/_symbol/_identifier helpers emit the
    structural and terminal tags in the exact format the course's
    comparison files expect.
    """

    def __init__(self, filename, flag=1):
        """Tokenize *filename* and open the XML output file.

        flag == 1 appends "X.xml" to the base name (so reference .xml
        files are not overwritten); any other value uses ".xml".
        """
        self.tokenizer = JackTokenizer(filename)
        if flag == 1:
            self.output = self.tokenizer.filename + "X.xml"
        else:
            self.output = self.tokenizer.filename + ".xml"
        self.file = open(self.output, "w+")

    # ---- low-level writers -------------------------------------------

    def _open(self, tag):
        """Write an opening structural tag, e.g. <class>."""
        self.file.write("<" + tag + ">\n")

    def _close(self, tag):
        """Write a closing structural tag, e.g. </class>."""
        self.file.write("</" + tag + ">\n")

    def _keyword(self):
        """Write the current keyword terminal and advance."""
        self.file.write("<keyword> " + self.tokenizer.keyword() + " </keyword>\n")
        self.tokenizer.advance()

    def _symbol(self):
        """Write the current (XML-escaped) symbol terminal and advance."""
        self.file.write("<symbol> " + self.tokenizer.symbol() + " </symbol>\n")
        self.tokenizer.advance()

    def _identifier(self):
        """Write the current identifier terminal and advance."""
        self.file.write("<identifier> " + self.tokenizer.identifier() + " </identifier>\n")
        self.tokenizer.advance()

    # ---- grammar rules -----------------------------------------------

    def CompileClass(self):
        """class: 'class' className '{' classVarDec* subroutineDec* '}'"""
        self._open("class")
        self._keyword()                      # 'class'
        self._identifier()                   # className
        self._symbol()                       # '{'
        while self.tokenizer.token in ['static', 'field']:
            self.CompileClassVarDec()
        while self.tokenizer.token in ['constructor', 'function', 'method']:
            self.CompileSubroutine()
        self._symbol()                       # '}'
        self._close("class")

    def CompileClassVarDec(self):
        """classVarDec: ('static'|'field') type varName (',' varName)* ';'"""
        self._open("classVarDec")
        self._keyword()                      # 'static' | 'field'
        self.CompileType()
        self._identifier()                   # varName
        while self.tokenizer.token == ',':
            self._symbol()                   # ','
            self._identifier()               # varName
        self._symbol()                       # ';'
        self._close("classVarDec")

    def CompileSubroutine(self):
        """subroutineDec: ('constructor'|'function'|'method') ('void'|type)
        subroutineName '(' parameterList ')' subroutineBody"""
        self._open("subroutineDec")
        self._keyword()                      # 'constructor'|'function'|'method'
        # Return type: 'void' is allowed here (CompileType does not cover it).
        if self.tokenizer.token in ['int', 'char', 'boolean', 'void']:
            self._keyword()                  # built-in return type
        else:
            self._identifier()               # className return type
        self._identifier()                   # subroutineName
        self._symbol()                       # '('
        self.CompileParameterList()
        self._symbol()                       # ')'
        self.CompileSubroutineBody()
        self._close("subroutineDec")

    def CompileSubroutineBody(self):
        """subroutineBody: '{' varDec* statements '}'"""
        self._open("subroutineBody")
        self._symbol()                       # '{'
        while self.tokenizer.token == "var":
            self.CompileVarDec()
        self.CompileStatements()
        self._symbol()                       # '}'
        self._close("subroutineBody")

    def CompileType(self):
        """type: 'int' | 'char' | 'boolean' | className"""
        if self.tokenizer.token in ['int', 'char', 'boolean']:
            self._keyword()
        else:
            self._identifier()

    def CompileParameterList(self):
        """parameterList: ((type varName) (',' type varName)*)?"""
        self._open("parameterList")
        # Fix: the original compared the *method object* tokenizer.tokenType
        # to "IDENTIFIER" (always False), so a class-typed first parameter
        # was never parsed; tokenType() must be called.
        if (self.tokenizer.token in ['int', 'char', 'boolean']
                or self.tokenizer.tokenType() == "IDENTIFIER"):
            self.CompileType()
            self._identifier()               # varName
            while self.tokenizer.token == ",":
                self._symbol()               # ','
                self.CompileType()
                self._identifier()           # varName
        self._close("parameterList")

    def CompileVarDec(self):
        """varDec: 'var' type varName (',' varName)* ';'"""
        self._open("varDec")
        self._keyword()                      # 'var'
        self.CompileType()
        self._identifier()                   # varName
        while self.tokenizer.token == ",":
            self._symbol()                   # ','
            self._identifier()               # varName
        self._symbol()                       # ';'
        self._close("varDec")

    def CompileStatement(self):
        """Dispatch one statement on its leading keyword; no-op otherwise."""
        dispatch = {"let": self.CompileLet,
                    "if": self.CompileIf,
                    "while": self.CompileWhile,
                    "do": self.CompileDo,
                    "return": self.CompileReturn}
        handler = dispatch.get(self.tokenizer.token)
        if handler is not None:
            handler()

    def CompileStatements(self):
        """statements: statement*"""
        self._open("statements")
        while self.tokenizer.token in ["let", "if", "while", "do", "return"]:
            self.CompileStatement()
        self._close("statements")

    def CompileDo(self):
        """doStatement: 'do' subroutineCall ';'"""
        self._open("doStatement")
        self._keyword()                      # 'do'
        self.CompileSubroutineCall()
        self._symbol()                       # ';'
        self._close("doStatement")

    def CompileLet(self):
        """letStatement: 'let' varName ('[' expression ']')? '=' expression ';'"""
        self._open("letStatement")
        self._keyword()                      # 'let'
        self._identifier()                   # varName
        if self.tokenizer.token == "[":
            self._symbol()                   # '['
            self.CompileExpression()
            self._symbol()                   # ']'
        self._symbol()                       # '='
        self.CompileExpression()
        self._symbol()                       # ';'
        self._close("letStatement")

    def CompileWhile(self):
        """whileStatement: 'while' '(' expression ')' '{' statements '}'"""
        self._open("whileStatement")
        self._keyword()                      # 'while'
        self._symbol()                       # '('
        self.CompileExpression()
        self._symbol()                       # ')'
        self._symbol()                       # '{'
        self.CompileStatements()
        self._symbol()                       # '}'
        self._close("whileStatement")

    def CompileReturn(self):
        """returnStatement: 'return' expression? ';'"""
        self._open("returnStatement")
        self._keyword()                      # 'return'
        if self.tokenizer.token != ';':
            self.CompileExpression()
        self._symbol()                       # ';'
        self._close("returnStatement")

    def CompileIf(self):
        """ifStatement: 'if' '(' expression ')' '{' statements '}'
        ('else' '{' statements '}')?"""
        self._open("ifStatement")
        self._keyword()                      # 'if'
        self._symbol()                       # '('
        self.CompileExpression()
        self._symbol()                       # ')'
        self._symbol()                       # '{'
        self.CompileStatements()
        self._symbol()                       # '}'
        if self.tokenizer.token == "else":
            self._keyword()                  # 'else'
            self._symbol()                   # '{'
            self.CompileStatements()
            self._symbol()                   # '}'
        self._close("ifStatement")

    def CompileExpression(self):
        """expression: term (op term)*"""
        self._open("expression")
        self.CompileTerm()
        while self.tokenizer.token in ['+', '-', '*', '/', '&', '|', '<', '>', '=']:
            self._symbol()                   # binary op
            self.CompileTerm()
        self._close("expression")

    def CompileTerm(self):
        """term: integerConstant | stringConstant | keywordConstant |
        varName | varName '[' expression ']' | subroutineCall |
        '(' expression ')' | unaryOp term"""
        self._open("term")
        kind = self.tokenizer.tokenType()
        if kind == "INT_CONST":
            self.file.write("<integerConstant> " + str(self.tokenizer.intVal())
                            + " </integerConstant>\n")
            self.tokenizer.advance()
        elif kind == "STRING_CONST":
            self.file.write("<stringConstant> " + self.tokenizer.stringVal()
                            + " </stringConstant>\n")
            self.tokenizer.advance()
        elif kind == "KEYWORD" and self.tokenizer.token in ['true', 'false', 'null', 'this']:
            self._keyword()                  # keywordConstant
        elif kind == "IDENTIFIER":
            # One token of lookahead distinguishes array access and calls.
            if self.tokenizer.next_token() == "[":
                self._identifier()           # varName
                self._symbol()               # '['
                self.CompileExpression()
                self._symbol()               # ']'
            elif self.tokenizer.next_token() in ["(", "."]:
                self.CompileSubroutineCall()
            else:
                self._identifier()           # plain varName
        elif self.tokenizer.token == "(":
            self._symbol()                   # '('
            self.CompileExpression()
            self._symbol()                   # ')'
        elif self.tokenizer.token in ['-', '~']:
            self._symbol()                   # unaryOp
            self.CompileTerm()
        self._close("term")

    def IsExpression(self):
        """Return True if the current token can start an expression."""
        kind = self.tokenizer.tokenType()
        if kind in ("INT_CONST", "STRING_CONST", "IDENTIFIER"):
            return True
        if kind == "KEYWORD" and self.tokenizer.token in ['true', 'false', 'null', 'this']:
            return True
        return self.tokenizer.token in ['(', '-', '~']

    def CompileExpressionList(self):
        """expressionList: (expression (',' expression)*)?"""
        self._open("expressionList")
        if self.IsExpression():
            self.CompileExpression()
            while self.tokenizer.token == ",":
                self._symbol()               # ','
                self.CompileExpression()
        self._close("expressionList")

    def CompileSubroutineCall(self):
        """subroutineCall: subroutineName '(' expressionList ')' |
        (className|varName) '.' subroutineName '(' expressionList ')'"""
        if self.tokenizer.next_token() != "(":
            self._identifier()               # className | varName
            self._symbol()                   # '.'
        self._identifier()                   # subroutineName
        self._symbol()                       # '('
        self.CompileExpressionList()
        self._symbol()                       # ')'

JackAnalyzer

from CompilationEngine import CompilationEngine
import sys
import os

def AnalyzerOneFile(filename):
    """Run the full analyzer on one .jack file, emitting '<base>.xml'."""
    engine = CompilationEngine(filename, 2)
    # Prime the tokenizer: CompileClass expects the first token current.
    engine.tokenizer.advance()
    engine.CompileClass()
    engine.file.close()
    
if __name__ == "__main__":
    # Exactly one argument: a .jack file or a directory of .jack files.
    if len(sys.argv) != 2:
        print("Error")
        sys.exit(1)
    path = sys.argv[-1]
    # Fix: the original decided file-vs-directory by counting the
    # '.'-separated pieces of the last path component, which misclassifies
    # directories whose name contains a dot (and filenames with several
    # dots).  Ask the filesystem instead.
    if os.path.isdir(path):
        # Directory: analyze every .jack file it contains.
        for name in os.listdir(path):
            if name.endswith(".jack"):
                AnalyzerOneFile(path + "/" + name)
    else:
        # Single source file.
        AnalyzerOneFile(path)