编译原理实践1 正则表达式 Part1

概要：

工作后时间确实紧张不少，之前看的编译原理公开课没有看完，project也一直没写，今年的一个目标就是完成编译原理的课程以及对应project，实现一些对应的算法。

这一次先从正则表达式的实现开始，这部分完成了正则表达式到NFA的转换。

参考资料：

正则表达式：

https://cyberzhg.github.io/toolbox/

中缀转后缀：

https://blog.csdn.net/weixin_42034217/article/details/84679679

https://www.cnblogs.com/tech-bird/p/3971555.html

代码实现参考：

https://github.com/bonkyzhu/Regex2NFA2DFA

辅助函数

class Stack(object):
    def __init__(self):
        self.stack = []
    
    def empty(self):
        return len(self.stack) == 0
    
    def pop(self):
        self.stack.pop()
    
    def top(self):
        return self.stack[-1]
    
    def push(self, x):
        self.stack.append(x)
        
    def reverse(self):
        return self.stack[::-1]
    
    def size(self):
        return len(self.stack)

预处理正则表达式

分为三个步骤：

添加加号
中缀转前缀
预处理前缀，转换成二/三元组的形式

代码如下：

from stack import Stack

def proc(reg):
    """
    增加加号
    aaab -> a + a + a + b
    """
    res = []
    op = ['|', '*', '(', ')']
    n = len(reg)
    i = 0
    while (i < n):
        if reg[i] in op:
            #(a) -> + (a)
            #*(a) not -> *+(a)
            #((a)) not -> (*(a))
            if (reg[i] == '(') and ((i > 0) and (reg[i - 1] not in op)):
                res.append('+')
            res.append(reg[i])
            i += 1
        else:
            tmp = []
            #)b or *b -> )+b, *+b
            if ((i > 0) and (reg[i - 1] in [')', '*'])):
                tmp.append('+')
            tmp.append(reg[i])
            j = i + 1
            while ((j < n) and (reg[j] not in op)):
                tmp.append('+')
                tmp.append(reg[j])
                j += 1
            i = j
            res += tmp
            
    return res
    
def infix2prefix(reg):
    """
    中缀转前缀
    a*+b|c -> (| (+ (* a) b) c)
    """
    #初始化两个栈
    opStack = Stack()
    strStack = Stack()
    #op
    op = ['|', '+', '*']
    pra = ['(', ')']
    #优先级
    priority = {}
    priority['|'] = 0
    priority['+'] = 1
    priority['*'] = 2
    
    for s in reg[::-1]:
        if (s in op):
            while True:
                if (opStack.empty() or (opStack.top() == ')')):
                    opStack.push(s)
                    break
                elif (priority[s] >= priority[opStack.top()]):
                    opStack.push(s)
                    break
                else:
                    #弹出优先级高的
                    strStack.push(opStack.top())
                    opStack.pop()
        elif (s in pra):
            if (s == ')'):
                opStack.push(s)
            else:
                #弹出, 直到碰到右括号
                while ((not opStack.empty()) and (opStack.top() != ')')):
                    strStack.push(opStack.top())
                    opStack.pop()
                #去除右括号
                if (not opStack.empty()):
                    opStack.pop()
        else:
            strStack.push(s)
    
    #处理剩余op
    while (not opStack.empty()):
        strStack.push(opStack.top())
        opStack.pop()
        
    return strStack.reverse()

def procPrefix(prefix):
    """
    前缀转换成三元组或者二元组的形式
    (| a b)     ->  [|, a, b]
    (| (* a) b) ->  [|, [*, a], b]
    (* a)       ->  [*, a]
    """
    stack = Stack()
    op = ['|', '*', '+']
    for s in prefix:
        stack.push(s)
        if (s in op):
            continue
        else:
            while True:
                l = stack.size()
                if (l == 2):
                    #[*, a]
                    break
                if (l >= 3):
                    s1 = stack.top()
                    stack.pop()
                    s2 = stack.top()
                    stack.pop()
                    if (s2 == '*'):
                        #[*, a]
                        stack.push([s2, s1])
                        continue
                    else:
                        #防止[|, *, a]情形出现
                        if (s2 not in op):
                            #[|, a, b]
                            s3 = stack.top()
                            stack.pop()
                            stack.push([s3, s2, s1])
                        else:
                            stack.push(s2)
                            stack.push(s1)
                            break
                else:
                    break
    
    if (stack.size() == 2):
        return stack.stack
    else:
        return stack.reverse()[0]

############################
# proc test
print("############################")
print("Test for proc")
reg = ["(a|b)*", "(a*|b*)*", "((ϵ|a)b*)*", "(a|b)*abb(a|b)*", "aaab|b"]
for r in reg:
    print(r, proc(r))
    
############################
# proc test
print("############################")
print("Test for infix2prefix")
for r in reg:
    print(r, infix2prefix(proc(r)))

############################
# procPrefix test
print("############################")
print("Test for infix2prefix")
for r in reg:
    print(r, procPrefix(infix2prefix(proc(r))))

测试结果：

############################
Test for proc
(a|b)* ['(', 'a', '|', 'b', ')', '*']
(a*|b*)* ['(', 'a', '*', '|', 'b', '*', ')', '*']
((ϵ|a)b*)* ['(', '(', 'ϵ', '|', 'a', ')', '+', 'b', '*', ')', '*']
(a|b)*abb(a|b)* ['(', 'a', '|', 'b', ')', '*', '+', 'a', '+', 'b', '+', 'b', '+', '(', 'a', '|', 'b', ')', '*']
aaab|b ['a', '+', 'a', '+', 'a', '+', 'b', '|', 'b']
############################
Test for infix2prefix
(a|b)* ['*', '|', 'a', 'b']
(a*|b*)* ['*', '|', '*', 'a', '*', 'b']
((ϵ|a)b*)* ['*', '+', '|', 'ϵ', 'a', '*', 'b']
(a|b)*abb(a|b)* ['+', '+', '+', '+', '*', '|', 'a', 'b', 'a', 'b', 'b', '*', '|', 'a', 'b']
aaab|b ['|', '+', '+', '+', 'a', 'a', 'a', 'b', 'b']
############################
Test for infix2prefix
(a|b)* ['*', ['|', 'a', 'b']]
(a*|b*)* ['*', ['|', ['*', 'a'], ['*', 'b']]]
((ϵ|a)b*)* ['*', ['+', ['|', 'ϵ', 'a'], ['*', 'b']]]
(a|b)*abb(a|b)* ['+', ['+', ['+', ['+', ['*', ['|', 'a', 'b']], 'a'], 'b'], 'b'], ['*', ['|', 'a', 'b']]]
aaab|b ['|', ['+', ['+', ['+', 'a', 'a'], 'a'], 'b'], 'b']

正则表达式转换成nfa

利用下图进行转换：

代码如下：

from regProcess import proc
from regProcess import infix2prefix
from regProcess import procPrefix

def reg2Nfa(reg, curState, endState, maxState, transition):
    if (len(reg) == 1):
        #[a]
        return maxState
    if (reg[0] == '*'):
        maxState += 1
        #添加
        addLink(transition, curState, maxState, epsilon)
        addLink(transition, maxState, endState, epsilon)
        addLink(transition, maxState, maxState, reg[1])
        #递归
        return reg2Nfa(reg[1], maxState, maxState, maxState, transition)
    elif (reg[0] == '+'):
        maxState += 1
        #添加
        addLink(transition, curState, maxState, reg[1])
        addLink(transition, maxState, endState, reg[2])
        #递归
        m1 = reg2Nfa(reg[1], curState, maxState, maxState, transition)
        m2 = reg2Nfa(reg[2], maxState, endState, m1, transition)
        
        return m2
    else:
        #添加
        addLink(transition, curState, endState, reg[1])
        addLink(transition, curState, endState, reg[2])
        #递归
        m1 = reg2Nfa(reg[1], curState, endState, maxState, transition)
        m2 = reg2Nfa(reg[2], curState, endState, m1, transition)
        
        return m2
        
def addLink(transition, i, j, string):
    if (isinstance(string, str)):
        if i not in transition:
            transition[i] = dict()
        if string not in transition[i]:
            transition[i][string] = set()
        transition[i][string].add(j)

def removeLink(transition, i, j, string):
    if ((i in transition) and (j in transition[i]) and 
        (string in transition[i][j])):
        transition[i][j].remove(string)

#空字
epsilon = 'ϵ'
reg = ["(a|b)*", "(a*|b*)*", "((ϵ|a)b*)*", "(a|b)*abb(a|b)*", "aaab|b"]
#预处理
regproc = [procPrefix(infix2prefix(proc(r))) for r in reg]

for i in range(len(regproc)):
    r = regproc[i]
    #init
    curState = 0
    endState = 1
    maxState = 1
    transition = dict()
    addLink(transition, curState, endState, r)
    #计算结果
    res = reg2Nfa(r, curState, endState, maxState, transition)
    print(reg[i], res, transition)

结果如下：

(a|b)* 2 {0: {'ϵ': {2}}, 2: {'ϵ': {1}, 'a': {2}, 'b': {2}}}
(a*|b*)* 4 {0: {'ϵ': {2}}, 2: {'ϵ': {1, 3, 4}}, 3: {'ϵ': {2}, 'a': {3}}, 4: {'ϵ': {2}, 'b': {4}}}
((ϵ|a)b*)* 4 {0: {'ϵ': {2}}, 2: {'ϵ': {1, 3}, 'a': {3}}, 3: {'ϵ': {4}}, 4: {'ϵ': {2}, 'b': {4}}}
(a|b)*abb(a|b)* 7 {3: {'b': {2}}, 4: {'b': {3}}, 5: {'a': {4}}, 0: {'ϵ': {6}}, 6: {'ϵ': {5}, 'a': {6}, 'b': {6}}, 2: {'ϵ': {7}}, 7: {'ϵ': {1}, 'a': {7}, 'b': {7}}}
aaab|b 4 {0: {'b': {1}, 'a': {4}}, 2: {'b': {1}}, 3: {'a': {2}}, 4: {'a': {3}}}