编译原理实践1 正则表达式 Part1
概要:
工作后时间确实紧张不少,之前看的编译原理公开课没有看完,project也一直没写,今年的一个目标就是完成编译原理的课程以及对应project,实现一些对应的算法。
这一次先从正则表达式的实现开始,这部分完成了正则表达式到NFA的转换。
参考资料:
正则表达式:
https://cyberzhg.github.io/toolbox/
中缀转后缀:
https://blog.csdn.net/weixin_42034217/article/details/84679679
https://www.cnblogs.com/tech-bird/p/3971555.html
代码实现参考:
https://github.com/bonkyzhu/Regex2NFA2DFA
辅助函数
class Stack(object):
def __init__(self):
self.stack = []
def empty(self):
return len(self.stack) == 0
def pop(self):
self.stack.pop()
def top(self):
return self.stack[-1]
def push(self, x):
self.stack.append(x)
def reverse(self):
return self.stack[::-1]
def size(self):
return len(self.stack)
预处理正则表达式
分为三个步骤:
- 添加加号
- 中缀转前缀
- 预处理前缀,转换成二/三元组的形式
代码如下:
from stack import Stack
def proc(reg):
"""
增加加号
aaab -> a + a + a + b
"""
res = []
op = ['|', '*', '(', ')']
n = len(reg)
i = 0
while (i < n):
if reg[i] in op:
#(a) -> + (a)
#*(a) not -> *+(a)
#((a)) not -> (*(a))
if (reg[i] == '(') and ((i > 0) and (reg[i - 1] not in op)):
res.append('+')
res.append(reg[i])
i += 1
else:
tmp = []
#)b or *b -> )+b, *+b
if ((i > 0) and (reg[i - 1] in [')', '*'])):
tmp.append('+')
tmp.append(reg[i])
j = i + 1
while ((j < n) and (reg[j] not in op)):
tmp.append('+')
tmp.append(reg[j])
j += 1
i = j
res += tmp
return res
def infix2prefix(reg):
"""
中缀转前缀
a*+b|c -> (| (+ (* a) b) c)
"""
#初始化两个栈
opStack = Stack()
strStack = Stack()
#op
op = ['|', '+', '*']
pra = ['(', ')']
#优先级
priority = {}
priority['|'] = 0
priority['+'] = 1
priority['*'] = 2
for s in reg[::-1]:
if (s in op):
while True:
if (opStack.empty() or (opStack.top() == ')')):
opStack.push(s)
break
elif (priority[s] >= priority[opStack.top()]):
opStack.push(s)
break
else:
#弹出优先级高的
strStack.push(opStack.top())
opStack.pop()
elif (s in pra):
if (s == ')'):
opStack.push(s)
else:
#弹出, 直到碰到右括号
while ((not opStack.empty()) and (opStack.top() != ')')):
strStack.push(opStack.top())
opStack.pop()
#去除右括号
if (not opStack.empty()):
opStack.pop()
else:
strStack.push(s)
#处理剩余op
while (not opStack.empty()):
strStack.push(opStack.top())
opStack.pop()
return strStack.reverse()
def procPrefix(prefix):
"""
前缀转换成三元组或者二元组的形式
(| a b) -> [|, a, b]
(| (* a) b) -> [|, [*, a], b]
(* a) -> [*, a]
"""
stack = Stack()
op = ['|', '*', '+']
for s in prefix:
stack.push(s)
if (s in op):
continue
else:
while True:
l = stack.size()
if (l == 2):
#[*, a]
break
if (l >= 3):
s1 = stack.top()
stack.pop()
s2 = stack.top()
stack.pop()
if (s2 == '*'):
#[*, a]
stack.push([s2, s1])
continue
else:
#防止[|, *, a]情形出现
if (s2 not in op):
#[|, a, b]
s3 = stack.top()
stack.pop()
stack.push([s3, s2, s1])
else:
stack.push(s2)
stack.push(s1)
break
else:
break
if (stack.size() == 2):
return stack.stack
else:
return stack.reverse()[0]
############################
# proc test
print("############################")
print("Test for proc")
reg = ["(a|b)*", "(a*|b*)*", "((ϵ|a)b*)*", "(a|b)*abb(a|b)*", "aaab|b"]
for r in reg:
print(r, proc(r))
############################
# proc test
print("############################")
print("Test for infix2prefix")
for r in reg:
print(r, infix2prefix(proc(r)))
############################
# procPrefix test
print("############################")
print("Test for infix2prefix")
for r in reg:
print(r, procPrefix(infix2prefix(proc(r))))
测试结果:
############################
Test for proc
(a|b)* ['(', 'a', '|', 'b', ')', '*']
(a*|b*)* ['(', 'a', '*', '|', 'b', '*', ')', '*']
((ϵ|a)b*)* ['(', '(', 'ϵ', '|', 'a', ')', '+', 'b', '*', ')', '*']
(a|b)*abb(a|b)* ['(', 'a', '|', 'b', ')', '*', '+', 'a', '+', 'b', '+', 'b', '+', '(', 'a', '|', 'b', ')', '*']
aaab|b ['a', '+', 'a', '+', 'a', '+', 'b', '|', 'b']
############################
Test for infix2prefix
(a|b)* ['*', '|', 'a', 'b']
(a*|b*)* ['*', '|', '*', 'a', '*', 'b']
((ϵ|a)b*)* ['*', '+', '|', 'ϵ', 'a', '*', 'b']
(a|b)*abb(a|b)* ['+', '+', '+', '+', '*', '|', 'a', 'b', 'a', 'b', 'b', '*', '|', 'a', 'b']
aaab|b ['|', '+', '+', '+', 'a', 'a', 'a', 'b', 'b']
############################
Test for infix2prefix
(a|b)* ['*', ['|', 'a', 'b']]
(a*|b*)* ['*', ['|', ['*', 'a'], ['*', 'b']]]
((ϵ|a)b*)* ['*', ['+', ['|', 'ϵ', 'a'], ['*', 'b']]]
(a|b)*abb(a|b)* ['+', ['+', ['+', ['+', ['*', ['|', 'a', 'b']], 'a'], 'b'], 'b'], ['*', ['|', 'a', 'b']]]
aaab|b ['|', ['+', ['+', ['+', 'a', 'a'], 'a'], 'b'], 'b']
正则表达式转换成nfa
利用下图进行转换:
代码如下:
from regProcess import proc
from regProcess import infix2prefix
from regProcess import procPrefix
def reg2Nfa(reg, curState, endState, maxState, transition):
if (len(reg) == 1):
#[a]
return maxState
if (reg[0] == '*'):
maxState += 1
#添加
addLink(transition, curState, maxState, epsilon)
addLink(transition, maxState, endState, epsilon)
addLink(transition, maxState, maxState, reg[1])
#递归
return reg2Nfa(reg[1], maxState, maxState, maxState, transition)
elif (reg[0] == '+'):
maxState += 1
#添加
addLink(transition, curState, maxState, reg[1])
addLink(transition, maxState, endState, reg[2])
#递归
m1 = reg2Nfa(reg[1], curState, maxState, maxState, transition)
m2 = reg2Nfa(reg[2], maxState, endState, m1, transition)
return m2
else:
#添加
addLink(transition, curState, endState, reg[1])
addLink(transition, curState, endState, reg[2])
#递归
m1 = reg2Nfa(reg[1], curState, endState, maxState, transition)
m2 = reg2Nfa(reg[2], curState, endState, m1, transition)
return m2
def addLink(transition, i, j, string):
if (isinstance(string, str)):
if i not in transition:
transition[i] = dict()
if string not in transition[i]:
transition[i][string] = set()
transition[i][string].add(j)
def removeLink(transition, i, j, string):
if ((i in transition) and (j in transition[i]) and
(string in transition[i][j])):
transition[i][j].remove(string)
#空字
epsilon = 'ϵ'
reg = ["(a|b)*", "(a*|b*)*", "((ϵ|a)b*)*", "(a|b)*abb(a|b)*", "aaab|b"]
#预处理
regproc = [procPrefix(infix2prefix(proc(r))) for r in reg]
for i in range(len(regproc)):
r = regproc[i]
#init
curState = 0
endState = 1
maxState = 1
transition = dict()
addLink(transition, curState, endState, r)
#计算结果
res = reg2Nfa(r, curState, endState, maxState, transition)
print(reg[i], res, transition)
结果如下:
(a|b)* 2 {0: {'ϵ': {2}}, 2: {'ϵ': {1}, 'a': {2}, 'b': {2}}}
(a*|b*)* 4 {0: {'ϵ': {2}}, 2: {'ϵ': {1, 3, 4}}, 3: {'ϵ': {2}, 'a': {3}}, 4: {'ϵ': {2}, 'b': {4}}}
((ϵ|a)b*)* 4 {0: {'ϵ': {2}}, 2: {'ϵ': {1, 3}, 'a': {3}}, 3: {'ϵ': {4}}, 4: {'ϵ': {2}, 'b': {4}}}
(a|b)*abb(a|b)* 7 {3: {'b': {2}}, 4: {'b': {3}}, 5: {'a': {4}}, 0: {'ϵ': {6}}, 6: {'ϵ': {5}, 'a': {6}, 'b': {6}}, 2: {'ϵ': {7}}, 7: {'ϵ': {1}, 'a': {7}, 'b': {7}}}
aaab|b 4 {0: {'b': {1}, 'a': {4}}, 2: {'b': {1}}, 3: {'a': {2}}, 4: {'a': {3}}}
本博客所有文章除特别声明外,均采用 CC BY-NC-SA 4.0 许可协议。转载请注明来自 Doraemonzzz!
评论
ValineLivere