#### Question 1

(a)只需要指出变化的部分即可，其余部分保持不变：

(b)根据上述规则即可，略过。

#### Question 2

(a)第一部分：

(b)(c)作图较麻烦，这里略过。

#### Question 3

(a)利用平衡二叉树的结构：左子树的叶节点数量和右子树的叶节点数量相同

#### Question 4

import json

#### Part 1 计数
def Count(File):
Count_x = {}
with open(File) as f:
#分隔数据
data = i.split()
#判断
if data == "UNARYRULE":
#计算Count(x)
if data in Count_x:
Count_x[data] += float(data)
else:
Count_x[data] = float(data)

return Count_x

#### Part 2 处理低频词
def Replace_word(tree, Count_x):
#基本情形
if len(tree) == 2:
word = tree
if word not in Count_x:
tree = "_RARE_"
elif Count_x[word] < 5:
tree = "_RARE_"
else:
Replace_word(tree, Count_x)
Replace_word(tree, Count_x)

#### Part 3 换并输出
def Replace(File, output, Count_x):
Res = []
with open(File) as f:
Replace_word(tree, Count_x)
res = json.dumps(tree)
Res.append(res)

with open(output, "w+") as f:
for i in Res:
f.write(i)
f.write("\n")

def Replace_dev(File, output, Count_x):
Res = []
with open(File) as f:
data = i.strip().split(" ")
for j in range(len(data)):
if data[j] not in Count_x or Count_x[data[j]] < 5:
data[j] = "_RARE_"
Res.append(" ".join(data) + "\n")

with open(output, "w+") as f:
for i in Res:
f.writelines(i)

#### 运行
File = "cfg.counts"
Count_x = Count(File)

#训练数据
f1 = "parse_train.dat"
o1 = "parse_train_proc.dat"
Replace(f1, o1, Count_x)

#测试数据
f2 = "parse_dev.dat"
o2 = "parse_dev_proc.dat"
Replace_dev(f2, o2, Count_x)

#python count_cfg_freq.py parse_train_proc.dat > cfg_proc.counts

#### Question 5

import json

#### Part 1 计算MLE
#计数
def Generate_count(File):
Count_xyy = {}
Count_xy = {}
Count_x = {}
Count_y = {}
with open(File) as f:
#分隔数据
data = i.strip().split()
#判断
if data == "NONTERMINAL":
Count_x[data] = float(data)
elif data == "BINARYRULE":
if data not in Count_xyy:
Count_xyy[data] = {}
Count_xyy[data][(data, data)] = float(data)
else:
if data not in Count_xy:
Count_xy[data] = {}
Count_xy[data][data] = float(data)
#统计单词数量
if data in Count_y:
Count_y[data] += float(data)
else:
Count_y[data] = float(data)

return Count_xyy, Count_xy, Count_x, Count_y

def Generate_mle(Count_xyy, Count_xy, Count_x, Count_y):
#计算MLE
q_xyy = {}
q_xy = {}
for i in Count_x:
q_xy[i] = {}
q_xyy[i] = {}
if i in Count_xy:
for j in Count_xy[i]:
q_xy[i][j] = Count_xy[i][j] / Count_x[i]
if i in Count_xyy:
for k in Count_xyy[i]:
q_xyy[i][k] = Count_xyy[i][k] / Count_x[i]

return q_xyy, q_xy

#### Part2: CKY
def CKY(word, Count_xyy, Count_xy, Count_x, Count_y, q_xyy, q_xy):
n = len(word)
Pi = {}
Bp = {}
for i in range(n):
for j in range(i, n):
Pi[(i, j)] = {}
Bp[(i, j)] = {}

for i in range(n):
for x in Count_x:
if x in Count_xy and word[i] in Count_xy[x]:
Pi[(i, i)][x] = q_xy[x][word[i]]

for l in range(1, n):
for i in range(n - l):
j = i + l
for X in Count_x:
#概率
p = 0
#最佳规则以及位置
y1 = ""
z1 = ""
s1 = ""
if X in q_xyy:
for (Y, Z) in q_xyy[X]:
for s in range(i, j):
if Y in Pi[(i, s)] and Z in Pi[(s + 1, j)]:
p1 = q_xyy[X][(Y, Z)] * Pi[(i, s)][Y] * Pi[(s + 1, j)][Z]
if p1 > p:
p = p1
y1 = Y
z1 = Z
s1 = s
#更新最大值
if p > 0:
Pi[(i, j)][X] = p
Bp[(i, j)][X] = (y1, z1, s1)

pmax = 0
node = ""
if "S" in Pi[(0, n - 1)]:
pmax = Pi[(0, n - 1)]["S"]
node = "S"
else:
for i in Pi[(0, n - 1)]:
if Pi[(0, n - 1)][i] > pmax:
pmax = Pi[(0, n - 1)][i]
node = i

return Pi, Bp, node

#生成树
def tree(Bp, word, i, j, node):
if j == i:
return [node, word[i]]
else:
lnode, rnode, k = Bp[(i, j)][node]
left = tree(Bp, word, i, k, lnode)
right = tree(Bp, word, k + 1, j, rnode)
return [node, left, right]

#生成结果
def Generate(File, output, Count_xyy, Count_xy, Count_x, Count_y, q_xyy, q_xy):
Res = []
with open(File) as f:
data = i.strip().split(" ")
word = i.strip().split(" ")
for j in range(len(data)):
if data[j] not in Count_y or Count_y[data[j]] < 5:
data[j] = "_RARE_"

#CKY算法
Pi, Bp, node = CKY(data, Count_xyy, Count_xy, Count_x, Count_y, q_xyy, q_xy)
n = len(word)
#生成树
res = tree(Bp, word, 0, n - 1, node)
Res.append(res)

with open(output, "w+") as f:
for res in Res:
r = json.dumps(res)
f.write(r)
f.write("\n")

#### 运行
File = "cfg_proc.counts"
f1 = "parse_dev.dat"
o1 = "parse_dev_result.dat"
Count_xyy, Count_xy, Count_x, Count_y = Generate_count(File)
q_xyy, q_xy = Generate_mle(Count_xyy, Count_xy, Count_x, Count_y)
Generate(f1, o1, Count_xyy, Count_xy, Count_x, Count_y, q_xyy, q_xy)

#检测结果
#python eval_parser.py parse_dev.key parse_dev_result.dat

#### Question 6

from Q4 import *
from Q5 import *

#### Q6
File1 = "cfg_vert.counts"
Count_x = Count(File1)
f1 = "parse_train_vert.dat"
o1 = "parse_train_vert_proc.dat"
Replace(f1, o1, Count_x)

#python count_cfg_freq.py parse_train_vert_proc.dat > cfg_vert_proc.counts

File2 = "cfg_vert_proc.counts"
f2 = "parse_dev.dat"
o2 = "parse_dev_vert_result.dat"
Count_xyy, Count_xy, Count_x, Count_y = Generate_count(File2)
q_xyy, q_xy = Generate_mle(Count_xyy, Count_xy, Count_x, Count_y)
Generate(f2, o2, Count_xyy, Count_xy, Count_x, Count_y, q_xyy, q_xy)