Michael Collins NLP Homework 2

课程主页:http://www.cs.columbia.edu/~cs4705/

课程网盘地址:

链接:https://pan.baidu.com/s/1KijgO7yjL_MVCC9zKZ7Jdg
提取码:t1i3

这一次回顾Michael Collins NLP作业2。

Question 1

(a)只需要指出变化的部分即可,其余部分保持不变:

增加如下规则:

(b)根据上述规则即可,略过。

Question 2

(a)第一部分:

第二部分:

(b)(c)作图较麻烦,这里略过。

Question 3

(a)利用平衡二叉树的结构:左子树的叶节点数量和右子树的叶节点数量相同

Question 4

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import json

#### Part 1 计数
def Count(File):
Count_x = {}
with open(File) as f:
for i in f.readlines():
#分隔数据
data = i.split()
#判断
if data[1] == "UNARYRULE":
#计算Count(x)
if data[3] in Count_x:
Count_x[data[3]] += float(data[0])
else:
Count_x[data[3]] = float(data[0])

return Count_x

#### Part 2 处理低频词
def Replace_word(tree, Count_x):
#基本情形
if len(tree) == 2:
word = tree[1]
if word not in Count_x:
tree[1] = "_RARE_"
elif Count_x[word] < 5:
tree[1] = "_RARE_"
else:
Replace_word(tree[1], Count_x)
Replace_word(tree[2], Count_x)

#### Part 3 换并输出
def Replace(File, output, Count_x):
Res = []
with open(File) as f:
for i in f.readlines():
tree = json.loads(i, encoding="utf-8")
Replace_word(tree, Count_x)
res = json.dumps(tree)
Res.append(res)

with open(output, "w+") as f:
for i in Res:
f.write(i)
f.write("\n")

def Replace_dev(File, output, Count_x):
Res = []
with open(File) as f:
for i in f.readlines():
data = i.strip().split(" ")
for j in range(len(data)):
if data[j] not in Count_x or Count_x[data[j]] < 5:
data[j] = "_RARE_"
Res.append(" ".join(data) + "\n")

with open(output, "w+") as f:
for i in Res:
f.writelines(i)

#### 运行
File = "cfg.counts"
Count_x = Count(File)

#训练数据
f1 = "parse_train.dat"
o1 = "parse_train_proc.dat"
Replace(f1, o1, Count_x)

#测试数据
f2 = "parse_dev.dat"
o2 = "parse_dev_proc.dat"
Replace_dev(f2, o2, Count_x)

#python count_cfg_freq.py parse_train_proc.dat > cfg_proc.counts

Question 5

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import json

#### Part 1 计算MLE
#计数
def Generate_count(File):
Count_xyy = {}
Count_xy = {}
Count_x = {}
Count_y = {}
with open(File) as f:
for i in f.readlines():
#分隔数据
data = i.strip().split()
#判断
if data[1] == "NONTERMINAL":
Count_x[data[2]] = float(data[0])
elif data[1] == "BINARYRULE":
if data[2] not in Count_xyy:
Count_xyy[data[2]] = {}
Count_xyy[data[2]][(data[3], data[4])] = float(data[0])
else:
if data[2] not in Count_xy:
Count_xy[data[2]] = {}
Count_xy[data[2]][data[3]] = float(data[0])
#统计单词数量
if data[3] in Count_y:
Count_y[data[3]] += float(data[0])
else:
Count_y[data[3]] = float(data[0])

return Count_xyy, Count_xy, Count_x, Count_y

def Generate_mle(Count_xyy, Count_xy, Count_x, Count_y):
#计算MLE
q_xyy = {}
q_xy = {}
for i in Count_x:
q_xy[i] = {}
q_xyy[i] = {}
if i in Count_xy:
for j in Count_xy[i]:
q_xy[i][j] = Count_xy[i][j] / Count_x[i]
if i in Count_xyy:
for k in Count_xyy[i]:
q_xyy[i][k] = Count_xyy[i][k] / Count_x[i]

return q_xyy, q_xy

#### Part2: CKY
def CKY(word, Count_xyy, Count_xy, Count_x, Count_y, q_xyy, q_xy):
n = len(word)
Pi = {}
Bp = {}
for i in range(n):
for j in range(i, n):
Pi[(i, j)] = {}
Bp[(i, j)] = {}

for i in range(n):
for x in Count_x:
if x in Count_xy and word[i] in Count_xy[x]:
Pi[(i, i)][x] = q_xy[x][word[i]]

for l in range(1, n):
for i in range(n - l):
j = i + l
for X in Count_x:
#概率
p = 0
#最佳规则以及位置
y1 = ""
z1 = ""
s1 = ""
if X in q_xyy:
for (Y, Z) in q_xyy[X]:
for s in range(i, j):
if Y in Pi[(i, s)] and Z in Pi[(s + 1, j)]:
p1 = q_xyy[X][(Y, Z)] * Pi[(i, s)][Y] * Pi[(s + 1, j)][Z]
if p1 > p:
p = p1
y1 = Y
z1 = Z
s1 = s
#更新最大值
if p > 0:
Pi[(i, j)][X] = p
Bp[(i, j)][X] = (y1, z1, s1)

pmax = 0
node = ""
if "S" in Pi[(0, n - 1)]:
pmax = Pi[(0, n - 1)]["S"]
node = "S"
else:
for i in Pi[(0, n - 1)]:
if Pi[(0, n - 1)][i] > pmax:
pmax = Pi[(0, n - 1)][i]
node = i

return Pi, Bp, node

#生成树
def tree(Bp, word, i, j, node):
if j == i:
return [node, word[i]]
else:
lnode, rnode, k = Bp[(i, j)][node]
left = tree(Bp, word, i, k, lnode)
right = tree(Bp, word, k + 1, j, rnode)
return [node, left, right]

#生成结果
def Generate(File, output, Count_xyy, Count_xy, Count_x, Count_y, q_xyy, q_xy):
Res = []
with open(File) as f:
for i in f.readlines():
data = i.strip().split(" ")
word = i.strip().split(" ")
for j in range(len(data)):
if data[j] not in Count_y or Count_y[data[j]] < 5:
data[j] = "_RARE_"

#CKY算法
Pi, Bp, node = CKY(data, Count_xyy, Count_xy, Count_x, Count_y, q_xyy, q_xy)
n = len(word)
#生成树
res = tree(Bp, word, 0, n - 1, node)
Res.append(res)

with open(output, "w+") as f:
for res in Res:
r = json.dumps(res)
f.write(r)
f.write("\n")

#### 运行
File = "cfg_proc.counts"
f1 = "parse_dev.dat"
o1 = "parse_dev_result.dat"
Count_xyy, Count_xy, Count_x, Count_y = Generate_count(File)
q_xyy, q_xy = Generate_mle(Count_xyy, Count_xy, Count_x, Count_y)
Generate(f1, o1, Count_xyy, Count_xy, Count_x, Count_y, q_xyy, q_xy)

#检测结果
#python eval_parser.py parse_dev.key parse_dev_result.dat

Question 6

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
from Q4 import *
from Q5 import *

#### Q6
File1 = "cfg_vert.counts"
Count_x = Count(File1)
f1 = "parse_train_vert.dat"
o1 = "parse_train_vert_proc.dat"
Replace(f1, o1, Count_x)

#python count_cfg_freq.py parse_train_vert_proc.dat > cfg_vert_proc.counts

File2 = "cfg_vert_proc.counts"
f2 = "parse_dev.dat"
o2 = "parse_dev_vert_result.dat"
Count_xyy, Count_xy, Count_x, Count_y = Generate_count(File2)
q_xyy, q_xy = Generate_mle(Count_xyy, Count_xy, Count_x, Count_y)
Generate(f2, o2, Count_xyy, Count_xy, Count_x, Count_y, q_xyy, q_xy)

本文标题:Michael Collins NLP Homework 2

文章作者:Doraemonzzz

发布时间:2020年04月21日 - 12:02:00

最后更新:2020年05月07日 - 20:30:33

原始链接:http://doraemonzzz.com/2020/04/21/Michael Collins NLP Homework 2/

许可协议: 署名-非商业性使用-禁止演绎 4.0 国际 转载请保留原文链接及作者。