Michael Collins NLP Homework 1

课程主页:http://www.cs.columbia.edu/~cs4705/

课程网盘地址:

链接:https://pan.baidu.com/s/1KijgO7yjL_MVCC9zKZ7Jdg
提取码:t1i3

整理资料的时候发现拖欠了很多课程,后续会一一完成,这一次先回顾Michael Collins NLP作业1。

Question 1

假设句子$S_i$的长度为$|S_i|$,记

$$M=\sum_{i=1}^{n}\left(|S_i|+1\right)$$

考虑如下量

$$L\left(\lambda_{1}, \lambda_{2}, \lambda_{3}\right)=\sum_{i=1}^{n} \log p\left(S_{i}\right)$$

那么验证集上的困惑度为

$$\text{Perplexity}=2^{-l},\quad l=\frac{1}{M} L\left(\lambda_{1}, \lambda_{2}, \lambda_{3}\right)$$

所以选择$\lambda$最大化$L\left(\lambda_{1}, \lambda_{2}, \lambda_{3}\right)$等价于选择$\lambda$最小化验证集上的困惑度。

Question 2

我的理解是$\lambda_i^j$较多,很难找到具体的值,更难找到最优的选择。

Question 3

初始化:

算法:

  • 对$k=1\ldots n$,

    • 对$u\in \mathcal T(x_{k-1}),v\in \mathcal T(x_{k})$,
  • 返回

Question 4

本题分为四步,第一步统计词频,计算MLE:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#### Part 1: compute the MLE emission parameters from ner.counts.
# Count_x_y[(tag, word)] = Count(y, x); Count_y[tag] = Count(y);
# Count_x[word] = Count(x) summed over all tags.
File = "ner.counts"
Count_x_y = {}
Count_y = {}
Count_x = {}
with open(File) as f:
    # Iterate the file directly instead of readlines(): avoids loading
    # the whole counts file into memory at once.
    for line in f:
        data = line.split()
        # Emission counts are on lines of the form:
        # "<count> WORDTAG <tag> <word>"
        if data[1] == "WORDTAG":
            count = float(data[0])
            # Count(y, x)
            Count_x_y[(data[2], data[3])] = count
            # Accumulate Count(y) and Count(x); dict.get replaces the
            # original's separate membership test + branch.
            Count_y[data[2]] = Count_y.get(data[2], 0.0) + count
            Count_x[data[3]] = Count_x.get(data[3], 0.0) + count

# MLE estimate: e(x|y) = Count(y, x) / Count(y).
e = {pair: count / Count_y[pair[0]] for pair, count in Count_x_y.items()}

第二步根据第一步的结果处理低频词:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
#### Part 2: replace low-frequency words (count < 5) with _RARE_.
File = "ner_train.dat"
res = []
with open(File) as f:
    for line in f:
        data = line.split()
        if len(data) > 0:
            # .get guards against a word missing from Count_x, where
            # the original indexing would raise KeyError; an unseen
            # word is treated as rare.
            if Count_x.get(data[0], 0) < 5:
                data[0] = "_RARE_"
            line = ' '.join(data) + "\n"
        # Blank lines (sentence separators) are kept verbatim.
        res.append(line)

# Write the processed training file in one call.
File = "ner_train_proc.dat"
with open(File, "w+") as f:
    f.writelines(res)

生成ner_train_proc.dat文件后,运行如下命令产生新的数据集:

1
python count_freqs.py ner_train_proc.dat > ner_proc.counts

第三步,对处理后的结果重新计算MLE:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
#### Part 3: recompute the counts from the processed training data.
# Same structure as Part 1, but over ner_proc.counts (with _RARE_).
File = "ner_proc.counts"
Count_x_y_proc = {}
Count_y_proc = {}
Count_x_proc = {}
with open(File) as f:
    # Iterate the file directly instead of readlines().
    for line in f:
        data = line.split()
        # "<count> WORDTAG <tag> <word>" lines carry emission counts.
        if data[1] == "WORDTAG":
            count = float(data[0])
            # Count(y, x)
            Count_x_y_proc[(data[2], data[3])] = count
            # Accumulate Count(y) and Count(x) via dict.get instead of
            # the membership-test-plus-branch of the original.
            Count_y_proc[data[2]] = Count_y_proc.get(data[2], 0.0) + count
            Count_x_proc[data[3]] = Count_x_proc.get(data[3], 0.0) + count

第四步计算$y=\arg \max _{y} e(x | y)$:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#### Part 4: baseline tagger y* = argmax_y e(x|y) on the dev set.

# Group the emission probabilities by word so the per-word argmax is a
# single lookup: e_proc[word][(tag, word)] = e(word|tag).
e_proc = {}
for word in Count_x_proc:
    e_proc[word] = {}

for pair in Count_x_y_proc:
    e_proc[pair[1]][pair] = Count_x_y_proc[pair] / Count_y_proc[pair[0]]


# Tag the dev data; also collect the sentences for reuse in Question 5.
File = "ner_dev.dat"
res = []
# Sentences of the dev data (each a list of words).
Sentence = []
s = []
with open(File) as f:
    for line in f:
        data = line.split()
        if len(data) > 0:
            key = data[0]
            # Map unseen / low-frequency words to the _RARE_ class so
            # they hit the _RARE_ emission table.
            if key not in Count_x or Count_x[key] < 5:
                key = "_RARE_"
            # argmax over tags of e(key|tag).
            p = 0
            label = ""
            for pair in e_proc[key]:
                p1 = e_proc[key][pair]
                if p1 > p:
                    p = p1
                    label = pair[0]
            line = ' '.join([data[0], label, str(p)]) + "\n"
            s += data
        else:
            # A blank line ends the current sentence.
            Sentence.append(s)
            s = []
        res.append(line)

# Keep a trailing sentence even when the file lacks a final blank line
# (the original silently dropped it).
if s:
    Sentence.append(s)
    s = []

# Write the tagged dev file.
File = "ner_dev_proc.dat"
with open(File, "w+") as f:
    f.writelines(res)

最后评估结果

1
python eval_ne_tagger.py ner_dev.key ner_dev_proc.dat
1
2
3
4
5
6
7
8
Found 14043 NEs. Expected 5931 NEs; Correct: 3117.

precision recall F1-Score
Total: 0.221961 0.525544 0.312106
PER: 0.435451 0.231230 0.302061
ORG: 0.475936 0.399103 0.434146
LOC: 0.147750 0.870229 0.252612
MISC: 0.491689 0.610206 0.544574

可以看到效果不是很理想。

Question 5

这部分实现维特比算法。

第一步,计算MLE:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
#### Q5 Part 1: MLE transition parameters q(w|u, v) from n-gram counts.
File = "ner_proc.counts"
# q2[(u, v)] = Count(u, v); q3[(u, v)] maps each trigram starting with
# (u, v) to Count(u, v, w).
q2 = {}
q3 = {}
with open(File) as f:
    for line in f:
        data = line.split()
        if data[1] == "2-GRAM":
            q2[(data[2], data[3])] = float(data[0])
            q3.setdefault((data[2], data[3]), {})
        elif data[1] == "3-GRAM":
            # setdefault guards against a 3-GRAM line appearing before
            # its 2-GRAM line; the original assumed 2-GRAMs come first
            # in the counts file and would KeyError otherwise.
            q3.setdefault((data[2], data[3]), {})[(data[2], data[3], data[4])] = float(data[0])

# MLE: q(w|u, v) = Count(u, v, w) / Count(u, v).
q = {}
for bigram in q3:
    for trigram in q3[bigram]:
        q[trigram] = q3[bigram][trigram] / q2[bigram]

第二步,实现维特比算法:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#### Part 2: the Viterbi algorithm
def Viterbi(sentence, q, e):
    """Viterbi decoding for a trigram HMM tagger.

    Parameters:
        sentence: list of words (one sentence).
        q: dict mapping trigrams (w, u, v) to transition probabilities.
        e: dict mapping words to {(tag, word): e(word|tag)} tables.

    Returns:
        (tags, pmax): the most likely tag sequence and its probability.
        If no valid path exists, pmax is 0 and the tags are "".

    NOTE(review): relies on the module-level Count_y (tag inventory)
    and Count_x (word frequencies for the _RARE_ mapping) — global
    coupling inherited from the notebook this came from.
    """
    # Tag set; positions -1 and 0 use the boundary symbol "*".
    K = list(Count_y.keys())
    # Pi[k][(u, v)] = max probability of a tag sequence ending in tags
    # (u, v) at position k; bp holds the matching back-pointers.
    Pi = {}
    bp = {}
    n = len(sentence)
    for i in range(n + 1):
        Pi[i - 1] = {}
        bp[i - 1] = {}
    # Base case: the sequence starts with the pair (*, *).
    Pi[-1][("*", "*")] = 1
    # Forward pass over the words of the sentence.
    for k in range(n):
        # Candidate tags for positions k-2 (K0), k-1 (K1), k (K2);
        # the first two positions are pinned to the boundary "*".
        K0 = K
        K1 = K
        K2 = K
        if k == 0:
            K0 = ["*"]
            K1 = ["*"]
        elif k == 1:
            K0 = ["*"]

        for u in K1:
            for v in K2:
                p = 0
                w_arg = ""
                key = sentence[k]
                # Low-frequency / unseen words use the _RARE_ emissions.
                if key not in Count_x or Count_x[key] < 5:
                    key = "_RARE_"
                for w in K0:
                    # Only combine entries that actually exist; missing
                    # parameters mean probability 0 and are skipped.
                    if (w, u) in Pi[k - 1] and (w, u, v) in q and (v, key) in e[key]:
                        p1 = Pi[k - 1][(w, u)] * q[(w, u, v)] * e[key][(v, key)]
                        if p1 > p:
                            p = p1
                            w_arg = w
                Pi[k][(u, v)] = p
                bp[k][(u, v)] = w_arg

    # Termination: choose the best final tag pair via q(STOP|u, v).
    if n == 1:
        # One-word sentence: u may only be the boundary "*".
        y = ""
        pmax = 0
        for u in K + ["*"]:
            for v in K + ["*"]:
                if (u, v) in Pi[n - 1] and (u, v, "STOP") in q:
                    p = Pi[n - 1][(u, v)] * q[(u, v, "STOP")]
                    if p > pmax:
                        pmax = p
                        y = v
        tag = [y]
    else:
        # Best last two tags.
        y0 = ""
        y1 = ""
        pmax = 0
        for u in K:
            for v in K:
                if (u, v) in Pi[n - 1] and (u, v, "STOP") in q:
                    p = Pi[n - 1][(u, v)] * q[(u, v, "STOP")]
                    if p > pmax:
                        pmax = p
                        y0 = u
                        y1 = v

        tag = [y1, y0]

        # Follow the back-pointers from position n-3 down to 0.
        for k in range(n - 3, -1, -1):
            y = bp[k + 2][(y0, y1)]
            tag.append(y)
            # Shift the trailing tag pair.
            y1 = y0
            y0 = y

    # The sequence was built back-to-front; reverse it (no-op for n == 1).
    tag = tag[::-1]

    return tag, pmax

第三步,产生结果:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
#### Q5 Part 3: run Viterbi over the dev sentences and write the results.
import math

Res_viterbi = []
P = []

File = "ner_dev_viterbi.dat"
with open(File, "w+") as f:
    for sentence in Sentence:
        tag, p = Viterbi(sentence, q, e_proc)
        # A zero probability means no valid tag path was found:
        # surface the offending sentence and stop.
        if p == 0:
            print(sentence)
            break
        # math.log replaces np.log: numpy is never imported in this
        # script and p is a plain float, so the value is identical.
        logp = math.log(p)
        Res_viterbi.append(tag)
        P.append(p)
        for i in range(len(sentence)):
            # f.write, not f.writelines, for a single string.
            f.write(' '.join([sentence[i], tag[i], str(logp)]) + "\n")
        # Blank line separates sentences.
        f.write("\n")

最后评估结果

1
python eval_ne_tagger.py ner_dev.key ner_dev_viterbi.dat
1
2
3
4
5
6
7
8
Found 4704 NEs. Expected 5931 NEs; Correct: 3647.

precision recall F1-Score
Total: 0.775298 0.614905 0.685849
PER: 0.762535 0.595756 0.668907
ORG: 0.611855 0.478326 0.536913
LOC: 0.876458 0.696292 0.776056
MISC: 0.830065 0.689468 0.753262

可以看到效果比之前好很多。

本文标题:Michael Collins NLP Homework 1

文章作者:Doraemonzzz

发布时间:2020年04月08日 - 12:47:00

最后更新:2020年04月08日 - 12:54:01

原始链接:http://doraemonzzz.com/2020/04/08/Michael Collins NLP Homework 1/

许可协议: 署名-非商业性使用-禁止演绎 4.0 国际 转载请保留原文链接及作者。