Michael Collins NLP Homework 3

课程主页:http://www.cs.columbia.edu/~cs4705/

课程网盘地址:

链接:https://pan.baidu.com/s/1KijgO7yjL_MVCC9zKZ7Jdg
提取码:t1i3

这一次回顾Michael Collins NLP作业3。

Quesion 1

(a)假设法语单词的数量为$|F|$

  • 初始化$p=1$
  • $\text{for }j=1,\ldots,m:$
    • $p=\max_{f_j,a_j} p\times t\left(f_{j} | e_{a_{j}}\right) \times q\left(a_{j} | j, l, m\right)$
    • $f_j,a_j=\arg\max_{f_j,a_j} p\times t\left(f_{j} | e_{a_{j}}\right) \times q\left(a_{j} | j, l, m\right)$
  • $\text{return } p,f_j,a_j,j=1,\ldots,m$

不难看出时间复杂度为

(b)注意恒等式

所以可以得到如下算法

  • $\text{for }j=1,\ldots,m:$
    • $f_j =\arg \max_{f_j} \sum_{a_{j}=0}^{l} t\left(f_{j} | e_{a_{j}}\right) q\left(a_{j} | j, l, m\right)$

时间复杂度为

(c)注意到

所以两种搜索的效果等价,但是

需要计算$p(\mathbf f)$

Question 2

假设法语单词的数量为$|F|$

  • 初始化$p(j,f,a)=1,\text{for }a=0,\ldots, l, j=0,\ldots, m,f\in F$
  • $\text{for }j=1,\ldots,m:$
    • $\text{for }a=0,\ldots, l, f\in F:$
      • $p(j,f, a)=\max_{f’,a’} p(j-1, f’,a’)\times t\left(f| e_{a}\right) \times q\left(a |a’, j, l, m\right)$
      • $bp(j,f,a)=\arg \max_{f’,a’} p(j-1, f’,a’)\times t\left(f| e_{a}\right) \times q\left(a |a’, j, l, m\right)$
  • $f_m,a_m= \arg\max_{f,a} p(m, f,a)$
  • $\text{for } j=m-1,\ldots,1:$
    • $f_j,a_j= bp(j, f_{j+1},a_{j+1})$

不难看出时间复杂度为

Question 3

使用类似CKY算法的思路即可。

定义

  • $\text{for }l=1,\ldots,N-1:$

    • $\text{for }i=1,\ldots, N-l:$

    • 令$j=i+l$

    • 计算

  • 返回$\pi(1,N,x)$

时间复杂度为

Question 4

因为许多部分要复用,所以编写了helper模块

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import copy
NULL = "NULL"

def process(filename, flag=False):
data = []
vocab = []
with open(filename) as f:
for sentence in f.readlines():
word = sentence.split()
if flag:
word = [NULL] + word
data.append(word)
vocab += word
return data, set(vocab)

def get_count(en, de):
#生成计数
C1 = dict()
C2 = dict()
C3 = dict()
C4 = dict()
n = len(en)

for k in range(n):
mk = len(de[k])
#包括NULL
lk = len(en[k])
for i in range(mk):
for j in range(lk):
e = en[k][j]
f = de[k][i]
C1[(e, f)] = 0
C2[e] = 0
C3[(j, i, lk, mk)] = 0
C4[(i, lk, mk)] = 0

return C1, C2, C3, C4

#IBM模型
def IBM_model(en, de, t, q, S, C1, C2, C3, C4, delta, flag=False):
n = len(en)
for s in range(S):
c1 = copy.deepcopy(C1)
c2 = copy.deepcopy(C2)
c3 = copy.deepcopy(C3)
c4 = copy.deepcopy(C4)
for k in range(n):
mk = len(de[k])
#包括NULL
lk = len(en[k])
for i in range(mk):
for j in range(lk):
e = en[k][j]
f = de[k][i]
#计算delta
d = delta(t, q, k, i, j, lk, mk)
#更新
c1[(e, f)] += d
c2[e] += d
c3[(j, i, lk, mk)] += d
c4[(i, lk, mk)] += d

#更新t
for e in t:
for f in t[e]:
t[e][f] = c1[(e, f)] / c2[e]

#更新q
if flag:
for k1 in C3:
k2 = k1[1:]
q[k1] = c3[k1] / c4[k2]

Q4代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import pickle
from helper import *

#读取t
filename = "data.txt"
with open(filename) as f:
t = pickle.load(f)

#读取单词
en, vocab_en = process("corpus.en", True)
de, vocab_de = process("corpus.de")

#获得计数
C1, C2, C3, C4 = get_count(en, de)

def delta1(t, q, k, i, j, lk, mk):
e = en[k][j]
f = de[k][i]
s = 0
for l in range(lk):
e1 = en[k][l]
s += t[e1][f]

return t[e][f] / s

#训练
q = dict()
S = 5
IBM_model(en, de, t, q, S, C1, C2, C3, C4, delta1)

#保存结果
res = []
with open("devwords.txt") as f:
for word in f.readlines():
e = word.split()[0]
f = list(t[e].keys())
values = list(t[e].values())

#排序
data = zip(f, values)
data.sort(key=lambda x: -x[1])
#保存结果
res.append([f[0] for f in data[:10]])

with open("Q4_word.txt", "w+") as f:
for data in res:
f.writelines(" ".join(data) + "\n")

#alignment
Alignment = []
for k in range(20):
mk = len(de[k])
lk = len(en[k])
alignment = []
for i in range(mk):
f = de[k][i]
p = 0
a = 0
for j in range(lk):
e = en[k][j]
if t[e][f] > p:
p = t[e][f]
a = j
alignment.append(a)
Alignment.append(alignment)

with open("Q4_aligment.txt", "w+") as f:
for alignment in Alignment:
f.write(" ".join(map(str, alignment)) + "\n")

#保存模型1的结果
with open("Q4_t.txt", "wb") as f:
pickle.dump(t, f)

Question 5

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import pickle
from helper import *

#读取t
filename = "Q4_t.txt"
with open(filename) as f:
t = pickle.load(f)

#读取单词
en, vocab_en = process("corpus.en", True)
de, vocab_de = process("corpus.de")

#获得计数
C1, C2, C3, C4 = get_count(en, de)

def delta2(t, q, k, i, j, lk, mk):
e = en[k][j]
f = de[k][i]
s = 0
for l in range(lk):
e1 = en[k][l]
s += t[e1][f] * q[(l, i, lk, mk)]

return t[e][f] * q[(j, i, lk, mk)] / s

#训练
q = dict()
for key in C3:
l = key[2]
q[key] = 1.0 / l

S = 5
IBM_model(en, de, t, q, S, C1, C2, C3, C4, delta2, True)

#alignment
Alignment = []
for k in range(20):
mk = len(de[k])
lk = len(en[k])
alignment = []
for i in range(mk):
f = de[k][i]
p = 0
a = 0
for j in range(lk):
e = en[k][j]
if t[e][f] * q[(j, i, lk, mk)] > p:
p = t[e][f] * q[(j, i, lk, mk)]
a = j
alignment.append(a)
Alignment.append(alignment)

with open("Q5_aligment.txt", "w+") as f:
for alignment in Alignment:
f.write(" ".join(map(str, alignment)) + "\n")

#保存模型1的结果
with open("Q5_t.txt", "wb") as f:
pickle.dump(t, f)

with open("Q5_q.txt", "wb") as f:
pickle.dump(q, f)

Question 6

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import numpy as np
import pickle
from helper import *

f1 = "Q5_t.txt"
f2 = "Q5_q.txt"
with open(f1) as f:
t = pickle.load(f)

with open(f2) as f:
q = pickle.load(f)

#读取单词
en, vocab_en = process("scrambled.en", True)
de, vocab_de = process("original.de")

order = []

s1 = de[0]

INF = -1e10
eps = 1e-20

for s1 in de:
m = len(s1)
Log_prob = -1e100
Index = 0
for index, s2 in enumerate(en):
#当前句子的对数分数
log_prob = 0
l = len(s2)
for i in range(m):
f = s1[i]
#初始化
score = INF
alignment = 0
#找到最优的alignment
for a in range(l):
k1 = (a, i, l, m)
e = s2[a]
if (k1 in q) and (e in t) and (f in t[e]):
s = np.log(q[k1] + eps) + np.log(t[e][f] + eps)
if s > score:
score = s
aligment = a
#更新logprob
log_prob += score
if log_prob > Log_prob:
Index = index
Log_prob = log_prob

order.append(Index)


#生成结果
scrambled = []
with open("scrambled.en") as f:
for sentence in f.readlines():
scrambled.append(sentence)


filename = "unscrambled.en"
with open(filename, "wb") as f:
for index in order:
f.writelines(scrambled[index])

最后结果

1
2
Right   Total   Acc
92 100 0.920

本文标题:Michael Collins NLP Homework 3

文章作者:Doraemonzzz

发布时间:2020年05月07日 - 20:26:00

最后更新:2020年05月07日 - 20:55:58

原始链接:http://doraemonzzz.com/2020/05/07/Michael Collins NLP Homework 3/

许可协议: 署名-非商业性使用-禁止演绎 4.0 国际 转载请保留原文链接及作者。