Michael Collins NLP Homework 3
Course homepage: http://www.cs.columbia.edu/~cs4705/
Course files (Baidu netdisk):
Link: https://pan.baidu.com/s/1KijgO7yjL_MVCC9zKZ7Jdg
Access code: t1i3
This post reviews Michael Collins' NLP Homework 3.
Question 1
(a) Suppose the number of French words is $|F|$.
- Initialize $p=1$
- $\text{for }j=1,\ldots,m:$
- $p=\max_{f_j,a_j} p\times t\left(f_{j} | e_{a_{j}}\right) \times q\left(a_{j} | j, l, m\right)$
- $f_j,a_j=\arg\max_{f_j,a_j} p\times t\left(f_{j} | e_{a_{j}}\right) \times q\left(a_{j} | j, l, m\right)$
- $\text{return } p,f_j,a_j,j=1,\ldots,m$
It is not hard to see that the time complexity is $O(m(l+1)|F|)$, since each of the $m$ positions maximizes over $(l+1)|F|$ choices of $(a_j, f_j)$.
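As a sanity check, here is a minimal Python sketch of the procedure above. The data layout is my own assumption, not part of the assignment: `t` is a nested dict with `t[e][f]` holding $t(f|e)$, `q` is a dict keyed by `(a, j, l, m)`, `e_sent` is the English sentence as a list with `e_sent[0]` equal to NULL, and `F` is the set of French words.

def decode_joint(e_sent, m, t, q, F):
    # Greedily pick the best (f_j, a_j) for each position j = 1, ..., m.
    l = len(e_sent) - 1                  # e_sent[0] is NULL
    p = 1.0
    f_best, a_best = [], []
    for j in range(1, m + 1):
        best, bf, ba = -1.0, None, None
        for a in range(l + 1):           # alignment to e_0 = NULL, ..., e_l
            for f in F:                  # candidate French words
                score = t.get(e_sent[a], {}).get(f, 0.0) * q.get((a, j, l, m), 0.0)
                if score > best:
                    best, bf, ba = score, f, a
        p *= best
        f_best.append(bf)
        a_best.append(ba)
    return p, f_best, a_best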
(b) Note the identity
$$\sum_{a_{1}=0}^{l} \cdots \sum_{a_{m}=0}^{l} \prod_{j=1}^{m} t\left(f_{j} | e_{a_{j}}\right) q\left(a_{j} | j, l, m\right)=\prod_{j=1}^{m} \sum_{a_{j}=0}^{l} t\left(f_{j} | e_{a_{j}}\right) q\left(a_{j} | j, l, m\right)$$
so we obtain the following algorithm:
- $\text{for }j=1,\ldots,m:$
- $f_j =\arg \max_{f_j} \sum_{a_{j}=0}^{l} t\left(f_{j} | e_{a_{j}}\right) q\left(a_{j} | j, l, m\right)$
The time complexity is $O(m(l+1)|F|)$: each of the $m$ positions scores $|F|$ candidates, and each score sums over $l+1$ alignments.
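A sketch of (b) under the same assumed data layout as before; the only change is that each $f_j$ is scored by summing over the alignments $a_j$ rather than maximizing over them jointly.

def decode_marginal(e_sent, m, t, q, F):
    # Pick each f_j independently by its marginal score over alignments.
    l = len(e_sent) - 1
    f_best = []
    for j in range(1, m + 1):
        best, bf = -1.0, None
        for f in F:
            s = sum(t.get(e_sent[a], {}).get(f, 0.0) * q.get((a, j, l, m), 0.0)
                    for a in range(l + 1))
            if s > best:
                best, bf = s, f
        f_best.append(bf)
    return f_best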
(c) Note that the two search procedures give equivalent results; however, the latter requires computing $p(\mathbf f)$.
Question 2
Suppose the number of French words is $|F|$.
- Initialize $p(j,f,a)=1$ for $a=0,\ldots, l$, $j=0,\ldots, m$, $f\in F$
- $\text{for }j=1,\ldots,m:$
- $\text{for }a=0,\ldots, l, f\in F:$
- $p(j,f, a)=\max_{f',a'} p(j-1, f',a')\times t\left(f| e_{a}\right) \times q\left(a |a', j, l, m\right)$
- $bp(j,f,a)=\arg \max_{f',a'} p(j-1, f',a')\times t\left(f| e_{a}\right) \times q\left(a |a', j, l, m\right)$
- $f_m,a_m= \arg\max_{f,a} p(m, f,a)$
- $\text{for } j=m-1,\ldots,1:$
- $f_j,a_j= bp(j+1, f_{j+1},a_{j+1})$
It is not hard to see that the time complexity is $O(m(l+1)^2|F|^2)$: each of the $m$ positions fills $(l+1)|F|$ cells, and each cell maximizes over $(l+1)|F|$ predecessors.
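A sketch of the Viterbi recursion above. Here `q` is assumed to be keyed by `(a, a_prev, j, l, m)`, mirroring $q(a_j|a_{j-1}, j, l, m)$; as before, the dict layout is my own convention rather than anything fixed by the assignment.

def viterbi_decode(e_sent, m, t, q, F):
    l = len(e_sent) - 1
    # pi[(f, a)]: best score of any prefix ending with (f_j, a_j) = (f, a)
    pi = {(f, a): 1.0 for f in F for a in range(l + 1)}
    bp = {}
    for j in range(1, m + 1):
        new_pi = {}
        for a in range(l + 1):
            for f in F:
                best, prev = -1.0, None
                for (f2, a2), p2 in pi.items():
                    score = p2 * t.get(e_sent[a], {}).get(f, 0.0) * q.get((a, a2, j, l, m), 0.0)
                    if score > best:
                        best, prev = score, (f2, a2)
                new_pi[(f, a)] = best
                bp[(j, f, a)] = prev       # best (f_{j-1}, a_{j-1}) for this cell
        pi = new_pi
    # backtrace from the best final state
    f_j, a_j = max(pi, key=pi.get)
    path = [(f_j, a_j)]
    for j in range(m, 1, -1):
        f_j, a_j = bp[(j, f_j, a_j)]
        path.append((f_j, a_j))
    return list(reversed(path))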
Question 3
The idea is similar to the CKY algorithm.
Define
$\text{for }l=1,\ldots,N-1:$
$\text{for }i=1,\ldots, N-l:$
Let $j=i+l$
Compute
Return $\pi(1,N,x)$
The time complexity is
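Since the answer only appeals to the general CKY template, here is a reference sketch of standard CKY for a PCFG in Chomsky normal form; the rule representations (`unary_rules`, `binary_rules`) are my own assumptions, and this is not the specific recurrence for this question.

from collections import defaultdict

def cky(sentence, unary_rules, binary_rules, root="S"):
    # unary_rules[word]: list of (X, prob) for rules X -> word
    # binary_rules[X]:   list of (Y, Z, prob) for rules X -> Y Z
    n = len(sentence)
    pi = defaultdict(float)          # pi[(i, j, X)]: best probability of X spanning words i..j
    for i in range(1, n + 1):
        for X, prob in unary_rules.get(sentence[i - 1], []):
            pi[(i, i, X)] = prob
    for l in range(1, n):            # span length
        for i in range(1, n - l + 1):
            j = i + l
            for X, rules in binary_rules.items():
                best = 0.0
                for Y, Z, prob in rules:
                    for s in range(i, j):            # split point
                        cand = prob * pi[(i, s, Y)] * pi[(s + 1, j, Z)]
                        if cand > best:
                            best = cand
                if best > 0.0:
                    pi[(i, j, X)] = best
    return pi[(1, n, root)]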
Question 4
Since many parts are reused, I wrote a helper module:
import copy

NULL = "NULL"

def process(filename, flag=False):
    # Read a corpus file; each line is one sentence.
    # If flag is True, prepend NULL to every sentence (used for the English side).
    data = []
    vocab = []
    with open(filename) as f:
        for sentence in f.readlines():
            word = sentence.split()
            if flag:
                word = [NULL] + word
            data.append(word)
            vocab += word
    return data, set(vocab)

def get_count(en, de):
    # build the count dictionaries, initialized to zero
    C1 = dict()
    C2 = dict()
    C3 = dict()
    C4 = dict()
    n = len(en)
    for k in range(n):
        mk = len(de[k])
        # includes NULL
        lk = len(en[k])
        for i in range(mk):
            for j in range(lk):
                e = en[k][j]
                f = de[k][i]
                C1[(e, f)] = 0
                C2[e] = 0
                C3[(j, i, lk, mk)] = 0
                C4[(i, lk, mk)] = 0
    return C1, C2, C3, C4

# IBM model: S iterations of EM, where delta computes the alignment posteriors
def IBM_model(en, de, t, q, S, C1, C2, C3, C4, delta, flag=False):
    n = len(en)
    for s in range(S):
        c1 = copy.deepcopy(C1)
        c2 = copy.deepcopy(C2)
        c3 = copy.deepcopy(C3)
        c4 = copy.deepcopy(C4)
        for k in range(n):
            mk = len(de[k])
            # includes NULL
            lk = len(en[k])
            for i in range(mk):
                for j in range(lk):
                    e = en[k][j]
                    f = de[k][i]
                    # compute delta
                    d = delta(t, q, k, i, j, lk, mk)
                    # update the expected counts
                    c1[(e, f)] += d
                    c2[e] += d
                    c3[(j, i, lk, mk)] += d
                    c4[(i, lk, mk)] += d
        # update t
        for e in t:
            for f in t[e]:
                t[e][f] = c1[(e, f)] / c2[e]
        # update q (only for IBM Model 2)
        if flag:
            for k1 in C3:
                k2 = k1[1:]
                q[k1] = c3[k1] / c4[k2]
Q4 code:
import pickle

from helper import *

# load the initial translation parameters t
filename = "data.txt"
with open(filename, "rb") as f:
    t = pickle.load(f)

# read the corpora
en, vocab_en = process("corpus.en", True)
de, vocab_de = process("corpus.de")

# build the count dictionaries
C1, C2, C3, C4 = get_count(en, de)

def delta1(t, q, k, i, j, lk, mk):
    # IBM Model 1 posterior: t(f|e_j) / sum_l t(f|e_l)
    e = en[k][j]
    f = de[k][i]
    s = 0
    for l in range(lk):
        e1 = en[k][l]
        s += t[e1][f]
    return t[e][f] / s

# training
q = dict()
S = 5
IBM_model(en, de, t, q, S, C1, C2, C3, C4, delta1)

# save the results
res = []
with open("devwords.txt") as f:
    for word in f.readlines():
        e = word.split()[0]
        fs = list(t[e].keys())
        values = list(t[e].values())
        # sort by probability in descending order
        data = sorted(zip(fs, values), key=lambda x: -x[1])
        # keep the ten most likely translations
        res.append([pair[0] for pair in data[:10]])
with open("Q4_word.txt", "w+") as f:
    for data in res:
        f.write(" ".join(data) + "\n")

# alignment
Alignment = []
for k in range(20):
    mk = len(de[k])
    lk = len(en[k])
    alignment = []
    for i in range(mk):
        f = de[k][i]
        p = 0
        a = 0
        for j in range(lk):
            e = en[k][j]
            if t[e][f] > p:
                p = t[e][f]
                a = j
        alignment.append(a)
    Alignment.append(alignment)
with open("Q4_aligment.txt", "w+") as f:
    for alignment in Alignment:
        f.write(" ".join(map(str, alignment)) + "\n")

# save the Model 1 results
with open("Q4_t.txt", "wb") as f:
    pickle.dump(t, f)
Question 5
import pickle

from helper import *

# load t (the Model 1 parameters from Q4)
filename = "Q4_t.txt"
with open(filename, "rb") as f:
    t = pickle.load(f)

# read the corpora
en, vocab_en = process("corpus.en", True)
de, vocab_de = process("corpus.de")

# build the count dictionaries
C1, C2, C3, C4 = get_count(en, de)

def delta2(t, q, k, i, j, lk, mk):
    # IBM Model 2 posterior: t(f|e_j) q(j|i,l,m) / sum_l t(f|e_l) q(l|i,l,m)
    e = en[k][j]
    f = de[k][i]
    s = 0
    for l in range(lk):
        e1 = en[k][l]
        s += t[e1][f] * q[(l, i, lk, mk)]
    return t[e][f] * q[(j, i, lk, mk)] / s

# training: initialize q uniformly
q = dict()
for key in C3:
    l = key[2]
    q[key] = 1.0 / l
S = 5
IBM_model(en, de, t, q, S, C1, C2, C3, C4, delta2, True)

# alignment
Alignment = []
for k in range(20):
    mk = len(de[k])
    lk = len(en[k])
    alignment = []
    for i in range(mk):
        f = de[k][i]
        p = 0
        a = 0
        for j in range(lk):
            e = en[k][j]
            if t[e][f] * q[(j, i, lk, mk)] > p:
                p = t[e][f] * q[(j, i, lk, mk)]
                a = j
        alignment.append(a)
    Alignment.append(alignment)
with open("Q5_aligment.txt", "w+") as f:
    for alignment in Alignment:
        f.write(" ".join(map(str, alignment)) + "\n")

# save the Model 2 results
with open("Q5_t.txt", "wb") as f:
    pickle.dump(t, f)
with open("Q5_q.txt", "wb") as f:
    pickle.dump(q, f)
Question 6
import numpy as np
import pickle

from helper import *

f1 = "Q5_t.txt"
f2 = "Q5_q.txt"
with open(f1, "rb") as f:
    t = pickle.load(f)
with open(f2, "rb") as f:
    q = pickle.load(f)

# read the corpora
en, vocab_en = process("scrambled.en", True)
de, vocab_de = process("original.de")

order = []
INF = -1e10
eps = 1e-20
for s1 in de:
    m = len(s1)
    Log_prob = -1e100
    Index = 0
    for index, s2 in enumerate(en):
        # log score of the current English sentence
        log_prob = 0
        l = len(s2)
        for i in range(m):
            f = s1[i]
            # initialization
            score = INF
            alignment = 0
            # find the best alignment
            for a in range(l):
                k1 = (a, i, l, m)
                e = s2[a]
                if (k1 in q) and (e in t) and (f in t[e]):
                    s = np.log(q[k1] + eps) + np.log(t[e][f] + eps)
                    if s > score:
                        score = s
                        alignment = a
            # accumulate the log score
            log_prob += score
        if log_prob > Log_prob:
            Index = index
            Log_prob = log_prob
    order.append(Index)

# generate the result
scrambled = []
with open("scrambled.en") as f:
    for sentence in f.readlines():
        scrambled.append(sentence)
filename = "unscrambled.en"
with open(filename, "w") as f:
    for index in order:
        f.write(scrambled[index])
The final result:

| Right | Total | Acc   |
| ----- | ----- | ----- |
| 92    | 100   | 0.920 |