#### Quesion 1

(a)假设法语单词的数量为$|F|$

• 初始化$p=1$
• $\text{for }j=1,\ldots,m:$
• $p=\max_{f_j,a_j} p\times t\left(f_{j} | e_{a_{j}}\right) \times q\left(a_{j} | j, l, m\right)$
• $f_j,a_j=\arg\max_{f_j,a_j} p\times t\left(f_{j} | e_{a_{j}}\right) \times q\left(a_{j} | j, l, m\right)$
• $\text{return } p,f_j,a_j,j=1,\ldots,m$

(b)注意恒等式

• $\text{for }j=1,\ldots,m:$
• $f_j =\arg \max_{f_j} \sum_{a_{j}=0}^{l} t\left(f_{j} | e_{a_{j}}\right) q\left(a_{j} | j, l, m\right)$

(c)注意到

#### Question 2

• 初始化$p(j,f,a)=1,\text{for }a=0,\ldots, l, j=0,\ldots, m,f\in F$
• $\text{for }j=1,\ldots,m:$
• $\text{for }a=0,\ldots, l, f\in F:$
• $p(j,f, a)=\max_{f’,a’} p(j-1, f’,a’)\times t\left(f| e_{a}\right) \times q\left(a |a’, j, l, m\right)$
• $bp(j,f,a)=\arg \max_{f’,a’} p(j-1, f’,a’)\times t\left(f| e_{a}\right) \times q\left(a |a’, j, l, m\right)$
• $f_m,a_m= \arg\max_{f,a} p(m, f,a)$
• $\text{for } j=m-1,\ldots,1:$
• $f_j,a_j= bp(j, f_{j+1},a_{j+1})$

#### Question 3

• $\text{for }l=1,\ldots,N-1:$

• $\text{for }i=1,\ldots, N-l:$

• 令$j=i+l$

• 计算

• 返回$\pi(1,N,x)$

#### Question 4

import copy
NULL = "NULL"

def process(filename, flag=False):
data = []
vocab = []
with open(filename) as f:
for sentence in f.readlines():
word = sentence.split()
if flag:
word = [NULL] + word
data.append(word)
vocab += word
return data, set(vocab)

def get_count(en, de):
#生成计数
C1 = dict()
C2 = dict()
C3 = dict()
C4 = dict()
n = len(en)

for k in range(n):
mk = len(de[k])
#包括NULL
lk = len(en[k])
for i in range(mk):
for j in range(lk):
e = en[k][j]
f = de[k][i]
C1[(e, f)] = 0
C2[e] = 0
C3[(j, i, lk, mk)] = 0
C4[(i, lk, mk)] = 0

return C1, C2, C3, C4

#IBM模型
def IBM_model(en, de, t, q, S, C1, C2, C3, C4, delta, flag=False):
n = len(en)
for s in range(S):
c1 = copy.deepcopy(C1)
c2 = copy.deepcopy(C2)
c3 = copy.deepcopy(C3)
c4 = copy.deepcopy(C4)
for k in range(n):
mk = len(de[k])
#包括NULL
lk = len(en[k])
for i in range(mk):
for j in range(lk):
e = en[k][j]
f = de[k][i]
#计算delta
d = delta(t, q, k, i, j, lk, mk)
#更新
c1[(e, f)] += d
c2[e] += d
c3[(j, i, lk, mk)] += d
c4[(i, lk, mk)] += d

#更新t
for e in t:
for f in t[e]:
t[e][f] = c1[(e, f)] / c2[e]

#更新q
if flag:
for k1 in C3:
k2 = k1[1:]
q[k1] = c3[k1] / c4[k2]

Q4代码

import pickle
from helper import *

#读取t
filename = "data.txt"
with open(filename) as f:

#读取单词
en, vocab_en = process("corpus.en", True)
de, vocab_de = process("corpus.de")

#获得计数
C1, C2, C3, C4 = get_count(en, de)

def delta1(t, q, k, i, j, lk, mk):
e = en[k][j]
f = de[k][i]
s = 0
for l in range(lk):
e1 = en[k][l]
s += t[e1][f]

return t[e][f] / s

#训练
q = dict()
S = 5
IBM_model(en, de, t, q, S, C1, C2, C3, C4, delta1)

#保存结果
res = []
with open("devwords.txt") as f:
for word in f.readlines():
e = word.split()[0]
f = list(t[e].keys())
values = list(t[e].values())

#排序
data = zip(f, values)
data.sort(key=lambda x: -x[1])
#保存结果
res.append([f[0] for f in data[:10]])

with open("Q4_word.txt", "w+") as f:
for data in res:
f.writelines(" ".join(data) + "\n")

#alignment
Alignment = []
for k in range(20):
mk = len(de[k])
lk = len(en[k])
alignment = []
for i in range(mk):
f = de[k][i]
p = 0
a = 0
for j in range(lk):
e = en[k][j]
if t[e][f] > p:
p = t[e][f]
a = j
alignment.append(a)
Alignment.append(alignment)

with open("Q4_aligment.txt", "w+") as f:
for alignment in Alignment:
f.write(" ".join(map(str, alignment)) + "\n")

#保存模型1的结果
with open("Q4_t.txt", "wb") as f:
pickle.dump(t, f)

#### Question 5

import pickle
from helper import *

#读取t
filename = "Q4_t.txt"
with open(filename) as f:

#读取单词
en, vocab_en = process("corpus.en", True)
de, vocab_de = process("corpus.de")

#获得计数
C1, C2, C3, C4 = get_count(en, de)

def delta2(t, q, k, i, j, lk, mk):
e = en[k][j]
f = de[k][i]
s = 0
for l in range(lk):
e1 = en[k][l]
s += t[e1][f] * q[(l, i, lk, mk)]

return t[e][f] * q[(j, i, lk, mk)] / s

#训练
q = dict()
for key in C3:
l = key[2]
q[key] = 1.0 / l

S = 5
IBM_model(en, de, t, q, S, C1, C2, C3, C4, delta2, True)

#alignment
Alignment = []
for k in range(20):
mk = len(de[k])
lk = len(en[k])
alignment = []
for i in range(mk):
f = de[k][i]
p = 0
a = 0
for j in range(lk):
e = en[k][j]
if t[e][f] * q[(j, i, lk, mk)] > p:
p = t[e][f] * q[(j, i, lk, mk)]
a = j
alignment.append(a)
Alignment.append(alignment)

with open("Q5_aligment.txt", "w+") as f:
for alignment in Alignment:
f.write(" ".join(map(str, alignment)) + "\n")

#保存模型1的结果
with open("Q5_t.txt", "wb") as f:
pickle.dump(t, f)

with open("Q5_q.txt", "wb") as f:
pickle.dump(q, f)

#### Question 6

import numpy as np
import pickle
from helper import *

f1 = "Q5_t.txt"
f2 = "Q5_q.txt"
with open(f1) as f:

with open(f2) as f:

#读取单词
en, vocab_en = process("scrambled.en", True)
de, vocab_de = process("original.de")

order = []

s1 = de[0]

INF = -1e10
eps = 1e-20

for s1 in de:
m = len(s1)
Log_prob = -1e100
Index = 0
for index, s2 in enumerate(en):
#当前句子的对数分数
log_prob = 0
l = len(s2)
for i in range(m):
f = s1[i]
#初始化
score = INF
alignment = 0
#找到最优的alignment
for a in range(l):
k1 = (a, i, l, m)
e = s2[a]
if (k1 in q) and (e in t) and (f in t[e]):
s = np.log(q[k1] + eps) + np.log(t[e][f] + eps)
if s > score:
score = s
aligment = a
#更新logprob
log_prob += score
if log_prob > Log_prob:
Index = index
Log_prob = log_prob

order.append(Index)

#生成结果
scrambled = []
with open("scrambled.en") as f:
for sentence in f.readlines():
scrambled.append(sentence)

filename = "unscrambled.en"
with open(filename, "wb") as f:
for index in order:
f.writelines(scrambled[index])

Right   Total   Acc
92      100     0.920