Michael Collins NLP Homework 4

课程主页:http://www.cs.columbia.edu/~cs4705/

课程网盘地址:

链接:https://pan.baidu.com/s/1KijgO7yjL_MVCC9zKZ7Jdg
提取码:t1i3

这一次回顾Michael Collins NLP作业4。

Question 1

(a)

令上式为$0$可得

注意$f_1=f_2$,所以

(b)

令上式为$0$可得

取$j=1,2$可得

构造集合

那么上式等价于

由于上式对$j=1,2$成立,这说明$v_1,v_2$同号。

Question 2

假设

那么

令上式为$0$可得

Question 3

(a)

定义

所以需要两个参数$v_1, v_2$。

(b)
(c)

那么

注意到

所以条件为

Question 4

由于一些函数要共用,所以编写了helper.py文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import tagger_config
from subprocess import PIPE
import sys, subprocess

# Re-export the tag inventory defined in tagger_config so that modules doing
# `from helper import *` can see it.
# NOTE(review): presumably the list of part-of-speech tags used by the
# course-provided tagger scripts -- confirm against tagger_config.
tags = tagger_config.tags

def sentence_reader(filename):
    """Read a blank-line-separated corpus file into a list of sentences.

    Every non-blank line is split on whitespace into a list of fields (the
    first field is the word). A blank line closes the current sentence.

    Args:
        filename: Path of the corpus file to read.

    Returns:
        A list of sentences; each sentence is a list of per-line field lists.
    """
    sentences = []
    sentence = []
    with open(filename) as f:
        for line in f:
            fields = line.strip().split()
            if fields:
                sentence.append(fields)
            else:
                # Blank line: the current sentence is complete.
                sentences.append(sentence)
                sentence = []
    # Keep the final sentence when the file does not end with a blank line;
    # without this it would be silently dropped.
    if sentence:
        sentences.append(sentence)
    return sentences

def transform(sentence):
    """Serialise a sentence into the text block sent to the tagger scripts.

    Args:
        sentence: A sequence of tokens, each a sequence of string fields.

    Returns:
        One line per token with its fields joined by tabs. Lines are joined
        by newlines and there is no trailing newline. An empty sentence
        yields the empty string.
    """
    # str.join replaces the repeated `+=` concatenation and also copes with
    # a token that has no fields instead of failing on word[0].
    return "\n".join("\t".join(word) for word in sentence)

def process(args):
    """Start `args` as a child process that can be fed commands.

    Args:
        args: Argument list handed to subprocess.Popen.

    Returns:
        The Popen handle, with both stdin and stdout connected to pipes.
    """
    server = subprocess.Popen(args, stdout=PIPE, stdin=PIPE)
    return server

def call(process, stdin):
    """Send one request to a helper process and collect its reply lines.

    Args:
        process: Object exposing `stdin` and `stdout` streams, e.g. a
            subprocess.Popen handle opened with pipes.
        stdin: Request text. A blank line is appended to terminate it.

    Returns:
        The reply lines as stripped text, read up to (and not including)
        the first blank line or the end of the stream.
    """
    request = stdin + "\n\n"
    try:
        process.stdin.write(request)
    except TypeError:
        # Binary pipe on Python 3: only bytes are accepted.
        process.stdin.write(request.encode("utf-8"))
    # Push the request out of any buffer; otherwise the child never sees it
    # and the readline() below can block forever.
    process.stdin.flush()

    res = []
    while True:
        line = process.stdout.readline()
        if not isinstance(line, str):
            # Callers concatenate these lines with str, so hand back text.
            line = line.decode("utf-8")
        line = line.strip()
        if not line:
            break
        res.append(line)
    return res

def get_feature(sentence, his):
    """Build the two baseline features for a single tagging history.

    Args:
        sentence: Tokens of the sentence; field 0 of each token is the word.
        his: History `[i, tag[i-1], tag[i]]` given as strings, where `i` is
            the 1-based position of the word being tagged.

    Returns:
        A `(BIGRAM, TAG)` tuple of feature strings.
    """
    position, prev_tag, tag = his[0], his[1], his[2]
    # Histories count positions from 1; the sentence list is 0-based.
    word = sentence[int(position) - 1][0]
    bigram_feature = ":".join(("BIGRAM", prev_tag, tag))
    tag_feature = ":".join(("TAG", word, tag))
    return bigram_feature, tag_feature

def get_feature_v1(sentence, his):
    """Build bigram, word/tag and suffix features for a single history.

    Args:
        sentence: Tokens of the sentence; field 0 of each token is the word.
        his: History `[i, tag[i-1], tag[i]]` given as strings, where `i` is
            the 1-based position of the word being tagged.

    Returns:
        A list holding, in order, the BIGRAM feature, the TAG feature and
        one SUFF feature per suffix length 1..3 of the word at position
        `i` (lengths longer than the word are skipped).
    """
    prev_tag, tag = his[1], his[2]
    # Histories count positions from 1; the sentence list is 0-based.
    word = sentence[int(his[0]) - 1][0]

    res = [
        "BIGRAM:" + prev_tag + ":" + tag,
        "TAG:" + word + ":" + tag,
    ]
    # Suffix features must describe only the word being tagged. Looping over
    # every word of the sentence would attach the same sentence-wide bag of
    # suffixes to every position, which carries no information about the
    # current word.
    for k in range(1, min(3, len(word)) + 1):
        res.append("SUFF:" + word[-k:] + ":" + str(k) + ":" + tag)
    return res

Q4

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from helper import *

#读取
def get_value(filename):
    """Load feature weights from a model file.

    Each non-blank line holds a feature string and its weight separated by
    whitespace. Blank lines are ignored.

    Args:
        filename: Path of the model file.

    Returns:
        A dict mapping each feature string to its weight as a float.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            fields, or the weight is not a number.
    """
    value = {}
    with open(filename) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank or trailing empty lines instead of failing
                # on the two-name unpack below.
                continue
            fea, v = fields
            value[fea] = float(v)
    return value

def generate(output, sentences, F):
    """Tag every sentence with the current weights and write the result.

    Relies on the module-level names `enum_server`, `decoder_server` and
    `value`, which must be bound before this function is called.

    Args:
        output: Path of the file to write. Each line is `word<TAB>tag`,
            with a blank line after every sentence.
        sentences: Sentences as returned by sentence_reader.
        F: Feature function called as `F(sentence, history_fields)` that
            returns an iterable of feature strings.
    """
    # Text mode: everything written below is str, which a "wb" handle
    # rejects on Python 3.
    with open(output, "w") as f:
        for sentence in sentences:
            # Ask the history server for the histories of this sentence.
            history = call(enum_server, transform(sentence))
            scored = []
            for his in history:
                # A history's score is the sum of the weights of its
                # features; unknown features contribute nothing.
                score = sum(value.get(fea, 0) for fea in F(sentence, his.split()))
                scored.append(his + "\t" + str(score))
            # The decoder replies with one line per position; the last
            # field of each line is used as the tag.
            res = call(decoder_server, "\n".join(scored))
            for i, token in enumerate(sentence):
                # Index rather than zip so a short decoder reply still
                # fails loudly instead of silently dropping words.
                f.write(token[0] + "\t" + res[i].split()[-1] + "\n")
            f.write("\n")

# Start the two helper scripts as long-running child processes. generate()
# looks these names up as module-level globals, so they must keep these names.
enum_server = process(["python", "tagger_history_generator.py", "ENUM"])
decoder_server = process(["python", "tagger_decoder.py", "HISTORY"])
# Sentences to be tagged.
filename = "tag_dev.dat"
sentences = sentence_reader(filename)

# Q4: score histories with the weights stored in tag.model and write the
# resulting tagging to Q4.out. `value` has to be bound before generate()
# runs because generate() reads it as a global.
f1 = "tag.model"
o1 = "Q4.out"
value = get_value(f1)
generate(o1, sentences, get_feature)
1
2226 2459 0.905246034974

Question 5

Q5

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#参考https://github.com/huxiuhan/nlp-hw

from helper import *

# Feature weights learned by the perceptron; features that are not in the
# dict are treated as having weight 0 when histories are scored.
# (The former `filename = "tag.model"` assignment was a dead store: it was
# overwritten below before anything read it, and training starts from an
# empty model.)
value = dict()

# Helper scripts kept alive as child processes for the whole run.
enum_server = process(["python", "tagger_history_generator.py", "ENUM"])
gold_server = process(["python", "tagger_history_generator.py", "GOLD"])
decoder_server = process(["python", "tagger_decoder.py", "HISTORY"])

filename = "tag_train.dat"
sentences = sentence_reader(filename)

# Number of passes over the training data.
K = 5
# The replies of the ENUM and GOLD servers do not depend on the weights, so
# query them once per sentence and reuse the results in every pass.
History = []
History_label = []
for sentence in sentences:
    sent = transform(sentence)
    history = call(enum_server, sent)
    history_label = call(gold_server, sent)
    History.append(history)
    History_label.append(history_label)
N = len(sentences)

# Training: K passes of the perceptron over the training sentences.
for k in range(K):
    for i, sentence in enumerate(sentences):
        history = History[i]
        # Histories of the reference (gold) tagging for this sentence.
        history_label = History_label[i]

        # Score every history under the current weights.
        score_ = []
        for his in history:
            score = 0
            for fea in get_feature_v1(sentence, his.split()):
                if fea in value:
                    score += value[fea]
            score_.append(his + "\t" + str(score))
        score_ = '\n'.join(score_)

        # Highest-scoring tagging under the current weights.
        res = call(decoder_server, score_)

        n = len(history_label)
        for j in range(n):
            predicted = res[j].split()
            gold = history_label[j].split()
            # Compare the parsed histories. Indexing the raw line with [-1]
            # yields its last *character*, which never equals a
            # multi-character tag. Comparing whole histories (not just the
            # tag) also keeps the BIGRAM update when only the previous tag
            # differs; identical histories produce identical features whose
            # -1/+1 updates would cancel, so skipping them changes nothing.
            if predicted != gold:
                # Penalise the features of the wrongly predicted history ...
                for f in get_feature_v1(sentence, predicted):
                    value[f] = value.get(f, 0) - 1
                # ... and reward the features of the gold history.
                for f in get_feature_v1(sentence, gold):
                    value[f] = value.get(f, 0) + 1

# Save the learned weights, one "feature weight" pair per line.
outputname = "Q5.model"
# Text mode: the lines are str, which a "wb" handle rejects on Python 3.
with open(outputname, "w") as f:
    for fea in value:
        f.write(fea + " " + str(value[fea]) + "\n")
1
2184 2459 0.888165921106

本文标题:Michael Collins NLP Homework 4

文章作者:Doraemonzzz

发布时间:2020年05月18日 - 11:58:00

最后更新:2020年05月20日 - 16:30:15

原始链接:http://doraemonzzz.com/2020/05/18/Michael Collins NLP Homework 4/

许可协议: 署名-非商业性使用-禁止演绎 4.0 国际 转载请保留原文链接及作者。