Course homepage: http://www.cs.columbia.edu/~cs4705/

Course materials (Baidu netdisk):

Link: https://pan.baidu.com/s/1KijgO7yjL_MVCC9zKZ7Jdg
Extraction code: t1i3

This post reviews Assignment 4 of Michael Collins's NLP course.

Question 1

(a)

Setting the above expression to $0$ gives

Note that $f_1 = f_2$, so
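
For reference, assuming part (a) concerns the log-likelihood $L(v)$ of a log-linear model, the partial derivative with respect to a parameter $v_k$ has the standard form

$$\frac{\partial L(v)}{\partial v_k} = \sum_{i=1}^n f_k(x^{(i)}, y^{(i)}) - \sum_{i=1}^n \sum_{y} p(y \mid x^{(i)}; v)\, f_k(x^{(i)}, y),$$

so setting it to $0$ equates the empirical count of $f_k$ with its expected count under the model, and with $f_1 = f_2$ the two partial derivatives coincide.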

(b)

Setting the above expression to $0$ gives

Taking $j = 1, 2$ gives

Construct the set

Then the above expression is equivalent to

Since the above holds for $j = 1, 2$, it follows that $v_1$ and $v_2$ have the same sign.

Question 2

Assume

Then

Setting the above expression to $0$ gives
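
As a reference point, assuming Question 2 adds an L2 regularizer to the log-likelihood, i.e. $L'(v) = L(v) - \frac{\lambda}{2}\sum_k v_k^2$, the derivative becomes

$$\frac{\partial L'(v)}{\partial v_k} = \sum_{i=1}^n f_k(x^{(i)}, y^{(i)}) - \sum_{i=1}^n \sum_{y} p(y \mid x^{(i)}; v)\, f_k(x^{(i)}, y) - \lambda v_k,$$

and setting it to $0$ gives $v_k = \frac{1}{\lambda}\left(\sum_{i=1}^n f_k(x^{(i)}, y^{(i)}) - \sum_{i=1}^n \sum_{y} p(y \mid x^{(i)}; v)\, f_k(x^{(i)}, y)\right)$.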

Question 3

(a)

Define

So two parameters $v_1, v_2$ are needed.

(b)

(c)

Then

Note that

So the condition is

Question 4

Since several functions are shared across the programming questions, they are collected in helper.py:

# Shared utilities; written for Python 2, matching the course's tagger
# scripts (strings are written directly to pipes and output files).
import subprocess
from subprocess import PIPE

import tagger_config

tags = tagger_config.tags

def sentence_reader(filename):
    """Read a tagged file: one token per line, blank lines separate sentences."""
    sentences = []
    with open(filename) as f:
        sentence = []
        for line in f.readlines():
            w = line.strip().split()
            #a blank line marks the end of a sentence
            if not w:
                if sentence:
                    sentences.append(sentence)
                sentence = []
            else:
                sentence.append(w)
        #keep the last sentence if the file does not end with a blank line
        if sentence:
            sentences.append(sentence)
    return sentences

def transform(sentence):
    """Format a sentence as the tab/newline-separated string expected by the tagger scripts."""
    res = ""
    n = len(sentence)
    for i in range(n):
        word = sentence[i]
        #fields of a token are separated by tabs
        tmp = "\t".join(word)
        res += tmp
        #newline after every token except the last
        if i < n - 1:
            res += "\n"
    return res

def process(args):
    "Create a 'server' to send commands to."
    return subprocess.Popen(args, stdin=PIPE, stdout=PIPE)

def call(process, stdin):
    "Send a command to a server and collect its stdout up to a blank line."
    res = []
    process.stdin.write(stdin + "\n\n")
    #flush so the child process actually receives the input
    process.stdin.flush()
    line = process.stdout.readline().strip()
    while line:
        res.append(line)
        line = process.stdout.readline().strip()
    return res
                            
def get_feature(sentence, his):
    #his = [i, tag[i-1], tag[i]], with a 1-based position i
    #BIGRAM feature: previous tag and current tag
    BIGRAM = "BIGRAM:" + his[1] + ":" + his[2]
    #TAG feature: current word and current tag
    i = int(his[0]) - 1
    TAG = "TAG:" + sentence[i][0] + ":" + his[2]

    return BIGRAM, TAG

def get_feature_v1(sentence, his):
    #his = [i, tag[i-1], tag[i]], with a 1-based position i
    res = []
    #BIGRAM feature: previous tag and current tag
    BIGRAM = "BIGRAM:" + his[1] + ":" + his[2]
    res.append(BIGRAM)
    #TAG feature: current word and current tag
    i = int(his[0]) - 1
    TAG = "TAG:" + sentence[i][0] + ":" + his[2]
    res.append(TAG)
    #SUFF features: suffixes of length 1-3 of the current word, with the current tag
    word = sentence[i][0]
    n = len(word)
    for k in range(1, 4):
        if k <= n:
            tmp = "SUFF:" + word[-k:] + ":" + str(k) + ":" + his[2]
            res.append(tmp)

    return res
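
A minimal usage sketch of these helpers (an illustration, assuming the assignment's data file and tagger scripts sit in the working directory; the formats noted in the comments follow the conventions used in the code above):

from helper import *

sentences = sentence_reader("tag_dev.dat")         #each token is a list of fields, e.g. [word, tag]
first = sentences[0]
sent = transform(first)                            #fields joined by tabs, tokens joined by newlines
enum_server = process(["python", "tagger_history_generator.py", "ENUM"])
histories = call(enum_server, sent)                #one history "i prev_tag tag" per line
print(get_feature(first, histories[0].split()))    #-> the BIGRAM and TAG feature strings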

Q4

from helper import *

#read the model weights from file
def get_value(filename):
    value = dict()
    
    with open(filename) as f:
        for string in f.readlines():
            fea, v = string.strip().split()
            value[fea] = float(v)
    
    return value

def generate(output, sentences, F):
    #tag each sentence with feature function F and the global weight vector `value`,
    #writing one word/tag pair per line to `output`
    with open(output, "wb") as f:
        for sentence in sentences:
            sent = transform(sentence)
            history = call(enum_server, sent)
            score_ = []
            for his in history:
                #score a history as the sum of the weights of its features
                score = 0
                feature = F(sentence, his.split())
                for fea in feature:
                    if fea in value:
                        score += value[fea]
                score_.append(his + "\t" + str(score))
            score_ = '\n'.join(score_)
            #decode the highest-scoring tag sequence
            res = call(decoder_server, score_)
            #save word/tag pairs, with a blank line between sentences
            n = len(sentence)
            for i in range(n):
                tmp = sentence[i][0] + "\t" + res[i].split()[-1]
                f.write(tmp)
                f.write("\n")
            f.write("\n")

enum_server = process(["python", "tagger_history_generator.py", "ENUM"])
decoder_server = process(["python", "tagger_decoder.py", "HISTORY"])
filename = "tag_dev.dat"
sentences = sentence_reader(filename)

#Q4
f1 = "tag.model"
o1 = "Q4.out"
value = get_value(f1)
generate(o1, sentences, get_feature)
Result on the dev set (correct tags, total tags, accuracy):

2226 2459 0.905246034974
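
The accuracy above was presumably computed by comparing Q4.out against the gold dev tags; a minimal sketch of such a check (the gold file name tag_dev.key is a hypothetical placeholder, not taken from the assignment):

def accuracy(gold_file, pred_file):
    #both files: one "word<TAB>tag" line per token, blank lines between sentences
    correct, total = 0, 0
    with open(gold_file) as g, open(pred_file) as p:
        for gold_line, pred_line in zip(g.readlines(), p.readlines()):
            gold, pred = gold_line.split(), pred_line.split()
            if not gold:
                continue
            total += 1
            if gold[-1] == pred[-1]:
                correct += 1
    print("%d %d %s" % (correct, total, float(correct) / total))

accuracy("tag_dev.key", "Q4.out")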

Question 5

Q5

#Reference: https://github.com/huxiuhan/nlp-hw

from helper import *

value = dict()  #perceptron weight vector, initially empty

enum_server = process(["python", "tagger_history_generator.py", "ENUM"])
gold_server = process(["python", "tagger_history_generator.py", "GOLD"])
decoder_server = process(["python", "tagger_decoder.py", "HISTORY"])

filename = "tag_train.dat"
sentences = sentence_reader(filename)

K = 5  #number of perceptron passes over the training data
#pre-compute all candidate histories (ENUM) and the gold histories (GOLD) per sentence
History = []
History_label = []
for sentence in sentences:
    sent = transform(sentence)
    history = call(enum_server, sent)
    history_label = call(gold_server, sent)
    History.append(history)
    History_label.append(history_label)
N = len(sentences)

#perceptron training
for k in range(K):
    for i, sentence in enumerate(sentences):
        history = History[i]
        #gold histories for this sentence
        history_label = History_label[i]
        score_ = []
        for his in history:
            score = 0
            feature = get_feature_v1(sentence, his.split())
            for fea in feature:
                if fea in value:
                    score += value[fea]
            score_.append(his + "\t" + str(score))
        score_ = '\n'.join(score_)
        #decode the best tag sequence under the current weights
        res = call(decoder_server, score_)
        #compare the decoded histories with the gold histories
        n = len(history_label)

        for j in range(n):
            a1 = res[j].split()[-1]
            a2 = history_label[j].split()[-1]
            if a1 != a2:
                #mismatch: penalize the features of the predicted history
                for f in get_feature_v1(sentence, res[j].split()):
                    if f in value:
                        value[f] -= 1
                    else:
                        value[f] = -1
                #and reward the features of the gold history
                for f in get_feature_v1(sentence, history_label[j].split()):
                    if f in value:
                        value[f] += 1
                    else:
                        value[f] = 1

#write the learned weights to Q5.model
outputname = "Q5.model"
with open(outputname, "wb") as f:
    for fea in value:
        f.write(fea + " " + str(value[fea]))
        f.write("\n")
Result on the dev set with the perceptron model (correct tags, total tags, accuracy):

2184 2459 0.888165921106
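
The dev-set score above implies the dev set was re-tagged with Q5.model and the suffix features; a minimal sketch of that step, assuming it runs alongside the Q4 code so that get_value, generate, enum_server and decoder_server are in scope (the output name Q5.out is a hypothetical placeholder):

sentences = sentence_reader("tag_dev.dat")
value = get_value("Q5.model")                  #weights learned by the perceptron above
generate("Q5.out", sentences, get_feature_v1)  #features must match those used in training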