距离上次更新已经 1776 天了,文章内容可能已经过时。

课程主页:http://www.cs.columbia.edu/~cs4705/

课程网盘地址:

链接:https://pan.baidu.com/s/1KijgO7yjL_MVCC9zKZ7Jdg
提取码:t1i3

这一次回顾Michael Collins NLP作业4。

Question 1

(a)
$$\frac{\partial L(v)}{\partial v_j}=\sum_i\Big(f_j(x_i,y_i)-\sum_{y\in V}f_j(x_i,y)P(y|x_i,v)\Big)-2Cv_j$$

令上式为0可得

$$v_j=\frac{1}{2C}\sum_i\Big(f_j(x_i,y_i)-\sum_{y\in V}f_j(x_i,y)P(y|x_i,v)\Big)$$

注意 $f_1=f_2$,上式右端对 $j=1,2$ 完全相同,所以

$$v_1=v_2$$
(b)
$$\frac{\partial L(v)}{\partial v_j}=\sum_i\Big(f_j(x_i,y_i)-\sum_{y\in V}f_j(x_i,y)P(y|x_i,v)\Big)-C\,1\{v_j\ge 0\}+C\,1\{v_j<0\}$$

令上式为0可得

$$\sum_i\Big(f_j(x_i,y_i)-\sum_{y\in V}f_j(x_i,y)P(y|x_i,v)\Big)-C\,1\{v_j\ge 0\}+C\,1\{v_j<0\}=0$$

对 $j=1,2$ 可得

$$\sum_i\Big(f_j(x_i,y_i)-f_j(x_i,\text{model})P(\text{model}|x_i,v)\Big)-C\,1\{v_j\ge 0\}+C\,1\{v_j<0\}=0$$

构造集合

$$S_1=\{i\mid x_i=\text{the}\},\quad \bar S_1=\{1,\ldots,n\}\setminus S_1,\quad S_2=\{i\mid y_i=\text{model}\}$$

那么上式等价于

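$$|S_1\cap S_2|-\sum_{i\in S_1}\frac{e^{v_1+v_2+\sum_{k=3}^{d}v_kf_k(x_i,\text{model})}}{\sum_{y\in V}e^{v\cdot f(x_i,y)}}-C\,1\{v_j\ge 0\}+C\,1\{v_j<0\}=0$$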

由于上式对 $j=1,2$ 都成立,且前两项与 $j$ 无关,所以 $-C\,1\{v_1\ge 0\}+C\,1\{v_1<0\}=-C\,1\{v_2\ge 0\}+C\,1\{v_2<0\}$,这说明 $v_1,v_2$ 同号。

Question 2

假设

$$f_j(x,y)=\begin{cases}1 & \text{if } y=w_2^j \text{ and } x=w_1^j\\ 0 & \text{otherwise}\end{cases}$$

那么

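$$\begin{aligned}
\frac{\partial L(v)}{\partial v_j}
&=\sum_i\Big(f_j(x_i,y_i)-\sum_{y\in V}f_j(x_i,y)P(y|x_i,v)\Big)\\
&=\text{Count}(w_1^j,w_2^j)-\sum_i\sum_{y\in V}f_j(x_i,y)P(y|x_i,v)\\
&=\text{Count}(w_1^j,w_2^j)-\sum_i f_j(x_i,w_2^j)P(w_2^j|x_i,v)\\
&=\text{Count}(w_1^j,w_2^j)-\sum_{i:\,x_i=w_1^j}P(w_2^j|w_1^j,v)\\
&=\text{Count}(w_1^j,w_2^j)-\text{Count}(w_1^j)\,P(w_2^j|w_1^j,v)
\end{aligned}$$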

令上式为0可得

$$P(y=w_2^j|x=w_1^j,v)=\frac{\text{Count}(w_1^j,w_2^j)}{\text{Count}(w_1^j)}$$

Question 3

(a)

定义

$$f_1(x,y)=\begin{cases}1 & x=y\\ 0 & \text{otherwise}\end{cases}\qquad f_2(x,y)=\begin{cases}1 & x=y_{\text{reverse}}\\ 0 & \text{otherwise}\end{cases}$$

所以需要两个参数 $v_1,v_2$

(b)
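$$P(\text{the}|\text{the})=\frac{e^{v_1}}{\sum_y e^{v\cdot f(\text{the},y)}},\qquad P(\text{eht}|\text{the})=\frac{e^{v_2}}{\sum_y e^{v\cdot f(\text{the},y)}},\qquad P(\text{dog}|\text{the})=\frac{1}{\sum_y e^{v\cdot f(\text{the},y)}}$$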
(c)

设 $|V|=n$

那么

$|V'|=2n$

注意到

$$\sum_y e^{v\cdot f(\text{the},y)}=e^{v_1}+e^{v_2}+(2n-2)$$

所以条件为

$$\frac{e^{v_1}}{e^{v_1}+e^{v_2}+(2n-2)}=0.4,\qquad \frac{e^{v_2}}{e^{v_1}+e^{v_2}+(2n-2)}=0.3,\qquad \frac{1}{e^{v_1}+e^{v_2}+(2n-2)}=0.3\times\frac{1}{2n-2}$$

Question 4

由于一些函数要共用,所以编写了helper.py文件

python
import tagger_config
from subprocess import PIPE
import sys, subprocess

# Tag inventory taken from the project-local tagger_config module; none of
# the helpers visible in this listing reference it.
tags = tagger_config.tags

def sentence_reader(filename):
    """Read a blank-line-separated token file into a list of sentences.

    Every non-blank line is split on whitespace into a list of fields
    (callers in this file read field 0 as the word).  A blank line closes
    the sentence collected so far.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of sentences, each a list of per-line field lists.
    """
    sentences = []
    sentence = []
    with open(filename) as f:
        for line in f:
            fields = line.split()
            if fields:
                sentence.append(fields)
            else:
                # Blank line: the current sentence is complete.
                sentences.append(sentence)
                sentence = []
    # A file that does not end with a blank line would otherwise silently
    # lose its final sentence.
    if sentence:
        sentences.append(sentence)
    return sentences

def transform(sentence):
    """Serialize a sentence into tab/newline-separated text.

    The fields of each token are joined with a tab and the tokens are joined
    with a newline; no newline follows the last token.

    Args:
        sentence: A list of tokens, each a sequence of string fields.

    Returns:
        The serialized sentence, or "" for an empty sentence.  A token with
        no fields contributes an empty line instead of raising IndexError.
    """
    return "\n".join("\t".join(word) for word in sentence)

def process(args):
    """Start a long-running helper process (a 'server') to send commands to.

    The pipes are opened in text mode so that ``call`` can exchange ``str``
    data with the child on Python 3 as well as on Python 2.

    Args:
        args: The command line as a list of strings.

    Returns:
        The ``subprocess.Popen`` object, with ``stdin`` and ``stdout`` piped.
    """
    return subprocess.Popen(args, stdin=PIPE, stdout=PIPE,
                            universal_newlines=True)

def call(process, stdin):
    """Send one request to a server and collect its reply lines.

    The request is terminated by an empty line; the reply is read up to the
    first empty line (or end of stream).

    Args:
        process: Object with ``stdin``/``stdout`` streams, as returned by
            ``process()``.
        stdin: The request text, without the terminating blank line.

    Returns:
        The stripped, non-empty reply lines, in order.
    """
    process.stdin.write(stdin + "\n\n")
    # Pipes are block-buffered on Python 3: without an explicit flush the
    # server never receives the request and the readline() below blocks.
    process.stdin.flush()
    res = []
    line = process.stdout.readline().strip()
    while line:
        res.append(line)
        line = process.stdout.readline().strip()
    return res
                            
def get_feature(sentence, his):
    """Build the two baseline features of one tagging history.

    Args:
        sentence: List of tokens; field 0 of each token is the word.
        his: ``[i, tag[i-1], tag[i]]`` where ``i`` is the 1-based position,
            given as a string or an int.

    Returns:
        The tuple ``("BIGRAM:<prev>:<cur>", "TAG:<word>:<cur>")``.
    """
    bigram = ":".join(["BIGRAM", his[1], his[2]])
    # Histories count positions from 1, the sentence list from 0.
    word = sentence[int(his[0]) - 1][0]
    tag = ":".join(["TAG", word, his[2]])
    return bigram, tag

def get_feature_v1(sentence, his):
    """Build the BIGRAM, TAG and SUFF features of one tagging history.

    Args:
        sentence: List of tokens; field 0 of each token is the word.
        his: ``[i, tag[i-1], tag[i]]`` where ``i`` is the 1-based position,
            given as a string or an int.

    Returns:
        A list with ``"BIGRAM:<prev>:<cur>"``, ``"TAG:<word>:<cur>"`` and one
        ``"SUFF:<suffix>:<len>:<cur>"`` entry per suffix length 1..3 that the
        word is long enough for.
    """
    prev_tag, cur_tag = his[1], his[2]
    # Histories count positions from 1, the sentence list from 0.
    word = sentence[int(his[0]) - 1][0]
    res = [
        "BIGRAM:" + prev_tag + ":" + cur_tag,
        "TAG:" + word + ":" + cur_tag,
    ]
    # Suffix features describe only the word being tagged.  Emitting them for
    # every word of the sentence (as before) gave all positions of a sentence
    # the same SUFF features, so they carried no position-specific signal.
    for k in range(1, min(3, len(word)) + 1):
        res.append("SUFF:" + word[-k:] + ":" + str(k) + ":" + cur_tag)
    return res

Q4

python
from helper import *

#读取
def get_value(filename):
    """Load feature weights from a model file.

    Each non-blank line holds a feature string and its weight, separated by
    whitespace.

    Args:
        filename: Path of the model file.

    Returns:
        A dict mapping each feature string to its weight as a float.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            fields or its weight is not a number.
    """
    value = dict()
    with open(filename) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing one) instead of
                # failing on the unpacking below.
                continue
            fea, v = fields
            value[fea] = float(v)
    return value

def generate(output, sentences, F):
    """Tag every sentence and write the result to ``output``.

    For each sentence the candidate histories are requested from the
    module-level ``enum_server``, scored with the module-level ``value``
    weights, and sent to the module-level ``decoder_server``.  The last field
    of the i-th decoder reply line is written as the tag of the i-th word.

    Args:
        output: Path of the file to write; one ``word<TAB>tag`` line per
            token and an empty line after each sentence.
        sentences: List of sentences as produced by ``sentence_reader``.
        F: Feature function ``F(sentence, history_fields)`` returning an
            iterable of feature strings.
    """
    # Text mode: the lines written below are str, which a file opened with
    # "wb" rejects on Python 3.
    with open(output, "w") as f:
        for sentence in sentences:
            history = call(enum_server, transform(sentence))
            scored = []
            for his in history:
                score = 0
                for fea in F(sentence, his.split()):
                    # Features without a learned weight contribute nothing.
                    score += value.get(fea, 0)
                scored.append(his + "\t" + str(score))
            # Decode the best history sequence for the scored candidates.
            res = call(decoder_server, "\n".join(scored))
            for i, token in enumerate(sentence):
                f.write(token[0] + "\t" + res[i].split()[-1] + "\n")
            f.write("\n")

# Helper processes that generate() reads as module globals: enum_server is
# sent each serialized sentence and its reply lines are used as candidate
# histories; decoder_server is sent the scored histories and its reply lines
# carry the chosen tag in their last field.
enum_server = process(["python", "tagger_history_generator.py", "ENUM"])
decoder_server = process(["python", "tagger_decoder.py", "HISTORY"])
# Input file whose sentences are tagged below.
filename = "tag_dev.dat"
sentences = sentence_reader(filename)

#Q4
# f1 is the weight file to load, o1 the file the tagged output is written to.
f1 = "tag.model"
o1 = "Q4.out"
# `value` must be defined before generate() runs: generate() looks it up as a
# module global.
value = get_value(f1)
generate(o1, sentences, get_feature)
Code
2226 2459 0.905246034974

Question 5

Q5

python
#参考https://github.com/huxiuhan/nlp-hw

from helper import *

# Feature weights; every feature starts without an entry (weight 0).
value = dict()

enum_server = process(["python", "tagger_history_generator.py", "ENUM"])
gold_server = process(["python", "tagger_history_generator.py", "GOLD"])
decoder_server = process(["python", "tagger_decoder.py", "HISTORY"])

filename = "tag_train.dat"
sentences = sentence_reader(filename)

# Number of passes over the training data.
K = 5
# Candidate and gold histories do not depend on the weights, so they are
# requested once per sentence and reused in every pass.
History = []
History_label = []
for sentence in sentences:
    sent = transform(sentence)
    History.append(call(enum_server, sent))
    History_label.append(call(gold_server, sent))
N = len(sentences)

# Training
for k in range(K):
    for i, sentence in enumerate(sentences):
        # Gold histories of this sentence.
        history_label = History_label[i]
        score_ = []
        for his in History[i]:
            score = 0
            for fea in get_feature_v1(sentence, his.split()):
                score += value.get(fea, 0)
            score_.append(his + "\t" + str(score))
        # Decode with the current weights.
        res = call(decoder_server, '\n'.join(score_))
        for j in range(len(history_label)):
            predicted = res[j].split()
            gold = history_label[j].split()
            # Compare whole tags (the last field).  Taking the last
            # *character* of the reply line, as before, turned a predicted
            # "I-GENE" into "E", which never equals the gold tag.
            if predicted[-1] != gold[-1]:
                # Penalize the features of the wrong prediction ...
                for fea in get_feature_v1(sentence, predicted):
                    value[fea] = value.get(fea, 0) - 1
                # ... and reward the features of the gold history.
                for fea in get_feature_v1(sentence, gold):
                    value[fea] = value.get(fea, 0) + 1

# Write the learned weights, one "feature weight" pair per line.
outputname = "Q5.model"
# Text mode: the lines written are str, which a "wb" file rejects on Python 3.
with open(outputname, "w") as f:
    for fea in value:
        f.write(fea + " " + str(value[fea]) + "\n")
Code
2184 2459 0.888165921106