Michael Collins NLP Homework 2

课程主页：http://www.cs.columbia.edu/~cs4705/

课程网盘地址：

链接：https://pan.baidu.com/s/1KijgO7yjL_MVCC9zKZ7Jdg
提取码：t1i3

这一次回顾Michael Collins NLP作业2。

Question 1

(a)只需要指出变化的部分即可，其余部分保持不变：

$\begin{aligned} f(\text{S} \to \text{NP NP VP}) &=\text{S} \to \text{NP NP}_1 ,p=0.3\\ f(\text{VP}\to\text{Vt NP PP})&=\text{VP}\to\text{Vt NP}_2,p=0.2\\ f(\text{NP}\to \text{DT NN NN}) & = \text{NP}\to \text{DT NN}_1 ,p=0.3 \end{aligned}$

增加如下规则：

$\begin{aligned} \text{NP}_1&\to\text{NP VP},p=1\\ \text{NP}_2&\to\text{NP PP}, p=1\\ \text{NN}_1&\to \text{NN NN},p=0.3 \end{aligned}$

(b)根据上述规则即可，略过。

Question 2

(a)第一部分：

$\begin{aligned} q(\text{S}\to \text{NP VP})&=\frac{6}{6}=1\\ q(\text{VP}\to \text{V1 SBAR})&=\frac{3}{9} =\frac 1 3\\ q(\text{VP}\to \text{VP ADVP})&=\frac{3}{9} =\frac 1 3\\ q(\text{VP}\to \text{V2})&=\frac{3}{9} =\frac 1 3\\ q(\text{SBAR}\to \text{COMP S})&=\frac{3}{3}=1 \end{aligned}$

第二部分：

$\begin{aligned} q(\text{NP}\to \text{John}) &=q(\text{NP}\to \text{Bill})=q(\text{NP}\to \text{Fred}) =q(\text{NP}\to \text{Jeff})=\frac{1}{6}\\ q(\text{NP}\to \text{Sally}) &=\frac{2}{6} =\frac 13 \\ q(\text{V1}\to \text{said})&=q(\text{V1}\to \text{declared}) =q(\text{V1}\to \text{pronounced}) =\frac 1 3 \\ q(\text{V2}\to \text{snored})&=q(\text{V2}\to \text{ran}) =q(\text{V2}\to \text{swam}) =\frac 1 3 \\ q(\text{COMP}\to \text{that})& =1\\ q(\text{ADVP}\to \text{loudly})&=q(\text{ADVP}\to \text{quickly}) =q(\text{elegantly}\to \text{swam}) =\frac 1 3 \\ \end{aligned}$

(b)(c)作图较麻烦，这里略过。

Question 3

(a)利用平衡二叉树的结构：左子树的叶节点数量和右子树的叶节点数量相同

$\pi(i, j, X)=\max _{X \rightarrow Y Z \in R } \left(q(X \rightarrow Y Z) \times \pi\left(i,\left[\frac {i+j} 2\right], Y\right) \times \pi\left(\left[\frac {i+j} 2\right]+1, j, Z\right)\right)$

Question 4

import json

#### Part 1 计数
def Count(File):
    Count_x = {}
    with open(File) as f:
        for i in f.readlines():
            #分隔数据
            data = i.split()
            #判断
            if data[1] == "UNARYRULE":
                #计算Count(x)
                if data[3] in Count_x:
                    Count_x[data[3]] += float(data[0])
                else:
                    Count_x[data[3]] = float(data[0])
    
    return Count_x

#### Part 2 处理低频词
def Replace_word(tree, Count_x):
    #基本情形
    if len(tree) == 2:
        word = tree[1]
        if word not in Count_x:
            tree[1] = "_RARE_"
        elif Count_x[word] < 5:
            tree[1] = "_RARE_"
    else:
        Replace_word(tree[1], Count_x)
        Replace_word(tree[2], Count_x)

#### Part 3 换并输出
def Replace(File, output, Count_x):
    Res = []
    with open(File) as f:
        for i in f.readlines():
            tree = json.loads(i, encoding="utf-8")
            Replace_word(tree, Count_x)
            res = json.dumps(tree)
            Res.append(res)
            
    with open(output, "w+") as f:
        for i in Res:
            f.write(i)
            f.write("\n")
            
def Replace_dev(File, output, Count_x):
    Res = []
    with open(File) as f:
        for i in f.readlines():
            data = i.strip().split(" ")
            for j in range(len(data)):
                if data[j] not in Count_x or Count_x[data[j]] < 5:
                    data[j] = "_RARE_"
            Res.append(" ".join(data) + "\n")
    
    with open(output, "w+") as f:
        for i in Res:
            f.writelines(i)

#### 运行
File = "cfg.counts"
Count_x = Count(File)

#训练数据
f1 = "parse_train.dat"
o1 = "parse_train_proc.dat"
Replace(f1, o1, Count_x)

#测试数据
f2 = "parse_dev.dat"
o2 = "parse_dev_proc.dat"
Replace_dev(f2, o2, Count_x)

#python count_cfg_freq.py parse_train_proc.dat > cfg_proc.counts

Question 5

import json

#### Part 1 计算MLE
#计数
def Generate_count(File):
    Count_xyy = {}
    Count_xy = {}
    Count_x = {}
    Count_y = {}
    with open(File) as f:
        for i in f.readlines():
            #分隔数据
            data = i.strip().split()
            #判断
            if data[1] == "NONTERMINAL":
                Count_x[data[2]] = float(data[0])
            elif data[1] == "BINARYRULE":
                if data[2] not in Count_xyy:
                    Count_xyy[data[2]] = {}
                Count_xyy[data[2]][(data[3], data[4])] = float(data[0])
            else:
                if data[2] not in Count_xy:
                    Count_xy[data[2]] = {}
                Count_xy[data[2]][data[3]] = float(data[0])
                #统计单词数量
                if data[3] in Count_y:
                    Count_y[data[3]] += float(data[0])
                else:
                    Count_y[data[3]] = float(data[0])
                    
    return Count_xyy, Count_xy, Count_x, Count_y

def Generate_mle(Count_xyy, Count_xy, Count_x, Count_y):
    #计算MLE
    q_xyy = {}
    q_xy = {}
    for i in Count_x:
        q_xy[i] = {}
        q_xyy[i] = {}
        if i in Count_xy:
            for j in Count_xy[i]:
                q_xy[i][j] = Count_xy[i][j] / Count_x[i]
        if i in Count_xyy:
            for k in Count_xyy[i]:
                q_xyy[i][k] = Count_xyy[i][k] / Count_x[i]
                
    return q_xyy, q_xy

#### Part2: CKY
def CKY(word, Count_xyy, Count_xy, Count_x, Count_y, q_xyy, q_xy):
    n = len(word)
    Pi = {}
    Bp = {}
    for i in range(n):
        for j in range(i, n):
            Pi[(i, j)] = {}
            Bp[(i, j)] = {}
    
    for i in range(n):
        for x in Count_x:
            if x in Count_xy and word[i] in Count_xy[x]:
                Pi[(i, i)][x] = q_xy[x][word[i]]
                
    for l in range(1, n):
        for i in range(n - l):
            j = i + l
            for X in Count_x:
                #概率
                p = 0
                #最佳规则以及位置
                y1 = ""
                z1 = ""
                s1 = ""
                if X in q_xyy:
                    for (Y, Z) in q_xyy[X]:
                        for s in range(i, j):
                            if Y in Pi[(i, s)] and Z in Pi[(s + 1, j)]:
                                p1 = q_xyy[X][(Y, Z)] * Pi[(i, s)][Y] * Pi[(s + 1, j)][Z]
                                if p1 > p:
                                    p = p1
                                    y1 = Y
                                    z1 = Z
                                    s1 = s
                #更新最大值
                if p > 0:
                    Pi[(i, j)][X] = p
                    Bp[(i, j)][X] = (y1, z1, s1)
    
    pmax = 0
    node = ""
    if "S" in Pi[(0, n - 1)]:
        pmax = Pi[(0, n - 1)]["S"]
        node = "S"
    else:
        for i in Pi[(0, n - 1)]:
            if Pi[(0, n - 1)][i] > pmax:
                pmax = Pi[(0, n - 1)][i]
                node = i
    
    return Pi, Bp, node

#生成树
def tree(Bp, word, i, j, node):
    if j == i:
        return [node, word[i]]
    else:
        lnode, rnode, k = Bp[(i, j)][node]
        left = tree(Bp, word, i, k, lnode)
        right = tree(Bp, word, k + 1, j, rnode)
        return [node, left, right]

#生成结果
def Generate(File, output, Count_xyy, Count_xy, Count_x, Count_y, q_xyy, q_xy):
    Res = []
    with open(File) as f:
        for i in f.readlines():
            data = i.strip().split(" ")
            word = i.strip().split(" ")
            for j in range(len(data)):
                if data[j] not in Count_y or Count_y[data[j]] < 5:
                    data[j] = "_RARE_"
            
            #CKY算法
            Pi, Bp, node = CKY(data, Count_xyy, Count_xy, Count_x, Count_y, q_xyy, q_xy)
            n = len(word)
            #生成树
            res = tree(Bp, word, 0, n - 1, node)
            Res.append(res)
            
    with open(output, "w+") as f:
        for res in Res:
            r = json.dumps(res)
            f.write(r)
            f.write("\n")
            
#### 运行
File = "cfg_proc.counts"
f1 = "parse_dev.dat"
o1 = "parse_dev_result.dat"
Count_xyy, Count_xy, Count_x, Count_y = Generate_count(File)
q_xyy, q_xy = Generate_mle(Count_xyy, Count_xy, Count_x, Count_y)
Generate(f1, o1, Count_xyy, Count_xy, Count_x, Count_y, q_xyy, q_xy)

#检测结果
#python eval_parser.py parse_dev.key parse_dev_result.dat

Question 6

from Q4 import *
from Q5 import *

#### Q6
File1 = "cfg_vert.counts"
Count_x = Count(File1)
f1 = "parse_train_vert.dat"
o1 = "parse_train_vert_proc.dat"
Replace(f1, o1, Count_x)

#python count_cfg_freq.py parse_train_vert_proc.dat > cfg_vert_proc.counts

File2 = "cfg_vert_proc.counts"
f2 = "parse_dev.dat"
o2 = "parse_dev_vert_result.dat"
Count_xyy, Count_xy, Count_x, Count_y = Generate_count(File2)
q_xyy, q_xy = Generate_mle(Count_xyy, Count_xy, Count_x, Count_y)
Generate(f2, o2, Count_xyy, Count_xy, Count_x, Count_y, q_xyy, q_xy)