

1. Character-based convolutional encoder for NMT











def words2charindices(self, sents):
    """ Convert list of sentences of words into list of list of list of character indices.

    Every word is prepended with the start-of-word index and appended with the
    end-of-word index; the characters in between are looked up in `self.char2id`.

    @param sents (list[list[str] ]): sentence(s) in words
    @return word_ids (list[list[list[int] ]]): sentence(s) in indices

    Raises KeyError when a character is not present in `self.char2id`.
    NOTE(review): if the vocabulary defines an unknown-character index, the
    lookup below should fall back to it instead of raising -- confirm against
    the vocabulary class.
    """
    return [
        [
            [self.start_of_word]
            + [self.char2id[char] for char in word]
            + [self.end_of_word]
            for word in sent
        ]
        for sent in sents
    ]


python sanity_check.py 1e


Running Sanity Check for Question 1e: words2charindices()
Running test on small list of sentences
Running test on large list of sentences
All Sanity Checks Passed for Question 1e: words2charindices()!


def pad_sents_char(sents, char_pad_token):
    """ Pad list of sentences according to the longest sentence in the batch and max_word_length.

    @param sents (list[list[list[int] ]]): list of sentences, result of `words2charindices()`
        from `vocab.py`
    @param char_pad_token (int): index of the character-padding token
    @returns sents_padded (list[list[list[int] ]]): list of sentences where sentences/words shorter
        than the max length sentence/word are padded out with the appropriate pad token, such that
        each sentence in the batch now has same number of words and each word has an equal
        number of characters
        Output shape: (batch_size, max_sentence_length, max_word_length)
        An empty batch yields an empty list.
    """
    # Words longer than 21 characters should be truncated
    max_word_length = 21

    # default=0 keeps an empty batch from raising in max().
    max_sent_length = max((len(sent) for sent in sents), default=0)

    sents_padded = []
    for sent in sents:
        padded_words = []
        for word in sent:
            # Truncate first, then right-pad up to the fixed word width.
            chars = list(word[:max_word_length])
            chars.extend([char_pad_token] * (max_word_length - len(chars)))
            padded_words.append(chars)
        # Padding words are full-width rows of the pad character; each one is a
        # fresh list so later in-place edits cannot alias between rows.
        padded_words.extend(
            [char_pad_token] * max_word_length
            for _ in range(max_sent_length - len(sent))
        )
        sents_padded.append(padded_words)

    return sents_padded


python sanity_check.py 1f


Running Sanity Check for Question 1f: Padding
Running test on a list of sentences
Sanity Check Passed for Question 1f: Padding!


def to_input_tensor_char(self, sents: List[List[str]], device: torch.device) -> torch.Tensor:
    """ Convert list of sentences (words) into tensor with necessary padding for
    shorter sentences.

    Chains `words2charindices()` and `pad_sents_char()`, then moves the batch
    axis behind the sentence axis.

    @param sents (List[List[str] ]): list of sentences (words)
    @param device: device on which to load the tensor, i.e. CPU or GPU

    @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
    """
    char_ids = self.words2charindices(sents)
    char_ids_padded = pad_sents_char(char_ids, self.char2id['<pad>'])
    # Build the tensor directly on the target device instead of creating it on
    # the CPU and copying it afterwards.
    sents_var = torch.tensor(char_ids_padded, dtype=torch.long, device=device)
    # (batch, sentence, word) -> (sentence, batch, word); made contiguous so
    # callers may safely use .view() on the result.
    return sents_var.permute(1, 0, 2).contiguous()




import torch
import torch.nn as nn
import torch.nn.utils
import torch.nn.functional as F


class Highway(nn.Module):
    """Highway layer: a sigmoid gate blends a ReLU projection of the input
    with the untouched input, element by element."""

    def __init__(self, d):
        """Build the projection and gate layers.

        @param d (int): size of the last dimension of the input; both linear
            maps go from d to d so the output keeps the input's shape.
        """
        super().__init__()
        self.proj = nn.Linear(d, d)
        self.gate = nn.Linear(d, d)
        self.a1 = nn.ReLU()
        self.a2 = nn.Sigmoid()
        # Registered on the module but not applied inside forward().
        self.dropout = nn.Dropout()

    def forward(self, x):
        """Return gate * relu(proj(x)) + (1 - gate) * x.

        @param x: tensor whose last dimension has size d
        @returns tensor with the same shape as x
        """
        projected = self.a1(self.proj(x))
        gate_values = self.a2(self.gate(x))
        carry_values = 1 - gate_values
        return gate_values * projected + carry_values * x



import torch
import torch.nn as nn
import torch.nn.utils
import torch.nn.functional as F

class CNN(nn.Module):
    """1-D convolution followed by ReLU and max-pooling over the whole
    remaining length, producing one vector per input row."""

    def __init__(self, din, dout, k=5):
        """Create the convolution.

        @param din (int): number of input channels
        @param dout (int): number of output channels (filters)
        @param k (int): kernel (window) size
        """
        super(CNN, self).__init__()
        self.conv = nn.Conv1d(din, dout, k)

    def forward(self, x):
        """Apply conv -> ReLU -> max over the last dimension.

        @param x: tensor of shape (batch, din, length); length must be at
            least the kernel size for the convolution to produce any output
        @returns tensor of shape (batch, dout)
        """
        x_conv = self.conv(x)
        # A max over the last axis is the same as a MaxPool1d whose window
        # covers the full length followed by squeeze(-1), without building
        # new nn.Module objects on every forward call.
        x_conv_out, _ = torch.max(torch.relu(x_conv), dim=-1)
        return x_conv_out


class ModelEmbeddings(nn.Module):
    """
    Class that converts input words to their CNN-based embeddings.
    """

    def __init__(self, embed_size, vocab):
        """
        Init the Embedding layer for one language
        @param embed_size (int): Embedding size (dimensionality) for the output
        @param vocab (VocabEntry): VocabEntry object. See vocab.py for documentation.
        """
        super(ModelEmbeddings, self).__init__()

        self.embed_size = embed_size
        self.p = 0.3                          # dropout probability
        self.m_word = len(vocab.word2id)
        self.e_char = 50                      # character embedding size
        self.e_word = embed_size              # word embedding size (CNN filters)

        # Sub-modules are created in the same order as before (embeddings, cnn,
        # highway, dropout) so parameter ordering and seeded initialisation are
        # unchanged.
        pad_token_idx = vocab.char2id['<pad>']
        self.embeddings = nn.Embedding(len(vocab.char2id), self.e_char, padding_idx=pad_token_idx)
        self.cnn = CNN(self.e_char, self.e_word)
        self.highway = Highway(self.e_word)
        self.dropout = nn.Dropout(self.p)

    def forward(self, input):
        """
        Looks up character-based CNN embeddings for the words in a batch of sentences.
        @param input: Tensor of integers of shape (sentence_length, batch_size, max_word_length) where
            each integer is an index into the character vocabulary

        @returns output: Tensor of shape (sentence_length, batch_size, embed_size), containing the
            CNN-based embeddings for each word of the sentences in the batch
        """
        x_emb = self.embeddings(input)
        sentence_length, batch_size, max_word_length, emb_size = x_emb.shape

        # Conv1d wants channels (embedding dim) before length (characters), and
        # a single batch axis, so fold sentence and batch axes together.
        # reshape() is used because the permuted tensor is not contiguous.
        x_emb = x_emb.permute(0, 1, 3, 2).reshape(-1, emb_size, max_word_length)

        x_conv = self.cnn(x_emb)
        x_high = self.highway(x_conv)
        x_word_emb = self.dropout(x_high)

        # Unfold the combined axis back into (sentence_length, batch_size).
        return x_word_emb.view(sentence_length, batch_size, -1)


python sanity_check.py 1j


Running Sanity Check for Question 1j: Model Embedding
Sanity Check Passed for Question 1j: Model Embedding!


def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor:
    """ Take a mini-batch of source and target sentences, compute the log-likelihood of
    target sentences under the language models learned by the NMT system.

    @param source (List[List[str] ]): list of source sentence tokens
    @param target (List[List[str] ]): list of target sentence tokens, wrapped by `<s>` and `</s>`

    @returns scores (Tensor): a scalar tensor holding the log-likelihood of generating the
                                gold-standard target sentences, summed over the whole batch.
                                When a character decoder is configured, its training loss
                                is subtracted from that sum.
    """
    # Compute sentence lengths
    source_lengths = [len(s) for s in source]

    # Word-level target indices are still needed to score the predictions;
    # the encoder and decoder themselves consume character-level tensors.
    target_padded = self.vocab.tgt.to_input_tensor(target, device=self.device)   # Tensor: (tgt_len, b)
    source_padded_chars = self.vocab.src.to_input_tensor_char(source, device=self.device)
    target_padded_chars = self.vocab.tgt.to_input_tensor_char(target, device=self.device)

    enc_hiddens, dec_init_state = self.encode(source_padded_chars, source_lengths)
    enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
    combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded_chars)

    P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1)

    # Zero out, probabilities for which we have nothing in the target text
    target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

    # Compute log probability of generating true target words
    target_gold_words_log_prob = torch.gather(P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:]
    scores = target_gold_words_log_prob.sum()

    if self.charDecoder is not None:
        max_word_len = target_padded_chars.shape[-1]
        # Read the decoder output width from the tensor rather than hard-coding it.
        hidden_size = combined_outputs.shape[-1]

        # Flatten (position, batch) into one axis: one row per target word.
        target_chars = target_padded_chars[1:].reshape(-1, max_word_len)
        target_outputs = combined_outputs.reshape(-1, hidden_size)

        # The character decoder is trained on every target word (no OOV-only
        # selection); the word-level decoder output seeds both LSTM states.
        char_dec_state = target_outputs.unsqueeze(0)
        oovs_losses = self.charDecoder.train_forward(target_chars.t(), (char_dec_state, char_dec_state))
        scores = scores - oovs_losses

    return scores



python run.py train --train-src=./en_es_data/train_tiny.es --train-tgt=./en_es_data/train_tiny.en --dev-src=./en_es_data/dev_tiny.es --dev-tgt=./en_es_data/dev_tiny.en --vocab=vocab_tiny_q1.json --batch-size=2 --valid-niter=100 --max-epoch=101 --no-char-decoder

python run.py decode model.bin ./en_es_data/test_tiny.es ./en_es_data/test_tiny.en outputs/test_outputs_local_q1.txt --no-char-decoder

2. Character-based LSTM decoder for NMT




def __init__(self, hidden_size, char_embedding_size=50, target_vocab=None):
    """ Init Character Decoder.

    @param hidden_size (int): Hidden size of the decoder LSTM
    @param char_embedding_size (int): dimensionality of character embeddings
    @param target_vocab (VocabEntry): vocabulary for the target language. See vocab.py for documentation.
        Although it defaults to None, its `char2id` mapping is read here, so a
        vocabulary must be supplied.
    """
    super(CharDecoder, self).__init__()
    self.charDecoder = nn.LSTM(char_embedding_size, hidden_size)

    num_chars = len(target_vocab.char2id)
    # Kept on the instance so the training loss can skip padding characters.
    self.pad = target_vocab.char2id['<pad>']

    # Linear layer, called W_{dec} and b_{dec} in the PDF.
    self.char_output_projection = nn.Linear(hidden_size, num_chars)
    # padding_idx is passed by keyword so it cannot be mistaken for another
    # positional option of nn.Embedding.
    self.decoderCharEmb = nn.Embedding(num_chars, char_embedding_size, padding_idx=self.pad)
    self.target_vocab = target_vocab


python sanity_check.py 2a


Running Sanity Check for Question 2a: CharDecoder.__init__()
Sanity Check Passed for Question 2a: CharDecoder.__init__()!


def forward(self, input, dec_hidden=None):
    """ Forward pass of character decoder.

    @param input: tensor of integers, shape (length, batch)
    @param dec_hidden: internal state of the LSTM before reading the input characters. A tuple of two tensors of shape (1, batch, hidden_size)

    @returns scores: called s_t in the PDF, shape (length, batch, self.vocab_size)
    @returns dec_hidden: internal state of the LSTM after reading the input characters. A tuple of two tensors of shape (1, batch, hidden_size)
    """
    char_embeddings = self.decoderCharEmb(input)
    # The LSTM returns its per-step outputs plus the final (h, c) state.
    hidden_states, dec_hidden = self.charDecoder(char_embeddings, dec_hidden)
    scores = self.char_output_projection(hidden_states)
    return scores, dec_hidden


python sanity_check.py 2b


Running Sanity Check for Question 2b: CharDecoder.forward()
Sanity Check Passed for Question 2b: CharDecoder.forward()!


def train_forward(self, char_sequence, dec_hidden=None):
    """ Forward computation during training.

    @param char_sequence: tensor of integers, shape (length, batch). Note that "length" here and in forward() need not be the same.
    @param dec_hidden: initial internal state of the LSTM, obtained from the output of the word-level decoder. A tuple of two tensors of shape (1, batch, hidden_size)

    @returns The cross-entropy loss, computed as the *sum* of cross-entropy losses of all the words in the batch.
        Padding characters (index `self.pad`) do not contribute.
    """
    scores, _ = self.forward(char_sequence, dec_hidden)

    # Step t predicts character t + 1: drop the last score row and the first
    # (start-of-word) target row so the two line up.
    logits = scores[:-1]
    targets = char_sequence[1:]

    # reduction='sum' matches the documented contract. The default 'mean'
    # would average within each word and return NaN for words made up only of
    # padding, where there is nothing to average over.
    loss_func = nn.CrossEntropyLoss(reduction='sum', ignore_index=self.pad)

    # Flatten (length, batch) into one axis so a single call replaces the
    # per-word Python loop. reshape() copes with a transposed char_sequence.
    return loss_func(logits.reshape(-1, logits.shape[-1]), targets.reshape(-1))


python sanity_check.py 2c


Running Sanity Check for Question 2c: CharDecoder.train_forward()
Sanity Check Passed for Question 2c: CharDecoder.train_forward()!



def decode_greedy(self, initialStates, device, max_length=21):
    """ Greedy decoding
    @param initialStates: initial internal state of the LSTM, a tuple of two tensors of size (1, batch, hidden_size)
    @param device: torch.device (indicates whether the model is on CPU or GPU)
    @param max_length: maximum length of words to decode

    @returns decodedWords: a list (of length batch) of strings, each of which has length <= max_length.
                          The decoded strings should NOT contain the start-of-word and end-of-word characters.
    """
    batch_size = initialStates[0].shape[1]
    start = self.target_vocab.start_of_word
    end = self.target_vocab.end_of_word

    # Every word starts from the start-of-word character; shape (1, batch).
    current_char = torch.tensor([[start] * batch_size], device=device)
    dec_hidden = initialStates

    # Run all words in lock-step for max_length steps, feeding each step's
    # most likely character back in as the next input.
    predicted_steps = []
    for _ in range(max_length):
        scores, dec_hidden = self.forward(current_char, dec_hidden)
        current_char = torch.argmax(scores, dim=-1)
        predicted_steps.append(current_char)

    if not predicted_steps:
        return [""] * batch_size

    # (max_length, batch) -> one row of plain ints per word. tolist() copies
    # to host memory, so this also works for tensors living on a GPU.
    char_index = torch.cat(predicted_steps, dim=0).t().tolist()

    output = []
    for word_indices in char_index:
        chars = []
        for idx in word_indices:
            # Cut the word at its first end-of-word marker; anything decoded
            # after that point is discarded.
            if idx == end:
                break
            chars.append(self.target_vocab.id2char[idx])
        output.append("".join(chars))
    return output


python sanity_check.py 2d


Running Sanity Check for Question 2d: CharDecoder.decode_greedy()
Sanity Check Passed for Question 2d: CharDecoder.decode_greedy()!



python run.py train --train-src=./en_es_data/train_tiny.es --train-tgt=./en_es_data/train_tiny.en --dev-src=./en_es_data/dev_tiny.es --dev-tgt=./en_es_data/dev_tiny.en --vocab=vocab_tiny_q2.json --batch-size=2 --max-epoch=201 --valid-niter=100
python run.py decode model.bin ./en_es_data/test_tiny.es ./en_es_data/test_tiny.en outputs/test_outputs_local_q2.txt



python run.py train --train-src=./en_es_data/train.es --train-tgt=./en_es_data/train.en --dev-src=./en_es_data/dev.es --dev-tgt=./en_es_data/dev.en --vocab=vocab.json --cuda
python run.py decode model.bin ./en_es_data/test.es ./en_es_data/test.en outputs/test_outputs.txt --cuda


Corpus BLEU: 22.732161209949027

3. Analyzing NMT Systems
