Neural Network Training

After everything is ready, we will start training the neural network. First, define encoder and decoder instances, as well as optimizers, loss functions, and other components.

# Setup for training the attention-based encoder/decoder pair:
# hyper-parameters, the two networks, their optimizers and the loss.

# Network hyper-parameters
hidden_size = 32
max_length = MAX_LENGTH
n_layers = 2

# The encoder is built first, then the decoder.
encoder = EncoderRNN(input_lang.n_words, hidden_size, n_layers=n_layers)
decoder = AttnDecoderRNN(
    hidden_size,
    output_lang.n_words,
    dropout_p=0.5,
    max_length=max_length,
    n_layers=n_layers,
)

# Move both networks to the GPU when one is in use.
if use_cuda:
    encoder, decoder = encoder.cuda(), decoder.cuda()

# One Adam optimizer per network, both with the same learning rate.
learning_rate = 0.0001
encoder_optimizer, decoder_optimizer = (
    optim.Adam(net.parameters(), lr=learning_rate) for net in (encoder, decoder)
)

# Negative log-likelihood loss; teacher forcing is applied to roughly half of
# the training batches (see the training loop).
criterion = nn.NLLLoss()
teacher_forcing_ratio = 0.5

# Number of passes over the training data
num_epoch = 100

# Training cycle: every epoch makes one pass over train_loader (with parameter
# updates) followed by one pass over valid_loader (evaluation only).
# plot_losses collects, per epoch:
#   [average training loss, average validation loss, word accuracy]
plot_losses = []
for epoch in range(num_epoch):
    # Training mode, so that dropout is active. The encoder is switched as
    # well so both networks are always in the same mode (this is a no-op if
    # the encoder has no mode-dependent layers).
    encoder.train()
    decoder.train()
    print_loss_total = 0

    # ---- training pass ----
    for data in train_loader:
        input_variable = data[0].cuda() if use_cuda else data[0]
        # The size of input_variable: batch_size, length_seq
        target_variable = data[1].cuda() if use_cuda else data[1]
        # The size of target_variable: batch_size, length_seq

        # Clear the gradients left over from the previous batch
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_hidden = encoder.initHidden(input_variable.size(0))

        loss = 0

        # Run the encoder over the whole input sequence
        encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)
        # The size of encoder_outputs: batch_size, length_seq, hidden_size*direction
        # The size of encoder_hidden: direction*n_layer, batch_size, hidden_size

        # The decoder starts from a start-of-sentence token for every sample
        decoder_input = torch.LongTensor([[SOS_token]] * target_variable.size(0))
        # decoder_input size: batch_size, length_seq
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input

        # The final hidden state of the encoder initialises the decoder
        decoder_hidden = encoder_hidden
        # decoder_hidden size: direction*n_layer, batch_size, hidden_size

        # Per batch, randomly choose how the next decoder input is produced:
        #   teacher forcing -> the ground-truth target word of this step
        #   otherwise       -> the decoder's own most likely prediction
        use_teacher_forcing = random.random() < teacher_forcing_ratio

        for di in range(MAX_LENGTH):
            # The decoder receives the current input word, its previous hidden
            # state and the encoder outputs of all time steps
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            # decoder_output size: batch_size, output_size(vocab_size)
            step_target = target_variable[:, di]
            loss += criterion(decoder_output, step_target)

            if use_teacher_forcing:
                decoder_input = step_target.unsqueeze(1)
            else:
                # topk(1) returns indices of size (batch_size, 1), which is
                # already the shape and device the decoder expects as input.
                # detach() keeps the prediction out of the autograd graph.
                _, topi = decoder_output.detach().topk(1, dim=1)
                decoder_input = topi
            # decoder_input size: batch_size, length_seq

        # Back-propagate and update both networks
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()
        # item() extracts the Python number from the single-element loss
        # tensor on any device; indexing the 0-dim numpy array with [0] fails.
        print_loss_total += loss.item()

    print_loss_avg = print_loss_total / len(train_loader)

    # ---- validation pass ----
    valid_loss = 0
    rights = []
    # Evaluation mode turns dropout off
    encoder.eval()
    decoder.eval()

    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time.
    with torch.no_grad():
        for data in valid_loader:
            input_variable = data[0].cuda() if use_cuda else data[0]
            # The size of input_variable: batch_size, length_seq
            target_variable = data[1].cuda() if use_cuda else data[1]
            # The size of target_variable: batch_size, length_seq

            encoder_hidden = encoder.initHidden(input_variable.size(0))

            loss = 0
            encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)
            # The size of encoder_outputs: batch_size, length_seq, hidden_size*direction
            # The size of encoder_hidden: direction*n_layer, batch_size, hidden_size

            decoder_input = torch.LongTensor([[SOS_token]] * target_variable.size(0))
            # decoder_input size: batch_size, length_seq
            decoder_input = decoder_input.cuda() if use_cuda else decoder_input

            decoder_hidden = encoder_hidden
            # decoder_hidden size: direction*n_layer, batch_size, hidden_size

            # Decode step by step, always feeding back the decoder's own prediction
            for di in range(MAX_LENGTH):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                # decoder_output size: batch_size, output_size(vocab_size)
                step_target = target_variable[:, di]
                rights.append(rightness(decoder_output, step_target))
                loss += criterion(decoder_output, step_target)

                _, topi = decoder_output.topk(1, dim=1)
                # topi size: batch_size, k
                decoder_input = topi
                # decoder_input size: batch_size, length_seq
            valid_loss += loss.item()

    # Aggregate the epoch statistics and report them. Each entry of rights is
    # indexed as (i[0], i[1]); the ratio of their sums is used as word accuracy.
    valid_loss_avg = valid_loss / len(valid_loader)
    right_ratio = 1.0 * np.sum([i[0] for i in rights]) / np.sum([i[1] for i in rights])
    print('process:%d%%,Training loss:%.4f,Check loss:%.4f,word accuracy:%.2f%%' % (epoch * 1.0 / num_epoch * 100,
                                                                        print_loss_avg,
                                                                        valid_loss_avg,
                                                                        100.0 * right_ratio))
    plot_losses.append([print_loss_avg, valid_loss_avg, right_ratio])

In this code, the focus is on how the decoder is trained. Note the check `if use_teacher_forcing:`. When this variable is True, the ground-truth target word from the parallel corpus is fed to the decoder as its next input (teacher forcing); when it is False, the decoder's own predicted word is used as the next input. These two training regimes are very different.

Testing Neural Machine Translators

After large-scale training, it's time to test our neural machine translator. Run the following code on the test set:

# Randomly pick 20 sentences from the test set and print, for each one, the
# source sentence, the machine translation, the reference translation and the
# word accuracy. Note: np.random.choice samples with replacement here, so the
# same sentence may be picked more than once.
indices = np.random.choice(range(len(test_X)), 20)

# Pure inference: evaluation mode (dropout off) and no autograd bookkeeping.
encoder.eval()
decoder.eval()
with torch.no_grad():
    for ind in indices:
        data = [test_X[ind]]
        target = [test_Y[ind]]
        print(data[0])
        print(SentenceFromList(input_lang, data[0]))

        input_variable = torch.LongTensor(data)
        # The size of input_variable: batch_size, length_seq
        target_variable = torch.LongTensor(target)
        # The size of target_variable: batch_size, length_seq
        if use_cuda:
            input_variable = input_variable.cuda()
            target_variable = target_variable.cuda()

        encoder_hidden = encoder.initHidden(input_variable.size()[0])

        encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)
        # The size of encoder_outputs: batch_size, length_seq, hidden_size*direction
        # The size of encoder_hidden: direction*n_layer, batch_size, hidden_size

        decoder_input = torch.LongTensor([[SOS_token]] * target_variable.size()[0])
        # decoder_input size: batch_size, length_seq
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input

        decoder_hidden = encoder_hidden
        # decoder_hidden size: direction*n_layer, batch_size, hidden_size

        # Without teacher forcing: the decoder's own prediction is its next input
        output_sentence = []
        decoder_attentions = torch.zeros(max_length, max_length)
        rights = []
        for di in range(MAX_LENGTH):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            # decoder_output size: batch_size, output_size(vocab_size)
            topv, topi = decoder_output.data.topk(1, dim=1)
            decoder_attentions[di] = decoder_attention.data
            # topi size: batch_size, k
            ni = topi[:, 0]
            decoder_input = ni.unsqueeze(1)
            # decoder_input size: batch_size, length_seq

            # cpu is a method and has to be called before converting to numpy;
            # the batch holds a single sentence, so element 0 is its word index.
            output_sentence.append(ni.cpu().numpy()[0])

            right = rightness(decoder_output, target_variable[:, di])
            rights.append(right)

        sentence = SentenceFromList(output_lang, output_sentence)
        standard = SentenceFromList(output_lang, target[0])
        print('machine translation:', sentence)
        print('Standard translation:', standard)
        # Output the word accuracy of this sentence
        right_ratio = 1.0 * np.sum([i[0] for i in rights]) / np.sum([i[1] for i in rights])
        print('word accuracy:', 100.0 * right_ratio)
        print('\n')

After running this code, the system prints, for each of the 20 sampled sentences, the original French text, the machine-translated English, the reference translation from the parallel corpus, and the word accuracy of the machine translation.

Result display

In fact, this is a typical failure mode of under-trained neural networks: they simply output high-frequency words. The "advantage" of this strategy is that the probability of guessing the next word wrong is relatively low: words such as "the" and "was" appear so frequently in English that such guesses are often correct. Besides increasing the amount of training, many other methods have been proposed to avoid this problem.
Next, let's take a look at how machine translation learns to pay attention to related words. First, the following code can be used to output the distribution of attention weights of the machine when translating a certain sentence:

# Inspect the attention mechanism by translating one hand-picked sentence and
# recording the attention weights produced at every decoding step.
# input_sentence keeps the raw text because the plotting code that follows
# reads it to label the x axis.
input_sentence = 'elle a cinq ans de moins que moi .'
data = np.array([indexFromSentence(input_lang, input_sentence)])

input_variable = torch.LongTensor(data).cuda() if use_cuda else torch.LongTensor(data)
# The size of input_variable: batch_size, length_seq

# Pure inference: evaluation mode (dropout off) and no autograd bookkeeping.
encoder.eval()
decoder.eval()
with torch.no_grad():
    encoder_hidden = encoder.initHidden(input_variable.size()[0])

    encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)
    # The size of encoder_outputs: batch_size, length_seq, hidden_size*direction
    # The size of encoder_hidden: direction*n_layer, batch_size, hidden_size

    # The batch size is taken from the input itself. There is no reference
    # translation for this sentence, so no target tensor is built and no
    # accuracy is computed; scoring against a target left over from earlier
    # code would compare against an unrelated sentence.
    decoder_input = torch.LongTensor([[SOS_token]] * input_variable.size()[0])
    # decoder_input size: batch_size, length_seq
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    decoder_hidden = encoder_hidden
    # decoder_hidden size: direction*n_layer, batch_size, hidden_size

    output_sentence = []
    decoder_attentions = torch.zeros(max_length, max_length)
    for di in range(MAX_LENGTH):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        # decoder_output size: batch_size, output_size(vocab_size)
        topv, topi = decoder_output.data.topk(1, dim=1)

        # At each step, store the attention weight vector as row di
        decoder_attentions[di] = decoder_attention.data
        # topi size: batch_size, k
        ni = topi[:, 0]
        decoder_input = ni.unsqueeze(1)
        # decoder_input size: batch_size, length_seq

        # cpu is a method and has to be called before converting to numpy;
        # the batch holds a single sentence, so element 0 is its word index.
        output_sentence.append(ni.cpu().numpy()[0])

sentence = SentenceFromList(output_lang, output_sentence)
print('machine translation:', sentence)
print('\n')

The decoder_attentions tensor stores the distribution of attention weights at each time step in the process of translating the sentence. Next, visualize this weight distribution:

# Draw the attention matrix collected above as a heat map; row di of
# decoder_attentions holds the weights stored at decoding step di.
import matplotlib.ticker as ticker

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
cax = ax.matshow(decoder_attentions.numpy(), cmap='bone')
fig.colorbar(cax)

# Axis captions: source words (plus the end-of-sentence marker) along x,
# translated words along y. Each list starts with an empty label.
source_words = input_sentence.split(' ')
translated_words = sentence.split(' ')
ax.set_xticklabels([''] + source_words + ['<EOS>'], rotation=90)
ax.set_yticklabels([''] + translated_words)

# Place a major tick at every integer position so each word gets its own tick
for axis in (ax.xaxis, ax.yaxis):
    axis.set_major_locator(ticker.MultipleLocator(1))

plt.show()

Tags: Deep Learning neural networks Machine Learning

Posted by ultrus on Sat, 01 Apr 2023 03:12:36 +0530