Once everything is ready, we can start training the network. First, define the encoder and decoder instances, along with the optimizers, the loss function, and the other training components.
# Start RNN training with attention mechanism
# Define the network architecture
hidden_size = 32
max_length = MAX_LENGTH
n_layers = 2
encoder = EncoderRNN(input_lang.n_words, hidden_size, n_layers=n_layers)
decoder = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.5,
                         max_length=max_length, n_layers=n_layers)
if use_cuda:
    encoder = encoder.cuda()
    decoder = decoder.cuda()
learning_rate = 0.0001
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()
teacher_forcing_ratio = 0.5
num_epoch = 100

# Start the training cycle
plot_losses = []
for epoch in range(num_epoch):
    # Put the decoder into training mode so that dropout is active
    decoder.train()
    print_loss_total = 0
    # Loop over the training data
    for data in train_loader:
        input_variable = data[0].cuda() if use_cuda else data[0]
        # The size of input_variable: batch_size, length_seq
        target_variable = data[1].cuda() if use_cuda else data[1]
        # The size of target_variable: batch_size, length_seq

        # Clear the gradients
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        encoder_hidden = encoder.initHidden(data[0].size()[0])
        loss = 0

        # The encoder starts working
        encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)
        # The size of encoder_outputs: batch_size, length_seq, hidden_size*direction
        # The size of encoder_hidden: direction*n_layer, batch_size, hidden_size

        # The decoder starts working
        decoder_input = torch.LongTensor([[SOS_token]] * target_variable.size()[0])
        # decoder_input size: batch_size, length_seq
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input
        # Pass the encoder's hidden state to the decoder as the encoding result
        decoder_hidden = encoder_hidden
        # decoder_hidden size: direction*n_layer, batch_size, hidden_size

        # Train the decoder in one of two ways: with teacher forcing, the ground-truth
        # word is fed as the next input; without it, the decoder feeds its own
        # prediction back in as the next input
        use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
        if use_teacher_forcing:
            # Teacher forcing: feed the target word as the decoder's next input
            # Loop over time steps
            for di in range(MAX_LENGTH):
                # The decoder receives the current input word decoder_input, its hidden
                # state from the previous step, and the encoder outputs at every time step
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                # decoder_output size: batch_size, output_size
                # Accumulate the loss, then take the target word as the next input
                loss += criterion(decoder_output, target_variable[:, di])
                decoder_input = target_variable[:, di].unsqueeze(1)  # Teacher forcing
                # decoder_input size: batch_size, length_seq
        else:
            # Without teacher forcing: use the decoder's own prediction as the next input
            # Loop over time steps
            for di in range(MAX_LENGTH):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                # decoder_output size: batch_size, output_size (vocab_size)
                # Take the decoder's prediction and use it as the next input
                topv, topi = decoder_output.data.topk(1, dim=1)
                # topi size: batch_size, k
                ni = topi[:, 0]
                decoder_input = ni.unsqueeze(1)
                # decoder_input size: batch_size, length_seq
                decoder_input = decoder_input.cuda() if use_cuda else decoder_input
                # Accumulate the loss
                loss += criterion(decoder_output, target_variable[:, di])

        # Backpropagation
        loss.backward()
        loss = loss.cpu() if use_cuda else loss
        # Gradient descent step
        encoder_optimizer.step()
        decoder_optimizer.step()
        print_loss_total += loss.data.numpy()[0]

    print_loss_avg = print_loss_total / len(train_loader)

    valid_loss = 0
    rights = []
    # Switch the decoder to evaluation mode to turn off dropout
    decoder.eval()
    # Loop over the validation data
    for data in valid_loader:
        input_variable = data[0].cuda() if use_cuda else data[0]
        # The size of input_variable: batch_size, length_seq
        target_variable = data[1].cuda() if use_cuda else data[1]
        # The size of target_variable: batch_size, length_seq
        encoder_hidden = encoder.initHidden(data[0].size()[0])
        loss = 0
        encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)
        # The size of encoder_outputs: batch_size, length_seq, hidden_size*direction
        # The size of encoder_hidden: direction*n_layer, batch_size, hidden_size
        decoder_input = torch.LongTensor([[SOS_token]] * target_variable.size()[0])
        # decoder_input size: batch_size, length_seq
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input
        decoder_hidden = encoder_hidden
        # decoder_hidden size: direction*n_layer, batch_size, hidden_size
        # Predict step by step
        for di in range(MAX_LENGTH):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            # decoder_output size: batch_size, output_size (vocab_size)
            topv, topi = decoder_output.data.topk(1, dim=1)
            # topi size: batch_size, k
            ni = topi[:, 0]
            decoder_input = ni.unsqueeze(1)
            # decoder_input size: batch_size, length_seq
            decoder_input = decoder_input.cuda() if use_cuda else decoder_input
            right = rightness(decoder_output, target_variable[:, di])
            rights.append(right)
            loss += criterion(decoder_output, target_variable[:, di])
        loss = loss.cpu() if use_cuda else loss
        valid_loss += loss.data.numpy()[0]

    # Compute the average loss, the accuracy and other metrics, and print them
    right_ratio = 1.0 * np.sum([i[0] for i in rights]) / np.sum([i[1] for i in rights])
    print('Progress: %d%%, training loss: %.4f, validation loss: %.4f, word accuracy: %.2f%%' % (
        epoch * 1.0 / num_epoch * 100,
        print_loss_avg,
        valid_loss / len(valid_loader),
        100.0 * right_ratio))
    plot_losses.append([print_loss_avg, valid_loss / len(valid_loader), right_ratio])
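The loop above appends the training loss, validation loss, and word accuracy to plot_losses at the end of every epoch but never draws them. Below is a minimal plotting sketch, assuming matplotlib.pyplot is imported as plt and numpy as np, as they are elsewhere in this chapter:

# Minimal sketch: plot the per-epoch curves collected in plot_losses above
a = np.array(plot_losses)            # shape: num_epoch x 3
plt.plot(a[:, 0], label='training loss')
plt.plot(a[:, 1], label='validation loss')
plt.plot(a[:, 2], label='word accuracy')
plt.xlabel('epoch')
plt.legend()
plt.show()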
In the training code above, the focus is on how the decoder is trained. Note the branch if use_teacher_forcing: when this variable is True, the ground-truth word from the parallel corpus is fed to the decoder as its next input (teacher forcing); when it is False, the decoder's own prediction of the current word is fed back in as the next input. The two procedures behave quite differently.
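The difference between the two branches can be seen in isolation with a toy, self-contained sketch; the token ids 5, 9, 2 and 7 below are made up purely for illustration and are not part of the model above:

import random
import torch

# Toy illustration of the teacher-forcing decision, independent of the model above
target = torch.LongTensor([[5, 9, 2]])   # batch_size x length_seq, the "correct" word ids
predicted = torch.LongTensor([[7]])      # batch_size x 1, the decoder's latest guess
teacher_forcing_ratio = 0.5
di = 0                                   # current time step

if random.random() < teacher_forcing_ratio:
    next_input = target[:, di].unsqueeze(1)   # teacher forcing: feed the ground-truth word
else:
    next_input = predicted                    # free running: feed the decoder's own guess
print(next_input)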
Testing the Neural Machine Translator
After large-scale training, it's time to test our neural machine translator. Run the following code on the test set:
# Randomly pick 20 sentences from the test set and check the translations
indices = np.random.choice(range(len(test_X)), 20)
for ind in indices:
    data = [test_X[ind]]
    target = [test_Y[ind]]
    print(data[0])
    print(SentenceFromList(input_lang, data[0]))
    input_variable = torch.LongTensor(data).cuda() if use_cuda else torch.LongTensor(data)
    # The size of input_variable: batch_size, length_seq
    target_variable = torch.LongTensor(target).cuda() if use_cuda else torch.LongTensor(target)
    # The size of target_variable: batch_size, length_seq

    encoder_hidden = encoder.initHidden(input_variable.size()[0])
    loss = 0
    encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)
    # The size of encoder_outputs: batch_size, length_seq, hidden_size*direction
    # The size of encoder_hidden: direction*n_layer, batch_size, hidden_size

    decoder_input = torch.LongTensor([[SOS_token]] * target_variable.size()[0])
    # decoder_input size: batch_size, length_seq
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input
    decoder_hidden = encoder_hidden
    # decoder_hidden size: direction*n_layer, batch_size, hidden_size

    # Without teacher forcing: use the decoder's own predictions as the next input
    output_sentence = []
    decoder_attentions = torch.zeros(max_length, max_length)
    rights = []
    for di in range(MAX_LENGTH):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        # decoder_output size: batch_size, output_size (vocab_size)
        topv, topi = decoder_output.data.topk(1, dim=1)
        decoder_attentions[di] = decoder_attention.data
        # topi size: batch_size, k
        ni = topi[:, 0]
        decoder_input = ni.unsqueeze(1)
        ni = ni.cpu().numpy()[0]
        output_sentence.append(ni)
        # decoder_input size: batch_size, length_seq
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input
        right = rightness(decoder_output, target_variable[:, di])
        rights.append(right)
    sentence = SentenceFromList(output_lang, output_sentence)
    standard = SentenceFromList(output_lang, target[0])
    print('Machine translation:', sentence)
    print('Standard translation:', standard)
    # Print this sentence's word accuracy
    right_ratio = 1.0 * np.sum([i[0] for i in rights]) / np.sum([i[1] for i in rights])
    print('Word accuracy:', 100.0 * right_ratio)
    print('\n')
After running this code, the system outputs 20 examples, each consisting of the original French sentence, the machine-translated English sentence, the standard translation from the parallel corpus, and the word accuracy of the machine translation.
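The word accuracy reported here comes from the rightness helper that the listings call but do not define in this section (it is introduced earlier in the book). Judging from how right_ratio is computed, it is assumed to return the pair (number of correct predictions, number of examples) for one batch; a minimal sketch under that assumption:

import torch

def rightness(predictions, labels):
    """Return (number of correct predictions, number of examples) for one batch."""
    # predictions: batch_size x vocab_size scores (log-probabilities here)
    # labels: batch_size ground-truth word ids
    pred = torch.max(predictions.data, 1)[1]            # best-scoring word id per example
    rights = pred.eq(labels.data.view_as(pred)).sum()   # how many of them match the target
    return rights, len(labels)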
Displaying the Results
In fact, this is a failure mode that undertrained neural networks often exhibit: they simply output high-frequency words. From the network's point of view this is a safe bet, because words such as "the" and "was" occur very frequently in English, so guessing them is rarely wrong and the per-word accuracy still looks reasonable. Besides increasing the amount of training, many methods have been proposed to avoid this problem.
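One rough way to see how strong this effect is would be to check what fraction of the English corpus the single most frequent word covers. The sketch below assumes the Lang objects keep a word2count dictionary, as the standard Lang class in the PyTorch seq2seq tutorial does; if your implementation differs, rebuild the counts from the English training sentences first.

# Rough check of the high-frequency-word effect (assumes output_lang.word2count exists)
total = sum(output_lang.word2count.values())
word, count = max(output_lang.word2count.items(), key=lambda kv: kv[1])
print('Most frequent English word: %s (%.2f%% of all tokens)' % (word, 100.0 * count / total))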
Next, let's look at how the machine translation model learns to attend to the relevant words. First, the following code outputs the distribution of attention weights that the model produces when translating a particular sentence:
# Examine the attention mechanism by translating a few specific sentences
input_sentence = 'elle a cinq ans de moins que moi .'
data = np.array([indexFromSentence(input_lang, input_sentence)])
input_variable = torch.LongTensor(data).cuda() if use_cuda else torch.LongTensor(data)
# The size of input_variable: batch_size, length_seq
# Note: target here is reused from the previous listing
target_variable = torch.LongTensor(target).cuda() if use_cuda else torch.LongTensor(target)
# The size of target_variable: batch_size, length_seq

encoder_hidden = encoder.initHidden(input_variable.size()[0])
loss = 0
encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)
# The size of encoder_outputs: batch_size, length_seq, hidden_size*direction
# The size of encoder_hidden: direction*n_layer, batch_size, hidden_size

decoder_input = torch.LongTensor([[SOS_token]] * target_variable.size()[0])
# decoder_input size: batch_size, length_seq
decoder_input = decoder_input.cuda() if use_cuda else decoder_input
decoder_hidden = encoder_hidden
# decoder_hidden size: direction*n_layer, batch_size, hidden_size

output_sentence = []
decoder_attentions = torch.zeros(max_length, max_length)
rights = []
for di in range(MAX_LENGTH):
    decoder_output, decoder_hidden, decoder_attention = decoder(
        decoder_input, decoder_hidden, encoder_outputs)
    # decoder_output size: batch_size, output_size (vocab_size)
    topv, topi = decoder_output.data.topk(1, dim=1)
    # At each step, store the attention weight vector in decoder_attentions
    decoder_attentions[di] = decoder_attention.data
    # topi size: batch_size, k
    ni = topi[:, 0]
    decoder_input = ni.unsqueeze(1)
    ni = ni.cpu().numpy()[0]
    output_sentence.append(ni)
    # decoder_input size: batch_size, length_seq
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input
    right = rightness(decoder_output, target_variable[:, di])
    rights.append(right)
sentence = SentenceFromList(output_lang, output_sentence)
print('Machine translation:', sentence)
print('\n')
The decoder_attentions tensor now stores the attention weight distribution at every step of translating this sentence. Next, visualize this weight distribution:
# Combine the attention weights stored at each step into an attention matrix and draw it
import matplotlib.ticker as ticker

fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(decoder_attentions.numpy(), cmap='bone')
fig.colorbar(cax)

# Set the axis labels: source words on the x axis, generated words on the y axis
ax.set_xticklabels([''] + input_sentence.split(' ') + ['<EOS>'], rotation=90)
ax.set_yticklabels([''] + sentence.split(' '))

# Show one word per tick
ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
plt.show()
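To read the heat map numerically, one can also list, for each generated English word, the French word that received the most attention. A short sketch reusing decoder_attentions, input_sentence, and sentence from the listings above:

# For each generated English word, print the source word with the largest attention weight
source_words = input_sentence.split(' ') + ['<EOS>']
target_words = sentence.split(' ')
for i, word in enumerate(target_words):
    j = decoder_attentions[i, :len(source_words)].numpy().argmax()
    print('%s -> %s' % (word, source_words[j]))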