Getting Started with NLP - Text Classification | paddle

Text classification is a classic problem in natural language processing: using computers to automatically classify and label texts according to a given category scheme.

  • Data source: 56,821 news items crawled from the website Chinese News Digest
  • Data content: 10 categories — international, culture, entertainment, sports, finance, automobile, education, technology, real estate, and securities


Strictly speaking, this news dataset is not ideal: the number of articles per category is uneven, whereas a good dataset is distributed more evenly across categories.

1. Prepare the data

Data preprocessing

Create the dataset and the data dictionary

Create the data readers train_reader and test_reader

2. Configure the network

Define the network

Define the loss function: the cross-entropy loss function

Define the optimization algorithm: choose an optimizer such as Adam or SGD

3. Train the network

Feed the training set into the network to train our model

4. Evaluate the model

5. Make predictions with the model

# View the currently mounted dataset directory
!ls /home/aistudio/data/
# Copy the data file into the /home/aistudio/data/ directory
!cp data/data6825/news_classify_data.txt data/
data6825

# Import necessary packages
import os  # operating system utilities
from multiprocessing import cpu_count
import numpy as np  # numerical computing
import shutil
import paddle  # PaddlePaddle
import paddle.fluid as fluid
# Create datasets and data dictionaries

data_root_path = '/home/aistudio/data/'  # data root path
# Generate the train/test data lists: convert each title into a sequence of IDs using the dictionary
def create_data_list(data_root_path):
    # Clear any existing list files
    with open(data_root_path + 'test_list.txt', 'w') as f:
        pass
    with open(data_root_path + 'train_list.txt', 'w') as f:
        pass

    with open(os.path.join(data_root_path, 'dict_txt.txt'), 'r', encoding='utf-8') as f_data:
        dict_txt = eval(f_data.readlines()[0])

    with open(os.path.join(data_root_path, 'news_classify_data.txt'), 'r', encoding='utf-8') as f_data:
        lines = f_data.readlines()
    i = 0
    for line in lines:
        title = line.split('_!_')[-1].replace('\n', '')
        l = line.split('_!_')[1]  # the category label
        labs = ""
        # Every 10th sample goes to the test list; the rest go to the train list
        if i % 10 == 0:
            with open(os.path.join(data_root_path, 'test_list.txt'), 'a', encoding='utf-8') as f_test:
                for s in title:
                    lab = str(dict_txt[s])
                    labs = labs + lab + ','
                labs = labs[:-1]
                labs = labs + '\t' + l + '\n'
                f_test.write(labs)
        else:
            with open(os.path.join(data_root_path, 'train_list.txt'), 'a', encoding='utf-8') as f_train:
                for s in title:
                    lab = str(dict_txt[s])
                    labs = labs + lab + ','
                labs = labs[:-1]
                labs = labs + '\t' + l + '\n'
                f_train.write(labs)
        i += 1
    print("The data list generation is complete!")


# Generate a dictionary from the downloaded data
# Map each character of each text to a numeric ID, because the model's input is not Chinese characters but numeric IDs
def create_dict(data_path, dict_path):
    dict_set = set()
    # Read downloaded data
    with open(data_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    # collect the set of characters appearing in the data
    for line in lines:
        title = line.split('_!_')[-1].replace('\n', '')
        for s in title:
            dict_set.add(s)
    # Convert the set to a dictionary: each character corresponds to one number
    dict_list = []
    i = 0
    for s in dict_set:
        dict_list.append([s, i])
        i += 1
    # Add an entry for unknown characters
    dict_txt = dict(dict_list)
    end_dict = {"<unk>": i}
    dict_txt.update(end_dict)
    # Save the dictionary locally
    with open(dict_path, 'w', encoding='utf-8') as f:
        f.write(str(dict_txt))

    print("The data dictionary generation is complete!")


# Get the length of the dictionary
def get_dict_len(dict_path):
    with open(dict_path, 'r', encoding='utf-8') as f:
        line = eval(f.readlines()[0])

    return len(line.keys())


if __name__ == '__main__':
    # Put the generated data lists and dictionary in a common data folder
    data_root_path = "/home/aistudio/data/"
    data_path = os.path.join(data_root_path, 'news_classify_data.txt')
    dict_path = os.path.join(data_root_path, "dict_txt.txt")
    # Create a data dictionary
    create_dict(data_path, dict_path)
    # Create a data list
    create_data_list(data_root_path)
The data dictionary generation is complete!
The data list generation is complete!

Dictionary created: each character corresponds to a numeric ID.

Data lists created: each text is converted into a serialized (ID-sequence) representation.


Each line represents one news title, i.e., one sample.
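
For illustration, a line in train_list.txt might look like this (the character IDs here are made up; actual IDs depend on the generated dictionary):

402,1577,201,88,903	3

that is, a comma-separated sequence of character IDs, a tab, and the category label.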

paddle.reader.xmap_readers(): maps the samples returned by a reader (into an output queue) through a user-defined mapper, using multiple threads.
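
Conceptually, it behaves like the following single-threaded sketch (our own illustration, not Paddle's implementation, which runs the mapper in multiple worker threads with a buffered queue):

# A minimal single-threaded sketch of what xmap_readers does (illustrative only)
def xmap_sketch(mapper, reader):
    def new_reader():
        for sample in reader():
            yield mapper(sample)  # the real API parallelizes this step
    return new_reader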

# Create data readers train_reader and test_reader
# Preprocess a train/test sample: parse the comma-separated ID string into a list of ints
def data_mapper(sample):
    data, label = sample
    data = [int(w) for w in data.split(',')]
    return data, int(label)

# Create data reader train_reader
def train_reader(train_list_path):
    def reader():
        with open(train_list_path, 'r') as f:
            lines = f.readlines()
            # Shuffle the data
            np.random.shuffle(lines)
            # Yield each sample and its label
            for line in lines:
                data, label = line.split('\t')
                yield data, label
    return paddle.reader.xmap_readers(data_mapper, reader, cpu_count(), 1024)
#  Create data reader test_reader
def test_reader(test_list_path):

    def reader():
        with open(test_list_path, 'r') as f:
            lines = f.readlines()
            for line in lines:
                data, label = line.split('\t')
                yield data, label

    return paddle.reader.xmap_readers(data_mapper, reader, cpu_count(), 1024)

So far, the data preparation work has been completed.

Convolutional Neural Networks (CNN)

The word-vector sequence is fed through convolution kernels to produce feature maps; applying a max-pooling-over-time operation to each feature map yields the whole-sentence feature for that kernel. The features obtained from all convolution kernels are concatenated to form a fixed-length vector representation of the text. For text classification, connecting this vector to a softmax layer completes the model.

In practical applications, we use multiple convolution kernels to process sentences; kernels with the same window size are stacked into a matrix so the operation can be carried out more efficiently.

Alternatively, we can use convolution kernels with different window sizes to process the sentence.
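
To make the convolution and max-pooling-over-time operation concrete, here is a minimal NumPy sketch (our own illustration, independent of the Paddle layers used below, which use sequence_conv_pool with "sqrt" pooling instead of max pooling):

import numpy as np

def conv_max_over_time(emb, kernel):
    # emb: (seq_len, emb_dim) word-vector sequence; kernel: (window, emb_dim)
    window = kernel.shape[0]
    # Slide the kernel over time: one scalar feature per window position
    feats = [float(np.sum(emb[t:t + window] * kernel))
             for t in range(emb.shape[0] - window + 1)]
    # Max over time collapses the variable-length sequence to one fixed feature
    return max(feats)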

# Create a CNN network

def CNN_net(data, dict_dim, class_dim=10, emb_dim=128, hid_dim=128, hid_dim2=98):
    # Before entering the convolutions, look up an embedding for each word ID
    emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
    # Convolution + sequence pooling with window size 3
    conv_3 = fluid.nets.sequence_conv_pool(input=emb,
                                           num_filters=hid_dim,
                                           filter_size=3,
                                           act="tanh",
                                           pool_type="sqrt")
    # Convolution + sequence pooling with window size 4
    conv_4 = fluid.nets.sequence_conv_pool(input=emb,
                                           num_filters=hid_dim2,
                                           filter_size=4,
                                           act="tanh",
                                           pool_type="sqrt")
    # The fully connected layer concatenates the outputs of the two convolution branches
    output = fluid.layers.fc(input=[conv_3, conv_4], size=class_dim, act='softmax')
    # Returns a 1x10 probability distribution; the class with the highest probability is the model's prediction
    return output

# Define the input data; lod_level=1 marks the input as variable-length sequence data
words = fluid.layers.data(name='words', shape=[1], dtype='int64', lod_level=1)  # for fixed-length data, lod_level can stay at 0
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
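# Illustration (our own example): for a batch of two sentences with 3 and 2 words,
# a LoDTensor stores the 5 word IDs contiguously and records lod [[3, 2]],
# so the network knows where each variable-length sequence begins and ends.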
# Get data dictionary length
dict_dim = get_dict_len('/home/aistudio/data/dict_txt.txt')
# Get Convolutional Neural Networks
# model = CNN_net(words, dict_dim, 15)
# get classifier
model = CNN_net(words, dict_dim)
# Get loss function and accuracy
cost = fluid.layers.cross_entropy(input=model, label=label)  # cross-entropy loss
avg_cost = fluid.layers.mean(cost)  # average the loss over the batch
acc = fluid.layers.accuracy(input=model, label=label)

# Get the test program
test_program = fluid.default_main_program().clone(for_test=True)  # clone the main program for evaluation

# Define an optimization method
optimizer = fluid.optimizer.AdagradOptimizer(learning_rate=0.002)
opt = optimizer.minimize(avg_cost)
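# As noted above, other optimizers could be swapped in here, e.g. (an untested
# alternative with a hypothetical learning rate):
# optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001)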

# Create an executor; training on CPU is slower
#place = fluid.CPUPlace()
place = fluid.CUDAPlace(0)  # run on GPU
exe = fluid.Executor(place)
# Initialize parameters
exe.run(fluid.default_startup_program())

# Get training data reader and test data reader
train_reader = paddle.batch(reader=train_reader('/home/aistudio/data/train_list.txt'), batch_size=128)
test_reader = paddle.batch(reader=test_reader('/home/aistudio/data/test_list.txt'), batch_size=128)
# Define the data feeder
feeder = fluid.DataFeeder(place=place, feed_list=[words, label])
EPOCH_NUM = 20  # number of training epochs
model_save_dir = '/home/aistudio/work/infer_model/'
# start training

for pass_id in range(EPOCH_NUM):
    # Train on each batch
    for batch_id, data in enumerate(train_reader()):
        train_cost, train_acc = exe.run(program=fluid.default_main_program(),
                             feed=feeder.feed(data),
                             fetch_list=[avg_cost, acc])

        if batch_id % 100 == 0:  # print progress every 100 batches
            print('Pass:%d, Batch:%d, Cost:%0.5f, Acc:%0.5f' % (pass_id, batch_id, train_cost[0], train_acc[0]))
    # Evaluate on the test set: data the model has not seen during training
    test_costs = []
    test_accs = []
    for batch_id, data in enumerate(test_reader()):
        test_cost, test_acc = exe.run(program=test_program,
                                              feed=feeder.feed(data),
                                              fetch_list=[avg_cost, acc])
        test_costs.append(test_cost[0])
        test_accs.append(test_acc[0])
    # Calculate the average prediction loss and accuracy
    test_cost = (sum(test_costs) / len(test_costs))
    test_acc = (sum(test_accs) / len(test_accs))
    print('Test:%d, Cost:%0.5f, ACC:%0.5f' % (pass_id, test_cost, test_acc))

# Save the inference model; you could also move this save code inside the training loop to keep a model from every epoch (a sketch follows the training log below)
if not os.path.exists(model_save_dir): 
    os.makedirs(model_save_dir) 
fluid.io.save_inference_model(model_save_dir, 
                            feeded_var_names=[words.name], 
                            target_vars=[model], 
                            executor=exe)
print('The training model is saved!') 
Pass:0, Batch:0, Cost:2.30681, Acc:0.09375
Pass:0, Batch:100, Cost:0.99743, Acc:0.68750
Pass:0, Batch:200, Cost:0.89360, Acc:0.76562
Pass:0, Batch:300, Cost:0.92248, Acc:0.70312
Test:0, Cost:0.81883, ACC:0.73921
Pass:1, Batch:0, Cost:0.90457, Acc:0.67969
Pass:1, Batch:100, Cost:0.67305, Acc:0.83594
Pass:1, Batch:200, Cost:0.63098, Acc:0.80469
Pass:1, Batch:300, Cost:0.76019, Acc:0.77344
Test:1, Cost:0.75819, ACC:0.75909
Pass:2, Batch:0, Cost:0.73232, Acc:0.76562
Pass:2, Batch:100, Cost:0.70476, Acc:0.77344
Pass:2, Batch:200, Cost:0.71542, Acc:0.75781
Pass:2, Batch:300, Cost:0.63258, Acc:0.78125
Test:2, Cost:0.73717, ACC:0.76160
Pass:3, Batch:0, Cost:0.56025, Acc:0.82812
Pass:3, Batch:100, Cost:0.48580, Acc:0.86719
Pass:3, Batch:200, Cost:0.54991, Acc:0.84375
Pass:3, Batch:300, Cost:0.67272, Acc:0.78906
Test:3, Cost:0.72726, ACC:0.76317
Pass:4, Batch:0, Cost:0.53660, Acc:0.82812
Pass:4, Batch:100, Cost:0.73550, Acc:0.78906
Pass:4, Batch:200, Cost:0.53774, Acc:0.80469
Pass:4, Batch:300, Cost:0.46155, Acc:0.85156
Test:4, Cost:0.72185, ACC:0.76169
Pass:5, Batch:0, Cost:0.65421, Acc:0.78906
Pass:5, Batch:100, Cost:0.59889, Acc:0.80469
Pass:5, Batch:200, Cost:0.71301, Acc:0.79688
Pass:5, Batch:300, Cost:0.69682, Acc:0.81250
Test:5, Cost:0.71626, ACC:0.76525
Pass:6, Batch:0, Cost:0.72434, Acc:0.75000
Pass:6, Batch:100, Cost:0.59109, Acc:0.77344
Pass:6, Batch:200, Cost:0.48783, Acc:0.81250
Pass:6, Batch:300, Cost:0.57463, Acc:0.81250
Test:6, Cost:0.71520, ACC:0.76447
Pass:7, Batch:0, Cost:0.50502, Acc:0.84375
Pass:7, Batch:100, Cost:0.62133, Acc:0.79688
Pass:7, Batch:200, Cost:0.68593, Acc:0.76562
Pass:7, Batch:300, Cost:0.55528, Acc:0.80469
Test:7, Cost:0.71300, ACC:0.76769
Pass:8, Batch:0, Cost:0.60046, Acc:0.76562
Pass:8, Batch:100, Cost:0.47617, Acc:0.82812
Pass:8, Batch:200, Cost:0.59591, Acc:0.79688
Pass:8, Batch:300, Cost:0.66050, Acc:0.76562
Test:8, Cost:0.71475, ACC:0.76594
Pass:9, Batch:0, Cost:0.40968, Acc:0.84375
Pass:9, Batch:100, Cost:0.50980, Acc:0.81250
Pass:9, Batch:200, Cost:0.55923, Acc:0.85156
Pass:9, Batch:300, Cost:0.42255, Acc:0.87500
Test:9, Cost:0.71282, ACC:0.76717
Pass:10, Batch:0, Cost:0.44147, Acc:0.88281
Pass:10, Batch:100, Cost:0.55140, Acc:0.85938
Pass:10, Batch:200, Cost:0.50935, Acc:0.84375
Pass:10, Batch:300, Cost:0.56366, Acc:0.83594
Test:10, Cost:0.71520, ACC:0.76586
Pass:11, Batch:0, Cost:0.55133, Acc:0.79688
Pass:11, Batch:100, Cost:0.45308, Acc:0.80469
Pass:11, Batch:200, Cost:0.63471, Acc:0.78125
Pass:11, Batch:300, Cost:0.52810, Acc:0.80469
Test:11, Cost:0.71511, ACC:0.76673
Pass:12, Batch:0, Cost:0.51947, Acc:0.83594
Pass:12, Batch:100, Cost:0.63086, Acc:0.80469
Pass:12, Batch:200, Cost:0.57166, Acc:0.82812
Pass:12, Batch:300, Cost:0.59658, Acc:0.75781
Test:12, Cost:0.71533, ACC:0.76673
Pass:13, Batch:0, Cost:0.34512, Acc:0.89062
Pass:13, Batch:100, Cost:0.47249, Acc:0.82812
Pass:13, Batch:200, Cost:0.51224, Acc:0.85156
Pass:13, Batch:300, Cost:0.45350, Acc:0.84375
Test:13, Cost:0.71736, ACC:0.76647
Pass:14, Batch:0, Cost:0.45494, Acc:0.85156
Pass:14, Batch:100, Cost:0.68085, Acc:0.78125
Pass:14, Batch:200, Cost:0.48124, Acc:0.83594
Pass:14, Batch:300, Cost:0.47296, Acc:0.85938
Test:14, Cost:0.71745, ACC:0.76760
Pass:15, Batch:0, Cost:0.73750, Acc:0.77344
Pass:15, Batch:100, Cost:0.55038, Acc:0.83594
Pass:15, Batch:200, Cost:0.59775, Acc:0.74219
Pass:15, Batch:300, Cost:0.47932, Acc:0.82812
Test:15, Cost:0.72163, ACC:0.76673
Pass:16, Batch:0, Cost:0.31890, Acc:0.90625
Pass:16, Batch:100, Cost:0.38017, Acc:0.85156
Pass:16, Batch:200, Cost:0.57517, Acc:0.79688
Pass:16, Batch:300, Cost:0.44878, Acc:0.87500
Test:16, Cost:0.72158, ACC:0.76786
Pass:17, Batch:0, Cost:0.43048, Acc:0.88281
Pass:17, Batch:100, Cost:0.47145, Acc:0.82031
Pass:17, Batch:200, Cost:0.47934, Acc:0.82812
Pass:17, Batch:300, Cost:0.36709, Acc:0.89062
Test:17, Cost:0.72381, ACC:0.76647
Pass:18, Batch:0, Cost:0.35568, Acc:0.88281
Pass:18, Batch:100, Cost:0.61057, Acc:0.82031
Pass:18, Batch:200, Cost:0.40052, Acc:0.88281
Pass:18, Batch:300, Cost:0.45469, Acc:0.83594
Test:18, Cost:0.72549, ACC:0.76743
Pass:19, Batch:0, Cost:0.41658, Acc:0.86719
Pass:19, Batch:100, Cost:0.48703, Acc:0.86719
Pass:19, Batch:200, Cost:0.47010, Acc:0.83594
Pass:19, Batch:300, Cost:0.35333, Acc:0.84375
Test:19, Cost:0.72887, ACC:0.76690
 The training model is saved!
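
As suggested in the comment above, you can keep a checkpoint per epoch. A minimal sketch (our own variant, reusing the same save call; the per-epoch subdirectory name is a made-up convention):

# Inside the epoch loop, after the test evaluation:
epoch_dir = os.path.join(model_save_dir, 'epoch_%d' % pass_id)
if not os.path.exists(epoch_dir):
    os.makedirs(epoch_dir)
fluid.io.save_inference_model(epoch_dir,
                              feeded_var_names=[words.name],
                              target_vars=[model],
                              executor=exe)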
# Use the trained model to make predictions and output the prediction results
# Create the executor
#place = fluid.CPUPlace()
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

save_path = '/home/aistudio/work/infer_model/'

# Load the inference program, the list of input variable names, and the target variable (the classifier output)
[infer_program, feeded_var_names, target_var] = fluid.io.load_inference_model(dirname=save_path, executor=exe)


# Convert a sentence into a list of dictionary IDs
def get_data(sentence):
    # read data dictionary
    with open('/home/aistudio/data/dict_txt.txt', 'r', encoding='utf-8') as f_data:
        dict_txt = eval(f_data.readlines()[0])
    dict_txt = dict(dict_txt)
    # Convert string data to list data
    keys = dict_txt.keys()
    data = []
    for s in sentence:
        # Replace characters not found in the dictionary with <unk>
        if not s in keys:
            s = '<unk>'
        data.append(int(dict_txt[s]))
    return data


data = []
# Build input data from two sample sentences
data1 = get_data('Seven years after winning the Nobel Prize for Literature, Mo Yan said this at Jiajiazhuang, Fenyang, Shanxi Province on the evening of the 15th')
data2 = get_data('According to local media reports such as "USA Today" and "World Journal", the Chicago Riverside Police Department said,')
data.append(data1)
data.append(data2)

# Get the number of words in each sentence
base_shape = [[len(c) for c in data]]
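# For example (hypothetical lengths): if data1 has 30 IDs and data2 has 25,
# base_shape is [[30, 25]] -- the level-0 LoD lengths for this batch.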

# Build the LoD tensor for prediction
tensor_words = fluid.create_lod_tensor(data, base_shape, place)

# perform prediction
result = exe.run(program=infer_program,
                 feed={feeded_var_names[0]: tensor_words},
                 fetch_list=target_var)

# Category names
names = ['culture', 'entertainment', 'sports', 'finance', 'real estate', 'automobile', 'education', 'technology', 'international', 'securities']

# Get the label with the highest probability for each sample
for i in range(len(data)):
    lab = np.argsort(result)[0][i][-1]  # sort the 10 probabilities; the last index after argsort is the most likely class
    print('Predicted label: %d, name: %s, probability: %f' % (lab, names[lab], result[0][i][lab]))
Predicted label: 0, name: culture, probability: 0.949490
Predicted label: 8, name: international, probability: 0.472569
