Chinese Text Classification in Practice with PaddlePaddle

Preface

In Chapter 5 we studied recurrent neural networks (RNNs) and built a text classification model, but we trained it on one of PaddlePaddle's built-in datasets. In this chapter we will learn how to train PaddlePaddle on a Chinese text dataset of our own, covering data crawling, preprocessing, model definition, training, and prediction.

GitHub repository: https://github.com/yeyupiaoling/LearnPaddle2/tree/master/note12

Crawling a Text Dataset

High-quality Chinese text classification datasets are scarce, so we will crawl our own data from the web. The following code crawls Chinese news headlines from Toutiao (今日头条):

Create download_text_data.py

import os
import random
import requests
import json
import time

# News categories: [label id, category name, Toutiao channel name]
news_classify = [
    [0, '民生', 'news_story'],
    [1, '文化', 'news_culture'],
    [2, '娱乐', 'news_entertainment'],
    [3, '体育', 'news_sports'],
    [4, '财经', 'news_finance'],
    [5, '房产', 'news_house'],
    [6, '汽车', 'news_car'],
    [7, '教育', 'news_edu'],
    [8, '科技', 'news_tech'],
    [9, '军事', 'news_military'],
    [10, '旅游', 'news_travel'],
    [11, '国际', 'news_world'],
    [12, '证券', 'stock'],
    [13, '农业', 'news_agriculture'],
    [14, '游戏', 'news_game']
]

# IDs and running count of headlines downloaded so far
downloaded_data_id = []
downloaded_sum = 0

def get_data(tup, data_path):
    global downloaded_data_id
    global downloaded_sum
    print('============%s============' % tup[1])
    url = "http://it.snssdk.com/api/news/feed/v63/"
    # Randomize max_behot_time so that each request pulls a different page of headlines
    t = int(time.time() / 10000)
    t = random.randint(6 * t, 10 * t)
    querystring = {
        "category": tup[2], "max_behot_time": t, 
        "last_refresh_sub_entrance_interval": "1524907088",
        "loc_mode": "5", "tt_from": "pre_load_more", 
        "cp": "51a5ee4f38c50q1", "plugin_enable": "0", 
        "iid": "31047425023", "device_id": "51425358841", 
        "ac": "wifi", "channel": "tengxun", "aid": "13",
        "app_name": "news_article", "version_code": "631", 
        "version_name": "6.3.1", "device_platform": "android",
        "ab_version": "333116,297979,317498,336556,295827,325046,239097,324283,170988,335432,332098,325198,336443,330632,297058,276203,286212,313219,328615,332041,329358,322321,327537,335710,333883,335102,334828,328670,324007,317077,334305,280773,335671,319960,333985,331719,336452,214069,31643,332881,333968,318434,207253,266310,321519,247847,281298,328218,335998,325618,333327,336199,323429,287591,288418,260650,326188,324614,335477,271178,326588,326524,326532",
        "ab_client": "a1,c4,e1,f2,g2,f7", "ab_feature": "94563,102749", 
        "abflag": "3", "ssmix": "a", "device_type": "MuMu", 
        "device_brand": "Android", "language": "zh", "os_api": "19",
        "os_version": "4.4.4", "uuid": "008796762094657", 
        "openudid": "b7215ea70ca32066", "manifest_version_code": "631",
        "resolution": "1280*720", "dpi": "240", 
        "update_version_code": "6310", "_rticket": "1524907088018",
        "plugin": "256"
    }

    headers = {
        'cache-control': "no-cache",
        'postman-token': "26530547-e697-1e8b-fd82-7c6014b3ee86",
        "User-Agent": "Dalvik/1.6.0 (Linux; U; Android 4.4.4; MuMu Build/V417IR) NewsArticle/6.3.1 okhttp/3.7.0.2"
    }

    response = requests.request("GET", url, headers=headers, params=querystring)
    new_data = json.loads(response.text)

    with open(data_path, 'a', encoding='utf-8') as fp:
        for item in new_data['data']:
            item = item['content']
            item = item.replace('\"', '"')
            item = json.loads(item)
            if 'item_id' in item and 'title' in item:
                item_id = item['item_id']
                if item_id not in downloaded_data_id:
                    downloaded_data_id.append(item_id)
                    line = u"{}_!_{}_!_{}_!_{}".format(
                        item['item_id'], tup[0], tup[1], item['title']
                    ).replace('\n', '').replace('\r', '') + '\n'
                    fp.write(line)
                    downloaded_sum += 1
                    print(downloaded_sum, tup[0], tup[1], item['item_id'], item['title'])

def get_routine(data_path):
    global downloaded_sum
    if os.path.exists(data_path):
        with open(data_path, 'r', encoding='utf-8') as fp:
            lines = fp.readlines()
            downloaded_sum = len(lines)
            for line in lines:
                item_id = int(line.split('_!_')[0])
                downloaded_data_id.append(item_id)
        print('Loaded %d existing records' % downloaded_sum)
    else:
        os.makedirs(os.path.dirname(data_path), exist_ok=True)

    # Poll every category every 10 seconds until 300,000 headlines have been collected
    while True:
        time.sleep(10)
        for classify in news_classify:
            get_data(classify, data_path)
        if downloaded_sum >= 300000:
            break

if __name__ == '__main__':
    data_path = 'datasets/news_classify_data.txt'
    get_routine(data_path)

Sample crawl output

============文化============
17 1 文化 6646565189942510093 世界第一豪宅,坐落于北京...
18 1 文化 6658382232383652104 俗语讲:“男怕初一,女怕十五”...
...

Preparing the Training Data

The crawled data has to be converted into the integer format PaddlePaddle works with, which means building a character dictionary and generating train/test data lists. Each line of news_classify_data.txt has the form item_id_!_label id_!_category name_!_title.

Create create_data.py

import os

def create_dict(data_path, dict_path):
    dict_set = set()
    with open(data_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    for line in lines:
        title = line.split('_!_')[-1].replace('\n', '')
        for s in title:
            dict_set.add(s)
    dict_list = [[s, i] for i, s in enumerate(dict_set)]
    dict_txt = dict(dict_list)
    dict_txt["<unk>"] = len(dict_txt)  # 添加未知字符

    with open(dict_path, 'w', encoding='utf-8') as f:
        f.write(str(dict_txt))
    print("数据字典生成完成!")

def create_data_list(data_root_path):
    with open(os.path.join(data_root_path, 'dict_txt.txt'), 'r', encoding='utf-8') as f:
        dict_txt = eval(f.readlines()[0])

    with open(os.path.join(data_root_path, 'news_classify_data.txt'), 'r', encoding='utf-8') as f:
        lines = f.readlines()

    for i, line in enumerate(lines):
        title = line.split('_!_')[-1].replace('\n', '')
        label = line.split('_!_')[1]
        words = [str(dict_txt[s] if s in dict_txt else dict_txt["<unk>"]) for s in title]
        line_str = ','.join(words) + '\t' + label + '\n'

        if i % 10 == 0:  # every 10th sample goes to the test list
            with open(os.path.join(data_root_path, 'test_list.txt'), 'a', encoding='utf-8') as f_test:
                f_test.write(line_str)
        else:
            with open(os.path.join(data_root_path, 'train_list.txt'), 'a', encoding='utf-8') as f_train:
                f_train.write(line_str)
    print("数据列表生成完成!")

def get_dict_len(dict_path):
    with open(dict_path, 'r', encoding='utf-8') as f:
        return len(eval(f.readlines()[0]).keys())

if __name__ == '__main__':
    data_root_path = "datasets/"
    data_path = os.path.join(data_root_path, 'news_classify_data.txt')
    dict_path = os.path.join(data_root_path, "dict_txt.txt")
    create_dict(data_path, dict_path)
    create_data_list(data_root_path)

Sample of the converted data

321,364,535,897,322,263,354,337,441,815,943 12
540,299,884,1092,671,938    13
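
To check that the conversion round-trips correctly, you can invert the dictionary and decode a line of the generated list back into characters. The sketch below is not one of the tutorial scripts; it only assumes the datasets/dict_txt.txt and datasets/train_list.txt files produced above, and the helper name decode_line is made up for this check.

def decode_line(line, id_to_char):
    # Turn 'id,id,...\tlabel' back into the original title and its label
    ids, label = line.rstrip('\n').split('\t')
    return ''.join(id_to_char[int(i)] for i in ids.split(',')), int(label)

# dict_txt.txt stores a Python dict literal of the form {character: id, ...}
with open('datasets/dict_txt.txt', 'r', encoding='utf-8') as f:
    char_to_id = eval(f.readlines()[0])
id_to_char = {i: c for c, i in char_to_id.items()}

with open('datasets/train_list.txt', 'r', encoding='utf-8') as f:
    print(decode_line(f.readline(), id_to_char))  # prints the original title and its numeric label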

Defining the Model

The text classification model is built from a single-layer bidirectional LSTM:

Create bilstm_net.py

import paddle.fluid as fluid

def bilstm_net(data, dict_dim, class_dim, emb_dim=128, hid_dim=128, hid_dim2=96, emb_lr=30.0):
    # Embedding layer; emb_lr scales the learning rate of the embedding table
    emb = fluid.layers.embedding(
        input=data, size=[dict_dim, emb_dim], 
        param_attr=fluid.ParamAttr(learning_rate=emb_lr)
    )

    # Forward and reverse LSTMs; the fc projects to 4 * hid_dim because dynamic_lstm expects the four LSTM gate inputs concatenated
    lstm_h, c = fluid.layers.dynamic_lstm(
        input=fluid.layers.fc(input=emb, size=hid_dim * 4), 
        size=hid_dim * 4, is_reverse=False
    )
    rlstm_h, c = fluid.layers.dynamic_lstm(
        input=fluid.layers.fc(input=emb, size=hid_dim * 4), 
        size=hid_dim * 4, is_reverse=True
    )

    # Take the last time step of each direction
    lstm_last = fluid.layers.sequence_last_step(input=lstm_h)
    rlstm_last = fluid.layers.sequence_last_step(input=rlstm_h)

    # Concatenate the outputs of the two directions
    lstm_concat = fluid.layers.concat(input=[lstm_last, rlstm_last], axis=1)

    # Fully connected layer and softmax output layer
    fc1 = fluid.layers.fc(input=lstm_concat, size=hid_dim2, act='tanh')
    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
    return prediction
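
If you want a quick check that the network builds, a minimal sketch like the one below constructs the program with a made-up dictionary size (dict_dim=4000 is only a placeholder, not a value from this chapter) and inspects the prediction variable; its last dimension equals class_dim.

import paddle.fluid as fluid
from bilstm_net import bilstm_net

# lod_level=1 marks the input as a variable-length sequence of word ids
words = fluid.layers.data(name='words', shape=[1], dtype='int64', lod_level=1)
prediction = bilstm_net(words, dict_dim=4000, class_dim=15)  # dict_dim=4000 is a placeholder
print(prediction.shape)  # the last dimension is 15, one probability per category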

Defining the Data Readers

Create text_reader.py to feed the training and test data:

from multiprocessing import cpu_count
import numpy as np
import paddle

def train_mapper(sample):
    # Convert the comma-separated id string and the label string into ints
    data, label = sample
    return [int(d) for d in data.split(',')], int(label)

def train_reader(train_list_path):
    def reader():
        with open(train_list_path, 'r') as f:
            lines = f.readlines()
            np.random.shuffle(lines)  # shuffle the training samples
            for line in lines:
                data, label = line.split('\t')
                yield data, label
    # xmap_readers applies train_mapper to the raw samples in parallel, with a buffer of 1024
    return paddle.reader.xmap_readers(train_mapper, reader, cpu_count(), 1024)

def test_mapper(sample):
    data, label = sample
    return [int(d) for d in data.split(',')], int(label)

def test_reader(test_list_path):
    def reader():
        with open(test_list_path, 'r') as f:
            for line in f.readlines():
                data, label = line.split('\t')
                yield data, label
    return paddle.reader.xmap_readers(test_mapper, reader, cpu_count(), 1024)
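
To see what the readers actually yield, the short sketch below (assuming datasets/train_list.txt has already been generated) batches the training reader the same way train.py will and prints the first sample, which is a list of word ids plus an integer label.

import paddle
from text_reader import train_reader

batched = paddle.batch(train_reader('datasets/train_list.txt'), batch_size=4)
first_batch = next(batched())     # a list of (word_ids, label) tuples
word_ids, label = first_batch[0]
print(len(word_ids), label)       # length of the first title and its category id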

Training the Model

Create train.py

import os
import shutil
import paddle
import paddle.fluid as fluid
from create_data import get_dict_len
from bilstm_net import bilstm_net
from text_reader import train_reader, test_reader

# Input layers: a variable-length sequence of word ids (lod_level=1) and an integer label
words = fluid.layers.data(name='words', shape=[1], dtype='int64', lod_level=1)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')

# Model configuration
dict_dim = get_dict_len('datasets/dict_txt.txt')
class_dim = 15  # number of news categories
model = bilstm_net(words, dict_dim, class_dim)

# Loss and accuracy; clone the program for evaluation before adding the optimizer ops
cost = fluid.layers.cross_entropy(input=model, label=label)
avg_cost = fluid.layers.mean(cost)
acc = fluid.layers.accuracy(input=model, label=label)
test_program = fluid.default_main_program().clone(for_test=True)

optimizer = fluid.optimizer.AdagradOptimizer(learning_rate=0.002)
opt = optimizer.minimize(avg_cost)

# Executor setup
place = fluid.CUDAPlace(0)  # use the GPU; switch to fluid.CPUPlace() on CPU-only machines
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

# Batch the training and test readers
train_reader = paddle.batch(train_reader('datasets/train_list.txt'), batch_size=128)
test_reader = paddle.batch(test_reader('datasets/test_list.txt'), batch_size=128)

# Training loop
for pass_id in range(10):
    for batch_id, data in enumerate(train_reader()):
        train_cost, train_acc = exe.run(
            program=fluid.default_main_program(),
            feed=fluid.DataFeeder(place=place, feed_list=[words, label]).feed(data),
            fetch_list=[avg_cost, acc]
        )
        if batch_id % 40 == 0:
            print(f"Pass:{pass_id}, Batch:{batch_id}, Cost:{train_cost[0]:.5f}, Acc:{train_acc[0]:.5f}")

            # Evaluate on the test set
            test_costs, test_accs = [], []
            for data in test_reader():
                test_cost, test_acc = exe.run(
                    program=test_program,
                    feed=fluid.DataFeeder(place=place, feed_list=[words, label]).feed(data),
                    fetch_list=[avg_cost, acc]
                )
                test_costs.append(test_cost[0])
                test_accs.append(test_acc[0])
            test_cost = sum(test_costs)/len(test_costs)
            test_acc = sum(test_accs)/len(test_accs)
            print(f"Test: Cost:{test_cost:.5f}, Acc:{test_acc:.5f}")

    # Save an inference model at the end of every pass
    save_path = 'infer_model/'
    shutil.rmtree(save_path, ignore_errors=True)
    os.makedirs(save_path)
    fluid.io.save_inference_model(
        save_path, feeded_var_names=[words.name], 
        target_vars=[model], executor=exe
    )

Sample training output

Pass:0
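
The preface also mentions prediction. A minimal sketch of inference, assuming only the infer_model/ directory and datasets/dict_txt.txt produced above (the sample headline is made up), loads the saved model, converts the headline into word ids with the same dictionary, wraps them in a LoD tensor and takes the argmax of the softmax output:

import numpy as np
import paddle.fluid as fluid

place = fluid.CPUPlace()
exe = fluid.Executor(place)

# Load the inference model saved by train.py
[infer_program, feed_names, fetch_targets] = fluid.io.load_inference_model('infer_model/', exe)

# Convert a headline into word ids with the dictionary used for training
with open('datasets/dict_txt.txt', 'r', encoding='utf-8') as f:
    dict_txt = eval(f.readlines()[0])
title = '谷歌发布新一代深度学习框架'  # a made-up headline for illustration
ids = [dict_txt.get(s, dict_txt['<unk>']) for s in title]

# One variable-length sequence of length len(ids), wrapped as a LoD tensor
tensor_words = fluid.create_lod_tensor([ids], [[len(ids)]], place)
result = exe.run(program=infer_program,
                 feed={feed_names[0]: tensor_words},
                 fetch_list=fetch_targets)
print('predicted category id:', np.argmax(result[0][0]))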