Chinese Text Classification with PaddlePaddle in Practice
Preface
In Chapter 5 we learned about recurrent neural networks (RNNs) and implemented a text classification model, but it used one of PaddlePaddle's built-in datasets. In this chapter we will learn how to train PaddlePaddle on our own Chinese text dataset, covering data crawling, preprocessing, model definition, training, and prediction.
GitHub repository: https://github.com/yeyupiaoling/LearnPaddle2/tree/master/note12
Crawling the Text Dataset
High-quality Chinese text classification datasets are scarce, so we will crawl our own data from the web. The following code crawls Chinese news headlines from Toutiao (今日头条):
Create the download_text_data.py file
import os
import random
import requests
import json
import time

# News categories: [label id, category name, Toutiao channel code]
news_classify = [
    [0, '民生', 'news_story'],
    [1, '文化', 'news_culture'],
    [2, '娱乐', 'news_entertainment'],
    [3, '体育', 'news_sports'],
    [4, '财经', 'news_finance'],
    [5, '房产', 'news_house'],
    [6, '汽车', 'news_car'],
    [7, '教育', 'news_edu'],
    [8, '科技', 'news_tech'],
    [9, '军事', 'news_military'],
    [10, '旅游', 'news_travel'],
    [11, '国际', 'news_world'],
    [12, '证券', 'stock'],
    [13, '农业', 'news_agriculture'],
    [14, '游戏', 'news_game']
]

# IDs and count of news titles downloaded so far
downloaded_data_id = []
downloaded_sum = 0


def get_data(tup, data_path):
    global downloaded_data_id
    global downloaded_sum
    print('============%s============' % tup[1])
    url = "http://it.snssdk.com/api/news/feed/v63/"
    # Randomize max_behot_time so each request returns a different page of articles
    t = int(time.time() / 10000)
    t = random.randint(6 * t, 10 * t)
    querystring = {
        "category": tup[2], "max_behot_time": t,
        "last_refresh_sub_entrance_interval": "1524907088",
        "loc_mode": "5", "tt_from": "pre_load_more",
        "cp": "51a5ee4f38c50q1", "plugin_enable": "0",
        "iid": "31047425023", "device_id": "51425358841",
        "ac": "wifi", "channel": "tengxun", "aid": "13",
        "app_name": "news_article", "version_code": "631",
        "version_name": "6.3.1", "device_platform": "android",
        "ab_version": "333116,297979,317498,336556,295827,325046,239097,324283,170988,335432,332098,325198,336443,330632,297058,276203,286212,313219,328615,332041,329358,322321,327537,335710,333883,335102,334828,328670,324007,317077,334305,280773,335671,319960,333985,331719,336452,214069,31643,332881,333968,318434,207253,266310,321519,247847,281298,328218,335998,325618,333327,336199,323429,287591,288418,260650,326188,324614,335477,271178,326588,326524,326532",
        "ab_client": "a1,c4,e1,f2,g2,f7", "ab_feature": "94563,102749",
        "abflag": "3", "ssmix": "a", "device_type": "MuMu",
        "device_brand": "Android", "language": "zh", "os_api": "19",
        "os_version": "4.4.4", "uuid": "008796762094657",
        "openudid": "b7215ea70ca32066", "manifest_version_code": "631",
        "resolution": "1280*720", "dpi": "240",
        "update_version_code": "6310", "_rticket": "1524907088018",
        "plugin": "256"
    }
    headers = {
        'cache-control': "no-cache",
        'postman-token': "26530547-e697-1e8b-fd82-7c6014b3ee86",
        "User-Agent": "Dalvik/1.6.0 (Linux; U; Android 4.4.4; MuMu Build/V417IR) NewsArticle/6.3.1 okhttp/3.7.0.2"
    }
    response = requests.request("GET", url, headers=headers, params=querystring)
    new_data = json.loads(response.text)
    with open(data_path, 'a', encoding='utf-8') as fp:
        for item in new_data['data']:
            item = item['content']
            item = item.replace('\"', '"')
            item = json.loads(item)
            if 'item_id' in item and 'title' in item:
                item_id = item['item_id']
                # Skip titles that have already been saved
                if item_id not in downloaded_data_id:
                    downloaded_data_id.append(item_id)
                    # One record per line: id_!_label_!_category_!_title
                    line = u"{}_!_{}_!_{}_!_{}".format(
                        item['item_id'], tup[0], tup[1], item['title']
                    ).replace('\n', '').replace('\r', '') + '\n'
                    fp.write(line)
                    downloaded_sum += 1
                    print(downloaded_sum, tup[0], tup[1], item['item_id'], item['title'])


def get_routine(data_path):
    global downloaded_sum
    # Resume from an existing data file if there is one
    if os.path.exists(data_path):
        with open(data_path, 'r', encoding='utf-8') as fp:
            lines = fp.readlines()
            downloaded_sum = len(lines)
            for line in lines:
                item_id = int(line.split('_!_')[0])
                downloaded_data_id.append(item_id)
        print('已读取%d条数据' % downloaded_sum)
    else:
        os.makedirs(os.path.dirname(data_path), exist_ok=True)
    # Keep polling every category until enough titles have been collected
    while True:
        time.sleep(10)
        for classify in news_classify:
            get_data(classify, data_path)
        if downloaded_sum >= 300000:
            break


if __name__ == '__main__':
    data_path = 'datasets/news_classify_data.txt'
    get_routine(data_path)
Sample crawler output
============文化============
17 1 文化 6646565189942510093 世界第一豪宅,坐落于北京...
18 1 文化 6658382232383652104 俗语讲:“男怕初一,女怕十五”...
...
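Each record is stored on a single line in the form `id_!_label_!_category_!_title`. As a quick sanity check, here is a minimal sketch for inspecting a few records, assuming the crawler above has already written `datasets/news_classify_data.txt`:
```python
# Minimal sketch: print the first few crawled records.
# Assumes datasets/news_classify_data.txt was produced by download_text_data.py.
with open('datasets/news_classify_data.txt', 'r', encoding='utf-8') as f:
    for line in list(f)[:3]:
        item_id, label, category, title = line.rstrip('\n').split('_!_')
        print(item_id, label, category, title)
```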
Preparing the Training Data
The crawled text must be converted into the integer format that PaddlePaddle can consume. This involves building a character dictionary and generating the train/test data lists.
Create the create_data.py file
import os


def create_dict(data_path, dict_path):
    # Collect every distinct character that appears in a title
    dict_set = set()
    with open(data_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    for line in lines:
        title = line.split('_!_')[-1].replace('\n', '')
        for s in title:
            dict_set.add(s)
    # Map each character to a unique integer id
    dict_list = [[s, i] for i, s in enumerate(dict_set)]
    dict_txt = dict(dict_list)
    dict_txt["<unk>"] = len(dict_txt)  # id reserved for unknown characters
    with open(dict_path, 'w', encoding='utf-8') as f:
        f.write(str(dict_txt))
    print("数据字典生成完成!")


def create_data_list(data_root_path):
    # Convert each title to comma-separated character ids and split into train/test lists
    with open(os.path.join(data_root_path, 'dict_txt.txt'), 'r', encoding='utf-8') as f:
        dict_txt = eval(f.readlines()[0])
    with open(os.path.join(data_root_path, 'news_classify_data.txt'), 'r', encoding='utf-8') as f:
        lines = f.readlines()
    for i, line in enumerate(lines):
        title = line.split('_!_')[-1].replace('\n', '')
        label = line.split('_!_')[1]
        # Characters missing from the dictionary fall back to <unk>
        words = [str(dict_txt[s] if s in dict_txt else dict_txt["<unk>"]) for s in title]
        line_str = ','.join(words) + '\t' + label + '\n'
        # Every 10th sample goes to the test list, the rest to the training list
        if i % 10 == 0:
            with open(os.path.join(data_root_path, 'test_list.txt'), 'a', encoding='utf-8') as f_test:
                f_test.write(line_str)
        else:
            with open(os.path.join(data_root_path, 'train_list.txt'), 'a', encoding='utf-8') as f_train:
                f_train.write(line_str)
    print("数据列表生成完成!")


def get_dict_len(dict_path):
    # Return the vocabulary size, including <unk>
    with open(dict_path, 'r', encoding='utf-8') as f:
        return len(eval(f.readlines()[0]).keys())


if __name__ == '__main__':
    data_root_path = "datasets/"
    data_path = os.path.join(data_root_path, 'news_classify_data.txt')
    dict_path = os.path.join(data_root_path, "dict_txt.txt")
    # Build the dictionary first, then the train/test data lists
    create_dict(data_path, dict_path)
    create_data_list(data_root_path)
Sample converted data
321,364,535,897,322,263,354,337,441,815,943 12
540,299,884,1092,671,938 13
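To double-check the conversion, the ids of a converted line can be mapped back to characters with the dictionary. A minimal sketch, assuming `datasets/dict_txt.txt` and `datasets/train_list.txt` were generated by the script above:
```python
# Minimal sketch: decode one converted sample back into text.
with open('datasets/dict_txt.txt', 'r', encoding='utf-8') as f:
    dict_txt = eval(f.readlines()[0])
# Invert the character -> id mapping
id_to_char = {i: s for s, i in dict_txt.items()}

with open('datasets/train_list.txt', 'r', encoding='utf-8') as f:
    ids, label = f.readline().split('\t')
    title = ''.join(id_to_char[int(i)] for i in ids.split(','))
    print(label.strip(), title)
```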
Defining the Model
The text classifier is built from a single-layer bidirectional LSTM:
Create the bilstm_net.py file
import paddle.fluid as fluid


def bilstm_net(data, dict_dim, class_dim, emb_dim=128, hid_dim=128, hid_dim2=96, emb_lr=30.0):
    # Embedding layer: map character ids to dense vectors
    emb = fluid.layers.embedding(
        input=data, size=[dict_dim, emb_dim],
        param_attr=fluid.ParamAttr(learning_rate=emb_lr)
    )
    # Bi-LSTM: one forward and one reverse dynamic_lstm
    lstm_h, c = fluid.layers.dynamic_lstm(
        input=fluid.layers.fc(input=emb, size=hid_dim * 4),
        size=hid_dim * 4, is_reverse=False
    )
    rlstm_h, c = fluid.layers.dynamic_lstm(
        input=fluid.layers.fc(input=emb, size=hid_dim * 4),
        size=hid_dim * 4, is_reverse=True
    )
    # Take the last step of each direction
    lstm_last = fluid.layers.sequence_last_step(input=lstm_h)
    rlstm_last = fluid.layers.sequence_last_step(input=rlstm_h)
    # Concatenate the two directions
    lstm_concat = fluid.layers.concat(input=[lstm_last, rlstm_last], axis=1)
    # Fully connected layer and softmax output
    fc1 = fluid.layers.fc(input=lstm_concat, size=hid_dim2, act='tanh')
    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
    return prediction
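Note that the fully connected layers feeding `dynamic_lstm` have size `hid_dim * 4` because `dynamic_lstm` expects its input width (and its `size` argument) to cover the four LSTM gates. Below is a minimal sketch of building the network on its own; the vocabulary size of 5000 is only a placeholder here, in train.py it comes from `get_dict_len()`:
```python
# Minimal sketch: build the classification program with bilstm_net.
# dict_dim=5000 is a placeholder assumption, not the real vocabulary size.
import paddle.fluid as fluid
from bilstm_net import bilstm_net

words = fluid.layers.data(name='words', shape=[1], dtype='int64', lod_level=1)
prediction = bilstm_net(words, dict_dim=5000, class_dim=15)
print(prediction.shape)  # (-1, 15): one softmax probability per category
```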
Defining the Data Readers
Create the text_reader.py file to feed the training and test data:
from multiprocessing import cpu_count
import numpy as np
import paddle


def train_mapper(sample):
    # Convert the comma-separated id string and the label string to integers
    data, label = sample
    return [int(d) for d in data.split(',')], int(label)


def train_reader(train_list_path):
    def reader():
        with open(train_list_path, 'r') as f:
            lines = f.readlines()
            # Shuffle the training samples
            np.random.shuffle(lines)
            for line in lines:
                data, label = line.split('\t')
                yield data, label
    # Decode samples in parallel worker threads
    return paddle.reader.xmap_readers(train_mapper, reader, cpu_count(), 1024)


def test_mapper(sample):
    data, label = sample
    return [int(d) for d in data.split(',')], int(label)


def test_reader(test_list_path):
    def reader():
        with open(test_list_path, 'r') as f:
            for line in f.readlines():
                data, label = line.split('\t')
                yield data, label
    return paddle.reader.xmap_readers(test_mapper, reader, cpu_count(), 1024)
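The readers yield one sample at a time; `paddle.batch` then groups them into mini-batches, as train.py does below. A quick sanity check, assuming `datasets/train_list.txt` already exists:
```python
# Minimal sketch: peek at one mini-batch produced by the training reader.
import paddle
from text_reader import train_reader

batched = paddle.batch(train_reader('datasets/train_list.txt'), batch_size=4)
first_batch = next(batched())
for ids, label in first_batch:
    print(label, ids[:10])  # label and the first 10 character ids of the title
```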
Training the Model
Create the train.py file
import os
import shutil
import paddle
import paddle.fluid as fluid
from create_data import get_dict_len
from bilstm_net import bilstm_net
from text_reader import train_reader, test_reader

# Input layers: a variable-length sequence of character ids and an integer label
words = fluid.layers.data(name='words', shape=[1], dtype='int64', lod_level=1)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')

# Model parameters: vocabulary size comes from the dictionary
dict_dim = get_dict_len('datasets/dict_txt.txt')
class_dim = 15  # 15 news categories
model = bilstm_net(words, dict_dim, class_dim)

# Loss, accuracy and optimizer
cost = fluid.layers.cross_entropy(input=model, label=label)
avg_cost = fluid.layers.mean(cost)
acc = fluid.layers.accuracy(input=model, label=label)
# Clone the program for evaluation before the optimizer adds its ops
test_program = fluid.default_main_program().clone(for_test=True)
optimizer = fluid.optimizer.AdagradOptimizer(learning_rate=0.002)
opt = optimizer.minimize(avg_cost)

# Executor setup (use GPU)
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

# Batched readers and data feeder
train_reader = paddle.batch(train_reader('datasets/train_list.txt'), batch_size=128)
test_reader = paddle.batch(test_reader('datasets/test_list.txt'), batch_size=128)
feeder = fluid.DataFeeder(place=place, feed_list=[words, label])

# Training loop
for pass_id in range(10):
    for batch_id, data in enumerate(train_reader()):
        train_cost, train_acc = exe.run(
            program=fluid.default_main_program(),
            feed=feeder.feed(data),
            fetch_list=[avg_cost, acc]
        )
        if batch_id % 40 == 0:
            print(f"Pass:{pass_id}, Batch:{batch_id}, Cost:{train_cost[0]:.5f}, Acc:{train_acc[0]:.5f}")
    # Evaluate on the test set after every pass
    test_costs, test_accs = [], []
    for data in test_reader():
        test_cost, test_acc = exe.run(
            program=test_program,
            feed=feeder.feed(data),
            fetch_list=[avg_cost, acc]
        )
        test_costs.append(test_cost[0])
        test_accs.append(test_acc[0])
    test_cost = sum(test_costs) / len(test_costs)
    test_acc = sum(test_accs) / len(test_accs)
    print(f"Test: Cost:{test_cost:.5f}, Acc:{test_acc:.5f}")

# Save the inference model
save_path = 'infer_model/'
shutil.rmtree(save_path, ignore_errors=True)
os.makedirs(save_path)
fluid.io.save_inference_model(
    save_path, feeded_var_names=[words.name],
    target_vars=[model], executor=exe
)
Sample training output
```
Pass:0