End-to-End Captcha Recognition (with PaddlePaddle)¶
Preface¶
The previous article covered the traditional captcha recognition pipeline (image segmentation followed by per-character classification), which is cumbersome. This article switches to end-to-end recognition: no segmentation step is needed, and the model is trained to output the character sequence directly.
Dataset Preparation¶
Dataset: captchas from the 方正 system (see the previous article).
Steps:
1. Convert images to grayscale: converting the color captchas to grayscale reduces training complexity.
2. Generate train/test lists: create mapping files of image file names and labels in the required format.
1. Grayscale Conversion¶
# coding=utf-8
import os
from PIL import Image


def Image2GRAY(path):
    # data/train_data and data/test_data must already exist
    imgs = os.listdir(path)
    i = 0
    for img in imgs:
        # Every 10th image goes to the test set, the rest to the training set
        if i % 10 == 0:
            im = Image.open(path + '/' + img).convert('L')  # convert to grayscale
            im.save('data/test_data/' + img)
        else:
            im = Image.open(path + '/' + img).convert('L')
            im.save('data/train_data/' + img)
        i += 1


if __name__ == '__main__':
    Image2GRAY('data/data_temp')  # path of the raw (temporary) data
2. Generate Image Lists¶
Format: image filename\tlabel (e.g. 4uqh.png\t4uqh).
# coding=utf-8
import os


class CreateDataList:
    def createDataList(self, data_path, isTrain):
        # Training data gets trainer.list, test data gets test.list
        list_name = 'trainer.list' if isTrain else 'test.list'
        list_path = os.path.join(data_path, list_name)
        if os.path.exists(list_path):
            os.remove(list_path)
        imgs = os.listdir(data_path)
        with open(list_path, 'a') as f:
            for img in imgs:
                name = img.split('.')[0]  # the label is the file name without its extension
                f.write(img + '\t' + name + '\n')  # image file name + label


if __name__ == '__main__':
    createDataList = CreateDataList()
    createDataList.createDataList('data/train_data/', True)
    createDataList.createDataList('data/test_data/', False)
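After running the script, data/train_data/trainer.list holds one "filename<TAB>label" line per image, for example (file names here are illustrative):

4uqh.png	4uqh
8a3k.png	8a3k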
3. Data Reading and Preprocessing¶
3.1 Read the Image List¶
def get_file_list(image_file_list):
    dirname = os.path.dirname(image_file_list)
    path_list = []
    with open(image_file_list) as f:
        for line in f:
            line_split = line.strip().split('\t')
            filename = line_split[0].strip()
            label = line_split[1].strip()
            path = os.path.join(dirname, filename)
            path_list.append((path, label))
    return path_list
3.2 Build the Label Dictionary¶
Count the character frequencies in the training set and build a character → index mapping:
from collections import defaultdict


def build_label_dict(file_list, save_path):
    values = defaultdict(int)
    for path, label in file_list:
        for c in label:
            values[c] += 1
    values['<unk>'] = 0  # placeholder for unknown characters
    with open(save_path, 'w') as f:
        # One "char\tcount" line per character, sorted by frequency
        for v, count in sorted(values.items(), key=lambda x: x[1], reverse=True):
            f.write(f"{v}\t{count}\n")


def load_dict(dict_path):
    # Map each character to its line index in the dictionary file
    return {line.strip().split('\t')[0]: idx
            for idx, line in enumerate(open(dict_path))}
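A minimal usage sketch tying these pieces together (the dictionary path data/label_dict.txt is an assumption; any path works). This also produces the char_dict and dict_size used in the training code below:

train_list = get_file_list('data/train_data/trainer.list')
build_label_dict(train_list, 'data/label_dict.txt')    # assumed path
char_dict = load_dict('data/label_dict.txt')           # e.g. {'4': 0, 'u': 1, ...}
dict_size = len(char_dict)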
3.3 Build the Data Reader¶
Convert each image to a grayscale vector and load its label:
import cv2
import paddle.v2 as paddle


class Reader(object):
    def __init__(self, char_dict, image_shape):
        self.char_dict = char_dict
        self.image_shape = image_shape

    def train_reader(self, file_list):
        def reader():
            UNK_ID = self.char_dict['<unk>']
            for image_path, label in file_list:
                # Map each label character to its dictionary index
                label = [self.char_dict.get(c, UNK_ID) for c in label]
                image = cv2.imread(image_path, 0)  # read as grayscale
                image = cv2.resize(image, self.image_shape,
                                   interpolation=cv2.INTER_CUBIC)
                image = image.flatten() / 255.  # flatten and normalize to [0, 1]
                yield image, label
        return reader
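A quick sanity check of the reader output (a sketch assuming the trainer.list and char_dict produced above, and the 72x27 image shape that section 5 uses):

IMAGE_SHAPE = (72, 27)   # width x height, defined again in section 5
file_list = get_file_list('data/train_data/trainer.list')
reader_fn = Reader(char_dict, IMAGE_SHAPE).train_reader(file_list)
image, label = next(reader_fn())
print(image.shape, label)   # (1944,) for a 72x27 image, plus the list of label indices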
4. Define the Network (CNN + RNN + CTC)¶
4.1 Model Structure¶
- CNN: extracts image features (4 convolutional blocks).
- RNN: a bidirectional GRU captures the sequence information.
- CTC: handles variable-length labels and outputs the character sequence directly.
class Model(object):
    def __init__(self, num_classes, shape, is_infer=False):
        self.num_classes = num_classes
        self.shape = shape
        self.is_infer = is_infer
        self.image_vector_size = shape[0] * shape[1]
        self.__declare_input_layers__()
        self.__build_nn__()

    def __declare_input_layers__(self):
        self.image = paddle.layer.data(
            name='image',
            type=paddle.data_type.dense_vector(self.image_vector_size),
            height=self.shape[1], width=self.shape[0])
        # The label layer is only needed for training
        if not self.is_infer:
            self.label = paddle.layer.data(
                name='label',
                type=paddle.data_type.integer_value_sequence(self.num_classes))

    def __build_nn__(self):
        # CNN feature extraction
        def conv_block(ipt, num_filter, groups, num_channels=1):
            return paddle.networks.img_conv_group(
                input=ipt, num_channels=num_channels,
                conv_filter_size=3, conv_act=paddle.activation.Relu(),
                pool_size=2, pool_stride=2,
                conv_num_filter=[num_filter] * groups, conv_with_batchnorm=True)

        conv1 = conv_block(self.image, 16, 2)
        conv2 = conv_block(conv1, 32, 2)
        conv3 = conv_block(conv2, 64, 2)
        conv4 = conv_block(conv3, 128, 2)

        # Expand the CNN feature map into a sequence and feed a bidirectional GRU
        sliced_feature = paddle.layer.block_expand(conv4, num_channels=128, stride_x=1, stride_y=1)
        gru_forward = paddle.networks.simple_gru(sliced_feature, size=128)
        gru_backward = paddle.networks.simple_gru(sliced_feature, size=128, reverse=True)

        # CTC output layer: num_classes + 1 outputs, the extra class is the CTC blank
        self.output = paddle.layer.fc([gru_forward, gru_backward],
                                      size=self.num_classes + 1,
                                      act=paddle.activation.Linear())
        self.log_probs = paddle.layer.mixed(
            input=self.output, act=paddle.activation.Softmax())
        if not self.is_infer:
            self.cost = paddle.layer.warp_ctc(
                input=self.output, label=self.label,
                size=self.num_classes + 1, blank=self.num_classes)
            self.eval = paddle.evaluator.ctc_error(input=self.output, label=self.label)
5. Train the Model¶
5.1 Initialization and Trainer¶
# Hyperparameters
IMAGE_SHAPE = (72, 27)  # image width x height
BATCH_SIZE = 10
dict_size = len(char_dict)  # size of the label dictionary (char_dict from load_dict in 3.2)

# Initialize PaddlePaddle
paddle.init(use_gpu=True, trainer_count=1)

# Model and trainer
model = Model(dict_size, IMAGE_SHAPE, is_infer=False)
optimizer = paddle.optimizer.Momentum(momentum=0)
params = paddle.parameters.create(model.cost)
trainer = paddle.trainer.SGD(
    cost=model.cost,
    parameters=params,
    update_equation=optimizer,
    extra_layers=model.eval)
5.2 Run the Training¶
def train():
    train_file_list = get_file_list('data/train_data/trainer.list')
    test_file_list = get_file_list('data/test_data/test.list')
    model_reader = Reader(char_dict, IMAGE_SHAPE)

    def event_handler(event):
        # Log the cost at the end of each batch
        if isinstance(event, paddle.event.EndIteration):
            print("Pass %d, Batch %d, Cost %f" % (event.pass_id, event.batch_id, event.cost))

    trainer.train(
        reader=paddle.batch(model_reader.train_reader(train_file_list),
                            batch_size=BATCH_SIZE),
        feeding={'image': 0, 'label': 1},  # map reader fields to the data layers
        event_handler=event_handler,
        num_passes=100)
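To also keep a checkpoint after every pass, the event handler can be extended to serialize the parameters. A sketch following the usual paddle.v2 pattern (the .tar.gz file name is an assumption; the infer() code below expects such an archive):

import gzip

def event_handler(event):
    if isinstance(event, paddle.event.EndIteration):
        if event.batch_id % 100 == 0:
            print("Pass %d, Batch %d, Cost %f" % (event.pass_id, event.batch_id, event.cost))
    if isinstance(event, paddle.event.EndPass):
        # Save the parameters after each pass; infer() loads this archive later
        with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f:
            trainer.save_parameter_to_tar(f)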
6. Prediction¶
6.1 Load the Model and Parameters¶
import gzip


def infer(img_path, model_path, image_shape, label_dict_path):
    char_dict = load_dict(label_dict_path)
    paddle.init(use_gpu=True, trainer_count=1)
    parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path))
    model = Model(len(char_dict), image_shape, is_infer=True)
    inferer = paddle.inference.Inference(output_layer=model.log_probs,
                                         parameters=parameters)
    return start_infer(inferer, img_path, char_dict)
6.2 CTC Decoding (Best Path)¶
import numpy as np
from itertools import groupby


def ctc_greedy_decoder(probs_seq, vocabulary):
    # Pick the most probable class at every time step
    max_index_list = np.array(probs_seq).argmax(axis=1)
    # Collapse repeated indices, then drop the blank (index == len(vocabulary))
    index_list = [idx for idx, _ in groupby(max_index_list) if idx != len(vocabulary)]
    return ''.join([vocabulary[idx] for idx in index_list])
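start_infer is called in infer() above but is not shown in the original. A minimal sketch, assuming it mirrors the Reader preprocessing and uses the module-level IMAGE_SHAPE from section 5:

def start_infer(inferer, img_path, char_dict):
    # Index -> character table for decoding, ordered by the dictionary indices
    vocabulary = [c for c, _ in sorted(char_dict.items(), key=lambda x: x[1])]
    # Same preprocessing as Reader: grayscale, resize, flatten, normalize
    image = cv2.imread(img_path, 0)
    image = cv2.resize(image, IMAGE_SHAPE, interpolation=cv2.INTER_CUBIC)
    image = image.flatten() / 255.
    # One row of softmax probabilities per sequence step
    probs = inferer.infer(input=[(image,)])
    return ctc_greedy_decoder(probs, vocabulary)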
Note: make sure paddlepaddle-gpu and the warp-ctc library are installed in the environment (the warp_ctc layer only supports GPU training). After training completes, the model can be used directly to recognize new captchas.