验证码端到端识别(基于PaddlePaddle)

前言

上一篇文章介绍了传统验证码识别方法(需图像分割+分类),操作繁琐。本文将采用端到端识别,无需分割步骤,直接训练模型输出结果。

数据集准备

数据集:使用方正系统验证码(见上一篇文章)。
步骤
1. 图像灰度化:将彩色验证码转为灰度图,减少训练复杂度。
2. 生成训练/测试列表:按格式生成图像路径与标签的映射文件。

1. 图像灰度化

# coding=utf-8
import os
from PIL import Image

def Image2GRAY(path):
    """Convert every image in ``path`` to grayscale and split train/test.

    Files are taken in ``os.listdir`` order. Every tenth file (indices
    0, 10, 20, ...) is saved to ``data/test_data/``; all others are saved
    to ``data/train_data/``. File names are kept unchanged.

    Args:
        path: Directory containing the raw captcha images.
    """
    test_dir = 'data/test_data/'
    train_dir = 'data/train_data/'

    # Make sure both output directories exist so that saving does not fail
    # on a fresh checkout.
    for out_dir in (test_dir, train_dir):
        os.makedirs(out_dir, exist_ok=True)

    for i, img in enumerate(os.listdir(path)):
        # Roughly a 1:9 test/train split.
        out_dir = test_dir if i % 10 == 0 else train_dir
        im = Image.open(os.path.join(path, img)).convert('L')  # grayscale
        im.save(out_dir + img)

if __name__ == '__main__':
    # Directory holding the temporary (raw) captcha data.
    temp_data_dir = 'data/data_temp'
    Image2GRAY(temp_data_dir)

2. 生成图像列表

格式:图像文件名\t标签(如 4uqh.png\t4uqh)。

# coding=utf-8
import os

class CreateDataList:
    """Generate the image list files consumed by the data reader."""

    # Names of the list files this class writes; they live next to the
    # images and must never be listed as images themselves.
    _LIST_NAMES = ('trainer.list', 'test.list')

    def createDataList(self, data_path, isTrain):
        """Write a list file mapping each image in ``data_path`` to its label.

        Each line has the form ``<file name><TAB><label>``, where the label
        is the part of the file name before the first dot. Entries are
        written in sorted file-name order so the output is reproducible.
        Any existing list file is overwritten; an empty directory yields an
        empty list file.

        Args:
            data_path: Directory containing the images. The list file is
                created inside this directory.
            isTrain: If true, write ``trainer.list``; otherwise ``test.list``.
        """
        list_name = 'trainer.list' if isTrain else 'test.list'
        list_path = os.path.join(data_path, list_name)

        # Take the listing before opening the output file, and drop list
        # files left over from earlier runs.
        imgs = sorted(
            img for img in os.listdir(data_path)
            if img not in self._LIST_NAMES
        )

        # Open once in write mode instead of re-opening in append mode for
        # every image.
        with open(list_path, 'w') as f:
            for img in imgs:
                name = img.split('.')[0]
                f.write(img + '\t' + name + '\n')  # file name + label

if __name__ == '__main__':
    list_builder = CreateDataList()
    # (directory, isTrain) pairs: the training list first, then the test list.
    for directory, is_train in (('data/train_data/', True),
                                ('data/test_data/', False)):
        list_builder.createDataList(directory, is_train)

3. 数据读取与预处理

3.1 读取图像列表

def get_file_list(image_file_list):
    """Read an image list file into ``(image path, label)`` pairs.

    Each non-blank line must have the form ``<file name><TAB><label>``.
    File names are resolved relative to the directory that contains the
    list file. Blank lines are skipped.

    Args:
        image_file_list: Path to the list file.

    Returns:
        A list of ``(path, label)`` tuples in file order.

    Raises:
        IndexError: If a non-blank line has no tab-separated label.
    """
    dirname = os.path.dirname(image_file_list)
    path_list = []
    with open(image_file_list) as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line (e.g. a trailing newline) carries no entry.
                continue
            line_split = line.split('\t')
            filename = line_split[0].strip()
            label = line_split[1].strip()
            path_list.append((os.path.join(dirname, filename), label))
    return path_list

3.2 生成标签字典

统计训练集中字符频率,生成字符→索引映射:

from collections import defaultdict

def build_label_dict(file_list, save_path):
    """Count label characters and write them to ``save_path``.

    One ``<character><TAB><count>`` line is written per character, from the
    most to the least frequent. Characters with equal counts keep the order
    in which they were first seen. A ``<unk>`` entry with count 0 is always
    included.

    Args:
        file_list: Iterable of ``(path, label)`` pairs; only the label is used.
        save_path: Destination file for the dictionary.
    """
    char_counts = defaultdict(int)
    for _, text in file_list:
        for ch in text:
            char_counts[ch] = char_counts[ch] + 1
    char_counts['<unk>'] = 0  # placeholder for unknown characters

    # Sorting ascending on the negated count gives descending counts, and the
    # stable sort leaves ties in insertion order.
    ranked = sorted(char_counts.items(), key=lambda item: -item[1])
    with open(save_path, 'w') as out:
        out.writelines(f"{ch}\t{n}\n" for ch, n in ranked)

def load_dict(dict_path):
    """Load a label dictionary file into a ``{character: index}`` mapping.

    The index of a character is the zero-based number of the line it
    appears on. Only the first tab-separated field of each line is used.

    Args:
        dict_path: Path to the dictionary file.

    Returns:
        A dict mapping each character to its line index.
    """
    # The context manager closes the file; the original left it open.
    with open(dict_path) as f:
        return {line.strip().split('\t')[0]: idx
                for idx, line in enumerate(f)}

3.3 构建数据读取器

将图像转为灰度向量并加载标签:

import cv2
import paddle.v2 as paddle

class Reader(object):
    """Build data readers that yield ``(image vector, label ids)`` samples.

    Args:
        char_dict: Mapping from character to integer id. It must contain
            the key ``'<unk>'``, whose id is used for unknown characters.
        image_shape: Target size passed to ``cv2.resize`` as ``dsize``,
            i.e. ``(width, height)``.
    """

    def __init__(self, char_dict, image_shape):
        self.char_dict = char_dict
        self.image_shape = image_shape

    def train_reader(self, file_list):
        """Return a reader (generator function) over ``file_list``.

        Args:
            file_list: Iterable of ``(image_path, label)`` pairs.

        Returns:
            A zero-argument callable. Iterating its result yields, for each
            entry, the resized grayscale image flattened and scaled by
            1/255, together with the list of character ids for the label.

        Raises:
            IOError: From the generator, if an image cannot be read.
        """
        def reader():
            UNK_ID = self.char_dict['<unk>']
            for image_path, label in file_list:
                label = [self.char_dict.get(c, UNK_ID) for c in label]
                image = cv2.imread(image_path, 0)  # flag 0: read as grayscale
                if image is None:
                    # cv2.imread returns None for a missing or unreadable
                    # file; fail here with the path rather than in resize.
                    raise IOError(f"cannot read image: {image_path}")
                image = cv2.resize(image, self.image_shape, interpolation=cv2.INTER_CUBIC)
                image = image.flatten() / 255.  # normalise
                yield image, label
        return reader

4. 定义网络模型(CNN+RNN+CTC)

4.1 模型结构

  • CNN:提取图像特征(4个卷积块,卷积核数量依次为16、32、64、128)。
  • RNN:双向GRU捕获序列信息。
  • CTC:处理可变长度标签,直接输出字符序列。
class Model(object):
    """CNN + bidirectional GRU + CTC network for end-to-end recognition.

    Args:
        num_classes: Number of label classes. The network emits
            ``num_classes + 1`` scores per step; index ``num_classes`` is
            the CTC blank.
        shape: Input image size as ``(width, height)``.
        is_infer: If true, the label input, the CTC cost and the error
            evaluator are not created.

    Attributes set during construction: ``image``, ``output``,
    ``log_probs`` and, when not inferring, ``label``, ``cost``, ``eval``.
    """

    def __init__(self, num_classes, shape, is_infer=False):
        self.num_classes = num_classes
        self.shape = shape
        # Stored on the instance because both builder methods branch on it;
        # the original read a bare `is_infer` there and raised NameError.
        self.is_infer = is_infer
        self.image_vector_size = shape[0] * shape[1]
        self.__declare_input_layers__()
        self.__build_nn__()

    def __declare_input_layers__(self):
        """Declare the image input and, when training, the label input."""
        self.image = paddle.layer.data(
            name='image',
            type=paddle.data_type.dense_vector(self.image_vector_size),
            height=self.shape[1], width=self.shape[0])
        if not self.is_infer:
            self.label = paddle.layer.data(
                name='label',
                type=paddle.data_type.integer_value_sequence(self.num_classes))

    def __build_nn__(self):
        """Build the feature extractor, the recurrent layers and the CTC head."""
        # CNN feature extraction
        def conv_block(ipt, num_filter, groups, num_channels=None):
            # `groups` stacked 3x3 conv layers with batch norm, followed by
            # a 2x2 pooling with stride 2. With num_channels=None the input
            # channel count is taken from the input layer.
            return paddle.networks.img_conv_group(
                input=ipt, num_channels=num_channels,
                conv_filter_size=3, conv_act=paddle.activation.Relu(),
                pool_size=2, pool_stride=2,
                conv_num_filter=[num_filter]*groups, conv_with_batchnorm=True)

        # Only the first block sees the single-channel grayscale image; the
        # later blocks receive 16/32/64 channels, so they must not be told 1.
        conv1 = conv_block(self.image, 16, 2, num_channels=1)
        conv2 = conv_block(conv1, 32, 2)
        conv3 = conv_block(conv2, 64, 2)
        conv4 = conv_block(conv3, 128, 2)

        # Expand the feature map into a sequence and run a bidirectional GRU.
        # NOTE(review): block_x/block_y are not passed here — confirm the
        # library defaults give the intended one-column-per-step slicing.
        sliced_feature = paddle.layer.block_expand(conv4, num_channels=128, stride_x=1, stride_y=1)
        gru_forward = paddle.networks.simple_gru(sliced_feature, size=128)
        gru_backward = paddle.networks.simple_gru(sliced_feature, size=128, reverse=True)

        # CTC output layer: one linear score per class plus the blank.
        self.output = paddle.layer.fc([gru_forward, gru_backward], size=self.num_classes+1, act=paddle.activation.Linear())
        # A mixed layer takes projections as input, so the fc output is
        # wrapped in an identity projection before the softmax.
        self.log_probs = paddle.layer.mixed(
            input=paddle.layer.identity_projection(input=self.output),
            act=paddle.activation.Softmax())

        if not self.is_infer:
            # warp_ctc consumes the un-normalised scores; the blank is the
            # last index.
            self.cost = paddle.layer.warp_ctc(
                input=self.output, label=self.label, size=self.num_classes+1, blank=self.num_classes)
            self.eval = paddle.evaluator.ctc_error(input=self.output, label=self.label)

5. 训练模型

5.1 初始化与训练器

# Hyper-parameters
IMAGE_SHAPE = (72, 27)  # image width x height
BATCH_SIZE = 10
# NOTE(review): char_dict is not defined in the visible part of the file —
# presumably the mapping returned by load_dict(); confirm it exists before
# this line runs.
dict_size = len(char_dict)  # size of the label dictionary

# Initialise Paddle
paddle.init(use_gpu=True, trainer_count=1)

# Model and trainer
model = Model(dict_size, IMAGE_SHAPE, is_infer=False)
optimizer = paddle.optimizer.Momentum(momentum=0)
params = paddle.parameters.create(model.cost)

trainer = paddle.trainer.SGD(
    cost=model.cost,
    parameters=params,
    update_equation=optimizer,
    extra_layers=model.eval)

5.2 训练执行

def train():
    """Train the model for 100 passes, evaluating on the test list each pass.

    Reads ``data/train_data/trainer.list`` and ``data/test_data/test.list``,
    prints the cost after every training batch and the test cost at the
    end of every pass.

    NOTE(review): the trained parameters are not written anywhere here,
    while ``infer`` loads them from a tar archive — confirm where the model
    is meant to be saved.
    """
    train_file_list = get_file_list('data/train_data/trainer.list')
    test_file_list = get_file_list('data/test_data/test.list')
    model_reader = Reader(char_dict, IMAGE_SHAPE)
    feeding = {'image': 0, 'label': 1}  # data-layer name -> sample index

    def event_handler(e):
        # The trainer emits several event types and only EndIteration has a
        # `cost`; the original lambda read it on every event and crashed.
        if isinstance(e, paddle.event.EndIteration):
            print(f"Pass {e.pass_id}, Cost {e.cost}")
        elif isinstance(e, paddle.event.EndPass):
            result = trainer.test(
                reader=paddle.batch(model_reader.train_reader(test_file_list), batch_size=BATCH_SIZE),
                feeding=feeding)
            print(f"Test {e.pass_id}, Cost {result.cost}")

    trainer.train(
        reader=paddle.batch(model_reader.train_reader(train_file_list), batch_size=BATCH_SIZE),
        feeding=feeding,
        event_handler=event_handler,
        num_passes=100
    )

6. 预测

6.1 加载模型与参数

def infer(img_path, model_path, image_shape, label_dict_path):
    """Load a trained model and run recognition on one image.

    Args:
        img_path: Path of the image to recognise; forwarded to
            ``start_infer``.
        model_path: Path of the gzip-compressed parameter tar archive.
        image_shape: Image size handed to ``Model`` as its ``shape``.
        label_dict_path: Path of the label dictionary file.

    Returns:
        Whatever ``start_infer`` returns.
    """
    # Local import: gzip is not imported in the visible part of the file.
    import gzip

    char_dict = load_dict(label_dict_path)
    paddle.init(use_gpu=True, trainer_count=1)
    # Close the archive once the parameters are loaded; the original left
    # the handle open.
    with gzip.open(model_path) as f:
        parameters = paddle.parameters.Parameters.from_tar(f)
    model = Model(len(char_dict), image_shape, is_infer=True)
    inferer = paddle.inference.Inference(output_layer=model.log_probs, parameters=parameters)
    # NOTE(review): start_infer is not defined in the visible part of the
    # file — confirm it exists elsewhere.
    return start_infer(inferer, img_path, char_dict)

6.2 CTC解码(最佳路径)

import numpy as np
from itertools import groupby

def ctc_greedy_decoder(probs_seq, vocabulary):
    """Decode a probability sequence with CTC best-path (greedy) decoding.

    The most probable index is taken at every time step, consecutive
    repeats are merged, and the blank index ``len(vocabulary)`` is removed.

    Args:
        probs_seq: 2-D sequence of shape ``(time steps, len(vocabulary) + 1)``.
        vocabulary: Indexable mapping from class index to character.

    Returns:
        The decoded string; ``''`` for an empty ``probs_seq``.

    Raises:
        ValueError: If ``probs_seq`` is not 2-D or its column count is not
            ``len(vocabulary) + 1``.
    """
    probs = np.asarray(probs_seq)
    if probs.size == 0:
        # argmax(axis=1) fails on an empty sequence; nothing to decode.
        return ''

    blank = len(vocabulary)
    if probs.ndim != 2 or probs.shape[1] != blank + 1:
        # A mismatch would otherwise decode silently with the wrong blank.
        raise ValueError(
            f"probs_seq must have shape (T, {blank + 1}), got {probs.shape}")

    max_index_list = probs.argmax(axis=1)
    # groupby merges consecutive repeats; blanks are dropped afterwards.
    index_list = [idx for idx, _ in groupby(max_index_list) if idx != blank]
    return ''.join([vocabulary[idx] for idx in index_list])

7. 参考资料

  1. PaddlePaddle官方文档
  2. Warp-CTC实现
  3. 验证码端到端识别教程

注意:需确保环境中已安装 paddlepaddle-gpu 和 warp-ctc 库(仅支持GPU训练)。训练完成后,模型可直接用于预测新验证码。

Xiaoye