验证码识别:使用PaddlePaddle进行端到端训练

概述

本文将详细介绍如何使用PaddlePaddle构建一个验证码识别系统。验证码通常由多个字符组成,我们需要先对验证码进行预处理(裁剪、分类),然后训练模型进行识别。

1. 验证码预处理

1.1 下载验证码

我们需要收集大量验证码图片用于训练。这里使用Python代码从目标站点批量下载验证码图片:

# coding=utf-8
import os
import uuid
import requests
import random
import time

class DownloadCaptcha:
    """Download captcha images from a remote endpoint into a local folder.

    Args:
        save_path: Directory the images are written to; created if missing.
        max_count: Number of download attempts to make.
        url: Captcha endpoint to request. Defaults to the original
            hard-coded endpoint.
    """

    DEFAULT_URL = 'http://jwsys.ctbu.edu.cn/CheckCode.aspx'

    def __init__(self, save_path, max_count=1000, url=DEFAULT_URL):
        self.save_path = save_path
        self.max_count = max_count
        self.url = url
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(save_path, exist_ok=True)

    def download(self):
        """Make ``max_count`` download attempts, one image file per success.

        A failed attempt (network error, non-2xx status, write error) is
        reported and skipped; it never aborts the whole run. A random pause
        of 0.5-1.5 s follows every attempt.
        """
        for i in range(self.max_count):
            try:
                # Vary the query parameters between requests.
                params = {
                    'type': random.choice(['1', '2', '3']),
                    'timestamp': int(time.time() * 1000),
                    'rand': random.choice(['r', 'g', 'b'])
                }
                response = requests.get(self.url, params=params, timeout=10)
                # Without this check an HTTP error page would be saved as a
                # ".png" and later poison the training set.
                response.raise_for_status()

                filename = os.path.join(self.save_path, f"{uuid.uuid1()}.png")
                with open(filename, 'wb') as f:
                    f.write(response.content)
                print(f"已下载 {i+1}/{self.max_count} 张验证码")
            except (requests.RequestException, OSError) as e:
                print(f"下载失败: {e}")
            time.sleep(random.uniform(0.5, 1.5))

if __name__ == '__main__':
    # Script entry point: fetch 1000 captchas into ./captcha_images.
    DownloadCaptcha('captcha_images', 1000).download()

1.2 验证码裁剪

将完整验证码分割成单个字符图像:

# coding=utf-8
import os
import cv2
import numpy as np

class SplitCaptcha:
    """Cut captcha images into single-character 20x20 binary crops.

    Each crop is written to ``output_dir/<position>/<original filename>``,
    where ``<position>`` is the left-to-right index of the character among
    the contours that pass the size filter.

    Args:
        input_dir: Directory containing the ``.png`` captcha images.
        output_dir: Directory receiving the crops; created if missing.
    """

    def __init__(self, input_dir, output_dir):
        self.input_dir = input_dir
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def split(self):
        """Process every ``.png`` file found directly in ``input_dir``.

        Files OpenCV cannot decode are reported and skipped.
        """
        # Captchas are roughly 50 wide and 20 high; each character is
        # roughly 12-15 wide.
        for filename in os.listdir(self.input_dir):
            if not filename.endswith('.png'):
                continue
            filepath = os.path.join(self.input_dir, filename)
            img = cv2.imread(filepath, 0)  # load as grayscale
            if img is None:
                # imread signals failure by returning None, not by raising.
                print(f"无法读取图像,已跳过: {filepath}")
                continue

            # Inverted binarization: pixels at or below 127 become 255.
            _, binary = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY_INV)
            # findContours returns 3 values on OpenCV 3.x and 2 on 4.x;
            # the contour list is second-to-last in both.
            contours = cv2.findContours(
                binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
            # Order the bounding boxes left to right.
            boxes = sorted(
                (cv2.boundingRect(c) for c in contours),
                key=lambda box: box[0])

            # Count only accepted characters so that rejected noise contours
            # do not shift the position index of the characters after them.
            position = 0
            for x, y, w, h in boxes:
                if w <= 5 or h <= 15:  # too small: treat as noise
                    continue
                roi = cv2.resize(binary[y:y+h, x:x+w], (20, 20),
                                 interpolation=cv2.INTER_AREA)
                save_dir = os.path.join(self.output_dir, f"{position}")
                os.makedirs(save_dir, exist_ok=True)
                cv2.imwrite(os.path.join(save_dir, filename), roi)
                position += 1

2. 数据准备

2.1 创建图像列表

为训练准备图像路径和标签。注意:下面的代码把 src_dir 下每个子目录名当作一个类别,因此需要先将裁剪得到的字符图像按字符类别归类到对应的子目录中:

# coding=utf-8
import os
import random
import json

class CreateDataList:
    """Build train/test list files from a directory of per-class folders.

    Every sub-directory of ``src_dir`` is one class. Class indices follow the
    sorted directory names, so the label mapping is the same on every run
    and on every file system.

    Args:
        src_dir: Root directory whose sub-directories hold the class images.
        train_ratio: Fraction of the shuffled samples written to the
            training list; the rest go to the test list.
    """

    def __init__(self, src_dir, train_ratio=0.8):
        self.src_dir = src_dir
        self.train_ratio = train_ratio
        self.data_list = []
        self.classes = []
        self._load_classes()

    def _load_classes(self):
        """Collect the class names (sub-directory names) in sorted order."""
        # os.listdir order is arbitrary; sorting keeps label ids stable.
        self.classes = sorted(
            name for name in os.listdir(self.src_dir)
            if os.path.isdir(os.path.join(self.src_dir, name))
        )

    @staticmethod
    def _write_list(list_path, samples):
        """Write one ``<image path> <label>`` line per sample."""
        with open(list_path, 'w') as f:
            for img_path, label in samples:
                f.write(f"{img_path} {label}\n")

    def create(self):
        """Write ``train.list``, ``test.list`` and ``class_info.json``.

        All three files are written to the current working directory.
        ``self.data_list`` is rebuilt on every call, so calling this method
        more than once does not duplicate samples.
        """
        self.data_list = []
        for label, cls in enumerate(self.classes):
            cls_dir = os.path.join(self.src_dir, cls)
            # Sorted so the pre-shuffle order is reproducible under a seed.
            for img_file in sorted(os.listdir(cls_dir)):
                self.data_list.append((os.path.join(cls_dir, img_file), label))

        random.shuffle(self.data_list)

        train_size = int(len(self.data_list) * self.train_ratio)
        self._write_list('train.list', self.data_list[:train_size])
        self._write_list('test.list', self.data_list[train_size:])

        # Persist the class-name -> index mapping used for the labels.
        class_info = {cls: idx for idx, cls in enumerate(self.classes)}
        with open('class_info.json', 'w') as f:
            json.dump(class_info, f, indent=4)

3. 定义网络模型

3.1 LeNet-5模型

适用于简单验证码识别:

# coding=utf-8
import paddle.v2 as paddle

def lenet5(input_dim, class_dim):
    """Assemble a LeNet-5 style classifier with the paddle.v2 layer API.

    Args:
        input_dim: Length of the flattened single-channel input vector.
        class_dim: Size of the softmax output layer (number of classes).

    Returns:
        The softmax output layer of the network.
    """
    # Data layer named "image"; the reader feeds it dense float vectors.
    features = paddle.layer.data(
        name="image",
        type=paddle.data_type.dense_vector(input_dim))

    # Two conv+pool stages with identical 5x5 kernels and 2x2/stride-2
    # pooling; only the channel and filter counts differ.
    for in_channels, out_filters in ((1, 20), (20, 50)):
        features = paddle.networks.simple_img_conv_pool(
            input=features,
            filter_size=5,
            num_filters=out_filters,
            num_channels=in_channels,
            pool_size=2,
            pool_stride=2,
            act=paddle.activation.Relu())

    # Fully connected hidden layer with 500 tanh units.
    hidden = paddle.layer.fc(
        input=features, size=500, act=paddle.activation.Tanh())

    # Softmax classifier over the class_dim classes.
    return paddle.layer.fc(
        input=hidden, size=class_dim, act=paddle.activation.Softmax())

4. 训练模型

4.1 数据读取器

# coding=utf-8
import os
import numpy as np
import paddle.v2 as paddle

class CaptchaReader:
    """Create sample readers from a ``<image path> <label>`` list file.

    Args:
        data_list_file: Text file with one ``<image path> <label>`` per line.
        class_info_file: JSON file with the class-name -> index mapping.
    """

    def __init__(self, data_list_file, class_info_file):
        self.data_list_file = data_list_file
        self.class_info = self._load_class_info(class_info_file)

    def _load_class_info(self, file_path):
        """Return the parsed JSON content of ``file_path``."""
        # Imported here because the surrounding module never imports json.
        import json
        with open(file_path, 'r') as f:
            return json.load(f)

    @staticmethod
    def _load_image(img_path):
        """Load ``img_path`` as a flat float32 vector of 400 values in [0, 1].

        Raises:
            IOError: If OpenCV cannot read the image.
        """
        # Imported here because the surrounding module never imports cv2.
        import cv2
        img = cv2.imread(img_path, 0)  # grayscale
        if img is None:
            # imread returns None instead of raising on a bad or missing file.
            raise IOError("cannot read image: %s" % img_path)
        img = cv2.resize(img, (20, 20))
        return img.flatten().astype('float32') / 255.0

    def reader(self):
        """Return a zero-argument generator function over the samples.

        The list file is read once, when this method is called. Each call of
        the returned function yields ``(image_vector, label)`` tuples in
        file order. Blank lines are skipped.
        """
        with open(self.data_list_file, 'r') as f:
            lines = f.readlines()

        def reader():
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                # Split from the right so image paths may contain spaces.
                img_path, label = line.rsplit(None, 1)
                yield self._load_image(img_path), int(label)

        return reader

4.2 训练主程序

# coding=utf-8
import paddle.v2 as paddle
from lenet5 import lenet5
from CaptchaReader import CaptchaReader

# Initialise Paddle: CPU only, one trainer.
paddle.init(use_gpu=False, trainer_count=1)

# Hyper-parameters
BATCH_SIZE = 64
IMAGE_SIZE = 20 * 20  # flattened 20x20 image
CLASS_DIM = 33  # assumes 33 character classes

# Input layers
# NOTE(review): lenet5() below is called with IMAGE_SIZE rather than with this
# `image` layer, and `image` is not referenced again in this script — confirm
# whether this declaration is still needed.
image = paddle.layer.data(name="image", 
                         type=paddle.data_type.dense_vector(IMAGE_SIZE))
label = paddle.layer.data(name="label", 
                         type=paddle.data_type.integer_value(CLASS_DIM))

# Model output layer
predict = lenet5(IMAGE_SIZE, CLASS_DIM)

# Loss: classification cost of the prediction against the label layer
cost = paddle.layer.classification_cost(input=predict, label=label)

# Optimizer: momentum with L2 regularization
optimizer = paddle.optimizer.Momentum(
    learning_rate=0.001,
    momentum=0.9,
    regularization=paddle.optimizer.L2Regularization(rate=1e-4)
)

# Trainer; the parameters are created from the cost topology
trainer = paddle.trainer.SGD(
    cost=cost,
    parameters=paddle.parameters.create(cost),
    update_equation=optimizer
)

# Sample readers for the training and test lists
train_reader = CaptchaReader('train.list', 'class_info.json').reader()
test_reader = CaptchaReader('test.list', 'class_info.json').reader()

# 定义训练事件处理函数
def event_handler(event):
    """Log training progress and checkpoint the model after every pass.

    Args:
        event: Event object passed in by ``trainer.train``. Only
            ``EndIteration`` and ``EndPass`` events are handled.
    """
    if isinstance(event, paddle.event.EndIteration):
        # Report the cost every 100 batches.
        if event.batch_id % 100 == 0:
            print("Pass %d, Batch %d, Cost: %f" % (
                event.pass_id, event.batch_id, event.cost))

    if isinstance(event, paddle.event.EndPass):
        # Evaluate on the test list.
        result = trainer.test(reader=paddle.batch(test_reader, batch_size=BATCH_SIZE),
                              feeding={"image": 0, "label": 1})
        print("Test Result: %s" % result.metrics)
        # save_parameter_to_tar writes to a file object, not to a path, so
        # the archive has to be opened here (binary mode: it is a tar file).
        with open("captcha_model.tar", "wb") as f:
            trainer.save_parameter_to_tar(f)

# 开始训练
trainer.train(
    # Column 0 of each sample feeds "image", column 1 feeds "label".
    feeding={"image": 0, "label": 1},
    event_handler=event_handler,
    num_passes=50,
    # Shuffle within a 1000-sample buffer, then group into mini-batches.
    reader=paddle.batch(
        paddle.reader.shuffle(train_reader, buf_size=1000),
        batch_size=BATCH_SIZE,
    ),
)

5. 模型预测

5.1 预测函数

# coding=utf-8
import paddle.v2 as paddle
import cv2
import numpy as np
import json

def predict_captcha(image_path, model_path, class_info_path, output_layer=None):
    """Classify a single character image with a trained model.

    Args:
        image_path: Path of the character image; it is read as grayscale and
            resized to 20x20.
        model_path: Path of the parameter tar archive written during training.
        class_info_path: Path of the JSON file mapping class name -> index.
        output_layer: Output layer of the network to run. When omitted, a
            LeNet-5 with ``len(class_info)`` outputs is built once and reused
            on later calls.
            NOTE(review): this assumes the model was trained with
            ``class_dim == len(class_info)`` — confirm against the training
            script.

    Returns:
        A ``(class_name, confidence)`` tuple for the most probable class.

    Raises:
        IOError: If the model, class-info or image file cannot be read.
        KeyError: If the predicted index has no entry in the class info.
    """
    # Parameters.from_tar reads from a file object, not from a path string.
    with open(model_path, 'rb') as f:
        parameters = paddle.parameters.Parameters.from_tar(f)

    # Invert the name -> index mapping so an index can be turned into a name.
    with open(class_info_path, 'r') as f:
        class_info = json.load(f)
    reverse_info = {v: k for k, v in class_info.items()}

    if output_layer is None:
        # The original code referenced an undefined global `predict`.
        # Build the network here, and only once.
        # NOTE(review): rebuilding it on every call would presumably give
        # the layers new auto-generated names that no longer match the
        # saved parameters — confirm.
        output_layer = getattr(predict_captcha, '_output_layer', None)
        if output_layer is None:
            from lenet5 import lenet5
            output_layer = lenet5(20 * 20, len(class_info))
            predict_captcha._output_layer = output_layer

    # Same preprocessing as the training reader: 20x20 grayscale, flattened,
    # scaled to [0, 1].
    img = cv2.imread(image_path, 0)
    if img is None:
        # imread returns None instead of raising on a bad or missing file.
        raise IOError("cannot read image: %s" % image_path)
    img = cv2.resize(img, (20, 20))
    img = img.flatten().astype('float32') / 255.0

    # One sample, passed as a one-element tuple (the image vector).
    result = paddle.infer(
        output_layer=output_layer,
        parameters=parameters,
        input=[(img,)]
    )

    probabilities = result[0]
    pred_label = int(np.argmax(probabilities))
    confidence = probabilities[pred_label]

    return reverse_info[pred_label], confidence

6. 端到端验证码识别

# coding=utf-8
import os
import cv2
import numpy as np
import paddle.v2 as paddle

def full_captcha_recognition(captcha_image_path, model_path, class_info_path):
    """Recognise a whole captcha by classifying each of its characters.

    Args:
        captcha_image_path: Path of the full captcha image.
        model_path: Path of the trained parameter archive.
        class_info_path: Path of the class-info JSON file.

    Returns:
        The recognised characters joined left to right into one string.
    """
    # Stage 1: cut the captcha into per-character image files.
    char_paths = split_captcha_image(captcha_image_path)

    # Stage 2: classify every crop and collect the characters in order.
    recognised = []
    for i, char_path in enumerate(char_paths):
        char, confidence = predict_captcha(
            char_path, model_path, class_info_path)
        recognised.append(char)
        print(f"第{i+1}个字符: {char}, 置信度: {confidence:.2%}")

    return "".join(recognised)

def split_captcha_image(image_path):
    """Split a captcha image into per-character 20x20 crops saved on disk.

    The crops are written to ``temp/<n>.png``, numbered left to right among
    the contours that pass the size filter.

    Args:
        image_path: Path of the captcha image.

    Returns:
        List of the crop file paths, ordered left to right.

    Raises:
        IOError: If the image cannot be read.
    """
    img = cv2.imread(image_path, 0)  # grayscale
    if img is None:
        # imread returns None instead of raising on a bad or missing file.
        raise IOError("cannot read image: %s" % image_path)

    _, binary = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY_INV)
    # findContours returns 3 values on OpenCV 3.x and 2 on 4.x; the contour
    # list is second-to-last in both.
    contours = cv2.findContours(
        binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    contours = sorted(contours, key=lambda c: cv2.boundingRect(c)[0])

    temp_dir = "temp"
    split_images = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if w <= 5 or h <= 15:  # too small: treat as noise
            continue
        # INTER_AREA matches the interpolation used for the training crops
        # in SplitCaptcha.split.
        roi = cv2.resize(binary[y:y+h, x:x+w], (20, 20),
                         interpolation=cv2.INTER_AREA)
        # Created lazily so nothing is left behind when no character is found.
        os.makedirs(temp_dir, exist_ok=True)
        temp_path = os.path.join(temp_dir, f"{len(split_images)}.png")
        cv2.imwrite(temp_path, roi)
        split_images.append(temp_path)

    return split_images

# 预测示例
if __name__ == "__main__":
    # paddle.v2 has to be initialised before layers are built or inference
    # is run; the original script never did so. Same settings as training.
    paddle.init(use_gpu=False, trainer_count=1)

    captcha_path = "test_captcha.png"
    result = full_captcha_recognition(
        captcha_path, "captcha_model.tar", "class_info.json")
    print(f"验证码识别结果: {result}")

7. 优化建议

  1. 数据增强:使用旋转、平移、缩放等方式扩充数据集
  2. 学习率调度:使用学习率衰减策略提升收敛速度
  3. 迁移学习:使用预训练模型(如ResNet)进行微调
  4. 多模型集成:训练多个模型并进行结果融合
  5. 优化预处理:使用更精确的字符分割算法

8. 项目结构

captcha_recognition/
├── data/
│   ├── train.list
│   ├── test.list
│   └── class_info.json
├── models/
│   └── captcha_model.tar
├── utils/
│   ├── downloader.py
│   ├── splitter.py
│   ├── reader.py
│   └── predictor.py
├── train.py
└── predict.py

通过以上步骤,我们可以构建一个完整的验证码识别系统,从数据采集、预处理、模型训练到最终的端到端识别。根据实际验证码的复杂度和字符集大小,可能需要调整模型结构和训练参数以获得最佳识别效果。

Xiaoye