验证码识别:使用PaddlePaddle进行端到端训练
概述
本文将详细介绍如何使用PaddlePaddle构建一个验证码识别系统。验证码通常由多个字符组成,我们需要先对验证码进行预处理(裁剪、分类),然后训练模型进行识别。
1. 验证码预处理
1.1 下载验证码
训练需要大量验证码样本。这里使用Python代码从验证码接口批量下载图片:
# coding=utf-8
import os
import uuid
import requests
import random
import time
class DownloadCaptcha:
    """Download captcha images from a fixed HTTP endpoint into a folder.

    Args:
        save_path: Directory that receives the downloaded ``.png`` files;
            it is created when missing.
        max_count: Number of download attempts to make.
    """

    # Endpoint queried once per captcha.
    CAPTCHA_URL = 'http://jwsys.ctbu.edu.cn/CheckCode.aspx'

    def __init__(self, save_path, max_count=1000):
        self.save_path = save_path
        self.max_count = max_count
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(save_path, exist_ok=True)

    def download(self):
        """Try to fetch ``max_count`` captchas, sleeping between requests.

        Failed attempts are reported on stdout and skipped.

        Returns:
            The number of images actually written to ``save_path``.
        """
        saved = 0
        for i in range(self.max_count):
            # Simulate different captcha request parameters.
            params = {
                'type': random.choice(['1', '2', '3']),
                'timestamp': int(time.time() * 1000),
                'rand': random.choice(['r', 'g', 'b']),
            }
            try:
                response = requests.get(
                    self.CAPTCHA_URL,
                    params=params,
                    timeout=10,
                )
                # Without this check an HTTP error page would be stored
                # as a ".png" and later poison the training data.
                response.raise_for_status()
                # Save the image under a unique name.
                filename = os.path.join(self.save_path, f"{uuid.uuid1()}.png")
                with open(filename, 'wb') as f:
                    f.write(response.content)
            except (requests.RequestException, OSError) as e:
                print(f"下载失败: {e}")
            else:
                saved += 1
                print(f"已下载 {i+1}/{self.max_count} 张验证码")
            # Random pause so the requests are not fired back-to-back.
            time.sleep(random.uniform(0.5, 1.5))
        return saved
if __name__ == '__main__':
    # Script entry point: fetch 1000 captchas into ./captcha_images.
    DownloadCaptcha(save_path='captcha_images', max_count=1000).download()
1.2 验证码裁剪
将完整验证码分割成单个字符图像。注意:裁剪结果按字符在验证码中的位置(0、1、2……)存入对应子目录,而不是按字符类别存放;在进入下一步之前,需要人工将这些图片整理到以字符命名的类别目录中:
# coding=utf-8
import os
import cv2
import numpy as np
class SplitCaptcha:
    """Cut every ``.png`` captcha in a folder into single-character crops.

    Args:
        input_dir: Directory containing the full captcha images.
        output_dir: Root directory for the crops; created when missing.
            Crops are stored in sub-directories named after the character
            position (``0``, ``1``, ...), reusing the source file name.
    """

    def __init__(self, input_dir, output_dir):
        self.input_dir = input_dir
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def split(self):
        """Segment each captcha by external contours and save 20x20 crops."""
        # Captchas are about 50 px wide and 20 px high; each character is
        # about 12-15 px wide.
        for filename in os.listdir(self.input_dir):
            if not filename.endswith('.png'):
                continue
            filepath = os.path.join(self.input_dir, filename)
            img = cv2.imread(filepath, 0)  # grayscale
            if img is None:
                # imread returns None for unreadable/corrupt files; skip
                # them instead of crashing inside cv2.threshold.
                continue
            # Binarize (inverted, so dark glyphs become white foreground).
            _, binary = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY_INV)
            # findContours returns 3 values on OpenCV 3.x and 2 values on
            # 2.x/4.x; the contour list is always the second-to-last item.
            contours = cv2.findContours(
                binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
            # Compute each bounding box once and order them left to right.
            boxes = sorted(
                (cv2.boundingRect(c) for c in contours),
                key=lambda box: box[0],
            )
            # Count only accepted characters so a noise blob does not shift
            # the position index of the characters that follow it.
            position = 0
            for x, y, w, h in boxes:
                if w <= 5 or h <= 15:  # filter small noise
                    continue
                roi = binary[y:y+h, x:x+w]
                # Normalize every crop to 20x20.
                roi = cv2.resize(roi, (20, 20), interpolation=cv2.INTER_AREA)
                save_dir = os.path.join(self.output_dir, f"{position}")
                os.makedirs(save_dir, exist_ok=True)
                cv2.imwrite(os.path.join(save_dir, filename), roi)
                position += 1
2. 数据准备
2.1 创建图像列表
为训练准备图像路径和标签(src_dir下的每个子目录代表一个字符类别,子目录名即类别名):
# coding=utf-8
import os
import random
import json
class CreateDataList:
    """Build train/test list files and a class-index map from a folder tree.

    Every sub-directory of ``src_dir`` is treated as one class and every
    entry inside it as one sample of that class.

    Args:
        src_dir: Root directory whose sub-directories are the classes.
        train_ratio: Fraction of the shuffled samples written to the
            training list; the remainder goes to the test list.
    """

    def __init__(self, src_dir, train_ratio=0.8):
        self.src_dir = src_dir
        self.train_ratio = train_ratio
        self.data_list = []
        self.classes = []
        self._load_classes()

    def _load_classes(self):
        """Collect the class names (sub-directory names) in sorted order."""
        # os.listdir() order is arbitrary; sorting keeps the class -> index
        # mapping identical across runs and machines.
        self.classes = sorted(
            name for name in os.listdir(self.src_dir)
            if os.path.isdir(os.path.join(self.src_dir, name))
        )

    def create(self, output_dir='.'):
        """Write ``train.list``, ``test.list`` and ``class_info.json``.

        Each list line is ``"<image path> <label index>"``.

        Args:
            output_dir: Directory that receives the three files (created
                when missing). Defaults to the current directory.
        """
        # Start from scratch so calling create() twice does not duplicate
        # every sample.
        self.data_list = []
        # Read the images of every class; enumerate gives the label
        # directly instead of an O(n) list.index() per image.
        for label, cls in enumerate(self.classes):
            cls_dir = os.path.join(self.src_dir, cls)
            for img_file in sorted(os.listdir(cls_dir)):
                self.data_list.append((os.path.join(cls_dir, img_file), label))

        # Shuffle, then split into training and test sets.
        random.shuffle(self.data_list)
        train_size = int(len(self.data_list) * self.train_ratio)
        splits = {
            'train.list': self.data_list[:train_size],
            'test.list': self.data_list[train_size:],
        }

        os.makedirs(output_dir, exist_ok=True)
        for list_name, samples in splits.items():
            with open(os.path.join(output_dir, list_name), 'w') as f:
                f.writelines(
                    f"{img_path} {label}\n" for img_path, label in samples
                )

        # Save the class-name -> index mapping.
        class_info = {cls: idx for idx, cls in enumerate(self.classes)}
        with open(os.path.join(output_dir, 'class_info.json'), 'w') as f:
            json.dump(class_info, f, indent=4)
3. 定义网络模型
3.1 LeNet-5模型
适用于简单验证码识别:
# coding=utf-8
import paddle.v2 as paddle
def lenet5(input_dim, class_dim):
    """Build a LeNet-5 style classifier and return its softmax output layer.

    The function creates its own data layer named ``"image"``, so callers
    feed samples under that name.

    Args:
        input_dim: Length of the flattened input vector. The first
            convolution is configured for one channel; the training script
            passes 20 * 20.
        class_dim: Number of output classes.

    Returns:
        The softmax output layer of size ``class_dim``.
    """
    # Input layer
    img = paddle.layer.data(
        name="image",
        type=paddle.data_type.dense_vector(input_dim))
    # Convolution + pooling block 1.
    # NOTE: the keyword of simple_img_conv_pool is `num_channel`
    # (singular); `num_channels` is rejected as an unexpected argument.
    conv1 = paddle.networks.simple_img_conv_pool(
        input=img,
        filter_size=5,
        num_filters=20,
        num_channel=1,
        pool_size=2,
        pool_stride=2,
        act=paddle.activation.Relu()
    )
    # Convolution + pooling block 2 (consumes the 20 maps of block 1)
    conv2 = paddle.networks.simple_img_conv_pool(
        input=conv1,
        filter_size=5,
        num_filters=50,
        num_channel=20,
        pool_size=2,
        pool_stride=2,
        act=paddle.activation.Relu()
    )
    # Fully connected layer
    fc = paddle.layer.fc(
        input=conv2,
        size=500,
        act=paddle.activation.Tanh()
    )
    # Output layer
    out = paddle.layer.fc(
        input=fc,
        size=class_dim,
        act=paddle.activation.Softmax()
    )
    return out
4. 训练模型
4.1 数据读取器
# coding=utf-8
import json
import os

import cv2
import numpy as np
import paddle.v2 as paddle
class CaptchaReader:
    """Create a sample reader from a ``"<path> <label>"`` list file.

    Args:
        data_list_file: Text file with one ``"<image path> <label>"`` entry
            per line.
        class_info_file: JSON file with the class mapping; it is loaded
            into ``self.class_info``.
    """

    def __init__(self, data_list_file, class_info_file):
        self.data_list_file = data_list_file
        self.class_info = self._load_class_info(class_info_file)

    def _load_class_info(self, file_path):
        """Return the parsed content of the class-info JSON file."""
        with open(file_path, 'r') as f:
            return json.load(f)

    def reader(self):
        """Return a generator function yielding ``(pixels, label)`` samples.

        ``pixels`` is the 20x20 grayscale image flattened to a float32
        vector scaled to [0, 1]; ``label`` is an int.

        Raises:
            IOError: (while iterating) if a listed image cannot be read.
        """
        with open(self.data_list_file, 'r') as f:
            lines = f.readlines()

        def reader():
            for line in lines:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                # Split on the LAST whitespace run so image paths that
                # contain spaces stay intact.
                img_path, label = line.rsplit(None, 1)
                label = int(label)
                # Load the image and preprocess it.
                img = cv2.imread(img_path, 0)
                if img is None:
                    # imread signals failure with None; fail with a clear
                    # message instead of an opaque error in cv2.resize.
                    raise IOError("cannot read image: %s" % img_path)
                img = cv2.resize(img, (20, 20))
                img = img.flatten().astype('float32') / 255.0
                yield img, label

        return reader
4.2 训练主程序
# coding=utf-8
import paddle.v2 as paddle
from lenet5 import lenet5
from CaptchaReader import CaptchaReader
# Initialize Paddle (CPU, single trainer)
paddle.init(use_gpu=False, trainer_count=1)

# Hyper-parameters
BATCH_SIZE = 64
IMAGE_SIZE = 20 * 20  # flattened 20x20 image
CLASS_DIM = 33  # assumed number of character classes; keep in sync with class_info.json

# Build the network.
# lenet5() creates the "image" data layer itself, so only the label layer
# is declared here (a second, unused "image" layer would be dead code).
label = paddle.layer.data(name="label",
                          type=paddle.data_type.integer_value(CLASS_DIM))

# Model output
predict = lenet5(IMAGE_SIZE, CLASS_DIM)

# Loss function
cost = paddle.layer.classification_cost(input=predict, label=label)

# Optimizer
optimizer = paddle.optimizer.Momentum(
    learning_rate=0.001,
    momentum=0.9,
    regularization=paddle.optimizer.L2Regularization(rate=1e-4)
)

# Trainer
trainer = paddle.trainer.SGD(
    cost=cost,
    parameters=paddle.parameters.create(cost),
    update_equation=optimizer
)

# Data readers
train_reader = CaptchaReader('train.list', 'class_info.json').reader()
test_reader = CaptchaReader('test.list', 'class_info.json').reader()


def event_handler(event):
    """Log the cost every 100 batches; evaluate and checkpoint each pass."""
    if isinstance(event, paddle.event.EndIteration):
        if event.batch_id % 100 == 0:
            print("Pass %d, Batch %d, Cost: %f" % (
                event.pass_id, event.batch_id, event.cost))
    if isinstance(event, paddle.event.EndPass):
        # Evaluate on the test set
        result = trainer.test(
            reader=paddle.batch(test_reader, batch_size=BATCH_SIZE),
            feeding={"image": 0, "label": 1})
        print("Test Result: %s" % result.metrics)
        # Save the model. save_parameter_to_tar() writes to an open file
        # object, not to a path string.
        with open("captcha_model.tar", "wb") as f:
            trainer.save_parameter_to_tar(f)


# Start training
trainer.train(
    reader=paddle.batch(
        paddle.reader.shuffle(train_reader, buf_size=1000),
        batch_size=BATCH_SIZE),
    num_passes=50,
    event_handler=event_handler,
    feeding={"image": 0, "label": 1}
)
5. 模型预测
5.1 预测函数
# coding=utf-8
import paddle.v2 as paddle
import cv2
import numpy as np
import json
def predict_captcha(image_path, model_path, class_info_path, class_dim=33):
    """Classify one single-character image with the trained LeNet-5 model.

    The network and parameters are loaded on every call, and
    ``paddle.init`` must already have been called by the program.

    Args:
        image_path: Path of the character image.
        model_path: Path of the parameter tar file written during training.
        class_info_path: Path of the JSON file mapping class name -> index.
        class_dim: Output size of the network; must equal the CLASS_DIM
            used for training (33 in the training script).

    Returns:
        Tuple ``(class_name, confidence)`` for the most probable class.

    Raises:
        IOError: If the image cannot be read.
    """
    # The network definition lives in the module the training script
    # imports it from; infer() needs the same topology to map parameters.
    from lenet5 import lenet5

    # Load the class mapping and invert it to index -> name.
    with open(class_info_path, 'r') as f:
        class_info = json.load(f)
    reverse_info = {v: k for k, v in class_info.items()}

    # Preprocess the image exactly like the training reader does.
    img = cv2.imread(image_path, 0)
    if img is None:
        raise IOError("cannot read image: %s" % image_path)
    img = cv2.resize(img, (20, 20))
    img = img.flatten().astype('float32') / 255.0

    # Rebuild the output layer; the original referenced an undefined
    # `predict` name here.
    predict = lenet5(20 * 20, class_dim)

    # Parameters.from_tar() reads from an open file object, not a path.
    with open(model_path, 'rb') as f:
        parameters = paddle.parameters.Parameters.from_tar(f)

    # infer() takes a list of samples, each sample a tuple of input fields.
    result = paddle.infer(
        output_layer=predict,
        parameters=parameters,
        input=[(img,)]
    )

    # Pick the most probable class; cast to a plain int for the dict lookup.
    pred_label = int(np.argmax(result[0]))
    confidence = result[0][pred_label]
    return reverse_info[pred_label], confidence
6. 端到端验证码识别
# coding=utf-8
import os
import cv2
import numpy as np
import paddle.v2 as paddle
def full_captcha_recognition(captcha_image_path, model_path, class_info_path):
    """Recognize a whole captcha: segment it, then classify each character.

    Prints the predicted character and confidence for every segment and
    returns the predicted characters concatenated in segment order.
    """
    recognized = []
    # Step 1: cut the captcha into per-character image files.
    char_paths = split_captcha_image(captcha_image_path)
    # Step 2: classify every crop in the order the splitter returned them.
    for i, img_path in enumerate(char_paths):
        char, confidence = predict_captcha(
            img_path, model_path, class_info_path)
        print(f"第{i+1}个字符: {char}, 置信度: {confidence:.2%}")
        recognized.append(char)
    return "".join(recognized)
def split_captcha_image(image_path):
    """Split one captcha image into 20x20 single-character crops.

    The crops are written to the ``temp`` directory as ``0.png``,
    ``1.png``, ... in left-to-right order.

    Args:
        image_path: Path of the full captcha image.

    Returns:
        List of file paths of the saved crops, ordered left to right.

    Raises:
        IOError: If the captcha image cannot be read.
    """
    img = cv2.imread(image_path, 0)
    if img is None:
        # imread signals failure with None rather than raising.
        raise IOError("cannot read image: %s" % image_path)
    _, binary = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY_INV)
    # findContours returns 3 values on OpenCV 3.x and 2 on 2.x/4.x; the
    # contour list is always the second-to-last item.
    contours = cv2.findContours(
        binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    boxes = sorted(
        (cv2.boundingRect(c) for c in contours),
        key=lambda box: box[0],
    )

    # Create the output directory once instead of re-checking per crop.
    temp_dir = "temp"
    os.makedirs(temp_dir, exist_ok=True)

    split_images = []
    for x, y, w, h in boxes:
        if w <= 5 or h <= 15:  # filter small noise
            continue
        roi = binary[y:y+h, x:x+w]
        # Use the same interpolation as the training crops so inference
        # inputs match what the model was trained on.
        roi = cv2.resize(roi, (20, 20), interpolation=cv2.INTER_AREA)
        temp_path = os.path.join(temp_dir, f"{len(split_images)}.png")
        cv2.imwrite(temp_path, roi)
        split_images.append(temp_path)
    return split_images
# Prediction example
if __name__ == "__main__":
    # Paddle v2 must be initialized before any layer is built or
    # inference is run.
    paddle.init(use_gpu=False, trainer_count=1)
    captcha_path = "test_captcha.png"
    result = full_captcha_recognition(
        captcha_path, "captcha_model.tar", "class_info.json")
    print(f"验证码识别结果: {result}")
7. 优化建议
- 数据增强:使用旋转、平移、缩放等方式扩充数据集
- 学习率调度:使用学习率衰减策略提升收敛速度
- 迁移学习:使用预训练模型(如ResNet)进行微调
- 多模型集成:训练多个模型并进行结果融合
- 优化预处理:使用更精确的字符分割算法
8. 项目结构
captcha_recognition/
├── data/
│ ├── train.list
│ ├── test.list
│ └── class_info.json
├── models/
│ └── captcha_model.tar
├── utils/
│ ├── downloader.py
│ ├── splitter.py
│ ├── reader.py
│ └── predictor.py
├── train.py
└── predict.py
通过以上步骤,我们可以构建一个完整的验证码识别系统,从数据采集、预处理、模型训练到最终的端到端识别。根据实际验证码的复杂度和字符集大小,可能需要调整模型结构和训练参数以获得最佳识别效果。