Implementing a Voiceprint Recognition System with ECAPA-TDNN
I. Project Overview
This project implements a voiceprint (speaker) recognition system based on the ECAPA-TDNN model, covering audio preprocessing, model training, voiceprint comparison, and recognition. By combining Mel-spectrogram feature extraction, data augmentation, and an ArcFace loss, the system can effectively distinguish between speakers and achieve high recognition accuracy.
II. Environment Setup
- Dependencies:
pip install torch torchaudio soundfile librosa matplotlib visualdl scikit-learn numpy
- Dataset preparation:
  - Uses the zhvoice dataset, which contains 3,242 speakers and over 1.13 million utterances.
  - Data layout: WAV audio files organized as <说话人ID>/<文件名>.wav, i.e. <speaker_id>/<filename>.wav; a sketch of generating the training list from this layout follows below.
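The contents of create_data.py (invoked in Section IV) are not shown in this article. As a rough illustration only, a script along the following lines can walk the <speaker_id>/<filename>.wav layout and write a tab-separated training list; the output format (path, tab, integer label) is an assumption about what the training code expects:

import os

def create_data_list(dataset_dir, list_path):
    # Map each speaker directory to an integer label and write
    # "wav_path<TAB>label" for every utterance it contains.
    speakers = sorted(os.listdir(dataset_dir))
    with open(list_path, 'w', encoding='utf-8') as f:
        for label, speaker in enumerate(speakers):
            speaker_dir = os.path.join(dataset_dir, speaker)
            if not os.path.isdir(speaker_dir):
                continue
            for name in os.listdir(speaker_dir):
                if name.endswith('.wav'):
                    f.write(f"{os.path.join(speaker_dir, name)}\t{label}\n")

create_data_list('dataset/zhvoice', 'dataset/train_list.txt')  # placeholder paths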
III. Core Implementation
1. Audio Preprocessing and Feature Extraction
import torch
import torchaudio
import numpy as np
from scipy.io import wavfile

class AudioProcessor:
    def __init__(self, sample_rate=16000, n_fft=512, hop_length=160, n_mels=80):
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels
        self.transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=n_mels,
            f_min=50,
            f_max=8000  # f_max must not exceed Nyquist (sample_rate / 2)
        )

    def load_audio(self, file_path):
        # Read the audio, scale 16-bit PCM to [-1, 1], and resample if needed
        sr, audio = wavfile.read(file_path)
        audio = torch.tensor(audio, dtype=torch.float32) / 32768.0
        if sr != self.sample_rate:
            audio = torchaudio.transforms.Resample(sr, self.sample_rate)(audio)
        return audio

    def extract_feature(self, audio):
        # Compute the log-Mel spectrogram
        mel = self.transform(audio)                        # (n_mels, time_steps)
        mel_db = torchaudio.transforms.AmplitudeToDB()(mel)
        return mel_db.numpy()                              # (n_mels, time_steps)
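A quick usage check (the sample path is a placeholder):

processor = AudioProcessor()
audio = processor.load_audio('dataset/zhvoice/S0001/utt001.wav')  # placeholder path
feature = processor.extract_feature(audio)
print(feature.shape)  # (80, time_steps), e.g. (80, 301) for ~3 s of 16 kHz audio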
2. Data Augmentation
import os
import random
import numpy as np
import librosa
from scipy.io import wavfile

class Augmenter:
    def __init__(self, noise_path=None):
        self.noise_path = noise_path
        self.noises = []
        if noise_path:
            # Load the background-noise library
            for file in os.listdir(noise_path):
                if file.endswith('.wav'):
                    sr, noise = wavfile.read(os.path.join(noise_path, file))
                    self.noises.append(noise.astype(np.float32) / 32768.0)

    def add_noise(self, audio, snr_range=(10, 30)):
        # Mix in a random noise clip at a random SNR, 50% of the time
        if not self.noises or random.random() > 0.5:
            return audio
        snr = random.uniform(*snr_range)
        noise = random.choice(self.noises)
        audio_length = len(audio)
        if len(noise) < audio_length:
            # Tile short noise clips to cover the whole utterance
            noise = np.tile(noise, audio_length // len(noise) + 1)
        noise = noise[:audio_length]
        noise_power = np.sum(noise ** 2) / audio_length + 1e-8
        audio_power = np.sum(audio ** 2) / audio_length
        # Scale the noise so the resulting SNR (in dB) matches the target
        scale = np.sqrt(audio_power / (10 ** (snr / 10)) / noise_power)
        return audio + scale * noise

    def time_stretch(self, audio, rate_range=(0.9, 1.1)):
        # Randomly speed up or slow down the utterance, 50% of the time
        if random.random() > 0.5:
            rate = random.uniform(*rate_range)
            return librosa.effects.time_stretch(audio, rate=rate)
        return audio
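Augmentation is applied to the raw waveform before feature extraction. Continuing from the AudioProcessor example above (the noise directory is a placeholder):

augmenter = Augmenter(noise_path='dataset/noise')  # placeholder directory
audio = processor.load_audio('dataset/zhvoice/S0001/utt001.wav').numpy()
audio = augmenter.add_noise(audio)
audio = augmenter.time_stretch(audio)
feature = processor.extract_feature(torch.tensor(audio, dtype=torch.float32))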
3. Model Definition (ECAPA-TDNN)
import torch.nn as nn
import torch.nn.functional as F

class ECAPA_TDNN(nn.Module):
    """Simplified ECAPA-TDNN: a stack of dilated 1-D convolutions, with
    average pooling standing in for attentive statistics pooling."""
    def __init__(self, input_channels=80, num_speakers=3242, lin_neurons=192):
        super().__init__()
        # TDNN layers: padding matches dilation so the time length is preserved
        self.convs = nn.ModuleList([
            nn.Conv1d(input_channels, 512, kernel_size=5, dilation=1, padding=2),
            nn.Conv1d(512, 512, kernel_size=3, dilation=2, padding=2),
            nn.Conv1d(512, 512, kernel_size=3, dilation=3, padding=3),
            nn.Conv1d(512, 512, kernel_size=3, dilation=4, padding=4),
            nn.Conv1d(512, 1536, kernel_size=1, dilation=1, padding=0),
        ])
        self.bn = nn.BatchNorm1d(1536)
        self.pooling = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(1536, lin_neurons)
        # ArcFace is defined in Section 4 below
        self.arcface = ArcFace(num_features=lin_neurons, num_classes=num_speakers)

    def forward(self, x, labels=None):
        # x: [B, n_mels, T] -- Mel features, channels first
        for conv in self.convs[:-1]:
            x = F.relu(conv(x))
        x = F.relu(self.bn(self.convs[-1](x)))
        x = self.pooling(x).squeeze(-1)   # [B, 1536]
        x = self.fc(x)                    # [B, lin_neurons]
        x = F.normalize(x, p=2, dim=1)    # L2-normalize the embedding
        if labels is not None:
            return self.arcface(x, labels)  # ArcFace logits for training
        return x                            # speaker embedding for inference
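Once the ArcFace class from the next section is defined, a quick shape check with random input:

model = ECAPA_TDNN()
dummy = torch.randn(4, 80, 300)        # [B, n_mels, T]
labels = torch.randint(0, 3242, (4,))
print(model(dummy).shape)              # torch.Size([4, 192])  -- embeddings
print(model(dummy, labels).shape)      # torch.Size([4, 3242]) -- ArcFace logits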
4. ArcFace Loss
class ArcFace(nn.Module):
    def __init__(self, num_features, num_classes, s=30.0, m=0.5):
        super().__init__()
        self.s = s  # scale factor
        self.m = m  # angular margin
        self.weight = nn.Parameter(torch.empty(num_classes, num_features))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, x, labels):
        # Cosine similarity between embeddings and the class weight vectors
        cos_theta = F.linear(F.normalize(x), F.normalize(self.weight))
        theta = torch.acos(torch.clamp(cos_theta, -1.0 + 1e-7, 1.0 - 1e-7))
        cos_theta_m = torch.cos(theta + self.m)  # cos(theta + m)
        # Apply the margin only to each sample's target class
        one_hot = torch.zeros_like(cos_theta)
        one_hot.scatter_(1, labels.view(-1, 1), 1.0)
        output = (cos_theta_m - cos_theta) * one_hot + cos_theta
        # Return scaled logits; cross-entropy is applied in the training loop
        return self.s * output
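For reference, the objective this module implements: with the embedding x and class weights W both L2-normalized, \cos\theta_j = W_j^\top x, and the per-sample loss for label y is

L = -\log \frac{e^{s\,\cos(\theta_y + m)}}{e^{s\,\cos(\theta_y + m)} + \sum_{j \neq y} e^{s\,\cos\theta_j}}

with scale s = 30 and margin m = 0.5 by default. The margin penalizes the target-class angle, forcing embeddings of the same speaker to cluster more tightly than plain softmax would.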
5. Training Loop
def train(model, train_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0.0
    for audio, labels in train_loader:
        audio = audio.to(device)          # [B, n_mels, T]
        labels = labels.to(device)
        optimizer.zero_grad()
        logits = model(audio, labels)     # ArcFace logits
        loss = criterion(logits, labels)  # cross-entropy on the logits
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)
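Putting it together with the hyperparameters from Section V; the construction of train_loader (a DataLoader yielding fixed-length Mel-feature batches) is assumed and not shown in this article:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ECAPA_TDNN().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-6)
criterion = nn.CrossEntropyLoss()
for epoch in range(30):
    avg_loss = train(model, train_loader, optimizer, criterion, device)
    print(f'epoch {epoch + 1}: loss = {avg_loss:.4f}')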
6. Voiceprint Comparison and Recognition
def extract_embedding(model, audio_processor, file_path, device):
    # Turn a WAV file into a fixed-length speaker embedding; comparing raw
    # Mel features would not work, since they vary in length and are not
    # speaker-discriminative
    audio = audio_processor.load_audio(file_path)
    mel = torch.tensor(audio_processor.extract_feature(audio)).unsqueeze(0)  # [1, n_mels, T]
    model.eval()
    with torch.no_grad():
        return model(mel.to(device)).squeeze(0).cpu().numpy()

def compare_speakers(feature1, feature2):
    # Cosine similarity between two embeddings
    return np.dot(feature1, feature2) / (np.linalg.norm(feature1) * np.linalg.norm(feature2))

def recognize(model, audio_processor, speaker_db, input_audio_path, device, threshold=0.7):
    # Compare the input against every enrolled speaker and accept the
    # best match only if its similarity clears the threshold
    input_feature = extract_embedding(model, audio_processor, input_audio_path, device)
    max_sim, best_name = -1.0, None
    for name, feature in speaker_db.items():
        sim = compare_speakers(input_feature, feature)
        if sim > max_sim:
            max_sim, best_name = sim, name
    recognized = best_name if max_sim > threshold else None
    return recognized, max_sim
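Enrollment builds speaker_db by extracting one embedding per registered speaker (names and paths below are placeholders):

speaker_db = {}
for name, path in [('alice', 'enroll/alice.wav'), ('bob', 'enroll/bob.wav')]:
    speaker_db[name] = extract_embedding(model, processor, path, device)

who, score = recognize(model, processor, speaker_db, 'test/unknown.wav', device)
print(who, score)  # e.g. ('alice', 0.83), or (None, 0.41) if below the threshold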
IV. Running the Pipeline
- Data preparation:
  python create_data.py  # generate the train/test lists
- Model training:
  python train.py --config configs/ecapa_tdnn.yml
- Model evaluation:
  python eval.py --model_path models/best_model.pth
- Voiceprint recognition:
  python infer_recognition.py  # supports recognition from live recordings
V. Key Configuration Parameters
- Preprocessing config (configs/preprocess.yaml):
  sample_rate: 16000
  n_fft: 512
  hop_length: 160
  n_mels: 80
- Training config (configs/train.yaml):
  batch_size: 64
  epochs: 30
  lr: 0.001
  weight_decay: 1e-6
  device: cuda
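How train.py actually consumes these files is not shown; a minimal sketch of loading them with PyYAML, assuming the file names above:

import yaml

with open('configs/train.yaml', encoding='utf-8') as f:
    cfg = yaml.safe_load(f)

# Note: YAML 1.1 parses "1e-6" as a string, hence the float() cast
optimizer = torch.optim.Adam(model.parameters(),
                             lr=cfg['lr'],
                             weight_decay=float(cfg['weight_decay']))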
VI. Performance Optimization
- Data augmentation: improve robustness with SpecAugment, additive noise, and time stretching (a SpecAugment sketch follows this list).
- Feature normalization: L2-normalize the embedding vectors so cosine similarity is well defined.
- Model optimization: 1-D convolutions and adaptive pooling keep the parameter count low and speed up inference.
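SpecAugment is not implemented in the Augmenter class above; a minimal sketch using torchaudio's masking transforms, applied to the Mel features rather than the waveform (mask widths are illustrative):

import torchaudio

freq_mask = torchaudio.transforms.FrequencyMasking(freq_mask_param=15)
time_mask = torchaudio.transforms.TimeMasking(time_mask_param=30)

def spec_augment(mel):
    # mel: [n_mels, T] log-Mel tensor; zero out a random band of
    # frequency bins and a random span of time frames
    return time_mask(freq_mask(mel))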
VII. Extensions
- Multiple backbones: TDNN, ResNetSE, and similar models can be swapped in for ECAPA-TDNN.
- Web deployment: wrap the recognition function in a FastAPI service for cross-platform access (see the sketch after this list).
- Spoofing detection: analyze spectral features to flag synthesized speech and adversarial samples.
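A minimal FastAPI wrapper sketch, assuming the model, processor, speaker_db, and device objects from the earlier sections are already initialized; the endpoint name and temp-file handling are illustrative, not part of the project:

import tempfile
from fastapi import FastAPI, UploadFile

app = FastAPI()

@app.post('/recognize')
async def recognize_endpoint(file: UploadFile):
    # Save the uploaded WAV to a temp file, then run recognition on it
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        tmp.write(await file.read())
        tmp_path = tmp.name
    name, score = recognize(model, processor, speaker_db, tmp_path, device)
    return {'speaker': name, 'similarity': float(score)}

Serve it with, e.g., uvicorn app:app.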
VIII. Summary
With the implementation above, the system can efficiently perform voiceprint feature extraction, model training, and recognition, making it suitable for scenarios such as access control and identity verification. For further improvement, meta-learning or multimodal fusion could be introduced to increase robustness against interference.