DSP 项目4 - DTW孤立词语音识别代码模板

🎤 DSP 项目4 - DTW孤立词语音识别代码模板

▼📄 extract_label.m（辅助函数 - 标签提取）

function label = extract_label(filename)
%EXTRACT_LABEL Derive the spoken-digit label (0-9) from an audio file name.
%   label = extract_label(filename)
%
%   The base name (directory and extension stripped) is examined in two
%   passes:
%     1. Case-insensitive search for an English digit word ('zero'..'nine')
%        anywhere in the name. Words are tried in numeric order and the
%        first hit wins, e.g. 'zero-1.wav' -> 0, 'one_2.wav' -> 1.
%     2. Otherwise, if the first character is a decimal digit, that single
%        digit is the label, e.g. '0.wav' -> 0.
%   Returns -1 when neither rule applies.

label = -1;   % "no label" until one of the rules below matches

[~, base_name, ~] = fileparts(filename);
base_lower = lower(base_name);

% Pass 1: English digit words, in the order zero..nine.
words = {'zero','one','two','three','four','five','six','seven','eight','nine'};
has_word = cellfun(@(w) ~isempty(strfind(base_lower, w)), words);
first_hit = find(has_word, 1, 'first');
if ~isempty(first_hit)
    label = first_hit - 1;   % cell index 1..10 -> digit 0..9
    return;
end

% Pass 2: a leading Arabic digit.
if isempty(base_name)
    return;
end
lead = base_name(1);
if lead >= '0' && lead <= '9'
    label = str2double(lead);
end
end

▼📄 train_template.m（模板训练 - 必须先运行）

%% train_template.m - Build the DTW template library from training audio
% Purpose : read every *.wav file in train/, extract MFCC features and keep,
%           for each digit 0-9, the "centre" sample (the one whose summed
%           DTW distance to all other samples of that digit is smallest).
% Output  : template.mat containing the 1x10 cell array 'template'
%           (template{d+1} holds the feature matrix for digit d, or [] when
%           the digit has no usable sample).
% Usage   : >> train_template
% Requires: extract_label, ex3_mfcc, ex4_dtw and ex6_vad on the path.
% Compat  : MATLAB R2014b or later

clear; clc; close all;

train_dir = 'train';
output_file = 'template.mat';
target_fs = 16000;

% MFCC analysis parameters (must match the ones used at recognition time)
params = struct();
params.frame_len_ms = 25;
params.frame_shift_ms = 10;
params.pre_emphasis = 0.97;
params.nfft = 2048;
params.num_filters = 32;
params.num_ceps = 12;
params.lifter_coef = 22;
params.delta_N = 2;

fprintf('DTW模板训练程序\n');
fprintf('训练目录: %s\n', train_dir);

% Fail early instead of silently producing an empty template library.
if ~exist(train_dir, 'dir')
    error('找不到训练目录: %s', train_dir);
end

wav_files = dir(fullfile(train_dir, '*.wav'));
fprintf('找到 %d 个音频文件\n', length(wav_files));

%% Group the file names by digit label
digit_files = cell(10, 1);
for i = 1:length(wav_files)
    label = extract_label(wav_files(i).name);
    if label >= 0 && label <= 9
        digit_files{label + 1} = [digit_files{label + 1}; {wav_files(i).name}];
    end
end

%% Extract MFCC features for every usable sample
all_features = cell(10, 1);
for digit = 0:9
    files = digit_files{digit + 1};
    num_samples = length(files);
    features = cell(num_samples, 1);
    valid_count = 0;
    
    for k = 1:num_samples
        filepath = fullfile(train_dir, files{k});
        try
            [audio, fs] = audioread(filepath);
            if size(audio, 2) > 1
                audio = audio(:, 1);          % keep the first channel only
            end
            if fs ~= target_fs
                audio = resample(audio, target_fs, fs);
            end
            audio_vad = ex6_vad(audio, target_fs);
            % Ignore clips shorter than 50 ms after endpoint detection
            if length(audio_vad) >= target_fs * 0.05
                sample_feat = ex3_mfcc(audio_vad, target_fs, params);
                valid_count = valid_count + 1;
                features{valid_count} = sample_feat;
            end
        catch err
            % Best effort: one bad file must not abort training, but the
            % reason is reported so the failure can be diagnosed.
            warning('train_template:sampleSkipped', '跳过 %s: %s', filepath, err.message);
        end
    end
    all_features{digit + 1} = features(1:valid_count);
    fprintf('数字 %d: %d 个有效样本\n', digit, valid_count);
end

%% Pick the centre sample of each digit as its template
template = cell(1, 10);
for digit = 0:9
    samples = all_features{digit + 1};
    num_valid = length(samples);
    
    if num_valid == 0
        fprintf('警告: 数字 %d 没有有效样本!\n', digit);
        template{digit + 1} = [];
    elseif num_valid == 1
        template{digit + 1} = samples{1};
    else
        % Sum of DTW distances from each sample to all the others
        dist_sum = zeros(num_valid, 1);
        for i = 1:num_valid
            for j = 1:num_valid
                if i ~= j
                    dist_sum(i) = dist_sum(i) + ex4_dtw(samples{i}, samples{j}, true);
                end
            end
        end
        % The sample with the smallest total distance is the centre
        [~, center_idx] = min(dist_sum);
        template{digit + 1} = samples{center_idx};
    end
end

% Do not overwrite an existing template file with a useless empty library.
if all(cellfun(@isempty, template))
    error('没有任何有效的训练样本，未生成 %s', output_file);
end

save(output_file, 'template');
fprintf('\n模板已保存到: %s\n', output_file);

▼📄 ex0_all.m（主程序 - 无需修改）

%% ex0_all.m - Main script for DTW isolated-word (digit) recognition
% Usage  : run train_template.m first, then run this script.
% Tuning : set audio_dir to 'test' or 'train' to compare results.
% Flow   : load template.mat -> for each *.wav in audio_dir: read, keep the
%          first channel, resample to target_fs, endpoint-detect, extract
%          MFCC, classify with ex5_recognize -> print per-file results and
%          the overall recognition rate.
% Requires: extract_label, ex3_mfcc, ex5_recognize and ex6_vad on the path.
% Compat : MATLAB R2014b or later

clear; clc; close all;

%% Configuration
audio_dir = 'test';   % directory to evaluate; change to 'train' to compare
target_fs = 16000;    % target sampling rate in Hz

% MFCC analysis parameters (must match the ones used for training)
params = struct();
params.frame_len_ms = 25;
params.frame_shift_ms = 10;
params.pre_emphasis = 0.97;
params.nfft = 2048;
params.num_filters = 32;
params.num_ceps = 12;
params.lifter_coef = 22;
params.delta_N = 2;

fprintf('============================================\n');
fprintf('    DTW 孤立词语音识别系统\n');
fprintf('============================================\n');
fprintf('测试目录: %s\n\n', audio_dir);

%% Load the template library
if ~exist('template.mat', 'file')
    error('找不到 template.mat！请先运行 train_template.m 生成模板。');
end
load('template.mat', 'template');
if ~exist('template', 'var')
    error('template.mat 中缺少变量 template！请重新运行 train_template.m。');
end
templates = template;
% Without at least one non-empty template every file would be reported as
% the same digit, which yields a meaningless accuracy figure.
if ~iscell(templates) || all(cellfun(@isempty, templates(:)))
    error('template.mat 中没有有效模板！请重新运行 train_template.m。');
end
fprintf('已加载模板文件\n\n');

%% Batch recognition
wav_files = dir(fullfile(audio_dir, '*.wav'));
correct = 0;
total = 0;

fprintf('开始识别...\n');
fprintf('--------------------------------------------\n');

for i = 1:length(wav_files)
    filepath = fullfile(audio_dir, wav_files(i).name);
    
    % Read the audio; an unreadable file is skipped rather than aborting
    % the whole batch. Only the read is guarded so that errors in the
    % processing functions still surface normally.
    try
        [audio, fs] = audioread(filepath);
    catch err
        fprintf('%s: 无法读取音频，跳过 (%s)\n', wav_files(i).name, err.message);
        continue;
    end
    if size(audio, 2) > 1
        audio = audio(:, 1);              % keep the first channel only
    end
    
    % Resample to the common rate
    if fs ~= target_fs
        audio = resample(audio, target_fs, fs);
        fs = target_fs;
    end
    
    % Endpoint detection; skip clips shorter than 50 ms
    audio_vad = ex6_vad(audio, fs);
    if length(audio_vad) < fs * 0.05
        fprintf('%s: 音频太短，跳过\n', wav_files(i).name);
        continue;
    end
    
    % MFCC feature extraction
    test_feat = ex3_mfcc(audio_vad, fs, params);
    
    % DTW template matching
    [result, scores, min_score] = ex5_recognize(test_feat, templates);
    
    % Compare with the label encoded in the file name and update counters;
    % files without a recognisable label are not counted.
    true_label = extract_label(wav_files(i).name);
    if true_label >= 0
        total = total + 1;
        if result == true_label
            correct = correct + 1;
            mark = 'OK';
        else
            mark = 'X';
        end
        fprintf('%s: 真实=%d, 识别=%d [%s]\n', wav_files(i).name, true_label, result, mark);
    end
end

%% Summary
fprintf('--------------------------------------------\n');
if total > 0
    accuracy = correct / total * 100;
    fprintf('识别率: %.1f%% (%d/%d)\n', accuracy, correct, total);
    
    if strcmp(audio_dir, 'test') && accuracy < 50
        fprintf('\n提示: test目录识别率低是正常现象（说话人不同）\n');
        fprintf('可将 audio_dir 改为 ''train'' 验证代码正确性\n');
    end
else
    fprintf('未找到可识别的音频文件\n');
end
fprintf('============================================\n');

▼📄 ex1_preemphasis.m（预加重滤波）

function y = ex1_preemphasis(x, alpha)
%EX1_PREEMPHASIS First-order pre-emphasis filter.
%   y = ex1_preemphasis(x, alpha)
%   Difference equation: y[n] = x[n] - alpha * x[n-1], with x[0] taken as 0,
%   so y(1) equals x(1).
%
%   Inputs:
%       x     - input speech signal (any vector; processed as a column)
%       alpha - pre-emphasis coefficient, default 0.97
%   Output:
%       y     - pre-emphasised signal, column vector of the same length as x

if nargin < 2
    alpha = 0.97;
end

x = x(:);

% FIR filter with numerator b = [1, -alpha] and denominator a = 1
y = filter([1, -alpha], 1, x);

end

▼📄 ex2_enframe.m（信号分帧）

function frames = ex2_enframe(x, frame_len, frame_shift)
%EX2_ENFRAME Split a signal into overlapping frames.
%   frames = ex2_enframe(x, frame_len, frame_shift)
%
%   Inputs:
%       x           - input signal (any vector; processed as a column)
%       frame_len   - frame length in samples (positive integer)
%       frame_shift - hop between frame starts in samples (positive integer)
%   Output:
%       frames      - matrix [frame_len x num_frames], one frame per column
%
%   Only complete frames are produced; trailing samples that do not fill a
%   whole frame are dropped. An error is raised when the signal is shorter
%   than one frame or when frame_len / frame_shift is not a positive integer.

x = x(:);
N = length(x);

% A zero or fractional hop/length would give a meaningless frame count or
% invalid indices below, so reject it explicitly.
if frame_len < 1 || frame_shift < 1 || ...
        frame_len ~= floor(frame_len) || frame_shift ~= floor(frame_shift)
    error('帧长和帧移必须为正整数');
end

% Number of complete frames that fit into the signal
num_frames = floor((N - frame_len) / frame_shift) + 1;

if num_frames < 1
    error('信号太短，无法分帧');
end

frames = zeros(frame_len, num_frames);

for i = 1:num_frames
    % 1-based index of the first sample of frame i
    start_idx = (i - 1) * frame_shift + 1;
    frames(:, i) = x(start_idx : start_idx + frame_len - 1);
end

end

▼📄 ex3_mfcc.m（MFCC特征提取）

function [mfcc_all, mfcc_base] = ex3_mfcc(x, fs, params)
%EX3_MFCC Compute MFCC features with Delta and Delta-Delta coefficients.
%   [mfcc_all, mfcc_base] = ex3_mfcc(x, fs, params)
%
%   Inputs:
%       x      - speech signal (vector)
%       fs     - sampling rate in Hz
%       params - optional struct; missing fields take these defaults:
%                frame_len_ms = 25, frame_shift_ms = 10, pre_emphasis = 0.97,
%                nfft = 2048 (must be even), num_filters = 32, num_ceps = 12,
%                lifter_coef = 22, delta_N = 2
%   Outputs:
%       mfcc_all  - [3*num_ceps x num_frames]: static MFCC, Delta and
%                   Delta-Delta stacked vertically (36 rows with defaults)
%       mfcc_base - [num_ceps x num_frames]: liftered static MFCC only
%
%   Pipeline: pre-emphasis -> framing -> periodic Hann window -> FFT power
%   spectrum -> Mel filterbank -> log -> DCT -> liftering -> deltas.

%% Parameter defaults
if nargin < 3, params = struct(); end
if ~isfield(params, 'frame_len_ms'), params.frame_len_ms = 25; end
if ~isfield(params, 'frame_shift_ms'), params.frame_shift_ms = 10; end
if ~isfield(params, 'pre_emphasis'), params.pre_emphasis = 0.97; end
if ~isfield(params, 'nfft'), params.nfft = 2048; end
if ~isfield(params, 'num_filters'), params.num_filters = 32; end
if ~isfield(params, 'num_ceps'), params.num_ceps = 12; end
if ~isfield(params, 'lifter_coef'), params.lifter_coef = 22; end
if ~isfield(params, 'delta_N'), params.delta_N = 2; end

%% Step 1: pre-emphasis
x_emph = ex1_preemphasis(x, params.pre_emphasis);

%% Step 2: framing
frame_len = round(params.frame_len_ms * fs / 1000);
frame_shift = round(params.frame_shift_ms * fs / 1000);
frames = ex2_enframe(x_emph, frame_len, frame_shift);
num_frames = size(frames, 2);

%% Step 3: windowing (periodic Hann window)
% Written out explicitly: w[n] = 0.5 * (1 - cos(2*pi*n / L)), n = 0..L-1
win = 0.5 * (1 - cos(2 * pi * (0:frame_len - 1)' / frame_len));

% Scale every column (frame) by the window; bsxfun keeps R2014b compatibility
frames_win = bsxfun(@times, frames, win);

%% Step 4: FFT and power spectrum
nfft = params.nfft;
if frame_len > nfft
    % fft would silently truncate each frame to nfft samples
    error('帧长 (%d) 大于 nfft (%d)', frame_len, nfft);
end

% Zero-padded FFT of each frame (along the first dimension)
X = fft(frames_win, nfft, 1);

% One-sided power spectrum: P = (1/nfft) * |X(1:nfft/2+1)|^2
pow_spec = (abs(X(1:nfft/2 + 1, :)) .^ 2) / nfft;

%% Step 5: Mel filterbank
num_filters = params.num_filters;
mel_bank = create_mel_filterbank(fs, nfft, num_filters, 0, fs/2);
mel_energy = mel_bank * pow_spec;
mel_energy = max(mel_energy, eps);  % avoid log(0)

%% Step 6: logarithm
log_mel = log(mel_energy);

%% Step 7: DCT
% Column-wise DCT of the log Mel energies (one column per frame)
dct_coef = dct(log_mel);

% Keep coefficients 2..(num_ceps+1); the first (energy-like) one is dropped
num_ceps = params.num_ceps;
if num_ceps + 1 > num_filters
    error('num_ceps + 1 不能超过 num_filters');
end
mfcc_base = dct_coef(2:num_ceps + 1, :);

%% Step 8: liftering
L = params.lifter_coef;
n = (1:num_ceps)';
lifter = 1 + (L/2) * sin(pi * n / L);
mfcc_base = mfcc_base .* repmat(lifter, 1, num_frames);

%% Step 9: Delta and Delta-Delta
delta = compute_delta(mfcc_base, params.delta_N);
delta_delta = compute_delta(delta, params.delta_N);

%% Stack static and dynamic features
mfcc_all = [mfcc_base; delta; delta_delta];

end

%% ==================== 辅助函数 ====================

function mel_bank = create_mel_filterbank(fs, nfft, num_filters, low_freq, high_freq)
%CREATE_MEL_FILTERBANK Build a bank of triangular Mel-spaced filters.
%   mel_bank = create_mel_filterbank(fs, nfft, num_filters, low_freq, high_freq)
%
%   Returns a [num_filters x (nfft/2+1)] matrix. Row f is a triangle that
%   rises from 0 at edge f to 1 at edge f+1 and falls back to 0 at edge f+2,
%   where the num_filters+2 edges are equally spaced on the Mel scale
%   between low_freq and high_freq (both in Hz) and mapped to FFT bins.

% Edge frequencies: Hz -> Mel, uniform spacing, Mel -> Hz
mel_lo = 2595 * log10(1 + low_freq / 700);
mel_hi = 2595 * log10(1 + high_freq / 700);
mel_edges = linspace(mel_lo, mel_hi, num_filters + 2);
hz_edges = 700 * (10 .^ (mel_edges / 2595) - 1);

% Map each edge to a 1-based FFT bin, clamped to the one-sided spectrum
last_bin = nfft / 2 + 1;
edge_bin = floor(hz_edges / fs * nfft) + 1;
edge_bin = min(edge_bin, last_bin);
edge_bin = max(edge_bin, 1);

mel_bank = zeros(num_filters, last_bin);

for f = 1:num_filters
    lo = edge_bin(f);
    mid = edge_bin(f + 1);
    hi = edge_bin(f + 2);

    % Rising slope lo..mid (skipped when the two edges share a bin)
    if mid ~= lo
        rise = lo:mid;
        mel_bank(f, rise) = (rise - lo) / (mid - lo);
    end

    % Falling slope mid..hi (skipped when the two edges share a bin)
    if hi ~= mid
        fall = mid:hi;
        mel_bank(f, fall) = (hi - fall) / (hi - mid);
    end
end

end

function delta = compute_delta(feat, N)
%COMPUTE_DELTA Regression-based delta (difference) coefficients.
%   delta = compute_delta(feat, N)
%
%   feat is [num_coeffs x num_frames]. For every frame t the result is
%       sum_{n=1..N} n * (feat(:, t+n) - feat(:, t-n)) / (2 * sum_{n=1..N} n^2)
%   where frame indices outside 1..num_frames are clamped to the nearest
%   valid frame. The output has the same size as feat.

[num_rows, num_cols] = size(feat);
frame_idx = 1:num_cols;

% Accumulate the weighted differences for all frames at once, one lag at
% a time.
weighted_diff = zeros(num_rows, num_cols);
for lag = 1:N
    ahead = min(frame_idx + lag, num_cols);   % clamp at the last frame
    behind = max(frame_idx - lag, 1);         % clamp at the first frame
    weighted_diff = weighted_diff + lag * (feat(:, ahead) - feat(:, behind));
end

delta = weighted_diff / (2 * sum((1:N) .^ 2));

end

▼📄 ex4_dtw.m（DTW动态规整）

function [dist, D, path] = ex4_dtw(template, test, normalize_flag)
%EX4_DTW Dynamic time warping distance between two feature sequences.
%   [dist, D, path] = ex4_dtw(template, test, normalize_flag)
%
%   Inputs:
%       template       - [dim x n] feature matrix, one frame per column
%       test           - [dim x m] feature matrix, one frame per column
%       normalize_flag - when true (default), the per-row mean over time is
%                        subtracted from each sequence before matching
%   Outputs:
%       dist - accumulated distance of the best alignment, D(n+1, m+1)
%       D    - [(n+1) x (m+1)] accumulated-distance matrix; D(i+1, j+1)
%              belongs to template frame i and test frame j, row 1 and
%              column 1 are the boundary
%       path - [K x 2] list of aligned (template, test) frame index pairs,
%              computed only when requested; [] otherwise
%
%   Recurrence: D(i,j) = d(i,j) + min{D(i-1,j), D(i,j-1), D(i-1,j-1)}
%   with d(i,j) the Euclidean distance between the two frames.
%   Raises an error when the feature dimensions differ.

if nargin < 3
    normalize_flag = true;
end

[dim1, n] = size(template);
[dim2, m] = size(test);

if dim1 ~= dim2
    error('特征维度不匹配');
end

% Mean normalisation (remove the per-coefficient average over time)
if normalize_flag
    template = template - repmat(mean(template, 2), 1, n);
    test = test - repmat(mean(test, 2), 1, m);
end

%% Local distance matrix
local_dist = zeros(n, m);
for i = 1:n
    % Euclidean distance from template frame i to every test frame; the
    % explicit dimension keeps this correct for 1-D features as well.
    frame_diff = bsxfun(@minus, test, template(:, i));
    local_dist(i, :) = sqrt(sum(frame_diff .^ 2, 1));
end

%% Accumulated distance matrix
% (n+1) x (m+1), all inf so that paths cannot enter through the boundary
D = inf(n + 1, m + 1);

% Boundary condition: the alignment starts before the first frame pair
D(1, 1) = 0;

%% Dynamic programming fill
for i = 1:n
    for j = 1:m
        cost = local_dist(i, j);
        
        % Accumulated distances of the three allowed predecessors
        d1 = D(i, j + 1);      % from above    (i-1, j)
        d2 = D(i + 1, j);      % from the left (i, j-1)
        d3 = D(i, j);          % diagonal      (i-1, j-1)
        
        % Recurrence
        D(i + 1, j + 1) = cost + min([d1, d2, d3]);
    end
end

% Final DTW distance
dist = D(n + 1, m + 1);

% Optional path backtracking
if nargout >= 3
    path = backtrack(D);
else
    path = [];
end

end

function path = backtrack(D)
%BACKTRACK Recover the warping path from an accumulated-distance matrix.
%   path = backtrack(D)
%
%   D is the (n+1) x (m+1) matrix in which D(r+1, c+1) is the accumulated
%   distance of frame pair (r, c). Starting at (n, m), the walk repeatedly
%   moves to the allowed predecessor with the smallest accumulated distance
%   (ties resolved in the order diagonal, up, left) until (1, 1) is reached.
%   The result is a [K x 2] list of (r, c) pairs ordered from start to end;
%   it is empty (0 x 2) when D has no frame pairs.

[num_rows, num_cols] = size(D);
trail = zeros(num_rows + num_cols, 2);   % upper bound on the path length
count = 0;
r = num_rows - 1;
c = num_cols - 1;

% Candidate moves in tie-break priority: diagonal, up, left
moves = [-1 -1; -1 0; 0 -1];

while r > 0 && c > 0
    count = count + 1;
    trail(count, :) = [r, c];

    if r == 1 && c == 1
        break;
    end

    % Which moves stay inside the grid, and what each predecessor costs
    can_go_up = r > 1;
    can_go_left = c > 1;
    allowed = [can_go_up && can_go_left; can_go_up; can_go_left];
    pred_cost = [D(r, c); D(r, c + 1); D(r + 1, c)];

    options = moves(allowed, :);
    [~, pick] = min(pred_cost(allowed));
    r = r + options(pick, 1);
    c = c + options(pick, 2);
end

path = flipud(trail(1:count, :));
end

▼📄 ex5_recognize.m（识别决策 - 已完成）

function [result, scores, min_score] = ex5_recognize(test_feat, templates)
%EX5_RECOGNIZE Nearest-template classification by DTW distance.
%   [result, scores, min_score] = ex5_recognize(test_feat, templates)
%
%   Inputs:
%       test_feat - feature matrix of the utterance to classify
%       templates - cell array of template feature matrices; entry i
%                   represents label i-1, empty entries are skipped
%   Outputs:
%       result    - label (index - 1) of the template with the smallest
%                   DTW distance, or -1 when no template yields a finite
%                   distance (all entries empty, or an empty template list)
%       scores    - 1 x numel(templates) DTW distances; inf for empty entries
%       min_score - smallest entry of scores (inf when there are no templates)

num_templates = numel(templates);
scores = inf(1, num_templates);   % empty templates keep an infinite score

for i = 1:num_templates
    if ~isempty(templates{i})
        scores(i) = ex4_dtw(templates{i}, test_feat, true);
    end
end

% No candidates at all: report "unknown" instead of an empty result
if num_templates == 0
    result = -1;
    min_score = inf;
    return;
end

[min_score, min_idx] = min(scores);
if isfinite(min_score)
    result = min_idx - 1;  % convert cell index 1..K to label 0..K-1
else
    % Every template was empty or unusable; do not pretend it was label 0
    result = -1;
end

end

▼📄 ex6_vad.m（端点检测 - 已完成）

function [y, start_idx, end_idx] = ex6_vad(x, fs)
%EX6_VAD Endpoint detection from short-time energy (two thresholds).
%   [y, start_idx, end_idx] = ex6_vad(x, fs)
%
%   The signal is cut into 25 ms frames with a 10 ms hop and the frame
%   energies are normalised by their maximum. Frames above the high
%   threshold (0.15) mark speech; if none exist, the low threshold (0.05)
%   is used instead. The region is then grown outwards while neighbouring
%   frames stay above the low threshold, and padded by 3 frames per side.
%
%   Outputs:
%       y         - x(start_idx:end_idx) as a column vector
%       start_idx - first retained sample index
%       end_idx   - last retained sample index
%   When the signal is shorter than one frame, has no energy, or no frame
%   exceeds either threshold, the whole signal is returned (1 .. length(x)).

x = x(:);
N = length(x);

% Fallback result used by every early exit below: keep the entire signal.
y = x;
start_idx = 1;
end_idx = N;

win_len = round(0.025 * fs);
hop = round(0.010 * fs);
hi_thr = 0.15;
lo_thr = 0.05;
pad_frames = 3;

num_frames = floor((N - win_len) / hop) + 1;
if num_frames < 1
    return;
end

% Short-time energy of each frame
energy = zeros(num_frames, 1);
for f = 1:num_frames
    first = (f - 1) * hop + 1;
    seg = x(first : first + win_len - 1);
    energy(f) = sum(seg .^ 2);
end

% Normalise by the peak energy
peak = max(energy);
if peak <= 0
    return;
end
rel_energy = energy / peak;

% Speech frames: high threshold first, low threshold as a fallback
above_low = rel_energy > lo_thr;
active = rel_energy > hi_thr;
if ~any(active)
    active = above_low;
end
if ~any(active)
    return;
end

first_active = find(active, 1, 'first');
last_active = find(active, 1, 'last');

% Grow backwards: start right after the last frame before the speech
% region that is not above the low threshold (or at frame 1 if none).
gap_before = find(~above_low(1:first_active - 1), 1, 'last');
if isempty(gap_before)
    start_frame = 1;
else
    start_frame = gap_before + 1;
end

% Grow forwards: stop right before the first frame after the speech
% region that is not above the low threshold (or at the last frame).
gap_after = find(~above_low(last_active + 1:end), 1, 'first');
if isempty(gap_after)
    end_frame = num_frames;
else
    end_frame = last_active + gap_after - 1;
end

% Safety margin on both sides
start_frame = max(1, start_frame - pad_frames);
end_frame = min(num_frames, end_frame + pad_frames);

% Frame indices -> sample indices
start_idx = (start_frame - 1) * hop + 1;
end_idx = min(N, (end_frame - 1) * hop + win_len);

y = x(start_idx : end_idx);

end