function label = extract_label(filename)
%EXTRACT_LABEL Derive the digit label (0-9) from an audio file name.
%   label = extract_label(filename)
%
%   Accepted naming schemes include '0.wav', 'zero-1.wav', 'one_2.wav'.
%   The directory and extension are stripped first. The lower-cased base
%   name is then searched for an English digit word ('zero' ... 'nine');
%   the first word found (in numeric order) decides the label. Only when no
%   word is found is the first character of the base name interpreted as
%   an Arabic digit.
%
%   Input:
%       filename - file name or path (char vector)
%   Output:
%       label    - digit 0..9, or -1 when no label can be derived

    [~, stem, ~] = fileparts(filename);
    stem_lc = lower(stem);

    % Digit words, stored so that words{k} names the digit k-1.
    words = {'zero', 'one', 'two', 'three', 'four', ...
             'five', 'six', 'seven', 'eight', 'nine'};

    % Default result: nothing recognised.
    label = -1;

    % Pass 1: substring search for an English digit word.
    idx = 0;
    while idx < numel(words)
        idx = idx + 1;
        if ~isempty(strfind(stem_lc, words{idx}))
            label = idx - 1;
            return;
        end
    end

    % Pass 2: leading Arabic digit.
    if isempty(stem)
        return;
    end
    lead = stem(1);
    if lead >= '0' && lead <= '9'
        label = lead - '0';
    end
end
%% train_template.m - Build the DTW template library from training audio
% Purpose: read the audio files in train/, extract MFCC features and keep,
%          for every digit, the sample whose summed DTW distance to all
%          other samples of that digit is smallest (center-sample method).
% Usage:   >> train_template
% Compat:  MATLAB R2014b or later

clear; clc; close all;

train_dir = 'train';
output_file = 'template.mat';
target_fs = 16000;

% Feature-extraction settings passed to ex3_mfcc.
params = struct();
params.frame_len_ms = 25;
params.frame_shift_ms = 10;
params.pre_emphasis = 0.97;
params.nfft = 2048;
params.num_filters = 32;
params.num_ceps = 12;
params.lifter_coef = 22;
params.delta_N = 2;

fprintf('DTW模板训练程序\n');
fprintf('训练目录: %s\n', train_dir);

wav_files = dir(fullfile(train_dir, '*.wav'));
fprintf('找到 %d 个音频文件\n', length(wav_files));

%% Group the file names by digit
% digit_files{d + 1} collects the names labelled as digit d; files whose
% label is outside 0..9 (extract_label returns -1) are ignored.
digit_files = cell(10, 1);
for i = 1:length(wav_files)
    label = extract_label(wav_files(i).name);
    if label >= 0 && label <= 9
        digit_files{label + 1} = [digit_files{label + 1}; {wav_files(i).name}];
    end
end

%% Extract MFCC features
all_features = cell(10, 1);
for digit = 0:9
    files = digit_files{digit + 1};
    num_samples = length(files);
    features = cell(num_samples, 1);
    valid_count = 0;

    for k = 1:num_samples
        filepath = fullfile(train_dir, files{k});
        try
            [audio, fs] = audioread(filepath);
            if size(audio, 2) > 1
                audio = audio(:, 1);            % keep the first channel only
            end
            if fs ~= target_fs
                audio = resample(audio, target_fs, fs);
            end

            audio_vad = ex6_vad(audio, target_fs);

            % Samples shorter than 50 ms after endpoint detection are dropped.
            if length(audio_vad) >= target_fs * 0.05
                % Compute the feature BEFORE bumping the counter: if the
                % extraction throws, no empty slot may be left behind in
                % features(1:valid_count), because an empty entry would
                % later be handed to ex4_dtw.
                feat = ex3_mfcc(audio_vad, target_fs, params);
                valid_count = valid_count + 1;
                features{valid_count} = feat;
            end
        catch err
            % Training stays best-effort, but a skipped file is reported
            % instead of disappearing silently.
            fprintf('警告: 跳过 %s (%s)\n', files{k}, err.message);
        end
    end

    all_features{digit + 1} = features(1:valid_count);
    fprintf('数字 %d: %d 个有效样本\n', digit, valid_count);
end

%% Choose the templates with the center-sample method
template = cell(1, 10);
for digit = 0:9
    samples = all_features{digit + 1};
    num_valid = length(samples);

    if num_valid == 0
        fprintf('警告: 数字 %d 没有有效样本!\n', digit);
        template{digit + 1} = [];
    elseif num_valid == 1
        template{digit + 1} = samples{1};
    else
        % Sum of the DTW distances from each sample to all the others.
        dist_sum = zeros(num_valid, 1);
        for i = 1:num_valid
            for j = 1:num_valid
                if i ~= j
                    dist_sum(i) = dist_sum(i) + ex4_dtw(samples{i}, samples{j}, true);
                end
            end
        end

        % The sample with the smallest sum becomes the template.
        [~, center_idx] = min(dist_sum);
        template{digit + 1} = samples{center_idx};
    end
end

save(output_file, 'template');
fprintf('\n模板已保存到: %s\n', output_file);
%% ex0_all.m - Main script of the DTW isolated-word recogniser
% Usage:  run train_template.m first, then run this script.
%         Set audio_dir to 'test' or 'train' to compare the two sets.
% Compat: MATLAB R2014b or later

clear; clc; close all;

%% Configuration
audio_dir = 'test';      % directory to evaluate; may be changed to 'train'
target_fs = 16000;       % sampling rate every file is converted to

% Feature-extraction settings passed to ex3_mfcc.
params = struct();
params.frame_len_ms = 25;
params.frame_shift_ms = 10;
params.pre_emphasis = 0.97;
params.nfft = 2048;
params.num_filters = 32;
params.num_ceps = 12;
params.lifter_coef = 22;
params.delta_N = 2;

fprintf('============================================\n');
fprintf(' DTW 孤立词语音识别系统\n');
fprintf('============================================\n');
fprintf('测试目录: %s\n\n', audio_dir);

%% Load the templates
if exist('template.mat', 'file') == 0
    error('找不到 template.mat!请先运行 train_template.m 生成模板。');
end
load('template.mat', 'template');
templates = template;
fprintf('已加载模板文件\n\n');

%% Batch recognition
wav_files = dir(fullfile(audio_dir, '*.wav'));
num_files = length(wav_files);
correct = 0;
total = 0;

fprintf('开始识别...\n');
fprintf('--------------------------------------------\n');

for i = 1:num_files
    wav_name = wav_files(i).name;
    filepath = fullfile(audio_dir, wav_name);

    % Read the file and keep the first channel only.
    [audio, fs] = audioread(filepath);
    if size(audio, 2) > 1
        audio = audio(:, 1);
    end

    % Bring the signal to the common sampling rate.
    if fs ~= target_fs
        audio = resample(audio, target_fs, fs);
        fs = target_fs;
    end

    % Endpoint detection; anything shorter than 50 ms is skipped.
    audio_vad = ex6_vad(audio, fs);
    if length(audio_vad) < fs * 0.05
        fprintf('%s: 音频太短,跳过\n', wav_name);
        continue;
    end

    % Features and template matching.
    test_feat = ex3_mfcc(audio_vad, fs, params);
    [result, scores, min_score] = ex5_recognize(test_feat, templates);

    % Only files whose name yields a label take part in the statistics.
    true_label = extract_label(wav_name);
    if true_label < 0
        continue;
    end

    total = total + 1;
    if result == true_label
        correct = correct + 1;
        mark = 'OK';
    else
        mark = 'X';
    end
    fprintf('%s: 真实=%d, 识别=%d [%s]\n', wav_name, true_label, result, mark);
end

%% Report
fprintf('--------------------------------------------\n');
if total > 0
    accuracy = correct / total * 100;
    fprintf('识别率: %.1f%% (%d/%d)\n', accuracy, correct, total);

    % Hint shown when the test set scores below 50 %.
    if strcmp(audio_dir, 'test') && accuracy < 50
        fprintf('\n提示: test目录识别率低是正常现象(说话人不同)\n');
        fprintf('可将 audio_dir 改为 ''train'' 验证代码正确性\n');
    end
else
    fprintf('未找到可识别的音频文件\n');
end
fprintf('============================================\n');
function y = ex1_preemphasis(x, alpha)
%EX1_PREEMPHASIS Pre-emphasis filter.
%   y = ex1_preemphasis(x, alpha)
%
%   Applies the first-order FIR filter
%       y[n] = x[n] - alpha * x[n-1]
%   with x[0] taken as zero, so y(1) equals x(1).
%
%   Inputs:
%       x     - input signal (any vector shape; processed as a column)
%       alpha - pre-emphasis coefficient, default 0.97
%   Output:
%       y     - filtered signal, column vector of the same length as x

    if nargin < 2
        alpha = 0.97;
    end

    x = x(:);

    % Numerator [1, -alpha] with denominator 1 realises the difference
    % equation above.
    y = filter([1, -alpha], 1, x);
end
function frames = ex2_enframe(x, frame_len, frame_shift)
%EX2_ENFRAME Split a signal into overlapping frames.
%   frames = ex2_enframe(x, frame_len, frame_shift)
%
%   Inputs:
%       x           - input signal (any vector shape; processed as a column)
%       frame_len   - frame length in samples
%       frame_shift - hop between consecutive frame starts in samples
%   Output:
%       frames      - matrix [frame_len x num_frames]; column i holds the
%                     samples (i-1)*frame_shift+1 ... (i-1)*frame_shift+frame_len
%
%   Only complete frames are produced; trailing samples that do not fill a
%   whole frame are dropped. An error is raised when the signal is shorter
%   than one frame.

    x = x(:);
    N = length(x);

    % Number of complete frames that fit into the signal.
    num_frames = floor((N - frame_len) / frame_shift) + 1;

    if num_frames < 1
        error('信号太短,无法分帧');
    end

    frames = zeros(frame_len, num_frames);
    for i = 1:num_frames
        % 1-based index of the first sample of frame i.
        start_idx = (i - 1) * frame_shift + 1;
        frames(:, i) = x(start_idx : start_idx + frame_len - 1);
    end
end
function [mfcc_all, mfcc_base] = ex3_mfcc(x, fs, params)
%EX3_MFCC Compute MFCC features with delta and delta-delta coefficients.
%   [mfcc_all, mfcc_base] = ex3_mfcc(x, fs, params)
%
%   Inputs:
%       x      - speech signal (vector)
%       fs     - sampling rate in Hz
%       params - optional struct; missing fields fall back to
%                frame_len_ms = 25, frame_shift_ms = 10, pre_emphasis = 0.97,
%                nfft = 2048, num_filters = 32, num_ceps = 12,
%                lifter_coef = 22, delta_N = 2
%   Outputs:
%       mfcc_all  - [3*num_ceps x num_frames]: static, delta and delta-delta
%                   coefficients stacked (36 rows with the defaults)
%       mfcc_base - [num_ceps x num_frames]: liftered static coefficients
%
%   An error is raised when num_ceps + 1 exceeds num_filters, because the
%   cepstral orders 1..num_ceps are taken from a num_filters-point DCT.

    %% Parameter defaults
    if nargin < 3
        params = struct();
    end
    defaults = struct('frame_len_ms', 25, 'frame_shift_ms', 10, ...
                      'pre_emphasis', 0.97, 'nfft', 2048, ...
                      'num_filters', 32, 'num_ceps', 12, ...
                      'lifter_coef', 22, 'delta_N', 2);
    names = fieldnames(defaults);
    for k = 1:numel(names)
        if ~isfield(params, names{k})
            params.(names{k}) = defaults.(names{k});
        end
    end

    %% Step 1: pre-emphasis
    x_emph = ex1_preemphasis(x, params.pre_emphasis);

    %% Step 2: framing
    frame_len = round(params.frame_len_ms * fs / 1000);
    frame_shift = round(params.frame_shift_ms * fs / 1000);
    frames = ex2_enframe(x_emph, frame_len, frame_shift);
    num_frames = size(frames, 2);

    %% Step 3: windowing (periodic Hann window)
    % Written out explicitly so no toolbox window function is needed:
    % w[n] = 0.5 - 0.5*cos(2*pi*n/frame_len), n = 0..frame_len-1.
    win = 0.5 - 0.5 * cos(2 * pi * (0:frame_len - 1)' / frame_len);
    % repmat keeps the code usable on releases without implicit expansion.
    frames_win = frames .* repmat(win, 1, num_frames);

    %% Step 4: FFT and power spectrum
    nfft = params.nfft;
    X = fft(frames_win, nfft);                 % column-wise, zero-padded to nfft
    % Non-negative frequencies only: P = |X|^2 / nfft.
    pow_spec = (1 / nfft) * abs(X(1:nfft/2 + 1, :)).^2;

    %% Step 5: Mel filter bank
    num_filters = params.num_filters;
    mel_bank = create_mel_filterbank(fs, nfft, num_filters, 0, fs/2);
    mel_energy = mel_bank * pow_spec;
    mel_energy = max(mel_energy, eps);         % avoid log(0)

    %% Step 6: logarithm
    log_mel = log(mel_energy);

    %% Step 7: DCT
    num_ceps = params.num_ceps;
    if num_ceps + 1 > num_filters
        error('num_ceps + 1 不能超过 num_filters');
    end
    % Orthonormal DCT-II along the filter axis, expressed as a matrix product.
    dct_coef = dct2_matrix(num_filters) * log_mel;
    % Row 1 is the order-0 term; keep orders 1..num_ceps (rows 2..num_ceps+1).
    mfcc_base = dct_coef(2:num_ceps + 1, :);

    %% Step 8: liftering
    L = params.lifter_coef;
    % A non-positive coefficient disables liftering; without this guard
    % L = 0 evaluates sin(pi*n/0) and turns every coefficient into NaN.
    if L > 0
        n = (1:num_ceps)';
        lifter = 1 + (L / 2) * sin(pi * n / L);
        mfcc_base = mfcc_base .* repmat(lifter, 1, num_frames);
    end

    %% Step 9: delta and delta-delta
    delta = compute_delta(mfcc_base, params.delta_N);
    delta_delta = compute_delta(delta, params.delta_N);

    %% Stack the outputs
    mfcc_all = [mfcc_base; delta; delta_delta];
end

%% ==================== Helper functions ====================

function C = dct2_matrix(M)
%DCT2_MATRIX Orthonormal DCT-II matrix of size M x M.
%   Row k+1 (k = 0..M-1) holds w(k) * cos(pi * k * (2n + 1) / (2M)) for
%   n = 0..M-1, with w(0) = sqrt(1/M) and w(k) = sqrt(2/M) otherwise.
    k = (0:M - 1)';
    n = 0:M - 1;
    C = sqrt(2 / M) * cos(pi * k * (2 * n + 1) / (2 * M));
    C(1, :) = C(1, :) / sqrt(2);
end

function mel_bank = create_mel_filterbank(fs, nfft, num_filters, low_freq, high_freq)
%CREATE_MEL_FILTERBANK Build a bank of triangular Mel filters.
%   Returns a [num_filters x (nfft/2 + 1)] matrix. The filter edges are
%   spaced uniformly on the Mel scale between low_freq and high_freq (Hz)
%   and mapped to 1-based FFT bin indices clamped to 1..nfft/2+1.
    hz2mel = @(f) 2595 * log10(1 + f / 700);
    mel2hz = @(m) 700 * (10.^(m / 2595) - 1);

    low_mel = hz2mel(low_freq);
    high_mel = hz2mel(high_freq);

    % num_filters triangles need num_filters + 2 edge points.
    mel_points = linspace(low_mel, high_mel, num_filters + 2);
    hz_points = mel2hz(mel_points);

    num_bins = nfft / 2 + 1;
    bin_points = floor(hz_points / fs * nfft) + 1;
    bin_points = min(bin_points, num_bins);
    bin_points = max(bin_points, 1);

    mel_bank = zeros(num_filters, num_bins);

    for i = 1:num_filters
        left = bin_points(i);
        center = bin_points(i + 1);
        right = bin_points(i + 2);

        % Rising edge; skipped when left and center share a bin, which
        % would otherwise divide by zero.
        if center ~= left
            k = left:center;
            mel_bank(i, k) = (k - left) / (center - left);
        end

        % Falling edge; skipped when center and right share a bin.
        if right ~= center
            k = center:right;
            mel_bank(i, k) = (right - k) / (right - center);
        end
    end
end

function delta = compute_delta(feat, N)
%COMPUTE_DELTA Regression-based difference coefficients.
%   delta(:, t) = sum_{n=1..N} n * (feat(:, t+n) - feat(:, t-n))
%                 / (2 * sum_{n=1..N} n^2)
%   Frame indices outside 1..num_frames are clamped to the nearest valid
%   frame.
    [num_ceps, num_frames] = size(feat);
    delta = zeros(num_ceps, num_frames);

    denom = 2 * sum((1:N).^2);

    for t = 1:num_frames
        numerator = zeros(num_ceps, 1);
        for n = 1:N
            t_plus = min(t + n, num_frames);
            t_minus = max(t - n, 1);
            numerator = numerator + n * (feat(:, t_plus) - feat(:, t_minus));
        end
        delta(:, t) = numerator / denom;
    end
end
function [dist, D, path] = ex4_dtw(template, test, normalize_flag)
%EX4_DTW Dynamic time warping between two feature sequences.
%   [dist, D, path] = ex4_dtw(template, test, normalize_flag)
%
%   Recurrence: D(i,j) = d(i,j) + min{D(i-1,j), D(i,j-1), D(i-1,j-1)},
%   where d(i,j) is the Euclidean distance between frame i of the template
%   and frame j of the test sequence.
%
%   Inputs:
%       template       - [dim x n] feature matrix, one frame per column
%       test           - [dim x m] feature matrix, one frame per column
%       normalize_flag - when true (default), the per-row mean over time is
%                        subtracted from each sequence before matching
%   Outputs:
%       dist - accumulated distance of the best path, D(n+1, m+1)
%       D    - [(n+1) x (m+1)] accumulated-distance matrix; D(i+1, j+1)
%              belongs to the frame pair (i, j), row 1 / column 1 are the
%              boundary
%       path - [K x 2] frame pairs (i, j) from (1,1) to (n,m); computed only
%              when a third output is requested, otherwise []
%
%   An error is raised when the two inputs have different numbers of rows.

    if nargin < 3
        normalize_flag = true;
    end

    [dim1, n] = size(template);
    [dim2, m] = size(test);

    if dim1 ~= dim2
        error('特征维度不匹配');
    end

    % Mean normalisation along the time axis.
    if normalize_flag
        template = template - repmat(mean(template, 2), 1, n);
        test = test - repmat(mean(test, 2), 1, m);
    end

    %% Local distance matrix
    local_dist = zeros(n, m);
    for i = 1:n
        for j = 1:m
            % Euclidean distance between the two frames.
            local_dist(i, j) = sqrt(sum((template(:, i) - test(:, j)).^2));
        end
    end

    %% Accumulated distance matrix
    % One extra row and column of inf act as the boundary, so the
    % recurrence needs no special cases; the origin is the only finite
    % starting point.
    D = inf(n + 1, m + 1);
    D(1, 1) = 0;

    %% Dynamic programming
    for i = 1:n
        for j = 1:m
            cost = local_dist(i, j);

            d1 = D(i, j + 1);     % predecessor (i-1, j)
            d2 = D(i + 1, j);     % predecessor (i, j-1)
            d3 = D(i, j);         % predecessor (i-1, j-1)

            D(i + 1, j + 1) = cost + min([d1, d2, d3]);
        end
    end

    % Final DTW distance.
    dist = D(n + 1, m + 1);

    % Path backtracking is done only on request.
    if nargout >= 3
        path = backtrack(D);
    else
        path = [];
    end
end

function path = backtrack(D)
%BACKTRACK Recover the warping path from the accumulated-distance matrix.
%   Walks from the last frame pair back to (1,1), moving at every step to
%   the admissible predecessor with the smallest accumulated distance. On
%   ties the diagonal step is preferred, then the step in i, then the step
%   in j (order in which the candidates are listed).
    [n_plus1, m_plus1] = size(D);

    % Upper bound on the path length; trimmed before returning.
    path = zeros(n_plus1 + m_plus1, 2);
    k = 0;

    i = n_plus1 - 1;
    j = m_plus1 - 1;

    while i > 0 && j > 0
        k = k + 1;
        path(k, :) = [i, j];

        if i == 1 && j == 1
            break;
        end

        % Each row: [next_i, next_j, accumulated distance of that pair].
        % The pair (a, b) is stored at D(a+1, b+1).
        candidates = [];
        if i > 1 && j > 1
            candidates = [candidates; i - 1, j - 1, D(i, j)];
        end
        if i > 1
            candidates = [candidates; i - 1, j, D(i, j + 1)];
        end
        if j > 1
            candidates = [candidates; i, j - 1, D(i + 1, j)];
        end

        if isempty(candidates)
            break;
        end

        [~, idx] = min(candidates(:, 3));
        i = candidates(idx, 1);
        j = candidates(idx, 2);
    end

    % The walk ran backwards; return it in forward order.
    path = flipud(path(1:k, :));
end
function [result, scores, min_score] = ex5_recognize(test_feat, templates)
%EX5_RECOGNIZE Recognise an utterance by DTW template matching.
%   [result, scores, min_score] = ex5_recognize(test_feat, templates)
%
%   Inputs:
%       test_feat - feature matrix of the utterance, passed to ex4_dtw
%       templates - cell array; templates{k} is the template for digit k-1,
%                   or empty when that digit has no template
%   Outputs:
%       result    - digit (index - 1) of the template with the smallest DTW
%                   distance, or -1 when no template produced a finite
%                   distance (e.g. all templates empty, or none supplied)
%       scores    - [1 x numel(templates)] DTW distances; inf for empty
%                   templates
%       min_score - smallest distance, inf when result is -1

    num_templates = numel(templates);

    % Empty templates keep the initial inf and can never win.
    scores = inf(1, num_templates);
    for i = 1:num_templates
        if ~isempty(templates{i})
            scores(i) = ex4_dtw(templates{i}, test_feat, true);
        end
    end

    % Without this guard min() over all-inf scores returns index 1 and the
    % utterance would be reported as digit 0 although nothing was matched.
    result = -1;
    min_score = inf;
    if any(isfinite(scores))
        [min_score, min_idx] = min(scores);
        result = min_idx - 1;       % convert the 1-based index to 0..9
    end
end
function [y, start_idx, end_idx] = ex6_vad(x, fs)
%EX6_VAD Endpoint detection from short-time energy (two thresholds).
%   [y, start_idx, end_idx] = ex6_vad(x, fs)
%
%   Inputs:
%       x  - input signal (any vector shape; processed as a column)
%       fs - sampling rate in Hz
%   Outputs:
%       y         - x(start_idx:end_idx) as a column vector
%       start_idx - first retained sample
%       end_idx   - last retained sample
%
%   Frames of 25 ms with a 10 ms hop are used. Frame energies are divided
%   by the largest frame energy. The region between the first and the last
%   frame above 0.15 is grown outwards while the neighbouring frame stays
%   above 0.05, then widened by 3 frames on each side. The whole signal is
%   returned unchanged when it is shorter than one frame, when the largest
%   energy is not positive, or when no frame exceeds either threshold.

    x = x(:);
    N = length(x);

    % Result used by every early exit below: keep the entire signal.
    y = x;
    start_idx = 1;
    end_idx = N;

    frame_len = round(0.025 * fs);
    frame_shift = round(0.010 * fs);
    thresh_high = 0.15;
    thresh_low = 0.05;
    margin = 3;

    num_frames = floor((N - frame_len) / frame_shift) + 1;
    if num_frames < 1
        return;
    end

    % Short-time energy of every frame.
    frame_starts = (0:num_frames - 1) * frame_shift + 1;
    energy = zeros(num_frames, 1);
    for k = 1:num_frames
        first = frame_starts(k);
        segment = x(first : first + frame_len - 1);
        energy(k) = sum(segment.^2);
    end

    % Normalise by the peak energy.
    max_energy = max(energy);
    if max_energy <= 0
        return;
    end
    energy_norm = energy / max_energy;

    % Frames above the high threshold; fall back to the low threshold when
    % there are none, and give up when that finds nothing either.
    above_low = energy_norm > thresh_low;
    speech_high = energy_norm > thresh_high;
    if ~any(speech_high)
        speech_high = above_low;
    end
    if ~any(speech_high)
        return;
    end

    first_high = find(speech_high, 1, 'first');
    last_high = find(speech_high, 1, 'last');

    % Grow backwards: the region starts right after the nearest earlier
    % frame that is not above the low threshold.
    stop_before = find(~above_low(1:first_high - 1), 1, 'last');
    if isempty(stop_before)
        start_frame = 1;
    else
        start_frame = stop_before + 1;
    end

    % Grow forwards: the region ends right before the nearest later frame
    % that is not above the low threshold.
    stop_after = find(~above_low(last_high + 1:end), 1, 'first');
    if isempty(stop_after)
        end_frame = num_frames;
    else
        end_frame = last_high + stop_after - 1;
    end

    % Safety margin, clamped to the valid frame range.
    start_frame = max(1, start_frame - margin);
    end_frame = min(num_frames, end_frame + margin);

    % Frame numbers back to sample indices.
    start_idx = (start_frame - 1) * frame_shift + 1;
    end_idx = min(N, (end_frame - 1) * frame_shift + frame_len);

    y = x(start_idx : end_idx);
end