#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright 2016-2099 Ailemon.net
#
# This file is part of ASRT Speech Recognition Tool.
#
# ASRT is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# ASRT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ASRT.  If not, see <https://www.gnu.org/licenses/>.
# ============================================================================

"""
@author: nl8590687
Built-in acoustic feature extraction module of ASRT speech recognition.
Defines several commonly used acoustic feature classes.
"""

import random
import numpy as np
from scipy.fftpack import fft
from .base import mfcc, delta, logfbank

# Framing constants used by the spectrogram-based features. They are only
# valid for 16 kHz audio: 25 ms window = 400 samples, 10 ms shift = 160 samples.
_SUPPORTED_SAMPLE_RATE = 16000
_FRAME_LENGTH_SAMPLES = 400
_FRAME_SHIFT_SAMPLES = 160


def _hamming_window_400():
    """
    Build the fixed 400-point Hamming window used by the spectrogram features.

    :returns: a tuple ``(x, w)`` where ``x`` is the int64 sample index vector
        ``0..399`` and ``w`` is the corresponding Hamming window.
    """
    x = np.linspace(0, _FRAME_LENGTH_SAMPLES - 1, _FRAME_LENGTH_SAMPLES, dtype=np.int64)
    w = 0.54 - 0.46 * np.cos(2 * np.pi * x / (_FRAME_LENGTH_SAMPLES - 1))
    return x, w


def _log_spectrogram(wavsignal, fs, window):
    """
    Compute the log-magnitude spectrogram shared by Spectrogram and SpecAugment.

    Only the first row of ``wavsignal`` (``wavsignal[0]``) is analysed. Frames
    are 400 samples long with a hop of 160 samples; each frame is multiplied by
    ``window``, transformed with an FFT, and the first 200 magnitude bins are
    kept (the second half mirrors the first for real input).

    :param wavsignal: 2-D sequence/array of samples; row 0 is used.
    :param fs: sample rate in Hz; must be 16000.
    :param window: 400-point analysis window applied to every frame.
    :returns: float64 array of shape (num_frames, 200) holding ``log(|FFT| + 1)``.
        An input shorter than one full window yields an empty (0, 200) array.
    :raises ValueError: if ``fs`` is not 16000.
    """
    if fs != _SUPPORTED_SAMPLE_RATE:
        raise ValueError(
            f"[Error] ASRT currently only supports wav audio files with a sampling rate of 16000 Hz, but this "
            f"audio is {fs} Hz.")

    time_window = 25  # analysis window length in ms
    window_length = int(fs / 1000 * time_window)  # 400 samples at 16 kHz
    wav_arr = np.array(wavsignal)
    n_samples = len(wavsignal[0])

    # Number of frames that fit in the signal (same formula as before for
    # signals of at least one window).
    frame_count = int(n_samples / fs * 1000 - time_window) // 10 + 1
    if n_samples < window_length:
        # Too short for even one frame. Without this guard the formula above
        # can yield a negative size or a partial frame that cannot be
        # multiplied by the 400-point window.
        frame_count = 0

    data_input = np.zeros((frame_count, window_length // 2), dtype=np.float64)
    for i in range(frame_count):
        p_start = i * _FRAME_SHIFT_SAMPLES
        p_end = p_start + _FRAME_LENGTH_SAMPLES
        frame = wav_arr[0, p_start:p_end] * window  # apply the window
        spectrum = np.abs(fft(frame))
        # Keep only the first half: the magnitude spectrum is symmetric.
        data_input[i] = spectrum[0: window_length // 2]

    return np.log(data_input + 1)


class SpeechFeatureMeta:
    """
    Base class of all acoustic feature extraction classes in ASRT.

    :param framesamplerate: sample rate in Hz, stored on the instance.
    """

    def __init__(self, framesamplerate=16000):
        self.framesamplerate = framesamplerate

    def run(self, wavsignal, fs=16000):
        """
        Extract features from ``wavsignal``; must be overridden by subclasses.

        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('[ASRT] `run()` method is not implemented.')


class MFCC(SpeechFeatureMeta):
    """
    Built-in MFCC acoustic feature extraction class of ASRT.

    Compute MFCC features from an audio signal.

    :param framesamplerate: the sample rate of the signal we are working with, in Hz.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param numcep: the number of cepstrum to return, default 13
    :param nfilt: the number of filters in the filterbank, default 26.
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    """

    def __init__(self, framesamplerate=16000,
                 winlen=0.025,
                 winstep=0.01,
                 numcep=13,
                 nfilt=26,
                 preemph=0.97):
        self.framesamplerate = framesamplerate
        self.winlen = winlen
        self.winstep = winstep
        self.numcep = numcep
        self.nfilt = nfilt
        self.preemph = preemph
        super().__init__(framesamplerate)

    def run(self, wavsignal, fs=16000):
        """
        Compute MFCC features: static coefficients plus first- and second-order deltas.

        Only ``wavsignal[0]`` is analysed. The ``fs`` argument is not used here;
        the sample rate given to the constructor is passed to ``mfcc``.

        :param wavsignal: 2-D sequence/array of samples; row 0 is used.
        :param fs: accepted for interface compatibility, unused.
        :returns: A numpy array of size (NUMFRAMES by numcep * 3) containing features.
            Each row holds 1 feature vector.
        """
        wavsignal = np.array(wavsignal, dtype=np.float64)
        # Static MFCC features.
        feat_mfcc = mfcc(wavsignal[0], samplerate=self.framesamplerate, winlen=self.winlen,
                         winstep=self.winstep, numcep=self.numcep, nfilt=self.nfilt, preemph=self.preemph)
        feat_mfcc_d = delta(feat_mfcc, 2)
        feat_mfcc_dd = delta(feat_mfcc_d, 2)
        # Stack the MFCC matrix with its first- and second-order delta matrices.
        wav_feature = np.column_stack((feat_mfcc, feat_mfcc_d, feat_mfcc_dd))
        return wav_feature


class Logfbank(SpeechFeatureMeta):
    """
    Built-in log filterbank acoustic feature extraction class of ASRT.

    :param framesamplerate: sample rate in Hz, stored on the instance.
    :param nfilt: the number of filters passed to ``logfbank``, default 26.
    """

    def __init__(self, framesamplerate=16000, nfilt=26):
        self.nfilt = nfilt
        super().__init__(framesamplerate)

    def run(self, wavsignal, fs=16000):
        """
        Compute log filterbank features of ``wavsignal``.

        :param wavsignal: audio samples, converted to a float64 array.
        :param fs: sample rate in Hz passed to ``logfbank``.
        :returns: whatever ``logfbank`` returns for the signal.
        """
        wavsignal = np.array(wavsignal, dtype=np.float64)
        # NOTE(review): the whole array is passed here, whereas MFCC.run passes
        # only wavsignal[0] - confirm which layout `logfbank` expects.
        wav_feature = logfbank(wavsignal, fs, nfilt=self.nfilt)
        return wav_feature


class Spectrogram(SpeechFeatureMeta):
    """
    Built-in spectrogram acoustic feature extraction class of ASRT.

    :param framesamplerate: sample rate in Hz.
    :param timewindow: window length in ms, stored as ``time_window``.
    :param timeshift: window shift in ms, stored as ``timeshift``.
    """

    def __init__(self, framesamplerate=16000, timewindow=25, timeshift=10):
        self.time_window = timewindow
        # Window length in samples; 400 for the default 16 kHz / 25 ms.
        self.window_length = int(framesamplerate / 1000 * self.time_window)
        self.timeshift = timeshift

        # The Hamming window is currently fixed at 400 points regardless of
        # the arguments above; deriving it from self.window_length is reserved
        # for future support of other sample rates.
        self.x, self.w = _hamming_window_400()
        super().__init__(framesamplerate)

    def run(self, wavsignal, fs=16000):
        """
        Compute the log-magnitude spectrogram of ``wavsignal[0]``.

        Uses a 25 ms (400-sample) Hamming-windowed frame with a 10 ms
        (160-sample) shift.

        :param wavsignal: 2-D sequence/array of samples; row 0 is used.
        :param fs: sample rate in Hz; must be 16000.
        :returns: float64 array of shape (num_frames, 200). Empty (0, 200) if
            the signal is shorter than one window.
        :raises ValueError: if ``fs`` is not 16000.
        """
        return _log_spectrogram(wavsignal, fs, self.w)


class SpecAugment(SpeechFeatureMeta):
    """
    Reproduction of Google's SpecAugment data augmentation, built on top of
    the basic Spectrogram feature.

    :param framesamplerate: sample rate in Hz.
    :param timewindow: window length in ms, stored as ``time_window``.
    :param timeshift: window shift in ms, stored as ``timeshift``.
    """

    def __init__(self, framesamplerate=16000, timewindow=25, timeshift=10):
        self.time_window = timewindow
        # Window length in samples; 400 for the default 16 kHz / 25 ms.
        self.window_length = int(framesamplerate / 1000 * self.time_window)
        self.timeshift = timeshift

        # The Hamming window is currently fixed at 400 points regardless of
        # the arguments above; deriving it from self.window_length is reserved
        # for future support of other sample rates.
        self.x, self.w = _hamming_window_400()
        super().__init__(framesamplerate)

    def run(self, wavsignal, fs=16000):
        """
        Compute the log-magnitude spectrogram of ``wavsignal[0]`` and randomly mask it.

        One of four modes is drawn with the ``random`` module:
        60% no masking, 15% time mask (rows zeroed), 15% frequency mask
        (columns zeroed), 10% both masks applied together.

        :param wavsignal: 2-D sequence/array of samples; row 0 is used.
        :param fs: sample rate in Hz; must be 16000.
        :returns: float64 array of shape (num_frames, 200), possibly with a
            band of rows and/or columns set to 0. Empty (0, 200), unmasked, if
            the signal is shorter than one window.
        :raises ValueError: if ``fs`` is not 16000.
        """
        data_input = _log_spectrogram(wavsignal, fs, self.w)
        if data_input.shape[0] == 0:
            # Nothing to mask; random.randint(1, 0) below would raise.
            return data_input

        # Draw the mode and mask geometry. The draw order is kept fixed so that
        # seeded runs pick the same mode and mask positions as before.
        # Start indices are drawn from [1, size]: index 0 is never the start of
        # a mask, and a start equal to size masks nothing.
        mode = random.randint(1, 100)
        h_start = random.randint(1, data_input.shape[0])
        h_width = random.randint(1, 100)

        v_start = random.randint(1, data_input.shape[1])
        v_width = random.randint(1, 100)

        if mode <= 60:  # unmodified features, 60%
            pass
        elif 60 < mode <= 75:  # time mask (rows), 15%
            data_input[h_start:h_start + h_width, :] = 0
        elif 75 < mode <= 90:  # frequency mask (columns), 15%
            data_input[:, v_start:v_start + v_width] = 0
        else:  # both masks superimposed, 10%
            # The previous single index `[h..., :v_start:v_start + v_width]`
            # parsed `v_start + v_width` as a slice *step*, so only column 0
            # of the selected rows was zeroed. Apply both masks explicitly.
            data_input[h_start:h_start + h_width, :] = 0
            data_input[:, v_start:v_start + v_width] = 0

        return data_input