You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
228 lines
8.8 KiB
228 lines
8.8 KiB
#!/usr/bin/env python3 |
|
# -*- coding: utf-8 -*- |
|
# |
|
# Copyright 2016-2099 Ailemon.net |
|
# |
|
# This file is part of ASRT Speech Recognition Tool. |
|
# |
|
# ASRT is free software: you can redistribute it and/or modify |
|
# it under the terms of the GNU General Public License as published by |
|
# the Free Software Foundation, either version 3 of the License, or |
|
# (at your option) any later version. |
|
# ASRT is distributed in the hope that it will be useful, |
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
# GNU General Public License for more details. |
|
# |
|
# You should have received a copy of the GNU General Public License |
|
# along with ASRT. If not, see <https://www.gnu.org/licenses/>. |
|
# ============================================================================ |
|
|
|
""" |
|
@author: nl8590687 |
|
ASRT语音识别内置声学特征提取模块,定义了几个常用的声学特征类 |
|
""" |
|
|
|
import random |
|
import numpy as np |
|
from scipy.fftpack import fft |
|
from .base import mfcc, delta, logfbank |
|
|
|
|
|
class SpeechFeatureMeta:
    """
    Base class for all acoustic feature extraction classes in ASRT.

    Subclasses are expected to override :meth:`run`.

    :param framesamplerate: sample rate in Hz, stored on the instance as
        ``framesamplerate``. Default is 16000.
    """

    def __init__(self, framesamplerate=16000):
        self.framesamplerate = framesamplerate

    def run(self, wavsignal, fs=16000):
        """
        Extract features from a waveform; must be overridden by subclasses.

        :param wavsignal: the waveform samples (unused here).
        :param fs: sample rate of ``wavsignal`` in Hz (unused here).
        :raises NotImplementedError: always, because the base class has no
            feature extraction of its own.
        """
        message = '[ASRT] `run()` method is not implemented.'
        raise NotImplementedError(message)
|
|
|
|
|
class MFCC(SpeechFeatureMeta):
    """
    Built-in ASRT MFCC acoustic feature extractor.

    Compute MFCC features from an audio signal.

    :param framesamplerate: the sample rate of the signal we are working with, in Hz.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param numcep: the number of cepstrum to return, default 13
    :param nfilt: the number of filters in the filterbank, default 26.
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    """

    def __init__(self, framesamplerate=16000,
                 winlen=0.025,
                 winstep=0.01,
                 numcep=13,
                 nfilt=26,
                 preemph=0.97):
        # The base class stores ``framesamplerate`` on the instance.
        super().__init__(framesamplerate)
        self.winlen, self.winstep = winlen, winstep
        self.numcep, self.nfilt = numcep, nfilt
        self.preemph = preemph

    def run(self, wavsignal, fs=16000):
        """
        Compute MFCC features: static coefficients plus first- and
        second-order differences, stacked column-wise.

        :param wavsignal: indexable collection of channels; only
            ``wavsignal[0]`` is analysed.
        :param fs: accepted for interface compatibility but not used; the
            sample rate passed to ``mfcc`` is ``self.framesamplerate``.
        :returns: A numpy array of size (NUMFRAMES by numcep * 3) containing features. Each row holds 1 feature vector.
        """
        samples = np.array(wavsignal, dtype=np.float64)

        # Static coefficients from the first channel only.
        static = mfcc(
            samples[0],
            samplerate=self.framesamplerate,
            winlen=self.winlen,
            winstep=self.winstep,
            numcep=self.numcep,
            nfilt=self.nfilt,
            preemph=self.preemph,
        )

        # The second-order difference is the delta of the first-order one.
        first_diff = delta(static, 2)
        second_diff = delta(first_diff, 2)

        # Columns: static | first-order difference | second-order difference.
        return np.column_stack((static, first_diff, second_diff))
|
|
|
|
|
class Logfbank(SpeechFeatureMeta):
    """
    Built-in ASRT log filterbank (logfbank) acoustic feature extractor.

    :param framesamplerate: sample rate in Hz, stored by the base class.
    :param nfilt: number of filters handed to ``logfbank``. Default is 26.
    """

    def __init__(self, framesamplerate=16000, nfilt=26):
        self.nfilt = nfilt
        super().__init__(framesamplerate)

    def run(self, wavsignal, fs=16000):
        """
        Compute log filterbank features for a waveform.

        :param wavsignal: waveform samples; converted to a float64 array and
            passed to ``logfbank`` unchanged.
        :param fs: sample rate in Hz forwarded to ``logfbank``.
        :returns: whatever ``logfbank`` returns for the converted signal.
        """
        # NOTE(review): unlike MFCC.run, this forwards the whole array (not
        # wavsignal[0]) and the caller's ``fs`` (not self.framesamplerate) --
        # confirm that this is what ``logfbank`` expects.
        samples = np.array(wavsignal, dtype=np.float64)
        return logfbank(samples, fs, nfilt=self.nfilt)
|
|
|
|
|
class Spectrogram(SpeechFeatureMeta):
    """
    Built-in ASRT spectrogram acoustic feature extractor.

    Cuts the first channel of a 16 kHz waveform into 25 ms frames taken every
    10 ms, applies a Hamming window to each frame and returns
    ``log(1 + |FFT|)`` for the lower half of each frame's spectrum.

    :param framesamplerate: sample rate in Hz, stored by the base class.
    :param timewindow: frame length in milliseconds, stored as
        ``time_window`` and used to derive ``window_length``. ``run()``
        itself always uses 25 ms.
    :param timeshift: frame shift in milliseconds, stored as ``timeshift``.
        ``run()`` itself always uses 10 ms (160 samples).
    """

    def __init__(self, framesamplerate=16000, timewindow=25, timeshift=10):
        self.time_window = timewindow
        # Window length in samples for the requested rate / frame length.
        self.window_length = int(framesamplerate / 1000 * self.time_window)
        self.timeshift = timeshift

        # Hamming window. Deliberately fixed at 400 points (25 ms at 16 kHz)
        # because run() only accepts 16 kHz audio; deriving it from
        # self.window_length is reserved for future multi-rate support.
        self.x = np.linspace(0, 400 - 1, 400, dtype=np.int64)
        self.w = 0.54 - 0.46 * np.cos(2 * np.pi * (self.x) / (400 - 1))
        super().__init__(framesamplerate)

    def run(self, wavsignal, fs=16000):
        """
        Compute the log-magnitude spectrogram of a waveform.

        :param wavsignal: 2-D collection of samples; only the first row
            (``wavsignal[0]``) is analysed.
        :param fs: sample rate of ``wavsignal`` in Hz; must be 16000.
        :returns: float64 array of shape (num_frames, 200). A signal shorter
            than one 400-sample frame yields an empty (0, 200) array.
        :raises ValueError: if ``fs`` is not 16000.
        """
        if fs != 16000:
            raise ValueError(
                f"[Error] ASRT currently only supports wav audio files with a sampling rate of 16000 Hz, but this "
                f"audio is {fs} Hz.")

        time_window = 25  # frame length in ms
        window_length = int(fs / 1000 * time_window)  # 400 samples at 16 kHz
        hop_length = 160  # 10 ms frame shift at 16 kHz
        half_window = window_length // 2

        wav_arr = np.array(wavsignal)
        num_samples = len(wavsignal[0])

        # The millisecond-based formula below yields a negative count or a
        # frame longer than the signal when the audio is shorter than one
        # window, so that case is handled explicitly.
        if num_samples < window_length:
            num_frames = 0
        else:
            num_frames = int(num_samples / fs * 1000 - time_window) // 10 + 1

        data_input = np.zeros((num_frames, half_window), dtype=np.float64)

        for i in range(num_frames):
            p_start = i * hop_length
            frame = wav_arr[0, p_start:p_start + window_length] * self.w  # apply window
            # Keep only the first half: the magnitude spectrum of a real
            # signal is symmetric.
            data_input[i] = np.abs(fft(frame))[:half_window]

        return np.log(data_input + 1)
|
|
|
|
|
class SpecAugment(SpeechFeatureMeta):
    """
    Re-implementation of Google's SpecAugment data augmentation, built on the
    same log-magnitude spectrogram features as the Spectrogram extractor.

    Each call to ``run()`` randomly leaves the spectrogram untouched (60%),
    zeroes a band of rows / time frames (15%), zeroes a band of columns /
    frequency bins (15%), or applies both masks (10%).

    :param framesamplerate: sample rate in Hz, stored by the base class.
    :param timewindow: frame length in milliseconds, stored as
        ``time_window`` and used to derive ``window_length``. ``run()``
        itself always uses 25 ms.
    :param timeshift: frame shift in milliseconds, stored as ``timeshift``.
        ``run()`` itself always uses 10 ms (160 samples).
    """

    def __init__(self, framesamplerate=16000, timewindow=25, timeshift=10):
        self.time_window = timewindow
        # Window length in samples for the requested rate / frame length.
        self.window_length = int(framesamplerate / 1000 * self.time_window)
        self.timeshift = timeshift

        # Hamming window. Deliberately fixed at 400 points (25 ms at 16 kHz)
        # because run() only accepts 16 kHz audio; deriving it from
        # self.window_length is reserved for future multi-rate support.
        self.x = np.linspace(0, 400 - 1, 400, dtype=np.int64)
        self.w = 0.54 - 0.46 * np.cos(2 * np.pi * (self.x) / (400 - 1))
        super().__init__(framesamplerate)

    def _log_spectrogram(self, wavsignal, fs):
        """Return the (num_frames, 200) log-magnitude spectrogram of ``wavsignal[0]``."""
        if fs != 16000:
            raise ValueError(
                f"[Error] ASRT currently only supports wav audio files with a sampling rate of 16000 Hz, but this "
                f"audio is {fs} Hz.")

        time_window = 25  # frame length in ms
        window_length = int(fs / 1000 * time_window)  # 400 samples at 16 kHz
        hop_length = 160  # 10 ms frame shift at 16 kHz
        half_window = window_length // 2

        wav_arr = np.array(wavsignal)
        num_samples = len(wavsignal[0])

        # The millisecond-based formula below yields a negative count or a
        # frame longer than the signal when the audio is shorter than one
        # window, so that case is handled explicitly.
        if num_samples < window_length:
            num_frames = 0
        else:
            num_frames = int(num_samples / fs * 1000 - time_window) // 10 + 1

        data_input = np.zeros((num_frames, half_window), dtype=np.float64)

        for i in range(num_frames):
            p_start = i * hop_length
            frame = wav_arr[0, p_start:p_start + window_length] * self.w  # apply window
            # Keep only the first half: the magnitude spectrum of a real
            # signal is symmetric.
            data_input[i] = np.abs(fft(frame))[:half_window]

        return np.log(data_input + 1)

    def run(self, wavsignal, fs=16000):
        """
        Compute the log-magnitude spectrogram and randomly mask parts of it.

        :param wavsignal: 2-D collection of samples; only the first row
            (``wavsignal[0]``) is analysed.
        :param fs: sample rate of ``wavsignal`` in Hz; must be 16000.
        :returns: float64 array of shape (num_frames, 200), possibly with a
            band of rows and/or columns set to 0. A signal shorter than one
            400-sample frame yields an empty (0, 200) array.
        :raises ValueError: if ``fs`` is not 16000.
        """
        data_input = self._log_spectrogram(wavsignal, fs)

        num_frames, num_bins = data_input.shape
        if num_frames == 0:
            # Nothing to mask, and randrange(0) below would raise.
            return data_input

        # Draw every random value up front, in a fixed order, so a seeded
        # ``random`` gives reproducible augmentation whatever the mode.
        mode = random.randint(1, 100)
        # randrange keeps the start inside 0..n-1; randint(1, n) is inclusive
        # at both ends, so it could never start at 0 and could start at n.
        h_start = random.randrange(num_frames)
        h_width = random.randint(1, 100)
        v_start = random.randrange(num_bins)
        v_width = random.randint(1, 100)

        # mode  1-60  : unchanged       (60%)
        # mode 61-75  : row mask only   (15%)
        # mode 76-90  : column mask only (15%)
        # mode 91-100 : both masks      (10%)
        mask_rows = 60 < mode <= 75 or mode > 90
        mask_cols = mode > 75

        if mask_rows:
            data_input[h_start:h_start + h_width, :] = 0
        if mask_cols:
            # Plain start:stop slice; a leading ':' here would turn it into
            # start:stop:step and only ever zero column 0.
            data_input[:, v_start:v_start + v_width] = 0

        return data_input
|
|
|