dl-asr/speech_features/speech_features.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright 2016-2099 Ailemon.net
#
# This file is part of ASRT Speech Recognition Tool.
#
# ASRT is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# ASRT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ASRT.  If not, see <https://www.gnu.org/licenses/>.
# ============================================================================

"""
@author: nl8590687
ASRT语音识别内置声学特征提取模块，定义了几个常用的声学特征类
"""

import random
import numpy as np
from scipy.fftpack import fft
from .base import mfcc, delta, logfbank


class SpeechFeatureMeta:
    """
    Common base class for every acoustic feature extractor in ASRT.

    Subclasses are expected to override ``run()``; the base implementation
    only raises ``NotImplementedError``.

    :param framesamplerate: sample rate, in Hz, stored on the instance.
    """

    def __init__(self, framesamplerate=16000):
        # Keep the configured sample rate available to subclasses.
        self.framesamplerate = framesamplerate

    def run(self, wavsignal, fs=16000):
        """
        Extract features from ``wavsignal``; must be overridden by subclasses.

        :param wavsignal: the audio signal to process.
        :param fs: sample rate of ``wavsignal`` in Hz.
        :raises NotImplementedError: always, in this base class.
        """
        message = '[ASRT] `run()` method is not implemented.'
        raise NotImplementedError(message)


class MFCC(SpeechFeatureMeta):
    """
    Built-in ASRT MFCC acoustic feature extractor.

    Computes MFCC features from an audio signal and appends two successive
    ``delta`` transforms of them.

    :param framesamplerate: the sample rate of the signal we are working with, in Hz.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param numcep: the number of cepstrum to return, default 13
    :param nfilt: the number of filters in the filterbank, default 26.
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    """

    def __init__(self, framesamplerate=16000,
                 winlen=0.025,
                 winstep=0.01,
                 numcep=13,
                 nfilt=26,
                 preemph=0.97):
        # The base class stores ``framesamplerate`` on the instance.
        super().__init__(framesamplerate)
        self.winlen = winlen
        self.winstep = winstep
        self.numcep = numcep
        self.nfilt = nfilt
        self.preemph = preemph

    def run(self, wavsignal, fs=16000):
        """
        Compute MFCC features: the static coefficients followed by their
        first-order and second-order differences.

        Only ``wavsignal[0]`` is analysed. ``fs`` is accepted for interface
        compatibility but is not used here; the sample rate passed to
        ``mfcc`` is the ``framesamplerate`` given at construction time.

        :param wavsignal: audio samples; indexed as ``wavsignal[0]`` after
            conversion to a float64 array.
        :param fs: unused by this extractor.
        :returns: A numpy array of size (NUMFRAMES by numcep * 3) containing features. Each row holds 1 feature vector.
        """
        samples = np.array(wavsignal, dtype=np.float64)

        static = mfcc(
            samples[0],
            samplerate=self.framesamplerate,
            winlen=self.winlen,
            winstep=self.winstep,
            numcep=self.numcep,
            nfilt=self.nfilt,
            preemph=self.preemph,
        )
        first_order = delta(static, 2)
        second_order = delta(first_order, 2)

        # Columns are laid out as [static | first-order | second-order].
        return np.column_stack((static, first_order, second_order))


class Logfbank(SpeechFeatureMeta):
    """
    Built-in ASRT log filterbank (logfbank) acoustic feature extractor.

    :param framesamplerate: sample rate of the audio in Hz, stored on the instance.
    :param nfilt: number of filters passed to ``logfbank``, default 26.
    """

    def __init__(self, framesamplerate=16000, nfilt=26):
        self.nfilt = nfilt
        super().__init__(framesamplerate)

    def run(self, wavsignal, fs=16000):
        """
        Compute log filterbank features for the first channel of ``wavsignal``.

        :param wavsignal: audio samples, either a 1-D sequence of samples or a
            multi-dimensional array whose first row is the channel to analyse.
        :param fs: sample rate of ``wavsignal`` in Hz, forwarded to ``logfbank``.
        :returns: whatever ``logfbank`` returns for the selected channel.
        """
        wavsignal = np.array(wavsignal, dtype=np.float64)
        # The other extractors in this module (MFCC, Spectrogram, SpecAugment)
        # all analyse ``wavsignal[0]``; do the same here instead of handing the
        # whole multi-channel array to ``logfbank``. 1-D input is passed through
        # unchanged so existing callers that already supply a single channel
        # keep working.
        # NOTE(review): assumes ``logfbank`` expects a 1-D signal, as ``mfcc``
        # is given in MFCC.run -- confirm against ``.base``.
        if wavsignal.ndim > 1:
            wavsignal = wavsignal[0]
        return logfbank(wavsignal, fs, nfilt=self.nfilt)


class Spectrogram(SpeechFeatureMeta):
    """
    Built-in ASRT spectrogram acoustic feature extractor.

    ``run()`` produces a log-magnitude spectrogram using a fixed 400-sample
    Hamming window advanced 160 samples per frame (25 ms / 10 ms at 16 kHz).
    ``timewindow`` and ``timeshift`` are stored on the instance but are not
    yet used by ``run()``.

    :param framesamplerate: sample rate of the audio in Hz.
    :param timewindow: analysis window length in milliseconds.
    :param timeshift: shift between successive windows in milliseconds.
    """

    def __init__(self, framesamplerate=16000, timewindow=25, timeshift=10):
        self.time_window = timewindow
        # Window length in samples for the configured rate (400 for the defaults).
        self.window_length = int(framesamplerate / 1000 * self.time_window)
        self.timeshift = timeshift

        # The window itself is still fixed at 400 samples. To support other
        # sampling rates in the future, build it from ``self.window_length``
        # instead of the literal 400.
        self.x = np.linspace(0, 400 - 1, 400, dtype=np.int64)
        self.w = 0.54 - 0.46 * np.cos(2 * np.pi * (self.x) / (400 - 1))  # Hamming window
        super().__init__(framesamplerate)

    def run(self, wavsignal, fs=16000):
        """
        Compute the log-magnitude spectrogram of the first channel.

        :param wavsignal: 2-D array-like of audio samples; only row 0 is used.
        :param fs: sample rate of ``wavsignal`` in Hz; must be 16000.
        :returns: float64 array of shape (num_frames, 200) holding
            ``log(|FFT| + 1)`` of each windowed frame. When the signal is
            shorter than one window (400 samples) the array has zero rows.
        :raises ValueError: if ``fs`` is not 16000.
        """
        if fs != 16000:
            raise ValueError(
                f"[Error] ASRT currently only supports wav audio files with a sampling rate of 16000 Hz, but this "
                f"audio is {fs} Hz.")

        window_length = 400  # 25 ms at 16 kHz
        hop_length = 160  # 10 ms at 16 kHz
        num_bins = window_length // 2  # the spectrum of a real signal is symmetric; keep half

        wav_arr = np.array(wavsignal)
        num_samples = wav_arr.shape[1]

        # A signal shorter than one window yields no frames. Handle it
        # explicitly rather than requesting a negative-sized array or
        # multiplying a truncated frame by the 400-point window.
        if num_samples < window_length:
            return np.zeros((0, num_bins), dtype=np.float64)

        # Integer arithmetic, so the frame count cannot be affected by
        # floating-point rounding.
        num_frames = (num_samples - window_length) // hop_length + 1
        data_input = np.zeros((num_frames, num_bins), dtype=np.float64)

        for i in range(num_frames):
            p_start = i * hop_length
            frame = wav_arr[0, p_start:p_start + window_length] * self.w  # apply window
            data_input[i] = np.abs(fft(frame))[:num_bins]

        return np.log(data_input + 1)


class SpecAugment(SpeechFeatureMeta):
    """
    Re-implementation of Google's SpecAugment data augmentation, applied on
    top of the basic spectrogram feature.

    ``run()`` computes a log-magnitude spectrogram using a fixed 400-sample
    Hamming window advanced 160 samples per frame (25 ms / 10 ms at 16 kHz)
    and then randomly zeroes a band of frames, a band of frequency bins,
    both, or nothing. ``timewindow`` and ``timeshift`` are stored on the
    instance but are not yet used by ``run()``.

    :param framesamplerate: sample rate of the audio in Hz.
    :param timewindow: analysis window length in milliseconds.
    :param timeshift: shift between successive windows in milliseconds.
    """

    def __init__(self, framesamplerate=16000, timewindow=25, timeshift=10):
        self.time_window = timewindow
        # Window length in samples for the configured rate (400 for the defaults).
        self.window_length = int(framesamplerate / 1000 * self.time_window)
        self.timeshift = timeshift

        # The window itself is still fixed at 400 samples. To support other
        # sampling rates in the future, build it from ``self.window_length``
        # instead of the literal 400.
        self.x = np.linspace(0, 400 - 1, 400, dtype=np.int64)
        self.w = 0.54 - 0.46 * np.cos(2 * np.pi * (self.x) / (400 - 1))  # Hamming window
        super().__init__(framesamplerate)

    def _log_spectrogram(self, wavsignal):
        """Return the (num_frames, 200) log-magnitude spectrogram of row 0 of ``wavsignal``."""
        window_length = 400  # 25 ms at 16 kHz
        hop_length = 160  # 10 ms at 16 kHz
        num_bins = window_length // 2  # the spectrum of a real signal is symmetric; keep half

        wav_arr = np.array(wavsignal)
        num_samples = wav_arr.shape[1]

        # A signal shorter than one window yields no frames. Handle it
        # explicitly rather than requesting a negative-sized array or
        # multiplying a truncated frame by the 400-point window.
        if num_samples < window_length:
            return np.zeros((0, num_bins), dtype=np.float64)

        # Integer arithmetic, so the frame count cannot be affected by
        # floating-point rounding.
        num_frames = (num_samples - window_length) // hop_length + 1
        spec = np.zeros((num_frames, num_bins), dtype=np.float64)

        for i in range(num_frames):
            p_start = i * hop_length
            frame = wav_arr[0, p_start:p_start + window_length] * self.w  # apply window
            spec[i] = np.abs(fft(frame))[:num_bins]

        return np.log(spec + 1)

    def run(self, wavsignal, fs=16000):
        """
        Compute the log-magnitude spectrogram of the first channel and apply
        a randomly chosen masking mode to it.

        Modes, selected with the module-level ``random`` generator:
        60% no masking, 15% mask a band of frames (rows), 15% mask a band
        of frequency bins (columns), 10% apply both masks.

        :param wavsignal: 2-D array-like of audio samples; only row 0 is used.
        :param fs: sample rate of ``wavsignal`` in Hz; must be 16000.
        :returns: float64 array of shape (num_frames, 200). When the signal
            is shorter than one window (400 samples) the array has zero rows
            and no masking is attempted.
        :raises ValueError: if ``fs`` is not 16000.
        """
        if fs != 16000:
            raise ValueError(
                f"[Error] ASRT currently only supports wav audio files with a sampling rate of 16000 Hz, but this "
                f"audio is {fs} Hz.")

        data_input = self._log_spectrogram(wavsignal)

        num_frames, num_bins = data_input.shape
        if num_frames == 0:
            # Nothing to mask; also avoids random.randint() on an empty range.
            return data_input

        # Draw every random value up front, in a fixed order, so the number
        # of values consumed from the generator does not depend on the mode.
        mode = random.randint(1, 100)
        # random.randint() includes both endpoints, so valid 0-based start
        # indices are 0 .. size - 1.
        h_start = random.randint(0, num_frames - 1)
        h_width = random.randint(1, 100)

        v_start = random.randint(0, num_bins - 1)
        v_width = random.randint(1, 100)

        # Slices that run past the end of an axis are clipped by numpy.
        if mode <= 60:  # unmodified features, 60%
            pass
        elif mode <= 75:  # mask a band of frames, 15%
            data_input[h_start:h_start + h_width, :] = 0
        elif mode <= 90:  # mask a band of frequency bins, 15%
            data_input[:, v_start:v_start + v_width] = 0
        else:  # both masks overlaid, 10%
            data_input[h_start:h_start + h_width, :] = 0
            data_input[:, v_start:v_start + v_width] = 0

        return data_input