## 梅尔刻度

$$m=2595\log_{10}(1+\frac{f}{700})$$

def hz2mel(hz):
""" Hz to Mels """
return 2595 * np.log10(1 + hz / 700.0)

mel与f(Hz)的对应关系

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
def hz2mel(hz):
""" Hz to Mels """
return 2595 * np.log10(1 + hz / 700.0)
if __name__ == "__main__":
fs = 16000
hz = np.linspace(0, 8000, 8000)
mel = hz2mel(hz)
fig = plt.figure()
ax = plt.plot(hz, mel, color="r")
plt.xlabel("Hertz scale (Hz)", fontsize=12)  # x轴的名字
plt.ylabel("mel scale", fontsize=12)
plt.xticks(fontsize=10)  # x轴的刻度
plt.yticks(fontsize=10)
plt.xlim(0, 8000)  # 坐标轴的范围
plt.ylim(0)
def formatnum(x, pos):
return '$%.1f$' % (x / 1000)
formatter = FuncFormatter(formatnum)
# plt.gca().xaxis.set_major_formatter(formatter)
# plt.gca().yaxis.set_major_formatter(formatter)
plt.grid(linestyle='--')
plt.tight_layout()
plt.show()

$$f=700e^{\frac{m}{2595}-1}$$

def mel2hz(mel):
""" Mels to HZ """
return 700 * (10 ** (mel / 2595.0) - 1)

## mel 滤波器组

def mel_filterbanks(nfilt=20, nfft=512, samplerate=16000, lowfreq=0, highfreq=None):
"""计算一个Mel-filterbank (M,F)
:param nfilt: filterbank中的滤波器数量
:param nfft: FFT size
:param samplerate: 采样率
:param lowfreq: Mel-filter的最低频带边缘
:param highfreq: Mel-filter的最高频带边缘，默认samplerate/2
"""
highfreq = highfreq or samplerate / 2
# 按梅尔均匀间隔计算 点
lowmel = hz2mel(lowfreq)
highmel = hz2mel(highfreq)
melpoints = np.linspace(lowmel, highmel, nfilt + 2)
hz_points = mel2hz(melpoints)  # 将mel频率再转到hz频率
# bin = samplerate/2 / NFFT/2=sample_rate/NFFT    # 每个频点的频率数
# bins = hz_points/bin=hz_points*NFFT/ sample_rate    # hz_points对应第几个fft频点
bin = np.floor((nfft + 1) * hz_points / samplerate)
fbank = np.zeros([nfilt, int(nfft / 2 + 1)])  # (m,f)
for i in range(0, nfilt):
for j in range(int(bin[i]), int(bin[i + 1])):
fbank[i, j] = (j - bin[i]) / (bin[i + 1] - bin[i])
for j in range(int(bin[i + 1]), int(bin[i + 2])):
fbank[i, j] = (bin[i + 2] - j) / (bin[i + 2] - bin[i + 1])
#    fbank -= (np.mean(fbank, axis=0) + 1e-8)
return fbank

## mel 滤波器组特征

# -*- coding:utf-8 -*-
# Author:凌逆战 | Never
# Date: 2022/5/19
"""
1、提取Mel filterBank
2、提取mel spectrum
"""
import librosa
import numpy as np
import matplotlib.pyplot as plt
import librosa.display
from matplotlib.ticker import FuncFormatter
plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示符号
def hz2mel(hz):
""" Hz to Mels """
return 2595 * np.log10(1 + hz / 700.0)
def mel2hz(mel):
""" Mels to HZ """
return 700 * (10 ** (mel / 2595.0) - 1)
def mel_filterbanks(nfilt=20, nfft=512, samplerate=16000, lowfreq=0, highfreq=None):
"""计算一个Mel-filterbank (M,F)
:param nfilt: filterbank中的滤波器数量
:param nfft: FFT size
:param samplerate: 采样率
:param lowfreq: Mel-filter的最低频带边缘
:param highfreq: Mel-filter的最高频带边缘，默认samplerate/2
"""
highfreq = highfreq or samplerate / 2
# 按梅尔均匀间隔计算 点
lowmel = hz2mel(lowfreq)
highmel = hz2mel(highfreq)
melpoints = np.linspace(lowmel, highmel, nfilt + 2)
hz_points = mel2hz(melpoints)  # 将mel频率再转到hz频率
# bin = samplerate/2 / NFFT/2=sample_rate/NFFT    # 每个频点的频率数
# bins = hz_points/bin=hz_points*NFFT/ sample_rate    # hz_points对应第几个fft频点
bin = np.floor((nfft + 1) * hz_points / samplerate)
fbank = np.zeros([nfilt, int(nfft / 2 + 1)])  # (m,f)
for i in range(0, nfilt):
for j in range(int(bin[i]), int(bin[i + 1])):
fbank[i, j] = (j - bin[i]) / (bin[i + 1] - bin[i])
for j in range(int(bin[i + 1]), int(bin[i + 2])):
fbank[i, j] = (bin[i + 2] - j) / (bin[i + 2] - bin[i + 1])
#    fbank -= (np.mean(fbank, axis=0) + 1e-8)
return fbank
wav_path = "./p225_001.wav"
fs = 16000
NFFT = 512
win_length = 512
num_filter = 22
low_freq_mel = 0
high_freq_mel = hz2mel(fs // 2)  # 求最高hz频率对应的mel频率
mel_points = np.linspace(low_freq_mel, high_freq_mel, num_filter + 2)  # 在mel频率上均分成42个点
hz_points = mel2hz(mel_points)  # 将mel频率再转到hz频率
print(hz_points)
# bin = sample_rate/2 / NFFT/2=sample_rate/NFFT    # 每个频点的频率数
# bins = hz_points/bin=hz_points*NFFT/ sample_rate    # hz_points对应第几个fft频点
bins = np.floor((NFFT + 1) * hz_points / fs)
print(bins)
# [  0.   2.   5.   8.  12.  16.  20.  25.  31.  37.  44.  52.  61.  70.
#   81.  93. 107. 122. 138. 157. 178. 201. 227. 256.]
S = librosa.stft(wav, n_fft=NFFT, hop_length=NFFT // 2, win_length=win_length, window="hann", center=False)
mag = np.abs(S)  # 幅度谱 (257, 127) librosa.magphase()
filterbanks = mel_filterbanks(nfilt=num_filter, nfft=NFFT, samplerate=fs, lowfreq=0, highfreq=fs // 2)
# ================ 画三角滤波器 ===========================
FFT_len = NFFT // 2 + 1
fs_bin = fs // 2 / (NFFT // 2)  # 一个频点多少Hz
x = np.linspace(0, FFT_len, FFT_len)
plt.plot(x * fs_bin, filterbanks.T)
plt.xlim(0)  # 坐标轴的范围
plt.ylim(0, 1)
plt.tight_layout()
plt.grid(linestyle='--')
plt.show()
filter_banks = np.dot(filterbanks, mag)  # (M,F)*(F,T)=(M,T)
filter_banks = 20 * np.log10(filter_banks)  # dB
# ================ 绘制语谱图 ==========================
# 绘制 频谱图 方法1
plt.imshow(filter_banks, cmap="jet", aspect='auto')
ax = plt.gca()  # 获取其中某个坐标系
ax.invert_yaxis()  # 将y轴反转
plt.tight_layout()
plt.show()
# 绘制 频谱图 方法2
plt.figure()
librosa.display.specshow(filter_banks, sr=fs, x_axis='time', y_axis='linear', cmap="jet")
plt.xlabel('时间/s', fontsize=14)
plt.ylabel('频率/kHz', fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
def formatnum(x, pos):
return '$%d$' % (x / 1000)
formatter = FuncFormatter(formatnum)
plt.gca().yaxis.set_major_formatter(formatter)
plt.tight_layout()
plt.show()

mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)

## 巴克刻度

$$\text { Bark }=13 \arctan (0.00076 f)+3.5 \arctan ((\frac{f}{7500})^{2})$$

Traunmüller, 1990 提出的新的Bark scale公式：

$$\operatorname{Bark}=\frac{26.81f}{1960+f}-0.53$$

Wang, Sekey & Gersho, 1992 提出了新的Bark scale公式：

$$\text { Bark }=6 \sinh ^{-1}(\frac{f}{600})$$

def hz2bark_1961(Hz):
return 13.0 * np.arctan(0.00076 * Hz) + 3.5 * np.arctan((Hz / 7500.0) ** 2)
def hz2bark_1990(Hz):
bark_scale = (26.81 * Hz) / (1960 + Hz) - 0.5
return bark_scale
def hz2bark_1992(Hz):
return 6 * np.arcsinh(Hz / 600)

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
def hz2bark_1961(Hz):
return 13.0 * np.arctan(0.00076 * Hz) + 3.5 * np.arctan((Hz / 7500.0) ** 2)
def hz2bark_1990(Hz):
bark_scale = (26.81 * Hz) / (1960 + Hz) - 0.5
return bark_scale
def hz2bark_1992(Hz):
return 6 * np.arcsinh(Hz / 600)
if __name__ == "__main__":
fs = 16000
hz = np.linspace(0, fs // 2, fs // 2)
bark_1961 = hz2bark_1961(hz)
bark_1990 = hz2bark_1990(hz)
bark_1992 = hz2bark_1992(hz)
plt.plot(hz, bark_1961, label="bark_1961")
plt.plot(hz, bark_1990, label="bark_1990")
plt.plot(hz, bark_1992, label="bark_1992")
plt.legend()  # 显示图例
plt.xlabel("Hertz scale (Hz)", fontsize=12)  # x轴的名字
plt.ylabel("Bark scale", fontsize=12)
plt.xticks(fontsize=10)  # x轴的刻度
plt.yticks(fontsize=10)
plt.xlim(0, fs // 2)  # 坐标轴的范围
plt.ylim(0)
def formatnum(x, pos):
return '$%.1f$' % (x / 1000)
formatter = FuncFormatter(formatnum)
# plt.gca().xaxis.set_major_formatter(formatter)
# plt.gca().yaxis.set_major_formatter(formatter)
plt.grid(linestyle='--')
plt.tight_layout()
plt.show()

## Bark频谱

import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示符号
def hz2bark(f):
""" Hz to bark频率 (Wang, Sekey & Gersho, 1992.) """
return 6. * np.arcsinh(f / 600.)
def bark2hz(fb):
""" Bark频率 to Hz """
return 600. * np.sinh(fb / 6.)
def fft2hz(fft, fs=16000, nfft=512):
""" FFT频点 to Hz """
return (fft * fs) / (nfft + 1)
def hz2fft(fb, fs=16000, nfft=512):
""" Bark频率 to FFT频点 """
return (nfft + 1) * fb / fs
def fft2bark(fft, fs=16000, nfft=512):
""" FFT频点 to Bark频率 """
return hz2bark((fft * fs) / (nfft + 1))
def bark2fft(fb, fs=16000, nfft=512):
""" Bark频率 to FFT频点 """
# bin = sample_rate/2 / nfft/2=sample_rate/nfft    # 每个频点的频率数
# bins = hz_points/bin=hz_points*nfft/ sample_rate    # hz_points对应第几个fft频点
return (nfft + 1) * bark2hz(fb) / fs
def Fm(fb, fc):
""" 计算一个特定的中心频率的Bark filter
:param fb: frequency in Bark.
:param fc: center frequency in Bark.
:return: 相关的Bark filter 值/幅度
"""
if fc - 2.5 <= fb <= fc - 0.5:
return 10 ** (2.5 * (fb - fc + 0.5))
elif fc - 0.5 < fb < fc + 0.5:
return 1
elif fc + 0.5 <= fb <= fc + 1.3:
return 10 ** (-2.5 * (fb - fc - 0.5))
else:
return 0
def bark_filter_banks(nfilts=20, nfft=512, fs=16000, low_freq=0, high_freq=None, scale="constant"):
""" 计算Bark-filterbanks,(B,F)
:param nfilts: 滤波器组中滤波器的数量 (Default 20)
:param nfft: FFT size.(Default is 512)
:param fs: 采样率，(Default 16000 Hz)
:param low_freq: MEL滤波器的最低带边。(Default 0 Hz)
:param high_freq: MEL滤波器的最高带边。(Default samplerate/2)
:param scale (str): 选择Max bins 幅度 "ascend"(上升)，"descend"(下降)或 "constant"(恒定)(=1)。默认是"constant"
:return:一个大小为(nfilts, nfft/2 + 1)的numpy数组，包含滤波器组。
"""
# init freqs
high_freq = high_freq or fs / 2
low_freq = low_freq or 0
# 按Bark scale 均匀间隔计算点数(点数以Bark为单位)
low_bark = hz2bark(low_freq)
high_bark = hz2bark(high_freq)
bark_points = np.linspace(low_bark, high_bark, nfilts + 4)
bins = np.floor(bark2fft(bark_points))  # Bark Scale等分布对应的 FFT bin number
# [  0.   2.   5.   7.  10.  13.  16.  20.  24.  28.  33.  38.  44.  51.
#   59.  67.  77.  88. 101. 115. 132. 151. 172. 197. 224. 256.]
fbank = np.zeros([nfilts, nfft // 2 + 1])
# init scaler
if scale == "descendant" or scale == "constant":
c = 1
else:
c = 0
for i in range(0, nfilts):      # --> B
# compute scaler
if scale == "descendant":
c -= 1 / nfilts
c = c * (c > 0) + 0 * (c < 0)
elif scale == "ascendant":
c += 1 / nfilts
c = c * (c < 1) + 1 * (c > 1)
for j in range(int(bins[i]), int(bins[i + 4])):     # --> F
fc = bark_points[i+2]   # 中心频率
fb = fft2bark(j)        # Bark 频率
fbank[i, j] = c * Fm(fb, fc)
return np.abs(fbank)
if __name__ == "__main__":
nfilts = 22
NFFT = 512
fs = 16000
S = librosa.stft(wav, n_fft=NFFT, hop_length=NFFT // 2, win_length=NFFT, window="hann", center=False)
mag = np.abs(S)  # 幅度谱 (257, 127) librosa.magphase()
filterbanks = bark_filter_banks(nfilts=nfilts, nfft=NFFT, fs=fs, low_freq=0, high_freq=None, scale="constant")
# ================ 画三角滤波器 ===========================
FFT_len = NFFT // 2 + 1
fs_bin = fs // 2 / (NFFT // 2)  # 一个频点多少Hz
x = np.linspace(0, FFT_len, FFT_len)
plt.plot(x * fs_bin, filterbanks.T)
# plt.xlim(0)  # 坐标轴的范围
# plt.ylim(0, 1)
plt.tight_layout()
plt.grid(linestyle='--')
plt.show()
filter_banks = np.dot(filterbanks, mag)  # (M,F)*(F,T)=(M,T)
filter_banks = 20 * np.log10(filter_banks)  # dB
# ================ 绘制语谱图 ==========================
plt.figure()
librosa.display.specshow(filter_banks, sr=fs, x_axis='time', y_axis='linear', cmap="jet")
plt.xlabel('时间/s', fontsize=14)
plt.ylabel('频率/kHz', fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
def formatnum(x, pos):
return '$%d$' % (x / 1000)
formatter = FuncFormatter(formatnum)
plt.gca().yaxis.set_major_formatter(formatter)
plt.tight_layout()
plt.show()

## 等效矩阵带宽

Moore 和 Glasberg在1983 年 ，对于中等的声强和年轻的听者，人的听觉滤波器的带宽可以通过以下的多项式方程式近似：

$$公式1：\operatorname{ERB}(f)=6.23 \cdot f^{2}+93.39 \cdot f+28.52$$

$$公式2：\operatorname{ERB}(f)=24.7 \cdot(4.37 \cdot f+1)$$

def hz2erb_1983(f):
""" 中心频率f(Hz) f to ERB(Hz) """
f = f / 1000.0
return (6.23 * (f ** 2)) + (93.39 * f) + 28.52
def hz2erb_1990(f):
""" 中心频率f(Hz) f to ERB(Hz) """
f = f / 1000.0
return 24.7 * (4.37 * f + 1.0)
def hz2erb_1998(f):
""" 中心频率f(Hz) f to ERB(Hz)
hz2erb_1990 和 hz2erb_1990_2 的曲线几乎一模一样
M. Slaney, Auditory Toolbox, Version 2, Technical Report No: 1998-010, Internal Research Corporation, 1998
http://cobweb.ecn.purdue.edu/~malcolm/interval/1998-010/
"""
return 24.7 + (f / 9.26449)

$$公式3：\left\{\begin{array}{l} \operatorname{ERBs}(0)=0 \\ \frac{d f}{d \operatorname{ERBs}(f)}=\operatorname{ERB}(f) \end{array}\right.$$

$$\operatorname{ERBs}(f)=11.17 \cdot \ln \left(\frac{f+0.312}{f+14.675}\right)+43.0$$

$$\operatorname{ERBs}(f)=11.17268 \cdot \ln \left(1+\frac{46.06538 \cdot f}{f+14678.49}\right)$$

$$f=\frac{676170.4}{47.06538-e^{0.08950404 \cdot \operatorname{ERBs}(f)}}-14678.49$$

def erb_matlab_voicebox(f: float) -> float:
cuotient = (46.06538 * f) / float(f + 14678.49)
return 11.17268 * math.log(1 + cuotient)
def ierb_matlab_voicebox(erbf: float) -> float:
denom = 47.06538 - math.exp(0.08950404 - erbf)
return (676170.4 / float(denom)) - 14678.49

$$\operatorname{ERBS}(f)=21.4 \cdot \log _{10}(1+ \frac{4.37\cdot f}{1000})$$

## 线性滤波器组

# -*- coding:utf-8 -*-
# Author:凌逆战 | Never.Ling
# Date: 2022/5/28
"""

https://github.com/wil-j-wil/py_bank
https://github.com/flavioeverardo/erb_bands
"""
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示符号
class EquivalentRectangularBandwidth():
def __init__(self, nfreqs, sample_rate, total_erb_bands, low_freq, max_freq):
if low_freq == None:
low_freq = 20
if max_freq == None:
max_freq = sample_rate // 2
freqs = np.linspace(0, max_freq, nfreqs)  # 每个STFT频点对应多少Hz
self.EarQ = 9.265  # _ERB_Q
self.minBW = 24.7  # minBW
# 在ERB刻度上建立均匀间隔
erb_low = self.freq2erb(low_freq)  # 最低 截止频率
erb_high = self.freq2erb(max_freq)  # 最高 截止频率
# 在ERB频率上均分为（total_erb_bands + ）2个 频带
erb_lims = np.linspace(erb_low, erb_high, total_erb_bands + 2)
cutoffs = self.erb2freq(erb_lims)  # 将 ERB频率再转到 hz频率, 在线性频率Hz上找到ERB截止频率对应的频率
# self.nfreqs  F
# self.freqs # 每个STFT频点对应多少Hz
self.filters = self.get_bands(total_erb_bands, nfreqs, freqs, cutoffs)
def freq2erb(self, frequency):
""" [Hohmann2002] Equation 16"""
return self.EarQ * np.log(1 + frequency / (self.minBW * self.EarQ))
def erb2freq(self, erb):
""" [Hohmann2002] Equation 17"""
return (np.exp(erb / self.EarQ) - 1) * self.minBW * self.EarQ
def get_bands(self, total_erb_bands, nfreqs, freqs, cutoffs):
"""
获取erb bands、索引、带宽和滤波器形状
:param erb_bands_num: ERB 频带数
:param nfreqs: 频点数 F
:param freqs: 每个STFT频点对应多少Hz
:param cutoffs: 中心频率 Hz
:param erb_points: ERB频带界限 列表
:return:
"""
cos_filts = np.zeros([nfreqs, total_erb_bands])  # (F, ERB)
for i in range(total_erb_bands):
lower_cutoff = cutoffs[i]  # 上限截止频率 Hz
higher_cutoff = cutoffs[i + 2]  # 下限截止频率 Hz, 相邻filters重叠50%
lower_index = np.min(np.where(freqs > lower_cutoff))  # 下限截止频率对应的Hz索引 Hz。np.where 返回满足条件的索引
higher_index = np.max(np.where(freqs < higher_cutoff))  # 上限截止频率对应的Hz索引
avg = (self.freq2erb(lower_cutoff) + self.freq2erb(higher_cutoff)) / 2
rnge = self.freq2erb(higher_cutoff) - self.freq2erb(lower_cutoff)
cos_filts[lower_index:higher_index + 1, i] = np.cos(
(self.freq2erb(freqs[lower_index:higher_index + 1]) - avg) / rnge * np.pi)  # 减均值，除方差
# 加入低通和高通，得到完美的重构
filters = np.zeros([nfreqs, total_erb_bands + 2])  # (F, ERB)
filters[:, 1:total_erb_bands + 1] = cos_filts
# 低通滤波器上升到第一个余cos filter的峰值
higher_index = np.max(np.where(freqs < cutoffs[1]))  # 上限截止频率对应的Hz索引
filters[:higher_index + 1, 0] = np.sqrt(1 - np.power(filters[:higher_index + 1, 1], 2))
# 高通滤波器下降到最后一个cos filter的峰值
lower_index = np.min(np.where(freqs > cutoffs[total_erb_bands]))
filters[lower_index:nfreqs, total_erb_bands + 1] = np.sqrt(
1 - np.power(filters[lower_index:nfreqs, total_erb_bands], 2))
return cos_filts
if __name__ == "__main__":
fs = 16000
NFFT = 512  # 信号长度
ERB_num = 20
low_lim = 20  # 最低滤波器中心频率
high_lim = fs / 2  # 最高滤波器中心频率
freq_num = NFFT // 2 + 1
fs_bin = fs // 2 / (NFFT // 2)  # 一个频点多少Hz
x = np.linspace(0, freq_num, freq_num)
# ================ 画三角滤波器 ===========================
ERB = EquivalentRectangularBandwidth(freq_num, fs, ERB_num, low_lim, high_lim)
filterbanks = ERB.filters.T  # (257, 20)
plt.plot(x * fs_bin, filterbanks.T)
# plt.xlim(0)  # 坐标轴的范围
# plt.ylim(0, 1)
plt.tight_layout()
plt.grid(linestyle='--')
plt.show()
# ================ 绘制语谱图 ==========================
S = librosa.stft(wav, n_fft=NFFT, hop_length=NFFT // 2, win_length=NFFT, window="hann", center=False)
mag = np.abs(S)  # 幅度谱 (257, 127) librosa.magphase()
filter_banks = np.dot(filterbanks, mag)  # (M,F)*(F,T)=(M,T)
filter_banks = 20 * np.log10(filter_banks)  # dB
plt.figure()
librosa.display.specshow(filter_banks, sr=fs, x_axis='time', y_axis='linear', cmap="jet")
plt.xlabel('时间/s', fontsize=14)
plt.ylabel('频率/kHz', fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
def formatnum(x, pos):
return '$%d$' % (x / 1000)
formatter = FuncFormatter(formatnum)
plt.gca().yaxis.set_major_formatter(formatter)
plt.tight_layout()
plt.show()
View Code

## Gammatone 滤波器组

#### 历史

1972年Johannesma 提出了 gammatone 滤波器用来逼近recvor function：

$$时域表达式：g(t)=a t^{n-1} e^{-2 \pi b t} \cos (2 \pi f_c t+\phi_0)$$

# -*- coding:utf-8 -*-
# Author:凌逆战 | Never.Ling
# Date: 2022/5/24
"""

"""
import librosa
import librosa.display
import numpy as np
from scipy.fftpack import dct
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示符号
def erb_space(low_freq=50, high_freq=8000, n=64):
""" 计算中心频率(ERB scale)
:param min_freq: 中心频率域的最小频率。
:param max_freq: 中心频率域的最大频率。
:param nfilts: 滤波器的个数，即等于计算中心频率的个数。
:return: 一组中心频率
"""
ear_q = 9.26449
min_bw = 24.7
cf_array = -(ear_q * min_bw) + np.exp(
np.linspace(1, n, n) * (-np.log(high_freq + ear_q * min_bw) + np.log(low_freq + ear_q * min_bw)) / n) \
* (high_freq + ear_q * min_bw)
return cf_array
def gammatone_impulse_response(samplerate_hz, length_in_samples, center_freq_hz, p):
""" gammatone滤波器的时域公式
:param samplerate_hz: 采样率
:param length_in_samples: 信号长度
:param center_freq_hz: 中心频率
:param p: 滤波器阶数
:return: gammatone 脉冲响应
"""
# 生成一个gammatone filter (1990 Glasberg&Moore parametrized)
erb = 24.7 + (center_freq_hz / 9.26449)  # equivalent rectangular bandwidth.
# 中心频率
an = (np.pi * np.math.factorial(2 * p - 2) * np.power(2, float(-(2 * p - 2)))) / np.square(np.math.factorial(p - 1))
b = erb / an  # 带宽
a = 1  # 幅度(amplitude). 这在后面的归一化过程中有所不同。
t = np.linspace(1. / samplerate_hz, length_in_samples / samplerate_hz, length_in_samples)
gammatone_ir = a * np.power(t, p - 1) * np.exp(-2 * np.pi * b * t) * np.cos(2 * np.pi * center_freq_hz * t)
return gammatone_ir
def generate_filterbank(fs, fmax, L, N, p=4):
"""
L: 在样本中测量的信号的大小
N: 滤波器数量
p: Gammatone脉冲响应的阶数
"""
# 中心频率
if fs == 8000:
fmax = 4000
center_freqs = erb_space(50, fmax, N)  # 中心频率列表
center_freqs = np.flip(center_freqs)  # 反转数组
n_center_freqs = len(center_freqs)  # 中心频率的数量
filterbank = np.zeros((N, L))
# 为每个中心频率生成 滤波器
for i in range(n_center_freqs):
# aa = gammatone_impulse_response(fs, L, center_freqs[i], p)
filterbank[i, :] = gammatone_impulse_response(fs, L, center_freqs[i], p)
return filterbank
def gfcc(cochleagram, numcep=13):
feat = dct(cochleagram, type=2, axis=1, norm='ortho')[:, :numcep]
#    feat-= (np.mean(feat, axis=0) + 1e-8)#Cepstral mean substration
return feat
def cochleagram(sig_spec, filterbank, nfft):
"""
:param sig_spec: 语音频谱
:param filterbank: 时域滤波器组
:param nfft: fft_size
:return:
"""
filterbank = powerspec(filterbank, nfft)  # 时域滤波器组经过FFT变换
filterbank /= np.max(filterbank, axis=-1)[:, None]  # Normalize filters
cochlea_spec = np.dot(sig_spec, filterbank.T)  # 矩阵相乘
cochlea_spec = np.where(cochlea_spec == 0.0, np.finfo(float).eps, cochlea_spec)  # 把0变成一个很小的数
# cochlea_spec= np.log(cochlea_spec)-np.mean(np.log(cochlea_spec),axis=0)
cochlea_spec = np.log(cochlea_spec)
return cochlea_spec, filterbank
def powerspec(X, nfft):
# Fourier transform
Y = np.fft.fft(X, n=nfft)
Y = np.absolute(Y)
# non-redundant part
m = int(nfft / 2) + 1
Y = Y[:, :m]
return np.abs(Y) ** 2
if __name__ == "__main__":
nfilts = 22
NFFT = 512
fs = 16000
Order = 4
FFT_len = NFFT // 2 + 1
fs_bin = fs // 2 / (NFFT // 2)  # 一个频点多少Hz
x = np.linspace(0, FFT_len, FFT_len)
# ================ 画三角滤波器 ===========================
# gammatone_impulse_response = gammatone_impulse_response(fs/2, 512, 200, Order)    #  gammatone冲击响应
generate_filterbank = generate_filterbank(fs, fs // 2, FFT_len, nfilts, Order)
filterbanks = powerspec(generate_filterbank, NFFT)  # 时域滤波器组经过FFT变换
filterbanks /= np.max(filterbanks, axis=-1)[:, None]  # Normalize filters
print(generate_filterbank.shape)    # (22, 257)
# plt.plot(filterbanks.T)
plt.plot(x * fs_bin, filterbanks.T)
# plt.xlim(0)  # 坐标轴的范围
# plt.ylim(0, 1)
plt.tight_layout()
plt.grid(linestyle='--')
plt.show()
# ================ 绘制语谱图 ==========================
S = librosa.stft(wav, n_fft=NFFT, hop_length=NFFT // 2, win_length=NFFT, window="hann", center=False)
mag = np.abs(S)  # 幅度谱 (257, 127) librosa.magphase()
filter_banks = np.dot(filterbanks, mag)  # (M,F)*(F,T)=(M,T)
filter_banks = 20 * np.log10(filter_banks)  # dB
plt.figure()
librosa.display.specshow(filter_banks, sr=fs, x_axis='time', y_axis='linear', cmap="jet")
plt.xlabel('时间/s', fontsize=14)
plt.ylabel('频率/kHz', fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
def formatnum(x, pos):
return '$%d$' % (x / 1000)
formatter = FuncFormatter(formatnum)
plt.gca().yaxis.set_major_formatter(formatter)
plt.tight_layout()
plt.show()
View Code

频域表达式：\begin{aligned} H(f)=& a[R(f) \otimes S(f)] \\ =& \frac{a}{2}(n-1) !(2 \pi b)^{-n}\left\{e^{i \phi_0}\left[1+\frac{i(f-f_c)}{b} \right]^{-n}+e^{-i \phi_0}\left[1+\frac{i(f+f_c)}{b}\right]^{-n}\right\} \end{aligned}

# -*- coding:utf-8 -*-
# Author:凌逆战 | Never
# Date: 2022/5/24
"""
Gammatone-filter-banks implementation
based on https://github.com/mcusi/gammatonegram/
"""
import librosa
import librosa.display
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.ticker import FuncFormatter
plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示符号
# Slaney's ERB Filter constants
EarQ = 9.26449
minBW = 24.7
def generate_center_frequencies(min_freq, max_freq, filter_nums):
""" 计算中心频率(ERB scale)
:param min_freq: 中心频率域的最小频率。
:param max_freq: 中心频率域的最大频率。
:param filter_nums: 滤波器的个数，即等于计算中心频率的个数。
:return: 一组中心频率
"""
# init vars
n = np.linspace(1, filter_nums, filter_nums)
c = EarQ * minBW
# 计算中心频率
cfreqs = (max_freq + c) * np.exp((n / filter_nums) * np.log(
(min_freq + c) / (max_freq + c))) - c
return cfreqs
def compute_gain(fcs, B, wT, T):
""" 为了 阶数 计算增益和矩阵化计算
:param fcs: 中心频率
:param B: 滤波器的带宽
:param wT: 对应于用于频域计算的 w * T = 2 * pi * freq * T
:param T: 周期(单位秒s)，1/fs
:return:
Gain: 表示filter gains 的2d numpy数组
A: 用于最终计算的二维数组
"""
# 为了简化 预先计算
K = np.exp(B * T)
Cos = np.cos(2 * fcs * np.pi * T)
Sin = np.sin(2 * fcs * np.pi * T)
Smax = np.sqrt(3 + 2 ** (3 / 2))
Smin = np.sqrt(3 - 2 ** (3 / 2))
# 定义A矩阵的行
A11 = (Cos + Smax * Sin) / K
A12 = (Cos - Smax * Sin) / K
A13 = (Cos + Smin * Sin) / K
A14 = (Cos - Smin * Sin) / K
# 计算增益 (vectorized)
A = np.array([A11, A12, A13, A14])
Kj = np.exp(1j * wT)
Kjmat = np.array([Kj, Kj, Kj, Kj]).T
G = 2 * T * Kjmat * (A.T - Kjmat)
Coe = -2 / K ** 2 - 2 * Kj ** 2 + 2 * (1 + Kj ** 2) / K
Gain = np.abs(G[:, 0] * G[:, 1] * G[:, 2] * G[:, 3] * Coe ** -4)
return A, Gain
def gammatone_filter_banks(nfilts=22, nfft=512, fs=16000, low_freq=None, high_freq=None, scale="contsant", order=4):
""" 计算Gammatone-filterbanks, (G,F)
:param nfilts: filterbank中滤波器的数量 (Default 22)
:param nfft: FFT size (Default is 512)
:param fs: 采样率 (Default 16000 Hz)
:param low_freq: 最低频带 (Default 0 Hz)
:param high_freq: 最高频带 (Default samplerate/2)
:param scale: 选择Max bins 幅度 "ascend"(上升)，"descend"(下降)或 "constant"(恒定)(=1)。默认是"constant"
:param order: 滤波器阶数
:return: 一个大小为(nfilts, nfft/2 + 1)的numpy数组，包含滤波器组。
"""
# init freqs
high_freq = high_freq or fs / 2
low_freq = low_freq or 0
# define custom difference func
def Dif(u, a):
return u - a.reshape(nfilts, 1)
# init vars
fbank = np.zeros([nfilts, nfft])
width = 1.0
maxlen = nfft // 2 + 1
T = 1 / fs
n = 4
u = np.exp(1j * 2 * np.pi * np.array(range(nfft // 2 + 1)) / nfft)
idx = range(nfft // 2 + 1)
fcs = generate_center_frequencies(low_freq, high_freq, nfilts)  # 计算中心频率，转换到ERB scale
ERB = width * ((fcs / EarQ) ** order + minBW ** order) ** (1 / order)  # 计算带宽
B = 1.019 * 2 * np.pi * ERB
# compute input vars
wT = 2 * fcs * np.pi * T
pole = np.exp(1j * wT) / np.exp(B * T)
# compute gain and A matrix
A, Gain = compute_gain(fcs, B, wT, T)
# compute fbank
fbank[:, idx] = (
(T ** 4 / Gain.reshape(nfilts, 1)) *
np.abs(Dif(u, A[0]) * Dif(u, A[1]) * Dif(u, A[2]) * Dif(u, A[3])) *
np.abs(Dif(u, pole) * Dif(u, pole.conj())) ** (-n))
# 确保所有filters的最大值为1.0
try:
fbs = np.array([f / np.max(f) for f in fbank[:, range(maxlen)]])
except BaseException:
fbs = fbank[:, idx]
# compute scaler
if scale == "ascendant":
c = [
0,
]
for i in range(1, nfilts):
x = c[i - 1] + 1 / nfilts
c.append(x * (x < 1) + 1 * (x > 1))
elif scale == "descendant":
c = [
1,
]
for i in range(1, nfilts):
x = c[i - 1] - 1 / nfilts
c.append(x * (x > 0) + 0 * (x < 0))
else:
c = [1 for i in range(nfilts)]
# apply scaler
c = np.array(c).reshape(nfilts, 1)
fbs = c * np.abs(fbs)
return fbs
if __name__ == "__main__":
nfilts = 22
NFFT = 512
fs = 16000
FFT_len = NFFT // 2 + 1
fs_bin = fs // 2 / (NFFT // 2)  # 一个频点多少Hz
x = np.linspace(0, FFT_len, FFT_len)
# ================ 画三角滤波器 ===========================
filterbanks = gammatone_filter_banks(nfilts=22, nfft=512, fs=16000,
low_freq=None, high_freq=None,
scale="contsant", order=4)
print(filterbanks.shape)    # (22, 257)
plt.plot(x * fs_bin, filterbanks.T)
# plt.xlim(0)  # 坐标轴的范围
# plt.ylim(0, 1)
plt.tight_layout()
plt.grid(linestyle='--')
plt.show()
# ================ 绘制语谱图 ==========================
S = librosa.stft(wav, n_fft=NFFT, hop_length=NFFT // 2, win_length=NFFT, window="hann", center=False)
mag = np.abs(S)  # 幅度谱 (257, 127) librosa.magphase()
filter_banks = np.dot(filterbanks, mag)  # (M,F)*(F,T)=(M,T)
filter_banks = 20 * np.log10(filter_banks)  # dB
plt.figure()
librosa.display.specshow(filter_banks, sr=fs, x_axis='time', y_axis='linear', cmap="jet")
plt.xlabel('时间/s', fontsize=14)
plt.ylabel('频率/kHz', fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
def formatnum(x, pos):
return '$%d$' % (x / 1000)
formatter = FuncFormatter(formatnum)
plt.gca().yaxis.set_major_formatter(formatter)
plt.tight_layout()
plt.show()
View Code

1988年Holdsworth 等人进一步阐明了GTF的各种特性，而且提供了一个数字IIR滤波器设计方案。这个技术使得GTF能够比FIR更加容易且高效地实现，为后续出现一些 重要的实际应用 做了铺垫。听觉滤波的gammatone模型的 变化和改进 包括复数gammatone滤波器、gammachirp滤波器、全极点(all-pole)和一零(one-zero) gammatone滤波器、双边(two-sided)gammatone滤波器和滤波器级联(filter-cascade)模型，以及各种level相关和这些的动态非线性版本。

## 参考

【百度百科】 心理声学

【维基百科】 Bark scale

【维基百科】 Mel scale

【维基百科】 Gammatone filter （包含了C \ C++ \ mathematica \ matlab的代码实现）

【CSDN】 GammaTone 滤波器详解

【python库】 PyFilterbank

【代码】Brookes, Mike (22 December 2012). “frq2erb”VOICEBOX: Speech Processing Toolbox for MATLAB . Department of Electrical & Electronic Engineering, Imperial College, UK . Retrieved  20 January 2013.

【代码】Brookes, Mike (22 December 2012). “erb2frq”VOICEBOX: Speech Processing Toolbox for MATLAB . Department of Electrical & Electronic Engineering, Imperial College, UK . Retrieved  20 January 2013.

【论文】Smith, Julius O.; Abel, Jonathan S. (10 May 2007). “Equivalent Rectangular Bandwidth”Bark and ERB Bilinear Transforms . Center for Computer Research in Music and Acoustics (CCRMA), Stanford University, USA . Retrieved  20 January 2013.