## 训练阶段

Label： 数据的 label为纯净语音信号的幅度谱 ，这里只需要一帧就够了。

## 增强阶段

spectrum = magnitude * np.exp(1.0j * phase)

## 时频掩蔽

1、我们假设信号能量稀疏的，即对于大多数时频区域它的能量为0，如下图所示，我们可以看到大多数区域的值，即频域能量为0。

2、我们假设信号能量不相交的，即它们的时频区域不重叠或者重叠较少，如下图所示，我们可以看到时频区域不为0的地方不重叠或者有较少部分的重叠。

## 理想二值掩蔽（Ideal Binary Mask，IBM）

$$公式1：I B M(t, f)=\left\{\begin{array}{l} 1, \operatorname{SNR}(t, f)>L C \\ 0, \text { else } \end{array}\right.$$

$$公式2：\operatorname{SNR}(t, f)=10 * \log 10\left(\frac{|S(t, f)|^{2}}{|N(t, f)|^{2}}\right)$$

def IBM(clean_speech, noise):
    """Compute the ideal binary mask (IBM).

    A time-frequency bin is set to 1 when the clean-speech magnitude is at
    least the noise magnitude (local SNR >= 0 dB, i.e. LC = 0), else 0.

    Erdogan, Hakan, et al. "Phase-sensitive and recognition-boosted speech
    separation using deep recurrent neural networks." ICASSP, 2015.

    :param clean_speech: clean speech STFT (complex spectrogram)
    :param noise: noise STFT (complex spectrogram)
    :return: float32 binary mask with the same shape as the inputs
    """
    mask = np.zeros(np.shape(clean_speech), dtype=np.float32)
    # |S| >= |N|  <=>  SNR(t, f) >= 0 dB
    mask[np.abs(clean_speech) >= np.abs(noise)] = 1.0
    return mask

def IBM_SNR(clean_speech, noise_speech):
    """Compute the ideal binary mask (IBM) from a magnitude ratio.

    The ratio |S|**alpha / |N|**alpha is binarized with a majority-vote
    threshold theta = 0.5.

    Erdogan, Hakan, et al. "Phase-sensitive and recognition-boosted speech
    separation using deep recurrent neural networks." ICASSP, 2015.

    :param clean_speech: clean speech STFT
    :param noise_speech: noisy speech STFT
    :return: binary mask (1.0 where the ratio >= theta, else 0.0)
    """
    # np.float was removed in NumPy >= 1.24; use the builtin float.
    _eps = np.finfo(float).eps  # avoid division by zero
    theta = 0.5  # a majority vote
    alpha = 1  # ratio of magnitudes
    mask = np.divide(np.abs(clean_speech) ** alpha,
                     (_eps + np.abs(noise_speech) ** alpha))
    # The original snippet computed the ratio but never thresholded it with
    # theta nor returned anything; restore both steps.
    return np.where(mask >= theta, 1.0, 0.0)

def IBM_SNR(clean_speech, noise_speech, delta_size):
    """Compute the ideal binary mask (IBM) from the local SNR in dB.

    A bin is 1 when 10*log10(|S|^2 / |N|^2) >= local_snr (0 dB), else 0.
    NOTE(review): this definition shadows the two-argument IBM_SNR defined
    earlier in the file; consider renaming one of them.

    :param clean_speech: clean speech STFT
    :param noise_speech: noisy speech STFT (used as the denominator here;
        formula 2 strictly uses the noise STFT -- confirm with callers)
    :param delta_size: number of context frames trimmed from both ends of
        the time axis (columns)
    :return: binary mask
    """
    # np.float was removed in NumPy >= 1.24; use the builtin float.
    _eps = np.finfo(float).eps  # avoid division by zero
    local_snr = 0
    # _eps was defined but unused in the original, leaving a possible
    # division by zero on silent bins.
    snr_db = 10. * np.log10(np.abs(clean_speech) ** 2
                            / (np.abs(noise_speech) ** 2 + _eps))
    ibm = np.where(snr_db >= local_snr, 1., 0.)
    if delta_size > 0:
        # Drop the frames that have no full context window.
        ibm = ibm[:, delta_size: -delta_size]
    return ibm
View Code

## 理想浮值掩蔽（Ideal Ratio Mask，IRM）

$$公式2：|\boldsymbol{Y}(t, f)|^{2}=|\boldsymbol{S}(t, f)+\boldsymbol{N}(t, f)|^{2}=|\boldsymbol{S}(t, f)|^{2}+|\boldsymbol{N}(t, f)|^{2}$$

$$公式3：I R M(t, f)=\left(\frac{|S(t, f)|^{2}}{|Y(t, f)|^{2}}\right)^{\beta} =\left(\frac{|S(t, f)|^{2}}{|S(t, f)|^{2}+|N(t, f)|^{2}}\right)^{\beta}$$

def IRM(clean_speech, noise):
    """Compute the ideal ratio mask (IRM) on magnitudes.

    mask = |S| / (|S| + |N|) -- the magnitude-domain variant of formula 3.

    "Phase-sensitive and recognition-boosted speech separation using deep
    recurrent neural networks," ICASSP 2015, Brisbane, April, 2015.

    :param clean_speech: clean speech STFT
    :param noise: noise STFT
    :return: ratio mask in [0, 1]
    """
    # np.float was removed in NumPy >= 1.24; use the builtin float.
    _eps = np.finfo(float).eps  # avoid a zero denominator
    mask = np.abs(clean_speech) / (np.abs(clean_speech) + np.abs(noise) + _eps)
    # The original snippet computed the mask but never returned it.
    return mask

def Wiener_like(clean_speech, noise):
    """Compute a Wiener-like (energy-domain) ratio mask.

    mask = |S|^2 / (|S|^2 + |N|^2) -- formula 3 with beta = 1.
    The original snippet lost the opening triple quote of this docstring,
    which made the function a syntax error; restored here.

    "Phase-sensitive and recognition-boosted speech separation using deep
    recurrent neural networks," ICASSP 2015, Brisbane, April, 2015.

    :param clean_speech: clean speech STFT
    :param noise: noise STFT
    :return: ratio mask in [0, 1]
    """
    # np.float was removed in NumPy >= 1.24; use the builtin float.
    _eps = np.finfo(float).eps  # avoid a zero denominator
    mask = np.divide((np.abs(clean_speech) ** 2 + _eps),
                     (np.abs(clean_speech) ** 2 + np.abs(noise) ** 2) + _eps)
    return mask

## 理想幅度掩蔽（Ideal Amplitude Mask，IAM）

$$公式4：\operatorname{IAM}(t, f)=\frac{|S(t, f)|}{|Y(t, f)|}$$

def IAM(clean_speech, noise_speech):
    """Compute the ideal amplitude mask (IAM), formula 4.

    mask = |S| / |Y| where Y is the noisy mixture; values may exceed 1
    when the clean magnitude dominates the mixture.

    "Phase-sensitive and recognition-boosted speech separation using deep
    recurrent neural networks," ICASSP 2015, Brisbane, April, 2015.

    :param clean_speech: clean speech STFT
    :param noise_speech: noisy speech STFT
    :return: non-negative amplitude mask
    """
    # np.float was removed in NumPy >= 1.24; use the builtin float.
    _eps = np.finfo(float).eps  # avoid division by zero
    mask = np.abs(clean_speech) / (np.abs(noise_speech) + _eps)
    return mask

## 相位敏感掩蔽（Phase Sensitive Mask，PSM）

PSM在形式上是 IAM 乘上纯净语音和带噪语音之间的余弦相似度

$$公式5：P S M(t, f)=\frac{|S(t, f)|}{|Y(t, f)|} \cos \left(\theta^{S}-\theta^{Y}\right)$$

PSM数值分布直方图

def PSM(clean_speech, noise_speech):
    """Compute the phase-sensitive mask (PSM), formula 5.

    PSM = |S| / |Y| * cos(theta_S - theta_Y): the IAM scaled by the cosine
    of the phase difference between clean and noisy spectra. The result may
    be negative or exceed 1.

    :param clean_speech: clean speech STFT (complex)
    :param noise_speech: noisy speech STFT (complex)
    :return: real-valued phase-sensitive mask
    """
    # np.float was removed in NumPy >= 1.24; use the builtin float.
    _eps = np.finfo(float).eps  # avoid a zero denominator
    clean_speech_phase = np.angle(clean_speech)
    noise_speech_phase = np.angle(noise_speech)
    # _eps was defined but unused in the original, leaving a possible
    # division by zero on silent bins.
    mask = (np.abs(clean_speech) / (np.abs(noise_speech) + _eps)
            * np.cos(clean_speech_phase - noise_speech_phase))
    # Truncated PSM variant: clip the cosine term to [0, 1] before scaling.
    # Theta = np.clip(np.cos(clean_speech_phase - noise_speech_phase), a_min=0., a_max=1.)
    # mask = np.divide(np.abs(clean_speech), _eps + np.abs(noise_speech)) * Theta
    return mask

## 复数理想浮值掩蔽（Complex Ideal Ratio Mask，cIRM）

$条件：\left\{ \begin{array}{l}Y = {Y_r} + i{Y_i}\\M = {M_r} + i{M_i}\\S = {S_r} + i{S_i}\\{S_{t,f}} = {M_{t,f}}*{Y_{t,f}}\end{array} \right.$ $\Rightarrow$ ${S_r} + i{S_i} = ({M_r} + i{M_i})*({Y_r} + i{Y_i}) = ({M_r}{Y_r} - {M_i}{Y_i}) + i({M_r}{Y_i} + {M_i}{Y_r})$，

def cIRM(clean_speech, noise_speech):
    """Compute the complex ideal ratio mask (cIRM).

    Solves S = M * Y for the complex mask, i.e. M = S * conj(Y) / |Y|^2,
    written out in real and imaginary parts as in the derivation above.

    :param clean_speech: clean speech STFT (complex)
    :param noise_speech: noisy speech STFT (complex)
    :return: complex mask M such that M * noise_speech ~= clean_speech
    """
    _eps = np.finfo(float).eps  # guard silent bins against division by zero
    # |Y|^2, computed once instead of twice as in the original.
    denominator = np.real(noise_speech) ** 2 + np.imag(noise_speech) ** 2 + _eps
    cIRM_r = (np.real(noise_speech) * np.real(clean_speech)
              + np.imag(noise_speech) * np.imag(clean_speech)) / denominator
    cIRM_i = (np.real(noise_speech) * np.imag(clean_speech)
              - np.imag(noise_speech) * np.real(clean_speech)) / denominator
    mask = cIRM_r + cIRM_i * 1j
    return mask

## 总结

| 度量 | IBM | IRM | IAM | PSM | cIRM |
| ---- | ---- | ---- | ---- | ---- | ---- |
| PESQ | 2.47 | 3.33 | 3.45 | 3.71 | 4.49 |
| STOI | 0.91 | 0.94 | 0.97 | 0.97 | 1 |

## 题外话

Label ：数据的label为根据信噪比计算的IBM或者IRM，这里只需要一帧就够了

enhance_magnitude = np.multiply(magnitude, mask)

## Demo效果以及代码

"""
@FileName: IBM.py
@Description: Implement IBM
@Author: Ryuk
@CreateDate: 2020/05/08
@LastEditTime: 2020/05/08
@LastEditors: Please set LastEditors
@Version: v0.1
"""
​
import numpy as np
import librosa
from sklearn.preprocessing import StandardScaler
from keras.layers import *
from keras.models import Sequential
​
​
def generateDataset():
    """Build (feature, label) pairs for mask training from a wav pair.

    Features are 5-frame context windows of the noisy magnitude spectrum,
    flattened to 257*5 and standardized; labels are the rounded
    clean/noisy magnitude ratio of the centre frame (an IBM-style target).

    :return: (feature, label) arrays with matching first dimension
    """
    mix, sr = librosa.load("./mix.wav", sr=8000)
    clean, sr = librosa.load("./clean.wav", sr=8000)

    win_length = 256
    hop_length = 128
    nfft = 512  # 512-point FFT -> 257 frequency bins

    mix_spectrum = librosa.stft(mix, win_length=win_length, hop_length=hop_length, n_fft=nfft)
    clean_spectrum = librosa.stft(clean, win_length=win_length, hop_length=hop_length, n_fft=nfft)

    mix_mag = np.abs(mix_spectrum).T    # (frames, 257)
    clean_mag = np.abs(clean_spectrum).T

    # A 5-frame sliding window exists for shape[0] - 4 centre frames.
    frame_num = mix_mag.shape[0] - 4
    feature = np.zeros([frame_num, 257 * 5])
    k = 0
    # The original looped range(frame_num - 4), leaving the last 4 feature
    # rows all-zero; iterate over every window instead.
    for i in range(frame_num):
        frame = mix_mag[k:k + 5]
        feature[i] = np.reshape(frame, 257 * 5)
        k += 1

    # Rounded magnitude ratio as a binary-ish target; values > 1 are
    # possible when clean_mag > mix_mag (NOTE(review): consider clipping
    # to [0, 1] for a strict IBM).
    snr = np.divide(clean_mag, mix_mag)
    mask = np.around(snr, 0)

    # The original returned an undefined `label`; align the label with the
    # centre frame of each 5-frame feature window.
    label = mask[2:-2]

    ss = StandardScaler()
    feature = ss.fit_transform(feature)
    return feature, label
​
​
def getModel():
    """Build and return the Keras model used for mask estimation.

    NOTE(review): the layer definitions appear to have been stripped from
    this snippet -- the returned Sequential model has no layers and cannot
    be trained as-is; restore the original architecture before use.
    """
    return Sequential()
​
def train(feature, label, model):
    """Compile and fit the model, then save it to ./model.h5.

    :param feature: training inputs, shape (samples, 257*5)
    :param label: training targets (mask values per centre frame)
    :param model: Keras model to train (saved in place)
    """
    # The original snippet lost the `model.compile(` line, leaving only its
    # keyword arguments. Restore a standard MSE regression setup
    # (NOTE(review): the original optimizer is unknown; 'adam' assumed).
    model.compile(optimizer='adam',
                  loss='mse',
                  metrics=['mse'])
    model.fit(feature, label, batch_size=128, epochs=20, validation_split=0.1)
    model.save("./model.h5")
​
def main():
    """End-to-end training: build the dataset, build the model, train."""
    feature, label = generateDataset()
    model = getModel()
    train(feature, label, model)


if __name__ == "__main__":
    main()

"""
@FileName: Inference.py
@Description: Implement Inference
@Author: Ryuk
@CreateDate: 2020/05/08
@LastEditTime: 2020/05/08
@LastEditors: Please set LastEditors
@Version: v0.1
"""
​
import librosa
import numpy as np
from basic_functions import *
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from keras.models import load_model
​
def show(data, s):
    """Plot two waveforms in stacked subplots and display the figure.

    :param data: original (noisy) time-domain signal, top subplot
    :param s: enhanced time-domain signal, bottom subplot
    """
    plt.figure(1)
    ax1 = plt.subplot(2, 1, 1)
    ax2 = plt.subplot(2, 1, 2)
    plt.sca(ax1)
    plt.plot(data)
    plt.sca(ax2)
    plt.plot(s)
    plt.show()
​
​
# ---- Inference: enhance test.wav with the trained mask-estimation model ----
data, fs = librosa.load("./test.wav", sr=8000)

win_length = 256
hop_length = 128
nfft = 512  # 512-point FFT -> 257 frequency bins

spectrum = librosa.stft(data, win_length=win_length, hop_length=hop_length, n_fft=nfft)
magnitude = np.abs(spectrum).T
phase = np.angle(spectrum).T

# Build the same 5-frame context features used at training time.
frame_num = magnitude.shape[0] - 4
feature = np.zeros([frame_num, 257 * 5])
k = 0
# The original looped range(frame_num - 4), leaving the last 4 feature rows
# all-zero; iterate over every window instead.
for i in range(frame_num):
    frame = magnitude[k:k + 5]
    feature[i] = np.reshape(frame, 257 * 5)
    k += 1

ss = StandardScaler()
feature = ss.fit_transform(feature)

# The original snippet used `mask` without ever defining it -- the model
# loading/prediction step was stripped. Restore it.
model = load_model("./model.h5")
mask = model.predict(feature)

# Drop 2 context frames at each end so magnitude/phase align with mask rows.
magnitude = magnitude[2:-2]
en_magnitude = np.multiply(magnitude, mask)
phase = phase[2:-2]

# Recombine the enhanced magnitude with the noisy phase (see formula in the
# enhancement section) and invert the STFT.
en_spectrum = en_magnitude.T * np.exp(1.0j * phase.T)
frame = librosa.istft(en_spectrum, win_length=win_length, hop_length=hop_length)

show(data, frame)
# NOTE(review): librosa.output was removed in librosa 0.8; on a modern
# librosa use soundfile.write("./output.wav", frame, 8000) instead.
librosa.output.write_wav("./output.wav", frame, sr=8000)

## 参考

【论文】2020_李劲东_基于深度学习的单通道语音增强研究

【博客文章】 DNN单通道语音增强(附Demo代码)