Deep learning has permeated almost every industry, with computer vision being the hottest area. Many audio processing algorithms are likewise being steadily reshaped by deep learning, and VAD (voice activity detection), a standard audio front-end step, is one widely used example. A typical traditional VAD algorithm extracts features and fits Gaussian models, then uses the resulting per-segment probabilities to decide whether each segment is speech or noise (including silence). With carefully tuned parameters, traditional VAD can achieve respectable results, but in today's data-driven era, deep learning generally outperforms the traditional algorithms, provided the training data are sufficiently rich.
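As a point of reference for the traditional approach, here is a minimal sketch of a GMM-based frame classifier of the kind described above, using scikit-learn; the synthetic features, dimensions, and model settings are illustrative assumptions, not taken from this post:

import numpy as np
from sklearn.mixture import GaussianMixture

# Hypothetical features: in practice these would be MFCCs or log-energies
# extracted from labeled speech and noise segments.
rng = np.random.default_rng(0)
speech_feats = rng.normal(loc=2.0, scale=1.0, size=(500, 4))
noise_feats = rng.normal(loc=-2.0, scale=1.0, size=(500, 4))

# One GMM per class; frames are labeled by comparing log-likelihoods.
gmm_speech = GaussianMixture(n_components=4, random_state=0).fit(speech_feats)
gmm_noise = GaussianMixture(n_components=4, random_state=0).fit(noise_feats)

def gmm_vad(frame_feats):
    ll_speech = gmm_speech.score_samples(frame_feats)  # log-likelihood per frame
    ll_noise = gmm_noise.score_samples(frame_feats)
    return (ll_speech > ll_noise).astype(int)          # 1 = speech, 0 = noise/silence

print(gmm_vad(rng.normal(2.0, 1.0, size=(10, 4))))     # expect mostly 1s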
The rest of this post analyzes a VAD implementation from the deep-learning perspective. Implementing VAD with deep learning is not difficult; the key stage is data preparation, while the network architecture and loss function are comparatively easy to design, because VAD is essentially a classification problem. It is similar to keyword spotting (wake-word detection), except that VAD only needs binary classification whereas wake-word detection is multi-class (see the classifier sketch after the list below). The concrete steps are as follows:
(1) Creating data labels
(2) Synthesizing noisy data
(3) Feature extraction
(4) Building the network model
(5) Model training
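Because VAD reduces to per-frame binary classification, a small network trained with binary cross-entropy suffices in principle. Below is a minimal PyTorch sketch under assumed settings (40-dimensional per-frame features, a plain MLP); the architecture and sizes are illustrative, not the model this post ultimately trains:

import torch
import torch.nn as nn

class VadNet(nn.Module):
    """Per-frame binary classifier: feature vector in, speech logit out."""
    def __init__(self, feat_dim=40, hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(feat_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),  # a single logit: speech vs. non-speech
        )

    def forward(self, x):
        return self.net(x).squeeze(-1)

model = VadNet()
criterion = nn.BCEWithLogitsLoss()  # binary cross-entropy on the raw logit
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

feats = torch.randn(32, 40)                   # dummy batch of 32 frames
labels = torch.randint(0, 2, (32,)).float()   # 1 = speech, 0 = noise/silence
optimizer.zero_grad()
loss = criterion(model(feats), labels)
loss.backward()
optimizer.step()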
(1) Creating data labels
Hand-labeling real audio directly is too labor-intensive, so here we record clean data (audio containing only speech and silence, with no noise) and create labels from it; for clean data, speech can be detected directly from frame energy. The labeling code is as follows:
audio_tools.py
import math
import numpy as np
from scipy.io import wavfile


def add_wgn(s, var=1e-4):
    """
    Add white Gaussian noise to the signal.
    If no variance is given, simply add jitter.
    Jitter helps eliminate all-zero values.
    """
    np.random.seed(0)
    noise = np.random.normal(0, var, len(s))
    return s + noise


def read_wav(filename):
    """
    Read a wav file.
    Normalizes the signal to values between -1 and 1.
    Also adds some jitter to remove all-zero segments.
    """
    fs, s = wavfile.read(filename)  # scipy reads ints
    s = np.array(s) / float(max(abs(s)))
    s = add_wgn(s)  # add jitter for numerical stability
    return fs, s

# ==============================================================================

def enframe(x, win_len, hop_len):
    """
    Receives a 1D numpy array and divides it into frames.
    Outputs a numpy matrix with one frame per row.
    """
    x = np.squeeze(x)
    if x.ndim != 1:
        raise TypeError("enframe input must be a 1-dimensional array.")
    # np.int was removed in NumPy 1.20+; the builtin int behaves the same here
    n_frames = 1 + int(math.ceil((len(x) - win_len) / float(hop_len)))
    x_framed = np.zeros((n_frames, win_len))
    # zero-pad the tail so the last frame is full length
    padlen = int((n_frames - 1) * hop_len + win_len)
    zeros = np.zeros((padlen - len(x),))
    padsignal = np.concatenate((x, zeros))
    for i in range(n_frames):
        x_framed[i] = padsignal[i * hop_len : i * hop_len + win_len]
    return x_framed


def deframe(x_framed, win_len, hop_len):
    """
    Interpolates 1D data with framed alignments into per-sample values.
    This function helps as a visual aid and can also be used to change
    the frame rate of features, e.g. energy, zero-crossing, etc.
    """
    n_frames = len(x_framed)
    n_samples = n_frames * hop_len + win_len
    x_samples = np.zeros((n_samples, 1))
    for i in range(n_frames):
        x_samples[i * hop_len : i * hop_len + win_len] = x_framed[i]
    return x_samples


if __name__ == '__main__':
    pass
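As a quick sanity check, enframe and deframe can be exercised like this (a hypothetical usage example, not part of the original script):

import numpy as np
from audio_tools import enframe, deframe

fs = 16000
x = np.arange(fs, dtype=float)                          # one second of dummy samples
frames = enframe(x, int(fs * 0.025), int(fs * 0.010))   # 25 ms window, 10 ms hop
print(frames.shape)                                     # (n_frames, 400)
per_frame = frames.mean(axis=1, keepdims=True)          # one value per frame
samples = deframe(per_frame, int(fs * 0.025), int(fs * 0.010))
print(samples.shape)                                    # back to per-sample resolution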
unsupervised_vad.py
#! /usr/bin/python
# Voice Activity Detection (VAD) tool.
# Use the vad_help() function for instructions.
# Navid Shokouhi, December 2012.
# Updated: May 2017 for speaker-recognition collaboration.
from audio_tools import *
import numpy as np
import os

## Function definitions:
def vad_help():
    """Voice Activity Detection (VAD) tool.
    Navid Shokouhi May 2017.
    """
    print("Usage:")
    print("python unsupervised_vad.py")
#### Display tools
def plot_this(s, title=''):
    """Plot a 1D signal as a line or a 2D array as an image."""
    import pylab
    s = s.squeeze()
    if s.ndim == 1:
        pylab.plot(s)
    else:
        pylab.imshow(s, aspect='auto')
    pylab.title(title)
    pylab.show()


def plot_these(s1, s2):
    import pylab
    try:
        # If values are numpy arrays, normalize before plotting
        pylab.plot(s1 / max(abs(s1)), color='red', label='s1')
        pylab.plot(s2 / max(abs(s2)), color='blue', label='s2')
    except Exception:
        # Values are lists
        pylab.plot(s1, color='red', label='s1')
        pylab.plot(s2, color='blue', label='s2')
    pylab.legend()
    pylab.show()


def plot_these1(s1, s2):
    import matplotlib.pyplot as plt
    plt.figure(figsize=(16, 9))
    try:
        # If values are numpy arrays, normalize before plotting
        plt.plot(s1 / max(abs(s1)), color='red', label='vad')
        plt.plot(s2 / max(abs(s2)), color='blue', label='signal')
    except Exception:
        # Values are lists
        plt.plot(s1, color='red', label='vad')
        plt.plot(s2, color='blue', label='signal')
    plt.legend()
    plt.show()
#### Energy tools
def zero_mean(xframes):
    """
    Remove the mean of each frame in a framed signal;
    return zero-mean frames.
    """
    m = np.mean(xframes, axis=1)
    xframes = xframes - np.tile(m, (xframes.shape[1], 1)).T
    return xframes


def compute_nrg(xframes):
    # Per-frame energy, normalized by the frame length (samples per frame)
    frame_len = xframes.shape[1]
    return np.diagonal(np.dot(xframes, xframes.T)) / float(frame_len)


def compute_log_nrg(xframes):
    # Per-frame log energy, mean/variance normalized
    frame_len = xframes.shape[1]
    raw_nrgs = np.log(compute_nrg(xframes + 1e-5)) / float(frame_len)
    return (raw_nrgs - np.mean(raw_nrgs)) / np.sqrt(np.var(raw_nrgs))


def power_spectrum(xframes):
    """
    xframes: input signal, one frame per row
    """
    X = np.fft.fft(xframes, axis=1)
    # integer division: float slice indices are invalid in Python 3
    X = np.abs(X[:, :X.shape[1] // 2])**2
    return np.sqrt(X)
def nrg_vad(xframes, percent_thr, nrg_thr=0., context=5):
    """
    Picks frames with high energy as determined by a
    user-defined threshold.
    This function also uses a 'context' parameter to
    resolve the fluctuating nature of thresholding.
    context is an integer value determining the number
    of neighboring frames that should be used to decide
    if a frame is voiced.
    The log-energy values are subject to mean and variance
    normalization to simplify picking the right threshold.
    In this framework, the default threshold is 0.0.
    """
    xframes = zero_mean(xframes)
    n_frames = xframes.shape[0]
    # Compute per-frame energies:
    xnrgs = compute_log_nrg(xframes)
    xvad = np.zeros((n_frames, 1))
    for i in range(n_frames):
        start = max(i - context, 0)
        end = min(i + context, n_frames - 1)
        # end is inclusive, so slice to end + 1 to match n_total
        n_above_thr = np.sum(xnrgs[start:end + 1] > nrg_thr)
        n_total = end - start + 1
        xvad[i] = 1. * ((float(n_above_thr) / n_total) > percent_thr)
    return xvad
def read_audio_file1(path, fmt, flag=0):
    """Recursively collect files under path with the given extension."""
    files = []
    names = []
    for root, dirs, filenames in os.walk(path):
        for filename in filenames:
            if filename.endswith(fmt):
                file_path = root + '/' + filename
                files.append(file_path)
                filename = filename.split('.')[0]
                if flag == 1:
                    # Build a unique name from the last three path components
                    name = file_path.split('.')[0]
                    name = name.split('/')
                    filename = name[-3] + '_' + name[-2] + '_' + name[-1]
                names.append(filename)
    return files, names
def max_filter(vads):
    """
    Smooth raw frame decisions with a sliding majority vote:
    a frame is voiced if more than half of the frames in a
    20-frame window around it are voiced.
    Returns the smoothed decisions both as a float array (for
    plotting) and as a plain int list (for writing labels).
    """
    hist = 0
    win_len = 20
    half_len = int(win_len / 2)
    vad_len = len(vads)
    new_vads = []
    wri_vads = []
    for i, vad in enumerate(vads):
        # Keep the leading frames unchanged
        if i < win_len:
            new_vads.append(float(vad))
            wri_vads.append(int(vad))
            continue
        if i < vad_len - half_len:
            # Majority vote over the window centered at i
            for j in range(i - half_len, i + half_len):
                hist = hist + vads[j]
            if hist > half_len:
                new_vads.append(1.)
                wri_vads.append(1)
            else:
                new_vads.append(0.)
                wri_vads.append(0)
        else:
            # Keep the trailing frames unchanged
            new_vads.append(float(vad))
            wri_vads.append(int(vad))
        hist = 0
    new_vads = np.array(new_vads)
    new_vads = new_vads.reshape(len(new_vads), 1)
    return new_vads, wri_vads
def get_start_end_pts(new_vad):
    """
    Convert per-frame 0/1 decisions into lists of segment
    start and end frame indices.
    """
    starts = []
    ends = []
    flags = 1
    for i, vad in enumerate(new_vad):
        # Speech starting at the very first frame
        if int(new_vad[0]) == 1 and flags == 1:
            starts.append(0)
            flags = 0
        if int(new_vad[i]) == 0 and int(new_vad[i + 1]) == 1:
            starts.append(i)      # 0 -> 1 transition
        elif int(new_vad[i]) == 1 and int(new_vad[i + 1]) == 0:
            ends.append(i)        # 1 -> 0 transition
        if i == len(new_vad) - 2:
            break
    # Speech running through the last frame
    if int(new_vad[-2]) == 1 and int(new_vad[-1]) == 1:
        ends.append(len(new_vad) - 1)
    return starts, ends
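# Example (hypothetical input, not from the post): for
# new_vad = [0, 0, 1, 1, 1, 0, 0, 1, 1], get_start_end_pts returns
# starts = [1, 6] and ends = [4, 8]: each start is the frame just
# before a 0 -> 1 transition, each end the last voiced frame.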
def write_startEnd_info(filename, starts, ends):
    assert len(starts) == len(ends)
    with open(filename, 'w') as f:
        for start, end in zip(starts, ends):
            if start > end:
                print('==========start end info err========')
            f.write(str(start) + ' ' + str(end) + '\n')


def write_frame_labels(fp, vads, name):
    # One line per utterance: "<name>:<space-separated 0/1 labels>"
    fp.write(name + ':')
    for v in vads:
        fp.write(str(v) + ' ')
    fp.write('\n')
    return fp
if __name__ == '__main__':
    fs = 16000
    win_len = int(fs * 0.025)   # 25 ms frames
    hop_len = int(fs * 0.010)   # 10 ms hop
    files, names = read_audio_file1('../data/clean', '.wav')
    label_p = '../data/speech_labels.txt'
    fp = open(label_p, 'w')
    cnt = 0
    ratio = 0
    acc_sum = 0
    pos_sum = 0
    for file, name in zip(files, names):
        fs, s = read_wav(file)
        print('cnt = ', cnt)
        cnt = cnt + 1
        sframes = enframe(s, win_len, hop_len)  # rows: frame index, cols: samples in each frame
        # percent_high_nrg is the VAD context ratio. It helps smooth the
        # output VAD decisions. Higher values are stricter.
        percent_high_nrg = 0.05
        vad = nrg_vad(sframes, percent_high_nrg)
        new_vad, wri_vad = max_filter(vad)
        plot_these1(deframe(new_vad, win_len, hop_len), s)
        acc_sum = acc_sum + len(wri_vad)
        pos_sum = pos_sum + np.sum(wri_vad)
        # Track the fraction of frames labeled as speech
        if cnt % 100 == 0:
            ratio = pos_sum / acc_sum
            print('cnt = %d, acc_sum = %d, pos_sum = %d, ratio = %f' % (cnt, acc_sum, pos_sum, ratio))
        fp = write_frame_labels(fp, wri_vad, name)
    fp.close()
    print('ratio = ', ratio)
Running the script on a single audio clip produces a plot of the smoothed VAD decision (red) overlaid on the normalized waveform (blue):
As the figure shows, the labels produced by this method are quite accurate.
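For the later training stages, the resulting speech_labels.txt (one "name:0 1 1 ..." line per utterance, as written by write_frame_labels above) can be read back with a small parser; this load_frame_labels helper is a hypothetical sketch, not part of the original code:

import numpy as np

def load_frame_labels(path='../data/speech_labels.txt'):
    # Each line has the form "<utterance_name>:<space-separated 0/1 labels> "
    labels = {}
    with open(path) as f:
        for line in f:
            name, vals = line.rstrip('\n').split(':', 1)
            labels[name] = np.array([int(v) for v in vals.split()])
    return labels

labels = load_frame_labels()
for name, frame_labels in labels.items():
    print(name, frame_labels.shape, frame_labels[:10])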