Deep learning has permeated almost every industry, with computer vision being the hottest area. Many audio processing algorithms are likewise being steadily reshaped by deep learning, and VAD (voice activity detection), a standard audio front-end step, is one widely used example. A typical traditional VAD algorithm extracts features and fits Gaussian models, then uses the resulting per-segment probabilities to decide whether each segment is speech or noise (including silence). With carefully tuned parameters, traditional VAD can achieve respectable results, but in today's data-driven era, deep learning generally outperforms the traditional algorithms, provided the training data are sufficiently rich.
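As a point of reference for the traditional approach, here is a minimal sketch of a GMM-based frame classifier of the kind described above, using scikit-learn; the synthetic features, dimensions, and model settings are illustrative assumptions, not taken from this post:

import numpy as np
from sklearn.mixture import GaussianMixture

# Hypothetical features: in practice these would be MFCCs or log-energies
# extracted from labeled speech and noise segments.
rng = np.random.default_rng(0)
speech_feats = rng.normal(loc=2.0, scale=1.0, size=(500, 4))
noise_feats = rng.normal(loc=-2.0, scale=1.0, size=(500, 4))

# One GMM per class; frames are labeled by comparing log-likelihoods.
gmm_speech = GaussianMixture(n_components=4, random_state=0).fit(speech_feats)
gmm_noise = GaussianMixture(n_components=4, random_state=0).fit(noise_feats)

def gmm_vad(frame_feats):
    ll_speech = gmm_speech.score_samples(frame_feats)  # log-likelihood per frame
    ll_noise = gmm_noise.score_samples(frame_feats)
    return (ll_speech > ll_noise).astype(int)          # 1 = speech, 0 = noise/silence

print(gmm_vad(rng.normal(2.0, 1.0, size=(10, 4))))     # expect mostly 1s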
The rest of this post analyzes a VAD implementation from the deep-learning perspective. Implementing VAD with deep learning is not difficult; the key stage is data preparation, while the network architecture and loss function are comparatively easy to design, because VAD is essentially a classification problem. It is similar to keyword spotting (wake-word detection), except that VAD only needs binary classification whereas wake-word detection is multi-class (see the classifier sketch after the list below). The concrete steps are as follows:
(1) Creating data labels
(2) Synthesizing noisy data
(3) Feature extraction
(4) Building the network model
(5) Model training
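Because VAD reduces to per-frame binary classification, a small network trained with binary cross-entropy suffices in principle. Below is a minimal PyTorch sketch under assumed settings (40-dimensional per-frame features, a plain MLP); the architecture and sizes are illustrative, not the model this post ultimately trains:

import torch
import torch.nn as nn

class VadNet(nn.Module):
    """Per-frame binary classifier: feature vector in, speech logit out."""
    def __init__(self, feat_dim=40, hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(feat_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),  # a single logit: speech vs. non-speech
        )

    def forward(self, x):
        return self.net(x).squeeze(-1)

model = VadNet()
criterion = nn.BCEWithLogitsLoss()  # binary cross-entropy on the raw logit
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

feats = torch.randn(32, 40)                   # dummy batch of 32 frames
labels = torch.randint(0, 2, (32,)).float()   # 1 = speech, 0 = noise/silence
optimizer.zero_grad()
loss = criterion(model(feats), labels)
loss.backward()
optimizer.step()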
(1) Creating data labels
Hand-labeling real audio directly is too labor-intensive, so here we record clean data (audio containing only speech and silence, with no noise) and create labels from it; for clean data, speech can be detected directly from frame energy. The labeling code is as follows:
audio_tools.py
import math
import numpy as np
from scipy.io import wavfile


def add_wgn(s, var=1e-4):
    """
    Add white Gaussian noise to the signal.
    If no variance is given, simply add jitter.
    Jitter helps eliminate all-zero values.
    """
    np.random.seed(0)
    noise = np.random.normal(0, var, len(s))
    return s + noise


def read_wav(filename):
    """
    Read a wav file.
    Normalizes the signal to values between -1 and 1.
    Also adds some jitter to remove all-zero segments.
    """
    fs, s = wavfile.read(filename)  # scipy reads ints
    s = np.array(s) / float(max(abs(s)))
    s = add_wgn(s)  # add jitter for numerical stability
    return fs, s

# ==============================================================================

def enframe(x, win_len, hop_len):
    """
    Receives a 1D numpy array and divides it into frames.
    Outputs a numpy matrix with one frame per row.
    """
    x = np.squeeze(x)
    if x.ndim != 1:
        raise TypeError("enframe input must be a 1-dimensional array.")
    # np.int was removed in NumPy 1.20+; the builtin int behaves the same here
    n_frames = 1 + int(math.ceil((len(x) - win_len) / float(hop_len)))
    x_framed = np.zeros((n_frames, win_len))
    # zero-pad the tail so the last frame is full length
    padlen = int((n_frames - 1) * hop_len + win_len)
    zeros = np.zeros((padlen - len(x),))
    padsignal = np.concatenate((x, zeros))
    for i in range(n_frames):
        x_framed[i] = padsignal[i * hop_len : i * hop_len + win_len]
    return x_framed


def deframe(x_framed, win_len, hop_len):
    """
    Interpolates 1D data with framed alignments into per-sample values.
    This function helps as a visual aid and can also be used to change
    the frame rate of features, e.g. energy, zero-crossing, etc.
    """
    n_frames = len(x_framed)
    n_samples = n_frames * hop_len + win_len
    x_samples = np.zeros((n_samples, 1))
    for i in range(n_frames):
        x_samples[i * hop_len : i * hop_len + win_len] = x_framed[i]
    return x_samples


if __name__ == '__main__':
    pass
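As a quick sanity check, enframe and deframe can be exercised like this (a hypothetical usage example, not part of the original script):

import numpy as np
from audio_tools import enframe, deframe

fs = 16000
x = np.arange(fs, dtype=float)                          # one second of dummy samples
frames = enframe(x, int(fs * 0.025), int(fs * 0.010))   # 25 ms window, 10 ms hop
print(frames.shape)                                     # (n_frames, 400)
per_frame = frames.mean(axis=1, keepdims=True)          # one value per frame
samples = deframe(per_frame, int(fs * 0.025), int(fs * 0.010))
print(samples.shape)                                    # back to per-sample resolution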
unsupervised_vad.py
#! /usr/bin/python
# Voice Activity Detection (VAD) tool.
# Use the vad_help() function for instructions.
# Navid Shokouhi, December 2012.
# Updated: May 2017 for speaker-recognition collaboration.
from audio_tools import *
import numpy as np
import os

## Function definitions:
def vad_help():
    """Voice Activity Detection (VAD) tool.
    Navid Shokouhi May 2017.
    """
    print("Usage:")
    print("python unsupervised_vad.py")
#### Display tools
def plot_this(s, title=''):
    """Plot a 1D signal as a line or a 2D array as an image."""
    import pylab
    s = s.squeeze()
    if s.ndim == 1:
        pylab.plot(s)
    else:
        pylab.imshow(s, aspect='auto')
    pylab.title(title)
    pylab.show()


def plot_these(s1, s2):
    import pylab
    try:
        # If values are numpy arrays, normalize before plotting
        pylab.plot(s1 / max(abs(s1)), color='red', label='s1')
        pylab.plot(s2 / max(abs(s2)), color='blue', label='s2')
    except Exception:
        # Values are lists
        pylab.plot(s1, color='red', label='s1')
        pylab.plot(s2, color='blue', label='s2')
    pylab.legend()
    pylab.show()


def plot_these1(s1, s2):
    import matplotlib.pyplot as plt
    plt.figure(figsize=(16, 9))
    try:
        # If values are numpy arrays, normalize before plotting
        plt.plot(s1 / max(abs(s1)), color='red', label='vad')
        plt.plot(s2 / max(abs(s2)), color='blue', label='signal')
    except Exception:
        # Values are lists
        plt.plot(s1, color='red', label='vad')
        plt.plot(s2, color='blue', label='signal')
    plt.legend()
    plt.show()
#### Energy tools
def zero_mean(xframes):
    """
    Remove the mean of each frame in a framed signal;
    return zero-mean frames.
    """
    m = np.mean(xframes, axis=1)
    xframes = xframes - np.tile(m, (xframes.shape[1], 1)).T
    return xframes


def compute_nrg(xframes):
    # Per-frame energy, normalized by the frame length (samples per frame)
    frame_len = xframes.shape[1]
    return np.diagonal(np.dot(xframes, xframes.T)) / float(frame_len)


def compute_log_nrg(xframes):
    # Per-frame log energy, mean/variance normalized
    frame_len = xframes.shape[1]
    raw_nrgs = np.log(compute_nrg(xframes + 1e-5)) / float(frame_len)
    return (raw_nrgs - np.mean(raw_nrgs)) / np.sqrt(np.var(raw_nrgs))


def power_spectrum(xframes):
    """
    xframes: input signal, one frame per row
    """
    X = np.fft.fft(xframes, axis=1)
    # integer division: float slice indices are invalid in Python 3
    X = np.abs(X[:, :X.shape[1] // 2])**2
    return np.sqrt(X)
def nrg_vad(xframes, percent_thr, nrg_thr=0., context=5):
    """
    Picks frames with high energy as determined by a
    user-defined threshold.
    This function also uses a 'context' parameter to
    resolve the fluctuating nature of thresholding.
    context is an integer value determining the number
    of neighboring frames that should be used to decide
    if a frame is voiced.
    The log-energy values are subject to mean and variance
    normalization to simplify picking the right threshold.
    In this framework, the default threshold is 0.0.
    """
    xframes = zero_mean(xframes)
    n_frames = xframes.shape[0]
    # Compute per-frame energies:
    xnrgs = compute_log_nrg(xframes)
    xvad = np.zeros((n_frames, 1))
    for i in range(n_frames):
        start = max(i - context, 0)
        end = min(i + context, n_frames - 1)
        # end is inclusive, so slice to end + 1 to match n_total
        n_above_thr = np.sum(xnrgs[start:end + 1] > nrg_thr)
        n_total = end - start + 1
        xvad[i] = 1. * ((float(n_above_thr) / n_total) > percent_thr)
    return xvad
def read_audio_file1(path, fmt, flag=0):
    """Recursively collect files under path with the given extension."""
    files = []
    names = []
    for root, dirs, filenames in os.walk(path):
        for filename in filenames:
            if filename.endswith(fmt):
                file_path = root + '/' + filename
                files.append(file_path)
                filename = filename.split('.')[0]
                if flag == 1:
                    # Build a unique name from the last three path components
                    name = file_path.split('.')[0]
                    name = name.split('/')
                    filename = name[-3] + '_' + name[-2] + '_' + name[-1]
                names.append(filename)
    return files, names
def max_filter(vads):
    """
    Smooth raw frame decisions with a sliding majority vote:
    a frame is voiced if more than half of the frames in a
    20-frame window around it are voiced.
    Returns the smoothed decisions both as a float array (for
    plotting) and as a plain int list (for writing labels).
    """
    hist = 0
    win_len = 20
    half_len = int(win_len / 2)
    vad_len = len(vads)
    new_vads = []
    wri_vads = []
    for i, vad in enumerate(vads):
        # Keep the leading frames unchanged
        if i < win_len:
            new_vads.append(float(vad))
            wri_vads.append(int(vad))
            continue
        if i < vad_len - half_len:
            # Majority vote over the window centered at i
            for j in range(i - half_len, i + half_len):
                hist = hist + vads[j]
            if hist > half_len:
                new_vads.append(1.)
                wri_vads.append(1)
            else:
                new_vads.append(0.)
                wri_vads.append(0)
        else:
            # Keep the trailing frames unchanged
            new_vads.append(float(vad))
            wri_vads.append(int(vad))
        hist = 0
    new_vads = np.array(new_vads)
    new_vads = new_vads.reshape(len(new_vads), 1)
    return new_vads, wri_vads
def get_start_end_pts(new_vad):
    """
    Convert per-frame 0/1 decisions into lists of segment
    start and end frame indices.
    """
    starts = []
    ends = []
    flags = 1
    for i, vad in enumerate(new_vad):
        # Speech starting at the very first frame
        if int(new_vad[0]) == 1 and flags == 1:
            starts.append(0)
            flags = 0
        if int(new_vad[i]) == 0 and int(new_vad[i + 1]) == 1:
            starts.append(i)      # 0 -> 1 transition
        elif int(new_vad[i]) == 1 and int(new_vad[i + 1]) == 0:
            ends.append(i)        # 1 -> 0 transition
        if i == len(new_vad) - 2:
            break
    # Speech running through the last frame
    if int(new_vad[-2]) == 1 and int(new_vad[-1]) == 1:
        ends.append(len(new_vad) - 1)
    return starts, ends
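# Example (hypothetical input, not from the post): for
# new_vad = [0, 0, 1, 1, 1, 0, 0, 1, 1], get_start_end_pts returns
# starts = [1, 6] and ends = [4, 8]: each start is the frame just
# before a 0 -> 1 transition, each end the last voiced frame.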
def write_startEnd_info(filename, starts, ends):
    assert len(starts) == len(ends)
    with open(filename, 'w') as f:
        for start, end in zip(starts, ends):
            if start > end:
                print('==========start end info err========')
            f.write(str(start) + ' ' + str(end) + '\n')


def write_frame_labels(fp, vads, name):
    # One line per utterance: "<name>:<space-separated 0/1 labels>"
    fp.write(name + ':')
    for v in vads:
        fp.write(str(v) + ' ')
    fp.write('\n')
    return fp
if __name__ == '__main__':
    fs = 16000
    win_len = int(fs * 0.025)   # 25 ms frames
    hop_len = int(fs * 0.010)   # 10 ms hop
    files, names = read_audio_file1('../data/clean', '.wav')
    label_p = '../data/speech_labels.txt'
    fp = open(label_p, 'w')
    cnt = 0
    ratio = 0
    acc_sum = 0
    pos_sum = 0
    for file, name in zip(files, names):
        fs, s = read_wav(file)
        print('cnt = ', cnt)
        cnt = cnt + 1
        sframes = enframe(s, win_len, hop_len)  # rows: frame index, cols: samples in each frame
        # percent_high_nrg is the VAD context ratio. It helps smooth the
        # output VAD decisions. Higher values are stricter.
        percent_high_nrg = 0.05
        vad = nrg_vad(sframes, percent_high_nrg)
        new_vad, wri_vad = max_filter(vad)
        plot_these1(deframe(new_vad, win_len, hop_len), s)
        acc_sum = acc_sum + len(wri_vad)
        pos_sum = pos_sum + np.sum(wri_vad)
        # Track the fraction of frames labeled as speech
        if cnt % 100 == 0:
            ratio = pos_sum / acc_sum
            print('cnt = %d, acc_sum = %d, pos_sum = %d, ratio = %f' % (cnt, acc_sum, pos_sum, ratio))
        fp = write_frame_labels(fp, wri_vad, name)
    fp.close()
    print('ratio = ', ratio)
Running the script on a single audio clip produces a plot of the smoothed VAD decision (red) overlaid on the normalized waveform (blue):
As the figure shows, the labels produced by this method are quite accurate.
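For the later training stages, the resulting speech_labels.txt (one "name:0 1 1 ..." line per utterance, as written by write_frame_labels above) can be read back with a small parser; this load_frame_labels helper is a hypothetical sketch, not part of the original code:

import numpy as np

def load_frame_labels(path='../data/speech_labels.txt'):
    # Each line has the form "<utterance_name>:<space-separated 0/1 labels> "
    labels = {}
    with open(path) as f:
        for line in f:
            name, vals = line.rstrip('\n').split(':', 1)
            labels[name] = np.array([int(v) for v in vals.split()])
    return labels

labels = load_frame_labels()
for name, frame_labels in labels.items():
    print(name, frame_labels.shape, frame_labels[:10])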