In [1]:
%matplotlib inline
import numpy as np
import sounddevice as sd # for playing audio directly in the notebook
from scipy.io import wavfile as wv
from scipy.signal import iirdesign, lfilter, hilbert, butter, freqz
from glob import glob
import matplotlib.pyplot as pl
import seaborn as sb

sb.set(style='ticks', font_scale=1.5)

This is the final version of a notebook, created from scratch and coded live, then cleaned up, in which 100 sentences are noise-vocoded using two different frequency band specifications.

That is, each sentence is read in and filtered into N frequency bands. The amplitude envelope of the signal in each band is extracted, and a band-matched noise signal is multiplied by that envelope. The modulated noise signals are then summed, and the final signal is written to disk.

In the end, the code consists of a number of nested loops and function calls. To make sure everything at the most deeply nested levels works, we start with the innermost parts of the code, verify that we can process a single sentence, and then work outward from there to do the same thing with the full set of sentences.
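Before filling in the details, here is a compact sketch of the whole pipeline in one function, so the end goal is clear. This is a simplified stand-in (plain Butterworth band-pass filters, full-wave rectification, a made-up function name), not the code we end up with; the notebook below develops more careful versions of each step.

In [ ]:
def vocode_sketch(s, fs, band_edges, env_fc=40):
    '''Minimal noise vocoder sketch: filter, extract envelopes, modulate noise, sum'''
    nyq = fs//2
    out = np.zeros(len(s))
    for lo, hi in zip(band_edges[:-1], band_edges[1:]):
        bb, ba = butter(4, [lo/nyq, hi/nyq], btype='band') # band-pass for this channel
        band = lfilter(bb, ba, s) # speech in this band
        eb, ea = butter(5, env_fc/nyq) # low-pass filter for envelope smoothing
        env = lfilter(eb, ea, np.abs(band)) # rectify and smooth to get the envelope
        noise = lfilter(bb, ba, np.random.normal(size=len(s))) # band-matched noise
        out += env*noise # amplitude-modulate the noise and accumulate
    return out/np.max(np.abs(out)) # normalize the summed signal to +/- 1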

Since we know that we want amplitude envelopes, we need a function to get them. Here is one such function:

In [2]:
def amp_env(snd, fs=44100, rect='full', fphz=40, nfc=5, plot=False, ax=None):
    '''Returns amplitude envelope (and normalized rectified signal) of input snd,
    which can be a wav filename or an array sampled at fs Hz'''
    if isinstance(snd, str):
        fs, w = wv.read(snd) # fs from the file overrides the default
    elif isinstance(snd, np.ndarray):
        w = snd
    if rect=='hilbert':
        wr = np.abs(hilbert(w)) # magnitude of the analytic signal
    elif rect=='half':
        wr = (w>=0)*w # half-wave rectification
    elif rect=='full':
        wr = np.abs(w) # full-wave rectification
    wrn = wr/np.max(np.abs(wr)) # normalized rectified signal, for plotting
    nyq = fs//2 # Nyquist frequency
    fprad = fphz/nyq # low-pass cutoff as a fraction of Nyquist
    b, a = butter(N=nfc, Wn=fprad) # low-pass filter to smooth the rectified signal
    env = lfilter(b, a, wr)
    env = env/np.max(np.abs(env)) # normalize the envelope
    if plot or ax is not None:
        if ax is None:
            fig, ax = pl.subplots(1,1, figsize=(10,5))
        ax.plot(wrn, color='green')
        ax.plot(env, linewidth=2, color='red')
    return env, wrn

Let's see what that function does with various input parameter values:

In [3]:
file_list = glob('sentences/*wav') # read in file names for wav files in folder sentences
fn_t = file_list[0] # pick out the first one for testing various functions and code, _t indicates "temporary"

fs, s = wv.read(fn_t)
nyq = fs//2 # Nyquist frequency

pl.figure(figsize=(20,5))
pl.plot(s);

Here is what different specifications of the rectification argument do:

In [4]:
fig, axes = pl.subplots(3,1,figsize=(20,15))
e_t, w_t = amp_env(snd=s, ax=axes[0]); axes[0].text(60000,.8,'full wave rectification',fontsize=24)
e_t, w_t = amp_env(snd=s, rect='half',ax=axes[1]); axes[1].text(60000,.8,'half wave rectification',fontsize=24)
e_t, w_t = amp_env(snd=s, rect='hilbert',ax=axes[2]); axes[2].text(60000,.8,'Hilbert transform',fontsize=24);

And here are some different options for the cutoff frequency for the low-pass filter used to smooth the rectified waveform to get an estimate of the amplitude envelope:

In [5]:
fig, axes = pl.subplots(3,1,figsize=(20,15))
e_t, w_t = amp_env(snd=s, fphz=10, ax=axes[0]); axes[0].text(60000,.8,'10 Hz low pass filter',fontsize=24);
e_t, w_t = amp_env(snd=s, fphz=40, ax=axes[1]); axes[1].text(60000,.8,'40 Hz low pass filter',fontsize=24);
e_t, w_t = amp_env(snd=s, fphz=80, ax=axes[2]); axes[2].text(60000,.8,'80 Hz low pass filter',fontsize=24);

Based on these visualizations, we use half-wave rectification (it seems to track the signal slightly better than full-wave rectification or the Hilbert envelope here), and the default values for the other arguments to amp_env().
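If we want a rough quantitative check to go with the eyeballing, we can compare the three envelope estimates directly. This is only a sanity check (the estimates are typically highly correlated, differing mainly in fine detail), not a formal evaluation:

In [ ]:
e_full, _ = amp_env(snd=s, rect='full')
e_half, _ = amp_env(snd=s, rect='half')
e_hilb, _ = amp_env(snd=s, rect='hilbert')

# pairwise correlations between the three envelope estimates
for name, (e1, e2) in [('full vs half', (e_full, e_half)),
                       ('full vs hilbert', (e_full, e_hilb)),
                       ('half vs hilbert', (e_half, e_hilb))]:
    print(name, np.corrcoef(e1, e2)[0,1])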

The next things we need to figure out are (1) how many channels we want to use, and (2) how these channels are spaced in the frequency domain.

One possible way to break up the frequency range into chunks would be to break up the range in equal-sized Hz-based intervals. Another would be to use arbitrary spacing, possibly to approximate the widths of cochlear filters or to probe some specific hypothesis about particular frequency bands (since we are making stimuli for a perceptual experiment in this example).
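For example, logarithmically spaced edges, which are closer to cochlear spacing than linear Hz spacing, are a one-liner with np.geomspace. This is just to illustrate a third option; we don't use it below:

In [ ]:
log_bands = np.geomspace(50, 12000, num=9) # 9 edges -> 8 log-spaced bands
print(np.round(log_bands))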

In order to work out any problems breaking up the frequency range, we'll write some code to test out various approaches:

In [6]:
n_channels = 8
lower = 50 # lowest frequency to bother with, based on human hearing limits
upper = 12000 # upper limit well below nyq
hz_bands = np.linspace(start=lower, stop=upper, num=n_channels+1, endpoint=True) # equally-spaced bands
ir_bands = np.array([lower,500,1500,3000,6000,upper]) # irregular, hard-coded bands
n_ir_b = len(ir_bands)-1

print(hz_bands)
print(ir_bands)

gpass = 1 # maximum loss in each pass band (dB): keep the signal essentially intact
gstop = 20 # minimum attenuation in each stop band (dB): suppress out-of-band energy

coef_dict = {} # for storing filter coefficients

# loop through equally spaced bands and get/store corresponding filter coefficients
for f_idx in range(n_channels):
    low_edge = hz_bands[f_idx]
    upp_edge = hz_bands[f_idx+1]
    if f_idx == 0: # make a low-pass filter
        wpass = upp_edge/nyq
        wstop = 1.1*wpass
    else: # make a band-pass filter
        wpass = np.array([low_edge, upp_edge])/nyq
        wstop = np.array([.9,1.1])*wpass
    b_t, a_t = iirdesign(wp=wpass, ws=wstop, gpass=gpass, gstop=gstop, ftype='cheby2')
    coef_dict['hz_' + str(f_idx) + '_b'] = b_t
    coef_dict['hz_' + str(f_idx) + '_a'] = a_t

# loop through irregular bands and get/store corresponding filter coefficients
for f_idx in range(n_ir_b):
    low_edge = ir_bands[f_idx]
    upp_edge = ir_bands[f_idx+1]
    if f_idx == 0: # make a low-pass filter
        wpass = upp_edge/nyq
        wstop = 1.1*wpass
    else: # make a band-pass filter
        wpass = np.array([low_edge, upp_edge])/nyq
        wstop = np.array([.9,1.1])*wpass
    b_t, a_t = iirdesign(wp=wpass, ws=wstop, gpass=gpass, gstop=gstop, ftype='cheby2')
    coef_dict['ir_' + str(f_idx) + '_b'] = b_t
    coef_dict['ir_' + str(f_idx) + '_a'] = a_t
[    50.     1543.75   3037.5    4531.25   6025.     7518.75   9012.5
  10506.25  12000.  ]
[   50   500  1500  3000  6000 12000]

Visualize the equally-spaced bands:

In [7]:
fig, ax = pl.subplots(1, 1, figsize=(15,5))
ll = []
for i in range(n_channels):
    b_t, a_t = coef_dict['hz_' + str(i) + '_b'], coef_dict['hz_' + str(i) + '_a']
    om_t, H_t = freqz(b_t, a_t)
    lt, = ax.plot(om_t,np.abs(H_t))
    ll.append(lt)
ax.legend(ll,np.arange(n_channels));

Visualize the irregular bands:

In [8]:
fig, ax = pl.subplots(1, 1, figsize=(15,5))
ll = []
for i in range(n_ir_b):
    b_t, a_t = coef_dict['ir_' + str(i) + '_b'], coef_dict['ir_' + str(i) + '_a']
    om_t, H_t = freqz(b_t, a_t)
    lt, = ax.plot(om_t,np.abs(H_t))
    ll.append(lt)
ax.legend(ll,np.arange(n_ir_b));

Now we can test the filters we created above, get and store the maximum amplitude in each filtered band of speech (for scaling the envelopes later), and see what filtered bands of our sample sentence look like. We'll also save the filtered bands as .wav files so we can listen to them and make sure they sound (more or less) like we expect them to.

In [9]:
hz_comps = np.zeros((n_channels, len(s))) # initialize container for filtered signal
hz_maxv = np.zeros(n_channels) # container for values to scale the amplitude within each channel

for i in range(n_channels):
    b_t, a_t = coef_dict['hz_' + str(i) + '_b'], coef_dict['hz_' + str(i) + '_a'] # get filter coefficients
    s_t = lfilter(b=b_t, a=a_t, x=s) # filter the signal
    hz_comps[i, :] = s_t # store the filtered signal
    hz_maxv[i] = np.max(np.abs(s_t)) # get the maximum amplitude deviation for filtered signal
    
fig, axes = pl.subplots(n_channels, 1, figsize=(10,2*n_channels))
for i in range(n_channels):
    s_t = hz_comps[i,:]/hz_maxv[i] # normalize this band to +/- 1 for plotting and writing
    axes[i].plot(s_t)
    axes[i].set(ylim=(-1,1))
    wv.write('temp_hz_'+str(i)+'.wav', fs, s_t.astype(np.float32)) # write normalized band as float32 wav
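Since we imported sounddevice at the top, we can also audition a band directly in the notebook instead of opening the .wav files elsewhere. For example, to play the last band written in the loop above:

In [ ]:
sd.play(s_t.astype(np.float32), fs) # play the normalized band on the default output
sd.wait() # block until playback finishes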

Now we can loop through the bands, generate noise and filter it into the same bands, multiply each filtered noise signal by the corresponding amplitude envelope (scaled by that band's maximum amplitude), and store the result, visualizing the first pass through the loop so we can see what's going on:

In [10]:
hz_envs = np.zeros(hz_comps.shape) # container for amplitude envelopes
hz_ncmp = np.zeros(hz_comps.shape) # container for modulated noise components

fig, axes = pl.subplots(3, 1, figsize=(20,12))
ax1, ax2, ax3 = axes

for i in range(n_channels):
    noise = np.random.normal(size=len(s)) # generate sample of noise
    noise = noise/np.max(np.abs(noise)) # normalize it
    b_t, a_t = coef_dict['hz_' + str(i) + '_b'], coef_dict['hz_' + str(i) + '_a'] # get filter coefficients
    n_t = lfilter(b=b_t, a=a_t, x=noise) # filter the noise
    e_t, w_t = amp_env(hz_comps[i,:], rect='half') # get amplitude envelope of speech component in channel i
    hz_envs[i,:] = e_t # store the envelope (not strictly necessary, useful for plotting)
    hz_ncmp[i,:] = hz_maxv[i]*e_t*n_t # scale the envelope to the band's original max amplitude and modulate the noise
    if i == 0:
        ax1.plot(n_t) # plot filtered noise band for channel 0
        ax1.plot(np.max(np.abs(hz_ncmp[i,:]))*e_t, lw=5) # plot envelope
        ax2.plot(hz_ncmp[i,:]) # plot modulated noise band for channel 0
        ax3.plot(hz_comps[i,:]) # plot original speech band for channel 0
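All that remains for this test sentence is to sum the modulated noise bands into a single signal and write it out, which completes the pipeline described at the top. A minimal version (normalizing to avoid clipping; the filename is a placeholder):

In [ ]:
voc = hz_ncmp.sum(axis=0) # sum the modulated noise bands
voc = voc/np.max(np.abs(voc)) # normalize to +/- 1 to avoid clipping
wv.write('temp_hz_vocoded.wav', fs, voc.astype(np.float32)) # placeholder filename

pl.figure(figsize=(20,5))
pl.plot(voc);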