In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
from numpy.random import random, normal
import pandas as pd
from scipy.io import wavfile as wv
from scipy import stats

blue, green, red, purple = sns.color_palette('deep', n_colors=4)

Data analysis & visualization

An important part of data analysis is data visualization. We've already been using matplotlib to do this some, and this week we'll dig deeper into this package to understand a bit more about what it can do and how we can use it to make figures look exactly how we want them to look. The matplotlib documentation provides a huge amount of information, and this blog post discusses a useful way to go about using matplotlib (a fair amount of which we'll discuss here).

As we've seen before, there are two basic objects we will often deal with when generating a figure, the Figure object and Axes objects.

We'll start by making a moderately complicated signal to plot:

In [2]:
np.random.seed(seed=45232)
fs = 1000
t = np.arange(start=0, stop=1, step=1/fs)
y1 = .5*np.cos(2*np.pi*15*t + random()*2*np.pi)
y2 = 1.3*np.cos(2*np.pi*7*t + random()*2*np.pi)
y3 = 2*np.cos(2*np.pi*2*t + random()*2*np.pi)
s = y1 + y2 + y3 #+ n

We can create a figure and axis object with the subplots() method in pyplot (here plt). Then, we can plot our signal (defined above) using the plot() axis method, and we can adjust the limits on the axes by using the set_ylim() and set_xlim() axis methods.

In [3]:
fig, ax = plt.subplots()
ax.plot(t,s,'r')
ax.set_ylim(-10,10)
ax.set_xlim(-.5,1.5);

We can change the size of the figure when we create the figure object. We can also specify where the tick marks are on each axis and what the labels for the tick marks are, using the set_xticks(), set_xticklabels(), set_yticks(), and set_yticklabels() methods.

In [4]:
fig, ax = plt.subplots(1, 1, figsize=(8,4))
ax.plot(t,s,'r',lw=3)
ax.set_ylim(-4,4)
ax.set_xlim(0,1)
ax.set_xticks([0,t[np.argmax(s)],t[np.argmin(s)],1])
ax.set_xticklabels(['$t_0$','$t_{max}$','$t_{min}$','$t_n$'], fontsize=18)
ax.set_yticks(np.linspace(-4,4,5))
ax.set_yticklabels(['$-A$','$-A/2$','$0$','$+A/2$','$+A$'], fontsize=18);

We can easily write our own functions to make changes to figures if we don't like the default way that matplotlib does it. For example, here's a function for automatically adjusting axis limits based on the range of the data on each dimension and a specified 'padding' proportion on each dimension:

In [5]:
def adjust_limits(axis, xpad=.1, ypad=.1, adj_x=True, adj_y=True):
    if adj_x:
        xx = (axis.dataLim.x0,axis.dataLim.x1)
        x_range = xx[1]-xx[0]
        xlims = (xx[0]-xpad*x_range,xx[1]+xpad*x_range)
        axis.set_xlim(xlims)
    else:
        xlims = None
    if adj_y:
        yy = (axis.dataLim.y0,axis.dataLim.y1)
        y_range = yy[1]-yy[0]
        ylims = (yy[0]-ypad*y_range,yy[1]+ypad*y_range)
        axis.set_ylim(ylims)
    else:
        ylims = None
    return xlims, ylims

Matplotlib allows you to annotate figures pretty easily:

In [6]:
fig, ax = plt.subplots(1, 1, figsize=(8,4))
ax.plot(t,np.zeros(len(t)),':',color=[.25]*4)
ax.plot(t,s,'r',lw=3)
xlims, ylims = adjust_limits(ax, xpad=0.01, ypad=.25)
ax.set_xticks(np.arange(0,1,.25))
ax.set_yticks(np.linspace(np.ceil(ylims[0]),np.floor(ylims[1]),5))
ax.set_xticks(np.linspace(0,1,5))
arrow_props = {'shrink':.05, 'width':2, 'headlength':10, 'headwidth':8, 'color':[.1,.1,.8,.75]}
max_xy = np.array([t[np.argmax(s)],np.max(s)])
#ax.annotate(s='$s_{max}$',xy=max_xy, xytext=max_xy+[.1,.5], fontsize=16, arrowprops=arrow_props)
ax.annotate(s=str(max_xy.round(2)),xy=max_xy, xytext=max_xy+[.1,.5], fontsize=14, arrowprops=arrow_props)
min_xy = np.array([t[np.argmin(s)],np.min(s)])
ax.annotate(s='$s_{min}$',xy=min_xy, xytext=min_xy+[-.15,-.25], fontsize=16, arrowprops=arrow_props);
print(max_xy)
print(min_xy)
[ 0.331       3.65698523]
[ 0.558      -3.41729601]

If we have multivariate data, we can easily make a matrix of scatter plots to visualize the relationships between each pair of variables (and the distribution of each variable on its own). Here, we'll generate some random data with different magnitudes of statistical relationships between each pair:

In [7]:
np.random.seed(seed=3773)
n_obs = 500
var_1 = np.random.normal(loc=0,scale=1,size=n_obs)
var_2 = .75*var_1 + np.random.normal(loc=3,scale=2,size=n_obs)
var_3 = -.35*var_2 + np.random.normal(loc=-.5,scale=.5,size=n_obs)
var_4 = .5*var_3 + .5*var_1 + np.random.normal(loc=0,scale=.65,size=n_obs)
rvars = np.stack([var_1,var_2,var_3,var_4],axis=1)

nv = 4
fig, axes = plt.subplots(nrows=nv, ncols=nv, figsize=(8,8))
for i in range(nv):
    for j in range(nv):
        clr = [(i+j)/(2*nv),1-(i+j)/(2*nv),(i+j)/(2*nv),.5]
        axt = axes[i,j]
        if i==j:
            axt.hist(rvars[:,i],bins=20,histtype='step', color=clr[:3], lw=2)
            adjust_limits(axt, adj_y=False)
        else:
            axt.plot(rvars[:,i],rvars[:,j],'o',color=clr)
            adjust_limits(axt)
        if j != 0:
            axt.set_yticks([])
        if i != nv-1:
            axt.set_xticks([])

Matplotlib has a large number of built-in specialty plots. For example, you can make box plots and violin plots using the boxplot() and violinplot() functions:

In [8]:
fig, axes = plt.subplots(1,2,figsize=(16,6))
ax1, ax2 = axes
ax1.boxplot(rvars, notch=True, sym='o');
ax2.violinplot(rvars, showmedians=True); ax2.set_xticks(np.arange(1,5));

If we want multiple subplots in a single figure, we are not limited to matrices of regularly-sized subplots:

In [9]:
fig = plt.figure(figsize=(10,10))
ax1 = plt.subplot2grid((4,4), (0,0), colspan=4)
ax2 = plt.subplot2grid((4,4), (1,0), colspan=3)
ax3 = plt.subplot2grid((4,4), (1,3), rowspan=3)
ax4 = plt.subplot2grid((4,4), (2,0))
ax5 = plt.subplot2grid((4,4), (3,0))
ax6 = plt.subplot2grid((4,4), (2,1), colspan=2, rowspan=2)

ax1.plot(t,s, lw=3)
ax2.plot(t,np.cos(s),'r--', lw=2)
ax3.plot(np.sin(2*np.pi*3*t)**5,t,'g:')
ax4.plot(t,np.sin(s),'m', lw=.5)
ax5.plot(t,s[::-1],'c')
ax6.plot(t,s**3,'k', lw=5);

As we've seen before, matplotlib makes it easy to plot a spectrogram and change the colormap. Here we'll read in a sound file, create a bandpass filter, and illustrate the filter:

In [10]:
from scipy.signal import iirfilter, lfilter, freqz

fs,xx = wv.read('../w08/sentences/f0s2.wav')

nyq = fs/2
cut_freqs = np.array([1350,2350]) # Hz
cf_rad = np.array(cut_freqs/nyq)

Nord = 5
npts = 1024

b, a = iirfilter(N=Nord, Wn=cf_rad, btype='bandpass', ftype='cheby1', rp=2.5, rs=15)
w, fz = freqz(b, a, worN=npts)

fig, axes = plt.subplots(1,2,figsize=(16,4))
ax1, ax2 = axes
xl, xh = np.int(np.floor(.8*cf_rad[0]*npts)), np.int(np.floor(1.2*cf_rad[1]*npts))
ax1.plot(w[xl:xh], 20*np.log10(np.abs(fz[xl:xh])), 'm-', lw=2)
xticks = cf_rad*np.pi; adjust_limits(ax1, xpad=0)
ax1.set_xticks(xticks); ax1.set_xticklabels([np.int(nyq*i/np.pi) for i in xticks])
ax1.grid(axis='x'); ax1.set_ylabel('Amplitude (dB)'); ax1.set_xlabel('Freq (Hz)')
ax2.plot(w, 20*np.log10(np.abs(fz)), 'm-', lw=2)
ax2.set_xticks(xticks); ax2.set_xticklabels([np.int(nyq*i/np.pi) for i in xticks])
ax2.grid(axis='x'); ax2.set_xlabel('Freq (Hz)'); adjust_limits(ax2, xpad=.01, ypad=.05);

Next, we'll filter the sound and plot spectrograms of the unfiltered and filtered sounds:

In [11]:
x_bp = lfilter(b, a, xx)

x_arr = np.stack([xx,x_bp],axis=1)

fig, axes = plt.subplots(1,2,figsize=(15,5), sharex=True, sharey=True)

nfft = 2**10
nolp = 2**5
print(nfft,nolp)

#color_maps = pl.colormaps()
#print(color_maps)
plt.set_cmap('terrain')
for i,xt in enumerate(x_arr.T):
    this_ax = axes[i]
    spt = this_ax.specgram(x=xt,NFFT=nfft,Fs=fs,noverlap=nolp, scale='dB')
    this_ax.set_ylim(0,6500)
1024 32

With some help from numpy, we can easily make contour/mesh/etc plots, too. First, we'll use the numpy function meshgrid() to make 2D arrays of x and y coordinates, then we'll create a 2D array of a complicated function of x and y:

In [12]:
L = 3
d = .15
x = np.arange(-L, L+d/2, d)
y = np.arange(-L, L+d/2, d)
X, Y = np.meshgrid(x, y)
R = np.sqrt(X**2 + Y**2)
Z = np.sin(R)**2 - .8*np.cos(Y+np.pi/7)**2 + .3*np.sin(X+np.pi/3)**2

print(X)
print('')
print(Y)
[[-3.   -2.85 -2.7  ...,  2.7   2.85  3.  ]
 [-3.   -2.85 -2.7  ...,  2.7   2.85  3.  ]
 [-3.   -2.85 -2.7  ...,  2.7   2.85  3.  ]
 ..., 
 [-3.   -2.85 -2.7  ...,  2.7   2.85  3.  ]
 [-3.   -2.85 -2.7  ...,  2.7   2.85  3.  ]
 [-3.   -2.85 -2.7  ...,  2.7   2.85  3.  ]]

[[-3.   -3.   -3.   ..., -3.   -3.   -3.  ]
 [-2.85 -2.85 -2.85 ..., -2.85 -2.85 -2.85]
 [-2.7  -2.7  -2.7  ..., -2.7  -2.7  -2.7 ]
 ..., 
 [ 2.7   2.7   2.7  ...,  2.7   2.7   2.7 ]
 [ 2.85  2.85  2.85 ...,  2.85  2.85  2.85]
 [ 3.    3.    3.   ...,  3.    3.    3.  ]]

Here's a contour plot of the complicated function:

In [13]:
fig, ax = plt.subplots(1, 1, figsize=(8,8))
v = np.linspace(Z.min()-.05,Z.max()+.05,20)
plt.set_cmap('cubehelix_r')
CS = ax.contour(X,Y,Z,v)
ax.clabel(CS, inline=True, fontsize=8, inline_spacing=10, fmt='%1.2f');