In [10]:
%pylab inline
Populating the interactive namespace from numpy and matplotlib

(Problem 1b2)

Compute samples, the inverse of K, and the Cholesky factorization of K

In [71]:
K = array([[1, 0.5],[0.5,1]])
Kinv = linalg.inv(K)
G = cholesky(K)

# Compute samples
n = 1000
y = randn(2,n) # generate uncorrelated zero-mean samples
z = G.dot(y) # generate correlated samples
z = z + ones((2,n)) # add mean

Gradient Descent

In [72]:
def calc_grad(w,z):
    what = mean(z,axis=1)
    return 2*Kinv.dot(w - what)

a = 1e-1
w = array([-1,0])
maxiters = 60
figure(figsize=(6,3))
for i in range(maxiters):
    w = w - a*calc_grad(w,z)
    subplot(1,2,1)
    plot(i,w[0],'k.')
    subplot(1,2,2)
    plot(i,w[1],'k.')
subplot(1,2,1)
xlabel('Iteration')
ylabel('w[0]')
grid()

subplot(1,2,2)
xlabel('Iteration')
ylabel('w[1]')
grid()
tight_layout()
wgd = w

(Problem 1b3) Stochastic Gradient Descent.

Instead of taking the mean over the entire dataset, we simply choose a single sample at random at each iteration.

In [73]:
def calc_s_grad(w,z):
    i = randint(n)
    what = z[:,i]
    return 2*Kinv.dot(w - what)

a = lambda i: (i+1)**(-1)
w = array([-1,0])
maxiters = 1000
figure(figsize=(6,3))
for i in range(maxiters):
    w = w - a(i)*calc_s_grad(w,z)
    subplot(1,2,1)
    plot(i,w[0],'k.')
    subplot(1,2,2)
    plot(i,w[1],'k.')
subplot(1,2,1)
xlabel('Iteration')
ylabel('w[0]')
grid()

subplot(1,2,2)
xlabel('Iteration')
ylabel('w[1]')
grid()
tight_layout()
wsgd = w

(Problem 1b4)

In [80]:
rstarGD = 0
for i in range(n):
    x = z[:,i] - wgd
    rstarGD += 1./n * x.T.dot(Kinv).dot(x)
rstarSGD = 0
for i in range(n):
    x = z[:,i] - wsgd
    rstarSGD += 1./n * x.T.dot(Kinv).dot(x)
print 'rstarGD is %.8f' % rstarGD
print 'rstarSGD is %.8f' % rstarSGD
rstarGD is 1.99292959
rstarSGD is 1.99912833

Total Computational Work:

  • GD: (60 iterations) x (1000 gradient-sample evaluations per iteration) = 60k operations
  • SGD: (1000 iterations) x (1 gradient-sample evaluation per iteration) = 1k operations

SGD achieved a similar risk with far fewer operations.

(Problem 1c)

In [84]:
def solve_GD(z):
    '''
    Solve the GD problem and return rstar
    '''
    def calc_grad(w,z):
        what = mean(z,axis=1)
        return 2*Kinv.dot(w - what)

    a = 1e-1
    w = array([-1,0])
    maxiters = 60
    for i in range(maxiters):
        w = w - a*calc_grad(w,z)
    wgd = w
    rstarGD = 0
    for i in range(n):
        x = z[:,i] - wgd
        rstarGD += 1./n * x.T.dot(Kinv).dot(x)
    return rstarGD

def solve_SGD(z):
    '''
    Solve the SGD problem and return rstar
    '''
    def calc_s_grad(w,z):
        i = randint(n)
        what = z[:,i]
        return 2*Kinv.dot(w - what)

    a = lambda i: (i+1)**(-1)
    w = array([-1,0])
    maxiters = 1000
    for i in range(maxiters):
        w = w - a(i)*calc_s_grad(w,z)
    wsgd = w

    rstarSGD = 0
    for i in range(n):
        x = z[:,i] - wsgd
        rstarSGD += 1./n * x.T.dot(Kinv).dot(x)
    return rstarSGD
In [86]:
# Run 100 MC simulations
m = 100
rstarGD = zeros(m)
rstarSGD = zeros(m)
for j in range(m):
    y = randn(2,n) # generate uncorrelated zero-mean samples
    z = G.dot(y) # generate correlated samples
    z = z + ones((2,n)) # add mean
    rstarGD[j] = solve_GD(z)
    rstarSGD[j] = solve_SGD(z)
In [92]:
# Calculate excess error
esgd = mean(rstarSGD)
egd = mean(rstarGD)
print 'We assume that we know rstar = 2. (In reality, we do not, unless we know p)'
print 'Excess Error GD = %.8f' % (egd - 2)
print 'Excess Error SGD = %.8f' % (esgd - 2)
print 'Learning Error from part 1b1: %.8f' % (2./n)
We assume that we know rstar = 2. (In reality, we do not, unless we know p)
Excess Error GD = 0.00540472
Excess Error SGD = 0.00930940
Learning Error from part 1b1: 0.00200000

The excess error from SGD is roughly twice as large as that of GD, and the GD excess error is in turn about 2.5 times the theoretical ideal learning error from part 1b1.
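For reference, here is a short side derivation (assuming the risk is the population average of the quantity computed as rstar in the code, with true mean $\mu = (1,1)^T$) of why $r^* = 2$ and why the learning error from part 1b1 is $2/n$:

$$ R(w) = \left\langle (z-w)^T K^{-1} (z-w) \right\rangle = 2 + (w-\mu)^T K^{-1} (w-\mu), \qquad r^* = R(\mu) = \mathrm{tr}\left(K^{-1}K\right) = 2 $$

$$ \left\langle R(\hat{w}) \right\rangle - r^* = \left\langle (\hat{w}-\mu)^T K^{-1} (\hat{w}-\mu) \right\rangle = \mathrm{tr}\left(K^{-1}\frac{K}{n}\right) = \frac{2}{n} $$

where $\hat{w}$ is the sample mean of the $n$ data points.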

(Problem 2a)

In [108]:
a = 0.1
n = 1000
hx = zeros(n)
for i in range(n):
    # Generate random sample from f(x)
    x = rand(10)
    # Calculate h(x)
    hx[i] = exp(-a*norm(x)**2)
Iest = mean(hx)
print 'I(%.1f) is estimated to be %.5f' % (a,Iest)

import scipy.stats
def Q(x):
    return 1 - scipy.stats.norm.cdf(x)
Itrue = (pi/a)**5 * (0.5 - Q(sqrt(2*a)))**10
print 'I(%.1f) is actually %.5f (using Q function)' % (a,Itrue) 
I(0.1) is estimated to be 0.71878
I(0.1) is actually 0.71970 (using Q function)
In [113]:
a = 10.
n = 1000
hx = zeros(n)
for i in range(n):
    # Generate random sample from f(x)
    x = rand(10)
    # Calculate h(x)
    hx[i] = exp(-a*norm(x)**2)
Iest = mean(hx)
print 'I(%.1f) is estimated to be %e' % (a,Iest)

import scipy.stats
def Q(x):
    return 1 - scipy.stats.norm.cdf(x)
Itrue = (pi/a)**5 * (0.5 - Q(sqrt(2*a)))**10
print 'I(%.1f) is actually %e (using Q function)' % (a,Itrue) 
I(10.0) is estimated to be 3.413898e-07
I(10.0) is actually 2.988242e-06 (using Q function)

The estimate for $a=0.1$ is much more accurate than for $a=10$. This is likely because few of the uniform samples are useful when $a=10$: the integrand is sharply peaked near $\|x\|=0$, while most samples have larger $\|x\|$. Importance sampling could solve this problem; alternatively, one could use a Gaussian pdf instead of the uniform as the sampling distribution (see the sketch below).
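As a rough sketch of the importance-sampling fix (not part of the original solution; the half-normal proposal with per-coordinate variance $1/(2a)$ is an illustrative choice), the ratio $h(x)f(x)/g(x)$ becomes constant inside the unit cube and zero outside:

import numpy as np

a = 10.0
n = 1000
d = 10
sigma = np.sqrt(1.0/(2*a))              # proposal std matched to exp(-a*||x||^2)
x = sigma*np.abs(np.random.randn(n, d)) # half-normal proposal on the positive orthant
inside = np.all(x <= 1.0, axis=1)       # indicator of the unit cube [0,1]^10
# Inside the cube, h(x)*f(x)/g(x) = (pi/a)^(d/2) / 2^d; outside, the weight is zero.
weights = (np.pi/a)**(d/2.0) / 2.0**d * inside
Iest_is = np.mean(weights)
print('Importance-sampling estimate of I(%.1f): %e' % (a, Iest_is))

With this proposal nearly every sample lands inside the cube, so the estimator variance is far smaller than with uniform sampling.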

(Problem 2b)

In [192]:
x = linspace(0,1)
f = exp(-10*x**2)*sqrt(x) # one-dimensional integrand f(x)h(x)
plot(x,f,'k-',label=r'$f(x)h(x)$')
g = exp(-10*x)
semilogy(x,f/g,'r--',label=r'$\frac{f(x)h(x)}{g(x)}$')
legend(loc='best')
xlabel('x')
Out[192]:
<matplotlib.text.Text at 0x7f9d08cac550>
In [189]:
n = 1000
lam = 10
fhg = zeros(n)
nwasted = 0
for i in range(n):
    x = random.exponential(1./lam, size=10)
    fhg[i] = exp(-10*norm(x)**2)*sum(sqrt(x))/(lam**10 * exp(-lam * sum(x)))
    if any(x > 1.0):
        fhg[i] = 0
        nwasted +=1
print '%i Wasted Samples' % nwasted
Iest = mean(fhg)
print 'Estimated integral value: %e' % Iest
2 Wasted Samples
Estimated integral value: 1.114372e-05

(Problem 3)

(Part a) $n=6$

We are interested in the statistics of the normalized estimation error:

$e_k = \sqrt{n} \left(\hat{\theta}(Y_k) - \theta\right)$

where $k= 1,...,K$ indexes the MC trial. But what should we use for $\theta$? In practice, it is not known. In the spirit of bootstrap, we will substitute $\hat{\theta}(Y)$, the estimate using the original data.

In [329]:
n = 6
K = 1000

# Generate data
y = randn(n)
orig_theta = mean(y**2)

err_samples = zeros(K)
for k in range(K):
    
    # Resample data
    idx = randint(n,size=n)
    yk = y[idx]
    
    # Compute variance estimate
    theta_hat = mean(yk**2)
    
    # Compute normalized error sample
    err_samples[k] = sqrt(n)*(theta_hat - orig_theta) 
    
# Draw cdf
e = linspace(-5,5,2000)
Fe = zeros(len(e))
for i in range(len(e)):
    Fe[i] = 1.0*sum(err_samples < e[i])/K
plot(e,Fe,'k-',linewidth=2)
ylim((-0.1,1.1))
grid()
title('Empirical CDF of error: n=%i, K=%i' % (n,K))
Out[329]:
<matplotlib.text.Text at 0x7f9cfb099bd0>

The true mean and variance of the error are calculated next. The mean is zero, since $\hat{\theta}$ is an unbiased estimator:

$$ \left\langle\hat{\theta}\right\rangle = \left\langle \frac{1}{n} \sum_i Y_i^2 \right\rangle = \left\langle Y^2 \right\rangle = \theta $$

The variance is more complicated:

$$ Var(e) = Var\left(\sqrt{n}\, \frac{1}{n} \sum_i Y_i^2 \right) = Var(Y^2) = \left\langle Y^4 \right\rangle - \left\langle Y^2 \right\rangle^2 $$

Using the Wikipedia page for higher moments of Gaussian variables:

$$ = 3 \sigma^4 - \sigma^4 = 2 \sigma^4 = 2 $$
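As a quick numerical sanity check of this value (a side calculation, not part of the original solution):

import numpy as np

# For Y ~ N(0,1): Var(Y^2) = E[Y^4] - (E[Y^2])^2 = 3 - 1 = 2
y = np.random.randn(10**6)
print('Monte Carlo estimate of Var(Y^2): %.3f (analytical value: 2)' % np.var(y**2))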
In [330]:
print 'Empirical Mean of Error:  %e' % mean(err_samples)
print 'Analytical Mean of Error: 0.0' 
print 'Empirical Var of Error:   %f' % mean((err_samples-mean(err_samples))**2)
print 'Analytical Var of Error:  %f' % (2)
Empirical Mean of Error:  6.626871e-02
Analytical Mean of Error: 0.0
Empirical Var of Error:   2.746079
Analytical Var of Error:  2.000000

The empirical values are close to the analytical values, although the empirical variance can be quite different when this code is rerun.

We now fit a zero-mean Gaussian to the empirical error distribution. The phrase "fit a Gaussian" could mean different things, but here we will define it as the Gaussian that maximizes a likelihood function:

$$\textrm{L}(\sigma^2) = \ln \prod_i \textit{N}(e_i; \sigma^2) \,\propto\, \sum_i (-\ln \sigma - \frac{e_i^2}{\sigma^2})$$
In [331]:
sigma_vec = linspace(0.1,5,10000) # Minimize by brute force search
L_vec = zeros(len(sigma_vec))
for i in range(len(sigma_vec)):
    L_vec[i] = sum(-log(sigma_vec[i]) - err_samples**2/sigma_vec[i]**2)
sigsqopt = sigma_vec[argmax(L_vec)]
print 'Optimum sigma^2 is approximately %.3f' % sigsqopt

# Plot it against empirical CDF
plot(e,Fe,'k-',label='Empirical')
Fe_fit = zeros(len(e))
for i in range(len(e)):
    Fe_fit[i] = 1-Q(e[i]/sqrt(sigsqopt))
plot(e,Fe_fit,'r--',label='Fit')
ylim((-0.1,1.1))
grid()
title('CDF of error: n=%i, K=%i' % (n,K))
legend(loc='best')
Optimum sigma^2 is approximately 2.345
Out[331]:
<matplotlib.legend.Legend at 0x7f9cfaf81d10>

The Gaussian fit is not good. This is likely because we used only $n=6$ data points.

We now repeat the analysis for Laplacian samples.

In [335]:
n = 6
K = 1000

# Generate data
y = random.laplace(size=n)
orig_theta = mean(y**2)

err_samples = zeros(K)
for k in range(K):
    
    # Resample data
    idx = randint(n,size=n)
    yk = y[idx]
    
    # Compute variance estimate
    theta_hat = mean(yk**2)
    
    # Compute normalized error sample
    err_samples[k] = sqrt(n)*(theta_hat - orig_theta) 
    
# Draw cdf
e = linspace(-5,5,2000)
Fe = zeros(len(e))
for i in range(len(e)):
    Fe[i] = 1.0*sum(err_samples < e[i])/K
plot(e,Fe,'k-',linewidth=2)
ylim((-0.1,1.1))
grid()
title('Empirical CDF of error: n=%i, K=%i' % (n,K))
Out[335]:
<matplotlib.text.Text at 0x7f9cfad99050>
In [336]:
print 'Empirical Mean of Error:  %e' % mean(err_samples)
print 'Empirical Var of Error:   %f' % mean((err_samples-mean(err_samples))**2)
Empirical Mean of Error:  3.656152e-02
Empirical Var of Error:   5.930745

The behavior with the Laplacian samples is worse. This makes sense because the Laplacian distribution is heavier-tailed, so the samples contain more outliers.
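For comparison (a side calculation, assuming the unit-scale Laplace distribution used by random.laplace, which has variance $2b^2 = 2$), the analytical variance of the normalized error is much larger than in the Gaussian case:

$$ Var(e) = Var(Y^2) = \left\langle Y^4 \right\rangle - \left\langle Y^2 \right\rangle^2 = 24 b^4 - \left(2 b^2\right)^2 = 20 b^4 = 20 $$

so larger and noisier bootstrap variance estimates are expected.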

(Part b) Repeat everything above, but using $n=100$

In [345]:
n = 100
K = 1000

# Generate data
y = randn(n)
orig_theta = mean(y**2)

err_samples = zeros(K)
for k in range(K):
    
    # Resample data
    idx = randint(n,size=n)
    yk = y[idx]
    
    # Compute variance estimate
    theta_hat = mean(yk**2)
    
    # Compute normalized error sample
    err_samples[k] = sqrt(n)*(theta_hat - orig_theta) 
    
# Draw cdf
e = linspace(-5,5,2000)
Fe = zeros(len(e))
for i in range(len(e)):
    Fe[i] = 1.0*sum(err_samples < e[i])/K
plot(e,Fe,'k-',linewidth=2)
ylim((-0.1,1.1))
grid()
title('Empirical CDF of error: n=%i, K=%i' % (n,K))
Out[345]:
<matplotlib.text.Text at 0x7f9cfa9b4910>
In [346]:
print 'Empirical Mean of Error:  %e' % mean(err_samples)
print 'Analytical Mean of Error: 0.0' 
print 'Empirical Var of Error:   %f' % mean((err_samples-mean(err_samples))**2)
print 'Analytical Var of Error:  %f' % (2)
Empirical Mean of Error:  -2.880882e-02
Analytical Mean of Error: 0.0
Empirical Var of Error:   3.343829
Analytical Var of Error:  2.000000

The empirical mean is again close to the analytical value of zero; the empirical variance is somewhat above the analytical value, which is within the run-to-run variability of the bootstrap.

We now fit a zero-mean Gaussian to the empirical error distribution. The phrase "fit a Gaussian" could mean different things, but here we will define it as the Gaussian that maximizes a likelihood function:

$$\textrm{L}(\sigma^2) = \ln \prod_i \textit{N}(e_i; \sigma^2) \,\propto\, \sum_i (-\ln \sigma - \frac{e_i^2}{\sigma^2})$$
In [347]:
sigma_vec = linspace(0.1,5,10000) # Minimize by brute force search
L_vec = zeros(len(sigma_vec))
for i in range(len(sigma_vec)):
    L_vec[i] = sum(-log(sigma_vec[i]) - err_samples**2/sigma_vec[i]**2)
sigsqopt = sigma_vec[argmax(L_vec)]
print 'Optimum sigma^2 is approximately %.3f' % sigsqopt

# Plot it against empirical CDF
plot(e,Fe,'k-',label='Empirical')
Fe_fit = zeros(len(e))
for i in range(len(e)):
    Fe_fit[i] = 1-Q(e[i]/sqrt(sigsqopt))
plot(e,Fe_fit,'r--',label='Fit')
ylim((-0.1,1.1))
grid()
title('CDF of error: n=%i, K=%i' % (n,K))
legend(loc='best')
Optimum sigma^2 is approximately 2.587
Out[347]:
<matplotlib.legend.Legend at 0x7f9cfa989990>

The Gaussian fit is better than in Part a, but still not perfect; the true error distribution appears to be heavier-tailed than a Gaussian.

We now repeat the analysis for Laplacian samples.

In [351]:
n = 100
K = 1000

# Generate data
y = random.laplace(size=n)
orig_theta = mean(y**2)

err_samples = zeros(K)
for k in range(K):
    
    # Resample data
    idx = randint(n,size=n)
    yk = y[idx]
    
    # Compute variance estimate
    theta_hat = mean(yk**2)
    
    # Compute normalized error sample
    err_samples[k] = sqrt(n)*(theta_hat - orig_theta) 
    
# Draw cdf
e = linspace(-5,5,2000)
Fe = zeros(len(e))
for i in range(len(e)):
    Fe[i] = 1.0*sum(err_samples < e[i])/K
plot(e,Fe,'k-',linewidth=2)
ylim((-0.1,1.1))
grid()
title('Empirical CDF of error: n=%i, K=%i' % (n,K))
Out[351]:
<matplotlib.text.Text at 0x7f9cfa5ee290>
In [352]:
print 'Empirical Mean of Error:  %e' % mean(err_samples)
print 'Empirical Var of Error:   %f' % mean((err_samples-mean(err_samples))**2)
Empirical Mean of Error:  -1.847888e-02
Empirical Var of Error:   8.596222

Once again, the behavior with the Laplacian samples is worse. This makes sense because Laplacian samples are heavier-tailed than Gaussians.

(Problem 4)

Prediction Step: $X_{t+1}^*(i) \sim \mathcal{N}\left(a X_t(i) + b \frac{X_t(i)}{1+X_t(i)^2} + c \cos(\omega t),\ 10\right), \quad 1 \le i \le n$

Update Step:

  • Look at measurement $y_{t+1}$
  • Evaluate importance weights $w_i = \frac{\mathcal{N}\left(y_{t+1};\ \frac{1}{20} X_{t+1}^*(i)^2,\ 1\right)}{\sum_{j=1}^{n} \mathcal{N}\left(y_{t+1};\ \frac{1}{20} X_{t+1}^*(j)^2,\ 1\right)}$
  • Resample n times from the $X_{t+1}^*(i)$ set, using the weights as probabilities, to obtain the $X_{t+1}(j)$ set. These are samples from the posterior, $p(x_{t+1}|y_{0:t+1})$

Along the way, we will simulate the actual trajectories and measurements using:

  • $X_{t+1} \sim \mathcal{N}\left(a X_t + b \frac{X_t}{1+X_t^2} + c \cos(\omega t),\ 10\right)$ (with a noise realization independent of the one used in the prediction step)
  • $Y_{t+1} \sim \mathcal{N}\left(\frac{1}{20} X_{t+1}^2, 1\right)$
In [490]:
n = 500
tmax = 300
a = 0.5
b = 25.
c = 8.
omega = 1.2
# function to evaluate Gaussian pdf
G = lambda (y,mu,sigsq): 1./sqrt(2*pi*sigsq) * exp(-0.5*(y - mu)**2/sigsq)

# Variables to save values
X = zeros(tmax) # Actual state t
Xtt = zeros(tmax) # Estimate of state t given t
Xttm1 = zeros(tmax) # Prediction of t given t-1
S = zeros((n,tmax)) # Save all particles

# Initialize t=0
Xt = 0.1
Xt_s = 0.1*ones(n)
X[0] = Xt
Xtt[0] = mean(Xt_s)
Xttm1[0] = mean(Xt_s)
S[:,0] = Xt_s

for t in range(tmax-1):

    # Step 0: generate true next step and measurement
    Xtp1 = a*Xt + b*Xt/(1+Xt**2) + c*cos(omega*t) + sqrt(10)*randn()
    Ytp1 = 1./20 * Xtp1**2 + randn()

    # Step 1: Prediction step
    Xtp1_star = a*Xt_s + b*Xt_s/(1+Xt_s**2) + c*cos(omega*t) + sqrt(10)*randn(n)

    # Step 2: Update step
    # (a) Weights
    w_numer = G((Ytp1, 1./20 * Xtp1_star**2, 1))
    w = w_numer/sum(w_numer)
    # (b) Resample
    counts = random.multinomial(n,w)
    Xtp1_s = []
    for i in range(n):
        for j in range(counts[i]):
            Xtp1_s.append(Xtp1_star[i])
    Xtp1_s = array(Xtp1_s)
    
    # Save results of this time
    X[t+1] = Xtp1
    Xtt[t+1] = mean(Xtp1_s)
    Xttm1[t+1] = mean(Xtp1_star)
    S[:,t+1] = Xtp1_s
            
    # Propagate values to next iteration
    Xt = Xtp1
    Xt_s = Xtp1_s
In [487]:
t = arange(tmax)

figure(figsize=(10,4.5))
subplot(3,1,1)
plot(t,X,'k-')
ylabel('X(t)')
subplot(3,1,2)
plot(t,Xtt,'k-')
ylabel('Estimate X_t|t')
subplot(3,1,3)
plot(t,Xttm1,'k-')
ylabel('Prediction X_t|t-1')
xlabel('t')


#plot(t,Xtt,'r-')
Out[487]:
<matplotlib.text.Text at 0x7f9cf805c7d0>
In [488]:
ett = Xtt - X
figure(figsize=(10,1.5))
plot(t,ett,'k-')
title('Estimation Error')
xlabel('t')
Out[488]:
<matplotlib.text.Text at 0x7f9cf80e6bd0>
In [489]:
ettm1 = Xttm1 - X
figure(figsize=(10,1.5))
plot(t,ettm1,'k-')
title('Prediction Error')
xlabel('t')
Out[489]:
<matplotlib.text.Text at 0x7f9cf2ec8150>

Show the distribution of particles at times t=5 and t=50, using kernel density estimation.

In [515]:
t = 5
d = 0.1 # width of kernel, chosen by trial and error
Xt_s = S[:,t]
x = linspace(-15,15,1000)
f = zeros(len(x))
for i in range(n):
    f += 1./n * G((x, Xt_s[i], sqrt(d)))
plot(x,f,'k-')
title('Particle Distribution at t=%i' % t)
xlabel('x')
ylabel('p(x_t|y_0:t)')
Out[515]:
<matplotlib.text.Text at 0x7f9cf1e2fb90>
In [516]:
t = 50
d = 0.1 # width of kernel, chosen by trial and error
Xt_s = S[:,t]
x = linspace(-15,15,1000)
f = zeros(len(x))
for i in range(n):
    f += 1./n * G((x, Xt_s[i], sqrt(d)))
plot(x,f,'k-')
title('Particle Distribution at t=%i' % t)
xlabel('x')
ylabel('p(x_t|y_0:t)')
Out[516]:
<matplotlib.text.Text at 0x7f9cf1d65190>

We estimate the mean-squared estimation error and prediction error using a Monte Carlo approach with K = 100

In [529]:
K = 100
tmax = 300
n = 500
a = 0.5
b = 25.
c = 8.
omega = 1.2

Z_est = zeros((K,tmax)) # This will hold estimation errors for each MC trial for each time
Z_pre = zeros((K,tmax)) # Same, but for prediction error

for k in range(K):

    # function to evaluate Gaussian pdf
    G = lambda (y,mu,sigsq): 1./sqrt(2*pi*sigsq) * exp(-0.5*(y - mu)**2/sigsq)

    # Variables to save values
    X = zeros(tmax) # Actual state t
    Xtt = zeros(tmax) # Estimate of state t given t
    Xttm1 = zeros(tmax) # Prediction of t given t-1

    # Initialize t=0
    Xt = 0.1
    Xt_s = 0.1*ones(n)
    X[0] = Xt
    Xtt[0] = mean(Xt_s)
    Xttm1[0] = mean(Xt_s)

    for t in range(tmax-1):

        # Step 0: generate true next step and measurement
        Xtp1 = a*Xt + b*Xt/(1+Xt**2) + c*cos(omega*t) + sqrt(10)*randn()
        Ytp1 = 1./20 * Xtp1**2 + randn()

        # Step 1: Prediction step
        Xtp1_star = a*Xt_s + b*Xt_s/(1+Xt_s**2) + c*cos(omega*t) + sqrt(10)*randn(n)

        # Step 2: Update step
        # (a) Weights
        w_numer = G((Ytp1, 1./20 * Xtp1_star**2, 1))
        w = w_numer/sum(w_numer)
        # (b) Resample
        counts = random.multinomial(n,w)
        Xtp1_s = []
        for i in range(n):
            for j in range(counts[i]):
                Xtp1_s.append(Xtp1_star[i])
        Xtp1_s = array(Xtp1_s)

        # Save results of this time
        X[t+1] = Xtp1
        Xtt[t+1] = mean(Xtp1_s)
        Xttm1[t+1] = mean(Xtp1_star)

        # Propagate values to next iteration
        Xt = Xtp1
        Xt_s = Xtp1_s
        
    # Save results of this MC trial
    err_est = Xtt - X
    err_pre = Xttm1 - X
    Z_est[k,:] = err_est
    Z_pre[k,:] = err_pre
In [530]:
# Compute MSE for estimation and prediction using empirical distribution from MC trials
mse_est = mean(Z_est**2, axis=0)
mse_pre = mean(Z_pre**2, axis=0)

figure(figsize=(10,3))
t = arange(tmax)
subplot(2,1,1)
plot(t,mse_est,'k-')
ylabel('Estimation Error')
subplot(2,1,2)
plot(t,mse_pre,'r-')
ylabel('Prediction Error')
xlabel('t')
Out[530]:
<matplotlib.text.Text at 0x7f9cf141ca10>

The plots above show the Monte Carlo estimates of the mean-squared estimation and prediction errors and how they evolve with time. There is a clear periodicity in the results, which makes sense given the $c \cos(\omega t)$ forcing term: the error is larger when the actual state is large.

(Problem 6)

In [645]:
K = 30 # number of iterations of EM
z_vec = zeros(K)
m_vec = zeros(K)

z = 0.5 # zeta
m = 1.0 # lambda
z_vec[0] = z
m_vec[0] = m

# Pre-calculate relevant values
dat = [3062, 587, 284, 103, 33, 4, 2]
N0 = 1.0*dat[0] # number samples equal to 0
N1  = 1.0*sum(dat[1:]) # number samples >= 1
M1 = 1.0*sum([i*dat[i] for i in range(1,7)]) # weighted sum of samples >=1

for k in range(1,K):
    B1 = exp(-m) / (exp(-m) + (1-z)/z)
    B0 = (1-z)/z / ((1-z)/z + exp(-m))
    
    z = (B1*N0 + N1)/(2*N0 + N1)
    m = M1/(N0*B1 + N1)
    z_vec[k] = z
    m_vec[k] = m
        
    
    
In [646]:
print z_vec
plot(z_vec,'k.-')
[ 0.5         0.25732081  0.19554191  0.17212981  0.16440093  0.16202649
  0.16131641  0.16110589  0.16104364  0.16102524  0.16101981  0.1610182
  0.16101773  0.16101759  0.16101755  0.16101754  0.16101753  0.16101753
  0.16101753  0.16101753  0.16101753  0.16101753  0.16101753  0.16101753
  0.16101753  0.16101753  0.16101753  0.16101753  0.16101753  0.16101753]
Out[646]:
[<matplotlib.lines.Line2D at 0x7f9ceaa43410>]
In [647]:
print m_vec
plot(m_vec,'k.-')
[ 1.          0.88646949  1.1665379   1.32520359  1.38750457  1.40783798
  1.41403497  1.41588274  1.41643007  1.41659187  1.41663967  1.41665379
  1.41665796  1.41665919  1.41665956  1.41665967  1.4166597   1.41665971
  1.41665971  1.41665971  1.41665971  1.41665971  1.41665971  1.41665971
  1.41665971  1.41665971  1.41665971  1.41665971  1.41665971  1.41665971]
Out[647]:
[<matplotlib.lines.Line2D at 0x7f9cea9f13d0>]

(Problem 7)

The EM algorithm for this case is given in the course notes on page 6 of lecture9_10_11.pdf and is readily generalized to 2 dimensions. For the initial estimate, we start with a single Gaussian fit (i.e., the sample mean and covariance of all the data) and then perturb the mean by -10% and +10% to obtain the two initial component means. This was a trick used in ECE 544 NA. We will start with pi_1 = 0.5.

In [740]:
# Read data file
with open('/home/bhardin2/ece598pm/data.txt') as f:
    lines = f.readlines()
n = len(lines)
x = zeros((2,n))
for i in range(len(lines)):
    line = lines[i]
    xys = line.split()
    x[:,i] = [float(v) for v in xys]
plot(x[0,:],x[1,:],'k.')
title('Data points')
Out[740]:
<matplotlib.text.Text at 0x7f9ce9cc0750>
In [745]:
K = 100

# Initial Guess
mu = mean(x,axis=1)
S = 1./n*(x.dot(x.T)) # uncentered second moment, used as a rough initial covariance
pi1 = 0.5
mu1 = mu*0.90 # overall mean perturbed by -10%
mu2 = mu*1.10 # overall mean perturbed by +10%
S1 = S.copy()
S2 = S.copy()

for k in range(K):

    # Calculate posterior
    p1 = zeros(n)
    p2 = zeros(n)
    for j in range(n):
        c = x[:,j] - mu1
        p1[j] = pi1     * (2*pi)**(-1) * linalg.det(S1)**(-0.5) * exp(-0.5*c.T.dot(linalg.inv(S1)).dot(c))
        c = x[:,j] - mu2
        p2[j] = (1-pi1) * (2*pi)**(-1) * linalg.det(S2)**(-0.5) * exp(-0.5*c.T.dot(linalg.inv(S2)).dot(c))
    denom = p1+p2
    p1 = p1/denom
    p2 = p2/denom

    # Calculate new mean
    mu1 = sum(x*p1,axis=1)/sum(p1)
    mu2 = sum(x*p2,axis=1)/sum(p2)

    # Calculate new covariance matrix
    S1top = zeros((2,2))
    S2top = zeros((2,2))
    for j in range(n):
        c1 = x[:,j] - mu1
        c2 = x[:,j] - mu2
        S1top = S1top + p1[j]*outer(c1,c1)
        S2top = S2top + p2[j]*outer(c2,c2)
    S1 = S1top / sum(p1)
    S2 = S2top / sum(p2)

    # Calculate new weights
    pi1 = 1./n * sum(p1)
    
    # Show plot for one variable, just to see
    plot(k,mu1[0],'k.')
xlabel('iteration')
ylabel('mu1[0][k]')
Out[745]:
<matplotlib.text.Text at 0x7f9ce9c9e7d0>
In [739]:
plot(x[0,:],x[1,:],'k.')
plot(mu1[0],mu1[1],'ro',markersize=20)
plot(mu2[0],mu2[1],'ro',markersize=20)
title('Data points with estimated means')
Out[739]:
<matplotlib.text.Text at 0x7f9ce9dfd350>

The values of the estimated parameters are given below.

In [743]:
print 'pi1 is ',pi1
print 'mu1 is ',mu1
print 'mu2 is ',mu2
print 'cov1 is ',S1
print 'cov2 is ',S2
pi1 is  0.680003164256
mu1 is  [ 1.12808961  1.3624249 ]
mu2 is  [-2.59131001 -4.26514449]
cov1 is  [[ 3.12375576 -1.60642511]
 [-1.60642511  2.89376025]]
cov2 is  [[ 0.84868283 -0.02064296]
 [-0.02064296  0.8845108 ]]

By playing around with it, I found that the choice of initialization has a significant impact on the rate of convergence. Sometimes the algorithm gets stuck near a saddle point for some time. Occasionally, if a really bad initialization is chosen, the algorithm does not converge.
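One way to compare runs from different initializations (a sketch, not part of the original solution; the helper name gmm_loglik is made up here) is to evaluate the data log-likelihood of the fitted two-component mixture after each run and keep the best:

import numpy as np

def gmm_loglik(x, pi1, mu1, S1, mu2, S2):
    '''x is 2-by-n; returns sum_j log(pi1*N(x_j; mu1, S1) + (1-pi1)*N(x_j; mu2, S2)).'''
    def comp(mu, S):
        d = x - mu[:, None]                      # center each column of x
        Sinv = np.linalg.inv(S)
        quad = np.sum(d * Sinv.dot(d), axis=0)   # per-sample squared Mahalanobis distance
        return np.exp(-0.5*quad) / (2*np.pi*np.sqrt(np.linalg.det(S)))
    return np.sum(np.log(pi1*comp(mu1, S1) + (1 - pi1)*comp(mu2, S2)))

# e.g., evaluate gmm_loglik(x, pi1, mu1, S1, mu2, S2) after each run; larger is better.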

In [531]:
import subprocess
import shutil
nb_name = 'ECE598PM_HW2.ipynb' # There's probably a way to get this programmatically
cmd = ['ipython', 'nbconvert', '--to', 'slides', nb_name, \
       '--reveal-prefix', '"http://cdn.jsdelivr.net/reveal.js/2.6.2"',]
#      '--template','output_toggle']
# The --template command points to a custom template which opens and closes the input code cells with a click
subprocess.call(cmd)
html_name = '%s.slides.html' % (nb_name.split('.')[0])
shutil.copy(html_name, '/home/bhardin2/public_html/slideshows/')