%pylab inline
Populating the interactive namespace from numpy and matplotlib
(Problem 1b2)
Compute the inverse and Cholesky factorization of K, then generate correlated samples with mean [1, 1]
K = array([[1, 0.5],[0.5,1]])
Kinv = linalg.inv(K)
G = cholesky(K)
# Compute samples
n = 1000
y = randn(2,n) # generate uncorrelated zero-mean samples
z = G.dot(y) # generate correlated samples
z = z + ones((2,n)) # add mean
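As a quick sanity check (not required by the problem), the sample mean and covariance of z should be close to the target mean [1, 1] and covariance K:
# Sanity check (sketch): empirical statistics of the generated samples
print 'sample mean:       ', mean(z, axis=1)   # should be close to [1, 1]
print 'sample covariance:'
print cov(z)                                   # should be close to K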
Gradient Descent
def calc_grad(w,z):
what = mean(z,axis=1)
return 2*Kinv.dot(w - what)
a = 1e-1
w = array([-1,0])
maxiters = 60
figure(figsize=(6,3))
for i in range(maxiters):
w = w - a*calc_grad(w,z)
subplot(1,2,1)
plot(i,w[0],'k.')
subplot(1,2,2)
plot(i,w[1],'k.')
subplot(1,2,1)
xlabel('Iteration')
ylabel('w[0]')
grid()
subplot(1,2,2)
xlabel('Iteration')
ylabel('w[1]')
grid()
tight_layout()
wgd = w
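Since the empirical risk here is quadratic in w, its exact minimizer is the sample mean of z, so GD should converge toward it; a quick comparison (a sketch, not part of the required solution):
# Compare the GD solution with the closed-form minimizer (sample mean of z)
print 'GD solution:      ', wgd
print 'Sample mean of z: ', mean(z, axis=1)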
(Problem 1b3) Stochastic Gradient Descent.
Instead of taking the mean over the entire dataset each iteration, we will simply choose a sample at random each iteration.
def calc_s_grad(w,z):
i = randint(n)
what = z[:,i]
return 2*Kinv.dot(w - what)
a = lambda i: (i+1)**(-1)
w = array([-1,0])
maxiters = 1000
figure(figsize=(6,3))
for i in range(maxiters):
w = w - a(i)*calc_s_grad(w,z)
subplot(1,2,1)
plot(i,w[0],'k.')
subplot(1,2,2)
plot(i,w[1],'k.')
subplot(1,2,1)
xlabel('Iteration')
ylabel('w[0]')
grid()
subplot(1,2,2)
xlabel('Iteration')
ylabel('w[1]')
grid()
tight_layout()
wsgd = w
(Problem 1b4)
rstarGD = 0
for i in range(n):
x = z[:,i] - wgd
rstarGD += 1./n * x.T.dot(Kinv).dot(x)
rstarSGD = 0
for i in range(n):
x = z[:,i] - wsgd
rstarSGD += 1./n * x.T.dot(Kinv).dot(x)
print 'rstarGD is %.8f' % rstarGD
print 'rstarSGD is %.8f' % rstarSGD
rstarGD is 1.99292959
rstarSGD is 1.99912833
Total Computational Work:
SGD reached a similar risk with far fewer operations: each GD iteration averages the gradient over all n = 1000 samples (60 iterations in total), while each SGD iteration uses a single randomly chosen sample (1000 iterations in total).
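One rough way to quantify this is to count per-sample gradient contributions (a back-of-the-envelope sketch; it ignores the fact that the full-data mean could be cached):
# Back-of-the-envelope work count: per-sample gradient contributions
work_gd  = 60 * n     # 60 GD iterations, each touching all n = 1000 samples
work_sgd = 1000 * 1   # 1000 SGD iterations, one random sample each
print 'GD  work: %i sample-gradient terms' % work_gd
print 'SGD work: %i sample-gradient terms' % work_sgd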
(Problem 1c)
def solve_GD(z):
'''
Solve the GD problem and return rstar
'''
def calc_grad(w,z):
what = mean(z,axis=1)
return 2*Kinv.dot(w - what)
a = 1e-1
w = array([-1,0])
maxiters = 60
for i in range(maxiters):
w = w - a*calc_grad(w,z)
wgd = w
rstarGD = 0
for i in range(n):
x = z[:,i] - wgd
rstarGD += 1./n * x.T.dot(Kinv).dot(x)
return rstarGD
def solve_SGD(z):
'''
Solve the SGD problem and return rstar
'''
def calc_s_grad(w,z):
i = randint(n)
what = z[:,i]
return 2*Kinv.dot(w - what)
a = lambda i: (i+1)**(-1)
w = array([-1,0])
maxiters = 1000
for i in range(maxiters):
w = w - a(i)*calc_s_grad(w,z)
wsgd = w
rstarSGD = 0
for i in range(n):
x = z[:,i] - wsgd
rstarSGD += 1./n * x.T.dot(Kinv).dot(x)
return rstarSGD
# Run 100 MC simulations
m = 100
rstarGD = zeros(m)
rstarSGD = zeros(m)
for j in range(m):
y = randn(2,n) # generate uncorrelated zero-mean samples
z = G.dot(y) # generate correlated samples
z = z + ones((2,n)) # add mean
rstarGD[j] = solve_GD(z)
rstarSGD[j] = solve_SGD(z)
# Calculate excess error
esgd = mean(rstarSGD)
egd = mean(rstarGD)
print 'We assume that we know rstar = 2. (In reality, we do not, unless we know p)'
print 'Excess Error GD = %.8f' % (egd - 2)
print 'Excess Error SGD = %.8f' % (esgd - 2)
print 'Learning Error from part 1b1: %.8f' % (2./n)
We assume that we know rstar = 2. (In reality, we do not, unless we know p)
Excess Error GD = 0.00540472
Excess Error SGD = 0.00930940
Learning Error from part 1b1: 0.00200000
The excess error from SGD is roughly twice that of GD, and GD's excess error is in turn about 2.5 to 3 times the theoretical ideal learning error from part 1b1.
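For reference, here is where the values $r^* = 2$ and $2/n$ come from, assuming the risk in this problem is $R(w) = \left\langle (Z-w)^T K^{-1} (Z-w) \right\rangle$ (which is what the empirical rstar computation above evaluates), with $\mu = \left\langle Z \right\rangle$ and $\hat{\mu}$ the sample mean:
$$ R^* = \left\langle (Z-\mu)^T K^{-1} (Z-\mu) \right\rangle = \mathrm{tr}\left(K^{-1} K\right) = 2 $$
$$ \left\langle R(\hat{\mu}) \right\rangle - R^* = \left\langle (\hat{\mu}-\mu)^T K^{-1} (\hat{\mu}-\mu) \right\rangle = \mathrm{tr}\left(K^{-1} \tfrac{1}{n} K\right) = \frac{2}{n} $$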
(Problem 2a)
a = 0.1
n = 1000
hx = zeros(n)
for i in range(n):
# Generate random sample from f(x)
x = rand(10)
# Calculate h(x)
hx[i] = exp(-a*norm(x)**2)
Iest = mean(hx)
print 'I(%.1f) is estimated to be %.5f' % (a,Iest)
import scipy.stats
def Q(x):
return 1 - scipy.stats.norm.cdf(x)
Itrue = (pi/a)**5 * (0.5 - Q(sqrt(2*a)))**10
print 'I(%.1f) is actually %.5f (using Q function)' % (a,Itrue)
I(0.1) is estimated to be 0.71878
I(0.1) is actually 0.71970 (using Q function)
a = 10.
n = 1000
hx = zeros(n)
for i in range(n):
# Generate random sample from f(x)
x = rand(10)
# Calculate h(x)
hx[i] = exp(-a*norm(x)**2)
Iest = mean(hx)
print 'I(%.1f) is estimated to be %e' % (a,Iest)
import scipy.stats
def Q(x):
return 1 - scipy.stats.norm.cdf(x)
Itrue = (pi/a)**5 * (0.5 - Q(sqrt(2*a)))**10
print 'I(%.1f) is actually %e (using Q function)' % (a,Itrue)
I(10.0) is estimated to be 3.413898e-07
I(10.0) is actually 2.988242e-06 (using Q function)
The estimate for $a=0.1$ is much better than for $a=10$. This is likely because few of the samples are useful when $a=10$: the integrand is sharply peaked near $\|x\|=0$, while most uniform samples have larger $\|x\|$. Importance sampling could solve this problem; alternatively, one could sample from a Gaussian pdf instead of a uniform one.
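One way to quantify the accuracy is the relative standard error of the plain Monte Carlo estimate (a sketch, shown here for the most recent run, i.e. $a=10$):
# Relative standard error of the plain MC estimate (sketch)
rel_se = std(hx) / (sqrt(n) * mean(hx))
print 'Relative standard error of the estimate: %.2f' % rel_se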
(Problem 2b)
x = linspace(0,1)
f = exp(-10*x**2)*sqrt(x)  # 1-D cross-section of the integrand f(x)h(x)
plot(x,f,'k-',label=r'$f(x)h(x)$')
g = 10*exp(-10*x)  # exponential proposal pdf with rate 10
semilogy(x,f/g,'r--',label=r'$\frac{f(x)h(x)}{g(x)}$')
legend(loc='best')
xlabel('x')
n = 1000
lam = 10
fhg = zeros(n)
nwasted = 0
for i in range(n):
x = random.exponential(1./lam, size=10)
fhg[i] = exp(-10*norm(x)**2)*sum(sqrt(x))/(lam**10 * exp(-lam * sum(x)))
if any(x > 1.0):
fhg[i] = 0
nwasted +=1
print '%i Wasted Samples' % nwasted
Iest = mean(fhg)
print 'Estimated integral value: %e' % Iest
2 Wasted Samples
Estimated integral value: 1.114372e-05
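To gauge the accuracy of the importance-sampling estimate, one can also report its standard error (a sketch):
# Standard error of the importance-sampling estimate (sketch)
se = std(fhg) / sqrt(n)
print 'Estimated standard error: %e' % se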
(Problem 3)
(Part a) $n=6$
We are interested in the statistics of the normalized estimation error:
$e_k = \sqrt{n} \left(\hat{\theta}(Y_k) - \theta\right)$
where $k= 1,...,K$ indexes the MC trial. But what should we use for $\theta$? In practice, it is not known. In the spirit of bootstrap, we will substitute $\hat{\theta}(Y)$, the estimate using the original data.
n = 6
K = 1000
# Generate data
y = randn(n)
orig_theta = mean(y**2)
err_samples = zeros(K)
for k in range(K):
# Resample data
idx = randint(n,size=n)
yk = y[idx]
# Compute variance estimate
theta_hat = mean(yk**2)
# Compute normalized error sample
err_samples[k] = sqrt(n)*(theta_hat - orig_theta)
# Draw cdf
e = linspace(-5,5,2000)
Fe = zeros(len(e))
for i in range(len(e)):
Fe[i] = 1.0*sum(err_samples < e[i])/K
plot(e,Fe,'k-',linewidth=2)
ylim((-0.1,1.1))
grid()
title('Empirical CDF of error: n=%i, K=%i' % (n,K))
The true mean and variance of the error are calculated next. The mean is zero, since $\hat{\theta}$ is an unbiased estimator:
$$ \left\langle\hat{\theta}\right\rangle = \left\langle \frac{1}{n} \sum_i Y_i^2 \right\rangle = \left\langle Y^2 \right\rangle = \theta $$
The variance is more complicated:
$$ \mathrm{Var}(e) = \mathrm{Var}\left(\sqrt{n}\, \frac{1}{n} \sum_i Y_i^2 \right) = \mathrm{Var}(Y^2) = \left\langle Y^4 \right\rangle - \left\langle Y^2 \right\rangle^2 $$
Using the Wikipedia page for higher moments of Gaussian variables, $\left\langle Y^4 \right\rangle = 3\sigma^4$, so
$$ \mathrm{Var}(e) = 3\sigma^4 - \sigma^4 = 2\sigma^4 = 2 $$
print 'Empirical Mean of Error: %e' % mean(err_samples)
print 'Analytical Mean of Error: 0.0'
print 'Empirical Var of Error: %f' % mean((err_samples-mean(err_samples))**2)
print 'Analytical Var of Error: %f' % (2)
Empirical Mean of Error: 6.626871e-02
Analytical Mean of Error: 0.0
Empirical Var of Error: 2.746079
Analytical Var of Error: 2.000000
The empirical values are reasonably close to the analytical ones, although with only $n=6$ data points the bootstrap variance fluctuates considerably when this code is rerun.
We now fit a zero-mean Gaussian to the empirical error distribution. The phrase "fit a Gaussian" could mean different things, but here we will define it as the Gaussian that maximizes a likelihood function:
$$\mathrm{L}(\sigma^2) = \ln \prod_i \mathcal{N}(e_i;\, \sigma^2) \,\propto\, \sum_i \left(-\ln \sigma - \frac{e_i^2}{\sigma^2}\right)$$
sigma_vec = linspace(0.1,5,10000) # Maximize the log-likelihood by brute-force search
L_vec = zeros(len(sigma_vec))
for i in range(len(sigma_vec)):
L_vec[i] = sum(-log(sigma_vec[i]) - err_samples**2/sigma_vec[i]**2)
sigsqopt = sigma_vec[argmax(L_vec)]
print 'Optimum sigma^2 is approximately %.3f' % sigsqopt
# Plot it against empirical CDF
plot(e,Fe,'k-',label='Empirical')
Fe_fit = zeros(len(e))
for i in range(len(e)):
Fe_fit[i] = 1-Q(e[i]/sqrt(sigsqopt))
plot(e,Fe_fit,'r--',label='Fit')
ylim((-0.1,1.1))
grid()
title('CDF of error: n=%i, K=%i' % (n,K))
legend(loc='best')
Optimum sigma^2 is approximately 2.345
The Gaussian fit is not good, likely because we only used $n=6$ data points.
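Note that for a zero-mean Gaussian the maximum-likelihood variance is available in closed form as the mean of the squared samples, which could replace the brute-force grid search; a minimal sketch:
# Closed-form zero-mean Gaussian ML estimate of the variance (sketch)
sigsq_ml = mean(err_samples**2)
print 'Closed-form ML sigma^2: %.3f' % sigsq_ml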
We now repeat the analysis for Laplacian samples.
n = 6
K = 1000
# Generate data
y = random.laplace(size=n)
orig_theta = mean(y**2)
err_samples = zeros(K)
for k in range(K):
# Resample data
idx = randint(n,size=n)
yk = y[idx]
# Compute variance estimate
theta_hat = mean(yk**2)
# Compute normalized error sample
err_samples[k] = sqrt(n)*(theta_hat - orig_theta)
# Draw cdf
e = linspace(-5,5,2000)
Fe = zeros(len(e))
for i in range(len(e)):
Fe[i] = 1.0*sum(err_samples < e[i])/K
plot(e,Fe,'k-',linewidth=2)
ylim((-0.1,1.1))
grid()
title('Empirical CDF of error: n=%i, K=%i' % (n,K))
print 'Empirical Mean of Error: %e' % mean(err_samples)
print 'Empirical Var of Error: %f' % mean((err_samples-mean(err_samples))**2)
Empirical Mean of Error: 3.656152e-02
Empirical Var of Error: 5.930745
The behavior with the Laplacian samples is worse. This makes sense because the Laplacian has heavier tails (a larger fourth moment), so the squared samples are more variable.
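For reference (assuming a zero-mean Laplace distribution), the Laplacian fourth moment satisfies $\left\langle Y^4\right\rangle = 6\left\langle Y^2\right\rangle^2$, so
$$ \mathrm{Var}(Y^2) = 6\left\langle Y^2\right\rangle^2 - \left\langle Y^2\right\rangle^2 = 5\left\langle Y^2\right\rangle^2, $$
compared with $2\left\langle Y^2\right\rangle^2$ for a Gaussian, which is consistent with the larger spread of the normalized error seen here.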
(Part b) Repeat everything above, but using $n=100$
n = 100
K = 1000
# Generate data
y = randn(n)
orig_theta = mean(y**2)
err_samples = zeros(K)
for k in range(K):
# Resample data
idx = randint(n,size=n)
yk = y[idx]
# Compute variance estimate
theta_hat = mean(yk**2)
# Compute normalized error sample
err_samples[k] = sqrt(n)*(theta_hat - orig_theta)
# Draw cdf
e = linspace(-5,5,2000)
Fe = zeros(len(e))
for i in range(len(e)):
Fe[i] = 1.0*sum(err_samples < e[i])/K
plot(e,Fe,'k-',linewidth=2)
ylim((-0.1,1.1))
grid()
title('Empirical CDF of error: n=%i, K=%i' % (n,K))
print 'Empirical Mean of Error: %e' % mean(err_samples)
print 'Analytical Mean of Error: 0.0'
print 'Empirical Var of Error: %f' % mean((err_samples-mean(err_samples))**2)
print 'Analytical Var of Error: %f' % (2)
Empirical Mean of Error: -2.880882e-02
Analytical Mean of Error: 0.0
Empirical Var of Error: 3.343829
Analytical Var of Error: 2.000000
The empirical mean is again close to the analytical value. The empirical variance is somewhat larger than the analytical value for this particular data draw; the bootstrap variance tracks the sample's own fourth moment, which is itself a noisy quantity.
We now fit a zero-mean Gaussian to the empirical error distribution. The phrase "fit a Gaussian" could mean different things, but here we will define it as the Gaussian that maximizes a likelihood function:
$$\mathrm{L}(\sigma^2) = \ln \prod_i \mathcal{N}(e_i;\, \sigma^2) \,\propto\, \sum_i \left(-\ln \sigma - \frac{e_i^2}{\sigma^2}\right)$$
sigma_vec = linspace(0.1,5,10000) # Maximize the log-likelihood by brute-force search
L_vec = zeros(len(sigma_vec))
for i in range(len(sigma_vec)):
L_vec[i] = sum(-log(sigma_vec[i]) - err_samples**2/sigma_vec[i]**2)
sigsqopt = sigma_vec[argmax(L_vec)]
print 'Optimum sigma^2 is approximately %.3f' % sigsqopt
# Plot it against empirical CDF
plot(e,Fe,'k-',label='Empirical')
Fe_fit = zeros(len(e))
for i in range(len(e)):
Fe_fit[i] = 1-Q(e[i]/sqrt(sigsqopt))
plot(e,Fe_fit,'r--',label='Fit')
ylim((-0.1,1.1))
grid()
title('CDF of error: n=%i, K=%i' % (n,K))
legend(loc='best')
Optimum sigma^2 is approximately 2.587
The Gaussian fit is better than in Part a, but still not perfect; the empirical error distribution appears to be heavier-tailed than a Gaussian.
We now repeat the analysis for Laplacian samples.
n = 100
K = 1000
# Generate data
y = random.laplace(size=n)
orig_theta = mean(y**2)
err_samples = zeros(K)
for k in range(K):
# Resample data
idx = randint(n,size=n)
yk = y[idx]
# Compute variance estimate
theta_hat = mean(yk**2)
# Compute normalized error sample
err_samples[k] = sqrt(n)*(theta_hat - orig_theta)
# Draw cdf
e = linspace(-5,5,2000)
Fe = zeros(len(e))
for i in range(len(e)):
Fe[i] = 1.0*sum(err_samples < e[i])/K
plot(e,Fe,'k-',linewidth=2)
ylim((-0.1,1.1))
grid()
title('Empirical CDF of error: n=%i, K=%i' % (n,K))
print 'Empirical Mean of Error: %e' % mean(err_samples)
print 'Empirical Var of Error: %f' % mean((err_samples-mean(err_samples))**2)
Empirical Mean of Error: -1.847888e-02
Empirical Var of Error: 8.596222
Once again, the behavior with the Laplacian samples is worse. This makes sense because Laplacian samples are heavier-tailed than Gaussians.
(Problem 4)
Prediction Step: $X_{t+1}(i)^* \sim \mathcal{N}\left(a X_t(i) + b \frac{X_t(i)}{1+X_t(i)^2} + c \cos(\omega t), 10\right) ~~~ 1 \le i \le n$
Update Step: compute normalized weights $w_i \propto \mathcal{N}\!\left(Y_{t+1};\ \tfrac{1}{20}\left(X_{t+1}(i)^*\right)^2,\ 1\right)$ for each particle, then resample $n$ particles from $\{X_{t+1}(i)^*\}$ with probabilities $\{w_i\}$.
Along the way, we will simulate the actual trajectory and measurements $Y_{t+1} = \tfrac{1}{20}X_{t+1}^2 + V_{t+1}$, $V_{t+1} \sim \mathcal{N}(0,1)$, using:
n = 500
tmax = 300
a = 0.5
b = 25.
c = 8.
omega = 1.2
# function to evaluate Gaussian pdf
G = lambda y, mu, sigsq: 1./sqrt(2*pi*sigsq) * exp(-0.5*(y - mu)**2/sigsq)
# Variables to save values
X = zeros(tmax) # Actual state t
Xtt = zeros(tmax) # Estimate of state t given t
Xttm1 = zeros(tmax) # Prediction of t given t-1
S = zeros((n,tmax)) # Save all particles
# Initialize t=0
Xt = 0.1
Xt_s = 0.1*ones(n)
X[0] = Xt
Xtt[0] = mean(Xt_s)
Xttm1[0] = mean(Xt_s)
S[:,0] = Xt_s
for t in range(tmax-1):
# Step 0: generate true next step and measurement
Xtp1 = a*Xt + b*Xt/(1+Xt**2) + c*cos(omega*t) + sqrt(10)*randn()
Ytp1 = 1./20 * Xtp1**2 + randn()
# Step 1: Prediction step
Xtp1_star = a*Xt_s + b*Xt_s/(1+Xt_s**2) + c*cos(omega*t) + sqrt(10)*randn(n)
# Step 2: Update step
# (a) Weights
    w_numer = G(Ytp1, 1./20 * Xtp1_star**2, 1)
w = w_numer/sum(w_numer)
# (b) Resample
counts = random.multinomial(n,w)
Xtp1_s = []
for i in range(n):
for j in range(counts[i]):
Xtp1_s.append(Xtp1_star[i])
Xtp1_s = array(Xtp1_s)
# Save results of this time
X[t+1] = Xtp1
Xtt[t+1] = mean(Xtp1_s)
Xttm1[t+1] = mean(Xtp1_star)
S[:,t+1] = Xtp1_s
    # Propagate values to next iteration
    Xt = Xtp1
    Xt_s = Xtp1_s
t = arange(tmax)
figure(figsize=(10,4.5))
subplot(3,1,1)
plot(t,X,'k-')
ylabel('X(t)')
subplot(3,1,2)
plot(t,Xtt,'k-')
ylabel('Estimate X_t|t')
subplot(3,1,3)
plot(t,Xttm1,'k-')
ylabel('Prediction X_t|t-1')
xlabel('t')
#plot(t,Xtt,'r-')
ett = Xtt - X
figure(figsize=(10,1.5))
plot(t,ett,'k-')
title('Estimation Error')
xlabel('t')
ettm1 = Xttm1 - X
figure(figsize=(10,1.5))
plot(t,ettm1,'k-')
title('Prediction Error')
xlabel('t')
Show distribution of particles at time t=5 and t=50, using kernel density estimation
t = 5
d = 0.1 # width of kernel, chosen by trial and error
Xt_s = S[:,t]
x = linspace(-15,15,1000)
f = zeros(len(x))
for i in range(n):
    f += 1./n * G(x, Xt_s[i], sqrt(d))
plot(x,f,'k-')
title('Particle Distribution at t=%i' % t)
xlabel('x')
ylabel('p(x_t|y_0:t)')
t = 50
d = 0.1 # width of kernel, chosen by trial and error
Xt_s = S[:,t]
x = linspace(-15,15,1000)
f = zeros(len(x))
for i in range(n):
    f += 1./n * G(x, Xt_s[i], sqrt(d))
plot(x,f,'k-')
title('Particle Distribution at t=%i' % t)
xlabel('x')
ylabel('p(x_t|y_0:t)')
We estimate the mean-squared estimation error and prediction error using a Monte Carlo approach with K = 100 trials.
K = 100
tmax = 300
n = 500
a = 0.5
b = 25.
c = 8.
omega = 1.2
Z_est = zeros((K,tmax)) # This will hold estimation errors for each MC trial for each time
Z_pre = zeros((K,tmax)) # Same, but for prediction error
for k in range(K):
# function to evaluate Gaussian pdf
    G = lambda y, mu, sigsq: 1./sqrt(2*pi*sigsq) * exp(-0.5*(y - mu)**2/sigsq)
# Variables to save values
X = zeros(tmax) # Actual state t
Xtt = zeros(tmax) # Estimate of state t given t
Xttm1 = zeros(tmax) # Prediction of t given t-1
# Initialize t=0
Xt = 0.1
Xt_s = 0.1*ones(n)
X[0] = Xt
Xtt[0] = mean(Xt_s)
Xttm1[0] = mean(Xt_s)
for t in range(tmax-1):
# Step 0: generate true next step and measurement
Xtp1 = a*Xt + b*Xt/(1+Xt**2) + c*cos(omega*t) + sqrt(10)*randn()
Ytp1 = 1./20 * Xtp1**2 + randn()
# Step 1: Prediction step
Xtp1_star = a*Xt_s + b*Xt_s/(1+Xt_s**2) + c*cos(omega*t) + sqrt(10)*randn(n)
# Step 2: Update step
# (a) Weights
        w_numer = G(Ytp1, 1./20 * Xtp1_star**2, 1)
w = w_numer/sum(w_numer)
# (b) Resample
counts = random.multinomial(n,w)
Xtp1_s = []
for i in range(n):
for j in range(counts[i]):
Xtp1_s.append(Xtp1_star[i])
Xtp1_s = array(Xtp1_s)
# Save results of this time
X[t+1] = Xtp1
Xtt[t+1] = mean(Xtp1_s)
Xttm1[t+1] = mean(Xtp1_star)
        # Propagate values to next iteration
        Xt = Xtp1
        Xt_s = Xtp1_s
    # Save results of this MC trial
    err_est = Xtt - X
    err_pre = Xttm1 - X
    Z_est[k,:] = err_est
    Z_pre[k,:] = err_pre
# Compute MSE for estimation and prediction using empirical distribution from MC trials
mse_est = mean(Z_est**2, axis=0)
mse_pre = mean(Z_pre**2, axis=0)
figure(figsize=(10,3))
t = arange(tmax)
subplot(2,1,1)
plot(t,mse_est,'k-')
ylabel('Estimation Error')
subplot(2,1,2)
plot(t,mse_pre,'r-')
ylabel('Prediction Error')
xlabel('t')
The plots above show how the Monte Carlo estimates of the mean-squared estimation and prediction errors evolve with time. There is a clear periodicity in the results, which makes sense: the error is larger when the magnitude of the true state is large.
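As a rough check on this claim, one can correlate the squared estimation error with the magnitude of the true state for the last MC trial (a sketch; it reuses the X and Xtt arrays left in memory from the final trial):
# Rough check (sketch): does the squared estimation error grow with |X(t)|?
print 'corr(|X|, squared estimation error) = %.3f' % corrcoef(abs(X), (Xtt - X)**2)[0,1]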
(Problem 6)
K = 30 # number of iterations of EM
z_vec = zeros(K)
m_vec = zeros(K)
z = 0.5 # zeta
m = 1.0 # lambda
z_vec[0] = z
m_vec[0] = m
# Pre-calculate relevant values
dat = [3062, 587, 284, 103, 33, 4, 2]
N0 = 1.0*dat[0] # number samples equal to 0
N1 = 1.0*sum(dat[1:]) # number samples >= 1
M1 = 1.0*sum([i*dat[i] for i in range(1,7)]) # weighted sum of samples >=1
for k in range(1,K):
B1 = exp(-m) / (exp(-m) + (1-z)/z)
B0 = (1-z)/z / ((1-z)/z + exp(-m))
z = (B1*N0 + N1)/(2*N0 + N1)
m = M1/(N0*B1 + N1)
z_vec[k] = z
m_vec[k] = m
print z_vec
plot(z_vec,'k.-')
[ 0.5 0.25732081 0.19554191 0.17212981 0.16440093 0.16202649 0.16131641 0.16110589 0.16104364 0.16102524 0.16101981 0.1610182 0.16101773 0.16101759 0.16101755 0.16101754 0.16101753 0.16101753 0.16101753 0.16101753 0.16101753 0.16101753 0.16101753 0.16101753 0.16101753 0.16101753 0.16101753 0.16101753 0.16101753 0.16101753]
print m_vec
plot(m_vec,'k.-')
[ 1. 0.88646949 1.1665379 1.32520359 1.38750457 1.40783798 1.41403497 1.41588274 1.41643007 1.41659187 1.41663967 1.41665379 1.41665796 1.41665919 1.41665956 1.41665967 1.4166597 1.41665971 1.41665971 1.41665971 1.41665971 1.41665971 1.41665971 1.41665971 1.41665971 1.41665971 1.41665971 1.41665971 1.41665971 1.41665971]
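Rather than running a fixed number of EM iterations, one could iterate the same updates until the parameters stop changing; a minimal sketch reusing the update equations above (the tolerance value is arbitrary):
# EM with a simple stopping rule (sketch); same updates as above
zc, mc = 0.5, 1.0
for k in range(200):                      # hard cap on the number of iterations
    B1 = exp(-mc) / (exp(-mc) + (1-zc)/zc)
    z_new = (B1*N0 + N1)/(2*N0 + N1)
    m_new = M1/(N0*B1 + N1)
    if abs(z_new - zc) < 1e-10 and abs(m_new - mc) < 1e-10:
        break
    zc, mc = z_new, m_new
print 'Stopped after %i iterations: zeta = %.6f, lambda = %.6f' % (k, zc, mc)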
(Problem 7)
The EM algorithm for this case is given in the course notes on page 6 of lecture9_10_11.pdf, and is readily generalized to 2 dimensions. For the initial estimate, we start from a single-Gaussian fit (i.e., the sample mean and covariance) and perturb the mean by +/- 10% to initialize the two component means. This was a trick used in ECE 544 NA. We'll start with pi_1 = 0.5.
# Read data file
with open('/home/bhardin2/ece598pm/data.txt') as f:
lines = f.readlines()
n = len(lines)
x = zeros((2,n))
for i in range(len(lines)):
line = lines[i]
xys = line.split()
x[:,i] = [float(v) for v in xys]
plot(x[0,:],x[1,:],'k.')
title('Data points')
K = 100
# Initial Guess
mu = mean(x,axis=1)
S = 1./n*(x.dot(x.T))
pi1 = 0.5
mu1 = mu*0.90
mu2 = mu*1.10
S1 = S.copy()
S2 = S.copy()
for k in range(K):
# Calculate posterior
p1 = zeros(n)
p2 = zeros(n)
for j in range(n):
c = x[:,j] - mu1
p1[j] = pi1 * (2*pi)**(-1) * linalg.det(S1)**(-0.5) * exp(-0.5*c.T.dot(linalg.inv(S1)).dot(c))
c = x[:,j] - mu2
p2[j] = (1-pi1) * (2*pi)**(-1) * linalg.det(S2)**(-0.5) * exp(-0.5*c.T.dot(linalg.inv(S2)).dot(c))
denom = p1+p2
p1 = p1/denom
p2 = p2/denom
# Calculate new mean
mu1 = sum(x*p1,axis=1)/sum(p1)
mu2 = sum(x*p2,axis=1)/sum(p2)
# Calculate new covariance matrix
S1top = zeros((2,2))
S2top = zeros((2,2))
for j in range(n):
c1 = x[:,j] - mu1
c2 = x[:,j] - mu2
S1top = S1top + p1[j]*outer(c1,c1)
S2top = S2top + p2[j]*outer(c2,c2)
S1 = S1top / sum(p1)
S2 = S2top / sum(p2)
# Calculate new weights
pi1 = 1./n * sum(p1)
# Show plot for one variable, just to see
plot(k,mu1[0],'k.')
xlabel('iteration')
ylabel('mu1[0][k]')
plot(x[0,:],x[1,:],'k.')
plot(mu1[0],mu1[1],'ro',markersize=20)
plot(mu2[0],mu2[1],'ro',markersize=20)
title('Data points with estimated means')
The values of the estimated parameters are given below.
print 'pi1 is ',pi1
print 'mu1 is ',mu1
print 'mu2 is ',mu2
print 'cov1 is ',S1
print 'cov2 is ',S2
pi1 is  0.680003164256
mu1 is  [ 1.12808961  1.3624249 ]
mu2 is  [-2.59131001 -4.26514449]
cov1 is  [[ 3.12375576 -1.60642511]
 [-1.60642511  2.89376025]]
cov2 is  [[ 0.84868283 -0.02064296]
 [-0.02064296  0.8845108 ]]
By playing around with it, I found that the choice of initialization has a significant impact on the rate of convergence. Sometimes the algorithm gets stuck near a saddle point for some time. Occasionally, if a really bad initialization is chosen, the algorithm will not converge.
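A useful companion to this observation is an objective way to compare runs from different initializations. The hypothetical helper below (a sketch; gmm_loglik is not part of the original code) evaluates the two-component GMM log-likelihood using the same component densities as the E-step above; runs that reach the better local optimum report a larger value.
def gmm_loglik(x, pi1, mu1, S1, mu2, S2):
    # Log-likelihood of the 2-component Gaussian mixture at the given parameters
    ll = 0.0
    for j in range(x.shape[1]):
        c1 = x[:,j] - mu1
        c2 = x[:,j] - mu2
        f1 = (2*pi)**(-1) * linalg.det(S1)**(-0.5) * exp(-0.5*c1.dot(linalg.inv(S1)).dot(c1))
        f2 = (2*pi)**(-1) * linalg.det(S2)**(-0.5) * exp(-0.5*c2.dot(linalg.inv(S2)).dot(c2))
        ll += log(pi1*f1 + (1-pi1)*f2)
    return ll
print 'Log-likelihood at the EM solution: %.2f' % gmm_loglik(x, pi1, mu1, S1, mu2, S2)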
import subprocess
import shutil
nb_name = 'ECE598PM_HW2.ipynb' # There's probably a way to get this programmatically
cmd = ['ipython', 'nbconvert', '--to', 'slides', nb_name, \
'--reveal-prefix', '"http://cdn.jsdelivr.net/reveal.js/2.6.2"',]
# '--template','output_toggle']
# The --template command points to a custom template which opens and closes the input code cells with a click
subprocess.call(cmd)
html_name = '%s.slides.html' % (nb_name.split('.')[0])
shutil.copy(html_name, '/home/bhardin2/public_html/slideshows/')