#!/usr/bin/env python
# coding: utf-8

# In[1]:


from IPython.display import Image
Image('../../../python_for_probability_statistics_and_machine_learning.jpg')


# [Python for Probability, Statistics, and Machine Learning](https://www.springer.com/fr/book/9783319307152)

# In[2]:


from __future__ import division
get_ipython().run_line_magic('pylab', 'inline')


# In this section, we consider the famous Gauss-Markov problem, which will give
# us an opportunity to use all the material we have so far developed. The
# Gauss-Markov model is the fundamental model for noisy parameter estimation
# because it estimates unobservable parameters given a noisy indirect
# measurement. Incarnations of the same model appear in all studies of Gaussian
# models. This case is an excellent opportunity to use everything we have so
# far learned about projection and conditional expectation.
#
# Following Luenberger [[luenberger1968optimization]](#luenberger1968optimization), let's consider the
# following problem:
# $$
# \mathbf{y} = \mathbf{W} \boldsymbol{\beta} + \boldsymbol{\epsilon}
# $$
# where $\mathbf{W}$ is an $n \times m$ matrix and $\mathbf{y}$ is an
# $n \times 1$ vector. Also, $\boldsymbol{\epsilon}$ is an $n$-dimensional, normally
# distributed random vector with zero mean and covariance,
# $$
# \mathbb{E}( \boldsymbol{\epsilon} \boldsymbol{\epsilon}^T) = \mathbf{Q}
# $$
# Note that engineering systems usually provide a *calibration mode*
# where you can estimate $\mathbf{Q}$, so it's not fantastical to assume you have
# some knowledge of the noise statistics. The problem is to find a matrix
# $\mathbf{K}$ so that $\boldsymbol{\hat{\beta}}=\mathbf{K} \mathbf{y}$
# approximates $\boldsymbol{\beta}$. Note that we only have knowledge of
# $\boldsymbol{\beta}$ via $\mathbf{y}$, so we can't measure it directly.
# Further, note that $\mathbf{K}$ is a matrix, not a vector, so there are $m
# \times n$ entries to compute.
#
# We can approach this problem the usual way by trying to solve the MMSE
# problem:
# $$
# \min_K \mathbb{E}(\Vert\boldsymbol{\hat{\beta}}-\boldsymbol{\beta} \Vert^2)
# $$
# which we can write out as
# $$
# \min_K \mathbb{E}(\Vert \boldsymbol{\hat{\beta}}-\boldsymbol{\beta} \Vert^2) = \min_K\mathbb{E}(\Vert \mathbf{K}\mathbf{y}- \boldsymbol{\beta} \Vert^2) = \min_K\mathbb{E}(\Vert \mathbf{K}\mathbf{W}\boldsymbol{\beta}+\mathbf{K}\boldsymbol{\epsilon}- \boldsymbol{\beta} \Vert^2)
# $$
# and since $\boldsymbol{\epsilon}$ is the only random variable here,
# this simplifies to
# $$
# \min_K \Vert \mathbf{K}\mathbf{W}\boldsymbol{\beta}-\boldsymbol{\beta} \Vert^2 + \mathbb{E}(\Vert\mathbf{K}\boldsymbol{\epsilon} \Vert^2)
# $$
# The next step is to compute
# $$
# \mathbb{E}(\Vert\mathbf{K}\boldsymbol{\epsilon} \Vert^2) = \mathbb{E}(\boldsymbol{\epsilon}^T \mathbf{K}^T \mathbf{K} \boldsymbol{\epsilon})=\Tr(\mathbf{K}\, \mathbb{E}(\boldsymbol{\epsilon}\boldsymbol{\epsilon}^T)\, \mathbf{K}^T)=\Tr(\mathbf{K Q K}^T)
# $$
# using the properties of the trace of a matrix. We can assemble
# everything as
# $$
# \min_K \Vert \mathbf{K W} \boldsymbol{\beta} - \boldsymbol{\beta}\Vert^2 + \Tr(\mathbf{K Q K}^T)
# $$
# Now, if we were to solve this for $\mathbf{K}$, it would be a
# function of $\boldsymbol{\beta}$, which is the same thing as saying that the
# estimator, $\boldsymbol{\hat{\beta}}$, is a function of what we are trying to
# estimate, $\boldsymbol{\beta}$, which makes no sense.
# However, writing this out
# tells us that if we had $\mathbf{K W}= \mathbf{I}$, then the first term
# vanishes and the problem simplifies to
# $$
# \min_K \Tr(\mathbf{K Q K}^T)
# $$
# with the constraint,
# $$
# \mathbf{KW} = \mathbf{I}
# $$
# This requirement is the same as asserting that the estimator is unbiased,
# $$
# \mathbb{E}(\boldsymbol{\hat{\beta}})=\mathbf{KW} \boldsymbol{\beta} = \boldsymbol{\beta}
# $$
# To line this problem up with our earlier work, let's consider the
# $i^{th}$ row of $\mathbf{K}$, written as the column vector $\mathbf{k}_i$. Now,
# we can re-write the problem as
# $$
# \min_{\mathbf{k}_i} (\mathbf{k}_i^T \mathbf{Q} \mathbf{k}_i)
# $$
# with
# $$
# \mathbf{k}_i^T \mathbf{W} = \mathbf{e}_i^T
# $$
# and from our previous work on constrained optimization, we know the
# solution to this:
# $$
# \mathbf{k}_i = \mathbf{Q}^{-1} \mathbf{W}(\mathbf{W}^T \mathbf{Q}^{-1} \mathbf{W})^{-1}\mathbf{e}_i
# $$
# Now all we have to do is stack these together for the general solution:
# $$
# \mathbf{K} = (\mathbf{W}^T \mathbf{Q}^{-1} \mathbf{W})^{-1} \mathbf{W}^T\mathbf{Q}^{-1}
# $$
# It's easy when you have all of the concepts lined up! For completeness, the
# covariance of the error is
# $$
# \mathbb{E}\left[(\hat{\boldsymbol{\beta}}-\boldsymbol{\beta}) (\hat{\boldsymbol{\beta}}-\boldsymbol{\beta})^T\right] = \mathbb{E}(\mathbf{K} \boldsymbol{\epsilon} \boldsymbol{\epsilon}^T \mathbf{K}^T)=\mathbf{K}\mathbf{Q}\mathbf{K}^T =(\mathbf{W}^T \mathbf{Q}^{-1} \mathbf{W})^{-1}
# $$
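# As a quick numerical sanity check of the trace identity used above, the
# following cell (not part of the original text; `K0` and `Q0` are arbitrary
# illustrative matrices) estimates $\mathbb{E}(\Vert\mathbf{K}\boldsymbol{\epsilon}\Vert^2)$
# by simulation and compares it to $\Tr(\mathbf{K Q K}^T)$,

# In[ ]:


K0 = np.random.randn(2,3)                    # arbitrary 2 x 3 matrix (illustration only)
A0 = np.random.randn(3,3)
Q0 = A0.dot(A0.T) + np.eye(3)                # arbitrary positive-definite covariance
eps0 = np.random.multivariate_normal(np.zeros(3), Q0, 100000)
mc = (eps0.dot(K0.T)**2).sum(axis=1).mean()  # Monte Carlo estimate of E(||K0 eps||^2)
print(mc, np.trace(K0.dot(Q0).dot(K0.T)))    # the two should nearly agree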
# The red circles show the simulated data, which are estimated in the xy-plane by the black points.
#
#
#
#
#
# [Figure](#fig:Gauss_Markov_001) shows the simulated $\mathbf{y}$ data as red
# circles. The black dots show the corresponding estimates,
# $\boldsymbol{\hat{\beta}}$, for each sample. The black lines show the true value
# of $\boldsymbol{\beta}$ versus the average of the estimated
# $\boldsymbol{\beta}$-values, $\widehat{\boldsymbol{\beta}}_m$. The matrix
# $\mathbf{K}$ maps the red circles to the corresponding black dots. Note there are
# many possible ways to map the red circles to the plane, but $\mathbf{K}$ is
# the one that minimizes the MSE for $\boldsymbol{\beta}$.
#
# **Programming Tip.**
#
# Although the full source code is available in the corresponding
# IPython Notebook, the following snippets provide a quick walkthrough.
# To simulate the target data, we define the relevant
# matrices below,
# In[3]:
Q = np.eye(3)*0.1 # error covariance matrix
# this is what we are trying to estimate
beta = matrix(ones((2,1)))
W = matrix([[1,2],
            [2,3],
            [1,1]])
# Then, we generate the noise terms and create
# the simulated data, $y$,
# In[9]:
ntrials = 50
epsilon = np.random.multivariate_normal((0,0,0),Q,ntrials).T
y=W*beta+epsilon
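# As a quick check (not in the original listing), we can verify that the
# estimator derived above satisfies the constraint $\mathbf{K}\mathbf{W}=\mathbf{I}$
# and that the average estimate is close to the true $\boldsymbol{\beta}$,

# In[ ]:


K_check = np.linalg.inv(W.T*np.linalg.inv(Q)*W)*W.T*np.linalg.inv(Q) # estimator from the derivation
print(np.allclose(K_check*W, np.eye(2)))  # unbiasedness constraint K W = I
b_hat = K_check*y                         # one estimate (column) per trial
print(b_hat.mean(axis=1))                 # should be close to beta = [1, 1]^T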
#
#
#
#
# Focusing on the xy-plane in [Figure](#fig:Gauss_Markov_001), the dashed line shows the true value for $\boldsymbol{\beta}$ versus the mean of the estimated values $\widehat{\boldsymbol{\beta}}_m$.
#
#
#
#
#
# [Figure](#fig:Gauss_Markov_002) shows more detail in the horizontal *xy*-plane
# of [Figure](#fig:Gauss_Markov_001). The dots are the individual estimates,
# $\boldsymbol{\hat{\beta}}$, from the corresponding simulated $\mathbf{y}$ data.
# The dashed line is the true value for $\boldsymbol{\beta}$ and the solid line
# ($\widehat{\boldsymbol{\beta}}_m$) is the average of all the dots. The gray
# ellipse provides an error ellipse for the covariance of the estimated
# $\boldsymbol{\beta}$ values.
#
# **Programming Tip.**
#
# Although the full source code is available in the corresponding
# IPython Notebook, the following snippets provide a quick walkthrough of
# the construction of [Figure](#fig:Gauss_Markov_002). To draw
# the ellipse, we need to import the patch primitive,
# In[5]:
# %matplotlib inline
# from matplotlib.patches import Ellipse
# We compute the parameters of the error ellipse from the
# covariance matrix of the individual estimates of $\boldsymbol{\beta}$,
# held in the `bm_cov` variable below,
# In[6]:
# U,S,V = linalg.svd(bm_cov)
# err = np.sqrt((matrix(bm))*(bm_cov)*(matrix(bm).T))
# theta = np.arccos(U[0,1])/np.pi*180
# Then, we add the scaled ellipse
# in the following,
# In[7]:
# ax.add_patch(Ellipse(bm,err*2/np.sqrt(S[0]),
# err*2/np.sqrt(S[1]),
# angle=theta,color='gray'))
# In[8]:
from __future__ import division
from mpl_toolkits.mplot3d import proj3d
from numpy.linalg import inv
import matplotlib.pyplot as plt
import numpy as np
from numpy import matrix, linalg, ones, array
Q = np.eye(3)*.1 # error covariance matrix
beta = matrix(ones((2,1))) # this is what we are trying to estimate
W = matrix([[1,2],
            [2,3],
            [1,1]])
ntrials = 50
epsilon = np.random.multivariate_normal((0,0,0),Q,ntrials).T
y=W*beta+epsilon
K=inv(W.T*inv(Q)*W)*matrix(W.T)*inv(Q) # Gauss-Markov estimator matrix
b=K*y #estimated beta from data
fig = plt.figure()
fig.set_size_inches([6,6])
# some convenience definitions for plotting
bb = array(b)
bm = bb.mean(1)
yy = array(y)
ax = fig.add_subplot(111, projection='3d')
ax.plot3D(yy[0,:],yy[1,:],yy[2,:],'ro',label='y',alpha=0.3)
ax.plot3D([beta[0,0],0],[beta[1,0],0],[0,0],'k--',label=r'$\beta$')
ax.plot3D([bm[0],0],[bm[1],0],[0,0],'k-',lw=1,label=r'$\widehat{\beta}_m$')
ax.plot3D(bb[0,:],bb[1,:],0*bb[1,:],'.k',alpha=0.5,lw=3,label=r'$\widehat{\beta}$')
ax.legend(loc=0,fontsize=18)
ax.set_xlabel('$x_1$',fontsize=20)
ax.set_ylabel('$x_2$',fontsize=20)
ax.set_zlabel('$x_3$',fontsize=20)
fig.tight_layout()
from matplotlib.patches import Ellipse
fig, ax = plt.subplots()
fig.set_size_inches((6,6))
ax.set_aspect(1)
ax.plot(bb[0,:],bb[1,:],'ko',alpha=.3)
ax.plot([beta[0,0],0],[beta[1,0],0],'k--',label=r'$\beta$',lw=3)
ax.plot([bm[0],0],[bm[1],0],'k-',lw=3,label=r'$\widehat{\beta}_m$')
ax.legend(loc=0,fontsize=18)
bm_cov = inv(W.T*inv(Q)*W) # error covariance of the estimates, (W^T Q^-1 W)^-1
U,S,V = linalg.svd(bm_cov)
err = np.sqrt((matrix(bm))*(bm_cov)*(matrix(bm).T))
theta = np.arccos(U[0,1])/np.pi*180
ax.add_patch(Ellipse(bm,err*2/np.sqrt(S[0]),err*2/np.sqrt(S[1]),
                     angle=theta,color='gray',alpha=0.5))
ax.set_xlabel('$x_1$',fontsize=20)
ax.set_ylabel('$x_2$',fontsize=20)
fig.tight_layout()
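# As a closing check (not in the original listing), the sample covariance of the
# individual estimates should approach the theoretical error covariance
# $(\mathbf{W}^T \mathbf{Q}^{-1} \mathbf{W})^{-1}$ derived above as `ntrials` grows,

# In[ ]:


emp_cov = np.cov(bb)          # sample covariance of the 2 x ntrials estimates
theo_cov = inv(W.T*inv(Q)*W)  # theoretical error covariance from the text
print(emp_cov)
print(theo_cov)               # these should roughly agree for large ntrials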
#
#
#
#