[regression] finished chapter

This commit is contained in:
Jan Benda 2020-12-20 01:05:02 +01:00
parent 46affef86d
commit dea6319e75
10 changed files with 61 additions and 247 deletions

View File

@ -1,8 +1,8 @@
function mse = meanSquaredErrorCubic(x, y, c)
% Mean squared error between data pairs and a cubic relation.
%
% Arguments: x, vector of the input values
% y, vector of the corresponding measured output values
% Arguments: x, vector of the x-data values
% y, vector of the corresponding y-data values
% c, the factor for the cubic relation.
%
% Returns: mse, the mean-squared-error.
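This hunk only rewords the help text; the function body is not part of the diff. As a rough, hypothetical Python sketch (not the repository's MATLAB implementation), the computation documented here is simply:

import numpy as np

def mean_squared_error_cubic(x, y, c):
    # mean squared error between the measured y and the cubic prediction c*x**3
    return np.mean((y - c * x**3)**2)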

View File

@ -75,7 +75,7 @@ def plot_mse_min(ax, x, y, c):
if __name__ == "__main__":
    x, y, c = create_data()
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=cm_size(figure_width, 1.1*figure_height))
    fig.subplots_adjust(**adjust_fs(left=8.0, right=1))
    fig.subplots_adjust(**adjust_fs(left=8.0, right=1.2))
    plot_mse(ax1, x, y, c)
    plot_mse_min(ax2, x, y, c)
    fig.savefig("cubiccost.pdf")

View File

@ -58,8 +58,8 @@ def plot_error_hist(ax, x, y, c):
if __name__ == "__main__":
    x, y, c = create_data()
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.subplots_adjust(wspace=0.4, **adjust_fs(left=6.0, right=1.2))
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=cm_size(figure_width, 0.9*figure_height))
    fig.subplots_adjust(wspace=0.5, **adjust_fs(left=6.0, right=1.2))
    plot_data_errors(ax1, x, y, c)
    plot_error_hist(ax2, x, y, c)
    fig.savefig("cubicerrors.pdf")

View File

@ -42,8 +42,8 @@ def plot_data_fac(ax, x, y, c):
if __name__ == "__main__":
    x, y, c = create_data()
    print('n=%d' % len(x))
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.subplots_adjust(wspace=0.5, **adjust_fs(fig, left=6.0, right=1.5))
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=cm_size(figure_width, 0.9*figure_height))
    fig.subplots_adjust(wspace=0.5, **adjust_fs(fig, left=6.0, right=1.2))
    plot_data(ax1, x, y)
    plot_data_fac(ax2, x, y, c)
    fig.savefig("cubicfunc.pdf")

View File

@ -73,7 +73,7 @@ if __name__ == "__main__":
    x, y, c = create_data()
    cs, mses = gradient_descent(x, y)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=cm_size(figure_width, 1.1*figure_height))
    fig.subplots_adjust(wspace=0.2, **adjust_fs(left=8.0, right=0.5))
    fig.subplots_adjust(**adjust_fs(left=8.0, right=1.2))
    plot_mse(ax1, x, y, c, cs)
    plot_descent(ax2, cs, mses)
    fig.savefig("cubicmse.pdf")

View File

@ -1,56 +0,0 @@
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from plotstyle import *


def create_data():
    m = 0.75
    n = -40
    x = np.arange(10.0, 110.0, 2.5)
    y = m * x + n
    rng = np.random.RandomState(37281)
    noise = rng.randn(len(x)) * 15
    y += noise
    return x, y, m, n


def plot_error_plane(ax, x, y, m, n):
    ax.set_xlabel('Slope m')
    ax.set_ylabel('Intercept b')
    ax.set_zlabel('Mean squared error')
    ax.set_xlim(-4.5, 5.0)
    ax.set_ylim(-60.0, -20.0)
    ax.set_zlim(0.0, 700.0)
    ax.set_xticks(np.arange(-4, 5, 2))
    ax.set_yticks(np.arange(-60, -19, 10))
    ax.set_zticks(np.arange(0, 700, 200))
    ax.grid(True)
    ax.w_xaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
    ax.w_yaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
    ax.w_zaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
    ax.invert_xaxis()
    ax.view_init(25, 40)
    slopes = np.linspace(-4.5, 5, 40)
    intercepts = np.linspace(-60, -20, 40)
    # use separate names for the parameter grid so the data arrays x and y
    # are not shadowed when computing the error surface:
    slope_grid, intercept_grid = np.meshgrid(slopes, intercepts)
    error_surf = np.zeros(slope_grid.shape)
    for i, s in enumerate(slopes):
        for j, b in enumerate(intercepts):
            error_surf[j, i] = np.mean((y - s*x - b)**2.0)
    ax.plot_surface(slope_grid, intercept_grid, error_surf, rstride=1, cstride=1,
                    cmap=cm.coolwarm, linewidth=0, shade=True)
    # Minimum:
    mini = np.unravel_index(np.argmin(error_surf), error_surf.shape)
    ax.scatter(slopes[mini[1]], intercepts[mini[0]], [0.0], color='#cc0000', s=60)


if __name__ == "__main__":
    x, y, m, n = create_data()
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1, projection='3d')
    plot_error_plane(ax, x, y, m, n)
    fig.set_size_inches(7.0, 5.0)
    fig.subplots_adjust(**adjust_fs(fig, 1.0, 0.0, 0.0, 0.0))
    fig.savefig("error_surface.pdf")
    plt.close()

View File

@ -1,60 +0,0 @@
import numpy as np
import matplotlib.pyplot as plt
from plotstyle import *


def create_data():
    m = 0.75
    n = -40
    x = np.arange(10.0, 110.0, 2.5)
    y = m * x + n
    rng = np.random.RandomState(37281)
    noise = rng.randn(len(x)) * 15
    y += noise
    return x, y, m, n


def plot_data(ax, x, y):
    ax.plot(x, y, **psA)
    ax.set_xlabel('Input x')
    ax.set_ylabel('Output y')
    ax.set_xlim(0, 120)
    ax.set_ylim(-80, 80)
    ax.set_xticks(np.arange(0, 121, 40))
    ax.set_yticks(np.arange(-80, 81, 40))


def plot_data_slopes(ax, x, y, m, n):
    ax.plot(x, y, **psA)
    xx = np.asarray([2, 118])
    for i in np.linspace(0.3*m, 2.0*m, 5):
        ax.plot(xx, i*xx + n, **lsBm)
    ax.set_xlabel('Input x')
    #ax.set_ylabel('Output y')
    ax.set_xlim(0, 120)
    ax.set_ylim(-80, 80)
    ax.set_xticks(np.arange(0, 121, 40))
    ax.set_yticks(np.arange(-80, 81, 40))


def plot_data_intercepts(ax, x, y, m, n):
    ax.plot(x, y, **psA)
    xx = np.asarray([2, 118])
    for i in np.linspace(n - 1*n, n + 1*n, 5):
        ax.plot(xx, m*xx + i, **lsBm)
    ax.set_xlabel('Input x')
    #ax.set_ylabel('Output y')
    ax.set_xlim(0, 120)
    ax.set_ylim(-80, 80)
    ax.set_xticks(np.arange(0, 121, 40))
    ax.set_yticks(np.arange(-80, 81, 40))


if __name__ == "__main__":
    x, y, m, n = create_data()
    fig, axs = plt.subplots(1, 3)
    fig.subplots_adjust(wspace=0.5, **adjust_fs(fig, left=6.0, right=1.5))
    plot_data(axs[0], x, y)
    plot_data_slopes(axs[1], x, y, m, n)
    plot_data_intercepts(axs[2], x, y, m, n)
    fig.savefig("lin_regress.pdf")
    plt.close()

View File

@ -1,65 +0,0 @@
import numpy as np
import matplotlib.pyplot as plt
from plotstyle import *


def create_data():
    m = 0.75
    n = -40
    x = np.concatenate((np.arange(10.0, 110.0, 2.5), np.arange(0.0, 120.0, 2.0)))
    y = m * x + n
    rng = np.random.RandomState(37281)
    noise = rng.randn(len(x)) * 15
    y += noise
    return x, y, m, n


def plot_data(ax, x, y, m, n):
    ax.set_xlabel('Input x')
    ax.set_ylabel('Output y')
    ax.set_xlim(0, 120)
    ax.set_ylim(-80, 80)
    ax.set_xticks(np.arange(0, 121, 40))
    ax.set_yticks(np.arange(-80, 81, 40))
    ax.annotate('Error',
                xy=(x[34]+1, y[34]+15), xycoords='data',
                xytext=(80, -50), textcoords='data', ha='left',
                arrowprops=dict(arrowstyle="->", relpos=(0.9, 1.0),
                                connectionstyle="angle3,angleA=50,angleB=-30"))
    ax.plot(x[:40], y[:40], zorder=0, **psAm)
    inxs = [3, 13, 16, 19, 25, 34, 36]
    ax.plot(x[inxs], y[inxs], zorder=10, **psA)
    xx = np.asarray([2, 118])
    ax.plot(xx, m*xx + n, **lsBm)
    for i in inxs:
        xx = [x[i], x[i]]
        yy = [m*x[i] + n, y[i]]
        ax.plot(xx, yy, zorder=5, **lsDm)


def plot_error_hist(ax, x, y, m, n):
    ax.set_xlabel('Squared error')
    ax.set_ylabel('Frequency')
    bins = np.arange(0.0, 602.0, 50.0)
    ax.set_xlim(bins[0], bins[-1])
    ax.set_ylim(0, 35)
    ax.set_xticks(np.arange(bins[0], bins[-1], 100))
    ax.set_yticks(np.arange(0, 36, 10))
    errors = (y - (m*x + n))**2.0
    mls = np.mean(errors)
    ax.annotate('Mean\nsquared\nerror',
                xy=(mls, 0.5), xycoords='data',
                xytext=(350, 20), textcoords='data', ha='left',
                arrowprops=dict(arrowstyle="->", relpos=(0.0, 0.2),
                                connectionstyle="angle3,angleA=10,angleB=90"))
    ax.hist(errors, bins, **fsD)


if __name__ == "__main__":
    x, y, m, n = create_data()
    fig, axs = plt.subplots(1, 2)
    fig.subplots_adjust(**adjust_fs(fig, left=6.0))
    plot_data(axs[0], x, y, m, n)
    plot_error_hist(axs[1], x, y, m, n)
    fig.savefig("linear_least_squares.pdf")
    plt.close()

View File

@ -23,47 +23,12 @@
\item Fig 8.2 right: this should be a chi-squared distribution with one degree of freedom!
\end{itemize}
\begin{figure}[t]
\includegraphics[width=0.75\textwidth]{error_surface}
\titlecaption{Error surface.}{The two model parameters $m$ and $b$
define the base area of the surface plot. For each parameter
combination of slope and intercept the error is calculated. The
resulting surface has a minimum which indicates the parameter
combination that best fits the data.}\label{errorsurfacefig}
\end{figure}
\begin{figure}[t]
\includegraphics[width=0.75\textwidth]{error_gradient}
\titlecaption{Gradient of the error surface.} {Each arrow points
in the direction of the steepest ascent at different positions
of the error surface shown in \figref{errorsurfacefig}. The
contour lines in the background illustrate the error surface. Warm
colors indicate high errors, colder colors low error values. Each
contour line connects points of equal
error.}\label{gradientquiverfig}
\end{figure}
\begin{figure}[t]
\includegraphics[width=0.45\textwidth]{gradient_descent}
\titlecaption{Gradient descent.}{The algorithm starts at an
arbitrary position. At each point the gradient is estimated and
the position is updated as long as the length of the gradient is
sufficiently large. The dots show the positions after each
iteration of the algorithm.} \label{gradientdescentfig}
\end{figure}
\subsection{Linear fits}
\begin{itemize}
\item Polyfit is easy: it has a unique solution! $c x^2$ is also a linear fit (linear in the coefficient $c$).
\item Example for overfitting with polyfit of a high order (=number of data points)
\end{itemize}
\section{Fitting in practice}
Fit with the MATLAB functions \code{lsqcurvefit} and \code{polyfit}
\subsection{Non-linear fits}
\begin{itemize}

View File

@ -227,6 +227,7 @@ to arbitrary precision.
\end{ibox}
\section{Gradient}
Imagine placing a ball at some point on the cost function
(\figref{cubiccostfig}). Naturally, it would roll down the slope and
eventually stop at the minimum of the error surface (if it had no
@ -235,10 +236,21 @@ way to the minimum of the cost function. The ball always follows the
steepest slope. Thus we need to figure out the direction of the slope
at the position of the ball.
\begin{figure}[t]
\includegraphics{cubicgradient}
\titlecaption{Derivative of the cost function.}{The gradient, the
derivative \eqref{costderivative} of the cost function, is
negative to the left of the minimum of the cost function, zero at
the minimum, and positive to the right of it (left). For each value of
the parameter $c$ the negative gradient (arrows) points towards
the minimum of the cost function
(right).} \label{gradientcubicfig}
\end{figure}
In our one-dimensional example of a single free parameter the slope is
simply the derivative of the cost function with respect to the
parameter $c$. This derivative is called the
\entermde{Gradient}{gradient} of the cost function:
parameter $c$ (\figref{gradientcubicfig}, left). This derivative is called
the \entermde{Gradient}{gradient} of the cost function:
\begin{equation}
\label{costderivative}
\nabla f_{cost}(c) = \frac{{\rm d} f_{cost}(c)}{{\rm d} c}
@ -252,7 +264,8 @@ can be approximated numerically by the difference quotient
\approx \frac{f_{cost}(c + \Delta c) - f_{cost}(c)}{\Delta c}
\end{equation}
The derivative is positive for positive slopes. Since we want to go down
the hill, we choose the opposite direction.
the hill, we choose the opposite direction (\figref{gradientcubicfig},
right).
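To make this concrete, here is a minimal Python sketch (illustrative names, not the chapter's MATLAB files) that approximates the gradient of the cubic cost function with a difference quotient and steps against it:

import numpy as np

def mean_squared_error_cubic(x, y, c):
    # cost function: mean squared error for the cubic relation y = c*x**3
    return np.mean((y - c * x**3)**2)

def mean_squared_gradient_cubic(x, y, c, dc=1e-7):
    # difference quotient approximating d f_cost / d c
    return (mean_squared_error_cubic(x, y, c + dc) -
            mean_squared_error_cubic(x, y, c)) / dc

# one gradient-descent step goes against the gradient (epsilon is a small step size):
# c_new = c - epsilon * mean_squared_gradient_cubic(x, y, c)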
\begin{exercise}{meanSquaredGradientCubic.m}{}\label{gradientcubic}
Implement a function \varcode{meanSquaredGradientCubic()}, that
@ -275,7 +288,7 @@ the hill, we choose the opposite direction.
\titlecaption{Gradient descent.}{The algorithm starts at an
arbitrary position. At each point the gradient is estimated and
the position is updated as long as the length of the gradient is
sufficiently large.The dots show the positions after each
sufficiently large. The dots show the positions after each
iteration of the algorithm.} \label{gradientdescentcubicfig}
\end{figure}
@ -378,18 +391,18 @@ functions that have more than a single parameter, in general $n$
parameters. We then need to find the minimum in an $n$-dimensional
parameter space.
For our tiger problem, we could have also fitted the exponent $\alpha$
of the power-law relation between size and weight, instead of assuming
a cubic relation with $\alpha=3$:
For our tiger problem, we could have also fitted the exponent $a$ of
the power-law relation between size and weight, instead of assuming a
cubic relation with $a=3$:
\begin{equation}
\label{powerfunc}
y = f(x; c, \alpha) = f(x; \vec p) = c\cdot x^\alpha
y = f(x; c, a) = f(x; \vec p) = c\cdot x^a
\end{equation}
We could then check whether the resulting estimate of the exponent
$\alpha$ indeed is close to the expected power of three. The
power-law \eqref{powerfunc} has two free parameters $c$ and $\alpha$.
$a$ indeed is close to the expected power of three. The
power-law \eqref{powerfunc} has two free parameters $c$ and $a$.
Instead of a single parameter we are now dealing with a vector $\vec
p$ containing $n$ parameter values. Here, $\vec p = (c, \alpha)$. All
p$ containing $n$ parameter values. Here, $\vec p = (c, a)$. All
the concepts we introduced on the example of the one dimensional
problem of tiger weights generalize to $n$-dimensional problems. We
only need to adapt a few things. The cost function for the mean
@ -399,12 +412,27 @@ squared error reads
f_{cost}(\vec p|\{(x_i, y_i)\}) = \frac{1}{N} \sum_{i=1}^N (y_i - f(x_i;\vec p))^2
\end{equation}
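Expressed as code, this cost function for a parameter vector could look like the following minimal Python sketch (the chapter's exercises use MATLAB; the names here are illustrative):

import numpy as np

def cost_power_law(p, x, y):
    # mean squared error of the power law y = c * x**a, with p = (c, a)
    c, a = p
    return np.mean((y - c * x**a)**2)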
\begin{figure}[t]
\includegraphics{powergradientdescent}
\titlecaption{Gradient descent on an error surface.}{Contour plot of
the cost function \eqref{ndimcostfunc} for the fit of a power law
\eqref{powerfunc} to some data. Here the cost function forms a long
and narrow valley over the plane spanned by the two parameters $c$
and $a$ of the power law. The red line marks the path of the
gradient descent algorithm. The gradient is always perpendicular
to the contour lines. The algorithm quickly descends into the
valley and then slowly creeps on the shallow bottom of the valley
towards the global minimum where it terminates (yellow circle).
} \label{powergradientdescentfig}
\end{figure}
For two-dimensional problems the graph of the cost function is an
\enterm{error surface} (\determ{{Fehlerfl\"ache}}). The two parameters
span a two-dimensional plane. The cost function assigns to each
parameter combination on this plane a single value. This results in a
landscape over the parameter plane with mountains and valleys and we
are searching for the position of the bottom of the deepest valley.
are searching for the position of the bottom of the deepest valley
(\figref{powergradientdescentfig}).
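A rough Python sketch of how such an error surface can be evaluated on a grid of parameter combinations (the simulated data and parameter ranges are assumptions, not the figure's actual script):

import numpy as np
import matplotlib.pyplot as plt

# simulated data in the spirit of the tiger example (assumed values)
rng = np.random.RandomState(1)
x = np.linspace(1.0, 3.0, 40)
y = 6.0 * x**3 + rng.randn(len(x)) * 10.0

def cost(c, a):
    # mean squared error of the power law y = c * x**a on the data
    return np.mean((y - c * x**a)**2)

# evaluate the cost for every combination of the two parameters
cs = np.linspace(2.0, 10.0, 60)
alphas = np.linspace(2.0, 4.0, 60)
error_surface = np.array([[cost(c, a) for c in cs] for a in alphas])

# contour lines connect parameter combinations of equal error
plt.contourf(cs, alphas, error_surface, 40)
plt.xlabel('factor c')
plt.ylabel('exponent a')
plt.show()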
\begin{ibox}[tp]{\label{partialderivativebox}Partial derivatives and gradient}
Some functions depend on more than a single variable. For example, the function
@ -446,13 +474,13 @@ space of our example, the \entermde{Gradient}{gradient}
(box~\ref{partialderivativebox}) of the cost function is a vector
\begin{equation}
\label{gradientpowerlaw}
\nabla f_{cost}(c, \alpha) = \left( \frac{\partial f_{cost}(c, \alpha)}{\partial c},
\frac{\partial f_{cost}(c, \alpha)}{\partial \alpha} \right)
\nabla f_{cost}(c, a) = \left( \frac{\partial f_{cost}(c, a)}{\partial c},
\frac{\partial f_{cost}(c, a)}{\partial a} \right)
\end{equation}
that points in the direction of the steepest ascent of the cost
function. The gradient is given by the partial derivatives
(box~\ref{partialderivativebox}) of the mean squared error with
respect to the parameters $c$ and $\alpha$ of the power law
respect to the parameters $c$ and $a$ of the power law
relation. In general, for $n$-dimensional problems, the gradient is an
$n$-dimensional vector containing for each of the $n$ parameters
$p_j$ the respective partial derivatives as coordinates:
@ -468,6 +496,9 @@ parameter value $p_i$ becomes a vector $\vec p_i$ of parameter values:
\label{ndimgradientdescent}
\vec p_{i+1} = \vec p_i - \epsilon \cdot \nabla f_{cost}(\vec p_i)
\end{equation}
The algorithm proceeds along the negative gradient
(\figref{powergradientdescentfig}).
For the termination condition we need the length of the gradient. In
one dimension it was just the absolute value of the derivative. For
$n$ dimensions this is according to the \enterm{Pythagorean theorem}
@ -479,8 +510,7 @@ the sum of the squared partial derivatives:
\end{equation}
The \code{norm()} function implements this.
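Combining the update rule \eqref{ndimgradientdescent} with the norm-based termination condition, a compact Python sketch of the $n$-dimensional descent could read as follows (the step size, threshold, and numerical gradient are illustrative choices, not the chapter's \varcode{gradientDescent()} implementation):

import numpy as np

def cost(p, x, y):
    # mean squared error of the power law y = c * x**a, with p = (c, a)
    return np.mean((y - p[0] * x**p[1])**2)

def numerical_gradient(p, x, y, dp=1e-6):
    # approximate each partial derivative by a difference quotient
    grad = np.zeros(len(p))
    for j in range(len(p)):
        pj = p.copy()
        pj[j] += dp
        grad[j] = (cost(pj, x, y) - cost(p, x, y)) / dp
    return grad

def gradient_descent(p0, x, y, eps=1e-5, threshold=0.1, max_iter=100000):
    p = np.asarray(p0, dtype=float)
    for _ in range(max_iter):
        g = numerical_gradient(p, x, y)
        if np.linalg.norm(g) < threshold:   # length of the gradient as termination condition
            break
        p -= eps * g                        # step against the gradient
    return p

For a real fit one would pass the model and cost functions as arguments instead of hard-wiring the power law, which is exactly what the next section is about.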
\section{Passing a function as an argument to another function}
\subsection{Passing a function as an argument to another function}
So far, all our code for the gradient descent algorithm was tailored
to a specific function, the cubic relation \eqref{cubicfunc}. It would
@ -500,7 +530,7 @@ argument to our function:
\pageinputlisting[caption={Passing a function handle as an argument to a function.}]{funcplotterexamples.m}
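The listing demonstrates this with MATLAB function handles; for comparison, a hypothetical Python sketch of the same idea simply passes the function object itself:

import numpy as np
import matplotlib.pyplot as plt

def cubic(x, c):
    # a specific model: y = c * x**3
    return c * x**3

def plot_function(f, x, *params):
    # f is passed as an argument; any model with the signature f(x, *params) works
    plt.plot(x, f(x, *params))
    plt.show()

x = np.linspace(0.0, 3.0, 100)
plot_function(cubic, x, 6.0)   # pass the function itself, not the result of calling it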
\section{Gradient descent algorithm for arbitrary functions}
\subsection{Gradient descent algorithm for arbitrary functions}
Now we are ready to adapt the gradient descent algorithm from
exercise~\ref{gradientdescentexercise} to arbitrary functions with $n$
@ -524,21 +554,21 @@ For testing our new function we need to implement the power law
Write a function that implements \eqref{powerfunc}. The function
gets as arguments a vector $x$ containing the $x$-data values and
another vector containing as elements the parameters for the power
law, i.e. the factor $c$ and the power $\alpha$. It returns a vector
law, i.e. the factor $c$ and the power $a$. It returns a vector
with the computed $y$ values for each $x$ value.
\end{exercise}
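For orientation, such a power-law function boils down to very little code; here is a rough Python sketch (the exercise itself asks for a MATLAB \varcode{powerLaw()} function):

import numpy as np

def power_law(x, p):
    # p[0] is the factor c, p[1] is the exponent a of y = c * x**a
    return p[0] * np.asarray(x)**p[1]

# example: power_law(np.array([1.0, 2.0, 3.0]), [6.0, 3.0])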
Now let's use the new gradient descent function to fit a power law to
our tiger data-set:
our tiger data-set (\figref{powergradientdescentfig}):
\begin{exercise}{plotgradientdescentpower.m}{}
Use the function \varcode{gradientDescent()} to fit the
\varcode{powerLaw()} function to the simulated data from
exercise~\ref{mseexercise}. Plot the returned values of the two
parameters against each other. Compare the result of the gradient
descent method with the true values of $c$ and $\alpha$ used to
simulate the data. Observe the norm of the gradient and inspect the
plots to adapt $\epsilon$ (smaller than in
descent method with the true values of $c$ and $a$ used to simulate
the data. Observe the norm of the gradient and inspect the plots to
adapt $\epsilon$ (smaller than in
exercise~\ref{plotgradientdescentexercise}) and the threshold (much
larger) appropriately. Finally plot the data together with the best
fitting power-law \eqref{powerfunc}.