[regression] started to simplify chapter to a 1d problem
parent a097b10024, commit be79d450df
@ -1 +0,0 @@
mse = mean((y - y_est).^2);

regression/code/meansquarederrorline.m (new file, 15 lines)
@ -0,0 +1,15 @@
n = 40;
xmin = 2.2;
xmax = 3.9;
c = 6.0;
noise = 50.0;

% generate data:
x = rand(n, 1) * (xmax-xmin) + xmin;
yest = c * x.^3;
y = yest + noise*randn(n, 1);

% compute mean squared error:
mse = mean((y - yest).^2);

fprintf('the mean squared error is %g kg^2\n', mse)
@ -15,28 +15,13 @@ def create_data():
    return x, y, c


def plot_data(ax, x, y, c):
    ax.plot(x, y, zorder=10, **psAm)
    xx = np.linspace(2.1, 3.9, 100)
    ax.plot(xx, c*xx**3.0, zorder=5, **lsBm)
    for cc in [0.25*c, 0.5*c, 2.0*c, 4.0*c]:
        ax.plot(xx, cc*xx**3.0, zorder=5, **lsDm)
    ax.set_xlabel('Size x', 'm')
    ax.set_ylabel('Weight y', 'kg')
    ax.set_xlim(2, 4)
    ax.set_ylim(0, 400)
    ax.set_xticks(np.arange(2.0, 4.1, 0.5))
    ax.set_yticks(np.arange(0, 401, 100))


def plot_data_errors(ax, x, y, c):
    ax.set_xlabel('Size x', 'm')
    #ax.set_ylabel('Weight y', 'kg')
    ax.set_ylabel('Weight y', 'kg')
    ax.set_xlim(2, 4)
    ax.set_ylim(0, 400)
    ax.set_xticks(np.arange(2.0, 4.1, 0.5))
    ax.set_yticks(np.arange(0, 401, 100))
    ax.set_yticklabels([])
    ax.annotate('Error',
                xy=(x[28]+0.05, y[28]+60), xycoords='data',
                xytext=(3.4, 70), textcoords='data', ha='left',
@ -52,31 +37,30 @@ def plot_data_errors(ax, x, y, c):
        yy = [c*x[i]**3.0, y[i]]
        ax.plot(xx, yy, zorder=5, **lsDm)


def plot_error_hist(ax, x, y, c):
    ax.set_xlabel('Squared error')
    ax.set_ylabel('Frequency')
    bins = np.arange(0.0, 1250.0, 100)
    bins = np.arange(0.0, 11000.0, 750)
    ax.set_xlim(bins[0], bins[-1])
    #ax.set_ylim(0, 35)
    ax.set_xticks(np.arange(bins[0], bins[-1], 200))
    #ax.set_yticks(np.arange(0, 36, 10))
    ax.set_ylim(0, 15)
    ax.set_xticks(np.arange(bins[0], bins[-1], 5000))
    ax.set_yticks(np.arange(0, 16, 5))
    errors = (y-(c*x**3.0))**2.0
    mls = np.mean(errors)
    ax.annotate('Mean\nsquared\nerror',
                xy=(mls, 0.5), xycoords='data',
                xytext=(800, 3), textcoords='data', ha='left',
                xytext=(4500, 6), textcoords='data', ha='left',
                arrowprops=dict(arrowstyle="->", relpos=(0.0,0.2),
                connectionstyle="angle3,angleA=10,angleB=90") )
    ax.hist(errors, bins, **fsC)


if __name__ == "__main__":
    x, y, c = create_data()
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.subplots_adjust(wspace=0.2, **adjust_fs(left=6.0, right=1.2))
    plot_data(ax1, x, y, c)
    plot_data_errors(ax2, x, y, c)
    #plot_error_hist(ax2, x, y, c)
    fig.subplots_adjust(wspace=0.4, **adjust_fs(left=6.0, right=1.2))
    plot_data_errors(ax1, x, y, c)
    plot_error_hist(ax2, x, y, c)
    fig.savefig("cubicerrors.pdf")
    plt.close()

@ -2,7 +2,7 @@ import matplotlib.pyplot as plt
import numpy as np
from plotstyle import *

if __name__ == "__main__":
def create_data():
    # wikipedia:
    # Generally, males vary in total length from 250 to 390 cm and
    # weigh between 90 and 306 kg
@ -12,10 +12,20 @@ if __name__ == "__main__":
    rng = np.random.RandomState(32281)
    noise = rng.randn(len(x))*50
    y += noise
    return x, y, c


    fig, ax = plt.subplots(figsize=cm_size(figure_width, 1.4*figure_height))
    fig.subplots_adjust(**adjust_fs(left=6.0, right=1.2))

def plot_data(ax, x, y):
    ax.plot(x, y, **psA)
    ax.set_xlabel('Size x', 'm')
    ax.set_ylabel('Weight y', 'kg')
    ax.set_xlim(2, 4)
    ax.set_ylim(0, 400)
    ax.set_xticks(np.arange(2.0, 4.1, 0.5))
    ax.set_yticks(np.arange(0, 401, 100))


def plot_data_fac(ax, x, y, c):
    ax.plot(x, y, zorder=10, **psA)
    xx = np.linspace(2.1, 3.9, 100)
    ax.plot(xx, c*xx**3.0, zorder=5, **lsB)
@ -27,6 +37,14 @@ if __name__ == "__main__":
    ax.set_ylim(0, 400)
    ax.set_xticks(np.arange(2.0, 4.1, 0.5))
    ax.set_yticks(np.arange(0, 401, 100))


if __name__ == "__main__":
    x, y, c = create_data()
    print(len(x))
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.subplots_adjust(wspace=0.5, **adjust_fs(fig, left=6.0, right=1.5))
    plot_data(ax1, x, y)
    plot_data_fac(ax2, x, y, c)
    fig.savefig("cubicfunc.pdf")
    plt.close()

@ -14,6 +14,7 @@ def create_data():
    y += noise
    return x, y, c


def gradient_descent(x, y):
    n = 20
    dc = 0.01
@ -29,6 +30,7 @@ def gradient_descent(x, y):
        mses.append(m0)
        cc -= eps*dmdc
    return cs, mses


def plot_mse(ax, x, y, c, cs):
    ms = np.zeros(len(cs))
@ -54,7 +56,8 @@ def plot_mse(ax, x, y, c, cs):
    ax.set_ylim(0, 25000)
    ax.set_xticks(np.arange(0.0, 10.1, 2.0))
    ax.set_yticks(np.arange(0, 30001, 10000))


def plot_descent(ax, cs, mses):
    ax.plot(np.arange(len(mses))+1, mses, **lpsBm)
    ax.set_xlabel('Iteration')
@ -69,7 +72,7 @@ def plot_descent(ax, cs, mses):
if __name__ == "__main__":
    x, y, c = create_data()
    cs, mses = gradient_descent(x, y)
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=cm_size(figure_width, 1.1*figure_height))
    fig.subplots_adjust(wspace=0.2, **adjust_fs(left=8.0, right=0.5))
    plot_mse(ax1, x, y, c, cs)
    plot_descent(ax2, cs, mses)
@ -40,6 +40,113 @@
\item Homework is to do the 2d problem with the straight line!
\end{itemize}

\subsection{2D fit}

\begin{exercise}{meanSquaredError.m}{}
  Implement the objective function \eqref{mseline} as a function
  \varcode{meanSquaredError()}. The function takes three
  arguments. The first is a vector of $x$-values and the second
  contains the measurements $y$ for each value of $x$. The third
  argument is a 2-element vector that contains the values of
  parameters \varcode{m} and \varcode{b}. The function returns the
  mean square error.
\end{exercise}
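
One possible sketch of such a function (only one way to implement the
exercise; the variable names are chosen for illustration):

function mse = meanSquaredError(x, y, parameter)
% Mean squared error between the data pairs (x, y) and the straight
% line with slope parameter(1) and intercept parameter(2).
yest = parameter(1) * x + parameter(2);
mse = mean((y - yest).^2);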


\begin{exercise}{errorSurface.m}{}\label{errorsurfaceexercise}
  Generate 20 data pairs $(x_i|y_i)$ that are linearly related with
  slope $m=0.75$ and intercept $b=-40$, using \varcode{rand()} for
  drawing $x$ values between 0 and 120 and \varcode{randn()} for
  jittering the $y$ values with a standard deviation of 15. Then
  calculate the mean squared error between the data and straight lines
  for a range of slopes and intercepts using the
  \varcode{meanSquaredError()} function from the previous exercise.
  Illustrate the error surface using the \code{surface()} function.
  Consult the documentation to find out how to use \code{surface()}.
\end{exercise}
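
A minimal sketch of such a script, assuming the meanSquaredError()
function from above (the ranges of slopes and intercepts are chosen
for illustration only):

% generate the data as described in the exercise:
n = 20;
x = 120.0 * rand(n, 1);
y = 0.75 * x - 40.0 + 15.0 * randn(n, 1);
% mean squared error for a grid of slopes and intercepts:
slopes = 0.4:0.05:1.1;
intercepts = -80.0:5.0:0.0;
errorsurf = zeros(length(slopes), length(intercepts));
for i = 1:length(slopes)
    for k = 1:length(intercepts)
        errorsurf(i, k) = meanSquaredError(x, y, [slopes(i), intercepts(k)]);
    end
end
% plot the error surface:
surface(intercepts, slopes, errorsurf);
xlabel('intercept b');
ylabel('slope m');
zlabel('mean squared error');
view(3);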

\begin{exercise}{meanSquaredGradient.m}{}\label{gradientexercise}%
  Implement a function \varcode{meanSquaredGradient()} that takes the
  $x$- and $y$-data and the set of parameters $(m, b)$ of a straight
  line as a two-element vector as input arguments. The function should
  return the gradient at the position $(m, b)$ as a vector with two
  elements.
\end{exercise}
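
A sketch of how the gradient could be estimated with finite
differences (the step size h is an arbitrary small value chosen for
illustration):

function gradient = meanSquaredGradient(x, y, parameter)
% Numerical gradient of the mean squared error with respect to the
% slope and the intercept at the position parameter = [m, b].
h = 1e-5;   % small step for the finite differences
mse = meanSquaredError(x, y, parameter);
gradient = zeros(1, 2);
gradient(1) = (meanSquaredError(x, y, parameter + [h, 0]) - mse) / h;
gradient(2) = (meanSquaredError(x, y, parameter + [0, h]) - mse) / h;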

\begin{exercise}{errorGradient.m}{}
  Extend the script of exercise~\ref{errorsurfaceexercise} to plot
  both the error surface and gradients using the
  \varcode{meanSquaredGradient()} function from
  exercise~\ref{gradientexercise}. Vectors in space can be easily
  plotted using the function \code{quiver()}. Use \code{contour()}
  instead of \code{surface()} to plot the error surface.
\end{exercise}
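
A sketch of the plotting part, reusing the x, y, slopes, intercepts
and errorsurf variables from the errorSurface.m sketch above (these
names are assumptions of the sketch, not prescribed by the exercise):

gradm = zeros(length(slopes), length(intercepts));
gradb = zeros(length(slopes), length(intercepts));
for i = 1:length(slopes)
    for k = 1:length(intercepts)
        g = meanSquaredGradient(x, y, [slopes(i), intercepts(k)]);
        gradm(i, k) = g(1);
        gradb(i, k) = g(2);
    end
end
% contour lines of the error surface with the gradient vectors on top:
contour(intercepts, slopes, errorsurf);
hold on;
quiver(intercepts, slopes, gradb, gradm);
hold off;
xlabel('intercept b');
ylabel('slope m');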


\begin{exercise}{gradientDescent.m}{}
  Implement the gradient descent for the problem of fitting a straight
  line to some measured data. Reuse the data generated in
  exercise~\ref{errorsurfaceexercise}.
  \begin{enumerate}
  \item Store the error value for each iteration.
  \item Plot the error values as a function of the iterations, the
    number of optimization steps.
  \item Plot the measured data together with the best fitting straight line.
  \end{enumerate}\vspace{-4.5ex}
\end{exercise}
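
A sketch of such a descent, building on the meanSquaredError() and
meanSquaredGradient() functions from the previous exercises (start
position, step size and threshold are example values, not part of the
exercise):

position = [2.0, 10.0];   % arbitrary initial slope and intercept
epsilon = 0.0001;         % step size
threshold = 0.1;          % stop criterion for the length of the gradient
errors = [];
gradient = [2.0*threshold, 0.0];
while norm(gradient) > threshold
    errors(end+1) = meanSquaredError(x, y, position);
    gradient = meanSquaredGradient(x, y, position);
    position = position - epsilon * gradient;
end
% error values as a function of the optimization steps:
subplot(1, 2, 1);
plot(errors, '-o');
xlabel('iteration');
ylabel('mean squared error');
% measured data together with the best fitting straight line:
subplot(1, 2, 2);
plot(x, y, 'o');
hold on;
xx = [0.0, 120.0];
plot(xx, position(1)*xx + position(2));
hold off;
xlabel('x');
ylabel('y');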


\begin{figure}[t]
  \includegraphics[width=1\textwidth]{lin_regress}\hfill
  \titlecaption{Example data suggesting a linear relation.}{A set of
    input signals $x$, e.g. stimulus intensities, were used to probe a
    system. The system's output $y$ to the inputs are noted
    (left). Assuming a linear relation between $x$ and $y$ leaves us
    with 2 parameters, the slope (center) and the intercept with the
    y-axis (right panel).}\label{linregressiondatafig}
\end{figure}

\begin{figure}[t]
  \includegraphics[width=1\textwidth]{linear_least_squares}
  \titlecaption{Estimating the \emph{mean square error}.} {The
    deviation error (orange) between the prediction (red line) and the
    observations (blue dots) is calculated for each data point
    (left). Then the deviations are squared and the average is
    calculated (right).}
  \label{leastsquareerrorfig}
\end{figure}

\begin{figure}[t]
  \includegraphics[width=0.75\textwidth]{error_surface}
  \titlecaption{Error surface.}{The two model parameters $m$ and $b$
    define the base area of the surface plot. For each parameter
    combination of slope and intercept the error is calculated. The
    resulting surface has a minimum which indicates the parameter
    combination that best fits the data.}\label{errorsurfacefig}
\end{figure}


\begin{figure}[t]
  \includegraphics[width=0.75\textwidth]{error_gradient}
  \titlecaption{Gradient of the error surface.} {Each arrow points
    into the direction of the greatest ascent at different positions
    of the error surface shown in \figref{errorsurfacefig}. The
    contour lines in the background illustrate the error surface. Warm
    colors indicate high errors, colder colors low error values. Each
    contour line connects points of equal
    error.}\label{gradientquiverfig}
\end{figure}

\begin{figure}[t]
  \includegraphics[width=0.45\textwidth]{gradient_descent}
  \titlecaption{Gradient descent.}{The algorithm starts at an
    arbitrary position. At each point the gradient is estimated and
    the position is updated as long as the length of the gradient is
    sufficiently large. The dots show the positions after each
    iteration of the algorithm.} \label{gradientdescentfig}
\end{figure}


\subsection{Linear fits}
\begin{itemize}
\item Polyfit is easy: unique solution! $c x^2$ is also a linear fit.
@ -54,8 +161,8 @@ Fit with matlab functions lsqcurvefit, polyfit
\subsection{Non-linear fits}
\begin{itemize}
\item Example that illustrates the problem of local minima (with error surface)
\item You need got initial values for the parameter!
\item Example that fitting gets harder the more parameter yuo have.
\item You need initial values for the parameter!
\item Example that fitting gets harder the more parameter you have.
\item Try to fix as many parameters as possible before doing the fit.
\item How to test the quality of a fit? Residuals. $\chi^2$ test. Run-test.
\end{itemize}
@ -2,99 +2,104 @@
\exercisechapter{Optimization and gradient descent}

Optimization problems arise in many different contexts. For example,
to understand the behavior of a given system, the system is probed
with a range of input signals and then the resulting responses are
measured. This input-output relation can be described by a model. Such
a model can be a simple function that maps the input signals to
corresponding responses, it can be a filter, or a system of
differential equations. In any case, the model has some parameters that
specify how input and output signals are related. Which combination
of parameter values are best suited to describe the input-output
relation? The process of finding the best parameter values is an
optimization problem. For a simple parameterized function that maps
input to output values, this is the special case of a \enterm{curve
fitting} problem, where the average distance between the curve and
the response values is minimized. One basic numerical method used for
such optimization problems is the so called gradient descent, which is
introduced in this chapter.

%%% Another simple verbal example? Maybe from population ecology?
to understand the behavior of a given neuronal system, the system is
probed with a range of input signals and then the resulting responses
are measured. This input-output relation can be described by a
model. Such a model can be a simple function that maps the input
signals to corresponding responses, it can be a filter, or a system of
differential equations. In any case, the model has some parameters
that specify how input and output signals are related. Which
combination of parameter values are best suited to describe the
input-output relation? The process of finding the best parameter
values is an optimization problem. For a simple parameterized function
that maps input to output values, this is the special case of a
\enterm{curve fitting} problem, where the average distance between the
curve and the response values is minimized. One basic numerical method
used for such optimization problems is the so called gradient descent,
which is introduced in this chapter.

\begin{figure}[t]
  \includegraphics[width=1\textwidth]{lin_regress}\hfill
  \titlecaption{Example data suggesting a linear relation.}{A set of
    input signals $x$, e.g. stimulus intensities, were used to probe a
    system. The system's output $y$ to the inputs are noted
    (left). Assuming a linear relation between $x$ and $y$ leaves us
    with 2 parameters, the slope (center) and the intercept with the
    y-axis (right panel).}\label{linregressiondatafig}
  \includegraphics{cubicfunc}
  \titlecaption{Example data suggesting a cubic relation.}{The length
    $x$ and weight $y$ of $n=34$ male tigers (blue, left). Assuming a
    cubic relation between size and weight leaves us with a single
    free parameter, a scaling factor. The cubic relation is shown for
    a few values of this scaling factor (orange and red,
    right).}\label{cubicdatafig}
\end{figure}

The data plotted in \figref{linregressiondatafig} suggest a linear
relation between input and output of the system. We thus assume that a
straight line
For demonstrating the curve-fitting problem let's take the simple
example of weights and sizes measured for a number of male tigers
(\figref{cubicdatafig}). Weight $y$ is proportional to volume
$V$ via the density $\rho$. The volume $V$ of any object is
proportional to its length $x$ cubed. The factor $\alpha$ relating
volume and size cubed depends on the shape of the object and we do not
know this factor for tigers. For the data set we thus expect a cubic
relation between weight and length
\begin{equation}
  \label{straightline}
  y = f(x; m, b) = m\cdot x + b
  \label{cubicfunc}
  y = f(x; c) = c\cdot x^3
\end{equation}
is an appropriate model to describe the system. The line has two free
parameter, the slope $m$ and the $y$-intercept $b$. We need to find
values for the slope and the intercept that best describe the measured
data. In this chapter we use this example to illustrate the gradient
descent and how this methods can be used to find a combination of
slope and intercept that best describes the system.
where $c=\rho\alpha$, the product of a tiger's density and form
factor, is the only free parameter in the relation. We would like to
find out which value of $c$ best describes the measured data. In the
following we use this example to illustrate the gradient descent as a
basic method for finding such an optimal parameter.
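
To get a feel for this relation: with a scaling factor of
$c=6$\,\kilo\gram\per\meter\cubed (the value also used for the
simulated data in the exercise below), a tiger of 3\,m length is
predicted to weigh $6 \cdot 3^3 = 162$\,kg, e.g.

c = 6.0;      % scaling factor in kg/m^3, as in the exercise below
x = 3.0;      % length of a tiger in m
y = c * x^3   % predicted weight: 162 kg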


\section{The error function --- mean squared error}

Before the optimization can be done we need to specify what exactly is
considered an optimal fit. In our example we search the parameter
combination that describe the relation of $x$ and $y$ best. What is
meant by this? Each input $x_i$ leads to an measured output $y_i$ and
for each $x_i$ there is a \emph{prediction} or \emph{estimation}
$y^{est}(x_i)$ of the output value by the model. At each $x_i$
estimation and measurement have a distance or error $y_i -
y^{est}(x_i)$. In our example the estimation is given by the equation
$y^{est}(x_i) = f(x_i;m,b)$. The best fitting model with parameters
$m$ and $b$ is the one that minimizes the distances between
observation $y_i$ and estimation $y^{est}(x_i)$
(\figref{leastsquareerrorfig}).

As a first guess we could simply minimize the sum $\sum_{i=1}^N y_i -
y^{est}(x_i)$. This approach, however, will not work since a minimal sum
can also be achieved if half of the measurements is above and the
other half below the predicted line. Positive and negative errors
would cancel out and then sum up to values close to zero. A better
approach is to sum over the absolute values of the distances:
$\sum_{i=1}^N |y_i - y^{est}(x_i)|$. This sum can only be small if all
deviations are indeed small no matter if they are above or below the
predicted line. Instead of the sum we could also take the average
\begin{equation}
  \label{meanabserror}
  f_{dist}(\{(x_i, y_i)\}|\{y^{est}(x_i)\}) = \frac{1}{N} \sum_{i=1}^N |y_i - y^{est}(x_i)|
\end{equation}
Instead of the averaged absolute errors, the \enterm[mean squared
error]{mean squared error} (\determ[quadratischer
Fehler!mittlerer]{mittlerer quadratischer Fehler})
Before we go ahead finding the optimal parameter value we need to
specify what exactly we consider as an optimal fit. In our example we
search the parameter that describes the relation of $x$ and $y$
best. What is meant by this? The length $x_i$ of each tiger is
associated with a weight $y_i$ and for each $x_i$ we have a
\emph{prediction} or \emph{estimation} $y^{est}(x_i)$ of the weight by
the model \eqnref{cubicfunc} for a specific value of the parameter
$c$. Prediction and actual data value ideally match (in a perfect
noise-free world), but in general the estimate and measurement are
separated by some distance or error $y_i - y^{est}(x_i)$. In our
example the estimate of the weight for the length $x_i$ is given by
equation \eqref{cubicfunc} $y^{est}(x_i) = f(x_i;c)$. The best fitting
model with parameter $c$ is the one that somehow minimizes the
distances between observations $y_i$ and corresponding estimations
$y^{est}(x_i)$ (\figref{cubicerrorsfig}).

As a first guess we could simply minimize the sum of the distances,
$\sum_{i=1}^N y_i - y^{est}(x_i)$. This, however, does not work
because positive and negative errors would cancel out, no matter how
large they are, and sum up to values close to zero. Better is to sum
over absolute distances: $\sum_{i=1}^N |y_i - y^{est}(x_i)|$. This sum
can only be small if all deviations are indeed small no matter if they
are above or below the prediction. The sum of the squared distances,
$\sum_{i=1}^N (y_i - y^{est}(x_i))^2$, turns out to be an even better
choice. Instead of the sum we could also minimize the average distance
\begin{equation}
  \label{meansquarederror}
  f_{mse}(\{(x_i, y_i)\}|\{y^{est}(x_i)\}) = \frac{1}{N} \sum_{i=1}^N (y_i - y^{est}(x_i))^2
\end{equation}
is commonly used (\figref{leastsquareerrorfig}). Similar to the
absolute distance, the square of the errors, $(y_i - y^{est}(x_i))^2$, is
always positive and thus positive and negative error values do not
This is known as the \enterm[mean squared error]{mean squared error}
(\determ[quadratischer Fehler!mittlerer]{mittlerer quadratischer
  Fehler}). Similar to the absolute distance, the square of the errors
is always positive and thus positive and negative error values do not
cancel each other out. In addition, the square punishes large
deviations over small deviations. In
chapter~\ref{maximumlikelihoodchapter} we show that minimizing the
mean square error is equivalent to maximizing the likelihood that the
mean squared error is equivalent to maximizing the likelihood that the
observations originate from the model, if the data are normally
distributed around the model prediction.

\begin{exercise}{meanSquaredErrorLine.m}{}\label{mseexercise}%
  Given a vector of observations \varcode{y} and a vector with the
  corresponding predictions \varcode{y\_est}, compute the \emph{mean
  square error} between \varcode{y} and \varcode{y\_est} in a single
  line of code.
\begin{exercise}{meansquarederrorline.m}{}\label{mseexercise}
  Simulate $n=40$ tigers ranging from 2.2 to 3.9\,m in size and store
  these sizes in a vector \varcode{x}. Compute the corresponding
  predicted weights \varcode{yest} for each tiger according to
  \eqnref{cubicfunc} with $c=6$\,\kilo\gram\per\meter\cubed. From the
  predictions generate simulated measurements of the tigers' weights
  \varcode{y}, by adding normally distributed random numbers to the
  predictions scaled to a standard deviation of 50\,\kilo\gram.

  Compute the \emph{mean squared error} between \varcode{y} and
  \varcode{yest} in a single line of code.
\end{exercise}


@ -110,13 +115,13 @@ can be any function that describes the quality of the fit by mapping
the data and the predictions to a single scalar value.

\begin{figure}[t]
  \includegraphics[width=1\textwidth]{linear_least_squares}
  \titlecaption{Estimating the \emph{mean square error}.} {The
    deviation error, orange) between the prediction (red
    line) and the observations (blue dots) is calculated for each data
    point (left). Then the deviations are squared and the aveage is
  \includegraphics{cubicerrors}
  \titlecaption{Estimating the \emph{mean squared error}.} {The
    deviation error (orange) between the prediction (red line) and the
    observations (blue dots) is calculated for each data point
    (left). Then the deviations are squared and the average is
    calculated (right).}
  \label{leastsquareerrorfig}
  \label{cubicerrorsfig}
\end{figure}

Replacing $y^{est}$ in the mean squared error \eqref{meansquarederror}
@ -139,7 +144,7 @@ Fehler!kleinster]{Methode der kleinsten Quadrate}).
  contains the measurements $y$ for each value of $x$. The third
  argument is a 2-element vector that contains the values of
  parameters \varcode{m} and \varcode{b}. The function returns the
  mean square error.
  mean squared error.
\end{exercise}


@ -359,7 +364,7 @@ distance between the red dots in \figref{gradientdescentfig}) is
large.

\begin{figure}[t]
  \includegraphics[width=0.45\textwidth]{gradient_descent}
  \includegraphics{cubicmse}
  \titlecaption{Gradient descent.}{The algorithm starts at an
    arbitrary position. At each point the gradient is estimated and
    the position is updated as long as the length of the gradient is