[regression] started to simplify chapter to a 1d problem

Jan Benda 2020-12-15 23:18:12 +01:00
parent a097b10024
commit be79d450df
7 changed files with 247 additions and 116 deletions

View File

@@ -1 +0,0 @@
mse = mean((y - y_est).^2);

View File

@@ -0,0 +1,15 @@
n = 40;
xmin = 2.2;
xmax = 3.9;
c = 6.0;
noise = 50.0;
% generate data:
x = rand(n, 1) * (xmax-xmin) + xmin;
yest = c * x.^3;
y = yest + noise*randn(n, 1);
% compute mean squared error:
mse = mean((y - yest).^2);
fprintf('the mean squared error is %g kg^2\n', mse);

View File

@@ -15,28 +15,13 @@ def create_data():
    return x, y, c
-def plot_data(ax, x, y, c):
-    ax.plot(x, y, zorder=10, **psAm)
-    xx = np.linspace(2.1, 3.9, 100)
-    ax.plot(xx, c*xx**3.0, zorder=5, **lsBm)
-    for cc in [0.25*c, 0.5*c, 2.0*c, 4.0*c]:
-        ax.plot(xx, cc*xx**3.0, zorder=5, **lsDm)
-    ax.set_xlabel('Size x', 'm')
-    ax.set_ylabel('Weight y', 'kg')
-    ax.set_xlim(2, 4)
-    ax.set_ylim(0, 400)
-    ax.set_xticks(np.arange(2.0, 4.1, 0.5))
-    ax.set_yticks(np.arange(0, 401, 100))
def plot_data_errors(ax, x, y, c):
    ax.set_xlabel('Size x', 'm')
-    #ax.set_ylabel('Weight y', 'kg')
+    ax.set_ylabel('Weight y', 'kg')
    ax.set_xlim(2, 4)
    ax.set_ylim(0, 400)
    ax.set_xticks(np.arange(2.0, 4.1, 0.5))
    ax.set_yticks(np.arange(0, 401, 100))
-    ax.set_yticklabels([])
    ax.annotate('Error',
                xy=(x[28]+0.05, y[28]+60), xycoords='data',
                xytext=(3.4, 70), textcoords='data', ha='left',
@@ -52,31 +37,30 @@ def plot_data_errors(ax, x, y, c):
        yy = [c*x[i]**3.0, y[i]]
        ax.plot(xx, yy, zorder=5, **lsDm)
def plot_error_hist(ax, x, y, c):
    ax.set_xlabel('Squared error')
    ax.set_ylabel('Frequency')
-    bins = np.arange(0.0, 1250.0, 100)
+    bins = np.arange(0.0, 11000.0, 750)
    ax.set_xlim(bins[0], bins[-1])
-    #ax.set_ylim(0, 35)
-    ax.set_xticks(np.arange(bins[0], bins[-1], 200))
-    #ax.set_yticks(np.arange(0, 36, 10))
+    ax.set_ylim(0, 15)
+    ax.set_xticks(np.arange(bins[0], bins[-1], 5000))
+    ax.set_yticks(np.arange(0, 16, 5))
    errors = (y-(c*x**3.0))**2.0
    mls = np.mean(errors)
    ax.annotate('Mean\nsquared\nerror',
                xy=(mls, 0.5), xycoords='data',
-                xytext=(800, 3), textcoords='data', ha='left',
+                xytext=(4500, 6), textcoords='data', ha='left',
                arrowprops=dict(arrowstyle="->", relpos=(0.0,0.2),
                connectionstyle="angle3,angleA=10,angleB=90") )
    ax.hist(errors, bins, **fsC)
if __name__ == "__main__":
    x, y, c = create_data()
    fig, (ax1, ax2) = plt.subplots(1, 2)
-    fig.subplots_adjust(wspace=0.2, **adjust_fs(left=6.0, right=1.2))
-    plot_data(ax1, x, y, c)
-    plot_data_errors(ax2, x, y, c)
-    #plot_error_hist(ax2, x, y, c)
+    fig.subplots_adjust(wspace=0.4, **adjust_fs(left=6.0, right=1.2))
+    plot_data_errors(ax1, x, y, c)
+    plot_error_hist(ax2, x, y, c)
    fig.savefig("cubicerrors.pdf")
    plt.close()

View File

@@ -2,7 +2,7 @@ import matplotlib.pyplot as plt
import numpy as np
from plotstyle import *
-if __name__ == "__main__":
+def create_data():
    # wikipedia:
    # Generally, males vary in total length from 250 to 390 cm and
    # weigh between 90 and 306 kg
@@ -12,10 +12,20 @@ if __name__ == "__main__":
    rng = np.random.RandomState(32281)
    noise = rng.randn(len(x))*50
    y += noise
-    fig, ax = plt.subplots(figsize=cm_size(figure_width, 1.4*figure_height))
-    fig.subplots_adjust(**adjust_fs(left=6.0, right=1.2))
+    return x, y, c
+def plot_data(ax, x, y):
+    ax.plot(x, y, **psA)
+    ax.set_xlabel('Size x', 'm')
+    ax.set_ylabel('Weight y', 'kg')
+    ax.set_xlim(2, 4)
+    ax.set_ylim(0, 400)
+    ax.set_xticks(np.arange(2.0, 4.1, 0.5))
+    ax.set_yticks(np.arange(0, 401, 100))
+def plot_data_fac(ax, x, y, c):
    ax.plot(x, y, zorder=10, **psA)
    xx = np.linspace(2.1, 3.9, 100)
    ax.plot(xx, c*xx**3.0, zorder=5, **lsB)
@@ -27,6 +37,14 @@ if __name__ == "__main__":
    ax.set_ylim(0, 400)
    ax.set_xticks(np.arange(2.0, 4.1, 0.5))
    ax.set_yticks(np.arange(0, 401, 100))
+if __name__ == "__main__":
+    x, y, c = create_data()
+    print(len(x))
+    fig, (ax1, ax2) = plt.subplots(1, 2)
+    fig.subplots_adjust(wspace=0.5, **adjust_fs(fig, left=6.0, right=1.5))
+    plot_data(ax1, x, y)
+    plot_data_fac(ax2, x, y, c)
    fig.savefig("cubicfunc.pdf")
    plt.close()

View File

@@ -14,6 +14,7 @@ def create_data():
    y += noise
    return x, y, c
+
def gradient_descent(x, y):
    n = 20
    dc = 0.01
@@ -29,6 +30,7 @@ def gradient_descent(x, y):
        mses.append(m0)
        cc -= eps*dmdc
    return cs, mses
+
def plot_mse(ax, x, y, c, cs):
    ms = np.zeros(len(cs))
@@ -54,7 +56,8 @@ def plot_mse(ax, x, y, c, cs):
    ax.set_ylim(0, 25000)
    ax.set_xticks(np.arange(0.0, 10.1, 2.0))
    ax.set_yticks(np.arange(0, 30001, 10000))
+
def plot_descent(ax, cs, mses):
    ax.plot(np.arange(len(mses))+1, mses, **lpsBm)
    ax.set_xlabel('Iteration')
@@ -69,7 +72,7 @@ def plot_descent(ax, cs, mses):
if __name__ == "__main__":
    x, y, c = create_data()
    cs, mses = gradient_descent(x, y)
-    fig, (ax1, ax2) = plt.subplots(1, 2)
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=cm_size(figure_width, 1.1*figure_height))
    fig.subplots_adjust(wspace=0.2, **adjust_fs(left=8.0, right=0.5))
    plot_mse(ax1, x, y, c, cs)
    plot_descent(ax2, cs, mses)

View File

@@ -40,6 +40,113 @@
\item Homework is to do the 2d problem with the straight line!
\end{itemize}
\subsection{2D fit}
\begin{exercise}{meanSquaredError.m}{}
Implement the objective function \eqref{mseline} as a function
\varcode{meanSquaredError()}. The function takes three
arguments. The first is a vector of $x$-values and the second
contains the measurements $y$ for each value of $x$. The third
argument is a 2-element vector that contains the values of
parameters \varcode{m} and \varcode{b}. The function returns the
mean squared error.
\end{exercise}
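A minimal sketch of such a function might look like this (the parameter vector is assumed to be ordered as slope first, intercept second):

function mse = meanSquaredError(x, y, parameter)
% Mean squared error between a straight line and measured data.
% x         : vector of x-values
% y         : vector of corresponding measurements
% parameter : 2-element vector with slope m and intercept b
    yest = parameter(1) * x + parameter(2);   % predictions of the line
    mse = mean((y - yest).^2);                % average squared deviation
end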
\begin{exercise}{errorSurface.m}{}\label{errorsurfaceexercise}
Generate 20 data pairs $(x_i|y_i)$ that are linearly related with
slope $m=0.75$ and intercept $b=-40$, using \varcode{rand()} for
drawing $x$ values between 0 and 120 and \varcode{randn()} for
jittering the $y$ values with a standard deviation of 15. Then
calculate the mean squared error between the data and straight lines
for a range of slopes and intercepts using the
\varcode{meanSquaredError()} function from the previous exercise.
Illustrate the error surface using the \code{surface()} function.
Consult the documentation to find out how to use \code{surface()}.
\end{exercise}
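One possible way to set up the error surface, reusing the \varcode{meanSquaredError()} sketch from above; the ranges of slopes and intercepts are arbitrary choices:

% generate the data:
n = 20;
x = rand(n, 1) * 120;                     % x-values between 0 and 120
y = 0.75 * x - 40 + 15 * randn(n, 1);     % line plus noise with std. dev. 15
% mean squared error for a range of slopes and intercepts:
slopes = 0.3:0.05:1.2;
intercepts = -80:5:0;
mses = zeros(length(slopes), length(intercepts));
for i = 1:length(slopes)
    for k = 1:length(intercepts)
        mses(i, k) = meanSquaredError(x, y, [slopes(i), intercepts(k)]);
    end
end
% plot the error surface:
figure();
surface(intercepts, slopes, mses);
xlabel('Intercept b');
ylabel('Slope m');
zlabel('Mean squared error');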
\begin{exercise}{meanSquaredGradient.m}{}\label{gradientexercise}%
Implement a function \varcode{meanSquaredGradient()} that takes as
input arguments the $x$- and $y$-data and the parameters $(m, b)$ of
a straight line, given as a two-element vector. The function should
return the gradient at the position $(m, b)$ as a vector with two
elements.
\end{exercise}
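A sketch of a numerical gradient based on finite differences (the step size \varcode{dp} is an arbitrary small number):

function gradient = meanSquaredGradient(x, y, parameter)
% Numerical gradient of the mean squared error with respect to m and b.
    dp = 1e-4;                 % small step for the difference quotient
    gradient = zeros(1, 2);
    for i = 1:2
        pstep = parameter;
        pstep(i) = pstep(i) + dp;
        gradient(i) = (meanSquaredError(x, y, pstep) - ...
                       meanSquaredError(x, y, parameter)) / dp;
    end
end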
\begin{exercise}{errorGradient.m}{}
Extend the script of exercise~\ref{errorsurfaceexercise} to plot
both the error surface and gradients using the
\varcode{meanSquaredGradient()} function from
exercise~\ref{gradientexercise}. Vectors in space can be easily
plotted using the function \code{quiver()}. Use \code{contour()}
instead of \code{surface()} to plot the error surface.
\end{exercise}
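Building on the error-surface sketch from above, the gradient field could be added like this (the grid variables are the ones defined there):

% gradient at each combination of slope and intercept:
[bs, ms] = meshgrid(intercepts, slopes);
gradm = zeros(size(ms));           % gradient component along the slope axis
gradb = zeros(size(bs));           % gradient component along the intercept axis
for i = 1:length(slopes)
    for k = 1:length(intercepts)
        g = meanSquaredGradient(x, y, [slopes(i), intercepts(k)]);
        gradm(i, k) = g(1);
        gradb(i, k) = g(2);
    end
end
% contour lines of the error surface with gradient arrows on top:
figure();
contour(intercepts, slopes, mses, 30);
hold on;
quiver(bs, ms, gradb, gradm);      % arrows point uphill
hold off;
xlabel('Intercept b');
ylabel('Slope m');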
\begin{exercise}{gradientDescent.m}{}
Implement the gradient descent for the problem of fitting a straight
line to some measured data. Reuse the data generated in
exercise~\ref{errorsurfaceexercise}.
\begin{enumerate}
\item Store the error value for each iteration.
\item Plot the error values as a function of the iteration count,
i.e. the number of optimization steps.
\item Plot the measured data together with the best fitting straight line.
\end{enumerate}\vspace{-4.5ex}
\end{exercise}
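The core of the descent might look like this; the start position, the factor \varcode{eps}, and the threshold on the gradient length are arbitrary choices that may need tuning, and with these data convergence is slow because slope and intercept act on very different scales:

position = [2.0, 10.0];       % arbitrary start values for slope and intercept
eps = 1e-4;                   % factor linking gradient to parameter updates
errors = [];                  % error value of each iteration
gradient = [1.0, 1.0];
while norm(gradient) > 0.1
    errors(end+1) = meanSquaredError(x, y, position);
    gradient = meanSquaredGradient(x, y, position);
    position = position - eps * gradient;
end
% error values as a function of the number of optimization steps:
figure();
plot(errors);
xlabel('Iteration');
ylabel('Mean squared error');
% measured data together with the best fitting straight line:
figure();
plot(x, y, 'o');
hold on;
xx = [0, 120];
plot(xx, position(1) * xx + position(2));
hold off;
xlabel('x');
ylabel('y');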
\begin{figure}[t]
\includegraphics[width=1\textwidth]{lin_regress}\hfill
\titlecaption{Example data suggesting a linear relation.}{A set of
input signals $x$, e.g. stimulus intensities, was used to probe a
system. The system's outputs $y$ to these inputs are noted
(left). Assuming a linear relation between $x$ and $y$ leaves us
with 2 parameters, the slope (center) and the intercept with the
y-axis (right panel).}\label{linregressiondatafig}
\end{figure}
\begin{figure}[t]
\includegraphics[width=1\textwidth]{linear_least_squares}
\titlecaption{Estimating the \emph{mean squared error}.} {The
deviation error (orange) between the prediction (red line) and the
observations (blue dots) is calculated for each data point
(left). Then the deviations are squared and the average is
calculated (right).}
\label{leastsquareerrorfig}
\end{figure}
\begin{figure}[t]
\includegraphics[width=0.75\textwidth]{error_surface}
\titlecaption{Error surface.}{The two model parameters $m$ and $b$
define the base area of the surface plot. For each parameter
combination of slope and intercept the error is calculated. The
resulting surface has a minimum which indicates the parameter
combination that best fits the data.}\label{errorsurfacefig}
\end{figure}
\begin{figure}[t]
\includegraphics[width=0.75\textwidth]{error_gradient}
\titlecaption{Gradient of the error surface.} {Each arrow points
in the direction of the steepest ascent at different positions
of the error surface shown in \figref{errorsurfacefig}. The
contour lines in the background illustrate the error surface. Warm
colors indicate high errors, colder colors low error values. Each
contour line connects points of equal
error.}\label{gradientquiverfig}
\end{figure}
\begin{figure}[t]
\includegraphics[width=0.45\textwidth]{gradient_descent}
\titlecaption{Gradient descent.}{The algorithm starts at an
arbitrary position. At each point the gradient is estimated and
the position is updated as long as the length of the gradient is
sufficiently large. The dots show the positions after each
iteration of the algorithm.} \label{gradientdescentfig}
\end{figure}
\subsection{Linear fits}
\begin{itemize}
\item Polyfit is easy: unique solution! $c x^2$ is also a linear fit.
@@ -54,8 +161,8 @@ Fit with matlab functions lsqcurvefit, polyfit
\subsection{Non-linear fits}
\begin{itemize}
\item Example that illustrates the problem of local minima (with error surface)
-\item You need got initial values for the parameter!
-\item Example that fitting gets harder the more parameter yuo have.
+\item You need initial values for the parameters (see the sketch after this list)!
+\item Example that fitting gets harder the more parameters you have.
\item Try to fix as many parameters as possible before doing the fit.
\item How to test the quality of a fit? Residuals. $\chi^2$ test. Run-test.
\end{itemize}
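For illustration, a possible sketch with MATLAB's fitting functions; the cubic model and the start value are arbitrary examples, and \varcode{lsqcurvefit()} requires the Optimization Toolbox:

% linear fit: polyfit() needs no initial values, the solution is unique:
p = polyfit(x, y, 1);                 % slope and intercept of a straight line
% non-linear fit: lsqcurvefit() needs a model function and initial values:
cubic = @(c, x) c(1) * x.^3;          % model with a single free parameter
c0 = 1.0;                             % initial guess for the parameter
cest = lsqcurvefit(cubic, c0, x, y);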

View File

@@ -2,99 +2,104 @@
\exercisechapter{Optimization and gradient descent}

Optimization problems arise in many different contexts. For example,
-to understand the behavior of a given system, the system is probed
-with a range of input signals and then the resulting responses are
-measured. This input-output relation can be described by a model. Such
-a model can be a simple function that maps the input signals to
-corresponding responses, it can be a filter, or a system of
-differential equations. In any case, the model has some parameters that
-specify how input and output signals are related. Which combination
-of parameter values are best suited to describe the input-output
-relation? The process of finding the best parameter values is an
-optimization problem. For a simple parameterized function that maps
-input to output values, this is the special case of a \enterm{curve
-fitting} problem, where the average distance between the curve and
-the response values is minimized. One basic numerical method used for
-such optimization problems is the so called gradient descent, which is
-introduced in this chapter.
+to understand the behavior of a given neuronal system, the system is
+probed with a range of input signals and then the resulting responses
+are measured. This input-output relation can be described by a
+model. Such a model can be a simple function that maps the input
+signals to corresponding responses; it can be a filter, or a system of
+differential equations. In any case, the model has some parameters
+that specify how input and output signals are related. Which
+combination of parameter values is best suited to describe the
+input-output relation? The process of finding the best parameter
+values is an optimization problem. For a simple parameterized function
+that maps input to output values, this is the special case of a
+\enterm{curve fitting} problem, where the average distance between the
+curve and the response values is minimized. One basic numerical method
+used for such optimization problems is the so-called gradient descent,
+which is introduced in this chapter.
%%% Another simple verbal example? Perhaps from population ecology?
\begin{figure}[t]
-  \includegraphics[width=1\textwidth]{lin_regress}\hfill
-  \titlecaption{Example data suggesting a linear relation.}{A set of
-    input signals $x$, e.g. stimulus intensities, were used to probe a
-    system. The system's output $y$ to the inputs are noted
-    (left). Assuming a linear relation between $x$ and $y$ leaves us
-    with 2 parameters, the slope (center) and the intercept with the
-    y-axis (right panel).}\label{linregressiondatafig}
+  \includegraphics{cubicfunc}
+  \titlecaption{Example data suggesting a cubic relation.}{The length
+    $x$ and weight $y$ of $n=34$ male tigers (blue, left). Assuming a
+    cubic relation between size and weight leaves us with a single
+    free parameter, a scaling factor. The cubic relation is shown for
+    a few values of this scaling factor (orange and red,
+    right).}\label{cubicdatafig}
\end{figure}
-The data plotted in \figref{linregressiondatafig} suggest a linear
-relation between input and output of the system. We thus assume that a
-straight line
+To demonstrate the curve-fitting problem, let's take the simple
+example of weights and sizes measured for a number of male tigers
+(\figref{cubicdatafig}). Weight $y$ is proportional to volume
+$V$ via the density $\rho$. The volume $V$ of any object is
+proportional to its length $x$ cubed. The factor $\alpha$ relating
+volume and size cubed depends on the shape of the object and we do not
+know this factor for tigers. For the data set we thus expect a cubic
+relation between weight and length
\begin{equation}
-  \label{straightline}
-  y = f(x; m, b) = m\cdot x + b
+  \label{cubicfunc}
+  y = f(x; c) = c\cdot x^3
\end{equation}
-is an appropriate model to describe the system. The line has two free
-parameter, the slope $m$ and the $y$-intercept $b$. We need to find
-values for the slope and the intercept that best describe the measured
-data. In this chapter we use this example to illustrate the gradient
-descent and how this methods can be used to find a combination of
-slope and intercept that best describes the system.
+where $c=\rho\alpha$, the product of a tiger's density and form
+factor, is the only free parameter in the relation. We would like to
+find out which value of $c$ best describes the measured data. In the
+following we use this example to illustrate the gradient descent as a
+basic method for finding such an optimal parameter.
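As a rough plausibility check of this relation (taking the value of
$c$ from the exercise below): with $c = 6\,\kilo\gram\per\meter\cubed$
a tiger of length $x = 3$\,m is predicted to weigh
\[ y = c \cdot x^3 = 6 \cdot 3^3 = 162\,\kilo\gram , \]
which lies well within the 90--306\,\kilo\gram range quoted for male
tigers.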
\section{The error function --- mean squared error}
-Before the optimization can be done we need to specify what exactly is
-considered an optimal fit. In our example we search the parameter
-combination that describe the relation of $x$ and $y$ best. What is
-meant by this? Each input $x_i$ leads to an measured output $y_i$ and
-for each $x_i$ there is a \emph{prediction} or \emph{estimation}
-$y^{est}(x_i)$ of the output value by the model. At each $x_i$
-estimation and measurement have a distance or error $y_i -
-y^{est}(x_i)$. In our example the estimation is given by the equation
-$y^{est}(x_i) = f(x_i;m,b)$. The best fitting model with parameters
-$m$ and $b$ is the one that minimizes the distances between
-observation $y_i$ and estimation $y^{est}(x_i)$
-(\figref{leastsquareerrorfig}).
+Before we go ahead and find the optimal parameter value, we need to
+specify what exactly we consider as an optimal fit. In our example we
+search for the parameter that describes the relation of $x$ and $y$
+best. What is meant by this? The length $x_i$ of each tiger is
+associated with a weight $y_i$ and for each $x_i$ we have a
+\emph{prediction} or \emph{estimation} $y^{est}(x_i)$ of the weight by
+the model \eqnref{cubicfunc} for a specific value of the parameter
+$c$. Prediction and actual data value ideally match (in a perfect
+noise-free world), but in general the estimate and measurement are
+separated by some distance or error $y_i - y^{est}(x_i)$. In our
+example the estimate of the weight for the length $x_i$ is given by
+equation \eqref{cubicfunc}, $y^{est}(x_i) = f(x_i;c)$. The best fitting
+model with parameter $c$ is the one that somehow minimizes the
+distances between observations $y_i$ and corresponding estimations
+$y^{est}(x_i)$ (\figref{cubicerrorsfig}).

-As a first guess we could simply minimize the sum $\sum_{i=1}^N y_i -
-y^{est}(x_i)$. This approach, however, will not work since a minimal sum
-can also be achieved if half of the measurements is above and the
-other half below the predicted line. Positive and negative errors
-would cancel out and then sum up to values close to zero. A better
-approach is to sum over the absolute values of the distances:
-$\sum_{i=1}^N |y_i - y^{est}(x_i)|$. This sum can only be small if all
-deviations are indeed small no matter if they are above or below the
-predicted line. Instead of the sum we could also take the average
-\begin{equation}
-  \label{meanabserror}
-  f_{dist}(\{(x_i, y_i)\}|\{y^{est}(x_i)\}) = \frac{1}{N} \sum_{i=1}^N |y_i - y^{est}(x_i)|
-\end{equation}
-Instead of the averaged absolute errors, the \enterm[mean squared
-error]{mean squared error} (\determ[quadratischer
-Fehler!mittlerer]{mittlerer quadratischer Fehler})
+As a first guess we could simply minimize the sum of the distances,
+$\sum_{i=1}^N y_i - y^{est}(x_i)$. This, however, does not work
+because positive and negative errors would cancel out, no matter how
+large they are, and sum up to values close to zero. Better is to sum
+over absolute distances: $\sum_{i=1}^N |y_i - y^{est}(x_i)|$. This sum
+can only be small if all deviations are indeed small no matter if they
+are above or below the prediction. The sum of the squared distances,
+$\sum_{i=1}^N (y_i - y^{est}(x_i))^2$, turns out to be an even better
+choice. Instead of the sum we could also minimize the average distance
\begin{equation}
  \label{meansquarederror}
  f_{mse}(\{(x_i, y_i)\}|\{y^{est}(x_i)\}) = \frac{1}{N} \sum_{i=1}^N (y_i - y^{est}(x_i))^2
\end{equation}
-is commonly used (\figref{leastsquareerrorfig}). Similar to the
-absolute distance, the square of the errors, $(y_i - y^{est}(x_i))^2$, is
-always positive and thus positive and negative error values do not
+This is known as the \enterm[mean squared error]{mean squared error}
+(\determ[quadratischer Fehler!mittlerer]{mittlerer quadratischer
+Fehler}). Similar to the absolute distance, the square of the errors
+is always positive and thus positive and negative error values do not
cancel each other out. In addition, the square punishes large
deviations over small deviations. In
chapter~\ref{maximumlikelihoodchapter} we show that minimizing the
-mean square error is equivalent to maximizing the likelihood that the
+mean squared error is equivalent to maximizing the likelihood that the
observations originate from the model, if the data are normally
distributed around the model prediction.

-\begin{exercise}{meanSquaredErrorLine.m}{}\label{mseexercise}%
-  Given a vector of observations \varcode{y} and a vector with the
-  corresponding predictions \varcode{y\_est}, compute the \emph{mean
-  square error} between \varcode{y} and \varcode{y\_est} in a single
-  line of code.
+\begin{exercise}{meansquarederrorline.m}{}\label{mseexercise}
+  Simulate $n=40$ tigers ranging from 2.2 to 3.9\,m in size and store
+  these sizes in a vector \varcode{x}. Compute the corresponding
+  predicted weights \varcode{yest} for each tiger according to
+  \eqnref{cubicfunc} with $c=6$\,\kilo\gram\per\meter\cubed. From the
+  predictions generate simulated measurements of the tiger's weights
+  \varcode{y} by adding normally distributed random numbers to the
+  predictions, scaled to a standard deviation of 50\,\kilo\gram.
+  Compute the \emph{mean squared error} between \varcode{y} and
+  \varcode{yest} in a single line of code.
\end{exercise}
@@ -110,13 +115,13 @@ can be any function that describes the quality of the fit by mapping
the data and the predictions to a single scalar value.
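In MATLAB, for example, such an objective function can be written as a function handle that maps a parameter value to a single number, assuming the measured sizes \varcode{x} and weights \varcode{y} from above:

objective = @(c) mean((y - c * x.^3).^2);   % mean squared error of the cubic model
objective(6.0)                              % a single scalar quantifying the fit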
\begin{figure}[t]
-  \includegraphics[width=1\textwidth]{linear_least_squares}
-  \titlecaption{Estimating the \emph{mean square error}.} {The
-    deviation error, orange) between the prediction (red
-    line) and the observations (blue dots) is calculated for each data
-    point (left). Then the deviations are squared and the aveage is
+  \includegraphics{cubicerrors}
+  \titlecaption{Estimating the \emph{mean squared error}.} {The
+    deviation error (orange) between the prediction (red line) and the
+    observations (blue dots) is calculated for each data point
+    (left). Then the deviations are squared and the average is
    calculated (right).}
-  \label{leastsquareerrorfig}
+  \label{cubicerrorsfig}
\end{figure}

Replacing $y^{est}$ in the mean squared error \eqref{meansquarederror}
@@ -139,7 +144,7 @@ Fehler!kleinster]{Methode der kleinsten Quadrate}).
  contains the measurements $y$ for each value of $x$. The third
  argument is a 2-element vector that contains the values of
  parameters \varcode{m} and \varcode{b}. The function returns the
-  mean square error.
+  mean squared error.
\end{exercise}
@@ -359,7 +364,7 @@ distance between the red dots in \figref{gradientdescentfig}) is
large.

\begin{figure}[t]
-  \includegraphics[width=0.45\textwidth]{gradient_descent}
+  \includegraphics{cubicmse}
  \titlecaption{Gradient descent.}{The algorithm starts at an
    arbitrary position. At each point the gradient is estimated and
    the position is updated as long as the length of the gradient is