From be79d450dfa62b1721125f604d7e15dec93d50be Mon Sep 17 00:00:00 2001
From: Jan Benda
Date: Tue, 15 Dec 2020 23:18:12 +0100
Subject: [PATCH] [regression] started to simplify chapter to a 1d problem

---
 regression/code/meanSquaredErrorLine.m    |   1 -
 regression/code/meansquarederrorline.m    |  15 ++
 regression/lecture/cubicerrors.py         |  36 ++---
 regression/lecture/cubicfunc.py           |  26 +++-
 regression/lecture/cubicmse.py            |   7 +-
 regression/lecture/regression-chapter.tex | 111 +++++++++++++-
 regression/lecture/regression.tex         | 167 +++++++++++-----------
 7 files changed, 247 insertions(+), 116 deletions(-)
 delete mode 100644 regression/code/meanSquaredErrorLine.m
 create mode 100644 regression/code/meansquarederrorline.m

diff --git a/regression/code/meanSquaredErrorLine.m b/regression/code/meanSquaredErrorLine.m
deleted file mode 100644
index c395934..0000000
--- a/regression/code/meanSquaredErrorLine.m
+++ /dev/null
@@ -1 +0,0 @@
-mse = mean((y - y_est).^2);
diff --git a/regression/code/meansquarederrorline.m b/regression/code/meansquarederrorline.m
new file mode 100644
index 0000000..17f9d2f
--- /dev/null
+++ b/regression/code/meansquarederrorline.m
@@ -0,0 +1,15 @@
+n = 40;
+xmin = 2.2;
+xmax = 3.9;
+c = 6.0;
+noise = 50.0;
+
+% generate data:
+x = rand(n, 1) * (xmax-xmin) + xmin;
+yest = c * x.^3;
+y = yest + noise*randn(n, 1);
+
+% compute mean squared error:
+mse = mean((y - yest).^2);
+
+fprintf('the mean squared error is %g kg^2\n', mse)
diff --git a/regression/lecture/cubicerrors.py b/regression/lecture/cubicerrors.py
index e072d9e..86e608f 100644
--- a/regression/lecture/cubicerrors.py
+++ b/regression/lecture/cubicerrors.py
@@ -15,28 +15,13 @@ def create_data():
     return x, y, c
 
 
-def plot_data(ax, x, y, c):
-    ax.plot(x, y, zorder=10, **psAm)
-    xx = np.linspace(2.1, 3.9, 100)
-    ax.plot(xx, c*xx**3.0, zorder=5, **lsBm)
-    for cc in [0.25*c, 0.5*c, 2.0*c, 4.0*c]:
-        ax.plot(xx, cc*xx**3.0, zorder=5, **lsDm)
-    ax.set_xlabel('Size x', 'm')
-    ax.set_ylabel('Weight y', 'kg')
-    ax.set_xlim(2, 4)
-    ax.set_ylim(0, 400)
-    ax.set_xticks(np.arange(2.0, 4.1, 0.5))
-    ax.set_yticks(np.arange(0, 401, 100))
-
-
 def plot_data_errors(ax, x, y, c):
     ax.set_xlabel('Size x', 'm')
-    #ax.set_ylabel('Weight y', 'kg')
+    ax.set_ylabel('Weight y', 'kg')
     ax.set_xlim(2, 4)
     ax.set_ylim(0, 400)
     ax.set_xticks(np.arange(2.0, 4.1, 0.5))
     ax.set_yticks(np.arange(0, 401, 100))
-    ax.set_yticklabels([])
     ax.annotate('Error', xy=(x[28]+0.05, y[28]+60), xycoords='data',
                 xytext=(3.4, 70), textcoords='data', ha='left',
@@ -52,31 +37,30 @@ def plot_data_errors(ax, x, y, c):
         yy = [c*x[i]**3.0, y[i]]
         ax.plot(xx, yy, zorder=5, **lsDm)
 
+
 def plot_error_hist(ax, x, y, c):
     ax.set_xlabel('Squared error')
     ax.set_ylabel('Frequency')
-    bins = np.arange(0.0, 1250.0, 100)
+    bins = np.arange(0.0, 11000.0, 750)
     ax.set_xlim(bins[0], bins[-1])
-    #ax.set_ylim(0, 35)
-    ax.set_xticks(np.arange(bins[0], bins[-1], 200))
-    #ax.set_yticks(np.arange(0, 36, 10))
+    ax.set_ylim(0, 15)
+    ax.set_xticks(np.arange(bins[0], bins[-1], 5000))
+    ax.set_yticks(np.arange(0, 16, 5))
     errors = (y-(c*x**3.0))**2.0
     mls = np.mean(errors)
     ax.annotate('Mean\nsquared\nerror',
                 xy=(mls, 0.5), xycoords='data',
-                xytext=(800, 3), textcoords='data', ha='left',
+                xytext=(4500, 6), textcoords='data', ha='left',
                 arrowprops=dict(arrowstyle="->", relpos=(0.0,0.2),
                 connectionstyle="angle3,angleA=10,angleB=90") )
     ax.hist(errors, bins, **fsC)
 
-
 if __name__ == "__main__":
     x, y, c = create_data()
     fig, (ax1, ax2) = plt.subplots(1, 2)
-    fig.subplots_adjust(wspace=0.2, **adjust_fs(left=6.0, right=1.2))
-
plot_data(ax1, x, y, c) - plot_data_errors(ax2, x, y, c) - #plot_error_hist(ax2, x, y, c) + fig.subplots_adjust(wspace=0.4, **adjust_fs(left=6.0, right=1.2)) + plot_data_errors(ax1, x, y, c) + plot_error_hist(ax2, x, y, c) fig.savefig("cubicerrors.pdf") plt.close() diff --git a/regression/lecture/cubicfunc.py b/regression/lecture/cubicfunc.py index e8c0565..45dd67b 100644 --- a/regression/lecture/cubicfunc.py +++ b/regression/lecture/cubicfunc.py @@ -2,7 +2,7 @@ import matplotlib.pyplot as plt import numpy as np from plotstyle import * -if __name__ == "__main__": +def create_data(): # wikipedia: # Generally, males vary in total length from 250 to 390 cm and # weigh between 90 and 306 kg @@ -12,10 +12,20 @@ if __name__ == "__main__": rng = np.random.RandomState(32281) noise = rng.randn(len(x))*50 y += noise + return x, y, c + - fig, ax = plt.subplots(figsize=cm_size(figure_width, 1.4*figure_height)) - fig.subplots_adjust(**adjust_fs(left=6.0, right=1.2)) - +def plot_data(ax, x, y): + ax.plot(x, y, **psA) + ax.set_xlabel('Size x', 'm') + ax.set_ylabel('Weight y', 'kg') + ax.set_xlim(2, 4) + ax.set_ylim(0, 400) + ax.set_xticks(np.arange(2.0, 4.1, 0.5)) + ax.set_yticks(np.arange(0, 401, 100)) + + +def plot_data_fac(ax, x, y, c): ax.plot(x, y, zorder=10, **psA) xx = np.linspace(2.1, 3.9, 100) ax.plot(xx, c*xx**3.0, zorder=5, **lsB) @@ -27,6 +37,14 @@ if __name__ == "__main__": ax.set_ylim(0, 400) ax.set_xticks(np.arange(2.0, 4.1, 0.5)) ax.set_yticks(np.arange(0, 401, 100)) + +if __name__ == "__main__": + x, y, c = create_data() + print(len(x)) + fig, (ax1, ax2) = plt.subplots(1, 2) + fig.subplots_adjust(wspace=0.5, **adjust_fs(fig, left=6.0, right=1.5)) + plot_data(ax1, x, y) + plot_data_fac(ax2, x, y, c) fig.savefig("cubicfunc.pdf") plt.close() diff --git a/regression/lecture/cubicmse.py b/regression/lecture/cubicmse.py index f687885..55ac64b 100644 --- a/regression/lecture/cubicmse.py +++ b/regression/lecture/cubicmse.py @@ -14,6 +14,7 @@ def create_data(): y += noise return x, y, c + def gradient_descent(x, y): n = 20 dc = 0.01 @@ -29,6 +30,7 @@ def gradient_descent(x, y): mses.append(m0) cc -= eps*dmdc return cs, mses + def plot_mse(ax, x, y, c, cs): ms = np.zeros(len(cs)) @@ -54,7 +56,8 @@ def plot_mse(ax, x, y, c, cs): ax.set_ylim(0, 25000) ax.set_xticks(np.arange(0.0, 10.1, 2.0)) ax.set_yticks(np.arange(0, 30001, 10000)) - + + def plot_descent(ax, cs, mses): ax.plot(np.arange(len(mses))+1, mses, **lpsBm) ax.set_xlabel('Iteration') @@ -69,7 +72,7 @@ def plot_descent(ax, cs, mses): if __name__ == "__main__": x, y, c = create_data() cs, mses = gradient_descent(x, y) - fig, (ax1, ax2) = plt.subplots(1, 2) + fig, (ax1, ax2) = plt.subplots(1, 2, figsize=cm_size(figure_width, 1.1*figure_height)) fig.subplots_adjust(wspace=0.2, **adjust_fs(left=8.0, right=0.5)) plot_mse(ax1, x, y, c, cs) plot_descent(ax2, cs, mses) diff --git a/regression/lecture/regression-chapter.tex b/regression/lecture/regression-chapter.tex index 792dfcd..2169854 100644 --- a/regression/lecture/regression-chapter.tex +++ b/regression/lecture/regression-chapter.tex @@ -40,6 +40,113 @@ \item Homework is to do the 2d problem with the straight line! \end{itemize} +\subsection{2D fit} + +\begin{exercise}{meanSquaredError.m}{} + Implement the objective function \eqref{mseline} as a function + \varcode{meanSquaredError()}. The function takes three + arguments. The first is a vector of $x$-values and the second + contains the measurements $y$ for each value of $x$. 
The third
+  argument is a 2-element vector that contains the values of
+  parameters \varcode{m} and \varcode{b}. The function returns the
+  mean squared error.
+\end{exercise}
+
+
+\begin{exercise}{errorSurface.m}{}\label{errorsurfaceexercise}
+  Generate 20 data pairs $(x_i|y_i)$ that are linearly related with
+  slope $m=0.75$ and intercept $b=-40$, using \varcode{rand()} for
+  drawing $x$ values between 0 and 120 and \varcode{randn()} for
+  jittering the $y$ values with a standard deviation of 15. Then
+  calculate the mean squared error between the data and straight lines
+  for a range of slopes and intercepts using the
+  \varcode{meanSquaredError()} function from the previous exercise.
+  Illustrate the error surface using the \code{surface()} function.
+  Consult the documentation to find out how to use \code{surface()}.
+\end{exercise}
+
+\begin{exercise}{meanSquaredGradient.m}{}\label{gradientexercise}%
+  Implement a function \varcode{meanSquaredGradient()} that takes the
+  $x$- and $y$-data and the set of parameters $(m, b)$ of a straight
+  line as a two-element vector as input arguments. The function should
+  return the gradient at the position $(m, b)$ as a vector with two
+  elements.
+\end{exercise}
+
+\begin{exercise}{errorGradient.m}{}
+  Extend the script of exercise~\ref{errorsurfaceexercise} to plot
+  both the error surface and gradients using the
+  \varcode{meanSquaredGradient()} function from
+  exercise~\ref{gradientexercise}. Vectors in space can be easily
+  plotted using the function \code{quiver()}. Use \code{contour()}
+  instead of \code{surface()} to plot the error surface.
+\end{exercise}
+
+
+\begin{exercise}{gradientDescent.m}{}
+  Implement the gradient descent for the problem of fitting a straight
+  line to some measured data. Reuse the data generated in
+  exercise~\ref{errorsurfaceexercise}.
+  \begin{enumerate}
+  \item Store for each iteration the error value.
+  \item Plot the error values as a function of the iterations, the
+    number of optimization steps.
+  \item Plot the measured data together with the best fitting straight line.
+  \end{enumerate}\vspace{-4.5ex}
+\end{exercise}
+
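+A possible skeleton for such a gradient descent loop could look like
+this (a minimal sketch, not the reference solution; it assumes the
+data \varcode{x} and \varcode{y} from
+exercise~\ref{errorsurfaceexercise}, the functions
+\varcode{meanSquaredError()} and \varcode{meanSquaredGradient()} from
+the exercises above, and ad-hoc choices for step size and termination
+threshold):
+\begin{verbatim}
+position = [2.0, 10.0];  % arbitrary start values for slope m and intercept b
+gradient = [1.0, 1.0];   % dummy gradient, large enough to enter the loop
+eps = 0.0001;            % step size, an ad-hoc choice
+errors = [];             % error value of each iteration
+while norm(gradient) > 0.1
+    errors(end+1) = meanSquaredError(x, y, position);
+    gradient = meanSquaredGradient(x, y, position);
+    position = position - eps*gradient;
+end
+plot(errors);            % error as a function of the iteration number
+\end{verbatim}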
+
+
+\begin{figure}[t]
+  \includegraphics[width=1\textwidth]{lin_regress}\hfill
+  \titlecaption{Example data suggesting a linear relation.}{A set of
+    input signals $x$, e.g. stimulus intensities, were used to probe a
+    system. The system's output $y$ to the inputs is noted
+    (left). Assuming a linear relation between $x$ and $y$ leaves us
+    with 2 parameters, the slope (center) and the intercept with the
+    y-axis (right panel).}\label{linregressiondatafig}
+\end{figure}
+
+\begin{figure}[t]
+  \includegraphics[width=1\textwidth]{linear_least_squares}
+  \titlecaption{Estimating the \emph{mean squared error}.} {The
+    deviation error (orange) between the prediction (red line) and the
+    observations (blue dots) is calculated for each data point
+    (left). Then the deviations are squared and the average is
+    calculated (right).}
+  \label{leastsquareerrorfig}
+\end{figure}
+
+\begin{figure}[t]
+  \includegraphics[width=0.75\textwidth]{error_surface}
+  \titlecaption{Error surface.}{The two model parameters $m$ and $b$
+    define the base area of the surface plot. For each parameter
+    combination of slope and intercept the error is calculated. The
+    resulting surface has a minimum which indicates the parameter
+    combination that best fits the data.}\label{errorsurfacefig}
+\end{figure}
+
+
+\begin{figure}[t]
+  \includegraphics[width=0.75\textwidth]{error_gradient}
+  \titlecaption{Gradient of the error surface.} {Each arrow points
+    in the direction of the steepest ascent at different positions
+    of the error surface shown in \figref{errorsurfacefig}. The
+    contour lines in the background illustrate the error surface. Warm
+    colors indicate high errors, colder colors low error values. Each
+    contour line connects points of equal
+    error.}\label{gradientquiverfig}
+\end{figure}
+
+\begin{figure}[t]
+  \includegraphics[width=0.45\textwidth]{gradient_descent}
+  \titlecaption{Gradient descent.}{The algorithm starts at an
+    arbitrary position. At each point the gradient is estimated and
+    the position is updated as long as the length of the gradient is
+    sufficiently large. The dots show the positions after each
+    iteration of the algorithm.} \label{gradientdescentfig}
+\end{figure}
+
+
 \subsection{Linear fits}
 \begin{itemize}
 \item Polyfit is easy: unique solution! $c x^2$ is also a linear fit.
@@ -54,8 +161,8 @@ Fit with matlab functions lsqcurvefit, polyfit
 \subsection{Non-linear fits}
 \begin{itemize}
 \item Example that illustrates the Nebenminima Problem (with error surface)
-\item You need got initial values for the parameter!
+\item You need initial values for the parameters!
-\item Example that fitting gets harder the more parameter yuo have.
+\item Example that fitting gets harder the more parameters you have.
 \item Try to fix as many parameter before doing the fit.
 \item How to test the quality of a fit? Residuals. $\chi^2$ test. Run-test.
 \end{itemize}
diff --git a/regression/lecture/regression.tex b/regression/lecture/regression.tex
index e63e3f9..779309b 100644
--- a/regression/lecture/regression.tex
+++ b/regression/lecture/regression.tex
@@ -2,99 +2,104 @@ \exercisechapter{Optimization and gradient descent}
 
 Optimization problems arise in many different contexts. For example,
-to understand the behavior of a given system, the system is probed
-with a range of input signals and then the resulting responses are
-measured. This input-output relation can be described by a model. Such
-a model can be a simple function that maps the input signals to
-corresponding responses, it can be a filter, or a system of
-differential equations. In any case, the model has some parameters that
-specify how input and output signals are related. Which combination
-of parameter values are best suited to describe the input-output
-relation? The process of finding the best parameter values is an
-optimization problem. For a simple parameterized function that maps
-input to output values, this is the special case of a \enterm{curve
- fitting} problem, where the average distance between the curve and
-the response values is minimized. One basic numerical method used for
-such optimization problems is the so called gradient descent, which is
-introduced in this chapter.
-
-%%% Weiteres einfaches verbales Beispiel? Eventuell aus der Populationsoekologie?
+to understand the behavior of a given neuronal system, the system is
+probed with a range of input signals and then the resulting responses
+are measured. This input-output relation can be described by a
+model. Such a model can be a simple function that maps the input
+signals to corresponding responses, it can be a filter, or a system of
+differential equations. In any case, the model has some parameters
+that specify how input and output signals are related. Which
+combination of parameter values is best suited to describe the
+input-output relation? The process of finding the best parameter
+values is an optimization problem. For a simple parameterized function
+that maps input to output values, this is the special case of a
+\enterm{curve fitting} problem, where the average distance between the
+curve and the response values is minimized. One basic numerical method
+used for such optimization problems is the so-called gradient descent,
+which is introduced in this chapter.
 
 \begin{figure}[t]
-  \includegraphics[width=1\textwidth]{lin_regress}\hfill
-  \titlecaption{Example data suggesting a linear relation.}{A set of
-    input signals $x$, e.g. stimulus intensities, were used to probe a
-    system. The system's output $y$ to the inputs are noted
-    (left). Assuming a linear relation between $x$ and $y$ leaves us
-    with 2 parameters, the slope (center) and the intercept with the
-    y-axis (right panel).}\label{linregressiondatafig}
+  \includegraphics{cubicfunc}
+  \titlecaption{Example data suggesting a cubic relation.}{The length
+    $x$ and weight $y$ of $n=34$ male tigers (blue, left). Assuming a
+    cubic relation between size and weight leaves us with a single
+    free parameter, a scaling factor. The cubic relation is shown for
+    a few values of this scaling factor (orange and red,
+    right).}\label{cubicdatafig}
 \end{figure}
 
-The data plotted in \figref{linregressiondatafig} suggest a linear
-relation between input and output of the system. We thus assume that a
-straight line
+To demonstrate the curve-fitting problem, let's take the simple
+example of weights and sizes measured for a number of male tigers
+(\figref{cubicdatafig}). Weight $y$ is proportional to volume
+$V$ via the density $\rho$. The volume $V$ of any object is
+proportional to its length $x$ cubed. The factor $\alpha$ relating
+volume and size cubed depends on the shape of the object and we do not
+know this factor for tigers. For the data set we thus expect a cubic
+relation between weight and length
 \begin{equation}
-  \label{straightline}
-  y = f(x; m, b) = m\cdot x + b
+  \label{cubicfunc}
+  y = f(x; c) = c\cdot x^3
 \end{equation}
-is an appropriate model to describe the system. The line has two free
-parameter, the slope $m$ and the $y$-intercept $b$. We need to find
-values for the slope and the intercept that best describe the measured
-data. In this chapter we use this example to illustrate the gradient
-descent and how this methods can be used to find a combination of
-slope and intercept that best describes the system.
+where $c=\rho\alpha$, the product of a tiger's density and form
+factor, is the only free parameter in the relation. We would like to
+find out which value of $c$ best describes the measured data. In the
+following we use this example to illustrate the gradient descent as a
+basic method for finding such an optimal parameter.
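+
+To get a first impression of \eqnref{cubicfunc} one can simply overlay
+the cubic relation for a few candidate values of $c$ on the measured
+data (a quick sketch, assuming the measured sizes and weights in
+vectors \varcode{x} and \varcode{y}):
+\begin{verbatim}
+plot(x, y, 'o');               % measured sizes and weights
+hold on;
+xx = 2.2:0.05:3.9;             % sizes at which to draw the curves
+for c = [2.0, 4.0, 6.0, 8.0]   % candidate scaling factors
+    plot(xx, c*xx.^3);
+end
+hold off;
+xlabel('Size x [m]');
+ylabel('Weight y [kg]');
+\end{verbatim}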
 
 \section{The error function --- mean squared error}
-Before the optimization can be done we need to specify what exactly is
-considered an optimal fit. In our example we search the parameter
-combination that describe the relation of $x$ and $y$ best. What is
-meant by this? Each input $x_i$ leads to an measured output $y_i$ and
-for each $x_i$ there is a \emph{prediction} or \emph{estimation}
-$y^{est}(x_i)$ of the output value by the model. At each $x_i$
-estimation and measurement have a distance or error $y_i -
-y^{est}(x_i)$. In our example the estimation is given by the equation
-$y^{est}(x_i) = f(x_i;m,b)$. The best fitting model with parameters
-$m$ and $b$ is the one that minimizes the distances between
-observation $y_i$ and estimation $y^{est}(x_i)$
-(\figref{leastsquareerrorfig}).
-
-As a first guess we could simply minimize the sum $\sum_{i=1}^N y_i -
-y^{est}(x_i)$. This approach, however, will not work since a minimal sum
-can also be achieved if half of the measurements is above and the
-other half below the predicted line. Positive and negative errors
-would cancel out and then sum up to values close to zero. A better
-approach is to sum over the absolute values of the distances:
-$\sum_{i=1}^N |y_i - y^{est}(x_i)|$. This sum can only be small if all
-deviations are indeed small no matter if they are above or below the
-predicted line. Instead of the sum we could also take the average
-\begin{equation}
-  \label{meanabserror}
-  f_{dist}(\{(x_i, y_i)\}|\{y^{est}(x_i)\}) = \frac{1}{N} \sum_{i=1}^N |y_i - y^{est}(x_i)|
-\end{equation}
-Instead of the averaged absolute errors, the \enterm[mean squared
-error]{mean squared error} (\determ[quadratischer
-Fehler!mittlerer]{mittlerer quadratischer Fehler})
+Before we go ahead and find the optimal parameter value, we need to
+specify what exactly we consider an optimal fit. In our example we
+search for the parameter that describes the relation between $x$ and
+$y$ best. What is meant by this? The length $x_i$ of each tiger is
+associated with a weight $y_i$ and for each $x_i$ we have a
+\emph{prediction} or \emph{estimation} $y^{est}(x_i)$ of the weight by
+the model \eqnref{cubicfunc} for a specific value of the parameter
+$c$. Prediction and actual data value ideally match (in a perfect
+noise-free world), but in general the estimate and measurement are
+separated by some distance or error $y_i - y^{est}(x_i)$. In our
+example the estimate of the weight for the length $x_i$ is given by
+equation \eqref{cubicfunc}, $y^{est}(x_i) = f(x_i;c)$. The best fitting
+model with parameter $c$ is the one that somehow minimizes the
+distances between observations $y_i$ and corresponding estimations
+$y^{est}(x_i)$ (\figref{cubicerrorsfig}).
+
+As a first guess we could simply minimize the sum of the distances,
+$\sum_{i=1}^N (y_i - y^{est}(x_i))$. This, however, does not work
+because positive and negative errors would cancel out, no matter how
+large they are, and sum up to values close to zero. It is better to sum
+over the absolute distances: $\sum_{i=1}^N |y_i - y^{est}(x_i)|$. This
+sum can only be small if all deviations are indeed small no matter if
+they are above or below the prediction. The sum of the squared distances,
+$\sum_{i=1}^N (y_i - y^{est}(x_i))^2$, turns out to be an even better
+choice. Instead of the sum we could also minimize the average squared distance
 \begin{equation}
   \label{meansquarederror}
   f_{mse}(\{(x_i, y_i)\}|\{y^{est}(x_i)\}) = \frac{1}{N} \sum_{i=1}^N (y_i - y^{est}(x_i))^2
 \end{equation}
-is commonly used (\figref{leastsquareerrorfig}). Similar to the
-absolute distance, the square of the errors, $(y_i - y^{est}(x_i))^2$, is
-always positive and thus positive and negative error values do not
+This is known as the \enterm[mean squared error]{mean squared error}
+(\determ[quadratischer Fehler!mittlerer]{mittlerer quadratischer
+  Fehler}). Similar to the absolute distance, the square of the errors
+is always positive and thus positive and negative error values do not
 cancel each other out. In addition, the square punishes large
 deviations over small deviations.
 In chapter~\ref{maximumlikelihoodchapter} we show that minimizing the
 mean squared error is equivalent to maximizing the likelihood that the
 observations originate from the model, if the data are normally
 distributed around the model prediction.
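+
+A tiny numerical example illustrates how differently these three
+measures behave (hypothetical deviations, not the tiger data):
+\begin{verbatim}
+errors = [10.0, -10.0, 20.0, -20.0];  % hypothetical deviations
+mean(errors)       % 0   -- positive and negative errors cancel out
+mean(abs(errors))  % 15  -- absolute errors do not cancel
+mean(errors.^2)    % 250 -- squares punish large deviations even more
+\end{verbatim}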
In chapter~\ref{maximumlikelihoodchapter} we show that minimizing the -mean square error is equivalent to maximizing the likelihood that the +mean squared error is equivalent to maximizing the likelihood that the observations originate from the model, if the data are normally distributed around the model prediction. -\begin{exercise}{meanSquaredErrorLine.m}{}\label{mseexercise}% - Given a vector of observations \varcode{y} and a vector with the - corresponding predictions \varcode{y\_est}, compute the \emph{mean - square error} between \varcode{y} and \varcode{y\_est} in a single - line of code. +\begin{exercise}{meansquarederrorline.m}{}\label{mseexercise} + Simulate $n=40$ tigers ranging from 2.2 to 3.9\,m in size and store + these sizes in a vector \varcode{x}. Compute the corresponding + predicted weights \varcode{yest} for each tiger according to + \eqnref{cubicfunc} with $c=6$\,\kilo\gram\per\meter\cubed. From the + predictions generate simulated measurements of the tiger's weights + \varcode{y}, by adding normally distributed random numbers to the + predictions scaled to a standard deviation of 50\,\kilo\gram. + + Compute the \emph{mean squared error} between \varcode{y} and + \varcode{yest} in a single line of code. \end{exercise} @@ -110,13 +115,13 @@ can be any function that describes the quality of the fit by mapping the data and the predictions to a single scalar value. \begin{figure}[t] - \includegraphics[width=1\textwidth]{linear_least_squares} - \titlecaption{Estimating the \emph{mean square error}.} {The - deviation error, orange) between the prediction (red - line) and the observations (blue dots) is calculated for each data - point (left). Then the deviations are squared and the aveage is + \includegraphics{cubicerrors} + \titlecaption{Estimating the \emph{mean squared error}.} {The + deviation error (orange) between the prediction (red line) and the + observations (blue dots) is calculated for each data point + (left). Then the deviations are squared and the average is calculated (right).} - \label{leastsquareerrorfig} + \label{cubicerrorsfig} \end{figure} Replacing $y^{est}$ in the mean squared error \eqref{meansquarederror} @@ -139,7 +144,7 @@ Fehler!kleinster]{Methode der kleinsten Quadrate}). contains the measurements $y$ for each value of $x$. The third argument is a 2-element vector that contains the values of parameters \varcode{m} and \varcode{b}. The function returns the - mean square error. + mean squared error. \end{exercise} @@ -359,7 +364,7 @@ distance between the red dots in \figref{gradientdescentfig}) is large. \begin{figure}[t] - \includegraphics[width=0.45\textwidth]{gradient_descent} + \includegraphics{cubicmse} \titlecaption{Gradient descent.}{The algorithm starts at an arbitrary position. At each point the gradient is estimated and the position is updated as long as the length of the gradient is