From ca5493624559b9a2efebe9ad79ea1023b49ca9a9 Mon Sep 17 00:00:00 2001
From: Jan Benda
Date: Fri, 18 Dec 2020 23:37:17 +0100
Subject: [PATCH] [regression] finished gradient section

---
 regression/code/meanSquaredGradientCubic.m |  14 ++
 regression/code/plotcubicgradient.m        |   9 +
 regression/lecture/regression-chapter.tex  |  12 +-
 regression/lecture/regression.tex          | 191 ++++++++++-----------
 4 files changed, 115 insertions(+), 111 deletions(-)
 create mode 100644 regression/code/meanSquaredGradientCubic.m
 create mode 100644 regression/code/plotcubicgradient.m

diff --git a/regression/code/meanSquaredGradientCubic.m b/regression/code/meanSquaredGradientCubic.m
new file mode 100644
index 0000000..a7ff6a2
--- /dev/null
+++ b/regression/code/meanSquaredGradientCubic.m
@@ -0,0 +1,14 @@
+function dmsedc = meanSquaredGradientCubic(x, y, c)
+% The gradient of the mean squared error for a cubic relation.
+%
+% Arguments: x, vector of the x-data values
+%            y, vector of the corresponding y-data values
+%            c, the factor for the cubic relation.
+%
+% Returns: the derivative of the mean squared error at c.
+
+  h = 1e-5;    % step size for estimating the derivative
+  mse = meanSquaredErrorCubic(x, y, c);
+  mseh = meanSquaredErrorCubic(x, y, c+h);
+  dmsedc = (mseh - mse)/h;
+end
diff --git a/regression/code/plotcubicgradient.m b/regression/code/plotcubicgradient.m
new file mode 100644
index 0000000..0410b1f
--- /dev/null
+++ b/regression/code/plotcubicgradient.m
@@ -0,0 +1,9 @@
+cs = 2.0:0.1:8.0;
+mseg = zeros(1, length(cs));   % row vector; zeros(length(cs)) would allocate a square matrix
+for i = 1:length(cs)
+  mseg(i) = meanSquaredGradientCubic(x, y, cs(i));
+end
+
+plot(cs, mseg)
+xlabel('c')
+ylabel('gradient')
diff --git a/regression/lecture/regression-chapter.tex b/regression/lecture/regression-chapter.tex
index 8afed44..3983848 100644
--- a/regression/lecture/regression-chapter.tex
+++ b/regression/lecture/regression-chapter.tex
@@ -25,19 +25,11 @@
 \subsection{Start with one-dimensional problem!}
 
 \begin{itemize}
-\item Let's fit a cubic function $y=cx^3$ (weight versus length of a tiger)\\
-\includegraphics[width=0.8\textwidth]{cubicfunc}
-\item Introduce the problem, $c$ is density and form factor
-\item How to generate an artificial data set (refer to simulation chapter)
 \item How to plot a function (do not use the data x values!)
-\item Just the mean square error as a function of the factor c\\
-\includegraphics[width=0.8\textwidth]{cubicerrors}
-\item Also mention the cost function for a straight line
-\item 1-d gradient, NO quiver plot (it is a nightmare to get this right)\\
-\includegraphics[width=0.8\textwidth]{cubicmse}
 \item 1-d gradient descend
-\item Describe in words the n-d problem.
+\item Describe in words the n-d problem (Boltzmann as example?).
 \item Homework is to do the 2d problem with the straight line!
+\item NO quiver plot (it is a nightmare to get this right)
 \end{itemize}
 
 \subsection{2D fit}
diff --git a/regression/lecture/regression.tex b/regression/lecture/regression.tex
index d903a0b..776f745 100644
--- a/regression/lecture/regression.tex
+++ b/regression/lecture/regression.tex
@@ -152,7 +152,7 @@ For each value of the parameter $c$ of the model we can use
 function. The cost function $f_{cost}(c|\{(x_i, y_i)\}|)$ is a
 function $f_{cost}(c)$ that maps the parameter value $c$ to a scalar
 error value. For a given data set we thus can simply plot the cost
-function as a function of $c$ (\figref{cubiccostfig}).
+function as a function of the parameter $c$ (\figref{cubiccostfig}).
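Both of the new scripts above call meanSquaredErrorCubic(), which is not part of this patch. Assuming it is the chapter's mean-squared-error function for the cubic fit, with x and y being data vectors of equal length, a minimal sketch could look like this:

    function mse = meanSquaredErrorCubic(x, y, c)
    % Mean squared error between the data pairs (x_i, y_i) and the cubic y = c*x^3.
      mse = mean((y - c*x.^3).^2);
    end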
 \begin{exercise}{plotcubiccosts.m}{}
   Calculate the mean squared error between the data and the cubic
@@ -181,17 +181,18 @@ automatic optimization process?
 
 The obvious approach would be to calculate the mean squared error for
 a range of parameter values and then find the position of the minimum
-using the \code{min} function. This approach, however has several
-disadvantages: (i) the accuracy of the estimation of the best
+using the \code{min()} function. This approach, however, has several
+disadvantages: (i) The accuracy of the estimation of the best
 parameter is limited by the resolution used to sample the parameter
 space. The coarser the parameters are sampled the less precise is the
 obtained position of the minimum (\figref{cubiccostfig}, right). (ii)
-the range of parameter values might not include the absolute minimum.
-(iii) in particular for functions with more than a single free
+The range of parameter values might not include the absolute minimum.
+(iii) In particular for functions with more than a single free
 parameter it is computationally expensive to calculate the cost
-function for each parameter combination. The number of combinations
-increases exponentially with the number of free parameters. This is
-known as the \enterm{curse of dimensionality}.
+function for each parameter combination at a sufficient
+resolution. The number of combinations increases exponentially with
+the number of free parameters. This is known as the \enterm{curse of
+  dimensionality}.
 
 So we need a different approach. We want a procedure that finds the
 minimum of the cost function with a minimal number of computations and
@@ -219,110 +220,54 @@ to arbitrary precision.
     f'(x) = \frac{{\rm d} f(x)}{{\rm d}x} = \lim\limits_{\Delta x \to 0} \frac{f(x + \Delta x) - f(x)}{\Delta x}
   \end{equation}
   \end{minipage}\vspace{2ex}
-  It is not possible to calculate the exact value of the derivative,
-  \eqnref{derivative}, numerically. The derivative can only be
-  estimated by computing the difference quotient, \eqnref{difffrac}
-  using sufficiently small $\Delta x$.
+  It is not possible to calculate the exact value of the derivative
+  \eqref{derivative} numerically. The derivative can only be estimated
+  by computing the difference quotient \eqref{difffrac} using
+  sufficiently small $\Delta x$.
 \end{ibox}
 
-\begin{ibox}[tp]{\label{partialderivativebox}Partial derivative and gradient}
-  Some functions that depend on more than a single variable:
-  \[ z = f(x,y) \]
-  for example depends on $x$ and $y$. Using the partial derivative
-  \[ \frac{\partial f(x,y)}{\partial x} = \lim\limits_{\Delta x \to 0} \frac{f(x + \Delta x,y) - f(x,y)}{\Delta x} \]
-  and
-  \[ \frac{\partial f(x,y)}{\partial y} = \lim\limits_{\Delta y \to 0} \frac{f(x, y + \Delta y) - f(x,y)}{\Delta y} \]
-  one can estimate the slope in the direction of the variables
-  individually by using the respective difference quotient
-  (Box~\ref{differentialquotientbox}).
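The brute-force strategy described above takes only a few lines. The following sketch is an illustration only and not part of the patch; it assumes the data vectors x and y and the meanSquaredErrorCubic() function of the chapter, and the sampled range of c values is an arbitrary choice:

    cs = 2.0:0.1:8.0;              % sampled values of the parameter c
    mses = zeros(1, length(cs));
    for i = 1:length(cs)
      mses(i) = meanSquaredErrorCubic(x, y, cs(i));  % cost at each sampled c
    end
    [minmse, mini] = min(mses);    % smallest error and its index
    cest = cs(mini);               % best parameter on this grid

The estimate cest is only as good as the sampling of the parameter range, which is exactly disadvantage (i) discussed above.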
-  \vspace{1ex}
-
-  \begin{minipage}[t]{0.5\textwidth}
-    \mbox{}\\[-2ex]
-    \includegraphics[width=1\textwidth]{gradient}
-  \end{minipage}
-  \hfill
-  \begin{minipage}[t]{0.46\textwidth}
-    For example, the partial derivatives of
-    \[ f(x,y) = x^2+y^2 \] are
-    \[ \frac{\partial f(x,y)}{\partial x} = 2x \; , \quad \frac{\partial f(x,y)}{\partial y} = 2y \; .\]
-
-    The gradient is a vector that is constructed from the partial derivatives:
-    \[ \nabla f(x,y) = \left( \begin{array}{c} \frac{\partial f(x,y)}{\partial x} \\[1ex] \frac{\partial f(x,y)}{\partial y} \end{array} \right) \]
-    This vector points into the direction of the strongest ascend of
-    $f(x,y)$.
-  \end{minipage}
-
-  \vspace{0.5ex} The figure shows the contour lines of a bi-variate
-  Gaussian $f(x,y) = \exp(-(x^2+y^2)/2)$ and the gradient (thick
-  arrows) and the corresponding two partial derivatives (thin arrows)
-  for three different locations.
-\end{ibox}
-
-
 \section{Gradient}
-Imagine to place a small ball at some point on the error surface
-\figref{errorsurfacefig}. Naturally, it would roll down the steepest
-slope and eventually stop at the minimum of the error surface (if it had no
-inertia). We will use this picture to develop an algorithm to find our
-way to the minimum of the objective function. The ball will always
-follow the steepest slope. Thus we need to figure out the direction of
-the steepest slope at the position of the ball.
-
-The \entermde{Gradient}{gradient} (Box~\ref{partialderivativebox}) of the
-objective function is the vector
+Imagine placing a ball at some point on the cost function
+(\figref{cubiccostfig}). Naturally, it would roll down the slope and
+eventually stop at the minimum of the cost function (if it had no
+inertia). We will use this analogy to develop an algorithm to find our
+way to the minimum of the cost function. The ball always follows the
+steepest slope. Thus we need to figure out the direction of the slope
+at the position of the ball.
+
+In our one-dimensional example of a single free parameter the slope is
+simply the derivative of the cost function with respect to the
+parameter $c$. This derivative is called the
+\entermde{Gradient}{gradient} of the cost function:
 \begin{equation}
-  \label{gradient}
-  \nabla f_{cost}(m,b) = \left( \frac{\partial f(m,b)}{\partial m},
-  \frac{\partial f(m,b)}{\partial b} \right)
+  \label{costderivative}
+  \nabla f_{cost}(c) = \frac{{\rm d} f_{cost}(c)}{{\rm d} c}
 \end{equation}
-that points to the strongest ascend of the objective function. The
-gradient is given by partial derivatives
-(Box~\ref{partialderivativebox}) of the mean squared error with
-respect to the parameters $m$ and $b$ of the straight line. There is
-no need to calculate it analytically because it can be estimated from
-the partial derivatives using the difference quotient
-(Box~\ref{differentialquotientbox}) for small steps $\Delta m$ and
-$\Delta b$. For example, the partial derivative with respect to $m$
-can be computed as
+There is no need to calculate this derivative analytically, because it
+can be approximated numerically by the difference quotient
+(Box~\ref{differentialquotientbox}) for small steps $\Delta c$:
 \begin{equation}
-  \frac{\partial f_{cost}(m,b)}{\partial m} = \lim\limits_{\Delta m \to
-    0} \frac{f_{cost}(m + \Delta m, b) - f_{cost}(m,b)}{\Delta m}
-\approx \frac{f_{cost}(m + \Delta m, b) - f_{cost}(m,b)}{\Delta m} \; .
+  \frac{{\rm d} f_{cost}(c)}{{\rm d} c} =
+  \lim\limits_{\Delta c \to 0} \frac{f_{cost}(c + \Delta c) - f_{cost}(c)}{\Delta c}
+  \approx \frac{f_{cost}(c + \Delta c) - f_{cost}(c)}{\Delta c}
 \end{equation}
-The length of the gradient indicates the steepness of the slope
-(\figref{gradientquiverfig}). Since want to go down the hill, we
-choose the opposite direction.
-
-
-\begin{figure}[t]
-  \includegraphics[width=0.75\textwidth]{error_gradient}
-  \titlecaption{Gradient of the error surface.} {Each arrow points
-    into the direction of the greatest ascend at different positions
-    of the error surface shown in \figref{errorsurfacefig}. The
-    contour lines in the background illustrate the error surface. Warm
-    colors indicate high errors, colder colors low error values. Each
-    contour line connects points of equal
-    error.}\label{gradientquiverfig}
-\end{figure}
-
-\begin{exercise}{meanSquaredGradient.m}{}\label{gradientexercise}%
-  Implement a function \varcode{meanSquaredGradient()}, that takes the
-  $x$- and $y$-data and the set of parameters $(m, b)$ of a straight
-  line as a two-element vector as input arguments. The function should
-  return the gradient at the position $(m, b)$ as a vector with two
-  elements.
+The derivative is positive for positive slopes. Since we want to go
+down the hill, we choose the opposite direction.
+
+\begin{exercise}{meanSquaredGradientCubic.m}{}
+  Implement a function \varcode{meanSquaredGradientCubic()} that
+  takes the $x$- and $y$-data and the parameter $c$ as input
+  arguments. The function should return the derivative of the mean
+  squared error $f_{cost}(c)$ with respect to $c$ at the position
+  $c$.
 \end{exercise}
 
-\begin{exercise}{errorGradient.m}{}
-  Extend the script of exercises~\ref{errorsurfaceexercise} to plot
-  both the error surface and gradients using the
-  \varcode{meanSquaredGradient()} function from
-  exercise~\ref{gradientexercise}. Vectors in space can be easily
-  plotted using the function \code{quiver()}. Use \code{contour()}
-  instead of \code{surface()} to plot the error surface.
+\begin{exercise}{plotcubicgradient.m}{}
+  Using the \varcode{meanSquaredGradientCubic()} function from the
+  previous exercise, plot the derivative of the cost function as a
+  function of $c$.
 \end{exercise}
 
-
 \section{Gradient descent}
 Finally, we are able to implement the optimization itself. By now it
 should be obvious why it is called the gradient descent method. All
@@ -381,6 +326,50 @@ large.
 \end{exercise}
 
 
+\begin{ibox}[tp]{\label{partialderivativebox}Partial derivative and gradient}
+  Some functions depend on more than a single variable. For example
+  \[ z = f(x,y) \]
+  depends on both $x$ and $y$. Using the partial derivatives
+  \[ \frac{\partial f(x,y)}{\partial x} = \lim\limits_{\Delta x \to 0} \frac{f(x + \Delta x,y) - f(x,y)}{\Delta x} \]
+  and
+  \[ \frac{\partial f(x,y)}{\partial y} = \lim\limits_{\Delta y \to 0} \frac{f(x, y + \Delta y) - f(x,y)}{\Delta y} \]
+  one can estimate the slope in the direction of each of the variables
+  individually by using the respective difference quotient
+  (Box~\ref{differentialquotientbox}).
+  \vspace{1ex}
+
+  \begin{minipage}[t]{0.5\textwidth}
+    \mbox{}\\[-2ex]
+    \includegraphics[width=1\textwidth]{gradient}
+  \end{minipage}
+  \hfill
+  \begin{minipage}[t]{0.46\textwidth}
+    For example, the partial derivatives of
+    \[ f(x,y) = x^2+y^2 \] are
+    \[ \frac{\partial f(x,y)}{\partial x} = 2x \; , \quad \frac{\partial f(x,y)}{\partial y} = 2y \; .\]
+
+    The gradient is the vector constructed from the partial derivatives:
+    \[ \nabla f(x,y) = \left( \begin{array}{c} \frac{\partial f(x,y)}{\partial x} \\[1ex] \frac{\partial f(x,y)}{\partial y} \end{array} \right) \]
+    This vector points in the direction of the steepest ascent of
+    $f(x,y)$.
+  \end{minipage}
+
+  \vspace{0.5ex} The figure shows the contour lines of a bi-variate
+  Gaussian $f(x,y) = \exp(-(x^2+y^2)/2)$ and the gradient (thick
+  arrows) and the corresponding two partial derivatives (thin arrows)
+  for three different locations.
+\end{ibox}
+
+The \entermde{Gradient}{gradient} (Box~\ref{partialderivativebox}) of the
+objective function is the vector
+\begin{equation}
+  \label{gradient}
+  \nabla f_{cost}(m,b) = \left( \frac{\partial f(m,b)}{\partial m},
+  \frac{\partial f(m,b)}{\partial b} \right)
+\end{equation}
+that points in the direction of the steepest ascent of the objective
+function. The gradient is given by the partial derivatives
+(Box~\ref{partialderivativebox}) of the mean squared error with
+respect to the parameters $m$ and $b$ of the straight line.
 
 \section{Summary}
 The gradient descent is an important numerical method for solving
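The gradient descent loop itself is only referenced in this excerpt of the patch. As a rough sketch, not the chapter's implementation, a one-dimensional descent on the parameter c could look like this; it assumes x, y, and meanSquaredGradientCubic() from above, and the update factor epsilon and the termination threshold are illustrative choices:

    c = 2.0;                         % arbitrary starting value of the parameter
    epsilon = 0.0001;                % factor scaling the parameter updates
    gradient = meanSquaredGradientCubic(x, y, c);
    while abs(gradient) > 0.1        % stop once the slope is sufficiently flat
      c = c - epsilon*gradient;      % step downhill, i.e. against the gradient
      gradient = meanSquaredGradientCubic(x, y, c);
    end

Because each update is proportional to the gradient, the steps automatically become smaller as the minimum is approached.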
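For the two-dimensional homework problem, the straight line y = m*x + b, the same difference-quotient idea yields both components of the gradient. The following sketch only illustrates how such an estimate could be computed; the function name meanSquaredGradientLine() and the inlined cost expressions are hypothetical and not part of the chapter's code:

    function gradmse = meanSquaredGradientLine(x, y, m, b)
    % Numerically estimate the gradient of the mean squared error of the
    % straight line y = m*x + b with respect to the parameters m and b.
      h = 1e-5;                            % step size for the difference quotients
      mse  = mean((y - (m*x + b)).^2);     % error at (m, b)
      msem = mean((y - ((m+h)*x + b)).^2); % error at (m+h, b)
      mseb = mean((y - (m*x + b + h)).^2); % error at (m, b+h)
      gradmse = [(msem - mse)/h, (mseb - mse)/h];
    end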