From 891515caf86e1e6eb319bf0df5f28993ef6b18a9 Mon Sep 17 00:00:00 2001
From: Jan Benda
Date: Sat, 19 Dec 2020 13:41:28 +0100
Subject: [PATCH] [regression] first part n-dim minimization

---
 regression/code/plotgradientdescentcubic.m |  9 +-
 regression/lecture/regression.tex          | 96 ++++++++++++++++++++--
 2 files changed, 93 insertions(+), 12 deletions(-)

diff --git a/regression/code/plotgradientdescentcubic.m b/regression/code/plotgradientdescentcubic.m
index 4972a92..6ac54af 100644
--- a/regression/code/plotgradientdescentcubic.m
+++ b/regression/code/plotgradientdescentcubic.m
@@ -1,4 +1,4 @@
-meansquarederrorline % generate data
+meansquarederrorline; % generate data
 
 c0 = 2.0;
 eps = 0.0001;
@@ -21,9 +21,8 @@ hold on;
 % generate x-values for plotting the fit:
 xx = min(x):0.01:max(x);
 yy = cest * xx.^3;
-plot(xx, yy, 'displayname', 'fit');
-plot(x, y, 'o', 'displayname', 'data'); % plot original data
+plot(xx, yy);
+plot(x, y, 'o'); % plot original data
 xlabel('Size [m]');
 ylabel('Weight [kg]');
-legend("location", "northwest");
-pause
+legend('fit', 'data', 'location', 'northwest');
diff --git a/regression/lecture/regression.tex b/regression/lecture/regression.tex
index 6e9c591..472f3c6 100644
--- a/regression/lecture/regression.tex
+++ b/regression/lecture/regression.tex
@@ -254,7 +254,7 @@ can be approximated numerically by the difference quotient
 The derivative is positive for positive slopes. Since we want to go
 down the hill, we choose the opposite direction.
 
-\begin{exercise}{meanSquaredGradientCubic.m}{}
+\begin{exercise}{meanSquaredGradientCubic.m}{}\label{gradientcubic}
   Implement a function \varcode{meanSquaredGradientCubic()}, that
   takes the $x$- and $y$-data and the parameter $c$ as input
   arguments. The function should return the derivative of the mean
@@ -312,12 +312,12 @@ descent works as follows:
 \item Repeat steps \ref{computegradient} -- \ref{gradientstep}.
 \end{enumerate}
 
-\Figref{gradientdescentcubicfig} illustrates the gradient descent --- the
-path the imaginary ball has chosen to reach the minimum. We walk along
-the parameter axis against the gradient as long as the gradient
+\Figref{gradientdescentcubicfig} illustrates the gradient descent ---
+the path the imaginary ball has chosen to reach the minimum. We walk
+along the parameter axis against the gradient as long as the gradient
 differs sufficiently from zero. At steep slopes we take large steps
-(the distance between the red dots in \figref{gradientdescentcubicfig}) is
-large.
+(the distance between the red dots in \figref{gradientdescentcubicfig}
+is large).
 
 \begin{exercise}{gradientDescentCubic.m}{}
   Implement the gradient descent algorithm for the problem of fitting
@@ -339,11 +339,78 @@ large.
   squared errors as a function of iteration step (two plots). Compare
   the result of the gradient descent method with the true value of $c$
   used to simulate the data. Inspect the plots and adapt $\epsilon$
-  and the threshold to make the algorithm behave as intended. Also
+  and the threshold to make the algorithm behave as intended. Finally
   plot the data together with the best fitting cubic relation
   \eqref{cubicfunc}.
 \end{exercise}
 
+The $\epsilon$ parameter in \eqnref{gradientdescent} is critical. If
+it is too large, the algorithm does not converge to the minimum of the
+cost function (try it!). At medium values it oscillates around the
+minimum but might nevertheless converge. Only for sufficiently small
+values (here $\epsilon = 0.0001$) does the algorithm follow the slope
+downwards towards the minimum.
+
+The terminating condition on the absolute value of the gradient
+influences how often the cost function is evaluated. The smaller the
+threshold value, the more often the cost is computed and the more
+precisely the fit parameter is estimated.
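The loop at the heart of the \varcode{gradientDescentCubic.m} exercise might be sketched as follows. This is a minimal sketch, not the reference solution; the starting value, $\epsilon$, and the threshold of 0.1 are assumptions to be adapted, and it relies on \varcode{x}, \varcode{y} from \varcode{meansquarederrorline} and on \varcode{meanSquaredGradientCubic()} from the earlier exercise:

```matlab
c = 2.0;       % starting value for the parameter (assumption)
eps = 0.0001;  % learning rate epsilon (assumption)
cs = [c];      % store all parameter values for later plotting
gradient = meanSquaredGradientCubic(x, y, c);
while abs(gradient) > 0.1        % termination condition (assumption)
    c = c - eps * gradient;      % walk against the gradient
    gradient = meanSquaredGradientCubic(x, y, c);
    cs(end+1) = c;               % keep track of the descent
end
```

The vector \varcode{cs} then holds the path of the descent, which can be plotted against the iteration number as asked for in the exercise.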
+If it is too small, however, the increase in precision is negligible,
+in particular in comparison to the increased computational effort.
+Have a look at the derivatives that we plotted in
+exercise~\ref{gradientcubic} and decide on a sensible value for the
+threshold. Run the gradient descent algorithm and check how the
+resulting $c$ parameter values converge and how many iterations are
+needed. Then reduce the threshold (by factors of ten) and check how
+this changes the results.
+
+Many modern algorithms for finding the minimum of a function are based
+on the basic idea of gradient descent. Luckily, these algorithms
+choose $\epsilon$ in a smart, adaptive way and they also come up with
+sensible default values for the termination condition. On the other
+hand, these algorithms often take optional arguments that let you
+control how they behave. Now you know what this is all about.
+
+\section{N-dimensional minimization problems}
+
+So far we have been concerned with finding the right value of a
+single parameter that minimizes a cost function. The gradient descent
+method for such one-dimensional problems seems a bit like overkill.
+However, we often deal with functions that have more than a single
+parameter, in general $n$ parameters. We then need to find the
+minimum in an $n$-dimensional parameter space.
+
+For our tiger problem, we could have also fitted the exponent $\alpha$
+of the power-law relation between size and weight, instead of assuming
+a cubic relation:
+\begin{equation}
+  \label{powerfunc}
+  y = f(x; c, \alpha) = f(x; \vec p) = c \cdot x^\alpha
+\end{equation}
+We could then check whether the resulting estimate of the exponent
+$\alpha$ is indeed close to the expected power of three. The power
+law \eqref{powerfunc} has two free parameters $c$ and $\alpha$.
+Instead of a single parameter we are now dealing with a vector $\vec
+p$ containing $n$ parameter values. Here, $\vec p = (c, \alpha)$.
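As a concrete sketch of how a cost function takes a parameter vector as its argument, the mean squared error of the power law \eqref{powerfunc} with $\vec p = (c, \alpha)$ could be implemented like this (the function name is made up for illustration):

```matlab
function mse = meanSquaredErrorPower(p, x, y)
    % Mean squared error of the power law y = p(1)*x.^p(2)
    % for the parameter vector p = (c, alpha) and data x, y.
    mse = mean((y - p(1) * x.^p(2)).^2);
end
```

Note that only the model prediction inside the mean changed; the structure of the cost function is the same as in the one-dimensional case.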
+Luckily, all the concepts we introduced using the example of the
+one-dimensional problem of the tiger weights generalize to
+$n$-dimensional problems. We only need to adapt a few things. The cost
+function for the mean squared error reads
+\begin{equation}
+  \label{ndimcostfunc}
+  f_{cost}(\vec p|\{(x_i, y_i)\}) = \frac{1}{N} \sum_{i=1}^N (y_i - f(x_i;\vec p))^2
+\end{equation}
+
+For two-dimensional problems the graph of the cost function is an
+\enterm{error surface} (\determ{{Fehlerfl\"ache}}). The two parameters
+span a two-dimensional plane. The cost function assigns a single
+value to each parameter combination on this plane. This results in a
+landscape over the parameter plane with mountains and valleys, and we
+are searching for the bottom of the deepest valley.
+
+When we place a ball somewhere on the slopes of a hill, it rolls
+downwards and eventually stops at the bottom. The ball always rolls in
+the direction of the steepest slope.
+
 \begin{ibox}[tp]{\label{partialderivativebox}Partial derivative and gradient}
   Some functions that depend on more than a single variable:
   \[ z = f(x,y) \]
@@ -388,6 +455,21 @@ that points to the strongest ascend of the objective function. The
 gradient is given by partial derivatives
 (Box~\ref{partialderivativebox}) of the mean squared error with
 respect to the parameters $m$ and $b$ of the straight line.
+
+For example, suppose you measure the response of a passive membrane
+to a current step and you want to estimate the membrane time
+constant. Then you need to fit an exponential function
+\begin{equation}
+  \label{expfunc}
+  V(t; \tau, \Delta V, V_{\infty}) = \Delta V e^{-t/\tau} + V_{\infty}
+\end{equation}
+with three free parameters $\tau$, $\Delta V$, $V_{\infty}$ to the
+measured time course of the membrane potential $V(t)$. The $(x_i,y_i)$
+data pairs are the sampling times $t_i$ and the corresponding
+measurements of the membrane potential $V_i$.
+
 \section{Summary}
 
 The gradient descent is an important numerical method for solving