From 46affef86d61bd4ae301979bb46ba326a8e676c2 Mon Sep 17 00:00:00 2001
From: Jan Benda <jan.benda@uni-tuebingen.de>
Date: Sat, 19 Dec 2020 22:56:09 +0100
Subject: [PATCH] [regression] finished main text and exercises

---
 regression/code/gradientDescent.m          |  66 ++++++----
 regression/code/gradientDescentPower.m     |  41 ------
 regression/code/plotgradientdescentpower.m |   4 +-
 regression/lecture/regression-chapter.tex  |  74 -----------
 regression/lecture/regression.tex          | 141 +++++++++++++--------
 5 files changed, 128 insertions(+), 198 deletions(-)
 delete mode 100644 regression/code/gradientDescentPower.m

diff --git a/regression/code/gradientDescent.m b/regression/code/gradientDescent.m
index 1919035..48b976c 100644
--- a/regression/code/gradientDescent.m
+++ b/regression/code/gradientDescent.m
@@ -1,32 +1,42 @@
-% x, y from exercise 8.3
+function [p, ps, mses] = gradientDescent(x, y, func, p0, epsilon, threshold)
+% Gradient descent for fitting a function to data pairs.
+%
+% Arguments: x, vector of the x-data values.
+%            y, vector of the corresponding y-data values.
+%            func, function handle func(x, p)
+%            p0, vector with initial parameter values
+%            epsilon: factor multiplying the gradient.
+%            threshold: minimum value for gradient
+%
+% Returns:   p, vector with the final parameter values.
+%            ps: 2D-vector with all the parameter vectors traversed.
+%            mses: vector with the corresponding mean squared errors
 
-% some arbitrary values for the slope and the intercept to start with:
-position = [-2.0, 10.0];  
+  p = p0;
+  gradient = ones(1, length(p0)) * 1000.0;
+  ps = [];
+  mses = [];
+  while norm(gradient) > threshold
+      ps = [ps, p(:)];
+      mses = [mses, meanSquaredError(x, y, func, p)];
+      gradient = meanSquaredGradient(x, y, func, p);
+      p = p - epsilon * gradient;
+  end
+end
+
+function mse = meanSquaredError(x, y, func, p)
+    mse = mean((y - func(x, p)).^2);
+end
 
-% gradient descent:
-gradient = [];
-errors = [];
-count = 1;
-eps = 0.0001;
-while isempty(gradient) || norm(gradient) > 0.1
-    gradient = meanSquaredGradient(x, y, position);
-    errors(count) = meanSquaredError(x, y, position);
-    position = position - eps .* gradient; 
-    count = count + 1;
+function gradmse = meanSquaredGradient(x, y, func, p)
+  gradmse = zeros(size(p, 1), size(p, 2));
+  h = 1e-5;               % stepsize for derivatives
+  mse = meanSquaredError(x, y, func, p);
+  for i = 1:length(p)     % for each coordinate ...
+    pi = p;
+    pi(i) = pi(i) + h;    %   displace i-th parameter
+    msepi = meanSquaredError(x, y, func, pi);
+    gradmse(i) = (msepi - mse)/h;
+  end
 end
 
-figure()
-subplot(2,1,1)
-hold on
-scatter(x, y, 'displayname', 'data')
-xx = min(x):0.01:max(x);
-yy = position(1).*xx + position(2);
-plot(xx, yy, 'displayname', 'fit')
-xlabel('Input')
-ylabel('Output')
-grid on
-legend show
-subplot(2,1,2)
-plot(errors)
-xlabel('optimization steps')
-ylabel('error')
diff --git a/regression/code/gradientDescentPower.m b/regression/code/gradientDescentPower.m
deleted file mode 100644
index 31390eb..0000000
--- a/regression/code/gradientDescentPower.m
+++ /dev/null
@@ -1,41 +0,0 @@
-function [p, ps, mses] = gradientDescentPower(x, y, p0, epsilon, threshold)
-% Gradient descent for fitting a power-law.
-%
-% Arguments: x, vector of the x-data values.
-%            y, vector of the corresponding y-data values.
-%            p0, vector with initial values for c and alpha. 
-%            epsilon: factor multiplying the gradient.
-%            threshold: minimum value for gradient
-%
-% Returns:   p, vector with the final parameter values.
-%            ps: 2D-vector with all the parameter tuples traversed.
-%            mses: vector with the corresponding mean squared errors
-
-  p = p0;
-  gradient = ones(1, length(p0)) * 1000.0;
-  ps = [];
-  mses = [];
-  while norm(gradient) > threshold
-      ps = [ps, p(:)];
-      mses = [mses, meanSquaredErrorPower(x, y, p)];
-      gradient = meanSquaredGradientPower(x, y, p);
-      p = p - epsilon * gradient;
-  end
-end
-
-function mse = meanSquaredErrorPower(x, y, p)
-    mse = mean((y - p(1)*x.^p(2)).^2);
-end
-
-function gradmse = meanSquaredGradientPower(x, y, p)
-  gradmse = zeros(size(p, 1), size(p, 2));
-  h = 1e-5;               % stepsize for derivatives
-  mse = meanSquaredErrorPower(x, y, p);
-  for i = 1:length(p)     % for each coordinate ...
-    pi = p;
-    pi(i) = pi(i) + h;    %   displace i-th parameter
-    msepi = meanSquaredErrorPower(x, y, pi);
-    gradmse(i) = (msepi - mse)/h;
-  end
-end
-
diff --git a/regression/code/plotgradientdescentpower.m b/regression/code/plotgradientdescentpower.m
index f52bfe7..206e2bb 100644
--- a/regression/code/plotgradientdescentpower.m
+++ b/regression/code/plotgradientdescentpower.m
@@ -3,7 +3,7 @@ meansquarederrorline;               % generate data
 p0 = [2.0, 1.0];
 eps = 0.00001;
 thresh = 50.0;
-[pest, ps, mses] = gradientDescentPower(x, y, p0, eps, thresh);
+[pest, ps, mses] = gradientDescent(x, y, @powerLaw, p0, eps, thresh);
 pest
 
 subplot(2, 2, 1);                   % top left panel
@@ -22,7 +22,7 @@ subplot(1, 2, 2);                   % right panel
 hold on;
 % generate x-values for plottig the fit:
 xx = min(x):0.01:max(x);
-yy = pest(1) * xx.^pest(2);
+yy = powerLaw(xx, pest);
 plot(xx, yy);
 plot(x, y, 'o');                    % plot original data
 xlabel('Size [m]');
diff --git a/regression/lecture/regression-chapter.tex b/regression/lecture/regression-chapter.tex
index 3983848..2ca1ec2 100644
--- a/regression/lecture/regression-chapter.tex
+++ b/regression/lecture/regression-chapter.tex
@@ -23,80 +23,6 @@
 \item Fig 8.2 right: this should be a chi-squared distribution with one degree of freedom!
 \end{itemize}
 
-\subsection{Start with one-dimensional problem!}
-\begin{itemize}
-\item How to plot a function (do not use the data x values!)
-\item 1-d gradient descend
-\item Describe in words the n-d problem (boltzman as example?).
-\item Homework is to do the 2d problem with the straight line!
-\item NO quiver plot (it is a nightmare to get this right)
-\end{itemize}
-
-\subsection{2D fit}
-
-\begin{exercise}{errorSurface.m}{}\label{errorsurfaceexercise}
-  Generate 20 data pairs $(x_i|y_i)$ that are linearly related with
-  slope $m=0.75$ and intercept $b=-40$, using \varcode{rand()} for
-  drawing $x$ values between 0 and 120 and \varcode{randn()} for
-  jittering the $y$ values with a standard deviation of 15.  Then
-  calculate the mean squared error between the data and straight lines
-  for a range of slopes and intercepts using the
-  \varcode{meanSquaredError()} function from the previous exercise.
-  Illustrates the error surface using the \code{surface()} function.
-  Consult the documentation to find out how to use \code{surface()}.
-\end{exercise}
-
-\begin{exercise}{meanSquaredGradient.m}{}\label{gradientexercise}%
-  Implement a function \varcode{meanSquaredGradient()}, that takes the
-  $x$- and $y$-data and the set of parameters $(m, b)$ of a straight
-  line as a two-element vector as input arguments. The function should
-  return the gradient at the position $(m, b)$ as a vector with two
-  elements.
-\end{exercise}
-
-\begin{exercise}{errorGradient.m}{}
-  Extend the script of exercises~\ref{errorsurfaceexercise} to plot
-  both the error surface and gradients using the
-  \varcode{meanSquaredGradient()} function from
-  exercise~\ref{gradientexercise}. Vectors in space can be easily
-  plotted using the function \code{quiver()}. Use \code{contour()}
-  instead of \code{surface()} to plot the error surface.
-\end{exercise}
-
-
-\begin{exercise}{gradientDescent.m}{}
-  Implement the gradient descent for the problem of fitting a straight
-  line to some measured data. Reuse the data generated in
-  exercise~\ref{errorsurfaceexercise}.
-  \begin{enumerate}
-  \item Store for each iteration the error value.
-  \item Plot the error values as a function of the iterations, the
-    number of optimization steps.
-  \item Plot the measured data together with the best fitting straight line.
-  \end{enumerate}\vspace{-4.5ex}
-\end{exercise}
-
-
-\begin{figure}[t]
-  \includegraphics[width=1\textwidth]{lin_regress}\hfill
-  \titlecaption{Example data suggesting a linear relation.}{A set of
-    input signals $x$, e.g. stimulus intensities, were used to probe a
-    system. The system's output $y$ to the inputs are noted
-    (left). Assuming a linear relation between $x$ and $y$ leaves us
-    with 2 parameters, the slope (center) and the intercept with the
-    y-axis (right panel).}\label{linregressiondatafig}
-\end{figure}
-
-\begin{figure}[t]
-  \includegraphics[width=1\textwidth]{linear_least_squares}
-  \titlecaption{Estimating the \emph{mean square error}.}  {The
-    deviation error (orange) between the prediction (red line) and the
-    observations (blue dots) is calculated for each data point
-    (left). Then the deviations are squared and the average is
-    calculated (right).}
-  \label{leastsquareerrorfig}
-\end{figure}
-
 \begin{figure}[t]
   \includegraphics[width=0.75\textwidth]{error_surface}
   \titlecaption{Error surface.}{The two model parameters $m$ and $b$
diff --git a/regression/lecture/regression.tex b/regression/lecture/regression.tex
index 69e7b05..36e1808 100644
--- a/regression/lecture/regression.tex
+++ b/regression/lecture/regression.tex
@@ -479,49 +479,73 @@ the sum of the squared partial derivatives:
 \end{equation}
 The \code{norm()} function implements this.
 
-\begin{exercise}{gradientDescentPower.m}{}
+
+\section{Passing a function as an argument to another function}
+
+So far, all our code for the gradient descent algorithm was tailored
+to a specific function, the cubic relation \eqref{cubicfunc}. It would
+be much better if we could pass an arbitrary function to our gradient
+algorithm. Then we would not need to rewrite it every time anew.
+
+This is possible. We can indeed pass a function as an argument to
+another function. For this use the \code{@}-operator. As an example
+let's define a function that produces a standard plot for a function:
+
+\pageinputlisting[caption={Example function taking a function as argument.}]{funcPlotter.m}
+
+This function can then be used as follows for plotting a sine wave. We
+pass the built in \varcode{sin()} function as \varcode{@sin} as an
+argument to our function:
+
+\pageinputlisting[caption={Passing a function handle as an argument to a function.}]{funcplotterexamples.m}
+
+
+\section{Gradient descent algorithm for arbitrary functions}
+
+Now we are ready to adapt the gradient descent algorithm from
+exercise~\ref{gradientdescentexercise} to arbitrary functions with $n$
+parameters that we want to fit to some data.
+
+\begin{exercise}{gradientDescent.m}{}
   Adapt the function \varcode{gradientDescentCubic()} from
   exercise~\ref{gradientdescentexercise} to implement the gradient
-  descent algorithm for the power law \eqref{powerfunc}.  The new
-  function takes a two element vector $(c,\alpha)$ for the initial
-  parameter values and also returns the best parameter combination as
-  a two-element vector. Use a \varcode{for} loop over the two
-  dimensions for computing the gradient.
+  descent algorithm for any function \varcode{func(x, p)} that takes
+  as first argument the $x$-data values and as second argument a
+  vector with parameter values.  The new function takes a vector $\vec
+  p_0$ for the initial parameter values and also returns the best
+  parameter combination as a vector. Use a \varcode{for} loop over the
+  two dimensions for computing the gradient.
 \end{exercise}
 
+For testing our new function we need to implement the power law
+\eqref{powerfunc}:
+
+\begin{exercise}{powerLaw.m}{}
+  Write a function that implements \eqref{powerfunc}. The function
+  gets as arguments a vector $x$ containing the $x$-data values and
+  another vector containing as elements the parameters for the power
+  law, i.e. the factor $c$ and the power $\alpha$. It returns a vector
+  with the computed $y$ values for each $x$ value. 
+\end{exercise}
+
+Now let's use the new gradient descent function to fit a power law to
+our tiger data-set:
+
 \begin{exercise}{plotgradientdescentpower.m}{}
-  Use the function \varcode{gradientDescentPower()} to fit the
-  simulated data from exercise~\ref{mseexercise}. Plot the returned
-  values of the two parameters against each other. Compare the result
-  of the gradient descent method with the true values of $c$ and
-  $\alpha$ used to simulate the data. Observe the norm of the gradient
-  and inspect the plots to adapt $\epsilon$ (smaller than in
+  Use the function \varcode{gradientDescent()} to fit the
+  \varcode{powerLaw()} function to the simulated data from
+  exercise~\ref{mseexercise}. Plot the returned values of the two
+  parameters against each other. Compare the result of the gradient
+  descent method with the true values of $c$ and $\alpha$ used to
+  simulate the data. Observe the norm of the gradient and inspect the
+  plots to adapt $\epsilon$ (smaller than in
   exercise~\ref{plotgradientdescentexercise}) and the threshold (much
   larger) appropriately. Finally plot the data together with the best
   fitting power-law \eqref{powerfunc}.
 \end{exercise}
 
 
-\section{Curve fit for arbitrary functions}
-
-So far, all our code for the gradient descent algorithm was tailored
-to a specific function, the cubic relation \eqref{cubicfunc} or the
-power law \eqref{powerfunc}.
-
-\section{XXX}
-For example, you measure the response of a passive membrane to a
-current step and you want to estimate membrane the time constant. Then you
-need to fit an exponential function
-\begin{equation}
-  \label{expfunc}
-  V(t; \tau, \Delta V, V_{\infty}) = \Delta V e^{-t/\tau} + V_{\infty}
-\end{equation}
-with three free parameters $\tau$, $\Delta y$, $y_{\infty}$ to the
-measured time course of the membrane potential $V(t)$. The $(x_i,y_i)$
-data pairs are the sampling times $t_i$ and the corresponding
-measurements of the membrane potential $V_i$.
-
-\section{Summary}
+\section{Fitting non-linear functions to data}
 
 The gradient descent is an important numerical method for solving
 optimization problems. It is used to find the global minimum of an
@@ -530,33 +554,44 @@ objective function.
 Curve fitting is a common application for the gradient descent method.
 For the case of fitting straight lines to data pairs, the error
 surface (using the mean squared error) has exactly one clearly defined
-global minimum. In fact, the position of the minimum can be analytically
-calculated as shown in the next chapter.
-
-Problems that involve nonlinear computations on parameters, e.g. the
-rate $\lambda$ in an exponential function $f(x;\lambda) = e^{\lambda
-  x}$, do not have an analytical solution for the least squares. To
-find the least squares for such functions numerical methods such as
-the gradient descent have to be applied.
-
-The suggested gradient descent algorithm can be improved in multiple
-ways to converge faster.  For example one could adapt the step size to
-the length of the gradient. These numerical tricks have already been
-implemented in pre-defined functions. Generic optimization functions
-such as \matlabfun{fminsearch()} have been implemented for arbitrary
-objective functions, while the more specialized function
-\matlabfun{lsqcurvefit()} i specifically designed for optimizations in
-the least square error sense.
-
-%\newpage
+global minimum. In fact, the position of the minimum can be
+analytically calculated as shown in the next chapter. For linear
+fitting problems numerical methods like the gradient descent are not
+needed.
+
+Fitting problems that involve nonlinear functions of the parameters,
+e.g. the power law \eqref{powerfunc} or the exponential function
+$f(x;\lambda) = e^{\lambda x}$, do not have an analytical solution for
+the least squares. To find the least squares for such functions
+numerical methods such as the gradient descent have to be applied.
+
+The suggested gradient descent algorithm is quite fragile and requires
+manually tuned values for $\epsilon$ and the threshold for terminating
+the iteration.  The algorithm can be improved in multiple ways to
+converge more robustly and faster.  For example one could adapt the
+step size to the length of the gradient. These numerical tricks have
+already been implemented in pre-defined functions. Generic
+optimization functions such as \mcode{fminsearch()} have been
+implemented for arbitrary objective functions, while the more
+specialized function \mcode{lsqcurvefit()} is specifically designed
+for optimizations in the least square error sense.
+
+\begin{exercise}{plotlsqcurvefitpower.m}{}
+  Use the \matlab-function \varcode{lsqcurvefit()} instead of
+  \varcode{gradientDescent()} to fit the \varcode{powerLaw()} function
+  to the simulated data from exercise~\ref{mseexercise}. Plot the data
+  and the resulting best fitting power law function.
+\end{exercise}
+
+
 \begin{important}[Beware of secondary minima!]
   Finding the absolute minimum is not always as easy as in the case of
-  fitting a straight line. Often, the error surface has secondary or
+  fitting a straight line. Often, the cost function has secondary or
   local minima in which the gradient descent stops even though there
   is a more optimal solution, a global minimum that is lower than the
   local minimum. Starting from good initial positions is a good
   approach to avoid getting stuck in local minima. Also keep in mind
-  that error surfaces tend to be simpler (less local minima) the fewer
+  that cost functions tend to be simpler (less local minima) the fewer
   parameters are fitted from the data. Each additional parameter
   increases complexity and is computationally more expensive.
 \end{important}