[regression] finished main text and exercises
parent bfb2f66de2
commit 46affef86d
@@ -1,32 +1,42 @@
-% x, y from exercise 8.3
-% some arbitrary values for the slope and the intercept to start with:
-position = [-2.0, 10.0];
-
-% gradient descent:
-gradient = [];
-errors = [];
-count = 1;
-eps = 0.0001;
-while isempty(gradient) || norm(gradient) > 0.1
-  gradient = meanSquaredGradient(x, y, position);
-  errors(count) = meanSquaredError(x, y, position);
-  position = position - eps .* gradient;
-  count = count + 1;
-end
-
-figure()
-subplot(2,1,1)
-hold on
-scatter(x, y, 'displayname', 'data')
-xx = min(x):0.01:max(x);
-yy = position(1).*xx + position(2);
-plot(xx, yy, 'displayname', 'fit')
-xlabel('Input')
-ylabel('Output')
-grid on
-legend show
-subplot(2,1,2)
-plot(errors)
-xlabel('optimization steps')
-ylabel('error')
+function [p, ps, mses] = gradientDescent(x, y, func, p0, epsilon, threshold)
+% Gradient descent for fitting a function to data pairs.
+%
+% Arguments: x, vector of the x-data values.
+%            y, vector of the corresponding y-data values.
+%            func, function handle func(x, p)
+%            p0, vector with initial parameter values
+%            epsilon: factor multiplying the gradient.
+%            threshold: minimum value for gradient
+%
+% Returns: p, vector with the final parameter values.
+%          ps: 2D-vector with all the parameter vectors traversed.
+%          mses: vector with the corresponding mean squared errors
+
+  p = p0;
+  gradient = ones(1, length(p0)) * 1000.0;
+  ps = [];
+  mses = [];
+  while norm(gradient) > threshold
+    ps = [ps, p(:)];
+    mses = [mses, meanSquaredError(x, y, func, p)];
+    gradient = meanSquaredGradient(x, y, func, p);
+    p = p - epsilon * gradient;
+  end
+end
+
+function mse = meanSquaredError(x, y, func, p)
+  mse = mean((y - func(x, p)).^2);
+end
+
+function gradmse = meanSquaredGradient(x, y, func, p)
+  gradmse = zeros(size(p, 1), size(p, 2));
+  h = 1e-5; % stepsize for derivatives
+  mse = meanSquaredError(x, y, func, p);
+  for i = 1:length(p) % for each coordinate ...
+    pi = p;
+    pi(i) = pi(i) + h; % displace i-th parameter
+    msepi = meanSquaredError(x, y, func, pi);
+    gradmse(i) = (msepi - mse)/h;
+  end
+end
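As a usage sketch (not part of this commit; x, y and the numerical values are assumed, taken from the old script above for illustration), the rewritten gradientDescent() can fit a straight line by passing an anonymous function handle:

  lineFunc = @(x, p) p(1)*x + p(2);   % straight line with slope p(1) and intercept p(2)
  p0 = [-2.0, 10.0];                  % initial slope and intercept
  [pest, ps, mses] = gradientDescent(x, y, lineFunc, p0, 0.0001, 0.1);
  plot(mses)                          % the mean squared error should decrease over the iterations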
@@ -1,41 +0,0 @@
-function [p, ps, mses] = gradientDescentPower(x, y, p0, epsilon, threshold)
-% Gradient descent for fitting a power-law.
-%
-% Arguments: x, vector of the x-data values.
-%            y, vector of the corresponding y-data values.
-%            p0, vector with initial values for c and alpha.
-%            epsilon: factor multiplying the gradient.
-%            threshold: minimum value for gradient
-%
-% Returns: p, vector with the final parameter values.
-%          ps: 2D-vector with all the parameter tuples traversed.
-%          mses: vector with the corresponding mean squared errors
-
-  p = p0;
-  gradient = ones(1, length(p0)) * 1000.0;
-  ps = [];
-  mses = [];
-  while norm(gradient) > threshold
-    ps = [ps, p(:)];
-    mses = [mses, meanSquaredErrorPower(x, y, p)];
-    gradient = meanSquaredGradientPower(x, y, p);
-    p = p - epsilon * gradient;
-  end
-end
-
-function mse = meanSquaredErrorPower(x, y, p)
-  mse = mean((y - p(1)*x.^p(2)).^2);
-end
-
-function gradmse = meanSquaredGradientPower(x, y, p)
-  gradmse = zeros(size(p, 1), size(p, 2));
-  h = 1e-5; % stepsize for derivatives
-  mse = meanSquaredErrorPower(x, y, p);
-  for i = 1:length(p) % for each coordinate ...
-    pi = p;
-    pi(i) = pi(i) + h; % displace i-th parameter
-    msepi = meanSquaredErrorPower(x, y, pi);
-    gradmse(i) = (msepi - mse)/h;
-  end
-end
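The power-law-specific functions deleted above are superseded by the generic gradientDescent() from the first hunk. As the next hunk shows, the same fit is obtained by passing a power-law function handle (here with the same p0, epsilon and threshold arguments as before):

  [pest, ps, mses] = gradientDescent(x, y, @powerLaw, p0, epsilon, threshold);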
@@ -3,7 +3,7 @@ meansquarederrorline; % generate data
 p0 = [2.0, 1.0];
 eps = 0.00001;
 thresh = 50.0;
-[pest, ps, mses] = gradientDescentPower(x, y, p0, eps, thresh);
+[pest, ps, mses] = gradientDescent(x, y, @powerLaw, p0, eps, thresh);
 pest
 
 subplot(2, 2, 1); % top left panel
@@ -22,7 +22,7 @@ subplot(1, 2, 2); % right panel
 hold on;
 % generate x-values for plottig the fit:
 xx = min(x):0.01:max(x);
-yy = pest(1) * xx.^pest(2);
+yy = powerLaw(xx, pest);
 plot(xx, yy);
 plot(x, y, 'o'); % plot original data
 xlabel('Size [m]');
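The updated script assumes a helper powerLaw() implementing the power law. That function is not part of this diff, but from the replaced line yy = pest(1) * xx.^pest(2) a minimal sketch would be:

  function y = powerLaw(x, p)
    % power law y = c * x^alpha with parameters p = [c, alpha]
    y = p(1) * x.^p(2);
  end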
@@ -23,80 +23,6 @@
 \item Fig 8.2 right: this should be a chi-squared distribution with one degree of freedom!
 \end{itemize}
-
-\subsection{Start with one-dimensional problem!}
-\begin{itemize}
-\item How to plot a function (do not use the data x values!)
-\item 1-d gradient descend
-\item Describe in words the n-d problem (boltzman as example?).
-\item Homework is to do the 2d problem with the straight line!
-\item NO quiver plot (it is a nightmare to get this right)
-\end{itemize}
-
-\subsection{2D fit}
-
-\begin{exercise}{errorSurface.m}{}\label{errorsurfaceexercise}
-Generate 20 data pairs $(x_i|y_i)$ that are linearly related with
-slope $m=0.75$ and intercept $b=-40$, using \varcode{rand()} for
-drawing $x$ values between 0 and 120 and \varcode{randn()} for
-jittering the $y$ values with a standard deviation of 15. Then
-calculate the mean squared error between the data and straight lines
-for a range of slopes and intercepts using the
-\varcode{meanSquaredError()} function from the previous exercise.
-Illustrates the error surface using the \code{surface()} function.
-Consult the documentation to find out how to use \code{surface()}.
-\end{exercise}
-
-\begin{exercise}{meanSquaredGradient.m}{}\label{gradientexercise}%
-Implement a function \varcode{meanSquaredGradient()}, that takes the
-$x$- and $y$-data and the set of parameters $(m, b)$ of a straight
-line as a two-element vector as input arguments. The function should
-return the gradient at the position $(m, b)$ as a vector with two
-elements.
-\end{exercise}
-
-\begin{exercise}{errorGradient.m}{}
-Extend the script of exercises~\ref{errorsurfaceexercise} to plot
-both the error surface and gradients using the
-\varcode{meanSquaredGradient()} function from
-exercise~\ref{gradientexercise}. Vectors in space can be easily
-plotted using the function \code{quiver()}. Use \code{contour()}
-instead of \code{surface()} to plot the error surface.
-\end{exercise}
-
-\begin{exercise}{gradientDescent.m}{}
-Implement the gradient descent for the problem of fitting a straight
-line to some measured data. Reuse the data generated in
-exercise~\ref{errorsurfaceexercise}.
-\begin{enumerate}
-\item Store for each iteration the error value.
-\item Plot the error values as a function of the iterations, the
-number of optimization steps.
-\item Plot the measured data together with the best fitting straight line.
-\end{enumerate}\vspace{-4.5ex}
-\end{exercise}
-
-\begin{figure}[t]
-\includegraphics[width=1\textwidth]{lin_regress}\hfill
-\titlecaption{Example data suggesting a linear relation.}{A set of
-input signals $x$, e.g. stimulus intensities, were used to probe a
-system. The system's output $y$ to the inputs are noted
-(left). Assuming a linear relation between $x$ and $y$ leaves us
-with 2 parameters, the slope (center) and the intercept with the
-y-axis (right panel).}\label{linregressiondatafig}
-\end{figure}
-
-\begin{figure}[t]
-\includegraphics[width=1\textwidth]{linear_least_squares}
-\titlecaption{Estimating the \emph{mean square error}.} {The
-deviation error (orange) between the prediction (red line) and the
-observations (blue dots) is calculated for each data point
-(left). Then the deviations are squared and the average is
-calculated (right).}
-\label{leastsquareerrorfig}
-\end{figure}
 
 \begin{figure}[t]
 \includegraphics[width=0.75\textwidth]{error_surface}
 \titlecaption{Error surface.}{The two model parameters $m$ and $b$
@@ -479,49 +479,73 @@ the sum of the squared partial derivatives:
 \end{equation}
 The \code{norm()} function implements this.
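For example (illustrative, not part of the diff), for a gradient with two partial derivatives the following two expressions give the same length:

  g = [3.0, 4.0];
  norm(g)           % returns 5
  sqrt(sum(g.^2))   % the sum of squared partial derivatives, square-rooted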
 
-\begin{exercise}{gradientDescentPower.m}{}
+\section{Passing a function as an argument to another function}
+
+So far, all our code for the gradient descent algorithm was tailored
+to a specific function, the cubic relation \eqref{cubicfunc}. It would
+be much better if we could pass an arbitrary function to our gradient
+algorithm. Then we would not need to rewrite it every time anew.
+
+This is possible. We can indeed pass a function as an argument to
+another function. For this use the \code{@}-operator. As an example
+let's define a function that produces a standard plot for a function:
+
+\pageinputlisting[caption={Example function taking a function as argument.}]{funcPlotter.m}
+
+This function can then be used as follows for plotting a sine wave. We
+pass the built in \varcode{sin()} function as \varcode{@sin} as an
+argument to our function:
+
+\pageinputlisting[caption={Passing a function handle as an argument to a function.}]{funcplotterexamples.m}
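The two listings referenced here, funcPlotter.m and funcplotterexamples.m, are not included in this diff. A minimal sketch of what such a plotting helper and its call with @sin might look like (an assumption, not the repository's code):

  function funcPlotter(func)
    % plot an arbitrary function handle over a fixed range
    xx = -10.0:0.01:10.0;
    plot(xx, func(xx));
    xlabel('x');
    ylabel('f(x)');
  end

  % usage: pass the built-in sin() function as a handle
  funcPlotter(@sin)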
+
+\section{Gradient descent algorithm for arbitrary functions}
+
+Now we are ready to adapt the gradient descent algorithm from
+exercise~\ref{gradientdescentexercise} to arbitrary functions with $n$
+parameters that we want to fit to some data.
+
+\begin{exercise}{gradientDescent.m}{}
 Adapt the function \varcode{gradientDescentCubic()} from
 exercise~\ref{gradientdescentexercise} to implement the gradient
-descent algorithm for the power law \eqref{powerfunc}. The new
-function takes a two element vector $(c,\alpha)$ for the initial
-parameter values and also returns the best parameter combination as
-a two-element vector. Use a \varcode{for} loop over the two
-dimensions for computing the gradient.
+descent algorithm for any function \varcode{func(x, p)} that takes
+as first argument the $x$-data values and as second argument a
+vector with parameter values. The new function takes a vector $\vec
+p_0$ for the initial parameter values and also returns the best
+parameter combination as a vector. Use a \varcode{for} loop over the
+two dimensions for computing the gradient.
 \end{exercise}
 
+For testing our new function we need to implement the power law
+\eqref{powerfunc}:
+
+\begin{exercise}{powerLaw.m}{}
+Write a function that implements \eqref{powerfunc}. The function
+gets as arguments a vector $x$ containing the $x$-data values and
+another vector containing as elements the parameters for the power
+law, i.e. the factor $c$ and the power $\alpha$. It returns a vector
+with the computed $y$ values for each $x$ value.
+\end{exercise}
+
+Now let's use the new gradient descent function to fit a power law to
+our tiger data-set:
+
 \begin{exercise}{plotgradientdescentpower.m}{}
-Use the function \varcode{gradientDescentPower()} to fit the
-simulated data from exercise~\ref{mseexercise}. Plot the returned
-values of the two parameters against each other. Compare the result
-of the gradient descent method with the true values of $c$ and
-$\alpha$ used to simulate the data. Observe the norm of the gradient
-and inspect the plots to adapt $\epsilon$ (smaller than in
+Use the function \varcode{gradientDescent()} to fit the
+\varcode{powerLaw()} function to the simulated data from
+exercise~\ref{mseexercise}. Plot the returned values of the two
+parameters against each other. Compare the result of the gradient
+descent method with the true values of $c$ and $\alpha$ used to
+simulate the data. Observe the norm of the gradient and inspect the
+plots to adapt $\epsilon$ (smaller than in
 exercise~\ref{plotgradientdescentexercise}) and the threshold (much
 larger) appropriately. Finally plot the data together with the best
 fitting power-law \eqref{powerfunc}.
 \end{exercise}
 
 
-\section{Curve fit for arbitrary functions}
+\section{Fitting non-linear functions to data}
 
-So far, all our code for the gradient descent algorithm was tailored
-to a specific function, the cubic relation \eqref{cubicfunc} or the
-power law \eqref{powerfunc}.
-
-\section{XXX}
-For example, you measure the response of a passive membrane to a
-current step and you want to estimate membrane the time constant. Then you
-need to fit an exponential function
-\begin{equation}
-\label{expfunc}
-V(t; \tau, \Delta V, V_{\infty}) = \Delta V e^{-t/\tau} + V_{\infty}
-\end{equation}
-with three free parameters $\tau$, $\Delta y$, $y_{\infty}$ to the
-measured time course of the membrane potential $V(t)$. The $(x_i,y_i)$
-data pairs are the sampling times $t_i$ and the corresponding
-measurements of the membrane potential $V_i$.
-
-\section{Summary}
 
 The gradient descent is an important numerical method for solving
 optimization problems. It is used to find the global minimum of an
@@ -530,33 +554,44 @@ objective function.
 Curve fitting is a common application for the gradient descent method.
 For the case of fitting straight lines to data pairs, the error
 surface (using the mean squared error) has exactly one clearly defined
-global minimum. In fact, the position of the minimum can be analytically
-calculated as shown in the next chapter.
+global minimum. In fact, the position of the minimum can be
+analytically calculated as shown in the next chapter. For linear
+fitting problems numerical methods like the gradient descent are not
+needed.
 
-Problems that involve nonlinear computations on parameters, e.g. the
-rate $\lambda$ in an exponential function $f(x;\lambda) = e^{\lambda
-x}$, do not have an analytical solution for the least squares. To
-find the least squares for such functions numerical methods such as
-the gradient descent have to be applied.
+Fitting problems that involve nonlinear functions of the parameters,
+e.g. the power law \eqref{powerfunc} or the exponential function
+$f(x;\lambda) = e^{\lambda x}$, do not have an analytical solution for
+the least squares. To find the least squares for such functions
+numerical methods such as the gradient descent have to be applied.
 
-The suggested gradient descent algorithm can be improved in multiple
-ways to converge faster. For example one could adapt the step size to
-the length of the gradient. These numerical tricks have already been
-implemented in pre-defined functions. Generic optimization functions
-such as \matlabfun{fminsearch()} have been implemented for arbitrary
-objective functions, while the more specialized function
-\matlabfun{lsqcurvefit()} i specifically designed for optimizations in
-the least square error sense.
-
-%\newpage
+The suggested gradient descent algorithm is quite fragile and requires
+manually tuned values for $\epsilon$ and the threshold for terminating
+the iteration. The algorithm can be improved in multiple ways to
+converge more robustly and faster. For example one could adapt the
+step size to the length of the gradient. These numerical tricks have
+already been implemented in pre-defined functions. Generic
+optimization functions such as \mcode{fminsearch()} have been
+implemented for arbitrary objective functions, while the more
+specialized function \mcode{lsqcurvefit()} is specifically designed
+for optimizations in the least square error sense.
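One possible reading of adapting the step size to the length of the gradient (a sketch under that assumption, not the text's prescribed solution) is to normalize the gradient inside the loop so that each iteration moves the parameters by a fixed distance epsilon:

  gradient = meanSquaredGradient(x, y, func, p);
  p = p - epsilon * gradient / norm(gradient);   % step length is always epsilon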
+
+\begin{exercise}{plotlsqcurvefitpower.m}{}
+Use the \matlab-function \varcode{lsqcurvefit()} instead of
+\varcode{gradientDescent()} to fit the \varcode{powerLaw()} function
+to the simulated data from exercise~\ref{mseexercise}. Plot the data
+and the resulting best fitting power law function.
+\end{exercise}
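A minimal sketch of such a call (assuming the Optimization Toolbox and the powerLaw() function sketched above; note that lsqcurvefit() expects the parameter vector as the first argument of the model function, so the arguments of powerLaw(x, p) have to be swapped):

  pest = lsqcurvefit(@(p, x) powerLaw(x, p), p0, x, y);
  plot(x, y, 'o');                 % data
  hold on;
  xx = min(x):0.01:max(x);
  plot(xx, powerLaw(xx, pest));    % best fitting power law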
 
 \begin{important}[Beware of secondary minima!]
 Finding the absolute minimum is not always as easy as in the case of
-fitting a straight line. Often, the error surface has secondary or
+fitting a straight line. Often, the cost function has secondary or
 local minima in which the gradient descent stops even though there
 is a more optimal solution, a global minimum that is lower than the
 local minimum. Starting from good initial positions is a good
 approach to avoid getting stuck in local minima. Also keep in mind
-that error surfaces tend to be simpler (less local minima) the fewer
+that cost functions tend to be simpler (less local minima) the fewer
 parameters are fitted from the data. Each additional parameter
 increases complexity and is computationally more expensive.
 \end{important}