From 17bf940101e651cb235be5baa513d09f75723067 Mon Sep 17 00:00:00 2001
From: Jan Benda
Date: Sun, 20 Dec 2020 13:15:40 +0100
Subject: [PATCH] [regression] smaller steps for derivative

---
 regression/code/gradientDescent.m          |  2 +-
 regression/code/meanSquaredGradientCubic.m |  2 +-
 regression/code/plotcubicgradient.m        |  2 ++
 regression/code/plotgradientdescentcubic.m |  4 +--
 regression/code/plotgradientdescentpower.m |  4 +--
 regression/lecture/regression.tex          | 29 +++++++++++-----------
 6 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/regression/code/gradientDescent.m b/regression/code/gradientDescent.m
index 48b976c..20e0b90 100644
--- a/regression/code/gradientDescent.m
+++ b/regression/code/gradientDescent.m
@@ -30,7 +30,7 @@ end
 
 function gradmse = meanSquaredGradient(x, y, func, p)
     gradmse = zeros(size(p, 1), size(p, 2));
-    h = 1e-5; % stepsize for derivatives
+    h = 1e-7; % stepsize for derivatives
     mse = meanSquaredError(x, y, func, p);
     for i = 1:length(p) % for each coordinate ...
         pi = p;
diff --git a/regression/code/meanSquaredGradientCubic.m b/regression/code/meanSquaredGradientCubic.m
index a7ff6a2..99d5b3b 100644
--- a/regression/code/meanSquaredGradientCubic.m
+++ b/regression/code/meanSquaredGradientCubic.m
@@ -7,7 +7,7 @@ function dmsedc = meanSquaredGradientCubic(x, y, c)
 %
 % Returns: the derivative of the mean squared error at c.
 
-    h = 1e-5; % stepsize for derivatives
+    h = 1e-7; % stepsize for derivatives
    mse = meanSquaredErrorCubic(x, y, c);
    mseh = meanSquaredErrorCubic(x, y, c+h);
    dmsedc = (mseh - mse)/h;
diff --git a/regression/code/plotcubicgradient.m b/regression/code/plotcubicgradient.m
index 0410b1f..d00bf6e 100644
--- a/regression/code/plotcubicgradient.m
+++ b/regression/code/plotcubicgradient.m
@@ -1,3 +1,5 @@
+meansquarederrorline; % generate data
+
 cs = 2.0:0.1:8.0;
 mseg = zeros(length(cs));
 for i = 1:length(cs)
diff --git a/regression/code/plotgradientdescentcubic.m b/regression/code/plotgradientdescentcubic.m
index 6ac54af..6aadc12 100644
--- a/regression/code/plotgradientdescentcubic.m
+++ b/regression/code/plotgradientdescentcubic.m
@@ -1,8 +1,8 @@
 meansquarederrorline; % generate data
 
 c0 = 2.0;
-eps = 0.0001;
-thresh = 0.1;
+eps = 0.00001;
+thresh = 1.0;
 [cest, cs, mses] = gradientDescentCubic(x, y, c0, eps, thresh);
 
 subplot(2, 2, 1); % top left panel
diff --git a/regression/code/plotgradientdescentpower.m b/regression/code/plotgradientdescentpower.m
index 206e2bb..2d2a08f 100644
--- a/regression/code/plotgradientdescentpower.m
+++ b/regression/code/plotgradientdescentpower.m
@@ -2,7 +2,7 @@ meansquarederrorline; % generate data
 
 p0 = [2.0, 1.0];
 eps = 0.00001;
-thresh = 50.0;
+thresh = 1.0;
 [pest, ps, mses] = gradientDescent(x, y, @powerLaw, p0, eps, thresh);
 pest
 
@@ -28,5 +28,3 @@ plot(x, y, 'o'); % plot original data
 xlabel('Size [m]');
 ylabel('Weight [kg]');
 legend('fit', 'data', 'location', 'northwest');
-pause
-
diff --git a/regression/lecture/regression.tex b/regression/lecture/regression.tex
index 2db6a31..bfff7bb 100644
--- a/regression/lecture/regression.tex
+++ b/regression/lecture/regression.tex
@@ -240,11 +240,11 @@ at the position of the ball.
   \includegraphics{cubicgradient}
   \titlecaption{Derivative of the cost function.}{The gradient, the
     derivative \eqref{costderivative} of the cost function, is
-    negative to the left of the minimum of the cost function, zero at,
-    and positive to the right of the minimum (left). For each value of
-    the parameter $c$ the negative gradient (arrows) points towards
-    the minimum of the cost function
-    (right).} \label{gradientcubicfig}
+    negative to the left of the minimum (vertical line) of the cost
+    function, zero (horizontal line) at, and positive to the right of
+    the minimum (left). For each value of the parameter $c$ the
+    negative gradient (arrows) points towards the minimum of the cost
+    function (right).} \label{gradientcubicfig}
 \end{figure}
 
 In our one-dimensional example of a single free parameter the slope is
@@ -263,9 +263,9 @@ can be approximated numerically by the difference quotient
   \lim\limits_{\Delta c \to 0} \frac{f_{cost}(c + \Delta c) - f_{cost}(c)}{\Delta c}
   \approx \frac{f_{cost}(c + \Delta c) - f_{cost}(c)}{\Delta c}
 \end{equation}
-The derivative is positive for positive slopes. Since want to go down
-the hill, we choose the opposite direction (\figref{gradientcubicfig},
-right).
+Choose, for example, $\Delta c = 10^{-7}$. The derivative is positive
+for positive slopes. Since we want to go down the hill, we choose the
+opposite direction (\figref{gradientcubicfig}, right).
 
 \begin{exercise}{meanSquaredGradientCubic.m}{}\label{gradientcubic}
   Implement a function \varcode{meanSquaredGradientCubic()}, that
@@ -361,7 +361,7 @@ The $\epsilon$ parameter in \eqnref{gradientdescent} is critical. If
 too large, the algorithm does not converge to the minimum of the cost
 function (try it!). At medium values it oscillates around the minimum
 but might nevertheless converge. Only for sufficiently small values
-(here $\epsilon = 0.0001$) does the algorithm follow the slope
+(here $\epsilon = 0.00001$) does the algorithm follow the slope
 downwards towards the minimum.
 
 The terminating condition on the absolute value of the gradient
@@ -373,7 +373,7 @@ to the increased computational effort. Have a look at the derivatives
 that we plotted in exercise~\ref{gradientcubic} and decide on a
 sensible value for the threshold. Run the gradient descent algorithm
 and check how the resulting $c$ parameter values converge and how many
-iterations were needed. The reduce the threshold (by factors of ten)
+iterations were needed. Then reduce the threshold (by factors of ten)
 and check how this changes the results.
 
 Many modern algorithms for finding the minimum of a function are based
@@ -508,7 +508,8 @@ the sum of the squared partial derivatives:
 \begin{equation} \label{ndimabsgradient}
   |\nabla f_{cost}(\vec p_i)| = \sqrt{\sum_{i=1}^n \left(\frac{\partial f_{cost}(\vec p)}{\partial p_i}\right)^2}
 \end{equation}
-The \code{norm()} function implements this.
+The \code{norm()} function implements this, given a vector of the
+partial derivatives.
 
 \subsection{Passing a function as an argument to another function}
 
@@ -568,10 +568,8 @@ our tiger data-set (\figref{powergradientdescentfig}):
   parameters against each other. Compare the result of the gradient
   descent method with the true values of $c$ and $a$ used to simulate
   the data. Observe the norm of the gradient and inspect the plots to
-  adapt $\epsilon$ (smaller than in
-  exercise~\ref{plotgradientdescentexercise}) and the threshold (much
-  larger) appropriately. Finally plot the data together with the best
-  fitting power-law \eqref{powerfunc}.
+  adapt $\epsilon$ and the threshold if necessary. Finally plot the
+  data together with the best fitting power-law \eqref{powerfunc}.
 \end{exercise}
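
Note on the choice of the step size (illustration only, not part of the
patch): a forward difference trades truncation error, which grows with h,
against floating-point cancellation, which grows as h shrinks. For double
precision the total error is typically smallest for h near sqrt(eps),
about 1e-8, so moving h from 1e-5 to 1e-7 brings it closer to that
optimum. A minimal standalone MATLAB sketch of this trade-off for the
derivative of x^3 at x = 2 (function and test point are arbitrary choices
for the demonstration):

    % Error of the forward difference quotient as a function of step size h:
    x = 2.0;
    dfexact = 3.0*x^2;                % analytical derivative of x^3
    hs = 10.^(-1:-1:-12);             % step sizes from 1e-1 down to 1e-12
    errors = zeros(size(hs));
    for i = 1:length(hs)
        h = hs(i);
        dfest = ((x+h)^3 - x^3)/h;    % forward difference quotient
        errors(i) = abs(dfest - dfexact);
    end
    loglog(hs, errors, 'o-');         % error is smallest around h ~ 1e-8
    xlabel('step size h');
    ylabel('absolute error');

Running this plots a V-shaped error curve: truncation-limited for large h,
cancellation-limited for very small h, with the minimum near 1e-8.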