[regression] finished chapter

parent 46affef86d
commit dea6319e75
@@ -1,8 +1,8 @@
 function mse = meanSquaredErrorCubic(x, y, c)
 % Mean squared error between data pairs and a cubic relation.
 %
-% Arguments: x, vector of the input values
-%            y, vector of the corresponding measured output values
+% Arguments: x, vector of the x-data values
+%            y, vector of the corresponding y-data values
 %            c, the factor for the cubic relation.
 %
 % Returns: mse, the mean-squared-error.
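
Note for readers following along in Python rather than MATLAB: a minimal sketch of the cost function documented above could look like this (function name and test data are illustrative assumptions, not part of the repository):

import numpy as np

def mean_squared_error_cubic(x, y, c):
    # mean squared error between the measured y-data and the cubic prediction c*x^3
    return np.mean((y - c * x**3) ** 2)

# illustrative usage with made-up data:
x = np.arange(1.0, 11.0)
y = 6.0 * x**3
print(mean_squared_error_cubic(x, y, 6.0))   # 0.0 at the true factor
print(mean_squared_error_cubic(x, y, 5.5))   # > 0 away from it
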
@@ -75,7 +75,7 @@ def plot_mse_min(ax, x, y, c):
 if __name__ == "__main__":
     x, y, c = create_data()
     fig, (ax1, ax2) = plt.subplots(1, 2, figsize=cm_size(figure_width, 1.1*figure_height))
-    fig.subplots_adjust(**adjust_fs(left=8.0, right=1))
+    fig.subplots_adjust(**adjust_fs(left=8.0, right=1.2))
     plot_mse(ax1, x, y, c)
     plot_mse_min(ax2, x, y, c)
     fig.savefig("cubiccost.pdf")
@@ -58,8 +58,8 @@ def plot_error_hist(ax, x, y, c):
 
 if __name__ == "__main__":
     x, y, c = create_data()
-    fig, (ax1, ax2) = plt.subplots(1, 2)
-    fig.subplots_adjust(wspace=0.4, **adjust_fs(left=6.0, right=1.2))
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=cm_size(figure_width, 0.9*figure_height))
+    fig.subplots_adjust(wspace=0.5, **adjust_fs(left=6.0, right=1.2))
     plot_data_errors(ax1, x, y, c)
     plot_error_hist(ax2, x, y, c)
     fig.savefig("cubicerrors.pdf")
@@ -42,8 +42,8 @@ def plot_data_fac(ax, x, y, c):
 if __name__ == "__main__":
     x, y, c = create_data()
     print('n=%d' % len(x))
-    fig, (ax1, ax2) = plt.subplots(1, 2)
-    fig.subplots_adjust(wspace=0.5, **adjust_fs(fig, left=6.0, right=1.5))
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=cm_size(figure_width, 0.9*figure_height))
+    fig.subplots_adjust(wspace=0.5, **adjust_fs(fig, left=6.0, right=1.2))
     plot_data(ax1, x, y)
     plot_data_fac(ax2, x, y, c)
     fig.savefig("cubicfunc.pdf")
@@ -73,7 +73,7 @@ if __name__ == "__main__":
     x, y, c = create_data()
     cs, mses = gradient_descent(x, y)
     fig, (ax1, ax2) = plt.subplots(1, 2, figsize=cm_size(figure_width, 1.1*figure_height))
-    fig.subplots_adjust(wspace=0.2, **adjust_fs(left=8.0, right=0.5))
+    fig.subplots_adjust(**adjust_fs(left=8.0, right=1.2))
     plot_mse(ax1, x, y, c, cs)
     plot_descent(ax2, cs, mses)
     fig.savefig("cubicmse.pdf")
@@ -1,56 +0,0 @@
-import numpy as np
-from mpl_toolkits.mplot3d import Axes3D
-import matplotlib.pyplot as plt
-import matplotlib.cm as cm
-from plotstyle import *
-
-def create_data():
-    m = 0.75
-    n= -40
-    x = np.arange(10.,110., 2.5)
-    y = m * x + n;
-    rng = np.random.RandomState(37281)
-    noise = rng.randn(len(x))*15
-    y += noise
-    return x, y, m, n
-
-
-def plot_error_plane(ax, x, y, m, n):
-    ax.set_xlabel('Slope m')
-    ax.set_ylabel('Intercept b')
-    ax.set_zlabel('Mean squared error')
-    ax.set_xlim(-4.5, 5.0)
-    ax.set_ylim(-60.0, -20.0)
-    ax.set_zlim(0.0, 700.0)
-    ax.set_xticks(np.arange(-4, 5, 2))
-    ax.set_yticks(np.arange(-60, -19, 10))
-    ax.set_zticks(np.arange(0, 700, 200))
-    ax.grid(True)
-    ax.w_xaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
-    ax.w_yaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
-    ax.w_zaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
-    ax.invert_xaxis()
-    ax.view_init(25, 40)
-    slopes = np.linspace(-4.5, 5, 40)
-    intercepts = np.linspace(-60, -20, 40)
-    x, y = np.meshgrid(slopes, intercepts)
-    error_surf = np.zeros(x.shape)
-    for i, s in enumerate(slopes) :
-        for j, b in enumerate(intercepts) :
-            error_surf[j,i] = np.mean((y-s*x-b)**2.0)
-    ax.plot_surface(x, y, error_surf, rstride=1, cstride=1, cmap=cm.coolwarm,
-                    linewidth=0, shade=True)
-    # Minimum:
-    mini = np.unravel_index(np.argmin(error_surf), error_surf.shape)
-    ax.scatter(slopes[mini[1]], intercepts[mini[0]], [0.0], color='#cc0000', s=60)
-
-
-if __name__ == "__main__":
-    x, y, m, n = create_data()
-    fig = plt.figure()
-    ax = fig.add_subplot(1, 1, 1, projection='3d')
-    plot_error_plane(ax, x, y, m, n)
-    fig.set_size_inches(7., 5.)
-    fig.subplots_adjust(**adjust_fs(fig, 1.0, 0.0, 0.0, 0.0))
-    fig.savefig("error_surface.pdf")
-    plt.close()
@@ -1,60 +0,0 @@
-import numpy as np
-import matplotlib.pyplot as plt
-from plotstyle import *
-
-def create_data():
-    m = 0.75
-    n= -40
-    x = np.arange(10.,110., 2.5)
-    y = m * x + n;
-    rng = np.random.RandomState(37281)
-    noise = rng.randn(len(x))*15
-    y += noise
-    return x, y, m, n
-
-
-def plot_data(ax, x, y):
-    ax.plot(x, y, **psA)
-    ax.set_xlabel('Input x')
-    ax.set_ylabel('Output y')
-    ax.set_xlim(0, 120)
-    ax.set_ylim(-80, 80)
-    ax.set_xticks(np.arange(0,121, 40))
-    ax.set_yticks(np.arange(-80,81, 40))
-
-
-def plot_data_slopes(ax, x, y, m, n):
-    ax.plot(x, y, **psA)
-    xx = np.asarray([2, 118])
-    for i in np.linspace(0.3*m, 2.0*m, 5):
-        ax.plot(xx, i*xx+n, **lsBm)
-    ax.set_xlabel('Input x')
-    #ax.set_ylabel('Output y')
-    ax.set_xlim(0, 120)
-    ax.set_ylim(-80, 80)
-    ax.set_xticks(np.arange(0,121, 40))
-    ax.set_yticks(np.arange(-80,81, 40))
-
-
-def plot_data_intercepts(ax, x, y, m, n):
-    ax.plot(x, y, **psA)
-    xx = np.asarray([2, 118])
-    for i in np.linspace(n-1*n, n+1*n, 5):
-        ax.plot(xx, m*xx + i, **lsBm)
-    ax.set_xlabel('Input x')
-    #ax.set_ylabel('Output y')
-    ax.set_xlim(0, 120)
-    ax.set_ylim(-80, 80)
-    ax.set_xticks(np.arange(0,121, 40))
-    ax.set_yticks(np.arange(-80,81, 40))
-
-
-if __name__ == "__main__":
-    x, y, m, n = create_data()
-    fig, axs = plt.subplots(1, 3)
-    fig.subplots_adjust(wspace=0.5, **adjust_fs(fig, left=6.0, right=1.5))
-    plot_data(axs[0], x, y)
-    plot_data_slopes(axs[1], x, y, m, n)
-    plot_data_intercepts(axs[2], x, y, m, n)
-    fig.savefig("lin_regress.pdf")
-    plt.close()
@@ -1,65 +0,0 @@
-import numpy as np
-import matplotlib.pyplot as plt
-from plotstyle import *
-
-def create_data():
-    m = 0.75
-    n= -40
-    x = np.concatenate( (np.arange(10.,110., 2.5), np.arange(0.,120., 2.0)) )
-    y = m * x + n;
-    rng = np.random.RandomState(37281)
-    noise = rng.randn(len(x))*15
-    y += noise
-    return x, y, m, n
-
-
-def plot_data(ax, x, y, m, n):
-    ax.set_xlabel('Input x')
-    ax.set_ylabel('Output y')
-    ax.set_xlim(0, 120)
-    ax.set_ylim(-80, 80)
-    ax.set_xticks(np.arange(0,121, 40))
-    ax.set_yticks(np.arange(-80,81, 40))
-    ax.annotate('Error',
-                xy=(x[34]+1, y[34]+15), xycoords='data',
-                xytext=(80, -50), textcoords='data', ha='left',
-                arrowprops=dict(arrowstyle="->", relpos=(0.9,1.0),
-                connectionstyle="angle3,angleA=50,angleB=-30") )
-    ax.plot(x[:40], y[:40], zorder=0, **psAm)
-    inxs = [3, 13, 16, 19, 25, 34, 36]
-    ax.plot(x[inxs], y[inxs], zorder=10, **psA)
-    xx = np.asarray([2, 118])
-    ax.plot(xx, m*xx+n, **lsBm)
-    for i in inxs :
-        xx = [x[i], x[i]]
-        yy = [m*x[i]+n, y[i]]
-        ax.plot(xx, yy, zorder=5, **lsDm)
-
-
-def plot_error_hist(ax, x, y, m, n):
-    ax.set_xlabel('Squared error')
-    ax.set_ylabel('Frequency')
-    bins = np.arange(0.0, 602.0, 50.0)
-    ax.set_xlim(bins[0], bins[-1])
-    ax.set_ylim(0, 35)
-    ax.set_xticks(np.arange(bins[0], bins[-1], 100))
-    ax.set_yticks(np.arange(0, 36, 10))
-    errors = (y-(m*x+n))**2.0
-    mls = np.mean(errors)
-    ax.annotate('Mean\nsquared\nerror',
-                xy=(mls, 0.5), xycoords='data',
-                xytext=(350, 20), textcoords='data', ha='left',
-                arrowprops=dict(arrowstyle="->", relpos=(0.0,0.2),
-                connectionstyle="angle3,angleA=10,angleB=90") )
-    ax.hist(errors, bins, **fsD)
-
-
-
-if __name__ == "__main__":
-    x, y, m, n = create_data()
-    fig, axs = plt.subplots(1, 2)
-    fig.subplots_adjust(**adjust_fs(fig, left=6.0))
-    plot_data(axs[0], x, y, m, n)
-    plot_error_hist(axs[1], x, y, m, n)
-    fig.savefig("linear_least_squares.pdf")
-    plt.close()
@@ -23,47 +23,12 @@
 \item Fig 8.2 right: this should be a chi-squared distribution with one degree of freedom!
 \end{itemize}
 
-\begin{figure}[t]
-\includegraphics[width=0.75\textwidth]{error_surface}
-\titlecaption{Error surface.}{The two model parameters $m$ and $b$
-define the base area of the surface plot. For each parameter
-combination of slope and intercept the error is calculated. The
-resulting surface has a minimum which indicates the parameter
-combination that best fits the data.}\label{errorsurfacefig}
-\end{figure}
-
-
-\begin{figure}[t]
-\includegraphics[width=0.75\textwidth]{error_gradient}
-\titlecaption{Gradient of the error surface.} {Each arrow points
-into the direction of the greatest ascend at different positions
-of the error surface shown in \figref{errorsurfacefig}. The
-contour lines in the background illustrate the error surface. Warm
-colors indicate high errors, colder colors low error values. Each
-contour line connects points of equal
-error.}\label{gradientquiverfig}
-\end{figure}
-
-\begin{figure}[t]
-\includegraphics[width=0.45\textwidth]{gradient_descent}
-\titlecaption{Gradient descent.}{The algorithm starts at an
-arbitrary position. At each point the gradient is estimated and
-the position is updated as long as the length of the gradient is
-sufficiently large.The dots show the positions after each
-iteration of the algorithm.} \label{gradientdescentfig}
-\end{figure}
-
-
 \subsection{Linear fits}
 \begin{itemize}
 \item Polyfit is easy: unique solution! $c x^2$ is also a linear fit.
 \item Example for overfitting with polyfit of a high order (=number of data points)
 \end{itemize}
 
-\section{Fitting in practice}
-
-Fit with matlab functions lsqcurvefit, polyfit
-
 
 \subsection{Non-linear fits}
 \begin{itemize}
@@ -227,6 +227,7 @@ to arbitrary precision.
 \end{ibox}
 
 \section{Gradient}
+
 Imagine to place a ball at some point on the cost function
 \figref{cubiccostfig}. Naturally, it would roll down the slope and
 eventually stop at the minimum of the error surface (if it had no
@@ -235,10 +236,21 @@ way to the minimum of the cost function. The ball always follows the
 steepest slope. Thus we need to figure out the direction of the slope
 at the position of the ball.
 
+\begin{figure}[t]
+\includegraphics{cubicgradient}
+\titlecaption{Derivative of the cost function.}{The gradient, the
+derivative \eqref{costderivative} of the cost function, is
+negative to the left of the minimum of the cost function, zero at,
+and positive to the right of the minimum (left). For each value of
+the parameter $c$ the negative gradient (arrows) points towards
+the minimum of the cost function
+(right).} \label{gradientcubicfig}
+\end{figure}
+
 In our one-dimensional example of a single free parameter the slope is
 simply the derivative of the cost function with respect to the
-parameter $c$. This derivative is called the
-\entermde{Gradient}{gradient} of the cost function:
+parameter $c$ (\figref{gradientcubicfig}, left). This derivative is called
+the \entermde{Gradient}{gradient} of the cost function:
 \begin{equation}
 \label{costderivative}
 \nabla f_{cost}(c) = \frac{{\rm d} f_{cost}(c)}{{\rm d} c}
@@ -252,7 +264,8 @@ can be approximated numerically by the difference quotient
 \approx \frac{f_{cost}(c + \Delta c) - f_{cost}(c)}{\Delta c}
 \end{equation}
 The derivative is positive for positive slopes. Since want to go down
-the hill, we choose the opposite direction.
+the hill, we choose the opposite direction (\figref{gradientcubicfig},
+right).
 
 \begin{exercise}{meanSquaredGradientCubic.m}{}\label{gradientcubic}
 Implement a function \varcode{meanSquaredGradientCubic()}, that
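
Note for readers: the difference-quotient gradient described in this hunk can be sketched in Python as well (the chapter's exercise asks for a MATLAB meanSquaredGradientCubic(); the function names, step size and test data below are illustrative assumptions):

import numpy as np

def mean_squared_error_cubic(x, y, c):
    # cost function: mean squared error of the cubic prediction c*x^3
    return np.mean((y - c * x**3) ** 2)

def mean_squared_gradient_cubic(x, y, c, dc=1e-5):
    # difference quotient (f_cost(c + dc) - f_cost(c)) / dc
    return (mean_squared_error_cubic(x, y, c + dc)
            - mean_squared_error_cubic(x, y, c)) / dc

x = np.arange(1.0, 11.0)
y = 6.0 * x**3
print(mean_squared_gradient_cubic(x, y, 5.0))  # negative: left of the minimum
print(mean_squared_gradient_cubic(x, y, 7.0))  # positive: right of the minimum
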
@@ -275,7 +288,7 @@ the hill, we choose the opposite direction.
 \titlecaption{Gradient descent.}{The algorithm starts at an
 arbitrary position. At each point the gradient is estimated and
 the position is updated as long as the length of the gradient is
-sufficiently large.The dots show the positions after each
+sufficiently large. The dots show the positions after each
 iteration of the algorithm.} \label{gradientdescentcubicfig}
 \end{figure}
 
@@ -378,18 +391,18 @@ functions that have more than a single parameter, in general $n$
 parameters. We then need to find the minimum in an $n$ dimensional
 parameter space.
 
-For our tiger problem, we could have also fitted the exponent $\alpha$
-of the power-law relation between size and weight, instead of assuming
-a cubic relation with $\alpha=3$:
+For our tiger problem, we could have also fitted the exponent $a$ of
+the power-law relation between size and weight, instead of assuming a
+cubic relation with $a=3$:
 \begin{equation}
 \label{powerfunc}
-y = f(x; c, \alpha) = f(x; \vec p) = c\cdot x^\alpha
+y = f(x; c, a) = f(x; \vec p) = c\cdot x^a
 \end{equation}
 We then could check whether the resulting estimate of the exponent
-$\alpha$ indeed is close to the expected power of three. The
-power-law \eqref{powerfunc} has two free parameters $c$ and $\alpha$.
+$a$ indeed is close to the expected power of three. The
+power-law \eqref{powerfunc} has two free parameters $c$ and $a$.
 Instead of a single parameter we are now dealing with a vector $\vec
-p$ containing $n$ parameter values. Here, $\vec p = (c, \alpha)$. All
+p$ containing $n$ parameter values. Here, $\vec p = (c, a)$. All
 the concepts we introduced on the example of the one dimensional
 problem of tiger weights generalize to $n$-dimensional problems. We
 only need to adapt a few things. The cost function for the mean
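
Note for readers: the parameter-vector notation introduced in this hunk translates directly into code; a small Python sketch (the chapter's own exercise implements powerLaw() in MATLAB, and the names and test values here are assumptions):

import numpy as np

def power_law(x, p):
    # y = f(x; p) = c * x^a with parameter vector p = (c, a)
    c, a = p
    return c * x**a

def cost(p, x, y):
    # mean squared error for an arbitrary parameter vector p
    return np.mean((y - power_law(x, p)) ** 2)

x = np.linspace(1.0, 10.0, 50)
y = 0.5 * x**3.0                          # made-up data with c=0.5, a=3
print(cost(np.array([0.5, 3.0]), x, y))   # zero at the true parameters
print(cost(np.array([0.5, 2.8]), x, y))   # larger away from them
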
@@ -399,12 +412,27 @@ squared error reads
 f_{cost}(\vec p|\{(x_i, y_i)\}) = \frac{1}{N} \sum_{i=1}^N (y_i - f(x_i;\vec p))^2
 \end{equation}
 
+\begin{figure}[t]
+\includegraphics{powergradientdescent}
+\titlecaption{Gradient descent on an error surface.}{Contour plot of
+the cost function \eqref{ndimcostfunc} for the fit of a power law
+\eqref{powerfunc} to some data. Here the cost function is a long
+and narrow valley on the plane spanned by the two parameters $c$
+and $a$ of the power law. The red line marks the path of the
+gradient descent algorithm. The gradient is always perpendicular
+to the contour lines. The algorithm quickly descends into the
+valley and then slowly creeps on the shallow bottom of the valley
+towards the global minimum where it terminates (yellow circle).
+} \label{powergradientdescentfig}
+\end{figure}
+
 For two-dimensional problems the graph of the cost function is an
 \enterm{error surface} (\determ{{Fehlerfl\"ache}}). The two parameters
 span a two-dimensional plane. The cost function assigns to each
 parameter combination on this plane a single value. This results in a
 landscape over the parameter plane with mountains and valleys and we
-are searching for the position of the bottom of the deepest valley.
+are searching for the position of the bottom of the deepest valley
+(\figref{powergradientdescentfig}).
 
 \begin{ibox}[tp]{\label{partialderivativebox}Partial derivatives and gradient}
 Some functions depend on more than a single variable. For example, the function
@@ -446,13 +474,13 @@ space of our example, the \entermde{Gradient}{gradient}
 (box~\ref{partialderivativebox}) of the cost function is a vector
 \begin{equation}
 \label{gradientpowerlaw}
-\nabla f_{cost}(c, \alpha) = \left( \frac{\partial f_{cost}(c, \alpha)}{\partial c},
-\frac{\partial f_{cost}(c, \alpha)}{\partial \alpha} \right)
+\nabla f_{cost}(c, a) = \left( \frac{\partial f_{cost}(c, a)}{\partial c},
+\frac{\partial f_{cost}(c, a)}{\partial a} \right)
 \end{equation}
 that points into the direction of the strongest ascend of the cost
 function. The gradient is given by the partial derivatives
 (box~\ref{partialderivativebox}) of the mean squared error with
-respect to the parameters $c$ and $\alpha$ of the power law
+respect to the parameters $c$ and $a$ of the power law
 relation. In general, for $n$-dimensional problems, the gradient is an
 $n-$ dimensional vector containing for each of the $n$ parameters
 $p_j$ the respective partial derivatives as coordinates:
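
Note for readers: written out for the power-law mean squared error, the two partial derivatives of the gradient in \eqref{gradientpowerlaw} can be computed directly; a hedged Python sketch (the derivatives are worked out here by hand with the chain rule, not taken from the chapter, and assume x > 0):

import numpy as np

def gradient_power_law_mse(p, x, y):
    # partial derivatives of the mean squared error of y = c * x^a
    # with respect to c and a (chain rule; requires x > 0 because of log)
    c, a = p
    residual = y - c * x**a
    d_c = -2.0 * np.mean(residual * x**a)
    d_a = -2.0 * np.mean(residual * c * x**a * np.log(x))
    return np.array([d_c, d_a])

x = np.linspace(1.0, 10.0, 50)
y = 0.5 * x**3.0
print(gradient_power_law_mse(np.array([0.5, 3.0]), x, y))  # ~ (0, 0) at the minimum
print(gradient_power_law_mse(np.array([0.6, 2.9]), x, y))  # nonzero away from it
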
@@ -468,6 +496,9 @@ parameter value $p_i$ becomes a vector $\vec p_i$ of parameter values:
 \label{ndimgradientdescent}
 \vec p_{i+1} = \vec p_i - \epsilon \cdot \nabla f_{cost}(\vec p_i)
 \end{equation}
+The algorithm proceeds along the negative gradient
+(\figref{powergradientdescentfig}).
+
 For the termination condition we need the length of the gradient. In
 one dimension it was just the absolute value of the derivative. For
 $n$ dimensions this is according to the \enterm{Pythagorean theorem}
@@ -479,8 +510,7 @@ the sum of the squared partial derivatives:
 \end{equation}
 The \code{norm()} function implements this.
 
-
-\section{Passing a function as an argument to another function}
+\subsection{Passing a function as an argument to another function}
 
 So far, all our code for the gradient descent algorithm was tailored
 to a specific function, the cubic relation \eqref{cubicfunc}. It would
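
Note for readers: the $n$-dimensional update rule \eqref{ndimgradientdescent}, the norm-based termination condition, and the idea of passing the cost function in as an argument combine into a short loop; a Python sketch (the chapter's gradientDescent() is a MATLAB exercise, and the step size, threshold and iteration limit below are illustrative assumptions):

import numpy as np

def numerical_gradient(f, p, dp=1e-5):
    # approximate each partial derivative of f at p by a difference quotient
    grad = np.zeros(len(p))
    for j in range(len(p)):
        step = np.zeros(len(p))
        step[j] = dp
        grad[j] = (f(p + step) - f(p)) / dp
    return grad

def gradient_descent(f, p0, eps=0.1, thresh=1e-3, max_iter=100000):
    # f is passed in as an argument, so the same loop works for any cost function
    p = np.asarray(p0, dtype=float)
    path = [p.copy()]
    for _ in range(max_iter):
        grad = numerical_gradient(f, p)
        if np.linalg.norm(grad) < thresh:   # termination: length of the gradient
            break
        p = p - eps * grad                  # p_{i+1} = p_i - eps * grad f(p_i)
        path.append(p.copy())
    return p, path

# illustrative usage: minimize a simple two-parameter cost function
f = lambda p: (p[0] - 1.0)**2 + (p[1] + 2.0)**2
p_best, path = gradient_descent(f, [0.0, 0.0])
print(p_best)   # close to (1, -2)
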
@@ -500,7 +530,7 @@ argument to our function:
 \pageinputlisting[caption={Passing a function handle as an argument to a function.}]{funcplotterexamples.m}
 
 
-\section{Gradient descent algorithm for arbitrary functions}
+\subsection{Gradient descent algorithm for arbitrary functions}
 
 Now we are ready to adapt the gradient descent algorithm from
 exercise~\ref{gradientdescentexercise} to arbitrary functions with $n$
@@ -524,21 +554,21 @@ For testing our new function we need to implement the power law
 Write a function that implements \eqref{powerfunc}. The function
 gets as arguments a vector $x$ containing the $x$-data values and
 another vector containing as elements the parameters for the power
-law, i.e. the factor $c$ and the power $\alpha$. It returns a vector
+law, i.e. the factor $c$ and the power $a$. It returns a vector
 with the computed $y$ values for each $x$ value.
 \end{exercise}
 
 Now let's use the new gradient descent function to fit a power law to
-our tiger data-set:
+our tiger data-set (\figref{powergradientdescentfig}):
 
 \begin{exercise}{plotgradientdescentpower.m}{}
 Use the function \varcode{gradientDescent()} to fit the
 \varcode{powerLaw()} function to the simulated data from
 exercise~\ref{mseexercise}. Plot the returned values of the two
 parameters against each other. Compare the result of the gradient
-descent method with the true values of $c$ and $\alpha$ used to
-simulate the data. Observe the norm of the gradient and inspect the
-plots to adapt $\epsilon$ (smaller than in
+descent method with the true values of $c$ and $a$ used to simulate
+the data. Observe the norm of the gradient and inspect the plots to
+adapt $\epsilon$ (smaller than in
 exercise~\ref{plotgradientdescentexercise}) and the threshold (much
 larger) appropriately. Finally plot the data together with the best
 fitting power-law \eqref{powerfunc}.
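
Note for readers: as an independent cross-check of the MATLAB exercise above, a similar power-law fit can be reproduced in Python; a sketch assuming scipy is available and using a generic minimizer rather than the chapter's own gradient descent (the simulated data below is made up and is not the chapter's tiger data set):

import numpy as np
from scipy.optimize import minimize

# made-up data roughly following y = c * x^a with c = 0.7, a = 3
rng = np.random.RandomState(1)
x = np.linspace(1.0, 10.0, 100)
y = 0.7 * x**3.0 + rng.randn(len(x)) * 10.0

def cost(p):
    # mean squared error of the power law for parameter vector p = (c, a)
    c, a = p
    return np.mean((y - c * x**a) ** 2)

res = minimize(cost, x0=[1.0, 2.0])
print(res.x)   # estimated (c, a), expected to end up close to (0.7, 3.0)
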