diff --git a/regression/code/meanSquaredErrorCubic.m b/regression/code/meanSquaredErrorCubic.m
index 44870a1..8953d50 100644
--- a/regression/code/meanSquaredErrorCubic.m
+++ b/regression/code/meanSquaredErrorCubic.m
@@ -1,8 +1,8 @@
 function mse = meanSquaredErrorCubic(x, y, c)
 % Mean squared error between data pairs and a cubic relation.
 %
-% Arguments: x, vector of the input values
-%            y, vector of the corresponding measured output values
+% Arguments: x, vector of the x-data values
+%            y, vector of the corresponding y-data values
 %            c, the factor for the cubic relation.
 %
 % Returns: mse, the mean-squared-error.
diff --git a/regression/lecture/cubiccost.py b/regression/lecture/cubiccost.py
index 30d9575..ac145fa 100644
--- a/regression/lecture/cubiccost.py
+++ b/regression/lecture/cubiccost.py
@@ -75,7 +75,7 @@ def plot_mse_min(ax, x, y, c):
 if __name__ == "__main__":
     x, y, c = create_data()
     fig, (ax1, ax2) = plt.subplots(1, 2, figsize=cm_size(figure_width, 1.1*figure_height))
-    fig.subplots_adjust(**adjust_fs(left=8.0, right=1))
+    fig.subplots_adjust(**adjust_fs(left=8.0, right=1.2))
     plot_mse(ax1, x, y, c)
     plot_mse_min(ax2, x, y, c)
     fig.savefig("cubiccost.pdf")
diff --git a/regression/lecture/cubicerrors.py b/regression/lecture/cubicerrors.py
index 86e608f..d74c3bf 100644
--- a/regression/lecture/cubicerrors.py
+++ b/regression/lecture/cubicerrors.py
@@ -58,8 +58,8 @@ def plot_error_hist(ax, x, y, c):
 
 if __name__ == "__main__":
     x, y, c = create_data()
-    fig, (ax1, ax2) = plt.subplots(1, 2)
-    fig.subplots_adjust(wspace=0.4, **adjust_fs(left=6.0, right=1.2))
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=cm_size(figure_width, 0.9*figure_height))
+    fig.subplots_adjust(wspace=0.5, **adjust_fs(left=6.0, right=1.2))
     plot_data_errors(ax1, x, y, c)
     plot_error_hist(ax2, x, y, c)
     fig.savefig("cubicerrors.pdf")
diff --git a/regression/lecture/cubicfunc.py b/regression/lecture/cubicfunc.py
index 8b5b44a..3b34c03 100644
--- a/regression/lecture/cubicfunc.py
+++ b/regression/lecture/cubicfunc.py
@@ -42,8 +42,8 @@ def plot_data_fac(ax, x, y, c):
 if __name__ == "__main__":
     x, y, c = create_data()
     print('n=%d' % len(x))
-    fig, (ax1, ax2) = plt.subplots(1, 2)
-    fig.subplots_adjust(wspace=0.5, **adjust_fs(fig, left=6.0, right=1.5))
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=cm_size(figure_width, 0.9*figure_height))
+    fig.subplots_adjust(wspace=0.5, **adjust_fs(fig, left=6.0, right=1.2))
     plot_data(ax1, x, y)
     plot_data_fac(ax2, x, y, c)
     fig.savefig("cubicfunc.pdf")
diff --git a/regression/lecture/cubicmse.py b/regression/lecture/cubicmse.py
index 55ac64b..0d956c5 100644
--- a/regression/lecture/cubicmse.py
+++ b/regression/lecture/cubicmse.py
@@ -73,7 +73,7 @@ if __name__ == "__main__":
     x, y, c = create_data()
     cs, mses = gradient_descent(x, y)
     fig, (ax1, ax2) = plt.subplots(1, 2, figsize=cm_size(figure_width, 1.1*figure_height))
-    fig.subplots_adjust(wspace=0.2, **adjust_fs(left=8.0, right=0.5))
+    fig.subplots_adjust(**adjust_fs(left=8.0, right=1.2))
     plot_mse(ax1, x, y, c, cs)
     plot_descent(ax2, cs, mses)
     fig.savefig("cubicmse.pdf")
diff --git a/regression/lecture/error_surface.py b/regression/lecture/error_surface.py
deleted file mode 100644
index da0d867..0000000
--- a/regression/lecture/error_surface.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import numpy as np
-from mpl_toolkits.mplot3d import Axes3D
-import matplotlib.pyplot as plt
-import matplotlib.cm as cm
-from plotstyle import *
-
-def create_data():
-    m = 0.75
-    n= -40
-    x = np.arange(10.,110., 2.5)
-    y = m * x + n;
-    rng = np.random.RandomState(37281)
-    noise = rng.randn(len(x))*15
-    y += noise
-    return x, y, m, n
-
-
-def plot_error_plane(ax, x, y, m, n):
-    ax.set_xlabel('Slope m')
-    ax.set_ylabel('Intercept b')
-    ax.set_zlabel('Mean squared error')
-    ax.set_xlim(-4.5, 5.0)
-    ax.set_ylim(-60.0, -20.0)
-    ax.set_zlim(0.0, 700.0)
-    ax.set_xticks(np.arange(-4, 5, 2))
-    ax.set_yticks(np.arange(-60, -19, 10))
-    ax.set_zticks(np.arange(0, 700, 200))
-    ax.grid(True)
-    ax.w_xaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
-    ax.w_yaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
-    ax.w_zaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
-    ax.invert_xaxis()
-    ax.view_init(25, 40)
-    slopes = np.linspace(-4.5, 5, 40)
-    intercepts = np.linspace(-60, -20, 40)
-    x, y = np.meshgrid(slopes, intercepts)
-    error_surf = np.zeros(x.shape)
-    for i, s in enumerate(slopes) :
-        for j, b in enumerate(intercepts) :
-            error_surf[j,i] = np.mean((y-s*x-b)**2.0)
-    ax.plot_surface(x, y, error_surf, rstride=1, cstride=1, cmap=cm.coolwarm,
-                    linewidth=0, shade=True)
-    # Minimum:
-    mini = np.unravel_index(np.argmin(error_surf), error_surf.shape)
-    ax.scatter(slopes[mini[1]], intercepts[mini[0]], [0.0], color='#cc0000', s=60)
-
-
-if __name__ == "__main__":
-    x, y, m, n = create_data()
-    fig = plt.figure()
-    ax = fig.add_subplot(1, 1, 1, projection='3d')
-    plot_error_plane(ax, x, y, m, n)
-    fig.set_size_inches(7., 5.)
-    fig.subplots_adjust(**adjust_fs(fig, 1.0, 0.0, 0.0, 0.0))
-    fig.savefig("error_surface.pdf")
-    plt.close()
diff --git a/regression/lecture/lin_regress.py b/regression/lecture/lin_regress.py
deleted file mode 100644
index ae854d5..0000000
--- a/regression/lecture/lin_regress.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import numpy as np
-import matplotlib.pyplot as plt
-from plotstyle import *
-
-def create_data():
-    m = 0.75
-    n= -40
-    x = np.arange(10.,110., 2.5)
-    y = m * x + n;
-    rng = np.random.RandomState(37281)
-    noise = rng.randn(len(x))*15
-    y += noise
-    return x, y, m, n
-
-
-def plot_data(ax, x, y):
-    ax.plot(x, y, **psA)
-    ax.set_xlabel('Input x')
-    ax.set_ylabel('Output y')
-    ax.set_xlim(0, 120)
-    ax.set_ylim(-80, 80)
-    ax.set_xticks(np.arange(0,121, 40))
-    ax.set_yticks(np.arange(-80,81, 40))
-
-
-def plot_data_slopes(ax, x, y, m, n):
-    ax.plot(x, y, **psA)
-    xx = np.asarray([2, 118])
-    for i in np.linspace(0.3*m, 2.0*m, 5):
-        ax.plot(xx, i*xx+n, **lsBm)
-    ax.set_xlabel('Input x')
-    #ax.set_ylabel('Output y')
-    ax.set_xlim(0, 120)
-    ax.set_ylim(-80, 80)
-    ax.set_xticks(np.arange(0,121, 40))
-    ax.set_yticks(np.arange(-80,81, 40))
-
-
-def plot_data_intercepts(ax, x, y, m, n):
-    ax.plot(x, y, **psA)
-    xx = np.asarray([2, 118])
-    for i in np.linspace(n-1*n, n+1*n, 5):
-        ax.plot(xx, m*xx + i, **lsBm)
-    ax.set_xlabel('Input x')
-    #ax.set_ylabel('Output y')
-    ax.set_xlim(0, 120)
-    ax.set_ylim(-80, 80)
-    ax.set_xticks(np.arange(0,121, 40))
-    ax.set_yticks(np.arange(-80,81, 40))
-
-
-if __name__ == "__main__":
-    x, y, m, n = create_data()
-    fig, axs = plt.subplots(1, 3)
-    fig.subplots_adjust(wspace=0.5, **adjust_fs(fig, left=6.0, right=1.5))
-    plot_data(axs[0], x, y)
-    plot_data_slopes(axs[1], x, y, m, n)
-    plot_data_intercepts(axs[2], x, y, m, n)
-    fig.savefig("lin_regress.pdf")
-    plt.close()
diff --git a/regression/lecture/linear_least_squares.py b/regression/lecture/linear_least_squares.py
deleted file mode 100644
index d36fd32..0000000
--- a/regression/lecture/linear_least_squares.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import numpy as np
-import matplotlib.pyplot as plt
-from plotstyle import *
-
-def create_data():
-    m = 0.75
-    n= -40
-    x = np.concatenate( (np.arange(10.,110., 2.5),
-                         np.arange(0.,120., 2.0)) )
-    y = m * x + n;
-    rng = np.random.RandomState(37281)
-    noise = rng.randn(len(x))*15
-    y += noise
-    return x, y, m, n
-
-
-def plot_data(ax, x, y, m, n):
-    ax.set_xlabel('Input x')
-    ax.set_ylabel('Output y')
-    ax.set_xlim(0, 120)
-    ax.set_ylim(-80, 80)
-    ax.set_xticks(np.arange(0,121, 40))
-    ax.set_yticks(np.arange(-80,81, 40))
-    ax.annotate('Error',
-                xy=(x[34]+1, y[34]+15), xycoords='data',
-                xytext=(80, -50), textcoords='data', ha='left',
-                arrowprops=dict(arrowstyle="->", relpos=(0.9,1.0),
-                connectionstyle="angle3,angleA=50,angleB=-30") )
-    ax.plot(x[:40], y[:40], zorder=0, **psAm)
-    inxs = [3, 13, 16, 19, 25, 34, 36]
-    ax.plot(x[inxs], y[inxs], zorder=10, **psA)
-    xx = np.asarray([2, 118])
-    ax.plot(xx, m*xx+n, **lsBm)
-    for i in inxs :
-        xx = [x[i], x[i]]
-        yy = [m*x[i]+n, y[i]]
-        ax.plot(xx, yy, zorder=5, **lsDm)
-
-
-def plot_error_hist(ax, x, y, m, n):
-    ax.set_xlabel('Squared error')
-    ax.set_ylabel('Frequency')
-    bins = np.arange(0.0, 602.0, 50.0)
-    ax.set_xlim(bins[0], bins[-1])
-    ax.set_ylim(0, 35)
-    ax.set_xticks(np.arange(bins[0], bins[-1], 100))
-    ax.set_yticks(np.arange(0, 36, 10))
-    errors = (y-(m*x+n))**2.0
-    mls = np.mean(errors)
-    ax.annotate('Mean\nsquared\nerror',
-                xy=(mls, 0.5), xycoords='data',
-                xytext=(350, 20), textcoords='data', ha='left',
-                arrowprops=dict(arrowstyle="->", relpos=(0.0,0.2),
-                connectionstyle="angle3,angleA=10,angleB=90") )
-    ax.hist(errors, bins, **fsD)
-
-
-if __name__ == "__main__":
-    x, y, m, n = create_data()
-    fig, axs = plt.subplots(1, 2)
-    fig.subplots_adjust(**adjust_fs(fig, left=6.0))
-    plot_data(axs[0], x, y, m, n)
-    plot_error_hist(axs[1], x, y, m, n)
-    fig.savefig("linear_least_squares.pdf")
-    plt.close()
diff --git a/regression/lecture/regression-chapter.tex b/regression/lecture/regression-chapter.tex
index 2ca1ec2..45fdfda 100644
--- a/regression/lecture/regression-chapter.tex
+++ b/regression/lecture/regression-chapter.tex
@@ -23,47 +23,12 @@
 \item Fig 8.2 right: this should be a chi-squared distribution with one degree of freedom!
 \end{itemize}
 
-\begin{figure}[t]
-  \includegraphics[width=0.75\textwidth]{error_surface}
-  \titlecaption{Error surface.}{The two model parameters $m$ and $b$
-    define the base area of the surface plot. For each parameter
-    combination of slope and intercept the error is calculated. The
-    resulting surface has a minimum which indicates the parameter
-    combination that best fits the data.}\label{errorsurfacefig}
-\end{figure}
-
-
-\begin{figure}[t]
-  \includegraphics[width=0.75\textwidth]{error_gradient}
-  \titlecaption{Gradient of the error surface.} {Each arrow points
-    into the direction of the greatest ascend at different positions
-    of the error surface shown in \figref{errorsurfacefig}. The
-    contour lines in the background illustrate the error surface. Warm
-    colors indicate high errors, colder colors low error values. Each
-    contour line connects points of equal
-    error.}\label{gradientquiverfig}
-\end{figure}
-
-\begin{figure}[t]
-  \includegraphics[width=0.45\textwidth]{gradient_descent}
-  \titlecaption{Gradient descent.}{The algorithm starts at an
-    arbitrary position. At each point the gradient is estimated and
-    the position is updated as long as the length of the gradient is
-    sufficiently large.The dots show the positions after each
-    iteration of the algorithm.} \label{gradientdescentfig}
-\end{figure}
-
-
 \subsection{Linear fits}
 \begin{itemize}
 \item Polyfit is easy: unique solution! $c x^2$ is also a linear fit.
 \item Example for overfitting with polyfit of a high order (=number of data points)
 \end{itemize}
 
-\section{Fitting in practice}
-
-Fit with matlab functions lsqcurvefit, polyfit
-
 \subsection{Non-linear fits}
 \begin{itemize}
 
diff --git a/regression/lecture/regression.tex b/regression/lecture/regression.tex
index 36e1808..2db6a31 100644
--- a/regression/lecture/regression.tex
+++ b/regression/lecture/regression.tex
@@ -227,6 +227,7 @@ to arbitrary precision.
 \end{ibox}
 
 \section{Gradient}
+
 Imagine to place a ball at some point on the cost function
 \figref{cubiccostfig}. Naturally, it would roll down the slope and
 eventually stop at the minimum of the error surface (if it had no
@@ -235,10 +236,21 @@ way to the minimum of the cost function. The ball always follows the
 steepest slope. Thus we need to figure out the direction of the slope
 at the position of the ball.
 
+\begin{figure}[t]
+  \includegraphics{cubicgradient}
+  \titlecaption{Derivative of the cost function.}{The gradient, the
+    derivative \eqref{costderivative} of the cost function, is
+    negative to the left of the minimum of the cost function, zero at,
+    and positive to the right of the minimum (left). For each value of
+    the parameter $c$ the negative gradient (arrows) points towards
+    the minimum of the cost function
+    (right).} \label{gradientcubicfig}
+\end{figure}
+
 In our one-dimensional example of a single free parameter the slope
 is simply the derivative of the cost function with respect to the
-parameter $c$. This derivative is called the
-\entermde{Gradient}{gradient} of the cost function:
+parameter $c$ (\figref{gradientcubicfig}, left). This derivative is called
+the \entermde{Gradient}{gradient} of the cost function:
 \begin{equation}
   \label{costderivative}
   \nabla f_{cost}(c) = \frac{{\rm d} f_{cost}(c)}{{\rm d} c}
@@ -252,7 +264,8 @@ can be approximated numerically by the difference quotient
   \approx \frac{f_{cost}(c + \Delta c) - f_{cost}(c)}{\Delta c}
 \end{equation}
 The derivative is positive for positive slopes. Since want to go down
-the hill, we choose the opposite direction.
+the hill, we choose the opposite direction (\figref{gradientcubicfig},
+right).
 
 \begin{exercise}{meanSquaredGradientCubic.m}{}\label{gradientcubic}
   Implement a function \varcode{meanSquaredGradientCubic()}, that
@@ -275,7 +288,7 @@ the hill, we choose the opposite direction.
   \titlecaption{Gradient descent.}{The algorithm starts at an
     arbitrary position. At each point the gradient is estimated and
     the position is updated as long as the length of the gradient is
-    sufficiently large.The dots show the positions after each
+    sufficiently large. The dots show the positions after each
     iteration of the algorithm.} \label{gradientdescentcubicfig}
 \end{figure}
 
@@ -378,18 +391,18 @@ functions that have more than a single parameter, in general $n$
 parameters. We then need to find the minimum in an $n$ dimensional
 parameter space.
 
-For our tiger problem, we could have also fitted the exponent $\alpha$
-of the power-law relation between size and weight, instead of assuming
-a cubic relation with $\alpha=3$:
+For our tiger problem, we could have also fitted the exponent $a$ of
+the power-law relation between size and weight, instead of assuming a
+cubic relation with $a=3$:
 \begin{equation}
   \label{powerfunc}
-  y = f(x; c, \alpha) = f(x; \vec p) = c\cdot x^\alpha
+  y = f(x; c, a) = f(x; \vec p) = c\cdot x^a
 \end{equation}
 We then could check whether the resulting estimate of the exponent
-$\alpha$ indeed is close to the expected power of three. The
-power-law \eqref{powerfunc} has two free parameters $c$ and $\alpha$.
+$a$ indeed is close to the expected power of three. The
+power-law \eqref{powerfunc} has two free parameters $c$ and $a$.
 Instead of a single parameter we are now dealing with a vector $\vec
-p$ containing $n$ parameter values. Here, $\vec p = (c, \alpha)$. All
+p$ containing $n$ parameter values. Here, $\vec p = (c, a)$. All
 the concepts we introduced on the example of the one dimensional
 problem of tiger weights generalize to $n$-dimensional problems. We
 only need to adapt a few things. The cost function for the mean
@@ -399,12 +412,27 @@ squared error reads
   f_{cost}(\vec p|\{(x_i, y_i)\}) = \frac{1}{N} \sum_{i=1}^N (y_i - f(x_i;\vec p))^2
 \end{equation}
 
+\begin{figure}[t]
+  \includegraphics{powergradientdescent}
+  \titlecaption{Gradient descent on an error surface.}{Contour plot of
+    the cost function \eqref{ndimcostfunc} for the fit of a power law
+    \eqref{powerfunc} to some data. Here the cost function is a long
+    and narrow valley on the plane spanned by the two parameters $c$
+    and $a$ of the power law. The red line marks the path of the
+    gradient descent algorithm. The gradient is always perpendicular
+    to the contour lines. The algorithm quickly descends into the
+    valley and then slowly creeps on the shallow bottom of the valley
+    towards the global minimum where it terminates (yellow circle).
+  } \label{powergradientdescentfig}
+\end{figure}
+
 For two-dimensional problems the graph of the cost function is an
 \enterm{error surface} (\determ{{Fehlerfl\"ache}}). The two parameters
 span a two-dimensional plane. The cost function assigns to each
 parameter combination on this plane a single value. This results in a
 landscape over the parameter plane with mountains and valleys and we
-are searching for the position of the bottom of the deepest valley.
+are searching for the position of the bottom of the deepest valley
+(\figref{powergradientdescentfig}).
 
 \begin{ibox}[tp]{\label{partialderivativebox}Partial derivatives and gradient}
   Some functions depend on more than a single variable. For example, the function
@@ -446,13 +474,13 @@ space of our example, the \entermde{Gradient}{gradient}
 (box~\ref{partialderivativebox}) of the cost function is a vector
 \begin{equation}
   \label{gradientpowerlaw}
-  \nabla f_{cost}(c, \alpha) = \left( \frac{\partial f_{cost}(c, \alpha)}{\partial c},
-  \frac{\partial f_{cost}(c, \alpha)}{\partial \alpha} \right)
+  \nabla f_{cost}(c, a) = \left( \frac{\partial f_{cost}(c, a)}{\partial c},
+  \frac{\partial f_{cost}(c, a)}{\partial a} \right)
 \end{equation}
 that points into the direction of the strongest ascend of the cost
 function. The gradient is given by the partial derivatives
 (box~\ref{partialderivativebox}) of the mean squared error with
-respect to the parameters $c$ and $\alpha$ of the power law
+respect to the parameters $c$ and $a$ of the power law
 relation. In general, for $n$-dimensional problems, the gradient is
 an $n-$ dimensional vector containing for each of the $n$ parameters
 $p_j$ the respective partial derivatives as coordinates:
@@ -468,6 +496,9 @@ parameter value $p_i$ becomes a vector $\vec p_i$ of parameter values:
   \label{ndimgradientdescent}
   \vec p_{i+1} = \vec p_i - \epsilon \cdot \nabla f_{cost}(\vec p_i)
 \end{equation}
+The algorithm proceeds along the negative gradient
+(\figref{powergradientdescentfig}).
+
 For the termination condition we need the length of the gradient. In
 one dimension it was just the absolute value of the derivative.
 For $n$ dimensions this is according to the \enterm{Pythagorean theorem}
@@ -479,8 +510,7 @@ the sum of the squared partial derivatives:
 \end{equation}
 The \code{norm()} function implements this.
 
-
-\section{Passing a function as an argument to another function}
+\subsection{Passing a function as an argument to another function}
 
 So far, all our code for the gradient descent algorithm was tailored
 to a specific function, the cubic relation \eqref{cubicfunc}. It would
@@ -500,7 +530,7 @@ argument to our function:
 \pageinputlisting[caption={Passing a function handle as an argument to a
   function.}]{funcplotterexamples.m}
 
-\section{Gradient descent algorithm for arbitrary functions}
+\subsection{Gradient descent algorithm for arbitrary functions}
 
 Now we are ready to adapt the gradient descent algorithm from
 exercise~\ref{gradientdescentexercise} to arbitrary functions with $n$
@@ -524,21 +554,21 @@ For testing our new function we need to implement the power law
   Write a function that implements \eqref{powerfunc}. The function
   gets as arguments a vector $x$ containing the $x$-data values and
   another vector containing as elements the parameters for the power
-  law, i.e. the factor $c$ and the power $\alpha$. It returns a vector
+  law, i.e. the factor $c$ and the power $a$. It returns a vector
   with the computed $y$ values for each $x$ value.
 \end{exercise}
 
 Now let's use the new gradient descent function to fit a power law to
-our tiger data-set:
+our tiger data-set (\figref{powergradientdescentfig}):
 
 \begin{exercise}{plotgradientdescentpower.m}{}
   Use the function \varcode{gradientDescent()} to fit the
   \varcode{powerLaw()} function to the simulated data from
   exercise~\ref{mseexercise}. Plot the returned values of the two
   parameters against each other. Compare the result of the gradient
-  descent method with the true values of $c$ and $\alpha$ used to
-  simulate the data. Observe the norm of the gradient and inspect the
-  plots to adapt $\epsilon$ (smaller than in
+  descent method with the true values of $c$ and $a$ used to simulate
+  the data. Observe the norm of the gradient and inspect the plots to
+  adapt $\epsilon$ (smaller than in
   exercise~\ref{plotgradientdescentexercise}) and the threshold (much
   larger) appropriately. Finally plot the data together with the best
   fitting power-law \eqref{powerfunc}.