[regression] first exercise
regression/lecture/cubicgradient.py (new file, 89 lines added)
@@ -0,0 +1,89 @@
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mt
from plotstyle import *

def create_data():
    # wikipedia:
    # Generally, males vary in total length from 250 to 390 cm and
    # weigh between 90 and 306 kg
    c = 6
    x = np.arange(2.2, 3.9, 0.05)
    y = c * x**3.0
    rng = np.random.RandomState(32281)
    noise = rng.randn(len(x))*50
    y += noise
    return x, y, c


def gradient_descent(x, y):
    n = 20
    dc = 0.01
    eps = 0.0001
    cc = 1.1
    cs = []
    mses = []
    for k in range(n):
        m0 = np.mean((y-(cc*x**3.0))**2.0)
        m1 = np.mean((y-((cc+dc)*x**3.0))**2.0)
        dmdc = (m1 - m0)/dc
        cs.append(cc)
        mses.append(m0)
        cc -= eps*dmdc
    return cs, mses


def plot_gradient(ax, x, y, c):
    ccs = np.linspace(0.5, 10.0, 200)
    mses = np.zeros(len(ccs))
    for i, cc in enumerate(ccs):
        mses[i] = np.mean((y-(cc*x**3.0))**2.0)
    cmin = ccs[np.argmin(mses)]
    gradient = np.diff(mses)/(ccs[1]-ccs[0])

    ax.plot([cmin, cmin], [-10000, 10000], **lsSpine)
    ax.plot([ccs[0], ccs[-1]], [0, 0], **lsSpine)
    ax.plot(ccs[:-1], gradient, **lsBm)
    ax.set_xlabel('c')
    ax.set_ylabel('Derivative')
    ax.set_xlim(0, 10)
    ax.set_ylim(-10000, 10000)
    ax.set_xticks(np.arange(0.0, 10.1, 2.0))
    ax.set_yticks(np.arange(-10000, 10001, 10000))
    ax.set_yticklabels(['', '0', ''])


def plot_mse(ax, x, y, c):
    ccs = np.linspace(0.5, 10.0, 200)
    mses = np.zeros(len(ccs))
    for i, cc in enumerate(ccs):
        mses[i] = np.mean((y-(cc*x**3.0))**2.0)
    cmin = ccs[np.argmin(mses)]
    gradient = np.diff(mses)/(ccs[1]-ccs[0])

    ay = 1500.0
    asB = dict(arrowprops=dict(arrowstyle="->", shrinkA=0, shrinkB=0,
                               color=lsB['color'], lw=2))
    ax.annotate('', xy=(3.0, ay), xytext=(1.0, ay), **asB)
    ax.annotate('', xy=(5.0, ay), xytext=(3.8, ay), **asB)
    ax.annotate('', xy=(6.2, ay), xytext=(7.4, ay), **asB)
    ax.annotate('', xy=(8.0, ay), xytext=(10.0, ay), **asB)
    ax.plot([cmin, cmin], [0, 30000], **lsSpine)
    ax.plot(ccs, mses, zorder=10, **lsAm)
    ax.set_xlabel('c')
    ax.set_ylabel('Mean squared error')
    ax.set_xlim(0, 10)
    ax.set_ylim(0, 25000)
    ax.set_xticks(np.arange(0.0, 10.1, 2.0))
    ax.set_yticks(np.arange(0, 30001, 10000))
    ax.set_yticklabels(['0', '', '', ''])


if __name__ == "__main__":
    x, y, c = create_data()
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=cm_size(figure_width, 1.1*figure_height))
    fig.subplots_adjust(wspace=0.5, **adjust_fs(left=5.0, right=1.2))
    plot_gradient(ax1, x, y, c)
    plot_mse(ax2, x, y, c)
    fig.savefig("cubicgradient.pdf")
    plt.close()
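The descent loop in gradient_descent() depends only on numpy; plotstyle is needed just for the figure styling. A minimal standalone sketch of the same update rule (values copied from the script above; the final print is added for illustration and is not part of the commit):

import numpy as np

# synthetic data as in create_data()
x = np.arange(2.2, 3.9, 0.05)
y = 6.0 * x**3.0 + np.random.RandomState(32281).randn(len(x)) * 50

cc = 1.1      # initial guess for the cubic coefficient
dc = 0.01     # step size of the difference quotient
eps = 0.0001  # learning rate
for k in range(20):
    m0 = np.mean((y - cc * x**3.0)**2.0)         # cost at cc
    m1 = np.mean((y - (cc + dc) * x**3.0)**2.0)  # cost at cc + dc
    cc -= eps * (m1 - m0) / dc                   # step against the estimated slope
print(cc)  # should settle close to the true coefficient c = 6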
regression/lecture/powergradientdescent.py (new file, 67 lines added)
@@ -0,0 +1,67 @@
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mt
from plotstyle import *


def power_law(x, c, a):
    return c*x**a


def create_data():
    # wikipedia:
    # Generally, males vary in total length from 250 to 390 cm and
    # weigh between 90 and 306 kg
    c = 6.0
    x = np.arange(2.2, 3.9, 0.05)
    y = power_law(x, c, 3.0)
    rng = np.random.RandomState(32281)
    noise = rng.randn(len(x))*50
    y += noise
    return x, y, c


def gradient_descent(x, y, func, p0):
    n = 20000
    h = 1e-7
    ph = np.identity(len(p0))*h
    eps = 0.00001
    p = p0
    ps = np.zeros((n, len(p0)))
    mses = np.zeros(n)
    for k in range(n):
        m0 = np.mean((y-func(x, *p))**2.0)
        gradient = np.array([(np.mean((y-func(x, *(p+ph[:,i])))**2.0) - m0)/h
                             for i in range(len(p))])
        ps[k,:] = p
        mses[k] = m0
        p -= eps*gradient
    return ps, mses


def plot_gradient_descent(ax, x, y, c, ps, mses):
    cs = np.linspace(0.0, 10.0, 300)
    bs = np.linspace(1.0, 5.5, 180)
    mse = np.zeros((len(bs), len(cs)))
    for i in range(len(bs)):
        for k in range(len(cs)):
            mse[i, k] = np.mean((y-power_law(x, cs[k], bs[i]))**2.0)
    z = np.log10(mse)
    ax.contourf(cs, bs, z, levels=(3.3, 3.36, 3.5, 4.0, 4.5, 5.5, 6.5, 7.5, 8.5),
                cmap='Blues_r')
    ax.plot(ps[::5,0], ps[::5,1], **lsBm)
    ax.plot(ps[-1,0], ps[-1,1], **psC)
    ax.set_xlabel('c')
    ax.set_ylabel('a')
    ax.yaxis.set_major_locator(mt.MultipleLocator(1.0))
    ax.set_aspect('equal')


if __name__ == "__main__":
    x, y, c = create_data()
    ps, mses = gradient_descent(x, y, power_law, [1.0, 1.0])
    fig, ax = plt.subplots(figsize=cm_size(figure_width, 1.3*figure_height))
    fig.subplots_adjust(**adjust_fs(left=4.5, right=1.0))
    plot_gradient_descent(ax, x, y, c, ps, mses)
    fig.savefig("powergradientdescent.pdf")
    plt.close()
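The core of gradient_descent() above is the finite-difference gradient: each parameter is perturbed by h in turn, using one column of np.identity(len(p0))*h. A condensed sketch of that step for an arbitrary func(x, *p), restating the loop body rather than adding functionality from the commit:

import numpy as np

def numerical_gradient(x, y, func, p, h=1e-7):
    # mean squared error at the current parameter vector p
    m0 = np.mean((y - func(x, *p))**2.0)
    # forward difference: perturb one parameter at a time by h
    ph = np.identity(len(p)) * h
    return np.array([(np.mean((y - func(x, *(p + ph[:, i])))**2.0) - m0) / h
                     for i in range(len(p))])

In the script this gradient drives the update p -= eps*gradient for n = 20000 iterations; plot_gradient_descent() then draws every fifth point of the resulting path on top of the log10 mean-squared-error contours.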
@@ -198,7 +198,20 @@ So we need a different approach. We want a procedure that finds the
minimum of the cost function with a minimal number of computations and
to arbitrary precision.

-\begin{ibox}[t]{\label{differentialquotientbox}Difference quotient and derivative}
+\section{Gradient}
+
+\begin{figure}[t]
+\includegraphics{cubicgradient}
+\titlecaption{Derivative of the cost function.}{The gradient, the
+derivative \eqref{costderivative} of the cost function, is
+negative to the left of the minimum (vertical line) of the cost
+function, zero (horizontal line) at, and positive to the right of
+the minimum (left). For each value of the parameter $c$ the
+negative gradient (arrows) points towards the minimum of the cost
+function (right).} \label{gradientcubicfig}
+\end{figure}
+
+\begin{ibox}[b]{\label{differentialquotientbox}Difference quotient and derivative}
\includegraphics[width=0.33\textwidth]{derivative}
\hfill
\begin{minipage}[b]{0.63\textwidth}
@@ -226,8 +239,6 @@ to arbitrary precision.
sufficiently small $\Delta x$.
\end{ibox}

-\section{Gradient}
-
Imagine placing a ball at some point on the cost function
\figref{cubiccostfig}. Naturally, it would roll down the slope and
eventually stop at the minimum of the error surface (if it had no
@@ -236,17 +247,6 @@ way to the minimum of the cost function. The ball always follows the
steepest slope. Thus we need to figure out the direction of the slope
at the position of the ball.

-\begin{figure}[t]
-\includegraphics{cubicgradient}
-\titlecaption{Derivative of the cost function.}{The gradient, the
-derivative \eqref{costderivative} of the cost function, is
-negative to the left of the minimum (vertical line) of the cost
-function, zero (horizontal line) at, and positive to the right of
-the minimum (left). For each value of the parameter $c$ the
-negative gradient (arrows) points towards the minimum of the cost
-function (right).} \label{gradientcubicfig}
-\end{figure}
-
In our one-dimensional example of a single free parameter the slope is
simply the derivative of the cost function with respect to the
parameter $c$ (\figref{gradientcubicfig}, left). This derivative is called
@@ -434,7 +434,7 @@ landscape over the parameter plane with mountains and valleys and we
are searching for the position of the bottom of the deepest valley
(\figref{powergradientdescentfig}).

-\begin{ibox}[tp]{\label{partialderivativebox}Partial derivatives and gradient}
+\begin{ibox}[t]{\label{partialderivativebox}Partial derivatives and gradient}
Some functions depend on more than a single variable. For example, the function
\[ z = f(x,y) \]
depends on both $x$ and $y$. Using the partial derivatives
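For the two-parameter power-law fit in powergradientdescent.py, the gradient that this box introduces is the vector of partial derivatives of the cost function with respect to $c$ and $a$; the symbol $f_{\rm cost}$ is chosen here for illustration and need not match the chapter's notation:

\[ \nabla f_{\rm cost}(c, a) = \left( \frac{\partial f_{\rm cost}}{\partial c}, \; \frac{\partial f_{\rm cost}}{\partial a} \right) \]

gradient_descent() in powergradientdescent.py approximates each of these components with a forward difference quotient of step size $h$.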
@@ -642,13 +642,13 @@ generations. In this way the algorithm is not directed towards higher
fitness, as the gradient descent method would be. Rather, some
neighborhood of the parameter space is randomly probed. That way it is
even possible to escape a local maximum and find a potentially better
-maximum. For this reason, \enterm{genetic algorithms} try to mimic
-evolution in the context of high-dimensional optimization problems, in
-particular with discrete parameter values. In biological evolution,
-the objective function, however, is not a fixed function. It may
-change in time by changing abiotic and biotic environmental
-conditions, making this a very complex but also interesting
-optimization problem.
+maximum. For this reason, \enterm[genetic algorithm]{genetic
+algorithms} try to mimic evolution in the context of
+high-dimensional optimization problems, in particular with discrete
+parameter values. In biological evolution, the objective function,
+however, is not a fixed function. It may change in time by changing
+abiotic and biotic environmental conditions, making this a very
+complex but also interesting optimization problem.

How should a neuron or neural network be designed? As a particular
aspect of the general evolution of a species, this is a fundamental