[regression] finished chapter

parent 46affef86d
commit dea6319e75
@@ -1,8 +1,8 @@
 function mse = meanSquaredErrorCubic(x, y, c)
 % Mean squared error between data pairs and a cubic relation.
 %
-% Arguments: x, vector of the input values
-%            y, vector of the corresponding measured output values
+% Arguments: x, vector of the x-data values
+%            y, vector of the corresponding y-data values
 %            c, the factor for the cubic relation.
 %
 % Returns: mse, the mean-squared-error.
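
Note for readers following along in Python rather than MATLAB: a minimal sketch of the cost function documented above could look like this (function name and test data are illustrative assumptions, not part of the repository):

import numpy as np

def mean_squared_error_cubic(x, y, c):
    # mean squared error between the measured y-data and the cubic prediction c*x^3
    return np.mean((y - c * x**3) ** 2)

# illustrative usage with made-up data:
x = np.arange(1.0, 11.0)
y = 6.0 * x**3
print(mean_squared_error_cubic(x, y, 6.0))   # 0.0 at the true factor
print(mean_squared_error_cubic(x, y, 5.5))   # > 0 away from it
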
@@ -75,7 +75,7 @@ def plot_mse_min(ax, x, y, c):
 if __name__ == "__main__":
     x, y, c = create_data()
     fig, (ax1, ax2) = plt.subplots(1, 2, figsize=cm_size(figure_width, 1.1*figure_height))
-    fig.subplots_adjust(**adjust_fs(left=8.0, right=1))
+    fig.subplots_adjust(**adjust_fs(left=8.0, right=1.2))
     plot_mse(ax1, x, y, c)
     plot_mse_min(ax2, x, y, c)
     fig.savefig("cubiccost.pdf")
@@ -58,8 +58,8 @@ def plot_error_hist(ax, x, y, c):
 
 if __name__ == "__main__":
     x, y, c = create_data()
-    fig, (ax1, ax2) = plt.subplots(1, 2)
-    fig.subplots_adjust(wspace=0.4, **adjust_fs(left=6.0, right=1.2))
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=cm_size(figure_width, 0.9*figure_height))
+    fig.subplots_adjust(wspace=0.5, **adjust_fs(left=6.0, right=1.2))
     plot_data_errors(ax1, x, y, c)
     plot_error_hist(ax2, x, y, c)
     fig.savefig("cubicerrors.pdf")
@@ -42,8 +42,8 @@ def plot_data_fac(ax, x, y, c):
 if __name__ == "__main__":
     x, y, c = create_data()
     print('n=%d' % len(x))
-    fig, (ax1, ax2) = plt.subplots(1, 2)
-    fig.subplots_adjust(wspace=0.5, **adjust_fs(fig, left=6.0, right=1.5))
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=cm_size(figure_width, 0.9*figure_height))
+    fig.subplots_adjust(wspace=0.5, **adjust_fs(fig, left=6.0, right=1.2))
     plot_data(ax1, x, y)
     plot_data_fac(ax2, x, y, c)
     fig.savefig("cubicfunc.pdf")
@@ -73,7 +73,7 @@ if __name__ == "__main__":
     x, y, c = create_data()
     cs, mses = gradient_descent(x, y)
     fig, (ax1, ax2) = plt.subplots(1, 2, figsize=cm_size(figure_width, 1.1*figure_height))
-    fig.subplots_adjust(wspace=0.2, **adjust_fs(left=8.0, right=0.5))
+    fig.subplots_adjust(**adjust_fs(left=8.0, right=1.2))
     plot_mse(ax1, x, y, c, cs)
     plot_descent(ax2, cs, mses)
     fig.savefig("cubicmse.pdf")
@@ -1,56 +0,0 @@
-import numpy as np
-from mpl_toolkits.mplot3d import Axes3D
-import matplotlib.pyplot as plt
-import matplotlib.cm as cm
-from plotstyle import *
-
-def create_data():
-    m = 0.75
-    n= -40
-    x = np.arange(10.,110., 2.5)
-    y = m * x + n;
-    rng = np.random.RandomState(37281)
-    noise = rng.randn(len(x))*15
-    y += noise
-    return x, y, m, n
-
-
-def plot_error_plane(ax, x, y, m, n):
-    ax.set_xlabel('Slope m')
-    ax.set_ylabel('Intercept b')
-    ax.set_zlabel('Mean squared error')
-    ax.set_xlim(-4.5, 5.0)
-    ax.set_ylim(-60.0, -20.0)
-    ax.set_zlim(0.0, 700.0)
-    ax.set_xticks(np.arange(-4, 5, 2))
-    ax.set_yticks(np.arange(-60, -19, 10))
-    ax.set_zticks(np.arange(0, 700, 200))
-    ax.grid(True)
-    ax.w_xaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
-    ax.w_yaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
-    ax.w_zaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
-    ax.invert_xaxis()
-    ax.view_init(25, 40)
-    slopes = np.linspace(-4.5, 5, 40)
-    intercepts = np.linspace(-60, -20, 40)
-    x, y = np.meshgrid(slopes, intercepts)
-    error_surf = np.zeros(x.shape)
-    for i, s in enumerate(slopes) :
-        for j, b in enumerate(intercepts) :
-            error_surf[j,i] = np.mean((y-s*x-b)**2.0)
-    ax.plot_surface(x, y, error_surf, rstride=1, cstride=1, cmap=cm.coolwarm,
-                    linewidth=0, shade=True)
-    # Minimum:
-    mini = np.unravel_index(np.argmin(error_surf), error_surf.shape)
-    ax.scatter(slopes[mini[1]], intercepts[mini[0]], [0.0], color='#cc0000', s=60)
-
-
-if __name__ == "__main__":
-    x, y, m, n = create_data()
-    fig = plt.figure()
-    ax = fig.add_subplot(1, 1, 1, projection='3d')
-    plot_error_plane(ax, x, y, m, n)
-    fig.set_size_inches(7., 5.)
-    fig.subplots_adjust(**adjust_fs(fig, 1.0, 0.0, 0.0, 0.0))
-    fig.savefig("error_surface.pdf")
-    plt.close()
@@ -1,60 +0,0 @@
-import numpy as np
-import matplotlib.pyplot as plt
-from plotstyle import *
-
-def create_data():
-    m = 0.75
-    n= -40
-    x = np.arange(10.,110., 2.5)
-    y = m * x + n;
-    rng = np.random.RandomState(37281)
-    noise = rng.randn(len(x))*15
-    y += noise
-    return x, y, m, n
-
-
-def plot_data(ax, x, y):
-    ax.plot(x, y, **psA)
-    ax.set_xlabel('Input x')
-    ax.set_ylabel('Output y')
-    ax.set_xlim(0, 120)
-    ax.set_ylim(-80, 80)
-    ax.set_xticks(np.arange(0,121, 40))
-    ax.set_yticks(np.arange(-80,81, 40))
-
-
-def plot_data_slopes(ax, x, y, m, n):
-    ax.plot(x, y, **psA)
-    xx = np.asarray([2, 118])
-    for i in np.linspace(0.3*m, 2.0*m, 5):
-        ax.plot(xx, i*xx+n, **lsBm)
-    ax.set_xlabel('Input x')
-    #ax.set_ylabel('Output y')
-    ax.set_xlim(0, 120)
-    ax.set_ylim(-80, 80)
-    ax.set_xticks(np.arange(0,121, 40))
-    ax.set_yticks(np.arange(-80,81, 40))
-
-
-def plot_data_intercepts(ax, x, y, m, n):
-    ax.plot(x, y, **psA)
-    xx = np.asarray([2, 118])
-    for i in np.linspace(n-1*n, n+1*n, 5):
-        ax.plot(xx, m*xx + i, **lsBm)
-    ax.set_xlabel('Input x')
-    #ax.set_ylabel('Output y')
-    ax.set_xlim(0, 120)
-    ax.set_ylim(-80, 80)
-    ax.set_xticks(np.arange(0,121, 40))
-    ax.set_yticks(np.arange(-80,81, 40))
-
-
-if __name__ == "__main__":
-    x, y, m, n = create_data()
-    fig, axs = plt.subplots(1, 3)
-    fig.subplots_adjust(wspace=0.5, **adjust_fs(fig, left=6.0, right=1.5))
-    plot_data(axs[0], x, y)
-    plot_data_slopes(axs[1], x, y, m, n)
-    plot_data_intercepts(axs[2], x, y, m, n)
-    fig.savefig("lin_regress.pdf")
-    plt.close()
@@ -1,65 +0,0 @@
-import numpy as np
-import matplotlib.pyplot as plt
-from plotstyle import *
-
-def create_data():
-    m = 0.75
-    n= -40
-    x = np.concatenate( (np.arange(10.,110., 2.5), np.arange(0.,120., 2.0)) )
-    y = m * x + n;
-    rng = np.random.RandomState(37281)
-    noise = rng.randn(len(x))*15
-    y += noise
-    return x, y, m, n
-
-
-def plot_data(ax, x, y, m, n):
-    ax.set_xlabel('Input x')
-    ax.set_ylabel('Output y')
-    ax.set_xlim(0, 120)
-    ax.set_ylim(-80, 80)
-    ax.set_xticks(np.arange(0,121, 40))
-    ax.set_yticks(np.arange(-80,81, 40))
-    ax.annotate('Error',
-                xy=(x[34]+1, y[34]+15), xycoords='data',
-                xytext=(80, -50), textcoords='data', ha='left',
-                arrowprops=dict(arrowstyle="->", relpos=(0.9,1.0),
-                connectionstyle="angle3,angleA=50,angleB=-30") )
-    ax.plot(x[:40], y[:40], zorder=0, **psAm)
-    inxs = [3, 13, 16, 19, 25, 34, 36]
-    ax.plot(x[inxs], y[inxs], zorder=10, **psA)
-    xx = np.asarray([2, 118])
-    ax.plot(xx, m*xx+n, **lsBm)
-    for i in inxs :
-        xx = [x[i], x[i]]
-        yy = [m*x[i]+n, y[i]]
-        ax.plot(xx, yy, zorder=5, **lsDm)
-
-
-def plot_error_hist(ax, x, y, m, n):
-    ax.set_xlabel('Squared error')
-    ax.set_ylabel('Frequency')
-    bins = np.arange(0.0, 602.0, 50.0)
-    ax.set_xlim(bins[0], bins[-1])
-    ax.set_ylim(0, 35)
-    ax.set_xticks(np.arange(bins[0], bins[-1], 100))
-    ax.set_yticks(np.arange(0, 36, 10))
-    errors = (y-(m*x+n))**2.0
-    mls = np.mean(errors)
-    ax.annotate('Mean\nsquared\nerror',
-                xy=(mls, 0.5), xycoords='data',
-                xytext=(350, 20), textcoords='data', ha='left',
-                arrowprops=dict(arrowstyle="->", relpos=(0.0,0.2),
-                connectionstyle="angle3,angleA=10,angleB=90") )
-    ax.hist(errors, bins, **fsD)
-
-
-
-if __name__ == "__main__":
-    x, y, m, n = create_data()
-    fig, axs = plt.subplots(1, 2)
-    fig.subplots_adjust(**adjust_fs(fig, left=6.0))
-    plot_data(axs[0], x, y, m, n)
-    plot_error_hist(axs[1], x, y, m, n)
-    fig.savefig("linear_least_squares.pdf")
-    plt.close()
@@ -23,47 +23,12 @@
 \item Fig 8.2 right: this should be a chi-squared distribution with one degree of freedom!
 \end{itemize}
 
-\begin{figure}[t]
-\includegraphics[width=0.75\textwidth]{error_surface}
-\titlecaption{Error surface.}{The two model parameters $m$ and $b$
-define the base area of the surface plot. For each parameter
-combination of slope and intercept the error is calculated. The
-resulting surface has a minimum which indicates the parameter
-combination that best fits the data.}\label{errorsurfacefig}
-\end{figure}
-
-
-\begin{figure}[t]
-\includegraphics[width=0.75\textwidth]{error_gradient}
-\titlecaption{Gradient of the error surface.} {Each arrow points
-into the direction of the greatest ascend at different positions
-of the error surface shown in \figref{errorsurfacefig}. The
-contour lines in the background illustrate the error surface. Warm
-colors indicate high errors, colder colors low error values. Each
-contour line connects points of equal
-error.}\label{gradientquiverfig}
-\end{figure}
-
-\begin{figure}[t]
-\includegraphics[width=0.45\textwidth]{gradient_descent}
-\titlecaption{Gradient descent.}{The algorithm starts at an
-arbitrary position. At each point the gradient is estimated and
-the position is updated as long as the length of the gradient is
-sufficiently large.The dots show the positions after each
-iteration of the algorithm.} \label{gradientdescentfig}
-\end{figure}
-
-
 \subsection{Linear fits}
 \begin{itemize}
 \item Polyfit is easy: unique solution! $c x^2$ is also a linear fit.
 \item Example for overfitting with polyfit of a high order (=number of data points)
 \end{itemize}
 
-\section{Fitting in practice}
-
-Fit with matlab functions lsqcurvefit, polyfit
-
 
 \subsection{Non-linear fits}
 \begin{itemize}
@@ -227,6 +227,7 @@ to arbitrary precision.
 \end{ibox}
 
 \section{Gradient}
+
 Imagine to place a ball at some point on the cost function
 \figref{cubiccostfig}. Naturally, it would roll down the slope and
 eventually stop at the minimum of the error surface (if it had no
@@ -235,10 +236,21 @@ way to the minimum of the cost function. The ball always follows the
 steepest slope. Thus we need to figure out the direction of the slope
 at the position of the ball.
 
+\begin{figure}[t]
+\includegraphics{cubicgradient}
+\titlecaption{Derivative of the cost function.}{The gradient, the
+derivative \eqref{costderivative} of the cost function, is
+negative to the left of the minimum of the cost function, zero at,
+and positive to the right of the minimum (left). For each value of
+the parameter $c$ the negative gradient (arrows) points towards
+the minimum of the cost function
+(right).} \label{gradientcubicfig}
+\end{figure}
+
 In our one-dimensional example of a single free parameter the slope is
 simply the derivative of the cost function with respect to the
-parameter $c$. This derivative is called the
-\entermde{Gradient}{gradient} of the cost function:
+parameter $c$ (\figref{gradientcubicfig}, left). This derivative is called
+the \entermde{Gradient}{gradient} of the cost function:
 \begin{equation}
 \label{costderivative}
 \nabla f_{cost}(c) = \frac{{\rm d} f_{cost}(c)}{{\rm d} c}
@@ -252,7 +264,8 @@ can be approximated numerically by the difference quotient
 \approx \frac{f_{cost}(c + \Delta c) - f_{cost}(c)}{\Delta c}
 \end{equation}
 The derivative is positive for positive slopes. Since want to go down
-the hill, we choose the opposite direction.
+the hill, we choose the opposite direction (\figref{gradientcubicfig},
+right).
 
 \begin{exercise}{meanSquaredGradientCubic.m}{}\label{gradientcubic}
 Implement a function \varcode{meanSquaredGradientCubic()}, that
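
Note for readers: the difference-quotient gradient described in this hunk can be sketched in Python as well (the chapter's exercise asks for a MATLAB meanSquaredGradientCubic(); the function names, step size and test data below are illustrative assumptions):

import numpy as np

def mean_squared_error_cubic(x, y, c):
    # cost function: mean squared error of the cubic prediction c*x^3
    return np.mean((y - c * x**3) ** 2)

def mean_squared_gradient_cubic(x, y, c, dc=1e-5):
    # difference quotient (f_cost(c + dc) - f_cost(c)) / dc
    return (mean_squared_error_cubic(x, y, c + dc)
            - mean_squared_error_cubic(x, y, c)) / dc

x = np.arange(1.0, 11.0)
y = 6.0 * x**3
print(mean_squared_gradient_cubic(x, y, 5.0))  # negative: left of the minimum
print(mean_squared_gradient_cubic(x, y, 7.0))  # positive: right of the minimum
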
@@ -275,7 +288,7 @@ the hill, we choose the opposite direction.
 \titlecaption{Gradient descent.}{The algorithm starts at an
 arbitrary position. At each point the gradient is estimated and
 the position is updated as long as the length of the gradient is
-sufficiently large.The dots show the positions after each
+sufficiently large. The dots show the positions after each
 iteration of the algorithm.} \label{gradientdescentcubicfig}
 \end{figure}
 
@@ -378,18 +391,18 @@ functions that have more than a single parameter, in general $n$
 parameters. We then need to find the minimum in an $n$ dimensional
 parameter space.
 
-For our tiger problem, we could have also fitted the exponent $\alpha$
-of the power-law relation between size and weight, instead of assuming
-a cubic relation with $\alpha=3$:
+For our tiger problem, we could have also fitted the exponent $a$ of
+the power-law relation between size and weight, instead of assuming a
+cubic relation with $a=3$:
 \begin{equation}
 \label{powerfunc}
-y = f(x; c, \alpha) = f(x; \vec p) = c\cdot x^\alpha
+y = f(x; c, a) = f(x; \vec p) = c\cdot x^a
 \end{equation}
 We then could check whether the resulting estimate of the exponent
-$\alpha$ indeed is close to the expected power of three. The
-power-law \eqref{powerfunc} has two free parameters $c$ and $\alpha$.
+$a$ indeed is close to the expected power of three. The
+power-law \eqref{powerfunc} has two free parameters $c$ and $a$.
 Instead of a single parameter we are now dealing with a vector $\vec
-p$ containing $n$ parameter values. Here, $\vec p = (c, \alpha)$. All
+p$ containing $n$ parameter values. Here, $\vec p = (c, a)$. All
 the concepts we introduced on the example of the one dimensional
 problem of tiger weights generalize to $n$-dimensional problems. We
 only need to adapt a few things. The cost function for the mean
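
Note for readers: the parameter-vector notation introduced in this hunk translates directly into code; a small Python sketch (the chapter's own exercise implements powerLaw() in MATLAB, and the names and test values here are assumptions):

import numpy as np

def power_law(x, p):
    # y = f(x; p) = c * x^a with parameter vector p = (c, a)
    c, a = p
    return c * x**a

def cost(p, x, y):
    # mean squared error for an arbitrary parameter vector p
    return np.mean((y - power_law(x, p)) ** 2)

x = np.linspace(1.0, 10.0, 50)
y = 0.5 * x**3.0                          # made-up data with c=0.5, a=3
print(cost(np.array([0.5, 3.0]), x, y))   # zero at the true parameters
print(cost(np.array([0.5, 2.8]), x, y))   # larger away from them
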
@@ -399,12 +412,27 @@ squared error reads
 f_{cost}(\vec p|\{(x_i, y_i)\}) = \frac{1}{N} \sum_{i=1}^N (y_i - f(x_i;\vec p))^2
 \end{equation}
 
+\begin{figure}[t]
+\includegraphics{powergradientdescent}
+\titlecaption{Gradient descent on an error surface.}{Contour plot of
+the cost function \eqref{ndimcostfunc} for the fit of a power law
+\eqref{powerfunc} to some data. Here the cost function is a long
+and narrow valley on the plane spanned by the two parameters $c$
+and $a$ of the power law. The red line marks the path of the
+gradient descent algorithm. The gradient is always perpendicular
+to the contour lines. The algorithm quickly descends into the
+valley and then slowly creeps on the shallow bottom of the valley
+towards the global minimum where it terminates (yellow circle).
+} \label{powergradientdescentfig}
+\end{figure}
+
 For two-dimensional problems the graph of the cost function is an
 \enterm{error surface} (\determ{{Fehlerfl\"ache}}). The two parameters
 span a two-dimensional plane. The cost function assigns to each
 parameter combination on this plane a single value. This results in a
 landscape over the parameter plane with mountains and valleys and we
-are searching for the position of the bottom of the deepest valley.
+are searching for the position of the bottom of the deepest valley
+(\figref{powergradientdescentfig}).
 
 \begin{ibox}[tp]{\label{partialderivativebox}Partial derivatives and gradient}
 Some functions depend on more than a single variable. For example, the function
@@ -446,13 +474,13 @@ space of our example, the \entermde{Gradient}{gradient}
 (box~\ref{partialderivativebox}) of the cost function is a vector
 \begin{equation}
 \label{gradientpowerlaw}
-\nabla f_{cost}(c, \alpha) = \left( \frac{\partial f_{cost}(c, \alpha)}{\partial c},
-\frac{\partial f_{cost}(c, \alpha)}{\partial \alpha} \right)
+\nabla f_{cost}(c, a) = \left( \frac{\partial f_{cost}(c, a)}{\partial c},
+\frac{\partial f_{cost}(c, a)}{\partial a} \right)
 \end{equation}
 that points into the direction of the strongest ascend of the cost
 function. The gradient is given by the partial derivatives
 (box~\ref{partialderivativebox}) of the mean squared error with
-respect to the parameters $c$ and $\alpha$ of the power law
+respect to the parameters $c$ and $a$ of the power law
 relation. In general, for $n$-dimensional problems, the gradient is an
 $n-$ dimensional vector containing for each of the $n$ parameters
 $p_j$ the respective partial derivatives as coordinates:
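
Note for readers: written out for the power-law mean squared error, the two partial derivatives of the gradient in \eqref{gradientpowerlaw} can be computed directly; a hedged Python sketch (the derivatives are worked out here by hand with the chain rule, not taken from the chapter, and assume x > 0):

import numpy as np

def gradient_power_law_mse(p, x, y):
    # partial derivatives of the mean squared error of y = c * x^a
    # with respect to c and a (chain rule; requires x > 0 because of log)
    c, a = p
    residual = y - c * x**a
    d_c = -2.0 * np.mean(residual * x**a)
    d_a = -2.0 * np.mean(residual * c * x**a * np.log(x))
    return np.array([d_c, d_a])

x = np.linspace(1.0, 10.0, 50)
y = 0.5 * x**3.0
print(gradient_power_law_mse(np.array([0.5, 3.0]), x, y))  # ~ (0, 0) at the minimum
print(gradient_power_law_mse(np.array([0.6, 2.9]), x, y))  # nonzero away from it
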
@@ -468,6 +496,9 @@ parameter value $p_i$ becomes a vector $\vec p_i$ of parameter values:
 \label{ndimgradientdescent}
 \vec p_{i+1} = \vec p_i - \epsilon \cdot \nabla f_{cost}(\vec p_i)
 \end{equation}
+The algorithm proceeds along the negative gradient
+(\figref{powergradientdescentfig}).
+
 For the termination condition we need the length of the gradient. In
 one dimension it was just the absolute value of the derivative. For
 $n$ dimensions this is according to the \enterm{Pythagorean theorem}
@@ -479,8 +510,7 @@ the sum of the squared partial derivatives:
 \end{equation}
 The \code{norm()} function implements this.
 
-
-\section{Passing a function as an argument to another function}
+\subsection{Passing a function as an argument to another function}
 
 So far, all our code for the gradient descent algorithm was tailored
 to a specific function, the cubic relation \eqref{cubicfunc}. It would
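
Note for readers: the $n$-dimensional update rule \eqref{ndimgradientdescent}, the norm-based termination condition, and the idea of passing the cost function in as an argument combine into a short loop; a Python sketch (the chapter's gradientDescent() is a MATLAB exercise, and the step size, threshold and iteration limit below are illustrative assumptions):

import numpy as np

def numerical_gradient(f, p, dp=1e-5):
    # approximate each partial derivative of f at p by a difference quotient
    grad = np.zeros(len(p))
    for j in range(len(p)):
        step = np.zeros(len(p))
        step[j] = dp
        grad[j] = (f(p + step) - f(p)) / dp
    return grad

def gradient_descent(f, p0, eps=0.1, thresh=1e-3, max_iter=100000):
    # f is passed in as an argument, so the same loop works for any cost function
    p = np.asarray(p0, dtype=float)
    path = [p.copy()]
    for _ in range(max_iter):
        grad = numerical_gradient(f, p)
        if np.linalg.norm(grad) < thresh:   # termination: length of the gradient
            break
        p = p - eps * grad                  # p_{i+1} = p_i - eps * grad f(p_i)
        path.append(p.copy())
    return p, path

# illustrative usage: minimize a simple two-parameter cost function
f = lambda p: (p[0] - 1.0)**2 + (p[1] + 2.0)**2
p_best, path = gradient_descent(f, [0.0, 0.0])
print(p_best)   # close to (1, -2)
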
@@ -500,7 +530,7 @@ argument to our function:
 \pageinputlisting[caption={Passing a function handle as an argument to a function.}]{funcplotterexamples.m}
 
 
-\section{Gradient descent algorithm for arbitrary functions}
+\subsection{Gradient descent algorithm for arbitrary functions}
 
 Now we are ready to adapt the gradient descent algorithm from
 exercise~\ref{gradientdescentexercise} to arbitrary functions with $n$
@@ -524,21 +554,21 @@ For testing our new function we need to implement the power law
 Write a function that implements \eqref{powerfunc}. The function
 gets as arguments a vector $x$ containing the $x$-data values and
 another vector containing as elements the parameters for the power
-law, i.e. the factor $c$ and the power $\alpha$. It returns a vector
+law, i.e. the factor $c$ and the power $a$. It returns a vector
 with the computed $y$ values for each $x$ value.
 \end{exercise}
 
 Now let's use the new gradient descent function to fit a power law to
-our tiger data-set:
+our tiger data-set (\figref{powergradientdescentfig}):
 
 \begin{exercise}{plotgradientdescentpower.m}{}
 Use the function \varcode{gradientDescent()} to fit the
 \varcode{powerLaw()} function to the simulated data from
 exercise~\ref{mseexercise}. Plot the returned values of the two
 parameters against each other. Compare the result of the gradient
-descent method with the true values of $c$ and $\alpha$ used to
-simulate the data. Observe the norm of the gradient and inspect the
-plots to adapt $\epsilon$ (smaller than in
+descent method with the true values of $c$ and $a$ used to simulate
+the data. Observe the norm of the gradient and inspect the plots to
+adapt $\epsilon$ (smaller than in
 exercise~\ref{plotgradientdescentexercise}) and the threshold (much
 larger) appropriately. Finally plot the data together with the best
 fitting power-law \eqref{powerfunc}.
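
Note for readers: as an independent cross-check of the MATLAB exercise above, a similar power-law fit can be reproduced in Python; a sketch assuming scipy is available and using a generic minimizer rather than the chapter's own gradient descent (the simulated data below is made up and is not the chapter's tiger data set):

import numpy as np
from scipy.optimize import minimize

# made-up data roughly following y = c * x^a with c = 0.7, a = 3
rng = np.random.RandomState(1)
x = np.linspace(1.0, 10.0, 100)
y = 0.7 * x**3.0 + rng.randn(len(x)) * 10.0

def cost(p):
    # mean squared error of the power law for parameter vector p = (c, a)
    c, a = p
    return np.mean((y - c * x**a) ** 2)

res = minimize(cost, x0=[1.0, 2.0])
print(res.x)   # estimated (c, a), expected to end up close to (0.7, 3.0)
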