[regression] finished n-dim minimization
parent 891515caf8
commit bfb2f66de2
@@ -72,13 +72,13 @@ x = [2:3:20]; % Some vector.
y = []; % Empty vector for storing the results.
for i=1:length(x)
% The function get_something() returns a vector of unspecified size:
-z = get_somehow_more(x(i));
+z = get_something(x(i));
% The content of z is appended to the result vector y:
-y = [y; z(:)];
+y = [y, z(:)];
% The z(:) syntax ensures that we append column-vectors.
end
-% Process the results stored in the vector z:
-mean(y)
+% Process the results stored in the vector y:
+mean(y, 1)
\end{pagelisting}

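The changed lines above switch from stacking all results into one long column vector to collecting one column per iteration and then averaging each column separately; note that the horizontal concatenation only works if every returned vector has the same length. A minimal sketch of the difference, with made-up data (all names are illustrative):

% Hypothetical example data: three "measurements" of equal length.
zs = {[1; 2; 3], [4; 5; 6], [7; 8; 9]};
ycol = [];                 % old variant: one long column vector
ymat = [];                 % new variant: one column per iteration
for i = 1:length(zs)
    z = zs{i};
    ycol = [ycol; z(:)];   % vertical concatenation -> 9x1 vector
    ymat = [ymat, z(:)];   % horizontal concatenation -> 3x3 matrix
end
mean(ycol)                 % single grand mean over all elements
mean(ymat, 1)              % one mean per column, i.e. per iteration
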
regression/code/gradientDescentPower.m (new file, 41 lines)
@@ -0,0 +1,41 @@
function [p, ps, mses] = gradientDescentPower(x, y, p0, epsilon, threshold)
% Gradient descent for fitting a power-law.
%
% Arguments: x, vector of the x-data values.
%            y, vector of the corresponding y-data values.
%            p0, vector with initial values for c and alpha.
%            epsilon: factor multiplying the gradient.
%            threshold: minimum value for gradient
%
% Returns: p, vector with the final parameter values.
%          ps: 2D-vector with all the parameter tuples traversed.
%          mses: vector with the corresponding mean squared errors

  p = p0;
  gradient = ones(1, length(p0)) * 1000.0;
  ps = [];
  mses = [];
  while norm(gradient) > threshold
    ps = [ps, p(:)];
    mses = [mses, meanSquaredErrorPower(x, y, p)];
    gradient = meanSquaredGradientPower(x, y, p);
    p = p - epsilon * gradient;
  end
end

function mse = meanSquaredErrorPower(x, y, p)
  mse = mean((y - p(1)*x.^p(2)).^2);
end

function gradmse = meanSquaredGradientPower(x, y, p)
  gradmse = zeros(size(p, 1), size(p, 2));
  h = 1e-5;                % stepsize for derivatives
  mse = meanSquaredErrorPower(x, y, p);
  for i = 1:length(p)      % for each coordinate ...
    pi = p;
    pi(i) = pi(i) + h;     % displace i-th parameter
    msepi = meanSquaredErrorPower(x, y, pi);
    gradmse(i) = (msepi - mse)/h;
  end
end

regression/code/plotgradientdescentpower.m (new file, 32 lines)
@@ -0,0 +1,32 @@
meansquarederrorline; % generate data

p0 = [2.0, 1.0];
eps = 0.00001;
thresh = 50.0;
[pest, ps, mses] = gradientDescentPower(x, y, p0, eps, thresh);
pest

subplot(2, 2, 1); % top left panel
hold on;
plot(ps(1,:), ps(2,:), '.');
plot(ps(1,end), ps(2,end), 'og');
plot(c, 3.0, 'or'); % dot indicating true parameter values
hold off;
xlabel('c');
ylabel('\alpha');
subplot(2, 2, 3); % bottom left panel
plot(mses, '-o');
xlabel('Iteration steps');
ylabel('MSE');
subplot(1, 2, 2); % right panel
hold on;
% generate x-values for plotting the fit:
xx = min(x):0.01:max(x);
yy = pest(1) * xx.^pest(2);
plot(xx, yy);
plot(x, y, 'o'); % plot original data
xlabel('Size [m]');
ylabel('Weight [kg]');
legend('fit', 'data', 'location', 'northwest');
pause

@@ -319,7 +319,7 @@ differs sufficiently from zero. At steep slopes we take large steps
(the distance between the red dots in \figref{gradientdescentcubicfig}
is large).

-\begin{exercise}{gradientDescentCubic.m}{}
+\begin{exercise}{gradientDescentCubic.m}{}\label{gradientdescentexercise}
Implement the gradient descent algorithm for the problem of fitting
a cubic function \eqref{cubicfunc} to some measured data pairs $x$
and $y$ as a function \varcode{gradientDescentCubic()} that returns
@@ -332,7 +332,7 @@ is large).
for the absolute value of the gradient terminating the algorithm.
\end{exercise}

-\begin{exercise}{plotgradientdescentcubic.m}{}
+\begin{exercise}{plotgradientdescentcubic.m}{}\label{plotgradientdescentexercise}
Use the function \varcode{gradientDescentCubic()} to fit the
simulated data from exercise~\ref{mseexercise}. Plot the returned
values of the parameter $c$ as well as the corresponding mean
@@ -373,15 +373,14 @@ control how they behave. Now you know what this is all about.
\section{N-dimensional minimization problems}

So far we were concerned about finding the right value for a single
-parameter that minimizes a cost function. The gradient descent method
-for such one dimensional problems seems a bit like over kill. However,
-often we deal with functions that have more than a single parameter,
-in general $n$ parameter. We then need to find the minimum in an $n$
-dimensional parameter space.
+parameter that minimizes a cost function. Often we deal with
+functions that have more than a single parameter, in general $n$
+parameters. We then need to find the minimum in an $n$-dimensional
+parameter space.

For our tiger problem, we could have also fitted the exponent $\alpha$
of the power-law relation between size and weight, instead of assuming
-a cubic relation:
+a cubic relation with $\alpha=3$:
\begin{equation}
\label{powerfunc}
y = f(x; c, \alpha) = f(x; \vec p) = c\cdot x^\alpha
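In MATLAB, the power law \eqref{powerfunc} with the parameter vector $\vec p = (c, \alpha)$ translates directly into a one-line function handle; a minimal sketch (the handle name is illustrative, not taken from the repository):

% power law y = c * x^alpha with p = [c, alpha]:
powerlaw = @(x, p) p(1) * x.^p(2);
powerlaw(2.0, [6.0, 3.0])   % c=6, alpha=3: returns 6*2^3 = 48
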
@@ -390,11 +389,11 @@ We then could check whether the resulting estimate of the exponent
$\alpha$ indeed is close to the expected power of three. The
power-law \eqref{powerfunc} has two free parameters $c$ and $\alpha$.
Instead of a single parameter we are now dealing with a vector $\vec
-p$ containing $n$ parameter values. Here, $\vec p = (c,
-\alpha)$. Luckily, all the concepts we introduced on the example of
-the one dimensional problem of tiger weights generalize to
-$n$-dimensional problems. We only need to adapt a few things. The cost
-function for the mean squared error reads
+p$ containing $n$ parameter values. Here, $\vec p = (c, \alpha)$. All
+the concepts we introduced for the one-dimensional
+problem of tiger weights generalize to $n$-dimensional problems. We
+only need to adapt a few things. The cost function for the mean
+squared error reads
\begin{equation}
\label{ndimcostfunc}
f_{cost}(\vec p|\{(x_i, y_i)\}) = \frac{1}{N} \sum_{i=1}^N (y_i - f(x_i;\vec p))^2
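The cost function \eqref{ndimcostfunc} then takes the whole parameter vector as its argument, just like meanSquaredErrorPower() above; a minimal sketch with simulated data (all names and values are made up):

% simulate noisy power-law data with c=6 and alpha=3:
x = 1.0:0.5:5.0;
y = 6.0 * x.^3 + 10.0*randn(size(x));
% mean squared error as a function of the parameter vector p = [c, alpha]:
powerlaw = @(x, p) p(1) * x.^p(2);
fcost = @(p) mean((y - powerlaw(x, p)).^2);
fcost([6.0, 3.0])   % roughly the noise variance (about 100)
fcost([2.0, 1.0])   % much larger for a poor parameter choice
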
@@ -405,22 +404,18 @@ For two-dimensional problems the graph of the cost function is an
span a two-dimensional plane. The cost function assigns to each
parameter combination on this plane a single value. This results in a
landscape over the parameter plane with mountains and valleys and we
-are searching for the bottom of the deepest valley.
+are searching for the position of the bottom of the deepest valley.

-When we place a ball somewhere on the slopes of a hill it rolls
-downwards and eventually stops at the bottom. The ball always rolls in
-the direction of the steepest slope.
-
-\begin{ibox}[tp]{\label{partialderivativebox}Partial derivative and gradient}
-Some functions that depend on more than a single variable:
+\begin{ibox}[tp]{\label{partialderivativebox}Partial derivatives and gradient}
+Some functions depend on more than a single variable. For example, the function
\[ z = f(x,y) \]
-for example depends on $x$ and $y$. Using the partial derivative
+depends on both $x$ and $y$. Using the partial derivatives
\[ \frac{\partial f(x,y)}{\partial x} = \lim\limits_{\Delta x \to 0} \frac{f(x + \Delta x,y) - f(x,y)}{\Delta x} \]
and
\[ \frac{\partial f(x,y)}{\partial y} = \lim\limits_{\Delta y \to 0} \frac{f(x, y + \Delta y) - f(x,y)}{\Delta y} \]
-one can estimate the slope in the direction of the variables
-individually by using the respective difference quotient
-(Box~\ref{differentialquotientbox}). \vspace{1ex}
+one can calculate the slopes in the directions of each of the
+variables by means of the respective difference quotient
+(see box~\ref{differentialquotientbox}). \vspace{1ex}

\begin{minipage}[t]{0.5\textwidth}
\mbox{}\\[-2ex]
@@ -429,37 +424,93 @@ the direction of the steepest slope.
\hfill
\begin{minipage}[t]{0.46\textwidth}
For example, the partial derivatives of
-\[ f(x,y) = x^2+y^2 \] are
+\[ f(x,y) = x^2+y^2 \vspace{-1ex} \] are
\[ \frac{\partial f(x,y)}{\partial x} = 2x \; , \quad \frac{\partial f(x,y)}{\partial y} = 2y \; .\]

-The gradient is a vector that is constructed from the partial derivatives:
+The gradient is a vector with the partial derivatives as coordinates:
\[ \nabla f(x,y) = \left( \begin{array}{c} \frac{\partial f(x,y)}{\partial x} \\[1ex] \frac{\partial f(x,y)}{\partial y} \end{array} \right) \]
This vector points in the direction of the steepest ascent of
$f(x,y)$.
\end{minipage}

-\vspace{0.5ex} The figure shows the contour lines of a bi-variate
+\vspace{0.5ex} The figure shows the contour lines of a bivariate
Gaussian $f(x,y) = \exp(-(x^2+y^2)/2)$ and the gradient (thick
-arrows) and the corresponding two partial derivatives (thin arrows)
-for three different locations.
+arrows) together with the corresponding partial derivatives (thin
+arrows) for three different locations.
\end{ibox}
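The partial derivatives and the gradient from the box can be approximated numerically with difference quotients, just as meanSquaredGradientPower() above does for the cost function; a minimal sketch for the example $f(x,y) = x^2+y^2$ (all names are illustrative):

f = @(x, y) x.^2 + y.^2;   % example function from the box
x0 = 1.0; y0 = 2.0;        % point at which to evaluate the gradient
h = 1e-5;                  % small step for the difference quotients
dfdx = (f(x0 + h, y0) - f(x0, y0)) / h;   % approximately 2*x0 = 2
dfdy = (f(x0, y0 + h) - f(x0, y0)) / h;   % approximately 2*y0 = 4
gradf = [dfdx, dfdy]       % numerical gradient, close to [2, 4]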

-The \entermde{Gradient}{gradient} (Box~\ref{partialderivativebox}) of the
-objective function is the vector
+When we place a ball somewhere on the slopes of a hill, it rolls
+downwards and eventually stops at the bottom. The ball always rolls in
+the direction of the steepest slope. For a two-dimensional parameter
+space of our example, the \entermde{Gradient}{gradient}
+(box~\ref{partialderivativebox}) of the cost function is a vector
+\begin{equation}
+\label{gradientpowerlaw}
+\nabla f_{cost}(c, \alpha) = \left( \frac{\partial f_{cost}(c, \alpha)}{\partial c},
+\frac{\partial f_{cost}(c, \alpha)}{\partial \alpha} \right)
+\end{equation}
+that points in the direction of the steepest ascent of the cost
+function. The gradient is given by the partial derivatives
+(box~\ref{partialderivativebox}) of the mean squared error with
+respect to the parameters $c$ and $\alpha$ of the power law
+relation. In general, for $n$-dimensional problems, the gradient is an
+$n$-dimensional vector containing for each of the $n$ parameters
+$p_j$ the respective partial derivatives as coordinates:
\begin{equation}
\label{gradient}
-\nabla f_{cost}(m,b) = \left( \frac{\partial f(m,b)}{\partial m},
-\frac{\partial f(m,b)}{\partial b} \right)
+\nabla f_{cost}(\vec p) = \left( \frac{\partial f_{cost}(\vec p)}{\partial p_j} \right)
+\end{equation}
+
+The iterative equation \eqref{gradientdescent} of the gradient descent
+stays exactly the same, with the only difference that the current
+parameter value $p_i$ becomes a vector $\vec p_i$ of parameter values:
+\begin{equation}
+\label{ndimgradientdescent}
+\vec p_{i+1} = \vec p_i - \epsilon \cdot \nabla f_{cost}(\vec p_i)
\end{equation}
-that points to the strongest ascend of the objective function. The
-gradient is given by partial derivatives
-(Box~\ref{partialderivativebox}) of the mean squared error with
-respect to the parameters $m$ and $b$ of the straight line.
+For the termination condition we need the length of the gradient. In
+one dimension it was just the absolute value of the derivative. For
+$n$ dimensions this is, according to the \enterm{Pythagorean theorem}
+(\determ[Pythagoras!Satz des]{Satz des Pythagoras}), the square root of
+the sum of the squared partial derivatives:
+\begin{equation}
+\label{ndimabsgradient}
+|\nabla f_{cost}(\vec p_i)| = \sqrt{\sum_{j=1}^n \left(\frac{\partial f_{cost}(\vec p)}{\partial p_j}\right)^2}
+\end{equation}
+The \code{norm()} function implements this.
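Putting the pieces together, a single step of the $n$-dimensional descent \eqref{ndimgradientdescent} with its termination check could look as follows in MATLAB; a minimal sketch, with made-up numbers:

p = [2.0, 1.0];            % current parameter vector (c, alpha)
gradient = [-30.0, 40.0];  % made-up gradient of the cost function at p
epsilon = 0.00001;         % factor multiplying the gradient
threshold = 50.0;          % termination threshold for the gradient length
p = p - epsilon * gradient          % one gradient descent step
glen = sqrt(sum(gradient.^2))       % length of the gradient: 50
norm(gradient)                      % the same value, computed by norm()
if glen <= threshold
    disp('gradient is small enough: stop iterating');
end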

+\begin{exercise}{gradientDescentPower.m}{}
+Adapt the function \varcode{gradientDescentCubic()} from
+exercise~\ref{gradientdescentexercise} to implement the gradient
+descent algorithm for the power law \eqref{powerfunc}. The new
+function takes a two-element vector $(c,\alpha)$ for the initial
+parameter values and also returns the best parameter combination as
+a two-element vector. Use a \varcode{for} loop over the two
+dimensions for computing the gradient.
+\end{exercise}

+\begin{exercise}{plotgradientdescentpower.m}{}
+Use the function \varcode{gradientDescentPower()} to fit the
+simulated data from exercise~\ref{mseexercise}. Plot the returned
+values of the two parameters against each other. Compare the result
+of the gradient descent method with the true values of $c$ and
+$\alpha$ used to simulate the data. Observe the norm of the gradient
+and inspect the plots to adapt $\epsilon$ (smaller than in
+exercise~\ref{plotgradientdescentexercise}) and the threshold (much
+larger) appropriately. Finally plot the data together with the best
+fitting power-law \eqref{powerfunc}.
+\end{exercise}


+\section{Curve fit for arbitrary functions}

+So far, all our code for the gradient descent algorithm was tailored
+to a specific function, the cubic relation \eqref{cubicfunc} or the
+power law \eqref{powerfunc}.

+\section{XXX}
For example, you measure the response of a passive membrane to a
-current step and you want to estimate membrane time constant. Then you
+current step and you want to estimate the membrane time constant. Then you
need to fit an exponential function
\begin{equation}
\label{expfunc}