diff --git a/designpattern/lecture/designpattern.tex b/designpattern/lecture/designpattern.tex index 5baada5..05ee5ab 100644 --- a/designpattern/lecture/designpattern.tex +++ b/designpattern/lecture/designpattern.tex @@ -72,13 +72,13 @@ x = [2:3:20]; % Some vector. y = []; % Empty vector for storing the results. for i=1:length(x) % The function get_something() returns a vector of unspecified size: - z = get_somehow_more(x(i)); + z = get_something(x(i)); % The content of z is appended to the result vector y: - y = [y; z(:)]; + y = [y, z(:)]; % The z(:) syntax ensures that we append column-vectors. end -% Process the results stored in the vector z: -mean(y) +% Process the results stored in the vector y: +mean(y, 1) \end{pagelisting} diff --git a/regression/code/gradientDescentPower.m b/regression/code/gradientDescentPower.m new file mode 100644 index 0000000..31390eb --- /dev/null +++ b/regression/code/gradientDescentPower.m @@ -0,0 +1,41 @@ +function [p, ps, mses] = gradientDescentPower(x, y, p0, epsilon, threshold) +% Gradient descent for fitting a power-law. +% +% Arguments: x, vector of the x-data values. +% y, vector of the corresponding y-data values. +% p0, vector with initial values for c and alpha. +% epsilon: factor multiplying the gradient. +% threshold: minimum value for gradient +% +% Returns: p, vector with the final parameter values. +% ps: 2D-vector with all the parameter tuples traversed. 
+% mses: vector with the corresponding mean squared errors + + p = p0; + gradient = ones(1, length(p0)) * 1000.0; + ps = []; + mses = []; + while norm(gradient) > threshold + ps = [ps, p(:)]; + mses = [mses, meanSquaredErrorPower(x, y, p)]; + gradient = meanSquaredGradientPower(x, y, p); + p = p - epsilon * gradient; + end +end + +function mse = meanSquaredErrorPower(x, y, p) + mse = mean((y - p(1)*x.^p(2)).^2); +end + +function gradmse = meanSquaredGradientPower(x, y, p) + gradmse = zeros(size(p, 1), size(p, 2)); + h = 1e-5; % stepsize for derivatives + mse = meanSquaredErrorPower(x, y, p); + for i = 1:length(p) % for each coordinate ... + pi = p; + pi(i) = pi(i) + h; % displace i-th parameter + msepi = meanSquaredErrorPower(x, y, pi); + gradmse(i) = (msepi - mse)/h; + end +end + diff --git a/regression/code/plotgradientdescentpower.m b/regression/code/plotgradientdescentpower.m new file mode 100644 index 0000000..f52bfe7 --- /dev/null +++ b/regression/code/plotgradientdescentpower.m @@ -0,0 +1,32 @@ +meansquarederrorline; % generate data + +p0 = [2.0, 1.0]; +eps = 0.00001; +thresh = 50.0; +[pest, ps, mses] = gradientDescentPower(x, y, p0, eps, thresh); +pest + +subplot(2, 2, 1); % top left panel +hold on; +plot(ps(1,:), ps(2,:), '.'); +plot(ps(1,end), ps(2,end), 'og'); +plot(c, 3.0, 'or'); % dot indicating true parameter values +hold off; +xlabel('c'); +ylabel('\alpha'); +subplot(2, 2, 3); % bottom left panel +plot(mses, '-o'); +xlabel('Iteration steps'); +ylabel('MSE'); +subplot(1, 2, 2); % right panel +hold on; +% generate x-values for plotting the fit: +xx = min(x):0.01:max(x); +yy = pest(1) * xx.^pest(2); +plot(xx, yy); +plot(x, y, 'o'); % plot original data +xlabel('Size [m]'); +ylabel('Weight [kg]'); +legend('fit', 'data', 'location', 'northwest'); +pause + diff --git a/regression/lecture/regression.tex b/regression/lecture/regression.tex index 472f3c6..69e7b05 100644 --- a/regression/lecture/regression.tex +++ b/regression/lecture/regression.tex @@ 
-319,7 +319,7 @@ differs sufficiently from zero. At steep slopes we take large steps (the distance between the red dots in \figref{gradientdescentcubicfig} is large). -\begin{exercise}{gradientDescentCubic.m}{} +\begin{exercise}{gradientDescentCubic.m}{}\label{gradientdescentexercise} Implement the gradient descent algorithm for the problem of fitting a cubic function \eqref{cubicfunc} to some measured data pairs $x$ and $y$ as a function \varcode{gradientDescentCubic()} that returns @@ -332,7 +332,7 @@ is large). for the absolute value of the gradient terminating the algorithm. \end{exercise} -\begin{exercise}{plotgradientdescentcubic.m}{} +\begin{exercise}{plotgradientdescentcubic.m}{}\label{plotgradientdescentexercise} Use the function \varcode{gradientDescentCubic()} to fit the simulated data from exercise~\ref{mseexercise}. Plot the returned values of the parameter $c$ as as well as the corresponding mean @@ -373,15 +373,14 @@ control how they behave. Now you know what this is all about. \section{N-dimensional minimization problems} So far we were concerned about finding the right value for a single -parameter that minimizes a cost function. The gradient descent method -for such one dimensional problems seems a bit like over kill. However, -often we deal with functions that have more than a single parameter, -in general $n$ parameter. We then need to find the minimum in an $n$ -dimensional parameter space. +parameter that minimizes a cost function. Often we deal with +functions that have more than a single parameter, in general $n$ +parameters. We then need to find the minimum in an $n$ dimensional +parameter space. 
For our tiger problem, we could have also fitted the exponent $\alpha$ of the power-law relation between size and weight, instead of assuming -a cubic relation: +a cubic relation with $\alpha=3$: \begin{equation} \label{powerfunc} y = f(x; c, \alpha) = f(x; \vec p) = c\cdot x^\alpha @@ -390,11 +389,11 @@ We then could check whether the resulting estimate of the exponent $\alpha$ indeed is close to the expected power of three. The power-law \eqref{powerfunc} has two free parameters $c$ and $\alpha$. Instead of a single parameter we are now dealing with a vector $\vec -p$ containing $n$ parameter values. Here, $\vec p = (c, -\alpha)$. Luckily, all the concepts we introduced on the example of -the one dimensional problem of tiger weights generalize to -$n$-dimensional problems. We only need to adapt a few things. The cost -function for the mean squared error reads +p$ containing $n$ parameter values. Here, $\vec p = (c, \alpha)$. All +the concepts we introduced on the example of the one dimensional +problem of tiger weights generalize to $n$-dimensional problems. We +only need to adapt a few things. The cost function for the mean +squared error reads \begin{equation} \label{ndimcostfunc} f_{cost}(\vec p|\{(x_i, y_i)\}) = \frac{1}{N} \sum_{i=1}^N (y_i - f(x_i;\vec p))^2 @@ -405,22 +404,18 @@ For two-dimensional problems the graph of the cost function is an span a two-dimensional plane. The cost function assigns to each parameter combination on this plane a single value. This results in a landscape over the parameter plane with mountains and valleys and we -are searching for the bottom of the deepest valley. +are searching for the position of the bottom of the deepest valley. -When we place a ball somewhere on the slopes of a hill it rolls -downwards and eventually stops at the bottom. The ball always rolls in -the direction of the steepest slope. 
- -\begin{ibox}[tp]{\label{partialderivativebox}Partial derivative and gradient} - Some functions that depend on more than a single variable: +\begin{ibox}[tp]{\label{partialderivativebox}Partial derivatives and gradient} + Some functions depend on more than a single variable. For example, the function \[ z = f(x,y) \] - for example depends on $x$ and $y$. Using the partial derivative + depends on both $x$ and $y$. Using the partial derivatives \[ \frac{\partial f(x,y)}{\partial x} = \lim\limits_{\Delta x \to 0} \frac{f(x + \Delta x,y) - f(x,y)}{\Delta x} \] and \[ \frac{\partial f(x,y)}{\partial y} = \lim\limits_{\Delta y \to 0} \frac{f(x, y + \Delta y) - f(x,y)}{\Delta y} \] - one can estimate the slope in the direction of the variables - individually by using the respective difference quotient - (Box~\ref{differentialquotientbox}). \vspace{1ex} + one can calculate the slopes in the directions of each of the + variables by means of the respective difference quotient + (see box~\ref{differentialquotientbox}). \vspace{1ex} \begin{minipage}[t]{0.5\textwidth} \mbox{}\\[-2ex] @@ -429,37 +424,93 @@ the direction of the steepest slope. \hfill \begin{minipage}[t]{0.46\textwidth} For example, the partial derivatives of - \[ f(x,y) = x^2+y^2 \] are + \[ f(x,y) = x^2+y^2 \vspace{-1ex} \] are \[ \frac{\partial f(x,y)}{\partial x} = 2x \; , \quad \frac{\partial f(x,y)}{\partial y} = 2y \; .\] - The gradient is a vector that is constructed from the partial derivatives: + The gradient is a vector with the partial derivatives as coordinates: \[ \nabla f(x,y) = \left( \begin{array}{c} \frac{\partial f(x,y)}{\partial x} \\[1ex] \frac{\partial f(x,y)}{\partial y} \end{array} \right) \] This vector points into the direction of the strongest ascend of $f(x,y)$. 
\end{minipage} - \vspace{0.5ex} The figure shows the contour lines of a bi-variate + \vspace{0.5ex} The figure shows the contour lines of a bivariate Gaussian $f(x,y) = \exp(-(x^2+y^2)/2)$ and the gradient (thick - arrows) and the corresponding two partial derivatives (thin arrows) - for three different locations. + arrows) together with the corresponding partial derivatives (thin + arrows) for three different locations. \end{ibox} -The \entermde{Gradient}{gradient} (Box~\ref{partialderivativebox}) of the -objective function is the vector +When we place a ball somewhere on the slopes of a hill, it rolls +downwards and eventually stops at the bottom. The ball always rolls in +the direction of the steepest slope. For a two-dimensional parameter +space of our example, the \entermde{Gradient}{gradient} +(box~\ref{partialderivativebox}) of the cost function is a vector +\begin{equation} + \label{gradientpowerlaw} + \nabla f_{cost}(c, \alpha) = \left( \frac{\partial f_{cost}(c, \alpha)}{\partial c}, + \frac{\partial f_{cost}(c, \alpha)}{\partial \alpha} \right) +\end{equation} +that points into the direction of the strongest ascend of the cost +function. The gradient is given by the partial derivatives +(box~\ref{partialderivativebox}) of the mean squared error with +respect to the parameters $c$ and $\alpha$ of the power law +relation. 
In general, for $n$-dimensional problems, the gradient is an +$n$-dimensional vector containing for each of the $n$ parameters +$p_j$ the respective partial derivatives as coordinates: \begin{equation} \label{gradient} - \nabla f_{cost}(m,b) = \left( \frac{\partial f(m,b)}{\partial m}, - \frac{\partial f(m,b)}{\partial b} \right) + \nabla f_{cost}(\vec p) = \left( \frac{\partial f_{cost}(\vec p)}{\partial p_j} \right) +\end{equation} + +The iterative equation \eqref{gradientdescent} of the gradient descent +stays exactly the same, with the only difference that the current +parameter value $p_i$ becomes a vector $\vec p_i$ of parameter values: +\begin{equation} + \label{ndimgradientdescent} + \vec p_{i+1} = \vec p_i - \epsilon \cdot \nabla f_{cost}(\vec p_i) \end{equation} -that points to the strongest ascend of the objective function. The -gradient is given by partial derivatives -(Box~\ref{partialderivativebox}) of the mean squared error with -respect to the parameters $m$ and $b$ of the straight line. +For the termination condition we need the length of the gradient. In +one dimension it was just the absolute value of the derivative. For +$n$ dimensions this is according to the \enterm{Pythagorean theorem} +(\determ[Pythagoras!Satz des]{Satz des Pythagoras}) the square root of +the sum of the squared partial derivatives: +\begin{equation} + \label{ndimabsgradient} + |\nabla f_{cost}(\vec p_i)| = \sqrt{\sum_{j=1}^n \left(\frac{\partial f_{cost}(\vec p)}{\partial p_j}\right)^2} +\end{equation} +The \code{norm()} function implements this. + +\begin{exercise}{gradientDescentPower.m}{} + Adapt the function \varcode{gradientDescentCubic()} from + exercise~\ref{gradientdescentexercise} to implement the gradient + descent algorithm for the power law \eqref{powerfunc}. The new + function takes a two-element vector $(c,\alpha)$ for the initial + parameter values and also returns the best parameter combination as + a two-element vector. 
Use a \varcode{for} loop over the two + dimensions for computing the gradient. +\end{exercise} + +\begin{exercise}{plotgradientdescentpower.m}{} + Use the function \varcode{gradientDescentPower()} to fit the + simulated data from exercise~\ref{mseexercise}. Plot the returned + values of the two parameters against each other. Compare the result + of the gradient descent method with the true values of $c$ and + $\alpha$ used to simulate the data. Observe the norm of the gradient + and inspect the plots to adapt $\epsilon$ (smaller than in + exercise~\ref{plotgradientdescentexercise}) and the threshold (much + larger) appropriately. Finally plot the data together with the best + fitting power-law \eqref{powerfunc}. +\end{exercise} + +\section{Curve fit for arbitrary functions} +So far, all our code for the gradient descent algorithm was tailored +to a specific function, the cubic relation \eqref{cubicfunc} or the +power law \eqref{powerfunc}. +\section{XXX} For example, you measure the response of a passive membrane to a -current step and you want to estimate membrane time constant. Then you +current step and you want to estimate the membrane time constant. Then you need to fit an exponential function \begin{equation} \label{expfunc}