diff --git a/regression/code/checkdescent.m b/regression/code/checkdescent.m
new file mode 100644
index 0000000..cc56466
--- /dev/null
+++ b/regression/code/checkdescent.m
@@ -0,0 +1,25 @@
+% data:
+load('lin_regression.mat')
+
+% compute mean squared error for a range of slopes and intercepts:
+slopes = -5:0.25:5;
+intercepts = -30:1:30;
+errors = zeros(length(slopes), length(intercepts));
+for i = 1:length(slopes)
+    for j = 1:length(intercepts)
+        errors(i,j) = lsqError([slopes(i), intercepts(j)], x, y);
+    end
+end
+
+% minimum of error surface:
+[me, mi] = min(errors(:));
+[ia, ib] = ind2sub(size(errors), mi);
+eparams = [slopes(ia), intercepts(ib)];
+
+% gradient descent:
+pstart = [-2. 10.];
+[params, errors] = descent(x, y, pstart);
+
+% comparison:
+fprintf('descent: %6.3f %6.3f\n', params(1), params(2));
+fprintf('surface: %6.3f %6.3f\n', eparams(1), eparams(2));
diff --git a/regression/code/descent.m b/regression/code/descent.m
new file mode 100644
index 0000000..1888414
--- /dev/null
+++ b/regression/code/descent.m
@@ -0,0 +1,15 @@
+function [params, errors] = descent(xdata, ydata, pstart)
+    mingradient = 0.1;
+    eps = 0.01;
+
+    errors = [];
+    params = pstart;
+    count = 1;
+    gradient = [100.0, 100.0];
+    while norm(gradient) > mingradient
+        gradient = lsqGradient(params, xdata, ydata);
+        errors(count) = lsqError(params, xdata, ydata);
+        params = params - eps .* gradient;
+        count = count + 1;
+    end
+end
diff --git a/regression/code/descentfit.m b/regression/code/descentfit.m
new file mode 100644
index 0000000..ecabb68
--- /dev/null
+++ b/regression/code/descentfit.m
@@ -0,0 +1,22 @@
+clear
+close all
+load('lin_regression.mat')
+
+pstart = [-2. 10.];
+[params, errors] = descent(x, y, pstart);
+
+figure()
+subplot(2,1,1)
+hold on
+scatter(x, y, 'displayname', 'data')
+xx = min(x):0.01:max(x);
+fx = params(1)*xx + params(2);
+plot(xx, fx, 'displayname', 'fit')
+xlabel('Input')
+ylabel('Output')
+grid on
+legend show
+subplot(2,1,2)
+plot(errors)
+xlabel('optimization steps')
+ylabel('error')
\ No newline at end of file
diff --git a/regression/code/errorSurface.m b/regression/code/errorSurface.m
index 4bd3fd2..0e88c1e 100644
--- a/regression/code/errorSurface.m
+++ b/regression/code/errorSurface.m
@@ -1,6 +1,6 @@
 load('lin_regression.mat');
 
-% compute mean squared error for a range of sloopes and intercepts:
+% compute mean squared error for a range of slopes and intercepts:
 slopes = -5:0.25:5;
 intercepts = -30:1:30;
 error_surf = zeros(length(slopes), length(intercepts));
diff --git a/regression/code/linefit.m b/regression/code/linefit.m
new file mode 100644
index 0000000..df600e2
--- /dev/null
+++ b/regression/code/linefit.m
@@ -0,0 +1,18 @@
+% data:
+load('lin_regression.mat')
+
+% gradient descent:
+pstart = [-2. 10.];
+[params, errors] = descent(x, y, pstart);
+
+% lsqcurvefit:
+line = @(p, x) x.* p(1) + p(2);
+cparams = lsqcurvefit(line, pstart, x, y);
+
+% polyfit:
+pparams = polyfit(x, y, 1);
+
+% comparison:
+fprintf('descent: %6.3f %6.3f\n', params(1), params(2));
+fprintf('lsqcurvefit: %6.3f %6.3f\n', cparams(1), cparams(2));
+fprintf('polyfit: %6.3f %6.3f\n', pparams(1), pparams(2));
diff --git a/regression/exercises/exercises01-de.tex b/regression/exercises/exercises01-de.tex
new file mode 100644
index 0000000..b7a835f
--- /dev/null
+++ b/regression/exercises/exercises01-de.tex
@@ -0,0 +1,82 @@
+\documentclass[12pt,a4paper,pdftex]{exam}
+
+\usepackage[german]{babel}
+\usepackage{natbib}
+\usepackage{graphicx}
+\usepackage[small]{caption}
+\usepackage{sidecap}
+\usepackage{pslatex}
+\usepackage{amsmath}
+\usepackage{amssymb}
+\setlength{\marginparwidth}{2cm}
+\usepackage[breaklinks=true,bookmarks=true,bookmarksopen=true,pdfpagemode=UseNone,pdfstartview=FitH,colorlinks=true,citecolor=blue]{hyperref}
+
+%%%%% text size %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\usepackage[left=20mm,right=20mm,top=25mm,bottom=25mm]{geometry}
+\pagestyle{headandfoot}
+\ifprintanswers
+\newcommand{\stitle}{: Solutions}
+\else
+\newcommand{\stitle}{}
+\fi
+\header{{\bfseries\large Exercise 11\stitle}}{{\bfseries\large Gradient descent}}{{\bfseries\large January 9th, 2018}}
+\firstpagefooter{Dr. Jan Grewe}{Phone: 29 74588}{Email:
+  jan.grewe@uni-tuebingen.de}
+\runningfooter{}{\thepage}{}
+
+\setlength{\baselineskip}{15pt}
+\setlength{\parindent}{0.0cm}
+\setlength{\parskip}{0.3cm}
+\renewcommand{\baselinestretch}{1.15}
+
+\newcommand{\code}[1]{\texttt{#1}}
+\renewcommand{\solutiontitle}{\noindent\textbf{Solution:}\par\noindent}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\begin{document}
+
+\input{instructions}
+
+\begin{questions}
+
+  \question Implementiere den Gradientenabstieg f\"ur das Problem der
+  Parameteranpassung der linearen Geradengleichung an die Messdaten in
+  der Datei \emph{lin\_regression.mat}.
+
+  Die daf\"ur ben\"otigten Zutaten haben wir aus den vorangegangenen
+  \"Ubungen bereits vorbereitet. Wir brauchen: 1. Die Fehlerfunktion
+  (\code{meanSquareError()}), 2. die Zielfunktion (\code{lsqError()})
+  und 3. den Gradienten (\code{lsqGradient()}). Der Algorithmus f\"ur
+  den Abstieg lautet:
+
+  \begin{enumerate}
+  \item Starte mit einer beliebigen Parameterkombination $p_0 = (m_0,
+    b_0)$.
+  \item \label{computegradient} Berechne den Gradienten an der
+    aktuellen Position $p_i$.
+  \item Wenn die L\"ange des Gradienten einen bestimmten Wert
+    unterschreitet, haben wir das Minimum gefunden und k\"onnen die
+    Suche abbrechen. Wir suchen ja das Minimum, bei dem der Gradient
+    gleich Null ist. Da aus numerischen Gr\"unden der Gradient nie
+    exakt Null werden wird, k\"onnen wir nur fordern, dass er
+    hinreichend klein wird (z.B. \code{norm(gradient) < 0.1}).
+  \item \label{gradientstep} Gehe einen kleinen Schritt ($\epsilon =
+    0.01$) in die entgegengesetzte Richtung des Gradienten:
+    \[p_{i+1} = p_i - \epsilon \cdot \nabla f_{cost}(m_i, b_i)\]
+  \item Wiederhole die Schritte \ref{computegradient} --
+    \ref{gradientstep}.
+  \end{enumerate}
+
+
+  \begin{parts}
+    \part Implementiere den Gradientenabstieg und merke Dir f\"ur jeden Schritt
+    die Parameterkombination und den zugeh\"origen Fehler.
+    \part Erstelle einen Plot, der die Originaldaten sowie die Vorhersage mit der
+    besten Parameterkombination darstellt.
+    \part Stelle in einem weiteren Plot die Entwicklung des Fehlers als Funktion der
+    Optimierungsschritte dar.
+  \end{parts}
+
+\end{questions}
+
+\end{document}
diff --git a/regression/exercises/exercises01.tex b/regression/exercises/exercises01.tex
index 5b6024d..40241cb 100644
--- a/regression/exercises/exercises01.tex
+++ b/regression/exercises/exercises01.tex
@@ -19,7 +19,7 @@
 \else
 \newcommand{\stitle}{}
 \fi
-\header{{\bfseries\large Exercise 11\stitle}}{{\bfseries\large Gradient descend}}{{\bfseries\large January 9th, 2018}}
+\header{{\bfseries\large Exercise 11\stitle}}{{\bfseries\large Gradient descent}}{{\bfseries\large January 9th, 2018}}
 \firstpagefooter{Dr. Jan Grewe}{Phone: 29 74588}{Email:
   jan.grewe@uni-tuebingen.de}
 \runningfooter{}{\thepage}{}
@@ -31,6 +31,24 @@
 
 \newcommand{\code}[1]{\texttt{#1}}
 \renewcommand{\solutiontitle}{\noindent\textbf{Solution:}\par\noindent}
+%%%%% listings %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\usepackage{listings}
+\lstset{
+  language=Matlab,
+  basicstyle=\ttfamily\footnotesize,
+  numbers=left,
+  numberstyle=\tiny,
+  title=\lstname,
+  showstringspaces=false,
+  commentstyle=\itshape\color{darkgray},
+  breaklines=true,
+  breakautoindent=true,
+  columns=flexible,
+  frame=single,
+  xleftmargin=1em,
+  xrightmargin=1em,
+  aboveskip=10pt
+}
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \begin{document}
@@ -39,42 +57,68 @@
 
 \begin{questions}
 
-  \question Implementiere den Gradientenabstieg f\"ur das Problem der
-  Parameteranpassung der linearen Geradengleichung an die Messdaten in
-  der Datei \emph{lin\_regression.mat}.
+  \question Implement the gradient descent for finding the parameters
+  of a straight line that we want to fit to the data in the file
+  \emph{lin\_regression.mat}.
 
-  Die daf\"ur ben\"otigten Zutaten haben wir aus den vorangegangenen
-  \"Ubungen bereits vorbereitet. Wir brauchen: 1. Die Fehlerfunktion
-  (\code{meanSquareError()}), 2. die Zielfunktion (\code{lsqError()})
-  und 3. den Gradienten (\code{lsqGradient()}). Der Algorithmus f\"ur
-  den Abstieg lautet:
+  In the lecture we already prepared the necessary functions: 1. the
+  error function (\code{meanSquareError()}), 2. the cost function
+  (\code{lsqError()}), and 3. the gradient (\code{lsqGradient()}).
+
+  The algorithm for the descent towards the minimum of the cost
+  function is as follows:
 
   \begin{enumerate}
-  \item Starte mit einer beliebigen Parameterkombination $p_0 = (m_0,
-    b_0)$.
+  \item Start with some arbitrary parameter values $p_0 = (m_0, b_0)$
+    for the slope and the intercept of the straight line.
-  \item \label{computegradient} Berechne den Gradienten an der
-    akutellen Position $p_i$.
+  \item \label{computegradient} Compute the gradient of the cost function
+    at the current values of the parameters $p_i$.
-  \item Wenn die L\"ange des Gradienten einen bestimmten Wert
-    unterschreitet, haben wir das Minum gefunden und k\"onnen die
-    Suche abbrechen. Wir suchen ja das Minimum, bei dem der Gradient
-    gleich Null ist. Da aus numerischen Gr\"unden der Gradient nie
-    exakt Null werden wird, k\"onnen wir nur fordern, dass er
-    hinreichend klein wird (z.B. \code{norm(gradient) < 0.1}).
+  \item If the magnitude (length) of the gradient is smaller than some
+    small number, the algorithm has converged close to the minimum of
+    the cost function and we abort the descent. Right at the minimum the
+    magnitude of the gradient is zero. However, since we determine
+    the gradient numerically, it will never be exactly zero. This is
+    why we require the gradient to be sufficiently small
+    (e.g. \code{norm(gradient) < 0.1}).
-  \item \label{gradientstep} Gehe einen kleinen Schritt ($\epsilon =
-    0.01$) in die entgegensetzte Richtung des Gradienten:
+  \item \label{gradientstep} Move against the gradient by a small step
+    ($\epsilon = 0.01$):
     \[p_{i+1} = p_i - \epsilon \cdot \nabla f_{cost}(m_i, b_i)\]
-  \item Wiederhole die Schritte \ref{computegradient} --
-    \ref{gradientstep}.
+  \item Repeat steps \ref{computegradient} -- \ref{gradientstep}.
   \end{enumerate}
 
-
   \begin{parts}
-  \part Implementiere den Gradientenabstieg und merke Dir f\"ur jeden Schritt
-  die Parameterkombination und den zugehörigen Fehler.
-  \part Erstelle einen Plot der die Originaldaten sowie die Vorhersage mit der
-  besten Parameterkombination darstellt.
-  \part Stelle in einem weiteren Plot die Entwicklung des Fehlers als Funktion der
-  Optimierungsschritte dar.
+    \part Implement the gradient descent in a function that returns
+    the parameter values at the minimum of the cost function and a vector
+    with the value of the cost function at each step of the algorithm.
+    \begin{solution}
+      \lstinputlisting{../code/descent.m}
+    \end{solution}
+
+    \part Plot the data and the straight line with the parameter
+    values that you found with the gradient descent method.
+
+    \part Plot the development of the cost as a function of the
+    iteration step.
+    \begin{solution}
+      \lstinputlisting{../code/descentfit.m}
+    \end{solution}
+
+    \part Find the position of the minimum of the cost function by
+    means of the \code{min()} function. Compare with the result of the
+    gradient descent method. Vary the value of $\epsilon$ and the
+    minimum gradient. What are good values such that the gradient
+    descent gets closest to the true minimum of the cost function?
+    \begin{solution}
+      \lstinputlisting{../code/checkdescent.m}
+    \end{solution}
+
+    \part Use the functions \code{polyfit()} and \code{lsqcurvefit()}
+    provided by MATLAB to find the slope and intercept of a straight
+    line that fits the data.
+    \begin{solution}
+      \lstinputlisting{../code/linefit.m}
+    \end{solution}
+
   \end{parts}
 \end{questions}
 \end{document}
diff --git a/regression/lecture/regression.tex b/regression/lecture/regression.tex
index dd6c990..f72a406 100644
--- a/regression/lecture/regression.tex
+++ b/regression/lecture/regression.tex
@@ -368,6 +368,7 @@ Punkte in Abbildung \ref{gradientdescentfig} gro{\ss}.
   Optimierungsschritt an.}
 \label{gradientdescentfig}
 \end{figure}
+\setboolean{showexercisesolutions}{false}
 \begin{exercise}{gradientDescent.m}{}
   Implementiere den Gradientenabstieg f\"ur das Problem der
   Parameteranpassung der linearen Geradengleichung an die Messdaten in
@@ -409,6 +410,7 @@ Kostenfunktionen gemacht \matlabfun{fminsearch()}, w\"ahrend
 spezielle Funktionen z.B. f\"ur die Minimierung des quadratischen
 Abstands bei einem Kurvenfit angeboten werden
 \matlabfun{lsqcurvefit()}.
+\newpage
 \begin{important}[Achtung Nebenminima!]
   Das Finden des globalen Minimums ist leider nur selten so leicht
   wie bei einem Geradenfit. Oft hat die Kostenfunktion viele Nebenminima,
diff --git a/scientificcomputing-script.tex b/scientificcomputing-script.tex
index e4ab182..22bbed8 100644
--- a/scientificcomputing-script.tex
+++ b/scientificcomputing-script.tex
@@ -63,7 +63,6 @@
 \lstset{inputpath=bootstrap/code}
 \include{bootstrap/lecture/bootstrap}
 
-\setboolean{showexercisesolutions}{false}
 \graphicspath{{regression/lecture/}{regression/lecture/figures/}}
 \lstset{inputpath=regression/code}
 \include{regression/lecture/regression}
diff --git a/statistics/lecture/statistics.tex b/statistics/lecture/statistics.tex
index 1cdfeff..c6c217f 100644
--- a/statistics/lecture/statistics.tex
+++ b/statistics/lecture/statistics.tex
@@ -348,14 +348,14 @@ probability density functions like the one of the normal distribution
 \subsection{Kernel densities}
 
 A problem of using histograms for estimating probability densities is
-that the have hard bin edges. Depending on where the bin edges are placed
+that they have hard bin edges. Depending on where the bin edges are placed
 a data value falls in one or the other bin.
 
 \begin{figure}[t]
   \includegraphics[width=1\textwidth]{kerneldensity}
   \titlecaption{\label{kerneldensityfig} Kernel densities.}{Left: The
     histogram-based estimation of the probability density is dependent
-    also on the position of the bins. In the bottom plot the bins have
+    on the position of the bins. In the bottom plot the bins have
     bin shifted by half a bin width (here $\Delta x=0.4$) and as a result
     details of the probability density look different. Look, for
     example at the height of the largest bin. Right: In contrast,
@@ -366,7 +366,7 @@ a data value falls in one or the other bin.
 To avoid this problem one can use so called \enterm {kernel densities}
 for estimating probability densities from data. Here every data
 point is replaced by a kernel (a function with integral one, like for
-example the Gaussian function) that is moved exactly to the position
+example the Gaussian) that is moved exactly to the position
 indicated by the data value. Then all the kernels of all the data
 values are summed up, the sum is divided by the number of data
 values, and we get an estimate of the probability density.
@@ -417,7 +417,7 @@ and percentiles can be determined from the inverse cumulative function.
     100 data values drawn from a normal distribution (red) in
     comparison to the true cumulative distribution function computed
     by numerically integrating the normal distribution function
-    (blue). From the cumulative distribution function one can read of
+    (blue). From the cumulative distribution function one can read off
     the probabilities of getting values smaller than a given value
     (here: $P(x \ge -1) \approx 0.15$). From the inverse cumulative
     distribution the position of percentiles can be computed (here: