From be79d450dfa62b1721125f604d7e15dec93d50be Mon Sep 17 00:00:00 2001
From: Jan Benda
Date: Tue, 15 Dec 2020 23:18:12 +0100
Subject: [PATCH] [regression] started to simplify chapter to a 1d problem

---
 regression/code/meanSquaredErrorLine.m    |   1 -
 regression/code/meansquarederrorline.m    |  15 ++
 regression/lecture/cubicerrors.py         |  36 ++---
 regression/lecture/cubicfunc.py           |  26 +++-
 regression/lecture/cubicmse.py            |   7 +-
 regression/lecture/regression-chapter.tex | 111 +++++++++++++-
 regression/lecture/regression.tex         | 167 +++++++++++-----------
 7 files changed, 247 insertions(+), 116 deletions(-)
 delete mode 100644 regression/code/meanSquaredErrorLine.m
 create mode 100644 regression/code/meansquarederrorline.m

diff --git a/regression/code/meanSquaredErrorLine.m b/regression/code/meanSquaredErrorLine.m
deleted file mode 100644
index c395934..0000000
--- a/regression/code/meanSquaredErrorLine.m
+++ /dev/null
@@ -1 +0,0 @@
-mse = mean((y - y_est).^2);
diff --git a/regression/code/meansquarederrorline.m b/regression/code/meansquarederrorline.m
new file mode 100644
index 0000000..17f9d2f
--- /dev/null
+++ b/regression/code/meansquarederrorline.m
@@ -0,0 +1,15 @@
+n = 40;
+xmin = 2.2;
+xmax = 3.9;
+c = 6.0;
+noise = 50.0;
+
+% generate data:
+x = rand(n, 1) * (xmax-xmin) + xmin;
+yest = c * x.^3;
+y = yest + noise*randn(n, 1);
+
+% compute mean squared error:
+mse = mean((y - yest).^2);
+
+fprintf('the mean squared error is %g kg^2\n', mse)
diff --git a/regression/lecture/cubicerrors.py b/regression/lecture/cubicerrors.py
index e072d9e..86e608f 100644
--- a/regression/lecture/cubicerrors.py
+++ b/regression/lecture/cubicerrors.py
@@ -15,28 +15,13 @@ def create_data():
     return x, y, c
 
 
-def plot_data(ax, x, y, c):
-    ax.plot(x, y, zorder=10, **psAm)
-    xx = np.linspace(2.1, 3.9, 100)
-    ax.plot(xx, c*xx**3.0, zorder=5, **lsBm)
-    for cc in [0.25*c, 0.5*c, 2.0*c, 4.0*c]:
-        ax.plot(xx, cc*xx**3.0, zorder=5, **lsDm)
-    ax.set_xlabel('Size x', 'm')
-    ax.set_ylabel('Weight y', 'kg')
-    ax.set_xlim(2, 4)
-    ax.set_ylim(0, 400)
-    ax.set_xticks(np.arange(2.0, 4.1, 0.5))
-    ax.set_yticks(np.arange(0, 401, 100))
-
-
 def plot_data_errors(ax, x, y, c):
     ax.set_xlabel('Size x', 'm')
-    #ax.set_ylabel('Weight y', 'kg')
+    ax.set_ylabel('Weight y', 'kg')
     ax.set_xlim(2, 4)
     ax.set_ylim(0, 400)
     ax.set_xticks(np.arange(2.0, 4.1, 0.5))
     ax.set_yticks(np.arange(0, 401, 100))
-    ax.set_yticklabels([])
     ax.annotate('Error', xy=(x[28]+0.05, y[28]+60), xycoords='data',
                 xytext=(3.4, 70), textcoords='data', ha='left',
@@ -52,31 +37,30 @@ def plot_data_errors(ax, x, y, c):
         yy = [c*x[i]**3.0, y[i]]
         ax.plot(xx, yy, zorder=5, **lsDm)
 
+
 def plot_error_hist(ax, x, y, c):
     ax.set_xlabel('Squared error')
     ax.set_ylabel('Frequency')
-    bins = np.arange(0.0, 1250.0, 100)
+    bins = np.arange(0.0, 11000.0, 750)
     ax.set_xlim(bins[0], bins[-1])
-    #ax.set_ylim(0, 35)
-    ax.set_xticks(np.arange(bins[0], bins[-1], 200))
-    #ax.set_yticks(np.arange(0, 36, 10))
+    ax.set_ylim(0, 15)
+    ax.set_xticks(np.arange(bins[0], bins[-1], 5000))
+    ax.set_yticks(np.arange(0, 16, 5))
     errors = (y-(c*x**3.0))**2.0
     mls = np.mean(errors)
     ax.annotate('Mean\nsquared\nerror',
                 xy=(mls, 0.5), xycoords='data',
-                xytext=(800, 3), textcoords='data', ha='left',
+                xytext=(4500, 6), textcoords='data', ha='left',
                 arrowprops=dict(arrowstyle="->", relpos=(0.0,0.2),
                 connectionstyle="angle3,angleA=10,angleB=90") )
     ax.hist(errors, bins, **fsC)
 
-
 if __name__ == "__main__":
     x, y, c = create_data()
     fig, (ax1, ax2) = plt.subplots(1, 2)
-    fig.subplots_adjust(wspace=0.2, **adjust_fs(left=6.0, right=1.2))
-
plot_data(ax1, x, y, c) - plot_data_errors(ax2, x, y, c) - #plot_error_hist(ax2, x, y, c) + fig.subplots_adjust(wspace=0.4, **adjust_fs(left=6.0, right=1.2)) + plot_data_errors(ax1, x, y, c) + plot_error_hist(ax2, x, y, c) fig.savefig("cubicerrors.pdf") plt.close() diff --git a/regression/lecture/cubicfunc.py b/regression/lecture/cubicfunc.py index e8c0565..45dd67b 100644 --- a/regression/lecture/cubicfunc.py +++ b/regression/lecture/cubicfunc.py @@ -2,7 +2,7 @@ import matplotlib.pyplot as plt import numpy as np from plotstyle import * -if __name__ == "__main__": +def create_data(): # wikipedia: # Generally, males vary in total length from 250 to 390 cm and # weigh between 90 and 306 kg @@ -12,10 +12,20 @@ if __name__ == "__main__": rng = np.random.RandomState(32281) noise = rng.randn(len(x))*50 y += noise + return x, y, c + - fig, ax = plt.subplots(figsize=cm_size(figure_width, 1.4*figure_height)) - fig.subplots_adjust(**adjust_fs(left=6.0, right=1.2)) - +def plot_data(ax, x, y): + ax.plot(x, y, **psA) + ax.set_xlabel('Size x', 'm') + ax.set_ylabel('Weight y', 'kg') + ax.set_xlim(2, 4) + ax.set_ylim(0, 400) + ax.set_xticks(np.arange(2.0, 4.1, 0.5)) + ax.set_yticks(np.arange(0, 401, 100)) + + +def plot_data_fac(ax, x, y, c): ax.plot(x, y, zorder=10, **psA) xx = np.linspace(2.1, 3.9, 100) ax.plot(xx, c*xx**3.0, zorder=5, **lsB) @@ -27,6 +37,14 @@ if __name__ == "__main__": ax.set_ylim(0, 400) ax.set_xticks(np.arange(2.0, 4.1, 0.5)) ax.set_yticks(np.arange(0, 401, 100)) + +if __name__ == "__main__": + x, y, c = create_data() + print(len(x)) + fig, (ax1, ax2) = plt.subplots(1, 2) + fig.subplots_adjust(wspace=0.5, **adjust_fs(fig, left=6.0, right=1.5)) + plot_data(ax1, x, y) + plot_data_fac(ax2, x, y, c) fig.savefig("cubicfunc.pdf") plt.close() diff --git a/regression/lecture/cubicmse.py b/regression/lecture/cubicmse.py index f687885..55ac64b 100644 --- a/regression/lecture/cubicmse.py +++ b/regression/lecture/cubicmse.py @@ -14,6 +14,7 @@ def create_data(): y += noise return x, y, c + def gradient_descent(x, y): n = 20 dc = 0.01 @@ -29,6 +30,7 @@ def gradient_descent(x, y): mses.append(m0) cc -= eps*dmdc return cs, mses + def plot_mse(ax, x, y, c, cs): ms = np.zeros(len(cs)) @@ -54,7 +56,8 @@ def plot_mse(ax, x, y, c, cs): ax.set_ylim(0, 25000) ax.set_xticks(np.arange(0.0, 10.1, 2.0)) ax.set_yticks(np.arange(0, 30001, 10000)) - + + def plot_descent(ax, cs, mses): ax.plot(np.arange(len(mses))+1, mses, **lpsBm) ax.set_xlabel('Iteration') @@ -69,7 +72,7 @@ def plot_descent(ax, cs, mses): if __name__ == "__main__": x, y, c = create_data() cs, mses = gradient_descent(x, y) - fig, (ax1, ax2) = plt.subplots(1, 2) + fig, (ax1, ax2) = plt.subplots(1, 2, figsize=cm_size(figure_width, 1.1*figure_height)) fig.subplots_adjust(wspace=0.2, **adjust_fs(left=8.0, right=0.5)) plot_mse(ax1, x, y, c, cs) plot_descent(ax2, cs, mses) diff --git a/regression/lecture/regression-chapter.tex b/regression/lecture/regression-chapter.tex index 792dfcd..2169854 100644 --- a/regression/lecture/regression-chapter.tex +++ b/regression/lecture/regression-chapter.tex @@ -40,6 +40,113 @@ \item Homework is to do the 2d problem with the straight line! \end{itemize} +\subsection{2D fit} + +\begin{exercise}{meanSquaredError.m}{} + Implement the objective function \eqref{mseline} as a function + \varcode{meanSquaredError()}. The function takes three + arguments. The first is a vector of $x$-values and the second + contains the measurements $y$ for each value of $x$. 
The third
+  argument is a 2-element vector that contains the values of
+  parameters \varcode{m} and \varcode{b}. The function returns the
+  mean squared error.
+\end{exercise}
+
+
+\begin{exercise}{errorSurface.m}{}\label{errorsurfaceexercise}
+  Generate 20 data pairs $(x_i|y_i)$ that are linearly related with
+  slope $m=0.75$ and intercept $b=-40$, using \varcode{rand()} for
+  drawing $x$ values between 0 and 120 and \varcode{randn()} for
+  jittering the $y$ values with a standard deviation of 15. Then
+  calculate the mean squared error between the data and straight lines
+  for a range of slopes and intercepts using the
+  \varcode{meanSquaredError()} function from the previous exercise.
+  Illustrate the error surface using the \code{surface()} function.
+  Consult the documentation to find out how to use \code{surface()}.
+\end{exercise}
+
+\begin{exercise}{meanSquaredGradient.m}{}\label{gradientexercise}%
+  Implement a function \varcode{meanSquaredGradient()} that takes the
+  $x$- and $y$-data and the set of parameters $(m, b)$ of a straight
+  line as a two-element vector as input arguments. The function should
+  return the gradient at the position $(m, b)$ as a vector with two
+  elements.
+\end{exercise}
+
+\begin{exercise}{errorGradient.m}{}
+  Extend the script of exercise~\ref{errorsurfaceexercise} to plot
+  both the error surface and gradients using the
+  \varcode{meanSquaredGradient()} function from
+  exercise~\ref{gradientexercise}. Vectors in space can be easily
+  plotted using the function \code{quiver()}. Use \code{contour()}
+  instead of \code{surface()} to plot the error surface.
+\end{exercise}
+
+
+\begin{exercise}{gradientDescent.m}{}
+  Implement the gradient descent for the problem of fitting a straight
+  line to some measured data. Reuse the data generated in
+  exercise~\ref{errorsurfaceexercise}.
+  \begin{enumerate}
+  \item Store for each iteration the error value.
+  \item Plot the error values as a function of the iterations, the
+    number of optimization steps.
+  \item Plot the measured data together with the best fitting straight line.
+  \end{enumerate}\vspace{-4.5ex}
+\end{exercise}
+
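+A possible skeleton for such a gradient descent loop could look like
+this (a minimal sketch, not the reference solution; it assumes the
+data \varcode{x} and \varcode{y} from
+exercise~\ref{errorsurfaceexercise}, the functions
+\varcode{meanSquaredError()} and \varcode{meanSquaredGradient()} from
+the exercises above, and ad-hoc choices for step size and termination
+threshold):
+\begin{verbatim}
+position = [2.0, 10.0];  % arbitrary start values for slope m and intercept b
+gradient = [1.0, 1.0];   % dummy gradient, large enough to enter the loop
+eps = 0.0001;            % step size, an ad-hoc choice
+errors = [];             % error value of each iteration
+while norm(gradient) > 0.1
+    errors(end+1) = meanSquaredError(x, y, position);
+    gradient = meanSquaredGradient(x, y, position);
+    position = position - eps*gradient;
+end
+plot(errors);            % error as a function of the iteration number
+\end{verbatim}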
+
+
+\begin{figure}[t]
+  \includegraphics[width=1\textwidth]{lin_regress}\hfill
+  \titlecaption{Example data suggesting a linear relation.}{A set of
+    input signals $x$, e.g. stimulus intensities, were used to probe a
+    system. The system's output $y$ to the inputs is noted
+    (left). Assuming a linear relation between $x$ and $y$ leaves us
+    with 2 parameters, the slope (center) and the intercept with the
+    y-axis (right panel).}\label{linregressiondatafig}
+\end{figure}
+
+\begin{figure}[t]
+  \includegraphics[width=1\textwidth]{linear_least_squares}
+  \titlecaption{Estimating the \emph{mean squared error}.} {The
+    deviation error (orange) between the prediction (red line) and the
+    observations (blue dots) is calculated for each data point
+    (left). Then the deviations are squared and the average is
+    calculated (right).}
+  \label{leastsquareerrorfig}
+\end{figure}
+
+\begin{figure}[t]
+  \includegraphics[width=0.75\textwidth]{error_surface}
+  \titlecaption{Error surface.}{The two model parameters $m$ and $b$
+    define the base area of the surface plot. For each parameter
+    combination of slope and intercept the error is calculated. The
+    resulting surface has a minimum which indicates the parameter
+    combination that best fits the data.}\label{errorsurfacefig}
+\end{figure}
+
+
+\begin{figure}[t]
+  \includegraphics[width=0.75\textwidth]{error_gradient}
+  \titlecaption{Gradient of the error surface.} {Each arrow points
+    in the direction of the steepest ascent at different positions
+    of the error surface shown in \figref{errorsurfacefig}. The
+    contour lines in the background illustrate the error surface. Warm
+    colors indicate high errors, colder colors low error values. Each
+    contour line connects points of equal
+    error.}\label{gradientquiverfig}
+\end{figure}
+
+\begin{figure}[t]
+  \includegraphics[width=0.45\textwidth]{gradient_descent}
+  \titlecaption{Gradient descent.}{The algorithm starts at an
+    arbitrary position. At each point the gradient is estimated and
+    the position is updated as long as the length of the gradient is
+    sufficiently large. The dots show the positions after each
+    iteration of the algorithm.} \label{gradientdescentfig}
+\end{figure}
+
+
 \subsection{Linear fits}
 \begin{itemize}
 \item Polyfit is easy: unique solution! $c x^2$ is also a linear fit.
@@ -54,8 +161,8 @@ Fit with matlab functions lsqcurvefit, polyfit
 \subsection{Non-linear fits}
 \begin{itemize}
 \item Example that illustrates the Nebenminima Problem (with error surface)
-\item You need got initial values for the parameter!
+\item You need initial values for the parameters!
-\item Example that fitting gets harder the more parameter yuo have.
+\item Example that fitting gets harder the more parameters you have.
 \item Try to fix as many parameter before doing the fit.
 \item How to test the quality of a fit? Residuals. $\chi^2$ test. Run-test.
 \end{itemize}
diff --git a/regression/lecture/regression.tex b/regression/lecture/regression.tex
index e63e3f9..779309b 100644
--- a/regression/lecture/regression.tex
+++ b/regression/lecture/regression.tex
@@ -2,99 +2,104 @@ \exercisechapter{Optimization and gradient descent}
 
 Optimization problems arise in many different contexts. For example,
-to understand the behavior of a given system, the system is probed
-with a range of input signals and then the resulting responses are
-measured. This input-output relation can be described by a model. Such
-a model can be a simple function that maps the input signals to
-corresponding responses, it can be a filter, or a system of
-differential equations. In any case, the model has some parameters that
-specify how input and output signals are related. Which combination
-of parameter values are best suited to describe the input-output
-relation? The process of finding the best parameter values is an
-optimization problem. For a simple parameterized function that maps
-input to output values, this is the special case of a \enterm{curve
- fitting} problem, where the average distance between the curve and
-the response values is minimized. One basic numerical method used for
-such optimization problems is the so called gradient descent, which is
-introduced in this chapter.
-
-%%% Weiteres einfaches verbales Beispiel? Eventuell aus der Populationsoekologie?
+to understand the behavior of a given neuronal system, the system is
+probed with a range of input signals and then the resulting responses
+are measured. This input-output relation can be described by a
+model. Such a model can be a simple function that maps the input
+signals to corresponding responses, it can be a filter, or a system of
+differential equations. In any case, the model has some parameters
+that specify how input and output signals are related. Which
+combination of parameter values is best suited to describe the
+input-output relation? The process of finding the best parameter
+values is an optimization problem. For a simple parameterized function
+that maps input to output values, this is the special case of a
+\enterm{curve fitting} problem, where the average distance between the
+curve and the response values is minimized. One basic numerical method
+used for such optimization problems is the so-called gradient descent,
+which is introduced in this chapter.
 
 \begin{figure}[t]
-  \includegraphics[width=1\textwidth]{lin_regress}\hfill
-  \titlecaption{Example data suggesting a linear relation.}{A set of
-    input signals $x$, e.g. stimulus intensities, were used to probe a
-    system. The system's output $y$ to the inputs are noted
-    (left). Assuming a linear relation between $x$ and $y$ leaves us
-    with 2 parameters, the slope (center) and the intercept with the
-    y-axis (right panel).}\label{linregressiondatafig}
+  \includegraphics{cubicfunc}
+  \titlecaption{Example data suggesting a cubic relation.}{The length
+    $x$ and weight $y$ of $n=34$ male tigers (blue, left). Assuming a
+    cubic relation between size and weight leaves us with a single
+    free parameter, a scaling factor. The cubic relation is shown for
+    a few values of this scaling factor (orange and red,
+    right).}\label{cubicdatafig}
 \end{figure}
 
-The data plotted in \figref{linregressiondatafig} suggest a linear
-relation between input and output of the system. We thus assume that a
-straight line
+To demonstrate the curve-fitting problem, let's take the simple
+example of weights and sizes measured for a number of male tigers
+(\figref{cubicdatafig}). Weight $y$ is proportional to volume
+$V$ via the density $\rho$. The volume $V$ of any object is
+proportional to its length $x$ cubed. The factor $\alpha$ relating
+volume and size cubed depends on the shape of the object and we do not
+know this factor for tigers. For the data set we thus expect a cubic
+relation between weight and length
 \begin{equation}
-  \label{straightline}
-  y = f(x; m, b) = m\cdot x + b
+  \label{cubicfunc}
+  y = f(x; c) = c\cdot x^3
 \end{equation}
-is an appropriate model to describe the system. The line has two free
-parameter, the slope $m$ and the $y$-intercept $b$. We need to find
-values for the slope and the intercept that best describe the measured
-data. In this chapter we use this example to illustrate the gradient
-descent and how this methods can be used to find a combination of
-slope and intercept that best describes the system.
+where $c=\rho\alpha$, the product of a tiger's density and form
+factor, is the only free parameter in the relation. We would like to
+find out which value of $c$ best describes the measured data. In the
+following we use this example to illustrate the gradient descent as a
+basic method for finding such an optimal parameter.
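+
+To get a first impression of \eqnref{cubicfunc} one can simply overlay
+the cubic relation for a few candidate values of $c$ on the measured
+data (a quick sketch, assuming the measured sizes and weights in
+vectors \varcode{x} and \varcode{y}):
+\begin{verbatim}
+plot(x, y, 'o');               % measured sizes and weights
+hold on;
+xx = 2.2:0.05:3.9;             % sizes at which to draw the curves
+for c = [2.0, 4.0, 6.0, 8.0]   % candidate scaling factors
+    plot(xx, c*xx.^3);
+end
+hold off;
+xlabel('Size x [m]');
+ylabel('Weight y [kg]');
+\end{verbatim}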
 
 \section{The error function --- mean squared error}
-Before the optimization can be done we need to specify what exactly is
-considered an optimal fit. In our example we search the parameter
-combination that describe the relation of $x$ and $y$ best. What is
-meant by this? Each input $x_i$ leads to an measured output $y_i$ and
-for each $x_i$ there is a \emph{prediction} or \emph{estimation}
-$y^{est}(x_i)$ of the output value by the model. At each $x_i$
-estimation and measurement have a distance or error $y_i -
-y^{est}(x_i)$. In our example the estimation is given by the equation
-$y^{est}(x_i) = f(x_i;m,b)$. The best fitting model with parameters
-$m$ and $b$ is the one that minimizes the distances between
-observation $y_i$ and estimation $y^{est}(x_i)$
-(\figref{leastsquareerrorfig}).
-
-As a first guess we could simply minimize the sum $\sum_{i=1}^N y_i -
-y^{est}(x_i)$. This approach, however, will not work since a minimal sum
-can also be achieved if half of the measurements is above and the
-other half below the predicted line. Positive and negative errors
-would cancel out and then sum up to values close to zero. A better
-approach is to sum over the absolute values of the distances:
-$\sum_{i=1}^N |y_i - y^{est}(x_i)|$. This sum can only be small if all
-deviations are indeed small no matter if they are above or below the
-predicted line. Instead of the sum we could also take the average
-\begin{equation}
-  \label{meanabserror}
-  f_{dist}(\{(x_i, y_i)\}|\{y^{est}(x_i)\}) = \frac{1}{N} \sum_{i=1}^N |y_i - y^{est}(x_i)|
-\end{equation}
-Instead of the averaged absolute errors, the \enterm[mean squared
-error]{mean squared error} (\determ[quadratischer
-Fehler!mittlerer]{mittlerer quadratischer Fehler})
+Before we go ahead and find the optimal parameter value, we need to
+specify what exactly we consider an optimal fit. In our example we
+search for the parameter that describes the relation between $x$ and
+$y$ best. What is meant by this? The length $x_i$ of each tiger is
+associated with a weight $y_i$ and for each $x_i$ we have a
+\emph{prediction} or \emph{estimation} $y^{est}(x_i)$ of the weight by
+the model \eqnref{cubicfunc} for a specific value of the parameter
+$c$. Prediction and actual data value ideally match (in a perfect
+noise-free world), but in general the estimate and measurement are
+separated by some distance or error $y_i - y^{est}(x_i)$. In our
+example the estimate of the weight for the length $x_i$ is given by
+equation \eqref{cubicfunc}, $y^{est}(x_i) = f(x_i;c)$. The best fitting
+model with parameter $c$ is the one that somehow minimizes the
+distances between observations $y_i$ and corresponding estimations
+$y^{est}(x_i)$ (\figref{cubicerrorsfig}).
+
+As a first guess we could simply minimize the sum of the distances,
+$\sum_{i=1}^N (y_i - y^{est}(x_i))$. This, however, does not work
+because positive and negative errors would cancel out, no matter how
+large they are, and sum up to values close to zero. It is better to sum
+over the absolute distances: $\sum_{i=1}^N |y_i - y^{est}(x_i)|$. This
+sum can only be small if all deviations are indeed small no matter if
+they are above or below the prediction. The sum of the squared distances,
+$\sum_{i=1}^N (y_i - y^{est}(x_i))^2$, turns out to be an even better
+choice. Instead of the sum we could also minimize the average squared distance
 \begin{equation}
   \label{meansquarederror}
   f_{mse}(\{(x_i, y_i)\}|\{y^{est}(x_i)\}) = \frac{1}{N} \sum_{i=1}^N (y_i - y^{est}(x_i))^2
 \end{equation}
-is commonly used (\figref{leastsquareerrorfig}). Similar to the
-absolute distance, the square of the errors, $(y_i - y^{est}(x_i))^2$, is
-always positive and thus positive and negative error values do not
+This is known as the \enterm[mean squared error]{mean squared error}
+(\determ[quadratischer Fehler!mittlerer]{mittlerer quadratischer
+  Fehler}). Similar to the absolute distance, the square of the errors
+is always positive and thus positive and negative error values do not
 cancel each other out. In addition, the square punishes large
 deviations over small deviations.
 In chapter~\ref{maximumlikelihoodchapter} we show that minimizing the
 mean squared error is equivalent to maximizing the likelihood that the
 observations originate from the model, if the data are normally
 distributed around the model prediction.
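+
+A tiny numerical example illustrates how differently these three
+measures behave (hypothetical deviations, not the tiger data):
+\begin{verbatim}
+errors = [10.0, -10.0, 20.0, -20.0];  % hypothetical deviations
+mean(errors)       % 0   -- positive and negative errors cancel out
+mean(abs(errors))  % 15  -- absolute errors do not cancel
+mean(errors.^2)    % 250 -- squares punish large deviations even more
+\end{verbatim}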
In chapter~\ref{maximumlikelihoodchapter} we show that minimizing the -mean square error is equivalent to maximizing the likelihood that the +mean squared error is equivalent to maximizing the likelihood that the observations originate from the model, if the data are normally distributed around the model prediction. -\begin{exercise}{meanSquaredErrorLine.m}{}\label{mseexercise}% - Given a vector of observations \varcode{y} and a vector with the - corresponding predictions \varcode{y\_est}, compute the \emph{mean - square error} between \varcode{y} and \varcode{y\_est} in a single - line of code. +\begin{exercise}{meansquarederrorline.m}{}\label{mseexercise} + Simulate $n=40$ tigers ranging from 2.2 to 3.9\,m in size and store + these sizes in a vector \varcode{x}. Compute the corresponding + predicted weights \varcode{yest} for each tiger according to + \eqnref{cubicfunc} with $c=6$\,\kilo\gram\per\meter\cubed. From the + predictions generate simulated measurements of the tiger's weights + \varcode{y}, by adding normally distributed random numbers to the + predictions scaled to a standard deviation of 50\,\kilo\gram. + + Compute the \emph{mean squared error} between \varcode{y} and + \varcode{yest} in a single line of code. \end{exercise} @@ -110,13 +115,13 @@ can be any function that describes the quality of the fit by mapping the data and the predictions to a single scalar value. \begin{figure}[t] - \includegraphics[width=1\textwidth]{linear_least_squares} - \titlecaption{Estimating the \emph{mean square error}.} {The - deviation error, orange) between the prediction (red - line) and the observations (blue dots) is calculated for each data - point (left). Then the deviations are squared and the aveage is + \includegraphics{cubicerrors} + \titlecaption{Estimating the \emph{mean squared error}.} {The + deviation error (orange) between the prediction (red line) and the + observations (blue dots) is calculated for each data point + (left). Then the deviations are squared and the average is calculated (right).} - \label{leastsquareerrorfig} + \label{cubicerrorsfig} \end{figure} Replacing $y^{est}$ in the mean squared error \eqref{meansquarederror} @@ -139,7 +144,7 @@ Fehler!kleinster]{Methode der kleinsten Quadrate}). contains the measurements $y$ for each value of $x$. The third argument is a 2-element vector that contains the values of parameters \varcode{m} and \varcode{b}. The function returns the - mean square error. + mean squared error. \end{exercise} @@ -359,7 +364,7 @@ distance between the red dots in \figref{gradientdescentfig}) is large. \begin{figure}[t] - \includegraphics[width=0.45\textwidth]{gradient_descent} + \includegraphics{cubicmse} \titlecaption{Gradient descent.}{The algorithm starts at an arbitrary position. At each point the gradient is estimated and the position is updated as long as the length of the gradient is