diff --git a/bootstrap/exercises/bootstrapmean.m b/bootstrap/exercises/bootstrapmean.m index 6f3b494..686df9e 100644 --- a/bootstrap/exercises/bootstrapmean.m +++ b/bootstrap/exercises/bootstrapmean.m @@ -10,7 +10,7 @@ function [bootsem, mu] = bootstrapmean(x, resample) for i = 1:resample % resample: xr = x(randi(nsamples, nsamples, 1)); - % compute statistics on sample: + % compute statistics of resampled sample: mu(i) = mean(xr); end bootsem = std(mu); diff --git a/bootstrap/exercises/correlationbootstrap.m b/bootstrap/exercises/correlationbootstrap.m index 707285f..8f6f16f 100644 --- a/bootstrap/exercises/correlationbootstrap.m +++ b/bootstrap/exercises/correlationbootstrap.m @@ -1,18 +1,18 @@ %% (a) bootstrap: nperm = 1000; -rb = zeros(nperm,1); +rb = zeros(nperm, 1); for i=1:nperm % indices for resampling the data: inx = randi(length(x), length(x), 1); % resampled data pairs: - xb=x(inx); - yb=y(inx); + xb = x(inx); + yb = y(inx); rb(i) = corr(xb, yb); end %% (b) pdf of the correlation coefficients: -[hb,bb] = hist(rb, 20); -hb = hb/sum(hb)/(bb(2)-bb(1)); % normalization +[hb, bb] = hist(rb, 20); +hb = hb/sum(hb)/(bb(2)-bb(1)); % normalization %% (c) significance: rbq = quantile(rb, 0.05); @@ -25,8 +25,8 @@ end %% plot: hold on; -bar(b, h, 'facecolor', [0.5 0.5 0.5]); -bar(bb, hb, 'facecolor', 'b'); +bar(b, h, 'facecolor', [0.5 0.5 0.5]); % permuation test +bar(bb, hb, 'facecolor', 'b'); % bootstrap bar(bb(bb<=rbq), hb(bb<=rbq), 'facecolor', 'r'); plot([rd rd], [0 4], 'r', 'linewidth', 2); xlim([-0.25 0.75]) diff --git a/bootstrap/exercises/correlationsignificance.m b/bootstrap/exercises/correlationsignificance.m index d44af84..c7b1939 100644 --- a/bootstrap/exercises/correlationsignificance.m +++ b/bootstrap/exercises/correlationsignificance.m @@ -1,4 +1,4 @@ -%% (a) generate correlated data +%% (a) generate correlated data: n = 1000; a = 0.2; x = randn(n, 1); @@ -8,7 +8,7 @@ y = randn(n, 1) + a*x; subplot(1, 2, 1); plot(x, a*x, 'r', 'linewidth', 3); hold on -%scatter(x, y ); % either scatter ... +%scatter(x, y ); % either scatter ... plot(x, y, 'o', 'markersize', 2 ); % ... or plot - same plot. xlim([-4 4]) ylim([-4 4]) @@ -20,20 +20,20 @@ hold off %c = corrcoef(x, y); % returns correlation matrix %rd = c(1, 2); rd = corr(x, y); -fprintf('correlation coefficient = %.2f\n', rd ); +fprintf('correlation coefficient = %.2f\n', rd); %% (e) permutation: nperm = 1000; -rs = zeros(nperm,1); -for i=1:nperm - xr=x(randperm(length(x))); % shuffle x - yr=y(randperm(length(y))); % shuffle y +rs = zeros(nperm, 1); +for i = 1:nperm + xr = x(randperm(length(x))); % shuffle x + yr = y(randperm(length(y))); % shuffle y rs(i) = corr(xr, yr); end %% (g) pdf of the correlation coefficients: -[h,b] = hist(rs, 20); -h = h/sum(h)/(b(2)-b(1)); % normalization +[h, b] = hist(rs, 20); +h = h/sum(h)/(b(2)-b(1)); % normalization %% (h) significance: rq = quantile(rs, 0.95); @@ -49,7 +49,7 @@ subplot(1, 2, 2) hold on; bar(b, h, 'facecolor', 'b'); bar(b(b>=rq), h(b>=rq), 'facecolor', 'r'); -plot( [rd rd], [0 4], 'r', 'linewidth', 2); +plot([rd rd], [0 4], 'r', 'linewidth', 2); xlim([-0.25 0.25]) xlabel('Correlation coefficient'); ylabel('Probability density of H0'); diff --git a/bootstrap/exercises/exercises01.tex b/bootstrap/exercises/exercises01.tex index 0917579..dd2380d 100644 --- a/bootstrap/exercises/exercises01.tex +++ b/bootstrap/exercises/exercises01.tex @@ -15,7 +15,7 @@ \else \newcommand{\stitle}{} \fi -\header{{\bfseries\large Exercise 9\stitle}}{{\bfseries\large Bootstrap}}{{\bfseries\large December 9th, 2019}} +\header{{\bfseries\large Exercise 8\stitle}}{{\bfseries\large Resampling}}{{\bfseries\large December 14th, 2020}} \firstpagefooter{Prof. Dr. Jan Benda}{Phone: 29 74573}{Email: jan.benda@uni-tuebingen.de} \runningfooter{}{\thepage}{} @@ -86,6 +86,9 @@ jan.benda@uni-tuebingen.de} \begin{questions} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\question \qt{Read chapter 7 of the script on ``resampling methods''!}\vspace{-3ex} + \question \qt{Bootstrap the standard error of the mean} We want to compute the standard error of the mean of a data set by means of the bootstrap method and compare the result with the formula @@ -149,10 +152,10 @@ normally distributed? \continue -\question \qt{Permutation test} \label{permutationtest} +\question \qt{Permutation test of correlations} \label{correlationtest} We want to compute the significance of a correlation by means of a permutation test. \begin{parts} - \part \label{permutationtestdata} Generate 1000 correlated pairs + \part \label{correlationtestdata} Generate 1000 correlated pairs $x$, $y$ of random numbers according to: \begin{verbatim} n = 1000 @@ -188,9 +191,13 @@ correlation coefficient of zero we can conclude that the correlation coefficient of the data indeed quantifies correlated data. We take the same data set that we have generated in exercise -\ref{permutationtest} (\ref{permutationtestdata}). +\ref{correlationtest} (\ref{correlationtestdata}). \begin{parts} - \part Bootstrap 1000 times the correlation coefficient from the data. + \part Bootstrap 1000 times the correlation coefficient from the + data, i.e. generate bootstrap data by randomly resampling the + original data pairs with replacement. Use the \code{randi()} + function for generating random indices that you can use to select a + random sample from the original data. \part Compute and plot the probability density of these correlation coefficients. \part Is the correlation of the original data set significant? @@ -200,6 +207,43 @@ We take the same data set that we have generated in exercise \includegraphics[width=1\textwidth]{correlationbootstrap} \end{solution} + +\continuepage +\question \qt{Permutation test of difference of means} +We want to test whether two data sets come from distributions that +differ in their mean by means of a permutation test. +\begin{parts} + \part Generate two normally distributed data sets $x$ and $y$ + containing each $n=200$ samples. Let's assume the $x$ samples are + measurements of the membrane potential of a mammalian photoreceptor + in darkness with a mean of $-40$\,mV and a standard deviation of + 1\,mV. The $y$ values are the membrane potentials measured under dim + illumination and come from a distribution with the same standard + deviation and a mean of $-40.5$\,mV. See section 5.2 ``Scaling and + shifting random numbers'' in the script. + \part Plot histograms of the $x$ and $y$ data in a single + plot. Choose appropriate bins. + \part Compute the means of $x$ and $y$ and their difference. + \part The null hypothesis is that the $x$ and $y$ data come from the + same distribution. How can you generate new samples $x_r$ and $y_r$ + from the original data that come from the same distribution? + \part Do exactly this 1000 times and compute each time the + difference of the means of the two resampled samples. + \part Compute and plot the probability density of the resulting + distribution of the null hypothesis. + \part Is the difference of the means of the original data sets significant? + \part Repeat this procedure for $y$ samples that are closer or + further apart from the mean of the $x$ data set. For this put the + computations of the permuation test in a function and all the plotting + in another function. +\end{parts} +\begin{solution} + \lstinputlisting{meandiffpermutation.m} + %\lstinputlisting{meandiffplots.m} + \lstinputlisting{meandiffsignificance.m} + \includegraphics[width=1\textwidth]{meandiffsignificance} +\end{solution} + \end{questions} \end{document} \ No newline at end of file diff --git a/bootstrap/exercises/meandiffpermutation.m b/bootstrap/exercises/meandiffpermutation.m new file mode 100644 index 0000000..f8fb58a --- /dev/null +++ b/bootstrap/exercises/meandiffpermutation.m @@ -0,0 +1,29 @@ +function [md, ds, dq] = meandiffpermutation(x, y, nperm, alpha) +% Permutation test for difference of means of two independent samples. +% +% [md, ds] = meandiffpermutation(x, y, nperm, alpha); +% +% Arguments: +% x: vector with the samples of the x data set. +% y: vector with the samples of the y data set. +% nperm: number of permutations run. +% alpha: significance level. +% +% Returns: +% md: difference of the means +% ds: vector containing the differences of the means of the resampled data sets +% dq: difference of the means at a significance of alpha. + + md = mean(x) - mean(y); % measured difference + xy = [x; y]; % merge data sets + % permutations: + ds = zeros(nperm, 1); + for i = 1:nperm + xyr = xy(randperm(length(xy))); % shuffle xy + xr = xyr(1:length(x)); % random x sample + yr = xyr(length(x)+1:end); % random y sample + ds(i) = mean(xr) - mean(yr); + end + % significance: + dq = quantile(ds, 1.0 - alpha); +end diff --git a/bootstrap/exercises/meandiffsignificance.m b/bootstrap/exercises/meandiffsignificance.m new file mode 100644 index 0000000..cf6586d --- /dev/null +++ b/bootstrap/exercises/meandiffsignificance.m @@ -0,0 +1,48 @@ +%% (a) generate data: +n = 200; +mx = -40.0; +my = -40.5; +x = randn(n, 1) + mx; +y = randn(n, 1) + my; + +%% (b) plot histograms: +subplot(1, 2, 1); +bmin = min([x; y]); +bmax = max([x; y]); +bins = bmin:(bmax-bmin)/20.0:bmax; +hist(x, bins, 'facecolor', 'b'); +hold on +hist(y, bins, 'facecolor', 'r'); +xlabel('x and y') +ylabel('counts') +hold off + +% permutation test: +[md, ds, dq] = meandiffpermutation(x, y, nperm, alpha); + +%% (c) difference of means: +fprintf('difference of means = %.2fmV\n', md); + +%% (f) pdf of the differences: +[h, b] = hist(ds, 20); +h = h/sum(h)/(b(2)-b(1)); % normalization + +%% (g) significance: +fprintf('difference of means at 5%% significance = %.2fmV\n', dq); +if md >= dq + fprintf('--> difference of means %.2fmV is significant\n', md); +else + fprintf('--> %.2fmV is not a significant difference of means\n', md); +end + +%% plot: +subplot(1, 2, 2) +bar(b, h, 'facecolor', 'b'); +hold on; +bar(b(b>=dq), h(b>=dq), 'facecolor', 'r'); +plot([md md], [0 4], 'r', 'linewidth', 2); +xlabel('Difference of means'); +ylabel('Probability density of H0'); +hold off; + +savefigpdf(gcf, 'meandiffsignificance.pdf', 12, 6); diff --git a/bootstrap/exercises/tdistribution.m b/bootstrap/exercises/tdistribution.m index 5fe8341..e20aebc 100644 --- a/bootstrap/exercises/tdistribution.m +++ b/bootstrap/exercises/tdistribution.m @@ -1,10 +1,10 @@ %% (a) generate random numbers: n = 100000; -x=randn(n, 1); +x = randn(n, 1); -for nsamples=[3 5 10 50] +for nsamples = [3 5 10 50] nsamples - %% compute mean, standard deviation and t: + % compute mean, standard deviation and t: nmeans = 10000; means = zeros(nmeans, 1); sdevs = zeros(nmeans, 1); @@ -19,14 +19,14 @@ for nsamples=[3 5 10 50] % Gaussian pdfs: msdev = std(means); tsdev = 1.0; - dxg=0.01; + dxg = 0.01; xmax = 10.0; xmin = -xmax; xg = [xmin:dxg:xmax]; pm = exp(-0.5*(xg/msdev).^2)/sqrt(2.0*pi)/msdev; pt = exp(-0.5*(xg/tsdev).^2)/sqrt(2.0*pi)/tsdev; - %% plots + % plots: subplot(1, 2, 1) bins = xmin:0.2:xmax; [h,b] = hist(means, bins); @@ -35,7 +35,7 @@ for nsamples=[3 5 10 50] hold on plot(xg, pm, 'r', 'linewidth', 2) title(sprintf('sample size = %d', nsamples)); - xlim( [-3, 3] ); + xlim([-3, 3]); xlabel('Mean'); ylabel('pdf'); hold off; @@ -48,11 +48,11 @@ for nsamples=[3 5 10 50] hold on plot(xg, pt, 'r', 'linewidth', 2) title(sprintf('sample size = %d', nsamples)); - xlim( [-8, 8] ); + xlim([-8, 8]); xlabel('Student-t'); ylabel('pdf'); hold off; savefigpdf(gcf, sprintf('tdistribution-n%02d.pdf', nsamples), 14, 5); - pause( 3.0 ) + pause(3.0) end diff --git a/statistics/exercises/fitting.tex b/regression/exercises/fitting.tex similarity index 100% rename from statistics/exercises/fitting.tex rename to regression/exercises/fitting.tex