From 006fa998ccacf058e33d492d920b8a613f1471cc Mon Sep 17 00:00:00 2001 From: Jan Benda Date: Tue, 3 Dec 2019 08:57:40 +0100 Subject: [PATCH] [bootstrap] updated text and exercises --- bootstrap/exercises/bootstrapmean.m | 6 +-- bootstrap/exercises/bootstraptymus.m | 28 ++++++------ bootstrap/exercises/correlationbootstrap.m | 8 ++-- bootstrap/exercises/correlationsignificance.m | 10 ++-- bootstrap/exercises/exercises01.tex | 43 +++++++++--------- bootstrap/exercises/tdistribution.m | 12 ++--- bootstrap/lecture/bootstrap.tex | 22 +++++---- .../lecture/pointprocessscetchA.eps | 6 +-- .../lecture/pointprocessscetchA.pdf | Bin 2790 -> 2786 bytes .../lecture/pointprocessscetchB.eps | 6 +-- .../lecture/pointprocessscetchB.pdf | Bin 4698 -> 4694 bytes 11 files changed, 72 insertions(+), 69 deletions(-) diff --git a/bootstrap/exercises/bootstrapmean.m b/bootstrap/exercises/bootstrapmean.m index 356c531..6f3b494 100644 --- a/bootstrap/exercises/bootstrapmean.m +++ b/bootstrap/exercises/bootstrapmean.m @@ -1,11 +1,11 @@ -function [bootsem, mu] = bootstrapmean( x, resample ) +function [bootsem, mu] = bootstrapmean(x, resample) % computes standard error by bootstrapping the data % x: vector with data % resample: number of resamplings % returns: % bootsem: the standard error of the mean % mu: the bootstrapped means as a vector - mu = zeros( resample, 1 ); + mu = zeros(resample, 1); nsamples = length(x); for i = 1:resample % resample: @@ -13,5 +13,5 @@ function [bootsem, mu] = bootstrapmean( x, resample ) % compute statistics on sample: mu(i) = mean(xr); end - bootsem = std( mu ); + bootsem = std(mu); end diff --git a/bootstrap/exercises/bootstraptymus.m b/bootstrap/exercises/bootstraptymus.m index 0b4aaa7..2a4c43d 100644 --- a/bootstrap/exercises/bootstraptymus.m +++ b/bootstrap/exercises/bootstraptymus.m @@ -1,36 +1,36 @@ %% (b) load the data: -load( 'thymusglandweights.dat' ); +load('thymusglandweights.dat'); nsamples = 80; x = thymusglandweights(1:nsamples); %% (c) mean, sem 
and hist: sem = std(x)/sqrt(nsamples); -fprintf( 'Mean of the data set = %.2fmg\n', mean(x) ); -fprintf( 'SEM of the data set = %.2fmg\n', sem ); +fprintf('Mean of the data set = %.2fmg\n', mean(x)); +fprintf('SEM of the data set = %.2fmg\n', sem); hist(x,20) xlabel('x') ylabel('count') -savefigpdf( gcf, 'bootstraptymus-datahist.pdf', 6, 5 ); -pause( 2.0 ) +savefigpdf(gcf, 'bootstraptymus-datahist.pdf', 6, 5); +pause(2.0) %% (d) bootstrap the mean: resample = 500; -[bootsem, mu] = bootstrapmean( x, resample ); -hist( mu, 20 ); +[bootsem, mu] = bootstrapmean(x, resample); +hist(mu, 20); xlabel('mean(x)') ylabel('count') -savefigpdf( gcf, 'bootstraptymus-meanhist.pdf', 6, 5 ); -fprintf( ' bootstrap standard error: %.3f\n', bootsem ); -fprintf( 'theoretical standard error: %.3f\n', sem ); +savefigpdf(gcf, 'bootstraptymus-meanhist.pdf', 6, 5); +fprintf(' bootstrap standard error: %.3f\n', bootsem); +fprintf('theoretical standard error: %.3f\n', sem); %% (e) confidence interval: q = quantile(mu, [0.025, 0.975]); -fprintf( '95%% confidence interval of the mean from %.2fmg to %.2fmg\n', q(1), q(2) ); -pause( 2.0 ) +fprintf('95%% confidence interval of the mean from %.2fmg to %.2fmg\n', q(1), q(2)); +pause(2.0) %% (f): dependence on sample size: nsamplesrange = 10:10:1000; -bootsems = zeros( length(nsamplesrange),1); +bootsems = zeros(length(nsamplesrange), 1); for n=1:length(nsamplesrange) nsamples = nsamplesrange(n); % [bootsems(n), mu] = bootstrapmean(x, resample); @@ -43,5 +43,5 @@ hold off xlabel('sample size') ylabel('SEM') legend('bootsrap', 'theory') -savefigpdf( gcf, 'bootstraptymus-samples.pdf', 6, 5 ); +savefigpdf(gcf, 'bootstraptymus-samples.pdf', 6, 5); diff --git a/bootstrap/exercises/correlationbootstrap.m b/bootstrap/exercises/correlationbootstrap.m index 5abb951..707285f 100644 --- a/bootstrap/exercises/correlationbootstrap.m +++ b/bootstrap/exercises/correlationbootstrap.m @@ -11,12 +11,12 @@ for i=1:nperm end %% (b) pdf of the correlation coefficients: 
-[hb,bb] = hist(rb, 20 ); +[hb,bb] = hist(rb, 20); hb = hb/sum(hb)/(bb(2)-bb(1)); % normalization %% (c) significance: rbq = quantile(rb, 0.05); -fprintf('correlation coefficient at 5%% significance = %.2f\n', rbq ); +fprintf('correlation coefficient at 5%% significance = %.2f\n', rbq); if rbq > 0.0 fprintf('--> correlation r=%.2f is significant\n', rd); else @@ -28,10 +28,10 @@ hold on; bar(b, h, 'facecolor', [0.5 0.5 0.5]); bar(bb, hb, 'facecolor', 'b'); bar(bb(bb<=rbq), hb(bb<=rbq), 'facecolor', 'r'); -plot( [rd rd], [0 4], 'r', 'linewidth', 2 ); +plot([rd rd], [0 4], 'r', 'linewidth', 2); xlim([-0.25 0.75]) xlabel('Correlation coefficient'); ylabel('Probability density'); hold off; -savefigpdf( gcf, 'correlationbootstrap.pdf', 12, 6 ); +savefigpdf(gcf, 'correlationbootstrap.pdf', 12, 6); diff --git a/bootstrap/exercises/correlationsignificance.m b/bootstrap/exercises/correlationsignificance.m index 7c7e8a2..d44af84 100644 --- a/bootstrap/exercises/correlationsignificance.m +++ b/bootstrap/exercises/correlationsignificance.m @@ -6,7 +6,7 @@ y = randn(n, 1) + a*x; %% (b) scatter plot: subplot(1, 2, 1); -plot(x, a*x, 'r', 'linewidth', 3 ); +plot(x, a*x, 'r', 'linewidth', 3); hold on %scatter(x, y ); % either scatter ... plot(x, y, 'o', 'markersize', 2 ); % ... or plot - same plot. 
@@ -32,12 +32,12 @@ for i=1:nperm end %% (g) pdf of the correlation coefficients: -[h,b] = hist(rs, 20 ); +[h,b] = hist(rs, 20); h = h/sum(h)/(b(2)-b(1)); % normalization %% (h) significance: rq = quantile(rs, 0.95); -fprintf('correlation coefficient at 5%% significance = %.2f\n', rq ); +fprintf('correlation coefficient at 5%% significance = %.2f\n', rq); if rd >= rq fprintf('--> correlation r=%.2f is significant\n', rd); else @@ -49,10 +49,10 @@ subplot(1, 2, 2) hold on; bar(b, h, 'facecolor', 'b'); bar(b(b>=rq), h(b>=rq), 'facecolor', 'r'); -plot( [rd rd], [0 4], 'r', 'linewidth', 2 ); +plot( [rd rd], [0 4], 'r', 'linewidth', 2); xlim([-0.25 0.25]) xlabel('Correlation coefficient'); ylabel('Probability density of H0'); hold off; -savefigpdf( gcf, 'correlationsignificance.pdf', 12, 6 ); +savefigpdf(gcf, 'correlationsignificance.pdf', 12, 6); diff --git a/bootstrap/exercises/exercises01.tex b/bootstrap/exercises/exercises01.tex index 3029cc1..c9e1c8a 100644 --- a/bootstrap/exercises/exercises01.tex +++ b/bootstrap/exercises/exercises01.tex @@ -15,7 +15,7 @@ \else \newcommand{\stitle}{} \fi -\header{{\bfseries\large Exercise 9\stitle}}{{\bfseries\large Bootstrap}}{{\bfseries\large November 20th, 2018}} +\header{{\bfseries\large Exercise 9\stitle}}{{\bfseries\large Bootstrap}}{{\bfseries\large December 9th, 2019}} \firstpagefooter{Prof. Dr. Jan Benda}{Phone: 29 74573}{Email: jan.benda@uni-tuebingen.de} \runningfooter{}{\thepage}{} @@ -86,7 +86,7 @@ jan.benda@uni-tuebingen.de} \begin{questions} -\question \qt{Bootstrap of the standard error of the mean} +\question \qt{Bootstrap the standard error of the mean} We want to compute the standard error of the mean of a data set by means of the bootstrap method and compare the result with the formula ``standard deviation divided by the square-root of $n$''. 
@@ -118,24 +118,25 @@ means of the bootstrap method and compare the result with the formula \end{solution} -\question \qt{Student t-distribution} -The distribution of Student's t, $t=\bar x/(\sigma_x/\sqrt{m})$, the -estimated mean of a data set divided by the estimated standard error -of the mean, is not a normal distribution but a Student-t distribution. -We want to compute the Student-t distribution and compare it with the -normal distribution. +\question \qt{Student t-distribution} +The distribution of Student's t, $t=\bar x/(\sigma_x/\sqrt{n})$, the +estimated mean $\bar x$ of a data set of size $n$ divided by the +estimated standard error of the mean $\sigma_x/\sqrt{n}$, where +$\sigma_x$ is the estimated standard deviation, is not a normal +distribution but a Student-t distribution. We want to compute the +Student-t distribution and compare it with the normal distribution. \begin{parts} \part Generate 100000 normally distributed random numbers. -\part Draw from these data 1000 samples of size $n=3$, 5, 10, and 50. -\part Compute the mean $\bar x$ of the samples and plot the +\part Draw from these data 1000 samples of size $n=3$, 5, 10, and +50. For each sample size $n$ ... +\part ... compute the mean $\bar x$ of the samples and plot the probability density of these means. -\part Compare the resulting probability densities with corresponding +\part ... compare the resulting probability densities with corresponding normal distributions. -\part Compute in addition $t=\bar x/(\sigma_x/\sqrt{n})$ (standard -deviation of the samples $\sigma_x$) and compare their distribution -with the normal distribution with standard deviation of one. Is $t$ -normally distributed? Under which conditions is $t$ normally -distributed? +\part ... compute Student's $t=\bar x/(\sigma_x/\sqrt{n})$ and compare its +distribution with the normal distribution with standard deviation of +one. Is $t$ normally distributed? Under which conditions is $t$ +normally distributed? 
\end{parts} \newsolutionpage \begin{solution} @@ -167,16 +168,16 @@ y = randn(n, 1) + a*x; \part Compute and plot the probability density of these correlation coefficients. \part Is the correlation of the original data set significant? - \part What does significance of the correlation mean? - \part Vary the sample size \code{n} and compute in the same way the - significance of the correlation. + \part What does ``significance of the correlation'' mean? +% \part Vary the sample size \code{n} and compute in the same way the +% significance of the correlation. \end{parts} \begin{solution} \lstinputlisting{correlationsignificance.m} \includegraphics[width=1\textwidth]{correlationsignificance} \end{solution} -\question \qt{Bootstrap of the correlation coefficient} +\question \qt{Bootstrap the correlation coefficient} The permutation test generates the distribution of the null hypothesis of uncorrelated data and we check whether the correlation coefficient of the data differs significantly from this @@ -184,7 +185,7 @@ distribution. Alternatively we can bootstrap the data while keeping the pairs and determine the confidence interval of the correlation coefficient of the data. If this differs significantly from a correlation coefficient of zero we can conclude that the correlation -coefficient of the data quantifies indeed a correlated data. +coefficient of the data indeed quantifies correlated data. We take the same data set that we have generated in exercise \ref{permutationtest} (\ref{permutationtestdata}). 
diff --git a/bootstrap/exercises/tdistribution.m b/bootstrap/exercises/tdistribution.m index 223cbe5..5fe8341 100644 --- a/bootstrap/exercises/tdistribution.m +++ b/bootstrap/exercises/tdistribution.m @@ -6,9 +6,9 @@ for nsamples=[3 5 10 50] nsamples %% compute mean, standard deviation and t: nmeans = 10000; - means = zeros( nmeans, 1 ); - sdevs = zeros( nmeans, 1 ); - students = zeros( nmeans, 1 ); + means = zeros(nmeans, 1); + sdevs = zeros(nmeans, 1); + students = zeros(nmeans, 1 ); for i=1:nmeans sample = x(randi(n, nsamples, 1)); means(i) = mean(sample); @@ -34,7 +34,7 @@ for nsamples=[3 5 10 50] bar(b, h, 'facecolor', 'b', 'edgecolor', 'b') hold on plot(xg, pm, 'r', 'linewidth', 2) - title( sprintf('sample size = %d', nsamples) ); + title(sprintf('sample size = %d', nsamples)); xlim( [-3, 3] ); xlabel('Mean'); ylabel('pdf'); @@ -47,12 +47,12 @@ for nsamples=[3 5 10 50] bar(b, h, 'facecolor', 'b', 'edgecolor', 'b') hold on plot(xg, pt, 'r', 'linewidth', 2) - title( sprintf('sample size = %d', nsamples) ); + title(sprintf('sample size = %d', nsamples)); xlim( [-8, 8] ); xlabel('Student-t'); ylabel('pdf'); hold off; - savefigpdf( gcf, sprintf('tdistribution-n%02d.pdf', nsamples), 14, 5 ); + savefigpdf(gcf, sprintf('tdistribution-n%02d.pdf', nsamples), 14, 5); pause( 3.0 ) end diff --git a/bootstrap/lecture/bootstrap.tex b/bootstrap/lecture/bootstrap.tex index 28dbec0..f0fae62 100644 --- a/bootstrap/lecture/bootstrap.tex +++ b/bootstrap/lecture/bootstrap.tex @@ -84,9 +84,11 @@ standard errors and confidence intervals). Bootstrapping methods create bootstrapped samples from a SRS by resampling. The bootstrapped samples are used to estimate the sampling distribution of a statistical measure. The bootstrapped samples have -the same size as the original sample and are created by randomly drawing with -replacement. That is, each value of the original sample can occur -once, multiple time, or not at all in a bootstrapped sample. 
+the same size as the original sample and are created by randomly +drawing with replacement. That is, each value of the original sample +can occur once, multiple times, or not at all in a bootstrapped +sample. This can be implemented by generating random indices into the +data set using the \code{randi()} function. \section{Bootstrap of the standard error} @@ -165,13 +167,13 @@ data points $(x_i, y_i)$. By calculating the correlation coefficient we can quantify how strongly $y$ depends on $x$. The correlation coefficient alone, however, does not tell whether the correlation is significantly different from a random correlation. The null hypothesis -for such a situation would be that $y$ does not depend on $x$. In +for such a situation is that $y$ does not depend on $x$. In order to perform a permutation test, we need to destroy the correlation by permuting the $(x_i, y_i)$ pairs, i.e. we rearrange the $x_i$ and $y_i$ values in a random fashion. Generating many sets of -random pairs and computing the resulting correlation coefficients, +random pairs and computing the resulting correlation coefficients yields a distribution of correlation coefficients that result -randomnly from uncorrelated data. By comparing the actually measured +randomly from uncorrelated data. By comparing the actually measured correlation coefficient with this distribution we can directly assess the significance of the correlation (figure\,\ref{permutecorrelationfig}). @@ -183,10 +185,10 @@ Estimate the statistical significance of a correlation coefficient. and calculate the respective $y$-values according to $y_i =0.2 \cdot x_i + u_i$ where $u_i$ is a random number drawn from a normal distribution. \item Calculate the correlation coefficient. -\item Generate the distribution according to the null hypothesis by - generating uncorrelated pairs. For this permute $x$- and $y$-values - \matlabfun{randperm()} 1000 times and calculate for each - permutation the correlation coefficient. 
+\item Generate the distribution of the null hypothesis by generating + uncorrelated pairs. For this permute $x$- and $y$-values + \matlabfun{randperm()} 1000 times and calculate for each permutation + the correlation coefficient. \item Read out the 95\,\% percentile from the resulting distribution of the null hypothesis and compare it with the correlation coefficient computed from the original data. diff --git a/pointprocesses/lecture/pointprocessscetchA.eps b/pointprocesses/lecture/pointprocessscetchA.eps index 15bfb45..30d6da4 100644 --- a/pointprocesses/lecture/pointprocessscetchA.eps +++ b/pointprocesses/lecture/pointprocessscetchA.eps @@ -1,7 +1,7 @@ %!PS-Adobe-2.0 EPSF-2.0 %%Title: pointprocessscetchA.tex %%Creator: gnuplot 4.6 patchlevel 4 -%%CreationDate: Mon Dec 2 13:03:15 2019 +%%CreationDate: Tue Dec 3 08:08:50 2019 %%DocumentFonts: %%BoundingBox: 50 50 373 135 %%EndComments @@ -430,10 +430,10 @@ SDict begin [ /Title (pointprocessscetchA.tex) /Subject (gnuplot plot) /Creator (gnuplot 4.6 patchlevel 4) - /Author (benda) + /Author (jan) % /Producer (gnuplot) % /Keywords () - /CreationDate (Mon Dec 2 13:03:15 2019) + /CreationDate (Tue Dec 3 08:08:50 2019) /DOCINFO pdfmark end } ifelse diff --git a/pointprocesses/lecture/pointprocessscetchA.pdf b/pointprocesses/lecture/pointprocessscetchA.pdf index afb0114e01cdaf83ecdb97f885ffa871b3163b71..d064a3a00a75327b804866f9670e94f32d37489a 100644 GIT binary patch delta 307 zcmaDR`bcy`I}@Yf<_;!1M($Ko3xiZMLtT@Uq{$bVyV3XxliOKrxQs&#EUbXY)L`;i z77dAz(o_XqmsH(kg%AZ}D+L29u%wm2WM0;04E?)V9ni!iC(E;`qKPR?&SyKqoRyfj z*@ArmqcF$@m(*kh1!D!c&96Bm*^Lb>3@m`I;O5K*DV|)->B?v}`5@b2B4jLtT>;!^sz!yV3XxliOKrxQs#!jjarftqe^k zpJmaI@XgOt&~-`GO;!j|FtSoGv;s?7nNH?qZN|{Qo7DkLOmeb3n<|=^!sL9mBdkfO zc`1pT&Da+(8iVX`NljKzFj9cotzcweXsOAi@0*|El30?e;bH|6Ff=kSGBh?Y2D*)t zGaF*WDojr0&PEecocxE|n#(xEz`_cM zObsSG@@Pnel%^`^x}@qRD}*Q*TPYY=fhDaBCO7glqv=mT<4aDS%&UqfrZD+2?-Ay# z#JtUG_!ckGX4M7#LTy}O`#U+VFB^5=fXTSTma+tP*(r|