[bootstrap] added difference of mean
parent 0e30a01a45
commit 3dd0660b21
@@ -1,6 +1,6 @@
 % generate correlated data:
-n=200;
-a=0.2;
+n = 200;
+a = 0.2;
 x = randn(n, 1);
 y = randn(n, 1) + a*x;
 
@@ -11,12 +11,12 @@ fprintf('correlation coefficient of data r = %.2f\n', rd);
 % distribution of null hypothesis by permutation:
 nperm = 1000;
 rs = zeros(nperm,1);
-for i=1:nperm
-xr=x(randperm(length(x))); % shuffle x
-yr=y(randperm(length(y))); % shuffle y
+for i = 1:nperm
+xr = x(randperm(length(x))); % shuffle x
+yr = y(randperm(length(y))); % shuffle y
 rs(i) = corr(xr, yr);
 end
-[h,b] = hist(rs, 20);
+[h, b] = hist(rs, 20);
 h = h/sum(h)/(b(2)-b(1)); % normalization
 
 % significance:
bootstrap/code/meandiffsignificance.m (new file, 40 lines)
@@ -0,0 +1,40 @@
+% generate two data sets:
+n = 200;
+d = 0.2;
+x = randn(n, 1);
+y = randn(n, 1) + d;
+
+% difference of sample means:
+md = mean(y) - mean(x);
+fprintf('difference of means of data d = %.2f\n', md);
+
+% distribution of null hypothesis by permutation:
+nperm = 1000;
+xy = [x; y];  % x and y data in one column vector
+ds = zeros(nperm,1);
+for i = 1:nperm
+    xyr = xy(randperm(length(xy)));  % shuffle data
+    xr = xyr(1:length(x));           % random x-data
+    yr = xyr(length(x)+1:end);       % random y-data
+    ds(i) = mean(yr) - mean(xr);     % difference of means
+end
+[h, b] = hist(ds, 20);
+h = h/sum(h)/(b(2)-b(1));  % normalization
+
+% significance:
+dq = quantile(ds, 0.95);
+fprintf('difference of means of null hypothesis at 5%% significance = %.2f\n', dq);
+if md >= dq
+    fprintf('--> difference of means d=%.2f is significant\n', md);
+else
+    fprintf('--> d=%.2f is not a significant difference of means\n', md);
+end
+
+% plot:
+bar(b, h, 'facecolor', 'b');
+hold on;
+bar(b(b>=dq), h(b>=dq), 'facecolor', 'r');
+plot([md md], [0 4], 'r', 'linewidth', 2);
+xlabel('Difference of means');
+ylabel('Probability density of H0');
+hold off;
bootstrap/code/meandiffsignificance.out (new file, 4 lines)
@@ -0,0 +1,4 @@
+>> meandiffsignificance
+difference of means of data d = 0.18
+difference of means of null hypothesis at 5% significance = 0.17
+--> difference of means d=0.18 is significant
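The script reports a binary decision at the 5% level. A common refinement, sketched here under the same one-sided setup (not part of the commit), is to also report a permutation p-value, i.e. the fraction of the $N$ permuted differences $d_i$ that reach the measured difference $d$:
\[ p = \frac{\#\{d_i \ge d\} + 1}{N + 1} \]
The $+1$ terms keep the estimate away from zero for a finite number of permutations.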
@@ -26,6 +26,4 @@ This chapter easily covers two lectures:
 
 Add jacknife methods to bootstrapping
 
-Add permutation test for significant different means of two distributions.
-
 \end{document}
@@ -125,7 +125,6 @@ Via bootstrapping we create a distribution of the mean values
 (\figref{bootstrapsemfig}) and the standard deviation of this
 distribution is the standard error of the mean.
 
-\pagebreak[4]
 \begin{exercise}{bootstrapsem.m}{bootstrapsem.out}
   Create the distribution of mean values from bootstrapped samples
   resampled from a single SRS. Use this distribution to estimate the
@@ -152,67 +151,156 @@ desired \entermde{Signifikanz}{significance level}, the
 \entermde{Nullhypothese}{null hypothesis} may be rejected.
 
 Traditionally, such probabilities are taken from theoretical
-distributions which are based on assumptions about the data. Thus the
-applied statistical test has to be appropriate for the type of
+distributions which are based on some assumptions about the data. For
+example, the data should be normally distributed. Given some data one
+has to find an appropriate test that matches the properties of the
 data. An alternative approach is to calculate the probability density
-of the null hypothesis directly from the data itself. To do this, we
+of the null hypothesis directly from the data themselves. To do so, we
 need to resample the data according to the null hypothesis from the
 SRS. By such permutation operations we destroy the feature of interest
-while we conserve all other statistical properties of the data.
+while conserving all other statistical properties of the data.
 
+
+\subsection{Significance of a difference in the mean}
+
+Often we would like to know whether two data sets differ in their
+mean. Whether the ears of foxes are larger in southern Europe compared
+to the ones from Scandinavia, whether a drug decreases blood pressure
+in humans, whether a sensory stimulus increases the firing rate of a
+neuron, etc. The \entermde{Nullhypothese}{null hypothesis} is that
+they do not differ in their means, i.e. that both data sets come from
+the same distribution. But even if the two data sets come from the
+same distribution, their sample means may nevertheless differ by
+chance. We need to figure out how these differences of the means are
+distributed. Only if the measured difference between the means is
+significantly larger than the ones obtained by chance can we reject
+the null hypothesis and consider the two data sets to differ
+significantly in their means.
+
+We can easily estimate the distribution of the null hypothesis by
+putting the data of both data sets into one big bag. By merging the two
+data sets we assume that all the data values come from the same
+distribution. We then randomly separate the data values into two new
+data sets. These random data sets contain data from both original data
+sets and thus come from the same distribution. From these random data
+sets we compute the difference of their sample means. This procedure
+is repeated many, say one thousand, times and each time we get a value
+for the difference of means. The distribution of these values is the
+distribution of the null hypothesis. It is the distribution of
+differences of mean values that we get by chance although the two data
+sets come from the same distribution. For a one-sided test that checks
+whether the measured difference of means is significantly larger than
+zero at a significance level of 5\,\%, we compute the value of the
+95\,\% percentile of the null distribution. If the measured value is
+larger, we can reject the null hypothesis and consider the two data
+sets to differ significantly in their means.
+
+By using the original data to estimate the null hypothesis, we make no
+assumptions about the properties of the data. We do not need to worry
+about the data being normally distributed. We do not need to memorize
+which test to use in which situation. And we better understand what we
+are testing, because we design the test ourselves. Nowadays, computers
+are powerful enough to iterate even ten thousand times over the data
+to compute the distribution of the null hypothesis --- with only a few
+lines of code. This is why \entermde{Permutationstest}{permutation
+tests} are getting quite popular.
+
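The recipe above indeed fits in a few lines of code. A minimal sketch in Python/NumPy, with arbitrary sample sizes and seed (the commit's actual implementation is the MATLAB script meandiffsignificance.m above):

import numpy as np

rng = np.random.default_rng(42)      # arbitrary seed for reproducibility
x = rng.normal(0.0, 1.0, 200)        # first data set
y = rng.normal(0.2, 1.0, 200)        # second data set with shifted mean
md = y.mean() - x.mean()             # measured difference of sample means

xy = np.concatenate((x, y))          # one big bag: assume a common distribution
ds = np.empty(1000)
for i in range(len(ds)):
    xyr = rng.permutation(xy)        # shuffle, destroying group membership
    ds[i] = xyr[len(x):].mean() - xyr[:len(x)].mean()

dq = np.quantile(ds, 0.95)           # 95% percentile of the null distribution
print('d = %.2f, 95%% percentile = %.2f -> %s'
      % (md, dq, 'significant' if md >= dq else 'not significant'))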
 \begin{figure}[tp]
-  \includegraphics[width=1\textwidth]{permutecorrelation}
-  \titlecaption{\label{permutecorrelationfig}Permutation test for
-    correlations.}{Let the correlation coefficient of a dataset with
-    200 samples be $\rho=0.21$. The distribution of the null
-    hypothesis (yellow), optained from the correlation coefficients of
-    permuted and therefore uncorrelated datasets is centered around
-    zero. The measured correlation coefficient is larger than the
-    95\,\% percentile of the null hypothesis. The null hypothesis may
-    thus be rejected and the measured correlation is considered
-    statistically significant.}
+  \includegraphics[width=1\textwidth]{permuteaverage}
+  \titlecaption{\label{permuteaverage}Permutation test for differences
+    in means.}{We want to test whether two datasets
+    $\left\{x_i\right\}$ (red) and $\left\{y_i\right\}$ (blue) come
+    from different distributions by assessing the significance of the
+    difference in their sample means. The data sets were generated
+    with a difference in their population means of $d=0.7$. For
+    generating the distribution of the null hypothesis, i.e. the
+    distribution of differences in the means if the two data sets come
+    from the same distribution, we randomly select the same number of
+    samples from both data sets (top right). This is repeated many
+    times and results in the desired distribution of differences of
+    means (bottom). The measured difference is clearly beyond the
+    95\,\% percentile of this distribution and thus indicates a
+    significant difference between the distributions of the two
+    original data sets.}
 \end{figure}
 
-%\subsection{Significance of a difference in the mean}
+\begin{exercise}{meandiffsignificance.m}{meandiffsignificance.out}
+  Estimate the statistical significance of a difference in the mean of two data sets.
+  \vspace{-1ex}
+  \begin{enumerate}
+  \item Generate two independent data sets, $\left\{x_i\right\}$ and
+    $\left\{y_i\right\}$, of $n=200$ samples each, by drawing random
+    numbers from a normal distribution. Add 0.2 to all the $y_i$ samples
+    to ensure that the population means differ by 0.2.
+  \item Calculate the difference between the sample means of the two data sets.
+  \item Estimate the distribution of the null hypothesis of no
+    difference of the means by generating new data sets with the same
+    number of samples randomly selected from both data sets. For this,
+    lump the two data sets together into a single vector. Then permute
+    the order of the elements in this vector using the function
+    \varcode{randperm()}, split it into two data sets, and calculate
+    the difference of their means. Repeat this 1000 times.
+  \item Read out the 95\,\% percentile from the resulting distribution
+    of the differences in the mean, i.e. the null hypothesis, using the
+    \varcode{quantile()} function, and compare it with the difference of
+    means measured from the original data sets.
+  \end{enumerate}
+\end{exercise}
 
-%See \url{https://en.wikipedia.org/wiki/Resampling_(statistics)#Permutation_tests.}
 
 \subsection{Significance of correlations}
 
-A good example for the application of a
-\entermde{Permutationstest}{permutaion test} is the statistical
-assessment of \entermde[correlation]{Korrelation}{correlations}. Given
-are measured pairs of data points $(x_i, y_i)$. By calculating the
+Another nice example for the application of a
+\entermde{Permutationstest}{permutation test} is testing for
+significant \entermde[correlation]{Korrelation}{correlations}
+(figure\,\ref{permutecorrelationfig}). Given are measured pairs of
+data points $(x_i, y_i)$. By calculating the
 \entermde[correlation!correlation
-coefficient]{Korrelationskoeffizient}{correlation
-coefficient} we can quantify how strongly $y$ depends on $x$. The
-correlation coefficient alone, however, does not tell whether the
-correlation is significantly different from a random correlation. The
-\entermde{Nullhypothese}{null hypothesis} for such a situation is that
-$y$ does not depend on $x$. In order to perform a permutation test, we
-need to destroy the correlation by permuting the $(x_i, y_i)$ pairs,
-i.e. we rearrange the $x_i$ and $y_i$ values in a random
+coefficient]{Korrelationskoeffizient}{correlation coefficient} we can
+quantify how strongly $y$ depends on $x$. The correlation coefficient
+alone, however, does not tell whether the correlation is significantly
+different from a non-zero correlation that we might get although there
+is no true correlation in the data. The \entermde{Nullhypothese}{null
+hypothesis} for such a situation is that $y$ does not depend on
+$x$. In order to perform a permutation test, we need to destroy the
+correlation between the data pairs by permuting the $(x_i, y_i)$
+pairs, i.e. we rearrange the $x_i$ and $y_i$ values in a random
 fashion. Generating many sets of random pairs and computing the
-resulting correlation coefficients yields a distribution of
-correlation coefficients that result randomly from uncorrelated
+corresponding correlation coefficients yields a distribution of
+correlation coefficients that result randomly from truly uncorrelated
 data. By comparing the actually measured correlation coefficient with
 this distribution we can directly assess the significance of the
-correlation (figure\,\ref{permutecorrelationfig}).
+correlation.
 
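The same idea in a few lines of Python/NumPy, shuffling the pairing instead of the group membership. Again only an illustrative sketch with arbitrary parameters, mirroring the MATLAB exercise correlationsignificance.m:

import numpy as np

rng = np.random.default_rng(7)          # arbitrary seed
x = rng.normal(0.0, 1.0, 200)
y = 0.2*x + rng.normal(0.0, 1.0, 200)   # weakly correlated pairs
rd = np.corrcoef(x, y)[0, 1]            # measured correlation coefficient

rs = np.empty(1000)
for i in range(len(rs)):
    yr = rng.permutation(y)             # shuffling one side destroys the pairing
    rs[i] = np.corrcoef(x, yr)[0, 1]

rq = np.quantile(rs, 0.95)              # 95% percentile of the null distribution
print('r = %.2f, 95%% percentile = %.2f -> %s'
      % (rd, rq, 'significant' if rd > rq else 'not significant'))

Shuffling just one of the two vectors is already enough to destroy the pairing; the commit's scripts shuffle both, which is equally valid.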
+\begin{figure}[tp]
+  \includegraphics[width=1\textwidth]{permutecorrelation}
+  \titlecaption{\label{permutecorrelationfig}Permutation test for
+    correlations.}{Let the correlation coefficient of a dataset with
+    200 samples be $\rho=0.21$ (top left). By shuffling the data pairs
+    we destroy any true correlation (top right). The resulting
+    distribution of the null hypothesis (bottom, yellow), obtained from
+    the correlation coefficients of permuted and therefore
+    uncorrelated datasets, is centered around zero. The measured
+    correlation coefficient is larger than the 95\,\% percentile of
+    the null hypothesis. The null hypothesis may thus be rejected and
+    the measured correlation is considered statistically significant.}
+\end{figure}
+
 \begin{exercise}{correlationsignificance.m}{correlationsignificance.out}
   Estimate the statistical significance of a correlation coefficient.
   \begin{enumerate}
-  \item Create pairs of $(x_i, y_i)$ values. Randomly choose $x$-values
+  \item Generate pairs of $(x_i, y_i)$ values. Randomly choose $x$-values
     and calculate the respective $y$-values according to $y_i =0.2 \cdot x_i + u_i$
     where $u_i$ is a random number drawn from a normal distribution.
   \item Calculate the correlation coefficient.
-  \item Generate the distribution of the null hypothesis by generating
+  \item Estimate the distribution of the null hypothesis by generating
     uncorrelated pairs. For this permute $x$- and $y$-values
     \matlabfun{randperm()} 1000 times and calculate for each permutation
     the correlation coefficient.
   \item Read out the 95\,\% percentile from the resulting distribution
-    of the null hypothesis and compare it with the correlation
-    coefficient computed from the original data.
+    of the null hypothesis using the \varcode{quantile()} function and
+    compare it with the correlation coefficient computed from the
+    original data.
   \end{enumerate}
 \end{exercise}
 
@@ -24,7 +24,7 @@ for i in range(nresamples) :
     musrs.append(np.mean(rng.randn(nsamples)))
 hmusrs, _ = np.histogram(musrs, bins, density=True)
 
-fig, ax = plt.subplots(figsize=cm_size(figure_width, 1.2*figure_height))
+fig, ax = plt.subplots(figsize=cm_size(figure_width, 1.05*figure_height))
 fig.subplots_adjust(**adjust_fs(left=4.0, bottom=2.7, right=1.5))
 ax.set_xlabel('Mean')
 ax.set_xlim(-0.4, 0.4)
bootstrap/lecture/permuteaverage.py (new file, 103 lines)
@@ -0,0 +1,103 @@
+import numpy as np
+import scipy.stats as st
+import matplotlib.pyplot as plt
+import matplotlib.gridspec as gridspec
+import matplotlib.ticker as ticker
+from plotstyle import *
+
+rng = np.random.RandomState(637281)
+
+# generate data that differ in their mean by d:
+n = 200
+d = 0.7
+x = rng.randn(n) + d
+y = rng.randn(n)
+#x = rng.exponential(1.0, n)
+#y = rng.exponential(1.0, n) + d
+
+# histogram of data:
+db = 0.5
+bins = np.arange(-2.5, 2.6, db)
+hx, _ = np.histogram(x, bins)
+hy, _ = np.histogram(y, bins)
+
+# difference of means, pooled standard deviation and Cohen's d:
+ad = np.mean(x)-np.mean(y)
+s = np.sqrt(((len(x)-1)*np.var(x)+(len(y)-1)*np.var(y))/(len(x)+len(y)-2))
+cd = ad/s
+
+# permutation:
+nperm = 1000
+ads = []
+xy = np.hstack((x, y))
+for i in range(nperm) :
+    xyp = rng.permutation(xy)
+    ads.append(np.mean(xyp[:len(x)])-np.mean(xyp[len(x):]))
+
+# histogram of the last set of shuffled data:
+hxp, _ = np.histogram(xyp[:len(x)], bins)
+hyp, _ = np.histogram(xyp[len(x):], bins)
+
+# pdf of the differences of means:
+h, b = np.histogram(ads, 20, density=True)
+
+# significance:
+dq = np.percentile(ads, 95.0)
+print('Measured difference of means = %.2f, difference at 95%% percentile of permutation = %.2f' % (ad, dq))
+da = 1.0-0.01*st.percentileofscore(ads, ad)
+print('Measured difference of means %.2f is at %.2f%% percentile of permutation' % (ad, 100.0*da))
+
+# two-sample t-test for comparison; ttest_ind returns (statistic, p-value):
+at, ap = st.ttest_ind(x, y)
+print('Measured difference of means %.2f has a t-test p-value of %.4f' % (ad, ap))
+
+fig = plt.figure(figsize=cm_size(figure_width, 1.8*figure_height))
+gs = gridspec.GridSpec(nrows=2, ncols=2, wspace=0.35, hspace=0.8,
+                       **adjust_fs(fig, left=5.0, right=1.5, top=1.0, bottom=2.7))
+
+ax = fig.add_subplot(gs[0,0])
+ax.bar(bins[:-1]-0.25*db, hy, 0.5*db, **fsA)
+ax.bar(bins[:-1]+0.25*db, hx, 0.5*db, **fsB)
+ax.annotate('', xy=(0.0, 45.0), xytext=(d, 45.0), arrowprops=dict(arrowstyle='<->'))
+ax.text(0.5*d, 50.0, 'd=%.1f' % d, ha='center')
+ax.set_xlim(-2.5, 2.5)
+ax.set_ylim(0.0, 50)
+ax.yaxis.set_major_locator(ticker.MultipleLocator(20.0))
+ax.set_xlabel('Original x and y values')
+ax.set_ylabel('Counts')
+
+ax = fig.add_subplot(gs[0,1])
+ax.bar(bins[:-1]-0.25*db, hyp, 0.5*db, **fsA)
+ax.bar(bins[:-1]+0.25*db, hxp, 0.5*db, **fsB)
+ax.set_xlim(-2.5, 2.5)
+ax.set_ylim(0.0, 50)
+ax.yaxis.set_major_locator(ticker.MultipleLocator(20.0))
+ax.set_xlabel('Shuffled x and y values')
+ax.set_ylabel('Counts')
+
+ax = fig.add_subplot(gs[1,:])
+ax.annotate('Measured\ndifference\nis significant!',
+            xy=(ad, 1.2), xycoords='data',
+            xytext=(ad-0.1, 2.2), textcoords='data', ha='right',
+            arrowprops=dict(arrowstyle="->", relpos=(1.0,0.5),
+                            connectionstyle="angle3,angleA=-20,angleB=100") )
+ax.annotate('95% percentile',
+            xy=(0.19, 0.9), xycoords='data',
+            xytext=(0.3, 5.0), textcoords='data', ha='left',
+            arrowprops=dict(arrowstyle="->", relpos=(0.1,0.0),
+                            connectionstyle="angle3,angleA=40,angleB=80") )
+ax.annotate('Distribution of\nnull hypothesis',
+            xy=(-0.08, 3.0), xycoords='data',
+            xytext=(-0.22, 4.5), textcoords='data', ha='left',
+            arrowprops=dict(arrowstyle="->", relpos=(0.2,0.0),
+                            connectionstyle="angle3,angleA=60,angleB=150") )
+ax.bar(b[:-1], h, width=b[1]-b[0], **fsC)
+ax.bar(b[:-1][b[:-1]>=dq], h[b[:-1]>=dq], width=b[1]-b[0], **fsB)
+ax.plot([ad, ad], [0, 1], **lsA)
+ax.set_xlim(-0.25, 0.85)
+ax.set_ylim(0.0, 5.0)
+ax.yaxis.set_major_locator(ticker.MultipleLocator(2.0))
+ax.set_xlabel('Difference of means')
+ax.set_ylabel('PDF of H0')
+
+plt.savefig('permuteaverage.pdf')
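For reference, the pooled standard deviation s and effect size cd computed near the top of this script correspond to
\[ s_p = \sqrt{\frac{(n_x-1)\,s_x^2 + (n_y-1)\,s_y^2}{n_x + n_y - 2}},
\qquad d_{\mathrm{Cohen}} = \frac{\bar{x} - \bar{y}}{s_p}. \]
Note that np.var() defaults to the biased estimator (ddof=0); with np.var(x, ddof=1) the code would match this formula exactly, though the difference is negligible for n = 200.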
@@ -1,9 +1,11 @@
 import numpy as np
 import scipy.stats as st
 import matplotlib.pyplot as plt
+import matplotlib.gridspec as gridspec
+import matplotlib.ticker as ticker
 from plotstyle import *
 
-rng = np.random.RandomState(637281)
+rng = np.random.RandomState(37281)
 
 # generate correlated data:
 n = 200
@@ -19,32 +21,56 @@ rd = np.corrcoef(x, y)[0, 1]
 nperm = 1000
 rs = []
 for i in range(nperm) :
-    xr=rng.permutation(x)
-    yr=rng.permutation(y)
-    rs.append( np.corrcoef(xr, yr)[0, 1] )
+    xr = rng.permutation(x)
+    yr = rng.permutation(y)
+    rs.append(np.corrcoef(xr, yr)[0, 1])
+rr = np.corrcoef(xr, yr)[0, 1]
 
 # pdf of the correlation coefficients:
 h, b = np.histogram(rs, 20, density=True)
 
 # significance:
 rq = np.percentile(rs, 95.0)
-print('Measured correlation coefficient = %.2f, correlation coefficient at 95%% percentile of bootstrap = %.2f' % (rd, rq))
+print('Measured correlation coefficient = %.2f, correlation coefficient at 95%% percentile of permutation = %.2f' % (rd, rq))
 ra = 1.0-0.01*st.percentileofscore(rs, rd)
-print('Measured correlation coefficient %.2f is at %.4f percentile of bootstrap' % (rd, ra))
+print('Measured correlation coefficient %.2f is at %.4f percentile of permutation' % (rd, ra))
 
 rp, ra = st.pearsonr(x, y)
 print('Measured correlation coefficient %.2f is at %.4f percentile of test' % (rp, ra))
 
-fig, ax = plt.subplots(figsize=cm_size(figure_width, 1.2*figure_height))
-fig.subplots_adjust(**adjust_fs(left=4.0, bottom=2.7, right=0.5, top=1.0))
+fig = plt.figure(figsize=cm_size(figure_width, 1.8*figure_height))
+gs = gridspec.GridSpec(nrows=2, ncols=2, wspace=0.35, hspace=0.8,
+                       **adjust_fs(fig, left=5.0, right=1.5, top=1.0, bottom=2.7))
+
+ax = fig.add_subplot(gs[0,0])
+ax.text(0.0, 4.0, 'r=%.2f' % rd, ha='center')
+ax.plot(x, y, **psAm)
+ax.set_xlim(-4.0, 4.0)
+ax.set_ylim(-4.0, 4.0)
+ax.xaxis.set_major_locator(ticker.MultipleLocator(2.0))
+ax.yaxis.set_major_locator(ticker.MultipleLocator(2.0))
+ax.set_xlabel('x')
+ax.set_ylabel('y')
+
+ax = fig.add_subplot(gs[0,1])
+ax.text(0.0, 4.0, 'r=%.2f' % rr, ha='center')
+ax.plot(xr, yr, **psAm)
+ax.set_xlim(-4.0, 4.0)
+ax.set_ylim(-4.0, 4.0)
+ax.xaxis.set_major_locator(ticker.MultipleLocator(2.0))
+ax.yaxis.set_major_locator(ticker.MultipleLocator(2.0))
+ax.set_xlabel('Shuffled x')
+ax.set_ylabel('Shuffled y')
+
+ax = fig.add_subplot(gs[1,:])
 ax.annotate('Measured\ncorrelation\nis significant!',
             xy=(rd, 1.1), xycoords='data',
-            xytext=(rd, 2.2), textcoords='data', ha='left',
-            arrowprops=dict(arrowstyle="->", relpos=(0.2,0.0),
+            xytext=(rd-0.01, 3.0), textcoords='data', ha='left',
+            arrowprops=dict(arrowstyle="->", relpos=(0.3,0.0),
                             connectionstyle="angle3,angleA=10,angleB=80") )
 ax.annotate('95% percentile',
             xy=(0.14, 0.9), xycoords='data',
-            xytext=(0.18, 4.0), textcoords='data', ha='left',
+            xytext=(0.16, 6.2), textcoords='data', ha='left',
             arrowprops=dict(arrowstyle="->", relpos=(0.1,0.0),
                             connectionstyle="angle3,angleA=30,angleB=80") )
 ax.annotate('Distribution of\nuncorrelated\nsamples',
@@ -56,7 +82,9 @@ ax.bar(b[:-1], h, width=b[1]-b[0], **fsC)
 ax.bar(b[:-1][b[:-1]>=rq], h[b[:-1]>=rq], width=b[1]-b[0], **fsB)
 ax.plot( [rd, rd], [0, 1], **lsA)
 ax.set_xlim(-0.25, 0.35)
+ax.set_ylim(0.0, 6.0)
+ax.yaxis.set_major_locator(ticker.MultipleLocator(2.0))
 ax.set_xlabel('Correlation coefficient')
-ax.set_ylabel('Prob. density of H0')
+ax.set_ylabel('PDF of H0')
 
 plt.savefig('permutecorrelation.pdf')
@@ -42,6 +42,9 @@ $(BASENAME)-chapter.pdf : $(BASENAME)-chapter.tex $(BASENAME).tex $(wildcard $(B
 watchchapter :
 	while true; do ! make -q chapter && make chapter; sleep 0.5; done
 
+watchplots :
+	while true; do ! make -q plots && make plots; sleep 0.5; done
+
 cleantex:
 	rm -f *~
 	rm -f $(BASENAME).aux $(BASENAME).log *.idx
@@ -33,13 +33,14 @@ colors['white'] = '#FFFFFF'
 # general settings for plot styles:
 lwthick = 3.0
 lwthin = 1.8
+edgewidth = 0.0 if xkcd_style else 1.0
 mainline = {'linestyle': '-', 'linewidth': lwthick}
 minorline = {'linestyle': '-', 'linewidth': lwthin}
-largemarker = {'marker': 'o', 'markersize': 9, 'markeredgecolor': colors['white'], 'markeredgewidth': 1}
-smallmarker = {'marker': 'o', 'markersize': 6, 'markeredgecolor': colors['white'], 'markeredgewidth': 1}
+largemarker = {'marker': 'o', 'markersize': 9, 'markeredgecolor': colors['white'], 'markeredgewidth': edgewidth}
+smallmarker = {'marker': 'o', 'markersize': 6, 'markeredgecolor': colors['white'], 'markeredgewidth': edgewidth}
 largelinepoints = {'linestyle': '-', 'linewidth': lwthick, 'marker': 'o', 'markersize': 10, 'markeredgecolor': colors['white'], 'markeredgewidth': 1}
 smalllinepoints = {'linestyle': '-', 'linewidth': 1.4, 'marker': 'o', 'markersize': 7, 'markeredgecolor': colors['white'], 'markeredgewidth': 1}
-filllw = 1.0
+filllw = edgewidth
 fillec = colors['white']
 fillalpha = 0.4
 filledge = {'linewidth': filllw, 'joinstyle': 'round'}