[logistic] added code on AUC interpretation
This commit is contained in:
parent 06ae3bb84e
commit 849f1d8f0d

logistic/code/logistic.py (new file, 156 lines)
@@ -0,0 +1,156 @@
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind, mannwhitneyu


def auc(n, dx, uniform=False, plot=False):
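    """Generate two groups of scores ("loser" and "winner") and quantify their separation.

    n: number of data values per group.
    dx: shift of the winner group relative to the loser group.
    uniform: if True, draw the data from uniform instead of normal distributions.
    plot: if True, show the data, the true- and false-positive rates, and the ROC curve.

    Returns the AUC, the fraction of overlapping data values, Cohen's d,
    the Student t statistic, and the Mann-Whitney U statistic.
    """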
    # loser:
    if uniform:
        x0 = np.random.rand(n)
    else:
        x0 = np.random.randn(n)*0.3
    y0 = np.zeros(len(x0))
    # winner:
    if uniform:
        x1 = np.random.rand(n) + dx
    else:
        x1 = np.random.randn(n)*0.3 + dx
    y1 = np.ones(len(x1))

    # combine both groups into a single table (column 0: value, column 1: label):
    data = np.zeros((len(x0) + len(x1), 2))
    data[:len(x0),0] = x0
    data[:len(x0),1] = y0
    data[len(x0):,0] = x1
    data[len(x0):,1] = y1

    # fraction of overlapping data values:
    si = np.argsort(data[:,0])
    # first sorted index whose label differs from that of the smallest value:
    i0 = np.argmax(data[si,1] != data[si[0],1])
    # last sorted index whose label differs from that of the largest value:
    i1 = len(data) - 1 - np.argmax(data[si[::-1],1] != data[si[-1],1])
    overlap = (i1-i0+1)/len(data)

    # Cohen's d:
    m0 = np.mean(data[data[:,1] < 0.5,0])
    v0 = np.var(data[data[:,1] < 0.5,0])
    m1 = np.mean(data[data[:,1] > 0.5,0])
    v1 = np.var(data[data[:,1] > 0.5,0])
    cohensd = (m1 - m0)/np.sqrt(0.5*(v0+v1))
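    # (for normally distributed data with equal variances, Cohen's d and the
    # AUC are directly related: AUC = Phi(d/sqrt(2)), with Phi the cumulative
    # standard normal distribution)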

    # t-test:
    ttest, p = ttest_ind(data[data[:,1] > 0.5,0], data[data[:,1] < 0.5,0])

    # Mann-Whitney U:
    mannu, p = mannwhitneyu(data[data[:,1] < 0.5,0], data[data[:,1] > 0.5,0])

    # ROC curve: true- and false-positive rates for a range of thresholds:
    thresh = np.arange(np.min(data[:,0])-0.1, np.max(data[:,0])+0.2, 0.01)
    true_pos = np.zeros(len(thresh))
    false_pos = np.zeros(len(thresh))
    for k in range(len(thresh)):
        true_pos[k] = np.sum(data[data[:,0] > thresh[k],1] > 0.5)/np.sum(data[:,1] > 0.5)
        false_pos[k] = np.sum(data[data[:,0] > thresh[k],1] < 0.5)/np.sum(data[:,1] < 0.5)

    # AUC: integrate the ROC curve over the false-positive rate.
    # false_pos decreases with threshold, so both arrays are reversed for np.interp:
    droc = 0.001
    xroc = np.arange(0.0, 1.0+droc, droc)
    yroc = np.interp(xroc, false_pos[::-1], true_pos[::-1])
    auc = np.sum(yroc)*droc
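    # Interpretation: the AUC estimates the probability that a randomly drawn
    # winner value exceeds a randomly drawn loser value. For n0 losers and n1
    # winners it equals the Mann-Whitney U statistic of the winner group
    # divided by the number of pairs, AUC = U/(n0*n1).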

    if plot:
        fig = plt.figure()
        ax = fig.add_subplot(211)
        ax.axvline(data[si[i0],0], color='k')
        ax.axvline(data[si[i1],0], color='k', lw=2)
        ax.plot(data[:,0], data[:,1], 'o')
        ax.plot(data[data[:,1] < 0.5,0], np.zeros(len(data[data[:,1] < 0.5,0]))-0.5, 'or')
        ax.plot(data[data[:,1] > 0.5,0], np.zeros(len(data[data[:,1] > 0.5,0]))-0.5, 'og')
        ax.text(0.5*(data[si[i0],0]+data[si[i1],0]), 0.65, 'overlap=%.0f%%' % (100.0*overlap), ha='center')
        ax.text(0.5*(data[si[i0],0]+data[si[i1],0]), 0.35, "Cohen's d=%.2f" % cohensd, ha='center')
        ax.set_xlabel('x')
        ax.set_yticks([0, 1])
        ax.set_yticklabels(['Lose', 'Win'])
        if uniform:
            ax.set_title('Uniformly distributed data')
        else:
            ax.set_title('Normally distributed data')

        ax = fig.add_subplot(223)
        ax.plot(thresh, true_pos, '-og', label='TP')
        ax.plot(thresh, false_pos, '-or', label='FP')
        ax.legend()
        ax.set_xlabel('threshold')

        ax = fig.add_subplot(224)
        ax.plot(false_pos, true_pos, '-o')
        ax.fill_between(xroc, yroc)
        ax.text(0.5, 0.5, 'AUC=%.0f%%' % (100.0*auc))
        ax.set_xlabel('FP')
        ax.set_ylabel('TP')
        fig.tight_layout()
        plt.show()

    return auc, overlap, cohensd, ttest, mannu

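# Pair-counting sketch of what the AUC measures (auc_pairs is a new helper,
# not used by the code above): the AUC is the fraction of (winner, loser)
# pairs in which the winner's value is the larger one, ties counted half.
def auc_pairs(x0, x1):
    greater = np.sum(x1[:, None] > x0[None, :])
    ties = np.sum(x1[:, None] == x0[None, :])
    return (greater + 0.5*ties)/(len(x0)*len(x1))
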
# demo:
auc(20, 0.5, True, True)    # uniformly distributed data
auc(20, 0.5, False, True)   # normally distributed data

# AUC versus overlap:
n = 100
aucs_uni = []
overlaps_uni = []
cohensd_uni = []
ttest_uni = []
mannu_uni = []
aucs_norm = []
overlaps_norm = []
cohensd_norm = []
ttest_norm = []
mannu_norm = []
for frac in np.arange(-1.5, 1.5, 0.02):
    # frac is the shift dx between the loser and the winner group:
    a, o, d, t, u = auc(n, frac, True, False)
    aucs_uni.append(a)
    overlaps_uni.append(o)
    cohensd_uni.append(d)
    ttest_uni.append(t)
    mannu_uni.append(u)
    a, o, d, t, u = auc(n, frac, False, False)
    aucs_norm.append(a)
    overlaps_norm.append(o)
    cohensd_norm.append(d)
    ttest_norm.append(t)
    mannu_norm.append(u)

fig, axs = plt.subplots(2, 2)
ax = axs[0, 0]
ax.plot([0.0, 1.0, 0.0], [0.0, 0.5, 1.0], 'k')
ax.plot(overlaps_uni, aucs_uni, 'o', label='uniform pdfs')
ax.plot(overlaps_norm, aucs_norm, 'o', label='normal pdfs')
ax.set_ylim(0, 1)
ax.set_xlabel('fraction of overlapping data')
ax.set_ylabel('AUC')
ax.legend(loc='center left')
ax = axs[0, 1]
ax.plot(cohensd_uni, aucs_uni, 'o', label='uniform pdfs')
ax.plot(cohensd_norm, aucs_norm, 'o', label='normal pdfs')
ax.set_ylim(0, 1)
ax.set_xlabel("Cohen's d")
ax.set_ylabel('AUC')
#ax.legend(loc='center left')
ax = axs[1, 1]
ax.plot(ttest_uni, aucs_uni, 'o', label='uniform pdfs')
ax.plot(ttest_norm, aucs_norm, 'o', label='normal pdfs')
ax.set_ylim(0, 1)
ax.set_xlabel("Student t")
ax.set_ylabel('AUC')
ax = axs[1, 0]
ax.plot(mannu_uni, aucs_uni, 'o', label='uniform pdfs')
ax.plot(mannu_norm, aucs_norm, 'o', label='normal pdfs')
ax.set_ylim(0, 1)
ax.set_xlabel("Mann-Whitney U")
ax.set_ylabel('AUC')
fig.savefig('aucoverlap.pdf')
plt.show()

@@ -56,6 +56,7 @@
% to closed source license models), FAIR
% * knowledge is freedom ...
% * do not consume what companies offer you but know what you want and implement it
% * no screenshots on how to use a stupid GUI program
% * python as a modern and popular programming language in modern science
% (i.e. machine learning, others?) with a strong community support.

@@ -128,22 +129,33 @@

% add chapters on linear algebra, PCA, clustering, see linearalgebra/

% add chapter on logistic regression, see logistic/

% add chapter on simple machine learning, perceptron

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\part{Tools}

% add chapter on Markup and LaTeX -> see latex/
% with an introduction on mark-up in general, an outlook to markdown, html, xml, JSON, yaml
% make notes, data, and texts understandable by computers!
% - if you know LaTeX you also know how to better use word/libreoffice
% - with an introduction on mark-up in general, an outlook to markdown, html, xml, JSON, yaml
% - make notes, data, and texts understandable by computers!
% - easily generate markup from python scripts via mako.

% bash basics:
% - calling a (python) script/program with arguments
% - files: ls, cp, mkdir, find, etc.
% - text file manipulations: grep, diff, sed, awk
% - pipes, for loop, if, test

% Makefile

% distributed computing (ssh and grid engines)
%

% see replicability/ :
% version control (git and github) only put source files into repository!
% data handling (structure, storage (rsync), backup, data bases)
% data annotation, meta data, FAIR
% - version control (git and github) only put source files into repository!
% - data handling (structure, storage (rsync), backup, data bases)
% - data annotation, meta data, FAIR


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%