README
4.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#-------------------------------------------------------------------------------
#Train-test cycle
# Step 1: run the tool with -a TRAIN on the training graphs and targets,
# passing "model" to -m.
./svmsgdnspdk \
  -i bursi.train.gspan -t bursi.train.target \
  -r 3 -d 8 \
  -a TRAIN -m model
# Step 2: run it with -a TEST on the test graphs, same -r/-d/-m values.
./svmsgdnspdk \
  -i bursi.test.gspan \
  -r 3 -d 8 \
  -a TEST -m model
# Step 3: put targets and predictions side by side, keep fields 1 and 3 of the
# pasted rows, score them with ./perf (its stderr is discarded) and print the
# first two fields of every perf output line on a single line.
paste bursi.test.target bursi.test.gspan.prediction \
  | awk '{print $1,$3}' \
  | ./perf -APR -ROC -ACC -t 0 -PRF 2>/dev/null \
  | awk '{printf("%s %s ",$1,$2)}END{printf("\n")}'
ACC 0.93504 PRF 0.93791 APR 0.97995 ROC 0.97822
#-------------------------------------------------------------------------------
#Cross validation cycle
# Run the tool with -a CROSS_VALIDATION on the full data set.
./svmsgdnspdk \
  -i bursi.gspan -t bursi.target \
  -r 3 -d 8 \
  -a CROSS_VALIDATION
# Score fields 2 and 4 of the cross-validation predictions with ./perf
# (stderr discarded) and print each metric name/value pair on one line.
awk '{print $2,$4}' bursi.gspan.cv_predictions \
  | ./perf -ACC -PRF -APR -ROC -t 0 2>/dev/null \
  | awk '{printf("%s %s ",$1,$2)}END{printf("\n")}'
ACC 0.92506 PRF 0.93409 APR 0.97962 ROC 0.97660
#-------------------------------------------------------------------------------
#Parameter optimization
# Run the tool with -a PARAMETERS_OPTIMIZATION; its standard output is
# discarded, so only stderr (if any) reaches the terminal.
./svmsgdnspdk \
  -i bursi.gspan -t bursi.target \
  -r 5 -d 10 -l 1e-10 -e 50 -c 5 \
  -a PARAMETERS_OPTIMIZATION > /dev/null
#-------------------------------------------------------------------------------
#Learning curve
# Collect the learning-curve data.
#   LC       number of folds read back per repetition
#   NUM_REP  number of repetitions; the repetition index is passed to -R
# For every (repetition, fold) pair two lines are appended to dat_lc:
#   "<train-file line count> <metrics on the train predictions>"
#   "<metrics on the test predictions>"
# The last command joins each such pair into a single aligned row of 'dat'.
LC=10
NUM_REP=10
# -f: dat_lc does not exist on the first run, and a plain rm would report an
# error (and abort a script running under 'set -e').
\rm -f dat_lc
lcn=$((LC+1))
for r in $(seq 1 "$NUM_REP"); do
  # tee truncates log_lc every time, so the log only keeps the last repetition.
  ./svmsgdnspdk -a LEARNING_CURVE -i bursi.gspan -t bursi.target -p "$lcn" -R "$r" | tee log_lc
  for i in $(seq 1 "$LC"); do
    train_pred=bursi.gspan.lc_predictions_train_fold_$i
    test_pred=bursi.gspan.lc_predictions_test_fold_$i
    # Number of lines in the train prediction file, written without a newline
    # so the train metrics continue on the same line.
    dim=$(wc -l < "$train_pred")
    printf '%s ' "$dim" >> dat_lc
    awk '{print $2,$4}' "$train_pred" \
      | ./perf -APR -ROC -ACC -t 0 -PRF 2>/dev/null \
      | awk '{printf("%s %s ",$1,$2)}END{printf("\n")}' >> dat_lc
    awk '{print $2,$4}' "$test_pred" \
      | ./perf -APR -ROC -ACC -t 0 -PRF 2>/dev/null \
      | awk '{printf("%s %s ",$1,$2)}END{printf("\n")}' >> dat_lc
  done
done
# Odd lines (train) are printed without a newline so the following even line
# (test) lands on the same row; column -t aligns the result.
awk 'NR%2==1{printf("%s ",$0)}NR%2==0{print $0}' dat_lc | column -t > dat
# Write the gnuplot script for the learning curve into tmp_plot_command.
# It reads 'dat' (one row per repetition/fold pair), using column 1 as the x
# value and columns 9 and 17 as the y values of the two point sets, titled
# "Train" and "Test" through their fitted curves.
# NOTE(review): columns 9/17 being the ROC values assumes ./perf prints its
# metrics in the order ACC PRF APR ROC, as in the sample outputs earlier in
# this README -- confirm if the perf flags or version change.
# Each point set is fitted with a curve of the form a - b/(x + c); the fit
# parameters are not given starting values in the script.
# The heredoc body contains no '$' or backquote, so the shell copies it
# verbatim.
cat <<EOF >tmp_plot_command
set terminal postscript eps color enhanced "Helvetica" 11
set grid
set xlabel "Training set size"
set ylabel "Area Under ROC Curve"
set out 'LearningCurveROC.ps'
ftr(x)=atr-btr/(x+ctr)
fts(x)=ats-bts/(x+cts)
fit ftr(x) 'dat' u 1:9 via atr,btr,ctr
fit fts(x) 'dat' u 1:17 via ats,bts,cts
plot 'dat' u 1:9 t "" w p lt 1, '' u 1:17 t "" w p lt 2, ftr(x) t "Train" w l lt 1 lw 2, fts(x) t "Test" w l lt 2 lw 2
EOF
# Render the script; the output goes to LearningCurveROC.ps (see 'set out').
gnuplot tmp_plot_command
# Open the rendered figure in the evince viewer.
evince LearningCurveROC.ps
#-------------------------------------------------------------------------------
#Embedding
#make increasingly permuted sequences
# The seed string is written to the file "seq". dat.seq (also echoed to the
# terminal by tee) then gets one line per swap count n = 2..40: the seed after
# n random two-character swaps, so later lines are more scrambled. Every line
# keeps the length and the character set of the seed.
# Swap number k always re-seeds the generator with srand(k+600), which makes
# the output repeatable for a given awk implementation.
echo "abcdefghilmnopqrstuvz" > seq
awk '
{
  for (n = 2; n <= 40; n++) {
    s = $0
    for (k = 1; k <= n; k++) {
      srand(k + 600)
      l = length(s)
      i = int(rand() * (l - 2) + 1)          # left position, in 1 .. l-2
      j = int(rand() * (l - i - 1) + i + 1)  # right position, in i+1 .. l-1
      # Exchange the characters at positions i and j. substr() counts from 1;
      # with a start of 0, gawk and mawk return one character fewer, which
      # used to drop the character before position i on every swap.
      s = substr(s, 1, i - 1) substr(s, j, 1) substr(s, i + 1, j - i - 1) substr(s, i, 1) substr(s, j + 1)
    }
    print s
  }
}' seq | tee dat.seq
# Run the tool with -a EMBED on the generated sequences. The steps below read
# dat.seq.embed and dat.seq.distortion afterwards.
./svmsgdnspdk -i dat.seq -f SEQUENCE -g DIRECTED -a EMBED -G 2 -N 3
#gnuplot data
dat=dat.seq
target=$dat.target

# value_span COLUMN FILE
# Print the absolute difference between the first and the last value of COLUMN
# after sorting that column with 'sort -g'.
value_span() {
  awk -v COL="$1" '{print $COL}' "$2" \
    | sort -g \
    | awk 'NR==1{first=$1} {last=$1} END{ if (first>last) d=first-last; else d=last-first; print d }'
}

# Label file: 1 for lines before the midpoint of $dat, -1 from there on.
awk -v DIM="$(wc -l < "$dat")" '{ if (NR < DIM/2) print 1; else print -1 }' "$dat" > "$target"

xrange=$(value_span 1 "$dat.embed")
yrange=$(value_span 2 "$dat.embed")

# Plot table: fields 1 and 2 of the pasted row, then (1 - field 3) / SCALE,
# then field 4. SCALE is S divided by the smaller of the two ranges.
paste "$dat.embed" "$dat.distortion" "$target" \
  | awk -v S=30 -v XRANGE="$xrange" -v YRANGE="$yrange" '
      BEGIN { SCALE = S / ((XRANGE < YRANGE) ? XRANGE : YRANGE) }
      { print $1, $2, (1 - $3) / SCALE, $4 }' > "$dat.plot"
# Write the gnuplot script PLOT for the embedding.
# The heredoc delimiter is unquoted: the shell expands $dat into the data file
# name, while every \$N is written out as the gnuplot column reference $N.
# The script draws four layers from '$dat.plot', all with x = column 1:
#   - circles (radius = column 3) for rows whose column 4 is -1, in orange
#   - circles (radius = column 3) for rows whose column 4 is 1, in blue
#   - points for the column 4 == -1 rows, in red
#   - points for the column 4 == 1 rows, with 'lc 0'
# The (cond ? \$2 : 1/0) form yields an undefined y value for rows of the
# other class, which leaves those rows out of the layer.
cat <<EOF >PLOT
set size ratio -1
plot '$dat.plot' u 1:(\$4==-1?\$2:1/0):3 t "" with circles lc rgb "orange" fs transparent solid 0.1, '' u 1:(\$4==1?\$2:1/0):3 t "" with circles lc rgb "blue" fs transparent solid 0.1, '' u 1:(\$4==-1?\$2:1/0) t "" w p pt 7 lc rgb 'red', '' u 1:(\$4==1?\$2:1/0) t "" w p pt 7 lc 0
pause (-1)
EOF
# Run the script; 'pause (-1)' keeps the plot window open until Return is
# pressed in the terminal.
gnuplot PLOT
#note: to plot with shade of color proportional to confidence
# Run the tool once with -a CONFIDENCE and once with -a EMBED; the loop below
# reads bursi.gspan.embed, bursi.gspan.conf and bursi.gspan.distortion.
./svmsgdnspdk -i bursi.gspan -t bursi.target -a CONFIDENCE -r 3 -d 8 -e 10 -c 3
./svmsgdnspdk -i bursi.gspan -a EMBED -r 3 -d 8
# Split the pasted rows by the value of field 3: rows with 1 go to datp, rows
# with -1 go to datn. Tabs become spaces and each file is sorted numerically
# on field 4.
for label in 1 -1; do
  case $label in
    1) out=datp ;;
    *) out=datn ;;
  esac
  paste bursi.gspan.embed bursi.gspan.conf bursi.gspan.distortion \
    | awk -v LABEL="$label" '$3==LABEL' \
    | tr '\t' ' ' \
    | sort -k4,4n > "$out"
done
# Write the gnuplot script PLOT for the confidence-shaded embedding.
# The heredoc delimiter is unquoted, so every \$N is written out as the
# gnuplot column reference $N.
# rgb(r,g,b) packs three components into one integer (r*65536 + g*256 + b),
# the value form used by 'lc rgb variable'.
# Both data files are drawn as circles at (column 1, column 2) with radius
# (1 - column 7)/s; 'datp' is colored blue and 'datn' red, with the color
# component set to column 4 * 255.
# NOTE(review): this presumably expects column 4 to lie in [0,1] -- confirm
# against the tool's confidence output.
# Both clauses use 'lc rgb variable'; the second one previously used the
# legacy spelling 'lt rgb variable'.
cat <<EOF >PLOT
set size ratio -1
rgb(r,g,b) = int(r)*65536 + int(g)*256 + int(b)
s=50
plot 'datp' u 1:2:((1-\$7)/s):(rgb(0,0,\$4*255)) t "" with circles lc rgb variable fs transparent solid .3 noborder, 'datn' u 1:2:((1-\$7)/s):(rgb(\$4*255,0,0)) t "" with circles lc rgb variable fs transparent solid .3 noborder
pause (-1)
EOF
# Run the script; 'pause (-1)' keeps the plot window open until Return is
# pressed in the terminal.
gnuplot PLOT