README 4.52 KB
#-------------------------------------------------------------------------------
#Train-test cycle
#Run the TRAIN action on the training graphs and targets, then the TEST action
#on the test graphs; both calls pass the same -r/-d values and -m model.
./svmsgdnspdk \
  -i bursi.train.gspan -t bursi.train.target \
  -r 3 -d 8 \
  -a TRAIN -m model
./svmsgdnspdk \
  -i bursi.test.gspan \
  -r 3 -d 8 \
  -a TEST -m model
#Paste the test targets next to the prediction file, hand fields 1 and 3 of
#each row to perf and print perf's "<name> <value>" lines on a single row.
#NOTE(review): field 3 is presumably the prediction score - confirm.
paste bursi.test.target bursi.test.gspan.prediction \
  | awk '{print $1,$3}' \
  | ./perf -APR -ROC -ACC -t 0 -PRF 2>/dev/null \
  | awk '{printf("%s %s ",$1,$2)}END{printf("\n")}'
#output: ACC 0.93504 PRF 0.93791 APR 0.97995 ROC 0.97822

#-------------------------------------------------------------------------------
#Cross validation cycle
./svmsgdnspdk \
  -i bursi.gspan -t bursi.target \
  -r 3 -d 8 \
  -a CROSS_VALIDATION
#Hand fields 2 and 4 of the cross-validation predictions to perf and print
#perf's "<name> <value>" lines on a single row.
awk '{print $2,$4}' bursi.gspan.cv_predictions \
  | ./perf -ACC -PRF -APR -ROC -t 0 2>/dev/null \
  | awk '{printf("%s %s ",$1,$2)}END{printf("\n")}'
#output: ACC 0.92506 PRF 0.93409 APR 0.97962 ROC 0.97660

#-------------------------------------------------------------------------------
#Parameter optimization
#Everything the tool prints on stdout is discarded.
./svmsgdnspdk \
  -i bursi.gspan -t bursi.target \
  -r 5 -d 10 -l 1e-10 -e 50 -c 5 \
  -a PARAMETERS_OPTIMIZATION \
  >/dev/null

#-------------------------------------------------------------------------------
#learning curve
#Run the LEARNING_CURVE action NUM_REP times (-R receives the repetition
#index, -p receives LC+1) and, for each of the LC folds, append two rows to
#dat_lc:
#  "<line count of the train predictions> <perf measures on train>"
#  "<perf measures on test>"
LC=10
NUM_REP=10
lcn=$((LC+1))

#Print the perf measures for predictions file $1 on a single row
#(fields 2 and 4 of the file are handed to ./perf).
lc_perf() {
  awk '{print $2,$4}' "$1" \
    | ./perf -APR -ROC -ACC -t 0 -PRF 2>/dev/null \
    | awk '{printf("%s %s ",$1,$2)}END{printf("\n")}'
}

#-f: dat_lc is only ever appended to below and is absent on a first run;
#the backslash bypasses any interactive alias for rm
\rm -f dat_lc
for r in $(seq 1 "$NUM_REP"); do
  #tee truncates log_lc, so it holds the output of the latest repetition only
  ./svmsgdnspdk -a LEARNING_CURVE -i bursi.gspan -t bursi.target -p "$lcn" -R "$r" | tee log_lc
  for i in $(seq 1 "$LC"); do
    train_pred=bursi.gspan.lc_predictions_train_fold_$i
    test_pred=bursi.gspan.lc_predictions_test_fold_$i
    {
      #printf instead of the non-portable "echo -n"
      printf '%s ' "$(wc -l < "$train_pred")"
      lc_perf "$train_pred"
      lc_perf "$test_pred"
    } >>dat_lc
  done
done
#Join every odd row of dat_lc with the even row that follows it, so each row
#of dat holds the line count, the train measures and then the test measures.
awk 'NR%2==1{printf("%s ",$0)}NR%2==0{print $0}' dat_lc | column -t > dat


#Write the gnuplot script for the learning curve. It plots columns 9 and 17 of
#'dat' against column 1 and fits each with a curve of the form a - b/(x+c),
#labelled "Train" and "Test". The here-document delimiter is quoted because
#the script contains nothing for the shell to expand.
cat >tmp_plot_command <<'EOF'
set terminal postscript eps color enhanced "Helvetica" 11
set grid
set xlabel "Training set size"
set ylabel "Area Under ROC Curve"
set out 'LearningCurveROC.ps'
ftr(x)=atr-btr/(x+ctr)
fts(x)=ats-bts/(x+cts)
fit ftr(x) 'dat' u 1:9  via atr,btr,ctr
fit fts(x) 'dat' u 1:17  via ats,bts,cts
plot 'dat' u 1:9 t "" w p lt 1, '' u 1:17 t "" w p lt 2, ftr(x) t "Train" w l lt 1 lw 2, fts(x) t "Test"  w l lt 2 lw 2
EOF
#Render LearningCurveROC.ps, then open it in the viewer
gnuplot tmp_plot_command
evince LearningCurveROC.ps
 
#-------------------------------------------------------------------------------
#Embedding
#make increasingly permuted sequences
#The base string is stored in the file "seq". dat.seq receives 39 rows: the
#base string after n swaps of two characters, for n = 2..40. The rows are also
#echoed on stdout by tee.
echo "abcdefghilmnopqrstuvz" > seq
awk '
  #Exchange the characters at positions i < j of s. The prefix starts at
  #index 1: awk strings are 1-based, and a start index of 0 is not portable
  #(gawk, for one, then returns one character less than asked for, which
  #would drop a character from the sequence at every swap).
  function swap(s, i, j) {
    return substr(s, 1, i-1) substr(s, j, 1) substr(s, i+1, j-i-1) substr(s, i, 1) substr(s, j+1)
  }
  {
    for (n = 2; n <= 40; n++) {
      s = $0
      for (k = 1; k <= n; k++) {
        #swap number k is always drawn from seed k+600, so the row for n is
        #the row for n-1 with one more swap applied
        srand(k + 600)
        l = length(s)
        i = int(rand()*(l-2) + 1)
        j = int(rand()*(l-i-1) + i + 1)
        s = swap(s, i, j)
      }
      print s
    }
  }
' seq | tee dat.seq
./svmsgdnspdk -i dat.seq -f SEQUENCE -g DIRECTED -a EMBED -G 2 -N 3
#gnuplot data
dat=dat.seq
target=$dat.target
#Label the first half of the rows 1 and the second half -1. The row count is
#quoted because some wc implementations pad it with blanks, which would
#otherwise split the -v assignment into two arguments.
awk -v DIM="$(wc -l < "$dat")" 'NR<DIM/2{print 1}NR>=DIM/2{print -1}' "$dat" > "$target"

#Print the spread (largest minus smallest value) of column $1 of file $2.
col_range() {
  awk -v c="$1" '{print $c}' "$2" \
    | sort -g \
    | awk 'NR==1{b=$1} {e=$1} END{if(b>e){print (b-e)}else {print (e-b)}}'
}
xrange=$(col_range 1 "$dat.embed")
yrange=$(col_range 2 "$dat.embed")

#Rows of $dat.plot: fields 1 and 2 of the pasted row, a circle radius and
#field 4. The radius is (1 - field 3) / SCALE with
#SCALE = S / min(XRANGE, YRANGE).
#NOTE(review): fields 3 and 4 are the distortion and the label only if the
#embed file has exactly two columns - confirm.
paste "$dat.embed" "$dat.distortion" "$target" \
  | awk -v S=30 -v XRANGE="$xrange" -v YRANGE="$yrange" 'BEGIN{if(XRANGE<YRANGE){SCALE=S/XRANGE}else{SCALE=S/YRANGE}} {  print $1,$2,(1-$3)/SCALE,$4}' > "$dat.plot"

#Rows with field 4 equal to -1 get orange circles and red points, rows with
#field 4 equal to 1 get blue circles and "lc 0" points. The here-document is
#unquoted so that $dat expands; a trailing backslash is removed together with
#its newline, so PLOT contains one single-line plot command.
cat >PLOT <<EOF
set size ratio -1
plot '$dat.plot' u 1:(\$4==-1?\$2:1/0):3 t "" with circles lc rgb "orange" fs transparent solid 0.1, \
'' u 1:(\$4==1?\$2:1/0):3 t "" with circles lc rgb "blue" fs transparent solid 0.1, \
'' u  1:(\$4==-1?\$2:1/0) t "" w p pt 7 lc rgb 'red', \
'' u  1:(\$4==1?\$2:1/0) t "" w p pt 7 lc 0
pause (-1)
EOF
gnuplot PLOT

#note: to plot with shade of color proportional to confidence
./svmsgdnspdk -i bursi.gspan -t bursi.target -a CONFIDENCE -r 3 -d 8 -e 10 -c 3
./svmsgdnspdk -i bursi.gspan -a EMBED -r 3 -d 8

#Paste the embed, conf and distortion files side by side, keep the rows that
#match the awk condition $1, turn tabs into spaces and sort numerically on
#field 4.
conf_rows() {
  paste bursi.gspan.embed bursi.gspan.conf bursi.gspan.distortion \
    | awk "$1" \
    | tr '\t' ' ' \
    | sort -k4,4n
}
#Rows whose field 3 is 1 go to datp, rows whose field 3 is -1 go to datn
conf_rows '$3==1' > datp
conf_rows '$3==-1' > datn

#Write the gnuplot script: circles at (field 1, field 2) with radius
#(1 - field 7)/s; rows of datp are coloured rgb(0,0,field 4 * 255) and rows
#of datn rgb(field 4 * 255,0,0). The here-document delimiter is quoted so
#that the gnuplot column references $4 and $7 reach PLOT verbatim.
cat >PLOT <<'EOF'
set size ratio -1
rgb(r,g,b) = int(r)*65536 + int(g)*256 + int(b)
s=50
plot 'datp' u 1:2:((1-$7)/s):(rgb(0,0,$4*255)) t "" with circles lc rgb variable fs transparent solid .3 noborder, 'datn' u 1:2:((1-$7)/s):(rgb($4*255,0,0)) t "" with circles lt rgb variable fs transparent solid .3 noborder
pause (-1)
EOF
gnuplot PLOT