Ludovic PLATON

Improve classifier to handle rejection

@@ -115,19 +115,19 @@ class SOM_basic(object):
         res = tf.exp(-gamma*0.5*tf.pow(dist,2.0))
         return res
 
-    def sim2units_neighbour(self,data,units=None):
-        if units is None:
-            units = self.units
-        dist = self.dist2units(data,units)
-#        max_dist = tf.reduce_max(tf.sqrt(tf.reduce_sum(tf.pow((tf.expand_dims(units,0) - tf.expand_dims(units,1)),2.0),axis=2)))
-#        gamma = 1.0 / (max_dist/np.sqrt(2.0*self.ulen))
-#        res = 1.0 / (dist+1.0)
-        gamma = 1.0
-        res = tf.exp(-gamma*tf.pow(dist,2.0))
-        bmus = tf.argmin(dist,1)
-        dist_bmu = self.dist_bmus_op(bmus)
-        neighbour = self.R(dist_bmu,tf.cast(self.learning_rate(self.it),tf.float64)*max(self.dim)/2.0)
-        return res*tf.transpose(neighbour)
+#    def sim2units_neighbour(self,data,units=None):
+#        if units is None:
+#            units = self.units
+#        dist = self.dist2units(data,units)
+##        max_dist = tf.reduce_max(tf.sqrt(tf.reduce_sum(tf.pow((tf.expand_dims(units,0) - tf.expand_dims(units,1)),2.0),axis=2)))
+##        gamma = 1.0 / (max_dist/np.sqrt(2.0*self.ulen))
+##        res = 1.0 / (dist+1.0)
+#        gamma = 1.0
+#        res = tf.exp(-gamma*tf.pow(dist,2.0))
+#        bmus = tf.argmin(dist,1)
+#        dist_bmu = self.dist_bmus_op(bmus)
+#        neighbour = self.R(dist_bmu,tf.cast(self.learning_rate(self.it),tf.float64)*max(self.dim)/2.0)
+#        return res*tf.transpose(neighbour)
 
     def dist2units(self,data,units=None):
         if units is None:
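The similarity computation kept in the context lines above (presumably the tail of SOM_basic.sim2units, which SLSOM still calls) turns data-to-unit distances into scores in (0, 1]. A minimal NumPy sketch of that mapping, assuming gamma = 1.0 as in the commented-out variant (the real gamma is computed earlier in the method and is not shown in this hunk):

```python
import numpy as np

def gaussian_similarity(dist, gamma=1.0):
    """Map data-to-unit distances to (0, 1] scores, as in sim2units."""
    return np.exp(-gamma * 0.5 * dist ** 2)

# toy usage: 2 samples x 3 SOM units; the closest unit scores highest
dist = np.array([[0.0, 1.0, 2.0],
                 [0.5, 1.5, 2.5]])
print(gaussian_similarity(dist))
```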
......
@@ -21,12 +21,14 @@ def init_SLSOM(path,som):
     return tmp
 
 class SLSOM(object):
-    def __init__(self,som,nb_label,loss_type='cross_entropy',verbose=True):
+    def __init__(self,som,nb_label,loss_type='cross_entropy',alpha0=1.0,alpha1=0.6,verbose=True):
         self.tf_object = som.tf_object
         self.ulen = som.ulen
         self.nb_label = nb_label
         self.som = som
         self.loss_type = loss_type
+        self.alpha0 = alpha0
+        self.alpha1 = alpha1
         self.verbose = verbose
         with self.tf_object.graph.as_default():
             self.W = tf.Variable(tf.random_normal([self.ulen,self.nb_label],dtype=tf.float64))
@@ -41,12 +43,11 @@ class SLSOM(object):
             self.update_it = self.it.assign_add(1)
             self.data = self.som.sim2units(self.som.data2pred)
 
-
-            self.datapred = tf.one_hot(
-                self.som.bmu_finder(self.som.data2pred,self.som.units),
-                self.som.ulen,
-                dtype=tf.float64
-            )
+#            self.datapred = tf.one_hot(
+#                self.som.bmu_finder(self.som.data2pred,self.som.units),
+#                self.som.ulen,
+#                dtype=tf.float64
+#            )
 
             self.data_size = tf.placeholder(tf.int32,shape=[1])
             self.lambda_penality = tf.placeholder(tf.float64,shape=[1])
@@ -58,7 +59,10 @@ class SLSOM(object):
             self.update_it_som = self.som.it.assign_add(1)
 
     def learning_rate(self,it):
-        return 1.0-tf.cast(self.it,tf.float64)/(tf.cast(self.it_max,tf.float64))
+        #tmp = 1.0/(tf.cast(self.it,tf.float64)+1.0)
+        #return tf.Print(tmp,[tmp],"IT : ")
+        tmp = 1.0-tf.cast(self.it,tf.float64)/(tf.cast(self.it_max,tf.float64))
+        return tmp
 
     def save(self,path):
         W = self.get_W()
@@ -82,7 +86,7 @@ class SLSOM(object):
         dist = self.som.dist2units(self.som.data2pred)
         bmus = tf.argmin(dist,1)
         dist_bmu = self.som.dist_bmus_op(bmus)
-        neighbour = self.som.R(dist_bmu,tf.cast(self.learning_rate(self.it),tf.float64)*max(self.som.dim)/2.0)
+        neighbour = self.som.R(dist_bmu,(self.alpha1 + (self.alpha0 - self.alpha1)*tf.cast(self.learning_rate(self.it),tf.float64))*max(self.som.dim))
         x = x*tf.transpose(neighbour)
 
         y = tf.matmul(
@@ -90,28 +94,36 @@ class SLSOM(object):
             self.W
         ) + self.biases
 
+        if self.loss_type == 'cross_entropy':
+            self.loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=self.labels))
+        else :
             y_ = tf.one_hot(
                 self.labels,
                 self.nb_label,
                 dtype=tf.float64
             )
-        if self.loss_type == 'cross_entropy':
-            loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=self.labels))
-        else :
-            loss = 0.5*tf.reduce_mean(tf.pow(tf.nn.softmax(y) - y_,2.0))
+            self.loss = 0.5*tf.reduce_mean(tf.pow(tf.nn.softmax(y) - y_,2.0))
         regularizer = tf.contrib.layers.l2_regularizer(self.lambda_penality)
         penality = regularizer(self.W)
 
-        optimizer = tf.train.GradientDescentOptimizer(0.3*self.learning_rate(self.it))
-        optimizer2 = tf.train.GradientDescentOptimizer(self.learning_rate(self.it))
 
-        loss2 = tf.add(loss,penality)
-        applied = optimizer.minimize(loss2,var_list=[self.W,self.biases])
-        applied2 = optimizer2.minimize(loss2,var_list=[self.som.units])
-        return (applied,applied2)
+        optimizer = tf.train.GradientDescentOptimizer(0.1*self.learning_rate(self.it))
+        #optimizer2 = tf.train.GradientDescentOptimizer(0.3*self.learning_rate(self.it))
+
+        loss2 = tf.add(self.loss,penality)
+        applied = optimizer.minimize(loss2,var_list=[self.W,self.biases,self.som.units])
+        #applied2 = optimizer2.minimize(loss2,var_list=[])
+        #return (applied,applied2)
+        return applied
 
     def proba_class_op(self):
-        x = self.datapred
+#        x = self.datapred
+        x = self.data
+        dist = self.som.dist2units(self.som.data2pred)
+        bmus = tf.argmin(dist,1)
+        dist_bmu = self.som.dist_bmus_op(bmus)
+        neighbour = self.som.R(dist_bmu,self.alpha1*max(self.som.dim)/2.0)
+        x = x*tf.transpose(neighbour)
         y = tf.matmul(
             x,
             self.W
@@ -127,24 +139,39 @@ class SLSOM(object):
     def get_biases(self):
         return self.tf_object.run(self.biases)
 
-    def train(self,data,labels,max_it=2000,batch_size=10, penality=0.001):
-        it = np.array([max_it])
+    def train(self,data,labels,nb_it=2000,batch_size=10, penality=0.001):
         nb_data = data.shape[0]
         pen = np.array([penality])
         data2 = data
-        for i in range(max_it):
+        loss_old = 0.0
+        run = True
+        it = 0
+        while run:
             if self.verbose:
-                print("It SLSOM: "+str(i))
-            idx = np.random.randint(nb_data,size=batch_size)
-            self.tf_object.run(self.train_op,
+                print("It SLSOM: "+str(it))
+#            idx = np.random.randint(nb_data,size=batch_size)
+#            _, loss = self.tf_object.run([self.train_op,self.loss],
+#                feed_dict={
+#                    self.som.data2pred:data2[idx,:],
+#                    self.data_size:np.array([batch_size]),
+#                    self.labels:labels[idx],
+#                    self.lambda_penality:pen,
+#                    self.it_max:nb_it
+#                })
+            _, loss = self.tf_object.run([self.train_op,self.loss],
                 feed_dict={
-                    self.som.data2pred:data2[idx,:],
-                    self.data_size:np.array([batch_size]),
-                    self.labels:labels[idx],
+                    self.som.data2pred:data,
+                    self.labels:labels,
                     self.lambda_penality:pen,
-                    self.it_max:max_it
+                    self.it_max:nb_it
                 })
-            self.tf_object.run(self.update_it)
+            delta_loss = np.absolute(loss - loss_old)
+            if self.verbose:
+                print("Diff loss: "+str(delta_loss))
+            if delta_loss < np.power(10.0,-6.0) or not it < nb_it:
+                run = False
+            it = self.tf_object.run(self.update_it)
+            loss_old = loss
 
     def predict(self,data):
        pred,proba = self.tf_object.run([self.prediction,self.proba_data_op],
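Two behavioural changes above are easy to miss. First, the neighbourhood radius used to weight the unit activations in train_op no longer decays to zero with the learning rate; it is interpolated between alpha0*max(dim) and alpha1*max(dim), while proba_class_op applies a fixed radius of alpha1*max(dim)/2.0 at prediction time. Second, train now feeds the whole dataset at each step and stops once the absolute loss change drops below 1e-6 or nb_it is reached, instead of running a fixed number of mini-batch steps. A small self-contained sketch of the radius schedule, with map size and iteration count chosen only for illustration:

```python
import numpy as np

def training_radius(it, it_max, dim, alpha0=1.0, alpha1=0.6):
    """Neighbourhood radius used in train_op: linear interpolation from
    alpha0*max(dim) at the start of training down to alpha1*max(dim)."""
    lr = 1.0 - it / float(it_max)   # mirrors SLSOM.learning_rate
    return (alpha1 + (alpha0 - alpha1) * lr) * max(dim)

# illustrative values for a 4x4 map trained for 2000 iterations
print(training_radius(0, 2000, (4, 4)))      # 4.0 at the first iteration
print(training_radius(2000, 2000, (4, 4)))   # 2.4 = alpha1 * max(dim)
```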
......
@@ -12,10 +12,10 @@ class Performance:
     def Compute_stat(self):
         positive = (self.arr[:,0] == 1)
         negative = (self.arr[:,0] == 0)
-        self.TP = float(np.sum(self.arr[positive,1]))
+        self.TP = float(np.sum(self.arr[positive,1]==1))
         self.TN = float(np.sum(self.arr[negative,1]==0))
         self.FP = float(np.sum(self.arr[positive,1]==0))
-        self.FN = float(np.sum(self.arr[negative,1]))
+        self.FN = float(np.sum(self.arr[negative,1]==1))
 
     #More advanced measure
     def Sensitivity(self):
@@ -24,9 +24,6 @@ class Performance:
     def Specificity(self):
         return self.TN / (self.TN + self.FP)
 
-    def Precision(self):
-        return self.TP /(self.TP +self.FP)
-
     # Advanced measure
     def Accuracy(self):
         return (self.TP + self.TN)/(self.TP+self.FP+self.FN+self.TN)
@@ -43,7 +40,6 @@ class Performance:
         return {
             "Sensitivity" : self.Sensitivity(),
             "Specificity" : self.Specificity(),
-            "Precision" : self.Precision(),
             "Accuracy" : self.Accuracy(),
             "F1" : self.F1(),
             "MCC" : self.MCC(),
......
@@ -9,6 +9,7 @@ import os
 import pandas as pd
 from plotnine import *
 from functools import partial
+from concurrent.futures import ThreadPoolExecutor
 
 '''
 Files checking
@@ -27,16 +28,23 @@ Import data
 '''
 
 def import_ncRNA(path):
-    file_order = ["CP.txt","ORF.txt","KMER3.txt","KMER6.txt"]
-    df = pd.read_csv(path+file_order[0],sep=",",header=None)
+    file_order = ["CB.txt","ORF.txt","KMER3.txt"]#,"KMER6.txt"]
+    #df = pd.read_csv(path+file_order[0],sep=",",header=None)
+    df_raw = []
+    with ThreadPoolExecutor(max_workers=4) as tp:
+        for x in file_order:
+            df_raw.append(tp.submit(pd.read_csv,path+x,sep=",",header=None))
+    df = df_raw[0].result()
     for i in range(1,len(file_order)):
-        tmp = pd.read_csv(path+file_order[i],sep=",",header=None)
+        #tmp = pd.read_csv(path+file_order[i],sep=",",header=None)
+        tmp = df_raw[i].result()
         df = df.merge(tmp,on=0)
+    df = df.fillna(0)
     data = df.iloc[:,1:].values
     data_names = df.iloc[:,0].values
     # Modification
-    data[:,np.arange(4)] = 0.25*data[:,np.arange(4)]
-    data[:,4] = np.exp(-data[:,4]/200)
+#    data[:,np.arange(4)] = 0.25*data[:,np.arange(4)]
+#    data[:,4] = np.exp(-data[:,4]/200)
     return (data,data_names)
 
 '''
@@ -70,8 +78,8 @@ def plot_weights(units,m,n,name):
     features = [
         np.arange(4),
         np.arange(4,6),
-        np.arange(6,6+4**3),
-        np.arange(6+4**3, units.shape[1])
+#        np.arange(6,6+4**3),
+#        np.arange(6+4**3, units.shape[1])
     ]
     for l,f in enumerate(features):
         dico = []
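import_ncRNA now submits every pd.read_csv call to a ThreadPoolExecutor and only later collects the futures, so the feature tables are read concurrently before being merged on their first column and NaN-filled. A self-contained sketch of the same pattern, with the path and file names purely illustrative:

```python
from concurrent.futures import ThreadPoolExecutor
import pandas as pd

def read_feature_tables(path, file_order):
    """Read several header-less CSV tables concurrently, then merge on column 0."""
    with ThreadPoolExecutor(max_workers=4) as tp:
        futures = [tp.submit(pd.read_csv, path + name, sep=",", header=None)
                   for name in file_order]
    # leaving the with-block waits for every submitted read to finish
    df = futures[0].result()
    for fut in futures[1:]:
        df = df.merge(fut.result(), on=0)
    return df.fillna(0)

# hypothetical usage mirroring the file list used in the commit
# df = read_feature_tables("features/coding/", ["CB.txt", "ORF.txt", "KMER3.txt"])
```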
......
1 """ Train IRSOM model on ncRNA. 1 """ Train IRSOM model on ncRNA.
2 2
3 Usage: 3 Usage:
4 - train.py --featurer=<path> --output=<path> (-c <coding>)... (-n <noncoding>)... [--dim0=<value> --dim1=<value> --batch_size=<value> --penality=<value> --keep_features] 4 + train.py --featurer=<path> --output=<path> (-c <coding>)... (-n <noncoding>)... [--dim0=<value> --dim1=<value> --batch_size=<value> --penality=<value> --keep_features --verbose]
5 train.py (-h | --help) 5 train.py (-h | --help)
6 train.py --version 6 train.py --version
7 7
...@@ -12,9 +12,10 @@ Options: ...@@ -12,9 +12,10 @@ Options:
12 -n <noncoding> Path for one or multiple fasta file(s) containing noncoding transcript. 12 -n <noncoding> Path for one or multiple fasta file(s) containing noncoding transcript.
13 --dim0=<value> SOM dimension 0 (by default at 3) 13 --dim0=<value> SOM dimension 0 (by default at 3)
14 --dim1=<value> SOM dimension 1 (by default at 3). 14 --dim1=<value> SOM dimension 1 (by default at 3).
15 - --batch_size=<value> the size of the batch given at each iteration (by default at 10). 15 + --batch_size=<value> the size of the batch given at each iteration (by default at 100).
16 --penality=<value> Coefficient of the regularization term (by default at 0.001). 16 --penality=<value> Coefficient of the regularization term (by default at 0.001).
17 --keep_features Keep the features computed in the "output" folder. 17 --keep_features Keep the features computed in the "output" folder.
18 + --verbose Produce more output
18 19
19 """ 20 """
20 from docopt import docopt 21 from docopt import docopt
@@ -31,10 +32,11 @@ def main():
     arguments = docopt(__doc__,version="train IRSOM 1.0")
     output_path = os.path.expanduser(os.path.expandvars(arguments["--output"]))
     featurer_path = os.path.expanduser(os.path.expandvars(arguments["--featurer"]))
-    map_size_m = int(arguments["--dim0"]) if not arguments["--dim0"] is None else 3
-    map_size_n = int(arguments["--dim1"]) if not arguments["--dim0"] is None else 3
-    batch_size = float(arguments["--batch_size"]) if not arguments["--batch_size"] is None else 10
+    map_size_m = int(arguments["--dim0"]) if not arguments["--dim0"] is None else 4
+    map_size_n = int(arguments["--dim1"]) if not arguments["--dim1"] is None else 4
+    batch_size = float(arguments["--batch_size"]) if not arguments["--batch_size"] is None else 1000
     penality = float(arguments["--penality"]) if not arguments["--penality"] is None else 0.001
+    verbose = arguments["--verbose"]
 
     #Compute features
     path_feature_root = [output_path+"features/coding", output_path+"features/noncoding"]
@@ -76,12 +78,12 @@ def main():
 
     data = np.concatenate(data_coding_list+data_noncoding_list,axis=0)
     label = np.repeat([0,1],[nb_coding,nb_noncoding])
-    som = SOM(m=map_size_m,n=map_size_n,unit_width=data.shape[1],verbose=False)
+    som = SOM(m=map_size_m,n=map_size_n,unit_width=data.shape[1],verbose=verbose)
 
-    ssom = SLSOM(som,2,verbose=False)
+    ssom = SLSOM(som,2,verbose=verbose)
     ssom.tf_object.initialize()
 
-    ssom.train(data,label,data.shape[0],batch_size,penality)
+    ssom.train(data,label,penality=penality)
     print("SLSOM learned")
 
     check_dir(output_path+"SOM/")
@@ -90,6 +92,7 @@ def main():
     ssom.save(output_path+"SLSOM/")
 
     y,p = ssom.predict(data)
+    np.savetxt(output_path+"proba.txt",np.array(p))
     rep,_ = som.repartition_map(data,label)
     plot_repartition(rep,map_size_m,map_size_n,output_path+"plot_repartition")
     plot_density(label,p,output_path+"plot_density")
......