Ludovic PLATON

Improve the classifier to handle rejection

@@ -115,19 +115,19 @@ class SOM_basic(object):
res = tf.exp(-gamma*0.5*tf.pow(dist,2.0))
return res
def sim2units_neighbour(self,data,units=None):
if units is None:
units = self.units
dist = self.dist2units(data,units)
# max_dist = tf.reduce_max(tf.sqrt(tf.reduce_sum(tf.pow((tf.expand_dims(units,0) - tf.expand_dims(units,1)),2.0),axis=2)))
# gamma = 1.0 / (max_dist/np.sqrt(2.0*self.ulen))
# res = 1.0 / (dist+1.0)
gamma = 1.0
res = tf.exp(-gamma*tf.pow(dist,2.0))
bmus = tf.argmin(dist,1)
dist_bmu = self.dist_bmus_op(bmus)
neighbour = self.R(dist_bmu,tf.cast(self.learning_rate(self.it),tf.float64)*max(self.dim)/2.0)
return res*tf.transpose(neighbour)
# def sim2units_neighbour(self,data,units=None):
# if units is None:
# units = self.units
# dist = self.dist2units(data,units)
## max_dist = tf.reduce_max(tf.sqrt(tf.reduce_sum(tf.pow((tf.expand_dims(units,0) - tf.expand_dims(units,1)),2.0),axis=2)))
## gamma = 1.0 / (max_dist/np.sqrt(2.0*self.ulen))
## res = 1.0 / (dist+1.0)
# gamma = 1.0
# res = tf.exp(-gamma*tf.pow(dist,2.0))
# bmus = tf.argmin(dist,1)
# dist_bmu = self.dist_bmus_op(bmus)
# neighbour = self.R(dist_bmu,tf.cast(self.learning_rate(self.it),tf.float64)*max(self.dim)/2.0)
# return res*tf.transpose(neighbour)
def dist2units(self,data,units=None):
if units is None:
......
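For reference, a minimal NumPy sketch of what sim2units_neighbour returns: a Gaussian similarity between each sample and each map unit, weighted by a neighbourhood kernel centred on the sample's best matching unit (BMU). The Gaussian form of R, the grid-coordinate argument and the fixed radius are assumptions made for the sketch, not the project's API.

import numpy as np

def sim2units_neighbour_sketch(data, units, unit_coords, radius, gamma=1.0):
    # data: (n, d) samples, units: (u, d) prototypes, unit_coords: (u, 2) grid positions
    dist = np.linalg.norm(data[:, None, :] - units[None, :, :], axis=2)   # (n, u) feature-space distances
    sim = np.exp(-gamma * dist**2)                                        # Gaussian similarity, as in the active version
    bmus = np.argmin(dist, axis=1)                                        # BMU index of each sample
    grid_dist = np.linalg.norm(unit_coords[bmus][:, None, :] - unit_coords[None, :, :], axis=2)
    neighbour = np.exp(-grid_dist**2 / (2.0 * radius**2))                 # assumed Gaussian form of R
    return sim * neighbour                                                # (n, u) weighted similarities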
@@ -21,12 +21,14 @@ def init_SLSOM(path,som):
return tmp
class SLSOM(object):
def __init__(self,som,nb_label,loss_type='cross_entropy',verbose=True):
def __init__(self,som,nb_label,loss_type='cross_entropy',alpha0 = 1.0, alpha1 = 0.6,verbose=True):
self.tf_object = som.tf_object
self.ulen = som.ulen
self.nb_label = nb_label
self.som = som
self.loss_type = loss_type
self.alpha0 = alpha0
self.alpha1 = alpha1
self.verbose = verbose
with self.tf_object.graph.as_default():
self.W = tf.Variable(tf.random_normal([self.ulen,self.nb_label],dtype=tf.float64))
@@ -40,13 +42,12 @@ class SLSOM(object):
self.it = tf.Variable(0,dtype=tf.int32)
self.update_it = self.it.assign_add(1)
self.data = self.som.sim2units(self.som.data2pred)
self.datapred = tf.one_hot(
self.som.bmu_finder(self.som.data2pred,self.som.units),
self.som.ulen,
dtype=tf.float64
)
# self.datapred = tf.one_hot(
# self.som.bmu_finder(self.som.data2pred,self.som.units),
# self.som.ulen,
# dtype=tf.float64
# )
self.data_size = tf.placeholder(tf.int32,shape=[1])
self.lambda_penality = tf.placeholder(tf.float64,shape=[1])
@@ -58,7 +59,10 @@ class SLSOM(object):
self.update_it_som = self.som.it.assign_add(1)
def learning_rate(self,it):
return 1.0-tf.cast(self.it,tf.float64)/(tf.cast(self.it_max,tf.float64))
#tmp = 1.0/(tf.cast(self.it,tf.float64)+1.0)
#return tf.Print(tmp,[tmp],"IT : ")
tmp = 1.0-tf.cast(self.it,tf.float64)/(tf.cast(self.it_max,tf.float64))
return tmp
def save(self,path):
W = self.get_W()
@@ -82,36 +86,44 @@ class SLSOM(object):
dist = self.som.dist2units(self.som.data2pred)
bmus = tf.argmin(dist,1)
dist_bmu = self.som.dist_bmus_op(bmus)
neighbour = self.som.R(dist_bmu,tf.cast(self.learning_rate(self.it),tf.float64)*max(self.som.dim)/2.0)
neighbour = self.som.R(dist_bmu,(self.alpha1 + (self.alpha0 - self.alpha1)*tf.cast(self.learning_rate(self.it),tf.float64))*max(self.som.dim))
x = x*tf.transpose(neighbour)
y = tf.matmul(
x,
self.W
) + self.biases
y_ = tf.one_hot(
if self.loss_type == 'cross_entropy':
self.loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=self.labels))
else :
y_ = tf.one_hot(
self.labels,
self.nb_label,
dtype=tf.float64
)
if self.loss_type == 'cross_entropy':
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=self.labels))
else :
loss = 0.5*tf.reduce_mean(tf.pow(tf.nn.softmax(y) - y_,2.0))
self.loss = 0.5*tf.reduce_mean(tf.pow(tf.nn.softmax(y) - y_,2.0))
regularizer = tf.contrib.layers.l2_regularizer(self.lambda_penality)
penality = regularizer(self.W)
optimizer = tf.train.GradientDescentOptimizer(0.3*self.learning_rate(self.it))
optimizer2 = tf.train.GradientDescentOptimizer(self.learning_rate(self.it))
loss2 = tf.add(loss,penality)
applied = optimizer.minimize(loss2,var_list=[self.W,self.biases])
applied2 = optimizer2.minimize(loss2,var_list=[self.som.units])
return (applied,applied2)
optimizer = tf.train.GradientDescentOptimizer(0.1*self.learning_rate(self.it))
#optimizer2 = tf.train.GradientDescentOptimizer(0.3*self.learning_rate(self.it))
loss2 = tf.add(self.loss,penality)
applied = optimizer.minimize(loss2,var_list=[self.W,self.biases,self.som.units])
#applied2 = optimizer2.minimize(loss2,var_list=[])
#return (applied,applied2)
return applied
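# Standalone sketch (not part of the class above): how the neighbourhood radius
# passed to R() is annealed. learning_rate(it) decays linearly from 1 to 0, so the
# radius shrinks from alpha0*max(dim) at the first iteration down to alpha1*max(dim)
# at the last one. The numeric values below are illustrative defaults.
def annealed_radius(it, it_max, alpha0=1.0, alpha1=0.6, max_dim=4):
    lr = 1.0 - it / float(it_max)                  # same linear schedule as learning_rate()
    return (alpha1 + (alpha0 - alpha1) * lr) * max_dim
# annealed_radius(0, 2000) == 4.0 and annealed_radius(2000, 2000) == 2.4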
def proba_class_op(self):
x = self.datapred
# x = self.datapred
x = self.data
dist = self.som.dist2units(self.som.data2pred)
bmus = tf.argmin(dist,1)
dist_bmu = self.som.dist_bmus_op(bmus)
neighbour = self.som.R(dist_bmu,self.alpha1*max(self.som.dim)/2.0)
x = x*tf.transpose(neighbour)
y = tf.matmul(
x,
self.W
@@ -127,24 +139,39 @@ class SLSOM(object):
def get_biases(self):
return self.tf_object.run(self.biases)
def train(self,data,labels,max_it=2000,batch_size=10, penality=0.001):
it = np.array([max_it])
def train(self,data,labels,nb_it=2000,batch_size=10, penality=0.001):
nb_data = data.shape[0]
pen = np.array([penality])
data2 = data
for i in range(max_it):
loss_old = 0.0
run = True
it = 0
while run:
if self.verbose:
print("It SLSOM: "+str(i))
idx = np.random.randint(nb_data,size=batch_size)
self.tf_object.run(self.train_op,
print("It SLSOM: "+str(it))
# idx = np.random.randint(nb_data,size=batch_size)
# _, loss = self.tf_object.run([self.train_op,self.loss],
# feed_dict={
# self.som.data2pred:data2[idx,:],
# self.data_size:np.array([batch_size]),
# self.labels:labels[idx],
# self.lambda_penality:pen,
# self.it_max:nb_it
# })
_, loss = self.tf_object.run([self.train_op,self.loss],
feed_dict={
self.som.data2pred:data2[idx,:],
self.data_size:np.array([batch_size]),
self.labels:labels[idx],
self.som.data2pred:data,
self.labels:labels,
self.lambda_penality:pen,
self.it_max:max_it
self.it_max:nb_it
})
self.tf_object.run(self.update_it)
delta_loss = np.absolute(loss - loss_old)
if self.verbose:
print("Diff loss: "+str(delta_loss))
if delta_loss < np.power(10.0,-6.0) or not it < nb_it:
run = False
it = self.tf_object.run(self.update_it)
loss_old = loss
def predict(self,data):
pred,proba = self.tf_object.run([self.prediction,self.proba_data_op],
......
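The rewritten train() above replaces the fixed number of iterations by a convergence test on the loss. In isolation, the stopping rule is the following (train_step is a placeholder for one optimisation step returning the current loss):

import numpy as np

def run_until_converged(train_step, nb_it=2000, tol=1e-6):
    loss_old, it, run = 0.0, 0, True
    while run:
        loss = train_step(it)                        # one full-batch optimisation step
        delta_loss = np.absolute(loss - loss_old)    # change of the loss between steps
        if delta_loss < tol or not it < nb_it:       # converged or iteration budget exhausted
            run = False
        it += 1
        loss_old = loss
    return it, loss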
@@ -12,10 +12,10 @@ class Performance:
def Compute_stat(self):
positive = (self.arr[:,0] == 1)
negative = (self.arr[:,0] == 0)
self.TP = float(np.sum(self.arr[positive,1]))
self.TP = float(np.sum(self.arr[positive,1]==1))
self.TN = float(np.sum(self.arr[negative,1]==0))
self.FP = float(np.sum(self.arr[positive,1]==0))
self.FN = float(np.sum(self.arr[negative,1]))
self.FN = float(np.sum(self.arr[negative,1]==1))
#More advanced measure
def Sensitivity(self):
@@ -24,9 +24,6 @@ class Performance:
def Specificity(self):
return self.TN / (self.TN + self.FP)
def Precision(self):
return self.TP /(self.TP +self.FP)
# Advanced measure
def Accuracy(self):
return (self.TP + self.TN)/(self.TP+self.FP+self.FN+self.TN)
@@ -43,7 +40,6 @@ class Performance:
return {
"Sensitivity" : self.Sensitivity(),
"Specificity" : self.Specificity(),
"Precision" : self.Precision(),
"Accuracy" : self.Accuracy(),
"F1" : self.F1(),
"MCC" : self.MCC(),
......
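For reference, the F1 and MCC entries returned by stat() follow the usual definitions in terms of the counts computed by Compute_stat; a minimal sketch using the standard formulas (not the project's exact code):

import numpy as np

def f1(TP, FP, FN):
    return 2.0 * TP / (2.0 * TP + FP + FN)

def mcc(TP, TN, FP, FN):
    denom = np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
    return (TP * TN - FP * FN) / denom if denom > 0 else 0.0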
@@ -9,6 +9,7 @@ import os
import pandas as pd
from plotnine import *
from functools import partial
from concurrent.futures import ThreadPoolExecutor
'''
Files checking
@@ -27,16 +28,23 @@ Import data
'''
def import_ncRNA(path):
file_order = ["CP.txt","ORF.txt","KMER3.txt","KMER6.txt"]
df = pd.read_csv(path+file_order[0],sep=",",header=None)
for i in range(1,len(file_order)):
tmp = pd.read_csv(path+file_order[i],sep=",",header=None)
df = df.merge(tmp,on=0)
file_order = ["CB.txt","ORF.txt","KMER3.txt"]#,"KMER6.txt"]
#df = pd.read_csv(path+file_order[0],sep=",",header=None)
df_raw = []
with ThreadPoolExecutor(max_workers=4) as tp:
for x in file_order:
df_raw.append(tp.submit(pd.read_csv,path+x,sep=",",header=None))
df = df_raw[0].result()
for i in range(1,len(file_order)):
#tmp = pd.read_csv(path+file_order[i],sep=",",header=None)
tmp = df_raw[i].result()
df = df.merge(tmp,on=0)
df = df.fillna(0)
data = df.iloc[:,1:].values
data_names = df.iloc[:,0].values
# Modification
data[:,np.arange(4)] = 0.25*data[:,np.arange(4)]
data[:,4] = np.exp(-data[:,4]/200)
# data[:,np.arange(4)] = 0.25*data[:,np.arange(4)]
# data[:,4] = np.exp(-data[:,4]/200)
return (data,data_names)
'''
@@ -70,8 +78,8 @@ def plot_weights(units,m,n,name):
features = [
np.arange(4),
np.arange(4,6),
np.arange(6,6+4**3),
np.arange(6+4**3, units.shape[1])
# np.arange(6,6+4**3),
# np.arange(6+4**3, units.shape[1])
]
for l,f in enumerate(features):
dico = []
......
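The import_ncRNA change above loads the feature files concurrently. The pattern it relies on is that futures are stored in submission order, so calling result() in a loop preserves file_order no matter which read finishes first; a standalone sketch (file names are placeholders):

from concurrent.futures import ThreadPoolExecutor
import pandas as pd

def read_all(path, names):
    with ThreadPoolExecutor(max_workers=4) as tp:
        futures = [tp.submit(pd.read_csv, path + n, sep=",", header=None) for n in names]
    # leaving the with-block waits for every read; result() only retrieves the frames
    return [f.result() for f in futures]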
""" Train IRSOM model on ncRNA.
Usage:
train.py --featurer=<path> --output=<path> (-c <coding>)... (-n <noncoding>)... [--dim0=<value> --dim1=<value> --batch_size=<value> --penality=<value> --keep_features]
train.py --featurer=<path> --output=<path> (-c <coding>)... (-n <noncoding>)... [--dim0=<value> --dim1=<value> --batch_size=<value> --penality=<value> --keep_features --verbose]
train.py (-h | --help)
train.py --version
@@ -12,9 +12,10 @@ Options:
-n <noncoding> Path for one or multiple fasta file(s) containing noncoding transcripts.
--dim0=<value> SOM dimension 0 (by default at 3)
--dim1=<value> SOM dimension 1 (by default at 3).
--batch_size=<value> the size of the batch given at each iteration (by default at 10).
--batch_size=<value> the size of the batch given at each iteration (by default at 100).
--penality=<value> Coefficient of the regularization term (by default at 0.001).
--keep_features Keep the features computed in the "output" folder.
--verbose Produce more output.
"""
from docopt import docopt
@@ -31,10 +32,11 @@ def main():
arguments = docopt(__doc__,version="train IRSOM 1.0")
output_path = os.path.expanduser(os.path.expandvars(arguments["--output"]))
featurer_path = os.path.expanduser(os.path.expandvars(arguments["--featurer"]))
map_size_m = int(arguments["--dim0"]) if not arguments["--dim0"] is None else 3
map_size_n = int(arguments["--dim1"]) if not arguments["--dim1"] is None else 3
batch_size = float(arguments["--batch_size"]) if not arguments["--batch_size"] is None else 10
map_size_m = int(arguments["--dim0"]) if not arguments["--dim0"] is None else 4
map_size_n = int(arguments["--dim1"]) if not arguments["--dim1"] is None else 4
batch_size = float(arguments["--batch_size"]) if not arguments["--batch_size"] is None else 1000
penality = float(arguments["--penality"]) if not arguments["--penality"] is None else 0.001
verbose = arguments["--verbose"]
#Compute features
path_feature_root = [output_path+"features/coding", output_path+"features/noncoding"]
@@ -76,12 +78,12 @@ def main():
data = np.concatenate(data_coding_list+data_noncoding_list,axis=0)
label = np.repeat([0,1],[nb_coding,nb_noncoding])
som = SOM(m=map_size_m,n=map_size_n,unit_width=data.shape[1],verbose=False)
som = SOM(m=map_size_m,n=map_size_n,unit_width=data.shape[1],verbose=verbose)
ssom = SLSOM(som,2,verbose=False)
ssom = SLSOM(som,2,verbose=verbose)
ssom.tf_object.initialize()
ssom.train(data,label,data.shape[0],batch_size,penality)
ssom.train(data,label,penality = penality)
print("SLSOM learned")
check_dir(output_path+"SOM/")
@@ -90,6 +92,7 @@ def main():
ssom.save(output_path+"SLSOM/")
y,p = ssom.predict(data)
np.savetxt(output_path+"proba.txt",np.array(p))
rep,_ = som.repartition_map(data,label)
plot_repartition(rep,map_size_m,map_size_n,output_path+"plot_repartition")
plot_density(label,p,output_path+"plot_density")
......