train.py
4.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
""" Train IRSOM model on ncRNA.
Usage:
train.py --featurer=<path> --output=<path> (-c <coding>)... (-n <noncoding>)... [--dim0=<value> --dim1=<value> --batch_size=<value> --penality=<value> --alpha=<value> --keep_features --verbose]
train.py (-h | --help)
train.py --version
Options:
--featurer=<path> Path to the featurer executable.
--output=<path> Path to store the model.
-c <coding> Path for one or multiple fasta file(s) containing coding transcript.
-n <noncoding> Path for one or multiple fasta file(s) containing noncoding transcript.
--dim0=<value> SOM dimension 0 (by default at 10).
--dim1=<value> SOM dimension 1 (by default at 10).
--batch_size=<value> the size of the batch given at each iteration (by default at 100).
--penality=<value> Coefficient of the regularization term (by default at 0.001).
--alpha=<value> Constant in the activation function (by default at 0.5).
--keep_features Keep the features computed in the "output" folder.
--verbose Produce more output
"""
from docopt import docopt
import subprocess
import shutil
import os
import numpy as np
from SLSOM.util import *
from SLSOM.SSOM import *
from SLSOM.performance_measure import *
def main():
    """Train an IRSOM (SOM + supervised layer) coding/noncoding classifier.

    Workflow:
      1. Parse CLI arguments (docopt) and expand user/env vars in paths.
      2. Run the external ``featurer`` executable on every input fasta file
         (coding and noncoding) in parallel to compute features.
      3. Load the computed features, train the SOM then the SLSOM on top.
      4. Save the model, prediction probabilities, performance measures and
         diagnostic plots under ``--output``.
      5. Unless ``--keep_features`` is set, delete the feature directory.

    No return value; all results are written to disk under ``--output``.
    NOTE: ``--output`` is used with plain string concatenation, so it is
    expected to end with a path separator (e.g. ``results/``).
    """
    arguments = docopt(__doc__, version="train IRSOM 1.0")
    output_path = os.path.expanduser(os.path.expandvars(arguments["--output"]))
    featurer_path = os.path.expanduser(os.path.expandvars(arguments["--featurer"]))

    # Hyper-parameters with defaults. BUG FIX: the original tested
    # arguments["--dim0"] when reading --dim1, so an explicit --dim1 was
    # ignored when --dim0 was absent, and --dim0 without --dim1 crashed
    # with int(None).
    map_size_m = int(arguments["--dim0"]) if arguments["--dim0"] is not None else 10
    map_size_n = int(arguments["--dim1"]) if arguments["--dim1"] is not None else 10
    batch_size = int(arguments["--batch_size"]) if arguments["--batch_size"] is not None else 100
    # "penality" is the historical flag spelling; kept for CLI compatibility.
    penality = float(arguments["--penality"]) if arguments["--penality"] is not None else 0.001
    alpha = float(arguments["--alpha"]) if arguments["--alpha"] is not None else 0.5
    verbose = arguments["--verbose"]

    # --- Compute features -------------------------------------------------
    # One featurer subprocess per input fasta; class 0 = coding, 1 = noncoding.
    path_feature_root = [output_path + "features/coding", output_path + "features/noncoding"]
    path_feature = [[], []]
    featurer_worker = []
    for cls, key in enumerate(["<coding>", "<noncoding>"]):
        for i, fasta in enumerate(arguments[key]):
            tmp = path_feature_root[cls] + str(i) + "/"
            path_feature[cls].append(tmp)
            check_dir(tmp)
            featurer_worker.append(subprocess.Popen([featurer_path, fasta, tmp]))
    # Wait for every featurer process before reading its output directory.
    for worker in featurer_worker:
        worker.wait()
    print("Features computed")

    # --- Import data ------------------------------------------------------
    # import_ncRNA returns (feature_matrix, sequence_names) per directory.
    data_coding_list = []
    data_names_coding_list = []
    for path in path_feature[0]:
        features, names = import_ncRNA(path)
        data_coding_list.append(features)
        data_names_coding_list.append(names)
    data_noncoding_list = []
    data_names_noncoding_list = []
    for path in path_feature[1]:
        features, names = import_ncRNA(path)
        data_noncoding_list.append(features)
        data_names_noncoding_list.append(names)

    nb_coding = np.sum([x.shape[0] for x in data_coding_list])
    nb_noncoding = np.sum([x.shape[0] for x in data_noncoding_list])
    # Stack coding first, then noncoding; labels: 0 = coding, 1 = noncoding.
    data = np.concatenate(data_coding_list + data_noncoding_list, axis=0)
    label = np.repeat([0, 1], [nb_coding, nb_noncoding])

    # --- Train ------------------------------------------------------------
    som = SOM(m=map_size_m, n=map_size_n, unit_width=data.shape[1], verbose=verbose)
    ssom = SLSOM(som, 2, alpha=alpha, verbose=verbose)
    ssom.tf_object.initialize()
    som.train(data.shape[0], data)
    ssom.train(data, label, batch_size=batch_size, penality=penality)
    print("SLSOM learned")

    # --- Save model and diagnostics --------------------------------------
    check_dir(output_path + "SOM/")
    check_dir(output_path + "SLSOM/")
    som.save(output_path + "SOM/")
    ssom.save(output_path + "SLSOM/")

    y, p = ssom.predict(data)
    np.savetxt(output_path + "proba.txt", np.array(p))

    rep_true, _ = som.repartition_map(data, label)
    rep_pred, _ = som.repartition_map(data, y)
    plot_repartition(rep_true, map_size_m, map_size_n, output_path + "plot_repartition_true")
    plot_repartition(rep_pred, map_size_m, map_size_n, output_path + "plot_repartition_pred")
    plot_density(label, p, output_path + "plot_density")

    # Pair each prediction with its true label for the performance report.
    perf = Performance(np.array([[y[i], label[i]] for i in range(data.shape[0])])).All_measures()
    print(perf)
    with open(output_path + "perf.txt", "w") as f:
        towrite = ""
        for key in perf:
            towrite += key + "\t" + str(perf[key]) + "\n"
        f.write(towrite)
    plot_weights(som.get_units(), map_size_m, map_size_n, output_path + "plot_")

    # Feature files are intermediate artifacts; drop them unless requested.
    if not arguments["--keep_features"]:
        shutil.rmtree(output_path + "features/")
    print("Training finish")


if __name__ == "__main__":
    main()