# train.py 4.33 KB
""" Train IRSOM model on ncRNA.

Usage:
	train.py --featurer=<path> --output=<path> (-c <coding>)... (-n <noncoding>)... [--dim0=<value> --dim1=<value> --batch_size=<value> --penality=<value> --alpha=<value> --keep_features --verbose]
	train.py (-h | --help)
	train.py --version

Options:
	--featurer=<path>	Path to the featurer executable.
	--output=<path> Path to store the model.
	-c <coding> Path for one or multiple fasta file(s) containing coding transcript.
	-n <noncoding> Path for one or multiple fasta file(s) containing noncoding transcript.
	--dim0=<value>	SOM dimension 0 (by default at 10).
	--dim1=<value>	SOM dimension 1 (by default at 10).
	--batch_size=<value>	the size of the batch given at each iteration (by default at 100).
	--penality=<value>	Coefficient of the regularization term (by default at 0.001).
	--alpha=<value>	Constant in the activation function (by default at 0.5).
	--keep_features	Keep the features computed in the "output" folder.
	--verbose Produce more output

"""
from docopt import docopt
import subprocess
import shutil
import os

import numpy as np
from SLSOM.util import *
from SLSOM.SSOM import *
from SLSOM.performance_measure import *

def _option_or_default(arguments, key, cast, default):
	"""Return ``cast(arguments[key])``, or ``default`` when the option is None."""
	raw = arguments[key]
	return default if raw is None else cast(raw)


def _launch_featurer(featurer_path, fasta_files, feature_root):
	"""Start one featurer process per fasta file.

	Each file gets its own directory named ``feature_root + index + "/"``.
	Returns the list of those directories and a list of
	``(fasta, process)`` pairs for the processes that were started.
	"""
	directories = []
	workers = []
	for index, fasta in enumerate(fasta_files):
		directory = feature_root + str(index) + "/"
		directories.append(directory)
		# check_dir comes from the SLSOM star imports; presumably it creates
		# the directory when missing -- confirm against SLSOM.util.
		check_dir(directory)
		workers.append((fasta, subprocess.Popen([featurer_path, fasta, directory])))
	return directories, workers


def main():
	"""Command-line entry point: compute features, train the SOM/SLSOM, save results.

	Steps, in order:
	  1. Parse the command line with docopt (module docstring is the usage text).
	  2. Run the featurer executable on every coding and noncoding fasta file,
	     all processes in parallel, and wait for them to finish.
	  3. Load the features, label coding rows 0 and noncoding rows 1.
	  4. Train the SOM, then the SLSOM, and save both under "SOM/" and "SLSOM/".
	  5. Write "proba.txt", "perf.txt" and the plots into the output folder.
	  6. Delete the "features/" folder unless --keep_features was given.

	The output path is normalized to end with a path separator, so
	"--output=out" and "--output=out/" write to the same place.
	"""
	import sys

	arguments = docopt(__doc__,version="train IRSOM 1.0")
	output_path = os.path.expanduser(os.path.expandvars(arguments["--output"]))
	# Every path below is built by string concatenation; guarantee the trailing
	# separator so files land inside the folder instead of next to it.
	output_path = os.path.join(output_path, "")
	featurer_path = os.path.expanduser(os.path.expandvars(arguments["--featurer"]))
	map_size_m = _option_or_default(arguments, "--dim0", int, 10)
	# Each dimension is checked against its own option (the second one used to
	# test --dim0, which ignored or crashed on a lone --dim0/--dim1).
	map_size_n = _option_or_default(arguments, "--dim1", int, 10)
	batch_size = _option_or_default(arguments, "--batch_size", int, 100)
	penality = _option_or_default(arguments, "--penality", float, 0.001)
	alpha = _option_or_default(arguments, "--alpha", float, 0.5)
	verbose = arguments["--verbose"]

	# Compute features: start every featurer process first, then wait for all.
	coding_dirs, coding_workers = _launch_featurer(
		featurer_path, arguments["<coding>"], output_path+"features/coding")
	noncoding_dirs, noncoding_workers = _launch_featurer(
		featurer_path, arguments["<noncoding>"], output_path+"features/noncoding")

	for fasta, worker in coding_workers + noncoding_workers:
		status = worker.wait()
		if status != 0:
			# Report instead of aborting: the exit-code convention of the
			# featurer executable is not known here.
			sys.stderr.write(
				"Warning: featurer exited with status %s for %s\n" % (status, fasta))
	print("Features computed")

	# Import data (only the first element returned by import_ncRNA is used).
	data_coding_list = [import_ncRNA(path)[0] for path in coding_dirs]
	data_noncoding_list = [import_ncRNA(path)[0] for path in noncoding_dirs]

	nb_coding = sum(x.shape[0] for x in data_coding_list)
	nb_noncoding = sum(x.shape[0] for x in data_noncoding_list)

	data = np.concatenate(data_coding_list+data_noncoding_list,axis=0)
	# Rows are ordered coding first, then noncoding: label 0 / label 1.
	label = np.repeat([0,1],[nb_coding,nb_noncoding])
	som = SOM(m=map_size_m,n=map_size_n,unit_width=data.shape[1],verbose=verbose)

	ssom = SLSOM(som,2,alpha=alpha,verbose=verbose)
	ssom.tf_object.initialize()

	som.train(data.shape[0],data)
	ssom.train(data,label,batch_size=batch_size,penality = penality)
	print("SLSOM learned")

	check_dir(output_path+"SOM/")
	check_dir(output_path+"SLSOM/")
	som.save(output_path+"SOM/")
	ssom.save(output_path+"SLSOM/")

	# Evaluate on the training data itself and store predictions and plots.
	y,p = ssom.predict(data)
	np.savetxt(output_path+"proba.txt",np.array(p))
	rep_true,_ = som.repartition_map(data,label)
	rep_pred,_ = som.repartition_map(data,y)
	plot_repartition(rep_true,map_size_m,map_size_n,output_path+"plot_repartition_true")
	plot_repartition(rep_pred,map_size_m,map_size_n,output_path+"plot_repartition_pred")
	plot_density(label,p,output_path+"plot_density")

	# One row per sample: [predicted label, true label].
	pairs = np.array([[predicted, truth] for predicted, truth in zip(y, label)])
	perf = Performance(pairs).All_measures()
	print(perf)
	with open(output_path+"perf.txt","w") as f:
		f.write("".join(key + "\t" + str(perf[key]) + "\n" for key in perf))

	plot_weights(som.get_units(),map_size_m,map_size_n,output_path+"plot_")

	if not arguments["--keep_features"]:
		shutil.rmtree(output_path+"features/")

	print("Training finish")

# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
	main()