# util.py 2.93 KB
#
# Raw Blame History Permalink

import numpy as np
import matplotlib
matplotlib.use('Agg')
from pylab import rcParams
rcParams['figure.figsize'] = 19, 20
from matplotlib import pyplot as plt
from matplotlib.pyplot import cm
import os
import pandas as pd
from plotnine import *
from functools import partial
from concurrent.futures import ThreadPoolExecutor

'''
File and directory checks
'''
def check_dir_file(file_name):
	"""Make sure the directory part of ``file_name`` exists.

	The directory component is taken with ``os.path.dirname``. When it is
	empty (a bare file name) nothing is done; otherwise the directory is
	handed to ``check_dir``.
	"""
	parent = os.path.dirname(file_name)
	if parent == "":
		# Bare file name: there is no directory component to create.
		return
	check_dir(parent)

def check_dir(path):
	"""Create directory ``path`` (and any missing parents) if it is absent.

	An empty ``path`` is treated as "nothing to create" instead of letting
	``os.makedirs("")`` raise. If something already exists at ``path`` the
	call is a no-op.
	"""
	if not path or os.path.exists(path):
		return
	# exist_ok=True: the directory may be created by someone else between the
	# existence check above and this call; that must not raise.
	os.makedirs(path, exist_ok=True)

'''
Import data
'''

def import_ncRNA(path):
	"""Load the feature tables found under ``path`` and join them.

	The files ``CB.txt``, ``ORF.txt`` and ``KMER3.txt`` are read as
	comma-separated tables without a header row; each file name is appended
	directly to ``path`` (no separator is inserted). The reads are submitted
	to a thread pool, then the tables are merged one after another on their
	first column (column ``0``) and missing values are replaced by 0.

	Returns a tuple ``(data, data_names)``: ``data`` holds the values of
	every column except the first, ``data_names`` the values of the first
	column.
	"""
	# "KMER6.txt" is deliberately left out of the list.
	sources = ["CB.txt", "ORF.txt", "KMER3.txt"]
	with ThreadPoolExecutor(max_workers=4) as pool:
		pending = [
			pool.submit(pd.read_csv, path + file_name, sep=",", header=None)
			for file_name in sources
		]
		head, *rest = pending
		merged = head.result()
		# Merge in the order of `sources`, whatever order the reads finish in.
		for future in rest:
			merged = merged.merge(future.result(), on=0)
	merged = merged.fillna(0)
	# NOTE: a rescaling of the first five feature columns used to be applied
	# to the values at this point; it is currently disabled.
	features = merged.iloc[:, 1:].values
	identifiers = merged.iloc[:, 0].values
	return (features, identifiers)

'''
Plot functions
'''
def plot_repartition(data,m,n,name):
	"""Save an ``m`` x ``n`` grid of bar charts showing the label mix per cell.

	``data[0]`` supplies the labels (converted with ``str``) and ``data[1]``
	is indexed as ``[cell, label_position]``, where the cell at grid
	position ``(i, j)`` is row ``i*n + j``. For every cell whose row sum is
	positive, each label's value is divided by that sum; any other cell gets
	a single ``"empty"`` entry with value 1.0.

	The figure is written to ``name + ".png"`` after making sure the
	directory part of ``name`` exists.
	"""
	class_labels = [str(label) for label in data[0]]
	counts = data[1]
	records = []
	for row in range(m):
		for col in range(n):
			cell = row * n + col
			total = np.sum(counts[cell, :])
			if not (total > 0):
				# Nothing to normalise: emit one full-height placeholder bar.
				records.append({"id": cell, "x":row, "y":col, "label": "empty", "percentage in cluster": 1.0})
				continue
			records.extend(
				{"id": cell, "x":row, "y":col, "label": label, "percentage in cluster": counts[cell, pos] / total}
				for pos, label in enumerate(class_labels)
			)
	frame = pd.DataFrame(records)
	chart = (
		ggplot(frame, aes(fill="label", y="percentage in cluster", x="1"))
		+ geom_bar(stat="identity")
		+ facet_grid("x ~ y ")
		+ scale_x_discrete(name="", limits="1", labels=[""])
		+ xlab("")
		+ ylab("")
	)
	check_dir_file(name)
	chart.save(name + ".png", width=6, height=7)

def plot_weights(units,m,n,name):
	"""Save one line-plot grid per feature group of ``units``.

	``units`` is indexed as ``[cell, column]``, where the cell at grid
	position ``(i, j)`` is row ``i*n + j``. Two column groups are plotted:
	columns 0-3 and columns 4-5. Within a group the x position of a column
	is its 1-based rank in the group, and the y value is the entry of
	``units`` for that cell and column.

	Group number ``l`` is written to ``name + str(l) + ".png"`` after making
	sure the directory part of ``name`` exists.
	"""
	# Further column groups (from column 6 onwards) are currently disabled.
	column_groups = [np.arange(4), np.arange(4, 6)]
	for group_no, columns in enumerate(column_groups):
		records = [
			{"x":row, "y":col, "x2":rank + 1, "y2": units[row * n + col, column]}
			for row in range(m)
			for col in range(n)
			for rank, column in enumerate(columns)
		]
		frame = pd.DataFrame(records)
		chart = (
			ggplot(frame, aes(x="x2", y="y2"))
			+ geom_line(size=1.0)
			+ scale_x_discrete(name="", limits=columns)
			+ facet_grid("x ~ y ")
			+ xlab("")
			+ ylab("")
		)
		check_dir_file(name)
		chart.save(name + str(group_no) + ".png", width=6, height=4)

def plot_density(y, proba,name):
	"""Save a histogram of ``proba[:, 0]`` coloured and grouped by label.

	One record is built per index ``i`` in ``range(y.shape[0])``, pairing
	``str(y[i])`` as the label with ``proba[i, 0]`` as the value. The figure
	is written to ``name + ".png"`` after making sure the directory part of
	``name`` exists.
	"""
	records = [
		{"label":str(y[idx]), "value":proba[idx, 0]}
		for idx in range(y.shape[0])
	]
	frame = pd.DataFrame(records)
	chart = (
		ggplot(frame, aes(color="label", fill="label", x="value", group="label"))
		+ geom_histogram()
	)
	check_dir_file(name)
	chart.save(name + ".png", width=6, height=7)