util.py
2.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import numpy as np
import matplotlib
matplotlib.use('Agg')
from pylab import rcParams
rcParams['figure.figsize'] = 19, 20
from matplotlib import pyplot as plt
from matplotlib.pyplot import cm
import os
import pandas as pd
from plotnine import *
from functools import partial
from concurrent.futures import ThreadPoolExecutor
'''
File and directory checks
'''
def check_dir_file(file_name):
    """Ensure the directory that will hold `file_name` exists.

    The directory part is obtained with ``os.path.dirname``. When it is the
    empty string (a bare file name with no directory component) nothing is
    done; otherwise the directory is passed to ``check_dir``.
    """
    parent = os.path.dirname(file_name)
    if parent == "":
        # Bare file name: there is no directory to create.
        return
    check_dir(parent)
def check_dir(path):
    """Create the directory `path`, including missing parents, if absent.

    Nothing happens when `path` already exists (whatever kind of file-system
    entry it is).

    Args:
        path: Directory to create.
    """
    if not os.path.exists(path):
        # exist_ok=True closes the window between the existence check above
        # and the creation: if another thread or process creates the
        # directory in between, this call no longer raises FileExistsError.
        os.makedirs(path, exist_ok=True)
'''
Import data
'''
def import_ncRNA(path, file_order=("CB.txt", "ORF.txt", "KMER3.txt")):
    """Load several header-less CSV feature tables and join them into one matrix.

    Every file named in `file_order` is read from ``path + file_name`` as a
    comma-separated table without a header row. The tables are merged one
    after another on their first column (label ``0``) using pandas' default
    inner join, so only identifiers present in every file are kept. Missing
    values are replaced by 0.

    Args:
        path: Prefix prepended verbatim to each file name; for a directory it
            must already end with the path separator.
        file_order: File names to load, in the order their feature columns
            should appear in the result. Defaults to the three files that
            were previously hard-coded.

    Returns:
        A tuple ``(data, data_names)``: ``data`` holds the values of every
        column except the first one of the merged table, ``data_names`` the
        values of its first column.

    Raises:
        ValueError: If `file_order` is empty.
    """
    file_order = list(file_order)
    if not file_order:
        raise ValueError("file_order must contain at least one file name")

    # Submit every read first so the files load concurrently, then collect
    # the results in submission order.
    with ThreadPoolExecutor(max_workers=4) as pool:
        pending = [
            pool.submit(pd.read_csv, path + file_name, sep=",", header=None)
            for file_name in file_order
        ]
        tables = []
        for position, job in enumerate(pending):
            table = job.result()
            # Give every feature column a label unique to its file. With the
            # default integer labels, successive merges rely on the "_x"/"_y"
            # suffixes, which can collide once four or more tables are
            # joined. Only the values are returned, so labels are free to
            # change; the key column keeps label 0.
            table.columns = [0] + [
                f"{position}:{label}" for label in table.columns[1:]
            ]
            tables.append(table)

    merged = tables[0]
    for table in tables[1:]:
        merged = merged.merge(table, on=0)
    merged = merged.fillna(0)

    data = merged.iloc[:, 1:].values
    data_names = merged.iloc[:, 0].values
    return (data, data_names)
'''
Plot functions
'''
def plot_repartition(data, m, n, name):
    """Save a grid of stacked bars showing the label shares of each cell.

    `data[0]` provides the labels (each converted with ``str``) and `data[1]`
    is indexed as ``data[1][cell, k]`` with ``cell = i * n + j`` for grid
    position ``(i, j)`` and ``k`` the label position. For every cell the
    entries are divided by the cell's sum; a cell whose sum is not greater
    than zero is drawn as a single bar labelled "empty" with height 1.0.

    The figure is facetted by ``x`` (rows) and ``y`` (columns) and written to
    ``name + ".png"`` after ``check_dir_file(name)`` has been called.
    """
    class_names = [str(item) for item in data[0]]
    counts = data[1]

    records = []
    for row in range(m):
        for col in range(n):
            cell = row * n + col
            total = np.sum(counts[cell, :])
            if not total > 0:
                # Nothing was assigned to this cell: show a placeholder bar.
                records.append({"id": cell, "x": row, "y": col, "label": "empty", "percentage in cluster": 1.0})
                continue
            records.extend(
                {"id": cell, "x": row, "y": col, "label": class_names[k], "percentage in cluster": counts[cell, k] / total}
                for k in range(len(class_names))
            )

    frame = pd.DataFrame(records)
    plot = (
        ggplot(frame, aes(fill="label", y="percentage in cluster", x="1"))
        + geom_bar(stat="identity")
        + facet_grid("x ~ y ")
        + scale_x_discrete(name="", limits="1", labels=[""])
        + xlab("")
        + ylab("")
    )
    check_dir_file(name)
    plot.save(name + ".png", width=6, height=7)
def plot_weights(units, m, n, name):
    """Save one line-plot grid per feature group of the unit weights.

    `units` is indexed as ``units[cell, column]`` with ``cell = i * n + j``
    for grid position ``(i, j)``. Two column groups are plotted: columns
    0-3 and columns 4-5. For each group a figure facetted by ``x`` (rows)
    and ``y`` (columns) is written to ``name + str(group_number) + ".png"``
    after ``check_dir_file(name)`` has been called.
    """
    feature_groups = [
        np.arange(4),
        np.arange(4, 6),
    ]
    for group_no, columns in enumerate(feature_groups):
        # One record per (grid cell, feature of the group); x2 is the 1-based
        # position of the feature inside its group.
        records = [
            {"x": i, "y": j, "x2": k + 1, "y2": units[i * n + j, columns[k]]}
            for i in range(m)
            for j in range(n)
            for k in range(columns.shape[0])
        ]
        frame = pd.DataFrame(records)
        # NOTE(review): the scale limits are the raw column indices while x2
        # runs from 1 to len(columns) - confirm this is the intended axis.
        plot = (
            ggplot(frame, aes(x="x2", y="y2"))
            + geom_line(size=1.0)
            + scale_x_discrete(name="", limits=columns)
            + facet_grid("x ~ y ")
            + xlab("")
            + ylab("")
        )
        check_dir_file(name)
        plot.save(name + str(group_no) + ".png", width=6, height=4)
def plot_density(y, proba, name):
    """Save a histogram of ``proba[:, 0]`` split by label.

    For each index ``i`` below ``y.shape[0]`` one sample is recorded with
    label ``str(y[i])`` and value ``proba[i, 0]``. The histogram is coloured,
    filled and grouped by label and written to ``name + ".png"`` after
    ``check_dir_file(name)`` has been called.
    """
    samples = [
        {"label": str(y[i]), "value": proba[i, 0]}
        for i in range(y.shape[0])
    ]
    frame = pd.DataFrame(samples)
    mapping = aes(color="label", fill="label", x="value", group="label")
    plot = ggplot(frame, mapping) + geom_histogram()
    check_dir_file(name)
    plot.save(name + ".png", width=6, height=7)