Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Louis BECQUEY
/
RNANet
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Louis BECQUEY
2020-03-20 16:42:51 +0100
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
ee5f1256605838f90808dca5c941b8cb086e99c8
ee5f1256
1 parent
220d1a3e
Distance matrices of 3D data
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
63 additions
and
26 deletions
results/clusters_rot180.png
statistics.py
results/clusters_rot180.png
deleted
100644 → 0
View file @
220d1a3
399 KB
statistics.py
View file @
ee5f125
...
...
@@ -6,14 +6,17 @@ import threading as th
import
seaborn
as
sb
import
scipy.stats
as
st
import
matplotlib.pyplot
as
plt
import
matplotlib.patches
as
ptch
import
pylab
import
scipy.cluster.hierarchy
as
sch
from
scipy.spatial.distance
import
squareform
from
mpl_toolkits.mplot3d
import
axes3d
from
Bio.Phylo.TreeConstruction
import
DistanceCalculator
from
Bio
import
AlignIO
from
Bio
import
AlignIO
,
SeqIO
from
matplotlib
import
cm
from
tqdm
import
tqdm
from
functools
import
partial
from
multiprocessing
import
Pool
from
os
import
path
from
RNAnet
import
read_cpu_number
...
...
@@ -182,14 +185,52 @@ def stats_len(mappings_list, points):
# ax.text(600,150, str(len([ x for x in lengths[f] if x != np.NaN ])), fontsize=14)
plt
.
savefig
(
"results/full_length_distribs.png"
)
def
seq_idty
(
mappings_list
):
idty_matrix
=
{}
def
to_dist_matrix
(
f
):
print
(
f
)
dm
=
DistanceCalculator
(
'identity'
)
for
f
in
mappings_list
.
keys
():
with
open
(
path_to_seq_data
+
"realigned/"
+
f
+
"++.stk"
)
as
al_file
:
al
=
AlignIO
.
read
(
al_file
,
"stockholm"
)
idty_matrix
[
f
]
=
dm
.
get_distance
(
al
)
with
open
(
path_to_seq_data
+
"realigned/"
+
f
+
"++.afa"
)
as
al_file
:
al
=
AlignIO
.
read
(
al_file
,
"fasta"
)[
-
len
(
mappings_list
[
f
]):]
idty
=
dm
.
get_distance
(
al
)
.
matrix
# list of lists
del
al
l
=
len
(
idty
)
np
.
save
(
"data/"
+
f
+
".npy"
,
np
.
array
([
idty
[
i
]
+
[
0
]
*
(
l
-
1
-
i
)
if
i
<
l
-
1
else
idty
[
i
]
for
i
in
range
(
l
)
]))
del
idty
return
0
def
seq_idty
(
mappings_list
):
fam_arrays
=
[]
for
f
in
sorted
(
mappings_list
.
keys
()):
if
path
.
isfile
(
"data/"
+
f
+
".npy"
):
fam_arrays
.
append
(
np
.
load
(
"data/"
+
f
+
".npy"
))
else
:
# to_dist_matrix(f)
# fam_arrays.append(np.load("data/"+f+".npy"))
fam_arrays
.
append
([])
fig
,
axs
=
plt
.
subplots
(
11
,
7
,
figsize
=
(
25
,
25
))
axs
=
axs
.
ravel
()
[
axi
.
set_axis_off
()
for
axi
in
axs
]
for
f
,
D
,
ax
in
zip
(
sorted
(
mappings_list
.
keys
()),
fam_arrays
,
axs
):
if
not
len
(
D
):
continue
if
D
.
shape
[
0
]
>
2
:
# Cluster only if there is more than 2 sequences to organize
D
=
D
+
D
.
T
# Copy the lower triangle to upper, to get a symetrical matrix
condensedD
=
squareform
(
D
)
# Compute basic dendrogram by Ward's method
Y
=
sch
.
linkage
(
condensedD
,
method
=
'ward'
)
Z
=
sch
.
dendrogram
(
Y
,
orientation
=
'left'
,
no_plot
=
True
)
# Reorganize rows and cols
idx1
=
Z
[
'leaves'
]
D
=
D
[
idx1
,:]
D
=
D
[:,
idx1
[::
-
1
]]
im
=
ax
.
matshow
(
D
,
vmin
=
0
,
vmax
=
1
,
origin
=
'lower'
)
ax
.
set_title
(
f
)
fig
.
suptitle
(
"Distance matrices of sequences from various families
\n
clustered by sequence identity (Ward's method)"
,
fontsize
=
"18"
)
fig
.
tight_layout
()
fig
.
subplots_adjust
(
top
=
0.92
)
fig
.
colorbar
(
im
,
ax
=
axs
.
tolist
(),
shrink
=
0.98
)
fig
.
savefig
(
f
"results/distances.png"
)
...
...
@@ -205,24 +246,20 @@ if __name__ == "__main__":
mappings_list
=
pd
.
read_csv
(
path_to_seq_data
+
"realigned/mappings_list.csv"
,
sep
=
','
,
index_col
=
0
)
.
to_dict
(
orient
=
'list'
)
for
k
in
mappings_list
.
keys
():
mappings_list
[
k
]
=
[
x
for
x
in
mappings_list
[
k
]
if
str
(
x
)
!=
'nan'
]
print
(
mappings_list
)
exit
()
print
(
"Loading datapoints from file..."
)
rna_points
=
[]
filelist
=
[
path_to_3D_data
+
"/datapoints/"
+
f
for
f
in
os
.
listdir
(
path_to_3D_data
+
"/datapoints"
)
if
".log"
not
in
f
and
".gz"
not
in
f
]
p
=
Pool
(
initializer
=
tqdm
.
set_lock
,
initargs
=
(
tqdm
.
get_lock
(),),
processes
=
read_cpu_number
())
pbar
=
tqdm
(
total
=
len
(
filelist
),
desc
=
"RNA files"
,
position
=
0
,
leave
=
True
)
for
i
,
rna
in
enumerate
(
p
.
imap_unordered
(
load_rna_frome_file
,
filelist
)):
rna_points
.
append
(
rna
)
pbar
.
update
(
1
)
pbar
.
close
()
p
.
close
()
p
.
join
()
npoints
=
len
(
rna_points
)
print
(
npoints
,
"RNA files loaded."
)
exit
()
# print("Loading datapoints from file...")
# rna_points = []
# filelist = [path_to_3D_data+"/datapoints/"+f for f in os.listdir(path_to_3D_data+"/datapoints") if ".log" not in f and ".gz" not in f]
# p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(),), processes=read_cpu_number())
# pbar = tqdm(total=len(filelist), desc="RNA files", position=0, leave=True)
# for i, rna in enumerate(p.imap_unordered(load_rna_frome_file, filelist)):
# rna_points.append(rna)
# pbar.update(1)
# pbar.close()
# p.close()
# p.join()
# npoints = len(rna_points)
# print(npoints, "RNA files loaded.")
#################################################################
# Define threads for the tasks
...
...
Please
register
or
login
to post a comment