Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Louis BECQUEY
/
RNANet
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Aglaé TABOT
2021-06-14 17:33:23 +0200
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
c0c2efccba5fa09e72ba5bc8807b203ee7382ec8
c0c2efcc
1 parent
0bde149e
Added useful functions for measurements and plots on pairings
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
184 additions
and
6 deletions
statistics.py
statistics.py
View file @
c0c2efc
...
...
@@ -30,6 +30,8 @@ from collections import Counter
from
setproctitle
import
setproctitle
from
RNAnet
import
Job
,
read_cpu_number
,
sql_ask_database
,
sql_execute
,
warn
,
notify
,
init_worker
,
trace_unhandled_exceptions
from
sklearn.mixture
import
GaussianMixture
import
warnings
from
pandas.core.common
import
SettingWithCopyWarning
np
.
set_printoptions
(
threshold
=
sys
.
maxsize
,
linewidth
=
np
.
inf
,
precision
=
8
)
...
...
@@ -1921,7 +1923,7 @@ def angles_plans_hire_RNA(f):
setproctitle
(
f
"RNANet statistics.py Worker {thr_idx+1} angles_plans_hire_RNA({f})"
)
os
.
makedirs
(
runDir
+
"/results/
plane_angles_hRNA
/"
,
exist_ok
=
True
)
os
.
makedirs
(
runDir
+
"/results/
HiRE-RNA/angles
/"
,
exist_ok
=
True
)
parser
=
MMCIFParser
()
s
=
parser
.
get_structure
(
name
,
os
.
path
.
abspath
(
"/home/data/RNA/3D/rna_only/"
+
f
))
...
...
@@ -1960,7 +1962,7 @@ def angles_plans_hire_RNA(f):
pbar
.
close
()
df
.
to_csv
(
runDir
+
'/results/
plane_angles_hRNA
/'
+
'angles_plans_hire_RNA '
+
name
+
'.csv'
)
df
.
to_csv
(
runDir
+
'/results/
HiRE-RNA/angles
/'
+
'angles_plans_hire_RNA '
+
name
+
'.csv'
)
idxQueue
.
put
(
thr_idx
)
# replace the thread index in the queue
setproctitle
(
f
"RNANet statistics.py Worker {thr_idx+1} finished"
)
...
...
@@ -2040,6 +2042,71 @@ def GMM_histo(data, name_data, x, y, nb_fichiers) :
with
open
(
name_data
+
" .json"
,
'w'
,
encoding
=
'utf-8'
)
as
f
:
json
.
dump
(
summary_data
,
f
,
indent
=
4
)
def
GMM_histo_without_saving
(
data
,
name_data
,
x
,
y
,
nb_fichiers
)
:
'''
Plot Gaussian-Mixture-Model on histograms
'''
histogram
(
data
,
name_data
,
x
,
y
,
nb_fichiers
)
#plot the histogram
n_max
=
8
# number of possible values for n_components
n_components_range
=
np
.
arange
(
n_max
)
+
1
aic
=
[]
bic
=
[]
maxlogv
=
[]
md
=
np
.
array
(
data
)
.
reshape
(
-
1
,
1
)
# construction of models and calculation of criteria
nb_components
=
1
nb_log_max
=
n_components_range
[
0
]
log_max
=
0
# chooses the number of components based on the maximum likelihood value (maxlogv)
for
n_comp
in
n_components_range
:
gmm
=
GaussianMixture
(
n_components
=
n_comp
)
.
fit
(
md
)
aic
.
append
(
abs
(
gmm
.
aic
(
md
)))
bic
.
append
(
abs
(
gmm
.
bic
(
md
)))
maxlogv
.
append
(
gmm
.
lower_bound_
)
if
gmm
.
lower_bound_
==
max
(
maxlogv
)
:
# takes the maximum
nb_components
=
n_comp
# if there is convergence, keep the first maximum found
if
abs
(
gmm
.
lower_bound_
-
log_max
)
<
0.02
:
#threshold=0.02
nb_components
=
nb_log_max
break
log_max
=
max
(
maxlogv
)
nb_log_max
=
n_comp
# plot with the appropriate number of components
obs
=
np
.
array
(
data
)
.
reshape
(
-
1
,
1
)
g
=
GaussianMixture
(
n_components
=
nb_components
)
g
.
fit
(
obs
)
weights
=
g
.
weights_
means
=
g
.
means_
covariances
=
g
.
covariances_
D
=
obs
.
ravel
()
xmin
=
D
.
min
()
xmax
=
D
.
max
()
x
=
np
.
linspace
(
xmin
,
xmax
,
1000
)
colors
=
[
'red'
,
'blue'
,
'gold'
,
'cyan'
,
'magenta'
,
'white'
,
'black'
,
'green'
]
# prepare the dictionary to save the parameters
summary_data
=
{}
summary_data
[
"measure"
]
=
name_data
summary_data
[
"weights"
]
=
[]
summary_data
[
"means"
]
=
[]
summary_data
[
"std"
]
=
[]
# plot
for
i
in
range
(
nb_components
):
mean
=
means
[
i
]
sigma
=
np
.
sqrt
(
covariances
[
i
])
weight
=
weights
[
i
]
plt
.
plot
(
x
,(
weights
[
i
]
*
st
.
norm
.
pdf
(
x
,
mean
,
sigma
))[
0
],
c
=
colors
[
i
])
summary_data
[
"means"
]
.
append
(
str
(
mean
))
summary_data
[
"std"
]
.
append
(
str
(
sigma
))
summary_data
[
"weights"
]
.
append
(
str
(
weight
))
axes
=
plt
.
gca
()
plt
.
title
(
"Histogramme "
+
name_data
+
" avec GMM pour "
+
str
(
nb_components
)
+
" composantes ("
+
str
(
nb_fichiers
)
+
" structures)"
)
# save in a json
with
open
(
name_data
+
" .json"
,
'w'
,
encoding
=
'utf-8'
)
as
f
:
json
.
dump
(
summary_data
,
f
,
indent
=
4
)
def
GMM_tot
(
data
,
name_data
,
nb_fichiers
,
couleur
)
:
'''
Plot the sum of the Gaussians (without the histograms)
...
...
@@ -2468,7 +2535,7 @@ def graph_torsion_h_RNA():
def
graph_plans_h_RNA
():
df
=
pd
.
read_csv
(
os
.
path
.
abspath
(
runDir
+
"results/HiRE-RNA/angles/angles_plans_hire_RNA.csv"
))
df
=
pd
.
read_csv
(
os
.
path
.
abspath
(
runDir
+
"
/
results/HiRE-RNA/angles/angles_plans_hire_RNA.csv"
))
p_c1p_psuiv
=
list
(
df
[
"P-C1'-P°"
][
~
np
.
isnan
(
df
[
"P-C1'-P°"
])])
c1p_psuiv_c1psuiv
=
list
(
df
[
"C1'-P°-C1'°"
][
~
np
.
isnan
(
df
[
"C1'-P°-C1'°"
])])
...
...
@@ -2487,6 +2554,112 @@ def graph_plans_h_RNA():
plt
.
savefig
(
"GMM des angles plans (hire-RNA) (100 structures).png"
)
plt
.
close
()
'''
Functions for making measurements on pairings
'''
def
dictio
(
ld
):
'''
creation of a dictionary
key = pdb identifier of the structure
value = list of RNA chains
'''
pdb
=
[]
for
f
in
ld
:
pdb_id
=
str
.
split
(
f
,
'_'
)[
0
]
if
pdb_id
not
in
pdb
:
#we create a list of distinct structures
pdb
.
append
(
pdb_id
)
for
pdb_id
in
pdb
:
#for all structures found
liste_chaines
=
[]
for
f
in
ld
:
if
(
len
(
f
)
<
10
):
#unmapped to a Rfam family
chaine
=
str
.
split
(
f
,
'_'
)
if
chaine
[
0
]
==
pdb_id
:
id_chain
=
chaine
[
2
]
liste_chaines
.
append
(
id_chain
)
if
liste_chaines
!=
[]:
dictionnaire
[
pdb_id
]
=
liste_chaines
return
(
dictionnaire
)
def
dist_pointes
(
res
,
pair
):
'''
measure of the distance between the tips of the paired nucleotides (B1 / B1 or B1 / B2 or B2 / B2)
'''
dist
=
[]
d
=
0
if
res
.
get_resname
()
==
'A'
or
res
.
get_resname
()
==
'G'
:
# different cases if 1 aromatic cycle or 2
atom_res
=
pos_b2
(
res
)
if
pair
.
get_resname
()
==
'A'
or
pair
.
get_resname
()
==
'G'
:
atom_pair
=
pos_b2
(
pair
)
if
pair
.
get_resname
()
==
'C'
or
pair
.
get_resname
()
==
'U'
:
atom_pair
=
pos_b1
(
pair
)
if
res
.
get_resname
()
==
'C'
or
res
.
get_resname
()
==
'U'
:
atom_res
=
pos_b1
(
res
)
if
pair
.
get_resname
()
==
'A'
or
pair
.
get_resname
()
==
'G'
:
atom_pair
=
pos_b2
(
pair
)
if
pair
.
get_resname
()
==
'C'
or
pair
.
get_resname
()
==
'U'
:
atom_pair
=
pos_b1
(
pair
)
dist
=
distance
(
atom_res
,
atom_pair
)
return
dist
def
angle_c1_b1
(
res
,
pair
):
'''
measurement of the plane angles formed by the vectors C1-> B1 of the paired nucleotides
'''
if
res
.
get_resname
()
==
'A'
or
res
.
get_resname
()
==
'G'
or
res
.
get_resname
()
==
'C'
or
res
.
get_resname
()
==
'U'
:
atom_c4_res
=
[
atom
.
get_coord
()
for
atom
in
res
if
"C4'"
in
atom
.
get_fullname
()
]
atom_c1p_res
=
[
atom
.
get_coord
()
for
atom
in
res
if
"C1'"
in
atom
.
get_fullname
()
]
atom_b1_res
=
pos_b1
(
res
)
c4_res
=
Vector
(
atom_c4_res
[
0
])
c1_res
=
Vector
(
atom_c1p_res
[
0
])
b1_res
=
Vector
(
atom_b1_res
)
if
pair
.
get_resname
()
==
'A'
or
pair
.
get_resname
()
==
'G'
or
pair
.
get_resname
()
==
'C'
or
pair
.
get_resname
()
==
'U'
:
atom_c4_pair
=
[
atom
.
get_coord
()
for
atom
in
pair
if
"C4'"
in
atom
.
get_fullname
()
]
atom_c1p_pair
=
[
atom
.
get_coord
()
for
atom
in
pair
if
"C1'"
in
atom
.
get_fullname
()
]
atom_b1_pair
=
pos_b1
(
pair
)
c4_pair
=
Vector
(
atom_c4_pair
[
0
])
c1_pair
=
Vector
(
atom_c1p_pair
[
0
])
b1_pair
=
Vector
(
atom_b1_pair
)
#we calculate the 4 plane angles including these vectors
a
=
calc_angle
(
c4_res
,
c1_res
,
b1_res
)
*
(
180
/
np
.
pi
)
b
=
calc_angle
(
c1_res
,
b1_res
,
b1_pair
)
*
(
180
/
np
.
pi
)
c
=
calc_angle
(
b1_res
,
b1_pair
,
c1_pair
)
*
(
180
/
np
.
pi
)
d
=
calc_angle
(
b1_pair
,
c1_pair
,
c4_pair
)
*
(
180
/
np
.
pi
)
angles
=
[
a
,
b
,
c
,
d
]
return
angles
def
graphe
(
type_LW
,
angle_1
,
angle_2
,
angle_3
,
angle_4
,
distance
,
nb
):
'''
function to plot the statistical figures you want
By type of pairing:
Superposition of GMMs of plane angles
Superposition of the histogram and the GMM of the distances
all in the same window
'''
figure
=
plt
.
figure
(
figsize
=
(
10
,
10
))
plt
.
gcf
()
.
subplots_adjust
(
left
=
0.1
,
bottom
=
0.1
,
right
=
0.9
,
top
=
0.9
,
wspace
=
0
,
hspace
=
0.5
)
plt
.
subplot
(
2
,
1
,
1
)
GMM_tot
(
angle_1
,
"C4'-C1'-B1"
,
nb
,
'cyan'
)
GMM_tot
(
angle_2
,
"C1'-B1-B1pair"
,
nb
,
'magenta'
)
GMM_tot
(
angle_3
,
"B1-B1pair-C1'pair"
,
nb
,
"yellow"
)
GMM_tot
(
angle_4
,
"B1pair-C1'pair-C4'pair"
,
nb
,
'olive'
)
plt
.
xlabel
(
"Angle(degré)"
)
plt
.
title
(
"GMM des angles plans pour les appariements "
+
type_LW
+
" ("
+
str
(
nb
)
+
" valeurs)"
,
fontsize
=
10
)
plt
.
subplot
(
2
,
1
,
2
)
GMM_histo_without_saving
(
distance
,
"Distance pointes "
+
type_LW
,
"Distance (Angström)"
,
"Densité"
,
nb
)
plt
.
savefig
(
"Mesures appariements "
+
type_LW
+
" ("
+
str
(
nb
)
+
" valeurs).png"
)
plt
.
close
()
...
...
@@ -2623,12 +2796,13 @@ if __name__ == "__main__":
#dist_atoms_hire_RNA(os.listdir(path_to_3D_data + "rna_only")[0])
#concatenate('/results/distances/', os.listdir(runDir+'/results/distances/'), 'dist_atoms.csv')
#exit()
exit
()
f_prec
=
os
.
listdir
(
path_to_3D_data
+
"rna_only"
)[
0
]
for
f
in
os
.
listdir
(
path_to_3D_data
+
"rna_only"
)[:
100
]:
#joblist.append(Job(function=dist_atoms, args=(f,)))
#joblist.append(Job(function=dist_atoms_hire_RNA, args=(f,)))
#
joblist.append(Job(function=angles_torsion_hire_RNA, args=(f,)))
joblist
.
append
(
Job
(
function
=
angles_torsion_hire_RNA
,
args
=
(
f
,)))
joblist
.
append
(
Job
(
function
=
angles_plans_hire_RNA
,
args
=
(
f
,)))
...
...
@@ -2676,7 +2850,11 @@ if __name__ == "__main__":
graph_angles_torsion()
concatenate('/results/HiRE-RNA/distances/', 'dist_atoms_hire_RNA.csv')
graph_dist_atoms_h_RNA()
concatenate('/results/HiRE-RNA/torsions/', 'angles_torsion_hire_RNA.csv')
graph_torsion_hire_RNA()
concatenate('/results/HiRE-RNA/angles/', 'angles_plans_hire_RNA.csv')
graph_torsion_h_RNA()
graph_plans_h_RNA()
graph_eta_theta()
'''
...
...
Please
register
or
login
to post a comment