Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Louis BECQUEY
/
RNANet
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Louis BECQUEY
2021-07-15 14:57:31 +0200
Browse Files
Options
Browse Files
Download
Plain Diff
Commit
e4282475d31f2c3add2f1d683207784c2e10d2d4
e4282475
2 parents
3407617d
a54959cc
Merge branch 'stage_aglae'
Former-commit-id:
2259ea13
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
258 additions
and
37 deletions
RNAnet.py
statistics.py
RNAnet.py
View file @
e428247
...
...
@@ -159,14 +159,14 @@ class SelectivePortionSelector(object):
_select
=
Select
()
def
save_mmcif
(
ioobj
,
out_file
,
select
=
_select
,
preserve_atom_numbering
=
False
):
# reuse and modification of the source code of Biopython
# to have the 2 columns of numbering of residues numbered with the index_chain of DSSR
if
isinstance
(
out_file
,
str
):
fp
=
open
(
out_file
,
"w"
)
close_file
=
True
else
:
fp
=
out_file
close_file
=
False
#fp = open(out_file, "w")
#close_file=True
atom_dict
=
defaultdict
(
list
)
for
model
in
ioobj
.
structure
.
get_list
():
...
...
@@ -188,7 +188,7 @@ def save_mmcif(ioobj, out_file, select=_select, preserve_atom_numbering=False):
chain_id
=
chain
.
get_id
()
if
chain_id
==
" "
:
chain_id
=
"."
# This is used to write label_seq_id
and increments from 1
,
# This is used to write label_seq_id,
# remaining blank for hetero residues
prev_residue_type
=
""
...
...
@@ -402,8 +402,9 @@ class Chain:
resseq
=
int
(
resseq
)
index_chain
=
nums
.
at
[
i
,
"index_chain"
]
nt
=
nums
.
at
[
i
,
"nt_name"
]
if
nt
==
'A'
or
nt
==
'G'
or
nt
==
'C'
or
nt
==
'U'
or
nt
in
[
'DG'
,
'DU'
,
'DC'
,
'DA'
,
'DI'
,
'DT'
]
or
nt
==
'N'
or
nt
==
'I'
:
# particular case 6n5s_1_A, residue 201 in the original cif file (resname = G and HETATM = H_G)
if
nt
==
'A'
or
(
nt
==
'G'
and
(
self
.
chain_label
!=
'6n5s_1_A'
or
resseq
!=
201
))
or
nt
==
'C'
or
nt
==
'U'
or
nt
in
[
'DG'
,
'DU'
,
'DC'
,
'DA'
,
'DI'
,
'DT'
]
or
nt
==
'N'
or
nt
==
'I'
:
res
=
chain
[(
' '
,
resseq
,
icode_res
)]
else
:
#modified nucleotides (e.g. chain 5l4o_1_A)
het
=
'H_'
+
nt
...
...
@@ -421,7 +422,15 @@ class Chain:
res_atoms
=
res
.
get_atoms
()
new_residu_t
=
pdb
.
Residue
.
Residue
(
res_id
,
res_name
,
res
.
get_segid
())
for
atom
in
list
(
res
.
get_atoms
()):
if
atom
.
get_name
()
in
[
'PA'
,
'O1A'
,
'O2A'
,
'O3A'
]:
# rename the remaining phosphate group to P, OP1, OP2, OP3
if
atom
.
get_name
()
in
[
'PA'
,
'O1A'
,
'O2A'
,
'O3A'
]
and
res_name
!=
'RIA'
:
# RIA is a residue made up of 2 riboses and 2 phosphates,
# so it has an O2A atom between the C2A and C1 'atoms,
# and it also has an OP2 atom attached to one of its phosphates
# (see chains 6fyx_1_1, 6zu9_1_1, 6fyy_1_1, 6gsm_1_1 , 3jaq_1_1 and 1yfg_1_A)
# we do not modify the atom names of RIA residue
if
atom
.
get_name
()
==
'PA'
:
atom_name
=
'P'
if
atom
.
get_name
()
==
'O1A'
:
...
...
@@ -1511,11 +1520,7 @@ class Pipeline:
if
self
.
HOMOLOGY
and
not
os
.
path
.
isdir
(
path_to_3D_data
+
"rna_mapped_to_Rfam"
):
# for the portions mapped to Rfam
os
.
makedirs
(
path_to_3D_data
+
"rna_mapped_to_Rfam"
)
'''
if (not self.HOMOLOGY) and not os.path.isdir(path_to_3D_data + "rna_only"):
# extract chains of pure RNA
os.makedirs(path_to_3D_data + "rna_only")
'''
if
(
not
self
.
HOMOLOGY
)
and
not
os
.
path
.
isdir
(
path_to_3D_data
+
"rna_only"
):
# extract chains of pure RNA
os
.
makedirs
(
path_to_3D_data
+
"rna_only"
)
...
...
statistics.py
View file @
e428247
...
...
@@ -37,6 +37,7 @@ from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, no
from
sklearn.mixture
import
GaussianMixture
import
warnings
from
pandas.core.common
import
SettingWithCopyWarning
from
joblib
import
Parallel
,
delayed
np
.
set_printoptions
(
threshold
=
sys
.
maxsize
,
linewidth
=
np
.
inf
,
precision
=
8
)
...
...
@@ -1251,6 +1252,25 @@ def nt_3d_centers(cif_file, consider_all_atoms):
result
.
append
(
res
)
return
(
result
)
def
liste_repres
(
fpath
):
repres
=
[]
df
=
pd
.
read_csv
(
os
.
path
.
abspath
(
fpath
))
for
i
in
range
(
df
.
shape
[
0
]):
up_name
=
df
[
"representative"
][
i
]
if
'+'
in
up_name
:
up_name
=
up_name
.
split
(
'+'
)
for
i
in
range
(
len
(
up_name
)):
chain
=
up_name
[
i
]
.
split
(
'|'
)
chain
=
chain
[
0
]
.
lower
()
+
'_'
+
chain
[
1
]
+
'_'
+
chain
[
2
]
repres
.
append
(
chain
+
'.cif'
)
else
:
up_name
=
up_name
.
split
(
'|'
)
low_name
=
up_name
[
0
]
.
lower
()
+
'_'
+
up_name
[
1
]
+
'_'
+
up_name
[
2
]
repres
.
append
(
low_name
+
'.cif'
)
return
repres
def
get_euclidian_distance
(
L1
,
L2
):
"""
Returns the distance between two points (coordinates in lists)
...
...
@@ -1504,14 +1524,16 @@ def measure_from_structure(f):
warnings
.
simplefilter
(
'ignore'
,
Bio
.
PDB
.
PDBExceptions
.
BiopythonWarning
)
parser
=
MMCIFParser
()
s
=
parser
.
get_structure
(
f
,
os
.
path
.
abspath
(
path_to_3D_data
+
"rna_only/"
+
f
))
measures_aa
(
name
,
s
,
thr_idx
)
#pyle_measures(name, s, thr_idx)
#measures_aa(name, s, thr_idx)
if
DO_HIRE_RNA_MEASURES
:
measures_hrna
(
name
,
s
,
thr_idx
)
measures_hrna_basepairs
(
name
,
s
,
thr_idx
)
if
DO_WADLEY_ANALYSIS
:
measures_wadley
(
name
,
s
,
thr_idx
)
#measures_wadley(name, s, thr_idx)
pyle_measures
(
name
,
s
,
thr_idx
)
idxQueue
.
put
(
thr_idx
)
# replace the thread index in the queue
setproctitle
(
f
"RNANet statistics.py Worker {thr_idx+1} finished"
)
...
...
@@ -1931,6 +1953,128 @@ def measures_hrna_basepairs_chain(name, chain, df, thr_idx):
return
results
@trace_unhandled_exceptions
def
pyle_measures
(
name
,
s
,
thr_idx
):
if
(
path
.
isfile
(
runDir
+
'/results/geometry/Pyle/distances/distances_pyle_'
+
name
+
'.csv'
)):
return
liste_dist
=
[]
#classes=[]
#for i in range(0, 150, 5):
#classes.append([i, i+5])
#classes.append([150, 300])
#occur_p_p=len(classes)*[0]
#occur_p_c1=len(classes)*[0]
#occur_p_c4=len(classes)*[0]
#occur_c1_c1=len(classes)*[0]
#occur_c4_c4=len(classes)*[0]
#nb_occurs=[]
setproctitle
(
f
"RNANet statistics.py Worker {thr_idx+1} pyle_measures({name})"
)
chain
=
next
(
s
[
0
]
.
get_chains
())
#residues=list(chain.get_residues())
for
res1
in
tqdm
(
chain
,
position
=
thr_idx
+
1
,
desc
=
f
"Worker {thr_idx+1}: {name} pyle_measures"
,
unit
=
"res"
,
leave
=
False
):
#res1=chain[i]
if
res1
.
get_resname
()
in
[
"A"
,
"C"
,
"G"
,
"U"
]:
resnum1
=
list
(
res1
.
get_id
())[
1
]
atom_p_1
=
[
atom
.
get_coord
()
for
atom
in
res1
if
atom
.
get_name
()
==
"P"
]
atom_c1p_1
=
[
atom
.
get_coord
()
for
atom
in
res1
if
"C1'"
in
atom
.
get_fullname
()
]
atom_c4p_1
=
[
atom
.
get_coord
()
for
atom
in
res1
if
"C4'"
in
atom
.
get_fullname
()
]
for
res2
in
chain
:
resnum2
=
list
(
res2
.
get_id
())[
1
]
if
resnum2
-
resnum1
<
4
:
continue
p_p
=
np
.
nan
p_c4p
=
np
.
nan
p_c1p
=
np
.
nan
c4p_c4p
=
np
.
nan
c1p_c1p
=
np
.
nan
#res2=chain[j]
if
res2
.
get_resname
()
in
[
"A"
,
"C"
,
"G"
,
"U"
]:
atom_p_2
=
[
atom
.
get_coord
()
for
atom
in
res2
if
atom
.
get_name
()
==
"P"
]
atom_c1p_2
=
[
atom
.
get_coord
()
for
atom
in
res2
if
"C1'"
in
atom
.
get_fullname
()
]
atom_c4p_2
=
[
atom
.
get_coord
()
for
atom
in
res2
if
"C4'"
in
atom
.
get_fullname
()
]
p_p
=
get_euclidian_distance
(
atom_p_1
,
atom_p_2
)
p_c4p
=
get_euclidian_distance
(
atom_p_1
,
atom_c4p_2
)
p_c1p
=
get_euclidian_distance
(
atom_p_1
,
atom_c1p_2
)
c4p_c4p
=
get_euclidian_distance
(
atom_c4p_1
,
atom_c4p_2
)
c1p_c1p
=
get_euclidian_distance
(
atom_c1p_1
,
atom_c1p_2
)
liste_dist
.
append
([
res1
.
get_resname
(),
int
(
resnum1
),
res2
.
get_resname
(),
int
(
resnum2
),
p_p
,
p_c4p
,
p_c1p
,
c4p_c4p
,
c1p_c1p
])
'''
for x in range(len(classes)):
if classes[x][0] <= p_p <= classes[x][1]:
occur_p_p[x]=occur_p_p[x]+1
if classes[x][0] <= p_c4p <= classes[x][1]:
occur_p_c4[x]=occur_p_c4[x]+1
if classes[x][0] <= p_c1p <= classes[x][1]:
occur_p_c1[x]=occur_p_c1[x]+1
if classes[x][0] <= c4p_c4p <= classes[x][1]:
occur_c4_c4[x]=occur_c4_c4[x]+1
if classes[x][0] <= c1p_c1p <= classes[x][1]:
occur_c1_c1[x]=occur_c1_c1[x]+1
'''
#for x in range(len(classes)):
# for i in range(len(liste_dist)):
# if classes[x][0] <= liste_dist[i][4] <= classes[x][1]:
# occur_p_p[x]=occur_p_p[x]+1
# if classes[x][0] <= liste_dist[i][5] <= classes[x][1]:
# occur_p_c4[x]=occur_p_c4[x]+1
# if classes[x][0] <= liste_dist[i][6] <= classes[x][1]:
# occur_p_c1[x]=occur_p_c1[x]+1
# if classes[x][0] <= liste_dist[i][7] <= classes[x][1]:
# occur_c4_c4[x]=occur_c4_c4[x]+1
# if classes[x][0] <= liste_dist[i][8] <= classes[x][1]:
# occur_c1_c1[x]=occur_c1_c1[x]+1
#nb_occurs.append([classes[x], occur_p_p[x], occur_p_c1[x], occur_p_c4[x], occur_c1_c1[x], occur_c4_c4[x]])
#df = pd.DataFrame(nb_occurs, columns=["classe", "P-P", "P-C1'", "P-C4'", "C1'-C1'", "C4'-C4'"])
# return df
# nb_occurs.append([classes, occur_p_p, occur_p_c1, occur_p_c4, occur_c1_c1, occur_c4_c4])
# print(nb_occurs)
# return nb_occurs
df
=
pd
.
DataFrame
(
liste_dist
,
columns
=
[
"res1"
,
"resnum1"
,
"res2"
,
"resnum2"
,
"P-P"
,
"P-C4'"
,
"P-C1'"
,
"C4'-C4'"
,
"C1'-C1'"
])
df
.
to_csv
(
runDir
+
"/results/geometry/Pyle/distances/distances_pyle_"
+
name
+
".csv"
)
@trace_unhandled_exceptions
def
count_occur_pyle_dist
(
fpath
):
global
idxQueue
thr_idx
=
idxQueue
.
get
()
setproctitle
(
f
"Worker {thr_idx+1} : Extract occurences of {fpath}"
)
liste
=
os
.
listdir
(
fpath
)
pbar
=
tqdm
(
total
=
len
(
liste
),
position
=
thr_idx
,
desc
=
"Preparing "
,
leave
=
False
)
df
=
pd
.
read_csv
(
os
.
path
.
abspath
(
fpath
+
liste
.
pop
()))
occur_p_p
=
list
(
df
[
"P-P"
])
occur_p_c1
=
list
(
df
[
"P-C1'"
])
occur_p_c4
=
list
(
df
[
"P-C4'"
])
occur_c1_c1
=
list
(
df
[
"C1'-C1'"
])
occur_c4_c4
=
list
(
df
[
"C4'-C4'"
])
nb_occurs
=
[]
for
f
in
range
(
len
(
liste
)):
df
=
pd
.
read_csv
(
os
.
path
.
abspath
(
fpath
+
liste
.
pop
()))
# print(liste[f])
for
k
in
range
(
df
.
shape
[
0
]):
occur_p_p
[
k
]
=
occur_p_p
[
k
]
+
df
[
"P-P"
][
k
]
occur_p_c1
[
k
]
=
occur_p_c1
[
k
]
+
df
[
"P-C1'"
][
k
]
occur_p_c4
[
k
]
=
occur_p_c4
[
k
]
+
df
[
"P-C4'"
][
k
]
occur_c1_c1
[
k
]
=
occur_c1_c1
[
k
]
+
df
[
"C1'-C1'"
][
k
]
occur_c4_c4
[
k
]
=
occur_c4_c4
[
k
]
+
df
[
"C4'-C4'"
][
k
]
pbar
.
update
(
1
)
nb_occurs
=
[
occur_p_p
,
occur_p_c1
,
occur_p_c4
,
occur_c1_c1
,
occur_c4_c4
]
# return(list(df["classe"]), occur_p_p, occur_p_c1, occur_p_c4, occur_c1_c1, occur_c4_c4)
df
=
pd
.
DataFrame
(
nb_occurs
,
columns
=
list
(
df
[
"classe"
]))
df
.
to_csv
(
runDir
+
"/results/geometry/Pyle/classes_dist/occurences_dist.csv"
)
idxQueue
.
put
(
thr_idx
)
# replace the thread index in the queue
setproctitle
(
f
"RNANet statistics.py Worker {thr_idx+1} finished"
)
@trace_unhandled_exceptions
def
GMM_histo
(
data_ori
,
name_data
,
toric
=
False
,
hist
=
True
,
col
=
None
,
save
=
True
)
:
"""
Plot Gaussian-Mixture-Model (with or without histograms)
...
...
@@ -2012,7 +2156,8 @@ def GMM_histo(data_ori, name_data, toric=False, hist=True, col=None, save=True)
else
:
D
=
obs
.
ravel
()
xmin
=
D
.
min
()
xmax
=
min
(
10.0
,
D
.
max
())
#xmax = min(10.0, D.max())
xmax
=
D
.
max
()
x
=
np
.
linspace
(
xmin
,
xmax
,
1000
)
colors
=
[
'red'
,
'blue'
,
'gold'
,
'cyan'
,
'magenta'
,
'white'
,
'black'
,
'green'
]
...
...
@@ -2372,7 +2517,7 @@ def gmm_wadley():
plt
.
savefig
(
"GMM_flat_angles_pyle_model.png"
)
plt
.
close
()
# Torsion an
f
les
# Torsion an
g
les
eta
=
[]
theta
=
[]
eta_prime
=
[]
...
...
@@ -2420,6 +2565,58 @@ def gmm_wadley():
os
.
chdir
(
runDir
)
setproctitle
(
"GMM (Pyle model) finished"
)
def
gmm_pyle_type
(
ntpair
,
data
):
setproctitle
(
f
"GMM (Pyle {ntpair} )"
)
os
.
makedirs
(
runDir
+
"/results/figures/GMM/Pyle/distances/"
,
exist_ok
=
True
)
os
.
chdir
(
runDir
+
"/results/figures/GMM/Pyle/distances/"
)
p_p
=
list
(
data
[
"P-P"
][
~
np
.
isnan
(
data
[
"P-P"
])])
p_c4p
=
list
(
data
[
"P-C4'"
][
~
np
.
isnan
(
data
[
"P-C4'"
])])
p_c1p
=
list
(
data
[
"P-C1'"
][
~
np
.
isnan
(
data
[
"P-C1'"
])])
c4p_c4p
=
list
(
data
[
"C4'-C4'"
][
~
np
.
isnan
(
data
[
"C4'-C4'"
])])
c1p_c1p
=
list
(
data
[
"C1'-C1'"
][
~
np
.
isnan
(
data
[
"C1'-C1'"
])])
print
(
len
(
p_p
))
# res2=list(data["resnum2"])
# res1=list(data["resnum1"])
# diff=[]
# for i in range(len(res1)):
# diff.append(res2[i]-res1[i])
# print(diff[:100])
GMM_histo
(
p_p
,
f
"Distance P-P between {ntpair} tips for {str(len(p_p))} values"
,
toric
=
False
,
hist
=
False
,
col
=
"cyan"
)
GMM_histo
(
p_c4p
,
f
"Distance P-C4' between {ntpair} tips"
,
toric
=
False
,
hist
=
False
,
col
=
"tomato"
)
GMM_histo
(
p_c1p
,
f
"Distance P-C1' between {ntpair} tips"
,
toric
=
False
,
hist
=
False
,
col
=
"goldenrod"
)
GMM_histo
(
c4p_c4p
,
f
"Distance C4'-C4' between {ntpair} tips"
,
toric
=
False
,
hist
=
False
,
col
=
"magenta"
)
GMM_histo
(
c1p_c1p
,
f
"Distance C1'-C1' between {ntpair} tips"
,
toric
=
False
,
hist
=
False
,
col
=
"black"
)
# GMM_histo(diff, f"Gap between {ntpair} tips", toric=False, hist=False, col="tomato")
plt
.
xlabel
(
"Distance (Angströms)"
)
# plt.xlabel("Number of residues")
plt
.
ylabel
(
"Distance (Angströms)"
)
plt
.
title
(
f
"GMM of distances for {ntpair} "
,
fontsize
=
10
)
# plt.savefig(f"Longueurs_Pyle_{ntpair}.png" )
plt
.
savefig
(
f
"Distances_Pyle_{ntpair}.png"
)
plt
.
close
()
setproctitle
(
f
"GMM (Pyle {ntpair} distances) finished"
)
def
gmm_pyle
():
setproctitle
(
"GMM (Pyle model)"
)
df
=
pd
.
read_csv
(
os
.
path
.
abspath
(
runDir
+
"/results/geometry/Pyle/distances/distances.csv"
))
# dist = ["P-P", "P-C4'", "P-C1'", "C4'-C4'", "C1'-C1'"]
data
=
df
if
len
(
data
):
for
b1
in
[
'A'
,
'C'
,
'G'
,
'U'
]:
for
b2
in
[
'A'
,
'C'
,
'G'
,
'U'
]:
thisbases
=
data
[(
data
.
res1
==
b1
)
&
(
data
.
res2
==
b2
)]
if
len
(
thisbases
):
gmm_pyle_type
(
b1
+
b2
,
thisbases
)
@trace_unhandled_exceptions
def
gmm_hrna
():
"""
...
...
@@ -2669,7 +2866,11 @@ def merge_jsons():
os
.
remove
(
f
)
except
FileNotFoundError
:
pass
@trace_unhandled_exceptions
def
loop
(
f
):
return
pd
.
read_csv
(
f
)
@trace_unhandled_exceptions
def
concat_dataframes
(
fpath
,
outfilename
):
"""
...
...
@@ -2682,14 +2883,23 @@ def concat_dataframes(fpath, outfilename):
liste
=
os
.
listdir
(
fpath
)
pbar
=
tqdm
(
total
=
len
(
liste
),
position
=
thr_idx
,
desc
=
"Preparing "
+
outfilename
,
leave
=
False
)
df_tot
=
pd
.
read_csv
(
os
.
path
.
abspath
(
fpath
+
liste
.
pop
()))
df_tot
=
pd
.
read_csv
(
os
.
path
.
abspath
(
fpath
+
liste
.
pop
()),
engine
=
"python"
)
#df=Parallel(n_jobs=-1, verbose=20)(delayed(loop)(os.path.abspath(fpath+liste[f])) for f in range (len(liste)))
#except :
# print(liste[f])
pbar
.
update
(
1
)
for
f
in
range
(
len
(
liste
)):
df
=
pd
.
read_csv
(
os
.
path
.
abspath
(
fpath
+
liste
.
pop
()))
# try :
df
=
pd
.
read_csv
(
os
.
path
.
abspath
(
fpath
+
liste
.
pop
()),
engine
=
'python'
)
# except :
# print(liste[f])
# continue
df_tot
=
pd
.
concat
([
df_tot
,
df
],
ignore_index
=
True
)
pbar
.
update
(
1
)
#df = pd.concat(df, ignore_index=True)
#pbar.update(1)
#df.to_csv(fpath + outfilename)
df_tot
.
to_csv
(
fpath
+
outfilename
)
idxQueue
.
put
(
thr_idx
)
# replace the thread index in the queue
setproctitle
(
f
"RNANet statistics.py Worker {thr_idx+1} finished"
)
...
...
@@ -2789,6 +2999,7 @@ if __name__ == "__main__":
elif
opt
==
'--wadley'
:
DO_WADLEY_ANALYSIS
=
True
os
.
makedirs
(
runDir
+
"/results/geometry/Pyle/distances/"
,
exist_ok
=
True
)
os
.
makedirs
(
runDir
+
"/results/geometry/Pyle/classes_dist/"
,
exist_ok
=
True
)
os
.
makedirs
(
runDir
+
"/results/geometry/Pyle/angles/"
,
exist_ok
=
True
)
os
.
makedirs
(
runDir
+
"/results/geometry/Pyle/pseudotorsions/"
,
exist_ok
=
True
)
os
.
makedirs
(
runDir
+
"/results/figures/GMM/Pyle/distances/"
,
exist_ok
=
True
)
...
...
@@ -2853,9 +3064,9 @@ if __name__ == "__main__":
joblist
=
[]
# Do eta/theta plots
if
n_unmapped_chains
and
DO_WADLEY_ANALYSIS
:
joblist
.
append
(
Job
(
function
=
reproduce_wadley_results
,
args
=
(
1
,
False
,
(
1
,
4
),
res_thr
)))
joblist
.
append
(
Job
(
function
=
reproduce_wadley_results
,
args
=
(
4
,
False
,
(
1
,
4
),
res_thr
)))
#
if n_unmapped_chains and DO_WADLEY_ANALYSIS:
#
joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr)))
#
joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr)))
# Do distance matrices for each family excl. LSU/SSU (will be processed later)
if
DO_AVG_DISTANCE_MATRIX
:
...
...
@@ -2881,21 +3092,24 @@ if __name__ == "__main__":
# Do geometric measures on all chains
#print(liste_repres('/home/data/RNA/3D/latest_nr_list_4.0A.csv'))
# if n_unmapped_chains:
# print(measure_from_structure(os.listdir(path_to_3D_data + "rna_only")[0]))
if
n_unmapped_chains
:
# os.makedirs(runDir+"/results/geometry/all-atoms/distances/", exist_ok=True)
# liste_struct = os.listdir(path_to_3D_data + "rna_only")
# liste_struct = os.listdir(path_to_3D_data + "rna_only")
liste_struct
=
liste_repres
(
'/home/data/RNA/3D/latest_nr_list_4.0A.csv'
)
# if '4zdo_1_E.cif' in liste_struct:
# liste_struct.remove('4zdo_1_E.cif') # weird cases to remove for now
# if '4zdp_1_E.cif' in liste_struct:
# liste_struct.remove('4zdp_1_E.cif')
#
for f in liste_struct:
#
if path.isfile(path_to_3D_data + "datapoints/" + f.split('.')[0]):
#
joblist.append(Job(function=measure_from_structure, args=(f,), how_many_in_parallel=nworkers)) # All-atom distances
for
f
in
liste_struct
:
if
path
.
isfile
(
path_to_3D_data
+
"datapoints/"
+
f
.
split
(
'.'
)[
0
]):
joblist
.
append
(
Job
(
function
=
measure_from_structure
,
args
=
(
f
,),
how_many_in_parallel
=
nworkers
))
# All-atom distances
#
process_jobs(joblist)
process_jobs
(
joblist
)
#count_occur_pyle_dist(runDir + '/results/geometry/Pyle/classes_dist/')
# Now process the memory-heavy tasks family by family
if
DO_AVG_DISTANCE_MATRIX
:
for
f
in
LSU_set
:
...
...
@@ -2913,12 +3127,13 @@ if __name__ == "__main__":
# per_chain_stats() # per chain base frequencies en basepair types
# seq_idty() # identity matrices from pre-computed .npy matrices
# stats_pairs()
concat_dataframes
(
runDir
+
'/results/geometry/Pyle/distances/'
,
'distances.csv'
)
if
n_unmapped_chains
:
# general_stats()
os
.
makedirs
(
runDir
+
"/results/figures/GMM/"
,
exist_ok
=
True
)
os
.
makedirs
(
runDir
+
"/results/geometry/json/"
,
exist_ok
=
True
)
# joblist = []
joblist
=
[]
# joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/Pyle/distances/', 'distances.csv')))
# joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/all-atoms/distances/', 'dist_atoms.csv')))
# if DO_HIRE_RNA_MEASURES:
# joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/HiRE-RNA/distances/', 'dist_atoms_hire_RNA.csv')))
...
...
@@ -2935,9 +3150,10 @@ if __name__ == "__main__":
if
DO_HIRE_RNA_MEASURES
:
# joblist.append(Job(function=gmm_hrna, args=()))
joblist
.
append
(
Job
(
function
=
gmm_hrna_basepairs
,
args
=
()))
if
DO_WADLEY_ANALYSIS
:
joblist
.
append
(
Job
(
function
=
gmm_wadley
,
args
=
()))
# if DO_WADLEY_ANALYSIS:
# joblist.append(Job(function=gmm_wadley, args=()))
# joblist.append(Job(function=gmm_pyle, args=()))
if
len
(
joblist
):
process_jobs
(
joblist
)
merge_jsons
()
#
merge_jsons()
...
...
Please
register
or
login
to post a comment