Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Louis BECQUEY
/
RNANet
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Louis BECQUEY
2020-03-19 11:51:39 +0100
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
47f8a2ba6da54f0a107340870c807df481d46d34
47f8a2ba
1 parent
c9882ee5
Lower RAM usage
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
20 additions
and
9 deletions
RNAnet.py
statistics.py
RNAnet.py
View file @
47f8a2b
...
...
@@ -436,14 +436,14 @@ class Chain:
return
# Creating a df for easy saving to CSV
df
.
to_csv
(
path_to_3D_data
+
f
"annotations/{self.chain_label}.{self.rfam}.csv"
)
df
.
to_csv
(
path_to_3D_data
+
f
"annotations/{self.chain_label}.{self.rfam
_fam
}.csv"
)
del
df
print
(
"
\t
> Saved"
,
self
.
chain_label
,
f
"annotations to CSV.
\t\t
{validsymb}"
,
flush
=
True
)
else
:
print
(
"
\t
> Computing"
,
self
.
chain_label
,
f
"annotations...
\t
{validsymb}
\t
(already done)"
,
flush
=
True
)
# Now load data from the CSV file
d
=
pd
.
read_csv
(
path_to_3D_data
+
f
"annotations/{self.chain_label}.{self.rfam}.csv"
,
index_col
=
0
)
d
=
pd
.
read_csv
(
path_to_3D_data
+
f
"annotations/{self.chain_label}.{self.rfam
_fam
}.csv"
,
index_col
=
0
)
self
.
seq
=
""
.
join
(
d
.
nt_code
.
values
)
self
.
aligned_seq
=
""
.
join
(
d
.
nt_align_code
.
values
)
self
.
length
=
len
([
x
for
x
in
self
.
aligned_seq
if
x
!=
"-"
])
...
...
@@ -561,11 +561,9 @@ class Chain:
'alpha'
,
'beta'
,
'gamma'
,
'delta'
,
'epsilon'
,
'zeta'
,
'epsilon_zeta'
,
'chi'
,
'bb_type'
,
'glyco_bond'
,
'form'
,
'ssZp'
,
'Dp'
,
'eta'
,
'theta'
,
'eta_prime'
,
'theta_prime'
,
'eta_base'
,
'theta_base'
,
'v0'
,
'v1'
,
'v2'
,
'v3'
,
'v4'
,
'amplitude'
,
'phase_angle'
,
'puckering'
,
'P_x'
,
'P_y'
,
'P_z'
,
'C5prime_x'
,
'C5prime_y'
,
'C5prime_z'
'v0'
,
'v1'
,
'v2'
,
'v3'
,
'v4'
,
'amplitude'
,
'phase_angle'
,
'puckering'
]
self
.
data
=
self
.
data
[
cols
]
self
.
save
()
# save to file
def
save
(
self
,
fformat
=
"csv"
):
# save to file
...
...
@@ -1310,6 +1308,7 @@ def alignment_nt_stats(f):
# Compute statistics per column
pssm
=
BufferingSummaryInfo
(
align
)
.
get_pssm
(
f
,
thr_idx
)
frequencies
=
np
.
array
([
summarize_position
(
pssm
[
i
])
for
i
in
range
(
align
.
get_alignment_length
())
])
.
T
del
pssm
# For each sequence, find the right chain and save the PSSMs inside.
pbar
=
tqdm
(
total
=
len
(
chains_ids
),
position
=
thr_idx
+
1
,
desc
=
f
"Worker {thr_idx+1}: {f} chains"
,
leave
=
False
)
...
...
@@ -1320,11 +1319,18 @@ def alignment_nt_stats(f):
# get the right 3D chain:
idx
=
chains_ids
.
index
(
s
.
id
)
# call its method to set its frequencies, and save it
list_of_chains
[
idx
]
.
set_freqs_from_aln
(
s
.
seq
,
frequencies
)
list_of_chains
[
idx
]
.
save
(
fformat
=
'csv'
)
del
list_of_chains
[
idx
]
# saves a bit of memory because of the Chain object sizes
del
chains_ids
[
idx
]
# to keep indexes aligned with list_of_chains
pbar
.
update
(
1
)
pbar
.
close
()
pbar
.
close
()
del
rfam_acc_to_download
[
f
]
# We won't need this family's chain objects anymore, free up
idxQueue
.
put
(
thr_idx
)
# replace the thread index in the queue
return
0
...
...
@@ -1551,7 +1557,8 @@ if __name__ == "__main__":
pdb_chain_id
=
nr
[
2
]
.
upper
()
chain_label
=
f
"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}"
all_chains
.
append
(
Chain
(
pdb_id
,
pdb_model
,
pdb_chain_id
,
chain_label
))
del
full_structures_list
n_chains
=
len
(
all_chains
)
print
(
">"
,
validsymb
,
n_chains
,
"RNA chains of interest."
)
...
...
@@ -1586,6 +1593,8 @@ if __name__ == "__main__":
print
(
f
"> Loaded {len(loaded_chains)} RNA chains ({len(all_chains) - len(loaded_chains)} errors)."
)
del
all_chains
# Here ends its utility, so let's free some memory
del
joblist
del
results
if
not
HOMOLOGY
:
# Save chains to file
...
...
@@ -1613,7 +1622,7 @@ if __name__ == "__main__":
rfam_acc_to_download
[
c
.
rfam_fam
]
.
append
(
c
)
mappings_list
[
c
.
rfam_fam
]
.
append
(
c
.
chain_label
)
pd
.
DataFrame
.
from_dict
(
mappings_list
,
orient
=
'index'
)
.
transpose
()
.
to_csv
(
path_to_seq_data
+
"realigned/mappings_list.csv"
)
exit
()
del
mappings_list
print
(
f
"> Identified {len(rfam_acc_to_download.keys())} families to download and re-align with the crystals' sequences:"
)
# Download the covariance models for all families
...
...
@@ -1632,6 +1641,7 @@ if __name__ == "__main__":
for
f
in
fam_list
:
line
=
fam_stats
[
fam_stats
[
"rfam_acc"
]
==
f
]
print
(
f
"
\t
> {f}: {line.n_seq.values[0]} Rfam hits + {line.n_pdb_seqs.values[0]} PDB sequences to realign"
)
del
fam_stats
# Download the sequences
for
f
in
fam_list
:
...
...
@@ -1650,6 +1660,7 @@ if __name__ == "__main__":
# Execute the jobs
execute_joblist
(
fulljoblist
,
printstats
=
True
)
# printstats=True will show a summary of time/memory usage of the jobs
del
fulljoblist
# ==========================================================================================
# Now compute statistics on base variants at each position of every 3D chain
...
...
@@ -1669,7 +1680,7 @@ if __name__ == "__main__":
# Start a process pool to dispatch the RNA families,
# over multiple CPUs (one family by CPU)
p
=
Pool
(
initializer
=
tqdm
.
set_lock
,
initargs
=
(
tqdm
.
get_lock
(),),
processes
=
ncores
)
p
=
Pool
(
initializer
=
tqdm
.
set_lock
,
initargs
=
(
tqdm
.
get_lock
(),),
processes
=
int
(
ncores
/
2
)
)
fam_pbar
=
tqdm
(
total
=
len
(
fam_list
),
desc
=
"RNA families"
,
position
=
0
,
leave
=
True
)
for
i
,
_
in
enumerate
(
p
.
imap_unordered
(
alignment_nt_stats
,
fam_list
)):
# Apply alignment_nt_stats to each RNA family
...
...
statistics.py
View file @
47f8a2b
This diff is collapsed. Click to expand it.
Please
register
or
login
to post a comment