Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Louis BECQUEY
/
RNANetLegacy
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Network
Create a new issue
Builds
Commits
Authored by
Louis BECQUEY
2020-03-17 18:42:48 +0000
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
c9882ee58e479243ac57b04a220c612b1a29f493
c9882ee5
1 parent
8800ab7b
Mapping inference from BGSU lists
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
410 additions
and
170 deletions
RNAnet.py
results/clusters_rot180.png
statistics.py
RNAnet.py
View file @
c9882ee
#!/usr/bin/python3.8
import
numpy
as
np
import
pandas
as
pd
import
concurrent.futures
,
Bio.PDB.StructureBuilder
,
copy
,
getopt
,
gzip
,
io
,
json
,
os
,
psutil
,
re
,
requests
,
sqlalchemy
,
subprocess
,
sys
,
time
,
warnings
import
concurrent.futures
,
Bio.PDB.StructureBuilder
,
getopt
,
gzip
,
io
,
json
,
os
,
psutil
,
re
,
requests
,
sqlalchemy
,
subprocess
,
sys
,
time
,
warnings
from
Bio
import
AlignIO
,
SeqIO
from
Bio.PDB
import
MMCIFParser
from
Bio.PDB.mmcifio
import
MMCIFIO
...
...
@@ -20,6 +20,8 @@ from time import sleep
from
tqdm
import
tqdm
from
tqdm.contrib.concurrent
import
process_map
pd
.
set_option
(
'display.max_rows'
,
None
)
m
=
Manager
()
running_stats
=
m
.
list
()
running_stats
.
append
(
0
)
# n_launched
...
...
@@ -37,6 +39,7 @@ CRYSTAL_RES = "4.0"
KEEP_HETATM
=
False
FILL_GAPS
=
True
HOMOLOGY
=
True
USE_KNOWN_ISSUES
=
True
class
NtPortionSelector
(
object
):
"""Class passed to MMCIFIO to select some chain portions in an MMCIF file.
...
...
@@ -64,9 +67,9 @@ class NtPortionSelector(object):
if
hetatm_flag
in
[
"W"
,
"H_MG"
]:
return
int
(
KEEP_HETATM
)
# I don't really know what this is but the doc said to warn:
if
icode
!=
" "
:
warn
(
f
"icode {icode} at position {resseq}
\t\t
"
)
#
#
I don't really know what this is but the doc said to warn:
#
if icode != " ":
#
warn(f"icode {icode} at position {resseq}\t\t")
# Accept the residue if it is in the right interval:
return
int
(
self
.
start
<=
resseq
<=
self
.
end
)
...
...
@@ -80,9 +83,10 @@ class NtPortionSelector(object):
# Accept all atoms otherwise.
return
1
class
BufferingSummaryInfo
(
AlignInfo
.
SummaryInfo
):
def
get_pssm
(
self
,
family
,
index
):
def
get_pssm
(
self
,
family
,
index
):
"""Create a position specific score matrix object for the alignment.
This creates a position specific score matrix (pssm) which is an
...
...
@@ -97,17 +101,17 @@ class BufferingSummaryInfo(AlignInfo.SummaryInfo):
for
residue_num
in
tqdm
(
range
(
self
.
alignment
.
get_alignment_length
()),
position
=
index
+
1
,
desc
=
f
"Worker {index+1}: {family}"
,
leave
=
False
):
score_dict
=
self
.
_get_base_letters
(
"ACGUN"
)
for
record
in
self
.
alignment
:
this_residue
=
record
.
seq
[
residue_num
]
this_residue
=
record
.
seq
[
residue_num
]
.
upper
()
if
this_residue
not
in
"-."
:
try
:
score_dict
[
this_residue
]
+=
1.0
except
KeyError
:
if
this_residue
in
"acgun"
:
warn
(
f
"Found {this_residue} in {family} alignment..."
)
#
if this_residue in "acgun":
#
warn(f"Found {this_residue} in {family} alignment...")
score_dict
[
this_residue
]
=
1.0
pssm_info
.
append
((
'*'
,
score_dict
))
return
AlignInfo
.
PSSM
(
pssm_info
)
pssm_info
.
append
((
'*'
,
score_dict
))
return
AlignInfo
.
PSSM
(
pssm_info
)
class
Chain
:
...
...
@@ -115,24 +119,24 @@ class Chain:
Chains accumulate information through this scipt, and are saved to files at the end of major steps."""
def
__init__
(
self
,
nrlist_code
):
nr
=
nrlist_code
.
split
(
'|'
)
self
.
pdb_id
=
nr
[
0
]
.
lower
()
# PDB ID
self
.
pdb_model
=
int
(
nr
[
1
])
# model ID, starting at 1
self
.
pdb_chain_id
=
nr
[
2
]
.
upper
()
# chain ID (mmCIF), multiple letters
def
__init__
(
self
,
pdb_id
,
pdb_model
,
pdb_chain_id
,
chain_label
,
rfam
=
""
,
pdb_start
=
None
,
pdb_end
=
None
):
self
.
pdb_id
=
pdb_id
# PDB ID
self
.
pdb_model
=
int
(
pdb_model
)
# model ID, starting at 1
self
.
pdb_chain_id
=
pdb_chain_id
# chain ID (mmCIF), multiple letters
self
.
pdb_start
=
pdb_start
# if portion of chain, the start number (relative to the chain, not residue numbers)
self
.
pdb_end
=
pdb_end
# if portion of chain, the start number (relative to the chain, not residue numbers)
self
.
reversed
=
False
# wether pdb_end > pdb_start in the Rfam mapping
self
.
chain_label
=
""
# chain pretty name
self
.
chain_label
=
chain_label
# chain pretty name
self
.
full_mmCIFpath
=
""
# path to the source mmCIF structure
self
.
file
=
""
# path to the 3D PDB file
self
.
rfam_fam
=
""
# mapping to an RNA family
self
.
rfam_fam
=
rfam
# mapping to an RNA family
self
.
seq
=
""
# sequence with modified nts
self
.
aligned_seq
=
""
# sequence with modified nts replaced, but gaps can exist
self
.
length
=
-
1
# length of the sequence (missing residues are not counted)
self
.
full_length
=
-
1
# length of the chain extracted from source structure ([start; stop] interval)
self
.
delete_me
=
False
# an error occured during production/parsing
self
.
error_messages
=
""
# Error message(s) if any
self
.
frequencies
=
np
.
zeros
((
5
,
0
))
# frequencies of nt at every position: A,C,G,U,Other
self
.
data3D
=
None
# Pandas DataFrame with all the 3D data extracted by DSSR.
self
.
data
=
None
# Pandas DataFrame with all the 3D data extracted by DSSR.
def
__str__
(
self
):
return
self
.
pdb_id
+
'['
+
str
(
self
.
pdb_model
)
+
"]-"
+
self
.
pdb_chain_id
...
...
@@ -168,12 +172,12 @@ class Chain:
self
.
delete_me
=
True
self
.
error_messages
=
f
"Error downloading {url}"
def
extract_portion
(
self
,
filename
,
pdb_start
,
pdb_end
):
def
extract_portion
(
self
):
""" Extract the part which is mapped to Rfam from the main CIF file and save it to another file.
"""
status
=
f
"
\t
> Extract {
pdb_start}-{
pdb_end} atoms from {self.pdb_id}-{self.pdb_chain_id}
\t
"
self
.
file
=
path_to_3D_data
+
"rna_mapped_to_Rfam/"
+
filename
+
".cif"
status
=
f
"
\t
> Extract {
self.pdb_start}-{self.
pdb_end} atoms from {self.pdb_id}-{self.pdb_chain_id}
\t
"
self
.
file
=
path_to_3D_data
+
"rna_mapped_to_Rfam/"
+
self
.
chain_label
+
".cif"
# Check if file exists, if yes, abort (do not recompute)
if
os
.
path
.
exists
(
self
.
file
):
...
...
@@ -181,13 +185,13 @@ class Chain:
return
model_idx
=
self
.
pdb_model
-
(
self
.
pdb_model
>
0
)
# because arrays start at 0, models start at 1
pdb_start
=
int
(
pdb_start
)
pdb_end
=
int
(
pdb_end
)
pdb_start
=
int
(
self
.
pdb_start
)
pdb_end
=
int
(
self
.
pdb_end
)
with
warnings
.
catch_warnings
():
#
TODO: check if this with and warnings catch is still useful since i moved to CIF files
warnings
.
simplefilter
(
'ignore'
,
PDBConstructionWarning
)
# ignore the PDB problems
#
Ignore the PDB problems. This mostly warns that some chain is discontinuous.
warnings
.
simplefilter
(
'ignore'
,
PDBConstructionWarning
)
# Check if the whole mmCIF file exists. If not, abort.
if
self
.
full_mmCIFpath
==
""
:
...
...
@@ -221,6 +225,49 @@ class Chain:
ioobj
.
save
(
self
.
file
,
sel
)
print
(
status
+
f
"
\t
{validsymb}"
)
def
extract_all
(
self
):
""" Extract the RNA chain from the main CIF file and save it to another file.
"""
status
=
f
"
\t
> Extract {self.pdb_id}-{self.pdb_chain_id}
\t
"
self
.
file
=
path_to_3D_data
+
"rna_only/"
+
self
.
chain_label
+
".cif"
# Check if file exists, if yes, abort (do not recompute)
if
os
.
path
.
exists
(
self
.
file
):
print
(
status
+
f
"
\t
{validsymb}
\t
(already done)"
,
flush
=
True
)
return
model_idx
=
self
.
pdb_model
-
(
self
.
pdb_model
>
0
)
# because arrays start at 0, models start at 1
with
warnings
.
catch_warnings
():
# Ignore the PDB problems. This mostly warns that some chain is discontinuous.
warnings
.
simplefilter
(
'ignore'
,
PDBConstructionWarning
)
# ignore the PDB problems
# Check if the whole mmCIF file exists. If not, abort.
if
self
.
full_mmCIFpath
==
""
:
print
(
status
+
f
"
\t\U0000274E\t\033
[31mError with CIF file of {self.pdb_id} !
\033
[0m"
,
flush
=
True
)
self
.
delete_me
=
True
self
.
error_messages
=
f
"Error with CIF file of {self.pdb_id}"
return
# Load the whole mmCIF into a Biopython structure object:
s
=
mmcif_parser
.
get_structure
(
self
.
pdb_id
,
self
.
full_mmCIFpath
)
# Extract the desired chain
c
=
s
[
model_idx
][
self
.
pdb_chain_id
]
# Define a selection
first_number
=
c
.
child_list
[
0
]
.
get_id
()[
1
]
# the chain's first residue is numbered 'first_number'
last_number
=
c
.
child_list
[
-
1
]
.
get_id
()[
1
]
# the chain's last residue number
sel
=
NtPortionSelector
(
model_idx
,
self
.
pdb_chain_id
,
first_number
,
last_number
)
# Save that selection on the mmCIF object s to file
ioobj
=
MMCIFIO
()
ioobj
.
set_structure
(
s
)
ioobj
.
save
(
self
.
file
,
sel
)
print
(
status
+
f
"
\t
{validsymb}"
)
def
set_rfam
(
self
,
rfam
):
""" Rember the Rfam mapping for this chain.
...
...
@@ -232,7 +279,7 @@ class Chain:
""" Runs DSSR to annotate the 3D chain and get various information about it. """
# Check if the file exists. If no, compute it.
if
not
os
.
path
.
exists
(
path_to_3D_data
+
f
"
pseudotorsions/{self.chain_label
}.csv"
):
if
not
os
.
path
.
exists
(
path_to_3D_data
+
f
"
annotations/{self.chain_label}.{self.rfam_fam
}.csv"
):
# run DSSR (you need to have it in your $PATH, follow x3dna installation instructions)
output
=
subprocess
.
run
(
...
...
@@ -240,7 +287,6 @@ class Chain:
stdout
=
output
.
stdout
.
decode
(
'utf-8'
)
# this contains the results in JSON format, or is empty if there are errors
stderr
=
output
.
stderr
.
decode
(
'utf-8'
)
# this contains the evenutal errors
try
:
if
"exception"
in
stderr
:
# DSSR is unable to parse the chain.
...
...
@@ -266,16 +312,22 @@ class Chain:
resnum_start
=
int
(
nts
[
0
][
"nt_resnum"
])
df
=
pd
.
DataFrame
(
nts
)
# remove low pertinence or undocumented descriptors
df
=
df
.
drop
([
'summary'
,
'chain_name'
,
'index'
,
'v0'
,
'v1'
,
'v2'
,
'v3'
,
'v4'
,
'splay_angle'
,
df
=
df
.
drop
([
'summary'
,
'chain_name'
,
'index'
,
'splay_angle'
,
'splay_distance'
,
'splay_ratio'
,
'sugar_class'
,
'amplitude'
,
'phase_angle'
],
axis
=
1
)
df
[
'P_x'
]
=
[
float
(
i
[
0
])
if
i
[
0
]
is
not
None
else
np
.
NaN
for
i
in
df
[
'P_xyz'
]
]
#
df
[
'P_y'
]
=
[
float
(
i
[
1
])
if
i
[
1
]
is
not
None
else
np
.
NaN
for
i
in
df
[
'P_xyz'
]
]
#
df
[
'P_z'
]
=
[
float
(
i
[
2
])
if
i
[
2
]
is
not
None
else
np
.
NaN
for
i
in
df
[
'P_xyz'
]
]
# Flatten the
df
[
'C5prime_x'
]
=
[
float
(
i
[
0
])
if
i
[
0
]
is
not
None
else
np
.
NaN
for
i
in
df
[
'C5prime_xyz'
]
]
# Python dictionary
df
[
'C5prime_y'
]
=
[
float
(
i
[
1
])
if
i
[
1
]
is
not
None
else
np
.
NaN
for
i
in
df
[
'C5prime_xyz'
]
]
#
df
[
'C5prime_z'
]
=
[
float
(
i
[
2
])
if
i
[
2
]
is
not
None
else
np
.
NaN
for
i
in
df
[
'C5prime_xyz'
]
]
#
'bin'
,
'suiteness'
,
'cluster'
],
axis
=
1
)
# df['P_x'] = [ float(i[0]) if i[0] is not None else np.NaN for i in df['P_xyz'] ] #
# df['P_y'] = [ float(i[1]) if i[1] is not None else np.NaN for i in df['P_xyz'] ] #
# df['P_z'] = [ float(i[2]) if i[2] is not None else np.NaN for i in df['P_xyz'] ] # Flatten the
# df['C5prime_x'] = [ float(i[0]) if i[0] is not None else np.NaN for i in df['C5prime_xyz'] ] # Python dictionary
# df['C5prime_y'] = [ float(i[1]) if i[1] is not None else np.NaN for i in df['C5prime_xyz'] ] #
# df['C5prime_z'] = [ float(i[2]) if i[2] is not None else np.NaN for i in df['C5prime_xyz'] ] #
# Convert angles to radians
df
.
loc
[:,[
'alpha'
,
'beta'
,
'gamma'
,
'delta'
,
'epsilon'
,
'zeta'
,
'epsilon_zeta'
,
'chi'
,
'v0'
,
'v1'
,
'v2'
,
'v3'
,
'v4'
,
'eta'
,
'theta'
,
'eta_prime'
,
'theta_prime'
,
'eta_base'
,
'theta_base'
,
'phase_angle'
]]
*=
np
.
pi
/
180.0
# mapping [-pi, pi] into [0, 2pi]
df
.
loc
[:,[
'alpha'
,
'beta'
,
'gamma'
,
'delta'
,
'epsilon'
,
'zeta'
,
'epsilon_zeta'
,
'chi'
,
'v0'
,
'v1'
,
'v2'
,
'v3'
,
'v4'
,
'eta'
,
'theta'
,
'eta_prime'
,
'theta_prime'
,
'eta_base'
,
'theta_base'
,
'phase_angle'
]]
%=
(
2.0
*
np
.
pi
)
# Add a sequence column just for the alignments
df
[
'nt_align_code'
]
=
[
str
(
x
)
.
upper
()
...
...
@@ -305,8 +357,8 @@ class Chain:
# Iterate over pairs to identify base-base interactions
res_ids
=
list
(
df
[
'nt_id'
])
paired
=
[
0
]
*
l
res_ids
=
list
(
df
[
'nt_id'
])
# things like "chainID.C4, chainID.U5"
paired
=
[
"0"
]
*
l
pair_type_LW
=
[
''
]
*
l
pair_type_DSSR
=
[
''
]
*
l
interacts
=
[
0
]
*
l
...
...
@@ -318,14 +370,24 @@ class Chain:
if
nt1
in
res_ids
and
nt2
in
res_ids
:
nt1_idx
=
res_ids
.
index
(
nt1
)
nt2_idx
=
res_ids
.
index
(
nt2
)
paired
[
nt1_idx
]
=
nt2_idx
+
1
paired
[
nt2_idx
]
=
nt1_idx
+
1
if
paired
[
nt1_idx
]
==
"0"
:
paired
[
nt1_idx
]
=
str
(
nt2_idx
+
1
)
pair_type_LW
[
nt1_idx
]
=
p
[
"LW"
]
pair_type_DSSR
[
nt1_idx
]
=
p
[
"DSSR"
]
else
:
paired
[
nt1_idx
]
+=
','
+
str
(
nt2_idx
+
1
)
pair_type_LW
[
nt1_idx
]
+=
','
+
p
[
"LW"
]
pair_type_DSSR
[
nt1_idx
]
+=
','
+
p
[
"DSSR"
]
if
paired
[
nt2_idx
]
==
"0"
:
paired
[
nt2_idx
]
=
str
(
nt1_idx
+
1
)
pair_type_LW
[
nt2_idx
]
=
p
[
"LW"
]
pair_type_DSSR
[
nt2_idx
]
=
p
[
"DSSR"
]
else
:
paired
[
nt2_idx
]
+=
','
+
str
(
nt1_idx
+
1
)
pair_type_LW
[
nt2_idx
]
+=
','
+
p
[
"LW"
]
pair_type_DSSR
[
nt2_idx
]
+=
','
+
p
[
"DSSR"
]
interacts
[
nt1_idx
]
+=
1
interacts
[
nt2_idx
]
+=
1
pair_type_LW
[
nt1_idx
]
=
p
[
"LW"
]
pair_type_LW
[
nt2_idx
]
=
p
[
"LW"
]
pair_type_DSSR
[
nt1_idx
]
=
p
[
"DSSR"
]
pair_type_DSSR
[
nt2_idx
]
=
p
[
"DSSR"
]
elif
nt1
in
res_ids
:
nt1_idx
=
res_ids
.
index
(
nt1
)
interacts
[
nt1_idx
]
+=
1
...
...
@@ -335,26 +397,7 @@ class Chain:
df
[
'paired'
]
=
paired
df
[
'pair_type_LW'
]
=
pair_type_LW
df
[
'pair_type_DSSR'
]
=
pair_type_DSSR
# Iterate over multiplets to identify base-base interactions
if
"multiplets"
in
json_object
.
keys
():
multiplets
=
json_object
[
"multiplets"
]
for
m
in
multiplets
:
nts
=
m
[
"nts_long"
]
.
split
(
','
)
# iterate over the nts of a multiplet
for
j
,
nt
in
enumerate
(
nts
):
# if the nt is in that chain:
if
nt
in
res_ids
:
i
=
res_ids
.
index
(
nt
)
# iterate over those other nts
for
o
in
nts
[:
j
]
+
nts
[
j
+
1
:]:
if
o
in
res_ids
and
str
(
res_ids
.
index
(
o
)
+
1
)
not
in
str
(
df
[
'paired'
][
i
]):
# and it's not already in 'paired'
df
.
loc
[
i
,
'paired'
]
=
str
(
df
[
'paired'
][
i
])
+
','
+
str
(
res_ids
.
index
(
o
)
+
1
)
interacts
[
i
]
=
len
(
str
(
df
[
'paired'
][
i
])
.
split
(
','
))
df
[
'Ninteract'
]
=
interacts
df
=
df
.
drop
([
'C5prime_xyz'
,
'P_xyz'
,
'nt_id'
],
axis
=
1
)
# remove now useless descriptors
if
self
.
reversed
:
...
...
@@ -393,19 +436,19 @@ class Chain:
return
# Creating a df for easy saving to CSV
df
.
to_csv
(
path_to_3D_data
+
f
"
pseudotorsions/{self.chain_label
}.csv"
)
df
.
to_csv
(
path_to_3D_data
+
f
"
annotations/{self.chain_label}.{self.rfam
}.csv"
)
del
df
print
(
"
\t
> Saved"
,
self
.
chain_label
,
f
"
pseudotors
ions to CSV.
\t\t
{validsymb}"
,
flush
=
True
)
print
(
"
\t
> Saved"
,
self
.
chain_label
,
f
"
annotat
ions to CSV.
\t\t
{validsymb}"
,
flush
=
True
)
else
:
print
(
"
\t
> Computing"
,
self
.
chain_label
,
f
"
pseudotors
ions...
\t
{validsymb}
\t
(already done)"
,
flush
=
True
)
print
(
"
\t
> Computing"
,
self
.
chain_label
,
f
"
annotat
ions...
\t
{validsymb}
\t
(already done)"
,
flush
=
True
)
# Now load data from the CSV file
d
=
pd
.
read_csv
(
path_to_3D_data
+
f
"
pseudotorsions/{self.chain_label
}.csv"
,
index_col
=
0
)
d
=
pd
.
read_csv
(
path_to_3D_data
+
f
"
annotations/{self.chain_label}.{self.rfam
}.csv"
,
index_col
=
0
)
self
.
seq
=
""
.
join
(
d
.
nt_code
.
values
)
self
.
aligned_seq
=
""
.
join
(
d
.
nt_align_code
.
values
)
self
.
length
=
len
([
x
for
x
in
self
.
aligned_seq
if
x
!=
"-"
])
self
.
full_length
=
len
(
d
.
nt_code
)
self
.
data
3D
=
d
self
.
data
=
d
print
(
f
"
\t
> Loaded data from CSV
\t\t\t\t
{validsymb}"
,
flush
=
True
)
# Remove too short chains
...
...
@@ -415,11 +458,11 @@ class Chain:
self
.
error_messages
=
"Sequence is too short. (< 5 resolved nts)"
return
def
set_freqs_from_aln
(
self
,
s_seq
,
freqs
):
def
set_freqs_from_aln
(
self
,
s_seq
,
ali_
freqs
):
"""Maps the object's sequence to its version in a MSA, to compute nucleotide frequencies at every position.
s_seq: the aligned version of self.aligned_seq
freqs: the nucleotide frequencies at every position of s_seq
ali_
freqs: the nucleotide frequencies at every position of s_seq
This also replaces gaps by the most common nucleotide.
"""
alilen
=
len
(
s_seq
)
...
...
@@ -427,12 +470,13 @@ class Chain:
# Save colums in the appropriate positions
i
=
0
j
=
0
temp_freqs
=
np
.
zeros
((
5
,
0
))
while
i
<
self
.
full_length
and
j
<
alilen
:
# Here we try to map self.aligned_seq (the sequence of the 3D chain, including gaps when residues are missing),
# with s_seq, the sequence aligned in the MSA, containing any of ACGU and two types of gaps, - and .
if
self
.
aligned_seq
[
i
]
==
s_seq
[
j
]
.
upper
():
# alignment and sequence correspond (incl. gaps)
self
.
frequencies
=
np
.
concatenate
((
self
.
frequencies
,
freqs
[:,
j
]
.
reshape
(
-
1
,
1
)),
axis
=
1
)
temp_freqs
=
np
.
concatenate
((
temp_freqs
,
ali_
freqs
[:,
j
]
.
reshape
(
-
1
,
1
)),
axis
=
1
)
i
+=
1
j
+=
1
elif
self
.
aligned_seq
[
i
]
==
'-'
:
# gap in the chain, but not in the aligned sequence
...
...
@@ -451,13 +495,13 @@ class Chain:
# if not, search for a insertion gap nearby
if
j
<
alilen
and
s_seq
[
j
]
==
'.'
:
self
.
frequencies
=
np
.
concatenate
((
self
.
frequencies
,
freqs
[:,
j
]
.
reshape
(
-
1
,
1
)),
axis
=
1
)
temp_freqs
=
np
.
concatenate
((
temp_freqs
,
ali_
freqs
[:,
j
]
.
reshape
(
-
1
,
1
)),
axis
=
1
)
i
+=
1
j
+=
1
continue
# else, just ignore the gap.
self
.
frequencies
=
np
.
concatenate
((
self
.
frequencie
s
,
np
.
array
([
0.0
,
0.0
,
0.0
,
0.0
,
1.0
])
.
reshape
(
-
1
,
1
)),
axis
=
1
)
temp_freqs
=
np
.
concatenate
((
temp_freq
s
,
np
.
array
([
0.0
,
0.0
,
0.0
,
0.0
,
1.0
])
.
reshape
(
-
1
,
1
)),
axis
=
1
)
i
+=
1
elif
s_seq
[
j
]
in
[
'.'
,
'-'
]:
# gap in the alignment, but not in the real chain
j
+=
1
# ignore the column
...
...
@@ -474,11 +518,11 @@ class Chain:
letters
=
[
'A'
,
'C'
,
'G'
,
'U'
,
'N'
]
for
i
in
range
(
self
.
full_length
):
if
c_aligned_seq
[
i
]
==
'-'
:
# (then c_seq[i] also is)
freq
=
self
.
frequencie
s
[:,
i
]
freq
=
temp_freq
s
[:,
i
]
l
=
letters
[
freq
.
tolist
()
.
index
(
max
(
freq
))]
c_aligned_seq
[
i
]
=
l
c_seq
[
i
]
=
l
self
.
data
3D
.
iloc
[
i
,
3
]
=
l
# self.data3D
['nt_code'][i]
self
.
data
.
iloc
[
i
,
3
]
=
l
# self.data
['nt_code'][i]
self
.
aligned_seq
=
''
.
join
(
c_aligned_seq
)
self
.
seq
=
''
.
join
(
c_seq
)
...
...
@@ -495,16 +539,38 @@ class Chain:
point
[
5
,
i
]
=
1
# PSSMs
point
[
6
,
i
]
=
self
.
frequencie
s
[
0
,
i
]
point
[
7
,
i
]
=
self
.
frequencie
s
[
1
,
i
]
point
[
8
,
i
]
=
self
.
frequencie
s
[
2
,
i
]
point
[
9
,
i
]
=
self
.
frequencie
s
[
3
,
i
]
point
[
10
,
i
]
=
self
.
frequencie
s
[
4
,
i
]
point
[
6
,
i
]
=
temp_freq
s
[
0
,
i
]
point
[
7
,
i
]
=
temp_freq
s
[
1
,
i
]
point
[
8
,
i
]
=
temp_freq
s
[
2
,
i
]
point
[
9
,
i
]
=
temp_freq
s
[
3
,
i
]
point
[
10
,
i
]
=
temp_freq
s
[
4
,
i
]
self
.
data3D
=
pd
.
concat
([
self
.
data3D
,
pd
.
DataFrame
(
point
.
T
,
columns
=
[
"position"
,
"is_A"
,
"is_C"
,
"is_G"
,
"is_U"
,
"is_other"
,
"freq_A"
,
"freq_C"
,
"freq_G"
,
"freq_U"
,
"freq_other"
])],
axis
=
1
)
self
.
data
=
pd
.
concat
([
self
.
data
,
pd
.
DataFrame
(
point
.
T
,
columns
=
[
"position"
,
"is_A"
,
"is_C"
,
"is_G"
,
"is_U"
,
"is_other"
,
"freq_A"
,
"freq_C"
,
"freq_G"
,
"freq_U"
,
"freq_other"
])],
axis
=
1
)
# reorder columns:
cols
=
[
# 1D structure descriptors
'index_chain'
,
'nt_resnum'
,
'position'
,
'nt_name'
,
'nt_code'
,
'nt_align_code'
,
'is_A'
,
'is_C'
,
'is_G'
,
'is_U'
,
'is_other'
,
'freq_A'
,
'freq_C'
,
'freq_G'
,
'freq_U'
,
'freq_other'
,
# 2D structure descriptors
'dbn'
,
'paired'
,
'Ninteract'
,
'pair_type_LW'
,
'pair_type_DSSR'
,
# 3D strcuture descriptors
'alpha'
,
'beta'
,
'gamma'
,
'delta'
,
'epsilon'
,
'zeta'
,
'epsilon_zeta'
,
'chi'
,
'bb_type'
,
'glyco_bond'
,
'form'
,
'ssZp'
,
'Dp'
,
'eta'
,
'theta'
,
'eta_prime'
,
'theta_prime'
,
'eta_base'
,
'theta_base'
,
'v0'
,
'v1'
,
'v2'
,
'v3'
,
'v4'
,
'amplitude'
,
'phase_angle'
,
'puckering'
,
'P_x'
,
'P_y'
,
'P_z'
,
'C5prime_x'
,
'C5prime_y'
,
'C5prime_z'
]
self
.
data
=
self
.
data
[
cols
]
self
.
save
()
# save to file
def
save
(
self
,
fformat
=
"csv"
):
# save to file
self
.
data3D
.
to_csv
(
path_to_3D_data
+
"datapoints/"
+
self
.
chain_label
)
if
fformat
==
"csv"
:
self
.
data
.
to_csv
(
path_to_3D_data
+
"datapoints/"
+
self
.
chain_label
+
str
(
'.'
+
self
.
rfam_fam
if
self
.
rfam_fam
!=
''
else
''
))
class
Job
:
...
...
@@ -1058,18 +1124,10 @@ def download_BGSU_NR_list():
full_structures_list
=
nrlist
[
'class_members'
]
.
tolist
()
print
(
f
"
\t
{validsymb}"
,
flush
=
True
)
# Split the codes
all_chains
=
[]
for
code
in
full_structures_list
:
codes
=
code
.
replace
(
'+'
,
','
)
.
split
(
','
)
for
c
in
codes
:
# Convert every PDB code into a Chain object
all_chains
.
append
(
Chain
(
c
))
# The beginning of an adventure.
return
all_chains
return
full_structures_list
def
build_chain
(
c
,
rfam
,
pdb_start
,
pdb_end
):
def
build_chain
(
c
):
""" Additionally adds all the desired information to a Chain object.
"""
...
...
@@ -1078,9 +1136,12 @@ def build_chain(c, rfam, pdb_start, pdb_end):
# If no problems, extract the portion we want
if
not
c
.
delete_me
:
c
.
extract_portion
(
c
.
chain_label
,
pdb_start
,
pdb_end
)
if
HOMOLOGY
:
c
.
extract_portion
()
else
:
c
.
extract_all
()
# If no problems,
map it to an Rfam family, and
annotate it with DSSR
# If no problems, annotate it with DSSR
if
not
c
.
delete_me
:
c
.
extract_3D_data
()
...
...
@@ -1126,6 +1187,8 @@ def cm_realign(rfam_acc, chains, label):
f
.
write
(
">"
+
record
.
description
+
'
\n
'
+
str
(
record
.
seq
)
+
'
\n
'
)
ids
.
append
(
record
.
id
)
print
(
"Adding PDB chains..."
)
# Add the chains sequences to the file
for
c
in
chains
:
f
.
write
(
f
"> {str(c)}
\n
"
+
c
.
aligned_seq
.
replace
(
'-'
,
''
)
.
replace
(
'U'
,
'T'
)
+
'
\n
'
)
...
...
@@ -1240,14 +1303,13 @@ def alignment_nt_stats(f):
# Open the alignment
try
:
align
=
AlignIO
.
read
(
path_to_seq_data
+
f
"realigned/{f}++.afa"
,
"fasta"
)
alilen
=
align
.
get_alignment_length
()
except
:
warn
(
f
"{f}'s alignment is wrong. Recompute it and retry."
,
error
=
True
)
exit
(
1
)
# Compute statistics per column
pssm
=
BufferingSummaryInfo
(
align
)
.
get_pssm
(
f
,
thr_idx
)
frequencies
=
np
.
array
([
summarize_position
(
pssm
[
i
])
for
i
in
pbar
])
.
T
frequencies
=
np
.
array
([
summarize_position
(
pssm
[
i
])
for
i
in
range
(
align
.
get_alignment_length
())
])
.
T
# For each sequence, find the right chain and save the PSSMs inside.
pbar
=
tqdm
(
total
=
len
(
chains_ids
),
position
=
thr_idx
+
1
,
desc
=
f
"Worker {thr_idx+1}: {f} chains"
,
leave
=
False
)
...
...
@@ -1266,17 +1328,115 @@ def alignment_nt_stats(f):
idxQueue
.
put
(
thr_idx
)
# replace the thread index in the queue
return
0
if
__name__
==
"__main__"
:
def
infer_all_mappings
(
allmappings
,
codelist
):
"""Given a list of PDB chains corresponding to an equivalence class from BGSU's NR list,
build a list of Chain() objects mapped to Rfam families, by expanding available mappings
of any element of the list to all the list elements.
"""
newchains
=
[]
known_mappings
=
pd
.
DataFrame
()
# Split the comma-separated list of chain codes into chain codes:
codes
=
str
(
codelist
)
.
replace
(
'+'
,
','
)
.
split
(
','
)
# Search for mappings that apply to an element of this PDB chains list:
for
c
in
codes
:
# search for Rfam mappings with this chain c:
m_row_indices
=
allmappings
.
pdb_id
+
"|1|"
+
allmappings
.
chain
==
c
[:
4
]
.
lower
()
+
c
[
4
:]
m
=
allmappings
.
loc
[
m_row_indices
]
.
drop
([
'bit_score'
,
'evalue_score'
,
'cm_start'
,
'cm_end'
,
'hex_colour'
],
axis
=
1
)
if
len
(
m
):
# remove the found mappings from the dataframe
allmappings
=
allmappings
.
loc
[
m_row_indices
==
False
]
# Add the found mappings to the list of found mappings for this class of equivalence
known_mappings
=
pd
.
concat
([
known_mappings
,
m
])
# Now infer mappings for chains that are not explicitely listed in Rfam-PDB mappings:
if
len
(
known_mappings
):
families
=
set
(
known_mappings
[
'rfam_acc'
])
# generalize
inferred_mappings
=
known_mappings
.
drop
([
'pdb_id'
,
'chain'
],
axis
=
1
)
.
drop_duplicates
()
# check for approximative redundancy:
if
len
(
inferred_mappings
)
!=
len
(
inferred_mappings
.
drop_duplicates
(
subset
=
"rfam_acc"
)):
# Then, there exists some mapping variants onto the same Rfam family CM,
# but varing in the start/end positions in the chain.
# ==> Summarize them in one mapping but with the largest window.
for
rfam
in
families
:
sel_5_to_3
=
(
inferred_mappings
[
'pdb_start'
]
<
inferred_mappings
[
'pdb_end'
])
thisfam_5_3
=
(
inferred_mappings
[
'rfam_acc'
]
==
rfam
)
&
sel_5_to_3
thisfam_3_5
=
(
inferred_mappings
[
'rfam_acc'
]
==
rfam
)
&
(
sel_5_to_3
==
False
)
if
(
len
(
inferred_mappings
[
thisfam_5_3
])
!=
len
(
inferred_mappings
[
inferred_mappings
[
'rfam_acc'
]
==
rfam
])
and
len
(
inferred_mappings
[
thisfam_5_3
])
>
0
):
warn
(
f
"There are mappings for {rfam} in both directions:"
,
error
=
True
)
print
(
inferred_mappings
)
exit
(
1
)
# Compute consensus for chains in 5' -> 3' sense
if
len
(
inferred_mappings
[
thisfam_5_3
]):
pdb_start_min
=
min
(
inferred_mappings
[
thisfam_5_3
][
'pdb_start'
])
pdb_end_max
=
max
(
inferred_mappings
[
thisfam_5_3
][
'pdb_end'
])
pdb_start_max
=
max
(
inferred_mappings
[
thisfam_5_3
][
'pdb_start'
])
pdb_end_min
=
min
(
inferred_mappings
[
thisfam_5_3
][
'pdb_end'
])
if
(
pdb_start_max
-
pdb_start_min
<
100
)
and
(
pdb_end_max
-
pdb_end_min
<
100
):
# the variation is only a few nucleotides, we take the largest window.
inferred_mappings
.
loc
[
thisfam_5_3
,
'pdb_start'
]
=
pdb_start_min
inferred_mappings
.
loc
[
thisfam_5_3
,
'pdb_end'
]
=
pdb_end_max
else
:
# there probably is an outlier. We chose the median value in the whole list of known_mappings.
known_sel_5_to_3
=
(
known_mappings
[
'rfam_acc'
]
==
rfam
)
&
(
known_mappings
[
'pdb_start'
]
<
known_mappings
[
'pdb_end'
])
inferred_mappings
.
loc
[
thisfam_5_3
,
'pdb_start'
]
=
known_mappings
.
loc
[
known_sel_5_to_3
,
'pdb_start'
]
.
median
()
inferred_mappings
.
loc
[
thisfam_5_3
,
'pdb_end'
]
=
known_mappings
.
loc
[
known_sel_5_to_3
,
'pdb_end'
]
.
median
()
# Compute consensus for chains in 3' -> 5' sense
if
len
(
inferred_mappings
[
thisfam_3_5
]):
pdb_start_min
=
min
(
inferred_mappings
[
thisfam_3_5
][
'pdb_start'
])
pdb_end_max
=
max
(
inferred_mappings
[
thisfam_3_5
][
'pdb_end'
])
pdb_start_max
=
max
(
inferred_mappings
[
thisfam_3_5
][
'pdb_start'
])
pdb_end_min
=
min
(
inferred_mappings
[
thisfam_3_5
][
'pdb_end'
])
if
(
pdb_start_max
-
pdb_start_min
<
100
)
and
(
pdb_end_max
-
pdb_end_min
<
100
):
# the variation is only a few nucleotides, we take the largest window.
inferred_mappings
.
loc
[
thisfam_3_5
,
'pdb_start'
]
=
pdb_start_max
inferred_mappings
.
loc
[
thisfam_3_5
,
'pdb_end'
]
=
pdb_end_min
else
:
# there probably is an outlier. We chose the median value in the whole list of known_mappings.
known_sel_3_to_5
=
(
known_mappings
[
'rfam_acc'
]
==
rfam
)
&
(
known_mappings
[
'pdb_start'
]
>
known_mappings
[
'pdb_end'
])
inferred_mappings
.
loc
[
thisfam_3_5
,
'pdb_start'
]
=
known_mappings
.
loc
[
known_sel_3_to_5
,
'pdb_start'
]
.
median
()
inferred_mappings
.
loc
[
thisfam_3_5
,
'pdb_end'
]
=
known_mappings
.
loc
[
known_sel_3_to_5
,
'pdb_end'
]
.
median
()
inferred_mappings
.
drop_duplicates
(
inplace
=
True
)
for
c
in
codes
:
nr
=
c
.
split
(
'|'
)
pdb_id
=
nr
[
0
]
.
lower
()
pdb_model
=
int
(
nr
[
1
])
pdb_chain_id
=
nr
[
2
]
for
rfam
in
families
:
# if a known mapping of this chain on this family exists, apply it
m
=
known_mappings
.
loc
[
(
known_mappings
.
pdb_id
+
"|1|"
+
known_mappings
.
chain
==
c
[:
4
]
.
lower
()
+
c
[
4
:])
&
(
known_mappings
[
'rfam_acc'
]
==
rfam
)
]
if
len
(
m
):
pdb_start
=
int
(
m
.
pdb_start
)
pdb_end
=
int
(
m
.
pdb_end
)
else
:
# otherwise, use the inferred mapping
pdb_start
=
int
(
inferred_mappings
.
loc
[
(
inferred_mappings
[
'rfam_acc'
]
==
rfam
)
]
.
pdb_start
)
pdb_end
=
int
(
inferred_mappings
.
loc
[
(
inferred_mappings
[
'rfam_acc'
]
==
rfam
)
]
.
pdb_end
)
chain_label
=
f
"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}_{pdb_start}-{pdb_end}"
newchains
.
append
(
Chain
(
pdb_id
,
pdb_model
,
pdb_chain_id
,
chain_label
,
rfam
=
rfam
,
pdb_start
=
pdb_start
,
pdb_end
=
pdb_end
))
return
newchains
# # temporary, for debugging: start from zero knowledge
# if os.path.exists(path_to_3D_data + "known_issues.txt"):
# subprocess.run(["rm", path_to_3D_data + "known_issues.txt"])
if
__name__
==
"__main__"
:
# Parse options
try
:
opts
,
args
=
getopt
.
getopt
(
sys
.
argv
[
1
:],
"r:h"
,
[
"help"
,
"resolution="
,
"keep-hetatm="
,
"fill-gaps="
,
"3d-folder="
,
"seq-folder="
,
"no-homology"
])
[
"help"
,
"resolution="
,
"keep-hetatm="
,
"fill-gaps="
,
"3d-folder="
,
"seq-folder="
,
"no-homology"
,
"force-retry"
])
except
getopt
.
GetoptError
as
err
:
print
(
err
)
sys
.
exit
(
2
)
...
...
@@ -1299,17 +1459,18 @@ if __name__ == "__main__":
print
(
"--3d-folder=…
\t\t\t
Path to a folder to store the 3D data files. Subfolders will contain:"
"
\n\t\t\t\t\t
RNAcifs/
\t\t
Full structures containing RNA, in mmCIF format"
"
\n\t\t\t\t\t
rna_mapped_to_Rfam/
\t
Extracted 'pure' RNA chains"
"
\n\t\t\t\t\t
pseudotors
ions/
\t\t
Annotations by DSSR"
"
\n\t\t\t\t\t
annotat
ions/
\t\t
Annotations by DSSR"
"
\n\t\t\t\t\t
datapoints/
\t\t
Final results in specified file format."
)
print
(
"--seq-folder=…
\t\t\t
Path to a folder to store the sequence and alignment files."
"
\n\t\t\t\t\t
rfam_sequences/fasta/
\t
Compressed hits to Rfam families"
"
\n\t\t\t\t\t
realigned/
\t\t
Sequences, covariance models, and alignments by family"
)
print
(
"--no-homology
\t\t\t
Do not try to compute PSSMs and do not align sequences."
"
\n\t\t\t\t
Allows to yield more 3D data (consider chains without a Rfam mapping)."
)
print
(
"--force-retry
\t\t\t
Ignore already known issues, and retry to build them from scratch."
)
sys
.
exit
()
elif
opt
==
'--version'
:
print
(
"RNANet
alpha 3
"
)
print
(
"RNANet
0.4 alpha
"
)
sys
.
exit
()
elif
opt
==
"-r"
or
opt
==
"--resolution"
:
assert
arg
in
[
"1.5"
,
"2.0"
,
"2.5"
,
"3.0"
,
"3.5"
,
"4.0"
,
"20.0"
]
...
...
@@ -1320,43 +1481,79 @@ if __name__ == "__main__":
elif
opt
==
"--fill-gaps"
:
assert
arg
in
[
"True"
,
"False"
]
FILL_GAPS
=
(
arg
==
"True"
)
elif
opt
==
"--no-homolgy"
:
HOMOLOGY
=
=
False
elif
opt
==
"--no-homol
o
gy"
:
HOMOLOGY
=
False
elif
opt
==
'--3d-folder'
:
path_to_3D_data
=
path
.
abspath
(
arg
)
if
path_to_3D_data
[
-
1
]
!=
'/'
:
path_to_3D_data
+=
'/'
print
(
"Storing 3D data into"
,
path_to_3D_data
)
elif
opt
==
'--seq-folder'
:
path_to_seq_data
=
path
.
abspath
(
arg
)
if
path_to_seq_data
[
-
1
]
!=
'/'
:
path_to_seq_data
+=
'/'
print
(
"Storing sequences into"
,
path_to_seq_data
)
elif
opt
==
"--force-retry"
:
USE_KNOWN_ISSUES
=
False
if
path_to_3D_data
==
"tobedefinedbyoptions"
or
path_to_seq_data
==
"tobedefinedbyoptions"
:
print
(
"usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments"
)
print
(
"See RNANet.py --help for more information."
)
exit
(
1
)
path_to_3D_data
=
"/home/lbecquey/Data/RNA/3D/"
path_to_seq_data
=
"/home/lbecquey/Data/RNA/sequences/"
print
(
f
"
\n
[DEBUG]
\t
Using hard-coded paths to data:
\n\t\t
{path_to_3D_data}
\n\t\t
{path_to_seq_data}
\n
"
)
# exit(1)
# ===========================================================================
# List 3D chains with available Rfam mapping
# ===========================================================================
# List all 3D RNA chains below 4Ang resolution
all_chains
=
set
(
download_BGSU_NR_list
()
)
full_structures_list
=
download_BGSU_NR_list
(
)
# Ask Rfam if some are mapped to Rfam families
mappings
=
download_Rfam_PDB_mappings
()
# Check for a list of known problems:
known_issues
=
[]
if
path
.
isfile
(
path_to_3D_data
+
"known_issues.txt"
):
f
=
open
(
path_to_3D_data
+
"known_issues.txt"
,
'r'
)
known_issues
=
[
x
[:
-
1
]
for
x
in
f
.
readlines
()
]
f
.
close
()
if
USE_KNOWN_ISSUES
:
print
(
"
\t
> Ignoring known issues:"
)
for
x
in
known_issues
:
print
(
"
\t
"
,
x
)
# Filter the chains with mapping
all_chains
=
[]
if
HOMOLOGY
:
chains_with_mapping
=
[]
for
c
in
all_chains
:
mapping
=
mappings
.
loc
[
(
mappings
.
pdb_id
==
c
.
pdb_id
)
&
(
mappings
.
chain
==
c
.
pdb_chain_id
)
]
n
=
len
(
mapping
.
rfam_acc
.
values
)
for
j
in
range
(
n
):
if
j
==
n
-
1
:
chains_with_mapping
.
append
(
c
)
else
:
chains_with_mapping
.
append
(
copy
.
deepcopy
(
c
))
chains_with_mapping
[
-
1
]
.
set_rfam
(
mapping
.
rfam_acc
.
values
[
j
])
# Ask Rfam if some are mapped to Rfam families
allmappings
=
download_Rfam_PDB_mappings
()
print
(
"> Building list of structures..."
,
flush
=
True
)
ncores
=
read_cpu_number
()
p
=
Pool
(
initializer
=
tqdm
.
set_lock
,
initargs
=
(
tqdm
.
get_lock
(),),
processes
=
ncores
)
pbar
=
tqdm
(
full_structures_list
,
maxinterval
=
1.0
,
miniters
=
1
,
bar_format
=
"{percentage:3.0f}
%
|{bar}|"
)
for
i
,
newchains
in
enumerate
(
p
.
imap_unordered
(
partial
(
infer_all_mappings
,
allmappings
),
full_structures_list
)):
all_chains
+=
newchains
pbar
.
update
(
1
)
# Everytime the iteration finishes, update the global progress bar
pbar
.
close
()
p
.
close
()
p
.
join
()
else
:
chains_with_mapping
=
all_chains
n_chains
=
len
(
chains_with_mapping
)
for
codelist
in
tqdm
(
full_structures_list
):
codes
=
str
(
codelist
)
.
replace
(
'+'
,
','
)
.
split
(
','
)
for
c
in
codes
:
nr
=
c
.
split
(
'|'
)
pdb_id
=
nr
[
0
]
.
lower
()
pdb_model
=
int
(
nr
[
1
])
pdb_chain_id
=
nr
[
2
]
.
upper
()
chain_label
=
f
"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}"
all_chains
.
append
(
Chain
(
pdb_id
,
pdb_model
,
pdb_chain_id
,
chain_label
))
n_chains
=
len
(
all_chains
)
print
(
">"
,
validsymb
,
n_chains
,
"RNA chains of interest."
)
# ===========================================================================
# Download 3D structures, extract the desired chain portions,
...
...
@@ -1364,41 +1561,22 @@ if __name__ == "__main__":
# ===========================================================================
print
(
"> Building download list..."
,
flush
=
True
)
# Check for a list of known problems:
known_issues
=
[]
if
path
.
isfile
(
path_to_3D_data
+
"known_issues.txt"
):
f
=
open
(
path_to_3D_data
+
"known_issues.txt"
,
'r'
)
known_issues
=
[
x
[:
-
1
]
for
x
in
f
.
readlines
()
]
f
.
close
()
print
(
"
\t
> Ignoring known issues:"
)
for
x
in
known_issues
:
print
(
"
\t
"
,
x
)
mmcif_parser
=
MMCIFParser
()
joblist
=
[]
for
c
in
chains_with_mapping
:
# read mappings information
mapping
=
mappings
.
loc
[
(
mappings
.
pdb_id
==
c
.
pdb_id
)
&
(
mappings
.
chain
==
c
.
pdb_chain_id
)
&
(
mappings
.
rfam_acc
==
c
.
rfam_fam
)
]
pdb_start
=
str
(
mapping
.
pdb_start
.
values
[
0
])
pdb_end
=
str
(
mapping
.
pdb_end
.
values
[
0
])
# Add a job to build the chain to the list
c
.
chain_label
=
f
"{c.pdb_id}_{str(c.pdb_model)}_{c.pdb_chain_id}_{pdb_start}-{pdb_end}"
ncores
=
read_cpu_number
()
if
c
.
chain_label
not
in
known_issues
:
for
c
in
all_chains
:
if
(
c
.
chain_label
not
in
known_issues
)
or
not
USE_KNOWN_ISSUES
:
joblist
.
append
(
Job
(
function
=
build_chain
,
# Apply function build_chain to every c.chain_label
how_many_in_parallel
=
ncores
,
args
=
[
c
,
mapping
.
rfam_acc
.
values
[
0
],
pdb_start
,
pdb_end
]))
how_many_in_parallel
=
ncores
,
args
=
[
c
]))
# Prepare the results folders
if
not
path
.
isdir
(
path_to_3D_data
+
"RNAcifs"
):
os
.
makedirs
(
path_to_3D_data
+
"RNAcifs"
)
# for the whole structures
if
not
path
.
isdir
(
path_to_3D_data
+
"rna_mapped_to_Rfam"
):
if
HOMOLOGY
and
not
path
.
isdir
(
path_to_3D_data
+
"rna_mapped_to_Rfam"
):
os
.
makedirs
(
path_to_3D_data
+
"rna_mapped_to_Rfam"
)
# for the portions mapped to Rfam
if
not
path
.
isdir
(
path_to_3D_data
+
"pseudotorsions/"
):
os
.
makedirs
(
path_to_3D_data
+
"pseudotorsions/"
)
# for the annotations by DSSR
if
not
HOMOLOGY
and
not
path
.
isdir
(
path_to_3D_data
+
"rna_only"
):
os
.
makedirs
(
path_to_3D_data
+
"rna_only"
)
# extract chains of pure RNA
if
not
path
.
isdir
(
path_to_3D_data
+
"annotations"
):
os
.
makedirs
(
path_to_3D_data
+
"annotations"
)
# for the annotations by DSSR
# Run the builds and extractions
results
=
execute_joblist
(
joblist
)[
1
]
...
...
@@ -1406,16 +1584,16 @@ if __name__ == "__main__":
# Remove the chains whose parsing resulted in errors
loaded_chains
=
[
c
for
c
in
results
if
not
c
.
delete_me
]
print
(
f
"> Loaded {len(loaded_chains)} RNA chains ({len(chains_with_mapping) - len(loaded_chains)} errors)."
)
print
(
f
"> Loaded {len(loaded_chains)} RNA chains ({len(all_chains) - len(loaded_chains)} errors)."
)
del
all_chains
# Here ends its utility, so let's free some memory
if
not
HOMOLOGY
:
# Save chains to file
for
c
in
loaded_chains
:
c
.
data
3D
.
to_csv
(
path_to_3D_data
+
"datapoints/"
+
c
.
chain_label
)
c
.
data
.
to_csv
(
path_to_3D_data
+
"datapoints/"
+
c
.
chain_label
)
print
(
"Completed."
)
exit
()
# ===========================================================================
# Download RNA sequences of the corresponding Rfam families
# ===========================================================================
...
...
@@ -1426,11 +1604,16 @@ if __name__ == "__main__":
# Get the list of Rfam families found
rfam_acc_to_download
=
{}
mappings_list
=
{}
for
c
in
loaded_chains
:
if
c
.
rfam_fam
not
in
rfam_acc_to_download
:
rfam_acc_to_download
[
c
.
rfam_fam
]
=
[
c
]
mappings_list
[
c
.
rfam_fam
]
=
[
c
.
chain_label
]
else
:
rfam_acc_to_download
[
c
.
rfam_fam
]
.
append
(
c
)
mappings_list
[
c
.
rfam_fam
]
.
append
(
c
.
chain_label
)
pd
.
DataFrame
.
from_dict
(
mappings_list
,
orient
=
'index'
)
.
transpose
()
.
to_csv
(
path_to_seq_data
+
"realigned/mappings_list.csv"
)
exit
()
print
(
f
"> Identified {len(rfam_acc_to_download.keys())} families to download and re-align with the crystals' sequences:"
)
# Download the covariance models for all families
...
...
results/clusters_rot180.png
View file @
c9882ee
448 KB
|
W:
|
H:
399 KB
|
W:
|
H:
2-up
Swipe
Onion skin
statistics.py
View file @
c9882ee
...
...
@@ -2,12 +2,15 @@
import
os
import
numpy
as
np
import
pandas
as
pd
import
threading
as
th
import
scipy.stats
as
st
import
matplotlib.pyplot
as
plt
import
matplotlib.patches
as
ptch
from
mpl_toolkits.mplot3d
import
axes3d
from
matplotlib
import
cm
from
tqdm
import
tqdm
from
multiprocessing
import
Pool
from
RNAnet
import
read_cpu_number
if
os
.
path
.
isdir
(
"/home/ubuntu/"
):
# this is the IFB-core cloud
...
...
@@ -26,27 +29,35 @@ else:
print
(
"I don't know that machine... I'm shy, maybe you should introduce yourself ?"
)
exit
(
1
)
if
__name__
==
"__main__"
:
#TODO: compute nt frequencies, chain lengths
def
load_rna_frome_file
(
path_to_textfile
):
return
pd
.
read_csv
(
path_to_textfile
,
sep
=
','
,
header
=
0
,
engine
=
"c"
,
index_col
=
0
)
print
(
"loading CSV files..."
)
rna_points
=
[]
def
reproduce_wadley_results
(
dfs
,
show
=
True
):
all_etas
=
[]
all_thetas
=
[]
for
csvfile
in
tqdm
(
os
.
listdir
(
path_to_3D_data
+
"pseudotorsions"
)):
df
=
pd
.
read_csv
(
path_to_3D_data
+
"pseudotorsions/"
+
csvfile
)
.
drop
(
'Unnamed: 0'
,
axis
=
1
)
all_forms
=
[]
c
=
0
for
df
in
dfs
:
all_etas
+=
list
(
df
[
'eta'
]
.
values
)
all_thetas
+=
list
(
df
[
'theta'
]
.
values
)
rna_points
.
append
(
df
)
all_forms
+=
list
(
df
[
'form'
]
.
values
)
if
(
len
([
x
for
x
in
df
[
'eta'
]
.
values
if
x
<
0
or
x
>
7
])
or
len
([
x
for
x
in
df
[
'theta'
]
.
values
if
x
<
0
or
x
>
7
])):
c
+=
1
print
(
c
,
"points on"
,
len
(
dfs
),
"have non-radian angles !"
)
print
(
"combining etas and thetas..."
)
# increase all the angles by 180°
alldata
=
[
((
e
+
360
)
%
360
-
180
,
(
t
+
360
)
%
360
-
180
)
for
e
,
t
in
zip
(
all_etas
,
all_thetas
)
# # increase all the angles by 180°
# alldata = [ ((e+360)%360-180, (t+360)%360-180)
# for e, t in zip(all_etas, all_thetas)
# if ('nan' not in str((e,t)))
# and not(e<-150 and t<-110) and not (e>160 and t<-110) ]
alldata
=
[
(
e
,
t
)
for
e
,
t
,
f
in
zip
(
all_etas
,
all_thetas
,
all_forms
)
if
(
'nan'
not
in
str
((
e
,
t
)))
and
not
(
e
<-
150
and
t
<-
110
)
and
not
(
e
>
160
and
t
<-
110
)
]
print
(
len
(
alldata
),
"couples of nts found."
)
and
f
==
'.'
]
print
(
len
(
alldata
),
"couples of n
on-helical n
ts found."
)
x
=
np
.
array
([
p
[
0
]
for
p
in
alldata
])
y
=
np
.
array
([
p
[
1
]
for
p
in
alldata
])
...
...
@@ -71,7 +82,7 @@ if __name__ == "__main__":
plt
.
contourf
(
xx
,
yy
,
z
,
cmap
=
cm
.
BuPu
,
alpha
=
0.5
)
ax
.
set_xlabel
(
"$
\\
eta'=C_1'^{i-1}-P^i-C_1'^i-P^{i+1}$"
)
ax
.
set_ylabel
(
"$
\\
theta'=P^i-C_1'^i-P^{i+1}-C_1'^{i+1}$"
)
ax
.
add_patch
(
ptch
.
Rectangle
((
-
20
,
0
),
50
,
70
,
linewidth
=
1
,
edgecolor
=
'r'
,
facecolor
=
'#ff000080'
))
#
ax.add_patch(ptch.Rectangle((-20,0),50,70, linewidth=1, edgecolor='r', facecolor='#ff000080'))
ax
=
fig
.
add_subplot
(
132
,
projection
=
'3d'
)
ax
.
plot_surface
(
xx
,
yy
,
z_inc
,
cmap
=
cm
.
coolwarm
,
linewidth
=
0
,
antialiased
=
True
)
...
...
@@ -86,4 +97,50 @@ if __name__ == "__main__":
ax
.
set_xlabel
(
"$
\\
eta'=C_1'^{i-1}-P^i-C_1'^i-P^{i+1}$"
)
ax
.
set_ylabel
(
"$
\\
theta'=P^i-C_1'^i-P^{i+1}-C_1'^{i+1}$"
)
plt
.
savefig
(
"results/clusters_rot180.png"
)
plt
.
show
()
if
show
:
plt
.
show
()
def
stats_len
(
dfs
):
lengths
=
[]
full_lengths
=
[]
for
r
in
dfs
:
nt_codes
=
r
[
'nt_code'
]
.
values
.
tolist
()
lengths
.
append
(
len
(
nt_codes
))
full_lengths
.
append
(
len
([
c
for
c
in
nt_codes
if
c
!=
'-'
]))
if
__name__
==
"__main__"
:
#TODO: compute nt frequencies, chain lengths
#################################################################
# LOAD ALL FILES
#################################################################
print
(
"Loading mappings list..."
)
mappings_list
=
pd
.
read_csv
(
path_to_seq_data
+
"realigned/mappings_list.csv"
,
sep
=
','
,
index_col
=
0
)
.
to_dict
()
print
(
"Loading datapoints from file..."
)
filelist
=
[
path_to_3D_data
+
"/datapoints/"
+
f
for
f
in
os
.
listdir
(
path_to_3D_data
+
"/datapoints"
)
if
".log"
not
in
f
and
".gz"
not
in
f
]
rna_points
=
[]
p
=
Pool
(
initializer
=
tqdm
.
set_lock
,
initargs
=
(
tqdm
.
get_lock
(),),
processes
=
read_cpu_number
())
pbar
=
tqdm
(
total
=
len
(
filelist
),
desc
=
"RNA files"
,
position
=
0
,
leave
=
True
)
for
i
,
rna
in
enumerate
(
p
.
imap_unordered
(
load_rna_frome_file
,
filelist
)):
rna_points
.
append
(
rna
)
pbar
.
update
(
1
)
pbar
.
close
()
p
.
close
()
p
.
join
()
npoints
=
len
(
rna_points
)
print
(
npoints
,
"RNA files loaded."
)
#################################################################
# Define threads for the tasks
#################################################################
wadley_thr
=
th
.
Thread
(
target
=
reproduce_wadley_results
,
args
=
[
rna_points
])
wadley_thr
.
start
()
wadley_thr
.
join
()
\ No newline at end of file
...
...
Please
register
or
login
to post a comment