Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Louis BECQUEY
/
RNANet
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Louis BECQUEY
2020-09-21 14:14:43 +0200
Browse Files
Options
Browse Files
Download
Plain Diff
Commit
6cc5142fdbb1330de0ccd673554f02c185cffd90
6cc5142f
2 parents
60bd1aec
d645ce5e
Merge branch 'master' of
https://github.com/persalteas/RNANet
into master
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
27 additions
and
19 deletions
RNAnet.py
RNAnet.py
View file @
6cc5142
...
...
@@ -132,9 +132,11 @@ class BufferingSummaryInfo(AlignInfo.SummaryInfo):
class
Chain
:
""" The object which stores all our data and the methods to process it.
"""
The object which stores all our data and the methods to process it.
Chains accumulate information through this script, and are saved to files at the end of major steps."""
Chains accumulate information through this script, and are saved to files at the end of major steps.
"""
def
__init__
(
self
,
pdb_id
,
pdb_model
,
pdb_chain_id
,
chain_label
,
eq_class
,
rfam
=
""
,
inferred
=
False
,
pdb_start
=
None
,
pdb_end
=
None
):
self
.
pdb_id
=
pdb_id
# PDB ID
...
...
@@ -144,6 +146,7 @@ class Chain:
self
.
mapping
=
Mapping
(
chain_label
,
rfam
,
pdb_start
,
pdb_end
,
inferred
)
else
:
self
.
mapping
=
None
self
.
eq_class
=
eq_class
# BGSU NR list class id
self
.
chain_label
=
chain_label
# chain pretty name
self
.
file
=
""
# path to the 3D PDB file
self
.
seq
=
""
# sequence with modified nts
...
...
@@ -523,30 +526,33 @@ class Chain:
# Register the chain in table chain
if
self
.
mapping
is
not
None
:
sql_execute
(
conn
,
f
""" INSERT INTO chain
(structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, issue)
(structure_id, chain_name, pdb_start, pdb_end, rfam_acc,
eq_class,
inferred, issue)
VALUES
(?, ?, ?, ?, ?, ?, ?)
(?, ?, ?, ?, ?, ?, ?
, ?
)
ON CONFLICT(structure_id, chain_name, rfam_acc) DO
UPDATE SET pdb_start=excluded.pdb_start,
pdb_end=excluded.pdb_end,
eq_class=excluded.eq_class,
inferred=excluded.inferred,
issue=excluded.issue;"""
,
data
=
(
str
(
self
.
pdb_id
),
str
(
self
.
pdb_chain_id
),
int
(
self
.
mapping
.
nt_start
),
int
(
self
.
mapping
.
nt_end
),
str
(
self
.
mapping
.
rfam_acc
),
str
(
self
.
mapping
.
rfam_acc
),
str
(
self
.
eq_class
),
int
(
self
.
mapping
.
inferred
),
int
(
self
.
delete_me
)))
# get the chain id
self
.
db_chain_id
=
sql_ask_database
(
conn
,
f
"""SELECT (chain_id) FROM chain
WHERE structure_id='{self.pdb_id}'
AND chain_name='{self.pdb_chain_id}'
AND rfam_acc='{self.mapping.rfam_acc}';"""
)[
0
][
0
]
AND rfam_acc='{self.mapping.rfam_acc}'
AND eq_class='{self.eq_class}';"""
)[
0
][
0
]
else
:
sql_execute
(
conn
,
"""INSERT INTO chain (structure_id, chain_name, rfam_acc,
issue) VALUES (?, ?, NULL
, ?)
ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue;"""
,
data
=
(
str
(
self
.
pdb_id
),
str
(
self
.
pdb_chain_id
),
int
(
self
.
delete_me
)))
sql_execute
(
conn
,
"""INSERT INTO chain (structure_id, chain_name, rfam_acc,
eq_class, issue) VALUES (?, ?, NULL, ?
, ?)
ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue
, eq_class=excluded.eq_class
;"""
,
data
=
(
str
(
self
.
pdb_id
),
str
(
self
.
pdb_chain_id
),
str
(
self
.
eq_class
),
int
(
self
.
delete_me
)))
self
.
db_chain_id
=
sql_ask_database
(
conn
,
f
"""SELECT (chain_id) FROM chain
WHERE structure_id='{self.pdb_id}'
AND chain_name='{self.pdb_chain_id}'
AND eq_class='{self.eq_class}'
AND rfam_acc IS NULL;"""
)[
0
][
0
]
# Add the nucleotides if the chain is not an issue
...
...
@@ -859,14 +865,14 @@ class Downloader:
if
path
.
isfile
(
path_to_3D_data
+
f
"latest_nr_list_{nr_code}A.csv"
):
print
(
"
\t
> Use of the previous version.
\t
"
,
end
=
""
,
flush
=
True
)
else
:
return
[],
[]
return
pd
.
DataFrame
([],
columns
=
[
"class"
,
"class_members"
])
nrlist
=
pd
.
read_csv
(
path_to_3D_data
+
f
"latest_nr_list_{nr_code}A.csv"
)
full_structures_list
=
nrlist
[
'class_members'
]
.
tolist
()
full_structures_list
=
[
tuple
(
i
[
1
])
for
i
in
nrlist
[[
'class'
,
'class_members'
]]
.
iterrows
()
]
print
(
f
"
\t
{validsymb}"
,
flush
=
True
)
# The beginning of an adventure.
return
full_structures_list
return
full_structures_list
# list of ( str (class), str (class_members) )
def
download_from_SILVA
(
self
,
unit
):
if
not
path
.
isfile
(
path_to_seq_data
+
f
"realigned/{unit}.arb"
):
...
...
@@ -1068,8 +1074,8 @@ class Pipeline:
elif
opt
==
"--from-scratch"
:
warn
(
"Deleting previous database and recomputing from scratch."
)
subprocess
.
run
([
"rm"
,
"-rf"
,
path_to_3D_data
+
"annotations"
,
# path_to_3D_data + "RNAcifs", # DEBUG : keep the cifs !
# path_to_3D_data + "annotations", # DEBUG : keep the annotations !
# path_to_3D_data + "RNAcifs",
# DEBUG : keep the cifs !
path_to_3D_data
+
"rna_mapped_to_Rfam"
,
path_to_3D_data
+
"rnaonly"
,
path_to_seq_data
+
"realigned"
,
...
...
@@ -1103,7 +1109,7 @@ class Pipeline:
If self.HOMOLOGY is set to False, simply returns a list of Chain() objects with available 3D chains."""
# List all 3D RNA chains below given resolution
full_structures_list
=
self
.
dl
.
download_BGSU_NR_list
(
self
.
CRYSTAL_RES
)
full_structures_list
=
self
.
dl
.
download_BGSU_NR_list
(
self
.
CRYSTAL_RES
)
# list of tuples ( class, class_members )
# Check for a list of known problems:
if
path
.
isfile
(
runDir
+
"/known_issues.txt"
):
...
...
@@ -1140,8 +1146,8 @@ class Pipeline:
exit
(
1
)
else
:
conn
=
sqlite3
.
connect
(
runDir
+
"/results/RNANet.db"
,
timeout
=
10.0
)
for
codelist
in
tqdm
(
full_structures_list
):
codes
=
str
(
codelist
)
.
replace
(
'+'
,
','
)
.
split
(
','
)
for
eq_class
,
codelist
in
tqdm
(
full_structures_list
):
codes
=
codelist
.
replace
(
'+'
,
','
)
.
split
(
','
)
# Simply convert the list of codes to Chain() objects
for
c
in
codes
:
...
...
@@ -1408,7 +1414,7 @@ class Pipeline:
with
sqlite3
.
connect
(
runDir
+
"/results/RNANet.db"
)
as
conn
:
pd
.
read_sql_query
(
"SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;"
,
conn
)
.
to_csv
(
runDir
+
f
"/results/archive/families_{time_str}.csv"
,
float_format
=
"
%.2
f"
,
index
=
False
)
pd
.
read_sql_query
(
"""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure
pd
.
read_sql_query
(
"""SELECT
eq_class,
structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure
JOIN chain ON structure.pdb_id = chain.structure_id
ORDER BY structure_id, chain_name, rfam_acc ASC;"""
,
conn
)
.
to_csv
(
runDir
+
f
"/results/archive/summary_{time_str}.csv"
,
float_format
=
"
%.2
f"
,
index
=
False
)
...
...
@@ -1522,6 +1528,7 @@ def sql_define_tables(conn):
chain_id INTEGER PRIMARY KEY NOT NULL,
structure_id CHAR(4) NOT NULL,
chain_name VARCHAR(2) NOT NULL,
eq_class VARCHAR(10),
pdb_start SMALLINT,
pdb_end SMALLINT,
issue TINYINT,
...
...
@@ -1785,7 +1792,8 @@ def work_infer_mappings(update_only, allmappings, codelist):
known_mappings
=
pd
.
DataFrame
()
# Split the comma-separated list of chain codes into chain codes:
codes
=
str
(
codelist
)
.
replace
(
'+'
,
','
)
.
split
(
','
)
eq_class
=
codelist
[
0
]
codes
=
codelist
[
1
]
.
replace
(
'+'
,
','
)
.
split
(
','
)
# Search for mappings that apply to an element of this PDB chains list:
for
c
in
codes
:
...
...
Please
register
or
login
to post a comment