Commit 6d1b967ebb64b53e0103f5fd46bed626b7457ec5 (6d1b967e)
Parent: 7b99a37e
Authored by Louis BECQUEY, 2020-04-23 21:52:27 +0200

    Latest statistics on basepair counts by chain
Showing 3 changed files with 73 additions and 74 deletions:
.gitignore
RNAnet.py
statistics.py
.gitignore  (view file @ 6d1b967)
@@ -3,7 +3,7 @@ nohup.out
 log_of_the_run.sh
 
 # results
-figures/wadley_plots/
+results/
 
 # temporary results files
 data/
RNAnet.py  (view file @ 6d1b967)
@@ -1179,7 +1179,7 @@ class Pipeline:
         os.makedirs(runDir + "/results/archive/")
 
         # Save to by-chain CSV files
-        p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=1, maxtasksperchild=5)
+        p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=3)
         try:
             pbar = tqdm(total=len(self.loaded_chains), desc="Saving chains to CSV", position=0, leave=True)
             for i, _ in enumerate(p.imap_unordered(work_save, self.loaded_chains)):
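Note on this hunk: the single-process pool (processes=1, maxtasksperchild=5) becomes a 3-process pool that shares tqdm's lock, so concurrent workers do not garble each other's progress-bar output. The body of init_worker is not visible in this diff; a minimal standalone sketch of the pattern, assuming init_worker only installs the shared lock in each worker:

import multiprocessing as mp
from tqdm import tqdm

def init_worker(lock):
    # Install the lock inherited from the parent so every worker process
    # synchronizes its terminal writes through the same mutex.
    tqdm.set_lock(lock)

def work(x):
    return x * x  # stand-in for work_save

if __name__ == "__main__":
    p = mp.Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=3)
    pbar = tqdm(total=100, desc="Demo", position=0, leave=True)
    for _ in p.imap_unordered(work, range(100)):
        pbar.update(1)
    pbar.close()
    p.close()
    p.join()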
@@ -1208,7 +1208,7 @@ class Pipeline:
         # Save additional informations
         conn = sqlite3.connect(runDir + "/results/RNANet.db")
-        pd.read_sql_query("SELECT rfam_acc, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family",
+        pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;",
                           conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
         pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, reversed, date, exp_method, resolution, issue FROM structure
                             JOIN chain ON structure.pdb_id = chain.structure_id
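The changed query now also exports the family description and sorts families by their number of 3D chains before archiving. The pattern used here, pd.read_sql_query(...).to_csv(...), is a one-liner for dumping a query to CSV; a minimal sketch against a hypothetical copy of the database:

import sqlite3
import pandas as pd

conn = sqlite3.connect("results/RNANet.db")
# Dump a small projection of the family table, floats rounded to two decimals
pd.read_sql_query("SELECT rfam_acc, idty_percent FROM family ORDER BY nb_3d_chains DESC;", conn) \
  .to_csv("families_demo.csv", float_format="%.2f", index=False)
conn.close()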
@@ -1274,6 +1274,7 @@ class Pipeline:
             conn.close()
 
 def read_cpu_number():
     # As one shall not use os.cpu_count() on LXC containers,
     # because it reads info from /sys which is not the VM resources but the host resources.
@@ -2050,6 +2051,7 @@ def work_pssm(f, fill_gaps):
     idxQueue.put(thr_idx)  # replace the thread index in the queue
     return 0
 
+@trace_unhandled_exceptions
 def work_save(c, homology=True):
     conn = sqlite3.connect(runDir + "/results/RNANet.db", timeout=15.0)
     if homology:
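The new @trace_unhandled_exceptions decorator matters because work_save runs inside a multiprocessing Pool, where an exception raised in a worker is otherwise easy to lose. Its definition lives elsewhere in RNAnet.py and is not part of this diff; a hypothetical reimplementation of such a decorator could look like this:

import functools
import traceback

def trace_unhandled_exceptions(func):
    @functools.wraps(func)
    def wrapped(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            # Print the stack trace instead of dying silently in the worker
            print(f"Unhandled exception in {func.__name__}:", flush=True)
            traceback.print_exc()
    return wrapped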
@@ -2096,38 +2098,36 @@ if __name__ == "__main__":
     sql_define_tables(conn)
     print("> Storing results into", runDir + "/results/RNANet.db")
 
-    # # compute an update compared to what is in the table "chain"
-    # #DEBUG: list everything
-    # pp.REUSE_ALL = True
-    # pp.list_available_mappings()
-    # # ===========================================================================
-    # # 3D information
-    # # ===========================================================================
-    # # Download and annotate new RNA 3D chains (Chain objects in pp.update)
-    # pp.dl_and_annotate(coeff_ncores=0.75)
-    # # At this point, the structure table is up to date
-    # pp.build_chains(coeff_ncores=2.0)
-    # if len(pp.to_retry):
-    #     # Redownload and re-annotate
-    #     print("> Retrying to annotate some structures which just failed.", flush=True)
-    #     pp.dl_and_annotate(retry=True, coeff_ncores=0.5)
-    #     pp.build_chains(retry=True, coeff_ncores=1.0)  # Use half the cores to reduce required amount of memory
-    # print(f"> Loaded {len(pp.loaded_chains)} RNA chains ({len(pp.update) - len(pp.loaded_chains)} errors).")
-    # pp.checkpoint_save_chains()
-    # if not pp.HOMOLOGY:
-    #     # Save chains to file
-    #     for c in pp.loaded_chains:
-    #         work_save(c, homology=False)
-    #     print("Completed.")
-    #     exit()
+    # compute an update compared to what is in the table "chain"
+    pp.list_available_mappings()
+    # ===========================================================================
+    # 3D information
+    # ===========================================================================
+    # Download and annotate new RNA 3D chains (Chain objects in pp.update)
+    pp.dl_and_annotate(coeff_ncores=0.5)
+    # At this point, the structure table is up to date
+    pp.build_chains(coeff_ncores=2.0)
+    if len(pp.to_retry):
+        # Redownload and re-annotate
+        print("> Retrying to annotate some structures which just failed.", flush=True)
+        pp.dl_and_annotate(retry=True, coeff_ncores=0.3)
+        pp.build_chains(retry=True, coeff_ncores=1.0)  # Use half the cores to reduce required amount of memory
+    print(f"> Loaded {len(pp.loaded_chains)} RNA chains ({len(pp.update) - len(pp.loaded_chains)} errors).")
+    pp.checkpoint_save_chains()
+    if not pp.HOMOLOGY:
+        # Save chains to file
+        for c in pp.loaded_chains:
+            work_save(c, homology=False)
+        print("Completed.")
+        exit()
 
-    # #       At this point, structure, chain and nucleotide tables of the database are up to date.
-    # #       (Modulo some statistics computed by statistics.py)
+    # At this point, structure, chain and nucleotide tables of the database are up to date.
+    # (Modulo some statistics computed by statistics.py)
 
     # ===========================================================================
     # Homology information
statistics.py  (view file @ 6d1b967)
@@ -26,11 +26,13 @@ from collections import Counter
 from RNAnet import read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker
 
 # This sets the paths
 path_to_3D_data = "/home/lbecquey/Data/RNA/3D/"
 path_to_seq_data = "/home/lbecquey/Data/RNA/sequences/"
 if len(sys.argv) > 1:
     path_to_3D_data = path.abspath(sys.argv[1])
     path_to_seq_data = path.abspath(sys.argv[2])
 else:
     print("Please set paths to 3D data using command line arguments:")
     print("./statistics.py /path/to/3D/data/ /path/to/sequence/data/")
     exit()
 
 LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546")  # From Rfam CLAN 00112
 SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960")  # From Rfam CLAN 00111
@@ -289,19 +291,32 @@ def parallel_stats_pairs(f):
                                        np.where(expanded_list.nts.isin(["GU", "UG"]), "Wobble", "Other")
                                       )
                              )
 
         # checks
         # ct = pd.crosstab(expanded_list.pair_type_LW, expanded_list.basepair)
         # ct = ct.loc[[ x for x in ["cWW","cHH","cSS","tWW","tHH","tSS"] if x in ct.index ]]
         # for _, symmetric_type in ct.iterrows():
         #     for x in symmetric_type:
         #         if x % 2:
         #             print("Odd number found for", symmetric_type.name, "in chain", cid, flush=True)
         #             print(expanded_list, flush=True)
         #             exit()
 
         expanded_list = expanded_list[["basepair", "pair_type_LW"]]
 
         # Update the database
         vlcnts = expanded_list.pair_type_LW.value_counts()
         sqldata = (vlcnts.at["cWW"]/2 if "cWW" in vlcnts.index else 0,
                    vlcnts.at["cWH"]   if "cWH" in vlcnts.index else 0,
                    vlcnts.at["cWS"]   if "cWS" in vlcnts.index else 0,
                    vlcnts.at["cHH"]/2 if "cHH" in vlcnts.index else 0,
                    vlcnts.at["cHS"]   if "cHS" in vlcnts.index else 0,
                    vlcnts.at["cSS"]/2 if "cSS" in vlcnts.index else 0,
                    vlcnts.at["tWW"]/2 if "tWW" in vlcnts.index else 0,
                    vlcnts.at["tWH"]   if "tWH" in vlcnts.index else 0,
                    vlcnts.at["tWS"]   if "tWS" in vlcnts.index else 0,
                    vlcnts.at["tHH"]/2 if "tHH" in vlcnts.index else 0,
                    vlcnts.at["tHS"]   if "tHS" in vlcnts.index else 0,
                    vlcnts.at["tSS"]/2 if "tSS" in vlcnts.index else 0,
                    int(sum(vlcnts.loc[[str(x) for x in vlcnts.index if "." in str(x)]])/2),
                    cid)
         with sqlite3.connect("results/RNANet.db") as conn:
             sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?,
                                  pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?,
                                  pair_count_tHH = ?, pair_count_tHS = ?, pair_count_tSS = ?, pair_count_other = ? WHERE chain_id = ?;""",
                         data=sqldata)
 
         data.append(expanded_list)
 
     # merge all the dataframes from all chains of the family
     expanded_list = pd.concat(data)
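The halving in sqldata follows from how pairs are listed: each basepair is reported once from each of its two nucleotides, so a symmetric type such as cWW appears twice per physical pair, while an asymmetric pair contributes one cWH and one cHW, and only the cWH orientation is counted (consistent with the "do not count cHW..." comments in the next hunk). A toy check of that logic, with made-up annotations:

import pandas as pd

# One cWW pair seen from both partners, plus one cWH/cHW pair: the
# asymmetric pair is counted from its cWH orientation only.
expanded_list = pd.DataFrame({"pair_type_LW": ["cWW", "cWW", "cWH", "cHW"]})
vlcnts = expanded_list.pair_type_LW.value_counts()

n_cWW = vlcnts.at["cWW"] / 2 if "cWW" in vlcnts.index else 0  # symmetric type: halve
n_cWH = vlcnts.at["cWH"] if "cWH" in vlcnts.index else 0      # asymmetric type: keep
print(n_cWW, n_cWH)  # 1.0 1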
@@ -336,17 +351,6 @@ def stats_pairs():
             fam_pbar.update(1)
             results.append(fam_df)
             allpairs.append(newpairs)
-
-            # Checks
-            vlcnts = newpairs.pair_type_LW.value_counts()
-            identical = [fam_df[i][0] == newpairs.pair_type_LW.value_counts().at[i] for i in fam_df.columns]
-            if False in identical:
-                print(fam_df)
-                print(vlcnts)
-                print("Dataframes differ for", fam_df.index[0], flush=True)
-            for x in ["cWW", "cHH", "cSS", "tWW", "tHH", "tSS"]:
-                if x in vlcnts.index and vlcnts[x] % 2:
-                    print("Found an odd number of", x, "in", fam_df.index[0], flush=True)
 
         fam_pbar.close()
         p.close()
         p.join()
@@ -359,10 +363,6 @@ def stats_pairs():
         all_pairs = pd.concat(allpairs)
         df = pd.concat(results).fillna(0)
-        vlcnts = all_pairs.pair_type_LW.value_counts()
-        for x in ["cWW", "cHH", "cSS", "tWW", "tHH", "tSS"]:
-            if x in vlcnts.index and vlcnts[x] % 2:
-                print("Found an odd number of", x, "after the merge!", flush=True)
 
         df.to_csv("data/pair_counts.csv")
         all_pairs.to_csv("data/all_pairs.csv")
     else:
@@ -375,18 +375,16 @@ def stats_pairs():
         # Remove not very well defined pair types (not in the 12 LW types)
         df['other'] = df[col_list].sum(axis=1)
         df.drop(col_list, axis=1, inplace=True)
-        crosstab = crosstab.append(crosstab.loc[col_list].sum(axis=0).rename("Other"))
+        crosstab = crosstab.append(crosstab.loc[col_list].sum(axis=0).rename("non-LW"))
 
         # drop duplicate types
         # The twelve Leontis-Westhof types are
         # cWW cWH cWS cHH cHS cSS (do not count cHW cSW and cSH, they are the same as their opposites)
         # tWW tWH tWS tHH tHS tSS (do not count tHW tSW and tSH, they are the same as their opposites)
-        df.drop([x for x in ["cHW", "tHW", "cSW", "tSW", "cHS", "tHS"] if x in df.columns], axis=1)
-        crosstab = crosstab.loc[[x for x in ["cWW", "cWH", "cWS", "cHH", "cHS", "cSS", "tWW", "tWH", "tWS", "tHH", "tHS", "tSS", "Other"] if x in crosstab.index]]
+        df = df.drop([x for x in ["cHW", "tHW", "cSW", "tSW", "cHS", "tHS"] if x in df.columns], axis=1)
+        crosstab = crosstab.loc[[x for x in ["cWW", "cWH", "cWS", "cHH", "cHS", "cSS", "tWW", "tWH", "tWS", "tHH", "tHS", "tSS", "non-LW"] if x in crosstab.index]]
         df.loc[:, [x for x in ["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "other"] if x in df.columns]] /= 2
-        # crosstab.loc[["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "Other"]] /= 2
-        print(crosstab)
-        print(df)
+        crosstab.loc[["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "non-LW"]] /= 2
 
         # Compute total row
         total_series = df.sum(numeric_only=True).rename("TOTAL")
@@ -397,15 +395,16 @@ def stats_pairs():
         # reorder columns
         df.sort_values("TOTAL", axis=1, inplace=True, ascending=False)
         crosstab = crosstab[["AU", "GC", "Wobble", "Other"]]
 
         # Save to CSV
-        df.to_csv("results/pairings.csv")
+        df.to_csv("results/pair_types.csv")
 
         # Plot barplot of overall types
         total_series.sort_values(ascending=False, inplace=True)
-        ax = total_series.plot(figsize=(5,3), kind='bar', log=True, ylim=(1e4, 5000000))
-        ax.set_ylabel("Number of observations")
-        plt.subplots_adjust(bottom=0.2, right=0.99)
+        ax = crosstab.plot(figsize=(8,5), kind='bar', stacked=True, log=False, fontsize=13)
+        ax.set_ylabel("Number of observations (millions)", fontsize=13)
+        ax.set_xlabel(None)
+        plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99)
         plt.savefig("results/figures/pairings.png")
 
         notify("Computed nucleotide statistics and saved CSV and PNG file.")
@@ -416,7 +415,7 @@ def to_dist_matrix(f):
         return 0
     dm = DistanceCalculator('identity')
-    with open(path_to_seq_data + "realigned/" + f + "++.afa") as al_file:
+    with open(path_to_seq_data + "/realigned/" + f + "++.afa") as al_file:
         al = AlignIO.read(al_file, "fasta")[-len(mappings_list[f]):]
         idty = dm.get_distance(al).matrix  # list of lists
         del al
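The fixed line inserts the missing slash between path_to_seq_data and realigned/: path.abspath() strips the trailing slash from user-supplied paths, so the leading slash is required, and the doubled slash produced with the hard-coded defaults is harmless. For context, DistanceCalculator('identity') comes from Biopython; a self-contained sketch with a throwaway two-sequence alignment instead of a real ++.afa file:

from io import StringIO
from Bio import AlignIO
from Bio.Phylo.TreeConstruction import DistanceCalculator

# Two 4-nt sequences differing at one position
aln = AlignIO.read(StringIO(">a\nACGU\n>b\nACGA\n"), "fasta")
dm = DistanceCalculator('identity')
idty = dm.get_distance(aln).matrix  # lower-triangular list of lists of distances
print(idty)  # [[0], [0.25, 0]] : 1 mismatch out of 4 positions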
@@ -457,7 +456,7 @@ def seq_idty():
     for f, D in zip(famlist, fam_arrays):
         if not len(D): continue
         a = 1.0 - np.average(D + D.T)  # Get symmetric matrix instead of lower triangle + convert from distance matrix to identity matrix
-        conn.execute(f"UPDATE family SET idty_percent = {float(a)} WHERE rfam_acc = '{f}';")
+        conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';")
     conn.commit()
     conn.close()
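The update now stores idty_percent rounded to two decimals instead of the full float. The value is interpolated into the SQL string with an f-string, which is safe here because both values are program-generated; an equivalent form with placeholders, matching the sql_execute(..., data=...) style used earlier in this commit, would be, as a sketch:

import sqlite3

conn = sqlite3.connect("results/RNANet.db")
a, f = 0.834567, "RF00177"  # hypothetical identity value and family accession
conn.execute("UPDATE family SET idty_percent = ? WHERE rfam_acc = ?;",
             (round(float(a), 2), f))
conn.commit()
conn.close()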