Louis BECQUEY

Revision 1 for Bioinformatics completed

...@@ -13,3 +13,4 @@ esl* ...@@ -13,3 +13,4 @@ esl*
13 # environment stuff 13 # environment stuff
14 .vscode/ 14 .vscode/
15 *.pyc 15 *.pyc
16 +__pycache__/
...\ No newline at end of file ...\ No newline at end of file
......
...@@ -94,6 +94,8 @@ The detailed list of options is below: ...@@ -94,6 +94,8 @@ The detailed list of options is below:
94 -h [ --help ] Print this help message 94 -h [ --help ] Print this help message
95 --version Print the program version 95 --version Print the program version
96 96
97 +-f [ --full-inference ] Infer new 3D->family mappings even if Rfam already provides some. Yields more copies of chains
98 + mapped to different families.
97 -r 4.0 [ --resolution=4.0 ] Maximum 3D structure resolution to consider a RNA chain. 99 -r 4.0 [ --resolution=4.0 ] Maximum 3D structure resolution to consider a RNA chain.
98 -s Run statistics computations after completion 100 -s Run statistics computations after completion
99 --extract Extract the portions of 3D RNA chains to individual mmCIF files. 101 --extract Extract the portions of 3D RNA chains to individual mmCIF files.
...@@ -105,7 +107,7 @@ The detailed list of options is below: ...@@ -105,7 +107,7 @@ The detailed list of options is below:
105 RNAcifs/ Full structures containing RNA, in mmCIF format 107 RNAcifs/ Full structures containing RNA, in mmCIF format
106 rna_mapped_to_Rfam/ Extracted 'pure' RNA chains 108 rna_mapped_to_Rfam/ Extracted 'pure' RNA chains
107 datapoints/ Final results in CSV file format. 109 datapoints/ Final results in CSV file format.
108 ---seq-folder=… Path to a folder to store the sequence and alignment files. 110 +--seq-folder=… Path to a folder to store the sequence and alignment files. Subfolders will be:
109 rfam_sequences/fasta/ Compressed hits to Rfam families 111 rfam_sequences/fasta/ Compressed hits to Rfam families
110 realigned/ Sequences, covariance models, and alignments by family 112 realigned/ Sequences, covariance models, and alignments by family
111 --no-homology Do not try to compute PSSMs and do not align sequences. 113 --no-homology Do not try to compute PSSMs and do not align sequences.
...@@ -117,11 +119,12 @@ The detailed list of options is below: ...@@ -117,11 +119,12 @@ The detailed list of options is below:
117 --update-homologous Re-download Rfam and SILVA databases, realign all families, and recompute all CSV files 119 --update-homologous Re-download Rfam and SILVA databases, realign all families, and recompute all CSV files
118 --from-scratch Delete database, local 3D and sequence files, and known issues, and recompute. 120 --from-scratch Delete database, local 3D and sequence files, and known issues, and recompute.
119 --archive Create a tar.gz archive of the datapoints text files, and update the link to the latest archive 121 --archive Create a tar.gz archive of the datapoints text files, and update the link to the latest archive
122 +--no-logs Do not save per-chain logs of the numbering modifications
120 ``` 123 ```
121 124
122 Typical usage: 125 Typical usage:
123 ``` 126 ```
124 -nohup bash -c 'time ~/Projects/RNANet/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --archive' & 127 +nohup bash -c 'time ~/Projects/RNANet/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &
125 ``` 128 ```
126 129
127 ## Post-computation task: estimate quality 130 ## Post-computation task: estimate quality
......
1 #!/usr/bin/python3.8 1 #!/usr/bin/python3.8
2 +import Bio
3 +import concurrent.futures
4 +import getopt
5 +import gzip
6 +import io
7 +import json
2 import numpy as np 8 import numpy as np
9 +import os
3 import pandas as pd 10 import pandas as pd
4 -import concurrent.futures, getopt, gzip, io, json, os, pickle, psutil, re, requests, signal, sqlalchemy, sqlite3, subprocess, sys, time, traceback, warnings 11 +import pickle
5 -from Bio import AlignIO, SeqIO 12 +import psutil
6 -from Bio.PDB import MMCIFParser 13 +import re
7 -from Bio.PDB.mmcifio import MMCIFIO 14 +import requests
8 -from Bio.PDB.MMCIF2Dict import MMCIF2Dict 15 +import signal
9 -from Bio.PDB.PDBExceptions import PDBConstructionWarning, BiopythonWarning 16 +import sqlalchemy
10 -from Bio.PDB.Dice import ChainSelector 17 +import sqlite3
11 -from Bio.Alphabet import generic_rna 18 +import subprocess
12 -from Bio.Seq import Seq 19 +import sys
13 -from Bio.SeqRecord import SeqRecord 20 +import time
14 -from Bio.Align import MultipleSeqAlignment, AlignInfo 21 +import traceback
15 -from collections import OrderedDict, defaultdict 22 +import warnings
16 from functools import partial, wraps 23 from functools import partial, wraps
17 -from os import path, makedirs 24 +from multiprocessing import Pool, Manager
18 -from multiprocessing import Pool, Manager, set_start_method
19 from time import sleep 25 from time import sleep
20 from tqdm import tqdm 26 from tqdm import tqdm
21 from setproctitle import setproctitle 27 from setproctitle import setproctitle
22 28
29 +
23 def trace_unhandled_exceptions(func): 30 def trace_unhandled_exceptions(func):
24 @wraps(func) 31 @wraps(func)
25 def wrapped_func(*args, **kwargs): 32 def wrapped_func(*args, **kwargs):
...@@ -36,10 +43,11 @@ def trace_unhandled_exceptions(func): ...@@ -36,10 +43,11 @@ def trace_unhandled_exceptions(func):
36 print(s) 43 print(s)
37 return wrapped_func 44 return wrapped_func
38 45
46 +
39 pd.set_option('display.max_rows', None) 47 pd.set_option('display.max_rows', None)
40 sqlite3.enable_callback_tracebacks(True) 48 sqlite3.enable_callback_tracebacks(True)
41 sqlite3.register_adapter(np.int64, lambda val: int(val)) # Tell Sqlite what to do with <class numpy.int64> objects ---> convert to int 49 sqlite3.register_adapter(np.int64, lambda val: int(val)) # Tell Sqlite what to do with <class numpy.int64> objects ---> convert to int
42 -sqlite3.register_adapter(np.float64, lambda val: float(val)) # Tell Sqlite what to do with <class numpy.int64> objects ---> convert to int 50 +sqlite3.register_adapter(np.float64, lambda val: float(val)) # Tell Sqlite what to do with <class numpy.float64> objects ---> convert to float
43 51
44 m = Manager() 52 m = Manager()
45 running_stats = m.list() 53 running_stats = m.list()
...@@ -52,11 +60,14 @@ validsymb = '\U00002705' ...@@ -52,11 +60,14 @@ validsymb = '\U00002705'
52 warnsymb = '\U000026A0' 60 warnsymb = '\U000026A0'
53 errsymb = '\U0000274C' 61 errsymb = '\U0000274C'
54 62
55 -LSU_set = {"RF00002", "RF02540", "RF02541", "RF02543", "RF02546"} # From Rfam CLAN 00112 63 +LSU_set = {"RF00002", "RF02540", "RF02541",
56 -SSU_set = {"RF00177", "RF02542", "RF02545", "RF01959", "RF01960"} # From Rfam CLAN 00111 64 + "RF02543", "RF02546"} # From Rfam CLAN 00112
65 +SSU_set = {"RF00177", "RF02542", "RF02545",
66 + "RF01959", "RF01960"} # From Rfam CLAN 00111
57 no_nts_set = set() 67 no_nts_set = set()
58 weird_mappings = set() 68 weird_mappings = set()
59 69
70 +
60 class SelectivePortionSelector(object): 71 class SelectivePortionSelector(object):
61 """Class passed to MMCIFIO to select some chain portions in an MMCIF file. 72 """Class passed to MMCIFIO to select some chain portions in an MMCIF file.
62 73
...@@ -101,7 +112,7 @@ class SelectivePortionSelector(object): ...@@ -101,7 +112,7 @@ class SelectivePortionSelector(object):
101 return 1 112 return 1
102 113
103 114
104 -class BufferingSummaryInfo(AlignInfo.SummaryInfo): 115 +class BufferingSummaryInfo(Bio.Align.AlignInfo.SummaryInfo):
105 116
106 def get_pssm(self, family, index): 117 def get_pssm(self, family, index):
107 """Create a position specific score matrix object for the alignment. 118 """Create a position specific score matrix object for the alignment.
...@@ -128,7 +139,7 @@ class BufferingSummaryInfo(AlignInfo.SummaryInfo): ...@@ -128,7 +139,7 @@ class BufferingSummaryInfo(AlignInfo.SummaryInfo):
128 score_dict[this_residue] = 1.0 139 score_dict[this_residue] = 1.0
129 pssm_info.append(('*', score_dict)) 140 pssm_info.append(('*', score_dict))
130 141
131 - return AlignInfo.PSSM(pssm_info) 142 + return Bio.Align.AlignInfo.PSSM(pssm_info)
132 143
133 144
134 class Chain: 145 class Chain:
...@@ -187,11 +198,11 @@ class Chain: ...@@ -187,11 +198,11 @@ class Chain:
187 198
188 with warnings.catch_warnings(): 199 with warnings.catch_warnings():
189 # Ignore the PDB problems. This mostly warns that some chain is discontinuous. 200 # Ignore the PDB problems. This mostly warns that some chain is discontinuous.
190 - warnings.simplefilter('ignore', PDBConstructionWarning) 201 + warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.PDBConstructionWarning)
191 - warnings.simplefilter('ignore', BiopythonWarning) 202 + warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning)
192 203
193 # Load the whole mmCIF into a Biopython structure object: 204 # Load the whole mmCIF into a Biopython structure object:
194 - mmcif_parser = MMCIFParser() 205 + mmcif_parser = Bio.PDB.MMCIFParser()
195 try: 206 try:
196 s = mmcif_parser.get_structure(self.pdb_id, path_to_3D_data + "RNAcifs/"+self.pdb_id+".cif") 207 s = mmcif_parser.get_structure(self.pdb_id, path_to_3D_data + "RNAcifs/"+self.pdb_id+".cif")
197 except ValueError as e: 208 except ValueError as e:
...@@ -212,7 +223,7 @@ class Chain: ...@@ -212,7 +223,7 @@ class Chain:
212 sel = SelectivePortionSelector(model_idx, self.pdb_chain_id, valid_set, khetatm) 223 sel = SelectivePortionSelector(model_idx, self.pdb_chain_id, valid_set, khetatm)
213 224
214 # Save that selection on the mmCIF object s to file 225 # Save that selection on the mmCIF object s to file
215 - ioobj = MMCIFIO() 226 + ioobj = Bio.PDB.mmcifio.MMCIFIO()
216 ioobj.set_structure(s) 227 ioobj.set_structure(s)
217 ioobj.save(self.file, sel) 228 ioobj.save(self.file, sel)
218 229
...@@ -253,7 +264,7 @@ class Chain: ...@@ -253,7 +264,7 @@ class Chain:
253 # Create the Pandas DataFrame for the nucleotides of the right chain 264 # Create the Pandas DataFrame for the nucleotides of the right chain
254 nts = json_object["nts"] # sub-json-object 265 nts = json_object["nts"] # sub-json-object
255 df = pd.DataFrame(nts) # conversion to dataframe 266 df = pd.DataFrame(nts) # conversion to dataframe
256 - df = df[ df.chain_name == self.pdb_chain_id ] # keeping only this chain's nucleotides 267 + df = df[df.chain_name == self.pdb_chain_id] # keeping only this chain's nucleotides
257 268
258 # Assert nucleotides of the chain are found 269 # Assert nucleotides of the chain are found
259 if df.empty: 270 if df.empty:
...@@ -266,12 +277,12 @@ class Chain: ...@@ -266,12 +277,12 @@ class Chain:
266 # Remove low pertinence or undocumented descriptors, convert angles values 277 # Remove low pertinence or undocumented descriptors, convert angles values
267 cols_we_keep = ["index_chain", "nt_resnum", "nt_name", "nt_code", "nt_id", "dbn", "alpha", "beta", "gamma", "delta", "epsilon", "zeta", 278 cols_we_keep = ["index_chain", "nt_resnum", "nt_name", "nt_code", "nt_id", "dbn", "alpha", "beta", "gamma", "delta", "epsilon", "zeta",
268 "epsilon_zeta", "bb_type", "chi", "glyco_bond", "form", "ssZp", "Dp", "eta", "theta", "eta_prime", "theta_prime", "eta_base", "theta_base", 279 "epsilon_zeta", "bb_type", "chi", "glyco_bond", "form", "ssZp", "Dp", "eta", "theta", "eta_prime", "theta_prime", "eta_base", "theta_base",
269 - "v0", "v1", "v2", "v3", "v4", "amplitude", "phase_angle", "puckering" ] 280 + "v0", "v1", "v2", "v3", "v4", "amplitude", "phase_angle", "puckering"]
270 df = df[cols_we_keep] 281 df = df[cols_we_keep]
271 - df.loc[:,['alpha', 'beta','gamma','delta','epsilon','zeta','epsilon_zeta','chi','v0', 'v1', 'v2', 'v3', 'v4', # Conversion to radians 282 + df.loc[:, ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'epsilon_zeta', 'chi', 'v0', 'v1', 'v2', 'v3', 'v4', # Conversion to radians
272 - 'eta','theta','eta_prime','theta_prime','eta_base','theta_base', 'phase_angle']] *= np.pi/180.0 283 + 'eta', 'theta', 'eta_prime', 'theta_prime', 'eta_base', 'theta_base', 'phase_angle']] *= np.pi/180.0
273 - df.loc[:,['alpha', 'beta','gamma','delta','epsilon','zeta','epsilon_zeta','chi','v0', 'v1', 'v2', 'v3', 'v4', # mapping [-pi, pi] into [0, 2pi] 284 + df.loc[:, ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'epsilon_zeta', 'chi', 'v0', 'v1', 'v2', 'v3', 'v4', # mapping [-pi, pi] into [0, 2pi]
274 - 'eta','theta','eta_prime','theta_prime','eta_base','theta_base', 'phase_angle']] %= (2.0*np.pi) 285 + 'eta', 'theta', 'eta_prime', 'theta_prime', 'eta_base', 'theta_base', 'phase_angle']] %= (2.0*np.pi)
275 286
276 except KeyError as e: 287 except KeyError as e:
277 warn(f"Error while parsing DSSR {self.pdb_id}.json output:{e}", error=True) 288 warn(f"Error while parsing DSSR {self.pdb_id}.json output:{e}", error=True)
...@@ -295,14 +306,14 @@ class Chain: ...@@ -295,14 +306,14 @@ class Chain:
295 # Duplicate residue numbers : shift numbering 306 # Duplicate residue numbers : shift numbering
296 while True in df.duplicated(['nt_resnum']).values: 307 while True in df.duplicated(['nt_resnum']).values:
297 i = df.duplicated(['nt_resnum']).values.tolist().index(True) 308 i = df.duplicated(['nt_resnum']).values.tolist().index(True)
298 - duplicates = df[df.nt_resnum == df.iloc[i,1]] 309 + duplicates = df[df.nt_resnum == df.iloc[i, 1]]
299 n_dup = len(duplicates.nt_resnum) 310 n_dup = len(duplicates.nt_resnum)
300 index_last_dup = duplicates.index_chain.iloc[-1] - 1 311 index_last_dup = duplicates.index_chain.iloc[-1] - 1
301 if self.mapping is not None: 312 if self.mapping is not None:
302 self.mapping.log(f"Shifting nt_resnum numbering because of {n_dup} duplicate residues {df.iloc[i,1]}") 313 self.mapping.log(f"Shifting nt_resnum numbering because of {n_dup} duplicate residues {df.iloc[i,1]}")
303 314
304 try: 315 try:
305 - if i > 0 and index_last_dup +1 < len(df.index) and df.iloc[i,1] == df.iloc[i-1,1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]: 316 + if i > 0 and index_last_dup + 1 < len(df.index) and df.iloc[i, 1] == df.iloc[i-1, 1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]:
306 # The redundant nts are consecutive in the chain (at the beginning at least), and there is a gap at the end 317 # The redundant nts are consecutive in the chain (at the beginning at least), and there is a gap at the end
307 318
308 if duplicates.iloc[n_dup-1, 0] - duplicates.iloc[0, 0] + 1 == n_dup: 319 if duplicates.iloc[n_dup-1, 0] - duplicates.iloc[0, 0] + 1 == n_dup:
...@@ -314,15 +325,15 @@ class Chain: ...@@ -314,15 +325,15 @@ class Chain:
314 else: 325 else:
315 # We solve the problem continuous component by continuous component 326 # We solve the problem continuous component by continuous component
316 for j in range(1, n_dup+1): 327 for j in range(1, n_dup+1):
317 - if duplicates.iloc[j,0] == 1 + duplicates.iloc[j-1,0]: # continuous 328 + if duplicates.iloc[j, 0] == 1 + duplicates.iloc[j-1, 0]: # continuous
318 - df.iloc[i+j-1,1] += 1 329 + df.iloc[i+j-1, 1] += 1
319 else: 330 else:
320 break 331 break
321 - elif df.iloc[i,1] == df.iloc[i-1,1]: 332 + elif df.iloc[i, 1] == df.iloc[i-1, 1]:
322 # Common 4v9q-DV case (and similar ones) : e.g. the chain contains 17 and 17A which are both read as 17 by DSSR. 333 # Common 4v9q-DV case (and similar ones) : e.g. the chain contains 17 and 17A which are both read as 17 by DSSR.
323 # Solution : we shift the numbering of 17A (to 18) and the following residues. 334 # Solution : we shift the numbering of 17A (to 18) and the following residues.
324 df.iloc[i:, 1] += 1 335 df.iloc[i:, 1] += 1
325 - elif duplicates.iloc[0,0] == 1 and df.iloc[i,0] == 3: 336 + elif duplicates.iloc[0, 0] == 1 and df.iloc[i, 0] == 3:
326 # 4wzo_1_1J case, there is a residue numbered -1 and read as 1 before the number 0. 337 # 4wzo_1_1J case, there is a residue numbered -1 and read as 1 before the number 0.
327 df.iloc[1:, 1] += 1 338 df.iloc[1:, 1] += 1
328 df.iloc[0, 1] = 0 339 df.iloc[0, 1] = 0
...@@ -340,12 +351,16 @@ class Chain: ...@@ -340,12 +351,16 @@ class Chain:
340 351
341 # Search for ligands at the end of the selection 352 # Search for ligands at the end of the selection
342 # Drop ligands detected as residues by DSSR, by detecting several markers 353 # Drop ligands detected as residues by DSSR, by detecting several markers
343 - while ( len(df.index_chain) and df.iloc[-1,2] not in ["A", "C", "G", "U"] and ( 354 + while (
344 - (df.iloc[[-1]][["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "v0", "v1", "v2", "v3", "v4"]].isna().values).all() 355 + len(df.index_chain) and df.iloc[-1, 2] not in ["A", "C", "G", "U"]
345 - or (df.iloc[[-1]].puckering=='').any() 356 + and (
357 + (df.iloc[[-1]][["alpha", "beta", "gamma", "delta", "epsilon",
358 + "zeta", "v0", "v1", "v2", "v3", "v4"]].isna().values).all()
359 + or (df.iloc[[-1]].puckering == '').any()
346 ) 360 )
347 - or ( len(df.index_chain) >= 2 and df.iloc[-1,1] > 50 + df.iloc[-2,1] ) # large nt_resnum gap between the two last residues 361 + # large nt_resnum gap between the two last residues
348 - or ( len(df.index_chain) and df.iloc[-1,2] in ["GNG", "E2C", "OHX", "IRI", "MPD", "8UZ"] ) 362 + or (len(df.index_chain) >= 2 and df.iloc[-1, 1] > 50 + df.iloc[-2, 1])
363 + or (len(df.index_chain) and df.iloc[-1, 2] in ["GNG", "E2C", "OHX", "IRI", "MPD", "8UZ"])
349 ): 364 ):
350 if self.mapping is not None: 365 if self.mapping is not None:
351 self.mapping.log("Droping ligand:") 366 self.mapping.log("Droping ligand:")
...@@ -390,17 +405,19 @@ class Chain: ...@@ -390,17 +405,19 @@ class Chain:
390 break 405 break
391 if found: 406 if found:
392 self.mapping.log(f"Residue {i+1+self.mapping.st}-{self.mapping.st} = {i+1} has been saved and renumbered {df.iloc[i,1]} instead of {found['nt_id'].replace(found['chain_name']+ '.' + found['nt_name'], '').replace('^','')}") 407 self.mapping.log(f"Residue {i+1+self.mapping.st}-{self.mapping.st} = {i+1} has been saved and renumbered {df.iloc[i,1]} instead of {found['nt_id'].replace(found['chain_name']+ '.' + found['nt_name'], '').replace('^','')}")
393 - df_row = pd.DataFrame([found], index=[i])[df.columns.values] 408 + df_row = pd.DataFrame([found], index=[i])[
394 - df_row.iloc[0,0] = i+1 # index_chain 409 + df.columns.values]
395 - df_row.iloc[0,1] = df.iloc[i,1] # nt_resnum 410 + df_row.iloc[0, 0] = i+1 # index_chain
396 - df = pd.concat([ df.iloc[:i], df_row, df.iloc[i:] ]) 411 + df_row.iloc[0, 1] = df.iloc[i, 1] # nt_resnum
412 + df = pd.concat([df.iloc[:i], df_row, df.iloc[i:]])
397 df.iloc[i+1:, 1] += 1 413 df.iloc[i+1:, 1] += 1
398 else: 414 else:
399 warn(f"Missing index_chain {i} in {self.chain_label} !") 415 warn(f"Missing index_chain {i} in {self.chain_label} !")
400 416
401 # Assert some nucleotides still exist 417 # Assert some nucleotides still exist
402 try: 418 try:
403 - l = df.iloc[-1,1] - df.iloc[0,1] + 1 # update length of chain from nt_resnum point of view 419 + # update length of chain from nt_resnum point of view
420 + l = df.iloc[-1, 1] - df.iloc[0, 1] + 1
404 except IndexError: 421 except IndexError:
405 warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} between {self.mapping.nt_start} and " 422 warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} between {self.mapping.nt_start} and "
406 f"{self.mapping.nt_end} ({'not ' if not self.mapping.inferred else ''}inferred). Ignoring chain {self.chain_label}.") 423 f"{self.mapping.nt_end} ({'not ' if not self.mapping.inferred else ''}inferred). Ignoring chain {self.chain_label}.")
...@@ -426,14 +443,17 @@ class Chain: ...@@ -426,14 +443,17 @@ class Chain:
426 # index_chain 1 |-------------|77 83|------------| 154 443 # index_chain 1 |-------------|77 83|------------| 154
427 # expected data point 1 |--------------------------------| 154 444 # expected data point 1 |--------------------------------| 154
428 # 445 #
446 +
429 if l != len(df['index_chain']): # if some residues are missing, len(df['index_chain']) < l 447 if l != len(df['index_chain']): # if some residues are missing, len(df['index_chain']) < l
430 - resnum_start = df.iloc[0,1] 448 + resnum_start = df.iloc[0, 1]
431 - diff = set(range(l)).difference(df['nt_resnum'] - resnum_start) # the rowIDs the missing nucleotides would have (rowID = index_chain - 1 = nt_resnum - resnum_start) 449 + # the rowIDs the missing nucleotides would have (rowID = index_chain - 1 = nt_resnum - resnum_start)
450 + diff = set(range(l)).difference(df['nt_resnum'] - resnum_start)
432 for i in sorted(diff): 451 for i in sorted(diff):
433 # Add a row at position i 452 # Add a row at position i
434 - df = pd.concat([ df.iloc[:i], 453 + df = pd.concat([df.iloc[:i],
435 - pd.DataFrame({"index_chain": i+1, "nt_resnum": i+resnum_start, "nt_id":"not resolved", "nt_code":'-', "nt_name":'-'}, index=[i]), 454 + pd.DataFrame({"index_chain": i+1, "nt_resnum": i+resnum_start,
436 - df.iloc[i:] ]) 455 + "nt_id": "not resolved", "nt_code": '-', "nt_name": '-'}, index=[i]),
456 + df.iloc[i:]])
437 # Increase the index_chain of all following lines 457 # Increase the index_chain of all following lines
438 df.iloc[i+1:, 0] += 1 458 df.iloc[i+1:, 0] += 1
439 df = df.reset_index(drop=True) 459 df = df.reset_index(drop=True)
...@@ -444,27 +464,27 @@ class Chain: ...@@ -444,27 +464,27 @@ class Chain:
444 ####################################### 464 #######################################
445 465
446 # Add a sequence column just for the alignments 466 # Add a sequence column just for the alignments
447 - df['nt_align_code'] = [ str(x).upper() 467 + df['nt_align_code'] = [str(x).upper()
448 .replace('NAN', '-') # Unresolved nucleotides are gaps 468 .replace('NAN', '-') # Unresolved nucleotides are gaps
449 .replace('?', '-') # Unidentified residues, let's delete them 469 .replace('?', '-') # Unidentified residues, let's delete them
450 .replace('T', 'U') # 5MU are modified to t, which gives T 470 .replace('T', 'U') # 5MU are modified to t, which gives T
451 .replace('P', 'U') # Pseudo-uridines, but it is not really right to change them to U, see DSSR paper, Fig 2 471 .replace('P', 'U') # Pseudo-uridines, but it is not really right to change them to U, see DSSR paper, Fig 2
452 - for x in df['nt_code'] ] 472 + for x in df['nt_code']]
453 473
454 # One-hot encoding sequence 474 # One-hot encoding sequence
455 - df["is_A"] = [ 1 if x=="A" else 0 for x in df["nt_code"] ] 475 + df["is_A"] = [1 if x == "A" else 0 for x in df["nt_code"]]
456 - df["is_C"] = [ 1 if x=="C" else 0 for x in df["nt_code"] ] 476 + df["is_C"] = [1 if x == "C" else 0 for x in df["nt_code"]]
457 - df["is_G"] = [ 1 if x=="G" else 0 for x in df["nt_code"] ] 477 + df["is_G"] = [1 if x == "G" else 0 for x in df["nt_code"]]
458 - df["is_U"] = [ 1 if x=="U" else 0 for x in df["nt_code"] ] 478 + df["is_U"] = [1 if x == "U" else 0 for x in df["nt_code"]]
459 - df["is_other"] = [ 0 if x in "ACGU" else 1 for x in df["nt_code"] ] 479 + df["is_other"] = [0 if x in "ACGU" else 1 for x in df["nt_code"]]
460 df["nt_position"] = [ float(i+1)/self.full_length for i in range(self.full_length) ] 480 df["nt_position"] = [ float(i+1)/self.full_length for i in range(self.full_length) ]
461 481
462 # Iterate over pairs to identify base-base interactions 482 # Iterate over pairs to identify base-base interactions
463 res_ids = list(df['nt_id']) # things like "chainID.C4, chainID.U5" 483 res_ids = list(df['nt_id']) # things like "chainID.C4, chainID.U5"
464 - paired = [ '' ] * self.full_length 484 + paired = [''] * self.full_length
465 - pair_type_LW = [ '' ] * self.full_length 485 + pair_type_LW = [''] * self.full_length
466 - pair_type_DSSR = [ '' ] * self.full_length 486 + pair_type_DSSR = [''] * self.full_length
467 - interacts = [ 0 ] * self.full_length 487 + interacts = [0] * self.full_length
468 if "pairs" in json_object.keys(): 488 if "pairs" in json_object.keys():
469 pairs = json_object["pairs"] 489 pairs = json_object["pairs"]
470 for p in pairs: 490 for p in pairs:
...@@ -506,17 +526,19 @@ class Chain: ...@@ -506,17 +526,19 @@ class Chain:
506 paired[nt2_idx] += ',' + str(nt1_idx + 1) 526 paired[nt2_idx] += ',' + str(nt1_idx + 1)
507 527
508 # transform nt_id to shorter values 528 # transform nt_id to shorter values
509 - df['old_nt_resnum'] = [ n.replace(self.pdb_chain_id+'.'+name, '').replace('^','').replace('/','') for n, name in zip(df.nt_id, df.nt_name) ] 529 + df['old_nt_resnum'] = [ n.replace(self.pdb_chain_id+'.'+name, '').replace('^', '').replace('/', '') for n, name in zip(df.nt_id, df.nt_name) ]
510 530
511 df['paired'] = paired 531 df['paired'] = paired
512 df['pair_type_LW'] = pair_type_LW 532 df['pair_type_LW'] = pair_type_LW
513 df['pair_type_DSSR'] = pair_type_DSSR 533 df['pair_type_DSSR'] = pair_type_DSSR
514 df['nb_interact'] = interacts 534 df['nb_interact'] = interacts
515 - df = df.drop(['nt_id', 'nt_resnum'], axis=1) # remove now useless descriptors 535 +
536 + # remove now useless descriptors
537 + df = df.drop(['nt_id', 'nt_resnum'], axis=1)
516 538
517 self.seq = "".join(df.nt_code) 539 self.seq = "".join(df.nt_code)
518 self.seq_to_align = "".join(df.nt_align_code) 540 self.seq_to_align = "".join(df.nt_align_code)
519 - self.length = len([ x for x in self.seq_to_align if x != "-" ]) 541 + self.length = len([x for x in self.seq_to_align if x != "-"])
520 542
521 # Remove too short chains 543 # Remove too short chains
522 if self.length < 5: 544 if self.length < 5:
...@@ -559,7 +581,8 @@ class Chain: ...@@ -559,7 +581,8 @@ class Chain:
559 WHERE structure_id='{self.pdb_id}' 581 WHERE structure_id='{self.pdb_id}'
560 AND chain_name='{self.pdb_chain_id}' 582 AND chain_name='{self.pdb_chain_id}'
561 AND rfam_acc='{self.mapping.rfam_acc}' 583 AND rfam_acc='{self.mapping.rfam_acc}'
562 - AND eq_class='{self.eq_class}';""")[0][0] 584 + AND eq_class='{self.eq_class}';"""
585 + )[0][0]
563 else: 586 else:
564 sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, eq_class, issue) VALUES (?, ?, 'unmappd', ?, ?) 587 sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, eq_class, issue) VALUES (?, ?, 'unmappd', ?, ?)
565 ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue, eq_class=excluded.eq_class;""", 588 ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue, eq_class=excluded.eq_class;""",
...@@ -568,19 +591,18 @@ class Chain: ...@@ -568,19 +591,18 @@ class Chain:
568 WHERE structure_id='{self.pdb_id}' 591 WHERE structure_id='{self.pdb_id}'
569 AND chain_name='{self.pdb_chain_id}' 592 AND chain_name='{self.pdb_chain_id}'
570 AND eq_class='{self.eq_class}' 593 AND eq_class='{self.eq_class}'
571 - AND rfam_acc = 'unmappd';""")[0][0] 594 + AND rfam_acc = 'unmappd';"""
595 + )[0][0]
572 596
573 # Add the nucleotides if the chain is not an issue 597 # Add the nucleotides if the chain is not an issue
574 if df is not None and not self.delete_me: # double condition is theoretically redundant here, but you never know 598 if df is not None and not self.delete_me: # double condition is theoretically redundant here, but you never know
575 - sql_execute(conn, f""" 599 + sql_execute(conn, f"""INSERT OR IGNORE INTO nucleotide
576 - INSERT OR IGNORE INTO nucleotide
577 (chain_id, index_chain, nt_name, nt_code, dbn, alpha, beta, gamma, delta, epsilon, zeta, 600 (chain_id, index_chain, nt_name, nt_code, dbn, alpha, beta, gamma, delta, epsilon, zeta,
578 epsilon_zeta, bb_type, chi, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base, 601 epsilon_zeta, bb_type, chi, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base,
579 v0, v1, v2, v3, v4, amplitude, phase_angle, puckering, nt_align_code, is_A, is_C, is_G, is_U, is_other, nt_position, 602 v0, v1, v2, v3, v4, amplitude, phase_angle, puckering, nt_align_code, is_A, is_C, is_G, is_U, is_other, nt_position,
580 old_nt_resnum, paired, pair_type_LW, pair_type_DSSR, nb_interact) 603 old_nt_resnum, paired, pair_type_LW, pair_type_DSSR, nb_interact)
581 VALUES ({self.db_chain_id}, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 604 VALUES ({self.db_chain_id}, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
582 - ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 605 + ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""",
583 - ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""",
584 many=True, data=list(df.to_records(index=False)), warn_every=10) 606 many=True, data=list(df.to_records(index=False)), warn_every=10)
585 607
586 def remap(self, columns_to_save, s_seq): 608 def remap(self, columns_to_save, s_seq):
...@@ -598,40 +620,39 @@ class Chain: ...@@ -598,40 +620,39 @@ class Chain:
598 # Save columns in the appropriate positions 620 # Save columns in the appropriate positions
599 i = 0 621 i = 0
600 j = 0 622 j = 0
601 - while i<self.full_length and j<alilen: 623 + while i < self.full_length and j < alilen:
602 # Here we try to map self.seq_to_align (the sequence of the 3D chain, including gaps when residues are missing), 624 # Here we try to map self.seq_to_align (the sequence of the 3D chain, including gaps when residues are missing),
603 # with s_seq, the sequence aligned in the MSA, containing any of ACGU and two types of gaps, - and . 625 # with s_seq, the sequence aligned in the MSA, containing any of ACGU and two types of gaps, - and .
604 626
605 if self.seq_to_align[i] == s_seq[j].upper(): # alignment and sequence correspond (incl. gaps) 627 if self.seq_to_align[i] == s_seq[j].upper(): # alignment and sequence correspond (incl. gaps)
606 - re_mappings.append( (self.db_chain_id, i+1, j+1) ) # because index_chain in table nucleotide is in [1,N], we use i+1 and j+1. 628 + re_mappings.append((self.db_chain_id, i+1, j+1)) # because index_chain in table nucleotide is in [1,N], we use i+1 and j+1.
607 columns_to_save.add(j+1) # it's a set, duplicates are automatically ignored 629 columns_to_save.add(j+1) # it's a set, duplicates are automatically ignored
608 i += 1 630 i += 1
609 j += 1 631 j += 1
610 elif self.seq_to_align[i] == '-': # gap in the chain, but not in the aligned sequence 632 elif self.seq_to_align[i] == '-': # gap in the chain, but not in the aligned sequence
611 -
612 # search for a gap to the consensus nearby 633 # search for a gap to the consensus nearby
613 k = 0 # Search must start at zero to assert the difference comes from '-' in front of '.' 634 k = 0 # Search must start at zero to assert the difference comes from '-' in front of '.'
614 - while j+k<alilen and s_seq[j+k] == '.': 635 + while j+k < alilen and s_seq[j+k] == '.':
615 k += 1 636 k += 1
616 637
617 # if found, set j to that position 638 # if found, set j to that position
618 - if j+k<alilen and s_seq[j+k] == '-': 639 + if j+k < alilen and s_seq[j+k] == '-':
619 - re_mappings.append( (self.db_chain_id, i+1, j+k+1) ) 640 + re_mappings.append((self.db_chain_id, i+1, j+k+1))
620 columns_to_save.add(j+k+1) 641 columns_to_save.add(j+k+1)
621 i += 1 642 i += 1
622 j += k+1 643 j += k+1
623 continue 644 continue
624 645
625 # if not, take the insertion gap if this is one 646 # if not, take the insertion gap if this is one
626 - if j<alilen and s_seq[j] == '.': 647 + if j < alilen and s_seq[j] == '.':
627 - re_mappings.append( (self.db_chain_id, i+1, j+1) ) 648 + re_mappings.append((self.db_chain_id, i+1, j+1))
628 columns_to_save.add(j+1) 649 columns_to_save.add(j+1)
629 i += 1 650 i += 1
630 j += 1 651 j += 1
631 continue 652 continue
632 653
633 # else, just mark the gap as unknown (there is an alignment mismatch) 654 # else, just mark the gap as unknown (there is an alignment mismatch)
634 - re_mappings.append( (self.db_chain_id, i+1, 0) ) 655 + re_mappings.append((self.db_chain_id, i+1, 0))
635 i += 1 656 i += 1
636 elif s_seq[j] in ['.', '-']: # gap in the alignment, but not in the real chain 657 elif s_seq[j] in ['.', '-']: # gap in the alignment, but not in the real chain
637 j += 1 # ignore the column 658 j += 1 # ignore the column
...@@ -672,7 +693,7 @@ class Chain: ...@@ -672,7 +693,7 @@ class Chain:
672 l = letters[freq.index(max(freq))] 693 l = letters[freq.index(max(freq))]
673 c_seq_to_align[i] = l 694 c_seq_to_align[i] = l
674 c_seq[i] = l 695 c_seq[i] = l
675 - gaps.append((l, l=='A', l=='C', l=='G', l=='U', l=='N', self.db_chain_id, i+1 )) 696 + gaps.append((l, l == 'A', l == 'C', l == 'G', l == 'U', l == 'N', self.db_chain_id, i+1))
676 self.seq_to_align = ''.join(c_seq_to_align) 697 self.seq_to_align = ''.join(c_seq_to_align)
677 self.seq = ''.join(c_seq) 698 self.seq = ''.join(c_seq)
678 return gaps 699 return gaps
...@@ -684,6 +705,7 @@ class Job: ...@@ -684,6 +705,7 @@ class Job:
684 This could be a system command or the execution of a Python function. 705 This could be a system command or the execution of a Python function.
685 Time and memory usage of a job can be monitored. 706 Time and memory usage of a job can be monitored.
686 """ 707 """
708 +
687 def __init__(self, results="", command=[], function=None, args=[], how_many_in_parallel=0, priority=1, timeout=None, checkFunc=None, checkArgs=[], label=""): 709 def __init__(self, results="", command=[], function=None, args=[], how_many_in_parallel=0, priority=1, timeout=None, checkFunc=None, checkArgs=[], label=""):
688 self.cmd_ = command # A system command to run 710 self.cmd_ = command # A system command to run
689 self.func_ = function # A python function to run 711 self.func_ = function # A python function to run
...@@ -709,7 +731,8 @@ class Job: ...@@ -709,7 +731,8 @@ class Job:
709 if self.func_ is None: 731 if self.func_ is None:
710 s = f"{self.priority_}({self.nthreads}) [{self.comp_time}]\t{self.label:25}" + " ".join(self.cmd_) 732 s = f"{self.priority_}({self.nthreads}) [{self.comp_time}]\t{self.label:25}" + " ".join(self.cmd_)
711 else: 733 else:
712 - s = f"{self.priority_}({self.nthreads}) [{self.comp_time}]\t{self.label:25}{self.func_.__name__}(" + " ".join([str(a) for a in self.args_]) + ")" 734 + s = f"{self.priority_}({self.nthreads}) [{self.comp_time}]\t{self.label:25}{self.func_.__name__}(" \
735 + + " ".join([ str(a) for a in self.args_ ]) + ")"
713 return s 736 return s
714 737
715 738
...@@ -767,13 +790,14 @@ class Downloader: ...@@ -767,13 +790,14 @@ class Downloader:
767 print("> Fetching latest PDB mappings from Rfam..." + " " * 29, end='', flush=True) 790 print("> Fetching latest PDB mappings from Rfam..." + " " * 29, end='', flush=True)
768 try: 791 try:
769 db_connection = sqlalchemy.create_engine('mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam') 792 db_connection = sqlalchemy.create_engine('mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam')
770 - mappings = pd.read_sql('SELECT rfam_acc, pdb_id, chain, pdb_start, pdb_end, bit_score, evalue_score, cm_start, cm_end, hex_colour FROM pdb_full_region WHERE is_significant=1;', con=db_connection) 793 + mappings = pd.read_sql('SELECT rfam_acc, pdb_id, chain, pdb_start, pdb_end, bit_score, evalue_score, cm_start, cm_end, hex_colour FROM pdb_full_region WHERE is_significant=1;',
794 + con=db_connection)
771 mappings.to_csv(runDir + "/data/Rfam-PDB-mappings.csv") 795 mappings.to_csv(runDir + "/data/Rfam-PDB-mappings.csv")
772 print(f"\t{validsymb}") 796 print(f"\t{validsymb}")
773 except sqlalchemy.exc.OperationalError: # Cannot connect :'( 797 except sqlalchemy.exc.OperationalError: # Cannot connect :'(
774 print(f"\t{errsymb}") 798 print(f"\t{errsymb}")
775 # Check if a previous run succeeded (if file exists, use it) 799 # Check if a previous run succeeded (if file exists, use it)
776 - if path.isfile(runDir + "/data/Rfam-PDB-mappings.csv"): 800 + if os.path.isfile(runDir + "/data/Rfam-PDB-mappings.csv"):
777 print("\t> Using previous version.") 801 print("\t> Using previous version.")
778 mappings = pd.read_csv(runDir + "/data/Rfam-PDB-mappings.csv") 802 mappings = pd.read_csv(runDir + "/data/Rfam-PDB-mappings.csv")
779 else: # otherwise, abort. 803 else: # otherwise, abort.
...@@ -791,7 +815,7 @@ class Downloader: ...@@ -791,7 +815,7 @@ class Downloader:
791 setproctitle(f"RNANet.py download_Rfam_cm()") 815 setproctitle(f"RNANet.py download_Rfam_cm()")
792 816
793 print(f"\t> Download Rfam.cm.gz from Rfam..." + " " * 37, end='', flush=True) 817 print(f"\t> Download Rfam.cm.gz from Rfam..." + " " * 37, end='', flush=True)
794 - if not path.isfile(path_to_seq_data + "Rfam.cm"): 818 + if not os.path.isfile(path_to_seq_data + "Rfam.cm"):
795 try: 819 try:
796 subprocess.run(["wget", "ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.cm.gz", "-O", path_to_seq_data + "Rfam.cm.gz"]) 820 subprocess.run(["wget", "ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.cm.gz", "-O", path_to_seq_data + "Rfam.cm.gz"])
797 print(f"\t{validsymb}", flush=True) 821 print(f"\t{validsymb}", flush=True)
...@@ -815,7 +839,6 @@ class Downloader: ...@@ -815,7 +839,6 @@ class Downloader:
815 try: 839 try:
816 db_connection = sqlalchemy.create_engine('mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam') 840 db_connection = sqlalchemy.create_engine('mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam')
817 841
818 -
819 # Prepare the SQL query. It computes the length of the chains and gets the maximum length by family. 842 # Prepare the SQL query. It computes the length of the chains and gets the maximum length by family.
820 q = """SELECT stats.rfam_acc, k.description, stats.maxlength FROM 843 q = """SELECT stats.rfam_acc, k.description, stats.maxlength FROM
821 (SELECT fr.rfam_acc, MAX( 844 (SELECT fr.rfam_acc, MAX(
...@@ -838,15 +861,17 @@ class Downloader: ...@@ -838,15 +861,17 @@ class Downloader:
838 d = pd.read_sql(q, con=db_connection) 861 d = pd.read_sql(q, con=db_connection)
839 862
840 # filter the results to families we are interested in 863 # filter the results to families we are interested in
841 - d = d[ d["rfam_acc"].isin(list_of_families) ] 864 + d = d[d["rfam_acc"].isin(list_of_families)]
842 865
843 print(d) 866 print(d)
844 867
845 with sqlite3.connect(runDir + "/results/RNANet.db", timeout=20.0) as conn: 868 with sqlite3.connect(runDir + "/results/RNANet.db", timeout=20.0) as conn:
846 - sql_execute(conn, """ 869 + # We use the REPLACE keyword to get the latest information
847 - INSERT OR REPLACE INTO family (rfam_acc, description, max_len) 870 + sql_execute(conn, """INSERT OR REPLACE INTO family (rfam_acc, description, max_len)
848 - VALUES (?, ?, ?);""", many=True, data=list(d.to_records(index=False)) 871 + VALUES (?, ?, ?);""",
849 - ) # We use the replace keyword to get the latest information 872 + many=True,
873 + data=list(d.to_records(index=False))
874 + )
850 875
851 except sqlalchemy.exc.OperationalError: 876 except sqlalchemy.exc.OperationalError:
852 warn("Something's wrong with the SQL database. Check mysql-rfam-public.ebi.ac.uk status and try again later. Not printing statistics.") 877 warn("Something's wrong with the SQL database. Check mysql-rfam-public.ebi.ac.uk status and try again later. Not printing statistics.")
...@@ -858,10 +883,11 @@ class Downloader: ...@@ -858,10 +883,11 @@ class Downloader:
858 883
859 setproctitle(f"RNANet.py download_Rfam_sequences({rfam_acc})") 884 setproctitle(f"RNANet.py download_Rfam_sequences({rfam_acc})")
860 885
861 - if not path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"): 886 + if not os.path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"):
862 for _ in range(10): # retry up to 10 times if it fails 887 for _ in range(10): # retry up to 10 times if it fails
863 try: 888 try:
864 - subprocess.run(["wget", f'ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/fasta_files/{rfam_acc}.fa.gz', "-O", path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"], stdout=subprocess.DEVNULL) 889 + subprocess.run(["wget", f'ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/fasta_files/{rfam_acc}.fa.gz', "-O",
890 + path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
865 notify(f"Downloaded {rfam_acc}.fa.gz from Rfam") 891 notify(f"Downloaded {rfam_acc}.fa.gz from Rfam")
866 return # if it worked, no need to retry 892 return # if it worked, no need to retry
867 except Exception as e: 893 except Exception as e:
...@@ -881,8 +907,9 @@ class Downloader: ...@@ -881,8 +907,9 @@ class Downloader:
881 907
882 setproctitle(f"RNANet.py download_BGSU_NR_list({res})") 908 setproctitle(f"RNANet.py download_BGSU_NR_list({res})")
883 909
884 - nr_code = min([ i for i in [1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 20.0] if i >= res ]) 910 + nr_code = min([i for i in [1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 20.0] if i >= res])
885 print(f"> Fetching latest list of RNA files at {nr_code} A resolution from BGSU website...", end='', flush=True) 911 print(f"> Fetching latest list of RNA files at {nr_code} A resolution from BGSU website...", end='', flush=True)
912 +
886 # Download latest BGSU non-redundant list 913 # Download latest BGSU non-redundant list
887 try: 914 try:
888 s = requests.get(f"http://rna.bgsu.edu/rna3dhub/nrlist/download/current/{nr_code}A/csv").content 915 s = requests.get(f"http://rna.bgsu.edu/rna3dhub/nrlist/download/current/{nr_code}A/csv").content
...@@ -894,13 +921,13 @@ class Downloader: ...@@ -894,13 +921,13 @@ class Downloader:
894 warn("Error downloading NR list !\t", error=True) 921 warn("Error downloading NR list !\t", error=True)
895 922
896 # Try to read previous file 923 # Try to read previous file
897 - if path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"): 924 + if os.path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"):
898 - print("\t> Use of the previous version.\t", end = "", flush=True) 925 + print("\t> Use of the previous version.\t", end="", flush=True)
899 else: 926 else:
900 return pd.DataFrame([], columns=["class", "class_members"]) 927 return pd.DataFrame([], columns=["class", "class_members"])
901 928
902 nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv") 929 nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv")
903 - full_structures_list = [ tuple(i[1]) for i in nrlist[['class','class_members']].iterrows() ] 930 + full_structures_list = [ tuple(i[1]) for i in nrlist[['class', 'class_members']].iterrows() ]
904 print(f"\t{validsymb}", flush=True) 931 print(f"\t{validsymb}", flush=True)
905 932
906 # The beginning of an adventure. 933 # The beginning of an adventure.
...@@ -910,14 +937,15 @@ class Downloader: ...@@ -910,14 +937,15 @@ class Downloader:
910 937
911 setproctitle(f"RNANet.py download_from_SILVA({unit})") 938 setproctitle(f"RNANet.py download_from_SILVA({unit})")
912 939
913 - 940 + if not os.path.isfile(path_to_seq_data + f"realigned/{unit}.arb"):
914 - if not path.isfile(path_to_seq_data + f"realigned/{unit}.arb"):
915 try: 941 try:
916 print(f"Downloading {unit} from SILVA...", end='', flush=True) 942 print(f"Downloading {unit} from SILVA...", end='', flush=True)
917 - if unit=="LSU": 943 + if unit == "LSU":
918 - subprocess.run(["wget", "http://www.arb-silva.de/fileadmin/arb_web_db/release_132/ARB_files/SILVA_132_LSURef_07_12_17_opt.arb.gz", "-O", path_to_seq_data + "realigned/LSU.arb.gz"]) 944 + subprocess.run(["wget", "-nv", "http://www.arb-silva.de/fileadmin/arb_web_db/release_132/ARB_files/SILVA_132_LSURef_07_12_17_opt.arb.gz",
945 + "-O", path_to_seq_data + "realigned/LSU.arb.gz"])
919 else: 946 else:
920 - subprocess.run(["wget", "http://www.arb-silva.de/fileadmin/silva_databases/release_138/ARB_files/SILVA_138_SSURef_05_01_20_opt.arb.gz", "-O", path_to_seq_data + "realigned/SSU.arb.gz"]) 947 + subprocess.run(["wget", "-nv", "http://www.arb-silva.de/fileadmin/silva_databases/release_138/ARB_files/SILVA_138_SSURef_05_01_20_opt.arb.gz",
948 + "-O", path_to_seq_data + "realigned/SSU.arb.gz"])
921 except: 949 except:
922 warn(f"Error downloading the {unit} database from SILVA", error=True) 950 warn(f"Error downloading the {unit} database from SILVA", error=True)
923 exit(1) 951 exit(1)
...@@ -949,7 +977,8 @@ class Mapping: ...@@ -949,7 +977,8 @@ class Mapping:
949 977
950 def filter_df(self, df): 978 def filter_df(self, df):
951 979
952 - newdf = df.drop(df[(df.nt_resnum < self.nt_start) | (df.nt_resnum > self.nt_end)].index) 980 + newdf = df.drop(df[(df.nt_resnum < self.nt_start) |
981 + (df.nt_resnum > self.nt_end)].index)
953 982
954 if len(newdf.index_chain) > 0: 983 if len(newdf.index_chain) > 0:
955 # everything's okay 984 # everything's okay
...@@ -961,19 +990,20 @@ class Mapping: ...@@ -961,19 +990,20 @@ class Mapping:
961 # index_chain and not nt_resnum. 990 # index_chain and not nt_resnum.
962 warn(f"Assuming mapping to {self.rfam_acc} is an absolute position interval.") 991 warn(f"Assuming mapping to {self.rfam_acc} is an absolute position interval.")
963 weird_mappings.add(self.chain_label + "." + self.rfam_acc) 992 weird_mappings.add(self.chain_label + "." + self.rfam_acc)
964 - df = df.drop(df[(df.index_chain < self.nt_start) | (df.index_chain > self.nt_end)].index) 993 + df = df.drop(df[(df.index_chain < self.nt_start) |
994 + (df.index_chain > self.nt_end)].index)
965 995
966 # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one 996 # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one
967 self.st = 0 997 self.st = 0
968 - if len(df.index_chain) and df.iloc[0,0] != 1: 998 + if len(df.index_chain) and df.iloc[0, 0] != 1:
969 - self.st = df.iloc[0,0] -1 999 + self.st = df.iloc[0, 0] - 1
970 df.iloc[:, 0] -= self.st 1000 df.iloc[:, 0] -= self.st
971 self.log(f"Shifting index_chain of {self.st}") 1001 self.log(f"Shifting index_chain of {self.st}")
972 1002
973 # Check that some residues are not included by mistake: 1003 # Check that some residues are not included by mistake:
974 # e.g. 4v4t-AA.RF00382-20-55 contains 4 residues numbered 30 but actually far beyond the mapped part, 1004 # e.g. 4v4t-AA.RF00382-20-55 contains 4 residues numbered 30 but actually far beyond the mapped part,
975 # because the icode are not read by DSSR. 1005 # because the icode are not read by DSSR.
976 - toremove = df[ df.index_chain > self.nt_end ] 1006 + toremove = df[df.index_chain > self.nt_end]
977 if not toremove.empty: 1007 if not toremove.empty:
978 df = df.drop(toremove.index) 1008 df = df.drop(toremove.index)
979 self.log(f"Some nt_resnum values are likely to be wrong, not considering residues:") 1009 self.log(f"Some nt_resnum values are likely to be wrong, not considering residues:")
...@@ -991,9 +1021,9 @@ class Mapping: ...@@ -991,9 +1021,9 @@ class Mapping:
991 if self.logs == []: 1021 if self.logs == []:
992 return # Do not create a log file if there is nothing to log 1022 return # Do not create a log file if there is nothing to log
993 1023
994 - if not path.exists("logs"): 1024 + if not os.path.exists(runDir+"/logs"):
995 - os.makedirs("logs", exist_ok=True) 1025 + os.makedirs(runDir+"/logs", exist_ok=True)
996 - with open("logs/"+filename, "w") as f: 1026 + with open(runDir+"/logs/"+filename, "w") as f:
997 f.writelines(self.logs) 1027 f.writelines(self.logs)
998 1028
999 1029
...@@ -1019,20 +1049,23 @@ class Pipeline: ...@@ -1019,20 +1049,23 @@ class Pipeline:
1019 self.SELECT_ONLY = None 1049 self.SELECT_ONLY = None
1020 self.ARCHIVE = False 1050 self.ARCHIVE = False
1021 self.SAVELOGS = True 1051 self.SAVELOGS = True
1052 + self.FULLINFERENCE = False
1022 1053
1023 def process_options(self): 1054 def process_options(self):
1024 - """Sets the paths and options of the pipeline""" 1055 + """Sets the paths and options of the pipeline
1056 + """
1057 +
1025 global path_to_3D_data 1058 global path_to_3D_data
1026 global path_to_seq_data 1059 global path_to_seq_data
1027 1060
1028 setproctitle("RNANet.py process_options()") 1061 setproctitle("RNANet.py process_options()")
1029 1062
1030 try: 1063 try:
1031 - opts, _ = getopt.getopt( sys.argv[1:], "r:hs", 1064 + opts, _ = getopt.getopt(sys.argv[1:], "r:fhs",
1032 - [ "help", "resolution=", "keep-hetatm=", "from-scratch", 1065 + ["help", "resolution=", "keep-hetatm=", "from-scratch", "full-inference",
1033 "fill-gaps=", "3d-folder=", "seq-folder=", 1066 "fill-gaps=", "3d-folder=", "seq-folder=",
1034 "no-homology", "ignore-issues", "extract", "only=", "all", "no-logs", 1067 "no-homology", "ignore-issues", "extract", "only=", "all", "no-logs",
1035 - "archive", "update-homologous" ]) 1068 + "archive", "update-homologous"])
1036 except getopt.GetoptError as err: 1069 except getopt.GetoptError as err:
1037 print(err) 1070 print(err)
1038 sys.exit(2) 1071 sys.exit(2)
...@@ -1044,13 +1077,15 @@ class Pipeline: ...@@ -1044,13 +1077,15 @@ class Pipeline:
1044 exit() 1077 exit()
1045 1078
1046 if opt == "-h" or opt == "--help": 1079 if opt == "-h" or opt == "--help":
1047 - print( "RNANet, a script to build a multiscale RNA dataset from public data\n" 1080 + print("RNANet, a script to build a multiscale RNA dataset from public data\n"
1048 "Developped by Louis Becquey (louis.becquey@univ-evry.fr), 2020") 1081 "Developped by Louis Becquey (louis.becquey@univ-evry.fr), 2020")
1049 print() 1082 print()
1050 print("Options:") 1083 print("Options:")
1051 print("-h [ --help ]\t\t\tPrint this help message") 1084 print("-h [ --help ]\t\t\tPrint this help message")
1052 print("--version\t\t\tPrint the program version") 1085 print("--version\t\t\tPrint the program version")
1053 print() 1086 print()
1087 + print("-f [ --full-inference ]\t\tInfer new mappings even if Rfam already provides some. Yields more copies of chains"
1088 + "\n\t\t\t\tmapped to different families.")
1054 print("-r 4.0 [ --resolution=4.0 ]\tMaximum 3D structure resolution to consider a RNA chain.") 1089 print("-r 4.0 [ --resolution=4.0 ]\tMaximum 3D structure resolution to consider a RNA chain.")
1055 print("-s\t\t\t\tRun statistics computations after completion") 1090 print("-s\t\t\t\tRun statistics computations after completion")
1056 print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.") 1091 print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.")
...@@ -1062,7 +1097,7 @@ class Pipeline: ...@@ -1062,7 +1097,7 @@ class Pipeline:
1062 "\n\t\t\t\t\tRNAcifs/\t\tFull structures containing RNA, in mmCIF format" 1097 "\n\t\t\t\t\tRNAcifs/\t\tFull structures containing RNA, in mmCIF format"
1063 "\n\t\t\t\t\trna_mapped_to_Rfam/\tExtracted 'pure' RNA chains" 1098 "\n\t\t\t\t\trna_mapped_to_Rfam/\tExtracted 'pure' RNA chains"
1064 "\n\t\t\t\t\tdatapoints/\t\tFinal results in CSV file format.") 1099 "\n\t\t\t\t\tdatapoints/\t\tFinal results in CSV file format.")
1065 - print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files." 1100 + print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files. Subfolders will be:"
1066 "\n\t\t\t\t\trfam_sequences/fasta/\tCompressed hits to Rfam families" 1101 "\n\t\t\t\t\trfam_sequences/fasta/\tCompressed hits to Rfam families"
1067 "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family") 1102 "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family")
1068 print("--no-homology\t\t\tDo not try to compute PSSMs and do not align sequences." 1103 print("--no-homology\t\t\tDo not try to compute PSSMs and do not align sequences."
...@@ -1077,7 +1112,7 @@ class Pipeline: ...@@ -1077,7 +1112,7 @@ class Pipeline:
1077 print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications") 1112 print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications")
1078 print() 1113 print()
1079 print("Typical usage:") 1114 print("Typical usage:")
1080 - print(f"nohup bash -c 'time {runDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --archive' &") 1115 + print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &")
1081 sys.exit() 1116 sys.exit()
1082 elif opt == '--version': 1117 elif opt == '--version':
1083 print("RNANet 1.1 beta") 1118 print("RNANet 1.1 beta")
...@@ -1087,21 +1122,21 @@ class Pipeline: ...@@ -1087,21 +1122,21 @@ class Pipeline:
1087 self.CRYSTAL_RES = float(arg) 1122 self.CRYSTAL_RES = float(arg)
1088 elif opt == "-s": 1123 elif opt == "-s":
1089 self.RUN_STATS = True 1124 self.RUN_STATS = True
1090 - elif opt=="--keep-hetatm": 1125 + elif opt == "--keep-hetatm":
1091 - assert arg in [ "True", "False" ] 1126 + assert arg in ["True", "False"]
1092 self.KEEP_HETATM = (arg == "True") 1127 self.KEEP_HETATM = (arg == "True")
1093 - elif opt=="--fill-gaps": 1128 + elif opt == "--fill-gaps":
1094 - assert arg in [ "True", "False" ] 1129 + assert arg in ["True", "False"]
1095 self.FILL_GAPS = (arg == "True") 1130 self.FILL_GAPS = (arg == "True")
1096 - elif opt=="--no-homology": 1131 + elif opt == "--no-homology":
1097 self.HOMOLOGY = False 1132 self.HOMOLOGY = False
1098 - elif opt=='--3d-folder': 1133 + elif opt == '--3d-folder':
1099 - path_to_3D_data = path.abspath(arg) 1134 + path_to_3D_data = os.path.abspath(arg)
1100 if path_to_3D_data[-1] != '/': 1135 if path_to_3D_data[-1] != '/':
1101 path_to_3D_data += '/' 1136 path_to_3D_data += '/'
1102 print("> Storing 3D data into", path_to_3D_data) 1137 print("> Storing 3D data into", path_to_3D_data)
1103 - elif opt=='--seq-folder': 1138 + elif opt == '--seq-folder':
1104 - path_to_seq_data = path.abspath(arg) 1139 + path_to_seq_data = os.path.abspath(arg)
1105 if path_to_seq_data[-1] != '/': 1140 if path_to_seq_data[-1] != '/':
1106 path_to_seq_data += '/' 1141 path_to_seq_data += '/'
1107 print("> Storing sequences into", path_to_seq_data) 1142 print("> Storing sequences into", path_to_seq_data)
...@@ -1138,6 +1173,8 @@ class Pipeline: ...@@ -1138,6 +1173,8 @@ class Pipeline:
1138 self.ARCHIVE = True 1173 self.ARCHIVE = True
1139 elif opt == "--no-logs": 1174 elif opt == "--no-logs":
1140 self.SAVELOGS = False 1175 self.SAVELOGS = False
1176 + elif opt == "-f" or opt == "--full-inference":
1177 + self.FULLINFERENCE = True
1141 1178
1142 if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions": 1179 if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions":
1143 print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments") 1180 print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments")
...@@ -1149,17 +1186,19 @@ class Pipeline: ...@@ -1149,17 +1186,19 @@ class Pipeline:
1149 """List 3D chains with available Rfam mappings. 1186 """List 3D chains with available Rfam mappings.
1150 1187
1151 Return a list of Chain() objects with the mappings set up. 1188 Return a list of Chain() objects with the mappings set up.
1152 - If self.HOMOLOGY is set to False, simply returns a list of Chain() objects with available 3D chains.""" 1189 + If self.HOMOLOGY is set to False, simply returns a list of Chain() objects with available 3D chains.
1190 + """
1153 1191
1154 setproctitle("RNANet.py list_available_mappings()") 1192 setproctitle("RNANet.py list_available_mappings()")
1155 1193
1156 # List all 3D RNA chains below given resolution 1194 # List all 3D RNA chains below given resolution
1157 - full_structures_list = self.dl.download_BGSU_NR_list(self.CRYSTAL_RES) # list of tuples ( class, class_members ) 1195 + full_structures_list = self.dl.download_BGSU_NR_list(
1196 + self.CRYSTAL_RES) # list of tuples ( class, class_members )
1158 1197
1159 # Check for a list of known problems: 1198 # Check for a list of known problems:
1160 - if path.isfile(runDir + "/known_issues.txt"): 1199 + if os.path.isfile(runDir + "/known_issues.txt"):
1161 with open(runDir + "/known_issues.txt", 'r') as issues: 1200 with open(runDir + "/known_issues.txt", 'r') as issues:
1162 - self.known_issues = [ x[:-1] for x in issues.readlines() ] 1201 + self.known_issues = [x[:-1] for x in issues.readlines()]
1163 if self.USE_KNOWN_ISSUES: 1202 if self.USE_KNOWN_ISSUES:
1164 print("\t> Ignoring known issues:") 1203 print("\t> Ignoring known issues:")
1165 for x in self.known_issues: 1204 for x in self.known_issues:
...@@ -1175,9 +1214,18 @@ class Pipeline: ...@@ -1175,9 +1214,18 @@ class Pipeline:
1175 p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=ncores) 1214 p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=ncores)
1176 try: 1215 try:
1177 1216
1178 - pbar = tqdm(full_structures_list, maxinterval=1.0, miniters=1, desc="Eq. classes", bar_format="{desc}:{percentage:3.0f}%|{bar}|") 1217 + pbar = tqdm(full_structures_list, maxinterval=1.0, miniters=1,
1179 - for _, newchains in enumerate(p.imap_unordered(partial(work_infer_mappings, not self.REUSE_ALL, allmappings), full_structures_list, chunksize=1)): 1218 + desc="Eq. classes", bar_format="{desc}:{percentage:3.0f}%|{bar}|")
1219 + for _, newchains in enumerate(p.imap_unordered(partial(
1220 + work_infer_mappings,
1221 + not self.REUSE_ALL,
1222 + allmappings,
1223 + self.FULLINFERENCE
1224 + ),
1225 + full_structures_list,
1226 + chunksize=1)):
1180 self.update += newchains 1227 self.update += newchains
1228 +
1181 pbar.update(1) # Every time the iteration finishes, update the global progress bar 1229 pbar.update(1) # Every time the iteration finishes, update the global progress bar
1182 1230
1183 pbar.close() 1231 pbar.close()
...@@ -1192,7 +1240,7 @@ class Pipeline: ...@@ -1192,7 +1240,7 @@ class Pipeline:
1192 else: 1240 else:
1193 conn = sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) 1241 conn = sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0)
1194 for eq_class, codelist in tqdm(full_structures_list, desc="Eq. classes"): 1242 for eq_class, codelist in tqdm(full_structures_list, desc="Eq. classes"):
1195 - codes = codelist.replace('+',',').split(',') 1243 + codes = codelist.replace('+', ',').split(',')
1196 1244
1197 # Simply convert the list of codes to Chain() objects 1245 # Simply convert the list of codes to Chain() objects
1198 for c in codes: 1246 for c in codes:
...@@ -1201,40 +1249,48 @@ class Pipeline: ...@@ -1201,40 +1249,48 @@ class Pipeline:
1201 pdb_model = int(nr[1]) 1249 pdb_model = int(nr[1])
1202 pdb_chain_id = nr[2].upper() 1250 pdb_chain_id = nr[2].upper()
1203 chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}" 1251 chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}"
1204 - res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc = 'unmappd' AND issue=0""") 1252 + res = sql_ask_database(conn, f"""SELECT chain_id from chain
1253 + WHERE structure_id='{pdb_id}'
1254 + AND chain_name='{pdb_chain_id}'
1255 + AND rfam_acc = 'unmappd'
1256 + AND issue=0""")
1205 if not len(res) or self.REUSE_ALL: # the chain is NOT yet in the database, or this is a known issue 1257 if not len(res) or self.REUSE_ALL: # the chain is NOT yet in the database, or this is a known issue
1206 self.update.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class)) 1258 self.update.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class))
1207 conn.close() 1259 conn.close()
1208 1260
1209 if self.SELECT_ONLY is not None: 1261 if self.SELECT_ONLY is not None:
1210 - self.update = [ c for c in self.update if c.chain_label == self.SELECT_ONLY ] 1262 + self.update = [
1263 + c for c in self.update if c.chain_label == self.SELECT_ONLY]
1211 1264
1212 self.n_chains = len(self.update) 1265 self.n_chains = len(self.update)
1213 print(str(self.n_chains) + " RNA chains of interest.") 1266 print(str(self.n_chains) + " RNA chains of interest.")
1214 1267
1215 @trace_unhandled_exceptions 1268 @trace_unhandled_exceptions
1216 - def dl_and_annotate(self, retry=False, coeff_ncores = 0.75): 1269 + def dl_and_annotate(self, retry=False, coeff_ncores=0.75):
1217 """ 1270 """
1218 Gets mmCIF files from the PDB, and runs DSSR on them. 1271 Gets mmCIF files from the PDB, and runs DSSR on them.
1219 Ignores a structure if the file already exists (not if we are retrying). 1272 Ignores a structure if the file already exists (not if we are retrying).
1220 1273
1221 REQUIRES the previous definition of self.update, so call list_available_mappings() before. 1274 REQUIRES the previous definition of self.update, so call list_available_mappings() before.
1222 - SETS table structure""" 1275 + SETS table structure
1276 + """
1223 1277
1224 - # setproctitle(f"RNANet.py dl_and_annotate(retry={retry})") 1278 + setproctitle(f"RNANet.py dl_and_annotate(retry={retry})")
1225 1279
1226 # Prepare the results folders 1280 # Prepare the results folders
1227 - if not path.isdir(path_to_3D_data + "RNAcifs"): 1281 + if not os.path.isdir(path_to_3D_data + "RNAcifs"):
1228 - os.makedirs(path_to_3D_data + "RNAcifs") # for the whole structures 1282 + # for the whole structures
1229 - if not path.isdir(path_to_3D_data + "annotations"): 1283 + os.makedirs(path_to_3D_data + "RNAcifs")
1230 - os.makedirs(path_to_3D_data + "annotations") # for DSSR analysis of the whole structures 1284 + if not os.path.isdir(path_to_3D_data + "annotations"):
1285 + # for DSSR analysis of the whole structures
1286 + os.makedirs(path_to_3D_data + "annotations")
1231 1287
1232 # Download and annotate 1288 # Download and annotate
1233 print("> Downloading and annotating structures (or checking previous results if they exist)...", flush=True) 1289 print("> Downloading and annotating structures (or checking previous results if they exist)...", flush=True)
1234 if retry: 1290 if retry:
1235 - mmcif_list = sorted(set([ c.pdb_id for c in self.retry ])) 1291 + mmcif_list = sorted(set([c.pdb_id for c in self.retry]))
1236 else: 1292 else:
1237 - mmcif_list = sorted(set([ c.pdb_id for c in self.update ])) 1293 + mmcif_list = sorted(set([c.pdb_id for c in self.update]))
1238 try: 1294 try:
1239 p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=int(coeff_ncores*ncores)) 1295 p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=int(coeff_ncores*ncores))
1240 pbar = tqdm(mmcif_list, maxinterval=1.0, miniters=1, desc="mmCIF files") 1296 pbar = tqdm(mmcif_list, maxinterval=1.0, miniters=1, desc="mmCIF files")
...@@ -1255,16 +1311,19 @@ class Pipeline: ...@@ -1255,16 +1311,19 @@ class Pipeline:
1255 and extract their information from the JSON files to the database. 1311 and extract their information from the JSON files to the database.
1256 1312
1257 REQUIRES the previous definition of self.update, so call list_available_mappings() before. 1313 REQUIRES the previous definition of self.update, so call list_available_mappings() before.
1258 - SETS self.loaded_chains""" 1314 + SETS self.loaded_chains
1315 + """
1259 1316
1260 setproctitle(f"RNANet.py build_chains(retry={retry})") 1317 setproctitle(f"RNANet.py build_chains(retry={retry})")
1261 1318
1262 # Prepare folders 1319 # Prepare folders
1263 if self.EXTRACT_CHAINS: 1320 if self.EXTRACT_CHAINS:
1264 - if self.HOMOLOGY and not path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"): 1321 + if self.HOMOLOGY and not os.path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"):
1265 - os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam") # for the portions mapped to Rfam 1322 + # for the portions mapped to Rfam
1266 - if (not self.HOMOLOGY) and not path.isdir(path_to_3D_data + "rna_only"): 1323 + os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam")
1267 - os.makedirs(path_to_3D_data + "rna_only") # extract chains of pure RNA 1324 + if (not self.HOMOLOGY) and not os.path.isdir(path_to_3D_data + "rna_only"):
1325 + # extract chains of pure RNA
1326 + os.makedirs(path_to_3D_data + "rna_only")
1268 1327
1269 # define and run jobs 1328 # define and run jobs
1270 joblist = [] 1329 joblist = []
...@@ -1296,44 +1355,48 @@ class Pipeline: ...@@ -1296,44 +1355,48 @@ class Pipeline:
1296 issues += 1 1355 issues += 1
1297 issues_names.append(c[1].chain_label) 1356 issues_names.append(c[1].chain_label)
1298 ki.write(c[1].chain_label + '\n') 1357 ki.write(c[1].chain_label + '\n')
1299 - kir.write(c[1].chain_label + '\n' + c[1].error_messages + '\n\n') 1358 + kir.write(c[1].chain_label + '\n' +
1359 + c[1].error_messages + '\n\n')
1300 with sqlite3.connect(runDir+"/results/RNANet.db") as conn: 1360 with sqlite3.connect(runDir+"/results/RNANet.db") as conn:
1301 sql_execute(conn, f"UPDATE chain SET issue = 1 WHERE chain_id = ?;", data=(c[1].db_chain_id,)) 1361 sql_execute(conn, f"UPDATE chain SET issue = 1 WHERE chain_id = ?;", data=(c[1].db_chain_id,))
1302 ki.close() 1362 ki.close()
1303 kir.close() 1363 kir.close()
1304 if issues: 1364 if issues:
1305 warn(f"Added {issues} newly discovered issues to known issues:") 1365 warn(f"Added {issues} newly discovered issues to known issues:")
1306 - print("\033[33m"+ " ".join(issues_names) + "\033[0m", flush=True) 1366 + print("\033[33m" + " ".join(issues_names) + "\033[0m", flush=True)
1307 1367
1308 # Add successfully built chains to list 1368 # Add successfully built chains to list
1309 - self.loaded_chains += [ c[1] for c in results if not c[1].delete_me ] 1369 + self.loaded_chains += [c[1] for c in results if not c[1].delete_me]
1310 1370
1311 # Identify errors due to empty JSON files (this happens when RAM is full, we believe). 1371 # Identify errors due to empty JSON files (this happens when RAM is full, we believe).
1312 # Retrying often solves the issue... so retry once with half the cores to limit the RAM usage. 1372 # Retrying often solves the issue... so retry once with half the cores to limit the RAM usage.
1313 self.to_retry = [ c[1] for c in results if "Could not load existing" in c[1].error_messages ] 1373 self.to_retry = [ c[1] for c in results if "Could not load existing" in c[1].error_messages ]
1314 1374
1315 def checkpoint_save_chains(self): 1375 def checkpoint_save_chains(self):
1316 - """Saves self.loaded_chains to data/loaded_chains.pickle""" 1376 + """Saves self.loaded_chains to data/loaded_chains.pickle
1317 - with open(runDir + "/data/loaded_chains.pickle","wb") as pick: 1377 + """
1378 + with open(runDir + "/data/loaded_chains.pickle", "wb") as pick:
1318 pickle.dump(self.loaded_chains, pick) 1379 pickle.dump(self.loaded_chains, pick)
1319 1380
1320 def checkpoint_load_chains(self): 1381 def checkpoint_load_chains(self):
1321 - """Load self.loaded_chains from data/loaded_chains.pickle""" 1382 + """Load self.loaded_chains from data/loaded_chains.pickle
1322 - with open(runDir + "/data/loaded_chains.pickle","rb") as pick: 1383 + """
1384 + with open(runDir + "/data/loaded_chains.pickle", "rb") as pick:
1323 self.loaded_chains = pickle.load(pick) 1385 self.loaded_chains = pickle.load(pick)
1324 1386
1325 def prepare_sequences(self): 1387 def prepare_sequences(self):
1326 """Downloads homologous sequences and covariance models required to compute MSAs. 1388 """Downloads homologous sequences and covariance models required to compute MSAs.
1327 1389
1328 REQUIRES that self.loaded_chains is defined. 1390 REQUIRES that self.loaded_chains is defined.
1329 - SETS family (partially, through call)""" 1391 + SETS family (partially, through call)
1392 + """
1330 1393
1331 setproctitle("RNANet.py prepare_sequences()") 1394 setproctitle("RNANet.py prepare_sequences()")
1332 1395
1333 # Preparing a results folder 1396 # Preparing a results folder
1334 if not os.access(path_to_seq_data + "realigned/", os.F_OK): 1397 if not os.access(path_to_seq_data + "realigned/", os.F_OK):
1335 os.makedirs(path_to_seq_data + "realigned/") 1398 os.makedirs(path_to_seq_data + "realigned/")
1336 - if not path.isdir(path_to_seq_data + "rfam_sequences/fasta/"): 1399 + if not os.path.isdir(path_to_seq_data + "rfam_sequences/fasta/"):
1337 os.makedirs(path_to_seq_data + "rfam_sequences/fasta/", exist_ok=True) 1400 os.makedirs(path_to_seq_data + "rfam_sequences/fasta/", exist_ok=True)
1338 1401
1339 # Update the family table (rfam_acc, description, max_len) 1402 # Update the family table (rfam_acc, description, max_len)
...@@ -1344,7 +1407,8 @@ class Pipeline: ...@@ -1344,7 +1407,8 @@ class Pipeline:
1344 1407
1345 joblist = [] 1408 joblist = []
1346 for f in self.fam_list: 1409 for f in self.fam_list:
1347 - joblist.append(Job(function=work_prepare_sequences, how_many_in_parallel=ncores, args=[self.dl, f, rfam_acc_to_download[f]])) 1410 + joblist.append(Job(function=work_prepare_sequences, how_many_in_parallel=ncores, args=[
1411 + self.dl, f, rfam_acc_to_download[f]]))
1348 try: 1412 try:
1349 execute_joblist(joblist) 1413 execute_joblist(joblist)
1350 1414
...@@ -1360,14 +1424,16 @@ class Pipeline: ...@@ -1360,14 +1424,16 @@ class Pipeline:
1360 """Perform multiple sequence alignments. 1424 """Perform multiple sequence alignments.
1361 1425
1362 REQUIRES self.fam_list to be defined 1426 REQUIRES self.fam_list to be defined
1363 - SETS family (partially)""" 1427 + SETS family (partially)
1428 + """
1364 1429
1365 setproctitle("RNANet.py realign()") 1430 setproctitle("RNANet.py realign()")
1366 1431
1367 # Prepare the job list 1432 # Prepare the job list
1368 joblist = [] 1433 joblist = []
1369 for f in self.fam_list: 1434 for f in self.fam_list:
1370 - joblist.append( Job(function=work_realign, args=[f], how_many_in_parallel=1, label=f)) # the function already uses all CPUs so launch them one by one 1435 + # the function already uses all CPUs so launch them one by one (how_many_in_parallel=1)
1436 + joblist.append(Job(function=work_realign, args=[f], how_many_in_parallel=1, label=f))
1371 1437
1372 # Execute the jobs 1438 # Execute the jobs
1373 try: 1439 try:
...@@ -1379,8 +1445,8 @@ class Pipeline: ...@@ -1379,8 +1445,8 @@ class Pipeline:
1379 # Update the database 1445 # Update the database
1380 data = [] 1446 data = []
1381 for r in results: 1447 for r in results:
1382 - align = AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta") 1448 + align = Bio.AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta")
1383 - nb_3d_chains = len([ 1 for r in align if '[' in r.id ]) 1449 + nb_3d_chains = len([1 for r in align if '[' in r.id])
1384 if r[0] in SSU_set: # SSU v138 is used 1450 if r[0] in SSU_set: # SSU v138 is used
1385 nb_homologs = 2225272 # source: https://www.arb-silva.de/documentation/release-138/ 1451 nb_homologs = 2225272 # source: https://www.arb-silva.de/documentation/release-138/
1386 nb_total_homol = nb_homologs + nb_3d_chains 1452 nb_total_homol = nb_homologs + nb_3d_chains
...@@ -1390,7 +1456,7 @@ class Pipeline: ...@@ -1390,7 +1456,7 @@ class Pipeline:
1390 else: 1456 else:
1391 nb_total_homol = len(align) 1457 nb_total_homol = len(align)
1392 nb_homologs = nb_total_homol - nb_3d_chains 1458 nb_homologs = nb_total_homol - nb_3d_chains
1393 - data.append( (nb_homologs, nb_3d_chains, nb_total_homol, r[2], r[3], r[0]) ) 1459 + data.append((nb_homologs, nb_3d_chains, nb_total_homol, r[2], r[3], r[0]))
1394 1460
1395 with sqlite3.connect(runDir + "/results/RNANet.db") as conn: 1461 with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
1396 sql_execute(conn, """UPDATE family SET nb_homologs = ?, nb_3d_chains = ?, nb_total_homol = ?, comput_time = ?, comput_peak_mem = ? 1462 sql_execute(conn, """UPDATE family SET nb_homologs = ?, nb_3d_chains = ?, nb_total_homol = ?, comput_time = ?, comput_peak_mem = ?
...@@ -1399,13 +1465,14 @@ class Pipeline: ...@@ -1399,13 +1465,14 @@ class Pipeline:
1399 def remap(self): 1465 def remap(self):
1400 """Compute nucleotide frequencies of some alignments and save them in the database 1466 """Compute nucleotide frequencies of some alignments and save them in the database
1401 1467
1402 - REQUIRES self.fam_list to be defined""" 1468 + REQUIRES self.fam_list to be defined
1469 + """
1403 1470
1404 setproctitle("RNANet.py remap()") 1471 setproctitle("RNANet.py remap()")
1405 1472
1406 print("Computing nucleotide frequencies in alignments...\nThis can be very long on slow storage devices (Hard-drive...)") 1473 print("Computing nucleotide frequencies in alignments...\nThis can be very long on slow storage devices (Hard-drive...)")
1407 print("Check your CPU and disk I/O activity before deciding if the job failed.") 1474 print("Check your CPU and disk I/O activity before deciding if the job failed.")
1408 - nworkers =max(min(ncores, len(self.fam_list)), 1) 1475 + nworkers = max(min(ncores, len(self.fam_list)), 1)
1409 1476
1410 # Prepare the architecture of a shiny multi-progress-bars design 1477 # Prepare the architecture of a shiny multi-progress-bars design
1411 # Push the number of workers to a queue. 1478 # Push the number of workers to a queue.
...@@ -1419,8 +1486,10 @@ class Pipeline: ...@@ -1419,8 +1486,10 @@ class Pipeline:
1419 1486
1420 try: 1487 try:
1421 fam_pbar = tqdm(total=len(self.fam_list), desc="RNA families", position=0, leave=True) 1488 fam_pbar = tqdm(total=len(self.fam_list), desc="RNA families", position=0, leave=True)
1422 - for i, _ in enumerate(p.imap_unordered(partial(work_pssm, fill_gaps=self.FILL_GAPS), self.fam_list, chunksize=1)): # Apply work_pssm to each RNA family 1489 + # Apply work_pssm to each RNA family
1423 - fam_pbar.update(1) # Every time the iteration finishes on a family, update the global progress bar over the RNA families 1490 + for i, _ in enumerate(p.imap_unordered(partial(work_pssm, fill_gaps=self.FILL_GAPS), self.fam_list, chunksize=1)):
1491 + # Every time the iteration finishes on a family, update the global progress bar over the RNA families
1492 + fam_pbar.update(1)
1424 fam_pbar.close() 1493 fam_pbar.close()
1425 p.close() 1494 p.close()
1426 p.join() 1495 p.join()
...@@ -1434,23 +1503,24 @@ class Pipeline: ...@@ -1434,23 +1503,24 @@ class Pipeline:
1434 def output_results(self): 1503 def output_results(self):
1435 """Produces CSV files, archives them, and writes additional metadata files 1504 """Produces CSV files, archives them, and writes additional metadata files
1436 1505
1437 - REQUIRES self.loaded_chains (to output corresponding CSV files) and self.fam_list (for statistics)""" 1506 + REQUIRES self.loaded_chains (to output corresponding CSV files) and self.fam_list (for statistics)
1507 + """
1438 1508
1439 setproctitle("RNANet.py output_results()") 1509 setproctitle("RNANet.py output_results()")
1440 1510
1441 time_str = time.strftime("%Y%m%d") 1511 time_str = time.strftime("%Y%m%d")
1442 1512
1443 - #Prepare folders: 1513 + # Prepare folders:
1444 - if not path.isdir(path_to_3D_data + "datapoints/"): 1514 + if not os.path.isdir(path_to_3D_data + "datapoints/"):
1445 os.makedirs(path_to_3D_data + "datapoints/") 1515 os.makedirs(path_to_3D_data + "datapoints/")
1446 - if not path.isdir(runDir + "/results/archive/"): 1516 + if not os.path.isdir(runDir + "/results/archive/"):
1447 os.makedirs(runDir + "/results/archive/") 1517 os.makedirs(runDir + "/results/archive/")
1448 1518
1449 # Save to by-chain CSV files 1519 # Save to by-chain CSV files
1450 p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=3) 1520 p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=3)
1451 try: 1521 try:
1452 pbar = tqdm(total=len(self.loaded_chains), desc="Saving chains to CSV", position=0, leave=True) 1522 pbar = tqdm(total=len(self.loaded_chains), desc="Saving chains to CSV", position=0, leave=True)
1453 - for _, _2 in enumerate(p.imap_unordered(work_save, self.loaded_chains, chunksize=2)): 1523 + for _, _2 in enumerate(p.imap_unordered(work_save, self.loaded_chains)):
1454 pbar.update(1) 1524 pbar.update(1)
1455 pbar.close() 1525 pbar.close()
1456 p.close() 1526 p.close()
...@@ -1465,36 +1535,44 @@ class Pipeline: ...@@ -1465,36 +1535,44 @@ class Pipeline:
1465 # Run statistics 1535 # Run statistics
1466 if self.RUN_STATS: 1536 if self.RUN_STATS:
1467 # Remove previous precomputed data 1537 # Remove previous precomputed data
1468 - subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"]) 1538 + subprocess.run(["rm", "-f", runDir + "/data/wadley_kernel_eta.npz",
1539 + runDir + "/data/wadley_kernel_eta_prime.npz",
1540 + runDir + "/data/pair_counts.csv"])
1469 for f in self.fam_list: 1541 for f in self.fam_list:
1470 - subprocess.run(["rm","-f", f"data/{f}.npy", f"data/{f}_pairs.csv", f"data/{f}_counts.csv"]) 1542 + subprocess.run(["rm", "-f", runDir + f"/data/{f}.npy",
1543 + runDir + f"/data/{f}_pairs.csv",
1544 + runDir + f"/data/{f}_counts.csv"])
1471 1545
1472 # Run statistics files 1546 # Run statistics files
1473 - os.chdir(runDir) 1547 + subprocess.run(["python3.8", fileDir+"/regression.py"])
1474 - subprocess.run(["python3.8", "regression.py"]) 1548 + subprocess.run(["python3.8", fileDir+"/statistics.py", "--3d-folder", path_to_3D_data,
1475 - subprocess.run(["python3.8", "statistics.py", path_to_3D_data, path_to_seq_data]) 1549 + "--seq-folder", path_to_seq_data, "-r", str(self.CRYSTAL_RES)])
1476 1550
1477 # Save additional information 1551 # Save additional information
1478 with sqlite3.connect(runDir+"/results/RNANet.db") as conn: 1552 with sqlite3.connect(runDir+"/results/RNANet.db") as conn:
1479 - pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;", 1553 + pd.read_sql_query("""SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem
1554 + FROM family ORDER BY nb_3d_chains DESC;""",
1480 conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False) 1555 conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
1481 - pd.read_sql_query("""SELECT eq_class, structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure 1556 + pd.read_sql_query("""SELECT eq_class, structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue
1557 + FROM structure
1482 JOIN chain ON structure.pdb_id = chain.structure_id 1558 JOIN chain ON structure.pdb_id = chain.structure_id
1483 - ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False) 1559 + ORDER BY structure_id, chain_name, rfam_acc ASC;""",
1560 + conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
1484 1561
1485 # Archive the results 1562 # Archive the results
1486 - if self.SELECT_ONLY is None: 1563 + if self.ARCHIVE:
1487 - os.makedirs("results/archive", exist_ok=True) 1564 + os.makedirs(runDir + "/results/archive", exist_ok=True)
1488 - subprocess.run(["tar","-C", path_to_3D_data + "/datapoints","-czf",f"results/archive/RNANET_datapoints_{time_str}.tar.gz","."]) 1565 + subprocess.run(["tar", "-C", path_to_3D_data + "/datapoints", "-czf",
1566 + runDir + f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", "."])
1489 1567
1490 # Update shortcuts to latest versions 1568 # Update shortcuts to latest versions
1491 subprocess.run(["rm", "-f", runDir + "/results/RNANET_datapoints_latest.tar.gz", 1569 subprocess.run(["rm", "-f", runDir + "/results/RNANET_datapoints_latest.tar.gz",
1492 runDir + "/results/summary_latest.csv", 1570 runDir + "/results/summary_latest.csv",
1493 runDir + "/results/families_latest.csv" 1571 runDir + "/results/families_latest.csv"
1494 ]) 1572 ])
1495 - subprocess.run(['ln',"-s", runDir +f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", runDir + "/results/RNANET_datapoints_latest.tar.gz"]) 1573 + subprocess.run(['ln', "-s", runDir + f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", runDir + "/results/RNANET_datapoints_latest.tar.gz"])
1496 - subprocess.run(['ln',"-s", runDir +f"/results/archive/summary_{time_str}.csv", runDir + "/results/summary_latest.csv"]) 1574 + subprocess.run(['ln', "-s", runDir + f"/results/archive/summary_{time_str}.csv", runDir + "/results/summary_latest.csv"])
1497 - subprocess.run(['ln',"-s", runDir +f"/results/archive/families_{time_str}.csv", runDir + "/results/families_latest.csv"]) 1575 + subprocess.run(['ln', "-s", runDir + f"/results/archive/families_{time_str}.csv", runDir + "/results/families_latest.csv"])
1498 1576
1499 def sanitize_database(self): 1577 def sanitize_database(self):
1500 """Searches for issues in the database and corrects them""" 1578 """Searches for issues in the database and corrects them"""
...@@ -1518,7 +1596,9 @@ class Pipeline: ...@@ -1518,7 +1596,9 @@ class Pipeline:
1518 if self.HOMOLOGY: 1596 if self.HOMOLOGY:
1519 # check if chains have been re_mapped: 1597 # check if chains have been re_mapped:
1520 r = sql_ask_database(conn, """SELECT COUNT(DISTINCT chain_id) AS Count, rfam_acc FROM chain 1598 r = sql_ask_database(conn, """SELECT COUNT(DISTINCT chain_id) AS Count, rfam_acc FROM chain
1521 - WHERE issue = 0 AND chain_id NOT IN (SELECT DISTINCT chain_id FROM re_mapping) 1599 + WHERE issue = 0
1600 + AND rfam_acc != 'unmappd'
1601 + AND chain_id NOT IN (SELECT DISTINCT chain_id FROM re_mapping)
1522 GROUP BY rfam_acc;""") 1602 GROUP BY rfam_acc;""")
1523 try: 1603 try:
1524 if len(r) and r[0][0] is not None: 1604 if len(r) and r[0][0] is not None:
...@@ -1545,22 +1625,25 @@ class Pipeline: ...@@ -1545,22 +1625,25 @@ class Pipeline:
1545 1625
1546 1626
1547 def read_cpu_number(): 1627 def read_cpu_number():
1548 - # As one shall not use os.cpu_count() on LXC containers, 1628 + """This function reads the number of CPU cores available from /proc/cpuinfo.
1549 - # because it reads info from /sys which is not the VM resources but the host resources. 1629 + One shall not use os.cpu_count() on LXC containers,
1550 - # This function reads it from /proc/cpuinfo instead. 1630 + because it reads info from /sys which is not the VM resources but the host resources.
1631 + """
1551 p = subprocess.run(['grep', '-Ec', '(Intel|AMD)', '/proc/cpuinfo'], stdout=subprocess.PIPE) 1632 p = subprocess.run(['grep', '-Ec', '(Intel|AMD)', '/proc/cpuinfo'], stdout=subprocess.PIPE)
1552 return int(int(p.stdout.decode('utf-8')[:-1])/2) 1633 return int(int(p.stdout.decode('utf-8')[:-1])/2)
1553 1634
1635 +
1554 def init_worker(tqdm_lock=None): 1636 def init_worker(tqdm_lock=None):
1555 signal.signal(signal.SIGINT, signal.SIG_IGN) 1637 signal.signal(signal.SIGINT, signal.SIG_IGN)
1556 if tqdm_lock is not None: 1638 if tqdm_lock is not None:
1557 tqdm.set_lock(tqdm_lock) 1639 tqdm.set_lock(tqdm_lock)
1558 1640
1641 +
1559 def warn(message, error=False): 1642 def warn(message, error=False):
1560 """Pretty-print warnings and error messages. 1643 """Pretty-print warnings and error messages.
1561 """ 1644 """
1562 # Cut if too long 1645 # Cut if too long
1563 - if len(message)>66: 1646 + if len(message) > 66:
1564 x = message.find(' ', 50, 66) 1647 x = message.find(' ', 50, 66)
1565 if x != -1: 1648 if x != -1:
1566 warn(message[:x], error=error) 1649 warn(message[:x], error=error)
...@@ -1574,11 +1657,13 @@ def warn(message, error=False): ...@@ -1574,11 +1657,13 @@ def warn(message, error=False):
1574 else: 1657 else:
1575 print(f"\t> \033[33mWARN: {message:64s}\033[0m\t{warnsymb}", flush=True) 1658 print(f"\t> \033[33mWARN: {message:64s}\033[0m\t{warnsymb}", flush=True)
1576 1659
1660 +
1577 def notify(message, post=''): 1661 def notify(message, post=''):
1578 if len(post): 1662 if len(post):
1579 post = '(' + post + ')' 1663 post = '(' + post + ')'
1580 print(f"\t> {message:70s}\t{validsymb}\t{post}", flush=True) 1664 print(f"\t> {message:70s}\t{validsymb}\t{post}", flush=True)
1581 1665
1666 +
1582 def sql_define_tables(conn): 1667 def sql_define_tables(conn):
1583 conn.executescript( 1668 conn.executescript(
1584 """ PRAGMA foreign_keys = on; 1669 """ PRAGMA foreign_keys = on;
...@@ -1684,8 +1769,9 @@ def sql_define_tables(conn): ...@@ -1684,8 +1769,9 @@ def sql_define_tables(conn):
1684 """) 1769 """)
1685 conn.commit() 1770 conn.commit()
1686 1771
1772 +
1687 @trace_unhandled_exceptions 1773 @trace_unhandled_exceptions
1688 -def sql_ask_database(conn, sql, warn_every = 10): 1774 +def sql_ask_database(conn, sql, warn_every=10):
1689 """ 1775 """
1690 Reads the SQLite database. 1776 Reads the SQLite database.
1691 Returns a list of tuples. 1777 Returns a list of tuples.
...@@ -1698,11 +1784,13 @@ def sql_ask_database(conn, sql, warn_every = 10): ...@@ -1698,11 +1784,13 @@ def sql_ask_database(conn, sql, warn_every = 10):
1698 return result # if it worked, no need to retry 1784 return result # if it worked, no need to retry
1699 except sqlite3.OperationalError as e: 1785 except sqlite3.OperationalError as e:
1700 if warn_every and not (_+1) % warn_every: 1786 if warn_every and not (_+1) % warn_every:
1701 - warn(str(e) + ", retrying in 0.2s (worker " + str(os.getpid()) + f', try {_+1}/100)') 1787 + warn(str(e) + ", retrying in 0.2s (worker " +
1788 + str(os.getpid()) + f', try {_+1}/100)')
1702 time.sleep(0.2) 1789 time.sleep(0.2)
1703 warn("Tried to reach database 100 times and failed. Aborting.", error=True) 1790 warn("Tried to reach database 100 times and failed. Aborting.", error=True)
1704 return [] 1791 return []
1705 1792
1793 +
1706 @trace_unhandled_exceptions 1794 @trace_unhandled_exceptions
1707 def sql_execute(conn, sql, many=False, data=None, warn_every=10): 1795 def sql_execute(conn, sql, many=False, data=None, warn_every=10):
1708 conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query 1796 conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
...@@ -1721,10 +1809,12 @@ def sql_execute(conn, sql, many=False, data=None, warn_every=10): ...@@ -1721,10 +1809,12 @@ def sql_execute(conn, sql, many=False, data=None, warn_every=10):
1721 return # if it worked, no need to retry 1809 return # if it worked, no need to retry
1722 except sqlite3.OperationalError as e: 1810 except sqlite3.OperationalError as e:
1723 if warn_every and not (_+1) % warn_every: 1811 if warn_every and not (_+1) % warn_every:
1724 - warn(str(e) + ", retrying in 0.2s (worker " + str(os.getpid()) + f', try {_+1}/100)') 1812 + warn(str(e) + ", retrying in 0.2s (worker " +
1813 + str(os.getpid()) + f', try {_+1}/100)')
1725 time.sleep(0.2) 1814 time.sleep(0.2)
1726 warn("Tried to reach database 100 times and failed. Aborting.", error=True) 1815 warn("Tried to reach database 100 times and failed. Aborting.", error=True)
1727 1816
1817 +
1728 @trace_unhandled_exceptions 1818 @trace_unhandled_exceptions
1729 def execute_job(j, jobcount): 1819 def execute_job(j, jobcount):
1730 """Run a Job object. 1820 """Run a Job object.
...@@ -1741,7 +1831,8 @@ def execute_job(j, jobcount): ...@@ -1741,7 +1831,8 @@ def execute_job(j, jobcount):
1741 print(f"[{running_stats[0]+running_stats[2]}/{jobcount}]\t{j.label}") 1831 print(f"[{running_stats[0]+running_stats[2]}/{jobcount}]\t{j.label}")
1742 1832
1743 # Add the command to logfile 1833 # Add the command to logfile
1744 - logfile = open(runDir + "/log_of_the_run.sh", 'a') 1834 + os.makedirs(runDir+"/logs", exist_ok=True)
1835 + logfile = open(runDir + "/logs/log_of_the_run.sh", 'a')
1745 logfile.write(" ".join(j.cmd_)) 1836 logfile.write(" ".join(j.cmd_))
1746 logfile.write("\n") 1837 logfile.write("\n")
1747 logfile.close() 1838 logfile.close()
...@@ -1753,7 +1844,8 @@ def execute_job(j, jobcount): ...@@ -1753,7 +1844,8 @@ def execute_job(j, jobcount):
1753 1844
1754 # run the command. subprocess.run will be a child of this process, and stays monitored. 1845 # run the command. subprocess.run will be a child of this process, and stays monitored.
1755 start_time = time.time() 1846 start_time = time.time()
1756 - r = subprocess.run(j.cmd_, timeout=j.timeout_, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 1847 + r = subprocess.run(j.cmd_, timeout=j.timeout_,
1848 + stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1757 end_time = time.time() 1849 end_time = time.time()
1758 1850
1759 # Stop the Monitor, then get its result 1851 # Stop the Monitor, then get its result
...@@ -1782,7 +1874,8 @@ def execute_job(j, jobcount): ...@@ -1782,7 +1874,8 @@ def execute_job(j, jobcount):
1782 1874
1783 # return time and memory statistics, plus the job results 1875 # return time and memory statistics, plus the job results
1784 t = end_time - start_time 1876 t = end_time - start_time
1785 - return (t,m,r) 1877 + return (t, m, r)
1878 +
1786 1879
1787 def execute_joblist(fulljoblist): 1880 def execute_joblist(fulljoblist):
1788 """ Run a list of job objects. 1881 """ Run a list of job objects.
...@@ -1815,8 +1908,9 @@ def execute_joblist(fulljoblist): ...@@ -1815,8 +1908,9 @@ def execute_joblist(fulljoblist):
1815 1908
1816 # Process the jobs from priority 1 to nprio 1909 # Process the jobs from priority 1 to nprio
1817 results = [] 1910 results = []
1818 - for i in range(1,nprio+1): 1911 + for i in range(1, nprio+1):
1819 - if i not in jobs.keys(): continue # no job has the priority level i 1912 + if i not in jobs.keys():
1913 + continue # no job has the priority level i
1820 1914
1821 print("processing jobs of priority", i) 1915 print("processing jobs of priority", i)
1822 different_thread_numbers = sorted(jobs[i].keys()) 1916 different_thread_numbers = sorted(jobs[i].keys())
...@@ -1825,7 +1919,8 @@ def execute_joblist(fulljoblist): ...@@ -1825,7 +1919,8 @@ def execute_joblist(fulljoblist):
1825 for n in different_thread_numbers: 1919 for n in different_thread_numbers:
1826 # get the bunch of jobs of same priority and thread number 1920 # get the bunch of jobs of same priority and thread number
1827 bunch = jobs[i][n] 1921 bunch = jobs[i][n]
1828 - if not len(bunch): continue # no jobs should be processed n by n 1922 + if not len(bunch):
1923 + continue # no jobs should be processed n by n
1829 1924
1830 print("using", n, "processes:") 1925 print("using", n, "processes:")
1831 # execute jobs of priority i that should be processed n by n: 1926 # execute jobs of priority i that should be processed n by n:
...@@ -1843,13 +1938,14 @@ def execute_joblist(fulljoblist): ...@@ -1843,13 +1938,14 @@ def execute_joblist(fulljoblist):
1843 for j, r in zip(bunch, raw_results): 1938 for j, r in zip(bunch, raw_results):
1844 j.comp_time = round(r[0], 2) # seconds 1939 j.comp_time = round(r[0], 2) # seconds
1845 j.max_mem = int(r[1]/1000000) # MB 1940 j.max_mem = int(r[1]/1000000) # MB
1846 - results.append( (j.label, r[2], round(r[0], 2), int(r[1]/1000000))) 1941 + results.append((j.label, r[2], round(r[0], 2), int(r[1]/1000000)))
1847 1942
1848 # throw back the money 1943 # throw back the money
1849 return results 1944 return results
1850 1945
1946 +
1851 @trace_unhandled_exceptions 1947 @trace_unhandled_exceptions
1852 -def work_infer_mappings(update_only, allmappings, codelist) -> list: 1948 +def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> list:
1853 """Given a list of PDB chains corresponding to an equivalence class from BGSU's NR list, 1949 """Given a list of PDB chains corresponding to an equivalence class from BGSU's NR list,
1854 build a list of Chain() objects mapped to Rfam families, by expanding available mappings 1950 build a list of Chain() objects mapped to Rfam families, by expanding available mappings
1855 of any element of the list to all the list elements. 1951 of any element of the list to all the list elements.
...@@ -1862,13 +1958,13 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list: ...@@ -1862,13 +1958,13 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
1862 1958
1863 # Split the comma-separated list of chain codes into chain codes: 1959 # Split the comma-separated list of chain codes into chain codes:
1864 eq_class = codelist[0] 1960 eq_class = codelist[0]
1865 - codes = codelist[1].replace('+',',').split(',') 1961 + codes = codelist[1].replace('+', ',').split(',')
1866 1962
1867 # Search for mappings that apply to an element of this PDB chains list: 1963 # Search for mappings that apply to an element of this PDB chains list:
1868 for c in codes: 1964 for c in codes:
1869 # search for Rfam mappings with this chain c: 1965 # search for Rfam mappings with this chain c:
1870 m_row_indices = allmappings.pdb_id + "|1|" + allmappings.chain == c[:4].lower()+c[4:] 1966 m_row_indices = allmappings.pdb_id + "|1|" + allmappings.chain == c[:4].lower()+c[4:]
1871 - m = allmappings.loc[m_row_indices].drop(['bit_score','evalue_score','cm_start','cm_end','hex_colour'], axis=1) 1967 + m = allmappings.loc[m_row_indices].drop(['bit_score', 'evalue_score', 'cm_start', 'cm_end', 'hex_colour'], axis=1)
1872 if len(m): 1968 if len(m):
1873 # remove the found mappings from the dataframe 1969 # remove the found mappings from the dataframe
1874 allmappings = allmappings.loc[m_row_indices == False] 1970 allmappings = allmappings.loc[m_row_indices == False]
...@@ -1881,7 +1977,7 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list: ...@@ -1881,7 +1977,7 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
1881 families = set(known_mappings['rfam_acc']) 1977 families = set(known_mappings['rfam_acc'])
1882 1978
1883 # generalize 1979 # generalize
1884 - inferred_mappings = known_mappings.drop(['pdb_id','chain'], axis=1).drop_duplicates() 1980 + inferred_mappings = known_mappings.drop(['pdb_id', 'chain'], axis=1).drop_duplicates()
1885 1981
1886 # check for approximative redundancy: 1982 # check for approximative redundancy:
1887 if len(inferred_mappings) != len(inferred_mappings.drop_duplicates(subset="rfam_acc")): 1983 if len(inferred_mappings) != len(inferred_mappings.drop_duplicates(subset="rfam_acc")):
...@@ -1890,11 +1986,11 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list: ...@@ -1890,11 +1986,11 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
1890 # ==> Summarize them in one mapping but with the largest window. 1986 # ==> Summarize them in one mapping but with the largest window.
1891 for rfam in families: 1987 for rfam in families:
1892 sel_5_to_3 = (inferred_mappings['pdb_start'] < inferred_mappings['pdb_end']) 1988 sel_5_to_3 = (inferred_mappings['pdb_start'] < inferred_mappings['pdb_end'])
1893 - thisfam_5_3 = (inferred_mappings['rfam_acc'] == rfam ) & sel_5_to_3 1989 + thisfam_5_3 = (inferred_mappings['rfam_acc'] == rfam) & sel_5_to_3
1894 - thisfam_3_5 = (inferred_mappings['rfam_acc'] == rfam ) & (sel_5_to_3 == False) 1990 + thisfam_3_5 = (inferred_mappings['rfam_acc'] == rfam) & (sel_5_to_3 == False)
1895 1991
1896 if ( 1992 if (
1897 - len(inferred_mappings[thisfam_5_3]) != len(inferred_mappings[ inferred_mappings['rfam_acc'] == rfam ]) 1993 + len(inferred_mappings[thisfam_5_3]) != len(inferred_mappings[inferred_mappings['rfam_acc'] == rfam])
1898 and len(inferred_mappings[thisfam_5_3]) > 0 1994 and len(inferred_mappings[thisfam_5_3]) > 0
1899 ): 1995 ):
1900 # there are mappings in both directions... wtf Rfam ?! 1996 # there are mappings in both directions... wtf Rfam ?!
...@@ -1908,8 +2004,8 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list: ...@@ -1908,8 +2004,8 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
1908 # We keep only the 5->3 sense. 2004 # We keep only the 5->3 sense.
1909 inferred_mappings = inferred_mappings.drop(index=inferred_mappings.index[thisfam_3_5]) 2005 inferred_mappings = inferred_mappings.drop(index=inferred_mappings.index[thisfam_3_5])
1910 sel_5_to_3 = (inferred_mappings['pdb_start'] < inferred_mappings['pdb_end']) 2006 sel_5_to_3 = (inferred_mappings['pdb_start'] < inferred_mappings['pdb_end'])
1911 - thisfam_5_3 = (inferred_mappings['rfam_acc'] == rfam ) & sel_5_to_3 2007 + thisfam_5_3 = (inferred_mappings['rfam_acc'] == rfam) & sel_5_to_3
1912 - thisfam_3_5 = (inferred_mappings['rfam_acc'] == rfam ) & (sel_5_to_3 == False) 2008 + thisfam_3_5 = (inferred_mappings['rfam_acc'] == rfam) & (sel_5_to_3 == False)
1913 print() 2009 print()
1914 warn(f"Found mappings to {rfam} in both directions on the same interval, keeping only the 5'->3' one.") 2010 warn(f"Found mappings to {rfam} in both directions on the same interval, keeping only the 5'->3' one.")
1915 else: 2011 else:
...@@ -1919,35 +2015,35 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list: ...@@ -1919,35 +2015,35 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
1919 2015
1920 # Compute consensus for chains in 5' -> 3' sense 2016 # Compute consensus for chains in 5' -> 3' sense
1921 if len(inferred_mappings[thisfam_5_3]): 2017 if len(inferred_mappings[thisfam_5_3]):
1922 - pdb_start_min = min(inferred_mappings[ thisfam_5_3]['pdb_start']) 2018 + pdb_start_min = min(inferred_mappings[thisfam_5_3]['pdb_start'])
1923 - pdb_end_max = max(inferred_mappings[ thisfam_5_3]['pdb_end']) 2019 + pdb_end_max = max(inferred_mappings[thisfam_5_3]['pdb_end'])
1924 - pdb_start_max = max(inferred_mappings[ thisfam_5_3]['pdb_start']) 2020 + pdb_start_max = max(inferred_mappings[thisfam_5_3]['pdb_start'])
1925 - pdb_end_min = min(inferred_mappings[ thisfam_5_3]['pdb_end']) 2021 + pdb_end_min = min(inferred_mappings[thisfam_5_3]['pdb_end'])
1926 if (pdb_start_max - pdb_start_min < 100) and (pdb_end_max - pdb_end_min < 100): 2022 if (pdb_start_max - pdb_start_min < 100) and (pdb_end_max - pdb_end_min < 100):
1927 # the variation is only a few nucleotides, we take the largest window. 2023 # the variation is only a few nucleotides, we take the largest window.
1928 - inferred_mappings.loc[ thisfam_5_3, 'pdb_start'] = pdb_start_min 2024 + inferred_mappings.loc[thisfam_5_3, 'pdb_start'] = pdb_start_min
1929 - inferred_mappings.loc[ thisfam_5_3, 'pdb_end'] = pdb_end_max 2025 + inferred_mappings.loc[thisfam_5_3, 'pdb_end'] = pdb_end_max
1930 else: 2026 else:
1931 # there probably is an outlier. We chose the median value in the whole list of known_mappings. 2027 # there probably is an outlier. We chose the median value in the whole list of known_mappings.
1932 - known_sel_5_to_3 = (known_mappings['rfam_acc'] == rfam ) & (known_mappings['pdb_start'] < known_mappings['pdb_end']) 2028 + known_sel_5_to_3 = (known_mappings['rfam_acc'] == rfam) & (known_mappings['pdb_start'] < known_mappings['pdb_end'])
1933 - inferred_mappings.loc[ thisfam_5_3, 'pdb_start'] = known_mappings.loc[known_sel_5_to_3, 'pdb_start'].median() 2029 + inferred_mappings.loc[thisfam_5_3, 'pdb_start'] = known_mappings.loc[known_sel_5_to_3, 'pdb_start'].median()
1934 - inferred_mappings.loc[ thisfam_5_3, 'pdb_end'] = known_mappings.loc[known_sel_5_to_3, 'pdb_end'].median() 2030 + inferred_mappings.loc[thisfam_5_3, 'pdb_end'] = known_mappings.loc[known_sel_5_to_3, 'pdb_end'].median()
1935 2031
1936 # Compute consensus for chains in 3' -> 5' sense 2032 # Compute consensus for chains in 3' -> 5' sense
1937 if len(inferred_mappings[thisfam_3_5]): 2033 if len(inferred_mappings[thisfam_3_5]):
1938 - pdb_start_min = min(inferred_mappings[ thisfam_3_5]['pdb_start']) 2034 + pdb_start_min = min(inferred_mappings[thisfam_3_5]['pdb_start'])
1939 - pdb_end_max = max(inferred_mappings[ thisfam_3_5]['pdb_end']) 2035 + pdb_end_max = max(inferred_mappings[thisfam_3_5]['pdb_end'])
1940 - pdb_start_max = max(inferred_mappings[ thisfam_3_5]['pdb_start']) 2036 + pdb_start_max = max(inferred_mappings[thisfam_3_5]['pdb_start'])
1941 - pdb_end_min = min(inferred_mappings[ thisfam_3_5]['pdb_end']) 2037 + pdb_end_min = min(inferred_mappings[thisfam_3_5]['pdb_end'])
1942 if (pdb_start_max - pdb_start_min < 100) and (pdb_end_max - pdb_end_min < 100): 2038 if (pdb_start_max - pdb_start_min < 100) and (pdb_end_max - pdb_end_min < 100):
1943 # the variation is only a few nucleotides, we take the largest window. 2039 # the variation is only a few nucleotides, we take the largest window.
1944 - inferred_mappings.loc[ thisfam_3_5, 'pdb_start'] = pdb_start_max 2040 + inferred_mappings.loc[thisfam_3_5, 'pdb_start'] = pdb_start_max
1945 - inferred_mappings.loc[ thisfam_3_5, 'pdb_end'] = pdb_end_min 2041 + inferred_mappings.loc[thisfam_3_5, 'pdb_end'] = pdb_end_min
1946 else: 2042 else:
1947 # there probably is an outlier. We chose the median value in the whole list of known_mappings. 2043 # there probably is an outlier. We chose the median value in the whole list of known_mappings.
1948 - known_sel_3_to_5 = (known_mappings['rfam_acc'] == rfam ) & (known_mappings['pdb_start'] > known_mappings['pdb_end']) 2044 + known_sel_3_to_5 = (known_mappings['rfam_acc'] == rfam) & (known_mappings['pdb_start'] > known_mappings['pdb_end'])
1949 - inferred_mappings.loc[ thisfam_3_5, 'pdb_start'] = known_mappings.loc[known_sel_3_to_5, 'pdb_start'].median() 2045 + inferred_mappings.loc[thisfam_3_5, 'pdb_start'] = known_mappings.loc[known_sel_3_to_5, 'pdb_start'].median()
1950 - inferred_mappings.loc[ thisfam_3_5, 'pdb_end'] = known_mappings.loc[known_sel_3_to_5, 'pdb_end'].median() 2046 + inferred_mappings.loc[thisfam_3_5, 'pdb_end'] = known_mappings.loc[known_sel_3_to_5, 'pdb_end'].median()
1951 inferred_mappings.drop_duplicates(inplace=True) 2047 inferred_mappings.drop_duplicates(inplace=True)
1952 2048
1953 # Now build Chain() objects for the mapped chains 2049 # Now build Chain() objects for the mapped chains
...@@ -1958,7 +2054,8 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list: ...@@ -1958,7 +2054,8 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
1958 pdb_chain_id = nr[2] 2054 pdb_chain_id = nr[2]
1959 for rfam in families: 2055 for rfam in families:
1960 # if a known mapping of this chain on this family exists, apply it 2056 # if a known mapping of this chain on this family exists, apply it
1961 - m = known_mappings.loc[ (known_mappings.pdb_id + "|1|" + known_mappings.chain == c[:4].lower()+c[4:]) & (known_mappings['rfam_acc'] == rfam ) ] 2057 + this_chain_idxs = (known_mappings.pdb_id + "|1|" + known_mappings.chain == c[:4].lower()+c[4:])
2058 + m = known_mappings.loc[this_chain_idxs & (known_mappings['rfam_acc'] == rfam)]
1962 if len(m) and len(m) < 2: 2059 if len(m) and len(m) < 2:
1963 pdb_start = int(m.pdb_start) 2060 pdb_start = int(m.pdb_start)
1964 pdb_end = int(m.pdb_end) 2061 pdb_end = int(m.pdb_end)
...@@ -1969,23 +2066,35 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list: ...@@ -1969,23 +2066,35 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
1969 pdb_start = int(m.pdb_start.min()) 2066 pdb_start = int(m.pdb_start.min())
1970 pdb_end = int(m.pdb_end.max()) 2067 pdb_end = int(m.pdb_end.max())
1971 inferred = False 2068 inferred = False
1972 - elif not(pdb_id in known_mappings.pdb_id and pdb_chain_id in known_mappings.chain): # if no known mapping on another family, use the inferred mapping 2069 + elif (fullinference or not(this_chain_idxs.any())):
1973 - pdb_start = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_start) 2070 + # if no known mapping on another family, use the inferred mapping
1974 - pdb_end = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_end) 2071 + # idem if the user said to do so with --full-inference
2072 + pdb_start = int(inferred_mappings.loc[(inferred_mappings['rfam_acc'] == rfam)].pdb_start)
2073 + pdb_end = int(inferred_mappings.loc[(inferred_mappings['rfam_acc'] == rfam)].pdb_end)
1975 inferred = True 2074 inferred = True
2075 + else:
2076 + # skip this family, we cannot map this chain to it.
2077 + continue
1976 chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}_{pdb_start}-{pdb_end}" 2078 chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}_{pdb_start}-{pdb_end}"
1977 2079
1978 # Check if the chain exists in the database 2080 # Check if the chain exists in the database
1979 if update_only: 2081 if update_only:
1980 with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: 2082 with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn:
1981 - res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc='{rfam}' AND issue=0""") 2083 + res = sql_ask_database(conn, f"""SELECT chain_id from chain
2084 + WHERE structure_id='{pdb_id}'
2085 + AND chain_name='{pdb_chain_id}'
2086 + AND rfam_acc='{rfam}'
2087 + AND issue=0""")
1982 if not len(res): # the chain is NOT yet in the database, or this is a known issue 2088 if not len(res): # the chain is NOT yet in the database, or this is a known issue
1983 - newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end)) 2089 + newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class,
2090 + rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end))
1984 else: 2091 else:
1985 - newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end)) 2092 + newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class,
2093 + rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end))
1986 2094
1987 return newchains 2095 return newchains
1988 2096
2097 +
1989 @trace_unhandled_exceptions 2098 @trace_unhandled_exceptions
1990 def work_mmcif(pdb_id): 2099 def work_mmcif(pdb_id):
1991 """ Look for a CIF file (with all chains) from RCSB 2100 """ Look for a CIF file (with all chains) from RCSB
...@@ -1999,8 +2108,11 @@ def work_mmcif(pdb_id): ...@@ -1999,8 +2108,11 @@ def work_mmcif(pdb_id):
1999 2108
2000 # Attempt to download it if not present 2109 # Attempt to download it if not present
2001 try: 2110 try:
2002 - if not path.isfile(final_filepath): 2111 + if not os.path.isfile(final_filepath):
2003 - subprocess.run(["wget", f'http://files.rcsb.org/download/{pdb_id}.cif', "-O", final_filepath], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) 2112 + subprocess.run(
2113 + ["wget", f'http://files.rcsb.org/download/{pdb_id}.cif', "-O", final_filepath],
2114 + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
2115 + )
2004 except: 2116 except:
2005 warn(f"Unable to download {pdb_id}.cif. Ignoring it.", error=True) 2117 warn(f"Unable to download {pdb_id}.cif. Ignoring it.", error=True)
2006 return 2118 return
...@@ -2012,7 +2124,7 @@ def work_mmcif(pdb_id): ...@@ -2012,7 +2124,7 @@ def work_mmcif(pdb_id):
2012 # if not, read the CIF header and register the structure 2124 # if not, read the CIF header and register the structure
2013 if not len(r): 2125 if not len(r):
2014 # Load the MMCIF file with Biopython 2126 # Load the MMCIF file with Biopython
2015 - mmCif_info = MMCIF2Dict(final_filepath) 2127 + mmCif_info = Bio.PDB.MMCIF2Dict.MMCIF2Dict(final_filepath)
2016 2128
2017 # Get info about that structure 2129 # Get info about that structure
2018 try: 2130 try:
...@@ -2036,9 +2148,9 @@ def work_mmcif(pdb_id): ...@@ -2036,9 +2148,9 @@ def work_mmcif(pdb_id):
2036 # Save into the database 2148 # Save into the database
2037 with sqlite3.connect(runDir + "/results/RNANet.db") as conn: 2149 with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
2038 sql_execute(conn, """INSERT OR REPLACE INTO structure (pdb_id, pdb_model, date, exp_method, resolution) 2150 sql_execute(conn, """INSERT OR REPLACE INTO structure (pdb_id, pdb_model, date, exp_method, resolution)
2039 - VALUES (?, ?, DATE(?), ?, ?);""", data = (pdb_id, 1, date, exp_meth, reso)) 2151 + VALUES (?, ?, DATE(?), ?, ?);""", data=(pdb_id, 1, date, exp_meth, reso))
2040 2152
2041 - if not path.isfile(path_to_3D_data + "annotations/" + pdb_id + ".json"): 2153 + if not os.path.isfile(path_to_3D_data + "annotations/" + pdb_id + ".json"):
2042 2154
2043 # run DSSR (you need to have it in your $PATH, follow x3dna installation instructions) 2155 # run DSSR (you need to have it in your $PATH, follow x3dna installation instructions)
2044 output = subprocess.run(["x3dna-dssr", f"-i={final_filepath}", "--json", "--auxfile=no"], 2156 output = subprocess.run(["x3dna-dssr", f"-i={final_filepath}", "--json", "--auxfile=no"],
...@@ -2052,22 +2164,23 @@ def work_mmcif(pdb_id): ...@@ -2052,22 +2164,23 @@ def work_mmcif(pdb_id):
2052 return 1 2164 return 1
2053 2165
2054 # save the analysis to file only if we can load it :/ 2166 # save the analysis to file only if we can load it :/
2055 - json_file = open(path_to_3D_data + "annotations/" + pdb_id + ".json", "w") 2167 + json_file = open(path_to_3D_data + "annotations/" +
2168 + pdb_id + ".json", "w")
2056 json_file.write(stdout) 2169 json_file.write(stdout)
2057 json_file.close() 2170 json_file.close()
2058 2171
2059 return 0 2172 return 0
2060 2173
2174 +
2061 @trace_unhandled_exceptions 2175 @trace_unhandled_exceptions
2062 def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): 2176 def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True):
2063 """Reads information from JSON and save it to database. 2177 """Reads information from JSON and save it to database.
2064 If asked, also extracts the 3D chains from their original structure files. 2178 If asked, also extracts the 3D chains from their original structure files.
2065 -
2066 """ 2179 """
2067 2180
2068 setproctitle(f"RNAnet.py work_build_chain({c.chain_label})") 2181 setproctitle(f"RNAnet.py work_build_chain({c.chain_label})")
2069 2182
2070 - if not path.isfile(path_to_3D_data + "annotations/" + c.pdb_id + ".json"): 2183 + if not os.path.isfile(path_to_3D_data + "annotations/" + c.pdb_id + ".json"):
2071 warn(f"Could not find annotations for {c.chain_label}, ignoring it.", error=True) 2184 warn(f"Could not find annotations for {c.chain_label}, ignoring it.", error=True)
2072 c.delete_me = True 2185 c.delete_me = True
2073 c.error_messages += f"Could not download and/or find annotations for {c.chain_label}." 2186 c.error_messages += f"Could not download and/or find annotations for {c.chain_label}."
...@@ -2094,25 +2207,28 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): ...@@ -2094,25 +2207,28 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True):
2094 2207
2095 return c 2208 return c
2096 2209
2210 +
2097 @trace_unhandled_exceptions 2211 @trace_unhandled_exceptions
2098 def work_prepare_sequences(dl, rfam_acc, chains): 2212 def work_prepare_sequences(dl, rfam_acc, chains):
2099 - """Prepares FASTA files of homologous sequences to realign with cmalign or SINA.""" 2213 + """Prepares FASTA files of homologous sequences to realign with cmalign or SINA.
2214 + """
2100 2215
2101 setproctitle("RNAnet.py work_prepare_sequences()") 2216 setproctitle("RNAnet.py work_prepare_sequences()")
2102 2217
2103 if rfam_acc in LSU_set | SSU_set: # rRNA 2218 if rfam_acc in LSU_set | SSU_set: # rRNA
2104 - if path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"): 2219 + if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"):
2105 # Detect doublons and remove them 2220 # Detect doublons and remove them
2106 - existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta") 2221 + existing_afa = Bio.AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta")
2107 - existing_ids = [ r.id for r in existing_afa ] 2222 + existing_ids = [r.id for r in existing_afa]
2108 del existing_afa 2223 del existing_afa
2109 - new_ids = [ str(c) for c in chains ] 2224 + new_ids = [str(c) for c in chains]
2110 - doublons = [ i for i in existing_ids if i in new_ids ] 2225 + doublons = [i for i in existing_ids if i in new_ids]
2111 del existing_ids, new_ids 2226 del existing_ids, new_ids
2112 if len(doublons): 2227 if len(doublons):
2113 - fasta = path_to_seq_data + f"realigned/{rfam_acc}++.fa"
2114 warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.fa and using their newest version") 2228 warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.fa and using their newest version")
2115 - seqfile = SeqIO.parse(fasta, "fasta") 2229 + fasta = path_to_seq_data + f"realigned/{rfam_acc}++.fa"
2230 + seqfile = Bio.SeqIO.parse(fasta, "fasta")
2231 + # remove it and rewrite it with its own content filtered
2116 os.remove(fasta) 2232 os.remove(fasta)
2117 with open(fasta, 'w') as f: 2233 with open(fasta, 'w') as f:
2118 for rec in seqfile: 2234 for rec in seqfile:
...@@ -2123,16 +2239,15 @@ def work_prepare_sequences(dl, rfam_acc, chains): ...@@ -2123,16 +2239,15 @@ def work_prepare_sequences(dl, rfam_acc, chains):
2123 with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "a") as f: 2239 with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "a") as f:
2124 for c in chains: 2240 for c in chains:
2125 if len(c.seq_to_align): 2241 if len(c.seq_to_align):
2126 - f.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U','T')+'\n') 2242 + f.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U', 'T')+'\n')
2127 status = f"{rfam_acc}: {len(chains)} new PDB sequences to align (with SINA)" 2243 status = f"{rfam_acc}: {len(chains)} new PDB sequences to align (with SINA)"
2128 2244
2129 - 2245 + elif not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.stk"):
2130 - elif not path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.stk"):
2131 # there was no previous aligned sequences, and we use cmalign. 2246 # there was no previous aligned sequences, and we use cmalign.
2132 # So, we need to download homologous sequences from Rfam. 2247 # So, we need to download homologous sequences from Rfam.
2133 2248
2134 # Extracting covariance model for this family 2249 # Extracting covariance model for this family
2135 - if not path.isfile(path_to_seq_data + f"realigned/{rfam_acc}.cm"): 2250 + if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}.cm"):
2136 with open(path_to_seq_data + f"realigned/{rfam_acc}.cm", "w") as f: 2251 with open(path_to_seq_data + f"realigned/{rfam_acc}.cm", "w") as f:
2137 subprocess.run(["cmfetch", path_to_seq_data + "Rfam.cm", rfam_acc], stdout=f) 2252 subprocess.run(["cmfetch", path_to_seq_data + "Rfam.cm", rfam_acc], stdout=f)
2138 notify(f"Extracted {rfam_acc} covariance model (cmfetch)") 2253 notify(f"Extracted {rfam_acc} covariance model (cmfetch)")
...@@ -2141,7 +2256,7 @@ def work_prepare_sequences(dl, rfam_acc, chains): ...@@ -2141,7 +2256,7 @@ def work_prepare_sequences(dl, rfam_acc, chains):
2141 dl.download_Rfam_sequences(rfam_acc) 2256 dl.download_Rfam_sequences(rfam_acc)
2142 2257
2143 # Prepare a FASTA file containing Rfamseq hits for that family 2258 # Prepare a FASTA file containing Rfamseq hits for that family
2144 - if path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"): # test if download succeeded 2259 + if os.path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"): # test if download succeeded
2145 2260
2146 # gunzip the file 2261 # gunzip the file
2147 with gzip.open(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz", 'rb') as gz: 2262 with gzip.open(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz", 'rb') as gz:
...@@ -2153,14 +2268,14 @@ def work_prepare_sequences(dl, rfam_acc, chains): ...@@ -2153,14 +2268,14 @@ def work_prepare_sequences(dl, rfam_acc, chains):
2153 with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "w") as plusplus: 2268 with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "w") as plusplus:
2154 ids = set() 2269 ids = set()
2155 # Remove doublons from the Rfam hits 2270 # Remove doublons from the Rfam hits
2156 - for r in SeqIO.parse(path_to_seq_data + f"realigned/{rfam_acc}.fa", "fasta"): 2271 + for r in Bio.SeqIO.parse(path_to_seq_data + f"realigned/{rfam_acc}.fa", "fasta"):
2157 if r.id not in ids: 2272 if r.id not in ids:
2158 ids.add(r.id) 2273 ids.add(r.id)
2159 plusplus.write('> '+r.description+'\n'+str(r.seq)+'\n') 2274 plusplus.write('> '+r.description+'\n'+str(r.seq)+'\n')
2160 # Add the 3D chains sequences 2275 # Add the 3D chains sequences
2161 for c in chains: 2276 for c in chains:
2162 if len(c.seq_to_align): 2277 if len(c.seq_to_align):
2163 - plusplus.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U','T')+'\n') 2278 + plusplus.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U', 'T')+'\n')
2164 2279
2165 del file_content 2280 del file_content
2166 # os.remove(path_to_seq_data + f"realigned/{rfam_acc}.fa") 2281 # os.remove(path_to_seq_data + f"realigned/{rfam_acc}.fa")
...@@ -2175,12 +2290,13 @@ def work_prepare_sequences(dl, rfam_acc, chains): ...@@ -2175,12 +2290,13 @@ def work_prepare_sequences(dl, rfam_acc, chains):
2175 with open(path_to_seq_data + f"realigned/{rfam_acc}_new.fa", "w") as f: 2290 with open(path_to_seq_data + f"realigned/{rfam_acc}_new.fa", "w") as f:
2176 for c in chains: 2291 for c in chains:
2177 if len(c.seq_to_align): 2292 if len(c.seq_to_align):
2178 - f.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U','T')+'\n') 2293 + f.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U', 'T')+'\n')
2179 status = f"{rfam_acc}: {len(chains)} new PDB sequences to realign (with existing cmalign alignment)" 2294 status = f"{rfam_acc}: {len(chains)} new PDB sequences to realign (with existing cmalign alignment)"
2180 2295
2181 # print some stats 2296 # print some stats
2182 notify(status) 2297 notify(status)
2183 2298
2299 +
2184 @trace_unhandled_exceptions 2300 @trace_unhandled_exceptions
2185 def work_realign(rfam_acc): 2301 def work_realign(rfam_acc):
2186 """ Runs multiple sequence alignements by RNA family. 2302 """ Runs multiple sequence alignements by RNA family.
...@@ -2209,10 +2325,10 @@ def work_realign(rfam_acc): ...@@ -2209,10 +2325,10 @@ def work_realign(rfam_acc):
2209 else: 2325 else:
2210 # Align using Infernal for most RNA families 2326 # Align using Infernal for most RNA families
2211 2327
2212 - if path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"): 2328 + if os.path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"):
2213 # Alignment exists. We just want to add new sequences into it. 2329 # Alignment exists. We just want to add new sequences into it.
2214 2330
2215 - if not path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"): 2331 + if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"):
2216 # there are no new sequences to align... 2332 # there are no new sequences to align...
2217 return 2333 return
2218 2334
...@@ -2227,13 +2343,13 @@ def work_realign(rfam_acc): ...@@ -2227,13 +2343,13 @@ def work_realign(rfam_acc):
2227 notify("Aligned new sequences together") 2343 notify("Aligned new sequences together")
2228 2344
2229 # Detect doublons and remove them 2345 # Detect doublons and remove them
2230 - existing_stk = AlignIO.read(existing_ali_path, "stockholm") 2346 + existing_stk = Bio.AlignIO.read(existing_ali_path, "stockholm")
2231 - existing_ids = [ r.id for r in existing_stk ] 2347 + existing_ids = [r.id for r in existing_stk]
2232 del existing_stk 2348 del existing_stk
2233 - new_stk = AlignIO.read(new_ali_path, "stockholm") 2349 + new_stk = Bio.AlignIO.read(new_ali_path, "stockholm")
2234 - new_ids = [ r.id for r in new_stk ] 2350 + new_ids = [r.id for r in new_stk]
2235 del new_stk 2351 del new_stk
2236 - doublons = [ i for i in existing_ids if i in new_ids ] 2352 + doublons = [i for i in existing_ids if i in new_ids]
2237 del existing_ids, new_ids 2353 del existing_ids, new_ids
2238 if len(doublons): 2354 if len(doublons):
2239 warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version") 2355 warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version")
...@@ -2241,12 +2357,13 @@ def work_realign(rfam_acc): ...@@ -2241,12 +2357,13 @@ def work_realign(rfam_acc):
2241 toremove.write('\n'.join(doublons)+'\n') 2357 toremove.write('\n'.join(doublons)+'\n')
2242 p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path], 2358 p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path],
2243 stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) 2359 stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2244 - p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path], stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) 2360 + p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path],
2361 + stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2245 os.remove(path_to_seq_data + "realigned/toremove.txt") 2362 os.remove(path_to_seq_data + "realigned/toremove.txt")
2246 2363
2247 # And we merge the two alignments 2364 # And we merge the two alignments
2248 - p2= subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", 2365 + p2 = subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk",
2249 - "--rna", existing_ali_path, new_ali_path ], 2366 + "--rna", existing_ali_path, new_ali_path],
2250 stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) 2367 stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2251 stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8') 2368 stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8')
2252 subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path]) 2369 subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path])
...@@ -2263,7 +2380,7 @@ def work_realign(rfam_acc): ...@@ -2263,7 +2380,7 @@ def work_realign(rfam_acc):
2263 p = subprocess.run(["cmalign", "--small", "--cyk", "--noprob", "--nonbanded", "--notrunc", 2380 p = subprocess.run(["cmalign", "--small", "--cyk", "--noprob", "--nonbanded", "--notrunc",
2264 '-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk", 2381 '-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk",
2265 path_to_seq_data + f"realigned/{rfam_acc}.cm", 2382 path_to_seq_data + f"realigned/{rfam_acc}.cm",
2266 - path_to_seq_data + f"realigned/{rfam_acc}++.fa" ], 2383 + path_to_seq_data + f"realigned/{rfam_acc}++.fa"],
2267 stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) 2384 stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2268 stderr = p.stderr.decode("utf-8") 2385 stderr = p.stderr.decode("utf-8")
2269 2386
...@@ -2277,7 +2394,9 @@ def work_realign(rfam_acc): ...@@ -2277,7 +2394,9 @@ def work_realign(rfam_acc):
2277 print('\t'+validsymb, flush=True) 2394 print('\t'+validsymb, flush=True)
2278 2395
2279 # Convert Stockholm to aligned FASTA 2396 # Convert Stockholm to aligned FASTA
2280 - subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa", "--informat", "stockholm", "afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"]) 2397 + subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
2398 + "--informat", "stockholm",
2399 + "afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"])
2281 subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part. 2400 subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part.
2282 2401
2283 # Assert everything worked, or save an error 2402 # Assert everything worked, or save an error
...@@ -2288,6 +2407,7 @@ def work_realign(rfam_acc): ...@@ -2288,6 +2407,7 @@ def work_realign(rfam_acc):
2288 with open(runDir + "/errors.txt", "a") as er: 2407 with open(runDir + "/errors.txt", "a") as er:
2289 er.write(f"Failed to realign {rfam_acc} (killed)") 2408 er.write(f"Failed to realign {rfam_acc} (killed)")
2290 2409
2410 +
2291 def summarize_position(counts): 2411 def summarize_position(counts):
2292 """ Counts the number of nucleotides at a given position, given a "column" from a MSA. 2412 """ Counts the number of nucleotides at a given position, given a "column" from a MSA.
2293 """ 2413 """
...@@ -2303,15 +2423,15 @@ def summarize_position(counts): ...@@ -2303,15 +2423,15 @@ def summarize_position(counts):
2303 N += counts[char] # number of ungapped residues 2423 N += counts[char] # number of ungapped residues
2304 2424
2305 if N: # prevent division by zero if the column is only gaps 2425 if N: # prevent division by zero if the column is only gaps
2306 - return ( counts['A']/N, counts['C']/N, counts['G']/N, counts['U']/N, (N - known_chars_count)/N) # other residues, or consensus (N, K, Y...) 2426 + return (counts['A']/N, counts['C']/N, counts['G']/N, counts['U']/N, (N - known_chars_count)/N) # other residues, or consensus (N, K, Y...)
2307 else: 2427 else:
2308 return (0, 0, 0, 0, 0) 2428 return (0, 0, 0, 0, 0)
2309 2429
2430 +
2310 @trace_unhandled_exceptions 2431 @trace_unhandled_exceptions
2311 def work_pssm(f, fill_gaps): 2432 def work_pssm(f, fill_gaps):
2312 """ Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family. 2433 """ Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family.
2313 2434
2314 - Also saves every chain of the family to file.
2315 Uses only 1 core, so this function can be called in parallel. 2435 Uses only 1 core, so this function can be called in parallel.
2316 2436
2317 """ 2437 """
...@@ -2323,18 +2443,17 @@ def work_pssm(f, fill_gaps): ...@@ -2323,18 +2443,17 @@ def work_pssm(f, fill_gaps):
2323 2443
2324 # get the chains of this family 2444 # get the chains of this family
2325 list_of_chains = rfam_acc_to_download[f] 2445 list_of_chains = rfam_acc_to_download[f]
2326 - chains_ids = [ str(c) for c in list_of_chains ] 2446 + chains_ids = [str(c) for c in list_of_chains]
2327 2447
2328 # Open the alignment 2448 # Open the alignment
2329 try: 2449 try:
2330 - align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta") 2450 + align = Bio.AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")
2331 except: 2451 except:
2332 warn(f"{f}'s alignment is wrong. Recompute it and retry.", error=True) 2452 warn(f"{f}'s alignment is wrong. Recompute it and retry.", error=True)
2333 with open(runDir + "/errors.txt", "a") as errf: 2453 with open(runDir + "/errors.txt", "a") as errf:
2334 errf.write(f"{f}'s alignment is wrong. Recompute it and retry.\n") 2454 errf.write(f"{f}'s alignment is wrong. Recompute it and retry.\n")
2335 return 1 2455 return 1
2336 2456
2337 -
2338 # Compute statistics per column 2457 # Compute statistics per column
2339 pssm = BufferingSummaryInfo(align).get_pssm(f, thr_idx) 2458 pssm = BufferingSummaryInfo(align).get_pssm(f, thr_idx)
2340 frequencies = [ summarize_position(pssm[i]) for i in range(align.get_alignment_length()) ] 2459 frequencies = [ summarize_position(pssm[i]) for i in range(align.get_alignment_length()) ]
...@@ -2378,10 +2497,13 @@ def work_pssm(f, fill_gaps): ...@@ -2378,10 +2497,13 @@ def work_pssm(f, fill_gaps):
2378 2497
2379 # Save the re_mappings 2498 # Save the re_mappings
2380 conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0) 2499 conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0)
2381 - sql_execute(conn, "INSERT INTO re_mapping (chain_id, index_chain, index_ali) VALUES (?, ?, ?) ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;", many=True, data=re_mappings) 2500 + sql_execute(conn, """INSERT INTO re_mapping (chain_id, index_chain, index_ali)
2501 + VALUES (?, ?, ?)
2502 + ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;""",
2503 + many=True, data=re_mappings)
2382 2504
2383 # Save the useful columns in the database 2505 # Save the useful columns in the database
2384 - data = [ (f, j) + frequencies[j-1] for j in sorted(columns_to_save) ] 2506 + data = [(f, j) + frequencies[j-1] for j in sorted(columns_to_save)]
2385 sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other) 2507 sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other)
2386 VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO 2508 VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO
2387 UPDATE SET freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, freq_other=excluded.freq_other;""", many=True, data=data) 2509 UPDATE SET freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, freq_other=excluded.freq_other;""", many=True, data=data)
...@@ -2412,12 +2534,13 @@ def work_pssm(f, fill_gaps): ...@@ -2412,12 +2534,13 @@ def work_pssm(f, fill_gaps):
2412 pbar.close() 2534 pbar.close()
2413 sql_execute(conn, f"""UPDATE nucleotide SET nt_align_code = ?, 2535 sql_execute(conn, f"""UPDATE nucleotide SET nt_align_code = ?,
2414 is_A = ?, is_C = ?, is_G = ?, is_U = ?, is_other = ? 2536 is_A = ?, is_C = ?, is_G = ?, is_U = ?, is_other = ?
2415 - WHERE chain_id = ? AND index_chain = ?;""", many=True, data = gaps) 2537 + WHERE chain_id = ? AND index_chain = ?;""", many=True, data=gaps)
2416 2538
2417 conn.close() 2539 conn.close()
2418 idxQueue.put(thr_idx) # replace the thread index in the queue 2540 idxQueue.put(thr_idx) # replace the thread index in the queue
2419 return 0 2541 return 0
2420 2542
2543 +
2421 @trace_unhandled_exceptions 2544 @trace_unhandled_exceptions
2422 def work_save(c, homology=True): 2545 def work_save(c, homology=True):
2423 2546
...@@ -2451,9 +2574,11 @@ def work_save(c, homology=True): ...@@ -2451,9 +2574,11 @@ def work_save(c, homology=True):
2451 2574
2452 df.to_csv(filename, float_format="%.2f", index=False) 2575 df.to_csv(filename, float_format="%.2f", index=False)
2453 2576
2577 +
2454 if __name__ == "__main__": 2578 if __name__ == "__main__":
2455 2579
2456 - runDir = path.dirname(path.realpath(__file__)) 2580 + runDir = os.getcwd()
2581 + fileDir = os.path.dirname(os.path.realpath(__file__))
2457 ncores = read_cpu_number() 2582 ncores = read_cpu_number()
2458 pp = Pipeline() 2583 pp = Pipeline()
2459 pp.process_options() 2584 pp.process_options()
...@@ -2502,7 +2627,6 @@ if __name__ == "__main__": ...@@ -2502,7 +2627,6 @@ if __name__ == "__main__":
2502 print("Completed.") 2627 print("Completed.")
2503 exit(0) 2628 exit(0)
2504 2629
2505 -
2506 # At this point, structure, chain and nucleotide tables of the database are up to date. 2630 # At this point, structure, chain and nucleotide tables of the database are up to date.
2507 # (Modulo some statistics computed by statistics.py) 2631 # (Modulo some statistics computed by statistics.py)
2508 2632
...@@ -2511,13 +2635,14 @@ if __name__ == "__main__": ...@@ -2511,13 +2635,14 @@ if __name__ == "__main__":
2511 # =========================================================================== 2635 # ===========================================================================
2512 2636
2513 if pp.SELECT_ONLY is None: 2637 if pp.SELECT_ONLY is None:
2514 - pp.checkpoint_load_chains() # If your job failed, you can comment all the "3D information" part and start from here. 2638 + # If your job failed, you can comment all the "3D information" part and start from here.
2639 + pp.checkpoint_load_chains()
2515 2640
2516 # Get the list of Rfam families found 2641 # Get the list of Rfam families found
2517 rfam_acc_to_download = {} 2642 rfam_acc_to_download = {}
2518 for c in pp.loaded_chains: 2643 for c in pp.loaded_chains:
2519 if c.mapping.rfam_acc not in rfam_acc_to_download: 2644 if c.mapping.rfam_acc not in rfam_acc_to_download:
2520 - rfam_acc_to_download[c.mapping.rfam_acc] = [ c ] 2645 + rfam_acc_to_download[c.mapping.rfam_acc] = [c]
2521 else: 2646 else:
2522 rfam_acc_to_download[c.mapping.rfam_acc].append(c) 2647 rfam_acc_to_download[c.mapping.rfam_acc].append(c)
2523 2648
...@@ -2546,5 +2671,5 @@ if __name__ == "__main__": ...@@ -2546,5 +2671,5 @@ if __name__ == "__main__":
2546 2671
2547 print("Completed.") # This part of the code is supposed to release some serotonin in the modeller's brain, do not remove 2672 print("Completed.") # This part of the code is supposed to release some serotonin in the modeller's brain, do not remove
2548 2673
2549 - # # so i can sleep for the end of the night 2674 + # so i can sleep for the end of the night
2550 # subprocess.run(["poweroff"]) 2675 # subprocess.run(["poweroff"])
......
1 -1ml5_1_a_1-2914 1 +1eg0_1_O_1-73
2 -1ml5_1_a_151-2903
3 -1ml5_1_A_7-1515
4 -1ml5_1_A_2-1520
5 -1ml5_1_A_7-1518
6 -1ml5_1_b_5-121
7 2rdo_1_A_3-118 2 2rdo_1_A_3-118
8 4v48_1_A9_3-118 3 4v48_1_A9_3-118
9 4v47_1_A9_3-118 4 4v47_1_A9_3-118
10 -6zmi_1_L8_1267-4755
11 -6zm7_1_L8_1267-4755
12 -6y6x_1_L8_1267-4755
13 -6z6n_1_L8_1267-4755
14 -6qzp_1_L8_1267-4755
15 -6zme_1_L8_1267-4755
16 -6z6l_1_L8_1267-4755
17 -6ek0_1_L8_1267-4755
18 -6zmo_1_L8_1267-4755
19 -6z6m_1_L8_1267-4755
20 -6ole_1_D_1267-4755
21 -6om0_1_D_1267-4755
22 -6y2l_1_L8_1267-4755
23 -6lqm_1_8_1267-4755
24 -6y0g_1_L8_1267-4755
25 -6lu8_1_8_1267-4755
26 -6lsr_1_8_1267-4755
27 -6lss_1_8_1267-4755
28 -6oli_1_D_1267-4755
29 -6olg_1_A3_1267-4755
30 -6y57_1_L8_1267-4755
31 -5t2c_1_C_1267-4755
32 -6om7_1_D_1267-4755
33 -4ug0_1_L8_1267-4755
34 -6olf_1_D_1267-4755
35 -6ip5_1_1C_1267-4755
36 -6ip8_1_1C_1267-4755
37 -6olz_1_A3_1267-4755
38 -5aj0_1_A3_1267-4755
39 -5lks_1_L8_1267-4755
40 -6ip6_1_1C_1267-4755
41 -4v6x_1_A8_1267-4755
42 1vy7_1_AY_1-73 5 1vy7_1_AY_1-73
43 1vy7_1_CY_1-73 6 1vy7_1_CY_1-73
44 4w2h_1_CY_1-73 7 4w2h_1_CY_1-73
45 -2z9q_1_A_1-72 8 +1jgq_1_A_2-1520
9 +4v42_1_AA_2-1520
10 +1jgo_1_A_2-1520
11 +1jgp_1_A_2-1520
12 +1ml5_1_A_2-1520
13 +4v42_1_BA_1-2914
14 +1ml5_1_a_1-2914
46 4v42_1_BB_5-121 15 4v42_1_BB_5-121
16 +1ml5_1_b_5-121
17 +2rdo_1_B_1-2904
18 +4v48_1_A0_1-2904
19 +4v47_1_A0_1-2904
20 +4v48_1_BA_1-1543
21 +4v47_1_BA_1-1542
47 1ls2_1_B_1-73 22 1ls2_1_B_1-73
48 3ep2_1_Y_1-72 23 3ep2_1_Y_1-72
49 3eq3_1_Y_1-72 24 3eq3_1_Y_1-72
50 4v48_1_A6_1-73 25 4v48_1_A6_1-73
51 -1eg0_1_O_1-73 26 +2z9q_1_A_1-72
52 1gsg_1_T_1-72 27 1gsg_1_T_1-72
53 3jcr_1_H_1-115 28 3jcr_1_H_1-115
54 -4v42_1_BA_1-2914 29 +1x1l_1_A_1-132
55 -4v42_1_BA_151-2903 30 +1zc8_1_Z_1-93
56 -4v48_1_BA_1-91 31 +2ob7_1_D_1-132
57 -4v48_1_BA_6-1541
58 -4v48_1_BA_1-1543
59 -4v48_1_BA_6-1538
60 -4v47_1_BA_1-91
61 -4v47_1_BA_6-1540
62 -4v47_1_BA_1-1542
63 -4v47_1_BA_6-1537
64 -2rdo_1_B_1-2903
65 -2rdo_1_B_6-1460
66 -2rdo_1_B_1-1528
67 -2rdo_1_B_6-1457
68 -2rdo_1_B_160-2893
69 -2rdo_1_B_1-2904
70 -2rdo_1_B_6-1522
71 -4v48_1_A0_1-2903
72 -4v48_1_A0_6-1460
73 -4v48_1_A0_1-1528
74 -4v48_1_A0_6-1457
75 -4v48_1_A0_160-2893
76 -4v48_1_A0_1-2904
77 -4v48_1_A0_6-1522
78 -4v47_1_A0_1-2903
79 -4v47_1_A0_6-1460
80 -4v47_1_A0_1-1528
81 -4v47_1_A0_6-1457
82 -4v47_1_A0_160-2893
83 -4v47_1_A0_1-2904
84 -4v47_1_A0_6-1522
85 2ob7_1_A_10-319 32 2ob7_1_A_10-319
86 -1x1l_1_A_1-130
87 -1zc8_1_Z_1-130
88 -1zc8_1_Z_1-91
89 -2ob7_1_D_1-130
90 -6rxu_1_C2_588-2386
91 -6rxu_1_C2_583-2388
92 -6rxu_1_C2_588-2383
93 -5oql_1_2_588-2386
94 -5oql_1_2_583-2388
95 -5oql_1_2_588-2383
96 -6rxv_1_C2_588-2386
97 -6rxv_1_C2_583-2388
98 -6rxv_1_C2_588-2383
99 -6rxz_1_C2_588-2386
100 -6rxz_1_C2_583-2388
101 -6rxz_1_C2_588-2383
102 -6rxy_1_C2_588-2386
103 -6rxy_1_C2_583-2388
104 -6rxy_1_C2_588-2383
105 -6rxt_1_C2_588-2386
106 -6rxt_1_C2_583-2388
107 -6rxt_1_C2_588-2383
108 1r2x_1_C_1-58 33 1r2x_1_C_1-58
109 1r2w_1_C_1-58 34 1r2w_1_C_1-58
110 -1eg0_1_L_1-57
111 1eg0_1_L_1-56 35 1eg0_1_L_1-56
112 -1jgq_1_A_7-1518
113 -1jgq_1_A_20-55
114 -1jgq_1_A_2-1520
115 -1jgq_1_A_7-1515
116 -4v42_1_AA_7-1518
117 -4v42_1_AA_20-55
118 -4v42_1_AA_2-1520
119 -4v42_1_AA_7-1515
120 -1jgo_1_A_7-1518
121 -1jgo_1_A_20-55
122 -1jgo_1_A_2-1520
123 -1jgo_1_A_7-1515
124 -1jgp_1_A_7-1518
125 -1jgp_1_A_20-55
126 -1jgp_1_A_2-1520
127 -1jgp_1_A_7-1515
128 1zc8_1_A_1-59 36 1zc8_1_A_1-59
129 -1mvr_1_D_1-59 37 +1mvr_1_D_1-61
130 -4c9d_1_D_29-1 38 +4adx_1_9_1-123
131 -4c9d_1_C_29-1
132 -4adx_1_9_1-121
133 1zn1_1_B_1-59 39 1zn1_1_B_1-59
134 1emi_1_B_1-108 40 1emi_1_B_1-108
135 3iy9_1_A_498-1027 41 3iy9_1_A_498-1027
...@@ -143,25 +49,1558 @@ ...@@ -143,25 +49,1558 @@
143 3cw1_1_V_1-138 49 3cw1_1_V_1-138
144 3cw1_1_v_1-138 50 3cw1_1_v_1-138
145 2iy3_1_B_9-105 51 2iy3_1_B_9-105
146 -3jcr_1_N_1-188 52 +3jcr_1_N_1-107
147 -3jcr_1_N_1-106
148 2vaz_1_A_64-177 53 2vaz_1_A_64-177
149 -2ftc_1_R_1-1568
150 -2ftc_1_R_792-1568
151 2ftc_1_R_81-1466 54 2ftc_1_R_81-1466
152 3jcr_1_M_1-141 55 3jcr_1_M_1-141
153 -3jcr_1_M_1-188
154 -3jcr_1_M_1-107
155 -4v5z_1_B0_1-2899
156 4v5z_1_B0_1-2902 56 4v5z_1_B0_1-2902
157 -4v5z_1_B0_1-2840
158 5g2x_1_A_595-692 57 5g2x_1_A_595-692
159 3iy8_1_A_1-540 58 3iy8_1_A_1-540
160 4v5z_1_BY_2-113 59 4v5z_1_BY_2-113
161 4v5z_1_BZ_1-70 60 4v5z_1_BZ_1-70
162 -1mvr_1_B_1-96 61 +4v5z_1_B1_2-125
163 -4adx_1_0_1-2923 62 +1mvr_1_B_3-96
164 -4adx_1_0_132-2915 63 +4adx_1_0_1-2925
165 3eq4_1_Y_1-69 64 3eq4_1_Y_1-69
65 +6uz7_1_8_2140-2827
166 4v5z_1_AA_1-1563 66 4v5z_1_AA_1-1563
167 -4v5z_1_AA_1-1562 67 +6cfj_1_1X
68 +6cfj_1_2X
69 +5hcq_1_1X
70 +6cae_1_1X
71 +5hcq_1_2X
72 +5hcr_1_1X
73 +4z8c_1_1X
74 +5j4b_1_1X
75 +5j4b_1_2X
76 +4z8c_1_2X
77 +6cae_1_2X
78 +5j4c_1_1X
79 +5w4k_1_1X
80 +6of1_1_1X
81 +5hcr_1_2X
82 +5hd1_1_1X
83 +5hcp_1_1X
84 +6of1_1_2X
85 +5hau_1_1W
86 +5j4c_1_2X
87 +5wis_1_1X
88 +6xqd_1_1X
89 +6nd5_1_1X
90 +5w4k_1_2X
91 +5hau_1_2W
92 +6xqd_1_2X
93 +4y4p_1_1X
94 +6o97_1_1X
95 +5hcp_1_2X
96 +5doy_1_1X
97 +4zer_1_1X
98 +5wit_1_1X
99 +5hd1_1_2X
100 +6nd5_1_2X
101 +4z3s_1_1X
102 +7jql_1_1X
103 +7jqm_1_1X
104 +7jql_1_2X
105 +5wis_1_2X
106 +6nd6_1_1X
107 +6o97_1_2X
108 +4y4p_1_2X
109 +7jqm_1_2X
110 +4z3s_1_2X
111 +4zer_1_2X
112 +6uo1_1_2X
113 +6uo1_1_1X
114 +5doy_1_2X
115 +5wit_1_2X
116 +5f8k_1_1X
117 +6nd6_1_2X
118 +6xqe_1_1X
119 +6xqe_1_2X
120 +6n9e_1_1X
121 +6n9e_1_2X
122 +6n9f_1_1X
123 +5f8k_1_2X
124 +6n9f_1_2X
125 +6xz7_1_F
126 +6y69_1_W
127 +5afi_1_V
128 +5afi_1_W
129 +6h4n_1_W
130 +5wdt_1_V
131 +5wfs_1_V
132 +5wdt_1_W
133 +5wfs_1_W
134 +5we4_1_V
135 +5we4_1_W
136 +5uq8_1_Y
137 +6c4i_1_Y
138 +6c4i_1_X
139 +5zeb_1_V
140 +5zep_1_W
141 +5lzd_1_V
142 +5we6_1_V
143 +5wfk_1_V
144 +5wfk_1_W
145 +5we6_1_W
146 +5u4i_1_Y
147 +5uq7_1_Y
148 +5u4i_1_X
149 +5lza_1_V
150 +5wf0_1_V
151 +5wf0_1_W
152 +5zeu_1_V
153 +5l3p_1_X
154 +3jcj_1_V
155 +6gxm_1_X
156 +6gwt_1_X
157 +6gxn_1_X
158 +6gxo_1_X
159 +3j9y_1_V
160 +6o9k_1_Y
161 +6o7k_1_V
162 +5lzf_1_V
163 +3jcn_1_V
164 +5lzc_1_V
165 +5u4j_1_X
166 +5u4j_1_Z
167 +5lzb_1_V
168 +6h58_1_W
169 +6h58_1_WW
170 +1eg0_1_O
171 +5j8b_1_X
172 +4v7j_1_AV
173 +4v7j_1_BV
174 +4v7k_1_BV
175 +4v7k_1_AV
176 +4v7k_1_BW
177 +4v7k_1_AW
178 +4v7j_1_AW
179 +4v7j_1_BW
180 +4v4j_1_Z
181 +6i0v_1_B
182 +5k77_1_X
183 +5k77_1_V
184 +5k77_1_Y
185 +5k77_1_W
186 +5k77_1_Z
187 +4pei_1_X
188 +4pei_1_V
189 +4pei_1_W
190 +4pei_1_Z
191 +4pei_1_Y
192 +4a3c_1_P
193 +4a3e_1_P
194 +6lkq_1_U
195 +7k00_1_B
196 +6qdw_1_A
197 +2rdo_1_A
198 +4v48_1_A9
199 +4v47_1_A9
200 +6hcj_1_Q3
201 +6hcq_1_Q3
202 +5mmm_1_Z
203 +4w2e_1_W
204 +5j4b_1_1Y
205 +6cfj_1_1W
206 +5w4k_1_1Y
207 +5wit_1_1W
208 +6cfj_1_1Y
209 +6cfj_1_2W
210 +5j4c_1_1W
211 +5wis_1_1Y
212 +5j4c_1_1Y
213 +6cfj_1_2Y
214 +5wis_1_1W
215 +5j4b_1_1W
216 +5j4c_1_2W
217 +5j4b_1_2W
218 +5j4b_1_2Y
219 +5j4c_1_2Y
220 +5w4k_1_1W
221 +6nd5_1_1Y
222 +5wis_1_2Y
223 +5wit_1_2W
224 +5doy_1_1Y
225 +5w4k_1_2Y
226 +4y4p_1_1Y
227 +4z3s_1_1Y
228 +5doy_1_1W
229 +5doy_1_2Y
230 +6nd5_1_1W
231 +4z3s_1_2Y
232 +4z3s_1_1W
233 +5w4k_1_2W
234 +6nd5_1_2Y
235 +4y4p_1_2Y
236 +6uo1_1_2Y
237 +6uo1_1_2W
238 +4y4p_1_1W
239 +4z3s_1_2W
240 +6uo1_1_1Y
241 +6uo1_1_1W
242 +5wis_1_2W
243 +5wit_1_1Y
244 +6nd5_1_2W
245 +4y4p_1_2W
246 +5doy_1_2W
247 +5wit_1_2Y
248 +6ucq_1_1Y
249 +4v4i_1_Z
250 +6ucq_1_1X
251 +6ucq_1_2Y
252 +4w2e_1_X
253 +6ucq_1_2X
254 +6yss_1_W
255 +5afi_1_Y
256 +5uq8_1_Z
257 +5wdt_1_Y
258 +5wfs_1_Y
259 +6ysr_1_W
260 +5we4_1_Y
261 +6yst_1_W
262 +5uq7_1_Z
263 +5we6_1_Y
264 +5wfk_1_Y
265 +5wf0_1_Y
266 +6o9j_1_V
267 +6ysu_1_W
268 +3j46_1_A
269 +5j8b_1_Y
270 +5j8b_1_W
271 +3bbv_1_Z
272 +5aj0_1_BV
273 +5aj0_1_BW
274 +4wt8_1_AB
275 +4wt8_1_BB
276 +4v4j_1_Y
277 +4v4i_1_Y
278 +5uq8_1_X
279 +5uq7_1_X
280 +1jgq_1_A
281 +4v42_1_AA
282 +1jgo_1_A
283 +1jgp_1_A
284 +1ml5_1_A
285 +4v4j_1_W
286 +4v4i_1_W
287 +4v42_1_BA
288 +4wt8_1_CS
289 +4wt8_1_DS
290 +4v4j_1_X
291 +4v4i_1_X
292 +4v42_1_BB
293 +6uu4_1_333
294 +6uu0_1_333
295 +6uuc_1_333
296 +6uu2_1_333
297 +6b6h_1_3
298 +6pb4_1_3
299 +6d30_1_C
300 +6j7z_1_C
301 +3er9_1_D
302 +5kal_1_Y
303 +4nia_1_3
304 +5kal_1_Z
305 +4nia_1_7
306 +4nia_1_4
307 +5new_1_C
308 +4nia_1_U
309 +4nia_1_6
310 +4oq9_1_7
311 +4nia_1_1
312 +4oq9_1_4
313 +4nia_1_8
314 +4oq9_1_8
315 +4nia_1_5
316 +2vrt_1_E
317 +4nia_1_W
318 +4oq9_1_6
319 +4oq8_1_D
320 +4nia_1_Z
321 +4oq9_1_W
322 +4oq9_1_5
323 +4nia_1_2
324 +2vrt_1_F
325 +4oq9_1_U
326 +4oq9_1_Z
327 +4oq9_1_2
328 +4oq9_1_3
329 +1ddl_1_E
330 +4oq9_1_1
331 +6rt5_1_A
332 +6rt5_1_E
333 +4qu6_1_B
334 +6lkq_1_T
335 +6qdw_1_B
336 +3jbv_1_B
337 +3jbu_1_B
338 +2rdo_1_B
339 +4v48_1_A0
340 +4v47_1_A0
341 +6do8_1_B
342 +6dpi_1_B
343 +6dp9_1_B
344 +6dpb_1_B
345 +6dmn_1_B
346 +6dpp_1_B
347 +6dpk_1_B
348 +6dpd_1_B
349 +6dot_1_B
350 +6dok_1_B
351 +6dp8_1_B
352 +6dpl_1_B
353 +6dpg_1_B
354 +6dou_1_B
355 +6dpc_1_B
356 +6do9_1_B
357 +6dmv_1_B
358 +6dp4_1_B
359 +6dpn_1_B
360 +6doj_1_B
361 +6dph_1_B
362 +6dos_1_B
363 +6doo_1_B
364 +6dp6_1_B
365 +6dox_1_B
366 +6dp5_1_B
367 +6dol_1_B
368 +6dp1_1_B
369 +6doz_1_B
370 +6dp7_1_B
371 +6doq_1_B
372 +6dpa_1_B
373 +6dom_1_B
374 +6dog_1_B
375 +6dop_1_B
376 +6doh_1_B
377 +6doa_1_B
378 +6don_1_B
379 +6dov_1_B
380 +6dpo_1_B
381 +6dod_1_B
382 +6dob_1_B
383 +6dow_1_B
384 +6dpm_1_B
385 +6dpf_1_B
386 +6dp3_1_B
387 +6dp2_1_B
388 +6dpe_1_B
389 +6dpj_1_B
390 +6dor_1_B
391 +6dof_1_B
392 +6dp0_1_B
393 +6doi_1_B
394 +6doc_1_B
395 +6doe_1_B
396 +6n6g_1_D
397 +6lkq_1_S
398 +5h5u_1_H
399 +5lze_1_Y
400 +5lze_1_V
401 +5lze_1_X
402 +3jcj_1_G
403 +6o7k_1_G
404 +4v48_1_BA
405 +4v47_1_BA
406 +4b3r_1_W
407 +4b3t_1_W
408 +4b3s_1_W
409 +5o2r_1_X
410 +5kcs_1_1X
411 +6fti_1_U
412 +6fti_1_W
413 +6ftj_1_U
414 +6ftj_1_W
415 +6ftg_1_U
416 +6ftg_1_W
417 +6ole_1_T
418 +6om0_1_T
419 +6oli_1_T
420 +6om7_1_T
421 +6olf_1_T
422 +6w6l_1_T
423 +6x1b_1_D
424 +6x1b_1_F
425 +5f6c_1_C
426 +6i0t_1_B
427 +1b2m_1_C
428 +1b2m_1_D
429 +1b2m_1_E
430 +2uxc_1_Y
431 +4a3g_1_P
432 +4a3j_1_P
433 +7k00_1_5
434 +5mmi_1_Z
435 +3j9m_1_U
436 +6nu2_1_U
437 +6nu3_1_U
438 +5c0y_1_C
439 +6n6f_1_D
440 +4ohy_1_B
441 +4oi1_1_B
442 +4oi0_1_B
443 +6raz_1_Y
444 +5ipl_1_3
445 +6utw_1_333
446 +5ipm_1_3
447 +5ipn_1_3
448 +4ylo_1_3
449 +4yln_1_6
450 +4ylo_1_6
451 +4yln_1_3
452 +4yln_1_9
453 +5lzf_1_Y
454 +1n32_1_Z
455 +5zsl_1_D
456 +5zsd_1_C
457 +5zsd_1_D
458 +5zsl_1_E
459 +4nku_1_D
460 +4nku_1_H
461 +1cwp_1_E
462 +6qik_1_Y
463 +6rzz_1_Y
464 +6ri5_1_Y
465 +6qt0_1_Y
466 +6qtz_1_Y
467 +6t83_1_1B
468 +6t83_1_3B
469 +6t83_1_AA
470 +6t83_1_CA
471 +6s05_1_Y
472 +5jcs_1_X
473 +5fl8_1_X
474 +3erc_1_G
475 +6of1_1_1W
476 +6cae_1_1Y
477 +6o97_1_1W
478 +6of1_1_1Y
479 +6of1_1_2W
480 +6o97_1_1Y
481 +6nd6_1_1Y
482 +6cae_1_1W
483 +6of1_1_2Y
484 +6cae_1_2Y
485 +6nd6_1_1W
486 +6cae_1_2W
487 +6o97_1_2Y
488 +6nd6_1_2Y
489 +6o97_1_2W
490 +6nd6_1_2W
491 +6xz7_1_G
492 +6gz5_1_BW
493 +6gz3_1_BW
494 +1ls2_1_B
495 +3ep2_1_Y
496 +3eq3_1_Y
497 +4v48_1_A6
498 +2z9q_1_A
499 +4hot_1_X
500 +6d2z_1_C
501 +4tu0_1_F
502 +4tu0_1_G
503 +6r9o_1_B
504 +6is0_1_C
505 +5lzc_1_X
506 +5lzb_1_X
507 +5lzd_1_Y
508 +5lzc_1_Y
509 +5lzb_1_Y
510 +1gsg_1_T
511 +6zvi_1_D
512 +6sv4_1_NB
513 +6sv4_1_NC
514 +6i7o_1_NB
515 +5y88_1_X
516 +3j6x_1_IR
517 +3j6y_1_IR
518 +6tb3_1_N
519 +6tnu_1_N
520 +2uxb_1_X
521 +2x1f_1_B
522 +2x1a_1_B
523 +3eq3_1_D
524 +3ep2_1_D
525 +1eg0_1_M
526 +3eq4_1_D
527 +5o1y_1_B
528 +3jcr_1_H
529 +6dzi_1_H
530 +5zeu_1_A
531 +6mpi_1_W
532 +5mfx_1_B
533 +5w0m_1_J
534 +5bud_1_E
535 +5w0m_1_I
536 +5w0m_1_H
537 +4j7m_1_B
538 +5bud_1_D
539 +6a4e_1_B
540 +6a4e_1_D
541 +6hxx_1_AA
542 +6hxx_1_AB
543 +6hxx_1_AC
544 +6hxx_1_AD
545 +6hxx_1_AE
546 +6hxx_1_AF
547 +6hxx_1_AG
548 +6hxx_1_AH
549 +6hxx_1_AI
550 +6hxx_1_AJ
551 +6hxx_1_AK
552 +6hxx_1_AL
553 +6hxx_1_AM
554 +6hxx_1_AN
555 +6hxx_1_AO
556 +6hxx_1_AP
557 +6hxx_1_AQ
558 +6hxx_1_AR
559 +6hxx_1_AS
560 +6hxx_1_AT
561 +6hxx_1_AU
562 +6hxx_1_AV
563 +6hxx_1_AW
564 +6hxx_1_AX
565 +6hxx_1_AY
566 +6hxx_1_AZ
567 +6hxx_1_BA
568 +6hxx_1_BB
569 +6hxx_1_BC
570 +6hxx_1_BD
571 +6hxx_1_BE
572 +6hxx_1_BF
573 +6hxx_1_BG
574 +6hxx_1_BH
575 +6hxx_1_BI
576 +5odv_1_A
577 +5odv_1_B
578 +5odv_1_C
579 +5odv_1_D
580 +5odv_1_E
581 +5odv_1_F
582 +5odv_1_G
583 +5odv_1_H
584 +5odv_1_I
585 +5odv_1_J
586 +5odv_1_K
587 +5odv_1_L
588 +5odv_1_M
589 +5odv_1_N
590 +5odv_1_O
591 +5odv_1_P
592 +5odv_1_Q
593 +5odv_1_R
594 +5odv_1_S
595 +5odv_1_T
596 +5odv_1_U
597 +5odv_1_V
598 +5odv_1_W
599 +5odv_1_X
600 +6t34_1_A
601 +6t34_1_B
602 +6t34_1_C
603 +6t34_1_D
604 +6t34_1_E
605 +6t34_1_F
606 +6t34_1_G
607 +6t34_1_H
608 +6t34_1_I
609 +6t34_1_J
610 +6t34_1_K
611 +6t34_1_L
612 +6t34_1_M
613 +6t34_1_N
614 +6t34_1_O
615 +6t34_1_P
616 +6t34_1_Q
617 +6t34_1_R
618 +6t34_1_S
619 +6ip8_1_ZY
620 +6ip5_1_ZY
621 +6ip5_1_ZU
622 +6ip6_1_ZY
623 +6ip8_1_ZZ
624 +6ip6_1_ZZ
625 +6uu3_1_333
626 +6uu1_1_333
627 +1pn8_1_D
628 +3er8_1_H
629 +3er8_1_G
630 +3er8_1_F
631 +5o3j_1_B
632 +4dr7_1_B
633 +1i5l_1_Y
634 +1i5l_1_U
635 +4dr6_1_B
636 +6i2n_1_U
637 +4v68_1_A0
638 +6vyu_1_Y
639 +6vyw_1_Y
640 +6vz7_1_Y
641 +6vz5_1_Y
642 +6vz3_1_Y
643 +6vyy_1_Y
644 +6vyx_1_Y
645 +6vyz_1_Y
646 +6vz2_1_Y
647 +1mvr_1_1
648 +6vyt_1_Y
649 +1cgm_1_I
650 +3jb7_1_T
651 +3jb7_1_M
652 +3j0o_1_D
653 +3j0l_1_D
654 +3j0q_1_D
655 +3j0p_1_D
656 +5elt_1_F
657 +5elt_1_E
658 +2tmv_1_R
659 +5a79_1_R
660 +5a7a_1_R
661 +2om3_1_R
662 +2xea_1_R
663 +4wtl_1_T
664 +4wtl_1_P
665 +1xnq_1_W
666 +1x18_1_C
667 +1x18_1_B
668 +1x18_1_D
669 +1vq6_1_4
670 +4am3_1_D
671 +4am3_1_H
672 +4am3_1_I
673 +4lj0_1_C
674 +4lj0_1_D
675 +4lj0_1_E
676 +5lzy_1_HH
677 +4wtj_1_T
678 +4wtj_1_P
679 +4xbf_1_D
680 +6ow3_1_I
681 +6ovy_1_I
682 +6oy6_1_I
683 +6n6d_1_D
684 +6n6k_1_C
685 +6n6k_1_D
686 +3rtj_1_D
687 +1apg_1_D
688 +6ty9_1_M
689 +6tz1_1_N
690 +4bbl_1_Y
691 +4bbl_1_Z
692 +6sce_1_B
693 +6scf_1_I
694 +6scf_1_K
695 +6yud_1_K
696 +6yud_1_O
697 +6scf_1_M
698 +6yud_1_P
699 +6scf_1_L
700 +6yud_1_M
701 +6yud_1_Q
702 +6o6x_1_D
703 +4ba2_1_R
704 +6o6x_1_C
705 +6o7b_1_C
706 +6o6v_1_C
707 +6r7b_1_D
708 +6r9r_1_D
709 +6ov0_1_E
710 +6ov0_1_H
711 +6ov0_1_G
712 +6o6v_1_D
713 +6ov0_1_F
714 +6o7b_1_D
715 +5e02_1_C
716 +6r9r_1_E
717 +6r7b_1_E
718 +6o7i_1_I
719 +6o7h_1_K
720 +7jyy_1_F
721 +7jyy_1_E
722 +7jz0_1_F
723 +7jz0_1_E
724 +6rt6_1_A
725 +6rt6_1_E
726 +1y1y_1_P
727 +5zuu_1_I
728 +5zuu_1_G
729 +4peh_1_W
730 +4peh_1_V
731 +4peh_1_X
732 +4peh_1_Y
733 +4peh_1_Z
734 +6mkn_1_W
735 +4cxg_1_C
736 +4cxh_1_C
737 +1x1l_1_A
738 +1zc8_1_Z
739 +2ob7_1_D
740 +2ob7_1_A
741 +4eya_1_E
742 +4eya_1_F
743 +4eya_1_Q
744 +4eya_1_R
745 +2r1g_1_B
746 +4ht9_1_E
747 +1cvj_1_M
748 +6z1p_1_AB
749 +6z1p_1_AA
750 +4ii9_1_C
751 +5mq0_1_3
752 +5uk4_1_X
753 +5uk4_1_V
754 +5uk4_1_W
755 +5uk4_1_U
756 +5f6c_1_E
757 +4rcj_1_B
758 +1xnr_1_W
759 +6e0o_1_C
760 +6o75_1_D
761 +6o75_1_C
762 +6e0o_1_B
763 +3j06_1_R
764 +1r2x_1_C
765 +1r2w_1_C
766 +1eg0_1_L
767 +4eya_1_G
768 +4eya_1_H
769 +4eya_1_S
770 +4eya_1_T
771 +4dr4_1_V
772 +1ibl_1_Z
773 +1ibm_1_Z
774 +4dr5_1_V
775 +4d61_1_J
776 +1trj_1_B
777 +1trj_1_C
778 +6q8y_1_N
779 +6sv4_1_N
780 +6i7o_1_N
781 +5k8h_1_A
782 +5z4a_1_B
783 +3jbu_1_V
784 +1h2c_1_R
785 +1h2d_1_S
786 +1h2d_1_R
787 +6szs_1_X
788 +5mgp_1_X
789 +6enu_1_X
790 +6enf_1_X
791 +6enj_1_X
792 +1pvo_1_L
793 +1pvo_1_G
794 +1pvo_1_H
795 +1pvo_1_J
796 +1pvo_1_K
797 +2ht1_1_K
798 +2ht1_1_J
799 +6eri_1_AX
800 +1zc8_1_A
801 +1zc8_1_C
802 +1zc8_1_B
803 +1zc8_1_G
804 +1zc8_1_I
805 +1zc8_1_H
806 +1zc8_1_J
807 +4v8z_1_CX
808 +6kqe_1_I
809 +5uh8_1_I
810 +5vi5_1_Q
811 +4xln_1_T
812 +4xlr_1_T
813 +4xln_1_Q
814 +5i2d_1_K
815 +5i2d_1_V
816 +4xlr_1_Q
817 +6sty_1_C
818 +6sty_1_F
819 +2xs5_1_D
820 +3ok4_1_N
821 +3ok4_1_L
822 +3ok4_1_Z
823 +3ok4_1_4
824 +3ok4_1_V
825 +3ok4_1_X
826 +3ok4_1_P
827 +3ok4_1_H
828 +3ok4_1_J
829 +3ok4_1_R
830 +3ok4_1_T
831 +3ok4_1_2
832 +6n6h_1_D
833 +5wnt_1_B
834 +3b0u_1_B
835 +3b0u_1_A
836 +4x9e_1_G
837 +4x9e_1_H
838 +6z1p_1_BB
839 +6z1p_1_BA
840 +2uxd_1_X
841 +4qvd_1_H
842 +4v7e_1_AB
843 +3ol9_1_D
844 +3ol9_1_H
845 +3ol9_1_L
846 +3ol9_1_P
847 +3olb_1_L
848 +3olb_1_P
849 +3olb_1_D
850 +3olb_1_H
851 +3ol6_1_D
852 +3ol6_1_H
853 +3ol6_1_L
854 +3ol6_1_P
855 +3ol8_1_D
856 +3ol8_1_H
857 +3ol7_1_L
858 +3ol7_1_P
859 +3ol7_1_D
860 +3ol7_1_H
861 +3ol8_1_L
862 +3ol8_1_P
863 +1qzc_1_C
864 +1qzc_1_A
865 +6ole_1_V
866 +6om0_1_V
867 +6oli_1_V
868 +6om7_1_V
869 +6w6l_1_V
870 +6olf_1_V
871 +1mvr_1_D
872 +4wtm_1_T
873 +4wtm_1_P
874 +5x70_1_E
875 +5x70_1_G
876 +6gz5_1_BV
877 +6gz4_1_BV
878 +6gz3_1_BV
879 +6fti_1_Q
880 +4v7e_1_AE
881 +4v7e_1_AD
882 +4x62_1_B
883 +4x64_1_B
884 +4x65_1_B
885 +1xmq_1_W
886 +4x66_1_B
887 +3t1h_1_W
888 +3t1y_1_W
889 +1xmo_1_W
890 +4adx_1_9
891 +6kr6_1_B
892 +1zn1_1_B
893 +6z8k_1_X
894 +1cvj_1_Q
895 +4csf_1_U
896 +4csf_1_Q
897 +4csf_1_G
898 +4csf_1_M
899 +4csf_1_K
900 +4csf_1_A
901 +4csf_1_I
902 +4csf_1_S
903 +4csf_1_C
904 +4csf_1_W
905 +4csf_1_O
906 +4csf_1_E
907 +1cvj_1_N
908 +1cvj_1_O
909 +1cvj_1_S
910 +1cvj_1_P
911 +1cvj_1_T
912 +1cvj_1_R
913 +6th6_1_AA
914 +6skg_1_AA
915 +6skf_1_AA
916 +6q8y_1_M
917 +6i7o_1_M
918 +6zmw_1_W
919 +6ybv_1_W
920 +2fz2_1_D
921 +2xpj_1_D
922 +2vrt_1_H
923 +2vrt_1_G
924 +1emi_1_B
925 +6r9m_1_B
926 +4nia_1_C
927 +4nia_1_A
928 +4nia_1_H
929 +4nia_1_N
930 +4nia_1_G
931 +4nia_1_D
932 +4nia_1_B
933 +4nia_1_I
934 +4nia_1_E
935 +4nia_1_M
936 +4oq9_1_I
937 +4oq9_1_G
938 +4oq9_1_C
939 +4oq9_1_H
940 +4oq9_1_N
941 +4oq9_1_A
942 +4oq9_1_D
943 +4oq9_1_E
944 +4oq9_1_M
945 +4oq9_1_B
946 +5uhc_1_I
947 +1uvn_1_F
948 +1uvn_1_B
949 +1uvn_1_D
950 +3iy9_1_A
951 +4wtk_1_T
952 +4wtk_1_P
953 +1vqn_1_4
954 +4oav_1_C
955 +4oav_1_A
956 +3ep2_1_E
957 +3eq3_1_E
958 +3eq4_1_E
959 +3ep2_1_A
960 +3eq3_1_A
961 +3eq4_1_A
962 +3ep2_1_C
963 +3eq3_1_C
964 +3eq4_1_C
965 +3ep2_1_B
966 +3eq3_1_B
967 +3eq4_1_B
968 +4i67_1_B
969 +3pgw_1_R
970 +3pgw_1_N
971 +3cw1_1_X
972 +3cw1_1_W
973 +3cw1_1_V
974 +5it9_1_I
975 +6k32_1_T
976 +6k32_1_P
977 +5mmj_1_A
978 +5x8r_1_A
979 +3j2k_1_3
980 +3j2k_1_2
981 +3j2k_1_1
982 +3j2k_1_0
983 +3j2k_1_4
984 +3nvk_1_G
985 +3nvk_1_S
986 +2iy3_1_B
987 +1cwp_1_F
988 +5z4j_1_B
989 +5gmf_1_E
990 +5gmf_1_H
991 +6e4p_1_J
992 +5gmf_1_F
993 +5gmf_1_G
994 +5gmg_1_D
995 +5gmg_1_C
996 +6e4p_1_K
997 +3ie1_1_E
998 +3ie1_1_H
999 +3ie1_1_F
1000 +4dr7_1_V
1001 +3ie1_1_G
1002 +3s4g_1_C
1003 +3s4g_1_B
1004 +2qqp_1_R
1005 +2zde_1_E
1006 +2zde_1_F
1007 +2zde_1_H
1008 +2zde_1_G
1009 +1nb7_1_E
1010 +1nb7_1_F
1011 +4hos_1_X
1012 +3p6y_1_T
1013 +3p6y_1_V
1014 +3p6y_1_U
1015 +3p6y_1_Q
1016 +3p6y_1_W
1017 +5dto_1_B
1018 +4cxh_1_X
1019 +1uvj_1_F
1020 +1uvj_1_D
1021 +1uvj_1_E
1022 +6kqd_1_I
1023 +6kqd_1_S
1024 +5uh5_1_I
1025 +1ytu_1_F
1026 +1ytu_1_D
1027 +4kzz_1_J
1028 +5t2c_1_AN
1029 +4v5z_1_BF
1030 +3j6b_1_E
1031 +4v4f_1_B6
1032 +4v4f_1_A5
1033 +4v4f_1_A3
1034 +4v4f_1_B0
1035 +4v4f_1_B9
1036 +4v4f_1_A2
1037 +4v4f_1_A8
1038 +4v4f_1_A1
1039 +4v4f_1_A9
1040 +4v4f_1_BZ
1041 +4v4f_1_B8
1042 +4v4f_1_B7
1043 +4v4f_1_B5
1044 +4v4f_1_A0
1045 +4v4f_1_A7
1046 +4v4f_1_A4
1047 +4v4f_1_AZ
1048 +4v4f_1_B3
1049 +4v4f_1_B1
1050 +4v4f_1_B4
1051 +4v4f_1_A6
1052 +4v4f_1_B2
1053 +5flx_1_Z
1054 +5zsb_1_C
1055 +5zsb_1_D
1056 +5zsn_1_D
1057 +5zsn_1_E
1058 +3jcr_1_N
1059 +6gfw_1_R
1060 +2vaz_1_A
1061 +1qzc_1_B
1062 +1mvr_1_C
1063 +4v5z_1_BP
1064 +6n6e_1_D
1065 +4g7o_1_I
1066 +4g7o_1_S
1067 +5x22_1_S
1068 +5x22_1_I
1069 +5x21_1_I
1070 +5uh6_1_I
1071 +6l74_1_I
1072 +5uh9_1_I
1073 +2ftc_1_R
1074 +6sag_1_R
1075 +4udv_1_R
1076 +2r1g_1_E
1077 +5zsc_1_D
1078 +5zsc_1_C
1079 +6woy_1_I
1080 +6wox_1_I
1081 +6evj_1_N
1082 +6evj_1_M
1083 +4gkk_1_W
1084 +4v9e_1_AG
1085 +4v9e_1_BM
1086 +4v9e_1_AM
1087 +4v9e_1_AA
1088 +4v9e_1_BA
1089 +4v9e_1_BG
1090 +5lzs_1_II
1091 +6fqr_1_C
1092 +6ha1_1_X
1093 +5kcr_1_1X
1094 +2r1g_1_X
1095 +3m7n_1_Z
1096 +3m85_1_X
1097 +3m85_1_Z
1098 +3m85_1_Y
1099 +1e8s_1_C
1100 +5wnp_1_B
1101 +5wnv_1_B
1102 +5yts_1_B
1103 +1utd_1_6
1104 +1utd_1_Z
1105 +1utd_1_4
1106 +1utd_1_7
1107 +1utd_1_9
1108 +1utd_1_5
1109 +1utd_1_3
1110 +1utd_1_2
1111 +1utd_1_8
1112 +1utd_1_1
1113 +6n6i_1_C
1114 +6n6i_1_D
1115 +6n6a_1_D
1116 +6ij2_1_F
1117 +6ij2_1_G
1118 +6ij2_1_H
1119 +6ij2_1_E
1120 +3u2e_1_D
1121 +3u2e_1_C
1122 +5uef_1_C
1123 +5uef_1_D
1124 +4x4u_1_H
1125 +4afy_1_D
1126 +6oy5_1_I
1127 +6owl_1_B
1128 +6owl_1_C
1129 +4afy_1_C
1130 +4lq3_1_R
1131 +6s0m_1_C
1132 +6gx6_1_B
1133 +4k4s_1_D
1134 +4k4s_1_H
1135 +4k4t_1_H
1136 +4k4t_1_D
1137 +1zn1_1_C
1138 +1zn0_1_C
1139 +1xpu_1_G
1140 +1xpu_1_L
1141 +1xpr_1_L
1142 +1xpu_1_H
1143 +1xpo_1_K
1144 +1xpo_1_J
1145 +1xpu_1_J
1146 +1xpo_1_H
1147 +1xpr_1_J
1148 +1xpu_1_K
1149 +1xpr_1_K
1150 +1xpo_1_M
1151 +1xpo_1_L
1152 +1xpu_1_M
1153 +1xpr_1_M
1154 +1xpo_1_G
1155 +1xpr_1_H
1156 +1xpr_1_G
1157 +6gc5_1_F
1158 +6gc5_1_H
1159 +6gc5_1_G
1160 +4v7e_1_AA
1161 +4v7e_1_AC
1162 +1n1h_1_B
1163 +4ohz_1_B
1164 +6t83_1_6B
1165 +4gv6_1_C
1166 +4gv6_1_B
1167 +4gv3_1_C
1168 +4gv3_1_B
1169 +4gv9_1_E
1170 +6i7o_1_L
1171 +2a8v_1_D
1172 +6qx3_1_G
1173 +2xnr_1_C
1174 +4gkj_1_W
1175 +4v5z_1_BC
1176 +4v5z_1_BB
1177 +4v5z_1_BH
1178 +3j0o_1_F
1179 +3j0l_1_F
1180 +3j0p_1_F
1181 +3j0q_1_F
1182 +3j0o_1_B
1183 +3j0l_1_B
1184 +3j0o_1_C
1185 +3j0l_1_C
1186 +3j0q_1_C
1187 +3j0p_1_C
1188 +3j0o_1_A
1189 +3j0l_1_A
1190 +3j0q_1_A
1191 +3j0p_1_A
1192 +1cwp_1_D
1193 +4v5z_1_BJ
1194 +5sze_1_C
1195 +6wre_1_D
1196 +6i0u_1_B
1197 +5zsa_1_C
1198 +5zsa_1_D
1199 +1n34_1_Z
1200 +3pf5_1_S
1201 +6ppn_1_A
1202 +6ppn_1_I
1203 +6qdw_1_V
1204 +5hk0_1_F
1205 +4qm6_1_D
1206 +4qm6_1_C
1207 +4jzu_1_C
1208 +4jzv_1_C
1209 +5ytv_1_B
1210 +4k4z_1_P
1211 +4k4z_1_D
1212 +4k4x_1_L
1213 +4k4z_1_L
1214 +4k4x_1_D
1215 +4k4z_1_H
1216 +4k4x_1_H
1217 +4k4x_1_P
1218 +1t1m_1_A
1219 +1t1m_1_B
1220 +4a3b_1_P
1221 +4a3m_1_P
1222 +6u6y_1_E
1223 +6u6y_1_G
1224 +6u6y_1_F
1225 +6u6y_1_H
1226 +6qik_1_X
1227 +6rzz_1_X
1228 +6ri5_1_X
1229 +6qt0_1_X
1230 +6qtz_1_X
1231 +6s05_1_X
1232 +6t83_1_BB
1233 +6t83_1_4B
1234 +5fl8_1_Z
1235 +5jcs_1_Z
1236 +5mrc_1_BB
1237 +5mre_1_BB
1238 +5mrf_1_BB
1239 +6gz4_1_BW
1240 +3j46_1_P
1241 +3jcr_1_M
1242 +4e6b_1_A
1243 +4e6b_1_B
1244 +6a6l_1_D
1245 +4v5z_1_BS
1246 +4v8t_1_1
1247 +1uvi_1_D
1248 +1uvi_1_F
1249 +1uvi_1_E
1250 +4m7d_1_P
1251 +4k4u_1_D
1252 +4k4u_1_H
1253 +6rt7_1_E
1254 +6rt7_1_A
1255 +2voo_1_C
1256 +2voo_1_D
1257 +5k78_1_X
1258 +5k78_1_Y
1259 +4ylo_1_9
1260 +4kzy_1_I
1261 +4kzz_1_I
1262 +4kzx_1_I
1263 +5vyc_1_I2
1264 +5vyc_1_I3
1265 +5vyc_1_I5
1266 +5vyc_1_I1
1267 +5vyc_1_I6
1268 +5vyc_1_I4
1269 +6ip8_1_2M
1270 +6ip5_1_2M
1271 +6ip6_1_2M
1272 +6qcs_1_M
1273 +486d_1_G
1274 +2r1g_1_C
1275 +486d_1_F
1276 +4v5z_1_B0
1277 +4nia_1_O
1278 +4nia_1_J
1279 +4nia_1_K
1280 +4nia_1_L
1281 +4nia_1_F
1282 +4oq9_1_K
1283 +4oq9_1_O
1284 +4oq9_1_J
1285 +4oq9_1_F
1286 +4oq9_1_L
1287 +5tbw_1_SR
1288 +6hhq_1_SR
1289 +6zvi_1_H
1290 +6sv4_1_2B
1291 +6sv4_1_2C
1292 +6t83_1_2B
1293 +6t83_1_A
1294 +6i7o_1_2B
1295 +6r9q_1_B
1296 +6v3a_1_SN1
1297 +6v3b_1_SN1
1298 +6v39_1_SN1
1299 +6v3e_1_SN1
1300 +1pn7_1_C
1301 +1mj1_1_Q
1302 +1mj1_1_R
1303 +4dr6_1_V
1304 +6kql_1_I
1305 +4eya_1_M
1306 +4eya_1_N
1307 +4eya_1_A
1308 +4eya_1_B
1309 +2wj8_1_D
1310 +2wj8_1_I
1311 +2wj8_1_L
1312 +2wj8_1_F
1313 +2wj8_1_C
1314 +2wj8_1_Q
1315 +2wj8_1_J
1316 +2wj8_1_P
1317 +2wj8_1_K
1318 +2wj8_1_E
1319 +2wj8_1_T
1320 +2wj8_1_B
1321 +2wj8_1_O
1322 +2wj8_1_N
1323 +2wj8_1_A
1324 +2wj8_1_H
1325 +2wj8_1_R
1326 +2wj8_1_M
1327 +2wj8_1_S
1328 +2wj8_1_G
1329 +4e6b_1_E
1330 +4e6b_1_F
1331 +6p71_1_I
1332 +3pdm_1_R
1333 +5det_1_P
1334 +5els_1_I
1335 +4n2s_1_B
1336 +4yoe_1_E
1337 +3j0o_1_H
1338 +3j0l_1_H
1339 +3j0p_1_H
1340 +3j0q_1_H
1341 +5gxi_1_B
1342 +3iy8_1_A
1343 +6tnu_1_M
1344 +5mc6_1_M
1345 +5mc6_1_N
1346 +4eya_1_O
1347 +4eya_1_P
1348 +4eya_1_C
1349 +4eya_1_D
1350 +6htq_1_V
1351 +6htq_1_W
1352 +6htq_1_U
1353 +6uu6_1_333
1354 +6v3a_1_V
1355 +6v39_1_V
1356 +5a0v_1_F
1357 +3avt_1_T
1358 +6d1v_1_C
1359 +4s2x_1_B
1360 +4s2y_1_B
1361 +5wnu_1_B
1362 +1zc8_1_F
1363 +1vtm_1_R
1364 +4v5z_1_BA
1365 +4v5z_1_BE
1366 +4v5z_1_BD
1367 +4v5z_1_BG
1368 +4v5z_1_BI
1369 +4v5z_1_BK
1370 +4v5z_1_BM
1371 +4v5z_1_BL
1372 +4v5z_1_BV
1373 +4v5z_1_BO
1374 +4v5z_1_BN
1375 +4v5z_1_BQ
1376 +4v5z_1_BR
1377 +4v5z_1_BT
1378 +4v5z_1_BU
1379 +4v5z_1_BW
1380 +4v5z_1_BY
1381 +4v5z_1_BX
1382 +4v5z_1_BZ
1383 +6u9x_1_H
1384 +6u9x_1_K
1385 +5elk_1_R
1386 +6okk_1_G
1387 +4cxg_1_A
1388 +4cxh_1_A
1389 +6bk8_1_I
1390 +4cxg_1_B
1391 +4cxh_1_B
1392 +4v5z_1_B1
1393 +5z4d_1_B
1394 +6o78_1_E
1395 +6ha8_1_X
1396 +1m8w_1_E
1397 +1m8w_1_F
1398 +5udi_1_B
1399 +5udl_1_B
1400 +5udk_1_B
1401 +5udj_1_B
1402 +5w5i_1_B
1403 +5w5i_1_D
1404 +5w5h_1_B
1405 +5w5h_1_D
1406 +4eya_1_K
1407 +4eya_1_L
1408 +4eya_1_I
1409 +4eya_1_J
1410 +4g9z_1_E
1411 +4g9z_1_F
1412 +3nma_1_B
1413 +3nma_1_C
1414 +6een_1_G
1415 +6een_1_I
1416 +6een_1_H
1417 +4wti_1_T
1418 +4wti_1_P
1419 +5l3p_1_Y
1420 +4hor_1_X
1421 +3rzo_1_R
1422 +2f4v_1_Z
1423 +1qln_1_R
1424 +2xs7_1_B
1425 +6zvi_1_E
1426 +6sv4_1_MC
1427 +6sv4_1_MB
1428 +6i7o_1_MB
1429 +6ogy_1_M
1430 +6ogy_1_N
1431 +6uej_1_B
1432 +1x18_1_A
1433 +5ytx_1_B
1434 +6o8w_1_U
1435 +4g0a_1_H
1436 +6r9p_1_B
1437 +3koa_1_C
1438 +4n48_1_D
1439 +4n48_1_G
1440 +6kug_1_B
1441 +6ktc_1_V
1442 +6ole_1_U
1443 +6om0_1_U
1444 +6olg_1_BV
1445 +6oli_1_U
1446 +6om7_1_U
1447 +6w6l_1_U
1448 +6olz_1_BV
1449 +6olf_1_U
1450 +5lzd_1_X
1451 +6m7k_1_B
1452 +3cd6_1_4
1453 +3cma_1_5
1454 +6n9e_1_2W
1455 +1vqo_1_4
1456 +1qvg_1_3
1457 +3cme_1_5
1458 +5lzd_1_W
1459 +5lze_1_W
1460 +5lzc_1_W
1461 +5lzb_1_W
1462 +3wzi_1_C
1463 +1mvr_1_E
1464 +1mvr_1_B
1465 +1mvr_1_A
1466 +4adx_1_0
1467 +4adx_1_8
1468 +1n33_1_Z
1469 +6dti_1_W
1470 +3d2s_1_F
1471 +3d2s_1_H
1472 +5mrc_1_AA
1473 +5mre_1_AA
1474 +5mrf_1_AA
1475 +5fl8_1_Y
1476 +5jcs_1_Y
1477 +2r1g_1_A
1478 +2r1g_1_D
1479 +2r1g_1_F
1480 +3eq4_1_Y
1481 +4wkr_1_C
1482 +4v99_1_EC
1483 +4v99_1_AC
1484 +4v99_1_BH
1485 +4v99_1_CH
1486 +4v99_1_AM
1487 +4v99_1_DC
1488 +4v99_1_JW
1489 +4v99_1_EH
1490 +4v99_1_BW
1491 +4v99_1_FW
1492 +4v99_1_AW
1493 +4v99_1_BC
1494 +4v99_1_BM
1495 +4v99_1_IC
1496 +4v99_1_EM
1497 +4v99_1_ER
1498 +4v99_1_IW
1499 +4v99_1_JH
1500 +4v99_1_JR
1501 +4v99_1_AH
1502 +4v99_1_GR
1503 +4v99_1_IR
1504 +4v99_1_BR
1505 +4v99_1_CW
1506 +4v99_1_HR
1507 +4v99_1_FH
1508 +4v99_1_HC
1509 +4v99_1_DW
1510 +4v99_1_GC
1511 +4v99_1_JC
1512 +4v99_1_DM
1513 +4v99_1_EW
1514 +4v99_1_AR
1515 +4v99_1_CR
1516 +4v99_1_JM
1517 +4v99_1_CC
1518 +4v99_1_IH
1519 +4v99_1_FR
1520 +4v99_1_CM
1521 +4v99_1_IM
1522 +4v99_1_FM
1523 +4v99_1_FC
1524 +4v99_1_GH
1525 +4v99_1_HM
1526 +4v99_1_HH
1527 +4v99_1_DR
1528 +4v99_1_HW
1529 +4v99_1_GW
1530 +4v99_1_DH
1531 +4v99_1_GM
1532 +6rt4_1_D
1533 +6rt4_1_C
1534 +6zvh_1_X
1535 +4dwa_1_D
1536 +6n6c_1_D
1537 +6n6j_1_C
1538 +6n6j_1_D
1539 +6p7q_1_E
1540 +6p7q_1_F
1541 +6p7q_1_D
1542 +6rcl_1_C
1543 +5jju_1_C
1544 +4ejt_1_G
1545 +5ceu_1_C
1546 +5ceu_1_D
1547 +6lkq_1_W
1548 +3qsu_1_P
1549 +3qsu_1_R
1550 +1n38_1_B
1551 +4qvc_1_G
1552 +6q1h_1_D
1553 +6q1h_1_H
1554 +6p7p_1_F
1555 +6p7p_1_E
1556 +6p7p_1_D
1557 +6vm6_1_J
1558 +6vm6_1_G
1559 +6wan_1_K
1560 +6wan_1_H
1561 +6wan_1_G
1562 +6wan_1_L
1563 +6wan_1_I
1564 +6ywo_1_F
1565 +6wan_1_J
1566 +4oau_1_A
1567 +6ywo_1_E
1568 +6ywo_1_K
1569 +6vm6_1_I
1570 +6vm6_1_H
1571 +6ywo_1_I
1572 +2a1r_1_C
1573 +2a1r_1_D
1574 +3gpq_1_E
1575 +3gpq_1_F
1576 +6o79_1_C
1577 +6vm6_1_K
1578 +6hyu_1_D
1579 +1laj_1_R
1580 +6ybv_1_K
1581 +6mpf_1_W
1582 +6spc_1_A
1583 +6spe_1_A
1584 +6fti_1_V
1585 +6ftj_1_V
1586 +6ftg_1_V
1587 +4g0a_1_G
1588 +4g0a_1_F
1589 +4g0a_1_E
1590 +2b2d_1_S
1591 +5hkc_1_C
1592 +1rmv_1_B
1593 +4qu7_1_X
1594 +4qu7_1_V
1595 +4qu7_1_U
1596 +4v5z_1_AH
1597 +4v5z_1_AA
1598 +4v5z_1_AB
1599 +4v5z_1_AC
1600 +4v5z_1_AD
1601 +4v5z_1_AE
1602 +4v5z_1_AF
1603 +4v5z_1_AG
1604 +6pmi_1_3
1605 +6pmj_1_3
1606 +5hjz_1_C
......
This diff could not be displayed because it is too large.
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
11 # - Use a specialised database (SILVA) : better alignments (we guess?), but two kind of jobs 11 # - Use a specialised database (SILVA) : better alignments (we guess?), but two kind of jobs
12 # - Use cmalign --small everywhere (homogeneity) 12 # - Use cmalign --small everywhere (homogeneity)
13 # Moreover, --small requires --nonbanded --cyk, which means the output alignement is the optimally scored one. 13 # Moreover, --small requires --nonbanded --cyk, which means the output alignement is the optimally scored one.
14 -# To date, we trust Infernal as the best tool to realign RNA. Is it ? 14 +# To date, we trust Infernal as the best tool to realign ncRNA. Is it ?
15 15
16 # Contact: louis.becquey@univ-evry.fr (PhD student), fariza.tahi@univ-evry.fr (PI) 16 # Contact: louis.becquey@univ-evry.fr (PhD student), fariza.tahi@univ-evry.fr (PI)
17 17
...@@ -28,7 +28,7 @@ pd.set_option('display.max_rows', None) ...@@ -28,7 +28,7 @@ pd.set_option('display.max_rows', None)
28 LSU_set = ["RF00002", "RF02540", "RF02541", "RF02543", "RF02546"] # From Rfam CLAN 00112 28 LSU_set = ["RF00002", "RF02540", "RF02541", "RF02543", "RF02546"] # From Rfam CLAN 00112
29 SSU_set = ["RF00177", "RF02542", "RF02545", "RF01959", "RF01960"] # From Rfam CLAN 00111 29 SSU_set = ["RF00177", "RF02542", "RF02545", "RF01959", "RF01960"] # From Rfam CLAN 00111
30 30
31 -with sqlite3.connect("results/RNANet.db") as conn: 31 +with sqlite3.connect(os.getcwd()+"/results/RNANet.db") as conn:
32 df = pd.read_sql("SELECT rfam_acc, max_len, nb_total_homol, comput_time, comput_peak_mem FROM family;", conn) 32 df = pd.read_sql("SELECT rfam_acc, max_len, nb_total_homol, comput_time, comput_peak_mem FROM family;", conn)
33 33
34 to_remove = [ f for f in df.rfam_acc if f in LSU_set+SSU_set ] 34 to_remove = [ f for f in df.rfam_acc if f in LSU_set+SSU_set ]
...@@ -74,7 +74,7 @@ ax.set_ylabel("Maximum length of sequences ") ...@@ -74,7 +74,7 @@ ax.set_ylabel("Maximum length of sequences ")
74 ax.set_zlabel("Computation time (s)") 74 ax.set_zlabel("Computation time (s)")
75 75
76 plt.subplots_adjust(wspace=0.4) 76 plt.subplots_adjust(wspace=0.4)
77 -plt.savefig("results/cmalign_jobs_performance.png") 77 +plt.savefig(os.getcwd()+"/results/cmalign_jobs_performance.png")
78 78
79 # # ======================================================== 79 # # ========================================================
80 # # Linear Regression of max_mem as function of max_length 80 # # Linear Regression of max_mem as function of max_length
......
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
3 # This file computes additional statistics over the produced dataset. 3 # This file computes additional statistics over the produced dataset.
4 # Run this file if you want the base counts, pair-type counts, identity percents, etc 4 # Run this file if you want the base counts, pair-type counts, identity percents, etc
5 # in the database. 5 # in the database.
6 -# This should be run from the folder where the file is (to access the database with path "results/RNANet.db")
7 6
8 import getopt, os, pickle, sqlite3, shlex, subprocess, sys 7 import getopt, os, pickle, sqlite3, shlex, subprocess, sys
9 import numpy as np 8 import numpy as np
...@@ -22,34 +21,35 @@ from multiprocessing import Pool, Manager ...@@ -22,34 +21,35 @@ from multiprocessing import Pool, Manager
22 from os import path 21 from os import path
23 from tqdm import tqdm 22 from tqdm import tqdm
24 from collections import Counter 23 from collections import Counter
25 -from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker 24 +from setproctitle import setproctitle
25 +from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker, trace_unhandled_exceptions
26 26
27 path_to_3D_data = "tobedefinedbyoptions" 27 path_to_3D_data = "tobedefinedbyoptions"
28 path_to_seq_data = "tobedefinedbyoptions" 28 path_to_seq_data = "tobedefinedbyoptions"
29 +runDir = os.getcwd()
29 res_thr = 20.0 # default: all structures 30 res_thr = 20.0 # default: all structures
30 31
31 LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112 32 LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112
32 SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111 33 SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111
33 34
34 -def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0): 35 +@trace_unhandled_exceptions
36 +def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=2.0):
35 """ 37 """
36 Plot the joint distribution of pseudotorsion angles, in a Ramachandran-style graph. 38 Plot the joint distribution of pseudotorsion angles, in a Ramachandran-style graph.
37 - See Wadley & Pyle (2007) 39 + See Wadley & Pyle (2007).
40 + Only unique unmapped chains with resolution < res argument are considered.
38 41
39 Arguments: 42 Arguments:
40 - show: True or False, call plt.show() at this end or not
41 - filter_helical: None, "form", "zone", or "both"
42 - None: do not remove helical nucleotide
43 - "form": remove nucleotides if they belong to a A, B or Z form stem
44 - "zone": remove nucleotides falling in an arbitrary zone (see zone argument)
45 - "both": remove nucleotides fulfilling one or both of the above conditions
46 carbon: 1 or 4, use C4' (eta and theta) or C1' (eta_prime and theta_prime) 43 carbon: 1 or 4, use C4' (eta and theta) or C1' (eta_prime and theta_prime)
44 + show: True or False, call plt.show() at the end or not
47 sd_range: tuple, set values below avg + sd_range[0] * stdev to 0, 45 sd_range: tuple, set values below avg + sd_range[0] * stdev to 0,
48 and values above avg + sd_range[1] * stdev to avg + sd_range[1] * stdev. 46 and values above avg + sd_range[1] * stdev to avg + sd_range[1] * stdev.
49 This removes noise and cuts too high peaks, to clearly see the clusters. 47 This removes noise and cuts too high peaks, to clearly see the clusters.
48 + res: Maximal resolution value of a structure (i.e. its minimal quality) for its
49 + nucleotides to be considered.
50 """ 50 """
51 51
52 - os.makedirs("results/figures/wadley_plots/", exist_ok=True) 52 + os.makedirs(runDir + "/results/figures/wadley_plots/", exist_ok=True)
53 53
54 if carbon == 4: 54 if carbon == 4:
55 angle = "eta" 55 angle = "eta"
...@@ -63,30 +63,32 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0): ...@@ -63,30 +63,32 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
63 exit("You overestimate my capabilities !") 63 exit("You overestimate my capabilities !")
64 64
65 65
66 - if not path.isfile(f"data/wadley_kernel_{angle}_{res}A.npz"): 66 + if not path.isfile(runDir + f"/data/wadley_kernel_{angle}_{res}A.npz"):
67 67
68 # Get a worker number to position the progress bar 68 # Get a worker number to position the progress bar
69 global idxQueue 69 global idxQueue
70 thr_idx = idxQueue.get() 70 thr_idx = idxQueue.get()
71 + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} reproduce_wadley_results(carbon={carbon})")
72 +
71 pbar = tqdm(total=2, desc=f"Worker {thr_idx+1}: eta/theta C{carbon} kernels", position=thr_idx+1, leave=False) 73 pbar = tqdm(total=2, desc=f"Worker {thr_idx+1}: eta/theta C{carbon} kernels", position=thr_idx+1, leave=False)
72 74
73 # Extract the angle values of c2'-endo and c3'-endo nucleotides 75 # Extract the angle values of c2'-endo and c3'-endo nucleotides
74 - with sqlite3.connect("results/RNANet.db") as conn: 76 + with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
75 df = pd.read_sql(f"""SELECT {angle}, th{angle} 77 df = pd.read_sql(f"""SELECT {angle}, th{angle}
76 - FROM nucleotide JOIN ( 78 + FROM (
77 - SELECT chain_id FROM chain JOIN structure 79 + SELECT chain_id FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
78 - WHERE structure.resolution <= {res} 80 + WHERE chain.rfam_acc = 'unmappd' AND structure.resolution <= {res} AND issue = 0
79 - ) AS c 81 + ) AS c NATURAL JOIN nucleotide
80 WHERE puckering="C2'-endo" 82 WHERE puckering="C2'-endo"
81 AND {angle} IS NOT NULL 83 AND {angle} IS NOT NULL
82 AND th{angle} IS NOT NULL;""", conn) 84 AND th{angle} IS NOT NULL;""", conn)
83 c2_endo_etas = df[angle].values.tolist() 85 c2_endo_etas = df[angle].values.tolist()
84 c2_endo_thetas = df["th"+angle].values.tolist() 86 c2_endo_thetas = df["th"+angle].values.tolist()
85 df = pd.read_sql(f"""SELECT {angle}, th{angle} 87 df = pd.read_sql(f"""SELECT {angle}, th{angle}
86 - FROM nucleotide JOIN ( 88 + FROM (
87 - SELECT chain_id FROM chain JOIN structure 89 + SELECT chain_id FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
88 - WHERE structure.resolution <= {res} 90 + WHERE chain.rfam_acc = 'unmappd' AND structure.resolution <= {res} AND issue = 0
89 - ) AS c 91 + ) AS c NATURAL JOIN nucleotide
90 WHERE form = '.' 92 WHERE form = '.'
91 AND puckering="C3'-endo" 93 AND puckering="C3'-endo"
92 AND {angle} IS NOT NULL 94 AND {angle} IS NOT NULL
...@@ -111,14 +113,16 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0): ...@@ -111,14 +113,16 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
111 pbar.update(1) 113 pbar.update(1)
112 114
113 # Save the data to an archive for later use without the need to recompute 115 # Save the data to an archive for later use without the need to recompute
114 - np.savez(f"data/wadley_kernel_{angle}_{res}A.npz", 116 + np.savez(runDir + f"/data/wadley_kernel_{angle}_{res}A.npz",
115 c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas, 117 c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas,
116 c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas, 118 c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas,
117 kernel_c3=f_c3, kernel_c2=f_c2) 119 kernel_c3=f_c3, kernel_c2=f_c2)
118 pbar.close() 120 pbar.close()
119 idxQueue.put(thr_idx) 121 idxQueue.put(thr_idx)
120 else: 122 else:
121 - f = np.load(f"data/wadley_kernel_{angle}_{res}A.npz") 123 + setproctitle(f"RNANet statistics.py reproduce_wadley_results(carbon={carbon})")
124 +
125 + f = np.load(runDir + f"/data/wadley_kernel_{angle}_{res}A.npz")
122 c2_endo_etas = f["c2_endo_e"] 126 c2_endo_etas = f["c2_endo_e"]
123 c3_endo_etas = f["c3_endo_e"] 127 c3_endo_etas = f["c3_endo_e"]
124 c2_endo_thetas = f["c2_endo_t"] 128 c2_endo_thetas = f["c2_endo_t"]
...@@ -148,7 +152,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0): ...@@ -148,7 +152,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
148 f_low_thr = f.mean() + sd_range[0]*f.std() 152 f_low_thr = f.mean() + sd_range[0]*f.std()
149 f_cut = np.where(f > f_sup_thr, f_sup_thr, f) 153 f_cut = np.where(f > f_sup_thr, f_sup_thr, f)
150 f_cut = np.where(f_cut < f_low_thr, 0, f_cut) 154 f_cut = np.where(f_cut < f_low_thr, 0, f_cut)
151 - levels = [f.mean()+f.std(), f.mean()+2*f.std(), f.mean()+4*f.std()] 155 + levels = [ f.mean()+f.std(), f.mean()+2*f.std(), f.mean()+4*f.std()]
152 156
153 # histogram: 157 # histogram:
154 fig = plt.figure() 158 fig = plt.figure()
...@@ -157,7 +161,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0): ...@@ -157,7 +161,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
157 ax.bar3d(xpos.ravel(), ypos.ravel(), 0.0, 0.09, 0.09, hist_cut.ravel(), color=color_values, zorder="max") 161 ax.bar3d(xpos.ravel(), ypos.ravel(), 0.0, 0.09, 0.09, hist_cut.ravel(), color=color_values, zorder="max")
158 ax.set_xlabel(xlabel) 162 ax.set_xlabel(xlabel)
159 ax.set_ylabel(ylabel) 163 ax.set_ylabel(ylabel)
160 - fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}_{res}A.png") 164 + fig.savefig(runDir + f"/results/figures/wadley_plots/wadley_hist_{angle}_{l}_{res}A.png")
161 if show: 165 if show:
162 fig.show() 166 fig.show()
163 plt.close() 167 plt.close()
...@@ -168,7 +172,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0): ...@@ -168,7 +172,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
168 ax.plot_surface(xx, yy, f_cut, cmap=cm.get_cmap("coolwarm"), linewidth=0, antialiased=True) 172 ax.plot_surface(xx, yy, f_cut, cmap=cm.get_cmap("coolwarm"), linewidth=0, antialiased=True)
169 ax.set_xlabel(xlabel) 173 ax.set_xlabel(xlabel)
170 ax.set_ylabel(ylabel) 174 ax.set_ylabel(ylabel)
171 - fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}_{res}A.png") 175 + fig.savefig(runDir + f"/results/figures/wadley_plots/wadley_distrib_{angle}_{l}_{res}A.png")
172 if show: 176 if show:
173 fig.show() 177 fig.show()
174 plt.close() 178 plt.close()
...@@ -177,10 +181,10 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0): ...@@ -177,10 +181,10 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
177 fig = plt.figure(figsize=(5,5)) 181 fig = plt.figure(figsize=(5,5))
178 ax = fig.gca() 182 ax = fig.gca()
179 ax.scatter(x, y, s=1, alpha=0.1) 183 ax.scatter(x, y, s=1, alpha=0.1)
180 - ax.contourf(xx, yy, f_cut, alpha=0.5, cmap=cm.get_cmap("coolwarm"), levels=levels, extend="max") 184 + ax.contourf(xx, yy, f, alpha=0.5, cmap=cm.get_cmap("coolwarm"), levels=levels, extend="max")
181 ax.set_xlabel(xlabel) 185 ax.set_xlabel(xlabel)
182 ax.set_ylabel(ylabel) 186 ax.set_ylabel(ylabel)
183 - fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}_{res}A.png") 187 + fig.savefig(runDir + f"/results/figures/wadley_plots/wadley_{angle}_{l}_{res}A.png")
184 if show: 188 if show:
185 fig.show() 189 fig.show()
186 plt.close() 190 plt.close()
...@@ -188,10 +192,13 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0): ...@@ -188,10 +192,13 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
188 192
189 def stats_len(): 193 def stats_len():
190 """Plots statistics on chain lengths in RNA families. 194 """Plots statistics on chain lengths in RNA families.
195 + Uses all chains mapped to a family including copies, inferred or not.
191 196
192 REQUIRES tables chain, nucleotide up to date. 197 REQUIRES tables chain, nucleotide up to date.
193 """ 198 """
194 199
200 + setproctitle(f"RNANet statistics.py stats_len({res_thr})")
201 +
195 # Get a worker number to position the progress bar 202 # Get a worker number to position the progress bar
196 global idxQueue 203 global idxQueue
197 thr_idx = idxQueue.get() 204 thr_idx = idxQueue.get()
...@@ -214,7 +221,7 @@ def stats_len(): ...@@ -214,7 +221,7 @@ def stats_len():
214 cols = [] 221 cols = []
215 lengths = [] 222 lengths = []
216 223
217 - for i,f in enumerate(tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Average chain lengths", leave=False)): 224 + for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Average chain lengths", leave=False):
218 225
219 # Define a color for that family in the plot 226 # Define a color for that family in the plot
220 if f in LSU_set: 227 if f in LSU_set:
...@@ -229,7 +236,7 @@ def stats_len(): ...@@ -229,7 +236,7 @@ def stats_len():
229 cols.append("grey") 236 cols.append("grey")
230 237
231 # Get the lengths of chains 238 # Get the lengths of chains
232 - with sqlite3.connect("results/RNANet.db") as conn: 239 + with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
233 l = [ x[0] for x in sql_ask_database(conn, f"""SELECT COUNT(index_chain) 240 l = [ x[0] for x in sql_ask_database(conn, f"""SELECT COUNT(index_chain)
234 FROM ( 241 FROM (
235 SELECT chain_id 242 SELECT chain_id
...@@ -239,8 +246,6 @@ def stats_len(): ...@@ -239,8 +246,6 @@ def stats_len():
239 GROUP BY chain_id;""", warn_every=0) ] 246 GROUP BY chain_id;""", warn_every=0) ]
240 lengths.append(l) # list of chain lengths from the family 247 lengths.append(l) # list of chain lengths from the family
241 248
242 - # notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths")
243 -
244 # Plot the figure 249 # Plot the figure
245 fig = plt.figure(figsize=(10,3)) 250 fig = plt.figure(figsize=(10,3))
246 ax = fig.gca() 251 ax = fig.gca()
...@@ -267,7 +272,7 @@ def stats_len(): ...@@ -267,7 +272,7 @@ def stats_len():
267 ncol=1, fontsize='small', bbox_to_anchor=(1.3, 0.5)) 272 ncol=1, fontsize='small', bbox_to_anchor=(1.3, 0.5))
268 273
269 # Save the figure 274 # Save the figure
270 - fig.savefig(f"results/figures/lengths_{res_thr}A.png") 275 + fig.savefig(runDir + f"/results/figures/lengths_{res_thr}A.png")
271 idxQueue.put(thr_idx) # replace the thread index in the queue 276 idxQueue.put(thr_idx) # replace the thread index in the queue
272 # notify("Computed sequence length statistics and saved the figure.") 277 # notify("Computed sequence length statistics and saved the figure.")
273 278
...@@ -285,6 +290,7 @@ def format_percentage(tot, x): ...@@ -285,6 +290,7 @@ def format_percentage(tot, x):
285 290
286 def stats_freq(): 291 def stats_freq():
287 """Computes base frequencies in all RNA families. 292 """Computes base frequencies in all RNA families.
293 + Uses all chains mapped to a family including copies, inferred or not.
288 294
289 Outputs results/frequencies.csv 295 Outputs results/frequencies.csv
290 REQUIRES tables chain, nucleotide up to date.""" 296 REQUIRES tables chain, nucleotide up to date."""
...@@ -293,17 +299,18 @@ def stats_freq(): ...@@ -293,17 +299,18 @@ def stats_freq():
293 global idxQueue 299 global idxQueue
294 thr_idx = idxQueue.get() 300 thr_idx = idxQueue.get()
295 301
302 + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} stats_freq()")
303 +
296 # Initialize a Counter object for each family 304 # Initialize a Counter object for each family
297 freqs = {} 305 freqs = {}
298 for f in fam_list: 306 for f in fam_list:
299 freqs[f] = Counter() 307 freqs[f] = Counter()
300 308
301 # List all nt_names happening within a RNA family and store the counts in the Counter 309 # List all nt_names happening within a RNA family and store the counts in the Counter
302 - for i,f in enumerate(tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False)): 310 + for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False):
303 - with sqlite3.connect("results/RNANet.db") as conn: 311 + with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
304 counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;", warn_every=0)) 312 counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;", warn_every=0))
305 freqs[f].update(counts) 313 freqs[f].update(counts)
306 - # notify(f"[{i+1}/{len(fam_list)}] Computed {f} nucleotide frequencies.")
307 314
308 # Create a pandas DataFrame, and save it to CSV. 315 # Create a pandas DataFrame, and save it to CSV.
309 df = pd.DataFrame() 316 df = pd.DataFrame()
...@@ -311,7 +318,7 @@ def stats_freq(): ...@@ -311,7 +318,7 @@ def stats_freq():
311 tot = sum(freqs[f].values()) 318 tot = sum(freqs[f].values())
312 df = pd.concat([ df, pd.DataFrame([[ format_percentage(tot, x) for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ]) 319 df = pd.concat([ df, pd.DataFrame([[ format_percentage(tot, x) for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ])
313 df = df.fillna(0) 320 df = df.fillna(0)
314 - df.to_csv("results/frequencies.csv") 321 + df.to_csv(runDir + "/results/frequencies.csv")
315 idxQueue.put(thr_idx) # replace the thread index in the queue 322 idxQueue.put(thr_idx) # replace the thread index in the queue
316 # notify("Saved nucleotide frequencies to CSV file.") 323 # notify("Saved nucleotide frequencies to CSV file.")
317 324
...@@ -327,11 +334,13 @@ def parallel_stats_pairs(f): ...@@ -327,11 +334,13 @@ def parallel_stats_pairs(f):
327 global idxQueue 334 global idxQueue
328 thr_idx = idxQueue.get() 335 thr_idx = idxQueue.get()
329 336
337 + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} p_stats_pairs({f})")
338 +
330 chain_id_list = mappings_list[f] 339 chain_id_list = mappings_list[f]
331 data = [] 340 data = []
332 sqldata = [] 341 sqldata = []
333 for cid in tqdm(chain_id_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} basepair types", leave=False): 342 for cid in tqdm(chain_id_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} basepair types", leave=False):
334 - with sqlite3.connect("results/RNANet.db") as conn: 343 + with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
335 # Get comma separated lists of basepairs per nucleotide 344 # Get comma separated lists of basepairs per nucleotide
336 interactions = pd.DataFrame( 345 interactions = pd.DataFrame(
337 sql_ask_database(conn, 346 sql_ask_database(conn,
...@@ -398,7 +407,7 @@ def parallel_stats_pairs(f): ...@@ -398,7 +407,7 @@ def parallel_stats_pairs(f):
398 data.append(expanded_list) 407 data.append(expanded_list)
399 408
400 # Update the database 409 # Update the database
401 - with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn: 410 + with sqlite3.connect(runDir + "/results/RNANet.db", isolation_level=None) as conn:
402 conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query 411 conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
403 sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?, 412 sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?,
404 pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?, 413 pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?,
...@@ -416,8 +425,8 @@ def parallel_stats_pairs(f): ...@@ -416,8 +425,8 @@ def parallel_stats_pairs(f):
416 425
417 # Create an output DataFrame 426 # Create an output DataFrame
418 f_df = pd.DataFrame([[ x for x in cnt.values() ]], columns=list(cnt), index=[f]) 427 f_df = pd.DataFrame([[ x for x in cnt.values() ]], columns=list(cnt), index=[f])
419 - f_df.to_csv(f"data/{f}_counts.csv") 428 + f_df.to_csv(runDir + f"/data/{f}_counts.csv")
420 - expanded_list.to_csv(f"data/{f}_pairs.csv") 429 + expanded_list.to_csv(runDir + f"/data/{f}_pairs.csv")
421 430
422 idxQueue.put(thr_idx) # replace the thread index in the queue 431 idxQueue.put(thr_idx) # replace the thread index in the queue
423 432
...@@ -430,28 +439,34 @@ def to_dist_matrix(f): ...@@ -430,28 +439,34 @@ def to_dist_matrix(f):
430 global idxQueue 439 global idxQueue
431 thr_idx = idxQueue.get() 440 thr_idx = idxQueue.get()
432 441
433 - # notify(f"Computing {f} distance matrix from alignment...") 442 + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} to_dist_matrix({f})")
434 - command = f"esl-alipid --rna --noheader --informat stockholm {f}_3d_only.stk"
435 443
436 # Prepare a file 444 # Prepare a file
437 with open(path_to_seq_data+f"/realigned/{f}++.afa") as al_file: 445 with open(path_to_seq_data+f"/realigned/{f}++.afa") as al_file:
438 al = AlignIO.read(al_file, "fasta") 446 al = AlignIO.read(al_file, "fasta")
439 names = [ x.id for x in al if '[' in x.id ] 447 names = [ x.id for x in al if '[' in x.id ]
440 al = al[-len(names):] 448 al = al[-len(names):]
441 - with open(f + "_3d_only.stk", "w") as only_3d: 449 + with open(path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk", "w") as only_3d:
450 + try:
442 only_3d.write(al.format("stockholm")) 451 only_3d.write(al.format("stockholm"))
452 + except ValueError as e:
453 + warn(e)
443 del al 454 del al
455 + subprocess.run(["esl-reformat", "--informat", "stockholm", "--mingap", "-o", path_to_seq_data+f"/realigned/{f}_3d_only.stk", "stockholm", path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk"])
444 456
445 # Prepare the job 457 # Prepare the job
446 - process = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE) 458 + process = subprocess.Popen(shlex.split(f"esl-alipid --rna --noheader --informat stockholm {path_to_seq_data}realigned/{f}_3d_only.stk"),
459 + stdout=subprocess.PIPE, stderr=subprocess.PIPE)
447 id_matrix = np.zeros((len(names), len(names))) 460 id_matrix = np.zeros((len(names), len(names)))
448 461
449 pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", leave=False) 462 pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", leave=False)
450 - while process.poll() is None: 463 + cnt = 0
451 - output = process.stdout.readline() 464 + while not cnt or process.poll() is None:
465 + output = process.stdout.read()
452 if output: 466 if output:
453 lines = output.strip().split(b'\n') 467 lines = output.strip().split(b'\n')
454 for l in lines: 468 for l in lines:
469 + cnt += 1
455 line = l.split() 470 line = l.split()
456 s1 = line[0].decode('utf-8') 471 s1 = line[0].decode('utf-8')
457 s2 = line[1].decode('utf-8') 472 s2 = line[1].decode('utf-8')
...@@ -460,9 +475,14 @@ def to_dist_matrix(f): ...@@ -460,9 +475,14 @@ def to_dist_matrix(f):
460 id2 = names.index(s2) 475 id2 = names.index(s2)
461 id_matrix[id1, id2] = float(score) 476 id_matrix[id1, id2] = float(score)
462 pbar.update(1) 477 pbar.update(1)
478 + if cnt != len(names)*(len(names)-1)*0.5:
479 + warn(f"{f} got {cnt} updates on {len(names)*(len(names)-1)*0.5}")
480 + if process.poll() != 0:
481 + l = process.stderr.read().strip().split(b'\n')
482 + warn("\n".join([ line.decode('utf-8') for line in l ]))
463 pbar.close() 483 pbar.close()
464 484
465 - subprocess.run(["rm", "-f", f + "_3d_only.stk"]) 485 + subprocess.run(["rm", "-f", f + "_3d_only_tmp.stk"])
466 np.save("data/"+f+".npy", id_matrix) 486 np.save("data/"+f+".npy", id_matrix)
467 idxQueue.put(thr_idx) # replace the thread index in the queue 487 idxQueue.put(thr_idx) # replace the thread index in the queue
468 return 0 488 return 0
...@@ -471,21 +491,26 @@ def seq_idty(): ...@@ -471,21 +491,26 @@ def seq_idty():
471 """Computes identity matrices for each of the RNA families. 491 """Computes identity matrices for each of the RNA families.
472 492
473 REQUIRES temporary results files in data/*.npy 493 REQUIRES temporary results files in data/*.npy
474 - REQUIRES tables chain, family un to date.""" 494 + REQUIRES tables chain, family up to date."""
475 495
476 # load distance matrices 496 # load distance matrices
497 + fams_to_plot = [ f for f in famlist if f not in ignored ]
477 fam_arrays = [] 498 fam_arrays = []
478 - for f in famlist: 499 + for f in fams_to_plot:
479 if path.isfile("data/"+f+".npy"): 500 if path.isfile("data/"+f+".npy"):
480 - fam_arrays.append(np.load("data/"+f+".npy")) 501 + fam_arrays.append(np.load("data/"+f+".npy") / 100.0) # normalize percentages in [0,1]
481 else: 502 else:
482 - fam_arrays.append([]) 503 + warn("data/"+f+".npy not found !")
504 + fam_arrays.append(np.array([]))
483 505
484 # Update database with identity percentages 506 # Update database with identity percentages
485 - conn = sqlite3.connect("results/RNANet.db") 507 + conn = sqlite3.connect(runDir + "/results/RNANet.db")
486 - for f, D in zip(famlist, fam_arrays): 508 + for f, D in zip(fams_to_plot, fam_arrays):
487 if not len(D): continue 509 if not len(D): continue
488 - a = 1.0 - np.average(D + D.T) # Get symmetric matrix instead of lower triangle + convert from distance matrix to identity matrix 510 + if D.shape[0] > 1:
511 + a = np.sum(D) * 2 / D.shape[0] / (D.shape[0] - 1) # SUM(D) / (n(n-1)/2)
512 + else:
513 + a = D[0][0]
489 conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';") 514 conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';")
490 conn.commit() 515 conn.commit()
491 conn.close() 516 conn.close()
...@@ -495,10 +520,11 @@ def seq_idty(): ...@@ -495,10 +520,11 @@ def seq_idty():
495 axs = axs.ravel() 520 axs = axs.ravel()
496 [axi.set_axis_off() for axi in axs] 521 [axi.set_axis_off() for axi in axs]
497 im = "" # Just to declare the variable, it will be set in the loop 522 im = "" # Just to declare the variable, it will be set in the loop
498 - for f, D, ax in zip(famlist, fam_arrays, axs): 523 + for f, D, ax in zip(fams_to_plot, fam_arrays, axs):
499 - if not len(D): continue
500 - if D.shape[0] > 2: # Cluster only if there is more than 2 sequences to organize
501 D = D + D.T # Copy the lower triangle to upper, to get a symmetrical matrix 524 D = D + D.T # Copy the lower triangle to upper, to get a symmetrical matrix
525 + if D.shape[0] > 2: # Cluster only if there is more than 2 sequences to organize
526 + D = 1.0 - D
527 + np.fill_diagonal(D, 0.0)
502 condensedD = squareform(D) 528 condensedD = squareform(D)
503 529
504 # Compute basic dendrogram by Ward's method 530 # Compute basic dendrogram by Ward's method
...@@ -507,15 +533,20 @@ def seq_idty(): ...@@ -507,15 +533,20 @@ def seq_idty():
507 533
508 # Reorganize rows and cols 534 # Reorganize rows and cols
509 idx1 = Z['leaves'] 535 idx1 = Z['leaves']
510 - D = D[idx1,:] 536 + D = D[idx1[::-1],:]
511 D = D[:,idx1[::-1]] 537 D = D[:,idx1[::-1]]
512 - im = ax.matshow(1.0 - D, vmin=0, vmax=1, origin='lower') # convert to identity matrix 1 - D from distance matrix D 538 + D = 1.0 - D
513 - ax.set_title(f + "\n(" + str(len(mappings_list[f]))+ " chains)", fontsize=10) 539 + elif D.shape[0] == 2:
540 + np.fill_diagonal(D, 1.0) # the diagonal has been ignored until now
541 + ax.text(np.floor(D.shape[0]/2.0)-(0.5 if not D.shape[0]%2 else 0), -0.5, f + "\n(" + str(D.shape[0]) + " chains)",
542 + fontsize=9, horizontalalignment = 'center', verticalalignment='bottom')
543 + im = ax.matshow(D, vmin=0, vmax=1)
544 +
514 fig.tight_layout() 545 fig.tight_layout()
515 - fig.subplots_adjust(wspace=0.1, hspace=0.3) 546 + fig.subplots_adjust(hspace=0.3, wspace=0.1)
516 - fig.colorbar(im, ax=axs[-1], shrink=0.8) 547 + fig.colorbar(im, ax=axs[-4], shrink=0.8)
517 - fig.savefig(f"results/figures/distances.png") 548 + fig.savefig(runDir + f"/results/figures/distances.png")
518 - notify("Computed all identity matrices and saved the figure.") 549 + print("> Computed all identity matrices and saved the figure.", flush=True)
519 550
520 def stats_pairs(): 551 def stats_pairs():
521 """Counts occurrences of intra-chain base-pair types in RNA families 552 """Counts occurrences of intra-chain base-pair types in RNA families
...@@ -523,6 +554,8 @@ def stats_pairs(): ...@@ -523,6 +554,8 @@ def stats_pairs():
523 Creates a temporary results file in data/pair_counts.csv, and a results file in results/pairings.csv. 554 Creates a temporary results file in data/pair_counts.csv, and a results file in results/pairings.csv.
524 REQUIRES tables chain, nucleotide up-to-date.""" 555 REQUIRES tables chain, nucleotide up-to-date."""
525 556
557 + setproctitle(f"RNANet statistics.py stats_pairs()")
558 +
526 def line_format(family_data): 559 def line_format(family_data):
527 return family_data.apply(partial(format_percentage, sum(family_data))) 560 return family_data.apply(partial(format_percentage, sum(family_data)))
528 561
...@@ -530,12 +563,12 @@ def stats_pairs(): ...@@ -530,12 +563,12 @@ def stats_pairs():
530 results = [] 563 results = []
531 allpairs = [] 564 allpairs = []
532 for f in fam_list: 565 for f in fam_list:
533 - newpairs = pd.read_csv(f"data/{f}_pairs.csv", index_col=0) 566 + newpairs = pd.read_csv(runDir + f"/data/{f}_pairs.csv", index_col=0)
534 - fam_df = pd.read_csv(f"data/{f}_counts.csv", index_col=0) 567 + fam_df = pd.read_csv(runDir + f"/data/{f}_counts.csv", index_col=0)
535 results.append(fam_df) 568 results.append(fam_df)
536 allpairs.append(newpairs) 569 allpairs.append(newpairs)
537 - subprocess.run(["rm", "-f", f"data/{f}_pairs.csv"]) 570 + subprocess.run(["rm", "-f", runDir + f"/data/{f}_pairs.csv"])
538 - subprocess.run(["rm", "-f", f"data/{f}_counts.csv"]) 571 + subprocess.run(["rm", "-f", runDir + f"/data/{f}_counts.csv"])
539 all_pairs = pd.concat(allpairs) 572 all_pairs = pd.concat(allpairs)
540 df = pd.concat(results).fillna(0) 573 df = pd.concat(results).fillna(0)
541 df.to_csv("data/pair_counts.csv") 574 df.to_csv("data/pair_counts.csv")
...@@ -573,14 +606,14 @@ def stats_pairs(): ...@@ -573,14 +606,14 @@ def stats_pairs():
573 crosstab = crosstab[["AU", "GC", "Wobble", "Other"]] 606 crosstab = crosstab[["AU", "GC", "Wobble", "Other"]]
574 607
575 # Save to CSV 608 # Save to CSV
576 - df.to_csv("results/pair_types.csv") 609 + df.to_csv(runDir + "/results/pair_types.csv")
577 610
578 # Plot barplot of overall types 611 # Plot barplot of overall types
579 ax = crosstab.plot(figsize=(8,5), kind='bar', stacked=True, log=False, fontsize=13) 612 ax = crosstab.plot(figsize=(8,5), kind='bar', stacked=True, log=False, fontsize=13)
580 ax.set_ylabel("Number of observations (millions)", fontsize=13) 613 ax.set_ylabel("Number of observations (millions)", fontsize=13)
581 ax.set_xlabel(None) 614 ax.set_xlabel(None)
582 plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99) 615 plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99)
583 - plt.savefig("results/figures/pairings.png") 616 + plt.savefig(runDir + "/results/figures/pairings.png")
584 617
585 notify("Computed nucleotide statistics and saved CSV and PNG file.") 618 notify("Computed nucleotide statistics and saved CSV and PNG file.")
586 619
...@@ -589,7 +622,9 @@ def per_chain_stats(): ...@@ -589,7 +622,9 @@ def per_chain_stats():
589 622
590 REQUIRES tables chain, nucleotide up to date. """ 623 REQUIRES tables chain, nucleotide up to date. """
591 624
592 - with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn: 625 + setproctitle(f"RNANet statistics.py per_chain_stats()")
626 +
627 + with sqlite3.connect(runDir + "/results/RNANet.db", isolation_level=None) as conn:
593 # Compute per-chain nucleotide frequencies 628 # Compute per-chain nucleotide frequencies
594 df = pd.read_sql("SELECT SUM(is_A) as A, SUM(is_C) AS C, SUM(is_G) AS G, SUM(is_U) AS U, SUM(is_other) AS O, chain_id FROM nucleotide GROUP BY chain_id;", conn) 629 df = pd.read_sql("SELECT SUM(is_A) as A, SUM(is_C) AS C, SUM(is_G) AS G, SUM(is_U) AS U, SUM(is_other) AS O, chain_id FROM nucleotide GROUP BY chain_id;", conn)
595 df["total"] = pd.Series(df.A + df.C + df.G + df.U + df.O, dtype=np.float64) 630 df["total"] = pd.Series(df.A + df.C + df.G + df.U + df.O, dtype=np.float64)
...@@ -600,25 +635,36 @@ def per_chain_stats(): ...@@ -600,25 +635,36 @@ def per_chain_stats():
600 conn.execute('pragma journal_mode=wal') 635 conn.execute('pragma journal_mode=wal')
601 sql_execute(conn, "UPDATE chain SET chain_freq_A = ?, chain_freq_C = ?, chain_freq_G = ?, chain_freq_U = ?, chain_freq_other = ? WHERE chain_id= ?;", 636 sql_execute(conn, "UPDATE chain SET chain_freq_A = ?, chain_freq_C = ?, chain_freq_G = ?, chain_freq_U = ?, chain_freq_other = ? WHERE chain_id= ?;",
602 many=True, data=list(df.to_records(index=False)), warn_every=10) 637 many=True, data=list(df.to_records(index=False)), warn_every=10)
603 - notify("Updated the database with per-chain base frequencies") 638 + print("> Updated the database with per-chain base frequencies", flush=True)
604 639
605 def general_stats(): 640 def general_stats():
606 """ 641 """
607 Number of structures as function of the resolution threshold 642 Number of structures as function of the resolution threshold
608 Number of Rfam families as function of the resolution threshold 643 Number of Rfam families as function of the resolution threshold
609 """ 644 """
610 - with sqlite3.connect("results/RNANet.db") as conn: 645 +
611 - df_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution 646 + setproctitle(f"RNANet statistics.py general_stats()")
647 +
648 + reqs = [
649 + # unique unmapped chains with no issues
650 + """ SELECT distinct pdb_id, chain_name, exp_method, resolution
612 FROM chain JOIN structure ON chain.structure_id = structure.pdb_id 651 FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
613 - WHERE rfam_acc = 'unmappd' AND ISSUE=0;""", conn) 652 + WHERE rfam_acc = 'unmappd' AND ISSUE=0;""",
614 - df_mapped_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution 653 +
654 + # unique mapped chains with no issues
655 + """ SELECT distinct pdb_id, chain_name, exp_method, resolution
615 FROM chain JOIN structure ON chain.structure_id = structure.pdb_id 656 FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
616 - WHERE rfam_acc != 'unmappd' AND ISSUE=0;""", conn) 657 + WHERE rfam_acc != 'unmappd' AND ISSUE=0;""",
617 - df_mapped_copies = pd.read_sql(f"""SELECT pdb_id, chain_name, inferred, rfam_acc, pdb_start, pdb_end, exp_method, resolution 658 +
659 + # mapped chains with no issues
660 + """ SELECT pdb_id, chain_name, inferred, rfam_acc, pdb_start, pdb_end, exp_method, resolution
618 FROM chain JOIN structure ON chain.structure_id = structure.pdb_id 661 FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
619 - WHERE rfam_acc != 'unmappd' AND ISSUE=0;""", conn) 662 + WHERE rfam_acc != 'unmappd' AND ISSUE=0;""",
620 - df_inferred_only_unique = pd.read_sql(f"""SELECT DISTINCT pdb_id, c.chain_name, exp_method, resolution 663 +
621 - FROM (SELECT inferred, rfam_acc, pdb_start, pdb_end, chain.structure_id, chain.chain_name, r.redundancy, r.inf_redundancy 664 + # mapped chains with no issues that are all inferred
665 + """ SELECT DISTINCT pdb_id, c.chain_name, exp_method, resolution
666 + FROM (
667 + SELECT inferred, rfam_acc, pdb_start, pdb_end, chain.structure_id, chain.chain_name, r.redundancy, r.inf_redundancy
622 FROM chain 668 FROM chain
623 JOIN (SELECT structure_id, chain_name, COUNT(distinct rfam_acc) AS redundancy, SUM(inferred) AS inf_redundancy 669 JOIN (SELECT structure_id, chain_name, COUNT(distinct rfam_acc) AS redundancy, SUM(inferred) AS inf_redundancy
624 FROM chain 670 FROM chain
...@@ -627,8 +673,105 @@ def general_stats(): ...@@ -627,8 +673,105 @@ def general_stats():
627 ) AS r ON chain.structure_id=r.structure_id AND chain.chain_name = r.chain_name 673 ) AS r ON chain.structure_id=r.structure_id AND chain.chain_name = r.chain_name
628 WHERE r.redundancy=r.inf_redundancy AND rfam_acc != 'unmappd' and issue=0 674 WHERE r.redundancy=r.inf_redundancy AND rfam_acc != 'unmappd' and issue=0
629 ) AS c 675 ) AS c
630 - JOIN structure ON c.structure_id=structure.pdb_id;""", conn) 676 + JOIN structure ON c.structure_id=structure.pdb_id;""",
631 - print("> found", len(df_inferred_only_unique.index), "chains which are mapped only by inference using BGSU NR Lists.") 677 +
678 + # Number of mapped chains (not inferred)
679 + """SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 0);""",
680 +
681 + # Number of unique mapped chains (not inferred)
682 + """SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 0);""",
683 +
684 + # Number of mapped chains (inferred)
685 + """SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 1);""",
686 +
687 + # Number of unique mapped chains (inferred)
688 + """SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 1);""",
689 +
690 + # Number of mapped chains inferred once
691 + """SELECT count(*) FROM (
692 + SELECT structure_id, chain_name, COUNT(DISTINCT rfam_acc) as c
693 + FROM chain where rfam_acc!='unmappd' and inferred=1
694 + GROUP BY structure_id, chain_name
695 + ) WHERE c=1;""",
696 +
697 + # Number of mapped chains inferred twice
698 + """select count(*) from (
699 + select structure_id, chain_name, count(distinct rfam_acc) as c
700 + from chain where rfam_acc!='unmappd' and inferred=1
701 + group by structure_id, chain_name
702 + ) where c=2;""",
703 +
704 + # Number of mapped chains inferred 3 times or more
705 + """select count(*) from (
706 + select structure_id, chain_name, count(distinct rfam_acc) as c
707 + from chain where rfam_acc!='unmappd' and inferred=1
708 + group by structure_id, chain_name
709 + ) where c>2;""",
710 +
711 + # Number of chains both mapped with and without inference
712 + """ SELECT COUNT(*) FROM (
713 + SELECT structure_id, chain_name, sum(inferred) AS s, COUNT(rfam_acc) AS c
714 + FROM chain
715 + WHERE rfam_acc!='unmappd'
716 + GROUP BY structure_id, chain_name
717 + )
718 + WHERE s < c AND s > 0;""",
719 +
720 + # Number of mapped chains (total)
721 + """SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd');""",
722 +
723 + # Number of unique mapped chains
724 + """SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd');""",
725 +
726 + # Number of unmapped chains
727 + """SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc = 'unmappd');""",
728 +
729 + # Number of mapped chains without issues (not inferred)
730 + """SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 0 AND issue = 0);""",
731 +
732 + # Number of unique mapped chains without issues (not inferred)
733 + """SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 0 AND issue = 0);""",
734 +
735 + # Number of mapped chains without issues (inferred)
736 + """SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 1 AND issue=0);""",
737 +
738 + # Number of unique mapped chains without issues (inferred)
739 + """SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 1 AND issue=0);""",
740 +
741 + # Number of mapped chains without issues (total)
742 + """SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND issue=0);""",
743 +
744 + # Number of unique mapped chains without issues
745 + """SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND issue=0);""",
746 +
747 + # Number of unmapped chains without issues
748 + """SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc = 'unmappd' AND issue=0);"""
749 + ]
750 +
751 + answers = []
752 + with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
753 + for r in reqs:
754 + answers.append(pd.read_sql(r, conn))
755 + df_unique = answers[0]
756 + df_mapped_unique = answers[1]
757 + df_mapped_copies = answers[2]
758 + df_inferred_only_unique = answers[3]
759 + print()
760 + print("> found", answers[4].iloc[0][0], f"chains ({answers[5].iloc[0][0]} unique chains) that are mapped thanks to Rfam. Removing chains with issues, only {answers[15].iloc[0][0]} ({answers[16].iloc[0][0]} unique)")
761 + if answers[4].iloc[0][0] != answers[5].iloc[0][0]:
762 + print("\t> This happens because different parts of the same chain can be mapped to different families.")
763 + print("> found", answers[6].iloc[0][0], f"chains ({answers[7].iloc[0][0]} unique chains) that are mapped by inferrence. Removing chains with issues, only {answers[17].iloc[0][0]} ({answers[18].iloc[0][0]} unique).")
764 + print("\t> ", answers[8].iloc[0][0], "chains are mapped only once,")
765 + print("\t> ", answers[9].iloc[0][0], "are mapped to 2 families,")
766 + print("\t> ", answers[10].iloc[0][0], "are mapped to 3 or more.")
767 + print("> Among them,", answers[11].iloc[0][0], "chains are mapped both with families found on Rfam and by inferrence.")
768 + if answers[11].iloc[0][0]:
769 + print("\t> this is normal if you used option -f (--full-inference). Otherwise, there might be a problem.")
770 + print("> TOTAL:", answers[12].iloc[0][0], f"chains ({answers[13].iloc[0][0]} unique chains) mapped to a family. Removing chains with issues, only {answers[19].iloc[0][0]} ({answers[20].iloc[0][0]} unique).")
771 + print("> TOTAL:", answers[14].iloc[0][0], f"unmapped chains. Removing chains with issues, {answers[21].iloc[0][0]}.")
772 + if answers[14].iloc[0][0]:
773 + print("\t> this is normal if you used option --no-homology. Otherwise, there might be a problem.")
774 + print()
632 775
633 ########################################## 776 ##########################################
634 # plot N = f(resolution, exp_method) 777 # plot N = f(resolution, exp_method)
...@@ -642,7 +785,7 @@ def general_stats(): ...@@ -642,7 +785,7 @@ def general_stats():
642 df_inferred_only_unique.sort_values('resolution', inplace=True, ignore_index=True) 785 df_inferred_only_unique.sort_values('resolution', inplace=True, ignore_index=True)
643 df_mapped_copies.sort_values('resolution', inplace=True, ignore_index=True) 786 df_mapped_copies.sort_values('resolution', inplace=True, ignore_index=True)
644 max_res = max(df_unique.resolution) 787 max_res = max(df_unique.resolution)
645 - max_structs = len(df_mapped_copies.index.tolist()) 788 + max_structs = max(len(df_mapped_copies.index), len(df_unique.index))
646 colors = np.linspace(0,1,1+len(methods)) 789 colors = np.linspace(0,1,1+len(methods))
647 plt.xticks( np.arange(0, max_res+2, 2.0).tolist(), np.arange(0, max_res+2, 2.0).tolist() ) 790 plt.xticks( np.arange(0, max_res+2, 2.0).tolist(), np.arange(0, max_res+2, 2.0).tolist() )
648 791
...@@ -654,7 +797,7 @@ def general_stats(): ...@@ -654,7 +797,7 @@ def general_stats():
654 axs[0][0].set_ylabel("ALL", fontsize=14) 797 axs[0][0].set_ylabel("ALL", fontsize=14)
655 axs[0][0].set_title("Number of unique RNA chains", fontsize=14) 798 axs[0][0].set_title("Number of unique RNA chains", fontsize=14)
656 axs[0][0].set_ylim((0, max_structs * 1.05)) 799 axs[0][0].set_ylim((0, max_structs * 1.05))
657 - axs[0][0].legend(loc="best", fontsize=14) 800 + axs[0][0].legend(loc="lower right", fontsize=14)
658 801
659 axs[0][1].grid(axis='y', ls='dotted', lw=1) 802 axs[0][1].grid(axis='y', ls='dotted', lw=1)
660 axs[0][1].set_yticklabels([]) 803 axs[0][1].set_yticklabels([])
...@@ -663,9 +806,9 @@ def general_stats(): ...@@ -663,9 +806,9 @@ def general_stats():
663 axs[0][1].hist(df_inferred_only_unique.resolution, bins=np.arange(0, max_res, 0.5), fc=(0.2, 0, colors[0], 0.5), cumulative=True, label='only by inference') 806 axs[0][1].hist(df_inferred_only_unique.resolution, bins=np.arange(0, max_res, 0.5), fc=(0.2, 0, colors[0], 0.5), cumulative=True, label='only by inference')
664 axs[0][1].text(0.95*max_res, 0.95*len(df_mapped_unique.resolution), "%d " % len(df_mapped_unique.resolution), 807 axs[0][1].text(0.95*max_res, 0.95*len(df_mapped_unique.resolution), "%d " % len(df_mapped_unique.resolution),
665 horizontalalignment='right', verticalalignment='top', fontsize=14) 808 horizontalalignment='right', verticalalignment='top', fontsize=14)
666 - axs[0][1].set_title("Number of unique RNA chains\nmapped to $\geq 1$ family", fontsize=14) 809 + axs[0][1].set_title(r"Number of unique RNA chains\nmapped to $\geq 1$ family", fontsize=14)
667 axs[0][1].set_ylim((0, max_structs * 1.05)) 810 axs[0][1].set_ylim((0, max_structs * 1.05))
668 - axs[0][1].legend(loc="best", fontsize=14) 811 + axs[0][1].legend(loc="upper left", fontsize=14)
669 812
670 axs[0][2].grid(axis='y', ls='dotted', lw=1) 813 axs[0][2].grid(axis='y', ls='dotted', lw=1)
671 axs[0][2].set_yticklabels([]) 814 axs[0][2].set_yticklabels([])
...@@ -675,7 +818,7 @@ def general_stats(): ...@@ -675,7 +818,7 @@ def general_stats():
675 axs[0][2].text(0.95*max_res, 0.95*len(df_mapped_copies.resolution), "%d " % len(df_mapped_copies.resolution), 818 axs[0][2].text(0.95*max_res, 0.95*len(df_mapped_copies.resolution), "%d " % len(df_mapped_copies.resolution),
676 horizontalalignment='right', verticalalignment='top', fontsize=14) 819 horizontalalignment='right', verticalalignment='top', fontsize=14)
677 axs[0][2].set_title("Number of RNA chains mapped to a\nfamily (with copies)", fontsize=14) 820 axs[0][2].set_title("Number of RNA chains mapped to a\nfamily (with copies)", fontsize=14)
678 - axs[0][2].legend(loc="right", fontsize=14) 821 + axs[0][2].legend(loc="upper left", fontsize=14)
679 axs[0][2].set_ylim((0, max_structs * 1.05)) 822 axs[0][2].set_ylim((0, max_structs * 1.05))
680 823
681 for i,m in enumerate(methods): 824 for i,m in enumerate(methods):
...@@ -683,7 +826,7 @@ def general_stats(): ...@@ -683,7 +826,7 @@ def general_stats():
683 df_mapped_unique_m = df_mapped_unique[df_mapped_unique.exp_method == m] 826 df_mapped_unique_m = df_mapped_unique[df_mapped_unique.exp_method == m]
684 df_inferred_only_unique_m = df_inferred_only_unique[df_inferred_only_unique.exp_method == m] 827 df_inferred_only_unique_m = df_inferred_only_unique[df_inferred_only_unique.exp_method == m]
685 df_mapped_copies_m = df_mapped_copies[ df_mapped_copies.exp_method == m] 828 df_mapped_copies_m = df_mapped_copies[ df_mapped_copies.exp_method == m]
686 - max_structs = len(df_mapped_copies_m.resolution.tolist()) 829 + max_structs = max(len(df_mapped_copies_m.index), len(df_unique_m.index))
687 print("> found", max_structs, "structures with method", m, flush=True) 830 print("> found", max_structs, "structures with method", m, flush=True)
688 831
689 axs[1+i][0].grid(axis='y', ls='dotted', lw=1) 832 axs[1+i][0].grid(axis='y', ls='dotted', lw=1)
...@@ -693,7 +836,7 @@ def general_stats(): ...@@ -693,7 +836,7 @@ def general_stats():
693 horizontalalignment='right', verticalalignment='top', fontsize=14) 836 horizontalalignment='right', verticalalignment='top', fontsize=14)
694 axs[1+i][0].set_ylim((0, max_structs * 1.05)) 837 axs[1+i][0].set_ylim((0, max_structs * 1.05))
695 axs[1+i][0].set_ylabel(m, fontsize=14) 838 axs[1+i][0].set_ylabel(m, fontsize=14)
696 - axs[1+i][0].legend(loc="best", fontsize=14) 839 + axs[1+i][0].legend(loc="lower right", fontsize=14)
697 840
698 axs[1+i][1].grid(axis='y', ls='dotted', lw=1) 841 axs[1+i][1].grid(axis='y', ls='dotted', lw=1)
699 axs[1+i][1].set_yticklabels([]) 842 axs[1+i][1].set_yticklabels([])
...@@ -703,7 +846,7 @@ def general_stats(): ...@@ -703,7 +846,7 @@ def general_stats():
703 axs[1+i][1].text(0.95*max_res, 0.95*len(df_mapped_unique_m.resolution), "%d " % len(df_mapped_unique_m.resolution), 846 axs[1+i][1].text(0.95*max_res, 0.95*len(df_mapped_unique_m.resolution), "%d " % len(df_mapped_unique_m.resolution),
704 horizontalalignment='right', verticalalignment='top', fontsize=14) 847 horizontalalignment='right', verticalalignment='top', fontsize=14)
705 axs[1+i][1].set_ylim((0, max_structs * 1.05)) 848 axs[1+i][1].set_ylim((0, max_structs * 1.05))
706 - axs[1+i][1].legend(loc="best", fontsize=14) 849 + axs[1+i][1].legend(loc="upper left", fontsize=14)
707 850
708 axs[1+i][2].grid(axis='y', ls='dotted', lw=1) 851 axs[1+i][2].grid(axis='y', ls='dotted', lw=1)
709 axs[1+i][2].set_yticklabels([]) 852 axs[1+i][2].set_yticklabels([])
...@@ -713,7 +856,7 @@ def general_stats(): ...@@ -713,7 +856,7 @@ def general_stats():
713 axs[1+i][2].text(0.95*max_res, 0.95*len(df_mapped_copies_m.resolution), "%d " % len(df_mapped_copies_m.resolution), 856 axs[1+i][2].text(0.95*max_res, 0.95*len(df_mapped_copies_m.resolution), "%d " % len(df_mapped_copies_m.resolution),
714 horizontalalignment='right', verticalalignment='top', fontsize=14) 857 horizontalalignment='right', verticalalignment='top', fontsize=14)
715 axs[1+i][2].set_ylim((0, max_structs * 1.05)) 858 axs[1+i][2].set_ylim((0, max_structs * 1.05))
716 - axs[1+i][2].legend(loc="right", fontsize=14) 859 + axs[1+i][2].legend(loc="upper left", fontsize=14)
717 860
718 axs[-1][0].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14) 861 axs[-1][0].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14)
719 axs[-1][1].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14) 862 axs[-1][1].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14)
...@@ -722,7 +865,7 @@ def general_stats(): ...@@ -722,7 +865,7 @@ def general_stats():
722 fig.suptitle("Number of RNA chains by experimental method and resolution", fontsize=16) 865 fig.suptitle("Number of RNA chains by experimental method and resolution", fontsize=16)
723 fig.subplots_adjust(left=0.07, right=0.98, wspace=0.05, 866 fig.subplots_adjust(left=0.07, right=0.98, wspace=0.05,
724 hspace=0.05, bottom=0.05, top=0.92) 867 hspace=0.05, bottom=0.05, top=0.92)
725 - fig.savefig("results/figures/resolutions.png") 868 + fig.savefig(runDir + "/results/figures/resolutions.png")
726 plt.close() 869 plt.close()
727 870
728 ########################################## 871 ##########################################
...@@ -765,7 +908,7 @@ def general_stats(): ...@@ -765,7 +908,7 @@ def general_stats():
765 fig.suptitle("Number of RNA families used by experimental method and resolution", fontsize=16) 908 fig.suptitle("Number of RNA families used by experimental method and resolution", fontsize=16)
766 fig.subplots_adjust(left=0.05, right=0.98, wspace=0.05, 909 fig.subplots_adjust(left=0.05, right=0.98, wspace=0.05,
767 hspace=0.05, bottom=0.12, top=0.84) 910 hspace=0.05, bottom=0.12, top=0.84)
768 - fig.savefig("results/figures/Nfamilies.png") 911 + fig.savefig(runDir + "/results/figures/Nfamilies.png")
769 plt.close() 912 plt.close()
770 913
771 def log_to_pbar(pbar): 914 def log_to_pbar(pbar):
...@@ -776,8 +919,10 @@ def log_to_pbar(pbar): ...@@ -776,8 +919,10 @@ def log_to_pbar(pbar):
776 if __name__ == "__main__": 919 if __name__ == "__main__":
777 920
778 # parse options 921 # parse options
922 + DELETE_OLD_DATA = False
923 + DO_WADLEY_ANALYSIS = False
779 try: 924 try:
780 - opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "resolution=", "3d-folder=", "seq-folder=" ]) 925 + opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "from-scratch", "wadley", "resolution=", "3d-folder=", "seq-folder=" ])
781 except getopt.GetoptError as err: 926 except getopt.GetoptError as err:
782 print(err) 927 print(err)
783 sys.exit(2) 928 sys.exit(2)
...@@ -795,6 +940,7 @@ if __name__ == "__main__": ...@@ -795,6 +940,7 @@ if __name__ == "__main__":
795 "\n\t\t\t\t\tdatapoints/\t\tFinal results in CSV file format.") 940 "\n\t\t\t\t\tdatapoints/\t\tFinal results in CSV file format.")
796 print("--seq-folder=…\t\t\tPath to a folder containing the sequence and alignment files. Required subfolder:" 941 print("--seq-folder=…\t\t\tPath to a folder containing the sequence and alignment files. Required subfolder:"
797 "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family") 942 "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family")
943 + print("--from-scratch\t\t\tDo not use precomputed results from past runs, recompute everything")
798 sys.exit() 944 sys.exit()
799 elif opt == '--version': 945 elif opt == '--version':
800 print("RNANet statistics 1.1 beta") 946 print("RNANet statistics 1.1 beta")
...@@ -810,25 +956,37 @@ if __name__ == "__main__": ...@@ -810,25 +956,37 @@ if __name__ == "__main__":
810 path_to_seq_data = path.abspath(arg) 956 path_to_seq_data = path.abspath(arg)
811 if path_to_seq_data[-1] != '/': 957 if path_to_seq_data[-1] != '/':
812 path_to_seq_data += '/' 958 path_to_seq_data += '/'
959 + elif opt=='--from-scratch':
960 + DELETE_OLD_DATA = True
961 + DO_WADLEY_ANALYSIS = True
962 + subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"])
963 + elif opt=='--wadley':
964 + DO_WADLEY_ANALYSIS = True
813 965
814 966
815 # Load mappings 967 # Load mappings
816 print("Loading mappings list...") 968 print("Loading mappings list...")
817 - with sqlite3.connect("results/RNANet.db") as conn: 969 + with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
818 fam_list = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;") ] 970 fam_list = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;") ]
819 mappings_list = {} 971 mappings_list = {}
820 for k in fam_list: 972 for k in fam_list:
821 - mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain WHERE rfam_acc='{k}' and issue=0;") ] 973 + mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain JOIN structure ON chain.structure_id=structure.pdb_id WHERE rfam_acc='{k}' AND issue=0 AND resolution <= {res_thr};") ]
822 974
823 # List the families for which we will compute sequence identity matrices 975 # List the families for which we will compute sequence identity matrices
824 - with sqlite3.connect("results/RNANet.db") as conn: 976 + with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
825 - famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 0 ORDER BY rfam_acc ASC;") ] 977 + famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain WHERE issue = 0 GROUP BY rfam_acc) WHERE n_chains > 0 ORDER BY rfam_acc ASC;") ]
826 - ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;") ] 978 + ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain WHERE issue = 0 GROUP BY rfam_acc) WHERE n_chains < 3 ORDER BY rfam_acc ASC;") ]
979 + n_unmapped_chains = sql_ask_database(conn, "SELECT COUNT(*) FROM chain WHERE rfam_acc='unmappd' AND issue=0;")[0][0]
827 if len(ignored): 980 if len(ignored):
828 print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n') 981 print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')
829 982
983 + if DELETE_OLD_DATA:
984 + for f in fam_list:
985 + subprocess.run(["rm","-f", runDir + f"/data/{f}.npy", runDir + f"/data/{f}_pairs.csv", runDir + f"/data/{f}_counts.csv"])
986 +
987 +
830 # Prepare the multiprocessing execution environment 988 # Prepare the multiprocessing execution environment
831 - nworkers = max(read_cpu_number()-1, 32) 989 + nworkers = min(read_cpu_number()-1, 32)
832 thr_idx_mgr = Manager() 990 thr_idx_mgr = Manager()
833 idxQueue = thr_idx_mgr.Queue() 991 idxQueue = thr_idx_mgr.Queue()
834 for i in range(nworkers): 992 for i in range(nworkers):
...@@ -836,14 +994,15 @@ if __name__ == "__main__": ...@@ -836,14 +994,15 @@ if __name__ == "__main__":
836 994
837 # Define the tasks 995 # Define the tasks
838 joblist = [] 996 joblist = []
839 - # joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), 4.0))) # res threshold is 4.0 Angstroms by default 997 + if n_unmapped_chains and DO_WADLEY_ANALYSIS:
840 - # joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), 4.0))) # 998 + joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), 20.0))) # res threshold is 4.0 Angstroms by default
999 + joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), 20.0))) #
841 joblist.append(Job(function=stats_len)) # Computes figures 1000 joblist.append(Job(function=stats_len)) # Computes figures
842 - # joblist.append(Job(function=stats_freq)) # updates the database 1001 + joblist.append(Job(function=stats_freq)) # updates the database
843 - # for f in famlist: 1002 + for f in famlist:
844 - # joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database 1003 + joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database
845 - # if f not in ignored: 1004 + if f not in ignored:
846 - # joblist.append(Job(function=to_dist_matrix, args=(f,))) # updates the database 1005 + joblist.append(Job(function=to_dist_matrix, args=(f,))) # updates the database
847 1006
848 p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers) 1007 p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
849 pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, leave=True) 1008 pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, leave=True)
...@@ -867,7 +1026,8 @@ if __name__ == "__main__": ...@@ -867,7 +1026,8 @@ if __name__ == "__main__":
867 print() 1026 print()
868 1027
869 # finish the work after the parallel portions 1028 # finish the work after the parallel portions
870 - # per_chain_stats() 1029 + per_chain_stats()
871 - # seq_idty() 1030 + seq_idty()
872 - # stats_pairs() 1031 + stats_pairs()
1032 + if n_unmapped_chains:
873 general_stats() 1033 general_stats()
......