Showing
7 changed files
with
2255 additions
and
527 deletions
... | @@ -94,6 +94,8 @@ The detailed list of options is below: | ... | @@ -94,6 +94,8 @@ The detailed list of options is below: |
94 | -h [ --help ] Print this help message | 94 | -h [ --help ] Print this help message |
95 | --version Print the program version | 95 | --version Print the program version |
96 | 96 | ||
97 | +-f [ --full-inference ] Infer new 3D->family mappings even if Rfam already provides some. Yields more copies of chains | ||
98 | + mapped to different families. | ||
97 | -r 4.0 [ --resolution=4.0 ] Maximum 3D structure resolution to consider a RNA chain. | 99 | -r 4.0 [ --resolution=4.0 ] Maximum 3D structure resolution to consider a RNA chain. |
98 | -s Run statistics computations after completion | 100 | -s Run statistics computations after completion |
99 | --extract Extract the portions of 3D RNA chains to individual mmCIF files. | 101 | --extract Extract the portions of 3D RNA chains to individual mmCIF files. |
... | @@ -105,7 +107,7 @@ The detailed list of options is below: | ... | @@ -105,7 +107,7 @@ The detailed list of options is below: |
105 | RNAcifs/ Full structures containing RNA, in mmCIF format | 107 | RNAcifs/ Full structures containing RNA, in mmCIF format |
106 | rna_mapped_to_Rfam/ Extracted 'pure' RNA chains | 108 | rna_mapped_to_Rfam/ Extracted 'pure' RNA chains |
107 | datapoints/ Final results in CSV file format. | 109 | datapoints/ Final results in CSV file format. |
108 | ---seq-folder=… Path to a folder to store the sequence and alignment files. | 110 | +--seq-folder=… Path to a folder to store the sequence and alignment files. Subfolders will be: |
109 | rfam_sequences/fasta/ Compressed hits to Rfam families | 111 | rfam_sequences/fasta/ Compressed hits to Rfam families |
110 | realigned/ Sequences, covariance models, and alignments by family | 112 | realigned/ Sequences, covariance models, and alignments by family |
111 | --no-homology Do not try to compute PSSMs and do not align sequences. | 113 | --no-homology Do not try to compute PSSMs and do not align sequences. |
... | @@ -117,11 +119,12 @@ The detailed list of options is below: | ... | @@ -117,11 +119,12 @@ The detailed list of options is below: |
117 | --update-homologous Re-download Rfam and SILVA databases, realign all families, and recompute all CSV files | 119 | --update-homologous Re-download Rfam and SILVA databases, realign all families, and recompute all CSV files |
118 | --from-scratch Delete database, local 3D and sequence files, and known issues, and recompute. | 120 | --from-scratch Delete database, local 3D and sequence files, and known issues, and recompute. |
119 | --archive Create a tar.gz archive of the datapoints text files, and update the link to the latest archive | 121 | --archive Create a tar.gz archive of the datapoints text files, and update the link to the latest archive |
122 | +--no-logs Do not save per-chain logs of the numbering modifications | ||
120 | ``` | 123 | ``` |
121 | 124 | ||
122 | Typical usage: | 125 | Typical usage: |
123 | ``` | 126 | ``` |
124 | -nohup bash -c 'time ~/Projects/RNANet/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --archive' & | 127 | +nohup bash -c 'time ~/Projects/RNANet/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' & |
125 | ``` | 128 | ``` |
126 | 129 | ||
127 | ## Post-computation task: estimate quality | 130 | ## Post-computation task: estimate quality | ... | ... |
1 | #!/usr/bin/python3.8 | 1 | #!/usr/bin/python3.8 |
2 | +import Bio | ||
3 | +import concurrent.futures | ||
4 | +import getopt | ||
5 | +import gzip | ||
6 | +import io | ||
7 | +import json | ||
2 | import numpy as np | 8 | import numpy as np |
9 | +import os | ||
3 | import pandas as pd | 10 | import pandas as pd |
4 | -import concurrent.futures, getopt, gzip, io, json, os, pickle, psutil, re, requests, signal, sqlalchemy, sqlite3, subprocess, sys, time, traceback, warnings | 11 | +import pickle |
5 | -from Bio import AlignIO, SeqIO | 12 | +import psutil |
6 | -from Bio.PDB import MMCIFParser | 13 | +import re |
7 | -from Bio.PDB.mmcifio import MMCIFIO | 14 | +import requests |
8 | -from Bio.PDB.MMCIF2Dict import MMCIF2Dict | 15 | +import signal |
9 | -from Bio.PDB.PDBExceptions import PDBConstructionWarning, BiopythonWarning | 16 | +import sqlalchemy |
10 | -from Bio.PDB.Dice import ChainSelector | 17 | +import sqlite3 |
11 | -from Bio.Alphabet import generic_rna | 18 | +import subprocess |
12 | -from Bio.Seq import Seq | 19 | +import sys |
13 | -from Bio.SeqRecord import SeqRecord | 20 | +import time |
14 | -from Bio.Align import MultipleSeqAlignment, AlignInfo | 21 | +import traceback |
15 | -from collections import OrderedDict, defaultdict | 22 | +import warnings |
16 | from functools import partial, wraps | 23 | from functools import partial, wraps |
17 | -from os import path, makedirs | 24 | +from multiprocessing import Pool, Manager |
18 | -from multiprocessing import Pool, Manager, set_start_method | ||
19 | from time import sleep | 25 | from time import sleep |
20 | from tqdm import tqdm | 26 | from tqdm import tqdm |
21 | from setproctitle import setproctitle | 27 | from setproctitle import setproctitle |
22 | 28 | ||
29 | + | ||
23 | def trace_unhandled_exceptions(func): | 30 | def trace_unhandled_exceptions(func): |
24 | @wraps(func) | 31 | @wraps(func) |
25 | def wrapped_func(*args, **kwargs): | 32 | def wrapped_func(*args, **kwargs): |
... | @@ -36,10 +43,11 @@ def trace_unhandled_exceptions(func): | ... | @@ -36,10 +43,11 @@ def trace_unhandled_exceptions(func): |
36 | print(s) | 43 | print(s) |
37 | return wrapped_func | 44 | return wrapped_func |
38 | 45 | ||
46 | + | ||
39 | pd.set_option('display.max_rows', None) | 47 | pd.set_option('display.max_rows', None) |
40 | sqlite3.enable_callback_tracebacks(True) | 48 | sqlite3.enable_callback_tracebacks(True) |
41 | sqlite3.register_adapter(np.int64, lambda val: int(val)) # Tell Sqlite what to do with <class numpy.int64> objects ---> convert to int | 49 | sqlite3.register_adapter(np.int64, lambda val: int(val)) # Tell Sqlite what to do with <class numpy.int64> objects ---> convert to int |
42 | -sqlite3.register_adapter(np.float64, lambda val: float(val)) # Tell Sqlite what to do with <class numpy.int64> objects ---> convert to int | 50 | +sqlite3.register_adapter(np.float64, lambda val: float(val)) # Tell Sqlite what to do with <class numpy.float64> objects ---> convert to float |
43 | 51 | ||
44 | m = Manager() | 52 | m = Manager() |
45 | running_stats = m.list() | 53 | running_stats = m.list() |
... | @@ -52,11 +60,14 @@ validsymb = '\U00002705' | ... | @@ -52,11 +60,14 @@ validsymb = '\U00002705' |
52 | warnsymb = '\U000026A0' | 60 | warnsymb = '\U000026A0' |
53 | errsymb = '\U0000274C' | 61 | errsymb = '\U0000274C' |
54 | 62 | ||
55 | -LSU_set = {"RF00002", "RF02540", "RF02541", "RF02543", "RF02546"} # From Rfam CLAN 00112 | 63 | +LSU_set = {"RF00002", "RF02540", "RF02541", |
56 | -SSU_set = {"RF00177", "RF02542", "RF02545", "RF01959", "RF01960"} # From Rfam CLAN 00111 | 64 | + "RF02543", "RF02546"} # From Rfam CLAN 00112 |
65 | +SSU_set = {"RF00177", "RF02542", "RF02545", | ||
66 | + "RF01959", "RF01960"} # From Rfam CLAN 00111 | ||
57 | no_nts_set = set() | 67 | no_nts_set = set() |
58 | weird_mappings = set() | 68 | weird_mappings = set() |
59 | 69 | ||
70 | + | ||
60 | class SelectivePortionSelector(object): | 71 | class SelectivePortionSelector(object): |
61 | """Class passed to MMCIFIO to select some chain portions in an MMCIF file. | 72 | """Class passed to MMCIFIO to select some chain portions in an MMCIF file. |
62 | 73 | ||
... | @@ -101,7 +112,7 @@ class SelectivePortionSelector(object): | ... | @@ -101,7 +112,7 @@ class SelectivePortionSelector(object): |
101 | return 1 | 112 | return 1 |
102 | 113 | ||
103 | 114 | ||
104 | -class BufferingSummaryInfo(AlignInfo.SummaryInfo): | 115 | +class BufferingSummaryInfo(Bio.Align.AlignInfo.SummaryInfo): |
105 | 116 | ||
106 | def get_pssm(self, family, index): | 117 | def get_pssm(self, family, index): |
107 | """Create a position specific score matrix object for the alignment. | 118 | """Create a position specific score matrix object for the alignment. |
... | @@ -128,7 +139,7 @@ class BufferingSummaryInfo(AlignInfo.SummaryInfo): | ... | @@ -128,7 +139,7 @@ class BufferingSummaryInfo(AlignInfo.SummaryInfo): |
128 | score_dict[this_residue] = 1.0 | 139 | score_dict[this_residue] = 1.0 |
129 | pssm_info.append(('*', score_dict)) | 140 | pssm_info.append(('*', score_dict)) |
130 | 141 | ||
131 | - return AlignInfo.PSSM(pssm_info) | 142 | + return Bio.Align.AlignInfo.PSSM(pssm_info) |
132 | 143 | ||
133 | 144 | ||
134 | class Chain: | 145 | class Chain: |
... | @@ -187,11 +198,11 @@ class Chain: | ... | @@ -187,11 +198,11 @@ class Chain: |
187 | 198 | ||
188 | with warnings.catch_warnings(): | 199 | with warnings.catch_warnings(): |
189 | # Ignore the PDB problems. This mostly warns that some chain is discontinuous. | 200 | # Ignore the PDB problems. This mostly warns that some chain is discontinuous. |
190 | - warnings.simplefilter('ignore', PDBConstructionWarning) | 201 | + warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.PDBConstructionWarning) |
191 | - warnings.simplefilter('ignore', BiopythonWarning) | 202 | + warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning) |
192 | 203 | ||
193 | # Load the whole mmCIF into a Biopython structure object: | 204 | # Load the whole mmCIF into a Biopython structure object: |
194 | - mmcif_parser = MMCIFParser() | 205 | + mmcif_parser = Bio.PDB.MMCIFParser() |
195 | try: | 206 | try: |
196 | s = mmcif_parser.get_structure(self.pdb_id, path_to_3D_data + "RNAcifs/"+self.pdb_id+".cif") | 207 | s = mmcif_parser.get_structure(self.pdb_id, path_to_3D_data + "RNAcifs/"+self.pdb_id+".cif") |
197 | except ValueError as e: | 208 | except ValueError as e: |
... | @@ -212,7 +223,7 @@ class Chain: | ... | @@ -212,7 +223,7 @@ class Chain: |
212 | sel = SelectivePortionSelector(model_idx, self.pdb_chain_id, valid_set, khetatm) | 223 | sel = SelectivePortionSelector(model_idx, self.pdb_chain_id, valid_set, khetatm) |
213 | 224 | ||
214 | # Save that selection on the mmCIF object s to file | 225 | # Save that selection on the mmCIF object s to file |
215 | - ioobj = MMCIFIO() | 226 | + ioobj = Bio.PDB.mmcifio.MMCIFIO() |
216 | ioobj.set_structure(s) | 227 | ioobj.set_structure(s) |
217 | ioobj.save(self.file, sel) | 228 | ioobj.save(self.file, sel) |
218 | 229 | ||
... | @@ -253,7 +264,7 @@ class Chain: | ... | @@ -253,7 +264,7 @@ class Chain: |
253 | # Create the Pandas DataFrame for the nucleotides of the right chain | 264 | # Create the Pandas DataFrame for the nucleotides of the right chain |
254 | nts = json_object["nts"] # sub-json-object | 265 | nts = json_object["nts"] # sub-json-object |
255 | df = pd.DataFrame(nts) # conversion to dataframe | 266 | df = pd.DataFrame(nts) # conversion to dataframe |
256 | - df = df[ df.chain_name == self.pdb_chain_id ] # keeping only this chain's nucleotides | 267 | + df = df[df.chain_name == self.pdb_chain_id] # keeping only this chain's nucleotides |
257 | 268 | ||
258 | # Assert nucleotides of the chain are found | 269 | # Assert nucleotides of the chain are found |
259 | if df.empty: | 270 | if df.empty: |
... | @@ -266,12 +277,12 @@ class Chain: | ... | @@ -266,12 +277,12 @@ class Chain: |
266 | # Remove low pertinence or undocumented descriptors, convert angles values | 277 | # Remove low pertinence or undocumented descriptors, convert angles values |
267 | cols_we_keep = ["index_chain", "nt_resnum", "nt_name", "nt_code", "nt_id", "dbn", "alpha", "beta", "gamma", "delta", "epsilon", "zeta", | 278 | cols_we_keep = ["index_chain", "nt_resnum", "nt_name", "nt_code", "nt_id", "dbn", "alpha", "beta", "gamma", "delta", "epsilon", "zeta", |
268 | "epsilon_zeta", "bb_type", "chi", "glyco_bond", "form", "ssZp", "Dp", "eta", "theta", "eta_prime", "theta_prime", "eta_base", "theta_base", | 279 | "epsilon_zeta", "bb_type", "chi", "glyco_bond", "form", "ssZp", "Dp", "eta", "theta", "eta_prime", "theta_prime", "eta_base", "theta_base", |
269 | - "v0", "v1", "v2", "v3", "v4", "amplitude", "phase_angle", "puckering" ] | 280 | + "v0", "v1", "v2", "v3", "v4", "amplitude", "phase_angle", "puckering"] |
270 | df = df[cols_we_keep] | 281 | df = df[cols_we_keep] |
271 | - df.loc[:,['alpha', 'beta','gamma','delta','epsilon','zeta','epsilon_zeta','chi','v0', 'v1', 'v2', 'v3', 'v4', # Conversion to radians | 282 | + df.loc[:, ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'epsilon_zeta', 'chi', 'v0', 'v1', 'v2', 'v3', 'v4', # Conversion to radians |
272 | - 'eta','theta','eta_prime','theta_prime','eta_base','theta_base', 'phase_angle']] *= np.pi/180.0 | 283 | + 'eta', 'theta', 'eta_prime', 'theta_prime', 'eta_base', 'theta_base', 'phase_angle']] *= np.pi/180.0 |
273 | - df.loc[:,['alpha', 'beta','gamma','delta','epsilon','zeta','epsilon_zeta','chi','v0', 'v1', 'v2', 'v3', 'v4', # mapping [-pi, pi] into [0, 2pi] | 284 | + df.loc[:, ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'epsilon_zeta', 'chi', 'v0', 'v1', 'v2', 'v3', 'v4', # mapping [-pi, pi] into [0, 2pi] |
274 | - 'eta','theta','eta_prime','theta_prime','eta_base','theta_base', 'phase_angle']] %= (2.0*np.pi) | 285 | + 'eta', 'theta', 'eta_prime', 'theta_prime', 'eta_base', 'theta_base', 'phase_angle']] %= (2.0*np.pi) |
275 | 286 | ||
276 | except KeyError as e: | 287 | except KeyError as e: |
277 | warn(f"Error while parsing DSSR {self.pdb_id}.json output:{e}", error=True) | 288 | warn(f"Error while parsing DSSR {self.pdb_id}.json output:{e}", error=True) |
... | @@ -295,14 +306,14 @@ class Chain: | ... | @@ -295,14 +306,14 @@ class Chain: |
295 | # Duplicate residue numbers : shift numbering | 306 | # Duplicate residue numbers : shift numbering |
296 | while True in df.duplicated(['nt_resnum']).values: | 307 | while True in df.duplicated(['nt_resnum']).values: |
297 | i = df.duplicated(['nt_resnum']).values.tolist().index(True) | 308 | i = df.duplicated(['nt_resnum']).values.tolist().index(True) |
298 | - duplicates = df[df.nt_resnum == df.iloc[i,1]] | 309 | + duplicates = df[df.nt_resnum == df.iloc[i, 1]] |
299 | n_dup = len(duplicates.nt_resnum) | 310 | n_dup = len(duplicates.nt_resnum) |
300 | index_last_dup = duplicates.index_chain.iloc[-1] - 1 | 311 | index_last_dup = duplicates.index_chain.iloc[-1] - 1 |
301 | if self.mapping is not None: | 312 | if self.mapping is not None: |
302 | self.mapping.log(f"Shifting nt_resnum numbering because of {n_dup} duplicate residues {df.iloc[i,1]}") | 313 | self.mapping.log(f"Shifting nt_resnum numbering because of {n_dup} duplicate residues {df.iloc[i,1]}") |
303 | 314 | ||
304 | try: | 315 | try: |
305 | - if i > 0 and index_last_dup +1 < len(df.index) and df.iloc[i,1] == df.iloc[i-1,1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]: | 316 | + if i > 0 and index_last_dup + 1 < len(df.index) and df.iloc[i, 1] == df.iloc[i-1, 1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]: |
306 | # The redundant nts are consecutive in the chain (at the begining at least), and there is a gap at the end | 317 | # The redundant nts are consecutive in the chain (at the begining at least), and there is a gap at the end |
307 | 318 | ||
308 | if duplicates.iloc[n_dup-1, 0] - duplicates.iloc[0, 0] + 1 == n_dup: | 319 | if duplicates.iloc[n_dup-1, 0] - duplicates.iloc[0, 0] + 1 == n_dup: |
... | @@ -314,15 +325,15 @@ class Chain: | ... | @@ -314,15 +325,15 @@ class Chain: |
314 | else: | 325 | else: |
315 | # We solve the problem continous component by continuous component | 326 | # We solve the problem continous component by continuous component |
316 | for j in range(1, n_dup+1): | 327 | for j in range(1, n_dup+1): |
317 | - if duplicates.iloc[j,0] == 1 + duplicates.iloc[j-1,0]: # continuous | 328 | + if duplicates.iloc[j, 0] == 1 + duplicates.iloc[j-1, 0]: # continuous |
318 | - df.iloc[i+j-1,1] += 1 | 329 | + df.iloc[i+j-1, 1] += 1 |
319 | else: | 330 | else: |
320 | break | 331 | break |
321 | - elif df.iloc[i,1] == df.iloc[i-1,1]: | 332 | + elif df.iloc[i, 1] == df.iloc[i-1, 1]: |
322 | # Common 4v9q-DV case (and similar ones) : e.g. chains contains 17 and 17A which are both read 17 by DSSR. | 333 | # Common 4v9q-DV case (and similar ones) : e.g. chains contains 17 and 17A which are both read 17 by DSSR. |
323 | # Solution : we shift the numbering of 17A (to 18) and the following residues. | 334 | # Solution : we shift the numbering of 17A (to 18) and the following residues. |
324 | df.iloc[i:, 1] += 1 | 335 | df.iloc[i:, 1] += 1 |
325 | - elif duplicates.iloc[0,0] == 1 and df.iloc[i,0] == 3: | 336 | + elif duplicates.iloc[0, 0] == 1 and df.iloc[i, 0] == 3: |
326 | # 4wzo_1_1J case, there is a residue numbered -1 and read as 1 before the number 0. | 337 | # 4wzo_1_1J case, there is a residue numbered -1 and read as 1 before the number 0. |
327 | df.iloc[1:, 1] += 1 | 338 | df.iloc[1:, 1] += 1 |
328 | df.iloc[0, 1] = 0 | 339 | df.iloc[0, 1] = 0 |
... | @@ -340,12 +351,16 @@ class Chain: | ... | @@ -340,12 +351,16 @@ class Chain: |
340 | 351 | ||
341 | # Search for ligands at the end of the selection | 352 | # Search for ligands at the end of the selection |
342 | # Drop ligands detected as residues by DSSR, by detecting several markers | 353 | # Drop ligands detected as residues by DSSR, by detecting several markers |
343 | - while ( len(df.index_chain) and df.iloc[-1,2] not in ["A", "C", "G", "U"] and ( | 354 | + while ( |
344 | - (df.iloc[[-1]][["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "v0", "v1", "v2", "v3", "v4"]].isna().values).all() | 355 | + len(df.index_chain) and df.iloc[-1, 2] not in ["A", "C", "G", "U"] |
345 | - or (df.iloc[[-1]].puckering=='').any() | 356 | + and ( |
357 | + (df.iloc[[-1]][["alpha", "beta", "gamma", "delta", "epsilon", | ||
358 | + "zeta", "v0", "v1", "v2", "v3", "v4"]].isna().values).all() | ||
359 | + or (df.iloc[[-1]].puckering == '').any() | ||
346 | ) | 360 | ) |
347 | - or ( len(df.index_chain) >= 2 and df.iloc[-1,1] > 50 + df.iloc[-2,1] ) # large nt_resnum gap between the two last residues | 361 | + # large nt_resnum gap between the two last residues |
348 | - or ( len(df.index_chain) and df.iloc[-1,2] in ["GNG", "E2C", "OHX", "IRI", "MPD", "8UZ"] ) | 362 | + or (len(df.index_chain) >= 2 and df.iloc[-1, 1] > 50 + df.iloc[-2, 1]) |
363 | + or (len(df.index_chain) and df.iloc[-1, 2] in ["GNG", "E2C", "OHX", "IRI", "MPD", "8UZ"]) | ||
349 | ): | 364 | ): |
350 | if self.mapping is not None: | 365 | if self.mapping is not None: |
351 | self.mapping.log("Droping ligand:") | 366 | self.mapping.log("Droping ligand:") |
... | @@ -390,17 +405,19 @@ class Chain: | ... | @@ -390,17 +405,19 @@ class Chain: |
390 | break | 405 | break |
391 | if found: | 406 | if found: |
392 | self.mapping.log(f"Residue {i+1+self.mapping.st}-{self.mapping.st} = {i+1} has been saved and renumbered {df.iloc[i,1]} instead of {found['nt_id'].replace(found['chain_name']+ '.' + found['nt_name'], '').replace('^','')}") | 407 | self.mapping.log(f"Residue {i+1+self.mapping.st}-{self.mapping.st} = {i+1} has been saved and renumbered {df.iloc[i,1]} instead of {found['nt_id'].replace(found['chain_name']+ '.' + found['nt_name'], '').replace('^','')}") |
393 | - df_row = pd.DataFrame([found], index=[i])[df.columns.values] | 408 | + df_row = pd.DataFrame([found], index=[i])[ |
394 | - df_row.iloc[0,0] = i+1 # index_chain | 409 | + df.columns.values] |
395 | - df_row.iloc[0,1] = df.iloc[i,1] # nt_resnum | 410 | + df_row.iloc[0, 0] = i+1 # index_chain |
396 | - df = pd.concat([ df.iloc[:i], df_row, df.iloc[i:] ]) | 411 | + df_row.iloc[0, 1] = df.iloc[i, 1] # nt_resnum |
412 | + df = pd.concat([df.iloc[:i], df_row, df.iloc[i:]]) | ||
397 | df.iloc[i+1:, 1] += 1 | 413 | df.iloc[i+1:, 1] += 1 |
398 | else: | 414 | else: |
399 | warn(f"Missing index_chain {i} in {self.chain_label} !") | 415 | warn(f"Missing index_chain {i} in {self.chain_label} !") |
400 | 416 | ||
401 | # Assert some nucleotides still exist | 417 | # Assert some nucleotides still exist |
402 | try: | 418 | try: |
403 | - l = df.iloc[-1,1] - df.iloc[0,1] + 1 # update length of chain from nt_resnum point of view | 419 | + # update length of chain from nt_resnum point of view |
420 | + l = df.iloc[-1, 1] - df.iloc[0, 1] + 1 | ||
404 | except IndexError: | 421 | except IndexError: |
405 | warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} between {self.mapping.nt_start} and " | 422 | warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} between {self.mapping.nt_start} and " |
406 | f"{self.mapping.nt_end} ({'not ' if not self.mapping.inferred else ''}inferred). Ignoring chain {self.chain_label}.") | 423 | f"{self.mapping.nt_end} ({'not ' if not self.mapping.inferred else ''}inferred). Ignoring chain {self.chain_label}.") |
... | @@ -426,14 +443,17 @@ class Chain: | ... | @@ -426,14 +443,17 @@ class Chain: |
426 | # index_chain 1 |-------------|77 83|------------| 154 | 443 | # index_chain 1 |-------------|77 83|------------| 154 |
427 | # expected data point 1 |--------------------------------| 154 | 444 | # expected data point 1 |--------------------------------| 154 |
428 | # | 445 | # |
446 | + | ||
429 | if l != len(df['index_chain']): # if some residues are missing, len(df['index_chain']) < l | 447 | if l != len(df['index_chain']): # if some residues are missing, len(df['index_chain']) < l |
430 | - resnum_start = df.iloc[0,1] | 448 | + resnum_start = df.iloc[0, 1] |
431 | - diff = set(range(l)).difference(df['nt_resnum'] - resnum_start) # the rowIDs the missing nucleotides would have (rowID = index_chain - 1 = nt_resnum - resnum_start) | 449 | + # the rowIDs the missing nucleotides would have (rowID = index_chain - 1 = nt_resnum - resnum_start) |
450 | + diff = set(range(l)).difference(df['nt_resnum'] - resnum_start) | ||
432 | for i in sorted(diff): | 451 | for i in sorted(diff): |
433 | # Add a row at position i | 452 | # Add a row at position i |
434 | - df = pd.concat([ df.iloc[:i], | 453 | + df = pd.concat([df.iloc[:i], |
435 | - pd.DataFrame({"index_chain": i+1, "nt_resnum": i+resnum_start, "nt_id":"not resolved", "nt_code":'-', "nt_name":'-'}, index=[i]), | 454 | + pd.DataFrame({"index_chain": i+1, "nt_resnum": i+resnum_start, |
436 | - df.iloc[i:] ]) | 455 | + "nt_id": "not resolved", "nt_code": '-', "nt_name": '-'}, index=[i]), |
456 | + df.iloc[i:]]) | ||
437 | # Increase the index_chain of all following lines | 457 | # Increase the index_chain of all following lines |
438 | df.iloc[i+1:, 0] += 1 | 458 | df.iloc[i+1:, 0] += 1 |
439 | df = df.reset_index(drop=True) | 459 | df = df.reset_index(drop=True) |
... | @@ -444,27 +464,27 @@ class Chain: | ... | @@ -444,27 +464,27 @@ class Chain: |
444 | ####################################### | 464 | ####################################### |
445 | 465 | ||
446 | # Add a sequence column just for the alignments | 466 | # Add a sequence column just for the alignments |
447 | - df['nt_align_code'] = [ str(x).upper() | 467 | + df['nt_align_code'] = [str(x).upper() |
448 | .replace('NAN', '-') # Unresolved nucleotides are gaps | 468 | .replace('NAN', '-') # Unresolved nucleotides are gaps |
449 | .replace('?', '-') # Unidentified residues, let's delete them | 469 | .replace('?', '-') # Unidentified residues, let's delete them |
450 | .replace('T', 'U') # 5MU are modified to t, which gives T | 470 | .replace('T', 'U') # 5MU are modified to t, which gives T |
451 | .replace('P', 'U') # Pseudo-uridines, but it is not really right to change them to U, see DSSR paper, Fig 2 | 471 | .replace('P', 'U') # Pseudo-uridines, but it is not really right to change them to U, see DSSR paper, Fig 2 |
452 | - for x in df['nt_code'] ] | 472 | + for x in df['nt_code']] |
453 | 473 | ||
454 | # One-hot encoding sequence | 474 | # One-hot encoding sequence |
455 | - df["is_A"] = [ 1 if x=="A" else 0 for x in df["nt_code"] ] | 475 | + df["is_A"] = [1 if x == "A" else 0 for x in df["nt_code"]] |
456 | - df["is_C"] = [ 1 if x=="C" else 0 for x in df["nt_code"] ] | 476 | + df["is_C"] = [1 if x == "C" else 0 for x in df["nt_code"]] |
457 | - df["is_G"] = [ 1 if x=="G" else 0 for x in df["nt_code"] ] | 477 | + df["is_G"] = [1 if x == "G" else 0 for x in df["nt_code"]] |
458 | - df["is_U"] = [ 1 if x=="U" else 0 for x in df["nt_code"] ] | 478 | + df["is_U"] = [1 if x == "U" else 0 for x in df["nt_code"]] |
459 | - df["is_other"] = [ 0 if x in "ACGU" else 1 for x in df["nt_code"] ] | 479 | + df["is_other"] = [0 if x in "ACGU" else 1 for x in df["nt_code"]] |
460 | df["nt_position"] = [ float(i+1)/self.full_length for i in range(self.full_length) ] | 480 | df["nt_position"] = [ float(i+1)/self.full_length for i in range(self.full_length) ] |
461 | 481 | ||
462 | # Iterate over pairs to identify base-base interactions | 482 | # Iterate over pairs to identify base-base interactions |
463 | res_ids = list(df['nt_id']) # things like "chainID.C4, chainID.U5" | 483 | res_ids = list(df['nt_id']) # things like "chainID.C4, chainID.U5" |
464 | - paired = [ '' ] * self.full_length | 484 | + paired = [''] * self.full_length |
465 | - pair_type_LW = [ '' ] * self.full_length | 485 | + pair_type_LW = [''] * self.full_length |
466 | - pair_type_DSSR = [ '' ] * self.full_length | 486 | + pair_type_DSSR = [''] * self.full_length |
467 | - interacts = [ 0 ] * self.full_length | 487 | + interacts = [0] * self.full_length |
468 | if "pairs" in json_object.keys(): | 488 | if "pairs" in json_object.keys(): |
469 | pairs = json_object["pairs"] | 489 | pairs = json_object["pairs"] |
470 | for p in pairs: | 490 | for p in pairs: |
... | @@ -506,17 +526,19 @@ class Chain: | ... | @@ -506,17 +526,19 @@ class Chain: |
506 | paired[nt2_idx] += ',' + str(nt1_idx + 1) | 526 | paired[nt2_idx] += ',' + str(nt1_idx + 1) |
507 | 527 | ||
508 | # transform nt_id to shorter values | 528 | # transform nt_id to shorter values |
509 | - df['old_nt_resnum'] = [ n.replace(self.pdb_chain_id+'.'+name, '').replace('^','').replace('/','') for n, name in zip(df.nt_id, df.nt_name) ] | 529 | + df['old_nt_resnum'] = [ n.replace(self.pdb_chain_id+'.'+name, '').replace('^', '').replace('/', '') for n, name in zip(df.nt_id, df.nt_name) ] |
510 | 530 | ||
511 | df['paired'] = paired | 531 | df['paired'] = paired |
512 | df['pair_type_LW'] = pair_type_LW | 532 | df['pair_type_LW'] = pair_type_LW |
513 | df['pair_type_DSSR'] = pair_type_DSSR | 533 | df['pair_type_DSSR'] = pair_type_DSSR |
514 | df['nb_interact'] = interacts | 534 | df['nb_interact'] = interacts |
515 | - df = df.drop(['nt_id', 'nt_resnum'], axis=1) # remove now useless descriptors | 535 | + |
536 | + # remove now useless descriptors | ||
537 | + df = df.drop(['nt_id', 'nt_resnum'], axis=1) | ||
516 | 538 | ||
517 | self.seq = "".join(df.nt_code) | 539 | self.seq = "".join(df.nt_code) |
518 | self.seq_to_align = "".join(df.nt_align_code) | 540 | self.seq_to_align = "".join(df.nt_align_code) |
519 | - self.length = len([ x for x in self.seq_to_align if x != "-" ]) | 541 | + self.length = len([x for x in self.seq_to_align if x != "-"]) |
520 | 542 | ||
521 | # Remove too short chains | 543 | # Remove too short chains |
522 | if self.length < 5: | 544 | if self.length < 5: |
... | @@ -559,7 +581,8 @@ class Chain: | ... | @@ -559,7 +581,8 @@ class Chain: |
559 | WHERE structure_id='{self.pdb_id}' | 581 | WHERE structure_id='{self.pdb_id}' |
560 | AND chain_name='{self.pdb_chain_id}' | 582 | AND chain_name='{self.pdb_chain_id}' |
561 | AND rfam_acc='{self.mapping.rfam_acc}' | 583 | AND rfam_acc='{self.mapping.rfam_acc}' |
562 | - AND eq_class='{self.eq_class}';""")[0][0] | 584 | + AND eq_class='{self.eq_class}';""" |
585 | + )[0][0] | ||
563 | else: | 586 | else: |
564 | sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, eq_class, issue) VALUES (?, ?, 'unmappd', ?, ?) | 587 | sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, eq_class, issue) VALUES (?, ?, 'unmappd', ?, ?) |
565 | ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue, eq_class=excluded.eq_class;""", | 588 | ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue, eq_class=excluded.eq_class;""", |
... | @@ -568,19 +591,18 @@ class Chain: | ... | @@ -568,19 +591,18 @@ class Chain: |
568 | WHERE structure_id='{self.pdb_id}' | 591 | WHERE structure_id='{self.pdb_id}' |
569 | AND chain_name='{self.pdb_chain_id}' | 592 | AND chain_name='{self.pdb_chain_id}' |
570 | AND eq_class='{self.eq_class}' | 593 | AND eq_class='{self.eq_class}' |
571 | - AND rfam_acc = 'unmappd';""")[0][0] | 594 | + AND rfam_acc = 'unmappd';""" |
595 | + )[0][0] | ||
572 | 596 | ||
573 | # Add the nucleotides if the chain is not an issue | 597 | # Add the nucleotides if the chain is not an issue |
574 | if df is not None and not self.delete_me: # double condition is theoretically redundant here, but you never know | 598 | if df is not None and not self.delete_me: # double condition is theoretically redundant here, but you never know |
575 | - sql_execute(conn, f""" | 599 | + sql_execute(conn, f"""INSERT OR IGNORE INTO nucleotide |
576 | - INSERT OR IGNORE INTO nucleotide | ||
577 | (chain_id, index_chain, nt_name, nt_code, dbn, alpha, beta, gamma, delta, epsilon, zeta, | 600 | (chain_id, index_chain, nt_name, nt_code, dbn, alpha, beta, gamma, delta, epsilon, zeta, |
578 | epsilon_zeta, bb_type, chi, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base, | 601 | epsilon_zeta, bb_type, chi, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base, |
579 | v0, v1, v2, v3, v4, amplitude, phase_angle, puckering, nt_align_code, is_A, is_C, is_G, is_U, is_other, nt_position, | 602 | v0, v1, v2, v3, v4, amplitude, phase_angle, puckering, nt_align_code, is_A, is_C, is_G, is_U, is_other, nt_position, |
580 | old_nt_resnum, paired, pair_type_LW, pair_type_DSSR, nb_interact) | 603 | old_nt_resnum, paired, pair_type_LW, pair_type_DSSR, nb_interact) |
581 | VALUES ({self.db_chain_id}, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, | 604 | VALUES ({self.db_chain_id}, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, |
582 | - ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, | 605 | + ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""", |
583 | - ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""", | ||
584 | many=True, data=list(df.to_records(index=False)), warn_every=10) | 606 | many=True, data=list(df.to_records(index=False)), warn_every=10) |
585 | 607 | ||
586 | def remap(self, columns_to_save, s_seq): | 608 | def remap(self, columns_to_save, s_seq): |
... | @@ -598,40 +620,39 @@ class Chain: | ... | @@ -598,40 +620,39 @@ class Chain: |
598 | # Save colums in the appropriate positions | 620 | # Save colums in the appropriate positions |
599 | i = 0 | 621 | i = 0 |
600 | j = 0 | 622 | j = 0 |
601 | - while i<self.full_length and j<alilen: | 623 | + while i < self.full_length and j < alilen: |
602 | # Here we try to map self.seq_to_align (the sequence of the 3D chain, including gaps when residues are missing), | 624 | # Here we try to map self.seq_to_align (the sequence of the 3D chain, including gaps when residues are missing), |
603 | # with s_seq, the sequence aligned in the MSA, containing any of ACGU and two types of gaps, - and . | 625 | # with s_seq, the sequence aligned in the MSA, containing any of ACGU and two types of gaps, - and . |
604 | 626 | ||
605 | if self.seq_to_align[i] == s_seq[j].upper(): # alignment and sequence correspond (incl. gaps) | 627 | if self.seq_to_align[i] == s_seq[j].upper(): # alignment and sequence correspond (incl. gaps) |
606 | - re_mappings.append( (self.db_chain_id, i+1, j+1) ) # because index_chain in table nucleotide is in [1,N], we use i+1 and j+1. | 628 | + re_mappings.append((self.db_chain_id, i+1, j+1)) # because index_chain in table nucleotide is in [1,N], we use i+1 and j+1. |
607 | columns_to_save.add(j+1) # it's a set, duplicates are automatically ignored | 629 | columns_to_save.add(j+1) # it's a set, duplicates are automatically ignored |
608 | i += 1 | 630 | i += 1 |
609 | j += 1 | 631 | j += 1 |
610 | elif self.seq_to_align[i] == '-': # gap in the chain, but not in the aligned sequence | 632 | elif self.seq_to_align[i] == '-': # gap in the chain, but not in the aligned sequence |
611 | - | ||
612 | # search for a gap to the consensus nearby | 633 | # search for a gap to the consensus nearby |
613 | k = 0 # Search must start at zero to assert the difference comes from '-' in front of '.' | 634 | k = 0 # Search must start at zero to assert the difference comes from '-' in front of '.' |
614 | - while j+k<alilen and s_seq[j+k] == '.': | 635 | + while j+k < alilen and s_seq[j+k] == '.': |
615 | k += 1 | 636 | k += 1 |
616 | 637 | ||
617 | # if found, set j to that position | 638 | # if found, set j to that position |
618 | - if j+k<alilen and s_seq[j+k] == '-': | 639 | + if j+k < alilen and s_seq[j+k] == '-': |
619 | - re_mappings.append( (self.db_chain_id, i+1, j+k+1) ) | 640 | + re_mappings.append((self.db_chain_id, i+1, j+k+1)) |
620 | columns_to_save.add(j+k+1) | 641 | columns_to_save.add(j+k+1) |
621 | i += 1 | 642 | i += 1 |
622 | j += k+1 | 643 | j += k+1 |
623 | continue | 644 | continue |
624 | 645 | ||
625 | # if not, take the insertion gap if this is one | 646 | # if not, take the insertion gap if this is one |
626 | - if j<alilen and s_seq[j] == '.': | 647 | + if j < alilen and s_seq[j] == '.': |
627 | - re_mappings.append( (self.db_chain_id, i+1, j+1) ) | 648 | + re_mappings.append((self.db_chain_id, i+1, j+1)) |
628 | columns_to_save.add(j+1) | 649 | columns_to_save.add(j+1) |
629 | i += 1 | 650 | i += 1 |
630 | j += 1 | 651 | j += 1 |
631 | continue | 652 | continue |
632 | 653 | ||
633 | # else, just mark the gap as unknown (there is an alignment mismatch) | 654 | # else, just mark the gap as unknown (there is an alignment mismatch) |
634 | - re_mappings.append( (self.db_chain_id, i+1, 0) ) | 655 | + re_mappings.append((self.db_chain_id, i+1, 0)) |
635 | i += 1 | 656 | i += 1 |
636 | elif s_seq[j] in ['.', '-']: # gap in the alignment, but not in the real chain | 657 | elif s_seq[j] in ['.', '-']: # gap in the alignment, but not in the real chain |
637 | j += 1 # ignore the column | 658 | j += 1 # ignore the column |
... | @@ -672,7 +693,7 @@ class Chain: | ... | @@ -672,7 +693,7 @@ class Chain: |
672 | l = letters[freq.index(max(freq))] | 693 | l = letters[freq.index(max(freq))] |
673 | c_seq_to_align[i] = l | 694 | c_seq_to_align[i] = l |
674 | c_seq[i] = l | 695 | c_seq[i] = l |
675 | - gaps.append((l, l=='A', l=='C', l=='G', l=='U', l=='N', self.db_chain_id, i+1 )) | 696 | + gaps.append((l, l == 'A', l == 'C', l == 'G', l == 'U', l == 'N', self.db_chain_id, i+1)) |
676 | self.seq_to_align = ''.join(c_seq_to_align) | 697 | self.seq_to_align = ''.join(c_seq_to_align) |
677 | self.seq = ''.join(c_seq) | 698 | self.seq = ''.join(c_seq) |
678 | return gaps | 699 | return gaps |
... | @@ -684,6 +705,7 @@ class Job: | ... | @@ -684,6 +705,7 @@ class Job: |
684 | This could be a system command or the execution of a Python function. | 705 | This could be a system command or the execution of a Python function. |
685 | Time and memory usage of a job can be monitored. | 706 | Time and memory usage of a job can be monitored. |
686 | """ | 707 | """ |
708 | + | ||
687 | def __init__(self, results="", command=[], function=None, args=[], how_many_in_parallel=0, priority=1, timeout=None, checkFunc=None, checkArgs=[], label=""): | 709 | def __init__(self, results="", command=[], function=None, args=[], how_many_in_parallel=0, priority=1, timeout=None, checkFunc=None, checkArgs=[], label=""): |
688 | self.cmd_ = command # A system command to run | 710 | self.cmd_ = command # A system command to run |
689 | self.func_ = function # A python function to run | 711 | self.func_ = function # A python function to run |
... | @@ -709,7 +731,8 @@ class Job: | ... | @@ -709,7 +731,8 @@ class Job: |
709 | if self.func_ is None: | 731 | if self.func_ is None: |
710 | s = f"{self.priority_}({self.nthreads}) [{self.comp_time}]\t{self.label:25}" + " ".join(self.cmd_) | 732 | s = f"{self.priority_}({self.nthreads}) [{self.comp_time}]\t{self.label:25}" + " ".join(self.cmd_) |
711 | else: | 733 | else: |
712 | - s = f"{self.priority_}({self.nthreads}) [{self.comp_time}]\t{self.label:25}{self.func_.__name__}(" + " ".join([str(a) for a in self.args_]) + ")" | 734 | + s = f"{self.priority_}({self.nthreads}) [{self.comp_time}]\t{self.label:25}{self.func_.__name__}(" \ |
735 | + + " ".join([ str(a) for a in self.args_ ]) + ")" | ||
713 | return s | 736 | return s |
714 | 737 | ||
715 | 738 | ||
... | @@ -767,13 +790,14 @@ class Downloader: | ... | @@ -767,13 +790,14 @@ class Downloader: |
767 | print("> Fetching latest PDB mappings from Rfam..." + " " * 29, end='', flush=True) | 790 | print("> Fetching latest PDB mappings from Rfam..." + " " * 29, end='', flush=True) |
768 | try: | 791 | try: |
769 | db_connection = sqlalchemy.create_engine('mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam') | 792 | db_connection = sqlalchemy.create_engine('mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam') |
770 | - mappings = pd.read_sql('SELECT rfam_acc, pdb_id, chain, pdb_start, pdb_end, bit_score, evalue_score, cm_start, cm_end, hex_colour FROM pdb_full_region WHERE is_significant=1;', con=db_connection) | 793 | + mappings = pd.read_sql('SELECT rfam_acc, pdb_id, chain, pdb_start, pdb_end, bit_score, evalue_score, cm_start, cm_end, hex_colour FROM pdb_full_region WHERE is_significant=1;', |
794 | + con=db_connection) | ||
771 | mappings.to_csv(runDir + "/data/Rfam-PDB-mappings.csv") | 795 | mappings.to_csv(runDir + "/data/Rfam-PDB-mappings.csv") |
772 | print(f"\t{validsymb}") | 796 | print(f"\t{validsymb}") |
773 | except sqlalchemy.exc.OperationalError: # Cannot connect :'( | 797 | except sqlalchemy.exc.OperationalError: # Cannot connect :'( |
774 | print(f"\t{errsymb}") | 798 | print(f"\t{errsymb}") |
775 | # Check if a previous run succeeded (if file exists, use it) | 799 | # Check if a previous run succeeded (if file exists, use it) |
776 | - if path.isfile(runDir + "/data/Rfam-PDB-mappings.csv"): | 800 | + if os.path.isfile(runDir + "/data/Rfam-PDB-mappings.csv"): |
777 | print("\t> Using previous version.") | 801 | print("\t> Using previous version.") |
778 | mappings = pd.read_csv(runDir + "/data/Rfam-PDB-mappings.csv") | 802 | mappings = pd.read_csv(runDir + "/data/Rfam-PDB-mappings.csv") |
779 | else: # otherwise, abort. | 803 | else: # otherwise, abort. |
... | @@ -791,7 +815,7 @@ class Downloader: | ... | @@ -791,7 +815,7 @@ class Downloader: |
791 | setproctitle(f"RNANet.py download_Rfam_cm()") | 815 | setproctitle(f"RNANet.py download_Rfam_cm()") |
792 | 816 | ||
793 | print(f"\t> Download Rfam.cm.gz from Rfam..." + " " * 37, end='', flush=True) | 817 | print(f"\t> Download Rfam.cm.gz from Rfam..." + " " * 37, end='', flush=True) |
794 | - if not path.isfile(path_to_seq_data + "Rfam.cm"): | 818 | + if not os.path.isfile(path_to_seq_data + "Rfam.cm"): |
795 | try: | 819 | try: |
796 | subprocess.run(["wget", "ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.cm.gz", "-O", path_to_seq_data + "Rfam.cm.gz"]) | 820 | subprocess.run(["wget", "ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.cm.gz", "-O", path_to_seq_data + "Rfam.cm.gz"]) |
797 | print(f"\t{validsymb}", flush=True) | 821 | print(f"\t{validsymb}", flush=True) |
... | @@ -815,7 +839,6 @@ class Downloader: | ... | @@ -815,7 +839,6 @@ class Downloader: |
815 | try: | 839 | try: |
816 | db_connection = sqlalchemy.create_engine('mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam') | 840 | db_connection = sqlalchemy.create_engine('mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam') |
817 | 841 | ||
818 | - | ||
819 | # Prepare the SQL query. It computes the length of the chains and gets the maximum length by family. | 842 | # Prepare the SQL query. It computes the length of the chains and gets the maximum length by family. |
820 | q = """SELECT stats.rfam_acc, k.description, stats.maxlength FROM | 843 | q = """SELECT stats.rfam_acc, k.description, stats.maxlength FROM |
821 | (SELECT fr.rfam_acc, MAX( | 844 | (SELECT fr.rfam_acc, MAX( |
... | @@ -838,15 +861,17 @@ class Downloader: | ... | @@ -838,15 +861,17 @@ class Downloader: |
838 | d = pd.read_sql(q, con=db_connection) | 861 | d = pd.read_sql(q, con=db_connection) |
839 | 862 | ||
840 | # filter the results to families we are interested in | 863 | # filter the results to families we are interested in |
841 | - d = d[ d["rfam_acc"].isin(list_of_families) ] | 864 | + d = d[d["rfam_acc"].isin(list_of_families)] |
842 | 865 | ||
843 | print(d) | 866 | print(d) |
844 | 867 | ||
845 | with sqlite3.connect(runDir + "/results/RNANet.db", timeout=20.0) as conn: | 868 | with sqlite3.connect(runDir + "/results/RNANet.db", timeout=20.0) as conn: |
846 | - sql_execute(conn, """ | 869 | + # We use the REPLACE keyword to get the latest information |
847 | - INSERT OR REPLACE INTO family (rfam_acc, description, max_len) | 870 | + sql_execute(conn, """INSERT OR REPLACE INTO family (rfam_acc, description, max_len) |
848 | - VALUES (?, ?, ?);""", many=True, data=list(d.to_records(index=False)) | 871 | + VALUES (?, ?, ?);""", |
849 | - ) # We use the replace keyword to get the latest information | 872 | + many=True, |
873 | + data=list(d.to_records(index=False)) | ||
874 | + ) | ||
850 | 875 | ||
851 | except sqlalchemy.exc.OperationalError: | 876 | except sqlalchemy.exc.OperationalError: |
852 | warn("Something's wrong with the SQL database. Check mysql-rfam-public.ebi.ac.uk status and try again later. Not printing statistics.") | 877 | warn("Something's wrong with the SQL database. Check mysql-rfam-public.ebi.ac.uk status and try again later. Not printing statistics.") |
... | @@ -858,10 +883,11 @@ class Downloader: | ... | @@ -858,10 +883,11 @@ class Downloader: |
858 | 883 | ||
859 | setproctitle(f"RNANet.py download_Rfam_sequences({rfam_acc})") | 884 | setproctitle(f"RNANet.py download_Rfam_sequences({rfam_acc})") |
860 | 885 | ||
861 | - if not path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"): | 886 | + if not os.path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"): |
862 | for _ in range(10): # retry 10 times if it fails | 887 | for _ in range(10): # retry 10 times if it fails |
863 | try: | 888 | try: |
864 | - subprocess.run(["wget", f'ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/fasta_files/{rfam_acc}.fa.gz', "-O", path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"], stdout=subprocess.DEVNULL) | 889 | + subprocess.run(["wget", f'ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/fasta_files/{rfam_acc}.fa.gz', "-O", |
890 | + path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) | ||
865 | notify(f"Downloaded {rfam_acc}.fa.gz from Rfam") | 891 | notify(f"Downloaded {rfam_acc}.fa.gz from Rfam") |
866 | return # if it worked, no need to retry | 892 | return # if it worked, no need to retry |
867 | except Exception as e: | 893 | except Exception as e: |
... | @@ -881,8 +907,9 @@ class Downloader: | ... | @@ -881,8 +907,9 @@ class Downloader: |
881 | 907 | ||
882 | setproctitle(f"RNANet.py download_BGSU_NR_list({res})") | 908 | setproctitle(f"RNANet.py download_BGSU_NR_list({res})") |
883 | 909 | ||
884 | - nr_code = min([ i for i in [1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 20.0] if i >= res ]) | 910 | + nr_code = min([i for i in [1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 20.0] if i >= res]) |
885 | print(f"> Fetching latest list of RNA files at {nr_code} A resolution from BGSU website...", end='', flush=True) | 911 | print(f"> Fetching latest list of RNA files at {nr_code} A resolution from BGSU website...", end='', flush=True) |
912 | + | ||
886 | # Download latest BGSU non-redundant list | 913 | # Download latest BGSU non-redundant list |
887 | try: | 914 | try: |
888 | s = requests.get(f"http://rna.bgsu.edu/rna3dhub/nrlist/download/current/{nr_code}A/csv").content | 915 | s = requests.get(f"http://rna.bgsu.edu/rna3dhub/nrlist/download/current/{nr_code}A/csv").content |
... | @@ -894,13 +921,13 @@ class Downloader: | ... | @@ -894,13 +921,13 @@ class Downloader: |
894 | warn("Error downloading NR list !\t", error=True) | 921 | warn("Error downloading NR list !\t", error=True) |
895 | 922 | ||
896 | # Try to read previous file | 923 | # Try to read previous file |
897 | - if path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"): | 924 | + if os.path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"): |
898 | - print("\t> Use of the previous version.\t", end = "", flush=True) | 925 | + print("\t> Use of the previous version.\t", end="", flush=True) |
899 | else: | 926 | else: |
900 | return pd.DataFrame([], columns=["class", "class_members"]) | 927 | return pd.DataFrame([], columns=["class", "class_members"]) |
901 | 928 | ||
902 | nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv") | 929 | nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv") |
903 | - full_structures_list = [ tuple(i[1]) for i in nrlist[['class','class_members']].iterrows() ] | 930 | + full_structures_list = [ tuple(i[1]) for i in nrlist[['class', 'class_members']].iterrows() ] |
904 | print(f"\t{validsymb}", flush=True) | 931 | print(f"\t{validsymb}", flush=True) |
905 | 932 | ||
906 | # The beginning of an adventure. | 933 | # The beginning of an adventure. |
... | @@ -910,14 +937,15 @@ class Downloader: | ... | @@ -910,14 +937,15 @@ class Downloader: |
910 | 937 | ||
911 | setproctitle(f"RNANet.py download_from_SILVA({unit})") | 938 | setproctitle(f"RNANet.py download_from_SILVA({unit})") |
912 | 939 | ||
913 | - | 940 | + if not os.path.isfile(path_to_seq_data + f"realigned/{unit}.arb"): |
914 | - if not path.isfile(path_to_seq_data + f"realigned/{unit}.arb"): | ||
915 | try: | 941 | try: |
916 | print(f"Downloading {unit} from SILVA...", end='', flush=True) | 942 | print(f"Downloading {unit} from SILVA...", end='', flush=True) |
917 | - if unit=="LSU": | 943 | + if unit == "LSU": |
918 | - subprocess.run(["wget", "http://www.arb-silva.de/fileadmin/arb_web_db/release_132/ARB_files/SILVA_132_LSURef_07_12_17_opt.arb.gz", "-O", path_to_seq_data + "realigned/LSU.arb.gz"]) | 944 | + subprocess.run(["wget", "-nv", "http://www.arb-silva.de/fileadmin/arb_web_db/release_132/ARB_files/SILVA_132_LSURef_07_12_17_opt.arb.gz", |
945 | + "-O", path_to_seq_data + "realigned/LSU.arb.gz"]) | ||
919 | else: | 946 | else: |
920 | - subprocess.run(["wget", "http://www.arb-silva.de/fileadmin/silva_databases/release_138/ARB_files/SILVA_138_SSURef_05_01_20_opt.arb.gz", "-O", path_to_seq_data + "realigned/SSU.arb.gz"]) | 947 | + subprocess.run(["wget", "-nv", "http://www.arb-silva.de/fileadmin/silva_databases/release_138/ARB_files/SILVA_138_SSURef_05_01_20_opt.arb.gz", |
948 | + "-O", path_to_seq_data + "realigned/SSU.arb.gz"]) | ||
921 | except: | 949 | except: |
922 | warn(f"Error downloading the {unit} database from SILVA", error=True) | 950 | warn(f"Error downloading the {unit} database from SILVA", error=True) |
923 | exit(1) | 951 | exit(1) |
... | @@ -949,7 +977,8 @@ class Mapping: | ... | @@ -949,7 +977,8 @@ class Mapping: |
949 | 977 | ||
950 | def filter_df(self, df): | 978 | def filter_df(self, df): |
951 | 979 | ||
952 | - newdf = df.drop(df[(df.nt_resnum < self.nt_start) | (df.nt_resnum > self.nt_end)].index) | 980 | + newdf = df.drop(df[(df.nt_resnum < self.nt_start) | |
981 | + (df.nt_resnum > self.nt_end)].index) | ||
953 | 982 | ||
954 | if len(newdf.index_chain) > 0: | 983 | if len(newdf.index_chain) > 0: |
955 | # everything's okay | 984 | # everything's okay |
... | @@ -961,19 +990,20 @@ class Mapping: | ... | @@ -961,19 +990,20 @@ class Mapping: |
961 | # index_chain and not nt_resnum. | 990 | # index_chain and not nt_resnum. |
962 | warn(f"Assuming mapping to {self.rfam_acc} is an absolute position interval.") | 991 | warn(f"Assuming mapping to {self.rfam_acc} is an absolute position interval.") |
963 | weird_mappings.add(self.chain_label + "." + self.rfam_acc) | 992 | weird_mappings.add(self.chain_label + "." + self.rfam_acc) |
964 | - df = df.drop(df[(df.index_chain < self.nt_start) | (df.index_chain > self.nt_end)].index) | 993 | + df = df.drop(df[(df.index_chain < self.nt_start) | |
994 | + (df.index_chain > self.nt_end)].index) | ||
965 | 995 | ||
966 | # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one | 996 | # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one |
967 | self.st = 0 | 997 | self.st = 0 |
968 | - if len(df.index_chain) and df.iloc[0,0] != 1: | 998 | + if len(df.index_chain) and df.iloc[0, 0] != 1: |
969 | - self.st = df.iloc[0,0] -1 | 999 | + self.st = df.iloc[0, 0] - 1 |
970 | df.iloc[:, 0] -= self.st | 1000 | df.iloc[:, 0] -= self.st |
971 | self.log(f"Shifting index_chain of {self.st}") | 1001 | self.log(f"Shifting index_chain of {self.st}") |
972 | 1002 | ||
973 | # Check that some residues are not included by mistake: | 1003 | # Check that some residues are not included by mistake: |
974 | # e.g. 4v4t-AA.RF00382-20-55 contains 4 residues numbered 30 but actually far beyond the mapped part, | 1004 | # e.g. 4v4t-AA.RF00382-20-55 contains 4 residues numbered 30 but actually far beyond the mapped part, |
975 | # because the icode are not read by DSSR. | 1005 | # because the icode are not read by DSSR. |
976 | - toremove = df[ df.index_chain > self.nt_end ] | 1006 | + toremove = df[df.index_chain > self.nt_end] |
977 | if not toremove.empty: | 1007 | if not toremove.empty: |
978 | df = df.drop(toremove.index) | 1008 | df = df.drop(toremove.index) |
979 | self.log(f"Some nt_resnum values are likely to be wrong, not considering residues:") | 1009 | self.log(f"Some nt_resnum values are likely to be wrong, not considering residues:") |
... | @@ -991,9 +1021,9 @@ class Mapping: | ... | @@ -991,9 +1021,9 @@ class Mapping: |
991 | if self.logs == []: | 1021 | if self.logs == []: |
992 | return # Do not create a log file if there is nothing to log | 1022 | return # Do not create a log file if there is nothing to log |
993 | 1023 | ||
994 | - if not path.exists("logs"): | 1024 | + if not os.path.exists(runDir+"/logs"): |
995 | - os.makedirs("logs", exist_ok=True) | 1025 | + os.makedirs(runDir+"/logs", exist_ok=True) |
996 | - with open("logs/"+filename, "w") as f: | 1026 | + with open(runDir+"/logs/"+filename, "w") as f: |
997 | f.writelines(self.logs) | 1027 | f.writelines(self.logs) |
998 | 1028 | ||
999 | 1029 | ||
... | @@ -1019,20 +1049,23 @@ class Pipeline: | ... | @@ -1019,20 +1049,23 @@ class Pipeline: |
1019 | self.SELECT_ONLY = None | 1049 | self.SELECT_ONLY = None |
1020 | self.ARCHIVE = False | 1050 | self.ARCHIVE = False |
1021 | self.SAVELOGS = True | 1051 | self.SAVELOGS = True |
1052 | + self.FULLINFERENCE = False | ||
1022 | 1053 | ||
1023 | def process_options(self): | 1054 | def process_options(self): |
1024 | - """Sets the paths and options of the pipeline""" | 1055 | + """Sets the paths and options of the pipeline |
1056 | + """ | ||
1057 | + | ||
1025 | global path_to_3D_data | 1058 | global path_to_3D_data |
1026 | global path_to_seq_data | 1059 | global path_to_seq_data |
1027 | 1060 | ||
1028 | setproctitle("RNANet.py process_options()") | 1061 | setproctitle("RNANet.py process_options()") |
1029 | 1062 | ||
1030 | try: | 1063 | try: |
1031 | - opts, _ = getopt.getopt( sys.argv[1:], "r:hs", | 1064 | + opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", |
1032 | - [ "help", "resolution=", "keep-hetatm=", "from-scratch", | 1065 | + ["help", "resolution=", "keep-hetatm=", "from-scratch", "full-inference",
1033 | "fill-gaps=", "3d-folder=", "seq-folder=", | 1066 | "fill-gaps=", "3d-folder=", "seq-folder=", |
1034 | "no-homology", "ignore-issues", "extract", "only=", "all", "no-logs", | 1067 | "no-homology", "ignore-issues", "extract", "only=", "all", "no-logs", |
1035 | - "archive", "update-homologous" ]) | 1068 | + "archive", "update-homologous"]) |
1036 | except getopt.GetoptError as err: | 1069 | except getopt.GetoptError as err: |
1037 | print(err) | 1070 | print(err) |
1038 | sys.exit(2) | 1071 | sys.exit(2) |
... | @@ -1044,13 +1077,15 @@ class Pipeline: | ... | @@ -1044,13 +1077,15 @@ class Pipeline: |
1044 | exit() | 1077 | exit() |
1045 | 1078 | ||
1046 | if opt == "-h" or opt == "--help": | 1079 | if opt == "-h" or opt == "--help": |
1047 | - print( "RNANet, a script to build a multiscale RNA dataset from public data\n" | 1080 | + print("RNANet, a script to build a multiscale RNA dataset from public data\n" |
1048 | "Developed by Louis Becquey (louis.becquey@univ-evry.fr), 2020") | 1081 | "Developed by Louis Becquey (louis.becquey@univ-evry.fr), 2020") |
1049 | print() | 1082 | print() |
1050 | print("Options:") | 1083 | print("Options:") |
1051 | print("-h [ --help ]\t\t\tPrint this help message") | 1084 | print("-h [ --help ]\t\t\tPrint this help message") |
1052 | print("--version\t\t\tPrint the program version") | 1085 | print("--version\t\t\tPrint the program version") |
1053 | print() | 1086 | print() |
1087 | + print("-f [ --full-inference ]\t\tInfer new mappings even if Rfam already provides some. Yields more copies of chains" | ||
1088 | + "\n\t\t\t\tmapped to different families.") | ||
1054 | print("-r 4.0 [ --resolution=4.0 ]\tMaximum 3D structure resolution to consider a RNA chain.") | 1089 | print("-r 4.0 [ --resolution=4.0 ]\tMaximum 3D structure resolution to consider a RNA chain.") |
1055 | print("-s\t\t\t\tRun statistics computations after completion") | 1090 | print("-s\t\t\t\tRun statistics computations after completion") |
1056 | print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.") | 1091 | print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.") |
... | @@ -1062,7 +1097,7 @@ class Pipeline: | ... | @@ -1062,7 +1097,7 @@ class Pipeline: |
1062 | "\n\t\t\t\t\tRNAcifs/\t\tFull structures containing RNA, in mmCIF format" | 1097 | "\n\t\t\t\t\tRNAcifs/\t\tFull structures containing RNA, in mmCIF format" |
1063 | "\n\t\t\t\t\trna_mapped_to_Rfam/\tExtracted 'pure' RNA chains" | 1098 | "\n\t\t\t\t\trna_mapped_to_Rfam/\tExtracted 'pure' RNA chains" |
1064 | "\n\t\t\t\t\tdatapoints/\t\tFinal results in CSV file format.") | 1099 | "\n\t\t\t\t\tdatapoints/\t\tFinal results in CSV file format.") |
1065 | - print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files." | 1100 | + print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files. Subfolders will be:" |
1066 | "\n\t\t\t\t\trfam_sequences/fasta/\tCompressed hits to Rfam families" | 1101 | "\n\t\t\t\t\trfam_sequences/fasta/\tCompressed hits to Rfam families" |
1067 | "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family") | 1102 | "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family") |
1068 | print("--no-homology\t\t\tDo not try to compute PSSMs and do not align sequences." | 1103 | print("--no-homology\t\t\tDo not try to compute PSSMs and do not align sequences." |
... | @@ -1077,7 +1112,7 @@ class Pipeline: | ... | @@ -1077,7 +1112,7 @@ class Pipeline: |
1077 | print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications") | 1112 | print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications") |
1078 | print() | 1113 | print() |
1079 | print("Typical usage:") | 1114 | print("Typical usage:") |
1080 | - print(f"nohup bash -c 'time {runDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --archive' &") | 1115 | + print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &") |
1081 | sys.exit() | 1116 | sys.exit() |
1082 | elif opt == '--version': | 1117 | elif opt == '--version': |
1083 | print("RNANet 1.1 beta") | 1118 | print("RNANet 1.1 beta") |
... | @@ -1087,21 +1122,21 @@ class Pipeline: | ... | @@ -1087,21 +1122,21 @@ class Pipeline: |
1087 | self.CRYSTAL_RES = float(arg) | 1122 | self.CRYSTAL_RES = float(arg) |
1088 | elif opt == "-s": | 1123 | elif opt == "-s": |
1089 | self.RUN_STATS = True | 1124 | self.RUN_STATS = True |
1090 | - elif opt=="--keep-hetatm": | 1125 | + elif opt == "--keep-hetatm": |
1091 | - assert arg in [ "True", "False" ] | 1126 | + assert arg in ["True", "False"] |
1092 | self.KEEP_HETATM = (arg == "True") | 1127 | self.KEEP_HETATM = (arg == "True") |
1093 | - elif opt=="--fill-gaps": | 1128 | + elif opt == "--fill-gaps": |
1094 | - assert arg in [ "True", "False" ] | 1129 | + assert arg in ["True", "False"] |
1095 | self.FILL_GAPS = (arg == "True") | 1130 | self.FILL_GAPS = (arg == "True") |
1096 | - elif opt=="--no-homology": | 1131 | + elif opt == "--no-homology": |
1097 | self.HOMOLOGY = False | 1132 | self.HOMOLOGY = False |
1098 | - elif opt=='--3d-folder': | 1133 | + elif opt == '--3d-folder': |
1099 | - path_to_3D_data = path.abspath(arg) | 1134 | + path_to_3D_data = os.path.abspath(arg) |
1100 | if path_to_3D_data[-1] != '/': | 1135 | if path_to_3D_data[-1] != '/': |
1101 | path_to_3D_data += '/' | 1136 | path_to_3D_data += '/' |
1102 | print("> Storing 3D data into", path_to_3D_data) | 1137 | print("> Storing 3D data into", path_to_3D_data) |
1103 | - elif opt=='--seq-folder': | 1138 | + elif opt == '--seq-folder': |
1104 | - path_to_seq_data = path.abspath(arg) | 1139 | + path_to_seq_data = os.path.abspath(arg) |
1105 | if path_to_seq_data[-1] != '/': | 1140 | if path_to_seq_data[-1] != '/': |
1106 | path_to_seq_data += '/' | 1141 | path_to_seq_data += '/' |
1107 | print("> Storing sequences into", path_to_seq_data) | 1142 | print("> Storing sequences into", path_to_seq_data) |
... | @@ -1138,6 +1173,8 @@ class Pipeline: | ... | @@ -1138,6 +1173,8 @@ class Pipeline: |
1138 | self.ARCHIVE = True | 1173 | self.ARCHIVE = True |
1139 | elif opt == "--no-logs": | 1174 | elif opt == "--no-logs": |
1140 | self.SAVELOGS = False | 1175 | self.SAVELOGS = False |
1176 | + elif opt == "-f" or opt == "--full-inference": | ||
1177 | + self.FULLINFERENCE = True | ||
1141 | 1178 | ||
1142 | if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions": | 1179 | if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions": |
1143 | print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments") | 1180 | print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments") |
... | @@ -1149,17 +1186,19 @@ class Pipeline: | ... | @@ -1149,17 +1186,19 @@ class Pipeline: |
1149 | """List 3D chains with available Rfam mappings. | 1186 | """List 3D chains with available Rfam mappings. |
1150 | 1187 | ||
1151 | Return a list of Chain() objects with the mappings set up. | 1188 | Return a list of Chain() objects with the mappings set up. |
1152 | - If self.HOMOLOGY is set to False, simply returns a list of Chain() objects with available 3D chains.""" | 1189 | + If self.HOMOLOGY is set to False, simply returns a list of Chain() objects with available 3D chains. |
1190 | + """ | ||
1153 | 1191 | ||
1154 | setproctitle("RNANet.py list_available_mappings()") | 1192 | setproctitle("RNANet.py list_available_mappings()") |
1155 | 1193 | ||
1156 | # List all 3D RNA chains below given resolution | 1194 | # List all 3D RNA chains below given resolution |
1157 | - full_structures_list = self.dl.download_BGSU_NR_list(self.CRYSTAL_RES) # list of tuples ( class, class_members ) | 1195 | + full_structures_list = self.dl.download_BGSU_NR_list( |
1196 | + self.CRYSTAL_RES) # list of tuples ( class, class_members ) | ||
1158 | 1197 | ||
1159 | # Check for a list of known problems: | 1198 | # Check for a list of known problems: |
1160 | - if path.isfile(runDir + "/known_issues.txt"): | 1199 | + if os.path.isfile(runDir + "/known_issues.txt"): |
1161 | with open(runDir + "/known_issues.txt", 'r') as issues: | 1200 | with open(runDir + "/known_issues.txt", 'r') as issues: |
1162 | - self.known_issues = [ x[:-1] for x in issues.readlines() ] | 1201 | + self.known_issues = [x[:-1] for x in issues.readlines()] |
1163 | if self.USE_KNOWN_ISSUES: | 1202 | if self.USE_KNOWN_ISSUES: |
1164 | print("\t> Ignoring known issues:") | 1203 | print("\t> Ignoring known issues:") |
1165 | for x in self.known_issues: | 1204 | for x in self.known_issues: |
... | @@ -1175,9 +1214,18 @@ class Pipeline: | ... | @@ -1175,9 +1214,18 @@ class Pipeline: |
1175 | p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=ncores) | 1214 | p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=ncores) |
1176 | try: | 1215 | try: |
1177 | 1216 | ||
1178 | - pbar = tqdm(full_structures_list, maxinterval=1.0, miniters=1, desc="Eq. classes", bar_format="{desc}:{percentage:3.0f}%|{bar}|") | 1217 | + pbar = tqdm(full_structures_list, maxinterval=1.0, miniters=1, |
1179 | - for _, newchains in enumerate(p.imap_unordered(partial(work_infer_mappings, not self.REUSE_ALL, allmappings), full_structures_list, chunksize=1)): | 1218 | + desc="Eq. classes", bar_format="{desc}:{percentage:3.0f}%|{bar}|") |
1219 | + for _, newchains in enumerate(p.imap_unordered(partial( | ||
1220 | + work_infer_mappings, | ||
1221 | + not self.REUSE_ALL, | ||
1222 | + allmappings, | ||
1223 | + self.FULLINFERENCE | ||
1224 | + ), | ||
1225 | + full_structures_list, | ||
1226 | + chunksize=1)): | ||
1180 | self.update += newchains | 1227 | self.update += newchains |
1228 | + | ||
1181 | pbar.update(1) # Everytime the iteration finishes, update the global progress bar | 1229 | pbar.update(1) # Everytime the iteration finishes, update the global progress bar |
1182 | 1230 | ||
1183 | pbar.close() | 1231 | pbar.close() |
... | @@ -1192,7 +1240,7 @@ class Pipeline: | ... | @@ -1192,7 +1240,7 @@ class Pipeline: |
1192 | else: | 1240 | else: |
1193 | conn = sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) | 1241 | conn = sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) |
1194 | for eq_class, codelist in tqdm(full_structures_list, desc="Eq. classes"): | 1242 | for eq_class, codelist in tqdm(full_structures_list, desc="Eq. classes"): |
1195 | - codes = codelist.replace('+',',').split(',') | 1243 | + codes = codelist.replace('+', ',').split(',') |
1196 | 1244 | ||
1197 | # Simply convert the list of codes to Chain() objects | 1245 | # Simply convert the list of codes to Chain() objects |
1198 | for c in codes: | 1246 | for c in codes: |
... | @@ -1201,40 +1249,48 @@ class Pipeline: | ... | @@ -1201,40 +1249,48 @@ class Pipeline: |
1201 | pdb_model = int(nr[1]) | 1249 | pdb_model = int(nr[1]) |
1202 | pdb_chain_id = nr[2].upper() | 1250 | pdb_chain_id = nr[2].upper() |
1203 | chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}" | 1251 | chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}" |
1204 | - res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc = 'unmappd' AND issue=0""") | 1252 | + res = sql_ask_database(conn, f"""SELECT chain_id from chain |
1253 | + WHERE structure_id='{pdb_id}' | ||
1254 | + AND chain_name='{pdb_chain_id}' | ||
1255 | + AND rfam_acc = 'unmappd' | ||
1256 | + AND issue=0""") | ||
1205 | if not len(res) or self.REUSE_ALL: # the chain is NOT yet in the database, or this is a known issue | 1257 | if not len(res) or self.REUSE_ALL: # the chain is NOT yet in the database, or this is a known issue |
1206 | self.update.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class)) | 1258 | self.update.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class)) |
1207 | conn.close() | 1259 | conn.close() |
1208 | 1260 | ||
1209 | if self.SELECT_ONLY is not None: | 1261 | if self.SELECT_ONLY is not None: |
1210 | - self.update = [ c for c in self.update if c.chain_label == self.SELECT_ONLY ] | 1262 | + self.update = [ |
1263 | + c for c in self.update if c.chain_label == self.SELECT_ONLY] | ||
1211 | 1264 | ||
1212 | self.n_chains = len(self.update) | 1265 | self.n_chains = len(self.update) |
1213 | print(str(self.n_chains) + " RNA chains of interest.") | 1266 | print(str(self.n_chains) + " RNA chains of interest.") |
1214 | 1267 | ||
1215 | @trace_unhandled_exceptions | 1268 | @trace_unhandled_exceptions |
1216 | - def dl_and_annotate(self, retry=False, coeff_ncores = 0.75): | 1269 | + def dl_and_annotate(self, retry=False, coeff_ncores=0.75): |
1217 | """ | 1270 | """ |
1218 | Gets mmCIF files from the PDB, and runs DSSR on them. | 1271 | Gets mmCIF files from the PDB, and runs DSSR on them. |
1219 | Ignores a structure if the file already exists (not if we are retrying). | 1272 | Ignores a structure if the file already exists (not if we are retrying). |
1220 | 1273 | ||
1221 | REQUIRES the previous definition of self.update, so call list_available_mappings() before. | 1274 | REQUIRES the previous definition of self.update, so call list_available_mappings() before. |
1222 | - SETS table structure""" | 1275 | + SETS table structure |
1276 | + """ | ||
1223 | 1277 | ||
1224 | - # setproctitle(f"RNANet.py dl_and_annotate(retry={retry})") | 1278 | + setproctitle(f"RNANet.py dl_and_annotate(retry={retry})") |
1225 | 1279 | ||
1226 | # Prepare the results folders | 1280 | # Prepare the results folders |
1227 | - if not path.isdir(path_to_3D_data + "RNAcifs"): | 1281 | + if not os.path.isdir(path_to_3D_data + "RNAcifs"): |
1228 | - os.makedirs(path_to_3D_data + "RNAcifs") # for the whole structures | 1282 | + # for the whole structures |
1229 | - if not path.isdir(path_to_3D_data + "annotations"): | 1283 | + os.makedirs(path_to_3D_data + "RNAcifs") |
1230 | - os.makedirs(path_to_3D_data + "annotations") # for DSSR analysis of the whole structures | 1284 | + if not os.path.isdir(path_to_3D_data + "annotations"): |
1285 | + # for DSSR analysis of the whole structures | ||
1286 | + os.makedirs(path_to_3D_data + "annotations") | ||
1231 | 1287 | ||
1232 | # Download and annotate | 1288 | # Download and annotate |
1233 | print("> Downloading and annotating structures (or checking previous results if they exist)...", flush=True) | 1289 | print("> Downloading and annotating structures (or checking previous results if they exist)...", flush=True) |
1234 | if retry: | 1290 | if retry: |
1235 | - mmcif_list = sorted(set([ c.pdb_id for c in self.retry ])) | 1291 | + mmcif_list = sorted(set([c.pdb_id for c in self.retry])) |
1236 | else: | 1292 | else: |
1237 | - mmcif_list = sorted(set([ c.pdb_id for c in self.update ])) | 1293 | + mmcif_list = sorted(set([c.pdb_id for c in self.update])) |
1238 | try: | 1294 | try: |
1239 | p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=int(coeff_ncores*ncores)) | 1295 | p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=int(coeff_ncores*ncores)) |
1240 | pbar = tqdm(mmcif_list, maxinterval=1.0, miniters=1, desc="mmCIF files") | 1296 | pbar = tqdm(mmcif_list, maxinterval=1.0, miniters=1, desc="mmCIF files") |
... | @@ -1255,16 +1311,19 @@ class Pipeline: | ... | @@ -1255,16 +1311,19 @@ class Pipeline: |
1255 | and extract their informations from the JSON files to the database. | 1311 | and extract their informations from the JSON files to the database. |
1256 | 1312 | ||
1257 | REQUIRES the previous definition of self.update, so call list_available_mappings() before. | 1313 | REQUIRES the previous definition of self.update, so call list_available_mappings() before. |
1258 | - SETS self.loaded_chains""" | 1314 | + SETS self.loaded_chains |
1315 | + """ | ||
1259 | 1316 | ||
1260 | setproctitle(f"RNANet.py build_chains(retry={retry})") | 1317 | setproctitle(f"RNANet.py build_chains(retry={retry})") |
1261 | 1318 | ||
1262 | # Prepare folders | 1319 | # Prepare folders |
1263 | if self.EXTRACT_CHAINS: | 1320 | if self.EXTRACT_CHAINS: |
1264 | - if self.HOMOLOGY and not path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"): | 1321 | + if self.HOMOLOGY and not os.path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"): |
1265 | - os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam") # for the portions mapped to Rfam | 1322 | + # for the portions mapped to Rfam |
1266 | - if (not self.HOMOLOGY) and not path.isdir(path_to_3D_data + "rna_only"): | 1323 | + os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam") |
1267 | - os.makedirs(path_to_3D_data + "rna_only") # extract chains of pure RNA | 1324 | + if (not self.HOMOLOGY) and not os.path.isdir(path_to_3D_data + "rna_only"): |
1325 | + # extract chains of pure RNA | ||
1326 | + os.makedirs(path_to_3D_data + "rna_only") | ||
1268 | 1327 | ||
1269 | # define and run jobs | 1328 | # define and run jobs |
1270 | joblist = [] | 1329 | joblist = [] |
... | @@ -1296,44 +1355,48 @@ class Pipeline: | ... | @@ -1296,44 +1355,48 @@ class Pipeline: |
1296 | issues += 1 | 1355 | issues += 1 |
1297 | issues_names.append(c[1].chain_label) | 1356 | issues_names.append(c[1].chain_label) |
1298 | ki.write(c[1].chain_label + '\n') | 1357 | ki.write(c[1].chain_label + '\n') |
1299 | - kir.write(c[1].chain_label + '\n' + c[1].error_messages + '\n\n') | 1358 | + kir.write(c[1].chain_label + '\n' + |
1359 | + c[1].error_messages + '\n\n') | ||
1300 | with sqlite3.connect(runDir+"/results/RNANet.db") as conn: | 1360 | with sqlite3.connect(runDir+"/results/RNANet.db") as conn: |
1301 | sql_execute(conn, f"UPDATE chain SET issue = 1 WHERE chain_id = ?;", data=(c[1].db_chain_id,)) | 1361 | sql_execute(conn, f"UPDATE chain SET issue = 1 WHERE chain_id = ?;", data=(c[1].db_chain_id,)) |
1302 | ki.close() | 1362 | ki.close() |
1303 | kir.close() | 1363 | kir.close() |
1304 | if issues: | 1364 | if issues: |
1305 | warn(f"Added {issues} newly discovered issues to known issues:") | 1365 | warn(f"Added {issues} newly discovered issues to known issues:") |
1306 | - print("\033[33m"+ " ".join(issues_names) + "\033[0m", flush=True) | 1366 | + print("\033[33m" + " ".join(issues_names) + "\033[0m", flush=True) |
1307 | 1367 | ||
1308 | # Add successfully built chains to list | 1368 | # Add successfully built chains to list |
1309 | - self.loaded_chains += [ c[1] for c in results if not c[1].delete_me ] | 1369 | + self.loaded_chains += [c[1] for c in results if not c[1].delete_me] |
1310 | 1370 | ||
1311 | # Identify errors due to empty JSON files (this happen when RAM is full, we believe). | 1371 | # Identify errors due to empty JSON files (this happen when RAM is full, we believe). |
1312 | # Retrying often solves the issue... so retry once with half the cores to limit the RAM usage. | 1372 | # Retrying often solves the issue... so retry once with half the cores to limit the RAM usage. |
1313 | self.to_retry = [ c[1] for c in results if "Could not load existing" in c[1].error_messages ] | 1373 | self.to_retry = [ c[1] for c in results if "Could not load existing" in c[1].error_messages ] |
1314 | 1374 | ||
1315 | def checkpoint_save_chains(self): | 1375 | def checkpoint_save_chains(self): |
1316 | - """Saves self.loaded_chains to data/loaded_chains.picke""" | 1376 | + """Saves self.loaded_chains to data/loaded_chains.picke |
1317 | - with open(runDir + "/data/loaded_chains.pickle","wb") as pick: | 1377 | + """ |
1378 | + with open(runDir + "/data/loaded_chains.pickle", "wb") as pick: | ||
1318 | pickle.dump(self.loaded_chains, pick) | 1379 | pickle.dump(self.loaded_chains, pick) |
1319 | 1380 | ||
1320 | def checkpoint_load_chains(self): | 1381 | def checkpoint_load_chains(self): |
1321 | - """Load self.loaded_chains from data/loaded_chains.pickle""" | 1382 | + """Load self.loaded_chains from data/loaded_chains.pickle |
1322 | - with open(runDir + "/data/loaded_chains.pickle","rb") as pick: | 1383 | + """ |
1384 | + with open(runDir + "/data/loaded_chains.pickle", "rb") as pick: | ||
1323 | self.loaded_chains = pickle.load(pick) | 1385 | self.loaded_chains = pickle.load(pick) |
1324 | 1386 | ||
1325 | def prepare_sequences(self): | 1387 | def prepare_sequences(self): |
1326 | """Downloads homologous sequences and covariance models required to compute MSAs. | 1388 | """Downloads homologous sequences and covariance models required to compute MSAs. |
1327 | 1389 | ||
1328 | REQUIRES that self.loaded_chains is defined. | 1390 | REQUIRES that self.loaded_chains is defined. |
1329 | - SETS family (partially, through call)""" | 1391 | + SETS family (partially, through call) |
1392 | + """ | ||
1330 | 1393 | ||
1331 | setproctitle("RNANet.py prepare_sequences()") | 1394 | setproctitle("RNANet.py prepare_sequences()") |
1332 | 1395 | ||
1333 | # Preparing a results folder | 1396 | # Preparing a results folder |
1334 | if not os.access(path_to_seq_data + "realigned/", os.F_OK): | 1397 | if not os.access(path_to_seq_data + "realigned/", os.F_OK): |
1335 | os.makedirs(path_to_seq_data + "realigned/") | 1398 | os.makedirs(path_to_seq_data + "realigned/") |
1336 | - if not path.isdir(path_to_seq_data + "rfam_sequences/fasta/"): | 1399 | + if not os.path.isdir(path_to_seq_data + "rfam_sequences/fasta/"): |
1337 | os.makedirs(path_to_seq_data + "rfam_sequences/fasta/", exist_ok=True) | 1400 | os.makedirs(path_to_seq_data + "rfam_sequences/fasta/", exist_ok=True) |
1338 | 1401 | ||
1339 | # Update the family table (rfam_acc, description, max_len) | 1402 | # Update the family table (rfam_acc, description, max_len) |
... | @@ -1344,7 +1407,8 @@ class Pipeline: | ... | @@ -1344,7 +1407,8 @@ class Pipeline: |
1344 | 1407 | ||
1345 | joblist = [] | 1408 | joblist = [] |
1346 | for f in self.fam_list: | 1409 | for f in self.fam_list: |
1347 | - joblist.append(Job(function=work_prepare_sequences, how_many_in_parallel=ncores, args=[self.dl, f, rfam_acc_to_download[f]])) | 1410 | + joblist.append(Job(function=work_prepare_sequences, how_many_in_parallel=ncores, args=[ |
1411 | + self.dl, f, rfam_acc_to_download[f]])) | ||
1348 | try: | 1412 | try: |
1349 | execute_joblist(joblist) | 1413 | execute_joblist(joblist) |
1350 | 1414 | ||
... | @@ -1360,14 +1424,16 @@ class Pipeline: | ... | @@ -1360,14 +1424,16 @@ class Pipeline: |
1360 | """Perform multiple sequence alignments. | 1424 | """Perform multiple sequence alignments. |
1361 | 1425 | ||
1362 | REQUIRES self.fam_list to be defined | 1426 | REQUIRES self.fam_list to be defined |
1363 | - SETS family (partially)""" | 1427 | + SETS family (partially) |
1428 | + """ | ||
1364 | 1429 | ||
1365 | setproctitle("RNANet.py realign()") | 1430 | setproctitle("RNANet.py realign()") |
1366 | 1431 | ||
1367 | # Prepare the job list | 1432 | # Prepare the job list |
1368 | joblist = [] | 1433 | joblist = [] |
1369 | for f in self.fam_list: | 1434 | for f in self.fam_list: |
1370 | - joblist.append( Job(function=work_realign, args=[f], how_many_in_parallel=1, label=f)) # the function already uses all CPUs so launch them one by one | 1435 | + # the function already uses all CPUs so launch them one by one (how_many_in_parallel=1) |
1436 | + joblist.append(Job(function=work_realign, args=[f], how_many_in_parallel=1, label=f)) | ||
1371 | 1437 | ||
1372 | # Execute the jobs | 1438 | # Execute the jobs |
1373 | try: | 1439 | try: |
... | @@ -1379,8 +1445,8 @@ class Pipeline: | ... | @@ -1379,8 +1445,8 @@ class Pipeline: |
1379 | # Update the database | 1445 | # Update the database |
1380 | data = [] | 1446 | data = [] |
1381 | for r in results: | 1447 | for r in results: |
1382 | - align = AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta") | 1448 | + align = Bio.AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta") |
1383 | - nb_3d_chains = len([ 1 for r in align if '[' in r.id ]) | 1449 | + nb_3d_chains = len([1 for r in align if '[' in r.id]) |
1384 | if r[0] in SSU_set: # SSU v138 is used | 1450 | if r[0] in SSU_set: # SSU v138 is used |
1385 | nb_homologs = 2225272 # source: https://www.arb-silva.de/documentation/release-138/ | 1451 | nb_homologs = 2225272 # source: https://www.arb-silva.de/documentation/release-138/ |
1386 | nb_total_homol = nb_homologs + nb_3d_chains | 1452 | nb_total_homol = nb_homologs + nb_3d_chains |
... | @@ -1390,7 +1456,7 @@ class Pipeline: | ... | @@ -1390,7 +1456,7 @@ class Pipeline: |
1390 | else: | 1456 | else: |
1391 | nb_total_homol = len(align) | 1457 | nb_total_homol = len(align) |
1392 | nb_homologs = nb_total_homol - nb_3d_chains | 1458 | nb_homologs = nb_total_homol - nb_3d_chains |
1393 | - data.append( (nb_homologs, nb_3d_chains, nb_total_homol, r[2], r[3], r[0]) ) | 1459 | + data.append((nb_homologs, nb_3d_chains, nb_total_homol, r[2], r[3], r[0])) |
1394 | 1460 | ||
1395 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | 1461 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
1396 | sql_execute(conn, """UPDATE family SET nb_homologs = ?, nb_3d_chains = ?, nb_total_homol = ?, comput_time = ?, comput_peak_mem = ? | 1462 | sql_execute(conn, """UPDATE family SET nb_homologs = ?, nb_3d_chains = ?, nb_total_homol = ?, comput_time = ?, comput_peak_mem = ? |
... | @@ -1399,13 +1465,14 @@ class Pipeline: | ... | @@ -1399,13 +1465,14 @@ class Pipeline: |
1399 | def remap(self): | 1465 | def remap(self): |
1400 | """Compute nucleotide frequencies of some alignments and save them in the database | 1466 | """Compute nucleotide frequencies of some alignments and save them in the database |
1401 | 1467 | ||
1402 | - REQUIRES self.fam_list to be defined""" | 1468 | + REQUIRES self.fam_list to be defined |
1469 | + """ | ||
1403 | 1470 | ||
1404 | setproctitle("RNANet.py remap()") | 1471 | setproctitle("RNANet.py remap()") |
1405 | 1472 | ||
1406 | print("Computing nucleotide frequencies in alignments...\nThis can be very long on slow storage devices (Hard-drive...)") | 1473 | print("Computing nucleotide frequencies in alignments...\nThis can be very long on slow storage devices (Hard-drive...)") |
1407 | print("Check your CPU and disk I/O activity before deciding if the job failed.") | 1474 | print("Check your CPU and disk I/O activity before deciding if the job failed.") |
1408 | - nworkers =max(min(ncores, len(self.fam_list)), 1) | 1475 | + nworkers = max(min(ncores, len(self.fam_list)), 1) |
1409 | 1476 | ||
1410 | # Prepare the architecture of a shiny multi-progress-bars design | 1477 | # Prepare the architecture of a shiny multi-progress-bars design |
1411 | # Push the number of workers to a queue. | 1478 | # Push the number of workers to a queue. |
... | @@ -1419,8 +1486,10 @@ class Pipeline: | ... | @@ -1419,8 +1486,10 @@ class Pipeline: |
1419 | 1486 | ||
1420 | try: | 1487 | try: |
1421 | fam_pbar = tqdm(total=len(self.fam_list), desc="RNA families", position=0, leave=True) | 1488 | fam_pbar = tqdm(total=len(self.fam_list), desc="RNA families", position=0, leave=True) |
1422 | - for i, _ in enumerate(p.imap_unordered(partial(work_pssm, fill_gaps=self.FILL_GAPS), self.fam_list, chunksize=1)): # Apply work_pssm to each RNA family | 1489 | + # Apply work_pssm to each RNA family |
1423 | - fam_pbar.update(1) # Everytime the iteration finishes on a family, update the global progress bar over the RNA families | 1490 | + for i, _ in enumerate(p.imap_unordered(partial(work_pssm, fill_gaps=self.FILL_GAPS), self.fam_list, chunksize=1)): |
1491 | + # Everytime the iteration finishes on a family, update the global progress bar over the RNA families | ||
1492 | + fam_pbar.update(1) | ||
1424 | fam_pbar.close() | 1493 | fam_pbar.close() |
1425 | p.close() | 1494 | p.close() |
1426 | p.join() | 1495 | p.join() |
... | @@ -1434,23 +1503,24 @@ class Pipeline: | ... | @@ -1434,23 +1503,24 @@ class Pipeline: |
1434 | def output_results(self): | 1503 | def output_results(self): |
1435 | """Produces CSV files, archive them, and additional metadata files | 1504 | """Produces CSV files, archive them, and additional metadata files |
1436 | 1505 | ||
1437 | - REQUIRES self.loaded_chains (to output corresponding CSV files) and self.fam_list (for statistics)""" | 1506 | + REQUIRES self.loaded_chains (to output corresponding CSV files) and self.fam_list (for statistics) |
1507 | + """ | ||
1438 | 1508 | ||
1439 | setproctitle("RNANet.py output_results()") | 1509 | setproctitle("RNANet.py output_results()") |
1440 | 1510 | ||
1441 | time_str = time.strftime("%Y%m%d") | 1511 | time_str = time.strftime("%Y%m%d") |
1442 | 1512 | ||
1443 | - #Prepare folders: | 1513 | + # Prepare folders: |
1444 | - if not path.isdir(path_to_3D_data + "datapoints/"): | 1514 | + if not os.path.isdir(path_to_3D_data + "datapoints/"): |
1445 | os.makedirs(path_to_3D_data + "datapoints/") | 1515 | os.makedirs(path_to_3D_data + "datapoints/") |
1446 | - if not path.isdir(runDir + "/results/archive/"): | 1516 | + if not os.path.isdir(runDir + "/results/archive/"): |
1447 | os.makedirs(runDir + "/results/archive/") | 1517 | os.makedirs(runDir + "/results/archive/") |
1448 | 1518 | ||
1449 | # Save to by-chain CSV files | 1519 | # Save to by-chain CSV files |
1450 | p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=3) | 1520 | p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=3) |
1451 | try: | 1521 | try: |
1452 | pbar = tqdm(total=len(self.loaded_chains), desc="Saving chains to CSV", position=0, leave=True) | 1522 | pbar = tqdm(total=len(self.loaded_chains), desc="Saving chains to CSV", position=0, leave=True) |
1453 | - for _, _2 in enumerate(p.imap_unordered(work_save, self.loaded_chains, chunksize=2)): | 1523 | + for _, _2 in enumerate(p.imap_unordered(work_save, self.loaded_chains)): |
1454 | pbar.update(1) | 1524 | pbar.update(1) |
1455 | pbar.close() | 1525 | pbar.close() |
1456 | p.close() | 1526 | p.close() |
... | @@ -1465,36 +1535,44 @@ class Pipeline: | ... | @@ -1465,36 +1535,44 @@ class Pipeline: |
1465 | # Run statistics | 1535 | # Run statistics |
1466 | if self.RUN_STATS: | 1536 | if self.RUN_STATS: |
1467 | # Remove previous precomputed data | 1537 | # Remove previous precomputed data |
1468 | - subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"]) | 1538 | + subprocess.run(["rm", "-f", runDir + "/data/wadley_kernel_eta.npz", |
1539 | + runDir + "/data/wadley_kernel_eta_prime.npz", | ||
1540 | + runDir + "/data/pair_counts.csv"]) | ||
1469 | for f in self.fam_list: | 1541 | for f in self.fam_list: |
1470 | - subprocess.run(["rm","-f", f"data/{f}.npy", f"data/{f}_pairs.csv", f"data/{f}_counts.csv"]) | 1542 | + subprocess.run(["rm", "-f", runDir + f"/data/{f}.npy", |
1543 | + runDir + f"/data/{f}_pairs.csv", | ||
1544 | + runDir + f"/data/{f}_counts.csv"]) | ||
1471 | 1545 | ||
1472 | # Run statistics files | 1546 | # Run statistics files |
1473 | - os.chdir(runDir) | 1547 | + subprocess.run(["python3.8", fileDir+"/regression.py"]) |
1474 | - subprocess.run(["python3.8", "regression.py"]) | 1548 | + subprocess.run(["python3.8", fileDir+"/statistics.py", "--3d-folder", path_to_3D_data, |
1475 | - subprocess.run(["python3.8", "statistics.py", path_to_3D_data, path_to_seq_data]) | 1549 | + "--seq-folder", path_to_seq_data, "-r", str(self.CRYSTAL_RES)]) |
1476 | 1550 | ||
1477 | # Save additional informations | 1551 | # Save additional informations |
1478 | with sqlite3.connect(runDir+"/results/RNANet.db") as conn: | 1552 | with sqlite3.connect(runDir+"/results/RNANet.db") as conn: |
1479 | - pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;", | 1553 | + pd.read_sql_query("""SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem |
1554 | + FROM family ORDER BY nb_3d_chains DESC;""", | ||
1480 | conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False) | 1555 | conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False) |
1481 | - pd.read_sql_query("""SELECT eq_class, structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure | 1556 | + pd.read_sql_query("""SELECT eq_class, structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue |
1557 | + FROM structure | ||
1482 | JOIN chain ON structure.pdb_id = chain.structure_id | 1558 | JOIN chain ON structure.pdb_id = chain.structure_id |
1483 | - ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False) | 1559 | + ORDER BY structure_id, chain_name, rfam_acc ASC;""", |
1560 | + conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False) | ||
1484 | 1561 | ||
1485 | # Archive the results | 1562 | # Archive the results |
1486 | - if self.SELECT_ONLY is None: | 1563 | + if self.ARCHIVE: |
1487 | - os.makedirs("results/archive", exist_ok=True) | 1564 | + os.makedirs(runDir + "/results/archive", exist_ok=True) |
1488 | - subprocess.run(["tar","-C", path_to_3D_data + "/datapoints","-czf",f"results/archive/RNANET_datapoints_{time_str}.tar.gz","."]) | 1565 | + subprocess.run(["tar", "-C", path_to_3D_data + "/datapoints", "-czf", |
1566 | + runDir + f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", "."]) | ||
1489 | 1567 | ||
1490 | # Update shortcuts to latest versions | 1568 | # Update shortcuts to latest versions |
1491 | subprocess.run(["rm", "-f", runDir + "/results/RNANET_datapoints_latest.tar.gz", | 1569 | subprocess.run(["rm", "-f", runDir + "/results/RNANET_datapoints_latest.tar.gz", |
1492 | runDir + "/results/summary_latest.csv", | 1570 | runDir + "/results/summary_latest.csv", |
1493 | runDir + "/results/families_latest.csv" | 1571 | runDir + "/results/families_latest.csv" |
1494 | ]) | 1572 | ]) |
1495 | - subprocess.run(['ln',"-s", runDir +f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", runDir + "/results/RNANET_datapoints_latest.tar.gz"]) | 1573 | + subprocess.run(['ln', "-s", runDir + f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", runDir + "/results/RNANET_datapoints_latest.tar.gz"]) |
1496 | - subprocess.run(['ln',"-s", runDir +f"/results/archive/summary_{time_str}.csv", runDir + "/results/summary_latest.csv"]) | 1574 | + subprocess.run(['ln', "-s", runDir + f"/results/archive/summary_{time_str}.csv", runDir + "/results/summary_latest.csv"]) |
1497 | - subprocess.run(['ln',"-s", runDir +f"/results/archive/families_{time_str}.csv", runDir + "/results/families_latest.csv"]) | 1575 | + subprocess.run(['ln', "-s", runDir + f"/results/archive/families_{time_str}.csv", runDir + "/results/families_latest.csv"]) |
1498 | 1576 | ||
1499 | def sanitize_database(self): | 1577 | def sanitize_database(self): |
1500 | """Searches for issues in the database and correct them""" | 1578 | """Searches for issues in the database and correct them""" |
... | @@ -1518,7 +1596,9 @@ class Pipeline: | ... | @@ -1518,7 +1596,9 @@ class Pipeline: |
1518 | if self.HOMOLOGY: | 1596 | if self.HOMOLOGY: |
1519 | # check if chains have been re_mapped: | 1597 | # check if chains have been re_mapped: |
1520 | r = sql_ask_database(conn, """SELECT COUNT(DISTINCT chain_id) AS Count, rfam_acc FROM chain | 1598 | r = sql_ask_database(conn, """SELECT COUNT(DISTINCT chain_id) AS Count, rfam_acc FROM chain |
1521 | - WHERE issue = 0 AND chain_id NOT IN (SELECT DISTINCT chain_id FROM re_mapping) | 1599 | + WHERE issue = 0 |
1600 | + AND rfam_acc != 'unmappd' | ||
1601 | + AND chain_id NOT IN (SELECT DISTINCT chain_id FROM re_mapping) | ||
1522 | GROUP BY rfam_acc;""") | 1602 | GROUP BY rfam_acc;""") |
1523 | try: | 1603 | try: |
1524 | if len(r) and r[0][0] is not None: | 1604 | if len(r) and r[0][0] is not None: |
... | @@ -1545,22 +1625,25 @@ class Pipeline: | ... | @@ -1545,22 +1625,25 @@ class Pipeline: |
1545 | 1625 | ||
1546 | 1626 | ||
1547 | def read_cpu_number(): | 1627 | def read_cpu_number(): |
1548 | - # As one shall not use os.cpu_count() on LXC containers, | 1628 | + """This function reads the number of CPU cores available from /proc/cpuinfo. |
1549 | - # because it reads info from /sys wich is not the VM resources but the host resources. | 1629 | + One shall not use os.cpu_count() on LXC containers, |
1550 | - # This function reads it from /proc/cpuinfo instead. | 1630 | + because it reads info from /sys wich is not the VM resources but the host resources. |
1631 | + """ | ||
1551 | p = subprocess.run(['grep', '-Ec', '(Intel|AMD)', '/proc/cpuinfo'], stdout=subprocess.PIPE) | 1632 | p = subprocess.run(['grep', '-Ec', '(Intel|AMD)', '/proc/cpuinfo'], stdout=subprocess.PIPE) |
1552 | return int(int(p.stdout.decode('utf-8')[:-1])/2) | 1633 | return int(int(p.stdout.decode('utf-8')[:-1])/2) |
1553 | 1634 | ||
1635 | + | ||
1554 | def init_worker(tqdm_lock=None): | 1636 | def init_worker(tqdm_lock=None): |
1555 | signal.signal(signal.SIGINT, signal.SIG_IGN) | 1637 | signal.signal(signal.SIGINT, signal.SIG_IGN) |
1556 | if tqdm_lock is not None: | 1638 | if tqdm_lock is not None: |
1557 | tqdm.set_lock(tqdm_lock) | 1639 | tqdm.set_lock(tqdm_lock) |
1558 | 1640 | ||
1641 | + | ||
1559 | def warn(message, error=False): | 1642 | def warn(message, error=False): |
1560 | """Pretty-print warnings and error messages. | 1643 | """Pretty-print warnings and error messages. |
1561 | """ | 1644 | """ |
1562 | # Cut if too long | 1645 | # Cut if too long |
1563 | - if len(message)>66: | 1646 | + if len(message) > 66: |
1564 | x = message.find(' ', 50, 66) | 1647 | x = message.find(' ', 50, 66) |
1565 | if x != -1: | 1648 | if x != -1: |
1566 | warn(message[:x], error=error) | 1649 | warn(message[:x], error=error) |
... | @@ -1574,11 +1657,13 @@ def warn(message, error=False): | ... | @@ -1574,11 +1657,13 @@ def warn(message, error=False): |
1574 | else: | 1657 | else: |
1575 | print(f"\t> \033[33mWARN: {message:64s}\033[0m\t{warnsymb}", flush=True) | 1658 | print(f"\t> \033[33mWARN: {message:64s}\033[0m\t{warnsymb}", flush=True) |
1576 | 1659 | ||
1660 | + | ||
1577 | def notify(message, post=''): | 1661 | def notify(message, post=''): |
1578 | if len(post): | 1662 | if len(post): |
1579 | post = '(' + post + ')' | 1663 | post = '(' + post + ')' |
1580 | print(f"\t> {message:70s}\t{validsymb}\t{post}", flush=True) | 1664 | print(f"\t> {message:70s}\t{validsymb}\t{post}", flush=True) |
1581 | 1665 | ||
1666 | + | ||
1582 | def sql_define_tables(conn): | 1667 | def sql_define_tables(conn): |
1583 | conn.executescript( | 1668 | conn.executescript( |
1584 | """ PRAGMA foreign_keys = on; | 1669 | """ PRAGMA foreign_keys = on; |
... | @@ -1684,8 +1769,9 @@ def sql_define_tables(conn): | ... | @@ -1684,8 +1769,9 @@ def sql_define_tables(conn): |
1684 | """) | 1769 | """) |
1685 | conn.commit() | 1770 | conn.commit() |
1686 | 1771 | ||
1772 | + | ||
1687 | @trace_unhandled_exceptions | 1773 | @trace_unhandled_exceptions |
1688 | -def sql_ask_database(conn, sql, warn_every = 10): | 1774 | +def sql_ask_database(conn, sql, warn_every=10): |
1689 | """ | 1775 | """ |
1690 | Reads the SQLite database. | 1776 | Reads the SQLite database. |
1691 | Returns a list of tuples. | 1777 | Returns a list of tuples. |
... | @@ -1698,11 +1784,13 @@ def sql_ask_database(conn, sql, warn_every = 10): | ... | @@ -1698,11 +1784,13 @@ def sql_ask_database(conn, sql, warn_every = 10): |
1698 | return result # if it worked, no need to retry | 1784 | return result # if it worked, no need to retry |
1699 | except sqlite3.OperationalError as e: | 1785 | except sqlite3.OperationalError as e: |
1700 | if warn_every and not (_+1) % warn_every: | 1786 | if warn_every and not (_+1) % warn_every: |
1701 | - warn(str(e) + ", retrying in 0.2s (worker " + str(os.getpid()) + f', try {_+1}/100)') | 1787 | + warn(str(e) + ", retrying in 0.2s (worker " + |
1788 | + str(os.getpid()) + f', try {_+1}/100)') | ||
1702 | time.sleep(0.2) | 1789 | time.sleep(0.2) |
1703 | warn("Tried to reach database 100 times and failed. Aborting.", error=True) | 1790 | warn("Tried to reach database 100 times and failed. Aborting.", error=True) |
1704 | return [] | 1791 | return [] |
1705 | 1792 | ||
1793 | + | ||
1706 | @trace_unhandled_exceptions | 1794 | @trace_unhandled_exceptions |
1707 | def sql_execute(conn, sql, many=False, data=None, warn_every=10): | 1795 | def sql_execute(conn, sql, many=False, data=None, warn_every=10): |
1708 | conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query | 1796 | conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query |
... | @@ -1721,10 +1809,12 @@ def sql_execute(conn, sql, many=False, data=None, warn_every=10): | ... | @@ -1721,10 +1809,12 @@ def sql_execute(conn, sql, many=False, data=None, warn_every=10): |
1721 | return # if it worked, no need to retry | 1809 | return # if it worked, no need to retry |
1722 | except sqlite3.OperationalError as e: | 1810 | except sqlite3.OperationalError as e: |
1723 | if warn_every and not (_+1) % warn_every: | 1811 | if warn_every and not (_+1) % warn_every: |
1724 | - warn(str(e) + ", retrying in 0.2s (worker " + str(os.getpid()) + f', try {_+1}/100)') | 1812 | + warn(str(e) + ", retrying in 0.2s (worker " + |
1813 | + str(os.getpid()) + f', try {_+1}/100)') | ||
1725 | time.sleep(0.2) | 1814 | time.sleep(0.2) |
1726 | warn("Tried to reach database 100 times and failed. Aborting.", error=True) | 1815 | warn("Tried to reach database 100 times and failed. Aborting.", error=True) |
1727 | 1816 | ||
1817 | + | ||
1728 | @trace_unhandled_exceptions | 1818 | @trace_unhandled_exceptions |
1729 | def execute_job(j, jobcount): | 1819 | def execute_job(j, jobcount): |
1730 | """Run a Job object. | 1820 | """Run a Job object. |
... | @@ -1741,7 +1831,8 @@ def execute_job(j, jobcount): | ... | @@ -1741,7 +1831,8 @@ def execute_job(j, jobcount): |
1741 | print(f"[{running_stats[0]+running_stats[2]}/{jobcount}]\t{j.label}") | 1831 | print(f"[{running_stats[0]+running_stats[2]}/{jobcount}]\t{j.label}") |
1742 | 1832 | ||
1743 | # Add the command to logfile | 1833 | # Add the command to logfile |
1744 | - logfile = open(runDir + "/log_of_the_run.sh", 'a') | 1834 | + os.makedirs(runDir+"/logs", exist_ok=True) |
1835 | + logfile = open(runDir + "/logs/log_of_the_run.sh", 'a') | ||
1745 | logfile.write(" ".join(j.cmd_)) | 1836 | logfile.write(" ".join(j.cmd_)) |
1746 | logfile.write("\n") | 1837 | logfile.write("\n") |
1747 | logfile.close() | 1838 | logfile.close() |
... | @@ -1753,7 +1844,8 @@ def execute_job(j, jobcount): | ... | @@ -1753,7 +1844,8 @@ def execute_job(j, jobcount): |
1753 | 1844 | ||
1754 | # run the command. subprocess.run will be a child of this process, and stays monitored. | 1845 | # run the command. subprocess.run will be a child of this process, and stays monitored. |
1755 | start_time = time.time() | 1846 | start_time = time.time() |
1756 | - r = subprocess.run(j.cmd_, timeout=j.timeout_, stdout=subprocess.PIPE, stderr=subprocess.PIPE) | 1847 | + r = subprocess.run(j.cmd_, timeout=j.timeout_, |
1848 | + stdout=subprocess.PIPE, stderr=subprocess.PIPE) | ||
1757 | end_time = time.time() | 1849 | end_time = time.time() |
1758 | 1850 | ||
1759 | # Stop the Monitor, then get its result | 1851 | # Stop the Monitor, then get its result |
... | @@ -1782,7 +1874,8 @@ def execute_job(j, jobcount): | ... | @@ -1782,7 +1874,8 @@ def execute_job(j, jobcount): |
1782 | 1874 | ||
1783 | # return time and memory statistics, plus the job results | 1875 | # return time and memory statistics, plus the job results |
1784 | t = end_time - start_time | 1876 | t = end_time - start_time |
1785 | - return (t,m,r) | 1877 | + return (t, m, r) |
1878 | + | ||
1786 | 1879 | ||
1787 | def execute_joblist(fulljoblist): | 1880 | def execute_joblist(fulljoblist): |
1788 | """ Run a list of job objects. | 1881 | """ Run a list of job objects. |
... | @@ -1815,8 +1908,9 @@ def execute_joblist(fulljoblist): | ... | @@ -1815,8 +1908,9 @@ def execute_joblist(fulljoblist): |
1815 | 1908 | ||
1816 | # Process the jobs from priority 1 to nprio | 1909 | # Process the jobs from priority 1 to nprio |
1817 | results = [] | 1910 | results = [] |
1818 | - for i in range(1,nprio+1): | 1911 | + for i in range(1, nprio+1): |
1819 | - if i not in jobs.keys(): continue # no job has the priority level i | 1912 | + if i not in jobs.keys(): |
1913 | + continue # no job has the priority level i | ||
1820 | 1914 | ||
1821 | print("processing jobs of priority", i) | 1915 | print("processing jobs of priority", i) |
1822 | different_thread_numbers = sorted(jobs[i].keys()) | 1916 | different_thread_numbers = sorted(jobs[i].keys()) |
... | @@ -1825,7 +1919,8 @@ def execute_joblist(fulljoblist): | ... | @@ -1825,7 +1919,8 @@ def execute_joblist(fulljoblist): |
1825 | for n in different_thread_numbers: | 1919 | for n in different_thread_numbers: |
1826 | # get the bunch of jobs of same priority and thread number | 1920 | # get the bunch of jobs of same priority and thread number |
1827 | bunch = jobs[i][n] | 1921 | bunch = jobs[i][n] |
1828 | - if not len(bunch): continue # no jobs should be processed n by n | 1922 | + if not len(bunch): |
1923 | + continue # no jobs should be processed n by n | ||
1829 | 1924 | ||
1830 | print("using", n, "processes:") | 1925 | print("using", n, "processes:") |
1831 | # execute jobs of priority i that should be processed n by n: | 1926 | # execute jobs of priority i that should be processed n by n: |
... | @@ -1843,13 +1938,14 @@ def execute_joblist(fulljoblist): | ... | @@ -1843,13 +1938,14 @@ def execute_joblist(fulljoblist): |
1843 | for j, r in zip(bunch, raw_results): | 1938 | for j, r in zip(bunch, raw_results): |
1844 | j.comp_time = round(r[0], 2) # seconds | 1939 | j.comp_time = round(r[0], 2) # seconds |
1845 | j.max_mem = int(r[1]/1000000) # MB | 1940 | j.max_mem = int(r[1]/1000000) # MB |
1846 | - results.append( (j.label, r[2], round(r[0], 2), int(r[1]/1000000))) | 1941 | + results.append((j.label, r[2], round(r[0], 2), int(r[1]/1000000))) |
1847 | 1942 | ||
1848 | # throw back the money | 1943 | # throw back the money |
1849 | return results | 1944 | return results |
1850 | 1945 | ||
1946 | + | ||
1851 | @trace_unhandled_exceptions | 1947 | @trace_unhandled_exceptions |
1852 | -def work_infer_mappings(update_only, allmappings, codelist) -> list: | 1948 | +def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> list: |
1853 | """Given a list of PDB chains corresponding to an equivalence class from BGSU's NR list, | 1949 | """Given a list of PDB chains corresponding to an equivalence class from BGSU's NR list, |
1854 | build a list of Chain() objects mapped to Rfam families, by expanding available mappings | 1950 | build a list of Chain() objects mapped to Rfam families, by expanding available mappings |
1855 | of any element of the list to all the list elements. | 1951 | of any element of the list to all the list elements. |
... | @@ -1862,13 +1958,13 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list: | ... | @@ -1862,13 +1958,13 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list: |
1862 | 1958 | ||
1863 | # Split the comma-separated list of chain codes into chain codes: | 1959 | # Split the comma-separated list of chain codes into chain codes: |
1864 | eq_class = codelist[0] | 1960 | eq_class = codelist[0] |
1865 | - codes = codelist[1].replace('+',',').split(',') | 1961 | + codes = codelist[1].replace('+', ',').split(',') |
1866 | 1962 | ||
1867 | # Search for mappings that apply to an element of this PDB chains list: | 1963 | # Search for mappings that apply to an element of this PDB chains list: |
1868 | for c in codes: | 1964 | for c in codes: |
1869 | # search for Rfam mappings with this chain c: | 1965 | # search for Rfam mappings with this chain c: |
1870 | m_row_indices = allmappings.pdb_id + "|1|" + allmappings.chain == c[:4].lower()+c[4:] | 1966 | m_row_indices = allmappings.pdb_id + "|1|" + allmappings.chain == c[:4].lower()+c[4:] |
1871 | - m = allmappings.loc[m_row_indices].drop(['bit_score','evalue_score','cm_start','cm_end','hex_colour'], axis=1) | 1967 | + m = allmappings.loc[m_row_indices].drop(['bit_score', 'evalue_score', 'cm_start', 'cm_end', 'hex_colour'], axis=1) |
1872 | if len(m): | 1968 | if len(m): |
1873 | # remove the found mappings from the dataframe | 1969 | # remove the found mappings from the dataframe |
1874 | allmappings = allmappings.loc[m_row_indices == False] | 1970 | allmappings = allmappings.loc[m_row_indices == False] |
... | @@ -1881,7 +1977,7 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list: | ... | @@ -1881,7 +1977,7 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list: |
1881 | families = set(known_mappings['rfam_acc']) | 1977 | families = set(known_mappings['rfam_acc']) |
1882 | 1978 | ||
1883 | # generalize | 1979 | # generalize |
1884 | - inferred_mappings = known_mappings.drop(['pdb_id','chain'], axis=1).drop_duplicates() | 1980 | + inferred_mappings = known_mappings.drop(['pdb_id', 'chain'], axis=1).drop_duplicates() |
1885 | 1981 | ||
1886 | # check for approximative redundancy: | 1982 | # check for approximative redundancy: |
1887 | if len(inferred_mappings) != len(inferred_mappings.drop_duplicates(subset="rfam_acc")): | 1983 | if len(inferred_mappings) != len(inferred_mappings.drop_duplicates(subset="rfam_acc")): |
... | @@ -1890,11 +1986,11 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list: | ... | @@ -1890,11 +1986,11 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list: |
1890 | # ==> Summarize them in one mapping but with the largest window. | 1986 | # ==> Summarize them in one mapping but with the largest window. |
1891 | for rfam in families: | 1987 | for rfam in families: |
1892 | sel_5_to_3 = (inferred_mappings['pdb_start'] < inferred_mappings['pdb_end']) | 1988 | sel_5_to_3 = (inferred_mappings['pdb_start'] < inferred_mappings['pdb_end']) |
1893 | - thisfam_5_3 = (inferred_mappings['rfam_acc'] == rfam ) & sel_5_to_3 | 1989 | + thisfam_5_3 = (inferred_mappings['rfam_acc'] == rfam) & sel_5_to_3 |
1894 | - thisfam_3_5 = (inferred_mappings['rfam_acc'] == rfam ) & (sel_5_to_3 == False) | 1990 | + thisfam_3_5 = (inferred_mappings['rfam_acc'] == rfam) & (sel_5_to_3 == False) |
1895 | 1991 | ||
1896 | if ( | 1992 | if ( |
1897 | - len(inferred_mappings[thisfam_5_3]) != len(inferred_mappings[ inferred_mappings['rfam_acc'] == rfam ]) | 1993 | + len(inferred_mappings[thisfam_5_3]) != len(inferred_mappings[inferred_mappings['rfam_acc'] == rfam]) |
1898 | and len(inferred_mappings[thisfam_5_3]) > 0 | 1994 | and len(inferred_mappings[thisfam_5_3]) > 0 |
1899 | ): | 1995 | ): |
1900 | # there are mappings in both directions... wtf Rfam ?! | 1996 | # there are mappings in both directions... wtf Rfam ?! |
... | @@ -1908,8 +2004,8 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list: | ... | @@ -1908,8 +2004,8 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list: |
1908 | # We keep only the 5->3 sense. | 2004 | # We keep only the 5->3 sense. |
1909 | inferred_mappings = inferred_mappings.drop(index=inferred_mappings.index[thisfam_3_5]) | 2005 | inferred_mappings = inferred_mappings.drop(index=inferred_mappings.index[thisfam_3_5]) |
1910 | sel_5_to_3 = (inferred_mappings['pdb_start'] < inferred_mappings['pdb_end']) | 2006 | sel_5_to_3 = (inferred_mappings['pdb_start'] < inferred_mappings['pdb_end']) |
1911 | - thisfam_5_3 = (inferred_mappings['rfam_acc'] == rfam ) & sel_5_to_3 | 2007 | + thisfam_5_3 = (inferred_mappings['rfam_acc'] == rfam) & sel_5_to_3 |
1912 | - thisfam_3_5 = (inferred_mappings['rfam_acc'] == rfam ) & (sel_5_to_3 == False) | 2008 | + thisfam_3_5 = (inferred_mappings['rfam_acc'] == rfam) & (sel_5_to_3 == False) |
1913 | print() | 2009 | print() |
1914 | warn(f"Found mappings to {rfam} in both directions on the same interval, keeping only the 5'->3' one.") | 2010 | warn(f"Found mappings to {rfam} in both directions on the same interval, keeping only the 5'->3' one.") |
1915 | else: | 2011 | else: |
... | @@ -1919,35 +2015,35 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list: | ... | @@ -1919,35 +2015,35 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list: |
1919 | 2015 | ||
1920 | # Compute consensus for chains in 5' -> 3' sense | 2016 | # Compute consensus for chains in 5' -> 3' sense |
1921 | if len(inferred_mappings[thisfam_5_3]): | 2017 | if len(inferred_mappings[thisfam_5_3]): |
1922 | - pdb_start_min = min(inferred_mappings[ thisfam_5_3]['pdb_start']) | 2018 | + pdb_start_min = min(inferred_mappings[thisfam_5_3]['pdb_start']) |
1923 | - pdb_end_max = max(inferred_mappings[ thisfam_5_3]['pdb_end']) | 2019 | + pdb_end_max = max(inferred_mappings[thisfam_5_3]['pdb_end']) |
1924 | - pdb_start_max = max(inferred_mappings[ thisfam_5_3]['pdb_start']) | 2020 | + pdb_start_max = max(inferred_mappings[thisfam_5_3]['pdb_start']) |
1925 | - pdb_end_min = min(inferred_mappings[ thisfam_5_3]['pdb_end']) | 2021 | + pdb_end_min = min(inferred_mappings[thisfam_5_3]['pdb_end']) |
1926 | if (pdb_start_max - pdb_start_min < 100) and (pdb_end_max - pdb_end_min < 100): | 2022 | if (pdb_start_max - pdb_start_min < 100) and (pdb_end_max - pdb_end_min < 100): |
1927 | # the variation is only a few nucleotides, we take the largest window. | 2023 | # the variation is only a few nucleotides, we take the largest window. |
1928 | - inferred_mappings.loc[ thisfam_5_3, 'pdb_start'] = pdb_start_min | 2024 | + inferred_mappings.loc[thisfam_5_3, 'pdb_start'] = pdb_start_min |
1929 | - inferred_mappings.loc[ thisfam_5_3, 'pdb_end'] = pdb_end_max | 2025 | + inferred_mappings.loc[thisfam_5_3, 'pdb_end'] = pdb_end_max |
1930 | else: | 2026 | else: |
1931 | # there probably is an outlier. We chose the median value in the whole list of known_mappings. | 2027 | # there probably is an outlier. We chose the median value in the whole list of known_mappings. |
1932 | - known_sel_5_to_3 = (known_mappings['rfam_acc'] == rfam ) & (known_mappings['pdb_start'] < known_mappings['pdb_end']) | 2028 | + known_sel_5_to_3 = (known_mappings['rfam_acc'] == rfam) & (known_mappings['pdb_start'] < known_mappings['pdb_end']) |
1933 | - inferred_mappings.loc[ thisfam_5_3, 'pdb_start'] = known_mappings.loc[known_sel_5_to_3, 'pdb_start'].median() | 2029 | + inferred_mappings.loc[thisfam_5_3, 'pdb_start'] = known_mappings.loc[known_sel_5_to_3, 'pdb_start'].median() |
1934 | - inferred_mappings.loc[ thisfam_5_3, 'pdb_end'] = known_mappings.loc[known_sel_5_to_3, 'pdb_end'].median() | 2030 | + inferred_mappings.loc[thisfam_5_3, 'pdb_end'] = known_mappings.loc[known_sel_5_to_3, 'pdb_end'].median() |
1935 | 2031 | ||
1936 | # Compute consensus for chains in 3' -> 5' sense | 2032 | # Compute consensus for chains in 3' -> 5' sense |
1937 | if len(inferred_mappings[thisfam_3_5]): | 2033 | if len(inferred_mappings[thisfam_3_5]): |
1938 | - pdb_start_min = min(inferred_mappings[ thisfam_3_5]['pdb_start']) | 2034 | + pdb_start_min = min(inferred_mappings[thisfam_3_5]['pdb_start']) |
1939 | - pdb_end_max = max(inferred_mappings[ thisfam_3_5]['pdb_end']) | 2035 | + pdb_end_max = max(inferred_mappings[thisfam_3_5]['pdb_end']) |
1940 | - pdb_start_max = max(inferred_mappings[ thisfam_3_5]['pdb_start']) | 2036 | + pdb_start_max = max(inferred_mappings[thisfam_3_5]['pdb_start']) |
1941 | - pdb_end_min = min(inferred_mappings[ thisfam_3_5]['pdb_end']) | 2037 | + pdb_end_min = min(inferred_mappings[thisfam_3_5]['pdb_end']) |
1942 | if (pdb_start_max - pdb_start_min < 100) and (pdb_end_max - pdb_end_min < 100): | 2038 | if (pdb_start_max - pdb_start_min < 100) and (pdb_end_max - pdb_end_min < 100): |
1943 | # the variation is only a few nucleotides, we take the largest window. | 2039 | # the variation is only a few nucleotides, we take the largest window. |
1944 | - inferred_mappings.loc[ thisfam_3_5, 'pdb_start'] = pdb_start_max | 2040 | + inferred_mappings.loc[thisfam_3_5, 'pdb_start'] = pdb_start_max |
1945 | - inferred_mappings.loc[ thisfam_3_5, 'pdb_end'] = pdb_end_min | 2041 | + inferred_mappings.loc[thisfam_3_5, 'pdb_end'] = pdb_end_min |
1946 | else: | 2042 | else: |
1947 | # there probably is an outlier. We chose the median value in the whole list of known_mappings. | 2043 | # there probably is an outlier. We chose the median value in the whole list of known_mappings. |
1948 | - known_sel_3_to_5 = (known_mappings['rfam_acc'] == rfam ) & (known_mappings['pdb_start'] > known_mappings['pdb_end']) | 2044 | + known_sel_3_to_5 = (known_mappings['rfam_acc'] == rfam) & (known_mappings['pdb_start'] > known_mappings['pdb_end']) |
1949 | - inferred_mappings.loc[ thisfam_3_5, 'pdb_start'] = known_mappings.loc[known_sel_3_to_5, 'pdb_start'].median() | 2045 | + inferred_mappings.loc[thisfam_3_5, 'pdb_start'] = known_mappings.loc[known_sel_3_to_5, 'pdb_start'].median() |
1950 | - inferred_mappings.loc[ thisfam_3_5, 'pdb_end'] = known_mappings.loc[known_sel_3_to_5, 'pdb_end'].median() | 2046 | + inferred_mappings.loc[thisfam_3_5, 'pdb_end'] = known_mappings.loc[known_sel_3_to_5, 'pdb_end'].median() |
1951 | inferred_mappings.drop_duplicates(inplace=True) | 2047 | inferred_mappings.drop_duplicates(inplace=True) |
1952 | 2048 | ||
1953 | # Now build Chain() objects for the mapped chains | 2049 | # Now build Chain() objects for the mapped chains |
... | @@ -1958,7 +2054,8 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list: | ... | @@ -1958,7 +2054,8 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list: |
1958 | pdb_chain_id = nr[2] | 2054 | pdb_chain_id = nr[2] |
1959 | for rfam in families: | 2055 | for rfam in families: |
1960 | # if a known mapping of this chain on this family exists, apply it | 2056 | # if a known mapping of this chain on this family exists, apply it |
1961 | - m = known_mappings.loc[ (known_mappings.pdb_id + "|1|" + known_mappings.chain == c[:4].lower()+c[4:]) & (known_mappings['rfam_acc'] == rfam ) ] | 2057 | + this_chain_idxs = (known_mappings.pdb_id + "|1|" + known_mappings.chain == c[:4].lower()+c[4:]) |
2058 | + m = known_mappings.loc[this_chain_idxs & (known_mappings['rfam_acc'] == rfam)] | ||
1962 | if len(m) and len(m) < 2: | 2059 | if len(m) and len(m) < 2: |
1963 | pdb_start = int(m.pdb_start) | 2060 | pdb_start = int(m.pdb_start) |
1964 | pdb_end = int(m.pdb_end) | 2061 | pdb_end = int(m.pdb_end) |
... | @@ -1969,23 +2066,35 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list: | ... | @@ -1969,23 +2066,35 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list: |
1969 | pdb_start = int(m.pdb_start.min()) | 2066 | pdb_start = int(m.pdb_start.min()) |
1970 | pdb_end = int(m.pdb_end.max()) | 2067 | pdb_end = int(m.pdb_end.max()) |
1971 | inferred = False | 2068 | inferred = False |
1972 | - elif not(pdb_id in known_mappings.pdb_id and pdb_chain_id in known_mappings.chain): # if no known mapping on another family, use the inferred mapping | 2069 | + elif (fullinference or not(this_chain_idxs.any())): |
1973 | - pdb_start = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_start) | 2070 | + # if no known mapping on another family, use the inferred mapping |
1974 | - pdb_end = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_end) | 2071 | + # idem if the user said to do so with --full-inference |
2072 | + pdb_start = int(inferred_mappings.loc[(inferred_mappings['rfam_acc'] == rfam)].pdb_start) | ||
2073 | + pdb_end = int(inferred_mappings.loc[(inferred_mappings['rfam_acc'] == rfam)].pdb_end) | ||
1975 | inferred = True | 2074 | inferred = True |
2075 | + else: | ||
2076 | + # skip this family, we cannot map this chain to it. | ||
2077 | + continue | ||
1976 | chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}_{pdb_start}-{pdb_end}" | 2078 | chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}_{pdb_start}-{pdb_end}" |
1977 | 2079 | ||
1978 | # Check if the chain exists in the database | 2080 | # Check if the chain exists in the database |
1979 | if update_only: | 2081 | if update_only: |
1980 | with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: | 2082 | with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: |
1981 | - res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc='{rfam}' AND issue=0""") | 2083 | + res = sql_ask_database(conn, f"""SELECT chain_id from chain |
2084 | + WHERE structure_id='{pdb_id}' | ||
2085 | + AND chain_name='{pdb_chain_id}' | ||
2086 | + AND rfam_acc='{rfam}' | ||
2087 | + AND issue=0""") | ||
1982 | if not len(res): # the chain is NOT yet in the database, or this is a known issue | 2088 | if not len(res): # the chain is NOT yet in the database, or this is a known issue |
1983 | - newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end)) | 2089 | + newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, |
2090 | + rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end)) | ||
1984 | else: | 2091 | else: |
1985 | - newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end)) | 2092 | + newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, |
2093 | + rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end)) | ||
1986 | 2094 | ||
1987 | return newchains | 2095 | return newchains |
1988 | 2096 | ||
2097 | + | ||
1989 | @trace_unhandled_exceptions | 2098 | @trace_unhandled_exceptions |
1990 | def work_mmcif(pdb_id): | 2099 | def work_mmcif(pdb_id): |
1991 | """ Look for a CIF file (with all chains) from RCSB | 2100 | """ Look for a CIF file (with all chains) from RCSB |
... | @@ -1999,8 +2108,11 @@ def work_mmcif(pdb_id): | ... | @@ -1999,8 +2108,11 @@ def work_mmcif(pdb_id): |
1999 | 2108 | ||
2000 | # Attempt to download it if not present | 2109 | # Attempt to download it if not present |
2001 | try: | 2110 | try: |
2002 | - if not path.isfile(final_filepath): | 2111 | + if not os.path.isfile(final_filepath): |
2003 | - subprocess.run(["wget", f'http://files.rcsb.org/download/{pdb_id}.cif', "-O", final_filepath], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) | 2112 | + subprocess.run( |
2113 | + ["wget", f'http://files.rcsb.org/download/{pdb_id}.cif', "-O", final_filepath], | ||
2114 | + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL | ||
2115 | + ) | ||
2004 | except: | 2116 | except: |
2005 | warn(f"Unable to download {pdb_id}.cif. Ignoring it.", error=True) | 2117 | warn(f"Unable to download {pdb_id}.cif. Ignoring it.", error=True) |
2006 | return | 2118 | return |
... | @@ -2012,7 +2124,7 @@ def work_mmcif(pdb_id): | ... | @@ -2012,7 +2124,7 @@ def work_mmcif(pdb_id): |
2012 | # if not, read the CIF header and register the structure | 2124 | # if not, read the CIF header and register the structure |
2013 | if not len(r): | 2125 | if not len(r): |
2014 | # Load the MMCIF file with Biopython | 2126 | # Load the MMCIF file with Biopython |
2015 | - mmCif_info = MMCIF2Dict(final_filepath) | 2127 | + mmCif_info = Bio.PDB.MMCIF2Dict.MMCIF2Dict(final_filepath) |
2016 | 2128 | ||
2017 | # Get info about that structure | 2129 | # Get info about that structure |
2018 | try: | 2130 | try: |
... | @@ -2036,9 +2148,9 @@ def work_mmcif(pdb_id): | ... | @@ -2036,9 +2148,9 @@ def work_mmcif(pdb_id): |
2036 | # Save into the database | 2148 | # Save into the database |
2037 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | 2149 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
2038 | sql_execute(conn, """INSERT OR REPLACE INTO structure (pdb_id, pdb_model, date, exp_method, resolution) | 2150 | sql_execute(conn, """INSERT OR REPLACE INTO structure (pdb_id, pdb_model, date, exp_method, resolution) |
2039 | - VALUES (?, ?, DATE(?), ?, ?);""", data = (pdb_id, 1, date, exp_meth, reso)) | 2151 | + VALUES (?, ?, DATE(?), ?, ?);""", data=(pdb_id, 1, date, exp_meth, reso)) |
2040 | 2152 | ||
2041 | - if not path.isfile(path_to_3D_data + "annotations/" + pdb_id + ".json"): | 2153 | + if not os.path.isfile(path_to_3D_data + "annotations/" + pdb_id + ".json"): |
2042 | 2154 | ||
2043 | # run DSSR (you need to have it in your $PATH, follow x3dna installation instructions) | 2155 | # run DSSR (you need to have it in your $PATH, follow x3dna installation instructions) |
2044 | output = subprocess.run(["x3dna-dssr", f"-i={final_filepath}", "--json", "--auxfile=no"], | 2156 | output = subprocess.run(["x3dna-dssr", f"-i={final_filepath}", "--json", "--auxfile=no"], |
... | @@ -2052,22 +2164,23 @@ def work_mmcif(pdb_id): | ... | @@ -2052,22 +2164,23 @@ def work_mmcif(pdb_id): |
2052 | return 1 | 2164 | return 1 |
2053 | 2165 | ||
2054 | # save the analysis to file only if we can load it :/ | 2166 | # save the analysis to file only if we can load it :/ |
2055 | - json_file = open(path_to_3D_data + "annotations/" + pdb_id + ".json", "w") | 2167 | + json_file = open(path_to_3D_data + "annotations/" + |
2168 | + pdb_id + ".json", "w") | ||
2056 | json_file.write(stdout) | 2169 | json_file.write(stdout) |
2057 | json_file.close() | 2170 | json_file.close() |
2058 | 2171 | ||
2059 | return 0 | 2172 | return 0 |
2060 | 2173 | ||
2174 | + | ||
2061 | @trace_unhandled_exceptions | 2175 | @trace_unhandled_exceptions |
2062 | def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): | 2176 | def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): |
2063 | """Reads information from JSON and save it to database. | 2177 | """Reads information from JSON and save it to database. |
2064 | If asked, also extracts the 3D chains from their original structure files. | 2178 | If asked, also extracts the 3D chains from their original structure files. |
2065 | - | ||
2066 | """ | 2179 | """ |
2067 | 2180 | ||
2068 | setproctitle(f"RNAnet.py work_build_chain({c.chain_label})") | 2181 | setproctitle(f"RNAnet.py work_build_chain({c.chain_label})") |
2069 | 2182 | ||
2070 | - if not path.isfile(path_to_3D_data + "annotations/" + c.pdb_id + ".json"): | 2183 | + if not os.path.isfile(path_to_3D_data + "annotations/" + c.pdb_id + ".json"): |
2071 | warn(f"Could not find annotations for {c.chain_label}, ignoring it.", error=True) | 2184 | warn(f"Could not find annotations for {c.chain_label}, ignoring it.", error=True) |
2072 | c.delete_me = True | 2185 | c.delete_me = True |
2073 | c.error_messages += f"Could not download and/or find annotations for {c.chain_label}." | 2186 | c.error_messages += f"Could not download and/or find annotations for {c.chain_label}." |
... | @@ -2094,25 +2207,28 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): | ... | @@ -2094,25 +2207,28 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): |
2094 | 2207 | ||
2095 | return c | 2208 | return c |
2096 | 2209 | ||
2210 | + | ||
2097 | @trace_unhandled_exceptions | 2211 | @trace_unhandled_exceptions |
2098 | def work_prepare_sequences(dl, rfam_acc, chains): | 2212 | def work_prepare_sequences(dl, rfam_acc, chains): |
2099 | - """Prepares FASTA files of homologous sequences to realign with cmalign or SINA.""" | 2213 | + """Prepares FASTA files of homologous sequences to realign with cmalign or SINA. |
2214 | + """ | ||
2100 | 2215 | ||
2101 | setproctitle("RNAnet.py work_prepare_sequences()") | 2216 | setproctitle("RNAnet.py work_prepare_sequences()") |
2102 | 2217 | ||
2103 | if rfam_acc in LSU_set | SSU_set: # rRNA | 2218 | if rfam_acc in LSU_set | SSU_set: # rRNA |
2104 | - if path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"): | 2219 | + if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"): |
2105 | # Detect doublons and remove them | 2220 | # Detect doublons and remove them |
2106 | - existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta") | 2221 | + existing_afa = Bio.AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta") |
2107 | - existing_ids = [ r.id for r in existing_afa ] | 2222 | + existing_ids = [r.id for r in existing_afa] |
2108 | del existing_afa | 2223 | del existing_afa |
2109 | - new_ids = [ str(c) for c in chains ] | 2224 | + new_ids = [str(c) for c in chains] |
2110 | - doublons = [ i for i in existing_ids if i in new_ids ] | 2225 | + doublons = [i for i in existing_ids if i in new_ids] |
2111 | del existing_ids, new_ids | 2226 | del existing_ids, new_ids |
2112 | if len(doublons): | 2227 | if len(doublons): |
2113 | - fasta = path_to_seq_data + f"realigned/{rfam_acc}++.fa" | ||
2114 | warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.fa and using their newest version") | 2228 | warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.fa and using their newest version") |
2115 | - seqfile = SeqIO.parse(fasta, "fasta") | 2229 | + fasta = path_to_seq_data + f"realigned/{rfam_acc}++.fa" |
2230 | + seqfile = Bio.SeqIO.parse(fasta, "fasta") | ||
2231 | + # remove it and rewrite it with its own content filtered | ||
2116 | os.remove(fasta) | 2232 | os.remove(fasta) |
2117 | with open(fasta, 'w') as f: | 2233 | with open(fasta, 'w') as f: |
2118 | for rec in seqfile: | 2234 | for rec in seqfile: |
... | @@ -2123,16 +2239,15 @@ def work_prepare_sequences(dl, rfam_acc, chains): | ... | @@ -2123,16 +2239,15 @@ def work_prepare_sequences(dl, rfam_acc, chains): |
2123 | with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "a") as f: | 2239 | with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "a") as f: |
2124 | for c in chains: | 2240 | for c in chains: |
2125 | if len(c.seq_to_align): | 2241 | if len(c.seq_to_align): |
2126 | - f.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U','T')+'\n') | 2242 | + f.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U', 'T')+'\n') |
2127 | status = f"{rfam_acc}: {len(chains)} new PDB sequences to align (with SINA)" | 2243 | status = f"{rfam_acc}: {len(chains)} new PDB sequences to align (with SINA)" |
2128 | 2244 | ||
2129 | - | 2245 | + elif not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.stk"): |
2130 | - elif not path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.stk"): | ||
2131 | # there was no previous aligned sequences, and we use cmalign. | 2246 | # there was no previous aligned sequences, and we use cmalign. |
2132 | # So, we need to download homologous sequences from Rfam. | 2247 | # So, we need to download homologous sequences from Rfam. |
2133 | 2248 | ||
2134 | # Extracting covariance model for this family | 2249 | # Extracting covariance model for this family |
2135 | - if not path.isfile(path_to_seq_data + f"realigned/{rfam_acc}.cm"): | 2250 | + if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}.cm"): |
2136 | with open(path_to_seq_data + f"realigned/{rfam_acc}.cm", "w") as f: | 2251 | with open(path_to_seq_data + f"realigned/{rfam_acc}.cm", "w") as f: |
2137 | subprocess.run(["cmfetch", path_to_seq_data + "Rfam.cm", rfam_acc], stdout=f) | 2252 | subprocess.run(["cmfetch", path_to_seq_data + "Rfam.cm", rfam_acc], stdout=f) |
2138 | notify(f"Extracted {rfam_acc} covariance model (cmfetch)") | 2253 | notify(f"Extracted {rfam_acc} covariance model (cmfetch)") |
... | @@ -2141,7 +2256,7 @@ def work_prepare_sequences(dl, rfam_acc, chains): | ... | @@ -2141,7 +2256,7 @@ def work_prepare_sequences(dl, rfam_acc, chains): |
2141 | dl.download_Rfam_sequences(rfam_acc) | 2256 | dl.download_Rfam_sequences(rfam_acc) |
2142 | 2257 | ||
2143 | # Prepare a FASTA file containing Rfamseq hits for that family | 2258 | # Prepare a FASTA file containing Rfamseq hits for that family |
2144 | - if path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"): # test if download succeeded | 2259 | + if os.path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"): # test if download succeeded |
2145 | 2260 | ||
2146 | # gunzip the file | 2261 | # gunzip the file |
2147 | with gzip.open(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz", 'rb') as gz: | 2262 | with gzip.open(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz", 'rb') as gz: |
... | @@ -2153,14 +2268,14 @@ def work_prepare_sequences(dl, rfam_acc, chains): | ... | @@ -2153,14 +2268,14 @@ def work_prepare_sequences(dl, rfam_acc, chains): |
2153 | with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "w") as plusplus: | 2268 | with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "w") as plusplus: |
2154 | ids = set() | 2269 | ids = set() |
2155 | # Remove doublons from the Rfam hits | 2270 | # Remove doublons from the Rfam hits |
2156 | - for r in SeqIO.parse(path_to_seq_data + f"realigned/{rfam_acc}.fa", "fasta"): | 2271 | + for r in Bio.SeqIO.parse(path_to_seq_data + f"realigned/{rfam_acc}.fa", "fasta"): |
2157 | if r.id not in ids: | 2272 | if r.id not in ids: |
2158 | ids.add(r.id) | 2273 | ids.add(r.id) |
2159 | plusplus.write('> '+r.description+'\n'+str(r.seq)+'\n') | 2274 | plusplus.write('> '+r.description+'\n'+str(r.seq)+'\n') |
2160 | # Add the 3D chains sequences | 2275 | # Add the 3D chains sequences |
2161 | for c in chains: | 2276 | for c in chains: |
2162 | if len(c.seq_to_align): | 2277 | if len(c.seq_to_align): |
2163 | - plusplus.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U','T')+'\n') | 2278 | + plusplus.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U', 'T')+'\n') |
2164 | 2279 | ||
2165 | del file_content | 2280 | del file_content |
2166 | # os.remove(path_to_seq_data + f"realigned/{rfam_acc}.fa") | 2281 | # os.remove(path_to_seq_data + f"realigned/{rfam_acc}.fa") |
... | @@ -2175,12 +2290,13 @@ def work_prepare_sequences(dl, rfam_acc, chains): | ... | @@ -2175,12 +2290,13 @@ def work_prepare_sequences(dl, rfam_acc, chains): |
2175 | with open(path_to_seq_data + f"realigned/{rfam_acc}_new.fa", "w") as f: | 2290 | with open(path_to_seq_data + f"realigned/{rfam_acc}_new.fa", "w") as f: |
2176 | for c in chains: | 2291 | for c in chains: |
2177 | if len(c.seq_to_align): | 2292 | if len(c.seq_to_align): |
2178 | - f.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U','T')+'\n') | 2293 | + f.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U', 'T')+'\n') |
2179 | status = f"{rfam_acc}: {len(chains)} new PDB sequences to realign (with existing cmalign alignment)" | 2294 | status = f"{rfam_acc}: {len(chains)} new PDB sequences to realign (with existing cmalign alignment)" |
2180 | 2295 | ||
2181 | # print some stats | 2296 | # print some stats |
2182 | notify(status) | 2297 | notify(status) |
2183 | 2298 | ||
2299 | + | ||
2184 | @trace_unhandled_exceptions | 2300 | @trace_unhandled_exceptions |
2185 | def work_realign(rfam_acc): | 2301 | def work_realign(rfam_acc): |
2186 | """ Runs multiple sequence alignements by RNA family. | 2302 | """ Runs multiple sequence alignements by RNA family. |
... | @@ -2209,10 +2325,10 @@ def work_realign(rfam_acc): | ... | @@ -2209,10 +2325,10 @@ def work_realign(rfam_acc): |
2209 | else: | 2325 | else: |
2210 | # Align using Infernal for most RNA families | 2326 | # Align using Infernal for most RNA families |
2211 | 2327 | ||
2212 | - if path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"): | 2328 | + if os.path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"): |
2213 | # Alignment exists. We just want to add new sequences into it. | 2329 | # Alignment exists. We just want to add new sequences into it. |
2214 | 2330 | ||
2215 | - if not path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"): | 2331 | + if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"): |
2216 | # there are no new sequences to align... | 2332 | # there are no new sequences to align... |
2217 | return | 2333 | return |
2218 | 2334 | ||
... | @@ -2227,13 +2343,13 @@ def work_realign(rfam_acc): | ... | @@ -2227,13 +2343,13 @@ def work_realign(rfam_acc): |
2227 | notify("Aligned new sequences together") | 2343 | notify("Aligned new sequences together") |
2228 | 2344 | ||
2229 | # Detect doublons and remove them | 2345 | # Detect doublons and remove them |
2230 | - existing_stk = AlignIO.read(existing_ali_path, "stockholm") | 2346 | + existing_stk = Bio.AlignIO.read(existing_ali_path, "stockholm") |
2231 | - existing_ids = [ r.id for r in existing_stk ] | 2347 | + existing_ids = [r.id for r in existing_stk] |
2232 | del existing_stk | 2348 | del existing_stk |
2233 | - new_stk = AlignIO.read(new_ali_path, "stockholm") | 2349 | + new_stk = Bio.AlignIO.read(new_ali_path, "stockholm") |
2234 | - new_ids = [ r.id for r in new_stk ] | 2350 | + new_ids = [r.id for r in new_stk] |
2235 | del new_stk | 2351 | del new_stk |
2236 | - doublons = [ i for i in existing_ids if i in new_ids ] | 2352 | + doublons = [i for i in existing_ids if i in new_ids] |
2237 | del existing_ids, new_ids | 2353 | del existing_ids, new_ids |
2238 | if len(doublons): | 2354 | if len(doublons): |
2239 | warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version") | 2355 | warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version") |
... | @@ -2241,12 +2357,13 @@ def work_realign(rfam_acc): | ... | @@ -2241,12 +2357,13 @@ def work_realign(rfam_acc): |
2241 | toremove.write('\n'.join(doublons)+'\n') | 2357 | toremove.write('\n'.join(doublons)+'\n') |
2242 | p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path], | 2358 | p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path], |
2243 | stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | 2359 | stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) |
2244 | - p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path], stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | 2360 | + p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path], |
2361 | + stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | ||
2245 | os.remove(path_to_seq_data + "realigned/toremove.txt") | 2362 | os.remove(path_to_seq_data + "realigned/toremove.txt") |
2246 | 2363 | ||
2247 | # And we merge the two alignments | 2364 | # And we merge the two alignments |
2248 | - p2= subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", | 2365 | + p2 = subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", |
2249 | - "--rna", existing_ali_path, new_ali_path ], | 2366 | + "--rna", existing_ali_path, new_ali_path], |
2250 | stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | 2367 | stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) |
2251 | stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8') | 2368 | stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8') |
2252 | subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path]) | 2369 | subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path]) |
... | @@ -2263,7 +2380,7 @@ def work_realign(rfam_acc): | ... | @@ -2263,7 +2380,7 @@ def work_realign(rfam_acc): |
2263 | p = subprocess.run(["cmalign", "--small", "--cyk", "--noprob", "--nonbanded", "--notrunc", | 2380 | p = subprocess.run(["cmalign", "--small", "--cyk", "--noprob", "--nonbanded", "--notrunc", |
2264 | '-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk", | 2381 | '-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk", |
2265 | path_to_seq_data + f"realigned/{rfam_acc}.cm", | 2382 | path_to_seq_data + f"realigned/{rfam_acc}.cm", |
2266 | - path_to_seq_data + f"realigned/{rfam_acc}++.fa" ], | 2383 | + path_to_seq_data + f"realigned/{rfam_acc}++.fa"], |
2267 | stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | 2384 | stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) |
2268 | stderr = p.stderr.decode("utf-8") | 2385 | stderr = p.stderr.decode("utf-8") |
2269 | 2386 | ||
... | @@ -2277,7 +2394,9 @@ def work_realign(rfam_acc): | ... | @@ -2277,7 +2394,9 @@ def work_realign(rfam_acc): |
2277 | print('\t'+validsymb, flush=True) | 2394 | print('\t'+validsymb, flush=True) |
2278 | 2395 | ||
2279 | # Convert Stockholm to aligned FASTA | 2396 | # Convert Stockholm to aligned FASTA |
2280 | - subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa", "--informat", "stockholm", "afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"]) | 2397 | + subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa", |
2398 | + "--informat", "stockholm", | ||
2399 | + "afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"]) | ||
2281 | subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part. | 2400 | subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part. |
2282 | 2401 | ||
2283 | # Assert everything worked, or save an error | 2402 | # Assert everything worked, or save an error |
... | @@ -2288,6 +2407,7 @@ def work_realign(rfam_acc): | ... | @@ -2288,6 +2407,7 @@ def work_realign(rfam_acc): |
2288 | with open(runDir + "/errors.txt", "a") as er: | 2407 | with open(runDir + "/errors.txt", "a") as er: |
2289 | er.write(f"Failed to realign {rfam_acc} (killed)") | 2408 | er.write(f"Failed to realign {rfam_acc} (killed)") |
2290 | 2409 | ||
2410 | + | ||
2291 | def summarize_position(counts): | 2411 | def summarize_position(counts): |
2292 | """ Counts the number of nucleotides at a given position, given a "column" from a MSA. | 2412 | """ Counts the number of nucleotides at a given position, given a "column" from a MSA. |
2293 | """ | 2413 | """ |
... | @@ -2303,15 +2423,15 @@ def summarize_position(counts): | ... | @@ -2303,15 +2423,15 @@ def summarize_position(counts): |
2303 | N += counts[char] # number of ungapped residues | 2423 | N += counts[char] # number of ungapped residues |
2304 | 2424 | ||
2305 | if N: # prevent division by zero if the column is only gaps | 2425 | if N: # prevent division by zero if the column is only gaps |
2306 | - return ( counts['A']/N, counts['C']/N, counts['G']/N, counts['U']/N, (N - known_chars_count)/N) # other residues, or consensus (N, K, Y...) | 2426 | + return (counts['A']/N, counts['C']/N, counts['G']/N, counts['U']/N, (N - known_chars_count)/N) # other residues, or consensus (N, K, Y...) |
2307 | else: | 2427 | else: |
2308 | return (0, 0, 0, 0, 0) | 2428 | return (0, 0, 0, 0, 0) |
2309 | 2429 | ||
2430 | + | ||
2310 | @trace_unhandled_exceptions | 2431 | @trace_unhandled_exceptions |
2311 | def work_pssm(f, fill_gaps): | 2432 | def work_pssm(f, fill_gaps): |
2312 | """ Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family. | 2433 | """ Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family. |
2313 | 2434 | ||
2314 | - Also saves every chain of the family to file. | ||
2315 | Uses only 1 core, so this function can be called in parallel. | 2435 | Uses only 1 core, so this function can be called in parallel. |
2316 | 2436 | ||
2317 | """ | 2437 | """ |
... | @@ -2323,18 +2443,17 @@ def work_pssm(f, fill_gaps): | ... | @@ -2323,18 +2443,17 @@ def work_pssm(f, fill_gaps): |
2323 | 2443 | ||
2324 | # get the chains of this family | 2444 | # get the chains of this family |
2325 | list_of_chains = rfam_acc_to_download[f] | 2445 | list_of_chains = rfam_acc_to_download[f] |
2326 | - chains_ids = [ str(c) for c in list_of_chains ] | 2446 | + chains_ids = [str(c) for c in list_of_chains] |
2327 | 2447 | ||
2328 | # Open the alignment | 2448 | # Open the alignment |
2329 | try: | 2449 | try: |
2330 | - align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta") | 2450 | + align = Bio.AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta") |
2331 | except: | 2451 | except: |
2332 | warn(f"{f}'s alignment is wrong. Recompute it and retry.", error=True) | 2452 | warn(f"{f}'s alignment is wrong. Recompute it and retry.", error=True) |
2333 | with open(runDir + "/errors.txt", "a") as errf: | 2453 | with open(runDir + "/errors.txt", "a") as errf: |
2334 | errf.write(f"{f}'s alignment is wrong. Recompute it and retry.\n") | 2454 | errf.write(f"{f}'s alignment is wrong. Recompute it and retry.\n") |
2335 | return 1 | 2455 | return 1 |
2336 | 2456 | ||
2337 | - | ||
2338 | # Compute statistics per column | 2457 | # Compute statistics per column |
2339 | pssm = BufferingSummaryInfo(align).get_pssm(f, thr_idx) | 2458 | pssm = BufferingSummaryInfo(align).get_pssm(f, thr_idx) |
2340 | frequencies = [ summarize_position(pssm[i]) for i in range(align.get_alignment_length()) ] | 2459 | frequencies = [ summarize_position(pssm[i]) for i in range(align.get_alignment_length()) ] |
... | @@ -2378,10 +2497,13 @@ def work_pssm(f, fill_gaps): | ... | @@ -2378,10 +2497,13 @@ def work_pssm(f, fill_gaps): |
2378 | 2497 | ||
2379 | # Save the re_mappings | 2498 | # Save the re_mappings |
2380 | conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0) | 2499 | conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0) |
2381 | - sql_execute(conn, "INSERT INTO re_mapping (chain_id, index_chain, index_ali) VALUES (?, ?, ?) ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;", many=True, data=re_mappings) | 2500 | + sql_execute(conn, """INSERT INTO re_mapping (chain_id, index_chain, index_ali) |
2501 | + VALUES (?, ?, ?) | ||
2502 | + ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;""", | ||
2503 | + many=True, data=re_mappings) | ||
2382 | 2504 | ||
2383 | # Save the useful columns in the database | 2505 | # Save the useful columns in the database |
2384 | - data = [ (f, j) + frequencies[j-1] for j in sorted(columns_to_save) ] | 2506 | + data = [(f, j) + frequencies[j-1] for j in sorted(columns_to_save)] |
2385 | sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other) | 2507 | sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other) |
2386 | VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO | 2508 | VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO |
2387 | UPDATE SET freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, freq_other=excluded.freq_other;""", many=True, data=data) | 2509 | UPDATE SET freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, freq_other=excluded.freq_other;""", many=True, data=data) |
... | @@ -2412,12 +2534,13 @@ def work_pssm(f, fill_gaps): | ... | @@ -2412,12 +2534,13 @@ def work_pssm(f, fill_gaps): |
2412 | pbar.close() | 2534 | pbar.close() |
2413 | sql_execute(conn, f"""UPDATE nucleotide SET nt_align_code = ?, | 2535 | sql_execute(conn, f"""UPDATE nucleotide SET nt_align_code = ?, |
2414 | is_A = ?, is_C = ?, is_G = ?, is_U = ?, is_other = ? | 2536 | is_A = ?, is_C = ?, is_G = ?, is_U = ?, is_other = ? |
2415 | - WHERE chain_id = ? AND index_chain = ?;""", many=True, data = gaps) | 2537 | + WHERE chain_id = ? AND index_chain = ?;""", many=True, data=gaps) |
2416 | 2538 | ||
2417 | conn.close() | 2539 | conn.close() |
2418 | idxQueue.put(thr_idx) # replace the thread index in the queue | 2540 | idxQueue.put(thr_idx) # replace the thread index in the queue |
2419 | return 0 | 2541 | return 0 |
2420 | 2542 | ||
2543 | + | ||
2421 | @trace_unhandled_exceptions | 2544 | @trace_unhandled_exceptions |
2422 | def work_save(c, homology=True): | 2545 | def work_save(c, homology=True): |
2423 | 2546 | ||
... | @@ -2451,9 +2574,11 @@ def work_save(c, homology=True): | ... | @@ -2451,9 +2574,11 @@ def work_save(c, homology=True): |
2451 | 2574 | ||
2452 | df.to_csv(filename, float_format="%.2f", index=False) | 2575 | df.to_csv(filename, float_format="%.2f", index=False) |
2453 | 2576 | ||
2577 | + | ||
2454 | if __name__ == "__main__": | 2578 | if __name__ == "__main__": |
2455 | 2579 | ||
2456 | - runDir = path.dirname(path.realpath(__file__)) | 2580 | + runDir = os.getcwd() |
2581 | + fileDir = os.path.dirname(os.path.realpath(__file__)) | ||
2457 | ncores = read_cpu_number() | 2582 | ncores = read_cpu_number() |
2458 | pp = Pipeline() | 2583 | pp = Pipeline() |
2459 | pp.process_options() | 2584 | pp.process_options() |
... | @@ -2502,7 +2627,6 @@ if __name__ == "__main__": | ... | @@ -2502,7 +2627,6 @@ if __name__ == "__main__": |
2502 | print("Completed.") | 2627 | print("Completed.") |
2503 | exit(0) | 2628 | exit(0) |
2504 | 2629 | ||
2505 | - | ||
2506 | # At this point, structure, chain and nucleotide tables of the database are up to date. | 2630 | # At this point, structure, chain and nucleotide tables of the database are up to date. |
2507 | # (Modulo some statistics computed by statistics.py) | 2631 | # (Modulo some statistics computed by statistics.py) |
2508 | 2632 | ||
... | @@ -2511,13 +2635,14 @@ if __name__ == "__main__": | ... | @@ -2511,13 +2635,14 @@ if __name__ == "__main__": |
2511 | # =========================================================================== | 2635 | # =========================================================================== |
2512 | 2636 | ||
2513 | if pp.SELECT_ONLY is None: | 2637 | if pp.SELECT_ONLY is None: |
2514 | - pp.checkpoint_load_chains() # If your job failed, you can comment all the "3D information" part and start from here. | 2638 | + # If your job failed, you can comment all the "3D information" part and start from here. |
2639 | + pp.checkpoint_load_chains() | ||
2515 | 2640 | ||
2516 | # Get the list of Rfam families found | 2641 | # Get the list of Rfam families found |
2517 | rfam_acc_to_download = {} | 2642 | rfam_acc_to_download = {} |
2518 | for c in pp.loaded_chains: | 2643 | for c in pp.loaded_chains: |
2519 | if c.mapping.rfam_acc not in rfam_acc_to_download: | 2644 | if c.mapping.rfam_acc not in rfam_acc_to_download: |
2520 | - rfam_acc_to_download[c.mapping.rfam_acc] = [ c ] | 2645 | + rfam_acc_to_download[c.mapping.rfam_acc] = [c] |
2521 | else: | 2646 | else: |
2522 | rfam_acc_to_download[c.mapping.rfam_acc].append(c) | 2647 | rfam_acc_to_download[c.mapping.rfam_acc].append(c) |
2523 | 2648 | ||
... | @@ -2546,5 +2671,5 @@ if __name__ == "__main__": | ... | @@ -2546,5 +2671,5 @@ if __name__ == "__main__": |
2546 | 2671 | ||
2547 | print("Completed.") # This part of the code is supposed to release some serotonin in the modeller's brain, do not remove | 2672 | print("Completed.") # This part of the code is supposed to release some serotonin in the modeller's brain, do not remove |
2548 | 2673 | ||
2549 | - # # so i can sleep for the end of the night | 2674 | + # so i can sleep for the end of the night |
2550 | # subprocess.run(["poweroff"]) | 2675 | # subprocess.run(["poweroff"]) | ... | ... |
1 | -1ml5_1_a_1-2914 | 1 | +1eg0_1_O_1-73 |
2 | -1ml5_1_a_151-2903 | ||
3 | -1ml5_1_A_7-1515 | ||
4 | -1ml5_1_A_2-1520 | ||
5 | -1ml5_1_A_7-1518 | ||
6 | -1ml5_1_b_5-121 | ||
7 | 2rdo_1_A_3-118 | 2 | 2rdo_1_A_3-118 |
8 | 4v48_1_A9_3-118 | 3 | 4v48_1_A9_3-118 |
9 | 4v47_1_A9_3-118 | 4 | 4v47_1_A9_3-118 |
10 | -6zmi_1_L8_1267-4755 | ||
11 | -6zm7_1_L8_1267-4755 | ||
12 | -6y6x_1_L8_1267-4755 | ||
13 | -6z6n_1_L8_1267-4755 | ||
14 | -6qzp_1_L8_1267-4755 | ||
15 | -6zme_1_L8_1267-4755 | ||
16 | -6z6l_1_L8_1267-4755 | ||
17 | -6ek0_1_L8_1267-4755 | ||
18 | -6zmo_1_L8_1267-4755 | ||
19 | -6z6m_1_L8_1267-4755 | ||
20 | -6ole_1_D_1267-4755 | ||
21 | -6om0_1_D_1267-4755 | ||
22 | -6y2l_1_L8_1267-4755 | ||
23 | -6lqm_1_8_1267-4755 | ||
24 | -6y0g_1_L8_1267-4755 | ||
25 | -6lu8_1_8_1267-4755 | ||
26 | -6lsr_1_8_1267-4755 | ||
27 | -6lss_1_8_1267-4755 | ||
28 | -6oli_1_D_1267-4755 | ||
29 | -6olg_1_A3_1267-4755 | ||
30 | -6y57_1_L8_1267-4755 | ||
31 | -5t2c_1_C_1267-4755 | ||
32 | -6om7_1_D_1267-4755 | ||
33 | -4ug0_1_L8_1267-4755 | ||
34 | -6olf_1_D_1267-4755 | ||
35 | -6ip5_1_1C_1267-4755 | ||
36 | -6ip8_1_1C_1267-4755 | ||
37 | -6olz_1_A3_1267-4755 | ||
38 | -5aj0_1_A3_1267-4755 | ||
39 | -5lks_1_L8_1267-4755 | ||
40 | -6ip6_1_1C_1267-4755 | ||
41 | -4v6x_1_A8_1267-4755 | ||
42 | 1vy7_1_AY_1-73 | 5 | 1vy7_1_AY_1-73 |
43 | 1vy7_1_CY_1-73 | 6 | 1vy7_1_CY_1-73 |
44 | 4w2h_1_CY_1-73 | 7 | 4w2h_1_CY_1-73 |
45 | -2z9q_1_A_1-72 | 8 | +1jgq_1_A_2-1520 |
9 | +4v42_1_AA_2-1520 | ||
10 | +1jgo_1_A_2-1520 | ||
11 | +1jgp_1_A_2-1520 | ||
12 | +1ml5_1_A_2-1520 | ||
13 | +4v42_1_BA_1-2914 | ||
14 | +1ml5_1_a_1-2914 | ||
46 | 4v42_1_BB_5-121 | 15 | 4v42_1_BB_5-121 |
16 | +1ml5_1_b_5-121 | ||
17 | +2rdo_1_B_1-2904 | ||
18 | +4v48_1_A0_1-2904 | ||
19 | +4v47_1_A0_1-2904 | ||
20 | +4v48_1_BA_1-1543 | ||
21 | +4v47_1_BA_1-1542 | ||
47 | 1ls2_1_B_1-73 | 22 | 1ls2_1_B_1-73 |
48 | 3ep2_1_Y_1-72 | 23 | 3ep2_1_Y_1-72 |
49 | 3eq3_1_Y_1-72 | 24 | 3eq3_1_Y_1-72 |
50 | 4v48_1_A6_1-73 | 25 | 4v48_1_A6_1-73 |
51 | -1eg0_1_O_1-73 | 26 | +2z9q_1_A_1-72 |
52 | 1gsg_1_T_1-72 | 27 | 1gsg_1_T_1-72 |
53 | 3jcr_1_H_1-115 | 28 | 3jcr_1_H_1-115 |
54 | -4v42_1_BA_1-2914 | 29 | +1x1l_1_A_1-132 |
55 | -4v42_1_BA_151-2903 | 30 | +1zc8_1_Z_1-93 |
56 | -4v48_1_BA_1-91 | 31 | +2ob7_1_D_1-132 |
57 | -4v48_1_BA_6-1541 | ||
58 | -4v48_1_BA_1-1543 | ||
59 | -4v48_1_BA_6-1538 | ||
60 | -4v47_1_BA_1-91 | ||
61 | -4v47_1_BA_6-1540 | ||
62 | -4v47_1_BA_1-1542 | ||
63 | -4v47_1_BA_6-1537 | ||
64 | -2rdo_1_B_1-2903 | ||
65 | -2rdo_1_B_6-1460 | ||
66 | -2rdo_1_B_1-1528 | ||
67 | -2rdo_1_B_6-1457 | ||
68 | -2rdo_1_B_160-2893 | ||
69 | -2rdo_1_B_1-2904 | ||
70 | -2rdo_1_B_6-1522 | ||
71 | -4v48_1_A0_1-2903 | ||
72 | -4v48_1_A0_6-1460 | ||
73 | -4v48_1_A0_1-1528 | ||
74 | -4v48_1_A0_6-1457 | ||
75 | -4v48_1_A0_160-2893 | ||
76 | -4v48_1_A0_1-2904 | ||
77 | -4v48_1_A0_6-1522 | ||
78 | -4v47_1_A0_1-2903 | ||
79 | -4v47_1_A0_6-1460 | ||
80 | -4v47_1_A0_1-1528 | ||
81 | -4v47_1_A0_6-1457 | ||
82 | -4v47_1_A0_160-2893 | ||
83 | -4v47_1_A0_1-2904 | ||
84 | -4v47_1_A0_6-1522 | ||
85 | 2ob7_1_A_10-319 | 32 | 2ob7_1_A_10-319 |
86 | -1x1l_1_A_1-130 | ||
87 | -1zc8_1_Z_1-130 | ||
88 | -1zc8_1_Z_1-91 | ||
89 | -2ob7_1_D_1-130 | ||
90 | -6rxu_1_C2_588-2386 | ||
91 | -6rxu_1_C2_583-2388 | ||
92 | -6rxu_1_C2_588-2383 | ||
93 | -5oql_1_2_588-2386 | ||
94 | -5oql_1_2_583-2388 | ||
95 | -5oql_1_2_588-2383 | ||
96 | -6rxv_1_C2_588-2386 | ||
97 | -6rxv_1_C2_583-2388 | ||
98 | -6rxv_1_C2_588-2383 | ||
99 | -6rxz_1_C2_588-2386 | ||
100 | -6rxz_1_C2_583-2388 | ||
101 | -6rxz_1_C2_588-2383 | ||
102 | -6rxy_1_C2_588-2386 | ||
103 | -6rxy_1_C2_583-2388 | ||
104 | -6rxy_1_C2_588-2383 | ||
105 | -6rxt_1_C2_588-2386 | ||
106 | -6rxt_1_C2_583-2388 | ||
107 | -6rxt_1_C2_588-2383 | ||
108 | 1r2x_1_C_1-58 | 33 | 1r2x_1_C_1-58 |
109 | 1r2w_1_C_1-58 | 34 | 1r2w_1_C_1-58 |
110 | -1eg0_1_L_1-57 | ||
111 | 1eg0_1_L_1-56 | 35 | 1eg0_1_L_1-56 |
112 | -1jgq_1_A_7-1518 | ||
113 | -1jgq_1_A_20-55 | ||
114 | -1jgq_1_A_2-1520 | ||
115 | -1jgq_1_A_7-1515 | ||
116 | -4v42_1_AA_7-1518 | ||
117 | -4v42_1_AA_20-55 | ||
118 | -4v42_1_AA_2-1520 | ||
119 | -4v42_1_AA_7-1515 | ||
120 | -1jgo_1_A_7-1518 | ||
121 | -1jgo_1_A_20-55 | ||
122 | -1jgo_1_A_2-1520 | ||
123 | -1jgo_1_A_7-1515 | ||
124 | -1jgp_1_A_7-1518 | ||
125 | -1jgp_1_A_20-55 | ||
126 | -1jgp_1_A_2-1520 | ||
127 | -1jgp_1_A_7-1515 | ||
128 | 1zc8_1_A_1-59 | 36 | 1zc8_1_A_1-59 |
129 | -1mvr_1_D_1-59 | 37 | +1mvr_1_D_1-61 |
130 | -4c9d_1_D_29-1 | 38 | +4adx_1_9_1-123 |
131 | -4c9d_1_C_29-1 | ||
132 | -4adx_1_9_1-121 | ||
133 | 1zn1_1_B_1-59 | 39 | 1zn1_1_B_1-59 |
134 | 1emi_1_B_1-108 | 40 | 1emi_1_B_1-108 |
135 | 3iy9_1_A_498-1027 | 41 | 3iy9_1_A_498-1027 |
... | @@ -143,25 +49,1558 @@ | ... | @@ -143,25 +49,1558 @@ |
143 | 3cw1_1_V_1-138 | 49 | 3cw1_1_V_1-138 |
144 | 3cw1_1_v_1-138 | 50 | 3cw1_1_v_1-138 |
145 | 2iy3_1_B_9-105 | 51 | 2iy3_1_B_9-105 |
146 | -3jcr_1_N_1-188 | 52 | +3jcr_1_N_1-107 |
147 | -3jcr_1_N_1-106 | ||
148 | 2vaz_1_A_64-177 | 53 | 2vaz_1_A_64-177 |
149 | -2ftc_1_R_1-1568 | ||
150 | -2ftc_1_R_792-1568 | ||
151 | 2ftc_1_R_81-1466 | 54 | 2ftc_1_R_81-1466 |
152 | 3jcr_1_M_1-141 | 55 | 3jcr_1_M_1-141 |
153 | -3jcr_1_M_1-188 | ||
154 | -3jcr_1_M_1-107 | ||
155 | -4v5z_1_B0_1-2899 | ||
156 | 4v5z_1_B0_1-2902 | 56 | 4v5z_1_B0_1-2902 |
157 | -4v5z_1_B0_1-2840 | ||
158 | 5g2x_1_A_595-692 | 57 | 5g2x_1_A_595-692 |
159 | 3iy8_1_A_1-540 | 58 | 3iy8_1_A_1-540 |
160 | 4v5z_1_BY_2-113 | 59 | 4v5z_1_BY_2-113 |
161 | 4v5z_1_BZ_1-70 | 60 | 4v5z_1_BZ_1-70 |
162 | -1mvr_1_B_1-96 | 61 | +4v5z_1_B1_2-125 |
163 | -4adx_1_0_1-2923 | 62 | +1mvr_1_B_3-96 |
164 | -4adx_1_0_132-2915 | 63 | +4adx_1_0_1-2925 |
165 | 3eq4_1_Y_1-69 | 64 | 3eq4_1_Y_1-69 |
65 | +6uz7_1_8_2140-2827 | ||
166 | 4v5z_1_AA_1-1563 | 66 | 4v5z_1_AA_1-1563 |
167 | -4v5z_1_AA_1-1562 | 67 | +6cfj_1_1X |
68 | +6cfj_1_2X | ||
69 | +5hcq_1_1X | ||
70 | +6cae_1_1X | ||
71 | +5hcq_1_2X | ||
72 | +5hcr_1_1X | ||
73 | +4z8c_1_1X | ||
74 | +5j4b_1_1X | ||
75 | +5j4b_1_2X | ||
76 | +4z8c_1_2X | ||
77 | +6cae_1_2X | ||
78 | +5j4c_1_1X | ||
79 | +5w4k_1_1X | ||
80 | +6of1_1_1X | ||
81 | +5hcr_1_2X | ||
82 | +5hd1_1_1X | ||
83 | +5hcp_1_1X | ||
84 | +6of1_1_2X | ||
85 | +5hau_1_1W | ||
86 | +5j4c_1_2X | ||
87 | +5wis_1_1X | ||
88 | +6xqd_1_1X | ||
89 | +6nd5_1_1X | ||
90 | +5w4k_1_2X | ||
91 | +5hau_1_2W | ||
92 | +6xqd_1_2X | ||
93 | +4y4p_1_1X | ||
94 | +6o97_1_1X | ||
95 | +5hcp_1_2X | ||
96 | +5doy_1_1X | ||
97 | +4zer_1_1X | ||
98 | +5wit_1_1X | ||
99 | +5hd1_1_2X | ||
100 | +6nd5_1_2X | ||
101 | +4z3s_1_1X | ||
102 | +7jql_1_1X | ||
103 | +7jqm_1_1X | ||
104 | +7jql_1_2X | ||
105 | +5wis_1_2X | ||
106 | +6nd6_1_1X | ||
107 | +6o97_1_2X | ||
108 | +4y4p_1_2X | ||
109 | +7jqm_1_2X | ||
110 | +4z3s_1_2X | ||
111 | +4zer_1_2X | ||
112 | +6uo1_1_2X | ||
113 | +6uo1_1_1X | ||
114 | +5doy_1_2X | ||
115 | +5wit_1_2X | ||
116 | +5f8k_1_1X | ||
117 | +6nd6_1_2X | ||
118 | +6xqe_1_1X | ||
119 | +6xqe_1_2X | ||
120 | +6n9e_1_1X | ||
121 | +6n9e_1_2X | ||
122 | +6n9f_1_1X | ||
123 | +5f8k_1_2X | ||
124 | +6n9f_1_2X | ||
125 | +6xz7_1_F | ||
126 | +6y69_1_W | ||
127 | +5afi_1_V | ||
128 | +5afi_1_W | ||
129 | +6h4n_1_W | ||
130 | +5wdt_1_V | ||
131 | +5wfs_1_V | ||
132 | +5wdt_1_W | ||
133 | +5wfs_1_W | ||
134 | +5we4_1_V | ||
135 | +5we4_1_W | ||
136 | +5uq8_1_Y | ||
137 | +6c4i_1_Y | ||
138 | +6c4i_1_X | ||
139 | +5zeb_1_V | ||
140 | +5zep_1_W | ||
141 | +5lzd_1_V | ||
142 | +5we6_1_V | ||
143 | +5wfk_1_V | ||
144 | +5wfk_1_W | ||
145 | +5we6_1_W | ||
146 | +5u4i_1_Y | ||
147 | +5uq7_1_Y | ||
148 | +5u4i_1_X | ||
149 | +5lza_1_V | ||
150 | +5wf0_1_V | ||
151 | +5wf0_1_W | ||
152 | +5zeu_1_V | ||
153 | +5l3p_1_X | ||
154 | +3jcj_1_V | ||
155 | +6gxm_1_X | ||
156 | +6gwt_1_X | ||
157 | +6gxn_1_X | ||
158 | +6gxo_1_X | ||
159 | +3j9y_1_V | ||
160 | +6o9k_1_Y | ||
161 | +6o7k_1_V | ||
162 | +5lzf_1_V | ||
163 | +3jcn_1_V | ||
164 | +5lzc_1_V | ||
165 | +5u4j_1_X | ||
166 | +5u4j_1_Z | ||
167 | +5lzb_1_V | ||
168 | +6h58_1_W | ||
169 | +6h58_1_WW | ||
170 | +1eg0_1_O | ||
171 | +5j8b_1_X | ||
172 | +4v7j_1_AV | ||
173 | +4v7j_1_BV | ||
174 | +4v7k_1_BV | ||
175 | +4v7k_1_AV | ||
176 | +4v7k_1_BW | ||
177 | +4v7k_1_AW | ||
178 | +4v7j_1_AW | ||
179 | +4v7j_1_BW | ||
180 | +4v4j_1_Z | ||
181 | +6i0v_1_B | ||
182 | +5k77_1_X | ||
183 | +5k77_1_V | ||
184 | +5k77_1_Y | ||
185 | +5k77_1_W | ||
186 | +5k77_1_Z | ||
187 | +4pei_1_X | ||
188 | +4pei_1_V | ||
189 | +4pei_1_W | ||
190 | +4pei_1_Z | ||
191 | +4pei_1_Y | ||
192 | +4a3c_1_P | ||
193 | +4a3e_1_P | ||
194 | +6lkq_1_U | ||
195 | +7k00_1_B | ||
196 | +6qdw_1_A | ||
197 | +2rdo_1_A | ||
198 | +4v48_1_A9 | ||
199 | +4v47_1_A9 | ||
200 | +6hcj_1_Q3 | ||
201 | +6hcq_1_Q3 | ||
202 | +5mmm_1_Z | ||
203 | +4w2e_1_W | ||
204 | +5j4b_1_1Y | ||
205 | +6cfj_1_1W | ||
206 | +5w4k_1_1Y | ||
207 | +5wit_1_1W | ||
208 | +6cfj_1_1Y | ||
209 | +6cfj_1_2W | ||
210 | +5j4c_1_1W | ||
211 | +5wis_1_1Y | ||
212 | +5j4c_1_1Y | ||
213 | +6cfj_1_2Y | ||
214 | +5wis_1_1W | ||
215 | +5j4b_1_1W | ||
216 | +5j4c_1_2W | ||
217 | +5j4b_1_2W | ||
218 | +5j4b_1_2Y | ||
219 | +5j4c_1_2Y | ||
220 | +5w4k_1_1W | ||
221 | +6nd5_1_1Y | ||
222 | +5wis_1_2Y | ||
223 | +5wit_1_2W | ||
224 | +5doy_1_1Y | ||
225 | +5w4k_1_2Y | ||
226 | +4y4p_1_1Y | ||
227 | +4z3s_1_1Y | ||
228 | +5doy_1_1W | ||
229 | +5doy_1_2Y | ||
230 | +6nd5_1_1W | ||
231 | +4z3s_1_2Y | ||
232 | +4z3s_1_1W | ||
233 | +5w4k_1_2W | ||
234 | +6nd5_1_2Y | ||
235 | +4y4p_1_2Y | ||
236 | +6uo1_1_2Y | ||
237 | +6uo1_1_2W | ||
238 | +4y4p_1_1W | ||
239 | +4z3s_1_2W | ||
240 | +6uo1_1_1Y | ||
241 | +6uo1_1_1W | ||
242 | +5wis_1_2W | ||
243 | +5wit_1_1Y | ||
244 | +6nd5_1_2W | ||
245 | +4y4p_1_2W | ||
246 | +5doy_1_2W | ||
247 | +5wit_1_2Y | ||
248 | +6ucq_1_1Y | ||
249 | +4v4i_1_Z | ||
250 | +6ucq_1_1X | ||
251 | +6ucq_1_2Y | ||
252 | +4w2e_1_X | ||
253 | +6ucq_1_2X | ||
254 | +6yss_1_W | ||
255 | +5afi_1_Y | ||
256 | +5uq8_1_Z | ||
257 | +5wdt_1_Y | ||
258 | +5wfs_1_Y | ||
259 | +6ysr_1_W | ||
260 | +5we4_1_Y | ||
261 | +6yst_1_W | ||
262 | +5uq7_1_Z | ||
263 | +5we6_1_Y | ||
264 | +5wfk_1_Y | ||
265 | +5wf0_1_Y | ||
266 | +6o9j_1_V | ||
267 | +6ysu_1_W | ||
268 | +3j46_1_A | ||
269 | +5j8b_1_Y | ||
270 | +5j8b_1_W | ||
271 | +3bbv_1_Z | ||
272 | +5aj0_1_BV | ||
273 | +5aj0_1_BW | ||
274 | +4wt8_1_AB | ||
275 | +4wt8_1_BB | ||
276 | +4v4j_1_Y | ||
277 | +4v4i_1_Y | ||
278 | +5uq8_1_X | ||
279 | +5uq7_1_X | ||
280 | +1jgq_1_A | ||
281 | +4v42_1_AA | ||
282 | +1jgo_1_A | ||
283 | +1jgp_1_A | ||
284 | +1ml5_1_A | ||
285 | +4v4j_1_W | ||
286 | +4v4i_1_W | ||
287 | +4v42_1_BA | ||
288 | +4wt8_1_CS | ||
289 | +4wt8_1_DS | ||
290 | +4v4j_1_X | ||
291 | +4v4i_1_X | ||
292 | +4v42_1_BB | ||
293 | +6uu4_1_333 | ||
294 | +6uu0_1_333 | ||
295 | +6uuc_1_333 | ||
296 | +6uu2_1_333 | ||
297 | +6b6h_1_3 | ||
298 | +6pb4_1_3 | ||
299 | +6d30_1_C | ||
300 | +6j7z_1_C | ||
301 | +3er9_1_D | ||
302 | +5kal_1_Y | ||
303 | +4nia_1_3 | ||
304 | +5kal_1_Z | ||
305 | +4nia_1_7 | ||
306 | +4nia_1_4 | ||
307 | +5new_1_C | ||
308 | +4nia_1_U | ||
309 | +4nia_1_6 | ||
310 | +4oq9_1_7 | ||
311 | +4nia_1_1 | ||
312 | +4oq9_1_4 | ||
313 | +4nia_1_8 | ||
314 | +4oq9_1_8 | ||
315 | +4nia_1_5 | ||
316 | +2vrt_1_E | ||
317 | +4nia_1_W | ||
318 | +4oq9_1_6 | ||
319 | +4oq8_1_D | ||
320 | +4nia_1_Z | ||
321 | +4oq9_1_W | ||
322 | +4oq9_1_5 | ||
323 | +4nia_1_2 | ||
324 | +2vrt_1_F | ||
325 | +4oq9_1_U | ||
326 | +4oq9_1_Z | ||
327 | +4oq9_1_2 | ||
328 | +4oq9_1_3 | ||
329 | +1ddl_1_E | ||
330 | +4oq9_1_1 | ||
331 | +6rt5_1_A | ||
332 | +6rt5_1_E | ||
333 | +4qu6_1_B | ||
334 | +6lkq_1_T | ||
335 | +6qdw_1_B | ||
336 | +3jbv_1_B | ||
337 | +3jbu_1_B | ||
338 | +2rdo_1_B | ||
339 | +4v48_1_A0 | ||
340 | +4v47_1_A0 | ||
341 | +6do8_1_B | ||
342 | +6dpi_1_B | ||
343 | +6dp9_1_B | ||
344 | +6dpb_1_B | ||
345 | +6dmn_1_B | ||
346 | +6dpp_1_B | ||
347 | +6dpk_1_B | ||
348 | +6dpd_1_B | ||
349 | +6dot_1_B | ||
350 | +6dok_1_B | ||
351 | +6dp8_1_B | ||
352 | +6dpl_1_B | ||
353 | +6dpg_1_B | ||
354 | +6dou_1_B | ||
355 | +6dpc_1_B | ||
356 | +6do9_1_B | ||
357 | +6dmv_1_B | ||
358 | +6dp4_1_B | ||
359 | +6dpn_1_B | ||
360 | +6doj_1_B | ||
361 | +6dph_1_B | ||
362 | +6dos_1_B | ||
363 | +6doo_1_B | ||
364 | +6dp6_1_B | ||
365 | +6dox_1_B | ||
366 | +6dp5_1_B | ||
367 | +6dol_1_B | ||
368 | +6dp1_1_B | ||
369 | +6doz_1_B | ||
370 | +6dp7_1_B | ||
371 | +6doq_1_B | ||
372 | +6dpa_1_B | ||
373 | +6dom_1_B | ||
374 | +6dog_1_B | ||
375 | +6dop_1_B | ||
376 | +6doh_1_B | ||
377 | +6doa_1_B | ||
378 | +6don_1_B | ||
379 | +6dov_1_B | ||
380 | +6dpo_1_B | ||
381 | +6dod_1_B | ||
382 | +6dob_1_B | ||
383 | +6dow_1_B | ||
384 | +6dpm_1_B | ||
385 | +6dpf_1_B | ||
386 | +6dp3_1_B | ||
387 | +6dp2_1_B | ||
388 | +6dpe_1_B | ||
389 | +6dpj_1_B | ||
390 | +6dor_1_B | ||
391 | +6dof_1_B | ||
392 | +6dp0_1_B | ||
393 | +6doi_1_B | ||
394 | +6doc_1_B | ||
395 | +6doe_1_B | ||
396 | +6n6g_1_D | ||
397 | +6lkq_1_S | ||
398 | +5h5u_1_H | ||
399 | +5lze_1_Y | ||
400 | +5lze_1_V | ||
401 | +5lze_1_X | ||
402 | +3jcj_1_G | ||
403 | +6o7k_1_G | ||
404 | +4v48_1_BA | ||
405 | +4v47_1_BA | ||
406 | +4b3r_1_W | ||
407 | +4b3t_1_W | ||
408 | +4b3s_1_W | ||
409 | +5o2r_1_X | ||
410 | +5kcs_1_1X | ||
411 | +6fti_1_U | ||
412 | +6fti_1_W | ||
413 | +6ftj_1_U | ||
414 | +6ftj_1_W | ||
415 | +6ftg_1_U | ||
416 | +6ftg_1_W | ||
417 | +6ole_1_T | ||
418 | +6om0_1_T | ||
419 | +6oli_1_T | ||
420 | +6om7_1_T | ||
421 | +6olf_1_T | ||
422 | +6w6l_1_T | ||
423 | +6x1b_1_D | ||
424 | +6x1b_1_F | ||
425 | +5f6c_1_C | ||
426 | +6i0t_1_B | ||
427 | +1b2m_1_C | ||
428 | +1b2m_1_D | ||
429 | +1b2m_1_E | ||
430 | +2uxc_1_Y | ||
431 | +4a3g_1_P | ||
432 | +4a3j_1_P | ||
433 | +7k00_1_5 | ||
434 | +5mmi_1_Z | ||
435 | +3j9m_1_U | ||
436 | +6nu2_1_U | ||
437 | +6nu3_1_U | ||
438 | +5c0y_1_C | ||
439 | +6n6f_1_D | ||
440 | +4ohy_1_B | ||
441 | +4oi1_1_B | ||
442 | +4oi0_1_B | ||
443 | +6raz_1_Y | ||
444 | +5ipl_1_3 | ||
445 | +6utw_1_333 | ||
446 | +5ipm_1_3 | ||
447 | +5ipn_1_3 | ||
448 | +4ylo_1_3 | ||
449 | +4yln_1_6 | ||
450 | +4ylo_1_6 | ||
451 | +4yln_1_3 | ||
452 | +4yln_1_9 | ||
453 | +5lzf_1_Y | ||
454 | +1n32_1_Z | ||
455 | +5zsl_1_D | ||
456 | +5zsd_1_C | ||
457 | +5zsd_1_D | ||
458 | +5zsl_1_E | ||
459 | +4nku_1_D | ||
460 | +4nku_1_H | ||
461 | +1cwp_1_E | ||
462 | +6qik_1_Y | ||
463 | +6rzz_1_Y | ||
464 | +6ri5_1_Y | ||
465 | +6qt0_1_Y | ||
466 | +6qtz_1_Y | ||
467 | +6t83_1_1B | ||
468 | +6t83_1_3B | ||
469 | +6t83_1_AA | ||
470 | +6t83_1_CA | ||
471 | +6s05_1_Y | ||
472 | +5jcs_1_X | ||
473 | +5fl8_1_X | ||
474 | +3erc_1_G | ||
475 | +6of1_1_1W | ||
476 | +6cae_1_1Y | ||
477 | +6o97_1_1W | ||
478 | +6of1_1_1Y | ||
479 | +6of1_1_2W | ||
480 | +6o97_1_1Y | ||
481 | +6nd6_1_1Y | ||
482 | +6cae_1_1W | ||
483 | +6of1_1_2Y | ||
484 | +6cae_1_2Y | ||
485 | +6nd6_1_1W | ||
486 | +6cae_1_2W | ||
487 | +6o97_1_2Y | ||
488 | +6nd6_1_2Y | ||
489 | +6o97_1_2W | ||
490 | +6nd6_1_2W | ||
491 | +6xz7_1_G | ||
492 | +6gz5_1_BW | ||
493 | +6gz3_1_BW | ||
494 | +1ls2_1_B | ||
495 | +3ep2_1_Y | ||
496 | +3eq3_1_Y | ||
497 | +4v48_1_A6 | ||
498 | +2z9q_1_A | ||
499 | +4hot_1_X | ||
500 | +6d2z_1_C | ||
501 | +4tu0_1_F | ||
502 | +4tu0_1_G | ||
503 | +6r9o_1_B | ||
504 | +6is0_1_C | ||
505 | +5lzc_1_X | ||
506 | +5lzb_1_X | ||
507 | +5lzd_1_Y | ||
508 | +5lzc_1_Y | ||
509 | +5lzb_1_Y | ||
510 | +1gsg_1_T | ||
511 | +6zvi_1_D | ||
512 | +6sv4_1_NB | ||
513 | +6sv4_1_NC | ||
514 | +6i7o_1_NB | ||
515 | +5y88_1_X | ||
516 | +3j6x_1_IR | ||
517 | +3j6y_1_IR | ||
518 | +6tb3_1_N | ||
519 | +6tnu_1_N | ||
520 | +2uxb_1_X | ||
521 | +2x1f_1_B | ||
522 | +2x1a_1_B | ||
523 | +3eq3_1_D | ||
524 | +3ep2_1_D | ||
525 | +1eg0_1_M | ||
526 | +3eq4_1_D | ||
527 | +5o1y_1_B | ||
528 | +3jcr_1_H | ||
529 | +6dzi_1_H | ||
530 | +5zeu_1_A | ||
531 | +6mpi_1_W | ||
532 | +5mfx_1_B | ||
533 | +5w0m_1_J | ||
534 | +5bud_1_E | ||
535 | +5w0m_1_I | ||
536 | +5w0m_1_H | ||
537 | +4j7m_1_B | ||
538 | +5bud_1_D | ||
539 | +6a4e_1_B | ||
540 | +6a4e_1_D | ||
541 | +6hxx_1_AA | ||
542 | +6hxx_1_AB | ||
543 | +6hxx_1_AC | ||
544 | +6hxx_1_AD | ||
545 | +6hxx_1_AE | ||
546 | +6hxx_1_AF | ||
547 | +6hxx_1_AG | ||
548 | +6hxx_1_AH | ||
549 | +6hxx_1_AI | ||
550 | +6hxx_1_AJ | ||
551 | +6hxx_1_AK | ||
552 | +6hxx_1_AL | ||
553 | +6hxx_1_AM | ||
554 | +6hxx_1_AN | ||
555 | +6hxx_1_AO | ||
556 | +6hxx_1_AP | ||
557 | +6hxx_1_AQ | ||
558 | +6hxx_1_AR | ||
559 | +6hxx_1_AS | ||
560 | +6hxx_1_AT | ||
561 | +6hxx_1_AU | ||
562 | +6hxx_1_AV | ||
563 | +6hxx_1_AW | ||
564 | +6hxx_1_AX | ||
565 | +6hxx_1_AY | ||
566 | +6hxx_1_AZ | ||
567 | +6hxx_1_BA | ||
568 | +6hxx_1_BB | ||
569 | +6hxx_1_BC | ||
570 | +6hxx_1_BD | ||
571 | +6hxx_1_BE | ||
572 | +6hxx_1_BF | ||
573 | +6hxx_1_BG | ||
574 | +6hxx_1_BH | ||
575 | +6hxx_1_BI | ||
576 | +5odv_1_A | ||
577 | +5odv_1_B | ||
578 | +5odv_1_C | ||
579 | +5odv_1_D | ||
580 | +5odv_1_E | ||
581 | +5odv_1_F | ||
582 | +5odv_1_G | ||
583 | +5odv_1_H | ||
584 | +5odv_1_I | ||
585 | +5odv_1_J | ||
586 | +5odv_1_K | ||
587 | +5odv_1_L | ||
588 | +5odv_1_M | ||
589 | +5odv_1_N | ||
590 | +5odv_1_O | ||
591 | +5odv_1_P | ||
592 | +5odv_1_Q | ||
593 | +5odv_1_R | ||
594 | +5odv_1_S | ||
595 | +5odv_1_T | ||
596 | +5odv_1_U | ||
597 | +5odv_1_V | ||
598 | +5odv_1_W | ||
599 | +5odv_1_X | ||
600 | +6t34_1_A | ||
601 | +6t34_1_B | ||
602 | +6t34_1_C | ||
603 | +6t34_1_D | ||
604 | +6t34_1_E | ||
605 | +6t34_1_F | ||
606 | +6t34_1_G | ||
607 | +6t34_1_H | ||
608 | +6t34_1_I | ||
609 | +6t34_1_J | ||
610 | +6t34_1_K | ||
611 | +6t34_1_L | ||
612 | +6t34_1_M | ||
613 | +6t34_1_N | ||
614 | +6t34_1_O | ||
615 | +6t34_1_P | ||
616 | +6t34_1_Q | ||
617 | +6t34_1_R | ||
618 | +6t34_1_S | ||
619 | +6ip8_1_ZY | ||
620 | +6ip5_1_ZY | ||
621 | +6ip5_1_ZU | ||
622 | +6ip6_1_ZY | ||
623 | +6ip8_1_ZZ | ||
624 | +6ip6_1_ZZ | ||
625 | +6uu3_1_333 | ||
626 | +6uu1_1_333 | ||
627 | +1pn8_1_D | ||
628 | +3er8_1_H | ||
629 | +3er8_1_G | ||
630 | +3er8_1_F | ||
631 | +5o3j_1_B | ||
632 | +4dr7_1_B | ||
633 | +1i5l_1_Y | ||
634 | +1i5l_1_U | ||
635 | +4dr6_1_B | ||
636 | +6i2n_1_U | ||
637 | +4v68_1_A0 | ||
638 | +6vyu_1_Y | ||
639 | +6vyw_1_Y | ||
640 | +6vz7_1_Y | ||
641 | +6vz5_1_Y | ||
642 | +6vz3_1_Y | ||
643 | +6vyy_1_Y | ||
644 | +6vyx_1_Y | ||
645 | +6vyz_1_Y | ||
646 | +6vz2_1_Y | ||
647 | +1mvr_1_1 | ||
648 | +6vyt_1_Y | ||
649 | +1cgm_1_I | ||
650 | +3jb7_1_T | ||
651 | +3jb7_1_M | ||
652 | +3j0o_1_D | ||
653 | +3j0l_1_D | ||
654 | +3j0q_1_D | ||
655 | +3j0p_1_D | ||
656 | +5elt_1_F | ||
657 | +5elt_1_E | ||
658 | +2tmv_1_R | ||
659 | +5a79_1_R | ||
660 | +5a7a_1_R | ||
661 | +2om3_1_R | ||
662 | +2xea_1_R | ||
663 | +4wtl_1_T | ||
664 | +4wtl_1_P | ||
665 | +1xnq_1_W | ||
666 | +1x18_1_C | ||
667 | +1x18_1_B | ||
668 | +1x18_1_D | ||
669 | +1vq6_1_4 | ||
670 | +4am3_1_D | ||
671 | +4am3_1_H | ||
672 | +4am3_1_I | ||
673 | +4lj0_1_C | ||
674 | +4lj0_1_D | ||
675 | +4lj0_1_E | ||
676 | +5lzy_1_HH | ||
677 | +4wtj_1_T | ||
678 | +4wtj_1_P | ||
679 | +4xbf_1_D | ||
680 | +6ow3_1_I | ||
681 | +6ovy_1_I | ||
682 | +6oy6_1_I | ||
683 | +6n6d_1_D | ||
684 | +6n6k_1_C | ||
685 | +6n6k_1_D | ||
686 | +3rtj_1_D | ||
687 | +1apg_1_D | ||
688 | +6ty9_1_M | ||
689 | +6tz1_1_N | ||
690 | +4bbl_1_Y | ||
691 | +4bbl_1_Z | ||
692 | +6sce_1_B | ||
693 | +6scf_1_I | ||
694 | +6scf_1_K | ||
695 | +6yud_1_K | ||
696 | +6yud_1_O | ||
697 | +6scf_1_M | ||
698 | +6yud_1_P | ||
699 | +6scf_1_L | ||
700 | +6yud_1_M | ||
701 | +6yud_1_Q | ||
702 | +6o6x_1_D | ||
703 | +4ba2_1_R | ||
704 | +6o6x_1_C | ||
705 | +6o7b_1_C | ||
706 | +6o6v_1_C | ||
707 | +6r7b_1_D | ||
708 | +6r9r_1_D | ||
709 | +6ov0_1_E | ||
710 | +6ov0_1_H | ||
711 | +6ov0_1_G | ||
712 | +6o6v_1_D | ||
713 | +6ov0_1_F | ||
714 | +6o7b_1_D | ||
715 | +5e02_1_C | ||
716 | +6r9r_1_E | ||
717 | +6r7b_1_E | ||
718 | +6o7i_1_I | ||
719 | +6o7h_1_K | ||
720 | +7jyy_1_F | ||
721 | +7jyy_1_E | ||
722 | +7jz0_1_F | ||
723 | +7jz0_1_E | ||
724 | +6rt6_1_A | ||
725 | +6rt6_1_E | ||
726 | +1y1y_1_P | ||
727 | +5zuu_1_I | ||
728 | +5zuu_1_G | ||
729 | +4peh_1_W | ||
730 | +4peh_1_V | ||
731 | +4peh_1_X | ||
732 | +4peh_1_Y | ||
733 | +4peh_1_Z | ||
734 | +6mkn_1_W | ||
735 | +4cxg_1_C | ||
736 | +4cxh_1_C | ||
737 | +1x1l_1_A | ||
738 | +1zc8_1_Z | ||
739 | +2ob7_1_D | ||
740 | +2ob7_1_A | ||
741 | +4eya_1_E | ||
742 | +4eya_1_F | ||
743 | +4eya_1_Q | ||
744 | +4eya_1_R | ||
745 | +2r1g_1_B | ||
746 | +4ht9_1_E | ||
747 | +1cvj_1_M | ||
748 | +6z1p_1_AB | ||
749 | +6z1p_1_AA | ||
750 | +4ii9_1_C | ||
751 | +5mq0_1_3 | ||
752 | +5uk4_1_X | ||
753 | +5uk4_1_V | ||
754 | +5uk4_1_W | ||
755 | +5uk4_1_U | ||
756 | +5f6c_1_E | ||
757 | +4rcj_1_B | ||
758 | +1xnr_1_W | ||
759 | +6e0o_1_C | ||
760 | +6o75_1_D | ||
761 | +6o75_1_C | ||
762 | +6e0o_1_B | ||
763 | +3j06_1_R | ||
764 | +1r2x_1_C | ||
765 | +1r2w_1_C | ||
766 | +1eg0_1_L | ||
767 | +4eya_1_G | ||
768 | +4eya_1_H | ||
769 | +4eya_1_S | ||
770 | +4eya_1_T | ||
771 | +4dr4_1_V | ||
772 | +1ibl_1_Z | ||
773 | +1ibm_1_Z | ||
774 | +4dr5_1_V | ||
775 | +4d61_1_J | ||
776 | +1trj_1_B | ||
777 | +1trj_1_C | ||
778 | +6q8y_1_N | ||
779 | +6sv4_1_N | ||
780 | +6i7o_1_N | ||
781 | +5k8h_1_A | ||
782 | +5z4a_1_B | ||
783 | +3jbu_1_V | ||
784 | +1h2c_1_R | ||
785 | +1h2d_1_S | ||
786 | +1h2d_1_R | ||
787 | +6szs_1_X | ||
788 | +5mgp_1_X | ||
789 | +6enu_1_X | ||
790 | +6enf_1_X | ||
791 | +6enj_1_X | ||
792 | +1pvo_1_L | ||
793 | +1pvo_1_G | ||
794 | +1pvo_1_H | ||
795 | +1pvo_1_J | ||
796 | +1pvo_1_K | ||
797 | +2ht1_1_K | ||
798 | +2ht1_1_J | ||
799 | +6eri_1_AX | ||
800 | +1zc8_1_A | ||
801 | +1zc8_1_C | ||
802 | +1zc8_1_B | ||
803 | +1zc8_1_G | ||
804 | +1zc8_1_I | ||
805 | +1zc8_1_H | ||
806 | +1zc8_1_J | ||
807 | +4v8z_1_CX | ||
808 | +6kqe_1_I | ||
809 | +5uh8_1_I | ||
810 | +5vi5_1_Q | ||
811 | +4xln_1_T | ||
812 | +4xlr_1_T | ||
813 | +4xln_1_Q | ||
814 | +5i2d_1_K | ||
815 | +5i2d_1_V | ||
816 | +4xlr_1_Q | ||
817 | +6sty_1_C | ||
818 | +6sty_1_F | ||
819 | +2xs5_1_D | ||
820 | +3ok4_1_N | ||
821 | +3ok4_1_L | ||
822 | +3ok4_1_Z | ||
823 | +3ok4_1_4 | ||
824 | +3ok4_1_V | ||
825 | +3ok4_1_X | ||
826 | +3ok4_1_P | ||
827 | +3ok4_1_H | ||
828 | +3ok4_1_J | ||
829 | +3ok4_1_R | ||
830 | +3ok4_1_T | ||
831 | +3ok4_1_2 | ||
832 | +6n6h_1_D | ||
833 | +5wnt_1_B | ||
834 | +3b0u_1_B | ||
835 | +3b0u_1_A | ||
836 | +4x9e_1_G | ||
837 | +4x9e_1_H | ||
838 | +6z1p_1_BB | ||
839 | +6z1p_1_BA | ||
840 | +2uxd_1_X | ||
841 | +4qvd_1_H | ||
842 | +4v7e_1_AB | ||
843 | +3ol9_1_D | ||
844 | +3ol9_1_H | ||
845 | +3ol9_1_L | ||
846 | +3ol9_1_P | ||
847 | +3olb_1_L | ||
848 | +3olb_1_P | ||
849 | +3olb_1_D | ||
850 | +3olb_1_H | ||
851 | +3ol6_1_D | ||
852 | +3ol6_1_H | ||
853 | +3ol6_1_L | ||
854 | +3ol6_1_P | ||
855 | +3ol8_1_D | ||
856 | +3ol8_1_H | ||
857 | +3ol7_1_L | ||
858 | +3ol7_1_P | ||
859 | +3ol7_1_D | ||
860 | +3ol7_1_H | ||
861 | +3ol8_1_L | ||
862 | +3ol8_1_P | ||
863 | +1qzc_1_C | ||
864 | +1qzc_1_A | ||
865 | +6ole_1_V | ||
866 | +6om0_1_V | ||
867 | +6oli_1_V | ||
868 | +6om7_1_V | ||
869 | +6w6l_1_V | ||
870 | +6olf_1_V | ||
871 | +1mvr_1_D | ||
872 | +4wtm_1_T | ||
873 | +4wtm_1_P | ||
874 | +5x70_1_E | ||
875 | +5x70_1_G | ||
876 | +6gz5_1_BV | ||
877 | +6gz4_1_BV | ||
878 | +6gz3_1_BV | ||
879 | +6fti_1_Q | ||
880 | +4v7e_1_AE | ||
881 | +4v7e_1_AD | ||
882 | +4x62_1_B | ||
883 | +4x64_1_B | ||
884 | +4x65_1_B | ||
885 | +1xmq_1_W | ||
886 | +4x66_1_B | ||
887 | +3t1h_1_W | ||
888 | +3t1y_1_W | ||
889 | +1xmo_1_W | ||
890 | +4adx_1_9 | ||
891 | +6kr6_1_B | ||
892 | +1zn1_1_B | ||
893 | +6z8k_1_X | ||
894 | +1cvj_1_Q | ||
895 | +4csf_1_U | ||
896 | +4csf_1_Q | ||
897 | +4csf_1_G | ||
898 | +4csf_1_M | ||
899 | +4csf_1_K | ||
900 | +4csf_1_A | ||
901 | +4csf_1_I | ||
902 | +4csf_1_S | ||
903 | +4csf_1_C | ||
904 | +4csf_1_W | ||
905 | +4csf_1_O | ||
906 | +4csf_1_E | ||
907 | +1cvj_1_N | ||
908 | +1cvj_1_O | ||
909 | +1cvj_1_S | ||
910 | +1cvj_1_P | ||
911 | +1cvj_1_T | ||
912 | +1cvj_1_R | ||
913 | +6th6_1_AA | ||
914 | +6skg_1_AA | ||
915 | +6skf_1_AA | ||
916 | +6q8y_1_M | ||
917 | +6i7o_1_M | ||
918 | +6zmw_1_W | ||
919 | +6ybv_1_W | ||
920 | +2fz2_1_D | ||
921 | +2xpj_1_D | ||
922 | +2vrt_1_H | ||
923 | +2vrt_1_G | ||
924 | +1emi_1_B | ||
925 | +6r9m_1_B | ||
926 | +4nia_1_C | ||
927 | +4nia_1_A | ||
928 | +4nia_1_H | ||
929 | +4nia_1_N | ||
930 | +4nia_1_G | ||
931 | +4nia_1_D | ||
932 | +4nia_1_B | ||
933 | +4nia_1_I | ||
934 | +4nia_1_E | ||
935 | +4nia_1_M | ||
936 | +4oq9_1_I | ||
937 | +4oq9_1_G | ||
938 | +4oq9_1_C | ||
939 | +4oq9_1_H | ||
940 | +4oq9_1_N | ||
941 | +4oq9_1_A | ||
942 | +4oq9_1_D | ||
943 | +4oq9_1_E | ||
944 | +4oq9_1_M | ||
945 | +4oq9_1_B | ||
946 | +5uhc_1_I | ||
947 | +1uvn_1_F | ||
948 | +1uvn_1_B | ||
949 | +1uvn_1_D | ||
950 | +3iy9_1_A | ||
951 | +4wtk_1_T | ||
952 | +4wtk_1_P | ||
953 | +1vqn_1_4 | ||
954 | +4oav_1_C | ||
955 | +4oav_1_A | ||
956 | +3ep2_1_E | ||
957 | +3eq3_1_E | ||
958 | +3eq4_1_E | ||
959 | +3ep2_1_A | ||
960 | +3eq3_1_A | ||
961 | +3eq4_1_A | ||
962 | +3ep2_1_C | ||
963 | +3eq3_1_C | ||
964 | +3eq4_1_C | ||
965 | +3ep2_1_B | ||
966 | +3eq3_1_B | ||
967 | +3eq4_1_B | ||
968 | +4i67_1_B | ||
969 | +3pgw_1_R | ||
970 | +3pgw_1_N | ||
971 | +3cw1_1_X | ||
972 | +3cw1_1_W | ||
973 | +3cw1_1_V | ||
974 | +5it9_1_I | ||
975 | +6k32_1_T | ||
976 | +6k32_1_P | ||
977 | +5mmj_1_A | ||
978 | +5x8r_1_A | ||
979 | +3j2k_1_3 | ||
980 | +3j2k_1_2 | ||
981 | +3j2k_1_1 | ||
982 | +3j2k_1_0 | ||
983 | +3j2k_1_4 | ||
984 | +3nvk_1_G | ||
985 | +3nvk_1_S | ||
986 | +2iy3_1_B | ||
987 | +1cwp_1_F | ||
988 | +5z4j_1_B | ||
989 | +5gmf_1_E | ||
990 | +5gmf_1_H | ||
991 | +6e4p_1_J | ||
992 | +5gmf_1_F | ||
993 | +5gmf_1_G | ||
994 | +5gmg_1_D | ||
995 | +5gmg_1_C | ||
996 | +6e4p_1_K | ||
997 | +3ie1_1_E | ||
998 | +3ie1_1_H | ||
999 | +3ie1_1_F | ||
1000 | +4dr7_1_V | ||
1001 | +3ie1_1_G | ||
1002 | +3s4g_1_C | ||
1003 | +3s4g_1_B | ||
1004 | +2qqp_1_R | ||
1005 | +2zde_1_E | ||
1006 | +2zde_1_F | ||
1007 | +2zde_1_H | ||
1008 | +2zde_1_G | ||
1009 | +1nb7_1_E | ||
1010 | +1nb7_1_F | ||
1011 | +4hos_1_X | ||
1012 | +3p6y_1_T | ||
1013 | +3p6y_1_V | ||
1014 | +3p6y_1_U | ||
1015 | +3p6y_1_Q | ||
1016 | +3p6y_1_W | ||
1017 | +5dto_1_B | ||
1018 | +4cxh_1_X | ||
1019 | +1uvj_1_F | ||
1020 | +1uvj_1_D | ||
1021 | +1uvj_1_E | ||
1022 | +6kqd_1_I | ||
1023 | +6kqd_1_S | ||
1024 | +5uh5_1_I | ||
1025 | +1ytu_1_F | ||
1026 | +1ytu_1_D | ||
1027 | +4kzz_1_J | ||
1028 | +5t2c_1_AN | ||
1029 | +4v5z_1_BF | ||
1030 | +3j6b_1_E | ||
1031 | +4v4f_1_B6 | ||
1032 | +4v4f_1_A5 | ||
1033 | +4v4f_1_A3 | ||
1034 | +4v4f_1_B0 | ||
1035 | +4v4f_1_B9 | ||
1036 | +4v4f_1_A2 | ||
1037 | +4v4f_1_A8 | ||
1038 | +4v4f_1_A1 | ||
1039 | +4v4f_1_A9 | ||
1040 | +4v4f_1_BZ | ||
1041 | +4v4f_1_B8 | ||
1042 | +4v4f_1_B7 | ||
1043 | +4v4f_1_B5 | ||
1044 | +4v4f_1_A0 | ||
1045 | +4v4f_1_A7 | ||
1046 | +4v4f_1_A4 | ||
1047 | +4v4f_1_AZ | ||
1048 | +4v4f_1_B3 | ||
1049 | +4v4f_1_B1 | ||
1050 | +4v4f_1_B4 | ||
1051 | +4v4f_1_A6 | ||
1052 | +4v4f_1_B2 | ||
1053 | +5flx_1_Z | ||
1054 | +5zsb_1_C | ||
1055 | +5zsb_1_D | ||
1056 | +5zsn_1_D | ||
1057 | +5zsn_1_E | ||
1058 | +3jcr_1_N | ||
1059 | +6gfw_1_R | ||
1060 | +2vaz_1_A | ||
1061 | +1qzc_1_B | ||
1062 | +1mvr_1_C | ||
1063 | +4v5z_1_BP | ||
1064 | +6n6e_1_D | ||
1065 | +4g7o_1_I | ||
1066 | +4g7o_1_S | ||
1067 | +5x22_1_S | ||
1068 | +5x22_1_I | ||
1069 | +5x21_1_I | ||
1070 | +5uh6_1_I | ||
1071 | +6l74_1_I | ||
1072 | +5uh9_1_I | ||
1073 | +2ftc_1_R | ||
1074 | +6sag_1_R | ||
1075 | +4udv_1_R | ||
1076 | +2r1g_1_E | ||
1077 | +5zsc_1_D | ||
1078 | +5zsc_1_C | ||
1079 | +6woy_1_I | ||
1080 | +6wox_1_I | ||
1081 | +6evj_1_N | ||
1082 | +6evj_1_M | ||
1083 | +4gkk_1_W | ||
1084 | +4v9e_1_AG | ||
1085 | +4v9e_1_BM | ||
1086 | +4v9e_1_AM | ||
1087 | +4v9e_1_AA | ||
1088 | +4v9e_1_BA | ||
1089 | +4v9e_1_BG | ||
1090 | +5lzs_1_II | ||
1091 | +6fqr_1_C | ||
1092 | +6ha1_1_X | ||
1093 | +5kcr_1_1X | ||
1094 | +2r1g_1_X | ||
1095 | +3m7n_1_Z | ||
1096 | +3m85_1_X | ||
1097 | +3m85_1_Z | ||
1098 | +3m85_1_Y | ||
1099 | +1e8s_1_C | ||
1100 | +5wnp_1_B | ||
1101 | +5wnv_1_B | ||
1102 | +5yts_1_B | ||
1103 | +1utd_1_6 | ||
1104 | +1utd_1_Z | ||
1105 | +1utd_1_4 | ||
1106 | +1utd_1_7 | ||
1107 | +1utd_1_9 | ||
1108 | +1utd_1_5 | ||
1109 | +1utd_1_3 | ||
1110 | +1utd_1_2 | ||
1111 | +1utd_1_8 | ||
1112 | +1utd_1_1 | ||
1113 | +6n6i_1_C | ||
1114 | +6n6i_1_D | ||
1115 | +6n6a_1_D | ||
1116 | +6ij2_1_F | ||
1117 | +6ij2_1_G | ||
1118 | +6ij2_1_H | ||
1119 | +6ij2_1_E | ||
1120 | +3u2e_1_D | ||
1121 | +3u2e_1_C | ||
1122 | +5uef_1_C | ||
1123 | +5uef_1_D | ||
1124 | +4x4u_1_H | ||
1125 | +4afy_1_D | ||
1126 | +6oy5_1_I | ||
1127 | +6owl_1_B | ||
1128 | +6owl_1_C | ||
1129 | +4afy_1_C | ||
1130 | +4lq3_1_R | ||
1131 | +6s0m_1_C | ||
1132 | +6gx6_1_B | ||
1133 | +4k4s_1_D | ||
1134 | +4k4s_1_H | ||
1135 | +4k4t_1_H | ||
1136 | +4k4t_1_D | ||
1137 | +1zn1_1_C | ||
1138 | +1zn0_1_C | ||
1139 | +1xpu_1_G | ||
1140 | +1xpu_1_L | ||
1141 | +1xpr_1_L | ||
1142 | +1xpu_1_H | ||
1143 | +1xpo_1_K | ||
1144 | +1xpo_1_J | ||
1145 | +1xpu_1_J | ||
1146 | +1xpo_1_H | ||
1147 | +1xpr_1_J | ||
1148 | +1xpu_1_K | ||
1149 | +1xpr_1_K | ||
1150 | +1xpo_1_M | ||
1151 | +1xpo_1_L | ||
1152 | +1xpu_1_M | ||
1153 | +1xpr_1_M | ||
1154 | +1xpo_1_G | ||
1155 | +1xpr_1_H | ||
1156 | +1xpr_1_G | ||
1157 | +6gc5_1_F | ||
1158 | +6gc5_1_H | ||
1159 | +6gc5_1_G | ||
1160 | +4v7e_1_AA | ||
1161 | +4v7e_1_AC | ||
1162 | +1n1h_1_B | ||
1163 | +4ohz_1_B | ||
1164 | +6t83_1_6B | ||
1165 | +4gv6_1_C | ||
1166 | +4gv6_1_B | ||
1167 | +4gv3_1_C | ||
1168 | +4gv3_1_B | ||
1169 | +4gv9_1_E | ||
1170 | +6i7o_1_L | ||
1171 | +2a8v_1_D | ||
1172 | +6qx3_1_G | ||
1173 | +2xnr_1_C | ||
1174 | +4gkj_1_W | ||
1175 | +4v5z_1_BC | ||
1176 | +4v5z_1_BB | ||
1177 | +4v5z_1_BH | ||
1178 | +3j0o_1_F | ||
1179 | +3j0l_1_F | ||
1180 | +3j0p_1_F | ||
1181 | +3j0q_1_F | ||
1182 | +3j0o_1_B | ||
1183 | +3j0l_1_B | ||
1184 | +3j0o_1_C | ||
1185 | +3j0l_1_C | ||
1186 | +3j0q_1_C | ||
1187 | +3j0p_1_C | ||
1188 | +3j0o_1_A | ||
1189 | +3j0l_1_A | ||
1190 | +3j0q_1_A | ||
1191 | +3j0p_1_A | ||
1192 | +1cwp_1_D | ||
1193 | +4v5z_1_BJ | ||
1194 | +5sze_1_C | ||
1195 | +6wre_1_D | ||
1196 | +6i0u_1_B | ||
1197 | +5zsa_1_C | ||
1198 | +5zsa_1_D | ||
1199 | +1n34_1_Z | ||
1200 | +3pf5_1_S | ||
1201 | +6ppn_1_A | ||
1202 | +6ppn_1_I | ||
1203 | +6qdw_1_V | ||
1204 | +5hk0_1_F | ||
1205 | +4qm6_1_D | ||
1206 | +4qm6_1_C | ||
1207 | +4jzu_1_C | ||
1208 | +4jzv_1_C | ||
1209 | +5ytv_1_B | ||
1210 | +4k4z_1_P | ||
1211 | +4k4z_1_D | ||
1212 | +4k4x_1_L | ||
1213 | +4k4z_1_L | ||
1214 | +4k4x_1_D | ||
1215 | +4k4z_1_H | ||
1216 | +4k4x_1_H | ||
1217 | +4k4x_1_P | ||
1218 | +1t1m_1_A | ||
1219 | +1t1m_1_B | ||
1220 | +4a3b_1_P | ||
1221 | +4a3m_1_P | ||
1222 | +6u6y_1_E | ||
1223 | +6u6y_1_G | ||
1224 | +6u6y_1_F | ||
1225 | +6u6y_1_H | ||
1226 | +6qik_1_X | ||
1227 | +6rzz_1_X | ||
1228 | +6ri5_1_X | ||
1229 | +6qt0_1_X | ||
1230 | +6qtz_1_X | ||
1231 | +6s05_1_X | ||
1232 | +6t83_1_BB | ||
1233 | +6t83_1_4B | ||
1234 | +5fl8_1_Z | ||
1235 | +5jcs_1_Z | ||
1236 | +5mrc_1_BB | ||
1237 | +5mre_1_BB | ||
1238 | +5mrf_1_BB | ||
1239 | +6gz4_1_BW | ||
1240 | +3j46_1_P | ||
1241 | +3jcr_1_M | ||
1242 | +4e6b_1_A | ||
1243 | +4e6b_1_B | ||
1244 | +6a6l_1_D | ||
1245 | +4v5z_1_BS | ||
1246 | +4v8t_1_1 | ||
1247 | +1uvi_1_D | ||
1248 | +1uvi_1_F | ||
1249 | +1uvi_1_E | ||
1250 | +4m7d_1_P | ||
1251 | +4k4u_1_D | ||
1252 | +4k4u_1_H | ||
1253 | +6rt7_1_E | ||
1254 | +6rt7_1_A | ||
1255 | +2voo_1_C | ||
1256 | +2voo_1_D | ||
1257 | +5k78_1_X | ||
1258 | +5k78_1_Y | ||
1259 | +4ylo_1_9 | ||
1260 | +4kzy_1_I | ||
1261 | +4kzz_1_I | ||
1262 | +4kzx_1_I | ||
1263 | +5vyc_1_I2 | ||
1264 | +5vyc_1_I3 | ||
1265 | +5vyc_1_I5 | ||
1266 | +5vyc_1_I1 | ||
1267 | +5vyc_1_I6 | ||
1268 | +5vyc_1_I4 | ||
1269 | +6ip8_1_2M | ||
1270 | +6ip5_1_2M | ||
1271 | +6ip6_1_2M | ||
1272 | +6qcs_1_M | ||
1273 | +486d_1_G | ||
1274 | +2r1g_1_C | ||
1275 | +486d_1_F | ||
1276 | +4v5z_1_B0 | ||
1277 | +4nia_1_O | ||
1278 | +4nia_1_J | ||
1279 | +4nia_1_K | ||
1280 | +4nia_1_L | ||
1281 | +4nia_1_F | ||
1282 | +4oq9_1_K | ||
1283 | +4oq9_1_O | ||
1284 | +4oq9_1_J | ||
1285 | +4oq9_1_F | ||
1286 | +4oq9_1_L | ||
1287 | +5tbw_1_SR | ||
1288 | +6hhq_1_SR | ||
1289 | +6zvi_1_H | ||
1290 | +6sv4_1_2B | ||
1291 | +6sv4_1_2C | ||
1292 | +6t83_1_2B | ||
1293 | +6t83_1_A | ||
1294 | +6i7o_1_2B | ||
1295 | +6r9q_1_B | ||
1296 | +6v3a_1_SN1 | ||
1297 | +6v3b_1_SN1 | ||
1298 | +6v39_1_SN1 | ||
1299 | +6v3e_1_SN1 | ||
1300 | +1pn7_1_C | ||
1301 | +1mj1_1_Q | ||
1302 | +1mj1_1_R | ||
1303 | +4dr6_1_V | ||
1304 | +6kql_1_I | ||
1305 | +4eya_1_M | ||
1306 | +4eya_1_N | ||
1307 | +4eya_1_A | ||
1308 | +4eya_1_B | ||
1309 | +2wj8_1_D | ||
1310 | +2wj8_1_I | ||
1311 | +2wj8_1_L | ||
1312 | +2wj8_1_F | ||
1313 | +2wj8_1_C | ||
1314 | +2wj8_1_Q | ||
1315 | +2wj8_1_J | ||
1316 | +2wj8_1_P | ||
1317 | +2wj8_1_K | ||
1318 | +2wj8_1_E | ||
1319 | +2wj8_1_T | ||
1320 | +2wj8_1_B | ||
1321 | +2wj8_1_O | ||
1322 | +2wj8_1_N | ||
1323 | +2wj8_1_A | ||
1324 | +2wj8_1_H | ||
1325 | +2wj8_1_R | ||
1326 | +2wj8_1_M | ||
1327 | +2wj8_1_S | ||
1328 | +2wj8_1_G | ||
1329 | +4e6b_1_E | ||
1330 | +4e6b_1_F | ||
1331 | +6p71_1_I | ||
1332 | +3pdm_1_R | ||
1333 | +5det_1_P | ||
1334 | +5els_1_I | ||
1335 | +4n2s_1_B | ||
1336 | +4yoe_1_E | ||
1337 | +3j0o_1_H | ||
1338 | +3j0l_1_H | ||
1339 | +3j0p_1_H | ||
1340 | +3j0q_1_H | ||
1341 | +5gxi_1_B | ||
1342 | +3iy8_1_A | ||
1343 | +6tnu_1_M | ||
1344 | +5mc6_1_M | ||
1345 | +5mc6_1_N | ||
1346 | +4eya_1_O | ||
1347 | +4eya_1_P | ||
1348 | +4eya_1_C | ||
1349 | +4eya_1_D | ||
1350 | +6htq_1_V | ||
1351 | +6htq_1_W | ||
1352 | +6htq_1_U | ||
1353 | +6uu6_1_333 | ||
1354 | +6v3a_1_V | ||
1355 | +6v39_1_V | ||
1356 | +5a0v_1_F | ||
1357 | +3avt_1_T | ||
1358 | +6d1v_1_C | ||
1359 | +4s2x_1_B | ||
1360 | +4s2y_1_B | ||
1361 | +5wnu_1_B | ||
1362 | +1zc8_1_F | ||
1363 | +1vtm_1_R | ||
1364 | +4v5z_1_BA | ||
1365 | +4v5z_1_BE | ||
1366 | +4v5z_1_BD | ||
1367 | +4v5z_1_BG | ||
1368 | +4v5z_1_BI | ||
1369 | +4v5z_1_BK | ||
1370 | +4v5z_1_BM | ||
1371 | +4v5z_1_BL | ||
1372 | +4v5z_1_BV | ||
1373 | +4v5z_1_BO | ||
1374 | +4v5z_1_BN | ||
1375 | +4v5z_1_BQ | ||
1376 | +4v5z_1_BR | ||
1377 | +4v5z_1_BT | ||
1378 | +4v5z_1_BU | ||
1379 | +4v5z_1_BW | ||
1380 | +4v5z_1_BY | ||
1381 | +4v5z_1_BX | ||
1382 | +4v5z_1_BZ | ||
1383 | +6u9x_1_H | ||
1384 | +6u9x_1_K | ||
1385 | +5elk_1_R | ||
1386 | +6okk_1_G | ||
1387 | +4cxg_1_A | ||
1388 | +4cxh_1_A | ||
1389 | +6bk8_1_I | ||
1390 | +4cxg_1_B | ||
1391 | +4cxh_1_B | ||
1392 | +4v5z_1_B1 | ||
1393 | +5z4d_1_B | ||
1394 | +6o78_1_E | ||
1395 | +6ha8_1_X | ||
1396 | +1m8w_1_E | ||
1397 | +1m8w_1_F | ||
1398 | +5udi_1_B | ||
1399 | +5udl_1_B | ||
1400 | +5udk_1_B | ||
1401 | +5udj_1_B | ||
1402 | +5w5i_1_B | ||
1403 | +5w5i_1_D | ||
1404 | +5w5h_1_B | ||
1405 | +5w5h_1_D | ||
1406 | +4eya_1_K | ||
1407 | +4eya_1_L | ||
1408 | +4eya_1_I | ||
1409 | +4eya_1_J | ||
1410 | +4g9z_1_E | ||
1411 | +4g9z_1_F | ||
1412 | +3nma_1_B | ||
1413 | +3nma_1_C | ||
1414 | +6een_1_G | ||
1415 | +6een_1_I | ||
1416 | +6een_1_H | ||
1417 | +4wti_1_T | ||
1418 | +4wti_1_P | ||
1419 | +5l3p_1_Y | ||
1420 | +4hor_1_X | ||
1421 | +3rzo_1_R | ||
1422 | +2f4v_1_Z | ||
1423 | +1qln_1_R | ||
1424 | +2xs7_1_B | ||
1425 | +6zvi_1_E | ||
1426 | +6sv4_1_MC | ||
1427 | +6sv4_1_MB | ||
1428 | +6i7o_1_MB | ||
1429 | +6ogy_1_M | ||
1430 | +6ogy_1_N | ||
1431 | +6uej_1_B | ||
1432 | +1x18_1_A | ||
1433 | +5ytx_1_B | ||
1434 | +6o8w_1_U | ||
1435 | +4g0a_1_H | ||
1436 | +6r9p_1_B | ||
1437 | +3koa_1_C | ||
1438 | +4n48_1_D | ||
1439 | +4n48_1_G | ||
1440 | +6kug_1_B | ||
1441 | +6ktc_1_V | ||
1442 | +6ole_1_U | ||
1443 | +6om0_1_U | ||
1444 | +6olg_1_BV | ||
1445 | +6oli_1_U | ||
1446 | +6om7_1_U | ||
1447 | +6w6l_1_U | ||
1448 | +6olz_1_BV | ||
1449 | +6olf_1_U | ||
1450 | +5lzd_1_X | ||
1451 | +6m7k_1_B | ||
1452 | +3cd6_1_4 | ||
1453 | +3cma_1_5 | ||
1454 | +6n9e_1_2W | ||
1455 | +1vqo_1_4 | ||
1456 | +1qvg_1_3 | ||
1457 | +3cme_1_5 | ||
1458 | +5lzd_1_W | ||
1459 | +5lze_1_W | ||
1460 | +5lzc_1_W | ||
1461 | +5lzb_1_W | ||
1462 | +3wzi_1_C | ||
1463 | +1mvr_1_E | ||
1464 | +1mvr_1_B | ||
1465 | +1mvr_1_A | ||
1466 | +4adx_1_0 | ||
1467 | +4adx_1_8 | ||
1468 | +1n33_1_Z | ||
1469 | +6dti_1_W | ||
1470 | +3d2s_1_F | ||
1471 | +3d2s_1_H | ||
1472 | +5mrc_1_AA | ||
1473 | +5mre_1_AA | ||
1474 | +5mrf_1_AA | ||
1475 | +5fl8_1_Y | ||
1476 | +5jcs_1_Y | ||
1477 | +2r1g_1_A | ||
1478 | +2r1g_1_D | ||
1479 | +2r1g_1_F | ||
1480 | +3eq4_1_Y | ||
1481 | +4wkr_1_C | ||
1482 | +4v99_1_EC | ||
1483 | +4v99_1_AC | ||
1484 | +4v99_1_BH | ||
1485 | +4v99_1_CH | ||
1486 | +4v99_1_AM | ||
1487 | +4v99_1_DC | ||
1488 | +4v99_1_JW | ||
1489 | +4v99_1_EH | ||
1490 | +4v99_1_BW | ||
1491 | +4v99_1_FW | ||
1492 | +4v99_1_AW | ||
1493 | +4v99_1_BC | ||
1494 | +4v99_1_BM | ||
1495 | +4v99_1_IC | ||
1496 | +4v99_1_EM | ||
1497 | +4v99_1_ER | ||
1498 | +4v99_1_IW | ||
1499 | +4v99_1_JH | ||
1500 | +4v99_1_JR | ||
1501 | +4v99_1_AH | ||
1502 | +4v99_1_GR | ||
1503 | +4v99_1_IR | ||
1504 | +4v99_1_BR | ||
1505 | +4v99_1_CW | ||
1506 | +4v99_1_HR | ||
1507 | +4v99_1_FH | ||
1508 | +4v99_1_HC | ||
1509 | +4v99_1_DW | ||
1510 | +4v99_1_GC | ||
1511 | +4v99_1_JC | ||
1512 | +4v99_1_DM | ||
1513 | +4v99_1_EW | ||
1514 | +4v99_1_AR | ||
1515 | +4v99_1_CR | ||
1516 | +4v99_1_JM | ||
1517 | +4v99_1_CC | ||
1518 | +4v99_1_IH | ||
1519 | +4v99_1_FR | ||
1520 | +4v99_1_CM | ||
1521 | +4v99_1_IM | ||
1522 | +4v99_1_FM | ||
1523 | +4v99_1_FC | ||
1524 | +4v99_1_GH | ||
1525 | +4v99_1_HM | ||
1526 | +4v99_1_HH | ||
1527 | +4v99_1_DR | ||
1528 | +4v99_1_HW | ||
1529 | +4v99_1_GW | ||
1530 | +4v99_1_DH | ||
1531 | +4v99_1_GM | ||
1532 | +6rt4_1_D | ||
1533 | +6rt4_1_C | ||
1534 | +6zvh_1_X | ||
1535 | +4dwa_1_D | ||
1536 | +6n6c_1_D | ||
1537 | +6n6j_1_C | ||
1538 | +6n6j_1_D | ||
1539 | +6p7q_1_E | ||
1540 | +6p7q_1_F | ||
1541 | +6p7q_1_D | ||
1542 | +6rcl_1_C | ||
1543 | +5jju_1_C | ||
1544 | +4ejt_1_G | ||
1545 | +5ceu_1_C | ||
1546 | +5ceu_1_D | ||
1547 | +6lkq_1_W | ||
1548 | +3qsu_1_P | ||
1549 | +3qsu_1_R | ||
1550 | +1n38_1_B | ||
1551 | +4qvc_1_G | ||
1552 | +6q1h_1_D | ||
1553 | +6q1h_1_H | ||
1554 | +6p7p_1_F | ||
1555 | +6p7p_1_E | ||
1556 | +6p7p_1_D | ||
1557 | +6vm6_1_J | ||
1558 | +6vm6_1_G | ||
1559 | +6wan_1_K | ||
1560 | +6wan_1_H | ||
1561 | +6wan_1_G | ||
1562 | +6wan_1_L | ||
1563 | +6wan_1_I | ||
1564 | +6ywo_1_F | ||
1565 | +6wan_1_J | ||
1566 | +4oau_1_A | ||
1567 | +6ywo_1_E | ||
1568 | +6ywo_1_K | ||
1569 | +6vm6_1_I | ||
1570 | +6vm6_1_H | ||
1571 | +6ywo_1_I | ||
1572 | +2a1r_1_C | ||
1573 | +2a1r_1_D | ||
1574 | +3gpq_1_E | ||
1575 | +3gpq_1_F | ||
1576 | +6o79_1_C | ||
1577 | +6vm6_1_K | ||
1578 | +6hyu_1_D | ||
1579 | +1laj_1_R | ||
1580 | +6ybv_1_K | ||
1581 | +6mpf_1_W | ||
1582 | +6spc_1_A | ||
1583 | +6spe_1_A | ||
1584 | +6fti_1_V | ||
1585 | +6ftj_1_V | ||
1586 | +6ftg_1_V | ||
1587 | +4g0a_1_G | ||
1588 | +4g0a_1_F | ||
1589 | +4g0a_1_E | ||
1590 | +2b2d_1_S | ||
1591 | +5hkc_1_C | ||
1592 | +1rmv_1_B | ||
1593 | +4qu7_1_X | ||
1594 | +4qu7_1_V | ||
1595 | +4qu7_1_U | ||
1596 | +4v5z_1_AH | ||
1597 | +4v5z_1_AA | ||
1598 | +4v5z_1_AB | ||
1599 | +4v5z_1_AC | ||
1600 | +4v5z_1_AD | ||
1601 | +4v5z_1_AE | ||
1602 | +4v5z_1_AF | ||
1603 | +4v5z_1_AG | ||
1604 | +6pmi_1_3 | ||
1605 | +6pmj_1_3 | ||
1606 | +5hjz_1_C | ... | ... |
This diff could not be displayed because it is too large.
... | @@ -11,7 +11,7 @@ | ... | @@ -11,7 +11,7 @@ |
11 | # - Use a specialised database (SILVA) : better alignments (we guess?), but two kind of jobs | 11 | # - Use a specialised database (SILVA) : better alignments (we guess?), but two kind of jobs |
12 | # - Use cmalign --small everywhere (homogeneity) | 12 | # - Use cmalign --small everywhere (homogeneity) |
13 | # Moreover, --small requires --nonbanded --cyk, which means the output alignement is the optimally scored one. | 13 | # Moreover, --small requires --nonbanded --cyk, which means the output alignement is the optimally scored one. |
14 | -# To date, we trust Infernal as the best tool to realign RNA. Is it ? | 14 | +# To date, we trust Infernal as the best tool to realign ncRNA. Is it ? |
15 | 15 | ||
16 | # Contact: louis.becquey@univ-evry.fr (PhD student), fariza.tahi@univ-evry.fr (PI) | 16 | # Contact: louis.becquey@univ-evry.fr (PhD student), fariza.tahi@univ-evry.fr (PI) |
17 | 17 | ||
... | @@ -28,7 +28,7 @@ pd.set_option('display.max_rows', None) | ... | @@ -28,7 +28,7 @@ pd.set_option('display.max_rows', None) |
28 | LSU_set = ["RF00002", "RF02540", "RF02541", "RF02543", "RF02546"] # From Rfam CLAN 00112 | 28 | LSU_set = ["RF00002", "RF02540", "RF02541", "RF02543", "RF02546"] # From Rfam CLAN 00112 |
29 | SSU_set = ["RF00177", "RF02542", "RF02545", "RF01959", "RF01960"] # From Rfam CLAN 00111 | 29 | SSU_set = ["RF00177", "RF02542", "RF02545", "RF01959", "RF01960"] # From Rfam CLAN 00111 |
30 | 30 | ||
31 | -with sqlite3.connect("results/RNANet.db") as conn: | 31 | +with sqlite3.connect(os.getcwd()+"/results/RNANet.db") as conn: |
32 | df = pd.read_sql("SELECT rfam_acc, max_len, nb_total_homol, comput_time, comput_peak_mem FROM family;", conn) | 32 | df = pd.read_sql("SELECT rfam_acc, max_len, nb_total_homol, comput_time, comput_peak_mem FROM family;", conn) |
33 | 33 | ||
34 | to_remove = [ f for f in df.rfam_acc if f in LSU_set+SSU_set ] | 34 | to_remove = [ f for f in df.rfam_acc if f in LSU_set+SSU_set ] |
... | @@ -74,7 +74,7 @@ ax.set_ylabel("Maximum length of sequences ") | ... | @@ -74,7 +74,7 @@ ax.set_ylabel("Maximum length of sequences ") |
74 | ax.set_zlabel("Computation time (s)") | 74 | ax.set_zlabel("Computation time (s)") |
75 | 75 | ||
76 | plt.subplots_adjust(wspace=0.4) | 76 | plt.subplots_adjust(wspace=0.4) |
77 | -plt.savefig("results/cmalign_jobs_performance.png") | 77 | +plt.savefig(os.getcwd()+"/results/cmalign_jobs_performance.png") |
78 | 78 | ||
79 | # # ======================================================== | 79 | # # ======================================================== |
80 | # # Linear Regression of max_mem as function of max_length | 80 | # # Linear Regression of max_mem as function of max_length | ... | ... |
... | @@ -3,7 +3,6 @@ | ... | @@ -3,7 +3,6 @@ |
3 | # This file computes additional statistics over the produced dataset. | 3 | # This file computes additional statistics over the produced dataset. |
4 | # Run this file if you want the base counts, pair-type counts, identity percents, etc | 4 | # Run this file if you want the base counts, pair-type counts, identity percents, etc |
5 | # in the database. | 5 | # in the database. |
6 | -# This should be run from the folder where the file is (to access the database with path "results/RNANet.db") | ||
7 | 6 | ||
8 | import getopt, os, pickle, sqlite3, shlex, subprocess, sys | 7 | import getopt, os, pickle, sqlite3, shlex, subprocess, sys |
9 | import numpy as np | 8 | import numpy as np |
... | @@ -22,34 +21,35 @@ from multiprocessing import Pool, Manager | ... | @@ -22,34 +21,35 @@ from multiprocessing import Pool, Manager |
22 | from os import path | 21 | from os import path |
23 | from tqdm import tqdm | 22 | from tqdm import tqdm |
24 | from collections import Counter | 23 | from collections import Counter |
25 | -from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker | 24 | +from setproctitle import setproctitle |
25 | +from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker, trace_unhandled_exceptions | ||
26 | 26 | ||
27 | path_to_3D_data = "tobedefinedbyoptions" | 27 | path_to_3D_data = "tobedefinedbyoptions" |
28 | path_to_seq_data = "tobedefinedbyoptions" | 28 | path_to_seq_data = "tobedefinedbyoptions" |
29 | +runDir = os.getcwd() | ||
29 | res_thr = 20.0 # default: all structures | 30 | res_thr = 20.0 # default: all structures |
30 | 31 | ||
31 | LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112 | 32 | LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112 |
32 | SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111 | 33 | SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111 |
33 | 34 | ||
34 | -def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0): | 35 | +@trace_unhandled_exceptions |
36 | +def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=2.0): | ||
35 | """ | 37 | """ |
36 | Plot the joint distribution of pseudotorsion angles, in a Ramachandran-style graph. | 38 | Plot the joint distribution of pseudotorsion angles, in a Ramachandran-style graph. |
37 | - See Wadley & Pyle (2007) | 39 | + See Wadley & Pyle (2007). |
40 | + Only unique unmapped chains with resolution < res argument are considered. | ||
38 | 41 | ||
39 | Arguments: | 42 | Arguments: |
40 | - show: True or False, call plt.show() at this end or not | ||
41 | - filter_helical: None, "form", "zone", or "both" | ||
42 | - None: do not remove helical nucleotide | ||
43 | - "form": remove nucleotides if they belong to a A, B or Z form stem | ||
44 | - "zone": remove nucleotides falling in an arbitrary zone (see zone argument) | ||
45 | - "both": remove nucleotides fulfilling one or both of the above conditions | ||
46 | carbon: 1 or 4, use C4' (eta and theta) or C1' (eta_prime and theta_prime) | 43 | carbon: 1 or 4, use C4' (eta and theta) or C1' (eta_prime and theta_prime) |
44 | + show: True or False, call plt.show() at this end or not | ||
47 | sd_range: tuple, set values below avg + sd_range[0] * stdev to 0, | 45 | sd_range: tuple, set values below avg + sd_range[0] * stdev to 0, |
48 | and values above avg + sd_range[1] * stdev to avg + sd_range[1] * stdev. | 46 | and values above avg + sd_range[1] * stdev to avg + sd_range[1] * stdev. |
49 | This removes noise and cuts too high peaks, to clearly see the clusters. | 47 | This removes noise and cuts too high peaks, to clearly see the clusters. |
48 | + res: Minimal resolution (maximal resolution value, actually) of the structure to | ||
49 | + consider its nucleotides. | ||
50 | """ | 50 | """ |
51 | 51 | ||
52 | - os.makedirs("results/figures/wadley_plots/", exist_ok=True) | 52 | + os.makedirs(runDir + "/results/figures/wadley_plots/", exist_ok=True) |
53 | 53 | ||
54 | if carbon == 4: | 54 | if carbon == 4: |
55 | angle = "eta" | 55 | angle = "eta" |
... | @@ -63,30 +63,32 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0): | ... | @@ -63,30 +63,32 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0): |
63 | exit("You overestimate my capabilities !") | 63 | exit("You overestimate my capabilities !") |
64 | 64 | ||
65 | 65 | ||
66 | - if not path.isfile(f"data/wadley_kernel_{angle}_{res}A.npz"): | 66 | + if not path.isfile(runDir + f"/data/wadley_kernel_{angle}_{res}A.npz"): |
67 | 67 | ||
68 | # Get a worker number to position the progress bar | 68 | # Get a worker number to position the progress bar |
69 | global idxQueue | 69 | global idxQueue |
70 | thr_idx = idxQueue.get() | 70 | thr_idx = idxQueue.get() |
71 | + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} reproduce_wadley_results(carbon={carbon})") | ||
72 | + | ||
71 | pbar = tqdm(total=2, desc=f"Worker {thr_idx+1}: eta/theta C{carbon} kernels", position=thr_idx+1, leave=False) | 73 | pbar = tqdm(total=2, desc=f"Worker {thr_idx+1}: eta/theta C{carbon} kernels", position=thr_idx+1, leave=False) |
72 | 74 | ||
73 | # Extract the angle values of c2'-endo and c3'-endo nucleotides | 75 | # Extract the angle values of c2'-endo and c3'-endo nucleotides |
74 | - with sqlite3.connect("results/RNANet.db") as conn: | 76 | + with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
75 | df = pd.read_sql(f"""SELECT {angle}, th{angle} | 77 | df = pd.read_sql(f"""SELECT {angle}, th{angle} |
76 | - FROM nucleotide JOIN ( | 78 | + FROM ( |
77 | - SELECT chain_id FROM chain JOIN structure | 79 | + SELECT chain_id FROM chain JOIN structure ON chain.structure_id = structure.pdb_id |
78 | - WHERE structure.resolution <= {res} | 80 | + WHERE chain.rfam_acc = 'unmappd' AND structure.resolution <= {res} AND issue = 0 |
79 | - ) AS c | 81 | + ) AS c NATURAL JOIN nucleotide |
80 | WHERE puckering="C2'-endo" | 82 | WHERE puckering="C2'-endo" |
81 | AND {angle} IS NOT NULL | 83 | AND {angle} IS NOT NULL |
82 | AND th{angle} IS NOT NULL;""", conn) | 84 | AND th{angle} IS NOT NULL;""", conn) |
83 | c2_endo_etas = df[angle].values.tolist() | 85 | c2_endo_etas = df[angle].values.tolist() |
84 | c2_endo_thetas = df["th"+angle].values.tolist() | 86 | c2_endo_thetas = df["th"+angle].values.tolist() |
85 | df = pd.read_sql(f"""SELECT {angle}, th{angle} | 87 | df = pd.read_sql(f"""SELECT {angle}, th{angle} |
86 | - FROM nucleotide JOIN ( | 88 | + FROM ( |
87 | - SELECT chain_id FROM chain JOIN structure | 89 | + SELECT chain_id FROM chain JOIN structure ON chain.structure_id = structure.pdb_id |
88 | - WHERE structure.resolution <= {res} | 90 | + WHERE chain.rfam_acc = 'unmappd' AND structure.resolution <= {res} AND issue = 0 |
89 | - ) AS c | 91 | + ) AS c NATURAL JOIN nucleotide |
90 | WHERE form = '.' | 92 | WHERE form = '.' |
91 | AND puckering="C3'-endo" | 93 | AND puckering="C3'-endo" |
92 | AND {angle} IS NOT NULL | 94 | AND {angle} IS NOT NULL |
... | @@ -111,14 +113,16 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0): | ... | @@ -111,14 +113,16 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0): |
111 | pbar.update(1) | 113 | pbar.update(1) |
112 | 114 | ||
113 | # Save the data to an archive for later use without the need to recompute | 115 | # Save the data to an archive for later use without the need to recompute |
114 | - np.savez(f"data/wadley_kernel_{angle}_{res}A.npz", | 116 | + np.savez(runDir + f"/data/wadley_kernel_{angle}_{res}A.npz", |
115 | c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas, | 117 | c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas, |
116 | c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas, | 118 | c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas, |
117 | kernel_c3=f_c3, kernel_c2=f_c2) | 119 | kernel_c3=f_c3, kernel_c2=f_c2) |
118 | pbar.close() | 120 | pbar.close() |
119 | idxQueue.put(thr_idx) | 121 | idxQueue.put(thr_idx) |
120 | else: | 122 | else: |
121 | - f = np.load(f"data/wadley_kernel_{angle}_{res}A.npz") | 123 | + setproctitle(f"RNANet statistics.py reproduce_wadley_results(carbon={carbon})") |
124 | + | ||
125 | + f = np.load(runDir + f"/data/wadley_kernel_{angle}_{res}A.npz") | ||
122 | c2_endo_etas = f["c2_endo_e"] | 126 | c2_endo_etas = f["c2_endo_e"] |
123 | c3_endo_etas = f["c3_endo_e"] | 127 | c3_endo_etas = f["c3_endo_e"] |
124 | c2_endo_thetas = f["c2_endo_t"] | 128 | c2_endo_thetas = f["c2_endo_t"] |
... | @@ -148,7 +152,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0): | ... | @@ -148,7 +152,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0): |
148 | f_low_thr = f.mean() + sd_range[0]*f.std() | 152 | f_low_thr = f.mean() + sd_range[0]*f.std() |
149 | f_cut = np.where(f > f_sup_thr, f_sup_thr, f) | 153 | f_cut = np.where(f > f_sup_thr, f_sup_thr, f) |
150 | f_cut = np.where(f_cut < f_low_thr, 0, f_cut) | 154 | f_cut = np.where(f_cut < f_low_thr, 0, f_cut) |
151 | - levels = [f.mean()+f.std(), f.mean()+2*f.std(), f.mean()+4*f.std()] | 155 | + levels = [ f.mean()+f.std(), f.mean()+2*f.std(), f.mean()+4*f.std()] |
152 | 156 | ||
153 | # histogram: | 157 | # histogram: |
154 | fig = plt.figure() | 158 | fig = plt.figure() |
... | @@ -157,7 +161,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0): | ... | @@ -157,7 +161,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0): |
157 | ax.bar3d(xpos.ravel(), ypos.ravel(), 0.0, 0.09, 0.09, hist_cut.ravel(), color=color_values, zorder="max") | 161 | ax.bar3d(xpos.ravel(), ypos.ravel(), 0.0, 0.09, 0.09, hist_cut.ravel(), color=color_values, zorder="max") |
158 | ax.set_xlabel(xlabel) | 162 | ax.set_xlabel(xlabel) |
159 | ax.set_ylabel(ylabel) | 163 | ax.set_ylabel(ylabel) |
160 | - fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}_{res}A.png") | 164 | + fig.savefig(runDir + f"/results/figures/wadley_plots/wadley_hist_{angle}_{l}_{res}A.png") |
161 | if show: | 165 | if show: |
162 | fig.show() | 166 | fig.show() |
163 | plt.close() | 167 | plt.close() |
... | @@ -168,7 +172,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0): | ... | @@ -168,7 +172,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0): |
168 | ax.plot_surface(xx, yy, f_cut, cmap=cm.get_cmap("coolwarm"), linewidth=0, antialiased=True) | 172 | ax.plot_surface(xx, yy, f_cut, cmap=cm.get_cmap("coolwarm"), linewidth=0, antialiased=True) |
169 | ax.set_xlabel(xlabel) | 173 | ax.set_xlabel(xlabel) |
170 | ax.set_ylabel(ylabel) | 174 | ax.set_ylabel(ylabel) |
171 | - fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}_{res}A.png") | 175 | + fig.savefig(runDir + f"/results/figures/wadley_plots/wadley_distrib_{angle}_{l}_{res}A.png") |
172 | if show: | 176 | if show: |
173 | fig.show() | 177 | fig.show() |
174 | plt.close() | 178 | plt.close() |
... | @@ -177,10 +181,10 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0): | ... | @@ -177,10 +181,10 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0): |
177 | fig = plt.figure(figsize=(5,5)) | 181 | fig = plt.figure(figsize=(5,5)) |
178 | ax = fig.gca() | 182 | ax = fig.gca() |
179 | ax.scatter(x, y, s=1, alpha=0.1) | 183 | ax.scatter(x, y, s=1, alpha=0.1) |
180 | - ax.contourf(xx, yy, f_cut, alpha=0.5, cmap=cm.get_cmap("coolwarm"), levels=levels, extend="max") | 184 | + ax.contourf(xx, yy, f, alpha=0.5, cmap=cm.get_cmap("coolwarm"), levels=levels, extend="max") |
181 | ax.set_xlabel(xlabel) | 185 | ax.set_xlabel(xlabel) |
182 | ax.set_ylabel(ylabel) | 186 | ax.set_ylabel(ylabel) |
183 | - fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}_{res}A.png") | 187 | + fig.savefig(runDir + f"/results/figures/wadley_plots/wadley_{angle}_{l}_{res}A.png") |
184 | if show: | 188 | if show: |
185 | fig.show() | 189 | fig.show() |
186 | plt.close() | 190 | plt.close() |
... | @@ -188,10 +192,13 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0): | ... | @@ -188,10 +192,13 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0): |
188 | 192 | ||
189 | def stats_len(): | 193 | def stats_len(): |
190 | """Plots statistics on chain lengths in RNA families. | 194 | """Plots statistics on chain lengths in RNA families. |
195 | + Uses all chains mapped to a family including copies, inferred or not. | ||
191 | 196 | ||
192 | REQUIRES tables chain, nucleotide up to date. | 197 | REQUIRES tables chain, nucleotide up to date. |
193 | """ | 198 | """ |
194 | 199 | ||
200 | + setproctitle(f"RNANet statistics.py stats_len({res_thr})") | ||
201 | + | ||
195 | # Get a worker number to position the progress bar | 202 | # Get a worker number to position the progress bar |
196 | global idxQueue | 203 | global idxQueue |
197 | thr_idx = idxQueue.get() | 204 | thr_idx = idxQueue.get() |
... | @@ -214,7 +221,7 @@ def stats_len(): | ... | @@ -214,7 +221,7 @@ def stats_len(): |
214 | cols = [] | 221 | cols = [] |
215 | lengths = [] | 222 | lengths = [] |
216 | 223 | ||
217 | - for i,f in enumerate(tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Average chain lengths", leave=False)): | 224 | + for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Average chain lengths", leave=False): |
218 | 225 | ||
219 | # Define a color for that family in the plot | 226 | # Define a color for that family in the plot |
220 | if f in LSU_set: | 227 | if f in LSU_set: |
... | @@ -229,7 +236,7 @@ def stats_len(): | ... | @@ -229,7 +236,7 @@ def stats_len(): |
229 | cols.append("grey") | 236 | cols.append("grey") |
230 | 237 | ||
231 | # Get the lengths of chains | 238 | # Get the lengths of chains |
232 | - with sqlite3.connect("results/RNANet.db") as conn: | 239 | + with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
233 | l = [ x[0] for x in sql_ask_database(conn, f"""SELECT COUNT(index_chain) | 240 | l = [ x[0] for x in sql_ask_database(conn, f"""SELECT COUNT(index_chain) |
234 | FROM ( | 241 | FROM ( |
235 | SELECT chain_id | 242 | SELECT chain_id |
... | @@ -239,8 +246,6 @@ def stats_len(): | ... | @@ -239,8 +246,6 @@ def stats_len(): |
239 | GROUP BY chain_id;""", warn_every=0) ] | 246 | GROUP BY chain_id;""", warn_every=0) ] |
240 | lengths.append(l) # list of chain lengths from the family | 247 | lengths.append(l) # list of chain lengths from the family |
241 | 248 | ||
242 | - # notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths") | ||
243 | - | ||
244 | # Plot the figure | 249 | # Plot the figure |
245 | fig = plt.figure(figsize=(10,3)) | 250 | fig = plt.figure(figsize=(10,3)) |
246 | ax = fig.gca() | 251 | ax = fig.gca() |
... | @@ -267,7 +272,7 @@ def stats_len(): | ... | @@ -267,7 +272,7 @@ def stats_len(): |
267 | ncol=1, fontsize='small', bbox_to_anchor=(1.3, 0.5)) | 272 | ncol=1, fontsize='small', bbox_to_anchor=(1.3, 0.5)) |
268 | 273 | ||
269 | # Save the figure | 274 | # Save the figure |
270 | - fig.savefig(f"results/figures/lengths_{res_thr}A.png") | 275 | + fig.savefig(runDir + f"/results/figures/lengths_{res_thr}A.png") |
271 | idxQueue.put(thr_idx) # replace the thread index in the queue | 276 | idxQueue.put(thr_idx) # replace the thread index in the queue |
272 | # notify("Computed sequence length statistics and saved the figure.") | 277 | # notify("Computed sequence length statistics and saved the figure.") |
273 | 278 | ||
... | @@ -285,6 +290,7 @@ def format_percentage(tot, x): | ... | @@ -285,6 +290,7 @@ def format_percentage(tot, x): |
285 | 290 | ||
286 | def stats_freq(): | 291 | def stats_freq(): |
287 | """Computes base frequencies in all RNA families. | 292 | """Computes base frequencies in all RNA families. |
293 | + Uses all chains mapped to a family including copies, inferred or not. | ||
288 | 294 | ||
289 | Outputs results/frequencies.csv | 295 | Outputs results/frequencies.csv |
290 | REQUIRES tables chain, nucleotide up to date.""" | 296 | REQUIRES tables chain, nucleotide up to date.""" |
... | @@ -293,17 +299,18 @@ def stats_freq(): | ... | @@ -293,17 +299,18 @@ def stats_freq(): |
293 | global idxQueue | 299 | global idxQueue |
294 | thr_idx = idxQueue.get() | 300 | thr_idx = idxQueue.get() |
295 | 301 | ||
302 | + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} stats_freq()") | ||
303 | + | ||
296 | # Initialize a Counter object for each family | 304 | # Initialize a Counter object for each family |
297 | freqs = {} | 305 | freqs = {} |
298 | for f in fam_list: | 306 | for f in fam_list: |
299 | freqs[f] = Counter() | 307 | freqs[f] = Counter() |
300 | 308 | ||
301 | # List all nt_names happening within a RNA family and store the counts in the Counter | 309 | # List all nt_names happening within a RNA family and store the counts in the Counter |
302 | - for i,f in enumerate(tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False)): | 310 | + for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False): |
303 | - with sqlite3.connect("results/RNANet.db") as conn: | 311 | + with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
304 | counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;", warn_every=0)) | 312 | counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;", warn_every=0)) |
305 | freqs[f].update(counts) | 313 | freqs[f].update(counts) |
306 | - # notify(f"[{i+1}/{len(fam_list)}] Computed {f} nucleotide frequencies.") | ||
307 | 314 | ||
308 | # Create a pandas DataFrame, and save it to CSV. | 315 | # Create a pandas DataFrame, and save it to CSV. |
309 | df = pd.DataFrame() | 316 | df = pd.DataFrame() |
... | @@ -311,7 +318,7 @@ def stats_freq(): | ... | @@ -311,7 +318,7 @@ def stats_freq(): |
311 | tot = sum(freqs[f].values()) | 318 | tot = sum(freqs[f].values()) |
312 | df = pd.concat([ df, pd.DataFrame([[ format_percentage(tot, x) for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ]) | 319 | df = pd.concat([ df, pd.DataFrame([[ format_percentage(tot, x) for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ]) |
313 | df = df.fillna(0) | 320 | df = df.fillna(0) |
314 | - df.to_csv("results/frequencies.csv") | 321 | + df.to_csv(runDir + "/results/frequencies.csv") |
315 | idxQueue.put(thr_idx) # replace the thread index in the queue | 322 | idxQueue.put(thr_idx) # replace the thread index in the queue |
316 | # notify("Saved nucleotide frequencies to CSV file.") | 323 | # notify("Saved nucleotide frequencies to CSV file.") |
317 | 324 | ||
... | @@ -327,11 +334,13 @@ def parallel_stats_pairs(f): | ... | @@ -327,11 +334,13 @@ def parallel_stats_pairs(f): |
327 | global idxQueue | 334 | global idxQueue |
328 | thr_idx = idxQueue.get() | 335 | thr_idx = idxQueue.get() |
329 | 336 | ||
337 | + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} p_stats_pairs({f})") | ||
338 | + | ||
330 | chain_id_list = mappings_list[f] | 339 | chain_id_list = mappings_list[f] |
331 | data = [] | 340 | data = [] |
332 | sqldata = [] | 341 | sqldata = [] |
333 | for cid in tqdm(chain_id_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} basepair types", leave=False): | 342 | for cid in tqdm(chain_id_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} basepair types", leave=False): |
334 | - with sqlite3.connect("results/RNANet.db") as conn: | 343 | + with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
335 | # Get comma separated lists of basepairs per nucleotide | 344 | # Get comma separated lists of basepairs per nucleotide |
336 | interactions = pd.DataFrame( | 345 | interactions = pd.DataFrame( |
337 | sql_ask_database(conn, | 346 | sql_ask_database(conn, |
... | @@ -398,7 +407,7 @@ def parallel_stats_pairs(f): | ... | @@ -398,7 +407,7 @@ def parallel_stats_pairs(f): |
398 | data.append(expanded_list) | 407 | data.append(expanded_list) |
399 | 408 | ||
400 | # Update the database | 409 | # Update the database |
401 | - with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn: | 410 | + with sqlite3.connect(runDir + "/results/RNANet.db", isolation_level=None) as conn: |
402 | conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query | 411 | conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query |
403 | sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?, | 412 | sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?, |
404 | pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?, | 413 | pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?, |
... | @@ -416,8 +425,8 @@ def parallel_stats_pairs(f): | ... | @@ -416,8 +425,8 @@ def parallel_stats_pairs(f): |
416 | 425 | ||
417 | # Create an output DataFrame | 426 | # Create an output DataFrame |
418 | f_df = pd.DataFrame([[ x for x in cnt.values() ]], columns=list(cnt), index=[f]) | 427 | f_df = pd.DataFrame([[ x for x in cnt.values() ]], columns=list(cnt), index=[f]) |
419 | - f_df.to_csv(f"data/{f}_counts.csv") | 428 | + f_df.to_csv(runDir + f"/data/{f}_counts.csv") |
420 | - expanded_list.to_csv(f"data/{f}_pairs.csv") | 429 | + expanded_list.to_csv(runDir + f"/data/{f}_pairs.csv") |
421 | 430 | ||
422 | idxQueue.put(thr_idx) # replace the thread index in the queue | 431 | idxQueue.put(thr_idx) # replace the thread index in the queue |
423 | 432 | ||
... | @@ -430,28 +439,34 @@ def to_dist_matrix(f): | ... | @@ -430,28 +439,34 @@ def to_dist_matrix(f): |
430 | global idxQueue | 439 | global idxQueue |
431 | thr_idx = idxQueue.get() | 440 | thr_idx = idxQueue.get() |
432 | 441 | ||
433 | - # notify(f"Computing {f} distance matrix from alignment...") | 442 | + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} to_dist_matrix({f})") |
434 | - command = f"esl-alipid --rna --noheader --informat stockholm {f}_3d_only.stk" | ||
435 | 443 | ||
436 | # Prepare a file | 444 | # Prepare a file |
437 | with open(path_to_seq_data+f"/realigned/{f}++.afa") as al_file: | 445 | with open(path_to_seq_data+f"/realigned/{f}++.afa") as al_file: |
438 | al = AlignIO.read(al_file, "fasta") | 446 | al = AlignIO.read(al_file, "fasta") |
439 | names = [ x.id for x in al if '[' in x.id ] | 447 | names = [ x.id for x in al if '[' in x.id ] |
440 | al = al[-len(names):] | 448 | al = al[-len(names):] |
441 | - with open(f + "_3d_only.stk", "w") as only_3d: | 449 | + with open(path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk", "w") as only_3d: |
450 | + try: | ||
442 | only_3d.write(al.format("stockholm")) | 451 | only_3d.write(al.format("stockholm")) |
452 | + except ValueError as e: | ||
453 | + warn(e) | ||
443 | del al | 454 | del al |
455 | + subprocess.run(["esl-reformat", "--informat", "stockholm", "--mingap", "-o", path_to_seq_data+f"/realigned/{f}_3d_only.stk", "stockholm", path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk"]) | ||
444 | 456 | ||
445 | # Prepare the job | 457 | # Prepare the job |
446 | - process = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE) | 458 | + process = subprocess.Popen(shlex.split(f"esl-alipid --rna --noheader --informat stockholm {path_to_seq_data}realigned/{f}_3d_only.stk"), |
459 | + stdout=subprocess.PIPE, stderr=subprocess.PIPE) | ||
447 | id_matrix = np.zeros((len(names), len(names))) | 460 | id_matrix = np.zeros((len(names), len(names))) |
448 | 461 | ||
449 | pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", leave=False) | 462 | pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", leave=False) |
450 | - while process.poll() is None: | 463 | + cnt = 0 |
451 | - output = process.stdout.readline() | 464 | + while not cnt or process.poll() is None: |
465 | + output = process.stdout.read() | ||
452 | if output: | 466 | if output: |
453 | lines = output.strip().split(b'\n') | 467 | lines = output.strip().split(b'\n') |
454 | for l in lines: | 468 | for l in lines: |
469 | + cnt += 1 | ||
455 | line = l.split() | 470 | line = l.split() |
456 | s1 = line[0].decode('utf-8') | 471 | s1 = line[0].decode('utf-8') |
457 | s2 = line[1].decode('utf-8') | 472 | s2 = line[1].decode('utf-8') |
... | @@ -460,9 +475,14 @@ def to_dist_matrix(f): | ... | @@ -460,9 +475,14 @@ def to_dist_matrix(f): |
460 | id2 = names.index(s2) | 475 | id2 = names.index(s2) |
461 | id_matrix[id1, id2] = float(score) | 476 | id_matrix[id1, id2] = float(score) |
462 | pbar.update(1) | 477 | pbar.update(1) |
478 | + if cnt != len(names)*(len(names)-1)*0.5: | ||
479 | + warn(f"{f} got {cnt} updates on {len(names)*(len(names)-1)*0.5}") | ||
480 | + if process.poll() != 0: | ||
481 | + l = process.stderr.read().strip().split(b'\n') | ||
482 | + warn("\n".join([ line.decode('utf-8') for line in l ])) | ||
463 | pbar.close() | 483 | pbar.close() |
464 | 484 | ||
465 | - subprocess.run(["rm", "-f", f + "_3d_only.stk"]) | 485 | + subprocess.run(["rm", "-f", f + "_3d_only_tmp.stk"]) |
466 | np.save("data/"+f+".npy", id_matrix) | 486 | np.save("data/"+f+".npy", id_matrix) |
467 | idxQueue.put(thr_idx) # replace the thread index in the queue | 487 | idxQueue.put(thr_idx) # replace the thread index in the queue |
468 | return 0 | 488 | return 0 |
... | @@ -471,21 +491,26 @@ def seq_idty(): | ... | @@ -471,21 +491,26 @@ def seq_idty(): |
471 | """Computes identity matrices for each of the RNA families. | 491 | """Computes identity matrices for each of the RNA families. |
472 | 492 | ||
473 | REQUIRES temporary results files in data/*.npy | 493 | REQUIRES temporary results files in data/*.npy |
474 | - REQUIRES tables chain, family un to date.""" | 494 | + REQUIRES tables chain, family up to date.""" |
475 | 495 | ||
476 | # load distance matrices | 496 | # load distance matrices |
497 | + fams_to_plot = [ f for f in famlist if f not in ignored ] | ||
477 | fam_arrays = [] | 498 | fam_arrays = [] |
478 | - for f in famlist: | 499 | + for f in fams_to_plot: |
479 | if path.isfile("data/"+f+".npy"): | 500 | if path.isfile("data/"+f+".npy"): |
480 | - fam_arrays.append(np.load("data/"+f+".npy")) | 501 | + fam_arrays.append(np.load("data/"+f+".npy") / 100.0) # normalize percentages in [0,1] |
481 | else: | 502 | else: |
482 | - fam_arrays.append([]) | 503 | + warn("data/"+f+".npy not found !") |
504 | + fam_arrays.append(np.array([])) | ||
483 | 505 | ||
484 | # Update database with identity percentages | 506 | # Update database with identity percentages |
485 | - conn = sqlite3.connect("results/RNANet.db") | 507 | + conn = sqlite3.connect(runDir + "/results/RNANet.db") |
486 | - for f, D in zip(famlist, fam_arrays): | 508 | + for f, D in zip(fams_to_plot, fam_arrays): |
487 | if not len(D): continue | 509 | if not len(D): continue |
488 | - a = 1.0 - np.average(D + D.T) # Get symmetric matrix instead of lower triangle + convert from distance matrix to identity matrix | 510 | + if D.shape[0] > 1: |
511 | + a = np.sum(D) * 2 / D.shape[0] / (D.shape[0] - 1) # SUM(D) / (n(n-1)/2) | ||
512 | + else: | ||
513 | + a = D[0][0] | ||
489 | conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';") | 514 | conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';") |
490 | conn.commit() | 515 | conn.commit() |
491 | conn.close() | 516 | conn.close() |
... | @@ -495,10 +520,11 @@ def seq_idty(): | ... | @@ -495,10 +520,11 @@ def seq_idty(): |
495 | axs = axs.ravel() | 520 | axs = axs.ravel() |
496 | [axi.set_axis_off() for axi in axs] | 521 | [axi.set_axis_off() for axi in axs] |
497 | im = "" # Just to declare the variable, it will be set in the loop | 522 | im = "" # Just to declare the variable, it will be set in the loop |
498 | - for f, D, ax in zip(famlist, fam_arrays, axs): | 523 | + for f, D, ax in zip(fams_to_plot, fam_arrays, axs): |
499 | - if not len(D): continue | ||
500 | - if D.shape[0] > 2: # Cluster only if there is more than 2 sequences to organize | ||
501 | D = D + D.T # Copy the lower triangle to upper, to get a symetrical matrix | 524 | D = D + D.T # Copy the lower triangle to upper, to get a symetrical matrix |
525 | + if D.shape[0] > 2: # Cluster only if there is more than 2 sequences to organize | ||
526 | + D = 1.0 - D | ||
527 | + np.fill_diagonal(D, 0.0) | ||
502 | condensedD = squareform(D) | 528 | condensedD = squareform(D) |
503 | 529 | ||
504 | # Compute basic dendrogram by Ward's method | 530 | # Compute basic dendrogram by Ward's method |
... | @@ -507,15 +533,20 @@ def seq_idty(): | ... | @@ -507,15 +533,20 @@ def seq_idty(): |
507 | 533 | ||
508 | # Reorganize rows and cols | 534 | # Reorganize rows and cols |
509 | idx1 = Z['leaves'] | 535 | idx1 = Z['leaves'] |
510 | - D = D[idx1,:] | 536 | + D = D[idx1[::-1],:] |
511 | D = D[:,idx1[::-1]] | 537 | D = D[:,idx1[::-1]] |
512 | - im = ax.matshow(1.0 - D, vmin=0, vmax=1, origin='lower') # convert to identity matrix 1 - D from distance matrix D | 538 | + D = 1.0 - D |
513 | - ax.set_title(f + "\n(" + str(len(mappings_list[f]))+ " chains)", fontsize=10) | 539 | + elif D.shape[0] == 2: |
540 | + np.fill_diagonal(D, 1.0) # the diagonal has been ignored until now | ||
541 | + ax.text(np.floor(D.shape[0]/2.0)-(0.5 if not D.shape[0]%2 else 0), -0.5, f + "\n(" + str(D.shape[0]) + " chains)", | ||
542 | + fontsize=9, horizontalalignment = 'center', verticalalignment='bottom') | ||
543 | + im = ax.matshow(D, vmin=0, vmax=1) | ||
544 | + | ||
514 | fig.tight_layout() | 545 | fig.tight_layout() |
515 | - fig.subplots_adjust(wspace=0.1, hspace=0.3) | 546 | + fig.subplots_adjust(hspace=0.3, wspace=0.1) |
516 | - fig.colorbar(im, ax=axs[-1], shrink=0.8) | 547 | + fig.colorbar(im, ax=axs[-4], shrink=0.8) |
517 | - fig.savefig(f"results/figures/distances.png") | 548 | + fig.savefig(runDir + f"/results/figures/distances.png") |
518 | - notify("Computed all identity matrices and saved the figure.") | 549 | + print("> Computed all identity matrices and saved the figure.", flush=True) |
519 | 550 | ||
520 | def stats_pairs(): | 551 | def stats_pairs(): |
521 | """Counts occurrences of intra-chain base-pair types in RNA families | 552 | """Counts occurrences of intra-chain base-pair types in RNA families |
... | @@ -523,6 +554,8 @@ def stats_pairs(): | ... | @@ -523,6 +554,8 @@ def stats_pairs(): |
523 | Creates a temporary results file in data/pair_counts.csv, and a results file in results/pairings.csv. | 554 | Creates a temporary results file in data/pair_counts.csv, and a results file in results/pairings.csv. |
524 | REQUIRES tables chain, nucleotide up-to-date.""" | 555 | REQUIRES tables chain, nucleotide up-to-date.""" |
525 | 556 | ||
557 | + setproctitle(f"RNANet statistics.py stats_pairs()") | ||
558 | + | ||
526 | def line_format(family_data): | 559 | def line_format(family_data): |
527 | return family_data.apply(partial(format_percentage, sum(family_data))) | 560 | return family_data.apply(partial(format_percentage, sum(family_data))) |
528 | 561 | ||
... | @@ -530,12 +563,12 @@ def stats_pairs(): | ... | @@ -530,12 +563,12 @@ def stats_pairs(): |
530 | results = [] | 563 | results = [] |
531 | allpairs = [] | 564 | allpairs = [] |
532 | for f in fam_list: | 565 | for f in fam_list: |
533 | - newpairs = pd.read_csv(f"data/{f}_pairs.csv", index_col=0) | 566 | + newpairs = pd.read_csv(runDir + f"/data/{f}_pairs.csv", index_col=0) |
534 | - fam_df = pd.read_csv(f"data/{f}_counts.csv", index_col=0) | 567 | + fam_df = pd.read_csv(runDir + f"/data/{f}_counts.csv", index_col=0) |
535 | results.append(fam_df) | 568 | results.append(fam_df) |
536 | allpairs.append(newpairs) | 569 | allpairs.append(newpairs) |
537 | - subprocess.run(["rm", "-f", f"data/{f}_pairs.csv"]) | 570 | + subprocess.run(["rm", "-f", runDir + f"/data/{f}_pairs.csv"]) |
538 | - subprocess.run(["rm", "-f", f"data/{f}_counts.csv"]) | 571 | + subprocess.run(["rm", "-f", runDir + f"/data/{f}_counts.csv"]) |
539 | all_pairs = pd.concat(allpairs) | 572 | all_pairs = pd.concat(allpairs) |
540 | df = pd.concat(results).fillna(0) | 573 | df = pd.concat(results).fillna(0) |
541 | df.to_csv("data/pair_counts.csv") | 574 | df.to_csv("data/pair_counts.csv") |
... | @@ -573,14 +606,14 @@ def stats_pairs(): | ... | @@ -573,14 +606,14 @@ def stats_pairs(): |
573 | crosstab = crosstab[["AU", "GC", "Wobble", "Other"]] | 606 | crosstab = crosstab[["AU", "GC", "Wobble", "Other"]] |
574 | 607 | ||
575 | # Save to CSV | 608 | # Save to CSV |
576 | - df.to_csv("results/pair_types.csv") | 609 | + df.to_csv(runDir + "/results/pair_types.csv") |
577 | 610 | ||
578 | # Plot barplot of overall types | 611 | # Plot barplot of overall types |
579 | ax = crosstab.plot(figsize=(8,5), kind='bar', stacked=True, log=False, fontsize=13) | 612 | ax = crosstab.plot(figsize=(8,5), kind='bar', stacked=True, log=False, fontsize=13) |
580 | ax.set_ylabel("Number of observations (millions)", fontsize=13) | 613 | ax.set_ylabel("Number of observations (millions)", fontsize=13) |
581 | ax.set_xlabel(None) | 614 | ax.set_xlabel(None) |
582 | plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99) | 615 | plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99) |
583 | - plt.savefig("results/figures/pairings.png") | 616 | + plt.savefig(runDir + "/results/figures/pairings.png") |
584 | 617 | ||
585 | notify("Computed nucleotide statistics and saved CSV and PNG file.") | 618 | notify("Computed nucleotide statistics and saved CSV and PNG file.") |
586 | 619 | ||
... | @@ -589,7 +622,9 @@ def per_chain_stats(): | ... | @@ -589,7 +622,9 @@ def per_chain_stats(): |
589 | 622 | ||
590 | REQUIRES tables chain, nucleotide up to date. """ | 623 | REQUIRES tables chain, nucleotide up to date. """ |
591 | 624 | ||
592 | - with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn: | 625 | + setproctitle(f"RNANet statistics.py per_chain_stats()") |
626 | + | ||
627 | + with sqlite3.connect(runDir + "/results/RNANet.db", isolation_level=None) as conn: | ||
593 | # Compute per-chain nucleotide frequencies | 628 | # Compute per-chain nucleotide frequencies |
594 | df = pd.read_sql("SELECT SUM(is_A) as A, SUM(is_C) AS C, SUM(is_G) AS G, SUM(is_U) AS U, SUM(is_other) AS O, chain_id FROM nucleotide GROUP BY chain_id;", conn) | 629 | df = pd.read_sql("SELECT SUM(is_A) as A, SUM(is_C) AS C, SUM(is_G) AS G, SUM(is_U) AS U, SUM(is_other) AS O, chain_id FROM nucleotide GROUP BY chain_id;", conn) |
595 | df["total"] = pd.Series(df.A + df.C + df.G + df.U + df.O, dtype=np.float64) | 630 | df["total"] = pd.Series(df.A + df.C + df.G + df.U + df.O, dtype=np.float64) |
... | @@ -600,25 +635,36 @@ def per_chain_stats(): | ... | @@ -600,25 +635,36 @@ def per_chain_stats(): |
600 | conn.execute('pragma journal_mode=wal') | 635 | conn.execute('pragma journal_mode=wal') |
601 | sql_execute(conn, "UPDATE chain SET chain_freq_A = ?, chain_freq_C = ?, chain_freq_G = ?, chain_freq_U = ?, chain_freq_other = ? WHERE chain_id= ?;", | 636 | sql_execute(conn, "UPDATE chain SET chain_freq_A = ?, chain_freq_C = ?, chain_freq_G = ?, chain_freq_U = ?, chain_freq_other = ? WHERE chain_id= ?;", |
602 | many=True, data=list(df.to_records(index=False)), warn_every=10) | 637 | many=True, data=list(df.to_records(index=False)), warn_every=10) |
603 | - notify("Updated the database with per-chain base frequencies") | 638 | + print("> Updated the database with per-chain base frequencies", flush=True) |
604 | 639 | ||
605 | def general_stats(): | 640 | def general_stats(): |
606 | """ | 641 | """ |
607 | Number of structures as function of the resolution threshold | 642 | Number of structures as function of the resolution threshold |
608 | Number of Rfam families as function of the resolution threshold | 643 | Number of Rfam families as function of the resolution threshold |
609 | """ | 644 | """ |
610 | - with sqlite3.connect("results/RNANet.db") as conn: | 645 | + |
611 | - df_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution | 646 | + setproctitle(f"RNANet statistics.py general_stats()") |
647 | + | ||
648 | + reqs = [ | ||
649 | + # unique unmapped chains with no issues | ||
650 | + """ SELECT distinct pdb_id, chain_name, exp_method, resolution | ||
612 | FROM chain JOIN structure ON chain.structure_id = structure.pdb_id | 651 | FROM chain JOIN structure ON chain.structure_id = structure.pdb_id |
613 | - WHERE rfam_acc = 'unmappd' AND ISSUE=0;""", conn) | 652 | + WHERE rfam_acc = 'unmappd' AND ISSUE=0;""", |
614 | - df_mapped_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution | 653 | + |
654 | + # unique mapped chains with no issues | ||
655 | + """ SELECT distinct pdb_id, chain_name, exp_method, resolution | ||
615 | FROM chain JOIN structure ON chain.structure_id = structure.pdb_id | 656 | FROM chain JOIN structure ON chain.structure_id = structure.pdb_id |
616 | - WHERE rfam_acc != 'unmappd' AND ISSUE=0;""", conn) | 657 | + WHERE rfam_acc != 'unmappd' AND ISSUE=0;""", |
617 | - df_mapped_copies = pd.read_sql(f"""SELECT pdb_id, chain_name, inferred, rfam_acc, pdb_start, pdb_end, exp_method, resolution | 658 | + |
659 | + # mapped chains with no issues | ||
660 | + """ SELECT pdb_id, chain_name, inferred, rfam_acc, pdb_start, pdb_end, exp_method, resolution | ||
618 | FROM chain JOIN structure ON chain.structure_id = structure.pdb_id | 661 | FROM chain JOIN structure ON chain.structure_id = structure.pdb_id |
619 | - WHERE rfam_acc != 'unmappd' AND ISSUE=0;""", conn) | 662 | + WHERE rfam_acc != 'unmappd' AND ISSUE=0;""", |
620 | - df_inferred_only_unique = pd.read_sql(f"""SELECT DISTINCT pdb_id, c.chain_name, exp_method, resolution | 663 | + |
621 | - FROM (SELECT inferred, rfam_acc, pdb_start, pdb_end, chain.structure_id, chain.chain_name, r.redundancy, r.inf_redundancy | 664 | + # mapped chains with no issues that are all inferred |
665 | + """ SELECT DISTINCT pdb_id, c.chain_name, exp_method, resolution | ||
666 | + FROM ( | ||
667 | + SELECT inferred, rfam_acc, pdb_start, pdb_end, chain.structure_id, chain.chain_name, r.redundancy, r.inf_redundancy | ||
622 | FROM chain | 668 | FROM chain |
623 | JOIN (SELECT structure_id, chain_name, COUNT(distinct rfam_acc) AS redundancy, SUM(inferred) AS inf_redundancy | 669 | JOIN (SELECT structure_id, chain_name, COUNT(distinct rfam_acc) AS redundancy, SUM(inferred) AS inf_redundancy |
624 | FROM chain | 670 | FROM chain |
... | @@ -627,8 +673,105 @@ def general_stats(): | ... | @@ -627,8 +673,105 @@ def general_stats(): |
627 | ) AS r ON chain.structure_id=r.structure_id AND chain.chain_name = r.chain_name | 673 | ) AS r ON chain.structure_id=r.structure_id AND chain.chain_name = r.chain_name |
628 | WHERE r.redundancy=r.inf_redundancy AND rfam_acc != 'unmappd' and issue=0 | 674 | WHERE r.redundancy=r.inf_redundancy AND rfam_acc != 'unmappd' and issue=0 |
629 | ) AS c | 675 | ) AS c |
630 | - JOIN structure ON c.structure_id=structure.pdb_id;""", conn) | 676 | + JOIN structure ON c.structure_id=structure.pdb_id;""", |
631 | - print("> found", len(df_inferred_only_unique.index), "chains which are mapped only by inference using BGSU NR Lists.") | 677 | + |
678 | + # Number of mapped chains (not inferred) | ||
679 | + """SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 0);""", | ||
680 | + | ||
681 | + # Number of unique mapped chains (not inferred) | ||
682 | + """SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 0);""", | ||
683 | + | ||
684 | + # Number of mapped chains (inferred) | ||
685 | + """SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 1);""", | ||
686 | + | ||
687 | + # Number of unique mapped chains (inferred) | ||
688 | + """SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 1);""", | ||
689 | + | ||
690 | + # Number of mapped chains inferred once | ||
691 | + """SELECT count(*) FROM ( | ||
692 | + SELECT structure_id, chain_name, COUNT(DISTINCT rfam_acc) as c | ||
693 | + FROM chain where rfam_acc!='unmappd' and inferred=1 | ||
694 | + GROUP BY structure_id, chain_name | ||
695 | + ) WHERE c=1;""", | ||
696 | + | ||
697 | + # Number of mapped chains inferred twice | ||
698 | + """select count(*) from ( | ||
699 | + select structure_id, chain_name, count(distinct rfam_acc) as c | ||
700 | + from chain where rfam_acc!='unmappd' and inferred=1 | ||
701 | + group by structure_id, chain_name | ||
702 | + ) where c=2;""", | ||
703 | + | ||
704 | + # Number of mapped chains inferred 3 times or more | ||
705 | + """select count(*) from ( | ||
706 | + select structure_id, chain_name, count(distinct rfam_acc) as c | ||
707 | + from chain where rfam_acc!='unmappd' and inferred=1 | ||
708 | + group by structure_id, chain_name | ||
709 | + ) where c>2;""", | ||
710 | + | ||
711 | + # Number of chains both mapped with and without inferrence | ||
712 | + """ SELECT COUNT(*) FROM ( | ||
713 | + SELECT structure_id, chain_name, sum(inferred) AS s, COUNT(rfam_acc) AS c | ||
714 | + FROM chain | ||
715 | + WHERE rfam_acc!='unmappd' | ||
716 | + GROUP BY structure_id, chain_name | ||
717 | + ) | ||
718 | + WHERE s < c AND s > 0;""", | ||
719 | + | ||
720 | + # Number of mapped chains (total) | ||
721 | + """SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd');""", | ||
722 | + | ||
723 | + # Number of unique mapped chains | ||
724 | + """SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd');""", | ||
725 | + | ||
726 | + # Number of unmapped chains | ||
727 | + """SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc = 'unmappd');""", | ||
728 | + | ||
729 | + # Number of mapped chains without issues (not inferred) | ||
730 | + """SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 0 AND issue = 0);""", | ||
731 | + | ||
732 | + # Number of unique mapped chains without issues (not inferred) | ||
733 | + """SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 0 AND issue = 0);""", | ||
734 | + | ||
735 | + # Number of mapped chains without issues (inferred) | ||
736 | + """SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 1 AND issue=0);""", | ||
737 | + | ||
738 | + # Number of unique mapped chains without issues (inferred) | ||
739 | + """SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 1 AND issue=0);""", | ||
740 | + | ||
741 | + # Number of mapped chains without issues (total) | ||
742 | + """SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND issue=0);""", | ||
743 | + | ||
744 | + # Number of unique mapped chains without issues | ||
745 | + """SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND issue=0);""", | ||
746 | + | ||
747 | + # Number of unmapped chains without issues | ||
748 | + """SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc = 'unmappd' AND issue=0);""" | ||
749 | + ] | ||
750 | + | ||
751 | + answers = [] | ||
752 | + with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | ||
753 | + for r in reqs: | ||
754 | + answers.append(pd.read_sql(r, conn)) | ||
755 | + df_unique = answers[0] | ||
756 | + df_mapped_unique = answers[1] | ||
757 | + df_mapped_copies = answers[2] | ||
758 | + df_inferred_only_unique = answers[3] | ||
759 | + print() | ||
760 | + print("> found", answers[4].iloc[0][0], f"chains ({answers[5].iloc[0][0]} unique chains) that are mapped thanks to Rfam. Removing chains with issues, only {answers[15].iloc[0][0]} ({answers[16].iloc[0][0]} unique)") | ||
761 | + if answers[4].iloc[0][0] != answers[5].iloc[0][0]: | ||
762 | + print("\t> This happens because different parts of the same chain can be mapped to different families.") | ||
763 | + print("> found", answers[6].iloc[0][0], f"chains ({answers[7].iloc[0][0]} unique chains) that are mapped by inferrence. Removing chains with issues, only {answers[17].iloc[0][0]} ({answers[18].iloc[0][0]} unique).") | ||
764 | + print("\t> ", answers[8].iloc[0][0], "chains are mapped only once,") | ||
765 | + print("\t> ", answers[9].iloc[0][0], "are mapped to 2 families,") | ||
766 | + print("\t> ", answers[10].iloc[0][0], "are mapped to 3 or more.") | ||
767 | + print("> Among them,", answers[11].iloc[0][0], "chains are mapped both with families found on Rfam and by inferrence.") | ||
768 | + if answers[11].iloc[0][0]: | ||
769 | + print("\t> this is normal if you used option -f (--full-inference). Otherwise, there might be a problem.") | ||
770 | + print("> TOTAL:", answers[12].iloc[0][0], f"chains ({answers[13].iloc[0][0]} unique chains) mapped to a family. Removing chains with issues, only {answers[19].iloc[0][0]} ({answers[20].iloc[0][0]} unique).") | ||
771 | + print("> TOTAL:", answers[14].iloc[0][0], f"unmapped chains. Removing chains with issues, {answers[21].iloc[0][0]}.") | ||
772 | + if answers[14].iloc[0][0]: | ||
773 | + print("\t> this is normal if you used option --no-homology. Otherwise, there might be a problem.") | ||
774 | + print() | ||
632 | 775 | ||
633 | ########################################## | 776 | ########################################## |
634 | # plot N = f(resolution, exp_method) | 777 | # plot N = f(resolution, exp_method) |
... | @@ -642,7 +785,7 @@ def general_stats(): | ... | @@ -642,7 +785,7 @@ def general_stats(): |
642 | df_inferred_only_unique.sort_values('resolution', inplace=True, ignore_index=True) | 785 | df_inferred_only_unique.sort_values('resolution', inplace=True, ignore_index=True) |
643 | df_mapped_copies.sort_values('resolution', inplace=True, ignore_index=True) | 786 | df_mapped_copies.sort_values('resolution', inplace=True, ignore_index=True) |
644 | max_res = max(df_unique.resolution) | 787 | max_res = max(df_unique.resolution) |
645 | - max_structs = len(df_mapped_copies.index.tolist()) | 788 | + max_structs = max(len(df_mapped_copies.index), len(df_unique.index)) |
646 | colors = np.linspace(0,1,1+len(methods)) | 789 | colors = np.linspace(0,1,1+len(methods)) |
647 | plt.xticks( np.arange(0, max_res+2, 2.0).tolist(), np.arange(0, max_res+2, 2.0).tolist() ) | 790 | plt.xticks( np.arange(0, max_res+2, 2.0).tolist(), np.arange(0, max_res+2, 2.0).tolist() ) |
648 | 791 | ||
... | @@ -654,7 +797,7 @@ def general_stats(): | ... | @@ -654,7 +797,7 @@ def general_stats(): |
654 | axs[0][0].set_ylabel("ALL", fontsize=14) | 797 | axs[0][0].set_ylabel("ALL", fontsize=14) |
655 | axs[0][0].set_title("Number of unique RNA chains", fontsize=14) | 798 | axs[0][0].set_title("Number of unique RNA chains", fontsize=14) |
656 | axs[0][0].set_ylim((0, max_structs * 1.05)) | 799 | axs[0][0].set_ylim((0, max_structs * 1.05)) |
657 | - axs[0][0].legend(loc="best", fontsize=14) | 800 | + axs[0][0].legend(loc="lower right", fontsize=14) |
658 | 801 | ||
659 | axs[0][1].grid(axis='y', ls='dotted', lw=1) | 802 | axs[0][1].grid(axis='y', ls='dotted', lw=1) |
660 | axs[0][1].set_yticklabels([]) | 803 | axs[0][1].set_yticklabels([]) |
... | @@ -663,9 +806,9 @@ def general_stats(): | ... | @@ -663,9 +806,9 @@ def general_stats(): |
663 | axs[0][1].hist(df_inferred_only_unique.resolution, bins=np.arange(0, max_res, 0.5), fc=(0.2, 0, colors[0], 0.5), cumulative=True, label='only by inference') | 806 | axs[0][1].hist(df_inferred_only_unique.resolution, bins=np.arange(0, max_res, 0.5), fc=(0.2, 0, colors[0], 0.5), cumulative=True, label='only by inference') |
664 | axs[0][1].text(0.95*max_res, 0.95*len(df_mapped_unique.resolution), "%d " % len(df_mapped_unique.resolution), | 807 | axs[0][1].text(0.95*max_res, 0.95*len(df_mapped_unique.resolution), "%d " % len(df_mapped_unique.resolution), |
665 | horizontalalignment='right', verticalalignment='top', fontsize=14) | 808 | horizontalalignment='right', verticalalignment='top', fontsize=14) |
666 | - axs[0][1].set_title("Number of unique RNA chains\nmapped to $\geq 1$ family", fontsize=14) | 809 | + axs[0][1].set_title(r"Number of unique RNA chains\nmapped to $\geq 1$ family", fontsize=14) |
667 | axs[0][1].set_ylim((0, max_structs * 1.05)) | 810 | axs[0][1].set_ylim((0, max_structs * 1.05)) |
668 | - axs[0][1].legend(loc="best", fontsize=14) | 811 | + axs[0][1].legend(loc="upper left", fontsize=14) |
669 | 812 | ||
670 | axs[0][2].grid(axis='y', ls='dotted', lw=1) | 813 | axs[0][2].grid(axis='y', ls='dotted', lw=1) |
671 | axs[0][2].set_yticklabels([]) | 814 | axs[0][2].set_yticklabels([]) |
... | @@ -675,7 +818,7 @@ def general_stats(): | ... | @@ -675,7 +818,7 @@ def general_stats(): |
675 | axs[0][2].text(0.95*max_res, 0.95*len(df_mapped_copies.resolution), "%d " % len(df_mapped_copies.resolution), | 818 | axs[0][2].text(0.95*max_res, 0.95*len(df_mapped_copies.resolution), "%d " % len(df_mapped_copies.resolution), |
676 | horizontalalignment='right', verticalalignment='top', fontsize=14) | 819 | horizontalalignment='right', verticalalignment='top', fontsize=14) |
677 | axs[0][2].set_title("Number of RNA chains mapped to a\nfamily (with copies)", fontsize=14) | 820 | axs[0][2].set_title("Number of RNA chains mapped to a\nfamily (with copies)", fontsize=14) |
678 | - axs[0][2].legend(loc="right", fontsize=14) | 821 | + axs[0][2].legend(loc="upper left", fontsize=14) |
679 | axs[0][2].set_ylim((0, max_structs * 1.05)) | 822 | axs[0][2].set_ylim((0, max_structs * 1.05)) |
680 | 823 | ||
681 | for i,m in enumerate(methods): | 824 | for i,m in enumerate(methods): |
... | @@ -683,7 +826,7 @@ def general_stats(): | ... | @@ -683,7 +826,7 @@ def general_stats(): |
683 | df_mapped_unique_m = df_mapped_unique[df_mapped_unique.exp_method == m] | 826 | df_mapped_unique_m = df_mapped_unique[df_mapped_unique.exp_method == m] |
684 | df_inferred_only_unique_m = df_inferred_only_unique[df_inferred_only_unique.exp_method == m] | 827 | df_inferred_only_unique_m = df_inferred_only_unique[df_inferred_only_unique.exp_method == m] |
685 | df_mapped_copies_m = df_mapped_copies[ df_mapped_copies.exp_method == m] | 828 | df_mapped_copies_m = df_mapped_copies[ df_mapped_copies.exp_method == m] |
686 | - max_structs = len(df_mapped_copies_m.resolution.tolist()) | 829 | + max_structs = max(len(df_mapped_copies_m.index), len(df_unique_m.index)) |
687 | print("> found", max_structs, "structures with method", m, flush=True) | 830 | print("> found", max_structs, "structures with method", m, flush=True) |
688 | 831 | ||
689 | axs[1+i][0].grid(axis='y', ls='dotted', lw=1) | 832 | axs[1+i][0].grid(axis='y', ls='dotted', lw=1) |
... | @@ -693,7 +836,7 @@ def general_stats(): | ... | @@ -693,7 +836,7 @@ def general_stats(): |
693 | horizontalalignment='right', verticalalignment='top', fontsize=14) | 836 | horizontalalignment='right', verticalalignment='top', fontsize=14) |
694 | axs[1+i][0].set_ylim((0, max_structs * 1.05)) | 837 | axs[1+i][0].set_ylim((0, max_structs * 1.05)) |
695 | axs[1+i][0].set_ylabel(m, fontsize=14) | 838 | axs[1+i][0].set_ylabel(m, fontsize=14) |
696 | - axs[1+i][0].legend(loc="best", fontsize=14) | 839 | + axs[1+i][0].legend(loc="lower right", fontsize=14) |
697 | 840 | ||
698 | axs[1+i][1].grid(axis='y', ls='dotted', lw=1) | 841 | axs[1+i][1].grid(axis='y', ls='dotted', lw=1) |
699 | axs[1+i][1].set_yticklabels([]) | 842 | axs[1+i][1].set_yticklabels([]) |
... | @@ -703,7 +846,7 @@ def general_stats(): | ... | @@ -703,7 +846,7 @@ def general_stats(): |
703 | axs[1+i][1].text(0.95*max_res, 0.95*len(df_mapped_unique_m.resolution), "%d " % len(df_mapped_unique_m.resolution), | 846 | axs[1+i][1].text(0.95*max_res, 0.95*len(df_mapped_unique_m.resolution), "%d " % len(df_mapped_unique_m.resolution), |
704 | horizontalalignment='right', verticalalignment='top', fontsize=14) | 847 | horizontalalignment='right', verticalalignment='top', fontsize=14) |
705 | axs[1+i][1].set_ylim((0, max_structs * 1.05)) | 848 | axs[1+i][1].set_ylim((0, max_structs * 1.05)) |
706 | - axs[1+i][1].legend(loc="best", fontsize=14) | 849 | + axs[1+i][1].legend(loc="upper left", fontsize=14) |
707 | 850 | ||
708 | axs[1+i][2].grid(axis='y', ls='dotted', lw=1) | 851 | axs[1+i][2].grid(axis='y', ls='dotted', lw=1) |
709 | axs[1+i][2].set_yticklabels([]) | 852 | axs[1+i][2].set_yticklabels([]) |
... | @@ -713,7 +856,7 @@ def general_stats(): | ... | @@ -713,7 +856,7 @@ def general_stats(): |
713 | axs[1+i][2].text(0.95*max_res, 0.95*len(df_mapped_copies_m.resolution), "%d " % len(df_mapped_copies_m.resolution), | 856 | axs[1+i][2].text(0.95*max_res, 0.95*len(df_mapped_copies_m.resolution), "%d " % len(df_mapped_copies_m.resolution), |
714 | horizontalalignment='right', verticalalignment='top', fontsize=14) | 857 | horizontalalignment='right', verticalalignment='top', fontsize=14) |
715 | axs[1+i][2].set_ylim((0, max_structs * 1.05)) | 858 | axs[1+i][2].set_ylim((0, max_structs * 1.05)) |
716 | - axs[1+i][2].legend(loc="right", fontsize=14) | 859 | + axs[1+i][2].legend(loc="upper left", fontsize=14) |
717 | 860 | ||
718 | axs[-1][0].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14) | 861 | axs[-1][0].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14) |
719 | axs[-1][1].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14) | 862 | axs[-1][1].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14) |
... | @@ -722,7 +865,7 @@ def general_stats(): | ... | @@ -722,7 +865,7 @@ def general_stats(): |
722 | fig.suptitle("Number of RNA chains by experimental method and resolution", fontsize=16) | 865 | fig.suptitle("Number of RNA chains by experimental method and resolution", fontsize=16) |
723 | fig.subplots_adjust(left=0.07, right=0.98, wspace=0.05, | 866 | fig.subplots_adjust(left=0.07, right=0.98, wspace=0.05, |
724 | hspace=0.05, bottom=0.05, top=0.92) | 867 | hspace=0.05, bottom=0.05, top=0.92) |
725 | - fig.savefig("results/figures/resolutions.png") | 868 | + fig.savefig(runDir + "/results/figures/resolutions.png") |
726 | plt.close() | 869 | plt.close() |
727 | 870 | ||
728 | ########################################## | 871 | ########################################## |
... | @@ -765,7 +908,7 @@ def general_stats(): | ... | @@ -765,7 +908,7 @@ def general_stats(): |
765 | fig.suptitle("Number of RNA families used by experimental method and resolution", fontsize=16) | 908 | fig.suptitle("Number of RNA families used by experimental method and resolution", fontsize=16) |
766 | fig.subplots_adjust(left=0.05, right=0.98, wspace=0.05, | 909 | fig.subplots_adjust(left=0.05, right=0.98, wspace=0.05, |
767 | hspace=0.05, bottom=0.12, top=0.84) | 910 | hspace=0.05, bottom=0.12, top=0.84) |
768 | - fig.savefig("results/figures/Nfamilies.png") | 911 | + fig.savefig(runDir + "/results/figures/Nfamilies.png") |
769 | plt.close() | 912 | plt.close() |
770 | 913 | ||
771 | def log_to_pbar(pbar): | 914 | def log_to_pbar(pbar): |
... | @@ -776,8 +919,10 @@ def log_to_pbar(pbar): | ... | @@ -776,8 +919,10 @@ def log_to_pbar(pbar): |
776 | if __name__ == "__main__": | 919 | if __name__ == "__main__": |
777 | 920 | ||
778 | # parse options | 921 | # parse options |
922 | + DELETE_OLD_DATA = False | ||
923 | + DO_WADLEY_ANALYSIS = False | ||
779 | try: | 924 | try: |
780 | - opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "resolution=", "3d-folder=", "seq-folder=" ]) | 925 | + opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "from-scratch", "wadley", "resolution=", "3d-folder=", "seq-folder=" ]) |
781 | except getopt.GetoptError as err: | 926 | except getopt.GetoptError as err: |
782 | print(err) | 927 | print(err) |
783 | sys.exit(2) | 928 | sys.exit(2) |
... | @@ -795,6 +940,7 @@ if __name__ == "__main__": | ... | @@ -795,6 +940,7 @@ if __name__ == "__main__": |
795 | "\n\t\t\t\t\tdatapoints/\t\tFinal results in CSV file format.") | 940 | "\n\t\t\t\t\tdatapoints/\t\tFinal results in CSV file format.") |
796 | print("--seq-folder=…\t\t\tPath to a folder containing the sequence and alignment files. Required subfolder:" | 941 | print("--seq-folder=…\t\t\tPath to a folder containing the sequence and alignment files. Required subfolder:" |
797 | "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family") | 942 | "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family") |
943 | + print("--from-scratch\t\t\tDo not use precomputed results from past runs, recompute everything") | ||
798 | sys.exit() | 944 | sys.exit() |
799 | elif opt == '--version': | 945 | elif opt == '--version': |
800 | print("RNANet statistics 1.1 beta") | 946 | print("RNANet statistics 1.1 beta") |
... | @@ -810,25 +956,37 @@ if __name__ == "__main__": | ... | @@ -810,25 +956,37 @@ if __name__ == "__main__": |
810 | path_to_seq_data = path.abspath(arg) | 956 | path_to_seq_data = path.abspath(arg) |
811 | if path_to_seq_data[-1] != '/': | 957 | if path_to_seq_data[-1] != '/': |
812 | path_to_seq_data += '/' | 958 | path_to_seq_data += '/' |
959 | + elif opt=='--from-scratch': | ||
960 | + DELETE_OLD_DATA = True | ||
961 | + DO_WADLEY_ANALYSIS = True | ||
962 | + subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"]) | ||
963 | + elif opt=='--wadley': | ||
964 | + DO_WADLEY_ANALYSIS = True | ||
813 | 965 | ||
814 | 966 | ||
815 | # Load mappings | 967 | # Load mappings |
816 | print("Loading mappings list...") | 968 | print("Loading mappings list...") |
817 | - with sqlite3.connect("results/RNANet.db") as conn: | 969 | + with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
818 | fam_list = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;") ] | 970 | fam_list = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;") ] |
819 | mappings_list = {} | 971 | mappings_list = {} |
820 | for k in fam_list: | 972 | for k in fam_list: |
821 | - mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain WHERE rfam_acc='{k}' and issue=0;") ] | 973 | + mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain JOIN structure ON chain.structure_id=structure.pdb_id WHERE rfam_acc='{k}' AND issue=0 AND resolution <= {res_thr};") ] |
822 | 974 | ||
823 | # List the families for which we will compute sequence identity matrices | 975 | # List the families for which we will compute sequence identity matrices |
824 | - with sqlite3.connect("results/RNANet.db") as conn: | 976 | + with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
825 | - famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 0 ORDER BY rfam_acc ASC;") ] | 977 | + famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain WHERE issue = 0 GROUP BY rfam_acc) WHERE n_chains > 0 ORDER BY rfam_acc ASC;") ] |
826 | - ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;") ] | 978 | + ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain WHERE issue = 0 GROUP BY rfam_acc) WHERE n_chains < 3 ORDER BY rfam_acc ASC;") ] |
979 | + n_unmapped_chains = sql_ask_database(conn, "SELECT COUNT(*) FROM chain WHERE rfam_acc='unmappd' AND issue=0;")[0][0] | ||
827 | if len(ignored): | 980 | if len(ignored): |
828 | print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n') | 981 | print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n') |
829 | 982 | ||
983 | + if DELETE_OLD_DATA: | ||
984 | + for f in fam_list: | ||
985 | + subprocess.run(["rm","-f", runDir + f"/data/{f}.npy", runDir + f"/data/{f}_pairs.csv", runDir + f"/data/{f}_counts.csv"]) | ||
986 | + | ||
987 | + | ||
830 | # Prepare the multiprocessing execution environment | 988 | # Prepare the multiprocessing execution environment |
831 | - nworkers = max(read_cpu_number()-1, 32) | 989 | + nworkers = min(read_cpu_number()-1, 32) |
832 | thr_idx_mgr = Manager() | 990 | thr_idx_mgr = Manager() |
833 | idxQueue = thr_idx_mgr.Queue() | 991 | idxQueue = thr_idx_mgr.Queue() |
834 | for i in range(nworkers): | 992 | for i in range(nworkers): |
... | @@ -836,14 +994,15 @@ if __name__ == "__main__": | ... | @@ -836,14 +994,15 @@ if __name__ == "__main__": |
836 | 994 | ||
837 | # Define the tasks | 995 | # Define the tasks |
838 | joblist = [] | 996 | joblist = [] |
839 | - # joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), 4.0))) # res threshold is 4.0 Angstroms by default | 997 | + if n_unmapped_chains and DO_WADLEY_ANALYSIS: |
840 | - # joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), 4.0))) # | 998 | + joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), 20.0))) # res threshold is 4.0 Angstroms by default |
999 | + joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), 20.0))) # | ||
841 | joblist.append(Job(function=stats_len)) # Computes figures | 1000 | joblist.append(Job(function=stats_len)) # Computes figures |
842 | - # joblist.append(Job(function=stats_freq)) # updates the database | 1001 | + joblist.append(Job(function=stats_freq)) # updates the database |
843 | - # for f in famlist: | 1002 | + for f in famlist: |
844 | - # joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database | 1003 | + joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database |
845 | - # if f not in ignored: | 1004 | + if f not in ignored: |
846 | - # joblist.append(Job(function=to_dist_matrix, args=(f,))) # updates the database | 1005 | + joblist.append(Job(function=to_dist_matrix, args=(f,))) # updates the database |
847 | 1006 | ||
848 | p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers) | 1007 | p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers) |
849 | pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, leave=True) | 1008 | pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, leave=True) |
... | @@ -867,7 +1026,8 @@ if __name__ == "__main__": | ... | @@ -867,7 +1026,8 @@ if __name__ == "__main__": |
867 | print() | 1026 | print() |
868 | 1027 | ||
869 | # finish the work after the parallel portions | 1028 | # finish the work after the parallel portions |
870 | - # per_chain_stats() | 1029 | + per_chain_stats() |
871 | - # seq_idty() | 1030 | + seq_idty() |
872 | - # stats_pairs() | 1031 | + stats_pairs() |
1032 | + if n_unmapped_chains: | ||
873 | general_stats() | 1033 | general_stats() | ... | ... |
-
Please register or login to post a comment