Showing
5 changed files
with
156 additions
and
129 deletions
... | @@ -30,14 +30,12 @@ To help you design your own SQL requests, we provide a description of the databa | ... | @@ -30,14 +30,12 @@ To help you design your own SQL requests, we provide a description of the databa |
30 | * `rfam_acc`: The family which the chain is mapped to (if not mapped, value is *unmappd*) | 30 | * `rfam_acc`: The family which the chain is mapped to (if not mapped, value is *unmappd*) |
31 | * `pdb_start`: Position in the chain where the mapping to Rfam begins (absolute position, not residue number) | 31 | * `pdb_start`: Position in the chain where the mapping to Rfam begins (absolute position, not residue number) |
32 | * `pdb_end`: Position in the chain where the mapping to Rfam ends (absolute position, not residue number) | 32 | * `pdb_end`: Position in the chain where the mapping to Rfam ends (absolute position, not residue number) |
33 | -* `reversed`: Wether the mapping numbering order differs from the residue numbering order in the mmCIF file (eg 4c9d, chains C and D) | ||
34 | * `issue`: Whether an issue occurred with this structure while downloading, extracting, annotating or parsing the annotation. See the file known_issues_reasons.txt for more information about why your chain is marked as an issue. | 33 | * `issue`: Whether an issue occurred with this structure while downloading, extracting, annotating or parsing the annotation. See the file known_issues_reasons.txt for more information about why your chain is marked as an issue. |
35 | * `inferred`: Whether the mapping has been inferred using the redundancy list (value is 1) or just known from Rfam-PDB mappings (value is 0) | 34 | * `inferred`: Whether the mapping has been inferred using the redundancy list (value is 1) or just known from Rfam-PDB mappings (value is 0) |
36 | * `chain_freq_A`, `chain_freq_C`, `chain_freq_G`, `chain_freq_U`, `chain_freq_other`: Nucleotide frequencies in the chain | 35 | * `chain_freq_A`, `chain_freq_C`, `chain_freq_G`, `chain_freq_U`, `chain_freq_other`: Nucleotide frequencies in the chain |
37 | * `pair_count_cWW`, `pair_count_cWH`, ... `pair_count_tSS`: Counts of the non-canonical base-pair types in the chain (intra-chain counts only) | 36 | * `pair_count_cWW`, `pair_count_cWH`, ... `pair_count_tSS`: Counts of the non-canonical base-pair types in the chain (intra-chain counts only) |
38 | 37 | ||
39 | ## Table `nucleotide`, for individual nucleotide descriptors | 38 | ## Table `nucleotide`, for individual nucleotide descriptors |
40 | -* `nt_id`: A unique identifier | ||
41 | * `chain_id`: The chain the nucleotide belongs to | 39 | * `chain_id`: The chain the nucleotide belongs to |
42 | * `index_chain`: its absolute position within the portion of chain mapped to Rfam, from 1 to X. This is completely uncorrelated to any gene start or 3D chain residue numbers. | 40 | * `index_chain`: its absolute position within the portion of chain mapped to Rfam, from 1 to X. This is completely uncorrelated to any gene start or 3D chain residue numbers. |
43 | * `nt_position`: relative position within the portion of chain mapped to RFam, from 0 to 1 | 41 | * `nt_position`: relative position within the portion of chain mapped to RFam, from 0 to 1 |
... | @@ -51,7 +49,7 @@ To help you design your own SQL requests, we provide a description of the databa | ... | @@ -51,7 +49,7 @@ To help you design your own SQL requests, we provide a description of the databa |
51 | * `nb_interact`: number of interactions with other nucleotides. Up to 3 values. Includes inter-chain interactions. | 49 | * `nb_interact`: number of interactions with other nucleotides. Up to 3 values. Includes inter-chain interactions. |
52 | * `pair_type_LW`: The Leontis-Westhof nomenclature codes of the interactions. The first letter concerns cis/trans orientation, the second this base's side interacting, and the third the other base's side. | 50 | * `pair_type_LW`: The Leontis-Westhof nomenclature codes of the interactions. The first letter concerns cis/trans orientation, the second this base's side interacting, and the third the other base's side. |
53 | * `pair_type_DSSR`: Same but using the DSSR nomenclature (Hoogsteen edge approximately corresponds to Major-groove and Sugar edge to minor-groove) | 51 | * `pair_type_DSSR`: Same but using the DSSR nomenclature (Hoogsteen edge approximately corresponds to Major-groove and Sugar edge to minor-groove) |
54 | -* `alpha`, `beta`, `gamma`, `delta`, `epsilon`, `zeta`: The 6 torsion angles of the RNA backabone for this nucleotide | 52 | +* `alpha`, `beta`, `gamma`, `delta`, `epsilon`, `zeta`: The 6 torsion angles of the RNA backbone for this nucleotide, between 0 and 2pi |
55 | * `epsilon_zeta`: Difference between epsilon and zeta angles | 53 | * `epsilon_zeta`: Difference between epsilon and zeta angles |
56 | * `bb_type`: conformation of the backbone (BI, BII or ..) | 54 | * `bb_type`: conformation of the backbone (BI, BII or ..) |
57 | * `chi`: torsion angle between the sugar and base (O-C1'-N-C4) | 55 | * `chi`: torsion angle between the sugar and base (O-C1'-N-C4) |
... | @@ -69,7 +67,8 @@ To help you design your own SQL requests, we provide a description of the databa | ... | @@ -69,7 +67,8 @@ To help you design your own SQL requests, we provide a description of the databa |
69 | 67 | ||
70 | ## Table `align_column`, for positions in multiple sequence alignments | 68 | ## Table `align_column`, for positions in multiple sequence alignments |
71 | * `rfam_acc`: The family's MSA the column belongs to | 69 | * `rfam_acc`: The family's MSA the column belongs to |
72 | -* `index_ali`: Position of the column in the alignment (starts at 1) | 70 | +* `index_ali`: Position of the column in the wide alignment with Rfam sequences (starts at 1) |
71 | +* `index_small_ali`: Position of the column in the small alignment with only 3D chains (starts at 1) | ||
73 | * `cm_coord`: Position of the column in the Rfam covariance model of the family (starts at 1). The value is NULL in portions that are insertions compared to the model. | 72 | * `cm_coord`: Position of the column in the Rfam covariance model of the family (starts at 1). The value is NULL in portions that are insertions compared to the model. |
74 | * `freq_A`, `freq_C`, `freq_G`, `freq_U`, `freq_other`: Nucleotide frequencies in the alignment at this position | 73 | * `freq_A`, `freq_C`, `freq_G`, `freq_U`, `freq_other`: Nucleotide frequencies in the alignment at this position |
75 | * `gap_percent`: The frequencies of gaps at this position in the alignment (between 0.0 and 1.0) | 74 | * `gap_percent`: The frequencies of gaps at this position in the alignment (between 0.0 and 1.0) |
... | @@ -79,7 +78,6 @@ To help you design your own SQL requests, we provide a description of the databa | ... | @@ -79,7 +78,6 @@ To help you design your own SQL requests, we provide a description of the databa |
79 | There always is an entry, for each family (rfam_acc), with index_ali = 0; gap_percent = 1.0; and nucleotide frequencies set to 0.0. This entry is used when the nucleotide frequencies cannot be determined because of local alignment issues. | 78 | There always is an entry, for each family (rfam_acc), with index_ali = 0; gap_percent = 1.0; and nucleotide frequencies set to 0.0. This entry is used when the nucleotide frequencies cannot be determined because of local alignment issues. |
80 | 79 | ||
81 | ## Table `re_mapping`, to map a nucleotide to an alignment column | 80 | ## Table `re_mapping`, to map a nucleotide to an alignment column |
82 | -* `remapping_id`: A unique identifier | ||
83 | * `chain_id`: The chain which is mapped to an alignment | 81 | * `chain_id`: The chain which is mapped to an alignment |
84 | * `index_chain`: The absolute position of the nucleotide in the chain (from 1 to X) | 82 | * `index_chain`: The absolute position of the nucleotide in the chain (from 1 to X) |
85 | * `index_ali`: The position of that nucleotide in its family alignment | 83 | * `index_ali`: The position of that nucleotide in its family alignment | ... | ... |
... | @@ -31,5 +31,5 @@ We first remove the nucleotides whose number is outside the family mapping (if a | ... | @@ -31,5 +31,5 @@ We first remove the nucleotides whose number is outside the family mapping (if a |
31 | 31 | ||
32 | * **What are the versions of the dependencies you use ?** | 32 | * **What are the versions of the dependencies you use ?** |
33 | 33 | ||
34 | -`cmalign` is v1.1.3, `sina` is v1.6.0, `x3dna-dssr` is v1.9.9, Biopython is v1.78. | 34 | +`cmalign` is v1.1.4, `sina` is v1.6.0, `x3dna-dssr` is v1.9.9, Biopython is v1.78. |
35 | 35 | ||
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
... | @@ -57,55 +57,63 @@ nohup bash -c 'time ~/Projects/RNANet/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq | ... | @@ -57,55 +57,63 @@ nohup bash -c 'time ~/Projects/RNANet/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq |
57 | The detailed list of options is below: | 57 | The detailed list of options is below: |
58 | 58 | ||
59 | ``` | 59 | ``` |
60 | --h [ --help ] Print this help message | 60 | +-h [ --help ] Print this help message |
61 | ---version Print the program version | 61 | +--version Print the program version |
62 | 62 | ||
63 | Select what to do: | 63 | Select what to do: |
64 | -------------------------------------------------------------------------------------------------------------- | 64 | -------------------------------------------------------------------------------------------------------------- |
65 | --f [ --full-inference ] Infer new mappings even if Rfam already provides some. Yields more copies of | 65 | +-f [ --full-inference ] Infer new mappings even if Rfam already provides some. Yields more copies of |
66 | - chains mapped to different families. | 66 | + chains mapped to different families. |
67 | --s Run statistics computations after completion | 67 | +-s Run statistics computations after completion |
68 | ---extract Extract the portions of 3D RNA chains to individual mmCIF files. | 68 | +--stats-opts=… Pass additional command line options to the statistics.py script, e.g. "--wadley --distance-matrices" |
69 | ---keep-hetatm=False (True | False) Keep ions, waters and ligands in produced mmCIF files. | 69 | +--extract Extract the portions of 3D RNA chains to individual mmCIF files. |
70 | - Does not affect the descriptors. | 70 | +--keep-hetatm=False (True | False) Keep ions, waters and ligands in produced mmCIF files. |
71 | ---no-homology Do not try to compute PSSMs and do not align sequences. | 71 | + Does not affect the descriptors. |
72 | - Allows to yield more 3D data (consider chains without a Rfam mapping). | 72 | +--no-homology Do not try to compute PSSMs and do not align sequences. |
73 | + Allows to yield more 3D data (consider chains without a Rfam mapping). | ||
73 | 74 | ||
74 | Select how to do it: | 75 | Select how to do it: |
75 | -------------------------------------------------------------------------------------------------------------- | 76 | -------------------------------------------------------------------------------------------------------------- |
76 | ---3d-folder=… Path to a folder to store the 3D data files. Subfolders will contain: | 77 | +--3d-folder=… Path to a folder to store the 3D data files. Subfolders will contain: |
77 | - RNAcifs/ Full structures containing RNA, in mmCIF format | 78 | + RNAcifs/ Full structures containing RNA, in mmCIF format |
78 | - rna_mapped_to_Rfam/ Extracted 'pure' RNA chains | 79 | + rna_mapped_to_Rfam/ Extracted 'pure' portions of RNA chains mapped to families |
79 | - datapoints/ Final results in CSV file format. | 80 | + rna_only/ Extracted 'pure' RNA chains, not truncated |
80 | ---seq-folder=… Path to a folder to store the sequence and alignment files. Subfolders will be: | 81 | + datapoints/ Final results in CSV file format. |
81 | - rfam_sequences/fasta/ Compressed hits to Rfam families | 82 | +--seq-folder=… Path to a folder to store the sequence and alignment files. Subfolders will be: |
82 | - realigned/ Sequences, covariance models, and alignments by family | 83 | + rfam_sequences/fasta/ Compressed hits to Rfam families |
83 | ---maxcores=… Limit the number of cores to use in parallel portions to reduce the simultaneous | 84 | + realigned/ Sequences, covariance models, and alignments by family |
84 | - need of RAM. Should be a number between 1 and your number of CPUs. Note that portions | 85 | +--sina Align large subunit LSU and small subunit SSU ribosomal RNA using SINA instead of Infernal, |
85 | - of the pipeline already limit themselves to 50% or 70% of that number by default. | 86 | + the other RNA families will be aligned using infernal. |
86 | ---archive Create tar.gz archives of the datapoints text files and the alignments, | 87 | +--maxcores=… Limit the number of cores to use in parallel portions to reduce the simultaneous |
87 | - and update the link to the latest archive. | 88 | + need of RAM. Should be a number between 1 and your number of CPUs. Note that portions |
88 | ---no-logs Do not save per-chain logs of the numbering modifications | 89 | + of the pipeline already limit themselves to 50% or 70% of that number by default. |
90 | +--cmalign-opts=… A string of additional options to pass to cmalign aligner, e.g. "--nonbanded --mxsize 2048" | ||
91 | +--archive Create tar.gz archives of the datapoints text files and the alignments, | ||
92 | + and update the link to the latest archive. | ||
93 | +--no-logs Do not save per-chain logs of the numbering modifications. | ||
89 | 94 | ||
90 | Select which data we are interested in: | 95 | Select which data we are interested in: |
91 | -------------------------------------------------------------------------------------------------------------- | 96 | -------------------------------------------------------------------------------------------------------------- |
92 | --r 4.0 [ --resolution=4.0 ] Maximum 3D structure resolution to consider a RNA chain. | 97 | +-r 4.0 [ --resolution=4.0 ] Maximum 3D structure resolution to consider a RNA chain. |
93 | ---all Build chains even if they already are in the database. | 98 | +--all Process chains even if they already are in the database. |
94 | ---only Ask to process a specific chain label only | 99 | +--redundant Process all members of the equivalence classes not only the representative. |
95 | ---ignore-issues Do not ignore already known issues and attempt to compute them | 100 | +--only Ask to process a specific chains only (e.g. 4v49, 4v49_1_AA, or 4v49_1_AA_5-1523). |
96 | ---update-homologous Re-download Rfam and SILVA databases, realign all families, and recompute all CSV files | 101 | +--ignore-issues Do not ignore already known issues and attempt to compute them. |
97 | ---from-scratch Delete database, local 3D and sequence files, and known issues, and recompute. | 102 | +--update-homologous Re-download Rfam and SILVA databases, realign all families, and recompute all CSV files. |
103 | +--from-scratch Delete database, local 3D and sequence files, and known issues, and recompute. | ||
98 | 104 | ||
99 | ``` | 105 | ``` |
100 | Options --3d-folder and --seq-folder are mandatory for command-line installations, but should not be used for installations with Docker. In the Docker container, they are set by default to the paths you provide with the -v options. | 106 | Options --3d-folder and --seq-folder are mandatory for command-line installations, but should not be used for installations with Docker. In the Docker container, they are set by default to the paths you provide with the -v options. |
101 | 107 | ||
102 | The most useful options in that list are | 108 | The most useful options in that list are |
103 | * ` --extract`, to actually produce some re-numbered 3D mmCIF files of the RNA chains individually, | 109 | * ` --extract`, to actually produce some re-numbered 3D mmCIF files of the RNA chains individually, |
104 | -* ` --no-homology`, to ignore the family mapping and sequence alignment parts and only focus on 3D data download and annotation. This would yield more data since many RNAs are not mapped to any Rfam family. | 110 | +* ` --no-homology`, to ignore the family mapping and sequence alignment parts and only focus on 3D data download and annotation. This would yield more data since many RNAs are not mapped to any Rfam family, |
105 | * ` -s`, to run the "statistics" which are a few useful post-computation tasks such as: | 111 | * ` -s`, to run the "statistics" which are a few useful post-computation tasks such as: |
106 | * Computation of sequence identity matrices | 112 | * Computation of sequence identity matrices |
107 | * Statistics over the sequence lengths, nucleotide frequencies, and basepair types by RNA family | 113 | * Statistics over the sequence lengths, nucleotide frequencies, and basepair types by RNA family |
108 | * Overall database content statistics | 114 | * Overall database content statistics |
115 | + * Detailed analysis of the eta-theta pseudotorsion angles (use `--stats-opts "--wadley"` after `-s`) or 3D distance matrices and their averages per family (use `--stats-opts "--distance-matrices"`) | ||
116 | +* ` --redundant`, to yield all the available data and not only the BGSU NR-List representatives | ||
109 | 117 | ||
110 | # Computation time | 118 | # Computation time |
111 | 119 | ... | ... |
... | @@ -285,15 +285,11 @@ class Chain: | ... | @@ -285,15 +285,11 @@ class Chain: |
285 | self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Either there is a problem with {self.pdb_id} mmCIF download, or the bases are not resolved in the structure. Delete it and retry." | 285 | self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Either there is a problem with {self.pdb_id} mmCIF download, or the bases are not resolved in the structure. Delete it and retry." |
286 | return None | 286 | return None |
287 | 287 | ||
288 | - # Remove low pertinence or undocumented descriptors, convert angles values | 288 | + # Remove low pertinence or undocumented descriptors |
289 | cols_we_keep = ["index_chain", "nt_resnum", "nt_name", "nt_code", "nt_id", "dbn", "alpha", "beta", "gamma", "delta", "epsilon", "zeta", | 289 | cols_we_keep = ["index_chain", "nt_resnum", "nt_name", "nt_code", "nt_id", "dbn", "alpha", "beta", "gamma", "delta", "epsilon", "zeta", |
290 | "epsilon_zeta", "bb_type", "chi", "glyco_bond", "form", "ssZp", "Dp", "eta", "theta", "eta_prime", "theta_prime", "eta_base", "theta_base", | 290 | "epsilon_zeta", "bb_type", "chi", "glyco_bond", "form", "ssZp", "Dp", "eta", "theta", "eta_prime", "theta_prime", "eta_base", "theta_base", |
291 | "v0", "v1", "v2", "v3", "v4", "amplitude", "phase_angle", "puckering"] | 291 | "v0", "v1", "v2", "v3", "v4", "amplitude", "phase_angle", "puckering"] |
292 | df = df[cols_we_keep] | 292 | df = df[cols_we_keep] |
293 | - df.loc[:, ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'epsilon_zeta', 'chi', 'v0', 'v1', 'v2', 'v3', 'v4', # Conversion to radians | ||
294 | - 'eta', 'theta', 'eta_prime', 'theta_prime', 'eta_base', 'theta_base', 'phase_angle']] *= np.pi/180.0 | ||
295 | - df.loc[:, ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'epsilon_zeta', 'chi', 'v0', 'v1', 'v2', 'v3', 'v4', # mapping [-pi, pi] into [0, 2pi] | ||
296 | - 'eta', 'theta', 'eta_prime', 'theta_prime', 'eta_base', 'theta_base', 'phase_angle']] %= (2.0*np.pi) | ||
297 | 293 | ||
298 | except KeyError as e: | 294 | except KeyError as e: |
299 | warn(f"Error while parsing DSSR {self.pdb_id}.json output:{e}", error=True) | 295 | warn(f"Error while parsing DSSR {self.pdb_id}.json output:{e}", error=True) |
... | @@ -412,13 +408,12 @@ class Chain: | ... | @@ -412,13 +408,12 @@ class Chain: |
412 | if nt['chain_name'] != self.pdb_chain_id: | 408 | if nt['chain_name'] != self.pdb_chain_id: |
413 | continue | 409 | continue |
414 | if nt['index_chain'] == i + 1 + self.mapping.st: | 410 | if nt['index_chain'] == i + 1 + self.mapping.st: |
415 | - found = nt | 411 | + found = nt # Retrieves old angle values from the JSON ! |
416 | break | 412 | break |
417 | if found: | 413 | if found: |
418 | self.mapping.log(f"Residue {i+1+self.mapping.st}-{self.mapping.st} = {i+1} has been saved and renumbered {df.iloc[i,1]} instead of {found['nt_id'].replace(found['chain_name']+ '.' + found['nt_name'], '').replace('^','')}") | 414 | self.mapping.log(f"Residue {i+1+self.mapping.st}-{self.mapping.st} = {i+1} has been saved and renumbered {df.iloc[i,1]} instead of {found['nt_id'].replace(found['chain_name']+ '.' + found['nt_name'], '').replace('^','')}") |
419 | - df_row = pd.DataFrame([found], index=[i])[ | 415 | + df_row = pd.DataFrame([found], index=[i])[df.columns.values] |
420 | - df.columns.values] | 416 | + df_row.iloc[0, 0] = i+1 # index_chain |
421 | - df_row.iloc[0, 0] = i+1 # index_chain | ||
422 | df_row.iloc[0, 1] = df.iloc[i, 1] # nt_resnum | 417 | df_row.iloc[0, 1] = df.iloc[i, 1] # nt_resnum |
423 | df = pd.concat([df.iloc[:i], df_row, df.iloc[i:]]) | 418 | df = pd.concat([df.iloc[:i], df_row, df.iloc[i:]]) |
424 | df.iloc[i+1:, 1] += 1 | 419 | df.iloc[i+1:, 1] += 1 |
... | @@ -474,6 +469,12 @@ class Chain: | ... | @@ -474,6 +469,12 @@ class Chain: |
474 | # Compute new features | 469 | # Compute new features |
475 | ####################################### | 470 | ####################################### |
476 | 471 | ||
472 | + # Convert angles | ||
473 | + df.loc[:, ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'epsilon_zeta', 'chi', 'v0', 'v1', 'v2', 'v3', 'v4', # Conversion to radians | ||
474 | + 'eta', 'theta', 'eta_prime', 'theta_prime', 'eta_base', 'theta_base', 'phase_angle']] *= np.pi/180.0 | ||
475 | + df.loc[:, ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'epsilon_zeta', 'chi', 'v0', 'v1', 'v2', 'v3', 'v4', # mapping [-pi, pi] into [0, 2pi] | ||
476 | + 'eta', 'theta', 'eta_prime', 'theta_prime', 'eta_base', 'theta_base', 'phase_angle']] %= (2.0*np.pi) | ||
477 | + | ||
477 | # Add a sequence column just for the alignments | 478 | # Add a sequence column just for the alignments |
478 | df['nt_align_code'] = [str(x).upper() | 479 | df['nt_align_code'] = [str(x).upper() |
479 | .replace('NAN', '-') # Unresolved nucleotides are gaps | 480 | .replace('NAN', '-') # Unresolved nucleotides are gaps |
... | @@ -985,7 +986,7 @@ class Pipeline: | ... | @@ -985,7 +986,7 @@ class Pipeline: |
985 | 986 | ||
986 | try: | 987 | try: |
987 | opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=", "seq-folder=", "keep-hetatm=", | 988 | opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=", "seq-folder=", "keep-hetatm=", |
988 | - "only=", "cmalign-opts=", "maxcores=", "sina", "from-scratch", | 989 | + "only=", "cmalign-opts=", "stats-opts=", "maxcores=", "sina", "from-scratch", |
989 | "full-inference", "no-homology", "redundant", "ignore-issues", "extract", | 990 | "full-inference", "no-homology", "redundant", "ignore-issues", "extract", |
990 | "all", "no-logs", "archive", "update-homologous", "version"]) | 991 | "all", "no-logs", "archive", "update-homologous", "version"]) |
991 | except getopt.GetoptError as err: | 992 | except getopt.GetoptError as err: |
... | @@ -1000,7 +1001,7 @@ class Pipeline: | ... | @@ -1000,7 +1001,7 @@ class Pipeline: |
1000 | 1001 | ||
1001 | if opt == "-h" or opt == "--help": | 1002 | if opt == "-h" or opt == "--help": |
1002 | print("RNANet, a script to build a multiscale RNA dataset from public data\n" | 1003 | print("RNANet, a script to build a multiscale RNA dataset from public data\n" |
1003 | - "Developped by Louis Becquey (louis.becquey@univ-evry.fr), 2020") | 1004 | + "Developped by Louis Becquey and Khodor Hannoush, 2019/2021") |
1004 | print() | 1005 | print() |
1005 | print("Options:") | 1006 | print("Options:") |
1006 | print("-h [ --help ]\t\t\tPrint this help message") | 1007 | print("-h [ --help ]\t\t\tPrint this help message") |
... | @@ -1010,8 +1011,8 @@ class Pipeline: | ... | @@ -1010,8 +1011,8 @@ class Pipeline: |
1010 | print("--------------------------------------------------------------------------------------------------------------") | 1011 | print("--------------------------------------------------------------------------------------------------------------") |
1011 | print("-f [ --full-inference ]\t\tInfer new mappings even if Rfam already provides some. Yields more copies of" | 1012 | print("-f [ --full-inference ]\t\tInfer new mappings even if Rfam already provides some. Yields more copies of" |
1012 | "\n\t\t\t\t chains mapped to different families.") | 1013 | "\n\t\t\t\t chains mapped to different families.") |
1013 | - print("--redundant\t\t\t\tStore the class members in the database thoughts to be redundant for predictions.") | ||
1014 | print("-s\t\t\t\tRun statistics computations after completion") | 1014 | print("-s\t\t\t\tRun statistics computations after completion") |
1015 | + print("--stats-opts=…\t\t\tPass additional command line options to the statistics.py script, e.g. \"--wadley --distance-matrices\"") | ||
1015 | print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.") | 1016 | print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.") |
1016 | print("--keep-hetatm=False\t\t(True | False) Keep ions, waters and ligands in produced mmCIF files. " | 1017 | print("--keep-hetatm=False\t\t(True | False) Keep ions, waters and ligands in produced mmCIF files. " |
1017 | "\n\t\t\t\t Does not affect the descriptors.") | 1018 | "\n\t\t\t\t Does not affect the descriptors.") |
... | @@ -1022,35 +1023,37 @@ class Pipeline: | ... | @@ -1022,35 +1023,37 @@ class Pipeline: |
1022 | print("--------------------------------------------------------------------------------------------------------------") | 1023 | print("--------------------------------------------------------------------------------------------------------------") |
1023 | print("--3d-folder=…\t\t\tPath to a folder to store the 3D data files. Subfolders will contain:" | 1024 | print("--3d-folder=…\t\t\tPath to a folder to store the 3D data files. Subfolders will contain:" |
1024 | "\n\t\t\t\t\tRNAcifs/\t\tFull structures containing RNA, in mmCIF format" | 1025 | "\n\t\t\t\t\tRNAcifs/\t\tFull structures containing RNA, in mmCIF format" |
1025 | - "\n\t\t\t\t\trna_mapped_to_Rfam/\tExtracted 'pure' RNA chains" | 1026 | + "\n\t\t\t\t\trna_mapped_to_Rfam/\tExtracted 'pure' portions of RNA chains mapped to families" |
1027 | + "\n\t\t\t\t\trna_only/\tExtracted 'pure' RNA chains, not truncated" | ||
1026 | "\n\t\t\t\t\tdatapoints/\t\tFinal results in CSV file format.") | 1028 | "\n\t\t\t\t\tdatapoints/\t\tFinal results in CSV file format.") |
1027 | print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files. Subfolders will be:" | 1029 | print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files. Subfolders will be:" |
1028 | "\n\t\t\t\t\trfam_sequences/fasta/\tCompressed hits to Rfam families" | 1030 | "\n\t\t\t\t\trfam_sequences/fasta/\tCompressed hits to Rfam families" |
1029 | "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family") | 1031 | "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family") |
1030 | - print("--sina\t\t\tForce the RNANet to align large subunit LSU and small subunit SSU ribosomal RNA using sina instead of infernal," | 1032 | + print("--sina\t\t\t\tAlign large subunit LSU and small subunit SSU ribosomal RNA using SINA instead of Infernal," |
1031 | - "\n\t\t\t\t\t the other RNA families will be aligned using infernal.") | 1033 | + "\n\t\t\t\t the other RNA families will be aligned using infernal.") |
1032 | print("--maxcores=…\t\t\tLimit the number of cores to use in parallel portions to reduce the simultaneous" | 1034 | print("--maxcores=…\t\t\tLimit the number of cores to use in parallel portions to reduce the simultaneous" |
1033 | "\n\t\t\t\t need of RAM. Should be a number between 1 and your number of CPUs. Note that portions" | 1035 | "\n\t\t\t\t need of RAM. Should be a number between 1 and your number of CPUs. Note that portions" |
1034 | "\n\t\t\t\t of the pipeline already limit themselves to 50% or 70% of that number by default.") | 1036 | "\n\t\t\t\t of the pipeline already limit themselves to 50% or 70% of that number by default.") |
1035 | print("--cmalign-opts=…\t\tA string of additional options to pass to cmalign aligner, e.g. \"--nonbanded --mxsize 2048\"") | 1037 | print("--cmalign-opts=…\t\tA string of additional options to pass to cmalign aligner, e.g. \"--nonbanded --mxsize 2048\"") |
1036 | print("--archive\t\t\tCreate tar.gz archives of the datapoints text files and the alignments," | 1038 | print("--archive\t\t\tCreate tar.gz archives of the datapoints text files and the alignments," |
1037 | "\n\t\t\t\t and update the link to the latest archive. ") | 1039 | "\n\t\t\t\t and update the link to the latest archive. ") |
1038 | - print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications") | 1040 | + print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications.") |
1039 | print() | 1041 | print() |
1040 | print("Select which data we are interested in:") | 1042 | print("Select which data we are interested in:") |
1041 | print("--------------------------------------------------------------------------------------------------------------") | 1043 | print("--------------------------------------------------------------------------------------------------------------") |
1042 | print("-r 4.0 [ --resolution=4.0 ]\tMaximum 3D structure resolution to consider a RNA chain.") | 1044 | print("-r 4.0 [ --resolution=4.0 ]\tMaximum 3D structure resolution to consider a RNA chain.") |
1043 | - print("--all\t\t\t\tBuild chains even if they already are in the database.") | 1045 | + print("--all\t\t\t\tProcess chains even if they already are in the database.") |
1044 | - print("--only\t\t\t\tAsk to process a specific chain label only") | 1046 | + print("--redundant\t\t\tProcess all members of the equivalence classes not only the representative.") |
1045 | - print("--ignore-issues\t\t\tDo not ignore already known issues and attempt to compute them") | 1047 | + print("--only\t\t\t\tAsk to process a specific chains only (could be 4v49, 4v49_1_AA, or 4v49_1_AA_5-1523).") |
1046 | - print("--update-homologous\t\tRe-download Rfam and SILVA databases, realign all families, and recompute all CSV files") | 1048 | + print("--ignore-issues\t\t\tDo not ignore already known issues and attempt to compute them.") |
1049 | + print("--update-homologous\t\tRe-download Rfam and SILVA databases, realign all families, and recompute all CSV files.") | ||
1047 | print("--from-scratch\t\t\tDelete database, local 3D and sequence files, and known issues, and recompute.") | 1050 | print("--from-scratch\t\t\tDelete database, local 3D and sequence files, and known issues, and recompute.") |
1048 | print() | 1051 | print() |
1049 | print("Typical usage:") | 1052 | print("Typical usage:") |
1050 | print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --no-logs' &") | 1053 | print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --no-logs' &") |
1051 | sys.exit() | 1054 | sys.exit() |
1052 | elif opt == '--version': | 1055 | elif opt == '--version': |
1053 | - print("RNANet v1.4 beta, parallelized, Dockerized") | 1056 | + print("RNANet v1.5 beta, parallelized, Dockerized") |
1054 | print("Last revision : April 2021") | 1057 | print("Last revision : April 2021") |
1055 | sys.exit() | 1058 | sys.exit() |
1056 | elif opt == "-r" or opt == "--resolution": | 1059 | elif opt == "-r" or opt == "--resolution": |
... | @@ -1114,9 +1117,9 @@ class Pipeline: | ... | @@ -1114,9 +1117,9 @@ class Pipeline: |
1114 | elif opt == "-f" or opt == "--full-inference": | 1117 | elif opt == "-f" or opt == "--full-inference": |
1115 | self.FULLINFERENCE = True | 1118 | self.FULLINFERENCE = True |
1116 | elif opt=="--redundant": | 1119 | elif opt=="--redundant": |
1117 | - self.REDUNDANT=True | 1120 | + self.REDUNDANT = True |
1118 | elif opt=="--sina": | 1121 | elif opt=="--sina": |
1119 | - self.USESINA=True | 1122 | + self.USESINA = True |
1120 | 1123 | ||
1121 | if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions": | 1124 | if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions": |
1122 | print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments") | 1125 | print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments") |
... | @@ -1161,7 +1164,8 @@ class Pipeline: | ... | @@ -1161,7 +1164,8 @@ class Pipeline: |
1161 | 1164 | ||
1162 | pbar = tqdm(full_structures_list, maxinterval=1.0, miniters=1, | 1165 | pbar = tqdm(full_structures_list, maxinterval=1.0, miniters=1, |
1163 | desc="Eq. classes", bar_format="{desc}:{percentage:3.0f}%|{bar}|") | 1166 | desc="Eq. classes", bar_format="{desc}:{percentage:3.0f}%|{bar}|") |
1164 | - for _, newchains in enumerate(p.imap_unordered(partial( | 1167 | + problems = [] |
1168 | + for _, results in enumerate(p.imap_unordered(partial( | ||
1165 | work_infer_mappings, | 1169 | work_infer_mappings, |
1166 | not self.REUSE_ALL, | 1170 | not self.REUSE_ALL, |
1167 | allmappings, | 1171 | allmappings, |
... | @@ -1170,6 +1174,8 @@ class Pipeline: | ... | @@ -1170,6 +1174,8 @@ class Pipeline: |
1170 | ), | 1174 | ), |
1171 | full_structures_list, | 1175 | full_structures_list, |
1172 | chunksize=1)): | 1176 | chunksize=1)): |
1177 | + newproblems, newchains = results | ||
1178 | + problems += newproblems | ||
1173 | self.update += newchains | 1179 | self.update += newchains |
1174 | 1180 | ||
1175 | pbar.update(1) # Everytime the iteration finishes, update the global progress bar | 1181 | pbar.update(1) # Everytime the iteration finishes, update the global progress bar |
... | @@ -1183,15 +1189,33 @@ class Pipeline: | ... | @@ -1183,15 +1189,33 @@ class Pipeline: |
1183 | p.terminate() | 1189 | p.terminate() |
1184 | p.join() | 1190 | p.join() |
1185 | exit(1) | 1191 | exit(1) |
1192 | + | ||
1193 | + # Display the issues afterwards | ||
1194 | + for p in problems: | ||
1195 | + warn(p) | ||
1186 | else: | 1196 | else: |
1187 | conn = sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) | 1197 | conn = sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) |
1188 | conn.execute('pragma journal_mode=wal') | 1198 | conn.execute('pragma journal_mode=wal') |
1189 | - for eq_class, codelist in tqdm(full_structures_list, desc="Eq. classes"): | 1199 | + for eq_class, representative, codelist in tqdm(full_structures_list, desc="Eq. classes"): |
1190 | codes = codelist.replace('+', ',').split(',') | 1200 | codes = codelist.replace('+', ',').split(',') |
1191 | 1201 | ||
1192 | # Simply convert the list of codes to Chain() objects | 1202 | # Simply convert the list of codes to Chain() objects |
1193 | - for c in codes: | 1203 | + if self.REDUNDANT: |
1194 | - nr = c.split('|') | 1204 | + for c in codes: |
1205 | + nr = c.split('|') | ||
1206 | + pdb_id = nr[0].lower() | ||
1207 | + pdb_model = int(nr[1]) | ||
1208 | + pdb_chain_id = nr[2].upper() | ||
1209 | + chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}" | ||
1210 | + res = sql_ask_database(conn, f"""SELECT chain_id from chain | ||
1211 | + WHERE structure_id='{pdb_id}' | ||
1212 | + AND chain_name='{pdb_chain_id}' | ||
1213 | + AND rfam_acc = 'unmappd' | ||
1214 | + AND issue=0""") | ||
1215 | + if not len(res) or self.REUSE_ALL: # the chain is NOT yet in the database, or this is a known issue | ||
1216 | + self.update.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class)) | ||
1217 | + else: | ||
1218 | + nr = representative.split('|') | ||
1195 | pdb_id = nr[0].lower() | 1219 | pdb_id = nr[0].lower() |
1196 | pdb_model = int(nr[1]) | 1220 | pdb_model = int(nr[1]) |
1197 | pdb_chain_id = nr[2].upper() | 1221 | pdb_chain_id = nr[2].upper() |
... | @@ -1206,7 +1230,13 @@ class Pipeline: | ... | @@ -1206,7 +1230,13 @@ class Pipeline: |
1206 | conn.close() | 1230 | conn.close() |
1207 | 1231 | ||
1208 | if self.SELECT_ONLY is not None: | 1232 | if self.SELECT_ONLY is not None: |
1209 | - self.update = [ c for c in self.update if c.chain_label == self.SELECT_ONLY ] | 1233 | + print("> Using only chains with label " + self.SELECT_ONLY + "... ", end="") |
1234 | + self.update = [ c for c in self.update if self.SELECT_ONLY in c.chain_label ] | ||
1235 | + if len(self.update): | ||
1236 | + print(validsymb) | ||
1237 | + else: | ||
1238 | + print("None found ! " + errsymb) | ||
1239 | + exit(1) | ||
1210 | 1240 | ||
1211 | self.n_chains = len(self.update) | 1241 | self.n_chains = len(self.update) |
1212 | print(str(self.n_chains) + " RNA chains of interest.") | 1242 | print(str(self.n_chains) + " RNA chains of interest.") |
... | @@ -1359,9 +1389,9 @@ class Pipeline: | ... | @@ -1359,9 +1389,9 @@ class Pipeline: |
1359 | try: | 1389 | try: |
1360 | execute_joblist(joblist) | 1390 | execute_joblist(joblist) |
1361 | 1391 | ||
1362 | - if len(set(self.fam_list).intersection(SSU_set)): | 1392 | + if self.USESINA and len(set(self.fam_list).intersection(SSU_set)): |
1363 | self.dl.download_from_SILVA("SSU") | 1393 | self.dl.download_from_SILVA("SSU") |
1364 | - if len(set(self.fam_list).intersection(LSU_set)): | 1394 | + if self.USESINA and len(set(self.fam_list).intersection(LSU_set)): |
1365 | self.dl.download_from_SILVA("LSU") | 1395 | self.dl.download_from_SILVA("LSU") |
1366 | except KeyboardInterrupt: | 1396 | except KeyboardInterrupt: |
1367 | print("Exiting") | 1397 | print("Exiting") |
... | @@ -1762,6 +1792,7 @@ def sql_define_tables(conn): | ... | @@ -1762,6 +1792,7 @@ def sql_define_tables(conn): |
1762 | CREATE TABLE IF NOT EXISTS align_column ( | 1792 | CREATE TABLE IF NOT EXISTS align_column ( |
1763 | rfam_acc CHAR(7) NOT NULL, | 1793 | rfam_acc CHAR(7) NOT NULL, |
1764 | index_ali INT NOT NULL, | 1794 | index_ali INT NOT NULL, |
1795 | + index_small_ali INT NOT NULL, | ||
1765 | cm_coord INT, | 1796 | cm_coord INT, |
1766 | freq_A REAL, | 1797 | freq_A REAL, |
1767 | freq_C REAL, | 1798 | freq_C REAL, |
... | @@ -1938,8 +1969,8 @@ def execute_joblist(fulljoblist): | ... | @@ -1938,8 +1969,8 @@ def execute_joblist(fulljoblist): |
1938 | p.close() | 1969 | p.close() |
1939 | p.join() | 1970 | p.join() |
1940 | except KeyboardInterrupt as e: | 1971 | except KeyboardInterrupt as e: |
1941 | - warn("KeyboardInterrupt, terminating workers.", error=True) | 1972 | + warn("KeyboardInterrupt, killing workers (SIGKILL).", error=True) |
1942 | - p.terminate() | 1973 | + p.kill() |
1943 | p.join() | 1974 | p.join() |
1944 | raise e | 1975 | raise e |
1945 | 1976 | ||
... | @@ -1952,15 +1983,23 @@ def execute_joblist(fulljoblist): | ... | @@ -1952,15 +1983,23 @@ def execute_joblist(fulljoblist): |
1952 | return results | 1983 | return results |
1953 | 1984 | ||
1954 | @trace_unhandled_exceptions | 1985 | @trace_unhandled_exceptions |
1955 | -def work_infer_mappings(update_only, allmappings, fullinference,redundant, codelist) -> list: | 1986 | +def work_infer_mappings(update_only, allmappings, fullinference, redundant, codelist) -> (list, list): |
1956 | """Given a list of PDB chains corresponding to an equivalence class from BGSU's NR list, | 1987 | """Given a list of PDB chains corresponding to an equivalence class from BGSU's NR list, |
1957 | build a list of Chain() objects mapped to Rfam families, by expanding available mappings | 1988 | build a list of Chain() objects mapped to Rfam families, by expanding available mappings |
1958 | of any element of the list to all the list elements. | 1989 | of any element of the list to all the list elements. |
1990 | + update_only (bool) : Only return chains which are not yet in the database | ||
1991 | + allmappings (DataFrame) : Rfam-PDB mappings CSV | ||
1992 | + fullinference (bool) : include copies of chains mapped to families of the other members of the equivalence class, even if this chain already has a mapping | ||
1993 | + redundant (bool) : include all members of the equivalence class, not just the representative | ||
1994 | + codelist (str) : list of chains of an equivalence class, in the NR-list format | ||
1995 | + | ||
1996 | + returns list[str], list[Chain] : problems faced, and Chain objects to process. | ||
1959 | """ | 1997 | """ |
1960 | 1998 | ||
1961 | setproctitle("RNAnet.py work_infer_mappings()") | 1999 | setproctitle("RNAnet.py work_infer_mappings()") |
1962 | 2000 | ||
1963 | newchains = [] | 2001 | newchains = [] |
2002 | + newproblems = [] | ||
1964 | known_mappings = pd.DataFrame() | 2003 | known_mappings = pd.DataFrame() |
1965 | 2004 | ||
1966 | # Split the comma-separated list of chain codes into chain codes: | 2005 | # Split the comma-separated list of chain codes into chain codes: |
... | @@ -2001,6 +2040,8 @@ def work_infer_mappings(update_only, allmappings, fullinference,redundant, codel | ... | @@ -2001,6 +2040,8 @@ def work_infer_mappings(update_only, allmappings, fullinference,redundant, codel |
2001 | and len(inferred_mappings[thisfam_5_3]) > 0 | 2040 | and len(inferred_mappings[thisfam_5_3]) > 0 |
2002 | ): | 2041 | ): |
2003 | # there are mappings in both directions... wtf Rfam ?! | 2042 | # there are mappings in both directions... wtf Rfam ?! |
2043 | + # Reverse-direction hits of cmscan are hits for the (-) strand --> We are not interested in negative strands, | ||
2044 | + # we do not have their 3D structure ! We should ignore them. | ||
2004 | if (len(inferred_mappings[thisfam_5_3]) == len(inferred_mappings[thisfam_3_5]) == 1 | 2045 | if (len(inferred_mappings[thisfam_5_3]) == len(inferred_mappings[thisfam_3_5]) == 1 |
2005 | and int(inferred_mappings[thisfam_5_3].pdb_start) == int(inferred_mappings[thisfam_3_5].pdb_end) | 2046 | and int(inferred_mappings[thisfam_5_3].pdb_start) == int(inferred_mappings[thisfam_3_5].pdb_end) |
2006 | and int(inferred_mappings[thisfam_5_3].pdb_end) == int(inferred_mappings[thisfam_3_5].pdb_start) | 2047 | and int(inferred_mappings[thisfam_5_3].pdb_end) == int(inferred_mappings[thisfam_3_5].pdb_start) |
... | @@ -2013,12 +2054,10 @@ def work_infer_mappings(update_only, allmappings, fullinference,redundant, codel | ... | @@ -2013,12 +2054,10 @@ def work_infer_mappings(update_only, allmappings, fullinference,redundant, codel |
2013 | sel_5_to_3 = (inferred_mappings['pdb_start'] < inferred_mappings['pdb_end']) | 2054 | sel_5_to_3 = (inferred_mappings['pdb_start'] < inferred_mappings['pdb_end']) |
2014 | thisfam_5_3 = (inferred_mappings['rfam_acc'] == rfam) & sel_5_to_3 | 2055 | thisfam_5_3 = (inferred_mappings['rfam_acc'] == rfam) & sel_5_to_3 |
2015 | thisfam_3_5 = (inferred_mappings['rfam_acc'] == rfam) & (sel_5_to_3 == False) | 2056 | thisfam_3_5 = (inferred_mappings['rfam_acc'] == rfam) & (sel_5_to_3 == False) |
2016 | - # print() | 2057 | + newproblems.append(f"Found mappings to {rfam} in both directions on the same interval, keeping only the 5'->3' one.") |
2017 | - # warn(f"Found mappings to {rfam} in both directions on the same interval, keeping only the 5'->3' one.") | ||
2018 | else: | 2058 | else: |
2019 | - warn(f"There are mappings for {rfam} in both directions:", error=True) | 2059 | + newproblems.append(f"There are mappings for {rfam} in both directions, this is a clue that the hit is wrong. Ignoring it.") |
2020 | - print(inferred_mappings) | 2060 | + continue |
2021 | - # exit(1) | ||
2022 | 2061 | ||
2023 | # Compute consensus for chains in 5' -> 3' sense | 2062 | # Compute consensus for chains in 5' -> 3' sense |
2024 | if len(inferred_mappings[thisfam_5_3]): | 2063 | if len(inferred_mappings[thisfam_5_3]): |
... | @@ -2035,33 +2074,17 @@ def work_infer_mappings(update_only, allmappings, fullinference,redundant, codel | ... | @@ -2035,33 +2074,17 @@ def work_infer_mappings(update_only, allmappings, fullinference,redundant, codel |
2035 | known_sel_5_to_3 = (known_mappings['rfam_acc'] == rfam) & (known_mappings['pdb_start'] < known_mappings['pdb_end']) | 2074 | known_sel_5_to_3 = (known_mappings['rfam_acc'] == rfam) & (known_mappings['pdb_start'] < known_mappings['pdb_end']) |
2036 | inferred_mappings.loc[thisfam_5_3, 'pdb_start'] = known_mappings.loc[known_sel_5_to_3, 'pdb_start'].median() | 2075 | inferred_mappings.loc[thisfam_5_3, 'pdb_start'] = known_mappings.loc[known_sel_5_to_3, 'pdb_start'].median() |
2037 | inferred_mappings.loc[thisfam_5_3, 'pdb_end'] = known_mappings.loc[known_sel_5_to_3, 'pdb_end'].median() | 2076 | inferred_mappings.loc[thisfam_5_3, 'pdb_end'] = known_mappings.loc[known_sel_5_to_3, 'pdb_end'].median() |
2038 | - | 2077 | + |
2039 | - # Compute consensus for chains in 3' -> 5' sense | ||
2040 | - if len(inferred_mappings[thisfam_3_5]): | ||
2041 | - pdb_start_min = min(inferred_mappings[thisfam_3_5]['pdb_start']) | ||
2042 | - pdb_end_max = max(inferred_mappings[thisfam_3_5]['pdb_end']) | ||
2043 | - pdb_start_max = max(inferred_mappings[thisfam_3_5]['pdb_start']) | ||
2044 | - pdb_end_min = min(inferred_mappings[thisfam_3_5]['pdb_end']) | ||
2045 | - if (pdb_start_max - pdb_start_min < 100) and (pdb_end_max - pdb_end_min < 100): | ||
2046 | - # the variation is only a few nucleotides, we take the largest window. | ||
2047 | - inferred_mappings.loc[thisfam_3_5, 'pdb_start'] = pdb_start_max | ||
2048 | - inferred_mappings.loc[thisfam_3_5, 'pdb_end'] = pdb_end_min | ||
2049 | - else: | ||
2050 | - # there probably is an outlier. We chose the median value in the whole list of known_mappings. | ||
2051 | - known_sel_3_to_5 = (known_mappings['rfam_acc'] == rfam) & (known_mappings['pdb_start'] > known_mappings['pdb_end']) | ||
2052 | - inferred_mappings.loc[thisfam_3_5, 'pdb_start'] = known_mappings.loc[known_sel_3_to_5, 'pdb_start'].median() | ||
2053 | - inferred_mappings.loc[thisfam_3_5, 'pdb_end'] = known_mappings.loc[known_sel_3_to_5, 'pdb_end'].median() | ||
2054 | inferred_mappings.drop_duplicates(inplace=True) | 2078 | inferred_mappings.drop_duplicates(inplace=True) |
2055 | 2079 | ||
2056 | # Now build Chain() objects for the mapped chains | 2080 | # Now build Chain() objects for the mapped chains |
2057 | for c in codes: | 2081 | for c in codes: |
2058 | 2082 | ||
2059 | - if not redundant and c!=representative: | 2083 | + if not redundant and c != representative: |
2060 | - ''' | 2084 | + # By default, we save only the representative member. |
2061 | - by default save only the representative member | 2085 | + # If --redundant is passed, then save all the chains of the class members |
2062 | - if redundant is passed then save all the chains of the class members | ||
2063 | - ''' | ||
2064 | continue | 2086 | continue |
2087 | + | ||
2065 | nr = c.split('|') | 2088 | nr = c.split('|') |
2066 | pdb_id = nr[0].lower() | 2089 | pdb_id = nr[0].lower() |
2067 | pdb_model = int(nr[1]) | 2090 | pdb_model = int(nr[1]) |
... | @@ -2107,7 +2130,7 @@ def work_infer_mappings(update_only, allmappings, fullinference,redundant, codel | ... | @@ -2107,7 +2130,7 @@ def work_infer_mappings(update_only, allmappings, fullinference,redundant, codel |
2107 | newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, | 2130 | newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, |
2108 | rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end)) | 2131 | rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end)) |
2109 | 2132 | ||
2110 | - return newchains | 2133 | + return newproblems, newchains |
2111 | 2134 | ||
2112 | @trace_unhandled_exceptions | 2135 | @trace_unhandled_exceptions |
2113 | def work_mmcif(pdb_id): | 2136 | def work_mmcif(pdb_id): |
... | @@ -2437,34 +2460,22 @@ def work_realign(useSina, alignopts, rfam_acc): | ... | @@ -2437,34 +2460,22 @@ def work_realign(useSina, alignopts, rfam_acc): |
2437 | er.write(f"Failed to realign {rfam_acc} (killed)") | 2460 | er.write(f"Failed to realign {rfam_acc} (killed)") |
2438 | 2461 | ||
2439 | @trace_unhandled_exceptions | 2462 | @trace_unhandled_exceptions |
2440 | -def work_pydca(f, columns_to_save): | 2463 | +def work_save_pydca(f,alignment): |
2441 | - """ | ||
2442 | - This function writes an alignment file containing only the columns which will be saved to the database, | ||
2443 | - converted to uppercase, and without non-ACGU nucleotides. | ||
2444 | - This file in then used by pydca to compute DCA features. | ||
2445 | - """ | ||
2446 | - | ||
2447 | - align=read(path_to_seq_data + f"realigned/{f}++.afa") | ||
2448 | - for s in align: | ||
2449 | - s.seq=s.seq.upper() # Convert to uppercase as needed for pydca | ||
2450 | - filtered_alignment = align[:, 1:1] # all the lines, but no columns | ||
2451 | - for p in columns_to_save: | ||
2452 | - filtered_alignment += align[:, p-1:p] # save columns one by one | ||
2453 | - | ||
2454 | # Replace all other letters by a deletion gap just for the | 2464 | # Replace all other letters by a deletion gap just for the |
2455 | # aim to use pydca as sites other than ACGU . and - are not accepted | 2465 | # aim to use pydca as sites other than ACGU . and - are not accepted |
2456 | - for s in filtered_alignment: | 2466 | + for s in alignment: |
2467 | + s.seq = s.seq.toseq().upper().tomutable() # Convert to uppercase as needed for pydca | ||
2457 | for i in range(len(s.seq)): | 2468 | for i in range(len(s.seq)): |
2458 | if s.seq[i].upper() not in "ACGU-.": | 2469 | if s.seq[i].upper() not in "ACGU-.": |
2459 | s.seq[i]='-' | 2470 | s.seq[i]='-' |
2460 | 2471 | ||
2461 | - # Create a fasta file to be used by pydca | 2472 | + #Create a fasta file to be used by pydca |
2462 | with open(path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa", "w") as only_3d: | 2473 | with open(path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa", "w") as only_3d: |
2463 | try: | 2474 | try: |
2464 | - only_3d.write(format(filtered_alignment, "fasta")) | 2475 | + only_3d.write(format(alignment, "fasta")) |
2465 | except ValueError as e: | 2476 | except ValueError as e: |
2466 | warn(e) | 2477 | warn(e) |
2467 | - | 2478 | + |
2468 | @trace_unhandled_exceptions | 2479 | @trace_unhandled_exceptions |
2469 | def work_pssm_remap(f): | 2480 | def work_pssm_remap(f): |
2470 | """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family. | 2481 | """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family. |
... | @@ -2648,7 +2659,7 @@ def work_pssm_remap(f): | ... | @@ -2648,7 +2659,7 @@ def work_pssm_remap(f): |
2648 | setproctitle(f"RNAnet.py work_pssm_remap({f}) insert/match states") | 2659 | setproctitle(f"RNAnet.py work_pssm_remap({f}) insert/match states") |
2649 | 2660 | ||
2650 | # Get back the information of match/insertion states from the STK file | 2661 | # Get back the information of match/insertion states from the STK file |
2651 | - if f not in SSU_set and f not in LSU_set: | 2662 | + if (not use_sina) or (f not in SSU_set and f not in LSU_set): |
2652 | alignstk = AlignIO.read(path_to_seq_data + "realigned/" + f + "++.stk", "stockholm") | 2663 | alignstk = AlignIO.read(path_to_seq_data + "realigned/" + f + "++.stk", "stockholm") |
2653 | consensus_2d = alignstk.column_annotations["secondary_structure"] | 2664 | consensus_2d = alignstk.column_annotations["secondary_structure"] |
2654 | del alignstk | 2665 | del alignstk |
... | @@ -2675,20 +2686,17 @@ def work_pssm_remap(f): | ... | @@ -2675,20 +2686,17 @@ def work_pssm_remap(f): |
2675 | cm_coords = [ None for x in range(ncols) ] | 2686 | cm_coords = [ None for x in range(ncols) ] |
2676 | cm_2d = [ None for x in range(ncols) ] | 2687 | cm_2d = [ None for x in range(ncols) ] |
2677 | 2688 | ||
2678 | - setproctitle(f"RNAnet.py work_pssm_remap({f}) Potts model, DCA") | 2689 | + data = [(f,j,i,cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + (consensus[j-1], cm_2d[j-1]) for i, j in enumerate(columns)] |
2679 | - | 2690 | + sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, index_small_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct) |
2680 | - work_pydca(f, sorted(columns_to_save)) | 2691 | + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO |
2681 | - | 2692 | + UPDATE SET index_small_ali=excluded.index_small_ali, cm_coord=excluded.cm_coord, freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, |
2682 | - data = [(f, j, cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + (consensus[j-1], cm_2d[j-1]) for j in sorted(columns_to_save)] | ||
2683 | - sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct) | ||
2684 | - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO | ||
2685 | - UPDATE SET cm_coord=excluded.cm_coord, freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, | ||
2686 | freq_other=excluded.freq_other, | 2693 | freq_other=excluded.freq_other, |
2687 | gap_percent=excluded.gap_percent, consensus=excluded.consensus, cons_sec_struct=excluded.cons_sec_struct;""", many=True, data=data) | 2694 | gap_percent=excluded.gap_percent, consensus=excluded.consensus, cons_sec_struct=excluded.cons_sec_struct;""", many=True, data=data) |
2688 | # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment) | 2695 | # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment) |
2689 | - sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, | 2696 | + sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, index_small_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, |
2690 | gap_percent, consensus, cons_sec_struct) | 2697 | gap_percent, consensus, cons_sec_struct) |
2691 | - VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', NULL);""", data=(f,)) | 2698 | + VALUES (?, 0, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', NULL);""", data=(f,)) |
2699 | + | ||
2692 | 2700 | ||
2693 | 2701 | ||
2694 | # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains) | 2702 | # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains) |
... | @@ -2720,6 +2728,20 @@ def work_pssm_remap(f): | ... | @@ -2720,6 +2728,20 @@ def work_pssm_remap(f): |
2720 | except ValueError as e: | 2728 | except ValueError as e: |
2721 | warn(e) | 2729 | warn(e) |
2722 | 2730 | ||
2731 | + setproctitle(f"RNAnet.py work_pssm_remap({f}) Potts model, DCA") | ||
2732 | + | ||
2733 | + if len(filtered_alignment) < 20: | ||
2734 | + # The 3D-only alignment is not big enough for us to compute PyDCA features on it. | ||
2735 | + # We'll use the large one. | ||
2736 | + del align | ||
2737 | + del filtered_alignment | ||
2738 | + align = read(path_to_seq_data + f"realigned/{f}++.afa") | ||
2739 | + filtered_alignment = align[:, 1:1] # all the lines, but no columns | ||
2740 | + for p in columns: | ||
2741 | + filtered_alignment += align[:, p-1:p] # save columns one by one | ||
2742 | + | ||
2743 | + work_save_pydca(f, filtered_alignment) | ||
2744 | + | ||
2723 | setproctitle(f"RNAnet.py work_pssm_remap({f}) finished") | 2745 | setproctitle(f"RNAnet.py work_pssm_remap({f}) finished") |
2724 | idxQueue.put(thr_idx) # replace the thread index in the queue | 2746 | idxQueue.put(thr_idx) # replace the thread index in the queue |
2725 | return 0 | 2747 | return 0 |
... | @@ -2733,7 +2755,7 @@ def work_save(c, homology=True): | ... | @@ -2733,7 +2755,7 @@ def work_save(c, homology=True): |
2733 | conn.execute('pragma journal_mode=wal') | 2755 | conn.execute('pragma journal_mode=wal') |
2734 | if homology: | 2756 | if homology: |
2735 | df = pd.read_sql_query(f""" | 2757 | df = pd.read_sql_query(f""" |
2736 | - SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, cm_coord, | 2758 | + SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, cm_coord, index_small_ali, |
2737 | is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, | 2759 | is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, |
2738 | gap_percent, consensus, cons_sec_struct, dbn, paired, nb_interact, pair_type_LW, pair_type_DSSR, | 2760 | gap_percent, consensus, cons_sec_struct, dbn, paired, nb_interact, pair_type_LW, pair_type_DSSR, |
2739 | alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, chi, bb_type, glyco_bond, form, ssZp, Dp, | 2761 | alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, chi, bb_type, glyco_bond, form, ssZp, Dp, |
... | @@ -2759,7 +2781,6 @@ def work_save(c, homology=True): | ... | @@ -2759,7 +2781,6 @@ def work_save(c, homology=True): |
2759 | 2781 | ||
2760 | df.to_csv(filename, float_format="%.2f", index=False) | 2782 | df.to_csv(filename, float_format="%.2f", index=False) |
2761 | 2783 | ||
2762 | - | ||
2763 | if __name__ == "__main__": | 2784 | if __name__ == "__main__": |
2764 | 2785 | ||
2765 | fileDir = os.path.dirname(os.path.realpath(__file__)) | 2786 | fileDir = os.path.dirname(os.path.realpath(__file__)) | ... | ... |
... | @@ -1190,7 +1190,7 @@ if __name__ == "__main__": | ... | @@ -1190,7 +1190,7 @@ if __name__ == "__main__": |
1190 | 1190 | ||
1191 | if opt == "-h" or opt == "--help": | 1191 | if opt == "-h" or opt == "--help": |
1192 | print( "RNANet statistics, a script to build a multiscale RNA dataset from public data\n" | 1192 | print( "RNANet statistics, a script to build a multiscale RNA dataset from public data\n" |
1193 | - "Developped by Louis Becquey (louis.becquey@univ-evry.fr), 2020/2021") | 1193 | + "Developped by Louis Becquey an Khodor Hannoush, 2020/2021") |
1194 | print() | 1194 | print() |
1195 | print("Options:") | 1195 | print("Options:") |
1196 | print("-h [ --help ]\t\t\tPrint this help message") | 1196 | print("-h [ --help ]\t\t\tPrint this help message") |
... | @@ -1206,7 +1206,7 @@ if __name__ == "__main__": | ... | @@ -1206,7 +1206,7 @@ if __name__ == "__main__": |
1206 | 1206 | ||
1207 | sys.exit() | 1207 | sys.exit() |
1208 | elif opt == '--version': | 1208 | elif opt == '--version': |
1209 | - print("RNANet statistics 1.4 beta") | 1209 | + print("RNANet statistics 1.5 beta") |
1210 | sys.exit() | 1210 | sys.exit() |
1211 | elif opt == "-r" or opt == "--resolution": | 1211 | elif opt == "-r" or opt == "--resolution": |
1212 | assert float(arg) > 0.0 and float(arg) <= 20.0 | 1212 | assert float(arg) > 0.0 and float(arg) <= 20.0 | ... | ... |
-
Please register or login to post a comment