Showing
5 changed files
with
251 additions
and
150 deletions
CHANGELOG
0 → 100644
1 | +v 1.1 beta, January 2021 | ||
2 | + | ||
3 | +The first uses of RNANet by people from outside the development team happened this December. | ||
4 | +Their feedback allowed us to identify issues and useful information to add. | ||
5 | + | ||
6 | +FEATURE CHANGES | ||
7 | + - Sequence alignments of the 3D structures mapped to a family are now provided. | ||
8 | + - Full alignments with Rfam sequences are not provided, but you can ask us for the files. | ||
9 | + - Two new fields in table 'family': ali_len and ali_filtered_len. | ||
10 | + They are the MSA lengths of the alignment with and without the Rfam sequences. | ||
11 | + | ||
12 | +TECHNICAL CHANGES | ||
13 | + - SQLite connections are now all in WAL mode by default (previously, only the writers used WAL mode) | ||
14 | + | ||
15 | +BUG CORRECTIONS | ||
16 | + - When an alignment file is updated in a newer run of RNANet, all the re_mappings are now re-computed | ||
17 | + for this family. Previously, the remappings were computed only for the newly added sequences, | ||
18 | + while the alignment actually changed even for chains added in past runs. | ||
19 | + - Changed the ownership and permissions of files produced by the Docker container. | ||
20 | + They were previously owned by root and the user could not get access to them. | ||
21 | + - Modified nucleotides were not always correctly transformed to N in the alignments (and nucleotide.nt_align_code fields). | ||
22 | + Now, the alignments and nt_align_code only contain "ACGUN-" chars. | ||
23 | + Now, 'N' means 'other', while '-' means 'nothing'. | ||
24 | + | ||
25 | +COMING SOON | ||
26 | + - Automated annotation of detected Recurrent Interaction Networks (RINs), see http://carnaval.lri.fr/ . | ||
27 | + - Possibly, automated detection of HLs and ILs from the 3D Motif Atlas (BGSU). Maybe. Their own website already does the job. | ||
28 | + - A field estimating the quality of the sequence alignment in table family. | ||
29 | + - Possibly, more metrics about the alignments coming from Infernal. | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
... | @@ -249,7 +249,9 @@ To help you design your own requests, here follows a description of the database | ... | @@ -249,7 +249,9 @@ To help you design your own requests, here follows a description of the database |
249 | * `nb_homologs`: The number of hits known to be homologous downloaded from Rfam to compute nucleotide frequencies | 249 | * `nb_homologs`: The number of hits known to be homologous downloaded from Rfam to compute nucleotide frequencies |
250 | * `nb_3d_chains`: The number of 3D RNA chains mapped to the family (from Rfam-PDB mappings, or inferred using the redundancy list) | 250 | * `nb_3d_chains`: The number of 3D RNA chains mapped to the family (from Rfam-PDB mappings, or inferred using the redundancy list) |
251 | * `nb_total_homol`: Sum of the two previous fields, the number of sequences in the multiple sequence alignment, used to compute nucleotide frequencies | 251 | * `nb_total_homol`: Sum of the two previous fields, the number of sequences in the multiple sequence alignment, used to compute nucleotide frequencies |
252 | -* `max_len`: The longest RNA sequence among the homologs (in bases) | 252 | +* `max_len`: The longest RNA sequence among the homologs (in bases, unaligned) |
253 | +* `ali_len`: The aligned sequences length (in bases, aligned) | ||
254 | +* `ali_filtered_len`: The aligned sequences length when we filter the alignment to keep only the RNANet chains (which have a 3D structure) and remove the gap-only columns. | ||
253 | * `comput_time`: Time required to compute the family's multiple sequence alignment in seconds, | 255 | * `comput_time`: Time required to compute the family's multiple sequence alignment in seconds, |
254 | * `comput_peak_mem`: RAM (or swap) required to compute the family's multiple sequence alignment in megabytes, | 256 | * `comput_peak_mem`: RAM (or swap) required to compute the family's multiple sequence alignment in megabytes, |
255 | * `idty_percent`: Average identity percentage over pairs of the 3D chains' sequences from the family | 257 | * `idty_percent`: Average identity percentage over pairs of the 3D chains' sequences from the family | ... | ... |
1 | -#!/usr/bin/python3.8 | 1 | +#!/usr/bin/python3 |
2 | + | ||
3 | +# check Python version before everything | ||
4 | +import platform | ||
5 | +a = ["3.8", platform.python_version()] | ||
6 | +a.sort() | ||
7 | +if a[0] != "3.8": | ||
8 | + print(f"Python is too old: {platform.python_version()}") | ||
9 | + print("Please use version 3.8 or newer.") | ||
10 | + exit(1) | ||
11 | + | ||
2 | import Bio | 12 | import Bio |
3 | import Bio.PDB as pdb | 13 | import Bio.PDB as pdb |
4 | import concurrent.futures | 14 | import concurrent.futures |
... | @@ -58,6 +68,7 @@ running_stats.append(0) # n_finished | ... | @@ -58,6 +68,7 @@ running_stats.append(0) # n_finished |
58 | running_stats.append(0) # n_skipped | 68 | running_stats.append(0) # n_skipped |
59 | path_to_3D_data = "tobedefinedbyoptions" | 69 | path_to_3D_data = "tobedefinedbyoptions" |
60 | path_to_seq_data = "tobedefinedbyoptions" | 70 | path_to_seq_data = "tobedefinedbyoptions" |
71 | +python_executable = "python"+".".join(platform.python_version().split('.')[:2]) # Cuts python3.8.1 into python3.8 for example. | ||
61 | validsymb = '\U00002705' | 72 | validsymb = '\U00002705' |
62 | warnsymb = '\U000026A0' | 73 | warnsymb = '\U000026A0' |
63 | errsymb = '\U0000274C' | 74 | errsymb = '\U0000274C' |
... | @@ -163,7 +174,7 @@ class Chain: | ... | @@ -163,7 +174,7 @@ class Chain: |
163 | self.chain_label = chain_label # chain pretty name | 174 | self.chain_label = chain_label # chain pretty name |
164 | self.file = "" # path to the 3D PDB file | 175 | self.file = "" # path to the 3D PDB file |
165 | self.seq = "" # sequence with modified nts | 176 | self.seq = "" # sequence with modified nts |
166 | - self.seq_to_align = "" # sequence with modified nts replaced, but gaps can exist | 177 | + self.seq_to_align = "" # sequence with modified nts replaced by N, but gaps can exist |
167 | self.length = -1 # length of the sequence (missing residues are not counted) | 178 | self.length = -1 # length of the sequence (missing residues are not counted) |
168 | self.full_length = -1 # length of the chain extracted from source structure ([start; stop] interval, or a subset for inferred mappings) | 179 | self.full_length = -1 # length of the chain extracted from source structure ([start; stop] interval, or a subset for inferred mappings) |
169 | self.delete_me = False # an error occured during production/parsing | 180 | self.delete_me = False # an error occured during production/parsing |
... | @@ -468,10 +479,11 @@ class Chain: | ... | @@ -468,10 +479,11 @@ class Chain: |
468 | # Add a sequence column just for the alignments | 479 | # Add a sequence column just for the alignments |
469 | df['nt_align_code'] = [str(x).upper() | 480 | df['nt_align_code'] = [str(x).upper() |
470 | .replace('NAN', '-') # Unresolved nucleotides are gaps | 481 | .replace('NAN', '-') # Unresolved nucleotides are gaps |
471 | - .replace('?', '-') # Unidentified residues, let's delete them | 482 | + .replace('?', 'N') # Unidentified residues, let's delete them |
472 | - .replace('T', 'U') # 5MU are modified to t, which gives T | 483 | + .replace('T', 'U') # 5MU are modified to t by DSSR, which gives T |
473 | .replace('P', 'U') # Pseudo-uridines, but it is not really right to change them to U, see DSSR paper, Fig 2 | 484 | .replace('P', 'U') # Pseudo-uridines, but it is not really right to change them to U, see DSSR paper, Fig 2 |
474 | for x in df['nt_code']] | 485 | for x in df['nt_code']] |
486 | + df['nt_align_code'] = [ x if x in "ACGU-" else 'N' for x in df['nt_align_code'] ] # All other modified nucleotides are transformed to N | ||
475 | 487 | ||
476 | # One-hot encoding sequence | 488 | # One-hot encoding sequence |
477 | df["is_A"] = [1 if x == "A" else 0 for x in df["nt_code"]] | 489 | df["is_A"] = [1 if x == "A" else 0 for x in df["nt_code"]] |
... | @@ -562,6 +574,7 @@ class Chain: | ... | @@ -562,6 +574,7 @@ class Chain: |
562 | setproctitle(f"RNANet.py {self.chain_label} register_chain()") | 574 | setproctitle(f"RNANet.py {self.chain_label} register_chain()") |
563 | 575 | ||
564 | with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: | 576 | with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: |
577 | + conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query | ||
565 | # Register the chain in table chain | 578 | # Register the chain in table chain |
566 | if self.mapping is not None: | 579 | if self.mapping is not None: |
567 | sql_execute(conn, f""" INSERT INTO chain | 580 | sql_execute(conn, f""" INSERT INTO chain |
... | @@ -607,99 +620,6 @@ class Chain: | ... | @@ -607,99 +620,6 @@ class Chain: |
607 | ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""", | 620 | ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""", |
608 | many=True, data=list(df.to_records(index=False)), warn_every=10) | 621 | many=True, data=list(df.to_records(index=False)), warn_every=10) |
609 | 622 | ||
610 | - def remap(self, columns_to_save, s_seq): | ||
611 | - """Maps the object's sequence to its version in a MSA, to compute nucleotide frequencies at every position. | ||
612 | - | ||
613 | - columns_to_save: a set of indexes in the alignment that are mapped to previous sequences in the alignment | ||
614 | - s_seq: the aligned version of self.seq_to_align | ||
615 | - """ | ||
616 | - | ||
617 | - setproctitle(f"RNANet.py {self.chain_label} remap()") | ||
618 | - | ||
619 | - alilen = len(s_seq) | ||
620 | - re_mappings = [] | ||
621 | - | ||
622 | - # Save colums in the appropriate positions | ||
623 | - i = 0 | ||
624 | - j = 0 | ||
625 | - while i < self.full_length and j < alilen: | ||
626 | - # Here we try to map self.seq_to_align (the sequence of the 3D chain, including gaps when residues are missing), | ||
627 | - # with s_seq, the sequence aligned in the MSA, containing any of ACGU and two types of gaps, - and . | ||
628 | - | ||
629 | - if self.seq_to_align[i] == s_seq[j].upper(): # alignment and sequence correspond (incl. gaps) | ||
630 | - re_mappings.append((self.db_chain_id, i+1, j+1)) # because index_chain in table nucleotide is in [1,N], we use i+1 and j+1. | ||
631 | - columns_to_save.add(j+1) # it's a set, doublons are automaticaly ignored | ||
632 | - i += 1 | ||
633 | - j += 1 | ||
634 | - elif self.seq_to_align[i] == '-': # gap in the chain, but not in the aligned sequence | ||
635 | - # search for a gap to the consensus nearby | ||
636 | - k = 0 # Search must start at zero to assert the difference comes from '-' in front of '.' | ||
637 | - while j+k < alilen and s_seq[j+k] == '.': | ||
638 | - k += 1 | ||
639 | - | ||
640 | - # if found, set j to that position | ||
641 | - if j+k < alilen and s_seq[j+k] == '-': | ||
642 | - re_mappings.append((self.db_chain_id, i+1, j+k+1)) | ||
643 | - columns_to_save.add(j+k+1) | ||
644 | - i += 1 | ||
645 | - j += k+1 | ||
646 | - continue | ||
647 | - | ||
648 | - # if not, take the insertion gap if this is one | ||
649 | - if j < alilen and s_seq[j] == '.': | ||
650 | - re_mappings.append((self.db_chain_id, i+1, j+1)) | ||
651 | - columns_to_save.add(j+1) | ||
652 | - i += 1 | ||
653 | - j += 1 | ||
654 | - continue | ||
655 | - | ||
656 | - # else, just mark the gap as unknown (there is an alignment mismatch) | ||
657 | - re_mappings.append((self.db_chain_id, i+1, 0)) | ||
658 | - i += 1 | ||
659 | - elif s_seq[j] in ['.', '-']: # gap in the alignment, but not in the real chain | ||
660 | - j += 1 # ignore the column | ||
661 | - else: # sequence mismatch which is not a gap... | ||
662 | - print(f"You are never supposed to reach this. Comparing {self.chain_label} in {i} ({self.seq_to_align[i-1:i+2]}) with seq[{j}] ({s_seq[j-3:j+4]}).", | ||
663 | - self.seq_to_align, s_seq, sep='\n', flush=True) | ||
664 | - raise Exception('Something is wrong with sequence alignment.') | ||
665 | - return re_mappings, columns_to_save | ||
666 | - | ||
667 | - def replace_gaps(self, conn): | ||
668 | - """ Replace gapped positions by the consensus sequence. | ||
669 | - | ||
670 | - REQUIRES align_column and re_mapping up to date | ||
671 | - """ | ||
672 | - | ||
673 | - setproctitle(f"RNANet.py {self.chain_label} replace_gaps()") | ||
674 | - | ||
675 | - homology_data = sql_ask_database(conn, f"""SELECT freq_A, freq_C, freq_G, freq_U, freq_other FROM | ||
676 | - (SELECT chain_id, rfam_acc FROM chain WHERE chain_id={self.db_chain_id}) | ||
677 | - NATURAL JOIN re_mapping | ||
678 | - NATURAL JOIN align_column; | ||
679 | - """) | ||
680 | - if homology_data is None or not len(homology_data): | ||
681 | - with open(runDir + "/errors.txt", "a") as errf: | ||
682 | - errf.write(f"No homology data found in the database for {self.chain_label} ! Not replacing gaps.\n") | ||
683 | - return [] | ||
684 | - elif len(homology_data) != self.full_length: | ||
685 | - with open(runDir + "/errors.txt", "a") as errf: | ||
686 | - errf.write(f"Found {len(homology_data)} nucleotides for {self.chain_label} of length {self.full_length} ! Not replacing gaps.\n") | ||
687 | - return [] | ||
688 | - c_seq_to_align = list(self.seq_to_align) | ||
689 | - c_seq = list(self.seq) | ||
690 | - letters = ['A', 'C', 'G', 'U', 'N'] | ||
691 | - gaps = [] | ||
692 | - for i in range(self.full_length): | ||
693 | - if c_seq_to_align[i] == '-': # (then c_seq[i] also is) | ||
694 | - freq = homology_data[i] | ||
695 | - l = letters[freq.index(max(freq))] | ||
696 | - c_seq_to_align[i] = l | ||
697 | - c_seq[i] = l | ||
698 | - gaps.append((l, l == 'A', l == 'C', l == 'G', l == 'U', l == 'N', self.db_chain_id, i+1)) | ||
699 | - self.seq_to_align = ''.join(c_seq_to_align) | ||
700 | - self.seq = ''.join(c_seq) | ||
701 | - return gaps | ||
702 | - | ||
703 | 623 | ||
704 | class Job: | 624 | class Job: |
705 | """ This class contains information about a task to run later. | 625 | """ This class contains information about a task to run later. |
... | @@ -868,6 +788,7 @@ class Downloader: | ... | @@ -868,6 +788,7 @@ class Downloader: |
868 | print(d) | 788 | print(d) |
869 | 789 | ||
870 | with sqlite3.connect(runDir + "/results/RNANet.db", timeout=20.0) as conn: | 790 | with sqlite3.connect(runDir + "/results/RNANet.db", timeout=20.0) as conn: |
791 | + conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query | ||
871 | # We use the REPLACE keyword to get the latest information | 792 | # We use the REPLACE keyword to get the latest information |
872 | sql_execute(conn, """INSERT OR REPLACE INTO family (rfam_acc, description, max_len) | 793 | sql_execute(conn, """INSERT OR REPLACE INTO family (rfam_acc, description, max_len) |
873 | VALUES (?, ?, ?);""", | 794 | VALUES (?, ?, ?);""", |
... | @@ -1194,8 +1115,7 @@ class Pipeline: | ... | @@ -1194,8 +1115,7 @@ class Pipeline: |
1194 | setproctitle("RNANet.py list_available_mappings()") | 1115 | setproctitle("RNANet.py list_available_mappings()") |
1195 | 1116 | ||
1196 | # List all 3D RNA chains below given resolution | 1117 | # List all 3D RNA chains below given resolution |
1197 | - full_structures_list = self.dl.download_BGSU_NR_list( | 1118 | + full_structures_list = self.dl.download_BGSU_NR_list(self.CRYSTAL_RES) # list of tuples ( class, class_members ) |
1198 | - self.CRYSTAL_RES) # list of tuples ( class, class_members ) | ||
1199 | 1119 | ||
1200 | # Check for a list of known problems: | 1120 | # Check for a list of known problems: |
1201 | if os.path.isfile(runDir + "/known_issues.txt"): | 1121 | if os.path.isfile(runDir + "/known_issues.txt"): |
... | @@ -1209,11 +1129,13 @@ class Pipeline: | ... | @@ -1209,11 +1129,13 @@ class Pipeline: |
1209 | print(" ".join(self.known_issues)) | 1129 | print(" ".join(self.known_issues)) |
1210 | 1130 | ||
1211 | if self.HOMOLOGY: | 1131 | if self.HOMOLOGY: |
1212 | - # Ask Rfam if some are mapped to Rfam families | 1132 | + # Ask Rfam their mappings between PDB structures and Rfam families |
1213 | allmappings = self.dl.download_Rfam_PDB_mappings() | 1133 | allmappings = self.dl.download_Rfam_PDB_mappings() |
1214 | 1134 | ||
1215 | - # Compute the list of mappable structures using NR-list and Rfam-PDB mappings | 1135 | + # Compute the extended list of mappable structures using NR-list and Rfam-PDB mappings |
1216 | - # And get Chain() objects | 1136 | + # And get Chain() objects. |
1137 | + # If self.FULLINFERENCE is False, the extended list is already filtered to remove | ||
1138 | + # the chains which already are in the database. | ||
1217 | print("> Building list of structures...", flush=True) | 1139 | print("> Building list of structures...", flush=True) |
1218 | p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=ncores) | 1140 | p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=ncores) |
1219 | try: | 1141 | try: |
... | @@ -1243,6 +1165,7 @@ class Pipeline: | ... | @@ -1243,6 +1165,7 @@ class Pipeline: |
1243 | exit(1) | 1165 | exit(1) |
1244 | else: | 1166 | else: |
1245 | conn = sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) | 1167 | conn = sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) |
1168 | + conn.execute('pragma journal_mode=wal') | ||
1246 | for eq_class, codelist in tqdm(full_structures_list, desc="Eq. classes"): | 1169 | for eq_class, codelist in tqdm(full_structures_list, desc="Eq. classes"): |
1247 | codes = codelist.replace('+', ',').split(',') | 1170 | codes = codelist.replace('+', ',').split(',') |
1248 | 1171 | ||
... | @@ -1361,6 +1284,7 @@ class Pipeline: | ... | @@ -1361,6 +1284,7 @@ class Pipeline: |
1361 | kir.write(c[1].chain_label + '\n' + | 1284 | kir.write(c[1].chain_label + '\n' + |
1362 | c[1].error_messages + '\n\n') | 1285 | c[1].error_messages + '\n\n') |
1363 | with sqlite3.connect(runDir+"/results/RNANet.db") as conn: | 1286 | with sqlite3.connect(runDir+"/results/RNANet.db") as conn: |
1287 | + conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query | ||
1364 | sql_execute(conn, f"UPDATE chain SET issue = 1 WHERE chain_id = ?;", data=(c[1].db_chain_id,)) | 1288 | sql_execute(conn, f"UPDATE chain SET issue = 1 WHERE chain_id = ?;", data=(c[1].db_chain_id,)) |
1365 | ki.close() | 1289 | ki.close() |
1366 | kir.close() | 1290 | kir.close() |
... | @@ -1459,10 +1383,11 @@ class Pipeline: | ... | @@ -1459,10 +1383,11 @@ class Pipeline: |
1459 | else: | 1383 | else: |
1460 | nb_total_homol = len(align) | 1384 | nb_total_homol = len(align) |
1461 | nb_homologs = nb_total_homol - nb_3d_chains | 1385 | nb_homologs = nb_total_homol - nb_3d_chains |
1462 | - data.append((nb_homologs, nb_3d_chains, nb_total_homol, r[2], r[3], r[0])) | 1386 | + data.append((nb_homologs, nb_3d_chains, nb_total_homol, align.get_alignment_length(), r[2], r[3], r[0])) |
1463 | 1387 | ||
1464 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | 1388 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
1465 | - sql_execute(conn, """UPDATE family SET nb_homologs = ?, nb_3d_chains = ?, nb_total_homol = ?, comput_time = ?, comput_peak_mem = ? | 1389 | + conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query |
1390 | + sql_execute(conn, """UPDATE family SET nb_homologs = ?, nb_3d_chains = ?, nb_total_homol = ?, ali_len = ?, comput_time = ?, comput_peak_mem = ? | ||
1466 | WHERE rfam_acc = ?;""", many=True, data=data) | 1391 | WHERE rfam_acc = ?;""", many=True, data=data) |
1467 | 1392 | ||
1468 | def remap(self): | 1393 | def remap(self): |
... | @@ -1489,8 +1414,8 @@ class Pipeline: | ... | @@ -1489,8 +1414,8 @@ class Pipeline: |
1489 | 1414 | ||
1490 | try: | 1415 | try: |
1491 | fam_pbar = tqdm(total=len(self.fam_list), desc="RNA families", position=0, leave=True) | 1416 | fam_pbar = tqdm(total=len(self.fam_list), desc="RNA families", position=0, leave=True) |
1492 | - # Apply work_pssm to each RNA family | 1417 | + # Apply work_pssm_remap to each RNA family |
1493 | - for i, _ in enumerate(p.imap_unordered(partial(work_pssm, fill_gaps=self.FILL_GAPS), self.fam_list, chunksize=1)): | 1418 | + for i, _ in enumerate(p.imap_unordered(partial(work_pssm_remap, fill_gaps=self.FILL_GAPS), self.fam_list, chunksize=1)): |
1494 | # Everytime the iteration finishes on a family, update the global progress bar over the RNA families | 1419 | # Everytime the iteration finishes on a family, update the global progress bar over the RNA families |
1495 | fam_pbar.update(1) | 1420 | fam_pbar.update(1) |
1496 | fam_pbar.close() | 1421 | fam_pbar.close() |
... | @@ -1543,12 +1468,13 @@ class Pipeline: | ... | @@ -1543,12 +1468,13 @@ class Pipeline: |
1543 | runDir + f"/data/{f}_counts.csv"]) | 1468 | runDir + f"/data/{f}_counts.csv"]) |
1544 | 1469 | ||
1545 | # Run statistics files | 1470 | # Run statistics files |
1546 | - subprocess.run(["python3.8", fileDir+"/scripts/regression.py", runDir + "/results/RNANet.db"]) | 1471 | + subprocess.run([python_executable, fileDir+"/scripts/regression.py", runDir + "/results/RNANet.db"]) |
1547 | - subprocess.run(["python3.8", fileDir+"/statistics.py", "--3d-folder", path_to_3D_data, | 1472 | + subprocess.run([python_executable, fileDir+"/statistics.py", "--3d-folder", path_to_3D_data, |
1548 | "--seq-folder", path_to_seq_data, "-r", str(self.CRYSTAL_RES)]) | 1473 | "--seq-folder", path_to_seq_data, "-r", str(self.CRYSTAL_RES)]) |
1549 | 1474 | ||
1550 | # Save additional informations | 1475 | # Save additional informations |
1551 | with sqlite3.connect(runDir+"/results/RNANet.db") as conn: | 1476 | with sqlite3.connect(runDir+"/results/RNANet.db") as conn: |
1477 | + conn.execute('pragma journal_mode=wal') | ||
1552 | pd.read_sql_query("""SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem | 1478 | pd.read_sql_query("""SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem |
1553 | FROM family ORDER BY nb_3d_chains DESC;""", | 1479 | FROM family ORDER BY nb_3d_chains DESC;""", |
1554 | conn).to_csv(runDir + f"/results/families.csv", float_format="%.2f", index=False) | 1480 | conn).to_csv(runDir + f"/results/families.csv", float_format="%.2f", index=False) |
... | @@ -1571,6 +1497,7 @@ class Pipeline: | ... | @@ -1571,6 +1497,7 @@ class Pipeline: |
1571 | setproctitle("RNANet.py sanitize_database()") | 1497 | setproctitle("RNANet.py sanitize_database()") |
1572 | 1498 | ||
1573 | conn = sqlite3.connect(runDir + "/results/RNANet.db") | 1499 | conn = sqlite3.connect(runDir + "/results/RNANet.db") |
1500 | + conn.execute('pragma journal_mode=wal') | ||
1574 | 1501 | ||
1575 | # Assert every structure is used | 1502 | # Assert every structure is used |
1576 | r = sql_ask_database(conn, """SELECT DISTINCT pdb_id FROM structure WHERE pdb_id NOT IN (SELECT DISTINCT structure_id FROM chain);""") | 1503 | r = sql_ask_database(conn, """SELECT DISTINCT pdb_id FROM structure WHERE pdb_id NOT IN (SELECT DISTINCT structure_id FROM chain);""") |
... | @@ -1742,6 +1669,8 @@ def sql_define_tables(conn): | ... | @@ -1742,6 +1669,8 @@ def sql_define_tables(conn): |
1742 | nb_3d_chains INT, | 1669 | nb_3d_chains INT, |
1743 | nb_total_homol INT, | 1670 | nb_total_homol INT, |
1744 | max_len UNSIGNED SMALLINT, | 1671 | max_len UNSIGNED SMALLINT, |
1672 | + ali_len UNSIGNED SMALLINT, | ||
1673 | + ali_filtered_len UNSIGNED SMALLINT, | ||
1745 | comput_time REAL, | 1674 | comput_time REAL, |
1746 | comput_peak_mem REAL, | 1675 | comput_peak_mem REAL, |
1747 | idty_percent REAL | 1676 | idty_percent REAL |
... | @@ -1778,13 +1707,13 @@ def sql_ask_database(conn, sql, warn_every=10): | ... | @@ -1778,13 +1707,13 @@ def sql_ask_database(conn, sql, warn_every=10): |
1778 | warn(str(e) + ", retrying in 0.2s (worker " + | 1707 | warn(str(e) + ", retrying in 0.2s (worker " + |
1779 | str(os.getpid()) + f', try {_+1}/100)') | 1708 | str(os.getpid()) + f', try {_+1}/100)') |
1780 | time.sleep(0.2) | 1709 | time.sleep(0.2) |
1710 | + cursor.close() | ||
1781 | warn("Tried to reach database 100 times and failed. Aborting.", error=True) | 1711 | warn("Tried to reach database 100 times and failed. Aborting.", error=True) |
1782 | return [] | 1712 | return [] |
1783 | 1713 | ||
1784 | 1714 | ||
1785 | @trace_unhandled_exceptions | 1715 | @trace_unhandled_exceptions |
1786 | def sql_execute(conn, sql, many=False, data=None, warn_every=10): | 1716 | def sql_execute(conn, sql, many=False, data=None, warn_every=10): |
1787 | - conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query | ||
1788 | for _ in range(100): # retry 100 times if it fails | 1717 | for _ in range(100): # retry 100 times if it fails |
1789 | try: | 1718 | try: |
1790 | if many: | 1719 | if many: |
... | @@ -2071,6 +2000,7 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li | ... | @@ -2071,6 +2000,7 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li |
2071 | # Check if the chain exists in the database | 2000 | # Check if the chain exists in the database |
2072 | if update_only: | 2001 | if update_only: |
2073 | with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: | 2002 | with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: |
2003 | + conn.execute('pragma journal_mode=wal') | ||
2074 | res = sql_ask_database(conn, f"""SELECT chain_id from chain | 2004 | res = sql_ask_database(conn, f"""SELECT chain_id from chain |
2075 | WHERE structure_id='{pdb_id}' | 2005 | WHERE structure_id='{pdb_id}' |
2076 | AND chain_name='{pdb_chain_id}' | 2006 | AND chain_name='{pdb_chain_id}' |
... | @@ -2110,6 +2040,7 @@ def work_mmcif(pdb_id): | ... | @@ -2110,6 +2040,7 @@ def work_mmcif(pdb_id): |
2110 | 2040 | ||
2111 | # check if it exists in database | 2041 | # check if it exists in database |
2112 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | 2042 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
2043 | + conn.execute('pragma journal_mode=wal') | ||
2113 | r = sql_ask_database(conn, f"""SELECT * from structure where pdb_id = '{pdb_id}';""") | 2044 | r = sql_ask_database(conn, f"""SELECT * from structure where pdb_id = '{pdb_id}';""") |
2114 | 2045 | ||
2115 | # if not, read the CIF header and register the structure | 2046 | # if not, read the CIF header and register the structure |
... | @@ -2138,6 +2069,7 @@ def work_mmcif(pdb_id): | ... | @@ -2138,6 +2069,7 @@ def work_mmcif(pdb_id): |
2138 | 2069 | ||
2139 | # Save into the database | 2070 | # Save into the database |
2140 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | 2071 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
2072 | + conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query | ||
2141 | sql_execute(conn, """INSERT OR REPLACE INTO structure (pdb_id, pdb_model, date, exp_method, resolution) | 2073 | sql_execute(conn, """INSERT OR REPLACE INTO structure (pdb_id, pdb_model, date, exp_method, resolution) |
2142 | VALUES (?, ?, DATE(?), ?, ?);""", data=(pdb_id, 1, date, exp_meth, reso)) | 2074 | VALUES (?, ?, DATE(?), ?, ?);""", data=(pdb_id, 1, date, exp_meth, reso)) |
2143 | 2075 | ||
... | @@ -2181,9 +2113,10 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): | ... | @@ -2181,9 +2113,10 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): |
2181 | df = c.extract_3D_data(save_logs) | 2113 | df = c.extract_3D_data(save_logs) |
2182 | c.register_chain(df) | 2114 | c.register_chain(df) |
2183 | 2115 | ||
2184 | - # Small check | 2116 | + # Small check that all nucleotides of a chain have an entry in nucleotide table |
2185 | if not c.delete_me: | 2117 | if not c.delete_me: |
2186 | with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: | 2118 | with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: |
2119 | + conn.execute('pragma journal_mode=wal') | ||
2187 | nnts = sql_ask_database(conn, f"SELECT COUNT(index_chain) FROM nucleotide WHERE chain_id={c.db_chain_id};", warn_every=10)[0][0] | 2120 | nnts = sql_ask_database(conn, f"SELECT COUNT(index_chain) FROM nucleotide WHERE chain_id={c.db_chain_id};", warn_every=10)[0][0] |
2188 | if not(nnts): | 2121 | if not(nnts): |
2189 | warn(f"Nucleotides not inserted: {c.error_messages}") | 2122 | warn(f"Nucleotides not inserted: {c.error_messages}") |
... | @@ -2420,22 +2353,29 @@ def summarize_position(counts): | ... | @@ -2420,22 +2353,29 @@ def summarize_position(counts): |
2420 | 2353 | ||
2421 | 2354 | ||
2422 | @trace_unhandled_exceptions | 2355 | @trace_unhandled_exceptions |
2423 | -def work_pssm(f, fill_gaps): | 2356 | +def work_pssm_remap(f, fill_gaps): |
2424 | - """ Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family. | 2357 | + """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family. |
2358 | + This also remaps the 3D object sequence with the aligned sequence in the MSA. | ||
2359 | + If asked, the 3D object sequence is completed by the consensus nucleotide when one of them is missing. | ||
2425 | 2360 | ||
2426 | Uses only 1 core, so this function can be called in parallel. | 2361 | Uses only 1 core, so this function can be called in parallel. |
2427 | 2362 | ||
2428 | """ | 2363 | """ |
2429 | - setproctitle(f"RNAnet.py work_pssm({f})") | ||
2430 | 2364 | ||
2431 | # Get a worker number to position the progress bar | 2365 | # Get a worker number to position the progress bar |
2432 | global idxQueue | 2366 | global idxQueue |
2433 | thr_idx = idxQueue.get() | 2367 | thr_idx = idxQueue.get() |
2434 | 2368 | ||
2435 | - # get the chains of this family | 2369 | + # get the chains of this family in the update |
2436 | list_of_chains = rfam_acc_to_download[f] | 2370 | list_of_chains = rfam_acc_to_download[f] |
2437 | chains_ids = [str(c) for c in list_of_chains] | 2371 | chains_ids = [str(c) for c in list_of_chains] |
2438 | 2372 | ||
2373 | + ########################################################################################## | ||
2374 | + # Compute frequencies in the alignment | ||
2375 | + ########################################################################################## | ||
2376 | + | ||
2377 | + setproctitle(f"RNAnet.py work_pssm_remap({f}) compute PSSMs") | ||
2378 | + | ||
2439 | # Open the alignment | 2379 | # Open the alignment |
2440 | try: | 2380 | try: |
2441 | align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta") | 2381 | align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta") |
... | @@ -2450,33 +2390,92 @@ def work_pssm(f, fill_gaps): | ... | @@ -2450,33 +2390,92 @@ def work_pssm(f, fill_gaps): |
2450 | frequencies = [ summarize_position(pssm[i]) for i in range(align.get_alignment_length()) ] | 2390 | frequencies = [ summarize_position(pssm[i]) for i in range(align.get_alignment_length()) ] |
2451 | del pssm | 2391 | del pssm |
2452 | 2392 | ||
2393 | + ########################################################################################## | ||
2394 | + # Remap sequences of the 3D chains with sequences in the alignment | ||
2395 | + ########################################################################################## | ||
2396 | + | ||
2397 | + setproctitle(f"RNAnet.py work_pssm_remap({f}) remap") | ||
2398 | + | ||
2453 | # For each sequence, find the right chain and remap chain residues with alignment columns | 2399 | # For each sequence, find the right chain and remap chain residues with alignment columns |
2454 | columns_to_save = set() | 2400 | columns_to_save = set() |
2455 | re_mappings = [] | 2401 | re_mappings = [] |
2402 | + alilen = align.get_alignment_length() | ||
2456 | pbar = tqdm(total=len(chains_ids), position=thr_idx+1, desc=f"Worker {thr_idx+1}: Remap {f} chains", leave=False) | 2403 | pbar = tqdm(total=len(chains_ids), position=thr_idx+1, desc=f"Worker {thr_idx+1}: Remap {f} chains", leave=False) |
2457 | pbar.update(0) | 2404 | pbar.update(0) |
2458 | for s in align: | 2405 | for s in align: |
2459 | if not '[' in s.id: # this is a Rfamseq entry, not a 3D chain | 2406 | if not '[' in s.id: # this is a Rfamseq entry, not a 3D chain |
2460 | continue | 2407 | continue |
2461 | 2408 | ||
2462 | - try: | 2409 | + # Check if the chain existed before in the database |
2463 | - # get the right 3D chain: | 2410 | + if chains_ids.index(s.id) in list_of_chains.keys(): |
2464 | - if '|' in s.id: | 2411 | + # a chain object is found in the update, this sequence is new |
2465 | - # for some reason cmalign gets indexes|chainid in the FASTA headers sometimes. | 2412 | + this_chain = list_of_chains[chains_ids.index(s.id)] |
2466 | - # it is maybe when there are doublons ? Removing doublons takes too much time, | 2413 | + seq_to_align = this_chain.seq_to_align |
2467 | - # it is easier to parse the index|id formats. | 2414 | + full_length = this_chain.full_length |
2468 | - idx = chains_ids.index(s.id.split('|')[1]) | 2415 | + db_id = this_chain.db_chain_id |
2416 | + else: | ||
2417 | + # it existed in the database before. | ||
2418 | + this_chain = None | ||
2419 | + | ||
2420 | + # Get the chain id in the database | ||
2421 | + conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=10.0) | ||
2422 | + conn.execute('pragma journal_mode=wal') | ||
2423 | + db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = {s.id.split('[')[0]} AND chain_name = {s.id.split('-')[1]} AND rfam_acc = {f};") | ||
2424 | + if len(db_id): | ||
2425 | + db_id = db_id[0][0] | ||
2469 | else: | 2426 | else: |
2470 | - idx = chains_ids.index(s.id) | 2427 | + conn.close() |
2428 | + warn(f"Bizarre... sequence {s.id} is not found in the database ! Cannot remap it ! Ignoring...") | ||
2429 | + pbar.update(1) | ||
2430 | + continue | ||
2431 | + seq_to_align = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;")]) | ||
2432 | + full_length = len(seq_to_align) | ||
2471 | 2433 | ||
2472 | - # call its remap method | 2434 | + conn.close() |
2473 | - new_mappings, columns_to_save = list_of_chains[idx].remap(columns_to_save, s.seq) | ||
2474 | - re_mappings += new_mappings | ||
2475 | 2435 | ||
2476 | - except ValueError: | 2436 | + # Save columns in the appropriate positions |
2477 | - # with open(runDir + "/errors.txt", "a") as errf: | 2437 | + i = 0 # to iterate the object sequence |
2478 | - # errf.write(f"Chain {s.id} not found in list of chains to process. ignoring.\n") | 2438 | + j = 0 # to iterate the alignment sequence |
2479 | - pass | 2439 | + while i < full_length and j < alilen: |
2440 | + # Here we try to map seq_to_align (the sequence of the 3D chain, including gaps when residues are missing), | ||
2441 | + # with s.seq, the sequence aligned in the MSA, containing any of ACGU and two types of gaps, - and . | ||
2442 | + | ||
2443 | + if seq_to_align[i] == s.seq[j].upper(): # alignment and sequence correspond (incl. gaps) | ||
2444 | + re_mappings.append((db_id, i+1, j+1)) # because index_chain in table nucleotide is in [1,N], we use i+1 and j+1. | ||
2445 | + columns_to_save.add(j+1) # it's a set, duplicates are automatically ignored | ||
2446 | + i += 1 | ||
2447 | + j += 1 | ||
2448 | + elif seq_to_align[i] == '-': # gap in the chain, but not in the aligned sequence | ||
2449 | + # search for a gap to the consensus nearby | ||
2450 | + k = 0 # Search must start at zero to assert the difference comes from '-' in front of '.' | ||
2451 | + while j+k < alilen and s.seq[j+k] == '.': | ||
2452 | + k += 1 | ||
2453 | + | ||
2454 | + # if found, set j to that position | ||
2455 | + if j+k < alilen and s.seq[j+k] == '-': | ||
2456 | + re_mappings.append((db_id, i+1, j+k+1)) | ||
2457 | + columns_to_save.add(j+k+1) | ||
2458 | + i += 1 | ||
2459 | + j += k+1 | ||
2460 | + continue | ||
2461 | + | ||
2462 | + # if not, take the insertion gap if this is one | ||
2463 | + if j < alilen and s.seq[j] == '.': | ||
2464 | + re_mappings.append((db_id, i+1, j+1)) | ||
2465 | + columns_to_save.add(j+1) | ||
2466 | + i += 1 | ||
2467 | + j += 1 | ||
2468 | + continue | ||
2469 | + | ||
2470 | + # else, just mark the gap as unknown (there is an alignment mismatch) | ||
2471 | + re_mappings.append((db_id, i+1, 0)) | ||
2472 | + i += 1 | ||
2473 | + elif s.seq[j] in ['.', '-']: # gap in the alignment, but not in the real chain | ||
2474 | + j += 1 # ignore the column | ||
2475 | + else: # sequence mismatch which is not a gap... | ||
2476 | + print(f"You are never supposed to reach this. Comparing {self.chain_label} in {i} ({self.seq_to_align[i-1:i+2]}) with seq[{j}] ({s.seq[j-3:j+4]}).", | ||
2477 | + self.seq_to_align, s.seq, sep='\n', flush=True) | ||
2478 | + raise Exception('Something is wrong with sequence alignment.') | ||
2480 | 2479 | ||
2481 | pbar.update(1) | 2480 | pbar.update(1) |
2482 | pbar.close() | 2481 | pbar.close() |
... | @@ -2486,13 +2485,28 @@ def work_pssm(f, fill_gaps): | ... | @@ -2486,13 +2485,28 @@ def work_pssm(f, fill_gaps): |
2486 | warn(f"Chains were not found in {f}++.afa file: {chains_ids}", error=True) | 2485 | warn(f"Chains were not found in {f}++.afa file: {chains_ids}", error=True) |
2487 | return 1 | 2486 | return 1 |
2488 | 2487 | ||
2488 | + | ||
2489 | + ########################################################################################## | ||
2490 | + # Save the alignment columns and their mappings to the database | ||
2491 | + ########################################################################################## | ||
2492 | + | ||
2493 | + setproctitle(f"RNAnet.py work_pssm_remap({f}) saving") | ||
2494 | + | ||
2489 | # Save the re_mappings | 2495 | # Save the re_mappings |
2490 | conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0) | 2496 | conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0) |
2497 | + conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query | ||
2491 | sql_execute(conn, """INSERT INTO re_mapping (chain_id, index_chain, index_ali) | 2498 | sql_execute(conn, """INSERT INTO re_mapping (chain_id, index_chain, index_ali) |
2492 | VALUES (?, ?, ?) | 2499 | VALUES (?, ?, ?) |
2493 | ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;""", | 2500 | ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;""", |
2494 | many=True, data=re_mappings) | 2501 | many=True, data=re_mappings) |
2495 | 2502 | ||
2503 | + # Delete alignment columns that are not used anymore from the database | ||
2504 | + current_family_columns = [ x[0] for x in sql_ask_database(conn, f"SELECT index_ali FROM align_column WHERE rfam_acc = {f}";)] | ||
2505 | + unused = [] | ||
2506 | + for col in current_family_columns: | ||
2507 | + if col not in columns_to_save: | ||
2508 | + unused.append((f, col)) | ||
2509 | + sql_execute(conn, """DELETE FROM align_column WHERE rfam_acc = ? AND index_ali = ?;""", many=True, data=unused) | ||
2496 | # Save the useful columns in the database | 2510 | # Save the useful columns in the database |
2497 | data = [(f, j) + frequencies[j-1] for j in sorted(columns_to_save)] | 2511 | data = [(f, j) + frequencies[j-1] for j in sorted(columns_to_save)] |
2498 | sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other) | 2512 | sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other) |
... | @@ -2501,34 +2515,72 @@ def work_pssm(f, fill_gaps): | ... | @@ -2501,34 +2515,72 @@ def work_pssm(f, fill_gaps): |
2501 | # Add an unknown values column, with index_ali 0 | 2515 | # Add an unknown values column, with index_ali 0 |
2502 | sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other) | 2516 | sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other) |
2503 | VALUES (?, 0, 0.0, 0.0, 0.0, 0.0, 1.0);""", data=(f,)) | 2517 | VALUES (?, 0, 0.0, 0.0, 0.0, 0.0, 1.0);""", data=(f,)) |
2518 | + # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains) | ||
2519 | + sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f)) | ||
2520 | + conn.close() | ||
2521 | + | ||
2522 | + ########################################################################################## | ||
2523 | + # Replacing gaps in the 3D chains by consensus sequences | ||
2524 | + ########################################################################################## | ||
2525 | + | ||
2526 | + setproctitle(f"RNAnet.py work_pssm_remap({f}) replace gaps") | ||
2504 | 2527 | ||
2505 | # Replace gaps by consensus | 2528 | # Replace gaps by consensus |
2506 | if fill_gaps: | 2529 | if fill_gaps: |
2507 | pbar = tqdm(total=len(chains_ids), position=thr_idx+1, desc=f"Worker {thr_idx+1}: Replace {f} gaps", leave=False) | 2530 | pbar = tqdm(total=len(chains_ids), position=thr_idx+1, desc=f"Worker {thr_idx+1}: Replace {f} gaps", leave=False) |
2508 | pbar.update(0) | 2531 | pbar.update(0) |
2509 | gaps = [] | 2532 | gaps = [] |
2533 | + conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=10.0) | ||
2534 | + conn.execute('pragma journal_mode=wal') | ||
2510 | for s in align: | 2535 | for s in align: |
2511 | if not '[' in s.id: # this is a Rfamseq entry, not a 3D chain | 2536 | if not '[' in s.id: # this is a Rfamseq entry, not a 3D chain |
2512 | continue | 2537 | continue |
2513 | 2538 | ||
2514 | - try: | 2539 | + # get the right 3D chain: |
2515 | - # get the right 3D chain: | 2540 | + if chains_ids.index(s.id) in list_of_chains.keys(): |
2516 | - if '|' in s.id: | 2541 | + db_id = list_of_chains[chains_ids.index(s.id)].db_chain_id |
2517 | - idx = chains_ids.index(s.id.split('|')[1]) | 2542 | + seq = this_chain.seq |
2543 | + full_length = this_chain.full_length | ||
2544 | + else: | ||
2545 | + db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = {s.id.split('[')[0]} AND chain_name = {s.id.split('-')[1]} AND rfam_acc = {f};") | ||
2546 | + if len(db_id): | ||
2547 | + db_id = db_id[0][0] | ||
2518 | else: | 2548 | else: |
2519 | - idx = chains_ids.index(s.id) | 2549 | + pbar.update(1) |
2520 | - | 2550 | + continue |
2521 | - gaps += list_of_chains[idx].replace_gaps(conn) | 2551 | + seq = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;") ]) |
2522 | - except ValueError: | 2552 | + full_length = len(seq) |
2523 | - pass # We already printed a warning just above | 2553 | + |
2554 | + # detect gaps | ||
2555 | + c_seq = list(seq) # contains "ACGUNacgu-" | ||
2556 | + letters = ['A', 'C', 'G', 'U', 'N'] | ||
2557 | + homology_data = sql_ask_database(conn, f"""SELECT freq_A, freq_C, freq_G, freq_U, freq_other FROM | ||
2558 | + (SELECT chain_id, rfam_acc FROM chain WHERE chain_id={db_id}) | ||
2559 | + NATURAL JOIN re_mapping | ||
2560 | + NATURAL JOIN align_column; | ||
2561 | + """) | ||
2562 | + if homology_data is None or not len(homology_data): | ||
2563 | + with open(runDir + "/errors.txt", "a") as errf: | ||
2564 | + errf.write(f"No homology data found in the database for {s.id} ! Not replacing gaps.\n") | ||
2565 | + continue | ||
2566 | + elif len(homology_data) != full_length: | ||
2567 | + with open(runDir + "/errors.txt", "a") as errf: | ||
2568 | + errf.write(f"Found {len(homology_data)} nucleotides for {s.id} of length {full_length} ! Not replacing gaps.\n") | ||
2569 | + continue | ||
2570 | + for i in range(full_length): | ||
2571 | + if c_seq[i] == '-': | ||
2572 | + freq = homology_data[i] | ||
2573 | + l = letters[freq.index(max(freq))] | ||
2574 | + gaps.append((l, l == 'A', l == 'C', l == 'G', l == 'U', l == 'N', db_id, i+1)) | ||
2524 | pbar.update(1) | 2575 | pbar.update(1) |
2525 | - pbar.close() | ||
2526 | sql_execute(conn, f"""UPDATE nucleotide SET nt_align_code = ?, | 2576 | sql_execute(conn, f"""UPDATE nucleotide SET nt_align_code = ?, |
2527 | is_A = ?, is_C = ?, is_G = ?, is_U = ?, is_other = ? | 2577 | is_A = ?, is_C = ?, is_G = ?, is_U = ?, is_other = ? |
2528 | WHERE chain_id = ? AND index_chain = ?;""", many=True, data=gaps) | 2578 | WHERE chain_id = ? AND index_chain = ?;""", many=True, data=gaps) |
2529 | - | 2579 | + conn.close() |
2530 | - conn.close() | ||
2531 | idxQueue.put(thr_idx) # replace the thread index in the queue | 2580 | idxQueue.put(thr_idx) # replace the thread index in the queue |
2581 | + | ||
2582 | + setproctitle(f"RNAnet.py work_pssm_remap({f}) finished") | ||
2583 | + | ||
2532 | return 0 | 2584 | return 0 |
2533 | 2585 | ||
2534 | 2586 | ||
... | @@ -2538,6 +2590,7 @@ def work_save(c, homology=True): | ... | @@ -2538,6 +2590,7 @@ def work_save(c, homology=True): |
2538 | setproctitle(f"RNAnet.py work_save({c.chain_label})") | 2590 | setproctitle(f"RNAnet.py work_save({c.chain_label})") |
2539 | 2591 | ||
2540 | conn = sqlite3.connect(runDir + "/results/RNANet.db", timeout=15.0) | 2592 | conn = sqlite3.connect(runDir + "/results/RNANet.db", timeout=15.0) |
2593 | + conn.execute('pragma journal_mode=wal') | ||
2541 | if homology: | 2594 | if homology: |
2542 | df = pd.read_sql_query(f""" | 2595 | df = pd.read_sql_query(f""" |
2543 | SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, | 2596 | SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, |
... | @@ -2571,6 +2624,7 @@ if __name__ == "__main__": | ... | @@ -2571,6 +2624,7 @@ if __name__ == "__main__": |
2571 | runDir = os.getcwd() | 2624 | runDir = os.getcwd() |
2572 | fileDir = os.path.dirname(os.path.realpath(__file__)) | 2625 | fileDir = os.path.dirname(os.path.realpath(__file__)) |
2573 | ncores = read_cpu_number() | 2626 | ncores = read_cpu_number() |
2627 | + print(f"> Running {python_executable} on {ncores} CPU cores in folder {runDir}.") | ||
2574 | pp = Pipeline() | 2628 | pp = Pipeline() |
2575 | pp.process_options() | 2629 | pp.process_options() |
2576 | 2630 | ||
... | @@ -2584,7 +2638,9 @@ if __name__ == "__main__": | ... | @@ -2584,7 +2638,9 @@ if __name__ == "__main__": |
2584 | sql_define_tables(conn) | 2638 | sql_define_tables(conn) |
2585 | print("> Storing results into", runDir + "/results/RNANet.db") | 2639 | print("> Storing results into", runDir + "/results/RNANet.db") |
2586 | 2640 | ||
2587 | - # compute an update compared to what is in the table "chain" | 2641 | + # compute an update compared to what is in the table "chain" (comparison on structure_id + chain_name + rfam_acc). |
2642 | + # If --all was passed, all the structures are kept. | ||
2643 | + # Fills pp.update with Chain() objects. | ||
2588 | pp.list_available_mappings() | 2644 | pp.list_available_mappings() |
2589 | 2645 | ||
2590 | # =========================================================================== | 2646 | # =========================================================================== |
... | @@ -2592,10 +2648,13 @@ if __name__ == "__main__": | ... | @@ -2592,10 +2648,13 @@ if __name__ == "__main__": |
2592 | # =========================================================================== | 2648 | # =========================================================================== |
2593 | 2649 | ||
2594 | # Download and annotate new RNA 3D chains (Chain objects in pp.update) | 2650 | # Download and annotate new RNA 3D chains (Chain objects in pp.update) |
2651 | + # If the original cif file and/or the Json DSSR annotation file already exist, they are not redownloaded/recomputed. | ||
2595 | pp.dl_and_annotate(coeff_ncores=0.5) | 2652 | pp.dl_and_annotate(coeff_ncores=0.5) |
2596 | print("Here we go.") | 2653 | print("Here we go.") |
2597 | 2654 | ||
2598 | - # At this point, the structure table is up to date | 2655 | + # At this point, the structure table is up to date. |
2656 | + # Now save the DSSR annotations to the database. | ||
2657 | + # Extract the 3D chains to separate structure files if asked with --extract. | ||
2599 | pp.build_chains(coeff_ncores=1.0) | 2658 | pp.build_chains(coeff_ncores=1.0) |
2600 | 2659 | ||
2601 | if len(pp.to_retry): | 2660 | if len(pp.to_retry): |
... | @@ -2629,10 +2688,10 @@ if __name__ == "__main__": | ... | @@ -2629,10 +2688,10 @@ if __name__ == "__main__": |
2629 | # If your job failed, you can comment all the "3D information" part and start from here. | 2688 | # If your job failed, you can comment all the "3D information" part and start from here. |
2630 | pp.checkpoint_load_chains() | 2689 | pp.checkpoint_load_chains() |
2631 | 2690 | ||
2632 | - # Get the list of Rfam families found | 2691 | + # Get the list of Rfam families found in the update |
2633 | rfam_acc_to_download = {} | 2692 | rfam_acc_to_download = {} |
2634 | for c in pp.loaded_chains: | 2693 | for c in pp.loaded_chains: |
2635 | - if c.mapping.rfam_acc not in rfam_acc_to_download: | 2694 | + if c.mapping.rfam_acc not in rfam_acc_to_download.keys(): |
2636 | rfam_acc_to_download[c.mapping.rfam_acc] = [c] | 2695 | rfam_acc_to_download[c.mapping.rfam_acc] = [c] |
2637 | else: | 2696 | else: |
2638 | rfam_acc_to_download[c.mapping.rfam_acc].append(c) | 2697 | rfam_acc_to_download[c.mapping.rfam_acc].append(c) |
... | @@ -2644,7 +2703,8 @@ if __name__ == "__main__": | ... | @@ -2644,7 +2703,8 @@ if __name__ == "__main__": |
2644 | pp.prepare_sequences() | 2703 | pp.prepare_sequences() |
2645 | pp.realign() | 2704 | pp.realign() |
2646 | 2705 | ||
2647 | - # At this point, the family table is up to date | 2706 | + # At this point, the family table is almost up to date |
2707 | + # (lacking idty_percent and ali_filtered_length, both set in statistics.py) | ||
2648 | 2708 | ||
2649 | thr_idx_mgr = Manager() | 2709 | thr_idx_mgr = Manager() |
2650 | idxQueue = thr_idx_mgr.Queue() | 2710 | idxQueue = thr_idx_mgr.Queue() | ... | ... |
... | @@ -4,7 +4,7 @@ cd /home/lbecquey/Projects/RNANet | ... | @@ -4,7 +4,7 @@ cd /home/lbecquey/Projects/RNANet |
4 | rm -rf latest_run.log errors.txt | 4 | rm -rf latest_run.log errors.txt |
5 | 5 | ||
6 | # Run RNANet | 6 | # Run RNANet |
7 | -bash -c 'time ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 --extract -s --archive' > latest_run.log 2>&1 | 7 | +bash -c 'time python3.8 /RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 --extract -s --archive' > latest_run.log 2>&1 |
8 | echo 'Compressing RNANet.db.gz...' >> latest_run.log | 8 | echo 'Compressing RNANet.db.gz...' >> latest_run.log |
9 | touch results/RNANet.db # update last modification date | 9 | touch results/RNANet.db # update last modification date |
10 | gzip -k /home/lbecquey/Projects/RNANet/results/RNANet.db # compress it | 10 | gzip -k /home/lbecquey/Projects/RNANet/results/RNANet.db # compress it | ... | ... |
... | @@ -417,7 +417,10 @@ def parallel_stats_pairs(f): | ... | @@ -417,7 +417,10 @@ def parallel_stats_pairs(f): |
417 | def to_id_matrix(f): | 417 | def to_id_matrix(f): |
418 | """ | 418 | """ |
419 | Extracts sequences of 3D chains from the family alignments to a distinct STK file, | 419 | Extracts sequences of 3D chains from the family alignments to a distinct STK file, |
420 | - then runs esl-alipid on it to get an identity matrix | 420 | + then runs esl-alipid on it to get an identity matrix. |
421 | + | ||
422 | + Side-effect : also produces the 3D_only family alignment as a separate file. | ||
423 | + So, we use this function to update 'ali_filtered_length' in the family table. | ||
421 | """ | 424 | """ |
422 | if path.isfile("data/"+f+".npy"): | 425 | if path.isfile("data/"+f+".npy"): |
423 | return 0 | 426 | return 0 |
... | @@ -442,7 +445,14 @@ def to_id_matrix(f): | ... | @@ -442,7 +445,14 @@ def to_id_matrix(f): |
442 | subprocess.run(["esl-reformat", "--informat", "stockholm", "--mingap", # | 445 | subprocess.run(["esl-reformat", "--informat", "stockholm", "--mingap", # |
443 | "-o", path_to_seq_data+f"/realigned/{f}_3d_only.stk", # This run just deletes columns of gaps | 446 | "-o", path_to_seq_data+f"/realigned/{f}_3d_only.stk", # This run just deletes columns of gaps |
444 | "stockholm", path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk"]) # | 447 | "stockholm", path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk"]) # |
445 | - subprocess.run(["rm", "-f", f + "_3d_only_tmp.stk"]) | 448 | + subprocess.run(["rm", "-f", f + "_3d_only_tmp.stk", f + "_3d_only.stk"]) |
449 | + subprocess.run(["esl-reformat", "-o", path_to_seq_data+f"/realigned/{f}_3d_only.afa", "afa", path_to_seq_data+f"/realigned/{f}_3d_only.stk"]) | ||
450 | + | ||
451 | + # Out-of-scope task : update the database with the length of the filtered alignment: | ||
452 | + align = AlignIO.read(path_to_seq_data+f"/realigned/{f}_3d_only.afa", "fasta") | ||
453 | + with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | ||
454 | + sql_execute(conn, """UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;""", many=True, data=(align.get_alignment_length(), f)) | ||
455 | + del align | ||
446 | 456 | ||
447 | # Prepare the job | 457 | # Prepare the job |
448 | process = subprocess.Popen(shlex.split(f"esl-alipid --rna --noheader --informat stockholm {path_to_seq_data}realigned/{f}_3d_only.stk"), | 458 | process = subprocess.Popen(shlex.split(f"esl-alipid --rna --noheader --informat stockholm {path_to_seq_data}realigned/{f}_3d_only.stk"), | ... | ... |
-
Please register or login to post a comment