Showing
1 changed file
with
24 additions
and
8 deletions
... | @@ -516,7 +516,7 @@ class Chain: | ... | @@ -516,7 +516,7 @@ class Chain: |
516 | """ Replace gapped positions by the consensus sequence. | 516 | """ Replace gapped positions by the consensus sequence. |
517 | 517 | ||
518 | REQUIRES align_column and re_mapping up to date | 518 | REQUIRES align_column and re_mapping up to date |
519 | - SETS nucleotide (nt_align_code)""" | 519 | + """ |
520 | 520 | ||
521 | homology_data = sql_ask_database(conn, f"""SELECT freq_A, freq_C, freq_G, freq_U, freq_other FROM | 521 | homology_data = sql_ask_database(conn, f"""SELECT freq_A, freq_C, freq_G, freq_U, freq_other FROM |
522 | (SELECT chain_id, rfam_acc FROM chain WHERE chain_id={self.db_chain_id}) | 522 | (SELECT chain_id, rfam_acc FROM chain WHERE chain_id={self.db_chain_id}) |
... | @@ -526,24 +526,25 @@ class Chain: | ... | @@ -526,24 +526,25 @@ class Chain: |
526 | if homology_data is None or not len(homology_data): | 526 | if homology_data is None or not len(homology_data): |
527 | with open(runDir + "/errors.txt", "a") as errf: | 527 | with open(runDir + "/errors.txt", "a") as errf: |
528 | errf.write(f"No homology data found in the database for {self.chain_label} ! Not replacing gaps.\n") | 528 | errf.write(f"No homology data found in the database for {self.chain_label} ! Not replacing gaps.\n") |
529 | - return | 529 | + return [] |
530 | elif len(homology_data) != self.full_length: | 530 | elif len(homology_data) != self.full_length: |
531 | with open(runDir + "/errors.txt", "a") as errf: | 531 | with open(runDir + "/errors.txt", "a") as errf: |
532 | errf.write(f"Found {len(homology_data)} nucleotides for {self.chain_label} of length {self.full_length} ! Not replacing gaps.\n") | 532 | errf.write(f"Found {len(homology_data)} nucleotides for {self.chain_label} of length {self.full_length} ! Not replacing gaps.\n") |
533 | - return | 533 | + return [] |
534 | c_seq_to_align = list(self.seq_to_align) | 534 | c_seq_to_align = list(self.seq_to_align) |
535 | c_seq = list(self.seq) | 535 | c_seq = list(self.seq) |
536 | letters = ['A', 'C', 'G', 'U', 'N'] | 536 | letters = ['A', 'C', 'G', 'U', 'N'] |
537 | + gaps = [] | ||
537 | for i in range(self.full_length): | 538 | for i in range(self.full_length): |
538 | if c_seq_to_align[i] == '-': # (then c_seq[i] also is) | 539 | if c_seq_to_align[i] == '-': # (then c_seq[i] also is) |
539 | freq = homology_data[i] | 540 | freq = homology_data[i] |
540 | l = letters[freq.index(max(freq))] | 541 | l = letters[freq.index(max(freq))] |
541 | c_seq_to_align[i] = l | 542 | c_seq_to_align[i] = l |
542 | c_seq[i] = l | 543 | c_seq[i] = l |
543 | - sql_execute(conn, f"""UPDATE nucleotide SET nt_align_code = '{l}', is_{l if l in "ACGU" else "other"} = 1 | 544 | + gaps.append((l, l=='A', l=='C', l=='G', l=='U', l=='N', self.db_chain_id, i+1 )) |
544 | - WHERE chain_id = {self.db_chain_id} AND index_chain = {i+1};""") | ||
545 | self.seq_to_align = ''.join(c_seq_to_align) | 545 | self.seq_to_align = ''.join(c_seq_to_align) |
546 | self.seq = ''.join(c_seq) | 546 | self.seq = ''.join(c_seq) |
547 | + return gaps | ||
547 | 548 | ||
548 | 549 | ||
549 | class Job: | 550 | class Job: |
... | @@ -1148,7 +1149,8 @@ class Pipeline: | ... | @@ -1148,7 +1149,8 @@ class Pipeline: |
1148 | 1149 | ||
1149 | REQUIRES self.fam_list to be defined""" | 1150 | REQUIRES self.fam_list to be defined""" |
1150 | 1151 | ||
1151 | - print("Computing nucleotide frequencies in alignments...") | 1152 | + print("Computing nucleotide frequencies in alignments...\nThis can be very long on slow storage devices (Hard-drive...)") |
1153 | + print("Check your CPU and disk I/O activity before deciding if the job failed.") | ||
1152 | nworkers =max(min(ncores, len(self.fam_list)), 1) | 1154 | nworkers =max(min(ncores, len(self.fam_list)), 1) |
1153 | 1155 | ||
1154 | # Prepare the architecture of a shiny multi-progress-bars design | 1156 | # Prepare the architecture of a shiny multi-progress-bars design |
... | @@ -1987,6 +1989,7 @@ def work_pssm(f, fill_gaps): | ... | @@ -1987,6 +1989,7 @@ def work_pssm(f, fill_gaps): |
1987 | 1989 | ||
1988 | Also saves every chain of the family to file. | 1990 | Also saves every chain of the family to file. |
1989 | Uses only 1 core, so this function can be called in parallel. | 1991 | Uses only 1 core, so this function can be called in parallel. |
1992 | + | ||
1990 | """ | 1993 | """ |
1991 | 1994 | ||
1992 | # Get a worker number to position the progress bar | 1995 | # Get a worker number to position the progress bar |
... | @@ -2062,15 +2065,28 @@ def work_pssm(f, fill_gaps): | ... | @@ -2062,15 +2065,28 @@ def work_pssm(f, fill_gaps): |
2062 | 2065 | ||
2063 | # Replace gaps by consensus | 2066 | # Replace gaps by consensus |
2064 | if fill_gaps: | 2067 | if fill_gaps: |
2068 | + pbar = tqdm(total=len(chains_ids), position=thr_idx+1, desc=f"Worker {thr_idx+1}: Replace {f} gaps", leave=False) | ||
2069 | + pbar.update(0) | ||
2070 | + gaps = [] | ||
2065 | for s in align: | 2071 | for s in align: |
2066 | if not '[' in s.id: # this is a Rfamseq entry, not a 3D chain | 2072 | if not '[' in s.id: # this is a Rfamseq entry, not a 3D chain |
2067 | continue | 2073 | continue |
2068 | 2074 | ||
2069 | try: | 2075 | try: |
2076 | + # get the right 3D chain: | ||
2077 | + if '|' in s.id: | ||
2078 | + idx = chains_ids.index(s.id.split('|')[1]) | ||
2079 | + else: | ||
2070 | idx = chains_ids.index(s.id) | 2080 | idx = chains_ids.index(s.id) |
2071 | - list_of_chains[idx].replace_gaps(conn) | 2081 | + |
2082 | + gaps += list_of_chains[idx].replace_gaps(conn) | ||
2072 | except ValueError: | 2083 | except ValueError: |
2073 | pass # We already printed a warning just above | 2084 | pass # We already printed a warning just above |
2085 | + pbar.update(1) | ||
2086 | + pbar.close() | ||
2087 | + sql_execute(conn, f"""UPDATE nucleotide SET nt_align_code = ?, | ||
2088 | + is_A = ?, is_C = ?, is_G = ?, is_U = ?, is_other = ? | ||
2089 | + WHERE chain_id = ? AND index_chain = ?;""", many=True, data = gaps) | ||
2074 | 2090 | ||
2075 | conn.close() | 2091 | conn.close() |
2076 | idxQueue.put(thr_idx) # replace the thread index in the queue | 2092 | idxQueue.put(thr_idx) # replace the thread index in the queue |
... | @@ -2158,7 +2174,7 @@ if __name__ == "__main__": | ... | @@ -2158,7 +2174,7 @@ if __name__ == "__main__": |
2158 | # Homology information | 2174 | # Homology information |
2159 | # =========================================================================== | 2175 | # =========================================================================== |
2160 | 2176 | ||
2161 | - pp.checkpoint_load_chains() | 2177 | + pp.checkpoint_load_chains() # If your job failed, you can comment all the "3D information" part and start from here. |
2162 | 2178 | ||
2163 | # Get the list of Rfam families found | 2179 | # Get the list of Rfam families found |
2164 | rfam_acc_to_download = {} | 2180 | rfam_acc_to_download = {} | ... | ... |
-
Please register or login to post a comment