Louis BECQUEY

reduced remap() I/O

Showing 1 changed file with 26 additions and 10 deletions
...@@ -516,7 +516,7 @@ class Chain: ...@@ -516,7 +516,7 @@ class Chain:
516 """ Replace gapped positions by the consensus sequence. 516 """ Replace gapped positions by the consensus sequence.
517 517
518 REQUIRES align_column and re_mapping up to date 518 REQUIRES align_column and re_mapping up to date
519 - SETS nucleotide (nt_align_code)""" 519 + """
520 520
521 homology_data = sql_ask_database(conn, f"""SELECT freq_A, freq_C, freq_G, freq_U, freq_other FROM 521 homology_data = sql_ask_database(conn, f"""SELECT freq_A, freq_C, freq_G, freq_U, freq_other FROM
522 (SELECT chain_id, rfam_acc FROM chain WHERE chain_id={self.db_chain_id}) 522 (SELECT chain_id, rfam_acc FROM chain WHERE chain_id={self.db_chain_id})
...@@ -526,24 +526,25 @@ class Chain: ...@@ -526,24 +526,25 @@ class Chain:
526 if homology_data is None or not len(homology_data): 526 if homology_data is None or not len(homology_data):
527 with open(runDir + "/errors.txt", "a") as errf: 527 with open(runDir + "/errors.txt", "a") as errf:
528 errf.write(f"No homology data found in the database for {self.chain_label} ! Not replacing gaps.\n") 528 errf.write(f"No homology data found in the database for {self.chain_label} ! Not replacing gaps.\n")
529 - return 529 + return []
530 elif len(homology_data) != self.full_length: 530 elif len(homology_data) != self.full_length:
531 with open(runDir + "/errors.txt", "a") as errf: 531 with open(runDir + "/errors.txt", "a") as errf:
532 errf.write(f"Found {len(homology_data)} nucleotides for {self.chain_label} of length {self.full_length} ! Not replacing gaps.\n") 532 errf.write(f"Found {len(homology_data)} nucleotides for {self.chain_label} of length {self.full_length} ! Not replacing gaps.\n")
533 - return 533 + return []
534 c_seq_to_align = list(self.seq_to_align) 534 c_seq_to_align = list(self.seq_to_align)
535 c_seq = list(self.seq) 535 c_seq = list(self.seq)
536 letters = ['A', 'C', 'G', 'U', 'N'] 536 letters = ['A', 'C', 'G', 'U', 'N']
537 + gaps = []
537 for i in range(self.full_length): 538 for i in range(self.full_length):
538 if c_seq_to_align[i] == '-': # (then c_seq[i] also is) 539 if c_seq_to_align[i] == '-': # (then c_seq[i] also is)
539 freq = homology_data[i] 540 freq = homology_data[i]
540 l = letters[freq.index(max(freq))] 541 l = letters[freq.index(max(freq))]
541 c_seq_to_align[i] = l 542 c_seq_to_align[i] = l
542 c_seq[i] = l 543 c_seq[i] = l
543 - sql_execute(conn, f"""UPDATE nucleotide SET nt_align_code = '{l}', is_{l if l in "ACGU" else "other"} = 1 544 + gaps.append((l, l=='A', l=='C', l=='G', l=='U', l=='N', self.db_chain_id, i+1 ))
544 - WHERE chain_id = {self.db_chain_id} AND index_chain = {i+1};""")
545 self.seq_to_align = ''.join(c_seq_to_align) 545 self.seq_to_align = ''.join(c_seq_to_align)
546 self.seq = ''.join(c_seq) 546 self.seq = ''.join(c_seq)
547 + return gaps
547 548
548 549
549 class Job: 550 class Job:
...@@ -1148,7 +1149,8 @@ class Pipeline: ...@@ -1148,7 +1149,8 @@ class Pipeline:
1148 1149
1149 REQUIRES self.fam_list to be defined""" 1150 REQUIRES self.fam_list to be defined"""
1150 1151
1151 - print("Computing nucleotide frequencies in alignments...") 1152 + print("Computing nucleotide frequencies in alignments...\nThis can be very long on slow storage devices (Hard-drive...)")
1153 + print("Check your CPU and disk I/O activity before deciding if the job failed.")
1152 nworkers =max(min(ncores, len(self.fam_list)), 1) 1154 nworkers =max(min(ncores, len(self.fam_list)), 1)
1153 1155
1154 # Prepare the architecture of a shiny multi-progress-bars design 1156 # Prepare the architecture of a shiny multi-progress-bars design
...@@ -1987,6 +1989,7 @@ def work_pssm(f, fill_gaps): ...@@ -1987,6 +1989,7 @@ def work_pssm(f, fill_gaps):
1987 1989
1988 Also saves every chain of the family to file. 1990 Also saves every chain of the family to file.
1989 Uses only 1 core, so this function can be called in parallel. 1991 Uses only 1 core, so this function can be called in parallel.
1992 +
1990 """ 1993 """
1991 1994
1992 # Get a worker number to position the progress bar 1995 # Get a worker number to position the progress bar
...@@ -2062,16 +2065,29 @@ def work_pssm(f, fill_gaps): ...@@ -2062,16 +2065,29 @@ def work_pssm(f, fill_gaps):
2062 2065
2063 # Replace gaps by consensus 2066 # Replace gaps by consensus
2064 if fill_gaps: 2067 if fill_gaps:
2068 + pbar = tqdm(total=len(chains_ids), position=thr_idx+1, desc=f"Worker {thr_idx+1}: Replace {f} gaps", leave=False)
2069 + pbar.update(0)
2070 + gaps = []
2065 for s in align: 2071 for s in align:
2066 if not '[' in s.id: # this is a Rfamseq entry, not a 3D chain 2072 if not '[' in s.id: # this is a Rfamseq entry, not a 3D chain
2067 continue 2073 continue
2068 2074
2069 try: 2075 try:
2070 - idx = chains_ids.index(s.id) 2076 + # get the right 3D chain:
2071 - list_of_chains[idx].replace_gaps(conn) 2077 + if '|' in s.id:
2078 + idx = chains_ids.index(s.id.split('|')[1])
2079 + else:
2080 + idx = chains_ids.index(s.id)
2081 +
2082 + gaps += list_of_chains[idx].replace_gaps(conn)
2072 except ValueError: 2083 except ValueError:
2073 pass # We already printed a warning just above 2084 pass # We already printed a warning just above
2074 - 2085 + pbar.update(1)
2086 + pbar.close()
2087 + sql_execute(conn, f"""UPDATE nucleotide SET nt_align_code = ?,
2088 + is_A = ?, is_C = ?, is_G = ?, is_U = ?, is_other = ?
2089 + WHERE chain_id = ? AND index_chain = ?;""", many=True, data = gaps)
2090 +
2075 conn.close() 2091 conn.close()
2076 idxQueue.put(thr_idx) # replace the thread index in the queue 2092 idxQueue.put(thr_idx) # replace the thread index in the queue
2077 return 0 2093 return 0
...@@ -2158,7 +2174,7 @@ if __name__ == "__main__": ...@@ -2158,7 +2174,7 @@ if __name__ == "__main__":
2158 # Homology information 2174 # Homology information
2159 # =========================================================================== 2175 # ===========================================================================
2160 2176
2161 - pp.checkpoint_load_chains() 2177 + pp.checkpoint_load_chains() # If your job failed, you can comment all the "3D information" part and start from here.
2162 2178
2163 # Get the list of Rfam families found 2179 # Get the list of Rfam families found
2164 rfam_acc_to_download = {} 2180 rfam_acc_to_download = {}
......