Louis BECQUEY

Support mapping multiple portions of the same chain to one family

...@@ -540,6 +540,7 @@ class Chain: ...@@ -540,6 +540,7 @@ class Chain:
540 self.seq_to_align = ''.join(c_seq_to_align) 540 self.seq_to_align = ''.join(c_seq_to_align)
541 self.seq = ''.join(c_seq) 541 self.seq = ''.join(c_seq)
542 542
543 +
543 class Job: 544 class Job:
544 """ This class contains information about a task to run later. 545 """ This class contains information about a task to run later.
545 546
...@@ -574,6 +575,7 @@ class Job: ...@@ -574,6 +575,7 @@ class Job:
574 s = f"{self.priority_}({self.nthreads}) [{self.comp_time}]\t{self.label:25}{self.func_.__name__}(" + " ".join([str(a) for a in self.args_]) + ")" 575 s = f"{self.priority_}({self.nthreads}) [{self.comp_time}]\t{self.label:25}{self.func_.__name__}(" + " ".join([str(a) for a in self.args_]) + ")"
575 return s 576 return s
576 577
578 +
577 class Monitor: 579 class Monitor:
578 """ A job that simply watches the memory usage of another process. 580 """ A job that simply watches the memory usage of another process.
579 581
...@@ -1681,10 +1683,16 @@ def work_infer_mappings(update_only, allmappings, codelist): ...@@ -1681,10 +1683,16 @@ def work_infer_mappings(update_only, allmappings, codelist):
1681 for rfam in families: 1683 for rfam in families:
1682 # if a known mapping of this chain on this family exists, apply it 1684 # if a known mapping of this chain on this family exists, apply it
1683 m = known_mappings.loc[ (known_mappings.pdb_id + "|1|" + known_mappings.chain == c[:4].lower()+c[4:]) & (known_mappings['rfam_acc'] == rfam ) ] 1685 m = known_mappings.loc[ (known_mappings.pdb_id + "|1|" + known_mappings.chain == c[:4].lower()+c[4:]) & (known_mappings['rfam_acc'] == rfam ) ]
1684 - if len(m): 1686 + if len(m) and len(m) < 2:
1685 pdb_start = int(m.pdb_start) 1687 pdb_start = int(m.pdb_start)
1686 pdb_end = int(m.pdb_end) 1688 pdb_end = int(m.pdb_end)
1687 inferred = False 1689 inferred = False
1690 + elif len(m):
1691 + # two different parts of the same chain are mapped to the same family... (ex: 6ek0-L5)
1692 + # ==> map the whole chain to that family, not the parts
1693 + pdb_start = int(m.pdb_start.min())
1694 + pdb_end = int(m.pdb_end.max())
1695 + inferred = False
1688 else: # otherwise, use the inferred mapping 1696 else: # otherwise, use the inferred mapping
1689 pdb_start = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_start) 1697 pdb_start = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_start)
1690 pdb_end = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_end) 1698 pdb_end = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_end)
...@@ -2114,7 +2122,7 @@ if __name__ == "__main__": ...@@ -2114,7 +2122,7 @@ if __name__ == "__main__":
2114 2122
2115 # At this point, the structure table is up to date 2123 # At this point, the structure table is up to date
2116 2124
2117 - pp.build_chains(coeff_ncores=2.0) 2125 + pp.build_chains(coeff_ncores=1.0)
2118 if len(pp.to_retry): 2126 if len(pp.to_retry):
2119 # Redownload and re-annotate 2127 # Redownload and re-annotate
2120 print("> Retrying to annotate some structures which just failed.", flush=True) 2128 print("> Retrying to annotate some structures which just failed.", flush=True)
......
...@@ -87,6 +87,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): ...@@ -87,6 +87,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
87 kernel_c2 = st.gaussian_kde(values_c2) 87 kernel_c2 = st.gaussian_kde(values_c2)
88 f_c2 = np.reshape(kernel_c2(positions).T, xx.shape) 88 f_c2 = np.reshape(kernel_c2(positions).T, xx.shape)
89 89
90 +
90 # Uncomment to save the data to an archive for later use without the need to recompute 91 # Uncomment to save the data to an archive for later use without the need to recompute
91 np.savez(f"data/wadley_kernel_{angle}.npz", 92 np.savez(f"data/wadley_kernel_{angle}.npz",
92 c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas, 93 c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas,
...@@ -516,16 +517,16 @@ if __name__ == "__main__": ...@@ -516,16 +517,16 @@ if __name__ == "__main__":
516 mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain WHERE rfam_acc='{k}';") ] 517 mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain WHERE rfam_acc='{k}';") ]
517 conn.close() 518 conn.close()
518 519
519 - stats_pairs() 520 + # stats_pairs()
520 521
521 # Define threads for the tasks 522 # Define threads for the tasks
522 threads = [ 523 threads = [
523 th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 1}), 524 th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 1}),
524 th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 4}), 525 th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 4}),
525 - th.Thread(target=stats_len), 526 + # th.Thread(target=stats_len),
526 - th.Thread(target=stats_freq), 527 + # th.Thread(target=stats_freq),
527 - th.Thread(target=seq_idty), 528 + # th.Thread(target=seq_idty),
528 - th.Thread(target=per_chain_stats) 529 + # th.Thread(target=per_chain_stats)
529 ] 530 ]
530 531
531 # Start the threads 532 # Start the threads
......