Supports mappings of multiple portions to one family

Louis BECQUEY
Commit b910de9ec07aa8147681260d184f8783880c1e1a b910de9e 1 parent 78e5248c
Showing 2 changed files with 16 additions and 7 deletions
RNAnet.py
statistics.py
--- a/RNAnet.py
View file @b910de9
+++ b/RNAnet.py
View file @b910de9
@@ -540,6 +540,7 @@ class Chain:
         self.seq_to_align = ''.join(c_seq_to_align)
         self.seq = ''.join(c_seq)
+
 class Job:
     """ This class contains information about a task to run later.
@@ -574,6 +575,7 @@ class Job:
             s = f"{self.priority_}({self.nthreads}) [{self.comp_time}]\t{self.label:25}{self.func_.__name__}(" + " ".join([str(a) for a in self.args_]) + ")"
         return s
+
 class Monitor:
     """ A job that simply watches the memory usage of another process. 
@@ -1681,10 +1683,16 @@ def work_infer_mappings(update_only, allmappings, codelist):
             for rfam in families:
                 # if a known mapping of this chain on this family exists, apply it
                 m = known_mappings.loc[ (known_mappings.pdb_id + "|1|" + known_mappings.chain == c[:4].lower()+c[4:]) & (known_mappings['rfam_acc'] == rfam ) ]
-                if len(m):
+                if len(m) and len(m) < 2:
                     pdb_start = int(m.pdb_start)
                     pdb_end = int(m.pdb_end)
                     inferred = False
+                elif len(m): 
+                    # several distinct parts of the same chain are mapped to the same family (e.g. 6ek0-L5)
+                    # ==> use a single mapping spanning from the smallest pdb_start to the largest pdb_end, not each part separately
+                    pdb_start = int(m.pdb_start.min())
+                    pdb_end = int(m.pdb_end.max())
+                    inferred = False
                 else: # otherwise, use the inferred mapping
                     pdb_start = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_start)
                     pdb_end = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_end)
@@ -2114,7 +2122,7 @@ if __name__ == "__main__":
     # At this point, the structure table is up to date
-    pp.build_chains(coeff_ncores=2.0)
+    pp.build_chains(coeff_ncores=1.0)
     if len(pp.to_retry):
         # Redownload and re-annotate 
         print("> Retrying to annotate some structures which just failed.", flush=True)
--- a/statistics.py
View file @b910de9
+++ b/statistics.py
View file @b910de9
@@ -87,6 +87,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
         kernel_c2 = st.gaussian_kde(values_c2)
         f_c2 = np.reshape(kernel_c2(positions).T, xx.shape)
+
         # Uncomment to save the data to an archive for later use without the need to recompute
         np.savez(f"data/wadley_kernel_{angle}.npz",
                   c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas,
@@ -516,16 +517,16 @@ if __name__ == "__main__":
         mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain WHERE rfam_acc='{k}';") ]
     conn.close()
-    stats_pairs()
+    # stats_pairs()
     # Define threads for the tasks
     threads = [
         th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 1}),
         th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 4}),
-        th.Thread(target=stats_len),
+        # th.Thread(target=stats_len),
-        th.Thread(target=stats_freq),
+        # th.Thread(target=stats_freq),
-        th.Thread(target=seq_idty),
+        # th.Thread(target=seq_idty),
-        th.Thread(target=per_chain_stats)
+        # th.Thread(target=per_chain_stats)
     ]
     # Start the threads