Louis BECQUEY

Solved crashes with --no-homology
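The hunks below add `self.pdb_start is not None` guards and an explicit NULL rfam_acc insertion for chains that are not mapped to an Rfam family, which is presumably the source of the --no-homology crashes (without a mapping, pdb_start and pdb_end are None and the subtraction raises a TypeError). A minimal, hypothetical sketch of the guard pattern, outside the project's code:

    # Hypothetical values: in --no-homology mode there is no Rfam mapping,
    # so the chain has no pdb_start/pdb_end boundaries.
    pdb_start, pdb_end = None, None

    # Unguarded, "pdb_end - pdb_start" raises a TypeError when both are None.
    # Guarded version, in the spirit of the commit:
    if pdb_start is not None and (pdb_end - pdb_start):
        print("extract only the region mapped to Rfam")
    else:
        print("extract the full chain")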

@@ -155,7 +155,7 @@ class Chain:
""" Extract the part which is mapped to Rfam from the main CIF file and save it to another file.
"""
- if (self.pdb_end - self.pdb_start):
+ if self.pdb_start is not None and (self.pdb_end - self.pdb_start):
status = f"Extract {self.pdb_start}-{self.pdb_end} atoms from {self.pdb_id}-{self.pdb_chain_id}"
self.file = path_to_3D_data+"rna_mapped_to_Rfam/"+self.chain_label+".cif"
else:
@@ -182,7 +182,7 @@ class Chain:
# Extract the desired chain
c = s[model_idx][self.pdb_chain_id]
- if (self.pdb_end - self.pdb_start):
+ if self.pdb_start is not None and (self.pdb_end - self.pdb_start):
# # Pay attention to residue numbering
# first_number = c.child_list[0].get_id()[1] # the chain's first residue is numbered 'first_number'
# if self.pdb_start < self.pdb_end:
@@ -309,6 +309,28 @@ class Chain:
if df.iloc[0,0] != 1:
st = df.iloc[0,0] -1
df.iloc[:, 0] -= st
+ # Find missing index_chain values because of resolved nucleotides that have a strange nt_resnum value
+ # e.g. 4v49-AA, position 5'- 1003 -> 2003 -> 1004 - 3'
+ diff = set(range(df.shape[0])).difference(df['index_chain'] - 1)
+ for i in sorted(diff):
+ # check if a nucleotide numbered +1000 exists
+ looked_for = df[df.index_chain == i].nt_resnum.values[0]
+ found = None
+ for nt in nts:
+ if nt['chain_name'] != self.pdb_chain_id:
+ continue
+ if nt['index_chain'] == i + 1 :
+ found = nt
+ break
+ if found:
+ df_row = pd.DataFrame([found], index=[i])[df.columns.values]
+ df_row.iloc[0,1] = df.iloc[i,1]
+ df = pd.concat([ df.iloc[:i], df_row, df.iloc[i:] ])
+ df.iloc[i+1:, 1] += 1
+ else:
+ warn(f"Missing index_chain {i} in {self.chain_label} !")
df = df.drop(df[df.index_chain < 0].index) # drop any rows with index_chain < the first residue (usually ligands)
# Re-assert that some nucleotides still exist
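The row-insertion pattern in the block added above (split the DataFrame at the hole, concatenate a one-row frame, then renumber) can be illustrated on a toy frame; the column names and values here are made up, not real DSSR output:

    import pandas as pd

    # Toy frame where index_chain 3 is missing
    df = pd.DataFrame({"index_chain": [1, 2, 4, 5],
                       "nt_resnum":   [101, 102, 1004, 1005]})

    i = 2                                                   # 0-based position of the hole
    row = pd.DataFrame([{"index_chain": 3, "nt_resnum": 103}], index=[i])

    # Split the frame, insert the recovered row, and renumber the index
    df = pd.concat([df.iloc[:i], row, df.iloc[i:]]).reset_index(drop=True)
    print(df.index_chain.tolist())                          # [1, 2, 3, 4, 5]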
@@ -352,6 +374,10 @@ class Chain:
df = df.reset_index(drop=True)
self.full_length = len(df.index_chain)
+ #######################################
+ # Compute new features
+ #######################################
# Add a sequence column just for the alignments
df['nt_align_code'] = [ str(x).upper()
.replace('NAN', '-') # Unresolved nucleotides are gaps
@@ -360,11 +386,6 @@ class Chain:
.replace('P', 'U') # Pseudouridines; mapping them to U is not strictly correct, see DSSR paper, Fig 2
for x in df['nt_code'] ]
- #######################################
- # Compute new features
- #######################################
# One-hot encoding sequence
df["is_A"] = [ 1 if x=="A" else 0 for x in df["nt_code"] ]
df["is_C"] = [ 1 if x=="C" else 0 for x in df["nt_code"] ]
@@ -464,6 +485,7 @@ class Chain:
####################################
# Save everything to database
####################################
with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn:
# Register the chain in table chain
if self.pdb_start is not None:
@@ -472,13 +494,28 @@ class Chain:
VALUES
(?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(structure_id, chain_name, rfam_acc) DO
- UPDATE SET pdb_start=excluded.pdb_start, pdb_end=excluded.pdb_end, reversed=excluded.reversed, inferred=excluded.inferred, issue=excluded.issue;""",
- data=(str(self.pdb_id), str(self.pdb_chain_id), int(self.pdb_start), int(self.pdb_end), int(self.reversed), str(self.rfam_fam), int(self.inferred), int(self.delete_me)))
+ UPDATE SET pdb_start=excluded.pdb_start,
+ pdb_end=excluded.pdb_end,
+ reversed=excluded.reversed,
+ inferred=excluded.inferred,
+ issue=excluded.issue;""",
+ data=(str(self.pdb_id), str(self.pdb_chain_id),
+ int(self.pdb_start), int(self.pdb_end),
+ int(self.reversed), str(self.rfam_fam),
+ int(self.inferred), int(self.delete_me)))
# get the chain id
- self.db_chain_id = sql_ask_database(conn, f"SELECT (chain_id) FROM chain WHERE structure_id='{self.pdb_id}' AND chain_name='{self.pdb_chain_id}' AND rfam_acc='{self.rfam_fam}';")[0][0]
+ self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain
+ WHERE structure_id='{self.pdb_id}'
+ AND chain_name='{self.pdb_chain_id}'
+ AND rfam_acc='{self.rfam_fam}';""")[0][0]
else:
- sql_execute(conn, "INSERT INTO chain (structure_id, chain_name, issue) VALUES (?, ?, ?) ON CONFLICT(structure_id, chain_name) DO UPDATE SET issue=excluded.issue;", data=(str(self.pdb_id), int(self.pdb_chain_id), int(self.delete_me)))
- self.db_chain_id = sql_ask_database(conn, f"SELECT (chain_id) FROM chain WHERE structure_id='{self.pdb_id}' AND chain_name='{self.pdb_chain_id}' AND rfam_acc IS NULL;")[0][0]
+ sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, issue) VALUES (?, ?, NULL, ?)
+ ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue;""",
+ data=(str(self.pdb_id), str(self.pdb_chain_id), int(self.delete_me)))
+ self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain
+ WHERE structure_id='{self.pdb_id}'
+ AND chain_name='{self.pdb_chain_id}'
+ AND rfam_acc IS NULL;""")[0][0]
# Add the nucleotides
sql_execute(conn, f"""
@@ -492,7 +529,6 @@ class Chain:
?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""",
many=True, data=list(df.to_records(index=False)), warn_every=10)
# Remove too short chains
if self.length < 5:
warn(f"{self.chain_label} sequence is too short, let's ignore it.\t", error=True)
@@ -1064,7 +1100,7 @@ class Pipeline:
if self.EXTRACT_CHAINS:
if self.HOMOLOGY and not path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"):
os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam") # for the portions mapped to Rfam
- if not self.HOMOLOGY and not path.isdir(path_to_3D_data + "rna_only"):
+ if (not self.HOMOLOGY) and not path.isdir(path_to_3D_data + "rna_only"):
os.makedirs(path_to_3D_data + "rna_only") # extract chains of pure RNA
# define and run jobs
@@ -1295,7 +1331,7 @@ class Pipeline:
r = sql_ask_database(conn, """SELECT DISTINCT chain_id, structure_id FROM chain WHERE structure_id NOT IN (SELECT DISTINCT pdb_id FROM structure);""")
if len(r) and r[0][0] is not None:
warn("Chains without referenced structures have been detected")
print(" ".join([x[1]+'-'+x[0] for x in r]))
print(" ".join([str(x[1])+'-'+str(x[0]) for x in r]))
if self.HOMOLOGY:
# check if chains have been re_mapped:
@@ -2187,6 +2223,7 @@ if __name__ == "__main__":
# At this point, the structure table is up to date
pp.build_chains(coeff_ncores=1.0)
if len(pp.to_retry):
# Redownload and re-annotate
print("> Retrying to annotate some structures which just failed.", flush=True)
@@ -2227,6 +2264,7 @@ if __name__ == "__main__":
pp.prepare_sequences()
pp.realign()
# At this point, the family table is up to date
thr_idx_mgr = Manager()
@@ -68,27 +68,30 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
if not path.isfile(f"data/wadley_kernel_{angle}.npz"):
- conn = sqlite3.connect("results/RNANet.db")
- df = pd.read_sql(f"""SELECT {angle}, th{angle} FROM nucleotide WHERE puckering="C2'-endo" AND {angle} IS NOT NULL AND th{angle} IS NOT NULL;""", conn)
- c2_endo_etas = df[angle].values.tolist()
- c2_endo_thetas = df["th"+angle].values.tolist()
- df = pd.read_sql(f"""SELECT {angle}, th{angle} FROM nucleotide WHERE form = '.' AND puckering="C3'-endo" AND {angle} IS NOT NULL AND th{angle} IS NOT NULL;""", conn)
- c3_endo_etas = df[angle].values.tolist()
- c3_endo_thetas = df["th"+angle].values.tolist()
- conn.close()
+ # Extract the angle values of c2'-endo and c3'-endo nucleotides
+ with sqlite3.connect("results/RNANet.db") as conn:
+ df = pd.read_sql(f"""SELECT {angle}, th{angle} FROM nucleotide WHERE puckering="C2'-endo" AND {angle} IS NOT NULL AND th{angle} IS NOT NULL;""", conn)
+ c2_endo_etas = df[angle].values.tolist()
+ c2_endo_thetas = df["th"+angle].values.tolist()
+ df = pd.read_sql(f"""SELECT {angle}, th{angle} FROM nucleotide WHERE form = '.' AND puckering="C3'-endo" AND {angle} IS NOT NULL AND th{angle} IS NOT NULL;""", conn)
+ c3_endo_etas = df[angle].values.tolist()
+ c3_endo_thetas = df["th"+angle].values.tolist()
+ # Create arrays with (x,y) coordinates of the points
+ values_c3 = np.vstack([c3_endo_etas, c3_endo_thetas])
+ values_c2 = np.vstack([c2_endo_etas, c2_endo_thetas])
+ # Approximate the density by a gaussian kernel
+ kernel_c3 = st.gaussian_kde(values_c3)
+ kernel_c2 = st.gaussian_kde(values_c2)
+ # Create 100x100 regular (x,y,z) values for the plot
xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j]
positions = np.vstack([xx.ravel(), yy.ravel()])
- values_c3 = np.vstack([c3_endo_etas, c3_endo_thetas])
- kernel_c3 = st.gaussian_kde(values_c3)
f_c3 = np.reshape(kernel_c3(positions).T, xx.shape)
- values_c2 = np.vstack([c2_endo_etas, c2_endo_thetas])
- kernel_c2 = st.gaussian_kde(values_c2)
f_c2 = np.reshape(kernel_c2(positions).T, xx.shape)
- # Uncomment to save the data to an archive for later use without the need to recompute
+ # Save the data to an archive for later use without the need to recompute
np.savez(f"data/wadley_kernel_{angle}.npz",
c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas,
c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas,
@@ -106,8 +109,10 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
notify(f"Kernel computed for {angle}/th{angle} (or loaded from file).")
# exact counts:
- hist_c2, xedges, yedges = np.histogram2d(c2_endo_etas, c2_endo_thetas, bins=int(2*np.pi/0.1), range=[[0, 2*np.pi], [0, 2*np.pi]])
- hist_c3, xedges, yedges = np.histogram2d(c3_endo_etas, c3_endo_thetas, bins=int(2*np.pi/0.1), range=[[0, 2*np.pi], [0, 2*np.pi]])
+ hist_c2, xedges, yedges = np.histogram2d(c2_endo_etas, c2_endo_thetas, bins=int(2*np.pi/0.1),
+ range=[[0, 2*np.pi], [0, 2*np.pi]])
+ hist_c3, xedges, yedges = np.histogram2d(c3_endo_etas, c3_endo_thetas, bins=int(2*np.pi/0.1),
+ range=[[0, 2*np.pi], [0, 2*np.pi]])
cmap = cm.get_cmap("jet")
color_values = cmap(hist_c3.ravel()/hist_c3.max())
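For reference, the kernel-density estimate that the refactored block above computes follows the standard scipy pattern: fit st.gaussian_kde on a (2, N) array of points, then evaluate it on a regular grid. A standalone sketch with random angles standing in for the database values:

    import numpy as np
    import scipy.stats as st

    rng = np.random.default_rng(0)
    etas = rng.uniform(0, 2*np.pi, 500)      # random stand-ins for the eta angles
    thetas = rng.uniform(0, 2*np.pi, 500)    # random stand-ins for the theta angles

    values = np.vstack([etas, thetas])       # shape (2, N)
    kernel = st.gaussian_kde(values)         # approximate the density with a Gaussian kernel

    # Evaluate on a 100x100 regular grid covering [0, 2*pi] x [0, 2*pi]
    xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j]
    positions = np.vstack([xx.ravel(), yy.ravel()])
    density = np.reshape(kernel(positions).T, xx.shape)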
@@ -450,7 +455,7 @@ def seq_idty():
famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 1 ORDER BY rfam_acc ASC;") ]
ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;") ]
if len(ignored):
print("Idty matrices: Ignoring families with only one chain:", " ".join(ignored)+'\n')
print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')
# compute distance matrices (or ignore if data/RF0****.npy exists)
p = Pool(processes=8)
@@ -476,7 +481,7 @@ def seq_idty():
conn.close()
# Plots plots plots
- fig, axs = plt.subplots(5,13, figsize=(15,9))
+ fig, axs = plt.subplots(4,17, figsize=(17,5.75))
axs = axs.ravel()
[axi.set_axis_off() for axi in axs]
im = "" # Just to declare the variable, it will be set in the loop
@@ -495,7 +500,7 @@ def seq_idty():
D = D[idx1,:]
D = D[:,idx1[::-1]]
im = ax.matshow(1.0 - D, vmin=0, vmax=1, origin='lower') # convert to identity matrix 1 - D from distance matrix D
- ax.set_title(f + "\n(" + str(len(mappings_list[f]))+ " chains)")
+ ax.set_title(f + "\n(" + str(len(mappings_list[f]))+ " chains)", fontsize=10)
fig.tight_layout()
fig.subplots_adjust(wspace=0.1, hspace=0.3)
fig.colorbar(im, ax=axs[-1], shrink=0.8)
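The matshow call above displays each per-family distance matrix D as an identity matrix 1 - D. A minimal illustration of that display, with a random symmetric matrix in place of the real per-family data:

    import numpy as np
    import matplotlib.pyplot as plt

    rng = np.random.default_rng(0)
    D = rng.uniform(0, 1, (20, 20))          # fake pairwise distances
    D = (D + D.T) / 2                        # make the matrix symmetric
    np.fill_diagonal(D, 0)                   # zero distance to itself

    fig, ax = plt.subplots(figsize=(3, 3))
    im = ax.matshow(1.0 - D, vmin=0, vmax=1, origin='lower')   # identity = 1 - distance
    fig.colorbar(im, ax=ax, shrink=0.8)
    fig.savefig("identity_matrix_example.png")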
@@ -537,10 +542,10 @@ if __name__ == "__main__":
threads = [
th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 1}),
th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 4}),
- # th.Thread(target=stats_len),
- # th.Thread(target=stats_freq),
- # th.Thread(target=seq_idty),
- # th.Thread(target=per_chain_stats)
+ # th.Thread(target=stats_len), # computes figures
+ # th.Thread(target=stats_freq), # Updates the database
+ # th.Thread(target=seq_idty), # produces .npy files and seq idty figures
+ # th.Thread(target=per_chain_stats) # Updates the database
]
# Start the threads
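The lines that actually start and join the threads are cut off by the hunk; the generic start/join pattern (not necessarily the project's exact code) looks like this:

    import threading as th

    def work():                 # placeholder target; the real targets are the functions above
        pass

    threads = [th.Thread(target=work) for _ in range(2)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()                # wait for all threads to finish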