Latest statistics on basepair counts by chain

Louis BECQUEY
Commit 6d1b967ebb64b53e0103f5fd46bed626b7457ec5 6d1b967e 1 parent 7b99a37e
Showing 3 changed files with 73 additions and 74 deletions
.gitignore
RNAnet.py
statistics.py
--- a/.gitignore
View file @6d1b967
+++ b/.gitignore
View file @6d1b967
@@ -3,7 +3,7 @@ nohup.out
 log_of_the_run.sh
 # results
-results/figures/wadley_plots/
+results/
 # temporary results files
 data/
--- a/RNAnet.py
View file @6d1b967
+++ b/RNAnet.py
View file @6d1b967
@@ -1179,7 +1179,7 @@ class Pipeline:
             os.makedirs(runDir + "/results/archive/")
         # Save to by-chain CSV files
-        p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=1, maxtasksperchild=5)
+        p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=3)
         try:
             pbar = tqdm(total=len(self.loaded_chains), desc="Saving chains to CSV", position=0, leave=True) 
             for i, _ in enumerate(p.imap_unordered(work_save, self.loaded_chains)):
@@ -1208,7 +1208,7 @@ class Pipeline:
         # Save additional information
         conn = sqlite3.connect(runDir+"/results/RNANet.db")
-        pd.read_sql_query("SELECT rfam_acc, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family", 
+        pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;", 
                           conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
         pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, reversed, date, exp_method, resolution, issue FROM structure 
                             JOIN chain ON structure.pdb_id = chain.structure_id
@@ -1274,6 +1274,7 @@ class Pipeline:
         conn.close()
+
 def read_cpu_number():
     # As one shall not use os.cpu_count() on LXC containers,
     # because it reads info from /sys, which reports the host's resources rather than the VM's.
@@ -2050,6 +2051,7 @@ def work_pssm(f, fill_gaps):
     idxQueue.put(thr_idx) # replace the thread index in the queue
     return 0
+@trace_unhandled_exceptions
 def work_save(c, homology=True):
     conn = sqlite3.connect(runDir + "/results/RNANet.db", timeout=15.0)
     if homology:
@@ -2096,38 +2098,36 @@ if __name__ == "__main__":
         sql_define_tables(conn)
     print("> Storing results into", runDir + "/results/RNANet.db")
-    # # compute an update compared to what is in the table "chain"
+    # compute an update compared to what is in the table "chain"
-    # #DEBUG: list everything
+    pp.list_available_mappings()
-    # pp.REUSE_ALL = True
+
-    # pp.list_available_mappings()
+    # ===========================================================================
-
+    # 3D information
-    # # ===========================================================================
+    # ===========================================================================
-    # # 3D information
+
-    # # ===========================================================================
+    # Download and annotate new RNA 3D chains (Chain objects in pp.update)
-
+    pp.dl_and_annotate(coeff_ncores=0.5) 
-    # # Download and annotate new RNA 3D chains (Chain objects in pp.update)
+
-    # pp.dl_and_annotate(coeff_ncores=0.75) 
+    # At this point, the structure table is up to date
-
+
-    # # At this point, the structure table is up to date
+    pp.build_chains(coeff_ncores=2.0)
-
+    if len(pp.to_retry):
-    # pp.build_chains(coeff_ncores=2.0)
+        # Redownload and re-annotate 
-    # if len(pp.to_retry):
+        print("> Retrying to annotate some structures which just failed.", flush=True)
-    #     # Redownload and re-annotate 
+        pp.dl_and_annotate(retry=True, coeff_ncores=0.3)  #
-    #     print("> Retrying to annotate some structures which just failed.", flush=True)
+        pp.build_chains(retry=True, coeff_ncores=1.0)     # Use half the cores to reduce required amount of memory
-    #     pp.dl_and_annotate(retry=True, coeff_ncores=0.5)  #
+    print(f"> Loaded {len(pp.loaded_chains)} RNA chains ({len(pp.update) - len(pp.loaded_chains)} errors).")
-    #     pp.build_chains(retry=True, coeff_ncores=1.0)     # Use half the cores to reduce required amount of memory
+    pp.checkpoint_save_chains()
-    # print(f"> Loaded {len(pp.loaded_chains)} RNA chains ({len(pp.update) - len(pp.loaded_chains)} errors).")
+
-    # pp.checkpoint_save_chains()
+    if not pp.HOMOLOGY:
-
+        # Save chains to file
-    # if not pp.HOMOLOGY:
+        for c in pp.loaded_chains:
-    #     # Save chains to file
+            work_save(c, homology=False)
-    #     for c in pp.loaded_chains:
+        print("Completed.")
-    #         work_save(c, homology=False)
+        exit()
-    #     print("Completed.")
-    #     exit()
-    # # At this point, structure, chain and nucleotide tables of the database are up to date.
+    # At this point, structure, chain and nucleotide tables of the database are up to date.
-    # # (Modulo some statistics computed by statistics.py)
+    # (Modulo some statistics computed by statistics.py)
     # ===========================================================================
     # Homology information
--- a/statistics.py
View file @6d1b967
+++ b/statistics.py
View file @6d1b967
@@ -26,11 +26,13 @@ from collections import Counter
 from RNAnet import read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker
 # This sets the paths
-path_to_3D_data = "/home/lbecquey/Data/RNA/3D/"
-path_to_seq_data = "/home/lbecquey/Data/RNA/sequences/"
 if len(sys.argv) > 1:
     path_to_3D_data = path.abspath(sys.argv[1])
     path_to_seq_data = path.abspath(sys.argv[2])
+else:
+    print("Please set paths to 3D data using command line arguments:")
+    print("./statistics.py /path/to/3D/data/ /path/to/sequence/data/")
+    exit()
 LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546")   # From Rfam CLAN 00112
 SSU_set = ("RF00177", "RF02542",  "RF02545", "RF01959", "RF01960")  # From Rfam CLAN 00111
@@ -289,19 +291,32 @@ def parallel_stats_pairs(f):
                                             np.where(expanded_list.nts.isin(["GU","UG"]), "Wobble","Other")
                                         )
                                     )
-        # checks
-        # ct = pd.crosstab(expanded_list.pair_type_LW, expanded_list.basepair)
-        # ct = ct.loc[[ x for x in ["cWW","cHH","cSS","tWW","tHH","tSS"] if x in ct.index ]]
-        # for _, symmetric_type in ct.iterrows():
-        #     for x in symmetric_type:
-        #         if x%2:
-        #             print("Odd number found for", symmetric_type.name, "in chain", cid, flush=True)
-        #             print(expanded_list, flush=True)
-        #             exit()
-
         expanded_list = expanded_list[["basepair", "pair_type_LW"]]
+
+        # Update the database
+        vlcnts = expanded_list.pair_type_LW.value_counts()
+        sqldata = ( vlcnts.at["cWW"]/2 if "cWW" in vlcnts.index else 0, 
+                    vlcnts.at["cWH"] if "cWH" in vlcnts.index else 0, 
+                    vlcnts.at["cWS"] if "cWS" in vlcnts.index else 0, 
+                    vlcnts.at["cHH"]/2 if "cHH" in vlcnts.index else 0, 
+                    vlcnts.at["cHS"] if "cHS" in vlcnts.index else 0, 
+                    vlcnts.at["cSS"]/2 if "cSS" in vlcnts.index else 0, 
+                    vlcnts.at["tWW"]/2 if "tWW" in vlcnts.index else 0, 
+                    vlcnts.at["tWH"] if "tWH" in vlcnts.index else 0, 
+                    vlcnts.at["tWS"] if "tWS" in vlcnts.index else 0, 
+                    vlcnts.at["tHH"]/2 if "tHH" in vlcnts.index else 0, 
+                    vlcnts.at["tHS"] if "tHS" in vlcnts.index else 0, 
+                    vlcnts.at["tSS"]/2 if "tSS" in vlcnts.index else 0, 
+                    int(sum(vlcnts.loc[[ str(x) for x in vlcnts.index if "." in str(x)]])/2), 
+                    cid)
+        with sqlite3.connect("results/RNANet.db") as conn:
+            sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?,
+                                    pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?, 
+                                    pair_count_tHH = ?, pair_count_tHS = ?, pair_count_tSS = ?, pair_count_other = ? WHERE chain_id = ?;""", data=sqldata)
+
         data.append(expanded_list)
+
     # merge all the dataframes from all chains of the family
     expanded_list = pd.concat(data)
@@ -336,17 +351,6 @@ def stats_pairs():
                 fam_pbar.update(1)
                 results.append(fam_df)
                 allpairs.append(newpairs)
-
-                # Checks
-                vlcnts= newpairs.pair_type_LW.value_counts()
-                identical = [fam_df[i][0] == newpairs.pair_type_LW.value_counts().at[i] for i in fam_df.columns]
-                if False in identical:
-                    print(fam_df)
-                    print(vlcnts)
-                    print("Dataframes differ for",fam_df.index[0], flush=True)
-                for x in ["cWW","cHH","cSS","tWW","tHH","tSS"]:
-                    if x in vlcnts.index and vlcnts[x] % 2:
-                        print("Trouvé un nombre impair de",x,"dans",fam_df.index[0], flush=True)
             fam_pbar.close()
             p.close()
             p.join()
@@ -359,10 +363,6 @@ def stats_pairs():
         all_pairs = pd.concat(allpairs)
         df = pd.concat(results).fillna(0)
-        vlcnts= all_pairs.pair_type_LW.value_counts()
-        for x in ["cWW","cHH","cSS","tWW","tHH","tSS"]:
-            if x in vlcnts.index and vlcnts[x] % 2:
-                print("Trouvé un nombre impair de",x,"après le merge !", flush=True)
         df.to_csv("data/pair_counts.csv")
         all_pairs.to_csv("data/all_pairs.csv")
     else:
@@ -375,18 +375,16 @@ def stats_pairs():
     # Remove not very well defined pair types (not in the 12 LW types)
     df['other'] = df[col_list].sum(axis=1)
     df.drop(col_list, axis=1, inplace=True)
-    crosstab = crosstab.append(crosstab.loc[col_list].sum(axis=0).rename("Other"))
+    crosstab = crosstab.append(crosstab.loc[col_list].sum(axis=0).rename("non-LW"))
     # drop duplicate types
     # The twelve Leontis-Westhof types are
     # cWW cWH cWS cHH cHS cSS (do not count cHW cSW and cSH, they are the same as their opposites)
     # tWW tWH tWS tHH tHS tSS (do not count tHW tSW and tSH, they are the same as their opposites)
-    df.drop([ x for x in [ "cHW", "tHW", "cSW", "tSW", "cHS", "tHS"] if x in df.columns], axis=1)
+    df = df.drop([ x for x in [ "cHW", "tHW", "cSW", "tSW", "cHS", "tHS"] if x in df.columns], axis=1)
-    crosstab = crosstab.loc[[ x for x in ["cWW","cWH","cWS","cHH","cHS","cSS","tWW","tWH","tWS","tHH","tHS","tSS","Other"] if x in crosstab.index]]
+    crosstab = crosstab.loc[[ x for x in ["cWW","cWH","cWS","cHH","cHS","cSS","tWW","tWH","tWS","tHH","tHS","tSS","non-LW"] if x in crosstab.index]]
     df.loc[:,[x for x in ["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "other"] if x in df.columns] ] /= 2
-    # crosstab.loc[["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "Other"]] /= 2
+    crosstab.loc[["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "non-LW"]] /= 2
-    print(crosstab)
-    print(df)
     # Compute total row
     total_series = df.sum(numeric_only=True).rename("TOTAL")
@@ -397,15 +395,16 @@ def stats_pairs():
     # reorder columns
     df.sort_values("TOTAL", axis=1, inplace=True, ascending=False)
+    crosstab = crosstab[["AU", "GC", "Wobble", "Other"]]
     # Save to CSV
-    df.to_csv("results/pairings.csv")
+    df.to_csv("results/pair_types.csv")
     # Plot barplot of overall types
-    total_series.sort_values(ascending=False, inplace=True)
+    ax = crosstab.plot(figsize=(8,5), kind='bar', stacked=True, log=False, fontsize=13)
-    ax = total_series.plot(figsize=(5,3), kind='bar', log=True, ylim=(1e4,5000000) )
+    ax.set_ylabel("Number of observations (millions)", fontsize=13)
-    ax.set_ylabel("Number of observations")
+    ax.set_xlabel(None)
-    plt.subplots_adjust(bottom=0.2, right=0.99)
+    plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99)
     plt.savefig("results/figures/pairings.png")
     notify("Computed nucleotide statistics and saved CSV and PNG file.")
@@ -416,7 +415,7 @@ def to_dist_matrix(f):
         return 0
     dm = DistanceCalculator('identity')
-    with open(path_to_seq_data+"realigned/"+f+"++.afa") as al_file:
+    with open(path_to_seq_data+"/realigned/"+f+"++.afa") as al_file:
         al = AlignIO.read(al_file, "fasta")[-len(mappings_list[f]):]
     idty = dm.get_distance(al).matrix # list of lists
     del al
@@ -457,7 +456,7 @@ def seq_idty():
     for f, D in zip(famlist, fam_arrays):
         if not len(D): continue
         a = 1.0 - np.average(D + D.T) # Get symmetric matrix instead of lower triangle + convert from distance matrix to identity matrix
-        conn.execute(f"UPDATE family SET idty_percent = {float(a)} WHERE rfam_acc = '{f}';")
+        conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';")
     conn.commit()
     conn.close()