Louis BECQUEY

corrections to the statistics

@@ -1410,6 +1410,7 @@ class Pipeline:
 
         # Start a process pool to dispatch the RNA families,
         # over multiple CPUs (one family by CPU)
+        # p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=1)
         p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
 
         try:
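
The new commented-out line keeps a single-process Pool at hand for debugging, while the active call still passes tqdm's shared lock to every worker through init_worker. That initializer is defined elsewhere in the file and not shown in this hunk; a minimal sketch of what such an initializer typically does (an assumption, the project's real function may also set up signal handling or globals):

from multiprocessing import Pool
from tqdm import tqdm

def init_worker(tqdm_lock=None):
    # Keep the lock handed over by the parent process, so that progress bars
    # refreshed from different worker processes do not garble each other's output.
    if tqdm_lock is not None:
        tqdm.set_lock(tqdm_lock)

# p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
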
@@ -2407,7 +2408,7 @@ def work_pssm_remap(f, fill_gaps):
             continue
 
         # Check if the chain existed before in the database
-        if chains_ids.index(s.id) in list_of_chains.keys():
+        if s.id in chains_ids:
             # a chain object is found in the update, this sequence is new
             this_chain = list_of_chains[chains_ids.index(s.id)]
             seq_to_align = this_chain.seq_to_align
@@ -2415,12 +2416,10 @@ def work_pssm_remap(f, fill_gaps):
             db_id = this_chain.db_chain_id
         else:
             # it existed in the database before.
-            this_chain = None
-
             # Get the chain id in the database
             conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=10.0)
             conn.execute('pragma journal_mode=wal')
-            db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = {s.id.split('[')[0]} AND chain_name = {s.id.split('-')[1]} AND rfam_acc = {f};")
+            db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = '{s.id.split('[')[0]}' AND chain_name = '{s.id.split('-')[1]}' AND rfam_acc = '{f}';")
             if len(db_id):
                 db_id = db_id[0][0]
             else:
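
The quoting fix above wraps the interpolated structure id, chain name and Rfam accession in single quotes so SQLite parses them as string literals rather than bare identifiers. An equivalent, injection-proof way to write this kind of lookup is parameter binding; the sketch below uses the sqlite3 module directly instead of the project's sql_ask_database() helper, and the function name is illustrative only:

import sqlite3

def get_chain_id(conn, seq_id, rfam_acc):
    # seq_id looks like "1ABC[1]-A" in the alignments: PDB id, model, chain name.
    structure_id = seq_id.split('[')[0]
    chain_name = seq_id.split('-')[1]
    cur = conn.execute(
        "SELECT chain_id FROM chain WHERE structure_id = ? AND chain_name = ? AND rfam_acc = ?;",
        (structure_id, chain_name, rfam_acc),
    )
    row = cur.fetchone()
    return row[0] if row is not None else None
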
@@ -2430,7 +2429,6 @@ def work_pssm_remap(f, fill_gaps):
                 continue
             seq_to_align = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;")])
             full_length = len(seq_to_align)
-
             conn.close()
 
         # Save colums in the appropriate positions
@@ -2501,7 +2499,7 @@ def work_pssm_remap(f, fill_gaps):
                 many=True, data=re_mappings)
 
     # Delete alignment columns that are not used anymore from the database
-    current_family_columns = [ x[0] for x in sql_ask_database(conn, f"SELECT index_ali FROM align_column WHERE rfam_acc = {f};")]
+    current_family_columns = [ x[0] for x in sql_ask_database(conn, f"SELECT index_ali FROM align_column WHERE rfam_acc = '{f}';")]
     unused = []
     for col in current_family_columns:
         if col not in columns_to_save:
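
The loop that follows collects the alignment columns recorded in align_column but absent from columns_to_save. Not part of the commit, but the same selection can be written as a set difference, which also avoids a quadratic membership test if columns_to_save is a plain list:

# Equivalent to the unused-columns loop above (illustrative simplification only):
unused = sorted(set(current_family_columns) - set(columns_to_save))
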
@@ -2535,21 +2533,16 @@ def work_pssm_remap(f, fill_gaps):
         for s in align:
             if not '[' in s.id: # this is a Rfamseq entry, not a 3D chain
                 continue
-
-            # get the right 3D chain:
-            if chains_ids.index(s.id) in list_of_chains.keys():
-                db_id = list_of_chains[chains_ids.index(s.id)].db_chain_id
-                seq = this_chain.seq
-                full_length = this_chain.full_length
+
+            db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = '{s.id.split('[')[0]}' AND chain_name = '{s.id.split('-')[1]}' AND rfam_acc = '{f}';")
+            if len(db_id):
+                db_id = db_id[0][0]
             else:
-                db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = {s.id.split('[')[0]} AND chain_name = {s.id.split('-')[1]} AND rfam_acc = {f};")
-                if len(db_id):
-                    db_id = db_id[0][0]
-                else:
-                    pbar.update(1)
-                    continue
-            seq = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;") ])
-            full_length = len(seq)
+                pbar.update(1)
+                continue
+            seq = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;") ])
+            aliseq = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;") ])
+            full_length = len(seq)
 
             # detect gaps
             c_seq = list(seq) # contains "ACGUNacgu-"
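
In the rewritten block, seq holds the chain's nt_code letters and aliseq its nt_align_code counterpart, both read back from the database, and the code then scans seq for '-' characters marking unresolved positions. A hedged sketch of that gap detection step (the actual gap-filling logic driven by the fill_gaps argument is not shown in this hunk):

# Positions without a resolved nucleotide are stored as '-' in nt_code.
c_seq = list(seq)                                   # letters from "ACGUNacgu-"
gaps = [i for i, letter in enumerate(c_seq) if letter == '-']
# With fill_gaps, such positions could then be replaced by the most frequent
# letter of the corresponding alignment column (an assumption about the intent,
# not a copy of the project's code).
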
@@ -2638,47 +2631,47 @@ if __name__ == "__main__":
     sql_define_tables(conn)
     print("> Storing results into", runDir + "/results/RNANet.db")
 
-    # compute an update compared to what is in the table "chain" (comparison on structure_id + chain_name + rfam_acc).
-    # If --all was passed, all the structures are kept.
-    # Fills pp.update with Chain() objects.
-    pp.list_available_mappings()
+    # # compute an update compared to what is in the table "chain" (comparison on structure_id + chain_name + rfam_acc).
+    # # If --all was passed, all the structures are kept.
+    # # Fills pp.update with Chain() objects.
+    # pp.list_available_mappings()
 
     # ===========================================================================
     # 3D information
     # ===========================================================================
 
-    # Download and annotate new RNA 3D chains (Chain objects in pp.update)
-    # If the original cif file and/or the Json DSSR annotation file already exist, they are not redownloaded/recomputed.
-    pp.dl_and_annotate(coeff_ncores=0.5)
-    print("Here we go.")
-
-    # At this point, the structure table is up to date.
-    # Now save the DSSR annotations to the database.
-    # Extract the 3D chains to separate structure files if asked with --extract.
-    pp.build_chains(coeff_ncores=1.0)
-
-    if len(pp.to_retry):
-        # Redownload and re-annotate
-        print("> Retrying to annotate some structures which just failed.", flush=True)
-        pp.dl_and_annotate(retry=True, coeff_ncores=0.3) #
-        pp.build_chains(retry=True, coeff_ncores=1.0) # Use half the cores to reduce required amount of memory
-    print(f"> Loaded {len(pp.loaded_chains)} RNA chains ({len(pp.update) - len(pp.loaded_chains)} ignored/errors).")
-    if len(no_nts_set):
-        print(f"Among errors, {len(no_nts_set)} structures seem to contain RNA chains without defined nucleotides:", no_nts_set, flush=True)
-    if len(weird_mappings):
-        print(f"{len(weird_mappings)} mappings to Rfam were taken as absolute positions instead of residue numbers:", weird_mappings, flush=True)
-    if pp.SELECT_ONLY is None:
-        pp.checkpoint_save_chains()
-
-    if not pp.HOMOLOGY:
-        # Save chains to file
-        for c in pp.loaded_chains:
-            work_save(c, homology=False)
-        print("Completed.")
-        exit(0)
-
-    # At this point, structure, chain and nucleotide tables of the database are up to date.
-    # (Modulo some statistics computed by statistics.py)
+    # # Download and annotate new RNA 3D chains (Chain objects in pp.update)
+    # # If the original cif file and/or the Json DSSR annotation file already exist, they are not redownloaded/recomputed.
+    # pp.dl_and_annotate(coeff_ncores=0.5)
+    # print("Here we go.")
+
+    # # At this point, the structure table is up to date.
+    # # Now save the DSSR annotations to the database.
+    # # Extract the 3D chains to separate structure files if asked with --extract.
+    # pp.build_chains(coeff_ncores=1.0)
+
+    # if len(pp.to_retry):
+    #     # Redownload and re-annotate
+    #     print("> Retrying to annotate some structures which just failed.", flush=True)
+    #     pp.dl_and_annotate(retry=True, coeff_ncores=0.3) #
+    #     pp.build_chains(retry=True, coeff_ncores=1.0) # Use half the cores to reduce required amount of memory
+    # print(f"> Loaded {len(pp.loaded_chains)} RNA chains ({len(pp.update) - len(pp.loaded_chains)} ignored/errors).")
+    # if len(no_nts_set):
+    #     print(f"Among errors, {len(no_nts_set)} structures seem to contain RNA chains without defined nucleotides:", no_nts_set, flush=True)
+    # if len(weird_mappings):
+    #     print(f"{len(weird_mappings)} mappings to Rfam were taken as absolute positions instead of residue numbers:", weird_mappings, flush=True)
+    # if pp.SELECT_ONLY is None:
+    #     pp.checkpoint_save_chains()
+
+    # if not pp.HOMOLOGY:
+    #     # Save chains to file
+    #     for c in pp.loaded_chains:
+    #         work_save(c, homology=False)
+    #     print("Completed.")
+    #     exit(0)
+
+    # # At this point, structure, chain and nucleotide tables of the database are up to date.
+    # # (Modulo some statistics computed by statistics.py)
 
     # ===========================================================================
     # Homology information
@@ -2700,8 +2693,8 @@ if __name__ == "__main__":
     pp.fam_list = sorted(rfam_acc_to_download.keys())
 
     if len(pp.fam_list):
-        pp.prepare_sequences()
-        pp.realign()
+        # pp.prepare_sequences()
+        # pp.realign()
 
         # At this point, the family table is almost up to date
         # (lacking idty_percent and ali_filtered_length, both set in statistics.py)
The remaining hunks modify statistics.py:
@@ -1,4 +1,4 @@
-#!/usr/bin/python3.8
+#!/usr/bin/python3
 
 # This file computes additional statistics over the produced dataset.
 # Run this file if you want the base counts, pair-type counts, identity percents, etc
@@ -74,6 +74,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=2.0):
 
     # Extract the angle values of c2'-endo and c3'-endo nucleotides
     with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
+        conn.execute('pragma journal_mode=wal')
         df = pd.read_sql(f"""SELECT {angle}, th{angle}
                              FROM (
                              SELECT chain_id FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
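
The commit adds conn.execute('pragma journal_mode=wal') right after almost every sqlite3.connect() call in statistics.py, matching what the pipeline script already does: with write-ahead logging, the statistics workers can keep reading RNANet.db while another process writes to it. A small helper could factor this out; the function below is only an illustration, not something the commit introduces:

import sqlite3

def connect_wal(db_path, timeout=10.0):
    # Open the database and switch it to write-ahead logging, so that
    # concurrent readers are not blocked by a writer (and vice versa).
    conn = sqlite3.connect(db_path, timeout=timeout)
    conn.execute('pragma journal_mode=wal')
    return conn

# with connect_wal(runDir + "/results/RNANet.db") as conn:
#     ...
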
@@ -188,8 +189,12 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=2.0):
     if show:
         fig.show()
     plt.close()
+
+    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
+
     # print(f"[{worker_nbr}]\tComputed joint distribution of angles (C{carbon}) and saved the figures.")
 
+@trace_unhandled_exceptions
 def stats_len():
     """Plots statistics on chain lengths in RNA families.
     Uses all chains mapped to a family including copies, inferred or not.
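
The @trace_unhandled_exceptions decorator is applied here and to the other statistics workers below; its definition is not part of this diff. Decorators of this kind usually wrap the worker in a try/except that prints the traceback, since exceptions raised inside a multiprocessing.Pool worker are otherwise easy to lose. A hedged sketch (the project's real implementation may log elsewhere or re-raise):

import functools
import traceback

def trace_unhandled_exceptions(func):
    @functools.wraps(func)
    def wrapped(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            # Make worker crashes visible instead of dying silently in the pool.
            print(f"Unhandled exception in {func.__name__}:")
            traceback.print_exc()
    return wrapped
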
@@ -222,6 +227,7 @@ def stats_len():
 
     # Get the lengths of chains
     with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
+        conn.execute('pragma journal_mode=wal')
         l = [ x[0] for x in sql_ask_database(conn, f"""SELECT COUNT(index_chain)
                                                        FROM (
                                                            SELECT chain_id
@@ -259,6 +265,7 @@ def stats_len():
     # Save the figure
     fig.savefig(runDir + f"/results/figures/lengths_{res_thr}A.png")
     idxQueue.put(thr_idx) # replace the thread index in the queue
+    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
     # notify("Computed sequence length statistics and saved the figure.")
 
 def format_percentage(tot, x):
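
Each worker now renames its process once it has put its index back on idxQueue, so ps or htop show at a glance which statistics workers are still busy. setproctitle is the PyPI package of the same name; a minimal standalone use looks like this (thr_idx is hard-coded only to keep the snippet self-contained):

from setproctitle import setproctitle

thr_idx = 0  # in statistics.py this index is taken from idxQueue
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
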
@@ -273,6 +280,7 @@ def format_percentage(tot, x):
         x = "<.01"
     return x + '%'
 
+@trace_unhandled_exceptions
 def stats_freq():
     """Computes base frequencies in all RNA families.
     Uses all chains mapped to a family including copies, inferred or not.
@@ -294,6 +302,7 @@ def stats_freq():
     # List all nt_names happening within a RNA family and store the counts in the Counter
     for f in tqdm(famlist, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", unit="family", leave=False):
         with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
+            conn.execute('pragma journal_mode=wal')
             counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;", warn_every=0))
         freqs[f].update(counts)
 
@@ -305,6 +314,7 @@ def stats_freq():
     df = df.fillna(0)
     df.to_csv(runDir + "/results/frequencies.csv")
     idxQueue.put(thr_idx) # replace the thread index in the queue
+    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
     # notify("Saved nucleotide frequencies to CSV file.")
 
 @trace_unhandled_exceptions
@@ -327,6 +337,7 @@ def parallel_stats_pairs(f):
     sqldata = []
     for cid in tqdm(chain_id_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} basepair types", unit="chain",leave=False):
         with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
+            conn.execute('pragma journal_mode=wal')
             # Get comma separated lists of basepairs per nucleotide
             interactions = pd.DataFrame(
                 sql_ask_database(conn, f"SELECT nt_code as nt1, index_chain, paired, pair_type_LW FROM nucleotide WHERE chain_id='{cid}';"),
@@ -413,7 +424,9 @@ def parallel_stats_pairs(f):
     expanded_list.to_csv(runDir + f"/data/{f}_pairs.csv")
 
     idxQueue.put(thr_idx) # replace the thread index in the queue
+    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
 
+@trace_unhandled_exceptions
 def to_id_matrix(f):
     """
     Extracts sequences of 3D chains from the family alignments to a distinct STK file,
@@ -451,7 +464,8 @@ def to_id_matrix(f):
     # Out-of-scope task : update the database with the length of the filtered alignment:
     align = AlignIO.read(path_to_seq_data+f"/realigned/{f}_3d_only.afa", "fasta")
     with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
-        sql_execute(conn, """UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;""", many=True, data=(align.get_alignment_length(), f))
+        conn.execute('pragma journal_mode=wal')
+        sql_execute(conn, "UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=[align.get_alignment_length(), f])
     del align
 
     # Prepare the job
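
Besides enabling WAL, this hunk fixes how the single (alignment length, rfam_acc) pair is passed to sql_execute: with many=True the helper presumably hands the data to executemany(), which expects a sequence of parameter tuples, so one flat tuple would be treated as several separate (and invalid) rows. The sketch below illustrates the two calling conventions with plain sqlite3; sql_execute itself is the project's wrapper and its internals are assumed here:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE family (rfam_acc TEXT PRIMARY KEY, ali_filtered_len INT)")
conn.executemany("INSERT INTO family (rfam_acc) VALUES (?)", [('RF00005',), ('RF00162',)])

# One statement with one parameter set: bind a single tuple/list, as the
# corrected sql_execute(..., data=[...]) call now does.
conn.execute("UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", (71, 'RF00005'))

# Several parameter sets: executemany() takes a sequence of tuples, one per row.
conn.executemany("UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;",
                 [(71, 'RF00005'), (88, 'RF00162')])
conn.commit()
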
@@ -484,8 +498,10 @@ def to_id_matrix(f):
     np.save("data/"+f+".npy", id_matrix)
 
     idxQueue.put(thr_idx) # replace the thread index in the queue
+    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
     return 0
 
+@trace_unhandled_exceptions
 def seq_idty():
     """Computes identity matrices for each of the RNA families.
 
@@ -504,6 +520,7 @@ def seq_idty():
 
     # Update database with identity percentages
     conn = sqlite3.connect(runDir + "/results/RNANet.db")
+    conn.execute('pragma journal_mode=wal')
     for f, D in zip(fams_to_plot, fam_arrays):
         if not len(D): continue
         if D.shape[0] > 1:
@@ -547,6 +564,7 @@ def seq_idty():
     fig.savefig(runDir + f"/results/figures/distances_{res_thr}.png")
     print("> Computed all identity matrices and saved the figure.", flush=True)
 
+@trace_unhandled_exceptions
 def stats_pairs():
     """Counts occurrences of intra-chain base-pair types in RNA families
 
@@ -614,8 +632,10 @@ def stats_pairs():
     plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99)
     plt.savefig(runDir + f"/results/figures/pairings_{res_thr}.png")
 
+    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
     notify("Computed nucleotide statistics and saved CSV and PNG file.")
 
+@trace_unhandled_exceptions
 def per_chain_stats():
     """Computes per-chain frequencies and base-pair type counts.
 
@@ -623,7 +643,8 @@ def per_chain_stats():
 
     setproctitle(f"RNANet statistics.py per_chain_stats()")
 
-    with sqlite3.connect(runDir + "/results/RNANet.db", isolation_level=None) as conn:
+    with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
+        conn.execute('pragma journal_mode=wal')
         # Compute per-chain nucleotide frequencies
         df = pd.read_sql("SELECT SUM(is_A) as A, SUM(is_C) AS C, SUM(is_G) AS G, SUM(is_U) AS U, SUM(is_other) AS O, chain_id FROM nucleotide GROUP BY chain_id;", conn)
         df["total"] = pd.Series(df.A + df.C + df.G + df.U + df.O, dtype=np.float64)
@@ -631,11 +652,11 @@ def per_chain_stats():
         df = df.drop("total", axis=1)
 
         # Set the values
-        conn.execute('pragma journal_mode=wal')
         sql_execute(conn, "UPDATE chain SET chain_freq_A = ?, chain_freq_C = ?, chain_freq_G = ?, chain_freq_U = ?, chain_freq_other = ? WHERE chain_id= ?;",
                     many=True, data=list(df.to_records(index=False)), warn_every=10)
         print("> Updated the database with per-chain base frequencies", flush=True)
 
+@trace_unhandled_exceptions
 def general_stats():
     """
     Number of structures as function of the resolution threshold
@@ -749,6 +770,7 @@ def general_stats():
 
     answers = []
     with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
+        conn.execute('pragma journal_mode=wal')
         for r in reqs:
             answers.append(pd.read_sql(r, conn))
     df_unique = answers[0]
@@ -909,6 +931,7 @@ def general_stats():
                         hspace=0.05, bottom=0.12, top=0.84)
     fig.savefig(runDir + "/results/figures/Nfamilies.png")
     plt.close()
+    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
 
 def log_to_pbar(pbar):
     def update(r):
@@ -981,6 +1004,7 @@ if __name__ == "__main__":
     # Load mappings. famlist will contain only families with structures at this resolution threshold.
     print("Loading mappings list...")
     with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
+        conn.execute('pragma journal_mode=wal')
         n_unmapped_chains = sql_ask_database(conn, "SELECT COUNT(*) FROM chain WHERE rfam_acc='unmappd' AND issue=0;")[0][0]
         families = pd.read_sql(f"""SELECT rfam_acc, count(*) as n_chains
                                    FROM chain JOIN structure