Louis BECQUEY

ON CONFLICT clauses for SQL updates

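A note on the commit title: SQLite (3.24 and later) supports UPSERT through ON CONFLICT clauses, letting an INSERT fall back to an UPDATE when a uniqueness constraint fires. The collapsed part of this diff presumably introduces such clauses; below is a minimal sketch of the pattern only, not the committed code. The chain_id and rfam_acc columns appear in the visible hunks; the issue column and the schema are assumptions for illustration.

import sqlite3

conn = sqlite3.connect(":memory:")
# Hypothetical schema; only chain_id and rfam_acc are seen in this diff.
conn.execute("""CREATE TABLE chain (chain_id INTEGER PRIMARY KEY,
                                    rfam_acc TEXT,
                                    issue INTEGER DEFAULT 0)""")

# UPSERT (requires SQLite >= 3.24): insert a new chain, or update the
# existing row when chain_id is already taken. 'excluded' refers to the
# row that the INSERT tried to add.
conn.execute("""INSERT INTO chain (chain_id, rfam_acc, issue)
                VALUES (?, ?, ?)
                ON CONFLICT(chain_id) DO UPDATE
                SET rfam_acc = excluded.rfam_acc, issue = excluded.issue;""",
             (1, "RF00005", 0))
conn.commit()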
@@ -175,7 +175,7 @@ def stats_len():
cols.append("orange")
else:
cols.append("grey")
l = [ x[0] for x in sql_ask_database(conn, f"SELECT COUNT(nt_id) FROM (SELECT chain_id FROM chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY chain_id;") ]
l = [ x[0] for x in sql_ask_database(conn, f"SELECT COUNT(index_chain) FROM (SELECT chain_id FROM chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY chain_id;") ]
lengths.append(l)
notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths")
conn.close()
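The hunk above only swaps COUNT(nt_id) for COUNT(index_chain); the query shape is unchanged: restrict chain to one family, NATURAL JOIN the selected chain_ids against nucleotide, and GROUP BY chain_id so each group's count is one chain's length. A standalone, parameterized sketch of the same query follows; the direct sqlite3 calls stand in for the file's sql_ask_database helper and are an assumption.

import sqlite3

def family_chain_lengths(db_path, rfam_acc):
    # One COUNT per chain of the family: the subquery picks the family's
    # chain_ids, the NATURAL JOIN matches nucleotides on chain_id, and
    # GROUP BY yields one row (that chain's length) per chain.
    with sqlite3.connect(db_path) as conn:
        rows = conn.execute(
            """SELECT COUNT(index_chain)
               FROM (SELECT chain_id FROM chain WHERE rfam_acc = ?)
               NATURAL JOIN nucleotide
               GROUP BY chain_id;""",
            (rfam_acc,),
        ).fetchall()
    return [r[0] for r in rows]

The ? placeholder also avoids interpolating rfam_acc into the SQL string, which the f-string in the diff does.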
@@ -245,32 +245,83 @@ def parallel_stats_pairs(f):
REQUIRES tables chain, nucleotide up-to-date."""
with sqlite3.connect("results/RNANet.db") as conn:
# Get comma separated lists of basepairs per nucleotide
interactions = pd.read_sql(f"SELECT paired, pair_type_LW FROM (SELECT chain_id FROM chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide WHERE nb_interact>0;", conn)
# expand the comma-separated lists into real lists
expanded_list = pd.concat([ pd.DataFrame({ 'paired':row['paired'].split(','), 'pair_type_LW':row['pair_type_LW'].split(',') })
for _, row in interactions.iterrows() ]).reset_index(drop=True)
# keep only intra-chain interactions
expanded_list = expanded_list[ expanded_list.paired != '0' ].pair_type_LW
chain_id_list = mappings_list[f]
data = []
for cid in chain_id_list:
with sqlite3.connect("results/RNANet.db") as conn:
# Get comma separated lists of basepairs per nucleotide
interactions = pd.read_sql(f"SELECT nt_code as nt1, index_chain, paired, pair_type_LW FROM (SELECT chain_id FROM chain WHERE chain_id='{cid}') NATURAL JOIN nucleotide;", conn)
# expand the comma-separated lists into real lists
expanded_list = pd.concat([ pd.DataFrame({ 'nt1':[ row["nt1"] for x in row["paired"].split(',') ],
'index_chain':[ row['index_chain'] for x in row["paired"].split(',') ],
'paired':row['paired'].split(','),
'pair_type_LW':row['pair_type_LW'].split(',')
})
for _, row in interactions.iterrows()
]).reset_index(drop=True)
# Add second nucleotide
nt2 = []
for _, row in expanded_list.iterrows():
if row.paired in ['', '0']:
nt2.append('')
else:
try:
n = expanded_list[expanded_list.index_chain == int(row.paired)].nt1.tolist()[0]
nt2.append(n)
except IndexError:
print(cid, flush=True)
try:
expanded_list["nt2"] = nt2
except ValueError:
print(cid, flush=True)
print(expanded_list, flush=True)
return 0,0
# keep only intra-chain interactions
expanded_list = expanded_list[ ~expanded_list.paired.isin(['0','']) ]
expanded_list["nts"] = expanded_list["nt1"] + expanded_list["nt2"]
# Get basepair type
expanded_list["basepair"] = np.where(expanded_list.nts.isin(["AU","UA"]), "AU",
np.where(expanded_list.nts.isin(["GC","CG"]), "GC",
np.where(expanded_list.nts.isin(["GU","UG"]), "Wobble","Other")
)
)
# checks
# ct = pd.crosstab(expanded_list.pair_type_LW, expanded_list.basepair)
# ct = ct.loc[[ x for x in ["cWW","cHH","cSS","tWW","tHH","tSS"] if x in ct.index ]]
# for _, symmetric_type in ct.iterrows():
# for x in symmetric_type:
# if x%2:
# print("Odd number found for", symmetric_type.name, "in chain", cid, flush=True)
# print(expanded_list, flush=True)
# exit()
expanded_list = expanded_list[["basepair", "pair_type_LW"]]
data.append(expanded_list)
# merge all the dataframes from all chains of the family
expanded_list = pd.concat(data)
# Count each pair type
vcnts = expanded_list.value_counts()
vcnts = expanded_list.pair_type_LW.value_counts()
# Add these new counts to the family's counter
cnt = Counter()
cnt.update(dict(vcnts))
# Create an output DataFrame
return pd.DataFrame([[ x for x in cnt.values() ]], columns=list(cnt), index=[f])
f_df = pd.DataFrame([[ x for x in cnt.values() ]], columns=list(cnt), index=[f])
return expanded_list, f_df
def stats_pairs():
"""Counts occurrences of intra-chain base-pair types in RNA families
Creates a temporary results file in data/pair_counts.csv, and a results file in results/pairings.csv.
REQUIRES tables chain, nucleotide up-to-date."""
def line_format(family_data):
return family_data.apply(partial(format_percentage, sum(family_data)))
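The hunk above expands the comma-separated paired / pair_type_LW columns row by row with pd.concat inside a loop. For reference, a sketch of an equivalent expansion using DataFrame.explode (multi-column explode needs pandas >= 1.3), followed by the same nested np.where classification into AU / GC / Wobble / Other. The toy data below is illustrative, not taken from the database.

import numpy as np
import pandas as pd

interactions = pd.DataFrame({            # toy stand-in for the SQL result
    "nt1":          ["A", "U", "G"],
    "index_chain":  [1, 2, 3],
    "paired":       ["2", "1,3", "2"],
    "pair_type_LW": ["cWW", "cWW,tSS", "tSS"],
})

# Split both comma-separated columns into lists of equal per-row length,
# then explode them together: one row per (nucleotide, partner) pair.
expanded = interactions.assign(
    paired=interactions["paired"].str.split(","),
    pair_type_LW=interactions["pair_type_LW"].str.split(","),
).explode(["paired", "pair_type_LW"]).reset_index(drop=True)

# Look up the partner's nucleotide code by its index_chain.
code_of = dict(zip(interactions["index_chain"].astype(str), interactions["nt1"]))
expanded["nt2"] = expanded["paired"].map(code_of).fillna("")

# Same classification as in the diff: canonical pairs, wobble, or other.
expanded["nts"] = expanded["nt1"] + expanded["nt2"]
expanded["basepair"] = np.where(expanded["nts"].isin(["AU", "UA"]), "AU",
                       np.where(expanded["nts"].isin(["GC", "CG"]), "GC",
                       np.where(expanded["nts"].isin(["GU", "UG"]), "Wobble", "Other")))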
@@ -279,9 +330,23 @@ def stats_pairs():
try:
fam_pbar = tqdm(total=len(fam_list), desc="Pair-types in families", position=0, leave=True)
results = []
for i, fam_df in enumerate(p.imap_unordered(parallel_stats_pairs, fam_list)):
allpairs = []
for i, (newpairs, fam_df) in enumerate(p.imap_unordered(parallel_stats_pairs, fam_list)):
fam_pbar.update(1)
results.append(fam_df)
allpairs.append(newpairs)
# Checks
vlcnts = newpairs.pair_type_LW.value_counts()
identical = [ fam_df[col][0] == vlcnts.at[col] for col in fam_df.columns ]
if not all(identical):
print(fam_df)
print(vlcnts)
print("Dataframes differ for", fam_df.index[0], flush=True)
for x in ["cWW","cHH","cSS","tWW","tHH","tSS"]:
if x in vlcnts.index and vlcnts[x] % 2:
print("Found an odd number of", x, "in", fam_df.index[0], flush=True)
fam_pbar.close()
p.close()
p.join()
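The checks added above rest on a symmetry argument: an intra-chain pair between residues i and j is recorded twice, once on each partner's row, so the raw counts of the symmetric LW families (cWW, tWW, cHH, tHH, cSS, tSS) must all be even. A minimal standalone version of that parity check; the function name is hypothetical.

SYMMETRIC_LW = ["cWW", "cHH", "cSS", "tWW", "tHH", "tSS"]

def check_pair_parity(pair_type_counts, label):
    """Warn if a symmetric LW type has an odd raw count.

    pair_type_counts: a pandas Series as returned by
    expanded_list.pair_type_LW.value_counts().
    """
    for lw in SYMMETRIC_LW:
        if lw in pair_type_counts.index and pair_type_counts[lw] % 2:
            print("Found an odd number of", lw, "in", label, flush=True)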
@@ -292,24 +357,36 @@ def stats_pairs():
p.join()
exit(1)
all_pairs = pd.concat(allpairs)
df = pd.concat(results).fillna(0)
vlcnts = all_pairs.pair_type_LW.value_counts()
for x in ["cWW","cHH","cSS","tWW","tHH","tSS"]:
if x in vlcnts.index and vlcnts[x] % 2:
print("Found an odd number of", x, "after the merge!", flush=True)
df.to_csv("data/pair_counts.csv")
all_pairs.to_csv("data/all_pairs.csv")
else:
df = pd.read_csv("data/pair_counts.csv", index_col=0)
all_pairs = pd.read_csv("data/all_pairs.csv", index_col=0)
print(df)
# Remove not very well defined pair types (not in the 12 LW types)
crosstab = pd.crosstab(all_pairs.pair_type_LW, all_pairs.basepair)
col_list = [ x for x in df.columns if '.' in x ]
df['other'] = df[col_list].sum(axis=1)
df.drop(col_list, axis=1, inplace=True)
print(df)
crosstab = crosstab.append(crosstab.loc[col_list].sum(axis=0).rename("Other"))
# drop duplicate types
# The twelve Leontis-Westhof types are
# cWW cWH cWS cHH cHS cSS (do not count cHW cSW and cSH, they are the same as their opposites)
# tWW tWH tWS tHH tHS tSS (do not count tHW tSW and tSH, they are the same as their opposites)
df.drop([ "cHW", "tHW", "cSW", "tSW", "cHS", "tHS"], axis=1)
df.loc[ ["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "other"] ] /= 2.0
df.drop([ x for x in [ "cHW", "tHW", "cSW", "tSW", "cHS", "tHS"] if x in df.columns], axis=1, inplace=True)
crosstab = crosstab.loc[[ x for x in ["cWW","cWH","cWS","cHH","cHS","cSS","tWW","tWH","tWS","tHH","tHS","tSS","Other"] if x in crosstab.index]]
df.loc[:,[x for x in ["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "other"] if x in df.columns] ] /= 2
# crosstab.loc[["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "Other"]] /= 2
print(crosstab)
print(df)
# Compute total row
total_series = df.sum(numeric_only=True).rename("TOTAL")
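The drop and the /= 2 above correct the same double counting in two different cases: an asymmetric pair such as cWH shows up once as cWH and once as its mirror cHW, so dropping the mirror column leaves the physical count, while a symmetric pair such as cWW shows up as cWW twice, so its column must be halved. A tiny numeric check of both cases; the counts and the RFxxxxx index are illustrative only.

import pandas as pd

# Raw per-row counts: every pair was recorded at both extremities.
df = pd.DataFrame([[20, 8, 8]], columns=["cWW", "cWH", "cHW"], index=["RFxxxxx"])

# Mirror type folds into its canonical name (raw cHW count == raw cWH count),
# the symmetric type was simply counted twice.
df.drop(["cHW"], axis=1, inplace=True)
df.loc[:, ["cWW"]] /= 2

print(df)  # 10 physical cWW pairs and 8 physical cWH pairs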
@@ -326,7 +403,6 @@ def stats_pairs():
# Plot barplot of overall types
total_series.sort_values(ascending=False, inplace=True)
total_series.apply(lambda x: x/2.0) # each interaction was counted twice, once per extremity
ax = total_series.plot(figsize=(5,3), kind='bar', log=True, ylim=(1e4,5000000) )
ax.set_ylabel("Number of observations")
plt.subplots_adjust(bottom=0.2, right=0.99)
@@ -445,11 +521,11 @@ if __name__ == "__main__":
# Define threads for the tasks
threads = [
# th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 1}),
# th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 4}),
# th.Thread(target=stats_len),
# th.Thread(target=stats_freq),
# th.Thread(target=seq_idty),
th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 1}),
th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 4}),
th.Thread(target=stats_len),
th.Thread(target=stats_freq),
th.Thread(target=seq_idty),
th.Thread(target=per_chain_stats)
]
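The loop that actually runs these tasks is outside the visible hunks; presumably it is the standard start/join pattern, sketched here for reference:

for t in threads:
    t.start()
for t in threads:
    t.join()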