Louis BECQUEY

ON CONFLICT clauses for SQL updates
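
The ON CONFLICT change named in the title sits in a collapsed part of the diff. As a reminder of the pattern only, here is a minimal SQLite upsert sketch; the table and columns are hypothetical, and it assumes SQLite >= 3.24:

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE chain (chain_id INTEGER PRIMARY KEY, rfam_acc TEXT)")

    # Insert a row, or update the existing one instead of raising on a duplicate key
    conn.execute("""INSERT INTO chain (chain_id, rfam_acc) VALUES (?, ?)
                    ON CONFLICT(chain_id) DO UPDATE SET rfam_acc = excluded.rfam_acc;""",
                 (1, "RF00005"))
    conn.commit()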

@@ -175,7 +175,7 @@ def stats_len():
             cols.append("orange")
         else:
             cols.append("grey")
-        l = [ x[0] for x in sql_ask_database(conn, f"SELECT COUNT(nt_id) FROM (SELECT chain_id FROM chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY chain_id;") ]
+        l = [ x[0] for x in sql_ask_database(conn, f"SELECT COUNT(index_chain) FROM (SELECT chain_id FROM chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY chain_id;") ]
         lengths.append(l)
         notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths")
     conn.close()
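
For reference, and not part of the diff: COUNT(col) in SQL counts only rows where col is non-NULL, so switching the counted column changes the reported chain lengths wherever nt_id is missing. A toy sqlite3 illustration with made-up values:

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE nucleotide (chain_id INTEGER, index_chain INTEGER, nt_id TEXT)")
    conn.executemany("INSERT INTO nucleotide VALUES (?,?,?)",
                     [(1, 1, "A.1"), (1, 2, None), (1, 3, "G.3")])
    # COUNT(column) ignores NULLs, so the two queries can disagree
    print(conn.execute("SELECT COUNT(nt_id) FROM nucleotide GROUP BY chain_id").fetchall())        # [(2,)]
    print(conn.execute("SELECT COUNT(index_chain) FROM nucleotide GROUP BY chain_id").fetchall())  # [(3,)]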
@@ -245,32 +245,83 @@ def parallel_stats_pairs(f):
 
     REQUIRES tables chain, nucleotide up-to-date."""
 
-    with sqlite3.connect("results/RNANet.db") as conn:
-        # Get comma separated lists of basepairs per nucleotide
-        interactions = pd.read_sql(f"SELECT paired, pair_type_LW FROM (SELECT chain_id FROM chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide WHERE nb_interact>0;", conn)
-
-        # expand the comma-separated lists in real lists
-        expanded_list = pd.concat([ pd.DataFrame({ 'paired':row['paired'].split(','), 'pair_type_LW':row['pair_type_LW'].split(',') })
-                                    for _, row in interactions.iterrows() ]).reset_index(drop=True)
-        # keep only intra-chain interactions
-        expanded_list = expanded_list[ expanded_list.paired != '0' ].pair_type_LW
+    chain_id_list = mappings_list[f]
+    data = []
+    for cid in chain_id_list:
+        with sqlite3.connect("results/RNANet.db") as conn:
+            # Get comma separated lists of basepairs per nucleotide
+            interactions = pd.read_sql(f"SELECT nt_code as nt1, index_chain, paired, pair_type_LW FROM (SELECT chain_id FROM chain WHERE chain_id='{cid}') NATURAL JOIN nucleotide;", conn)
+
+        # expand the comma-separated lists in real lists
+        expanded_list = pd.concat([ pd.DataFrame({ 'nt1':          [ row["nt1"] for x in row["paired"].split(',') ],
+                                                   'index_chain':  [ row['index_chain'] for x in row["paired"].split(',') ],
+                                                   'paired':       row['paired'].split(','),
+                                                   'pair_type_LW': row['pair_type_LW'].split(',')
+                                                 })
+                                    for _, row in interactions.iterrows()
+                                  ]).reset_index(drop=True)
+
+        # Add second nucleotide
+        nt2 = []
+        for _, row in expanded_list.iterrows():
+            if row.paired in ['', '0']:
+                nt2.append('')
+            else:
+                try:
+                    n = expanded_list[expanded_list.index_chain == int(row.paired)].nt1.tolist()[0]
+                    nt2.append(n)
+                except IndexError:
+                    print(cid, flush=True)
+        try:
+            expanded_list["nt2"] = nt2
+        except ValueError:
+            print(cid, flush=True)
+            print(expanded_list, flush=True)
+            return 0,0
+
+        # keep only intra-chain interactions
+        expanded_list = expanded_list[ ~expanded_list.paired.isin(['0','']) ]
+        expanded_list["nts"] = expanded_list["nt1"] + expanded_list["nt2"]
+
+        # Get basepair type
+        expanded_list["basepair"] = np.where(expanded_list.nts.isin(["AU","UA"]), "AU",
+                                    np.where(expanded_list.nts.isin(["GC","CG"]), "GC",
+                                    np.where(expanded_list.nts.isin(["GU","UG"]), "Wobble","Other")
+                                    )
+                                    )
+        # checks
+        # ct = pd.crosstab(expanded_list.pair_type_LW, expanded_list.basepair)
+        # ct = ct.loc[[ x for x in ["cWW","cHH","cSS","tWW","tHH","tSS"] if x in ct.index ]]
+        # for _, symmetric_type in ct.iterrows():
+        #     for x in symmetric_type:
+        #         if x%2:
+        #             print("Odd number found for", symmetric_type.name, "in chain", cid, flush=True)
+        #             print(expanded_list, flush=True)
+        #             exit()
+
+        expanded_list = expanded_list[["basepair", "pair_type_LW"]]
+        data.append(expanded_list)
+
+    # merge all the dataframes from all chains of the family
+    expanded_list = pd.concat(data)
 
     # Count each pair type
-    vcnts = expanded_list.value_counts()
+    vcnts = expanded_list.pair_type_LW.value_counts()
 
     # Add these new counts to the family's counter
     cnt = Counter()
     cnt.update(dict(vcnts))
 
     # Create an output DataFrame
-    return pd.DataFrame([[ x for x in cnt.values() ]], columns=list(cnt), index=[f])
+    f_df = pd.DataFrame([[ x for x in cnt.values() ]], columns=list(cnt), index=[f])
+    return expanded_list, f_df
 
 def stats_pairs():
     """Counts occurrences of intra-chain base-pair types in RNA families
 
     Creates a temporary results file in data/pair_counts.csv, and a results file in results/pairings.csv.
     REQUIRES tables chain, nucleotide up-to-date."""
-
+
     def line_format(family_data):
         return family_data.apply(partial(format_percentage, sum(family_data)))
 
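
Because the nucleotide table records each interaction on both partners, every intra-chain pair appears twice in expanded_list (once per extremity). That is why the symmetric LW families (cWW, tWW, cHH, tHH, cSS, tSS) should always come out even, and why their counts are halved further down. A self-contained sketch of the idea, using toy data rather than the real tables:

    import numpy as np
    import pandas as pd

    # Toy chain: G1 pairs with C2 (cWW), A3 is unpaired ('0')
    nts = pd.DataFrame({"index_chain":  [1, 2, 3],
                        "nt1":          ["G", "C", "A"],
                        "paired":       ["2", "1", "0"],
                        "pair_type_LW": ["cWW", "cWW", ""]})

    pairs = nts[~nts.paired.isin(["0", ""])].copy()
    # look up the partner's nucleotide by its index_chain
    partner = nts.set_index("index_chain").nt1
    pairs["nt2"] = partner.loc[pairs.paired.astype(int)].values
    pairs["nts"] = pairs.nt1 + pairs.nt2
    pairs["basepair"] = np.where(pairs.nts.isin(["AU", "UA"]), "AU",
                        np.where(pairs.nts.isin(["GC", "CG"]), "GC",
                        np.where(pairs.nts.isin(["GU", "UG"]), "Wobble", "Other")))
    print(pairs.pair_type_LW.value_counts())  # cWW: 2, the same pair seen from each end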
@@ -279,9 +330,23 @@ def stats_pairs():
         try:
             fam_pbar = tqdm(total=len(fam_list), desc="Pair-types in families", position=0, leave=True)
             results = []
-            for i, fam_df in enumerate(p.imap_unordered(parallel_stats_pairs, fam_list)):
+            allpairs = []
+            for i, _ in enumerate(p.imap_unordered(parallel_stats_pairs, fam_list)):
+                newpairs, fam_df = _
                 fam_pbar.update(1)
                 results.append(fam_df)
+                allpairs.append(newpairs)
+
+                # Checks
+                vlcnts = newpairs.pair_type_LW.value_counts()
+                identical = [ fam_df[i][0] == vlcnts.at[i] for i in fam_df.columns ]
+                if False in identical:
+                    print(fam_df)
+                    print(vlcnts)
+                    print("Dataframes differ for", fam_df.index[0], flush=True)
+                for x in ["cWW","cHH","cSS","tWW","tHH","tSS"]:
+                    if x in vlcnts.index and vlcnts[x] % 2:
+                        print("Found an odd number of", x, "in", fam_df.index[0], flush=True)
             fam_pbar.close()
             p.close()
             p.join()
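
Pool.imap_unordered yields whatever the worker returns, so the intermediate `newpairs, fam_df = _` step can equally be unpacked in the loop header when the index is not needed. A minimal standalone sketch with a hypothetical worker, not the real parallel_stats_pairs:

    from multiprocessing import Pool

    def worker(fam):
        # stand-in returning two values per family, like (expanded_list, f_df)
        return fam.lower(), len(fam)

    if __name__ == "__main__":
        with Pool(2) as p:
            for newpairs, fam_df in p.imap_unordered(worker, ["RF00001", "RF00005"]):
                print(newpairs, fam_df)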
@@ -292,24 +357,36 @@ def stats_pairs():
             p.join()
             exit(1)
 
+        all_pairs = pd.concat(allpairs)
         df = pd.concat(results).fillna(0)
+        vlcnts = all_pairs.pair_type_LW.value_counts()
+        for x in ["cWW","cHH","cSS","tWW","tHH","tSS"]:
+            if x in vlcnts.index and vlcnts[x] % 2:
+                print("Found an odd number of", x, "after the merge!", flush=True)
         df.to_csv("data/pair_counts.csv")
+        all_pairs.to_csv("data/all_pairs.csv")
     else:
         df = pd.read_csv("data/pair_counts.csv", index_col=0)
+        all_pairs = pd.read_csv("data/all_pairs.csv", index_col=0)
 
-    print(df)
-    # Remove not very well defined pair types (not in the 12 LW types)
+    crosstab = pd.crosstab(all_pairs.pair_type_LW, all_pairs.basepair)
     col_list = [ x for x in df.columns if '.' in x ]
+
+    # Remove not very well defined pair types (not in the 12 LW types)
     df['other'] = df[col_list].sum(axis=1)
    df.drop(col_list, axis=1, inplace=True)
-    print(df)
-
+    crosstab = crosstab.append(crosstab.loc[col_list].sum(axis=0).rename("Other"))
+
     # drop duplicate types
     # The twelve Leontis-Westhof types are
     # cWW cWH cWS cHH cHS cSS (do not count cHW cSW and cSH, they are the same as their opposites)
     # tWW tWH tWS tHH tHS tSS (do not count tHW tSW and tSH, they are the same as their opposites)
-    df.drop([ "cHW", "tHW", "cSW", "tSW", "cHS", "tHS"], axis=1)
-    df.loc[ ["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "other"] ] /= 2.0
+    df.drop([ x for x in [ "cHW", "tHW", "cSW", "tSW", "cHS", "tHS"] if x in df.columns ], axis=1)
+    crosstab = crosstab.loc[[ x for x in ["cWW","cWH","cWS","cHH","cHS","cSS","tWW","tWH","tWS","tHH","tHS","tSS","Other"] if x in crosstab.index ]]
+    df.loc[:, [ x for x in ["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "other"] if x in df.columns ] ] /= 2
+    # crosstab.loc[["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "Other"]] /= 2
+    print(crosstab)
+    print(df)
 
     # Compute total row
     total_series = df.sum(numeric_only=True).rename("TOTAL")
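
Two pandas caveats touched by the hunk above, illustrated outside the diff: df.drop(..., axis=1) returns a copy, so without reassignment or inplace=True the statement has no effect; and DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, where pd.concat is the replacement. A small sketch with made-up columns:

    import pandas as pd

    df = pd.DataFrame({"cWW": [4], "cHW": [2]})

    # drop() returns a new frame; assign it back (or pass inplace=True),
    # otherwise df keeps the dropped columns
    df = df.drop([c for c in ["cHW", "tHW"] if c in df.columns], axis=1)

    # DataFrame.append(row) no longer exists on pandas >= 2.0; concat a one-row frame
    other = pd.DataFrame([[1]], columns=["cWW"], index=["Other"])
    df = pd.concat([df, other])
    print(df)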
@@ -326,7 +403,6 @@ def stats_pairs():
 
     # Plot barplot of overall types
     total_series.sort_values(ascending=False, inplace=True)
-    total_series.apply(lambda x: x/2.0) # each interaction was counted twice because one time per extremity
     ax = total_series.plot(figsize=(5,3), kind='bar', log=True, ylim=(1e4,5000000) )
     ax.set_ylabel("Number of observations")
     plt.subplots_adjust(bottom=0.2, right=0.99)
@@ -445,11 +521,11 @@ if __name__ == "__main__":
 
     # Define threads for the tasks
     threads = [
-        # th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 1}),
-        # th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 4}),
-        # th.Thread(target=stats_len),
-        # th.Thread(target=stats_freq),
-        # th.Thread(target=seq_idty),
+        th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 1}),
+        th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 4}),
+        th.Thread(target=stats_len),
+        th.Thread(target=stats_freq),
+        th.Thread(target=seq_idty),
         th.Thread(target=per_chain_stats)
     ]
 
...