Louis BECQUEY

Latest statistics on basepair counts by chain
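
Counts of the twelve Leontis-Westhof basepair types are now computed per chain in parallel_stats_pairs() and stored in the pair_count_* columns of the chain table, with the symmetric types (cWW, cHH, cSS, tWW, tHH, tSS) halved because each such pair is listed once per partner nucleotide. The overall pairing figure becomes a stacked barplot over the AU/GC/Wobble/Other crosstab, the per-family CSV export gains the description column and is sorted by number of 3D chains, the now-redundant parity checks are deleted, and the previously commented-out pipeline steps in RNAnet.py's main block are re-enabled.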

diff --git a/.gitignore b/.gitignore
--- a/.gitignore
+++ b/.gitignore
@@ -3,7 +3,7 @@ nohup.out
 log_of_the_run.sh
 
 # results
-results/figures/wadley_plots/
+results/
 
 # temporary results files
 data/
diff --git a/RNAnet.py b/RNAnet.py
--- a/RNAnet.py
+++ b/RNAnet.py
@@ -1179,7 +1179,7 @@ class Pipeline:
         os.makedirs(runDir + "/results/archive/")
 
         # Save to by-chain CSV files
-        p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=1, maxtasksperchild=5)
+        p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=3)
         try:
             pbar = tqdm(total=len(self.loaded_chains), desc="Saving chains to CSV", position=0, leave=True)
             for i, _ in enumerate(p.imap_unordered(work_save, self.loaded_chains)):
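
Note: the pool above trades one recycled worker (maxtasksperchild=5) for three persistent ones. The init_worker/tqdm.get_lock() pair is the usual tqdm-with-multiprocessing recipe; a minimal sketch of what init_worker presumably does (the real definition lives elsewhere in RNAnet.py and may do more):

    from tqdm import tqdm

    def init_worker(tqdm_lock=None):
        # Hypothetical sketch: install the parent's lock in each worker so
        # several processes can draw progress bars without interleaved output.
        if tqdm_lock is not None:
            tqdm.set_lock(tqdm_lock)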
@@ -1208,7 +1208,7 @@ class Pipeline:
 
         # Save additional information
         conn = sqlite3.connect(runDir+"/results/RNANet.db")
-        pd.read_sql_query("SELECT rfam_acc, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family",
+        pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;",
                           conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
         pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, reversed, date, exp_method, resolution, issue FROM structure
                             JOIN chain ON structure.pdb_id = chain.structure_id
@@ -1274,6 +1274,7 @@ class Pipeline:
 
         conn.close()
 
+
 def read_cpu_number():
     # As one shall not use os.cpu_count() on LXC containers,
     # because it reads info from /sys which is not the VM resources but the host resources.
@@ -2050,6 +2051,7 @@ def work_pssm(f, fill_gaps):
     idxQueue.put(thr_idx) # replace the thread index in the queue
     return 0
 
+@trace_unhandled_exceptions
 def work_save(c, homology=True):
     conn = sqlite3.connect(runDir + "/results/RNANet.db", timeout=15.0)
     if homology:
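
Note: with @trace_unhandled_exceptions, a failure inside a Pool worker gets logged with its traceback instead of aborting or silently losing the whole save loop. The decorator is defined elsewhere in RNAnet.py; a hypothetical minimal version of the pattern:

    import functools, traceback

    def trace_unhandled_exceptions(func):
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception:
                # Print the traceback in the worker, where it would
                # otherwise be swallowed by the multiprocessing machinery.
                traceback.print_exc()
        return wrapped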
@@ -2096,38 +2098,36 @@ if __name__ == "__main__":
     sql_define_tables(conn)
     print("> Storing results into", runDir + "/results/RNANet.db")
 
-    # # compute an update compared to what is in the table "chain"
-    # #DEBUG: list everything
-    # pp.REUSE_ALL = True
-    # pp.list_available_mappings()
-
-    # # ===========================================================================
-    # # 3D information
-    # # ===========================================================================
-
-    # # Download and annotate new RNA 3D chains (Chain objects in pp.update)
-    # pp.dl_and_annotate(coeff_ncores=0.75)
-
-    # # At this point, the structure table is up to date
-
-    # pp.build_chains(coeff_ncores=2.0)
-    # if len(pp.to_retry):
-    #     # Redownload and re-annotate
-    #     print("> Retrying to annotate some structures which just failed.", flush=True)
-    #     pp.dl_and_annotate(retry=True, coeff_ncores=0.5)
-    #     pp.build_chains(retry=True, coeff_ncores=1.0) # Use half the cores to reduce the required amount of memory
-    # print(f"> Loaded {len(pp.loaded_chains)} RNA chains ({len(pp.update) - len(pp.loaded_chains)} errors).")
-    # pp.checkpoint_save_chains()
-
-    # if not pp.HOMOLOGY:
-    #     # Save chains to file
-    #     for c in pp.loaded_chains:
-    #         work_save(c, homology=False)
-    #     print("Completed.")
-    #     exit()
-
-    # # At this point, structure, chain and nucleotide tables of the database are up to date.
-    # # (Modulo some statistics computed by statistics.py)
+    # Compute an update compared to what is in the table "chain"
+    pp.list_available_mappings()
+
+    # ===========================================================================
+    # 3D information
+    # ===========================================================================
+
+    # Download and annotate new RNA 3D chains (Chain objects in pp.update)
+    pp.dl_and_annotate(coeff_ncores=0.5)
+
+    # At this point, the structure table is up to date
+
+    pp.build_chains(coeff_ncores=2.0)
+    if len(pp.to_retry):
+        # Redownload and re-annotate
+        print("> Retrying to annotate some structures which just failed.", flush=True)
+        pp.dl_and_annotate(retry=True, coeff_ncores=0.3)
+        pp.build_chains(retry=True, coeff_ncores=1.0) # Use half the cores to reduce the required amount of memory
+    print(f"> Loaded {len(pp.loaded_chains)} RNA chains ({len(pp.update) - len(pp.loaded_chains)} errors).")
+    pp.checkpoint_save_chains()
+
+    if not pp.HOMOLOGY:
+        # Save chains to file
+        for c in pp.loaded_chains:
+            work_save(c, homology=False)
+        print("Completed.")
+        exit()
+
+    # At this point, structure, chain and nucleotide tables of the database are up to date.
+    # (Modulo some statistics computed by statistics.py)
 
     # ===========================================================================
     # Homology information
diff --git a/statistics.py b/statistics.py
--- a/statistics.py
+++ b/statistics.py
@@ -26,11 +26,13 @@ from collections import Counter
 from RNAnet import read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker
 
 # This sets the paths
-path_to_3D_data = "/home/lbecquey/Data/RNA/3D/"
-path_to_seq_data = "/home/lbecquey/Data/RNA/sequences/"
 if len(sys.argv) > 1:
     path_to_3D_data = path.abspath(sys.argv[1])
     path_to_seq_data = path.abspath(sys.argv[2])
+else:
+    print("Please set paths to 3D data using command line arguments:")
+    print("./statistics.py /path/to/3D/data/ /path/to/sequence/data/")
+    exit()
 
 LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112
 SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111
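
Usage example (hypothetical paths):

    python3 statistics.py /home/user/Data/RNA/3D/ /home/user/Data/RNA/sequences/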
@@ -289,19 +291,32 @@ def parallel_stats_pairs(f):
                                             np.where(expanded_list.nts.isin(["GU","UG"]), "Wobble","Other")
                                             )
                                  )
-        # checks
-        # ct = pd.crosstab(expanded_list.pair_type_LW, expanded_list.basepair)
-        # ct = ct.loc[[ x for x in ["cWW","cHH","cSS","tWW","tHH","tSS"] if x in ct.index ]]
-        # for _, symmetric_type in ct.iterrows():
-        #     for x in symmetric_type:
-        #         if x%2:
-        #             print("Odd number found for", symmetric_type.name, "in chain", cid, flush=True)
-        #             print(expanded_list, flush=True)
-        #             exit()
-
         expanded_list = expanded_list[["basepair", "pair_type_LW"]]
+
+        # Update the database
+        vlcnts = expanded_list.pair_type_LW.value_counts()
+        sqldata = ( vlcnts.at["cWW"]/2 if "cWW" in vlcnts.index else 0,
+                    vlcnts.at["cWH"]   if "cWH" in vlcnts.index else 0,
+                    vlcnts.at["cWS"]   if "cWS" in vlcnts.index else 0,
+                    vlcnts.at["cHH"]/2 if "cHH" in vlcnts.index else 0,
+                    vlcnts.at["cHS"]   if "cHS" in vlcnts.index else 0,
+                    vlcnts.at["cSS"]/2 if "cSS" in vlcnts.index else 0,
+                    vlcnts.at["tWW"]/2 if "tWW" in vlcnts.index else 0,
+                    vlcnts.at["tWH"]   if "tWH" in vlcnts.index else 0,
+                    vlcnts.at["tWS"]   if "tWS" in vlcnts.index else 0,
+                    vlcnts.at["tHH"]/2 if "tHH" in vlcnts.index else 0,
+                    vlcnts.at["tHS"]   if "tHS" in vlcnts.index else 0,
+                    vlcnts.at["tSS"]/2 if "tSS" in vlcnts.index else 0,
+                    int(sum(vlcnts.loc[[ str(x) for x in vlcnts.index if "." in str(x) ]])/2),
+                    cid)
+        with sqlite3.connect("results/RNANet.db") as conn:
+            sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?,
+                                                  pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?,
+                                                  pair_count_tHH = ?, pair_count_tHS = ?, pair_count_tSS = ?, pair_count_other = ? WHERE chain_id = ?;""", data=sqldata)
+
         data.append(expanded_list)
 
+
     # merge all the dataframes from all chains of the family
     expanded_list = pd.concat(data)
 
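
Note: the /2 on the symmetric types is the crux of the new counting. In the per-nucleotide expansion every basepair is listed once from each partner: an asymmetric pair appears once as cWH and once as cHW, so selecting only "cWH" already yields one count per pair, while a symmetric pair appears as "cWW" twice. A toy illustration (not project code):

    import pandas as pd

    # One cWW pair and one cWH pair, each seen from both partner nucleotides:
    listing = pd.Series(["cWW", "cWW", "cWH", "cHW"])
    vlcnts = listing.value_counts()
    assert vlcnts["cWW"] / 2 == 1  # symmetric type: halve the raw count
    assert vlcnts["cWH"] == 1      # asymmetric type: the mirror is labeled cHW

The same reasoning motivates the final /2 on the bucket of dotted annotations stored in pair_count_other.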
@@ -336,17 +351,6 @@ def stats_pairs():
             fam_pbar.update(1)
             results.append(fam_df)
             allpairs.append(newpairs)
-
-            # Checks
-            vlcnts = newpairs.pair_type_LW.value_counts()
-            identical = [fam_df[i][0] == newpairs.pair_type_LW.value_counts().at[i] for i in fam_df.columns]
-            if False in identical:
-                print(fam_df)
-                print(vlcnts)
-                print("Dataframes differ for", fam_df.index[0], flush=True)
-            for x in ["cWW","cHH","cSS","tWW","tHH","tSS"]:
-                if x in vlcnts.index and vlcnts[x] % 2:
-                    print("Found an odd number of", x, "in", fam_df.index[0], flush=True)
         fam_pbar.close()
         p.close()
         p.join()
@@ -359,10 +363,6 @@ def stats_pairs():
 
         all_pairs = pd.concat(allpairs)
         df = pd.concat(results).fillna(0)
-        vlcnts = all_pairs.pair_type_LW.value_counts()
-        for x in ["cWW","cHH","cSS","tWW","tHH","tSS"]:
-            if x in vlcnts.index and vlcnts[x] % 2:
-                print("Found an odd number of", x, "after the merge!", flush=True)
         df.to_csv("data/pair_counts.csv")
         all_pairs.to_csv("data/all_pairs.csv")
     else:
@@ -375,18 +375,16 @@ def stats_pairs():
     # Remove not very well defined pair types (not in the 12 LW types)
     df['other'] = df[col_list].sum(axis=1)
     df.drop(col_list, axis=1, inplace=True)
-    crosstab = crosstab.append(crosstab.loc[col_list].sum(axis=0).rename("Other"))
+    crosstab = crosstab.append(crosstab.loc[col_list].sum(axis=0).rename("non-LW"))
 
     # drop duplicate types
     # The twelve Leontis-Westhof types are
     # cWW cWH cWS cHH cHS cSS (do not count cHW cSW and cSH, they are the same as their opposites)
     # tWW tWH tWS tHH tHS tSS (do not count tHW tSW and tSH, they are the same as their opposites)
-    df.drop([ x for x in [ "cHW", "tHW", "cSW", "tSW", "cHS", "tHS"] if x in df.columns], axis=1)
-    crosstab = crosstab.loc[[ x for x in ["cWW","cWH","cWS","cHH","cHS","cSS","tWW","tWH","tWS","tHH","tHS","tSS","Other"] if x in crosstab.index]]
+    df = df.drop([ x for x in [ "cHW", "tHW", "cSW", "tSW", "cHS", "tHS"] if x in df.columns], axis=1)
+    crosstab = crosstab.loc[[ x for x in ["cWW","cWH","cWS","cHH","cHS","cSS","tWW","tWH","tWS","tHH","tHS","tSS","non-LW"] if x in crosstab.index]]
     df.loc[:,[x for x in ["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "other"] if x in df.columns] ] /= 2
-    # crosstab.loc[["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "Other"]] /= 2
-    print(crosstab)
-    print(df)
+    crosstab.loc[["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "non-LW"]] /= 2
 
     # Compute total row
     total_series = df.sum(numeric_only=True).rename("TOTAL")
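
Note: the df.drop change above fixes a silent no-op. DataFrame.drop returns a new frame unless inplace=True is passed, so without the reassignment the duplicate cHW/tHW/... columns were never actually removed.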
@@ -397,15 +395,16 @@ def stats_pairs():
 
     # reorder columns
     df.sort_values("TOTAL", axis=1, inplace=True, ascending=False)
+    crosstab = crosstab[["AU", "GC", "Wobble", "Other"]]
 
     # Save to CSV
-    df.to_csv("results/pairings.csv")
+    df.to_csv("results/pair_types.csv")
 
     # Plot barplot of overall types
-    total_series.sort_values(ascending=False, inplace=True)
-    ax = total_series.plot(figsize=(5,3), kind='bar', log=True, ylim=(1e4,5000000) )
-    ax.set_ylabel("Number of observations")
-    plt.subplots_adjust(bottom=0.2, right=0.99)
+    ax = crosstab.plot(figsize=(8,5), kind='bar', stacked=True, log=False, fontsize=13)
+    ax.set_ylabel("Number of observations (millions)", fontsize=13)
+    ax.set_xlabel(None)
+    plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99)
     plt.savefig("results/figures/pairings.png")
 
     notify("Computed nucleotide statistics and saved CSV and PNG file.")
@@ -416,7 +415,7 @@ def to_dist_matrix(f):
         return 0
 
     dm = DistanceCalculator('identity')
-    with open(path_to_seq_data+"realigned/"+f+"++.afa") as al_file:
+    with open(path_to_seq_data+"/realigned/"+f+"++.afa") as al_file:
        al = AlignIO.read(al_file, "fasta")[-len(mappings_list[f]):]
        idty = dm.get_distance(al).matrix # list of lists
        del al
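
Note: the added "/" matters because statistics.py now builds path_to_seq_data with path.abspath(), which strips any trailing slash; the old concatenation would have produced ".../sequencesrealigned/...". A more defensive spelling (hypothetical rewrite) would be os.path.join(path_to_seq_data, "realigned", f + "++.afa").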
@@ -457,7 +456,7 @@ def seq_idty():
     for f, D in zip(famlist, fam_arrays):
         if not len(D): continue
         a = 1.0 - np.average(D + D.T) # Get symmetric matrix instead of lower triangle + convert from distance matrix to identity matrix
-        conn.execute(f"UPDATE family SET idty_percent = {float(a)} WHERE rfam_acc = '{f}';")
+        conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';")
     conn.commit()
     conn.close()
 