Showing
3 changed files
with
73 additions
and
74 deletions
... | @@ -1179,7 +1179,7 @@ class Pipeline: | ... | @@ -1179,7 +1179,7 @@ class Pipeline: |
1179 | os.makedirs(runDir + "/results/archive/") | 1179 | os.makedirs(runDir + "/results/archive/") |
1180 | 1180 | ||
1181 | # Save to by-chain CSV files | 1181 | # Save to by-chain CSV files |
1182 | - p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=1, maxtasksperchild=5) | 1182 | + p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=3) |
1183 | try: | 1183 | try: |
1184 | pbar = tqdm(total=len(self.loaded_chains), desc="Saving chains to CSV", position=0, leave=True) | 1184 | pbar = tqdm(total=len(self.loaded_chains), desc="Saving chains to CSV", position=0, leave=True) |
1185 | for i, _ in enumerate(p.imap_unordered(work_save, self.loaded_chains)): | 1185 | for i, _ in enumerate(p.imap_unordered(work_save, self.loaded_chains)): |
... | @@ -1208,7 +1208,7 @@ class Pipeline: | ... | @@ -1208,7 +1208,7 @@ class Pipeline: |
1208 | 1208 | ||
1209 | # Save additional information | 1209 | # Save additional information |
1210 | conn = sqlite3.connect(runDir+"/results/RNANet.db") | 1210 | conn = sqlite3.connect(runDir+"/results/RNANet.db") |
1211 | - pd.read_sql_query("SELECT rfam_acc, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family", | 1211 | + pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;", |
1212 | conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False) | 1212 | conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False) |
1213 | pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, reversed, date, exp_method, resolution, issue FROM structure | 1213 | pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, reversed, date, exp_method, resolution, issue FROM structure |
1214 | JOIN chain ON structure.pdb_id = chain.structure_id | 1214 | JOIN chain ON structure.pdb_id = chain.structure_id |
... | @@ -1274,6 +1274,7 @@ class Pipeline: | ... | @@ -1274,6 +1274,7 @@ class Pipeline: |
1274 | 1274 | ||
1275 | conn.close() | 1275 | conn.close() |
1276 | 1276 | ||
1277 | + | ||
1277 | def read_cpu_number(): | 1278 | def read_cpu_number(): |
1278 | # As one shall not use os.cpu_count() on LXC containers, | 1279 | # As one shall not use os.cpu_count() on LXC containers, |
1279 | # because it reads info from /sys which is not the VM resources but the host resources. | 1280 | # because it reads info from /sys which is not the VM resources but the host resources. |
... | @@ -2050,6 +2051,7 @@ def work_pssm(f, fill_gaps): | ... | @@ -2050,6 +2051,7 @@ def work_pssm(f, fill_gaps): |
2050 | idxQueue.put(thr_idx) # replace the thread index in the queue | 2051 | idxQueue.put(thr_idx) # replace the thread index in the queue |
2051 | return 0 | 2052 | return 0 |
2052 | 2053 | ||
2054 | +@trace_unhandled_exceptions | ||
2053 | def work_save(c, homology=True): | 2055 | def work_save(c, homology=True): |
2054 | conn = sqlite3.connect(runDir + "/results/RNANet.db", timeout=15.0) | 2056 | conn = sqlite3.connect(runDir + "/results/RNANet.db", timeout=15.0) |
2055 | if homology: | 2057 | if homology: |
... | @@ -2096,38 +2098,36 @@ if __name__ == "__main__": | ... | @@ -2096,38 +2098,36 @@ if __name__ == "__main__": |
2096 | sql_define_tables(conn) | 2098 | sql_define_tables(conn) |
2097 | print("> Storing results into", runDir + "/results/RNANet.db") | 2099 | print("> Storing results into", runDir + "/results/RNANet.db") |
2098 | 2100 | ||
2099 | - # # compute an update compared to what is in the table "chain" | 2101 | + # compute an update compared to what is in the table "chain" |
2100 | - # #DEBUG: list everything | 2102 | + pp.list_available_mappings() |
2101 | - # pp.REUSE_ALL = True | 2103 | + |
2102 | - # pp.list_available_mappings() | 2104 | + # =========================================================================== |
2103 | - | 2105 | + # 3D information |
2104 | - # # =========================================================================== | 2106 | + # =========================================================================== |
2105 | - # # 3D information | 2107 | + |
2106 | - # # =========================================================================== | 2108 | + # Download and annotate new RNA 3D chains (Chain objects in pp.update) |
2107 | - | 2109 | + pp.dl_and_annotate(coeff_ncores=0.5) |
2108 | - # # Download and annotate new RNA 3D chains (Chain objects in pp.update) | 2110 | + |
2109 | - # pp.dl_and_annotate(coeff_ncores=0.75) | 2111 | + # At this point, the structure table is up to date |
2110 | - | 2112 | + |
2111 | - # # At this point, the structure table is up to date | 2113 | + pp.build_chains(coeff_ncores=2.0) |
2112 | - | 2114 | + if len(pp.to_retry): |
2113 | - # pp.build_chains(coeff_ncores=2.0) | 2115 | + # Redownload and re-annotate |
2114 | - # if len(pp.to_retry): | 2116 | + print("> Retrying to annotate some structures which just failed.", flush=True) |
2115 | - # # Redownload and re-annotate | 2117 | + pp.dl_and_annotate(retry=True, coeff_ncores=0.3) # |
2116 | - # print("> Retrying to annotate some structures which just failed.", flush=True) | 2118 | + pp.build_chains(retry=True, coeff_ncores=1.0) # Use half the cores to reduce required amount of memory |
2117 | - # pp.dl_and_annotate(retry=True, coeff_ncores=0.5) # | 2119 | + print(f"> Loaded {len(pp.loaded_chains)} RNA chains ({len(pp.update) - len(pp.loaded_chains)} errors).") |
2118 | - # pp.build_chains(retry=True, coeff_ncores=1.0) # Use half the cores to reduce required amount of memory | 2120 | + pp.checkpoint_save_chains() |
2119 | - # print(f"> Loaded {len(pp.loaded_chains)} RNA chains ({len(pp.update) - len(pp.loaded_chains)} errors).") | 2121 | + |
2120 | - # pp.checkpoint_save_chains() | 2122 | + if not pp.HOMOLOGY: |
2121 | - | 2123 | + # Save chains to file |
2122 | - # if not pp.HOMOLOGY: | 2124 | + for c in pp.loaded_chains: |
2123 | - # # Save chains to file | 2125 | + work_save(c, homology=False) |
2124 | - # for c in pp.loaded_chains: | 2126 | + print("Completed.") |
2125 | - # work_save(c, homology=False) | 2127 | + exit() |
2126 | - # print("Completed.") | ||
2127 | - # exit() | ||
2128 | 2128 | ||
2129 | - # # At this point, structure, chain and nucleotide tables of the database are up to date. | 2129 | + # At this point, structure, chain and nucleotide tables of the database are up to date. |
2130 | - # # (Modulo some statistics computed by statistics.py) | 2130 | + # (Modulo some statistics computed by statistics.py) |
2131 | 2131 | ||
2132 | # =========================================================================== | 2132 | # =========================================================================== |
2133 | # Homology information | 2133 | # Homology information | ... | ... |
... | @@ -26,11 +26,13 @@ from collections import Counter | ... | @@ -26,11 +26,13 @@ from collections import Counter |
26 | from RNAnet import read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker | 26 | from RNAnet import read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker |
27 | 27 | ||
28 | # This sets the paths | 28 | # This sets the paths |
29 | -path_to_3D_data = "/home/lbecquey/Data/RNA/3D/" | ||
30 | -path_to_seq_data = "/home/lbecquey/Data/RNA/sequences/" | ||
31 | if len(sys.argv) > 1: | 29 | if len(sys.argv) > 1: |
32 | path_to_3D_data = path.abspath(sys.argv[1]) | 30 | path_to_3D_data = path.abspath(sys.argv[1]) |
33 | path_to_seq_data = path.abspath(sys.argv[2]) | 31 | path_to_seq_data = path.abspath(sys.argv[2]) |
32 | +else: | ||
33 | + print("Please set paths to 3D data using command line arguments:") | ||
34 | + print("./statistics.py /path/to/3D/data/ /path/to/sequence/data/") | ||
35 | + exit() | ||
34 | 36 | ||
35 | LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112 | 37 | LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112 |
36 | SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111 | 38 | SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111 |
... | @@ -289,19 +291,32 @@ def parallel_stats_pairs(f): | ... | @@ -289,19 +291,32 @@ def parallel_stats_pairs(f): |
289 | np.where(expanded_list.nts.isin(["GU","UG"]), "Wobble","Other") | 291 | np.where(expanded_list.nts.isin(["GU","UG"]), "Wobble","Other") |
290 | ) | 292 | ) |
291 | ) | 293 | ) |
292 | - # checks | ||
293 | - # ct = pd.crosstab(expanded_list.pair_type_LW, expanded_list.basepair) | ||
294 | - # ct = ct.loc[[ x for x in ["cWW","cHH","cSS","tWW","tHH","tSS"] if x in ct.index ]] | ||
295 | - # for _, symmetric_type in ct.iterrows(): | ||
296 | - # for x in symmetric_type: | ||
297 | - # if x%2: | ||
298 | - # print("Odd number found for", symmetric_type.name, "in chain", cid, flush=True) | ||
299 | - # print(expanded_list, flush=True) | ||
300 | - # exit() | ||
301 | - | ||
302 | expanded_list = expanded_list[["basepair", "pair_type_LW"]] | 294 | expanded_list = expanded_list[["basepair", "pair_type_LW"]] |
295 | + | ||
296 | + # Update the database | ||
297 | + vlcnts = expanded_list.pair_type_LW.value_counts() | ||
298 | + sqldata = ( vlcnts.at["cWW"]/2 if "cWW" in vlcnts.index else 0, | ||
299 | + vlcnts.at["cWH"] if "cWH" in vlcnts.index else 0, | ||
300 | + vlcnts.at["cWS"] if "cWS" in vlcnts.index else 0, | ||
301 | + vlcnts.at["cHH"]/2 if "cHH" in vlcnts.index else 0, | ||
302 | + vlcnts.at["cHS"] if "cHS" in vlcnts.index else 0, | ||
303 | + vlcnts.at["cSS"]/2 if "cSS" in vlcnts.index else 0, | ||
304 | + vlcnts.at["tWW"]/2 if "tWW" in vlcnts.index else 0, | ||
305 | + vlcnts.at["tWH"] if "tWH" in vlcnts.index else 0, | ||
306 | + vlcnts.at["tWS"] if "tWS" in vlcnts.index else 0, | ||
307 | + vlcnts.at["tHH"]/2 if "tHH" in vlcnts.index else 0, | ||
308 | + vlcnts.at["tHS"] if "tHS" in vlcnts.index else 0, | ||
309 | + vlcnts.at["tSS"]/2 if "tSS" in vlcnts.index else 0, | ||
310 | + int(sum(vlcnts.loc[[ str(x) for x in vlcnts.index if "." in str(x)]])/2), | ||
311 | + cid) | ||
312 | + with sqlite3.connect("results/RNANet.db") as conn: | ||
313 | + sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?, | ||
314 | + pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?, | ||
315 | + pair_count_tHH = ?, pair_count_tHS = ?, pair_count_tSS = ?, pair_count_other = ? WHERE chain_id = ?;""", data=sqldata) | ||
316 | + | ||
303 | data.append(expanded_list) | 317 | data.append(expanded_list) |
304 | 318 | ||
319 | + | ||
305 | # merge all the dataframes from all chains of the family | 320 | # merge all the dataframes from all chains of the family |
306 | expanded_list = pd.concat(data) | 321 | expanded_list = pd.concat(data) |
307 | 322 | ||
... | @@ -336,17 +351,6 @@ def stats_pairs(): | ... | @@ -336,17 +351,6 @@ def stats_pairs(): |
336 | fam_pbar.update(1) | 351 | fam_pbar.update(1) |
337 | results.append(fam_df) | 352 | results.append(fam_df) |
338 | allpairs.append(newpairs) | 353 | allpairs.append(newpairs) |
339 | - | ||
340 | - # Checks | ||
341 | - vlcnts= newpairs.pair_type_LW.value_counts() | ||
342 | - identical = [fam_df[i][0] == newpairs.pair_type_LW.value_counts().at[i] for i in fam_df.columns] | ||
343 | - if False in identical: | ||
344 | - print(fam_df) | ||
345 | - print(vlcnts) | ||
346 | - print("Dataframes differ for",fam_df.index[0], flush=True) | ||
347 | - for x in ["cWW","cHH","cSS","tWW","tHH","tSS"]: | ||
348 | - if x in vlcnts.index and vlcnts[x] % 2: | ||
349 | - print("Trouvé un nombre impair de",x,"dans",fam_df.index[0], flush=True) | ||
350 | fam_pbar.close() | 354 | fam_pbar.close() |
351 | p.close() | 355 | p.close() |
352 | p.join() | 356 | p.join() |
... | @@ -359,10 +363,6 @@ def stats_pairs(): | ... | @@ -359,10 +363,6 @@ def stats_pairs(): |
359 | 363 | ||
360 | all_pairs = pd.concat(allpairs) | 364 | all_pairs = pd.concat(allpairs) |
361 | df = pd.concat(results).fillna(0) | 365 | df = pd.concat(results).fillna(0) |
362 | - vlcnts= all_pairs.pair_type_LW.value_counts() | ||
363 | - for x in ["cWW","cHH","cSS","tWW","tHH","tSS"]: | ||
364 | - if x in vlcnts.index and vlcnts[x] % 2: | ||
365 | - print("Trouvé un nombre impair de",x,"après le merge !", flush=True) | ||
366 | df.to_csv("data/pair_counts.csv") | 366 | df.to_csv("data/pair_counts.csv") |
367 | all_pairs.to_csv("data/all_pairs.csv") | 367 | all_pairs.to_csv("data/all_pairs.csv") |
368 | else: | 368 | else: |
... | @@ -375,18 +375,16 @@ def stats_pairs(): | ... | @@ -375,18 +375,16 @@ def stats_pairs(): |
375 | # Remove not very well defined pair types (not in the 12 LW types) | 375 | # Remove not very well defined pair types (not in the 12 LW types) |
376 | df['other'] = df[col_list].sum(axis=1) | 376 | df['other'] = df[col_list].sum(axis=1) |
377 | df.drop(col_list, axis=1, inplace=True) | 377 | df.drop(col_list, axis=1, inplace=True) |
378 | - crosstab = crosstab.append(crosstab.loc[col_list].sum(axis=0).rename("Other")) | 378 | + crosstab = crosstab.append(crosstab.loc[col_list].sum(axis=0).rename("non-LW")) |
379 | 379 | ||
380 | # drop duplicate types | 380 | # drop duplicate types |
381 | # The twelve Leontis-Westhof types are | 381 | # The twelve Leontis-Westhof types are |
382 | # cWW cWH cWS cHH cHS cSS (do not count cHW cSW and cSH, they are the same as their opposites) | 382 | # cWW cWH cWS cHH cHS cSS (do not count cHW cSW and cSH, they are the same as their opposites) |
383 | # tWW tWH tWS tHH tHS tSS (do not count tHW tSW and tSH, they are the same as their opposites) | 383 | # tWW tWH tWS tHH tHS tSS (do not count tHW tSW and tSH, they are the same as their opposites) |
384 | - df.drop([ x for x in [ "cHW", "tHW", "cSW", "tSW", "cHS", "tHS"] if x in df.columns], axis=1) | 384 | + df = df.drop([ x for x in [ "cHW", "tHW", "cSW", "tSW", "cHS", "tHS"] if x in df.columns], axis=1) |
385 | - crosstab = crosstab.loc[[ x for x in ["cWW","cWH","cWS","cHH","cHS","cSS","tWW","tWH","tWS","tHH","tHS","tSS","Other"] if x in crosstab.index]] | 385 | + crosstab = crosstab.loc[[ x for x in ["cWW","cWH","cWS","cHH","cHS","cSS","tWW","tWH","tWS","tHH","tHS","tSS","non-LW"] if x in crosstab.index]] |
386 | df.loc[:,[x for x in ["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "other"] if x in df.columns] ] /= 2 | 386 | df.loc[:,[x for x in ["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "other"] if x in df.columns] ] /= 2 |
387 | - # crosstab.loc[["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "Other"]] /= 2 | 387 | + crosstab.loc[["cWW", "tWW", "cHH", "tHH", "cSS", "tSS", "non-LW"]] /= 2 |
388 | - print(crosstab) | ||
389 | - print(df) | ||
390 | 388 | ||
391 | # Compute total row | 389 | # Compute total row |
392 | total_series = df.sum(numeric_only=True).rename("TOTAL") | 390 | total_series = df.sum(numeric_only=True).rename("TOTAL") |
... | @@ -397,15 +395,16 @@ def stats_pairs(): | ... | @@ -397,15 +395,16 @@ def stats_pairs(): |
397 | 395 | ||
398 | # reorder columns | 396 | # reorder columns |
399 | df.sort_values("TOTAL", axis=1, inplace=True, ascending=False) | 397 | df.sort_values("TOTAL", axis=1, inplace=True, ascending=False) |
398 | + crosstab = crosstab[["AU", "GC", "Wobble", "Other"]] | ||
400 | 399 | ||
401 | # Save to CSV | 400 | # Save to CSV |
402 | - df.to_csv("results/pairings.csv") | 401 | + df.to_csv("results/pair_types.csv") |
403 | 402 | ||
404 | # Plot barplot of overall types | 403 | # Plot barplot of overall types |
405 | - total_series.sort_values(ascending=False, inplace=True) | 404 | + ax = crosstab.plot(figsize=(8,5), kind='bar', stacked=True, log=False, fontsize=13) |
406 | - ax = total_series.plot(figsize=(5,3), kind='bar', log=True, ylim=(1e4,5000000) ) | 405 | + ax.set_ylabel("Number of observations (millions)", fontsize=13) |
407 | - ax.set_ylabel("Number of observations") | 406 | + ax.set_xlabel(None) |
408 | - plt.subplots_adjust(bottom=0.2, right=0.99) | 407 | + plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99) |
409 | plt.savefig("results/figures/pairings.png") | 408 | plt.savefig("results/figures/pairings.png") |
410 | 409 | ||
411 | notify("Computed nucleotide statistics and saved CSV and PNG file.") | 410 | notify("Computed nucleotide statistics and saved CSV and PNG file.") |
... | @@ -416,7 +415,7 @@ def to_dist_matrix(f): | ... | @@ -416,7 +415,7 @@ def to_dist_matrix(f): |
416 | return 0 | 415 | return 0 |
417 | 416 | ||
418 | dm = DistanceCalculator('identity') | 417 | dm = DistanceCalculator('identity') |
419 | - with open(path_to_seq_data+"realigned/"+f+"++.afa") as al_file: | 418 | + with open(path_to_seq_data+"/realigned/"+f+"++.afa") as al_file: |
420 | al = AlignIO.read(al_file, "fasta")[-len(mappings_list[f]):] | 419 | al = AlignIO.read(al_file, "fasta")[-len(mappings_list[f]):] |
421 | idty = dm.get_distance(al).matrix # list of lists | 420 | idty = dm.get_distance(al).matrix # list of lists |
422 | del al | 421 | del al |
... | @@ -457,7 +456,7 @@ def seq_idty(): | ... | @@ -457,7 +456,7 @@ def seq_idty(): |
457 | for f, D in zip(famlist, fam_arrays): | 456 | for f, D in zip(famlist, fam_arrays): |
458 | if not len(D): continue | 457 | if not len(D): continue |
459 | a = 1.0 - np.average(D + D.T) # Get symmetric matrix instead of lower triangle + convert from distance matrix to identity matrix | 458 | a = 1.0 - np.average(D + D.T) # Get symmetric matrix instead of lower triangle + convert from distance matrix to identity matrix |
460 | - conn.execute(f"UPDATE family SET idty_percent = {float(a)} WHERE rfam_acc = '{f}';") | 459 | + conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';") |
461 | conn.commit() | 460 | conn.commit() |
462 | conn.close() | 461 | conn.close() |
463 | 462 | ... | ... |
-
Please register or login to post a comment