Showing
2 changed files
with
79 additions
and
62 deletions
... | @@ -1410,6 +1410,7 @@ class Pipeline: | ... | @@ -1410,6 +1410,7 @@ class Pipeline: |
1410 | 1410 | ||
1411 | # Start a process pool to dispatch the RNA families, | 1411 | # Start a process pool to dispatch the RNA families, |
1412 | # over multiple CPUs (one family per CPU) | 1412 | # over multiple CPUs (one family per CPU) |
1413 | + # p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=1) | ||
1413 | p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers) | 1414 | p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers) |
1414 | 1415 | ||
1415 | try: | 1416 | try: |
... | @@ -2407,7 +2408,7 @@ def work_pssm_remap(f, fill_gaps): | ... | @@ -2407,7 +2408,7 @@ def work_pssm_remap(f, fill_gaps): |
2407 | continue | 2408 | continue |
2408 | 2409 | ||
2409 | # Check if the chain existed before in the database | 2410 | # Check if the chain existed before in the database |
2410 | - if chains_ids.index(s.id) in list_of_chains.keys(): | 2411 | + if s.id in chains_ids: |
2411 | # a chain object is found in the update, this sequence is new | 2412 | # a chain object is found in the update, this sequence is new |
2412 | this_chain = list_of_chains[chains_ids.index(s.id)] | 2413 | this_chain = list_of_chains[chains_ids.index(s.id)] |
2413 | seq_to_align = this_chain.seq_to_align | 2414 | seq_to_align = this_chain.seq_to_align |
... | @@ -2415,12 +2416,10 @@ def work_pssm_remap(f, fill_gaps): | ... | @@ -2415,12 +2416,10 @@ def work_pssm_remap(f, fill_gaps): |
2415 | db_id = this_chain.db_chain_id | 2416 | db_id = this_chain.db_chain_id |
2416 | else: | 2417 | else: |
2417 | # it existed in the database before. | 2418 | # it existed in the database before. |
2418 | - this_chain = None | ||
2419 | - | ||
2420 | # Get the chain id in the database | 2419 | # Get the chain id in the database |
2421 | conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=10.0) | 2420 | conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=10.0) |
2422 | conn.execute('pragma journal_mode=wal') | 2421 | conn.execute('pragma journal_mode=wal') |
2423 | - db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = {s.id.split('[')[0]} AND chain_name = {s.id.split('-')[1]} AND rfam_acc = {f};") | 2422 | + db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = '{s.id.split('[')[0]}' AND chain_name = '{s.id.split('-')[1]}' AND rfam_acc = '{f}';") |
2424 | if len(db_id): | 2423 | if len(db_id): |
2425 | db_id = db_id[0][0] | 2424 | db_id = db_id[0][0] |
2426 | else: | 2425 | else: |
... | @@ -2430,7 +2429,6 @@ def work_pssm_remap(f, fill_gaps): | ... | @@ -2430,7 +2429,6 @@ def work_pssm_remap(f, fill_gaps): |
2430 | continue | 2429 | continue |
2431 | seq_to_align = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;")]) | 2430 | seq_to_align = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;")]) |
2432 | full_length = len(seq_to_align) | 2431 | full_length = len(seq_to_align) |
2433 | - | ||
2434 | conn.close() | 2432 | conn.close() |
2435 | 2433 | ||
2436 | # Save columns in the appropriate positions | 2434 | # Save columns in the appropriate positions |
... | @@ -2501,7 +2499,7 @@ def work_pssm_remap(f, fill_gaps): | ... | @@ -2501,7 +2499,7 @@ def work_pssm_remap(f, fill_gaps): |
2501 | many=True, data=re_mappings) | 2499 | many=True, data=re_mappings) |
2502 | 2500 | ||
2503 | # Delete alignment columns that are not used anymore from the database | 2501 | # Delete alignment columns that are not used anymore from the database |
2504 | - current_family_columns = [ x[0] for x in sql_ask_database(conn, f"SELECT index_ali FROM align_column WHERE rfam_acc = {f};")] | 2502 | + current_family_columns = [ x[0] for x in sql_ask_database(conn, f"SELECT index_ali FROM align_column WHERE rfam_acc = '{f}';")] |
2505 | unused = [] | 2503 | unused = [] |
2506 | for col in current_family_columns: | 2504 | for col in current_family_columns: |
2507 | if col not in columns_to_save: | 2505 | if col not in columns_to_save: |
... | @@ -2535,21 +2533,16 @@ def work_pssm_remap(f, fill_gaps): | ... | @@ -2535,21 +2533,16 @@ def work_pssm_remap(f, fill_gaps): |
2535 | for s in align: | 2533 | for s in align: |
2536 | if not '[' in s.id: # this is a Rfamseq entry, not a 3D chain | 2534 | if not '[' in s.id: # this is a Rfamseq entry, not a 3D chain |
2537 | continue | 2535 | continue |
2538 | - | 2536 | + |
2539 | - # get the right 3D chain: | 2537 | + db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = '{s.id.split('[')[0]}' AND chain_name = '{s.id.split('-')[1]}' AND rfam_acc = '{f}';") |
2540 | - if chains_ids.index(s.id) in list_of_chains.keys(): | 2538 | + if len(db_id): |
2541 | - db_id = list_of_chains[chains_ids.index(s.id)].db_chain_id | 2539 | + db_id = db_id[0][0] |
2542 | - seq = this_chain.seq | ||
2543 | - full_length = this_chain.full_length | ||
2544 | else: | 2540 | else: |
2545 | - db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = {s.id.split('[')[0]} AND chain_name = {s.id.split('-')[1]} AND rfam_acc = {f};") | 2541 | + pbar.update(1) |
2546 | - if len(db_id): | 2542 | + continue |
2547 | - db_id = db_id[0][0] | 2543 | + seq = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;") ]) |
2548 | - else: | 2544 | + aliseq = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;") ]) |
2549 | - pbar.update(1) | 2545 | + full_length = len(seq) |
2550 | - continue | ||
2551 | - seq = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;") ]) | ||
2552 | - full_length = len(seq) | ||
2553 | 2546 | ||
2554 | # detect gaps | 2547 | # detect gaps |
2555 | c_seq = list(seq) # contains "ACGUNacgu-" | 2548 | c_seq = list(seq) # contains "ACGUNacgu-" |
... | @@ -2638,47 +2631,47 @@ if __name__ == "__main__": | ... | @@ -2638,47 +2631,47 @@ if __name__ == "__main__": |
2638 | sql_define_tables(conn) | 2631 | sql_define_tables(conn) |
2639 | print("> Storing results into", runDir + "/results/RNANet.db") | 2632 | print("> Storing results into", runDir + "/results/RNANet.db") |
2640 | 2633 | ||
2641 | - # compute an update compared to what is in the table "chain" (comparison on structure_id + chain_name + rfam_acc). | 2634 | + # # compute an update compared to what is in the table "chain" (comparison on structure_id + chain_name + rfam_acc). |
2642 | - # If --all was passed, all the structures are kept. | 2635 | + # # If --all was passed, all the structures are kept. |
2643 | - # Fills pp.update with Chain() objects. | 2636 | + # # Fills pp.update with Chain() objects. |
2644 | - pp.list_available_mappings() | 2637 | + # pp.list_available_mappings() |
2645 | 2638 | ||
2646 | # =========================================================================== | 2639 | # =========================================================================== |
2647 | # 3D information | 2640 | # 3D information |
2648 | # =========================================================================== | 2641 | # =========================================================================== |
2649 | 2642 | ||
2650 | - # Download and annotate new RNA 3D chains (Chain objects in pp.update) | 2643 | + # # Download and annotate new RNA 3D chains (Chain objects in pp.update) |
2651 | - # If the original cif file and/or the Json DSSR annotation file already exist, they are not redownloaded/recomputed. | 2644 | + # # If the original cif file and/or the Json DSSR annotation file already exist, they are not redownloaded/recomputed. |
2652 | - pp.dl_and_annotate(coeff_ncores=0.5) | 2645 | + # pp.dl_and_annotate(coeff_ncores=0.5) |
2653 | - print("Here we go.") | 2646 | + # print("Here we go.") |
2654 | - | 2647 | + |
2655 | - # At this point, the structure table is up to date. | 2648 | + # # At this point, the structure table is up to date. |
2656 | - # Now save the DSSR annotations to the database. | 2649 | + # # Now save the DSSR annotations to the database. |
2657 | - # Extract the 3D chains to separate structure files if asked with --extract. | 2650 | + # # Extract the 3D chains to separate structure files if asked with --extract. |
2658 | - pp.build_chains(coeff_ncores=1.0) | 2651 | + # pp.build_chains(coeff_ncores=1.0) |
2659 | - | 2652 | + |
2660 | - if len(pp.to_retry): | 2653 | + # if len(pp.to_retry): |
2661 | - # Redownload and re-annotate | 2654 | + # # Redownload and re-annotate |
2662 | - print("> Retrying to annotate some structures which just failed.", flush=True) | 2655 | + # print("> Retrying to annotate some structures which just failed.", flush=True) |
2663 | - pp.dl_and_annotate(retry=True, coeff_ncores=0.3) # | 2656 | + # pp.dl_and_annotate(retry=True, coeff_ncores=0.3) # |
2664 | - pp.build_chains(retry=True, coeff_ncores=1.0) # Use half the cores to reduce required amount of memory | 2657 | + # pp.build_chains(retry=True, coeff_ncores=1.0) # Use half the cores to reduce required amount of memory |
2665 | - print(f"> Loaded {len(pp.loaded_chains)} RNA chains ({len(pp.update) - len(pp.loaded_chains)} ignored/errors).") | 2658 | + # print(f"> Loaded {len(pp.loaded_chains)} RNA chains ({len(pp.update) - len(pp.loaded_chains)} ignored/errors).") |
2666 | - if len(no_nts_set): | 2659 | + # if len(no_nts_set): |
2667 | - print(f"Among errors, {len(no_nts_set)} structures seem to contain RNA chains without defined nucleotides:", no_nts_set, flush=True) | 2660 | + # print(f"Among errors, {len(no_nts_set)} structures seem to contain RNA chains without defined nucleotides:", no_nts_set, flush=True) |
2668 | - if len(weird_mappings): | 2661 | + # if len(weird_mappings): |
2669 | - print(f"{len(weird_mappings)} mappings to Rfam were taken as absolute positions instead of residue numbers:", weird_mappings, flush=True) | 2662 | + # print(f"{len(weird_mappings)} mappings to Rfam were taken as absolute positions instead of residue numbers:", weird_mappings, flush=True) |
2670 | - if pp.SELECT_ONLY is None: | 2663 | + # if pp.SELECT_ONLY is None: |
2671 | - pp.checkpoint_save_chains() | 2664 | + # pp.checkpoint_save_chains() |
2672 | - | 2665 | + |
2673 | - if not pp.HOMOLOGY: | 2666 | + # if not pp.HOMOLOGY: |
2674 | - # Save chains to file | 2667 | + # # Save chains to file |
2675 | - for c in pp.loaded_chains: | 2668 | + # for c in pp.loaded_chains: |
2676 | - work_save(c, homology=False) | 2669 | + # work_save(c, homology=False) |
2677 | - print("Completed.") | 2670 | + # print("Completed.") |
2678 | - exit(0) | 2671 | + # exit(0) |
2679 | - | 2672 | + |
2680 | - # At this point, structure, chain and nucleotide tables of the database are up to date. | 2673 | + # # At this point, structure, chain and nucleotide tables of the database are up to date. |
2681 | - # (Modulo some statistics computed by statistics.py) | 2674 | + # # (Modulo some statistics computed by statistics.py) |
2682 | 2675 | ||
2683 | # =========================================================================== | 2676 | # =========================================================================== |
2684 | # Homology information | 2677 | # Homology information |
... | @@ -2700,8 +2693,8 @@ if __name__ == "__main__": | ... | @@ -2700,8 +2693,8 @@ if __name__ == "__main__": |
2700 | pp.fam_list = sorted(rfam_acc_to_download.keys()) | 2693 | pp.fam_list = sorted(rfam_acc_to_download.keys()) |
2701 | 2694 | ||
2702 | if len(pp.fam_list): | 2695 | if len(pp.fam_list): |
2703 | - pp.prepare_sequences() | 2696 | + # pp.prepare_sequences() |
2704 | - pp.realign() | 2697 | + # pp.realign() |
2705 | 2698 | ||
2706 | # At this point, the family table is almost up to date | 2699 | # At this point, the family table is almost up to date |
2707 | # (lacking idty_percent and ali_filtered_length, both set in statistics.py) | 2700 | # (lacking idty_percent and ali_filtered_length, both set in statistics.py) | ... | ... |
1 | -#!/usr/bin/python3.8 | 1 | +#!/usr/bin/python3 |
2 | 2 | ||
3 | # This file computes additional statistics over the produced dataset. | 3 | # This file computes additional statistics over the produced dataset. |
4 | # Run this file if you want the base counts, pair-type counts, identity percents, etc | 4 | # Run this file if you want the base counts, pair-type counts, identity percents, etc |
... | @@ -74,6 +74,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=2.0): | ... | @@ -74,6 +74,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=2.0): |
74 | 74 | ||
75 | # Extract the angle values of c2'-endo and c3'-endo nucleotides | 75 | # Extract the angle values of c2'-endo and c3'-endo nucleotides |
76 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | 76 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
77 | + conn.execute('pragma journal_mode=wal') | ||
77 | df = pd.read_sql(f"""SELECT {angle}, th{angle} | 78 | df = pd.read_sql(f"""SELECT {angle}, th{angle} |
78 | FROM ( | 79 | FROM ( |
79 | SELECT chain_id FROM chain JOIN structure ON chain.structure_id = structure.pdb_id | 80 | SELECT chain_id FROM chain JOIN structure ON chain.structure_id = structure.pdb_id |
... | @@ -188,8 +189,12 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=2.0): | ... | @@ -188,8 +189,12 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=2.0): |
188 | if show: | 189 | if show: |
189 | fig.show() | 190 | fig.show() |
190 | plt.close() | 191 | plt.close() |
192 | + | ||
193 | + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") | ||
194 | + | ||
191 | # print(f"[{worker_nbr}]\tComputed joint distribution of angles (C{carbon}) and saved the figures.") | 195 | # print(f"[{worker_nbr}]\tComputed joint distribution of angles (C{carbon}) and saved the figures.") |
192 | 196 | ||
197 | +@trace_unhandled_exceptions | ||
193 | def stats_len(): | 198 | def stats_len(): |
194 | """Plots statistics on chain lengths in RNA families. | 199 | """Plots statistics on chain lengths in RNA families. |
195 | Uses all chains mapped to a family including copies, inferred or not. | 200 | Uses all chains mapped to a family including copies, inferred or not. |
... | @@ -222,6 +227,7 @@ def stats_len(): | ... | @@ -222,6 +227,7 @@ def stats_len(): |
222 | 227 | ||
223 | # Get the lengths of chains | 228 | # Get the lengths of chains |
224 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | 229 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
230 | + conn.execute('pragma journal_mode=wal') | ||
225 | l = [ x[0] for x in sql_ask_database(conn, f"""SELECT COUNT(index_chain) | 231 | l = [ x[0] for x in sql_ask_database(conn, f"""SELECT COUNT(index_chain) |
226 | FROM ( | 232 | FROM ( |
227 | SELECT chain_id | 233 | SELECT chain_id |
... | @@ -259,6 +265,7 @@ def stats_len(): | ... | @@ -259,6 +265,7 @@ def stats_len(): |
259 | # Save the figure | 265 | # Save the figure |
260 | fig.savefig(runDir + f"/results/figures/lengths_{res_thr}A.png") | 266 | fig.savefig(runDir + f"/results/figures/lengths_{res_thr}A.png") |
261 | idxQueue.put(thr_idx) # replace the thread index in the queue | 267 | idxQueue.put(thr_idx) # replace the thread index in the queue |
268 | + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") | ||
262 | # notify("Computed sequence length statistics and saved the figure.") | 269 | # notify("Computed sequence length statistics and saved the figure.") |
263 | 270 | ||
264 | def format_percentage(tot, x): | 271 | def format_percentage(tot, x): |
... | @@ -273,6 +280,7 @@ def format_percentage(tot, x): | ... | @@ -273,6 +280,7 @@ def format_percentage(tot, x): |
273 | x = "<.01" | 280 | x = "<.01" |
274 | return x + '%' | 281 | return x + '%' |
275 | 282 | ||
283 | +@trace_unhandled_exceptions | ||
276 | def stats_freq(): | 284 | def stats_freq(): |
277 | """Computes base frequencies in all RNA families. | 285 | """Computes base frequencies in all RNA families. |
278 | Uses all chains mapped to a family including copies, inferred or not. | 286 | Uses all chains mapped to a family including copies, inferred or not. |
... | @@ -294,6 +302,7 @@ def stats_freq(): | ... | @@ -294,6 +302,7 @@ def stats_freq(): |
294 | # List all nt_names happening within a RNA family and store the counts in the Counter | 302 | # List all nt_names happening within a RNA family and store the counts in the Counter |
295 | for f in tqdm(famlist, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", unit="family", leave=False): | 303 | for f in tqdm(famlist, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", unit="family", leave=False): |
296 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | 304 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
305 | + conn.execute('pragma journal_mode=wal') | ||
297 | counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;", warn_every=0)) | 306 | counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;", warn_every=0)) |
298 | freqs[f].update(counts) | 307 | freqs[f].update(counts) |
299 | 308 | ||
... | @@ -305,6 +314,7 @@ def stats_freq(): | ... | @@ -305,6 +314,7 @@ def stats_freq(): |
305 | df = df.fillna(0) | 314 | df = df.fillna(0) |
306 | df.to_csv(runDir + "/results/frequencies.csv") | 315 | df.to_csv(runDir + "/results/frequencies.csv") |
307 | idxQueue.put(thr_idx) # replace the thread index in the queue | 316 | idxQueue.put(thr_idx) # replace the thread index in the queue |
317 | + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") | ||
308 | # notify("Saved nucleotide frequencies to CSV file.") | 318 | # notify("Saved nucleotide frequencies to CSV file.") |
309 | 319 | ||
310 | @trace_unhandled_exceptions | 320 | @trace_unhandled_exceptions |
... | @@ -327,6 +337,7 @@ def parallel_stats_pairs(f): | ... | @@ -327,6 +337,7 @@ def parallel_stats_pairs(f): |
327 | sqldata = [] | 337 | sqldata = [] |
328 | for cid in tqdm(chain_id_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} basepair types", unit="chain",leave=False): | 338 | for cid in tqdm(chain_id_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} basepair types", unit="chain",leave=False): |
329 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | 339 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
340 | + conn.execute('pragma journal_mode=wal') | ||
330 | # Get comma separated lists of basepairs per nucleotide | 341 | # Get comma separated lists of basepairs per nucleotide |
331 | interactions = pd.DataFrame( | 342 | interactions = pd.DataFrame( |
332 | sql_ask_database(conn, f"SELECT nt_code as nt1, index_chain, paired, pair_type_LW FROM nucleotide WHERE chain_id='{cid}';"), | 343 | sql_ask_database(conn, f"SELECT nt_code as nt1, index_chain, paired, pair_type_LW FROM nucleotide WHERE chain_id='{cid}';"), |
... | @@ -413,7 +424,9 @@ def parallel_stats_pairs(f): | ... | @@ -413,7 +424,9 @@ def parallel_stats_pairs(f): |
413 | expanded_list.to_csv(runDir + f"/data/{f}_pairs.csv") | 424 | expanded_list.to_csv(runDir + f"/data/{f}_pairs.csv") |
414 | 425 | ||
415 | idxQueue.put(thr_idx) # replace the thread index in the queue | 426 | idxQueue.put(thr_idx) # replace the thread index in the queue |
427 | + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") | ||
416 | 428 | ||
429 | +@trace_unhandled_exceptions | ||
417 | def to_id_matrix(f): | 430 | def to_id_matrix(f): |
418 | """ | 431 | """ |
419 | Extracts sequences of 3D chains from the family alignments to a distinct STK file, | 432 | Extracts sequences of 3D chains from the family alignments to a distinct STK file, |
... | @@ -451,7 +464,8 @@ def to_id_matrix(f): | ... | @@ -451,7 +464,8 @@ def to_id_matrix(f): |
451 | # Out-of-scope task : update the database with the length of the filtered alignment: | 464 | # Out-of-scope task : update the database with the length of the filtered alignment: |
452 | align = AlignIO.read(path_to_seq_data+f"/realigned/{f}_3d_only.afa", "fasta") | 465 | align = AlignIO.read(path_to_seq_data+f"/realigned/{f}_3d_only.afa", "fasta") |
453 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | 466 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
454 | - sql_execute(conn, """UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;""", many=True, data=(align.get_alignment_length(), f)) | 467 | + conn.execute('pragma journal_mode=wal') |
468 | + sql_execute(conn, "UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=[align.get_alignment_length(), f]) | ||
455 | del align | 469 | del align |
456 | 470 | ||
457 | # Prepare the job | 471 | # Prepare the job |
... | @@ -484,8 +498,10 @@ def to_id_matrix(f): | ... | @@ -484,8 +498,10 @@ def to_id_matrix(f): |
484 | np.save("data/"+f+".npy", id_matrix) | 498 | np.save("data/"+f+".npy", id_matrix) |
485 | 499 | ||
486 | idxQueue.put(thr_idx) # replace the thread index in the queue | 500 | idxQueue.put(thr_idx) # replace the thread index in the queue |
501 | + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") | ||
487 | return 0 | 502 | return 0 |
488 | 503 | ||
504 | +@trace_unhandled_exceptions | ||
489 | def seq_idty(): | 505 | def seq_idty(): |
490 | """Computes identity matrices for each of the RNA families. | 506 | """Computes identity matrices for each of the RNA families. |
491 | 507 | ||
... | @@ -504,6 +520,7 @@ def seq_idty(): | ... | @@ -504,6 +520,7 @@ def seq_idty(): |
504 | 520 | ||
505 | # Update database with identity percentages | 521 | # Update database with identity percentages |
506 | conn = sqlite3.connect(runDir + "/results/RNANet.db") | 522 | conn = sqlite3.connect(runDir + "/results/RNANet.db") |
523 | + conn.execute('pragma journal_mode=wal') | ||
507 | for f, D in zip(fams_to_plot, fam_arrays): | 524 | for f, D in zip(fams_to_plot, fam_arrays): |
508 | if not len(D): continue | 525 | if not len(D): continue |
509 | if D.shape[0] > 1: | 526 | if D.shape[0] > 1: |
... | @@ -547,6 +564,7 @@ def seq_idty(): | ... | @@ -547,6 +564,7 @@ def seq_idty(): |
547 | fig.savefig(runDir + f"/results/figures/distances_{res_thr}.png") | 564 | fig.savefig(runDir + f"/results/figures/distances_{res_thr}.png") |
548 | print("> Computed all identity matrices and saved the figure.", flush=True) | 565 | print("> Computed all identity matrices and saved the figure.", flush=True) |
549 | 566 | ||
567 | +@trace_unhandled_exceptions | ||
550 | def stats_pairs(): | 568 | def stats_pairs(): |
551 | """Counts occurrences of intra-chain base-pair types in RNA families | 569 | """Counts occurrences of intra-chain base-pair types in RNA families |
552 | 570 | ||
... | @@ -614,8 +632,10 @@ def stats_pairs(): | ... | @@ -614,8 +632,10 @@ def stats_pairs(): |
614 | plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99) | 632 | plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99) |
615 | plt.savefig(runDir + f"/results/figures/pairings_{res_thr}.png") | 633 | plt.savefig(runDir + f"/results/figures/pairings_{res_thr}.png") |
616 | 634 | ||
635 | + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") | ||
617 | notify("Computed nucleotide statistics and saved CSV and PNG file.") | 636 | notify("Computed nucleotide statistics and saved CSV and PNG file.") |
618 | 637 | ||
638 | +@trace_unhandled_exceptions | ||
619 | def per_chain_stats(): | 639 | def per_chain_stats(): |
620 | """Computes per-chain frequencies and base-pair type counts. | 640 | """Computes per-chain frequencies and base-pair type counts. |
621 | 641 | ||
... | @@ -623,7 +643,8 @@ def per_chain_stats(): | ... | @@ -623,7 +643,8 @@ def per_chain_stats(): |
623 | 643 | ||
624 | setproctitle(f"RNANet statistics.py per_chain_stats()") | 644 | setproctitle(f"RNANet statistics.py per_chain_stats()") |
625 | 645 | ||
626 | - with sqlite3.connect(runDir + "/results/RNANet.db", isolation_level=None) as conn: | 646 | + with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
647 | + conn.execute('pragma journal_mode=wal') | ||
627 | # Compute per-chain nucleotide frequencies | 648 | # Compute per-chain nucleotide frequencies |
628 | df = pd.read_sql("SELECT SUM(is_A) as A, SUM(is_C) AS C, SUM(is_G) AS G, SUM(is_U) AS U, SUM(is_other) AS O, chain_id FROM nucleotide GROUP BY chain_id;", conn) | 649 | df = pd.read_sql("SELECT SUM(is_A) as A, SUM(is_C) AS C, SUM(is_G) AS G, SUM(is_U) AS U, SUM(is_other) AS O, chain_id FROM nucleotide GROUP BY chain_id;", conn) |
629 | df["total"] = pd.Series(df.A + df.C + df.G + df.U + df.O, dtype=np.float64) | 650 | df["total"] = pd.Series(df.A + df.C + df.G + df.U + df.O, dtype=np.float64) |
... | @@ -631,11 +652,11 @@ def per_chain_stats(): | ... | @@ -631,11 +652,11 @@ def per_chain_stats(): |
631 | df = df.drop("total", axis=1) | 652 | df = df.drop("total", axis=1) |
632 | 653 | ||
633 | # Set the values | 654 | # Set the values |
634 | - conn.execute('pragma journal_mode=wal') | ||
635 | sql_execute(conn, "UPDATE chain SET chain_freq_A = ?, chain_freq_C = ?, chain_freq_G = ?, chain_freq_U = ?, chain_freq_other = ? WHERE chain_id= ?;", | 655 | sql_execute(conn, "UPDATE chain SET chain_freq_A = ?, chain_freq_C = ?, chain_freq_G = ?, chain_freq_U = ?, chain_freq_other = ? WHERE chain_id= ?;", |
636 | many=True, data=list(df.to_records(index=False)), warn_every=10) | 656 | many=True, data=list(df.to_records(index=False)), warn_every=10) |
637 | print("> Updated the database with per-chain base frequencies", flush=True) | 657 | print("> Updated the database with per-chain base frequencies", flush=True) |
638 | 658 | ||
659 | +@trace_unhandled_exceptions | ||
639 | def general_stats(): | 660 | def general_stats(): |
640 | """ | 661 | """ |
641 | Number of structures as function of the resolution threshold | 662 | Number of structures as function of the resolution threshold |
... | @@ -749,6 +770,7 @@ def general_stats(): | ... | @@ -749,6 +770,7 @@ def general_stats(): |
749 | 770 | ||
750 | answers = [] | 771 | answers = [] |
751 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | 772 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
773 | + conn.execute('pragma journal_mode=wal') | ||
752 | for r in reqs: | 774 | for r in reqs: |
753 | answers.append(pd.read_sql(r, conn)) | 775 | answers.append(pd.read_sql(r, conn)) |
754 | df_unique = answers[0] | 776 | df_unique = answers[0] |
... | @@ -909,6 +931,7 @@ def general_stats(): | ... | @@ -909,6 +931,7 @@ def general_stats(): |
909 | hspace=0.05, bottom=0.12, top=0.84) | 931 | hspace=0.05, bottom=0.12, top=0.84) |
910 | fig.savefig(runDir + "/results/figures/Nfamilies.png") | 932 | fig.savefig(runDir + "/results/figures/Nfamilies.png") |
911 | plt.close() | 933 | plt.close() |
934 | + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") | ||
912 | 935 | ||
913 | def log_to_pbar(pbar): | 936 | def log_to_pbar(pbar): |
914 | def update(r): | 937 | def update(r): |
... | @@ -981,6 +1004,7 @@ if __name__ == "__main__": | ... | @@ -981,6 +1004,7 @@ if __name__ == "__main__": |
981 | # Load mappings. famlist will contain only families with structures at this resolution threshold. | 1004 | # Load mappings. famlist will contain only families with structures at this resolution threshold. |
982 | print("Loading mappings list...") | 1005 | print("Loading mappings list...") |
983 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | 1006 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
1007 | + conn.execute('pragma journal_mode=wal') | ||
984 | n_unmapped_chains = sql_ask_database(conn, "SELECT COUNT(*) FROM chain WHERE rfam_acc='unmappd' AND issue=0;")[0][0] | 1008 | n_unmapped_chains = sql_ask_database(conn, "SELECT COUNT(*) FROM chain WHERE rfam_acc='unmappd' AND issue=0;")[0][0] |
985 | families = pd.read_sql(f"""SELECT rfam_acc, count(*) as n_chains | 1009 | families = pd.read_sql(f"""SELECT rfam_acc, count(*) as n_chains |
986 | FROM chain JOIN structure | 1010 | FROM chain JOIN structure | ... | ... |
-
Please register or login to post a comment