Louis BECQUEY

corrections to the statistics

@@ -1410,6 +1410,7 @@ class Pipeline:
 
         # Start a process pool to dispatch the RNA families,
         # over multiple CPUs (one family by CPU)
+        # p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=1)
         p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
 
         try:
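
The new commented-out line keeps a single-process Pool at hand for debugging, while the active call still passes tqdm's shared lock to every worker through init_worker. That initializer is defined elsewhere in the file and not shown in this hunk; a minimal sketch of what such an initializer typically does (an assumption, the project's real function may also set up signal handling or globals):

from multiprocessing import Pool
from tqdm import tqdm

def init_worker(tqdm_lock=None):
    # Keep the lock handed over by the parent process, so that progress bars
    # refreshed from different worker processes do not garble each other's output.
    if tqdm_lock is not None:
        tqdm.set_lock(tqdm_lock)

# p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
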
@@ -2407,7 +2408,7 @@ def work_pssm_remap(f, fill_gaps):
             continue
 
         # Check if the chain existed before in the database
-        if chains_ids.index(s.id) in list_of_chains.keys():
+        if s.id in chains_ids:
             # a chain object is found in the update, this sequence is new
             this_chain = list_of_chains[chains_ids.index(s.id)]
             seq_to_align = this_chain.seq_to_align
@@ -2415,12 +2416,10 @@ def work_pssm_remap(f, fill_gaps):
             db_id = this_chain.db_chain_id
         else:
             # it existed in the database before.
-            this_chain = None
-
             # Get the chain id in the database
             conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=10.0)
             conn.execute('pragma journal_mode=wal')
-            db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = {s.id.split('[')[0]} AND chain_name = {s.id.split('-')[1]} AND rfam_acc = {f};")
+            db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = '{s.id.split('[')[0]}' AND chain_name = '{s.id.split('-')[1]}' AND rfam_acc = '{f}';")
             if len(db_id):
                 db_id = db_id[0][0]
             else:
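
The quoting fix above wraps the interpolated structure id, chain name and Rfam accession in single quotes so SQLite parses them as string literals rather than bare identifiers. An equivalent, injection-proof way to write this kind of lookup is parameter binding; the sketch below uses the sqlite3 module directly instead of the project's sql_ask_database() helper, and the function name is illustrative only:

import sqlite3

def get_chain_id(conn, seq_id, rfam_acc):
    # seq_id looks like "1ABC[1]-A" in the alignments: PDB id, model, chain name.
    structure_id = seq_id.split('[')[0]
    chain_name = seq_id.split('-')[1]
    cur = conn.execute(
        "SELECT chain_id FROM chain WHERE structure_id = ? AND chain_name = ? AND rfam_acc = ?;",
        (structure_id, chain_name, rfam_acc),
    )
    row = cur.fetchone()
    return row[0] if row is not None else None
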
@@ -2430,7 +2429,6 @@ def work_pssm_remap(f, fill_gaps):
                 continue
             seq_to_align = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;")])
             full_length = len(seq_to_align)
-
             conn.close()
 
         # Save colums in the appropriate positions
@@ -2501,7 +2499,7 @@ def work_pssm_remap(f, fill_gaps):
                 many=True, data=re_mappings)
 
     # Delete alignment columns that are not used anymore from the database
-    current_family_columns = [ x[0] for x in sql_ask_database(conn, f"SELECT index_ali FROM align_column WHERE rfam_acc = {f};")]
+    current_family_columns = [ x[0] for x in sql_ask_database(conn, f"SELECT index_ali FROM align_column WHERE rfam_acc = '{f}';")]
     unused = []
     for col in current_family_columns:
         if col not in columns_to_save:
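
The loop that follows collects the alignment columns recorded in align_column but absent from columns_to_save. Not part of the commit, but the same selection can be written as a set difference, which also avoids a quadratic membership test if columns_to_save is a plain list:

# Equivalent to the unused-columns loop above (illustrative simplification only):
unused = sorted(set(current_family_columns) - set(columns_to_save))
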
@@ -2535,21 +2533,16 @@ def work_pssm_remap(f, fill_gaps):
         for s in align:
             if not '[' in s.id: # this is a Rfamseq entry, not a 3D chain
                 continue
-
-            # get the right 3D chain:
-            if chains_ids.index(s.id) in list_of_chains.keys():
-                db_id = list_of_chains[chains_ids.index(s.id)].db_chain_id
-                seq = this_chain.seq
-                full_length = this_chain.full_length
+
+            db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = '{s.id.split('[')[0]}' AND chain_name = '{s.id.split('-')[1]}' AND rfam_acc = '{f}';")
+            if len(db_id):
+                db_id = db_id[0][0]
             else:
-                db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = {s.id.split('[')[0]} AND chain_name = {s.id.split('-')[1]} AND rfam_acc = {f};")
-                if len(db_id):
-                    db_id = db_id[0][0]
-                else:
-                    pbar.update(1)
-                    continue
-            seq = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;") ])
-            full_length = len(seq)
+                pbar.update(1)
+                continue
+            seq = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;") ])
+            aliseq = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;") ])
+            full_length = len(seq)
 
             # detect gaps
             c_seq = list(seq) # contains "ACGUNacgu-"
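
In the rewritten block, seq holds the chain's nt_code letters and aliseq its nt_align_code counterpart, both read back from the database, and the code then scans seq for '-' characters marking unresolved positions. A hedged sketch of that gap detection step (the actual gap-filling logic driven by the fill_gaps argument is not shown in this hunk):

# Positions without a resolved nucleotide are stored as '-' in nt_code.
c_seq = list(seq)                                   # letters from "ACGUNacgu-"
gaps = [i for i, letter in enumerate(c_seq) if letter == '-']
# With fill_gaps, such positions could then be replaced by the most frequent
# letter of the corresponding alignment column (an assumption about the intent,
# not a copy of the project's code).
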
@@ -2638,47 +2631,47 @@ if __name__ == "__main__":
     sql_define_tables(conn)
     print("> Storing results into", runDir + "/results/RNANet.db")
 
-    # compute an update compared to what is in the table "chain" (comparison on structure_id + chain_name + rfam_acc).
-    # If --all was passed, all the structures are kept.
-    # Fills pp.update with Chain() objects.
-    pp.list_available_mappings()
+    # # compute an update compared to what is in the table "chain" (comparison on structure_id + chain_name + rfam_acc).
+    # # If --all was passed, all the structures are kept.
+    # # Fills pp.update with Chain() objects.
+    # pp.list_available_mappings()
 
     # ===========================================================================
     # 3D information
     # ===========================================================================
 
-    # Download and annotate new RNA 3D chains (Chain objects in pp.update)
-    # If the original cif file and/or the Json DSSR annotation file already exist, they are not redownloaded/recomputed.
-    pp.dl_and_annotate(coeff_ncores=0.5)
-    print("Here we go.")
-
-    # At this point, the structure table is up to date.
-    # Now save the DSSR annotations to the database.
-    # Extract the 3D chains to separate structure files if asked with --extract.
-    pp.build_chains(coeff_ncores=1.0)
-
-    if len(pp.to_retry):
-        # Redownload and re-annotate
-        print("> Retrying to annotate some structures which just failed.", flush=True)
-        pp.dl_and_annotate(retry=True, coeff_ncores=0.3) #
-        pp.build_chains(retry=True, coeff_ncores=1.0) # Use half the cores to reduce required amount of memory
-    print(f"> Loaded {len(pp.loaded_chains)} RNA chains ({len(pp.update) - len(pp.loaded_chains)} ignored/errors).")
-    if len(no_nts_set):
-        print(f"Among errors, {len(no_nts_set)} structures seem to contain RNA chains without defined nucleotides:", no_nts_set, flush=True)
-    if len(weird_mappings):
-        print(f"{len(weird_mappings)} mappings to Rfam were taken as absolute positions instead of residue numbers:", weird_mappings, flush=True)
-    if pp.SELECT_ONLY is None:
-        pp.checkpoint_save_chains()
-
-    if not pp.HOMOLOGY:
-        # Save chains to file
-        for c in pp.loaded_chains:
-            work_save(c, homology=False)
-        print("Completed.")
-        exit(0)
-
-    # At this point, structure, chain and nucleotide tables of the database are up to date.
-    # (Modulo some statistics computed by statistics.py)
+    # # Download and annotate new RNA 3D chains (Chain objects in pp.update)
+    # # If the original cif file and/or the Json DSSR annotation file already exist, they are not redownloaded/recomputed.
+    # pp.dl_and_annotate(coeff_ncores=0.5)
+    # print("Here we go.")
+
+    # # At this point, the structure table is up to date.
+    # # Now save the DSSR annotations to the database.
+    # # Extract the 3D chains to separate structure files if asked with --extract.
+    # pp.build_chains(coeff_ncores=1.0)
+
+    # if len(pp.to_retry):
+    #     # Redownload and re-annotate
+    #     print("> Retrying to annotate some structures which just failed.", flush=True)
+    #     pp.dl_and_annotate(retry=True, coeff_ncores=0.3) #
+    #     pp.build_chains(retry=True, coeff_ncores=1.0) # Use half the cores to reduce required amount of memory
+    # print(f"> Loaded {len(pp.loaded_chains)} RNA chains ({len(pp.update) - len(pp.loaded_chains)} ignored/errors).")
+    # if len(no_nts_set):
+    #     print(f"Among errors, {len(no_nts_set)} structures seem to contain RNA chains without defined nucleotides:", no_nts_set, flush=True)
+    # if len(weird_mappings):
+    #     print(f"{len(weird_mappings)} mappings to Rfam were taken as absolute positions instead of residue numbers:", weird_mappings, flush=True)
+    # if pp.SELECT_ONLY is None:
+    #     pp.checkpoint_save_chains()
+
+    # if not pp.HOMOLOGY:
+    #     # Save chains to file
+    #     for c in pp.loaded_chains:
+    #         work_save(c, homology=False)
+    #     print("Completed.")
+    #     exit(0)
+
+    # # At this point, structure, chain and nucleotide tables of the database are up to date.
+    # # (Modulo some statistics computed by statistics.py)
 
     # ===========================================================================
     # Homology information
@@ -2700,8 +2693,8 @@ if __name__ == "__main__":
     pp.fam_list = sorted(rfam_acc_to_download.keys())
 
     if len(pp.fam_list):
-        pp.prepare_sequences()
-        pp.realign()
+        # pp.prepare_sequences()
+        # pp.realign()
 
         # At this point, the family table is almost up to date
         # (lacking idty_percent and ali_filtered_length, both set in statistics.py)
The remaining hunks modify statistics.py:
@@ -1,4 +1,4 @@
-#!/usr/bin/python3.8
+#!/usr/bin/python3
 
 # This file computes additional statistics over the produced dataset.
 # Run this file if you want the base counts, pair-type counts, identity percents, etc
@@ -74,6 +74,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=2.0):
 
     # Extract the angle values of c2'-endo and c3'-endo nucleotides
     with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
+        conn.execute('pragma journal_mode=wal')
         df = pd.read_sql(f"""SELECT {angle}, th{angle}
                              FROM (
                              SELECT chain_id FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
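
The commit adds conn.execute('pragma journal_mode=wal') right after almost every sqlite3.connect() call in statistics.py, matching what the pipeline script already does: with write-ahead logging, the statistics workers can keep reading RNANet.db while another process writes to it. A small helper could factor this out; the function below is only an illustration, not something the commit introduces:

import sqlite3

def connect_wal(db_path, timeout=10.0):
    # Open the database and switch it to write-ahead logging, so that
    # concurrent readers are not blocked by a writer (and vice versa).
    conn = sqlite3.connect(db_path, timeout=timeout)
    conn.execute('pragma journal_mode=wal')
    return conn

# with connect_wal(runDir + "/results/RNANet.db") as conn:
#     ...
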
@@ -188,8 +189,12 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=2.0):
     if show:
         fig.show()
     plt.close()
+
+    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
+
     # print(f"[{worker_nbr}]\tComputed joint distribution of angles (C{carbon}) and saved the figures.")
 
+@trace_unhandled_exceptions
 def stats_len():
     """Plots statistics on chain lengths in RNA families.
     Uses all chains mapped to a family including copies, inferred or not.
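
The @trace_unhandled_exceptions decorator is applied here and to the other statistics workers below; its definition is not part of this diff. Decorators of this kind usually wrap the worker in a try/except that prints the traceback, since exceptions raised inside a multiprocessing.Pool worker are otherwise easy to lose. A hedged sketch (the project's real implementation may log elsewhere or re-raise):

import functools
import traceback

def trace_unhandled_exceptions(func):
    @functools.wraps(func)
    def wrapped(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            # Make worker crashes visible instead of dying silently in the pool.
            print(f"Unhandled exception in {func.__name__}:")
            traceback.print_exc()
    return wrapped
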
@@ -222,6 +227,7 @@ def stats_len():
 
     # Get the lengths of chains
     with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
+        conn.execute('pragma journal_mode=wal')
         l = [ x[0] for x in sql_ask_database(conn, f"""SELECT COUNT(index_chain)
                                                        FROM (
                                                            SELECT chain_id
@@ -259,6 +265,7 @@ def stats_len():
     # Save the figure
     fig.savefig(runDir + f"/results/figures/lengths_{res_thr}A.png")
     idxQueue.put(thr_idx) # replace the thread index in the queue
+    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
     # notify("Computed sequence length statistics and saved the figure.")
 
 def format_percentage(tot, x):
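
Each worker now renames its process once it has put its index back on idxQueue, so ps or htop show at a glance which statistics workers are still busy. setproctitle is the PyPI package of the same name; a minimal standalone use looks like this (thr_idx is hard-coded only to keep the snippet self-contained):

from setproctitle import setproctitle

thr_idx = 0  # in statistics.py this index is taken from idxQueue
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
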
@@ -273,6 +280,7 @@ def format_percentage(tot, x):
         x = "<.01"
     return x + '%'
 
+@trace_unhandled_exceptions
 def stats_freq():
     """Computes base frequencies in all RNA families.
     Uses all chains mapped to a family including copies, inferred or not.
@@ -294,6 +302,7 @@ def stats_freq():
     # List all nt_names happening within a RNA family and store the counts in the Counter
     for f in tqdm(famlist, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", unit="family", leave=False):
         with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
+            conn.execute('pragma journal_mode=wal')
             counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;", warn_every=0))
         freqs[f].update(counts)
 
@@ -305,6 +314,7 @@ def stats_freq():
     df = df.fillna(0)
     df.to_csv(runDir + "/results/frequencies.csv")
     idxQueue.put(thr_idx) # replace the thread index in the queue
+    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
     # notify("Saved nucleotide frequencies to CSV file.")
 
 @trace_unhandled_exceptions
@@ -327,6 +337,7 @@ def parallel_stats_pairs(f):
     sqldata = []
     for cid in tqdm(chain_id_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} basepair types", unit="chain",leave=False):
         with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
+            conn.execute('pragma journal_mode=wal')
             # Get comma separated lists of basepairs per nucleotide
             interactions = pd.DataFrame(
                 sql_ask_database(conn, f"SELECT nt_code as nt1, index_chain, paired, pair_type_LW FROM nucleotide WHERE chain_id='{cid}';"),
@@ -413,7 +424,9 @@ def parallel_stats_pairs(f):
     expanded_list.to_csv(runDir + f"/data/{f}_pairs.csv")
 
     idxQueue.put(thr_idx) # replace the thread index in the queue
+    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
 
+@trace_unhandled_exceptions
 def to_id_matrix(f):
     """
     Extracts sequences of 3D chains from the family alignments to a distinct STK file,
@@ -451,7 +464,8 @@ def to_id_matrix(f):
     # Out-of-scope task : update the database with the length of the filtered alignment:
     align = AlignIO.read(path_to_seq_data+f"/realigned/{f}_3d_only.afa", "fasta")
     with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
-        sql_execute(conn, """UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;""", many=True, data=(align.get_alignment_length(), f))
+        conn.execute('pragma journal_mode=wal')
+        sql_execute(conn, "UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=[align.get_alignment_length(), f])
     del align
 
     # Prepare the job
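
Besides enabling WAL, this hunk fixes how the single (alignment length, rfam_acc) pair is passed to sql_execute: with many=True the helper presumably hands the data to executemany(), which expects a sequence of parameter tuples, so one flat tuple would be treated as several separate (and invalid) rows. The sketch below illustrates the two calling conventions with plain sqlite3; sql_execute itself is the project's wrapper and its internals are assumed here:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE family (rfam_acc TEXT PRIMARY KEY, ali_filtered_len INT)")
conn.executemany("INSERT INTO family (rfam_acc) VALUES (?)", [('RF00005',), ('RF00162',)])

# One statement with one parameter set: bind a single tuple/list, as the
# corrected sql_execute(..., data=[...]) call now does.
conn.execute("UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", (71, 'RF00005'))

# Several parameter sets: executemany() takes a sequence of tuples, one per row.
conn.executemany("UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;",
                 [(71, 'RF00005'), (88, 'RF00162')])
conn.commit()
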
@@ -484,8 +498,10 @@ def to_id_matrix(f):
     np.save("data/"+f+".npy", id_matrix)
 
     idxQueue.put(thr_idx) # replace the thread index in the queue
+    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
     return 0
 
+@trace_unhandled_exceptions
 def seq_idty():
     """Computes identity matrices for each of the RNA families.
 
@@ -504,6 +520,7 @@ def seq_idty():
 
     # Update database with identity percentages
     conn = sqlite3.connect(runDir + "/results/RNANet.db")
+    conn.execute('pragma journal_mode=wal')
     for f, D in zip(fams_to_plot, fam_arrays):
         if not len(D): continue
         if D.shape[0] > 1:
@@ -547,6 +564,7 @@ def seq_idty():
     fig.savefig(runDir + f"/results/figures/distances_{res_thr}.png")
     print("> Computed all identity matrices and saved the figure.", flush=True)
 
+@trace_unhandled_exceptions
 def stats_pairs():
     """Counts occurrences of intra-chain base-pair types in RNA families
 
@@ -614,8 +632,10 @@ def stats_pairs():
     plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99)
     plt.savefig(runDir + f"/results/figures/pairings_{res_thr}.png")
 
+    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
     notify("Computed nucleotide statistics and saved CSV and PNG file.")
 
+@trace_unhandled_exceptions
 def per_chain_stats():
     """Computes per-chain frequencies and base-pair type counts.
 
@@ -623,7 +643,8 @@ def per_chain_stats():
 
     setproctitle(f"RNANet statistics.py per_chain_stats()")
 
-    with sqlite3.connect(runDir + "/results/RNANet.db", isolation_level=None) as conn:
+    with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
+        conn.execute('pragma journal_mode=wal')
         # Compute per-chain nucleotide frequencies
         df = pd.read_sql("SELECT SUM(is_A) as A, SUM(is_C) AS C, SUM(is_G) AS G, SUM(is_U) AS U, SUM(is_other) AS O, chain_id FROM nucleotide GROUP BY chain_id;", conn)
         df["total"] = pd.Series(df.A + df.C + df.G + df.U + df.O, dtype=np.float64)
@@ -631,11 +652,11 @@ def per_chain_stats():
         df = df.drop("total", axis=1)
 
         # Set the values
-        conn.execute('pragma journal_mode=wal')
         sql_execute(conn, "UPDATE chain SET chain_freq_A = ?, chain_freq_C = ?, chain_freq_G = ?, chain_freq_U = ?, chain_freq_other = ? WHERE chain_id= ?;",
                     many=True, data=list(df.to_records(index=False)), warn_every=10)
         print("> Updated the database with per-chain base frequencies", flush=True)
 
+@trace_unhandled_exceptions
 def general_stats():
     """
     Number of structures as function of the resolution threshold
@@ -749,6 +770,7 @@ def general_stats():
 
     answers = []
     with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
+        conn.execute('pragma journal_mode=wal')
         for r in reqs:
             answers.append(pd.read_sql(r, conn))
     df_unique = answers[0]
@@ -909,6 +931,7 @@ def general_stats():
                         hspace=0.05, bottom=0.12, top=0.84)
     fig.savefig(runDir + "/results/figures/Nfamilies.png")
     plt.close()
+    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
 
 def log_to_pbar(pbar):
     def update(r):
@@ -981,6 +1004,7 @@ if __name__ == "__main__":
     # Load mappings. famlist will contain only families with structures at this resolution threshold.
     print("Loading mappings list...")
     with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
+        conn.execute('pragma journal_mode=wal')
         n_unmapped_chains = sql_ask_database(conn, "SELECT COUNT(*) FROM chain WHERE rfam_acc='unmappd' AND issue=0;")[0][0]
         families = pd.read_sql(f"""SELECT rfam_acc, count(*) as n_chains
                                    FROM chain JOIN structure