Louis BECQUEY

Improved automation

...@@ -1389,7 +1389,7 @@ class Pipeline: ...@@ -1389,7 +1389,7 @@ class Pipeline:
1389 # Remove previous precomputed data 1389 # Remove previous precomputed data
1390 subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"]) 1390 subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"])
1391 for f in self.fam_list: 1391 for f in self.fam_list:
1392 - subprocess.run(["rm","-f", f"data/{f}.npy"]) 1392 + subprocess.run(["rm","-f", f"data/{f}.npy", f"data/{f}_pairs.csv", f"data/{f}_counts.csv"])
1393 1393
1394 # Run statistics files 1394 # Run statistics files
1395 os.chdir(runDir) 1395 os.chdir(runDir)
...@@ -1397,13 +1397,12 @@ class Pipeline: ...@@ -1397,13 +1397,12 @@ class Pipeline:
1397 subprocess.run(["python3.8", "statistics.py", path_to_3D_data, path_to_seq_data]) 1397 subprocess.run(["python3.8", "statistics.py", path_to_3D_data, path_to_seq_data])
1398 1398
1399 # Save additional information 1399 # Save additional information
1400 - conn = sqlite3.connect(runDir+"/results/RNANet.db") 1400 + with sqlite3.connect(runDir+"/results/RNANet.db") as conn:
1401 - pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;", 1401 + pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;",
1402 - conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False) 1402 + conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
1403 - pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure 1403 + pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure
1404 - JOIN chain ON structure.pdb_id = chain.structure_id 1404 + JOIN chain ON structure.pdb_id = chain.structure_id
1405 - ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False) 1405 + ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
1406 - conn.close()
1407 1406
1408 # Archive the results 1407 # Archive the results
1409 if self.SELECT_ONLY is None: 1408 if self.SELECT_ONLY is None:
...@@ -2408,6 +2407,7 @@ if __name__ == "__main__": ...@@ -2408,6 +2407,7 @@ if __name__ == "__main__":
2408 rfam_acc_to_download[c.mapping.rfam_acc] = [ c ] 2407 rfam_acc_to_download[c.mapping.rfam_acc] = [ c ]
2409 else: 2408 else:
2410 rfam_acc_to_download[c.mapping.rfam_acc].append(c) 2409 rfam_acc_to_download[c.mapping.rfam_acc].append(c)
2410 +
2411 print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences") 2411 print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences")
2412 pp.fam_list = sorted(rfam_acc_to_download.keys()) 2412 pp.fam_list = sorted(rfam_acc_to_download.keys())
2413 2413
......
1 # This is a script supposed to be run periodically as a cron job 1 # This is a script supposed to be run periodically as a cron job
2 2
3 -cd /home/lbecquey/Projects/RNANet; 3 +cd /home/lbecquey/Projects/RNANet
4 -rm -f nohup.out errors.txt; 4 +rm -f latest_run.log errors.txt
5 5
6 # Run RNANet 6 # Run RNANet
7 -nohup bash -c 'time ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -s -r 20.0 --archive'; 7 +bash -c 'time ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 -s --archive' &> latest_run.log
8 +touch results/RNANet.db # update last modification date
9 +rm -f results/RNANet.db-wal results/RNANet.db-shm # SQLite temporary files
8 10
9 # Compress 11 # Compress
10 -rm -f results/RNANet.db.gz 12 +rm -f /home/lbecquey/Projects/RNANet/results/RNANet.db.gz
11 -gzip -k results/RNANet.db 13 +echo 'Deleted results/RNANet.db.gz (if existed)' >> latest_run.log
14 +gzip -k /home/lbecquey/Projects/RNANet/results/RNANet.db
15 +echo 'Recreated it.' >> latest_run.log
12 16
13 # Sync in Seafile 17 # Sync in Seafile
14 -seaf-cli start; 18 +seaf-cli start >> latest_run.log 2>&1
15 -sleep 30m; 19 +echo 'Waiting 10m for SeaFile synchronization...' >> latest_run.log
16 -seaf-cli stop; 20 +sleep 10m
21 +echo `seaf-cli status` >> latest_run.log
22 +seaf-cli stop >> latest_run.log 2>&1
23 +echo 'We are '`date`', update completed.' >> latest_run.log
17 24
......
...@@ -161,3 +161,7 @@ ...@@ -161,3 +161,7 @@
161 3eq4_1_Y_1-69 161 3eq4_1_Y_1-69
162 4v5z_1_AA_1-1562 162 4v5z_1_AA_1-1562
163 4v5z_1_AA_1-1563 163 4v5z_1_AA_1-1563
164 +6lqm_1_8_1267-4755
165 +6lu8_1_8_1267-4755
166 +6lsr_1_8_1267-4755
167 +6lss_1_8_1267-4755
......
...@@ -487,3 +487,15 @@ DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_AA_1-1562. ...@@ -487,3 +487,15 @@ DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_AA_1-1562.
487 4v5z_1_AA_1-1563 487 4v5z_1_AA_1-1563
488 DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_AA_1-1563. 488 DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_AA_1-1563.
489 489
490 +6lqm_1_8_1267-4755
491 +Could not find nucleotides of chain 8 in annotation 6lqm.json. Either there is a problem with 6lqm mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
492 +
493 +6lu8_1_8_1267-4755
494 +Could not find nucleotides of chain 8 in annotation 6lu8.json. Either there is a problem with 6lu8 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
495 +
496 +6lsr_1_8_1267-4755
497 +Could not find nucleotides of chain 8 in annotation 6lsr.json. Either there is a problem with 6lsr mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
498 +
499 +6lss_1_8_1267-4755
500 +Could not find nucleotides of chain 8 in annotation 6lss.json. Either there is a problem with 6lss mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
501 +
......
...@@ -288,6 +288,9 @@ def parallel_stats_pairs(f): ...@@ -288,6 +288,9 @@ def parallel_stats_pairs(f):
288 288
289 REQUIRES tables chain, nucleotide up-to-date.""" 289 REQUIRES tables chain, nucleotide up-to-date."""
290 290
291 + if path.isfile("data/"+f+"_pairs.csv") and path.isfile("data/"+f+"_counts.csv"):
292 + return
293 +
291 # Get a worker number to position the progress bar 294 # Get a worker number to position the progress bar
292 global idxQueue 295 global idxQueue
293 thr_idx = idxQueue.get() 296 thr_idx = idxQueue.get()
...@@ -363,7 +366,7 @@ def parallel_stats_pairs(f): ...@@ -363,7 +366,7 @@ def parallel_stats_pairs(f):
363 data.append(expanded_list) 366 data.append(expanded_list)
364 367
365 # Update the database 368 # Update the database
366 - with sqlite3.connect("results/RNANet.db") as conn: 369 + with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn:
367 conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query 370 conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
368 sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?, 371 sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?,
369 pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?, 372 pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?,
...@@ -554,7 +557,7 @@ def per_chain_stats(): ...@@ -554,7 +557,7 @@ def per_chain_stats():
554 557
555 REQUIRES tables chain, nucleotide up to date. """ 558 REQUIRES tables chain, nucleotide up to date. """
556 559
557 - with sqlite3.connect("results/RNANet.db") as conn: 560 + with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn:
558 # Compute per-chain nucleotide frequencies 561 # Compute per-chain nucleotide frequencies
559 df = pd.read_sql("SELECT SUM(is_A) as A, SUM(is_C) AS C, SUM(is_G) AS G, SUM(is_U) AS U, SUM(is_other) AS O, chain_id FROM nucleotide GROUP BY chain_id;", conn) 562 df = pd.read_sql("SELECT SUM(is_A) as A, SUM(is_C) AS C, SUM(is_G) AS G, SUM(is_U) AS U, SUM(is_other) AS O, chain_id FROM nucleotide GROUP BY chain_id;", conn)
560 df["total"] = pd.Series(df.A + df.C + df.G + df.U + df.O, dtype=np.float64) 563 df["total"] = pd.Series(df.A + df.C + df.G + df.U + df.O, dtype=np.float64)
...@@ -610,7 +613,7 @@ if __name__ == "__main__": ...@@ -610,7 +613,7 @@ if __name__ == "__main__":
610 613
611 p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers) 614 p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
612 pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, leave=True) 615 pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, leave=True)
613 - 616 +sqlite3
614 try: 617 try:
615 for j in joblist: 618 for j in joblist:
616 p.apply_async(j.func_, args=j.args_, callback=log_to_pbar(pbar)) 619 p.apply_async(j.func_, args=j.args_, callback=log_to_pbar(pbar))
...@@ -626,6 +629,9 @@ if __name__ == "__main__": ...@@ -626,6 +629,9 @@ if __name__ == "__main__":
626 except: 629 except:
627 print("Something went wrong") 630 print("Something went wrong")
628 631
632 + print()
633 + print()
634 +
629 # finish the work after the parallel portions 635 # finish the work after the parallel portions
630 per_chain_stats() 636 per_chain_stats()
631 seq_idty() 637 seq_idty()
......