Louis BECQUEY

Improved automation

......@@ -1389,7 +1389,7 @@ class Pipeline:
# Remove previous precomputed data
subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"])
for f in self.fam_list:
subprocess.run(["rm","-f", f"data/{f}.npy"])
subprocess.run(["rm","-f", f"data/{f}.npy", f"data/{f}_pairs.csv", f"data/{f}_counts.csv"])
# Run statistics files
os.chdir(runDir)
......@@ -1397,13 +1397,12 @@ class Pipeline:
subprocess.run(["python3.8", "statistics.py", path_to_3D_data, path_to_seq_data])
# Save additional information
conn = sqlite3.connect(runDir+"/results/RNANet.db")
pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;",
conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure
JOIN chain ON structure.pdb_id = chain.structure_id
ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
conn.close()
with sqlite3.connect(runDir+"/results/RNANet.db") as conn:
pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;",
conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure
JOIN chain ON structure.pdb_id = chain.structure_id
ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
# Archive the results
if self.SELECT_ONLY is None:
......@@ -2408,6 +2407,7 @@ if __name__ == "__main__":
rfam_acc_to_download[c.mapping.rfam_acc] = [ c ]
else:
rfam_acc_to_download[c.mapping.rfam_acc].append(c)
print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences")
pp.fam_list = sorted(rfam_acc_to_download.keys())
......
# This is a script supposed to be run periodically as a cron job
cd /home/lbecquey/Projects/RNANet;
rm -f nohup.out errors.txt;
cd /home/lbecquey/Projects/RNANet
rm -f latest_run.log errors.txt
# Run RNANet
nohup bash -c 'time ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -s -r 20.0 --archive';
bash -c 'time ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 -s --archive' &> latest_run.log
touch results/RNANet.db # update last modification date
rm -f results/RNANet.db-wal results/RNANet.db-shm # SQLite temporary files
# Compress
rm -f results/RNANet.db.gz
gzip -k results/RNANet.db
rm -f /home/lbecquey/Projects/RNANet/results/RNANet.db.gz
echo 'Deleted results/RNANet.db.gz (if existed)' >> latest_run.log
gzip -k /home/lbecquey/Projects/RNANet/results/RNANet.db
echo 'Recreated it.' >> latest_run.log
# Sync in Seafile
seaf-cli start;
sleep 30m;
seaf-cli stop;
seaf-cli start >> latest_run.log 2>&1
echo 'Waiting 10m for SeaFile synchronization...' >> latest_run.log
sleep 10m
echo `seaf-cli status` >> latest_run.log
seaf-cli stop >> latest_run.log 2>&1
echo 'We are '`date`', update completed.' >> latest_run.log
......
......@@ -161,3 +161,7 @@
3eq4_1_Y_1-69
4v5z_1_AA_1-1562
4v5z_1_AA_1-1563
6lqm_1_8_1267-4755
6lu8_1_8_1267-4755
6lsr_1_8_1267-4755
6lss_1_8_1267-4755
......
......@@ -487,3 +487,15 @@ DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_AA_1-1562.
4v5z_1_AA_1-1563
DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_AA_1-1563.
6lqm_1_8_1267-4755
Could not find nucleotides of chain 8 in annotation 6lqm.json. Either there is a problem with 6lqm mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6lu8_1_8_1267-4755
Could not find nucleotides of chain 8 in annotation 6lu8.json. Either there is a problem with 6lu8 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6lsr_1_8_1267-4755
Could not find nucleotides of chain 8 in annotation 6lsr.json. Either there is a problem with 6lsr mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6lss_1_8_1267-4755
Could not find nucleotides of chain 8 in annotation 6lss.json. Either there is a problem with 6lss mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......
......@@ -288,6 +288,9 @@ def parallel_stats_pairs(f):
REQUIRES tables chain, nucleotide up-to-date."""
if path.isfile("data/"+f+"_pairs.csv") and path.isfile("data/"+f+"_counts.csv"):
return
# Get a worker number to position the progress bar
global idxQueue
thr_idx = idxQueue.get()
......@@ -363,7 +366,7 @@ def parallel_stats_pairs(f):
data.append(expanded_list)
# Update the database
with sqlite3.connect("results/RNANet.db") as conn:
with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn:
conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?,
pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?,
......@@ -554,7 +557,7 @@ def per_chain_stats():
REQUIRES tables chain, nucleotide up to date. """
with sqlite3.connect("results/RNANet.db") as conn:
with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn:
# Compute per-chain nucleotide frequencies
df = pd.read_sql("SELECT SUM(is_A) as A, SUM(is_C) AS C, SUM(is_G) AS G, SUM(is_U) AS U, SUM(is_other) AS O, chain_id FROM nucleotide GROUP BY chain_id;", conn)
df["total"] = pd.Series(df.A + df.C + df.G + df.U + df.O, dtype=np.float64)
......@@ -610,7 +613,7 @@ if __name__ == "__main__":
p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, leave=True)
sqlite3
try:
for j in joblist:
p.apply_async(j.func_, args=j.args_, callback=log_to_pbar(pbar))
......@@ -626,6 +629,9 @@ if __name__ == "__main__":
except:
print("Something went wrong")
print()
print()
# finish the work after the parallel portions
per_chain_stats()
seq_idty()
......