Merge branch 'master' of https://github.com/persalteas/RNANet into master

Louis BECQUEY
Commit 97926e0badad13e955c797c19754d4add6e9a2e9 97926e0b 2 parents 61d2238e e57395a9
Showing 7 changed files with 244 additions and 43 deletions
README.md
RNAnet.py
automate.sh
kill_rnanet.sh
known_issues.txt
known_issues_reasons.txt
statistics.py
--- a/README.md
View file @97926e0
+++ b/README.md
View file @97926e0
@@ -7,7 +7,7 @@ Future versions might compute a real MSA-based clusering directly with Rfamseq n
 This script prepares the dataset from available public data in PDB and Rfam.
 
 
- **Please cite**: *Coming soon, expect it summer 2020*
+ **Please cite**: *Coming soon, expect it in 2021*
 
 # What it does
 The script follows these steps:
@@ -72,7 +72,7 @@ You need to install:
 
 ## Command line
 Run `./RNANet.py --3d-folder path/to/3D/data/folder --seq-folder path/to/sequence/data/folder [ - other options ]`. 
- It requires solid hardware to run. It takes around 15 hours the first time, and 9h then, tested on a server with 32 cores and 48GB of RAM.
+ It requires solid hardware to run. It takes around 12 to 15 hours the first time, and 1 to 3 hours on subsequent runs, tested on a server with 32 cores and 48GB of RAM.
 The detailed list of options is below:
 
 ```
--- a/RNAnet.py
View file @97926e0
+++ b/RNAnet.py
View file @97926e0
@@ -273,32 +273,39 @@ class Chain:
             if self.mapping is not None:
                 self.mapping.log(f"Shifting nt_resnum numbering because of {n_dup} duplicate residues {df.iloc[i,1]}")
 
-             if df.iloc[i,1] == df.iloc[i-1,1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]:
-                 # The redundant nts are consecutive in the chain (at the begining at least), and there is a gap at the end
- 
-                 if duplicates.iloc[n_dup-1, 0] - duplicates.iloc[0, 0] + 1 == n_dup:
-                     # They are all contiguous in the chain
-                     # 4v9n-DA case (and similar ones) : 610-611-611A-611B-611C-611D-611E-611F-611G-617-618...
-                     # there is a redundancy (611) followed by a gap (611-617). 
-                     # We want the redundancy to fill the gap.
-                     df.iloc[i:i+n_dup-1, 1] += 1
+             try:
+                 if i > 0 and index_last_dup +1 < len(df.index) and df.iloc[i,1] == df.iloc[i-1,1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]:
+                     # The redundant nts are consecutive in the chain (at the beginning at least), and there is a gap at the end
+ 
+                     if duplicates.iloc[n_dup-1, 0] - duplicates.iloc[0, 0] + 1 == n_dup:
+                         # They are all contiguous in the chain
+                         # 4v9n-DA case (and similar ones) : 610-611-611A-611B-611C-611D-611E-611F-611G-617-618...
+                         # there is a redundancy (611) followed by a gap (611-617). 
+                         # We want the redundancy to fill the gap.
+                         df.iloc[i:i+n_dup-1, 1] += 1
+                     else:
+                         # We solve the problem continuous component by continuous component
+                         for j in range(1, n_dup+1):
+                             if duplicates.iloc[j,0] == 1 + duplicates.iloc[j-1,0]: # continuous
+                                 df.iloc[i+j-1,1] += 1
+                             else:
+                                 break
+                 elif df.iloc[i,1] == df.iloc[i-1,1]:
+                     # Common 4v9q-DV case (and similar ones) : e.g. chains contains 17 and 17A which are both read 17 by DSSR.
+                     # Solution : we shift the numbering of 17A (to 18) and the following residues.
+                     df.iloc[i:, 1] += 1
                 else:
-                     # We solve the problem continous component by continuous component
-                     for j in range(1, n_dup+1):
-                         if duplicates.iloc[j,0] == 1 + duplicates.iloc[j-1,0]: # continuous
-                             df.iloc[i+j-1,1] += 1
-                         else:
-                             break
-             elif df.iloc[i,1] == df.iloc[i-1,1]:
-                 # Common 4v9q-DV case (and similar ones) : e.g. chains contains 17 and 17A which are both read 17 by DSSR.
-                 # Solution : we shift the numbering of 17A (to 18) and the following residues.
-                 df.iloc[i:, 1] += 1
-             else:
-                 # 4v9k-DA case (and similar ones) : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ...
-                 # Here the 163B is read 163 by DSSR, but there already is a residue 163.
-                 # Solution : set nt_resnum[i] to nt_resnum[i-1] + 1, and shift the following by 1.
-                 df.iloc[i, 1] = 1 + df.iloc[i-1, 1]
-                 df.iloc[i+1:, 1] += 1
+                     # 4v9k-DA case (and similar ones) : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ...
+                     # Here the 163B is read 163 by DSSR, but there already is a residue 163.
+                     # Solution : set nt_resnum[i] to nt_resnum[i-1] + 1, and shift the following by 1.
+                     df.iloc[i, 1] = 1 + df.iloc[i-1, 1]
+                     df.iloc[i+1:, 1] += 1
+             except:
+                 warn(f"Error with parsing of {self.chain_label} duplicate residue numbers. Ignoring it.")
+                 self.delete_me = True
+                 self.error_messages = f"Error with parsing of duplicate residues numbers."
+                 return None
+ 
 
         # Search for ligands at the end of the selection
         # Drop ligands detected as residues by DSSR, by detecting several markers
@@ -1019,7 +1026,7 @@ class Pipeline:
                 print(f"nohup bash -c 'time {runDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --archive' &") 
                 sys.exit()
             elif opt == '--version':
-                 print("RNANet 1.0 alpha ")
+                 print("RNANet 1.1 beta")
                 sys.exit()
             elif opt == "-r" or opt == "--resolution":
                 assert float(arg) > 0.0 and float(arg) <= 20.0 
@@ -1382,7 +1389,7 @@ class Pipeline:
             # Remove previous precomputed data
             subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"])
             for f in self.fam_list:
-                 subprocess.run(["rm","-f", f"data/{f}.npy"])
+                 subprocess.run(["rm","-f", f"data/{f}.npy", f"data/{f}_pairs.csv", f"data/{f}_counts.csv"])
 
             # Run statistics files
             os.chdir(runDir)
@@ -1390,13 +1397,12 @@ class Pipeline:
             subprocess.run(["python3.8", "statistics.py", path_to_3D_data, path_to_seq_data])
 
         # Save additional informations
-         conn = sqlite3.connect(runDir+"/results/RNANet.db")
-         pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;", 
-                           conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
-         pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure 
-                             JOIN chain ON structure.pdb_id = chain.structure_id
-                             ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
-         conn.close()
+         with sqlite3.connect(runDir+"/results/RNANet.db") as conn:
+             pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;", 
+                             conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
+             pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure 
+                                 JOIN chain ON structure.pdb_id = chain.structure_id
+                                 ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
 
         # Archive the results
         if self.SELECT_ONLY is None:
@@ -1404,7 +1410,10 @@ class Pipeline:
             subprocess.run(["tar","-C", path_to_3D_data + "/datapoints","-czf",f"results/archive/RNANET_datapoints_{time_str}.tar.gz","."])
 
         # Update shortcuts to latest versions
-         subprocess.run(["rm", "-f", runDir + "/results/RNANET_datapoints_latest.tar.gz", runDir + "/results/summary_latest.csv", runDir + "/results/families_latest.csv"])
+         subprocess.run(["rm", "-f", runDir + "/results/RNANET_datapoints_latest.tar.gz", 
+                                     runDir + "/results/summary_latest.csv", 
+                                     runDir + "/results/families_latest.csv"
+                         ])
         subprocess.run(['ln',"-s", runDir +f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", runDir + "/results/RNANET_datapoints_latest.tar.gz"])
         subprocess.run(['ln',"-s", runDir +f"/results/archive/summary_{time_str}.csv", runDir + "/results/summary_latest.csv"])
         subprocess.run(['ln',"-s", runDir +f"/results/archive/families_{time_str}.csv", runDir + "/results/families_latest.csv"])
@@ -1631,6 +1640,7 @@ def sql_ask_database(conn, sql, warn_every = 10):
 
 @trace_unhandled_exceptions
 def sql_execute(conn, sql, many=False, data=None, warn_every=10):
+     conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
     for _ in range(100): # retry 100 times if it fails
         try:
             if many:
@@ -2397,6 +2407,7 @@ if __name__ == "__main__":
             rfam_acc_to_download[c.mapping.rfam_acc] = [ c ]
         else:
             rfam_acc_to_download[c.mapping.rfam_acc].append(c)
+ 
     print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences")
     pp.fam_list = sorted(rfam_acc_to_download.keys())
     
--- a/automate.sh 100644 → 100755
View file @97926e0
+++ b/automate.sh 100644 → 100755
View file @97926e0
 # This is a script supposed to be run periodically as a cron job
 
+ cd /home/lbecquey/Projects/RNANet
+ rm -f latest_run.log errors.txt
+ 
 # Run RNANet
- cd /home/lbecquey/Projects/RNANet;
- rm -f stdout.txt stderr.txt errors.txt;
- time './RNAnet.py --3d-folder /home/lbequey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -s -r 20.0' > stdout.txt 2> stderr.txt;
+ bash -c 'time ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 -s --archive' &> latest_run.log
+ touch results/RNANet.db # update last modification date
+ rm -f results/RNANet.db-wal results/RNANet.db-shm # SQLite temporary files
 
- # Sync in Seafile
- seaf-cli start;
+ # Compress
+ rm -f /home/lbecquey/Projects/RNANet/results/RNANet.db.gz
+ echo 'Deleted results/RNANet.db.gz (if existed)' >> latest_run.log
+ gzip -k /home/lbecquey/Projects/RNANet/results/RNANet.db
+ echo 'Recreated it.' >> latest_run.log
 
- seaf-cli stop;
+ # Sync in Seafile
+ seaf-cli start >> latest_run.log 2>&1
+ echo 'Waiting 10m for SeaFile synchronization...' >> latest_run.log
+ sleep 10m
+ echo `seaf-cli status` >> latest_run.log
+ seaf-cli stop >> latest_run.log 2>&1
+ echo 'We are '`date`', update completed.' >> latest_run.log
 
--- a/kill_rnanet.sh
View file @97926e0
+++ b/kill_rnanet.sh
View file @97926e0
@@ -10,6 +10,17 @@ for KILLPID in $PROCESS_LIST; do
   fi
 done
 
+ PROCESS_TO_KILL="statistics.py"
+ PROCESS_LIST=`ps ax | grep -Ei ${PROCESS_TO_KILL} | grep -Eiv '(grep|vi statistics.py)' | awk ' { print $1;}'`
+ KILLED=
+ for KILLPID in $PROCESS_LIST; do
+   if [ ! -z $KILLPID ];then
+     kill -9 $KILLPID
+     echo "Killed PID ${KILLPID}"
+     KILLED=yes
+   fi
+ done
+ 
 if [ -z $KILLED ];then
     echo "Didn't kill anything"
 fi
--- a/known_issues.txt 0 → 100644
View file @97926e0
+++ b/known_issues.txt 0 → 100644
View file @97926e0
+ 1ml5_1_a_1-2914
+ 1ml5_1_a_151-2903
+ 1ml5_1_A_7-1518
+ 1ml5_1_A_7-1515
+ 1ml5_1_A_2-1520
+ 1ml5_1_b_5-121
+ 2rdo_1_A_3-118
+ 4v48_1_A9_3-118
+ 4v47_1_A9_3-118
+ 1vy7_1_AY_1-73
+ 1vy7_1_CY_1-73
+ 4w2h_1_CY_1-73
+ 6zmi_1_L8_1267-4755
+ 6zm7_1_L8_1267-4755
+ 6y6x_1_L8_1267-4755
+ 6z6n_1_L8_1267-4755
+ 6qzp_1_L8_1267-4755
+ 6zme_1_L8_1267-4755
+ 6z6l_1_L8_1267-4755
+ 6ek0_1_L8_1267-4755
+ 6zmo_1_L8_1267-4755
+ 6z6m_1_L8_1267-4755
+ 6ole_1_D_1267-4755
+ 6om0_1_D_1267-4755
+ 6y2l_1_L8_1267-4755
+ 6y0g_1_L8_1267-4755
+ 6oli_1_D_1267-4755
+ 6olg_1_A3_1267-4755
+ 6y57_1_L8_1267-4755
+ 5t2c_1_C_1267-4755
+ 6om7_1_D_1267-4755
+ 4ug0_1_L8_1267-4755
+ 6olf_1_D_1267-4755
+ 6ip5_1_1C_1267-4755
+ 6ip8_1_1C_1267-4755
+ 6olz_1_A3_1267-4755
+ 5aj0_1_A3_1267-4755
+ 5lks_1_L8_1267-4755
+ 6ip6_1_1C_1267-4755
+ 4v6x_1_A8_1267-4755
+ 2z9q_1_A_1-72
+ 1ls2_1_B_1-73
+ 3ep2_1_Y_1-72
+ 3eq3_1_Y_1-72
+ 4v48_1_A6_1-73
+ 1gsg_1_T_1-72
+ 3jcr_1_H_1-115
+ 1eg0_1_O_1-73
+ 4v42_1_BB_5-121
+ 4v42_1_BA_1-2914
+ 4v42_1_BA_151-2903
+ 2ob7_1_A_10-319
+ 1x1l_1_A_1-130
+ 1zc8_1_Z_1-130
+ 1zc8_1_Z_1-91
+ 2ob7_1_D_1-130
+ 1r2x_1_C_1-58
+ 1r2w_1_C_1-58
+ 1eg0_1_L_1-56
+ 1eg0_1_L_1-57
+ 6rxu_1_C2_588-2386
+ 6rxu_1_C2_588-2383
+ 6rxu_1_C2_583-2388
+ 5oql_1_2_588-2386
+ 5oql_1_2_588-2383
+ 5oql_1_2_583-2388
+ 6rxv_1_C2_588-2386
+ 6rxv_1_C2_588-2383
+ 6rxv_1_C2_583-2388
+ 6rxz_1_C2_588-2386
+ 6rxz_1_C2_588-2383
+ 6rxz_1_C2_583-2388
+ 6rxy_1_C2_588-2386
+ 6rxy_1_C2_588-2383
+ 6rxy_1_C2_583-2388
+ 6rxt_1_C2_588-2386
+ 6rxt_1_C2_588-2383
+ 6rxt_1_C2_583-2388
+ 4v48_1_BA_1-91
+ 4v48_1_BA_6-1541
+ 4v48_1_BA_6-1538
+ 4v48_1_BA_1-1543
+ 4v47_1_BA_1-91
+ 4v47_1_BA_6-1540
+ 4v47_1_BA_6-1537
+ 4v47_1_BA_1-1542
+ 2rdo_1_B_6-1460
+ 2rdo_1_B_6-1522
+ 2rdo_1_B_1-2903
+ 2rdo_1_B_6-1457
+ 2rdo_1_B_1-2904
+ 2rdo_1_B_1-1528
+ 2rdo_1_B_160-2893
+ 4v48_1_A0_6-1460
+ 4v48_1_A0_6-1522
+ 4v48_1_A0_1-2903
+ 4v48_1_A0_6-1457
+ 4v48_1_A0_1-2904
+ 4v48_1_A0_1-1528
+ 4v48_1_A0_160-2893
+ 4v47_1_A0_6-1460
+ 4v47_1_A0_6-1522
+ 4v47_1_A0_1-2903
+ 4v47_1_A0_6-1457
+ 4v47_1_A0_1-2904
+ 4v47_1_A0_1-1528
+ 4v47_1_A0_160-2893
+ 1zc8_1_A_1-59
+ 1mvr_1_D_1-59
+ 4c9d_1_D_29-1
+ 4c9d_1_C_29-1
+ 4adx_1_9_1-121
+ 1zn1_1_B_1-59
+ 1emi_1_B_1-108
+ 3iy9_1_A_498-1027
+ 1jgq_1_A_20-55
+ 1jgq_1_A_7-1518
+ 1jgq_1_A_7-1515
+ 1jgq_1_A_2-1520
+ 4v42_1_AA_20-55
+ 4v42_1_AA_7-1518
+ 4v42_1_AA_7-1515
+ 4v42_1_AA_2-1520
+ 1jgo_1_A_20-55
+ 1jgo_1_A_7-1518
+ 1jgo_1_A_7-1515
+ 1jgo_1_A_2-1520
+ 1jgp_1_A_20-55
+ 1jgp_1_A_7-1518
+ 1jgp_1_A_7-1515
+ 1jgp_1_A_2-1520
+ 3ep2_1_B_1-50
+ 3eq3_1_B_1-50
+ 3eq4_1_B_1-50
+ 3pgw_1_R_1-164
+ 3pgw_1_N_1-164
+ 3cw1_1_x_1-138
+ 3cw1_1_w_1-138
+ 3cw1_1_V_1-138
+ 3cw1_1_v_1-138
+ 2iy3_1_B_9-105
+ 3jcr_1_N_1-106
+ 3jcr_1_N_1-188
+ 2vaz_1_A_64-177
+ 2ftc_1_R_81-1466
+ 2ftc_1_R_1-1568
+ 2ftc_1_R_792-1568
+ 3jcr_1_M_1-141
+ 3jcr_1_M_1-107
+ 3jcr_1_M_1-188
+ 4v5z_1_B0_1-2840
+ 4v5z_1_B0_1-2899
+ 4v5z_1_B0_1-2902
+ 5g2x_1_A_595-692
+ 3iy8_1_A_1-540
+ 4v5z_1_BY_2-113
+ 4v5z_1_BZ_1-70
+ 1mvr_1_B_1-96
+ 4adx_1_0_1-2923
+ 4adx_1_0_132-2915
+ 3eq4_1_Y_1-69
+ 4v5z_1_AA_1-1562
+ 4v5z_1_AA_1-1563
+ 6lqm_1_8_1267-4755
+ 6lu8_1_8_1267-4755
+ 6lsr_1_8_1267-4755
+ 6lss_1_8_1267-4755
--- a/known_issues_reasons.txt 0 → 100644
View file @97926e0
+++ b/known_issues_reasons.txt 0 → 100644
View file @97926e0
--- a/statistics.py
View file @97926e0
+++ b/statistics.py
View file @97926e0