Louis BECQUEY

Merge branch 'master' of https://github.com/persalteas/RNANet into master

......@@ -7,7 +7,7 @@ Future versions might compute a real MSA-based clusering directly with Rfamseq n
This script prepares the dataset from available public data in PDB and Rfam.
**Please cite**: *Coming soon, expect it summer 2020*
**Please cite**: *Coming soon, expect it in 2021*
# What it does
The script follows these steps:
......@@ -72,7 +72,7 @@ You need to install:
## Command line
Run `./RNANet.py --3d-folder path/to/3D/data/folder --seq-folder path/to/sequence/data/folder [ - other options ]`.
It requires solid hardware to run. It takes around 15 hours the first time, and 9h then, tested on a server with 32 cores and 48GB of RAM.
It requires solid hardware to run. It takes around 12 to 15 hours the first time, and 1 to 3h then, tested on a server with 32 cores and 48GB of RAM.
The detailed list of options is below:
```
......
......@@ -273,32 +273,39 @@ class Chain:
if self.mapping is not None:
self.mapping.log(f"Shifting nt_resnum numbering because of {n_dup} duplicate residues {df.iloc[i,1]}")
if df.iloc[i,1] == df.iloc[i-1,1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]:
# The redundant nts are consecutive in the chain (at the beginning at least), and there is a gap at the end
if duplicates.iloc[n_dup-1, 0] - duplicates.iloc[0, 0] + 1 == n_dup:
# They are all contiguous in the chain
# 4v9n-DA case (and similar ones) : 610-611-611A-611B-611C-611D-611E-611F-611G-617-618...
# there is a redundancy (611) followed by a gap (611-617).
# We want the redundancy to fill the gap.
df.iloc[i:i+n_dup-1, 1] += 1
try:
if i > 0 and index_last_dup +1 < len(df.index) and df.iloc[i,1] == df.iloc[i-1,1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]:
# The redundant nts are consecutive in the chain (at the beginning at least), and there is a gap at the end
if duplicates.iloc[n_dup-1, 0] - duplicates.iloc[0, 0] + 1 == n_dup:
# They are all contiguous in the chain
# 4v9n-DA case (and similar ones) : 610-611-611A-611B-611C-611D-611E-611F-611G-617-618...
# there is a redundancy (611) followed by a gap (611-617).
# We want the redundancy to fill the gap.
df.iloc[i:i+n_dup-1, 1] += 1
else:
# We solve the problem continuous component by continuous component
for j in range(1, n_dup+1):
if duplicates.iloc[j,0] == 1 + duplicates.iloc[j-1,0]: # continuous
df.iloc[i+j-1,1] += 1
else:
break
elif df.iloc[i,1] == df.iloc[i-1,1]:
# Common 4v9q-DV case (and similar ones) : e.g. chains contains 17 and 17A which are both read 17 by DSSR.
# Solution : we shift the numbering of 17A (to 18) and the following residues.
df.iloc[i:, 1] += 1
else:
# We solve the problem continuous component by continuous component
for j in range(1, n_dup+1):
if duplicates.iloc[j,0] == 1 + duplicates.iloc[j-1,0]: # continuous
df.iloc[i+j-1,1] += 1
else:
break
elif df.iloc[i,1] == df.iloc[i-1,1]:
# Common 4v9q-DV case (and similar ones) : e.g. chains contains 17 and 17A which are both read 17 by DSSR.
# Solution : we shift the numbering of 17A (to 18) and the following residues.
df.iloc[i:, 1] += 1
else:
# 4v9k-DA case (and similar ones) : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ...
# Here the 163B is read 163 by DSSR, but there already is a residue 163.
# Solution : set nt_resnum[i] to nt_resnum[i-1] + 1, and shift the following by 1.
df.iloc[i, 1] = 1 + df.iloc[i-1, 1]
df.iloc[i+1:, 1] += 1
# 4v9k-DA case (and similar ones) : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ...
# Here the 163B is read 163 by DSSR, but there already is a residue 163.
# Solution : set nt_resnum[i] to nt_resnum[i-1] + 1, and shift the following by 1.
df.iloc[i, 1] = 1 + df.iloc[i-1, 1]
df.iloc[i+1:, 1] += 1
except:
warn(f"Error with parsing of {self.chain_label} duplicate residue numbers. Ignoring it.")
self.delete_me = True
self.error_messages = f"Error with parsing of duplicate residues numbers."
return None
# Search for ligands at the end of the selection
# Drop ligands detected as residues by DSSR, by detecting several markers
......@@ -1019,7 +1026,7 @@ class Pipeline:
print(f"nohup bash -c 'time {runDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --archive' &")
sys.exit()
elif opt == '--version':
print("RNANet 1.0 alpha ")
print("RNANet 1.1 beta")
sys.exit()
elif opt == "-r" or opt == "--resolution":
assert float(arg) > 0.0 and float(arg) <= 20.0
......@@ -1382,7 +1389,7 @@ class Pipeline:
# Remove previous precomputed data
subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"])
for f in self.fam_list:
subprocess.run(["rm","-f", f"data/{f}.npy"])
subprocess.run(["rm","-f", f"data/{f}.npy", f"data/{f}_pairs.csv", f"data/{f}_counts.csv"])
# Run statistics files
os.chdir(runDir)
......@@ -1390,13 +1397,12 @@ class Pipeline:
subprocess.run(["python3.8", "statistics.py", path_to_3D_data, path_to_seq_data])
# Save additional informations
conn = sqlite3.connect(runDir+"/results/RNANet.db")
pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;",
conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure
JOIN chain ON structure.pdb_id = chain.structure_id
ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
conn.close()
with sqlite3.connect(runDir+"/results/RNANet.db") as conn:
pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;",
conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure
JOIN chain ON structure.pdb_id = chain.structure_id
ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
# Archive the results
if self.SELECT_ONLY is None:
......@@ -1404,7 +1410,10 @@ class Pipeline:
subprocess.run(["tar","-C", path_to_3D_data + "/datapoints","-czf",f"results/archive/RNANET_datapoints_{time_str}.tar.gz","."])
# Update shortcuts to latest versions
subprocess.run(["rm", "-f", runDir + "/results/RNANET_datapoints_latest.tar.gz", runDir + "/results/summary_latest.csv", runDir + "/results/families_latest.csv"])
subprocess.run(["rm", "-f", runDir + "/results/RNANET_datapoints_latest.tar.gz",
runDir + "/results/summary_latest.csv",
runDir + "/results/families_latest.csv"
])
subprocess.run(['ln',"-s", runDir +f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", runDir + "/results/RNANET_datapoints_latest.tar.gz"])
subprocess.run(['ln',"-s", runDir +f"/results/archive/summary_{time_str}.csv", runDir + "/results/summary_latest.csv"])
subprocess.run(['ln',"-s", runDir +f"/results/archive/families_{time_str}.csv", runDir + "/results/families_latest.csv"])
......@@ -1631,6 +1640,7 @@ def sql_ask_database(conn, sql, warn_every = 10):
@trace_unhandled_exceptions
def sql_execute(conn, sql, many=False, data=None, warn_every=10):
conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
for _ in range(100): # retry 100 times if it fails
try:
if many:
......@@ -2397,6 +2407,7 @@ if __name__ == "__main__":
rfam_acc_to_download[c.mapping.rfam_acc] = [ c ]
else:
rfam_acc_to_download[c.mapping.rfam_acc].append(c)
print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences")
pp.fam_list = sorted(rfam_acc_to_download.keys())
......
# This is a script supposed to be run periodically as a cron job
cd /home/lbecquey/Projects/RNANet
rm -f latest_run.log errors.txt
# Run RNANet
cd /home/lbecquey/Projects/RNANet;
rm -f stdout.txt stderr.txt errors.txt;
time './RNAnet.py --3d-folder /home/lbequey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -s -r 20.0' > stdout.txt 2> stderr.txt;
bash -c 'time ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 -s --archive' &> latest_run.log
touch results/RNANet.db # update last modification date
rm -f results/RNANet.db-wal results/RNANet.db-shm # SQLite temporary files
# Sync in Seafile
seaf-cli start;
# Compress
rm -f /home/lbecquey/Projects/RNANet/results/RNANet.db.gz
echo 'Deleted results/RNANet.db.gz (if existed)' >> latest_run.log
gzip -k /home/lbecquey/Projects/RNANet/results/RNANet.db
echo 'Recreated it.' >> latest_run.log
seaf-cli stop;
# Sync in Seafile
seaf-cli start >> latest_run.log 2>&1
echo 'Waiting 10m for SeaFile synchronization...' >> latest_run.log
sleep 10m
echo `seaf-cli status` >> latest_run.log
seaf-cli stop >> latest_run.log 2>&1
echo 'We are '`date`', update completed.' >> latest_run.log
......
......@@ -10,6 +10,17 @@ for KILLPID in $PROCESS_LIST; do
fi
done
# Terminate any running statistics.py process (companion step to the RNAnet.py kill above).
PROCESS_TO_KILL="statistics.py"
# Collect the PIDs of matching processes; exclude the grep itself and any
# open editor session ("vi statistics.py") so we do not kill those.
PROCESS_LIST=`ps ax | grep -Ei ${PROCESS_TO_KILL} | grep -Eiv '(grep|vi statistics.py)' | awk ' { print $1;}'`
# Tracks whether at least one process was killed (empty = none).
KILLED=
for KILLPID in $PROCESS_LIST; do
if [ ! -z $KILLPID ];then
# SIGKILL: force immediate termination, no chance for cleanup.
kill -9 $KILLPID
echo "Killed PID ${KILLPID}"
KILLED=yes
fi
done
# Report explicitly when no matching process was found.
if [ -z $KILLED ];then
echo "Didn't kill anything"
fi
......
1ml5_1_a_1-2914
1ml5_1_a_151-2903
1ml5_1_A_7-1518
1ml5_1_A_7-1515
1ml5_1_A_2-1520
1ml5_1_b_5-121
2rdo_1_A_3-118
4v48_1_A9_3-118
4v47_1_A9_3-118
1vy7_1_AY_1-73
1vy7_1_CY_1-73
4w2h_1_CY_1-73
6zmi_1_L8_1267-4755
6zm7_1_L8_1267-4755
6y6x_1_L8_1267-4755
6z6n_1_L8_1267-4755
6qzp_1_L8_1267-4755
6zme_1_L8_1267-4755
6z6l_1_L8_1267-4755
6ek0_1_L8_1267-4755
6zmo_1_L8_1267-4755
6z6m_1_L8_1267-4755
6ole_1_D_1267-4755
6om0_1_D_1267-4755
6y2l_1_L8_1267-4755
6y0g_1_L8_1267-4755
6oli_1_D_1267-4755
6olg_1_A3_1267-4755
6y57_1_L8_1267-4755
5t2c_1_C_1267-4755
6om7_1_D_1267-4755
4ug0_1_L8_1267-4755
6olf_1_D_1267-4755
6ip5_1_1C_1267-4755
6ip8_1_1C_1267-4755
6olz_1_A3_1267-4755
5aj0_1_A3_1267-4755
5lks_1_L8_1267-4755
6ip6_1_1C_1267-4755
4v6x_1_A8_1267-4755
2z9q_1_A_1-72
1ls2_1_B_1-73
3ep2_1_Y_1-72
3eq3_1_Y_1-72
4v48_1_A6_1-73
1gsg_1_T_1-72
3jcr_1_H_1-115
1eg0_1_O_1-73
4v42_1_BB_5-121
4v42_1_BA_1-2914
4v42_1_BA_151-2903
2ob7_1_A_10-319
1x1l_1_A_1-130
1zc8_1_Z_1-130
1zc8_1_Z_1-91
2ob7_1_D_1-130
1r2x_1_C_1-58
1r2w_1_C_1-58
1eg0_1_L_1-56
1eg0_1_L_1-57
6rxu_1_C2_588-2386
6rxu_1_C2_588-2383
6rxu_1_C2_583-2388
5oql_1_2_588-2386
5oql_1_2_588-2383
5oql_1_2_583-2388
6rxv_1_C2_588-2386
6rxv_1_C2_588-2383
6rxv_1_C2_583-2388
6rxz_1_C2_588-2386
6rxz_1_C2_588-2383
6rxz_1_C2_583-2388
6rxy_1_C2_588-2386
6rxy_1_C2_588-2383
6rxy_1_C2_583-2388
6rxt_1_C2_588-2386
6rxt_1_C2_588-2383
6rxt_1_C2_583-2388
4v48_1_BA_1-91
4v48_1_BA_6-1541
4v48_1_BA_6-1538
4v48_1_BA_1-1543
4v47_1_BA_1-91
4v47_1_BA_6-1540
4v47_1_BA_6-1537
4v47_1_BA_1-1542
2rdo_1_B_6-1460
2rdo_1_B_6-1522
2rdo_1_B_1-2903
2rdo_1_B_6-1457
2rdo_1_B_1-2904
2rdo_1_B_1-1528
2rdo_1_B_160-2893
4v48_1_A0_6-1460
4v48_1_A0_6-1522
4v48_1_A0_1-2903
4v48_1_A0_6-1457
4v48_1_A0_1-2904
4v48_1_A0_1-1528
4v48_1_A0_160-2893
4v47_1_A0_6-1460
4v47_1_A0_6-1522
4v47_1_A0_1-2903
4v47_1_A0_6-1457
4v47_1_A0_1-2904
4v47_1_A0_1-1528
4v47_1_A0_160-2893
1zc8_1_A_1-59
1mvr_1_D_1-59
4c9d_1_D_29-1
4c9d_1_C_29-1
4adx_1_9_1-121
1zn1_1_B_1-59
1emi_1_B_1-108
3iy9_1_A_498-1027
1jgq_1_A_20-55
1jgq_1_A_7-1518
1jgq_1_A_7-1515
1jgq_1_A_2-1520
4v42_1_AA_20-55
4v42_1_AA_7-1518
4v42_1_AA_7-1515
4v42_1_AA_2-1520
1jgo_1_A_20-55
1jgo_1_A_7-1518
1jgo_1_A_7-1515
1jgo_1_A_2-1520
1jgp_1_A_20-55
1jgp_1_A_7-1518
1jgp_1_A_7-1515
1jgp_1_A_2-1520
3ep2_1_B_1-50
3eq3_1_B_1-50
3eq4_1_B_1-50
3pgw_1_R_1-164
3pgw_1_N_1-164
3cw1_1_x_1-138
3cw1_1_w_1-138
3cw1_1_V_1-138
3cw1_1_v_1-138
2iy3_1_B_9-105
3jcr_1_N_1-106
3jcr_1_N_1-188
2vaz_1_A_64-177
2ftc_1_R_81-1466
2ftc_1_R_1-1568
2ftc_1_R_792-1568
3jcr_1_M_1-141
3jcr_1_M_1-107
3jcr_1_M_1-188
4v5z_1_B0_1-2840
4v5z_1_B0_1-2899
4v5z_1_B0_1-2902
5g2x_1_A_595-692
3iy8_1_A_1-540
4v5z_1_BY_2-113
4v5z_1_BZ_1-70
1mvr_1_B_1-96
4adx_1_0_1-2923
4adx_1_0_132-2915
3eq4_1_Y_1-69
4v5z_1_AA_1-1562
4v5z_1_AA_1-1563
6lqm_1_8_1267-4755
6lu8_1_8_1267-4755
6lsr_1_8_1267-4755
6lss_1_8_1267-4755
1ml5_1_a_1-2914
Could not find nucleotides of chain a in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1ml5_1_a_151-2903
Could not find nucleotides of chain a in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1ml5_1_A_7-1518
Could not find nucleotides of chain A in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1ml5_1_A_7-1515
Could not find nucleotides of chain A in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1ml5_1_A_2-1520
Could not find nucleotides of chain A in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1ml5_1_b_5-121
Could not find nucleotides of chain b in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
2rdo_1_A_3-118
DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_A_3-118.
4v48_1_A9_3-118
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A9_3-118.
4v47_1_A9_3-118
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A9_3-118.
1vy7_1_AY_1-73
Sequence is too short. (< 5 resolved nts)
1vy7_1_CY_1-73
Sequence is too short. (< 5 resolved nts)
4w2h_1_CY_1-73
Sequence is too short. (< 5 resolved nts)
6zmi_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6zmi.json. Either there is a problem with 6zmi mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6zm7_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6zm7.json. Either there is a problem with 6zm7 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6y6x_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6y6x.json. Either there is a problem with 6y6x mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6z6n_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6z6n.json. Either there is a problem with 6z6n mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6qzp_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6qzp.json. Either there is a problem with 6qzp mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6zme_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6zme.json. Either there is a problem with 6zme mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6z6l_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6z6l.json. Either there is a problem with 6z6l mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6ek0_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6ek0.json. Either there is a problem with 6ek0 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6zmo_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6zmo.json. Either there is a problem with 6zmo mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6z6m_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6z6m.json. Either there is a problem with 6z6m mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6ole_1_D_1267-4755
Could not find nucleotides of chain D in annotation 6ole.json. Either there is a problem with 6ole mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6om0_1_D_1267-4755
Could not find nucleotides of chain D in annotation 6om0.json. Either there is a problem with 6om0 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6y2l_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6y2l.json. Either there is a problem with 6y2l mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6y0g_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6y0g.json. Either there is a problem with 6y0g mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6oli_1_D_1267-4755
Could not find nucleotides of chain D in annotation 6oli.json. Either there is a problem with 6oli mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6olg_1_A3_1267-4755
Could not find nucleotides of chain A3 in annotation 6olg.json. Either there is a problem with 6olg mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6y57_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 6y57.json. Either there is a problem with 6y57 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
5t2c_1_C_1267-4755
Could not find nucleotides of chain C in annotation 5t2c.json. Either there is a problem with 5t2c mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6om7_1_D_1267-4755
Could not find nucleotides of chain D in annotation 6om7.json. Either there is a problem with 6om7 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4ug0_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 4ug0.json. Either there is a problem with 4ug0 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6olf_1_D_1267-4755
Could not find nucleotides of chain D in annotation 6olf.json. Either there is a problem with 6olf mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6ip5_1_1C_1267-4755
Could not find nucleotides of chain 1C in annotation 6ip5.json. Either there is a problem with 6ip5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6ip8_1_1C_1267-4755
Could not find nucleotides of chain 1C in annotation 6ip8.json. Either there is a problem with 6ip8 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6olz_1_A3_1267-4755
Could not find nucleotides of chain A3 in annotation 6olz.json. Either there is a problem with 6olz mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
5aj0_1_A3_1267-4755
Could not find nucleotides of chain A3 in annotation 5aj0.json. Either there is a problem with 5aj0 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
5lks_1_L8_1267-4755
Could not find nucleotides of chain L8 in annotation 5lks.json. Either there is a problem with 5lks mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6ip6_1_1C_1267-4755
Could not find nucleotides of chain 1C in annotation 6ip6.json. Either there is a problem with 6ip6 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4v6x_1_A8_1267-4755
Could not find nucleotides of chain A8 in annotation 4v6x.json. Either there is a problem with 4v6x mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
2z9q_1_A_1-72
DSSR warning 2z9q.json: no nucleotides found. Ignoring 2z9q_1_A_1-72.
1ls2_1_B_1-73
DSSR warning 1ls2.json: no nucleotides found. Ignoring 1ls2_1_B_1-73.
3ep2_1_Y_1-72
DSSR warning 3ep2.json: no nucleotides found. Ignoring 3ep2_1_Y_1-72.
3eq3_1_Y_1-72
DSSR warning 3eq3.json: no nucleotides found. Ignoring 3eq3_1_Y_1-72.
4v48_1_A6_1-73
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A6_1-73.
1gsg_1_T_1-72
DSSR warning 1gsg.json: no nucleotides found. Ignoring 1gsg_1_T_1-72.
3jcr_1_H_1-115
DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_H_1-115.
1eg0_1_O_1-73
DSSR warning 1eg0.json: no nucleotides found. Ignoring 1eg0_1_O_1-73.
4v42_1_BB_5-121
Could not find nucleotides of chain BB in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4v42_1_BA_1-2914
Could not find nucleotides of chain BA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4v42_1_BA_151-2903
Could not find nucleotides of chain BA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
2ob7_1_A_10-319
DSSR warning 2ob7.json: no nucleotides found. Ignoring 2ob7_1_A_10-319.
1x1l_1_A_1-130
DSSR warning 1x1l.json: no nucleotides found. Ignoring 1x1l_1_A_1-130.
1zc8_1_Z_1-130
DSSR warning 1zc8.json: no nucleotides found. Ignoring 1zc8_1_Z_1-130.
1zc8_1_Z_1-91
DSSR warning 1zc8.json: no nucleotides found. Ignoring 1zc8_1_Z_1-91.
2ob7_1_D_1-130
DSSR warning 2ob7.json: no nucleotides found. Ignoring 2ob7_1_D_1-130.
1r2x_1_C_1-58
DSSR warning 1r2x.json: no nucleotides found. Ignoring 1r2x_1_C_1-58.
1r2w_1_C_1-58
DSSR warning 1r2w.json: no nucleotides found. Ignoring 1r2w_1_C_1-58.
1eg0_1_L_1-56
DSSR warning 1eg0.json: no nucleotides found. Ignoring 1eg0_1_L_1-56.
1eg0_1_L_1-57
DSSR warning 1eg0.json: no nucleotides found. Ignoring 1eg0_1_L_1-57.
6rxu_1_C2_588-2386
Could not find nucleotides of chain C2 in annotation 6rxu.json. Either there is a problem with 6rxu mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxu_1_C2_588-2383
Could not find nucleotides of chain C2 in annotation 6rxu.json. Either there is a problem with 6rxu mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxu_1_C2_583-2388
Could not find nucleotides of chain C2 in annotation 6rxu.json. Either there is a problem with 6rxu mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
5oql_1_2_588-2386
Could not find nucleotides of chain 2 in annotation 5oql.json. Either there is a problem with 5oql mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
5oql_1_2_588-2383
Could not find nucleotides of chain 2 in annotation 5oql.json. Either there is a problem with 5oql mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
5oql_1_2_583-2388
Could not find nucleotides of chain 2 in annotation 5oql.json. Either there is a problem with 5oql mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxv_1_C2_588-2386
Could not find nucleotides of chain C2 in annotation 6rxv.json. Either there is a problem with 6rxv mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxv_1_C2_588-2383
Could not find nucleotides of chain C2 in annotation 6rxv.json. Either there is a problem with 6rxv mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxv_1_C2_583-2388
Could not find nucleotides of chain C2 in annotation 6rxv.json. Either there is a problem with 6rxv mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxz_1_C2_588-2386
Could not find nucleotides of chain C2 in annotation 6rxz.json. Either there is a problem with 6rxz mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxz_1_C2_588-2383
Could not find nucleotides of chain C2 in annotation 6rxz.json. Either there is a problem with 6rxz mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxz_1_C2_583-2388
Could not find nucleotides of chain C2 in annotation 6rxz.json. Either there is a problem with 6rxz mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxy_1_C2_588-2386
Could not find nucleotides of chain C2 in annotation 6rxy.json. Either there is a problem with 6rxy mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxy_1_C2_588-2383
Could not find nucleotides of chain C2 in annotation 6rxy.json. Either there is a problem with 6rxy mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxy_1_C2_583-2388
Could not find nucleotides of chain C2 in annotation 6rxy.json. Either there is a problem with 6rxy mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxt_1_C2_588-2386
Could not find nucleotides of chain C2 in annotation 6rxt.json. Either there is a problem with 6rxt mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxt_1_C2_588-2383
Could not find nucleotides of chain C2 in annotation 6rxt.json. Either there is a problem with 6rxt mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6rxt_1_C2_583-2388
Could not find nucleotides of chain C2 in annotation 6rxt.json. Either there is a problem with 6rxt mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4v48_1_BA_1-91
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_BA_1-91.
4v48_1_BA_6-1541
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_BA_6-1541.
4v48_1_BA_6-1538
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_BA_6-1538.
4v48_1_BA_1-1543
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_BA_1-1543.
4v47_1_BA_1-91
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_BA_1-91.
4v47_1_BA_6-1540
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_BA_6-1540.
4v47_1_BA_6-1537
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_BA_6-1537.
4v47_1_BA_1-1542
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_BA_1-1542.
2rdo_1_B_6-1460
DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_6-1460.
2rdo_1_B_6-1522
DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_6-1522.
2rdo_1_B_1-2903
DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_1-2903.
2rdo_1_B_6-1457
DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_6-1457.
2rdo_1_B_1-2904
DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_1-2904.
2rdo_1_B_1-1528
DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_1-1528.
2rdo_1_B_160-2893
DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_160-2893.
4v48_1_A0_6-1460
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_6-1460.
4v48_1_A0_6-1522
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_6-1522.
4v48_1_A0_1-2903
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_1-2903.
4v48_1_A0_6-1457
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_6-1457.
4v48_1_A0_1-2904
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_1-2904.
4v48_1_A0_1-1528
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_1-1528.
4v48_1_A0_160-2893
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_160-2893.
4v47_1_A0_6-1460
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_6-1460.
4v47_1_A0_6-1522
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_6-1522.
4v47_1_A0_1-2903
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_1-2903.
4v47_1_A0_6-1457
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_6-1457.
4v47_1_A0_1-2904
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_1-2904.
4v47_1_A0_1-1528
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_1-1528.
4v47_1_A0_160-2893
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_160-2893.
1zc8_1_A_1-59
DSSR warning 1zc8.json: no nucleotides found. Ignoring 1zc8_1_A_1-59.
1mvr_1_D_1-59
DSSR warning 1mvr.json: no nucleotides found. Ignoring 1mvr_1_D_1-59.
4c9d_1_D_29-1
Mapping is reversed, this case is not supported (yet).
4c9d_1_C_29-1
Mapping is reversed, this case is not supported (yet).
4adx_1_9_1-121
DSSR warning 4adx.json: no nucleotides found. Ignoring 4adx_1_9_1-121.
1zn1_1_B_1-59
DSSR warning 1zn1.json: no nucleotides found. Ignoring 1zn1_1_B_1-59.
1emi_1_B_1-108
DSSR warning 1emi.json: no nucleotides found. Ignoring 1emi_1_B_1-108.
3iy9_1_A_498-1027
DSSR warning 3iy9.json: no nucleotides found. Ignoring 3iy9_1_A_498-1027.
1jgq_1_A_20-55
Could not find nucleotides of chain A in annotation 1jgq.json. Either there is a problem with 1jgq mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1jgq_1_A_7-1518
Could not find nucleotides of chain A in annotation 1jgq.json. Either there is a problem with 1jgq mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1jgq_1_A_7-1515
Could not find nucleotides of chain A in annotation 1jgq.json. Either there is a problem with 1jgq mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1jgq_1_A_2-1520
Could not find nucleotides of chain A in annotation 1jgq.json. Either there is a problem with 1jgq mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4v42_1_AA_20-55
Could not find nucleotides of chain AA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4v42_1_AA_7-1518
Could not find nucleotides of chain AA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4v42_1_AA_7-1515
Could not find nucleotides of chain AA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4v42_1_AA_2-1520
Could not find nucleotides of chain AA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1jgo_1_A_20-55
Could not find nucleotides of chain A in annotation 1jgo.json. Either there is a problem with 1jgo mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1jgo_1_A_7-1518
Could not find nucleotides of chain A in annotation 1jgo.json. Either there is a problem with 1jgo mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1jgo_1_A_7-1515
Could not find nucleotides of chain A in annotation 1jgo.json. Either there is a problem with 1jgo mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1jgo_1_A_2-1520
Could not find nucleotides of chain A in annotation 1jgo.json. Either there is a problem with 1jgo mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1jgp_1_A_20-55
Could not find nucleotides of chain A in annotation 1jgp.json. Either there is a problem with 1jgp mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1jgp_1_A_7-1518
Could not find nucleotides of chain A in annotation 1jgp.json. Either there is a problem with 1jgp mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1jgp_1_A_7-1515
Could not find nucleotides of chain A in annotation 1jgp.json. Either there is a problem with 1jgp mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1jgp_1_A_2-1520
Could not find nucleotides of chain A in annotation 1jgp.json. Either there is a problem with 1jgp mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
3ep2_1_B_1-50
DSSR warning 3ep2.json: no nucleotides found. Ignoring 3ep2_1_B_1-50.
3eq3_1_B_1-50
DSSR warning 3eq3.json: no nucleotides found. Ignoring 3eq3_1_B_1-50.
3eq4_1_B_1-50
DSSR warning 3eq4.json: no nucleotides found. Ignoring 3eq4_1_B_1-50.
3pgw_1_R_1-164
DSSR warning 3pgw.json: no nucleotides found. Ignoring 3pgw_1_R_1-164.
3pgw_1_N_1-164
DSSR warning 3pgw.json: no nucleotides found. Ignoring 3pgw_1_N_1-164.
3cw1_1_x_1-138
DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_x_1-138.
3cw1_1_w_1-138
DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_w_1-138.
3cw1_1_V_1-138
DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_V_1-138.
3cw1_1_v_1-138
DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_v_1-138.
2iy3_1_B_9-105
DSSR warning 2iy3.json: no nucleotides found. Ignoring 2iy3_1_B_9-105.
3jcr_1_N_1-106
DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_N_1-106.
3jcr_1_N_1-188
DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_N_1-188.
2vaz_1_A_64-177
DSSR warning 2vaz.json: no nucleotides found. Ignoring 2vaz_1_A_64-177.
2ftc_1_R_81-1466
DSSR warning 2ftc.json: no nucleotides found. Ignoring 2ftc_1_R_81-1466.
2ftc_1_R_1-1568
DSSR warning 2ftc.json: no nucleotides found. Ignoring 2ftc_1_R_1-1568.
2ftc_1_R_792-1568
DSSR warning 2ftc.json: no nucleotides found. Ignoring 2ftc_1_R_792-1568.
3jcr_1_M_1-141
DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_M_1-141.
3jcr_1_M_1-107
DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_M_1-107.
3jcr_1_M_1-188
DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_M_1-188.
4v5z_1_B0_1-2840
DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_B0_1-2840.
4v5z_1_B0_1-2899
DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_B0_1-2899.
4v5z_1_B0_1-2902
DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_B0_1-2902.
5g2x_1_A_595-692
Sequence is too short. (< 5 resolved nts)
3iy8_1_A_1-540
DSSR warning 3iy8.json: no nucleotides found. Ignoring 3iy8_1_A_1-540.
4v5z_1_BY_2-113
DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_BY_2-113.
4v5z_1_BZ_1-70
DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_BZ_1-70.
1mvr_1_B_1-96
DSSR warning 1mvr.json: no nucleotides found. Ignoring 1mvr_1_B_1-96.
4adx_1_0_1-2923
DSSR warning 4adx.json: no nucleotides found. Ignoring 4adx_1_0_1-2923.
4adx_1_0_132-2915
DSSR warning 4adx.json: no nucleotides found. Ignoring 4adx_1_0_132-2915.
3eq4_1_Y_1-69
DSSR warning 3eq4.json: no nucleotides found. Ignoring 3eq4_1_Y_1-69.
4v5z_1_AA_1-1562
DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_AA_1-1562.
4v5z_1_AA_1-1563
DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_AA_1-1563.
6lqm_1_8_1267-4755
Could not find nucleotides of chain 8 in annotation 6lqm.json. Either there is a problem with 6lqm mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6lu8_1_8_1267-4755
Could not find nucleotides of chain 8 in annotation 6lu8.json. Either there is a problem with 6lu8 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6lsr_1_8_1267-4755
Could not find nucleotides of chain 8 in annotation 6lsr.json. Either there is a problem with 6lsr mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6lss_1_8_1267-4755
Could not find nucleotides of chain 8 in annotation 6lss.json. Either there is a problem with 6lss mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......@@ -5,7 +5,7 @@
# in the database.
# This should be run from the folder where the file is (to access the database with path "results/RNANet.db")
import os, pickle, sqlite3, sys
import os, pickle, sqlite3, shlex, subprocess, sys
import numpy as np
import pandas as pd
import threading as th
......@@ -16,14 +16,13 @@ import matplotlib.patches as mpatches
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import squareform
from mpl_toolkits.mplot3d import axes3d
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio import AlignIO, SeqIO
from functools import partial
from multiprocessing import Pool
from multiprocessing import Pool, Manager
from os import path
from tqdm import tqdm
from collections import Counter
from RNAnet import read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker
from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker
# This sets the paths
if len(sys.argv) > 1:
......@@ -37,7 +36,7 @@ else:
LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112
SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111
def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
"""
Plot the joint distribution of pseudotorsion angles, in a Ramachandran-style graph.
See Wadley & Pyle (2007)
......@@ -68,6 +67,12 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
if not path.isfile(f"data/wadley_kernel_{angle}.npz"):
# Get a worker number to position the progress bar
global idxQueue
thr_idx = idxQueue.get()
pbar = tqdm(total=2, desc=f"Worker {thr_idx+1}: eta/theta C{carbon} kernels", position=thr_idx+1, leave=False)
# Extract the angle values of c2'-endo and c3'-endo nucleotides
with sqlite3.connect("results/RNANet.db") as conn:
df = pd.read_sql(f"""SELECT {angle}, th{angle} FROM nucleotide WHERE puckering="C2'-endo" AND {angle} IS NOT NULL AND th{angle} IS NOT NULL;""", conn)
......@@ -89,13 +94,17 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j]
positions = np.vstack([xx.ravel(), yy.ravel()])
f_c3 = np.reshape(kernel_c3(positions).T, xx.shape)
pbar.update(1)
f_c2 = np.reshape(kernel_c2(positions).T, xx.shape)
pbar.update(1)
# Save the data to an archive for later use without the need to recompute
np.savez(f"data/wadley_kernel_{angle}.npz",
c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas,
c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas,
kernel_c3=f_c3, kernel_c2=f_c2)
pbar.close()
idxQueue.put(thr_idx)
else:
f = np.load(f"data/wadley_kernel_{angle}.npz")
c2_endo_etas = f["c2_endo_e"]
......@@ -106,7 +115,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
f_c2 = f["kernel_c2"]
xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j]
notify(f"Kernel computed for {angle}/th{angle} (or loaded from file).")
# notify(f"Kernel computed for {angle}/th{angle} (or loaded from file).")
# exact counts:
hist_c2, xedges, yedges = np.histogram2d(c2_endo_etas, c2_endo_thetas, bins=int(2*np.pi/0.1),
......@@ -139,7 +148,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}.png")
if show:
fig.show()
fig.close()
plt.close()
# Smoothed joint distribution
fig = plt.figure()
......@@ -150,7 +159,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}.png")
if show:
fig.show()
fig.close()
plt.close()
# 2D Wadley plot
fig = plt.figure(figsize=(5,5))
......@@ -163,7 +172,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}.png")
if show:
fig.show()
fig.close()
plt.close()
# print(f"[{worker_nbr}]\tComputed joint distribution of angles (C{carbon}) and saved the figures.")
def stats_len():
......@@ -171,11 +180,15 @@ def stats_len():
REQUIRES tables chain, nucleotide up to date.
"""
# Get a worker number to position the progress bar
global idxQueue
thr_idx = idxQueue.get()
cols = []
lengths = []
conn = sqlite3.connect("results/RNANet.db")
for i,f in enumerate(fam_list):
for i,f in enumerate(tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Average chain lengths", leave=False)):
# Define a color for that family in the plot
if f in LSU_set:
......@@ -190,11 +203,11 @@ def stats_len():
cols.append("grey")
# Get the lengths of chains
l = [ x[0] for x in sql_ask_database(conn, f"SELECT COUNT(index_chain) FROM (SELECT chain_id FROM chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY chain_id;") ]
with sqlite3.connect("results/RNANet.db") as conn:
l = [ x[0] for x in sql_ask_database(conn, f"SELECT COUNT(index_chain) FROM (SELECT chain_id FROM chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY chain_id;", warn_every=0) ]
lengths.append(l)
notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths")
conn.close()
# notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths")
# Plot the figure
fig = plt.figure(figsize=(10,3))
......@@ -223,7 +236,8 @@ def stats_len():
# Save the figure
fig.savefig("results/figures/lengths.png")
notify("Computed sequence length statistics and saved the figure.")
idxQueue.put(thr_idx) # replace the thread index in the queue
# notify("Computed sequence length statistics and saved the figure.")
def format_percentage(tot, x):
if not tot:
......@@ -242,40 +256,57 @@ def stats_freq():
Outputs results/frequencies.csv
REQUIRES tables chain, nucleotide up to date."""
# Get a worker number to position the progress bar
global idxQueue
thr_idx = idxQueue.get()
# Initialize a Counter object for each family
freqs = {}
for f in fam_list:
freqs[f] = Counter()
# List all nt_names happening within a RNA family and store the counts in the Counter
conn = sqlite3.connect("results/RNANet.db")
for i,f in enumerate(fam_list):
counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;"))
for i,f in enumerate(tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False)):
with sqlite3.connect("results/RNANet.db") as conn:
counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;", warn_every=0))
freqs[f].update(counts)
notify(f"[{i+1}/{len(fam_list)}] Computed {f} nucleotide frequencies.")
conn.close()
# notify(f"[{i+1}/{len(fam_list)}] Computed {f} nucleotide frequencies.")
# Create a pandas DataFrame, and save it to CSV.
df = pd.DataFrame()
for f in fam_list:
for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False):
tot = sum(freqs[f].values())
df = pd.concat([ df, pd.DataFrame([[ format_percentage(tot, x) for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ])
df = df.fillna(0)
df.to_csv("results/frequencies.csv")
notify("Saved nucleotide frequencies to CSV file.")
idxQueue.put(thr_idx) # replace the thread index in the queue
# notify("Saved nucleotide frequencies to CSV file.")
def parallel_stats_pairs(f):
"""Counts occurrences of intra-chain base-pair types in one RNA family
REQUIRES tables chain, nucleotide up-to-date."""
if path.isfile("data/"+f+"_pairs.csv") and path.isfile("data/"+f+"_counts.csv"):
return
# Get a worker number to position the progress bar
global idxQueue
thr_idx = idxQueue.get()
chain_id_list = mappings_list[f]
data = []
for cid in chain_id_list:
sqldata = []
for cid in tqdm(chain_id_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} basepair types", leave=False):
with sqlite3.connect("results/RNANet.db") as conn:
# Get comma separated lists of basepairs per nucleotide
interactions = pd.read_sql(f"SELECT nt_code as nt1, index_chain, paired, pair_type_LW FROM (SELECT chain_id FROM chain WHERE chain_id='{cid}') NATURAL JOIN nucleotide;", conn)
interactions = pd.DataFrame(
sql_ask_database(conn,
f"SELECT nt_code as nt1, index_chain, paired, pair_type_LW FROM (SELECT chain_id FROM chain WHERE chain_id='{cid}') NATURAL JOIN nucleotide;",
warn_every=0),
columns = ["nt1", "index_chain", "paired", "pair_type_LW"]
)
# expand the comma-separated lists in real lists
expanded_list = pd.concat([ pd.DataFrame({ 'nt1':[ row["nt1"] for x in row["paired"].split(',') ],
'index_chain':[ row['index_chain'] for x in row["paired"].split(',') ],
......@@ -317,27 +348,29 @@ def parallel_stats_pairs(f):
# Update the database
vlcnts = expanded_list.pair_type_LW.value_counts()
sqldata = ( vlcnts.at["cWW"]/2 if "cWW" in vlcnts.index else 0,
vlcnts.at["cWH"] if "cWH" in vlcnts.index else 0,
vlcnts.at["cWS"] if "cWS" in vlcnts.index else 0,
vlcnts.at["cHH"]/2 if "cHH" in vlcnts.index else 0,
vlcnts.at["cHS"] if "cHS" in vlcnts.index else 0,
vlcnts.at["cSS"]/2 if "cSS" in vlcnts.index else 0,
vlcnts.at["tWW"]/2 if "tWW" in vlcnts.index else 0,
vlcnts.at["tWH"] if "tWH" in vlcnts.index else 0,
vlcnts.at["tWS"] if "tWS" in vlcnts.index else 0,
vlcnts.at["tHH"]/2 if "tHH" in vlcnts.index else 0,
vlcnts.at["tHS"] if "tHS" in vlcnts.index else 0,
vlcnts.at["tSS"]/2 if "tSS" in vlcnts.index else 0,
int(sum(vlcnts.loc[[ str(x) for x in vlcnts.index if "." in str(x)]])/2),
cid)
with sqlite3.connect("results/RNANet.db") as conn:
sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?,
pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?,
pair_count_tHH = ?, pair_count_tHS = ?, pair_count_tSS = ?, pair_count_other = ? WHERE chain_id = ?;""", data=sqldata)
sqldata.append( ( vlcnts.at["cWW"]/2 if "cWW" in vlcnts.index else 0,
vlcnts.at["cWH"] if "cWH" in vlcnts.index else 0,
vlcnts.at["cWS"] if "cWS" in vlcnts.index else 0,
vlcnts.at["cHH"]/2 if "cHH" in vlcnts.index else 0,
vlcnts.at["cHS"] if "cHS" in vlcnts.index else 0,
vlcnts.at["cSS"]/2 if "cSS" in vlcnts.index else 0,
vlcnts.at["tWW"]/2 if "tWW" in vlcnts.index else 0,
vlcnts.at["tWH"] if "tWH" in vlcnts.index else 0,
vlcnts.at["tWS"] if "tWS" in vlcnts.index else 0,
vlcnts.at["tHH"]/2 if "tHH" in vlcnts.index else 0,
vlcnts.at["tHS"] if "tHS" in vlcnts.index else 0,
vlcnts.at["tSS"]/2 if "tSS" in vlcnts.index else 0,
int(sum(vlcnts.loc[[ str(x) for x in vlcnts.index if "." in str(x)]])/2),
cid) )
data.append(expanded_list)
# Update the database
with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn:
conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?,
pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?,
pair_count_tHH = ?, pair_count_tHS = ?, pair_count_tSS = ?, pair_count_other = ? WHERE chain_id = ?;""", many=True, data=sqldata, warn_every=0)
# merge all the dataframes from all chains of the family
expanded_list = pd.concat(data)
......@@ -351,7 +384,106 @@ def parallel_stats_pairs(f):
# Create an output DataFrame
f_df = pd.DataFrame([[ x for x in cnt.values() ]], columns=list(cnt), index=[f])
return expanded_list, f_df
f_df.to_csv(f"data/{f}_counts.csv")
expanded_list.to_csv(f"data/{f}_pairs.csv")
idxQueue.put(thr_idx) # replace the thread index in the queue
def to_dist_matrix(f):
    """Compute the pairwise %-identity matrix of the 3D-mapped sequences of family *f*.

    Extracts the sequences whose id contains '[' (the chains mapped to 3D
    structures) from the family alignment
    <path_to_seq_data>/realigned/<f>++.afa, writes them to a temporary
    Stockholm file, and runs Easel's `esl-alipid` on it. Parsed scores are
    stored in data/<f>.npy; the file acts as a cache for later runs.

    Returns 0 whether the matrix was computed or already cached.

    NOTE(review): esl-alipid reports identity as a percentage (0-100);
    confirm downstream consumers expecting a 0-1 distance scale.
    """
    if path.isfile("data/" + f + ".npy"):
        # Result already cached by a previous run.
        return 0

    # Get a worker number to position the progress bar
    global idxQueue
    thr_idx = idxQueue.get()

    # Extract the 3D-only sequences into a temporary Stockholm file
    with open(path_to_seq_data + f"/realigned/{f}++.afa") as al_file:
        al = AlignIO.read(al_file, "fasta")
    names = [x.id for x in al if '[' in x.id]
    al = al[-len(names):]
    with open(f + "_3d_only.stk", "w") as only_3d:
        only_3d.write(al.format("stockholm"))
    del al  # the alignment can be large; free it before spawning the child

    # Run esl-alipid and parse its tabular output (seq1 seq2 %id ...)
    command = f"esl-alipid --rna --noheader --informat stockholm {f}_3d_only.stk"
    process = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE)
    id_matrix = np.zeros((len(names), len(names)))
    index_of = {name: i for i, name in enumerate(names)}  # O(1) lookups instead of list.index()
    pbar = tqdm(total=len(names) * (len(names) - 1) * 0.5, position=thr_idx + 1,
                desc=f"Worker {thr_idx+1}: {f} idty matrix", leave=False)
    # Iterate the pipe until EOF instead of polling: the former poll()/readline()
    # loop dropped any lines still buffered when the child exited between the
    # poll and the read, losing the last pair scores.
    for raw in process.stdout:
        fields = raw.split()
        if len(fields) < 3:
            continue  # skip blank/malformed lines defensively
        s1 = fields[0].decode('utf-8')
        s2 = fields[1].decode('utf-8')
        score = fields[2].decode('utf-8')
        id_matrix[index_of[s1], index_of[s2]] = float(score)
        pbar.update(1)
    process.wait()  # reap the child process
    pbar.close()

    # Clean up the temporary alignment and cache the matrix
    subprocess.run(["rm", "-f", f + "_3d_only.stk"])
    np.save("data/" + f + ".npy", id_matrix)
    idxQueue.put(thr_idx)  # replace the thread index in the queue
    return 0
def seq_idty():
    """Computes identity matrices for each of the RNA families.

    Loads the per-family matrices cached by to_dist_matrix() from data/*.npy,
    stores each family's average identity in the `family` table, then draws
    one clustered identity heatmap per family into results/figures/distances.png.

    REQUIRES temporary results files in data/*.npy
    REQUIRES tables chain, family up to date.

    NOTE(review): indentation was lost in this extract; confirm the loop/branch
    nesting against the repository before reusing this text as code.
    """
    # load distance matrices
    fam_arrays = []
    for f in famlist:
    if path.isfile("data/"+f+".npy"):
    fam_arrays.append(np.load("data/"+f+".npy"))
    else:
    # Families without a cached matrix get an empty placeholder and are
    # skipped by the `if not len(D)` guards below.
    fam_arrays.append([])
    # Update database with identity percentages
    conn = sqlite3.connect("results/RNANet.db")
    for f, D in zip(famlist, fam_arrays):
    if not len(D): continue
    # NOTE(review): assumes D holds distances in [0,1]; the esl-alipid-based
    # to_dist_matrix stores raw percent identities (0-100), and np.average
    # over D + D.T also includes the zero diagonal — confirm the intended
    # normalisation.
    a = 1.0 - np.average(D + D.T) # Get symmetric matrix instead of lower triangle + convert from distance matrix to identity matrix
    conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';")
    conn.commit()
    conn.close()
    # Plots plots plots
    fig, axs = plt.subplots(4,17, figsize=(17,5.75))
    axs = axs.ravel()
    [axi.set_axis_off() for axi in axs]
    im = "" # Just to declare the variable, it will be set in the loop
    for f, D, ax in zip(famlist, fam_arrays, axs):
    if not len(D): continue
    if D.shape[0] > 2: # Cluster only if there is more than 2 sequences to organize
    D = D + D.T # Copy the lower triangle to upper, to get a symetrical matrix
    condensedD = squareform(D)
    # Compute basic dendrogram by Ward's method
    Y = sch.linkage(condensedD, method='ward')
    Z = sch.dendrogram(Y, orientation='left', no_plot=True)
    # Reorganize rows and cols
    idx1 = Z['leaves']
    D = D[idx1,:]
    D = D[:,idx1[::-1]]
    im = ax.matshow(1.0 - D, vmin=0, vmax=1, origin='lower') # convert to identity matrix 1 - D from distance matrix D
    ax.set_title(f + "\n(" + str(len(mappings_list[f]))+ " chains)", fontsize=10)
    fig.tight_layout()
    fig.subplots_adjust(wspace=0.1, hspace=0.3)
    # The colorbar reuses the last `im` handle set in the loop above.
    fig.colorbar(im, ax=axs[-1], shrink=0.8)
    fig.savefig(f"results/figures/distances.png")
    notify("Computed all identity matrices and saved the figure.")
def stats_pairs():
"""Counts occurrences of intra-chain base-pair types in RNA families
......@@ -363,26 +495,15 @@ def stats_pairs():
return family_data.apply(partial(format_percentage, sum(family_data)))
if not path.isfile("data/pair_counts.csv"):
p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=read_cpu_number(), maxtasksperchild=5)
try:
fam_pbar = tqdm(total=len(fam_list), desc="Pair-types in families", position=0, leave=True)
results = []
allpairs = []
for _, newp_famdf in enumerate(p.imap_unordered(parallel_stats_pairs, fam_list)):
newpairs, fam_df = newp_famdf
fam_pbar.update(1)
results.append(fam_df)
allpairs.append(newpairs)
fam_pbar.close()
p.close()
p.join()
except KeyboardInterrupt:
warn("KeyboardInterrupt, terminating workers.", error=True)
fam_pbar.close()
p.terminate()
p.join()
exit(1)
results = []
allpairs = []
for f in fam_list:
newpairs = pd.read_csv(f"data/{f}_pairs.csv", index_col=0)
fam_df = pd.read_csv(f"data/{f}_counts.csv", index_col=0)
results.append(fam_df)
allpairs.append(newpairs)
subprocess.run(["rm", "-f", f"data/{f}_pairs.csv"])
subprocess.run(["rm", "-f", f"data/{f}_counts.csv"])
all_pairs = pd.concat(allpairs)
df = pd.concat(results).fillna(0)
df.to_csv("data/pair_counts.csv")
......@@ -431,92 +552,12 @@ def stats_pairs():
notify("Computed nucleotide statistics and saved CSV and PNG file.")
def to_dist_matrix(f):
    """Compute (or load from cache) the identity-distance matrix of family *f*.

    Runs Bio.Phylo's DistanceCalculator('identity') on the last
    len(mappings_list[f]) records of <path_to_seq_data>/realigned/<f>++.afa,
    then caches the lower-triangular matrix, zero-padded to square rows,
    into data/<f>.npy. Returns 0 in every case.
    """
    # NOTE(review): superseded variant — an esl-alipid-based to_dist_matrix is
    # also present in this file; in a single module the later definition wins.
    if path.isfile("data/"+f+".npy"):
    notify(f"Computed {f} distance matrix", "loaded from file")
    return 0
    notify(f"Computing {f} distance matrix from alignment...")
    dm = DistanceCalculator('identity')
    with open(path_to_seq_data+"/realigned/"+f+"++.afa") as al_file:
    # Keep only the tail of the alignment: the chains with 3D structures.
    al = AlignIO.read(al_file, "fasta")[-len(mappings_list[f]):]
    idty = dm.get_distance(al).matrix # list of lists
    del al
    l = len(idty)
    # Pad each lower-triangle row with zeros so every row has length l.
    np.save("data/"+f+".npy", np.array([ idty[i] + [0]*(l-1-i) if i<l-1 else idty[i] for i in range(l) ], dtype=object))
    del idty
    notify(f"Computed {f} distance matrix")
    return 0
def seq_idty():
    """Computes identity matrices for each of the RNA families.

    Selects families with at least two chains, computes their distance
    matrices in parallel via to_dist_matrix(), stores each family's average
    identity in the `family` table, and plots one clustered heatmap per
    family into results/figures/distances.png.

    Creates temporary results files in data/*.npy
    REQUIRES tables chain, family up to date."""
    # NOTE(review): superseded variant — a refactored seq_idty (matrices
    # precomputed by pool jobs) is also present in this file.
    # List the families for which we will compute sequence identity matrices
    conn = sqlite3.connect("results/RNANet.db")
    famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 1 ORDER BY rfam_acc ASC;") ]
    ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;") ]
    if len(ignored):
    print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')
    # compute distance matrices (or ignore if data/RF0****.npy exists)
    p = Pool(processes=8)
    p.map(to_dist_matrix, famlist)
    p.close()
    p.join()
    # load them
    fam_arrays = []
    for f in famlist:
    if path.isfile("data/"+f+".npy"):
    fam_arrays.append(np.load("data/"+f+".npy"))
    else:
    # Missing matrices become empty placeholders, skipped below.
    fam_arrays.append([])
    # Update database with identity percentages
    conn = sqlite3.connect("results/RNANet.db")
    for f, D in zip(famlist, fam_arrays):
    if not len(D): continue
    # NOTE(review): np.average over D + D.T also averages the zero diagonal;
    # confirm the intended normalisation.
    a = 1.0 - np.average(D + D.T) # Get symmetric matrix instead of lower triangle + convert from distance matrix to identity matrix
    conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';")
    conn.commit()
    conn.close()
    # Plots plots plots
    fig, axs = plt.subplots(4,17, figsize=(17,5.75))
    axs = axs.ravel()
    [axi.set_axis_off() for axi in axs]
    im = "" # Just to declare the variable, it will be set in the loop
    for f, D, ax in zip(famlist, fam_arrays, axs):
    if not len(D): continue
    if D.shape[0] > 2: # Cluster only if there is more than 2 sequences to organize
    D = D + D.T # Copy the lower triangle to upper, to get a symetrical matrix
    condensedD = squareform(D)
    # Compute basic dendrogram by Ward's method
    Y = sch.linkage(condensedD, method='ward')
    Z = sch.dendrogram(Y, orientation='left', no_plot=True)
    # Reorganize rows and cols
    idx1 = Z['leaves']
    D = D[idx1,:]
    D = D[:,idx1[::-1]]
    im = ax.matshow(1.0 - D, vmin=0, vmax=1, origin='lower') # convert to identity matrix 1 - D from distance matrix D
    ax.set_title(f + "\n(" + str(len(mappings_list[f]))+ " chains)", fontsize=10)
    fig.tight_layout()
    fig.subplots_adjust(wspace=0.1, hspace=0.3)
    # The colorbar reuses the last `im` handle set in the loop above.
    fig.colorbar(im, ax=axs[-1], shrink=0.8)
    fig.savefig(f"results/figures/distances.png")
    notify("Computed all identity matrices and saved the figure.")
def per_chain_stats():
"""Computes per-chain frequencies and base-pair type counts.
REQUIRES tables chain, nucleotide up to date. """
with sqlite3.connect("results/RNANet.db") as conn:
with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn:
# Compute per-chain nucleotide frequencies
df = pd.read_sql("SELECT SUM(is_A) as A, SUM(is_C) AS C, SUM(is_G) AS G, SUM(is_U) AS U, SUM(is_other) AS O, chain_id FROM nucleotide GROUP BY chain_id;", conn)
df["total"] = pd.Series(df.A + df.C + df.G + df.U + df.O, dtype=np.float64)
......@@ -524,39 +565,74 @@ def per_chain_stats():
df = df.drop("total", axis=1)
# Set the values
conn.execute('pragma journal_mode=wal')
sql_execute(conn, "UPDATE chain SET chain_freq_A = ?, chain_freq_C = ?, chain_freq_G = ?, chain_freq_U = ?, chain_freq_other = ? WHERE chain_id= ?;",
many=True, data=list(df.to_records(index=False)), warn_every=10)
notify("Updated the database with per-chain base frequencies")
def log_to_pbar(pbar):
    """Build a completion callback that advances *pbar* by one step per call.

    Intended for Pool.apply_async(callback=...): the job's return value is
    accepted and discarded; only the progress bar is touched.
    """
    def _tick(_result):
        # The async result is irrelevant here — the call itself marks progress.
        pbar.update(1)
    return _tick
# NOTE(review): this __main__ section appears to contain BOTH sides of a merge
# diff concatenated without +/- markers: the old thread-based driver (the
# `threads = [...]` part) and the new Job/Pool-based driver (from the first
# `with sqlite3.connect(...)` onward). Confirm against the repository which
# side is the live code before editing.
if __name__ == "__main__":
os.makedirs("results/figures/wadley_plots/", exist_ok=True)
print("Loading mappings list...")
# Old variant: plain connection, no issue=0 filter on chains.
conn = sqlite3.connect("results/RNANet.db")
fam_list = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;") ]
mappings_list = {}
for k in fam_list:
mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain WHERE rfam_acc='{k}';") ]
conn.close()
# stats_pairs()
# Define threads for the tasks
threads = [
th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 1}),
th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 4}),
th.Thread(target=stats_len), # computes figures
th.Thread(target=stats_freq), # Updates the database
th.Thread(target=seq_idty), # produces .npy files and seq idty figures
th.Thread(target=per_chain_stats) # Updates the database
]
# Start the threads
for t in threads:
t.start()
# Wait for the threads to complete
for t in threads:
t.join()
# New variant: context-managed connection, chains filtered on issue=0.
with sqlite3.connect("results/RNANet.db") as conn:
fam_list = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;") ]
mappings_list = {}
for k in fam_list:
mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain WHERE rfam_acc='{k}' and issue=0;") ]
# List the families for which we will compute sequence identity matrices
with sqlite3.connect("results/RNANet.db") as conn:
famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 0 ORDER BY rfam_acc ASC;") ]
ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;") ]
if len(ignored):
print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')
# Prepare the multiprocessing execution environment
# NOTE(review): max(read_cpu_number()-1, 32) FLOORS the worker count at 32;
# if the intent was to cap it at 32, this should be min(...) — confirm.
nworkers = max(read_cpu_number()-1, 32)
thr_idx_mgr = Manager()
idxQueue = thr_idx_mgr.Queue()
for i in range(nworkers):
idxQueue.put(i)
# Define the tasks
joblist = []
joblist.append(Job(function=reproduce_wadley_results, args=(1,)))
joblist.append(Job(function=reproduce_wadley_results, args=(4,)))
joblist.append(Job(function=stats_len)) # Computes figures
joblist.append(Job(function=stats_freq)) # updates the database
for f in famlist:
joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database
if f not in ignored:
joblist.append(Job(function=to_dist_matrix, args=(f,))) # updates the database
p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, leave=True)
# NOTE(review): the bare `sqlite3` below is a no-op expression — almost
# certainly a stray token committed by accident; candidate for removal.
sqlite3
try:
for j in joblist:
p.apply_async(j.func_, args=j.args_, callback=log_to_pbar(pbar))
p.close()
p.join()
pbar.close()
except KeyboardInterrupt:
warn("KeyboardInterrupt, terminating workers.", error=True)
p.terminate()
p.join()
pbar.close()
exit(1)
# NOTE(review): bare except swallows every error (including SystemExit from
# the handler above on re-raise paths) and then falls through to the
# sequential work below; at minimum catch Exception and log the traceback.
except:
print("Something went wrong")
print()
print()
# finish the work after the parallel portions
per_chain_stats()
seq_idty()
stats_pairs()
......