Merge branch 'master' of https://github.com/persalteas/RNANet into master

Louis BECQUEY
Commit 97926e0badad13e955c797c19754d4add6e9a2e9 97926e0b 2 parents 61d2238e e57395a9
Showing 7 changed files with 992 additions and 214 deletions
README.md
RNAnet.py
automate.sh
kill_rnanet.sh
known_issues.txt
known_issues_reasons.txt
statistics.py
--- a/README.md
View file @97926e0
+++ b/README.md
View file @97926e0
@@ -7,7 +7,7 @@ Future versions might compute a real MSA-based clusering directly with Rfamseq n
 This script prepares the dataset from available public data in PDB and Rfam.
-**Please cite**: *Coming soon, expect it summer 2020*
+**Please cite**: *Coming soon, expect it in 2021*
 # What it does
 The script follows these steps:
@@ -72,7 +72,7 @@ You need to install:
 ## Command line
 Run `./RNANet.py --3d-folder path/to/3D/data/folder --seq-folder path/to/sequence/data/folder [ - other options ]`. 
-It requires solid hardware to run. It takes around 15 hours the first time, and 9h then, tested on a server with 32 cores and 48GB of RAM.
+It requires solid hardware to run. It takes around around 12 to 15 hours the first time, and 1 to 3h then, tested on a server with 32 cores and 48GB of RAM.
 The detailed list of options is below:
 ```
--- a/RNAnet.py
View file @97926e0
+++ b/RNAnet.py
View file @97926e0
@@ -273,32 +273,39 @@ class Chain:
             if self.mapping is not None:
                 self.mapping.log(f"Shifting nt_resnum numbering because of {n_dup} duplicate residues {df.iloc[i,1]}")
-            if df.iloc[i,1] == df.iloc[i-1,1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]:
+            try:
-                # The redundant nts are consecutive in the chain (at the begining at least), and there is a gap at the end
+                if i > 0 and index_last_dup +1 < len(df.index) and df.iloc[i,1] == df.iloc[i-1,1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]:
-
+                    # The redundant nts are consecutive in the chain (at the begining at least), and there is a gap at the end
-                if duplicates.iloc[n_dup-1, 0] - duplicates.iloc[0, 0] + 1 == n_dup:
+
-                    # They are all contiguous in the chain
+                    if duplicates.iloc[n_dup-1, 0] - duplicates.iloc[0, 0] + 1 == n_dup:
-                    # 4v9n-DA case (and similar ones) : 610-611-611A-611B-611C-611D-611E-611F-611G-617-618...
+                        # They are all contiguous in the chain
-                    # there is a redundancy (611) followed by a gap (611-617). 
+                        # 4v9n-DA case (and similar ones) : 610-611-611A-611B-611C-611D-611E-611F-611G-617-618...
-                    # We want the redundancy to fill the gap.
+                        # there is a redundancy (611) followed by a gap (611-617). 
-                    df.iloc[i:i+n_dup-1, 1] += 1
+                        # We want the redundancy to fill the gap.
+                        df.iloc[i:i+n_dup-1, 1] += 1
+                    else:
+                        # We solve the problem continous component by continuous component
+                        for j in range(1, n_dup+1):
+                            if duplicates.iloc[j,0] == 1 + duplicates.iloc[j-1,0]: # continuous
+                                df.iloc[i+j-1,1] += 1
+                            else:
+                                break
+                elif df.iloc[i,1] == df.iloc[i-1,1]:
+                    # Common 4v9q-DV case (and similar ones) : e.g. chains contains 17 and 17A which are both read 17 by DSSR.
+                    # Solution : we shift the numbering of 17A (to 18) and the following residues.
+                    df.iloc[i:, 1] += 1
                 else:
-                    # We solve the problem continous component by continuous component
+                    # 4v9k-DA case (and similar ones) : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ...
-                    for j in range(1, n_dup+1):
+                    # Here the 163B is read 163 by DSSR, but there already is a residue 163.
-                        if duplicates.iloc[j,0] == 1 + duplicates.iloc[j-1,0]: # continuous
+                    # Solution : set nt_resnum[i] to nt_resnum[i-1] + 1, and shift the following by 1.
-                            df.iloc[i+j-1,1] += 1
+                    df.iloc[i, 1] = 1 + df.iloc[i-1, 1]
-                        else:
+                    df.iloc[i+1:, 1] += 1
-                            break
+            except:
-            elif df.iloc[i,1] == df.iloc[i-1,1]:
+                warn(f"Error with parsing of {self.chain_label} duplicate residue numbers. Ignoring it.")
-                # Common 4v9q-DV case (and similar ones) : e.g. chains contains 17 and 17A which are both read 17 by DSSR.
+                self.delete_me = True
-                # Solution : we shift the numbering of 17A (to 18) and the following residues.
+                self.error_messages = f"Error with parsing of duplicate residues numbers."
-                df.iloc[i:, 1] += 1
+                return None
-            else:
+
-                # 4v9k-DA case (and similar ones) : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ...
-                # Here the 163B is read 163 by DSSR, but there already is a residue 163.
-                # Solution : set nt_resnum[i] to nt_resnum[i-1] + 1, and shift the following by 1.
-                df.iloc[i, 1] = 1 + df.iloc[i-1, 1]
-                df.iloc[i+1:, 1] += 1
         # Search for ligands at the end of the selection
         # Drop ligands detected as residues by DSSR, by detecting several markers
@@ -1019,7 +1026,7 @@ class Pipeline:
                 print(f"nohup bash -c 'time {runDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --archive' &") 
                 sys.exit()
             elif opt == '--version':
-                print("RNANet 1.0 alpha ")
+                print("RNANet 1.1 beta")
                 sys.exit()
             elif opt == "-r" or opt == "--resolution":
                 assert float(arg) > 0.0 and float(arg) <= 20.0 
@@ -1382,7 +1389,7 @@ class Pipeline:
             # Remove previous precomputed data
             subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"])
             for f in self.fam_list:
-                subprocess.run(["rm","-f", f"data/{f}.npy"])
+                subprocess.run(["rm","-f", f"data/{f}.npy", f"data/{f}_pairs.csv", f"data/{f}_counts.csv"])
             # Run statistics files
             os.chdir(runDir)
@@ -1390,13 +1397,12 @@ class Pipeline:
             subprocess.run(["python3.8", "statistics.py", path_to_3D_data, path_to_seq_data])
         # Save additional informations
-        conn = sqlite3.connect(runDir+"/results/RNANet.db")
+        with sqlite3.connect(runDir+"/results/RNANet.db") as conn:
-        pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;", 
+            pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;", 
-                          conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
+                            conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
-        pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure 
+            pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure 
-                            JOIN chain ON structure.pdb_id = chain.structure_id
+                                JOIN chain ON structure.pdb_id = chain.structure_id
-                            ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
+                                ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
-        conn.close()
         # Archive the results
         if self.SELECT_ONLY is None:
@@ -1404,7 +1410,10 @@ class Pipeline:
             subprocess.run(["tar","-C", path_to_3D_data + "/datapoints","-czf",f"results/archive/RNANET_datapoints_{time_str}.tar.gz","."])
         # Update shortcuts to latest versions
-        subprocess.run(["rm", "-f", runDir + "/results/RNANET_datapoints_latest.tar.gz", runDir + "/results/summary_latest.csv", runDir + "/results/families_latest.csv"])
+        subprocess.run(["rm", "-f", runDir + "/results/RNANET_datapoints_latest.tar.gz", 
+                                    runDir + "/results/summary_latest.csv", 
+                                    runDir + "/results/families_latest.csv"
+                        ])
         subprocess.run(['ln',"-s", runDir +f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", runDir + "/results/RNANET_datapoints_latest.tar.gz"])
         subprocess.run(['ln',"-s", runDir +f"/results/archive/summary_{time_str}.csv", runDir + "/results/summary_latest.csv"])
         subprocess.run(['ln',"-s", runDir +f"/results/archive/families_{time_str}.csv", runDir + "/results/families_latest.csv"])
@@ -1631,6 +1640,7 @@ def sql_ask_database(conn, sql, warn_every = 10):
 @trace_unhandled_exceptions
 def sql_execute(conn, sql, many=False, data=None, warn_every=10):
+    conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
     for _ in range(100): # retry 100 times if it fails
         try:
             if many:
@@ -2397,6 +2407,7 @@ if __name__ == "__main__":
             rfam_acc_to_download[c.mapping.rfam_acc] = [ c ]
         else:
             rfam_acc_to_download[c.mapping.rfam_acc].append(c)
+
     print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences")
     pp.fam_list = sorted(rfam_acc_to_download.keys())
--- a/automate.sh 100644 → 100755
View file @97926e0
+++ b/automate.sh 100644 → 100755
View file @97926e0
 # This is a script supposed to be run periodically as a cron job
+cd /home/lbecquey/Projects/RNANet
+rm -f latest_run.log errors.txt
+
 # Run RNANet
-cd /home/lbecquey/Projects/RNANet;
+bash -c 'time ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 -s --archive' &> latest_run.log
-rm -f stdout.txt stderr.txt errors.txt;
+touch results/RNANet.db # update last modification date
-time './RNAnet.py --3d-folder /home/lbequey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -s -r 20.0' > stdout.txt 2> stderr.txt;
+rm -f results/RNANet.db-wal results/RNANet.db-shm # SQLite temporary files
-# Sync in Seafile
+# Compress
-seaf-cli start;
+rm -f /home/lbecquey/Projects/RNANet/results/RNANet.db.gz
+echo 'Deleted results/RNANet.db.gz (if existed)' >> latest_run.log
+gzip -k /home/lbecquey/Projects/RNANet/results/RNANet.db
+echo 'Recreated it.' >> latest_run.log
-seaf-cli stop;
+# Sync in Seafile
+seaf-cli start >> latest_run.log 2>&1
+echo 'Waiting 10m for SeaFile synchronization...' >> latest_run.log
+sleep 10m
+echo `seaf-cli status` >> latest_run.log
+seaf-cli stop >> latest_run.log 2>&1
+echo 'We are '`date`', update completed.' >> latest_run.log
--- a/kill_rnanet.sh
View file @97926e0
+++ b/kill_rnanet.sh
View file @97926e0
@@ -10,6 +10,17 @@ for KILLPID in $PROCESS_LIST; do
   fi
 done
+PROCESS_TO_KILL="statistics.py"
+PROCESS_LIST=`ps ax | grep -Ei ${PROCESS_TO_KILL} | grep -Eiv '(grep|vi statistics.py)' | awk ' { print $1;}'`
+KILLED=
+for KILLPID in $PROCESS_LIST; do
+  if [ ! -z $KILLPID ];then
+    kill -9 $KILLPID
+    echo "Killed PID ${KILLPID}"
+    KILLED=yes
+  fi
+done
+
 if [ -z $KILLED ];then
     echo "Didn't kill anything"
 fi
--- a/known_issues.txt 0 → 100644
View file @97926e0
+++ b/known_issues.txt 0 → 100644
View file @97926e0
+1ml5_1_a_1-2914
+1ml5_1_a_151-2903
+1ml5_1_A_7-1518
+1ml5_1_A_7-1515
+1ml5_1_A_2-1520
+1ml5_1_b_5-121
+2rdo_1_A_3-118
+4v48_1_A9_3-118
+4v47_1_A9_3-118
+1vy7_1_AY_1-73
+1vy7_1_CY_1-73
+4w2h_1_CY_1-73
+6zmi_1_L8_1267-4755
+6zm7_1_L8_1267-4755
+6y6x_1_L8_1267-4755
+6z6n_1_L8_1267-4755
+6qzp_1_L8_1267-4755
+6zme_1_L8_1267-4755
+6z6l_1_L8_1267-4755
+6ek0_1_L8_1267-4755
+6zmo_1_L8_1267-4755
+6z6m_1_L8_1267-4755
+6ole_1_D_1267-4755
+6om0_1_D_1267-4755
+6y2l_1_L8_1267-4755
+6y0g_1_L8_1267-4755
+6oli_1_D_1267-4755
+6olg_1_A3_1267-4755
+6y57_1_L8_1267-4755
+5t2c_1_C_1267-4755
+6om7_1_D_1267-4755
+4ug0_1_L8_1267-4755
+6olf_1_D_1267-4755
+6ip5_1_1C_1267-4755
+6ip8_1_1C_1267-4755
+6olz_1_A3_1267-4755
+5aj0_1_A3_1267-4755
+5lks_1_L8_1267-4755
+6ip6_1_1C_1267-4755
+4v6x_1_A8_1267-4755
+2z9q_1_A_1-72
+1ls2_1_B_1-73
+3ep2_1_Y_1-72
+3eq3_1_Y_1-72
+4v48_1_A6_1-73
+1gsg_1_T_1-72
+3jcr_1_H_1-115
+1eg0_1_O_1-73
+4v42_1_BB_5-121
+4v42_1_BA_1-2914
+4v42_1_BA_151-2903
+2ob7_1_A_10-319
+1x1l_1_A_1-130
+1zc8_1_Z_1-130
+1zc8_1_Z_1-91
+2ob7_1_D_1-130
+1r2x_1_C_1-58
+1r2w_1_C_1-58
+1eg0_1_L_1-56
+1eg0_1_L_1-57
+6rxu_1_C2_588-2386
+6rxu_1_C2_588-2383
+6rxu_1_C2_583-2388
+5oql_1_2_588-2386
+5oql_1_2_588-2383
+5oql_1_2_583-2388
+6rxv_1_C2_588-2386
+6rxv_1_C2_588-2383
+6rxv_1_C2_583-2388
+6rxz_1_C2_588-2386
+6rxz_1_C2_588-2383
+6rxz_1_C2_583-2388
+6rxy_1_C2_588-2386
+6rxy_1_C2_588-2383
+6rxy_1_C2_583-2388
+6rxt_1_C2_588-2386
+6rxt_1_C2_588-2383
+6rxt_1_C2_583-2388
+4v48_1_BA_1-91
+4v48_1_BA_6-1541
+4v48_1_BA_6-1538
+4v48_1_BA_1-1543
+4v47_1_BA_1-91
+4v47_1_BA_6-1540
+4v47_1_BA_6-1537
+4v47_1_BA_1-1542
+2rdo_1_B_6-1460
+2rdo_1_B_6-1522
+2rdo_1_B_1-2903
+2rdo_1_B_6-1457
+2rdo_1_B_1-2904
+2rdo_1_B_1-1528
+2rdo_1_B_160-2893
+4v48_1_A0_6-1460
+4v48_1_A0_6-1522
+4v48_1_A0_1-2903
+4v48_1_A0_6-1457
+4v48_1_A0_1-2904
+4v48_1_A0_1-1528
+4v48_1_A0_160-2893
+4v47_1_A0_6-1460
+4v47_1_A0_6-1522
+4v47_1_A0_1-2903
+4v47_1_A0_6-1457
+4v47_1_A0_1-2904
+4v47_1_A0_1-1528
+4v47_1_A0_160-2893
+1zc8_1_A_1-59
+1mvr_1_D_1-59
+4c9d_1_D_29-1
+4c9d_1_C_29-1
+4adx_1_9_1-121
+1zn1_1_B_1-59
+1emi_1_B_1-108
+3iy9_1_A_498-1027
+1jgq_1_A_20-55
+1jgq_1_A_7-1518
+1jgq_1_A_7-1515
+1jgq_1_A_2-1520
+4v42_1_AA_20-55
+4v42_1_AA_7-1518
+4v42_1_AA_7-1515
+4v42_1_AA_2-1520
+1jgo_1_A_20-55
+1jgo_1_A_7-1518
+1jgo_1_A_7-1515
+1jgo_1_A_2-1520
+1jgp_1_A_20-55
+1jgp_1_A_7-1518
+1jgp_1_A_7-1515
+1jgp_1_A_2-1520
+3ep2_1_B_1-50
+3eq3_1_B_1-50
+3eq4_1_B_1-50
+3pgw_1_R_1-164
+3pgw_1_N_1-164
+3cw1_1_x_1-138
+3cw1_1_w_1-138
+3cw1_1_V_1-138
+3cw1_1_v_1-138
+2iy3_1_B_9-105
+3jcr_1_N_1-106
+3jcr_1_N_1-188
+2vaz_1_A_64-177
+2ftc_1_R_81-1466
+2ftc_1_R_1-1568
+2ftc_1_R_792-1568
+3jcr_1_M_1-141
+3jcr_1_M_1-107
+3jcr_1_M_1-188
+4v5z_1_B0_1-2840
+4v5z_1_B0_1-2899
+4v5z_1_B0_1-2902
+5g2x_1_A_595-692
+3iy8_1_A_1-540
+4v5z_1_BY_2-113
+4v5z_1_BZ_1-70
+1mvr_1_B_1-96
+4adx_1_0_1-2923
+4adx_1_0_132-2915
+3eq4_1_Y_1-69
+4v5z_1_AA_1-1562
+4v5z_1_AA_1-1563
+6lqm_1_8_1267-4755
+6lu8_1_8_1267-4755
+6lsr_1_8_1267-4755
+6lss_1_8_1267-4755
--- a/known_issues_reasons.txt 0 → 100644
View file @97926e0
+++ b/known_issues_reasons.txt 0 → 100644
View file @97926e0
+1ml5_1_a_1-2914
+Could not find nucleotides of chain a in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+1ml5_1_a_151-2903
+Could not find nucleotides of chain a in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+1ml5_1_A_7-1518
+Could not find nucleotides of chain A in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+1ml5_1_A_7-1515
+Could not find nucleotides of chain A in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+1ml5_1_A_2-1520
+Could not find nucleotides of chain A in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+1ml5_1_b_5-121
+Could not find nucleotides of chain b in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+2rdo_1_A_3-118
+DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_A_3-118.
+
+4v48_1_A9_3-118
+DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A9_3-118.
+
+4v47_1_A9_3-118
+DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A9_3-118.
+
+1vy7_1_AY_1-73
+Sequence is too short. (< 5 resolved nts)
+
+1vy7_1_CY_1-73
+Sequence is too short. (< 5 resolved nts)
+
+4w2h_1_CY_1-73
+Sequence is too short. (< 5 resolved nts)
+
+6zmi_1_L8_1267-4755
+Could not find nucleotides of chain L8 in annotation 6zmi.json. Either there is a problem with 6zmi mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6zm7_1_L8_1267-4755
+Could not find nucleotides of chain L8 in annotation 6zm7.json. Either there is a problem with 6zm7 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6y6x_1_L8_1267-4755
+Could not find nucleotides of chain L8 in annotation 6y6x.json. Either there is a problem with 6y6x mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6z6n_1_L8_1267-4755
+Could not find nucleotides of chain L8 in annotation 6z6n.json. Either there is a problem with 6z6n mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6qzp_1_L8_1267-4755
+Could not find nucleotides of chain L8 in annotation 6qzp.json. Either there is a problem with 6qzp mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6zme_1_L8_1267-4755
+Could not find nucleotides of chain L8 in annotation 6zme.json. Either there is a problem with 6zme mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6z6l_1_L8_1267-4755
+Could not find nucleotides of chain L8 in annotation 6z6l.json. Either there is a problem with 6z6l mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6ek0_1_L8_1267-4755
+Could not find nucleotides of chain L8 in annotation 6ek0.json. Either there is a problem with 6ek0 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6zmo_1_L8_1267-4755
+Could not find nucleotides of chain L8 in annotation 6zmo.json. Either there is a problem with 6zmo mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6z6m_1_L8_1267-4755
+Could not find nucleotides of chain L8 in annotation 6z6m.json. Either there is a problem with 6z6m mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6ole_1_D_1267-4755
+Could not find nucleotides of chain D in annotation 6ole.json. Either there is a problem with 6ole mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6om0_1_D_1267-4755
+Could not find nucleotides of chain D in annotation 6om0.json. Either there is a problem with 6om0 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6y2l_1_L8_1267-4755
+Could not find nucleotides of chain L8 in annotation 6y2l.json. Either there is a problem with 6y2l mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6y0g_1_L8_1267-4755
+Could not find nucleotides of chain L8 in annotation 6y0g.json. Either there is a problem with 6y0g mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6oli_1_D_1267-4755
+Could not find nucleotides of chain D in annotation 6oli.json. Either there is a problem with 6oli mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6olg_1_A3_1267-4755
+Could not find nucleotides of chain A3 in annotation 6olg.json. Either there is a problem with 6olg mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6y57_1_L8_1267-4755
+Could not find nucleotides of chain L8 in annotation 6y57.json. Either there is a problem with 6y57 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+5t2c_1_C_1267-4755
+Could not find nucleotides of chain C in annotation 5t2c.json. Either there is a problem with 5t2c mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6om7_1_D_1267-4755
+Could not find nucleotides of chain D in annotation 6om7.json. Either there is a problem with 6om7 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+4ug0_1_L8_1267-4755
+Could not find nucleotides of chain L8 in annotation 4ug0.json. Either there is a problem with 4ug0 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6olf_1_D_1267-4755
+Could not find nucleotides of chain D in annotation 6olf.json. Either there is a problem with 6olf mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6ip5_1_1C_1267-4755
+Could not find nucleotides of chain 1C in annotation 6ip5.json. Either there is a problem with 6ip5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6ip8_1_1C_1267-4755
+Could not find nucleotides of chain 1C in annotation 6ip8.json. Either there is a problem with 6ip8 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6olz_1_A3_1267-4755
+Could not find nucleotides of chain A3 in annotation 6olz.json. Either there is a problem with 6olz mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+5aj0_1_A3_1267-4755
+Could not find nucleotides of chain A3 in annotation 5aj0.json. Either there is a problem with 5aj0 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+5lks_1_L8_1267-4755
+Could not find nucleotides of chain L8 in annotation 5lks.json. Either there is a problem with 5lks mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6ip6_1_1C_1267-4755
+Could not find nucleotides of chain 1C in annotation 6ip6.json. Either there is a problem with 6ip6 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+4v6x_1_A8_1267-4755
+Could not find nucleotides of chain A8 in annotation 4v6x.json. Either there is a problem with 4v6x mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+2z9q_1_A_1-72
+DSSR warning 2z9q.json: no nucleotides found. Ignoring 2z9q_1_A_1-72.
+
+1ls2_1_B_1-73
+DSSR warning 1ls2.json: no nucleotides found. Ignoring 1ls2_1_B_1-73.
+
+3ep2_1_Y_1-72
+DSSR warning 3ep2.json: no nucleotides found. Ignoring 3ep2_1_Y_1-72.
+
+3eq3_1_Y_1-72
+DSSR warning 3eq3.json: no nucleotides found. Ignoring 3eq3_1_Y_1-72.
+
+4v48_1_A6_1-73
+DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A6_1-73.
+
+1gsg_1_T_1-72
+DSSR warning 1gsg.json: no nucleotides found. Ignoring 1gsg_1_T_1-72.
+
+3jcr_1_H_1-115
+DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_H_1-115.
+
+1eg0_1_O_1-73
+DSSR warning 1eg0.json: no nucleotides found. Ignoring 1eg0_1_O_1-73.
+
+4v42_1_BB_5-121
+Could not find nucleotides of chain BB in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+4v42_1_BA_1-2914
+Could not find nucleotides of chain BA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+4v42_1_BA_151-2903
+Could not find nucleotides of chain BA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+2ob7_1_A_10-319
+DSSR warning 2ob7.json: no nucleotides found. Ignoring 2ob7_1_A_10-319.
+
+1x1l_1_A_1-130
+DSSR warning 1x1l.json: no nucleotides found. Ignoring 1x1l_1_A_1-130.
+
+1zc8_1_Z_1-130
+DSSR warning 1zc8.json: no nucleotides found. Ignoring 1zc8_1_Z_1-130.
+
+1zc8_1_Z_1-91
+DSSR warning 1zc8.json: no nucleotides found. Ignoring 1zc8_1_Z_1-91.
+
+2ob7_1_D_1-130
+DSSR warning 2ob7.json: no nucleotides found. Ignoring 2ob7_1_D_1-130.
+
+1r2x_1_C_1-58
+DSSR warning 1r2x.json: no nucleotides found. Ignoring 1r2x_1_C_1-58.
+
+1r2w_1_C_1-58
+DSSR warning 1r2w.json: no nucleotides found. Ignoring 1r2w_1_C_1-58.
+
+1eg0_1_L_1-56
+DSSR warning 1eg0.json: no nucleotides found. Ignoring 1eg0_1_L_1-56.
+
+1eg0_1_L_1-57
+DSSR warning 1eg0.json: no nucleotides found. Ignoring 1eg0_1_L_1-57.
+
+6rxu_1_C2_588-2386
+Could not find nucleotides of chain C2 in annotation 6rxu.json. Either there is a problem with 6rxu mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6rxu_1_C2_588-2383
+Could not find nucleotides of chain C2 in annotation 6rxu.json. Either there is a problem with 6rxu mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6rxu_1_C2_583-2388
+Could not find nucleotides of chain C2 in annotation 6rxu.json. Either there is a problem with 6rxu mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+5oql_1_2_588-2386
+Could not find nucleotides of chain 2 in annotation 5oql.json. Either there is a problem with 5oql mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+5oql_1_2_588-2383
+Could not find nucleotides of chain 2 in annotation 5oql.json. Either there is a problem with 5oql mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+5oql_1_2_583-2388
+Could not find nucleotides of chain 2 in annotation 5oql.json. Either there is a problem with 5oql mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6rxv_1_C2_588-2386
+Could not find nucleotides of chain C2 in annotation 6rxv.json. Either there is a problem with 6rxv mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6rxv_1_C2_588-2383
+Could not find nucleotides of chain C2 in annotation 6rxv.json. Either there is a problem with 6rxv mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6rxv_1_C2_583-2388
+Could not find nucleotides of chain C2 in annotation 6rxv.json. Either there is a problem with 6rxv mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6rxz_1_C2_588-2386
+Could not find nucleotides of chain C2 in annotation 6rxz.json. Either there is a problem with 6rxz mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6rxz_1_C2_588-2383
+Could not find nucleotides of chain C2 in annotation 6rxz.json. Either there is a problem with 6rxz mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6rxz_1_C2_583-2388
+Could not find nucleotides of chain C2 in annotation 6rxz.json. Either there is a problem with 6rxz mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6rxy_1_C2_588-2386
+Could not find nucleotides of chain C2 in annotation 6rxy.json. Either there is a problem with 6rxy mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6rxy_1_C2_588-2383
+Could not find nucleotides of chain C2 in annotation 6rxy.json. Either there is a problem with 6rxy mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6rxy_1_C2_583-2388
+Could not find nucleotides of chain C2 in annotation 6rxy.json. Either there is a problem with 6rxy mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6rxt_1_C2_588-2386
+Could not find nucleotides of chain C2 in annotation 6rxt.json. Either there is a problem with 6rxt mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6rxt_1_C2_588-2383
+Could not find nucleotides of chain C2 in annotation 6rxt.json. Either there is a problem with 6rxt mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6rxt_1_C2_583-2388
+Could not find nucleotides of chain C2 in annotation 6rxt.json. Either there is a problem with 6rxt mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+4v48_1_BA_1-91
+DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_BA_1-91.
+
+4v48_1_BA_6-1541
+DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_BA_6-1541.
+
+4v48_1_BA_6-1538
+DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_BA_6-1538.
+
+4v48_1_BA_1-1543
+DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_BA_1-1543.
+
+4v47_1_BA_1-91
+DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_BA_1-91.
+
+4v47_1_BA_6-1540
+DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_BA_6-1540.
+
+4v47_1_BA_6-1537
+DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_BA_6-1537.
+
+4v47_1_BA_1-1542
+DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_BA_1-1542.
+
+2rdo_1_B_6-1460
+DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_6-1460.
+
+2rdo_1_B_6-1522
+DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_6-1522.
+
+2rdo_1_B_1-2903
+DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_1-2903.
+
+2rdo_1_B_6-1457
+DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_6-1457.
+
+2rdo_1_B_1-2904
+DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_1-2904.
+
+2rdo_1_B_1-1528
+DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_1-1528.
+
+2rdo_1_B_160-2893
+DSSR warning 2rdo.json: no nucleotides found. Ignoring 2rdo_1_B_160-2893.
+
+4v48_1_A0_6-1460
+DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_6-1460.
+
+4v48_1_A0_6-1522
+DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_6-1522.
+
+4v48_1_A0_1-2903
+DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_1-2903.
+
+4v48_1_A0_6-1457
+DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_6-1457.
+
+4v48_1_A0_1-2904
+DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_1-2904.
+
+4v48_1_A0_1-1528
+DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_1-1528.
+
+4v48_1_A0_160-2893
+DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A0_160-2893.
+
+4v47_1_A0_6-1460
+DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_6-1460.
+
+4v47_1_A0_6-1522
+DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_6-1522.
+
+4v47_1_A0_1-2903
+DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_1-2903.
+
+4v47_1_A0_6-1457
+DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_6-1457.
+
+4v47_1_A0_1-2904
+DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_1-2904.
+
+4v47_1_A0_1-1528
+DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_1-1528.
+
+4v47_1_A0_160-2893
+DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A0_160-2893.
+
+1zc8_1_A_1-59
+DSSR warning 1zc8.json: no nucleotides found. Ignoring 1zc8_1_A_1-59.
+
+1mvr_1_D_1-59
+DSSR warning 1mvr.json: no nucleotides found. Ignoring 1mvr_1_D_1-59.
+
+4c9d_1_D_29-1
+Mapping is reversed, this case is not supported (yet).
+
+4c9d_1_C_29-1
+Mapping is reversed, this case is not supported (yet).
+
+4adx_1_9_1-121
+DSSR warning 4adx.json: no nucleotides found. Ignoring 4adx_1_9_1-121.
+
+1zn1_1_B_1-59
+DSSR warning 1zn1.json: no nucleotides found. Ignoring 1zn1_1_B_1-59.
+
+1emi_1_B_1-108
+DSSR warning 1emi.json: no nucleotides found. Ignoring 1emi_1_B_1-108.
+
+3iy9_1_A_498-1027
+DSSR warning 3iy9.json: no nucleotides found. Ignoring 3iy9_1_A_498-1027.
+
+1jgq_1_A_20-55
+Could not find nucleotides of chain A in annotation 1jgq.json. Either there is a problem with 1jgq mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+1jgq_1_A_7-1518
+Could not find nucleotides of chain A in annotation 1jgq.json. Either there is a problem with 1jgq mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+1jgq_1_A_7-1515
+Could not find nucleotides of chain A in annotation 1jgq.json. Either there is a problem with 1jgq mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+1jgq_1_A_2-1520
+Could not find nucleotides of chain A in annotation 1jgq.json. Either there is a problem with 1jgq mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+4v42_1_AA_20-55
+Could not find nucleotides of chain AA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+4v42_1_AA_7-1518
+Could not find nucleotides of chain AA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+4v42_1_AA_7-1515
+Could not find nucleotides of chain AA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+4v42_1_AA_2-1520
+Could not find nucleotides of chain AA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+1jgo_1_A_20-55
+Could not find nucleotides of chain A in annotation 1jgo.json. Either there is a problem with 1jgo mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+1jgo_1_A_7-1518
+Could not find nucleotides of chain A in annotation 1jgo.json. Either there is a problem with 1jgo mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+1jgo_1_A_7-1515
+Could not find nucleotides of chain A in annotation 1jgo.json. Either there is a problem with 1jgo mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+1jgo_1_A_2-1520
+Could not find nucleotides of chain A in annotation 1jgo.json. Either there is a problem with 1jgo mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+1jgp_1_A_20-55
+Could not find nucleotides of chain A in annotation 1jgp.json. Either there is a problem with 1jgp mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+1jgp_1_A_7-1518
+Could not find nucleotides of chain A in annotation 1jgp.json. Either there is a problem with 1jgp mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+1jgp_1_A_7-1515
+Could not find nucleotides of chain A in annotation 1jgp.json. Either there is a problem with 1jgp mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+1jgp_1_A_2-1520
+Could not find nucleotides of chain A in annotation 1jgp.json. Either there is a problem with 1jgp mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+3ep2_1_B_1-50
+DSSR warning 3ep2.json: no nucleotides found. Ignoring 3ep2_1_B_1-50.
+
+3eq3_1_B_1-50
+DSSR warning 3eq3.json: no nucleotides found. Ignoring 3eq3_1_B_1-50.
+
+3eq4_1_B_1-50
+DSSR warning 3eq4.json: no nucleotides found. Ignoring 3eq4_1_B_1-50.
+
+3pgw_1_R_1-164
+DSSR warning 3pgw.json: no nucleotides found. Ignoring 3pgw_1_R_1-164.
+
+3pgw_1_N_1-164
+DSSR warning 3pgw.json: no nucleotides found. Ignoring 3pgw_1_N_1-164.
+
+3cw1_1_x_1-138
+DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_x_1-138.
+
+3cw1_1_w_1-138
+DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_w_1-138.
+
+3cw1_1_V_1-138
+DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_V_1-138.
+
+3cw1_1_v_1-138
+DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_v_1-138.
+
+2iy3_1_B_9-105
+DSSR warning 2iy3.json: no nucleotides found. Ignoring 2iy3_1_B_9-105.
+
+3jcr_1_N_1-106
+DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_N_1-106.
+
+3jcr_1_N_1-188
+DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_N_1-188.
+
+2vaz_1_A_64-177
+DSSR warning 2vaz.json: no nucleotides found. Ignoring 2vaz_1_A_64-177.
+
+2ftc_1_R_81-1466
+DSSR warning 2ftc.json: no nucleotides found. Ignoring 2ftc_1_R_81-1466.
+
+2ftc_1_R_1-1568
+DSSR warning 2ftc.json: no nucleotides found. Ignoring 2ftc_1_R_1-1568.
+
+2ftc_1_R_792-1568
+DSSR warning 2ftc.json: no nucleotides found. Ignoring 2ftc_1_R_792-1568.
+
+3jcr_1_M_1-141
+DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_M_1-141.
+
+3jcr_1_M_1-107
+DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_M_1-107.
+
+3jcr_1_M_1-188
+DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_M_1-188.
+
+4v5z_1_B0_1-2840
+DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_B0_1-2840.
+
+4v5z_1_B0_1-2899
+DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_B0_1-2899.
+
+4v5z_1_B0_1-2902
+DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_B0_1-2902.
+
+5g2x_1_A_595-692
+Sequence is too short. (< 5 resolved nts)
+
+3iy8_1_A_1-540
+DSSR warning 3iy8.json: no nucleotides found. Ignoring 3iy8_1_A_1-540.
+
+4v5z_1_BY_2-113
+DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_BY_2-113.
+
+4v5z_1_BZ_1-70
+DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_BZ_1-70.
+
+1mvr_1_B_1-96
+DSSR warning 1mvr.json: no nucleotides found. Ignoring 1mvr_1_B_1-96.
+
+4adx_1_0_1-2923
+DSSR warning 4adx.json: no nucleotides found. Ignoring 4adx_1_0_1-2923.
+
+4adx_1_0_132-2915
+DSSR warning 4adx.json: no nucleotides found. Ignoring 4adx_1_0_132-2915.
+
+3eq4_1_Y_1-69
+DSSR warning 3eq4.json: no nucleotides found. Ignoring 3eq4_1_Y_1-69.
+
+4v5z_1_AA_1-1562
+DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_AA_1-1562.
+
+4v5z_1_AA_1-1563
+DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_AA_1-1563.
+
+6lqm_1_8_1267-4755
+Could not find nucleotides of chain 8 in annotation 6lqm.json. Either there is a problem with 6lqm mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6lu8_1_8_1267-4755
+Could not find nucleotides of chain 8 in annotation 6lu8.json. Either there is a problem with 6lu8 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6lsr_1_8_1267-4755
+Could not find nucleotides of chain 8 in annotation 6lsr.json. Either there is a problem with 6lsr mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
+6lss_1_8_1267-4755
+Could not find nucleotides of chain 8 in annotation 6lss.json. Either there is a problem with 6lss mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
+
--- a/statistics.py
View file @97926e0
+++ b/statistics.py
View file @97926e0
@@ -5,7 +5,7 @@
 # in the database.
 # This should be run from the folder where the file is (to access the database with path "results/RNANet.db")
-import os, pickle, sqlite3, sys
+import os, pickle, sqlite3, shlex, subprocess, sys
 import numpy as np
 import pandas as pd
 import threading as th
@@ -16,14 +16,13 @@ import matplotlib.patches as mpatches
 import scipy.cluster.hierarchy as sch
 from scipy.spatial.distance import squareform
 from mpl_toolkits.mplot3d import axes3d
-from Bio.Phylo.TreeConstruction import DistanceCalculator
 from Bio import AlignIO, SeqIO
 from functools import partial
-from multiprocessing import Pool
+from multiprocessing import Pool, Manager
 from os import path
 from tqdm import tqdm
 from collections import Counter
-from RNAnet import read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker
+from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker
 # This sets the paths
 if len(sys.argv) > 1:
@@ -37,7 +36,7 @@ else:
 LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546")   # From Rfam CLAN 00112
 SSU_set = ("RF00177", "RF02542",  "RF02545", "RF01959", "RF01960")  # From Rfam CLAN 00111
-def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
+def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
     """
     Plot the joint distribution of pseudotorsion angles, in a Ramachandran-style graph.
     See Wadley & Pyle (2007)
@@ -68,6 +67,12 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
     if not path.isfile(f"data/wadley_kernel_{angle}.npz"):
+
+        # Get a worker number to position the progress bar
+        global idxQueue
+        thr_idx = idxQueue.get()
+        pbar = tqdm(total=2, desc=f"Worker {thr_idx+1}: eta/theta C{carbon} kernels", position=thr_idx+1, leave=False)
+
         # Extract the angle values of c2'-endo and c3'-endo nucleotides
         with sqlite3.connect("results/RNANet.db") as conn:
             df = pd.read_sql(f"""SELECT {angle}, th{angle} FROM nucleotide WHERE puckering="C2'-endo" AND {angle} IS NOT NULL AND th{angle} IS NOT NULL;""", conn)
@@ -89,13 +94,17 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
         xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j]
         positions = np.vstack([xx.ravel(), yy.ravel()])
         f_c3 = np.reshape(kernel_c3(positions).T, xx.shape)
+        pbar.update(1)
         f_c2 = np.reshape(kernel_c2(positions).T, xx.shape)
+        pbar.update(1)
         # Save the data to an archive for later use without the need to recompute
         np.savez(f"data/wadley_kernel_{angle}.npz",
                   c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas,
                   c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas,
                   kernel_c3=f_c3, kernel_c2=f_c2)
+        pbar.close()
+        idxQueue.put(thr_idx)
     else:
         f = np.load(f"data/wadley_kernel_{angle}.npz")
         c2_endo_etas = f["c2_endo_e"]
@@ -106,7 +115,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
         f_c2 = f["kernel_c2"]
         xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j]
-    notify(f"Kernel computed for {angle}/th{angle} (or loaded from file).")
+    # notify(f"Kernel computed for {angle}/th{angle} (or loaded from file).")
     # exact counts:
     hist_c2, xedges, yedges = np.histogram2d(c2_endo_etas, c2_endo_thetas, bins=int(2*np.pi/0.1), 
@@ -139,7 +148,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
         fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}.png")
         if show:
             fig.show()
-        fig.close()
+        plt.close()
         # Smoothed joint distribution
         fig = plt.figure()
@@ -150,7 +159,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
         fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}.png")
         if show:
             fig.show()
-        fig.close()
+        plt.close()
         # 2D Wadley plot
         fig = plt.figure(figsize=(5,5))
@@ -163,7 +172,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
         fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}.png")
         if show:
             fig.show()
-        fig.close()
+        plt.close()
     # print(f"[{worker_nbr}]\tComputed joint distribution of angles (C{carbon}) and saved the figures.")
 def stats_len():
@@ -171,11 +180,15 @@ def stats_len():
     REQUIRES tables chain, nucleotide up to date.
     """
+    
+    # Get a worker number to position the progress bar
+    global idxQueue
+    thr_idx = idxQueue.get()
     cols = []
     lengths = []
-    conn = sqlite3.connect("results/RNANet.db")
+    
-    for i,f in enumerate(fam_list):
+    for i,f in enumerate(tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Average chain lengths", leave=False)):
         # Define a color for that family in the plot
         if f in LSU_set:
@@ -190,11 +203,11 @@ def stats_len():
             cols.append("grey")
         # Get the lengths of chains
-        l = [ x[0] for x in sql_ask_database(conn, f"SELECT COUNT(index_chain) FROM (SELECT chain_id FROM chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY chain_id;") ]
+        with sqlite3.connect("results/RNANet.db") as conn:
+            l = [ x[0] for x in sql_ask_database(conn, f"SELECT COUNT(index_chain) FROM (SELECT chain_id FROM chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY chain_id;", warn_every=0) ]
         lengths.append(l)
-        notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths")
+        # notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths")
-    conn.close()
     # Plot the figure
     fig = plt.figure(figsize=(10,3))
@@ -223,7 +236,8 @@ def stats_len():
     # Save the figure
     fig.savefig("results/figures/lengths.png")
-    notify("Computed sequence length statistics and saved the figure.")
+    idxQueue.put(thr_idx) # replace the thread index in the queue
+    # notify("Computed sequence length statistics and saved the figure.")
 def format_percentage(tot, x):
         if not tot:
@@ -242,40 +256,57 @@ def stats_freq():
     Outputs results/frequencies.csv
     REQUIRES tables chain, nucleotide up to date."""
+
+    # Get a worker number to position the progress bar
+    global idxQueue
+    thr_idx = idxQueue.get()
+
     # Initialize a Counter object for each family
     freqs = {}
     for f in fam_list:
         freqs[f] = Counter()
     # List all nt_names happening within a RNA family and store the counts in the Counter
-    conn = sqlite3.connect("results/RNANet.db")
+    for i,f in enumerate(tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False)):
-    for i,f in enumerate(fam_list):
+        with sqlite3.connect("results/RNANet.db") as conn:
-        counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;"))
+            counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;", warn_every=0))
         freqs[f].update(counts)
-        notify(f"[{i+1}/{len(fam_list)}] Computed {f} nucleotide frequencies.")
+        # notify(f"[{i+1}/{len(fam_list)}] Computed {f} nucleotide frequencies.")
-    conn.close()
     # Create a pandas DataFrame, and save it to CSV.
     df = pd.DataFrame()
-    for f in fam_list:
+    for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False):
         tot = sum(freqs[f].values())
         df = pd.concat([ df, pd.DataFrame([[ format_percentage(tot, x) for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ])
     df = df.fillna(0)
     df.to_csv("results/frequencies.csv")    
-    notify("Saved nucleotide frequencies to CSV file.")
+    idxQueue.put(thr_idx) # replace the thread index in the queue
+    # notify("Saved nucleotide frequencies to CSV file.")
 def parallel_stats_pairs(f):
     """Counts occurrences of intra-chain base-pair types in one RNA family
     REQUIRES tables chain, nucleotide up-to-date.""" 
+    if path.isfile("data/"+f+"_pairs.csv") and path.isfile("data/"+f+"_counts.csv"):
+        return
+
+    # Get a worker number to position the progress bar
+    global idxQueue
+    thr_idx = idxQueue.get()
+
     chain_id_list = mappings_list[f]
     data = []
-    for cid in chain_id_list:
+    sqldata = []
+    for cid in tqdm(chain_id_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} basepair types", leave=False):
         with sqlite3.connect("results/RNANet.db") as conn:
             # Get comma separated lists of basepairs per nucleotide
-            interactions = pd.read_sql(f"SELECT nt_code as nt1, index_chain, paired, pair_type_LW FROM (SELECT chain_id FROM chain WHERE chain_id='{cid}') NATURAL JOIN nucleotide;", conn)
+            interactions = pd.DataFrame(
-
+                            sql_ask_database(conn, 
+                                            f"SELECT nt_code as nt1, index_chain, paired, pair_type_LW FROM (SELECT chain_id FROM chain WHERE chain_id='{cid}') NATURAL JOIN nucleotide;",
+                                            warn_every=0), 
+                            columns = ["nt1", "index_chain", "paired", "pair_type_LW"]
+                           )
         # expand the comma-separated lists in real lists
         expanded_list = pd.concat([ pd.DataFrame({  'nt1':[ row["nt1"] for x in row["paired"].split(',') ],
                                                     'index_chain':[ row['index_chain'] for x in row["paired"].split(',') ],
@@ -317,27 +348,29 @@ def parallel_stats_pairs(f):
         # Update the database
         vlcnts = expanded_list.pair_type_LW.value_counts()
-        sqldata = ( vlcnts.at["cWW"]/2 if "cWW" in vlcnts.index else 0, 
+        sqldata.append(   ( vlcnts.at["cWW"]/2 if "cWW" in vlcnts.index else 0, 
-                    vlcnts.at["cWH"] if "cWH" in vlcnts.index else 0, 
+                            vlcnts.at["cWH"] if "cWH" in vlcnts.index else 0, 
-                    vlcnts.at["cWS"] if "cWS" in vlcnts.index else 0, 
+                            vlcnts.at["cWS"] if "cWS" in vlcnts.index else 0, 
-                    vlcnts.at["cHH"]/2 if "cHH" in vlcnts.index else 0, 
+                            vlcnts.at["cHH"]/2 if "cHH" in vlcnts.index else 0, 
-                    vlcnts.at["cHS"] if "cHS" in vlcnts.index else 0, 
+                            vlcnts.at["cHS"] if "cHS" in vlcnts.index else 0, 
-                    vlcnts.at["cSS"]/2 if "cSS" in vlcnts.index else 0, 
+                            vlcnts.at["cSS"]/2 if "cSS" in vlcnts.index else 0, 
-                    vlcnts.at["tWW"]/2 if "tWW" in vlcnts.index else 0, 
+                            vlcnts.at["tWW"]/2 if "tWW" in vlcnts.index else 0, 
-                    vlcnts.at["tWH"] if "tWH" in vlcnts.index else 0, 
+                            vlcnts.at["tWH"] if "tWH" in vlcnts.index else 0, 
-                    vlcnts.at["tWS"] if "tWS" in vlcnts.index else 0, 
+                            vlcnts.at["tWS"] if "tWS" in vlcnts.index else 0, 
-                    vlcnts.at["tHH"]/2 if "tHH" in vlcnts.index else 0, 
+                            vlcnts.at["tHH"]/2 if "tHH" in vlcnts.index else 0, 
-                    vlcnts.at["tHS"] if "tHS" in vlcnts.index else 0, 
+                            vlcnts.at["tHS"] if "tHS" in vlcnts.index else 0, 
-                    vlcnts.at["tSS"]/2 if "tSS" in vlcnts.index else 0, 
+                            vlcnts.at["tSS"]/2 if "tSS" in vlcnts.index else 0, 
-                    int(sum(vlcnts.loc[[ str(x) for x in vlcnts.index if "." in str(x)]])/2), 
+                            int(sum(vlcnts.loc[[ str(x) for x in vlcnts.index if "." in str(x)]])/2), 
-                    cid)
+                            cid) )
-        with sqlite3.connect("results/RNANet.db") as conn:
-            sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?,
-                                    pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?, 
-                                    pair_count_tHH = ?, pair_count_tHS = ?, pair_count_tSS = ?, pair_count_other = ? WHERE chain_id = ?;""", data=sqldata)
         data.append(expanded_list)
+    # Update the database
+    with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn:
+        conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
+        sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?,
+                                pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?, 
+                                pair_count_tHH = ?, pair_count_tHS = ?, pair_count_tSS = ?, pair_count_other = ? WHERE chain_id = ?;""", many=True, data=sqldata, warn_every=0)
     # merge all the dataframes from all chains of the family
     expanded_list = pd.concat(data)
@@ -351,7 +384,106 @@ def parallel_stats_pairs(f):
     # Create an output DataFrame
     f_df = pd.DataFrame([[ x for x in cnt.values() ]], columns=list(cnt), index=[f])
-    return expanded_list, f_df
+    f_df.to_csv(f"data/{f}_counts.csv")
+    expanded_list.to_csv(f"data/{f}_pairs.csv")
+    
+    idxQueue.put(thr_idx) # replace the thread index in the queue
+
+def to_dist_matrix(f):
+    if path.isfile("data/"+f+".npy"):
+        # notify(f"Computed {f} distance matrix", "loaded from file")
+        return 0
+  
+    # Get a worker number to position the progress bar
+    global idxQueue
+    thr_idx = idxQueue.get()
+
+    # notify(f"Computing {f} distance matrix from alignment...")
+    command = f"esl-alipid --rna --noheader --informat stockholm {f}_3d_only.stk"
+
+    # Prepare a file
+    with open(path_to_seq_data+f"/realigned/{f}++.afa") as al_file:
+        al = AlignIO.read(al_file, "fasta")
+        names = [ x.id for x in al if '[' in x.id ]
+        al = al[-len(names):]
+    with open(f + "_3d_only.stk", "w") as only_3d:
+        only_3d.write(al.format("stockholm"))
+    del al
+
+    # Prepare the job
+    process = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE)
+    id_matrix = np.zeros((len(names), len(names)))
+
+    pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", leave=False)
+    while process.poll() is None:
+        output = process.stdout.readline()
+        if output:
+            lines = output.strip().split(b'\n')
+            for l in lines:
+                line = l.split()
+                s1 = line[0].decode('utf-8')
+                s2 = line[1].decode('utf-8')
+                score = line[2].decode('utf-8')
+                id1 = names.index(s1)
+                id2 = names.index(s2)
+                id_matrix[id1, id2] = float(score)
+                pbar.update(1)
+    pbar.close()
+
+    subprocess.run(["rm", "-f", f + "_3d_only.stk"])
+    np.save("data/"+f+".npy", id_matrix)
+    idxQueue.put(thr_idx) # replace the thread index in the queue
+    return 0
+
+def seq_idty():
+    """Computes identity matrices for each of the RNA families.
+    
+    REQUIRES temporary results files in data/*.npy
+    REQUIRES tables chain, family un to date."""
+
+    # load distance matrices
+    fam_arrays = []
+    for f in famlist:
+        if path.isfile("data/"+f+".npy"):
+            fam_arrays.append(np.load("data/"+f+".npy"))
+        else:
+            fam_arrays.append([])
+
+    # Update database with identity percentages
+    conn = sqlite3.connect("results/RNANet.db")
+    for f, D in zip(famlist, fam_arrays):
+        if not len(D): continue
+        a = 1.0 - np.average(D + D.T) # Get symmetric matrix instead of lower triangle + convert from distance matrix to identity matrix
+        conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';")
+    conn.commit()
+    conn.close()
+
+    # Plots plots plots
+    fig, axs = plt.subplots(4,17, figsize=(17,5.75))
+    axs = axs.ravel()
+    [axi.set_axis_off() for axi in axs]
+    im = "" # Just to declare the variable, it will be set in the loop
+    for f, D, ax in zip(famlist, fam_arrays, axs):
+        if not len(D): continue
+        if D.shape[0] > 2:  # Cluster only if there is more than 2 sequences to organize
+            D = D + D.T     # Copy the lower triangle to upper, to get a symetrical matrix
+            condensedD = squareform(D)
+
+            # Compute basic dendrogram by Ward's method
+            Y = sch.linkage(condensedD, method='ward')
+            Z = sch.dendrogram(Y, orientation='left', no_plot=True)
+
+            # Reorganize rows and cols
+            idx1 = Z['leaves']
+            D = D[idx1,:]
+            D = D[:,idx1[::-1]]
+        im = ax.matshow(1.0 - D, vmin=0, vmax=1, origin='lower') # convert to identity matrix 1 - D from distance matrix D
+        ax.set_title(f + "\n(" + str(len(mappings_list[f]))+ " chains)", fontsize=10)
+    fig.tight_layout()
+    fig.subplots_adjust(wspace=0.1, hspace=0.3)
+    fig.colorbar(im, ax=axs[-1], shrink=0.8)
+    fig.savefig(f"results/figures/distances.png")
+    notify("Computed all identity matrices and saved the figure.")
 def stats_pairs():
     """Counts occurrences of intra-chain base-pair types in RNA families
@@ -363,26 +495,15 @@ def stats_pairs():
         return family_data.apply(partial(format_percentage, sum(family_data)))
     if not path.isfile("data/pair_counts.csv"):
-        p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=read_cpu_number(), maxtasksperchild=5)
+        results = []
-        try:
+        allpairs = []
-            fam_pbar = tqdm(total=len(fam_list), desc="Pair-types in families", position=0, leave=True) 
+        for f in fam_list:
-            results = []
+            newpairs = pd.read_csv(f"data/{f}_pairs.csv", index_col=0)
-            allpairs = []
+            fam_df = pd.read_csv(f"data/{f}_counts.csv", index_col=0)
-            for _, newp_famdf in enumerate(p.imap_unordered(parallel_stats_pairs, fam_list)):
+            results.append(fam_df)
-                newpairs, fam_df = newp_famdf
+            allpairs.append(newpairs)
-                fam_pbar.update(1)
+            subprocess.run(["rm", "-f", f"data/{f}_pairs.csv"])
-                results.append(fam_df)
+            subprocess.run(["rm", "-f", f"data/{f}_counts.csv"])
-                allpairs.append(newpairs)
-            fam_pbar.close()
-            p.close()
-            p.join()
-        except KeyboardInterrupt:
-            warn("KeyboardInterrupt, terminating workers.", error=True)
-            fam_pbar.close()
-            p.terminate()
-            p.join()
-            exit(1)
-
         all_pairs = pd.concat(allpairs)
         df = pd.concat(results).fillna(0)
         df.to_csv("data/pair_counts.csv")
@@ -431,92 +552,12 @@ def stats_pairs():
     notify("Computed nucleotide statistics and saved CSV and PNG file.")
-def to_dist_matrix(f):
-    if path.isfile("data/"+f+".npy"):
-        notify(f"Computed {f} distance matrix", "loaded from file")
-        return 0
-
-    notify(f"Computing {f} distance matrix from alignment...")
-    dm = DistanceCalculator('identity')
-    with open(path_to_seq_data+"/realigned/"+f+"++.afa") as al_file:
-        al = AlignIO.read(al_file, "fasta")[-len(mappings_list[f]):]
-    idty = dm.get_distance(al).matrix # list of lists
-    del al
-    l = len(idty)
-    np.save("data/"+f+".npy", np.array([ idty[i] + [0]*(l-1-i) if i<l-1 else idty[i]  for i in range(l) ], dtype=object))
-    del idty
-    notify(f"Computed {f} distance matrix")
-    return 0
-
-def seq_idty():
-    """Computes identity matrices for each of the RNA families.
-    
-    Creates temporary results files in data/*.npy
-    REQUIRES tables chain, family un to date."""
-
-    # List the families for which we will compute sequence identity matrices
-    conn = sqlite3.connect("results/RNANet.db")
-    famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 1 ORDER BY rfam_acc ASC;") ]
-    ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;") ]
-    if len(ignored):
-        print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')
-
-    # compute distance matrices (or ignore if data/RF0****.npy exists)
-    p = Pool(processes=8)
-    p.map(to_dist_matrix, famlist)
-    p.close()
-    p.join()
-
-    # load them
-    fam_arrays = []
-    for f in famlist:
-        if path.isfile("data/"+f+".npy"):
-            fam_arrays.append(np.load("data/"+f+".npy"))
-        else:
-            fam_arrays.append([])
-
-    # Update database with identity percentages
-    conn = sqlite3.connect("results/RNANet.db")
-    for f, D in zip(famlist, fam_arrays):
-        if not len(D): continue
-        a = 1.0 - np.average(D + D.T) # Get symmetric matrix instead of lower triangle + convert from distance matrix to identity matrix
-        conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';")
-    conn.commit()
-    conn.close()
-
-    # Plots plots plots
-    fig, axs = plt.subplots(4,17, figsize=(17,5.75))
-    axs = axs.ravel()
-    [axi.set_axis_off() for axi in axs]
-    im = "" # Just to declare the variable, it will be set in the loop
-    for f, D, ax in zip(famlist, fam_arrays, axs):
-        if not len(D): continue
-        if D.shape[0] > 2:  # Cluster only if there is more than 2 sequences to organize
-            D = D + D.T     # Copy the lower triangle to upper, to get a symetrical matrix
-            condensedD = squareform(D)
-
-            # Compute basic dendrogram by Ward's method
-            Y = sch.linkage(condensedD, method='ward')
-            Z = sch.dendrogram(Y, orientation='left', no_plot=True)
-
-            # Reorganize rows and cols
-            idx1 = Z['leaves']
-            D = D[idx1,:]
-            D = D[:,idx1[::-1]]
-        im = ax.matshow(1.0 - D, vmin=0, vmax=1, origin='lower') # convert to identity matrix 1 - D from distance matrix D
-        ax.set_title(f + "\n(" + str(len(mappings_list[f]))+ " chains)", fontsize=10)
-    fig.tight_layout()
-    fig.subplots_adjust(wspace=0.1, hspace=0.3)
-    fig.colorbar(im, ax=axs[-1], shrink=0.8)
-    fig.savefig(f"results/figures/distances.png")
-    notify("Computed all identity matrices and saved the figure.")
-
 def per_chain_stats():
     """Computes per-chain frequencies and base-pair type counts.
     REQUIRES tables chain, nucleotide up to date. """
-    with sqlite3.connect("results/RNANet.db") as conn:
+    with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn:
         # Compute per-chain nucleotide frequencies
         df = pd.read_sql("SELECT SUM(is_A) as A, SUM(is_C) AS C, SUM(is_G) AS G, SUM(is_U) AS U, SUM(is_other) AS O, chain_id FROM nucleotide GROUP BY chain_id;", conn)
         df["total"] = pd.Series(df.A + df.C + df.G + df.U + df.O, dtype=np.float64)
@@ -524,39 +565,74 @@ def per_chain_stats():
         df = df.drop("total", axis=1)
         # Set the values
+        conn.execute('pragma journal_mode=wal')
         sql_execute(conn, "UPDATE chain SET chain_freq_A = ?, chain_freq_C = ?, chain_freq_G = ?, chain_freq_U = ?, chain_freq_other = ? WHERE chain_id= ?;",
                           many=True, data=list(df.to_records(index=False)), warn_every=10)
     notify("Updated the database with per-chain base frequencies")
+def log_to_pbar(pbar):
+    def update(r):
+        pbar.update(1)
+    return update
+
 if __name__ == "__main__":
     os.makedirs("results/figures/wadley_plots/", exist_ok=True)
     print("Loading mappings list...")
-    conn = sqlite3.connect("results/RNANet.db")
+    with sqlite3.connect("results/RNANet.db") as conn:
-    fam_list = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;") ]
+        fam_list = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;") ]
-    mappings_list = {}
+        mappings_list = {}
-    for k in fam_list:
+        for k in fam_list:
-        mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain WHERE rfam_acc='{k}';") ]
+            mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain WHERE rfam_acc='{k}' and issue=0;") ]
-    conn.close()
-    
-    # stats_pairs()
-
-    # Define threads for the tasks
-    threads = [
-        th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 1}),
-        th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 4}),
-        th.Thread(target=stats_len),            # computes figures
-        th.Thread(target=stats_freq),           # Updates the database
-        th.Thread(target=seq_idty),             # produces .npy files and seq idty figures
-        th.Thread(target=per_chain_stats)       # Updates the database
-    ]
-    
-    # Start the threads
-    for t in threads:
-        t.start()
-
-    # Wait for the threads to complete
-    for t in threads:
-        t.join()
+    # List the families for which we will compute sequence identity matrices
+    with sqlite3.connect("results/RNANet.db") as conn:
+        famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 0 ORDER BY rfam_acc ASC;") ]
+        ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;") ]
+    if len(ignored):
+        print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')
+    
+    # Prepare the multiprocessing execution environment
+    nworkers = max(read_cpu_number()-1, 32)
+    thr_idx_mgr = Manager()
+    idxQueue = thr_idx_mgr.Queue()
+    for i in range(nworkers):
+        idxQueue.put(i)
+
+    # Define the tasks
+    joblist = []
+    joblist.append(Job(function=reproduce_wadley_results, args=(1,)))
+    joblist.append(Job(function=reproduce_wadley_results, args=(4,)))
+    joblist.append(Job(function=stats_len)) # Computes figures
+    joblist.append(Job(function=stats_freq)) # updates the database
+    for f in famlist:
+        joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database
+        if f not in ignored:
+            joblist.append(Job(function=to_dist_matrix, args=(f,))) # updates the database
+
+    p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
+    pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, leave=True)
+sqlite3 
+    try:
+        for j in joblist:
+            p.apply_async(j.func_, args=j.args_, callback=log_to_pbar(pbar))
+        p.close()
+        p.join()
+        pbar.close()
+    except KeyboardInterrupt:
+        warn("KeyboardInterrupt, terminating workers.", error=True)
+        p.terminate()
+        p.join()
+        pbar.close()
+        exit(1)
+    except:
+        print("Something went wrong")
+
+    print()
+    print()
+
+    # finish the work after the parallel portions
+    per_chain_stats()
+    seq_idty()
+    stats_pairs()