Showing
3 changed files
with
219 additions
and
148 deletions
... | @@ -1638,6 +1638,7 @@ def sql_ask_database(conn, sql, warn_every = 10): | ... | @@ -1638,6 +1638,7 @@ def sql_ask_database(conn, sql, warn_every = 10): |
1638 | 1638 | ||
1639 | @trace_unhandled_exceptions | 1639 | @trace_unhandled_exceptions |
1640 | def sql_execute(conn, sql, many=False, data=None, warn_every=10): | 1640 | def sql_execute(conn, sql, many=False, data=None, warn_every=10): |
1641 | + conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query | ||
1641 | for _ in range(100): # retry 100 times if it fails | 1642 | for _ in range(100): # retry 100 times if it fails |
1642 | try: | 1643 | try: |
1643 | if many: | 1644 | if many: | ... | ... |
... | @@ -3,7 +3,7 @@ | ... | @@ -3,7 +3,7 @@ |
3 | # Run RNANet | 3 | # Run RNANet |
4 | cd /home/lbecquey/Projects/RNANet; | 4 | cd /home/lbecquey/Projects/RNANet; |
5 | rm -f stdout.txt stderr.txt errors.txt; | 5 | rm -f stdout.txt stderr.txt errors.txt; |
6 | -time './RNAnet.py --3d-folder /home/lbequey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -s -r 20.0' > stdout.txt 2> stderr.txt; | 6 | +time './RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -s -r 20.0 --archive' > stdout.txt 2> stderr.txt;
7 | 7 | ||
8 | # Sync in Seafile | 8 | # Sync in Seafile |
9 | seaf-cli start; | 9 | seaf-cli start; | ... | ... |
... | @@ -5,7 +5,7 @@ | ... | @@ -5,7 +5,7 @@ |
5 | # in the database. | 5 | # in the database. |
6 | # This should be run from the folder where the file is (to access the database with path "results/RNANet.db") | 6 | # This should be run from the folder where the file is (to access the database with path "results/RNANet.db") |
7 | 7 | ||
8 | -import os, pickle, sqlite3, sys | 8 | +import os, pickle, sqlite3, shlex, subprocess, sys |
9 | import numpy as np | 9 | import numpy as np |
10 | import pandas as pd | 10 | import pandas as pd |
11 | import threading as th | 11 | import threading as th |
... | @@ -16,14 +16,13 @@ import matplotlib.patches as mpatches | ... | @@ -16,14 +16,13 @@ import matplotlib.patches as mpatches |
16 | import scipy.cluster.hierarchy as sch | 16 | import scipy.cluster.hierarchy as sch |
17 | from scipy.spatial.distance import squareform | 17 | from scipy.spatial.distance import squareform |
18 | from mpl_toolkits.mplot3d import axes3d | 18 | from mpl_toolkits.mplot3d import axes3d |
19 | -from Bio.Phylo.TreeConstruction import DistanceCalculator | ||
20 | from Bio import AlignIO, SeqIO | 19 | from Bio import AlignIO, SeqIO |
21 | from functools import partial | 20 | from functools import partial |
22 | -from multiprocessing import Pool | 21 | +from multiprocessing import Pool, Manager |
23 | from os import path | 22 | from os import path |
24 | from tqdm import tqdm | 23 | from tqdm import tqdm |
25 | from collections import Counter | 24 | from collections import Counter |
26 | -from RNAnet import read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker | 25 | +from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker |
27 | 26 | ||
28 | # This sets the paths | 27 | # This sets the paths |
29 | if len(sys.argv) > 1: | 28 | if len(sys.argv) > 1: |
... | @@ -37,7 +36,7 @@ else: | ... | @@ -37,7 +36,7 @@ else: |
37 | LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112 | 36 | LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112 |
38 | SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111 | 37 | SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111 |
39 | 38 | ||
40 | -def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): | 39 | +def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): |
41 | """ | 40 | """ |
42 | Plot the joint distribution of pseudotorsion angles, in a Ramachandran-style graph. | 41 | Plot the joint distribution of pseudotorsion angles, in a Ramachandran-style graph. |
43 | See Wadley & Pyle (2007) | 42 | See Wadley & Pyle (2007) |
... | @@ -68,6 +67,12 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): | ... | @@ -68,6 +67,12 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): |
68 | 67 | ||
69 | 68 | ||
70 | if not path.isfile(f"data/wadley_kernel_{angle}.npz"): | 69 | if not path.isfile(f"data/wadley_kernel_{angle}.npz"): |
70 | + | ||
71 | + # Get a worker number to position the progress bar | ||
72 | + global idxQueue | ||
73 | + thr_idx = idxQueue.get() | ||
74 | + pbar = tqdm(total=2, desc=f"Worker {thr_idx+1}: eta/theta C{carbon} kernels", position=thr_idx+1, leave=False) | ||
75 | + | ||
71 | # Extract the angle values of c2'-endo and c3'-endo nucleotides | 76 | # Extract the angle values of c2'-endo and c3'-endo nucleotides |
72 | with sqlite3.connect("results/RNANet.db") as conn: | 77 | with sqlite3.connect("results/RNANet.db") as conn: |
73 | df = pd.read_sql(f"""SELECT {angle}, th{angle} FROM nucleotide WHERE puckering="C2'-endo" AND {angle} IS NOT NULL AND th{angle} IS NOT NULL;""", conn) | 78 | df = pd.read_sql(f"""SELECT {angle}, th{angle} FROM nucleotide WHERE puckering="C2'-endo" AND {angle} IS NOT NULL AND th{angle} IS NOT NULL;""", conn) |
... | @@ -89,13 +94,17 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): | ... | @@ -89,13 +94,17 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): |
89 | xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j] | 94 | xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j] |
90 | positions = np.vstack([xx.ravel(), yy.ravel()]) | 95 | positions = np.vstack([xx.ravel(), yy.ravel()]) |
91 | f_c3 = np.reshape(kernel_c3(positions).T, xx.shape) | 96 | f_c3 = np.reshape(kernel_c3(positions).T, xx.shape) |
97 | + pbar.update(1) | ||
92 | f_c2 = np.reshape(kernel_c2(positions).T, xx.shape) | 98 | f_c2 = np.reshape(kernel_c2(positions).T, xx.shape) |
99 | + pbar.update(1) | ||
93 | 100 | ||
94 | # Save the data to an archive for later use without the need to recompute | 101 | # Save the data to an archive for later use without the need to recompute |
95 | np.savez(f"data/wadley_kernel_{angle}.npz", | 102 | np.savez(f"data/wadley_kernel_{angle}.npz", |
96 | c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas, | 103 | c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas, |
97 | c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas, | 104 | c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas, |
98 | kernel_c3=f_c3, kernel_c2=f_c2) | 105 | kernel_c3=f_c3, kernel_c2=f_c2) |
106 | + pbar.close() | ||
107 | + idxQueue.put(thr_idx) | ||
99 | else: | 108 | else: |
100 | f = np.load(f"data/wadley_kernel_{angle}.npz") | 109 | f = np.load(f"data/wadley_kernel_{angle}.npz") |
101 | c2_endo_etas = f["c2_endo_e"] | 110 | c2_endo_etas = f["c2_endo_e"] |
... | @@ -106,7 +115,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): | ... | @@ -106,7 +115,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): |
106 | f_c2 = f["kernel_c2"] | 115 | f_c2 = f["kernel_c2"] |
107 | xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j] | 116 | xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j] |
108 | 117 | ||
109 | - notify(f"Kernel computed for {angle}/th{angle} (or loaded from file).") | 118 | + # notify(f"Kernel computed for {angle}/th{angle} (or loaded from file).") |
110 | 119 | ||
111 | # exact counts: | 120 | # exact counts: |
112 | hist_c2, xedges, yedges = np.histogram2d(c2_endo_etas, c2_endo_thetas, bins=int(2*np.pi/0.1), | 121 | hist_c2, xedges, yedges = np.histogram2d(c2_endo_etas, c2_endo_thetas, bins=int(2*np.pi/0.1), |
... | @@ -139,7 +148,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): | ... | @@ -139,7 +148,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): |
139 | fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}.png") | 148 | fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}.png") |
140 | if show: | 149 | if show: |
141 | fig.show() | 150 | fig.show() |
142 | - fig.close() | 151 | + plt.close() |
143 | 152 | ||
144 | # Smoothed joint distribution | 153 | # Smoothed joint distribution |
145 | fig = plt.figure() | 154 | fig = plt.figure() |
... | @@ -150,7 +159,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): | ... | @@ -150,7 +159,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): |
150 | fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}.png") | 159 | fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}.png") |
151 | if show: | 160 | if show: |
152 | fig.show() | 161 | fig.show() |
153 | - fig.close() | 162 | + plt.close() |
154 | 163 | ||
155 | # 2D Wadley plot | 164 | # 2D Wadley plot |
156 | fig = plt.figure(figsize=(5,5)) | 165 | fig = plt.figure(figsize=(5,5)) |
... | @@ -163,7 +172,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): | ... | @@ -163,7 +172,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): |
163 | fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}.png") | 172 | fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}.png") |
164 | if show: | 173 | if show: |
165 | fig.show() | 174 | fig.show() |
166 | - fig.close() | 175 | + plt.close() |
167 | # print(f"[{worker_nbr}]\tComputed joint distribution of angles (C{carbon}) and saved the figures.") | 176 | # print(f"[{worker_nbr}]\tComputed joint distribution of angles (C{carbon}) and saved the figures.") |
168 | 177 | ||
169 | def stats_len(): | 178 | def stats_len(): |
... | @@ -172,10 +181,14 @@ def stats_len(): | ... | @@ -172,10 +181,14 @@ def stats_len(): |
172 | REQUIRES tables chain, nucleotide up to date. | 181 | REQUIRES tables chain, nucleotide up to date. |
173 | """ | 182 | """ |
174 | 183 | ||
184 | + # Get a worker number to position the progress bar | ||
185 | + global idxQueue | ||
186 | + thr_idx = idxQueue.get() | ||
187 | + | ||
175 | cols = [] | 188 | cols = [] |
176 | lengths = [] | 189 | lengths = [] |
177 | - conn = sqlite3.connect("results/RNANet.db") | 190 | + |
178 | - for i,f in enumerate(fam_list): | 191 | + for i,f in enumerate(tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Average chain lengths", leave=False)): |
179 | 192 | ||
180 | # Define a color for that family in the plot | 193 | # Define a color for that family in the plot |
181 | if f in LSU_set: | 194 | if f in LSU_set: |
... | @@ -190,11 +203,11 @@ def stats_len(): | ... | @@ -190,11 +203,11 @@ def stats_len(): |
190 | cols.append("grey") | 203 | cols.append("grey") |
191 | 204 | ||
192 | # Get the lengths of chains | 205 | # Get the lengths of chains |
193 | - l = [ x[0] for x in sql_ask_database(conn, f"SELECT COUNT(index_chain) FROM (SELECT chain_id FROM chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY chain_id;") ] | 206 | + with sqlite3.connect("results/RNANet.db") as conn: |
207 | + l = [ x[0] for x in sql_ask_database(conn, f"SELECT COUNT(index_chain) FROM (SELECT chain_id FROM chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY chain_id;", warn_every=0) ] | ||
194 | lengths.append(l) | 208 | lengths.append(l) |
195 | 209 | ||
196 | - notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths") | 210 | + # notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths") |
197 | - conn.close() | ||
198 | 211 | ||
199 | # Plot the figure | 212 | # Plot the figure |
200 | fig = plt.figure(figsize=(10,3)) | 213 | fig = plt.figure(figsize=(10,3)) |
... | @@ -223,7 +236,8 @@ def stats_len(): | ... | @@ -223,7 +236,8 @@ def stats_len(): |
223 | 236 | ||
224 | # Save the figure | 237 | # Save the figure |
225 | fig.savefig("results/figures/lengths.png") | 238 | fig.savefig("results/figures/lengths.png") |
226 | - notify("Computed sequence length statistics and saved the figure.") | 239 | + idxQueue.put(thr_idx) # replace the thread index in the queue |
240 | + # notify("Computed sequence length statistics and saved the figure.") | ||
227 | 241 | ||
228 | def format_percentage(tot, x): | 242 | def format_percentage(tot, x): |
229 | if not tot: | 243 | if not tot: |
... | @@ -242,40 +256,54 @@ def stats_freq(): | ... | @@ -242,40 +256,54 @@ def stats_freq(): |
242 | 256 | ||
243 | Outputs results/frequencies.csv | 257 | Outputs results/frequencies.csv |
244 | REQUIRES tables chain, nucleotide up to date.""" | 258 | REQUIRES tables chain, nucleotide up to date.""" |
259 | + | ||
260 | + # Get a worker number to position the progress bar | ||
261 | + global idxQueue | ||
262 | + thr_idx = idxQueue.get() | ||
263 | + | ||
245 | # Initialize a Counter object for each family | 264 | # Initialize a Counter object for each family |
246 | freqs = {} | 265 | freqs = {} |
247 | for f in fam_list: | 266 | for f in fam_list: |
248 | freqs[f] = Counter() | 267 | freqs[f] = Counter() |
249 | 268 | ||
250 | # List all nt_names happening within a RNA family and store the counts in the Counter | 269 | # List all nt_names happening within a RNA family and store the counts in the Counter |
251 | - conn = sqlite3.connect("results/RNANet.db") | 270 | + for i,f in enumerate(tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False)): |
252 | - for i,f in enumerate(fam_list): | 271 | + with sqlite3.connect("results/RNANet.db") as conn: |
253 | - counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;")) | 272 | + counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;", warn_every=0)) |
254 | freqs[f].update(counts) | 273 | freqs[f].update(counts) |
255 | - notify(f"[{i+1}/{len(fam_list)}] Computed {f} nucleotide frequencies.") | 274 | + # notify(f"[{i+1}/{len(fam_list)}] Computed {f} nucleotide frequencies.") |
256 | - conn.close() | ||
257 | 275 | ||
258 | # Create a pandas DataFrame, and save it to CSV. | 276 | # Create a pandas DataFrame, and save it to CSV. |
259 | df = pd.DataFrame() | 277 | df = pd.DataFrame() |
260 | - for f in fam_list: | 278 | + for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False): |
261 | tot = sum(freqs[f].values()) | 279 | tot = sum(freqs[f].values()) |
262 | df = pd.concat([ df, pd.DataFrame([[ format_percentage(tot, x) for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ]) | 280 | df = pd.concat([ df, pd.DataFrame([[ format_percentage(tot, x) for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ]) |
263 | df = df.fillna(0) | 281 | df = df.fillna(0) |
264 | df.to_csv("results/frequencies.csv") | 282 | df.to_csv("results/frequencies.csv") |
265 | - notify("Saved nucleotide frequencies to CSV file.") | 283 | + idxQueue.put(thr_idx) # replace the thread index in the queue |
284 | + # notify("Saved nucleotide frequencies to CSV file.") | ||
266 | 285 | ||
267 | def parallel_stats_pairs(f): | 286 | def parallel_stats_pairs(f): |
268 | """Counts occurrences of intra-chain base-pair types in one RNA family | 287 | """Counts occurrences of intra-chain base-pair types in one RNA family |
269 | 288 | ||
270 | REQUIRES tables chain, nucleotide up-to-date.""" | 289 | REQUIRES tables chain, nucleotide up-to-date.""" |
271 | 290 | ||
291 | + # Get a worker number to position the progress bar | ||
292 | + global idxQueue | ||
293 | + thr_idx = idxQueue.get() | ||
294 | + | ||
272 | chain_id_list = mappings_list[f] | 295 | chain_id_list = mappings_list[f] |
273 | data = [] | 296 | data = [] |
274 | - for cid in chain_id_list: | 297 | + sqldata = [] |
298 | + for cid in tqdm(chain_id_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} basepair types", leave=False): | ||
275 | with sqlite3.connect("results/RNANet.db") as conn: | 299 | with sqlite3.connect("results/RNANet.db") as conn: |
276 | # Get comma separated lists of basepairs per nucleotide | 300 | # Get comma separated lists of basepairs per nucleotide |
277 | - interactions = pd.read_sql(f"SELECT nt_code as nt1, index_chain, paired, pair_type_LW FROM (SELECT chain_id FROM chain WHERE chain_id='{cid}') NATURAL JOIN nucleotide;", conn) | 301 | + interactions = pd.DataFrame( |
278 | - | 302 | + sql_ask_database(conn, |
303 | + f"SELECT nt_code as nt1, index_chain, paired, pair_type_LW FROM (SELECT chain_id FROM chain WHERE chain_id='{cid}') NATURAL JOIN nucleotide;", | ||
304 | + warn_every=0), | ||
305 | + columns = ["nt1", "index_chain", "paired", "pair_type_LW"] | ||
306 | + ) | ||
279 | # expand the comma-separated lists in real lists | 307 | # expand the comma-separated lists in real lists |
280 | expanded_list = pd.concat([ pd.DataFrame({ 'nt1':[ row["nt1"] for x in row["paired"].split(',') ], | 308 | expanded_list = pd.concat([ pd.DataFrame({ 'nt1':[ row["nt1"] for x in row["paired"].split(',') ], |
281 | 'index_chain':[ row['index_chain'] for x in row["paired"].split(',') ], | 309 | 'index_chain':[ row['index_chain'] for x in row["paired"].split(',') ], |
... | @@ -317,7 +345,7 @@ def parallel_stats_pairs(f): | ... | @@ -317,7 +345,7 @@ def parallel_stats_pairs(f): |
317 | 345 | ||
318 | # Update the database | 346 | # Update the database |
319 | vlcnts = expanded_list.pair_type_LW.value_counts() | 347 | vlcnts = expanded_list.pair_type_LW.value_counts() |
320 | - sqldata = ( vlcnts.at["cWW"]/2 if "cWW" in vlcnts.index else 0, | 348 | + sqldata.append( ( vlcnts.at["cWW"]/2 if "cWW" in vlcnts.index else 0, |
321 | vlcnts.at["cWH"] if "cWH" in vlcnts.index else 0, | 349 | vlcnts.at["cWH"] if "cWH" in vlcnts.index else 0, |
322 | vlcnts.at["cWS"] if "cWS" in vlcnts.index else 0, | 350 | vlcnts.at["cWS"] if "cWS" in vlcnts.index else 0, |
323 | vlcnts.at["cHH"]/2 if "cHH" in vlcnts.index else 0, | 351 | vlcnts.at["cHH"]/2 if "cHH" in vlcnts.index else 0, |
... | @@ -330,14 +358,16 @@ def parallel_stats_pairs(f): | ... | @@ -330,14 +358,16 @@ def parallel_stats_pairs(f): |
330 | vlcnts.at["tHS"] if "tHS" in vlcnts.index else 0, | 358 | vlcnts.at["tHS"] if "tHS" in vlcnts.index else 0, |
331 | vlcnts.at["tSS"]/2 if "tSS" in vlcnts.index else 0, | 359 | vlcnts.at["tSS"]/2 if "tSS" in vlcnts.index else 0, |
332 | int(sum(vlcnts.loc[[ str(x) for x in vlcnts.index if "." in str(x)]])/2), | 360 | int(sum(vlcnts.loc[[ str(x) for x in vlcnts.index if "." in str(x)]])/2), |
333 | - cid) | 361 | + cid) ) |
334 | - with sqlite3.connect("results/RNANet.db") as conn: | ||
335 | - sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?, | ||
336 | - pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?, | ||
337 | - pair_count_tHH = ?, pair_count_tHS = ?, pair_count_tSS = ?, pair_count_other = ? WHERE chain_id = ?;""", data=sqldata) | ||
338 | 362 | ||
339 | data.append(expanded_list) | 363 | data.append(expanded_list) |
340 | 364 | ||
365 | + # Update the database | ||
366 | + with sqlite3.connect("results/RNANet.db") as conn: | ||
367 | + conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query | ||
368 | + sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?, | ||
369 | + pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?, | ||
370 | + pair_count_tHH = ?, pair_count_tHS = ?, pair_count_tSS = ?, pair_count_other = ? WHERE chain_id = ?;""", many=True, data=sqldata, warn_every=0) | ||
341 | 371 | ||
342 | # merge all the dataframes from all chains of the family | 372 | # merge all the dataframes from all chains of the family |
343 | expanded_list = pd.concat(data) | 373 | expanded_list = pd.concat(data) |
... | @@ -351,7 +381,106 @@ def parallel_stats_pairs(f): | ... | @@ -351,7 +381,106 @@ def parallel_stats_pairs(f): |
351 | 381 | ||
352 | # Create an output DataFrame | 382 | # Create an output DataFrame |
353 | f_df = pd.DataFrame([[ x for x in cnt.values() ]], columns=list(cnt), index=[f]) | 383 | f_df = pd.DataFrame([[ x for x in cnt.values() ]], columns=list(cnt), index=[f]) |
354 | - return expanded_list, f_df | 384 | + f_df.to_csv(f"data/{f}_counts.csv") |
385 | + expanded_list.to_csv(f"data/{f}_pairs.csv") | ||
386 | + | ||
387 | + idxQueue.put(thr_idx) # replace the thread index in the queue | ||
388 | + | ||
389 | +def to_dist_matrix(f): | ||
390 | + if path.isfile("data/"+f+".npy"): | ||
391 | + # notify(f"Computed {f} distance matrix", "loaded from file") | ||
392 | + return 0 | ||
393 | + | ||
394 | + # Get a worker number to position the progress bar | ||
395 | + global idxQueue | ||
396 | + thr_idx = idxQueue.get() | ||
397 | + | ||
398 | + # notify(f"Computing {f} distance matrix from alignment...") | ||
399 | + command = f"esl-alipid --rna --noheader --informat stockholm {f}_3d_only.stk" | ||
400 | + | ||
401 | + # Prepare a file | ||
402 | + with open(path_to_seq_data+f"/realigned/{f}++.afa") as al_file: | ||
403 | + al = AlignIO.read(al_file, "fasta") | ||
404 | + names = [ x.id for x in al if '[' in x.id ] | ||
405 | + al = al[-len(names):] | ||
406 | + with open(f + "_3d_only.stk", "w") as only_3d: | ||
407 | + only_3d.write(al.format("stockholm")) | ||
408 | + del al | ||
409 | + | ||
410 | + # Prepare the job | ||
411 | + process = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE) | ||
412 | + id_matrix = np.zeros((len(names), len(names))) | ||
413 | + | ||
414 | + pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", leave=False) | ||
415 | + while process.poll() is None: | ||
416 | + output = process.stdout.readline() | ||
417 | + if output: | ||
418 | + lines = output.strip().split(b'\n') | ||
419 | + for l in lines: | ||
420 | + line = l.split() | ||
421 | + s1 = line[0].decode('utf-8') | ||
422 | + s2 = line[1].decode('utf-8') | ||
423 | + score = line[2].decode('utf-8') | ||
424 | + id1 = names.index(s1) | ||
425 | + id2 = names.index(s2) | ||
426 | + id_matrix[id1, id2] = float(score) | ||
427 | + pbar.update(1) | ||
428 | + pbar.close() | ||
429 | + | ||
430 | + subprocess.run(["rm", "-f", f + "_3d_only.stk"]) | ||
431 | + np.save("data/"+f+".npy", id_matrix) | ||
432 | + idxQueue.put(thr_idx) # replace the thread index in the queue | ||
433 | + return 0 | ||
434 | + | ||
435 | +def seq_idty(): | ||
436 | + """Computes identity matrices for each of the RNA families. | ||
437 | + | ||
438 | + REQUIRES temporary results files in data/*.npy | ||
439 | + REQUIRES tables chain, family un to date.""" | ||
440 | + | ||
441 | + # load distance matrices | ||
442 | + fam_arrays = [] | ||
443 | + for f in famlist: | ||
444 | + if path.isfile("data/"+f+".npy"): | ||
445 | + fam_arrays.append(np.load("data/"+f+".npy")) | ||
446 | + else: | ||
447 | + fam_arrays.append([]) | ||
448 | + | ||
449 | + # Update database with identity percentages | ||
450 | + conn = sqlite3.connect("results/RNANet.db") | ||
451 | + for f, D in zip(famlist, fam_arrays): | ||
452 | + if not len(D): continue | ||
453 | + a = 1.0 - np.average(D + D.T) # Get symmetric matrix instead of lower triangle + convert from distance matrix to identity matrix | ||
454 | + conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';") | ||
455 | + conn.commit() | ||
456 | + conn.close() | ||
457 | + | ||
458 | + # Plots plots plots | ||
459 | + fig, axs = plt.subplots(4,17, figsize=(17,5.75)) | ||
460 | + axs = axs.ravel() | ||
461 | + [axi.set_axis_off() for axi in axs] | ||
462 | + im = "" # Just to declare the variable, it will be set in the loop | ||
463 | + for f, D, ax in zip(famlist, fam_arrays, axs): | ||
464 | + if not len(D): continue | ||
465 | + if D.shape[0] > 2: # Cluster only if there is more than 2 sequences to organize | ||
466 | + D = D + D.T # Copy the lower triangle to upper, to get a symmetrical matrix | ||
467 | + condensedD = squareform(D) | ||
468 | + | ||
469 | + # Compute basic dendrogram by Ward's method | ||
470 | + Y = sch.linkage(condensedD, method='ward') | ||
471 | + Z = sch.dendrogram(Y, orientation='left', no_plot=True) | ||
472 | + | ||
473 | + # Reorganize rows and cols | ||
474 | + idx1 = Z['leaves'] | ||
475 | + D = D[idx1,:] | ||
476 | + D = D[:,idx1[::-1]] | ||
477 | + im = ax.matshow(1.0 - D, vmin=0, vmax=1, origin='lower') # convert to identity matrix 1 - D from distance matrix D | ||
478 | + ax.set_title(f + "\n(" + str(len(mappings_list[f]))+ " chains)", fontsize=10) | ||
479 | + fig.tight_layout() | ||
480 | + fig.subplots_adjust(wspace=0.1, hspace=0.3) | ||
481 | + fig.colorbar(im, ax=axs[-1], shrink=0.8) | ||
482 | + fig.savefig(f"results/figures/distances.png") | ||
483 | + notify("Computed all identity matrices and saved the figure.") | ||
355 | 484 | ||
356 | def stats_pairs(): | 485 | def stats_pairs(): |
357 | """Counts occurrences of intra-chain base-pair types in RNA families | 486 | """Counts occurrences of intra-chain base-pair types in RNA families |
... | @@ -363,26 +492,15 @@ def stats_pairs(): | ... | @@ -363,26 +492,15 @@ def stats_pairs(): |
363 | return family_data.apply(partial(format_percentage, sum(family_data))) | 492 | return family_data.apply(partial(format_percentage, sum(family_data))) |
364 | 493 | ||
365 | if not path.isfile("data/pair_counts.csv"): | 494 | if not path.isfile("data/pair_counts.csv"): |
366 | - p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=read_cpu_number(), maxtasksperchild=5) | ||
367 | - try: | ||
368 | - fam_pbar = tqdm(total=len(fam_list), desc="Pair-types in families", position=0, leave=True) | ||
369 | results = [] | 495 | results = [] |
370 | allpairs = [] | 496 | allpairs = [] |
371 | - for _, newp_famdf in enumerate(p.imap_unordered(parallel_stats_pairs, fam_list)): | 497 | + for f in fam_list: |
372 | - newpairs, fam_df = newp_famdf | 498 | + newpairs = pd.read_csv(f"data/{f}_pairs.csv", index_col=0) |
373 | - fam_pbar.update(1) | 499 | + fam_df = pd.read_csv(f"data/{f}_counts.csv", index_col=0) |
374 | results.append(fam_df) | 500 | results.append(fam_df) |
375 | allpairs.append(newpairs) | 501 | allpairs.append(newpairs) |
376 | - fam_pbar.close() | 502 | + subprocess.run(["rm", "-f", f"data/{f}_pairs.csv"]) |
377 | - p.close() | 503 | + subprocess.run(["rm", "-f", f"data/{f}_counts.csv"]) |
378 | - p.join() | ||
379 | - except KeyboardInterrupt: | ||
380 | - warn("KeyboardInterrupt, terminating workers.", error=True) | ||
381 | - fam_pbar.close() | ||
382 | - p.terminate() | ||
383 | - p.join() | ||
384 | - exit(1) | ||
385 | - | ||
386 | all_pairs = pd.concat(allpairs) | 504 | all_pairs = pd.concat(allpairs) |
387 | df = pd.concat(results).fillna(0) | 505 | df = pd.concat(results).fillna(0) |
388 | df.to_csv("data/pair_counts.csv") | 506 | df.to_csv("data/pair_counts.csv") |
... | @@ -431,86 +549,6 @@ def stats_pairs(): | ... | @@ -431,86 +549,6 @@ def stats_pairs(): |
431 | 549 | ||
432 | notify("Computed nucleotide statistics and saved CSV and PNG file.") | 550 | notify("Computed nucleotide statistics and saved CSV and PNG file.") |
433 | 551 | ||
434 | -def to_dist_matrix(f): | ||
435 | - if path.isfile("data/"+f+".npy"): | ||
436 | - notify(f"Computed {f} distance matrix", "loaded from file") | ||
437 | - return 0 | ||
438 | - | ||
439 | - notify(f"Computing {f} distance matrix from alignment...") | ||
440 | - dm = DistanceCalculator('identity') | ||
441 | - with open(path_to_seq_data+"/realigned/"+f+"++.afa") as al_file: | ||
442 | - al = AlignIO.read(al_file, "fasta")[-len(mappings_list[f]):] | ||
443 | - idty = dm.get_distance(al).matrix # list of lists | ||
444 | - del al | ||
445 | - l = len(idty) | ||
446 | - np.save("data/"+f+".npy", np.array([ idty[i] + [0]*(l-1-i) if i<l-1 else idty[i] for i in range(l) ], dtype=object)) | ||
447 | - del idty | ||
448 | - notify(f"Computed {f} distance matrix") | ||
449 | - return 0 | ||
450 | - | ||
451 | -def seq_idty(): | ||
452 | - """Computes identity matrices for each of the RNA families. | ||
453 | - | ||
454 | - Creates temporary results files in data/*.npy | ||
455 | - REQUIRES tables chain, family un to date.""" | ||
456 | - | ||
457 | - # List the families for which we will compute sequence identity matrices | ||
458 | - conn = sqlite3.connect("results/RNANet.db") | ||
459 | - famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 1 ORDER BY rfam_acc ASC;") ] | ||
460 | - ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;") ] | ||
461 | - if len(ignored): | ||
462 | - print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n') | ||
463 | - | ||
464 | - # compute distance matrices (or ignore if data/RF0****.npy exists) | ||
465 | - p = Pool(processes=8) | ||
466 | - p.map(to_dist_matrix, famlist) | ||
467 | - p.close() | ||
468 | - p.join() | ||
469 | - | ||
470 | - # load them | ||
471 | - fam_arrays = [] | ||
472 | - for f in famlist: | ||
473 | - if path.isfile("data/"+f+".npy"): | ||
474 | - fam_arrays.append(np.load("data/"+f+".npy")) | ||
475 | - else: | ||
476 | - fam_arrays.append([]) | ||
477 | - | ||
478 | - # Update database with identity percentages | ||
479 | - conn = sqlite3.connect("results/RNANet.db") | ||
480 | - for f, D in zip(famlist, fam_arrays): | ||
481 | - if not len(D): continue | ||
482 | - a = 1.0 - np.average(D + D.T) # Get symmetric matrix instead of lower triangle + convert from distance matrix to identity matrix | ||
483 | - conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';") | ||
484 | - conn.commit() | ||
485 | - conn.close() | ||
486 | - | ||
487 | - # Plots plots plots | ||
488 | - fig, axs = plt.subplots(4,17, figsize=(17,5.75)) | ||
489 | - axs = axs.ravel() | ||
490 | - [axi.set_axis_off() for axi in axs] | ||
491 | - im = "" # Just to declare the variable, it will be set in the loop | ||
492 | - for f, D, ax in zip(famlist, fam_arrays, axs): | ||
493 | - if not len(D): continue | ||
494 | - if D.shape[0] > 2: # Cluster only if there is more than 2 sequences to organize | ||
495 | - D = D + D.T # Copy the lower triangle to upper, to get a symetrical matrix | ||
496 | - condensedD = squareform(D) | ||
497 | - | ||
498 | - # Compute basic dendrogram by Ward's method | ||
499 | - Y = sch.linkage(condensedD, method='ward') | ||
500 | - Z = sch.dendrogram(Y, orientation='left', no_plot=True) | ||
501 | - | ||
502 | - # Reorganize rows and cols | ||
503 | - idx1 = Z['leaves'] | ||
504 | - D = D[idx1,:] | ||
505 | - D = D[:,idx1[::-1]] | ||
506 | - im = ax.matshow(1.0 - D, vmin=0, vmax=1, origin='lower') # convert to identity matrix 1 - D from distance matrix D | ||
507 | - ax.set_title(f + "\n(" + str(len(mappings_list[f]))+ " chains)", fontsize=10) | ||
508 | - fig.tight_layout() | ||
509 | - fig.subplots_adjust(wspace=0.1, hspace=0.3) | ||
510 | - fig.colorbar(im, ax=axs[-1], shrink=0.8) | ||
511 | - fig.savefig(f"results/figures/distances.png") | ||
512 | - notify("Computed all identity matrices and saved the figure.") | ||
513 | - | ||
514 | def per_chain_stats(): | 552 | def per_chain_stats(): |
515 | """Computes per-chain frequencies and base-pair type counts. | 553 | """Computes per-chain frequencies and base-pair type counts. |
516 | 554 | ||
... | @@ -524,39 +562,71 @@ def per_chain_stats(): | ... | @@ -524,39 +562,71 @@ def per_chain_stats(): |
524 | df = df.drop("total", axis=1) | 562 | df = df.drop("total", axis=1) |
525 | 563 | ||
526 | # Set the values | 564 | # Set the values |
565 | + conn.execute('pragma journal_mode=wal') | ||
527 | sql_execute(conn, "UPDATE chain SET chain_freq_A = ?, chain_freq_C = ?, chain_freq_G = ?, chain_freq_U = ?, chain_freq_other = ? WHERE chain_id= ?;", | 566 | sql_execute(conn, "UPDATE chain SET chain_freq_A = ?, chain_freq_C = ?, chain_freq_G = ?, chain_freq_U = ?, chain_freq_other = ? WHERE chain_id= ?;", |
528 | many=True, data=list(df.to_records(index=False)), warn_every=10) | 567 | many=True, data=list(df.to_records(index=False)), warn_every=10) |
529 | notify("Updated the database with per-chain base frequencies") | 568 | notify("Updated the database with per-chain base frequencies") |
530 | 569 | ||
570 | +def log_to_pbar(pbar): | ||
571 | + def update(r): | ||
572 | + pbar.update(1) | ||
573 | + return update | ||
574 | + | ||
531 | if __name__ == "__main__": | 575 | if __name__ == "__main__": |
532 | 576 | ||
533 | os.makedirs("results/figures/wadley_plots/", exist_ok=True) | 577 | os.makedirs("results/figures/wadley_plots/", exist_ok=True) |
534 | 578 | ||
535 | print("Loading mappings list...") | 579 | print("Loading mappings list...") |
536 | - conn = sqlite3.connect("results/RNANet.db") | 580 | + with sqlite3.connect("results/RNANet.db") as conn: |
537 | fam_list = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;") ] | 581 | fam_list = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;") ] |
538 | mappings_list = {} | 582 | mappings_list = {} |
539 | for k in fam_list: | 583 | for k in fam_list: |
540 | - mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain WHERE rfam_acc='{k}';") ] | 584 | + mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain WHERE rfam_acc='{k}' and issue=0;") ] |
541 | - conn.close() | ||
542 | 585 | ||
543 | - # stats_pairs() | 586 | + # List the families for which we will compute sequence identity matrices |
587 | + with sqlite3.connect("results/RNANet.db") as conn: | ||
588 | + famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 0 ORDER BY rfam_acc ASC;") ] | ||
589 | + ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;") ] | ||
590 | + if len(ignored): | ||
591 | + print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n') | ||
544 | 592 | ||
545 | - # Define threads for the tasks | 593 | + # Prepare the multiprocessing execution environment |
546 | - threads = [ | 594 | + nworkers = max(read_cpu_number()-1, 32) |
547 | - th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 1}), | 595 | + thr_idx_mgr = Manager() |
548 | - th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 4}), | 596 | + idxQueue = thr_idx_mgr.Queue() |
549 | - th.Thread(target=stats_len), # computes figures | 597 | + for i in range(nworkers): |
550 | - th.Thread(target=stats_freq), # Updates the database | 598 | + idxQueue.put(i) |
551 | - th.Thread(target=seq_idty), # produces .npy files and seq idty figures | 599 | + |
552 | - th.Thread(target=per_chain_stats) # Updates the database | 600 | + # Define the tasks |
553 | - ] | 601 | + joblist = [] |
602 | + joblist.append(Job(function=reproduce_wadley_results, args=(1,))) | ||
603 | + joblist.append(Job(function=reproduce_wadley_results, args=(4,))) | ||
604 | + joblist.append(Job(function=stats_len)) # Computes figures | ||
605 | + joblist.append(Job(function=stats_freq)) # updates the database | ||
606 | + for f in famlist: | ||
607 | + joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database | ||
608 | + if f not in ignored: | ||
609 | + joblist.append(Job(function=to_dist_matrix, args=(f,))) # updates the database | ||
554 | 610 | ||
555 | - # Start the threads | 611 | + p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers) |
556 | - for t in threads: | 612 | + pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, leave=True) |
557 | - t.start() | ||
558 | 613 | ||
559 | - # Wait for the threads to complete | 614 | + try: |
560 | - for t in threads: | 615 | + for j in joblist: |
561 | - t.join() | 616 | + p.apply_async(j.func_, args=j.args_, callback=log_to_pbar(pbar)) |
617 | + p.close() | ||
618 | + p.join() | ||
619 | + pbar.close() | ||
620 | + except KeyboardInterrupt: | ||
621 | + warn("KeyboardInterrupt, terminating workers.", error=True) | ||
622 | + p.terminate() | ||
623 | + p.join() | ||
624 | + pbar.close() | ||
625 | + exit(1) | ||
626 | + except: | ||
627 | + print("Something went wrong") | ||
562 | 628 | ||
629 | + # finish the work after the parallel portions | ||
630 | + per_chain_stats() | ||
631 | + seq_idty() | ||
632 | + stats_pairs() | ... | ... |
-
Please register or login to post a comment