Louis BECQUEY

more resolution-specific statistics

RNAnet.py

@@ -1,5 +1,6 @@
 #!/usr/bin/python3.8
 import Bio
+import Bio.PDB as pdb
 import concurrent.futures
 import getopt
 import gzip
@@ -25,7 +26,8 @@ from multiprocessing import Pool, Manager
 from time import sleep
 from tqdm import tqdm
 from setproctitle import setproctitle
-
+from Bio import AlignIO, SeqIO
+from Bio.Align import AlignInfo

 def trace_unhandled_exceptions(func):
     @wraps(func)
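For context: only the first two lines of trace_unhandled_exceptions appear in this excerpt. A minimal sketch of what such a decorator typically looks like, as a reading aid (the actual body is not part of this commit):

# Reading aid, not part of the commit: a sketch of a trace_unhandled_exceptions-style
# decorator. The real body is not shown in this diff.
import traceback
from functools import wraps

def trace_unhandled_exceptions(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            # In a multiprocessing worker, exceptions die silently unless logged here
            traceback.print_exc()
    return wrapper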
@@ -112,7 +114,7 @@ class SelectivePortionSelector(object):
         return 1


-class BufferingSummaryInfo(Bio.Align.AlignInfo.SummaryInfo):
+class BufferingSummaryInfo(AlignInfo.SummaryInfo):

     def get_pssm(self, family, index):
         """Create a position specific score matrix object for the alignment.
@@ -139,7 +141,7 @@ class BufferingSummaryInfo(Bio.Align.AlignInfo.SummaryInfo):
                    score_dict[this_residue] = 1.0
            pssm_info.append(('*', score_dict))

-        return Bio.Align.AlignInfo.PSSM(pssm_info)
+        return AlignInfo.PSSM(pssm_info)


 class Chain:
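The refactored imports follow stock Biopython usage: AlignInfo.SummaryInfo wraps an alignment and can emit a PSSM, which is roughly what BufferingSummaryInfo.get_pssm reimplements with progress reporting. A minimal sketch, with a hypothetical file name:

# Reading aid, not part of the commit. The file name is hypothetical.
from Bio import AlignIO
from Bio.Align import AlignInfo

align = AlignIO.read("RF00005++.afa", "fasta")    # a family alignment
summary = AlignInfo.SummaryInfo(align)
pssm = summary.pos_specific_score_matrix()        # stock Biopython counterpart of get_pssm()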
@@ -198,11 +200,11 @@ class Chain:

         with warnings.catch_warnings():
             # Ignore the PDB problems. This mostly warns that some chain is discontinuous.
-            warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.PDBConstructionWarning)
-            warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning)
+            warnings.simplefilter('ignore', pdb.PDBExceptions.PDBConstructionWarning)
+            warnings.simplefilter('ignore', pdb.PDBExceptions.BiopythonWarning)

             # Load the whole mmCIF into a Biopython structure object:
-            mmcif_parser = Bio.PDB.MMCIFParser()
+            mmcif_parser = pdb.MMCIFParser()
             try:
                 s = mmcif_parser.get_structure(self.pdb_id, path_to_3D_data + "RNAcifs/"+self.pdb_id+".cif")
             except ValueError as e:
...@@ -223,7 +225,7 @@ class Chain: ...@@ -223,7 +225,7 @@ class Chain:
223 sel = SelectivePortionSelector(model_idx, self.pdb_chain_id, valid_set, khetatm) 225 sel = SelectivePortionSelector(model_idx, self.pdb_chain_id, valid_set, khetatm)
224 226
225 # Save that selection on the mmCIF object s to file 227 # Save that selection on the mmCIF object s to file
226 - ioobj = Bio.PDB.mmcifio.MMCIFIO() 228 + ioobj = pdb.MMCIFIO()
227 ioobj.set_structure(s) 229 ioobj.set_structure(s)
228 ioobj.save(self.file, sel) 230 ioobj.save(self.file, sel)
229 231
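The pattern above, a Select subclass handed to MMCIFIO.save, is standard Biopython. A self-contained sketch with a simpler selector (class name and file names hypothetical):

# Reading aid, not part of the commit: the Select + MMCIFIO pattern in isolation.
# ChainSelector and the file names are hypothetical.
import Bio.PDB as pdb

class ChainSelector(pdb.Select):
    def __init__(self, chain_id):
        self.chain_id = chain_id
    def accept_chain(self, chain):
        return chain.get_id() == self.chain_id        # keep only the requested chain

s = pdb.MMCIFParser(QUIET=True).get_structure("1ffk", "1ffk.cif")
io = pdb.MMCIFIO()
io.set_structure(s)
io.save("1ffk_0.cif", ChainSelector("0"))             # writes only the selected portion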
@@ -1115,7 +1117,7 @@ class Pipeline:
            print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &")
            sys.exit()
        elif opt == '--version':
-            print("RNANet 1.1 beta")
+            print("RNANet 1.2, parallelized, Dockerized")
            sys.exit()
        elif opt == "-r" or opt == "--resolution":
            assert float(arg) > 0.0 and float(arg) <= 20.0
@@ -1445,7 +1447,7 @@ class Pipeline:
         # Update the database
         data = []
         for r in results:
-            align = Bio.AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta")
+            align = AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta")
             nb_3d_chains = len([1 for r in align if '[' in r.id])
             if r[0] in SSU_set:  # SSU v138 is used
                 nb_homologs = 2225272  # source: https://www.arb-silva.de/documentation/release-138/
@@ -1535,9 +1537,9 @@ class Pipeline:
         # Run statistics
         if self.RUN_STATS:
             # Remove previous precomputed data
-            subprocess.run(["rm", "-f", runDir + "/data/wadley_kernel_eta.npz",
-                            runDir + "/data/wadley_kernel_eta_prime.npz",
-                            runDir + "/data/pair_counts.csv"])
+            subprocess.run(["rm", "-f", runDir + f"/data/wadley_kernel_eta_{self.CRYSTAL_RES}.npz",
+                            runDir + f"/data/wadley_kernel_eta_prime_{self.CRYSTAL_RES}.npz",
+                            runDir + f"/data/pair_counts_{self.CRYSTAL_RES}.csv"])
             for f in self.fam_list:
                 subprocess.run(["rm", "-f", runDir + f"/data/{f}.npy",
                                 runDir + f"/data/{f}_pairs.csv",
@@ -2124,7 +2126,7 @@ def work_mmcif(pdb_id):
     # if not, read the CIF header and register the structure
     if not len(r):
         # Load the MMCIF file with Biopython
-        mmCif_info = Bio.PDB.MMCIF2Dict.MMCIF2Dict(final_filepath)
+        mmCif_info = pdb.MMCIF2Dict.MMCIF2Dict(final_filepath)

         # Get info about that structure
         try:
@@ -2218,7 +2220,7 @@ def work_prepare_sequences(dl, rfam_acc, chains):
    if rfam_acc in LSU_set | SSU_set:  # rRNA
        if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"):
            # Detect doublons and remove them
-            existing_afa = Bio.AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta")
+            existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta")
            existing_ids = [r.id for r in existing_afa]
            del existing_afa
            new_ids = [str(c) for c in chains]
@@ -2227,7 +2229,7 @@ def work_prepare_sequences(dl, rfam_acc, chains):
            if len(doublons):
                warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.fa and using their newest version")
                fasta = path_to_seq_data + f"realigned/{rfam_acc}++.fa"
-                seqfile = Bio.SeqIO.parse(fasta, "fasta")
+                seqfile = SeqIO.parse(fasta, "fasta")
                # remove it and rewrite it with its own content filtered
                os.remove(fasta)
                with open(fasta, 'w') as f:
@@ -2268,7 +2270,7 @@ def work_prepare_sequences(dl, rfam_acc, chains):
        with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "w") as plusplus:
            ids = set()
            # Remove doublons from the Rfam hits
-            for r in Bio.SeqIO.parse(path_to_seq_data + f"realigned/{rfam_acc}.fa", "fasta"):
+            for r in SeqIO.parse(path_to_seq_data + f"realigned/{rfam_acc}.fa", "fasta"):
                if r.id not in ids:
                    ids.add(r.id)
                    plusplus.write('> '+r.description+'\n'+str(r.seq)+'\n')
@@ -2343,10 +2345,10 @@ def work_realign(rfam_acc):
     notify("Aligned new sequences together")

     # Detect doublons and remove them
-    existing_stk = Bio.AlignIO.read(existing_ali_path, "stockholm")
+    existing_stk = AlignIO.read(existing_ali_path, "stockholm")
     existing_ids = [r.id for r in existing_stk]
     del existing_stk
-    new_stk = Bio.AlignIO.read(new_ali_path, "stockholm")
+    new_stk = AlignIO.read(new_ali_path, "stockholm")
     new_ids = [r.id for r in new_stk]
     del new_stk
     doublons = [i for i in existing_ids if i in new_ids]
@@ -2447,7 +2449,7 @@ def work_pssm(f, fill_gaps):

     # Open the alignment
     try:
-        align = Bio.AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")
+        align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")
     except:
         warn(f"{f}'s alignment is wrong. Recompute it and retry.", error=True)
         with open(runDir + "/errors.txt", "a") as errf:

statistics.py
@@ -70,7 +70,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=2.0):
    thr_idx = idxQueue.get()
    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} reproduce_wadley_results(carbon={carbon})")

-    pbar = tqdm(total=2, desc=f"Worker {thr_idx+1}: eta/theta C{carbon} kernels", position=thr_idx+1, leave=False)
+    pbar = tqdm(total=2, desc=f"Worker {thr_idx+1}: eta/theta C{carbon} kernels", unit="kernel", position=thr_idx+1, leave=False)

    # Extract the angle values of c2'-endo and c3'-endo nucleotides
    with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
@@ -203,25 +203,10 @@ def stats_len():
    global idxQueue
    thr_idx = idxQueue.get()

-    # sort the RNA families so that the plot is readable
-    def family_order(f):
-        if f in LSU_set:
-            return 4
-        elif f in SSU_set:
-            return 3
-        elif f in ["RF00001"]:      #
-            return 1                # put tRNAs and 5S rRNAs first,
-        elif f in ["RF00005"]:      # because of the logarithmic scale, otherwise, they look tiny
-            return 0                #
-        else:
-            return 2
-
-    fam_list.sort(key=family_order)
-
    cols = []
    lengths = []

-    for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Average chain lengths", leave=False):
+    for f in tqdm(famlist, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Average chain lengths", unit="family", leave=False):

        # Define a color for that family in the plot
        if f in LSU_set:
@@ -249,7 +234,7 @@ def stats_len():
    # Plot the figure
    fig = plt.figure(figsize=(10,3))
    ax = fig.gca()
-    ax.hist(lengths, bins=100, stacked=True, log=True, color=cols, label=fam_list)
+    ax.hist(lengths, bins=100, stacked=True, log=True, color=cols, label=famlist)
    ax.set_xlabel("Sequence length (nucleotides)", fontsize=8)
    ax.set_ylabel("Number of 3D chains", fontsize=8)
    ax.set_xlim(left=-150)
@@ -303,18 +288,18 @@ def stats_freq():

    # Initialize a Counter object for each family
    freqs = {}
-    for f in fam_list:
+    for f in famlist:
        freqs[f] = Counter()

    # List all nt_names happening within a RNA family and store the counts in the Counter
-    for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False):
+    for f in tqdm(famlist, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", unit="family", leave=False):
        with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
            counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;", warn_every=0))
        freqs[f].update(counts)

    # Create a pandas DataFrame, and save it to CSV.
    df = pd.DataFrame()
-    for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False):
+    for f in tqdm(famlist, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", unit="family", leave=False):
        tot = sum(freqs[f].values())
        df = pd.concat([ df, pd.DataFrame([[ format_percentage(tot, x) for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ])
    df = df.fillna(0)
@@ -322,12 +307,13 @@ def stats_freq():
    idxQueue.put(thr_idx)  # replace the thread index in the queue
    # notify("Saved nucleotide frequencies to CSV file.")

+@trace_unhandled_exceptions
def parallel_stats_pairs(f):
    """Counts occurrences of intra-chain base-pair types in one RNA family

    REQUIRES tables chain, nucleotide up-to-date."""

-    if path.isfile("data/"+f+"_pairs.csv") and path.isfile("data/"+f+"_counts.csv"):
+    if path.isfile(runDir + "/data/"+f+"_pairs.csv") and path.isfile(runDir + "/data/"+f+"_counts.csv"):
        return

    # Get a worker number to position the progress bar
@@ -339,7 +325,7 @@ def parallel_stats_pairs(f):
    chain_id_list = mappings_list[f]
    data = []
    sqldata = []
-    for cid in tqdm(chain_id_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} basepair types", leave=False):
+    for cid in tqdm(chain_id_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} basepair types", unit="chain", leave=False):
        with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
            # Get comma separated lists of basepairs per nucleotide
            interactions = pd.DataFrame(
@@ -430,16 +416,19 @@ def parallel_stats_pairs(f):

    idxQueue.put(thr_idx)  # replace the thread index in the queue

-def to_dist_matrix(f):
+def to_id_matrix(f):
+    """
+    Extracts sequences of 3D chains from the family alignments to a distinct STK file,
+    then runs esl-alipid on it to get an identity matrix.
+    """
    if path.isfile("data/"+f+".npy"):
-        # notify(f"Computed {f} distance matrix", "loaded from file")
        return 0

    # Get a worker number to position the progress bar
    global idxQueue
    thr_idx = idxQueue.get()

-    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} to_dist_matrix({f})")
+    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} to_id_matrix({f})")

    # Prepare a file
    with open(path_to_seq_data+f"/realigned/{f}++.afa") as al_file:
@@ -452,14 +441,16 @@ def to_dist_matrix(f):
    except ValueError as e:
        warn(e)
    del al
-    subprocess.run(["esl-reformat", "--informat", "stockholm", "--mingap", "-o", path_to_seq_data+f"/realigned/{f}_3d_only.stk", "stockholm", path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk"])
+    subprocess.run(["esl-reformat", "--informat", "stockholm", "--mingap",            #
+                    "-o", path_to_seq_data+f"/realigned/{f}_3d_only.stk",             # This run just deletes columns of gaps
+                    "stockholm", path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk"]) #
+    subprocess.run(["rm", "-f", f + "_3d_only_tmp.stk"])

    # Prepare the job
    process = subprocess.Popen(shlex.split(f"esl-alipid --rna --noheader --informat stockholm {path_to_seq_data}realigned/{f}_3d_only.stk"),
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    id_matrix = np.zeros((len(names), len(names)))
-
-    pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", leave=False)
+    pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", unit="comparisons", leave=False)
    cnt = 0
    while not cnt or process.poll() is None:
        output = process.stdout.read()
@@ -482,8 +473,8 @@ def to_dist_matrix(f):
        warn("\n".join([ line.decode('utf-8') for line in l ]))
    pbar.close()

-    subprocess.run(["rm", "-f", f + "_3d_only_tmp.stk"])
    np.save("data/"+f+".npy", id_matrix)
+
    idxQueue.put(thr_idx)  # replace the thread index in the queue
    return 0

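For context, esl-alipid with --noheader prints one line per pair of sequences, with the documented columns "seqname1 seqname2 %id nid denomid %match nmatch denommatch"; the loop above accumulates these into id_matrix. A standalone sketch of the parsing idea (names and the captured-output variable are hypothetical):

# Reading aid, not part of the commit. Assumes esl-alipid's documented output columns.
import numpy as np

names = ["chain_A", "chain_B", "chain_C"]              # hypothetical sequence names
idx = {n: i for i, n in enumerate(names)}
id_matrix = np.zeros((len(names), len(names)))
for line in alipid_stdout.splitlines():                # alipid_stdout: captured process output
    fields = line.split()
    i, j = idx[fields[0]], idx[fields[1]]
    id_matrix[i, j] = id_matrix[j, i] = float(fields[2]) / 100.0   # store identity as a fraction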
@@ -545,7 +536,7 @@ def seq_idty():
    fig.tight_layout()
    fig.subplots_adjust(hspace=0.3, wspace=0.1)
    fig.colorbar(im, ax=axs[-4], shrink=0.8)
-    fig.savefig(runDir + f"/results/figures/distances.png")
+    fig.savefig(runDir + f"/results/figures/distances_{res_thr}.png")
    print("> Computed all identity matrices and saved the figure.", flush=True)

def stats_pairs():
@@ -559,10 +550,10 @@ def stats_pairs():
    def line_format(family_data):
        return family_data.apply(partial(format_percentage, sum(family_data)))

-    if not path.isfile("data/pair_counts.csv"):
+    if not path.isfile(runDir + f"/data/pair_counts_{res_thr}.csv"):
        results = []
        allpairs = []
-        for f in fam_list:
+        for f in famlist:
            newpairs = pd.read_csv(runDir + f"/data/{f}_pairs.csv", index_col=0)
            fam_df = pd.read_csv(runDir + f"/data/{f}_counts.csv", index_col=0)
            results.append(fam_df)
...@@ -571,11 +562,11 @@ def stats_pairs(): ...@@ -571,11 +562,11 @@ def stats_pairs():
571 subprocess.run(["rm", "-f", runDir + f"/data/{f}_counts.csv"]) 562 subprocess.run(["rm", "-f", runDir + f"/data/{f}_counts.csv"])
572 all_pairs = pd.concat(allpairs) 563 all_pairs = pd.concat(allpairs)
573 df = pd.concat(results).fillna(0) 564 df = pd.concat(results).fillna(0)
574 - df.to_csv("data/pair_counts.csv") 565 + df.to_csv(runDir + f"/data/pair_counts_{res_thr}.csv")
575 - all_pairs.to_csv("data/all_pairs.csv") 566 + all_pairs.to_csv(runDir + f"/data/all_pairs_{res_thr}.csv")
576 else: 567 else:
577 - df = pd.read_csv("data/pair_counts.csv", index_col=0) 568 + df = pd.read_csv(runDir + f"/data/pair_counts_{res_thr}.csv", index_col=0)
578 - all_pairs = pd.read_csv("data/all_pairs.csv", index_col=0) 569 + all_pairs = pd.read_csv(runDir + f"/data/all_pairs_{res_thr}.csv", index_col=0)
579 570
580 crosstab = pd.crosstab(all_pairs.pair_type_LW, all_pairs.basepair) 571 crosstab = pd.crosstab(all_pairs.pair_type_LW, all_pairs.basepair)
581 col_list = [ x for x in df.columns if '.' in x ] 572 col_list = [ x for x in df.columns if '.' in x ]
@@ -613,7 +604,7 @@ def stats_pairs():
    ax.set_ylabel("Number of observations (millions)", fontsize=13)
    ax.set_xlabel(None)
    plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99)
-    plt.savefig(runDir + "/results/figures/pairings.png")
+    plt.savefig(runDir + f"/results/figures/pairings_{res_thr}.png")

    notify("Computed nucleotide statistics and saved CSV and PNG file.")

@@ -916,8 +907,24 @@ def log_to_pbar(pbar):
        pbar.update(1)
    return update

+def family_order(f):
+    # sort the RNA families so that the plots are readable
+
+    if f in LSU_set:
+        return 4
+    elif f in SSU_set:
+        return 3
+    elif f in ["RF00001"]:      #
+        return 1                # put tRNAs and 5S rRNAs first,
+    elif f in ["RF00005"]:      # because of the logarithmic scale of the lengths figure, otherwise they look tiny
+        return 0                #
+    else:
+        return 2
+
 if __name__ == "__main__":

+    os.makedirs(runDir + "/results/figures/", exist_ok=True)
+
    # parse options
    DELETE_OLD_DATA = False
    DO_WADLEY_ANALYSIS = False
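family_order is now a module-level helper shared by the plotting functions (before this commit, stats_len defined and applied it locally). A quick illustration of the ordering it produces, assuming RF02541 belongs to LSU_set and RF00162 (SAM riboswitch) falls in the default bucket:

# Reading aid, not part of the commit. Set memberships are assumptions.
fams = ["RF02541", "RF00162", "RF00001", "RF00005"]
print(sorted(fams, key=family_order))
# -> ['RF00005', 'RF00001', 'RF00162', 'RF02541']   (tRNA, 5S rRNA, others, LSU)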
@@ -943,7 +950,7 @@ if __name__ == "__main__":
            print("--from-scratch\t\t\tDo not use precomputed results from past runs, recompute everything")
            sys.exit()
        elif opt == '--version':
-            print("RNANet statistics 1.1 beta")
+            print("RNANet statistics 1.2")
            sys.exit()
        elif opt == "-r" or opt == "--resolution":
            assert float(arg) > 0.0 and float(arg) <= 20.0
@@ -959,31 +966,38 @@ if __name__ == "__main__":
        elif opt=='--from-scratch':
            DELETE_OLD_DATA = True
            DO_WADLEY_ANALYSIS = True
-            subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"])
        elif opt=='--wadley':
            DO_WADLEY_ANALYSIS = True


-    # Load mappings
+    # Load mappings. famlist will contain only families with structures at this resolution threshold.
    print("Loading mappings list...")
    with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
-        fam_list = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;") ]
-        mappings_list = {}
-        for k in fam_list:
-            mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain JOIN structure ON chain.structure_id=structure.pdb_id WHERE rfam_acc='{k}' AND issue=0 AND resolution <= {res_thr};") ]
-
-    # List the families for which we will compute sequence identity matrices
-    with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
-        famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain WHERE issue = 0 GROUP BY rfam_acc) WHERE n_chains > 0 ORDER BY rfam_acc ASC;") ]
-        ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain WHERE issue = 0 GROUP BY rfam_acc) WHERE n_chains < 3 ORDER BY rfam_acc ASC;") ]
        n_unmapped_chains = sql_ask_database(conn, "SELECT COUNT(*) FROM chain WHERE rfam_acc='unmappd' AND issue=0;")[0][0]
+        families = pd.read_sql(f"""SELECT rfam_acc, count(*) as n_chains
+                                   FROM chain JOIN structure
+                                   ON chain.structure_id = structure.pdb_id
+                                   WHERE issue = 0 AND resolution <= {res_thr} AND rfam_acc != 'unmappd'
+                                   GROUP BY rfam_acc;
+                                """, conn)
+        families.drop(families[families.n_chains == 0].index, inplace=True)
+        mappings_list = {}
+        for k in families.rfam_acc:
+            mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"""SELECT chain_id
+                                                                          FROM chain JOIN structure ON chain.structure_id=structure.pdb_id
+                                                                          WHERE rfam_acc='{k}' AND issue=0 AND resolution <= {res_thr};""") ]
+    famlist = families.rfam_acc.tolist()
+    ignored = families[families.n_chains < 3].rfam_acc.tolist()
+    famlist.sort(key=family_order)
+    print(f"Found {len(famlist)} families with chains of resolution {res_thr}A or better.")
    if len(ignored):
        print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')

    if DELETE_OLD_DATA:
-        for f in fam_list:
+        for f in famlist:
            subprocess.run(["rm","-f", runDir + f"/data/{f}.npy", runDir + f"/data/{f}_pairs.csv", runDir + f"/data/{f}_counts.csv"])
-
+        if DO_WADLEY_ANALYSIS:
+            subprocess.run(["rm","-f", runDir + f"/data/wadley_kernel_eta_{res_thr}.npz", runDir + f"/data/wadley_kernel_eta_prime_{res_thr}.npz", runDir + f"/data/pair_counts_{res_thr}.csv"])

    # Prepare the multiprocessing execution environment
    nworkers = min(read_cpu_number()-1, 32)
@@ -995,17 +1009,17 @@ if __name__ == "__main__":
    # Define the tasks
    joblist = []
    if n_unmapped_chains and DO_WADLEY_ANALYSIS:
-        joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), 20.0)))  # res threshold is 4.0 Angstroms by default
-        joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), 20.0)))  #
+        joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr)))
+        joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr)))
    joblist.append(Job(function=stats_len))   # Computes figures
    joblist.append(Job(function=stats_freq))  # updates the database
    for f in famlist:
        joblist.append(Job(function=parallel_stats_pairs, args=(f,)))  # updates the database
        if f not in ignored:
-            joblist.append(Job(function=to_dist_matrix, args=(f,)))  # updates the database
+            joblist.append(Job(function=to_id_matrix, args=(f,)))  # updates the database

    p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
-    pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, leave=True)
+    pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, unit="job", leave=True)

    try:
        for j in joblist:
......
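The excerpt cuts off inside the dispatch loop. Given that log_to_pbar above returns an update callback, the truncated tail presumably resembles the following sketch (assumptions: Job exposes .function and .args as used in the joblist above):

# Reading aid, not part of the commit: a plausible shape for the truncated dispatch loop.
results = []
for j in joblist:
    results.append(p.apply_async(j.function, args=j.args, callback=log_to_pbar(pbar)))
for r in results:
    r.get()          # propagate worker exceptions to the main process
pbar.close()
p.close()
p.join()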