split rRNA distance matrices computations apart

Louis BECQUEY
Commit 4de494b7787fcb3d68a61c2efdeaed06af84bad2 4de494b7 1 parent dc3fd966
Showing 2 changed files with 130 additions and 99 deletions
RNAnet.py
statistics.py
--- a/RNAnet.py
View file @4de494b
+++ b/RNAnet.py
View file @4de494b
@@ -1483,7 +1483,7 @@ class Pipeline:
             os.makedirs(path_to_seq_data + "realigned/3D_only", exist_ok=True)
             subprocess.run(["cp", path_to_seq_data + "realigned/*_3d_only.afa", path_to_seq_data + "realigned/3d_only" ])
             subprocess.run(["rm", "-f", runDir + f"/archive/RNANET_alignments_latest.tar.gz"])
-            subprocess.run(["tar", "-C", path_to_seq_data + "realigned/3d_only" , "-czf", runDir + f"/archive/RNANET_alignments_latest.tar.gz", "."])
+            subprocess.run(["tar", "-C", path_to_seq_data + "realigned/3D_only" , "-czf", runDir + f"/archive/RNANET_alignments_latest.tar.gz", "."])
     def sanitize_database(self):
         """Searches for issues in the database and correct them"""
--- a/statistics.py
View file @4de494b
+++ b/statistics.py
View file @4de494b
@@ -911,128 +911,150 @@ def general_stats():
     fig.savefig(runDir + "/results/figures/Nfamilies.png")
     plt.close()
-def get_matrix_euclidian_distance(cif_file, aligned_seq, consider_all_atoms):
+def par_distance_matrix(filelist, f, label, consider_all_atoms, s):
-    """
+    
-    This function
+    # Identify the right 3D file
-    - loads the coordinates and the alignment, reconctructs the alignment but with coordinates, considering gaps, and 
+    filename = ''
-    - compute the matrix of euclidian distances.
+    for file in filelist:
+        if file.startswith(s.id.replace('-', '').replace('[', '_').replace(']', '_')):
+            filename = path_to_3D_data + "rna_mapped_to_Rfam/" + file
+            break
+    if not len(filename):
+        return None, None, None
+    
+    # Get the coordinates of every existing nt in the 3D file
+    try:
+        coordinates = nt_3d_centers(filename, consider_all_atoms)
+        if not len(coordinates):
+            # there are no nucleotides in the file, or no C1' atoms for example.
+            warn("No C1' atoms in " + filename)
+            return None, None, None
+    except FileNotFoundError:
+        return None, None, None
-    Returns:
-    The 2D np.array of euclidian distances between pairs of nucleotides, with np.NaNs in gap columns.
-    """
-    # Load the baricenter coordinates
-    coordinates = nt_3d_centers(cif_file, consider_all_atoms) 
-    # reconstruct the alignment but with coordinates
+    # Get the coordinates of every position in the alignment
     nb_gap = 0
     coordinates_with_gaps = []
-    for i in range(len(aligned_seq)):
+    for i, letter in enumerate(s.seq):
-        if aligned_seq[i] == '.' or  aligned_seq[i] == '-':
+        if letter in "-.":
-            nb_gap = nb_gap + 1
+            nb_gap += 1
-            coordinates_with_gaps.append('NA')
+            coordinates_with_gaps.append(np.nan)
         else:
             coordinates_with_gaps.append(coordinates[i - nb_gap])
-   
-    nb_nucleotides = len(coordinates_with_gaps)  # number of nucleotides
-    matrix = np.zeros((nb_nucleotides, nb_nucleotides))  # create a new empty matrix of size nxn
-    # Fill this new matrix with the euclidians distances between all amino acids considering gaps:
+    # Build the pairwise distances
-    for i in range(nb_nucleotides):
+    print("> Computing distances for", s.id)
-        for j in range(nb_nucleotides):
+    d = np.zeros((len(s.seq), len(s.seq)), dtype=np.float16)
-            if coordinates_with_gaps[i] == 'NA' or coordinates_with_gaps[j] == 'NA':
+    for i in range(len(s.seq)):
-                matrix[i][j] = np.nan
+        for j in range(len(s.seq)):
+            if np.isnan(coordinates_with_gaps[i]).any() or np.isnan(coordinates_with_gaps[j]).any():
+                d[i,j] = np.nan
             else:
-                matrix[i][j] = round(get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j]),3)
+                d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j])
-    return(matrix)
+    
+    print("> finished.")
+    np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f")
+    return 1-np.isnan(d).astype(int), np.nan_to_num(d), np.nan_to_num(d*d)
 @trace_unhandled_exceptions
-def get_avg_std_distance_matrix(f, consider_all_atoms):
+def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False):
-    # Get a worker number to position the progress bar
-    global idxQueue
-    thr_idx = idxQueue.get()
-
-    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} {f} residue distance matrices")
-
     if consider_all_atoms:
         label = "base"
     else:
         label = "backbone"
+    if not multithread:
+        # This function call is for ONE worker.
+        # Get a worker number for it to position the progress bar
+        global idxQueue
+        thr_idx = idxQueue.get()
+        setproctitle(f"RNANet statistics.py Worker {thr_idx+1} {f} {label} distance matrices")
+
     os.makedirs(runDir + '/results/distance_matrices/' + f + '_' + label, exist_ok=True )   
-   
-    family_matrices = []
     align = AlignIO.read(path_to_seq_data + f"realigned/{f}_3d_only.afa", "fasta")
+    ncols = align.get_alignment_length()
+    counts = np.zeros((ncols, ncols))
+    avg = np.zeros((ncols, ncols))
+    std = np.zeros((ncols, ncols))
     found = 0
     notfound = 0
-    pbar = tqdm(total = len(align), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} {label} distance matrices", unit="chains", leave=False)
-    pbar.update(0)
     with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
         conn.execute('pragma journal_mode=wal')
         r = sql_ask_database(conn, f"SELECT structure_id, '_1_', chain_name, '_', CAST(pdb_start AS TEXT), '-', CAST(pdb_end AS TEXT) FROM chain WHERE rfam_acc='{f}';")
-        filelist = [ ''.join(list(x))+'.cif' for x in r ]
+        filelist = sorted([ ''.join(list(x))+'.cif' for x in r ])
-    for s in align:
+    if not multithread:
-        filename = ''
+        pbar = tqdm(total = len(align), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} {label} distance matrices", unit="chains", leave=False)
-        for file in filelist:
+        pbar.update(0)
-            if file.startswith(s.id.replace('-', '').replace('[', '_').replace(']', '_')):
+        for s in align:
-                filename = path_to_3D_data + "rna_mapped_to_Rfam/" + file
+            contrib, d, dsquared = par_distance_matrix(filelist, f, label, consider_all_atoms, s)
-                break
+            if d is not None:
-        if len(filename):
+                found += 1
-            found += 1
+                counts += contrib
-            try:
+                avg += d
-                euclidian_distance = get_matrix_euclidian_distance(filename, s.seq, consider_all_atoms)
+                std += dsquared
-                np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', euclidian_distance, delimiter=",", fmt="%.3f")
+            else:
-                family_matrices.append(euclidian_distance)
-            except FileNotFoundError:
-                found -= 1
                 notfound += 1
-        else:
+            pbar.update(1)
-            notfound += 1
+        pbar.close()
-        pbar.update(1)
+    else:
+        # We split the work for one family on multiple workers.
+        
+        p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
+        try:
+            fam_pbar = tqdm(total=len(align), desc=f"{f} {label} pair distances", position=0, unit="chain", leave=True)
+            # Apply par_distance_matrix to each sequence of the alignment
+            for i, (contrib, d, dsquared) in enumerate(p.imap_unordered(partial(par_distance_matrix, filelist, f, label, consider_all_atoms), align, chunksize=1)):
+                if d is not None:
+                    found += 1
+                    counts += contrib
+                    avg += d
+                    std += dsquared
+                else:
+                    notfound += 1
+                fam_pbar.update(1)
+            fam_pbar.close()
+            p.close()
+            p.join()
+        except KeyboardInterrupt:
+            warn("KeyboardInterrupt, terminating workers.", error=True)
+            fam_pbar.close()
+            p.terminate()
+            p.join()
+            exit(1)
+
     # Calculation of the average matrix
-    avgarray = np.array(family_matrices)
+    avg = avg/counts
-    if len(avgarray) == 0 or np.prod(avgarray.shape) == 0: 
+    np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_average.csv' , avg, delimiter=",", fmt="%.3f")
-        warn(f"Something's wrong with the shapes: {avgarray.shape}", error=True)
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore", category=RuntimeWarning)
-        matrix_average_distances = np.nanmean(avgarray, axis=0 )
-    
-    if len(matrix_average_distances) != 0:
-        matrix_average_distances = np.nan_to_num(matrix_average_distances)
-        np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_average.csv' , np.triu(matrix_average_distances), delimiter=",", fmt="%.3f")
     fig, ax = plt.subplots()
-    im = ax.imshow(matrix_average_distances)
+    im = ax.imshow(avg)
     cbar = ax.figure.colorbar(im, ax=ax)
     cbar.ax.set_ylabel("Angströms", rotation=-90, va="bottom")
-    ax.set_title("Average distance between residues (Angströms)")
+    ax.set_title(f"Average distance between {f} residues (Angströms)")
     fig.tight_layout()
     fig.savefig(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_average.png', dpi=300)
     plt.close()
-    # Calculation of the standard deviation matrix
+    # Calculation of the standard deviation matrix by the Huygens theorem
-    with warnings.catch_warnings():
+    std = np.sqrt(std/counts - np.power(avg, 2))
-        warnings.simplefilter("ignore", category=RuntimeWarning)
+    np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_stdev.csv' , std, delimiter=",", fmt="%.3f")
-        matrix_standard_deviation_distances = np.nanstd(avgarray, axis=0 )
-
-    if len(matrix_standard_deviation_distances) != 0:
-        matrix_standard_deviation_distances = np.nan_to_num(matrix_standard_deviation_distances)
-        np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_stdev.csv' , np.triu(matrix_standard_deviation_distances), delimiter=",", fmt="%.3f")
     fig, ax = plt.subplots()
-    im = ax.imshow(matrix_standard_deviation_distances)
+    im = ax.imshow(std)
     cbar = ax.figure.colorbar(im, ax=ax)
     cbar.ax.set_ylabel("Angströms", rotation=-90, va="bottom")
-    ax.set_title("Average distance between residues (Angströms)")
+    ax.set_title(f"Standard deviation of distances between {f} residues (Angströms)")
     fig.tight_layout()
     fig.savefig(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_std.png', dpi=300)
     plt.close()
     # Save log
     with open(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '.log', 'a') as logfile:
-        logfile.write(str(found)+ " chains taken into account for computation. "+ str(notfound)+ " were not found.\n")
+        logfile.write(str(found)+ " chains taken into account for computation. "+ str(notfound)+ " were not found/without atoms.\n")
     # Save associated nucleotide frequencies (off-topic but convenient to do it here)
     with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
@@ -1040,10 +1062,9 @@ def get_avg_std_distance_matrix(f, consider_all_atoms):
         df = pd.read_sql_query(f"SELECT freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus FROM align_column WHERE rfam_acc = '{f}' AND index_ali > 0 ORDER BY index_ali ASC;", conn)
         df.to_csv(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_frequencies.csv', float_format="%.3f")
-    pbar.close()
+    if not multithread:
-
+        idxQueue.put(thr_idx) # replace the thread index in the queue
-    idxQueue.put(thr_idx) # replace the thread index in the queue
+        setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
-    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
     return 0
 def log_to_pbar(pbar):
@@ -1069,33 +1090,34 @@ def nt_3d_centers(cif_file, consider_all_atoms):
     """Return the nucleotides' coordinates, summarizing a nucleotide by only one point.
     If consider_all_atoms : barycentre is used
     else: C1' atom is the nucleotide
+
+    Some residues have no C1' atom (e.g. in 4v7f-3); np.nan is stored for them instead of coordinates.
     """
     result  =[]
     structure = MMCIFParser().get_structure(cif_file, cif_file)
-    if consider_all_atoms == True:
+    for model in structure:
-        for model in structure:
+        for chain in model:
-            for chain in model:
+            for residue in chain:
-                for residue in chain:
+                if consider_all_atoms:
                     temp_list = []
-                    res_isobaricentre = 0
                     for atom in residue:
                         temp_list.append(atom.get_coord())
                     lg = len(temp_list)
-                    
                     summ = np.sum(temp_list, axis = 0)
                     res_isobaricentre = [summ[0]/lg, summ[1]/lg, summ[2]/lg]
                     result.append([res_isobaricentre[0], res_isobaricentre[1], res_isobaricentre[2]])
-     
+                else:
-    elif consider_all_atoms == False:
+                    coordinates = None
-        for model in structure:
-            for chain in model:
-                for residue in chain:
                     for atom in residue:
                         if atom.get_name() == "C1'":
                             coordinates = atom.get_coord()
-                            res = [coordinates[0], coordinates[1], coordinates[2]]
+                    if coordinates is None:
-                            result.append(res)
+                        # Residue has no C1'
+                        res = np.nan
+                    else:
+                        res = [coordinates[0], coordinates[1], coordinates[2]]
+                    result.append(res)
     return(result)
 def get_euclidian_distance(L1, L2):
@@ -1214,9 +1236,9 @@ if __name__ == "__main__":
                 e2 = file.split('_')[1]
                 e3 = file.split('_')[2]
                 extracted_chains.append(e1 + '[' + e2 + ']' + '-' + e3)
-        for f in famlist:
+        for f in [ x for x in famlist if (x not in LSU_set and x not in SSU_set) ]:    # Process the rRNAs later only 3 by 3
-            joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, True)))
+            joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, True, False)))
-            joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, False)))
+            joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, False, False)))
     joblist.append(Job(function=stats_len)) # Computes figures
     joblist.append(Job(function=stats_freq)) # updates the database
     for f in famlist:
@@ -1224,7 +1246,7 @@ if __name__ == "__main__":
         if f not in ignored:
             joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database
-    p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=int(0.7*nworkers))
+    p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
     pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, unit="job", leave=True)
     try:
@@ -1242,6 +1264,15 @@ if __name__ == "__main__":
     except:
         print("Something went wrong")
+    # Now process the memory-heavy tasks family by family
+    if DO_AVG_DISTANCE_MATRIX:
+        for f in LSU_set:
+            get_avg_std_distance_matrix(f, True, True)
+            get_avg_std_distance_matrix(f, False, True)
+        for f in SSU_set:
+            get_avg_std_distance_matrix(f, True, True)
+            get_avg_std_distance_matrix(f, False, True)
+
     print()
     print()