split rRNA distance matrices computations apart

Louis BECQUEY
Commit 4de494b7787fcb3d68a61c2efdeaed06af84bad2 4de494b7 1 parent dc3fd966
Showing 2 changed files with 130 additions and 99 deletions
RNAnet.py
statistics.py
--- a/RNAnet.py
View file @4de494b
+++ b/RNAnet.py
View file @4de494b
@@ -1483,7 +1483,7 @@ class Pipeline:
             os.makedirs(path_to_seq_data + "realigned/3D_only", exist_ok=True)
             subprocess.run(["cp", path_to_seq_data + "realigned/*_3d_only.afa", path_to_seq_data + "realigned/3d_only" ])
             subprocess.run(["rm", "-f", runDir + f"/archive/RNANET_alignments_latest.tar.gz"])
-             subprocess.run(["tar", "-C", path_to_seq_data + "realigned/3d_only" , "-czf", runDir + f"/archive/RNANET_alignments_latest.tar.gz", "."])
+             subprocess.run(["tar", "-C", path_to_seq_data + "realigned/3D_only" , "-czf", runDir + f"/archive/RNANET_alignments_latest.tar.gz", "."])
 
     def sanitize_database(self):
         """Searches for issues in the database and correct them"""
--- a/statistics.py
View file @4de494b
+++ b/statistics.py
View file @4de494b
@@ -911,128 +911,150 @@ def general_stats():
     fig.savefig(runDir + "/results/figures/Nfamilies.png")
     plt.close()
 
- def get_matrix_euclidian_distance(cif_file, aligned_seq, consider_all_atoms):
-     """
-     This function
-     - loads the coordinates and the alignment, reconctructs the alignment but with coordinates, considering gaps, and 
-     - compute the matrix of euclidian distances.
+ def par_distance_matrix(filelist, f, label, consider_all_atoms, s):
+     
+     # Identify the right 3D file
+     filename = ''
+     for file in filelist:
+         if file.startswith(s.id.replace('-', '').replace('[', '_').replace(']', '_')):
+             filename = path_to_3D_data + "rna_mapped_to_Rfam/" + file
+             break
+     if not len(filename):
+         return None, None, None
+     
+     # Get the coordinates of every existing nt in the 3D file
+     try:
+         coordinates = nt_3d_centers(filename, consider_all_atoms)
+         if not len(coordinates):
+             # there is not nucleotides in the file, or no C1' atoms for example.
+             warn("No C1' atoms in " + filename)
+             return None, None, None
+     except FileNotFoundError:
+         return None, None, None
 
-     Returns:
-     The 2D np.array of euclidian distances between pairs of nucleotides, with np.NaNs in gap columns.
-     """
-     # Load the baricenter coordinates
-     coordinates = nt_3d_centers(cif_file, consider_all_atoms) 
 
-     # reconstruct the alignment but with coordinates
+     # Get the coordinates of every position in the alignment
     nb_gap = 0
     coordinates_with_gaps = []
-     for i in range(len(aligned_seq)):
-         if aligned_seq[i] == '.' or  aligned_seq[i] == '-':
-             nb_gap = nb_gap + 1
-             coordinates_with_gaps.append('NA')
+     for i, letter in enumerate(s.seq):
+         if letter in "-.":
+             nb_gap += 1
+             coordinates_with_gaps.append(np.nan)
         else:
             coordinates_with_gaps.append(coordinates[i - nb_gap])
-    
-     nb_nucleotides = len(coordinates_with_gaps)  # number of nucleotides
-     matrix = np.zeros((nb_nucleotides, nb_nucleotides))  # create a new empty matrix of size nxn
     
-     # Fill this new matrix with the euclidians distances between all amino acids considering gaps:
-     for i in range(nb_nucleotides):
-         for j in range(nb_nucleotides):
-             if coordinates_with_gaps[i] == 'NA' or coordinates_with_gaps[j] == 'NA':
-                 matrix[i][j] = np.nan
+     # Build the pairwise distances
+     print("> Computing distances for", s.id)
+     d = np.zeros((len(s.seq), len(s.seq)), dtype=np.float16)
+     for i in range(len(s.seq)):
+         for j in range(len(s.seq)):
+             if np.isnan(coordinates_with_gaps[i]).any() or np.isnan(coordinates_with_gaps[j]).any():
+                 d[i,j] = np.nan
             else:
-                 matrix[i][j] = round(get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j]),3)
-     return(matrix)
+                 d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j])
+     
+     print("> finished.")
+     np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f")
+     return 1-np.isnan(d).astype(int), np.nan_to_num(d), np.nan_to_num(d*d)
 
 @trace_unhandled_exceptions
- def get_avg_std_distance_matrix(f, consider_all_atoms):
-     # Get a worker number to position the progress bar
-     global idxQueue
-     thr_idx = idxQueue.get()
- 
-     setproctitle(f"RNANet statistics.py Worker {thr_idx+1} {f} residue distance matrices")
- 
+ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False):
     if consider_all_atoms:
         label = "base"
     else:
         label = "backbone"
 
+     if not multithread:
+         # This function call is for ONE worker.
+         # Get a worker number for it to position the progress bar
+         global idxQueue
+         thr_idx = idxQueue.get()
+         setproctitle(f"RNANet statistics.py Worker {thr_idx+1} {f} {label} distance matrices")
+ 
     os.makedirs(runDir + '/results/distance_matrices/' + f + '_' + label, exist_ok=True )   
 
-    
-     family_matrices = []
     align = AlignIO.read(path_to_seq_data + f"realigned/{f}_3d_only.afa", "fasta")
+     ncols = align.get_alignment_length()
+     counts = np.zeros((ncols, ncols))
+     avg = np.zeros((ncols, ncols))
+     std = np.zeros((ncols, ncols))
     found = 0
     notfound = 0
-     pbar = tqdm(total = len(align), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} {label} distance matrices", unit="chains", leave=False)
-     pbar.update(0)
     with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
         conn.execute('pragma journal_mode=wal')
         r = sql_ask_database(conn, f"SELECT structure_id, '_1_', chain_name, '_', CAST(pdb_start AS TEXT), '-', CAST(pdb_end AS TEXT) FROM chain WHERE rfam_acc='{f}';")
-         filelist = [ ''.join(list(x))+'.cif' for x in r ]
+         filelist = sorted([ ''.join(list(x))+'.cif' for x in r ])
     
-     for s in align:
-         filename = ''
-         for file in filelist:
-             if file.startswith(s.id.replace('-', '').replace('[', '_').replace(']', '_')):
-                 filename = path_to_3D_data + "rna_mapped_to_Rfam/" + file
-                 break
-         if len(filename):
-             found += 1
-             try:
-                 euclidian_distance = get_matrix_euclidian_distance(filename, s.seq, consider_all_atoms)
-                 np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', euclidian_distance, delimiter=",", fmt="%.3f")
-                 family_matrices.append(euclidian_distance)
-             except FileNotFoundError:
-                 found -= 1
+     if not multithread:
+         pbar = tqdm(total = len(align), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} {label} distance matrices", unit="chains", leave=False)
+         pbar.update(0)
+         for s in align:
+             contrib, d, dsquared = par_distance_matrix(filelist, f, label, consider_all_atoms, s)
+             if d is not None:
+                 found += 1
+                 counts += contrib
+                 avg += d
+                 std += dsquared
+             else:
                 notfound += 1
-         else:
-             notfound += 1
-         pbar.update(1)
+             pbar.update(1)
+         pbar.close()
+     else:
+         # We split the work for one family on multiple workers.
+         
+         p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
+         try:
+             fam_pbar = tqdm(total=len(align), desc=f"{f} {label} pair distances", position=0, unit="chain", leave=True)
+             # Apply work_pssm_remap to each RNA family
+             for i, (contrib, d, dsquared) in enumerate(p.imap_unordered(partial(par_distance_matrix, filelist, f, label, consider_all_atoms), align, chunksize=1)):
+                 if d is not None:
+                     found += 1
+                     counts += contrib
+                     avg += d
+                     std += dsquared
+                 else:
+                     notfound += 1
+                 fam_pbar.update(1)
+             fam_pbar.close()
+             p.close()
+             p.join()
+         except KeyboardInterrupt:
+             warn("KeyboardInterrupt, terminating workers.", error=True)
+             fam_pbar.close()
+             p.terminate()
+             p.join()
+             exit(1)
+ 
     
     # Calculation of the average matrix
-     avgarray = np.array(family_matrices)
-     if len(avgarray) == 0 or np.prod(avgarray.shape) == 0: 
-         warn(f"Something's wrong with the shapes: {avgarray.shape}", error=True)
-     with warnings.catch_warnings():
-         warnings.simplefilter("ignore", category=RuntimeWarning)
-         matrix_average_distances = np.nanmean(avgarray, axis=0 )
-     
-     if len(matrix_average_distances) != 0:
-         matrix_average_distances = np.nan_to_num(matrix_average_distances)
-         np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_average.csv' , np.triu(matrix_average_distances), delimiter=",", fmt="%.3f")
+     avg = avg/counts
+     np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_average.csv' , avg, delimiter=",", fmt="%.3f")
     
     fig, ax = plt.subplots()
-     im = ax.imshow(matrix_average_distances)
+     im = ax.imshow(avg)
     cbar = ax.figure.colorbar(im, ax=ax)
     cbar.ax.set_ylabel("Angströms", rotation=-90, va="bottom")
-     ax.set_title("Average distance between residues (Angströms)")
+     ax.set_title(f"Average distance between {f} residues (Angströms)")
     fig.tight_layout()
     fig.savefig(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_average.png', dpi=300)
     plt.close()
 
-     # Calculation of the standard deviation matrix
-     with warnings.catch_warnings():
-         warnings.simplefilter("ignore", category=RuntimeWarning)
-         matrix_standard_deviation_distances = np.nanstd(avgarray, axis=0 )
- 
-     if len(matrix_standard_deviation_distances) != 0:
-         matrix_standard_deviation_distances = np.nan_to_num(matrix_standard_deviation_distances)
-         np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_stdev.csv' , np.triu(matrix_standard_deviation_distances), delimiter=",", fmt="%.3f")
+     # Calculation of the standard deviation matrix by the Huygens theorem
+     std = np.sqrt(std/counts - np.power(avg, 2))
+     np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_stdev.csv' , std, delimiter=",", fmt="%.3f")
     
     fig, ax = plt.subplots()
-     im = ax.imshow(matrix_standard_deviation_distances)
+     im = ax.imshow(std)
     cbar = ax.figure.colorbar(im, ax=ax)
     cbar.ax.set_ylabel("Angströms", rotation=-90, va="bottom")
-     ax.set_title("Average distance between residues (Angströms)")
+     ax.set_title(f"Standard deviation of distances between {f} residues (Angströms)")
     fig.tight_layout()
     fig.savefig(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_std.png', dpi=300)
     plt.close()
 
     # Save log
     with open(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '.log', 'a') as logfile:
-         logfile.write(str(found)+ " chains taken into account for computation. "+ str(notfound)+ " were not found.\n")
+         logfile.write(str(found)+ " chains taken into account for computation. "+ str(notfound)+ " were not found/without atoms.\n")
 
     # Save associated nucleotide frequencies (off-topic but convenient to do it here)
     with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
@@ -1040,10 +1062,9 @@ def get_avg_std_distance_matrix(f, consider_all_atoms):
         df = pd.read_sql_query(f"SELECT freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus FROM align_column WHERE rfam_acc = '{f}' AND index_ali > 0 ORDER BY index_ali ASC;", conn)
         df.to_csv(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_frequencies.csv', float_format="%.3f")
 
-     pbar.close()
- 
-     idxQueue.put(thr_idx) # replace the thread index in the queue
-     setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
+     if not multithread:
+         idxQueue.put(thr_idx) # replace the thread index in the queue
+         setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
     return 0
 
 def log_to_pbar(pbar):
@@ -1069,33 +1090,34 @@ def nt_3d_centers(cif_file, consider_all_atoms):
     """Return the nucleotides' coordinates, summarizing a nucleotide by only one point.
     If consider_all_atoms : barycentre is used
     else: C1' atom is the nucleotide
+ 
+     Some chains have no C1' (e.g. 4v7f-3), therefore, an empty result is returned.
     """
     result  =[]
     structure = MMCIFParser().get_structure(cif_file, cif_file)
     
-     if consider_all_atoms == True:
-         for model in structure:
-             for chain in model:
-                 for residue in chain:
+     for model in structure:
+         for chain in model:
+             for residue in chain:
+                 if consider_all_atoms:
                     temp_list = []
-                     res_isobaricentre = 0
                     for atom in residue:
                         temp_list.append(atom.get_coord())
                     lg = len(temp_list)
-                     
                     summ = np.sum(temp_list, axis = 0)
                     res_isobaricentre = [summ[0]/lg, summ[1]/lg, summ[2]/lg]
                     result.append([res_isobaricentre[0], res_isobaricentre[1], res_isobaricentre[2]])
-      
-     elif consider_all_atoms == False:
-         for model in structure:
-             for chain in model:
-                 for residue in chain:
+                 else:
+                     coordinates = None
                     for atom in residue:
                         if atom.get_name() == "C1'":
                             coordinates = atom.get_coord()
-                             res = [coordinates[0], coordinates[1], coordinates[2]]
-                             result.append(res)
+                     if coordinates is None:
+                         # Residue has no C1'
+                         res = np.nan
+                     else:
+                         res = [coordinates[0], coordinates[1], coordinates[2]]
+                     result.append(res)
     return(result)
 
 def get_euclidian_distance(L1, L2):
@@ -1214,9 +1236,9 @@ if __name__ == "__main__":
                 e2 = file.split('_')[1]
                 e3 = file.split('_')[2]
                 extracted_chains.append(e1 + '[' + e2 + ']' + '-' + e3)
-         for f in famlist:
-             joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, True)))
-             joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, False)))
+         for f in [ x for x in famlist if (x not in LSU_set and x not in SSU_set) ]:    # Process the rRNAs later only 3 by 3
+             joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, True, False)))
+             joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, False, False)))
     joblist.append(Job(function=stats_len)) # Computes figures
     joblist.append(Job(function=stats_freq)) # updates the database
     for f in famlist:
@@ -1224,7 +1246,7 @@ if __name__ == "__main__":
         if f not in ignored:
             joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database
 
-     p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=int(0.7*nworkers))
+     p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
     pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, unit="job", leave=True)
 
     try:
@@ -1242,6 +1264,15 @@ if __name__ == "__main__":
     except:
         print("Something went wrong")
 
+     # Now process the memory-heavy tasks family by family
+     if DO_AVG_DISTANCE_MATRIX:
+         for f in LSU_set:
+             get_avg_std_distance_matrix(f, True, True)
+             get_avg_std_distance_matrix(f, False, True)
+         for f in SSU_set:
+             get_avg_std_distance_matrix(f, True, True)
+             get_avg_std_distance_matrix(f, False, True)
+ 
     print()
     print()