Louis BECQUEY

split rRNA distance matrices computations apart

...@@ -1483,7 +1483,7 @@ class Pipeline: ...@@ -1483,7 +1483,7 @@ class Pipeline:
1483 os.makedirs(path_to_seq_data + "realigned/3D_only", exist_ok=True) 1483 os.makedirs(path_to_seq_data + "realigned/3D_only", exist_ok=True)
1484 subprocess.run(["cp", path_to_seq_data + "realigned/*_3d_only.afa", path_to_seq_data + "realigned/3d_only" ]) 1484 subprocess.run(["cp", path_to_seq_data + "realigned/*_3d_only.afa", path_to_seq_data + "realigned/3d_only" ])
1485 subprocess.run(["rm", "-f", runDir + f"/archive/RNANET_alignments_latest.tar.gz"]) 1485 subprocess.run(["rm", "-f", runDir + f"/archive/RNANET_alignments_latest.tar.gz"])
1486 - subprocess.run(["tar", "-C", path_to_seq_data + "realigned/3d_only" , "-czf", runDir + f"/archive/RNANET_alignments_latest.tar.gz", "."]) 1486 + subprocess.run(["tar", "-C", path_to_seq_data + "realigned/3D_only" , "-czf", runDir + f"/archive/RNANET_alignments_latest.tar.gz", "."])
1487 1487
1488 def sanitize_database(self): 1488 def sanitize_database(self):
1489 """Searches for issues in the database and correct them""" 1489 """Searches for issues in the database and correct them"""
......
...@@ -911,128 +911,150 @@ def general_stats(): ...@@ -911,128 +911,150 @@ def general_stats():
911 fig.savefig(runDir + "/results/figures/Nfamilies.png") 911 fig.savefig(runDir + "/results/figures/Nfamilies.png")
912 plt.close() 912 plt.close()
913 913
914 -def get_matrix_euclidian_distance(cif_file, aligned_seq, consider_all_atoms): 914 +def par_distance_matrix(filelist, f, label, consider_all_atoms, s):
915 - """ 915 +
916 - This function 916 + # Identify the right 3D file
917 - - loads the coordinates and the alignment, reconctructs the alignment but with coordinates, considering gaps, and 917 + filename = ''
918 - - compute the matrix of euclidian distances. 918 + for file in filelist:
919 + if file.startswith(s.id.replace('-', '').replace('[', '_').replace(']', '_')):
920 + filename = path_to_3D_data + "rna_mapped_to_Rfam/" + file
921 + break
922 + if not len(filename):
923 + return None, None, None
924 +
925 + # Get the coordinates of every existing nt in the 3D file
926 + try:
927 + coordinates = nt_3d_centers(filename, consider_all_atoms)
928 + if not len(coordinates):
929 + # there is not nucleotides in the file, or no C1' atoms for example.
930 + warn("No C1' atoms in " + filename)
931 + return None, None, None
932 + except FileNotFoundError:
933 + return None, None, None
919 934
920 - Returns:
921 - The 2D np.array of euclidian distances between pairs of nucleotides, with np.NaNs in gap columns.
922 - """
923 - # Load the baricenter coordinates
924 - coordinates = nt_3d_centers(cif_file, consider_all_atoms)
925 935
926 - # reconstruct the alignment but with coordinates 936 + # Get the coordinates of every position in the alignment
927 nb_gap = 0 937 nb_gap = 0
928 coordinates_with_gaps = [] 938 coordinates_with_gaps = []
929 - for i in range(len(aligned_seq)): 939 + for i, letter in enumerate(s.seq):
930 - if aligned_seq[i] == '.' or aligned_seq[i] == '-': 940 + if letter in "-.":
931 - nb_gap = nb_gap + 1 941 + nb_gap += 1
932 - coordinates_with_gaps.append('NA') 942 + coordinates_with_gaps.append(np.nan)
933 else: 943 else:
934 coordinates_with_gaps.append(coordinates[i - nb_gap]) 944 coordinates_with_gaps.append(coordinates[i - nb_gap])
935 -
936 - nb_nucleotides = len(coordinates_with_gaps) # number of nucleotides
937 - matrix = np.zeros((nb_nucleotides, nb_nucleotides)) # create a new empty matrix of size nxn
938 945
939 - # Fill this new matrix with the euclidians distances between all amino acids considering gaps: 946 + # Build the pairwise distances
940 - for i in range(nb_nucleotides): 947 + print("> Computing distances for", s.id)
941 - for j in range(nb_nucleotides): 948 + d = np.zeros((len(s.seq), len(s.seq)), dtype=np.float16)
942 - if coordinates_with_gaps[i] == 'NA' or coordinates_with_gaps[j] == 'NA': 949 + for i in range(len(s.seq)):
943 - matrix[i][j] = np.nan 950 + for j in range(len(s.seq)):
951 + if np.isnan(coordinates_with_gaps[i]).any() or np.isnan(coordinates_with_gaps[j]).any():
952 + d[i,j] = np.nan
944 else: 953 else:
945 - matrix[i][j] = round(get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j]),3) 954 + d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j])
946 - return(matrix) 955 +
956 + print("> finished.")
957 + np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f")
958 + return 1-np.isnan(d).astype(int), np.nan_to_num(d), np.nan_to_num(d*d)
947 959
948 @trace_unhandled_exceptions 960 @trace_unhandled_exceptions
949 -def get_avg_std_distance_matrix(f, consider_all_atoms): 961 +def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False):
950 - # Get a worker number to position the progress bar
951 - global idxQueue
952 - thr_idx = idxQueue.get()
953 -
954 - setproctitle(f"RNANet statistics.py Worker {thr_idx+1} {f} residue distance matrices")
955 -
956 if consider_all_atoms: 962 if consider_all_atoms:
957 label = "base" 963 label = "base"
958 else: 964 else:
959 label = "backbone" 965 label = "backbone"
960 966
967 + if not multithread:
968 + # This function call is for ONE worker.
969 + # Get a worker number for it to position the progress bar
970 + global idxQueue
971 + thr_idx = idxQueue.get()
972 + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} {f} {label} distance matrices")
973 +
961 os.makedirs(runDir + '/results/distance_matrices/' + f + '_' + label, exist_ok=True ) 974 os.makedirs(runDir + '/results/distance_matrices/' + f + '_' + label, exist_ok=True )
962 975
963 -
964 - family_matrices = []
965 align = AlignIO.read(path_to_seq_data + f"realigned/{f}_3d_only.afa", "fasta") 976 align = AlignIO.read(path_to_seq_data + f"realigned/{f}_3d_only.afa", "fasta")
977 + ncols = align.get_alignment_length()
978 + counts = np.zeros((ncols, ncols))
979 + avg = np.zeros((ncols, ncols))
980 + std = np.zeros((ncols, ncols))
966 found = 0 981 found = 0
967 notfound = 0 982 notfound = 0
968 - pbar = tqdm(total = len(align), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} {label} distance matrices", unit="chains", leave=False)
969 - pbar.update(0)
970 with sqlite3.connect(runDir + "/results/RNANet.db") as conn: 983 with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
971 conn.execute('pragma journal_mode=wal') 984 conn.execute('pragma journal_mode=wal')
972 r = sql_ask_database(conn, f"SELECT structure_id, '_1_', chain_name, '_', CAST(pdb_start AS TEXT), '-', CAST(pdb_end AS TEXT) FROM chain WHERE rfam_acc='{f}';") 985 r = sql_ask_database(conn, f"SELECT structure_id, '_1_', chain_name, '_', CAST(pdb_start AS TEXT), '-', CAST(pdb_end AS TEXT) FROM chain WHERE rfam_acc='{f}';")
973 - filelist = [ ''.join(list(x))+'.cif' for x in r ] 986 + filelist = sorted([ ''.join(list(x))+'.cif' for x in r ])
974 987
975 - for s in align: 988 + if not multithread:
976 - filename = '' 989 + pbar = tqdm(total = len(align), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} {label} distance matrices", unit="chains", leave=False)
977 - for file in filelist: 990 + pbar.update(0)
978 - if file.startswith(s.id.replace('-', '').replace('[', '_').replace(']', '_')): 991 + for s in align:
979 - filename = path_to_3D_data + "rna_mapped_to_Rfam/" + file 992 + contrib, d, dsquared = par_distance_matrix(filelist, f, label, consider_all_atoms, s)
980 - break 993 + if d is not None:
981 - if len(filename): 994 + found += 1
982 - found += 1 995 + counts += contrib
983 - try: 996 + avg += d
984 - euclidian_distance = get_matrix_euclidian_distance(filename, s.seq, consider_all_atoms) 997 + std += dsquared
985 - np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', euclidian_distance, delimiter=",", fmt="%.3f") 998 + else:
986 - family_matrices.append(euclidian_distance)
987 - except FileNotFoundError:
988 - found -= 1
989 notfound += 1 999 notfound += 1
990 - else: 1000 + pbar.update(1)
991 - notfound += 1 1001 + pbar.close()
992 - pbar.update(1) 1002 + else:
1003 + # We split the work for one family on multiple workers.
1004 +
1005 + p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
1006 + try:
1007 + fam_pbar = tqdm(total=len(align), desc=f"{f} {label} pair distances", position=0, unit="chain", leave=True)
1008 + # Apply work_pssm_remap to each RNA family
1009 + for i, (contrib, d, dsquared) in enumerate(p.imap_unordered(partial(par_distance_matrix, filelist, f, label, consider_all_atoms), align, chunksize=1)):
1010 + if d is not None:
1011 + found += 1
1012 + counts += contrib
1013 + avg += d
1014 + std += dsquared
1015 + else:
1016 + notfound += 1
1017 + fam_pbar.update(1)
1018 + fam_pbar.close()
1019 + p.close()
1020 + p.join()
1021 + except KeyboardInterrupt:
1022 + warn("KeyboardInterrupt, terminating workers.", error=True)
1023 + fam_pbar.close()
1024 + p.terminate()
1025 + p.join()
1026 + exit(1)
1027 +
993 1028
994 # Calculation of the average matrix 1029 # Calculation of the average matrix
995 - avgarray = np.array(family_matrices) 1030 + avg = avg/counts
996 - if len(avgarray) == 0 or np.prod(avgarray.shape) == 0: 1031 + np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_average.csv' , avg, delimiter=",", fmt="%.3f")
997 - warn(f"Something's wrong with the shapes: {avgarray.shape}", error=True)
998 - with warnings.catch_warnings():
999 - warnings.simplefilter("ignore", category=RuntimeWarning)
1000 - matrix_average_distances = np.nanmean(avgarray, axis=0 )
1001 -
1002 - if len(matrix_average_distances) != 0:
1003 - matrix_average_distances = np.nan_to_num(matrix_average_distances)
1004 - np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_average.csv' , np.triu(matrix_average_distances), delimiter=",", fmt="%.3f")
1005 1032
1006 fig, ax = plt.subplots() 1033 fig, ax = plt.subplots()
1007 - im = ax.imshow(matrix_average_distances) 1034 + im = ax.imshow(avg)
1008 cbar = ax.figure.colorbar(im, ax=ax) 1035 cbar = ax.figure.colorbar(im, ax=ax)
1009 cbar.ax.set_ylabel("Angströms", rotation=-90, va="bottom") 1036 cbar.ax.set_ylabel("Angströms", rotation=-90, va="bottom")
1010 - ax.set_title("Average distance between residues (Angströms)") 1037 + ax.set_title(f"Average distance between {f} residues (Angströms)")
1011 fig.tight_layout() 1038 fig.tight_layout()
1012 fig.savefig(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_average.png', dpi=300) 1039 fig.savefig(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_average.png', dpi=300)
1013 plt.close() 1040 plt.close()
1014 1041
1015 - # Calculation of the standard deviation matrix 1042 + # Calculation of the standard deviation matrix by the Huygens theorem
1016 - with warnings.catch_warnings(): 1043 + std = np.sqrt(std/counts - np.power(avg, 2))
1017 - warnings.simplefilter("ignore", category=RuntimeWarning) 1044 + np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_stdev.csv' , std, delimiter=",", fmt="%.3f")
1018 - matrix_standard_deviation_distances = np.nanstd(avgarray, axis=0 )
1019 -
1020 - if len(matrix_standard_deviation_distances) != 0:
1021 - matrix_standard_deviation_distances = np.nan_to_num(matrix_standard_deviation_distances)
1022 - np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_stdev.csv' , np.triu(matrix_standard_deviation_distances), delimiter=",", fmt="%.3f")
1023 1045
1024 fig, ax = plt.subplots() 1046 fig, ax = plt.subplots()
1025 - im = ax.imshow(matrix_standard_deviation_distances) 1047 + im = ax.imshow(std)
1026 cbar = ax.figure.colorbar(im, ax=ax) 1048 cbar = ax.figure.colorbar(im, ax=ax)
1027 cbar.ax.set_ylabel("Angströms", rotation=-90, va="bottom") 1049 cbar.ax.set_ylabel("Angströms", rotation=-90, va="bottom")
1028 - ax.set_title("Average distance between residues (Angströms)") 1050 + ax.set_title(f"Standard deviation of distances between {f} residues (Angströms)")
1029 fig.tight_layout() 1051 fig.tight_layout()
1030 fig.savefig(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_std.png', dpi=300) 1052 fig.savefig(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_std.png', dpi=300)
1031 plt.close() 1053 plt.close()
1032 1054
1033 # Save log 1055 # Save log
1034 with open(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '.log', 'a') as logfile: 1056 with open(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '.log', 'a') as logfile:
1035 - logfile.write(str(found)+ " chains taken into account for computation. "+ str(notfound)+ " were not found.\n") 1057 + logfile.write(str(found)+ " chains taken into account for computation. "+ str(notfound)+ " were not found/without atoms.\n")
1036 1058
1037 # Save associated nucleotide frequencies (off-topic but convenient to do it here) 1059 # Save associated nucleotide frequencies (off-topic but convenient to do it here)
1038 with sqlite3.connect(runDir + "/results/RNANet.db") as conn: 1060 with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
...@@ -1040,10 +1062,9 @@ def get_avg_std_distance_matrix(f, consider_all_atoms): ...@@ -1040,10 +1062,9 @@ def get_avg_std_distance_matrix(f, consider_all_atoms):
1040 df = pd.read_sql_query(f"SELECT freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus FROM align_column WHERE rfam_acc = '{f}' AND index_ali > 0 ORDER BY index_ali ASC;", conn) 1062 df = pd.read_sql_query(f"SELECT freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus FROM align_column WHERE rfam_acc = '{f}' AND index_ali > 0 ORDER BY index_ali ASC;", conn)
1041 df.to_csv(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_frequencies.csv', float_format="%.3f") 1063 df.to_csv(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_frequencies.csv', float_format="%.3f")
1042 1064
1043 - pbar.close() 1065 + if not multithread:
1044 - 1066 + idxQueue.put(thr_idx) # replace the thread index in the queue
1045 - idxQueue.put(thr_idx) # replace the thread index in the queue 1067 + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
1046 - setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
1047 return 0 1068 return 0
1048 1069
1049 def log_to_pbar(pbar): 1070 def log_to_pbar(pbar):
...@@ -1069,33 +1090,34 @@ def nt_3d_centers(cif_file, consider_all_atoms): ...@@ -1069,33 +1090,34 @@ def nt_3d_centers(cif_file, consider_all_atoms):
1069 """Return the nucleotides' coordinates, summarizing a nucleotide by only one point. 1090 """Return the nucleotides' coordinates, summarizing a nucleotide by only one point.
1070 If consider_all_atoms : barycentre is used 1091 If consider_all_atoms : barycentre is used
1071 else: C1' atom is the nucleotide 1092 else: C1' atom is the nucleotide
1093 +
1094 + Some chains have no C1' (e.g. 4v7f-3), therefore, an empty result is returned.
1072 """ 1095 """
1073 result =[] 1096 result =[]
1074 structure = MMCIFParser().get_structure(cif_file, cif_file) 1097 structure = MMCIFParser().get_structure(cif_file, cif_file)
1075 1098
1076 - if consider_all_atoms == True: 1099 + for model in structure:
1077 - for model in structure: 1100 + for chain in model:
1078 - for chain in model: 1101 + for residue in chain:
1079 - for residue in chain: 1102 + if consider_all_atoms:
1080 temp_list = [] 1103 temp_list = []
1081 - res_isobaricentre = 0
1082 for atom in residue: 1104 for atom in residue:
1083 temp_list.append(atom.get_coord()) 1105 temp_list.append(atom.get_coord())
1084 lg = len(temp_list) 1106 lg = len(temp_list)
1085 -
1086 summ = np.sum(temp_list, axis = 0) 1107 summ = np.sum(temp_list, axis = 0)
1087 res_isobaricentre = [summ[0]/lg, summ[1]/lg, summ[2]/lg] 1108 res_isobaricentre = [summ[0]/lg, summ[1]/lg, summ[2]/lg]
1088 result.append([res_isobaricentre[0], res_isobaricentre[1], res_isobaricentre[2]]) 1109 result.append([res_isobaricentre[0], res_isobaricentre[1], res_isobaricentre[2]])
1089 - 1110 + else:
1090 - elif consider_all_atoms == False: 1111 + coordinates = None
1091 - for model in structure:
1092 - for chain in model:
1093 - for residue in chain:
1094 for atom in residue: 1112 for atom in residue:
1095 if atom.get_name() == "C1'": 1113 if atom.get_name() == "C1'":
1096 coordinates = atom.get_coord() 1114 coordinates = atom.get_coord()
1097 - res = [coordinates[0], coordinates[1], coordinates[2]] 1115 + if coordinates is None:
1098 - result.append(res) 1116 + # Residue has no C1'
1117 + res = np.nan
1118 + else:
1119 + res = [coordinates[0], coordinates[1], coordinates[2]]
1120 + result.append(res)
1099 return(result) 1121 return(result)
1100 1122
1101 def get_euclidian_distance(L1, L2): 1123 def get_euclidian_distance(L1, L2):
...@@ -1214,9 +1236,9 @@ if __name__ == "__main__": ...@@ -1214,9 +1236,9 @@ if __name__ == "__main__":
1214 e2 = file.split('_')[1] 1236 e2 = file.split('_')[1]
1215 e3 = file.split('_')[2] 1237 e3 = file.split('_')[2]
1216 extracted_chains.append(e1 + '[' + e2 + ']' + '-' + e3) 1238 extracted_chains.append(e1 + '[' + e2 + ']' + '-' + e3)
1217 - for f in famlist: 1239 + for f in [ x for x in famlist if (x not in LSU_set and x not in SSU_set) ]: # Process the rRNAs later only 3 by 3
1218 - joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, True))) 1240 + joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, True, False)))
1219 - joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, False))) 1241 + joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, False, False)))
1220 joblist.append(Job(function=stats_len)) # Computes figures 1242 joblist.append(Job(function=stats_len)) # Computes figures
1221 joblist.append(Job(function=stats_freq)) # updates the database 1243 joblist.append(Job(function=stats_freq)) # updates the database
1222 for f in famlist: 1244 for f in famlist:
...@@ -1224,7 +1246,7 @@ if __name__ == "__main__": ...@@ -1224,7 +1246,7 @@ if __name__ == "__main__":
1224 if f not in ignored: 1246 if f not in ignored:
1225 joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database 1247 joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database
1226 1248
1227 - p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=int(0.7*nworkers)) 1249 + p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
1228 pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, unit="job", leave=True) 1250 pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, unit="job", leave=True)
1229 1251
1230 try: 1252 try:
...@@ -1242,6 +1264,15 @@ if __name__ == "__main__": ...@@ -1242,6 +1264,15 @@ if __name__ == "__main__":
1242 except: 1264 except:
1243 print("Something went wrong") 1265 print("Something went wrong")
1244 1266
1267 + # Now process the memory-heavy tasks family by family
1268 + if DO_AVG_DISTANCE_MATRIX:
1269 + for f in LSU_set:
1270 + get_avg_std_distance_matrix(f, True, True)
1271 + get_avg_std_distance_matrix(f, False, True)
1272 + for f in SSU_set:
1273 + get_avg_std_distance_matrix(f, True, True)
1274 + get_avg_std_distance_matrix(f, False, True)
1275 +
1245 print() 1276 print()
1246 print() 1277 print()
1247 1278
......