Louis BECQUEY

Dist matrices only for match states positions

...@@ -2329,7 +2329,7 @@ def work_realign(rfam_acc): ...@@ -2329,7 +2329,7 @@ def work_realign(rfam_acc):
2329 2329
2330 # Align the new sequences 2330 # Align the new sequences
2331 with open(new_ali_path, 'w') as o: 2331 with open(new_ali_path, 'w') as o:
2332 - p1 = subprocess.run(["cmalign", "--nonbanded", "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins", 2332 + p1 = subprocess.run(["cmalign", "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins",
2333 "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv", 2333 "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv",
2334 path_to_seq_data + f"realigned/{rfam_acc}.cm", 2334 path_to_seq_data + f"realigned/{rfam_acc}.cm",
2335 path_to_seq_data + f"realigned/{rfam_acc}_new.fa"], 2335 path_to_seq_data + f"realigned/{rfam_acc}_new.fa"],
...@@ -2371,7 +2371,7 @@ def work_realign(rfam_acc): ...@@ -2371,7 +2371,7 @@ def work_realign(rfam_acc):
2371 # Alignment does not exist yet. We need to compute it from scratch. 2371 # Alignment does not exist yet. We need to compute it from scratch.
2372 print(f"\t> Aligning {rfam_acc} sequences together (cmalign) ...", end='', flush=True) 2372 print(f"\t> Aligning {rfam_acc} sequences together (cmalign) ...", end='', flush=True)
2373 2373
2374 - p = subprocess.run(["cmalign", "--nonbanded", '-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk", 2374 + p = subprocess.run(["cmalign", '-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk",
2375 "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins", "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv", 2375 "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins", "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv",
2376 path_to_seq_data + f"realigned/{rfam_acc}.cm", 2376 path_to_seq_data + f"realigned/{rfam_acc}.cm",
2377 path_to_seq_data + f"realigned/{rfam_acc}++.fa"], 2377 path_to_seq_data + f"realigned/{rfam_acc}++.fa"],
...@@ -2567,28 +2567,32 @@ def work_pssm_remap(f): ...@@ -2567,28 +2567,32 @@ def work_pssm_remap(f):
2567 setproctitle(f"RNAnet.py work_pssm_remap({f}) saving") 2567 setproctitle(f"RNAnet.py work_pssm_remap({f}) saving")
2568 2568
2569 # Get back the information of match/insertion states from the STK file 2569 # Get back the information of match/insertion states from the STK file
2570 - alignstk = AlignIO.read(path_to_seq_data + "realigned/" + f + "++.stk", "stockholm") 2570 + if f not in SSU_set and f not in LSU_set:
2571 - consensus_2d = alignstk.column_annotations["secondary_structure"] 2571 + alignstk = AlignIO.read(path_to_seq_data + "realigned/" + f + "++.stk", "stockholm")
2572 - del alignstk 2572 + consensus_2d = alignstk.column_annotations["secondary_structure"]
2573 - cm_coord = 1 2573 + del alignstk
2574 - cm_coords = [] 2574 + cm_coord = 1
2575 - cm_2d = [] 2575 + cm_coords = []
2576 - for x in consensus_2d: 2576 + cm_2d = []
2577 - if x == ".": 2577 + for x in consensus_2d:
2578 - cm_coords.append(None) 2578 + if x in ".~":
2579 - cm_2d.append(None) 2579 + cm_coords.append(None)
2580 - else: 2580 + cm_2d.append(None)
2581 - cm_coords.append(cm_coord)
2582 - if x in "[(<{":
2583 - cm_2d.append("(")
2584 - elif x in "])>}":
2585 - cm_2d.append(")")
2586 - elif x in ",_-:":
2587 - cm_2d.append(".")
2588 else: 2581 else:
2589 - warn("Unsupported WUSS secondary structure symbol : "+x) 2582 + cm_coords.append(cm_coord)
2590 - cm_2d.append(".") 2583 + if x in "[(<{":
2591 - cm_coord += 1 2584 + cm_2d.append("(")
2585 + elif x in "])>}":
2586 + cm_2d.append(")")
2587 + elif x in ",_-:":
2588 + cm_2d.append(".")
2589 + else:
2590 + warn("Unsupported WUSS secondary structure symbol : "+x)
2591 + cm_2d.append(".")
2592 + cm_coord += 1
2593 + else:
2594 + cm_coords = [ None for x in range(ncols) ]
2595 + cm_2d = [ None for x in range(ncols) ]
2592 2596
2593 # Save the re_mappings 2597 # Save the re_mappings
2594 conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0) 2598 conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0)
...@@ -2615,7 +2619,7 @@ def work_pssm_remap(f): ...@@ -2615,7 +2619,7 @@ def work_pssm_remap(f):
2615 freq_other=excluded.freq_other, gap_percent=excluded.gap_percent, consensus=excluded.consensus, cons_sec_struct=excluded.cons_sec_struct;""", many=True, data=data) 2619 freq_other=excluded.freq_other, gap_percent=excluded.gap_percent, consensus=excluded.consensus, cons_sec_struct=excluded.cons_sec_struct;""", many=True, data=data)
2616 # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment) 2620 # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment)
2617 sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct) 2621 sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct)
2618 - VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', "NULL");""", data=(f,)) 2622 + VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', NULL);""", data=(f,))
2619 # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains) 2623 # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains)
2620 sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f)) 2624 sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f))
2621 conn.close() 2625 conn.close()
......
...@@ -25,6 +25,7 @@ from collections import Counter ...@@ -25,6 +25,7 @@ from collections import Counter
25 from setproctitle import setproctitle 25 from setproctitle import setproctitle
26 from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker, trace_unhandled_exceptions 26 from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker, trace_unhandled_exceptions
27 27
28 +np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf, precision=8)
28 path_to_3D_data = "tobedefinedbyoptions" 29 path_to_3D_data = "tobedefinedbyoptions"
29 path_to_seq_data = "tobedefinedbyoptions" 30 path_to_seq_data = "tobedefinedbyoptions"
30 runDir = os.getcwd() 31 runDir = os.getcwd()
...@@ -911,7 +912,7 @@ def general_stats(): ...@@ -911,7 +912,7 @@ def general_stats():
911 fig.savefig(runDir + "/results/figures/Nfamilies.png") 912 fig.savefig(runDir + "/results/figures/Nfamilies.png")
912 plt.close() 913 plt.close()
913 914
914 -def par_distance_matrix(filelist, f, label, consider_all_atoms, s): 915 +def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s):
915 916
916 # Identify the right 3D file 917 # Identify the right 3D file
917 filename = '' 918 filename = ''
...@@ -948,13 +949,43 @@ def par_distance_matrix(filelist, f, label, consider_all_atoms, s): ...@@ -948,13 +949,43 @@ def par_distance_matrix(filelist, f, label, consider_all_atoms, s):
948 for i in range(len(s.seq)): 949 for i in range(len(s.seq)):
949 for j in range(len(s.seq)): 950 for j in range(len(s.seq)):
950 if np.isnan(coordinates_with_gaps[i]).any() or np.isnan(coordinates_with_gaps[j]).any(): 951 if np.isnan(coordinates_with_gaps[i]).any() or np.isnan(coordinates_with_gaps[j]).any():
951 - d[i,j] = np.nan 952 + d[i,j] = np.NaN
952 else: 953 else:
953 d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j]) 954 d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j])
954 955
956 + # Save the individual distance matrices
955 if f not in LSU_set and f not in SSU_set: 957 if f not in LSU_set and f not in SSU_set:
956 np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f") 958 np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f")
957 - return 1-np.isnan(d).astype(int), np.nan_to_num(d), np.nan_to_num(d*d) 959 +
960 + # For the average and sd, we want to consider only positions of the consensus model. This means:
961 + # - Add empty space when we have deletions
962 + # - skip measures that correspond to insertions
963 + i = len(cm_coords)-1
964 + while cm_coords[i] is None:
965 + i -= 1
966 + family_end = int(cm_coords[i])
967 + i = 0
968 + while cm_coords[i] is None:
969 + i += 1
970 + family_start = int(cm_coords[i])
971 + # c = np.zeros((family_end, family_end), dtype=np.float32) # new matrix of size of the consensus model for the family
972 + c = np.NaN * np.ones((family_end, family_end), dtype=np.float32)
973 + # set to NaN zones that never exist in the 3D data
974 + for i in range(family_start-1):
975 + for j in range(i, family_end):
976 + c[i,j] = np.NaN
977 + c[j,i] = np.NaN
978 + # copy the values ignoring insertions
979 + for i in range(len(s.seq)):
980 + if cm_coords[i] is None:
981 + continue
982 + pos_i = int(cm_coords[i])-1
983 + for j in range(len(s.seq)):
984 + if cm_coords[j] is None:
985 + continue
986 + c[pos_i, int(cm_coords[j])-1] = d[i,j]
987 + # return the matrices counts, c, c^2
988 + return 1-np.isnan(c).astype(int), np.nan_to_num(c), np.nan_to_num(c*c)
958 989
959 @trace_unhandled_exceptions 990 @trace_unhandled_exceptions
960 def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False): 991 def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False):
...@@ -976,21 +1007,28 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False): ...@@ -976,21 +1007,28 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False):
976 1007
977 align = AlignIO.read(path_to_seq_data + f"realigned/{f}_3d_only.afa", "fasta") 1008 align = AlignIO.read(path_to_seq_data + f"realigned/{f}_3d_only.afa", "fasta")
978 ncols = align.get_alignment_length() 1009 ncols = align.get_alignment_length()
979 - counts = np.zeros((ncols, ncols))
980 - avg = np.zeros((ncols, ncols))
981 - std = np.zeros((ncols, ncols))
982 found = 0 1010 found = 0
983 notfound = 0 1011 notfound = 0
1012 + # retrieve the mappings between this family's alignment and the CM model:
984 with sqlite3.connect(runDir + "/results/RNANet.db") as conn: 1013 with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
985 conn.execute('pragma journal_mode=wal') 1014 conn.execute('pragma journal_mode=wal')
986 r = sql_ask_database(conn, f"SELECT structure_id, '_1_', chain_name, '_', CAST(pdb_start AS TEXT), '-', CAST(pdb_end AS TEXT) FROM chain WHERE rfam_acc='{f}';") 1015 r = sql_ask_database(conn, f"SELECT structure_id, '_1_', chain_name, '_', CAST(pdb_start AS TEXT), '-', CAST(pdb_end AS TEXT) FROM chain WHERE rfam_acc='{f}';")
987 filelist = sorted([ ''.join(list(x))+'.cif' for x in r ]) 1016 filelist = sorted([ ''.join(list(x))+'.cif' for x in r ])
1017 + r = sql_ask_database(conn, f"SELECT cm_coord FROM align_column WHERE rfam_acc = '{f}' AND index_ali > 0 ORDER BY index_ali ASC;")
1018 + cm_coords = [ x[0] for x in r ]
1019 + i = len(cm_coords)-1
1020 + while cm_coords[i] is None:
1021 + i -= 1
1022 + family_end = int(cm_coords[i])
1023 + counts = np.zeros((family_end, family_end))
1024 + avg = np.zeros((family_end, family_end))
1025 + std = np.zeros((family_end, family_end))
988 1026
989 if not multithread: 1027 if not multithread:
990 pbar = tqdm(total = len(align), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} {label} distance matrices", unit="chains", leave=False) 1028 pbar = tqdm(total = len(align), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} {label} distance matrices", unit="chains", leave=False)
991 pbar.update(0) 1029 pbar.update(0)
992 for s in align: 1030 for s in align:
993 - contrib, d, dsquared = par_distance_matrix(filelist, f, label, consider_all_atoms, s) 1031 + contrib, d, dsquared = par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s)
994 if d is not None: 1032 if d is not None:
995 found += 1 1033 found += 1
996 counts += contrib 1034 counts += contrib
...@@ -1007,7 +1045,7 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False): ...@@ -1007,7 +1045,7 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False):
1007 try: 1045 try:
1008 fam_pbar = tqdm(total=len(align), desc=f"{f} {label} pair distances", position=0, unit="chain", leave=True) 1046 fam_pbar = tqdm(total=len(align), desc=f"{f} {label} pair distances", position=0, unit="chain", leave=True)
1009 # Apply work_pssm_remap to each RNA family 1047 # Apply work_pssm_remap to each RNA family
1010 - for i, (contrib, d, dsquared) in enumerate(p.imap_unordered(partial(par_distance_matrix, filelist, f, label, consider_all_atoms), align, chunksize=1)): 1048 + for i, (contrib, d, dsquared) in enumerate(p.imap_unordered(partial(par_distance_matrix, filelist, f, label, cm_coords, consider_all_atoms), align, chunksize=1)):
1011 if d is not None: 1049 if d is not None:
1012 found += 1 1050 found += 1
1013 counts += contrib 1051 counts += contrib
...@@ -1028,7 +1066,6 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False): ...@@ -1028,7 +1066,6 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False):
1028 1066
1029 # Calculation of the average matrix 1067 # Calculation of the average matrix
1030 avg = np.divide(avg, counts, where=counts>0, out=np.full_like(avg, np.NaN)) # Ultrafancy way to take avg/counts or NaN if counts is 0 1068 avg = np.divide(avg, counts, where=counts>0, out=np.full_like(avg, np.NaN)) # Ultrafancy way to take avg/counts or NaN if counts is 0
1031 -
1032 np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_average.csv' , avg, delimiter=",", fmt="%.3f") 1069 np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_average.csv' , avg, delimiter=",", fmt="%.3f")
1033 1070
1034 fig, ax = plt.subplots() 1071 fig, ax = plt.subplots()
...@@ -1153,7 +1190,7 @@ if __name__ == "__main__": ...@@ -1153,7 +1190,7 @@ if __name__ == "__main__":
1153 1190
1154 if opt == "-h" or opt == "--help": 1191 if opt == "-h" or opt == "--help":
1155 print( "RNANet statistics, a script to build a multiscale RNA dataset from public data\n" 1192 print( "RNANet statistics, a script to build a multiscale RNA dataset from public data\n"
1156 - "Developped by Louis Becquey (louis.becquey@univ-evry.fr), 2020") 1193 + "Developped by Louis Becquey (louis.becquey@univ-evry.fr), 2020/2021")
1157 print() 1194 print()
1158 print("Options:") 1195 print("Options:")
1159 print("-h [ --help ]\t\t\tPrint this help message") 1196 print("-h [ --help ]\t\t\tPrint this help message")
...@@ -1211,6 +1248,7 @@ if __name__ == "__main__": ...@@ -1211,6 +1248,7 @@ if __name__ == "__main__":
1211 famlist = families.rfam_acc.tolist() 1248 famlist = families.rfam_acc.tolist()
1212 ignored = families[families.n_chains < 3].rfam_acc.tolist() 1249 ignored = families[families.n_chains < 3].rfam_acc.tolist()
1213 famlist.sort(key=family_order) 1250 famlist.sort(key=family_order)
1251 +
1214 print(f"Found {len(famlist)} families with chains of resolution {res_thr}A or better.") 1252 print(f"Found {len(famlist)} families with chains of resolution {res_thr}A or better.")
1215 if len(ignored): 1253 if len(ignored):
1216 print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n') 1254 print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')
...@@ -1271,14 +1309,14 @@ if __name__ == "__main__": ...@@ -1271,14 +1309,14 @@ if __name__ == "__main__":
1271 except: 1309 except:
1272 print("Something went wrong") 1310 print("Something went wrong")
1273 1311
1274 - # Now process the memory-heavy tasks family by family 1312 + # # Now process the memory-heavy tasks family by family
1275 - if DO_AVG_DISTANCE_MATRIX: 1313 + # if DO_AVG_DISTANCE_MATRIX:
1276 - for f in LSU_set: 1314 + # for f in LSU_set:
1277 - get_avg_std_distance_matrix(f, True, True) 1315 + # get_avg_std_distance_matrix(f, True, True)
1278 - get_avg_std_distance_matrix(f, False, True) 1316 + # get_avg_std_distance_matrix(f, False, True)
1279 - for f in SSU_set: 1317 + # for f in SSU_set:
1280 - get_avg_std_distance_matrix(f, True, True) 1318 + # get_avg_std_distance_matrix(f, True, True)
1281 - get_avg_std_distance_matrix(f, False, True) 1319 + # get_avg_std_distance_matrix(f, False, True)
1282 1320
1283 print() 1321 print()
1284 print() 1322 print()
......