Showing
2 changed files
with
84 additions
and
42 deletions
... | @@ -2329,7 +2329,7 @@ def work_realign(rfam_acc): | ... | @@ -2329,7 +2329,7 @@ def work_realign(rfam_acc): |
2329 | 2329 | ||
2330 | # Align the new sequences | 2330 | # Align the new sequences |
2331 | with open(new_ali_path, 'w') as o: | 2331 | with open(new_ali_path, 'w') as o: |
2332 | - p1 = subprocess.run(["cmalign", "--nonbanded", "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins", | 2332 | + p1 = subprocess.run(["cmalign", "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins", |
2333 | "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv", | 2333 | "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv", |
2334 | path_to_seq_data + f"realigned/{rfam_acc}.cm", | 2334 | path_to_seq_data + f"realigned/{rfam_acc}.cm", |
2335 | path_to_seq_data + f"realigned/{rfam_acc}_new.fa"], | 2335 | path_to_seq_data + f"realigned/{rfam_acc}_new.fa"], |
... | @@ -2371,7 +2371,7 @@ def work_realign(rfam_acc): | ... | @@ -2371,7 +2371,7 @@ def work_realign(rfam_acc): |
2371 | # Alignment does not exist yet. We need to compute it from scratch. | 2371 | # Alignment does not exist yet. We need to compute it from scratch. |
2372 | print(f"\t> Aligning {rfam_acc} sequences together (cmalign) ...", end='', flush=True) | 2372 | print(f"\t> Aligning {rfam_acc} sequences together (cmalign) ...", end='', flush=True) |
2373 | 2373 | ||
2374 | - p = subprocess.run(["cmalign", "--nonbanded", '-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk", | 2374 | + p = subprocess.run(["cmalign", '-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk", |
2375 | "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins", "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv", | 2375 | "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins", "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv", |
2376 | path_to_seq_data + f"realigned/{rfam_acc}.cm", | 2376 | path_to_seq_data + f"realigned/{rfam_acc}.cm", |
2377 | path_to_seq_data + f"realigned/{rfam_acc}++.fa"], | 2377 | path_to_seq_data + f"realigned/{rfam_acc}++.fa"], |
... | @@ -2567,28 +2567,32 @@ def work_pssm_remap(f): | ... | @@ -2567,28 +2567,32 @@ def work_pssm_remap(f): |
2567 | setproctitle(f"RNAnet.py work_pssm_remap({f}) saving") | 2567 | setproctitle(f"RNAnet.py work_pssm_remap({f}) saving") |
2568 | 2568 | ||
2569 | # Get back the information of match/insertion states from the STK file | 2569 | # Get back the information of match/insertion states from the STK file |
2570 | - alignstk = AlignIO.read(path_to_seq_data + "realigned/" + f + "++.stk", "stockholm") | 2570 | + if f not in SSU_set and f not in LSU_set: |
2571 | - consensus_2d = alignstk.column_annotations["secondary_structure"] | 2571 | + alignstk = AlignIO.read(path_to_seq_data + "realigned/" + f + "++.stk", "stockholm") |
2572 | - del alignstk | 2572 | + consensus_2d = alignstk.column_annotations["secondary_structure"] |
2573 | - cm_coord = 1 | 2573 | + del alignstk |
2574 | - cm_coords = [] | 2574 | + cm_coord = 1 |
2575 | - cm_2d = [] | 2575 | + cm_coords = [] |
2576 | - for x in consensus_2d: | 2576 | + cm_2d = [] |
2577 | - if x == ".": | 2577 | + for x in consensus_2d: |
2578 | - cm_coords.append(None) | 2578 | + if x in ".~": |
2579 | - cm_2d.append(None) | 2579 | + cm_coords.append(None) |
2580 | - else: | 2580 | + cm_2d.append(None) |
2581 | - cm_coords.append(cm_coord) | ||
2582 | - if x in "[(<{": | ||
2583 | - cm_2d.append("(") | ||
2584 | - elif x in "])>}": | ||
2585 | - cm_2d.append(")") | ||
2586 | - elif x in ",_-:": | ||
2587 | - cm_2d.append(".") | ||
2588 | else: | 2581 | else: |
2589 | - warn("Unsupported WUSS secondary structure symbol : "+x) | 2582 | + cm_coords.append(cm_coord) |
2590 | - cm_2d.append(".") | 2583 | + if x in "[(<{": |
2591 | - cm_coord += 1 | 2584 | + cm_2d.append("(") |
2585 | + elif x in "])>}": | ||
2586 | + cm_2d.append(")") | ||
2587 | + elif x in ",_-:": | ||
2588 | + cm_2d.append(".") | ||
2589 | + else: | ||
2590 | + warn("Unsupported WUSS secondary structure symbol : "+x) | ||
2591 | + cm_2d.append(".") | ||
2592 | + cm_coord += 1 | ||
2593 | + else: | ||
2594 | + cm_coords = [ None for x in range(ncols) ] | ||
2595 | + cm_2d = [ None for x in range(ncols) ] | ||
2592 | 2596 | ||
2593 | # Save the re_mappings | 2597 | # Save the re_mappings |
2594 | conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0) | 2598 | conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0) |
... | @@ -2615,7 +2619,7 @@ def work_pssm_remap(f): | ... | @@ -2615,7 +2619,7 @@ def work_pssm_remap(f): |
2615 | freq_other=excluded.freq_other, gap_percent=excluded.gap_percent, consensus=excluded.consensus, cons_sec_struct=excluded.cons_sec_struct;""", many=True, data=data) | 2619 | freq_other=excluded.freq_other, gap_percent=excluded.gap_percent, consensus=excluded.consensus, cons_sec_struct=excluded.cons_sec_struct;""", many=True, data=data) |
2616 | # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment) | 2620 | # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment) |
2617 | sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct) | 2621 | sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct) |
2618 | - VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', "NULL");""", data=(f,)) | 2622 | + VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', NULL);""", data=(f,)) |
2619 | # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains) | 2623 | # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains) |
2620 | sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f)) | 2624 | sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f)) |
2621 | conn.close() | 2625 | conn.close() | ... | ... |
... | @@ -25,6 +25,7 @@ from collections import Counter | ... | @@ -25,6 +25,7 @@ from collections import Counter |
25 | from setproctitle import setproctitle | 25 | from setproctitle import setproctitle |
26 | from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker, trace_unhandled_exceptions | 26 | from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker, trace_unhandled_exceptions |
27 | 27 | ||
28 | +np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf, precision=8) | ||
28 | path_to_3D_data = "tobedefinedbyoptions" | 29 | path_to_3D_data = "tobedefinedbyoptions" |
29 | path_to_seq_data = "tobedefinedbyoptions" | 30 | path_to_seq_data = "tobedefinedbyoptions" |
30 | runDir = os.getcwd() | 31 | runDir = os.getcwd() |
... | @@ -911,7 +912,7 @@ def general_stats(): | ... | @@ -911,7 +912,7 @@ def general_stats(): |
911 | fig.savefig(runDir + "/results/figures/Nfamilies.png") | 912 | fig.savefig(runDir + "/results/figures/Nfamilies.png") |
912 | plt.close() | 913 | plt.close() |
913 | 914 | ||
914 | -def par_distance_matrix(filelist, f, label, consider_all_atoms, s): | 915 | +def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s): |
915 | 916 | ||
916 | # Identify the right 3D file | 917 | # Identify the right 3D file |
917 | filename = '' | 918 | filename = '' |
... | @@ -948,13 +949,43 @@ def par_distance_matrix(filelist, f, label, consider_all_atoms, s): | ... | @@ -948,13 +949,43 @@ def par_distance_matrix(filelist, f, label, consider_all_atoms, s): |
948 | for i in range(len(s.seq)): | 949 | for i in range(len(s.seq)): |
949 | for j in range(len(s.seq)): | 950 | for j in range(len(s.seq)): |
950 | if np.isnan(coordinates_with_gaps[i]).any() or np.isnan(coordinates_with_gaps[j]).any(): | 951 | if np.isnan(coordinates_with_gaps[i]).any() or np.isnan(coordinates_with_gaps[j]).any(): |
951 | - d[i,j] = np.nan | 952 | + d[i,j] = np.NaN |
952 | else: | 953 | else: |
953 | d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j]) | 954 | d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j]) |
954 | 955 | ||
956 | + # Save the individual distance matrices | ||
955 | if f not in LSU_set and f not in SSU_set: | 957 | if f not in LSU_set and f not in SSU_set: |
956 | np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f") | 958 | np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f") |
957 | - return 1-np.isnan(d).astype(int), np.nan_to_num(d), np.nan_to_num(d*d) | 959 | + |
960 | + # For the average and sd, we want to consider only positions of the consensus model. This means: | ||
961 | + # - Add empty space when we have deletions | ||
962 | + # - skip measures that correspond to insertions | ||
963 | + i = len(cm_coords)-1 | ||
964 | + while cm_coords[i] is None: | ||
965 | + i -= 1 | ||
966 | + family_end = int(cm_coords[i]) | ||
967 | + i = 0 | ||
968 | + while cm_coords[i] is None: | ||
969 | + i += 1 | ||
970 | + family_start = int(cm_coords[i]) | ||
971 | + # c = np.zeros((family_end, family_end), dtype=np.float32) # new matrix of size of the consensus model for the family | ||
972 | + c = np.NaN * np.ones((family_end, family_end), dtype=np.float32) | ||
973 | + # set to NaN zones that never exist in the 3D data | ||
974 | + for i in range(family_start-1): | ||
975 | + for j in range(i, family_end): | ||
976 | + c[i,j] = np.NaN | ||
977 | + c[j,i] = np.NaN | ||
978 | + # copy the values ignoring insertions | ||
979 | + for i in range(len(s.seq)): | ||
980 | + if cm_coords[i] is None: | ||
981 | + continue | ||
982 | + pos_i = int(cm_coords[i])-1 | ||
983 | + for j in range(len(s.seq)): | ||
984 | + if cm_coords[j] is None: | ||
985 | + continue | ||
986 | + c[pos_i, int(cm_coords[j])-1] = d[i,j] | ||
987 | + # return the matrices counts, c, c^2 | ||
988 | + return 1-np.isnan(c).astype(int), np.nan_to_num(c), np.nan_to_num(c*c) | ||
958 | 989 | ||
959 | @trace_unhandled_exceptions | 990 | @trace_unhandled_exceptions |
960 | def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False): | 991 | def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False): |
... | @@ -976,21 +1007,28 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False): | ... | @@ -976,21 +1007,28 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False): |
976 | 1007 | ||
977 | align = AlignIO.read(path_to_seq_data + f"realigned/{f}_3d_only.afa", "fasta") | 1008 | align = AlignIO.read(path_to_seq_data + f"realigned/{f}_3d_only.afa", "fasta") |
978 | ncols = align.get_alignment_length() | 1009 | ncols = align.get_alignment_length() |
979 | - counts = np.zeros((ncols, ncols)) | ||
980 | - avg = np.zeros((ncols, ncols)) | ||
981 | - std = np.zeros((ncols, ncols)) | ||
982 | found = 0 | 1010 | found = 0 |
983 | notfound = 0 | 1011 | notfound = 0 |
1012 | + # retrieve the mappings between this family's alignment and the CM model: | ||
984 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | 1013 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
985 | conn.execute('pragma journal_mode=wal') | 1014 | conn.execute('pragma journal_mode=wal') |
986 | r = sql_ask_database(conn, f"SELECT structure_id, '_1_', chain_name, '_', CAST(pdb_start AS TEXT), '-', CAST(pdb_end AS TEXT) FROM chain WHERE rfam_acc='{f}';") | 1015 | r = sql_ask_database(conn, f"SELECT structure_id, '_1_', chain_name, '_', CAST(pdb_start AS TEXT), '-', CAST(pdb_end AS TEXT) FROM chain WHERE rfam_acc='{f}';") |
987 | filelist = sorted([ ''.join(list(x))+'.cif' for x in r ]) | 1016 | filelist = sorted([ ''.join(list(x))+'.cif' for x in r ]) |
1017 | + r = sql_ask_database(conn, f"SELECT cm_coord FROM align_column WHERE rfam_acc = '{f}' AND index_ali > 0 ORDER BY index_ali ASC;") | ||
1018 | + cm_coords = [ x[0] for x in r ] | ||
1019 | + i = len(cm_coords)-1 | ||
1020 | + while cm_coords[i] is None: | ||
1021 | + i -= 1 | ||
1022 | + family_end = int(cm_coords[i]) | ||
1023 | + counts = np.zeros((family_end, family_end)) | ||
1024 | + avg = np.zeros((family_end, family_end)) | ||
1025 | + std = np.zeros((family_end, family_end)) | ||
988 | 1026 | ||
989 | if not multithread: | 1027 | if not multithread: |
990 | pbar = tqdm(total = len(align), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} {label} distance matrices", unit="chains", leave=False) | 1028 | pbar = tqdm(total = len(align), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} {label} distance matrices", unit="chains", leave=False) |
991 | pbar.update(0) | 1029 | pbar.update(0) |
992 | for s in align: | 1030 | for s in align: |
993 | - contrib, d, dsquared = par_distance_matrix(filelist, f, label, consider_all_atoms, s) | 1031 | + contrib, d, dsquared = par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s) |
994 | if d is not None: | 1032 | if d is not None: |
995 | found += 1 | 1033 | found += 1 |
996 | counts += contrib | 1034 | counts += contrib |
... | @@ -1007,7 +1045,7 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False): | ... | @@ -1007,7 +1045,7 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False): |
1007 | try: | 1045 | try: |
1008 | fam_pbar = tqdm(total=len(align), desc=f"{f} {label} pair distances", position=0, unit="chain", leave=True) | 1046 | fam_pbar = tqdm(total=len(align), desc=f"{f} {label} pair distances", position=0, unit="chain", leave=True) |
1009 | # Apply work_pssm_remap to each RNA family | 1047 | # Apply work_pssm_remap to each RNA family |
1010 | - for i, (contrib, d, dsquared) in enumerate(p.imap_unordered(partial(par_distance_matrix, filelist, f, label, consider_all_atoms), align, chunksize=1)): | 1048 | + for i, (contrib, d, dsquared) in enumerate(p.imap_unordered(partial(par_distance_matrix, filelist, f, label, cm_coords, consider_all_atoms), align, chunksize=1)): |
1011 | if d is not None: | 1049 | if d is not None: |
1012 | found += 1 | 1050 | found += 1 |
1013 | counts += contrib | 1051 | counts += contrib |
... | @@ -1028,7 +1066,6 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False): | ... | @@ -1028,7 +1066,6 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False): |
1028 | 1066 | ||
1029 | # Calculation of the average matrix | 1067 | # Calculation of the average matrix |
1030 | avg = np.divide(avg, counts, where=counts>0, out=np.full_like(avg, np.NaN)) # Ultrafancy way to take avg/counts or NaN if counts is 0 | 1068 | avg = np.divide(avg, counts, where=counts>0, out=np.full_like(avg, np.NaN)) # Ultrafancy way to take avg/counts or NaN if counts is 0 |
1031 | - | ||
1032 | np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_average.csv' , avg, delimiter=",", fmt="%.3f") | 1069 | np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_average.csv' , avg, delimiter=",", fmt="%.3f") |
1033 | 1070 | ||
1034 | fig, ax = plt.subplots() | 1071 | fig, ax = plt.subplots() |
... | @@ -1153,7 +1190,7 @@ if __name__ == "__main__": | ... | @@ -1153,7 +1190,7 @@ if __name__ == "__main__": |
1153 | 1190 | ||
1154 | if opt == "-h" or opt == "--help": | 1191 | if opt == "-h" or opt == "--help": |
1155 | print( "RNANet statistics, a script to build a multiscale RNA dataset from public data\n" | 1192 | print( "RNANet statistics, a script to build a multiscale RNA dataset from public data\n" |
1156 | - "Developped by Louis Becquey (louis.becquey@univ-evry.fr), 2020") | 1193 | + "Developped by Louis Becquey (louis.becquey@univ-evry.fr), 2020/2021") |
1157 | print() | 1194 | print() |
1158 | print("Options:") | 1195 | print("Options:") |
1159 | print("-h [ --help ]\t\t\tPrint this help message") | 1196 | print("-h [ --help ]\t\t\tPrint this help message") |
... | @@ -1211,6 +1248,7 @@ if __name__ == "__main__": | ... | @@ -1211,6 +1248,7 @@ if __name__ == "__main__": |
1211 | famlist = families.rfam_acc.tolist() | 1248 | famlist = families.rfam_acc.tolist() |
1212 | ignored = families[families.n_chains < 3].rfam_acc.tolist() | 1249 | ignored = families[families.n_chains < 3].rfam_acc.tolist() |
1213 | famlist.sort(key=family_order) | 1250 | famlist.sort(key=family_order) |
1251 | + | ||
1214 | print(f"Found {len(famlist)} families with chains of resolution {res_thr}A or better.") | 1252 | print(f"Found {len(famlist)} families with chains of resolution {res_thr}A or better.") |
1215 | if len(ignored): | 1253 | if len(ignored): |
1216 | print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n') | 1254 | print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n') |
... | @@ -1271,14 +1309,14 @@ if __name__ == "__main__": | ... | @@ -1271,14 +1309,14 @@ if __name__ == "__main__": |
1271 | except: | 1309 | except: |
1272 | print("Something went wrong") | 1310 | print("Something went wrong") |
1273 | 1311 | ||
1274 | - # Now process the memory-heavy tasks family by family | 1312 | + # # Now process the memory-heavy tasks family by family |
1275 | - if DO_AVG_DISTANCE_MATRIX: | 1313 | + # if DO_AVG_DISTANCE_MATRIX: |
1276 | - for f in LSU_set: | 1314 | + # for f in LSU_set: |
1277 | - get_avg_std_distance_matrix(f, True, True) | 1315 | + # get_avg_std_distance_matrix(f, True, True) |
1278 | - get_avg_std_distance_matrix(f, False, True) | 1316 | + # get_avg_std_distance_matrix(f, False, True) |
1279 | - for f in SSU_set: | 1317 | + # for f in SSU_set: |
1280 | - get_avg_std_distance_matrix(f, True, True) | 1318 | + # get_avg_std_distance_matrix(f, True, True) |
1281 | - get_avg_std_distance_matrix(f, False, True) | 1319 | + # get_avg_std_distance_matrix(f, False, True) |
1282 | 1320 | ||
1283 | print() | 1321 | print() |
1284 | print() | 1322 | print() | ... | ... |
-
Please register or login to post a comment