Louis BECQUEY

Distance matrices only for match-state positions

......@@ -2329,7 +2329,7 @@ def work_realign(rfam_acc):
# Align the new sequences
with open(new_ali_path, 'w') as o:
p1 = subprocess.run(["cmalign", "--nonbanded", "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins",
p1 = subprocess.run(["cmalign", "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins",
"--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv",
path_to_seq_data + f"realigned/{rfam_acc}.cm",
path_to_seq_data + f"realigned/{rfam_acc}_new.fa"],
......@@ -2371,7 +2371,7 @@ def work_realign(rfam_acc):
# Alignment does not exist yet. We need to compute it from scratch.
print(f"\t> Aligning {rfam_acc} sequences together (cmalign) ...", end='', flush=True)
p = subprocess.run(["cmalign", "--nonbanded", '-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk",
p = subprocess.run(["cmalign", '-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk",
"--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins", "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv",
path_to_seq_data + f"realigned/{rfam_acc}.cm",
path_to_seq_data + f"realigned/{rfam_acc}++.fa"],
......@@ -2567,28 +2567,32 @@ def work_pssm_remap(f):
setproctitle(f"RNAnet.py work_pssm_remap({f}) saving")
# Get back the information of match/insertion states from the STK file
alignstk = AlignIO.read(path_to_seq_data + "realigned/" + f + "++.stk", "stockholm")
consensus_2d = alignstk.column_annotations["secondary_structure"]
del alignstk
cm_coord = 1
cm_coords = []
cm_2d = []
for x in consensus_2d:
if x == ".":
cm_coords.append(None)
cm_2d.append(None)
else:
cm_coords.append(cm_coord)
if x in "[(<{":
cm_2d.append("(")
elif x in "])>}":
cm_2d.append(")")
elif x in ",_-:":
cm_2d.append(".")
if f not in SSU_set and f not in LSU_set:
alignstk = AlignIO.read(path_to_seq_data + "realigned/" + f + "++.stk", "stockholm")
consensus_2d = alignstk.column_annotations["secondary_structure"]
del alignstk
cm_coord = 1
cm_coords = []
cm_2d = []
for x in consensus_2d:
if x in ".~":
cm_coords.append(None)
cm_2d.append(None)
else:
warn("Unsupported WUSS secondary structure symbol : "+x)
cm_2d.append(".")
cm_coord += 1
cm_coords.append(cm_coord)
if x in "[(<{":
cm_2d.append("(")
elif x in "])>}":
cm_2d.append(")")
elif x in ",_-:":
cm_2d.append(".")
else:
warn("Unsupported WUSS secondary structure symbol : "+x)
cm_2d.append(".")
cm_coord += 1
else:
cm_coords = [ None for x in range(ncols) ]
cm_2d = [ None for x in range(ncols) ]
# Save the re_mappings
conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0)
......@@ -2615,7 +2619,7 @@ def work_pssm_remap(f):
freq_other=excluded.freq_other, gap_percent=excluded.gap_percent, consensus=excluded.consensus, cons_sec_struct=excluded.cons_sec_struct;""", many=True, data=data)
# Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment)
sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct)
VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', "NULL");""", data=(f,))
VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', NULL);""", data=(f,))
# Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains)
sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f))
conn.close()
......
......@@ -25,6 +25,7 @@ from collections import Counter
from setproctitle import setproctitle
from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker, trace_unhandled_exceptions
np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf, precision=8)
path_to_3D_data = "tobedefinedbyoptions"
path_to_seq_data = "tobedefinedbyoptions"
runDir = os.getcwd()
......@@ -911,7 +912,7 @@ def general_stats():
fig.savefig(runDir + "/results/figures/Nfamilies.png")
plt.close()
def par_distance_matrix(filelist, f, label, consider_all_atoms, s):
def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s):
# Identify the right 3D file
filename = ''
......@@ -948,13 +949,43 @@ def par_distance_matrix(filelist, f, label, consider_all_atoms, s):
for i in range(len(s.seq)):
for j in range(len(s.seq)):
if np.isnan(coordinates_with_gaps[i]).any() or np.isnan(coordinates_with_gaps[j]).any():
d[i,j] = np.nan
d[i,j] = np.NaN
else:
d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j])
# Save the individual distance matrices
if f not in LSU_set and f not in SSU_set:
np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f")
return 1-np.isnan(d).astype(int), np.nan_to_num(d), np.nan_to_num(d*d)
# For the average and sd, we want to consider only positions of the consensus model. This means:
# - Add empty space when we have deletions
# - skip measures that correspond to insertions
i = len(cm_coords)-1
while cm_coords[i] is None:
i -= 1
family_end = int(cm_coords[i])
i = 0
while cm_coords[i] is None:
i += 1
family_start = int(cm_coords[i])
# c = np.zeros((family_end, family_end), dtype=np.float32) # new matrix of size of the consensus model for the family
c = np.NaN * np.ones((family_end, family_end), dtype=np.float32)
# set to NaN zones that never exist in the 3D data
for i in range(family_start-1):
for j in range(i, family_end):
c[i,j] = np.NaN
c[j,i] = np.NaN
# copy the values ignoring insertions
for i in range(len(s.seq)):
if cm_coords[i] is None:
continue
pos_i = int(cm_coords[i])-1
for j in range(len(s.seq)):
if cm_coords[j] is None:
continue
c[pos_i, int(cm_coords[j])-1] = d[i,j]
# return the matrices counts, c, c^2
return 1-np.isnan(c).astype(int), np.nan_to_num(c), np.nan_to_num(c*c)
@trace_unhandled_exceptions
def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False):
......@@ -976,21 +1007,28 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False):
align = AlignIO.read(path_to_seq_data + f"realigned/{f}_3d_only.afa", "fasta")
ncols = align.get_alignment_length()
counts = np.zeros((ncols, ncols))
avg = np.zeros((ncols, ncols))
std = np.zeros((ncols, ncols))
found = 0
notfound = 0
# retrieve the mappings between this family's alignment and the CM model:
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
conn.execute('pragma journal_mode=wal')
r = sql_ask_database(conn, f"SELECT structure_id, '_1_', chain_name, '_', CAST(pdb_start AS TEXT), '-', CAST(pdb_end AS TEXT) FROM chain WHERE rfam_acc='{f}';")
filelist = sorted([ ''.join(list(x))+'.cif' for x in r ])
r = sql_ask_database(conn, f"SELECT cm_coord FROM align_column WHERE rfam_acc = '{f}' AND index_ali > 0 ORDER BY index_ali ASC;")
cm_coords = [ x[0] for x in r ]
i = len(cm_coords)-1
while cm_coords[i] is None:
i -= 1
family_end = int(cm_coords[i])
counts = np.zeros((family_end, family_end))
avg = np.zeros((family_end, family_end))
std = np.zeros((family_end, family_end))
if not multithread:
pbar = tqdm(total = len(align), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} {label} distance matrices", unit="chains", leave=False)
pbar.update(0)
for s in align:
contrib, d, dsquared = par_distance_matrix(filelist, f, label, consider_all_atoms, s)
contrib, d, dsquared = par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s)
if d is not None:
found += 1
counts += contrib
......@@ -1007,7 +1045,7 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False):
try:
fam_pbar = tqdm(total=len(align), desc=f"{f} {label} pair distances", position=0, unit="chain", leave=True)
# Apply par_distance_matrix to each aligned chain of the family
for i, (contrib, d, dsquared) in enumerate(p.imap_unordered(partial(par_distance_matrix, filelist, f, label, consider_all_atoms), align, chunksize=1)):
for i, (contrib, d, dsquared) in enumerate(p.imap_unordered(partial(par_distance_matrix, filelist, f, label, cm_coords, consider_all_atoms), align, chunksize=1)):
if d is not None:
found += 1
counts += contrib
......@@ -1028,7 +1066,6 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False):
# Calculation of the average matrix
avg = np.divide(avg, counts, where=counts>0, out=np.full_like(avg, np.NaN)) # Ultrafancy way to take avg/counts or NaN if counts is 0
np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_average.csv' , avg, delimiter=",", fmt="%.3f")
fig, ax = plt.subplots()
......@@ -1153,7 +1190,7 @@ if __name__ == "__main__":
if opt == "-h" or opt == "--help":
print( "RNANet statistics, a script to build a multiscale RNA dataset from public data\n"
"Developped by Louis Becquey (louis.becquey@univ-evry.fr), 2020")
"Developped by Louis Becquey (louis.becquey@univ-evry.fr), 2020/2021")
print()
print("Options:")
print("-h [ --help ]\t\t\tPrint this help message")
......@@ -1211,6 +1248,7 @@ if __name__ == "__main__":
famlist = families.rfam_acc.tolist()
ignored = families[families.n_chains < 3].rfam_acc.tolist()
famlist.sort(key=family_order)
print(f"Found {len(famlist)} families with chains of resolution {res_thr}A or better.")
if len(ignored):
print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')
......@@ -1271,14 +1309,14 @@ if __name__ == "__main__":
except:
print("Something went wrong")
# Now process the memory-heavy tasks family by family
if DO_AVG_DISTANCE_MATRIX:
for f in LSU_set:
get_avg_std_distance_matrix(f, True, True)
get_avg_std_distance_matrix(f, False, True)
for f in SSU_set:
get_avg_std_distance_matrix(f, True, True)
get_avg_std_distance_matrix(f, False, True)
# # Now process the memory-heavy tasks family by family
# if DO_AVG_DISTANCE_MATRIX:
# for f in LSU_set:
# get_avg_std_distance_matrix(f, True, True)
# get_avg_std_distance_matrix(f, False, True)
# for f in SSU_set:
# get_avg_std_distance_matrix(f, True, True)
# get_avg_std_distance_matrix(f, False, True)
print()
print()
......