Louis BECQUEY

Solved crashes with --no-homology
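The hunks below add `self.pdb_start is not None` guards and an explicit NULL rfam_acc insertion for chains that are not mapped to an Rfam family, which is presumably the source of the --no-homology crashes (without a mapping, pdb_start and pdb_end are None and the subtraction raises a TypeError). A minimal, hypothetical sketch of the guard pattern, outside the project's code:

    # Hypothetical values: in --no-homology mode there is no Rfam mapping,
    # so the chain has no pdb_start/pdb_end boundaries.
    pdb_start, pdb_end = None, None

    # Unguarded, "pdb_end - pdb_start" raises a TypeError when both are None.
    # Guarded version, in the spirit of the commit:
    if pdb_start is not None and (pdb_end - pdb_start):
        print("extract only the region mapped to Rfam")
    else:
        print("extract the full chain")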

@@ -155,7 +155,7 @@ class Chain:
""" Extract the part which is mapped to Rfam from the main CIF file and save it to another file.
"""
- if (self.pdb_end - self.pdb_start):
+ if self.pdb_start is not None and (self.pdb_end - self.pdb_start):
status = f"Extract {self.pdb_start}-{self.pdb_end} atoms from {self.pdb_id}-{self.pdb_chain_id}"
self.file = path_to_3D_data+"rna_mapped_to_Rfam/"+self.chain_label+".cif"
else:
@@ -182,7 +182,7 @@ class Chain:
# Extract the desired chain
c = s[model_idx][self.pdb_chain_id]
- if (self.pdb_end - self.pdb_start):
+ if self.pdb_start is not None and (self.pdb_end - self.pdb_start):
# # Pay attention to residue numbering
# first_number = c.child_list[0].get_id()[1] # the chain's first residue is numbered 'first_number'
# if self.pdb_start < self.pdb_end:
@@ -309,6 +309,28 @@ class Chain:
if df.iloc[0,0] != 1:
st = df.iloc[0,0] -1
df.iloc[:, 0] -= st
+ # Find missing index_chain values because of resolved nucleotides that have a strange nt_resnum value
+ # e.g. 4v49-AA, position 5'- 1003 -> 2003 -> 1004 - 3'
+ diff = set(range(df.shape[0])).difference(df['index_chain'] - 1)
+ for i in sorted(diff):
+ # check if a nucleotide numbered +1000 exists
+ looked_for = df[df.index_chain == i].nt_resnum.values[0]
+ found = None
+ for nt in nts:
+ if nt['chain_name'] != self.pdb_chain_id:
+ continue
+ if nt['index_chain'] == i + 1 :
+ found = nt
+ break
+ if found:
+ df_row = pd.DataFrame([found], index=[i])[df.columns.values]
+ df_row.iloc[0,1] = df.iloc[i,1]
+ df = pd.concat([ df.iloc[:i], df_row, df.iloc[i:] ])
+ df.iloc[i+1:, 1] += 1
+ else:
+ warn(f"Missing index_chain {i} in {self.chain_label} !")
df = df.drop(df[df.index_chain < 0].index) # drop any rows with index_chain < the first residue (usually ligands)
# Re-assert that some nucleotides still exist
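The row-insertion pattern in the block added above (split the DataFrame at the hole, concatenate a one-row frame, then renumber) can be illustrated on a toy frame; the column names and values here are made up, not real DSSR output:

    import pandas as pd

    # Toy frame where index_chain 3 is missing
    df = pd.DataFrame({"index_chain": [1, 2, 4, 5],
                       "nt_resnum":   [101, 102, 1004, 1005]})

    i = 2                                                   # 0-based position of the hole
    row = pd.DataFrame([{"index_chain": 3, "nt_resnum": 103}], index=[i])

    # Split the frame, insert the recovered row, and renumber the index
    df = pd.concat([df.iloc[:i], row, df.iloc[i:]]).reset_index(drop=True)
    print(df.index_chain.tolist())                          # [1, 2, 3, 4, 5]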
@@ -352,6 +374,10 @@ class Chain:
df = df.reset_index(drop=True)
self.full_length = len(df.index_chain)
+ #######################################
+ # Compute new features
+ #######################################
# Add a sequence column just for the alignments
df['nt_align_code'] = [ str(x).upper()
.replace('NAN', '-') # Unresolved nucleotides are gaps
@@ -360,11 +386,6 @@ class Chain:
.replace('P', 'U') # Pseudouridines; mapping them to U is not strictly correct, see DSSR paper, Fig 2
for x in df['nt_code'] ]
- #######################################
- # Compute new features
- #######################################
# One-hot encoding sequence
df["is_A"] = [ 1 if x=="A" else 0 for x in df["nt_code"] ]
df["is_C"] = [ 1 if x=="C" else 0 for x in df["nt_code"] ]
@@ -464,6 +485,7 @@ class Chain:
####################################
# Save everything to database
####################################
with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn:
# Register the chain in table chain
if self.pdb_start is not None:
@@ -472,13 +494,28 @@ class Chain:
VALUES
(?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(structure_id, chain_name, rfam_acc) DO
- UPDATE SET pdb_start=excluded.pdb_start, pdb_end=excluded.pdb_end, reversed=excluded.reversed, inferred=excluded.inferred, issue=excluded.issue;""",
- data=(str(self.pdb_id), str(self.pdb_chain_id), int(self.pdb_start), int(self.pdb_end), int(self.reversed), str(self.rfam_fam), int(self.inferred), int(self.delete_me)))
+ UPDATE SET pdb_start=excluded.pdb_start,
+ pdb_end=excluded.pdb_end,
+ reversed=excluded.reversed,
+ inferred=excluded.inferred,
+ issue=excluded.issue;""",
+ data=(str(self.pdb_id), str(self.pdb_chain_id),
+ int(self.pdb_start), int(self.pdb_end),
+ int(self.reversed), str(self.rfam_fam),
+ int(self.inferred), int(self.delete_me)))
# get the chain id
- self.db_chain_id = sql_ask_database(conn, f"SELECT (chain_id) FROM chain WHERE structure_id='{self.pdb_id}' AND chain_name='{self.pdb_chain_id}' AND rfam_acc='{self.rfam_fam}';")[0][0]
+ self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain
+ WHERE structure_id='{self.pdb_id}'
+ AND chain_name='{self.pdb_chain_id}'
+ AND rfam_acc='{self.rfam_fam}';""")[0][0]
else:
- sql_execute(conn, "INSERT INTO chain (structure_id, chain_name, issue) VALUES (?, ?, ?) ON CONFLICT(structure_id, chain_name) DO UPDATE SET issue=excluded.issue;", data=(str(self.pdb_id), int(self.pdb_chain_id), int(self.delete_me)))
- self.db_chain_id = sql_ask_database(conn, f"SELECT (chain_id) FROM chain WHERE structure_id='{self.pdb_id}' AND chain_name='{self.pdb_chain_id}' AND rfam_acc IS NULL;")[0][0]
+ sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, issue) VALUES (?, ?, NULL, ?)
+ ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue;""",
+ data=(str(self.pdb_id), str(self.pdb_chain_id), int(self.delete_me)))
+ self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain
+ WHERE structure_id='{self.pdb_id}'
+ AND chain_name='{self.pdb_chain_id}'
+ AND rfam_acc IS NULL;""")[0][0]
# Add the nucleotides
sql_execute(conn, f"""
@@ -492,7 +529,6 @@ class Chain:
?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""",
many=True, data=list(df.to_records(index=False)), warn_every=10)
# Remove too short chains
if self.length < 5:
warn(f"{self.chain_label} sequence is too short, let's ignore it.\t", error=True)
@@ -1064,7 +1100,7 @@ class Pipeline:
if self.EXTRACT_CHAINS:
if self.HOMOLOGY and not path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"):
os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam") # for the portions mapped to Rfam
- if not self.HOMOLOGY and not path.isdir(path_to_3D_data + "rna_only"):
+ if (not self.HOMOLOGY) and not path.isdir(path_to_3D_data + "rna_only"):
os.makedirs(path_to_3D_data + "rna_only") # extract chains of pure RNA
# define and run jobs
@@ -1295,7 +1331,7 @@ class Pipeline:
r = sql_ask_database(conn, """SELECT DISTINCT chain_id, structure_id FROM chain WHERE structure_id NOT IN (SELECT DISTINCT pdb_id FROM structure);""")
if len(r) and r[0][0] is not None:
warn("Chains without referenced structures have been detected")
print(" ".join([x[1]+'-'+x[0] for x in r]))
print(" ".join([str(x[1])+'-'+str(x[0]) for x in r]))
if self.HOMOLOGY:
# check if chains have been re_mapped:
@@ -2187,6 +2223,7 @@ if __name__ == "__main__":
# At this point, the structure table is up to date
pp.build_chains(coeff_ncores=1.0)
if len(pp.to_retry):
# Redownload and re-annotate
print("> Retrying to annotate some structures which just failed.", flush=True)
@@ -2227,6 +2264,7 @@ if __name__ == "__main__":
pp.prepare_sequences()
pp.realign()
# At this point, the family table is up to date
thr_idx_mgr = Manager()
@@ -68,27 +68,30 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
if not path.isfile(f"data/wadley_kernel_{angle}.npz"):
- conn = sqlite3.connect("results/RNANet.db")
- df = pd.read_sql(f"""SELECT {angle}, th{angle} FROM nucleotide WHERE puckering="C2'-endo" AND {angle} IS NOT NULL AND th{angle} IS NOT NULL;""", conn)
- c2_endo_etas = df[angle].values.tolist()
- c2_endo_thetas = df["th"+angle].values.tolist()
- df = pd.read_sql(f"""SELECT {angle}, th{angle} FROM nucleotide WHERE form = '.' AND puckering="C3'-endo" AND {angle} IS NOT NULL AND th{angle} IS NOT NULL;""", conn)
- c3_endo_etas = df[angle].values.tolist()
- c3_endo_thetas = df["th"+angle].values.tolist()
- conn.close()
+ # Extract the angle values of c2'-endo and c3'-endo nucleotides
+ with sqlite3.connect("results/RNANet.db") as conn:
+ df = pd.read_sql(f"""SELECT {angle}, th{angle} FROM nucleotide WHERE puckering="C2'-endo" AND {angle} IS NOT NULL AND th{angle} IS NOT NULL;""", conn)
+ c2_endo_etas = df[angle].values.tolist()
+ c2_endo_thetas = df["th"+angle].values.tolist()
+ df = pd.read_sql(f"""SELECT {angle}, th{angle} FROM nucleotide WHERE form = '.' AND puckering="C3'-endo" AND {angle} IS NOT NULL AND th{angle} IS NOT NULL;""", conn)
+ c3_endo_etas = df[angle].values.tolist()
+ c3_endo_thetas = df["th"+angle].values.tolist()
+ # Create arrays with (x,y) coordinates of the points
+ values_c3 = np.vstack([c3_endo_etas, c3_endo_thetas])
+ values_c2 = np.vstack([c2_endo_etas, c2_endo_thetas])
+ # Approximate the density by a gaussian kernel
+ kernel_c3 = st.gaussian_kde(values_c3)
+ kernel_c2 = st.gaussian_kde(values_c2)
+ # Create 100x100 regular (x,y,z) values for the plot
xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j]
positions = np.vstack([xx.ravel(), yy.ravel()])
- values_c3 = np.vstack([c3_endo_etas, c3_endo_thetas])
- kernel_c3 = st.gaussian_kde(values_c3)
f_c3 = np.reshape(kernel_c3(positions).T, xx.shape)
- values_c2 = np.vstack([c2_endo_etas, c2_endo_thetas])
- kernel_c2 = st.gaussian_kde(values_c2)
f_c2 = np.reshape(kernel_c2(positions).T, xx.shape)
- # Uncomment to save the data to an archive for later use without the need to recompute
+ # Save the data to an archive for later use without the need to recompute
np.savez(f"data/wadley_kernel_{angle}.npz",
c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas,
c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas,
@@ -106,8 +109,10 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
notify(f"Kernel computed for {angle}/th{angle} (or loaded from file).")
# exact counts:
- hist_c2, xedges, yedges = np.histogram2d(c2_endo_etas, c2_endo_thetas, bins=int(2*np.pi/0.1), range=[[0, 2*np.pi], [0, 2*np.pi]])
- hist_c3, xedges, yedges = np.histogram2d(c3_endo_etas, c3_endo_thetas, bins=int(2*np.pi/0.1), range=[[0, 2*np.pi], [0, 2*np.pi]])
+ hist_c2, xedges, yedges = np.histogram2d(c2_endo_etas, c2_endo_thetas, bins=int(2*np.pi/0.1),
+ range=[[0, 2*np.pi], [0, 2*np.pi]])
+ hist_c3, xedges, yedges = np.histogram2d(c3_endo_etas, c3_endo_thetas, bins=int(2*np.pi/0.1),
+ range=[[0, 2*np.pi], [0, 2*np.pi]])
cmap = cm.get_cmap("jet")
color_values = cmap(hist_c3.ravel()/hist_c3.max())
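For reference, the kernel-density estimate that the refactored block above computes follows the standard scipy pattern: fit st.gaussian_kde on a (2, N) array of points, then evaluate it on a regular grid. A standalone sketch with random angles standing in for the database values:

    import numpy as np
    import scipy.stats as st

    rng = np.random.default_rng(0)
    etas = rng.uniform(0, 2*np.pi, 500)      # random stand-ins for the eta angles
    thetas = rng.uniform(0, 2*np.pi, 500)    # random stand-ins for the theta angles

    values = np.vstack([etas, thetas])       # shape (2, N)
    kernel = st.gaussian_kde(values)         # approximate the density with a Gaussian kernel

    # Evaluate on a 100x100 regular grid covering [0, 2*pi] x [0, 2*pi]
    xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j]
    positions = np.vstack([xx.ravel(), yy.ravel()])
    density = np.reshape(kernel(positions).T, xx.shape)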
@@ -450,7 +455,7 @@ def seq_idty():
famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 1 ORDER BY rfam_acc ASC;") ]
ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;") ]
if len(ignored):
print("Idty matrices: Ignoring families with only one chain:", " ".join(ignored)+'\n')
print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')
# compute distance matrices (or ignore if data/RF0****.npy exists)
p = Pool(processes=8)
@@ -476,7 +481,7 @@ def seq_idty():
conn.close()
# Plots plots plots
- fig, axs = plt.subplots(5,13, figsize=(15,9))
+ fig, axs = plt.subplots(4,17, figsize=(17,5.75))
axs = axs.ravel()
[axi.set_axis_off() for axi in axs]
im = "" # Just to declare the variable, it will be set in the loop
@@ -495,7 +500,7 @@ def seq_idty():
D = D[idx1,:]
D = D[:,idx1[::-1]]
im = ax.matshow(1.0 - D, vmin=0, vmax=1, origin='lower') # convert to identity matrix 1 - D from distance matrix D
- ax.set_title(f + "\n(" + str(len(mappings_list[f]))+ " chains)")
+ ax.set_title(f + "\n(" + str(len(mappings_list[f]))+ " chains)", fontsize=10)
fig.tight_layout()
fig.subplots_adjust(wspace=0.1, hspace=0.3)
fig.colorbar(im, ax=axs[-1], shrink=0.8)
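The matshow call above displays each per-family distance matrix D as an identity matrix 1 - D. A minimal illustration of that display, with a random symmetric matrix in place of the real per-family data:

    import numpy as np
    import matplotlib.pyplot as plt

    rng = np.random.default_rng(0)
    D = rng.uniform(0, 1, (20, 20))          # fake pairwise distances
    D = (D + D.T) / 2                        # make the matrix symmetric
    np.fill_diagonal(D, 0)                   # zero distance to itself

    fig, ax = plt.subplots(figsize=(3, 3))
    im = ax.matshow(1.0 - D, vmin=0, vmax=1, origin='lower')   # identity = 1 - distance
    fig.colorbar(im, ax=ax, shrink=0.8)
    fig.savefig("identity_matrix_example.png")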
@@ -537,10 +542,10 @@ if __name__ == "__main__":
threads = [
th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 1}),
th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 4}),
- # th.Thread(target=stats_len),
- # th.Thread(target=stats_freq),
- # th.Thread(target=seq_idty),
- # th.Thread(target=per_chain_stats)
+ # th.Thread(target=stats_len), # computes figures
+ # th.Thread(target=stats_freq), # Updates the database
+ # th.Thread(target=seq_idty), # produces .npy files and seq idty figures
+ # th.Thread(target=per_chain_stats) # Updates the database
]
# Start the threads
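The lines that actually start and join the threads are cut off by the hunk; the generic start/join pattern (not necessarily the project's exact code) looks like this:

    import threading as th

    def work():                 # placeholder target; the real targets are the functions above
        pass

    threads = [th.Thread(target=work) for _ in range(2)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()                # wait for all threads to finish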