Louis BECQUEY

Replaced NULL rfam_acc values with the placeholder string 'unmappd'

......@@ -553,14 +553,14 @@ class Chain:
AND rfam_acc='{self.mapping.rfam_acc}'
AND eq_class='{self.eq_class}';""")[0][0]
else:
sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, eq_class, issue) VALUES (?, ?, NULL, ?, ?)
sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, eq_class, issue) VALUES (?, ?, 'unmappd', ?, ?)
ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue, eq_class=excluded.eq_class;""",
data=(str(self.pdb_id), str(self.pdb_chain_id), str(self.eq_class), int(self.delete_me)))
self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain
WHERE structure_id='{self.pdb_id}'
AND chain_name='{self.pdb_chain_id}'
AND eq_class='{self.eq_class}'
AND rfam_acc IS NULL;""")[0][0]
AND rfam_acc = 'unmappd';""")[0][0]
# Add the nucleotides if the chain is not an issue
if df is not None and not self.delete_me: # double condition is theoretically redundant here, but you never know
......@@ -1193,7 +1193,7 @@ class Pipeline:
pdb_model = int(nr[1])
pdb_chain_id = nr[2].upper()
chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}"
res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc IS NULL AND issue=0""")
res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc = 'unmappd' AND issue=0""")
if not len(res) or self.REUSE_ALL: # the chain is NOT yet in the database, or this is a known issue
self.update.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class))
conn.close()
......
......@@ -610,22 +610,22 @@ def general_stats():
with sqlite3.connect("results/RNANet.db") as conn:
df_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution
FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
WHERE rfam_acc IS NULL AND ISSUE=0;""", conn)
WHERE rfam_acc = 'unmappd' AND ISSUE=0;""", conn)
df_mapped_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution
FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
WHERE rfam_acc IS NOT NULL AND ISSUE=0;""", conn)
WHERE rfam_acc != 'unmappd' AND ISSUE=0;""", conn)
df_mapped_copies = pd.read_sql(f"""SELECT pdb_id, chain_name, inferred, rfam_acc, pdb_start, pdb_end, exp_method, resolution
FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
WHERE rfam_acc IS NOT NULL AND ISSUE=0;""", conn)
WHERE rfam_acc != 'unmappd' AND ISSUE=0;""", conn)
df_inferred_only_unique = pd.read_sql(f"""SELECT DISTINCT pdb_id, c.chain_name, exp_method, resolution
FROM (SELECT inferred, rfam_acc, pdb_start, pdb_end, chain.structure_id, chain.chain_name, r.redundancy, r.inf_redundancy
FROM chain
JOIN (SELECT structure_id, chain_name, COUNT(distinct rfam_acc) AS redundancy, SUM(inferred) AS inf_redundancy
FROM chain
WHERE rfam_acc IS NOT NULL AND issue=0
WHERE rfam_acc != 'unmappd' AND issue=0
GROUP BY structure_id, chain_name
) AS r ON chain.structure_id=r.structure_id AND chain.chain_name = r.chain_name
WHERE r.redundancy=r.inf_redundancy AND rfam_acc IS NOT NULL and issue=0
WHERE r.redundancy=r.inf_redundancy AND rfam_acc != 'unmappd' and issue=0
) AS c
JOIN structure ON c.structure_id=structure.pdb_id;""", conn)
print("> found", len(df_inferred_only_unique.index), "chains which are mapped only by inference using BGSU NR Lists.")
......@@ -775,9 +775,6 @@ def log_to_pbar(pbar):
if __name__ == "__main__":
general_stats()
exit()
# parse options
try:
opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "resolution=", "3d-folder=", "seq-folder=" ])
......@@ -839,8 +836,8 @@ if __name__ == "__main__":
# Define the tasks
joblist = []
joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), 4.0))) # res threshold is 4.0 Angstroms by default
joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), 4.0))) #
# joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), 4.0))) # res threshold is 4.0 Angstroms by default
# joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), 4.0))) #
joblist.append(Job(function=stats_len)) # Computes figures
# joblist.append(Job(function=stats_freq)) # updates the database
# for f in famlist:
......@@ -873,3 +870,4 @@ if __name__ == "__main__":
# per_chain_stats()
# seq_idty()
# stats_pairs()
general_stats()
......