# Count, per nucleotide name, the nucleotides belonging to chains whose
# rfam_acc equals `f`. dict() turns the (nt_name, count) rows into a mapping.
# NOTE(review): `f` is interpolated straight into the SQL text; this is fine
# only as long as it always comes from a trusted source.
nt_name_count_sql = (
    "SELECT nt_name, COUNT(nt_name) FROM "
    f"(SELECT chain_id from chain WHERE rfam_acc='{f}') "
    "NATURAL JOIN nucleotide GROUP BY nt_name;"
)
counts = dict(sql_ask_database(conn, nt_name_count_sql, warn_every=0))
# One row per chain_id, with the summed is_A / is_C / is_G / is_U / is_other
# columns of the nucleotide table exposed as A, C, G, U and O.
chain_composition_sql = (
    "SELECT SUM(is_A) as A, SUM(is_C) AS C, SUM(is_G) AS G, "
    "SUM(is_U) AS U, SUM(is_other) AS O, chain_id "
    "FROM nucleotide GROUP BY chain_id;"
)
df = pd.read_sql(chain_composition_sql, con=conn)
# Print a summary of how chains are mapped to families, reading each figure
# from the top-left cell of one entry of `answers`.
# NOTE(review): each answers[i] is presumably a single-value pandas DataFrame
# (e.g. the result of a COUNT query) -- confirm where `answers` is built.
def _first_cell(idx):
    """Return the value in the first row and first column of answers[idx]."""
    # .iloc[0, 0] is purely positional. The chained form .iloc[0][0] applies
    # an integer key to a label-indexed row Series, a positional fallback
    # that pandas deprecated in 2.1.
    return answers[idx].iloc[0, 0]


# Chains mapped thanks to Rfam.
print(
    "> found", _first_cell(4),
    f"chains ({_first_cell(5)} unique chains) that are mapped thanks to Rfam. "
    f"Removing chains with issues, only {_first_cell(15)} ({_first_cell(16)} unique)",
)
if _first_cell(4) != _first_cell(5):
    print("\t> This happens because different parts of the same chain can be mapped to different families.")

# Chains mapped by inference, broken down by number of families.
print(
    "> found", _first_cell(6),
    f"chains ({_first_cell(7)} unique chains) that are mapped by inferrence. "
    f"Removing chains with issues, only {_first_cell(17)} ({_first_cell(18)} unique).",
)
print("\t> ", _first_cell(8), "chains are mapped only once,")
print("\t> ", _first_cell(9), "are mapped to 2 families,")
print("\t> ", _first_cell(10), "are mapped to 3 or more.")

# Chains mapped both ways; the hint is printed only when the count is non-zero.
print("> Among them,", _first_cell(11), "chains are mapped both with families found on Rfam and by inferrence.")
if _first_cell(11):
    print("\t> this is normal if you used option -f (--full-inference). Otherwise, there might be a problem.")

# Totals for mapped and unmapped chains.
print(
    "> TOTAL:", _first_cell(12),
    f"chains ({_first_cell(13)} unique chains) mapped to a family. "
    f"Removing chains with issues, only {_first_cell(19)} ({_first_cell(20)} unique).",
)
print("> TOTAL:", _first_cell(14), f"unmapped chains. Removing chains with issues, {_first_cell(21)}.")
if _first_cell(14):
    print("\t> this is normal if you used option --no-homology. Otherwise, there might be a problem.")
# Every family accession in the database, in ascending order.
fam_list = [
    row[0]
    for row in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;")
]

# Map each family accession to the ids of its chains that have issue=0 and
# whose structure resolution is at most `res_thr`. One query is issued per
# family; `k` and `res_thr` are interpolated directly into the SQL text.
mappings_list = {}
for k in fam_list:
    family_chains_sql = (
        "SELECT chain_id from chain JOIN structure ON chain.structure_id=structure.pdb_id "
        f"WHERE rfam_acc='{k}' AND issue=0 AND resolution <= {res_thr};"
    )
    mappings_list[k] = [row[0] for row in sql_ask_database(conn, family_chains_sql)]
# List the families for which we will compute sequence identity matrices.
# Only chains with issue = 0 are counted.
with sqlite3.connect("results/RNANet.db") as conn:
    # Sub-select shared by both family queries: number of issue-free chains
    # per family accession.
    chains_per_family_sql = (
        "(SELECT rfam_acc, COUNT(chain_id) as n_chains "
        "FROM family NATURAL JOIN chain WHERE issue = 0 GROUP BY rfam_acc)"
    )

    # Families that have at least one issue-free chain.
    famlist = [
        row[0]
        for row in sql_ask_database(
            conn,
            f"SELECT rfam_acc from {chains_per_family_sql} WHERE n_chains > 0 ORDER BY rfam_acc ASC;",
        )
    ]

    # Families that have fewer than 3 issue-free chains.
    ignored = [
        row[0]
        for row in sql_ask_database(
            conn,
            f"SELECT rfam_acc from {chains_per_family_sql} WHERE n_chains < 3 ORDER BY rfam_acc ASC;",
        )
    ]

    # Number of issue-free chains whose rfam_acc is the literal 'unmappd'.
    n_unmapped_chains = sql_ask_database(
        conn, "SELECT COUNT(*) FROM chain WHERE rfam_acc='unmappd' AND issue=0;"
    )[0][0]

if ignored:
    # The wording matches the `n_chains < 3` filter used to build `ignored`.
    print(f"Idty matrices: Ignoring {len(ignored)} families with fewer than 3 chains:", " ".join(ignored) + '\n')