self.mapping.log(f"Residue {i+1+self.mapping.st}-{self.mapping.st} = {i+1} has been saved and renumbered {df.iloc[i,1]} instead of {found['nt_id'].replace(found['chain_name']+ '.' + found['nt_name'], '').replace('^','')}")
# expected data point 1 |--------------------------------| 154
#
ifl!=len(df['index_chain']):# if some residues are missing, len(df['index_chain']) < l
resnum_start=df.iloc[0,1]
diff=set(range(l)).difference(df['nt_resnum']-resnum_start)# the rowIDs the missing nucleotides would have (rowID = index_chain - 1 = nt_resnum - resnum_start)
resnum_start=df.iloc[0,1]
# the rowIDs the missing nucleotides would have (rowID = index_chain - 1 = nt_resnum - resnum_start)
res=sql_ask_database(conn,f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc = 'unmappd' AND issue=0""")
res=sql_ask_database(conn,f"""SELECT chain_id from chain
WHERE structure_id='{pdb_id}'
AND chain_name='{pdb_chain_id}'
AND rfam_acc = 'unmappd'
AND issue=0""")
ifnotlen(res)orself.REUSE_ALL:# the chain is NOT yet in the database, or this is a known issue
pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;",
res=sql_ask_database(conn,f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc='{rfam}' AND issue=0""")
res=sql_ask_database(conn,f"""SELECT chain_id from chain
WHERE structure_id='{pdb_id}'
AND chain_name='{pdb_chain_id}'
AND rfam_acc='{rfam}'
AND issue=0""")
ifnotlen(res):# the chain is NOT yet in the database, or this is a known issue
sql_execute(conn,"INSERT INTO re_mapping (chain_id, index_chain, index_ali) VALUES (?, ?, ?) ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;",many=True,data=re_mappings)
sql_execute(conn,"""INSERT INTO re_mapping (chain_id, index_chain, index_ali)
VALUES (?, ?, ?)
ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;""",
counts=dict(sql_ask_database(conn,f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;",warn_every=0))
df=pd.read_sql("SELECT SUM(is_A) as A, SUM(is_C) AS C, SUM(is_G) AS G, SUM(is_U) AS U, SUM(is_other) AS O, chain_id FROM nucleotide GROUP BY chain_id;",conn)
print("> found",answers[4].iloc[0][0],f"chains ({answers[5].iloc[0][0]} unique chains) that are mapped thanks to Rfam. Removing chains with issues, only {answers[15].iloc[0][0]} ({answers[16].iloc[0][0]} unique)")
ifanswers[4].iloc[0][0]!=answers[5].iloc[0][0]:
print("\t> This happens because different parts of the same chain can be mapped to different families.")
print("> found",answers[6].iloc[0][0],f"chains ({answers[7].iloc[0][0]} unique chains) that are mapped by inferrence. Removing chains with issues, only {answers[17].iloc[0][0]} ({answers[18].iloc[0][0]} unique).")
print("\t> ",answers[8].iloc[0][0],"chains are mapped only once,")
print("\t> ",answers[9].iloc[0][0],"are mapped to 2 families,")
print("\t> ",answers[10].iloc[0][0],"are mapped to 3 or more.")
print("> Among them,",answers[11].iloc[0][0],"chains are mapped both with families found on Rfam and by inferrence.")
ifanswers[11].iloc[0][0]:
print("\t> this is normal if you used option -f (--full-inference). Otherwise, there might be a problem.")
print("> TOTAL:",answers[12].iloc[0][0],f"chains ({answers[13].iloc[0][0]} unique chains) mapped to a family. Removing chains with issues, only {answers[19].iloc[0][0]} ({answers[20].iloc[0][0]} unique).")
print("> TOTAL:",answers[14].iloc[0][0],f"unmapped chains. Removing chains with issues, {answers[21].iloc[0][0]}.")
ifanswers[14].iloc[0][0]:
print("\t> this is normal if you used option --no-homology. Otherwise, there might be a problem.")
fam_list=[x[0]forxinsql_ask_database(conn,"SELECT rfam_acc from family ORDER BY rfam_acc ASC;")]
mappings_list={}
forkinfam_list:
mappings_list[k]=[x[0]forxinsql_ask_database(conn,f"SELECT chain_id from chain WHERE rfam_acc='{k}' and issue=0;")]
mappings_list[k]=[x[0]forxinsql_ask_database(conn,f"SELECT chain_id from chain JOIN structure ON chain.structure_id=structure.pdb_id WHERE rfam_acc='{k}' AND issue=0 AND resolution <= {res_thr};")]
# List the families for which we will compute sequence identity matrices
withsqlite3.connect("results/RNANet.db")asconn:
famlist=[x[0]forxinsql_ask_database(conn,"SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 0 ORDER BY rfam_acc ASC;")]
ignored=[x[0]forxinsql_ask_database(conn,"SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;")]
famlist=[x[0]forxinsql_ask_database(conn,"SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain WHERE issue = 0 GROUP BY rfam_acc) WHERE n_chains > 0 ORDER BY rfam_acc ASC;")]
ignored=[x[0]forxinsql_ask_database(conn,"SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain WHERE issue = 0 GROUP BY rfam_acc) WHERE n_chains < 3 ORDER BY rfam_acc ASC;")]
n_unmapped_chains=sql_ask_database(conn,"SELECT COUNT(*) FROM chain WHERE rfam_acc='unmappd' AND issue=0;")[0][0]
iflen(ignored):
print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:"," ".join(ignored)+'\n')