ifnot'['ins.id:# this is a Rfamseq entry, not a 3D chain
continue
# Filter the alignment: keep only the records whose id contains '['.
# The slice keeps the last len(names) records.
# NOTE(review): this presumably relies on those records being the last rows
# of the alignment -- confirm against the code that builds `align`.
# NOTE(review): with standard slice semantics an empty `names` turns this into
# align[0:], i.e. nothing is dropped -- confirm that case cannot occur.
names = [record.id for record in align if '[' in record.id]
align = align[-len(names):]

# Start from all the rows but no columns, then append the wanted columns
# one at a time, in the order given by `columns`.
filtered_alignment = align[:, 1:1]
for position in columns:
    # position - 1:position selects the single column at 0-based index position - 1
    filtered_alignment += align[:, position - 1:position]
db_id=sql_ask_database(conn,f"SELECT chain_id FROM chain WHERE structure_id = '{s.id.split('[')[0]}' AND chain_name = '{s.id.split('-')[1]}' AND rfam_acc = '{f}';")
iflen(db_id):
db_id=db_id[0][0]
else:
pbar.update(1)
continue
# Rebuild both sequences of the chain from its nucleotide rows, ordered by
# index_chain: nt_code gives seq, nt_align_code gives aliseq.
nt_rows = sql_ask_database(conn, f"SELECT nt_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;")
seq = ''.join(row[0] for row in nt_rows)
ali_rows = sql_ask_database(conn, f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;")
aliseq = ''.join(row[0] for row in ali_rows)
full_length = len(seq)
# detect gaps
c_seq=list(seq)# contains "ACGUNacgu-"
letters=['A','C','G','U','N']
homology_data=sql_ask_database(conn,f"""SELECT freq_A, freq_C, freq_G, freq_U, freq_other FROM
(SELECT chain_id, rfam_acc FROM chain WHERE chain_id={db_id})
NATURAL JOIN re_mapping
NATURAL JOIN align_column;
""")
ifhomology_dataisNoneornotlen(homology_data):
withopen(runDir+"/errors.txt","a")aserrf:
errf.write(f"No homology data found in the database for {s.id} ! Not replacing gaps.\n")
continue
eliflen(homology_data)!=full_length:
withopen(runDir+"/errors.txt","a")aserrf:
errf.write(f"Found {len(homology_data)} nucleotides for {s.id} of length {full_length} ! Not replacing gaps.\n")
# For every chain of family f, select structure_id, chain_name, pdb_start and
# pdb_end (both cast to text), interleaved with the literal separators
# '_1_', '_' and '-'.
chain_label_sql = f"SELECT structure_id, '_1_', chain_name, '_', CAST(pdb_start AS TEXT), '-', CAST(pdb_end AS TEXT) FROM chain WHERE rfam_acc='{f}';"
r = sql_ask_database(conn, chain_label_sql)
# Load the align_column rows of family f with index_ali > 0 into a DataFrame,
# ordered by ascending index_ali: nucleotide frequencies, gap_percent and
# consensus.
family_columns_sql = f"SELECT freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus FROM align_column WHERE rfam_acc = '{f}' AND index_ali > 0 ORDER BY index_ali ASC;"
df = pd.read_sql_query(family_columns_sql, conn)