db_id=sql_ask_database(conn,f"SELECT chain_id FROM chain WHERE structure_id = '{s.id.split('[')[0]}' AND chain_name = '{s.id.split('-')[1]}' AND rfam_acc = '{f}';")
db_id=sql_ask_database(conn,f"SELECT chain_id FROM chain WHERE structure_id = '{s.id.split('[')[0]}' AND chain_name = '{s.id.split('-')[1]}' AND rfam_acc = '{f}';")
iflen(db_id):
db_id=db_id[0][0]
else:
conn.close()
warn(f"Bizarre... sequence {s.id} is not found in the database ! Cannot remap it ! Ignoring...")
pbar.update(1)
continue
seq_to_align=''.join([x[0]forxinsql_ask_database(conn,f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;")])
full_length=len(seq_to_align)
conn.close()
warn(f"Bizarre... sequence {s.id} is not found in the database ! Cannot remap it ! Ignoring...")
pbar.update(1)
continue
seq_to_align=''.join([x[0]forxinsql_ask_database(conn,f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;")])
full_length=len(seq_to_align)
conn.close()
# Save columns in the appropriate positions
i=0# to iterate the object sequence
j=0# to iterate the alignment sequence
whilei<full_lengthandj<alilen:
whilei<full_lengthandj<ncols:
# Here we try to map seq_to_align (the sequence of the 3D chain, including gaps when residues are missing),
# with s.seq, the sequence aligned in the MSA, containing any of ACGU and two types of gaps, - and .
ifseq_to_align[i]==s.seq[j].upper():# alignment and sequence correspond (incl. gaps)
ifseq_to_align[i]==s.seq[j].upper():# alignment and sequence correspond (incl. gaps)
re_mappings.append((db_id,i+1,j+1))# because index_chain in table nucleotide is in [1,N], we use i+1 and j+1.
columns_to_save.add(j+1)# it's a set, doublons are automaticaly ignored
i+=1
j+=1
elifseq_to_align[i]=='-':# gap in the chain, but not in the aligned sequence
elifseq_to_align[i]=='-':# '-' in the chain, but '.' or letter in the aligned sequence
# search for a gap to the consensus nearby
k=0# Search must start at zero to assert the difference comes from '-' in front of '.'
ifnot'['ins.id:# this is a Rfamseq entry, not a 3D chain
continue
db_id=sql_ask_database(conn,f"SELECT chain_id FROM chain WHERE structure_id = '{s.id.split('[')[0]}' AND chain_name = '{s.id.split('-')[1]}' AND rfam_acc = '{f}';")
iflen(db_id):
db_id=db_id[0][0]
else:
pbar.update(1)
continue
seq=''.join([x[0]forxinsql_ask_database(conn,f"SELECT nt_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;")])
aliseq=''.join([x[0]forxinsql_ask_database(conn,f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;")])
full_length=len(seq)
# detect gaps
c_seq=list(seq)# contains "ACGUNacgu-"
letters=['A','C','G','U','N']
homology_data=sql_ask_database(conn,f"""SELECT freq_A, freq_C, freq_G, freq_U, freq_other FROM
(SELECT chain_id, rfam_acc FROM chain WHERE chain_id={db_id})
NATURAL JOIN re_mapping
NATURAL JOIN align_column;
""")
ifhomology_dataisNoneornotlen(homology_data):
withopen(runDir+"/errors.txt","a")aserrf:
errf.write(f"No homology data found in the database for {s.id} ! Not replacing gaps.\n")
continue
eliflen(homology_data)!=full_length:
withopen(runDir+"/errors.txt","a")aserrf:
errf.write(f"Found {len(homology_data)} nucleotides for {s.id} of length {full_length} ! Not replacing gaps.\n")
r=sql_ask_database(conn,f"SELECT structure_id, '_1_', chain_name, '_', CAST(pdb_start AS TEXT), '-', CAST(pdb_end AS TEXT) FROM chain WHERE rfam_acc='{f}';")
df=pd.read_sql_query(f"SELECT freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus FROM align_column WHERE rfam_acc = '{f}' AND index_ali > 0 ORDER BY index_ali ASC;",conn)