Louis BECQUEY

Merged Khodor-internship with ahead master

@@ -40,7 +40,7 @@ from Bio.SeqIO.FastaIO import FastaIterator, SimpleFastaParser
from Bio.Seq import MutableSeq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from pydca.plmdca import plmdca
runDir = os.getcwd()
def trace_unhandled_exceptions(func):
@@ -76,11 +76,11 @@ python_executable = "python"+".".join(platform.python_version().split('.')[:2])
validsymb = '\U00002705'
warnsymb = '\U000026A0'
errsymb = '\U0000274C'
LSU_set = {"RF00002", "RF02540", "RF02541",
"RF02543", "RF02546"} # From Rfam CLAN 00112
SSU_set = {"RF00177", "RF02542", "RF02545",
"RF01959", "RF01960"} # From Rfam CLAN 00111
no_nts_set = set()
weird_mappings = set()
@@ -845,14 +845,14 @@ class Downloader:
if os.path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"):
print("\t> Use of the previous version.\t", end="", flush=True)
else:
return pd.DataFrame([], columns=["class", "class_members"])
return pd.DataFrame([], columns=["class","representative","class_members"])
nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv")
full_structures_list = [ tuple(i[1]) for i in nrlist[['class', 'class_members']].iterrows() ]
full_structures_list = [ tuple(i[1]) for i in nrlist[["class","representative","class_members"]].iterrows() ]
print(f"\t{validsymb}", flush=True)
# The beginning of an adventure.
return full_structures_list # list of ( str (class), str (class_members) )
return full_structures_list # list of ( str (class), str (representative), str (class_members) )
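# Shape illustration (hypothetical values): each tuple holds the equivalence class
# name, its representative chain, and the '+'-separated class members, e.g.
# [("NR_4.0_00001.1", "1FFK|1|0", "1FFK|1|0+4V9D|1|AA+4V9D|1|BB"), ...]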
def download_from_SILVA(self, unit):
@@ -966,6 +966,9 @@ class Pipeline:
self.RUN_STATS = False
self.EXTRACT_CHAINS = False
self.REUSE_ALL = False
self.REDUNDANT = False
self.ALIGNOPTS = None
self.USESINA = False
self.SELECT_ONLY = None
self.ARCHIVE = False
self.SAVELOGS = True
@@ -981,8 +984,9 @@
setproctitle("RNANet.py process_options()")
try:
opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=", "seq-folder=", "keep-hetatm=", "only=", "maxcores=",
"from-scratch", "full-inference", "no-homology", "ignore-issues", "extract",
opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=", "seq-folder=", "keep-hetatm=",
"only=", "cmalign-opts=", "maxcores=", "sina", "from-scratch",
"full-inference", "no-homology", "redundant", "ignore-issues", "extract",
"all", "no-logs", "archive", "update-homologous", "version"])
except getopt.GetoptError as err:
print(err)
@@ -1006,6 +1010,7 @@
print("--------------------------------------------------------------------------------------------------------------")
print("-f [ --full-inference ]\t\tInfer new mappings even if Rfam already provides some. Yields more copies of"
"\n\t\t\t\t chains mapped to different families.")
print("--redundant\t\t\t\tStore the class members in the database thoughts to be redundant for predictions.")
print("-s\t\t\t\tRun statistics computations after completion")
print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.")
print("--keep-hetatm=False\t\t(True | False) Keep ions, waters and ligands in produced mmCIF files. "
@@ -1022,9 +1027,12 @@
print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files. Subfolders will be:"
"\n\t\t\t\t\trfam_sequences/fasta/\tCompressed hits to Rfam families"
"\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family")
print("--sina\t\t\tForce the RNANet to align large subunit LSU and small subunit SSU ribosomal RNA using sina instead of infernal,"
"\n\t\t\t\t\t the other RNA families will be aligned using infernal.")
print("--maxcores=…\t\t\tLimit the number of cores to use in parallel portions to reduce the simultaneous"
"\n\t\t\t\t need of RAM. Should be a number between 1 and your number of CPUs. Note that portions"
"\n\t\t\t\t of the pipeline already limit themselves to 50% or 70% of that number by default.")
print("--cmalign-opts=…\t\tA string of additional options to pass to cmalign aligner, e.g. \"--nonbanded --mxsize 2048\"")
print("--archive\t\t\tCreate tar.gz archives of the datapoints text files and the alignments,"
"\n\t\t\t\t and update the link to the latest archive. ")
print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications")
@@ -1042,8 +1050,8 @@
print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --no-logs' &")
sys.exit()
elif opt == '--version':
print("RNANet v1.3 beta, parallelized, Dockerized")
print("Last revision : Jan 2021")
print("RNANet v1.4 beta, parallelized, Dockerized")
print("Last revision : April 2021")
sys.exit()
elif opt == "-r" or opt == "--resolution":
assert float(arg) > 0.0 and float(arg) <= 20.0
@@ -1089,6 +1097,8 @@
path_to_seq_data + "realigned",
path_to_seq_data + "rfam_sequences"])
self.REUSE_ALL = True
elif opt == "cmalign-opts":
self.ALIGNOPTS = arg
elif opt == "--all":
self.REUSE_ALL = True
self.USE_KNOWN_ISSUES = False
@@ -1103,6 +1113,10 @@
ncores = min(ncores, int(arg))
elif opt == "-f" or opt == "--full-inference":
self.FULLINFERENCE = True
elif opt=="--redundant":
self.REDUNDANT=True
elif opt=="--sina":
self.USESINA=True
if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions":
print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments")
@@ -1151,7 +1165,8 @@
work_infer_mappings,
not self.REUSE_ALL,
allmappings,
self.FULLINFERENCE
self.FULLINFERENCE,
self.REDUNDANT
),
full_structures_list,
chunksize=1)):
@@ -1340,7 +1355,7 @@
joblist = []
for f in self.fam_list:
joblist.append(Job(function=work_prepare_sequences, how_many_in_parallel=ncores, args=[
self.dl, f, rfam_acc_to_download[f]]))
self.dl, self.USESINA, f, rfam_acc_to_download[f]]))
try:
execute_joblist(joblist)
@@ -1365,7 +1380,7 @@
joblist = []
for f in self.fam_list:
# the function already uses all CPUs so launch them one by one (how_many_in_parallel=1)
joblist.append(Job(function=work_realign, args=[f], how_many_in_parallel=1, label=f))
joblist.append(Job(function=work_realign, args=[self.USESINA, self.ALIGNOPTS, f], how_many_in_parallel=1, label=f))
# Execute the jobs
try:
@@ -1753,6 +1768,10 @@ def sql_define_tables(conn):
freq_G REAL,
freq_U REAL,
freq_other REAL,
fields_A REAL,
fields_C REAL,
fields_G REAL,
fields_U REAL,
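-- The fields_* columns are assumed to hold the single-site fields of the Potts
-- model inferred by pydca for each nucleotide (see work_pydca below).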
gap_percent REAL,
consensus CHAR(1),
cons_sec_struct CHAR(1),
@@ -1937,7 +1956,7 @@ def execute_joblist(fulljoblist):
return results
@trace_unhandled_exceptions
def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> list:
def work_infer_mappings(update_only, allmappings, fullinference, redundant, codelist) -> list:
"""Given a list of PDB chains corresponding to an equivalence class from BGSU's NR list,
build a list of Chain() objects mapped to Rfam families, by expanding available mappings
of any element of the list to all the list elements.
@@ -1950,8 +1969,8 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li
# Split the comma-separated list of chain codes into chain codes:
eq_class = codelist[0]
codes = codelist[1].replace('+', ',').split(',')
codes = codelist[2].replace('+', ',').split(',')
representative = codelist[1].replace('+', ',').split(',')[0]
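# Illustration (hypothetical codes): if "1FFK|1|0" has an Rfam mapping to RF02540 and
# shares its equivalence class with "4V9D|1|AA", the mapping is expanded to "4V9D|1|AA"
# as well; codelist[1] names the representative chain of the class.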
# Search for mappings that apply to an element of this PDB chains list:
for c in codes:
# search for Rfam mappings with this chain c:
@@ -2040,6 +2059,13 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li
# Now build Chain() objects for the mapped chains
for c in codes:
if not redundant and c != representative:
# By default, save only the representative member of the class.
# If --redundant is passed, save all chains of the class members.
continue
nr = c.split('|')
pdb_id = nr[0].lower()
pdb_model = int(nr[1])
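# e.g. a hypothetical code "4V9D|1|AA" gives pdb_id "4v9d" and pdb_model 1;
# the chain label in nr[2] is parsed just after this hunk.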
@@ -2202,13 +2228,12 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True):
return c
@trace_unhandled_exceptions
def work_prepare_sequences(dl, rfam_acc, chains):
def work_prepare_sequences(dl, useSina, rfam_acc, chains):
"""Prepares FASTA files of homologous sequences to realign with cmalign or SINA.
"""
setproctitle("RNAnet.py work_prepare_sequences()")
if rfam_acc in LSU_set | SSU_set: # rRNA
if useSina and rfam_acc in LSU_set | SSU_set:
if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"):
# Detect duplicates and remove them
existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta")
@@ -2290,109 +2315,123 @@ def work_prepare_sequences(dl, rfam_acc, chains):
notify(status)
@trace_unhandled_exceptions
def work_realign(rfam_acc):
""" Runs multiple sequence alignements by RNA family.
It aligns the Rfam hits from a RNA family with the sequences from the list of chains.
Rfam covariance models are used with Infernal tools, except for rRNAs.
cmalign requires too much RAM for them, so we use SINA, a specifically designed tool for rRNAs.
def use_sina(rfam_acc):
"""
setproctitle(f"RNAnet.py work_realign({rfam_acc})")
if rfam_acc in LSU_set | SSU_set:
# Ribosomal subunits deserve a special treatment.
# They require too much RAM to be aligned with Infernal.
# Then we will use SINA instead.
if rfam_acc in ["RF00177", "RF01960"]:
arbfile = "realigned/SSU.arb"
else:
arbfile = "realigned/LSU.arb"
# Run alignment
p = subprocess.run(["sina", "-i", path_to_seq_data + f"realigned/{rfam_acc}++.fa",
"-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
"-r", path_to_seq_data + arbfile,
"--meta-fmt=csv"])
When the user requests SINA, it is used to align the rRNA SSU and LSU families.
"""
if rfam_acc in ["RF00177", "RF01960"]:
arbfile = "realigned/SSU.arb"
else:
# Align using Infernal for most RNA families
if os.path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"):
# Alignment exists. We just want to add new sequences into it.
if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"):
# there are no new sequences to align...
return
arbfile = "realigned/LSU.arb"
existing_ali_path = path_to_seq_data + f"realigned/{rfam_acc}++.stk"
new_ali_path = path_to_seq_data + f"realigned/{rfam_acc}_new.stk"
# Run alignment
subprocess.run(["sina", "-i", path_to_seq_data + f"realigned/{rfam_acc}++.fa",
"-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
"-r", path_to_seq_data + arbfile,
"--meta-fmt=csv"])
# Align the new sequences
with open(new_ali_path, 'w') as o:
p1 = subprocess.run(["cmalign", "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins",
"--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv",
path_to_seq_data + f"realigned/{rfam_acc}.cm",
path_to_seq_data + f"realigned/{rfam_acc}_new.fa"],
stdout=o, stderr=subprocess.PIPE)
notify("Aligned new sequences together")
# Detect duplicates and remove them
existing_stk = AlignIO.read(existing_ali_path, "stockholm")
existing_ids = [r.id for r in existing_stk]
del existing_stk
new_stk = AlignIO.read(new_ali_path, "stockholm")
new_ids = [r.id for r in new_stk]
del new_stk
doublons = [i for i in existing_ids if i in new_ids]
del existing_ids, new_ids
if len(doublons):
warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version")
with open(path_to_seq_data + "realigned/toremove.txt", "w") as toremove:
toremove.write('\n'.join(doublons)+'\n')
p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
os.remove(path_to_seq_data + "realigned/toremove.txt")
# And we merge the two alignments
p2 = subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk",
"--rna", existing_ali_path, new_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8')
subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path])
notify("Merged alignments into one")
@trace_unhandled_exceptions
def use_infernal(rfam_acc, alignopts):
"""
Infernal is the default alignment tool, unless the user requests otherwise.
cmalign is used for all families, except when the user prefers to align rRNAs with SINA.
"""
if os.path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"):
# Alignment exists. We just want to add new sequences into it.
# remove the partial files
os.remove(new_ali_path)
os.remove(path_to_seq_data + f"realigned/{rfam_acc}_new.fa")
if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"):
# there are no new sequences to align...
return
else:
# Alignment does not exist yet. We need to compute it from scratch.
print(f"\t> Aligning {rfam_acc} sequences together (cmalign) ...", end='', flush=True)
existing_ali_path = path_to_seq_data + f"realigned/{rfam_acc}++.stk"
new_ali_path = path_to_seq_data + f"realigned/{rfam_acc}_new.stk"
p = subprocess.run(["cmalign", '-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk",
"--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins", "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv",
# Align the new sequences
with open(new_ali_path, 'w') as o:
p1 = subprocess.run(["cmalign", "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins",
"--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv",
"-o", path_to_seq_data + f"realigned/{rfam_acc}++.stk",
path_to_seq_data + f"realigned/{rfam_acc}.cm",
path_to_seq_data + f"realigned/{rfam_acc}++.fa"],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
stderr = p.stderr.decode("utf-8")
if len(stderr):
print('', flush=True)
warn(f"Error during sequence alignment: {stderr}", error=True)
with open(runDir + "/errors.txt", "a") as er:
er.write(f"Attempting to realign {rfam_acc}:\n" + stderr + '\n')
return 1
else:
print('\t'+validsymb, flush=True)
path_to_seq_data + f"realigned/{rfam_acc}_new.fa"],
stdout=o, stderr=subprocess.PIPE)
notify("Aligned new sequences together")
# Detect duplicates and remove them
existing_stk = AlignIO.read(existing_ali_path, "stockholm")
existing_ids = [r.id for r in existing_stk]
del existing_stk
new_stk = AlignIO.read(new_ali_path, "stockholm")
new_ids = [r.id for r in new_stk]
del new_stk
doublons = [i for i in existing_ids if i in new_ids]
del existing_ids, new_ids
if len(doublons):
warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version")
with open(path_to_seq_data + "realigned/toremove.txt", "w") as toremove:
toremove.write('\n'.join(doublons)+'\n')
p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
os.remove(path_to_seq_data + "realigned/toremove.txt")
# And we merge the two alignments
p2 = subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk",
"--rna", existing_ali_path, new_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8')
subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path])
notify("Merged alignments into one")
# remove the partial files
os.remove(new_ali_path)
os.remove(path_to_seq_data + f"realigned/{rfam_acc}_new.fa")
else:
# Alignment does not exist yet. We need to compute it from scratch.
print(f"\t> Aligning {rfam_acc} sequences together (cmalign) ...", end='', flush=True)
cmd = ["cmalign"]
if alignopts is not None:
cmd += " ".split(alignopts)
cmd += ['-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk",
"--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins",
"--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv",
path_to_seq_data + f"realigned/{rfam_acc}.cm",
path_to_seq_data + f"realigned/{rfam_acc}++.fa"]
p = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
stderr = p.stderr.decode("utf-8")
if len(stderr):
print('', flush=True)
warn(f"Error during sequence alignment: {stderr}", error=True)
with open(runDir + "/errors.txt", "a") as er:
er.write(f"Attempting to realign {rfam_acc}:\n" + stderr + '\n')
return 1
else:
print('\t'+validsymb, flush=True)
# Convert Stockholm to aligned FASTA
subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
# Convert Stockholm to aligned FASTA
subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
"--informat", "stockholm",
"afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"])
subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part.
subprocess.run(["rm", "-f", "esltmp*"]) # We can use a joker here, because we are not running in parallel for this part.
@trace_unhandled_exceptions
def work_realign(useSina, alignopts, rfam_acc):
""" Runs multiple sequence alignements by RNA family.
It aligns the Rfam hits from a RNA family with the sequences from the list of chains.
Rfam covariance models are used with Infernal tools or SINA based on the options provided by the user.
Even if the user prefers to use SINA it will be used only for rRNA and cmalign for other families
"""
setproctitle(f"RNAnet.py work_realign({rfam_acc})")
if useSina and rfam_acc in LSU_set | SSU_set:
use_sina(rfam_acc)
else:
use_infernal(rfam_acc, alignopts)
# Assert everything worked, or save an error
with open(path_to_seq_data + f"realigned/{rfam_acc}++.afa", 'r') as output:
if not len(output.readline()):
@@ -2402,6 +2441,73 @@ def work_realign(rfam_acc):
er.write(f"Failed to realign {rfam_acc} (killed)")
@trace_unhandled_exceptions
def work_pydca(f, columns_to_save):
"""
This function writes an alignment file containing only the columns which will be saved to the database,
converted to uppercase, and without non-ACGU nucleotides.
This file is then used by pydca to compute DCA features, and finally removed.
"""
align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")
for s in align:
s.seq = s.seq.upper() # Convert to uppercase, as required by pydca
filtered_alignment = align[:, 1:1] # all the lines, but no columns
for p in columns_to_save:
filtered_alignment += align[:, p-1:p] # save columns one by one
# Replace all other letters with a deletion gap, because pydca
# only accepts the symbols A, C, G, U, '.' and '-'.
for s in filtered_alignment:
for i in range(len(s.seq)):
if s.seq[i].upper() not in "ACGU-.":
s.seq[i] = '-'
# Create a fasta file to be used by pydca
with open(path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa", "w") as only_3d:
try:
only_3d.write(format(filtered_alignment, "fasta"))
except ValueError as e:
warn(e)
# PyDCA instance with options:
# lambda_J is set by pydca to 0.2*(L-1), where L is the length of the sequences.
# The maximum number of gradient-descent iterations is set to 500.
# lambda_h is set to 1 and seqid to 0.8, as suggested by the pydca paper:
# Zerihun MB, Pucci F, Peter EK, Schug A. pydca v1.0: a comprehensive software for direct coupling analysis of RNA and protein sequences.
# Bioinformatics. 2020;36(7):2264-2265. doi:10.1093/bioinformatics/btz892 - https://pubmed.ncbi.nlm.nih.gov/31778142/
plmdca_inst = plmdca.PlmDCA(path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa",
"rna", seqid = 0.8, lambda_h = 1.0, num_threads = 10, max_iterations = 500)
number_of_sites = len(columns_to_save)*(len(columns_to_save)-1)//2 # L*(L-1)/2, where L = len(columns_to_save)
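# Worked example: with L = 10 saved columns, number_of_sites = 10*9//2 = 45 pairs.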
# Tuple of two lists of tuples:
# - the first list contains the fields of single sites (nucleotides),
# - the second contains the pairwise couplings (pairs of nucleotides).
# linear_dist is set to zero in order to keep all possible pairs:
# with linear_dist = x > 0, pydca only returns positions with |i-j| > x,
# which would make us lose a lot of pairs.
params = plmdca_inst.compute_params(linear_dist=0, num_site_pairs=number_of_sites)
# Frobenius norm with average product correction
fn_apc = plmdca_inst.compute_sorted_FN_APC()
# Save to file
np.savez(path_to_seq_data+f"/realigned/{f}_pydca.npz", PARAMS=params, FNAPC=fn_apc)
# Build a dictionary used by the function that stores the frequencies in the align_column table
return_dict_fields = {}
for list_fields in params[0]:
# The element at index 0 is the column index in the filtered alignment,
# so looking up columns_to_save at that index gives the alignment column
# whose fields are to be stored in align_column.
return_dict_fields[columns_to_save[list_fields[0]]] = list_fields[1]
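# Shape assumed from the INSERT statement in work_pssm_remap: return_dict_fields maps an
# alignment column index to that column's four field values, consumed as (fields_A, fields_C, fields_G, fields_U).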
# Cleanup
subprocess.run(["rm", "-f", path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa"])
return return_dict_fields
@trace_unhandled_exceptions
def work_pssm_remap(f):
"""Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family.
This also remaps the 3D object sequence with the aligned sequence in the MSA.
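Per MSA column, the PSSM stores the observed nucleotide frequencies
(freq_A, freq_C, freq_G, freq_U, freq_other) and the gap percentage; for example,
a fully conserved A column would give freq_A close to 1.0 and the others near 0.0.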
@@ -2477,7 +2583,6 @@ def work_pssm_remap(f):
# At this point, pssm_info is a numpy array containing the PSSM and consensus a list of consensus chars.
##########################################################################################
# Remap sequences of the 3D chains with sequences in the alignment
##########################################################################################
@@ -2560,11 +2665,29 @@ def work_pssm_remap(f):
# Get a sorted list from the set
columns = sorted(columns_to_save)
# Save the re_mappings
conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0)
conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
sql_execute(conn, """INSERT INTO re_mapping (chain_id, index_chain, index_ali)
VALUES (?, ?, ?)
ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;""",
many=True, data=re_mappings)
# Delete alignment columns that are not used anymore from the database
current_family_columns = [ x[0] for x in sql_ask_database(conn, f"SELECT index_ali FROM align_column WHERE rfam_acc = '{f}';")]
unused = []
for col in current_family_columns:
if col not in columns_to_save:
unused.append((f, col))
sql_execute(conn, """DELETE FROM align_column WHERE rfam_acc = ? AND index_ali = ?;""", many=True, data=unused)
conn.commit()
##########################################################################################
# Save the alignment columns and their mappings to the database
# Retrieve or compute metadata relative to the MSA columns
##########################################################################################
setproctitle(f"RNAnet.py work_pssm_remap({f}) saving")
setproctitle(f"RNAnet.py work_pssm_remap({f}) insert/match states")
# Get back the information of match/insertion states from the STK file
if f not in SSU_set and f not in LSU_set:
@@ -2594,32 +2717,22 @@ def work_pssm_remap(f):
cm_coords = [ None for x in range(ncols) ]
cm_2d = [ None for x in range(ncols) ]
# Save the re_mappings
conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0)
conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
sql_execute(conn, """INSERT INTO re_mapping (chain_id, index_chain, index_ali)
VALUES (?, ?, ?)
ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;""",
many=True, data=re_mappings)
setproctitle(f"RNAnet.py work_pssm_remap({f}) Potts model, DCA")
# Delete alignment columns that are not used anymore from the database
current_family_columns = [ x[0] for x in sql_ask_database(conn, f"SELECT index_ali FROM align_column WHERE rfam_acc = '{f}';")]
unused = []
for col in current_family_columns:
if col not in columns_to_save:
unused.append((f, col))
sql_execute(conn, """DELETE FROM align_column WHERE rfam_acc = ? AND index_ali = ?;""", many=True, data=unused)
conn.commit()
rfam_fields_record = work_pydca(f, columns)
# Save the useful columns in the database
data = [(f, j, cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + (consensus[j-1], cm_2d[j-1]) for j in sorted(columns_to_save)]
sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO
data = [(f, j, cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + tuple(rfam_fields_record[j]) + (consensus[j-1], cm_2d[j-1]) for j in sorted(columns_to_save)]
sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, fields_A, fields_C, fields_G, fields_U, gap_percent, consensus, cons_sec_struct)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO
UPDATE SET cm_coord=excluded.cm_coord, freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U,
freq_other=excluded.freq_other, gap_percent=excluded.gap_percent, consensus=excluded.consensus, cons_sec_struct=excluded.cons_sec_struct;""", many=True, data=data)
freq_other=excluded.freq_other, fields_A=excluded.fields_A, fields_C=excluded.fields_C, fields_G=excluded.fields_G, fields_U=excluded.fields_U,
gap_percent=excluded.gap_percent, consensus=excluded.consensus, cons_sec_struct=excluded.cons_sec_struct;""", many=True, data=data)
# Add an "unknown values" column with index_ali 0 (for nucleotides unresolved in 3D: they yield a gap '-' but face a letter in the alignment)
sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct)
VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', NULL);""", data=(f,))
sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other,
fields_A, fields_C, fields_G, fields_U, gap_percent, consensus, cons_sec_struct)
VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', NULL);""", data=(f,))
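# Unresolved 3D nucleotides can thus point to this sentinel column (index_ali 0)
# instead of a real alignment column.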
# Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains)
sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f))
conn.close()
@@ -2663,9 +2776,10 @@ def work_save(c, homology=True):
if homology:
df = pd.read_sql_query(f"""
SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, cm_coord,
is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct, dbn,
paired, nb_interact, pair_type_LW, pair_type_DSSR, alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta,
chi, bb_type, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base,
is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, fields_A, fields_C, fields_G,
fields_U, gap_percent, consensus, cons_sec_struct, dbn, paired, nb_interact, pair_type_LW, pair_type_DSSR,
alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, chi, bb_type, glyco_bond, form, ssZp, Dp,
eta, theta, eta_prime, theta_prime, eta_base, theta_base,
v0, v1, v2, v3, v4, amplitude, phase_angle, puckering FROM
(SELECT chain_id, rfam_acc from chain WHERE chain_id = {c.db_chain_id})
NATURAL JOIN re_mapping
@@ -1206,7 +1206,7 @@ if __name__ == "__main__":
sys.exit()
elif opt == '--version':
print("RNANet statistics 1.3 beta")
print("RNANet statistics 1.4 beta")
sys.exit()
elif opt == "-r" or opt == "--resolution":
assert float(arg) > 0.0 and float(arg) <= 20.0