Louis BECQUEY

Splitted Aglaé's code in a separate stats file

......@@ -31,7 +31,7 @@ import time
import traceback
import warnings
from functools import partial, wraps
from multiprocessing import Pool, Manager
from multiprocessing import Pool, Manager, Value
from time import sleep
from tqdm import tqdm
from setproctitle import setproctitle
......@@ -45,6 +45,12 @@ from Bio.PDB.PDBIO import Select
runDir = os.getcwd()
def trace_unhandled_exceptions(func):
"""
Captures exceptions even in parallel sections of the code and child processes,
and throws logs in red to stderr and to errors.txt.
Should be defined before the classes that use it.
"""
@wraps(func)
def wrapped_func(*args, **kwargs):
try:
......@@ -60,27 +66,27 @@ def trace_unhandled_exceptions(func):
print(s)
return wrapped_func
pd.set_option('display.max_rows', None)
sqlite3.enable_callback_tracebacks(True)
sqlite3.register_adapter(np.int64, lambda val: int(val)) # Tell Sqlite what to do with <class numpy.int64> objects ---> convert to int
sqlite3.register_adapter(np.float64, lambda val: float(val)) # Tell Sqlite what to do with <class numpy.float64> objects ---> convert to float
m = Manager()
running_stats = m.list()
running_stats.append(0) # n_launched
running_stats.append(0) # n_finished
running_stats.append(0) # n_skipped
# m = Manager()
# running_stats = m.list()
# running_stats.append(0) # n_launched
# running_stats.append(0) # n_finished
# running_stats.append(0) # n_skipped
n_launched = Value('i', 0)
n_finished = Value('i', 0)
n_skipped = Value('i', 0)
path_to_3D_data = "tobedefinedbyoptions"
path_to_seq_data = "tobedefinedbyoptions"
python_executable = "python"+".".join(platform.python_version().split('.')[:2]) # Cuts python3.8.1 into python3.8 for example.
validsymb = '\U00002705'
warnsymb = '\U000026A0'
errsymb = '\U0000274C'
LSU_set = {"RF00002", "RF02540", "RF02541",
"RF02543", "RF02546"} # From Rfam CLAN 00112
SSU_set = {"RF00177", "RF02542", "RF02545",
"RF01959", "RF01960"} # From Rfam CLAN 00111
LSU_set = {"RF00002", "RF02540", "RF02541", "RF02543", "RF02546"} # From Rfam CLAN 00112
SSU_set = {"RF00177", "RF02542", "RF02545", "RF01959", "RF01960"} # From Rfam CLAN 00111
no_nts_set = set()
weird_mappings = set()
......@@ -103,17 +109,15 @@ class MutableFastaIterator(FastaIterator):
first_word = title.split(None, 1)[0]
except IndexError:
assert not title, repr(title)
# Should we use SeqRecord default for no ID?
first_word = ""
yield SeqRecord(
MutableSeq(sequence), id=first_word, name=first_word, description=title,
)
yield SeqRecord(MutableSeq(sequence), id=first_word, name=first_word, description=title)
class SelectivePortionSelector(object):
"""Class passed to MMCIFIO to select some chain portions in an MMCIF file.
Validates every chain, residue, nucleotide, to say if it is in the selection or not.
The primary use is to select the portion of a chain which is mapped to a family.
"""
def __init__(self, model_id, chain_id, valid_resnums, khetatm):
......@@ -156,123 +160,6 @@ class SelectivePortionSelector(object):
return 1
_select=Select()
def save_mmcif(ioobj, out_file, select=_select, preserve_atom_numbering=False):
# reuse and modification of the source code of Biopython
# to have the 2 columns of numbering of residues numbered with the index_chain of DSSR
if isinstance(out_file, str):
fp = open(out_file, "w")
close_file = True
else:
fp = out_file
close_file = False
atom_dict = defaultdict(list)
for model in ioobj.structure.get_list():
if not select.accept_model(model):
continue
# mmCIF files with a single model have it specified as model 1
if model.serial_num == 0:
model_n = "1"
else:
model_n = str(model.serial_num)
# This is used to write label_entity_id and label_asym_id and
# increments from 1, changing with each molecule
entity_id = 0
if not preserve_atom_numbering:
atom_number = 1
for chain in model.get_list():
if not select.accept_chain(chain):
continue
chain_id = chain.get_id()
if chain_id == " ":
chain_id = "."
# This is used to write label_seq_id,
# remaining blank for hetero residues
prev_residue_type = ""
prev_resname = ""
for residue in chain.get_unpacked_list():
if not select.accept_residue(residue):
continue
hetfield, resseq, icode = residue.get_id()
if hetfield == " ":
residue_type = "ATOM"
label_seq_id = str(resseq)
else:
residue_type = "HETATM"
label_seq_id = "."
resseq = str(resseq)
if icode == " ":
icode = "?"
resname = residue.get_resname()
# Check if the molecule changes within the chain
# This will always increment for the first residue in a
# chain due to the starting values above
if residue_type != prev_residue_type or (
residue_type == "HETATM" and resname != prev_resname
):
entity_id += 1
prev_residue_type = residue_type
prev_resname = resname
label_asym_id = ioobj._get_label_asym_id(entity_id)
for atom in residue.get_unpacked_list():
if select.accept_atom(atom):
atom_dict["_atom_site.group_PDB"].append(residue_type)
if preserve_atom_numbering:
atom_number = atom.get_serial_number()
atom_dict["_atom_site.id"].append(str(atom_number))
if not preserve_atom_numbering:
atom_number += 1
element = atom.element.strip()
if element == "":
element = "?"
atom_dict["_atom_site.type_symbol"].append(element)
atom_dict["_atom_site.label_atom_id"].append(
atom.get_name().strip()
)
altloc = atom.get_altloc()
if altloc == " ":
altloc = "."
atom_dict["_atom_site.label_alt_id"].append(altloc)
atom_dict["_atom_site.label_comp_id"].append(
resname.strip()
)
atom_dict["_atom_site.label_asym_id"].append(label_asym_id)
# The entity ID should be the same for similar chains
# However this is non-trivial to calculate so we write "?"
atom_dict["_atom_site.label_entity_id"].append("?")
atom_dict["_atom_site.label_seq_id"].append(label_seq_id)
atom_dict["_atom_site.pdbx_PDB_ins_code"].append(icode)
coord = atom.get_coord()
atom_dict["_atom_site.Cartn_x"].append("%.3f" % coord[0])
atom_dict["_atom_site.Cartn_y"].append("%.3f" % coord[1])
atom_dict["_atom_site.Cartn_z"].append("%.3f" % coord[2])
atom_dict["_atom_site.occupancy"].append(
str(atom.get_occupancy())
)
atom_dict["_atom_site.B_iso_or_equiv"].append(
str(atom.get_bfactor())
)
atom_dict["_atom_site.auth_seq_id"].append(resseq)
atom_dict["_atom_site.auth_asym_id"].append(chain_id)
atom_dict["_atom_site.pdbx_PDB_model_num"].append(model_n)
# Data block name is the structure ID with special characters removed
structure_id = ioobj.structure.id
for c in ["#", "$", "'", '"', "[", "]", " ", "\t", "\n"]:
structure_id = structure_id.replace(c, "")
atom_dict["data_"] = structure_id
# Set the dictionary and write out using the generic dictionary method
ioobj.dic = atom_dict
ioobj._save_dict(fp)
if close_file:
fp.close()
class Chain:
"""
The object which stores all our data and the methods to process it.
......@@ -424,13 +311,11 @@ class Chain:
for atom in list(res.get_atoms()):
# rename the remaining phosphate group to P, OP1, OP2, OP3
if atom.get_name() in ['PA', 'O1A', 'O2A', 'O3A'] and res_name != 'RIA':
# RIA is a residue made up of 2 riboses and 2 phosphates,
# so it has an O2A atom between the C2A and C1 'atoms,
# and it also has an OP2 atom attached to one of its phosphates
# (see chains 6fyx_1_1, 6zu9_1_1, 6fyy_1_1, 6gsm_1_1 , 3jaq_1_1 and 1yfg_1_A)
# we do not modify the atom names of RIA residue
# RIA is a residue made up of 2 riboses and 2 phosphates,
# so it has an O2A atom between the C2A and C1 'atoms,
# and it also has an OP2 atom attached to one of its phosphates
# (see chains 6fyx_1_1, 6zu9_1_1, 6fyy_1_1, 6gsm_1_1 , 3jaq_1_1 and 1yfg_1_A)
# we do not modify the atom names of RIA residue
if atom.get_name() == 'PA':
atom_name = 'P'
if atom.get_name() == 'O1A':
......@@ -440,7 +325,7 @@ class Chain:
if atom.get_name() == 'O3A':
atom_name = 'OP3'
new_atom_t = pdb.Atom.Atom(atom_name, atom.get_coord(), atom.get_bfactor(), atom.get_occupancy(), atom.get_altloc(), atom_name, atom.get_serial_number())
else :
else:
new_atom_t=atom.copy()
new_residu_t.add(new_atom_t)
new_chain_t.add(new_residu_t)
......@@ -787,7 +672,8 @@ class Chain:
return df
def register_chain(self, df):
"""Saves the extracted 3D data to the database.
"""
Saves the extracted 3D data to the database.
"""
setproctitle(f"RNANet.py {self.chain_label} register_chain()")
......@@ -920,6 +806,10 @@ class Monitor:
class Downloader:
"""
An object with methods to download public data from the internet.
"""
def download_Rfam_PDB_mappings(self):
"""Query the Rfam public MySQL database for mappings between their RNA families and PDB structures.
......@@ -1170,6 +1060,10 @@ class Mapping:
class Pipeline:
"""
The RNANet pipeline steps.
"""
def __init__(self):
self.dl = Downloader()
self.known_issues = [] # list of chain_labels to ignore
......@@ -1189,6 +1083,7 @@ class Pipeline:
self.REUSE_ALL = False
self.REDUNDANT = False
self.ALIGNOPTS = None
self.RRNAALIGNOPTS = "--mxsize 8192 --cpu 10 --maxtau 0.1"
self.STATSOPTS = None
self.USESINA = False
self.SELECT_ONLY = None
......@@ -1207,7 +1102,7 @@ class Pipeline:
try:
opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=", "seq-folder=", "keep-hetatm=",
"only=", "cmalign-opts=", "stats-opts=", "maxcores=", "sina", "from-scratch",
"only=", "cmalign-opts=", "cmalign-rrna-opts=", "stats-opts=", "maxcores=", "sina", "from-scratch",
"full-inference", "no-homology", "redundant", "ignore-issues", "extract",
"all", "no-logs", "archive", "update-homologous", "version"])
except getopt.GetoptError as err:
......@@ -1323,6 +1218,8 @@ class Pipeline:
self.REUSE_ALL = True
elif opt == "cmalign-opts":
self.ALIGNOPTS = arg
elif opt == "cmalign-rrna-opts":
self.RRNAALIGNOPTS = arg
elif opt == "stats-opts":
self.STATSOPTS = " ".split(arg)
elif opt == "--all":
......@@ -1382,7 +1279,7 @@ class Pipeline:
# If self.FULLINFERENCE is False, the extended list is already filtered to remove
# the chains which already are in the database.
print("> Building list of structures...", flush=True)
p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=ncores)
p = Pool(initializer=init_with_tqdm, initargs=(tqdm.get_lock(),), processes=ncores)
try:
pbar = tqdm(full_structures_list, maxinterval=1.0, miniters=1,
......@@ -1491,7 +1388,7 @@ class Pipeline:
else:
mmcif_list = sorted(set([c.pdb_id for c in self.update]))
try:
p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=int(coeff_ncores*ncores))
p = Pool(initializer=init_with_tqdm, initargs=(tqdm.get_lock(),), processes=int(coeff_ncores*ncores))
pbar = tqdm(mmcif_list, maxinterval=1.0, miniters=1, desc="mmCIF files")
for _ in p.imap_unordered(work_mmcif, mmcif_list, chunksize=1):
pbar.update(1) # Everytime the iteration finishes, update the global progress bar
......@@ -1634,7 +1531,11 @@ class Pipeline:
joblist = []
for f in self.fam_list:
# the function already uses all CPUs so launch them one by one (how_many_in_parallel=1)
joblist.append(Job(function=work_realign, args=[self.USESINA, self.ALIGNOPTS, f], how_many_in_parallel=1, label=f))
if f in LSU_set or f in SSU_set:
opts = self.RRNAALIGNOPTS
else:
opts = self.ALIGNOPTS
joblist.append(Job(function=work_realign, args=[self.USESINA, opts, f], how_many_in_parallel=1, label=f))
# Execute the jobs
try:
......@@ -1684,7 +1585,7 @@ class Pipeline:
# Start a process pool to dispatch the RNA families,
# over multiple CPUs (one family by CPU)
p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
p = Pool(initializer=init_with_tqdm, initargs=(tqdm.get_lock(),), processes=nworkers)
try:
fam_pbar = tqdm(total=len(self.fam_list), desc="RNA families", position=0, leave=True)
......@@ -1741,7 +1642,7 @@ class Pipeline:
os.makedirs(path_to_3D_data + "datapoints/")
# Save to by-chain CSV files
p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=3)
p = Pool(initializer=init_with_tqdm, initargs=(tqdm.get_lock(),), processes=3)
try:
pbar = tqdm(total=len(self.loaded_chains), desc="Saving chains to CSV", position=0, leave=True)
for _, _2 in enumerate(p.imap_unordered(work_save, self.loaded_chains)):
......@@ -1867,6 +1768,7 @@ class Pipeline:
conn.close()
# ==================== General helper functions =====================
def read_cpu_number():
"""This function reads the number of CPU cores available from /proc/cpuinfo.
......@@ -1876,13 +1778,29 @@ def read_cpu_number():
p = subprocess.run(['grep', '-Ec', '(Intel|AMD)', '/proc/cpuinfo'], stdout=subprocess.PIPE)
return int(int(p.stdout.decode('utf-8')[:-1])/2)
def init_worker(tqdm_lock=None):
def init_with_tqdm(tqdm_lock=None):
"""
This initiation method kills the children when signal is received,
and the children progress is followed using TQDM progress bars.
"""
signal.signal(signal.SIGINT, signal.SIG_IGN)
if tqdm_lock is not None:
tqdm.set_lock(tqdm_lock)
def init_no_tqdm(arg1, arg2, arg3):
"""
This initiaiton method does not kill the children when signal is received,
they will complete and die even after the main process stops.
The children progress is followed using stdout text logs (notify(), warn(), etc)
"""
global n_launched, n_finished, n_skipped
n_launched = arg1
n_finished = arg2
n_skipped = arg3
def warn(message, error=False):
"""Pretty-print warnings and error messages.
"""
Pretty-print warnings and error messages.
"""
# Cut if too long
if len(message) > 66:
......@@ -1900,20 +1818,133 @@ def warn(message, error=False):
print(f"\t> \033[33mWARN: {message:64s}\033[0m\t{warnsymb}", flush=True)
def notify(message, post=''):
"""
Pretty-print successful finished tasks.
"""
if len(post):
post = '(' + post + ')'
print(f"\t> {message:70s}\t{validsymb}\t{post}", flush=True)
def _mutable_SeqIO_to_alignment_iterator(handle):
records = list(MutableFastaIterator(handle))
if records:
yield MultipleSeqAlignment(records)
# ========================= Biopython overloads =====================
def parse(handle):
with open(handle, 'r') as fp:
yield from _mutable_SeqIO_to_alignment_iterator(fp)
def save_mmcif(ioobj, out_file, select=Select(), preserve_atom_numbering=False):
"""
MMCIF writer which renumbers residues according to the RNANet index_chain (coming from DSSR).
"""
if isinstance(out_file, str):
fp = open(out_file, "w")
close_file = True
else:
fp = out_file
close_file = False
atom_dict = defaultdict(list)
# Iterate on models
for model in ioobj.structure.get_list():
if not select.accept_model(model):
continue
# mmCIF files with a single model have it specified as model 1
if model.serial_num == 0:
model_n = "1"
else:
model_n = str(model.serial_num)
# This is used to write label_entity_id and label_asym_id and
# increments from 1, changing with each molecule
entity_id = 0
if not preserve_atom_numbering:
atom_number = 1
# Iterate on chains
for chain in model.get_list():
if not select.accept_chain(chain):
continue
chain_id = chain.get_id()
if chain_id == " ":
chain_id = "."
# This is used to write label_seq_id, remaining blank for hetero residues
prev_residue_type = ""
prev_resname = ""
# Iterate on residues
for residue in chain.get_unpacked_list():
if not select.accept_residue(residue):
continue
hetfield, resseq, icode = residue.get_id()
if hetfield == " ":
residue_type = "ATOM"
label_seq_id = str(resseq)
else:
residue_type = "HETATM"
label_seq_id = "."
resseq = str(resseq)
if icode == " ":
icode = "?"
resname = residue.get_resname()
# Check if the molecule changes within the chain.
# This will always increment for the first residue in a
# chain due to the starting values above
if residue_type != prev_residue_type or (residue_type == "HETATM" and resname != prev_resname):
entity_id += 1
prev_residue_type = residue_type
prev_resname = resname
label_asym_id = ioobj._get_label_asym_id(entity_id)
# Iterate on atoms
for atom in residue.get_unpacked_list():
if select.accept_atom(atom):
atom_dict["_atom_site.group_PDB"].append(residue_type)
if preserve_atom_numbering:
atom_number = atom.get_serial_number()
atom_dict["_atom_site.id"].append(str(atom_number))
if not preserve_atom_numbering:
atom_number += 1
element = atom.element.strip()
if element == "":
element = "?"
atom_dict["_atom_site.type_symbol"].append(element)
atom_dict["_atom_site.label_atom_id"].append(atom.get_name().strip())
altloc = atom.get_altloc()
if altloc == " ":
altloc = "."
atom_dict["_atom_site.label_alt_id"].append(altloc)
atom_dict["_atom_site.label_comp_id"].append(resname.strip())
atom_dict["_atom_site.label_asym_id"].append(label_asym_id)
# The entity ID should be the same for similar chains
# However this is non-trivial to calculate so we write "?"
atom_dict["_atom_site.label_entity_id"].append("?")
atom_dict["_atom_site.label_seq_id"].append(label_seq_id)
atom_dict["_atom_site.pdbx_PDB_ins_code"].append(icode)
coord = atom.get_coord()
atom_dict["_atom_site.Cartn_x"].append("%.3f" % coord[0])
atom_dict["_atom_site.Cartn_y"].append("%.3f" % coord[1])
atom_dict["_atom_site.Cartn_z"].append("%.3f" % coord[2])
atom_dict["_atom_site.occupancy"].append(str(atom.get_occupancy()))
atom_dict["_atom_site.B_iso_or_equiv"].append(str(atom.get_bfactor()) )
atom_dict["_atom_site.auth_seq_id"].append(resseq)
atom_dict["_atom_site.auth_asym_id"].append(chain_id)
atom_dict["_atom_site.pdbx_PDB_model_num"].append(model_n)
# Data block name is the structure ID with special characters removed
structure_id = ioobj.structure.id
for c in ["#", "$", "'", '"', "[", "]", " ", "\t", "\n"]:
structure_id = structure_id.replace(c, "")
atom_dict["data_"] = structure_id
# Set the dictionary and write out using the generic dictionary method
ioobj.dic = atom_dict
ioobj._save_dict(fp)
if close_file:
fp.close()
def read(handle):
"""
A shortcut to parse alignment files with our custom class MutableFastaIterator.
"""
iterator = parse(handle)
try:
alignment = next(iterator)
......@@ -1926,6 +1957,25 @@ def read(handle):
pass
return alignment
def parse(handle):
"""
A shortcut to parse alignment files with our custom class MutableFastaIterator.
Called by function read().
"""
with open(handle, 'r') as fp:
yield from _mutable_SeqIO_to_alignment_iterator(fp)
def _mutable_SeqIO_to_alignment_iterator(handle):
"""
A shortcut to parse alignment files with our custom class MutableFastaIterator.
Used by the parse() function.
"""
records = list(MutableFastaIterator(handle))
if records:
yield MultipleSeqAlignment(records)
# ========================== SQL related ============================
def sql_define_tables(conn):
conn.executescript(
""" PRAGMA foreign_keys = on;
......@@ -2085,12 +2135,19 @@ def sql_execute(conn, sql, many=False, data=None, warn_every=10):
time.sleep(0.2)
warn("Tried to reach database 100 times and failed. Aborting.", error=True)
# ======================= RNANet Jobs and tasks ======================
@trace_unhandled_exceptions
def execute_job(j, jobcount):
"""Run a Job object.
"""
Run a Job object.
"""
global n_launched, n_skipped, n_finished
# increase the counter of running jobs
running_stats[0] += 1
with n_launched.get_lock():
n_launched.value += 1
# Monitor this process
m = -1
......@@ -2098,7 +2155,7 @@ def execute_job(j, jobcount):
if len(j.cmd_): # The job is a system command
print(f"[{running_stats[0]+running_stats[2]}/{jobcount}]\t{j.label}")
print(f"[{n_launched.value+n_skipped.value}/{jobcount}]\t{j.label}")
# Add the command to logfile
os.makedirs(runDir+"/logs", exist_ok=True)
......@@ -2114,9 +2171,20 @@ def execute_job(j, jobcount):
# run the command. subprocess.run will be a child of this process, and stays monitored.
start_time = time.time()
r = subprocess.run(j.cmd_, timeout=j.timeout_,
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
r = subprocess.run(j.cmd_, timeout=j.timeout_, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
end_time = time.time()
if r.returncode != 0:
if r.stderr is not None:
print(r.stderr, flush=True)
print(f"[{n_launched.value+n_skipped.value}/{jobcount}]\tIssue faced with {j.label}, skipping it and adding it to known issues (if not known).")
with n_launched.get_lock():
n_launched.value -= 1
with n_skipped.get_lock():
n_skipped.value += 1
if j.label not in issues:
issues.add(j.label)
with open("known_issues.txt", "a") as iss:
iss.write(j.label+"\n")
# Stop the Monitor, then get its result
monitor.keep_watching = False
......@@ -2124,7 +2192,7 @@ def execute_job(j, jobcount):
elif j.func_ is not None:
print(f"[{running_stats[0]+running_stats[2]}/{jobcount}]\t{j.func_.__name__}({', '.join([str(a) for a in j.args_ if type(a) != list])})", flush=True)
print(f"[{n_launched.value+n_skipped.value}/{jobcount}]\t{j.func_.__name__}({', '.join([str(a) for a in j.args_ if type(a) != list])})", flush=True)
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
# put the monitor in a different thread
......@@ -2193,7 +2261,7 @@ def execute_joblist(fulljoblist):
print("using", n, "processes:")
# execute jobs of priority i that should be processed n by n:
p = Pool(processes=n, maxtasksperchild=1, initializer=init_worker)
p = Pool(processes=n, maxtasksperchild=1, initializer=init_no_tqdm, initargs=(n_launched, n_finished, n_skipped))
try:
raw_results = p.map(partial(execute_job, jobcount=jobcount), bunch, chunksize=2)
p.close()
......@@ -2207,7 +2275,11 @@ def execute_joblist(fulljoblist):
for j, r in zip(bunch, raw_results):
j.comp_time = round(r[0], 2) # seconds
j.max_mem = int(r[1]/1000000) # MB
results.append((j.label, r[2], round(r[0], 2), int(r[1]/1000000)))
results.append((j.label, r[2], j.comp_time, j.max_mem))
# Job is finished
with n_finished.get_lock():
n_finished.value += 1
# throw back the money
return results
......@@ -2679,8 +2751,8 @@ def use_infernal(rfam_acc, alignopts):
# Convert Stockholm to aligned FASTA
subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
"--informat", "stockholm",
"afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"])
"--informat", "stockholm",
"afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"])
subprocess.run(["rm", "-f", "esltmp*"]) # We can use a joker here, because we are not running in parallel for this part.
@trace_unhandled_exceptions
......@@ -3037,6 +3109,8 @@ def work_save(c, homology=True):
df.to_csv(filename, float_format="%.2f", index=False)
# =========================== Main function =============================
if __name__ == "__main__":
fileDir = os.path.dirname(os.path.realpath(__file__))
......
......@@ -7,6 +7,15 @@ In `cmalign` alignments, - means a nucleotide is missing compared to the covaria
In the final filtered alignment that we provide for download, the same rule applies, but on top of that, some '.' are replaced by '-' when a gap in the 3D structure (a missing, unresolved nucleotide) is mapped to an insertion gap.
* **What are the cmalign options for ?**
From Infernal's user guide, we can quote that Infernal uses an HMM banding technique to accelerate alignment by default. It also takes care of 3' or 5' truncated sequences to be aligned correctly (and we have some).
First, one can choose an algorithm, between `--optacc` (maximizing posterior probabilities, the default) and `--cyk` (maximizing likelihood).
Then, the use of bands allows faster and more memory efficient computation, at the price of the guarantee of determining the optimal alignment. Bands can be disabled using the `--nonbanded` option. A best idea would be to control the threshold of probability mass to be considered negligible during HMM band calculation with the `--tau` parameter. Higher values of Tau yield greater speedups and lower memory usage, but a greater chance to miss the optimal alignment. In practice, the algorithm explores several Tau values (increasing it by a factor 2.0 from the original `--tau` value) until the DP matrix size falls below the threshold given by `--mxsize` (default 1028 Mb) or the value of `--maxtau` is reached (in this case, the program fails). One can disable this exploration with option `--fixedtau`. The default value of `--tau` is 1e-7, the default `--maxtau` is 0.05. Basically, you may decide on a value of `--mxsize` by dividing your available RAM by the number of cores used with cmalign. If necessary, you may use less cores than you have, using option `--cpu`.
Finally, if using `--cyk --nonbanded --notrunc --noprob`, one can use the `--small` option to align using the divide-and-conquer CYK algorithm from Eddy 2002, requiring a very few memory but a lot of time. The major drawback of this is that it requires `--notrunc` and `--noprob`, so we give up on the correct alignment of truncated sequences, and the computation of posterior probabilities.
* **Why are there some gap-only columns in the alignment ?**
These columns are not completely gap-only, they contain at least one dash-gap '-'. This means an actual, physical nucleotide which should exist in the 3D structure should be located there. The previous and following nucleotides are **not** contiguous in space in 3D.
......@@ -31,5 +40,5 @@ We first remove the nucleotides whose number is outside the family mapping (if a
* **What are the versions of the dependencies you use ?**
`cmalign` is v1.1.4, `sina` is v1.6.0, `x3dna-dssr` is v1.9.9, Biopython is v1.78.
`cmalign` is v1.1.4, `sina` is v1.6.0, `x3dna-dssr` is v2.3.2-2021jun29, Biopython is v1.78.
\ No newline at end of file
......
......@@ -6,23 +6,16 @@
* Some chains are not correctly renamed A in the produced separate files (e.g. 1d4r-B)
## Alignment issues
* [SOLVED] Filtered alignments are shorter than the number of alignment columns saved to the SQL table `align_column`
* Chain names appear in triple in the FASTA header (e.g. 1d4r[1]-B 1d4r[1]-B 1d4r[1]-B)
## Technical running issues
* [SOLVED] Files produced by Docker containers are owned by root and require root permissions to be read
* [SOLVED] SQLite WAL files are not deleted properly
# Known feature requests
* [DONE] Get filtered versions of the sequence alignments containing the 3D chains, publicly available for download
* [DONE] Get a consensus residue for each alignement column
* [DONE] Get an option to limit the number of cores
* [DONE] Move to SILVA LSU release 138.1
* [UPCOMING] Automated annotation of detected Recurrent Interaction Networks (RINs), see http://carnaval.lri.fr/ .
* [UPCOMING] Possibly, automated detection of HLs and ILs from the 3D Motif Atlas (BGSU). Maybe. Their own website already does the job.
* [UPCOMING] Weight sequences in alignment to give more importance to rarer sequences
* [UPCOMING] Give both gap_percent and insertion_gap_percent
* Automated annotation of detected Recurrent Interaction Networks (RINs), see http://carnaval.lri.fr/ .
* Possibly, automated detection of HLs and ILs from the 3D Motif Atlas (BGSU). Maybe. Their own website already does the job.
* Weight sequences in alignment to give more importance to rarer sequences
* Give both gap_percent and insertion_gap_percent
* A field estimating the quality of the sequence alignment in table family.
* Possibly, more metrics about the alignments coming from Infernal.
* Run cmscan ourselves from the NDB instead of using Rfam-PDB mappings ? (Iff this actually makes a real difference, untested yet)
* Use and save Infernal alignment bounds and truncation information
* Save if a chain is a representative in BGSU list
* Annotate unstructured regions (on a nucleotide basis)
......
This diff could not be displayed because it is too large.
6ydp_1_AA_1176-2737
6ydw_1_AA_1176-2737
2z9q_1_A_1-72
1ml5_1_b_5-121
1ml5_1_a_1-2914
3ep2_1_Y_1-72
3eq3_1_Y_1-72
4v48_1_A6_1-73
1ml5_1_A_2-1520
1ml5_1_b_5-121
1ml5_1_a_1-2914
1qzb_1_B_1-73
1qza_1_B_1-73
1ls2_1_B_1-73
1ml5_1_A_2-1520
1gsg_1_T_1-72
7d1a_1_A_805-902
7d0g_1_A_805-913
......@@ -22,15 +22,12 @@
2rdo_1_A_3-118
4v48_1_A9_3-118
4v47_1_A9_3-118
4v42_1_BA_1-2914
4v42_1_BB_5-121
2ob7_1_A_10-319
1x1l_1_A_1-130
1zc8_1_Z_1-91
2ob7_1_D_1-130
4v42_1_BA_1-2914
4v42_1_BB_5-121
1r2x_1_C_1-58
1r2w_1_C_1-58
1eg0_1_L_1-56
3dg2_1_A_1-1542
3dg0_1_A_1-1542
4v48_1_BA_1-1543
......@@ -46,11 +43,14 @@
3dg4_1_B_1-2904
3dg5_1_B_1-2904
1eg0_1_O_1-73
1zc8_1_A_1-59
1r2x_1_C_1-58
1r2w_1_C_1-58
1eg0_1_L_1-56
1jgq_1_A_2-1520
4v42_1_AA_2-1520
1jgo_1_A_2-1520
1jgp_1_A_2-1520
1zc8_1_A_1-59
1mvr_1_D_1-59
4c9d_1_D_29-1
4c9d_1_C_29-1
......@@ -61,12 +61,6 @@
3ep2_1_B_1-50
3eq3_1_B_1-50
3eq4_1_B_1-50
3pgw_1_R_1-164
3pgw_1_N_1-164
3cw1_1_x_1-138
3cw1_1_w_1-138
3cw1_1_V_1-138
3cw1_1_v_1-138
2iy3_1_B_9-105
3jcr_1_N_1-106
2vaz_1_A_64-177
......@@ -78,6 +72,12 @@
4v5z_1_BY_2-113
4v5z_1_BZ_1-70
4v5z_1_B1_2-123
3pgw_1_R_1-164
3pgw_1_N_1-164
3cw1_1_x_1-138
3cw1_1_w_1-138
3cw1_1_V_1-138
3cw1_1_v_1-138
1mvr_1_B_1-96
4adx_1_0_1-2923
3eq4_1_Y_1-69
......@@ -295,7 +295,12 @@
6ucq_1_2Y
4w2e_1_X
6ucq_1_2X
7n1p_1_DT
7n2u_1_DT
6yss_1_W
7n30_1_DT
7n31_1_DT
7n2c_1_DT
5afi_1_Y
5uq8_1_Z
5wdt_1_Y
......@@ -333,6 +338,22 @@
4v4j_1_X
4v4i_1_X
4v42_1_BB
4jrc_1_B
4jrc_1_A
6lkq_1_S
5h5u_1_H
7d6z_1_F
5lze_1_Y
5lze_1_V
5lze_1_X
3jcj_1_G
6o7k_1_G
3dg2_1_A
3dg0_1_A
4v48_1_BA
4v47_1_BA
3dg4_1_A
3dg5_1_A
6d30_1_C
6j7z_1_C
3er9_1_D
......@@ -437,25 +458,22 @@
6doc_1_B
6doe_1_B
6n6g_1_D
6lkq_1_S
5h5u_1_H
7d6z_1_F
5lze_1_Y
5lze_1_V
5lze_1_X
3jcj_1_G
6o7k_1_G
3dg2_1_A
3dg0_1_A
4v48_1_BA
4v47_1_BA
3dg4_1_A
3dg5_1_A
4b3r_1_W
4b3t_1_W
4b3s_1_W
7b5k_1_X
5o2r_1_X
5kcs_1_1X
7n1p_1_PT
7n2u_1_PT
7n30_1_PT
7n31_1_PT
7n2c_1_PT
6yl5_1_I
6yl5_1_E
6yl5_1_A
6yl5_1_K
6yl5_1_G
6zvk_1_E2
6zvk_1_H2
7a01_1_E2
......@@ -526,6 +544,7 @@
6w6l_1_V
6olf_1_V
3erc_1_G
4qjd_1_D
6of1_1_1W
6cae_1_1Y
6o97_1_1W
......@@ -557,7 +576,9 @@
4v48_1_A6
2z9q_1_A
4hot_1_X
5ns4_1_C
6d2z_1_C
7eh0_1_I
4tu0_1_F
4tu0_1_G
6r9o_1_B
......@@ -578,20 +599,23 @@
6sv4_1_NC
6i7o_1_NB
1ml5_1_A
7nsq_1_V
6swa_1_Q
6swa_1_R
3j6x_1_IR
3j6y_1_IR
6ole_1_T
6om0_1_T
6oli_1_T
6om7_1_T
6olf_1_T
6w6l_1_T
6tnu_1_M
5mc6_1_M
7nrc_1_SM
6tb3_1_N
7b7d_1_SM
7b7d_1_SN
6tnu_1_N
7nrc_1_SN
7nrd_1_SN
6zot_1_C
2uxb_1_X
......@@ -602,6 +626,9 @@
1eg0_1_M
3eq4_1_D
5o1y_1_B
4kzy_1_I
4kzz_1_I
4kzx_1_I
3jcr_1_H
6dzi_1_H
5zeu_1_A
......@@ -705,7 +732,6 @@
6ip6_1_ZZ
6uu3_1_333
6uu1_1_333
1pn8_1_D
3er8_1_H
3er8_1_G
3er8_1_F
......@@ -744,9 +770,8 @@
4wtl_1_T
4wtl_1_P
1xnq_1_W
1x18_1_C
1x18_1_B
1x18_1_D
7n2v_1_DT
4peh_1_Z
1vq6_1_4
4am3_1_D
4am3_1_H
......@@ -758,12 +783,45 @@
4wtj_1_T
4wtj_1_P
4xbf_1_D
5w1h_1_B
6n6d_1_D
6n6k_1_C
6n6k_1_D
3rtj_1_D
6ty9_1_M
6tz1_1_N
6q1h_1_D
6q1h_1_H
6p7p_1_F
6p7p_1_E
6p7p_1_D
6vm6_1_J
6vm6_1_G
6wan_1_K
6wan_1_H
6wan_1_G
6wan_1_L
6wan_1_I
6ywo_1_F
6wan_1_J
4oau_1_A
6ywo_1_E
6ywo_1_K
6vm6_1_I
6vm6_1_H
6ywo_1_I
2a1r_1_C
6m6v_1_F
6m6v_1_E
2a1r_1_D
3gpq_1_E
3gpq_1_F
6o79_1_C
6vm6_1_K
6m6v_1_G
6hyu_1_D
1laj_1_R
6ybv_1_K
6sce_1_B
6xl1_1_C
6scf_1_I
......@@ -809,11 +867,12 @@
1y1y_1_P
5zuu_1_I
5zuu_1_G
7am2_1_R1
4peh_1_W
4peh_1_V
4peh_1_X
4peh_1_Y
4peh_1_Z
7d8c_1_C
6mkn_1_W
7kl3_1_B
4cxg_1_C
......@@ -826,14 +885,7 @@
4eya_1_F
4eya_1_Q
4eya_1_R
1qzc_1_B
1t1o_1_B
1mvr_1_C
1t1m_1_B
1t1o_1_C
1t1m_1_A
1t1o_1_A
2r1g_1_B
4ht9_1_E
6z1p_1_AB
6z1p_1_AA
......@@ -844,11 +896,9 @@
5uk4_1_W
5uk4_1_U
5f6c_1_E
7nwh_1_HH
4rcj_1_B
1xnr_1_W
2agn_1_A
2agn_1_C
2agn_1_B
6e0o_1_C
6o75_1_D
6o75_1_C
......@@ -866,8 +916,7 @@
1ibm_1_Z
4dr5_1_V
4d61_1_J
1trj_1_B
1trj_1_C
7nwg_1_Q3
5tbw_1_SR
6hhq_1_SR
6zvi_1_H
......@@ -883,6 +932,8 @@
5k8h_1_A
5z4a_1_B
3jbu_1_V
4ts2_1_Y
4ts0_1_Y
1h2c_1_R
1h2d_1_S
1h2d_1_R
......@@ -909,6 +960,7 @@
6ppn_1_I
5flx_1_Z
6eri_1_AX
7k5l_1_R
7d80_1_Y
1zc8_1_A
1zc8_1_C
......@@ -916,6 +968,7 @@
1zc8_1_G
1zc8_1_I
1zc8_1_H
6bfb_1_Y
1zc8_1_J
7du2_1_R
4v8z_1_CX
......@@ -951,6 +1004,8 @@
4x9e_1_H
6z1p_1_BB
6z1p_1_BA
3p22_1_C
3p22_1_G
2uxd_1_X
6ywe_1_BB
3ol9_1_D
......@@ -973,8 +1028,6 @@
3ol7_1_H
3ol8_1_L
3ol8_1_P
1qzc_1_C
1qzc_1_A
6yrq_1_E
6yrq_1_H
6yrq_1_G
......@@ -1054,6 +1107,7 @@
3iy9_1_A
4wtk_1_T
4wtk_1_P
6wlj_3_A
1vqn_1_4
4oav_1_C
4oav_1_A
......@@ -1070,18 +1124,13 @@
3eq3_1_B
3eq4_1_B
4i67_1_B
3pgw_1_R
3pgw_1_N
3cw1_1_X
3cw1_1_W
3cw1_1_V
7b0y_1_A
4jf2_1_A
6k32_1_T
6k32_1_P
5mmj_1_A
5x8r_1_A
2agn_1_E
2agn_1_D
3fu2_1_B
3fu2_1_A
4v5z_1_BD
6yw5_1_AA
6ywe_1_AA
......@@ -1117,6 +1166,17 @@
3p6y_1_Q
3p6y_1_W
5dto_1_B
6yml_1_A
6ymm_1_A
6ymi_1_M
6ymi_1_F
6ymi_1_A
6ylb_1_F
6ymi_1_C
6ymj_1_C
6ylb_1_C
6ymj_1_I
6ymj_1_O
4cxh_1_X
1uvj_1_F
1uvj_1_D
......@@ -1153,6 +1213,12 @@
4v4f_1_B4
4v4f_1_A6
4v4f_1_B2
7m4y_1_V
7m4x_1_V
6v3a_1_V
6v39_1_V
6ck5_1_A
6ck5_1_B
5it9_1_I
7jqc_1_I
5zsb_1_C
......@@ -1162,6 +1228,8 @@
1cwp_1_D
3jcr_1_N
6gfw_1_R
3j6x_1_IR
3j6y_1_IR
2vaz_1_A
6zm6_1_X
6zm5_1_X
......@@ -1177,11 +1245,11 @@
5uh6_1_I
6l74_1_I
5uh9_1_I
4v5z_1_BS
2ftc_1_R
7a5j_1_X
6sag_1_R
4udv_1_R
2r1g_1_E
5zsc_1_D
5zsc_1_C
6woy_1_I
......@@ -1209,7 +1277,7 @@
3m85_1_X
3m85_1_Z
3m85_1_Y
1e8s_1_C
5u34_1_B
5wnp_1_B
5wnv_1_B
5yts_1_B
......@@ -1232,8 +1300,11 @@
6ij2_1_E
3u2e_1_D
3u2e_1_C
7eh1_1_I
5uef_1_C
5uef_1_D
7eh2_1_R
7eh2_1_I
4x4u_1_H
4afy_1_D
6oy5_1_I
......@@ -1244,13 +1315,15 @@
6s0m_1_C
6ymw_1_C
7a5g_1_J
1m5k_1_B
1m5o_1_E
1m5v_1_B
6gx6_1_B
4k4s_1_D
4k4s_1_H
4k4t_1_H
4k4t_1_D
1zn1_1_C
1zn0_1_C
1xpu_1_G
1xpu_1_L
1xpr_1_L
......@@ -1274,7 +1347,9 @@
6gc5_1_F
6gc5_1_H
6gc5_1_G
4rne_1_C
1n1h_1_B
7n2v_1_PT
4ohz_1_B
6t83_1_6B
4gv6_1_C
......@@ -1290,6 +1365,9 @@
4v5z_1_BC
5y88_1_X
4v5z_1_BB
5y85_1_D
5y85_1_B
5y87_1_D
3j0o_1_H
3j0l_1_H
3j0p_1_H
......@@ -1351,11 +1429,11 @@
4e6b_1_A
4e6b_1_B
6a6l_1_D
4v5z_1_BS
4v8t_1_1
1uvi_1_D
1uvi_1_F
1uvi_1_E
3gs5_1_A
4m7d_1_P
4k4u_1_D
4k4u_1_H
......@@ -1376,8 +1454,8 @@
6ip5_1_2M
6ip6_1_2M
6qcs_1_M
7b5k_1_Z
486d_1_G
2r1g_1_C
486d_1_F
4v5z_1_B0
4nia_1_O
......@@ -1391,11 +1469,11 @@
4oq9_1_F
4oq9_1_L
6r9q_1_B
7m4u_1_A
6v3a_1_SN1
6v3b_1_SN1
6v39_1_SN1
6v3e_1_SN1
1pn7_1_C
1mj1_1_Q
1mj1_1_R
4dr6_1_V
......@@ -1437,14 +1515,25 @@
6ow3_1_I
6ovy_1_I
6oy6_1_I
4bbl_1_Y
4bbl_1_Z
4qvd_1_H
5gxi_1_B
3iy8_1_A
6tnu_1_M
5mc6_1_M
7n06_1_G
7n06_1_H
7n06_1_I
7n06_1_J
7n06_1_K
7n06_1_L
7n33_1_G
7n33_1_H
7n33_1_I
7n33_1_J
7n33_1_K
7n33_1_L
5mc6_1_N
2qwy_1_C
2qwy_1_A
2qwy_1_B
4eya_1_O
4eya_1_P
4eya_1_C
......@@ -1453,8 +1542,6 @@
6htq_1_W
6htq_1_U
6uu6_1_333
6v3a_1_V
6v39_1_V
5a0v_1_F
3avt_1_T
6d1v_1_C
......@@ -1497,6 +1584,7 @@
6o78_1_E
6xa1_1_BV
6ha8_1_X
3bnp_1_B
1m8w_1_E
1m8w_1_F
5udi_1_B
......@@ -1520,16 +1608,29 @@
6een_1_H
4wti_1_T
4wti_1_P
6dlr_1_A
6dlt_1_A
6dls_1_A
6dlq_1_A
6dnr_1_A
5l3p_1_Y
4hor_1_X
3rzo_1_R
5wlh_1_B
2f4v_1_Z
5ml7_1_B
1qln_1_R
3pgw_1_R
3pgw_1_N
3cw1_1_X
3cw1_1_W
3cw1_1_V
7b0y_1_A
6ogy_1_M
6ogy_1_N
6uej_1_B
7kga_1_A
6ywy_1_BB
1x18_1_A
5ytx_1_B
4g0a_1_H
6r9p_1_B
......@@ -1572,12 +1673,8 @@
5mre_1_AA
5mrf_1_AA
7jhy_1_Z
2r1g_1_A
2r1g_1_D
2r1g_1_F
3eq4_1_Y
4wkr_1_C
2r1g_1_X
4v99_1_EC
4v99_1_AC
4v99_1_BH
......@@ -1641,44 +1738,21 @@
6rcl_1_C
5jju_1_C
4ejt_1_G
1et4_1_A
1et4_1_C
1et4_1_B
1et4_1_D
1et4_1_E
1ddy_1_C
1ddy_1_A
1ddy_1_E
6lkq_1_W
6r47_1_A
3qsu_1_P
3qsu_1_R
2xs7_1_B
1n38_1_B
4qvc_1_G
6q1h_1_D
6q1h_1_H
6p7p_1_F
6p7p_1_E
6p7p_1_D
6vm6_1_J
6vm6_1_G
6wan_1_K
6wan_1_H
6wan_1_G
6wan_1_L
6wan_1_I
6ywo_1_F
6wan_1_J
4oau_1_A
6ywo_1_E
6ywo_1_K
6vm6_1_I
6vm6_1_H
6ywo_1_I
2a1r_1_C
6m6v_1_F
6m6v_1_E
2a1r_1_D
3gpq_1_E
3gpq_1_F
6o79_1_C
6vm6_1_K
6m6v_1_G
6hyu_1_D
1laj_1_R
6ybv_1_K
6mpf_1_W
6spc_1_A
6spe_1_A
......@@ -1687,14 +1761,12 @@
6fti_1_V
6ftj_1_V
6ftg_1_V
3npn_1_A
4g0a_1_G
4g0a_1_F
4g0a_1_E
2b2d_1_S
5hkc_1_C
4kzy_1_I
4kzz_1_I
4kzx_1_I
1rmv_1_B
4qu7_1_X
4qu7_1_V
......@@ -1710,25 +1782,3 @@
6pmi_1_3
6pmj_1_3
5hjz_1_C
7nrc_1_SM
7nrc_1_SN
7am2_1_R1
7k5l_1_R
7b5k_1_X
7d8c_1_C
7m4y_1_V
7m4x_1_V
7b5k_1_Z
7m4u_1_A
7n06_1_G
7n06_1_H
7n06_1_I
7n06_1_J
7n06_1_K
7n06_1_L
7n33_1_G
7n33_1_H
7n33_1_I
7n33_1_J
7n33_1_K
7n33_1_L
......
......@@ -7,12 +7,6 @@ Could not find nucleotides of chain AA in annotation 6ydw.json. Either there is
2z9q_1_A_1-72
DSSR warning 2z9q.json: no nucleotides found. Ignoring 2z9q_1_A_1-72.
1ml5_1_b_5-121
Could not find nucleotides of chain b in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1ml5_1_a_1-2914
Could not find nucleotides of chain a in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
3ep2_1_Y_1-72
DSSR warning 3ep2.json: no nucleotides found. Ignoring 3ep2_1_Y_1-72.
......@@ -22,8 +16,11 @@ DSSR warning 3eq3.json: no nucleotides found. Ignoring 3eq3_1_Y_1-72.
4v48_1_A6_1-73
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A6_1-73.
1ml5_1_A_2-1520
Could not find nucleotides of chain A in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1ml5_1_b_5-121
Could not find nucleotides of chain b in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1ml5_1_a_1-2914
Could not find nucleotides of chain a in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1qzb_1_B_1-73
DSSR warning 1qzb.json: no nucleotides found. Ignoring 1qzb_1_B_1-73.
......@@ -34,6 +31,9 @@ DSSR warning 1qza.json: no nucleotides found. Ignoring 1qza_1_B_1-73.
1ls2_1_B_1-73
DSSR warning 1ls2.json: no nucleotides found. Ignoring 1ls2_1_B_1-73.
1ml5_1_A_2-1520
Could not find nucleotides of chain A in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1gsg_1_T_1-72
DSSR warning 1gsg.json: no nucleotides found. Ignoring 1gsg_1_T_1-72.
......@@ -70,6 +70,12 @@ DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_A9_3-118.
4v47_1_A9_3-118
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_A9_3-118.
4v42_1_BA_1-2914
Could not find nucleotides of chain BA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4v42_1_BB_5-121
Could not find nucleotides of chain BB in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
2ob7_1_A_10-319
DSSR warning 2ob7.json: no nucleotides found. Ignoring 2ob7_1_A_10-319.
......@@ -82,21 +88,6 @@ DSSR warning 1zc8.json: no nucleotides found. Ignoring 1zc8_1_Z_1-91.
2ob7_1_D_1-130
DSSR warning 2ob7.json: no nucleotides found. Ignoring 2ob7_1_D_1-130.
4v42_1_BA_1-2914
Could not find nucleotides of chain BA in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4v42_1_BB_5-121
Could not find nucleotides of chain BB in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1r2x_1_C_1-58
DSSR warning 1r2x.json: no nucleotides found. Ignoring 1r2x_1_C_1-58.
1r2w_1_C_1-58
DSSR warning 1r2w.json: no nucleotides found. Ignoring 1r2w_1_C_1-58.
1eg0_1_L_1-56
DSSR warning 1eg0.json: no nucleotides found. Ignoring 1eg0_1_L_1-56.
3dg2_1_A_1-1542
DSSR warning 3dg2.json: no nucleotides found. Ignoring 3dg2_1_A_1-1542.
......@@ -142,8 +133,14 @@ DSSR warning 3dg5.json: no nucleotides found. Ignoring 3dg5_1_B_1-2904.
1eg0_1_O_1-73
DSSR warning 1eg0.json: no nucleotides found. Ignoring 1eg0_1_O_1-73.
1zc8_1_A_1-59
DSSR warning 1zc8.json: no nucleotides found. Ignoring 1zc8_1_A_1-59.
1r2x_1_C_1-58
DSSR warning 1r2x.json: no nucleotides found. Ignoring 1r2x_1_C_1-58.
1r2w_1_C_1-58
DSSR warning 1r2w.json: no nucleotides found. Ignoring 1r2w_1_C_1-58.
1eg0_1_L_1-56
DSSR warning 1eg0.json: no nucleotides found. Ignoring 1eg0_1_L_1-56.
1jgq_1_A_2-1520
Could not find nucleotides of chain A in annotation 1jgq.json. Either there is a problem with 1jgq mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......@@ -157,6 +154,9 @@ Could not find nucleotides of chain A in annotation 1jgo.json. Either there is a
1jgp_1_A_2-1520
Could not find nucleotides of chain A in annotation 1jgp.json. Either there is a problem with 1jgp mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1zc8_1_A_1-59
DSSR warning 1zc8.json: no nucleotides found. Ignoring 1zc8_1_A_1-59.
1mvr_1_D_1-59
DSSR warning 1mvr.json: no nucleotides found. Ignoring 1mvr_1_D_1-59.
......@@ -187,24 +187,6 @@ DSSR warning 3eq3.json: no nucleotides found. Ignoring 3eq3_1_B_1-50.
3eq4_1_B_1-50
DSSR warning 3eq4.json: no nucleotides found. Ignoring 3eq4_1_B_1-50.
3pgw_1_R_1-164
DSSR warning 3pgw.json: no nucleotides found. Ignoring 3pgw_1_R_1-164.
3pgw_1_N_1-164
DSSR warning 3pgw.json: no nucleotides found. Ignoring 3pgw_1_N_1-164.
3cw1_1_x_1-138
DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_x_1-138.
3cw1_1_w_1-138
DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_w_1-138.
3cw1_1_V_1-138
DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_V_1-138.
3cw1_1_v_1-138
DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_v_1-138.
2iy3_1_B_9-105
DSSR warning 2iy3.json: no nucleotides found. Ignoring 2iy3_1_B_9-105.
......@@ -238,6 +220,24 @@ DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_BZ_1-70.
4v5z_1_B1_2-123
DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_B1_2-123.
3pgw_1_R_1-164
DSSR warning 3pgw.json: no nucleotides found. Ignoring 3pgw_1_R_1-164.
3pgw_1_N_1-164
DSSR warning 3pgw.json: no nucleotides found. Ignoring 3pgw_1_N_1-164.
3cw1_1_x_1-138
DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_x_1-138.
3cw1_1_w_1-138
DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_w_1-138.
3cw1_1_V_1-138
DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_V_1-138.
3cw1_1_v_1-138
DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_v_1-138.
1mvr_1_B_1-96
DSSR warning 1mvr.json: no nucleotides found. Ignoring 1mvr_1_B_1-96.
......@@ -889,9 +889,24 @@ Could not find nucleotides of chain X in annotation 4w2e.json. Either there is a
6ucq_1_2X
Could not find nucleotides of chain 2X in annotation 6ucq.json. Either there is a problem with 6ucq mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
7n1p_1_DT
Could not find nucleotides of chain DT in annotation 7n1p.json. Either there is a problem with 7n1p mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
7n2u_1_DT
Could not find nucleotides of chain DT in annotation 7n2u.json. Either there is a problem with 7n2u mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6yss_1_W
Could not find nucleotides of chain W in annotation 6yss.json. Either there is a problem with 6yss mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
7n30_1_DT
Could not find nucleotides of chain DT in annotation 7n30.json. Either there is a problem with 7n30 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
7n31_1_DT
Could not find nucleotides of chain DT in annotation 7n31.json. Either there is a problem with 7n31 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
7n2c_1_DT
Could not find nucleotides of chain DT in annotation 7n2c.json. Either there is a problem with 7n2c mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
5afi_1_Y
Could not find nucleotides of chain Y in annotation 5afi.json. Either there is a problem with 5afi mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......@@ -1003,6 +1018,54 @@ Could not find nucleotides of chain X in annotation 4v4i.json. Either there is a
4v42_1_BB
Could not find nucleotides of chain BB in annotation 4v42.json. Either there is a problem with 4v42 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4jrc_1_B
Nucleotides not inserted !
4jrc_1_A
Nucleotides not inserted !
6lkq_1_S
Could not find nucleotides of chain S in annotation 6lkq.json. Either there is a problem with 6lkq mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
5h5u_1_H
Could not find nucleotides of chain H in annotation 5h5u.json. Either there is a problem with 5h5u mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
7d6z_1_F
Could not find nucleotides of chain F in annotation 7d6z.json. Either there is a problem with 7d6z mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
5lze_1_Y
Could not find nucleotides of chain Y in annotation 5lze.json. Either there is a problem with 5lze mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
5lze_1_V
Could not find nucleotides of chain V in annotation 5lze.json. Either there is a problem with 5lze mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
5lze_1_X
Could not find nucleotides of chain X in annotation 5lze.json. Either there is a problem with 5lze mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
3jcj_1_G
Could not find nucleotides of chain G in annotation 3jcj.json. Either there is a problem with 3jcj mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6o7k_1_G
Could not find nucleotides of chain G in annotation 6o7k.json. Either there is a problem with 6o7k mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
3dg2_1_A
DSSR warning 3dg2.json: no nucleotides found. Ignoring 3dg2_1_A.
3dg0_1_A
DSSR warning 3dg0.json: no nucleotides found. Ignoring 3dg0_1_A.
4v48_1_BA
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_BA.
4v47_1_BA
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_BA.
3dg4_1_A
DSSR warning 3dg4.json: no nucleotides found. Ignoring 3dg4_1_A.
3dg5_1_A
DSSR warning 3dg5.json: no nucleotides found. Ignoring 3dg5_1_A.
6d30_1_C
Sequence is too short. (< 5 resolved nts)
......@@ -1315,62 +1378,53 @@ Sequence is too short. (< 5 resolved nts)
6n6g_1_D
Sequence is too short. (< 5 resolved nts)
6lkq_1_S
Could not find nucleotides of chain S in annotation 6lkq.json. Either there is a problem with 6lkq mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
5h5u_1_H
Could not find nucleotides of chain H in annotation 5h5u.json. Either there is a problem with 5h5u mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
7d6z_1_F
Could not find nucleotides of chain F in annotation 7d6z.json. Either there is a problem with 7d6z mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
5lze_1_Y
Could not find nucleotides of chain Y in annotation 5lze.json. Either there is a problem with 5lze mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4b3r_1_W
Sequence is too short. (< 5 resolved nts)
5lze_1_V
Could not find nucleotides of chain V in annotation 5lze.json. Either there is a problem with 5lze mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4b3t_1_W
Sequence is too short. (< 5 resolved nts)
5lze_1_X
Could not find nucleotides of chain X in annotation 5lze.json. Either there is a problem with 5lze mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4b3s_1_W
Sequence is too short. (< 5 resolved nts)
3jcj_1_G
Could not find nucleotides of chain G in annotation 3jcj.json. Either there is a problem with 3jcj mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
7b5k_1_X
Could not find nucleotides of chain X in annotation 7b5k.json. Either there is a problem with 7b5k mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6o7k_1_G
Could not find nucleotides of chain G in annotation 6o7k.json. Either there is a problem with 6o7k mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
5o2r_1_X
Could not find nucleotides of chain X in annotation 5o2r.json. Either there is a problem with 5o2r mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
3dg2_1_A
DSSR warning 3dg2.json: no nucleotides found. Ignoring 3dg2_1_A.
5kcs_1_1X
Could not find nucleotides of chain 1X in annotation 5kcs.json. Either there is a problem with 5kcs mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
3dg0_1_A
DSSR warning 3dg0.json: no nucleotides found. Ignoring 3dg0_1_A.
7n1p_1_PT
Could not find nucleotides of chain PT in annotation 7n1p.json. Either there is a problem with 7n1p mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4v48_1_BA
DSSR warning 4v48.json: no nucleotides found. Ignoring 4v48_1_BA.
7n2u_1_PT
Could not find nucleotides of chain PT in annotation 7n2u.json. Either there is a problem with 7n2u mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4v47_1_BA
DSSR warning 4v47.json: no nucleotides found. Ignoring 4v47_1_BA.
7n30_1_PT
Could not find nucleotides of chain PT in annotation 7n30.json. Either there is a problem with 7n30 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
3dg4_1_A
DSSR warning 3dg4.json: no nucleotides found. Ignoring 3dg4_1_A.
7n31_1_PT
Could not find nucleotides of chain PT in annotation 7n31.json. Either there is a problem with 7n31 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
3dg5_1_A
DSSR warning 3dg5.json: no nucleotides found. Ignoring 3dg5_1_A.
7n2c_1_PT
Could not find nucleotides of chain PT in annotation 7n2c.json. Either there is a problem with 7n2c mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4b3r_1_W
Sequence is too short. (< 5 resolved nts)
6yl5_1_I
Nucleotides not inserted !
4b3t_1_W
Sequence is too short. (< 5 resolved nts)
6yl5_1_E
Nucleotides not inserted !
4b3s_1_W
Sequence is too short. (< 5 resolved nts)
6yl5_1_A
Nucleotides not inserted !
5o2r_1_X
Could not find nucleotides of chain X in annotation 5o2r.json. Either there is a problem with 5o2r mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6yl5_1_K
Nucleotides not inserted !
5kcs_1_1X
Could not find nucleotides of chain 1X in annotation 5kcs.json. Either there is a problem with 5kcs mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6yl5_1_G
Nucleotides not inserted !
6zvk_1_E2
Could not find nucleotides of chain E2 in annotation 6zvk.json. Either there is a problem with 6zvk mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......@@ -1582,6 +1636,9 @@ Could not find nucleotides of chain V in annotation 6olf.json. Either there is a
3erc_1_G
Sequence is too short. (< 5 resolved nts)
4qjd_1_D
Nucleotides not inserted !
6of1_1_1W
Could not find nucleotides of chain 1W in annotation 6of1.json. Either there is a problem with 6of1 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......@@ -1675,9 +1732,15 @@ DSSR warning 2z9q.json: no nucleotides found. Ignoring 2z9q_1_A.
4hot_1_X
Sequence is too short. (< 5 resolved nts)
5ns4_1_C
Nucleotides not inserted !
6d2z_1_C
Sequence is too short. (< 5 resolved nts)
7eh0_1_I
Sequence is too short. (< 5 resolved nts)
4tu0_1_F
Sequence is too short. (< 5 resolved nts)
......@@ -1738,18 +1801,15 @@ Could not find nucleotides of chain NB in annotation 6i7o.json. Either there is
1ml5_1_A
Could not find nucleotides of chain A in annotation 1ml5.json. Either there is a problem with 1ml5 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
7nsq_1_V
Could not find nucleotides of chain V in annotation 7nsq.json. Either there is a problem with 7nsq mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6swa_1_Q
Could not find nucleotides of chain Q in annotation 6swa.json. Either there is a problem with 6swa mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6swa_1_R
Could not find nucleotides of chain R in annotation 6swa.json. Either there is a problem with 6swa mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
3j6x_1_IR
Could not find nucleotides of chain IR in annotation 3j6x.json. Either there is a problem with 3j6x mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
3j6y_1_IR
Could not find nucleotides of chain IR in annotation 3j6y.json. Either there is a problem with 3j6y mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6ole_1_T
Could not find nucleotides of chain T in annotation 6ole.json. Either there is a problem with 6ole mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......@@ -1768,6 +1828,15 @@ Could not find nucleotides of chain T in annotation 6olf.json. Either there is a
6w6l_1_T
Could not find nucleotides of chain T in annotation 6w6l.json. Either there is a problem with 6w6l mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6tnu_1_M
Could not find nucleotides of chain M in annotation 6tnu.json. Either there is a problem with 6tnu mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
5mc6_1_M
Could not find nucleotides of chain M in annotation 5mc6.json. Either there is a problem with 5mc6 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
7nrc_1_SM
Could not find nucleotides of chain SM in annotation 7nrc.json. Either there is a problem with 7nrc mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6tb3_1_N
Could not find nucleotides of chain N in annotation 6tb3.json. Either there is a problem with 6tb3 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......@@ -1780,6 +1849,9 @@ Could not find nucleotides of chain SN in annotation 7b7d.json. Either there is
6tnu_1_N
Could not find nucleotides of chain N in annotation 6tnu.json. Either there is a problem with 6tnu mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
7nrc_1_SN
Could not find nucleotides of chain SN in annotation 7nrc.json. Either there is a problem with 7nrc mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
7nrd_1_SN
Could not find nucleotides of chain SN in annotation 7nrd.json. Either there is a problem with 7nrd mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......@@ -1810,6 +1882,15 @@ DSSR warning 3eq4.json: no nucleotides found. Ignoring 3eq4_1_D.
5o1y_1_B
Sequence is too short. (< 5 resolved nts)
4kzy_1_I
Could not find nucleotides of chain I in annotation 4kzy.json. Either there is a problem with 4kzy mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4kzz_1_I
Could not find nucleotides of chain I in annotation 4kzz.json. Either there is a problem with 4kzz mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4kzx_1_I
Could not find nucleotides of chain I in annotation 4kzx.json. Either there is a problem with 4kzx mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
3jcr_1_H
DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_H.
......@@ -2119,9 +2200,6 @@ Sequence is too short. (< 5 resolved nts)
6uu1_1_333
Sequence is too short. (< 5 resolved nts)
1pn8_1_D
DSSR warning 1pn8.json: no nucleotides found. Ignoring 1pn8_1_D.
3er8_1_H
Sequence is too short. (< 5 resolved nts)
......@@ -2236,14 +2314,11 @@ Sequence is too short. (< 5 resolved nts)
1xnq_1_W
Sequence is too short. (< 5 resolved nts)
1x18_1_C
DSSR warning 1x18.json: no nucleotides found. Ignoring 1x18_1_C.
1x18_1_B
DSSR warning 1x18.json: no nucleotides found. Ignoring 1x18_1_B.
7n2v_1_DT
Could not find nucleotides of chain DT in annotation 7n2v.json. Either there is a problem with 7n2v mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1x18_1_D
DSSR warning 1x18.json: no nucleotides found. Ignoring 1x18_1_D.
4peh_1_Z
Sequence is too short. (< 5 resolved nts)
1vq6_1_4
Sequence is too short. (< 5 resolved nts)
......@@ -2278,6 +2353,9 @@ Sequence is too short. (< 5 resolved nts)
4xbf_1_D
Sequence is too short. (< 5 resolved nts)
5w1h_1_B
Nucleotides not inserted !
6n6d_1_D
Sequence is too short. (< 5 resolved nts)
......@@ -2296,52 +2374,148 @@ Sequence is too short. (< 5 resolved nts)
6tz1_1_N
Sequence is too short. (< 5 resolved nts)
6sce_1_B
6q1h_1_D
Sequence is too short. (< 5 resolved nts)
6xl1_1_C
6q1h_1_H
Sequence is too short. (< 5 resolved nts)
6scf_1_I
6p7p_1_F
Sequence is too short. (< 5 resolved nts)
6scf_1_K
6p7p_1_E
Sequence is too short. (< 5 resolved nts)
6yud_1_K
6p7p_1_D
Sequence is too short. (< 5 resolved nts)
6yud_1_O
6vm6_1_J
Sequence is too short. (< 5 resolved nts)
6scf_1_M
6vm6_1_G
Sequence is too short. (< 5 resolved nts)
6yud_1_P
6wan_1_K
Sequence is too short. (< 5 resolved nts)
6scf_1_L
6wan_1_H
Sequence is too short. (< 5 resolved nts)
6yud_1_M
6wan_1_G
Sequence is too short. (< 5 resolved nts)
6yud_1_Q
6wan_1_L
Sequence is too short. (< 5 resolved nts)
6w11_1_C
6wan_1_I
Sequence is too short. (< 5 resolved nts)
6o6x_1_D
6ywo_1_F
Sequence is too short. (< 5 resolved nts)
4ba2_1_R
6wan_1_J
Sequence is too short. (< 5 resolved nts)
7bdv_1_F
4oau_1_A
Sequence is too short. (< 5 resolved nts)
7bdv_1_H
6ywo_1_E
Sequence is too short. (< 5 resolved nts)
6ywo_1_K
Sequence is too short. (< 5 resolved nts)
6vm6_1_I
Sequence is too short. (< 5 resolved nts)
6vm6_1_H
Sequence is too short. (< 5 resolved nts)
6ywo_1_I
Sequence is too short. (< 5 resolved nts)
2a1r_1_C
Sequence is too short. (< 5 resolved nts)
6m6v_1_F
Sequence is too short. (< 5 resolved nts)
6m6v_1_E
Sequence is too short. (< 5 resolved nts)
2a1r_1_D
Sequence is too short. (< 5 resolved nts)
3gpq_1_E
Sequence is too short. (< 5 resolved nts)
3gpq_1_F
Sequence is too short. (< 5 resolved nts)
6o79_1_C
Sequence is too short. (< 5 resolved nts)
6vm6_1_K
Sequence is too short. (< 5 resolved nts)
6m6v_1_G
Sequence is too short. (< 5 resolved nts)
6hyu_1_D
Sequence is too short. (< 5 resolved nts)
1laj_1_R
Sequence is too short. (< 5 resolved nts)
6ybv_1_K
Could not find nucleotides of chain K in annotation 6ybv.json. Either there is a problem with 6ybv mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6sce_1_B
Sequence is too short. (< 5 resolved nts)
6xl1_1_C
Sequence is too short. (< 5 resolved nts)
6scf_1_I
Sequence is too short. (< 5 resolved nts)
6scf_1_K
Sequence is too short. (< 5 resolved nts)
6yud_1_K
Sequence is too short. (< 5 resolved nts)
6yud_1_O
Sequence is too short. (< 5 resolved nts)
6scf_1_M
Sequence is too short. (< 5 resolved nts)
6yud_1_P
Sequence is too short. (< 5 resolved nts)
6scf_1_L
Sequence is too short. (< 5 resolved nts)
6yud_1_M
Sequence is too short. (< 5 resolved nts)
6yud_1_Q
Sequence is too short. (< 5 resolved nts)
6w11_1_C
Sequence is too short. (< 5 resolved nts)
6o6x_1_D
Sequence is too short. (< 5 resolved nts)
4ba2_1_R
Sequence is too short. (< 5 resolved nts)
7bdv_1_F
Sequence is too short. (< 5 resolved nts)
7bdv_1_H
Sequence is too short. (< 5 resolved nts)
6o6x_1_C
......@@ -2423,7 +2597,7 @@ Sequence is too short. (< 5 resolved nts)
Sequence is too short. (< 5 resolved nts)
1y1y_1_P
DSSR warning 1y1y.json: no nucleotides found. Ignoring 1y1y_1_P.
Sequence is too short. (< 5 resolved nts)
5zuu_1_I
Sequence is too short. (< 5 resolved nts)
......@@ -2431,6 +2605,9 @@ Sequence is too short. (< 5 resolved nts)
5zuu_1_G
Sequence is too short. (< 5 resolved nts)
7am2_1_R1
Sequence is too short. (< 5 resolved nts)
4peh_1_W
Sequence is too short. (< 5 resolved nts)
......@@ -2443,7 +2620,7 @@ Sequence is too short. (< 5 resolved nts)
4peh_1_Y
Sequence is too short. (< 5 resolved nts)
4peh_1_Z
7d8c_1_C
Sequence is too short. (< 5 resolved nts)
6mkn_1_W
......@@ -2482,30 +2659,9 @@ Could not find nucleotides of chain Q in annotation 4eya.json. Either there is a
4eya_1_R
Could not find nucleotides of chain R in annotation 4eya.json. Either there is a problem with 4eya mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1qzc_1_B
DSSR warning 1qzc.json: no nucleotides found. Ignoring 1qzc_1_B.
1t1o_1_B
DSSR warning 1t1o.json: no nucleotides found. Ignoring 1t1o_1_B.
1mvr_1_C
DSSR warning 1mvr.json: no nucleotides found. Ignoring 1mvr_1_C.
1t1m_1_B
DSSR warning 1t1m.json: no nucleotides found. Ignoring 1t1m_1_B.
1t1o_1_C
DSSR warning 1t1o.json: no nucleotides found. Ignoring 1t1o_1_C.
1t1m_1_A
DSSR warning 1t1m.json: no nucleotides found. Ignoring 1t1m_1_A.
1t1o_1_A
DSSR warning 1t1o.json: no nucleotides found. Ignoring 1t1o_1_A.
2r1g_1_B
DSSR warning 2r1g.json: no nucleotides found. Ignoring 2r1g_1_B.
4ht9_1_E
Sequence is too short. (< 5 resolved nts)
......@@ -2536,21 +2692,15 @@ Could not find nucleotides of chain U in annotation 5uk4.json. Either there is a
5f6c_1_E
Sequence is too short. (< 5 resolved nts)
7nwh_1_HH
Could not find nucleotides of chain HH in annotation 7nwh.json. Either there is a problem with 7nwh mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4rcj_1_B
Sequence is too short. (< 5 resolved nts)
1xnr_1_W
Sequence is too short. (< 5 resolved nts)
2agn_1_A
DSSR warning 2agn.json: no nucleotides found. Ignoring 2agn_1_A.
2agn_1_C
DSSR warning 2agn.json: no nucleotides found. Ignoring 2agn_1_C.
2agn_1_B
DSSR warning 2agn.json: no nucleotides found. Ignoring 2agn_1_B.
6e0o_1_C
Sequence is too short. (< 5 resolved nts)
......@@ -2602,11 +2752,8 @@ Sequence is too short. (< 5 resolved nts)
4d61_1_J
Could not find nucleotides of chain J in annotation 4d61.json. Either there is a problem with 4d61 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1trj_1_B
DSSR warning 1trj.json: no nucleotides found. Ignoring 1trj_1_B.
1trj_1_C
DSSR warning 1trj.json: no nucleotides found. Ignoring 1trj_1_C.
7nwg_1_Q3
Could not find nucleotides of chain Q3 in annotation 7nwg.json. Either there is a problem with 7nwg mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
5tbw_1_SR
Could not find nucleotides of chain SR in annotation 5tbw.json. Either there is a problem with 5tbw mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......@@ -2653,6 +2800,12 @@ Sequence is too short. (< 5 resolved nts)
3jbu_1_V
Could not find nucleotides of chain V in annotation 3jbu.json. Either there is a problem with 3jbu mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4ts2_1_Y
Nucleotides not inserted !
4ts0_1_Y
Nucleotides not inserted !
1h2c_1_R
Sequence is too short. (< 5 resolved nts)
......@@ -2731,6 +2884,9 @@ Could not find nucleotides of chain Z in annotation 5flx.json. Either there is a
6eri_1_AX
Could not find nucleotides of chain AX in annotation 6eri.json. Either there is a problem with 6eri mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
7k5l_1_R
Sequence is too short. (< 5 resolved nts)
7d80_1_Y
Could not find nucleotides of chain Y in annotation 7d80.json. Either there is a problem with 7d80 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......@@ -2752,6 +2908,9 @@ DSSR warning 1zc8.json: no nucleotides found. Ignoring 1zc8_1_I.
1zc8_1_H
DSSR warning 1zc8.json: no nucleotides found. Ignoring 1zc8_1_H.
6bfb_1_Y
Nucleotides not inserted !
1zc8_1_J
DSSR warning 1zc8.json: no nucleotides found. Ignoring 1zc8_1_J.
......@@ -2857,6 +3016,12 @@ Could not find nucleotides of chain BB in annotation 6z1p.json. Either there is
6z1p_1_BA
Could not find nucleotides of chain BA in annotation 6z1p.json. Either there is a problem with 6z1p mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
3p22_1_C
Nucleotides not inserted !
3p22_1_G
Nucleotides not inserted !
2uxd_1_X
Sequence is too short. (< 5 resolved nts)
......@@ -2923,12 +3088,6 @@ Sequence is too short. (< 5 resolved nts)
3ol8_1_P
Sequence is too short. (< 5 resolved nts)
1qzc_1_C
DSSR warning 1qzc.json: no nucleotides found. Ignoring 1qzc_1_C.
1qzc_1_A
DSSR warning 1qzc.json: no nucleotides found. Ignoring 1qzc_1_A.
6yrq_1_E
Sequence is too short. (< 5 resolved nts)
......@@ -3166,6 +3325,9 @@ Sequence is too short. (< 5 resolved nts)
4wtk_1_P
Sequence is too short. (< 5 resolved nts)
6wlj_3_A
Nucleotides not inserted !
1vqn_1_4
Sequence is too short. (< 5 resolved nts)
......@@ -3214,23 +3376,8 @@ DSSR warning 3eq4.json: no nucleotides found. Ignoring 3eq4_1_B.
4i67_1_B
Sequence is too short. (< 5 resolved nts)
3pgw_1_R
DSSR warning 3pgw.json: no nucleotides found. Ignoring 3pgw_1_R.
3pgw_1_N
DSSR warning 3pgw.json: no nucleotides found. Ignoring 3pgw_1_N.
3cw1_1_X
DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_X.
3cw1_1_W
DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_W.
3cw1_1_V
DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_V.
7b0y_1_A
Could not find nucleotides of chain A in annotation 7b0y.json. Either there is a problem with 7b0y mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4jf2_1_A
Nucleotides not inserted !
6k32_1_T
Could not find nucleotides of chain T in annotation 6k32.json. Either there is a problem with 6k32 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......@@ -3244,11 +3391,11 @@ Could not find nucleotides of chain A in annotation 5mmj.json. Either there is a
5x8r_1_A
Could not find nucleotides of chain A in annotation 5x8r.json. Either there is a problem with 5x8r mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
2agn_1_E
DSSR warning 2agn.json: no nucleotides found. Ignoring 2agn_1_E.
3fu2_1_B
Nucleotides not inserted !
2agn_1_D
DSSR warning 2agn.json: no nucleotides found. Ignoring 2agn_1_D.
3fu2_1_A
Nucleotides not inserted !
4v5z_1_BD
DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_BD.
......@@ -3355,6 +3502,39 @@ Sequence is too short. (< 5 resolved nts)
5dto_1_B
Sequence is too short. (< 5 resolved nts)
6yml_1_A
Nucleotides not inserted !
6ymm_1_A
Nucleotides not inserted !
6ymi_1_M
Nucleotides not inserted !
6ymi_1_F
Nucleotides not inserted !
6ymi_1_A
Nucleotides not inserted !
6ylb_1_F
Nucleotides not inserted !
6ymi_1_C
Nucleotides not inserted !
6ymj_1_C
Nucleotides not inserted !
6ylb_1_C
Nucleotides not inserted !
6ymj_1_I
Nucleotides not inserted !
6ymj_1_O
Nucleotides not inserted !
4cxh_1_X
Could not find nucleotides of chain X in annotation 4cxh.json. Either there is a problem with 4cxh mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......@@ -3463,6 +3643,24 @@ Sequence is too short. (< 5 resolved nts)
4v4f_1_B2
Sequence is too short. (< 5 resolved nts)
7m4y_1_V
Could not find nucleotides of chain V in annotation 7m4y.json. Either there is a problem with 7m4y mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
7m4x_1_V
Could not find nucleotides of chain V in annotation 7m4x.json. Either there is a problem with 7m4x mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6v3a_1_V
Could not find nucleotides of chain V in annotation 6v3a.json. Either there is a problem with 6v3a mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6v39_1_V
Could not find nucleotides of chain V in annotation 6v39.json. Either there is a problem with 6v39 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6ck5_1_A
Nucleotides not inserted !
6ck5_1_B
Nucleotides not inserted !
5it9_1_I
Could not find nucleotides of chain I in annotation 5it9.json. Either there is a problem with 5it9 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......@@ -3490,6 +3688,12 @@ DSSR warning 3jcr.json: no nucleotides found. Ignoring 3jcr_1_N.
6gfw_1_R
Sequence is too short. (< 5 resolved nts)
3j6x_1_IR
Could not find nucleotides of chain IR in annotation 3j6x.json. Either there is a problem with 3j6x mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
3j6y_1_IR
Could not find nucleotides of chain IR in annotation 3j6y.json. Either there is a problem with 3j6y mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
2vaz_1_A
DSSR warning 2vaz.json: no nucleotides found. Ignoring 2vaz_1_A.
......@@ -3535,6 +3739,9 @@ Sequence is too short. (< 5 resolved nts)
5uh9_1_I
Sequence is too short. (< 5 resolved nts)
4v5z_1_BS
DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_BS.
2ftc_1_R
DSSR warning 2ftc.json: no nucleotides found. Ignoring 2ftc_1_R.
......@@ -3547,9 +3754,6 @@ Sequence is too short. (< 5 resolved nts)
4udv_1_R
Sequence is too short. (< 5 resolved nts)
2r1g_1_E
DSSR warning 2r1g.json: no nucleotides found. Ignoring 2r1g_1_E.
5zsc_1_D
Sequence is too short. (< 5 resolved nts)
......@@ -3631,8 +3835,8 @@ Sequence is too short. (< 5 resolved nts)
3m85_1_Y
Sequence is too short. (< 5 resolved nts)
1e8s_1_C
DSSR warning 1e8s.json: no nucleotides found. Ignoring 1e8s_1_C.
5u34_1_B
Nucleotides not inserted !
5wnp_1_B
Could not find nucleotides of chain B in annotation 5wnp.json. Either there is a problem with 5wnp mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......@@ -3700,12 +3904,21 @@ Sequence is too short. (< 5 resolved nts)
3u2e_1_C
Sequence is too short. (< 5 resolved nts)
7eh1_1_I
Sequence is too short. (< 5 resolved nts)
5uef_1_C
Sequence is too short. (< 5 resolved nts)
5uef_1_D
Sequence is too short. (< 5 resolved nts)
7eh2_1_R
Sequence is too short. (< 5 resolved nts)
7eh2_1_I
Sequence is too short. (< 5 resolved nts)
4x4u_1_H
Sequence is too short. (< 5 resolved nts)
......@@ -3736,6 +3949,15 @@ Sequence is too short. (< 5 resolved nts)
7a5g_1_J
Could not find nucleotides of chain J in annotation 7a5g.json. Either there is a problem with 7a5g mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1m5k_1_B
Nucleotides not inserted !
1m5o_1_E
Nucleotides not inserted !
1m5v_1_B
Nucleotides not inserted !
6gx6_1_B
Sequence is too short. (< 5 resolved nts)
......@@ -3754,9 +3976,6 @@ Sequence is too short. (< 5 resolved nts)
1zn1_1_C
DSSR warning 1zn1.json: no nucleotides found. Ignoring 1zn1_1_C.
1zn0_1_C
DSSR warning 1zn0.json: no nucleotides found. Ignoring 1zn0_1_C.
1xpu_1_G
Sequence is too short. (< 5 resolved nts)
......@@ -3826,9 +4045,15 @@ Sequence is too short. (< 5 resolved nts)
6gc5_1_G
Sequence is too short. (< 5 resolved nts)
4rne_1_C
Nucleotides not inserted !
1n1h_1_B
Sequence is too short. (< 5 resolved nts)
7n2v_1_PT
Could not find nucleotides of chain PT in annotation 7n2v.json. Either there is a problem with 7n2v mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4ohz_1_B
Sequence is too short. (< 5 resolved nts)
......@@ -3874,6 +4099,15 @@ Could not find nucleotides of chain X in annotation 5y88.json. Either there is a
4v5z_1_BB
DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_BB.
5y85_1_D
Nucleotides not inserted !
5y85_1_B
Nucleotides not inserted !
5y87_1_D
Nucleotides not inserted !
3j0o_1_H
Could not find nucleotides of chain H in annotation 3j0o.json. Either there is a problem with 3j0o mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......@@ -4057,9 +4291,6 @@ Sequence is too short. (< 5 resolved nts)
6a6l_1_D
Sequence is too short. (< 5 resolved nts)
4v5z_1_BS
DSSR warning 4v5z.json: no nucleotides found. Ignoring 4v5z_1_BS.
4v8t_1_1
Could not find nucleotides of chain 1 in annotation 4v8t.json. Either there is a problem with 4v8t mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......@@ -4072,6 +4303,9 @@ Sequence is too short. (< 5 resolved nts)
1uvi_1_E
Sequence is too short. (< 5 resolved nts)
3gs5_1_A
Nucleotides not inserted !
4m7d_1_P
Sequence is too short. (< 5 resolved nts)
......@@ -4132,12 +4366,12 @@ Could not find nucleotides of chain 2M in annotation 6ip6.json. Either there is
6qcs_1_M
Sequence is too short. (< 5 resolved nts)
7b5k_1_Z
Could not find nucleotides of chain Z in annotation 7b5k.json. Either there is a problem with 7b5k mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
486d_1_G
Could not find nucleotides of chain G in annotation 486d.json. Either there is a problem with 486d mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
2r1g_1_C
DSSR warning 2r1g.json: no nucleotides found. Ignoring 2r1g_1_C.
486d_1_F
Could not find nucleotides of chain F in annotation 486d.json. Either there is a problem with 486d mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......@@ -4177,6 +4411,9 @@ Could not find nucleotides of chain L in annotation 4oq9.json. Either there is a
6r9q_1_B
Sequence is too short. (< 5 resolved nts)
7m4u_1_A
Could not find nucleotides of chain A in annotation 7m4u.json. Either there is a problem with 7m4u mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6v3a_1_SN1
Could not find nucleotides of chain SN1 in annotation 6v3a.json. Either there is a problem with 6v3a mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......@@ -4189,9 +4426,6 @@ Could not find nucleotides of chain SN1 in annotation 6v39.json. Either there is
6v3e_1_SN1
Could not find nucleotides of chain SN1 in annotation 6v3e.json. Either there is a problem with 6v3e mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1pn7_1_C
DSSR warning 1pn7.json: no nucleotides found. Ignoring 1pn7_1_C.
1mj1_1_Q
Could not find nucleotides of chain Q in annotation 1mj1.json. Either there is a problem with 1mj1 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......@@ -4315,12 +4549,6 @@ Sequence is too short. (< 5 resolved nts)
6oy6_1_I
Sequence is too short. (< 5 resolved nts)
4bbl_1_Y
DSSR warning 4bbl.json: no nucleotides found. Ignoring 4bbl_1_Y.
4bbl_1_Z
DSSR warning 4bbl.json: no nucleotides found. Ignoring 4bbl_1_Z.
4qvd_1_H
Sequence is too short. (< 5 resolved nts)
......@@ -4330,15 +4558,54 @@ Sequence is too short. (< 5 resolved nts)
3iy8_1_A
DSSR warning 3iy8.json: no nucleotides found. Ignoring 3iy8_1_A.
6tnu_1_M
Could not find nucleotides of chain M in annotation 6tnu.json. Either there is a problem with 6tnu mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
7n06_1_G
Sequence is too short. (< 5 resolved nts)
5mc6_1_M
Could not find nucleotides of chain M in annotation 5mc6.json. Either there is a problem with 5mc6 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
7n06_1_H
Sequence is too short. (< 5 resolved nts)
7n06_1_I
Sequence is too short. (< 5 resolved nts)
7n06_1_J
Sequence is too short. (< 5 resolved nts)
7n06_1_K
Sequence is too short. (< 5 resolved nts)
7n06_1_L
Sequence is too short. (< 5 resolved nts)
7n33_1_G
Sequence is too short. (< 5 resolved nts)
7n33_1_H
Sequence is too short. (< 5 resolved nts)
7n33_1_I
Sequence is too short. (< 5 resolved nts)
7n33_1_J
Sequence is too short. (< 5 resolved nts)
7n33_1_K
Sequence is too short. (< 5 resolved nts)
7n33_1_L
Sequence is too short. (< 5 resolved nts)
5mc6_1_N
Could not find nucleotides of chain N in annotation 5mc6.json. Either there is a problem with 5mc6 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
2qwy_1_C
Nucleotides not inserted !
2qwy_1_A
Nucleotides not inserted !
2qwy_1_B
Nucleotides not inserted !
4eya_1_O
Could not find nucleotides of chain O in annotation 4eya.json. Either there is a problem with 4eya mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......@@ -4363,12 +4630,6 @@ Could not find nucleotides of chain U in annotation 6htq.json. Either there is a
6uu6_1_333
Sequence is too short. (< 5 resolved nts)
6v3a_1_V
Could not find nucleotides of chain V in annotation 6v3a.json. Either there is a problem with 6v3a mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6v39_1_V
Could not find nucleotides of chain V in annotation 6v39.json. Either there is a problem with 6v39 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
5a0v_1_F
Sequence is too short. (< 5 resolved nts)
......@@ -4495,6 +4756,9 @@ Could not find nucleotides of chain BV in annotation 6xa1.json. Either there is
6ha8_1_X
Could not find nucleotides of chain X in annotation 6ha8.json. Either there is a problem with 6ha8 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
3bnp_1_B
Nucleotides not inserted !
1m8w_1_E
Could not find nucleotides of chain E in annotation 1m8w.json. Either there is a problem with 1m8w mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......@@ -4564,6 +4828,21 @@ Sequence is too short. (< 5 resolved nts)
4wti_1_P
Sequence is too short. (< 5 resolved nts)
6dlr_1_A
Nucleotides not inserted !
6dlt_1_A
Nucleotides not inserted !
6dls_1_A
Nucleotides not inserted !
6dlq_1_A
Nucleotides not inserted !
6dnr_1_A
Nucleotides not inserted !
5l3p_1_Y
Could not find nucleotides of chain Y in annotation 5l3p.json. Either there is a problem with 5l3p mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......@@ -4573,12 +4852,36 @@ Sequence is too short. (< 5 resolved nts)
3rzo_1_R
Sequence is too short. (< 5 resolved nts)
5wlh_1_B
Nucleotides not inserted !
2f4v_1_Z
Sequence is too short. (< 5 resolved nts)
5ml7_1_B
Nucleotides not inserted !
1qln_1_R
Sequence is too short. (< 5 resolved nts)
3pgw_1_R
DSSR warning 3pgw.json: no nucleotides found. Ignoring 3pgw_1_R.
3pgw_1_N
DSSR warning 3pgw.json: no nucleotides found. Ignoring 3pgw_1_N.
3cw1_1_X
DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_X.
3cw1_1_W
DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_W.
3cw1_1_V
DSSR warning 3cw1.json: no nucleotides found. Ignoring 3cw1_1_V.
7b0y_1_A
Could not find nucleotides of chain A in annotation 7b0y.json. Either there is a problem with 7b0y mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6ogy_1_M
Sequence is too short. (< 5 resolved nts)
......@@ -4588,12 +4891,12 @@ Sequence is too short. (< 5 resolved nts)
6uej_1_B
Sequence is too short. (< 5 resolved nts)
7kga_1_A
Nucleotides not inserted !
6ywy_1_BB
Could not find nucleotides of chain BB in annotation 6ywy.json. Either there is a problem with 6ywy mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1x18_1_A
DSSR warning 1x18.json: no nucleotides found. Ignoring 1x18_1_A.
5ytx_1_B
Sequence is too short. (< 5 resolved nts)
......@@ -4720,24 +5023,12 @@ Could not find nucleotides of chain AA in annotation 5mrf.json. Either there is
7jhy_1_Z
Could not find nucleotides of chain Z in annotation 7jhy.json. Either there is a problem with 7jhy mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
2r1g_1_A
DSSR warning 2r1g.json: no nucleotides found. Ignoring 2r1g_1_A.
2r1g_1_D
DSSR warning 2r1g.json: no nucleotides found. Ignoring 2r1g_1_D.
2r1g_1_F
DSSR warning 2r1g.json: no nucleotides found. Ignoring 2r1g_1_F.
3eq4_1_Y
DSSR warning 3eq4.json: no nucleotides found. Ignoring 3eq4_1_Y.
4wkr_1_C
Sequence is too short. (< 5 resolved nts)
2r1g_1_X
DSSR warning 2r1g.json: no nucleotides found. Ignoring 2r1g_1_X.
4v99_1_EC
Could not find nucleotides of chain EC in annotation 4v99.json. Either there is a problem with 4v99 mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
......@@ -4927,120 +5218,51 @@ Sequence is too short. (< 5 resolved nts)
4ejt_1_G
Sequence is too short. (< 5 resolved nts)
6lkq_1_W
Could not find nucleotides of chain W in annotation 6lkq.json. Either there is a problem with 6lkq mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
3qsu_1_P
Sequence is too short. (< 5 resolved nts)
3qsu_1_R
Sequence is too short. (< 5 resolved nts)
2xs7_1_B
Sequence is too short. (< 5 resolved nts)
1n38_1_B
Sequence is too short. (< 5 resolved nts)
4qvc_1_G
Sequence is too short. (< 5 resolved nts)
6q1h_1_D
Sequence is too short. (< 5 resolved nts)
6q1h_1_H
Sequence is too short. (< 5 resolved nts)
6p7p_1_F
Sequence is too short. (< 5 resolved nts)
6p7p_1_E
Sequence is too short. (< 5 resolved nts)
6p7p_1_D
Sequence is too short. (< 5 resolved nts)
6vm6_1_J
Sequence is too short. (< 5 resolved nts)
1et4_1_A
Nucleotides not inserted !
6vm6_1_G
Sequence is too short. (< 5 resolved nts)
1et4_1_C
Nucleotides not inserted !
6wan_1_K
Sequence is too short. (< 5 resolved nts)
1et4_1_B
Nucleotides not inserted !
6wan_1_H
Sequence is too short. (< 5 resolved nts)
1et4_1_D
Nucleotides not inserted !
6wan_1_G
Sequence is too short. (< 5 resolved nts)
1et4_1_E
Nucleotides not inserted !
6wan_1_L
Sequence is too short. (< 5 resolved nts)
1ddy_1_C
Nucleotides not inserted !
6wan_1_I
Sequence is too short. (< 5 resolved nts)
1ddy_1_A
Nucleotides not inserted !
6ywo_1_F
Sequence is too short. (< 5 resolved nts)
6wan_1_J
Sequence is too short. (< 5 resolved nts)
4oau_1_A
Sequence is too short. (< 5 resolved nts)
1ddy_1_E
Nucleotides not inserted !
6ywo_1_E
Sequence is too short. (< 5 resolved nts)
6ywo_1_K
Sequence is too short. (< 5 resolved nts)
6vm6_1_I
Sequence is too short. (< 5 resolved nts)
6vm6_1_H
Sequence is too short. (< 5 resolved nts)
6ywo_1_I
Sequence is too short. (< 5 resolved nts)
2a1r_1_C
Sequence is too short. (< 5 resolved nts)
6m6v_1_F
Sequence is too short. (< 5 resolved nts)
6m6v_1_E
Sequence is too short. (< 5 resolved nts)
2a1r_1_D
Sequence is too short. (< 5 resolved nts)
3gpq_1_E
Sequence is too short. (< 5 resolved nts)
6lkq_1_W
Could not find nucleotides of chain W in annotation 6lkq.json. Either there is a problem with 6lkq mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
3gpq_1_F
Sequence is too short. (< 5 resolved nts)
6r47_1_A
Nucleotides not inserted !
6o79_1_C
3qsu_1_P
Sequence is too short. (< 5 resolved nts)
6vm6_1_K
3qsu_1_R
Sequence is too short. (< 5 resolved nts)
6m6v_1_G
2xs7_1_B
Sequence is too short. (< 5 resolved nts)
6hyu_1_D
1n38_1_B
Sequence is too short. (< 5 resolved nts)
1laj_1_R
4qvc_1_G
Sequence is too short. (< 5 resolved nts)
6ybv_1_K
Could not find nucleotides of chain K in annotation 6ybv.json. Either there is a problem with 6ybv mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
6mpf_1_W
Sequence is too short. (< 5 resolved nts)
......@@ -5065,6 +5287,9 @@ Could not find nucleotides of chain V in annotation 6ftj.json. Either there is a
6ftg_1_V
Could not find nucleotides of chain V in annotation 6ftg.json. Either there is a problem with 6ftg mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
3npn_1_A
Nucleotides not inserted !
4g0a_1_G
Sequence is too short. (< 5 resolved nts)
......@@ -5080,15 +5305,6 @@ Sequence is too short. (< 5 resolved nts)
5hkc_1_C
Sequence is too short. (< 5 resolved nts)
4kzy_1_I
Could not find nucleotides of chain I in annotation 4kzy.json. Either there is a problem with 4kzy mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4kzz_1_I
Could not find nucleotides of chain I in annotation 4kzz.json. Either there is a problem with 4kzz mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
4kzx_1_I
Could not find nucleotides of chain I in annotation 4kzx.json. Either there is a problem with 4kzx mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
1rmv_1_B
Sequence is too short. (< 5 resolved nts)
......@@ -5134,69 +5350,3 @@ Sequence is too short. (< 5 resolved nts)
5hjz_1_C
Sequence is too short. (< 5 resolved nts)
7nrc_1_SM
Could not find nucleotides of chain SM in annotation 7nrc.json. Either there is a problem with 7nrc mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
7nrc_1_SN
Could not find nucleotides of chain SN in annotation 7nrc.json. Either there is a problem with 7nrc mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
7am2_1_R1
Sequence is too short. (< 5 resolved nts)
7k5l_1_R
Sequence is too short. (< 5 resolved nts)
7b5k_1_X
Could not find nucleotides of chain X in annotation 7b5k.json. Either there is a problem with 7b5k mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
7d8c_1_C
Sequence is too short. (< 5 resolved nts)
7m4y_1_V
Could not find nucleotides of chain V in annotation 7m4y.json. Either there is a problem with 7m4y mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
7m4x_1_V
Could not find nucleotides of chain V in annotation 7m4x.json. Either there is a problem with 7m4x mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
7b5k_1_Z
Could not find nucleotides of chain Z in annotation 7b5k.json. Either there is a problem with 7b5k mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
7m4u_1_A
Could not find nucleotides of chain A in annotation 7m4u.json. Either there is a problem with 7m4u mmCIF download, or the bases are not resolved in the structure. Delete it and retry.
7n06_1_G
Sequence is too short. (< 5 resolved nts)
7n06_1_H
Sequence is too short. (< 5 resolved nts)
7n06_1_I
Sequence is too short. (< 5 resolved nts)
7n06_1_J
Sequence is too short. (< 5 resolved nts)
7n06_1_K
Sequence is too short. (< 5 resolved nts)
7n06_1_L
Sequence is too short. (< 5 resolved nts)
7n33_1_G
Sequence is too short. (< 5 resolved nts)
7n33_1_H
Sequence is too short. (< 5 resolved nts)
7n33_1_I
Sequence is too short. (< 5 resolved nts)
7n33_1_J
Sequence is too short. (< 5 resolved nts)
7n33_1_K
Sequence is too short. (< 5 resolved nts)
7n33_1_L
Sequence is too short. (< 5 resolved nts)
......
......@@ -7,38 +7,27 @@
# Run this file if you want the base counts, pair-type counts, identity percents, etc
# in the database.
import getopt, os, pickle, sqlite3, shlex, subprocess, sys, warnings
import getopt, glob, json, os, sqlite3, shlex, subprocess, sys, warnings
import numpy as np
import pandas as pd
import threading as th
import scipy.stats as st
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.patches as mpatches
import scipy.cluster.hierarchy as sch
import sklearn
import json
import glob
import pickle
import Bio
from scipy.spatial.distance import squareform
from mpl_toolkits.mplot3d import axes3d
from Bio import AlignIO, SeqIO
from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB.vectors import Vector, calc_angle, calc_dihedral
from functools import partial
from multiprocessing import Pool, Manager
from multiprocessing import Pool, Manager, Value
from os import path
from tqdm import tqdm
from collections import Counter
from setproctitle import setproctitle
from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker, trace_unhandled_exceptions
from sklearn.mixture import GaussianMixture
import warnings
from pandas.core.common import SettingWithCopyWarning
from joblib import Parallel, delayed
from geometric_stats import *
np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf, precision=8)
path_to_3D_data = "tobedefinedbyoptions"
......@@ -928,6 +917,7 @@ def general_stats():
fig.savefig(runDir + "/results/figures/Nfamilies.png")
plt.close()
@trace_unhandled_exceptions
def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s):
# Identify the right 3D file
......@@ -1135,11 +1125,6 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False):
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
return 0
def log_to_pbar(pbar):
def update(r):
pbar.update(1)
return update
def family_order(f):
# sort the RNA families so that the plots are readable
......@@ -1154,70 +1139,6 @@ def family_order(f):
else:
return 2
def conversion_angles(bdd):
"""
Convert database torsion angles to degrees
and put them in a list to reuse for statistics
"""
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
db_path = os.path.join(BASE_DIR, bdd)
baseDeDonnees = sqlite3.connect(db_path)
curseur = baseDeDonnees.cursor()
curseur.execute("SELECT chain_id, nt_name, alpha, beta, gamma, delta, epsilon, zeta, chi FROM nucleotide WHERE nt_name='A' OR nt_name='C' OR nt_name='G' OR nt_name='U' ;")
liste=[]
for nt in curseur.fetchall(): # retrieve the angle measurements and put them in a list
liste.append(nt)
angles_torsion=[]
for nt in liste :
angles_deg=[]
angles_deg.append(nt[0]) #chain_id
angles_deg.append(nt[1]) #nt_name
for i in range (2,9): # on all angles
angle=0
if nt[i] == None :
angle=None
elif nt[i]<=np.pi: #if angle value <pi, positive
angle=(180/np.pi)*nt[i]
elif np.pi < nt[i] <= 2*np.pi : #if value of the angle between pi and 2pi, negative
angle=((180/np.pi)*nt[i])-360
else :
angle=nt[i] # in case some angles still in degrees
angles_deg.append(angle)
angles_torsion.append(angles_deg)
return angles_torsion
def conversion_eta_theta(bdd):
"""
Convert database pseudotorsion angles to degrees
and put them in a list to reuse for statistics
"""
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
db_path = os.path.join(BASE_DIR, bdd)
baseDeDonnees = sqlite3.connect(db_path)
curseur = baseDeDonnees.cursor()
curseur.execute("SELECT chain_id, nt_name, eta, theta, eta_prime, theta_prime, eta_base, theta_base FROM nucleotide WHERE nt_name='A' OR nt_name='C' OR nt_name='G' OR nt_name='U';")
liste=[]
for nt in curseur.fetchall():
liste.append(nt)
angles_virtuels=[]
for nt in liste :
angles_deg=[]
angles_deg.append(nt[0]) #chain_id
angles_deg.append(nt[1]) #nt_name
for i in range (2,8):
angle=0
if nt[i] == None :
angle=None
elif nt[i]<=np.pi:
angle=(180/np.pi)*nt[i]
elif np.pi < nt[i] <= 2*np.pi :
angle=((180/np.pi)*nt[i])-360
else :
angle=nt[i]
angles_deg.append(angle)
angles_virtuels.append(angles_deg)
return angles_virtuels
def nt_3d_centers(cif_file, consider_all_atoms):
"""Return the nucleotides' coordinates, summarizing a nucleotide by only one point.
If consider_all_atoms : barycentre is used
......@@ -1252,1674 +1173,30 @@ def nt_3d_centers(cif_file, consider_all_atoms):
result.append(res)
return(result)
def liste_repres(fpath):
repres=[]
df=pd.read_csv(os.path.abspath(fpath))
def representatives_from_nrlist(res):
nr_code = min([i for i in [1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 20.0] if i >= res])
fpath = f"/home/data/RNA/3D/latest_nr_list_{nr_code}A.csv"
repres = []
df = pd.read_csv(os.path.abspath(fpath))
for i in range(df.shape[0]):
up_name=df["representative"][i]
up_name = df["representative"][i]
if '+' in up_name:
up_name=up_name.split('+')
up_name = up_name.split('+')
for i in range(len(up_name)):
chain=up_name[i].split('|')
chain=chain[0].lower()+'_'+chain[1]+'_'+chain[2]
repres.append(chain+'.cif')
chain = up_name[i].split('|')
chain = chain[0].lower() + '_' + chain[1] + '_' + chain[2]
repres.append(chain + '.cif')
else :
up_name=up_name.split('|')
low_name=up_name[0].lower()+'_'+up_name[1]+'_'+up_name[2]
repres.append(low_name+'.cif')
up_name = up_name.split('|')
low_name = up_name[0].lower() + '_' + up_name[1] + '_' + up_name[2]
repres.append(low_name + '.cif')
return repres
def get_euclidian_distance(L1, L2):
"""
Returns the distance between two points (coordinates in lists)
"""
if len(L1)*len(L2) == 0:
return np.nan
if len(L1) == 1:
L1 = L1[0]
if len(L2) == 1:
L2 = L2[0]
e = 0
for i in range(len(L1)):
try:
e += float(L1[i] - L2[i])**2
except TypeError:
print("Terms: ", L1, L2)
except IndexError:
print("Terms: ", L1, L2)
return np.sqrt(e)
def get_flat_angle(L1, L2, L3):
if len(L1)*len(L2)*len(L3) == 0:
return np.nan
return calc_angle(Vector(L1[0]), Vector(L2[0]), Vector(L3[0]))*(180/np.pi)
def get_torsion_angle(L1, L2, L3, L4):
if len(L1)*len(L2)*len(L3)*len(L4) == 0:
return np.nan
return calc_dihedral(Vector(L1[0]), Vector(L2[0]), Vector(L3[0]), Vector(L4[0]))*(180/np.pi)
def pos_b1(res):
"""
Returns the coordinates of virtual atom B1 (center of the first aromatic cycle)
"""
coordb1=[]
somme_x_b1=0
somme_y_b1=0
somme_z_b1=0
moy_x_b1=0
moy_y_b1=0
moy_z_b1=0
#different cases
#some residues have 2 aromatic cycles
if res.get_resname() in ['A', 'G', '2MG', '7MG', 'MA6', '6IA', 'OMG' , '2MA', 'B9B', 'A2M', '1MA', 'E7G', 'P7G', 'B8W', 'B8K', 'BGH', '6MZ', 'E6G', 'MHG', 'M7A', 'M2G', 'P5P', 'G7M', '1MG', 'T6A', 'MIA', 'YG', 'YYG', 'I', 'DG', 'N79', '574', 'DJF', 'AET', '12A', 'ANZ', 'UY4'] :
c=0
names=[]
for atom in res :
if (atom.get_fullname() in ['N9', 'C8', 'N7', 'C4', 'C5']) :
c=c+1
names.append(atom.get_name())
coord=atom.get_vector()
somme_x_b1=somme_x_b1+coord[0]
somme_y_b1=somme_y_b1+coord[1]
somme_z_b1=somme_z_b1+coord[2]
else :
c=c
#calcul coord B1
if c != 0 :
moy_x_b1=somme_x_b1/c
moy_y_b1=somme_y_b1/c
moy_z_b1=somme_z_b1/c
coordb1.append(moy_x_b1)
coordb1.append(moy_y_b1)
coordb1.append(moy_z_b1)
#others have only one cycle
if res.get_resname() in ['C', 'U', 'AG9', '70U', '1RN', 'RSP', '3AU', 'CM0', 'U8U', 'IU', 'E3C', '4SU', '5HM', 'LV2', 'LHH', '4AC', 'CH', 'Y5P', '2MU', '4OC', 'B8T', 'JMH', 'JMC', 'DC', 'B9H', 'UR3', 'I4U', 'B8Q', 'P4U', 'OMU', 'OMC', '5MU', 'H2U', 'CBV', 'M1Y', 'B8N', '3TD', 'B8H'] :
c=0
for atom in res :
if (atom.get_fullname() in ['C6', 'N3', 'N1', 'C2', 'C4', 'C5']):
c=c+1
coord=atom.get_vector()
somme_x_b1=somme_x_b1+coord[0]
somme_y_b1=somme_y_b1+coord[1]
somme_z_b1=somme_z_b1+coord[2]
#calcul coord B1
if c != 0 :
moy_x_b1=somme_x_b1/c
moy_y_b1=somme_y_b1/c
moy_z_b1=somme_z_b1/c
coordb1.append(moy_x_b1)
coordb1.append(moy_y_b1)
coordb1.append(moy_z_b1)
if len(coordb1):
return [coordb1]
else:
return []
def pos_b2(res):
"""
Returns the coordinates of virtual atom B2 (center of the second aromatic cycle, if exists)
"""
coordb2=[]
somme_x_b2=0
somme_y_b2=0
somme_z_b2=0
moy_x_b2=0
moy_y_b2=0
moy_z_b2=0
if res.get_resname() in ['A', 'G', '2MG', '7MG', 'MA6', '6IA', 'OMG' , '2MA', 'B9B', 'A2M', '1MA', 'E7G', 'P7G', 'B8W', 'B8K', 'BGH', '6MZ', 'E6G', 'MHG', 'M7A', 'M2G', 'P5P', 'G7M', '1MG', 'T6A', 'MIA', 'YG', 'YYG', 'I', 'DG', 'N79', '574', 'DJF', 'AET', '12A', 'ANZ', 'UY4'] : #2 cycles aromatiques
c=0
for atom in res :
if atom.get_fullname() in ['C6', 'N3', 'N1', 'C2', 'C4', 'C5'] :
c=c+1
coord=atom.get_vector()
somme_x_b2=somme_x_b2+coord[0]
somme_y_b2=somme_y_b2+coord[1]
somme_z_b2=somme_z_b2+coord[2]
#calcul coord B2
if c!=0 :
moy_x_b2=somme_x_b2/c
moy_y_b2=somme_y_b2/c
moy_z_b2=somme_z_b2/c
coordb2.append(moy_x_b2)
coordb2.append(moy_y_b2)
coordb2.append(moy_z_b2)
if len(coordb2):
return [coordb2]
else:
return []
@trace_unhandled_exceptions
def basepair_measures(res, pair):
"""
measurement of the flat angles describing a basepair in the HiRE-RNA model
"""
if res.get_resname()=='C' or res.get_resname()=='U' :
atom_c4_res = [ atom.get_coord() for atom in res if "C4'" in atom.get_fullname() ]
atom_c1p_res = [ atom.get_coord() for atom in res if "C1'" in atom.get_fullname() ]
atom_b1_res = pos_b1(res)
if not len(atom_c4_res) or not len(atom_c1p_res) or not len(atom_b1_res):
return
a3_res = Vector(atom_c4_res[0])
a2_res = Vector(atom_c1p_res[0])
a1_res = Vector(atom_b1_res[0])
if res.get_resname()=='A' or res.get_resname()=='G' :
atom_c1p_res = [ atom.get_coord() for atom in res if "C1'" in atom.get_fullname() ]
atom_b1_res = pos_b1(res)
atom_b2_res = pos_b2(res)
if not len(atom_c1p_res) or not len(atom_b1_res) or not len(atom_b2_res):
return
a3_res = Vector(atom_c1p_res[0])
a2_res = Vector(atom_b1_res[0])
a1_res = Vector(atom_b2_res[0])
if pair.get_resname()=='C' or pair.get_resname()=='U' :
atom_c4_pair = [ atom.get_coord() for atom in pair if "C4'" in atom.get_fullname() ]
atom_c1p_pair = [ atom.get_coord() for atom in pair if "C1'" in atom.get_fullname() ]
atom_b1_pair = pos_b1(pair)
if not len(atom_c4_pair) or not len(atom_c1p_pair) or not len(atom_b1_pair):
return
a3_pair = Vector(atom_c4_pair[0])
a2_pair = Vector(atom_c1p_pair[0])
a1_pair = Vector(atom_b1_pair[0])
if pair.get_resname()=='A' or pair.get_resname()=='G' :
atom_c1p_pair = [ atom.get_coord() for atom in pair if "C1'" in atom.get_fullname() ]
atom_b1_pair = pos_b1(pair)
atom_b2_pair = pos_b2(pair)
if not len(atom_c1p_pair) or not len(atom_b1_pair) or not len(atom_b2_pair): # No C1' atom in the paired nucleotide, skip measures.
return
a3_pair = Vector(atom_c1p_pair[0])
a2_pair = Vector(atom_b1_pair[0])
a1_pair = Vector(atom_b2_pair[0])
# Bond vectors
res_32 = a3_res - a2_res
res_12 = a1_res - a2_res
pair_32 = a3_pair - a2_pair
pair_12 = a1_pair - a2_pair
rho = a1_res - a1_pair # from pair to res
# dist
dist = rho.norm()
# we calculate the 2 plane angles
with warnings.catch_warnings():
warnings.simplefilter('ignore', RuntimeWarning)
b = res_12.angle(rho)*(180/np.pi) # equal to the previous implementation
c = pair_12.angle(-rho)*(180/np.pi) #
# a = calc_angle(a1_res, a2_res, a3_res)*(180/np.pi) # not required
# b = calc_angle(a2_res, a1_res, a1_pair)*(180/np.pi)
# c = calc_angle(a1_res, a1_pair, a2_pair)*(180/np.pi)
# d = calc_angle(a3_pair, a2_pair, a1_pair)*(180/np.pi) # not required
# Compute plane vectors
n1 = (res_32**res_12).normalized() # ** between vectors, is the cross product
n2 = (pair_32**pair_12).normalized()
# Distances between base tip and the other base's plane (orthogonal projection)
# if angle(rho, n) > pi/2 the distance is negative (signed following n)
d1 = rho*n1 # projection of rho on axis n1
d2 = rho*n2
# Now the projection of rho in the planes. It's just a sum of the triangles' two other edges.
p1 = (-rho+n1**d1).normalized() # between vector and scalar, ** is the multiplication by a scalar
p2 = (rho-n2**d2).normalized()
# Measure tau, the dihedral
u = (res_12**rho).normalized()
v = (rho**pair_12).normalized()
cosTau1 = n1*u
cosTau2 = v*n2
# cosTau is enough to compute alpha, but we can't distinguish
# yet betwwen tau and -tau. If the full computation if required, then:
tau1 = np.arccos(cosTau1)*(180/np.pi)
tau2 = np.arccos(cosTau2)*(180/np.pi)
w1 = u**n1
w2 = v**n2
if res_12*w1 < 0:
tau1 = -tau1
if pair_12*w2 < 0:
tau2 = -tau2
# And finally, the a1 and a2 angles between res_12 and p1 / pair_12 and p2
with warnings.catch_warnings():
warnings.simplefilter('ignore', RuntimeWarning)
a1 = (-res_12).angle(p1)*(180/np.pi)
a2 = (-pair_12).angle(p2)*(180/np.pi)
if cosTau1 > 0:
# CosTau > 0 (Tau < 90 or Tau > 270) implies that alpha > 180.
a1 = -a1
if cosTau2 > 0:
a2 = -a2
return [dist, b, c, d1, d2, a1, a2, tau1, tau2]
@trace_unhandled_exceptions
def measure_from_structure(f):
"""
Do geometric measures required on a given filename
"""
name = f.split('.')[0]
global idxQueue
thr_idx = idxQueue.get()
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} measure_from_structure({f})")
# Open the structure
with warnings.catch_warnings():
# Ignore the PDB problems. This mostly warns that some chain is discontinuous.
warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.PDBConstructionWarning)
warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning)
parser=MMCIFParser()
s = parser.get_structure(f, os.path.abspath(path_to_3D_data+ "rna_only/" + f))
#pyle_measures(name, s, thr_idx)
#measures_aa(name, s, thr_idx)
if DO_HIRE_RNA_MEASURES:
measures_hrna(name, s, thr_idx)
measures_hrna_basepairs(name, s, thr_idx)
if DO_WADLEY_ANALYSIS:
#measures_wadley(name, s, thr_idx)
pyle_measures(name, s, thr_idx)
idxQueue.put(thr_idx) # replace the thread index in the queue
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
@trace_unhandled_exceptions
def measures_wadley(name, s, thr_idx):
"""
Measures the distances and plane angles involving C1' and P atoms
Saves the results in a dataframe
"""
# do not recompute something already computed
if (path.isfile(runDir + '/results/geometry/Pyle/angles/flat_angles_pyle_' + name + '.csv') and
path.isfile(runDir + "/results/geometry/Pyle/distances/distances_wadley_" + name + ".csv")):
return
liste_dist = []
liste_angl = []
last_p = []
last_c1p = []
last_c4p = []
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} measures_wadley({name})")
chain = next(s[0].get_chains())
for res in tqdm(chain, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {name} measures_wadley", unit="res", leave=False):
p_c1p_psuiv = np.nan
c1p_psuiv_c1psuiv = np.nan
if res.get_resname() not in ['ATP', 'CCC', 'A3P', 'A23', 'GDP', 'RIA', "2BA"] :
atom_p = [ atom.get_coord() for atom in res if atom.get_name() == "P"]
atom_c1p = [ atom.get_coord() for atom in res if "C1'" in atom.get_fullname() ]
atom_c4p = [ atom.get_coord() for atom in res if "C4'" in atom.get_fullname() ]
if len(atom_c1p) > 1:
for atom in res:
if "C1'" in atom.get_fullname():
print("\n", atom.get_fullname(), "-", res.get_resname(), "\n")
p_c1p_psuiv = get_flat_angle(last_p, last_c1p, atom_p)
c1p_psuiv_c1psuiv = get_flat_angle(last_c1p, atom_p, atom_c1p)
c1p_psuiv = get_euclidian_distance(last_c1p, atom_p)
p_c1p = get_euclidian_distance(atom_p, atom_c1p)
c4p_psuiv = get_euclidian_distance(last_c4p, atom_p)
p_c4p = get_euclidian_distance(atom_p, atom_c4p)
last_p = atom_p
last_c1p = atom_c1p
last_c4p = atom_c4p
liste_dist.append([res.get_resname(), c1p_psuiv, p_c1p, c4p_psuiv, p_c4p])
liste_angl.append([res.get_resname(), p_c1p_psuiv, c1p_psuiv_c1psuiv])
df = pd.DataFrame(liste_dist, columns=["Residu", "C1'-P", "P-C1'", "C4'-P", "P-C4'"])
df.to_csv(runDir + "/results/geometry/Pyle/distances/distances_wadley_" + name + ".csv")
df = pd.DataFrame(liste_angl, columns=["Residu", "P-C1'-P°", "C1'-P°-C1'°"])
df.to_csv(runDir + "/results/geometry/Pyle/angles/flat_angles_pyle_"+name+".csv")
@trace_unhandled_exceptions
def measures_aa(name, s, thr_idx):
"""
Measures the distance between atoms linked by covalent bonds
"""
# do not recompute something already computed
if path.isfile(runDir+"/results/geometry/all-atoms/distances/dist_atoms_"+name+".csv"):
return
last_o3p = [] # o3 'of the previous nucleotide linked to the P of the current nucleotide
liste_common = []
liste_purines = []
liste_pyrimidines = []
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} measure_aa_dists({name})")
chain = next(s[0].get_chains()) # 1 chain per file
residues = list(chain.get_residues())
pbar = tqdm(total=len(residues), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {name} measure_aa_dists", unit="res", leave=False)
pbar.update(0)
for res in chain :
# for residues A, G, C, U
op3_p=[]
p_op1=[]
p_op2=[]
p_o5p=[]
o5p_c5p=[]
c5p_c4p=[]
c4p_o4p=[]
o4p_c1p=[]
c1p_c2p=[]
c2p_o2p=[]
c2p_c3p=[]
c3p_o3p=[]
c4p_c3p=[]
#if res = A or G
c1p_n9=None
n9_c8=None
c8_n7=None
n7_c5=None
c5_c6=None
c6_n1=None
n1_c2=None
c2_n3=None
n3_c4=None
c4_n9=None
c4_c5=None
#if res=G
c6_o6=None
c2_n2=None
#if res = A
c6_n6=None
#if res = C or U
c1p_n1=None
n1_c6=None
c6_c5=None
c5_c4=None
c4_n3=None
n3_c2=None
c2_n1=None
c2_o2=None
#if res =C
c4_n4=None
#if res=U
c4_o4=None
last_o3p_p=None
if res.get_resname()=='A' or res.get_resname()=='G' or res.get_resname()=='C' or res.get_resname()=='U' :
#get the coordinates of the atoms
atom_p = [ atom.get_coord() for atom in res if atom.get_name() == "P"]
atom_op3 = [ atom.get_coord() for atom in res if "OP3" in atom.get_fullname() ]
atom_op1 = [ atom.get_coord() for atom in res if "OP1" in atom.get_fullname() ]
atom_op2 = [ atom.get_coord() for atom in res if "OP2" in atom.get_fullname() ]
atom_o5p= [ atom.get_coord() for atom in res if "O5'" in atom.get_fullname() ]
atom_c5p = [ atom.get_coord() for atom in res if "C5'" in atom.get_fullname() ]
atom_c4p = [ atom.get_coord() for atom in res if "C4'" in atom.get_fullname() ]
atom_o4p = [ atom.get_coord() for atom in res if "O4'" in atom.get_fullname() ]
atom_c3p = [ atom.get_coord() for atom in res if "C3'" in atom.get_fullname() ]
atom_o3p = [ atom.get_coord() for atom in res if "O3'" in atom.get_fullname() ]
atom_c2p = [ atom.get_coord() for atom in res if "C2'" in atom.get_fullname() ]
atom_o2p = [ atom.get_coord() for atom in res if "O2'" in atom.get_fullname() ]
atom_c1p = [ atom.get_coord() for atom in res if "C1'" in atom.get_fullname() ]
atom_n9 = [ atom.get_coord() for atom in res if "N9" in atom.get_fullname() ]
atom_c8 = [ atom.get_coord() for atom in res if "C8" in atom.get_fullname() ]
atom_n7 = [ atom.get_coord() for atom in res if "N7" in atom.get_fullname() ]
atom_c5 = [ atom.get_coord() for atom in res if atom.get_name() == "C5"]
atom_c6 = [ atom.get_coord() for atom in res if "C6" in atom.get_fullname() ]
atom_o6 = [ atom.get_coord() for atom in res if "O6" in atom.get_fullname() ]
atom_n6 = [ atom.get_coord() for atom in res if "N6" in atom.get_fullname() ]
atom_n1 = [ atom.get_coord() for atom in res if "N1" in atom.get_fullname() ]
atom_c2 = [ atom.get_coord() for atom in res if atom.get_name() == "C2"]
atom_n2 = [ atom.get_coord() for atom in res if "N2" in atom.get_fullname() ]
atom_o2 = [ atom.get_coord() for atom in res if atom.get_name() == "O2"]
atom_n3 = [ atom.get_coord() for atom in res if "N3" in atom.get_fullname() ]
atom_c4 = [ atom.get_coord() for atom in res if atom.get_name() == "C4" ]
atom_n4 = [ atom.get_coord() for atom in res if "N4" in atom.get_fullname() ]
atom_o4 = [ atom.get_coord() for atom in res if atom.get_name() == "O4"]
op3_p = get_euclidian_distance(atom_op3, atom_p)
last_o3p_p = get_euclidian_distance(last_o3p, atom_p) # link with the previous nucleotide
p_op1 = get_euclidian_distance(atom_op1, atom_p)
p_op2 = get_euclidian_distance(atom_op2, atom_p)
p_o5p = get_euclidian_distance(atom_o5p, atom_p)
o5p_c5p = get_euclidian_distance(atom_o5p, atom_c5p)
c5p_c4p = get_euclidian_distance(atom_c5p, atom_c4p)
c4p_o4p = get_euclidian_distance(atom_c4p, atom_o4p)
c4p_c3p = get_euclidian_distance(atom_c4p, atom_c3p)
o4p_c1p = get_euclidian_distance(atom_o4p, atom_c1p)
c1p_c2p = get_euclidian_distance(atom_c1p, atom_c2p)
c2p_o2p = get_euclidian_distance(atom_c2p, atom_o2p)
c2p_c3p = get_euclidian_distance(atom_c2p, atom_c3p)
c3p_o3p = get_euclidian_distance(atom_c3p, atom_o3p)
last_o3p=atom_o3p # o3' of this residue becomes the previous o3' of the following
#different cases for the aromatic cycles
if res.get_resname()=='A' or res.get_resname()=='G':
# computes the distances between atoms of aromatic cycles
c1p_n9 = get_euclidian_distance(atom_c1p, atom_n9)
n9_c8 = get_euclidian_distance(atom_n9, atom_c8)
c8_n7 = get_euclidian_distance(atom_c8, atom_n7)
n7_c5 = get_euclidian_distance(atom_n7, atom_c5)
c5_c6 = get_euclidian_distance(atom_c5, atom_c6)
c6_o6 = get_euclidian_distance(atom_c6, atom_o6)
c6_n6 = get_euclidian_distance(atom_c6, atom_n6)
c6_n1 = get_euclidian_distance(atom_c6, atom_n1)
n1_c2 = get_euclidian_distance(atom_n1, atom_c2)
c2_n2 = get_euclidian_distance(atom_c2, atom_n2)
c2_n3 = get_euclidian_distance(atom_c2, atom_n3)
n3_c4 = get_euclidian_distance(atom_n3, atom_c4)
c4_n9 = get_euclidian_distance(atom_c4, atom_n9)
c4_c5 = get_euclidian_distance(atom_c4, atom_c5)
if res.get_resname()=='C' or res.get_resname()=='U' :
c1p_n1 = get_euclidian_distance(atom_c1p, atom_n1)
n1_c6 = get_euclidian_distance(atom_n1, atom_c6)
c6_c5 = get_euclidian_distance(atom_c6, atom_c5)
c5_c4 = get_euclidian_distance(atom_c5, atom_c4)
c4_n3 = get_euclidian_distance(atom_c4, atom_n3)
n3_c2 = get_euclidian_distance(atom_n3, atom_c2)
c2_o2 = get_euclidian_distance(atom_c2, atom_o2)
c2_n1 = get_euclidian_distance(atom_c2, atom_n1)
c4_n4 = get_euclidian_distance(atom_c4, atom_n4)
c4_o4 = get_euclidian_distance(atom_c4, atom_o4)
liste_common.append([res.get_resname(), last_o3p_p, op3_p, p_op1, p_op2, p_o5p, o5p_c5p, c5p_c4p, c4p_o4p, c4p_c3p, o4p_c1p, c1p_c2p, c2p_o2p, c2p_c3p, c3p_o3p] )
liste_purines.append([c1p_n9, n9_c8, c8_n7, n7_c5, c5_c6, c6_o6, c6_n6, c6_n1, n1_c2, c2_n2, c2_n3, n3_c4, c4_n9, c4_c5])
liste_pyrimidines.append([c1p_n1, n1_c6, c6_c5, c5_c4, c4_n3, n3_c2, c2_o2, c2_n1, c4_n4, c4_o4])
pbar.update(1)
df_comm=pd.DataFrame(liste_common, columns=["Residu", "O3'-P", "OP3-P", "P-OP1", "P-OP2", "P-O5'", "O5'-C5'", "C5'-C4'", "C4'-O4'", "C4'-C3'", "O4'-C1'", "C1'-C2'", "C2'-O2'", "C2'-C3'", "C3'-O3'"])
df_pur=pd.DataFrame(liste_purines, columns=["C1'-N9", "N9-C8", "C8-N7", "N7-C5", "C5-C6", "C6-O6", "C6-N6", "C6-N1", "N1-C2", "C2-N2", "C2-N3", "N3-C4", "C4-N9", "C4-C5" ])
df_pyr=pd.DataFrame(liste_pyrimidines, columns=["C1'-N1", "N1-C6", "C6-C5", "C5-C4", "C4-N3", "N3-C2", "C2-O2", "C2-N1", "C4-N4", "C4-O4"])
df=pd.concat([df_comm, df_pur, df_pyr], axis = 1)
pbar.close()
df.to_csv(runDir + "/results/geometry/all-atoms/distances/dist_atoms_" + name + ".csv")
@trace_unhandled_exceptions
def measures_hrna(name, s, thr_idx):
"""
Measures the distance/angles between the atoms of the HiRE-RNA model linked by covalent bonds
"""
# do not recompute something already computed
if (path.isfile(runDir + '/results/geometry/HiRE-RNA/distances/dist_atoms_hire_RNA '+name+'.csv') and
path.isfile(runDir + '/results/geometry/HiRE-RNA/angles/angles_hire_RNA '+name+'.csv') and
path.isfile(runDir + '/results/geometry/HiRE-RNA/torsions/angles_torsion_hire_RNA '+name+'.csv')):
return
liste_dist=[]
liste_angl = []
liste_tors = []
last_c4p = []
last_c5p = []
last_c1p = []
last_o5p = []
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} measures_hrna({name})")
chain = next(s[0].get_chains())
residues=list(chain.get_residues())
for res in tqdm(chain, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {name} measures_hrna", unit="res", leave=False):
# distances
p_o5p = None
o5p_c5p = None
c5p_c4p = None
c4p_c1p = None
c1p_b1 = None
b1_b2 = None
last_c4p_p = np.nan
# angles
p_o5p_c5p = None
o5p_c5p_c4p = None
c5p_c4p_c1p = None
c4p_c1p_b1 = None
c1p_b1_b2 = None
lastc4p_p_o5p = None
lastc5p_lastc4p_p = None
lastc1p_lastc4p_p = None
# torsions
p_o5_c5_c4 = np.nan
o5_c5_c4_c1 = np.nan
c5_c4_c1_b1 = np.nan
c4_c1_b1_b2 = np.nan
o5_c5_c4_psuiv = np.nan
c5_c4_psuiv_o5suiv = np.nan
c4_psuiv_o5suiv_c5suiv = np.nan
c1_c4_psuiv_o5suiv = np.nan
if res.get_resname() not in ['ATP', 'CCC', 'A3P', 'A23', 'GDP', 'RIA', "2BA"] : # several phosphate groups, ignore
atom_p = [ atom.get_coord() for atom in res if atom.get_name() == "P"]
atom_o5p= [ atom.get_coord() for atom in res if "O5'" in atom.get_fullname() ]
atom_c5p = [ atom.get_coord() for atom in res if "C5'" in atom.get_fullname() ]
atom_c4p = [ atom.get_coord() for atom in res if "C4'" in atom.get_fullname() ]
atom_c1p = [ atom.get_coord() for atom in res if "C1'" in atom.get_fullname() ]
atom_b1 = pos_b1(res) # position b1 to be calculated, depending on the case
atom_b2 = pos_b2(res) # position b2 to be calculated only for those with 2 cycles
# Distances. If one of the atoms is empty, the euclidian distance returns NaN.
last_c4p_p = get_euclidian_distance(last_c4p, atom_p)
p_o5p = get_euclidian_distance(atom_p, atom_o5p)
o5p_c5p = get_euclidian_distance(atom_o5p, atom_c5p)
c5p_c4p = get_euclidian_distance(atom_c5p, atom_c4p)
c4p_c1p = get_euclidian_distance(atom_c4p, atom_c1p)
c1p_b1 = get_euclidian_distance(atom_c1p, atom_b1)
b1_b2 = get_euclidian_distance(atom_b1, atom_b2)
# flat angles. Same.
lastc4p_p_o5p = get_flat_angle(last_c4p, atom_p, atom_o5p)
lastc1p_lastc4p_p = get_flat_angle(last_c1p, last_c4p, atom_p)
lastc5p_lastc4p_p = get_flat_angle(last_c5p, last_c4p, atom_p)
p_o5p_c5p = get_flat_angle(atom_p, atom_o5p, atom_c5p)
o5p_c5p_c4p = get_flat_angle(atom_o5p, atom_c5p, atom_c4p)
c5p_c4p_c1p = get_flat_angle(atom_c5p, atom_c4p, atom_c1p)
c4p_c1p_b1 = get_flat_angle(atom_c4p, atom_c1p, atom_b1)
c1p_b1_b2 = get_flat_angle(atom_c1p, atom_b1, atom_b2)
# torsions. Idem.
p_o5_c5_c4 = get_torsion_angle(atom_p, atom_o5p, atom_c5p, atom_c4p)
o5_c5_c4_c1 = get_torsion_angle(atom_o5p, atom_c5p, atom_c4p, atom_c1p)
c5_c4_c1_b1 = get_torsion_angle(atom_c5p, atom_c4p, atom_c1p, atom_b1)
c4_c1_b1_b2 = get_torsion_angle(atom_c4p, atom_c1p, atom_b1, atom_b2)
o5_c5_c4_psuiv = get_torsion_angle(last_o5p, last_c5p, last_c4p, atom_p)
c5_c4_psuiv_o5suiv = get_torsion_angle(last_c5p, last_c4p, atom_p, atom_o5p)
c4_psuiv_o5suiv_c5suiv = get_torsion_angle(last_c4p, atom_p, atom_o5p, atom_c5p)
c1_c4_psuiv_o5suiv = get_torsion_angle(last_c1p, last_c4p, atom_p, atom_o5p)
last_c4p = atom_c4p
last_c5p = atom_c5p
last_c1p = atom_c1p
last_o5p = atom_o5p
liste_dist.append([res.get_resname(), last_c4p_p, p_o5p, o5p_c5p, c5p_c4p, c4p_c1p, c1p_b1, b1_b2])
liste_angl.append([res.get_resname(), lastc4p_p_o5p, lastc1p_lastc4p_p, lastc5p_lastc4p_p, p_o5p_c5p, o5p_c5p_c4p, c5p_c4p_c1p, c4p_c1p_b1, c1p_b1_b2])
liste_tors.append([res.get_resname(), p_o5_c5_c4, o5_c5_c4_c1, c5_c4_c1_b1, c4_c1_b1_b2, o5_c5_c4_psuiv, c5_c4_psuiv_o5suiv, c4_psuiv_o5suiv_c5suiv, c1_c4_psuiv_o5suiv])
df = pd.DataFrame(liste_dist, columns=["Residu", "C4'-P", "P-O5'", "O5'-C5'", "C5'-C4'", "C4'-C1'", "C1'-B1", "B1-B2"])
df.to_csv(runDir + '/results/geometry/HiRE-RNA/distances/dist_atoms_hire_RNA '+name+'.csv')
df = pd.DataFrame(liste_angl, columns=["Residu", "C4'-P-O5'", "C1'-C4'-P", "C5'-C4'-P", "P-O5'-C5'", "O5'-C5'-C4'", "C5'-C4'-C1'", "C4'-C1'-B1", "C1'-B1-B2"])
df.to_csv(runDir + '/results/geometry/HiRE-RNA/angles/angles_hire_RNA ' + name + ".csv")
df=pd.DataFrame(liste_tors, columns=["Residu", "P-O5'-C5'-C4'", "O5'-C5'-C4'-C1'", "C5'-C4'-C1'-B1", "C4'-C1'-B1-B2", "O5'-C5'-C4'-P°", "C5'-C4'-P°-O5'°", "C4'-P°-O5'°-C5'°", "C1'-C4'-P°-O5'°"])
df.to_csv(runDir + '/results/geometry/HiRE-RNA/torsions/angles_torsion_hire_RNA '+name+'.csv')
@trace_unhandled_exceptions
def measures_hrna_basepairs(name, s, thr_idx):
"""
Open a rna_only/ file, and run measures_hrna_basepairs_chain() on every chain
"""
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} measures_hrna_basepairs({name})")
l=[]
chain = next(s[0].get_chains())
# do not recompute something already computed
if path.isfile(runDir + "/results/geometry/HiRE-RNA/basepairs/basepairs_"+name+".csv"):
return
df=pd.read_csv(os.path.abspath(path_to_3D_data +"datapoints/" + name))
if df['index_chain'][0] == 1: # ignore files with numbering errors : TODO : remove when we get DSSR Pro, there should not be numbering errors anymore
l = measures_hrna_basepairs_chain(name, chain, df, thr_idx)
df_calc = pd.DataFrame(l, columns=["type_LW", "nt1_idx", "nt1_res", "nt2_idx", "nt2_res", "Distance",
"211_angle", "112_angle", "dB1", "dB2", "alpha1", "alpha2", "3211_torsion", "1123_torsion"])
df_calc.to_csv(runDir + "/results/geometry/HiRE-RNA/basepairs/"+'basepairs_' + name + '.csv', float_format="%.3f")
@trace_unhandled_exceptions
def measures_hrna_basepairs_chain(name, chain, df, thr_idx):
"""
Cleanup of the dataset
measurements of distances and angles between paired nucleotides in the chain
"""
results = []
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
pairs = df[['index_chain', 'old_nt_resnum', 'paired', 'pair_type_LW']] # columns we keep
for i in range(pairs.shape[0]): # we remove the lines where no pairing (NaN in paired)
index_with_nan = pairs.index[pairs.iloc[:,2].isnull()]
pairs.drop(index_with_nan, 0, inplace=True)
paired_int = []
for i in pairs.index: # convert values ​​from paired to integers or lists of integers
paired = pairs.at[i, 'paired']
if type(paired) is np.int64 or type(paired) is np.float64:
paired_int.append(int(paired))
else : #strings
if len(paired) < 3: # a single pairing
paired_int.append(int(paired))
else : # several pairings
paired = paired.split(',')
l = [ int(i) for i in paired ]
paired_int.append(l)
pair_type_LW_bis = []
for j in pairs.index:
pair_type_LW = pairs.at[j, 'pair_type_LW']
if len(pair_type_LW) < 4 : # a single pairing
pair_type_LW_bis.append(pair_type_LW)
else : # several pairings
pair_type_LW = pair_type_LW.split(',')
l = [ i for i in pair_type_LW ]
pair_type_LW_bis.append(pair_type_LW)
# addition of these new columns
pairs.insert(4, "paired_int", paired_int, True)
pairs.insert(5, "pair_type_LW_bis", pair_type_LW_bis, True)
indexNames = pairs[pairs['paired_int'] == 0].index
pairs.drop(indexNames, inplace=True) # deletion of lines with a 0 in paired_int (matching to another RNA chain)
for i in tqdm(pairs.index, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {name} measures_hrna_basepairs_chain", unit="res", leave=False):
# calculations for each row of the pairs dataset
index = pairs.at[i, 'index_chain']
res1 = chain[(' ', index, ' ')].get_resname()
if res1 not in ['A','C','G','U']:
continue
type_LW = pairs.at[i, 'pair_type_LW_bis'] # pairing type
num_paired = pairs.at[i, 'paired_int'] # number (index_chain) of the paired nucleotide
if type(num_paired) is int or type(num_paired) is np.int64:
res2 = chain[(' ', num_paired, ' ')].get_resname()
if res2 not in ["A","C","G","U"]:
continue
measures = basepair_measures(chain[(' ', index, ' ')], chain[(' ', num_paired, ' ')])
if measures is not None:
results.append([type_LW, index, res1, num_paired, res2] + measures)
else:
for j in range(len(num_paired)): # if several pairings, process them one by one
if num_paired[j] != 0:
res2 = chain[(' ', num_paired[j], ' ')].get_resname()
if res2 not in ["A","C","G","U"]:
continue
measures = basepair_measures(chain[(' ', index, ' ')], chain[(' ', num_paired[j], ' ')])
if measures is not None:
results.append([type_LW[j], index, res1, num_paired[j], res2] + measures)
return results
@trace_unhandled_exceptions
def pyle_measures(name, s, thr_idx):
if (path.isfile(runDir + '/results/geometry/Pyle/distances/distances_pyle_'+name+'.csv')):
return
liste_dist=[]
#classes=[]
#for i in range(0, 150, 5):
#classes.append([i, i+5])
#classes.append([150, 300])
#occur_p_p=len(classes)*[0]
#occur_p_c1=len(classes)*[0]
#occur_p_c4=len(classes)*[0]
#occur_c1_c1=len(classes)*[0]
#occur_c4_c4=len(classes)*[0]
#nb_occurs=[]
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} pyle_measures({name})")
chain = next(s[0].get_chains())
#residues=list(chain.get_residues())
for res1 in tqdm(chain, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {name} pyle_measures", unit="res", leave=False):
#res1=chain[i]
if res1.get_resname() in ["A", "C", "G", "U"]:
resnum1=list(res1.get_id())[1]
atom_p_1 = [ atom.get_coord() for atom in res1 if atom.get_name() == "P"]
atom_c1p_1 = [ atom.get_coord() for atom in res1 if "C1'" in atom.get_fullname() ]
atom_c4p_1 = [ atom.get_coord() for atom in res1 if "C4'" in atom.get_fullname() ]
for res2 in chain:
resnum2=list(res2.get_id())[1]
if resnum2-resnum1 < 4 :
continue
p_p=np.nan
p_c4p=np.nan
p_c1p=np.nan
c4p_c4p=np.nan
c1p_c1p=np.nan
#res2=chain[j]
if res2.get_resname() in ["A", "C", "G", "U"]:
atom_p_2 = [ atom.get_coord() for atom in res2 if atom.get_name() == "P"]
atom_c1p_2 = [ atom.get_coord() for atom in res2 if "C1'" in atom.get_fullname() ]
atom_c4p_2 = [ atom.get_coord() for atom in res2 if "C4'" in atom.get_fullname() ]
p_p = get_euclidian_distance(atom_p_1, atom_p_2)
p_c4p= get_euclidian_distance(atom_p_1, atom_c4p_2)
p_c1p= get_euclidian_distance(atom_p_1, atom_c1p_2)
c4p_c4p= get_euclidian_distance(atom_c4p_1, atom_c4p_2)
c1p_c1p= get_euclidian_distance(atom_c1p_1, atom_c1p_2)
liste_dist.append([res1.get_resname(), int(resnum1), res2.get_resname(), int(resnum2), p_p, p_c4p, p_c1p, c4p_c4p, c1p_c1p])
'''
for x in range(len(classes)):
if classes[x][0] <= p_p <= classes[x][1]:
occur_p_p[x]=occur_p_p[x]+1
if classes[x][0] <= p_c4p <= classes[x][1]:
occur_p_c4[x]=occur_p_c4[x]+1
if classes[x][0] <= p_c1p <= classes[x][1]:
occur_p_c1[x]=occur_p_c1[x]+1
if classes[x][0] <= c4p_c4p <= classes[x][1]:
occur_c4_c4[x]=occur_c4_c4[x]+1
if classes[x][0] <= c1p_c1p <= classes[x][1]:
occur_c1_c1[x]=occur_c1_c1[x]+1
'''
#for x in range(len(classes)):
# for i in range(len(liste_dist)):
# if classes[x][0] <= liste_dist[i][4] <= classes[x][1]:
# occur_p_p[x]=occur_p_p[x]+1
# if classes[x][0] <= liste_dist[i][5] <= classes[x][1]:
# occur_p_c4[x]=occur_p_c4[x]+1
# if classes[x][0] <= liste_dist[i][6] <= classes[x][1]:
# occur_p_c1[x]=occur_p_c1[x]+1
# if classes[x][0] <= liste_dist[i][7] <= classes[x][1]:
# occur_c4_c4[x]=occur_c4_c4[x]+1
# if classes[x][0] <= liste_dist[i][8] <= classes[x][1]:
# occur_c1_c1[x]=occur_c1_c1[x]+1
#nb_occurs.append([classes[x], occur_p_p[x], occur_p_c1[x], occur_p_c4[x], occur_c1_c1[x], occur_c4_c4[x]])
#df = pd.DataFrame(nb_occurs, columns=["classe", "P-P", "P-C1'", "P-C4'", "C1'-C1'", "C4'-C4'"])
# return df
# nb_occurs.append([classes, occur_p_p, occur_p_c1, occur_p_c4, occur_c1_c1, occur_c4_c4])
# print(nb_occurs)
# return nb_occurs
df = pd.DataFrame(liste_dist, columns=["res1", "resnum1", "res2", "resnum2", "P-P", "P-C4'", "P-C1'", "C4'-C4'", "C1'-C1'"])
df.to_csv(runDir + "/results/geometry/Pyle/distances/distances_pyle_" + name + ".csv")
@trace_unhandled_exceptions
def count_occur_pyle_dist(fpath):
global idxQueue
thr_idx = idxQueue.get()
setproctitle(f"Worker {thr_idx+1} : Extract occurences of {fpath}")
liste=os.listdir(fpath)
pbar = tqdm(total=len(liste), position=thr_idx, desc="Preparing ", leave=False)
df = pd.read_csv(os.path.abspath(fpath + liste.pop()))
occur_p_p=list(df["P-P"])
occur_p_c1=list(df["P-C1'"])
occur_p_c4=list(df["P-C4'"])
occur_c1_c1=list(df["C1'-C1'"])
occur_c4_c4=list(df["C4'-C4'"])
nb_occurs=[]
for f in range(len(liste)):
df = pd.read_csv(os.path.abspath(fpath + liste.pop()))
# print(liste[f])
for k in range(df.shape[0]):
occur_p_p[k]=occur_p_p[k]+df["P-P"][k]
occur_p_c1[k]=occur_p_c1[k]+df["P-C1'"][k]
occur_p_c4[k]=occur_p_c4[k]+df["P-C4'"][k]
occur_c1_c1[k]=occur_c1_c1[k]+df["C1'-C1'"][k]
occur_c4_c4[k]=occur_c4_c4[k]+df["C4'-C4'"][k]
pbar.update(1)
nb_occurs=[occur_p_p, occur_p_c1, occur_p_c4, occur_c1_c1, occur_c4_c4]
# return(list(df["classe"]), occur_p_p, occur_p_c1, occur_p_c4, occur_c1_c1, occur_c4_c4)
df = pd.DataFrame(nb_occurs, columns=list(df["classe"]))
df.to_csv(runDir + "/results/geometry/Pyle/classes_dist/occurences_dist.csv")
idxQueue.put(thr_idx) # replace the thread index in the queue
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
@trace_unhandled_exceptions
def GMM_histo(data_ori, name_data, toric=False, hist=True, col=None, save=True) :
"""
Plot Gaussian-Mixture-Model (with or without histograms)
"""
data_ori = np.array(data_ori)
if toric:
# Extend the data on the right and on the left (for angles)
data = np.concatenate([data_ori, data_ori-360.0, data_ori+360.0])
else:
data = data_ori
# chooses the number of components based on the maximum likelihood value (maxlogv)
n_components_range = np.arange(8)+1
# aic = []
# bic = []
maxlogv=[]
md = np.array(data).reshape(-1,1)
nb_components = 1
nb_log_max = n_components_range[0]
log_max = 0
for n_comp in n_components_range:
gmm = GaussianMixture(n_components=n_comp).fit(md)
# aic.append(abs(gmm.aic(md)))
# bic.append(abs(gmm.bic(md)))
maxlogv.append(gmm.lower_bound_)
if gmm.lower_bound_== max(maxlogv) : # takes the maximum
nb_components = n_comp
# if there is convergence, keep the first maximum found
if abs(gmm.lower_bound_-log_max) < 0.02 : #threshold=0.02
nb_components = nb_log_max
break
log_max = max(maxlogv)
nb_log_max = n_comp
# Now compute the final GMM
obs = np.array(data).reshape(-1,1) # still on extended data
g = GaussianMixture(n_components=nb_components)
g.fit(obs)
if toric:
# Now decide which to keep
keep = []
weights = []
means = []
covariances = []
sum_weights = 0.0
for m in g.means_:
keep.append(m > -180 and m <= 180)
for i, w in enumerate(g.weights_):
if not keep[i]:
continue
sum_weights += w
for i in range(nb_components):
if not keep[i]:
continue
means.append(g.means_[i])
covariances.append(g.covariances_[i])
weights.append(g.weights_[i]/sum_weights)
nb_components = len(means)
else:
weights = g.weights_
means = g.means_
covariances = g.covariances_
# plot histograms if asked, with the appropriate number of components
if hist:
plt.hist(data_ori, color="green", edgecolor='black', linewidth=1.2, bins=50, density=True)
if toric:
plt.xlabel("Angle (Degrees)")
else:
plt.xlabel("Distance (Angströms)")
plt.ylabel("Density")
# Prepare the GMM curve with some absciss points
if toric:
x = np.linspace(-360.0,360.0,721)
else:
D = obs.ravel()
xmin = D.min()
#xmax = min(10.0, D.max())
xmax = D.max()
x = np.linspace(xmin,xmax,1000)
colors=['red', 'blue', 'gold', 'cyan', 'magenta', 'white', 'black', 'green']
# prepare the dictionary to save the parameters
summary_data = {}
summary_data["measure"] = name_data
summary_data["weights"] = []
summary_data["means"] = []
summary_data["std"] = []
# plot
curves = []
for i in range(nb_components):
# store the parameters
mean = means[i]
sigma = np.sqrt(covariances[i])
weight = weights[i]
summary_data["means"].append("{:.2f}".format(float(str(mean).strip("[]"))))
summary_data["std"].append("{:.2f}".format(float(str(sigma).strip("[]"))))
summary_data["weights"].append("{:.2f}".format(float(str(weight).strip("[]"))))
# compute the right x and y data to plot
y = weight*st.norm.pdf(x, mean, sigma)
if toric:
y_mod = (((y[0]+180.0)%360.0)-180.0)
x_mod = (((x+180.0)%360.0)-180.0)
s = sorted(zip(x_mod,y_mod))
newx = []
newy = []
for k in range(0, len(s), 2):
if k == 362.0:
continue # this value is dealt with when k = 360.0
# print(k, "summing: ", s[k-int(k>360)], s[k+1-int(k>360)])
newx.append(s[k-int(k>360)][0])
if k == 360.0:
newy.append(s[k][1]+s[k+1][1]+s[k+2][1])
else:
newy.append(s[k-int(k>360)][1]+s[k+1-int(k>360)][1])
else:
newx = x
newy = y[0]
if hist:
# plot on top of the histograms
plt.plot(newx, newy, c=colors[i])
else:
# store for later summation
curves.append(np.array(newy))
if hist:
plt.title(f"Histogram of {name_data} with GMM of {nb_components} components (" + str(len(data_ori))+" values)")
if save:
plt.savefig(f"Histogram_{name_data}_{nb_components}_comps.png")
plt.close()
else:
# Plot their sum, do not save figure yet
try:
plt.plot(newx, sum(curves), c=col, label=name_data)
except TypeError:
print("N curves:", len(curves))
for c in curves:
print(c)
plt.legend()
# Save the json
with open(runDir + "/results/geometry/json/" +name_data + ".json", 'w', encoding='utf-8') as f:
json.dump(summary_data, f, indent=4)
@trace_unhandled_exceptions
def gmm_aa_dists():
"""
Draw the figures representing the data on the measurements of distances between atoms
"""
setproctitle("GMM (all atoms, distances)")
df=pd.read_csv(os.path.abspath(runDir + "/results/geometry/all-atoms/distances/dist_atoms.csv"))
last_o3p_p=list(df["O3'-P"][~ np.isnan(df["O3'-P"])])
#print(last_o3p_p)
op3_p=list(df["OP3-P"][~ np.isnan(df["OP3-P"])])
p_op1=list(df["P-OP1"][~ np.isnan(df["P-OP1"])])
p_op2=list(df["P-OP2"][~ np.isnan(df["P-OP2"])])
p_o5p=list(df["P-O5'"][~ np.isnan(df["P-O5'"])])
o5p_c5p=list(df["O5'-C5'"][~ np.isnan(df["O5'-C5'"])])
c5p_c4p=list(df["C5'-C4'"][~ np.isnan(df["C5'-C4'"])])
c4p_o4p=list(df["C4'-O4'"][~ np.isnan(df["C4'-O4'"])])
o4p_c1p=list(df["O4'-C1'"][~ np.isnan(df["O4'-C1'"])])
c1p_c2p=list(df["C1'-C2'"][~ np.isnan(df["C1'-C2'"])])
c2p_o2p=list(df["C2'-O2'"][~ np.isnan(df["C2'-O2'"])])
c2p_c3p=list(df["C2'-C3'"][~ np.isnan(df["C2'-C3'"])])
c3p_o3p=list(df["C3'-O3'"][~ np.isnan(df["C3'-O3'"])])
c4p_c3p=list(df["C4'-C3'"][~ np.isnan(df["C4'-C3'"])])
#if res = A ou G
c1p_n9=list(df["C1'-N9"][~ np.isnan(df["C1'-N9"])])
n9_c8=list(df["N9-C8"][~ np.isnan(df["N9-C8"])])
c8_n7=list(df["C8-N7"][~ np.isnan(df["C8-N7"])])
n7_c5=list(df["N7-C5"][~ np.isnan(df["N7-C5"])])
c5_c6=list(df["C5-C6"][~ np.isnan(df["C5-C6"])])
c6_n1=list(df["C6-N1"][~ np.isnan(df["C6-N1"])])
n1_c2=list(df["N1-C2"][~ np.isnan(df["N1-C2"])])
c2_n3=list(df["C2-N3"][~ np.isnan(df["C2-N3"])])
n3_c4=list(df["N3-C4"][~ np.isnan(df["N3-C4"])])
c4_n9=list(df["C4-N9"][~ np.isnan(df["C4-N9"])])
c4_c5=list(df["C4-C5"][~ np.isnan(df["C4-C5"])])
#if res=G
c6_o6=list(df["C6-O6"][~ np.isnan(df["C6-O6"])])
c2_n2=list(df["C2-N2"][~ np.isnan(df["C2-N2"])])
#if res = A
c6_n6=list(df["C6-N6"][~ np.isnan(df["C6-N6"])])
#if res = C ou U
c1p_n1=list(df["C1'-N1"][~ np.isnan(df["C1'-N1"])])
n1_c6=list(df["N1-C6"][~ np.isnan(df["N1-C6"])])
c6_c5=list(df["C6-C5"][~ np.isnan(df["C6-C5"])])
c5_c4=list(df["C5-C4"][~ np.isnan(df["C5-C4"])])
c4_n3=list(df["C4-N3"][~ np.isnan(df["C4-N3"])])
n3_c2=list(df["N3-C2"][~ np.isnan(df["N3-C2"])])
c2_n1=list(df["C2-N1"][~ np.isnan(df["C2-N1"])])
c2_o2=list(df["C2-O2"][~ np.isnan(df["C2-O2"])])
#if res =C
c4_n4=list(df["C4-N4"][~ np.isnan(df["C4-N4"])])
#if res=U
c4_o4=list(df["C4-O4"][~ np.isnan(df["C4-O4"])])
os.makedirs(runDir+"/results/figures/GMM/all-atoms/distances/commun/", exist_ok=True)
os.chdir(runDir+"/results/figures/GMM/all-atoms/distances/commun/")
# draw figures for atoms common to all nucleotides
GMM_histo(last_o3p_p, "O3'-P")
if len(op3_p) > 0 :
GMM_histo(op3_p, "OP3-P")
GMM_histo(p_op1, "P-OP1")
GMM_histo(p_op2, "P-OP2")
GMM_histo(p_o5p, "P-O5'")
GMM_histo(o5p_c5p, "O5'-C5'")
GMM_histo(c5p_c4p, "C5'-C4'")
GMM_histo(c4p_o4p, "C4'-O4'")
GMM_histo(c4p_c3p, "C4'-C3'")
GMM_histo(c3p_o3p, "C3'-O3'")
GMM_histo(o4p_c1p, "O4'-C1'")
GMM_histo(c1p_c2p, "C1'-C2'")
GMM_histo(c2p_c3p, "C2'-C3'")
GMM_histo(c2p_o2p, "C2'-O2'")
if len(op3_p) > 0 :
GMM_histo(op3_p, "OP3-P", toric=False, hist=False, col= 'lightcoral')
GMM_histo(p_op1, "P-OP1", toric=False, hist=False, col='gold')
GMM_histo(p_op2, "P-OP2", toric=False, hist=False, col='lightseagreen')
GMM_histo(last_o3p_p, "O3'-P", toric=False, hist=False, col='saddlebrown')
GMM_histo(p_o5p, "P-O5'", toric=False, hist=False, col='darkturquoise')
GMM_histo(o5p_c5p, "O5'-C5'", toric=False, hist=False, col='darkkhaki')
GMM_histo(c5p_c4p, "C5'-C4'", toric=False, hist=False, col='indigo')
GMM_histo(c4p_o4p, "C4'-O4'", toric=False, hist=False, col='maroon')
GMM_histo(c4p_c3p, "C4'-C3'", toric=False, hist=False, col='burlywood')
GMM_histo(c3p_o3p, "C3'-O3'", toric=False, hist=False, col='steelblue')
GMM_histo(o4p_c1p, "O4'-C1'", toric=False, hist=False, col='tomato')
GMM_histo(c1p_c2p, "C1'-C2'", toric=False, hist=False, col='darkolivegreen')
GMM_histo(c2p_c3p, "C2'-C3'", toric=False, hist=False, col='orchid')
GMM_histo(c2p_o2p, "C2'-O2'", toric=False, hist=False, col='deeppink')
axes=plt.gca()
axes.set_ylim(0, 100)
plt.xlabel("Distance (Angströms)")
plt.title("GMM of distances between common atoms ")
plt.savefig(runDir + "/results/figures/GMM/all-atoms/distances/commun/" + "GMM_distances_common_atoms.png")
plt.close()
os.makedirs(runDir+"/results/figures/GMM/all-atoms/distances/purines/", exist_ok=True)
os.chdir(runDir+"/results/figures/GMM/all-atoms/distances/purines/")
# purines
GMM_histo(c1p_n9, "C1'-N9")
GMM_histo(n9_c8, "N9-C8")
GMM_histo(c8_n7, "C8-N7")
GMM_histo(n7_c5, "N7-C5")
GMM_histo(c5_c6, "C5-C6")
GMM_histo(c6_o6, "C6-O6")
GMM_histo(c6_n6, "C6-N6")
GMM_histo(c6_n1, "C6-N1")
GMM_histo(n1_c2, "N1-C2")
GMM_histo(c2_n2, "C2-N2")
GMM_histo(c2_n3, "C2-N3")
GMM_histo(n3_c4, "N3-C4")
GMM_histo(c4_n9, "C4-N9")
GMM_histo(c4_c5, "C4-C5")
GMM_histo(c1p_n9, "C1'-N9", hist=False, col='lightcoral')
GMM_histo(n9_c8, "N9-C8", hist=False, col='gold')
GMM_histo(c8_n7, "C8-N7", hist=False, col='lightseagreen')
GMM_histo(n7_c5, "N7-C5", hist=False, col='saddlebrown')
GMM_histo(c5_c6, "C5-C6", hist=False, col='darkturquoise')
GMM_histo(c6_o6, "C6-O6", hist=False, col='darkkhaki')
GMM_histo(c6_n6, "C6-N6", hist=False, col='indigo')
GMM_histo(c6_n1, "C6-N1", hist=False, col='maroon')
GMM_histo(n1_c2, "N1-C2", hist=False, col='burlywood')
GMM_histo(c2_n2, "C2-N2", hist=False, col='steelblue')
GMM_histo(c2_n3, "C2-N3", hist=False, col='tomato')
GMM_histo(n3_c4, "N3-C4", hist=False, col='darkolivegreen')
GMM_histo(c4_n9, "C4-N9", hist=False, col='orchid')
GMM_histo(c4_c5, "C4-C5", hist=False, col='deeppink')
axes=plt.gca()
axes.set_ylim(0, 100)
plt.xlabel("Distance (Angströms)")
plt.title("GMM of distances between atoms of the purine cycles", fontsize=10)
plt.savefig(runDir+ "/results/figures/GMM/all-atoms/distances/purines/" + "GMM_distances_purine_cycles.png")
plt.close()
os.makedirs(runDir+"/results/figures/GMM/all-atoms/distances/pyrimidines/", exist_ok=True)
os.chdir(runDir+"/results/figures/GMM/all-atoms/distances/pyrimidines/")
# pyrimidines
GMM_histo(c1p_n1, "C1'-N1")
GMM_histo(n1_c6, "N1-C6")
GMM_histo(c6_c5, "C6-C5")
GMM_histo(c5_c4, "C5-C4")
GMM_histo(c4_n3, "C4-N3")
GMM_histo(n3_c2, "N3-C2")
GMM_histo(c2_o2, "C2-O2")
GMM_histo(c2_n1, "C2-N1")
GMM_histo(c4_n4, "C4-N4")
GMM_histo(c4_o4, "C4-O4")
GMM_histo(c1p_n1, "C1'-N1", hist=False, col='lightcoral')
GMM_histo(n1_c6, "N1-C6", hist=False, col='gold')
GMM_histo(c6_c5, "C6-C5", hist=False, col='lightseagreen')
GMM_histo(c5_c4, "C5-C4", hist=False, col='deeppink')
GMM_histo(c4_n3, "C4-N3", hist=False, col='red')
GMM_histo(n3_c2, "N3-C2", hist=False, col='lime')
GMM_histo(c2_o2, "C2-O2", hist=False, col='indigo')
GMM_histo(c2_n1, "C2-N1", hist=False, col='maroon')
GMM_histo(c4_n4, "C4-N4", hist=False, col='burlywood')
GMM_histo(c4_o4, "C4-O4", hist=False, col='steelblue')
axes=plt.gca()
#axes.set_xlim(1, 2)
axes.set_ylim(0, 100)
plt.xlabel("Distance (Angströms")
plt.title("GMM of distances between atoms of the pyrimidine cycles", fontsize=10)
plt.savefig(runDir + "/results/figures/GMM/all-atoms/distances/pyrimidines/" + "GMM_distances_pyrimidine_cycles.png")
plt.close()
os.chdir(runDir)
setproctitle("GMM (all atoms, distances) finished")
@trace_unhandled_exceptions
def gmm_aa_torsions():
"""
Separates the torsion angle measurements by angle type and plots the figures representing the data
"""
setproctitle("GMM (all atoms, torsions)")
# we create lists to store the values ​​of each angle
alpha=[]
beta=[]
gamma=[]
delta=[]
epsilon=[]
zeta=[]
chi = []
for angles_deg in conversion_angles(runDir + "/results/RNANet.db"):
alpha.append(angles_deg[2])
beta.append(angles_deg[3])
gamma.append(angles_deg[4])
delta.append(angles_deg[5])
epsilon.append(angles_deg[6])
zeta.append(angles_deg[7])
chi.append(angles_deg[8])
# we remove the null values
alpha=[i for i in alpha if i != None]
beta=[i for i in beta if i != None]
gamma=[i for i in gamma if i != None]
delta=[i for i in delta if i != None]
epsilon=[i for i in epsilon if i != None]
zeta=[i for i in zeta if i != None]
chi=[i for i in chi if i != None]
os.makedirs(runDir + "/results/figures/GMM/all-atoms/torsions/", exist_ok=True)
os.chdir(runDir + "/results/figures/GMM/all-atoms/torsions/")
"""
We plot the GMMs with histogram for each angle
We create the corresponding json with the means and standard deviations of each Gaussian
We draw the figure grouping the GMMs of all angles without histogram to compare them with each other
"""
GMM_histo(alpha, "Alpha", toric=True)
GMM_histo(beta, "Beta", toric=True)
GMM_histo(gamma, "Gamma", toric=True)
GMM_histo(delta, "Delta", toric=True)
GMM_histo(epsilon, "Epsilon", toric=True)
GMM_histo(zeta, "Zeta", toric=True)
GMM_histo(chi, "Xhi", toric=True)
GMM_histo(alpha, "Alpha", toric=True, hist=False, col='red')
GMM_histo(beta, "Beta", toric=True, hist=False, col='firebrick')
GMM_histo(gamma, "Gamma", toric=True, hist=False, col='limegreen')
GMM_histo(delta, "Delta", toric=True, hist=False, col='darkslateblue')
GMM_histo(epsilon, "Epsilon", toric=True, hist=False, col='goldenrod')
GMM_histo(zeta, "Zeta", toric=True, hist=False, col='teal')
GMM_histo(chi, "Xhi", toric=True, hist=False, col='hotpink')
plt.xlabel("Angle (Degrees)")
plt.title("GMM of torsion angles")
plt.savefig("GMM_torsions.png")
plt.close()
os.chdir(runDir)
setproctitle("GMM (all atoms, torsions) finished")
@trace_unhandled_exceptions
def gmm_wadley():
setproctitle("GMM (Pyle model)")
# Distances
df = pd.read_csv(os.path.abspath(runDir + "/results/geometry/Pyle/distances/distances_wadley.csv"))
p_c1p = list(df["C1'-P"][~ np.isnan(df["C1'-P"])])
c1p_p = list(df["P-C1'"][~ np.isnan(df["P-C1'"])])
p_c4p = list(df["C4'-P"][~ np.isnan(df["C4'-P"])])
c4p_p = list(df["P-C4'"][~ np.isnan(df["P-C4'"])])
os.makedirs(runDir + "/results/figures/GMM/Pyle/distances/", exist_ok=True)
os.chdir(runDir + "/results/figures/GMM/Pyle/distances/")
GMM_histo(p_c1p, "P-C1'")
GMM_histo(c1p_p, "C1'-P")
GMM_histo(p_c4p, "P-C4'")
GMM_histo(c4p_p, "C4'-P")
GMM_histo(p_c4p, "P-C4'", toric=False, hist=False, col='gold')
GMM_histo(c4p_p, "C4'-P", toric=False, hist=False, col='indigo')
GMM_histo(p_c1p, "P-C1'", toric=False, hist=False, col='firebrick')
GMM_histo(c1p_p, "C1'-P", toric=False, hist=False, col='seagreen')
plt.xlabel("Distance (Angströms)")
plt.title("GMM of distances (Pyle model)")
plt.savefig("GMM_distances_pyle_model.png")
plt.close()
# Flat Angles
df = pd.read_csv(os.path.abspath(runDir + "/results/geometry/Pyle/angles/flat_angles_pyle.csv"))
p_c1p_psuiv = list(df["P-C1'-P°"][~ np.isnan(df["P-C1'-P°"])])
c1p_psuiv_c1psuiv = list(df["C1'-P°-C1'°"][~ np.isnan(df["C1'-P°-C1'°"])])
os.makedirs(runDir + "/results/figures/GMM/Pyle/angles/", exist_ok=True)
os.chdir(runDir + "/results/figures/GMM/Pyle/angles/")
GMM_histo(p_c1p_psuiv, "P-C1'-P°", toric=True)
GMM_histo(c1p_psuiv_c1psuiv, "C1'-P°-C1'°", toric=True)
GMM_histo(p_c1p_psuiv, "P-C1'-P°", toric=True, hist=False, col='firebrick')
GMM_histo(c1p_psuiv_c1psuiv, "C1'-P°-C1'°", toric=True, hist=False, col='seagreen')
plt.xlabel("Angle (Degrees)")
plt.title("GMM of flat angles (Pyle model)")
plt.savefig("GMM_flat_angles_pyle_model.png")
plt.close()
# Torsion angles
eta=[]
theta=[]
eta_prime=[]
theta_prime=[]
eta_base=[]
theta_base=[]
for angles_deg in conversion_eta_theta(runDir + "/results/RNANet.db"):
eta.append(angles_deg[2])
theta.append(angles_deg[3])
eta_prime.append(angles_deg[4])
theta_prime.append(angles_deg[5])
eta_base.append(angles_deg[6])
theta_base.append(angles_deg[7])
eta=[i for i in eta if i != None]
theta=[i for i in theta if i != None]
eta_prime=[i for i in eta_prime if i != None]
theta_prime=[i for i in theta_prime if i != None]
eta_base=[i for i in eta_base if i != None]
theta_base=[i for i in theta_base if i != None]
os.makedirs(runDir + "/results/figures/GMM/Pyle/pseudotorsions/", exist_ok=True)
os.chdir(runDir + "/results/figures/GMM/Pyle/pseudotorsions/")
GMM_histo(eta, "Eta", toric=True)
GMM_histo(theta, "Theta", toric=True)
GMM_histo(eta_prime, "Eta'", toric=True)
GMM_histo(theta_prime, "Theta'", toric=True)
GMM_histo(eta_base, "Eta''", toric=True)
GMM_histo(theta_base, "Theta''", toric=True)
GMM_histo(eta, "Eta", toric=True, hist=False, col='mediumaquamarine')
GMM_histo(theta, "Theta", toric=True, hist=False, col='darkorchid')
GMM_histo(eta_prime, "Eta'", toric=True, hist=False, col='cyan')
GMM_histo(theta_prime, "Theta'", toric=True, hist=False, col='crimson')
GMM_histo(eta_base, "Eta''", toric=True, hist=False, col='royalblue')
GMM_histo(theta_base, "Theta''", toric=True, hist=False, col='palevioletred')
plt.xlabel("Angle (Degrees)")
plt.title("GMM of pseudo-torsion angles (Pyle Model)")
plt.savefig("GMM_pseudotorsion_angles_pyle_model.png")
plt.close()
os.chdir(runDir)
setproctitle("GMM (Pyle model) finished")
def gmm_pyle_type(ntpair, data):
setproctitle(f"GMM (Pyle {ntpair} )")
os.makedirs(runDir + "/results/figures/GMM/Pyle/distances/", exist_ok=True)
os.chdir(runDir + "/results/figures/GMM/Pyle/distances/")
p_p=list(data["P-P"][~ np.isnan(data["P-P"])])
p_c4p=list(data["P-C4'"][~ np.isnan(data["P-C4'"])])
p_c1p=list(data["P-C1'"][~ np.isnan(data["P-C1'"])])
c4p_c4p=list(data["C4'-C4'"][~ np.isnan(data["C4'-C4'"])])
c1p_c1p=list(data["C1'-C1'"][~ np.isnan(data["C1'-C1'"])])
print(len(p_p))
# res2=list(data["resnum2"])
# res1=list(data["resnum1"])
# diff=[]
# for i in range(len(res1)):
# diff.append(res2[i]-res1[i])
# print(diff[:100])
GMM_histo(p_p, f"Distance P-P between {ntpair} tips for {str(len(p_p))} values", toric=False, hist=False, col="cyan")
GMM_histo(p_c4p, f"Distance P-C4' between {ntpair} tips", toric=False, hist=False, col="tomato")
GMM_histo(p_c1p, f"Distance P-C1' between {ntpair} tips", toric=False, hist=False, col="goldenrod")
GMM_histo(c4p_c4p, f"Distance C4'-C4' between {ntpair} tips", toric=False, hist=False, col="magenta")
GMM_histo(c1p_c1p, f"Distance C1'-C1' between {ntpair} tips", toric=False, hist=False, col="black")
# GMM_histo(diff, f"Gap between {ntpair} tips", toric=False, hist=False, col="tomato")
plt.xlabel("Distance (Angströms)")
# plt.xlabel("Number of residues")
plt.ylabel("Distance (Angströms)")
plt.title(f"GMM of distances for {ntpair} ", fontsize=10)
# plt.savefig(f"Longueurs_Pyle_{ntpair}.png" )
plt.savefig(f"Distances_Pyle_{ntpair}.png" )
plt.close()
setproctitle(f"GMM (Pyle {ntpair} distances) finished")
def gmm_pyle():
setproctitle("GMM (Pyle model)")
df = pd.read_csv(os.path.abspath(runDir + "/results/geometry/Pyle/distances/distances.csv"))
# dist = ["P-P", "P-C4'", "P-C1'", "C4'-C4'", "C1'-C1'"]
data=df
if len(data):
for b1 in ['A','C','G','U']:
for b2 in ['A','C','G','U']:
thisbases = data[(data.res1 == b1)&(data.res2 == b2)]
if len(thisbases):
gmm_pyle_type(b1+b2, thisbases)
@trace_unhandled_exceptions
def gmm_hrna():
"""
Draw the figures representing the data on the measurements between atoms of the HiRE-RNA model
"""
setproctitle("GMM (HiRE-RNA)")
# Distances
df = pd.read_csv(os.path.abspath(runDir + "/results/geometry/HiRE-RNA/distances/dist_atoms_hire_RNA.csv"))
last_c4p_p = list(df["C4'-P"][~ np.isnan(df["C4'-P"])])
p_o5p = list(df["P-O5'"][~ np.isnan(df["P-O5'"])])
o5p_c5p = list(df["O5'-C5'"][~ np.isnan(df["O5'-C5'"])])
c5p_c4p = list(df["C5'-C4'"][~ np.isnan(df["C5'-C4'"])])
c4p_c1p = list(df["C4'-C1'"][~ np.isnan(df["C4'-C1'"])])
c1p_b1 = list(df["C1'-B1"][~ np.isnan(df["C1'-B1"])])
b1_b2 = list(df["B1-B2"][~ np.isnan(df["B1-B2"])])
os.makedirs(runDir + "/results/figures/GMM/HiRE-RNA/distances/", exist_ok=True)
os.chdir(runDir + "/results/figures/GMM/HiRE-RNA/distances/")
GMM_histo(o5p_c5p, "O5'-C5'")
GMM_histo(b1_b2, "B1-B2")
GMM_histo(c1p_b1, "C1'-B1")
GMM_histo(c5p_c4p, "C5'-C4'")
GMM_histo(c4p_c1p, "C4'-C1'")
GMM_histo(p_o5p, "P-O5'")
GMM_histo(last_c4p_p, "C4'-P")
GMM_histo(o5p_c5p, "O5'-C5'", toric=False, hist=False, col='lightcoral')
GMM_histo(b1_b2, "B1-B2", toric=False, hist=False, col='limegreen')
GMM_histo(c1p_b1, "C1'-B1", toric=False, hist=False, col='tomato')
GMM_histo(c5p_c4p, "C5'-C4'", toric=False, hist=False, col='aquamarine')
GMM_histo(c4p_c1p, "C4'-C1'", toric=False, hist=False, col='goldenrod')
GMM_histo(p_o5p, "P-O5'", toric=False, hist=False, col='darkcyan')
GMM_histo(last_c4p_p, "C4'-P", toric=False, hist=False, col='deeppink')
axes = plt.gca()
axes.set_ylim(0, 100)
plt.xlabel("Distance (Angströms)")
plt.title("GMM of distances between HiRE-RNA beads")
plt.savefig(runDir + "/results/figures/GMM/HiRE-RNA/distances/GMM_distances_HiRE_RNA.png")
plt.close()
# Angles
df = pd.read_csv(os.path.abspath(runDir + "/results/geometry/HiRE-RNA/angles/angles_hire_RNA.csv"))
lastc4p_p_o5p = list(df["C4'-P-O5'"][~ np.isnan(df["C4'-P-O5'"])])
lastc1p_lastc4p_p = list(df["C1'-C4'-P"][~ np.isnan(df["C1'-C4'-P"])])
lastc5p_lastc4p_p = list(df["C5'-C4'-P"][~ np.isnan(df["C5'-C4'-P"])])
p_o5p_c5p = list(df["P-O5'-C5'"][~ np.isnan(df["P-O5'-C5'"])])
o5p_c5p_c4p = list(df["O5'-C5'-C4'"][~ np.isnan(df["O5'-C5'-C4'"])])
c5p_c4p_c1p = list(df["C5'-C4'-C1'"][~ np.isnan(df["C5'-C4'-C1'"])])
c4p_c1p_b1 = list(df["C4'-C1'-B1"][~ np.isnan(df["C4'-C1'-B1"])])
c1p_b1_b2 = list(df["C1'-B1-B2"][~ np.isnan(df["C1'-B1-B2"])])
os.makedirs(runDir + "/results/figures/GMM/HiRE-RNA/angles/", exist_ok=True)
os.chdir(runDir + "/results/figures/GMM/HiRE-RNA/angles/")
GMM_histo(lastc4p_p_o5p, "C4'-P-O5'", toric=True)
GMM_histo(lastc1p_lastc4p_p, "C1'-C4'-P", toric=True)
GMM_histo(lastc5p_lastc4p_p, "C5'-C4'-P", toric=True)
GMM_histo(p_o5p_c5p, "P-O5'-C5'", toric=True)
GMM_histo(o5p_c5p_c4p, "O5'-C5'-C4'", toric=True)
GMM_histo(c5p_c4p_c1p, "C5'-C4'-C1'", toric=True)
GMM_histo(c4p_c1p_b1, "C4'-C1'-B1", toric=True)
GMM_histo(c1p_b1_b2, "C1'-B1-B2", toric=True)
GMM_histo(lastc4p_p_o5p, "C4'-P-O5'", toric=True, hist=False, col='lightcoral')
GMM_histo(lastc1p_lastc4p_p, "C1'-C4'-P", toric=True, hist=False, col='limegreen')
GMM_histo(lastc5p_lastc4p_p, "C5'-C4'-P", toric=True, hist=False, col='tomato')
GMM_histo(p_o5p_c5p, "P-O5'-C5'", toric=True, hist=False, col='aquamarine')
GMM_histo(o5p_c5p_c4p, "O5'-C5'-C4'", toric=True, hist=False, col='goldenrod')
GMM_histo(c5p_c4p_c1p, "C5'-C4'-C1'", toric=True, hist=False, col='darkcyan')
GMM_histo(c4p_c1p_b1, "C4'-C1'-B1", toric=True, hist=False, col='deeppink')
GMM_histo(c1p_b1_b2, "C1'-B1-B2", toric=True, hist=False, col='indigo')
axes = plt.gca()
axes.set_ylim(0, 100)
plt.xlabel("Angle (Degres)")
plt.title("GMM of angles between HiRE-RNA beads")
plt.savefig(runDir + "/results/figures/GMM/HiRE-RNA/angles/GMM_angles_HiRE_RNA.png")
plt.close()
# Torsions
df = pd.read_csv(os.path.abspath(runDir + "/results/geometry/HiRE-RNA/torsions/angles_torsion_hire_RNA.csv"))
p_o5_c5_c4 = list(df["P-O5'-C5'-C4'"][~ np.isnan(df["P-O5'-C5'-C4'"])])
o5_c5_c4_c1 = list(df["O5'-C5'-C4'-C1'"][~ np.isnan(df["O5'-C5'-C4'-C1'"])])
c5_c4_c1_b1 = list(df["C5'-C4'-C1'-B1"][~ np.isnan(df["C5'-C4'-C1'-B1"])])
c4_c1_b1_b2 = list(df["C4'-C1'-B1-B2"][~ np.isnan(df["C4'-C1'-B1-B2"])])
o5_c5_c4_psuiv = list(df["O5'-C5'-C4'-P°"][~ np.isnan(df["O5'-C5'-C4'-P°"])])
c5_c4_psuiv_o5suiv = list(df["C5'-C4'-P°-O5'°"][~ np.isnan(df["C5'-C4'-P°-O5'°"])])
c4_psuiv_o5suiv_c5suiv = list(df["C4'-P°-O5'°-C5'°"][~ np.isnan(df["C4'-P°-O5'°-C5'°"])])
c1_c4_psuiv_o5suiv = list(df["C1'-C4'-P°-O5'°"][~ np.isnan(df["C1'-C4'-P°-O5'°"])])
os.makedirs(runDir + "/results/figures/GMM/HiRE-RNA/torsions/", exist_ok=True)
os.chdir(runDir + "/results/figures/GMM/HiRE-RNA/torsions/")
GMM_histo(p_o5_c5_c4, "P-O5'-C5'-C4'", toric=True)
GMM_histo(o5_c5_c4_c1, "O5'-C5'-C4'-C1'", toric=True)
GMM_histo(c5_c4_c1_b1, "C5'-C4'-C1'-B1", toric=True)
GMM_histo(c4_c1_b1_b2, "C4'-C1'-B1-B2", toric=True)
GMM_histo(o5_c5_c4_psuiv, "O5'-C5'-C4'-P°", toric=True)
GMM_histo(c5_c4_psuiv_o5suiv, "C5'-C4'-P°-O5'°", toric=True)
GMM_histo(c4_psuiv_o5suiv_c5suiv, "C4'-P°-O5'°-C5'°", toric=True)
GMM_histo(c1_c4_psuiv_o5suiv, "C1'-C4'-P°-O5'°", toric=True)
GMM_histo(p_o5_c5_c4, "P-O5'-C5'-C4'", toric=True, hist=False, col='darkred')
GMM_histo(o5_c5_c4_c1, "O5'-C5'-C4'-C1'", toric=True, hist=False, col='chocolate')
GMM_histo(c5_c4_c1_b1, "C5'-C4'-C1'-B1", toric=True, hist=False, col='mediumvioletred')
GMM_histo(c4_c1_b1_b2, "C4'-C1'-B1-B2", toric=True, hist=False, col='cadetblue')
GMM_histo(o5_c5_c4_psuiv, "O5'-C5'-C4'-P°", toric=True, hist=False, col='darkkhaki')
GMM_histo(c5_c4_psuiv_o5suiv, "C5'-C4'-P°-O5'°", toric=True, hist=False, col='springgreen')
GMM_histo(c4_psuiv_o5suiv_c5suiv, "C4'-P°-O5'°-C5'°", toric=True, hist=False, col='indigo')
GMM_histo(c1_c4_psuiv_o5suiv, "C1'-C4'-P°-O5'°", toric=True, hist=False, col='gold')
plt.xlabel("Angle (Degrees)")
plt.title("GMM of torsion angles between HiRE-RNA beads")
plt.savefig("GMM_torsions_HiRE_RNA.png")
plt.close()
os.chdir(runDir)
setproctitle("GMM (HiRE-RNA) finished")
@trace_unhandled_exceptions
def gmm_hrna_basepair_type(type_LW, ntpair, data):
"""
function to plot the statistical figures you want
By type of pairing:
Superposition of GMMs of plane angles
Superposition of the histogram and the GMM of the distances
all in the same window
"""
setproctitle(f"GMM (HiRE-RNA {type_LW} basepairs)")
figure = plt.figure(figsize = (10, 10))
plt.gcf().subplots_adjust(left = 0.1, bottom = 0.1, right = 0.9, top = 0.9, wspace = 0, hspace = 0.5)
plt.subplot(2, 1, 1)
GMM_histo(data["211_angle"], f"{type_LW}_{ntpair}_C1'-B1-B1pair", toric=True, hist=False, col='cyan' )
GMM_histo(data["112_angle"], f"{type_LW}_{ntpair}_B1-B1pair-C1'pair", toric=True, hist=False, col='magenta')
GMM_histo(data["3211_torsion"], f"{type_LW}_{ntpair}_C4'-C1'-B1-B1pair", toric=True, hist=False, col='black' )
GMM_histo(data["1123_torsion"], f"{type_LW}_{ntpair}_B1-B1pair-C1'pair-C4'pair", toric=True, hist=False, col='maroon')
GMM_histo(data["alpha1"], f"{type_LW}_{ntpair}_alpha_1", toric=True, hist=False, col="yellow")
GMM_histo(data["alpha2"], f"{type_LW}_{ntpair}_alpha_2", toric=True, hist=False, col='olive')
plt.xlabel("Angle (degree)")
plt.title(f"GMM of plane angles for {type_LW} {ntpair} basepairs", fontsize=10)
plt.subplot(2, 1, 2)
GMM_histo(data["Distance"], f"Distance between {type_LW} {ntpair} tips", toric=False, hist=False, col="cyan")
GMM_histo(data["dB1"], f"{type_LW} {ntpair} dB1", toric=False, hist=False, col="tomato")
GMM_histo(data["dB2"], f"{type_LW} {ntpair} dB2", toric=False, hist=False, col="goldenrod")
plt.xlabel("Distance (Angströms)")
plt.title(f"GMM of distances for {type_LW} {ntpair} basepairs", fontsize=10)
plt.savefig(f"{type_LW}_{ntpair}_basepairs.png" )
plt.close()
setproctitle(f"GMM (HiRE-RNA {type_LW} {ntpair} basepairs) finished")
@trace_unhandled_exceptions
def gmm_hrna_basepairs():
setproctitle("GMM (HiRE-RNA basepairs)")
df = pd.read_csv(os.path.abspath(runDir + "/results/geometry/HiRE-RNA/basepairs/basepairs.csv"))
lw = ["cWW", "tWW", "cWH", "tWH", "cHW", "tHW", "cWS", "tWS", "cSW", "tSW", "cHH", "tHH", "cSH", "tSH", "cHS", "tHS", "cSS", "tSS"]
os.makedirs(runDir + "/results/figures/GMM/HiRE-RNA/basepairs/", exist_ok=True)
os.chdir(runDir + "/results/figures/GMM/HiRE-RNA/basepairs/")
for lw_type in lw:
data = df[df['type_LW'] == lw_type ]
if len(data):
for b1 in ['A','C','G','U']:
for b2 in ['A','C','G','U']:
thisbases = data[(data.nt1_res == b1)&(data.nt2_res == b2)]
if len(thisbases):
gmm_hrna_basepair_type(lw_type, b1+b2, thisbases)
# colors = ['lightcoral', "lightseagreen", "black", "goldenrod", "olive", "steelblue", "silver", "deeppink", "navy",
# "sienna", "maroon", "orange", "mediumaquamarine", "tomato", "indigo", "orchid", "tan", "lime"]
# for lw_type, col in zip(lw, colors):
# data = df[df['type LW'] == lw_type]
# GMM_histo(data.Distance, lw_type, toric=False, hist=False, col=col)
# plt.xlabel('Distance (Angströms)')
# plt.title("GMM of distances between base tips ("+str(nt)+ " values)", fontsize=8)
# plt.savefig("distances_between_tips.png")
# plt.close()
os.chdir(runDir)
setproctitle(f"GMM (HiRE-RNA basepairs) finished")
def merge_jsons():
# All atom distances
bonds = ["O3'-P", "OP3-P", "P-OP1", "P-OP2", "P-O5'", "O5'-C5'", "C5'-C4'", "C4'-O4'", "C4'-C3'", "O4'-C1'", "C1'-C2'", "C2'-O2'", "C2'-C3'", "C3'-O3'", "C1'-N9",
"N9-C8", "C8-N7", "N7-C5", "C5-C6", "C6-O6", "C6-N6", "C6-N1", "N1-C2", "C2-N2", "C2-N3", "N3-C4", "C4-N9", "C4-C5",
"C1'-N1", "N1-C6", "C6-C5", "C5-C4", "C4-N3", "N3-C2", "C2-O2", "C2-N1", "C4-N4", "C4-O4"]
bonds = [ runDir + "/results/geometry/json/" + x + ".json" for x in bonds ]
concat_jsons(bonds, runDir + "/results/geometry/json/all_atom_distances.json")
# All atom torsions
torsions = ["Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Xhi", "Zeta"]
torsions = [ runDir + "/results/geometry/json/" + x + ".json" for x in torsions ]
concat_jsons(torsions, runDir + "/results/geometry/json/all_atom_torsions.json")
# HiRE-RNA distances
hrnabonds = ["P-O5'", "O5'-C5'", "C5'-C4'", "C4'-C1'", "C1'-B1", "B1-B2", "C4'-P"]
hrnabonds = [ runDir + "/results/geometry/json/" + x + ".json" for x in hrnabonds ]
concat_jsons(hrnabonds, runDir + "/results/geometry/json/hirerna_distances.json")
# HiRE-RNA angles
hrnaangles = ["P-O5'-C5'", "O5'-C5'-C4'", "C5'-C4'-C1'", "C4'-C1'-B1", "C1'-B1-B2", "C4'-P-O5'", "C5'-C4'-P", "C1'-C4'-P"]
hrnaangles = [ runDir + "/results/geometry/json/" + x + ".json" for x in hrnaangles ]
concat_jsons(hrnaangles, runDir + "/results/geometry/json/hirerna_angles.json")
# HiRE-RNA torsions
hrnators = ["P-O5'-C5'-C4'", "O5'-C5'-C4'-C1'", "C5'-C4'-C1'-B1", "C4'-C1'-B1-B2", "C4'-P°-O5'°-C5'°", "C5'-C4'-P°-O5'°", "C1'-C4'-P°-O5'°", "O5'-C5'-C4'-P°"]
hrnators = [ runDir + "/results/geometry/json/" + x + ".json" for x in hrnators ]
concat_jsons(hrnators, runDir + "/results/geometry/json/hirerna_torsions.json")
# HiRE-RNA basepairs
for nt1 in ['A', 'C', 'G', 'U']:
for nt2 in ['A', 'C', 'G', 'U']:
bps = glob.glob(runDir + f"/results/geometry/json/*{nt1}{nt2}*.json")
concat_jsons(bps, runDir + f"/results/geometry/json/hirerna_{nt1}{nt2}_basepairs.json")
# Delete previous files
for f in bonds + torsions + hrnabonds + hrnaangles + hrnators:
try:
os.remove(f)
except FileNotFoundError:
pass
for f in glob.glob(runDir + "/results/geometry/json/t*.json"):
try:
os.remove(f)
except FileNotFoundError:
pass
for f in glob.glob(runDir + "/results/geometry/json/c*.json"):
try:
os.remove(f)
except FileNotFoundError:
pass
for f in glob.glob(runDir + "/results/geometry/json/Distance*.json"):
try:
os.remove(f)
except FileNotFoundError:
pass
@trace_unhandled_exceptions
def loop(f):
return pd.read_csv(f)
@trace_unhandled_exceptions
def concat_dataframes(fpath, outfilename):
"""
Concatenates the dataframes containing measures
and creates a new dataframe gathering all
"""
global idxQueue
thr_idx = idxQueue.get()
setproctitle(f"Worker {thr_idx+1} : Concatenation of {fpath}")
liste = os.listdir(fpath)
pbar = tqdm(total=len(liste), position=thr_idx, desc="Preparing "+outfilename, leave=False)
df_tot = pd.read_csv(os.path.abspath(fpath + liste.pop()), engine="python")
#df=Parallel(n_jobs=-1, verbose=20)(delayed(loop)(os.path.abspath(fpath+liste[f])) for f in range (len(liste)))
#except :
# print(liste[f])
pbar.update(1)
for f in range(len(liste)):
# try :
df = pd.read_csv(os.path.abspath(fpath + liste.pop()), engine='python')
# except :
# print(liste[f])
# continue
df_tot = pd.concat([df_tot, df], ignore_index=True)
def log_to_pbar(pbar):
def update(r):
pbar.update(1)
#df = pd.concat(df, ignore_index=True)
#pbar.update(1)
#df.to_csv(fpath + outfilename)
df_tot.to_csv(fpath + outfilename)
idxQueue.put(thr_idx) # replace the thread index in the queue
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
def concat_jsons(flist, outfilename):
"""
Reads JSON files computed by the geometry jobs and merge them into a smaller
number of files
"""
result = []
for f in flist:
# if not path.isfile(f):
# continue:
with open(f, "rb") as infile:
result.append(json.load(infile))
# write the files
with open(outfilename, 'w', encoding='utf-8') as f:
json.dump(result, f, indent=4)
return update
def process_jobs(joblist):
"""
......@@ -2952,13 +1229,16 @@ if __name__ == "__main__":
DO_WADLEY_ANALYSIS = False
DO_AVG_DISTANCE_MATRIX = False
DO_HIRE_RNA_MEASURES = False
RESCAN_GMM_COMP_NUM = False
try:
opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "from-scratch", "wadley", "distance-matrices", "resolution=", "3d-folder=", "seq-folder=", "hire-rna" ])
opts, _ = getopt.getopt( sys.argv[1:], "r:h",
[ "help", "from-scratch", "wadley", "distance-matrices", "resolution=",
"3d-folder=", "seq-folder=", "hire-rna", "rescan-nmodes" ])
except getopt.GetoptError as err:
print(err)
sys.exit(2)
for opt, arg in opts:
for opt, arg in opts:
if opt == "-h" or opt == "--help":
print( "RNANet statistics, a script to build a multiscale RNA dataset from public data\n"
"Developed by Louis Becquey, Khodor Hannoush, and Aglaé Tabot 2019/2021")
......@@ -2975,28 +1255,28 @@ if __name__ == "__main__":
print("--distance-matrices\t\tCompute average distance between nucleotide pairs for each family.")
print("--wadley\t\t\tReproduce Wadley & al 2007 clustering of pseudotorsions.")
print("--hire-rna\t\t\tCompute distances between atoms and torsion angles for HiRE-RNA model, and plot GMMs on the data.")
print("--rescan-nmodes\t\tDo not assume the number of modes in distances and angles distributions, measure it.")
sys.exit()
elif opt == '--version':
elif opt == "--version":
print("RNANet statistics 1.6 beta")
sys.exit()
elif opt == "-r" or opt == "--resolution":
assert float(arg) > 0.0 and float(arg) <= 20.0
res_thr = float(arg)
elif opt=='--3d-folder':
elif opt == "--3d-folder":
path_to_3D_data = path.abspath(arg)
if path_to_3D_data[-1] != '/':
path_to_3D_data += '/'
elif opt=='--seq-folder':
elif opt == "--seq-folder":
path_to_seq_data = path.abspath(arg)
if path_to_seq_data[-1] != '/':
path_to_seq_data += '/'
elif opt=='--from-scratch':
elif opt == "--from-scratch":
DELETE_OLD_DATA = True
DO_WADLEY_ANALYSIS = True
elif opt=="--distance-matrices":
elif opt == "--distance-matrices":
DO_AVG_DISTANCE_MATRIX = True
elif opt=='--wadley':
elif opt == "--wadley":
DO_WADLEY_ANALYSIS = True
os.makedirs(runDir+"/results/geometry/Pyle/distances/", exist_ok=True)
os.makedirs(runDir+"/results/geometry/Pyle/classes_dist/", exist_ok=True)
......@@ -3005,7 +1285,7 @@ if __name__ == "__main__":
os.makedirs(runDir+"/results/figures/GMM/Pyle/distances/", exist_ok=True)
os.makedirs(runDir+"/results/figures/GMM/Pyle/angles/", exist_ok=True)
os.makedirs(runDir+"/results/figures/GMM/Pyle/pseudotorsions/", exist_ok=True)
elif opt=='--hire-rna':
elif opt == "--hire-rna":
DO_HIRE_RNA_MEASURES = True
os.makedirs(runDir + "/results/geometry/HiRE-RNA/distances/", exist_ok=True)
os.makedirs(runDir + "/results/geometry/HiRE-RNA/angles/", exist_ok=True)
......@@ -3015,7 +1295,8 @@ if __name__ == "__main__":
os.makedirs(runDir + "/results/figures/GMM/HiRE-RNA/angles/", exist_ok=True)
os.makedirs(runDir + "/results/figures/GMM/HiRE-RNA/torsions/", exist_ok=True)
os.makedirs(runDir + "/results/figures/GMM/HiRE-RNA/basepairs/", exist_ok=True)
elif opt == "rescan-nmodes":
RESCAN_GMM_COMP_NUM = True
# Load mappings. famlist will contain only families with structures at this resolution threshold.
......@@ -3053,7 +1334,8 @@ if __name__ == "__main__":
print("Old data deleted.")
# Prepare the multiprocessing execution environment
nworkers = min(read_cpu_number()-1, 50)
global nworkers
nworkers = read_cpu_number()-1
print("Using", nworkers, "threads...")
thr_idx_mgr = Manager()
idxQueue = thr_idx_mgr.Queue()
......@@ -3063,26 +1345,25 @@ if __name__ == "__main__":
# Define the tasks
joblist = []
# Do eta/theta plots
#if n_unmapped_chains and DO_WADLEY_ANALYSIS:
# # Do eta/theta plots
# if n_unmapped_chains and DO_WADLEY_ANALYSIS:
# joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr)))
# joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr)))
# Do distance matrices for each family excl. LSU/SSU (will be processed later)
if DO_AVG_DISTANCE_MATRIX:
extracted_chains = []
for file in os.listdir(path_to_3D_data + "rna_mapped_to_Rfam"):
if os.path.isfile(os.path.join(path_to_3D_data + "rna_mapped_to_Rfam", file)):
e1 = file.split('_')[0]
e2 = file.split('_')[1]
e3 = file.split('_')[2]
extracted_chains.append(e1 + '[' + e2 + ']' + '-' + e3)
for f in [ x for x in famlist if (x not in LSU_set and x not in SSU_set) ]: # Process the rRNAs later only 3 by 3
joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, True, False)))
joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, False, False)))
# Do general family statistics
# # Do distance matrices for each family excl. LSU/SSU (will be processed later)
# if DO_AVG_DISTANCE_MATRIX:
# extracted_chains = []
# for file in os.listdir(path_to_3D_data + "rna_mapped_to_Rfam"):
# if os.path.isfile(os.path.join(path_to_3D_data + "rna_mapped_to_Rfam", file)):
# e1 = file.split('_')[0]
# e2 = file.split('_')[1]
# e3 = file.split('_')[2]
# extracted_chains.append(e1 + '[' + e2 + ']' + '-' + e3)
# for f in [ x for x in famlist if (x not in LSU_set and x not in SSU_set) ]: # Process the rRNAs later only 3 by 3
# joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, True, False)))
# joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, False, False)))
# # Do general family statistics
# joblist.append(Job(function=stats_len)) # Computes figures about chain lengths
# joblist.append(Job(function=stats_freq)) # updates the database (nucleotide frequencies in families)
# for f in famlist:
......@@ -3091,25 +1372,18 @@ if __name__ == "__main__":
# joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database (identity matrices of families)
# Do geometric measures on all chains
#print(liste_repres('/home/data/RNA/3D/latest_nr_list_4.0A.csv'))
# print(measure_from_structure(os.listdir(path_to_3D_data + "rna_only")[0]))
# Do geometric measures
if n_unmapped_chains:
# os.makedirs(runDir+"/results/geometry/all-atoms/distances/", exist_ok=True)
# liste_struct = os.listdir(path_to_3D_data + "rna_only")
liste_struct=liste_repres('/home/data/RNA/3D/latest_nr_list_4.0A.csv')
# if '4zdo_1_E.cif' in liste_struct:
# liste_struct.remove('4zdo_1_E.cif') # weird cases to remove for now
# if '4zdp_1_E.cif' in liste_struct:
# liste_struct.remove('4zdp_1_E.cif')
for f in liste_struct:
os.makedirs(runDir + "/results/geometry/all-atoms/distances/", exist_ok=True)
# structure_list = os.listdir(path_to_3D_data + "rna_only")
structure_list = representatives_from_nrlist(res_thr)
for f in structure_list:
if path.isfile(path_to_3D_data + "datapoints/" + f.split('.')[0]):
joblist.append(Job(function=measure_from_structure, args=(f,), how_many_in_parallel=nworkers)) # All-atom distances
process_jobs(joblist)
#count_occur_pyle_dist(runDir + '/results/geometry/Pyle/classes_dist/')
# process_jobs(joblist)
# Now process the memory-heavy tasks family by family
if DO_AVG_DISTANCE_MATRIX:
for f in LSU_set:
......@@ -3124,36 +1398,31 @@ if __name__ == "__main__":
# finish the work after the parallel portions
# per_chain_stats() # per chain base frequencies en basepair types
# per_chain_stats() # per chain base frequencies and basepair types
# seq_idty() # identity matrices from pre-computed .npy matrices
# stats_pairs()
concat_dataframes(runDir + '/results/geometry/Pyle/distances/', 'distances.csv')
if n_unmapped_chains:
# general_stats()
os.makedirs(runDir+"/results/figures/GMM/", exist_ok=True)
os.makedirs(runDir+"/results/geometry/json/", exist_ok=True)
concat_dataframes(runDir + '/results/geometry/all-atoms/distances/', 'dist_atoms.csv')
if DO_HIRE_RNA_MEASURES:
concat_dataframes(runDir + '/results/geometry/HiRE-RNA/distances/', 'distances_HiRERNA.csv')
concat_dataframes(runDir + '/results/geometry/HiRE-RNA/angles/', 'angles_HiRERNA.csv')
concat_dataframes(runDir + '/results/geometry/HiRE-RNA/torsions/', 'torsions_HiRERNA.csv')
concat_dataframes(runDir + '/results/geometry/HiRE-RNA/basepairs/', 'basepairs_HiRERNA.csv')
if DO_WADLEY_ANALYSIS:
concat_dataframes(runDir + '/results/geometry/Pyle/distances/', 'distances_pyle.csv')
concat_dataframes(runDir + '/results/geometry/Pyle/angles/', 'flat_angles_pyle.csv')
joblist = []
# joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/Pyle/distances/', 'distances.csv')))
# joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/all-atoms/distances/', 'dist_atoms.csv')))
# if DO_HIRE_RNA_MEASURES:
# joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/HiRE-RNA/distances/', 'dist_atoms_hire_RNA.csv')))
# joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/HiRE-RNA/angles/', 'angles_hire_RNA.csv')))
# joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/HiRE-RNA/torsions/', 'angles_torsion_hire_RNA.csv')))
# joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/HiRE-RNA/basepairs/', 'basepairs.csv')))
# if DO_WADLEY_ANALYSIS:
# joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/Pyle/distances/', 'distances_wadley.csv')))
# joblist.append(Job(function=concat_dataframes, args=(runDir + '/results/geometry/Pyle/angles/', 'flat_angles_pyle.csv')))
# process_jobs(joblist)
joblist = []
# joblist.append(Job(function=gmm_aa_dists, args=()))
# joblist.append(Job(function=gmm_aa_torsions, args=()))
joblist.append(Job(function=gmm_aa_dists, args=(RESCAN_GMM_COMP_NUM)))
joblist.append(Job(function=gmm_aa_torsions, args=(RESCAN_GMM_COMP_NUM)))
if DO_HIRE_RNA_MEASURES:
# joblist.append(Job(function=gmm_hrna, args=()))
joblist.append(Job(function=gmm_hrna_basepairs, args=()))
# if DO_WADLEY_ANALYSIS:
# joblist.append(Job(function=gmm_wadley, args=()))
# joblist.append(Job(function=gmm_pyle, args=()))
joblist.append(Job(function=gmm_hrna, args=(RESCAN_GMM_COMP_NUM)))
joblist.append(Job(function=gmm_hrna_basepairs, args=(RESCAN_GMM_COMP_NUM)))
if DO_WADLEY_ANALYSIS:
joblist.append(Job(function=gmm_pyle, args=(RESCAN_GMM_COMP_NUM)))
if len(joblist):
process_jobs(joblist)
#merge_jsons()
merge_jsons()
......