Louis BECQUEY

Revision 1 for Bioinformatics completed

......@@ -13,3 +13,4 @@ esl*
# environment stuff
.vscode/
*.pyc
__pycache__/
\ No newline at end of file
......
......@@ -94,6 +94,8 @@ The detailed list of options is below:
-h [ --help ] Print this help message
--version Print the program version
-f [ --full-inference ] Infer new 3D->family mappings even if Rfam already provides some. Yields more copies of chains
mapped to different families.
-r 4.0 [ --resolution=4.0 ] Maximum 3D structure resolution to consider an RNA chain.
-s Run statistics computations after completion
--extract Extract the portions of 3D RNA chains to individual mmCIF files.
......@@ -105,7 +107,7 @@ The detailed list of options is below:
RNAcifs/ Full structures containing RNA, in mmCIF format
rna_mapped_to_Rfam/ Extracted 'pure' RNA chains
datapoints/ Final results in CSV file format.
--seq-folder=… Path to a folder to store the sequence and alignment files.
--seq-folder=… Path to a folder to store the sequence and alignment files. Subfolders will be:
rfam_sequences/fasta/ Compressed hits to Rfam families
realigned/ Sequences, covariance models, and alignments by family
--no-homology Do not try to compute PSSMs and do not align sequences.
......@@ -117,11 +119,12 @@ The detailed list of options is below:
--update-homologous Re-download Rfam and SILVA databases, realign all families, and recompute all CSV files
--from-scratch Delete database, local 3D and sequence files, and known issues, and recompute.
--archive Create a tar.gz archive of the datapoints text files, and update the link to the latest archive
--no-logs Do not save per-chain logs of the numbering modifications
```
Typical usage:
```
nohup bash -c 'time ~/Projects/RNANet/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --archive' &
nohup bash -c 'time ~/Projects/RNANet/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &
```
## Post-computation task: estimate quality
......
#!/usr/bin/python3.8
import Bio
import concurrent.futures
import getopt
import gzip
import io
import json
import numpy as np
import os
import pandas as pd
import concurrent.futures, getopt, gzip, io, json, os, pickle, psutil, re, requests, signal, sqlalchemy, sqlite3, subprocess, sys, time, traceback, warnings
from Bio import AlignIO, SeqIO
from Bio.PDB import MMCIFParser
from Bio.PDB.mmcifio import MMCIFIO
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
from Bio.PDB.PDBExceptions import PDBConstructionWarning, BiopythonWarning
from Bio.PDB.Dice import ChainSelector
from Bio.Alphabet import generic_rna
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment, AlignInfo
from collections import OrderedDict, defaultdict
import pickle
import psutil
import re
import requests
import signal
import sqlalchemy
import sqlite3
import subprocess
import sys
import time
import traceback
import warnings
from functools import partial, wraps
from os import path, makedirs
from multiprocessing import Pool, Manager, set_start_method
from multiprocessing import Pool, Manager
from time import sleep
from tqdm import tqdm
from setproctitle import setproctitle
def trace_unhandled_exceptions(func):
@wraps(func)
def wrapped_func(*args, **kwargs):
......@@ -36,10 +43,11 @@ def trace_unhandled_exceptions(func):
print(s)
return wrapped_func
pd.set_option('display.max_rows', None)
sqlite3.enable_callback_tracebacks(True)
sqlite3.register_adapter(np.int64, lambda val: int(val)) # Tell Sqlite what to do with <class numpy.int64> objects ---> convert to int
sqlite3.register_adapter(np.float64, lambda val: float(val)) # Tell Sqlite what to do with <class numpy.int64> objects ---> convert to int
sqlite3.register_adapter(np.float64, lambda val: float(val)) # Tell Sqlite what to do with <class numpy.float64> objects ---> convert to float
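# Note: numpy scalar types coming out of pandas (e.g. numpy.int64) may not be accepted as SQL
# parameters and can raise sqlite3.InterfaceError; these adapters convert them to plain Python
# int/float before binding.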
m = Manager()
running_stats = m.list()
......@@ -52,11 +60,14 @@ validsymb = '\U00002705'
warnsymb = '\U000026A0'
errsymb = '\U0000274C'
LSU_set = {"RF00002", "RF02540", "RF02541", "RF02543", "RF02546"} # From Rfam CLAN 00112
SSU_set = {"RF00177", "RF02542", "RF02545", "RF01959", "RF01960"} # From Rfam CLAN 00111
LSU_set = {"RF00002", "RF02540", "RF02541",
"RF02543", "RF02546"} # From Rfam CLAN 00112
SSU_set = {"RF00177", "RF02542", "RF02545",
"RF01959", "RF01960"} # From Rfam CLAN 00111
no_nts_set = set()
weird_mappings = set()
class SelectivePortionSelector(object):
"""Class passed to MMCIFIO to select some chain portions in an MMCIF file.
......@@ -101,7 +112,7 @@ class SelectivePortionSelector(object):
return 1
class BufferingSummaryInfo(AlignInfo.SummaryInfo):
class BufferingSummaryInfo(Bio.Align.AlignInfo.SummaryInfo):
def get_pssm(self, family, index):
"""Create a position specific score matrix object for the alignment.
......@@ -128,7 +139,7 @@ class BufferingSummaryInfo(AlignInfo.SummaryInfo):
score_dict[this_residue] = 1.0
pssm_info.append(('*', score_dict))
return AlignInfo.PSSM(pssm_info)
return Bio.Align.AlignInfo.PSSM(pssm_info)
class Chain:
......@@ -187,11 +198,11 @@ class Chain:
with warnings.catch_warnings():
# Ignore the PDB problems. This mostly warns that some chain is discontinuous.
warnings.simplefilter('ignore', PDBConstructionWarning)
warnings.simplefilter('ignore', BiopythonWarning)
warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.PDBConstructionWarning)
warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning)
# Load the whole mmCIF into a Biopython structure object:
mmcif_parser = MMCIFParser()
mmcif_parser = Bio.PDB.MMCIFParser()
try:
s = mmcif_parser.get_structure(self.pdb_id, path_to_3D_data + "RNAcifs/"+self.pdb_id+".cif")
except ValueError as e:
......@@ -212,7 +223,7 @@ class Chain:
sel = SelectivePortionSelector(model_idx, self.pdb_chain_id, valid_set, khetatm)
# Save that selection on the mmCIF object s to file
ioobj = MMCIFIO()
ioobj = Bio.PDB.mmcifio.MMCIFIO()
ioobj.set_structure(s)
ioobj.save(self.file, sel)
......@@ -253,7 +264,7 @@ class Chain:
# Create the Pandas DataFrame for the nucleotides of the right chain
nts = json_object["nts"] # sub-json-object
df = pd.DataFrame(nts) # conversion to dataframe
df = df[ df.chain_name == self.pdb_chain_id ] # keeping only this chain's nucleotides
df = df[df.chain_name == self.pdb_chain_id] # keeping only this chain's nucleotides
# Assert nucleotides of the chain are found
if df.empty:
......@@ -266,12 +277,12 @@ class Chain:
# Remove low-pertinence or undocumented descriptors, convert angle values
cols_we_keep = ["index_chain", "nt_resnum", "nt_name", "nt_code", "nt_id", "dbn", "alpha", "beta", "gamma", "delta", "epsilon", "zeta",
"epsilon_zeta", "bb_type", "chi", "glyco_bond", "form", "ssZp", "Dp", "eta", "theta", "eta_prime", "theta_prime", "eta_base", "theta_base",
"v0", "v1", "v2", "v3", "v4", "amplitude", "phase_angle", "puckering" ]
"v0", "v1", "v2", "v3", "v4", "amplitude", "phase_angle", "puckering"]
df = df[cols_we_keep]
df.loc[:,['alpha', 'beta','gamma','delta','epsilon','zeta','epsilon_zeta','chi','v0', 'v1', 'v2', 'v3', 'v4', # Conversion to radians
'eta','theta','eta_prime','theta_prime','eta_base','theta_base', 'phase_angle']] *= np.pi/180.0
df.loc[:,['alpha', 'beta','gamma','delta','epsilon','zeta','epsilon_zeta','chi','v0', 'v1', 'v2', 'v3', 'v4', # mapping [-pi, pi] into [0, 2pi]
'eta','theta','eta_prime','theta_prime','eta_base','theta_base', 'phase_angle']] %= (2.0*np.pi)
df.loc[:, ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'epsilon_zeta', 'chi', 'v0', 'v1', 'v2', 'v3', 'v4', # Conversion to radians
'eta', 'theta', 'eta_prime', 'theta_prime', 'eta_base', 'theta_base', 'phase_angle']] *= np.pi/180.0
df.loc[:, ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'epsilon_zeta', 'chi', 'v0', 'v1', 'v2', 'v3', 'v4', # mapping [-pi, pi] into [0, 2pi]
'eta', 'theta', 'eta_prime', 'theta_prime', 'eta_base', 'theta_base', 'phase_angle']] %= (2.0*np.pi)
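# Worked example: an angle of -60 degrees becomes -pi/3 after the conversion to radians,
# then 5*pi/3 after the modulo, so all angles end up in [0, 2*pi) instead of [-pi, pi].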
except KeyError as e:
warn(f"Error while parsing DSSR {self.pdb_id}.json output:{e}", error=True)
......@@ -295,14 +306,14 @@ class Chain:
# Duplicate residue numbers : shift numbering
while True in df.duplicated(['nt_resnum']).values:
i = df.duplicated(['nt_resnum']).values.tolist().index(True)
duplicates = df[df.nt_resnum == df.iloc[i,1]]
duplicates = df[df.nt_resnum == df.iloc[i, 1]]
n_dup = len(duplicates.nt_resnum)
index_last_dup = duplicates.index_chain.iloc[-1] - 1
if self.mapping is not None:
self.mapping.log(f"Shifting nt_resnum numbering because of {n_dup} duplicate residues {df.iloc[i,1]}")
try:
if i > 0 and index_last_dup +1 < len(df.index) and df.iloc[i,1] == df.iloc[i-1,1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]:
if i > 0 and index_last_dup + 1 < len(df.index) and df.iloc[i, 1] == df.iloc[i-1, 1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]:
# The redundant nts are consecutive in the chain (at the beginning at least), and there is a gap at the end
if duplicates.iloc[n_dup-1, 0] - duplicates.iloc[0, 0] + 1 == n_dup:
......@@ -314,15 +325,15 @@ class Chain:
else:
# We solve the problem continuous component by continuous component
for j in range(1, n_dup+1):
if duplicates.iloc[j,0] == 1 + duplicates.iloc[j-1,0]: # continuous
df.iloc[i+j-1,1] += 1
if duplicates.iloc[j, 0] == 1 + duplicates.iloc[j-1, 0]: # continuous
df.iloc[i+j-1, 1] += 1
else:
break
elif df.iloc[i,1] == df.iloc[i-1,1]:
elif df.iloc[i, 1] == df.iloc[i-1, 1]:
# Common 4v9q-DV case (and similar ones): e.g. the chain contains 17 and 17A, which are both read as 17 by DSSR.
# Solution: we shift the numbering of 17A (to 18) and of the following residues.
df.iloc[i:, 1] += 1
elif duplicates.iloc[0,0] == 1 and df.iloc[i,0] == 3:
elif duplicates.iloc[0, 0] == 1 and df.iloc[i, 0] == 3:
# 4wzo_1_1J case, there is a residue numbered -1 and read as 1 before the number 0.
df.iloc[1:, 1] += 1
df.iloc[0, 1] = 0
......@@ -340,12 +351,16 @@ class Chain:
# Search for ligands at the end of the selection
# Drop ligands detected as residues by DSSR, using several markers
while ( len(df.index_chain) and df.iloc[-1,2] not in ["A", "C", "G", "U"] and (
(df.iloc[[-1]][["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "v0", "v1", "v2", "v3", "v4"]].isna().values).all()
or (df.iloc[[-1]].puckering=='').any()
while (
len(df.index_chain) and df.iloc[-1, 2] not in ["A", "C", "G", "U"]
and (
(df.iloc[[-1]][["alpha", "beta", "gamma", "delta", "epsilon",
"zeta", "v0", "v1", "v2", "v3", "v4"]].isna().values).all()
or (df.iloc[[-1]].puckering == '').any()
)
or ( len(df.index_chain) >= 2 and df.iloc[-1,1] > 50 + df.iloc[-2,1] ) # large nt_resnum gap between the two last residues
or ( len(df.index_chain) and df.iloc[-1,2] in ["GNG", "E2C", "OHX", "IRI", "MPD", "8UZ"] )
# large nt_resnum gap between the last two residues
or (len(df.index_chain) >= 2 and df.iloc[-1, 1] > 50 + df.iloc[-2, 1])
or (len(df.index_chain) and df.iloc[-1, 2] in ["GNG", "E2C", "OHX", "IRI", "MPD", "8UZ"])
):
if self.mapping is not None:
self.mapping.log("Droping ligand:")
......@@ -390,17 +405,19 @@ class Chain:
break
if found:
self.mapping.log(f"Residue {i+1+self.mapping.st}-{self.mapping.st} = {i+1} has been saved and renumbered {df.iloc[i,1]} instead of {found['nt_id'].replace(found['chain_name']+ '.' + found['nt_name'], '').replace('^','')}")
df_row = pd.DataFrame([found], index=[i])[df.columns.values]
df_row.iloc[0,0] = i+1 # index_chain
df_row.iloc[0,1] = df.iloc[i,1] # nt_resnum
df = pd.concat([ df.iloc[:i], df_row, df.iloc[i:] ])
df_row = pd.DataFrame([found], index=[i])[
df.columns.values]
df_row.iloc[0, 0] = i+1 # index_chain
df_row.iloc[0, 1] = df.iloc[i, 1] # nt_resnum
df = pd.concat([df.iloc[:i], df_row, df.iloc[i:]])
df.iloc[i+1:, 1] += 1
else:
warn(f"Missing index_chain {i} in {self.chain_label} !")
# Assert some nucleotides still exist
try:
l = df.iloc[-1,1] - df.iloc[0,1] + 1 # update length of chain from nt_resnum point of view
# update length of chain from nt_resnum point of view
l = df.iloc[-1, 1] - df.iloc[0, 1] + 1
except IndexError:
warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} between {self.mapping.nt_start} and "
f"{self.mapping.nt_end} ({'not ' if not self.mapping.inferred else ''}inferred). Ignoring chain {self.chain_label}.")
......@@ -426,14 +443,17 @@ class Chain:
# index_chain 1 |-------------|77 83|------------| 154
# expected data point 1 |--------------------------------| 154
#
if l != len(df['index_chain']): # if some residues are missing, len(df['index_chain']) < l
resnum_start = df.iloc[0,1]
diff = set(range(l)).difference(df['nt_resnum'] - resnum_start) # the rowIDs the missing nucleotides would have (rowID = index_chain - 1 = nt_resnum - resnum_start)
resnum_start = df.iloc[0, 1]
# the rowIDs the missing nucleotides would have (rowID = index_chain - 1 = nt_resnum - resnum_start)
diff = set(range(l)).difference(df['nt_resnum'] - resnum_start)
for i in sorted(diff):
# Add a row at position i
df = pd.concat([ df.iloc[:i],
pd.DataFrame({"index_chain": i+1, "nt_resnum": i+resnum_start, "nt_id":"not resolved", "nt_code":'-', "nt_name":'-'}, index=[i]),
df.iloc[i:] ])
df = pd.concat([df.iloc[:i],
pd.DataFrame({"index_chain": i+1, "nt_resnum": i+resnum_start,
"nt_id": "not resolved", "nt_code": '-', "nt_name": '-'}, index=[i]),
df.iloc[i:]])
# Increase the index_chain of all following lines
df.iloc[i+1:, 0] += 1
df = df.reset_index(drop=True)
......@@ -444,27 +464,27 @@ class Chain:
#######################################
# Add a sequence column just for the alignments
df['nt_align_code'] = [ str(x).upper()
df['nt_align_code'] = [str(x).upper()
.replace('NAN', '-') # Unresolved nucleotides are gaps
.replace('?', '-') # Unidentified residues, let's delete them
.replace('T', 'U') # 5MU are modified to t, which gives T
.replace('P', 'U') # Pseudo-uridines, but it is not really right to change them to U, see DSSR paper, Fig 2
for x in df['nt_code'] ]
for x in df['nt_code']]
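# e.g. an unresolved residue (NaN nt_code) becomes '-', a 5MU read as 't' becomes 'U',
# and standard residues are simply uppercased.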
# One-hot encoding sequence
df["is_A"] = [ 1 if x=="A" else 0 for x in df["nt_code"] ]
df["is_C"] = [ 1 if x=="C" else 0 for x in df["nt_code"] ]
df["is_G"] = [ 1 if x=="G" else 0 for x in df["nt_code"] ]
df["is_U"] = [ 1 if x=="U" else 0 for x in df["nt_code"] ]
df["is_other"] = [ 0 if x in "ACGU" else 1 for x in df["nt_code"] ]
df["is_A"] = [1 if x == "A" else 0 for x in df["nt_code"]]
df["is_C"] = [1 if x == "C" else 0 for x in df["nt_code"]]
df["is_G"] = [1 if x == "G" else 0 for x in df["nt_code"]]
df["is_U"] = [1 if x == "U" else 0 for x in df["nt_code"]]
df["is_other"] = [0 if x in "ACGU" else 1 for x in df["nt_code"]]
df["nt_position"] = [ float(i+1)/self.full_length for i in range(self.full_length) ]
# Iterate over pairs to identify base-base interactions
res_ids = list(df['nt_id']) # things like "chainID.C4, chainID.U5"
paired = [ '' ] * self.full_length
pair_type_LW = [ '' ] * self.full_length
pair_type_DSSR = [ '' ] * self.full_length
interacts = [ 0 ] * self.full_length
paired = [''] * self.full_length
pair_type_LW = [''] * self.full_length
pair_type_DSSR = [''] * self.full_length
interacts = [0] * self.full_length
if "pairs" in json_object.keys():
pairs = json_object["pairs"]
for p in pairs:
......@@ -506,17 +526,19 @@ class Chain:
paired[nt2_idx] += ',' + str(nt1_idx + 1)
# transform nt_id to shorter values
df['old_nt_resnum'] = [ n.replace(self.pdb_chain_id+'.'+name, '').replace('^','').replace('/','') for n, name in zip(df.nt_id, df.nt_name) ]
df['old_nt_resnum'] = [ n.replace(self.pdb_chain_id+'.'+name, '').replace('^', '').replace('/', '') for n, name in zip(df.nt_id, df.nt_name) ]
df['paired'] = paired
df['pair_type_LW'] = pair_type_LW
df['pair_type_DSSR'] = pair_type_DSSR
df['nb_interact'] = interacts
df = df.drop(['nt_id', 'nt_resnum'], axis=1) # remove now useless descriptors
# remove now useless descriptors
df = df.drop(['nt_id', 'nt_resnum'], axis=1)
self.seq = "".join(df.nt_code)
self.seq_to_align = "".join(df.nt_align_code)
self.length = len([ x for x in self.seq_to_align if x != "-" ])
self.length = len([x for x in self.seq_to_align if x != "-"])
# Remove too short chains
if self.length < 5:
......@@ -559,7 +581,8 @@ class Chain:
WHERE structure_id='{self.pdb_id}'
AND chain_name='{self.pdb_chain_id}'
AND rfam_acc='{self.mapping.rfam_acc}'
AND eq_class='{self.eq_class}';""")[0][0]
AND eq_class='{self.eq_class}';"""
)[0][0]
else:
sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, eq_class, issue) VALUES (?, ?, 'unmappd', ?, ?)
ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue, eq_class=excluded.eq_class;""",
......@@ -568,19 +591,18 @@ class Chain:
WHERE structure_id='{self.pdb_id}'
AND chain_name='{self.pdb_chain_id}'
AND eq_class='{self.eq_class}'
AND rfam_acc = 'unmappd';""")[0][0]
AND rfam_acc = 'unmappd';"""
)[0][0]
# Add the nucleotides if the chain is not an issue
if df is not None and not self.delete_me: # double condition is theoretically redundant here, but you never know
sql_execute(conn, f"""
INSERT OR IGNORE INTO nucleotide
sql_execute(conn, f"""INSERT OR IGNORE INTO nucleotide
(chain_id, index_chain, nt_name, nt_code, dbn, alpha, beta, gamma, delta, epsilon, zeta,
epsilon_zeta, bb_type, chi, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base,
v0, v1, v2, v3, v4, amplitude, phase_angle, puckering, nt_align_code, is_A, is_C, is_G, is_U, is_other, nt_position,
old_nt_resnum, paired, pair_type_LW, pair_type_DSSR, nb_interact)
VALUES ({self.db_chain_id}, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""",
?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""",
many=True, data=list(df.to_records(index=False)), warn_every=10)
def remap(self, columns_to_save, s_seq):
......@@ -598,40 +620,39 @@ class Chain:
# Save colums in the appropriate positions
i = 0
j = 0
while i<self.full_length and j<alilen:
while i < self.full_length and j < alilen:
# Here we try to map self.seq_to_align (the sequence of the 3D chain, including gaps when residues are missing),
# with s_seq, the sequence aligned in the MSA, containing any of ACGU and two types of gaps, - and .
if self.seq_to_align[i] == s_seq[j].upper(): # alignment and sequence correspond (incl. gaps)
re_mappings.append( (self.db_chain_id, i+1, j+1) ) # because index_chain in table nucleotide is in [1,N], we use i+1 and j+1.
re_mappings.append((self.db_chain_id, i+1, j+1)) # because index_chain in table nucleotide is in [1,N], we use i+1 and j+1.
columns_to_save.add(j+1) # it's a set, duplicates are automatically ignored
i += 1
j += 1
elif self.seq_to_align[i] == '-': # gap in the chain, but not in the aligned sequence
# search for a gap to the consensus nearby
k = 0 # Search must start at zero to ensure the difference comes from a '-' in front of a '.'
while j+k<alilen and s_seq[j+k] == '.':
while j+k < alilen and s_seq[j+k] == '.':
k += 1
# if found, set j to that position
if j+k<alilen and s_seq[j+k] == '-':
re_mappings.append( (self.db_chain_id, i+1, j+k+1) )
if j+k < alilen and s_seq[j+k] == '-':
re_mappings.append((self.db_chain_id, i+1, j+k+1))
columns_to_save.add(j+k+1)
i += 1
j += k+1
continue
# if not, take the insertion gap ('.') if there is one
if j<alilen and s_seq[j] == '.':
re_mappings.append( (self.db_chain_id, i+1, j+1) )
if j < alilen and s_seq[j] == '.':
re_mappings.append((self.db_chain_id, i+1, j+1))
columns_to_save.add(j+1)
i += 1
j += 1
continue
# else, just mark the gap as unknown (there is an alignment mismatch)
re_mappings.append( (self.db_chain_id, i+1, 0) )
re_mappings.append((self.db_chain_id, i+1, 0))
i += 1
elif s_seq[j] in ['.', '-']: # gap in the alignment, but not in the real chain
j += 1 # ignore the column
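# Illustrative example (toy sequences): with seq_to_align = "AC-G" and s_seq = "AC.-G",
# positions 1 and 2 map to columns 1 and 2, the unresolved position 3 is matched to the '-'
# in column 4 (skipping the insertion column '.'), and position 4 maps to column 5.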
......@@ -672,7 +693,7 @@ class Chain:
l = letters[freq.index(max(freq))]
c_seq_to_align[i] = l
c_seq[i] = l
gaps.append((l, l=='A', l=='C', l=='G', l=='U', l=='N', self.db_chain_id, i+1 ))
gaps.append((l, l == 'A', l == 'C', l == 'G', l == 'U', l == 'N', self.db_chain_id, i+1))
self.seq_to_align = ''.join(c_seq_to_align)
self.seq = ''.join(c_seq)
return gaps
......@@ -684,6 +705,7 @@ class Job:
This could be a system command or the execution of a Python function.
Time and memory usage of a job can be monitored.
"""
def __init__(self, results="", command=[], function=None, args=[], how_many_in_parallel=0, priority=1, timeout=None, checkFunc=None, checkArgs=[], label=""):
self.cmd_ = command # A system command to run
self.func_ = function # A python function to run
......@@ -709,7 +731,8 @@ class Job:
if self.func_ is None:
s = f"{self.priority_}({self.nthreads}) [{self.comp_time}]\t{self.label:25}" + " ".join(self.cmd_)
else:
s = f"{self.priority_}({self.nthreads}) [{self.comp_time}]\t{self.label:25}{self.func_.__name__}(" + " ".join([str(a) for a in self.args_]) + ")"
s = f"{self.priority_}({self.nthreads}) [{self.comp_time}]\t{self.label:25}{self.func_.__name__}(" \
+ " ".join([ str(a) for a in self.args_ ]) + ")"
return s
......@@ -767,13 +790,14 @@ class Downloader:
print("> Fetching latest PDB mappings from Rfam..." + " " * 29, end='', flush=True)
try:
db_connection = sqlalchemy.create_engine('mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam')
mappings = pd.read_sql('SELECT rfam_acc, pdb_id, chain, pdb_start, pdb_end, bit_score, evalue_score, cm_start, cm_end, hex_colour FROM pdb_full_region WHERE is_significant=1;', con=db_connection)
mappings = pd.read_sql('SELECT rfam_acc, pdb_id, chain, pdb_start, pdb_end, bit_score, evalue_score, cm_start, cm_end, hex_colour FROM pdb_full_region WHERE is_significant=1;',
con=db_connection)
mappings.to_csv(runDir + "/data/Rfam-PDB-mappings.csv")
print(f"\t{validsymb}")
except sqlalchemy.exc.OperationalError: # Cannot connect :'(
print(f"\t{errsymb}")
# Check if a previous run succeeded (if file exists, use it)
if path.isfile(runDir + "/data/Rfam-PDB-mappings.csv"):
if os.path.isfile(runDir + "/data/Rfam-PDB-mappings.csv"):
print("\t> Using previous version.")
mappings = pd.read_csv(runDir + "/data/Rfam-PDB-mappings.csv")
else: # otherwise, abort.
......@@ -791,7 +815,7 @@ class Downloader:
setproctitle(f"RNANet.py download_Rfam_cm()")
print(f"\t> Download Rfam.cm.gz from Rfam..." + " " * 37, end='', flush=True)
if not path.isfile(path_to_seq_data + "Rfam.cm"):
if not os.path.isfile(path_to_seq_data + "Rfam.cm"):
try:
subprocess.run(["wget", "ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.cm.gz", "-O", path_to_seq_data + "Rfam.cm.gz"])
print(f"\t{validsymb}", flush=True)
......@@ -815,7 +839,6 @@ class Downloader:
try:
db_connection = sqlalchemy.create_engine('mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam')
# Prepare the SQL query. It computes the length of the chains and gets the maximum length by family.
q = """SELECT stats.rfam_acc, k.description, stats.maxlength FROM
(SELECT fr.rfam_acc, MAX(
......@@ -838,15 +861,17 @@ class Downloader:
d = pd.read_sql(q, con=db_connection)
# filter the results to families we are interested in
d = d[ d["rfam_acc"].isin(list_of_families) ]
d = d[d["rfam_acc"].isin(list_of_families)]
print(d)
with sqlite3.connect(runDir + "/results/RNANet.db", timeout=20.0) as conn:
sql_execute(conn, """
INSERT OR REPLACE INTO family (rfam_acc, description, max_len)
VALUES (?, ?, ?);""", many=True, data=list(d.to_records(index=False))
) # We use the replace keyword to get the latest information
# We use the REPLACE keyword to get the latest information
sql_execute(conn, """INSERT OR REPLACE INTO family (rfam_acc, description, max_len)
VALUES (?, ?, ?);""",
many=True,
data=list(d.to_records(index=False))
)
except sqlalchemy.exc.OperationalError:
warn("Something's wrong with the SQL database. Check mysql-rfam-public.ebi.ac.uk status and try again later. Not printing statistics.")
......@@ -858,10 +883,11 @@ class Downloader:
setproctitle(f"RNANet.py download_Rfam_sequences({rfam_acc})")
if not path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"):
if not os.path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"):
for _ in range(10): # retry 10 times if it fails
try:
subprocess.run(["wget", f'ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/fasta_files/{rfam_acc}.fa.gz', "-O", path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"], stdout=subprocess.DEVNULL)
subprocess.run(["wget", f'ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/fasta_files/{rfam_acc}.fa.gz', "-O",
path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
notify(f"Downloaded {rfam_acc}.fa.gz from Rfam")
return # if it worked, no need to retry
except Exception as e:
......@@ -881,8 +907,9 @@ class Downloader:
setproctitle(f"RNANet.py download_BGSU_NR_list({res})")
nr_code = min([ i for i in [1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 20.0] if i >= res ])
nr_code = min([i for i in [1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 20.0] if i >= res])
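# e.g. a requested resolution of 3.2 A falls back to the 3.5 A non-redundant list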
print(f"> Fetching latest list of RNA files at {nr_code} A resolution from BGSU website...", end='', flush=True)
# Download latest BGSU non-redundant list
try:
s = requests.get(f"http://rna.bgsu.edu/rna3dhub/nrlist/download/current/{nr_code}A/csv").content
......@@ -894,13 +921,13 @@ class Downloader:
warn("Error downloading NR list !\t", error=True)
# Try to read previous file
if path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"):
print("\t> Use of the previous version.\t", end = "", flush=True)
if os.path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"):
print("\t> Use of the previous version.\t", end="", flush=True)
else:
return pd.DataFrame([], columns=["class", "class_members"])
nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv")
full_structures_list = [ tuple(i[1]) for i in nrlist[['class','class_members']].iterrows() ]
full_structures_list = [ tuple(i[1]) for i in nrlist[['class', 'class_members']].iterrows() ]
print(f"\t{validsymb}", flush=True)
# The beginning of an adventure.
......@@ -910,14 +937,15 @@ class Downloader:
setproctitle(f"RNANet.py download_from_SILVA({unit})")
if not path.isfile(path_to_seq_data + f"realigned/{unit}.arb"):
if not os.path.isfile(path_to_seq_data + f"realigned/{unit}.arb"):
try:
print(f"Downloading {unit} from SILVA...", end='', flush=True)
if unit=="LSU":
subprocess.run(["wget", "http://www.arb-silva.de/fileadmin/arb_web_db/release_132/ARB_files/SILVA_132_LSURef_07_12_17_opt.arb.gz", "-O", path_to_seq_data + "realigned/LSU.arb.gz"])
if unit == "LSU":
subprocess.run(["wget", "-nv", "http://www.arb-silva.de/fileadmin/arb_web_db/release_132/ARB_files/SILVA_132_LSURef_07_12_17_opt.arb.gz",
"-O", path_to_seq_data + "realigned/LSU.arb.gz"])
else:
subprocess.run(["wget", "http://www.arb-silva.de/fileadmin/silva_databases/release_138/ARB_files/SILVA_138_SSURef_05_01_20_opt.arb.gz", "-O", path_to_seq_data + "realigned/SSU.arb.gz"])
subprocess.run(["wget", "-nv", "http://www.arb-silva.de/fileadmin/silva_databases/release_138/ARB_files/SILVA_138_SSURef_05_01_20_opt.arb.gz",
"-O", path_to_seq_data + "realigned/SSU.arb.gz"])
except:
warn(f"Error downloading the {unit} database from SILVA", error=True)
exit(1)
......@@ -949,7 +977,8 @@ class Mapping:
def filter_df(self, df):
newdf = df.drop(df[(df.nt_resnum < self.nt_start) | (df.nt_resnum > self.nt_end)].index)
newdf = df.drop(df[(df.nt_resnum < self.nt_start) |
(df.nt_resnum > self.nt_end)].index)
if len(newdf.index_chain) > 0:
# everything's okay
......@@ -961,19 +990,20 @@ class Mapping:
# index_chain and not nt_resnum.
warn(f"Assuming mapping to {self.rfam_acc} is an absolute position interval.")
weird_mappings.add(self.chain_label + "." + self.rfam_acc)
df = df.drop(df[(df.index_chain < self.nt_start) | (df.index_chain > self.nt_end)].index)
df = df.drop(df[(df.index_chain < self.nt_start) |
(df.index_chain > self.nt_end)].index)
# If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one
self.st = 0
if len(df.index_chain) and df.iloc[0,0] != 1:
self.st = df.iloc[0,0] -1
if len(df.index_chain) and df.iloc[0, 0] != 1:
self.st = df.iloc[0, 0] - 1
df.iloc[:, 0] -= self.st
self.log(f"Shifting index_chain of {self.st}")
# Check that some residues are not included by mistake:
# e.g. 4v4t-AA.RF00382-20-55 contains 4 residues numbered 30 but actually far beyond the mapped part,
# because the insertion codes (icodes) are not read by DSSR.
toremove = df[ df.index_chain > self.nt_end ]
toremove = df[df.index_chain > self.nt_end]
if not toremove.empty:
df = df.drop(toremove.index)
self.log(f"Some nt_resnum values are likely to be wrong, not considering residues:")
......@@ -991,9 +1021,9 @@ class Mapping:
if self.logs == []:
return # Do not create a log file if there is nothing to log
if not path.exists("logs"):
os.makedirs("logs", exist_ok=True)
with open("logs/"+filename, "w") as f:
if not os.path.exists(runDir+"/logs"):
os.makedirs(runDir+"/logs", exist_ok=True)
with open(runDir+"/logs/"+filename, "w") as f:
f.writelines(self.logs)
......@@ -1019,20 +1049,23 @@ class Pipeline:
self.SELECT_ONLY = None
self.ARCHIVE = False
self.SAVELOGS = True
self.FULLINFERENCE = False
def process_options(self):
"""Sets the paths and options of the pipeline"""
"""Sets the paths and options of the pipeline
"""
global path_to_3D_data
global path_to_seq_data
setproctitle("RNANet.py process_options()")
try:
opts, _ = getopt.getopt( sys.argv[1:], "r:hs",
[ "help", "resolution=", "keep-hetatm=", "from-scratch",
opts, _ = getopt.getopt(sys.argv[1:], "r:fhs",
["help", "resolution=", "keep-hetatm=", "from-scratch", "full-inference,"
"fill-gaps=", "3d-folder=", "seq-folder=",
"no-homology", "ignore-issues", "extract", "only=", "all", "no-logs",
"archive", "update-homologous" ])
"archive", "update-homologous"])
except getopt.GetoptError as err:
print(err)
sys.exit(2)
......@@ -1044,13 +1077,15 @@ class Pipeline:
exit()
if opt == "-h" or opt == "--help":
print( "RNANet, a script to build a multiscale RNA dataset from public data\n"
print("RNANet, a script to build a multiscale RNA dataset from public data\n"
"Developped by Louis Becquey (louis.becquey@univ-evry.fr), 2020")
print()
print("Options:")
print("-h [ --help ]\t\t\tPrint this help message")
print("--version\t\t\tPrint the program version")
print()
print("-f [ --full-inference ]\t\tInfer new mappings even if Rfam already provides some. Yields more copies of chains"
"\n\t\t\t\tmapped to different families.")
print("-r 4.0 [ --resolution=4.0 ]\tMaximum 3D structure resolution to consider a RNA chain.")
print("-s\t\t\t\tRun statistics computations after completion")
print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.")
......@@ -1062,7 +1097,7 @@ class Pipeline:
"\n\t\t\t\t\tRNAcifs/\t\tFull structures containing RNA, in mmCIF format"
"\n\t\t\t\t\trna_mapped_to_Rfam/\tExtracted 'pure' RNA chains"
"\n\t\t\t\t\tdatapoints/\t\tFinal results in CSV file format.")
print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files."
print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files. Subfolders will be:"
"\n\t\t\t\t\trfam_sequences/fasta/\tCompressed hits to Rfam families"
"\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family")
print("--no-homology\t\t\tDo not try to compute PSSMs and do not align sequences."
......@@ -1077,7 +1112,7 @@ class Pipeline:
print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications")
print()
print("Typical usage:")
print(f"nohup bash -c 'time {runDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --archive' &")
print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &")
sys.exit()
elif opt == '--version':
print("RNANet 1.1 beta")
......@@ -1087,21 +1122,21 @@ class Pipeline:
self.CRYSTAL_RES = float(arg)
elif opt == "-s":
self.RUN_STATS = True
elif opt=="--keep-hetatm":
assert arg in [ "True", "False" ]
elif opt == "--keep-hetatm":
assert arg in ["True", "False"]
self.KEEP_HETATM = (arg == "True")
elif opt=="--fill-gaps":
assert arg in [ "True", "False" ]
elif opt == "--fill-gaps":
assert arg in ["True", "False"]
self.FILL_GAPS = (arg == "True")
elif opt=="--no-homology":
elif opt == "--no-homology":
self.HOMOLOGY = False
elif opt=='--3d-folder':
path_to_3D_data = path.abspath(arg)
elif opt == '--3d-folder':
path_to_3D_data = os.path.abspath(arg)
if path_to_3D_data[-1] != '/':
path_to_3D_data += '/'
print("> Storing 3D data into", path_to_3D_data)
elif opt=='--seq-folder':
path_to_seq_data = path.abspath(arg)
elif opt == '--seq-folder':
path_to_seq_data = os.path.abspath(arg)
if path_to_seq_data[-1] != '/':
path_to_seq_data += '/'
print("> Storing sequences into", path_to_seq_data)
......@@ -1138,6 +1173,8 @@ class Pipeline:
self.ARCHIVE = True
elif opt == "--no-logs":
self.SAVELOGS = False
elif opt == "-f" or opt == "--full-inference":
self.FULLINFERENCE = True
if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions":
print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments")
......@@ -1149,17 +1186,19 @@ class Pipeline:
"""List 3D chains with available Rfam mappings.
Return a list of Chain() objects with the mappings set up.
If self.HOMOLOGY is set to False, simply returns a list of Chain() objects with available 3D chains."""
If self.HOMOLOGY is set to False, simply returns a list of Chain() objects with available 3D chains.
"""
setproctitle("RNANet.py list_available_mappings()")
# List all 3D RNA chains below given resolution
full_structures_list = self.dl.download_BGSU_NR_list(self.CRYSTAL_RES) # list of tuples ( class, class_members )
full_structures_list = self.dl.download_BGSU_NR_list(
self.CRYSTAL_RES) # list of tuples ( class, class_members )
# Check for a list of known problems:
if path.isfile(runDir + "/known_issues.txt"):
if os.path.isfile(runDir + "/known_issues.txt"):
with open(runDir + "/known_issues.txt", 'r') as issues:
self.known_issues = [ x[:-1] for x in issues.readlines() ]
self.known_issues = [x[:-1] for x in issues.readlines()]
if self.USE_KNOWN_ISSUES:
print("\t> Ignoring known issues:")
for x in self.known_issues:
......@@ -1175,9 +1214,18 @@ class Pipeline:
p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=ncores)
try:
pbar = tqdm(full_structures_list, maxinterval=1.0, miniters=1, desc="Eq. classes", bar_format="{desc}:{percentage:3.0f}%|{bar}|")
for _, newchains in enumerate(p.imap_unordered(partial(work_infer_mappings, not self.REUSE_ALL, allmappings), full_structures_list, chunksize=1)):
pbar = tqdm(full_structures_list, maxinterval=1.0, miniters=1,
desc="Eq. classes", bar_format="{desc}:{percentage:3.0f}%|{bar}|")
for _, newchains in enumerate(p.imap_unordered(partial(
work_infer_mappings,
not self.REUSE_ALL,
allmappings,
self.FULLINFERENCE
),
full_structures_list,
chunksize=1)):
self.update += newchains
pbar.update(1) # Everytime the iteration finishes, update the global progress bar
pbar.close()
......@@ -1192,7 +1240,7 @@ class Pipeline:
else:
conn = sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0)
for eq_class, codelist in tqdm(full_structures_list, desc="Eq. classes"):
codes = codelist.replace('+',',').split(',')
codes = codelist.replace('+', ',').split(',')
# Simply convert the list of codes to Chain() objects
for c in codes:
......@@ -1201,40 +1249,48 @@ class Pipeline:
pdb_model = int(nr[1])
pdb_chain_id = nr[2].upper()
chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}"
res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc = 'unmappd' AND issue=0""")
res = sql_ask_database(conn, f"""SELECT chain_id from chain
WHERE structure_id='{pdb_id}'
AND chain_name='{pdb_chain_id}'
AND rfam_acc = 'unmappd'
AND issue=0""")
if not len(res) or self.REUSE_ALL: # the chain is NOT yet in the database, or this is a known issue
self.update.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class))
conn.close()
if self.SELECT_ONLY is not None:
self.update = [ c for c in self.update if c.chain_label == self.SELECT_ONLY ]
self.update = [
c for c in self.update if c.chain_label == self.SELECT_ONLY]
self.n_chains = len(self.update)
print(str(self.n_chains) + " RNA chains of interest.")
@trace_unhandled_exceptions
def dl_and_annotate(self, retry=False, coeff_ncores = 0.75):
def dl_and_annotate(self, retry=False, coeff_ncores=0.75):
"""
Gets mmCIF files from the PDB, and runs DSSR on them.
Ignores a structure if the file already exists (not if we are retrying).
REQUIRES the previous definition of self.update, so call list_available_mappings() before.
SETS table structure"""
SETS table structure
"""
# setproctitle(f"RNANet.py dl_and_annotate(retry={retry})")
setproctitle(f"RNANet.py dl_and_annotate(retry={retry})")
# Prepare the results folders
if not path.isdir(path_to_3D_data + "RNAcifs"):
os.makedirs(path_to_3D_data + "RNAcifs") # for the whole structures
if not path.isdir(path_to_3D_data + "annotations"):
os.makedirs(path_to_3D_data + "annotations") # for DSSR analysis of the whole structures
if not os.path.isdir(path_to_3D_data + "RNAcifs"):
# for the whole structures
os.makedirs(path_to_3D_data + "RNAcifs")
if not os.path.isdir(path_to_3D_data + "annotations"):
# for DSSR analysis of the whole structures
os.makedirs(path_to_3D_data + "annotations")
# Download and annotate
print("> Downloading and annotating structures (or checking previous results if they exist)...", flush=True)
if retry:
mmcif_list = sorted(set([ c.pdb_id for c in self.retry ]))
mmcif_list = sorted(set([c.pdb_id for c in self.retry]))
else:
mmcif_list = sorted(set([ c.pdb_id for c in self.update ]))
mmcif_list = sorted(set([c.pdb_id for c in self.update]))
try:
p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=int(coeff_ncores*ncores))
pbar = tqdm(mmcif_list, maxinterval=1.0, miniters=1, desc="mmCIF files")
......@@ -1255,16 +1311,19 @@ class Pipeline:
and extract their information from the JSON files into the database.
REQUIRES the previous definition of self.update, so call list_available_mappings() before.
SETS self.loaded_chains"""
SETS self.loaded_chains
"""
setproctitle(f"RNANet.py build_chains(retry={retry})")
# Prepare folders
if self.EXTRACT_CHAINS:
if self.HOMOLOGY and not path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"):
os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam") # for the portions mapped to Rfam
if (not self.HOMOLOGY) and not path.isdir(path_to_3D_data + "rna_only"):
os.makedirs(path_to_3D_data + "rna_only") # extract chains of pure RNA
if self.HOMOLOGY and not os.path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"):
# for the portions mapped to Rfam
os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam")
if (not self.HOMOLOGY) and not os.path.isdir(path_to_3D_data + "rna_only"):
# extract chains of pure RNA
os.makedirs(path_to_3D_data + "rna_only")
# define and run jobs
joblist = []
......@@ -1296,44 +1355,48 @@ class Pipeline:
issues += 1
issues_names.append(c[1].chain_label)
ki.write(c[1].chain_label + '\n')
kir.write(c[1].chain_label + '\n' + c[1].error_messages + '\n\n')
kir.write(c[1].chain_label + '\n' +
c[1].error_messages + '\n\n')
with sqlite3.connect(runDir+"/results/RNANet.db") as conn:
sql_execute(conn, f"UPDATE chain SET issue = 1 WHERE chain_id = ?;", data=(c[1].db_chain_id,))
ki.close()
kir.close()
if issues:
warn(f"Added {issues} newly discovered issues to known issues:")
print("\033[33m"+ " ".join(issues_names) + "\033[0m", flush=True)
print("\033[33m" + " ".join(issues_names) + "\033[0m", flush=True)
# Add successfully built chains to list
self.loaded_chains += [ c[1] for c in results if not c[1].delete_me ]
self.loaded_chains += [c[1] for c in results if not c[1].delete_me]
# Identify errors due to empty JSON files (this happens when RAM is full, we believe).
# Retrying often solves the issue... so retry once with half the cores to limit the RAM usage.
self.to_retry = [ c[1] for c in results if "Could not load existing" in c[1].error_messages ]
def checkpoint_save_chains(self):
"""Saves self.loaded_chains to data/loaded_chains.picke"""
with open(runDir + "/data/loaded_chains.pickle","wb") as pick:
"""Saves self.loaded_chains to data/loaded_chains.picke
"""
with open(runDir + "/data/loaded_chains.pickle", "wb") as pick:
pickle.dump(self.loaded_chains, pick)
def checkpoint_load_chains(self):
"""Load self.loaded_chains from data/loaded_chains.pickle"""
with open(runDir + "/data/loaded_chains.pickle","rb") as pick:
"""Load self.loaded_chains from data/loaded_chains.pickle
"""
with open(runDir + "/data/loaded_chains.pickle", "rb") as pick:
self.loaded_chains = pickle.load(pick)
def prepare_sequences(self):
"""Downloads homologous sequences and covariance models required to compute MSAs.
REQUIRES that self.loaded_chains is defined.
SETS family (partially, through call)"""
SETS family (partially, through call)
"""
setproctitle("RNANet.py prepare_sequences()")
# Preparing a results folder
if not os.access(path_to_seq_data + "realigned/", os.F_OK):
os.makedirs(path_to_seq_data + "realigned/")
if not path.isdir(path_to_seq_data + "rfam_sequences/fasta/"):
if not os.path.isdir(path_to_seq_data + "rfam_sequences/fasta/"):
os.makedirs(path_to_seq_data + "rfam_sequences/fasta/", exist_ok=True)
# Update the family table (rfam_acc, description, max_len)
......@@ -1344,7 +1407,8 @@ class Pipeline:
joblist = []
for f in self.fam_list:
joblist.append(Job(function=work_prepare_sequences, how_many_in_parallel=ncores, args=[self.dl, f, rfam_acc_to_download[f]]))
joblist.append(Job(function=work_prepare_sequences, how_many_in_parallel=ncores, args=[
self.dl, f, rfam_acc_to_download[f]]))
try:
execute_joblist(joblist)
......@@ -1360,14 +1424,16 @@ class Pipeline:
"""Perform multiple sequence alignments.
REQUIRES self.fam_list to be defined
SETS family (partially)"""
SETS family (partially)
"""
setproctitle("RNANet.py realign()")
# Prepare the job list
joblist = []
for f in self.fam_list:
joblist.append( Job(function=work_realign, args=[f], how_many_in_parallel=1, label=f)) # the function already uses all CPUs so launch them one by one
# the function already uses all CPUs so launch them one by one (how_many_in_parallel=1)
joblist.append(Job(function=work_realign, args=[f], how_many_in_parallel=1, label=f))
# Execute the jobs
try:
......@@ -1379,8 +1445,8 @@ class Pipeline:
# Update the database
data = []
for r in results:
align = AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta")
nb_3d_chains = len([ 1 for r in align if '[' in r.id ])
align = Bio.AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta")
nb_3d_chains = len([1 for r in align if '[' in r.id])
if r[0] in SSU_set: # SSU v138 is used
nb_homologs = 2225272 # source: https://www.arb-silva.de/documentation/release-138/
nb_total_homol = nb_homologs + nb_3d_chains
......@@ -1390,7 +1456,7 @@ class Pipeline:
else:
nb_total_homol = len(align)
nb_homologs = nb_total_homol - nb_3d_chains
data.append( (nb_homologs, nb_3d_chains, nb_total_homol, r[2], r[3], r[0]) )
data.append((nb_homologs, nb_3d_chains, nb_total_homol, r[2], r[3], r[0]))
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
sql_execute(conn, """UPDATE family SET nb_homologs = ?, nb_3d_chains = ?, nb_total_homol = ?, comput_time = ?, comput_peak_mem = ?
......@@ -1399,13 +1465,14 @@ class Pipeline:
def remap(self):
"""Compute nucleotide frequencies of some alignments and save them in the database
REQUIRES self.fam_list to be defined"""
REQUIRES self.fam_list to be defined
"""
setproctitle("RNANet.py remap()")
print("Computing nucleotide frequencies in alignments...\nThis can be very long on slow storage devices (Hard-drive...)")
print("Check your CPU and disk I/O activity before deciding if the job failed.")
nworkers =max(min(ncores, len(self.fam_list)), 1)
nworkers = max(min(ncores, len(self.fam_list)), 1)
# Prepare the architecture of a shiny multi-progress-bars design
# Push the number of workers to a queue.
......@@ -1419,8 +1486,10 @@ class Pipeline:
try:
fam_pbar = tqdm(total=len(self.fam_list), desc="RNA families", position=0, leave=True)
for i, _ in enumerate(p.imap_unordered(partial(work_pssm, fill_gaps=self.FILL_GAPS), self.fam_list, chunksize=1)): # Apply work_pssm to each RNA family
fam_pbar.update(1) # Everytime the iteration finishes on a family, update the global progress bar over the RNA families
# Apply work_pssm to each RNA family
for i, _ in enumerate(p.imap_unordered(partial(work_pssm, fill_gaps=self.FILL_GAPS), self.fam_list, chunksize=1)):
# Everytime the iteration finishes on a family, update the global progress bar over the RNA families
fam_pbar.update(1)
fam_pbar.close()
p.close()
p.join()
......@@ -1434,23 +1503,24 @@ class Pipeline:
def output_results(self):
"""Produces CSV files, archive them, and additional metadata files
REQUIRES self.loaded_chains (to output corresponding CSV files) and self.fam_list (for statistics)"""
REQUIRES self.loaded_chains (to output corresponding CSV files) and self.fam_list (for statistics)
"""
setproctitle("RNANet.py output_results()")
time_str = time.strftime("%Y%m%d")
#Prepare folders:
if not path.isdir(path_to_3D_data + "datapoints/"):
# Prepare folders:
if not os.path.isdir(path_to_3D_data + "datapoints/"):
os.makedirs(path_to_3D_data + "datapoints/")
if not path.isdir(runDir + "/results/archive/"):
if not os.path.isdir(runDir + "/results/archive/"):
os.makedirs(runDir + "/results/archive/")
# Save to by-chain CSV files
p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=3)
try:
pbar = tqdm(total=len(self.loaded_chains), desc="Saving chains to CSV", position=0, leave=True)
for _, _2 in enumerate(p.imap_unordered(work_save, self.loaded_chains, chunksize=2)):
for _, _2 in enumerate(p.imap_unordered(work_save, self.loaded_chains)):
pbar.update(1)
pbar.close()
p.close()
......@@ -1465,36 +1535,44 @@ class Pipeline:
# Run statistics
if self.RUN_STATS:
# Remove previous precomputed data
subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"])
subprocess.run(["rm", "-f", runDir + "/data/wadley_kernel_eta.npz",
runDir + "/data/wadley_kernel_eta_prime.npz",
runDir + "/data/pair_counts.csv"])
for f in self.fam_list:
subprocess.run(["rm","-f", f"data/{f}.npy", f"data/{f}_pairs.csv", f"data/{f}_counts.csv"])
subprocess.run(["rm", "-f", runDir + f"/data/{f}.npy",
runDir + f"/data/{f}_pairs.csv",
runDir + f"/data/{f}_counts.csv"])
# Run statistics files
os.chdir(runDir)
subprocess.run(["python3.8", "regression.py"])
subprocess.run(["python3.8", "statistics.py", path_to_3D_data, path_to_seq_data])
subprocess.run(["python3.8", fileDir+"/regression.py"])
subprocess.run(["python3.8", fileDir+"/statistics.py", "--3d-folder", path_to_3D_data,
"--seq-folder", path_to_seq_data, "-r", str(self.CRYSTAL_RES)])
# Save additional informations
with sqlite3.connect(runDir+"/results/RNANet.db") as conn:
pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;",
pd.read_sql_query("""SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem
FROM family ORDER BY nb_3d_chains DESC;""",
conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
pd.read_sql_query("""SELECT eq_class, structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure
pd.read_sql_query("""SELECT eq_class, structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue
FROM structure
JOIN chain ON structure.pdb_id = chain.structure_id
ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
ORDER BY structure_id, chain_name, rfam_acc ASC;""",
conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
# Archive the results
if self.SELECT_ONLY is None:
os.makedirs("results/archive", exist_ok=True)
subprocess.run(["tar","-C", path_to_3D_data + "/datapoints","-czf",f"results/archive/RNANET_datapoints_{time_str}.tar.gz","."])
if self.ARCHIVE:
os.makedirs(runDir + "/results/archive", exist_ok=True)
subprocess.run(["tar", "-C", path_to_3D_data + "/datapoints", "-czf",
runDir + f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", "."])
# Update shortcuts to latest versions
subprocess.run(["rm", "-f", runDir + "/results/RNANET_datapoints_latest.tar.gz",
runDir + "/results/summary_latest.csv",
runDir + "/results/families_latest.csv"
])
subprocess.run(['ln',"-s", runDir +f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", runDir + "/results/RNANET_datapoints_latest.tar.gz"])
subprocess.run(['ln',"-s", runDir +f"/results/archive/summary_{time_str}.csv", runDir + "/results/summary_latest.csv"])
subprocess.run(['ln',"-s", runDir +f"/results/archive/families_{time_str}.csv", runDir + "/results/families_latest.csv"])
subprocess.run(['ln', "-s", runDir + f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", runDir + "/results/RNANET_datapoints_latest.tar.gz"])
subprocess.run(['ln', "-s", runDir + f"/results/archive/summary_{time_str}.csv", runDir + "/results/summary_latest.csv"])
subprocess.run(['ln', "-s", runDir + f"/results/archive/families_{time_str}.csv", runDir + "/results/families_latest.csv"])
def sanitize_database(self):
"""Searches for issues in the database and correct them"""
......@@ -1518,7 +1596,9 @@ class Pipeline:
if self.HOMOLOGY:
# check if chains have been re_mapped:
r = sql_ask_database(conn, """SELECT COUNT(DISTINCT chain_id) AS Count, rfam_acc FROM chain
WHERE issue = 0 AND chain_id NOT IN (SELECT DISTINCT chain_id FROM re_mapping)
WHERE issue = 0
AND rfam_acc != 'unmappd'
AND chain_id NOT IN (SELECT DISTINCT chain_id FROM re_mapping)
GROUP BY rfam_acc;""")
try:
if len(r) and r[0][0] is not None:
......@@ -1545,22 +1625,25 @@ class Pipeline:
def read_cpu_number():
# As one shall not use os.cpu_count() on LXC containers,
# because it reads info from /sys wich is not the VM resources but the host resources.
# This function reads it from /proc/cpuinfo instead.
"""This function reads the number of CPU cores available from /proc/cpuinfo.
One shall not use os.cpu_count() on LXC containers,
because it reads info from /sys, which holds the host resources rather than the VM's.
"""
p = subprocess.run(['grep', '-Ec', '(Intel|AMD)', '/proc/cpuinfo'], stdout=subprocess.PIPE)
return int(int(p.stdout.decode('utf-8')[:-1])/2)
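# The grep above counts logical processors; dividing by two assumes hyper-threading
# (two threads per physical core), so this may undercount on CPUs without SMT.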
def init_worker(tqdm_lock=None):
signal.signal(signal.SIGINT, signal.SIG_IGN)
if tqdm_lock is not None:
tqdm.set_lock(tqdm_lock)
def warn(message, error=False):
"""Pretty-print warnings and error messages.
"""
# Cut if too long
if len(message)>66:
if len(message) > 66:
x = message.find(' ', 50, 66)
if x != -1:
warn(message[:x], error=error)
......@@ -1574,11 +1657,13 @@ def warn(message, error=False):
else:
print(f"\t> \033[33mWARN: {message:64s}\033[0m\t{warnsymb}", flush=True)
def notify(message, post=''):
if len(post):
post = '(' + post + ')'
print(f"\t> {message:70s}\t{validsymb}\t{post}", flush=True)
def sql_define_tables(conn):
conn.executescript(
""" PRAGMA foreign_keys = on;
......@@ -1684,8 +1769,9 @@ def sql_define_tables(conn):
""")
conn.commit()
@trace_unhandled_exceptions
def sql_ask_database(conn, sql, warn_every = 10):
def sql_ask_database(conn, sql, warn_every=10):
"""
Reads the SQLite database.
Returns a list of tuples.
......@@ -1698,11 +1784,13 @@ def sql_ask_database(conn, sql, warn_every = 10):
return result # if it worked, no need to retry
except sqlite3.OperationalError as e:
if warn_every and not (_+1) % warn_every:
warn(str(e) + ", retrying in 0.2s (worker " + str(os.getpid()) + f', try {_+1}/100)')
warn(str(e) + ", retrying in 0.2s (worker " +
str(os.getpid()) + f', try {_+1}/100)')
time.sleep(0.2)
warn("Tried to reach database 100 times and failed. Aborting.", error=True)
return []
@trace_unhandled_exceptions
def sql_execute(conn, sql, many=False, data=None, warn_every=10):
conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
......@@ -1721,10 +1809,12 @@ def sql_execute(conn, sql, many=False, data=None, warn_every=10):
return # if it worked, no need to retry
except sqlite3.OperationalError as e:
if warn_every and not (_+1) % warn_every:
warn(str(e) + ", retrying in 0.2s (worker " + str(os.getpid()) + f', try {_+1}/100)')
warn(str(e) + ", retrying in 0.2s (worker " +
str(os.getpid()) + f', try {_+1}/100)')
time.sleep(0.2)
warn("Tried to reach database 100 times and failed. Aborting.", error=True)
@trace_unhandled_exceptions
def execute_job(j, jobcount):
"""Run a Job object.
......@@ -1741,7 +1831,8 @@ def execute_job(j, jobcount):
print(f"[{running_stats[0]+running_stats[2]}/{jobcount}]\t{j.label}")
# Add the command to logfile
logfile = open(runDir + "/log_of_the_run.sh", 'a')
os.makedirs(runDir+"/logs", exist_ok=True)
logfile = open(runDir + "/logs/log_of_the_run.sh", 'a')
logfile.write(" ".join(j.cmd_))
logfile.write("\n")
logfile.close()
......@@ -1753,7 +1844,8 @@ def execute_job(j, jobcount):
# run the command. subprocess.run will be a child of this process, and stays monitored.
start_time = time.time()
r = subprocess.run(j.cmd_, timeout=j.timeout_, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
r = subprocess.run(j.cmd_, timeout=j.timeout_,
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
end_time = time.time()
# Stop the Monitor, then get its result
......@@ -1782,7 +1874,8 @@ def execute_job(j, jobcount):
# return time and memory statistics, plus the job results
t = end_time - start_time
return (t,m,r)
return (t, m, r)
def execute_joblist(fulljoblist):
""" Run a list of job objects.
......@@ -1815,8 +1908,9 @@ def execute_joblist(fulljoblist):
# Process the jobs from priority 1 to nprio
results = []
for i in range(1,nprio+1):
if i not in jobs.keys(): continue # no job has the priority level i
for i in range(1, nprio+1):
if i not in jobs.keys():
continue # no job has the priority level i
print("processing jobs of priority", i)
different_thread_numbers = sorted(jobs[i].keys())
......@@ -1825,7 +1919,8 @@ def execute_joblist(fulljoblist):
for n in different_thread_numbers:
# get the bunch of jobs of same priority and thread number
bunch = jobs[i][n]
if not len(bunch): continue # no jobs should be processed n by n
if not len(bunch):
continue # no jobs should be processed n by n
print("using", n, "processes:")
# execute jobs of priority i that should be processed n by n:
......@@ -1843,13 +1938,14 @@ def execute_joblist(fulljoblist):
for j, r in zip(bunch, raw_results):
j.comp_time = round(r[0], 2) # seconds
j.max_mem = int(r[1]/1000000) # MB
results.append( (j.label, r[2], round(r[0], 2), int(r[1]/1000000)))
results.append((j.label, r[2], round(r[0], 2), int(r[1]/1000000)))
# throw back the money
return results
@trace_unhandled_exceptions
def work_infer_mappings(update_only, allmappings, codelist) -> list:
def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> list:
"""Given a list of PDB chains corresponding to an equivalence class from BGSU's NR list,
build a list of Chain() objects mapped to Rfam families, by expanding available mappings
of any element of the list to all the list elements.
......@@ -1862,13 +1958,13 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
# Split the comma-separated list of chain codes into chain codes:
eq_class = codelist[0]
codes = codelist[1].replace('+',',').split(',')
codes = codelist[1].replace('+', ',').split(',')
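# e.g. a class member list like "1J5E|1|A+4V4Q|1|AA" (illustrative codes) is split into
# ["1J5E|1|A", "4V4Q|1|AA"]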
# Search for mappings that apply to an element of this PDB chains list:
for c in codes:
# search for Rfam mappings with this chain c:
m_row_indices = allmappings.pdb_id + "|1|" + allmappings.chain == c[:4].lower()+c[4:]
m = allmappings.loc[m_row_indices].drop(['bit_score','evalue_score','cm_start','cm_end','hex_colour'], axis=1)
m = allmappings.loc[m_row_indices].drop(['bit_score', 'evalue_score', 'cm_start', 'cm_end', 'hex_colour'], axis=1)
if len(m):
# remove the found mappings from the dataframe
allmappings = allmappings.loc[m_row_indices == False]
......@@ -1881,7 +1977,7 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
families = set(known_mappings['rfam_acc'])
# generalize
inferred_mappings = known_mappings.drop(['pdb_id','chain'], axis=1).drop_duplicates()
inferred_mappings = known_mappings.drop(['pdb_id', 'chain'], axis=1).drop_duplicates()
# check for approximate redundancy:
if len(inferred_mappings) != len(inferred_mappings.drop_duplicates(subset="rfam_acc")):
......@@ -1890,11 +1986,11 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
# ==> Summarize them in one mapping but with the largest window.
for rfam in families:
sel_5_to_3 = (inferred_mappings['pdb_start'] < inferred_mappings['pdb_end'])
thisfam_5_3 = (inferred_mappings['rfam_acc'] == rfam ) & sel_5_to_3
thisfam_3_5 = (inferred_mappings['rfam_acc'] == rfam ) & (sel_5_to_3 == False)
thisfam_5_3 = (inferred_mappings['rfam_acc'] == rfam) & sel_5_to_3
thisfam_3_5 = (inferred_mappings['rfam_acc'] == rfam) & (sel_5_to_3 == False)
if (
len(inferred_mappings[thisfam_5_3]) != len(inferred_mappings[ inferred_mappings['rfam_acc'] == rfam ])
len(inferred_mappings[thisfam_5_3]) != len(inferred_mappings[inferred_mappings['rfam_acc'] == rfam])
and len(inferred_mappings[thisfam_5_3]) > 0
):
# there are mappings in both directions... wtf Rfam ?!
......@@ -1908,8 +2004,8 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
# We keep only the 5->3 sense.
inferred_mappings = inferred_mappings.drop(index=inferred_mappings.index[thisfam_3_5])
sel_5_to_3 = (inferred_mappings['pdb_start'] < inferred_mappings['pdb_end'])
thisfam_5_3 = (inferred_mappings['rfam_acc'] == rfam ) & sel_5_to_3
thisfam_3_5 = (inferred_mappings['rfam_acc'] == rfam ) & (sel_5_to_3 == False)
thisfam_5_3 = (inferred_mappings['rfam_acc'] == rfam) & sel_5_to_3
thisfam_3_5 = (inferred_mappings['rfam_acc'] == rfam) & (sel_5_to_3 == False)
print()
warn(f"Found mappings to {rfam} in both directions on the same interval, keeping only the 5'->3' one.")
else:
......@@ -1919,35 +2015,35 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
# Compute consensus for chains in 5' -> 3' sense
if len(inferred_mappings[thisfam_5_3]):
pdb_start_min = min(inferred_mappings[ thisfam_5_3]['pdb_start'])
pdb_end_max = max(inferred_mappings[ thisfam_5_3]['pdb_end'])
pdb_start_max = max(inferred_mappings[ thisfam_5_3]['pdb_start'])
pdb_end_min = min(inferred_mappings[ thisfam_5_3]['pdb_end'])
pdb_start_min = min(inferred_mappings[thisfam_5_3]['pdb_start'])
pdb_end_max = max(inferred_mappings[thisfam_5_3]['pdb_end'])
pdb_start_max = max(inferred_mappings[thisfam_5_3]['pdb_start'])
pdb_end_min = min(inferred_mappings[thisfam_5_3]['pdb_end'])
if (pdb_start_max - pdb_start_min < 100) and (pdb_end_max - pdb_end_min < 100):
# the variation is only a few nucleotides, so we take the largest window.
inferred_mappings.loc[ thisfam_5_3, 'pdb_start'] = pdb_start_min
inferred_mappings.loc[ thisfam_5_3, 'pdb_end'] = pdb_end_max
inferred_mappings.loc[thisfam_5_3, 'pdb_start'] = pdb_start_min
inferred_mappings.loc[thisfam_5_3, 'pdb_end'] = pdb_end_max
else:
# there probably is an outlier. We choose the median value from the whole list of known_mappings.
known_sel_5_to_3 = (known_mappings['rfam_acc'] == rfam ) & (known_mappings['pdb_start'] < known_mappings['pdb_end'])
inferred_mappings.loc[ thisfam_5_3, 'pdb_start'] = known_mappings.loc[known_sel_5_to_3, 'pdb_start'].median()
inferred_mappings.loc[ thisfam_5_3, 'pdb_end'] = known_mappings.loc[known_sel_5_to_3, 'pdb_end'].median()
known_sel_5_to_3 = (known_mappings['rfam_acc'] == rfam) & (known_mappings['pdb_start'] < known_mappings['pdb_end'])
inferred_mappings.loc[thisfam_5_3, 'pdb_start'] = known_mappings.loc[known_sel_5_to_3, 'pdb_start'].median()
inferred_mappings.loc[thisfam_5_3, 'pdb_end'] = known_mappings.loc[known_sel_5_to_3, 'pdb_end'].median()
# Compute consensus for chains in 3' -> 5' sense
if len(inferred_mappings[thisfam_3_5]):
pdb_start_min = min(inferred_mappings[ thisfam_3_5]['pdb_start'])
pdb_end_max = max(inferred_mappings[ thisfam_3_5]['pdb_end'])
pdb_start_max = max(inferred_mappings[ thisfam_3_5]['pdb_start'])
pdb_end_min = min(inferred_mappings[ thisfam_3_5]['pdb_end'])
pdb_start_min = min(inferred_mappings[thisfam_3_5]['pdb_start'])
pdb_end_max = max(inferred_mappings[thisfam_3_5]['pdb_end'])
pdb_start_max = max(inferred_mappings[thisfam_3_5]['pdb_start'])
pdb_end_min = min(inferred_mappings[thisfam_3_5]['pdb_end'])
if (pdb_start_max - pdb_start_min < 100) and (pdb_end_max - pdb_end_min < 100):
# the variation is only a few nucleotides, so we take the largest window.
inferred_mappings.loc[ thisfam_3_5, 'pdb_start'] = pdb_start_max
inferred_mappings.loc[ thisfam_3_5, 'pdb_end'] = pdb_end_min
inferred_mappings.loc[thisfam_3_5, 'pdb_start'] = pdb_start_max
inferred_mappings.loc[thisfam_3_5, 'pdb_end'] = pdb_end_min
else:
# there probably is an outlier. We choose the median value from the whole list of known_mappings.
known_sel_3_to_5 = (known_mappings['rfam_acc'] == rfam ) & (known_mappings['pdb_start'] > known_mappings['pdb_end'])
inferred_mappings.loc[ thisfam_3_5, 'pdb_start'] = known_mappings.loc[known_sel_3_to_5, 'pdb_start'].median()
inferred_mappings.loc[ thisfam_3_5, 'pdb_end'] = known_mappings.loc[known_sel_3_to_5, 'pdb_end'].median()
known_sel_3_to_5 = (known_mappings['rfam_acc'] == rfam) & (known_mappings['pdb_start'] > known_mappings['pdb_end'])
inferred_mappings.loc[thisfam_3_5, 'pdb_start'] = known_mappings.loc[known_sel_3_to_5, 'pdb_start'].median()
inferred_mappings.loc[thisfam_3_5, 'pdb_end'] = known_mappings.loc[known_sel_3_to_5, 'pdb_end'].median()
inferred_mappings.drop_duplicates(inplace=True)
# Now build Chain() objects for the mapped chains
......@@ -1958,7 +2054,8 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
pdb_chain_id = nr[2]
for rfam in families:
# if a known mapping of this chain on this family exists, apply it
m = known_mappings.loc[ (known_mappings.pdb_id + "|1|" + known_mappings.chain == c[:4].lower()+c[4:]) & (known_mappings['rfam_acc'] == rfam ) ]
this_chain_idxs = (known_mappings.pdb_id + "|1|" + known_mappings.chain == c[:4].lower()+c[4:])
m = known_mappings.loc[this_chain_idxs & (known_mappings['rfam_acc'] == rfam)]
if len(m) and len(m) < 2:
pdb_start = int(m.pdb_start)
pdb_end = int(m.pdb_end)
......@@ -1969,23 +2066,35 @@ def work_infer_mappings(update_only, allmappings, codelist) -> list:
pdb_start = int(m.pdb_start.min())
pdb_end = int(m.pdb_end.max())
inferred = False
elif not(pdb_id in known_mappings.pdb_id and pdb_chain_id in known_mappings.chain): # if no known mapping on another family, use the inferred mapping
pdb_start = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_start)
pdb_end = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_end)
elif (fullinference or not(this_chain_idxs.any())):
# if no known mapping on another family, use the inferred mapping
# idem if the user said to do so with --full-inference
pdb_start = int(inferred_mappings.loc[(inferred_mappings['rfam_acc'] == rfam)].pdb_start)
pdb_end = int(inferred_mappings.loc[(inferred_mappings['rfam_acc'] == rfam)].pdb_end)
inferred = True
else:
# skip this family, we cannot map this chain to it.
continue
chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}_{pdb_start}-{pdb_end}"
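# For illustration, with pdb_id="1ml5", pdb_model=1, pdb_chain_id="a", pdb_start=1, pdb_end=2914,
# the label is "1ml5_1_a_1-2914", i.e. the pdbid_model_chain_start-end convention used for the
# chain identifiers found elsewhere in the results.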
# Check if the chain exists in the database
if update_only:
with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn:
res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc='{rfam}' AND issue=0""")
res = sql_ask_database(conn, f"""SELECT chain_id from chain
WHERE structure_id='{pdb_id}'
AND chain_name='{pdb_chain_id}'
AND rfam_acc='{rfam}'
AND issue=0""")
if not len(res): # the chain is NOT yet in the database, or this is a known issue
newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end))
newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class,
rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end))
else:
newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end))
newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class,
rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end))
return newchains
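# Illustration of the inference above (hypothetical PDB codes): if the BGSU equivalence class
# contains 1abc|1|A, mapped by Rfam to RF00005 over residues 1-76, and 2xyz|1|B, which has no
# mapping, then 2xyz|1|B gets an inferred Chain() on RF00005 with a consensus interval built
# from the known mappings of the class. With --full-inference, chains that already have a
# mapping to another family also receive such inferred copies.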
@trace_unhandled_exceptions
def work_mmcif(pdb_id):
""" Look for a CIF file (with all chains) from RCSB
......@@ -1999,8 +2108,11 @@ def work_mmcif(pdb_id):
# Attempt to download it if not present
try:
if not path.isfile(final_filepath):
subprocess.run(["wget", f'http://files.rcsb.org/download/{pdb_id}.cif', "-O", final_filepath], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
if not os.path.isfile(final_filepath):
subprocess.run(
["wget", f'http://files.rcsb.org/download/{pdb_id}.cif', "-O", final_filepath],
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
)
except:
warn(f"Unable to download {pdb_id}.cif. Ignoring it.", error=True)
return
......@@ -2012,7 +2124,7 @@ def work_mmcif(pdb_id):
# if not, read the CIF header and register the structure
if not len(r):
# Load the MMCIF file with Biopython
mmCif_info = MMCIF2Dict(final_filepath)
mmCif_info = Bio.PDB.MMCIF2Dict.MMCIF2Dict(final_filepath)
# Get info about that structure
try:
......@@ -2036,9 +2148,9 @@ def work_mmcif(pdb_id):
# Save into the database
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
sql_execute(conn, """INSERT OR REPLACE INTO structure (pdb_id, pdb_model, date, exp_method, resolution)
VALUES (?, ?, DATE(?), ?, ?);""", data = (pdb_id, 1, date, exp_meth, reso))
VALUES (?, ?, DATE(?), ?, ?);""", data=(pdb_id, 1, date, exp_meth, reso))
if not path.isfile(path_to_3D_data + "annotations/" + pdb_id + ".json"):
if not os.path.isfile(path_to_3D_data + "annotations/" + pdb_id + ".json"):
# run DSSR (you need to have it in your $PATH, follow x3dna installation instructions)
output = subprocess.run(["x3dna-dssr", f"-i={final_filepath}", "--json", "--auxfile=no"],
......@@ -2052,22 +2164,23 @@ def work_mmcif(pdb_id):
return 1
# save the analysis to file only if we can load it :/
json_file = open(path_to_3D_data + "annotations/" + pdb_id + ".json", "w")
json_file = open(path_to_3D_data + "annotations/" +
pdb_id + ".json", "w")
json_file.write(stdout)
json_file.close()
return 0
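# In short, the function above: downloads <pdb_id>.cif from RCSB if it is missing,
# registers (date, experimental method, resolution) in the 'structure' table,
# and runs x3dna-dssr to produce annotations/<pdb_id>.json, which work_build_chain() reads.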
@trace_unhandled_exceptions
def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True):
"""Reads information from JSON and save it to database.
If asked, also extracts the 3D chains from their original structure files.
"""
setproctitle(f"RNAnet.py work_build_chain({c.chain_label})")
if not path.isfile(path_to_3D_data + "annotations/" + c.pdb_id + ".json"):
if not os.path.isfile(path_to_3D_data + "annotations/" + c.pdb_id + ".json"):
warn(f"Could not find annotations for {c.chain_label}, ignoring it.", error=True)
c.delete_me = True
c.error_messages += f"Could not download and/or find annotations for {c.chain_label}."
......@@ -2094,25 +2207,28 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True):
return c
@trace_unhandled_exceptions
def work_prepare_sequences(dl, rfam_acc, chains):
"""Prepares FASTA files of homologous sequences to realign with cmalign or SINA."""
"""Prepares FASTA files of homologous sequences to realign with cmalign or SINA.
"""
setproctitle("RNAnet.py work_prepare_sequences()")
if rfam_acc in LSU_set | SSU_set: # rRNA
if path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"):
if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"):
# Detect duplicates and remove them
existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta")
existing_ids = [ r.id for r in existing_afa ]
existing_afa = Bio.AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta")
existing_ids = [r.id for r in existing_afa]
del existing_afa
new_ids = [ str(c) for c in chains ]
doublons = [ i for i in existing_ids if i in new_ids ]
new_ids = [str(c) for c in chains]
doublons = [i for i in existing_ids if i in new_ids]
del existing_ids, new_ids
if len(doublons):
fasta = path_to_seq_data + f"realigned/{rfam_acc}++.fa"
warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.fa and using their newest version")
seqfile = SeqIO.parse(fasta, "fasta")
fasta = path_to_seq_data + f"realigned/{rfam_acc}++.fa"
seqfile = Bio.SeqIO.parse(fasta, "fasta")
# remove it and rewrite it with its own content filtered
os.remove(fasta)
with open(fasta, 'w') as f:
for rec in seqfile:
......@@ -2123,16 +2239,15 @@ def work_prepare_sequences(dl, rfam_acc, chains):
with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "a") as f:
for c in chains:
if len(c.seq_to_align):
f.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U','T')+'\n')
f.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U', 'T')+'\n')
status = f"{rfam_acc}: {len(chains)} new PDB sequences to align (with SINA)"
elif not path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.stk"):
elif not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.stk"):
# there were no previously aligned sequences, so we use cmalign.
# So, we need to download homologous sequences from Rfam.
# Extracting covariance model for this family
if not path.isfile(path_to_seq_data + f"realigned/{rfam_acc}.cm"):
if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}.cm"):
with open(path_to_seq_data + f"realigned/{rfam_acc}.cm", "w") as f:
subprocess.run(["cmfetch", path_to_seq_data + "Rfam.cm", rfam_acc], stdout=f)
notify(f"Extracted {rfam_acc} covariance model (cmfetch)")
......@@ -2141,7 +2256,7 @@ def work_prepare_sequences(dl, rfam_acc, chains):
dl.download_Rfam_sequences(rfam_acc)
# Prepare a FASTA file containing Rfamseq hits for that family
if path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"): # test if download succeeded
if os.path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"): # test if download succeeded
# gunzip the file
with gzip.open(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz", 'rb') as gz:
......@@ -2153,14 +2268,14 @@ def work_prepare_sequences(dl, rfam_acc, chains):
with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "w") as plusplus:
ids = set()
# Remove doublons from the Rfam hits
for r in SeqIO.parse(path_to_seq_data + f"realigned/{rfam_acc}.fa", "fasta"):
for r in Bio.SeqIO.parse(path_to_seq_data + f"realigned/{rfam_acc}.fa", "fasta"):
if r.id not in ids:
ids.add(r.id)
plusplus.write('> '+r.description+'\n'+str(r.seq)+'\n')
# Add the 3D chains sequences
for c in chains:
if len(c.seq_to_align):
plusplus.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U','T')+'\n')
plusplus.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U', 'T')+'\n')
del file_content
# os.remove(path_to_seq_data + f"realigned/{rfam_acc}.fa")
......@@ -2175,12 +2290,13 @@ def work_prepare_sequences(dl, rfam_acc, chains):
with open(path_to_seq_data + f"realigned/{rfam_acc}_new.fa", "w") as f:
for c in chains:
if len(c.seq_to_align):
f.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U','T')+'\n')
f.write(f"> {str(c)}\n"+c.seq_to_align.replace('-', '').replace('U', 'T')+'\n')
status = f"{rfam_acc}: {len(chains)} new PDB sequences to realign (with existing cmalign alignment)"
# print some stats
notify(status)
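# Summary of the three cases handled above:
#  - rRNA families (LSU_set | SSU_set): the new 3D sequences are appended to {rfam_acc}++.fa,
#    to be aligned with SINA;
#  - no {rfam_acc}++.stk yet: the covariance model is extracted with cmfetch and the Rfam hits
#    are concatenated with the 3D sequences into {rfam_acc}++.fa for a full cmalign run;
#  - an alignment already exists: only the new 3D sequences go to {rfam_acc}_new.fa,
#    to be aligned separately and merged into the existing alignment by work_realign().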
@trace_unhandled_exceptions
def work_realign(rfam_acc):
""" Runs multiple sequence alignements by RNA family.
......@@ -2209,10 +2325,10 @@ def work_realign(rfam_acc):
else:
# Align using Infernal for most RNA families
if path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"):
if os.path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"):
# Alignment exists. We just want to add new sequences into it.
if not path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"):
if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"):
# there are no new sequences to align...
return
......@@ -2227,13 +2343,13 @@ def work_realign(rfam_acc):
notify("Aligned new sequences together")
# Detect duplicates and remove them
existing_stk = AlignIO.read(existing_ali_path, "stockholm")
existing_ids = [ r.id for r in existing_stk ]
existing_stk = Bio.AlignIO.read(existing_ali_path, "stockholm")
existing_ids = [r.id for r in existing_stk]
del existing_stk
new_stk = AlignIO.read(new_ali_path, "stockholm")
new_ids = [ r.id for r in new_stk ]
new_stk = Bio.AlignIO.read(new_ali_path, "stockholm")
new_ids = [r.id for r in new_stk]
del new_stk
doublons = [ i for i in existing_ids if i in new_ids ]
doublons = [i for i in existing_ids if i in new_ids]
del existing_ids, new_ids
if len(doublons):
warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version")
......@@ -2241,12 +2357,13 @@ def work_realign(rfam_acc):
toremove.write('\n'.join(doublons)+'\n')
p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path], stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
os.remove(path_to_seq_data + "realigned/toremove.txt")
# And we merge the two alignments
p2= subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk",
"--rna", existing_ali_path, new_ali_path ],
p2 = subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk",
"--rna", existing_ali_path, new_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8')
subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path])
......@@ -2263,7 +2380,7 @@ def work_realign(rfam_acc):
p = subprocess.run(["cmalign", "--small", "--cyk", "--noprob", "--nonbanded", "--notrunc",
'-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk",
path_to_seq_data + f"realigned/{rfam_acc}.cm",
path_to_seq_data + f"realigned/{rfam_acc}++.fa" ],
path_to_seq_data + f"realigned/{rfam_acc}++.fa"],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
stderr = p.stderr.decode("utf-8")
......@@ -2277,7 +2394,9 @@ def work_realign(rfam_acc):
print('\t'+validsymb, flush=True)
# Convert Stockholm to aligned FASTA
subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa", "--informat", "stockholm", "afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"])
subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
"--informat", "stockholm",
"afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"])
subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part.
# Assert everything worked, or save an error
......@@ -2288,6 +2407,7 @@ def work_realign(rfam_acc):
with open(runDir + "/errors.txt", "a") as er:
er.write(f"Failed to realign {rfam_acc} (killed)")
def summarize_position(counts):
""" Counts the number of nucleotides at a given position, given a "column" from a MSA.
"""
......@@ -2303,15 +2423,15 @@ def summarize_position(counts):
N += counts[char] # number of ungapped residues
if N: # prevent division by zero if the column is only gaps
return ( counts['A']/N, counts['C']/N, counts['G']/N, counts['U']/N, (N - known_chars_count)/N) # other residues, or consensus (N, K, Y...)
return (counts['A']/N, counts['C']/N, counts['G']/N, counts['U']/N, (N - known_chars_count)/N) # other residues, or consensus (N, K, Y...)
else:
return (0, 0, 0, 0, 0)
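# Worked example (assuming, as the comments above suggest, that gap characters are not
# counted in N): for a column counted as {'A': 3, 'C': 1, '-': 6}, N = 4 and the function
# returns (0.75, 0.25, 0.0, 0.0, 0.0), i.e. (freq_A, freq_C, freq_G, freq_U, freq_other).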
@trace_unhandled_exceptions
def work_pssm(f, fill_gaps):
""" Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family.
Also saves every chain of the family to file.
Uses only 1 core, so this function can be called in parallel.
"""
......@@ -2323,18 +2443,17 @@ def work_pssm(f, fill_gaps):
# get the chains of this family
list_of_chains = rfam_acc_to_download[f]
chains_ids = [ str(c) for c in list_of_chains ]
chains_ids = [str(c) for c in list_of_chains]
# Open the alignment
try:
align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")
align = Bio.AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")
except:
warn(f"{f}'s alignment is wrong. Recompute it and retry.", error=True)
with open(runDir + "/errors.txt", "a") as errf:
errf.write(f"{f}'s alignment is wrong. Recompute it and retry.\n")
return 1
# Compute statistics per column
pssm = BufferingSummaryInfo(align).get_pssm(f, thr_idx)
frequencies = [ summarize_position(pssm[i]) for i in range(align.get_alignment_length()) ]
......@@ -2378,10 +2497,13 @@ def work_pssm(f, fill_gaps):
# Save the re_mappings
conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0)
sql_execute(conn, "INSERT INTO re_mapping (chain_id, index_chain, index_ali) VALUES (?, ?, ?) ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;", many=True, data=re_mappings)
sql_execute(conn, """INSERT INTO re_mapping (chain_id, index_chain, index_ali)
VALUES (?, ?, ?)
ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;""",
many=True, data=re_mappings)
# Save the useful columns in the database
data = [ (f, j) + frequencies[j-1] for j in sorted(columns_to_save) ]
data = [(f, j) + frequencies[j-1] for j in sorted(columns_to_save)]
sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other)
VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO
UPDATE SET freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, freq_other=excluded.freq_other;""", many=True, data=data)
......@@ -2412,12 +2534,13 @@ def work_pssm(f, fill_gaps):
pbar.close()
sql_execute(conn, f"""UPDATE nucleotide SET nt_align_code = ?,
is_A = ?, is_C = ?, is_G = ?, is_U = ?, is_other = ?
WHERE chain_id = ? AND index_chain = ?;""", many=True, data = gaps)
WHERE chain_id = ? AND index_chain = ?;""", many=True, data=gaps)
conn.close()
idxQueue.put(thr_idx) # replace the thread index in the queue
return 0
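# To sum up, work_pssm() stores three things per family: the re_mapping rows
# (chain_id, index_chain, index_ali), the align_column rows built from summarize_position()
# (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other), and (depending on the
# fill_gaps option, as its name suggests) updated nt_align_code / is_A..is_other values for
# gapped positions in the nucleotide table.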
@trace_unhandled_exceptions
def work_save(c, homology=True):
......@@ -2451,9 +2574,11 @@ def work_save(c, homology=True):
df.to_csv(filename, float_format="%.2f", index=False)
if __name__ == "__main__":
runDir = path.dirname(path.realpath(__file__))
runDir = os.getcwd()
fileDir = os.path.dirname(os.path.realpath(__file__))
ncores = read_cpu_number()
pp = Pipeline()
pp.process_options()
......@@ -2502,7 +2627,6 @@ if __name__ == "__main__":
print("Completed.")
exit(0)
# At this point, structure, chain and nucleotide tables of the database are up to date.
# (Modulo some statistics computed by statistics.py)
......@@ -2511,13 +2635,14 @@ if __name__ == "__main__":
# ===========================================================================
if pp.SELECT_ONLY is None:
pp.checkpoint_load_chains() # If your job failed, you can comment all the "3D information" part and start from here.
# If your job failed, you can comment all the "3D information" part and start from here.
pp.checkpoint_load_chains()
# Get the list of Rfam families found
rfam_acc_to_download = {}
for c in pp.loaded_chains:
if c.mapping.rfam_acc not in rfam_acc_to_download:
rfam_acc_to_download[c.mapping.rfam_acc] = [ c ]
rfam_acc_to_download[c.mapping.rfam_acc] = [c]
else:
rfam_acc_to_download[c.mapping.rfam_acc].append(c)
......@@ -2546,5 +2671,5 @@ if __name__ == "__main__":
print("Completed.") # This part of the code is supposed to release some serotonin in the modeller's brain, do not remove
# # so i can sleep for the end of the night
# so i can sleep for the end of the night
# subprocess.run(["poweroff"])
......
1ml5_1_a_1-2914
1ml5_1_a_151-2903
1ml5_1_A_7-1515
1ml5_1_A_2-1520
1ml5_1_A_7-1518
1ml5_1_b_5-121
1eg0_1_O_1-73
2rdo_1_A_3-118
4v48_1_A9_3-118
4v47_1_A9_3-118
6zmi_1_L8_1267-4755
6zm7_1_L8_1267-4755
6y6x_1_L8_1267-4755
6z6n_1_L8_1267-4755
6qzp_1_L8_1267-4755
6zme_1_L8_1267-4755
6z6l_1_L8_1267-4755
6ek0_1_L8_1267-4755
6zmo_1_L8_1267-4755
6z6m_1_L8_1267-4755
6ole_1_D_1267-4755
6om0_1_D_1267-4755
6y2l_1_L8_1267-4755
6lqm_1_8_1267-4755
6y0g_1_L8_1267-4755
6lu8_1_8_1267-4755
6lsr_1_8_1267-4755
6lss_1_8_1267-4755
6oli_1_D_1267-4755
6olg_1_A3_1267-4755
6y57_1_L8_1267-4755
5t2c_1_C_1267-4755
6om7_1_D_1267-4755
4ug0_1_L8_1267-4755
6olf_1_D_1267-4755
6ip5_1_1C_1267-4755
6ip8_1_1C_1267-4755
6olz_1_A3_1267-4755
5aj0_1_A3_1267-4755
5lks_1_L8_1267-4755
6ip6_1_1C_1267-4755
4v6x_1_A8_1267-4755
1vy7_1_AY_1-73
1vy7_1_CY_1-73
4w2h_1_CY_1-73
2z9q_1_A_1-72
1jgq_1_A_2-1520
4v42_1_AA_2-1520
1jgo_1_A_2-1520
1jgp_1_A_2-1520
1ml5_1_A_2-1520
4v42_1_BA_1-2914
1ml5_1_a_1-2914
4v42_1_BB_5-121
1ml5_1_b_5-121
2rdo_1_B_1-2904
4v48_1_A0_1-2904
4v47_1_A0_1-2904
4v48_1_BA_1-1543
4v47_1_BA_1-1542
1ls2_1_B_1-73
3ep2_1_Y_1-72
3eq3_1_Y_1-72
4v48_1_A6_1-73
1eg0_1_O_1-73
2z9q_1_A_1-72
1gsg_1_T_1-72
3jcr_1_H_1-115
4v42_1_BA_1-2914
4v42_1_BA_151-2903
4v48_1_BA_1-91
4v48_1_BA_6-1541
4v48_1_BA_1-1543
4v48_1_BA_6-1538
4v47_1_BA_1-91
4v47_1_BA_6-1540
4v47_1_BA_1-1542
4v47_1_BA_6-1537
2rdo_1_B_1-2903
2rdo_1_B_6-1460
2rdo_1_B_1-1528
2rdo_1_B_6-1457
2rdo_1_B_160-2893
2rdo_1_B_1-2904
2rdo_1_B_6-1522
4v48_1_A0_1-2903
4v48_1_A0_6-1460
4v48_1_A0_1-1528
4v48_1_A0_6-1457
4v48_1_A0_160-2893
4v48_1_A0_1-2904
4v48_1_A0_6-1522
4v47_1_A0_1-2903
4v47_1_A0_6-1460
4v47_1_A0_1-1528
4v47_1_A0_6-1457
4v47_1_A0_160-2893
4v47_1_A0_1-2904
4v47_1_A0_6-1522
1x1l_1_A_1-132
1zc8_1_Z_1-93
2ob7_1_D_1-132
2ob7_1_A_10-319
1x1l_1_A_1-130
1zc8_1_Z_1-130
1zc8_1_Z_1-91
2ob7_1_D_1-130
6rxu_1_C2_588-2386
6rxu_1_C2_583-2388
6rxu_1_C2_588-2383
5oql_1_2_588-2386
5oql_1_2_583-2388
5oql_1_2_588-2383
6rxv_1_C2_588-2386
6rxv_1_C2_583-2388
6rxv_1_C2_588-2383
6rxz_1_C2_588-2386
6rxz_1_C2_583-2388
6rxz_1_C2_588-2383
6rxy_1_C2_588-2386
6rxy_1_C2_583-2388
6rxy_1_C2_588-2383
6rxt_1_C2_588-2386
6rxt_1_C2_583-2388
6rxt_1_C2_588-2383
1r2x_1_C_1-58
1r2w_1_C_1-58
1eg0_1_L_1-57
1eg0_1_L_1-56
1jgq_1_A_7-1518
1jgq_1_A_20-55
1jgq_1_A_2-1520
1jgq_1_A_7-1515
4v42_1_AA_7-1518
4v42_1_AA_20-55
4v42_1_AA_2-1520
4v42_1_AA_7-1515
1jgo_1_A_7-1518
1jgo_1_A_20-55
1jgo_1_A_2-1520
1jgo_1_A_7-1515
1jgp_1_A_7-1518
1jgp_1_A_20-55
1jgp_1_A_2-1520
1jgp_1_A_7-1515
1zc8_1_A_1-59
1mvr_1_D_1-59
4c9d_1_D_29-1
4c9d_1_C_29-1
4adx_1_9_1-121
1mvr_1_D_1-61
4adx_1_9_1-123
1zn1_1_B_1-59
1emi_1_B_1-108
3iy9_1_A_498-1027
......@@ -143,25 +49,1558 @@
3cw1_1_V_1-138
3cw1_1_v_1-138
2iy3_1_B_9-105
3jcr_1_N_1-188
3jcr_1_N_1-106
3jcr_1_N_1-107
2vaz_1_A_64-177
2ftc_1_R_1-1568
2ftc_1_R_792-1568
2ftc_1_R_81-1466
3jcr_1_M_1-141
3jcr_1_M_1-188
3jcr_1_M_1-107
4v5z_1_B0_1-2899
4v5z_1_B0_1-2902
4v5z_1_B0_1-2840
5g2x_1_A_595-692
3iy8_1_A_1-540
4v5z_1_BY_2-113
4v5z_1_BZ_1-70
1mvr_1_B_1-96
4adx_1_0_1-2923
4adx_1_0_132-2915
4v5z_1_B1_2-125
1mvr_1_B_3-96
4adx_1_0_1-2925
3eq4_1_Y_1-69
6uz7_1_8_2140-2827
4v5z_1_AA_1-1563
4v5z_1_AA_1-1562
6cfj_1_1X
6cfj_1_2X
5hcq_1_1X
6cae_1_1X
5hcq_1_2X
5hcr_1_1X
4z8c_1_1X
5j4b_1_1X
5j4b_1_2X
4z8c_1_2X
6cae_1_2X
5j4c_1_1X
5w4k_1_1X
6of1_1_1X
5hcr_1_2X
5hd1_1_1X
5hcp_1_1X
6of1_1_2X
5hau_1_1W
5j4c_1_2X
5wis_1_1X
6xqd_1_1X
6nd5_1_1X
5w4k_1_2X
5hau_1_2W
6xqd_1_2X
4y4p_1_1X
6o97_1_1X
5hcp_1_2X
5doy_1_1X
4zer_1_1X
5wit_1_1X
5hd1_1_2X
6nd5_1_2X
4z3s_1_1X
7jql_1_1X
7jqm_1_1X
7jql_1_2X
5wis_1_2X
6nd6_1_1X
6o97_1_2X
4y4p_1_2X
7jqm_1_2X
4z3s_1_2X
4zer_1_2X
6uo1_1_2X
6uo1_1_1X
5doy_1_2X
5wit_1_2X
5f8k_1_1X
6nd6_1_2X
6xqe_1_1X
6xqe_1_2X
6n9e_1_1X
6n9e_1_2X
6n9f_1_1X
5f8k_1_2X
6n9f_1_2X
6xz7_1_F
6y69_1_W
5afi_1_V
5afi_1_W
6h4n_1_W
5wdt_1_V
5wfs_1_V
5wdt_1_W
5wfs_1_W
5we4_1_V
5we4_1_W
5uq8_1_Y
6c4i_1_Y
6c4i_1_X
5zeb_1_V
5zep_1_W
5lzd_1_V
5we6_1_V
5wfk_1_V
5wfk_1_W
5we6_1_W
5u4i_1_Y
5uq7_1_Y
5u4i_1_X
5lza_1_V
5wf0_1_V
5wf0_1_W
5zeu_1_V
5l3p_1_X
3jcj_1_V
6gxm_1_X
6gwt_1_X
6gxn_1_X
6gxo_1_X
3j9y_1_V
6o9k_1_Y
6o7k_1_V
5lzf_1_V
3jcn_1_V
5lzc_1_V
5u4j_1_X
5u4j_1_Z
5lzb_1_V
6h58_1_W
6h58_1_WW
1eg0_1_O
5j8b_1_X
4v7j_1_AV
4v7j_1_BV
4v7k_1_BV
4v7k_1_AV
4v7k_1_BW
4v7k_1_AW
4v7j_1_AW
4v7j_1_BW
4v4j_1_Z
6i0v_1_B
5k77_1_X
5k77_1_V
5k77_1_Y
5k77_1_W
5k77_1_Z
4pei_1_X
4pei_1_V
4pei_1_W
4pei_1_Z
4pei_1_Y
4a3c_1_P
4a3e_1_P
6lkq_1_U
7k00_1_B
6qdw_1_A
2rdo_1_A
4v48_1_A9
4v47_1_A9
6hcj_1_Q3
6hcq_1_Q3
5mmm_1_Z
4w2e_1_W
5j4b_1_1Y
6cfj_1_1W
5w4k_1_1Y
5wit_1_1W
6cfj_1_1Y
6cfj_1_2W
5j4c_1_1W
5wis_1_1Y
5j4c_1_1Y
6cfj_1_2Y
5wis_1_1W
5j4b_1_1W
5j4c_1_2W
5j4b_1_2W
5j4b_1_2Y
5j4c_1_2Y
5w4k_1_1W
6nd5_1_1Y
5wis_1_2Y
5wit_1_2W
5doy_1_1Y
5w4k_1_2Y
4y4p_1_1Y
4z3s_1_1Y
5doy_1_1W
5doy_1_2Y
6nd5_1_1W
4z3s_1_2Y
4z3s_1_1W
5w4k_1_2W
6nd5_1_2Y
4y4p_1_2Y
6uo1_1_2Y
6uo1_1_2W
4y4p_1_1W
4z3s_1_2W
6uo1_1_1Y
6uo1_1_1W
5wis_1_2W
5wit_1_1Y
6nd5_1_2W
4y4p_1_2W
5doy_1_2W
5wit_1_2Y
6ucq_1_1Y
4v4i_1_Z
6ucq_1_1X
6ucq_1_2Y
4w2e_1_X
6ucq_1_2X
6yss_1_W
5afi_1_Y
5uq8_1_Z
5wdt_1_Y
5wfs_1_Y
6ysr_1_W
5we4_1_Y
6yst_1_W
5uq7_1_Z
5we6_1_Y
5wfk_1_Y
5wf0_1_Y
6o9j_1_V
6ysu_1_W
3j46_1_A
5j8b_1_Y
5j8b_1_W
3bbv_1_Z
5aj0_1_BV
5aj0_1_BW
4wt8_1_AB
4wt8_1_BB
4v4j_1_Y
4v4i_1_Y
5uq8_1_X
5uq7_1_X
1jgq_1_A
4v42_1_AA
1jgo_1_A
1jgp_1_A
1ml5_1_A
4v4j_1_W
4v4i_1_W
4v42_1_BA
4wt8_1_CS
4wt8_1_DS
4v4j_1_X
4v4i_1_X
4v42_1_BB
6uu4_1_333
6uu0_1_333
6uuc_1_333
6uu2_1_333
6b6h_1_3
6pb4_1_3
6d30_1_C
6j7z_1_C
3er9_1_D
5kal_1_Y
4nia_1_3
5kal_1_Z
4nia_1_7
4nia_1_4
5new_1_C
4nia_1_U
4nia_1_6
4oq9_1_7
4nia_1_1
4oq9_1_4
4nia_1_8
4oq9_1_8
4nia_1_5
2vrt_1_E
4nia_1_W
4oq9_1_6
4oq8_1_D
4nia_1_Z
4oq9_1_W
4oq9_1_5
4nia_1_2
2vrt_1_F
4oq9_1_U
4oq9_1_Z
4oq9_1_2
4oq9_1_3
1ddl_1_E
4oq9_1_1
6rt5_1_A
6rt5_1_E
4qu6_1_B
6lkq_1_T
6qdw_1_B
3jbv_1_B
3jbu_1_B
2rdo_1_B
4v48_1_A0
4v47_1_A0
6do8_1_B
6dpi_1_B
6dp9_1_B
6dpb_1_B
6dmn_1_B
6dpp_1_B
6dpk_1_B
6dpd_1_B
6dot_1_B
6dok_1_B
6dp8_1_B
6dpl_1_B
6dpg_1_B
6dou_1_B
6dpc_1_B
6do9_1_B
6dmv_1_B
6dp4_1_B
6dpn_1_B
6doj_1_B
6dph_1_B
6dos_1_B
6doo_1_B
6dp6_1_B
6dox_1_B
6dp5_1_B
6dol_1_B
6dp1_1_B
6doz_1_B
6dp7_1_B
6doq_1_B
6dpa_1_B
6dom_1_B
6dog_1_B
6dop_1_B
6doh_1_B
6doa_1_B
6don_1_B
6dov_1_B
6dpo_1_B
6dod_1_B
6dob_1_B
6dow_1_B
6dpm_1_B
6dpf_1_B
6dp3_1_B
6dp2_1_B
6dpe_1_B
6dpj_1_B
6dor_1_B
6dof_1_B
6dp0_1_B
6doi_1_B
6doc_1_B
6doe_1_B
6n6g_1_D
6lkq_1_S
5h5u_1_H
5lze_1_Y
5lze_1_V
5lze_1_X
3jcj_1_G
6o7k_1_G
4v48_1_BA
4v47_1_BA
4b3r_1_W
4b3t_1_W
4b3s_1_W
5o2r_1_X
5kcs_1_1X
6fti_1_U
6fti_1_W
6ftj_1_U
6ftj_1_W
6ftg_1_U
6ftg_1_W
6ole_1_T
6om0_1_T
6oli_1_T
6om7_1_T
6olf_1_T
6w6l_1_T
6x1b_1_D
6x1b_1_F
5f6c_1_C
6i0t_1_B
1b2m_1_C
1b2m_1_D
1b2m_1_E
2uxc_1_Y
4a3g_1_P
4a3j_1_P
7k00_1_5
5mmi_1_Z
3j9m_1_U
6nu2_1_U
6nu3_1_U
5c0y_1_C
6n6f_1_D
4ohy_1_B
4oi1_1_B
4oi0_1_B
6raz_1_Y
5ipl_1_3
6utw_1_333
5ipm_1_3
5ipn_1_3
4ylo_1_3
4yln_1_6
4ylo_1_6
4yln_1_3
4yln_1_9
5lzf_1_Y
1n32_1_Z
5zsl_1_D
5zsd_1_C
5zsd_1_D
5zsl_1_E
4nku_1_D
4nku_1_H
1cwp_1_E
6qik_1_Y
6rzz_1_Y
6ri5_1_Y
6qt0_1_Y
6qtz_1_Y
6t83_1_1B
6t83_1_3B
6t83_1_AA
6t83_1_CA
6s05_1_Y
5jcs_1_X
5fl8_1_X
3erc_1_G
6of1_1_1W
6cae_1_1Y
6o97_1_1W
6of1_1_1Y
6of1_1_2W
6o97_1_1Y
6nd6_1_1Y
6cae_1_1W
6of1_1_2Y
6cae_1_2Y
6nd6_1_1W
6cae_1_2W
6o97_1_2Y
6nd6_1_2Y
6o97_1_2W
6nd6_1_2W
6xz7_1_G
6gz5_1_BW
6gz3_1_BW
1ls2_1_B
3ep2_1_Y
3eq3_1_Y
4v48_1_A6
2z9q_1_A
4hot_1_X
6d2z_1_C
4tu0_1_F
4tu0_1_G
6r9o_1_B
6is0_1_C
5lzc_1_X
5lzb_1_X
5lzd_1_Y
5lzc_1_Y
5lzb_1_Y
1gsg_1_T
6zvi_1_D
6sv4_1_NB
6sv4_1_NC
6i7o_1_NB
5y88_1_X
3j6x_1_IR
3j6y_1_IR
6tb3_1_N
6tnu_1_N
2uxb_1_X
2x1f_1_B
2x1a_1_B
3eq3_1_D
3ep2_1_D
1eg0_1_M
3eq4_1_D
5o1y_1_B
3jcr_1_H
6dzi_1_H
5zeu_1_A
6mpi_1_W
5mfx_1_B
5w0m_1_J
5bud_1_E
5w0m_1_I
5w0m_1_H
4j7m_1_B
5bud_1_D
6a4e_1_B
6a4e_1_D
6hxx_1_AA
6hxx_1_AB
6hxx_1_AC
6hxx_1_AD
6hxx_1_AE
6hxx_1_AF
6hxx_1_AG
6hxx_1_AH
6hxx_1_AI
6hxx_1_AJ
6hxx_1_AK
6hxx_1_AL
6hxx_1_AM
6hxx_1_AN
6hxx_1_AO
6hxx_1_AP
6hxx_1_AQ
6hxx_1_AR
6hxx_1_AS
6hxx_1_AT
6hxx_1_AU
6hxx_1_AV
6hxx_1_AW
6hxx_1_AX
6hxx_1_AY
6hxx_1_AZ
6hxx_1_BA
6hxx_1_BB
6hxx_1_BC
6hxx_1_BD
6hxx_1_BE
6hxx_1_BF
6hxx_1_BG
6hxx_1_BH
6hxx_1_BI
5odv_1_A
5odv_1_B
5odv_1_C
5odv_1_D
5odv_1_E
5odv_1_F
5odv_1_G
5odv_1_H
5odv_1_I
5odv_1_J
5odv_1_K
5odv_1_L
5odv_1_M
5odv_1_N
5odv_1_O
5odv_1_P
5odv_1_Q
5odv_1_R
5odv_1_S
5odv_1_T
5odv_1_U
5odv_1_V
5odv_1_W
5odv_1_X
6t34_1_A
6t34_1_B
6t34_1_C
6t34_1_D
6t34_1_E
6t34_1_F
6t34_1_G
6t34_1_H
6t34_1_I
6t34_1_J
6t34_1_K
6t34_1_L
6t34_1_M
6t34_1_N
6t34_1_O
6t34_1_P
6t34_1_Q
6t34_1_R
6t34_1_S
6ip8_1_ZY
6ip5_1_ZY
6ip5_1_ZU
6ip6_1_ZY
6ip8_1_ZZ
6ip6_1_ZZ
6uu3_1_333
6uu1_1_333
1pn8_1_D
3er8_1_H
3er8_1_G
3er8_1_F
5o3j_1_B
4dr7_1_B
1i5l_1_Y
1i5l_1_U
4dr6_1_B
6i2n_1_U
4v68_1_A0
6vyu_1_Y
6vyw_1_Y
6vz7_1_Y
6vz5_1_Y
6vz3_1_Y
6vyy_1_Y
6vyx_1_Y
6vyz_1_Y
6vz2_1_Y
1mvr_1_1
6vyt_1_Y
1cgm_1_I
3jb7_1_T
3jb7_1_M
3j0o_1_D
3j0l_1_D
3j0q_1_D
3j0p_1_D
5elt_1_F
5elt_1_E
2tmv_1_R
5a79_1_R
5a7a_1_R
2om3_1_R
2xea_1_R
4wtl_1_T
4wtl_1_P
1xnq_1_W
1x18_1_C
1x18_1_B
1x18_1_D
1vq6_1_4
4am3_1_D
4am3_1_H
4am3_1_I
4lj0_1_C
4lj0_1_D
4lj0_1_E
5lzy_1_HH
4wtj_1_T
4wtj_1_P
4xbf_1_D
6ow3_1_I
6ovy_1_I
6oy6_1_I
6n6d_1_D
6n6k_1_C
6n6k_1_D
3rtj_1_D
1apg_1_D
6ty9_1_M
6tz1_1_N
4bbl_1_Y
4bbl_1_Z
6sce_1_B
6scf_1_I
6scf_1_K
6yud_1_K
6yud_1_O
6scf_1_M
6yud_1_P
6scf_1_L
6yud_1_M
6yud_1_Q
6o6x_1_D
4ba2_1_R
6o6x_1_C
6o7b_1_C
6o6v_1_C
6r7b_1_D
6r9r_1_D
6ov0_1_E
6ov0_1_H
6ov0_1_G
6o6v_1_D
6ov0_1_F
6o7b_1_D
5e02_1_C
6r9r_1_E
6r7b_1_E
6o7i_1_I
6o7h_1_K
7jyy_1_F
7jyy_1_E
7jz0_1_F
7jz0_1_E
6rt6_1_A
6rt6_1_E
1y1y_1_P
5zuu_1_I
5zuu_1_G
4peh_1_W
4peh_1_V
4peh_1_X
4peh_1_Y
4peh_1_Z
6mkn_1_W
4cxg_1_C
4cxh_1_C
1x1l_1_A
1zc8_1_Z
2ob7_1_D
2ob7_1_A
4eya_1_E
4eya_1_F
4eya_1_Q
4eya_1_R
2r1g_1_B
4ht9_1_E
1cvj_1_M
6z1p_1_AB
6z1p_1_AA
4ii9_1_C
5mq0_1_3
5uk4_1_X
5uk4_1_V
5uk4_1_W
5uk4_1_U
5f6c_1_E
4rcj_1_B
1xnr_1_W
6e0o_1_C
6o75_1_D
6o75_1_C
6e0o_1_B
3j06_1_R
1r2x_1_C
1r2w_1_C
1eg0_1_L
4eya_1_G
4eya_1_H
4eya_1_S
4eya_1_T
4dr4_1_V
1ibl_1_Z
1ibm_1_Z
4dr5_1_V
4d61_1_J
1trj_1_B
1trj_1_C
6q8y_1_N
6sv4_1_N
6i7o_1_N
5k8h_1_A
5z4a_1_B
3jbu_1_V
1h2c_1_R
1h2d_1_S
1h2d_1_R
6szs_1_X
5mgp_1_X
6enu_1_X
6enf_1_X
6enj_1_X
1pvo_1_L
1pvo_1_G
1pvo_1_H
1pvo_1_J
1pvo_1_K
2ht1_1_K
2ht1_1_J
6eri_1_AX
1zc8_1_A
1zc8_1_C
1zc8_1_B
1zc8_1_G
1zc8_1_I
1zc8_1_H
1zc8_1_J
4v8z_1_CX
6kqe_1_I
5uh8_1_I
5vi5_1_Q
4xln_1_T
4xlr_1_T
4xln_1_Q
5i2d_1_K
5i2d_1_V
4xlr_1_Q
6sty_1_C
6sty_1_F
2xs5_1_D
3ok4_1_N
3ok4_1_L
3ok4_1_Z
3ok4_1_4
3ok4_1_V
3ok4_1_X
3ok4_1_P
3ok4_1_H
3ok4_1_J
3ok4_1_R
3ok4_1_T
3ok4_1_2
6n6h_1_D
5wnt_1_B
3b0u_1_B
3b0u_1_A
4x9e_1_G
4x9e_1_H
6z1p_1_BB
6z1p_1_BA
2uxd_1_X
4qvd_1_H
4v7e_1_AB
3ol9_1_D
3ol9_1_H
3ol9_1_L
3ol9_1_P
3olb_1_L
3olb_1_P
3olb_1_D
3olb_1_H
3ol6_1_D
3ol6_1_H
3ol6_1_L
3ol6_1_P
3ol8_1_D
3ol8_1_H
3ol7_1_L
3ol7_1_P
3ol7_1_D
3ol7_1_H
3ol8_1_L
3ol8_1_P
1qzc_1_C
1qzc_1_A
6ole_1_V
6om0_1_V
6oli_1_V
6om7_1_V
6w6l_1_V
6olf_1_V
1mvr_1_D
4wtm_1_T
4wtm_1_P
5x70_1_E
5x70_1_G
6gz5_1_BV
6gz4_1_BV
6gz3_1_BV
6fti_1_Q
4v7e_1_AE
4v7e_1_AD
4x62_1_B
4x64_1_B
4x65_1_B
1xmq_1_W
4x66_1_B
3t1h_1_W
3t1y_1_W
1xmo_1_W
4adx_1_9
6kr6_1_B
1zn1_1_B
6z8k_1_X
1cvj_1_Q
4csf_1_U
4csf_1_Q
4csf_1_G
4csf_1_M
4csf_1_K
4csf_1_A
4csf_1_I
4csf_1_S
4csf_1_C
4csf_1_W
4csf_1_O
4csf_1_E
1cvj_1_N
1cvj_1_O
1cvj_1_S
1cvj_1_P
1cvj_1_T
1cvj_1_R
6th6_1_AA
6skg_1_AA
6skf_1_AA
6q8y_1_M
6i7o_1_M
6zmw_1_W
6ybv_1_W
2fz2_1_D
2xpj_1_D
2vrt_1_H
2vrt_1_G
1emi_1_B
6r9m_1_B
4nia_1_C
4nia_1_A
4nia_1_H
4nia_1_N
4nia_1_G
4nia_1_D
4nia_1_B
4nia_1_I
4nia_1_E
4nia_1_M
4oq9_1_I
4oq9_1_G
4oq9_1_C
4oq9_1_H
4oq9_1_N
4oq9_1_A
4oq9_1_D
4oq9_1_E
4oq9_1_M
4oq9_1_B
5uhc_1_I
1uvn_1_F
1uvn_1_B
1uvn_1_D
3iy9_1_A
4wtk_1_T
4wtk_1_P
1vqn_1_4
4oav_1_C
4oav_1_A
3ep2_1_E
3eq3_1_E
3eq4_1_E
3ep2_1_A
3eq3_1_A
3eq4_1_A
3ep2_1_C
3eq3_1_C
3eq4_1_C
3ep2_1_B
3eq3_1_B
3eq4_1_B
4i67_1_B
3pgw_1_R
3pgw_1_N
3cw1_1_X
3cw1_1_W
3cw1_1_V
5it9_1_I
6k32_1_T
6k32_1_P
5mmj_1_A
5x8r_1_A
3j2k_1_3
3j2k_1_2
3j2k_1_1
3j2k_1_0
3j2k_1_4
3nvk_1_G
3nvk_1_S
2iy3_1_B
1cwp_1_F
5z4j_1_B
5gmf_1_E
5gmf_1_H
6e4p_1_J
5gmf_1_F
5gmf_1_G
5gmg_1_D
5gmg_1_C
6e4p_1_K
3ie1_1_E
3ie1_1_H
3ie1_1_F
4dr7_1_V
3ie1_1_G
3s4g_1_C
3s4g_1_B
2qqp_1_R
2zde_1_E
2zde_1_F
2zde_1_H
2zde_1_G
1nb7_1_E
1nb7_1_F
4hos_1_X
3p6y_1_T
3p6y_1_V
3p6y_1_U
3p6y_1_Q
3p6y_1_W
5dto_1_B
4cxh_1_X
1uvj_1_F
1uvj_1_D
1uvj_1_E
6kqd_1_I
6kqd_1_S
5uh5_1_I
1ytu_1_F
1ytu_1_D
4kzz_1_J
5t2c_1_AN
4v5z_1_BF
3j6b_1_E
4v4f_1_B6
4v4f_1_A5
4v4f_1_A3
4v4f_1_B0
4v4f_1_B9
4v4f_1_A2
4v4f_1_A8
4v4f_1_A1
4v4f_1_A9
4v4f_1_BZ
4v4f_1_B8
4v4f_1_B7
4v4f_1_B5
4v4f_1_A0
4v4f_1_A7
4v4f_1_A4
4v4f_1_AZ
4v4f_1_B3
4v4f_1_B1
4v4f_1_B4
4v4f_1_A6
4v4f_1_B2
5flx_1_Z
5zsb_1_C
5zsb_1_D
5zsn_1_D
5zsn_1_E
3jcr_1_N
6gfw_1_R
2vaz_1_A
1qzc_1_B
1mvr_1_C
4v5z_1_BP
6n6e_1_D
4g7o_1_I
4g7o_1_S
5x22_1_S
5x22_1_I
5x21_1_I
5uh6_1_I
6l74_1_I
5uh9_1_I
2ftc_1_R
6sag_1_R
4udv_1_R
2r1g_1_E
5zsc_1_D
5zsc_1_C
6woy_1_I
6wox_1_I
6evj_1_N
6evj_1_M
4gkk_1_W
4v9e_1_AG
4v9e_1_BM
4v9e_1_AM
4v9e_1_AA
4v9e_1_BA
4v9e_1_BG
5lzs_1_II
6fqr_1_C
6ha1_1_X
5kcr_1_1X
2r1g_1_X
3m7n_1_Z
3m85_1_X
3m85_1_Z
3m85_1_Y
1e8s_1_C
5wnp_1_B
5wnv_1_B
5yts_1_B
1utd_1_6
1utd_1_Z
1utd_1_4
1utd_1_7
1utd_1_9
1utd_1_5
1utd_1_3
1utd_1_2
1utd_1_8
1utd_1_1
6n6i_1_C
6n6i_1_D
6n6a_1_D
6ij2_1_F
6ij2_1_G
6ij2_1_H
6ij2_1_E
3u2e_1_D
3u2e_1_C
5uef_1_C
5uef_1_D
4x4u_1_H
4afy_1_D
6oy5_1_I
6owl_1_B
6owl_1_C
4afy_1_C
4lq3_1_R
6s0m_1_C
6gx6_1_B
4k4s_1_D
4k4s_1_H
4k4t_1_H
4k4t_1_D
1zn1_1_C
1zn0_1_C
1xpu_1_G
1xpu_1_L
1xpr_1_L
1xpu_1_H
1xpo_1_K
1xpo_1_J
1xpu_1_J
1xpo_1_H
1xpr_1_J
1xpu_1_K
1xpr_1_K
1xpo_1_M
1xpo_1_L
1xpu_1_M
1xpr_1_M
1xpo_1_G
1xpr_1_H
1xpr_1_G
6gc5_1_F
6gc5_1_H
6gc5_1_G
4v7e_1_AA
4v7e_1_AC
1n1h_1_B
4ohz_1_B
6t83_1_6B
4gv6_1_C
4gv6_1_B
4gv3_1_C
4gv3_1_B
4gv9_1_E
6i7o_1_L
2a8v_1_D
6qx3_1_G
2xnr_1_C
4gkj_1_W
4v5z_1_BC
4v5z_1_BB
4v5z_1_BH
3j0o_1_F
3j0l_1_F
3j0p_1_F
3j0q_1_F
3j0o_1_B
3j0l_1_B
3j0o_1_C
3j0l_1_C
3j0q_1_C
3j0p_1_C
3j0o_1_A
3j0l_1_A
3j0q_1_A
3j0p_1_A
1cwp_1_D
4v5z_1_BJ
5sze_1_C
6wre_1_D
6i0u_1_B
5zsa_1_C
5zsa_1_D
1n34_1_Z
3pf5_1_S
6ppn_1_A
6ppn_1_I
6qdw_1_V
5hk0_1_F
4qm6_1_D
4qm6_1_C
4jzu_1_C
4jzv_1_C
5ytv_1_B
4k4z_1_P
4k4z_1_D
4k4x_1_L
4k4z_1_L
4k4x_1_D
4k4z_1_H
4k4x_1_H
4k4x_1_P
1t1m_1_A
1t1m_1_B
4a3b_1_P
4a3m_1_P
6u6y_1_E
6u6y_1_G
6u6y_1_F
6u6y_1_H
6qik_1_X
6rzz_1_X
6ri5_1_X
6qt0_1_X
6qtz_1_X
6s05_1_X
6t83_1_BB
6t83_1_4B
5fl8_1_Z
5jcs_1_Z
5mrc_1_BB
5mre_1_BB
5mrf_1_BB
6gz4_1_BW
3j46_1_P
3jcr_1_M
4e6b_1_A
4e6b_1_B
6a6l_1_D
4v5z_1_BS
4v8t_1_1
1uvi_1_D
1uvi_1_F
1uvi_1_E
4m7d_1_P
4k4u_1_D
4k4u_1_H
6rt7_1_E
6rt7_1_A
2voo_1_C
2voo_1_D
5k78_1_X
5k78_1_Y
4ylo_1_9
4kzy_1_I
4kzz_1_I
4kzx_1_I
5vyc_1_I2
5vyc_1_I3
5vyc_1_I5
5vyc_1_I1
5vyc_1_I6
5vyc_1_I4
6ip8_1_2M
6ip5_1_2M
6ip6_1_2M
6qcs_1_M
486d_1_G
2r1g_1_C
486d_1_F
4v5z_1_B0
4nia_1_O
4nia_1_J
4nia_1_K
4nia_1_L
4nia_1_F
4oq9_1_K
4oq9_1_O
4oq9_1_J
4oq9_1_F
4oq9_1_L
5tbw_1_SR
6hhq_1_SR
6zvi_1_H
6sv4_1_2B
6sv4_1_2C
6t83_1_2B
6t83_1_A
6i7o_1_2B
6r9q_1_B
6v3a_1_SN1
6v3b_1_SN1
6v39_1_SN1
6v3e_1_SN1
1pn7_1_C
1mj1_1_Q
1mj1_1_R
4dr6_1_V
6kql_1_I
4eya_1_M
4eya_1_N
4eya_1_A
4eya_1_B
2wj8_1_D
2wj8_1_I
2wj8_1_L
2wj8_1_F
2wj8_1_C
2wj8_1_Q
2wj8_1_J
2wj8_1_P
2wj8_1_K
2wj8_1_E
2wj8_1_T
2wj8_1_B
2wj8_1_O
2wj8_1_N
2wj8_1_A
2wj8_1_H
2wj8_1_R
2wj8_1_M
2wj8_1_S
2wj8_1_G
4e6b_1_E
4e6b_1_F
6p71_1_I
3pdm_1_R
5det_1_P
5els_1_I
4n2s_1_B
4yoe_1_E
3j0o_1_H
3j0l_1_H
3j0p_1_H
3j0q_1_H
5gxi_1_B
3iy8_1_A
6tnu_1_M
5mc6_1_M
5mc6_1_N
4eya_1_O
4eya_1_P
4eya_1_C
4eya_1_D
6htq_1_V
6htq_1_W
6htq_1_U
6uu6_1_333
6v3a_1_V
6v39_1_V
5a0v_1_F
3avt_1_T
6d1v_1_C
4s2x_1_B
4s2y_1_B
5wnu_1_B
1zc8_1_F
1vtm_1_R
4v5z_1_BA
4v5z_1_BE
4v5z_1_BD
4v5z_1_BG
4v5z_1_BI
4v5z_1_BK
4v5z_1_BM
4v5z_1_BL
4v5z_1_BV
4v5z_1_BO
4v5z_1_BN
4v5z_1_BQ
4v5z_1_BR
4v5z_1_BT
4v5z_1_BU
4v5z_1_BW
4v5z_1_BY
4v5z_1_BX
4v5z_1_BZ
6u9x_1_H
6u9x_1_K
5elk_1_R
6okk_1_G
4cxg_1_A
4cxh_1_A
6bk8_1_I
4cxg_1_B
4cxh_1_B
4v5z_1_B1
5z4d_1_B
6o78_1_E
6ha8_1_X
1m8w_1_E
1m8w_1_F
5udi_1_B
5udl_1_B
5udk_1_B
5udj_1_B
5w5i_1_B
5w5i_1_D
5w5h_1_B
5w5h_1_D
4eya_1_K
4eya_1_L
4eya_1_I
4eya_1_J
4g9z_1_E
4g9z_1_F
3nma_1_B
3nma_1_C
6een_1_G
6een_1_I
6een_1_H
4wti_1_T
4wti_1_P
5l3p_1_Y
4hor_1_X
3rzo_1_R
2f4v_1_Z
1qln_1_R
2xs7_1_B
6zvi_1_E
6sv4_1_MC
6sv4_1_MB
6i7o_1_MB
6ogy_1_M
6ogy_1_N
6uej_1_B
1x18_1_A
5ytx_1_B
6o8w_1_U
4g0a_1_H
6r9p_1_B
3koa_1_C
4n48_1_D
4n48_1_G
6kug_1_B
6ktc_1_V
6ole_1_U
6om0_1_U
6olg_1_BV
6oli_1_U
6om7_1_U
6w6l_1_U
6olz_1_BV
6olf_1_U
5lzd_1_X
6m7k_1_B
3cd6_1_4
3cma_1_5
6n9e_1_2W
1vqo_1_4
1qvg_1_3
3cme_1_5
5lzd_1_W
5lze_1_W
5lzc_1_W
5lzb_1_W
3wzi_1_C
1mvr_1_E
1mvr_1_B
1mvr_1_A
4adx_1_0
4adx_1_8
1n33_1_Z
6dti_1_W
3d2s_1_F
3d2s_1_H
5mrc_1_AA
5mre_1_AA
5mrf_1_AA
5fl8_1_Y
5jcs_1_Y
2r1g_1_A
2r1g_1_D
2r1g_1_F
3eq4_1_Y
4wkr_1_C
4v99_1_EC
4v99_1_AC
4v99_1_BH
4v99_1_CH
4v99_1_AM
4v99_1_DC
4v99_1_JW
4v99_1_EH
4v99_1_BW
4v99_1_FW
4v99_1_AW
4v99_1_BC
4v99_1_BM
4v99_1_IC
4v99_1_EM
4v99_1_ER
4v99_1_IW
4v99_1_JH
4v99_1_JR
4v99_1_AH
4v99_1_GR
4v99_1_IR
4v99_1_BR
4v99_1_CW
4v99_1_HR
4v99_1_FH
4v99_1_HC
4v99_1_DW
4v99_1_GC
4v99_1_JC
4v99_1_DM
4v99_1_EW
4v99_1_AR
4v99_1_CR
4v99_1_JM
4v99_1_CC
4v99_1_IH
4v99_1_FR
4v99_1_CM
4v99_1_IM
4v99_1_FM
4v99_1_FC
4v99_1_GH
4v99_1_HM
4v99_1_HH
4v99_1_DR
4v99_1_HW
4v99_1_GW
4v99_1_DH
4v99_1_GM
6rt4_1_D
6rt4_1_C
6zvh_1_X
4dwa_1_D
6n6c_1_D
6n6j_1_C
6n6j_1_D
6p7q_1_E
6p7q_1_F
6p7q_1_D
6rcl_1_C
5jju_1_C
4ejt_1_G
5ceu_1_C
5ceu_1_D
6lkq_1_W
3qsu_1_P
3qsu_1_R
1n38_1_B
4qvc_1_G
6q1h_1_D
6q1h_1_H
6p7p_1_F
6p7p_1_E
6p7p_1_D
6vm6_1_J
6vm6_1_G
6wan_1_K
6wan_1_H
6wan_1_G
6wan_1_L
6wan_1_I
6ywo_1_F
6wan_1_J
4oau_1_A
6ywo_1_E
6ywo_1_K
6vm6_1_I
6vm6_1_H
6ywo_1_I
2a1r_1_C
2a1r_1_D
3gpq_1_E
3gpq_1_F
6o79_1_C
6vm6_1_K
6hyu_1_D
1laj_1_R
6ybv_1_K
6mpf_1_W
6spc_1_A
6spe_1_A
6fti_1_V
6ftj_1_V
6ftg_1_V
4g0a_1_G
4g0a_1_F
4g0a_1_E
2b2d_1_S
5hkc_1_C
1rmv_1_B
4qu7_1_X
4qu7_1_V
4qu7_1_U
4v5z_1_AH
4v5z_1_AA
4v5z_1_AB
4v5z_1_AC
4v5z_1_AD
4v5z_1_AE
4v5z_1_AF
4v5z_1_AG
6pmi_1_3
6pmj_1_3
5hjz_1_C
......
......@@ -11,7 +11,7 @@
# - Use a specialised database (SILVA): better alignments (we guess?), but two kinds of jobs
# - Use cmalign --small everywhere (homogeneity)
# Moreover, --small requires --nonbanded --cyk, which means the output alignment is the optimally scored one.
# To date, we trust Infernal as the best tool to realign RNA. Is it ?
# To date, we trust Infernal as the best tool to realign ncRNA. Is it ?
# Contact: louis.becquey@univ-evry.fr (PhD student), fariza.tahi@univ-evry.fr (PI)
......@@ -28,7 +28,7 @@ pd.set_option('display.max_rows', None)
LSU_set = ["RF00002", "RF02540", "RF02541", "RF02543", "RF02546"] # From Rfam CLAN 00112
SSU_set = ["RF00177", "RF02542", "RF02545", "RF01959", "RF01960"] # From Rfam CLAN 00111
with sqlite3.connect("results/RNANet.db") as conn:
with sqlite3.connect(os.getcwd()+"/results/RNANet.db") as conn:
df = pd.read_sql("SELECT rfam_acc, max_len, nb_total_homol, comput_time, comput_peak_mem FROM family;", conn)
to_remove = [ f for f in df.rfam_acc if f in LSU_set+SSU_set ]
......@@ -74,7 +74,7 @@ ax.set_ylabel("Maximum length of sequences ")
ax.set_zlabel("Computation time (s)")
plt.subplots_adjust(wspace=0.4)
plt.savefig("results/cmalign_jobs_performance.png")
plt.savefig(os.getcwd()+"/results/cmalign_jobs_performance.png")
# # ========================================================
# # Linear Regression of max_mem as function of max_length
......
......@@ -3,7 +3,6 @@
# This file computes additional statistics over the produced dataset.
# Run this file if you want the base counts, pair-type counts, identity percents, etc
# in the database.
# This should be run from the folder where the file is (to access the database with path "results/RNANet.db")
import getopt, os, pickle, sqlite3, shlex, subprocess, sys
import numpy as np
......@@ -22,34 +21,35 @@ from multiprocessing import Pool, Manager
from os import path
from tqdm import tqdm
from collections import Counter
from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker
from setproctitle import setproctitle
from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker, trace_unhandled_exceptions
path_to_3D_data = "tobedefinedbyoptions"
path_to_seq_data = "tobedefinedbyoptions"
runDir = os.getcwd()
res_thr = 20.0 # default: all structures
LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112
SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111
def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
@trace_unhandled_exceptions
def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=2.0):
"""
Plot the joint distribution of pseudotorsion angles, in a Ramachandran-style graph.
See Wadley & Pyle (2007)
See Wadley & Pyle (2007).
Only unique unmapped chains with resolution < res argument are considered.
Arguments:
show: True or False, call plt.show() at this end or not
filter_helical: None, "form", "zone", or "both"
None: do not remove helical nucleotide
"form": remove nucleotides if they belong to a A, B or Z form stem
"zone": remove nucleotides falling in an arbitrary zone (see zone argument)
"both": remove nucleotides fulfilling one or both of the above conditions
carbon: 1 or 4, use C4' (eta and theta) or C1' (eta_prime and theta_prime)
show: True or False, call plt.show() at the end or not
sd_range: tuple, set values below avg + sd_range[0] * stdev to 0,
and values above avg + sd_range[1] * stdev to avg + sd_range[1] * stdev.
This removes noise and cuts too high peaks, to clearly see the clusters.
res: Resolution threshold (in Angströms); only nucleotides belonging to
structures with resolution <= res are considered.
"""
os.makedirs("results/figures/wadley_plots/", exist_ok=True)
os.makedirs(runDir + "/results/figures/wadley_plots/", exist_ok=True)
if carbon == 4:
angle = "eta"
......@@ -63,30 +63,32 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
exit("You overestimate my capabilities !")
if not path.isfile(f"data/wadley_kernel_{angle}_{res}A.npz"):
if not path.isfile(runDir + f"/data/wadley_kernel_{angle}_{res}A.npz"):
# Get a worker number to position the progress bar
global idxQueue
thr_idx = idxQueue.get()
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} reproduce_wadley_results(carbon={carbon})")
pbar = tqdm(total=2, desc=f"Worker {thr_idx+1}: eta/theta C{carbon} kernels", position=thr_idx+1, leave=False)
# Extract the angle values of c2'-endo and c3'-endo nucleotides
with sqlite3.connect("results/RNANet.db") as conn:
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
df = pd.read_sql(f"""SELECT {angle}, th{angle}
FROM nucleotide JOIN (
SELECT chain_id FROM chain JOIN structure
WHERE structure.resolution <= {res}
) AS c
FROM (
SELECT chain_id FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
WHERE chain.rfam_acc = 'unmappd' AND structure.resolution <= {res} AND issue = 0
) AS c NATURAL JOIN nucleotide
WHERE puckering="C2'-endo"
AND {angle} IS NOT NULL
AND th{angle} IS NOT NULL;""", conn)
c2_endo_etas = df[angle].values.tolist()
c2_endo_thetas = df["th"+angle].values.tolist()
df = pd.read_sql(f"""SELECT {angle}, th{angle}
FROM nucleotide JOIN (
SELECT chain_id FROM chain JOIN structure
WHERE structure.resolution <= {res}
) AS c
FROM (
SELECT chain_id FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
WHERE chain.rfam_acc = 'unmappd' AND structure.resolution <= {res} AND issue = 0
) AS c NATURAL JOIN nucleotide
WHERE form = '.'
AND puckering="C3'-endo"
AND {angle} IS NOT NULL
......@@ -111,14 +113,16 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
pbar.update(1)
# Save the data to an archive for later use without the need to recompute
np.savez(f"data/wadley_kernel_{angle}_{res}A.npz",
np.savez(runDir + f"/data/wadley_kernel_{angle}_{res}A.npz",
c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas,
c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas,
kernel_c3=f_c3, kernel_c2=f_c2)
pbar.close()
idxQueue.put(thr_idx)
else:
f = np.load(f"data/wadley_kernel_{angle}_{res}A.npz")
setproctitle(f"RNANet statistics.py reproduce_wadley_results(carbon={carbon})")
f = np.load(runDir + f"/data/wadley_kernel_{angle}_{res}A.npz")
c2_endo_etas = f["c2_endo_e"]
c3_endo_etas = f["c3_endo_e"]
c2_endo_thetas = f["c2_endo_t"]
......@@ -148,7 +152,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
f_low_thr = f.mean() + sd_range[0]*f.std()
f_cut = np.where(f > f_sup_thr, f_sup_thr, f)
f_cut = np.where(f_cut < f_low_thr, 0, f_cut)
levels = [f.mean()+f.std(), f.mean()+2*f.std(), f.mean()+4*f.std()]
levels = [ f.mean()+f.std(), f.mean()+2*f.std(), f.mean()+4*f.std()]
# histogram:
fig = plt.figure()
......@@ -157,7 +161,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
ax.bar3d(xpos.ravel(), ypos.ravel(), 0.0, 0.09, 0.09, hist_cut.ravel(), color=color_values, zorder="max")
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}_{res}A.png")
fig.savefig(runDir + f"/results/figures/wadley_plots/wadley_hist_{angle}_{l}_{res}A.png")
if show:
fig.show()
plt.close()
......@@ -168,7 +172,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
ax.plot_surface(xx, yy, f_cut, cmap=cm.get_cmap("coolwarm"), linewidth=0, antialiased=True)
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}_{res}A.png")
fig.savefig(runDir + f"/results/figures/wadley_plots/wadley_distrib_{angle}_{l}_{res}A.png")
if show:
fig.show()
plt.close()
......@@ -177,10 +181,10 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
fig = plt.figure(figsize=(5,5))
ax = fig.gca()
ax.scatter(x, y, s=1, alpha=0.1)
ax.contourf(xx, yy, f_cut, alpha=0.5, cmap=cm.get_cmap("coolwarm"), levels=levels, extend="max")
ax.contourf(xx, yy, f, alpha=0.5, cmap=cm.get_cmap("coolwarm"), levels=levels, extend="max")
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}_{res}A.png")
fig.savefig(runDir + f"/results/figures/wadley_plots/wadley_{angle}_{l}_{res}A.png")
if show:
fig.show()
plt.close()
......@@ -188,10 +192,13 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
def stats_len():
"""Plots statistics on chain lengths in RNA families.
Uses all chains mapped to a family including copies, inferred or not.
REQUIRES tables chain, nucleotide up to date.
"""
setproctitle(f"RNANet statistics.py stats_len({res_thr})")
# Get a worker number to position the progress bar
global idxQueue
thr_idx = idxQueue.get()
......@@ -214,7 +221,7 @@ def stats_len():
cols = []
lengths = []
for i,f in enumerate(tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Average chain lengths", leave=False)):
for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Average chain lengths", leave=False):
# Define a color for that family in the plot
if f in LSU_set:
......@@ -229,7 +236,7 @@ def stats_len():
cols.append("grey")
# Get the lengths of chains
with sqlite3.connect("results/RNANet.db") as conn:
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
l = [ x[0] for x in sql_ask_database(conn, f"""SELECT COUNT(index_chain)
FROM (
SELECT chain_id
......@@ -239,8 +246,6 @@ def stats_len():
GROUP BY chain_id;""", warn_every=0) ]
lengths.append(l) # list of chain lengths from the family
# notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths")
# Plot the figure
fig = plt.figure(figsize=(10,3))
ax = fig.gca()
......@@ -267,7 +272,7 @@ def stats_len():
ncol=1, fontsize='small', bbox_to_anchor=(1.3, 0.5))
# Save the figure
fig.savefig(f"results/figures/lengths_{res_thr}A.png")
fig.savefig(runDir + f"/results/figures/lengths_{res_thr}A.png")
idxQueue.put(thr_idx) # replace the thread index in the queue
# notify("Computed sequence length statistics and saved the figure.")
......@@ -285,6 +290,7 @@ def format_percentage(tot, x):
def stats_freq():
"""Computes base frequencies in all RNA families.
Uses all chains mapped to a family including copies, inferred or not.
Outputs results/frequencies.csv
REQUIRES tables chain, nucleotide up to date."""
......@@ -293,17 +299,18 @@ def stats_freq():
global idxQueue
thr_idx = idxQueue.get()
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} stats_freq()")
# Initialize a Counter object for each family
freqs = {}
for f in fam_list:
freqs[f] = Counter()
# List all nt_names happening within a RNA family and store the counts in the Counter
for i,f in enumerate(tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False)):
with sqlite3.connect("results/RNANet.db") as conn:
for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False):
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;", warn_every=0))
freqs[f].update(counts)
# notify(f"[{i+1}/{len(fam_list)}] Computed {f} nucleotide frequencies.")
# Create a pandas DataFrame, and save it to CSV.
df = pd.DataFrame()
......@@ -311,7 +318,7 @@ def stats_freq():
tot = sum(freqs[f].values())
df = pd.concat([ df, pd.DataFrame([[ format_percentage(tot, x) for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ])
df = df.fillna(0)
df.to_csv("results/frequencies.csv")
df.to_csv(runDir + "/results/frequencies.csv")
idxQueue.put(thr_idx) # replace the thread index in the queue
# notify("Saved nucleotide frequencies to CSV file.")
......@@ -327,11 +334,13 @@ def parallel_stats_pairs(f):
global idxQueue
thr_idx = idxQueue.get()
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} p_stats_pairs({f})")
chain_id_list = mappings_list[f]
data = []
sqldata = []
for cid in tqdm(chain_id_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} basepair types", leave=False):
with sqlite3.connect("results/RNANet.db") as conn:
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
# Get comma separated lists of basepairs per nucleotide
interactions = pd.DataFrame(
sql_ask_database(conn,
......@@ -398,7 +407,7 @@ def parallel_stats_pairs(f):
data.append(expanded_list)
# Update the database
with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn:
with sqlite3.connect(runDir + "/results/RNANet.db", isolation_level=None) as conn:
conn.execute('pragma journal_mode=wal') # Allow other readers to query the database while this write is executing
sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?,
pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?,
......@@ -416,8 +425,8 @@ def parallel_stats_pairs(f):
# Create an output DataFrame
f_df = pd.DataFrame([[ x for x in cnt.values() ]], columns=list(cnt), index=[f])
f_df.to_csv(f"data/{f}_counts.csv")
expanded_list.to_csv(f"data/{f}_pairs.csv")
f_df.to_csv(runDir + f"/data/{f}_counts.csv")
expanded_list.to_csv(runDir + f"/data/{f}_pairs.csv")
idxQueue.put(thr_idx) # replace the thread index in the queue
......@@ -430,28 +439,34 @@ def to_dist_matrix(f):
global idxQueue
thr_idx = idxQueue.get()
# notify(f"Computing {f} distance matrix from alignment...")
command = f"esl-alipid --rna --noheader --informat stockholm {f}_3d_only.stk"
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} to_dist_matrix({f})")
# Prepare a file
with open(path_to_seq_data+f"/realigned/{f}++.afa") as al_file:
al = AlignIO.read(al_file, "fasta")
names = [ x.id for x in al if '[' in x.id ]
al = al[-len(names):]
with open(f + "_3d_only.stk", "w") as only_3d:
with open(path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk", "w") as only_3d:
try:
only_3d.write(al.format("stockholm"))
except ValueError as e:
warn(e)
del al
subprocess.run(["esl-reformat", "--informat", "stockholm", "--mingap", "-o", path_to_seq_data+f"/realigned/{f}_3d_only.stk", "stockholm", path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk"])
# Prepare the job
process = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE)
process = subprocess.Popen(shlex.split(f"esl-alipid --rna --noheader --informat stockholm {path_to_seq_data}realigned/{f}_3d_only.stk"),
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
id_matrix = np.zeros((len(names), len(names)))
pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", leave=False)
while process.poll() is None:
output = process.stdout.readline()
cnt = 0
while not cnt or process.poll() is None:
output = process.stdout.read()
if output:
lines = output.strip().split(b'\n')
for l in lines:
cnt += 1
line = l.split()
s1 = line[0].decode('utf-8')
s2 = line[1].decode('utf-8')
......@@ -460,9 +475,14 @@ def to_dist_matrix(f):
id2 = names.index(s2)
id_matrix[id1, id2] = float(score)
pbar.update(1)
if cnt != len(names)*(len(names)-1)*0.5:
warn(f"{f} got {cnt} updates on {len(names)*(len(names)-1)*0.5}")
if process.poll() != 0:
l = process.stderr.read().strip().split(b'\n')
warn("\n".join([ line.decode('utf-8') for line in l ]))
pbar.close()
subprocess.run(["rm", "-f", f + "_3d_only.stk"])
subprocess.run(["rm", "-f", f + "_3d_only_tmp.stk"])
np.save("data/"+f+".npy", id_matrix)
idxQueue.put(thr_idx) # replace the thread index in the queue
return 0
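# Note: each esl-alipid output line is assumed to start with the two sequence names
# followed by their percent identity (the only columns consumed above); the resulting
# lower-triangle matrix is saved to data/<family>.npy and reused by seq_idty().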
......@@ -471,21 +491,26 @@ def seq_idty():
"""Computes identity matrices for each of the RNA families.
REQUIRES temporary results files in data/*.npy
REQUIRES tables chain, family un to date."""
REQUIRES tables chain, family up to date."""
# load distance matrices
fams_to_plot = [ f for f in famlist if f not in ignored ]
fam_arrays = []
for f in famlist:
for f in fams_to_plot:
if path.isfile("data/"+f+".npy"):
fam_arrays.append(np.load("data/"+f+".npy"))
fam_arrays.append(np.load("data/"+f+".npy") / 100.0) # normalize percentages in [0,1]
else:
fam_arrays.append([])
warn("data/"+f+".npy not found !")
fam_arrays.append(np.array([]))
# Update database with identity percentages
conn = sqlite3.connect("results/RNANet.db")
for f, D in zip(famlist, fam_arrays):
conn = sqlite3.connect(runDir + "/results/RNANet.db")
for f, D in zip(fams_to_plot, fam_arrays):
if not len(D): continue
a = 1.0 - np.average(D + D.T) # Get symmetric matrix instead of lower triangle + convert from distance matrix to identity matrix
if D.shape[0] > 1:
a = np.sum(D) * 2 / D.shape[0] / (D.shape[0] - 1) # SUM(D) / (n(n-1)/2)
else:
a = D[0][0]
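# Here a is the family's average pairwise sequence identity, normalized to [0,1]
# (the matrices were divided by 100 above); it is stored, rounded to 2 decimals, by the UPDATE below.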
conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';")
conn.commit()
conn.close()
......@@ -495,10 +520,11 @@ def seq_idty():
axs = axs.ravel()
[axi.set_axis_off() for axi in axs]
im = "" # Just to declare the variable, it will be set in the loop
for f, D, ax in zip(famlist, fam_arrays, axs):
if not len(D): continue
if D.shape[0] > 2: # Cluster only if there is more than 2 sequences to organize
for f, D, ax in zip(fams_to_plot, fam_arrays, axs):
D = D + D.T # Copy the lower triangle to the upper one, to get a symmetrical matrix
if D.shape[0] > 2: # Cluster only if there is more than 2 sequences to organize
D = 1.0 - D
np.fill_diagonal(D, 0.0)
condensedD = squareform(D)
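# squareform() converts the square symmetric distance matrix (zero diagonal) into the condensed
# 1-D form expected by scipy's hierarchical clustering, presumably used just below to build the
# Ward-linkage dendrogram whose leaf order is used to reorder rows and columns.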
# Compute basic dendrogram by Ward's method
......@@ -507,15 +533,20 @@ def seq_idty():
# Reorganize rows and cols
idx1 = Z['leaves']
D = D[idx1,:]
D = D[idx1[::-1],:]
D = D[:,idx1[::-1]]
im = ax.matshow(1.0 - D, vmin=0, vmax=1, origin='lower') # convert to identity matrix 1 - D from distance matrix D
ax.set_title(f + "\n(" + str(len(mappings_list[f]))+ " chains)", fontsize=10)
D = 1.0 - D
elif D.shape[0] == 2:
np.fill_diagonal(D, 1.0) # the diagonal has been ignored until now
ax.text(np.floor(D.shape[0]/2.0)-(0.5 if not D.shape[0]%2 else 0), -0.5, f + "\n(" + str(D.shape[0]) + " chains)",
fontsize=9, horizontalalignment = 'center', verticalalignment='bottom')
im = ax.matshow(D, vmin=0, vmax=1)
fig.tight_layout()
fig.subplots_adjust(wspace=0.1, hspace=0.3)
fig.colorbar(im, ax=axs[-1], shrink=0.8)
fig.savefig(f"results/figures/distances.png")
notify("Computed all identity matrices and saved the figure.")
fig.subplots_adjust(hspace=0.3, wspace=0.1)
fig.colorbar(im, ax=axs[-4], shrink=0.8)
fig.savefig(runDir + f"/results/figures/distances.png")
print("> Computed all identity matrices and saved the figure.", flush=True)
def stats_pairs():
"""Counts occurrences of intra-chain base-pair types in RNA families
......@@ -523,6 +554,8 @@ def stats_pairs():
Creates a temporary results file in data/pair_counts.csv, and a results file in results/pairings.csv.
REQUIRES tables chain, nucleotide up-to-date."""
setproctitle(f"RNANet statistics.py stats_pairs()")
def line_format(family_data):
return family_data.apply(partial(format_percentage, sum(family_data)))
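# line_format() turns a family's raw pair counts into percentages of that family's total,
# delegating the formatting to format_percentage(), assumed to be defined earlier in this script.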
......@@ -530,12 +563,12 @@ def stats_pairs():
results = []
allpairs = []
for f in fam_list:
newpairs = pd.read_csv(f"data/{f}_pairs.csv", index_col=0)
fam_df = pd.read_csv(f"data/{f}_counts.csv", index_col=0)
newpairs = pd.read_csv(runDir + f"/data/{f}_pairs.csv", index_col=0)
fam_df = pd.read_csv(runDir + f"/data/{f}_counts.csv", index_col=0)
results.append(fam_df)
allpairs.append(newpairs)
subprocess.run(["rm", "-f", f"data/{f}_pairs.csv"])
subprocess.run(["rm", "-f", f"data/{f}_counts.csv"])
subprocess.run(["rm", "-f", runDir + f"/data/{f}_pairs.csv"])
subprocess.run(["rm", "-f", runDir + f"/data/{f}_counts.csv"])
all_pairs = pd.concat(allpairs)
df = pd.concat(results).fillna(0)
df.to_csv("data/pair_counts.csv")
......@@ -573,14 +606,14 @@ def stats_pairs():
crosstab = crosstab[["AU", "GC", "Wobble", "Other"]]
# Save to CSV
df.to_csv("results/pair_types.csv")
df.to_csv(runDir + "/results/pair_types.csv")
# Plot barplot of overall types
ax = crosstab.plot(figsize=(8,5), kind='bar', stacked=True, log=False, fontsize=13)
ax.set_ylabel("Number of observations (millions)", fontsize=13)
ax.set_xlabel(None)
plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99)
plt.savefig("results/figures/pairings.png")
plt.savefig(runDir + "/results/figures/pairings.png")
notify("Computed nucleotide statistics and saved CSV and PNG file.")
......@@ -589,7 +622,9 @@ def per_chain_stats():
REQUIRES tables chain, nucleotide up to date. """
with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn:
setproctitle(f"RNANet statistics.py per_chain_stats()")
with sqlite3.connect(runDir + "/results/RNANet.db", isolation_level=None) as conn:
# Compute per-chain nucleotide frequencies
df = pd.read_sql("SELECT SUM(is_A) as A, SUM(is_C) AS C, SUM(is_G) AS G, SUM(is_U) AS U, SUM(is_other) AS O, chain_id FROM nucleotide GROUP BY chain_id;", conn)
df["total"] = pd.Series(df.A + df.C + df.G + df.U + df.O, dtype=np.float64)
......@@ -600,25 +635,36 @@ def per_chain_stats():
conn.execute('pragma journal_mode=wal')
sql_execute(conn, "UPDATE chain SET chain_freq_A = ?, chain_freq_C = ?, chain_freq_G = ?, chain_freq_U = ?, chain_freq_other = ? WHERE chain_id= ?;",
many=True, data=list(df.to_records(index=False)), warn_every=10)
notify("Updated the database with per-chain base frequencies")
print("> Updated the database with per-chain base frequencies", flush=True)
def general_stats():
"""
Number of structures as a function of the resolution threshold
Number of Rfam families as a function of the resolution threshold
Also prints summary counts of mapped, inferred and unmapped chains.
"""
with sqlite3.connect("results/RNANet.db") as conn:
df_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution
setproctitle(f"RNANet statistics.py general_stats()")
reqs = [
# unique unmapped chains with no issues
""" SELECT distinct pdb_id, chain_name, exp_method, resolution
FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
WHERE rfam_acc = 'unmappd' AND ISSUE=0;""", conn)
df_mapped_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution
WHERE rfam_acc = 'unmappd' AND ISSUE=0;""",
# unique mapped chains with no issues
""" SELECT distinct pdb_id, chain_name, exp_method, resolution
FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
WHERE rfam_acc != 'unmappd' AND ISSUE=0;""", conn)
df_mapped_copies = pd.read_sql(f"""SELECT pdb_id, chain_name, inferred, rfam_acc, pdb_start, pdb_end, exp_method, resolution
WHERE rfam_acc != 'unmappd' AND ISSUE=0;""",
# mapped chains with no issues
""" SELECT pdb_id, chain_name, inferred, rfam_acc, pdb_start, pdb_end, exp_method, resolution
FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
WHERE rfam_acc != 'unmappd' AND ISSUE=0;""", conn)
df_inferred_only_unique = pd.read_sql(f"""SELECT DISTINCT pdb_id, c.chain_name, exp_method, resolution
FROM (SELECT inferred, rfam_acc, pdb_start, pdb_end, chain.structure_id, chain.chain_name, r.redundancy, r.inf_redundancy
WHERE rfam_acc != 'unmappd' AND ISSUE=0;""",
# mapped chains with no issues that are all inferred
""" SELECT DISTINCT pdb_id, c.chain_name, exp_method, resolution
FROM (
SELECT inferred, rfam_acc, pdb_start, pdb_end, chain.structure_id, chain.chain_name, r.redundancy, r.inf_redundancy
FROM chain
JOIN (SELECT structure_id, chain_name, COUNT(distinct rfam_acc) AS redundancy, SUM(inferred) AS inf_redundancy
FROM chain
......@@ -627,8 +673,105 @@ def general_stats():
) AS r ON chain.structure_id=r.structure_id AND chain.chain_name = r.chain_name
WHERE r.redundancy=r.inf_redundancy AND rfam_acc != 'unmappd' and issue=0
) AS c
JOIN structure ON c.structure_id=structure.pdb_id;""", conn)
print("> found", len(df_inferred_only_unique.index), "chains which are mapped only by inference using BGSU NR Lists.")
JOIN structure ON c.structure_id=structure.pdb_id;""",
# Number of mapped chains (not inferred)
"""SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 0);""",
# Number of unique mapped chains (not inferred)
"""SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 0);""",
# Number of mapped chains (inferred)
"""SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 1);""",
# Number of unique mapped chains (inferred)
"""SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 1);""",
# Number of mapped chains inferred once
"""SELECT count(*) FROM (
SELECT structure_id, chain_name, COUNT(DISTINCT rfam_acc) as c
FROM chain where rfam_acc!='unmappd' and inferred=1
GROUP BY structure_id, chain_name
) WHERE c=1;""",
# Number of mapped chains inferred twice
"""select count(*) from (
select structure_id, chain_name, count(distinct rfam_acc) as c
from chain where rfam_acc!='unmappd' and inferred=1
group by structure_id, chain_name
) where c=2;""",
# Number of mapped chains inferred 3 times or more
"""select count(*) from (
select structure_id, chain_name, count(distinct rfam_acc) as c
from chain where rfam_acc!='unmappd' and inferred=1
group by structure_id, chain_name
) where c>2;""",
# Number of chains mapped both with and without inference
""" SELECT COUNT(*) FROM (
SELECT structure_id, chain_name, sum(inferred) AS s, COUNT(rfam_acc) AS c
FROM chain
WHERE rfam_acc!='unmappd'
GROUP BY structure_id, chain_name
)
WHERE s < c AND s > 0;""",
# Number of mapped chains (total)
"""SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd');""",
# Number of unique mapped chains
"""SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd');""",
# Number of unmapped chains
"""SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc = 'unmappd');""",
# Number of mapped chains without issues (not inferred)
"""SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 0 AND issue = 0);""",
# Number of unique mapped chains without issues (not inferred)
"""SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 0 AND issue = 0);""",
# Number of mapped chains without issues (inferred)
"""SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 1 AND issue=0);""",
# Number of unique mapped chains without issues (inferred)
"""SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 1 AND issue=0);""",
# Number of mapped chains without issues (total)
"""SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND issue=0);""",
# Number of unique mapped chains without issues
"""SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND issue=0);""",
# Number of unmapped chains without issues
"""SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc = 'unmappd' AND issue=0);"""
]
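# The queries are executed in the same order as listed in reqs, so the indices used on
# answers below (answers[4], answers[5], ...) refer to the corresponding entries of reqs.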
answers = []
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
for r in reqs:
answers.append(pd.read_sql(r, conn))
df_unique = answers[0]
df_mapped_unique = answers[1]
df_mapped_copies = answers[2]
df_inferred_only_unique = answers[3]
print()
print("> found", answers[4].iloc[0][0], f"chains ({answers[5].iloc[0][0]} unique chains) that are mapped thanks to Rfam. Removing chains with issues, only {answers[15].iloc[0][0]} ({answers[16].iloc[0][0]} unique)")
if answers[4].iloc[0][0] != answers[5].iloc[0][0]:
print("\t> This happens because different parts of the same chain can be mapped to different families.")
print("> found", answers[6].iloc[0][0], f"chains ({answers[7].iloc[0][0]} unique chains) that are mapped by inferrence. Removing chains with issues, only {answers[17].iloc[0][0]} ({answers[18].iloc[0][0]} unique).")
print("\t> ", answers[8].iloc[0][0], "chains are mapped only once,")
print("\t> ", answers[9].iloc[0][0], "are mapped to 2 families,")
print("\t> ", answers[10].iloc[0][0], "are mapped to 3 or more.")
print("> Among them,", answers[11].iloc[0][0], "chains are mapped both with families found on Rfam and by inferrence.")
if answers[11].iloc[0][0]:
print("\t> this is normal if you used option -f (--full-inference). Otherwise, there might be a problem.")
print("> TOTAL:", answers[12].iloc[0][0], f"chains ({answers[13].iloc[0][0]} unique chains) mapped to a family. Removing chains with issues, only {answers[19].iloc[0][0]} ({answers[20].iloc[0][0]} unique).")
print("> TOTAL:", answers[14].iloc[0][0], f"unmapped chains. Removing chains with issues, {answers[21].iloc[0][0]}.")
if answers[14].iloc[0][0]:
print("\t> this is normal if you used option --no-homology. Otherwise, there might be a problem.")
print()
##########################################
# plot N = f(resolution, exp_method)
......@@ -642,7 +785,7 @@ def general_stats():
df_inferred_only_unique.sort_values('resolution', inplace=True, ignore_index=True)
df_mapped_copies.sort_values('resolution', inplace=True, ignore_index=True)
max_res = max(df_unique.resolution)
max_structs = len(df_mapped_copies.index.tolist())
max_structs = max(len(df_mapped_copies.index), len(df_unique.index))
colors = np.linspace(0,1,1+len(methods))
plt.xticks( np.arange(0, max_res+2, 2.0).tolist(), np.arange(0, max_res+2, 2.0).tolist() )
......@@ -654,7 +797,7 @@ def general_stats():
axs[0][0].set_ylabel("ALL", fontsize=14)
axs[0][0].set_title("Number of unique RNA chains", fontsize=14)
axs[0][0].set_ylim((0, max_structs * 1.05))
axs[0][0].legend(loc="best", fontsize=14)
axs[0][0].legend(loc="lower right", fontsize=14)
axs[0][1].grid(axis='y', ls='dotted', lw=1)
axs[0][1].set_yticklabels([])
......@@ -663,9 +806,9 @@ def general_stats():
axs[0][1].hist(df_inferred_only_unique.resolution, bins=np.arange(0, max_res, 0.5), fc=(0.2, 0, colors[0], 0.5), cumulative=True, label='only by inference')
axs[0][1].text(0.95*max_res, 0.95*len(df_mapped_unique.resolution), "%d " % len(df_mapped_unique.resolution),
horizontalalignment='right', verticalalignment='top', fontsize=14)
axs[0][1].set_title("Number of unique RNA chains\nmapped to $\geq 1$ family", fontsize=14)
axs[0][1].set_title(r"Number of unique RNA chains\nmapped to $\geq 1$ family", fontsize=14)
axs[0][1].set_ylim((0, max_structs * 1.05))
axs[0][1].legend(loc="best", fontsize=14)
axs[0][1].legend(loc="upper left", fontsize=14)
axs[0][2].grid(axis='y', ls='dotted', lw=1)
axs[0][2].set_yticklabels([])
......@@ -675,7 +818,7 @@ def general_stats():
axs[0][2].text(0.95*max_res, 0.95*len(df_mapped_copies.resolution), "%d " % len(df_mapped_copies.resolution),
horizontalalignment='right', verticalalignment='top', fontsize=14)
axs[0][2].set_title("Number of RNA chains mapped to a\nfamily (with copies)", fontsize=14)
axs[0][2].legend(loc="right", fontsize=14)
axs[0][2].legend(loc="upper left", fontsize=14)
axs[0][2].set_ylim((0, max_structs * 1.05))
for i,m in enumerate(methods):
......@@ -683,7 +826,7 @@ def general_stats():
df_mapped_unique_m = df_mapped_unique[df_mapped_unique.exp_method == m]
df_inferred_only_unique_m = df_inferred_only_unique[df_inferred_only_unique.exp_method == m]
df_mapped_copies_m = df_mapped_copies[ df_mapped_copies.exp_method == m]
max_structs = len(df_mapped_copies_m.resolution.tolist())
max_structs = max(len(df_mapped_copies_m.index), len(df_unique_m.index))
print("> found", max_structs, "structures with method", m, flush=True)
axs[1+i][0].grid(axis='y', ls='dotted', lw=1)
......@@ -693,7 +836,7 @@ def general_stats():
horizontalalignment='right', verticalalignment='top', fontsize=14)
axs[1+i][0].set_ylim((0, max_structs * 1.05))
axs[1+i][0].set_ylabel(m, fontsize=14)
axs[1+i][0].legend(loc="best", fontsize=14)
axs[1+i][0].legend(loc="lower right", fontsize=14)
axs[1+i][1].grid(axis='y', ls='dotted', lw=1)
axs[1+i][1].set_yticklabels([])
......@@ -703,7 +846,7 @@ def general_stats():
axs[1+i][1].text(0.95*max_res, 0.95*len(df_mapped_unique_m.resolution), "%d " % len(df_mapped_unique_m.resolution),
horizontalalignment='right', verticalalignment='top', fontsize=14)
axs[1+i][1].set_ylim((0, max_structs * 1.05))
axs[1+i][1].legend(loc="best", fontsize=14)
axs[1+i][1].legend(loc="upper left", fontsize=14)
axs[1+i][2].grid(axis='y', ls='dotted', lw=1)
axs[1+i][2].set_yticklabels([])
......@@ -713,7 +856,7 @@ def general_stats():
axs[1+i][2].text(0.95*max_res, 0.95*len(df_mapped_copies_m.resolution), "%d " % len(df_mapped_copies_m.resolution),
horizontalalignment='right', verticalalignment='top', fontsize=14)
axs[1+i][2].set_ylim((0, max_structs * 1.05))
axs[1+i][2].legend(loc="right", fontsize=14)
axs[1+i][2].legend(loc="upper left", fontsize=14)
axs[-1][0].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14)
axs[-1][1].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14)
......@@ -722,7 +865,7 @@ def general_stats():
fig.suptitle("Number of RNA chains by experimental method and resolution", fontsize=16)
fig.subplots_adjust(left=0.07, right=0.98, wspace=0.05,
hspace=0.05, bottom=0.05, top=0.92)
fig.savefig("results/figures/resolutions.png")
fig.savefig(runDir + "/results/figures/resolutions.png")
plt.close()
##########################################
......@@ -765,7 +908,7 @@ def general_stats():
fig.suptitle("Number of RNA families used by experimental method and resolution", fontsize=16)
fig.subplots_adjust(left=0.05, right=0.98, wspace=0.05,
hspace=0.05, bottom=0.12, top=0.84)
fig.savefig("results/figures/Nfamilies.png")
fig.savefig(runDir + "/results/figures/Nfamilies.png")
plt.close()
def log_to_pbar(pbar):
......@@ -776,8 +919,10 @@ def log_to_pbar(pbar):
if __name__ == "__main__":
# parse options
DELETE_OLD_DATA = False
DO_WADLEY_ANALYSIS = False
try:
opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "resolution=", "3d-folder=", "seq-folder=" ])
opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "from-scratch", "wadley", "resolution=", "3d-folder=", "seq-folder=" ])
except getopt.GetoptError as err:
print(err)
sys.exit(2)
......@@ -795,6 +940,7 @@ if __name__ == "__main__":
"\n\t\t\t\t\tdatapoints/\t\tFinal results in CSV file format.")
print("--seq-folder=…\t\t\tPath to a folder containing the sequence and alignment files. Required subfolder:"
"\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family")
print("--from-scratch\t\t\tDo not use precomputed results from past runs, recompute everything")
sys.exit()
elif opt == '--version':
print("RNANet statistics 1.1 beta")
......@@ -810,25 +956,37 @@ if __name__ == "__main__":
path_to_seq_data = path.abspath(arg)
if path_to_seq_data[-1] != '/':
path_to_seq_data += '/'
elif opt=='--from-scratch':
DELETE_OLD_DATA = True
DO_WADLEY_ANALYSIS = True
subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"])
elif opt=='--wadley':
DO_WADLEY_ANALYSIS = True
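# Example invocation (hypothetical paths), assuming RNAnet.py has already populated results/RNANet.db:
# python3 statistics.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences/ -r 4.0 --wadley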
# Load mappings
print("Loading mappings list...")
with sqlite3.connect("results/RNANet.db") as conn:
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
fam_list = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;") ]
mappings_list = {}
for k in fam_list:
mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain WHERE rfam_acc='{k}' and issue=0;") ]
mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain JOIN structure ON chain.structure_id=structure.pdb_id WHERE rfam_acc='{k}' AND issue=0 AND resolution <= {res_thr};") ]
# List the families for which we will compute sequence identity matrices
with sqlite3.connect("results/RNANet.db") as conn:
famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 0 ORDER BY rfam_acc ASC;") ]
ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;") ]
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain WHERE issue = 0 GROUP BY rfam_acc) WHERE n_chains > 0 ORDER BY rfam_acc ASC;") ]
ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain WHERE issue = 0 GROUP BY rfam_acc) WHERE n_chains < 3 ORDER BY rfam_acc ASC;") ]
n_unmapped_chains = sql_ask_database(conn, "SELECT COUNT(*) FROM chain WHERE rfam_acc='unmappd' AND issue=0;")[0][0]
if len(ignored):
print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')
if DELETE_OLD_DATA:
for f in fam_list:
subprocess.run(["rm","-f", runDir + f"/data/{f}.npy", runDir + f"/data/{f}_pairs.csv", runDir + f"/data/{f}_counts.csv"])
# Prepare the multiprocessing execution environment
nworkers = max(read_cpu_number()-1, 32)
nworkers = min(read_cpu_number()-1, 32)
thr_idx_mgr = Manager()
idxQueue = thr_idx_mgr.Queue()
for i in range(nworkers):
......@@ -836,14 +994,15 @@ if __name__ == "__main__":
# Define the tasks
joblist = []
# joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), 4.0))) # res threshold is 4.0 Angstroms by default
# joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), 4.0))) #
if n_unmapped_chains and DO_WADLEY_ANALYSIS:
joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), 20.0))) # res threshold is 4.0 Angstroms by default
joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), 20.0))) #
joblist.append(Job(function=stats_len)) # Computes figures
# joblist.append(Job(function=stats_freq)) # updates the database
# for f in famlist:
# joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database
# if f not in ignored:
# joblist.append(Job(function=to_dist_matrix, args=(f,))) # updates the database
joblist.append(Job(function=stats_freq)) # updates the database
for f in famlist:
joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database
if f not in ignored:
joblist.append(Job(function=to_dist_matrix, args=(f,))) # updates the database
p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, leave=True)
......@@ -867,7 +1026,8 @@ if __name__ == "__main__":
print()
# finish the work after the parallel portions
# per_chain_stats()
# seq_idty()
# stats_pairs()
per_chain_stats()
seq_idty()
stats_pairs()
if n_unmapped_chains:
general_stats()
......