Louis BECQUEY

January 2021 update

v 1.1 beta, January 2021
The first uses of RNAnet by people from outside the development team happened between this December.
A few feedback allowed to identify issues and useful information to add.
FEATURE CHANGES
- Sequence alignments of the 3D structures mapped to a family are now provided.
- Full alignements with Rfam sequences are not provided, but you can ask us for the files.
- Two new fields in table 'family': ali_length and ali_filtered_length.
They are the MSA lengths of the alignment with and without the Rfam sequences.
TECHNICAL CHANGES
- SQLite connexions are now all in WAL mode by default (previously, only the writers used WAL mode)
BUG CORRECTIONS
- When an alignment file is updated in a newer run of RNANet, all the re_mappings are now re-computed
for this family. Previously, the remappings were computed only for the newly added sequences,
while the alignment actually changed even for chains added in past runs.
- Changed the ownership and permissions of files produced by the Docker container.
They were previously owned by root and the user could not get access to them.
- Modified nucleotides were not always correctly transformed to N in the alignments (and nucleotide.nt_align_code fields).
Now, the alignments and nt_align_code only contain "ACGUN-" chars.
Now, 'N' means 'other', while '-' means 'nothing'.
COMING SOON
- Automated annotation of detected Recurrent Interaction Networks (RINs), see http://carnaval.lri.fr/ .
- Possibly, automated detection of HLs and ILs from the 3D Motif Atlas (BGSU). Maybe. Their own website already does the job.
- A field estimating the quality of the sequence alignment in table family.
- Possibly, more metrics about the alignments coming from Infernal.
\ No newline at end of file
......@@ -249,7 +249,9 @@ To help you design your own requests, here follows a description of the database
* `nb_homologs`: The number of hits known to be homologous downloaded from Rfam to compute nucleotide frequencies
* `nb_3d_chains`: The number of 3D RNA chains mapped to the family (from Rfam-PDB mappings, or inferred using the redundancy list)
* `nb_total_homol`: Sum of the two previous fields, the number of sequences in the multiple sequence alignment, used to compute nucleotide frequencies
* `max_len`: The longest RNA sequence among the homologs (in bases)
* `max_len`: The longest RNA sequence among the homologs (in bases, unaligned)
* `ali_len`: The aligned sequences length (in bases, aligned)
* `ali_filtered_len`: The aligned sequences length when we filter the alignment to keep only the RNANet chains (which have a 3D structure) and remove the gap-only columns.
* `comput_time`: Time required to compute the family's multiple sequence alignment in seconds,
* `comput_peak_mem`: RAM (or swap) required to compute the family's multiple sequence alignment in megabytes,
* `idty_percent`: Average identity percentage over pairs of the 3D chains' sequences from the family
......
#!/usr/bin/python3.8
#!/usr/bin/python3
# check Python version before everything
import platform
a = ["3.8", platform.python_version()]
a.sort()
if a[0] != "3.8":
print(f"Python is too old: {platform.python_version()}")
print("Please use version 3.8 or newer.")
exit(1)
import Bio
import Bio.PDB as pdb
import concurrent.futures
......@@ -58,6 +68,7 @@ running_stats.append(0) # n_finished
running_stats.append(0) # n_skipped
path_to_3D_data = "tobedefinedbyoptions"
path_to_seq_data = "tobedefinedbyoptions"
python_executable = "python"+".".join(platform.python_version().split('.')[:2]) # Cuts python3.8.1 into python3.8 for example.
validsymb = '\U00002705'
warnsymb = '\U000026A0'
errsymb = '\U0000274C'
......@@ -163,7 +174,7 @@ class Chain:
self.chain_label = chain_label # chain pretty name
self.file = "" # path to the 3D PDB file
self.seq = "" # sequence with modified nts
self.seq_to_align = "" # sequence with modified nts replaced, but gaps can exist
self.seq_to_align = "" # sequence with modified nts replaced by N, but gaps can exist
self.length = -1 # length of the sequence (missing residues are not counted)
self.full_length = -1 # length of the chain extracted from source structure ([start; stop] interval, or a subset for inferred mappings)
self.delete_me = False # an error occured during production/parsing
......@@ -468,10 +479,11 @@ class Chain:
# Add a sequence column just for the alignments
df['nt_align_code'] = [str(x).upper()
.replace('NAN', '-') # Unresolved nucleotides are gaps
.replace('?', '-') # Unidentified residues, let's delete them
.replace('T', 'U') # 5MU are modified to t, which gives T
.replace('?', 'N') # Unidentified residues, let's delete them
.replace('T', 'U') # 5MU are modified to t by DSSR, which gives T
.replace('P', 'U') # Pseudo-uridines, but it is not really right to change them to U, see DSSR paper, Fig 2
for x in df['nt_code']]
df['nt_align_code'] = [ x if x in "ACGU-" else 'N' for x in df['nt_align_code'] ] # All other modified nucleotides are transformed to N
# One-hot encoding sequence
df["is_A"] = [1 if x == "A" else 0 for x in df["nt_code"]]
......@@ -562,6 +574,7 @@ class Chain:
setproctitle(f"RNANet.py {self.chain_label} register_chain()")
with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn:
conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
# Register the chain in table chain
if self.mapping is not None:
sql_execute(conn, f""" INSERT INTO chain
......@@ -607,99 +620,6 @@ class Chain:
?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""",
many=True, data=list(df.to_records(index=False)), warn_every=10)
def remap(self, columns_to_save, s_seq):
"""Maps the object's sequence to its version in a MSA, to compute nucleotide frequencies at every position.
columns_to_save: a set of indexes in the alignment that are mapped to previous sequences in the alignment
s_seq: the aligned version of self.seq_to_align
"""
setproctitle(f"RNANet.py {self.chain_label} remap()")
alilen = len(s_seq)
re_mappings = []
# Save colums in the appropriate positions
i = 0
j = 0
while i < self.full_length and j < alilen:
# Here we try to map self.seq_to_align (the sequence of the 3D chain, including gaps when residues are missing),
# with s_seq, the sequence aligned in the MSA, containing any of ACGU and two types of gaps, - and .
if self.seq_to_align[i] == s_seq[j].upper(): # alignment and sequence correspond (incl. gaps)
re_mappings.append((self.db_chain_id, i+1, j+1)) # because index_chain in table nucleotide is in [1,N], we use i+1 and j+1.
columns_to_save.add(j+1) # it's a set, doublons are automaticaly ignored
i += 1
j += 1
elif self.seq_to_align[i] == '-': # gap in the chain, but not in the aligned sequence
# search for a gap to the consensus nearby
k = 0 # Search must start at zero to assert the difference comes from '-' in front of '.'
while j+k < alilen and s_seq[j+k] == '.':
k += 1
# if found, set j to that position
if j+k < alilen and s_seq[j+k] == '-':
re_mappings.append((self.db_chain_id, i+1, j+k+1))
columns_to_save.add(j+k+1)
i += 1
j += k+1
continue
# if not, take the insertion gap if this is one
if j < alilen and s_seq[j] == '.':
re_mappings.append((self.db_chain_id, i+1, j+1))
columns_to_save.add(j+1)
i += 1
j += 1
continue
# else, just mark the gap as unknown (there is an alignment mismatch)
re_mappings.append((self.db_chain_id, i+1, 0))
i += 1
elif s_seq[j] in ['.', '-']: # gap in the alignment, but not in the real chain
j += 1 # ignore the column
else: # sequence mismatch which is not a gap...
print(f"You are never supposed to reach this. Comparing {self.chain_label} in {i} ({self.seq_to_align[i-1:i+2]}) with seq[{j}] ({s_seq[j-3:j+4]}).",
self.seq_to_align, s_seq, sep='\n', flush=True)
raise Exception('Something is wrong with sequence alignment.')
return re_mappings, columns_to_save
def replace_gaps(self, conn):
""" Replace gapped positions by the consensus sequence.
REQUIRES align_column and re_mapping up to date
"""
setproctitle(f"RNANet.py {self.chain_label} replace_gaps()")
homology_data = sql_ask_database(conn, f"""SELECT freq_A, freq_C, freq_G, freq_U, freq_other FROM
(SELECT chain_id, rfam_acc FROM chain WHERE chain_id={self.db_chain_id})
NATURAL JOIN re_mapping
NATURAL JOIN align_column;
""")
if homology_data is None or not len(homology_data):
with open(runDir + "/errors.txt", "a") as errf:
errf.write(f"No homology data found in the database for {self.chain_label} ! Not replacing gaps.\n")
return []
elif len(homology_data) != self.full_length:
with open(runDir + "/errors.txt", "a") as errf:
errf.write(f"Found {len(homology_data)} nucleotides for {self.chain_label} of length {self.full_length} ! Not replacing gaps.\n")
return []
c_seq_to_align = list(self.seq_to_align)
c_seq = list(self.seq)
letters = ['A', 'C', 'G', 'U', 'N']
gaps = []
for i in range(self.full_length):
if c_seq_to_align[i] == '-': # (then c_seq[i] also is)
freq = homology_data[i]
l = letters[freq.index(max(freq))]
c_seq_to_align[i] = l
c_seq[i] = l
gaps.append((l, l == 'A', l == 'C', l == 'G', l == 'U', l == 'N', self.db_chain_id, i+1))
self.seq_to_align = ''.join(c_seq_to_align)
self.seq = ''.join(c_seq)
return gaps
class Job:
""" This class contains information about a task to run later.
......@@ -868,6 +788,7 @@ class Downloader:
print(d)
with sqlite3.connect(runDir + "/results/RNANet.db", timeout=20.0) as conn:
conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
# We use the REPLACE keyword to get the latest information
sql_execute(conn, """INSERT OR REPLACE INTO family (rfam_acc, description, max_len)
VALUES (?, ?, ?);""",
......@@ -1194,8 +1115,7 @@ class Pipeline:
setproctitle("RNANet.py list_available_mappings()")
# List all 3D RNA chains below given resolution
full_structures_list = self.dl.download_BGSU_NR_list(
self.CRYSTAL_RES) # list of tuples ( class, class_members )
full_structures_list = self.dl.download_BGSU_NR_list(self.CRYSTAL_RES) # list of tuples ( class, class_members )
# Check for a list of known problems:
if os.path.isfile(runDir + "/known_issues.txt"):
......@@ -1209,11 +1129,13 @@ class Pipeline:
print(" ".join(self.known_issues))
if self.HOMOLOGY:
# Ask Rfam if some are mapped to Rfam families
# Ask Rfam their mappings between PDB structures and Rfam families
allmappings = self.dl.download_Rfam_PDB_mappings()
# Compute the list of mappable structures using NR-list and Rfam-PDB mappings
# And get Chain() objects
# Compute the extended list of mappable structures using NR-list and Rfam-PDB mappings
# And get Chain() objects.
# If self.FULLINFERENCE is False, the extended list is already filtered to remove
# the chains which already are in the database.
print("> Building list of structures...", flush=True)
p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=ncores)
try:
......@@ -1243,6 +1165,7 @@ class Pipeline:
exit(1)
else:
conn = sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0)
conn.execute('pragma journal_mode=wal')
for eq_class, codelist in tqdm(full_structures_list, desc="Eq. classes"):
codes = codelist.replace('+', ',').split(',')
......@@ -1361,6 +1284,7 @@ class Pipeline:
kir.write(c[1].chain_label + '\n' +
c[1].error_messages + '\n\n')
with sqlite3.connect(runDir+"/results/RNANet.db") as conn:
conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
sql_execute(conn, f"UPDATE chain SET issue = 1 WHERE chain_id = ?;", data=(c[1].db_chain_id,))
ki.close()
kir.close()
......@@ -1459,10 +1383,11 @@ class Pipeline:
else:
nb_total_homol = len(align)
nb_homologs = nb_total_homol - nb_3d_chains
data.append((nb_homologs, nb_3d_chains, nb_total_homol, r[2], r[3], r[0]))
data.append((nb_homologs, nb_3d_chains, nb_total_homol, align.get_alignment_length(), r[2], r[3], r[0]))
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
sql_execute(conn, """UPDATE family SET nb_homologs = ?, nb_3d_chains = ?, nb_total_homol = ?, comput_time = ?, comput_peak_mem = ?
conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
sql_execute(conn, """UPDATE family SET nb_homologs = ?, nb_3d_chains = ?, nb_total_homol = ?, ali_len = ?, comput_time = ?, comput_peak_mem = ?
WHERE rfam_acc = ?;""", many=True, data=data)
def remap(self):
......@@ -1489,8 +1414,8 @@ class Pipeline:
try:
fam_pbar = tqdm(total=len(self.fam_list), desc="RNA families", position=0, leave=True)
# Apply work_pssm to each RNA family
for i, _ in enumerate(p.imap_unordered(partial(work_pssm, fill_gaps=self.FILL_GAPS), self.fam_list, chunksize=1)):
# Apply work_pssm_remap to each RNA family
for i, _ in enumerate(p.imap_unordered(partial(work_pssm_remap, fill_gaps=self.FILL_GAPS), self.fam_list, chunksize=1)):
# Everytime the iteration finishes on a family, update the global progress bar over the RNA families
fam_pbar.update(1)
fam_pbar.close()
......@@ -1543,12 +1468,13 @@ class Pipeline:
runDir + f"/data/{f}_counts.csv"])
# Run statistics files
subprocess.run(["python3.8", fileDir+"/scripts/regression.py", runDir + "/results/RNANet.db"])
subprocess.run(["python3.8", fileDir+"/statistics.py", "--3d-folder", path_to_3D_data,
subprocess.run([python_executable, fileDir+"/scripts/regression.py", runDir + "/results/RNANet.db"])
subprocess.run([python_executable, fileDir+"/statistics.py", "--3d-folder", path_to_3D_data,
"--seq-folder", path_to_seq_data, "-r", str(self.CRYSTAL_RES)])
# Save additional informations
with sqlite3.connect(runDir+"/results/RNANet.db") as conn:
conn.execute('pragma journal_mode=wal')
pd.read_sql_query("""SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem
FROM family ORDER BY nb_3d_chains DESC;""",
conn).to_csv(runDir + f"/results/families.csv", float_format="%.2f", index=False)
......@@ -1571,6 +1497,7 @@ class Pipeline:
setproctitle("RNANet.py sanitize_database()")
conn = sqlite3.connect(runDir + "/results/RNANet.db")
conn.execute('pragma journal_mode=wal')
# Assert every structure is used
r = sql_ask_database(conn, """SELECT DISTINCT pdb_id FROM structure WHERE pdb_id NOT IN (SELECT DISTINCT structure_id FROM chain);""")
......@@ -1742,6 +1669,8 @@ def sql_define_tables(conn):
nb_3d_chains INT,
nb_total_homol INT,
max_len UNSIGNED SMALLINT,
ali_len UNSIGNED SMALLINT,
ali_filtered_len UNSIGNED SMALLINT,
comput_time REAL,
comput_peak_mem REAL,
idty_percent REAL
......@@ -1778,13 +1707,13 @@ def sql_ask_database(conn, sql, warn_every=10):
warn(str(e) + ", retrying in 0.2s (worker " +
str(os.getpid()) + f', try {_+1}/100)')
time.sleep(0.2)
cursor.close()
warn("Tried to reach database 100 times and failed. Aborting.", error=True)
return []
@trace_unhandled_exceptions
def sql_execute(conn, sql, many=False, data=None, warn_every=10):
conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
for _ in range(100): # retry 100 times if it fails
try:
if many:
......@@ -2071,6 +2000,7 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li
# Check if the chain exists in the database
if update_only:
with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn:
conn.execute('pragma journal_mode=wal')
res = sql_ask_database(conn, f"""SELECT chain_id from chain
WHERE structure_id='{pdb_id}'
AND chain_name='{pdb_chain_id}'
......@@ -2110,6 +2040,7 @@ def work_mmcif(pdb_id):
# check if it exists in database
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
conn.execute('pragma journal_mode=wal')
r = sql_ask_database(conn, f"""SELECT * from structure where pdb_id = '{pdb_id}';""")
# if not, read the CIF header and register the structure
......@@ -2138,6 +2069,7 @@ def work_mmcif(pdb_id):
# Save into the database
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
sql_execute(conn, """INSERT OR REPLACE INTO structure (pdb_id, pdb_model, date, exp_method, resolution)
VALUES (?, ?, DATE(?), ?, ?);""", data=(pdb_id, 1, date, exp_meth, reso))
......@@ -2181,9 +2113,10 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True):
df = c.extract_3D_data(save_logs)
c.register_chain(df)
# Small check
# Small check that all nucleotides of a chain have an entry in nucleotide table
if not c.delete_me:
with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn:
conn.execute('pragma journal_mode=wal')
nnts = sql_ask_database(conn, f"SELECT COUNT(index_chain) FROM nucleotide WHERE chain_id={c.db_chain_id};", warn_every=10)[0][0]
if not(nnts):
warn(f"Nucleotides not inserted: {c.error_messages}")
......@@ -2420,22 +2353,29 @@ def summarize_position(counts):
@trace_unhandled_exceptions
def work_pssm(f, fill_gaps):
""" Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family.
def work_pssm_remap(f, fill_gaps):
"""Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family.
This also remaps the 3D object sequence with the aligned sequence in the MSA.
If asked, the 3D object sequence is completed by the consensus nucleotide when one of them is missing.
Uses only 1 core, so this function can be called in parallel.
"""
setproctitle(f"RNAnet.py work_pssm({f})")
# Get a worker number to position the progress bar
global idxQueue
thr_idx = idxQueue.get()
# get the chains of this family
# get the chains of this family in the update
list_of_chains = rfam_acc_to_download[f]
chains_ids = [str(c) for c in list_of_chains]
##########################################################################################
# Compute frequencies in the alignment
##########################################################################################
setproctitle(f"RNAnet.py work_pssm_remap({f}) compute PSSMs")
# Open the alignment
try:
align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")
......@@ -2450,33 +2390,92 @@ def work_pssm(f, fill_gaps):
frequencies = [ summarize_position(pssm[i]) for i in range(align.get_alignment_length()) ]
del pssm
##########################################################################################
# Remap sequences of the 3D chains with sequences in the alignment
##########################################################################################
setproctitle(f"RNAnet.py work_pssm_remap({f}) remap")
# For each sequence, find the right chain and remap chain residues with alignment columns
columns_to_save = set()
re_mappings = []
alilen = align.get_alignment_length()
pbar = tqdm(total=len(chains_ids), position=thr_idx+1, desc=f"Worker {thr_idx+1}: Remap {f} chains", leave=False)
pbar.update(0)
for s in align:
if not '[' in s.id: # this is a Rfamseq entry, not a 3D chain
continue
try:
# get the right 3D chain:
if '|' in s.id:
# for some reason cmalign gets indexes|chainid in the FASTA headers sometimes.
# it is maybe when there are doublons ? Removing doublons takes too much time,
# it is easier to parse the index|id formats.
idx = chains_ids.index(s.id.split('|')[1])
# Check if the chain existed before in the database
if chains_ids.index(s.id) in list_of_chains.keys():
# a chain object is found in the update, this sequence is new
this_chain = list_of_chains[chains_ids.index(s.id)]
seq_to_align = this_chain.seq_to_align
full_length = this_chain.full_length
db_id = this_chain.db_chain_id
else:
# it existed in the database before.
this_chain = None
# Get the chain id in the database
conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=10.0)
conn.execute('pragma journal_mode=wal')
db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = {s.id.split('[')[0]} AND chain_name = {s.id.split('-')[1]} AND rfam_acc = {f};")
if len(db_id):
db_id = db_id[0][0]
else:
idx = chains_ids.index(s.id)
conn.close()
warn(f"Bizarre... sequence {s.id} is not found in the database ! Cannot remap it ! Ignoring...")
pbar.update(1)
continue
seq_to_align = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;")])
full_length = len(seq_to_align)
# call its remap method
new_mappings, columns_to_save = list_of_chains[idx].remap(columns_to_save, s.seq)
re_mappings += new_mappings
conn.close()
except ValueError:
# with open(runDir + "/errors.txt", "a") as errf:
# errf.write(f"Chain {s.id} not found in list of chains to process. ignoring.\n")
pass
# Save colums in the appropriate positions
i = 0 # to iterate the object sequence
j = 0 # to iterate the alignment sequence
while i < full_length and j < alilen:
# Here we try to map seq_to_align (the sequence of the 3D chain, including gaps when residues are missing),
# with s.seq, the sequence aligned in the MSA, containing any of ACGU and two types of gaps, - and .
if seq_to_align[i] == s.seq[j].upper(): # alignment and sequence correspond (incl. gaps)
re_mappings.append((db_id, i+1, j+1)) # because index_chain in table nucleotide is in [1,N], we use i+1 and j+1.
columns_to_save.add(j+1) # it's a set, doublons are automaticaly ignored
i += 1
j += 1
elif seq_to_align[i] == '-': # gap in the chain, but not in the aligned sequence
# search for a gap to the consensus nearby
k = 0 # Search must start at zero to assert the difference comes from '-' in front of '.'
while j+k < alilen and s.seq[j+k] == '.':
k += 1
# if found, set j to that position
if j+k < alilen and s.seq[j+k] == '-':
re_mappings.append((db_id, i+1, j+k+1))
columns_to_save.add(j+k+1)
i += 1
j += k+1
continue
# if not, take the insertion gap if this is one
if j < alilen and s.seq[j] == '.':
re_mappings.append((db_id, i+1, j+1))
columns_to_save.add(j+1)
i += 1
j += 1
continue
# else, just mark the gap as unknown (there is an alignment mismatch)
re_mappings.append((db_id, i+1, 0))
i += 1
elif s.seq[j] in ['.', '-']: # gap in the alignment, but not in the real chain
j += 1 # ignore the column
else: # sequence mismatch which is not a gap...
print(f"You are never supposed to reach this. Comparing {self.chain_label} in {i} ({self.seq_to_align[i-1:i+2]}) with seq[{j}] ({s.seq[j-3:j+4]}).",
self.seq_to_align, s.seq, sep='\n', flush=True)
raise Exception('Something is wrong with sequence alignment.')
pbar.update(1)
pbar.close()
......@@ -2486,13 +2485,28 @@ def work_pssm(f, fill_gaps):
warn(f"Chains were not found in {f}++.afa file: {chains_ids}", error=True)
return 1
##########################################################################################
# Save the alignment columns and their mappings to the database
##########################################################################################
setproctitle(f"RNAnet.py work_pssm_remap({f}) saving")
# Save the re_mappings
conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0)
conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
sql_execute(conn, """INSERT INTO re_mapping (chain_id, index_chain, index_ali)
VALUES (?, ?, ?)
ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;""",
many=True, data=re_mappings)
# Delete alignment columns that are not used anymore from the database
current_family_columns = [ x[0] for x in sql_ask_database(conn, f"SELECT index_ali FROM align_column WHERE rfam_acc = {f}";)]
unused = []
for col in current_family_columns:
if col not in columns_to_save:
unused.append((f, col))
sql_execute(conn, """DELETE FROM align_column WHERE rfam_acc = ? AND index_ali = ?;""", many=True, data=unused)
# Save the useful columns in the database
data = [(f, j) + frequencies[j-1] for j in sorted(columns_to_save)]
sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other)
......@@ -2501,34 +2515,72 @@ def work_pssm(f, fill_gaps):
# Add an unknown values column, with index_ali 0
sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other)
VALUES (?, 0, 0.0, 0.0, 0.0, 0.0, 1.0);""", data=(f,))
# Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains)
sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f))
conn.close()
##########################################################################################
# Replacing gaps in the 3D chains by consensus sequences
##########################################################################################
setproctitle(f"RNAnet.py work_pssm_remap({f}) replace gaps")
# Replace gaps by consensus
if fill_gaps:
pbar = tqdm(total=len(chains_ids), position=thr_idx+1, desc=f"Worker {thr_idx+1}: Replace {f} gaps", leave=False)
pbar.update(0)
gaps = []
conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=10.0)
conn.execute('pragma journal_mode=wal')
for s in align:
if not '[' in s.id: # this is a Rfamseq entry, not a 3D chain
continue
try:
# get the right 3D chain:
if '|' in s.id:
idx = chains_ids.index(s.id.split('|')[1])
# get the right 3D chain:
if chains_ids.index(s.id) in list_of_chains.keys():
db_id = list_of_chains[chains_ids.index(s.id)].db_chain_id
seq = this_chain.seq
full_length = this_chain.full_length
else:
db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = {s.id.split('[')[0]} AND chain_name = {s.id.split('-')[1]} AND rfam_acc = {f};")
if len(db_id):
db_id = db_id[0][0]
else:
idx = chains_ids.index(s.id)
gaps += list_of_chains[idx].replace_gaps(conn)
except ValueError:
pass # We already printed a warning just above
pbar.update(1)
continue
seq = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;") ])
full_length = len(seq)
# detect gaps
c_seq = list(seq) # contains "ACGUNacgu-"
letters = ['A', 'C', 'G', 'U', 'N']
homology_data = sql_ask_database(conn, f"""SELECT freq_A, freq_C, freq_G, freq_U, freq_other FROM
(SELECT chain_id, rfam_acc FROM chain WHERE chain_id={db_id})
NATURAL JOIN re_mapping
NATURAL JOIN align_column;
""")
if homology_data is None or not len(homology_data):
with open(runDir + "/errors.txt", "a") as errf:
errf.write(f"No homology data found in the database for {s.id} ! Not replacing gaps.\n")
continue
elif len(homology_data) != full_length:
with open(runDir + "/errors.txt", "a") as errf:
errf.write(f"Found {len(homology_data)} nucleotides for {s.id} of length {full_length} ! Not replacing gaps.\n")
continue
for i in range(full_length):
if c_seq[i] == '-':
freq = homology_data[i]
l = letters[freq.index(max(freq))]
gaps.append((l, l == 'A', l == 'C', l == 'G', l == 'U', l == 'N', db_id, i+1))
pbar.update(1)
pbar.close()
sql_execute(conn, f"""UPDATE nucleotide SET nt_align_code = ?,
is_A = ?, is_C = ?, is_G = ?, is_U = ?, is_other = ?
WHERE chain_id = ? AND index_chain = ?;""", many=True, data=gaps)
conn.close()
conn.close()
idxQueue.put(thr_idx) # replace the thread index in the queue
setproctitle(f"RNAnet.py work_pssm_remap({f}) finished")
return 0
......@@ -2538,6 +2590,7 @@ def work_save(c, homology=True):
setproctitle(f"RNAnet.py work_save({c.chain_label})")
conn = sqlite3.connect(runDir + "/results/RNANet.db", timeout=15.0)
conn.execute('pragma journal_mode=wal')
if homology:
df = pd.read_sql_query(f"""
SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code,
......@@ -2571,6 +2624,7 @@ if __name__ == "__main__":
runDir = os.getcwd()
fileDir = os.path.dirname(os.path.realpath(__file__))
ncores = read_cpu_number()
print(f"> Running {python_executable} on {ncores} CPU cores in folder {runDir}.")
pp = Pipeline()
pp.process_options()
......@@ -2584,7 +2638,9 @@ if __name__ == "__main__":
sql_define_tables(conn)
print("> Storing results into", runDir + "/results/RNANet.db")
# compute an update compared to what is in the table "chain"
# compute an update compared to what is in the table "chain" (comparison on structure_id + chain_name + rfam_acc).
# If --all was passed, all the structures are kept.
# Fills pp.update with Chain() objects.
pp.list_available_mappings()
# ===========================================================================
......@@ -2592,10 +2648,13 @@ if __name__ == "__main__":
# ===========================================================================
# Download and annotate new RNA 3D chains (Chain objects in pp.update)
# If the original cif file and/or the Json DSSR annotation file already exist, they are not redownloaded/recomputed.
pp.dl_and_annotate(coeff_ncores=0.5)
print("Here we go.")
# At this point, the structure table is up to date
# At this point, the structure table is up to date.
# Now save the DSSR annotations to the database.
# Extract the 3D chains to separate structure files if asked with --extract.
pp.build_chains(coeff_ncores=1.0)
if len(pp.to_retry):
......@@ -2629,10 +2688,10 @@ if __name__ == "__main__":
# If your job failed, you can comment all the "3D information" part and start from here.
pp.checkpoint_load_chains()
# Get the list of Rfam families found
# Get the list of Rfam families found in the update
rfam_acc_to_download = {}
for c in pp.loaded_chains:
if c.mapping.rfam_acc not in rfam_acc_to_download:
if c.mapping.rfam_acc not in rfam_acc_to_download.keys():
rfam_acc_to_download[c.mapping.rfam_acc] = [c]
else:
rfam_acc_to_download[c.mapping.rfam_acc].append(c)
......@@ -2644,7 +2703,8 @@ if __name__ == "__main__":
pp.prepare_sequences()
pp.realign()
# At this point, the family table is up to date
# At this point, the family table is almost up to date
# (lacking idty_percent and ali_filtered_length, both set in statistics.py)
thr_idx_mgr = Manager()
idxQueue = thr_idx_mgr.Queue()
......
......@@ -4,7 +4,7 @@ cd /home/lbecquey/Projects/RNANet
rm -rf latest_run.log errors.txt
# Run RNANet
bash -c 'time ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 --extract -s --archive' > latest_run.log 2>&1
bash -c 'time python3.8 /RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 --extract -s --archive' > latest_run.log 2>&1
echo 'Compressing RNANet.db.gz...' >> latest_run.log
touch results/RNANet.db # update last modification date
gzip -k /home/lbecquey/Projects/RNANet/results/RNANet.db # compress it
......
......@@ -417,7 +417,10 @@ def parallel_stats_pairs(f):
def to_id_matrix(f):
"""
Extracts sequences of 3D chains from the family alignments to a distinct STK file,
then runs esl-alipid on it to get an identity matrix
then runs esl-alipid on it to get an identity matrix.
Side-effect : also produces the 3D_only family alignment as a separate file.
So, we use this function to update 'ali_filtered_length' in the family table.
"""
if path.isfile("data/"+f+".npy"):
return 0
......@@ -442,7 +445,14 @@ def to_id_matrix(f):
subprocess.run(["esl-reformat", "--informat", "stockholm", "--mingap", #
"-o", path_to_seq_data+f"/realigned/{f}_3d_only.stk", # This run just deletes columns of gaps
"stockholm", path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk"]) #
subprocess.run(["rm", "-f", f + "_3d_only_tmp.stk"])
subprocess.run(["rm", "-f", f + "_3d_only_tmp.stk", f + "_3d_only.stk"])
subprocess.run(["esl-reformat", "-o", path_to_seq_data+f"/realigned/{f}_3d_only.afa", "afa", path_to_seq_data+f"/realigned/{f}_3d_only.stk"])
# Out-of-scope task : update the database with the length of the filtered alignment:
align = AlignIO.read(path_to_seq_data+f"/realigned/{f}_3d_only.afa", "fasta")
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
sql_execute(conn, """UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;""", many=True, data=(align.get_alignment_length(), f))
del align
# Prepare the job
process = subprocess.Popen(shlex.split(f"esl-alipid --rna --noheader --informat stockholm {path_to_seq_data}realigned/{f}_3d_only.stk"),
......