Louis BECQUEY

v1.3 large update

v 1.1 beta, January 2021
v 1.3 beta, January 2021
The first uses of RNAnet by people from outside the development team happened between this December.
A few feedback allowed to identify issues and useful information to add.
FEATURE CHANGES
- Sequence alignments of the 3D structures mapped to a family are now provided.
- Sequence alignments of the 3D structures mapped to a family are now provided.
- Full alignements with Rfam sequences are not provided, but you can ask us for the files.
- Two new fields in table 'family': ali_length and ali_filtered_length.
They are the MSA lengths of the alignment with and without the Rfam sequences.
- Gap replacement by consensus (--fill-gaps) has been removed. Now, the gap percentage and consensus are saved
in the align_column table and the datapoints in CSV format, in separate columns.
Consensus is one of ACGUN-, the gap being chosen if >75% of the sequences are gaps at this position.
Otherwise, A/C/G/U is chosen if >50% of the non-gap positions are A/C/G/U. Otherwise, N is the consensus.
TECHNICAL CHANGES
- SQLite connexions are now all in WAL mode by default (previously, only the writers used WAL mode)
- SQLite connexions are now all in WAL mode by default (previously, only the writers used WAL mode, but this is useless)
- Moved to Python3.9 for internal testing.
- Latest version of BioPython is now supported (1.78)
BUG CORRECTIONS
- When an alignment file is updated in a newer run of RNANet, all the re_mappings are now re-computed
......@@ -19,8 +25,8 @@ BUG CORRECTIONS
- Changed the ownership and permissions of files produced by the Docker container.
They were previously owned by root and the user could not get access to them.
- Modified nucleotides were not always correctly transformed to N in the alignments (and nucleotide.nt_align_code fields).
Now, the alignments and nt_align_code only contain "ACGUN-" chars.
Now, 'N' means 'other', while '-' means 'nothing'.
Now, the alignments and nt_align_code (and consensus) only contain "ACGUN-" chars.
Now, 'N' means 'other', while '-' means 'nothing' or 'unknown'.
COMING SOON
- Automated annotation of detected Recurrent Interaction Networks (RINs), see http://carnaval.lri.fr/ .
......
......@@ -14,7 +14,7 @@ RUN apk update && apk add --no-cache \
py3-matplotlib py3-requests py3-scipy py3-setproctitle py3-sqlalchemy py3-tqdm \
sqlite \
\
&& python3 -m pip install biopython==1.76 pandas psutil pymysql && \
&& python3 -m pip install biopython pandas psutil pymysql && \
\
wget -q -O /etc/apk/keys/sgerrand.rsa.pub https://alpine-pkgs.sgerrand.com/sgerrand.rsa.pub && \
wget https://github.com/sgerrand/alpine-pkg-glibc/releases/download/2.32-r0/glibc-2.32-r0.apk && \
......
......@@ -125,36 +125,6 @@ class SelectivePortionSelector(object):
return 1
class BufferingSummaryInfo(AlignInfo.SummaryInfo):
def get_pssm(self, family, index):
"""Create a position specific score matrix object for the alignment.
This creates a position specific score matrix (pssm) which is an
alternative method to look at a consensus sequence.
Returns:
- A PSSM (position specific score matrix) object.
"""
pssm_info = []
# now start looping through all of the sequences and getting info
for residue_num in tqdm(range(self.alignment.get_alignment_length()), position=index+1, desc=f"Worker {index+1}: Count bases in fam {family}", leave=False):
score_dict = self._get_base_letters("ACGUN")
for record in self.alignment:
this_residue = record.seq[residue_num].upper()
if this_residue not in "-.":
try:
score_dict[this_residue] += 1.0
except KeyError:
# if this_residue in "acgun":
# warn(f"Found {this_residue} in {family} alignment...")
score_dict[this_residue] = 1.0
pssm_info.append(('*', score_dict))
return AlignInfo.PSSM(pssm_info)
class Chain:
"""
The object which stores all our data and the methods to process it.
......@@ -963,7 +933,6 @@ class Pipeline:
# Default options:
self.CRYSTAL_RES = 4.0
self.KEEP_HETATM = False
self.FILL_GAPS = True
self.HOMOLOGY = True
self.USE_KNOWN_ISSUES = True
self.RUN_STATS = False
......@@ -984,11 +953,9 @@ class Pipeline:
setproctitle("RNANet.py process_options()")
try:
opts, _ = getopt.getopt(sys.argv[1:], "r:fhs",
["help", "resolution=", "keep-hetatm=", "from-scratch", "full-inference,"
"fill-gaps=", "3d-folder=", "seq-folder=",
"no-homology", "ignore-issues", "extract", "only=", "all", "no-logs",
"archive", "update-homologous"])
opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=", "seq-folder=", "keep-hetatm=", "only=",
"from-scratch", "full-inference", "no-homology", "ignore-issues", "extract",
"all", "no-logs", "archive", "update-homologous"])
except getopt.GetoptError as err:
print(err)
sys.exit(2)
......@@ -1014,8 +981,6 @@ class Pipeline:
print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.")
print("--keep-hetatm=False\t\t(True | False) Keep ions, waters and ligands in produced mmCIF files. "
"\n\t\t\t\tDoes not affect the descriptors.")
print("--fill-gaps=True\t\t(True | False) Replace gaps in nt_align_code field due to unresolved residues"
"\n\t\t\t\tby the most common nucleotide at this position in the alignment.")
print("--3d-folder=…\t\t\tPath to a folder to store the 3D data files. Subfolders will contain:"
"\n\t\t\t\t\tRNAcifs/\t\tFull structures containing RNA, in mmCIF format"
"\n\t\t\t\t\trna_mapped_to_Rfam/\tExtracted 'pure' RNA chains"
......@@ -1038,7 +1003,7 @@ class Pipeline:
print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &")
sys.exit()
elif opt == '--version':
print("RNANet 1.2, parallelized, Dockerized")
print("RNANet 1.3 beta, parallelized, Dockerized")
sys.exit()
elif opt == "-r" or opt == "--resolution":
assert float(arg) > 0.0 and float(arg) <= 20.0
......@@ -1048,9 +1013,6 @@ class Pipeline:
elif opt == "--keep-hetatm":
assert arg in ["True", "False"]
self.KEEP_HETATM = (arg == "True")
elif opt == "--fill-gaps":
assert arg in ["True", "False"]
self.FILL_GAPS = (arg == "True")
elif opt == "--no-homology":
self.HOMOLOGY = False
elif opt == '--3d-folder':
......@@ -1410,13 +1372,12 @@ class Pipeline:
# Start a process pool to dispatch the RNA families,
# over multiple CPUs (one family by CPU)
# p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=1)
p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
try:
fam_pbar = tqdm(total=len(self.fam_list), desc="RNA families", position=0, leave=True)
# Apply work_pssm_remap to each RNA family
for i, _ in enumerate(p.imap_unordered(partial(work_pssm_remap, fill_gaps=self.FILL_GAPS), self.fam_list, chunksize=1)):
for i, _ in enumerate(p.imap_unordered(work_pssm_remap, self.fam_list, chunksize=1)):
# Everytime the iteration finishes on a family, update the global progress bar over the RNA families
fam_pbar.update(1)
fam_pbar.close()
......@@ -1492,6 +1453,12 @@ class Pipeline:
subprocess.run(["tar", "-C", path_to_3D_data + "/datapoints", "-czf", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", "."])
subprocess.run(["ln", "-s", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", runDir + f"/archive/RNANET_datapoints_latest.tar.gz"])
# gather the alignments
os.makedirs(path_to_seq_data + "realigned/3D_only", exist_ok=True)
subprocess.run(["cp", path_to_seq_data + "realigned/*_3d_only.afa", path_to_seq_data + "realigned/3d_only" ])
subprocess.run(["rm", "-f", runDir + f"/archive/RNANET_alignments_latest.tar.gz"])
subprocess.run(["tar", "-C", path_to_seq_data + "realigned/3d_only" , "-czf", runDir + f"/archive/RNANET_alignments_latest.tar.gz", "."])
def sanitize_database(self):
"""Searches for issues in the database and correct them"""
......@@ -1540,6 +1507,20 @@ class Pipeline:
# for x in r:
# print(x)
# check that filtered alignment have the same length than the number of saved alignment columns for a family
r = sql_ask_database(conn, """select family.rfam_acc, count, ali_filtered_len
FROM family
LEFT JOIN (
SELECT rfam_acc, count(distinct index_ali) as count from align_column where index_ali>0 group by rfam_acc
) AS s ON family.rfam_acc=s.rfam_acc;""")
for f in r:
if f[1] is None or f[2] is None:
warn(f"{f[0]} has incomplete alignement data: {f[1]} alignement columns saved, filtered alignment is of length {f[2]}")
continue
if f[1] != f[2]:
warn(f"{f[0]} has {f[1]} alignement columns saved, but its filtered alignment is of length {f[2]} !")
conn.close()
......@@ -1684,6 +1665,8 @@ def sql_define_tables(conn):
freq_G REAL,
freq_U REAL,
freq_other REAL,
gap_percent REAL,
consensus CHAR(1),
PRIMARY KEY (rfam_acc, index_ali),
FOREIGN KEY(rfam_acc) REFERENCES family(rfam_acc)
);
......@@ -2158,7 +2141,7 @@ def work_prepare_sequences(dl, rfam_acc, chains):
with open(fasta, 'w') as f:
for rec in seqfile:
if rec.id not in doublons:
f.write(rec.format("fasta"))
f.write(format(rec, "fasta"))
# Add the new sequences with previous ones, if any
with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "a") as f:
......@@ -2333,28 +2316,8 @@ def work_realign(rfam_acc):
er.write(f"Failed to realign {rfam_acc} (killed)")
def summarize_position(counts):
""" Counts the number of nucleotides at a given position, given a "column" from a MSA.
"""
# Count modified nucleotides
chars = counts.keys()
known_chars_count = 0
N = 0
for char in chars:
if char in "ACGU":
known_chars_count += counts[char]
if char not in ".-":
N += counts[char] # number of ungapped residues
if N: # prevent division by zero if the column is only gaps
return (counts['A']/N, counts['C']/N, counts['G']/N, counts['U']/N, (N - known_chars_count)/N) # other residues, or consensus (N, K, Y...)
else:
return (0, 0, 0, 0, 0)
@trace_unhandled_exceptions
def work_pssm_remap(f, fill_gaps):
def work_pssm_remap(f):
"""Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family.
This also remaps the 3D object sequence with the aligned sequence in the MSA.
If asked, the 3D object sequence is completed by the consensus nucleotide when one of them is missing.
......@@ -2385,11 +2348,54 @@ def work_pssm_remap(f, fill_gaps):
with open(runDir + "/errors.txt", "a") as errf:
errf.write(f"{f}'s alignment is wrong. Recompute it and retry.\n")
return 1
nseqs = len(align)
ncols = align.get_alignment_length()
# Compute statistics per column
pssm = BufferingSummaryInfo(align).get_pssm(f, thr_idx)
frequencies = [ summarize_position(pssm[i]) for i in range(align.get_alignment_length()) ]
del pssm
pssm_info = np.zeros((6, ncols))
res_index = {'A':0, 'C':1, 'G':2, 'U':3, 'N':4, '-':5}
letters = "ACGUN"
consensus = []
for residue_num in tqdm(range(ncols), position=thr_idx+1, desc=f"Worker {thr_idx+1}: Count bases in fam {f}", leave=False):
# Count the bases (iterate lines)
for record in align:
letter = record.seq[residue_num].upper().replace('.','-')
try:
idx = res_index[letter]
except KeyError:
# warn(f"Unknown residue found in {family} family: {letter}", error=True)
# These are K, R, etc from Rfam. The RNANet sequences provided are pure ACGUN, but not the Rfam ones.
idx = 4 # consider it is N
pssm_info[idx,residue_num] += 1.0
# Get the number of non-gap nucleotides
N = 0
for i in range(5):
N += pssm_info[i,residue_num]
if N>0:
# Divide base counts by number of non-gaps
for i in range(5):
pssm_info[i,residue_num] /= N
# last line is for the gap percentage (Ngaps/Nlines)
pssm_info[5,residue_num] /= nseqs
# Define consensus base for this position:
if pssm_info[5,residue_num] > 0.7:
# gaps are in majority if over 75% (that's my definition)
consensus.append('-')
else:
idx = np.argmax(pssm_info[0:5,residue_num])
if pssm_info[idx, residue_num] > 0.5:
consensus.append(letters[idx])
else:
consensus.append('N')
# At this point, pssm_info is a numpy array containing the PSSM and consensus a list of consensus chars.
##########################################################################################
# Remap sequences of the 3D chains with sequences in the alignment
......@@ -2397,60 +2403,51 @@ def work_pssm_remap(f, fill_gaps):
setproctitle(f"RNAnet.py work_pssm_remap({f}) remap")
# For each sequence, find the right chain and remap chain residues with alignment columns
# For each sequence, remap chain residues with sequence alignment
columns_to_save = set()
re_mappings = []
alilen = align.get_alignment_length()
pbar = tqdm(total=len(chains_ids), position=thr_idx+1, desc=f"Worker {thr_idx+1}: Remap {f} chains", leave=False)
pbar = tqdm(total=nseqs, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Remap {f} chains", leave=False)
pbar.update(0)
for s in align:
if not '[' in s.id: # this is a Rfamseq entry, not a 3D chain
# skip Rfamseq entries
if not '[' in s.id:
continue
# Check if the chain existed before in the database
if s.id in chains_ids:
# a chain object is found in the update, this sequence is new
this_chain = list_of_chains[chains_ids.index(s.id)]
seq_to_align = this_chain.seq_to_align
full_length = this_chain.full_length
db_id = this_chain.db_chain_id
# Get the chain id in the database
conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=10.0)
conn.execute('pragma journal_mode=wal')
db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = '{s.id.split('[')[0]}' AND chain_name = '{s.id.split('-')[1]}' AND rfam_acc = '{f}';")
if len(db_id):
db_id = db_id[0][0]
else:
# it existed in the database before.
# Get the chain id in the database
conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=10.0)
conn.execute('pragma journal_mode=wal')
db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = '{s.id.split('[')[0]}' AND chain_name = '{s.id.split('-')[1]}' AND rfam_acc = '{f}';")
if len(db_id):
db_id = db_id[0][0]
else:
conn.close()
warn(f"Bizarre... sequence {s.id} is not found in the database ! Cannot remap it ! Ignoring...")
pbar.update(1)
continue
seq_to_align = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;")])
full_length = len(seq_to_align)
conn.close()
warn(f"Bizarre... sequence {s.id} is not found in the database ! Cannot remap it ! Ignoring...")
pbar.update(1)
continue
seq_to_align = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;")])
full_length = len(seq_to_align)
conn.close()
# Save colums in the appropriate positions
i = 0 # to iterate the object sequence
j = 0 # to iterate the alignment sequence
while i < full_length and j < alilen:
while i < full_length and j < ncols:
# Here we try to map seq_to_align (the sequence of the 3D chain, including gaps when residues are missing),
# with s.seq, the sequence aligned in the MSA, containing any of ACGU and two types of gaps, - and .
if seq_to_align[i] == s.seq[j].upper(): # alignment and sequence correspond (incl. gaps)
if seq_to_align[i] == s.seq[j].upper(): # alignment and sequence correspond (incl. gaps)
re_mappings.append((db_id, i+1, j+1)) # because index_chain in table nucleotide is in [1,N], we use i+1 and j+1.
columns_to_save.add(j+1) # it's a set, doublons are automaticaly ignored
i += 1
j += 1
elif seq_to_align[i] == '-': # gap in the chain, but not in the aligned sequence
elif seq_to_align[i] == '-': # '-' in the chain, but '.' or letter in the aligned sequence
# search for a gap to the consensus nearby
k = 0 # Search must start at zero to assert the difference comes from '-' in front of '.'
while j+k < alilen and s.seq[j+k] == '.':
while j+k < ncols and s.seq[j+k] == '.':
k += 1
# if found, set j to that position
if j+k < alilen and s.seq[j+k] == '-':
if j+k < ncols and s.seq[j+k] == '-':
re_mappings.append((db_id, i+1, j+k+1))
columns_to_save.add(j+k+1)
i += 1
......@@ -2458,31 +2455,28 @@ def work_pssm_remap(f, fill_gaps):
continue
# if not, take the insertion gap if this is one
if j < alilen and s.seq[j] == '.':
if j < ncols and s.seq[j] == '.':
re_mappings.append((db_id, i+1, j+1))
columns_to_save.add(j+1)
i += 1
j += 1
continue
# else, just mark the gap as unknown (there is an alignment mismatch)
# else, just mark the gap as unknown (there is an alignment mismatch '-' in the 3D facing a letter in the alignment)
re_mappings.append((db_id, i+1, 0))
i += 1
elif s.seq[j] in ['.', '-']: # gap in the alignment, but not in the real chain
j += 1 # ignore the column
else: # sequence mismatch which is not a gap...
print(f"You are never supposed to reach this. Comparing {self.chain_label} in {i} ({self.seq_to_align[i-1:i+2]}) with seq[{j}] ({s.seq[j-3:j+4]}).",
self.seq_to_align, s.seq, sep='\n', flush=True)
print(f"You are never supposed to reach this. Comparing {s.id} in {i} ({seq_to_align[i-1:i+2]}) with seq[{j}] ({s.seq[j-3:j+4]}).",
seq_to_align, s.seq, sep='\n', flush=True)
raise Exception('Something is wrong with sequence alignment.')
pbar.update(1)
pbar.close()
# Check we found something
if not len(re_mappings):
warn(f"Chains were not found in {f}++.afa file: {chains_ids}", error=True)
return 1
# Get a sorted list from the set
columns = sorted(columns_to_save)
##########################################################################################
# Save the alignment columns and their mappings to the database
......@@ -2505,75 +2499,48 @@ def work_pssm_remap(f, fill_gaps):
if col not in columns_to_save:
unused.append((f, col))
sql_execute(conn, """DELETE FROM align_column WHERE rfam_acc = ? AND index_ali = ?;""", many=True, data=unused)
conn.commit()
# Save the useful columns in the database
data = [(f, j) + frequencies[j-1] for j in sorted(columns_to_save)]
sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other)
VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO
UPDATE SET freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, freq_other=excluded.freq_other;""", many=True, data=data)
# Add an unknown values column, with index_ali 0
sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other)
VALUES (?, 0, 0.0, 0.0, 0.0, 0.0, 1.0);""", data=(f,))
data = [(f, j) + tuple(pssm_info[:,j-1]) + (consensus[j-1],) for j in sorted(columns_to_save)]
sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO
UPDATE SET freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U,
freq_other=excluded.freq_other, gap_percent=excluded.gap_percent, consensus=excluded.consensus;""", many=True, data=data)
# Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment)
sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus)
VALUES (?, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-');""", data=(f,))
# Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains)
sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f))
conn.close()
##########################################################################################
# Replacing gaps in the 3D chains by consensus sequences
# Saving the filtered alignement with only the saved positinos
##########################################################################################
setproctitle(f"RNAnet.py work_pssm_remap({f}) replace gaps")
setproctitle(f"RNAnet.py work_pssm_remap({f}) filtering alignment")
# Replace gaps by consensus
if fill_gaps:
pbar = tqdm(total=len(chains_ids), position=thr_idx+1, desc=f"Worker {thr_idx+1}: Replace {f} gaps", leave=False)
pbar.update(0)
gaps = []
conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=10.0)
conn.execute('pragma journal_mode=wal')
for s in align:
if not '[' in s.id: # this is a Rfamseq entry, not a 3D chain
continue
db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = '{s.id.split('[')[0]}' AND chain_name = '{s.id.split('-')[1]}' AND rfam_acc = '{f}';")
if len(db_id):
db_id = db_id[0][0]
else:
pbar.update(1)
continue
seq = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;") ])
aliseq = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;") ])
full_length = len(seq)
# detect gaps
c_seq = list(seq) # contains "ACGUNacgu-"
letters = ['A', 'C', 'G', 'U', 'N']
homology_data = sql_ask_database(conn, f"""SELECT freq_A, freq_C, freq_G, freq_U, freq_other FROM
(SELECT chain_id, rfam_acc FROM chain WHERE chain_id={db_id})
NATURAL JOIN re_mapping
NATURAL JOIN align_column;
""")
if homology_data is None or not len(homology_data):
with open(runDir + "/errors.txt", "a") as errf:
errf.write(f"No homology data found in the database for {s.id} ! Not replacing gaps.\n")
continue
elif len(homology_data) != full_length:
with open(runDir + "/errors.txt", "a") as errf:
errf.write(f"Found {len(homology_data)} nucleotides for {s.id} of length {full_length} ! Not replacing gaps.\n")
continue
for i in range(full_length):
if c_seq[i] == '-':
freq = homology_data[i]
l = letters[freq.index(max(freq))]
gaps.append((l, l == 'A', l == 'C', l == 'G', l == 'U', l == 'N', db_id, i+1))
pbar.update(1)
sql_execute(conn, f"""UPDATE nucleotide SET nt_align_code = ?,
is_A = ?, is_C = ?, is_G = ?, is_U = ?, is_other = ?
WHERE chain_id = ? AND index_chain = ?;""", many=True, data=gaps)
conn.close()
idxQueue.put(thr_idx) # replace the thread index in the queue
# filter the alignment
names = [ x.id for x in align if '[' in x.id ]
align = align[-len(names):]
filtered_alignment = align[:, 1:1] # all the lines, but no columns
for p in columns:
filtered_alignment += align[:, p-1:p] # save columns one by one
setproctitle(f"RNAnet.py work_pssm_remap({f}) finished")
# write it to file in both STK and FASTA formats (STK required for distance matrices in statistics)
with open(path_to_seq_data+f"/realigned/{f}_3d_only.stk", "w") as only_3d:
try:
only_3d.write(format(filtered_alignment, "stockholm"))
except ValueError as e:
warn(e)
with open(path_to_seq_data+f"/realigned/{f}_3d_only.afa", "w") as only_3d:
try:
only_3d.write(format(filtered_alignment, "fasta"))
except ValueError as e:
warn(e)
setproctitle(f"RNAnet.py work_pssm_remap({f}) finished")
idxQueue.put(thr_idx) # replace the thread index in the queue
return 0
......@@ -2587,7 +2554,7 @@ def work_save(c, homology=True):
if homology:
df = pd.read_sql_query(f"""
SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code,
is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, dbn,
is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, dbn,
paired, nb_interact, pair_type_LW, pair_type_DSSR, alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta,
chi, bb_type, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base,
v0, v1, v2, v3, v4, amplitude, phase_angle, puckering FROM
......@@ -2631,9 +2598,9 @@ if __name__ == "__main__":
sql_define_tables(conn)
print("> Storing results into", runDir + "/results/RNANet.db")
# # compute an update compared to what is in the table "chain" (comparison on structure_id + chain_name + rfam_acc).
# # If --all was passed, all the structures are kept.
# # Fills pp.update with Chain() objects.
# compute an update compared to what is in the table "chain" (comparison on structure_id + chain_name + rfam_acc).
# If --all was passed, all the structures are kept.
# Fills pp.update with Chain() objects.
# pp.list_available_mappings()
# ===========================================================================
......@@ -2642,7 +2609,8 @@ if __name__ == "__main__":
# # Download and annotate new RNA 3D chains (Chain objects in pp.update)
# # If the original cif file and/or the Json DSSR annotation file already exist, they are not redownloaded/recomputed.
# pp.dl_and_annotate(coeff_ncores=0.5)
# # pp.dl_and_annotate(coeff_ncores=0.5)
# pp.dl_and_annotate(coeff_ncores=1.0)
# print("Here we go.")
# # At this point, the structure table is up to date.
......@@ -2710,7 +2678,7 @@ if __name__ == "__main__":
# Prepare the results
# ==========================================================================================
pp.sanitize_database()
# pp.sanitize_database()
pp.output_results()
print("Completed.") # This part of the code is supposed to release some serotonin in the modeller's brain, do not remove
......
......@@ -4,7 +4,7 @@
# Run this file if you want the base counts, pair-type counts, identity percents, etc
# in the database.
import getopt, os, pickle, sqlite3, shlex, subprocess, sys
import getopt, os, pickle, sqlite3, shlex, subprocess, sys, warnings
import numpy as np
import pandas as pd
import threading as th
......@@ -16,6 +16,7 @@ import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import squareform
from mpl_toolkits.mplot3d import axes3d
from Bio import AlignIO, SeqIO
from Bio.PDB.MMCIFParser import MMCIFParser
from functools import partial
from multiprocessing import Pool, Manager
from os import path
......@@ -429,11 +430,7 @@ def parallel_stats_pairs(f):
@trace_unhandled_exceptions
def to_id_matrix(f):
"""
Extracts sequences of 3D chains from the family alignments to a distinct STK file,
then runs esl-alipid on it to get an identity matrix.
Side-effect : also produces the 3D_only family alignment as a separate file.
So, we use this function to update 'ali_filtered_length' in the family table.
Runs esl-alipid on the filtered alignment to get an identity matrix.
"""
if path.isfile("data/"+f+".npy"):
return 0
......@@ -444,35 +441,18 @@ def to_id_matrix(f):
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} to_id_matrix({f})")
# Prepare a file
with open(path_to_seq_data+f"/realigned/{f}++.afa") as al_file:
al = AlignIO.read(al_file, "fasta")
names = [ x.id for x in al if '[' in x.id ]
al = al[-len(names):]
with open(path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk", "w") as only_3d:
try:
only_3d.write(al.format("stockholm"))
except ValueError as e:
warn(e)
del al
subprocess.run(["esl-reformat", "--informat", "stockholm", "--mingap", #
"-o", path_to_seq_data+f"/realigned/{f}_3d_only.stk", # This run just deletes columns of gaps
"stockholm", path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk"]) #
subprocess.run(["rm", "-f", f + "_3d_only_tmp.stk", f + "_3d_only.stk"])
subprocess.run(["esl-reformat", "-o", path_to_seq_data+f"/realigned/{f}_3d_only.afa", "afa", path_to_seq_data+f"/realigned/{f}_3d_only.stk"])
# Out-of-scope task : update the database with the length of the filtered alignment:
align = AlignIO.read(path_to_seq_data+f"/realigned/{f}_3d_only.afa", "fasta")
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
conn.execute('pragma journal_mode=wal')
sql_execute(conn, "UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=[align.get_alignment_length(), f])
if not path.isfile(f"{path_to_seq_data}/realigned/{f}_3d_only.stk"):
warn(f"File not found: {path_to_seq_data}/realigned/{f}_3d_only.stk")
align = AlignIO.read(f"{path_to_seq_data}/realigned/{f}_3d_only.stk", "stockholm")
names = [ x.id for x in align if '[' in x.id ]
del align
pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", unit="comparisons", leave=False)
pbar.update(0)
# Prepare the job
process = subprocess.Popen(shlex.split(f"esl-alipid --rna --noheader --informat stockholm {path_to_seq_data}realigned/{f}_3d_only.stk"),
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
process = subprocess.Popen(shlex.split(f"esl-alipid --rna --noheader --informat stockholm {path_to_seq_data}/realigned/{f}_3d_only.stk"), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
id_matrix = np.zeros((len(names), len(names)))
pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", unit="comparisons", leave=False)
cnt = 0
while not cnt or process.poll() is None:
output = process.stdout.read()
......@@ -632,7 +612,6 @@ def stats_pairs():
plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99)
plt.savefig(runDir + f"/results/figures/pairings_{res_thr}.png")
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
notify("Computed nucleotide statistics and saved CSV and PNG file.")
@trace_unhandled_exceptions
......@@ -931,7 +910,141 @@ def general_stats():
hspace=0.05, bottom=0.12, top=0.84)
fig.savefig(runDir + "/results/figures/Nfamilies.png")
plt.close()
def get_matrix_euclidian_distance(cif_file, aligned_seq, consider_all_atoms):
"""
This function
- loads the coordinates and the alignment, reconctructs the alignment but with coordinates, considering gaps, and
- compute the matrix of euclidian distances.
Returns:
The 2D np.array of euclidian distances between pairs of nucleotides, with np.NaNs in gap columns.
"""
# Load the baricenter coordinates
coordinates = nt_3d_centers(cif_file, consider_all_atoms)
# reconstruct the alignment but with coordinates
nb_gap = 0
coordinates_with_gaps = []
for i in range(len(aligned_seq)):
if aligned_seq[i] == '.' or aligned_seq[i] == '-':
nb_gap = nb_gap + 1
coordinates_with_gaps.append('NA')
else:
coordinates_with_gaps.append(coordinates[i - nb_gap])
nb_nucleotides = len(coordinates_with_gaps) # number of nucleotides
matrix = np.zeros((nb_nucleotides, nb_nucleotides)) # create a new empty matrix of size nxn
# Fill this new matrix with the euclidians distances between all amino acids considering gaps:
for i in range(nb_nucleotides):
for j in range(nb_nucleotides):
if coordinates_with_gaps[i] == 'NA' or coordinates_with_gaps[j] == 'NA':
matrix[i][j] = np.nan
else:
matrix[i][j] = round(get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j]),3)
return(matrix)
@trace_unhandled_exceptions
def get_avg_std_distance_matrix(f, consider_all_atoms):
# Get a worker number to position the progress bar
global idxQueue
thr_idx = idxQueue.get()
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} {f} residue distance matrices")
if consider_all_atoms:
label = "base"
else:
label = "backbone"
os.makedirs(runDir + '/results/distance_matrices/' + f + '_' + label, exist_ok=True )
family_matrices = []
align = AlignIO.read(path_to_seq_data + f"realigned/{f}_3d_only.afa", "fasta")
found = 0
notfound = 0
pbar = tqdm(total = len(align), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} {label} distance matrices", unit="chains", leave=False)
pbar.update(0)
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
conn.execute('pragma journal_mode=wal')
r = sql_ask_database(conn, f"SELECT structure_id, '_1_', chain_name, '_', CAST(pdb_start AS TEXT), '-', CAST(pdb_end AS TEXT) FROM chain WHERE rfam_acc='{f}';")
filelist = [ ''.join(list(x))+'.cif' for x in r ]
for s in align:
filename = ''
for file in filelist:
if file.startswith(s.id.replace('-', '').replace('[', '_').replace(']', '_')):
filename = path_to_3D_data + "rna_mapped_to_Rfam/" + file
break
if len(filename):
found += 1
try:
euclidian_distance = get_matrix_euclidian_distance(filename, s.seq, consider_all_atoms)
np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', euclidian_distance, delimiter=",", fmt="%.3f")
family_matrices.append(euclidian_distance)
except FileNotFoundError:
found -= 1
notfound += 1
else:
notfound += 1
pbar.update(1)
# Calculation of the average matrix
avgarray = np.array(family_matrices)
if len(avgarray) == 0 or np.prod(avgarray.shape) == 0:
warn(f"Something's wrong with the shapes: {avgarray.shape}", error=True)
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=RuntimeWarning)
matrix_average_distances = np.nanmean(avgarray, axis=0 )
if len(matrix_average_distances) != 0:
matrix_average_distances = np.nan_to_num(matrix_average_distances)
np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_average.csv' , np.triu(matrix_average_distances), delimiter=",", fmt="%.3f")
fig, ax = plt.subplots()
im = ax.imshow(matrix_average_distances)
cbar = ax.figure.colorbar(im, ax=ax)
cbar.ax.set_ylabel("Angströms", rotation=-90, va="bottom")
ax.set_title("Average distance between residues (Angströms)")
fig.tight_layout()
fig.savefig(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_average.png', dpi=300)
plt.close()
# Calculation of the standard deviation matrix
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=RuntimeWarning)
matrix_standard_deviation_distances = np.nanstd(avgarray, axis=0 )
if len(matrix_standard_deviation_distances) != 0:
matrix_standard_deviation_distances = np.nan_to_num(matrix_standard_deviation_distances)
np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_stdev.csv' , np.triu(matrix_standard_deviation_distances), delimiter=",", fmt="%.3f")
fig, ax = plt.subplots()
im = ax.imshow(matrix_standard_deviation_distances)
cbar = ax.figure.colorbar(im, ax=ax)
cbar.ax.set_ylabel("Angströms", rotation=-90, va="bottom")
ax.set_title("Average distance between residues (Angströms)")
fig.tight_layout()
fig.savefig(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_std.png', dpi=300)
plt.close()
# Save log
with open(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '.log', 'a') as logfile:
logfile.write(str(found)+ " chains taken into account for computation. "+ str(notfound)+ " were not found.\n")
# Save associated nucleotide frequencies (off-topic but convenient to do it here)
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
conn.execute('pragma journal_mode=wal')
df = pd.read_sql_query(f"SELECT freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus FROM align_column WHERE rfam_acc = '{f}' AND index_ali > 0 ORDER BY index_ali ASC;", conn)
df.to_csv(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_frequencies.csv', float_format="%.3f")
pbar.close()
idxQueue.put(thr_idx) # replace the thread index in the queue
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
return 0
def log_to_pbar(pbar):
def update(r):
......@@ -952,6 +1065,48 @@ def family_order(f):
else:
return 2
def nt_3d_centers(cif_file, consider_all_atoms):
"""Return the nucleotides' coordinates, summarizing a nucleotide by only one point.
If consider_all_atoms : barycentre is used
else: C1' atom is the nucleotide
"""
result =[]
structure = MMCIFParser().get_structure(cif_file, cif_file)
if consider_all_atoms == True:
for model in structure:
for chain in model:
for residue in chain:
temp_list = []
res_isobaricentre = 0
for atom in residue:
temp_list.append(atom.get_coord())
lg = len(temp_list)
summ = np.sum(temp_list, axis = 0)
res_isobaricentre = [summ[0]/lg, summ[1]/lg, summ[2]/lg]
result.append([res_isobaricentre[0], res_isobaricentre[1], res_isobaricentre[2]])
elif consider_all_atoms == False:
for model in structure:
for chain in model:
for residue in chain:
for atom in residue:
if atom.get_name() == "C1'":
coordinates = atom.get_coord()
res = [coordinates[0], coordinates[1], coordinates[2]]
result.append(res)
return(result)
def get_euclidian_distance(L1, L2):
"""Returns the distance between two points (coordinates in lists)
"""
e = 0
for i in range(len(L1)):
e += float(L1[i] - L2[i])**2
return np.sqrt(e)
if __name__ == "__main__":
os.makedirs(runDir + "/results/figures/", exist_ok=True)
......@@ -959,8 +1114,9 @@ if __name__ == "__main__":
# parse options
DELETE_OLD_DATA = False
DO_WADLEY_ANALYSIS = False
DO_AVG_DISTANCE_MATRIX = False
try:
opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "from-scratch", "wadley", "resolution=", "3d-folder=", "seq-folder=" ])
opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "from-scratch", "wadley", "distance-matrices", "resolution=", "3d-folder=", "seq-folder=" ])
except getopt.GetoptError as err:
print(err)
sys.exit(2)
......@@ -979,9 +1135,12 @@ if __name__ == "__main__":
print("--seq-folder=…\t\t\tPath to a folder containing the sequence and alignment files. Required subfolder:"
"\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family")
print("--from-scratch\t\t\tDo not use precomputed results from past runs, recompute everything")
print("--distance-matrices\t\tCompute average distance between nucleotide pairs for each family.")
print("--wadley\t\t\tReproduce Wadley & al 2007 clustering of pseudotorsions.")
sys.exit()
elif opt == '--version':
print("RNANet statistics 1.2")
print("RNANet statistics 1.3 beta")
sys.exit()
elif opt == "-r" or opt == "--resolution":
assert float(arg) > 0.0 and float(arg) <= 20.0
......@@ -997,6 +1156,8 @@ if __name__ == "__main__":
elif opt=='--from-scratch':
DELETE_OLD_DATA = True
DO_WADLEY_ANALYSIS = True
elif opt=="--distance-matrices":
DO_AVG_DISTANCE_MATRIX = True
elif opt=='--wadley':
DO_WADLEY_ANALYSIS = True
......@@ -1030,6 +1191,8 @@ if __name__ == "__main__":
subprocess.run(["rm","-f", runDir + f"/data/{f}.npy", runDir + f"/data/{f}_pairs.csv", runDir + f"/data/{f}_counts.csv"])
if DO_WADLEY_ANALYSIS:
subprocess.run(["rm","-f", runDir + f"/data/wadley_kernel_eta_{res_thr}.npz", runDir + f"/data/wadley_kernel_eta_prime_{res_thr}.npz", runDir + f"/data/pair_counts_{res_thr}.csv"])
if DO_AVG_DISTANCE_MATRIX:
subprocess.run(["rm", "-rf", runDir + f"/results/distance_matrices/"])
# Prepare the multiprocessing execution environment
nworkers = min(read_cpu_number()-1, 32)
......@@ -1043,6 +1206,17 @@ if __name__ == "__main__":
if n_unmapped_chains and DO_WADLEY_ANALYSIS:
joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr)))
joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr)))
if DO_AVG_DISTANCE_MATRIX:
extracted_chains = []
for file in os.listdir(path_to_3D_data + "rna_mapped_to_Rfam"):
if os.path.isfile(os.path.join(path_to_3D_data + "rna_mapped_to_Rfam", file)):
e1 = file.split('_')[0]
e2 = file.split('_')[1]
e3 = file.split('_')[2]
extracted_chains.append(e1 + '[' + e2 + ']' + '-' + e3)
for f in famlist:
joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, True)))
joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, False)))
joblist.append(Job(function=stats_len)) # Computes figures
joblist.append(Job(function=stats_freq)) # updates the database
for f in famlist:
......