v1.3 large update

Louis BECQUEY
Commit e74c6fe35285163ef9afd9d1c727f929b25864b0 e74c6fe3 1 parent 838a6824
Showing 4 changed files with 361 additions and 213 deletions
CHANGELOG
Dockerfile
RNAnet.py
statistics.py
--- a/CHANGELOG
View file @e74c6fe
+++ b/CHANGELOG
View file @e74c6fe
- v 1.1 beta, January 2021
+ v 1.3 beta, January 2021
 
 The first uses of RNAnet by people from outside the development team happened between this December.
 A few feedback allowed to identify issues and useful information to add.
 
 FEATURE CHANGES
-     - Sequence alignments of the 3D structures mapped to a family are now provided. 
+     - Sequence alignments of the 3D structures mapped to a family are now provided.
     - Full alignements with Rfam sequences are not provided, but you can ask us for the files.
     - Two new fields in table 'family': ali_length and ali_filtered_length. 
     They are the MSA lengths of the alignment with and without the Rfam sequences. 
+     - Gap replacement by consensus (--fill-gaps) has been removed. Now, the gap percentage and consensus are saved 
+     in the align_column table and the datapoints in CSV format, in separate columns. 
+     Consensus is one of ACGUN-, the gap being chosen if >75% of the sequences are gaps at this position. 
+     Otherwise, A/C/G/U is chosen if >50% of the non-gap positions are A/C/G/U. Otherwise, N is the consensus.
 
 TECHNICAL CHANGES
-     - SQLite connexions are now all in WAL mode by default (previously, only the writers used WAL mode)
+     - SQLite connexions are now all in WAL mode by default (previously, only the writers used WAL mode, but this is useless)
+     - Moved to Python3.9 for internal testing.
+     - Latest version of BioPython is now supported (1.78)
 
 BUG CORRECTIONS
     - When an alignment file is updated in a newer run of RNANet, all the re_mappings are now re-computed 
@@ -19,8 +25,8 @@ BUG CORRECTIONS
     - Changed the ownership and permissions of files produced by the Docker container. 
     They were previously owned by root and the user could not get access to them.
     - Modified nucleotides were not always correctly transformed to N in the alignments (and nucleotide.nt_align_code fields).
-     Now, the alignments and nt_align_code only contain "ACGUN-" chars. 
-     Now, 'N' means 'other', while '-' means 'nothing'.
+     Now, the alignments and nt_align_code (and consensus) only contain "ACGUN-" chars. 
+     Now, 'N' means 'other', while '-' means 'nothing' or 'unknown'.
 
 COMING SOON
     - Automated annotation of detected Recurrent Interaction Networks (RINs), see http://carnaval.lri.fr/ .
--- a/Dockerfile
View file @e74c6fe
+++ b/Dockerfile
View file @e74c6fe
@@ -14,7 +14,7 @@ RUN apk update && apk add --no-cache \
         py3-matplotlib py3-requests py3-scipy py3-setproctitle py3-sqlalchemy py3-tqdm \
         sqlite \
     \
-     && python3 -m pip install biopython==1.76 pandas psutil pymysql && \
+     && python3 -m pip install biopython pandas psutil pymysql && \
     \
     wget -q -O /etc/apk/keys/sgerrand.rsa.pub https://alpine-pkgs.sgerrand.com/sgerrand.rsa.pub && \
     wget https://github.com/sgerrand/alpine-pkg-glibc/releases/download/2.32-r0/glibc-2.32-r0.apk && \
--- a/RNAnet.py
View file @e74c6fe
+++ b/RNAnet.py
View file @e74c6fe
@@ -125,36 +125,6 @@ class SelectivePortionSelector(object):
         return 1
 
 
- class BufferingSummaryInfo(AlignInfo.SummaryInfo):
- 
-     def get_pssm(self, family, index):
-         """Create a position specific score matrix object for the alignment. 
- 
-         This creates a position specific score matrix (pssm) which is an 
-         alternative method to look at a consensus sequence. 
- 
-         Returns: 
-          - A PSSM (position specific score matrix) object. 
-         """
- 
-         pssm_info = []
-         # now start looping through all of the sequences and getting info
-         for residue_num in tqdm(range(self.alignment.get_alignment_length()), position=index+1, desc=f"Worker {index+1}: Count bases in fam {family}", leave=False):
-             score_dict = self._get_base_letters("ACGUN")
-             for record in self.alignment:
-                 this_residue = record.seq[residue_num].upper()
-                 if this_residue not in "-.":
-                     try:
-                         score_dict[this_residue] += 1.0
-                     except KeyError:
-                         # if this_residue in "acgun":
-                         #     warn(f"Found {this_residue} in {family} alignment...")
-                         score_dict[this_residue] = 1.0
-             pssm_info.append(('*', score_dict))
- 
-         return AlignInfo.PSSM(pssm_info)
- 
- 
 class Chain:
     """ 
     The object which stores all our data and the methods to process it.
@@ -963,7 +933,6 @@ class Pipeline:
         # Default options:
         self.CRYSTAL_RES = 4.0
         self.KEEP_HETATM = False
-         self.FILL_GAPS = True
         self.HOMOLOGY = True
         self.USE_KNOWN_ISSUES = True
         self.RUN_STATS = False
@@ -984,11 +953,9 @@ class Pipeline:
         setproctitle("RNANet.py process_options()")
 
         try:
-             opts, _ = getopt.getopt(sys.argv[1:], "r:fhs",
-                                     ["help", "resolution=", "keep-hetatm=", "from-scratch", "full-inference,"
-                                         "fill-gaps=", "3d-folder=", "seq-folder=",
-                                         "no-homology", "ignore-issues", "extract", "only=", "all", "no-logs",
-                                         "archive", "update-homologous"])
+             opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=", "seq-folder=", "keep-hetatm=",  "only=",
+                                                             "from-scratch", "full-inference", "no-homology", "ignore-issues", "extract", 
+                                                             "all", "no-logs", "archive", "update-homologous"])
         except getopt.GetoptError as err:
             print(err)
             sys.exit(2)
@@ -1014,8 +981,6 @@ class Pipeline:
                 print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.")
                 print("--keep-hetatm=False\t\t(True | False) Keep ions, waters and ligands in produced mmCIF files. "
                       "\n\t\t\t\tDoes not affect the descriptors.")
-                 print("--fill-gaps=True\t\t(True | False) Replace gaps in nt_align_code field due to unresolved residues"
-                       "\n\t\t\t\tby the most common nucleotide at this position in the alignment.")
                 print("--3d-folder=…\t\t\tPath to a folder to store the 3D data files. Subfolders will contain:"
                       "\n\t\t\t\t\tRNAcifs/\t\tFull structures containing RNA, in mmCIF format"
                       "\n\t\t\t\t\trna_mapped_to_Rfam/\tExtracted 'pure' RNA chains"
@@ -1038,7 +1003,7 @@ class Pipeline:
                 print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &")
                 sys.exit()
             elif opt == '--version':
-                 print("RNANet 1.2, parallelized, Dockerized")
+                 print("RNANet 1.3 beta, parallelized, Dockerized")
                 sys.exit()
             elif opt == "-r" or opt == "--resolution":
                 assert float(arg) > 0.0 and float(arg) <= 20.0
@@ -1048,9 +1013,6 @@ class Pipeline:
             elif opt == "--keep-hetatm":
                 assert arg in ["True", "False"]
                 self.KEEP_HETATM = (arg == "True")
-             elif opt == "--fill-gaps":
-                 assert arg in ["True", "False"]
-                 self.FILL_GAPS = (arg == "True")
             elif opt == "--no-homology":
                 self.HOMOLOGY = False
             elif opt == '--3d-folder':
@@ -1410,13 +1372,12 @@ class Pipeline:
 
         # Start a process pool to dispatch the RNA families,
         # over multiple CPUs (one family by CPU)
-         # p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=1)
         p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
 
         try:
             fam_pbar = tqdm(total=len(self.fam_list), desc="RNA families", position=0, leave=True)
             # Apply work_pssm_remap to each RNA family
-             for i, _ in enumerate(p.imap_unordered(partial(work_pssm_remap, fill_gaps=self.FILL_GAPS), self.fam_list, chunksize=1)):
+             for i, _ in enumerate(p.imap_unordered(work_pssm_remap, self.fam_list, chunksize=1)):
                 # Everytime the iteration finishes on a family, update the global progress bar over the RNA families
                 fam_pbar.update(1)
             fam_pbar.close()
@@ -1492,6 +1453,12 @@ class Pipeline:
             subprocess.run(["tar", "-C", path_to_3D_data + "/datapoints", "-czf", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", "."])
             subprocess.run(["ln", "-s", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", runDir + f"/archive/RNANET_datapoints_latest.tar.gz"])
 
+             # gather the alignments
+             os.makedirs(path_to_seq_data + "realigned/3D_only", exist_ok=True)
+             subprocess.run(["cp", path_to_seq_data + "realigned/*_3d_only.afa", path_to_seq_data + "realigned/3d_only" ])
+             subprocess.run(["rm", "-f", runDir + f"/archive/RNANET_alignments_latest.tar.gz"])
+             subprocess.run(["tar", "-C", path_to_seq_data + "realigned/3d_only" , "-czf", runDir + f"/archive/RNANET_alignments_latest.tar.gz", "."])
+ 
     def sanitize_database(self):
         """Searches for issues in the database and correct them"""
 
@@ -1540,6 +1507,20 @@ class Pipeline:
             #     for x in r:
             #         print(x)
 
+             # check that filtered alignment have the same length than the number of saved alignment columns for a family
+             r = sql_ask_database(conn, """select family.rfam_acc, count, ali_filtered_len 
+                                           FROM family 
+                                           LEFT JOIN (
+                                               SELECT rfam_acc, count(distinct index_ali) as count from align_column where index_ali>0 group by rfam_acc
+                                           ) AS s ON family.rfam_acc=s.rfam_acc;""")
+             for f in r:
+                 if f[1] is None or f[2] is None:
+                     warn(f"{f[0]} has incomplete alignement data: {f[1]} alignement columns saved, filtered alignment is of length {f[2]}")
+                     continue
+ 
+                 if f[1] != f[2]:
+                     warn(f"{f[0]} has {f[1]} alignement columns saved, but its filtered alignment is of length {f[2]} !")
+ 
         conn.close()
 
 
@@ -1684,6 +1665,8 @@ def sql_define_tables(conn):
                 freq_G          REAL,
                 freq_U          REAL,
                 freq_other      REAL,
+                 gap_percent     REAL,
+                 consensus       CHAR(1),
                 PRIMARY KEY (rfam_acc, index_ali),
                 FOREIGN KEY(rfam_acc) REFERENCES family(rfam_acc)
             );
@@ -2158,7 +2141,7 @@ def work_prepare_sequences(dl, rfam_acc, chains):
                 with open(fasta, 'w') as f:
                     for rec in seqfile:
                         if rec.id not in doublons:
-                             f.write(rec.format("fasta"))
+                             f.write(format(rec, "fasta"))
 
         # Add the new sequences with previous ones, if any
         with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "a") as f:
@@ -2333,28 +2316,8 @@ def work_realign(rfam_acc):
                 er.write(f"Failed to realign {rfam_acc} (killed)")
 
 
- def summarize_position(counts):
-     """ Counts the number of nucleotides at a given position, given a "column" from a MSA.
-     """
- 
-     # Count modified nucleotides
-     chars = counts.keys()
-     known_chars_count = 0
-     N = 0
-     for char in chars:
-         if char in "ACGU":
-             known_chars_count += counts[char]
-         if char not in ".-":
-             N += counts[char]  # number of ungapped residues
- 
-     if N:  # prevent division by zero if the column is only gaps
-         return (counts['A']/N, counts['C']/N, counts['G']/N, counts['U']/N, (N - known_chars_count)/N)  # other residues, or consensus (N, K, Y...)
-     else:
-         return (0, 0, 0, 0, 0)
- 
- 
 @trace_unhandled_exceptions
- def work_pssm_remap(f, fill_gaps):
+ def work_pssm_remap(f):
     """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family.
     This also remaps the 3D object sequence with the aligned sequence in the MSA.
     If asked, the 3D object sequence is completed by the consensus nucleotide when one of them is missing.
@@ -2385,11 +2348,54 @@ def work_pssm_remap(f, fill_gaps):
         with open(runDir + "/errors.txt", "a") as errf:
             errf.write(f"{f}'s alignment is wrong. Recompute it and retry.\n")
         return 1
+     nseqs = len(align)
+     ncols = align.get_alignment_length()
 
     # Compute statistics per column
-     pssm = BufferingSummaryInfo(align).get_pssm(f, thr_idx)
-     frequencies = [ summarize_position(pssm[i]) for i in range(align.get_alignment_length()) ]
-     del pssm
+     pssm_info = np.zeros((6, ncols))
+     res_index = {'A':0, 'C':1, 'G':2, 'U':3, 'N':4, '-':5}
+     letters = "ACGUN"
+     consensus = []
+     
+     for residue_num in tqdm(range(ncols), position=thr_idx+1, desc=f"Worker {thr_idx+1}: Count bases in fam {f}", leave=False):
+         
+         # Count the bases (iterate lines)
+         for record in align:
+             letter = record.seq[residue_num].upper().replace('.','-')
+             try:
+                 idx = res_index[letter]
+             except KeyError:
+                 # warn(f"Unknown residue found in {family} family: {letter}", error=True)
+                 # These are K, R, etc from Rfam. The RNANet sequences provided are pure ACGUN, but not the Rfam ones.
+                 idx = 4 # consider it is N
+             pssm_info[idx,residue_num] += 1.0
+         
+         # Get the number of non-gap nucleotides
+         N = 0
+         for i in range(5):
+             N += pssm_info[i,residue_num]
+ 
+         if N>0:
+             # Divide base counts by number of non-gaps
+             for i in range(5):
+                 pssm_info[i,residue_num] /= N
+         
+         # last line is for the gap percentage (Ngaps/Nlines)
+         pssm_info[5,residue_num] /= nseqs
+ 
+         # Define consensus base for this position:
+         if pssm_info[5,residue_num] > 0.7:
+             # gaps are in majority if over 75% (that's my definition)
+             consensus.append('-')
+         else:
+             idx = np.argmax(pssm_info[0:5,residue_num])
+             if pssm_info[idx, residue_num] > 0.5:
+                 consensus.append(letters[idx])
+             else:
+                 consensus.append('N')
+ 
+     # At this point, pssm_info is a numpy array containing the PSSM and consensus a list of consensus chars.
+ 
 
     ##########################################################################################
     #           Remap sequences of the 3D chains with sequences in the alignment
@@ -2397,60 +2403,51 @@ def work_pssm_remap(f, fill_gaps):
 
     setproctitle(f"RNAnet.py work_pssm_remap({f}) remap")
 
-     # For each sequence, find the right chain and remap chain residues with alignment columns
+     # For each sequence, remap chain residues with sequence alignment 
     columns_to_save = set()
     re_mappings = []
-     alilen = align.get_alignment_length()
-     pbar = tqdm(total=len(chains_ids), position=thr_idx+1, desc=f"Worker {thr_idx+1}: Remap {f} chains", leave=False)
+     pbar = tqdm(total=nseqs, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Remap {f} chains", leave=False)
     pbar.update(0)
     for s in align:
-         if not '[' in s.id:  # this is a Rfamseq entry, not a 3D chain
+         # skip Rfamseq entries
+         if not '[' in s.id:  
             continue
 
-         # Check if the chain existed before in the database
-         if s.id in chains_ids:
-             # a chain object is found in the update, this sequence is new
-             this_chain = list_of_chains[chains_ids.index(s.id)]
-             seq_to_align = this_chain.seq_to_align
-             full_length = this_chain.full_length
-             db_id = this_chain.db_chain_id
+         # Get the chain id in the database
+         conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=10.0)
+         conn.execute('pragma journal_mode=wal')
+         db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = '{s.id.split('[')[0]}' AND chain_name = '{s.id.split('-')[1]}' AND rfam_acc = '{f}';")
+         if len(db_id):
+             db_id = db_id[0][0]
         else:
-             # it existed in the database before.
-             # Get the chain id in the database
-             conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=10.0)
-             conn.execute('pragma journal_mode=wal')
-             db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = '{s.id.split('[')[0]}' AND chain_name = '{s.id.split('-')[1]}' AND rfam_acc = '{f}';")
-             if len(db_id):
-                 db_id = db_id[0][0]
-             else:
-                 conn.close()
-                 warn(f"Bizarre... sequence {s.id} is not found in the database ! Cannot remap it ! Ignoring...")
-                 pbar.update(1)
-                 continue
-             seq_to_align = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;")])
-             full_length = len(seq_to_align)
             conn.close()
+             warn(f"Bizarre... sequence {s.id} is not found in the database ! Cannot remap it ! Ignoring...")
+             pbar.update(1)
+             continue
+         seq_to_align = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;")])
+         full_length = len(seq_to_align)
+         conn.close()
 
         # Save colums in the appropriate positions
         i = 0   # to iterate the object sequence
         j = 0   # to iterate the alignment sequence
-         while i < full_length and j < alilen:
+         while i < full_length and j < ncols:
             # Here we try to map seq_to_align (the sequence of the 3D chain, including gaps when residues are missing),
             # with s.seq, the sequence aligned in the MSA, containing any of ACGU and two types of gaps, - and .
 
-             if seq_to_align[i] == s.seq[j].upper():    # alignment and sequence correspond (incl. gaps)
+             if seq_to_align[i] == s.seq[j].upper():      # alignment and sequence correspond (incl. gaps)
                 re_mappings.append((db_id, i+1, j+1))    # because index_chain in table nucleotide is in [1,N], we use i+1 and j+1.
                 columns_to_save.add(j+1)    # it's a set, doublons are automaticaly ignored
                 i += 1
                 j += 1
-             elif seq_to_align[i] == '-':   # gap in the chain, but not in the aligned sequence
+             elif seq_to_align[i] == '-':   # '-' in the chain, but '.' or letter in the aligned sequence
                 # search for a gap to the consensus nearby
                 k = 0  # Search must start at zero to assert the difference comes from '-' in front of '.'
-                 while j+k < alilen and s.seq[j+k] == '.':
+                 while j+k < ncols and s.seq[j+k] == '.':
                     k += 1
 
                 # if found, set j to that position
-                 if j+k < alilen and s.seq[j+k] == '-':
+                 if j+k < ncols and s.seq[j+k] == '-':
                     re_mappings.append((db_id, i+1, j+k+1))
                     columns_to_save.add(j+k+1)
                     i += 1
@@ -2458,31 +2455,28 @@ def work_pssm_remap(f, fill_gaps):
                     continue
 
                 # if not, take the insertion gap if this is one
-                 if j < alilen and s.seq[j] == '.':
+                 if j < ncols and s.seq[j] == '.':
                     re_mappings.append((db_id, i+1, j+1))
                     columns_to_save.add(j+1)
                     i += 1
                     j += 1
                     continue
 
-                 # else, just mark the gap as unknown (there is an alignment mismatch)
+                 # else, just mark the gap as unknown (there is an alignment mismatch '-' in the 3D facing a letter in the alignment)
                 re_mappings.append((db_id, i+1, 0))
                 i += 1
             elif s.seq[j] in ['.', '-']:  # gap in the alignment, but not in the real chain
                 j += 1  # ignore the column
             else:  # sequence mismatch which is not a gap...
-                 print(f"You are never supposed to reach this. Comparing {self.chain_label} in {i} ({self.seq_to_align[i-1:i+2]}) with seq[{j}] ({s.seq[j-3:j+4]}).",
-                       self.seq_to_align, s.seq, sep='\n', flush=True)
+                 print(f"You are never supposed to reach this. Comparing {s.id} in {i} ({seq_to_align[i-1:i+2]}) with seq[{j}] ({s.seq[j-3:j+4]}).",
+                       seq_to_align, s.seq, sep='\n', flush=True)
                 raise Exception('Something is wrong with sequence alignment.')
 
         pbar.update(1)
     pbar.close()
 
-     # Check we found something
-     if not len(re_mappings):
-         warn(f"Chains were not found in {f}++.afa file: {chains_ids}", error=True)
-         return 1
- 
+     # Get a sorted list from the set
+     columns = sorted(columns_to_save)
 
     ##########################################################################################
     #           Save the alignment columns and their mappings to the database
@@ -2505,75 +2499,48 @@ def work_pssm_remap(f, fill_gaps):
         if col not in columns_to_save:
             unused.append((f, col))
     sql_execute(conn, """DELETE FROM align_column WHERE rfam_acc = ? AND index_ali = ?;""", many=True, data=unused)
+     conn.commit()
+ 
     # Save the useful columns in the database
-     data = [(f, j) + frequencies[j-1] for j in sorted(columns_to_save)]
-     sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other)
-                          VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO 
-                          UPDATE SET freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, freq_other=excluded.freq_other;""", many=True, data=data)
-     # Add an unknown values column, with index_ali 0
-     sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other)
-                           VALUES (?, 0, 0.0, 0.0, 0.0, 0.0, 1.0);""", data=(f,))
+     data = [(f, j) + tuple(pssm_info[:,j-1]) + (consensus[j-1],) for j in sorted(columns_to_save)]
+     sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus)
+                          VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO 
+                          UPDATE SET freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, 
+                                     freq_other=excluded.freq_other, gap_percent=excluded.gap_percent, consensus=excluded.consensus;""", many=True, data=data)
+     # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment)
+     sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus)
+                           VALUES (?, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-');""", data=(f,))
     # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains)
     sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f))
     conn.close()
 
     ##########################################################################################
-     #           Replacing gaps in the 3D chains by consensus sequences
+     #               Saving the filtered alignement with only the saved positinos
     ##########################################################################################
 
-     setproctitle(f"RNAnet.py work_pssm_remap({f}) replace gaps")
+     setproctitle(f"RNAnet.py work_pssm_remap({f}) filtering alignment")
 
-     # Replace gaps by consensus
-     if fill_gaps:
-         pbar = tqdm(total=len(chains_ids), position=thr_idx+1, desc=f"Worker {thr_idx+1}: Replace {f} gaps", leave=False)
-         pbar.update(0)
-         gaps = []
-         conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=10.0)
-         conn.execute('pragma journal_mode=wal')
-         for s in align:
-             if not '[' in s.id:  # this is a Rfamseq entry, not a 3D chain
-                 continue
-     
-             db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = '{s.id.split('[')[0]}' AND chain_name = '{s.id.split('-')[1]}' AND rfam_acc = '{f}';")
-             if len(db_id):
-                 db_id = db_id[0][0]
-             else:
-                 pbar.update(1)
-                 continue
-             seq = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;") ])
-             aliseq = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;") ])
-             full_length = len(seq)
- 
-             # detect gaps
-             c_seq = list(seq)  # contains "ACGUNacgu-"
-             letters = ['A', 'C', 'G', 'U', 'N']
-             homology_data = sql_ask_database(conn, f"""SELECT freq_A, freq_C, freq_G, freq_U, freq_other FROM
-                                                     (SELECT chain_id, rfam_acc FROM chain WHERE chain_id={db_id})
-                                                     NATURAL JOIN re_mapping
-                                                     NATURAL JOIN align_column;
-                                                 """)
-             if homology_data is None or not len(homology_data):
-                 with open(runDir + "/errors.txt", "a") as errf:
-                     errf.write(f"No homology data found in the database for {s.id} ! Not replacing gaps.\n")
-                 continue
-             elif len(homology_data) != full_length:
-                 with open(runDir + "/errors.txt", "a") as errf:
-                     errf.write(f"Found {len(homology_data)} nucleotides for {s.id} of length {full_length} ! Not replacing gaps.\n")
-                 continue
-             for i in range(full_length):
-                 if c_seq[i] == '-':
-                     freq = homology_data[i]
-                     l = letters[freq.index(max(freq))]
-                     gaps.append((l, l == 'A', l == 'C', l == 'G', l == 'U', l == 'N', db_id, i+1))
-             pbar.update(1)
-         sql_execute(conn, f"""UPDATE nucleotide SET nt_align_code = ?, 
-                               is_A = ?, is_C = ?, is_G = ?, is_U = ?, is_other = ?
-                               WHERE chain_id = ? AND index_chain = ?;""", many=True, data=gaps)
-         conn.close()
-     idxQueue.put(thr_idx)  # replace the thread index in the queue
+     # filter the alignment
+     names = [ x.id for x in align if '[' in x.id ]
+     align = align[-len(names):]
+     filtered_alignment = align[:, 1:1] # all the lines, but no columns
+     for p in columns: 
+         filtered_alignment += align[:, p-1:p] # save columns one by one
 
-     setproctitle(f"RNAnet.py work_pssm_remap({f}) finished")
+     # write it to file in both STK and FASTA formats (STK required for distance matrices in statistics)
+     with open(path_to_seq_data+f"/realigned/{f}_3d_only.stk", "w") as only_3d:
+         try:
+             only_3d.write(format(filtered_alignment, "stockholm"))
+         except ValueError as e:
+             warn(e)
+     with open(path_to_seq_data+f"/realigned/{f}_3d_only.afa", "w") as only_3d:
+         try:
+             only_3d.write(format(filtered_alignment, "fasta"))
+         except ValueError as e:
+             warn(e)
 
+     setproctitle(f"RNAnet.py work_pssm_remap({f}) finished")
+     idxQueue.put(thr_idx) # replace the thread index in the queue
     return 0
 
 
@@ -2587,7 +2554,7 @@ def work_save(c, homology=True):
     if homology:
         df = pd.read_sql_query(f"""
                 SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, 
-                 is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, dbn,
+                 is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, dbn,
                 paired, nb_interact, pair_type_LW, pair_type_DSSR, alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta,
                 chi, bb_type, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base,
                 v0, v1, v2, v3, v4, amplitude, phase_angle, puckering FROM 
@@ -2631,9 +2598,9 @@ if __name__ == "__main__":
         sql_define_tables(conn)
     print("> Storing results into", runDir + "/results/RNANet.db")
 
-     # # compute an update compared to what is in the table "chain" (comparison on structure_id + chain_name + rfam_acc).
-     # # If --all was passed, all the structures are kept.
-     # # Fills pp.update with Chain() objects.
+     # compute an update compared to what is in the table "chain" (comparison on structure_id + chain_name + rfam_acc).
+     # If --all was passed, all the structures are kept.
+     # Fills pp.update with Chain() objects.
     # pp.list_available_mappings()
 
     # ===========================================================================
@@ -2642,7 +2609,8 @@ if __name__ == "__main__":
 
     # # Download and annotate new RNA 3D chains (Chain objects in pp.update)
     # # If the original cif file and/or the Json DSSR annotation file already exist, they are not redownloaded/recomputed.
-     # pp.dl_and_annotate(coeff_ncores=0.5)
+     # # pp.dl_and_annotate(coeff_ncores=0.5)
+     # pp.dl_and_annotate(coeff_ncores=1.0)
     # print("Here we go.")
 
     # # At this point, the structure table is up to date.
@@ -2710,7 +2678,7 @@ if __name__ == "__main__":
     # Prepare the results
     # ==========================================================================================
 
-     pp.sanitize_database()
+     # pp.sanitize_database()
     pp.output_results()
 
     print("Completed.") # This part of the code is supposed to release some serotonin in the modeller's brain, do not remove
--- a/statistics.py
View file @e74c6fe
+++ b/statistics.py
View file @e74c6fe
@@ -4,7 +4,7 @@
 # Run this file if you want the base counts, pair-type counts, identity percents, etc
 # in the database.
 
- import getopt, os, pickle, sqlite3, shlex, subprocess, sys
+ import getopt, os, pickle, sqlite3, shlex, subprocess, sys, warnings
 import numpy as np
 import pandas as pd
 import threading as th
@@ -16,6 +16,7 @@ import scipy.cluster.hierarchy as sch
 from scipy.spatial.distance import squareform
 from mpl_toolkits.mplot3d import axes3d
 from Bio import AlignIO, SeqIO
+ from Bio.PDB.MMCIFParser import MMCIFParser
 from functools import partial
 from multiprocessing import Pool, Manager
 from os import path
@@ -429,11 +430,7 @@ def parallel_stats_pairs(f):
 @trace_unhandled_exceptions
 def to_id_matrix(f):
     """
-     Extracts sequences of 3D chains from the family alignments to a distinct STK file,
-     then runs esl-alipid on it to get an identity matrix.
- 
-     Side-effect : also produces the 3D_only family alignment as a separate file. 
-     So, we use this function to update 'ali_filtered_length' in the family table.
+     Runs esl-alipid on the filtered alignment to get an identity matrix.
     """
     if path.isfile("data/"+f+".npy"):
         return 0
@@ -444,35 +441,18 @@ def to_id_matrix(f):
 
     setproctitle(f"RNANet statistics.py Worker {thr_idx+1} to_id_matrix({f})")
 
-     # Prepare a file
-     with open(path_to_seq_data+f"/realigned/{f}++.afa") as al_file:
-         al = AlignIO.read(al_file, "fasta")
-         names = [ x.id for x in al if '[' in x.id ]
-         al = al[-len(names):]
-     with open(path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk", "w") as only_3d:
-         try:
-             only_3d.write(al.format("stockholm"))
-         except ValueError as e:
-             warn(e)
-     del al
-     subprocess.run(["esl-reformat", "--informat", "stockholm", "--mingap",              #
-                     "-o", path_to_seq_data+f"/realigned/{f}_3d_only.stk",               # This run just deletes columns of gaps
-                     "stockholm",  path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk"])  #
-     subprocess.run(["rm", "-f", f + "_3d_only_tmp.stk", f + "_3d_only.stk"])
-     subprocess.run(["esl-reformat", "-o", path_to_seq_data+f"/realigned/{f}_3d_only.afa", "afa", path_to_seq_data+f"/realigned/{f}_3d_only.stk"])
- 
-     # Out-of-scope task : update the database with the length of the filtered alignment:
-     align = AlignIO.read(path_to_seq_data+f"/realigned/{f}_3d_only.afa", "fasta")
-     with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
-         conn.execute('pragma journal_mode=wal')
-         sql_execute(conn, "UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=[align.get_alignment_length(), f])
+     if not path.isfile(f"{path_to_seq_data}/realigned/{f}_3d_only.stk"):
+         warn(f"File not found: {path_to_seq_data}/realigned/{f}_3d_only.stk")
+     align = AlignIO.read(f"{path_to_seq_data}/realigned/{f}_3d_only.stk", "stockholm")
+     names = [ x.id for x in align if '[' in x.id ]
     del align
- 
+     
+     pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", unit="comparisons", leave=False)
+     pbar.update(0)
+     
     # Prepare the job
-     process = subprocess.Popen(shlex.split(f"esl-alipid --rna --noheader --informat stockholm {path_to_seq_data}realigned/{f}_3d_only.stk"), 
-                                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+     process = subprocess.Popen(shlex.split(f"esl-alipid --rna --noheader --informat stockholm {path_to_seq_data}/realigned/{f}_3d_only.stk"), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     id_matrix = np.zeros((len(names), len(names)))
-     pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", unit="comparisons", leave=False)
     cnt = 0
     while not cnt or process.poll() is None:
         output = process.stdout.read()
@@ -632,7 +612,6 @@ def stats_pairs():
     plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99)
     plt.savefig(runDir + f"/results/figures/pairings_{res_thr}.png")
 
-     setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
     notify("Computed nucleotide statistics and saved CSV and PNG file.")
 
 @trace_unhandled_exceptions
@@ -931,7 +910,141 @@ def general_stats():
                         hspace=0.05, bottom=0.12, top=0.84)
     fig.savefig(runDir + "/results/figures/Nfamilies.png")
     plt.close()
+ 
+ def get_matrix_euclidian_distance(cif_file, aligned_seq, consider_all_atoms):
+     """
+     This function
+     - loads the coordinates and the alignment, reconctructs the alignment but with coordinates, considering gaps, and 
+     - compute the matrix of euclidian distances.
+ 
+     Returns:
+     The 2D np.array of euclidian distances between pairs of nucleotides, with np.NaNs in gap columns.
+     """
+     # Load the baricenter coordinates
+     coordinates = nt_3d_centers(cif_file, consider_all_atoms) 
+ 
+     # reconstruct the alignment but with coordinates
+     nb_gap = 0
+     coordinates_with_gaps = []
+     for i in range(len(aligned_seq)):
+         if aligned_seq[i] == '.' or  aligned_seq[i] == '-':
+             nb_gap = nb_gap + 1
+             coordinates_with_gaps.append('NA')
+         else:
+             coordinates_with_gaps.append(coordinates[i - nb_gap])
+    
+     nb_nucleotides = len(coordinates_with_gaps)  # number of nucleotides
+     matrix = np.zeros((nb_nucleotides, nb_nucleotides))  # create a new empty matrix of size nxn
+     
+     # Fill this new matrix with the euclidians distances between all amino acids considering gaps:
+     for i in range(nb_nucleotides):
+         for j in range(nb_nucleotides):
+             if coordinates_with_gaps[i] == 'NA' or coordinates_with_gaps[j] == 'NA':
+                 matrix[i][j] = np.nan
+             else:
+                 matrix[i][j] = round(get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j]),3)
+     return(matrix)
+ 
+ @trace_unhandled_exceptions
+ def get_avg_std_distance_matrix(f, consider_all_atoms):
+     # Get a worker number to position the progress bar
+     global idxQueue
+     thr_idx = idxQueue.get()
+ 
+     setproctitle(f"RNANet statistics.py Worker {thr_idx+1} {f} residue distance matrices")
+ 
+     if consider_all_atoms:
+         label = "base"
+     else:
+         label = "backbone"
+ 
+     os.makedirs(runDir + '/results/distance_matrices/' + f + '_' + label, exist_ok=True )   
+ 
+    
+     family_matrices = []
+     align = AlignIO.read(path_to_seq_data + f"realigned/{f}_3d_only.afa", "fasta")
+     found = 0
+     notfound = 0
+     pbar = tqdm(total = len(align), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} {label} distance matrices", unit="chains", leave=False)
+     pbar.update(0)
+     with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
+         conn.execute('pragma journal_mode=wal')
+         r = sql_ask_database(conn, f"SELECT structure_id, '_1_', chain_name, '_', CAST(pdb_start AS TEXT), '-', CAST(pdb_end AS TEXT) FROM chain WHERE rfam_acc='{f}';")
+         filelist = [ ''.join(list(x))+'.cif' for x in r ]
+     
+     for s in align:
+         filename = ''
+         for file in filelist:
+             if file.startswith(s.id.replace('-', '').replace('[', '_').replace(']', '_')):
+                 filename = path_to_3D_data + "rna_mapped_to_Rfam/" + file
+                 break
+         if len(filename):
+             found += 1
+             try:
+                 euclidian_distance = get_matrix_euclidian_distance(filename, s.seq, consider_all_atoms)
+                 np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', euclidian_distance, delimiter=",", fmt="%.3f")
+                 family_matrices.append(euclidian_distance)
+             except FileNotFoundError:
+                 found -= 1
+                 notfound += 1
+         else:
+             notfound += 1
+         pbar.update(1)
+     
+     # Calculation of the average matrix
+     avgarray = np.array(family_matrices)
+     if len(avgarray) == 0 or np.prod(avgarray.shape) == 0: 
+         warn(f"Something's wrong with the shapes: {avgarray.shape}", error=True)
+     with warnings.catch_warnings():
+         warnings.simplefilter("ignore", category=RuntimeWarning)
+         matrix_average_distances = np.nanmean(avgarray, axis=0 )
+     
+     if len(matrix_average_distances) != 0:
+         matrix_average_distances = np.nan_to_num(matrix_average_distances)
+         np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_average.csv' , np.triu(matrix_average_distances), delimiter=",", fmt="%.3f")
+     
+     fig, ax = plt.subplots()
+     im = ax.imshow(matrix_average_distances)
+     cbar = ax.figure.colorbar(im, ax=ax)
+     cbar.ax.set_ylabel("Angströms", rotation=-90, va="bottom")
+     ax.set_title("Average distance between residues (Angströms)")
+     fig.tight_layout()
+     fig.savefig(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_average.png', dpi=300)
+     plt.close()
+ 
+     # Calculation of the standard deviation matrix
+     with warnings.catch_warnings():
+         warnings.simplefilter("ignore", category=RuntimeWarning)
+         matrix_standard_deviation_distances = np.nanstd(avgarray, axis=0 )
+ 
+     if len(matrix_standard_deviation_distances) != 0:
+         matrix_standard_deviation_distances = np.nan_to_num(matrix_standard_deviation_distances)
+         np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_stdev.csv' , np.triu(matrix_standard_deviation_distances), delimiter=",", fmt="%.3f")
+     
+     fig, ax = plt.subplots()
+     im = ax.imshow(matrix_standard_deviation_distances)
+     cbar = ax.figure.colorbar(im, ax=ax)
+     cbar.ax.set_ylabel("Angströms", rotation=-90, va="bottom")
+     ax.set_title("Average distance between residues (Angströms)")
+     fig.tight_layout()
+     fig.savefig(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_std.png', dpi=300)
+     plt.close()
+ 
+     # Save log
+     with open(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '.log', 'a') as logfile:
+         logfile.write(str(found)+ " chains taken into account for computation. "+ str(notfound)+ " were not found.\n")
+ 
+     # Save associated nucleotide frequencies (off-topic but convenient to do it here)
+     with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
+         conn.execute('pragma journal_mode=wal')
+         df = pd.read_sql_query(f"SELECT freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus FROM align_column WHERE rfam_acc = '{f}' AND index_ali > 0 ORDER BY index_ali ASC;", conn)
+         df.to_csv(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_frequencies.csv', float_format="%.3f")
+ 
+     pbar.close()
+ 
+     idxQueue.put(thr_idx) # replace the thread index in the queue
     setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
+     return 0
 
 def log_to_pbar(pbar):
     def update(r):
@@ -952,6 +1065,48 @@ def family_order(f):
     else:
         return 2
 
+ def nt_3d_centers(cif_file, consider_all_atoms):
+     """Return the nucleotides' coordinates, summarizing a nucleotide by only one point.
+     If consider_all_atoms : barycentre is used
+     else: C1' atom is the nucleotide
+     """
+     result  =[]
+     structure = MMCIFParser().get_structure(cif_file, cif_file)
+     
+     if consider_all_atoms == True:
+         for model in structure:
+             for chain in model:
+                 for residue in chain:
+                     temp_list = []
+                     res_isobaricentre = 0
+                     for atom in residue:
+                         temp_list.append(atom.get_coord())
+                     lg = len(temp_list)
+                     
+                     summ = np.sum(temp_list, axis = 0)
+                     res_isobaricentre = [summ[0]/lg, summ[1]/lg, summ[2]/lg]
+                     result.append([res_isobaricentre[0], res_isobaricentre[1], res_isobaricentre[2]])
+      
+     elif consider_all_atoms == False:
+         for model in structure:
+             for chain in model:
+                 for residue in chain:
+                     for atom in residue:
+                         if atom.get_name() == "C1'":
+                             coordinates = atom.get_coord()
+                             res = [coordinates[0], coordinates[1], coordinates[2]]
+                             result.append(res)
+     return(result)
+ 
+ def get_euclidian_distance(L1, L2):
+     """Returns the distance between two points (coordinates in lists)
+     """
+ 
+     e = 0
+     for i in range(len(L1)):
+         e += float(L1[i] - L2[i])**2
+     return np.sqrt(e)
+ 
 if __name__ == "__main__":
 
     os.makedirs(runDir + "/results/figures/", exist_ok=True)
@@ -959,8 +1114,9 @@ if __name__ == "__main__":
     # parse options
     DELETE_OLD_DATA = False
     DO_WADLEY_ANALYSIS = False
+     DO_AVG_DISTANCE_MATRIX = False
     try:
-         opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "from-scratch", "wadley", "resolution=", "3d-folder=", "seq-folder=" ])
+         opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "from-scratch", "wadley", "distance-matrices", "resolution=", "3d-folder=", "seq-folder=" ])
     except getopt.GetoptError as err:
         print(err)
         sys.exit(2)
@@ -979,9 +1135,12 @@ if __name__ == "__main__":
             print("--seq-folder=…\t\t\tPath to a folder containing the sequence and alignment files. Required subfolder:"
                     "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family")
             print("--from-scratch\t\t\tDo not use precomputed results from past runs, recompute everything")
+             print("--distance-matrices\t\tCompute average distance between nucleotide pairs for each family.")
+             print("--wadley\t\t\tReproduce Wadley & al 2007 clustering of pseudotorsions.")
+ 
             sys.exit()
         elif opt == '--version':
-             print("RNANet statistics 1.2")
+             print("RNANet statistics 1.3 beta")
             sys.exit()
         elif opt == "-r" or opt == "--resolution":
             assert float(arg) > 0.0 and float(arg) <= 20.0 
@@ -997,6 +1156,8 @@ if __name__ == "__main__":
         elif opt=='--from-scratch':
             DELETE_OLD_DATA = True
             DO_WADLEY_ANALYSIS = True
+         elif opt=="--distance-matrices":
+             DO_AVG_DISTANCE_MATRIX = True
         elif opt=='--wadley':
             DO_WADLEY_ANALYSIS = True
     
@@ -1030,6 +1191,8 @@ if __name__ == "__main__":
             subprocess.run(["rm","-f", runDir + f"/data/{f}.npy", runDir + f"/data/{f}_pairs.csv", runDir + f"/data/{f}_counts.csv"])
         if DO_WADLEY_ANALYSIS:
             subprocess.run(["rm","-f", runDir + f"/data/wadley_kernel_eta_{res_thr}.npz", runDir + f"/data/wadley_kernel_eta_prime_{res_thr}.npz", runDir + f"/data/pair_counts_{res_thr}.csv"])
+         if DO_AVG_DISTANCE_MATRIX:
+             subprocess.run(["rm", "-rf", runDir + f"/results/distance_matrices/"])
 
     # Prepare the multiprocessing execution environment
     nworkers = min(read_cpu_number()-1, 32)
@@ -1043,6 +1206,17 @@ if __name__ == "__main__":
     if n_unmapped_chains and DO_WADLEY_ANALYSIS:
         joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr)))
         joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr)))
+     if DO_AVG_DISTANCE_MATRIX:
+         extracted_chains = []
+         for file in os.listdir(path_to_3D_data + "rna_mapped_to_Rfam"):
+             if os.path.isfile(os.path.join(path_to_3D_data + "rna_mapped_to_Rfam", file)):
+                 e1 = file.split('_')[0]
+                 e2 = file.split('_')[1]
+                 e3 = file.split('_')[2]
+                 extracted_chains.append(e1 + '[' + e2 + ']' + '-' + e3)
+         for f in famlist:
+             joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, True)))
+             joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, False)))
     joblist.append(Job(function=stats_len)) # Computes figures
     joblist.append(Job(function=stats_freq)) # updates the database
     for f in famlist: