Command line options

Louis BECQUEY
Commit cc198903e521969ff58d81184ac6b708eb2b7aef cc198903 1 parent 39010e76
Showing 1 changed file with 150 additions and 64 deletions
RNAnet.py
--- a/RNAnet.py
View file @cc19890
+++ b/RNAnet.py
View file @cc19890
 #!/usr/bin/python3.8
 import numpy as np
 import pandas as pd
- import concurrent.futures, Bio.PDB.StructureBuilder, copy, gzip, io, json, os, psutil, re, requests, sqlalchemy, subprocess, sys, time, warnings
+ import concurrent.futures, Bio.PDB.StructureBuilder, copy, getopt, gzip, io, json, os, psutil, re, requests, sqlalchemy, subprocess, sys, time, warnings
 from Bio import AlignIO, SeqIO
 from Bio.PDB import MMCIFParser
 from Bio.PDB.mmcifio import MMCIFIO
@@ -11,7 +11,7 @@ from Bio._py3k import urlcleanup as _urlcleanup
 from Bio.Alphabet import generic_rna
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
- from Bio.Align import MultipleSeqAlignment
+ from Bio.Align import MultipleSeqAlignment, AlignInfo
 from collections import OrderedDict
 from functools import partial
 from os import path, makedirs
@@ -20,33 +20,24 @@ from time import sleep
 from tqdm import tqdm
 from tqdm.contrib.concurrent import process_map
 
- if path.isdir("/home/ubuntu/"): # this is the IFB-core cloud
-     path_to_3D_data = "/mnt/Data/RNA/3D/"
-     path_to_seq_data = "/mnt/Data/RNA/sequences/"
- elif path.isdir("/home/persalteas"): # this is my personal workstation
-     path_to_3D_data = "/home/persalteas/Data/RNA/3D/"
-     path_to_seq_data = "/home/persalteas/Data/RNA/sequences/"
- elif path.isdir("/home/lbecquey"): # this is the IBISC server
-     path_to_3D_data = "/home/lbecquey/Data/RNA/3D/"
-     path_to_seq_data = "/home/lbecquey/Data/RNA/sequences/"
- elif path.isdir("/nhome/siniac/lbecquey"): # this is the office PC
-     path_to_3D_data = "/nhome/siniac/lbecquey/Data/RNA/3D/"
-     path_to_seq_data = "/nhome/siniac/lbecquey/Data/RNA/sequences/"
- else:
-     print("I don't know that machine... I'm shy, maybe you should introduce yourself ?")
-     exit(1)
- 
 m = Manager()
 running_stats = m.list()
 running_stats.append(0) # n_launched
 running_stats.append(0) # n_finished
 running_stats.append(0) # n_skipped
 runDir = path.dirname(path.realpath(__file__))
- 
+ path_to_3D_data = "tobedefinedbyoptions"
+ path_to_seq_data = "tobedefinedbyoptions"
 validsymb = '\U00002705'
 warnsymb = '\U000026A0'
 errsymb = '\U0000274C'
 
+ # Default options:
+ CRYSTAL_RES = "4.0"
+ KEEP_HETATM = False
+ FILL_GAPS = True 
+ HOMOLOGY = True
+ 
 class NtPortionSelector(object):
     """Class passed to MMCIFIO to select some chain portions in an MMCIF file.
 
@@ -71,9 +62,9 @@ class NtPortionSelector(object):
 
         # Refuse waters and magnesium ions
         if hetatm_flag in ["W", "H_MG"]:
-             return 0         
+             return int(KEEP_HETATM)      
 
-         # I don't really know what this is but the doc said:          
+         # I don't really know what this is but the doc said to warn:          
         if icode != " ":
             warn(f"icode {icode} at position {resseq}\t\t")
 
@@ -89,6 +80,35 @@ class NtPortionSelector(object):
         # Accept all atoms otherwise.
         return 1
 
+ class BufferingSummaryInfo(AlignInfo.SummaryInfo):
+ 
+     def get_pssm(self, family, index): 
+         """Create a position specific score matrix object for the alignment. 
+  
+         This creates a position specific score matrix (pssm) which is an 
+         alternative method to look at a consensus sequence. 
+  
+         Returns: 
+          - A PSSM (position specific score matrix) object. 
+         """ 
+ 
+         pssm_info = [] 
+         # now start looping through all of the sequences and getting info 
+         for residue_num in tqdm(range(self.alignment.get_alignment_length()), position=index+1, desc=f"Worker {index+1}: {family}", leave=False): 
+             score_dict = self._get_base_letters("ACGUN") 
+             for record in self.alignment: 
+                 this_residue = record.seq[residue_num] 
+                 if this_residue not in "-.": 
+                     try:
+                         score_dict[this_residue] += 1.0 
+                     except KeyError:
+                         if this_residue in "acgun":
+                             warn(f"Found {this_residue} in {family} alignment...")
+                         score_dict[this_residue] = 1.0
+             pssm_info.append(('*', score_dict)) 
+    
+         return AlignInfo.PSSM(pssm_info) 
+ 
 
 class Chain:
     """ The object which stores all our data and the methods to process it.
@@ -448,18 +468,19 @@ class Chain:
                 exit(1)
 
         # Replace gapped positions by the consensus sequence:
-         c_aligned_seq = list(self.aligned_seq)
-         c_seq = list(self.seq)
-         letters = ['A', 'C', 'G', 'U', 'N']
-         for i in range(self.full_length):
-             if c_aligned_seq[i] == '-':      # (then c_seq[i] also is)
-                 freq = self.frequencies[:,i]
-                 l = letters[freq.tolist().index(max(freq))]
-                 c_aligned_seq[i] = l
-                 c_seq[i] = l
-                 self.data3D.iloc[i,3] = l # self.data3D['nt_code'][i]
-         self.aligned_seq = ''.join(c_aligned_seq)
-         self.seq = ''.join(c_seq)
+         if FILL_GAPS:
+             c_aligned_seq = list(self.aligned_seq)
+             c_seq = list(self.seq)
+             letters = ['A', 'C', 'G', 'U', 'N']
+             for i in range(self.full_length):
+                 if c_aligned_seq[i] == '-':      # (then c_seq[i] also is)
+                     freq = self.frequencies[:,i]
+                     l = letters[freq.tolist().index(max(freq))]
+                     c_aligned_seq[i] = l
+                     c_seq[i] = l
+                     self.data3D.iloc[i,3] = l # self.data3D['nt_code'][i]
+             self.aligned_seq = ''.join(c_aligned_seq)
+             self.seq = ''.join(c_seq)
 
         # Temporary np array to store the computations
         point = np.zeros((11, self.full_length))
@@ -1016,11 +1037,11 @@ def download_BGSU_NR_list():
     Does not remove structural redundancy. Resolution threshold used is 4 Angströms.
     """
 
-     print("> Fetching latest NR list from BGSU website...", end='', flush=True)
+     print(f"> Fetching latest list of RNA files at {CRYSTAL_RES} A resolution from BGSU website...", end='', flush=True)
     # Download latest BGSU non-redundant list
     try:
-         s = requests.get("http://rna.bgsu.edu/rna3dhub/nrlist/download/current/4.0A/csv").content
-         nr = open(path_to_3D_data + "latest_nr_list.csv", 'w')
+         s = requests.get(f"http://rna.bgsu.edu/rna3dhub/nrlist/download/current/{CRYSTAL_RES}A/csv").content
+         nr = open(path_to_3D_data + f"latest_nr_list_{CRYSTAL_RES}A.csv", 'w')
         nr.write("class,representative,class_members\n")
         nr.write(io.StringIO(s.decode('utf-8')).getvalue())
         nr.close()
@@ -1028,12 +1049,12 @@ def download_BGSU_NR_list():
         warn("Error downloading NR list !\t", error=True)
 
         # Try to read previous file
-         if path.isfile(path_to_3D_data + "latest_nr_list.csv"):
+         if path.isfile(path_to_3D_data + f"latest_nr_list_{CRYSTAL_RES}A.csv"):
             print("\t> Use of the previous version.\t", end = "", flush=True)
         else:
             return [], []
 
-     nrlist = pd.read_csv(path_to_3D_data + "latest_nr_list.csv")
+     nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{CRYSTAL_RES}A.csv")
     full_structures_list = nrlist['class_members'].tolist()
     print(f"\t{validsymb}", flush=True)
 
@@ -1100,6 +1121,8 @@ def cm_realign(rfam_acc, chains, label):
             ids = []
             for record in SeqIO.parse(gz, "fasta"):
                 if record.id not in ids:
+                     # Note: here we copy the sequences without modification. 
+                     # But, sequences with non ACGU letters exit (W, R, M, Y for example)
                     f.write(">"+record.description+'\n'+str(record.seq)+'\n')
                     ids.append(record.id)
 
@@ -1180,24 +1203,19 @@ def cm_realign(rfam_acc, chains, label):
                                "--meta-fmt=csv"])
     return 0
 
- def summarize_position(col):
+ def summarize_position(counts):
     """ Counts the number of nucleotides at a given position, given a "column" from a MSA.
     """
 
-     # Count the different chars in the column
-     counts = { 'A':col.count('A'), 'C':col.count('C'), 
-                'G':col.count('G'), 'U':col.count('U'), 
-                '-':col.count('-'), '.':col.count('.') }
- 
     # Count modified nucleotides
+     chars = counts.keys()
     known_chars_count = 0
-     chars = set(col)
+     N = 0
     for char in chars:
         if char in "ACGU":
             known_chars_count += counts[char]
-         # elif char not in "-.":
-             # counts[char] = col.count(char)
-     N = len(col) - counts['-'] - counts['.'] # number of ungapped residues
+         if char not in ".-":
+             N += counts[char]  # number of ungapped residues
 
     if N: # prevent division by zero if the column is only gaps
         return ( counts['A']/N, counts['C']/N, counts['G']/N, counts['U']/N, (N - known_chars_count)/N) # other residues, or consensus (N, K, Y...)
@@ -1228,10 +1246,8 @@ def alignment_nt_stats(f):
         exit(1)
 
     # Compute statistics per column
-     pbar = tqdm(iterable=range(alilen), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f}", leave=False)
-     results = [ summarize_position(align[:,i]) for i in pbar ]
-     pbar.close()
-     frequencies = np.array(results).T
+     pssm = BufferingSummaryInfo(align).get_pssm(f, thr_idx)
+     frequencies = np.array([ summarize_position(pssm[i]) for i in pbar ]).T
 
     # For each sequence, find the right chain and save the PSSMs inside.
     pbar = tqdm(total=len(chains_ids), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} chains", leave=False)
@@ -1251,12 +1267,71 @@ def alignment_nt_stats(f):
     return 0
 
 if __name__ == "__main__":
-     print("Main process running. (PID", os.getpid(), ")")
 
     # # temporary, for debugging: start from zero knowledge
     # if os.path.exists(path_to_3D_data + "known_issues.txt"):
     #     subprocess.run(["rm", path_to_3D_data + "known_issues.txt"])
 
+     # Parse options
+     try:
+         opts, args = getopt.getopt( sys.argv[1:], 
+                                     "r:h", 
+                                 [   "help", "resolution=", "keep-hetatm=", "fill-gaps=", "3d-folder=", "seq-folder=", "no-homology"])
+     except getopt.GetoptError as err:
+         print(err)
+         sys.exit(2)
+ 
+     for opt, arg in opts:
+         if opt == "-h" or opt == "--help":
+             print(  "RNANet, a script to build a multiscale RNA dataset from public data\n"
+                     "Developped by Louis Becquey (louis.becquey@univ-evry.fr), 2020")
+             print()
+             print("Options:")
+             print("-h [ --help ]\t\t\tPrint this help message")
+             print("--version\t\t\tPrint the program version")
+             print()
+             print("-r 4.0 [ --resolution=4.0 ]\t(1.5 | 2.0 | 2.5 | 3.0 | 3.5 | 4.0 | 20.0)"
+                     "\n\t\t\t\tMinimum 3D structure resolution to consider a RNA chain.")
+             print("--keep-hetatm=False\t\t\t(True | False) Keep ions, waters and ligands in produced mmCIF files. "
+                     "\n\t\t\t\tDoes not affect the descriptors.")
+             print("--fill-gaps=True\t\t(True | False) Replace gaps in sequence due to unresolved residues"
+                     "\n\t\t\t\tby the most common nucleotide at this position in the alignment.")
+             print("--3d-folder=…\t\t\tPath to a folder to store the 3D data files. Subfolders will contain:"
+                     "\n\t\t\t\t\tRNAcifs/\t\tFull structures containing RNA, in mmCIF format"
+                     "\n\t\t\t\t\trna_mapped_to_Rfam/\tExtracted 'pure' RNA chains"
+                     "\n\t\t\t\t\tpseudotorsions/\t\tAnnotations by DSSR"
+                     "\n\t\t\t\t\tdatapoints/\t\tFinal results in specified file format.")
+             print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files."
+                     "\n\t\t\t\t\trfam_sequences/fasta/\tCompressed hits to Rfam families"
+                     "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family")
+             print("--no-homology\t\t\tDo not try to compute PSSMs and do not align sequences."
+                     "\n\t\t\t\tAllows to yield more 3D data (consider chains without a Rfam mapping).")
+             sys.exit()
+ 
+         elif opt == '--version':
+             print("RNANet alpha 3")
+             sys.exit()
+         elif opt == "-r" or opt == "--resolution":
+             assert arg in ["1.5", "2.0", "2.5", "3.0", "3.5", "4.0", "20.0"]
+             CRYSTAL_RES = arg
+         elif opt=="--keep-hetatm":
+             assert arg in [ "True", "False" ]
+             KEEP_HETATM = (arg == "True")
+         elif opt=="--fill-gaps":
+             assert arg in [ "True", "False" ]
+             FILL_GAPS = (arg == "True")
+         elif opt=="--no-homolgy":
+             HOMOLOGY == False
+         elif opt=='--3d-folder':
+             path_to_3D_data = path.abspath(arg)
+         elif opt=='--seq-folder':
+             path_to_seq_data = path.abspath(arg)
+     
+     if path_to_3D_data == "tobedefinedbyoptions" or path_to_seq_data == "tobedefinedbyoptions":
+         print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments")
+         print("See RNANet.py --help for more information.")
+         exit(1)
+ 
     # ===========================================================================
     # List 3D chains with available Rfam mapping
     # ===========================================================================
@@ -1268,16 +1343,19 @@ if __name__ == "__main__":
     mappings = download_Rfam_PDB_mappings()
 
     # Filter the chains with mapping
-     chains_with_mapping = []
-     for c in all_chains:
-         mapping = mappings.loc[ (mappings.pdb_id == c.pdb_id) & (mappings.chain == c.pdb_chain_id) ]
-         n = len(mapping.rfam_acc.values)
-         for j in range(n):
-             if j == n-1:
-                 chains_with_mapping.append(c)
-             else:
-                 chains_with_mapping.append(copy.deepcopy(c))
-             chains_with_mapping[-1].set_rfam(mapping.rfam_acc.values[j])
+     if HOMOLOGY:
+         chains_with_mapping = []
+         for c in all_chains:
+             mapping = mappings.loc[ (mappings.pdb_id == c.pdb_id) & (mappings.chain == c.pdb_chain_id) ]
+             n = len(mapping.rfam_acc.values)
+             for j in range(n):
+                 if j == n-1:
+                     chains_with_mapping.append(c)
+                 else:
+                     chains_with_mapping.append(copy.deepcopy(c))
+                 chains_with_mapping[-1].set_rfam(mapping.rfam_acc.values[j])
+     else:
+         chains_with_mapping = all_chains
     n_chains = len(chains_with_mapping)
 
     # ===========================================================================
@@ -1330,10 +1408,18 @@ if __name__ == "__main__":
 
     print(f"> Loaded {len(loaded_chains)} RNA chains ({len(chains_with_mapping) - len(loaded_chains)} errors).")
 
+     if not HOMOLOGY:
+         # Save chains to file
+         for c in loaded_chains:
+             c.data3D.to_csv(path_to_3D_data + "datapoints/" + c.chain_label)
+         print("Completed.")
+         exit()
+ 
+ 
     # ===========================================================================
     # Download RNA sequences of the corresponding Rfam families
     # ===========================================================================
- 
+     
     # Preparing a results folder
     if not os.access(path_to_seq_data + "realigned/", os.F_OK):
         os.makedirs(path_to_seq_data + "realigned/")