more resolution-specific statistics

Louis BECQUEY
Commit ce2cba259da6ca38a5b2f56f7fee505c9c4b6f08 ce2cba25 1 parent 7196427d
Showing 2 changed files with 21 additions and 19 deletions
RNAnet.py
statistics.py
--- a/RNAnet.py
View file @ce2cba2
+++ b/RNAnet.py
View file @ce2cba2
 #!/usr/bin/python3.8
 import Bio
+ import Bio.PDB as pdb
 import concurrent.futures
 import getopt
 import gzip
@@ -25,7 +26,8 @@ from multiprocessing import Pool, Manager
 from time import sleep
 from tqdm import tqdm
 from setproctitle import setproctitle
- 
+ from Bio import AlignIO, SeqIO
+ from Bio.Align import AlignInfo
 
 def trace_unhandled_exceptions(func):
     @wraps(func)
@@ -112,7 +114,7 @@ class SelectivePortionSelector(object):
         return 1
 
 
- class BufferingSummaryInfo(Bio.Align.AlignInfo.SummaryInfo):
+ class BufferingSummaryInfo(AlignInfo.SummaryInfo):
 
     def get_pssm(self, family, index):
         """Create a position specific score matrix object for the alignment. 
@@ -139,7 +141,7 @@ class BufferingSummaryInfo(Bio.Align.AlignInfo.SummaryInfo):
                         score_dict[this_residue] = 1.0
             pssm_info.append(('*', score_dict))
 
-         return Bio.Align.AlignInfo.PSSM(pssm_info)
+         return AlignInfo.PSSM(pssm_info)
 
 
 class Chain:
@@ -198,11 +200,11 @@ class Chain:
 
         with warnings.catch_warnings():
             # Ignore the PDB problems. This mostly warns that some chain is discontinuous.
-             warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.PDBConstructionWarning)
-             warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning)
+             warnings.simplefilter('ignore', pdb.PDBExceptions.PDBConstructionWarning)
+             warnings.simplefilter('ignore', pdb.PDBExceptions.BiopythonWarning)
 
             # Load the whole mmCIF into a Biopython structure object:
-             mmcif_parser = Bio.PDB.MMCIFParser()
+             mmcif_parser = pdb.MMCIFParser()
             try:
                 s = mmcif_parser.get_structure(self.pdb_id, path_to_3D_data + "RNAcifs/"+self.pdb_id+".cif")
             except ValueError as e:
@@ -223,7 +225,7 @@ class Chain:
             sel = SelectivePortionSelector(model_idx, self.pdb_chain_id, valid_set, khetatm)
 
             # Save that selection on the mmCIF object s to file
-             ioobj = Bio.PDB.mmcifio.MMCIFIO()
+             ioobj = pdb.MMCIFIO()
             ioobj.set_structure(s)
             ioobj.save(self.file, sel)
 
@@ -1115,7 +1117,7 @@ class Pipeline:
                 print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &")
                 sys.exit()
             elif opt == '--version':
-                 print("RNANet 1.1 beta")
+                 print("RNANet 1.2, parallelized, Dockerized")
                 sys.exit()
             elif opt == "-r" or opt == "--resolution":
                 assert float(arg) > 0.0 and float(arg) <= 20.0
@@ -1445,7 +1447,7 @@ class Pipeline:
         # Update the database
         data = []
         for r in results:
-             align = Bio.AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta")
+             align = AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta")
             nb_3d_chains = len([1 for r in align if '[' in r.id])
             if r[0] in SSU_set:  # SSU v138 is used
                 nb_homologs = 2225272       # source: https://www.arb-silva.de/documentation/release-138/
@@ -1535,9 +1537,9 @@ class Pipeline:
         # Run statistics
         if self.RUN_STATS:
             # Remove previous precomputed data
-             subprocess.run(["rm", "-f", runDir + "/data/wadley_kernel_eta.npz", 
-                                         runDir + "/data/wadley_kernel_eta_prime.npz", 
-                                         runDir + "/data/pair_counts.csv"])
+             subprocess.run(["rm", "-f", runDir + f"/data/wadley_kernel_eta_{self.CRYSTAL_RES}.npz", 
+                                         runDir + f"/data/wadley_kernel_eta_prime_{self.CRYSTAL_RES}.npz", 
+                                         runDir + f"/data/pair_counts_{self.CRYSTAL_RES}.csv"])
             for f in self.fam_list:
                 subprocess.run(["rm", "-f", runDir + f"/data/{f}.npy", 
                                             runDir + f"/data/{f}_pairs.csv", 
@@ -2124,7 +2126,7 @@ def work_mmcif(pdb_id):
     # if not, read the CIF header and register the structure
     if not len(r):
         # Load the MMCIF file with Biopython
-         mmCif_info = Bio.PDB.MMCIF2Dict.MMCIF2Dict(final_filepath)
+         mmCif_info = pdb.MMCIF2Dict.MMCIF2Dict(final_filepath)
 
         # Get info about that structure
         try:
@@ -2218,7 +2220,7 @@ def work_prepare_sequences(dl, rfam_acc, chains):
     if rfam_acc in LSU_set | SSU_set:  # rRNA
         if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"):
             # Detect doublons and remove them
-             existing_afa = Bio.AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta")
+             existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta")
             existing_ids = [r.id for r in existing_afa]
             del existing_afa
             new_ids = [str(c) for c in chains]
@@ -2227,7 +2229,7 @@ def work_prepare_sequences(dl, rfam_acc, chains):
             if len(doublons):
                 warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.fa and using their newest version")
                 fasta = path_to_seq_data + f"realigned/{rfam_acc}++.fa"
-                 seqfile = Bio.SeqIO.parse(fasta, "fasta")
+                 seqfile = SeqIO.parse(fasta, "fasta")
                 # remove it and rewrite it with its own content filtered
                 os.remove(fasta)
                 with open(fasta, 'w') as f:
@@ -2268,7 +2270,7 @@ def work_prepare_sequences(dl, rfam_acc, chains):
             with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "w") as plusplus:
                 ids = set()
                 # Remove doublons from the Rfam hits
-                 for r in Bio.SeqIO.parse(path_to_seq_data + f"realigned/{rfam_acc}.fa", "fasta"):
+                 for r in SeqIO.parse(path_to_seq_data + f"realigned/{rfam_acc}.fa", "fasta"):
                     if r.id not in ids:
                         ids.add(r.id)
                         plusplus.write('> '+r.description+'\n'+str(r.seq)+'\n')
@@ -2343,10 +2345,10 @@ def work_realign(rfam_acc):
             notify("Aligned new sequences together")
 
             # Detect doublons and remove them
-             existing_stk = Bio.AlignIO.read(existing_ali_path, "stockholm")
+             existing_stk = AlignIO.read(existing_ali_path, "stockholm")
             existing_ids = [r.id for r in existing_stk]
             del existing_stk
-             new_stk = Bio.AlignIO.read(new_ali_path, "stockholm")
+             new_stk = AlignIO.read(new_ali_path, "stockholm")
             new_ids = [r.id for r in new_stk]
             del new_stk
             doublons = [i for i in existing_ids if i in new_ids]
@@ -2447,7 +2449,7 @@ def work_pssm(f, fill_gaps):
 
     # Open the alignment
     try:
-         align = Bio.AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")
+         align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")
     except:
         warn(f"{f}'s alignment is wrong. Recompute it and retry.", error=True)
         with open(runDir + "/errors.txt", "a") as errf:
--- a/statistics.py
View file @ce2cba2
+++ b/statistics.py
View file @ce2cba2