some cleaning + temp folder

Louis BECQUEY
Commit 1c59c9601f544ca7db11402b8de56bfced10fd2e 1c59c960 1 parent 72060257
Showing 1 changed file with 35 additions and 250 deletions
biorseo.py
--- a/biorseo.py
View file @1c59c96
+++ b/biorseo.py
View file @1c59c96
@@ -25,6 +25,7 @@ modulespath = biorseoDir + "/data/modules"
 HLmotifDir = modulespath + "/BGSU/HL/3.2/lib"
 ILmotifDir = modulespath + "/BGSU/IL/3.2/lib"
 descfolder = modulespath + "/DESC"
+tempDir = "temp/"
 # Parse options
 try:
@@ -84,12 +85,6 @@ class Loop:
         self.type = looptype
         self.position = position
-    def get_header(self):
-        return self.header
-
-    def subsequence(self):
-        return self.seq
-
 class InsertionSite:
     def __init__(self, loop, csv_line):
@@ -151,177 +146,6 @@ class RNA:
         self.biorseoBGSUBayesPairD = []
         self.biorseoBGSUBayesPairB = []
-    def get_RNAsubopt_results(self):
-        rna = open(self.outputf + self.basename + ".subopt", "r")
-        lines = rna.readlines()
-        rna.close()
-        for i in range(2, len(lines)):
-            ss = lines[i].split(' ')[0]
-            if ss not in self.rnasubopt.predictions:
-                self.rnasubopt.predictions.append(ss)
-
-    def get_biorseoBayesPairA_results(self, targetdir):  
-        if path.isfile(targetdir+ self.basename + ".bypA"):
-            rna = open(targetdir+ self.basename + ".bypA", "r")
-            lines = rna.readlines()
-            rna.close()
-            for i in range(2, len(lines)):
-                ss = lines[i].split(' ')[0].split('\t')[0]
-                if ss not in self.biorseoBayesPairA.predictions:
-                    self.biorseoBayesPairA.predictions.append(ss)
-                self.biorseoBayesPairA.ninsertions.append(lines[i].count('+'))
-    
-    def get_biorseoBayesPairB_results(self, targetdir):
-        if path.isfile(targetdir+ self.basename + ".bypB"):
-            rna = open(targetdir+ self.basename + ".bypB", "r")
-            lines = rna.readlines()
-            rna.close()
-            for i in range(2, len(lines)):
-                ss = lines[i].split(' ')[0].split('\t')[0]
-                if ss not in self.biorseoBayesPairB.predictions:
-                    self.biorseoBayesPairB.predictions.append(ss)
-                self.biorseoBayesPairB.ninsertions.append(lines[i].count('+'))
-
-    def get_biorseoBayesPairC_results(self, targetdir):  
-        if path.isfile(targetdir+ self.basename + ".bypC"):
-            rna = open(targetdir+ self.basename + ".bypC", "r")
-            lines = rna.readlines()
-            rna.close()
-            for i in range(2, len(lines)):
-                ss = lines[i].split(' ')[0].split('\t')[0]
-                if ss not in self.biorseoBayesPairC.predictions:
-                    self.biorseoBayesPairC.predictions.append(ss)
-                self.biorseoBayesPairC.ninsertions.append(lines[i].count('+'))
-
-    def get_biorseoBayesPairD_results(self, targetdir):  
-        if path.isfile(targetdir+ self.basename + ".bypD"):
-            rna = open(targetdir+ self.basename + ".bypD", "r")
-            lines = rna.readlines()
-            rna.close()
-            for i in range(2, len(lines)):
-                ss = lines[i].split(' ')[0].split('\t')[0]
-                if ss not in self.biorseoBayesPairD.predictions:
-                    self.biorseoBayesPairD.predictions.append(ss)
-                self.biorseoBayesPairD.ninsertions.append(lines[i].count('+'))
-
-    def get_biorseoRawA_results(self, targetdir):
-        if path.isfile(targetdir+ self.basename + ".rawA"):
-            rna = open(targetdir+ self.basename + ".rawA", "r")
-            lines = rna.readlines()
-            rna.close()
-            for i in range(2, len(lines)):
-                ss = lines[i].split(' ')[0].split('\t')[0]
-                if ss not in self.biorseoRawA.predictions:
-                    self.biorseoRawA.predictions.append(ss)
-                self.biorseoRawA.ninsertions.append(lines[i].count('+'))
-
-    def get_biorseoRawB_results(self, targetdir):
-        if path.isfile(targetdir+ self.basename + ".rawB"):
-            rna = open(targetdir+ self.basename + ".rawB", "r")
-            lines = rna.readlines()
-            rna.close()
-            for i in range(2, len(lines)):
-                ss = lines[i].split(' ')[0].split('\t')[0]
-                if ss not in self.biorseoRawB.predictions:
-                    self.biorseoRawB.predictions.append(ss)
-                self.biorseoRawB.ninsertions.append(lines[i].count('+'))
-
-    def get_biorseoBGSUJAR3DA_results(self, targetdir):  
-        if path.isfile(targetdir+ self.basename + ".jar3dA"):
-            rna = open(targetdir+ self.basename + ".jar3dA", "r")
-            lines = rna.readlines()
-            rna.close()
-            for i in range(2, len(lines)):
-                ss = lines[i].split(' ')[0].split('\t')[0]
-                if ss not in self.biorseoBGSUJAR3DA.predictions:
-                    self.biorseoBGSUJAR3DA.predictions.append(ss)
-                self.biorseoBGSUJAR3DA.ninsertions.append(lines[i].count('+'))
-    
-    def get_biorseoBGSUJAR3DB_results(self, targetdir):
-        if path.isfile(targetdir+ self.basename + ".jar3dB"):
-            rna = open(targetdir+ self.basename + ".jar3dB", "r")
-            lines = rna.readlines()
-            rna.close()
-            for i in range(2, len(lines)):
-                ss = lines[i].split(' ')[0].split('\t')[0]
-                if ss not in self.biorseoBGSUJAR3DB.predictions:
-                    self.biorseoBGSUJAR3DB.predictions.append(ss)
-                self.biorseoBGSUJAR3DB.ninsertions.append(lines[i].count('+'))
-
-    def get_biorseoBGSUJAR3DC_results(self, targetdir):  
-        if path.isfile(targetdir+ self.basename + ".jar3dC"):
-            rna = open(targetdir+ self.basename + ".jar3dC", "r")
-            lines = rna.readlines()
-            rna.close()
-            for i in range(2, len(lines)):
-                ss = lines[i].split(' ')[0].split('\t')[0]
-                if ss not in self.biorseoBGSUJAR3DC.predictions:
-                    self.biorseoBGSUJAR3DC.predictions.append(ss)
-                self.biorseoBGSUJAR3DC.ninsertions.append(lines[i].count('+'))
-
-    def get_biorseoBGSUJAR3DD_results(self, targetdir):  
-        if path.isfile(targetdir+ self.basename + ".jar3dD"):
-            rna = open(targetdir+ self.basename + ".jar3dD", "r")
-            lines = rna.readlines()
-            rna.close()
-            for i in range(2, len(lines)):
-                ss = lines[i].split(' ')[0].split('\t')[0]
-                if ss not in self.biorseoBGSUJAR3DD.predictions:
-                    self.biorseoBGSUJAR3DD.predictions.append(ss)
-                self.biorseoBGSUJAR3DD.ninsertions.append(lines[i].count('+'))
-
-    def get_biorseoBGSUBayesPairA_results(self, targetdir):  
-        if path.isfile(targetdir+ self.basename + ".bgsubypA"):
-            rna = open(targetdir+ self.basename + ".bgsubypA", "r")
-            lines = rna.readlines()
-            rna.close()
-            for i in range(2, len(lines)):
-                ss = lines[i].split(' ')[0].split('\t')[0]
-                if ss not in self.biorseoBGSUBayesPairA.predictions:
-                    self.biorseoBGSUBayesPairA.predictions.append(ss)
-                self.biorseoBGSUBayesPairA.ninsertions.append(lines[i].count('+'))
-        # else:
-        #     print(targetdir+ self.basename + ".bgsubypA not found !")
-    
-    def get_biorseoBGSUBayesPairB_results(self, targetdir):
-        if path.isfile(targetdir+ self.basename + ".bgsubypB"):
-            rna = open(targetdir+ self.basename + ".bgsubypB", "r")
-            lines = rna.readlines()
-            rna.close()
-            for i in range(2, len(lines)):
-                ss = lines[i].split(' ')[0].split('\t')[0]
-                if ss not in self.biorseoBGSUBayesPairB.predictions:
-                    self.biorseoBGSUBayesPairB.predictions.append(ss)
-                self.biorseoBGSUBayesPairB.ninsertions.append(lines[i].count('+'))
-        # else:
-        #     print(targetdir+ self.basename + ".bgsubypB not found !")
-
-    def get_biorseoBGSUBayesPairC_results(self, targetdir):  
-        if path.isfile(targetdir+ self.basename + ".bgsubypC"):
-            rna = open(targetdir+ self.basename + ".bgsubypC", "r")
-            lines = rna.readlines()
-            rna.close()
-            for i in range(2, len(lines)):
-                ss = lines[i].split(' ')[0].split('\t')[0]
-                if ss not in self.biorseoBGSUBayesPairC.predictions:
-                    self.biorseoBGSUBayesPairC.predictions.append(ss)
-                self.biorseoBGSUBayesPairC.ninsertions.append(lines[i].count('+'))
-        # else:
-        #     print(targetdir+ self.basename + ".bgsubypC not found !")
-
-    def get_biorseoBGSUBayesPairD_results(self, targetdir):  
-        if path.isfile(targetdir+ self.basename + ".bgsubypD"):
-            rna = open(targetdir+ self.basename + ".bgsubypD", "r")
-            lines = rna.readlines()
-            rna.close()
-            for i in range(2, len(lines)):
-                ss = lines[i].split(' ')[0].split('\t')[0]
-                if ss not in self.biorseoBGSUBayesPairD.predictions:
-                    self.biorseoBGSUBayesPairD.predictions.append(ss)
-                self.biorseoBGSUBayesPairD.ninsertions.append(lines[i].count('+'))
-        # else:
-        #     print(targetdir+ self.basename + ".bgsubypD not found !")
-
 class BiorseoInstance:
     def __init__(self, opts):
@@ -420,7 +244,9 @@ class BiorseoInstance:
         self.list_jobs()
         # run them
-        self.execute_jobs()           
+        self.execute_jobs()         
+
+        # subprocess.call(["rm", "-rf", tempDir])  # empty the temp folder  
     def enumerate_loops(self, s):
         def resort(unclosedLoops):
@@ -532,28 +358,29 @@ class BiorseoInstance:
     def launch_JAR3D_worker(self, loop):
         # write motif to a file
-        newpath = getcwd()+'/'+loop.header[1:]
+        modulefolder = tempDir + loop.header[1:] + '/'
-        if not path.exists(newpath):
+        if not path.exists(modulefolder):
-            makedirs(newpath)
+            makedirs(modulefolder)
-        chdir(newpath)
+        filename = modulefolder + loop.header[1:]+".fasta"
-        filename = loop.header[1:]+".fasta"
         fasta = open(filename, 'w')
-        fasta.write('>'+loop.get_header()+'\n'+loop.subsequence()+'\n')
+        fasta.write('>'+loop.header+'\n'+loop.seq+'\n')
         fasta.close()
         # Launch Jar3D on it
         if loop.type == 'h':
-            cmd = ["java", "-jar", jar3dexec, filename, HLmotifDir+"/all.txt",
+            cmd = ["java", "-jar", jar3dexec, loop.header[1:]+".fasta", HLmotifDir+"/all.txt",
                    loop.header[1:]+".HLloop.csv", loop.header[1:]+".HLseq.csv"]
         else:
-            cmd = ["java", "-jar", jar3dexec, filename, ILmotifDir+"/all.txt",
+            cmd = ["java", "-jar", jar3dexec, loop.header[1:]+".fasta", ILmotifDir+"/all.txt",
                    loop.header[1:]+".ILloop.csv", loop.header[1:]+".ILseq.csv"]
         nowhere = open(devnull, 'w')
         logfile = open(biorseoDir + "/log_of_the_run.sh", 'a')
         logfile.write(' '.join(cmd))
         logfile.write("\n")
         logfile.close()
+        chdir(modulefolder)
         subprocess.call(cmd, stdout=nowhere)
+        chdir(biorseoDir)
         nowhere.close()
         # Retrieve results
@@ -562,7 +389,7 @@ class BiorseoInstance:
             capstype = "HL"
         else:
             capstype = "IL"
-        csv = open(loop.header[1:]+".%sseq.csv" % capstype, 'r')
+        csv = open(modulefolder + loop.header[1:] +".%sseq.csv" % capstype, 'r')
         l = csv.readline()
         while l:
             if "true" in l:
@@ -570,15 +397,12 @@ class BiorseoInstance:
             l = csv.readline()
         csv.close()
-        # Cleaning
-        chdir("..")
-        subprocess.call(["rm", "-r", loop.header[1:]])
         return insertion_sites
     def launch_JAR3D(self, seq_, basename):
         rnasubopt_preds = []
         # Extracting probable loops from RNA-subopt structures
-        rna = open(self.outputf + basename + ".subopt", "r")
+        rna = open(tempDir + basename + ".subopt", "r")
         lines = rna.readlines()
         rna.close()
         for i in range(2, len(lines)):
@@ -597,19 +421,16 @@ class BiorseoInstance:
         # Retrieve subsequences corresponding to the possible loops
         loops = []
         for i, l in enumerate(HLs):
-            loops.append(
+            loops.append(Loop(">HL%d" % (i+1), seq_[l[0][0]-1:l[0][1]], "h", l))
-                Loop(">HL%d" % (i+1), seq_[l[0][0]-1:l[0][1]], "h", l))
         for i, l in enumerate(ILs):
-            loops.append(
+            loops.append(Loop(">IL%d" % (i+1), seq_[l[0][0]-1:l[0][1]]+'*'+seq_[l[1][0]-1:l[1][1]], "i", l))
-                Loop(">IL%d" % (i+1), seq_[l[0][0]-1:l[0][1]]+'*'+seq_[l[1][0]-1:l[1][1]], "i", l))
         # Scanning loop subsequences against motif database
         pool = MyPool(processes=cpu_count())
-        insertion_sites = [x for y in pool.map(
+        insertion_sites = [x for y in pool.map(self.launch_JAR3D_worker, loops) for x in y]
-            self.launch_JAR3D_worker, loops) for x in y]
         insertion_sites.sort(reverse=True)
         # Writing results to CSV file
         c = 0
-        resultsfile = open(self.outputf+basename+".sites.csv", "w")
+        resultsfile = open(biorseoDir + "/" + tempDir+basename+".sites.csv", "w")
         resultsfile.write("Motif,Rotation,Score,Start1,End1,Start2,End2\n")
         for site in insertion_sites:
             if site.score > 10:
@@ -617,7 +438,7 @@ class BiorseoInstance:
                 string = "FOUND with score %d:\t\t possible insertion of motif " % site.score + site.atlas_id
                 if site.rotation:
                     string += " (reversed)"
-                string += (" on " + site.loop.get_header() + " at positions")
+                string += (" on " + site.loop.header + " at positions")
             resultsfile.write(site.atlas_id+',' +
                               str(bool(site.rotation))+",%d" % site.score+',')
             positions = [','.join([str(y) for y in x]) for x in site.position]
@@ -629,10 +450,10 @@ class BiorseoInstance:
     def launch_BayesPairing(self, module_type, seq_, header_):
         chdir(bypdir)
-        cmd = ["python3", "parse_sequences.py", "-seq", self.outputf +
+        cmd = ["python3", "parse_sequences.py", "-seq", biorseoDir + '/' + tempDir +
                header_ + ".fa", "-d", module_type, "-interm", "1"]
-        logfile = open("log_of_the_run.sh", 'a')
+        logfile = open(biorseoDir + "/log_of_the_run.sh", 'a')
         logfile.write(" ".join(cmd))
         logfile.write("\n")
         logfile.close()
@@ -646,9 +467,9 @@ class BiorseoInstance:
             l = BypLog[idx]
         insertion_sites = [x for x in ast.literal_eval(l.split(":")[1][1:])]
         if module_type == "rna3dmotif":
-            rna = open(self.outputf + header_ + ".byp.csv", "w")
+            rna = open(biorseoDir + "/" + tempDir + header_ + ".byp.csv", "w")
         else:
-            rna = open(self.outputf + header_ + ".bgsubyp.csv", "w")
+            rna = open(biorseoDir + "/" + tempDir + header_ + ".bgsubyp.csv", "w")
         rna.write("Motif,Score,Start1,End1,Start2,End2...\n")
         for i, module in enumerate(insertion_sites):
             if len(module):
@@ -687,8 +508,8 @@ class BiorseoInstance:
             except:
                 r = 1
                 pass
-        if r:
+        # if r:
-            fails.append(j)
+            # fails.append(j)
         running_stats[1] += 1
         return r
@@ -729,48 +550,12 @@ class BiorseoInstance:
             print("Computations ran successfully.")
             print()
-    def check_result_existence(self, datatype, method, function, with_PK, basename):
-        folder = self.outputf+"PK/" if with_PK else self.outputf+"noPK/"
-        if datatype == "bgsu":
-            if method == "jar3d":
-                extension = ".jar3d"
-            elif method == "byp":
-                extension = ".bgsubyp"
-            else:
-                raise "Unknown method !"
-        elif datatype == "desc":
-            if method == "dpm":
-                extension = ".raw"
-            elif method == "byp":
-                extension = ".byp"
-            else:
-                raise "Unknown method !"
-        else:
-            raise "Unknown data type !"
-        return path.isfile(folder + basename + extension + function)
-
-    def check_csv_existence(self, datatype, method, basename):
-        if datatype == "bgsu":
-            if method == "jar3d":
-                extension = ".sites.csv"
-            elif method == "byp":
-                extension = ".bgsubyp.csv"
-            else:
-                raise "Unknown method !"
-        elif datatype == "desc":
-            if method == "byp":
-                extension = ".byp.csv"
-            else:
-                raise "You cannot use " + method + " with " + datatype + " data !"
-        else:
-            raise "Unknown data type !"
-        return path.isfile(self.outputf + basename + extension)
-
     def list_jobs(self):
         # Read fasta file, which can contain one or several RNAs
         RNAcontainer = []
         subprocess.call(["mkdir", "-p", self.outputf])  # Create the output folder
+        subprocess.call(["mkdir", "-p", tempDir])  # Create the temp folder
         print("loading file %s..." % self.inputfile)
         db = open(self.inputfile, "r")
         c = 0
@@ -789,10 +574,10 @@ class BiorseoInstance:
             if c == 0:
                 seq = l[:-1].upper()
                 if is_canonical_nts(seq):
-                    header = header.replace('/', '_').replace('\'','').replace('(','').replace(')','').replace(' ','_')
+                    header = header.replace('/', '_').replace('\'','').replace('(','').replace(')','').replace(' ','_').replace('>','')
                     RNAcontainer.append(RNA(header, seq))
-                    if not path.isfile(self.outputf + header + ".fa"):
+                    if not path.isfile(tempDir + header + ".fa"):
-                        rna = open(self.outputf + header + ".fa", "w")
+                        rna = open(tempDir + header + ".fa", "w")
                         rna.write(">" + header +'\n')
                         rna.write(seq +'\n')
                         rna.close()
@@ -807,7 +592,7 @@ class BiorseoInstance:
         for instance in RNAcontainer:
             executable = biorseoDir + "/bin/biorseo"
-            fastafile = self.outputf+instance.header+".fa"
+            fastafile = tempDir+instance.header+".fa"
             method_type = ""
             ext = ".raw"
             priority = 1
@@ -815,11 +600,11 @@ class BiorseoInstance:
             if self.type == "jar3d":
                 ext = ".jar3d"
                 method_type = "--jar3dcsv"
-                csv = self.outputf + instance.header + ".sites.csv"
+                csv = tempDir + instance.header + ".sites.csv"
                 # RNAsubopt
                 self.joblist.append(Job(command=["RNAsubopt", "-i", fastafile, "--outfile="+ instance.header + ".subopt"], priority=1))
-                self.joblist.append(Job(command=["mv", instance.header + ".subopt", self.outputf], priority=2))
+                self.joblist.append(Job(command=["mv", instance.header + ".subopt", tempDir], priority=2))
                 # JAR3D
                 self.joblist.append(Job(function=self.launch_JAR3D, args=[instance.seq_, instance.header], priority=3, how_many_in_parallel=1))
                 priority = 4
@@ -827,11 +612,11 @@ class BiorseoInstance:
                 method_type = "--bayespaircsv"
                 if self.modules == "desc":
                     ext = ".byp"
-                    csv = self.outputf + instance.header + ".byp.csv"
+                    csv = tempDir + instance.header + ".byp.csv"
                     self.joblist.append(Job(function=self.launch_BayesPairing, args=["rna3dmotif", instance.seq_, instance.header], how_many_in_parallel=-1, priority=1))
                 elif self.modules == "bgsu":
                     ext = ".bgsubyp"
-                    csv = self.outputf + instance.header + ".bgsubyp.csv"
+                    csv = tempDir + instance.header + ".bgsubyp.csv"
                     self.joblist.append(Job(function=self.launch_BayesPairing, args=["3dmotifatlas", instance.seq_, instance.header], how_many_in_parallel=-1, priority=1))
                 priority = 2
             if self.type == "dpm":