Code cleanup (remove getter, result-parsing and existence-check methods) + write intermediate files to a temp folder

Louis BECQUEY
Commit 1c59c9601f544ca7db11402b8de56bfced10fd2e 1c59c960 1 parent 72060257
Showing 1 changed file with 34 additions and 249 deletions
biorseo.py
--- a/biorseo.py
View file @1c59c96
+++ b/biorseo.py
View file @1c59c96
@@ -25,6 +25,7 @@ modulespath = biorseoDir + "/data/modules"
 HLmotifDir = modulespath + "/BGSU/HL/3.2/lib"
 ILmotifDir = modulespath + "/BGSU/IL/3.2/lib"
 descfolder = modulespath + "/DESC"
+ tempDir = "temp/"
 
 # Parse options
 try:
@@ -84,12 +85,6 @@ class Loop:
         self.type = looptype
         self.position = position
 
-     def get_header(self):
-         return self.header
- 
-     def subsequence(self):
-         return self.seq
- 
 
 class InsertionSite:
     def __init__(self, loop, csv_line):
@@ -151,177 +146,6 @@ class RNA:
         self.biorseoBGSUBayesPairD = []
         self.biorseoBGSUBayesPairB = []
 
-     def get_RNAsubopt_results(self):
-         rna = open(self.outputf + self.basename + ".subopt", "r")
-         lines = rna.readlines()
-         rna.close()
-         for i in range(2, len(lines)):
-             ss = lines[i].split(' ')[0]
-             if ss not in self.rnasubopt.predictions:
-                 self.rnasubopt.predictions.append(ss)
- 
-     def get_biorseoBayesPairA_results(self, targetdir):  
-         if path.isfile(targetdir+ self.basename + ".bypA"):
-             rna = open(targetdir+ self.basename + ".bypA", "r")
-             lines = rna.readlines()
-             rna.close()
-             for i in range(2, len(lines)):
-                 ss = lines[i].split(' ')[0].split('\t')[0]
-                 if ss not in self.biorseoBayesPairA.predictions:
-                     self.biorseoBayesPairA.predictions.append(ss)
-                 self.biorseoBayesPairA.ninsertions.append(lines[i].count('+'))
-     
-     def get_biorseoBayesPairB_results(self, targetdir):
-         if path.isfile(targetdir+ self.basename + ".bypB"):
-             rna = open(targetdir+ self.basename + ".bypB", "r")
-             lines = rna.readlines()
-             rna.close()
-             for i in range(2, len(lines)):
-                 ss = lines[i].split(' ')[0].split('\t')[0]
-                 if ss not in self.biorseoBayesPairB.predictions:
-                     self.biorseoBayesPairB.predictions.append(ss)
-                 self.biorseoBayesPairB.ninsertions.append(lines[i].count('+'))
- 
-     def get_biorseoBayesPairC_results(self, targetdir):  
-         if path.isfile(targetdir+ self.basename + ".bypC"):
-             rna = open(targetdir+ self.basename + ".bypC", "r")
-             lines = rna.readlines()
-             rna.close()
-             for i in range(2, len(lines)):
-                 ss = lines[i].split(' ')[0].split('\t')[0]
-                 if ss not in self.biorseoBayesPairC.predictions:
-                     self.biorseoBayesPairC.predictions.append(ss)
-                 self.biorseoBayesPairC.ninsertions.append(lines[i].count('+'))
- 
-     def get_biorseoBayesPairD_results(self, targetdir):  
-         if path.isfile(targetdir+ self.basename + ".bypD"):
-             rna = open(targetdir+ self.basename + ".bypD", "r")
-             lines = rna.readlines()
-             rna.close()
-             for i in range(2, len(lines)):
-                 ss = lines[i].split(' ')[0].split('\t')[0]
-                 if ss not in self.biorseoBayesPairD.predictions:
-                     self.biorseoBayesPairD.predictions.append(ss)
-                 self.biorseoBayesPairD.ninsertions.append(lines[i].count('+'))
- 
-     def get_biorseoRawA_results(self, targetdir):
-         if path.isfile(targetdir+ self.basename + ".rawA"):
-             rna = open(targetdir+ self.basename + ".rawA", "r")
-             lines = rna.readlines()
-             rna.close()
-             for i in range(2, len(lines)):
-                 ss = lines[i].split(' ')[0].split('\t')[0]
-                 if ss not in self.biorseoRawA.predictions:
-                     self.biorseoRawA.predictions.append(ss)
-                 self.biorseoRawA.ninsertions.append(lines[i].count('+'))
- 
-     def get_biorseoRawB_results(self, targetdir):
-         if path.isfile(targetdir+ self.basename + ".rawB"):
-             rna = open(targetdir+ self.basename + ".rawB", "r")
-             lines = rna.readlines()
-             rna.close()
-             for i in range(2, len(lines)):
-                 ss = lines[i].split(' ')[0].split('\t')[0]
-                 if ss not in self.biorseoRawB.predictions:
-                     self.biorseoRawB.predictions.append(ss)
-                 self.biorseoRawB.ninsertions.append(lines[i].count('+'))
- 
-     def get_biorseoBGSUJAR3DA_results(self, targetdir):  
-         if path.isfile(targetdir+ self.basename + ".jar3dA"):
-             rna = open(targetdir+ self.basename + ".jar3dA", "r")
-             lines = rna.readlines()
-             rna.close()
-             for i in range(2, len(lines)):
-                 ss = lines[i].split(' ')[0].split('\t')[0]
-                 if ss not in self.biorseoBGSUJAR3DA.predictions:
-                     self.biorseoBGSUJAR3DA.predictions.append(ss)
-                 self.biorseoBGSUJAR3DA.ninsertions.append(lines[i].count('+'))
-     
-     def get_biorseoBGSUJAR3DB_results(self, targetdir):
-         if path.isfile(targetdir+ self.basename + ".jar3dB"):
-             rna = open(targetdir+ self.basename + ".jar3dB", "r")
-             lines = rna.readlines()
-             rna.close()
-             for i in range(2, len(lines)):
-                 ss = lines[i].split(' ')[0].split('\t')[0]
-                 if ss not in self.biorseoBGSUJAR3DB.predictions:
-                     self.biorseoBGSUJAR3DB.predictions.append(ss)
-                 self.biorseoBGSUJAR3DB.ninsertions.append(lines[i].count('+'))
- 
-     def get_biorseoBGSUJAR3DC_results(self, targetdir):  
-         if path.isfile(targetdir+ self.basename + ".jar3dC"):
-             rna = open(targetdir+ self.basename + ".jar3dC", "r")
-             lines = rna.readlines()
-             rna.close()
-             for i in range(2, len(lines)):
-                 ss = lines[i].split(' ')[0].split('\t')[0]
-                 if ss not in self.biorseoBGSUJAR3DC.predictions:
-                     self.biorseoBGSUJAR3DC.predictions.append(ss)
-                 self.biorseoBGSUJAR3DC.ninsertions.append(lines[i].count('+'))
- 
-     def get_biorseoBGSUJAR3DD_results(self, targetdir):  
-         if path.isfile(targetdir+ self.basename + ".jar3dD"):
-             rna = open(targetdir+ self.basename + ".jar3dD", "r")
-             lines = rna.readlines()
-             rna.close()
-             for i in range(2, len(lines)):
-                 ss = lines[i].split(' ')[0].split('\t')[0]
-                 if ss not in self.biorseoBGSUJAR3DD.predictions:
-                     self.biorseoBGSUJAR3DD.predictions.append(ss)
-                 self.biorseoBGSUJAR3DD.ninsertions.append(lines[i].count('+'))
- 
-     def get_biorseoBGSUBayesPairA_results(self, targetdir):  
-         if path.isfile(targetdir+ self.basename + ".bgsubypA"):
-             rna = open(targetdir+ self.basename + ".bgsubypA", "r")
-             lines = rna.readlines()
-             rna.close()
-             for i in range(2, len(lines)):
-                 ss = lines[i].split(' ')[0].split('\t')[0]
-                 if ss not in self.biorseoBGSUBayesPairA.predictions:
-                     self.biorseoBGSUBayesPairA.predictions.append(ss)
-                 self.biorseoBGSUBayesPairA.ninsertions.append(lines[i].count('+'))
-         # else:
-         #     print(targetdir+ self.basename + ".bgsubypA not found !")
-     
-     def get_biorseoBGSUBayesPairB_results(self, targetdir):
-         if path.isfile(targetdir+ self.basename + ".bgsubypB"):
-             rna = open(targetdir+ self.basename + ".bgsubypB", "r")
-             lines = rna.readlines()
-             rna.close()
-             for i in range(2, len(lines)):
-                 ss = lines[i].split(' ')[0].split('\t')[0]
-                 if ss not in self.biorseoBGSUBayesPairB.predictions:
-                     self.biorseoBGSUBayesPairB.predictions.append(ss)
-                 self.biorseoBGSUBayesPairB.ninsertions.append(lines[i].count('+'))
-         # else:
-         #     print(targetdir+ self.basename + ".bgsubypB not found !")
- 
-     def get_biorseoBGSUBayesPairC_results(self, targetdir):  
-         if path.isfile(targetdir+ self.basename + ".bgsubypC"):
-             rna = open(targetdir+ self.basename + ".bgsubypC", "r")
-             lines = rna.readlines()
-             rna.close()
-             for i in range(2, len(lines)):
-                 ss = lines[i].split(' ')[0].split('\t')[0]
-                 if ss not in self.biorseoBGSUBayesPairC.predictions:
-                     self.biorseoBGSUBayesPairC.predictions.append(ss)
-                 self.biorseoBGSUBayesPairC.ninsertions.append(lines[i].count('+'))
-         # else:
-         #     print(targetdir+ self.basename + ".bgsubypC not found !")
- 
-     def get_biorseoBGSUBayesPairD_results(self, targetdir):  
-         if path.isfile(targetdir+ self.basename + ".bgsubypD"):
-             rna = open(targetdir+ self.basename + ".bgsubypD", "r")
-             lines = rna.readlines()
-             rna.close()
-             for i in range(2, len(lines)):
-                 ss = lines[i].split(' ')[0].split('\t')[0]
-                 if ss not in self.biorseoBGSUBayesPairD.predictions:
-                     self.biorseoBGSUBayesPairD.predictions.append(ss)
-                 self.biorseoBGSUBayesPairD.ninsertions.append(lines[i].count('+'))
-         # else:
-         #     print(targetdir+ self.basename + ".bgsubypD not found !")
- 
 
 class BiorseoInstance:
     def __init__(self, opts):
@@ -422,6 +246,8 @@ class BiorseoInstance:
         # run them
         self.execute_jobs()         
 
+         # subprocess.call(["rm", "-rf", tempDir])  # empty the temp folder  
+ 
     def enumerate_loops(self, s):
         def resort(unclosedLoops):
             loops.insert(len(loops)-1-unclosedLoops, loops[-1])
@@ -532,28 +358,29 @@ class BiorseoInstance:
 
     def launch_JAR3D_worker(self, loop):
         # write motif to a file
-         newpath = getcwd()+'/'+loop.header[1:]
-         if not path.exists(newpath):
-             makedirs(newpath)
-         chdir(newpath)
-         filename = loop.header[1:]+".fasta"
+         modulefolder = tempDir + loop.header[1:] + '/'
+         if not path.exists(modulefolder):
+             makedirs(modulefolder)
+         filename = modulefolder + loop.header[1:]+".fasta"
         fasta = open(filename, 'w')
-         fasta.write('>'+loop.get_header()+'\n'+loop.subsequence()+'\n')
+         fasta.write('>'+loop.header+'\n'+loop.seq+'\n')
         fasta.close()
 
         # Launch Jar3D on it
         if loop.type == 'h':
-             cmd = ["java", "-jar", jar3dexec, filename, HLmotifDir+"/all.txt",
+             cmd = ["java", "-jar", jar3dexec, loop.header[1:]+".fasta", HLmotifDir+"/all.txt",
                    loop.header[1:]+".HLloop.csv", loop.header[1:]+".HLseq.csv"]
         else:
-             cmd = ["java", "-jar", jar3dexec, filename, ILmotifDir+"/all.txt",
+             cmd = ["java", "-jar", jar3dexec, loop.header[1:]+".fasta", ILmotifDir+"/all.txt",
                    loop.header[1:]+".ILloop.csv", loop.header[1:]+".ILseq.csv"]
         nowhere = open(devnull, 'w')
         logfile = open(biorseoDir + "/log_of_the_run.sh", 'a')
         logfile.write(' '.join(cmd))
         logfile.write("\n")
         logfile.close()
+         chdir(modulefolder)
         subprocess.call(cmd, stdout=nowhere)
+         chdir(biorseoDir)
         nowhere.close()
 
         # Retrieve results
@@ -562,7 +389,7 @@ class BiorseoInstance:
             capstype = "HL"
         else:
             capstype = "IL"
-         csv = open(loop.header[1:]+".%sseq.csv" % capstype, 'r')
+         csv = open(modulefolder + loop.header[1:] +".%sseq.csv" % capstype, 'r')
         l = csv.readline()
         while l:
             if "true" in l:
@@ -570,15 +397,12 @@ class BiorseoInstance:
             l = csv.readline()
         csv.close()
 
-         # Cleaning
-         chdir("..")
-         subprocess.call(["rm", "-r", loop.header[1:]])
         return insertion_sites
 
     def launch_JAR3D(self, seq_, basename):
         rnasubopt_preds = []
         # Extracting probable loops from RNA-subopt structures
-         rna = open(self.outputf + basename + ".subopt", "r")
+         rna = open(tempDir + basename + ".subopt", "r")
         lines = rna.readlines()
         rna.close()
         for i in range(2, len(lines)):
@@ -597,19 +421,16 @@ class BiorseoInstance:
         # Retrieve subsequences corresponding to the possible loops
         loops = []
         for i, l in enumerate(HLs):
-             loops.append(
-                 Loop(">HL%d" % (i+1), seq_[l[0][0]-1:l[0][1]], "h", l))
+             loops.append(Loop(">HL%d" % (i+1), seq_[l[0][0]-1:l[0][1]], "h", l))
         for i, l in enumerate(ILs):
-             loops.append(
-                 Loop(">IL%d" % (i+1), seq_[l[0][0]-1:l[0][1]]+'*'+seq_[l[1][0]-1:l[1][1]], "i", l))
+             loops.append(Loop(">IL%d" % (i+1), seq_[l[0][0]-1:l[0][1]]+'*'+seq_[l[1][0]-1:l[1][1]], "i", l))
         # Scanning loop subsequences against motif database
         pool = MyPool(processes=cpu_count())
-         insertion_sites = [x for y in pool.map(
-             self.launch_JAR3D_worker, loops) for x in y]
+         insertion_sites = [x for y in pool.map(self.launch_JAR3D_worker, loops) for x in y]
         insertion_sites.sort(reverse=True)
         # Writing results to CSV file
         c = 0
-         resultsfile = open(self.outputf+basename+".sites.csv", "w")
+         resultsfile = open(biorseoDir + "/" + tempDir+basename+".sites.csv", "w")
         resultsfile.write("Motif,Rotation,Score,Start1,End1,Start2,End2\n")
         for site in insertion_sites:
             if site.score > 10:
@@ -617,7 +438,7 @@ class BiorseoInstance:
                 string = "FOUND with score %d:\t\t possible insertion of motif " % site.score + site.atlas_id
                 if site.rotation:
                     string += " (reversed)"
-                 string += (" on " + site.loop.get_header() + " at positions")
+                 string += (" on " + site.loop.header + " at positions")
             resultsfile.write(site.atlas_id+',' +
                               str(bool(site.rotation))+",%d" % site.score+',')
             positions = [','.join([str(y) for y in x]) for x in site.position]
@@ -629,10 +450,10 @@ class BiorseoInstance:
     def launch_BayesPairing(self, module_type, seq_, header_):
         chdir(bypdir)
 
-         cmd = ["python3", "parse_sequences.py", "-seq", self.outputf +
+         cmd = ["python3", "parse_sequences.py", "-seq", biorseoDir + '/' + tempDir +
                header_ + ".fa", "-d", module_type, "-interm", "1"]
 
-         logfile = open("log_of_the_run.sh", 'a')
+         logfile = open(biorseoDir + "/log_of_the_run.sh", 'a')
         logfile.write(" ".join(cmd))
         logfile.write("\n")
         logfile.close()
@@ -646,9 +467,9 @@ class BiorseoInstance:
             l = BypLog[idx]
         insertion_sites = [x for x in ast.literal_eval(l.split(":")[1][1:])]
         if module_type == "rna3dmotif":
-             rna = open(self.outputf + header_ + ".byp.csv", "w")
+             rna = open(biorseoDir + "/" + tempDir + header_ + ".byp.csv", "w")
         else:
-             rna = open(self.outputf + header_ + ".bgsubyp.csv", "w")
+             rna = open(biorseoDir + "/" + tempDir + header_ + ".bgsubyp.csv", "w")
         rna.write("Motif,Score,Start1,End1,Start2,End2...\n")
         for i, module in enumerate(insertion_sites):
             if len(module):
@@ -687,8 +508,8 @@ class BiorseoInstance:
             except:
                 r = 1
                 pass
-         if r:
-             fails.append(j)
+         # if r:
+             # fails.append(j)
         running_stats[1] += 1
         return r
 
@@ -729,48 +550,12 @@ class BiorseoInstance:
             print("Computations ran successfully.")
             print()
 
-     def check_result_existence(self, datatype, method, function, with_PK, basename):
-         folder = self.outputf+"PK/" if with_PK else self.outputf+"noPK/"
-         if datatype == "bgsu":
-             if method == "jar3d":
-                 extension = ".jar3d"
-             elif method == "byp":
-                 extension = ".bgsubyp"
-             else:
-                 raise "Unknown method !"
-         elif datatype == "desc":
-             if method == "dpm":
-                 extension = ".raw"
-             elif method == "byp":
-                 extension = ".byp"
-             else:
-                 raise "Unknown method !"
-         else:
-             raise "Unknown data type !"
-         return path.isfile(folder + basename + extension + function)
- 
-     def check_csv_existence(self, datatype, method, basename):
-         if datatype == "bgsu":
-             if method == "jar3d":
-                 extension = ".sites.csv"
-             elif method == "byp":
-                 extension = ".bgsubyp.csv"
-             else:
-                 raise "Unknown method !"
-         elif datatype == "desc":
-             if method == "byp":
-                 extension = ".byp.csv"
-             else:
-                 raise "You cannot use " + method + " with " + datatype + " data !"
-         else:
-             raise "Unknown data type !"
-         return path.isfile(self.outputf + basename + extension)
- 
     def list_jobs(self):
 
         # Read fasta file, which can contain one or several RNAs
         RNAcontainer = []
         subprocess.call(["mkdir", "-p", self.outputf])  # Create the output folder
+         subprocess.call(["mkdir", "-p", tempDir])  # Create the temp folder
         print("loading file %s..." % self.inputfile)
         db = open(self.inputfile, "r")
         c = 0
@@ -789,10 +574,10 @@ class BiorseoInstance:
             if c == 0:
                 seq = l[:-1].upper()
                 if is_canonical_nts(seq):
-                     header = header.replace('/', '_').replace('\'','').replace('(','').replace(')','').replace(' ','_')
+                     header = header.replace('/', '_').replace('\'','').replace('(','').replace(')','').replace(' ','_').replace('>','')
                     RNAcontainer.append(RNA(header, seq))
-                     if not path.isfile(self.outputf + header + ".fa"):
-                         rna = open(self.outputf + header + ".fa", "w")
+                     if not path.isfile(tempDir + header + ".fa"):
+                         rna = open(tempDir + header + ".fa", "w")
                         rna.write(">" + header +'\n')
                         rna.write(seq +'\n')
                         rna.close()
@@ -807,7 +592,7 @@ class BiorseoInstance:
         for instance in RNAcontainer:
             
             executable = biorseoDir + "/bin/biorseo"
-             fastafile = self.outputf+instance.header+".fa"
+             fastafile = tempDir+instance.header+".fa"
             method_type = ""
             ext = ".raw"
             priority = 1
@@ -815,11 +600,11 @@ class BiorseoInstance:
             if self.type == "jar3d":
                 ext = ".jar3d"
                 method_type = "--jar3dcsv"
-                 csv = self.outputf + instance.header + ".sites.csv"
+                 csv = tempDir + instance.header + ".sites.csv"
 
                 # RNAsubopt
                 self.joblist.append(Job(command=["RNAsubopt", "-i", fastafile, "--outfile="+ instance.header + ".subopt"], priority=1))
-                 self.joblist.append(Job(command=["mv", instance.header + ".subopt", self.outputf], priority=2))
+                 self.joblist.append(Job(command=["mv", instance.header + ".subopt", tempDir], priority=2))
                 # JAR3D
                 self.joblist.append(Job(function=self.launch_JAR3D, args=[instance.seq_, instance.header], priority=3, how_many_in_parallel=1))
                 priority = 4
@@ -827,11 +612,11 @@ class BiorseoInstance:
                 method_type = "--bayespaircsv"
                 if self.modules == "desc":
                     ext = ".byp"
-                     csv = self.outputf + instance.header + ".byp.csv"
+                     csv = tempDir + instance.header + ".byp.csv"
                     self.joblist.append(Job(function=self.launch_BayesPairing, args=["rna3dmotif", instance.seq_, instance.header], how_many_in_parallel=-1, priority=1))
                 elif self.modules == "bgsu":
                     ext = ".bgsubyp"
-                     csv = self.outputf + instance.header + ".bgsubyp.csv"
+                     csv = tempDir + instance.header + ".bgsubyp.csv"
                     self.joblist.append(Job(function=self.launch_BayesPairing, args=["3dmotifatlas", instance.seq_, instance.header], how_many_in_parallel=-1, priority=1))
                 priority = 2
             if self.type == "dpm":