Louis BECQUEY

Standalone script biorseo.py

...@@ -8,7 +8,7 @@ from os import path, makedirs, getcwd, chdir, devnull ...@@ -8,7 +8,7 @@ from os import path, makedirs, getcwd, chdir, devnull
8 import matplotlib.pyplot as plt 8 import matplotlib.pyplot as plt
9 from matplotlib import colors 9 from matplotlib import colors
10 from math import sqrt 10 from math import sqrt
11 -from multiprocessing import Pool, cpu_count, Manager 11 +from multiprocessing import cpu_count, Manager
12 import multiprocessing 12 import multiprocessing
13 import ast 13 import ast
14 14
...@@ -21,12 +21,26 @@ bypdir = "" ...@@ -21,12 +21,26 @@ bypdir = ""
21 biorseoDir = "." 21 biorseoDir = "."
22 exec(compile(open(biorseoDir+"/EditMe").read(), '', 'exec')) 22 exec(compile(open(biorseoDir+"/EditMe").read(), '', 'exec'))
23 runDir = path.dirname(path.realpath(__file__)) 23 runDir = path.dirname(path.realpath(__file__))
24 -self.outputf = biorseoDir + "/results/"
25 tempDir = biorseoDir + "/temp/" 24 tempDir = biorseoDir + "/temp/"
26 HLmotifDir = biorseoDir + "/data/modules/BGSU/HL/3.2/lib" 25 HLmotifDir = biorseoDir + "/data/modules/BGSU/HL/3.2/lib"
27 ILmotifDir = biorseoDir + "/data/modules/BGSU/IL/3.2/lib" 26 ILmotifDir = biorseoDir + "/data/modules/BGSU/IL/3.2/lib"
28 descfolder = biorseoDir + "/data/modules/DESC" 27 descfolder = biorseoDir + "/data/modules/DESC"
29 28
29 +# Parse options
30 +try:
31 + opts, args = getopt.getopt(sys.argv[1:], "hi:o:", ["rna3dmotifs","3dmotifatlas","jar3d","bayespairing","patternmatch","func="])
32 +except getopt.GetoptError:
33 + print("Please provide arguments !")
34 + sys.exit(2)
35 +
36 +
37 +m = Manager()
38 +running_stats = m.list()
39 +running_stats.append(0) # n_launched
40 +running_stats.append(0) # n_finished
41 +running_stats.append(0) # n_skipped
42 +fails = m.list()
43 +
30 44
31 # ================== CLASSES AND FUNCTIONS ================================ 45 # ================== CLASSES AND FUNCTIONS ================================
32 46
...@@ -55,11 +69,10 @@ class NoDaemonProcess(multiprocessing.Process): ...@@ -55,11 +69,10 @@ class NoDaemonProcess(multiprocessing.Process):
55 class NoDaemonContext(type(multiprocessing.get_context())): 69 class NoDaemonContext(type(multiprocessing.get_context())):
56 Process = NoDaemonProcess 70 Process = NoDaemonProcess
57 71
58 -# We sub-class multiprocessing.pool.Pool instead of multiprocessing.Pool
59 -# because the latter is only a wrapper function, not a proper class.
60 -
61 72
62 class MyPool(multiprocessing.pool.Pool): 73 class MyPool(multiprocessing.pool.Pool):
74 + # We sub-class multiprocessing.pool.Pool instead of multiprocessing.Pool
75 + # because the latter is only a wrapper function, not a proper class.
63 def __init__(self, *args, **kwargs): 76 def __init__(self, *args, **kwargs):
64 kwargs['context'] = NoDaemonContext() 77 kwargs['context'] = NoDaemonContext()
65 super(MyPool, self).__init__(*args, **kwargs) 78 super(MyPool, self).__init__(*args, **kwargs)
...@@ -103,12 +116,10 @@ class InsertionSite: ...@@ -103,12 +116,10 @@ class InsertionSite:
103 116
104 117
105 class Job: 118 class Job:
106 - def __init__(self, command=[], function=None, args=[], how_many_in_parallel=0, priority=1, timeout=None, checkFunc=None, checkArgs=[]): 119 + def __init__(self, command=[], function=None, args=[], how_many_in_parallel=0, priority=1, timeout=None):
107 self.cmd_ = command 120 self.cmd_ = command
108 self.func_ = function 121 self.func_ = function
109 self.args_ = args 122 self.args_ = args
110 - self.checkFunc_ = checkFunc
111 - self.checkArgs_ = checkArgs
112 self.priority_ = priority 123 self.priority_ = priority
113 self.timeout_ = timeout 124 self.timeout_ = timeout
114 if not how_many_in_parallel: 125 if not how_many_in_parallel:
...@@ -122,7 +133,7 @@ class Job: ...@@ -122,7 +133,7 @@ class Job:
122 class RNA: 133 class RNA:
123 def __init__(self, header, seq): 134 def __init__(self, header, seq):
124 self.seq_ = seq 135 self.seq_ = seq
125 - self.header_ = header 136 + self.header = header
126 self.length = len(seq) 137 self.length = len(seq)
127 138
128 self.rnasubopt = [] 139 self.rnasubopt = []
...@@ -314,62 +325,55 @@ class RNA: ...@@ -314,62 +325,55 @@ class RNA:
314 325
315 326
316 class BiorseoInstance: 327 class BiorseoInstance:
317 - def __init__(self, argv): 328 + def __init__(self, opts):
318 # set default options 329 # set default options
319 self.type = "dpm" 330 self.type = "dpm"
320 self.modules = "desc" 331 self.modules = "desc"
321 self.func = 'B' 332 self.func = 'B'
322 - self.outputf = self.outputf 333 + self.inputfile = ""
334 + self.outputf = biorseoDir + "/results/" # default results location
323 self.jobcount = 0 335 self.jobcount = 0
336 + self.joblist = []
337 + self.mode = 0 # default is single sequence mode
324 338
325 - # Parse options
326 - try:
327 - opts, args = getopt.getopt(
328 - argv, "hi:o:", ["type=", "func=", "modules="])
329 - except getopt.GetoptError:
330 - print("Please provide arguments !")
331 - sys.exit(2)
332 for opt, arg in opts: 339 for opt, arg in opts:
333 if opt == "-h": 340 if opt == "-h":
334 - print("biorseo.py -i myRNA.fa -o myRNA.jar3dB --type jar3d --func B") 341 + print("biorseo.py -i myRNA.fa -o myRNA.rawB --rna3dmotifs --patternmatch --func B")
342 + print("biorseo.py -i myRNA.fa -o myRNA.jar3dB --3dmotifatlas --jar3d --func B")
343 + print("biorseo.py -i myRNA.fa -o myRNA.bgsubypD --3dmotifatlas --bayespairing --func D")
335 sys.exit() 344 sys.exit()
336 elif opt == "-i": 345 elif opt == "-i":
337 self.inputfile = arg 346 self.inputfile = arg
338 - self.mode = 0 # single sequence mode
339 elif opt == "-o": 347 elif opt == "-o":
340 - self.outputf = arg # output file or folder... 348 + self.outputf = arg # output file or folder...
349 + if self.outputf[1] != '/':
350 + self.outputf = getcwd() + '/' + self.outputf
351 + if self.outputf[-1] != '/':
352 + self.outputf = self.outputf + '/'
341 elif opt == "--func": 353 elif opt == "--func":
342 if arg in ['A', 'B', 'C', 'D']: 354 if arg in ['A', 'B', 'C', 'D']:
343 self.func = arg 355 self.func = arg
344 else: 356 else:
345 raise "Unknown scoring function " + arg 357 raise "Unknown scoring function " + arg
346 - elif opt == "--type": 358 + elif opt == "--patternmatch":
347 - if arg in ['dpm', 'jar3d', 'byp']: 359 + self.type = "dpm"
348 - self.type = arg 360 + elif opt == "--jar3d":
349 - else: 361 + self.type = "jar3d"
350 - raise "Unknown pattern matching method " + arg 362 + elif opt == "--bayespairing":
351 - elif opt == "--modules": 363 + self.type = "byp"
352 - if arg in ['desc', 'bgsu']: 364 + elif opt == "--rna3dmotifs":
353 - self.modules = arg 365 + self.modules = "desc"
354 - else: 366 + elif opt == "--3dmotifatlas":
355 - raise "Unsupported module model type " + arg 367 + self.modules = "bgsu"
356 else: 368 else:
357 raise "Unknown option " + opt 369 raise "Unknown option " + opt
358 370
371 + print("saving files to", self.outputf)
359 # create jobs 372 # create jobs
360 self.list_jobs() 373 self.list_jobs()
361 374
362 - if self.mode: 375 + # run them
363 - # Create a job manager 376 + self.execute_jobs()
364 - self.manager = Manager()
365 - self.running_stats = self.manager.list()
366 - self.running_stats.append(0) # n_launched
367 - self.running_stats.append(0) # n_finished
368 - self.running_stats.append(0) # n_skipped
369 - self.fails = self.manager.list()
370 -
371 - # Create the output folder
372 - subprocess.call(["mkdir", "-p", self.outputf])
373 377
374 def enumerate_loops(self, s): 378 def enumerate_loops(self, s):
375 def resort(unclosedLoops): 379 def resort(unclosedLoops):
...@@ -575,11 +579,11 @@ class BiorseoInstance: ...@@ -575,11 +579,11 @@ class BiorseoInstance:
575 resultsfile.write(','.join(positions)+'\n') 579 resultsfile.write(','.join(positions)+'\n')
576 resultsfile.close() 580 resultsfile.close()
577 581
578 - def launch_BayesPairing(self, module_type, seq_, header_, basename): 582 + def launch_BayesPairing(self, module_type, seq_, header_):
579 chdir(bypdir) 583 chdir(bypdir)
580 584
581 cmd = ["python3", "parse_sequences.py", "-seq", self.outputf + 585 cmd = ["python3", "parse_sequences.py", "-seq", self.outputf +
582 - basename + ".fa", "-d", module_type, "-interm", "1"] 586 + header_ + ".fa", "-d", module_type, "-interm", "1"]
583 587
584 logfile = open("log_of_the_run.sh", 'a') 588 logfile = open("log_of_the_run.sh", 'a')
585 logfile.write(" ".join(cmd)) 589 logfile.write(" ".join(cmd))
...@@ -595,9 +599,9 @@ class BiorseoInstance: ...@@ -595,9 +599,9 @@ class BiorseoInstance:
595 l = BypLog[idx] 599 l = BypLog[idx]
596 insertion_sites = [x for x in ast.literal_eval(l.split(":")[1][1:])] 600 insertion_sites = [x for x in ast.literal_eval(l.split(":")[1][1:])]
597 if module_type == "rna3dmotif": 601 if module_type == "rna3dmotif":
598 - rna = open(self.outputf + basename + ".byp.csv", "w") 602 + rna = open(self.outputf + header_ + ".byp.csv", "w")
599 else: 603 else:
600 - rna = open(self.outputf + basename + ".bgsubyp.csv", "w") 604 + rna = open(self.outputf + header_ + ".bgsubyp.csv", "w")
601 rna.write("Motif,Score,Start1,End1,Start2,End2...\n") 605 rna.write("Motif,Score,Start1,End1,Start2,End2...\n")
602 for i, module in enumerate(insertion_sites): 606 for i, module in enumerate(insertion_sites):
603 if len(module): 607 if len(module):
...@@ -618,23 +622,18 @@ class BiorseoInstance: ...@@ -618,23 +622,18 @@ class BiorseoInstance:
618 rna.close() 622 rna.close()
619 623
620 def execute_job(self, j): 624 def execute_job(self, j):
621 - if j.checkFunc_ is not None: 625 +
622 - if j.checkFunc_(*j.checkArgs_): 626 + running_stats[0] += 1
623 - self.running_stats[2] += 1
624 - print("["+str(self.running_stats[0]+self.running_stats[2]) +
625 - '/'+str(self.jobcount)+"]\tSkipping a finished job")
626 - return 0
627 - self.running_stats[0] += 1
628 if len(j.cmd_): 627 if len(j.cmd_):
629 logfile = open("log_of_the_run.sh", 'a') 628 logfile = open("log_of_the_run.sh", 'a')
630 logfile.write(" ".join(j.cmd_)) 629 logfile.write(" ".join(j.cmd_))
631 logfile.write("\n") 630 logfile.write("\n")
632 logfile.close() 631 logfile.close()
633 - print("["+str(self.running_stats[0]+self.running_stats[2]) + 632 + print("["+str(running_stats[0]+running_stats[2]) +
634 '/'+str(self.jobcount)+"]\t"+" ".join(j.cmd_)) 633 '/'+str(self.jobcount)+"]\t"+" ".join(j.cmd_))
635 r = subprocess.call(j.cmd_, timeout=j.timeout_) 634 r = subprocess.call(j.cmd_, timeout=j.timeout_)
636 elif j.func_ is not None: 635 elif j.func_ is not None:
637 - print("["+str(self.running_stats[0]+self.running_stats[2])+'/'+str(self.jobcount) + 636 + print("["+str(running_stats[0]+running_stats[2])+'/'+str(self.jobcount) +
638 "]\t"+j.func_.__name__+'('+", ".join([a for a in j.args_])+')') 637 "]\t"+j.func_.__name__+'('+", ".join([a for a in j.args_])+')')
639 try: 638 try:
640 r = j.func_(*j.args_) 639 r = j.func_(*j.args_)
...@@ -642,10 +641,47 @@ class BiorseoInstance: ...@@ -642,10 +641,47 @@ class BiorseoInstance:
642 r = 1 641 r = 1
643 pass 642 pass
644 if r: 643 if r:
645 - self.fails.append(j) 644 + fails.append(j)
646 - self.running_stats[1] += 1 645 + running_stats[1] += 1
647 return r 646 return r
648 647
648 + def execute_jobs(self):
649 + jobs = {}
650 + self.jobcount = len(self.joblist)
651 + for job in self.joblist:
652 + if job.priority_ not in jobs.keys():
653 + jobs[job.priority_] = {}
654 + if job.nthreads not in jobs[job.priority_].keys():
655 + jobs[job.priority_][job.nthreads] = []
656 + jobs[job.priority_][job.nthreads].append(job)
657 + nprio = max(jobs.keys())
658 +
659 + for i in range(1,nprio+1):
660 + if not len(jobs[i].keys()): continue
661 +
662 + # check the thread numbers
663 + different_thread_numbers = [n for n in jobs[i].keys()]
664 + different_thread_numbers.sort()
665 +
666 + for n in different_thread_numbers:
667 + bunch = jobs[i][n]
668 + if not len(bunch): continue
669 + pool = MyPool(processes=n)
670 + pool.map(self.execute_job, bunch)
671 + pool.close()
672 + pool.join()
673 +
674 + if len(fails):
675 + print()
676 + print("Some jobs failed! :")
677 + print()
678 + for j in fails:
679 + print(j.cmd_)
680 + else:
681 + print()
682 + print("Computations ran successfully.")
683 + print()
684 +
649 def check_result_existence(self, datatype, method, function, with_PK, basename): 685 def check_result_existence(self, datatype, method, function, with_PK, basename):
650 folder = self.outputf+"PK/" if with_PK else self.outputf+"noPK/" 686 folder = self.outputf+"PK/" if with_PK else self.outputf+"noPK/"
651 if datatype == "bgsu": 687 if datatype == "bgsu":
...@@ -687,8 +723,8 @@ class BiorseoInstance: ...@@ -687,8 +723,8 @@ class BiorseoInstance:
687 723
688 # Read fasta file, which can contain one or several RNAs 724 # Read fasta file, which can contain one or several RNAs
689 RNAcontainer = [] 725 RNAcontainer = []
690 - print("loading file(s)...") 726 + subprocess.call(["mkdir", "-p", self.outputf]) # Create the output folder
691 - 727 + print("loading file %s..." % self.inputfile)
692 db = open(self.inputfile, "r") 728 db = open(self.inputfile, "r")
693 c = 0 729 c = 0
694 header = "" 730 header = ""
...@@ -701,33 +737,33 @@ class BiorseoInstance: ...@@ -701,33 +737,33 @@ class BiorseoInstance:
701 c = c % 2 737 c = c % 2
702 if c == 1: 738 if c == 1:
703 if header != "": # This is our second RNA in the fasta file 739 if header != "": # This is our second RNA in the fasta file
704 - self.mode = 1 740 + self.mode = 1 # we switch to batch mode
705 - header = l[:-1] 741 + header = l[1:-1]
706 if c == 0: 742 if c == 0:
707 seq = l[:-1].upper() 743 seq = l[:-1].upper()
708 if is_canonical_nts(seq): 744 if is_canonical_nts(seq):
709 - header = header.replace('/', '_') 745 + header = header.replace('/', '_').replace('\'','').replace('(','').replace(')','')
710 RNAcontainer.append(RNA(header, seq)) 746 RNAcontainer.append(RNA(header, seq))
711 if not path.isfile(self.outputf + header + ".fa"): 747 if not path.isfile(self.outputf + header + ".fa"):
712 rna = open(self.outputf + header + ".fa", "w") 748 rna = open(self.outputf + header + ".fa", "w")
713 rna.write(">" + header +'\n') 749 rna.write(">" + header +'\n')
714 rna.write(seq +'\n') 750 rna.write(seq +'\n')
715 rna.close() 751 rna.close()
716 - db.close() 752 + db.close()
717 753
718 for nt, number in ignored_nt_dict.items(): 754 for nt, number in ignored_nt_dict.items():
719 print("ignored %d sequences because of char %c" % (number, nt)) 755 print("ignored %d sequences because of char %c" % (number, nt))
720 tot = len(RNAcontainer) 756 tot = len(RNAcontainer)
721 print("Loaded %d RNAs." % (tot)) 757 print("Loaded %d RNAs." % (tot))
722 758
723 - #define job list 759 + # define job list
724 - joblist = []
725 for instance in RNAcontainer: 760 for instance in RNAcontainer:
726 761
727 executable = biorseoDir + "/bin/biorseo" 762 executable = biorseoDir + "/bin/biorseo"
728 fastafile = self.outputf+instance.header+".fa" 763 fastafile = self.outputf+instance.header+".fa"
729 method_type = "" 764 method_type = ""
730 ext = ".raw" 765 ext = ".raw"
766 + priority = 1
731 767
732 if self.type == "jar3d": 768 if self.type == "jar3d":
733 ext = ".jar3d" 769 ext = ".jar3d"
...@@ -735,28 +771,30 @@ class BiorseoInstance: ...@@ -735,28 +771,30 @@ class BiorseoInstance:
735 csv = self.outputf + instance.header + ".sites.csv" 771 csv = self.outputf + instance.header + ".sites.csv"
736 772
737 # RNAsubopt 773 # RNAsubopt
738 - joblist.append(Job(command=["RNAsubopt", "-i", fastafile, "--outfile="+ instance.header + ".subopt"], priority=1, checkFunc=check_RNAsubopt, checkArgs=[instance.header])) 774 + self.joblist.append(Job(command=["RNAsubopt", "-i", fastafile, "--outfile="+ instance.header + ".subopt"], priority=1))
739 - joblist.append(Job(command=["mv", instance.header + ".subopt", self.outputf], priority=2, checkFunc=check_RNAsubopt, checkArgs=[instance.header])) 775 + self.joblist.append(Job(command=["mv", instance.header + ".subopt", self.outputf], priority=2))
740 # JAR3D 776 # JAR3D
741 - joblist.append(Job(function=self.launch_JAR3D, args=[instance.seq_, instance.header], priority=3, how_many_in_parallel=1, checkFunc=check_JAR3D, checkArgs=[instance.header])) 777 + self.joblist.append(Job(function=self.launch_JAR3D, args=[instance.seq_, instance.header], priority=3, how_many_in_parallel=1))
742 - 778 + priority = 4
743 if self.type == "byp": 779 if self.type == "byp":
744 method_type = "--bayespaircsv" 780 method_type = "--bayespaircsv"
745 if self.modules == "desc": 781 if self.modules == "desc":
746 ext = ".byp" 782 ext = ".byp"
747 csv = self.outputf + instance.header + ".byp.csv" 783 csv = self.outputf + instance.header + ".byp.csv"
748 - joblist.append(Job(function=self.launch_BayesPairing, args=["rna3dmotif", instance.seq_, instance.header_, instance.header], how_many_in_parallel=-1, priority=1, checkFunc=check_BayesPairing, checkArgs=[instance.header])) 784 + self.joblist.append(Job(function=self.launch_BayesPairing, args=["rna3dmotif", instance.seq_, instance.header], how_many_in_parallel=-1, priority=1))
749 elif self.modules == "bgsu": 785 elif self.modules == "bgsu":
750 ext = ".bgsubyp" 786 ext = ".bgsubyp"
751 csv = self.outputf + instance.header + ".bgsubyp.csv" 787 csv = self.outputf + instance.header + ".bgsubyp.csv"
752 - joblist.append(Job(function=self.launch_BayesPairing, args=["3dmotifatlas", instance.seq_, instance.header_, instance.header], how_many_in_parallel=-1, priority=1, checkFunc=check_BGSUBayesPairing, checkArgs=[instance.header])) 788 + self.joblist.append(Job(function=self.launch_BayesPairing, args=["3dmotifatlas", instance.seq_, instance.header], how_many_in_parallel=-1, priority=1))
753 - 789 + priority = 2
790 + if self.type == "dpm":
791 + method_type = "--descfolder"
792 + csv = descfolder
754 command = [executable, "-s", fastafile ] 793 command = [executable, "-s", fastafile ]
755 if method_type: 794 if method_type:
756 command += [ method_type, csv ] 795 command += [ method_type, csv ]
757 command += [ "-o", self.outputf + instance.header + ext + self.func, "--type", self.func ] 796 command += [ "-o", self.outputf + instance.header + ext + self.func, "--type", self.func ]
758 - joblist.append(Job(command=command, priority=4, timeout=3600, how_many_in_parallel=3)) 797 + self.joblist.append(Job(command=command, priority=priority, timeout=3600, how_many_in_parallel=3))
759 798
760 799
761 -if __name__ == "__main__": 800 +BiorseoInstance(opts)
762 - BiorseoInstance(sys.argv)
......
...@@ -22,7 +22,8 @@ typedef unsigned int uint; ...@@ -22,7 +22,8 @@ typedef unsigned int uint;
22 unsigned int Fasta::load(std::list<Fasta>& data, const char* file){ 22 unsigned int Fasta::load(std::list<Fasta>& data, const char* file){
23 23
24 std::string line, name, seq, str; 24 std::string line, name, seq, str;
25 - std::ifstream ifs(file); 25 + std::ifstream ifs;
26 + ifs.open(file, std::ios::in);
26 while (std::getline(ifs, line)) { 27 while (std::getline(ifs, line)) {
27 if (line[0]=='>') { // header 28 if (line[0]=='>') { // header
28 if (!name.empty()) { 29 if (!name.empty()) {
......
...@@ -69,6 +69,7 @@ class NoDaemonProcess(multiprocessing.Process): ...@@ -69,6 +69,7 @@ class NoDaemonProcess(multiprocessing.Process):
69 class NoDaemonContext(type(multiprocessing.get_context())): 69 class NoDaemonContext(type(multiprocessing.get_context())):
70 Process = NoDaemonProcess 70 Process = NoDaemonProcess
71 71
72 +print(multiprocessing)
72 # We sub-class multiprocessing.pool.Pool instead of multiprocessing.Pool 73 # We sub-class multiprocessing.pool.Pool instead of multiprocessing.Pool
73 # because the latter is only a wrapper function, not a proper class. 74 # because the latter is only a wrapper function, not a proper class.
74 class MyPool(multiprocessing.pool.Pool): 75 class MyPool(multiprocessing.pool.Pool):
...@@ -76,7 +77,7 @@ class MyPool(multiprocessing.pool.Pool): ...@@ -76,7 +77,7 @@ class MyPool(multiprocessing.pool.Pool):
76 kwargs['context'] = NoDaemonContext() 77 kwargs['context'] = NoDaemonContext()
77 super(MyPool, self).__init__(*args, **kwargs) 78 super(MyPool, self).__init__(*args, **kwargs)
78 79
79 - 80 +exit()
80 def execute_job(j): 81 def execute_job(j):
81 82
82 if j.checkFunc_ is not None: 83 if j.checkFunc_ is not None:
......