Louis BECQUEY

Average MCCs

......@@ -12,7 +12,6 @@ doc/*.fls
doc/*.fdb_latexmk
# Compiled Object files
*.o
obj/*
# Executables
......
......@@ -21,7 +21,8 @@ bypdir = ""
biorseoDir = "."
exec(compile(open(biorseoDir+"/EditMe").read(), '', 'exec'))
runDir = path.dirname(path.realpath(__file__))
outputDir = biorseoDir + "/results/"
self.outputf = biorseoDir + "/results/"
tempDir = biorseoDir + "/temp/"
HLmotifDir = biorseoDir + "/data/modules/BGSU/HL/3.2/lib"
ILmotifDir = biorseoDir + "/data/modules/BGSU/IL/3.2/lib"
descfolder = biorseoDir + "/data/modules/DESC"
......@@ -29,6 +30,17 @@ descfolder = biorseoDir + "/data/modules/DESC"
# ================== CLASSES AND FUNCTIONS ================================
ignored_nt_dict = {}
def is_canonical_nts(seq):
for c in seq[:-1]:
if c not in "ACGU":
if c in ignored_nt_dict.keys():
ignored_nt_dict[c] += 1
else:
ignored_nt_dict[c] = 1
return False
return True
class NoDaemonProcess(multiprocessing.Process):
@property
......@@ -107,19 +119,213 @@ class Job:
self.nthreads = how_many_in_parallel
class RNA:
def __init__(self, header, seq):
self.seq_ = seq
self.header_ = header
self.length = len(seq)
self.rnasubopt = []
self.biorseoRawA = []
self.biorseoRawB = []
self.biorseoBGSUJAR3DA = []
self.biorseoBGSUJAR3DC = []
self.biorseoBGSUJAR3DD = []
self.biorseoBGSUJAR3DB = []
self.biorseoBayesPairA = []
self.biorseoBayesPairC = []
self.biorseoBayesPairD = []
self.biorseoBayesPairB = []
self.biorseoBGSUBayesPairA = []
self.biorseoBGSUBayesPairC = []
self.biorseoBGSUBayesPairD = []
self.biorseoBGSUBayesPairB = []
def get_RNAsubopt_results(self):
rna = open(self.outputf + self.basename + ".subopt", "r")
lines = rna.readlines()
rna.close()
for i in range(2, len(lines)):
ss = lines[i].split(' ')[0]
if ss not in self.rnasubopt.predictions:
self.rnasubopt.predictions.append(ss)
def get_biorseoBayesPairA_results(self, targetdir):
if path.isfile(targetdir+ self.basename + ".bypA"):
rna = open(targetdir+ self.basename + ".bypA", "r")
lines = rna.readlines()
rna.close()
for i in range(2, len(lines)):
ss = lines[i].split(' ')[0].split('\t')[0]
if ss not in self.biorseoBayesPairA.predictions:
self.biorseoBayesPairA.predictions.append(ss)
self.biorseoBayesPairA.ninsertions.append(lines[i].count('+'))
def get_biorseoBayesPairB_results(self, targetdir):
if path.isfile(targetdir+ self.basename + ".bypB"):
rna = open(targetdir+ self.basename + ".bypB", "r")
lines = rna.readlines()
rna.close()
for i in range(2, len(lines)):
ss = lines[i].split(' ')[0].split('\t')[0]
if ss not in self.biorseoBayesPairB.predictions:
self.biorseoBayesPairB.predictions.append(ss)
self.biorseoBayesPairB.ninsertions.append(lines[i].count('+'))
def get_biorseoBayesPairC_results(self, targetdir):
if path.isfile(targetdir+ self.basename + ".bypC"):
rna = open(targetdir+ self.basename + ".bypC", "r")
lines = rna.readlines()
rna.close()
for i in range(2, len(lines)):
ss = lines[i].split(' ')[0].split('\t')[0]
if ss not in self.biorseoBayesPairC.predictions:
self.biorseoBayesPairC.predictions.append(ss)
self.biorseoBayesPairC.ninsertions.append(lines[i].count('+'))
def get_biorseoBayesPairD_results(self, targetdir):
if path.isfile(targetdir+ self.basename + ".bypD"):
rna = open(targetdir+ self.basename + ".bypD", "r")
lines = rna.readlines()
rna.close()
for i in range(2, len(lines)):
ss = lines[i].split(' ')[0].split('\t')[0]
if ss not in self.biorseoBayesPairD.predictions:
self.biorseoBayesPairD.predictions.append(ss)
self.biorseoBayesPairD.ninsertions.append(lines[i].count('+'))
def get_biorseoRawA_results(self, targetdir):
if path.isfile(targetdir+ self.basename + ".rawA"):
rna = open(targetdir+ self.basename + ".rawA", "r")
lines = rna.readlines()
rna.close()
for i in range(2, len(lines)):
ss = lines[i].split(' ')[0].split('\t')[0]
if ss not in self.biorseoRawA.predictions:
self.biorseoRawA.predictions.append(ss)
self.biorseoRawA.ninsertions.append(lines[i].count('+'))
def get_biorseoRawB_results(self, targetdir):
if path.isfile(targetdir+ self.basename + ".rawB"):
rna = open(targetdir+ self.basename + ".rawB", "r")
lines = rna.readlines()
rna.close()
for i in range(2, len(lines)):
ss = lines[i].split(' ')[0].split('\t')[0]
if ss not in self.biorseoRawB.predictions:
self.biorseoRawB.predictions.append(ss)
self.biorseoRawB.ninsertions.append(lines[i].count('+'))
def get_biorseoBGSUJAR3DA_results(self, targetdir):
if path.isfile(targetdir+ self.basename + ".jar3dA"):
rna = open(targetdir+ self.basename + ".jar3dA", "r")
lines = rna.readlines()
rna.close()
for i in range(2, len(lines)):
ss = lines[i].split(' ')[0].split('\t')[0]
if ss not in self.biorseoBGSUJAR3DA.predictions:
self.biorseoBGSUJAR3DA.predictions.append(ss)
self.biorseoBGSUJAR3DA.ninsertions.append(lines[i].count('+'))
def get_biorseoBGSUJAR3DB_results(self, targetdir):
if path.isfile(targetdir+ self.basename + ".jar3dB"):
rna = open(targetdir+ self.basename + ".jar3dB", "r")
lines = rna.readlines()
rna.close()
for i in range(2, len(lines)):
ss = lines[i].split(' ')[0].split('\t')[0]
if ss not in self.biorseoBGSUJAR3DB.predictions:
self.biorseoBGSUJAR3DB.predictions.append(ss)
self.biorseoBGSUJAR3DB.ninsertions.append(lines[i].count('+'))
def get_biorseoBGSUJAR3DC_results(self, targetdir):
if path.isfile(targetdir+ self.basename + ".jar3dC"):
rna = open(targetdir+ self.basename + ".jar3dC", "r")
lines = rna.readlines()
rna.close()
for i in range(2, len(lines)):
ss = lines[i].split(' ')[0].split('\t')[0]
if ss not in self.biorseoBGSUJAR3DC.predictions:
self.biorseoBGSUJAR3DC.predictions.append(ss)
self.biorseoBGSUJAR3DC.ninsertions.append(lines[i].count('+'))
def get_biorseoBGSUJAR3DD_results(self, targetdir):
if path.isfile(targetdir+ self.basename + ".jar3dD"):
rna = open(targetdir+ self.basename + ".jar3dD", "r")
lines = rna.readlines()
rna.close()
for i in range(2, len(lines)):
ss = lines[i].split(' ')[0].split('\t')[0]
if ss not in self.biorseoBGSUJAR3DD.predictions:
self.biorseoBGSUJAR3DD.predictions.append(ss)
self.biorseoBGSUJAR3DD.ninsertions.append(lines[i].count('+'))
def get_biorseoBGSUBayesPairA_results(self, targetdir):
if path.isfile(targetdir+ self.basename + ".bgsubypA"):
rna = open(targetdir+ self.basename + ".bgsubypA", "r")
lines = rna.readlines()
rna.close()
for i in range(2, len(lines)):
ss = lines[i].split(' ')[0].split('\t')[0]
if ss not in self.biorseoBGSUBayesPairA.predictions:
self.biorseoBGSUBayesPairA.predictions.append(ss)
self.biorseoBGSUBayesPairA.ninsertions.append(lines[i].count('+'))
# else:
# print(targetdir+ self.basename + ".bgsubypA not found !")
def get_biorseoBGSUBayesPairB_results(self, targetdir):
if path.isfile(targetdir+ self.basename + ".bgsubypB"):
rna = open(targetdir+ self.basename + ".bgsubypB", "r")
lines = rna.readlines()
rna.close()
for i in range(2, len(lines)):
ss = lines[i].split(' ')[0].split('\t')[0]
if ss not in self.biorseoBGSUBayesPairB.predictions:
self.biorseoBGSUBayesPairB.predictions.append(ss)
self.biorseoBGSUBayesPairB.ninsertions.append(lines[i].count('+'))
# else:
# print(targetdir+ self.basename + ".bgsubypB not found !")
def get_biorseoBGSUBayesPairC_results(self, targetdir):
if path.isfile(targetdir+ self.basename + ".bgsubypC"):
rna = open(targetdir+ self.basename + ".bgsubypC", "r")
lines = rna.readlines()
rna.close()
for i in range(2, len(lines)):
ss = lines[i].split(' ')[0].split('\t')[0]
if ss not in self.biorseoBGSUBayesPairC.predictions:
self.biorseoBGSUBayesPairC.predictions.append(ss)
self.biorseoBGSUBayesPairC.ninsertions.append(lines[i].count('+'))
# else:
# print(targetdir+ self.basename + ".bgsubypC not found !")
def get_biorseoBGSUBayesPairD_results(self, targetdir):
if path.isfile(targetdir+ self.basename + ".bgsubypD"):
rna = open(targetdir+ self.basename + ".bgsubypD", "r")
lines = rna.readlines()
rna.close()
for i in range(2, len(lines)):
ss = lines[i].split(' ')[0].split('\t')[0]
if ss not in self.biorseoBGSUBayesPairD.predictions:
self.biorseoBGSUBayesPairD.predictions.append(ss)
self.biorseoBGSUBayesPairD.ninsertions.append(lines[i].count('+'))
# else:
# print(targetdir+ self.basename + ".bgsubypD not found !")
class BiorseoInstance:
def __init__(self, argv):
# set default options
self.type = "dpm"
self.modules = "desc"
self.func = 'B'
self.outputf = outputDir
self.outputf = self.outputf
self.jobcount = 0
# Parse options
try:
opts, args = getopt.getopt(
argv, "hil::o:", ["type=", "func=", "modules="])
argv, "hi:o:", ["type=", "func=", "modules="])
except getopt.GetoptError:
print("Please provide arguments !")
sys.exit(2)
......@@ -130,9 +336,6 @@ class BiorseoInstance:
elif opt == "-i":
self.inputfile = arg
self.mode = 0 # single sequence mode
elif opt == "-l":
self.inputfile = arg
self.mode = 1 # batch mode
elif opt == "-o":
self.outputf = arg # output file or folder...
elif opt == "--func":
......@@ -153,6 +356,9 @@ class BiorseoInstance:
else:
raise "Unknown option " + opt
# create jobs
self.list_jobs()
if self.mode:
# Create a job manager
self.manager = Manager()
......@@ -321,7 +527,7 @@ class BiorseoInstance:
def launch_JAR3D(self, seq_, basename):
rnasubopt_preds = []
# Extracting probable loops from RNA-subopt structures
rna = open(outputDir + basename + ".subopt", "r")
rna = open(self.outputf + basename + ".subopt", "r")
lines = rna.readlines()
rna.close()
for i in range(2, len(lines)):
......@@ -352,7 +558,7 @@ class BiorseoInstance:
insertion_sites.sort(reverse=True)
# Writing results to CSV file
c = 0
resultsfile = open(outputDir+basename+".sites.csv", "w")
resultsfile = open(self.outputf+basename+".sites.csv", "w")
resultsfile.write("Motif,Rotation,Score,Start1,End1,Start2,End2\n")
for site in insertion_sites:
if site.score > 10:
......@@ -372,7 +578,7 @@ class BiorseoInstance:
def launch_BayesPairing(self, module_type, seq_, header_, basename):
chdir(bypdir)
cmd = ["python3", "parse_sequences.py", "-seq", outputDir +
cmd = ["python3", "parse_sequences.py", "-seq", self.outputf +
basename + ".fa", "-d", module_type, "-interm", "1"]
logfile = open("log_of_the_run.sh", 'a')
......@@ -389,9 +595,9 @@ class BiorseoInstance:
l = BypLog[idx]
insertion_sites = [x for x in ast.literal_eval(l.split(":")[1][1:])]
if module_type == "rna3dmotif":
rna = open(outputDir + basename + ".byp.csv", "w")
rna = open(self.outputf + basename + ".byp.csv", "w")
else:
rna = open(outputDir + basename + ".bgsubyp.csv", "w")
rna = open(self.outputf + basename + ".bgsubyp.csv", "w")
rna.write("Motif,Score,Start1,End1,Start2,End2...\n")
for i, module in enumerate(insertion_sites):
if len(module):
......@@ -477,6 +683,80 @@ class BiorseoInstance:
raise "Unknown data type !"
return path.isfile(self.outputf + basename + extension)
def list_jobs(self):
# Read fasta file, which can contain one or several RNAs
RNAcontainer = []
print("loading file(s)...")
db = open(self.inputfile, "r")
c = 0
header = ""
seq = ""
while True:
l = db.readline()
if l == "":
break
c += 1
c = c % 2
if c == 1:
if header != "": # This is our second RNA in the fasta file
self.mode = 1
header = l[:-1]
if c == 0:
seq = l[:-1].upper()
if is_canonical_nts(seq):
header = header.replace('/', '_')
RNAcontainer.append(RNA(header, seq))
if not path.isfile(self.outputf + header + ".fa"):
rna = open(self.outputf + header + ".fa", "w")
rna.write(">" + header +'\n')
rna.write(seq +'\n')
rna.close()
db.close()
for nt, number in ignored_nt_dict.items():
print("ignored %d sequences because of char %c" % (number, nt))
tot = len(RNAcontainer)
print("Loaded %d RNAs." % (tot))
#define job list
joblist = []
for instance in RNAcontainer:
executable = biorseoDir + "/bin/biorseo"
fastafile = self.outputf+instance.header+".fa"
method_type = ""
ext = ".raw"
if self.type == "jar3d":
ext = ".jar3d"
method_type = "--jar3dcsv"
csv = self.outputf + instance.header + ".sites.csv"
# RNAsubopt
joblist.append(Job(command=["RNAsubopt", "-i", fastafile, "--outfile="+ instance.header + ".subopt"], priority=1, checkFunc=check_RNAsubopt, checkArgs=[instance.header]))
joblist.append(Job(command=["mv", instance.header + ".subopt", self.outputf], priority=2, checkFunc=check_RNAsubopt, checkArgs=[instance.header]))
# JAR3D
joblist.append(Job(function=self.launch_JAR3D, args=[instance.seq_, instance.header], priority=3, how_many_in_parallel=1, checkFunc=check_JAR3D, checkArgs=[instance.header]))
if self.type == "byp":
method_type = "--bayespaircsv"
if self.modules == "desc":
ext = ".byp"
csv = self.outputf + instance.header + ".byp.csv"
joblist.append(Job(function=self.launch_BayesPairing, args=["rna3dmotif", instance.seq_, instance.header_, instance.header], how_many_in_parallel=-1, priority=1, checkFunc=check_BayesPairing, checkArgs=[instance.header]))
elif self.modules == "bgsu":
ext = ".bgsubyp"
csv = self.outputf + instance.header + ".bgsubyp.csv"
joblist.append(Job(function=self.launch_BayesPairing, args=["3dmotifatlas", instance.seq_, instance.header_, instance.header], how_many_in_parallel=-1, priority=1, checkFunc=check_BGSUBayesPairing, checkArgs=[instance.header]))
command = [executable, "-s", fastafile ]
if method_type:
command += [ method_type, csv ]
command += [ "-o", self.outputf + instance.header + ext + self.func, "--type", self.func ]
joblist.append(Job(command=command, priority=4, timeout=3600, how_many_in_parallel=3))
if __name__ == "__main__":
BiorseoInstance(sys.argv)
......
......@@ -736,7 +736,7 @@ class RNA:
m.best_pred = p
if max(m.ninsertions) > 0 and float(n)/max(m.ninsertions) > m.ratio:
m.ratio = float(n)/max(m.ninsertions)
def get_biokop_results(self):
if path.isfile(outputDir + self.basename + ".biok"):
rna = open(outputDir + self.basename + ".biok", "r")
......@@ -1005,93 +1005,93 @@ print("Loaded %d RNAs of length between 10 and 100. %d of them contain pseudokno
# #================= PREDICTION OF STRUCTURES ===============================
# #define job list
# joblist = []
# for instance in RNAcontainer:
# basename = instance.basename
# # RNAsubopt
# joblist.append(Job(command=["RNAsubopt", "-i", outputDir + basename + ".fa", "--outfile="+ basename + ".subopt"], priority=1, checkFunc=check_RNAsubopt, checkArgs=[basename]))
# joblist.append(Job(command=["mv", basename + ".subopt", outputDir], priority=2, checkFunc=check_RNAsubopt, checkArgs=[basename]))
# # JAR3D
# joblist.append(Job(function=launch_JAR3D, args=[instance.seq_, basename], priority=3, how_many_in_parallel=1, checkFunc=check_JAR3D, checkArgs=[basename]))
# # BayesPairing and BGSUBayesPairing
# joblist.append(Job(function=launch_BayesPairing, args=["rna3dmotif", instance.seq_, instance.header_, basename], how_many_in_parallel=-1, priority=3, checkFunc=check_BayesPairing, checkArgs=[basename]))
# joblist.append(Job(function=launch_BayesPairing, args=["3dmotifatlas", instance.seq_, instance.header_, basename], how_many_in_parallel=-1, priority=3, checkFunc=check_BGSUBayesPairing, checkArgs=[basename]))
# # biorseoBGSUJAR3DA-D
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--jar3dcsv", outputDir+basename+".sites.csv", "-o", outputDir+"noPK/"+basename+".jar3dA", "--type", "A", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUJAR3DA, checkArgs=[basename, False]))
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--jar3dcsv", outputDir+basename+".sites.csv", "-o", outputDir+"noPK/"+basename+".jar3dB", "--type", "B", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUJAR3DB, checkArgs=[basename, False]))
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--jar3dcsv", outputDir+basename+".sites.csv", "-o", outputDir+"noPK/"+basename+".jar3dC", "--type", "C", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUJAR3DC, checkArgs=[basename, False]))
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--jar3dcsv", outputDir+basename+".sites.csv", "-o", outputDir+"noPK/"+basename+".jar3dD", "--type", "D", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUJAR3DD, checkArgs=[basename, False]))
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--jar3dcsv", outputDir+basename+".sites.csv", "-o", outputDir+"PK/"+basename+".jar3dA", "--type", "A"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUJAR3DA, checkArgs=[basename, True]))
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--jar3dcsv", outputDir+basename+".sites.csv", "-o", outputDir+"PK/"+basename+".jar3dB", "--type", "B"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUJAR3DB, checkArgs=[basename, True]))
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--jar3dcsv", outputDir+basename+".sites.csv", "-o", outputDir+"PK/"+basename+".jar3dC", "--type", "C"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUJAR3DC, checkArgs=[basename, True]))
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--jar3dcsv", outputDir+basename+".sites.csv", "-o", outputDir+"PK/"+basename+".jar3dD", "--type", "D"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUJAR3DD, checkArgs=[basename, True]))
# # biorseoBGSUBayesPairA-D
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".bgsubyp.csv", "-o", outputDir+"noPK/"+basename+".bgsubypA", "--type", "A", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUBayesPairA, checkArgs=[basename, False]))
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".bgsubyp.csv", "-o", outputDir+"noPK/"+basename+".bgsubypB", "--type", "B", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUBayesPairB, checkArgs=[basename, False]))
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".bgsubyp.csv", "-o", outputDir+"noPK/"+basename+".bgsubypC", "--type", "C", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUBayesPairC, checkArgs=[basename, False]))
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".bgsubyp.csv", "-o", outputDir+"noPK/"+basename+".bgsubypD", "--type", "D", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUBayesPairD, checkArgs=[basename, False]))
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".bgsubyp.csv", "-o", outputDir+"PK/"+basename+".bgsubypA", "--type", "A"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUBayesPairA, checkArgs=[basename, True]))
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".bgsubyp.csv", "-o", outputDir+"PK/"+basename+".bgsubypB", "--type", "B"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUBayesPairB, checkArgs=[basename, True]))
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".bgsubyp.csv", "-o", outputDir+"PK/"+basename+".bgsubypC", "--type", "C"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUBayesPairC, checkArgs=[basename, True]))
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".bgsubyp.csv", "-o", outputDir+"PK/"+basename+".bgsubypD", "--type", "D"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUBayesPairD, checkArgs=[basename, True]))
# # biorseoBayesPairA-D
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".byp.csv", "-o", outputDir+"noPK/"+basename+".bypA", "--type", "A", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBayesPairA, checkArgs=[basename, False]))
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".byp.csv", "-o", outputDir+"noPK/"+basename+".bypB", "--type", "B", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBayesPairB, checkArgs=[basename, False]))
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".byp.csv", "-o", outputDir+"noPK/"+basename+".bypC", "--type", "C", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBayesPairC, checkArgs=[basename, False]))
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".byp.csv", "-o", outputDir+"noPK/"+basename+".bypD", "--type", "D", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBayesPairD, checkArgs=[basename, False]))
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".byp.csv", "-o", outputDir+"PK/"+basename+".bypA", "--type", "A"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBayesPairA, checkArgs=[basename, True]))
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".byp.csv", "-o", outputDir+"PK/"+basename+".bypB", "--type", "B"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBayesPairB, checkArgs=[basename, True]))
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".byp.csv", "-o", outputDir+"PK/"+basename+".bypC", "--type", "C"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBayesPairC, checkArgs=[basename, True]))
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".byp.csv", "-o", outputDir+"PK/"+basename+".bypD", "--type", "D"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBayesPairD, checkArgs=[basename, True]))
# # biorseoRawA,B
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir + basename + ".fa", "-d", descfolder, "-o", outputDir+"noPK/" + basename + ".rawA", "--type", "A", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoRawA, checkArgs=[basename, False]))
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir + basename + ".fa", "-d", descfolder, "-o", outputDir+"noPK/" + basename + ".rawB", "--type", "B", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoRawB, checkArgs=[basename, False]))
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir + basename + ".fa", "-d", descfolder, "-o", outputDir+"PK/" + basename + ".rawA", "--type", "A"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoRawA, checkArgs=[basename, True]))
# joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir + basename + ".fa", "-d", descfolder, "-o", outputDir+"PK/" + basename + ".rawB", "--type", "B"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoRawB, checkArgs=[basename, True]))
# # RNA-MoIP
# joblist.append(Job(function=launch_RNAMoIP, args=[instance.seq_, instance.header_, basename], priority=3, timeout=3600, checkFunc=check_RNAMoIP, checkArgs=[basename]))
# # Biokop
# joblist.append(Job(command=[biorseoDir + "/../biokop/biokop", "-n1", "-i", outputDir + basename + ".fa", "-o", outputDir + basename + ".biok"], priority=5, timeout=15000, how_many_in_parallel=3, checkFunc=check_biokop, checkArgs=[basename]))
# # execute jobs
# jobs = {}
# jobcount = len(joblist)
# for job in joblist:
# if job.priority_ not in jobs.keys():
# jobs[job.priority_] = {}
# if job.nthreads not in jobs[job.priority_].keys():
# jobs[job.priority_][job.nthreads] = []
# jobs[job.priority_][job.nthreads].append(job)
# nprio = max(jobs.keys())
# for i in range(1,nprio+1):
# if not len(jobs[i].keys()): continue
# # check the thread numbers
# different_thread_numbers = [n for n in jobs[i].keys()]
# different_thread_numbers.sort()
# for n in different_thread_numbers:
# bunch = jobs[i][n]
# if not len(bunch): continue
# pool = MyPool(processes=n)
# results = pool.map(execute_job, bunch)
# pool.close()
# pool.join()
# if len(fails):
# print()
# print("Some jobs failed! :")
# print()
# for j in fails:
# print(j.cmd_)
# else:
# print()
# print("Computations ran successfully.")
# print()
#define job list
joblist = []
for instance in RNAcontainer:
basename = instance.basename
# RNAsubopt
joblist.append(Job(command=["RNAsubopt", "-i", outputDir + basename + ".fa", "--outfile="+ basename + ".subopt"], priority=1, checkFunc=check_RNAsubopt, checkArgs=[basename]))
joblist.append(Job(command=["mv", basename + ".subopt", outputDir], priority=2, checkFunc=check_RNAsubopt, checkArgs=[basename]))
# JAR3D
joblist.append(Job(function=launch_JAR3D, args=[instance.seq_, basename], priority=3, how_many_in_parallel=1, checkFunc=check_JAR3D, checkArgs=[basename]))
# BayesPairing and BGSUBayesPairing
joblist.append(Job(function=launch_BayesPairing, args=["rna3dmotif", instance.seq_, instance.header_, basename], how_many_in_parallel=-1, priority=3, checkFunc=check_BayesPairing, checkArgs=[basename]))
joblist.append(Job(function=launch_BayesPairing, args=["3dmotifatlas", instance.seq_, instance.header_, basename], how_many_in_parallel=-1, priority=3, checkFunc=check_BGSUBayesPairing, checkArgs=[basename]))
# biorseoBGSUJAR3DA-D
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--jar3dcsv", outputDir+basename+".sites.csv", "-o", outputDir+"noPK/"+basename+".jar3dA", "--type", "A", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUJAR3DA, checkArgs=[basename, False]))
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--jar3dcsv", outputDir+basename+".sites.csv", "-o", outputDir+"noPK/"+basename+".jar3dB", "--type", "B", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUJAR3DB, checkArgs=[basename, False]))
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--jar3dcsv", outputDir+basename+".sites.csv", "-o", outputDir+"noPK/"+basename+".jar3dC", "--type", "C", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUJAR3DC, checkArgs=[basename, False]))
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--jar3dcsv", outputDir+basename+".sites.csv", "-o", outputDir+"noPK/"+basename+".jar3dD", "--type", "D", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUJAR3DD, checkArgs=[basename, False]))
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--jar3dcsv", outputDir+basename+".sites.csv", "-o", outputDir+"PK/"+basename+".jar3dA", "--type", "A"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUJAR3DA, checkArgs=[basename, True]))
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--jar3dcsv", outputDir+basename+".sites.csv", "-o", outputDir+"PK/"+basename+".jar3dB", "--type", "B"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUJAR3DB, checkArgs=[basename, True]))
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--jar3dcsv", outputDir+basename+".sites.csv", "-o", outputDir+"PK/"+basename+".jar3dC", "--type", "C"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUJAR3DC, checkArgs=[basename, True]))
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--jar3dcsv", outputDir+basename+".sites.csv", "-o", outputDir+"PK/"+basename+".jar3dD", "--type", "D"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUJAR3DD, checkArgs=[basename, True]))
# biorseoBGSUBayesPairA-D
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".bgsubyp.csv", "-o", outputDir+"noPK/"+basename+".bgsubypA", "--type", "A", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUBayesPairA, checkArgs=[basename, False]))
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".bgsubyp.csv", "-o", outputDir+"noPK/"+basename+".bgsubypB", "--type", "B", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUBayesPairB, checkArgs=[basename, False]))
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".bgsubyp.csv", "-o", outputDir+"noPK/"+basename+".bgsubypC", "--type", "C", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUBayesPairC, checkArgs=[basename, False]))
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".bgsubyp.csv", "-o", outputDir+"noPK/"+basename+".bgsubypD", "--type", "D", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUBayesPairD, checkArgs=[basename, False]))
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".bgsubyp.csv", "-o", outputDir+"PK/"+basename+".bgsubypA", "--type", "A"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUBayesPairA, checkArgs=[basename, True]))
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".bgsubyp.csv", "-o", outputDir+"PK/"+basename+".bgsubypB", "--type", "B"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUBayesPairB, checkArgs=[basename, True]))
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".bgsubyp.csv", "-o", outputDir+"PK/"+basename+".bgsubypC", "--type", "C"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUBayesPairC, checkArgs=[basename, True]))
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".bgsubyp.csv", "-o", outputDir+"PK/"+basename+".bgsubypD", "--type", "D"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBGSUBayesPairD, checkArgs=[basename, True]))
# biorseoBayesPairA-D
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".byp.csv", "-o", outputDir+"noPK/"+basename+".bypA", "--type", "A", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBayesPairA, checkArgs=[basename, False]))
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".byp.csv", "-o", outputDir+"noPK/"+basename+".bypB", "--type", "B", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBayesPairB, checkArgs=[basename, False]))
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".byp.csv", "-o", outputDir+"noPK/"+basename+".bypC", "--type", "C", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBayesPairC, checkArgs=[basename, False]))
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".byp.csv", "-o", outputDir+"noPK/"+basename+".bypD", "--type", "D", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBayesPairD, checkArgs=[basename, False]))
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".byp.csv", "-o", outputDir+"PK/"+basename+".bypA", "--type", "A"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBayesPairA, checkArgs=[basename, True]))
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".byp.csv", "-o", outputDir+"PK/"+basename+".bypB", "--type", "B"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBayesPairB, checkArgs=[basename, True]))
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".byp.csv", "-o", outputDir+"PK/"+basename+".bypC", "--type", "C"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBayesPairC, checkArgs=[basename, True]))
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir+basename+".fa", "--bayespaircsv", outputDir+basename+".byp.csv", "-o", outputDir+"PK/"+basename+".bypD", "--type", "D"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoBayesPairD, checkArgs=[basename, True]))
# biorseoRawA,B
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir + basename + ".fa", "-d", descfolder, "-o", outputDir+"noPK/" + basename + ".rawA", "--type", "A", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoRawA, checkArgs=[basename, False]))
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir + basename + ".fa", "-d", descfolder, "-o", outputDir+"noPK/" + basename + ".rawB", "--type", "B", "-n"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoRawB, checkArgs=[basename, False]))
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir + basename + ".fa", "-d", descfolder, "-o", outputDir+"PK/" + basename + ".rawA", "--type", "A"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoRawA, checkArgs=[basename, True]))
joblist.append(Job(command=[biorseoDir+"/bin/biorseo", "-s", outputDir + basename + ".fa", "-d", descfolder, "-o", outputDir+"PK/" + basename + ".rawB", "--type", "B"], priority=4, timeout=3600, how_many_in_parallel=3, checkFunc=check_biorseoRawB, checkArgs=[basename, True]))
# RNA-MoIP
joblist.append(Job(function=launch_RNAMoIP, args=[instance.seq_, instance.header_, basename], priority=3, timeout=3600, checkFunc=check_RNAMoIP, checkArgs=[basename]))
# Biokop
joblist.append(Job(command=[biorseoDir + "/../biokop/biokop", "-n1", "-i", outputDir + basename + ".fa", "-o", outputDir + basename + ".biok"], priority=5, timeout=15000, how_many_in_parallel=3, checkFunc=check_biokop, checkArgs=[basename]))
# execute jobs
jobs = {}
jobcount = len(joblist)
for job in joblist:
if job.priority_ not in jobs.keys():
jobs[job.priority_] = {}
if job.nthreads not in jobs[job.priority_].keys():
jobs[job.priority_][job.nthreads] = []
jobs[job.priority_][job.nthreads].append(job)
nprio = max(jobs.keys())
for i in range(1,nprio+1):
if not len(jobs[i].keys()): continue
# check the thread numbers
different_thread_numbers = [n for n in jobs[i].keys()]
different_thread_numbers.sort()
for n in different_thread_numbers:
bunch = jobs[i][n]
if not len(bunch): continue
pool = MyPool(processes=n)
results = pool.map(execute_job, bunch)
pool.close()
pool.join()
if len(fails):
print()
print("Some jobs failed! :")
print()
for j in fails:
print(j.cmd_)
else:
print()
print("Computations ran successfully.")
print()
# ================= Statistics (without pseudoknots) ========================
......@@ -1183,43 +1183,43 @@ for instance in RNAcontainer:
instance.evaluate()
x_PK = [
[ rna.biokop.max_mcc for rna in RNAcontainer if len(rna.biokop.predictions)],
[ rna.biokop.max_mcc for rna in RNAcontainer if len(rna.biokop.predictions)],
[ rna.biorseoRawA.max_mcc for rna in RNAcontainer if len(rna.biorseoRawA.predictions)],
[ rna.biorseoRawB.max_mcc for rna in RNAcontainer if len(rna.biorseoRawB.predictions)],
[ rna.biorseoBayesPairA.max_mcc for rna in RNAcontainer if len(rna.biorseoBayesPairA.predictions)],
[ rna.biorseoBayesPairB.max_mcc for rna in RNAcontainer if len(rna.biorseoBayesPairB.predictions)],
[ rna.biorseoBayesPairC.max_mcc for rna in RNAcontainer if len(rna.biorseoBayesPairC.predictions)],
[ rna.biorseoBayesPairD.max_mcc for rna in RNAcontainer if len(rna.biorseoBayesPairD.predictions)],
[ rna.biorseoBGSUJAR3DA.max_mcc for rna in RNAcontainer if len(rna.biorseoBGSUJAR3DA.predictions)],
[ rna.biorseoBGSUJAR3DB.max_mcc for rna in RNAcontainer if len(rna.biorseoBGSUJAR3DB.predictions)],
[ rna.biorseoBGSUJAR3DC.max_mcc for rna in RNAcontainer if len(rna.biorseoBGSUJAR3DC.predictions)],
[ rna.biorseoBGSUJAR3DD.max_mcc for rna in RNAcontainer if len(rna.biorseoBGSUJAR3DD.predictions)],
[ rna.biorseoBGSUBayesPairA.max_mcc for rna in RNAcontainer if len(rna.biorseoBGSUBayesPairA.predictions)],
[ rna.biorseoBGSUBayesPairB.max_mcc for rna in RNAcontainer if len(rna.biorseoBGSUBayesPairB.predictions)],
[ rna.biorseoBGSUBayesPairC.max_mcc for rna in RNAcontainer if len(rna.biorseoBGSUBayesPairC.predictions)],
[ rna.biorseoBGSUBayesPairD.max_mcc for rna in RNAcontainer if len(rna.biorseoBGSUBayesPairD.predictions)]
[ rna.biokop.avg_mcc for rna in RNAcontainer if len(rna.biokop.predictions)],
[ rna.biokop.avg_mcc for rna in RNAcontainer if len(rna.biokop.predictions)],
[ rna.biorseoRawA.avg_mcc for rna in RNAcontainer if len(rna.biorseoRawA.predictions)],
[ rna.biorseoRawB.avg_mcc for rna in RNAcontainer if len(rna.biorseoRawB.predictions)],
[ rna.biorseoBayesPairA.avg_mcc for rna in RNAcontainer if len(rna.biorseoBayesPairA.predictions)],
[ rna.biorseoBayesPairB.avg_mcc for rna in RNAcontainer if len(rna.biorseoBayesPairB.predictions)],
[ rna.biorseoBayesPairC.avg_mcc for rna in RNAcontainer if len(rna.biorseoBayesPairC.predictions)],
[ rna.biorseoBayesPairD.avg_mcc for rna in RNAcontainer if len(rna.biorseoBayesPairD.predictions)],
[ rna.biorseoBGSUJAR3DA.avg_mcc for rna in RNAcontainer if len(rna.biorseoBGSUJAR3DA.predictions)],
[ rna.biorseoBGSUJAR3DB.avg_mcc for rna in RNAcontainer if len(rna.biorseoBGSUJAR3DB.predictions)],
[ rna.biorseoBGSUJAR3DC.avg_mcc for rna in RNAcontainer if len(rna.biorseoBGSUJAR3DC.predictions)],
[ rna.biorseoBGSUJAR3DD.avg_mcc for rna in RNAcontainer if len(rna.biorseoBGSUJAR3DD.predictions)],
[ rna.biorseoBGSUBayesPairA.avg_mcc for rna in RNAcontainer if len(rna.biorseoBGSUBayesPairA.predictions)],
[ rna.biorseoBGSUBayesPairB.avg_mcc for rna in RNAcontainer if len(rna.biorseoBGSUBayesPairB.predictions)],
[ rna.biorseoBGSUBayesPairC.avg_mcc for rna in RNAcontainer if len(rna.biorseoBGSUBayesPairC.predictions)],
[ rna.biorseoBGSUBayesPairD.avg_mcc for rna in RNAcontainer if len(rna.biorseoBGSUBayesPairD.predictions)]
]
RNAs_fully_predicted = [ x for x in RNAcontainer if x.has_complete_results(True)]
x_PK_fully = [
[ rna.biokop.max_mcc for rna in RNAs_fully_predicted],
[ rna.biokop.max_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoRawA.max_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoRawB.max_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoBayesPairA.max_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoBayesPairB.max_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoBayesPairC.max_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoBayesPairD.max_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoBGSUJAR3DA.max_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoBGSUJAR3DB.max_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoBGSUJAR3DC.max_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoBGSUJAR3DD.max_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoBGSUBayesPairA.max_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoBGSUBayesPairB.max_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoBGSUBayesPairC.max_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoBGSUBayesPairD.max_mcc for rna in RNAs_fully_predicted],
[ rna.biokop.avg_mcc for rna in RNAs_fully_predicted],
[ rna.biokop.avg_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoRawA.avg_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoRawB.avg_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoBayesPairA.avg_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoBayesPairB.avg_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoBayesPairC.avg_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoBayesPairD.avg_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoBGSUJAR3DA.avg_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoBGSUJAR3DB.avg_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoBGSUJAR3DC.avg_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoBGSUJAR3DD.avg_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoBGSUBayesPairA.avg_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoBGSUBayesPairB.avg_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoBGSUBayesPairC.avg_mcc for rna in RNAs_fully_predicted],
[ rna.biorseoBGSUBayesPairD.avg_mcc for rna in RNAs_fully_predicted],
] # We ensure having the same number of RNAs in every sample by discarding the one for which computations did not ended/succeeded.
print()
......@@ -1296,43 +1296,43 @@ print("Wilcoxon signed rank test with PK: H0 = 'The position parameter of Biokop
# ================= PLOTS OF RESULTS =======================================
merge = [ x_PK_fully[0], # Biokop
x_noPK_fully[0], # RNA subopt
x_noPK_fully[1], # RNA-MoIP
x_noPK_fully[2], x_PK_fully[2], #biorseoRawA
x_noPK_fully[3], x_PK_fully[3], #biorseoRawB
x_noPK_fully[4], x_PK_fully[4], #biorseoBayesPairA
x_noPK_fully[5], x_PK_fully[5], #biorseoBayesPairB
x_noPK_fully[6], x_PK_fully[6], #biorseoBayesPairC
x_noPK_fully[7], x_PK_fully[7], #biorseoBayesPairD
x_noPK_fully[8], x_PK_fully[8], #biorseoBGSUJAR3DA
x_noPK_fully[9], x_PK_fully[9], #biorseoBGSUJAR3DB
x_noPK_fully[10], x_PK_fully[10], #biorseoBGSUJAR3DC
x_noPK_fully[11], x_PK_fully[11], #biorseoBGSUJAR3DD
x_noPK_fully[12], x_PK_fully[12], #biorseoBGSUBayesPairA
x_noPK_fully[13], x_PK_fully[13], #biorseoBGSUBayesPairB
x_noPK_fully[14], x_PK_fully[14], #biorseoBGSUBayesPairC
x_noPK_fully[15], x_PK_fully[15], #biorseoBGSUBayesPairD
merge = [ x_noPK[0], # RNA subopt
x_noPK[1], # RNA-MoIP
x_PK[0], # Biokop
x_PK[2], #biorseoRawA
x_PK[3], #biorseoRawB
x_PK[4], #biorseoBayesPairA
x_PK[5], #biorseoBayesPairB
x_PK[6], #biorseoBayesPairC
x_PK[7], #biorseoBayesPairD
x_PK[8], #biorseoBGSUJAR3DA
x_PK[9], #biorseoBGSUJAR3DB
x_PK[10], #biorseoBGSUJAR3DC
x_PK[11], #biorseoBGSUJAR3DD
x_PK[12], #biorseoBGSUBayesPairA
x_PK[13], #biorseoBGSUBayesPairB
x_PK[14], #biorseoBGSUBayesPairC
x_PK[15], #biorseoBGSUBayesPairD
]
colors = [ 'green', 'blue', 'goldenrod',
'darkturquoise', 'darkturquoise',
'red', 'red',
'firebrick', 'firebrick',
'limegreen', 'limegreen',
'olive', 'olive',
'forestgreen', 'forestgreen',
'lime', 'lime',
'darkcyan', 'darkcyan',
'royalblue', 'royalblue',
'navy', 'navy',
'limegreen', 'limegreen',
'olive', 'olive',
'forestgreen', 'forestgreen',
'lime', 'lime'
colors = [ 'blue', 'goldenrod', 'green',
'red',
'firebrick',
'limegreen',
'olive',
'forestgreen',
'lime',
'darkcyan',
'royalblue',
'navy',
'limegreen',
'olive',
'forestgreen',
'lime'
]
labels = [ "Biokop", "RNAsubopt",
labels = [ "RNAsubopt",
"RNA-MoIP",
"Biokop",
"$f_{1A}$",
"$f_{1B}$",
"$f_{1A}$",
......@@ -1349,49 +1349,64 @@ labels = [ "Biokop", "RNAsubopt",
"$f_{1D}$"
]
ax = plt.subplot(211)
ax.tick_params(labelsize=12)
for y in [ i/10 for i in range(11) ]:
plt.axhline(y=y, color="grey", linestyle="--", linewidth=1)
colors = [ 'blue','goldenrod',
'red', 'firebrick','limegreen','olive', 'forestgreen', 'lime',
'darkturquoise', 'darkcyan', 'royalblue', 'navy', 'limegreen','olive', 'forestgreen', 'lime'
]
bplot = plt.boxplot(x_noPK_fully, vert=True, patch_artist=True, notch=False, whis=[3,97])
for patch, color in zip(bplot['boxes'], colors):
patch.set_facecolor(color)
# plt.axhline(y=0, color="black", linewidth=1)
# plt.axhline(y=1, color="black", linewidth=1)
plt.xticks([1.0+i for i in range(16)], labels[1:])
plt.ylim((0.5, 1.01))
plt.ylabel("MCC", fontsize=12)
plt.subplots_adjust(left=0.05, right=0.95)
# plt.title("Performance without pseudoknots (%d RNAs included)" % len(x_noPK_fully[0]))
ax = plt.subplot(212)
ax.tick_params(top=True, bottom=False, labeltop=True, labelbottom=False, labelsize=12)
ax.xaxis.set_label_position('top')
for y in [ i/10 for i in range(11) ]:
plt.axhline(y=y, color="grey", linestyle="--", linewidth=1)
colors = [ 'green','green',
'red', 'firebrick','limegreen','olive', 'forestgreen', 'lime',
'darkturquoise', 'darkcyan', 'royalblue', 'navy', 'limegreen','olive', 'forestgreen', 'lime'
]
labels = [ "Biokop"]
bplot = plt.boxplot(x_PK_fully, vert=True, patch_artist=True, notch=False, whis=[3,97])
for patch, color in zip(bplot['boxes'], colors):
patch.set_facecolor(color)
# for y in [ i/10 for i in range(11) ]:
# plt.axhline(y=y, color="grey", linestyle="--", linewidth=1)
# plt.axhline(y=0, color="black", linewidth=1)
# plt.axhline(y=1, color="black", linewidth=1)
plt.xticks([1.0+i for i in range(16)], labels)
plt.ylim((0.5, 1.01))
plt.ylabel("MCC", fontsize=12)
plt.subplots_adjust(left=0.05, right=0.95)
# plt.text(6.2,-0.3,"Performance with pseudoknots (%d RNAs included)" % len(x_PK_fully[0]), fontsize=12)
# bplot = plt.boxplot(merge, vert=True, patch_artist=True, notch=False, whis=[3,97])
# for patch, color in zip(bplot['boxes'], colors):
# patch.set_facecolor(color)
# plt.xticks([1.0+i for i in range(17)], labels)
# plt.ylim((-0.1, 1.01))
# plt.ylabel("MCC", fontsize=12)
# plt.subplots_adjust(left=0.05, right=0.95)
# # plt.title("Performance with pseudoknotted dataset (%d RNAs from Pseudobase++)" % len(merge[0]))
# plt.show()
# # Separating PK and non-PK
# ax = plt.subplot(211)
# ax.tick_params(labelsize=12)
# for y in [ i/10 for i in range(11) ]:
# plt.axhline(y=y, color="grey", linestyle="--", linewidth=1)
# colors = [ 'blue','goldenrod',
# 'red', 'firebrick','limegreen','olive', 'forestgreen', 'lime',
# 'darkturquoise', 'darkcyan', 'royalblue', 'navy', 'limegreen','olive', 'forestgreen', 'lime'
# ]
# bplot = plt.boxplot(x_noPK_fully, vert=True, patch_artist=True, notch=False, whis=[3,97])
# for patch, color in zip(bplot['boxes'], colors):
# patch.set_facecolor(color)
# # plt.axhline(y=0, color="black", linewidth=1)
# # plt.axhline(y=1, color="black", linewidth=1)
# plt.xticks([1.0+i for i in range(16)], labels[1:])
# plt.ylim((0.5, 1.01))
# plt.ylabel("MCC", fontsize=12)
# plt.subplots_adjust(left=0.05, right=0.95)
# # plt.title("Performance without pseudoknots (%d RNAs included)" % len(x_noPK_fully[0]))
# ax = plt.subplot(212)
# ax.tick_params(top=True, bottom=False, labeltop=True, labelbottom=False, labelsize=12)
# ax.xaxis.set_label_position('top')
# for y in [ i/10 for i in range(11) ]:
# plt.axhline(y=y, color="grey", linestyle="--", linewidth=1)
# colors = [ 'green','green',
# 'red', 'firebrick','limegreen','olive', 'forestgreen', 'lime',
# 'darkturquoise', 'darkcyan', 'royalblue', 'navy', 'limegreen','olive', 'forestgreen', 'lime'
# ]
# labels = [ "Biokop"]
# bplot = plt.boxplot(x_PK_fully, vert=True, patch_artist=True, notch=False, whis=[3,97])
# for patch, color in zip(bplot['boxes'], colors):
# patch.set_facecolor(color)
# # plt.axhline(y=0, color="black", linewidth=1)
# # plt.axhline(y=1, color="black", linewidth=1)
# plt.xticks([1.0+i for i in range(16)], labels)
# plt.ylim((0.4, 1.01))
# plt.ylabel("MCC", fontsize=12)
# plt.subplots_adjust(left=0.05, right=0.95)
# # plt.text(6.2,-0.3,"Performance with pseudoknots (%d RNAs included)" % len(x_PK_fully[0]), fontsize=12)
plt.show()
# plt.show()
# # ================== MCC performance ====================================
......@@ -1536,82 +1551,6 @@ plt.show()
# plt.show()
# # MCC boost compared to RNA subopt
# plt.subplot(143)
# x = [
# [ rna.rnamoip.max_mcc - rna.rnasubopt.max_mcc for rna in RNAcontainer if len(rna.rnamoip.predictions)],
# [ rna.biorseoRawA.max_mcc - rna.rnasubopt.max_mcc for rna in RNAcontainer if len(rna.biorseoRawA.predictions)],
# [ rna.biorseoRawB.max_mcc - rna.rnasubopt.max_mcc for rna in RNAcontainer if len(rna.biorseoRawB.predictions)],
# [ rna.biokop.max_mcc - rna.rnasubopt.max_mcc for rna in RNAcontainer if len(rna.biokop.predictions)],
#]
# colors = ['xkcd:goldenrod', 'xkcd:red', 'firebrick', 'limegreen']
# labels = ["$\Delta$MCC(RNAsubopt,RNA-MoIP)","$\Delta$MCC(RNAsubopt,RNA MoBOIP)",
# "$\Delta$MCC(RNAsubopt,RNA MoBOIP++)","$\Delta$MCC(RNAsubopt,Biokop)"]
# bplot = plt.boxplot(x, vert=False, patch_artist=True, notch=False, whis=[3,97])
# for patch, color in zip(bplot['boxes'], colors):
# patch.set_facecolor(color)
# plt.axvline(x=0, color="black", linewidth=1)
# plt.yticks([1.0+i for i in range(4)], labels)
# plt.xlim((-1.1, 1.1))
# plt.xlabel("Improvement in MCC")
# plt.title("MCC performance relatively to RNAsubopt")
# plt.show()
# plt.subplot(222)
# x = [
# [ rna.biorseoBGSUBayesPairA.max_mcc - rna.rnasubopt.max_mcc for rna in RNAcontainer if len(rna.biorseoBGSUBayesPairA.predictions)],
# [ rna.biorseoBGSUBayesPairB.max_mcc - rna.rnasubopt.max_mcc for rna in RNAcontainer if len(rna.biorseoBGSUBayesPairB.predictions)],
# [ rna.biorseoBGSUBayesPairC.max_mcc - rna.rnasubopt.max_mcc for rna in RNAcontainer if len(rna.biorseoBGSUBayesPairC.predictions)],
# [ rna.biorseoBGSUBayesPairD.max_mcc - rna.rnasubopt.max_mcc for rna in RNAcontainer if len(rna.biorseoBGSUBayesPairD.predictions)],
#]
# bplot = plt.boxplot(x, vert=False, patch_artist=True, notch=False, whis=[3,97])
# for patch, color in zip(bplot['boxes'], colors):
# patch.set_facecolor(color)
# plt.axvline(x=0, color="black", linewidth=1)
# plt.yticks([1.0+i for i in range(4)], labels)
# plt.xlim((-1.1, 1.1))
# # plt.xlabel("Improvement in MCC")
# plt.title("(B) The RNA Motif Atlas 3.2 + BayesPairing")
# plt.subplot(223)
# x = [
# [ rna.biorseoRawA.max_mcc - rna.rnasubopt.max_mcc for rna in RNAcontainer if len(rna.biorseoRawA.predictions)],
# [ rna.biorseoRawB.max_mcc - rna.rnasubopt.max_mcc for rna in RNAcontainer if len(rna.biorseoRawB.predictions)],
#]
# colors = ['red', 'firebrick']
# labels = ["$f_{1A}$", "$f_{1B}$"]
# bplot = plt.boxplot(x, vert=False, patch_artist=True, notch=False, whis=[3,97])
# for patch, color in zip(bplot['boxes'], colors):
# patch.set_facecolor(color)
# plt.axvline(x=0, color="black", linewidth=1)
# plt.yticks([1.0+i for i in range(2)], labels)
# plt.xlabel("Improvement in MCC")
# plt.xlim((-1.1, 1.1))
# plt.title("(C) Rna3Dmotifs + Simple pattern matching")
# plt.subplot(224)
# x = [
# [ rna.biorseoBGSUJAR3DA.max_mcc - rna.rnasubopt.max_mcc for rna in RNAcontainer if len(rna.biorseoBGSUJAR3DA.predictions)],
# [ rna.biorseoBGSUJAR3DB.max_mcc - rna.rnasubopt.max_mcc for rna in RNAcontainer if len(rna.biorseoBGSUJAR3DB.predictions)],
# [ rna.biorseoBGSUJAR3DC.max_mcc - rna.rnasubopt.max_mcc for rna in RNAcontainer if len(rna.biorseoBGSUJAR3DC.predictions)],
# [ rna.biorseoBGSUJAR3DD.max_mcc - rna.rnasubopt.max_mcc for rna in RNAcontainer if len(rna.biorseoBGSUJAR3DD.predictions)],
#]
# colors = ['darkturquoise', 'darkcyan', 'royalblue', 'navy']
# labels = ["$f_{1A}$", "$f_{1B}$", "$f_{1C}$", "$f_{1D}$"]
# bplot = plt.boxplot(x, vert=False, patch_artist=True, notch=False, whis=[3,97])
# for patch, color in zip(bplot['boxes'], colors):
# patch.set_facecolor(color)
# plt.axvline(x=0, color="black", linewidth=1)
# plt.yticks([1.0+i for i in range(4)], labels)
# plt.xlabel("Improvement in MCC")
# plt.xlim((-1.1, 1.1))
# plt.title("(D) The RNA Motif Atlas 3.2 + JAR3D")
# plt.show()
# # insertion ratio of the best structure
# plt.subplot(221)
# x = [
......