Merge branch 'master' of https://github.com/persalteas/biominserter

Louis BECQUEY
Commit 720602576982d5fa19ed03e99769ef4a153edb05 72060257 2 parents 2e2ec263 73734e9d
Showing 2 changed files with 123 additions and 120 deletions
biorseo.py
test_on_RNAstrand.py
--- a/biorseo.py
View file @7206025
+++ b/biorseo.py
View file @7206025
@@ -549,7 +549,7 @@ class BiorseoInstance:
             cmd = ["java", "-jar", jar3dexec, filename, ILmotifDir+"/all.txt",
                    loop.header[1:]+".ILloop.csv", loop.header[1:]+".ILseq.csv"]
         nowhere = open(devnull, 'w')
-         logfile = open("log_of_the_run.sh", 'a')
+         logfile = open(biorseoDir + "/log_of_the_run.sh", 'a')
         logfile.write(' '.join(cmd))
         logfile.write("\n")
         logfile.close()
@@ -789,7 +789,7 @@ class BiorseoInstance:
             if c == 0:
                 seq = l[:-1].upper()
                 if is_canonical_nts(seq):
-                     header = header.replace('/', '_').replace('\'','').replace('(','').replace(')','')
+                     header = header.replace('/', '_').replace('\'','').replace('(','').replace(')','').replace(' ','_')
                     RNAcontainer.append(RNA(header, seq))
                     if not path.isfile(self.outputf + header + ".fa"):
                         rna = open(self.outputf + header + ".fa", "w")
@@ -845,4 +845,4 @@ class BiorseoInstance:
             self.joblist.append(Job(command=command, priority=priority, timeout=3600, how_many_in_parallel=3))
             
 
- BiorseoInstance(opts)
+ BiorseoInstance(opts)
\ No newline at end of file
--- a/test_on_RNAstrand.py
View file @7206025
+++ b/test_on_RNAstrand.py
View file @7206025
@@ -9,7 +9,7 @@ from matplotlib import colors
 from math import sqrt
 from multiprocessing import Pool, cpu_count, Manager
 import multiprocessing
- import ast
+ import ast, time
 
 # ================== DEFINITION OF THE PATHS ==============================
 
@@ -77,9 +77,8 @@ class MyPool(multiprocessing.pool.Pool):
         kwargs['context'] = NoDaemonContext()
         super(MyPool, self).__init__(*args, **kwargs)
 
- exit()
- def execute_job(j):
 
+ def execute_job(j):
     if j.checkFunc_ is not None:
         if j.checkFunc_(*j.checkArgs_):
             running_stats[2] += 1
@@ -223,6 +222,7 @@ def launch_JAR3D_worker(loop):
     return insertion_sites
 
 def launch_JAR3D(seq_, basename):
+     time1 = time.time()
     rnasubopt_preds = []
     # Extracting probable loops from RNA-subopt structures
     rna = open(outputDir + basename + ".subopt", "r")
@@ -270,9 +270,10 @@ def launch_JAR3D(seq_, basename):
             positions.append("-,-")
         resultsfile.write(','.join(positions)+'\n')
     resultsfile.close()
+     time2 = time.time()
+     print("<%s | %.3fs" % (basename, time2-time1))
 
 def launch_BayesPairing(module_type, seq_, header_, basename):
-     chdir(bypdir)
 
     cmd = ["python3","parse_sequences.py","-seq",outputDir + basename + ".fa", "-d", module_type, "-interm","1"]
 
@@ -281,6 +282,7 @@ def launch_BayesPairing(module_type, seq_, header_, basename):
     logfile.write("\n")
     logfile.close()
 
+     chdir(bypdir)
     out = subprocess.check_output(cmd).decode('utf-8')
     BypLog = out.split('\n')
     idx = 0
@@ -1176,43 +1178,43 @@ for instance in RNAcontainer:
     instance.evaluate()
 
 x_PK = [
-     [ rna.biokop.avg_mcc for rna in RNAcontainer if len(rna.biokop.predictions)],
-     [ rna.biokop.avg_mcc for rna in RNAcontainer if len(rna.biokop.predictions)],
-     [ rna.biorseoRawA.avg_mcc for rna in RNAcontainer if len(rna.biorseoRawA.predictions)],
-     [ rna.biorseoRawB.avg_mcc for rna in RNAcontainer if len(rna.biorseoRawB.predictions)],
-     [ rna.biorseoBayesPairA.avg_mcc  for rna in RNAcontainer if len(rna.biorseoBayesPairA.predictions)],
-     [ rna.biorseoBayesPairB.avg_mcc  for rna in RNAcontainer if len(rna.biorseoBayesPairB.predictions)],
-     [ rna.biorseoBayesPairC.avg_mcc  for rna in RNAcontainer if len(rna.biorseoBayesPairC.predictions)],
-     [ rna.biorseoBayesPairD.avg_mcc  for rna in RNAcontainer if len(rna.biorseoBayesPairD.predictions)],
-     [ rna.biorseoBGSUJAR3DA.avg_mcc  for rna in RNAcontainer if len(rna.biorseoBGSUJAR3DA.predictions)],
-     [ rna.biorseoBGSUJAR3DB.avg_mcc  for rna in RNAcontainer if len(rna.biorseoBGSUJAR3DB.predictions)],
-     [ rna.biorseoBGSUJAR3DC.avg_mcc  for rna in RNAcontainer if len(rna.biorseoBGSUJAR3DC.predictions)],
-     [ rna.biorseoBGSUJAR3DD.avg_mcc  for rna in RNAcontainer if len(rna.biorseoBGSUJAR3DD.predictions)],
-     [ rna.biorseoBGSUBayesPairA.avg_mcc  for rna in RNAcontainer if len(rna.biorseoBGSUBayesPairA.predictions)],
-     [ rna.biorseoBGSUBayesPairB.avg_mcc  for rna in RNAcontainer if len(rna.biorseoBGSUBayesPairB.predictions)],
-     [ rna.biorseoBGSUBayesPairC.avg_mcc  for rna in RNAcontainer if len(rna.biorseoBGSUBayesPairC.predictions)],
-     [ rna.biorseoBGSUBayesPairD.avg_mcc  for rna in RNAcontainer if len(rna.biorseoBGSUBayesPairD.predictions)]
+     [ rna.biokop.max_mcc for rna in RNAcontainer if len(rna.biokop.predictions)],
+     [ rna.biokop.max_mcc for rna in RNAcontainer if len(rna.biokop.predictions)],
+     [ rna.biorseoRawA.max_mcc for rna in RNAcontainer if len(rna.biorseoRawA.predictions)],
+     [ rna.biorseoRawB.max_mcc for rna in RNAcontainer if len(rna.biorseoRawB.predictions)],
+     [ rna.biorseoBayesPairA.max_mcc  for rna in RNAcontainer if len(rna.biorseoBayesPairA.predictions)],
+     [ rna.biorseoBayesPairB.max_mcc  for rna in RNAcontainer if len(rna.biorseoBayesPairB.predictions)],
+     [ rna.biorseoBayesPairC.max_mcc  for rna in RNAcontainer if len(rna.biorseoBayesPairC.predictions)],
+     [ rna.biorseoBayesPairD.max_mcc  for rna in RNAcontainer if len(rna.biorseoBayesPairD.predictions)],
+     [ rna.biorseoBGSUJAR3DA.max_mcc  for rna in RNAcontainer if len(rna.biorseoBGSUJAR3DA.predictions)],
+     [ rna.biorseoBGSUJAR3DB.max_mcc  for rna in RNAcontainer if len(rna.biorseoBGSUJAR3DB.predictions)],
+     [ rna.biorseoBGSUJAR3DC.max_mcc  for rna in RNAcontainer if len(rna.biorseoBGSUJAR3DC.predictions)],
+     [ rna.biorseoBGSUJAR3DD.max_mcc  for rna in RNAcontainer if len(rna.biorseoBGSUJAR3DD.predictions)],
+     [ rna.biorseoBGSUBayesPairA.max_mcc  for rna in RNAcontainer if len(rna.biorseoBGSUBayesPairA.predictions)],
+     [ rna.biorseoBGSUBayesPairB.max_mcc  for rna in RNAcontainer if len(rna.biorseoBGSUBayesPairB.predictions)],
+     [ rna.biorseoBGSUBayesPairC.max_mcc  for rna in RNAcontainer if len(rna.biorseoBGSUBayesPairC.predictions)],
+     [ rna.biorseoBGSUBayesPairD.max_mcc  for rna in RNAcontainer if len(rna.biorseoBGSUBayesPairD.predictions)]
 ]
 
 RNAs_fully_predicted = [ x for x in RNAcontainer if x.has_complete_results(True)]
 
 x_PK_fully = [
-     [ rna.biokop.avg_mcc for rna in RNAs_fully_predicted],
-     [ rna.biokop.avg_mcc for rna in RNAs_fully_predicted],
-     [ rna.biorseoRawA.avg_mcc for rna in RNAs_fully_predicted],
-     [ rna.biorseoRawB.avg_mcc for rna in RNAs_fully_predicted],
-     [ rna.biorseoBayesPairA.avg_mcc  for rna in RNAs_fully_predicted],
-     [ rna.biorseoBayesPairB.avg_mcc  for rna in RNAs_fully_predicted],
-     [ rna.biorseoBayesPairC.avg_mcc  for rna in RNAs_fully_predicted],
-     [ rna.biorseoBayesPairD.avg_mcc  for rna in RNAs_fully_predicted],
-     [ rna.biorseoBGSUJAR3DA.avg_mcc  for rna in RNAs_fully_predicted],
-     [ rna.biorseoBGSUJAR3DB.avg_mcc  for rna in RNAs_fully_predicted],
-     [ rna.biorseoBGSUJAR3DC.avg_mcc  for rna in RNAs_fully_predicted],
-     [ rna.biorseoBGSUJAR3DD.avg_mcc  for rna in RNAs_fully_predicted],
-     [ rna.biorseoBGSUBayesPairA.avg_mcc  for rna in RNAs_fully_predicted],
-     [ rna.biorseoBGSUBayesPairB.avg_mcc  for rna in RNAs_fully_predicted],
-     [ rna.biorseoBGSUBayesPairC.avg_mcc  for rna in RNAs_fully_predicted],
-     [ rna.biorseoBGSUBayesPairD.avg_mcc  for rna in RNAs_fully_predicted],
+     [ rna.biokop.max_mcc for rna in RNAs_fully_predicted],
+     [ rna.biokop.max_mcc for rna in RNAs_fully_predicted],
+     [ rna.biorseoRawA.max_mcc for rna in RNAs_fully_predicted],
+     [ rna.biorseoRawB.max_mcc for rna in RNAs_fully_predicted],
+     [ rna.biorseoBayesPairA.max_mcc  for rna in RNAs_fully_predicted],
+     [ rna.biorseoBayesPairB.max_mcc  for rna in RNAs_fully_predicted],
+     [ rna.biorseoBayesPairC.max_mcc  for rna in RNAs_fully_predicted],
+     [ rna.biorseoBayesPairD.max_mcc  for rna in RNAs_fully_predicted],
+     [ rna.biorseoBGSUJAR3DA.max_mcc  for rna in RNAs_fully_predicted],
+     [ rna.biorseoBGSUJAR3DB.max_mcc  for rna in RNAs_fully_predicted],
+     [ rna.biorseoBGSUJAR3DC.max_mcc  for rna in RNAs_fully_predicted],
+     [ rna.biorseoBGSUJAR3DD.max_mcc  for rna in RNAs_fully_predicted],
+     [ rna.biorseoBGSUBayesPairA.max_mcc  for rna in RNAs_fully_predicted],
+     [ rna.biorseoBGSUBayesPairB.max_mcc  for rna in RNAs_fully_predicted],
+     [ rna.biorseoBGSUBayesPairC.max_mcc  for rna in RNAs_fully_predicted],
+     [ rna.biorseoBGSUBayesPairD.max_mcc  for rna in RNAs_fully_predicted],
 ]  # We ensure the same number of RNAs in every sample by discarding those for which computations did not finish or succeed.
 
 print()
@@ -1260,87 +1262,88 @@ test = stats.wilcoxon(x_PK_fully[0], x_PK_fully[11])
 print("Wilcoxon signed rank test with PK: H0 = 'The position parameter of Biokop and Jar3dD are equal', p-value = ", test.pvalue)
 
 
- # # ================== Print results for application cases =====================
- 
- # labels = ["Biokop","Biokop","RawA","RawB","BayesPairingA","BayesPairingB","BayesPairingC","BayesPairingD","JAR3DA","JAR3DB","JAR3DC","JAR3DD","BGSUBayesPairingA","BGSUBayesPairingB","BGSUBayesPairingC","BGSUBayesPairingD"]
- # print("RNAsubopt",":",x_noPK[0])
- # print("RNA-MOIP",":",x_noPK[1])
- # for data, name in zip(x_PK, labels):
- #     print(name,":",data)
- # labels = ["RNAsubopt","Biokop\t", "RNA-MoIP\t","RawA\t","RawB\t","BayesPairingA","BayesPairingB","BayesPairingC","BayesPairingD","JAR3DA\t","JAR3DB\t","JAR3DC\t","JAR3DD\t","BGSUBPairingA","BGSUBPairingB","BGSUBPairingC","BGSUBPairingD"]
- # for r in RNAcontainer:
- #     print("\n",r.header_,"\nTrue structure:\t", r.true2d)
- #     for m, name in zip([r.rnasubopt, r.biokop, r.rnamoip,
- #                    r.biorseoRawA, 
- #                    r.biorseoRawB,
- #                    r.biorseoBayesPairA, 
- #                    r.biorseoBayesPairB, 
- #                    r.biorseoBayesPairC, 
- #                    r.biorseoBayesPairD,
- #                    r.biorseoBGSUJAR3DA, 
- #                    r.biorseoBGSUJAR3DB, 
- #                    r.biorseoBGSUJAR3DC, 
- #                    r.biorseoBGSUJAR3DD,
- #                    r.biorseoBGSUBayesPairA, 
- #                    r.biorseoBGSUBayesPairB, 
- #                    r.biorseoBGSUBayesPairC, 
- #                    r.biorseoBGSUBayesPairD ], labels):
- #         print(name+":\t",m.best_pred)
- 
- # ================= PLOTS OF RESULTS =======================================
- 
- merge = [   x_noPK[0], # RNA subopt
-             x_noPK[1], # RNA-MoIP
-             x_PK[0], # Biokop
-             x_PK[2], #biorseoRawA
-             x_PK[3], #biorseoRawB
-             x_PK[4], #biorseoBayesPairA
-             x_PK[5], #biorseoBayesPairB
-             x_PK[6], #biorseoBayesPairC
-             x_PK[7], #biorseoBayesPairD
-             x_PK[8], #biorseoBGSUJAR3DA
-             x_PK[9], #biorseoBGSUJAR3DB
-             x_PK[10], #biorseoBGSUJAR3DC
-             x_PK[11], #biorseoBGSUJAR3DD
-             x_PK[12], #biorseoBGSUBayesPairA
-             x_PK[13], #biorseoBGSUBayesPairB
-             x_PK[14], #biorseoBGSUBayesPairC
-             x_PK[15], #biorseoBGSUBayesPairD
- ]
+ # ================== Print results for application cases =====================
+ 
+ labels = ["Biokop","Biokop","RawA","RawB","BayesPairingA","BayesPairingB","BayesPairingC","BayesPairingD","JAR3DA","JAR3DB","JAR3DC","JAR3DD","BGSUBayesPairingA","BGSUBayesPairingB","BGSUBayesPairingC","BGSUBayesPairingD"]
+ print("RNAsubopt",":",x_noPK[0])
+ print("RNA-MOIP",":",x_noPK[1])
+ for data, name in zip(x_PK, labels):
+     print(name,":",data)
+ labels = ["RNAsubopt","Biokop\t", "RNA-MoIP\t","RawA\t","RawB\t","BayesPairingA","BayesPairingB","BayesPairingC","BayesPairingD","JAR3DA\t","JAR3DB\t","JAR3DC\t","JAR3DD\t","BGSUBPairingA","BGSUBPairingB","BGSUBPairingC","BGSUBPairingD"]
+ for r in RNAcontainer:
+     print("\n",r.header_,"\nTrue structure:\t", r.true2d)
+     for m, name in zip([r.rnasubopt, r.biokop, r.rnamoip,
+                    r.biorseoRawA, 
+                    r.biorseoRawB,
+                    r.biorseoBayesPairA, 
+                    r.biorseoBayesPairB, 
+                    r.biorseoBayesPairC, 
+                    r.biorseoBayesPairD,
+                    r.biorseoBGSUJAR3DA, 
+                    r.biorseoBGSUJAR3DB, 
+                    r.biorseoBGSUJAR3DC, 
+                    r.biorseoBGSUJAR3DD,
+                    r.biorseoBGSUBayesPairA, 
+                    r.biorseoBGSUBayesPairB, 
+                    r.biorseoBGSUBayesPairC, 
+                    r.biorseoBGSUBayesPairD ], labels):
+         print(name+":\t",m.best_pred, "%.2f"% m.max_mcc, m.n_pred)
+ 
+ # # ================= PLOTS OF RESULTS =======================================
+ 
+ # merge = [   
+ #             x_PK[0], # Biokop
+ #             x_noPK[0], # RNA subopt
+ #             x_noPK[1], # RNA-MoIP
+ #             x_PK[2], #biorseoRawA
+ #             x_PK[3], #biorseoRawB
+ #             x_PK[4], #biorseoBayesPairA
+ #             x_PK[5], #biorseoBayesPairB
+ #             x_PK[6], #biorseoBayesPairC
+ #             x_PK[7], #biorseoBayesPairD
+ #             x_PK[8], #biorseoBGSUJAR3DA
+ #             x_PK[9], #biorseoBGSUJAR3DB
+ #             x_PK[10], #biorseoBGSUJAR3DC
+ #             x_PK[11], #biorseoBGSUJAR3DD
+ #             x_PK[12], #biorseoBGSUBayesPairA
+ #             x_PK[13], #biorseoBGSUBayesPairB
+ #             x_PK[14], #biorseoBGSUBayesPairC
+ #             x_PK[15], #biorseoBGSUBayesPairD
+ # ]
 
- colors = [  'blue', 'goldenrod', 'green', 
-             'red',
-             'firebrick', 
-             'limegreen', 
-             'olive', 
-             'forestgreen', 
-             'lime', 
-             'darkcyan', 
-             'royalblue',
-             'navy', 
-             'limegreen', 
-             'olive', 
-             'forestgreen', 
-             'lime'
- ]
- labels = [  "RNAsubopt",
-             "RNA-MoIP",
-             "Biokop", 
-             "$f_{1A}$",
-             "$f_{1B}$",
-             "$f_{1A}$",
-             "$f_{1B}$",
-             "$f_{1C}$",
-             "$f_{1D}$",
-             "$f_{1A}$",
-             "$f_{1B}$",
-             "$f_{1C}$",
-             "$f_{1D}$",
-             "$f_{1A}$",
-             "$f_{1B}$",
-             "$f_{1C}$",
-             "$f_{1D}$"        
- ]
+ # colors = [  'green', 'blue', 'goldenrod', 
+ #             'red',
+ #             'firebrick', 
+ #             'limegreen', 
+ #             'olive', 
+ #             'forestgreen', 
+ #             'lime', 
+ #             'darkturquoise',
+ #             'darkcyan', 
+ #             'royalblue',
+ #             'navy', 
+ #             'limegreen', 
+ #             'olive', 
+ #             'forestgreen', 
+ #             'lime'
+ # ]
+ # labels = [  "Biokop", "RNAsubopt",
+ #             "RNA-MoIP",            
+ #             "$f_{1A}$",
+ #             "$f_{1B}$",
+ #             "$f_{1A}$",
+ #             "$f_{1B}$",
+ #             "$f_{1C}$",
+ #             "$f_{1D}$",
+ #             "$f_{1A}$",
+ #             "$f_{1B}$",
+ #             "$f_{1C}$",
+ #             "$f_{1D}$",
+ #             "$f_{1A}$",
+ #             "$f_{1B}$",
+ #             "$f_{1C}$",
+ #             "$f_{1D}$"        
+ # ]
 
 
 # for y in [ i/10 for i in range(11) ]:
@@ -1371,7 +1374,7 @@ labels = [  "RNAsubopt",
 # # plt.axhline(y=0, color="black", linewidth=1)
 # # plt.axhline(y=1, color="black", linewidth=1)
 # plt.xticks([1.0+i for i in range(16)], labels[1:])
- # plt.ylim((0.5, 1.01))
+ # plt.ylim((0.4, 1.01))
 # plt.ylabel("MCC", fontsize=12)
 # plt.subplots_adjust(left=0.05, right=0.95)
 # # plt.title("Performance without pseudoknots (%d RNAs included)" % len(x_noPK_fully[0]))