With stats results

Louis BECQUEY
Commit 40cb5b7fbfcbea929792b7ca300b5978fb1b3c1e 40cb5b7f 1 parent 36f12c24
Showing 18 changed files with 366 additions and 328 deletions
.gitignore
RNAnet.py
data/jobstats.csv
data/statistics.csv
regression.py
results/2D.png
results/distances.png
results/figures/distances.png
results/figures/lengths.png
results/figures/pairings.png
results/frequencies.csv
results/lengths.png
results/mappings_list.csv
results/pairings.csv
results/pairings.png
results/realign_jobs_performance.png
results/regression.png
statistics.py
--- a/.gitignore
View file @40cb5b7
+++ b/.gitignore
View file @40cb5b7
 # execution outputs:
 nohup.out
- jobstats.csv
 log_of_the_run.sh
 
+ # results
+ results/figures/wadley_plots/
+ 
 # temporary results files
- data/*.npy
- data/*.npz
- data/olddata
+ data/
 
 # environment stuff
 .vscode/
--- a/RNAnet.py
View file @40cb5b7
+++ b/RNAnet.py
View file @40cb5b7
@@ -27,7 +27,6 @@ running_stats = m.list()
 running_stats.append(0) # n_launched
 running_stats.append(0) # n_finished
 running_stats.append(0) # n_skipped
- runDir = path.dirname(path.realpath(__file__))
 path_to_3D_data = "tobedefinedbyoptions"
 path_to_seq_data = "tobedefinedbyoptions"
 validsymb = '\U00002705'
@@ -40,6 +39,7 @@ KEEP_HETATM = False
 FILL_GAPS = True 
 HOMOLOGY = True
 USE_KNOWN_ISSUES = True
+ RUN_STATS = False
 
 class NtPortionSelector(object):
     """Class passed to MMCIFIO to select some chain portions in an MMCIF file.
@@ -119,17 +119,18 @@ class Chain:
 
     Chains accumulate information through this script, and are saved to files at the end of major steps."""
 
-     def __init__(self, pdb_id, pdb_model, pdb_chain_id, chain_label, rfam="", pdb_start=None, pdb_end=None):
+     def __init__(self, pdb_id, pdb_model, pdb_chain_id, chain_label, rfam="", inferred=False, pdb_start=None, pdb_end=None):
         self.pdb_id = pdb_id                    # PDB ID
         self.pdb_model = int(pdb_model)         # model ID, starting at 1
         self.pdb_chain_id = pdb_chain_id        # chain ID (mmCIF), multiple letters
         self.pdb_start = pdb_start              # if portion of chain, the start number (relative to the chain, not residue numbers)
         self.pdb_end = pdb_end                  # if portion of chain, the end number (relative to the chain, not residue numbers)
-         self.reversed = False                   # wether pdb_end > pdb_start in the Rfam mapping
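+         # NOTE(review): pdb_start and pdb_end default to None, and `None > None` raises TypeError in Python 3 -- confirm every caller passes both.
+         self.reversed = (pdb_start > pdb_end)   # whether pdb_start > pdb_end in the Rfam mapping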
         self.chain_label = chain_label          # chain pretty name 
         self.full_mmCIFpath = ""                # path to the source mmCIF structure
         self.file = ""                          # path to the 3D PDB file
         self.rfam_fam = rfam                    # mapping to an RNA family
+         self.inferred = inferred                # Whether this mapping has been inferred from BGSU's NR list
         self.seq = ""                           # sequence with modified nts
         self.aligned_seq = ""                   # sequence with modified nts replaced, but gaps can exist
         self.length = -1                        # length of the sequence (missing residues are not counted)
@@ -848,7 +849,7 @@ def execute_job(j, jobcount):
         print(f"[{running_stats[0]+running_stats[2]}/{jobcount}]\t{j.label}")
 
         # Add the command to logfile
-         logfile = open(runDir + "/log_of_the_run.sh", 'a')
+         logfile = open("log_of_the_run.sh", 'a')
         logfile.write(" ".join(j.cmd_))
         logfile.write("\n")
         logfile.close()
@@ -916,7 +917,7 @@ def execute_joblist(fulljoblist, printstats=False):
 
     if printstats:
         # Write statistics in a file (header here)
-         f = open("data/jobstats.csv", "w")
+         f = open(runDir + "/data/jobstats.csv", "w")
         f.write("label,comp_time,max_mem\n")
         f.close()
 
@@ -948,7 +949,7 @@ def execute_joblist(fulljoblist, printstats=False):
                 mems = [ r[1] for r in raw_results ]
 
                 # Write them to file
-                 f = open("data/jobstats.csv", "a")
+                 f = open(runDir + "/data/jobstats.csv", "a")
                 for j, t, m in zip(bunch, times, mems):
                     j.comp_time = t
                     j.max_mem = m
@@ -1426,11 +1427,13 @@ def infer_all_mappings(allmappings, codelist):
                 if len(m):
                     pdb_start = int(m.pdb_start)
                     pdb_end = int(m.pdb_end)
+                     inferred = False
                 else: # otherwise, use the inferred mapping
                     pdb_start = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_start)
                     pdb_end = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_end)
+                     inferred = True
                 chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}_{pdb_start}-{pdb_end}"
-                 newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, rfam=rfam, pdb_start=pdb_start, pdb_end=pdb_end))
+                 newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end))
     
     return newchains
 
@@ -1439,7 +1442,7 @@ if __name__ == "__main__":
     # Parse options
     try:
         opts, args = getopt.getopt( sys.argv[1:], 
-                                     "r:h", 
+                                     "r:hs", 
                                 [   "help", "resolution=", "keep-hetatm=", 
                                     "fill-gaps=", "3d-folder=", "seq-folder=", 
                                     "no-homology", "force-retry" ])
@@ -1458,7 +1461,8 @@ if __name__ == "__main__":
             print()
             print("-r 4.0 [ --resolution=4.0 ]\t(1.5 | 2.0 | 2.5 | 3.0 | 3.5 | 4.0 | 20.0)"
                     "\n\t\t\t\tMinimum 3D structure resolution to consider a RNA chain.")
-             print("--keep-hetatm=False\t\t\t(True | False) Keep ions, waters and ligands in produced mmCIF files. "
+             print("-s\t\t\t\tRun statistics computations after completion")
+             print("--keep-hetatm=False\t\t(True | False) Keep ions, waters and ligands in produced mmCIF files. "
                     "\n\t\t\t\tDoes not affect the descriptors.")
             print("--fill-gaps=True\t\t(True | False) Replace gaps in sequence due to unresolved residues"
                     "\n\t\t\t\tby the most common nucleotide at this position in the alignment.")
@@ -1481,6 +1485,8 @@ if __name__ == "__main__":
         elif opt == "-r" or opt == "--resolution":
             assert arg in ["1.5", "2.0", "2.5", "3.0", "3.5", "4.0", "20.0"]
             CRYSTAL_RES = arg
+         elif opt == "-s":
+             RUN_STATS = True
         elif opt=="--keep-hetatm":
             assert arg in [ "True", "False" ]
             KEEP_HETATM = (arg == "True")
@@ -1505,17 +1511,18 @@ if __name__ == "__main__":
     if path_to_3D_data == "tobedefinedbyoptions" or path_to_seq_data == "tobedefinedbyoptions":
         print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments")
         print("See RNANet.py --help for more information.")
-         
-         path_to_3D_data = "/home/lbecquey/Data/RNA/3D/"
-         path_to_seq_data = "/home/lbecquey/Data/RNA/sequences/"
-         print(f"\n[DEBUG]\tUsing hard-coded paths to data:\n\t\t{path_to_3D_data}\n\t\t{path_to_seq_data}\n")
-         # exit(1)
+         exit(1)
+ 
+     runDir = path.dirname(path.realpath(__file__))
+     os.makedirs(runDir + "/results", exist_ok=True)
 
     # ===========================================================================
     # List 3D chains with available Rfam mapping
     # ===========================================================================
 
-     # List all 3D RNA chains below 4Ang resolution
+     chains_database = pd.DataFrame(columns=['pdb_id', 'pdb_model', 'pdb_chain', 'rfam_fam', 'pdb_start', 'pdb_end', 'reversed', 'inferred', 'issue'])
+ 
+     # List all 3D RNA chains below given resolution
     full_structures_list = download_BGSU_NR_list()
 
     # Check for a list of known problems:
@@ -1528,6 +1535,13 @@ if __name__ == "__main__":
             print("\t> Ignoring known issues:")
             for x in known_issues:
                 print("\t  ", x)
+                 chains_database = chains_database.append(pd.DataFrame({ 'pdb_id':x.split('_')[0],
+                                                                         'pdb_model':x.split('_')[1],
+                                                                         'pdb_chain':x.split('_')[2],
+                                                                         'pdb_start':x.split('_')[3].split('-')[0],
+                                                                         'pdb_end':x.split('_')[3].split('-')[1],
+                                                                         'issue':True
+                                                                         }, index=[x]))
 
     all_chains = []
     if HOMOLOGY:
@@ -1559,6 +1573,11 @@ if __name__ == "__main__":
                 all_chains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label))
 
     del full_structures_list
+     chains_database = chains_database.append(pd.DataFrame.from_dict(
+                             {c.chain_label:[ c.pdb_id, c.pdb_model, c.pdb_chain_id, c.rfam_fam, c.pdb_start, c.pdb_end, c.reversed, c.inferred, False ] for c in all_chains},
+                             orient='index',
+                             columns=['pdb_id', 'pdb_model', 'pdb_chain', 'rfam_fam', 'pdb_start', 'pdb_end', 'reversed', 'inferred', 'issue'] ))
+     chains_database.to_csv(runDir + "/results/results_database.csv")
     n_chains = len(all_chains)
     print(">", validsymb, n_chains, "RNA chains of interest.")
 
@@ -1621,7 +1640,7 @@ if __name__ == "__main__":
         else:
             rfam_acc_to_download[c.rfam_fam].append(c)
             mappings_list[c.rfam_fam].append(c.chain_label)
-     pd.DataFrame.from_dict(mappings_list, orient='index').transpose().to_csv(path_to_seq_data + "realigned/mappings_list.csv")
+     pd.DataFrame.from_dict(mappings_list, orient='index').transpose().to_csv(runDir + "/results/mappings_list.csv")
     del mappings_list
     print(f"> Identified {len(rfam_acc_to_download.keys())} families to download and re-align with the crystals' sequences:")
 
@@ -1636,7 +1655,7 @@ if __name__ == "__main__":
         n_pdb = [ len(rfam_acc_to_download[f]) for f in fam_stats["rfam_acc"] ]
         fam_stats["n_pdb_seqs"] = n_pdb
         fam_stats["total_seqs"] = fam_stats["n_seq"] + fam_stats["n_pdb_seqs"]
-         fam_stats.to_csv(path_to_seq_data + "data/statistics.csv")
+         fam_stats.to_csv(runDir + "/data/statistics.csv")
         # print the stats
         for f in fam_list:
             line = fam_stats[fam_stats["rfam_acc"]==f]
@@ -1690,7 +1709,23 @@ if __name__ == "__main__":
     p.close()
     p.join()
 
-     print("Completed.")  # This part of the code is supposed to release some serotonin in the modeller's brain
+     # ==========================================================================================
+     # Post computation tasks
+     # ==========================================================================================
+ 
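+     # Archive the results
+     # NOTE(review): the archive below is written relative to the current working directory,
+     # while the symlink targets runDir; the two only agree when the script is launched from
+     # runDir -- confirm. `ln -s` also fails if the link already exists from a previous run.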
+     os.makedirs("results/archive", exist_ok=True)
+     time_str = time.strftime("%Y%m%d")
+     subprocess.run(["tar","-C", path_to_3D_data + "/datapoints","-czf",f"results/archive/RNANET_datapoints_{time_str}.tar.gz","."])
+     subprocess.run(['ln',"-s", runDir +f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", runDir + "/results/RNANET_datapoints_latest.tar.gz"])
+ 
+     # Run statistics
+     if RUN_STATS:
+         os.chdir(runDir)
+         subprocess.run(["python3", "regression.py"])
+         subprocess.run(["python3", "statistics.py", path_to_3D_data, path_to_seq_data])
+ 
+     print("Completed.")  # This part of the code is supposed to release some serotonin in the modeller's brain, do not remove
 
     # # so i can sleep for the end of the night
     # subprocess.run(["shutdown","now"]) 
--- a/data/jobstats.csv deleted 100644 → 0
View file @36f12c2
+++ b/data/jobstats.csv deleted 100644 → 0
View file @36f12c2
- label,comp_time,max_mem
- Realign RF00001 + 733 chains,347.5666003227234,783781888
- Realign RF00002 + 138 chains,15.574181318283081,710549504
- Realign RF00004 + 10 chains,331.88619780540466,2516779008
- Realign RF00005 + 869 chains,2349.9712748527527,6085918720
- Realign RF00008 + 9 chains,7.597636461257935,247132160
- Realign RF00009 + 4 chains,423.78941464424133,22123020288
- Realign RF00010 + 3 chains,167.0309178829193,5554601984
- Realign RF00011 + 4 chains,10.090157508850098,996966400
- Realign RF00013 + 1 chains,17.571903228759766,474783744
- Realign RF00015 + 6 chains,98.247323513031,1385431040
- Realign RF00017 + 10 chains,2218.9181191921234,13771866112
- Realign RF00020 + 17 chains,23.84590220451355,431484928
- Realign RF00023 + 7 chains,1196.2392709255219,7625351168
- Realign RF00026 + 20 chains,82.25747513771057,518791168
- Realign RF00028 + 5 chains,240.64744520187378,11369852928
- Realign RF00029 + 1 chains,62.898540019989014,898707456
- Realign RF00032 + 9 chains,7.049402236938477,162136064
- Realign RF00037 + 2 chains,0.27519845962524414,108863488
- Realign RF00050 + 6 chains,9.991205930709839,397705216
- Realign RF00059 + 24 chains,52.07490301132202,532307968
- Realign RF00061 + 1 chains,0.3395853042602539,233058304
- Realign RF00080 + 4 chains,19.957021474838257,1301696512
- Realign RF00100 + 6 chains,415.4162850379944,4435156992
- Realign RF00162 + 27 chains,16.753626108169556,408281088
- Realign RF00164 + 1 chains,0.05605888366699219,83927040
- Realign RF00167 + 48 chains,4.422192573547363,264232960
- Realign RF00168 + 16 chains,17.653642892837524,796184576
- Realign RF00169 + 11 chains,9.363726615905762,226705408
- Realign RF00174 + 2 chains,171.14065551757812,2648383488
- Realign RF00177 + 498 chains,2885.531806945801,45187723264
- Realign RF00233 + 2 chains,0.16314435005187988,138911744
- Realign RF00234 + 37 chains,10.552204132080078,1207234560
- Realign RF00250 + 1 chains,0.08687877655029297,83755008
- Realign RF00379 + 7 chains,27.468972206115723,655532032
- Realign RF00380 + 3 chains,2.397320508956909,245669888
- Realign RF00442 + 1 chains,2.0599684715270996,222887936
- Realign RF00458 + 7 chains,0.24766230583190918,197394432
- Realign RF00488 + 3 chains,1.4626531600952148,850460672
- Realign RF00504 + 18 chains,12.249290227890015,366731264
- Realign RF00505 + 1 chains,0.06069207191467285,83628032
- Realign RF01051 + 17 chains,7.672087907791138,297189376
- Realign RF01510 + 16 chains,0.0939493179321289,83746816
- Realign RF01689 + 4 chains,1.2797768115997314,400691200
- Realign RF01725 + 2 chains,2.976431369781494,294690816
- Realign RF01734 + 5 chains,1.8893005847930908,163631104
- Realign RF01739 + 4 chains,1.6384203433990479,271265792
- Realign RF01750 + 6 chains,8.268307209014893,421974016
- Realign RF01763 + 13 chains,0.5894784927368164,135094272
- Realign RF01786 + 2 chains,0.8764479160308838,182689792
- Realign RF01807 + 1 chains,0.19919967651367188,166801408
- Realign RF01826 + 1 chains,0.06825041770935059,83787776
- Realign RF01831 + 10 chains,2.0323476791381836,254255104
- Realign RF01846 + 2 chains,15.989834308624268,1073623040
- Realign RF01852 + 16 chains,4.523370265960693,249016320
- Realign RF01854 + 3 chains,8.060775518417358,647757824
- Realign RF01857 + 1 chains,3.9880683422088623,587083776
- Realign RF01960 + 140 chains,3388.5226855278015,56313212928
- Realign RF02001 + 26 chains,22.095701456069946,1335533568
- Realign RF02012 + 3 chains,10.277246713638306,796667904
- Realign RF02253 + 1 chains,0.2654685974121094,104386560
- Realign RF02348 + 2 chains,0.11346197128295898,82419712
- Realign RF02519 + 1 chains,0.039333343505859375,81330176
- Realign RF02540 + 67 chains,726.7017936706543,48769855488
- Realign RF02545 + 3 chains,0.451732873916626,513720320
- Realign RF02546 + 1 chains,0.3498055934906006,405676032
- Realign RF02553 + 1 chains,1.2360577583312988,281141248
- Realign RF02680 + 1 chains,0.09950971603393555,80687104
- Realign RF02683 + 1 chains,1.070310115814209,282808320
- Realign RF02796 + 6 chains,0.0940089225769043,81862656
--- a/data/statistics.csv deleted 100644 → 0
View file @36f12c2
+++ b/data/statistics.csv deleted 100644 → 0
View file @36f12c2
- rfam_acc,n_seq,maxlength,n_pdb_seqs,total_seqs
- RF00001,70460,345,733,71193
- RF00002,11746,289,138,11884
- RF00004,10251,342,10,10261
- RF00005,436080,293,869,436949
- RF00008,2383,132,9,2392
- RF00009,1217,1029,4,1221
- RF00010,6473,812,3,6476
- RF00011,787,436,4,791
- RF00013,3502,254,1,3503
- RF00015,5016,310,6,5022
- RF00017,3733,806,10,3743
- RF00020,4459,188,17,4476
- RF00023,6656,784,7,6663
- RF00026,23130,431,20,23150
- RF00028,2051,892,5,2056
- RF00029,8804,341,1,8805
- RF00032,16724,88,9,16733
- RF00037,1607,56,2,1609
- RF00050,3746,347,6,3752
- RF00059,9846,255,24,9870
- RF00061,80,261,1,81
- RF00080,788,241,4,792
- RF00100,7822,636,6,7828
- RF00162,4049,375,27,4076
- RF00164,63,43,1,64
- RF00167,1765,156,48,1813
- RF00168,1889,334,16,1905
- RF00169,6295,121,11,6306
- RF00174,9480,476,2,9482
- RF00177,25969,3531,498,26467
- RF00233,49,87,2,51
- RF00234,930,380,37,967
- RF00250,63,60,1,64
- RF00379,2637,324,7,2644
- RF00380,921,282,3,924
- RF00442,770,226,1,771
- RF00458,16,215,7,23
- RF00488,40,824,3,43
- RF00504,3582,249,18,3600
- RF00505,21,65,1,22
- RF01051,3217,270,17,3234
- RF01510,5,63,16,21
- RF01689,344,215,4,348
- RF01725,767,158,2,769
- RF01734,1748,159,5,1753
- RF01739,761,273,4,765
- RF01750,1513,203,6,1519
- RF01763,640,82,13,653
- RF01786,496,122,2,498
- RF01807,12,218,1,13
- RF01826,14,93,1,15
- RF01831,614,249,10,624
- RF01846,616,537,2,618
- RF01852,4469,112,16,4485
- RF01854,1707,302,3,1710
- RF01857,442,343,1,443
- RF01960,27108,5325,140,27248
- RF02001,2268,340,26,2294
- RF02012,838,191,3,841
- RF02253,677,63,1,678
- RF02348,77,105,2,79
- RF02519,6,33,1,7
- RF02540,34679,9019,67,34746
- RF02541,35613,8885,689,36302
- RF02543,38161,11046,147,38308
- RF02545,16,628,3,19
- RF02546,18,572,1,19
- RF02553,116,188,1,117
- RF02680,34,103,1,35
- RF02683,229,187,1,230
- RF02796,13,70,6,19
--- a/regression.py
View file @40cb5b7
+++ b/regression.py
View file @40cb5b7
- #!/usr/bin/python3
+ #!/usr/bin/python3.8
 # This file is supposed to propose regression models on the computation time and mem usage of the re-alignment jobs.
 # Light jobs are monitored by the Monitor class in RNAnet.py, and the measures are saved in jobstats.csv.
 # This was done to guess the amount of memory required to re-align the large ribosomal subunit families RF02541 and RF02543.
- # INFO: Our home hardware was a 24-core VM with 50GB RAM + 8GB Swap.
+ # INFO: Our home hardware was a 32-core VM with 50GB RAM + 8GB Swap.
 
 import matplotlib.pyplot as plt
 import pandas as pd
 import numpy as np
- import scipy
+ import scipy, os
 from sklearn.linear_model import LinearRegression
 from mpl_toolkits.mplot3d import Axes3D
 
@@ -31,105 +31,109 @@ for index, fam in jobstats.iterrows():
         maxlengths.append(
             families.loc[families["rfam_acc"] == rfam_acc, "maxlength"].values[0])
 
- nchains = [x/1000 for x in nchains]  # compte en milliers de séquences
 comptimes = [x/3600 for x in comptimes]  # in hours
 maxlengths = [x/1000 for x in maxlengths]  # in kB
 maxmem = [x/1024/1024 for x in maxmem]  # in MB
 
 summary = pd.DataFrame({"family": computed_families, "n_chains": nchains,
-                         "max_length": maxlengths, "comp_time": comptimes, "max_mem": maxmem})
- summary.sort_values("max_length", inplace=True)
- summary.to_csv("summary.csv")
+                         "max_length(kB)": maxlengths, "comp_time(h)": comptimes, "max_mem(MB)": maxmem})
+ summary.sort_values("max_length(kB)", inplace=True)
+ summary.to_csv("results/summary.csv")
 
 # ========================================================
 # Plot the data
 # ========================================================
 
- fig = plt.figure(dpi=100)
+ fig = plt.figure(figsize=(12,8), dpi=100)
+ 
 plt.subplot(231)
- plt.scatter(summary.n_chains, summary.max_mem)
- plt.xlabel("Number of sequences (x1000 seqs)")
+ plt.scatter(summary.n_chains, summary["max_mem(MB)"])
+ plt.xlabel("Number of sequences")
 plt.ylabel("Peak memory (MB)")
+ 
 plt.subplot(232)
- plt.scatter(summary.max_length, summary.max_mem)
+ plt.scatter(summary["max_length(kB)"], summary["max_mem(MB)"])
 plt.xlabel("Maximum length of sequences (kB)")
 plt.ylabel("Peak memory (MB)")
+ 
 ax = fig.add_subplot(233, projection='3d')
- ax.scatter(summary.n_chains, summary.max_length, summary.max_mem)
- ax.set_xlabel("Number of sequences (x1000 seqs)")
+ ax.scatter(summary.n_chains, summary["max_length(kB)"], summary["max_mem(MB)"])
+ ax.set_xlabel("Number of sequences")
 ax.set_ylabel("Maximum length of sequences (kB)")
 ax.set_zlabel("Peak memory (MB)")
+ 
 plt.subplot(234)
- plt.scatter(summary.n_chains, summary.comp_time)
- plt.xlabel("Number of sequences (x1000 seqs)")
+ plt.scatter(summary.n_chains, summary["comp_time(h)"])
+ plt.xlabel("Number of sequences")
 plt.ylabel("Computation time (h)")
+ 
 plt.subplot(235)
- plt.scatter(summary.max_length, summary.comp_time)
+ plt.scatter(summary["max_length(kB)"], summary["comp_time(h)"])
 plt.xlabel("Maximum length of sequences (kB)")
 plt.ylabel("Computation time (h)")
+ 
 ax = fig.add_subplot(236, projection='3d')
- ax.scatter(summary.n_chains, summary.max_length, summary.comp_time)
- ax.set_xlabel("Number of sequences (x1000 seqs)")
+ ax.scatter(summary.n_chains, summary["max_length(kB)"], summary["comp_time(h)"])
+ ax.set_xlabel("Number of sequences")
 ax.set_ylabel("Maximum length of sequences (kB)")
 ax.set_zlabel("Computation time (h)")
- plt.show()
- 
- # ========================================================
- # Linear Regression of max_mem as function of max_length
- # ========================================================
 
- # With scikit-learn
- model = LinearRegression(normalize=True, n_jobs=-1)
- model.fit(np.array(summary.max_length).reshape(-1, 1), summary.max_mem)
- b0 = model.intercept_
- b1 = model.coef_[0]
- print(f"peak_mem = {b0:.0f} + {b1:.0f} * max_length")
- 
- # with scipy
- coeffs = scipy.optimize.curve_fit(lambda t, B0, B1: B0+np.exp(B1*t),
-                                   np.array(summary.max_length[:-3]), np.array(summary.max_mem[:-3]))[0]
- print(f"peak_mem = {coeffs[0]:.0f} + e^({coeffs[1]:.0f} * max_length)")
- coeffs_log = scipy.optimize.curve_fit(lambda t, B0, B1: B0+B1*np.log(t),
-                                       np.array(summary.max_length), np.array(summary.max_mem), p0=(400, 12000))[0]
- print(
-     f"peak_mem = {coeffs_log[0]:.0f} + {coeffs_log[1]:.0f} * log(max_length)")
- 
- # Re-plot
- x = np.linspace(0, 10, 1000)
- plt.figure()
- plt.scatter(summary.max_length, summary.max_mem)
- plt.xlabel("Maximum length of sequences (kB)")
- plt.ylabel("Peak memory (MB)")
- plt.plot(x, b0 + b1*x, "-r", label="linear fit")
- plt.plot(x, coeffs[0] + np.exp(coeffs[1]*x), "-g", label="expo fit on [:-3]")
- plt.plot(x, coeffs_log[0] + coeffs_log[1]*np.log(x), "-b", label="log fit")
- plt.ylim(0, 60000)
- plt.legend()
- plt.show()
- 
- print("Estimated mem required to compute RF02543 and its 11kB sequences:",
-       model.predict(np.array([11]).reshape(-1, 1)))
- 
- # ========================================================
- # Linear Regression of comp_time as function of n_chains
- # ========================================================
- 
- # With scikit-learn
- model = LinearRegression(normalize=True, n_jobs=-1)
- model.fit(np.array(summary.n_chains).reshape(-1, 1), summary.comp_time)
- b0 = model.intercept_
- b1 = model.coef_[0]
- print(f"comp_time = {b0:.3f} + {b1:.3f} * n_chains")
- print("Estimated computation time required for RF02543 and its 38k sequences:",
-       model.predict(np.array([38]).reshape(-1, 1)))
- 
- # Re-plot
- x = np.linspace(0, 500, 1000)
- plt.figure()
- plt.scatter(summary.n_chains, summary.comp_time)
- plt.xlabel("Number of sequences (x1000)")
- plt.ylabel("Computation time (h)")
- plt.plot(x, b0 + b1*x, "-r", label="linear fit")
- plt.ylim(0, 10)
- plt.legend()
- plt.show()
+ plt.subplots_adjust(wspace=0.4)
+ plt.savefig("results/realign_jobs_performance.png")
+ 
+ # # ========================================================
+ # # Linear Regression of max_mem as function of max_length
+ # # ========================================================
+ 
+ # # With scikit-learn
+ # model = LinearRegression(normalize=True, n_jobs=-1)
+ # model.fit(summary["max_length(kB)"].values.reshape(-1, 1), summary["max_mem(MB)"])
+ # b0 = model.intercept_
+ # b1 = model.coef_[0]
+ # print(f"peak_mem = {b0:.0f} + {b1:.0f} * max_length")
+ 
+ # # with scipy
+ # coeffs = scipy.optimize.curve_fit(  lambda t, B0, B1: B0+np.exp(B1*t), 
+ #                                     summary["max_length(kB)"].values, 
+ #                                     summary["max_mem(MB)"].values
+ #                                  )[0]
+ # print(f"peak_mem = {coeffs[0]:.0f} + e^({coeffs[1]:.0f} * max_length)")
+ # coeffs_log = scipy.optimize.curve_fit(  lambda t, B0, B1: B0+B1*np.log(t),
+ #                                         summary["max_length(kB)"].values, 
+ #                                         summary["max_mem(MB)"].values,
+ #                                         p0=(400, 12000)
+ #                                      )[0]
+ # print(f"peak_mem = {coeffs_log[0]:.0f} + {coeffs_log[1]:.0f} * log(max_length)")
+ 
+ # # Re-plot
+ # x = np.linspace(0, 10, 1000)
+ # plt.figure()
+ # plt.scatter(summary["max_length(kB)"], summary["max_mem(MB)"])
+ # plt.xlabel("Maximum length of sequences (kB)")
+ # plt.ylabel("Peak memory (MB)")
+ # plt.plot(x, b0 + b1*x, "-r", label="linear fit")
+ # plt.plot(x, coeffs[0] + np.exp(coeffs[1]*x), "-g", label="expo fit")
+ # plt.plot(x, coeffs_log[0] + coeffs_log[1]*np.log(x), "-b", label="log fit")
+ # plt.legend()
+ # plt.savefig("results/regression/memory_linear_model.png")
+ 
+ # # ========================================================
+ # # Linear Regression of comp_time as function of n_chains
+ # # ========================================================
+ 
+ # # With scikit-learn
+ # model = LinearRegression(normalize=True, n_jobs=-1)
+ # model.fit(summary.n_chains.values.reshape(-1, 1), summary["comp_time(h)"])
+ # b0 = model.intercept_
+ # b1 = model.coef_[0]
+ # print(f"comp_time = {b0:.3f} + {b1:.3f} * n_chains")
+ 
+ # # Re-plot
+ # x = np.linspace(0, 500000, 1000)
+ # plt.figure()
+ # plt.scatter(summary.n_chains, summary["comp_time(h)"])
+ # plt.xlabel("Number of sequences")
+ # plt.ylabel("Computation time (h)")
+ # plt.plot(x, b0 + b1*x, "-r", label="linear fit")
+ # plt.legend()
+ # plt.savefig("results/regression/comp_time_linear_model.png")
--- a/results/2D.png deleted 100644 → 0
View file @36f12c2
+++ b/results/2D.png deleted 100644 → 0
View file @36f12c2
--- a/results/distances.png 0 → 100644
View file @40cb5b7
+++ b/results/distances.png 0 → 100644
View file @40cb5b7
--- a/results/figures/distances.png 0 → 100644
View file @40cb5b7
+++ b/results/figures/distances.png 0 → 100644
View file @40cb5b7
--- a/results/figures/lengths.png 0 → 100644
View file @40cb5b7
+++ b/results/figures/lengths.png 0 → 100644
View file @40cb5b7
--- a/results/figures/pairings.png 0 → 100644
View file @40cb5b7
+++ b/results/figures/pairings.png 0 → 100644
View file @40cb5b7
--- a/results/frequencies.csv 0 → 100644
View file @40cb5b7
+++ b/results/frequencies.csv 0 → 100644
View file @40cb5b7
+ ,G,C,A,U,-,A2M,OMU,OMG,OMC,7MG,PSU,5MU,4SU,MIA,H2U,U8U,T6A,DJF,6MZ,CM0,5MC,2MG,1MA,YYG,M2G,2MA,QUO,G7M,4OC,YG,AET,2MU,12A,70U,6IA,1MG,GTP,574,I,RSP,RIA,3AU,AG9,ANZ,1RN,N79,365,UBD,9QV,CCC,IU,MA6,UR3,A3P,A23,23G,N,GDP,CBV,4AC,M7A,E3C,B8Q,B8N,C4J,M1Y,JMH,3TD,B9B,E7G,B9H,P7G,I4U,B8H,P4U,B8W,P5P,Y5P,B8T,B8K,E6G,BGH,MHG
+ RF00001,33.99%,29.98%,20.01%,16.01%,0.01%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00002,26.80%,23.51%,27.36%,21.86%,0.43%,0.01%,0.02%,<.01%,<.01%,<.01%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00004,18.12%,16.77%,23.33%,25.90%,15.82%,0,0,0,0,0,0.06%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00005,31.37%,27.32%,19.93%,17.61%,1.23%,0,<.01%,0.03%,0.07%,0.18%,0.73%,0.41%,0.33%,0.15%,0.20%,0.02%,0.02%,<.01%,0.02%,0.02%,0.14%,0.02%,0.02%,<.01%,0.02%,<.01%,0.02%,0.02%,0.01%,0.01%,<.01%,<.01%,<.01%,<.01%,<.01%,0.02%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00008,31.25%,26.35%,24.16%,18.24%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00009,31.11%,26.48%,20.69%,21.71%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00010,35.64%,29.65%,17.52%,11.12%,6.07%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00011,21.41%,15.95%,17.10%,11.65%,33.89%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00013,25.23%,24.32%,21.62%,19.82%,9.01%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00015,18.15%,14.11%,19.30%,23.34%,25.10%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00017,32.85%,24.43%,19.37%,14.49%,8.73%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.13%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00020,16.76%,19.36%,20.57%,30.63%,12.69%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00023,31.22%,22.68%,11.46%,16.10%,16.59%,0,0,0,0,0,0.98%,0.98%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00026,18.40%,16.77%,25.32%,26.02%,13.45%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.02%,0.02%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00028,27.66%,20.61%,28.66%,22.05%,1.02%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00029,32.73%,21.82%,26.91%,18.55%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00032,17.00%,40.32%,22.92%,19.76%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00037,23.33%,20.00%,23.33%,31.67%,1.67%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00050,28.57%,16.07%,27.68%,23.21%,2.68%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.89%,0,0,0,0,0,0,0,0,0,0,0,0,0.89%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00059,31.16%,23.60%,22.54%,20.11%,2.17%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.21%,0,0,0,0,0,0,0,0,0,0,0,0,0.21%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00061,26.51%,23.06%,15.52%,14.87%,20.04%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00080,28.00%,17.41%,31.06%,22.00%,1.53%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00100,26.23%,24.59%,9.84%,21.31%,18.03%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00162,31.84%,23.64%,29.81%,14.68%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.03%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00164,34.88%,23.26%,25.58%,16.28%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00167,23.89%,22.76%,26.40%,26.79%,0.06%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.03%,0.06%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00168,39.10%,26.42%,19.10%,15.37%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00169,36.50%,31.99%,22.19%,9.32%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00174,35.38%,25.15%,19.59%,12.28%,7.60%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00177,33.24%,24.72%,22.68%,16.89%,2.29%,0,0,<.01%,0,0.01%,0.02%,<.01%,<.01%,0,<.01%,0,0,0,0,0,0.05%,0.02%,0,0,0.01%,0,0,<.01%,0.01%,0,0,0,0,0,<.01%,<.01%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.03%,0.01%,<.01%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00233,28.21%,29.49%,21.79%,20.51%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00234,29.82%,20.65%,23.80%,24.45%,0.65%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.49%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.14%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00250,17.65%,29.41%,35.29%,17.65%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00379,28.42%,24.27%,24.32%,19.37%,3.57%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.05%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00380,24.26%,21.94%,27.64%,24.47%,1.69%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00382,36.40%,26.00%,20.97%,16.63%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00390,13.04%,17.39%,30.43%,39.13%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00442,30.34%,21.35%,28.09%,19.10%,1.12%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00458,18.53%,16.06%,28.60%,30.36%,6.44%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00488,18.10%,13.22%,18.91%,26.63%,22.54%,0,0,0,0,0,0.06%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.53%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00504,30.89%,21.55%,30.64%,14.69%,0.54%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.08%,0,0,0,0,0,0,0,0.60%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF00505,28.33%,28.33%,11.67%,26.67%,5.00%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF01051,28.75%,25.58%,26.41%,13.07%,6.19%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF01357,32.00%,24.00%,20.00%,16.00%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8.00%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF01510,21.88%,24.22%,28.12%,25.78%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF01689,25.81%,22.04%,31.72%,20.43%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF01725,37.91%,29.67%,25.27%,7.14%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF01734,31.37%,31.37%,21.57%,15.69%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF01739,32.79%,27.87%,24.59%,14.75%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF01750,32.13%,23.91%,23.19%,15.22%,5.56%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF01763,38.27%,29.64%,18.76%,6.94%,2.25%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.13%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF01786,27.03%,17.57%,27.03%,27.03%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.35%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF01807,27.17%,23.91%,26.63%,22.28%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF01826,19.23%,15.38%,32.69%,23.08%,7.69%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.92%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF01831,33.06%,19.76%,25.41%,21.77%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF01846,24.93%,22.72%,15.88%,19.09%,17.37%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF01852,28.27%,21.04%,27.74%,22.33%,0.55%,0,0,0,0,0,0.02%,0.02%,<.01%,0,0.02%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.01%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF01854,33.22%,28.24%,20.27%,18.27%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF01857,37.74%,29.53%,18.80%,13.93%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF01960,24.97%,19.86%,24.33%,24.49%,6.22%,0.02%,<.01%,0.01%,0.01%,<.01%,0.02%,<.01%,0,0,0,0,0,0,<.01%,0,<.01%,<.01%,0,0,0,0,0,0,<.01%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,<.01%,<.01%,0,0,0,0.01%,0,0,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF01998,32.10%,21.36%,27.78%,17.72%,1.05%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF02001,26.78%,17.17%,32.96%,21.51%,1.58%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF02012,29.11%,22.15%,23.42%,24.89%,0.42%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF02253,20.69%,24.14%,27.59%,27.59%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF02348,21.52%,16.46%,36.71%,25.32%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF02519,23.53%,14.71%,29.41%,29.41%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.94%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF02540,29.03%,23.70%,24.55%,17.33%,5.28%,0,0.02%,0.02%,0,0,0.02%,0,0,0,0,0,0,0,0,0,0,0,0.02%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.02%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF02541,33.08%,24.10%,23.09%,16.25%,3.38%,0,<.01%,<.01%,<.01%,<.01%,0.04%,0.01%,0,0,<.01%,0,0,0,<.01%,0,0.01%,<.01%,0,0,0,<.01%,0,<.01%,<.01%,0,0,<.01%,0,0,0,<.01%,<.01%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,<.01%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF02543,25.03%,18.64%,20.96%,18.86%,16.44%,0.01%,<.01%,0.02%,<.01%,<.01%,<.01%,<.01%,0,0,<.01%,0,0,0,<.01%,0,<.01%,<.01%,<.01%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,<.01%,0,0,0,0,0,0,0,<.01%,0,<.01%,0,0,0,<.01%,0,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%
+ RF02545,9.88%,4.94%,35.83%,38.95%,10.39%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF02546,2.40%,1.07%,16.80%,11.73%,68.00%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF02553,32.50%,22.50%,20.00%,23.75%,1.25%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF02680,28.71%,29.70%,19.80%,18.81%,1.98%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.99%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF02683,31.40%,24.42%,29.07%,13.95%,1.16%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ RF02796,33.33%,36.84%,17.54%,12.28%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
--- a/results/lengths.png 0 → 100644
View file @40cb5b7
+++ b/results/lengths.png 0 → 100644
View file @40cb5b7
--- a/results/mappings_list.csv 0 → 100644
View file @40cb5b7
+++ b/results/mappings_list.csv 0 → 100644
View file @40cb5b7
--- a/results/pairings.csv 0 → 100644
View file @40cb5b7
+++ b/results/pairings.csv 0 → 100644
View file @40cb5b7
+ ,cWW,tSH,tWH,tHS,other,tWW,tSS,tHW,cSH,cSW,cSS,tSW,cWH,cWS,tWS,tHH,cHW,cHH,cHS
+ RF00001,61.87%,4.31%,3.21%,1.98%,3.33%,0.42%,0.97%,2.64%,5.30%,5.61%,0.11%,4.14%,0.61%,3.04%,0.93%,0.53%,0.89%,<.01%,0.10%
+ RF00002,62.36%,5.36%,2.71%,6.11%,1.72%,2.25%,1.23%,2.54%,1.87%,4.10%,0.63%,1.50%,1.14%,0.68%,0.57%,3.20%,1.38%,0.59%,0.05%
+ RF00004,85.28%,3.30%,5.23%,0.96%,0.69%,0.14%,0 %,0 %,0.28%,0.28%,0 %,0.69%,0.55%,0 %,0 %,0 %,0.28%,0.28%,2.06%
+ RF00005,70.47%,0.91%,6.92%,0.09%,1.74%,3.56%,0.08%,3.29%,0.53%,0.52%,0.22%,1.75%,1.24%,2.00%,2.31%,1.71%,0.65%,0.48%,1.53%
+ RF00008,64.74%,4.62%,8.09%,2.89%,1.16%,0 %,0 %,0 %,1.16%,5.20%,0 %,1.16%,0.58%,4.05%,4.62%,1.73%,0 %,0 %,0 %
+ RF00009,81.68%,0.58%,2.53%,0.58%,0.97%,0 %,0.39%,1.36%,1.17%,2.73%,0.97%,2.34%,0.58%,0.78%,0.78%,0 %,1.36%,0.39%,0.78%
+ RF00010,69.24%,2.58%,4.60%,0.37%,3.31%,0.55%,1.29%,0.92%,2.03%,2.76%,2.39%,2.76%,0.18%,1.84%,1.66%,0.55%,2.21%,0 %,0.74%
+ RF00011,64.71%,4.50%,4.50%,1.04%,3.46%,2.08%,2.42%,2.77%,3.11%,1.04%,1.38%,2.08%,2.08%,1.04%,1.04%,1.04%,1.73%,0 %,0 %
+ RF00013,89.66%,3.45%,0 %,0 %,3.45%,3.45%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
+ RF00015,86.76%,4.18%,0.70%,3.48%,0.70%,0 %,0 %,0 %,0.70%,0.35%,0 %,1.74%,0.35%,0 %,0 %,0.35%,0 %,0.70%,0 %
+ RF00017,75.15%,2.90%,3.05%,0.76%,3.35%,2.74%,0.46%,1.68%,1.07%,0.30%,2.13%,2.59%,1.68%,0.30%,0 %,0 %,0.91%,0.91%,0 %
+ RF00020,88.26%,0.73%,2.39%,0.37%,0.55%,0.73%,0 %,0 %,0.73%,1.10%,1.28%,1.10%,0.37%,1.28%,0 %,0 %,0.73%,0 %,0.37%
+ RF00023,73.83%,1.87%,12.15%,0.93%,1.87%,0.93%,0 %,0.93%,0 %,1.87%,0 %,0 %,0 %,1.87%,3.74%,0 %,0 %,0 %,0 %
+ RF00026,81.41%,3.66%,6.15%,1.17%,0.44%,1.17%,0 %,0 %,0.29%,0.44%,0.15%,1.02%,0.29%,0.29%,0.44%,0.15%,0.15%,0.29%,2.49%
+ RF00028,65.73%,2.86%,2.64%,3.83%,2.16%,1.62%,2.91%,2.05%,3.12%,1.29%,1.94%,0.38%,1.67%,0.54%,1.45%,0.22%,4.58%,0.86%,0.16%
+ RF00029,80.70%,6.14%,0 %,0 %,0 %,3.51%,0 %,3.51%,0 %,0.88%,0 %,0 %,0.88%,0.88%,0 %,0 %,0.88%,0 %,2.63%
+ RF00032,100.00%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
+ RF00037,100.00%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
+ RF00050,68.39%,3.87%,7.74%,3.87%,2.26%,0.32%,5.48%,0 %,0 %,0 %,5.81%,0 %,0 %,0.32%,0 %,0 %,1.94%,0 %,0 %
+ RF00059,60.28%,1.50%,4.97%,3.70%,2.54%,1.85%,5.31%,0 %,0 %,0 %,7.16%,4.97%,4.50%,0.35%,0.12%,1.85%,0.23%,0.69%,0 %
+ RF00061,77.86%,3.05%,2.29%,2.29%,0 %,2.29%,0 %,1.53%,2.29%,0 %,0 %,0.76%,0.76%,2.29%,0 %,1.53%,2.29%,0 %,0.76%
+ RF00080,84.19%,6.45%,0 %,0 %,2.26%,0 %,1.94%,0 %,4.19%,0 %,0 %,0.65%,0 %,0 %,0 %,0 %,0 %,0 %,0.32%
+ RF00100,65.22%,0 %,4.35%,0 %,5.07%,0.72%,0 %,8.70%,0 %,0 %,0 %,2.90%,13.04%,0 %,0 %,0 %,0 %,0 %,0 %
+ RF00162,73.74%,6.90%,0.07%,2.15%,0.96%,0 %,0.59%,0 %,2.52%,2.82%,4.15%,2.37%,0.07%,0.45%,3.04%,0 %,0 %,0.15%,0 %
+ RF00164,76.19%,4.76%,0 %,0 %,0 %,0 %,0 %,0 %,4.76%,4.76%,9.52%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
+ RF00167,67.80%,0 %,7.77%,0.23%,2.51%,0 %,0 %,2.63%,2.22%,3.10%,2.63%,2.98%,0 %,5.14%,2.63%,0.29%,0.06%,0 %,0 %
+ RF00168,76.92%,4.74%,1.95%,2.41%,0.45%,1.20%,1.20%,2.41%,3.23%,1.20%,0.68%,1.43%,0.98%,0 %,0 %,1.20%,0 %,0 %,0 %
+ RF00169,70.92%,9.56%,3.19%,0.80%,4.78%,0 %,0.40%,9.16%,0 %,0 %,0 %,0 %,0.80%,0 %,0.40%,0 %,0 %,0 %,0 %
+ RF00174,71.01%,2.90%,5.07%,4.35%,2.90%,0.72%,1.45%,2.17%,0 %,2.17%,2.90%,1.45%,0.72%,2.17%,0 %,0 %,0 %,0 %,0 %
+ RF00177,63.05%,3.95%,4.48%,2.84%,3.20%,2.13%,2.18%,2.57%,2.50%,2.24%,2.00%,1.72%,2.02%,1.58%,1.44%,0.78%,0.70%,0.34%,0.29%
+ RF00233,72.06%,1.47%,7.35%,2.94%,0 %,2.94%,0 %,0 %,4.41%,0 %,2.94%,1.47%,2.94%,0 %,0 %,0 %,1.47%,0 %,0 %
+ RF00234,73.03%,1.96%,0.68%,0.64%,1.28%,1.96%,2.42%,5.29%,2.92%,0.59%,0.41%,7.07%,1.32%,0 %,0.23%,0 %,0.18%,0 %,0 %
+ RF00250,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
+ RF00379,71.10%,6.46%,1.46%,7.07%,1.10%,0.12%,3.29%,0.24%,2.93%,1.46%,1.95%,1.59%,0.61%,0 %,0 %,0.12%,0.49%,0 %,0 %
+ RF00380,64.46%,5.37%,1.24%,2.07%,6.20%,3.31%,2.89%,4.96%,2.48%,1.24%,2.07%,0 %,0 %,1.24%,1.24%,0 %,1.24%,0 %,0 %
+ RF00382,50.00%,0 %,0 %,0 %,20.59%,0 %,0 %,0 %,0 %,0 %,0 %,2.94%,20.59%,0 %,0 %,0 %,0 %,5.88%,0 %
+ RF00390,55.17%,0 %,0 %,0 %,6.90%,0 %,0 %,0 %,13.79%,6.90%,0 %,0 %,17.24%,0 %,0 %,0 %,0 %,0 %,0 %
+ RF00442,56.52%,6.52%,6.52%,2.17%,8.70%,2.17%,2.17%,2.17%,0 %,4.35%,2.17%,0 %,4.35%,0 %,0 %,2.17%,0 %,0 %,0 %
+ RF00458,70.22%,3.37%,5.06%,0 %,5.34%,1.97%,0 %,1.40%,1.97%,1.97%,0.28%,0.28%,2.81%,1.97%,0.84%,0.84%,0.56%,0.84%,0.28%
+ RF00488,91.95%,0.20%,0 %,0.20%,0.80%,1.41%,0.10%,0.50%,0.91%,1.21%,0.10%,0.30%,0.70%,0.70%,0 %,0 %,0.30%,0.50%,0.10%
+ RF00504,72.66%,3.88%,2.59%,7.77%,3.02%,0 %,2.45%,0.29%,2.59%,0 %,1.58%,0 %,0 %,0 %,0.14%,0.14%,2.88%,0 %,0 %
+ RF00505,100.00%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
+ RF01051,64.48%,5.37%,0 %,2.84%,4.93%,0 %,2.84%,4.18%,4.33%,2.09%,1.49%,1.94%,0.60%,3.43%,0.60%,0.60%,0 %,0.15%,0.15%
+ RF01357,80.00%,10.00%,0 %,0 %,0 %,0 %,0 %,0 %,10.00%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
+ RF01510,85.62%,0 %,0 %,0 %,1.09%,0 %,0 %,0 %,3.27%,0.22%,0 %,0 %,0 %,6.32%,3.49%,0 %,0 %,0 %,0 %
+ RF01689,75.95%,3.80%,5.06%,0 %,1.27%,5.06%,0 %,0.63%,1.27%,0 %,1.27%,3.16%,0 %,0 %,2.53%,0 %,0 %,0 %,0 %
+ RF01725,71.25%,7.50%,0 %,0 %,1.25%,0 %,5.00%,0 %,5.00%,0 %,5.00%,2.50%,0 %,0 %,2.50%,0 %,0 %,0 %,0 %
+ RF01734,75.76%,8.08%,0 %,0 %,0 %,5.05%,3.03%,5.05%,0 %,0 %,1.01%,2.02%,0 %,0 %,0 %,0 %,0 %,0 %,0 %
+ RF01739,61.06%,3.54%,4.42%,3.54%,7.96%,3.54%,0 %,0 %,3.54%,1.77%,0 %,0 %,3.54%,0 %,0 %,3.54%,3.54%,0 %,0 %
+ RF01750,79.22%,4.55%,0 %,3.90%,1.30%,0 %,0 %,1.30%,0 %,0 %,3.90%,0 %,1.30%,0 %,0 %,0 %,4.55%,0 %,0 %
+ RF01763,42.70%,0.28%,5.23%,0 %,12.67%,3.58%,0 %,0 %,2.20%,0 %,3.03%,2.75%,20.94%,6.61%,0 %,0 %,0 %,0 %,0 %
+ RF01786,76.39%,2.78%,5.56%,2.78%,1.39%,0 %,0 %,2.78%,5.56%,0 %,0 %,0 %,0 %,0 %,2.78%,0 %,0 %,0 %,0 %
+ RF01807,74.12%,3.53%,2.35%,0 %,2.35%,4.71%,2.35%,1.18%,0 %,1.18%,0 %,1.18%,2.35%,1.18%,0 %,1.18%,0 %,0 %,2.35%
+ RF01826,50.00%,0 %,8.33%,4.17%,4.17%,4.17%,4.17%,0 %,0 %,0 %,4.17%,0 %,20.83%,0 %,0 %,0 %,0 %,0 %,0 %
+ RF01831,78.61%,1.19%,2.97%,1.98%,1.19%,0 %,3.56%,3.96%,1.78%,2.38%,0 %,0 %,0 %,0 %,2.38%,0 %,0 %,0 %,0 %
+ RF01846,86.57%,3.14%,0.43%,1.71%,1.00%,0.57%,0.29%,1.43%,0.29%,1.14%,0 %,1.00%,0.43%,0.57%,0.29%,0.29%,0.86%,0 %,0 %
+ RF01852,71.41%,0.42%,1.47%,0.10%,4.63%,1.18%,0.06%,4.89%,4.63%,2.20%,0.03%,0.45%,6.65%,0.22%,0.64%,0 %,0.77%,0.06%,0.19%
+ RF01854,68.87%,5.96%,4.64%,3.97%,3.97%,1.99%,2.65%,2.65%,0 %,0 %,1.99%,0 %,1.32%,0 %,0.66%,0 %,1.32%,0 %,0 %
+ RF01857,71.35%,4.21%,2.81%,0 %,3.93%,2.25%,2.53%,5.34%,0 %,0.56%,1.97%,1.69%,0.56%,1.12%,1.69%,0 %,0 %,0 %,0 %
+ RF01960,66.53%,3.35%,3.47%,2.51%,3.10%,2.23%,1.24%,2.17%,1.66%,2.49%,1.75%,1.64%,2.30%,1.38%,1.71%,0.42%,1.34%,0.49%,0.22%
+ RF01998,56.65%,4.92%,4.37%,6.74%,3.10%,0.91%,7.10%,4.01%,2.73%,1.09%,0 %,0.36%,3.64%,0.36%,0 %,3.46%,0.55%,0 %,0 %
+ RF02001,74.15%,5.56%,0.28%,5.07%,0.83%,0.07%,4.86%,3.47%,0.14%,0 %,0.07%,0.90%,0.63%,0.35%,0.49%,0 %,2.78%,0 %,0.35%
+ RF02012,76.03%,5.48%,0 %,4.11%,1.37%,0.68%,0 %,0 %,2.74%,0 %,0 %,0 %,1.37%,2.05%,0 %,0 %,4.11%,1.37%,0.68%
+ RF02253,100.00%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
+ RF02348,80.00%,5.00%,0 %,3.33%,0 %,0 %,0 %,1.67%,1.67%,3.33%,0 %,0 %,0 %,0 %,0 %,0 %,5.00%,0 %,0 %
+ RF02519,66.67%,0 %,0 %,0 %,16.67%,0 %,8.33%,0 %,8.33%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
+ RF02540,60.17%,5.14%,3.83%,3.92%,2.79%,2.53%,3.11%,1.90%,2.22%,1.96%,2.38%,2.25%,1.45%,1.79%,1.50%,1.94%,0.55%,0.28%,0.31%
+ RF02541,62.00%,4.13%,3.68%,3.79%,2.68%,2.55%,2.84%,2.12%,2.25%,1.87%,2.18%,1.89%,1.71%,1.78%,1.53%,1.61%,0.65%,0.35%,0.38%
+ RF02543,66.82%,3.48%,2.88%,3.00%,2.51%,2.52%,1.61%,2.09%,1.74%,2.13%,1.88%,1.84%,1.95%,1.51%,1.25%,1.41%,0.74%,0.36%,0.26%
+ RF02545,65.43%,0.82%,4.12%,2.88%,1.23%,3.70%,1.65%,1.65%,2.47%,2.47%,1.23%,1.23%,0.82%,2.47%,3.70%,2.47%,0.82%,0.82%,0 %
+ RF02546,82.61%,0 %,8.70%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,4.35%,0 %,0 %,0 %,0 %,4.35%
+ RF02553,73.68%,2.63%,7.89%,0 %,0 %,2.63%,0 %,0 %,2.63%,0 %,0 %,5.26%,0 %,2.63%,0 %,2.63%,0 %,0 %,0 %
+ RF02680,88.89%,0 %,2.78%,0 %,2.78%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,5.56%,0 %,0 %,0 %,0 %,0 %,0 %
+ RF02683,80.56%,2.78%,0 %,5.56%,2.78%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,2.78%,2.78%,0 %,2.78%,0 %,0 %,0 %
+ RF02796,78.69%,4.92%,0 %,4.92%,4.92%,0 %,0 %,0 %,4.92%,0 %,0 %,1.64%,0 %,0 %,0 %,0 %,0 %,0 %,0 %
+ TOTAL,63.42%,3.93%,3.83%,3.23%,2.83%,2.35%,2.28%,2.28%,2.26%,2.13%,1.96%,1.88%,1.82%,1.68%,1.46%,1.25%,0.73%,0.35%,0.33%
--- a/results/pairings.png 0 → 100644
View file @40cb5b7
+++ b/results/pairings.png 0 → 100644
View file @40cb5b7
--- a/results/realign_jobs_performance.png 0 → 100644
View file @40cb5b7
+++ b/results/realign_jobs_performance.png 0 → 100644
View file @40cb5b7
--- a/results/regression.png deleted 100644 → 0
View file @36f12c2
+++ b/results/regression.png deleted 100644 → 0
View file @36f12c2
--- a/statistics.py
View file @40cb5b7
+++ b/statistics.py
View file @40cb5b7
 #!/usr/bin/python3.8
- import os, pickle
+ import os, pickle, sys
 import numpy as np
 import pandas as pd
 import threading as th
@@ -20,21 +20,12 @@ from collections import Counter
 from RNAnet import read_cpu_number
 
 
- if os.path.isdir("/home/ubuntu/"): # this is the IFB-core cloud
-     path_to_3D_data = "/mnt/Data/RNA/3D/"
-     path_to_seq_data = "/mnt/Data/RNA/sequences/"
- elif os.path.isdir("/home/persalteas"): # this is my personal workstation
-     path_to_3D_data = "/home/persalteas/Data/RNA/3D/"
-     path_to_seq_data = "/home/persalteas/Data/RNA/sequences/"
- elif os.path.isdir("/home/lbecquey"): # this is the IBISC server
-     path_to_3D_data = "/home/lbecquey/Data/RNA/3D/"
-     path_to_seq_data = "/home/lbecquey/Data/RNA/sequences/"
- elif os.path.isdir("/nhome/siniac/lbecquey"): # this is the office PC
-     path_to_3D_data = "/nhome/siniac/lbecquey/Data/RNA/3D/"
-     path_to_seq_data = "/nhome/siniac/lbecquey/Data/RNA/sequences/"
- else:
-     print("I don't know that machine... I'm shy, maybe you should introduce yourself ?")
-     exit(1)
+ path_to_3D_data = "/nhome/siniac/lbecquey/Data/RNA/3D/"
+ path_to_seq_data = "/nhome/siniac/lbecquey/Data/RNA/sequences/"
+ 
+ if len(sys.argv) > 1:
+     path_to_3D_data = path.abspath(sys.argv[1])
+     path_to_seq_data = path.abspath(sys.argv[2])
 
 class DataPoint():
     def __init__(self, path_to_textfile):
@@ -80,7 +71,7 @@ def reproduce_wadley_results(points, show=False, carbon=4, sd_range=(1,4)):
         c3_endo_etas = []
         c2_endo_thetas = []
         c3_endo_thetas = []
-         for p in points:
+         for p in tqdm(points, desc="Loading eta/thetas", position=worker_nbr, leave=False):
             df = p.df.loc[(p.df[angle].isna()==False) & (p.df["th"+angle].isna()==False), ["form","puckering", angle,"th"+angle]]
             c2_endo_etas   += list(df.loc[ (df.puckering=="C2'-endo"), angle ].values)
             c3_endo_etas   += list(df.loc[ (df.form=='.') & (df.puckering=="C3'-endo"), angle ].values)
@@ -112,14 +103,17 @@ def reproduce_wadley_results(points, show=False, carbon=4, sd_range=(1,4)):
         f_c2 = f["kernel_c2"]
         xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j]
 
-     print(f"[{worker_nbr}]\tKernel computed (or loaded from file).")
+     # print(f"[{worker_nbr}]\tKernel computed (or loaded from file).")
 
     # exact counts:
     hist_c2, xedges, yedges = np.histogram2d(c2_endo_etas, c2_endo_thetas, bins=int(2*np.pi/0.1), range=[[0, 2*np.pi], [0, 2*np.pi]])
     hist_c3, xedges, yedges = np.histogram2d(c3_endo_etas, c3_endo_thetas, bins=int(2*np.pi/0.1), range=[[0, 2*np.pi], [0, 2*np.pi]])
     color_values = cm.jet(hist_c3.ravel()/hist_c3.max())
 
-     for x, y, hist, f, l in zip( (c3_endo_etas, c2_endo_etas), (c3_endo_thetas, c2_endo_thetas), (hist_c3, hist_c2), (f_c3, f_c2), ("c3","c2")):
+     for x, y, hist, f, l in zip( (c3_endo_etas, c2_endo_etas), 
+                                  (c3_endo_thetas, c2_endo_thetas), 
+                                  (hist_c3, hist_c2), 
+                                  (f_c3, f_c2), ("c3","c2")):
         # cut hist and kernel
         hist_sup_thr = hist.mean() + sd_range[1]*hist.std()
         hist_cut = np.where( hist > hist_sup_thr, hist_sup_thr, hist)
@@ -136,10 +130,9 @@ def reproduce_wadley_results(points, show=False, carbon=4, sd_range=(1,4)):
         ax.bar3d(xpos.ravel(), ypos.ravel(), 0.0, 0.09, 0.09, hist_cut.ravel(), color=color_values, zorder="max")
         ax.set_xlabel(xlabel)
         ax.set_ylabel(ylabel)
-         plt.savefig(f"results/wadley_hist_{angle}_{l}.png")
+         fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}.png")
         if show:
-             plt.show()
-         plt.close()
+             fig.show()
 
         # Smoothed joint distribution
         fig = plt.figure()
@@ -147,10 +140,9 @@ def reproduce_wadley_results(points, show=False, carbon=4, sd_range=(1,4)):
         ax.plot_surface(xx, yy, f_cut, cmap=cm.coolwarm, linewidth=0, antialiased=True)
         ax.set_xlabel(xlabel)
         ax.set_ylabel(ylabel)
-         plt.savefig(f"results/wadley_distrib_{angle}_{l}.png")
+         fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}.png")
         if show:
-             plt.show()
-         plt.close()
+             fig.show()
 
         # 2D Wadley plot
         fig = plt.figure(figsize=(5,5))
@@ -160,15 +152,15 @@ def reproduce_wadley_results(points, show=False, carbon=4, sd_range=(1,4)):
 
         ax.set_xlabel(xlabel)
         ax.set_ylabel(ylabel)
-         fig.savefig(f"results/wadley_{angle}_{l}.png")
+         fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}.png")
         if show:
-             plt.show()
-     print(f"[{worker_nbr}]\tComputed joint distribution of angles (C{carbon}) and saved the figures.")
+             fig.show()
+     # print(f"[{worker_nbr}]\tComputed joint distribution of angles (C{carbon}) and saved the figures.")
 
 def stats_len(mappings_list, points):
     cols = []
     lengths = []
-     for f in sorted(mappings_list.keys()):
+     for f in tqdm(sorted(mappings_list.keys()), desc="Chain length by family", position=3, leave=False):
         if f in ["RF02540","RF02541","RF02543"]:
             cols.append("red") # LSU
         elif f in ["RF00177","RF01960","RF01959","RF02542"]:
@@ -187,22 +179,21 @@ def stats_len(mappings_list, points):
             l.append(len(r.df['nt_code']))
         lengths.append(l)
 
-     plt.figure(figsize=(10,3))
-     ax = plt.gca()
+     fig = plt.figure(figsize=(10,3))
+     ax = fig.gca()
     ax.hist(lengths, bins=100, stacked=True, log=True, color=cols, label=sorted(mappings_list.keys()))
     ax.set_xlabel("Sequence length (nucleotides)")
     ax.set_ylabel("Number of 3D chains")
-     plt.tight_layout()
-     handles, labels = ax.get_legend_handles_labels()
+     fig.tight_layout()
     filtered_handles = [mpatches.Patch(color='red'), mpatches.Patch(color='white'),
                         mpatches.Patch(color='blue'), mpatches.Patch(color='white'),
                         mpatches.Patch(color='green'), mpatches.Patch(color='purple'),
                         mpatches.Patch(color='orange'), mpatches.Patch(color='grey')]
     filtered_labels = ['Large Ribosomal Subunits', '(RF02540, RF02541, RF02543)','Small Ribosomal Subunits','(RF01960, RF00177)',
                        '5S rRNA (RF00001)', '5.8S rRNA (RF00002)', 'tRNA (RF00005)', 'Other']
-     ax.legend(filtered_handles, filtered_labels, loc='best', ncol=2)# bbox_to_anchor=(0.5, -0.5), ncol=4, fontsize=)
-     plt.savefig("results/lengths.png")
-     print("[3]\tComputed sequence length statistics and saved the figure.")
+     ax.legend(filtered_handles, filtered_labels, loc='best', ncol=2)
+     fig.savefig("results/lengths.png")
+     # print("[3]\tComputed sequence length statistics and saved the figure.")
 
 def format_percentage(tot, x):
         if not tot:
@@ -210,6 +201,8 @@ def format_percentage(tot, x):
         x = 100*x/tot
         if x >= 0.01:
             x = "%.2f" % x
+         elif x == 0:
+             return "0 %"
         else:
             x = "<.01"
         return x + '%'
@@ -219,7 +212,7 @@ def stats_freq(mappings_list, points):
     for f in mappings_list.keys():
         freqs[f] = Counter()
 
-     for r in points:
+     for r in tqdm(points, desc="Nucleotide frequencies", position=4, leave=False):
         freqs[r.family].update(dict(r.df['nt_name'].value_counts()))
     
     df = pd.DataFrame()
@@ -229,7 +222,7 @@ def stats_freq(mappings_list, points):
     df = df.fillna(0)
     df.to_csv("results/frequencies.csv")
 
-     print("[4]\tComputed nucleotide statistics and saved CSV file.")
+     # print("[4]\tComputed nucleotide statistics and saved CSV file.")
 
 def stats_pairs(mappings_list, points):
 
@@ -242,25 +235,30 @@ def stats_pairs(mappings_list, points):
         freqs[f] = Counter()
 
     # Iterate over data points
-     for r in tqdm(points, desc="RNA points", position=0, leave=False):
-         # Skip if linear piece of RNA
-         if not sum([ x != 0 for x in r.df.paired ]):
-             continue 
- 
-         # Count each pair type within the molecule
-         vcnts = pd.concat(
-                             [   pd.Series(row['pair_type_LW'].split(',')) 
-                                 for _, row in r.df.dropna(subset=["pair_type_LW"]).iterrows() ]
-                         ).reset_index(drop=True).value_counts()
- 
-         # Add these new counts to the family's counter
-         freqs[r.family].update(dict(vcnts))
-     
-     # Create the output dataframe
-     df = pd.DataFrame()
-     for f in sorted(mappings_list.keys()):
-         df = pd.concat([ df, pd.DataFrame([[ x for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ])
-     df = df.fillna(0)
+     if not path.isfile("data/pair_counts.csv"):
+         for r in tqdm(points, desc="Leontis-Westhof basepair stats", position=5, leave=False):
+             # Skip if linear piece of RNA
+             if r.df.pair_type_LW.isna().all():
+                 continue 
+ 
+             # Count each pair type within the molecule
+             vcnts = pd.concat(
+                                 [   pd.Series(row['pair_type_LW'].split(',')) 
+                                     for _, row in r.df.dropna(subset=["pair_type_LW"]).iterrows() ]
+                             ).reset_index(drop=True).value_counts()
+ 
+             # Add these new counts to the family's counter
+             freqs[r.family].update(dict(vcnts))
+         
+         # Create the output dataframe
+         df = pd.DataFrame()
+         for f in sorted(mappings_list.keys()):
+             df = pd.concat([ df, pd.DataFrame([[ x for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ])
+         df = df.fillna(0)
+         df.to_csv("data/pair_counts.csv")
+     else:
+         df = pd.read_csv("data/pair_counts.csv", index_col=0)
+ 
 
     # Remove not very well defined pair types (not in the 12 LW types)
     col_list = [ x for x in df.columns if '.' in x ]
@@ -288,7 +286,7 @@ def stats_pairs(mappings_list, points):
     plt.subplots_adjust(bottom=0.2, right=0.99)
     plt.savefig("results/pairings.png")
 
-     print("[5]\tComputed nucleotide statistics and saved CSV and PNG file.")
+     # print("[5]\tComputed nucleotide statistics and saved CSV and PNG file.")
 
 def to_dist_matrix(f):
     if path.isfile("data/"+f+".npy"):
@@ -311,11 +309,11 @@ def seq_idty(mappings_list):
         if len(mappings_list[x]) == 1:
             ignored.append(x)
     if len(ignored):
-         print("Ignoring families with only one chain:", " ".join(ignored))
+         print("Idty matrices: Ignoring families with only one chain:", " ".join(ignored)+'\n')
 
     # compute distance matrices
     p = Pool(processes=8)
-     pbar = tqdm(total=len(famlist), desc="Families idty matrices", position=1, leave=True)
+     pbar = tqdm(total=len(famlist), desc="Families idty matrices", position=0, leave=False)
     for i, _ in enumerate(p.imap_unordered(to_dist_matrix, famlist)):
         pbar.update(1)
     pbar.close()
@@ -353,16 +351,17 @@ def seq_idty(mappings_list):
     fig.subplots_adjust(wspace=0.1, hspace=0.3)
     fig.colorbar(im, ax=axs[-1], shrink=0.8)
     fig.savefig(f"results/distances.png")
-     print("[6]\tComputed identity matrices and saved the figure.")
+     # print("[6]\tComputed identity matrices and saved the figure.")
 
 if __name__ == "__main__":
 
     #################################################################
     #               LOAD ALL FILES
     #################################################################
+     os.makedirs("results/figures/wadley_plots/", exist_ok=True)
 
     print("Loading mappings list...")
-     mappings_list = pd.read_csv(path_to_seq_data + "realigned/mappings_list.csv", sep=',', index_col=0).to_dict(orient='list')
+     mappings_list = pd.read_csv("results/mappings_list.csv", sep=',', index_col=0).to_dict(orient='list')
     for k in mappings_list.keys():
         mappings_list[k] = [ x for x in mappings_list[k] if str(x) != 'nan' ]
 
@@ -372,9 +371,9 @@ if __name__ == "__main__":
             rna_points = pickle.load(f)
     else:
         rna_points = []
-         filelist = [path_to_3D_data+"/datapoints/"+f for f in os.listdir(path_to_3D_data+"/datapoints") if ".log" not in f and ".gz" not in f]
+         filelist = [path_to_3D_data+"/datapoints/"+f for f in os.listdir(path_to_3D_data+"/datapoints") ]
         p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(),), processes=read_cpu_number())
-         pbar = tqdm(total=len(filelist), desc="RNA files", position=0, leave=True)
+         pbar = tqdm(total=len(filelist), desc="RNA files", position=0, leave=False)
         for i, rna in enumerate(p.imap_unordered(load_rna_frome_file, filelist)):
             rna_points.append(rna)
             pbar.update(1)
@@ -389,29 +388,18 @@ if __name__ == "__main__":
     #################################################################
     #               Define threads for the tasks
     #################################################################
-     wadley_thr = []
-     wadley_thr.append(th.Thread(target=reproduce_wadley_results, args=[rna_points], kwargs={'carbon': 1}))
-     wadley_thr.append(th.Thread(target=reproduce_wadley_results, args=[rna_points], kwargs={'carbon': 4}))
- 
-     seq_len_thr = th.Thread(target=partial(stats_len, mappings_list), args=[rna_points])
-     nt_freq_thr = th.Thread(target=partial(stats_freq, mappings_list), args=[rna_points])
-     pairs_freq_thr = th.Thread(target=partial(stats_pairs, mappings_list), args=[rna_points])
-     dist_thr = th.Thread(target=seq_idty, args=[mappings_list])
- 
- 
- 
-     for t in wadley_thr:
+     threads = [
+         th.Thread(target=reproduce_wadley_results, args=[rna_points], kwargs={'carbon': 1}),
+         th.Thread(target=reproduce_wadley_results, args=[rna_points], kwargs={'carbon': 4}),
+         th.Thread(target=partial(stats_len, mappings_list), args=[rna_points]),
+         th.Thread(target=partial(stats_freq, mappings_list), args=[rna_points]),
+         th.Thread(target=partial(stats_pairs, mappings_list), args=[rna_points]),
+         th.Thread(target=seq_idty, args=[mappings_list])
+     ]
+ 
+     for t in threads:
         t.start()
-     seq_len_thr.start()
-     nt_freq_thr.start()
-     pairs_freq_thr.start()
-     dist_thr.start()
- 
 
-     for t in wadley_thr:
+     for t in threads:
         t.join()
-     seq_len_thr.join()
-     nt_freq_thr.join()
-     pairs_freq_thr.join()
-     dist_thr.join()