Louis BECQUEY

With stats results

# execution outputs:
nohup.out
jobstats.csv
log_of_the_run.sh
# results
results/figures/wadley_plots/
# temporary results files
data/*.npy
data/*.npz
data/olddata
data/
# environment stuff
.vscode/
......
......@@ -27,7 +27,6 @@ running_stats = m.list()
running_stats.append(0) # n_launched
running_stats.append(0) # n_finished
running_stats.append(0) # n_skipped
runDir = path.dirname(path.realpath(__file__))
path_to_3D_data = "tobedefinedbyoptions"
path_to_seq_data = "tobedefinedbyoptions"
validsymb = '\U00002705'
......@@ -40,6 +39,7 @@ KEEP_HETATM = False
FILL_GAPS = True
HOMOLOGY = True
USE_KNOWN_ISSUES = True
RUN_STATS = False
class NtPortionSelector(object):
"""Class passed to MMCIFIO to select some chain portions in an MMCIF file.
......@@ -119,17 +119,18 @@ class Chain:
Chains accumulate information through this script, and are saved to files at the end of major steps."""
def __init__(self, pdb_id, pdb_model, pdb_chain_id, chain_label, rfam="", pdb_start=None, pdb_end=None):
def __init__(self, pdb_id, pdb_model, pdb_chain_id, chain_label, rfam="", inferred=False, pdb_start=None, pdb_end=None):
self.pdb_id = pdb_id # PDB ID
self.pdb_model = int(pdb_model) # model ID, starting at 1
self.pdb_chain_id = pdb_chain_id # chain ID (mmCIF), multiple letters
self.pdb_start = pdb_start # if portion of chain, the start number (relative to the chain, not residue numbers)
self.pdb_end = pdb_end # if portion of chain, the end number (relative to the chain, not residue numbers)
self.reversed = False # whether pdb_end > pdb_start in the Rfam mapping
# NOTE(review): `pdb_start > pdb_end` raises TypeError when both bounds are left at
# their None defaults (chains built without a mapping) — confirm callers always pass
# both, or guard this comparison.
self.reversed = (pdb_start > pdb_end) # whether pdb_start > pdb_end in the Rfam mapping
self.chain_label = chain_label # chain pretty name
self.full_mmCIFpath = "" # path to the source mmCIF structure
self.file = "" # path to the 3D PDB file
self.rfam_fam = rfam # mapping to an RNA family
self.inferred = inferred # Whether this mapping has been inferred from BGSU's NR list
self.seq = "" # sequence with modified nts
self.aligned_seq = "" # sequence with modified nts replaced, but gaps can exist
self.length = -1 # length of the sequence (missing residues are not counted)
......@@ -848,7 +849,7 @@ def execute_job(j, jobcount):
print(f"[{running_stats[0]+running_stats[2]}/{jobcount}]\t{j.label}")
# Add the command to logfile
logfile = open(runDir + "/log_of_the_run.sh", 'a')
logfile = open("log_of_the_run.sh", 'a')
logfile.write(" ".join(j.cmd_))
logfile.write("\n")
logfile.close()
......@@ -916,7 +917,7 @@ def execute_joblist(fulljoblist, printstats=False):
if printstats:
# Write statistics in a file (header here)
f = open("data/jobstats.csv", "w")
f = open(runDir + "/data/jobstats.csv", "w")
f.write("label,comp_time,max_mem\n")
f.close()
......@@ -948,7 +949,7 @@ def execute_joblist(fulljoblist, printstats=False):
mems = [ r[1] for r in raw_results ]
# Write them to file
f = open("data/jobstats.csv", "a")
f = open(runDir + "/data/jobstats.csv", "a")
for j, t, m in zip(bunch, times, mems):
j.comp_time = t
j.max_mem = m
......@@ -1426,11 +1427,13 @@ def infer_all_mappings(allmappings, codelist):
if len(m):
pdb_start = int(m.pdb_start)
pdb_end = int(m.pdb_end)
inferred = False
else: # otherwise, use the inferred mapping
pdb_start = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_start)
pdb_end = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_end)
inferred = True
chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}_{pdb_start}-{pdb_end}"
newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, rfam=rfam, pdb_start=pdb_start, pdb_end=pdb_end))
newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end))
return newchains
......@@ -1439,7 +1442,7 @@ if __name__ == "__main__":
# Parse options
try:
opts, args = getopt.getopt( sys.argv[1:],
"r:h",
"r:hs",
[ "help", "resolution=", "keep-hetatm=",
"fill-gaps=", "3d-folder=", "seq-folder=",
"no-homology", "force-retry" ])
......@@ -1458,7 +1461,8 @@ if __name__ == "__main__":
print()
print("-r 4.0 [ --resolution=4.0 ]\t(1.5 | 2.0 | 2.5 | 3.0 | 3.5 | 4.0 | 20.0)"
"\n\t\t\t\tMinimum 3D structure resolution to consider a RNA chain.")
print("--keep-hetatm=False\t\t\t(True | False) Keep ions, waters and ligands in produced mmCIF files. "
print("-s\t\t\t\tRun statistics computations after completion")
print("--keep-hetatm=False\t\t(True | False) Keep ions, waters and ligands in produced mmCIF files. "
"\n\t\t\t\tDoes not affect the descriptors.")
print("--fill-gaps=True\t\t(True | False) Replace gaps in sequence due to unresolved residues"
"\n\t\t\t\tby the most common nucleotide at this position in the alignment.")
......@@ -1481,6 +1485,8 @@ if __name__ == "__main__":
elif opt == "-r" or opt == "--resolution":
assert arg in ["1.5", "2.0", "2.5", "3.0", "3.5", "4.0", "20.0"]
CRYSTAL_RES = arg
elif opt == "-s":
RUN_STATS = True
elif opt=="--keep-hetatm":
assert arg in [ "True", "False" ]
KEEP_HETATM = (arg == "True")
......@@ -1505,17 +1511,18 @@ if __name__ == "__main__":
if path_to_3D_data == "tobedefinedbyoptions" or path_to_seq_data == "tobedefinedbyoptions":
print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments")
print("See RNANet.py --help for more information.")
path_to_3D_data = "/home/lbecquey/Data/RNA/3D/"
path_to_seq_data = "/home/lbecquey/Data/RNA/sequences/"
print(f"\n[DEBUG]\tUsing hard-coded paths to data:\n\t\t{path_to_3D_data}\n\t\t{path_to_seq_data}\n")
# exit(1)
exit(1)
runDir = path.dirname(path.realpath(__file__))
os.makedirs(runDir + "/results", exist_ok=True)
# ===========================================================================
# List 3D chains with available Rfam mapping
# ===========================================================================
# List all 3D RNA chains below 4Ang resolution
chains_database = pd.DataFrame(columns=['pdb_id', 'pdb_model', 'pdb_chain', 'rfam_fam', 'pdb_start', 'pdb_end', 'reversed', 'inferred', 'issue'])
# List all 3D RNA chains below given resolution
full_structures_list = download_BGSU_NR_list()
# Check for a list of known problems:
......@@ -1528,6 +1535,13 @@ if __name__ == "__main__":
print("\t> Ignoring known issues:")
for x in known_issues:
print("\t ", x)
chains_database = chains_database.append(pd.DataFrame({ 'pdb_id':x.split('_')[0],
'pdb_model':x.split('_')[1],
'pdb_chain':x.split('_')[2],
'pdb_start':x.split('_')[3].split('-')[0],
'pdb_end':x.split('_')[3].split('-')[1],
'issue':True
}, index=[x]))
all_chains = []
if HOMOLOGY:
......@@ -1559,6 +1573,11 @@ if __name__ == "__main__":
all_chains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label))
del full_structures_list
chains_database = chains_database.append(pd.DataFrame.from_dict(
{c.chain_label:[ c.pdb_id, c.pdb_model, c.pdb_chain_id, c.rfam_fam, c.pdb_start, c.pdb_end, c.reversed, c.inferred, False ] for c in all_chains},
orient='index',
columns=['pdb_id', 'pdb_model', 'pdb_chain', 'rfam_fam', 'pdb_start', 'pdb_end', 'reversed', 'inferred', 'issue'] ))
chains_database.to_csv(runDir + "/results/results_database.csv")
n_chains = len(all_chains)
print(">", validsymb, n_chains, "RNA chains of interest.")
......@@ -1621,7 +1640,7 @@ if __name__ == "__main__":
else:
rfam_acc_to_download[c.rfam_fam].append(c)
mappings_list[c.rfam_fam].append(c.chain_label)
pd.DataFrame.from_dict(mappings_list, orient='index').transpose().to_csv(path_to_seq_data + "realigned/mappings_list.csv")
pd.DataFrame.from_dict(mappings_list, orient='index').transpose().to_csv(runDir + "/results/mappings_list.csv")
del mappings_list
print(f"> Identified {len(rfam_acc_to_download.keys())} families to download and re-align with the crystals' sequences:")
......@@ -1636,7 +1655,7 @@ if __name__ == "__main__":
n_pdb = [ len(rfam_acc_to_download[f]) for f in fam_stats["rfam_acc"] ]
fam_stats["n_pdb_seqs"] = n_pdb
fam_stats["total_seqs"] = fam_stats["n_seq"] + fam_stats["n_pdb_seqs"]
fam_stats.to_csv(path_to_seq_data + "data/statistics.csv")
fam_stats.to_csv(runDir + "/data/statistics.csv")
# print the stats
for f in fam_list:
line = fam_stats[fam_stats["rfam_acc"]==f]
......@@ -1690,7 +1709,23 @@ if __name__ == "__main__":
p.close()
p.join()
print("Completed.") # This part of the code is supposed to release some serotonin in the modeller's brain
# ==========================================================================================
# Post computation tasks
# ==========================================================================================
# Archive the results
os.makedirs("results/archive", exist_ok=True)
time_str = time.strftime("%Y%m%d")
subprocess.run(["tar","-C", path_to_3D_data + "/datapoints","-czf",f"results/archive/RNANET_datapoints_{time_str}.tar.gz","."])
subprocess.run(['ln',"-s", runDir +f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", runDir + "/results/RNANET_datapoints_latest.tar.gz"])
# Run statistics
if RUN_STATS:
os.chdir(runDir)
subprocess.run(["python3", "regression.py"])
subprocess.run(["python3", "statistics.py", path_to_3D_data, path_to_seq_data])
print("Completed.") # This part of the code is supposed to release some serotonin in the modeller's brain, do not remove
# # so i can sleep for the end of the night
# subprocess.run(["shutdown","now"])
......
label,comp_time,max_mem
Realign RF00001 + 733 chains,347.5666003227234,783781888
Realign RF00002 + 138 chains,15.574181318283081,710549504
Realign RF00004 + 10 chains,331.88619780540466,2516779008
Realign RF00005 + 869 chains,2349.9712748527527,6085918720
Realign RF00008 + 9 chains,7.597636461257935,247132160
Realign RF00009 + 4 chains,423.78941464424133,22123020288
Realign RF00010 + 3 chains,167.0309178829193,5554601984
Realign RF00011 + 4 chains,10.090157508850098,996966400
Realign RF00013 + 1 chains,17.571903228759766,474783744
Realign RF00015 + 6 chains,98.247323513031,1385431040
Realign RF00017 + 10 chains,2218.9181191921234,13771866112
Realign RF00020 + 17 chains,23.84590220451355,431484928
Realign RF00023 + 7 chains,1196.2392709255219,7625351168
Realign RF00026 + 20 chains,82.25747513771057,518791168
Realign RF00028 + 5 chains,240.64744520187378,11369852928
Realign RF00029 + 1 chains,62.898540019989014,898707456
Realign RF00032 + 9 chains,7.049402236938477,162136064
Realign RF00037 + 2 chains,0.27519845962524414,108863488
Realign RF00050 + 6 chains,9.991205930709839,397705216
Realign RF00059 + 24 chains,52.07490301132202,532307968
Realign RF00061 + 1 chains,0.3395853042602539,233058304
Realign RF00080 + 4 chains,19.957021474838257,1301696512
Realign RF00100 + 6 chains,415.4162850379944,4435156992
Realign RF00162 + 27 chains,16.753626108169556,408281088
Realign RF00164 + 1 chains,0.05605888366699219,83927040
Realign RF00167 + 48 chains,4.422192573547363,264232960
Realign RF00168 + 16 chains,17.653642892837524,796184576
Realign RF00169 + 11 chains,9.363726615905762,226705408
Realign RF00174 + 2 chains,171.14065551757812,2648383488
Realign RF00177 + 498 chains,2885.531806945801,45187723264
Realign RF00233 + 2 chains,0.16314435005187988,138911744
Realign RF00234 + 37 chains,10.552204132080078,1207234560
Realign RF00250 + 1 chains,0.08687877655029297,83755008
Realign RF00379 + 7 chains,27.468972206115723,655532032
Realign RF00380 + 3 chains,2.397320508956909,245669888
Realign RF00442 + 1 chains,2.0599684715270996,222887936
Realign RF00458 + 7 chains,0.24766230583190918,197394432
Realign RF00488 + 3 chains,1.4626531600952148,850460672
Realign RF00504 + 18 chains,12.249290227890015,366731264
Realign RF00505 + 1 chains,0.06069207191467285,83628032
Realign RF01051 + 17 chains,7.672087907791138,297189376
Realign RF01510 + 16 chains,0.0939493179321289,83746816
Realign RF01689 + 4 chains,1.2797768115997314,400691200
Realign RF01725 + 2 chains,2.976431369781494,294690816
Realign RF01734 + 5 chains,1.8893005847930908,163631104
Realign RF01739 + 4 chains,1.6384203433990479,271265792
Realign RF01750 + 6 chains,8.268307209014893,421974016
Realign RF01763 + 13 chains,0.5894784927368164,135094272
Realign RF01786 + 2 chains,0.8764479160308838,182689792
Realign RF01807 + 1 chains,0.19919967651367188,166801408
Realign RF01826 + 1 chains,0.06825041770935059,83787776
Realign RF01831 + 10 chains,2.0323476791381836,254255104
Realign RF01846 + 2 chains,15.989834308624268,1073623040
Realign RF01852 + 16 chains,4.523370265960693,249016320
Realign RF01854 + 3 chains,8.060775518417358,647757824
Realign RF01857 + 1 chains,3.9880683422088623,587083776
Realign RF01960 + 140 chains,3388.5226855278015,56313212928
Realign RF02001 + 26 chains,22.095701456069946,1335533568
Realign RF02012 + 3 chains,10.277246713638306,796667904
Realign RF02253 + 1 chains,0.2654685974121094,104386560
Realign RF02348 + 2 chains,0.11346197128295898,82419712
Realign RF02519 + 1 chains,0.039333343505859375,81330176
Realign RF02540 + 67 chains,726.7017936706543,48769855488
Realign RF02545 + 3 chains,0.451732873916626,513720320
Realign RF02546 + 1 chains,0.3498055934906006,405676032
Realign RF02553 + 1 chains,1.2360577583312988,281141248
Realign RF02680 + 1 chains,0.09950971603393555,80687104
Realign RF02683 + 1 chains,1.070310115814209,282808320
Realign RF02796 + 6 chains,0.0940089225769043,81862656
rfam_acc,n_seq,maxlength,n_pdb_seqs,total_seqs
RF00001,70460,345,733,71193
RF00002,11746,289,138,11884
RF00004,10251,342,10,10261
RF00005,436080,293,869,436949
RF00008,2383,132,9,2392
RF00009,1217,1029,4,1221
RF00010,6473,812,3,6476
RF00011,787,436,4,791
RF00013,3502,254,1,3503
RF00015,5016,310,6,5022
RF00017,3733,806,10,3743
RF00020,4459,188,17,4476
RF00023,6656,784,7,6663
RF00026,23130,431,20,23150
RF00028,2051,892,5,2056
RF00029,8804,341,1,8805
RF00032,16724,88,9,16733
RF00037,1607,56,2,1609
RF00050,3746,347,6,3752
RF00059,9846,255,24,9870
RF00061,80,261,1,81
RF00080,788,241,4,792
RF00100,7822,636,6,7828
RF00162,4049,375,27,4076
RF00164,63,43,1,64
RF00167,1765,156,48,1813
RF00168,1889,334,16,1905
RF00169,6295,121,11,6306
RF00174,9480,476,2,9482
RF00177,25969,3531,498,26467
RF00233,49,87,2,51
RF00234,930,380,37,967
RF00250,63,60,1,64
RF00379,2637,324,7,2644
RF00380,921,282,3,924
RF00442,770,226,1,771
RF00458,16,215,7,23
RF00488,40,824,3,43
RF00504,3582,249,18,3600
RF00505,21,65,1,22
RF01051,3217,270,17,3234
RF01510,5,63,16,21
RF01689,344,215,4,348
RF01725,767,158,2,769
RF01734,1748,159,5,1753
RF01739,761,273,4,765
RF01750,1513,203,6,1519
RF01763,640,82,13,653
RF01786,496,122,2,498
RF01807,12,218,1,13
RF01826,14,93,1,15
RF01831,614,249,10,624
RF01846,616,537,2,618
RF01852,4469,112,16,4485
RF01854,1707,302,3,1710
RF01857,442,343,1,443
RF01960,27108,5325,140,27248
RF02001,2268,340,26,2294
RF02012,838,191,3,841
RF02253,677,63,1,678
RF02348,77,105,2,79
RF02519,6,33,1,7
RF02540,34679,9019,67,34746
RF02541,35613,8885,689,36302
RF02543,38161,11046,147,38308
RF02545,16,628,3,19
RF02546,18,572,1,19
RF02553,116,188,1,117
RF02680,34,103,1,35
RF02683,229,187,1,230
RF02796,13,70,6,19
#!/usr/bin/python3
#!/usr/bin/python3.8
# This file is supposed to propose regression models on the computation time and memory usage of the re-alignment jobs.
# Light jobs are monitored by the Monitor class in RNANet.py, and the measures are saved in jobstats.csv.
# This was done to guess the amount of memory required to re-align the large ribosomal subunit families RF02541 and RF02543.
# INFO: Our home hardware was a 24-core VM with 50GB RAM + 8GB Swap.
# INFO: Our home hardware was a 32-core VM with 50GB RAM + 8GB Swap.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy
import scipy, os
from sklearn.linear_model import LinearRegression
from mpl_toolkits.mplot3d import Axes3D
......@@ -31,105 +31,109 @@ for index, fam in jobstats.iterrows():
maxlengths.append(
families.loc[families["rfam_acc"] == rfam_acc, "maxlength"].values[0])
nchains = [x/1000 for x in nchains] # count in thousands of sequences
comptimes = [x/3600 for x in comptimes] # count in hours
maxlengths = [x/1000 for x in maxlengths] # count in kB
maxmem = [x/1024/1024 for x in maxmem] # count in MB
summary = pd.DataFrame({"family": computed_families, "n_chains": nchains,
"max_length": maxlengths, "comp_time": comptimes, "max_mem": maxmem})
summary.sort_values("max_length", inplace=True)
summary.to_csv("summary.csv")
"max_length(kB)": maxlengths, "comp_time(h)": comptimes, "max_mem(MB)": maxmem})
summary.sort_values("max_length(kB)", inplace=True)
summary.to_csv("results/summary.csv")
# ========================================================
# Plot the data
# ========================================================
fig = plt.figure(dpi=100)
fig = plt.figure(figsize=(12,8), dpi=100)
plt.subplot(231)
plt.scatter(summary.n_chains, summary.max_mem)
plt.xlabel("Number of sequences (x1000 seqs)")
plt.scatter(summary.n_chains, summary["max_mem(MB)"])
plt.xlabel("Number of sequences")
plt.ylabel("Peak memory (MB)")
plt.subplot(232)
plt.scatter(summary.max_length, summary.max_mem)
plt.scatter(summary["max_length(kB)"], summary["max_mem(MB)"])
plt.xlabel("Maximum length of sequences (kB)")
plt.ylabel("Peak memory (MB)")
ax = fig.add_subplot(233, projection='3d')
ax.scatter(summary.n_chains, summary.max_length, summary.max_mem)
ax.set_xlabel("Number of sequences (x1000 seqs)")
ax.scatter(summary.n_chains, summary["max_length(kB)"], summary["max_mem(MB)"])
ax.set_xlabel("Number of sequences")
ax.set_ylabel("Maximum length of sequences (kB)")
ax.set_zlabel("Peak memory (MB)")
plt.subplot(234)
plt.scatter(summary.n_chains, summary.comp_time)
plt.xlabel("Number of sequences (x1000 seqs)")
plt.scatter(summary.n_chains, summary["comp_time(h)"])
plt.xlabel("Number of sequences")
plt.ylabel("Computation time (h)")
plt.subplot(235)
plt.scatter(summary.max_length, summary.comp_time)
plt.scatter(summary["max_length(kB)"], summary["comp_time(h)"])
plt.xlabel("Maximum length of sequences (kB)")
plt.ylabel("Computation time (h)")
ax = fig.add_subplot(236, projection='3d')
ax.scatter(summary.n_chains, summary.max_length, summary.comp_time)
ax.set_xlabel("Number of sequences (x1000 seqs)")
ax.scatter(summary.n_chains, summary["max_length(kB)"], summary["comp_time(h)"])
ax.set_xlabel("Number of sequences")
ax.set_ylabel("Maximum length of sequences (kB)")
ax.set_zlabel("Computation time (h)")
plt.show()
# ========================================================
# Linear Regression of max_mem as function of max_length
# ========================================================
# With scikit-learn
model = LinearRegression(normalize=True, n_jobs=-1)
model.fit(np.array(summary.max_length).reshape(-1, 1), summary.max_mem)
b0 = model.intercept_
b1 = model.coef_[0]
print(f"peak_mem = {b0:.0f} + {b1:.0f} * max_length")
# with scipy
coeffs = scipy.optimize.curve_fit(lambda t, B0, B1: B0+np.exp(B1*t),
np.array(summary.max_length[:-3]), np.array(summary.max_mem[:-3]))[0]
print(f"peak_mem = {coeffs[0]:.0f} + e^({coeffs[1]:.0f} * max_length)")
coeffs_log = scipy.optimize.curve_fit(lambda t, B0, B1: B0+B1*np.log(t),
np.array(summary.max_length), np.array(summary.max_mem), p0=(400, 12000))[0]
print(
f"peak_mem = {coeffs_log[0]:.0f} + {coeffs_log[1]:.0f} * log(max_length)")
# Re-plot
x = np.linspace(0, 10, 1000)
plt.figure()
plt.scatter(summary.max_length, summary.max_mem)
plt.xlabel("Maximum length of sequences (kB)")
plt.ylabel("Peak memory (MB)")
plt.plot(x, b0 + b1*x, "-r", label="linear fit")
plt.plot(x, coeffs[0] + np.exp(coeffs[1]*x), "-g", label="expo fit on [:-3]")
plt.plot(x, coeffs_log[0] + coeffs_log[1]*np.log(x), "-b", label="log fit")
plt.ylim(0, 60000)
plt.legend()
plt.show()
print("Estimated mem required to compute RF02543 and its 11kB sequences:",
model.predict(np.array([11]).reshape(-1, 1)))
# ========================================================
# Linear Regression of comp_time as function of n_chains
# ========================================================
# With scikit-learn
model = LinearRegression(normalize=True, n_jobs=-1)
model.fit(np.array(summary.n_chains).reshape(-1, 1), summary.comp_time)
b0 = model.intercept_
b1 = model.coef_[0]
print(f"comp_time = {b0:.3f} + {b1:.3f} * n_chains")
print("Estimated computation time required for RF02543 and its 38k sequences:",
model.predict(np.array([38]).reshape(-1, 1)))
# Re-plot
x = np.linspace(0, 500, 1000)
plt.figure()
plt.scatter(summary.n_chains, summary.comp_time)
plt.xlabel("Number of sequences (x1000)")
plt.ylabel("Computation time (h)")
plt.plot(x, b0 + b1*x, "-r", label="linear fit")
plt.ylim(0, 10)
plt.legend()
plt.show()
plt.subplots_adjust(wspace=0.4)
plt.savefig("results/realign_jobs_performance.png")
# # ========================================================
# # Linear Regression of max_mem as function of max_length
# # ========================================================
# # With scikit-learn
# model = LinearRegression(normalize=True, n_jobs=-1)
# model.fit(summary["max_length(kB)"].values.reshape(-1, 1), summary["max_mem(MB)"])
# b0 = model.intercept_
# b1 = model.coef_[0]
# print(f"peak_mem = {b0:.0f} + {b1:.0f} * max_length")
# # with scipy
# coeffs = scipy.optimize.curve_fit( lambda t, B0, B1: B0+np.exp(B1*t),
# summary["max_length(kB)"].values,
# summary["max_mem(MB)"].values
# )[0]
# print(f"peak_mem = {coeffs[0]:.0f} + e^({coeffs[1]:.0f} * max_length)")
# coeffs_log = scipy.optimize.curve_fit( lambda t, B0, B1: B0+B1*np.log(t),
# summary["max_length(kB)"].values,
# summary["max_mem(MB)"].values,
# p0=(400, 12000)
# )[0]
# print(f"peak_mem = {coeffs_log[0]:.0f} + {coeffs_log[1]:.0f} * log(max_length)")
# # Re-plot
# x = np.linspace(0, 10, 1000)
# plt.figure()
# plt.scatter(summary["max_length(kB)"], summary["max_mem(MB)"])
# plt.xlabel("Maximum length of sequences (kB)")
# plt.ylabel("Peak memory (MB)")
# plt.plot(x, b0 + b1*x, "-r", label="linear fit")
# plt.plot(x, coeffs[0] + np.exp(coeffs[1]*x), "-g", label="expo fit")
# plt.plot(x, coeffs_log[0] + coeffs_log[1]*np.log(x), "-b", label="log fit")
# plt.legend()
# plt.savefig("results/regression/memory_linear_model.png")
# # ========================================================
# # Linear Regression of comp_time as function of n_chains
# # ========================================================
# # With scikit-learn
# model = LinearRegression(normalize=True, n_jobs=-1)
# model.fit(summary.n_chains.values.reshape(-1, 1), summary["comp_time(h)"])
# b0 = model.intercept_
# b1 = model.coef_[0]
# print(f"comp_time = {b0:.3f} + {b1:.3f} * n_chains")
# # Re-plot
# x = np.linspace(0, 500000, 1000)
# plt.figure()
# plt.scatter(summary.n_chains, summary["comp_time(h)"])
# plt.xlabel("Number of sequences")
# plt.ylabel("Computation time (h)")
# plt.plot(x, b0 + b1*x, "-r", label="linear fit")
# plt.legend()
# plt.savefig("results/regression/comp_time_linear_model.png")
......
,G,C,A,U,-,A2M,OMU,OMG,OMC,7MG,PSU,5MU,4SU,MIA,H2U,U8U,T6A,DJF,6MZ,CM0,5MC,2MG,1MA,YYG,M2G,2MA,QUO,G7M,4OC,YG,AET,2MU,12A,70U,6IA,1MG,GTP,574,I,RSP,RIA,3AU,AG9,ANZ,1RN,N79,365,UBD,9QV,CCC,IU,MA6,UR3,A3P,A23,23G,N,GDP,CBV,4AC,M7A,E3C,B8Q,B8N,C4J,M1Y,JMH,3TD,B9B,E7G,B9H,P7G,I4U,B8H,P4U,B8W,P5P,Y5P,B8T,B8K,E6G,BGH,MHG
RF00001,33.99%,29.98%,20.01%,16.01%,0.01%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00002,26.80%,23.51%,27.36%,21.86%,0.43%,0.01%,0.02%,<.01%,<.01%,<.01%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00004,18.12%,16.77%,23.33%,25.90%,15.82%,0,0,0,0,0,0.06%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00005,31.37%,27.32%,19.93%,17.61%,1.23%,0,<.01%,0.03%,0.07%,0.18%,0.73%,0.41%,0.33%,0.15%,0.20%,0.02%,0.02%,<.01%,0.02%,0.02%,0.14%,0.02%,0.02%,<.01%,0.02%,<.01%,0.02%,0.02%,0.01%,0.01%,<.01%,<.01%,<.01%,<.01%,<.01%,0.02%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00008,31.25%,26.35%,24.16%,18.24%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00009,31.11%,26.48%,20.69%,21.71%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00010,35.64%,29.65%,17.52%,11.12%,6.07%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00011,21.41%,15.95%,17.10%,11.65%,33.89%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00013,25.23%,24.32%,21.62%,19.82%,9.01%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00015,18.15%,14.11%,19.30%,23.34%,25.10%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00017,32.85%,24.43%,19.37%,14.49%,8.73%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.13%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00020,16.76%,19.36%,20.57%,30.63%,12.69%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00023,31.22%,22.68%,11.46%,16.10%,16.59%,0,0,0,0,0,0.98%,0.98%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00026,18.40%,16.77%,25.32%,26.02%,13.45%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.02%,0.02%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00028,27.66%,20.61%,28.66%,22.05%,1.02%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00029,32.73%,21.82%,26.91%,18.55%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00032,17.00%,40.32%,22.92%,19.76%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00037,23.33%,20.00%,23.33%,31.67%,1.67%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00050,28.57%,16.07%,27.68%,23.21%,2.68%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.89%,0,0,0,0,0,0,0,0,0,0,0,0,0.89%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00059,31.16%,23.60%,22.54%,20.11%,2.17%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.21%,0,0,0,0,0,0,0,0,0,0,0,0,0.21%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00061,26.51%,23.06%,15.52%,14.87%,20.04%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00080,28.00%,17.41%,31.06%,22.00%,1.53%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00100,26.23%,24.59%,9.84%,21.31%,18.03%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00162,31.84%,23.64%,29.81%,14.68%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.03%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00164,34.88%,23.26%,25.58%,16.28%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00167,23.89%,22.76%,26.40%,26.79%,0.06%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.03%,0.06%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00168,39.10%,26.42%,19.10%,15.37%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00169,36.50%,31.99%,22.19%,9.32%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00174,35.38%,25.15%,19.59%,12.28%,7.60%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00177,33.24%,24.72%,22.68%,16.89%,2.29%,0,0,<.01%,0,0.01%,0.02%,<.01%,<.01%,0,<.01%,0,0,0,0,0,0.05%,0.02%,0,0,0.01%,0,0,<.01%,0.01%,0,0,0,0,0,<.01%,<.01%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.03%,0.01%,<.01%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00233,28.21%,29.49%,21.79%,20.51%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00234,29.82%,20.65%,23.80%,24.45%,0.65%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.49%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.14%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00250,17.65%,29.41%,35.29%,17.65%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00379,28.42%,24.27%,24.32%,19.37%,3.57%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.05%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00380,24.26%,21.94%,27.64%,24.47%,1.69%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00382,36.40%,26.00%,20.97%,16.63%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00390,13.04%,17.39%,30.43%,39.13%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00442,30.34%,21.35%,28.09%,19.10%,1.12%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00458,18.53%,16.06%,28.60%,30.36%,6.44%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00488,18.10%,13.22%,18.91%,26.63%,22.54%,0,0,0,0,0,0.06%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.53%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00504,30.89%,21.55%,30.64%,14.69%,0.54%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.08%,0,0,0,0,0,0,0,0.60%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF00505,28.33%,28.33%,11.67%,26.67%,5.00%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF01051,28.75%,25.58%,26.41%,13.07%,6.19%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF01357,32.00%,24.00%,20.00%,16.00%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8.00%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF01510,21.88%,24.22%,28.12%,25.78%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF01689,25.81%,22.04%,31.72%,20.43%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF01725,37.91%,29.67%,25.27%,7.14%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF01734,31.37%,31.37%,21.57%,15.69%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF01739,32.79%,27.87%,24.59%,14.75%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF01750,32.13%,23.91%,23.19%,15.22%,5.56%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF01763,38.27%,29.64%,18.76%,6.94%,2.25%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.13%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF01786,27.03%,17.57%,27.03%,27.03%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.35%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF01807,27.17%,23.91%,26.63%,22.28%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF01826,19.23%,15.38%,32.69%,23.08%,7.69%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.92%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF01831,33.06%,19.76%,25.41%,21.77%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF01846,24.93%,22.72%,15.88%,19.09%,17.37%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF01852,28.27%,21.04%,27.74%,22.33%,0.55%,0,0,0,0,0,0.02%,0.02%,<.01%,0,0.02%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.01%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF01854,33.22%,28.24%,20.27%,18.27%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF01857,37.74%,29.53%,18.80%,13.93%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF01960,24.97%,19.86%,24.33%,24.49%,6.22%,0.02%,<.01%,0.01%,0.01%,<.01%,0.02%,<.01%,0,0,0,0,0,0,<.01%,0,<.01%,<.01%,0,0,0,0,0,0,<.01%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,<.01%,<.01%,0,0,0,0.01%,0,0,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF01998,32.10%,21.36%,27.78%,17.72%,1.05%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF02001,26.78%,17.17%,32.96%,21.51%,1.58%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF02012,29.11%,22.15%,23.42%,24.89%,0.42%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF02253,20.69%,24.14%,27.59%,27.59%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF02348,21.52%,16.46%,36.71%,25.32%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF02519,23.53%,14.71%,29.41%,29.41%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.94%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF02540,29.03%,23.70%,24.55%,17.33%,5.28%,0,0.02%,0.02%,0,0,0.02%,0,0,0,0,0,0,0,0,0,0,0,0.02%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.02%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF02541,33.08%,24.10%,23.09%,16.25%,3.38%,0,<.01%,<.01%,<.01%,<.01%,0.04%,0.01%,0,0,<.01%,0,0,0,<.01%,0,0.01%,<.01%,0,0,0,<.01%,0,<.01%,<.01%,0,0,<.01%,0,0,0,<.01%,<.01%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,<.01%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF02543,25.03%,18.64%,20.96%,18.86%,16.44%,0.01%,<.01%,0.02%,<.01%,<.01%,<.01%,<.01%,0,0,<.01%,0,0,0,<.01%,0,<.01%,<.01%,<.01%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,<.01%,0,0,0,0,0,0,0,<.01%,0,<.01%,0,0,0,<.01%,0,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%,<.01%
RF02545,9.88%,4.94%,35.83%,38.95%,10.39%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF02546,2.40%,1.07%,16.80%,11.73%,68.00%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF02553,32.50%,22.50%,20.00%,23.75%,1.25%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF02680,28.71%,29.70%,19.80%,18.81%,1.98%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.99%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF02683,31.40%,24.42%,29.07%,13.95%,1.16%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RF02796,33.33%,36.84%,17.54%,12.28%,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
This diff could not be displayed because it is too large.
,cWW,tSH,tWH,tHS,other,tWW,tSS,tHW,cSH,cSW,cSS,tSW,cWH,cWS,tWS,tHH,cHW,cHH,cHS
RF00001,61.87%,4.31%,3.21%,1.98%,3.33%,0.42%,0.97%,2.64%,5.30%,5.61%,0.11%,4.14%,0.61%,3.04%,0.93%,0.53%,0.89%,<.01%,0.10%
RF00002,62.36%,5.36%,2.71%,6.11%,1.72%,2.25%,1.23%,2.54%,1.87%,4.10%,0.63%,1.50%,1.14%,0.68%,0.57%,3.20%,1.38%,0.59%,0.05%
RF00004,85.28%,3.30%,5.23%,0.96%,0.69%,0.14%,0 %,0 %,0.28%,0.28%,0 %,0.69%,0.55%,0 %,0 %,0 %,0.28%,0.28%,2.06%
RF00005,70.47%,0.91%,6.92%,0.09%,1.74%,3.56%,0.08%,3.29%,0.53%,0.52%,0.22%,1.75%,1.24%,2.00%,2.31%,1.71%,0.65%,0.48%,1.53%
RF00008,64.74%,4.62%,8.09%,2.89%,1.16%,0 %,0 %,0 %,1.16%,5.20%,0 %,1.16%,0.58%,4.05%,4.62%,1.73%,0 %,0 %,0 %
RF00009,81.68%,0.58%,2.53%,0.58%,0.97%,0 %,0.39%,1.36%,1.17%,2.73%,0.97%,2.34%,0.58%,0.78%,0.78%,0 %,1.36%,0.39%,0.78%
RF00010,69.24%,2.58%,4.60%,0.37%,3.31%,0.55%,1.29%,0.92%,2.03%,2.76%,2.39%,2.76%,0.18%,1.84%,1.66%,0.55%,2.21%,0 %,0.74%
RF00011,64.71%,4.50%,4.50%,1.04%,3.46%,2.08%,2.42%,2.77%,3.11%,1.04%,1.38%,2.08%,2.08%,1.04%,1.04%,1.04%,1.73%,0 %,0 %
RF00013,89.66%,3.45%,0 %,0 %,3.45%,3.45%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
RF00015,86.76%,4.18%,0.70%,3.48%,0.70%,0 %,0 %,0 %,0.70%,0.35%,0 %,1.74%,0.35%,0 %,0 %,0.35%,0 %,0.70%,0 %
RF00017,75.15%,2.90%,3.05%,0.76%,3.35%,2.74%,0.46%,1.68%,1.07%,0.30%,2.13%,2.59%,1.68%,0.30%,0 %,0 %,0.91%,0.91%,0 %
RF00020,88.26%,0.73%,2.39%,0.37%,0.55%,0.73%,0 %,0 %,0.73%,1.10%,1.28%,1.10%,0.37%,1.28%,0 %,0 %,0.73%,0 %,0.37%
RF00023,73.83%,1.87%,12.15%,0.93%,1.87%,0.93%,0 %,0.93%,0 %,1.87%,0 %,0 %,0 %,1.87%,3.74%,0 %,0 %,0 %,0 %
RF00026,81.41%,3.66%,6.15%,1.17%,0.44%,1.17%,0 %,0 %,0.29%,0.44%,0.15%,1.02%,0.29%,0.29%,0.44%,0.15%,0.15%,0.29%,2.49%
RF00028,65.73%,2.86%,2.64%,3.83%,2.16%,1.62%,2.91%,2.05%,3.12%,1.29%,1.94%,0.38%,1.67%,0.54%,1.45%,0.22%,4.58%,0.86%,0.16%
RF00029,80.70%,6.14%,0 %,0 %,0 %,3.51%,0 %,3.51%,0 %,0.88%,0 %,0 %,0.88%,0.88%,0 %,0 %,0.88%,0 %,2.63%
RF00032,100.00%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
RF00037,100.00%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
RF00050,68.39%,3.87%,7.74%,3.87%,2.26%,0.32%,5.48%,0 %,0 %,0 %,5.81%,0 %,0 %,0.32%,0 %,0 %,1.94%,0 %,0 %
RF00059,60.28%,1.50%,4.97%,3.70%,2.54%,1.85%,5.31%,0 %,0 %,0 %,7.16%,4.97%,4.50%,0.35%,0.12%,1.85%,0.23%,0.69%,0 %
RF00061,77.86%,3.05%,2.29%,2.29%,0 %,2.29%,0 %,1.53%,2.29%,0 %,0 %,0.76%,0.76%,2.29%,0 %,1.53%,2.29%,0 %,0.76%
RF00080,84.19%,6.45%,0 %,0 %,2.26%,0 %,1.94%,0 %,4.19%,0 %,0 %,0.65%,0 %,0 %,0 %,0 %,0 %,0 %,0.32%
RF00100,65.22%,0 %,4.35%,0 %,5.07%,0.72%,0 %,8.70%,0 %,0 %,0 %,2.90%,13.04%,0 %,0 %,0 %,0 %,0 %,0 %
RF00162,73.74%,6.90%,0.07%,2.15%,0.96%,0 %,0.59%,0 %,2.52%,2.82%,4.15%,2.37%,0.07%,0.45%,3.04%,0 %,0 %,0.15%,0 %
RF00164,76.19%,4.76%,0 %,0 %,0 %,0 %,0 %,0 %,4.76%,4.76%,9.52%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
RF00167,67.80%,0 %,7.77%,0.23%,2.51%,0 %,0 %,2.63%,2.22%,3.10%,2.63%,2.98%,0 %,5.14%,2.63%,0.29%,0.06%,0 %,0 %
RF00168,76.92%,4.74%,1.95%,2.41%,0.45%,1.20%,1.20%,2.41%,3.23%,1.20%,0.68%,1.43%,0.98%,0 %,0 %,1.20%,0 %,0 %,0 %
RF00169,70.92%,9.56%,3.19%,0.80%,4.78%,0 %,0.40%,9.16%,0 %,0 %,0 %,0 %,0.80%,0 %,0.40%,0 %,0 %,0 %,0 %
RF00174,71.01%,2.90%,5.07%,4.35%,2.90%,0.72%,1.45%,2.17%,0 %,2.17%,2.90%,1.45%,0.72%,2.17%,0 %,0 %,0 %,0 %,0 %
RF00177,63.05%,3.95%,4.48%,2.84%,3.20%,2.13%,2.18%,2.57%,2.50%,2.24%,2.00%,1.72%,2.02%,1.58%,1.44%,0.78%,0.70%,0.34%,0.29%
RF00233,72.06%,1.47%,7.35%,2.94%,0 %,2.94%,0 %,0 %,4.41%,0 %,2.94%,1.47%,2.94%,0 %,0 %,0 %,1.47%,0 %,0 %
RF00234,73.03%,1.96%,0.68%,0.64%,1.28%,1.96%,2.42%,5.29%,2.92%,0.59%,0.41%,7.07%,1.32%,0 %,0.23%,0 %,0.18%,0 %,0 %
RF00250,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
RF00379,71.10%,6.46%,1.46%,7.07%,1.10%,0.12%,3.29%,0.24%,2.93%,1.46%,1.95%,1.59%,0.61%,0 %,0 %,0.12%,0.49%,0 %,0 %
RF00380,64.46%,5.37%,1.24%,2.07%,6.20%,3.31%,2.89%,4.96%,2.48%,1.24%,2.07%,0 %,0 %,1.24%,1.24%,0 %,1.24%,0 %,0 %
RF00382,50.00%,0 %,0 %,0 %,20.59%,0 %,0 %,0 %,0 %,0 %,0 %,2.94%,20.59%,0 %,0 %,0 %,0 %,5.88%,0 %
RF00390,55.17%,0 %,0 %,0 %,6.90%,0 %,0 %,0 %,13.79%,6.90%,0 %,0 %,17.24%,0 %,0 %,0 %,0 %,0 %,0 %
RF00442,56.52%,6.52%,6.52%,2.17%,8.70%,2.17%,2.17%,2.17%,0 %,4.35%,2.17%,0 %,4.35%,0 %,0 %,2.17%,0 %,0 %,0 %
RF00458,70.22%,3.37%,5.06%,0 %,5.34%,1.97%,0 %,1.40%,1.97%,1.97%,0.28%,0.28%,2.81%,1.97%,0.84%,0.84%,0.56%,0.84%,0.28%
RF00488,91.95%,0.20%,0 %,0.20%,0.80%,1.41%,0.10%,0.50%,0.91%,1.21%,0.10%,0.30%,0.70%,0.70%,0 %,0 %,0.30%,0.50%,0.10%
RF00504,72.66%,3.88%,2.59%,7.77%,3.02%,0 %,2.45%,0.29%,2.59%,0 %,1.58%,0 %,0 %,0 %,0.14%,0.14%,2.88%,0 %,0 %
RF00505,100.00%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
RF01051,64.48%,5.37%,0 %,2.84%,4.93%,0 %,2.84%,4.18%,4.33%,2.09%,1.49%,1.94%,0.60%,3.43%,0.60%,0.60%,0 %,0.15%,0.15%
RF01357,80.00%,10.00%,0 %,0 %,0 %,0 %,0 %,0 %,10.00%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
RF01510,85.62%,0 %,0 %,0 %,1.09%,0 %,0 %,0 %,3.27%,0.22%,0 %,0 %,0 %,6.32%,3.49%,0 %,0 %,0 %,0 %
RF01689,75.95%,3.80%,5.06%,0 %,1.27%,5.06%,0 %,0.63%,1.27%,0 %,1.27%,3.16%,0 %,0 %,2.53%,0 %,0 %,0 %,0 %
RF01725,71.25%,7.50%,0 %,0 %,1.25%,0 %,5.00%,0 %,5.00%,0 %,5.00%,2.50%,0 %,0 %,2.50%,0 %,0 %,0 %,0 %
RF01734,75.76%,8.08%,0 %,0 %,0 %,5.05%,3.03%,5.05%,0 %,0 %,1.01%,2.02%,0 %,0 %,0 %,0 %,0 %,0 %,0 %
RF01739,61.06%,3.54%,4.42%,3.54%,7.96%,3.54%,0 %,0 %,3.54%,1.77%,0 %,0 %,3.54%,0 %,0 %,3.54%,3.54%,0 %,0 %
RF01750,79.22%,4.55%,0 %,3.90%,1.30%,0 %,0 %,1.30%,0 %,0 %,3.90%,0 %,1.30%,0 %,0 %,0 %,4.55%,0 %,0 %
RF01763,42.70%,0.28%,5.23%,0 %,12.67%,3.58%,0 %,0 %,2.20%,0 %,3.03%,2.75%,20.94%,6.61%,0 %,0 %,0 %,0 %,0 %
RF01786,76.39%,2.78%,5.56%,2.78%,1.39%,0 %,0 %,2.78%,5.56%,0 %,0 %,0 %,0 %,0 %,2.78%,0 %,0 %,0 %,0 %
RF01807,74.12%,3.53%,2.35%,0 %,2.35%,4.71%,2.35%,1.18%,0 %,1.18%,0 %,1.18%,2.35%,1.18%,0 %,1.18%,0 %,0 %,2.35%
RF01826,50.00%,0 %,8.33%,4.17%,4.17%,4.17%,4.17%,0 %,0 %,0 %,4.17%,0 %,20.83%,0 %,0 %,0 %,0 %,0 %,0 %
RF01831,78.61%,1.19%,2.97%,1.98%,1.19%,0 %,3.56%,3.96%,1.78%,2.38%,0 %,0 %,0 %,0 %,2.38%,0 %,0 %,0 %,0 %
RF01846,86.57%,3.14%,0.43%,1.71%,1.00%,0.57%,0.29%,1.43%,0.29%,1.14%,0 %,1.00%,0.43%,0.57%,0.29%,0.29%,0.86%,0 %,0 %
RF01852,71.41%,0.42%,1.47%,0.10%,4.63%,1.18%,0.06%,4.89%,4.63%,2.20%,0.03%,0.45%,6.65%,0.22%,0.64%,0 %,0.77%,0.06%,0.19%
RF01854,68.87%,5.96%,4.64%,3.97%,3.97%,1.99%,2.65%,2.65%,0 %,0 %,1.99%,0 %,1.32%,0 %,0.66%,0 %,1.32%,0 %,0 %
RF01857,71.35%,4.21%,2.81%,0 %,3.93%,2.25%,2.53%,5.34%,0 %,0.56%,1.97%,1.69%,0.56%,1.12%,1.69%,0 %,0 %,0 %,0 %
RF01960,66.53%,3.35%,3.47%,2.51%,3.10%,2.23%,1.24%,2.17%,1.66%,2.49%,1.75%,1.64%,2.30%,1.38%,1.71%,0.42%,1.34%,0.49%,0.22%
RF01998,56.65%,4.92%,4.37%,6.74%,3.10%,0.91%,7.10%,4.01%,2.73%,1.09%,0 %,0.36%,3.64%,0.36%,0 %,3.46%,0.55%,0 %,0 %
RF02001,74.15%,5.56%,0.28%,5.07%,0.83%,0.07%,4.86%,3.47%,0.14%,0 %,0.07%,0.90%,0.63%,0.35%,0.49%,0 %,2.78%,0 %,0.35%
RF02012,76.03%,5.48%,0 %,4.11%,1.37%,0.68%,0 %,0 %,2.74%,0 %,0 %,0 %,1.37%,2.05%,0 %,0 %,4.11%,1.37%,0.68%
RF02253,100.00%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
RF02348,80.00%,5.00%,0 %,3.33%,0 %,0 %,0 %,1.67%,1.67%,3.33%,0 %,0 %,0 %,0 %,0 %,0 %,5.00%,0 %,0 %
RF02519,66.67%,0 %,0 %,0 %,16.67%,0 %,8.33%,0 %,8.33%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
RF02540,60.17%,5.14%,3.83%,3.92%,2.79%,2.53%,3.11%,1.90%,2.22%,1.96%,2.38%,2.25%,1.45%,1.79%,1.50%,1.94%,0.55%,0.28%,0.31%
RF02541,62.00%,4.13%,3.68%,3.79%,2.68%,2.55%,2.84%,2.12%,2.25%,1.87%,2.18%,1.89%,1.71%,1.78%,1.53%,1.61%,0.65%,0.35%,0.38%
RF02543,66.82%,3.48%,2.88%,3.00%,2.51%,2.52%,1.61%,2.09%,1.74%,2.13%,1.88%,1.84%,1.95%,1.51%,1.25%,1.41%,0.74%,0.36%,0.26%
RF02545,65.43%,0.82%,4.12%,2.88%,1.23%,3.70%,1.65%,1.65%,2.47%,2.47%,1.23%,1.23%,0.82%,2.47%,3.70%,2.47%,0.82%,0.82%,0 %
RF02546,82.61%,0 %,8.70%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,4.35%,0 %,0 %,0 %,0 %,4.35%
RF02553,73.68%,2.63%,7.89%,0 %,0 %,2.63%,0 %,0 %,2.63%,0 %,0 %,5.26%,0 %,2.63%,0 %,2.63%,0 %,0 %,0 %
RF02680,88.89%,0 %,2.78%,0 %,2.78%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,5.56%,0 %,0 %,0 %,0 %,0 %,0 %
RF02683,80.56%,2.78%,0 %,5.56%,2.78%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,2.78%,2.78%,0 %,2.78%,0 %,0 %,0 %
RF02796,78.69%,4.92%,0 %,4.92%,4.92%,0 %,0 %,0 %,4.92%,0 %,0 %,1.64%,0 %,0 %,0 %,0 %,0 %,0 %,0 %
TOTAL,63.42%,3.93%,3.83%,3.23%,2.83%,2.35%,2.28%,2.28%,2.26%,2.13%,1.96%,1.88%,1.82%,1.68%,1.46%,1.25%,0.73%,0.35%,0.33%
#!/usr/bin/python3.8
import os, pickle
import os, pickle, sys
import numpy as np
import pandas as pd
import threading as th
......@@ -20,21 +20,12 @@ from collections import Counter
from RNAnet import read_cpu_number
# Default locations of the 3D structure data and of the sequence data.
# Both can be overridden together from the command line:
#     <script> PATH_TO_3D_DATA PATH_TO_SEQ_DATA
# No per-host detection is done: a host-specific value would be replaced by
# these defaults anyway, and exiting on an unrecognised host would make the
# command-line override unusable there.
path_to_3D_data = "/nhome/siniac/lbecquey/Data/RNA/3D/"
path_to_seq_data = "/nhome/siniac/lbecquey/Data/RNA/sequences/"
if len(sys.argv) > 2:
    # Two positional arguments are read, so both must be present.
    # os.path is spelled out in full so only the `os` import is needed here.
    path_to_3D_data = os.path.abspath(sys.argv[1])
    path_to_seq_data = os.path.abspath(sys.argv[2])
elif len(sys.argv) == 2:
    # A single path is ambiguous; stop with a usage message rather than
    # failing on a missing second argument.
    sys.exit("Expected two arguments (PATH_TO_3D_DATA PATH_TO_SEQ_DATA), got one.")
class DataPoint():
def __init__(self, path_to_textfile):
......@@ -80,7 +71,7 @@ def reproduce_wadley_results(points, show=False, carbon=4, sd_range=(1,4)):
c3_endo_etas = []
c2_endo_thetas = []
c3_endo_thetas = []
for p in points:
for p in tqdm(points, desc="Loading eta/thetas", position=worker_nbr, leave=False):
df = p.df.loc[(p.df[angle].isna()==False) & (p.df["th"+angle].isna()==False), ["form","puckering", angle,"th"+angle]]
c2_endo_etas += list(df.loc[ (df.puckering=="C2'-endo"), angle ].values)
c3_endo_etas += list(df.loc[ (df.form=='.') & (df.puckering=="C3'-endo"), angle ].values)
......@@ -112,14 +103,17 @@ def reproduce_wadley_results(points, show=False, carbon=4, sd_range=(1,4)):
f_c2 = f["kernel_c2"]
xx, yy = np.mgrid[0:2*np.pi:100j, 0:2*np.pi:100j]
print(f"[{worker_nbr}]\tKernel computed (or loaded from file).")
# print(f"[{worker_nbr}]\tKernel computed (or loaded from file).")
# exact counts:
hist_c2, xedges, yedges = np.histogram2d(c2_endo_etas, c2_endo_thetas, bins=int(2*np.pi/0.1), range=[[0, 2*np.pi], [0, 2*np.pi]])
hist_c3, xedges, yedges = np.histogram2d(c3_endo_etas, c3_endo_thetas, bins=int(2*np.pi/0.1), range=[[0, 2*np.pi], [0, 2*np.pi]])
color_values = cm.jet(hist_c3.ravel()/hist_c3.max())
for x, y, hist, f, l in zip( (c3_endo_etas, c2_endo_etas), (c3_endo_thetas, c2_endo_thetas), (hist_c3, hist_c2), (f_c3, f_c2), ("c3","c2")):
for x, y, hist, f, l in zip( (c3_endo_etas, c2_endo_etas),
(c3_endo_thetas, c2_endo_thetas),
(hist_c3, hist_c2),
(f_c3, f_c2), ("c3","c2")):
# cut hist and kernel
hist_sup_thr = hist.mean() + sd_range[1]*hist.std()
hist_cut = np.where( hist > hist_sup_thr, hist_sup_thr, hist)
......@@ -136,10 +130,9 @@ def reproduce_wadley_results(points, show=False, carbon=4, sd_range=(1,4)):
ax.bar3d(xpos.ravel(), ypos.ravel(), 0.0, 0.09, 0.09, hist_cut.ravel(), color=color_values, zorder="max")
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
plt.savefig(f"results/wadley_hist_{angle}_{l}.png")
fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}.png")
if show:
plt.show()
plt.close()
fig.show()
# Smoothed joint distribution
fig = plt.figure()
......@@ -147,10 +140,9 @@ def reproduce_wadley_results(points, show=False, carbon=4, sd_range=(1,4)):
ax.plot_surface(xx, yy, f_cut, cmap=cm.coolwarm, linewidth=0, antialiased=True)
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
plt.savefig(f"results/wadley_distrib_{angle}_{l}.png")
fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}.png")
if show:
plt.show()
plt.close()
fig.show()
# 2D Wadley plot
fig = plt.figure(figsize=(5,5))
......@@ -160,15 +152,15 @@ def reproduce_wadley_results(points, show=False, carbon=4, sd_range=(1,4)):
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
fig.savefig(f"results/wadley_{angle}_{l}.png")
fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}.png")
if show:
plt.show()
print(f"[{worker_nbr}]\tComputed joint distribution of angles (C{carbon}) and saved the figures.")
fig.show()
# print(f"[{worker_nbr}]\tComputed joint distribution of angles (C{carbon}) and saved the figures.")
def stats_len(mappings_list, points):
cols = []
lengths = []
for f in sorted(mappings_list.keys()):
for f in tqdm(sorted(mappings_list.keys()), desc="Chain length by family", position=3, leave=False):
if f in ["RF02540","RF02541","RF02543"]:
cols.append("red") # LSU
elif f in ["RF00177","RF01960","RF01959","RF02542"]:
......@@ -187,22 +179,21 @@ def stats_len(mappings_list, points):
l.append(len(r.df['nt_code']))
lengths.append(l)
plt.figure(figsize=(10,3))
ax = plt.gca()
fig = plt.figure(figsize=(10,3))
ax = fig.gca()
ax.hist(lengths, bins=100, stacked=True, log=True, color=cols, label=sorted(mappings_list.keys()))
ax.set_xlabel("Sequence length (nucleotides)")
ax.set_ylabel("Number of 3D chains")
plt.tight_layout()
handles, labels = ax.get_legend_handles_labels()
fig.tight_layout()
filtered_handles = [mpatches.Patch(color='red'), mpatches.Patch(color='white'),
mpatches.Patch(color='blue'), mpatches.Patch(color='white'),
mpatches.Patch(color='green'), mpatches.Patch(color='purple'),
mpatches.Patch(color='orange'), mpatches.Patch(color='grey')]
filtered_labels = ['Large Ribosomal Subunits', '(RF02540, RF02541, RF02543)','Small Ribosomal Subunits','(RF01960, RF00177)',
'5S rRNA (RF00001)', '5.8S rRNA (RF00002)', 'tRNA (RF00005)', 'Other']
ax.legend(filtered_handles, filtered_labels, loc='best', ncol=2)# bbox_to_anchor=(0.5, -0.5), ncol=4, fontsize=)
plt.savefig("results/lengths.png")
print("[3]\tComputed sequence length statistics and saved the figure.")
ax.legend(filtered_handles, filtered_labels, loc='best', ncol=2)
fig.savefig("results/lengths.png")
# print("[3]\tComputed sequence length statistics and saved the figure.")
def format_percentage(tot, x):
if not tot:
......@@ -210,6 +201,8 @@ def format_percentage(tot, x):
x = 100*x/tot
if x >= 0.01:
x = "%.2f" % x
elif x == 0:
return "0 %"
else:
x = "<.01"
return x + '%'
......@@ -219,7 +212,7 @@ def stats_freq(mappings_list, points):
for f in mappings_list.keys():
freqs[f] = Counter()
for r in points:
for r in tqdm(points, desc="Nucleotide frequencies", position=4, leave=False):
freqs[r.family].update(dict(r.df['nt_name'].value_counts()))
df = pd.DataFrame()
......@@ -229,7 +222,7 @@ def stats_freq(mappings_list, points):
df = df.fillna(0)
df.to_csv("results/frequencies.csv")
print("[4]\tComputed nucleotide statistics and saved CSV file.")
# print("[4]\tComputed nucleotide statistics and saved CSV file.")
def stats_pairs(mappings_list, points):
......@@ -242,25 +235,30 @@ def stats_pairs(mappings_list, points):
freqs[f] = Counter()
# Iterate over data points
for r in tqdm(points, desc="RNA points", position=0, leave=False):
# Skip if linear piece of RNA
if not sum([ x != 0 for x in r.df.paired ]):
continue
# Count each pair type within the molecule
vcnts = pd.concat(
[ pd.Series(row['pair_type_LW'].split(','))
for _, row in r.df.dropna(subset=["pair_type_LW"]).iterrows() ]
).reset_index(drop=True).value_counts()
# Add these new counts to the family's counter
freqs[r.family].update(dict(vcnts))
# Create the output dataframe
df = pd.DataFrame()
for f in sorted(mappings_list.keys()):
df = pd.concat([ df, pd.DataFrame([[ x for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ])
df = df.fillna(0)
if not path.isfile("data/pair_counts.csv"):
for r in tqdm(points, desc="Leontis-Westhof basepair stats", position=5, leave=False):
# Skip if linear piece of RNA
if r.df.pair_type_LW.isna().all():
continue
# Count each pair type within the molecule
vcnts = pd.concat(
[ pd.Series(row['pair_type_LW'].split(','))
for _, row in r.df.dropna(subset=["pair_type_LW"]).iterrows() ]
).reset_index(drop=True).value_counts()
# Add these new counts to the family's counter
freqs[r.family].update(dict(vcnts))
# Create the output dataframe
df = pd.DataFrame()
for f in sorted(mappings_list.keys()):
df = pd.concat([ df, pd.DataFrame([[ x for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ])
df = df.fillna(0)
df.to_csv("data/pair_counts.csv")
else:
df = pd.read_csv("data/pair_counts.csv", index_col=0)
# Remove not very well defined pair types (not in the 12 LW types)
col_list = [ x for x in df.columns if '.' in x ]
......@@ -288,7 +286,7 @@ def stats_pairs(mappings_list, points):
plt.subplots_adjust(bottom=0.2, right=0.99)
plt.savefig("results/pairings.png")
print("[5]\tComputed nucleotide statistics and saved CSV and PNG file.")
# print("[5]\tComputed nucleotide statistics and saved CSV and PNG file.")
def to_dist_matrix(f):
if path.isfile("data/"+f+".npy"):
......@@ -311,11 +309,11 @@ def seq_idty(mappings_list):
if len(mappings_list[x]) == 1:
ignored.append(x)
if len(ignored):
print("Ignoring families with only one chain:", " ".join(ignored))
print("Idty matrices: Ignoring families with only one chain:", " ".join(ignored)+'\n')
# compute distance matrices
p = Pool(processes=8)
pbar = tqdm(total=len(famlist), desc="Families idty matrices", position=1, leave=True)
pbar = tqdm(total=len(famlist), desc="Families idty matrices", position=0, leave=False)
for i, _ in enumerate(p.imap_unordered(to_dist_matrix, famlist)):
pbar.update(1)
pbar.close()
......@@ -353,16 +351,17 @@ def seq_idty(mappings_list):
fig.subplots_adjust(wspace=0.1, hspace=0.3)
fig.colorbar(im, ax=axs[-1], shrink=0.8)
fig.savefig(f"results/distances.png")
print("[6]\tComputed identity matrices and saved the figure.")
# print("[6]\tComputed identity matrices and saved the figure.")
if __name__ == "__main__":
#################################################################
# LOAD ALL FILES
#################################################################
os.makedirs("results/figures/wadley_plots/", exist_ok=True)
print("Loading mappings list...")
mappings_list = pd.read_csv(path_to_seq_data + "realigned/mappings_list.csv", sep=',', index_col=0).to_dict(orient='list')
mappings_list = pd.read_csv("results/mappings_list.csv", sep=',', index_col=0).to_dict(orient='list')
for k in mappings_list.keys():
mappings_list[k] = [ x for x in mappings_list[k] if str(x) != 'nan' ]
......@@ -372,9 +371,9 @@ if __name__ == "__main__":
rna_points = pickle.load(f)
else:
rna_points = []
filelist = [path_to_3D_data+"/datapoints/"+f for f in os.listdir(path_to_3D_data+"/datapoints") if ".log" not in f and ".gz" not in f]
filelist = [path_to_3D_data+"/datapoints/"+f for f in os.listdir(path_to_3D_data+"/datapoints") ]
p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(),), processes=read_cpu_number())
pbar = tqdm(total=len(filelist), desc="RNA files", position=0, leave=True)
pbar = tqdm(total=len(filelist), desc="RNA files", position=0, leave=False)
for i, rna in enumerate(p.imap_unordered(load_rna_frome_file, filelist)):
rna_points.append(rna)
pbar.update(1)
......@@ -389,29 +388,18 @@ if __name__ == "__main__":
#################################################################
# Define threads for the tasks
#################################################################
wadley_thr = []
wadley_thr.append(th.Thread(target=reproduce_wadley_results, args=[rna_points], kwargs={'carbon': 1}))
wadley_thr.append(th.Thread(target=reproduce_wadley_results, args=[rna_points], kwargs={'carbon': 4}))
seq_len_thr = th.Thread(target=partial(stats_len, mappings_list), args=[rna_points])
nt_freq_thr = th.Thread(target=partial(stats_freq, mappings_list), args=[rna_points])
pairs_freq_thr = th.Thread(target=partial(stats_pairs, mappings_list), args=[rna_points])
dist_thr = th.Thread(target=seq_idty, args=[mappings_list])
for t in wadley_thr:
threads = [
th.Thread(target=reproduce_wadley_results, args=[rna_points], kwargs={'carbon': 1}),
th.Thread(target=reproduce_wadley_results, args=[rna_points], kwargs={'carbon': 4}),
th.Thread(target=partial(stats_len, mappings_list), args=[rna_points]),
th.Thread(target=partial(stats_freq, mappings_list), args=[rna_points]),
th.Thread(target=partial(stats_pairs, mappings_list), args=[rna_points]),
th.Thread(target=seq_idty, args=[mappings_list])
]
for t in threads:
t.start()
seq_len_thr.start()
nt_freq_thr.start()
pairs_freq_thr.start()
dist_thr.start()
for t in wadley_thr:
for t in threads:
t.join()
seq_len_thr.join()
nt_freq_thr.join()
pairs_freq_thr.join()
dist_thr.join()
......