Pipeline class

This diff is collapsed. Click to expand it.
This diff could not be displayed because it is too large.
# This script is meant to be run periodically as a cron job.
# It re-runs RNANet from its project directory, capturing its output in
# stdout.txt / stderr.txt, then starts and stops the Seafile client.

# Run RNANet
# Abort if the project directory is missing, so that the cleanup and the run
# below never happen in whatever directory cron started us in.
cd /home/lbecquey/Projects/RNANet || exit 1;
rm -f stdout.txt stderr.txt errors.txt;
# The program and its options must be separate shell words: quoting the whole
# command line makes the shell look for a single executable literally named
# "./RNAnet.py --3d-folder ...", which does not exist, so RNANet never runs.
# NOTE(review): the 3D folder used to read /home/lbequey/...; assumed to be a
# typo for /home/lbecquey/ like every other path here -- confirm.
time ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ > stdout.txt 2> stderr.txt;
# Sync in Seafile
seaf-cli start;
seaf-cli stop;
DSSR warning for 1gsg_1_T_1-72: no nucleotides found
# This file is supposed to propose regression models on the computation time and mem usage of the re-alignment jobs.
# Light jobs are monitored by the Monitor class in RNAnet.py, and the measures are saved in jobstats.csv.
# Jobs are monitored by the Monitor class in RNAnet.py, and the measures are saved in jobstats.csv.
# This was done to guess the amount of memory required to re-align the large ribosomal subunit families RF02541 and RF02543.
# INFO: Our home hardware was a 32-core VM with 50GB RAM + 8GB Swap.
# INFO: Our home hardware was a 32-core VM with 50GB RAM
# The conclusion of this was to move to SINA for ribosomal subunits.
# However, this was before we used cmalign with --small, which is after all required for RF00005, RF00382 and RF01852
# (we do not understand why the last two, very small families require that much memory).
# Feedback would be appreciated on whether it is better to
# - Use a specialised database (SILVA) : better alignments (we guess?), but two kinds of jobs
# - Use cmalign --small everywhere (homogeneity)
# Moreover, --small requires --nonbanded --cyk, which means the output alignment is the optimally scored one.
# To date, we trust Infernal as the best tool to realign RNA. Is it?
# Contact: louis.becquey@univ-evry.fr (PhD student), fariza.tahi@univ-evry.fr (PI)
# Running this file is not required to compute the dataset.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy, os
from sklearn.linear_model import LinearRegression
import scipy, os, sqlite3
# from sklearn.linear_model import LinearRegression
from mpl_toolkits.mplot3d import Axes3D
pd.set_option('display.max_rows', None)
LSU_set = ["RF00002", "RF02540", "RF02541", "RF02543", "RF02546"] # From Rfam CLAN 00112
SSU_set = ["RF00177", "RF02542", "RF02545", "RF01959", "RF01960"] # From Rfam CLAN 00111
with sqlite3.connect("results/RNANet.db") as conn:
df = pd.read_sql("SELECT rfam_acc, max_len, nb_total_homol, comput_time, comput_peak_mem FROM family;", conn)
jobstats = pd.read_csv("data/jobstats.csv", sep=",")
families = pd.read_csv("data/statistics.csv", sep=",")
computed_families = []
comptimes = []
maxmem = []
nchains = []
maxlengths = []
for index, fam in jobstats.iterrows():
if fam["max_mem"] != -1 and fam["comp_time"] != -1:
rfam_acc = fam["label"].split(' ')[1]
families.loc[families["rfam_acc"] == rfam_acc, "total_seqs"].values[0])
families.loc[families["rfam_acc"] == rfam_acc, "maxlength"].values[0])
# Rescale the raw measures to the units named in the summary columns, then build the summary table.
# NOTE(review): the divisors suggest inputs in seconds, bases and bytes respectively -- confirm against jobstats.csv.
comptimes = [x/3600 for x in comptimes] # expressed in hours
maxlengths = [x/1000 for x in maxlengths] # expressed in kB
maxmem = [x/1024/1024 for x in maxmem] # expressed in MB
summary = pd.DataFrame({"family": computed_families, "n_chains": nchains,
"max_length(kB)": maxlengths, "comp_time(h)": comptimes, "max_mem(MB)": maxmem})
# Order the rows by increasing maximum sequence length (sorted in place).
summary.sort_values("max_length(kB)", inplace=True)
to_remove = [ f for f in df.rfam_acc if f in LSU_set+SSU_set ]
df = df.set_index('rfam_acc').drop(to_remove)
# ========================================================
# Plot the data
......@@ -47,39 +42,39 @@ summary.to_csv("results/summary.csv")
fig = plt.figure(figsize=(12,8), dpi=100)
plt.scatter(summary.n_chains, summary["max_mem(MB)"])
plt.scatter(df.nb_total_homol, df.comput_peak_mem)
plt.xlabel("Number of sequences")
plt.ylabel("Peak memory (MB)")
plt.scatter(summary["max_length(kB)"], summary["max_mem(MB)"])
plt.xlabel("Maximum length of sequences (kB)")
plt.scatter(df.max_len, df.comput_peak_mem)
plt.xlabel("Maximum length of sequences ")
plt.ylabel("Peak memory (MB)")
ax = fig.add_subplot(233, projection='3d')
ax.scatter(summary.n_chains, summary["max_length(kB)"], summary["max_mem(MB)"])
ax.scatter(df.nb_total_homol, df.max_len, df.comput_peak_mem)
ax.set_xlabel("Number of sequences")
ax.set_ylabel("Maximum length of sequences (kB)")
ax.set_ylabel("Maximum length of sequences ")
ax.set_zlabel("Peak memory (MB)")
plt.scatter(summary.n_chains, summary["comp_time(h)"])
plt.scatter(df.nb_total_homol, df.comput_time)
plt.xlabel("Number of sequences")
plt.ylabel("Computation time (h)")
plt.ylabel("Computation time (s)")
plt.scatter(summary["max_length(kB)"], summary["comp_time(h)"])
plt.xlabel("Maximum length of sequences (kB)")
plt.ylabel("Computation time (h)")
plt.scatter(df.max_len, df.comput_time)
plt.xlabel("Maximum length of sequences ")
plt.ylabel("Computation time (s)")
ax = fig.add_subplot(236, projection='3d')
ax.scatter(summary.n_chains, summary["max_length(kB)"], summary["comp_time(h)"])
ax.scatter(df.nb_total_homol, df.max_len, df.comput_time)
ax.set_xlabel("Number of sequences")
ax.set_ylabel("Maximum length of sequences (kB)")
ax.set_zlabel("Computation time (h)")
ax.set_ylabel("Maximum length of sequences ")
ax.set_zlabel("Computation time (s)")
# # ========================================================
# # Linear Regression of max_mem as function of max_length
......@@ -87,20 +82,20 @@ plt.savefig("results/realign_jobs_performance.png")
# # With scikit-learn
# model = LinearRegression(normalize=True, n_jobs=-1)
# model.fit(summary["max_length(kB)"].values.reshape(-1, 1), summary["max_mem(MB)"])
# model.fit(df.max_len.values.reshape(-1, 1), df.comput_peak_mem)
# b0 = model.intercept_
# b1 = model.coef_[0]
# print(f"peak_mem = {b0:.0f} + {b1:.0f} * max_length")
# # with scipy
# coeffs = scipy.optimize.curve_fit( lambda t, B0, B1: B0+np.exp(B1*t),
# summary["max_length(kB)"].values,
# summary["max_mem(MB)"].values
# df.max_len.values,
# df.comput_peak_mem.values
# )[0]
# print(f"peak_mem = {coeffs[0]:.0f} + e^({coeffs[1]:.0f} * max_length)")
# coeffs_log = scipy.optimize.curve_fit( lambda t, B0, B1: B0+B1*np.log(t),
# summary["max_length(kB)"].values,
# summary["max_mem(MB)"].values,
# df.max_len.values,
# df.comput_peak_mem.values,
# p0=(400, 12000)
# )[0]
# print(f"peak_mem = {coeffs_log[0]:.0f} + {coeffs_log[1]:.0f} * log(max_length)")
......@@ -108,8 +103,8 @@ plt.savefig("results/realign_jobs_performance.png")
# # Re-plot
# x = np.linspace(0, 10, 1000)
# plt.figure()
# plt.scatter(summary["max_length(kB)"], summary["max_mem(MB)"])
# plt.xlabel("Maximum length of sequences (kB)")
# plt.scatter(df.max_len, df.comput_peak_mem)
# plt.xlabel("Maximum length of sequences ")
# plt.ylabel("Peak memory (MB)")
# plt.plot(x, b0 + b1*x, "-r", label="linear fit")
# plt.plot(x, coeffs[0] + np.exp(coeffs[1]*x), "-g", label="expo fit")
......@@ -123,7 +118,7 @@ plt.savefig("results/realign_jobs_performance.png")
# # With scikit-learn
# model = LinearRegression(normalize=True, n_jobs=-1)
# model.fit(summary.n_chains.values.reshape(-1, 1), summary["comp_time(h)"])
# model.fit(df.nb_total_homol.values.reshape(-1, 1), df.comput_time)
# b0 = model.intercept_
# b1 = model.coef_[0]
# print(f"comp_time = {b0:.3f} + {b1:.3f} * n_chains")
......@@ -131,9 +126,9 @@ plt.savefig("results/realign_jobs_performance.png")
# # Re-plot
# x = np.linspace(0, 500000, 1000)
# plt.figure()
# plt.scatter(summary.n_chains, summary["comp_time(h)"])
# plt.scatter(df.nb_total_homol, df.comput_time)
# plt.xlabel("Number of sequences")
# plt.ylabel("Computation time (h)")
# plt.ylabel("Computation time (s)")
# plt.plot(x, b0 + b1*x, "-r", label="linear fit")
# plt.legend()
# plt.savefig("results/regression/comp_time_linear_model.png")
This diff is collapsed. Click to expand it.
This diff could not be displayed because it is too large.
RF00004,85.28%,3.30%,5.23%,0.96%,0.69%,0.14%,0 %,0 %,0.28%,0.28%,0 %,0.69%,0.55%,0 %,0 %,0 %,0.28%,0.28%,2.06%
RF00008,64.74%,4.62%,8.09%,2.89%,1.16%,0 %,0 %,0 %,1.16%,5.20%,0 %,1.16%,0.58%,4.05%,4.62%,1.73%,0 %,0 %,0 %
RF00009,81.68%,0.58%,2.53%,0.58%,0.97%,0 %,0.39%,1.36%,1.17%,2.73%,0.97%,2.34%,0.58%,0.78%,0.78%,0 %,1.36%,0.39%,0.78%
RF00010,69.24%,2.58%,4.60%,0.37%,3.31%,0.55%,1.29%,0.92%,2.03%,2.76%,2.39%,2.76%,0.18%,1.84%,1.66%,0.55%,2.21%,0 %,0.74%
RF00011,64.71%,4.50%,4.50%,1.04%,3.46%,2.08%,2.42%,2.77%,3.11%,1.04%,1.38%,2.08%,2.08%,1.04%,1.04%,1.04%,1.73%,0 %,0 %
RF00013,89.66%,3.45%,0 %,0 %,3.45%,3.45%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
RF00015,86.76%,4.18%,0.70%,3.48%,0.70%,0 %,0 %,0 %,0.70%,0.35%,0 %,1.74%,0.35%,0 %,0 %,0.35%,0 %,0.70%,0 %
RF00017,75.15%,2.90%,3.05%,0.76%,3.35%,2.74%,0.46%,1.68%,1.07%,0.30%,2.13%,2.59%,1.68%,0.30%,0 %,0 %,0.91%,0.91%,0 %
RF00020,88.26%,0.73%,2.39%,0.37%,0.55%,0.73%,0 %,0 %,0.73%,1.10%,1.28%,1.10%,0.37%,1.28%,0 %,0 %,0.73%,0 %,0.37%
RF00023,73.83%,1.87%,12.15%,0.93%,1.87%,0.93%,0 %,0.93%,0 %,1.87%,0 %,0 %,0 %,1.87%,3.74%,0 %,0 %,0 %,0 %
RF00026,81.41%,3.66%,6.15%,1.17%,0.44%,1.17%,0 %,0 %,0.29%,0.44%,0.15%,1.02%,0.29%,0.29%,0.44%,0.15%,0.15%,0.29%,2.49%
RF00029,80.70%,6.14%,0 %,0 %,0 %,3.51%,0 %,3.51%,0 %,0.88%,0 %,0 %,0.88%,0.88%,0 %,0 %,0.88%,0 %,2.63%
RF00032,100.00%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
RF00037,100.00%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
RF00050,68.39%,3.87%,7.74%,3.87%,2.26%,0.32%,5.48%,0 %,0 %,0 %,5.81%,0 %,0 %,0.32%,0 %,0 %,1.94%,0 %,0 %
RF00059,60.28%,1.50%,4.97%,3.70%,2.54%,1.85%,5.31%,0 %,0 %,0 %,7.16%,4.97%,4.50%,0.35%,0.12%,1.85%,0.23%,0.69%,0 %
RF00061,77.86%,3.05%,2.29%,2.29%,0 %,2.29%,0 %,1.53%,2.29%,0 %,0 %,0.76%,0.76%,2.29%,0 %,1.53%,2.29%,0 %,0.76%
RF00080,84.19%,6.45%,0 %,0 %,2.26%,0 %,1.94%,0 %,4.19%,0 %,0 %,0.65%,0 %,0 %,0 %,0 %,0 %,0 %,0.32%
RF00100,65.22%,0 %,4.35%,0 %,5.07%,0.72%,0 %,8.70%,0 %,0 %,0 %,2.90%,13.04%,0 %,0 %,0 %,0 %,0 %,0 %
RF00162,73.74%,6.90%,0.07%,2.15%,0.96%,0 %,0.59%,0 %,2.52%,2.82%,4.15%,2.37%,0.07%,0.45%,3.04%,0 %,0 %,0.15%,0 %
RF00164,76.19%,4.76%,0 %,0 %,0 %,0 %,0 %,0 %,4.76%,4.76%,9.52%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
RF00167,67.80%,0 %,7.77%,0.23%,2.51%,0 %,0 %,2.63%,2.22%,3.10%,2.63%,2.98%,0 %,5.14%,2.63%,0.29%,0.06%,0 %,0 %
RF00168,76.92%,4.74%,1.95%,2.41%,0.45%,1.20%,1.20%,2.41%,3.23%,1.20%,0.68%,1.43%,0.98%,0 %,0 %,1.20%,0 %,0 %,0 %
RF00169,70.92%,9.56%,3.19%,0.80%,4.78%,0 %,0.40%,9.16%,0 %,0 %,0 %,0 %,0.80%,0 %,0.40%,0 %,0 %,0 %,0 %
RF00174,71.01%,2.90%,5.07%,4.35%,2.90%,0.72%,1.45%,2.17%,0 %,2.17%,2.90%,1.45%,0.72%,2.17%,0 %,0 %,0 %,0 %,0 %
RF00233,72.06%,1.47%,7.35%,2.94%,0 %,2.94%,0 %,0 %,4.41%,0 %,2.94%,1.47%,2.94%,0 %,0 %,0 %,1.47%,0 %,0 %
RF00234,73.03%,1.96%,0.68%,0.64%,1.28%,1.96%,2.42%,5.29%,2.92%,0.59%,0.41%,7.07%,1.32%,0 %,0.23%,0 %,0.18%,0 %,0 %
RF00250,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
RF00379,71.10%,6.46%,1.46%,7.07%,1.10%,0.12%,3.29%,0.24%,2.93%,1.46%,1.95%,1.59%,0.61%,0 %,0 %,0.12%,0.49%,0 %,0 %
RF00380,64.46%,5.37%,1.24%,2.07%,6.20%,3.31%,2.89%,4.96%,2.48%,1.24%,2.07%,0 %,0 %,1.24%,1.24%,0 %,1.24%,0 %,0 %
RF00382,50.00%,0 %,0 %,0 %,20.59%,0 %,0 %,0 %,0 %,0 %,0 %,2.94%,20.59%,0 %,0 %,0 %,0 %,5.88%,0 %
RF00390,55.17%,0 %,0 %,0 %,6.90%,0 %,0 %,0 %,13.79%,6.90%,0 %,0 %,17.24%,0 %,0 %,0 %,0 %,0 %,0 %
RF00442,56.52%,6.52%,6.52%,2.17%,8.70%,2.17%,2.17%,2.17%,0 %,4.35%,2.17%,0 %,4.35%,0 %,0 %,2.17%,0 %,0 %,0 %
RF00458,70.22%,3.37%,5.06%,0 %,5.34%,1.97%,0 %,1.40%,1.97%,1.97%,0.28%,0.28%,2.81%,1.97%,0.84%,0.84%,0.56%,0.84%,0.28%
RF00488,91.95%,0.20%,0 %,0.20%,0.80%,1.41%,0.10%,0.50%,0.91%,1.21%,0.10%,0.30%,0.70%,0.70%,0 %,0 %,0.30%,0.50%,0.10%
RF00504,72.66%,3.88%,2.59%,7.77%,3.02%,0 %,2.45%,0.29%,2.59%,0 %,1.58%,0 %,0 %,0 %,0.14%,0.14%,2.88%,0 %,0 %
RF00505,100.00%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
RF01051,64.48%,5.37%,0 %,2.84%,4.93%,0 %,2.84%,4.18%,4.33%,2.09%,1.49%,1.94%,0.60%,3.43%,0.60%,0.60%,0 %,0.15%,0.15%
RF01357,80.00%,10.00%,0 %,0 %,0 %,0 %,0 %,0 %,10.00%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
RF01510,85.62%,0 %,0 %,0 %,1.09%,0 %,0 %,0 %,3.27%,0.22%,0 %,0 %,0 %,6.32%,3.49%,0 %,0 %,0 %,0 %
RF01689,75.95%,3.80%,5.06%,0 %,1.27%,5.06%,0 %,0.63%,1.27%,0 %,1.27%,3.16%,0 %,0 %,2.53%,0 %,0 %,0 %,0 %
RF01725,71.25%,7.50%,0 %,0 %,1.25%,0 %,5.00%,0 %,5.00%,0 %,5.00%,2.50%,0 %,0 %,2.50%,0 %,0 %,0 %,0 %
RF01734,75.76%,8.08%,0 %,0 %,0 %,5.05%,3.03%,5.05%,0 %,0 %,1.01%,2.02%,0 %,0 %,0 %,0 %,0 %,0 %,0 %
RF01739,61.06%,3.54%,4.42%,3.54%,7.96%,3.54%,0 %,0 %,3.54%,1.77%,0 %,0 %,3.54%,0 %,0 %,3.54%,3.54%,0 %,0 %
RF01750,79.22%,4.55%,0 %,3.90%,1.30%,0 %,0 %,1.30%,0 %,0 %,3.90%,0 %,1.30%,0 %,0 %,0 %,4.55%,0 %,0 %
RF01763,42.70%,0.28%,5.23%,0 %,12.67%,3.58%,0 %,0 %,2.20%,0 %,3.03%,2.75%,20.94%,6.61%,0 %,0 %,0 %,0 %,0 %
RF01786,76.39%,2.78%,5.56%,2.78%,1.39%,0 %,0 %,2.78%,5.56%,0 %,0 %,0 %,0 %,0 %,2.78%,0 %,0 %,0 %,0 %
RF01807,74.12%,3.53%,2.35%,0 %,2.35%,4.71%,2.35%,1.18%,0 %,1.18%,0 %,1.18%,2.35%,1.18%,0 %,1.18%,0 %,0 %,2.35%
RF01826,50.00%,0 %,8.33%,4.17%,4.17%,4.17%,4.17%,0 %,0 %,0 %,4.17%,0 %,20.83%,0 %,0 %,0 %,0 %,0 %,0 %
RF01831,78.61%,1.19%,2.97%,1.98%,1.19%,0 %,3.56%,3.96%,1.78%,2.38%,0 %,0 %,0 %,0 %,2.38%,0 %,0 %,0 %,0 %
RF01846,86.57%,3.14%,0.43%,1.71%,1.00%,0.57%,0.29%,1.43%,0.29%,1.14%,0 %,1.00%,0.43%,0.57%,0.29%,0.29%,0.86%,0 %,0 %
RF01852,71.41%,0.42%,1.47%,0.10%,4.63%,1.18%,0.06%,4.89%,4.63%,2.20%,0.03%,0.45%,6.65%,0.22%,0.64%,0 %,0.77%,0.06%,0.19%
RF01854,68.87%,5.96%,4.64%,3.97%,3.97%,1.99%,2.65%,2.65%,0 %,0 %,1.99%,0 %,1.32%,0 %,0.66%,0 %,1.32%,0 %,0 %
RF01857,71.35%,4.21%,2.81%,0 %,3.93%,2.25%,2.53%,5.34%,0 %,0.56%,1.97%,1.69%,0.56%,1.12%,1.69%,0 %,0 %,0 %,0 %
RF01998,56.65%,4.92%,4.37%,6.74%,3.10%,0.91%,7.10%,4.01%,2.73%,1.09%,0 %,0.36%,3.64%,0.36%,0 %,3.46%,0.55%,0 %,0 %
RF02001,74.15%,5.56%,0.28%,5.07%,0.83%,0.07%,4.86%,3.47%,0.14%,0 %,0.07%,0.90%,0.63%,0.35%,0.49%,0 %,2.78%,0 %,0.35%
RF02012,76.03%,5.48%,0 %,4.11%,1.37%,0.68%,0 %,0 %,2.74%,0 %,0 %,0 %,1.37%,2.05%,0 %,0 %,4.11%,1.37%,0.68%
RF02253,100.00%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
RF02348,80.00%,5.00%,0 %,3.33%,0 %,0 %,0 %,1.67%,1.67%,3.33%,0 %,0 %,0 %,0 %,0 %,0 %,5.00%,0 %,0 %
RF02519,66.67%,0 %,0 %,0 %,16.67%,0 %,8.33%,0 %,8.33%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %
RF02545,65.43%,0.82%,4.12%,2.88%,1.23%,3.70%,1.65%,1.65%,2.47%,2.47%,1.23%,1.23%,0.82%,2.47%,3.70%,2.47%,0.82%,0.82%,0 %
RF02546,82.61%,0 %,8.70%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,0 %,4.35%,0 %,0 %,0 %,0 %,4.35%
RF02553,73.68%,2.63%,7.89%,0 %,0 %,2.63%,0 %,0 %,2.63%,0 %,0 %,5.26%,0 %,2.63%,0 %,2.63%,0 %,0 %,0 %
RF02680,88.89%,0 %,2.78%,0 %,2.78%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,5.56%,0 %,0 %,0 %,0 %,0 %,0 %
RF02683,80.56%,2.78%,0 %,5.56%,2.78%,0 %,0 %,0 %,0 %,0 %,0 %,0 %,2.78%,2.78%,0 %,2.78%,0 %,0 %,0 %
RF02796,78.69%,4.92%,0 %,4.92%,4.92%,0 %,0 %,0 %,4.92%,0 %,0 %,1.64%,0 %,0 %,0 %,0 %,0 %,0 %,0 %
This diff is collapsed. Click to expand it.