Louis BECQUEY

scripts/ folder

......@@ -10,9 +10,9 @@ __pycache__/
errors.txt
known_issues.txt
known_issues_reasons.txt
kill_rnanet.sh
Dockerfile
LICENSE
README.md
automate.sh
build_docker_image.sh
\ No newline at end of file
scripts/automate.sh
scripts/kill_rnanet.sh
scripts/build_docker_image.sh
\ No newline at end of file
......
......@@ -23,7 +23,7 @@ RUN apk update && apk add --no-cache \
\
mkdir /3D && mkdir /sequences && \
\
mv /RNANet/x3dna-dssr /usr/local/bin/x3dna-dssr && chmod +x /usr/local/bin/x3dna-dssr && \
mv /RNANet/scripts/x3dna-dssr /usr/local/bin/x3dna-dssr && chmod +x /usr/local/bin/x3dna-dssr && \
\
curl -SL http://eddylab.org/infernal/infernal-1.1.3.tar.gz | tar xz && cd infernal-1.1.3 && \
./configure && make -j 16 && make install && cd easel && make install && cd / && \
......
......@@ -86,7 +86,7 @@ $ docker image import rnanet_v1.2_docker.tar rnanet
```
* Step 3 : Run the container, giving it 3 folders to mount as volumes: the first to store the 3D data, the second to store the sequence data and alignments, and the third to receive the results, data and logs:
```
$ docker run -v path/to/3D/data/folder:/3D -v path/to/sequence/data/folder:/sequences -v path/to/experiment/results/folder:/runDir rnanet [ - other options ]
$ docker run --rm -v path/to/3D/data/folder:/3D -v path/to/sequence/data/folder:/sequences -v path/to/experiment/results/folder:/runDir rnanet [ - other options ]
```
The detailed list of options is below:
......@@ -124,6 +124,11 @@ The detailed list of options is below:
```
Do not use the --3d-folder and --seq-folder options with Docker: they are already set by default to the paths you provide with the -v options when running the container.
Typical usage:
```
nohup bash -c 'time docker run --rm -v /path/to/3D/data/folder:/3D -v /path/to/sequence/data/folder:/sequences -v /path/to/experiment/folder:/runDir rnanet -s --no-logs ' &
```
## Using classical command line installation
You need to install the dependencies:
......
......@@ -1546,35 +1546,33 @@ class Pipeline:
runDir + f"/data/{f}_counts.csv"])
# Run statistics files
subprocess.run(["python3.8", fileDir+"/regression.py"])
subprocess.run(["python3.8", fileDir+"/scripts/regression.py", runDir + "/results/RNANet.db"])
subprocess.run(["python3.8", fileDir+"/statistics.py", "--3d-folder", path_to_3D_data,
"--seq-folder", path_to_seq_data, "-r", str(self.CRYSTAL_RES)])
# Save additional information
os.makedirs(runDir + "/archive", exist_ok=True)
with sqlite3.connect(runDir+"/results/RNANet.db") as conn:
pd.read_sql_query("""SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem
FROM family ORDER BY nb_3d_chains DESC;""",
conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
conn).to_csv(runDir + f"/archive/families_{time_str}.csv", float_format="%.2f", index=False)
pd.read_sql_query("""SELECT eq_class, structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue
FROM structure
JOIN chain ON structure.pdb_id = chain.structure_id
ORDER BY structure_id, chain_name, rfam_acc ASC;""",
conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
conn).to_csv(runDir + f"/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
# Archive the results
# Update shortcuts to latest versions
subprocess.run(["rm", "-f", runDir + "/results/RNANET_datapoints_latest.tar.gz",
runDir + "/results/summary_latest.csv",
runDir + "/results/families_latest.csv"
])
if self.ARCHIVE:
os.makedirs(runDir + "/results/archive", exist_ok=True)
subprocess.run(["tar", "-C", path_to_3D_data + "/datapoints", "-czf",
runDir + f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", "."])
# Update shortcuts to latest versions
subprocess.run(["rm", "-f", runDir + "/results/RNANET_datapoints_latest.tar.gz",
runDir + "/results/summary_latest.csv",
runDir + "/results/families_latest.csv"
])
subprocess.run(['ln', "-s", runDir + f"/results/archive/RNANET_datapoints_{time_str}.tar.gz", runDir + "/results/RNANET_datapoints_latest.tar.gz"])
subprocess.run(['ln', "-s", runDir + f"/results/archive/summary_{time_str}.csv", runDir + "/results/summary_latest.csv"])
subprocess.run(['ln', "-s", runDir + f"/results/archive/families_{time_str}.csv", runDir + "/results/families_latest.csv"])
runDir + f"/archive/RNANET_datapoints_{time_str}.tar.gz", "."])
subprocess.run(['ln', "-s", runDir + f"/archive/RNANET_datapoints_{time_str}.tar.gz", runDir + "/results/RNANET_datapoints_latest.tar.gz"])
subprocess.run(['ln', "-s", runDir + f"/archive/summary_{time_str}.csv", runDir + "/results/summary_latest.csv"])
subprocess.run(['ln', "-s", runDir + f"/archive/families_{time_str}.csv", runDir + "/results/families_latest.csv"])
def sanitize_database(self):
"""Searches for issues in the database and correct them"""
......
......@@ -15,7 +15,7 @@ cp `which x3dna-dssr` $THISDIR
######################################################## Build Docker image ######################################################
# Execute the Dockerfile and build the image
docker build -t persalteas/rnanet .
docker build -t persalteas/rnanet ..
############################################################## Cleaning ##########################################################
rm x3dna-dssr
......
......@@ -7,15 +7,18 @@ problems = [
"1k73_1_B"
]
# Provide, in this order, the paths to the 3D data folder, the sequence data folder, the RNANet.db file, and the RNANet.py file as arguments to this script
path_to_3D_data = sys.argv[1]
path_to_seq_data = sys.argv[2]
path_to_db = sys.argv[3]
path_to_RNANet = sys.argv[4]
for p in problems:
print()
print()
print()
print()
homology = ('-' in p)
homology = ('-' in p) # H4cky l1f3
# Remove the datapoints files and 3D files
subprocess.run(["rm", '-f', path_to_3D_data + f"/rna_mapped_to_Rfam/{p}.cif"])
......@@ -31,21 +34,20 @@ for p in problems:
# Delete the chain from the database, and the associated nucleotides and re_mappings, using foreign keys
for fam in families:
command = ["sqlite3", "results/RNANet.db", f"PRAGMA foreign_keys=ON; delete from chain where structure_id=\"{structure}\" and chain_name=\"{chain}\" and rfam_acc=\"{fam}\";"]
command = ["sqlite3", path_to_db, f"PRAGMA foreign_keys=ON; delete from chain where structure_id=\"{structure}\" and chain_name=\"{chain}\" and rfam_acc=\"{fam}\";"]
print(' '.join(command))
subprocess.run(command)
command = ["python3.8", "RNAnet.py", "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0", "--extract", "--only", p]
command = ["python3.8", path_to_RNANet, "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0", "--extract", "--only", p]
else:
# Delete the chain from the database, and the associated nucleotides and re_mappings, using foreign keys
command = ["sqlite3", "results/RNANet.db", f"PRAGMA foreign_keys=ON; delete from chain where structure_id=\"{structure}\" and chain_name=\"{chain}\" and rfam_acc is null;"]
command = ["sqlite3", path_to_db, f"PRAGMA foreign_keys=ON; delete from chain where structure_id=\"{structure}\" and chain_name=\"{chain}\" and rfam_acc is null;"]
print(' '.join(command))
subprocess.run(command)
command = ["python3.8", "RNAnet.py", "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0", "--no-homology", "--extract", "--only", p]
command = ["python3.8", path_to_RNANet, "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0", "--no-homology", "--extract", "--only", p]
# Re-run RNANet
os.chdir(os.path.dirname(os.path.realpath(path_to_db)) + '/../')
print('\n',' '.join(command),'\n')
subprocess.run(command)
# run statistics
......
......@@ -20,7 +20,7 @@
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy, os, sqlite3
import scipy, os, sqlite3, sys
# from sklearn.linear_model import LinearRegression
from mpl_toolkits.mplot3d import Axes3D
pd.set_option('display.max_rows', None)
......@@ -28,7 +28,10 @@ pd.set_option('display.max_rows', None)
LSU_set = ["RF00002", "RF02540", "RF02541", "RF02543", "RF02546"] # From Rfam CLAN 00112
SSU_set = ["RF00177", "RF02542", "RF02545", "RF01959", "RF01960"] # From Rfam CLAN 00111
with sqlite3.connect(os.getcwd()+"/results/RNANet.db") as conn:
path_to_db = sys.argv[1] # The RNANet.db file
runDir = os.path.dirname(os.path.realpath(path_to_db)) + '/../'
with sqlite3.connect(path_to_db) as conn:
df = pd.read_sql("SELECT rfam_acc, max_len, nb_total_homol, comput_time, comput_peak_mem FROM family;", conn)
to_remove = [ f for f in df.rfam_acc if f in LSU_set+SSU_set ]
......@@ -74,7 +77,7 @@ ax.set_ylabel("Maximum length of sequences ")
ax.set_zlabel("Computation time (s)")
plt.subplots_adjust(wspace=0.4)
plt.savefig(os.getcwd()+"/results/cmalign_jobs_performance.png")
plt.savefig(runDir+"/results/cmalign_jobs_performance.png")
# # ========================================================
# # Linear Regression of max_mem as function of max_length
......@@ -110,7 +113,7 @@ plt.savefig(os.getcwd()+"/results/cmalign_jobs_performance.png")
# plt.plot(x, coeffs[0] + np.exp(coeffs[1]*x), "-g", label="expo fit")
# plt.plot(x, coeffs_log[0] + coeffs_log[1]*np.log(x), "-b", label="log fit")
# plt.legend()
# plt.savefig("results/regression/memory_linear_model.png")
# plt.savefig(runDir + "/results/regression/memory_linear_model.png")
# # ========================================================
# # Linear Regression of comp_time as function of n_chains
......@@ -131,4 +134,4 @@ plt.savefig(os.getcwd()+"/results/cmalign_jobs_performance.png")
# plt.ylabel("Computation time (s)")
# plt.plot(x, b0 + b1*x, "-r", label="linear fit")
# plt.legend()
# plt.savefig("results/regression/comp_time_linear_model.png")
# plt.savefig(runDir + "/results/regression/comp_time_linear_model.png")
......