Louis BECQUEY

removed stuff from biorseo

results_*
build_BiORSEO_docker_image_ubuntu18.sh
deploy_BiORSEO_docker_image_linux.sh
INSTALL.md
Readme.md
benchmark_results/
doc/
FROM ubuntu:bionic
# installing dependencies
RUN apt-get update -yq && \
    apt-get upgrade -y && \
    apt-get install -y python3-dev python3-pip openjdk-11-jre libgsl23 libgslcblas0 libboost-program-options-dev libboost-filesystem-dev && \
    rm -rf /var/lib/apt/lists/*
# biorseo itself, already compiled on the host (see the build script)
COPY . /biorseo
# ViennaRNA installer
ADD "https://www.tbi.univie.ac.at/RNA/download/ubuntu/ubuntu_18_04/viennarna_2.4.14-1_amd64.deb" /
# jar3d archive
ADD http://rna.bgsu.edu/data/jar3d/models/jar3d_2014-12-11.jar /
# install ViennaRNA, the Python dependencies and BayesPairing, then clean up
RUN dpkg -i /viennarna_2.4.14-1_amd64.deb && \
    apt-get install -f -y && \
    pip3 install networkx numpy regex wrapt biopython /biorseo/BayesPairing && \
    cd / && \
    rm -rf /biorseo/BayesPairing /viennarna_2.4.14-1_amd64.deb
WORKDIR /biorseo
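
# For reference, the image is built and run from the repository root as documented in
# build_BiORSEO_docker_image_ubuntu18.sh and deploy_BiORSEO_docker_image_linux.sh, e.g.:
#   docker build . -t biorseo
#   docker run -v `pwd`/data/modules:/modules -v `pwd`/BayesPairing/bayespairing:/byp \
#              -v `pwd`/results:/biorseo/results \
#              biorseo ./biorseo.py -i /biorseo/data/fasta/applications.fa --rna3dmotifs --patternmatch --func B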
#!/usr/bin/python3
# This script extracts the CaRNAval RINs from a Python pickle file of RIN.py objects
# and saves each RIN as a flat text file.
# We use the pickle rather than the official JSON file because the JSON is hard to
# understand, and Antoine Soulé recommended the pickle.
import networkx, os, pickle, subprocess, sys

if __name__ == "__main__":

    rin_DIR = os.getcwd() + "/../data/modules/RIN/"
    filename = "CaRNAval_1_as_dictionnary.nxpickled"

    # Check that we can find the CaRNAval RINs, and load the dataset
    try:
        sys.path.append(os.path.abspath(rin_DIR))
        import RIN
    except ImportError:
        # We have to download it
        subprocess.run(["wget", '-O', '../data/modules/carnaval_dataset.zip', "http://carnaval.lri.fr/carnaval_dataset.zip"])
        subprocess.run(["unzip", '-ou', '../data/modules/carnaval_dataset.zip', "carnaval_dataset/CaRNAval_1_as_dictionnary.nxpickled", "carnaval_dataset/RIN.py"])
        subprocess.run(["rm", "-rf", "../data/modules/RIN/", "../data/modules/carnaval_dataset.zip"])
        subprocess.run(["mv", "carnaval_dataset/", "../data/modules/RIN/"])
        sys.path.append(os.path.abspath(rin_DIR))
        import RIN

    try:
        objects = []
        with open(rin_DIR + filename, "rb") as openfile:
            while True:
                try:
                    objects.append(pickle.load(openfile))
                except EOFError:
                    break
        print("Dataset loaded")
    except OSError:
        print("File not found: " + rin_DIR + filename)
        exit(1)

    # Create a directory where the RINs extracted from the pickle file will be saved as individual files
    try:
        os.makedirs(rin_DIR + "Subfiles", exist_ok=True)
    except OSError:
        print("Creation of the directory %s failed" % (rin_DIR + "Subfiles"))
        exit(1)

    # Loop over every CaRNAval module and extract it from the Python object to a flat text file
    n_modules = len(objects[0])
    for i in range(1, 1 + n_modules):
        motif = objects[0][i].graph
        f = open(rin_DIR + "Subfiles/" + str(i-1) + ".txt", "w+")
        f.write("ntA,ntB,long_range;...\n")

        # Group consecutive node numbers into strand components
        components = []
        comp = []
        nodes = list(motif)
        nodes.sort()
        for node in nodes:
            if comp == []:
                comp.append(node)
            elif comp[-1] + 1 != node:  # not the same component
                components.append(comp)
                comp = [node]
            else:
                comp.append(node)
        components.append(comp)
        # print(nodes)

        # Extract the canonical (cWW) base pairs, using 0-based positions within the module
        basepairs = ""
        edges = list(motif.edges())
        for a in edges:
            if motif.edges[a]['label'] == 'CWW':
                ntA = nodes.index(a[0])
                ntB = nodes.index(a[1])
                if ntA <= ntB:
                    basepairs += str(ntA) + "," + str(ntB) + "," + str(motif.edges[a]['long_range']) + ";"
        f.write(basepairs + "\n")

        # Write one line per component: first and last position, length, and sequence
        f.write("pos;k;seq\n")
        num_nt = -1
        for a in components:
            seq = ""
            data_comp = str(num_nt + 1)
            for b in a:
                num_nt += 1
                # Sometimes in the nxpickled file a node has the attribute "realnt",
                # and sometimes "real_nt", but it is the same thing.
                try:
                    seq += motif.nodes[b]["realnt"]
                except KeyError:
                    seq += motif.nodes[b]["real_nt"]
            data_comp += "," + str(num_nt) + ";" + str(len(a)) + ";" + seq + "\n"
            f.write(data_comp)
        f.close()
        # print(str(i-1) + ".txt created")

    print("Successfully parsed " + filename, ", now individual RINs are saved in the Subfiles/ folder.", sep='')
# ============================ IMPORTS ====================================
import subprocess
import time
import resource
# Take an RNA sequence and truncate it to increasing lengths, from 100 bases up to the full sequence,
# then measure computation time, peak memory, and the number of solutions for each length.
# This RNA is actually a 16S rRNA from PDB 1J5E.
# http://ndbserver.rutgers.edu/service/ndb/atlas/summary
seq = "UUUGUUGGAGAGUUUGAUCCUGGCUCAGGGUGAACGCUGGCGGCGUGCCUAAGACAUGCAAGUCGUGCGGGCCGCGGGGUUUUACUCCGUGGUCAGCGGCGGACGGGUGAGUAACGCGUGGGUGACCUACCCGGAAGAGGGGGACAACCCGGGGAAACUCGGGCUAAUCCCCCAUGUGGACCCGCCCCUUGGGGUGUGUCCAAAGGGCUUUGCCCGCUUCCGGAUGGGCCCGCGUCCCAUCAGCUAGUUGGUGGGGUAAUGGCCCACCAAGGCGACGACGGGUAGCCGGUCUGAGAGGAUGGCCGGCCACAGGGGCACUGAGACACGGGCCCCACUCCUACGGGAGGCAGCAGUUAGGAAUCUUCCGCAAUGGGCGCAAGCCUGACGGAGCGACGCCGCUUGGAGGAAGAAGCCCUUCGGGGUGUAAACUCCUGAACCCGGGACGAAACCCCCGACGAGGGGACUGACGGUACCGGGGUAAUAGCGCCGGCCAACUCCGUGCCAGCAGCCGCGGUAAUACGGAGGGCGCGAGCGUUACCCGGAUUCACUGGGCGUAAAGGGCGUGUAGGCGGCCUGGGGCGUCCCAUGUGAAAGACCACGGCUCAACCGUGGGGGAGCGUGGGAUACGCUCAGGCUAGACGGUGGGAGAGGGUGGUGGAAUUCCCGGAGUAGCGGUGAAAUGCGCAGAUACCGGGAGGAACGCCGAUGGCGAAGGCAGCCACCUGGUCCACCCGUGACGCUGAGGCGCGAAAGCGUGGGGAGCAAACCGGAUUAGAUACCCGGGUAGUCCACGCCCUAAACGAUGCGCGCUAGGUCUCUGGGUCUCCUGGGGGCCGAAGCUAACGCGUUAAGCGCGCCGCCUGGGGAGUACGGCCGCAAGGCUGAAACUCAAAGGAAUUGACGGGGGCCCGCACAAGCGGUGGAGCAUGUGGUUUAAUUCGAAGCAACGCGAAGAACCUUACCAGGCCUUGACAUGCUAGGGAACCCGGGUGAAAGCCUGGGGUGCCCCGCGAGGGGAGCCCUAGCACAGGUGCUGCAUGGCCGUCGUCAGCUCGUGCCGUGAGGUGUUGGGUUAAGUCCCGCAACGAGCGCAACCCCCGCCGUUAGUUGCCAGCGGUUCGGCCGGGCACUCUAACGGGACUGCCCGCGAAAGCGGGAGGAAGGAGGGGACGACGUCUGGUCAGCAUGGCCCUUACGGCCUGGGCGACACACGUGCUACAAUGCCCACUACAAAGCGAUGCCACCCGGCAACGGGGAGCUAAUCGCAAAAAGGUGGGCCCAGUUCGGAUUGGGGUCUGCAACCCGACCCCAUGAAGCCGGAAUCGCUAGUAAUCGCGGAUCAGCCAUGCCGCGGUGAAUACGUUCCCGGGCCUUGUACACACCGCCCGUCACGCCAUGGGAGCGGGCUCUACCCGAAGUCGCCGGGAGCCUACGGGCAGGCGCCGAGGGUAGGGCCCGUGACUGGGGCGAAGUCGUAACAAGGUAGCUGUACCGGAAGGUGCGGCUGGAUCACCUCCUUUCU"
step = 100
n = len(seq)
while step < len(seq) + 50:
    sub_seq = seq[0:min(step, n)]

    # write the truncated sequence to a FASTA file
    fasta = open("data/fasta/ZDFS33.fa", 'w')
    fasta.write(">__'ZDFS33 : 0-" + str(len(sub_seq)) + "'\n" + sub_seq)
    fasta.close()

    # run biorseo on it, with default options
    cmd = ["./bin/biorseo", "-d", "./data/modules/DESC", "-s", "./data/fasta/ZDFS33.fa", "-v"]
    old_time = time.time()
    output = subprocess.check_output(cmd, stderr=subprocess.DEVNULL).decode("utf-8").split("\n")[-5:]
    run_time = time.time() - old_time
    # ru_maxrss with RUSAGE_CHILDREN is the peak resident set size (in kB on Linux) of the largest
    # terminated child so far, so this value is a running maximum over all iterations.
    max_ram = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss

    # parse the number of solutions from the last lines of biorseo's output
    for line in output:
        if "Quitting because combinatorial issues" in line:
            nb_sol = -1
        elif "solutions kept" in line:
            nb_sol = line.split(",")[1].split()[0]

    print(len(sub_seq), "first nucleotides:", nb_sol, "solutions in", run_time, "seconds, using", max_ram, "kB of RAM")
    step += 50
#!/bin/bash
echo "WARNING: The purpose of this file is to document how the docker image was built.";
echo "You cannot execute it directly, because of licensing reasons. Please get your own:";
echo "- CPLEX academic version: cplex_installer_12.8_Student.bin";
echo "- Nupack header files: nupack_3.2.2.tar.gz";
exit 0;
cd ../
THISDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
####################################################### Dependencies ##############################################################
sudo apt install -y clang-7 cmake make automake libboost-program-options-dev libboost-filesystem-dev openjdk-11-jre
sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-7 100
sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-7 100
# CPLEX: only to build biorseo
# HERE YOU SHOULD GET YOUR OWN cplex_installer_12.8_Student.bin ! I am not allowed to share mine anymore.
chmod +x cplex_installer_12.8_Student.bin
printf "4\n\n1\n\n\n\n\n" | sudo ./cplex_installer_12.8_Student.bin
rm cplex_installer_12.8_Student.bin
# Eigen: only to build biorseo (no need to give it to the docker image)
wget http://bitbucket.org/eigen/eigen/get/3.3.7.tar.gz -O eigen_src.tar.gz
tar -xf eigen_src.tar.gz
cd eigen-eigen-323c052e1731
mkdir build
cd build
cmake ..
sudo make install
cd ../..
rm -rf eigen_src.tar.gz eigen-eigen-323c052e1731
# Nupack: only to build biorseo (no need to give it to the docker image)
#curl -u yourname@yourUni.com:yourPassword http://www.nupack.org/downloads/serve_file/nupack3.2.2.tar.gz --output nupack3.2.2.tar.gz
tar -xf nupack3.2.2.tar.gz
cd nupack3.2.2
mkdir build
cd build
cmake ..
make -j8
sudo make install
cd ../..
sudo cp nupack3.2.2/src/thermo/*.h /usr/local/include/nupack/thermo/
rm -rf nupack3.2.2.tar.gz nupack3.2.2/
# BayesPairing: install on the docker image (done by the Dockerfile)
git clone http://jwgitlab.cs.mcgill.ca/sarrazin/rnabayespairing.git BayesPairing
######################################################### Build Biorseo ###########################################################
# build here, install later on the docker image (done by the Dockerfile)
mkdir -p results
make -j 8
make clean
rm -rf doc/ obj/
######################################################## Build Docker container ##################################################
# Execute the Dockerfile and build the image
docker build . -t biorseo
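
# Optional sanity check (not part of the original procedure): confirm that the image was created
# and that the repository was copied into it at /biorseo (the image's WORKDIR).
docker images biorseo
docker run --rm biorseo ls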
#!/bin/bash
######################################################## RNA modules ##############################################################
cd ../
# Rna3Dmotifs data
mkdir -p data/modules/DESC
wget https://github.com/McGill-CSB/RNAMoIP/raw/master/CATALOGUE.tgz
tar -xvzf CATALOGUE.tgz
mv No_Redondance_DESC/*.desc data/modules/DESC/
rm -r No_Redondance_VIEW3D No_Redondance_DESC CATALOGUE.tgz
# The RNA 3D Motif Atlas
mkdir -p data/modules/BGSU
wget http://rna.bgsu.edu/data/jar3d/models/HL/HL_3.2_models.zip
unzip HL_3.2_models.zip
mv HL data/modules/BGSU
rm HL_3.2_models.zip
wget http://rna.bgsu.edu/data/jar3d/models/IL/IL_3.2_models.zip
unzip IL_3.2_models.zip
mv IL data/modules/BGSU
rm IL_3.2_models.zip
# Install BayesPairing
sudo -H pip3 install --upgrade pip
sudo -H pip3 install networkx numpy regex wrapt biopython
git clone http://jwgitlab.cs.mcgill.ca/sarrazin/rnabayespairing.git BayesPairing
cd BayesPairing
sudo -H pip3 install .
# Train BayesPairing (it is installed in the docker image and its source deleted there; we train the models now, and will remount the trained models as a volume at run time)
cd bayespairing/src
python3 parse_sequences.py -d rna3dmotif -seq ACACGGGGUAAGAGCUGAACGCAUCUAAGCUCGAAACCCACUUGGAAAAGAGACACCGCCGAGGUCCCGCGUACAAGACGCGGUCGAUAGACUCGGGGUGUGCGCGUCGAGGUAACGAGACGUUAAGCCCACGAGCACUAACAGACCAAAGCCAUCAU -ss ".................................................................((...............)xxxx(...................................................)xxx).............."
python3 parse_sequences.py -d 3dmotifatlas -seq ACACGGGGUAAGAGCUGAACGCAUCUAAGCUCGAAACCCACUUGGAAAAGAGACACCGCCGAGGUCCCGCGUACAAGACGCGGUCGAUAGACUCGGGGUGUGCGCGUCGAGGUAACGAGACGUUAAGCCCACGAGCACUAACAGACCAAAGCCAUCAU -ss ".................................................................((...............)xxxx(...................................................)xxx).............."
cd ../../..
######################################################## Run it ##############################################################
# docker run -v `pwd`/data/modules:/modules -v `pwd`/BayesPairing/bayespairing:/byp -v `pwd`/results:/biorseo/results biorseo ./biorseo.py -i /biorseo/data/fasta/applications.fa --rna3dmotifs --patternmatch --func B