Showing
5 changed files
with
145 additions
and
30 deletions
.dockerignore
0 → 100644
1 | +nohup.out | ||
2 | +log_of_the_run.sh | ||
3 | +results/ | ||
4 | +logs/ | ||
5 | +data/ | ||
6 | +esl* | ||
7 | +.vscode/ | ||
8 | +__pycache__/ | ||
9 | +.git/ | ||
10 | +errors.txt | ||
11 | +known_issues.txt | ||
12 | +known_issues_reasons.txt | ||
13 | +kill_rnanet.sh | ||
14 | +Dockerfile | ||
15 | +LICENSE | ||
16 | +README.md | ||
17 | +automate.sh | ||
18 | +build_docker_image.sh | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
Dockerfile
0 → 100644
1 | +# Base image release is a build argument: pinned by default for reproducible builds, overridable with --build-arg ALPINE_VERSION=... | ||
2 | +# NOTE(review): 3.12 is assumed to be the release this image was originally built against (same period as glibc 2.32-r0) - confirm. | ||
3 | +ARG ALPINE_VERSION=3.12 | ||
4 | +FROM alpine:${ALPINE_VERSION} | ||
5 | +# Make a failed download abort the build: without pipefail only the exit status of `tar` is checked in the `curl | tar` pipelines. | ||
6 | +SHELL ["/bin/ash", "-eo", "pipefail", "-c"] | ||
7 | +# The build context must contain the x3dna-dssr binary (see build_docker_image.sh); it is moved to /usr/local/bin below. | ||
8 | +COPY . /RNANet | ||
9 | +WORKDIR / | ||
10 | +# Single layer on purpose: install runtime and build dependencies, install glibc, build Infernal and Easel, fetch SINA, | ||
11 | +# then remove the build toolchain in the same layer so it does not persist in the image. | ||
12 | +RUN apk add --no-cache \ | ||
13 | + curl \ | ||
14 | + freetype-dev \ | ||
15 | + gcc g++ \ | ||
16 | + linux-headers \ | ||
17 | + lapack-dev \ | ||
18 | + make \ | ||
19 | + musl-dev \ | ||
20 | + openblas-dev \ | ||
21 | + python3 python3-dev py3-pip py3-six py3-wheel \ | ||
22 | + py3-matplotlib py3-requests py3-scipy py3-setproctitle py3-sqlalchemy py3-tqdm \ | ||
23 | + sqlite \ | ||
24 | + \ | ||
25 | + && python3 -m pip install --no-cache-dir biopython==1.76 pandas psutil pymysql && \ | ||
26 | + \ | ||
27 | + wget -q -O /etc/apk/keys/sgerrand.rsa.pub https://alpine-pkgs.sgerrand.com/sgerrand.rsa.pub && \ | ||
28 | + wget https://github.com/sgerrand/alpine-pkg-glibc/releases/download/2.32-r0/glibc-2.32-r0.apk && \ | ||
29 | + apk add --no-cache glibc-2.32-r0.apk && \ | ||
30 | + rm glibc-2.32-r0.apk && \ | ||
31 | + \ | ||
32 | + mkdir /3D && mkdir /sequences && \ | ||
33 | + \ | ||
34 | + mv /RNANet/x3dna-dssr /usr/local/bin/x3dna-dssr && chmod +x /usr/local/bin/x3dna-dssr && \ | ||
35 | + \ | ||
36 | + curl -fSL http://eddylab.org/infernal/infernal-1.1.3.tar.gz | tar xz && cd infernal-1.1.3 && \ | ||
37 | + ./configure && make -j "$(nproc)" && make install && cd easel && make install && cd / && \ | ||
38 | + \ | ||
39 | + curl -fSL https://github.com/epruesse/SINA/releases/download/v1.7.1/sina-1.7.1-linux.tar.gz | tar xz && mv sina-1.7.1-linux /sina && \ | ||
40 | + ln -s /sina/bin/sina /usr/local/bin/sina && \ | ||
41 | + \ | ||
42 | + rm -rf /infernal-1.1.3 && \ | ||
43 | + \ | ||
44 | + apk del openblas-dev gcc g++ gfortran binutils \ | ||
45 | + curl \ | ||
46 | + linux-headers \ | ||
47 | + make \ | ||
48 | + musl-dev \ | ||
49 | + py3-pip py3-wheel \ | ||
50 | + freetype-dev zlib-dev | ||
51 | +# Mount points for the 3D data, the sequence data and the run/output directory; declared after /3D and /sequences are created. | ||
52 | +VOLUME ["/3D", "/sequences", "/runDir"] | ||
53 | +WORKDIR /runDir | ||
54 | +# Exec form: extra `docker run` arguments are appended to this command line; the two folder options are fixed to the volumes. | ||
55 | +ENTRYPOINT ["/RNANet/RNAnet.py", "--3d-folder", "/3D", "--seq-folder", "/sequences" ] | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
... | @@ -11,8 +11,8 @@ Contents: | ... | @@ -11,8 +11,8 @@ Contents: |
11 | * [Output files](#output-files) | 11 | * [Output files](#output-files) |
12 | * [How to run](#how-to-run) | 12 | * [How to run](#how-to-run) |
13 | * [Required computational resources](#required-computational-resources) | 13 | * [Required computational resources](#required-computational-resources) |
14 | - * [Dependencies](#dependencies) | 14 | + * [Using Docker](#using-docker) |
15 | - * [Command line](#command-line) | 15 | + * [Using classical command line installation](#using-classical-command-line-installation) |
16 | * [Post-computation task: estimate quality](#post-computation-task:-estimate-quality) | 16 | * [Post-computation task: estimate quality](#post-computation-task:-estimate-quality) |
17 | * [How to further filter the dataset](#how-to-further-filter-the-dataset) | 17 | * [How to further filter the dataset](#how-to-further-filter-the-dataset) |
18 | * [Filter on 3D structure resolution](#filter-on-3D-structure-resolution) | 18 | * [Filter on 3D structure resolution](#filter-on-3D-structure-resolution) |
... | @@ -63,7 +63,7 @@ Other folders are created and not deleted, which you might want to conserve to a | ... | @@ -63,7 +63,7 @@ Other folders are created and not deleted, which you might want to conserve to a |
63 | * `path-to-3D-folder-you-passed-in-option/RNAcifs/` contains mmCIF structures directly downloaded from the PDB, which contain RNA chains, | 63 | * `path-to-3D-folder-you-passed-in-option/RNAcifs/` contains mmCIF structures directly downloaded from the PDB, which contain RNA chains, |
64 | * `path-to-3D-folder-you-passed-in-option/annotations/` contains the raw JSON annotation files of the previous mmCIF structures. You may find additional information into them which is not properly supported by RNANet yet. | 64 | * `path-to-3D-folder-you-passed-in-option/annotations/` contains the raw JSON annotation files of the previous mmCIF structures. You may find additional information into them which is not properly supported by RNANet yet. |
65 | 65 | ||
66 | -# How to run | 66 | +# How to run (on Linux x86-64 only) |
67 | 67 | ||
68 | ## Required computational resources | 68 | ## Required computational resources |
69 | - CPU: no requirements. The program is optimized for multi-core CPUs, you might want to use Intel Xeons, AMD Ryzens, etc. | 69 | - CPU: no requirements. The program is optimized for multi-core CPUs, you might want to use Intel Xeons, AMD Ryzens, etc. |
... | @@ -77,17 +77,18 @@ Measured the 23rd of June 2020 on a 16-core AMD Ryzen 7 3700X CPU @3.60GHz, plus | ... | @@ -77,17 +77,18 @@ Measured the 23rd of June 2020 on a 16-core AMD Ryzen 7 3700X CPU @3.60GHz, plus |
77 | 77 | ||
78 | Update runs are much quicker, around 3 hours. It depends mostly on what RNA families are concerned by the update. | 78 | Update runs are much quicker, around 3 hours. It depends mostly on what RNA families are concerned by the update. |
79 | 79 | ||
80 | -## Dependencies | 80 | +## Using Docker |
81 | -You need to install: | 81 | + |
82 | -- DSSR, you need to register to the X3DNA forum [here](http://forum.x3dna.org/site-announcements/download-instructions/) and then download the DSSR binary [on that page](http://forum.x3dna.org/downloads/3dna-download/). | 82 | +* Step 1 : Download the [Docker container](#soon). Open a terminal and move to the appropriate directory. |
83 | -- Infernal, to download at [Eddylab](http://eddylab.org/infernal/), several options are available depending on your preferences. Make sure to have the `cmalign`, `esl-alimanip` and `esl-reformat` binaries in your $PATH variable, so that RNANet.py can find them.You don't need the whole X3DNA suite of tools, just DSSR is fine. Make sure to have the `x3dna-dssr` binary in your $PATH variable so that RNANet.py finds it. | 83 | +* Step 2 : Extract the archive to a Docker image named *rnanet* in your local installation |
84 | -- SINA, follow [these instructions](https://sina.readthedocs.io/en/latest/install.html) for example. Make sure to have the `sina` binary in your $PATH. | 84 | +``` |
85 | -- Python >= 3.8, (Unfortunately, python3.6 is no longer supported, because of changes in the multiprocessing and Threading packages. Untested with Python 3.7.\*) | 85 | +$ docker image import rnanet_v1.2_docker.tar rnanet |
86 | -- The following Python packages: `python3.8 -m pip install numpy matplotlib pandas biopython psutil pymysql requests sqlalchemy sqlite3 tqdm` | 86 | +``` |
87 | +* Step 3 : Run the container, giving it 3 folders to mount as volumes: a first to store the 3D data, a second to store the sequence data and alignments, and a third to output the results, data and logs: | ||
88 | +``` | ||
89 | +$ docker run -v path/to/3D/data/folder:/3D -v path/to/sequence/data/folder:/sequences -v path/to/experiment/results/folder:/runDir rnanet [ - other options ] | ||
90 | +``` | ||
87 | 91 | ||
88 | -## Command line | ||
89 | -Run `./RNANet.py --3d-folder path/to/3D/data/folder --seq-folder path/to/sequence/data/folder [ - other options ]`. | ||
90 | -It requires solid hardware to run. It takes around around 12 to 15 hours the first time, and 1 to 3h then, tested on a server with 32 cores and 48GB of RAM. | ||
91 | The detailed list of options is below: | 92 | The detailed list of options is below: |
92 | 93 | ||
93 | ``` | 94 | ``` |
... | @@ -121,18 +122,43 @@ The detailed list of options is below: | ... | @@ -121,18 +122,43 @@ The detailed list of options is below: |
121 | --archive Create a tar.gz archive of the datapoints text files, and update the link to the latest archive | 122 | --archive Create a tar.gz archive of the datapoints text files, and update the link to the latest archive |
122 | --no-logs Do not save per-chain logs of the numbering modifications | 123 | --no-logs Do not save per-chain logs of the numbering modifications |
123 | ``` | 124 | ``` |
125 | +Do not pass the --3d-folder and --seq-folder options: the image already sets them to the folders you mount with the -v options when running Docker. | ||
126 | + | ||
127 | +## Using classical command line installation | ||
128 | + | ||
129 | +You need to install the dependencies: | ||
130 | +- DSSR, you need to register to the X3DNA forum [here](http://forum.x3dna.org/site-announcements/download-instructions/) and then download the DSSR binary [on that page](http://forum.x3dna.org/downloads/3dna-download/). Make sure to have the `x3dna-dssr` binary in your $PATH variable so that RNANet.py finds it. | ||
131 | +- Infernal, to download at [Eddylab](http://eddylab.org/infernal/), several options are available depending on your preferences. Make sure to have the `cmalign`, `esl-alimanip`, `esl-alipid` and `esl-reformat` binaries in your $PATH variable, so that RNANet.py can find them. | ||
132 | +- SINA, follow [these instructions](https://sina.readthedocs.io/en/latest/install.html) for example. Make sure to have the `sina` binary in your $PATH. | ||
133 | +- Sqlite 3, available under the name *sqlite* in every distro's package manager, | ||
134 | +- Python >= 3.8, (Unfortunately, python3.6 is no longer supported, because of changes in the multiprocessing and Threading packages. Untested with Python 3.7.\*) | ||
135 | +- The following Python packages: `python3.8 -m pip install biopython==1.76 matplotlib pandas psutil pymysql requests scipy setproctitle sqlalchemy tqdm`. Note that Biopython versions 1.77 or later do not work (yet) since they removed the alphabet system. | ||
136 | + | ||
137 | +Then, run it from the command line, preferably using nohup if your shell will be interrupted: | ||
138 | +``` | ||
139 | + ./RNANet.py --3d-folder path/to/3D/data/folder --seq-folder path/to/sequence/data/folder [ - other options ] | ||
140 | +``` | ||
141 | +See the list of possible options just above in the [Using Docker](#using-docker) section. Expect hours (maybe days) of computation. | ||
124 | 142 | ||
125 | Typical usage: | 143 | Typical usage: |
126 | ``` | 144 | ``` |
127 | -nohup bash -c 'time ~/Projects/RNANet/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' & | 145 | +nohup bash -c 'time ~/Projects/RNANet/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences --no-logs -s' & |
128 | ``` | 146 | ``` |
129 | 147 | ||
130 | ## Post-computation task: estimate quality | 148 | ## Post-computation task: estimate quality |
131 | -The file statistics.py is supposed to give a summary on the produced dataset. See the results/ folder. It can be run automatically after RNANet if you pass the `-s` option. | 149 | +If you did not ask for an automatic run of the statistics over the produced dataset with the `-s` option, you can run them later using the file statistics.py. |
150 | +``` | ||
151 | +python3.8 statistics.py --3d-folder path/to/3D/data/folder --seq-folder path/to/sequence/data/folder -r 20.0 | ||
152 | +``` | ||
153 | +/!\ Beware: if no threshold is specified with option `-r`, no resolution threshold is applied and all the data in RNANet.db is used. | ||
154 | + | ||
155 | +If you have run RNANet twice, once with option `--no-homology`, and once without, you unlock new statistics over unmapped chains. You will also be allowed to use option `--wadley` to reproduce Wadley & al. (2007) results automatically. | ||
132 | 156 | ||
133 | # How to further filter the dataset | 157 | # How to further filter the dataset |
134 | You may want to build your own sub-dataset by querying the results/RNANet.db file. Here are quick examples using Python3 and its sqlite3 package. | 158 | You may want to build your own sub-dataset by querying the results/RNANet.db file. Here are quick examples using Python3 and its sqlite3 package. |
135 | 159 | ||
160 | +*Note: you cannot install the sqlite3 package through pip. Install it using your OS' package manager, search for 'sqlite'.* | ||
161 | + | ||
136 | ## Filter on 3D structure resolution | 162 | ## Filter on 3D structure resolution |
137 | 163 | ||
138 | We need to import sqlite3 and pandas packages first. | 164 | We need to import sqlite3 and pandas packages first. |
... | @@ -157,13 +183,16 @@ with sqlite3.connect("results/RNANet.db) as connection: | ... | @@ -157,13 +183,16 @@ with sqlite3.connect("results/RNANet.db) as connection: |
157 | Step 2 : Then, we define a template string, containing the SQL request we use to get all information of one RNA chain, with brackets { } at the place we will insert every chain_id. | 183 | Step 2 : Then, we define a template string, containing the SQL request we use to get all information of one RNA chain, with brackets { } at the place we will insert every chain_id. |
158 | You can remove fields you are not interested in. | 184 | You can remove fields you are not interested in. |
159 | ``` | 185 | ``` |
160 | -req = """SELECT index_chain, old_nt_resnum, position, nt_name, nt_code, nt_align_code, is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, dbn, paired, nb_interact, pair_type_LW, pair_type_DSSR, alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, chi, bb_type, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base, | 186 | +req = """SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, |
161 | -v0, v1, v2, v3, v4, amlitude, phase_angle, puckering | 187 | + is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, dbn, |
162 | -FROM | 188 | + paired, nb_interact, pair_type_LW, pair_type_DSSR, alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, |
163 | -(SELECT chain_id, rfam_acc from chain WHERE chain_id = {}) | 189 | + chi, bb_type, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base, |
164 | -NATURAL JOIN re_mapping | 190 | + v0, v1, v2, v3, v4, amplitude, phase_angle, puckering |
165 | -NATURAL JOIN nucleotide | 191 | + FROM |
166 | -NATURAL JOIN align_column;""" | 192 | + (SELECT chain_id, rfam_acc from chain WHERE chain_id = {}) |
193 | + NATURAL JOIN re_mapping | ||
194 | + NATURAL JOIN nucleotide | ||
195 | + NATURAL JOIN align_column;""" | ||
167 | ``` | 196 | ``` |
168 | 197 | ||
169 | Step 3 : Finally, we iterate over this list of chains and save their information in CSV files: | 198 | Step 3 : Finally, we iterate over this list of chains and save their information in CSV files: |
... | @@ -199,12 +228,13 @@ If you want just one example of each RNA 3D chain, use in Step 1: | ... | @@ -199,12 +228,13 @@ If you want just one example of each RNA 3D chain, use in Step 1: |
199 | 228 | ||
200 | ``` | 229 | ``` |
201 | with sqlite3.connect("results/RNANet.db") as connection: | 230 | with sqlite3.connect("results/RNANet.db") as connection: |
202 | - chain_list = pd.read_sql("""SELECT UNIQUE chain_id, structure_id, chain_name | 231 | + chain_list = pd.read_sql("""SELECT DISTINCT chain_id, structure_id, chain_name |
203 | FROM chain JOIN structure | 232 | FROM chain JOIN structure |
204 | ON chain.structure_id = structure.pdb_id | 233 | ON chain.structure_id = structure.pdb_id |
205 | ORDER BY structure_id ASC;""", | 234 | ORDER BY structure_id ASC;""", |
206 | con=connection) | 235 | con=connection) |
207 | ``` | 236 | ``` |
237 | +Then proceed to steps 2 and 3. | ||
208 | 238 | ||
209 | # More about the database structure | 239 | # More about the database structure |
210 | To help you design your own requests, here follows a description of the database tables and fields. | 240 | To help you design your own requests, here follows a description of the database tables and fields. |
... | @@ -231,13 +261,12 @@ To help you design your own requests, here follows a description of the database | ... | @@ -231,13 +261,12 @@ To help you design your own requests, here follows a description of the database |
231 | * `chain_id`: A unique identifier | 261 | * `chain_id`: A unique identifier |
232 | * `structure_id`: The `pdb_id` where the chain comes from | 262 | * `structure_id`: The `pdb_id` where the chain comes from |
233 | * `chain_name`: The chain label, extracted from the 3D file | 263 | * `chain_name`: The chain label, extracted from the 3D file |
264 | +* `eq_class`: The BGSU equivalence class label containing this chain | ||
265 | +* `rfam_acc`: The family which the chain is mapped to (if not mapped, value is *unmappd*) | ||
234 | * `pdb_start`: Position in the chain where the mapping to Rfam begins (absolute position, not residue number) | 266 | * `pdb_start`: Position in the chain where the mapping to Rfam begins (absolute position, not residue number) |
235 | * `pdb_end`: Position in the chain where the mapping to Rfam ends (absolute position, not residue number) | 267 | * `pdb_end`: Position in the chain where the mapping to Rfam ends (absolute position, not residue number) |
236 | -* `pdb_start`: Position in the chain where the mapping to Rfam begins (absolute position, not residue number) | ||
237 | -* `pdb_start`: Position in the chain where the mapping to Rfam begins (absolute position, not residue number) | ||
238 | * `reversed`: Whether the mapping numbering order differs from the residue numbering order in the mmCIF file (eg 4c9d, chains C and D) | 268 | * `reversed`: Whether the mapping numbering order differs from the residue numbering order in the mmCIF file (eg 4c9d, chains C and D) |
239 | -* `issue`: Wether an issue occurred with this structure while downloading, extracting, annotating or parsing the annotation. Chains with issues are removed from the dataset (Only one known to date: 1gsg, chain T, which is too short) | 269 | +* `issue`: Whether an issue occurred with this structure while downloading, extracting, annotating or parsing the annotation. See the file known_issues_reasons.txt for more information about why your chain is marked as an issue. |
240 | -* `rfam_acc`: The family which the chain is mapped to | ||
241 | * `inferred`: Whether the mapping has been inferred using the redundancy list (value is 1) or just known from Rfam-PDB mappings (value is 0) | 270 | * `inferred`: Whether the mapping has been inferred using the redundancy list (value is 1) or just known from Rfam-PDB mappings (value is 0) |
242 | * `chain_freq_A`, `chain_freq_C`, `chain_freq_G`, `chain_freq_U`, `chain_freq_other`: Nucleotide frequencies in the chain | 271 | * `chain_freq_A`, `chain_freq_C`, `chain_freq_G`, `chain_freq_U`, `chain_freq_other`: Nucleotide frequencies in the chain |
243 | * `pair_count_cWW`, `pair_count_cWH`, ... `pair_count_tSS`: Counts of the non-canonical base-pair types in the chain (intra-chain counts only) | 272 | * `pair_count_cWW`, `pair_count_cWH`, ... `pair_count_tSS`: Counts of the non-canonical base-pair types in the chain (intra-chain counts only) | ... | ... |
build_docker_image.sh
0 → 100755
1 | +#!/bin/bash | ||
2 | + | ||
3 | +# echo "WARNING: The purpose of this file is to document how the docker image was built."; | ||
4 | +# echo "You cannot execute it directly, because of licensing reasons. Please get your own"; | ||
5 | +# echo "DSSR 2.0 executable at http://innovation.columbia.edu/technologies/CU20391"; | ||
6 | +# echo "and place it in this folder."; | ||
7 | +# exit 0; | ||
8 | + | ||
9 | +# Absolute path of the folder containing this script. It is used as the Docker build context and for the | ||
10 | +# temporary copy of x3dna-dssr, so the script behaves the same whatever the current working directory is. | ||
11 | +THISDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" | ||
12 | + | ||
13 | +####################################################### Dependencies ############################################################## | ||
14 | + | ||
15 | +# The $THISDIR folder is supposed to contain the x3dna-dssr executable: | ||
16 | +# copy the one found in $PATH, and stop early with a clear message if there is none. | ||
17 | +DSSR_BIN="$(command -v x3dna-dssr)" | ||
18 | +if [ -z "$DSSR_BIN" ]; then | ||
19 | +    echo "x3dna-dssr was not found in your PATH, cannot build the image." >&2 | ||
20 | +    exit 1 | ||
21 | +fi | ||
22 | +cp "$DSSR_BIN" "$THISDIR/" || exit 1 | ||
23 | + | ||
24 | +######################################################## Build Docker image ###################################################### | ||
25 | +# Execute the Dockerfile and build the image, using the script folder as build context | ||
26 | +docker build -t persalteas/rnanet "$THISDIR" | ||
27 | +BUILD_STATUS=$? | ||
28 | + | ||
29 | +############################################################## Cleaning ########################################################## | ||
30 | +# Remove the temporary copy even when the build failed, then report the build result to the caller | ||
31 | +rm -f "$THISDIR/x3dna-dssr" | ||
32 | +exit $BUILD_STATUS | ||
33 | + | ||
34 | +# to run, use something like: | ||
35 | +# docker run -v /home/persalteas/Data/RNA/3D/:/3D -v /home/persalteas/Data/RNA/sequences/:/sequences -v /home/persalteas/labo/:/runDir persalteas/rnanet [ additional options here ] | ||
36 | +# Without additional options, this runs a standard pass with known issues support, log output, and no statistics. The default resolution threshold is 4.0 Angstroms. | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
... | @@ -329,9 +329,7 @@ def parallel_stats_pairs(f): | ... | @@ -329,9 +329,7 @@ def parallel_stats_pairs(f): |
329 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | 329 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
330 | # Get comma separated lists of basepairs per nucleotide | 330 | # Get comma separated lists of basepairs per nucleotide |
331 | interactions = pd.DataFrame( | 331 | interactions = pd.DataFrame( |
332 | - sql_ask_database(conn, | 332 | + sql_ask_database(conn, f"SELECT nt_code as nt1, index_chain, paired, pair_type_LW FROM nucleotide WHERE chain_id='{cid}';"), |
333 | - f"SELECT nt_code as nt1, index_chain, paired, pair_type_LW FROM (SELECT chain_id FROM chain WHERE chain_id='{cid}') NATURAL JOIN nucleotide;", | ||
334 | - warn_every=0), | ||
335 | columns = ["nt1", "index_chain", "paired", "pair_type_LW"] | 333 | columns = ["nt1", "index_chain", "paired", "pair_type_LW"] |
336 | ) | 334 | ) |
337 | # expand the comma-separated lists in real lists | 335 | # expand the comma-separated lists in real lists | ... | ... |
-
Please register or login to post a comment