Merge branch 'master' into stage_aglae

Aglaé TABOT
Commit ad0e234c8b100878728b92c6249f99fc271abf47 ad0e234c 2 parents 9dedcd5b 60f0af5a
Showing 18 changed files with 251 additions and 361 deletions
.dockerignore
CHANGELOG
LICENSE
README.md
RNAnet.py
doc/Errors.md
doc/FAQ.md
doc/INSTALL.md
doc/KnownIssues.md
geometric_stats.py
known_issues.txt
known_issues_reasons.txt
scripts/automate.sh
scripts/automate_from_scratch.sh
scripts/build_docker_image.sh
scripts/recompute_family.py
scripts/recompute_some_chains.py
statistics.py
--- a/.dockerignore
View file @ad0e234
+++ b/.dockerignore
View file @ad0e234
@@ -23,3 +23,5 @@ scripts/*.sh
 scripts/*.tar
 scripts/measure.py
 scripts/recompute_some_chains.py
+ scripts/convert_rna_jsons.py
+ scripts/recompute_family.py
--- a/CHANGELOG
View file @ad0e234
+++ b/CHANGELOG
View file @ad0e234
 ############################################################################################
+ v 1.6 beta, August 2021
+ 
+ Aglaé Tabot joins the development team. Khodor Hannoush leaves.
+ 
+ FEATURE CHANGES
+     - Distinct options --cmalign-opts and --cmalign-rrna-opts allow to adapt the parameters for LSU and SSU families.
+       The LSU and SSU are now aligned with Infernal options '--cpu 10 --mxsize 8192 --maxtau 0.1', which is slow, 
+       requires up to 100 GB of RAM, and yields a suboptimal alignment (tau=0.1 is quite bad), but is homogeneous with the other families.
+     - The LSU and SSU therefore have defined cm_coords fields, and therefore distance matrices can be computed.
+     - Distance matrices are computed on all available molecules of the family by default, but you can use statistics.py --non-redundant to only
+       take the equivalence class representatives at a given resolution into account (new option). For storage reasons, rRNAs are always run in 
+       this mode (but this might change in the future : space required is 'only' ~300 GB).
+     - We now provide for download the renumbered (standardised) 3D MMCIF files, the nucleotides being numbered by their "index_chain" in the database.
+     - We now provide for download the sequences of the 3D chains aligned by Rfam family (without Rfam sequences, which have been removed).
+     - statistics.py now computes histograms and a density estimation with Gaussian mixture models for a large set of geometric parameters, 
+       measured on the unmapped data at a given resolution threshold. The parameters include:
+         * All atom bonded distances and torsion angles
+         * Distances, flat angles and torsion angles in the Pyle/VFold model
+         * Distances, flat angles and torsion angles in the HiRE-RNA model
+         * Sequence-dependent geometric parameters of the basepairs for all non-canonical basepairs in the HiRE-RNA model.
+       The data is saved as JSON files of parameters, and numerous figures are produced to illustrate the distributions.
+       The number of Gaussians to use in each GMM is hard-coded in geometric_stats.py after our first estimation. If you do not want to trust this estimation,
+       you can ignore it with option --rescan-nmodes. An exploration of the number of Gaussians from 1 to 8 will be performed, and the best GMM will be kept. 
+ 
+ BUG CORRECTIONS
+     - New code file geometric_stats.py
+     - New automation script that starts from scratch
+     - Many small fixes, leading to the support of many previously "known issues"
+     - Performance tweaks
+ 
+ TECHNICAL CHANGES
+     - Switched to DSSR Pro.
+     - Switched to esl-alimerge instead of cmalign --merge to merge alignments.
+     - Tested successfully with Python 3.9.6 + BioPython 1.79. 
+       However, the production server still runs with Python 3.8.1 + BioPython 1.78.
+ 
+ ############################################################################################
 v 1.5 beta, April 2021
 
 FEATURE CHANGES
--- a/LICENSE
View file @ad0e234
+++ b/LICENSE
View file @ad0e234
 MIT License
 
- Copyright (c) 2019 Louis Becquey
+ Copyright (c) 2019-2021 IBISC, Université Paris Saclay
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/README.md
View file @ad0e234
+++ b/README.md
View file @ad0e234
@@ -10,6 +10,7 @@ Contents:
 * [Database tables documentation](doc/Database.md)
 * [FAQ](doc/FAQ.md)
 * [Troubleshooting](#troubleshooting)
+ * [Known Issues and Feature Requests](doc/KnownIssues.md)
 * [Contact](#contact)
 
 ## Cite us
@@ -18,15 +19,13 @@ Contents:
 
 Additional relevant references:
 
- The "ProteinNet" philosophy which inspired this work:
- * AlQuraishi, M. (2019b). **ProteinNet: A standardized data set for machine learning of protein structure.** *BMC Bioinformatics*, 20(1), 311
- 
 If you use our annotations by DSSR, you might want to cite:
 * Lu, X.-J.et al.(2015). **DSSR: An integrated software tool for dissecting the spatial structure of RNA.** *Nucleic Acids Research*, 43(21), e142–e142.
 
 If you use our multiple sequence alignments and homology data, you might want to cite:
- * Pruesse, E. et al.(2012). **Sina: accurate high-throughput multiple sequence alignment of ribosomal RNA genes.** *Bioinformatics*, 28(14), 1823–1829
 * Nawrocki, E. P. and Eddy, S. R. (2013). **Infernal 1.1: 100-fold faster RNA homology searches.** *Bioinformatics*, 29(22), 2933–2935.
+ * Pruesse, E. et al.(2012). **Sina: accurate high-throughput multiple sequence alignment of ribosomal RNA genes.** *Bioinformatics*, 28(14), 1823–1829
+ 
 
 
 # What is RNANet ?
@@ -39,7 +38,8 @@ Most interestingly, nucleotides have been renumered in a standardized way, and t
 
 ## Methodology
 We use the Rfam mappings between 3D structures and known Rfam families, using the sequences that are known to belong to an Rfam family (hits provided in RF0XXXX.fasta files from Rfam).
- Future versions might compute a real MSA-based clusering directly with Rfamseq ncRNA sequences, like ProteinNet does with protein sequences, but this requires a tool similar to jackHMMER in the Infernal software suite, which is not available yet.
+ Future versions might compute a real MSA-based clustering directly with Rfamseq ncRNA sequences, like ProteinNet does with protein sequences, but this requires a tool similar to jackHMMER in the Infernal software suite, which is not available yet. 
+ If you are interested in such approaches, you may check tools like RNAlien.
 
 This script prepares the dataset from available public data in PDB, RNA 3D Hub, Rfam and SILVA.
 
@@ -48,15 +48,16 @@ This script prepares the dataset from available public data in PDB, RNA 3D Hub, 
 The script follows these steps:
 
 To gather structures:
- * Gets a list of 3D structures containing RNA from BGSU's non-redundant list (but keeps the redundant structures /!\\),
+ * Gets a list of 3D structures containing RNA from BGSU's non-redundant list (redundancy can be kept or eliminated, see command line option `--redundant`),
 * Asks Rfam for mappings of these structures onto Rfam families (~50% of structures have a direct mapping, some more are inferred using the redundancy list)
 * Downloads the corresponding 3D structures (mmCIFs)
- * If desired, extracts the right chain portions that map onto an Rfam family to a separate mmCIF file
+ * Standardizes the residue numbering from 1 to N, including missing residues (gaps)
+ * If desired, extracts the renumbered chain portions that map onto an Rfam family to a separate mmCIF file
 
 To compute homology information:
- * Extract the sequence for every 3D chain
+ * Extracts the sequence of every 3D chain
 * Downloads Rfamseq ncRNA sequence hits for the concerned Rfam families (or ARB databases of SSU or LSU sequences from SILVA for rRNAs)
- * Realigns Rfamseq hits and sequences from the 3D structures together to obtain a multiple sequence alignment for each Rfam family (using `cmalign --cyk`, except for ribosomal LSU and SSU, where SINA is used)
+ * Realigns Rfamseq hits and sequences from the 3D structures together to obtain a multiple sequence alignment for each Rfam family (using `cmalign`, but SINA can be used for ribosomal LSU and SSU)
 * Computes nucleotide frequencies at every position for each alignment
 * Maps each nucleotide of a 3D chain to its position in the corresponding family sequence alignment
 
@@ -65,6 +66,15 @@ To compute 3D annotations:
 
 Finally, export this data from the SQLite database into flat CSV files.
 
+ Statistical analysis of the structures:
+ * Computes statistics about the amount of data from various resolutions and experimental methods (by RNA family)
+ * Computes basic statistics about the frequency of (modified) nucleotides by chain and by family,
+ * Computes basic statistics about the frequencies of non-canonical interactions,
+ * Computes density estimations (using Gaussian mixtures) for various geometrical parameters like distances and torsion angles for different representations : all-atom, the Pyle/VFold model, and the HiRE-RNA model,
+ * Computes pairwise residue distance matrices for each chain, and average + std-dev by RNA family
+ * Computes sequence identity matrices for each RNA family (based on the alignments)
+ * Saves covariance models (Infernal .cm files) for each RNA family
+ 
 ## Data provided
 
 We provide a couple of resources to exploit this dataset. You can download them on [EvryRNA](https://evryrna.ibisc.univ-evry.fr/evryrna/rnanet/rnanet_home).
--- a/RNAnet.py
View file @ad0e234
+++ b/RNAnet.py
View file @ad0e234
--- a/doc/Errors.md
View file @ad0e234
+++ b/doc/Errors.md
View file @ad0e234
- 
 # Warnings and errors in RNANet
 
 Use Ctrl + F on this page to look for your error message in the list.
@@ -27,7 +26,7 @@ DSSR complains because the CIF structure does not seem to contain nucleotides. T
 
 * **Error downloading and/or extracting Rfam.cm !** : We cannot retrieve the Rfam covariance models file. RNANet tries to find it at ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.cm.gz so, check that your network is not blocking the FTP protocol (port 21 is open on your network), and check that the address has not changed. If so, contact us so that we update RNANet with the correct address.
 
- * **Something's wrong with the SQL database. Check mysql-rfam-public.ebi.ac.uk status and try again later. Not printing statistics.** : We cannot retrieve family statistics from Rfam public server. Check if you can connect to it by hand : `mysql -u rfamro -P 4497 -D Rfam -h mysql-rfam-public.ebi.ac.uk`. if not, check that the port 497 is opened on your network.
+ * **Something's wrong with the SQL database. Check mysql-rfam-public.ebi.ac.uk status and try again later. Not printing statistics.** : We cannot retrieve family statistics from Rfam public server. Check if you can connect to it by hand : `mysql -u rfamro -P 4497 -D Rfam -h mysql-rfam-public.ebi.ac.uk`. If not, check that port 4497 is open on your network.
 
 * **Error downloading RFXXXXX.fa.gz: {custom-error}** : We cannot reach the Rfam FTP server to download homologous sequences. We look in ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/fasta_files/ so, check if you can access it from your network (check that port 21 is opened on your network). Check if the address has changed and notify us.
 
--- a/doc/FAQ.md
View file @ad0e234
+++ b/doc/FAQ.md
View file @ad0e234
@@ -7,6 +7,15 @@ In `cmalign` alignments, - means a nucleotide is missing compared to the covaria
 
 In the final filtered alignment that we provide for download, the same rule applies, but on top of that, some '.' are replaced by '-' when a gap in the 3D structure (a missing, unresolved nucleotide) is mapped to an insertion gap.
 
+ * **What are the cmalign options for ?**
+ 
+ From Infernal's user guide, we can quote that Infernal uses an HMM banding technique to accelerate alignment by default. It also takes care of 3' or 5' truncated sequences to be aligned correctly (and we have some).
+ First, one can choose an algorithm, between `--optacc` (maximizing posterior probabilities, the default) and `--cyk` (maximizing likelihood).
+ 
+ Then, the use of bands allows faster and more memory efficient computation, at the price of losing the guarantee of finding the optimal alignment. Bands can be disabled using the `--nonbanded` option. A better idea would be to control the threshold of probability mass to be considered negligible during HMM band calculation with the `--tau` parameter. Higher values of Tau yield greater speedups and lower memory usage, but a greater chance to miss the optimal alignment. In practice, the algorithm explores several Tau values (increasing it by a factor 2.0 from the original `--tau` value) until the DP matrix size falls below the threshold given by `--mxsize` (default 1028 Mb) or the value of `--maxtau` is reached (in this case, the program fails). One can disable this exploration with option `--fixedtau`. The default value of `--tau` is 1e-7, the default `--maxtau` is 0.05. Basically, you may decide on a value of `--mxsize` by dividing your available RAM by the number of cores used with cmalign. If necessary, you may use fewer cores than you have, using option `--cpu`.
+ 
+ Finally, if using `--cyk --nonbanded --notrunc --noprob`, one can use the `--small` option to align using the divide-and-conquer CYK algorithm from Eddy 2002, requiring very little memory but a lot of time. The major drawback of this is that it requires `--notrunc` and `--noprob`, so we give up on the correct alignment of truncated sequences, and the computation of posterior probabilities.
+ 
 * **Why are there some gap-only columns in the alignment ?**
 
 These columns are not completely gap-only, they contain at least one dash-gap '-'. This means an actual, physical nucleotide which should exist in the 3D structure should be located there. The previous and following nucleotides are **not** contiguous in space in 3D.
@@ -31,5 +40,5 @@ We first remove the nucleotides whose number is outside the family mapping (if a
 
 * **What are the versions of the dependencies you use ?**
 
- `cmalign` is v1.1.4, `sina` is v1.6.0, `x3dna-dssr` is v1.9.9, Biopython is v1.78.
+ `cmalign` is v1.1.4, `sina` is v1.6.0, `x3dna-dssr` is v2.3.2-2021jun29, Biopython is v1.78.
     
\ No newline at end of file
--- a/doc/INSTALL.md
View file @ad0e234
+++ b/doc/INSTALL.md
View file @ad0e234
--- a/doc/KnownIssues.md
View file @ad0e234
+++ b/doc/KnownIssues.md
View file @ad0e234
 # Known Issues
 
 ## Annotation and numbering issues
- * Some GDPs that are listed as HETATMs in the mmCIF files are not detected correctly to be real nucleotides. (e.g. 1e8o-E)
+ * [SOLVED] Some GDPs that are listed as HETATMs in the mmCIF files are not detected correctly to be real nucleotides. (e.g. 1e8o-E)
 * Some chains are truncated in different pieces with different chain names. Reason unknown (e.g. 6ztp-AX)
- * Some chains are not correctly renamed A in the produced separate files (e.g. 1d4r-B)
+ * [SOLVED] Some chains are not correctly renamed A in the produced separate files (e.g. 1d4r-B)
 
 ## Alignment issues
- * [SOLVED] Filtered alignments are shorter than the number of alignment columns saved to the SQL table `align_column`
- * Chain names appear in triple in the FASTA header (e.g. 1d4r[1]-B 1d4r[1]-B 1d4r[1]-B)
- 
- ## Technical running issues
- * [SOLVED] Files produced by Docker containers are owned by root and require root permissions to be read 
- * [SOLVED] SQLite WAL files are not deleted properly
+ * [SOLVED] Chain names appear in triple in the FASTA header (e.g. 1d4r[1]-B 1d4r[1]-B 1d4r[1]-B)
 
 # Known feature requests
- * [DONE] Get filtered versions of the sequence alignments containing the 3D chains, publicly available for download
- * [DONE] Get a consensus residue for each alignement column
- * [DONE] Get an option to limit the number of cores 
- * [DONE] Move to SILVA LSU release 138.1
- * [UPCOMING] Automated annotation of detected Recurrent Interaction Networks (RINs), see http://carnaval.lri.fr/ .
- * [UPCOMING] Possibly, automated detection of HLs and ILs from the 3D Motif Atlas (BGSU). Maybe. Their own website already does the job.
- * [UPCOMING] Weight sequences in alignment to give more importance to rarer sequences 
- * [UPCOMING] Give both gap_percent and insertion_gap_percent
+ * Automated annotation of detected Recurrent Interaction Networks (RINs), see http://carnaval.lri.fr/ .
+ * Possibly, automated detection of HLs and ILs from the 3D Motif Atlas (BGSU). Maybe. Their own website already does the job.
+ * Weight sequences in alignment to give more importance to rarer sequences 
+ * Give both gap_percent and insertion_gap_percent
 * A field estimating the quality of the sequence alignment in table family.
 * Possibly, more metrics about the alignments coming from Infernal.
 * Run cmscan ourselves from the NDB instead of using Rfam-PDB mappings ? (Iff this actually makes a real difference, untested yet)
 * Use and save Infernal alignment bounds and truncation information
+ * Save if a chain is a representative or not in BGSU list, so that they can be filtered easily
+ * Annotate unstructured regions (on a nucleotide basis)
+ 
+ ## Technical to-do list
+ * `cmalign --merge` is now deprecated, we use `esl-alimerge` instead. But, esl is a single-core process. We should run the merges of alignments of different families in parallel to save some time [TODO]. 
--- a/geometric_stats.py 0 → 100644
View file @ad0e234
+++ b/geometric_stats.py 0 → 100644
View file @ad0e234
--- a/known_issues.txt
View file @ad0e234
+++ b/known_issues.txt
View file @ad0e234
- 6ydp_1_AA_1176-2737
- 6ydw_1_AA_1176-2737
- 2z9q_1_A_1-72
- 1ml5_1_b_5-121
- 1ml5_1_a_1-2914
- 3ep2_1_Y_1-72
- 3eq3_1_Y_1-72
- 4v48_1_A6_1-73
- 1ml5_1_A_2-1520
- 1qzb_1_B_1-73
- 1qza_1_B_1-73
- 1ls2_1_B_1-73
- 1gsg_1_T_1-72
- 7d1a_1_A_805-902
- 7d0g_1_A_805-913
- 7d0f_1_A_817-913
- 3jcr_1_H_1-115
- 1vy7_1_AY_1-73
- 1vy7_1_CY_1-73
- 4w2h_1_CY_1-73
- 5zzm_1_M_3-118
- 2rdo_1_A_3-118
- 4v48_1_A9_3-118
- 4v47_1_A9_3-118
- 2ob7_1_A_10-319
- 1x1l_1_A_1-130
- 1zc8_1_Z_1-91
- 2ob7_1_D_1-130
- 4v42_1_BA_1-2914
- 4v42_1_BB_5-121
- 1r2x_1_C_1-58
- 1r2w_1_C_1-58
- 1eg0_1_L_1-56
- 3dg2_1_A_1-1542
- 3dg0_1_A_1-1542
- 4v48_1_BA_1-1543
- 4v47_1_BA_1-1542
- 3dg4_1_A_1-1542
- 3dg5_1_A_1-1542
- 5zzm_1_N_1-2903
- 2rdo_1_B_1-2904
- 3dg2_1_B_1-2904
- 3dg0_1_B_1-2904
- 4v48_1_A0_1-2904
- 4v47_1_A0_1-2904
- 3dg4_1_B_1-2904
- 3dg5_1_B_1-2904
- 1eg0_1_O_1-73
- 1zc8_1_A_1-59
- 1jgq_1_A_2-1520
- 4v42_1_AA_2-1520
- 1jgo_1_A_2-1520
- 1jgp_1_A_2-1520
- 1mvr_1_D_1-59
- 4c9d_1_D_29-1
- 4c9d_1_C_29-1
- 4adx_1_9_1-121
- 1zn1_1_B_1-59
- 1emi_1_B_1-108
- 3iy9_1_A_498-1027
- 3ep2_1_B_1-50
- 3eq3_1_B_1-50
- 3eq4_1_B_1-50
- 3pgw_1_R_1-164
- 3pgw_1_N_1-164
- 3cw1_1_x_1-138
- 3cw1_1_w_1-138
- 3cw1_1_V_1-138
- 3cw1_1_v_1-138
- 2iy3_1_B_9-105
- 3jcr_1_N_1-106
- 2vaz_1_A_64-177
- 2ftc_1_R_81-1466
- 3jcr_1_M_1-141
- 4v5z_1_B0_1-2902
- 5g2x_1_A_595-692
- 3iy8_1_A_1-540
- 4v5z_1_BY_2-113
- 4v5z_1_BZ_1-70
- 4v5z_1_B1_2-123
- 1mvr_1_B_1-96
- 4adx_1_0_1-2923
- 3eq4_1_Y_1-69
- 7a5p_1_2_259-449
- 6uz7_1_8_2140-2825
- 4v5z_1_AA_1-1563
 6cfj_1_1X
 6cfj_1_2X
 5hcq_1_1X
@@ -196,7 +110,6 @@
 5lzb_1_V
 6h58_1_W
 6h58_1_WW
- 1eg0_1_O
 5j8b_1_X
 4v7j_1_AV
 4v7j_1_BV
@@ -224,10 +137,6 @@
 7k00_1_B
 6ys3_1_A
 6qdw_1_A
- 5zzm_1_M
- 2rdo_1_A
- 4v48_1_A9
- 4v47_1_A9
 6hcj_1_Q3
 6hcq_1_Q3
 6o8w_1_U
@@ -295,7 +204,12 @@
 6ucq_1_2Y
 4w2e_1_X
 6ucq_1_2X
+ 7n1p_1_DT
+ 7n2u_1_DT
 6yss_1_W
+ 7n30_1_DT
+ 7n31_1_DT
+ 7n2c_1_DT
 5afi_1_Y
 5uq8_1_Z
 5wdt_1_Y
@@ -321,18 +235,20 @@
 4v4i_1_Y
 5uq8_1_X
 5uq7_1_X
- 1jgq_1_A
- 4v42_1_AA
- 1jgo_1_A
- 1jgp_1_A
 4v4j_1_W
 4v4i_1_W
- 4v42_1_BA
 4wt8_1_CS
 4wt8_1_DS
 4v4j_1_X
 4v4i_1_X
- 4v42_1_BB
+ 6lkq_1_S
+ 5h5u_1_H
+ 7d6z_1_F
+ 5lze_1_Y
+ 5lze_1_V
+ 5lze_1_X
+ 3jcj_1_G
+ 6o7k_1_G
 6d30_1_C
 6j7z_1_C
 3er9_1_D
@@ -367,20 +283,11 @@
 4oq9_1_1
 6rt5_1_A
 6rt5_1_E
- 4qu6_1_B
 6lkq_1_T
 6ys3_1_B
 6qdw_1_B
 3jbv_1_B
 3jbu_1_B
- 5zzm_1_N
- 2rdo_1_B
- 3dg2_1_B
- 3dg0_1_B
- 4v48_1_A0
- 4v47_1_A0
- 3dg4_1_B
- 3dg5_1_B
 6do8_1_B
 6dpi_1_B
 6dp9_1_B
@@ -437,25 +344,17 @@
 6doc_1_B
 6doe_1_B
 6n6g_1_D
- 6lkq_1_S
- 5h5u_1_H
- 7d6z_1_F
- 5lze_1_Y
- 5lze_1_V
- 5lze_1_X
- 3jcj_1_G
- 6o7k_1_G
- 3dg2_1_A
- 3dg0_1_A
- 4v48_1_BA
- 4v47_1_BA
- 3dg4_1_A
- 3dg5_1_A
 4b3r_1_W
 4b3t_1_W
 4b3s_1_W
+ 7b5k_1_X
 5o2r_1_X
 5kcs_1_1X
+ 7n1p_1_PT
+ 7n2u_1_PT
+ 7n30_1_PT
+ 7n31_1_PT
+ 7n2c_1_PT
 6zvk_1_E2
 6zvk_1_H2
 7a01_1_E2
@@ -549,15 +448,9 @@
 6xzb_1_G2
 6gz5_1_BW
 6gz3_1_BW
- 1qzb_1_B
- 1qza_1_B
- 1ls2_1_B
- 3ep2_1_Y
- 3eq3_1_Y
- 4v48_1_A6
- 2z9q_1_A
 4hot_1_X
 6d2z_1_C
+ 7eh0_1_I
 4tu0_1_F
 4tu0_1_G
 6r9o_1_B
@@ -572,37 +465,38 @@
 6sv4_1_MB
 7nrd_1_SM
 6i7o_1_MB
- 1gsg_1_T
 6zvi_1_D
 6sv4_1_NB
 6sv4_1_NC
 6i7o_1_NB
- 1ml5_1_A
+ 7nsq_1_V
+ 7nsp_1_V
 6swa_1_Q
 6swa_1_R
- 3j6x_1_IR
- 3j6y_1_IR
 6ole_1_T
 6om0_1_T
 6oli_1_T
 6om7_1_T
 6olf_1_T
 6w6l_1_T
+ 6tnu_1_M
+ 5mc6_1_M
+ 7nrc_1_SM
 6tb3_1_N
 7b7d_1_SM
 7b7d_1_SN
 6tnu_1_N
+ 7nrc_1_SN
 7nrd_1_SN
 6zot_1_C
+ 4qu6_1_B
 2uxb_1_X
 2x1f_1_B
 2x1a_1_B
- 3ep2_1_D
- 3eq3_1_D
- 1eg0_1_M
- 3eq4_1_D
 5o1y_1_B
- 3jcr_1_H
+ 4kzy_1_I
+ 4kzz_1_I
+ 4kzx_1_I
 6dzi_1_H
 5zeu_1_A
 6evj_1_N
@@ -705,7 +599,6 @@
 6ip6_1_ZZ
 6uu3_1_333
 6uu1_1_333
- 1pn8_1_D
 3er8_1_H
 3er8_1_G
 3er8_1_F
@@ -744,9 +637,8 @@
 4wtl_1_T
 4wtl_1_P
 1xnq_1_W
- 1x18_1_C
- 1x18_1_B
- 1x18_1_D
+ 7n2v_1_DT
+ 4peh_1_Z
 1vq6_1_4
 4am3_1_D
 4am3_1_H
@@ -764,6 +656,38 @@
 3rtj_1_D
 6ty9_1_M
 6tz1_1_N
+ 6q1h_1_D
+ 6q1h_1_H
+ 6p7p_1_F
+ 6p7p_1_E
+ 6p7p_1_D
+ 6vm6_1_J
+ 6vm6_1_G
+ 6wan_1_K
+ 6wan_1_H
+ 6wan_1_G
+ 6wan_1_L
+ 6wan_1_I
+ 6ywo_1_F
+ 6wan_1_J
+ 4oau_1_A
+ 6ywo_1_E
+ 6ywo_1_K
+ 6vm6_1_I
+ 6vm6_1_H
+ 6ywo_1_I
+ 2a1r_1_C
+ 6m6v_1_F
+ 6m6v_1_E
+ 2a1r_1_D
+ 3gpq_1_E
+ 3gpq_1_F
+ 6o79_1_C
+ 6vm6_1_K
+ 6m6v_1_G
+ 6hyu_1_D
+ 1laj_1_R
+ 6ybv_1_K
 6sce_1_B
 6xl1_1_C
 6scf_1_I
@@ -809,31 +733,20 @@
 1y1y_1_P
 5zuu_1_I
 5zuu_1_G
+ 7am2_1_R1
 4peh_1_W
 4peh_1_V
 4peh_1_X
 4peh_1_Y
- 4peh_1_Z
+ 7d8c_1_C
 6mkn_1_W
 7kl3_1_B
 4cxg_1_C
 4cxh_1_C
- 1x1l_1_A
- 1zc8_1_Z
- 2ob7_1_D
- 2ob7_1_A
 4eya_1_E
 4eya_1_F
 4eya_1_Q
 4eya_1_R
- 1qzc_1_B
- 1t1o_1_B
- 1mvr_1_C
- 1t1m_1_B
- 1t1o_1_C
- 1t1m_1_A
- 1t1o_1_A
- 2r1g_1_B
 4ht9_1_E
 6z1p_1_AB
 6z1p_1_AA
@@ -844,19 +757,14 @@
 5uk4_1_W
 5uk4_1_U
 5f6c_1_E
+ 7nwh_1_HH
 4rcj_1_B
 1xnr_1_W
- 2agn_1_A
- 2agn_1_C
- 2agn_1_B
 6e0o_1_C
 6o75_1_D
 6o75_1_C
 6e0o_1_B
 3j06_1_R
- 1r2x_1_C
- 1r2w_1_C
- 1eg0_1_L
 4eya_1_G
 4eya_1_H
 4eya_1_S
@@ -866,8 +774,7 @@
 1ibm_1_Z
 4dr5_1_V
 4d61_1_J
- 1trj_1_B
- 1trj_1_C
+ 7nwg_1_Q3
 5tbw_1_SR
 6hhq_1_SR
 6zvi_1_H
@@ -909,14 +816,8 @@
 6ppn_1_I
 5flx_1_Z
 6eri_1_AX
+ 7k5l_1_R
 7d80_1_Y
- 1zc8_1_A
- 1zc8_1_C
- 1zc8_1_B
- 1zc8_1_G
- 1zc8_1_I
- 1zc8_1_H
- 1zc8_1_J
 7du2_1_R
 4v8z_1_CX
 6kqe_1_I
@@ -930,7 +831,6 @@
 4xlr_1_Q
 6sty_1_C
 6sty_1_F
- 2xs5_1_D
 3ok4_1_N
 3ok4_1_L
 3ok4_1_Z
@@ -973,19 +873,17 @@
 3ol7_1_H
 3ol8_1_L
 3ol8_1_P
- 1qzc_1_C
- 1qzc_1_A
 6yrq_1_E
 6yrq_1_H
 6yrq_1_G
 6yrq_1_F
 6yrb_1_C
 6yrb_1_D
- 1mvr_1_D
 6gz5_1_BV
 6gz4_1_BV
 6gz3_1_BV
 6fti_1_Q
+ 7njc_1_B
 4v7e_1_AB
 4v7e_1_AE
 4v7e_1_AD
@@ -997,9 +895,7 @@
 3t1h_1_W
 3t1y_1_W
 1xmo_1_W
- 4adx_1_9
 6kr6_1_B
- 1zn1_1_B
 6z8k_1_X
 4csf_1_U
 4csf_1_Q
@@ -1025,7 +921,6 @@
 2xpj_1_D
 2vrt_1_H
 2vrt_1_G
- 1emi_1_B
 6r9m_1_B
 4nia_1_C
 4nia_1_A
@@ -1051,45 +946,23 @@
 1uvn_1_F
 1uvn_1_B
 1uvn_1_D
- 3iy9_1_A
 4wtk_1_T
 4wtk_1_P
 1vqn_1_4
 4oav_1_C
 4oav_1_A
- 3ep2_1_E
- 3eq3_1_E
- 3eq4_1_E
- 3ep2_1_A
- 3eq3_1_A
- 3eq4_1_A
- 3ep2_1_C
- 3eq3_1_C
- 3eq4_1_C
- 3ep2_1_B
- 3eq3_1_B
- 3eq4_1_B
 4i67_1_B
- 3pgw_1_R
- 3pgw_1_N
- 3cw1_1_X
- 3cw1_1_W
- 3cw1_1_V
- 7b0y_1_A
 6k32_1_T
 6k32_1_P
 5mmj_1_A
 5x8r_1_A
- 2agn_1_E
- 2agn_1_D
- 4v5z_1_BD
 6yw5_1_AA
 6ywe_1_AA
 6ywy_1_AA
 6ywx_1_AA
 3nvk_1_G
 3nvk_1_S
- 2iy3_1_B
+ 1cwp_1_D
 1cwp_1_F
 5z4j_1_B
 5gmf_1_E
@@ -1129,7 +1002,6 @@
 4kzz_1_J
 7a09_1_F
 5t2c_1_AN
- 4v5z_1_BF
 3j6b_1_E
 4v4f_1_B6
 4v4f_1_A5
@@ -1153,21 +1025,21 @@
 4v4f_1_B4
 4v4f_1_A6
 4v4f_1_B2
+ 7m4y_1_V
+ 7m4x_1_V
+ 6v3a_1_V
+ 6v39_1_V
 5it9_1_I
 7jqc_1_I
 5zsb_1_C
 5zsb_1_D
 5zsn_1_D
 5zsn_1_E
- 1cwp_1_D
- 3jcr_1_N
 6gfw_1_R
- 2vaz_1_A
 6zm6_1_X
 6zm5_1_X
 6zm6_1_W
 6zm5_1_W
- 4v5z_1_BP
 6n6e_1_D
 4g7o_1_I
 4g7o_1_S
@@ -1177,11 +1049,9 @@
 5uh6_1_I
 6l74_1_I
 5uh9_1_I
- 2ftc_1_R
 7a5j_1_X
 6sag_1_R
 4udv_1_R
- 2r1g_1_E
 5zsc_1_D
 5zsc_1_C
 6woy_1_I
@@ -1209,7 +1079,6 @@
 3m85_1_X
 3m85_1_Z
 3m85_1_Y
- 1e8s_1_C
 5wnp_1_B
 5wnv_1_B
 5yts_1_B
@@ -1232,8 +1101,11 @@
 6ij2_1_E
 3u2e_1_D
 3u2e_1_C
+ 7eh1_1_I
 5uef_1_C
 5uef_1_D
+ 7eh2_1_R
+ 7eh2_1_I
 4x4u_1_H
 4afy_1_D
 6oy5_1_I
@@ -1249,8 +1121,6 @@
 4k4s_1_H
 4k4t_1_H
 4k4t_1_D
- 1zn1_1_C
- 1zn0_1_C
 1xpu_1_G
 1xpu_1_L
 1xpr_1_L
@@ -1275,6 +1145,7 @@
 6gc5_1_H
 6gc5_1_G
 1n1h_1_B
+ 7n2v_1_PT
 4ohz_1_B
 6t83_1_6B
 4gv6_1_C
@@ -1287,14 +1158,11 @@
 6qx3_1_G
 2xnr_1_C
 4gkj_1_W
- 4v5z_1_BC
 5y88_1_X
- 4v5z_1_BB
 3j0o_1_H
 3j0l_1_H
 3j0p_1_H
 3j0q_1_H
- 4v5z_1_BH
 3j0o_1_F
 3j0l_1_F
 3j0p_1_F
@@ -1309,7 +1177,6 @@
 3j0l_1_A
 3j0q_1_A
 3j0p_1_A
- 4v5z_1_BJ
 6ys3_1_V
 6qdw_1_V
 5hk0_1_F
@@ -1345,14 +1212,10 @@
 5mrc_1_BB
 5mre_1_BB
 5mrf_1_BB
- 4v5z_1_BN
 3j46_1_P
- 3jcr_1_M
 4e6b_1_A
 4e6b_1_B
 6a6l_1_D
- 4v5z_1_BS
- 4v8t_1_1
 1uvi_1_D
 1uvi_1_F
 1uvi_1_E
@@ -1376,10 +1239,7 @@
 6ip5_1_2M
 6ip6_1_2M
 6qcs_1_M
- 486d_1_G
- 2r1g_1_C
- 486d_1_F
- 4v5z_1_B0
+ 7b5k_1_Z
 4nia_1_O
 4nia_1_J
 4nia_1_K
@@ -1391,13 +1251,11 @@
 4oq9_1_F
 4oq9_1_L
 6r9q_1_B
+ 7m4u_1_A
 6v3a_1_SN1
 6v3b_1_SN1
 6v39_1_SN1
 6v3e_1_SN1
- 1pn7_1_C
- 1mj1_1_Q
- 1mj1_1_R
 4dr6_1_V
 6kql_1_I
 4eya_1_M
@@ -1437,13 +1295,20 @@
 6ow3_1_I
 6ovy_1_I
 6oy6_1_I
- 4bbl_1_Y
- 4bbl_1_Z
 4qvd_1_H
 5gxi_1_B
- 3iy8_1_A
- 6tnu_1_M
- 5mc6_1_M
+ 7n06_1_G
+ 7n06_1_H
+ 7n06_1_I
+ 7n06_1_J
+ 7n06_1_K
+ 7n06_1_L
+ 7n33_1_G
+ 7n33_1_H
+ 7n33_1_I
+ 7n33_1_J
+ 7n33_1_K
+ 7n33_1_L
 5mc6_1_N
 4eya_1_O
 4eya_1_P
@@ -1453,33 +1318,13 @@
 6htq_1_W
 6htq_1_U
 6uu6_1_333
- 6v3a_1_V
- 6v39_1_V
 5a0v_1_F
 3avt_1_T
 6d1v_1_C
 4s2x_1_B
 4s2y_1_B
 5wnu_1_B
- 1zc8_1_F
 1vtm_1_R
- 4v5z_1_BA
- 4v5z_1_BE
- 4v5z_1_BG
- 4v5z_1_BI
- 4v5z_1_BK
- 4v5z_1_BM
- 4v5z_1_BL
- 4v5z_1_BV
- 4v5z_1_BO
- 4v5z_1_BQ
- 4v5z_1_BR
- 4v5z_1_BT
- 4v5z_1_BU
- 4v5z_1_BW
- 4v5z_1_BY
- 4v5z_1_BX
- 4v5z_1_BZ
 5elt_1_F
 5elt_1_E
 6xlj_1_R
@@ -1492,11 +1337,11 @@
 6bk8_1_I
 4cxg_1_B
 4cxh_1_B
- 4v5z_1_B1
 5z4d_1_B
 6o78_1_E
 6xa1_1_BV
 6ha8_1_X
+ 2xs5_1_D
 1m8w_1_E
 1m8w_1_F
 5udi_1_B
@@ -1525,11 +1370,13 @@
 3rzo_1_R
 2f4v_1_Z
 1qln_1_R
+ 3cw1_1_X
+ 3cw1_1_W
+ 7b0y_1_A
 6ogy_1_M
 6ogy_1_N
 6uej_1_B
 6ywy_1_BB
- 1x18_1_A
 5ytx_1_B
 4g0a_1_H
 6r9p_1_B
@@ -1559,11 +1406,6 @@
 5lzc_1_W
 5lzb_1_W
 3wzi_1_C
- 1mvr_1_E
- 1mvr_1_B
- 1mvr_1_A
- 4adx_1_0
- 4adx_1_8
 1n33_1_Z
 6dti_1_W
 3d2s_1_F
@@ -1572,12 +1414,7 @@
 5mre_1_AA
 5mrf_1_AA
 7jhy_1_Z
- 2r1g_1_A
- 2r1g_1_D
- 2r1g_1_F
- 3eq4_1_Y
 4wkr_1_C
- 2r1g_1_X
 4v99_1_EC
 4v99_1_AC
 4v99_1_BH
@@ -1647,38 +1484,6 @@
 2xs7_1_B
 1n38_1_B
 4qvc_1_G
- 6q1h_1_D
- 6q1h_1_H
- 6p7p_1_F
- 6p7p_1_E
- 6p7p_1_D
- 6vm6_1_J
- 6vm6_1_G
- 6wan_1_K
- 6wan_1_H
- 6wan_1_G
- 6wan_1_L
- 6wan_1_I
- 6ywo_1_F
- 6wan_1_J
- 4oau_1_A
- 6ywo_1_E
- 6ywo_1_K
- 6vm6_1_I
- 6vm6_1_H
- 6ywo_1_I
- 2a1r_1_C
- 6m6v_1_F
- 6m6v_1_E
- 2a1r_1_D
- 3gpq_1_E
- 3gpq_1_F
- 6o79_1_C
- 6vm6_1_K
- 6m6v_1_G
- 6hyu_1_D
- 1laj_1_R
- 6ybv_1_K
 6mpf_1_W
 6spc_1_A
 6spe_1_A
@@ -1692,43 +1497,36 @@
 4g0a_1_E
 2b2d_1_S
 5hkc_1_C
- 4kzy_1_I
- 4kzz_1_I
- 4kzx_1_I
 1rmv_1_B
 4qu7_1_X
 4qu7_1_V
 4qu7_1_U
- 4v5z_1_AH
- 4v5z_1_AA
- 4v5z_1_AB
- 4v5z_1_AC
- 4v5z_1_AD
- 4v5z_1_AE
- 4v5z_1_AF
- 4v5z_1_AG
 6pmi_1_3
 6pmj_1_3
 5hjz_1_C
- 7nrc_1_SM
- 7nrc_1_SN
- 7am2_1_R1
- 7k5l_1_R
- 7b5k_1_X
- 7d8c_1_C
- 7m4y_1_V
- 7m4x_1_V
- 7b5k_1_Z
- 7m4u_1_A
- 7n06_1_G
- 7n06_1_H
- 7n06_1_I
- 7n06_1_J
- 7n06_1_K
- 7n06_1_L
- 7n33_1_G
- 7n33_1_H
- 7n33_1_I
- 7n33_1_J
- 7n33_1_K
- 7n33_1_L
+ 6ydp_1_AA_1176-2737
+ 6ydw_1_AA_1176-2737
+ 1vy7_1_AY_1-73
+ 1vy7_1_CY_1-73
+ 4w2h_1_CY_1-73
+ 7d1a_1_A_805-902
+ 7d0g_1_A_805-913
+ 7d0f_1_A_817-913
+ 7o7z_1_AH_144-220
+ 4c9d_1_D_29-1
+ 4c9d_1_C_29-1
+ 7aih_1_1_2400-2963
+ 7aih_1_1_2984-3610
+ 7ane_1_2_1904-2468
+ 7ane_1_2_2489-3115
+ 5g2x_1_A_595-692
+ 7aor_1_2_2020-2579
+ 7aor_1_2_2589-3210
+ 7a5p_1_2_259-449
+ 7aor_1_A_2020-2579
+ 7aor_1_A_2589-3210
+ 7am2_1_1_1904-2470
+ 7am2_1_1_2491-3117
+ 7ane_1_1_1904-2468
+ 7ane_1_1_2489-3115
+ 6uz7_1_8_2140-2825
--- a/known_issues_reasons.txt
View file @ad0e234
+++ b/known_issues_reasons.txt
View file @ad0e234
--- a/scripts/automate.sh
View file @ad0e234
+++ b/scripts/automate.sh
View file @ad0e234
@@ -5,7 +5,7 @@ rm -rf latest_run.log errors.txt
 
 # Run RNANet
 bash -c 'time python3.8 ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 --no-homology --redundant --extract' > latest_run.log 2>&1
- bash -c 'time python3.8 ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 --redundant --sina --extract -s --stats-opts="--wadley --distance-matrices" --archive' > latest_run.log 2>&1
+ bash -c 'time python3.8 ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 --redundant --extract -s --stats-opts="-r 20.0 --wadley --hire-rna --distance-matrices" --archive' >> latest_run.log 2>&1
 echo 'Compressing RNANet.db.gz...' >> latest_run.log
 touch results/RNANet.db                                         # update last modification date
 gzip -k /home/lbecquey/Projects/RNANet/results/RNANet.db        # compress it
--- a/scripts/automate_from_scratch.sh 0 → 100755
View file @ad0e234
+++ b/scripts/automate_from_scratch.sh 0 → 100755
View file @ad0e234
+ #!/bin/bash
+ # Full rebuild of the RNANet dataset, meant to be run periodically as a cron job.
+ # It passes --from-scratch to RNAnet.py, so EVERYTHING is recomputed /!\
+ # Run it once or twice a year; otherwise, the faster update runs should be enough.
+ 
+ # Abort if the project folder is unreachable: the relative 'rm -rf' below must
+ # never run from another working directory.
+ cd /home/lbecquey/Projects/RNANet || exit 1
+ rm -rf latest_run.log errors.txt known_issues.txt known_issues_reasons.txt
+ 
+ # Run RNANet: first pass without homology, second pass with homology, statistics and archive.
+ # The first pass truncates latest_run.log (>), the following commands append to it (>>).
+ bash -c 'time python3.8 ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ --from-scratch --ignore-issues -r 20.0 --no-homology --redundant --extract' > latest_run.log 2>&1
+ bash -c 'time python3.8 ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ --from-scratch --ignore-issues -r 20.0 --redundant --extract -s --stats-opts="-r 20.0 --wadley --hire-rna --distance-matrices" --archive' >> latest_run.log 2>&1
+ echo 'Compressing RNANet.db.gz...' >> latest_run.log
+ touch results/RNANet.db                                         # update last modification date
+ gzip -k /home/lbecquey/Projects/RNANet/results/RNANet.db        # compress it
+ rm -f results/RNANet.db-wal results/RNANet.db-shm               # SQLite temporary files
+ 
+ # Save the latest results, both as "latest" and as a dated copy
+ export DATE=$(date +%Y%m%d)
+ echo "Creating new release in ./archive/ folder ($DATE)..." >> latest_run.log
+ cp /home/lbecquey/Projects/RNANet/results/summary.csv /home/lbecquey/Projects/RNANet/archive/summary_latest.csv
+ cp /home/lbecquey/Projects/RNANet/results/summary.csv "/home/lbecquey/Projects/RNANet/archive/summary_$DATE.csv"
+ cp /home/lbecquey/Projects/RNANet/results/families.csv /home/lbecquey/Projects/RNANet/archive/families_latest.csv
+ cp /home/lbecquey/Projects/RNANet/results/families.csv "/home/lbecquey/Projects/RNANet/archive/families_$DATE.csv"
+ cp /home/lbecquey/Projects/RNANet/results/frequencies.csv /home/lbecquey/Projects/RNANet/archive/frequencies_latest.csv
+ cp /home/lbecquey/Projects/RNANet/results/pair_types.csv /home/lbecquey/Projects/RNANet/archive/pair_types_latest.csv
+ mv /home/lbecquey/Projects/RNANet/results/RNANet.db.gz /home/lbecquey/Projects/RNANet/archive/
+ 
+ # Init Seafile synchronization between RNANet library and ./archive/ folder (just the first time !)
+ # seaf-cli sync -l 8e082c6e-b9ed-4b2f-9279-de2177134c57 -s https://entrepot.ibisc.univ-evry.fr -u l****.b*****y@univ-evry.fr -p ****************** -d archive/
+ 
+ # Sync in Seafile: start the client, wait for the upload, then log its status and stop it
+ seaf-cli start >> latest_run.log 2>&1
+ echo 'Waiting 15m for SeaFile synchronization...' >> latest_run.log    # message matches the sleep duration below
+ sleep 15m
+ echo $(seaf-cli status) >> latest_run.log
+ seaf-cli stop >> latest_run.log 2>&1
+ echo 'We are '$(date)', update completed.' >> latest_run.log
+ 
--- a/scripts/build_docker_image.sh
View file @ad0e234
+++ b/scripts/build_docker_image.sh
View file @ad0e234
@@ -21,6 +21,6 @@ docker build -t rnanet:latest ..
 rm x3dna-dssr
 
 # to run, use something like:
- # docker run -v /home/persalteas/Data/RNA/3D/:/3D -v /home/persalteas/Data/RNA/sequences/:/sequences -v /home/persalteas/labo/:/runDir persalteas/rnanet [ additional options here ]
+ # docker run -v /home/lbecquey/Data/RNA/3D/:/3D -v /home/lbecquey/Data/RNA/sequences/:/sequences -v /home/lbecquey/labo/:/runDir rnanet [ additional options here ]
 # Without additional options, this runs a standard pass with known issues support, log output, and no statistics. The default resolution threshold is 4.0 Angstroms.
 
--- a/scripts/recompute_family.py
View file @ad0e234
+++ b/scripts/recompute_family.py
View file @ad0e234
@@ -36,6 +36,6 @@ for fam in families:
 
 # Now re run RNANet normally.
 command = ["python3.8", "./RNAnet.py", "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0",
-             "--redundant", "--sina", "--extract", "-s", "--stats-opts=\"--wadley --distance-matrices\""]
+             "--redundant", "--extract", "-s", "--stats-opts=\"-r 20.0 --wadley --hire-rna --distance-matrices\""]
 print(' '.join(command))
 subprocess.run(command)
\ No newline at end of file
--- a/scripts/recompute_some_chains.py
View file @ad0e234
+++ b/scripts/recompute_some_chains.py
View file @ad0e234
@@ -3,8 +3,9 @@ import subprocess, os, sys
 
 # Put a list of problematic chains here, they will be properly deleted and recomputed
 problems = [
-     "1k73_1_A",
-     "1k73_1_B"
+     "7nhm_1_A_1-2923"
+     "4wfa_1_X_1-2923"
+     "4wce_1_X_1-2923"
 ]
 
 # provide the path to your data folders, the RNANet.db file, and the RNANet.py file as arguments to this script
@@ -22,6 +23,7 @@ for p in problems:
 
     # Remove the datapoints files and 3D files
     subprocess.run(["rm", '-f', path_to_3D_data + f"/rna_mapped_to_Rfam/{p}.cif"])
+     subprocess.run(["rm", '-f', path_to_3D_data + f"/rna_only/{p}.cif"])
     files = [ f for f in os.listdir(path_to_3D_data + "/datapoints") if p in f ]
     for f in files:
         subprocess.run(["rm", '-f', path_to_3D_data + f"/datapoints/{f}"])
@@ -38,14 +40,14 @@ for p in problems:
             print(' '.join(command))
             subprocess.run(command)
 
-         command = ["python3.8", path_to_RNANet, "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0", "--extract", "--only", p]
+         command = ["python3.8", path_to_RNANet, "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "--redundant", "-r", "20.0", "--extract", "--only", p]
     else:
         # Delete the chain from the database, and the associated nucleotides and re_mappings, using foreign keys
         command = ["sqlite3", path_to_db, f"PRAGMA foreign_keys=ON; delete from chain where structure_id=\"{structure}\" and chain_name=\"{chain}\" and rfam_acc is null;"]
         print(' '.join(command))
         subprocess.run(command)
 
-         command = ["python3.8", path_to_RNANet, "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0", "--no-homology", "--extract", "--only", p]
+         command = ["python3.8", path_to_RNANet, "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "--redundant", "-r", "20.0", "--no-homology", "--extract", "--only", p]
 
     # Re-run RNANet
     os.chdir(os.path.dirname(os.path.realpath(path_to_db)) + '/../')
--- a/statistics.py
View file @ad0e234
+++ b/statistics.py
View file @ad0e234