Louis BECQUEY

NR class representatives only for rRNA distance matrices

......@@ -23,3 +23,5 @@ scripts/*.sh
scripts/*.tar
scripts/measure.py
scripts/recompute_some_chains.py
scripts/convert_rna_jsons.py
scripts/recompute_family.py
......
......@@ -8,6 +8,9 @@ FEATURE CHANGES
The LSU and SSU are now aligned with Infernal options '--cpu 10 --mxsize 8192 --mxtau 0.1', which is slow,
requires up to 100 GB of RAM, and yields a suboptimal alignment (tau=0.1 is quite bad), but is homogeneous with the other families.
- The LSU and SSU therefore have defined cm_coords fields, and therefore distance matrices can be computed.
- Distance matrices are computed on all available molecules of the family by default, but you can use statistics.py --non-redundant to take only
the equivalence class representatives at a given resolution into account (new option). For storage reasons, rRNAs are always run in
this mode (but this might change in the future: the space required is 'only' ~300 GB).
- We now provide for download the renumbered (standardised) 3D MMCIF files, the nucleotides being numbered by their "index_chain" in the database.
- We now provide for download the sequences of the 3D chains aligned by Rfam family (without Rfam sequences, which have been removed).
- statistics.py now computes histograms and a density estimation with Gaussian mixture models for a large set of geometric parameters,
......@@ -23,7 +26,7 @@ FEATURE CHANGES
BUG CORRECTIONS
- New code file geometric_stats.py
- New automation script that starts from scratch
- Many small fixes
- Many small fixes, leading to the support of many previously "known issues"
- Performance tweaks
TECHNICAL CHANGES
......
6cfj_1_1X
6cfj_1_2X
5hcq_1_1X
6cae_1_1X
5hcq_1_2X
5hcr_1_1X
4z8c_1_1X
5j4b_1_1X
6xhy_1_1X
6xhy_1_2X
5j4b_1_2X
4z8c_1_2X
6cae_1_2X
5j4c_1_1X
5w4k_1_1X
6of1_1_1X
6xhw_1_1X
5hcr_1_2X
5hd1_1_1X
5hcp_1_1X
6of1_1_2X
5hau_1_1W
5j4c_1_2X
5wis_1_1X
6xhv_1_1X
6xqd_1_1X
6nd5_1_1X
5w4k_1_2X
6xhw_1_2X
5hau_1_2W
6xqd_1_2X
6xhv_1_2X
4y4p_1_1X
6o97_1_1X
5hcp_1_2X
5doy_1_1X
4zer_1_1X
5wit_1_1X
5hd1_1_2X
6nd5_1_2X
4z3s_1_1X
7jql_1_1X
7jqm_1_1X
7jql_1_2X
5wis_1_2X
6nd6_1_1X
6o97_1_2X
4y4p_1_2X
7jqm_1_2X
4z3s_1_2X
4zer_1_2X
6uo1_1_2X
6uo1_1_1X
5doy_1_2X
5wit_1_2X
5f8k_1_1X
6nd6_1_2X
6xqe_1_1X
6xqe_1_2X
6n9e_1_1X
6n9e_1_2X
6n9f_1_1X
5f8k_1_2X
6n9f_1_2X
6xz7_1_F
6xzb_1_F2
6xza_1_F2
6y69_1_W
5afi_1_V
5afi_1_W
6h4n_1_W
5wdt_1_V
5wfs_1_V
5wdt_1_W
5wfs_1_W
5we4_1_V
5we4_1_W
5uq8_1_Y
6c4i_1_Y
6c4i_1_X
6yef_1_X
5zeb_1_V
5zep_1_W
5lzd_1_V
5we6_1_V
5wfk_1_V
5wfk_1_W
5we6_1_W
5u4i_1_Y
5uq7_1_Y
5u4i_1_X
5lza_1_V
5wf0_1_V
5wf0_1_W
5zeu_1_V
5l3p_1_X
3jcj_1_V
6gxm_1_X
6gwt_1_X
6gxn_1_X
6gxo_1_X
3j9y_1_V
6o9k_1_Y
6o7k_1_V
5lzf_1_V
3jcn_1_V
5lzc_1_V
5u4j_1_X
5u4j_1_Z
5lzb_1_V
6h58_1_W
6h58_1_WW
5j8b_1_X
4v7j_1_AV
4v7j_1_BV
4v7k_1_BV
4v7k_1_AV
4v7k_1_BW
4v7k_1_AW
4v7j_1_AW
4v7j_1_BW
4v4j_1_Z
6i0v_1_B
5k77_1_X
5k77_1_V
5k77_1_Y
5k77_1_W
5k77_1_Z
4pei_1_X
4pei_1_V
4pei_1_W
4pei_1_Z
4pei_1_Y
4a3c_1_P
4a3e_1_P
6lkq_1_U
7k00_1_B
6ys3_1_A
6qdw_1_A
6hcj_1_Q3
6hcq_1_Q3
6o8w_1_U
5mmm_1_Z
4w2e_1_W
5j4b_1_1Y
6cfj_1_1W
5w4k_1_1Y
6xhy_1_1W
5wit_1_1W
6cfj_1_1Y
6cfj_1_2W
5j4c_1_1W
5wis_1_1Y
5j4c_1_1Y
6xhw_1_1W
6cfj_1_2Y
5wis_1_1W
5j4b_1_1W
6xhv_1_1W
6xhy_1_2W
5j4c_1_2W
5j4b_1_2W
5j4b_1_2Y
5j4c_1_2Y
5w4k_1_1W
6nd5_1_1Y
6xhw_1_2W
5wis_1_2Y
5wit_1_2W
6xhv_1_2W
5doy_1_1Y
5w4k_1_2Y
4y4p_1_1Y
4z3s_1_1Y
5doy_1_1W
5doy_1_2Y
6nd5_1_1W
4z3s_1_2Y
4z3s_1_1W
5w4k_1_2W
6nd5_1_2Y
4y4p_1_2Y
6uo1_1_2Y
6uo1_1_2W
4y4p_1_1W
4z3s_1_2W
6uo1_1_1Y
6xhy_1_1Y
6uo1_1_1W
5wis_1_2W
5wit_1_1Y
6nd5_1_2W
4y4p_1_2W
5doy_1_2W
5wit_1_2Y
6xhv_1_1Y
6xhy_1_2Y
6xhw_1_1Y
6xhw_1_2Y
6ucq_1_1Y
6xhv_1_2Y
4v4i_1_Z
6ucq_1_1X
6ucq_1_2Y
4w2e_1_X
6ucq_1_2X
7n1p_1_DT
7n2u_1_DT
6yss_1_W
7n30_1_DT
7n31_1_DT
7n2c_1_DT
5afi_1_Y
5uq8_1_Z
5wdt_1_Y
5wfs_1_Y
6ysr_1_W
5we4_1_Y
6yst_1_W
5uq7_1_Z
5we6_1_Y
5wfk_1_Y
5wf0_1_Y
6o9j_1_V
6ysu_1_W
3j46_1_A
5j8b_1_Y
5j8b_1_W
3bbv_1_Z
5aj0_1_BV
5aj0_1_BW
4wt8_1_AB
4wt8_1_BB
4v4j_1_Y
4v4i_1_Y
5uq8_1_X
5uq7_1_X
4v4j_1_W
4v4i_1_W
4wt8_1_CS
4wt8_1_DS
4v4j_1_X
4v4i_1_X
6lkq_1_S
5h5u_1_H
7d6z_1_F
5lze_1_Y
5lze_1_V
5lze_1_X
3jcj_1_G
6o7k_1_G
6d30_1_C
6j7z_1_C
3er9_1_D
5kal_1_Y
4nia_1_3
5kal_1_Z
4nia_1_7
4nia_1_4
5new_1_C
4nia_1_U
4nia_1_6
4oq9_1_7
4nia_1_1
4oq9_1_4
4nia_1_8
4oq9_1_8
4nia_1_5
2vrt_1_E
4nia_1_W
4oq9_1_6
4oq8_1_D
4nia_1_Z
4oq9_1_W
4oq9_1_5
4nia_1_2
2vrt_1_F
4oq9_1_U
4oq9_1_Z
4oq9_1_2
4oq9_1_3
1ddl_1_E
4oq9_1_1
6rt5_1_A
6rt5_1_E
6lkq_1_T
6ys3_1_B
6qdw_1_B
3jbv_1_B
3jbu_1_B
6do8_1_B
6dpi_1_B
6dp9_1_B
6dpb_1_B
6dmn_1_B
6dpp_1_B
6dpk_1_B
6dpd_1_B
6dot_1_B
6dok_1_B
6dp8_1_B
6dpl_1_B
6dpg_1_B
6dou_1_B
6dpc_1_B
6do9_1_B
6dmv_1_B
6dp4_1_B
6dpn_1_B
6doj_1_B
6dph_1_B
6dos_1_B
6doo_1_B
6dp6_1_B
6dox_1_B
6dp5_1_B
6dol_1_B
6dp1_1_B
6doz_1_B
6dp7_1_B
6doq_1_B
6dpa_1_B
6dom_1_B
6dog_1_B
6dop_1_B
6doh_1_B
6doa_1_B
6don_1_B
6dov_1_B
6dpo_1_B
6dod_1_B
6dob_1_B
6dow_1_B
6dpm_1_B
6dpf_1_B
6dp3_1_B
6dp2_1_B
6dpe_1_B
6dpj_1_B
6dor_1_B
6dof_1_B
6dp0_1_B
6doi_1_B
6doc_1_B
6doe_1_B
6n6g_1_D
4b3r_1_W
4b3t_1_W
4b3s_1_W
7b5k_1_X
5o2r_1_X
5kcs_1_1X
7n1p_1_PT
7n2u_1_PT
7n30_1_PT
7n31_1_PT
7n2c_1_PT
6zvk_1_E2
6zvk_1_H2
7a01_1_E2
7a01_1_H2
6fti_1_U
6fti_1_W
6ftj_1_U
6ftj_1_W
6ftg_1_U
6ftg_1_W
6x1b_1_D
6x1b_1_F
5f6c_1_C
6i0t_1_B
1b2m_1_C
1b2m_1_D
1b2m_1_E
2uxc_1_Y
4a3g_1_P
4a3j_1_P
7k00_1_5
5mmi_1_Z
3j9m_1_U
7a5k_1_U3
6nu2_1_U
7a5g_1_U3
6nu3_1_U
5c0y_1_C
6n6f_1_D
4ohy_1_B
4oi1_1_B
4oi0_1_B
5ipl_1_3
6utw_1_333
5ipm_1_3
5ipn_1_3
4ylo_1_3
4yln_1_6
4ylo_1_6
4yln_1_3
4yln_1_9
5lzf_1_Y
1n32_1_Z
5zsl_1_D
5zsd_1_C
5zsd_1_D
5zsl_1_E
4nku_1_D
4nku_1_H
1cwp_1_E
6thn_1_A
6qik_1_Y
6rzz_1_Y
6ri5_1_Y
6qt0_1_Y
6qtz_1_Y
6t83_1_1B
6t83_1_3B
6t83_1_AA
6t83_1_CA
6s05_1_Y
5jcs_1_X
5fl8_1_X
6ole_1_V
6om0_1_V
6oli_1_V
6om7_1_V
6w6l_1_V
6olf_1_V
3erc_1_G
6of1_1_1W
6cae_1_1Y
6o97_1_1W
6of1_1_1Y
6of1_1_2W
6o97_1_1Y
6nd6_1_1Y
6cae_1_1W
6of1_1_2Y
6cae_1_2Y
6nd6_1_1W
6cae_1_2W
6o97_1_2Y
6nd6_1_2Y
6o97_1_2W
6nd6_1_2W
4wtm_1_T
4wtm_1_P
6gz4_1_BW
6xz7_1_G
6xzb_1_G2
6gz5_1_BW
6gz3_1_BW
4hot_1_X
6d2z_1_C
7eh0_1_I
4tu0_1_F
4tu0_1_G
6r9o_1_B
6is0_1_C
5lzc_1_X
5lzb_1_X
5lzd_1_Y
5lzc_1_Y
5lzb_1_Y
6zvi_1_E
6sv4_1_MC
6sv4_1_MB
7nrd_1_SM
6i7o_1_MB
6zvi_1_D
6sv4_1_NB
6sv4_1_NC
6i7o_1_NB
7nsq_1_V
6swa_1_Q
6swa_1_R
6ole_1_T
6om0_1_T
6oli_1_T
6om7_1_T
6olf_1_T
6w6l_1_T
6tnu_1_M
5mc6_1_M
7nrc_1_SM
6tb3_1_N
7b7d_1_SM
7b7d_1_SN
6tnu_1_N
7nrc_1_SN
7nrd_1_SN
6zot_1_C
4qu6_1_B
2uxb_1_X
2x1f_1_B
2x1a_1_B
5o1y_1_B
4kzy_1_I
4kzz_1_I
4kzx_1_I
6dzi_1_H
5zeu_1_A
6evj_1_N
6evj_1_M
6wub_1_A
6wua_1_A
6mpi_1_W
5mfx_1_B
5w0m_1_J
5bud_1_E
5w0m_1_I
5w0m_1_H
4j7m_1_B
5bud_1_D
6a4e_1_B
6a4e_1_D
6hxx_1_AA
6hxx_1_AB
6hxx_1_AC
6hxx_1_AD
6hxx_1_AE
6hxx_1_AF
6hxx_1_AG
6hxx_1_AH
6hxx_1_AI
6hxx_1_AJ
6hxx_1_AK
6hxx_1_AL
6hxx_1_AM
6hxx_1_AN
6hxx_1_AO
6hxx_1_AP
6hxx_1_AQ
6hxx_1_AR
6hxx_1_AS
6hxx_1_AT
6hxx_1_AU
6hxx_1_AV
6hxx_1_AW
6hxx_1_AX
6hxx_1_AY
6hxx_1_AZ
6hxx_1_BA
6hxx_1_BB
6hxx_1_BC
6hxx_1_BD
6hxx_1_BE
6hxx_1_BF
6hxx_1_BG
6hxx_1_BH
6hxx_1_BI
5odv_1_A
5odv_1_B
5odv_1_C
5odv_1_D
5odv_1_E
5odv_1_F
5odv_1_G
5odv_1_H
5odv_1_I
5odv_1_J
5odv_1_K
5odv_1_L
5odv_1_M
5odv_1_N
5odv_1_O
5odv_1_P
5odv_1_Q
5odv_1_R
5odv_1_S
5odv_1_T
5odv_1_U
5odv_1_V
5odv_1_W
5odv_1_X
6t34_1_A
6t34_1_B
6t34_1_C
6t34_1_D
6t34_1_E
6t34_1_F
6t34_1_G
6t34_1_H
6t34_1_I
6t34_1_J
6t34_1_K
6t34_1_L
6t34_1_M
6t34_1_N
6t34_1_O
6t34_1_P
6t34_1_Q
6t34_1_R
6t34_1_S
6ip8_1_ZY
6ip5_1_ZY
6ip5_1_ZU
6ip6_1_ZY
6ip8_1_ZZ
6ip6_1_ZZ
6uu3_1_333
6uu1_1_333
3er8_1_H
3er8_1_G
3er8_1_F
5o3j_1_B
4dr7_1_B
1i5l_1_Y
1i5l_1_U
4dr6_1_B
6i2n_1_U
4v68_1_A0
6vyu_1_Y
6vyw_1_Y
6vz7_1_Y
6vz5_1_Y
6vz3_1_Y
6vyy_1_Y
6vyx_1_Y
6vyz_1_Y
6vz2_1_Y
1mvr_1_1
6vyt_1_Y
1cgm_1_I
3jb7_1_T
3jb7_1_M
3j0o_1_D
3j0l_1_D
3j0q_1_D
3j0p_1_D
2tmv_1_R
5a79_1_R
5a7a_1_R
2om3_1_R
2xea_1_R
4v7e_1_AA
4v7e_1_AC
4wtl_1_T
4wtl_1_P
1xnq_1_W
7n2v_1_DT
4peh_1_Z
1vq6_1_4
4am3_1_D
4am3_1_H
4am3_1_I
4lj0_1_C
4lj0_1_D
4lj0_1_E
5lzy_1_HH
4wtj_1_T
4wtj_1_P
4xbf_1_D
6n6d_1_D
6n6k_1_C
6n6k_1_D
3rtj_1_D
6ty9_1_M
6tz1_1_N
6q1h_1_D
6q1h_1_H
6p7p_1_F
6p7p_1_E
6p7p_1_D
6vm6_1_J
6vm6_1_G
6wan_1_K
6wan_1_H
6wan_1_G
6wan_1_L
6wan_1_I
6ywo_1_F
6wan_1_J
4oau_1_A
6ywo_1_E
6ywo_1_K
6vm6_1_I
6vm6_1_H
6ywo_1_I
2a1r_1_C
6m6v_1_F
6m6v_1_E
2a1r_1_D
3gpq_1_E
3gpq_1_F
6o79_1_C
6vm6_1_K
6m6v_1_G
6hyu_1_D
1laj_1_R
6ybv_1_K
6sce_1_B
6xl1_1_C
6scf_1_I
6scf_1_K
6yud_1_K
6yud_1_O
6scf_1_M
6yud_1_P
6scf_1_L
6yud_1_M
6yud_1_Q
6w11_1_C
6o6x_1_D
4ba2_1_R
7bdv_1_F
7bdv_1_H
6o6x_1_C
7did_1_C
6o7b_1_C
6o6v_1_C
6wxx_1_Y
6wxx_1_X
6r7b_1_D
6r9r_1_D
6ov0_1_E
6ov0_1_H
6ov0_1_G
6o6v_1_D
6ov0_1_F
6o7b_1_D
5e02_1_C
6r9r_1_E
6r7b_1_E
6o7i_1_I
6o7h_1_K
7l6t_1_C
7jyy_1_F
7jyy_1_E
7jz0_1_F
7jz0_1_E
6rt6_1_A
6rt6_1_E
1y1y_1_P
5zuu_1_I
5zuu_1_G
7am2_1_R1
4peh_1_W
4peh_1_V
4peh_1_X
4peh_1_Y
7d8c_1_C
6mkn_1_W
7kl3_1_B
4cxg_1_C
4cxh_1_C
4eya_1_E
4eya_1_F
4eya_1_Q
4eya_1_R
4ht9_1_E
6z1p_1_AB
6z1p_1_AA
4ii9_1_C
5mq0_1_3
5uk4_1_X
5uk4_1_V
5uk4_1_W
5uk4_1_U
5f6c_1_E
7nwh_1_HH
4rcj_1_B
1xnr_1_W
6e0o_1_C
6o75_1_D
6o75_1_C
6e0o_1_B
3j06_1_R
4eya_1_G
4eya_1_H
4eya_1_S
4eya_1_T
4dr4_1_V
1ibl_1_Z
1ibm_1_Z
4dr5_1_V
4d61_1_J
7nwg_1_Q3
5tbw_1_SR
6hhq_1_SR
6zvi_1_H
6sv4_1_2B
6sv4_1_2C
6t83_1_2B
6t83_1_A
6i7o_1_2B
6q8y_1_N
6sv4_1_N
6i7o_1_N
6swa_1_S
5k8h_1_A
5z4a_1_B
3jbu_1_V
1h2c_1_R
1h2d_1_S
1h2d_1_R
6szs_1_X
5mgp_1_X
6enu_1_X
6enf_1_X
6enj_1_X
1pvo_1_L
1pvo_1_G
1pvo_1_H
1pvo_1_J
1pvo_1_K
2ht1_1_K
2ht1_1_J
5sze_1_C
6wre_1_D
6i0u_1_B
5zsa_1_C
5zsa_1_D
1n34_1_Z
3pf5_1_S
6ppn_1_A
6ppn_1_I
5flx_1_Z
6eri_1_AX
7k5l_1_R
7d80_1_Y
7du2_1_R
4v8z_1_CX
6kqe_1_I
5uh8_1_I
5vi5_1_Q
4xln_1_T
4xlr_1_T
4xln_1_Q
5i2d_1_K
5i2d_1_V
4xlr_1_Q
6sty_1_C
6sty_1_F
2xs5_1_D
3ok4_1_N
3ok4_1_L
3ok4_1_Z
3ok4_1_4
3ok4_1_V
3ok4_1_X
3ok4_1_P
3ok4_1_H
3ok4_1_J
3ok4_1_R
3ok4_1_T
3ok4_1_2
6n6h_1_D
5wnt_1_B
3b0u_1_B
3b0u_1_A
4x9e_1_G
4x9e_1_H
6z1p_1_BB
6z1p_1_BA
2uxd_1_X
6ywe_1_BB
3ol9_1_D
3ol9_1_H
3ol9_1_L
3ol9_1_P
3olb_1_L
3olb_1_P
3olb_1_D
3olb_1_H
3ol6_1_D
3ol6_1_H
3ol6_1_L
3ol6_1_P
3ol8_1_D
3ol8_1_H
3ol7_1_L
3ol7_1_P
3ol7_1_D
3ol7_1_H
3ol8_1_L
3ol8_1_P
6yrq_1_E
6yrq_1_H
6yrq_1_G
6yrq_1_F
6yrb_1_C
6yrb_1_D
6gz5_1_BV
6gz4_1_BV
6gz3_1_BV
6fti_1_Q
7njc_1_B
4v7e_1_AB
4v7e_1_AE
4v7e_1_AD
4x62_1_B
4x64_1_B
4x65_1_B
1xmq_1_W
4x66_1_B
3t1h_1_W
3t1y_1_W
1xmo_1_W
6kr6_1_B
6z8k_1_X
4csf_1_U
4csf_1_Q
4csf_1_G
4csf_1_M
4csf_1_K
4csf_1_A
4csf_1_I
4csf_1_S
4csf_1_C
4csf_1_W
4csf_1_O
4csf_1_E
6ywx_1_BB
6th6_1_AA
6skg_1_AA
6skf_1_AA
6q8y_1_M
6i7o_1_M
6zmw_1_W
6ybv_1_W
2fz2_1_D
2xpj_1_D
2vrt_1_H
2vrt_1_G
6r9m_1_B
4nia_1_C
4nia_1_A
4nia_1_H
4nia_1_N
4nia_1_G
4nia_1_D
4nia_1_B
4nia_1_I
4nia_1_E
4nia_1_M
4oq9_1_I
4oq9_1_G
4oq9_1_C
4oq9_1_H
4oq9_1_N
4oq9_1_A
4oq9_1_D
4oq9_1_E
4oq9_1_M
4oq9_1_B
5uhc_1_I
1uvn_1_F
1uvn_1_B
1uvn_1_D
4wtk_1_T
4wtk_1_P
1vqn_1_4
4oav_1_C
4oav_1_A
4i67_1_B
6k32_1_T
6k32_1_P
5mmj_1_A
5x8r_1_A
6yw5_1_AA
6ywe_1_AA
6ywy_1_AA
6ywx_1_AA
3nvk_1_G
3nvk_1_S
1cwp_1_D
1cwp_1_F
5z4j_1_B
5gmf_1_E
5gmf_1_H
6e4p_1_J
5gmf_1_F
5gmf_1_G
5gmg_1_D
5gmg_1_C
6e4p_1_K
3ie1_1_E
3ie1_1_H
3ie1_1_F
4dr7_1_V
3ie1_1_G
3s4g_1_C
3s4g_1_B
2qqp_1_R
1nb7_1_E
1nb7_1_F
4hos_1_X
3p6y_1_T
3p6y_1_V
3p6y_1_U
3p6y_1_Q
3p6y_1_W
5dto_1_B
4cxh_1_X
1uvj_1_F
1uvj_1_D
1uvj_1_E
6kqd_1_I
6kqd_1_S
5uh5_1_I
1ytu_1_F
1ytu_1_D
4kzz_1_J
7a09_1_F
5t2c_1_AN
3j6b_1_E
4v4f_1_B6
4v4f_1_A5
4v4f_1_A3
4v4f_1_B0
4v4f_1_B9
4v4f_1_A2
4v4f_1_A8
4v4f_1_A1
4v4f_1_A9
4v4f_1_BZ
4v4f_1_B8
4v4f_1_B7
4v4f_1_B5
4v4f_1_A0
4v4f_1_A7
4v4f_1_A4
4v4f_1_AZ
4v4f_1_B3
4v4f_1_B1
4v4f_1_B4
4v4f_1_A6
4v4f_1_B2
7m4y_1_V
7m4x_1_V
6v3a_1_V
6v39_1_V
5it9_1_I
7jqc_1_I
5zsb_1_C
5zsb_1_D
5zsn_1_D
5zsn_1_E
6gfw_1_R
6zm6_1_X
6zm5_1_X
6zm6_1_W
6zm5_1_W
6n6e_1_D
4g7o_1_I
4g7o_1_S
5x22_1_S
5x22_1_I
5x21_1_I
5uh6_1_I
6l74_1_I
5uh9_1_I
7a5j_1_X
6sag_1_R
4udv_1_R
5zsc_1_D
5zsc_1_C
6woy_1_I
6wox_1_I
4gkk_1_W
4v9e_1_AG
4v9e_1_BM
4v9e_1_AM
4v9e_1_AA
4v9e_1_BA
4v9e_1_BG
5lzs_1_II
6fqr_1_C
6ha1_1_X
5kcr_1_1X
6uu4_1_333
6uu0_1_333
6uuc_1_333
6uu2_1_333
6xl9_1_R
6b6h_1_3
6xh8_1_3
6pb4_1_3
3m7n_1_Z
3m85_1_X
3m85_1_Z
3m85_1_Y
5wnp_1_B
5wnv_1_B
5yts_1_B
1utd_1_6
1utd_1_Z
1utd_1_4
1utd_1_7
1utd_1_9
1utd_1_5
1utd_1_3
1utd_1_2
1utd_1_8
1utd_1_1
6n6i_1_C
6n6i_1_D
6n6a_1_D
6ij2_1_F
6ij2_1_G
6ij2_1_H
6ij2_1_E
3u2e_1_D
3u2e_1_C
7eh1_1_I
5uef_1_C
5uef_1_D
7eh2_1_R
7eh2_1_I
4x4u_1_H
4afy_1_D
6oy5_1_I
6owl_1_B
6owl_1_C
4afy_1_C
4lq3_1_R
6s0m_1_C
6ymw_1_C
7a5g_1_J
6gx6_1_B
4k4s_1_D
4k4s_1_H
4k4t_1_H
4k4t_1_D
1xpu_1_G
1xpu_1_L
1xpr_1_L
1xpu_1_H
1xpo_1_K
1xpo_1_J
1xpu_1_J
1xpo_1_H
1xpr_1_J
1xpu_1_K
1xpr_1_K
1xpo_1_M
1xpo_1_L
1xpu_1_M
1xpr_1_M
1xpo_1_G
1xpr_1_H
1xpr_1_G
5x70_1_E
5x70_1_G
6gc5_1_F
6gc5_1_H
6gc5_1_G
1n1h_1_B
7n2v_1_PT
4ohz_1_B
6t83_1_6B
4gv6_1_C
4gv6_1_B
4gv3_1_C
4gv3_1_B
4gv9_1_E
6i7o_1_L
2a8v_1_D
6qx3_1_G
2xnr_1_C
4gkj_1_W
5y88_1_X
3j0o_1_H
3j0l_1_H
3j0p_1_H
3j0q_1_H
3j0o_1_F
3j0l_1_F
3j0p_1_F
3j0q_1_F
3j0o_1_B
3j0l_1_B
3j0o_1_C
3j0l_1_C
3j0q_1_C
3j0p_1_C
3j0o_1_A
3j0l_1_A
3j0q_1_A
3j0p_1_A
6ys3_1_V
6qdw_1_V
5hk0_1_F
4qm6_1_D
4qm6_1_C
4jzu_1_C
4jzv_1_C
5ytv_1_B
4k4z_1_P
4k4z_1_D
4k4x_1_L
4k4z_1_L
4k4x_1_D
4k4z_1_H
4k4x_1_H
4k4x_1_P
4a3b_1_P
4a3m_1_P
6u6y_1_E
6u6y_1_G
6u6y_1_F
6u6y_1_H
6qik_1_X
6rzz_1_X
6ri5_1_X
6qt0_1_X
6qtz_1_X
6s05_1_X
6t83_1_BB
6t83_1_4B
5fl8_1_Z
5jcs_1_Z
5mrc_1_BB
5mre_1_BB
5mrf_1_BB
3j46_1_P
4e6b_1_A
4e6b_1_B
6a6l_1_D
1uvi_1_D
1uvi_1_F
1uvi_1_E
4m7d_1_P
4k4u_1_D
4k4u_1_H
6rt7_1_E
6rt7_1_A
2voo_1_C
2voo_1_D
5k78_1_X
5k78_1_Y
4ylo_1_9
5vyc_1_I2
5vyc_1_I3
5vyc_1_I5
5vyc_1_I1
5vyc_1_I6
5vyc_1_I4
6ip8_1_2M
6ip5_1_2M
6ip6_1_2M
6qcs_1_M
7b5k_1_Z
4nia_1_O
4nia_1_J
4nia_1_K
4nia_1_L
4nia_1_F
4oq9_1_K
4oq9_1_O
4oq9_1_J
4oq9_1_F
4oq9_1_L
6r9q_1_B
7m4u_1_A
6v3a_1_SN1
6v3b_1_SN1
6v39_1_SN1
6v3e_1_SN1
4dr6_1_V
6kql_1_I
4eya_1_M
4eya_1_N
4eya_1_A
4eya_1_B
2wj8_1_D
2wj8_1_I
2wj8_1_L
2wj8_1_F
2wj8_1_C
2wj8_1_Q
2wj8_1_J
2wj8_1_P
2wj8_1_K
2wj8_1_E
2wj8_1_T
2wj8_1_B
2wj8_1_O
2wj8_1_N
2wj8_1_A
2wj8_1_H
2wj8_1_R
2wj8_1_M
2wj8_1_S
2wj8_1_G
4e6b_1_E
4e6b_1_F
6p71_1_I
3pdm_1_R
5det_1_P
5els_1_I
4n2s_1_B
5fl8_1_Y
5jcs_1_Y
4yoe_1_E
6ow3_1_I
6ovy_1_I
6oy6_1_I
4qvd_1_H
5gxi_1_B
7n06_1_G
7n06_1_H
7n06_1_I
7n06_1_J
7n06_1_K
7n06_1_L
7n33_1_G
7n33_1_H
7n33_1_I
7n33_1_J
7n33_1_K
7n33_1_L
5mc6_1_N
4eya_1_O
4eya_1_P
4eya_1_C
4eya_1_D
6htq_1_V
6htq_1_W
6htq_1_U
6uu6_1_333
5a0v_1_F
3avt_1_T
6d1v_1_C
4s2x_1_B
4s2y_1_B
5wnu_1_B
1vtm_1_R
5elt_1_F
5elt_1_E
6xlj_1_R
6u9x_1_H
6u9x_1_K
5elk_1_R
6okk_1_G
4cxg_1_A
4cxh_1_A
6bk8_1_I
4cxg_1_B
4cxh_1_B
5z4d_1_B
6o78_1_E
6xa1_1_BV
6ha8_1_X
1m8w_1_E
1m8w_1_F
5udi_1_B
5udl_1_B
5udk_1_B
5udj_1_B
5w5i_1_B
5w5i_1_D
5w5h_1_B
5w5h_1_D
4eya_1_K
4eya_1_L
4eya_1_I
4eya_1_J
4g9z_1_E
4g9z_1_F
3nma_1_B
3nma_1_C
6een_1_G
6een_1_I
6een_1_H
4wti_1_T
4wti_1_P
5l3p_1_Y
4hor_1_X
3rzo_1_R
2f4v_1_Z
1qln_1_R
3cw1_1_X
3cw1_1_W
7b0y_1_A
6ogy_1_M
6ogy_1_N
6uej_1_B
6ywy_1_BB
5ytx_1_B
4g0a_1_H
6r9p_1_B
3koa_1_C
4n48_1_D
4n48_1_G
6kug_1_B
6ktc_1_V
6ole_1_U
6om0_1_U
6olg_1_BV
6oli_1_U
6om7_1_U
6w6l_1_U
6olz_1_BV
6olf_1_U
5lzd_1_X
6m7k_1_B
3cd6_1_4
3cma_1_5
6n9e_1_2W
1vqo_1_4
1qvg_1_3
3cme_1_5
5lzd_1_W
5lze_1_W
5lzc_1_W
5lzb_1_W
3wzi_1_C
1n33_1_Z
6dti_1_W
3d2s_1_F
3d2s_1_H
5mrc_1_AA
5mre_1_AA
5mrf_1_AA
7jhy_1_Z
4wkr_1_C
4v99_1_EC
4v99_1_AC
4v99_1_BH
4v99_1_CH
4v99_1_AM
4v99_1_DC
4v99_1_JW
4v99_1_EH
4v99_1_BW
4v99_1_FW
4v99_1_AW
4v99_1_BC
4v99_1_BM
4v99_1_IC
4v99_1_EM
4v99_1_ER
4v99_1_IW
4v99_1_JH
4v99_1_JR
4v99_1_AH
4v99_1_GR
4v99_1_IR
4v99_1_BR
4v99_1_CW
4v99_1_HR
4v99_1_FH
4v99_1_HC
4v99_1_DW
4v99_1_GC
4v99_1_JC
4v99_1_DM
4v99_1_EW
4v99_1_AR
4v99_1_CR
4v99_1_JM
4v99_1_CC
4v99_1_IH
4v99_1_FR
4v99_1_CM
4v99_1_IM
4v99_1_FM
4v99_1_FC
4v99_1_GH
4v99_1_HM
4v99_1_HH
4v99_1_DR
4v99_1_HW
4v99_1_GW
4v99_1_DH
4v99_1_GM
6rt4_1_D
6rt4_1_C
6zvh_1_X
4dwa_1_D
6n6c_1_D
6n6j_1_C
6n6j_1_D
6p7q_1_E
6p7q_1_F
6p7q_1_D
6rcl_1_C
5jju_1_C
4ejt_1_G
6lkq_1_W
3qsu_1_P
3qsu_1_R
2xs7_1_B
1n38_1_B
4qvc_1_G
6mpf_1_W
6spc_1_A
6spe_1_A
6zvk_1_D2
7a01_1_D2
6fti_1_V
6ftj_1_V
6ftg_1_V
4g0a_1_G
4g0a_1_F
4g0a_1_E
2b2d_1_S
5hkc_1_C
1rmv_1_B
4qu7_1_X
4qu7_1_V
4qu7_1_U
6pmi_1_3
6pmj_1_3
5hjz_1_C
6ydp_1_AA_1176-2737
6ydw_1_AA_1176-2737
7d1a_1_A_805-902
......@@ -1514,18 +9,18 @@
7o7z_1_AH_144-220
4c9d_1_D_29-1
4c9d_1_C_29-1
7aih_1_1_2400-2963
7aih_1_1_2984-3610
7ane_1_2_1904-2468
7aih_1_1_2400-2963
7ane_1_2_2489-3115
7ane_1_2_1904-2468
5g2x_1_A_595-692
7aor_1_2_2020-2579
7aor_1_2_2589-3210
7aor_1_2_2020-2579
7a5p_1_2_259-449
7aor_1_A_2020-2579
7aor_1_A_2589-3210
7am2_1_1_1904-2470
7aor_1_A_2020-2579
7am2_1_1_2491-3117
7ane_1_1_1904-2468
7am2_1_1_1904-2470
7ane_1_1_2489-3115
7ane_1_1_1904-2468
6uz7_1_8_2140-2825
......
This diff could not be displayed because it is too large.
......@@ -920,14 +920,22 @@ def general_stats():
@trace_unhandled_exceptions
def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s):
"""
Get the pairwise distances in one 3D molecule, given its aligned sequence (with gaps).
Returns a tuple of numpy arrays:
- The first is a boolean matrix, whose values are 1 if the distance is NaN (unresolved residue, or missing atom...), 0 otherwise
- The second is the distance matrix (in angströms), unresolved positions are 0 (not NaN)
- The third is the square of the second (square-distance matrix), unresolved positions are 0 (not NaN)
"""
# Identify the right 3D file
filename = ''
filename = ""
for file in filelist:
if file.startswith(s.id.split("RF")[0].replace('-', '').replace('[', '_').replace(']', '_')):
filename = path_to_3D_data + "rna_mapped_to_Rfam/" + file
break
if not len(filename):
# chain is not in file list. Maybe you are in non-redundant mode and it is not a representative (normal case).
return None, None, None
# Get the coordinates of every existing nt in the 3D file
......@@ -938,9 +946,9 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s):
warn("No C1' atoms in " + filename.split('/')[-1] + ", ignoring")
return None, None, None
except FileNotFoundError:
warn(f"{label} not found in the mapped mmCIF files")
return None, None, None
# Get the coordinates of every position in the alignment
nb_gap = 0
coordinates_with_gaps = []
......@@ -965,7 +973,6 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s):
d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j])
# Save the individual distance matrices
# if f not in LSU_set and f not in SSU_set:
np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f")
# For the average and sd, we want to consider only positions of the consensus model. This means:
......@@ -979,7 +986,7 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s):
while cm_coords[i] is None:
i += 1
family_start = int(cm_coords[i])
# c = np.zeros((family_end, family_end), dtype=np.float32) # new matrix of size of the consensus model for the family
# new matrix of size of the consensus model for the family
c = np.NaN * np.ones((family_end, family_end), dtype=np.float32)
# set to NaN zones that never exist in the 3D data
for i in range(family_start-1):
......@@ -1000,8 +1007,8 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s):
return 1-np.isnan(c).astype(int), np.nan_to_num(c), np.nan_to_num(c*c)
@trace_unhandled_exceptions
def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False):
np.seterr(divide='ignore') # ignore division by zero issues
def get_avg_std_distance_matrix(f, res, consider_all_atoms=False, redundancy=False, multithread=False):
# np.seterr(divide='ignore') # ignore division by zero issues
if consider_all_atoms:
label = "base"
......@@ -1009,23 +1016,38 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False):
label = "backbone"
if not multithread:
# This function call is for ONE worker.
# Get a worker number for it to position the progress bar
# This function call is for ONE worker. Get a worker number for it to position the progress bar.
global idxQueue
thr_idx = idxQueue.get()
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} {f} {label} distance matrices")
os.makedirs(runDir + '/results/distance_matrices/' + f + '_' + label, exist_ok=True )
align = AlignIO.read(path_to_seq_data + f"realigned/{f}_3d_only.afa", "fasta")
ncols = align.get_alignment_length()
# Get the list of 3D files. They should exist in the folder from the last RNANet run with --extract option.
if redundancy:
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
conn.execute('pragma journal_mode=wal')
r = sql_ask_database(conn, f"SELECT structure_id, '_1_', chain_name, '_', CAST(pdb_start AS TEXT), '-', CAST(pdb_end AS TEXT) FROM chain WHERE rfam_acc='{f}' AND issue=0;")
filelist = sorted([ ''.join(list(x))+'.cif' for x in r ])
else:
filelist = sorted(representatives_from_nrlist(res, mapped_to=f))
# Open the 3D-only alignment. keep only files that will be considered in 3D (e.g. representatives)
temp_align = AlignIO.read(path_to_seq_data + f"realigned/{f}_3d_only.afa", "fasta")
align = []
for s in temp_align:
filename = ""
for file in filelist:
if file.startswith(s.id.split("RF")[0].replace('-', '').replace('[', '_').replace(']', '_')):
align.append(s)
break
ncols = temp_align.get_alignment_length()
found = 0
notfound = 0
# retrieve the mappings between this family's alignment and the CM model:
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
conn.execute('pragma journal_mode=wal')
r = sql_ask_database(conn, f"SELECT structure_id, '_1_', chain_name, '_', CAST(pdb_start AS TEXT), '-', CAST(pdb_end AS TEXT) FROM chain WHERE rfam_acc='{f}';")
filelist = sorted([ ''.join(list(x))+'.cif' for x in r ])
r = sql_ask_database(conn, f"SELECT cm_coord FROM align_column WHERE rfam_acc = '{f}' AND index_ali > 0 ORDER BY index_ali ASC;")
cm_coords = [ x[0] for x in r ] # len(cm_coords) is the number of saved columns. There are many None values in the list.
i = len(cm_coords)-1
......@@ -1054,16 +1076,15 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False):
avg += d
std += dsquared
else:
# d is None means the considered RNA is not in the filelist (e.g., not a representative), or is not found.
notfound += 1
pbar.update(1)
pbar.close()
else:
# We split the work for one family on multiple workers.
p = Pool(initializer=init_with_tqdm, initargs=(tqdm.get_lock(),), processes=nworkers)
try:
fam_pbar = tqdm(total=len(align), desc=f"{f} {label} pair distances", position=0, unit="chain", leave=True)
# Apply work_pssm_remap to each RNA family
for i, (contrib, d, dsquared) in enumerate(p.imap_unordered(partial(par_distance_matrix, filelist, f, label, cm_coords, consider_all_atoms), align, chunksize=1)):
if d is not None:
found += 1
......@@ -1128,15 +1149,15 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False):
if not multithread:
idxQueue.put(thr_idx) # replace the thread index in the queue
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
else:
# basically, for the rRNAs
# we delete the unique csv files for each chain, they wheight hundreds of gigabytes together
warn(f"Removing {f} ({label}) individual distance matrices, they weight too much. keeping the averages and standard deviations.")
for csv in glob.glob(runDir + '/results/distance_matrices/' + f + '_'+ label + "/*-" + f + ".csv"):
try:
os.remove(csv)
except FileNotFoundError:
pass
# else:
# # basically, for the rRNAs
# # we delete the unique csv files for each chain, they wheight hundreds of gigabytes together
# warn(f"Removing {f} ({label}) individual distance matrices, they weight too much. keeping the averages and standard deviations.")
# for csv in glob.glob(runDir + '/results/distance_matrices/' + f + '_'+ label + "/*-" + f + ".csv"):
# try:
# os.remove(csv)
# except FileNotFoundError:
# pass
return 0
@trace_unhandled_exceptions
......@@ -1195,7 +1216,7 @@ def nt_3d_centers(cif_file, consider_all_atoms):
try:
structure = MMCIFParser().get_structure(cif_file, cif_file)
except Exception as e:
warn(f"{cif_file.split('/')[-1]} : {e}", error=True)
warn(f"\n{cif_file.split('/')[-1]} : {e}", error=True)
with open(runDir + "/errors.txt", "a") as f:
f.write(f"Exception in nt_3d_centers({cif_file.split('/')[-1]})\n")
f.write(str(e))
......@@ -1225,23 +1246,53 @@ def nt_3d_centers(cif_file, consider_all_atoms):
result.append(res)
return(result)
def representatives_from_nrlist(res, mapped_to=None):
    """
    Returns the list of filenames corresponding to the 3D cif files of structures
    that represent a "cluster" (a redundancy class) at the given resolution.

    res: resolution threshold. The non-redundant list with the smallest cutoff >= res
         (among 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 20.0) is used.
    mapped_to: if None, filenames are "<structure>_<model>_<chain>.cif".
               Otherwise it is used as the rfam_acc value when searching the 'chain' table of the
               database, and filenames are "<structure>_<model>_<chain>_<pdb_start>-<pdb_end>.cif".
               Representatives without an issue-free mapping to that family are left out.
    """

    # Read the NR file
    nr_code = min([i for i in [1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 20.0] if i >= res])
    fpath = f"{path_to_3D_data}/latest_nr_list_{nr_code}A.csv"
    df = pd.read_csv(os.path.abspath(fpath))

    # The database is only needed when a mapping to a family is requested.
    # Open it once for the whole list: "with sqlite3.connect(...)" does not close the connection,
    # so connecting again for every chain would leave one connection open per representative.
    conn = None
    if mapped_to is not None:
        conn = sqlite3.connect(runDir + "/results/RNANet.db")
        conn.execute('pragma journal_mode=wal')

    # define a function to transform a code into a filename
    def query_mapping_to(structure, model, chain, family):
        if family is None:
            return structure + '_' + model + '_' + chain + ".cif"

        # if we need a mapping start and end, query database
        # NOTE(review): the query is built by string formatting; the values come from the NR list file
        # and the family name. Use bound parameters if sql_ask_database supports them.
        r = sql_ask_database(conn, f"SELECT pdb_start, pdb_end FROM chain WHERE rfam_acc='{family}' AND structure_id='{structure}' AND chain_name='{chain}' AND issue=0;")

        if not len(r):
            # there is no chain named like this and mapped to this family
            return None

        if len(r) > 1:
            warn(f"Several entries found for structure {structure}-{chain} ({family}) : {len(r)} entries")

        # when several mappings exist, the first one returned is used
        return structure + '_' + model + '_' + chain + '_' + str(r[0][0]) + '-' + str(r[0][1]) + ".cif"

    # build the list
    repres = []
    try:
        for up_name in df["representative"]:
            # A representative is "PDB|model|chain", or several of them joined by '+'.
            # split('+') returns a single element when there is no '+', so both cases are handled here.
            for member in up_name.split('+'):
                fields = member.split('|')
                reference = query_mapping_to(fields[0].lower(), fields[1], fields[2], mapped_to)
                if reference is not None:
                    repres.append(reference)
    finally:
        if conn is not None:
            conn.close()

    return repres
......@@ -1281,11 +1332,12 @@ if __name__ == "__main__":
DELETE_OLD_DATA = False
DO_WADLEY_ANALYSIS = False
DO_AVG_DISTANCE_MATRIX = False
REDUNDANT_DIST_MAT = True
DO_HIRE_RNA_MEASURES = False
RESCAN_GMM_COMP_NUM = False
try:
opts, _ = getopt.getopt( sys.argv[1:], "r:h",
[ "help", "from-scratch", "wadley", "distance-matrices", "resolution=",
[ "help", "from-scratch", "wadley", "distance-matrices", "non-redundant", "resolution=",
"3d-folder=", "seq-folder=", "hire-rna", "rescan-nmodes" ])
except getopt.GetoptError as err:
print(err)
......@@ -1301,14 +1353,17 @@ if __name__ == "__main__":
print()
print("-r 20.0 [ --resolution=20.0 ]\tCompute statistics using chains of resolution 20.0A or better.")
print("--3d-folder=…\t\t\tPath to a folder containing the 3D data files. Required subfolders should be:"
"\n\t\t\t\t\tdatapoints/\t\tFinal results in CSV file format.")
"\n\t\t\t\t\tdatapoints/\t\tFinal results in CSV file format."
"\n\t\t\t\t\trna_mapped_to_Rfam/\tmmCIF files produced by RNANet (using --extract)."
"\n\t\t\t\t\trna_only/\t\tmmCIF files produced by RNANet in no-homology mode.")
print("--seq-folder=…\t\t\tPath to a folder containing the sequence and alignment files. Required subfolder:"
"\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family")
print("--from-scratch\t\t\tDo not use precomputed results from past runs, recompute everything")
print("--distance-matrices\t\tCompute average distance between nucleotide pairs for each family.")
print("--non-redundant\t\t\tIn distance matrix computation, only use the equivalence class representatives.\n\t\t\t\t Does not apply to rRNAs, where the option is always True.")
print("--wadley\t\t\tReproduce Wadley & al 2007 clustering of pseudotorsions.")
print("--hire-rna\t\t\tCompute distances between atoms and torsion angles for HiRE-RNA model, and plot GMMs on the data.")
print("--rescan-nmodes\t\tDo not assume the number of modes in distances and angles distributions, measure it.")
print("--hire-rna\t\t\tCompute distances between atoms and torsion angles for HiRE-RNA model,\n\t\t\t\t and plot GMMs on the data.")
print("--rescan-nmodes\t\t\tDo not assume the number of modes in distances and angles distributions, measure it.")
sys.exit()
elif opt == "--version":
print("RNANet statistics 1.6 beta")
......@@ -1350,6 +1405,8 @@ if __name__ == "__main__":
os.makedirs(runDir + "/results/figures/GMM/HiRE-RNA/basepairs/", exist_ok=True)
elif opt == "--rescan-nmodes":
RESCAN_GMM_COMP_NUM = True
elif opt == "--non-redundant":
REDUNDANT_DIST_MAT = False
# Load mappings. famlist will contain only families with structures at this resolution threshold.
......@@ -1373,7 +1430,7 @@ if __name__ == "__main__":
ignored = families[families.n_chains < 3].rfam_acc.tolist()
famlist.sort(key=family_order)
print(f"Found {len(famlist)} families with chains of resolution {res_thr}A or better.")
print(f"Found {len(famlist)} families with chains or better.")
if len(ignored):
print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')
......@@ -1413,8 +1470,8 @@ if __name__ == "__main__":
e3 = file.split('_')[2]
extracted_chains.append(e1 + '[' + e2 + ']' + '-' + e3)
for f in [ x for x in famlist if (x not in LSU_set and x not in SSU_set) ]: # Process the rRNAs later only 3 by 3
joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, True, False)))
joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, False, False)))
joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, res_thr, True, REDUNDANT_DIST_MAT, False)))
joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, res_thr, False, REDUNDANT_DIST_MAT, False)))
# Do general family statistics
joblist.append(Job(function=stats_len)) # Computes figures about chain lengths
......@@ -1428,7 +1485,6 @@ if __name__ == "__main__":
# Do geometric measures
if n_unmapped_chains:
os.makedirs(runDir + "/results/geometry/all-atoms/distances/", exist_ok=True)
# structure_list = os.listdir(path_to_3D_data + "rna_only")
structure_list = representatives_from_nrlist(res_thr)
for f in structure_list:
if path.isfile(path_to_3D_data + "datapoints/" + f.split('.')[0]):
......@@ -1438,12 +1494,17 @@ if __name__ == "__main__":
# Now process the memory-heavy tasks family by family
if DO_AVG_DISTANCE_MATRIX:
print("Computing distances matrices of rRNA families using only the equivalence class representatives, for storage purposes.")
# Note that, if the user has more than 300 GB of free storage space, one could use all the rRNAs.
# Yes, within an equivalence class, the rRNA molecules are close in sequence and structure.
# But yet, having several 3D structures of the same molecule gives an insight about structure flexibility in some regions.
# Detect free space automatically ? TODISCUSS + TODECIDE + TODO
for f in LSU_set:
get_avg_std_distance_matrix(f, True, True)
get_avg_std_distance_matrix(f, False, True)
get_avg_std_distance_matrix(f, res_thr, True, False, True)
get_avg_std_distance_matrix(f, res_thr, False, False, True)
for f in SSU_set:
get_avg_std_distance_matrix(f, True, True)
get_avg_std_distance_matrix(f, False, True)
get_avg_std_distance_matrix(f, res_thr, True, False, True)
get_avg_std_distance_matrix(f, res_thr, False, False, True)
print()
print()
......@@ -1477,3 +1538,4 @@ if __name__ == "__main__":
process_jobs(joblist)
merge_jsons()
......