Louis BECQUEY

Revision 1 for Bioinformatics completed

......@@ -12,4 +12,5 @@ esl*
# environment stuff
.vscode/
*.pyc
\ No newline at end of file
*.pyc
__pycache__/
\ No newline at end of file
......
......@@ -94,6 +94,8 @@ The detailed list of options is below:
-h [ --help ] Print this help message
--version Print the program version
-f [ --full-inference ] Infer new 3D->family mappings even if Rfam already provides some. Yields more copies of chains
mapped to different families.
-r 4.0 [ --resolution=4.0 ] Maximum 3D structure resolution to consider an RNA chain.
-s Run statistics computations after completion
--extract Extract the portions of 3D RNA chains to individual mmCIF files.
......@@ -105,7 +107,7 @@ The detailed list of options is below:
RNAcifs/ Full structures containing RNA, in mmCIF format
rna_mapped_to_Rfam/ Extracted 'pure' RNA chains
datapoints/ Final results in CSV file format.
--seq-folder=… Path to a folder to store the sequence and alignment files.
--seq-folder=… Path to a folder to store the sequence and alignment files. Subfolders will be:
rfam_sequences/fasta/ Compressed hits to Rfam families
realigned/ Sequences, covariance models, and alignments by family
--no-homology Do not try to compute PSSMs and do not align sequences.
......@@ -117,11 +119,12 @@ The detailed list of options is below:
--update-homologous Re-download Rfam and SILVA databases, realign all families, and recompute all CSV files
--from-scratch Delete database, local 3D and sequence files, and known issues, and recompute.
--archive Create a tar.gz archive of the datapoints text files, and update the link to the latest archive
--no-logs Do not save per-chain logs of the numbering modifications
```
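For reference, with the placeholder paths used in the examples below, the two data folders end up organised roughly as follows (an illustrative layout based on the subfolders listed above, not an exhaustive one):
```
~/Data/RNA/3D/
    RNAcifs/                  # full mmCIF structures containing RNA
    rna_mapped_to_Rfam/       # extracted 'pure' RNA chains
    datapoints/               # final results in CSV format
~/Data/RNA/sequences/
    rfam_sequences/fasta/     # compressed hits to Rfam families
    realigned/                # sequences, covariance models, and alignments by family
```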
Typical usage:
```
nohup bash -c 'time ~/Projects/RNANet/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --archive' &
nohup bash -c 'time ~/Projects/RNANet/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &
```
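To restrict a run to high-resolution structures and infer extra 3D->family mappings on top of those provided by Rfam, the same documented flags can be combined, for instance (illustrative command, same placeholder paths as above):
```
nohup bash -c 'time ~/Projects/RNANet/RNAnet.py -r 3.0 -f --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --archive' &
```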
## Post-computation task: estimate quality
......
This diff could not be displayed because it is too large.
1ml5_1_a_1-2914
1ml5_1_a_151-2903
1ml5_1_A_7-1515
1ml5_1_A_2-1520
1ml5_1_A_7-1518
1ml5_1_b_5-121
1eg0_1_O_1-73
2rdo_1_A_3-118
4v48_1_A9_3-118
4v47_1_A9_3-118
6zmi_1_L8_1267-4755
6zm7_1_L8_1267-4755
6y6x_1_L8_1267-4755
6z6n_1_L8_1267-4755
6qzp_1_L8_1267-4755
6zme_1_L8_1267-4755
6z6l_1_L8_1267-4755
6ek0_1_L8_1267-4755
6zmo_1_L8_1267-4755
6z6m_1_L8_1267-4755
6ole_1_D_1267-4755
6om0_1_D_1267-4755
6y2l_1_L8_1267-4755
6lqm_1_8_1267-4755
6y0g_1_L8_1267-4755
6lu8_1_8_1267-4755
6lsr_1_8_1267-4755
6lss_1_8_1267-4755
6oli_1_D_1267-4755
6olg_1_A3_1267-4755
6y57_1_L8_1267-4755
5t2c_1_C_1267-4755
6om7_1_D_1267-4755
4ug0_1_L8_1267-4755
6olf_1_D_1267-4755
6ip5_1_1C_1267-4755
6ip8_1_1C_1267-4755
6olz_1_A3_1267-4755
5aj0_1_A3_1267-4755
5lks_1_L8_1267-4755
6ip6_1_1C_1267-4755
4v6x_1_A8_1267-4755
1vy7_1_AY_1-73
1vy7_1_CY_1-73
4w2h_1_CY_1-73
2z9q_1_A_1-72
1jgq_1_A_2-1520
4v42_1_AA_2-1520
1jgo_1_A_2-1520
1jgp_1_A_2-1520
1ml5_1_A_2-1520
4v42_1_BA_1-2914
1ml5_1_a_1-2914
4v42_1_BB_5-121
1ml5_1_b_5-121
2rdo_1_B_1-2904
4v48_1_A0_1-2904
4v47_1_A0_1-2904
4v48_1_BA_1-1543
4v47_1_BA_1-1542
1ls2_1_B_1-73
3ep2_1_Y_1-72
3eq3_1_Y_1-72
4v48_1_A6_1-73
1eg0_1_O_1-73
2z9q_1_A_1-72
1gsg_1_T_1-72
3jcr_1_H_1-115
4v42_1_BA_1-2914
4v42_1_BA_151-2903
4v48_1_BA_1-91
4v48_1_BA_6-1541
4v48_1_BA_1-1543
4v48_1_BA_6-1538
4v47_1_BA_1-91
4v47_1_BA_6-1540
4v47_1_BA_1-1542
4v47_1_BA_6-1537
2rdo_1_B_1-2903
2rdo_1_B_6-1460
2rdo_1_B_1-1528
2rdo_1_B_6-1457
2rdo_1_B_160-2893
2rdo_1_B_1-2904
2rdo_1_B_6-1522
4v48_1_A0_1-2903
4v48_1_A0_6-1460
4v48_1_A0_1-1528
4v48_1_A0_6-1457
4v48_1_A0_160-2893
4v48_1_A0_1-2904
4v48_1_A0_6-1522
4v47_1_A0_1-2903
4v47_1_A0_6-1460
4v47_1_A0_1-1528
4v47_1_A0_6-1457
4v47_1_A0_160-2893
4v47_1_A0_1-2904
4v47_1_A0_6-1522
1x1l_1_A_1-132
1zc8_1_Z_1-93
2ob7_1_D_1-132
2ob7_1_A_10-319
1x1l_1_A_1-130
1zc8_1_Z_1-130
1zc8_1_Z_1-91
2ob7_1_D_1-130
6rxu_1_C2_588-2386
6rxu_1_C2_583-2388
6rxu_1_C2_588-2383
5oql_1_2_588-2386
5oql_1_2_583-2388
5oql_1_2_588-2383
6rxv_1_C2_588-2386
6rxv_1_C2_583-2388
6rxv_1_C2_588-2383
6rxz_1_C2_588-2386
6rxz_1_C2_583-2388
6rxz_1_C2_588-2383
6rxy_1_C2_588-2386
6rxy_1_C2_583-2388
6rxy_1_C2_588-2383
6rxt_1_C2_588-2386
6rxt_1_C2_583-2388
6rxt_1_C2_588-2383
1r2x_1_C_1-58
1r2w_1_C_1-58
1eg0_1_L_1-57
1eg0_1_L_1-56
1jgq_1_A_7-1518
1jgq_1_A_20-55
1jgq_1_A_2-1520
1jgq_1_A_7-1515
4v42_1_AA_7-1518
4v42_1_AA_20-55
4v42_1_AA_2-1520
4v42_1_AA_7-1515
1jgo_1_A_7-1518
1jgo_1_A_20-55
1jgo_1_A_2-1520
1jgo_1_A_7-1515
1jgp_1_A_7-1518
1jgp_1_A_20-55
1jgp_1_A_2-1520
1jgp_1_A_7-1515
1zc8_1_A_1-59
1mvr_1_D_1-59
4c9d_1_D_29-1
4c9d_1_C_29-1
4adx_1_9_1-121
1mvr_1_D_1-61
4adx_1_9_1-123
1zn1_1_B_1-59
1emi_1_B_1-108
3iy9_1_A_498-1027
......@@ -143,25 +49,1558 @@
3cw1_1_V_1-138
3cw1_1_v_1-138
2iy3_1_B_9-105
3jcr_1_N_1-188
3jcr_1_N_1-106
3jcr_1_N_1-107
2vaz_1_A_64-177
2ftc_1_R_1-1568
2ftc_1_R_792-1568
2ftc_1_R_81-1466
3jcr_1_M_1-141
3jcr_1_M_1-188
3jcr_1_M_1-107
4v5z_1_B0_1-2899
4v5z_1_B0_1-2902
4v5z_1_B0_1-2840
5g2x_1_A_595-692
3iy8_1_A_1-540
4v5z_1_BY_2-113
4v5z_1_BZ_1-70
1mvr_1_B_1-96
4adx_1_0_1-2923
4adx_1_0_132-2915
4v5z_1_B1_2-125
1mvr_1_B_3-96
4adx_1_0_1-2925
3eq4_1_Y_1-69
6uz7_1_8_2140-2827
4v5z_1_AA_1-1563
4v5z_1_AA_1-1562
6cfj_1_1X
6cfj_1_2X
5hcq_1_1X
6cae_1_1X
5hcq_1_2X
5hcr_1_1X
4z8c_1_1X
5j4b_1_1X
5j4b_1_2X
4z8c_1_2X
6cae_1_2X
5j4c_1_1X
5w4k_1_1X
6of1_1_1X
5hcr_1_2X
5hd1_1_1X
5hcp_1_1X
6of1_1_2X
5hau_1_1W
5j4c_1_2X
5wis_1_1X
6xqd_1_1X
6nd5_1_1X
5w4k_1_2X
5hau_1_2W
6xqd_1_2X
4y4p_1_1X
6o97_1_1X
5hcp_1_2X
5doy_1_1X
4zer_1_1X
5wit_1_1X
5hd1_1_2X
6nd5_1_2X
4z3s_1_1X
7jql_1_1X
7jqm_1_1X
7jql_1_2X
5wis_1_2X
6nd6_1_1X
6o97_1_2X
4y4p_1_2X
7jqm_1_2X
4z3s_1_2X
4zer_1_2X
6uo1_1_2X
6uo1_1_1X
5doy_1_2X
5wit_1_2X
5f8k_1_1X
6nd6_1_2X
6xqe_1_1X
6xqe_1_2X
6n9e_1_1X
6n9e_1_2X
6n9f_1_1X
5f8k_1_2X
6n9f_1_2X
6xz7_1_F
6y69_1_W
5afi_1_V
5afi_1_W
6h4n_1_W
5wdt_1_V
5wfs_1_V
5wdt_1_W
5wfs_1_W
5we4_1_V
5we4_1_W
5uq8_1_Y
6c4i_1_Y
6c4i_1_X
5zeb_1_V
5zep_1_W
5lzd_1_V
5we6_1_V
5wfk_1_V
5wfk_1_W
5we6_1_W
5u4i_1_Y
5uq7_1_Y
5u4i_1_X
5lza_1_V
5wf0_1_V
5wf0_1_W
5zeu_1_V
5l3p_1_X
3jcj_1_V
6gxm_1_X
6gwt_1_X
6gxn_1_X
6gxo_1_X
3j9y_1_V
6o9k_1_Y
6o7k_1_V
5lzf_1_V
3jcn_1_V
5lzc_1_V
5u4j_1_X
5u4j_1_Z
5lzb_1_V
6h58_1_W
6h58_1_WW
1eg0_1_O
5j8b_1_X
4v7j_1_AV
4v7j_1_BV
4v7k_1_BV
4v7k_1_AV
4v7k_1_BW
4v7k_1_AW
4v7j_1_AW
4v7j_1_BW
4v4j_1_Z
6i0v_1_B
5k77_1_X
5k77_1_V
5k77_1_Y
5k77_1_W
5k77_1_Z
4pei_1_X
4pei_1_V
4pei_1_W
4pei_1_Z
4pei_1_Y
4a3c_1_P
4a3e_1_P
6lkq_1_U
7k00_1_B
6qdw_1_A
2rdo_1_A
4v48_1_A9
4v47_1_A9
6hcj_1_Q3
6hcq_1_Q3
5mmm_1_Z
4w2e_1_W
5j4b_1_1Y
6cfj_1_1W
5w4k_1_1Y
5wit_1_1W
6cfj_1_1Y
6cfj_1_2W
5j4c_1_1W
5wis_1_1Y
5j4c_1_1Y
6cfj_1_2Y
5wis_1_1W
5j4b_1_1W
5j4c_1_2W
5j4b_1_2W
5j4b_1_2Y
5j4c_1_2Y
5w4k_1_1W
6nd5_1_1Y
5wis_1_2Y
5wit_1_2W
5doy_1_1Y
5w4k_1_2Y
4y4p_1_1Y
4z3s_1_1Y
5doy_1_1W
5doy_1_2Y
6nd5_1_1W
4z3s_1_2Y
4z3s_1_1W
5w4k_1_2W
6nd5_1_2Y
4y4p_1_2Y
6uo1_1_2Y
6uo1_1_2W
4y4p_1_1W
4z3s_1_2W
6uo1_1_1Y
6uo1_1_1W
5wis_1_2W
5wit_1_1Y
6nd5_1_2W
4y4p_1_2W
5doy_1_2W
5wit_1_2Y
6ucq_1_1Y
4v4i_1_Z
6ucq_1_1X
6ucq_1_2Y
4w2e_1_X
6ucq_1_2X
6yss_1_W
5afi_1_Y
5uq8_1_Z
5wdt_1_Y
5wfs_1_Y
6ysr_1_W
5we4_1_Y
6yst_1_W
5uq7_1_Z
5we6_1_Y
5wfk_1_Y
5wf0_1_Y
6o9j_1_V
6ysu_1_W
3j46_1_A
5j8b_1_Y
5j8b_1_W
3bbv_1_Z
5aj0_1_BV
5aj0_1_BW
4wt8_1_AB
4wt8_1_BB
4v4j_1_Y
4v4i_1_Y
5uq8_1_X
5uq7_1_X
1jgq_1_A
4v42_1_AA
1jgo_1_A
1jgp_1_A
1ml5_1_A
4v4j_1_W
4v4i_1_W
4v42_1_BA
4wt8_1_CS
4wt8_1_DS
4v4j_1_X
4v4i_1_X
4v42_1_BB
6uu4_1_333
6uu0_1_333
6uuc_1_333
6uu2_1_333
6b6h_1_3
6pb4_1_3
6d30_1_C
6j7z_1_C
3er9_1_D
5kal_1_Y
4nia_1_3
5kal_1_Z
4nia_1_7
4nia_1_4
5new_1_C
4nia_1_U
4nia_1_6
4oq9_1_7
4nia_1_1
4oq9_1_4
4nia_1_8
4oq9_1_8
4nia_1_5
2vrt_1_E
4nia_1_W
4oq9_1_6
4oq8_1_D
4nia_1_Z
4oq9_1_W
4oq9_1_5
4nia_1_2
2vrt_1_F
4oq9_1_U
4oq9_1_Z
4oq9_1_2
4oq9_1_3
1ddl_1_E
4oq9_1_1
6rt5_1_A
6rt5_1_E
4qu6_1_B
6lkq_1_T
6qdw_1_B
3jbv_1_B
3jbu_1_B
2rdo_1_B
4v48_1_A0
4v47_1_A0
6do8_1_B
6dpi_1_B
6dp9_1_B
6dpb_1_B
6dmn_1_B
6dpp_1_B
6dpk_1_B
6dpd_1_B
6dot_1_B
6dok_1_B
6dp8_1_B
6dpl_1_B
6dpg_1_B
6dou_1_B
6dpc_1_B
6do9_1_B
6dmv_1_B
6dp4_1_B
6dpn_1_B
6doj_1_B
6dph_1_B
6dos_1_B
6doo_1_B
6dp6_1_B
6dox_1_B
6dp5_1_B
6dol_1_B
6dp1_1_B
6doz_1_B
6dp7_1_B
6doq_1_B
6dpa_1_B
6dom_1_B
6dog_1_B
6dop_1_B
6doh_1_B
6doa_1_B
6don_1_B
6dov_1_B
6dpo_1_B
6dod_1_B
6dob_1_B
6dow_1_B
6dpm_1_B
6dpf_1_B
6dp3_1_B
6dp2_1_B
6dpe_1_B
6dpj_1_B
6dor_1_B
6dof_1_B
6dp0_1_B
6doi_1_B
6doc_1_B
6doe_1_B
6n6g_1_D
6lkq_1_S
5h5u_1_H
5lze_1_Y
5lze_1_V
5lze_1_X
3jcj_1_G
6o7k_1_G
4v48_1_BA
4v47_1_BA
4b3r_1_W
4b3t_1_W
4b3s_1_W
5o2r_1_X
5kcs_1_1X
6fti_1_U
6fti_1_W
6ftj_1_U
6ftj_1_W
6ftg_1_U
6ftg_1_W
6ole_1_T
6om0_1_T
6oli_1_T
6om7_1_T
6olf_1_T
6w6l_1_T
6x1b_1_D
6x1b_1_F
5f6c_1_C
6i0t_1_B
1b2m_1_C
1b2m_1_D
1b2m_1_E
2uxc_1_Y
4a3g_1_P
4a3j_1_P
7k00_1_5
5mmi_1_Z
3j9m_1_U
6nu2_1_U
6nu3_1_U
5c0y_1_C
6n6f_1_D
4ohy_1_B
4oi1_1_B
4oi0_1_B
6raz_1_Y
5ipl_1_3
6utw_1_333
5ipm_1_3
5ipn_1_3
4ylo_1_3
4yln_1_6
4ylo_1_6
4yln_1_3
4yln_1_9
5lzf_1_Y
1n32_1_Z
5zsl_1_D
5zsd_1_C
5zsd_1_D
5zsl_1_E
4nku_1_D
4nku_1_H
1cwp_1_E
6qik_1_Y
6rzz_1_Y
6ri5_1_Y
6qt0_1_Y
6qtz_1_Y
6t83_1_1B
6t83_1_3B
6t83_1_AA
6t83_1_CA
6s05_1_Y
5jcs_1_X
5fl8_1_X
3erc_1_G
6of1_1_1W
6cae_1_1Y
6o97_1_1W
6of1_1_1Y
6of1_1_2W
6o97_1_1Y
6nd6_1_1Y
6cae_1_1W
6of1_1_2Y
6cae_1_2Y
6nd6_1_1W
6cae_1_2W
6o97_1_2Y
6nd6_1_2Y
6o97_1_2W
6nd6_1_2W
6xz7_1_G
6gz5_1_BW
6gz3_1_BW
1ls2_1_B
3ep2_1_Y
3eq3_1_Y
4v48_1_A6
2z9q_1_A
4hot_1_X
6d2z_1_C
4tu0_1_F
4tu0_1_G
6r9o_1_B
6is0_1_C
5lzc_1_X
5lzb_1_X
5lzd_1_Y
5lzc_1_Y
5lzb_1_Y
1gsg_1_T
6zvi_1_D
6sv4_1_NB
6sv4_1_NC
6i7o_1_NB
5y88_1_X
3j6x_1_IR
3j6y_1_IR
6tb3_1_N
6tnu_1_N
2uxb_1_X
2x1f_1_B
2x1a_1_B
3eq3_1_D
3ep2_1_D
1eg0_1_M
3eq4_1_D
5o1y_1_B
3jcr_1_H
6dzi_1_H
5zeu_1_A
6mpi_1_W
5mfx_1_B
5w0m_1_J
5bud_1_E
5w0m_1_I
5w0m_1_H
4j7m_1_B
5bud_1_D
6a4e_1_B
6a4e_1_D
6hxx_1_AA
6hxx_1_AB
6hxx_1_AC
6hxx_1_AD
6hxx_1_AE
6hxx_1_AF
6hxx_1_AG
6hxx_1_AH
6hxx_1_AI
6hxx_1_AJ
6hxx_1_AK
6hxx_1_AL
6hxx_1_AM
6hxx_1_AN
6hxx_1_AO
6hxx_1_AP
6hxx_1_AQ
6hxx_1_AR
6hxx_1_AS
6hxx_1_AT
6hxx_1_AU
6hxx_1_AV
6hxx_1_AW
6hxx_1_AX
6hxx_1_AY
6hxx_1_AZ
6hxx_1_BA
6hxx_1_BB
6hxx_1_BC
6hxx_1_BD
6hxx_1_BE
6hxx_1_BF
6hxx_1_BG
6hxx_1_BH
6hxx_1_BI
5odv_1_A
5odv_1_B
5odv_1_C
5odv_1_D
5odv_1_E
5odv_1_F
5odv_1_G
5odv_1_H
5odv_1_I
5odv_1_J
5odv_1_K
5odv_1_L
5odv_1_M
5odv_1_N
5odv_1_O
5odv_1_P
5odv_1_Q
5odv_1_R
5odv_1_S
5odv_1_T
5odv_1_U
5odv_1_V
5odv_1_W
5odv_1_X
6t34_1_A
6t34_1_B
6t34_1_C
6t34_1_D
6t34_1_E
6t34_1_F
6t34_1_G
6t34_1_H
6t34_1_I
6t34_1_J
6t34_1_K
6t34_1_L
6t34_1_M
6t34_1_N
6t34_1_O
6t34_1_P
6t34_1_Q
6t34_1_R
6t34_1_S
6ip8_1_ZY
6ip5_1_ZY
6ip5_1_ZU
6ip6_1_ZY
6ip8_1_ZZ
6ip6_1_ZZ
6uu3_1_333
6uu1_1_333
1pn8_1_D
3er8_1_H
3er8_1_G
3er8_1_F
5o3j_1_B
4dr7_1_B
1i5l_1_Y
1i5l_1_U
4dr6_1_B
6i2n_1_U
4v68_1_A0
6vyu_1_Y
6vyw_1_Y
6vz7_1_Y
6vz5_1_Y
6vz3_1_Y
6vyy_1_Y
6vyx_1_Y
6vyz_1_Y
6vz2_1_Y
1mvr_1_1
6vyt_1_Y
1cgm_1_I
3jb7_1_T
3jb7_1_M
3j0o_1_D
3j0l_1_D
3j0q_1_D
3j0p_1_D
5elt_1_F
5elt_1_E
2tmv_1_R
5a79_1_R
5a7a_1_R
2om3_1_R
2xea_1_R
4wtl_1_T
4wtl_1_P
1xnq_1_W
1x18_1_C
1x18_1_B
1x18_1_D
1vq6_1_4
4am3_1_D
4am3_1_H
4am3_1_I
4lj0_1_C
4lj0_1_D
4lj0_1_E
5lzy_1_HH
4wtj_1_T
4wtj_1_P
4xbf_1_D
6ow3_1_I
6ovy_1_I
6oy6_1_I
6n6d_1_D
6n6k_1_C
6n6k_1_D
3rtj_1_D
1apg_1_D
6ty9_1_M
6tz1_1_N
4bbl_1_Y
4bbl_1_Z
6sce_1_B
6scf_1_I
6scf_1_K
6yud_1_K
6yud_1_O
6scf_1_M
6yud_1_P
6scf_1_L
6yud_1_M
6yud_1_Q
6o6x_1_D
4ba2_1_R
6o6x_1_C
6o7b_1_C
6o6v_1_C
6r7b_1_D
6r9r_1_D
6ov0_1_E
6ov0_1_H
6ov0_1_G
6o6v_1_D
6ov0_1_F
6o7b_1_D
5e02_1_C
6r9r_1_E
6r7b_1_E
6o7i_1_I
6o7h_1_K
7jyy_1_F
7jyy_1_E
7jz0_1_F
7jz0_1_E
6rt6_1_A
6rt6_1_E
1y1y_1_P
5zuu_1_I
5zuu_1_G
4peh_1_W
4peh_1_V
4peh_1_X
4peh_1_Y
4peh_1_Z
6mkn_1_W
4cxg_1_C
4cxh_1_C
1x1l_1_A
1zc8_1_Z
2ob7_1_D
2ob7_1_A
4eya_1_E
4eya_1_F
4eya_1_Q
4eya_1_R
2r1g_1_B
4ht9_1_E
1cvj_1_M
6z1p_1_AB
6z1p_1_AA
4ii9_1_C
5mq0_1_3
5uk4_1_X
5uk4_1_V
5uk4_1_W
5uk4_1_U
5f6c_1_E
4rcj_1_B
1xnr_1_W
6e0o_1_C
6o75_1_D
6o75_1_C
6e0o_1_B
3j06_1_R
1r2x_1_C
1r2w_1_C
1eg0_1_L
4eya_1_G
4eya_1_H
4eya_1_S
4eya_1_T
4dr4_1_V
1ibl_1_Z
1ibm_1_Z
4dr5_1_V
4d61_1_J
1trj_1_B
1trj_1_C
6q8y_1_N
6sv4_1_N
6i7o_1_N
5k8h_1_A
5z4a_1_B
3jbu_1_V
1h2c_1_R
1h2d_1_S
1h2d_1_R
6szs_1_X
5mgp_1_X
6enu_1_X
6enf_1_X
6enj_1_X
1pvo_1_L
1pvo_1_G
1pvo_1_H
1pvo_1_J
1pvo_1_K
2ht1_1_K
2ht1_1_J
6eri_1_AX
1zc8_1_A
1zc8_1_C
1zc8_1_B
1zc8_1_G
1zc8_1_I
1zc8_1_H
1zc8_1_J
4v8z_1_CX
6kqe_1_I
5uh8_1_I
5vi5_1_Q
4xln_1_T
4xlr_1_T
4xln_1_Q
5i2d_1_K
5i2d_1_V
4xlr_1_Q
6sty_1_C
6sty_1_F
2xs5_1_D
3ok4_1_N
3ok4_1_L
3ok4_1_Z
3ok4_1_4
3ok4_1_V
3ok4_1_X
3ok4_1_P
3ok4_1_H
3ok4_1_J
3ok4_1_R
3ok4_1_T
3ok4_1_2
6n6h_1_D
5wnt_1_B
3b0u_1_B
3b0u_1_A
4x9e_1_G
4x9e_1_H
6z1p_1_BB
6z1p_1_BA
2uxd_1_X
4qvd_1_H
4v7e_1_AB
3ol9_1_D
3ol9_1_H
3ol9_1_L
3ol9_1_P
3olb_1_L
3olb_1_P
3olb_1_D
3olb_1_H
3ol6_1_D
3ol6_1_H
3ol6_1_L
3ol6_1_P
3ol8_1_D
3ol8_1_H
3ol7_1_L
3ol7_1_P
3ol7_1_D
3ol7_1_H
3ol8_1_L
3ol8_1_P
1qzc_1_C
1qzc_1_A
6ole_1_V
6om0_1_V
6oli_1_V
6om7_1_V
6w6l_1_V
6olf_1_V
1mvr_1_D
4wtm_1_T
4wtm_1_P
5x70_1_E
5x70_1_G
6gz5_1_BV
6gz4_1_BV
6gz3_1_BV
6fti_1_Q
4v7e_1_AE
4v7e_1_AD
4x62_1_B
4x64_1_B
4x65_1_B
1xmq_1_W
4x66_1_B
3t1h_1_W
3t1y_1_W
1xmo_1_W
4adx_1_9
6kr6_1_B
1zn1_1_B
6z8k_1_X
1cvj_1_Q
4csf_1_U
4csf_1_Q
4csf_1_G
4csf_1_M
4csf_1_K
4csf_1_A
4csf_1_I
4csf_1_S
4csf_1_C
4csf_1_W
4csf_1_O
4csf_1_E
1cvj_1_N
1cvj_1_O
1cvj_1_S
1cvj_1_P
1cvj_1_T
1cvj_1_R
6th6_1_AA
6skg_1_AA
6skf_1_AA
6q8y_1_M
6i7o_1_M
6zmw_1_W
6ybv_1_W
2fz2_1_D
2xpj_1_D
2vrt_1_H
2vrt_1_G
1emi_1_B
6r9m_1_B
4nia_1_C
4nia_1_A
4nia_1_H
4nia_1_N
4nia_1_G
4nia_1_D
4nia_1_B
4nia_1_I
4nia_1_E
4nia_1_M
4oq9_1_I
4oq9_1_G
4oq9_1_C
4oq9_1_H
4oq9_1_N
4oq9_1_A
4oq9_1_D
4oq9_1_E
4oq9_1_M
4oq9_1_B
5uhc_1_I
1uvn_1_F
1uvn_1_B
1uvn_1_D
3iy9_1_A
4wtk_1_T
4wtk_1_P
1vqn_1_4
4oav_1_C
4oav_1_A
3ep2_1_E
3eq3_1_E
3eq4_1_E
3ep2_1_A
3eq3_1_A
3eq4_1_A
3ep2_1_C
3eq3_1_C
3eq4_1_C
3ep2_1_B
3eq3_1_B
3eq4_1_B
4i67_1_B
3pgw_1_R
3pgw_1_N
3cw1_1_X
3cw1_1_W
3cw1_1_V
5it9_1_I
6k32_1_T
6k32_1_P
5mmj_1_A
5x8r_1_A
3j2k_1_3
3j2k_1_2
3j2k_1_1
3j2k_1_0
3j2k_1_4
3nvk_1_G
3nvk_1_S
2iy3_1_B
1cwp_1_F
5z4j_1_B
5gmf_1_E
5gmf_1_H
6e4p_1_J
5gmf_1_F
5gmf_1_G
5gmg_1_D
5gmg_1_C
6e4p_1_K
3ie1_1_E
3ie1_1_H
3ie1_1_F
4dr7_1_V
3ie1_1_G
3s4g_1_C
3s4g_1_B
2qqp_1_R
2zde_1_E
2zde_1_F
2zde_1_H
2zde_1_G
1nb7_1_E
1nb7_1_F
4hos_1_X
3p6y_1_T
3p6y_1_V
3p6y_1_U
3p6y_1_Q
3p6y_1_W
5dto_1_B
4cxh_1_X
1uvj_1_F
1uvj_1_D
1uvj_1_E
6kqd_1_I
6kqd_1_S
5uh5_1_I
1ytu_1_F
1ytu_1_D
4kzz_1_J
5t2c_1_AN
4v5z_1_BF
3j6b_1_E
4v4f_1_B6
4v4f_1_A5
4v4f_1_A3
4v4f_1_B0
4v4f_1_B9
4v4f_1_A2
4v4f_1_A8
4v4f_1_A1
4v4f_1_A9
4v4f_1_BZ
4v4f_1_B8
4v4f_1_B7
4v4f_1_B5
4v4f_1_A0
4v4f_1_A7
4v4f_1_A4
4v4f_1_AZ
4v4f_1_B3
4v4f_1_B1
4v4f_1_B4
4v4f_1_A6
4v4f_1_B2
5flx_1_Z
5zsb_1_C
5zsb_1_D
5zsn_1_D
5zsn_1_E
3jcr_1_N
6gfw_1_R
2vaz_1_A
1qzc_1_B
1mvr_1_C
4v5z_1_BP
6n6e_1_D
4g7o_1_I
4g7o_1_S
5x22_1_S
5x22_1_I
5x21_1_I
5uh6_1_I
6l74_1_I
5uh9_1_I
2ftc_1_R
6sag_1_R
4udv_1_R
2r1g_1_E
5zsc_1_D
5zsc_1_C
6woy_1_I
6wox_1_I
6evj_1_N
6evj_1_M
4gkk_1_W
4v9e_1_AG
4v9e_1_BM
4v9e_1_AM
4v9e_1_AA
4v9e_1_BA
4v9e_1_BG
5lzs_1_II
6fqr_1_C
6ha1_1_X
5kcr_1_1X
2r1g_1_X
3m7n_1_Z
3m85_1_X
3m85_1_Z
3m85_1_Y
1e8s_1_C
5wnp_1_B
5wnv_1_B
5yts_1_B
1utd_1_6
1utd_1_Z
1utd_1_4
1utd_1_7
1utd_1_9
1utd_1_5
1utd_1_3
1utd_1_2
1utd_1_8
1utd_1_1
6n6i_1_C
6n6i_1_D
6n6a_1_D
6ij2_1_F
6ij2_1_G
6ij2_1_H
6ij2_1_E
3u2e_1_D
3u2e_1_C
5uef_1_C
5uef_1_D
4x4u_1_H
4afy_1_D
6oy5_1_I
6owl_1_B
6owl_1_C
4afy_1_C
4lq3_1_R
6s0m_1_C
6gx6_1_B
4k4s_1_D
4k4s_1_H
4k4t_1_H
4k4t_1_D
1zn1_1_C
1zn0_1_C
1xpu_1_G
1xpu_1_L
1xpr_1_L
1xpu_1_H
1xpo_1_K
1xpo_1_J
1xpu_1_J
1xpo_1_H
1xpr_1_J
1xpu_1_K
1xpr_1_K
1xpo_1_M
1xpo_1_L
1xpu_1_M
1xpr_1_M
1xpo_1_G
1xpr_1_H
1xpr_1_G
6gc5_1_F
6gc5_1_H
6gc5_1_G
4v7e_1_AA
4v7e_1_AC
1n1h_1_B
4ohz_1_B
6t83_1_6B
4gv6_1_C
4gv6_1_B
4gv3_1_C
4gv3_1_B
4gv9_1_E
6i7o_1_L
2a8v_1_D
6qx3_1_G
2xnr_1_C
4gkj_1_W
4v5z_1_BC
4v5z_1_BB
4v5z_1_BH
3j0o_1_F
3j0l_1_F
3j0p_1_F
3j0q_1_F
3j0o_1_B
3j0l_1_B
3j0o_1_C
3j0l_1_C
3j0q_1_C
3j0p_1_C
3j0o_1_A
3j0l_1_A
3j0q_1_A
3j0p_1_A
1cwp_1_D
4v5z_1_BJ
5sze_1_C
6wre_1_D
6i0u_1_B
5zsa_1_C
5zsa_1_D
1n34_1_Z
3pf5_1_S
6ppn_1_A
6ppn_1_I
6qdw_1_V
5hk0_1_F
4qm6_1_D
4qm6_1_C
4jzu_1_C
4jzv_1_C
5ytv_1_B
4k4z_1_P
4k4z_1_D
4k4x_1_L
4k4z_1_L
4k4x_1_D
4k4z_1_H
4k4x_1_H
4k4x_1_P
1t1m_1_A
1t1m_1_B
4a3b_1_P
4a3m_1_P
6u6y_1_E
6u6y_1_G
6u6y_1_F
6u6y_1_H
6qik_1_X
6rzz_1_X
6ri5_1_X
6qt0_1_X
6qtz_1_X
6s05_1_X
6t83_1_BB
6t83_1_4B
5fl8_1_Z
5jcs_1_Z
5mrc_1_BB
5mre_1_BB
5mrf_1_BB
6gz4_1_BW
3j46_1_P
3jcr_1_M
4e6b_1_A
4e6b_1_B
6a6l_1_D
4v5z_1_BS
4v8t_1_1
1uvi_1_D
1uvi_1_F
1uvi_1_E
4m7d_1_P
4k4u_1_D
4k4u_1_H
6rt7_1_E
6rt7_1_A
2voo_1_C
2voo_1_D
5k78_1_X
5k78_1_Y
4ylo_1_9
4kzy_1_I
4kzz_1_I
4kzx_1_I
5vyc_1_I2
5vyc_1_I3
5vyc_1_I5
5vyc_1_I1
5vyc_1_I6
5vyc_1_I4
6ip8_1_2M
6ip5_1_2M
6ip6_1_2M
6qcs_1_M
486d_1_G
2r1g_1_C
486d_1_F
4v5z_1_B0
4nia_1_O
4nia_1_J
4nia_1_K
4nia_1_L
4nia_1_F
4oq9_1_K
4oq9_1_O
4oq9_1_J
4oq9_1_F
4oq9_1_L
5tbw_1_SR
6hhq_1_SR
6zvi_1_H
6sv4_1_2B
6sv4_1_2C
6t83_1_2B
6t83_1_A
6i7o_1_2B
6r9q_1_B
6v3a_1_SN1
6v3b_1_SN1
6v39_1_SN1
6v3e_1_SN1
1pn7_1_C
1mj1_1_Q
1mj1_1_R
4dr6_1_V
6kql_1_I
4eya_1_M
4eya_1_N
4eya_1_A
4eya_1_B
2wj8_1_D
2wj8_1_I
2wj8_1_L
2wj8_1_F
2wj8_1_C
2wj8_1_Q
2wj8_1_J
2wj8_1_P
2wj8_1_K
2wj8_1_E
2wj8_1_T
2wj8_1_B
2wj8_1_O
2wj8_1_N
2wj8_1_A
2wj8_1_H
2wj8_1_R
2wj8_1_M
2wj8_1_S
2wj8_1_G
4e6b_1_E
4e6b_1_F
6p71_1_I
3pdm_1_R
5det_1_P
5els_1_I
4n2s_1_B
4yoe_1_E
3j0o_1_H
3j0l_1_H
3j0p_1_H
3j0q_1_H
5gxi_1_B
3iy8_1_A
6tnu_1_M
5mc6_1_M
5mc6_1_N
4eya_1_O
4eya_1_P
4eya_1_C
4eya_1_D
6htq_1_V
6htq_1_W
6htq_1_U
6uu6_1_333
6v3a_1_V
6v39_1_V
5a0v_1_F
3avt_1_T
6d1v_1_C
4s2x_1_B
4s2y_1_B
5wnu_1_B
1zc8_1_F
1vtm_1_R
4v5z_1_BA
4v5z_1_BE
4v5z_1_BD
4v5z_1_BG
4v5z_1_BI
4v5z_1_BK
4v5z_1_BM
4v5z_1_BL
4v5z_1_BV
4v5z_1_BO
4v5z_1_BN
4v5z_1_BQ
4v5z_1_BR
4v5z_1_BT
4v5z_1_BU
4v5z_1_BW
4v5z_1_BY
4v5z_1_BX
4v5z_1_BZ
6u9x_1_H
6u9x_1_K
5elk_1_R
6okk_1_G
4cxg_1_A
4cxh_1_A
6bk8_1_I
4cxg_1_B
4cxh_1_B
4v5z_1_B1
5z4d_1_B
6o78_1_E
6ha8_1_X
1m8w_1_E
1m8w_1_F
5udi_1_B
5udl_1_B
5udk_1_B
5udj_1_B
5w5i_1_B
5w5i_1_D
5w5h_1_B
5w5h_1_D
4eya_1_K
4eya_1_L
4eya_1_I
4eya_1_J
4g9z_1_E
4g9z_1_F
3nma_1_B
3nma_1_C
6een_1_G
6een_1_I
6een_1_H
4wti_1_T
4wti_1_P
5l3p_1_Y
4hor_1_X
3rzo_1_R
2f4v_1_Z
1qln_1_R
2xs7_1_B
6zvi_1_E
6sv4_1_MC
6sv4_1_MB
6i7o_1_MB
6ogy_1_M
6ogy_1_N
6uej_1_B
1x18_1_A
5ytx_1_B
6o8w_1_U
4g0a_1_H
6r9p_1_B
3koa_1_C
4n48_1_D
4n48_1_G
6kug_1_B
6ktc_1_V
6ole_1_U
6om0_1_U
6olg_1_BV
6oli_1_U
6om7_1_U
6w6l_1_U
6olz_1_BV
6olf_1_U
5lzd_1_X
6m7k_1_B
3cd6_1_4
3cma_1_5
6n9e_1_2W
1vqo_1_4
1qvg_1_3
3cme_1_5
5lzd_1_W
5lze_1_W
5lzc_1_W
5lzb_1_W
3wzi_1_C
1mvr_1_E
1mvr_1_B
1mvr_1_A
4adx_1_0
4adx_1_8
1n33_1_Z
6dti_1_W
3d2s_1_F
3d2s_1_H
5mrc_1_AA
5mre_1_AA
5mrf_1_AA
5fl8_1_Y
5jcs_1_Y
2r1g_1_A
2r1g_1_D
2r1g_1_F
3eq4_1_Y
4wkr_1_C
4v99_1_EC
4v99_1_AC
4v99_1_BH
4v99_1_CH
4v99_1_AM
4v99_1_DC
4v99_1_JW
4v99_1_EH
4v99_1_BW
4v99_1_FW
4v99_1_AW
4v99_1_BC
4v99_1_BM
4v99_1_IC
4v99_1_EM
4v99_1_ER
4v99_1_IW
4v99_1_JH
4v99_1_JR
4v99_1_AH
4v99_1_GR
4v99_1_IR
4v99_1_BR
4v99_1_CW
4v99_1_HR
4v99_1_FH
4v99_1_HC
4v99_1_DW
4v99_1_GC
4v99_1_JC
4v99_1_DM
4v99_1_EW
4v99_1_AR
4v99_1_CR
4v99_1_JM
4v99_1_CC
4v99_1_IH
4v99_1_FR
4v99_1_CM
4v99_1_IM
4v99_1_FM
4v99_1_FC
4v99_1_GH
4v99_1_HM
4v99_1_HH
4v99_1_DR
4v99_1_HW
4v99_1_GW
4v99_1_DH
4v99_1_GM
6rt4_1_D
6rt4_1_C
6zvh_1_X
4dwa_1_D
6n6c_1_D
6n6j_1_C
6n6j_1_D
6p7q_1_E
6p7q_1_F
6p7q_1_D
6rcl_1_C
5jju_1_C
4ejt_1_G
5ceu_1_C
5ceu_1_D
6lkq_1_W
3qsu_1_P
3qsu_1_R
1n38_1_B
4qvc_1_G
6q1h_1_D
6q1h_1_H
6p7p_1_F
6p7p_1_E
6p7p_1_D
6vm6_1_J
6vm6_1_G
6wan_1_K
6wan_1_H
6wan_1_G
6wan_1_L
6wan_1_I
6ywo_1_F
6wan_1_J
4oau_1_A
6ywo_1_E
6ywo_1_K
6vm6_1_I
6vm6_1_H
6ywo_1_I
2a1r_1_C
2a1r_1_D
3gpq_1_E
3gpq_1_F
6o79_1_C
6vm6_1_K
6hyu_1_D
1laj_1_R
6ybv_1_K
6mpf_1_W
6spc_1_A
6spe_1_A
6fti_1_V
6ftj_1_V
6ftg_1_V
4g0a_1_G
4g0a_1_F
4g0a_1_E
2b2d_1_S
5hkc_1_C
1rmv_1_B
4qu7_1_X
4qu7_1_V
4qu7_1_U
4v5z_1_AH
4v5z_1_AA
4v5z_1_AB
4v5z_1_AC
4v5z_1_AD
4v5z_1_AE
4v5z_1_AF
4v5z_1_AG
6pmi_1_3
6pmj_1_3
5hjz_1_C
......
This diff could not be displayed because it is too large.
......@@ -11,7 +11,7 @@
# - Use a specialised database (SILVA): better alignments (we guess?), but two kinds of jobs
# - Use cmalign --small everywhere (homogeneity)
# Moreover, --small requires --nonbanded --cyk, which means the output alignment is the optimally scored one.
# To date, we trust Infernal as the best tool to realign RNA. Is it ?
# To date, we trust Infernal as the best tool to realign ncRNA. Is it ?
# Contact: louis.becquey@univ-evry.fr (PhD student), fariza.tahi@univ-evry.fr (PI)
......@@ -28,7 +28,7 @@ pd.set_option('display.max_rows', None)
LSU_set = ["RF00002", "RF02540", "RF02541", "RF02543", "RF02546"] # From Rfam CLAN 00112
SSU_set = ["RF00177", "RF02542", "RF02545", "RF01959", "RF01960"] # From Rfam CLAN 00111
with sqlite3.connect("results/RNANet.db") as conn:
with sqlite3.connect(os.getcwd()+"/results/RNANet.db") as conn:
df = pd.read_sql("SELECT rfam_acc, max_len, nb_total_homol, comput_time, comput_peak_mem FROM family;", conn)
to_remove = [ f for f in df.rfam_acc if f in LSU_set+SSU_set ]
......@@ -74,7 +74,7 @@ ax.set_ylabel("Maximum length of sequences ")
ax.set_zlabel("Computation time (s)")
plt.subplots_adjust(wspace=0.4)
plt.savefig("results/cmalign_jobs_performance.png")
plt.savefig(os.getcwd()+"/results/cmalign_jobs_performance.png")
# # ========================================================
# # Linear Regression of max_mem as function of max_length
......
......@@ -3,7 +3,6 @@
# This file computes additional statistics over the produced dataset.
# Run this file if you want the base counts, pair-type counts, identity percents, etc
# in the database.
# This should be run from the folder where the file is (to access the database with path "results/RNANet.db")
import getopt, os, pickle, sqlite3, shlex, subprocess, sys
import numpy as np
......@@ -22,34 +21,35 @@ from multiprocessing import Pool, Manager
from os import path
from tqdm import tqdm
from collections import Counter
from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker
from setproctitle import setproctitle
from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker, trace_unhandled_exceptions
path_to_3D_data = "tobedefinedbyoptions"
path_to_seq_data = "tobedefinedbyoptions"
runDir = os.getcwd()
res_thr = 20.0 # default: all structures
LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112
SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111
def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
@trace_unhandled_exceptions
def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=2.0):
"""
Plot the joint distribution of pseudotorsion angles, in a Ramachandran-style graph.
See Wadley & Pyle (2007)
See Wadley & Pyle (2007).
Only unique unmapped chains with a resolution below the res argument are considered.
Arguments:
show: True or False, call plt.show() at this end or not
filter_helical: None, "form", "zone", or "both"
None: do not remove helical nucleotide
"form": remove nucleotides if they belong to a A, B or Z form stem
"zone": remove nucleotides falling in an arbitrary zone (see zone argument)
"both": remove nucleotides fulfilling one or both of the above conditions
carbon: 1 or 4, use C4' (eta and theta) or C1' (eta_prime and theta_prime)
sd_range: tuple, set values below avg + sd_range[0] * stdev to 0,
and values above avg + sd_range[1] * stdev to avg + sd_range[1] * stdev.
This removes noise and cuts too high peaks, to clearly see the clusters.
carbon: 1 or 4, use C4' (eta and theta) or C1' (eta_prime and theta_prime)
show: True or False, call plt.show() at this end or not
sd_range: tuple, set values below avg + sd_range[0] * stdev to 0,
and values above avg + sd_range[1] * stdev to avg + sd_range[1] * stdev.
This removes noise and cuts too high peaks, to clearly see the clusters.
res: Maximal resolution value (in Angströms) accepted for a structure, so that
its nucleotides are considered.
"""
os.makedirs("results/figures/wadley_plots/", exist_ok=True)
os.makedirs(runDir + "/results/figures/wadley_plots/", exist_ok=True)
if carbon == 4:
angle = "eta"
......@@ -63,30 +63,32 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
exit("You overestimate my capabilities !")
if not path.isfile(f"data/wadley_kernel_{angle}_{res}A.npz"):
if not path.isfile(runDir + f"/data/wadley_kernel_{angle}_{res}A.npz"):
# Get a worker number to position the progress bar
global idxQueue
thr_idx = idxQueue.get()
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} reproduce_wadley_results(carbon={carbon})")
pbar = tqdm(total=2, desc=f"Worker {thr_idx+1}: eta/theta C{carbon} kernels", position=thr_idx+1, leave=False)
# Extract the angle values of c2'-endo and c3'-endo nucleotides
with sqlite3.connect("results/RNANet.db") as conn:
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
df = pd.read_sql(f"""SELECT {angle}, th{angle}
FROM nucleotide JOIN (
SELECT chain_id FROM chain JOIN structure
WHERE structure.resolution <= {res}
) AS c
FROM (
SELECT chain_id FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
WHERE chain.rfam_acc = 'unmappd' AND structure.resolution <= {res} AND issue = 0
) AS c NATURAL JOIN nucleotide
WHERE puckering="C2'-endo"
AND {angle} IS NOT NULL
AND th{angle} IS NOT NULL;""", conn)
c2_endo_etas = df[angle].values.tolist()
c2_endo_thetas = df["th"+angle].values.tolist()
df = pd.read_sql(f"""SELECT {angle}, th{angle}
FROM nucleotide JOIN (
SELECT chain_id FROM chain JOIN structure
WHERE structure.resolution <= {res}
) AS c
FROM (
SELECT chain_id FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
WHERE chain.rfam_acc = 'unmappd' AND structure.resolution <= {res} AND issue = 0
) AS c NATURAL JOIN nucleotide
WHERE form = '.'
AND puckering="C3'-endo"
AND {angle} IS NOT NULL
......@@ -111,14 +113,16 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
pbar.update(1)
# Save the data to an archive for later use without the need to recompute
np.savez(f"data/wadley_kernel_{angle}_{res}A.npz",
np.savez(runDir + f"/data/wadley_kernel_{angle}_{res}A.npz",
c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas,
c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas,
kernel_c3=f_c3, kernel_c2=f_c2)
pbar.close()
idxQueue.put(thr_idx)
else:
f = np.load(f"data/wadley_kernel_{angle}_{res}A.npz")
setproctitle(f"RNANet statistics.py reproduce_wadley_results(carbon={carbon})")
f = np.load(runDir + f"/data/wadley_kernel_{angle}_{res}A.npz")
c2_endo_etas = f["c2_endo_e"]
c3_endo_etas = f["c3_endo_e"]
c2_endo_thetas = f["c2_endo_t"]
......@@ -148,7 +152,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
f_low_thr = f.mean() + sd_range[0]*f.std()
f_cut = np.where(f > f_sup_thr, f_sup_thr, f)
f_cut = np.where(f_cut < f_low_thr, 0, f_cut)
levels = [f.mean()+f.std(), f.mean()+2*f.std(), f.mean()+4*f.std()]
levels = [ f.mean()+f.std(), f.mean()+2*f.std(), f.mean()+4*f.std()]
# histogram:
fig = plt.figure()
......@@ -157,7 +161,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
ax.bar3d(xpos.ravel(), ypos.ravel(), 0.0, 0.09, 0.09, hist_cut.ravel(), color=color_values, zorder="max")
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}_{res}A.png")
fig.savefig(runDir + f"/results/figures/wadley_plots/wadley_hist_{angle}_{l}_{res}A.png")
if show:
fig.show()
plt.close()
......@@ -168,7 +172,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
ax.plot_surface(xx, yy, f_cut, cmap=cm.get_cmap("coolwarm"), linewidth=0, antialiased=True)
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}_{res}A.png")
fig.savefig(runDir + f"/results/figures/wadley_plots/wadley_distrib_{angle}_{l}_{res}A.png")
if show:
fig.show()
plt.close()
......@@ -177,10 +181,10 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
fig = plt.figure(figsize=(5,5))
ax = fig.gca()
ax.scatter(x, y, s=1, alpha=0.1)
ax.contourf(xx, yy, f_cut, alpha=0.5, cmap=cm.get_cmap("coolwarm"), levels=levels, extend="max")
ax.contourf(xx, yy, f, alpha=0.5, cmap=cm.get_cmap("coolwarm"), levels=levels, extend="max")
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}_{res}A.png")
fig.savefig(runDir + f"/results/figures/wadley_plots/wadley_{angle}_{l}_{res}A.png")
if show:
fig.show()
plt.close()
......@@ -188,9 +192,12 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
def stats_len():
"""Plots statistics on chain lengths in RNA families.
Uses all chains mapped to a family including copies, inferred or not.
REQUIRES tables chain, nucleotide up to date.
"""
setproctitle(f"RNANet statistics.py stats_len({res_thr})")
# Get a worker number to position the progress bar
global idxQueue
......@@ -214,7 +221,7 @@ def stats_len():
cols = []
lengths = []
for i,f in enumerate(tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Average chain lengths", leave=False)):
for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Average chain lengths", leave=False):
# Define a color for that family in the plot
if f in LSU_set:
......@@ -229,7 +236,7 @@ def stats_len():
cols.append("grey")
# Get the lengths of chains
with sqlite3.connect("results/RNANet.db") as conn:
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
l = [ x[0] for x in sql_ask_database(conn, f"""SELECT COUNT(index_chain)
FROM (
SELECT chain_id
......@@ -239,8 +246,6 @@ def stats_len():
GROUP BY chain_id;""", warn_every=0) ]
lengths.append(l) # list of chain lengths from the family
# notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths")
# Plot the figure
fig = plt.figure(figsize=(10,3))
ax = fig.gca()
......@@ -267,7 +272,7 @@ def stats_len():
ncol=1, fontsize='small', bbox_to_anchor=(1.3, 0.5))
# Save the figure
fig.savefig(f"results/figures/lengths_{res_thr}A.png")
fig.savefig(runDir + f"/results/figures/lengths_{res_thr}A.png")
idxQueue.put(thr_idx) # replace the thread index in the queue
# notify("Computed sequence length statistics and saved the figure.")
......@@ -285,6 +290,7 @@ def format_percentage(tot, x):
def stats_freq():
"""Computes base frequencies in all RNA families.
Uses all chains mapped to a family including copies, inferred or not.
Outputs results/frequencies.csv
REQUIRES tables chain, nucleotide up to date."""
......@@ -293,17 +299,18 @@ def stats_freq():
global idxQueue
thr_idx = idxQueue.get()
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} stats_freq()")
# Initialize a Counter object for each family
freqs = {}
for f in fam_list:
freqs[f] = Counter()
# List all nt_names happening within a RNA family and store the counts in the Counter
for i,f in enumerate(tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False)):
with sqlite3.connect("results/RNANet.db") as conn:
for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False):
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;", warn_every=0))
freqs[f].update(counts)
# notify(f"[{i+1}/{len(fam_list)}] Computed {f} nucleotide frequencies.")
# Create a pandas DataFrame, and save it to CSV.
df = pd.DataFrame()
......@@ -311,7 +318,7 @@ def stats_freq():
tot = sum(freqs[f].values())
df = pd.concat([ df, pd.DataFrame([[ format_percentage(tot, x) for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ])
df = df.fillna(0)
df.to_csv("results/frequencies.csv")
df.to_csv(runDir + "/results/frequencies.csv")
idxQueue.put(thr_idx) # replace the thread index in the queue
# notify("Saved nucleotide frequencies to CSV file.")
......@@ -327,11 +334,13 @@ def parallel_stats_pairs(f):
global idxQueue
thr_idx = idxQueue.get()
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} p_stats_pairs({f})")
chain_id_list = mappings_list[f]
data = []
sqldata = []
for cid in tqdm(chain_id_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} basepair types", leave=False):
with sqlite3.connect("results/RNANet.db") as conn:
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
# Get comma separated lists of basepairs per nucleotide
interactions = pd.DataFrame(
sql_ask_database(conn,
......@@ -398,7 +407,7 @@ def parallel_stats_pairs(f):
data.append(expanded_list)
# Update the database
with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn:
with sqlite3.connect(runDir + "/results/RNANet.db", isolation_level=None) as conn:
conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
sql_execute(conn, """UPDATE chain SET pair_count_cWW = ?, pair_count_cWH = ?, pair_count_cWS = ?, pair_count_cHH = ?,
pair_count_cHS = ?, pair_count_cSS = ?, pair_count_tWW = ?, pair_count_tWH = ?, pair_count_tWS = ?,
......@@ -416,8 +425,8 @@ def parallel_stats_pairs(f):
# Create an output DataFrame
f_df = pd.DataFrame([[ x for x in cnt.values() ]], columns=list(cnt), index=[f])
f_df.to_csv(f"data/{f}_counts.csv")
expanded_list.to_csv(f"data/{f}_pairs.csv")
f_df.to_csv(runDir + f"/data/{f}_counts.csv")
expanded_list.to_csv(runDir + f"/data/{f}_pairs.csv")
idxQueue.put(thr_idx) # replace the thread index in the queue
......@@ -430,28 +439,34 @@ def to_dist_matrix(f):
global idxQueue
thr_idx = idxQueue.get()
# notify(f"Computing {f} distance matrix from alignment...")
command = f"esl-alipid --rna --noheader --informat stockholm {f}_3d_only.stk"
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} to_dist_matrix({f})")
# Prepare a file
with open(path_to_seq_data+f"/realigned/{f}++.afa") as al_file:
al = AlignIO.read(al_file, "fasta")
names = [ x.id for x in al if '[' in x.id ]
al = al[-len(names):]
with open(f + "_3d_only.stk", "w") as only_3d:
only_3d.write(al.format("stockholm"))
with open(path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk", "w") as only_3d:
try:
only_3d.write(al.format("stockholm"))
except ValueError as e:
warn(e)
del al
subprocess.run(["esl-reformat", "--informat", "stockholm", "--mingap", "-o", path_to_seq_data+f"/realigned/{f}_3d_only.stk", "stockholm", path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk"])
# Prepare the job
process = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE)
process = subprocess.Popen(shlex.split(f"esl-alipid --rna --noheader --informat stockholm {path_to_seq_data}realigned/{f}_3d_only.stk"),
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
id_matrix = np.zeros((len(names), len(names)))
pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", leave=False)
while process.poll() is None:
output = process.stdout.readline()
cnt = 0
while not cnt or process.poll() is None:
output = process.stdout.read()
if output:
lines = output.strip().split(b'\n')
for l in lines:
cnt += 1
line = l.split()
s1 = line[0].decode('utf-8')
s2 = line[1].decode('utf-8')
......@@ -460,9 +475,14 @@ def to_dist_matrix(f):
id2 = names.index(s2)
id_matrix[id1, id2] = float(score)
pbar.update(1)
if cnt != len(names)*(len(names)-1)*0.5:
warn(f"{f} got {cnt} updates out of {len(names)*(len(names)-1)*0.5}")
if process.poll() != 0:
l = process.stderr.read().strip().split(b'\n')
warn("\n".join([ line.decode('utf-8') for line in l ]))
pbar.close()
subprocess.run(["rm", "-f", f + "_3d_only.stk"])
subprocess.run(["rm", "-f", f + "_3d_only_tmp.stk"])
np.save("data/"+f+".npy", id_matrix)
idxQueue.put(thr_idx) # replace the thread index in the queue
return 0
......@@ -471,21 +491,26 @@ def seq_idty():
"""Computes identity matrices for each of the RNA families.
REQUIRES temporary results files in data/*.npy
REQUIRES tables chain, family un to date."""
REQUIRES tables chain, family up to date."""
# load distance matrices
fams_to_plot = [ f for f in famlist if f not in ignored ]
fam_arrays = []
for f in famlist:
for f in fams_to_plot:
if path.isfile("data/"+f+".npy"):
fam_arrays.append(np.load("data/"+f+".npy"))
fam_arrays.append(np.load("data/"+f+".npy") / 100.0) # normalize percentages in [0,1]
else:
fam_arrays.append([])
warn("data/"+f+".npy not found !")
fam_arrays.append(np.array([]))
# Update database with identity percentages
conn = sqlite3.connect("results/RNANet.db")
for f, D in zip(famlist, fam_arrays):
conn = sqlite3.connect(runDir + "/results/RNANet.db")
for f, D in zip(fams_to_plot, fam_arrays):
if not len(D): continue
a = 1.0 - np.average(D + D.T) # Get symmetric matrix instead of lower triangle + convert from distance matrix to identity matrix
if D.shape[0] > 1:
a = np.sum(D) * 2 / D.shape[0] / (D.shape[0] - 1) # SUM(D) / (n(n-1)/2)
else:
a = D[0][0]
conn.execute(f"UPDATE family SET idty_percent = {round(float(a),2)} WHERE rfam_acc = '{f}';")
conn.commit()
conn.close()
......@@ -495,10 +520,11 @@ def seq_idty():
axs = axs.ravel()
[axi.set_axis_off() for axi in axs]
im = "" # Just to declare the variable, it will be set in the loop
for f, D, ax in zip(famlist, fam_arrays, axs):
if not len(D): continue
for f, D, ax in zip(fams_to_plot, fam_arrays, axs):
D = D + D.T # Copy the lower triangle to upper, to get a symmetrical matrix
if D.shape[0] > 2: # Cluster only if there are more than 2 sequences to organize
D = D + D.T # Copy the lower triangle to upper, to get a symmetrical matrix
D = 1.0 - D
np.fill_diagonal(D, 0.0)
condensedD = squareform(D)
# Compute basic dendrogram by Ward's method
......@@ -507,15 +533,20 @@ def seq_idty():
# Reorganize rows and cols
idx1 = Z['leaves']
D = D[idx1,:]
D = D[idx1[::-1],:]
D = D[:,idx1[::-1]]
im = ax.matshow(1.0 - D, vmin=0, vmax=1, origin='lower') # convert to identity matrix 1 - D from distance matrix D
ax.set_title(f + "\n(" + str(len(mappings_list[f]))+ " chains)", fontsize=10)
D = 1.0 - D
elif D.shape[0] == 2:
np.fill_diagonal(D, 1.0) # the diagonal has been ignored until now
ax.text(np.floor(D.shape[0]/2.0)-(0.5 if not D.shape[0]%2 else 0), -0.5, f + "\n(" + str(D.shape[0]) + " chains)",
fontsize=9, horizontalalignment = 'center', verticalalignment='bottom')
im = ax.matshow(D, vmin=0, vmax=1)
fig.tight_layout()
fig.subplots_adjust(wspace=0.1, hspace=0.3)
fig.colorbar(im, ax=axs[-1], shrink=0.8)
fig.savefig(f"results/figures/distances.png")
notify("Computed all identity matrices and saved the figure.")
fig.subplots_adjust(hspace=0.3, wspace=0.1)
fig.colorbar(im, ax=axs[-4], shrink=0.8)
fig.savefig(runDir + f"/results/figures/distances.png")
print("> Computed all identity matrices and saved the figure.", flush=True)
def stats_pairs():
"""Counts occurrences of intra-chain base-pair types in RNA families
......@@ -523,6 +554,8 @@ def stats_pairs():
Creates a temporary results file in data/pair_counts.csv, and a results file in results/pairings.csv.
REQUIRES tables chain, nucleotide up-to-date."""
setproctitle(f"RNANet statistics.py stats_pairs()")
def line_format(family_data):
return family_data.apply(partial(format_percentage, sum(family_data)))
......@@ -530,12 +563,12 @@ def stats_pairs():
results = []
allpairs = []
for f in fam_list:
newpairs = pd.read_csv(f"data/{f}_pairs.csv", index_col=0)
fam_df = pd.read_csv(f"data/{f}_counts.csv", index_col=0)
newpairs = pd.read_csv(runDir + f"/data/{f}_pairs.csv", index_col=0)
fam_df = pd.read_csv(runDir + f"/data/{f}_counts.csv", index_col=0)
results.append(fam_df)
allpairs.append(newpairs)
subprocess.run(["rm", "-f", f"data/{f}_pairs.csv"])
subprocess.run(["rm", "-f", f"data/{f}_counts.csv"])
subprocess.run(["rm", "-f", runDir + f"/data/{f}_pairs.csv"])
subprocess.run(["rm", "-f", runDir + f"/data/{f}_counts.csv"])
all_pairs = pd.concat(allpairs)
df = pd.concat(results).fillna(0)
df.to_csv("data/pair_counts.csv")
......@@ -573,14 +606,14 @@ def stats_pairs():
crosstab = crosstab[["AU", "GC", "Wobble", "Other"]]
# Save to CSV
df.to_csv("results/pair_types.csv")
df.to_csv(runDir + "/results/pair_types.csv")
# Plot barplot of overall types
ax = crosstab.plot(figsize=(8,5), kind='bar', stacked=True, log=False, fontsize=13)
ax.set_ylabel("Number of observations (millions)", fontsize=13)
ax.set_xlabel(None)
plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99)
plt.savefig("results/figures/pairings.png")
plt.savefig(runDir + "/results/figures/pairings.png")
notify("Computed nucleotide statistics and saved CSV and PNG file.")
......@@ -588,8 +621,10 @@ def per_chain_stats():
"""Computes per-chain frequencies and base-pair type counts.
REQUIRES tables chain, nucleotide up to date. """
setproctitle(f"RNANet statistics.py per_chain_stats()")
with sqlite3.connect("results/RNANet.db", isolation_level=None) as conn:
with sqlite3.connect(runDir + "/results/RNANet.db", isolation_level=None) as conn:
# Compute per-chain nucleotide frequencies
df = pd.read_sql("SELECT SUM(is_A) as A, SUM(is_C) AS C, SUM(is_G) AS G, SUM(is_U) AS U, SUM(is_other) AS O, chain_id FROM nucleotide GROUP BY chain_id;", conn)
df["total"] = pd.Series(df.A + df.C + df.G + df.U + df.O, dtype=np.float64)
......@@ -600,35 +635,143 @@ def per_chain_stats():
conn.execute('pragma journal_mode=wal')
sql_execute(conn, "UPDATE chain SET chain_freq_A = ?, chain_freq_C = ?, chain_freq_G = ?, chain_freq_U = ?, chain_freq_other = ? WHERE chain_id= ?;",
many=True, data=list(df.to_records(index=False)), warn_every=10)
notify("Updated the database with per-chain base frequencies")
print("> Updated the database with per-chain base frequencies", flush=True)
def general_stats():
"""
Number of structures as function of the resolution threshold
Number of Rfam families as function of the resolution threshold
"""
with sqlite3.connect("results/RNANet.db") as conn:
df_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution
FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
WHERE rfam_acc = 'unmappd' AND ISSUE=0;""", conn)
df_mapped_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution
FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
WHERE rfam_acc != 'unmappd' AND ISSUE=0;""", conn)
df_mapped_copies = pd.read_sql(f"""SELECT pdb_id, chain_name, inferred, rfam_acc, pdb_start, pdb_end, exp_method, resolution
FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
WHERE rfam_acc != 'unmappd' AND ISSUE=0;""", conn)
df_inferred_only_unique = pd.read_sql(f"""SELECT DISTINCT pdb_id, c.chain_name, exp_method, resolution
FROM (SELECT inferred, rfam_acc, pdb_start, pdb_end, chain.structure_id, chain.chain_name, r.redundancy, r.inf_redundancy
FROM chain
JOIN (SELECT structure_id, chain_name, COUNT(distinct rfam_acc) AS redundancy, SUM(inferred) AS inf_redundancy
FROM chain
WHERE rfam_acc != 'unmappd' AND issue=0
GROUP BY structure_id, chain_name
) AS r ON chain.structure_id=r.structure_id AND chain.chain_name = r.chain_name
WHERE r.redundancy=r.inf_redundancy AND rfam_acc != 'unmappd' and issue=0
) AS c
JOIN structure ON c.structure_id=structure.pdb_id;""", conn)
print("> found", len(df_inferred_only_unique.index), "chains which are mapped only by inference using BGSU NR Lists.")
setproctitle(f"RNANet statistics.py general_stats()")
reqs = [
# unique unmapped chains with no issues
""" SELECT distinct pdb_id, chain_name, exp_method, resolution
FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
WHERE rfam_acc = 'unmappd' AND ISSUE=0;""",
# unique mapped chains with no issues
""" SELECT distinct pdb_id, chain_name, exp_method, resolution
FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
WHERE rfam_acc != 'unmappd' AND ISSUE=0;""",
# mapped chains with no issues
""" SELECT pdb_id, chain_name, inferred, rfam_acc, pdb_start, pdb_end, exp_method, resolution
FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
WHERE rfam_acc != 'unmappd' AND ISSUE=0;""",
# mapped chains with no issues that are all inferred
""" SELECT DISTINCT pdb_id, c.chain_name, exp_method, resolution
FROM (
SELECT inferred, rfam_acc, pdb_start, pdb_end, chain.structure_id, chain.chain_name, r.redundancy, r.inf_redundancy
FROM chain
JOIN (SELECT structure_id, chain_name, COUNT(distinct rfam_acc) AS redundancy, SUM(inferred) AS inf_redundancy
FROM chain
WHERE rfam_acc != 'unmappd' AND issue=0
GROUP BY structure_id, chain_name
) AS r ON chain.structure_id=r.structure_id AND chain.chain_name = r.chain_name
WHERE r.redundancy=r.inf_redundancy AND rfam_acc != 'unmappd' and issue=0
) AS c
JOIN structure ON c.structure_id=structure.pdb_id;""",
# Number of mapped chains (not inferred)
"""SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 0);""",
# Number of unique mapped chains (not inferred)
"""SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 0);""",
# Number of mapped chains (inferred)
"""SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 1);""",
# Number of unique mapped chains (inferred)
"""SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 1);""",
# Number of mapped chains inferred once
"""SELECT count(*) FROM (
SELECT structure_id, chain_name, COUNT(DISTINCT rfam_acc) as c
FROM chain where rfam_acc!='unmappd' and inferred=1
GROUP BY structure_id, chain_name
) WHERE c=1;""",
# Number of mapped chains inferred twice
"""select count(*) from (
select structure_id, chain_name, count(distinct rfam_acc) as c
from chain where rfam_acc!='unmappd' and inferred=1
group by structure_id, chain_name
) where c=2;""",
# Number of mapped chains inferred 3 times or more
"""select count(*) from (
select structure_id, chain_name, count(distinct rfam_acc) as c
from chain where rfam_acc!='unmappd' and inferred=1
group by structure_id, chain_name
) where c>2;""",
# Number of chains mapped both with and without inference
""" SELECT COUNT(*) FROM (
SELECT structure_id, chain_name, sum(inferred) AS s, COUNT(rfam_acc) AS c
FROM chain
WHERE rfam_acc!='unmappd'
GROUP BY structure_id, chain_name
)
WHERE s < c AND s > 0;""",
# Number of mapped chains (total)
"""SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd');""",
# Number of unique mapped chains
"""SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd');""",
# Number of unmapped chains
"""SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc = 'unmappd');""",
# Number of mapped chains without issues (not inferred)
"""SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 0 AND issue = 0);""",
# Number of unique mapped chains without issues (not inferred)
"""SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 0 AND issue = 0);""",
# Number of mapped chains without issues (inferred)
"""SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 1 AND issue=0);""",
# Number of unique mapped chains without issues (inferred)
"""SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND inferred = 1 AND issue=0);""",
# Number of mapped chains without issues (total)
"""SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND issue=0);""",
# Number of unique mapped chains without issues
"""SELECT count(*) FROM (SELECT DISTINCT structure_id, chain_name FROM chain WHERE rfam_acc != 'unmappd' AND issue=0);""",
# Number of unmapped chains without issues
"""SELECT count(*) FROM (SELECT structure_id, chain_name FROM chain WHERE rfam_acc = 'unmappd' AND issue=0);"""
]
answers = []
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
for r in reqs:
answers.append(pd.read_sql(r, conn))
df_unique = answers[0]
df_mapped_unique = answers[1]
df_mapped_copies = answers[2]
df_inferred_only_unique = answers[3]
print()
print("> found", answers[4].iloc[0][0], f"chains ({answers[5].iloc[0][0]} unique chains) that are mapped thanks to Rfam. Removing chains with issues, only {answers[15].iloc[0][0]} ({answers[16].iloc[0][0]} unique)")
if answers[4].iloc[0][0] != answers[5].iloc[0][0]:
print("\t> This happens because different parts of the same chain can be mapped to different families.")
print("> found", answers[6].iloc[0][0], f"chains ({answers[7].iloc[0][0]} unique chains) that are mapped by inference. Removing chains with issues, only {answers[17].iloc[0][0]} ({answers[18].iloc[0][0]} unique).")
print("\t> ", answers[8].iloc[0][0], "chains are mapped only once,")
print("\t> ", answers[9].iloc[0][0], "are mapped to 2 families,")
print("\t> ", answers[10].iloc[0][0], "are mapped to 3 or more.")
print("> Among them,", answers[11].iloc[0][0], "chains are mapped both with families found on Rfam and by inference.")
if answers[11].iloc[0][0]:
print("\t> this is normal if you used option -f (--full-inference). Otherwise, there might be a problem.")
print("> TOTAL:", answers[12].iloc[0][0], f"chains ({answers[13].iloc[0][0]} unique chains) mapped to a family. Removing chains with issues, only {answers[19].iloc[0][0]} ({answers[20].iloc[0][0]} unique).")
print("> TOTAL:", answers[14].iloc[0][0], f"unmapped chains. Removing chains with issues, {answers[21].iloc[0][0]}.")
if answers[14].iloc[0][0]:
print("\t> this is normal if you used option --no-homology. Otherwise, there might be a problem.")
print()
##########################################
# plot N = f(resolution, exp_method)
......@@ -642,7 +785,7 @@ def general_stats():
df_inferred_only_unique.sort_values('resolution', inplace=True, ignore_index=True)
df_mapped_copies.sort_values('resolution', inplace=True, ignore_index=True)
max_res = max(df_unique.resolution)
max_structs = len(df_mapped_copies.index.tolist())
max_structs = max(len(df_mapped_copies.index), len(df_unique.index))
colors = np.linspace(0,1,1+len(methods))
plt.xticks( np.arange(0, max_res+2, 2.0).tolist(), np.arange(0, max_res+2, 2.0).tolist() )
......@@ -654,7 +797,7 @@ def general_stats():
axs[0][0].set_ylabel("ALL", fontsize=14)
axs[0][0].set_title("Number of unique RNA chains", fontsize=14)
axs[0][0].set_ylim((0, max_structs * 1.05))
axs[0][0].legend(loc="best", fontsize=14)
axs[0][0].legend(loc="lower right", fontsize=14)
axs[0][1].grid(axis='y', ls='dotted', lw=1)
axs[0][1].set_yticklabels([])
......@@ -663,9 +806,9 @@ def general_stats():
axs[0][1].hist(df_inferred_only_unique.resolution, bins=np.arange(0, max_res, 0.5), fc=(0.2, 0, colors[0], 0.5), cumulative=True, label='only by inference')
axs[0][1].text(0.95*max_res, 0.95*len(df_mapped_unique.resolution), "%d " % len(df_mapped_unique.resolution),
horizontalalignment='right', verticalalignment='top', fontsize=14)
axs[0][1].set_title("Number of unique RNA chains\nmapped to $\geq 1$ family", fontsize=14)
axs[0][1].set_title(r"Number of unique RNA chains\nmapped to $\geq 1$ family", fontsize=14)
axs[0][1].set_ylim((0, max_structs * 1.05))
axs[0][1].legend(loc="best", fontsize=14)
axs[0][1].legend(loc="upper left", fontsize=14)
axs[0][2].grid(axis='y', ls='dotted', lw=1)
axs[0][2].set_yticklabels([])
......@@ -675,7 +818,7 @@ def general_stats():
axs[0][2].text(0.95*max_res, 0.95*len(df_mapped_copies.resolution), "%d " % len(df_mapped_copies.resolution),
horizontalalignment='right', verticalalignment='top', fontsize=14)
axs[0][2].set_title("Number of RNA chains mapped to a\nfamily (with copies)", fontsize=14)
axs[0][2].legend(loc="right", fontsize=14)
axs[0][2].legend(loc="upper left", fontsize=14)
axs[0][2].set_ylim((0, max_structs * 1.05))
for i,m in enumerate(methods):
......@@ -683,7 +826,7 @@ def general_stats():
df_mapped_unique_m = df_mapped_unique[df_mapped_unique.exp_method == m]
df_inferred_only_unique_m = df_inferred_only_unique[df_inferred_only_unique.exp_method == m]
df_mapped_copies_m = df_mapped_copies[ df_mapped_copies.exp_method == m]
max_structs = len(df_mapped_copies_m.resolution.tolist())
max_structs = max(len(df_mapped_copies_m.index), len(df_unique_m.index))
print("> found", max_structs, "structures with method", m, flush=True)
axs[1+i][0].grid(axis='y', ls='dotted', lw=1)
......@@ -693,7 +836,7 @@ def general_stats():
horizontalalignment='right', verticalalignment='top', fontsize=14)
axs[1+i][0].set_ylim((0, max_structs * 1.05))
axs[1+i][0].set_ylabel(m, fontsize=14)
axs[1+i][0].legend(loc="best", fontsize=14)
axs[1+i][0].legend(loc="lower right", fontsize=14)
axs[1+i][1].grid(axis='y', ls='dotted', lw=1)
axs[1+i][1].set_yticklabels([])
......@@ -703,7 +846,7 @@ def general_stats():
axs[1+i][1].text(0.95*max_res, 0.95*len(df_mapped_unique_m.resolution), "%d " % len(df_mapped_unique_m.resolution),
horizontalalignment='right', verticalalignment='top', fontsize=14)
axs[1+i][1].set_ylim((0, max_structs * 1.05))
axs[1+i][1].legend(loc="best", fontsize=14)
axs[1+i][1].legend(loc="upper left", fontsize=14)
axs[1+i][2].grid(axis='y', ls='dotted', lw=1)
axs[1+i][2].set_yticklabels([])
......@@ -713,7 +856,7 @@ def general_stats():
axs[1+i][2].text(0.95*max_res, 0.95*len(df_mapped_copies_m.resolution), "%d " % len(df_mapped_copies_m.resolution),
horizontalalignment='right', verticalalignment='top', fontsize=14)
axs[1+i][2].set_ylim((0, max_structs * 1.05))
axs[1+i][2].legend(loc="right", fontsize=14)
axs[1+i][2].legend(loc="upper left", fontsize=14)
axs[-1][0].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14)
axs[-1][1].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14)
......@@ -722,7 +865,7 @@ def general_stats():
fig.suptitle("Number of RNA chains by experimental method and resolution", fontsize=16)
fig.subplots_adjust(left=0.07, right=0.98, wspace=0.05,
hspace=0.05, bottom=0.05, top=0.92)
fig.savefig("results/figures/resolutions.png")
fig.savefig(runDir + "/results/figures/resolutions.png")
plt.close()
##########################################
......@@ -765,7 +908,7 @@ def general_stats():
fig.suptitle("Number of RNA families used by experimental method and resolution", fontsize=16)
fig.subplots_adjust(left=0.05, right=0.98, wspace=0.05,
hspace=0.05, bottom=0.12, top=0.84)
fig.savefig("results/figures/Nfamilies.png")
fig.savefig(runDir + "/results/figures/Nfamilies.png")
plt.close()
def log_to_pbar(pbar):
......@@ -776,8 +919,10 @@ def log_to_pbar(pbar):
if __name__ == "__main__":
# parse options
DELETE_OLD_DATA = False
DO_WADLEY_ANALYSIS = False
try:
opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "resolution=", "3d-folder=", "seq-folder=" ])
opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "from-scratch", "wadley", "resolution=", "3d-folder=", "seq-folder=" ])
except getopt.GetoptError as err:
print(err)
sys.exit(2)
......@@ -795,6 +940,7 @@ if __name__ == "__main__":
"\n\t\t\t\t\tdatapoints/\t\tFinal results in CSV file format.")
print("--seq-folder=…\t\t\tPath to a folder containing the sequence and alignment files. Required subfolder:"
"\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family")
print("--from-scratch\t\t\tDo not use precomputed results from past runs, recompute everything")
sys.exit()
elif opt == '--version':
print("RNANet statistics 1.1 beta")
......@@ -810,25 +956,37 @@ if __name__ == "__main__":
path_to_seq_data = path.abspath(arg)
if path_to_seq_data[-1] != '/':
path_to_seq_data += '/'
elif opt=='--from-scratch':
DELETE_OLD_DATA = True
DO_WADLEY_ANALYSIS = True
subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"])
elif opt=='--wadley':
DO_WADLEY_ANALYSIS = True
# Load mappings
print("Loading mappings list...")
with sqlite3.connect("results/RNANet.db") as conn:
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
fam_list = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;") ]
mappings_list = {}
for k in fam_list:
mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain WHERE rfam_acc='{k}' and issue=0;") ]
mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain JOIN structure ON chain.structure_id=structure.pdb_id WHERE rfam_acc='{k}' AND issue=0 AND resolution <= {res_thr};") ]
# List the families for which we will compute sequence identity matrices
with sqlite3.connect("results/RNANet.db") as conn:
famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains > 0 ORDER BY rfam_acc ASC;") ]
ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain GROUP BY rfam_acc) WHERE n_chains < 2 ORDER BY rfam_acc ASC;") ]
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain WHERE issue = 0 GROUP BY rfam_acc) WHERE n_chains > 0 ORDER BY rfam_acc ASC;") ]
ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain WHERE issue = 0 GROUP BY rfam_acc) WHERE n_chains < 3 ORDER BY rfam_acc ASC;") ]
n_unmapped_chains = sql_ask_database(conn, "SELECT COUNT(*) FROM chain WHERE rfam_acc='unmappd' AND issue=0;")[0][0]
if len(ignored):
print(f"Idty matrices: Ignoring {len(ignored)} families with fewer than 3 chains:", " ".join(ignored)+'\n')
if DELETE_OLD_DATA:
for f in fam_list:
subprocess.run(["rm","-f", runDir + f"/data/{f}.npy", runDir + f"/data/{f}_pairs.csv", runDir + f"/data/{f}_counts.csv"])
# Prepare the multiprocessing execution environment
nworkers = max(read_cpu_number()-1, 32)
nworkers = min(read_cpu_number()-1, 32)
thr_idx_mgr = Manager()
idxQueue = thr_idx_mgr.Queue()
for i in range(nworkers):
......@@ -836,14 +994,15 @@ if __name__ == "__main__":
# Define the tasks
joblist = []
# joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), 4.0))) # res threshold is 4.0 Angstroms by default
# joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), 4.0))) #
if n_unmapped_chains and DO_WADLEY_ANALYSIS:
joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), 20.0))) # res threshold 20.0 Angstroms here, i.e. all structures
joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), 20.0))) #
joblist.append(Job(function=stats_len)) # Computes figures
# joblist.append(Job(function=stats_freq)) # updates the database
# for f in famlist:
# joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database
# if f not in ignored:
# joblist.append(Job(function=to_dist_matrix, args=(f,))) # updates the database
joblist.append(Job(function=stats_freq)) # updates the database
for f in famlist:
joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database
if f not in ignored:
joblist.append(Job(function=to_dist_matrix, args=(f,))) # updates the database
p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, leave=True)
......@@ -867,7 +1026,8 @@ if __name__ == "__main__":
print()
# finish the work after the parallel portions
# per_chain_stats()
# seq_idty()
# stats_pairs()
general_stats()
per_chain_stats()
seq_idty()
stats_pairs()
if n_unmapped_chains:
general_stats()
......
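Once a RNAnet.py run has completed, the statistics can be recomputed with the option set parsed above. A minimal sketch, assuming the script is launched as statistics.py from the run directory (the one containing results/RNANet.db), with the same placeholder data paths as in the README:
```
cd ~/Projects/RNANet
python3 statistics.py -r 20.0 --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences/ --wadley
```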