Louis BECQUEY

delete unused cm_coords at updates

......@@ -1576,13 +1576,14 @@ class Pipeline:
subprocess.run(["tar", "-C", path_to_3D_data + "/datapoints", "-czf", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", "."])
subprocess.run(["ln", "-s", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", runDir + f"/archive/RNANET_datapoints_latest.tar.gz"])
# gather the alignments
os.makedirs(path_to_seq_data + "realigned/3d_only", exist_ok=True)
for f in os.listdir(path_to_seq_data + "realigned"):
if "3d_only.afa" in f:
subprocess.run(["cp", path_to_seq_data + "realigned/" + f, path_to_seq_data + "realigned/3d_only"])
subprocess.run(["rm", "-f", runDir + f"/archive/RNANET_alignments_latest.tar.gz"])
subprocess.run(["tar", "-C", path_to_seq_data + "realigned/3d_only" , "-czf", runDir + f"/archive/RNANET_alignments_latest.tar.gz", "."])
if self.HOMOLOGY:
# gather the alignments
os.makedirs(path_to_seq_data + "realigned/3d_only", exist_ok=True)
for f in os.listdir(path_to_seq_data + "realigned"):
if "3d_only.afa" in f:
subprocess.run(["cp", path_to_seq_data + "realigned/" + f, path_to_seq_data + "realigned/3d_only"])
subprocess.run(["rm", "-f", runDir + f"/archive/RNANET_alignments_latest.tar.gz"])
subprocess.run(["tar", "-C", path_to_seq_data + "realigned/3d_only" , "-czf", runDir + f"/archive/RNANET_alignments_latest.tar.gz", "."])
def sanitize_database(self):
"""Searches for issues in the database and correct them"""
......@@ -2699,7 +2700,15 @@ def work_pssm_remap(f):
cm_coords = [ None for x in range(ncols) ]
cm_2d = [ None for x in range(ncols) ]
data = [(f,j,i,cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + (consensus[j-1], cm_2d[j-1]) for i, j in enumerate(columns)]
# remove columns from the database if they are not supposed to be saved anymore
already_saved = sql_ask_database(conn, f"SELECT index_ali FROM align_column WHERE rfam_acc='{f}';")
already_saved = set([ x[0] for x in already_saved ])
to_remove = already_saved - columns_to_save
if len(to_remove):
sql_execute(conn, f"DELETE FROM align_column WHERE rfam_acc='{f}' AND index_ali = ?;", data=(sorted(to_remove),))
# Now store the columns
data = [(f,j,i+1,cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + (consensus[j-1], cm_2d[j-1]) for i, j in enumerate(columns)]
sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, index_small_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO
UPDATE SET index_small_ali=excluded.index_small_ali, cm_coord=excluded.cm_coord, freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U,
......@@ -2768,7 +2777,7 @@ def work_save(c, homology=True):
conn.execute('pragma journal_mode=wal')
if homology:
df = pd.read_sql_query(f"""
SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, cm_coord, index_small_ali,
SELECT index_chain, cm_coord, index_small_ali, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code,
is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other,
gap_percent, consensus, cons_sec_struct, dbn, paired, nb_interact, pair_type_LW, pair_type_DSSR,
alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, chi, bb_type, glyco_bond, form, ssZp, Dp,
......
......@@ -84,3 +84,1629 @@
7a5p_1_2_259-449
6uz7_1_8_2140-2825
4v5z_1_AA_1-1563
6cfj_1_1X
6cfj_1_2X
5hcq_1_1X
6cae_1_1X
5hcq_1_2X
5hcr_1_1X
4z8c_1_1X
5j4b_1_1X
6xhy_1_1X
6xhy_1_2X
5j4b_1_2X
4z8c_1_2X
6cae_1_2X
5j4c_1_1X
5w4k_1_1X
6of1_1_1X
6xhw_1_1X
5hcr_1_2X
5hd1_1_1X
5hcp_1_1X
6of1_1_2X
5hau_1_1W
5j4c_1_2X
5wis_1_1X
6xhv_1_1X
6xqd_1_1X
6nd5_1_1X
5w4k_1_2X
6xhw_1_2X
5hau_1_2W
6xqd_1_2X
6xhv_1_2X
4y4p_1_1X
6o97_1_1X
5hcp_1_2X
5doy_1_1X
4zer_1_1X
5wit_1_1X
5hd1_1_2X
6nd5_1_2X
4z3s_1_1X
7jql_1_1X
7jqm_1_1X
7jql_1_2X
5wis_1_2X
6nd6_1_1X
6o97_1_2X
4y4p_1_2X
7jqm_1_2X
4z3s_1_2X
4zer_1_2X
6uo1_1_2X
6uo1_1_1X
5doy_1_2X
5wit_1_2X
5f8k_1_1X
6nd6_1_2X
6xqe_1_1X
6xqe_1_2X
6n9e_1_1X
6n9e_1_2X
6n9f_1_1X
5f8k_1_2X
6n9f_1_2X
6xz7_1_F
6xzb_1_F2
6xza_1_F2
6y69_1_W
5afi_1_V
5afi_1_W
6h4n_1_W
5wdt_1_V
5wfs_1_V
5wdt_1_W
5wfs_1_W
5we4_1_V
5we4_1_W
5uq8_1_Y
6c4i_1_Y
6c4i_1_X
6yef_1_X
5zeb_1_V
5zep_1_W
5lzd_1_V
5we6_1_V
5wfk_1_V
5wfk_1_W
5we6_1_W
5u4i_1_Y
5uq7_1_Y
5u4i_1_X
5lza_1_V
5wf0_1_V
5wf0_1_W
5zeu_1_V
5l3p_1_X
3jcj_1_V
6gxm_1_X
6gwt_1_X
6gxn_1_X
6gxo_1_X
3j9y_1_V
6o9k_1_Y
6o7k_1_V
5lzf_1_V
3jcn_1_V
5lzc_1_V
5u4j_1_X
5u4j_1_Z
5lzb_1_V
6h58_1_W
6h58_1_WW
1eg0_1_O
5j8b_1_X
4v7j_1_AV
4v7j_1_BV
4v7k_1_BV
4v7k_1_AV
4v7k_1_BW
4v7k_1_AW
4v7j_1_AW
4v7j_1_BW
4v4j_1_Z
6i0v_1_B
5k77_1_X
5k77_1_V
5k77_1_Y
5k77_1_W
5k77_1_Z
4pei_1_X
4pei_1_V
4pei_1_W
4pei_1_Z
4pei_1_Y
4a3c_1_P
4a3e_1_P
6lkq_1_U
7k00_1_B
6ys3_1_A
6qdw_1_A
5zzm_1_M
2rdo_1_A
4v48_1_A9
4v47_1_A9
6hcj_1_Q3
6hcq_1_Q3
6o8w_1_U
5mmm_1_Z
4w2e_1_W
5j4b_1_1Y
6cfj_1_1W
5w4k_1_1Y
6xhy_1_1W
5wit_1_1W
6cfj_1_1Y
6cfj_1_2W
5j4c_1_1W
5wis_1_1Y
5j4c_1_1Y
6xhw_1_1W
6cfj_1_2Y
5wis_1_1W
5j4b_1_1W
6xhv_1_1W
6xhy_1_2W
5j4c_1_2W
5j4b_1_2W
5j4b_1_2Y
5j4c_1_2Y
5w4k_1_1W
6nd5_1_1Y
6xhw_1_2W
5wis_1_2Y
5wit_1_2W
6xhv_1_2W
5doy_1_1Y
5w4k_1_2Y
4y4p_1_1Y
4z3s_1_1Y
5doy_1_1W
5doy_1_2Y
6nd5_1_1W
4z3s_1_2Y
4z3s_1_1W
5w4k_1_2W
6nd5_1_2Y
4y4p_1_2Y
6uo1_1_2Y
6uo1_1_2W
4y4p_1_1W
4z3s_1_2W
6uo1_1_1Y
6xhy_1_1Y
6uo1_1_1W
5wis_1_2W
5wit_1_1Y
6nd5_1_2W
4y4p_1_2W
5doy_1_2W
5wit_1_2Y
6xhv_1_1Y
6xhy_1_2Y
6xhw_1_1Y
6xhw_1_2Y
6ucq_1_1Y
6xhv_1_2Y
4v4i_1_Z
6ucq_1_1X
6ucq_1_2Y
4w2e_1_X
6ucq_1_2X
6yss_1_W
5afi_1_Y
5uq8_1_Z
5wdt_1_Y
5wfs_1_Y
6ysr_1_W
5we4_1_Y
6yst_1_W
5uq7_1_Z
5we6_1_Y
5wfk_1_Y
5wf0_1_Y
6o9j_1_V
6ysu_1_W
3j46_1_A
5j8b_1_Y
5j8b_1_W
3bbv_1_Z
5aj0_1_BV
5aj0_1_BW
4wt8_1_AB
4wt8_1_BB
4v4j_1_Y
4v4i_1_Y
5uq8_1_X
5uq7_1_X
1jgq_1_A
4v42_1_AA
1jgo_1_A
1jgp_1_A
4v4j_1_W
4v4i_1_W
4v42_1_BA
4wt8_1_CS
4wt8_1_DS
4v4j_1_X
4v4i_1_X
4v42_1_BB
6d30_1_C
6j7z_1_C
3er9_1_D
5kal_1_Y
4nia_1_3
5kal_1_Z
4nia_1_7
4nia_1_4
5new_1_C
4nia_1_U
4nia_1_6
4oq9_1_7
4nia_1_1
4oq9_1_4
4nia_1_8
4oq9_1_8
4nia_1_5
2vrt_1_E
4nia_1_W
4oq9_1_6
4oq8_1_D
4nia_1_Z
4oq9_1_W
4oq9_1_5
4nia_1_2
2vrt_1_F
4oq9_1_U
4oq9_1_Z
4oq9_1_2
4oq9_1_3
1ddl_1_E
4oq9_1_1
6rt5_1_A
6rt5_1_E
4qu6_1_B
6lkq_1_T
6ys3_1_B
6qdw_1_B
3jbv_1_B
3jbu_1_B
5zzm_1_N
2rdo_1_B
3dg2_1_B
3dg0_1_B
4v48_1_A0
4v47_1_A0
3dg4_1_B
3dg5_1_B
6do8_1_B
6dpi_1_B
6dp9_1_B
6dpb_1_B
6dmn_1_B
6dpp_1_B
6dpk_1_B
6dpd_1_B
6dot_1_B
6dok_1_B
6dp8_1_B
6dpl_1_B
6dpg_1_B
6dou_1_B
6dpc_1_B
6do9_1_B
6dmv_1_B
6dp4_1_B
6dpn_1_B
6doj_1_B
6dph_1_B
6dos_1_B
6doo_1_B
6dp6_1_B
6dox_1_B
6dp5_1_B
6dol_1_B
6dp1_1_B
6doz_1_B
6dp7_1_B
6doq_1_B
6dpa_1_B
6dom_1_B
6dog_1_B
6dop_1_B
6doh_1_B
6doa_1_B
6don_1_B
6dov_1_B
6dpo_1_B
6dod_1_B
6dob_1_B
6dow_1_B
6dpm_1_B
6dpf_1_B
6dp3_1_B
6dp2_1_B
6dpe_1_B
6dpj_1_B
6dor_1_B
6dof_1_B
6dp0_1_B
6doi_1_B
6doc_1_B
6doe_1_B
6n6g_1_D
6lkq_1_S
5h5u_1_H
7d6z_1_F
5lze_1_Y
5lze_1_V
5lze_1_X
3jcj_1_G
6o7k_1_G
3dg2_1_A
3dg0_1_A
4v48_1_BA
4v47_1_BA
3dg4_1_A
3dg5_1_A
4b3r_1_W
4b3t_1_W
4b3s_1_W
5o2r_1_X
5kcs_1_1X
6zvk_1_E2
6zvk_1_H2
7a01_1_E2
7a01_1_H2
6fti_1_U
6fti_1_W
6ftj_1_U
6ftj_1_W
6ftg_1_U
6ftg_1_W
6x1b_1_D
6x1b_1_F
5f6c_1_C
6i0t_1_B
1b2m_1_C
1b2m_1_D
1b2m_1_E
2uxc_1_Y
4a3g_1_P
4a3j_1_P
7k00_1_5
5mmi_1_Z
3j9m_1_U
7a5k_1_U3
6nu2_1_U
7a5g_1_U3
6nu3_1_U
5c0y_1_C
6n6f_1_D
4ohy_1_B
4oi1_1_B
4oi0_1_B
5ipl_1_3
6utw_1_333
5ipm_1_3
5ipn_1_3
4ylo_1_3
4yln_1_6
4ylo_1_6
4yln_1_3
4yln_1_9
5lzf_1_Y
1n32_1_Z
5zsl_1_D
5zsd_1_C
5zsd_1_D
5zsl_1_E
4nku_1_D
4nku_1_H
1cwp_1_E
6thn_1_A
6qik_1_Y
6rzz_1_Y
6ri5_1_Y
6qt0_1_Y
6qtz_1_Y
6t83_1_1B
6t83_1_3B
6t83_1_AA
6t83_1_CA
6s05_1_Y
5jcs_1_X
5fl8_1_X
6ole_1_V
6om0_1_V
6oli_1_V
6om7_1_V
6w6l_1_V
6olf_1_V
3erc_1_G
6of1_1_1W
6cae_1_1Y
6o97_1_1W
6of1_1_1Y
6of1_1_2W
6o97_1_1Y
6nd6_1_1Y
6cae_1_1W
6of1_1_2Y
6cae_1_2Y
6nd6_1_1W
6cae_1_2W
6o97_1_2Y
6nd6_1_2Y
6o97_1_2W
6nd6_1_2W
4wtm_1_T
4wtm_1_P
6gz4_1_BW
6xz7_1_G
6xzb_1_G2
6gz5_1_BW
6gz3_1_BW
1qzb_1_B
1qza_1_B
1ls2_1_B
3ep2_1_Y
3eq3_1_Y
4v48_1_A6
2z9q_1_A
4hot_1_X
6d2z_1_C
4tu0_1_F
4tu0_1_G
6r9o_1_B
6is0_1_C
5lzc_1_X
5lzb_1_X
5lzd_1_Y
5lzc_1_Y
5lzb_1_Y
6zvi_1_E
6sv4_1_MC
6sv4_1_MB
7nrd_1_SM
6i7o_1_MB
1gsg_1_T
6zvi_1_D
6sv4_1_NB
6sv4_1_NC
6i7o_1_NB
1ml5_1_A
6swa_1_Q
6swa_1_R
3j6x_1_IR
3j6y_1_IR
6ole_1_T
6om0_1_T
6oli_1_T
6om7_1_T
6olf_1_T
6w6l_1_T
6tb3_1_N
7b7d_1_SM
7b7d_1_SN
6tnu_1_N
7nrd_1_SN
6zot_1_C
2uxb_1_X
2x1f_1_B
2x1a_1_B
3ep2_1_D
3eq3_1_D
1eg0_1_M
3eq4_1_D
5o1y_1_B
3jcr_1_H
6dzi_1_H
5zeu_1_A
6evj_1_N
6evj_1_M
6wub_1_A
6wua_1_A
6mpi_1_W
5mfx_1_B
5w0m_1_J
5bud_1_E
5w0m_1_I
5w0m_1_H
4j7m_1_B
5bud_1_D
6a4e_1_B
6a4e_1_D
6hxx_1_AA
6hxx_1_AB
6hxx_1_AC
6hxx_1_AD
6hxx_1_AE
6hxx_1_AF
6hxx_1_AG
6hxx_1_AH
6hxx_1_AI
6hxx_1_AJ
6hxx_1_AK
6hxx_1_AL
6hxx_1_AM
6hxx_1_AN
6hxx_1_AO
6hxx_1_AP
6hxx_1_AQ
6hxx_1_AR
6hxx_1_AS
6hxx_1_AT
6hxx_1_AU
6hxx_1_AV
6hxx_1_AW
6hxx_1_AX
6hxx_1_AY
6hxx_1_AZ
6hxx_1_BA
6hxx_1_BB
6hxx_1_BC
6hxx_1_BD
6hxx_1_BE
6hxx_1_BF
6hxx_1_BG
6hxx_1_BH
6hxx_1_BI
5odv_1_A
5odv_1_B
5odv_1_C
5odv_1_D
5odv_1_E
5odv_1_F
5odv_1_G
5odv_1_H
5odv_1_I
5odv_1_J
5odv_1_K
5odv_1_L
5odv_1_M
5odv_1_N
5odv_1_O
5odv_1_P
5odv_1_Q
5odv_1_R
5odv_1_S
5odv_1_T
5odv_1_U
5odv_1_V
5odv_1_W
5odv_1_X
6t34_1_A
6t34_1_B
6t34_1_C
6t34_1_D
6t34_1_E
6t34_1_F
6t34_1_G
6t34_1_H
6t34_1_I
6t34_1_J
6t34_1_K
6t34_1_L
6t34_1_M
6t34_1_N
6t34_1_O
6t34_1_P
6t34_1_Q
6t34_1_R
6t34_1_S
6ip8_1_ZY
6ip5_1_ZY
6ip5_1_ZU
6ip6_1_ZY
6ip8_1_ZZ
6ip6_1_ZZ
6uu3_1_333
6uu1_1_333
1pn8_1_D
3er8_1_H
3er8_1_G
3er8_1_F
5o3j_1_B
4dr7_1_B
1i5l_1_Y
1i5l_1_U
4dr6_1_B
6i2n_1_U
4v68_1_A0
6vyu_1_Y
6vyw_1_Y
6vz7_1_Y
6vz5_1_Y
6vz3_1_Y
6vyy_1_Y
6vyx_1_Y
6vyz_1_Y
6vz2_1_Y
1mvr_1_1
6vyt_1_Y
1cgm_1_I
3jb7_1_T
3jb7_1_M
3j0o_1_D
3j0l_1_D
3j0q_1_D
3j0p_1_D
2tmv_1_R
5a79_1_R
5a7a_1_R
2om3_1_R
2xea_1_R
4v7e_1_AA
4v7e_1_AC
4wtl_1_T
4wtl_1_P
1xnq_1_W
1x18_1_C
1x18_1_B
1x18_1_D
1vq6_1_4
4am3_1_D
4am3_1_H
4am3_1_I
4lj0_1_C
4lj0_1_D
4lj0_1_E
5lzy_1_HH
4wtj_1_T
4wtj_1_P
4xbf_1_D
6n6d_1_D
6n6k_1_C
6n6k_1_D
3rtj_1_D
6ty9_1_M
6tz1_1_N
6sce_1_B
6xl1_1_C
6scf_1_I
6scf_1_K
6yud_1_K
6yud_1_O
6scf_1_M
6yud_1_P
6scf_1_L
6yud_1_M
6yud_1_Q
6w11_1_C
6o6x_1_D
4ba2_1_R
7bdv_1_F
7bdv_1_H
6o6x_1_C
7did_1_C
6o7b_1_C
6o6v_1_C
6wxx_1_Y
6wxx_1_X
6r7b_1_D
6r9r_1_D
6ov0_1_E
6ov0_1_H
6ov0_1_G
6o6v_1_D
6ov0_1_F
6o7b_1_D
5e02_1_C
6r9r_1_E
6r7b_1_E
6o7i_1_I
6o7h_1_K
7l6t_1_C
7jyy_1_F
7jyy_1_E
7jz0_1_F
7jz0_1_E
6rt6_1_A
6rt6_1_E
1y1y_1_P
5zuu_1_I
5zuu_1_G
4peh_1_W
4peh_1_V
4peh_1_X
4peh_1_Y
4peh_1_Z
6mkn_1_W
7kl3_1_B
4cxg_1_C
4cxh_1_C
1x1l_1_A
1zc8_1_Z
2ob7_1_D
2ob7_1_A
4eya_1_E
4eya_1_F
4eya_1_Q
4eya_1_R
1qzc_1_B
1t1o_1_B
1mvr_1_C
1t1m_1_B
1t1o_1_C
1t1m_1_A
1t1o_1_A
2r1g_1_B
4ht9_1_E
6z1p_1_AB
6z1p_1_AA
4ii9_1_C
5mq0_1_3
5uk4_1_X
5uk4_1_V
5uk4_1_W
5uk4_1_U
5f6c_1_E
4rcj_1_B
1xnr_1_W
2agn_1_A
2agn_1_C
2agn_1_B
6e0o_1_C
6o75_1_D
6o75_1_C
6e0o_1_B
3j06_1_R
1r2x_1_C
1r2w_1_C
1eg0_1_L
4eya_1_G
4eya_1_H
4eya_1_S
4eya_1_T
4dr4_1_V
1ibl_1_Z
1ibm_1_Z
4dr5_1_V
4d61_1_J
1trj_1_B
1trj_1_C
5tbw_1_SR
6hhq_1_SR
6zvi_1_H
6sv4_1_2B
6sv4_1_2C
6t83_1_2B
6t83_1_A
6i7o_1_2B
6q8y_1_N
6sv4_1_N
6i7o_1_N
6swa_1_S
5k8h_1_A
5z4a_1_B
3jbu_1_V
1h2c_1_R
1h2d_1_S
1h2d_1_R
6szs_1_X
5mgp_1_X
6enu_1_X
6enf_1_X
6enj_1_X
1pvo_1_L
1pvo_1_G
1pvo_1_H
1pvo_1_J
1pvo_1_K
2ht1_1_K
2ht1_1_J
5sze_1_C
6wre_1_D
6i0u_1_B
5zsa_1_C
5zsa_1_D
1n34_1_Z
3pf5_1_S
6ppn_1_A
6ppn_1_I
5flx_1_Z
6eri_1_AX
7d80_1_Y
1zc8_1_A
1zc8_1_C
1zc8_1_B
1zc8_1_G
1zc8_1_I
1zc8_1_H
1zc8_1_J
7du2_1_R
4v8z_1_CX
6kqe_1_I
5uh8_1_I
5vi5_1_Q
4xln_1_T
4xlr_1_T
4xln_1_Q
5i2d_1_K
5i2d_1_V
4xlr_1_Q
6sty_1_C
6sty_1_F
2xs5_1_D
3ok4_1_N
3ok4_1_L
3ok4_1_Z
3ok4_1_4
3ok4_1_V
3ok4_1_X
3ok4_1_P
3ok4_1_H
3ok4_1_J
3ok4_1_R
3ok4_1_T
3ok4_1_2
6n6h_1_D
5wnt_1_B
3b0u_1_B
3b0u_1_A
4x9e_1_G
4x9e_1_H
6z1p_1_BB
6z1p_1_BA
2uxd_1_X
6ywe_1_BB
3ol9_1_D
3ol9_1_H
3ol9_1_L
3ol9_1_P
3olb_1_L
3olb_1_P
3olb_1_D
3olb_1_H
3ol6_1_D
3ol6_1_H
3ol6_1_L
3ol6_1_P
3ol8_1_D
3ol8_1_H
3ol7_1_L
3ol7_1_P
3ol7_1_D
3ol7_1_H
3ol8_1_L
3ol8_1_P
1qzc_1_C
1qzc_1_A
6yrq_1_E
6yrq_1_H
6yrq_1_G
6yrq_1_F
6yrb_1_C
6yrb_1_D
1mvr_1_D
6gz5_1_BV
6gz4_1_BV
6gz3_1_BV
6fti_1_Q
4v7e_1_AB
4v7e_1_AE
4v7e_1_AD
4x62_1_B
4x64_1_B
4x65_1_B
1xmq_1_W
4x66_1_B
3t1h_1_W
3t1y_1_W
1xmo_1_W
4adx_1_9
6kr6_1_B
1zn1_1_B
6z8k_1_X
4csf_1_U
4csf_1_Q
4csf_1_G
4csf_1_M
4csf_1_K
4csf_1_A
4csf_1_I
4csf_1_S
4csf_1_C
4csf_1_W
4csf_1_O
4csf_1_E
6ywx_1_BB
6th6_1_AA
6skg_1_AA
6skf_1_AA
6q8y_1_M
6i7o_1_M
6zmw_1_W
6ybv_1_W
2fz2_1_D
2xpj_1_D
2vrt_1_H
2vrt_1_G
1emi_1_B
6r9m_1_B
4nia_1_C
4nia_1_A
4nia_1_H
4nia_1_N
4nia_1_G
4nia_1_D
4nia_1_B
4nia_1_I
4nia_1_E
4nia_1_M
4oq9_1_I
4oq9_1_G
4oq9_1_C
4oq9_1_H
4oq9_1_N
4oq9_1_A
4oq9_1_D
4oq9_1_E
4oq9_1_M
4oq9_1_B
5uhc_1_I
1uvn_1_F
1uvn_1_B
1uvn_1_D
3iy9_1_A
4wtk_1_T
4wtk_1_P
1vqn_1_4
4oav_1_C
4oav_1_A
3ep2_1_E
3eq3_1_E
3eq4_1_E
3ep2_1_A
3eq3_1_A
3eq4_1_A
3ep2_1_C
3eq3_1_C
3eq4_1_C
3ep2_1_B
3eq3_1_B
3eq4_1_B
4i67_1_B
3pgw_1_R
3pgw_1_N
3cw1_1_X
3cw1_1_W
3cw1_1_V
7b0y_1_A
6k32_1_T
6k32_1_P
5mmj_1_A
5x8r_1_A
2agn_1_E
2agn_1_D
4v5z_1_BD
6yw5_1_AA
6ywe_1_AA
6ywy_1_AA
6ywx_1_AA
3nvk_1_G
3nvk_1_S
2iy3_1_B
1cwp_1_F
5z4j_1_B
5gmf_1_E
5gmf_1_H
6e4p_1_J
5gmf_1_F
5gmf_1_G
5gmg_1_D
5gmg_1_C
6e4p_1_K
3ie1_1_E
3ie1_1_H
3ie1_1_F
4dr7_1_V
3ie1_1_G
3s4g_1_C
3s4g_1_B
2qqp_1_R
1nb7_1_E
1nb7_1_F
4hos_1_X
3p6y_1_T
3p6y_1_V
3p6y_1_U
3p6y_1_Q
3p6y_1_W
5dto_1_B
4cxh_1_X
1uvj_1_F
1uvj_1_D
1uvj_1_E
6kqd_1_I
6kqd_1_S
5uh5_1_I
1ytu_1_F
1ytu_1_D
4kzz_1_J
7a09_1_F
5t2c_1_AN
4v5z_1_BF
3j6b_1_E
4v4f_1_B6
4v4f_1_A5
4v4f_1_A3
4v4f_1_B0
4v4f_1_B9
4v4f_1_A2
4v4f_1_A8
4v4f_1_A1
4v4f_1_A9
4v4f_1_BZ
4v4f_1_B8
4v4f_1_B7
4v4f_1_B5
4v4f_1_A0
4v4f_1_A7
4v4f_1_A4
4v4f_1_AZ
4v4f_1_B3
4v4f_1_B1
4v4f_1_B4
4v4f_1_A6
4v4f_1_B2
5it9_1_I
7jqc_1_I
5zsb_1_C
5zsb_1_D
5zsn_1_D
5zsn_1_E
1cwp_1_D
3jcr_1_N
6gfw_1_R
2vaz_1_A
6zm6_1_X
6zm5_1_X
6zm6_1_W
6zm5_1_W
4v5z_1_BP
6n6e_1_D
4g7o_1_I
4g7o_1_S
5x22_1_S
5x22_1_I
5x21_1_I
5uh6_1_I
6l74_1_I
5uh9_1_I
2ftc_1_R
7a5j_1_X
6sag_1_R
4udv_1_R
2r1g_1_E
5zsc_1_D
5zsc_1_C
6woy_1_I
6wox_1_I
4gkk_1_W
4v9e_1_AG
4v9e_1_BM
4v9e_1_AM
4v9e_1_AA
4v9e_1_BA
4v9e_1_BG
5lzs_1_II
6fqr_1_C
6ha1_1_X
5kcr_1_1X
6uu4_1_333
6uu0_1_333
6uuc_1_333
6uu2_1_333
6xl9_1_R
6b6h_1_3
6xh8_1_3
6pb4_1_3
3m7n_1_Z
3m85_1_X
3m85_1_Z
3m85_1_Y
1e8s_1_C
5wnp_1_B
5wnv_1_B
5yts_1_B
1utd_1_6
1utd_1_Z
1utd_1_4
1utd_1_7
1utd_1_9
1utd_1_5
1utd_1_3
1utd_1_2
1utd_1_8
1utd_1_1
6n6i_1_C
6n6i_1_D
6n6a_1_D
6ij2_1_F
6ij2_1_G
6ij2_1_H
6ij2_1_E
3u2e_1_D
3u2e_1_C
5uef_1_C
5uef_1_D
4x4u_1_H
4afy_1_D
6oy5_1_I
6owl_1_B
6owl_1_C
4afy_1_C
4lq3_1_R
6s0m_1_C
6ymw_1_C
7a5g_1_J
6gx6_1_B
4k4s_1_D
4k4s_1_H
4k4t_1_H
4k4t_1_D
1zn1_1_C
1zn0_1_C
1xpu_1_G
1xpu_1_L
1xpr_1_L
1xpu_1_H
1xpo_1_K
1xpo_1_J
1xpu_1_J
1xpo_1_H
1xpr_1_J
1xpu_1_K
1xpr_1_K
1xpo_1_M
1xpo_1_L
1xpu_1_M
1xpr_1_M
1xpo_1_G
1xpr_1_H
1xpr_1_G
5x70_1_E
5x70_1_G
6gc5_1_F
6gc5_1_H
6gc5_1_G
1n1h_1_B
4ohz_1_B
6t83_1_6B
4gv6_1_C
4gv6_1_B
4gv3_1_C
4gv3_1_B
4gv9_1_E
6i7o_1_L
2a8v_1_D
6qx3_1_G
2xnr_1_C
4gkj_1_W
4v5z_1_BC
5y88_1_X
4v5z_1_BB
3j0o_1_H
3j0l_1_H
3j0p_1_H
3j0q_1_H
4v5z_1_BH
3j0o_1_F
3j0l_1_F
3j0p_1_F
3j0q_1_F
3j0o_1_B
3j0l_1_B
3j0o_1_C
3j0l_1_C
3j0q_1_C
3j0p_1_C
3j0o_1_A
3j0l_1_A
3j0q_1_A
3j0p_1_A
4v5z_1_BJ
6ys3_1_V
6qdw_1_V
5hk0_1_F
4qm6_1_D
4qm6_1_C
4jzu_1_C
4jzv_1_C
5ytv_1_B
4k4z_1_P
4k4z_1_D
4k4x_1_L
4k4z_1_L
4k4x_1_D
4k4z_1_H
4k4x_1_H
4k4x_1_P
4a3b_1_P
4a3m_1_P
6u6y_1_E
6u6y_1_G
6u6y_1_F
6u6y_1_H
6qik_1_X
6rzz_1_X
6ri5_1_X
6qt0_1_X
6qtz_1_X
6s05_1_X
6t83_1_BB
6t83_1_4B
5fl8_1_Z
5jcs_1_Z
5mrc_1_BB
5mre_1_BB
5mrf_1_BB
4v5z_1_BN
3j46_1_P
3jcr_1_M
4e6b_1_A
4e6b_1_B
6a6l_1_D
4v5z_1_BS
4v8t_1_1
1uvi_1_D
1uvi_1_F
1uvi_1_E
4m7d_1_P
4k4u_1_D
4k4u_1_H
6rt7_1_E
6rt7_1_A
2voo_1_C
2voo_1_D
5k78_1_X
5k78_1_Y
4ylo_1_9
5vyc_1_I2
5vyc_1_I3
5vyc_1_I5
5vyc_1_I1
5vyc_1_I6
5vyc_1_I4
6ip8_1_2M
6ip5_1_2M
6ip6_1_2M
6qcs_1_M
486d_1_G
2r1g_1_C
486d_1_F
4v5z_1_B0
4nia_1_O
4nia_1_J
4nia_1_K
4nia_1_L
4nia_1_F
4oq9_1_K
4oq9_1_O
4oq9_1_J
4oq9_1_F
4oq9_1_L
6r9q_1_B
6v3a_1_SN1
6v3b_1_SN1
6v39_1_SN1
6v3e_1_SN1
1pn7_1_C
1mj1_1_Q
1mj1_1_R
4dr6_1_V
6kql_1_I
4eya_1_M
4eya_1_N
4eya_1_A
4eya_1_B
2wj8_1_D
2wj8_1_I
2wj8_1_L
2wj8_1_F
2wj8_1_C
2wj8_1_Q
2wj8_1_J
2wj8_1_P
2wj8_1_K
2wj8_1_E
2wj8_1_T
2wj8_1_B
2wj8_1_O
2wj8_1_N
2wj8_1_A
2wj8_1_H
2wj8_1_R
2wj8_1_M
2wj8_1_S
2wj8_1_G
4e6b_1_E
4e6b_1_F
6p71_1_I
3pdm_1_R
5det_1_P
5els_1_I
4n2s_1_B
5fl8_1_Y
5jcs_1_Y
4yoe_1_E
6ow3_1_I
6ovy_1_I
6oy6_1_I
4bbl_1_Y
4bbl_1_Z
4qvd_1_H
5gxi_1_B
3iy8_1_A
6tnu_1_M
5mc6_1_M
5mc6_1_N
4eya_1_O
4eya_1_P
4eya_1_C
4eya_1_D
6htq_1_V
6htq_1_W
6htq_1_U
6uu6_1_333
6v3a_1_V
6v39_1_V
5a0v_1_F
3avt_1_T
6d1v_1_C
4s2x_1_B
4s2y_1_B
5wnu_1_B
1zc8_1_F
1vtm_1_R
4v5z_1_BA
4v5z_1_BE
4v5z_1_BG
4v5z_1_BI
4v5z_1_BK
4v5z_1_BM
4v5z_1_BL
4v5z_1_BV
4v5z_1_BO
4v5z_1_BQ
4v5z_1_BR
4v5z_1_BT
4v5z_1_BU
4v5z_1_BW
4v5z_1_BY
4v5z_1_BX
4v5z_1_BZ
5elt_1_F
5elt_1_E
6xlj_1_R
6u9x_1_H
6u9x_1_K
5elk_1_R
6okk_1_G
4cxg_1_A
4cxh_1_A
6bk8_1_I
4cxg_1_B
4cxh_1_B
4v5z_1_B1
5z4d_1_B
6o78_1_E
6xa1_1_BV
6ha8_1_X
1m8w_1_E
1m8w_1_F
5udi_1_B
5udl_1_B
5udk_1_B
5udj_1_B
5w5i_1_B
5w5i_1_D
5w5h_1_B
5w5h_1_D
4eya_1_K
4eya_1_L
4eya_1_I
4eya_1_J
4g9z_1_E
4g9z_1_F
3nma_1_B
3nma_1_C
6een_1_G
6een_1_I
6een_1_H
4wti_1_T
4wti_1_P
5l3p_1_Y
4hor_1_X
3rzo_1_R
2f4v_1_Z
1qln_1_R
6ogy_1_M
6ogy_1_N
6uej_1_B
6ywy_1_BB
1x18_1_A
5ytx_1_B
4g0a_1_H
6r9p_1_B
3koa_1_C
4n48_1_D
4n48_1_G
6kug_1_B
6ktc_1_V
6ole_1_U
6om0_1_U
6olg_1_BV
6oli_1_U
6om7_1_U
6w6l_1_U
6olz_1_BV
6olf_1_U
5lzd_1_X
6m7k_1_B
3cd6_1_4
3cma_1_5
6n9e_1_2W
1vqo_1_4
1qvg_1_3
3cme_1_5
5lzd_1_W
5lze_1_W
5lzc_1_W
5lzb_1_W
3wzi_1_C
1mvr_1_E
1mvr_1_B
1mvr_1_A
4adx_1_0
4adx_1_8
1n33_1_Z
6dti_1_W
3d2s_1_F
3d2s_1_H
5mrc_1_AA
5mre_1_AA
5mrf_1_AA
7jhy_1_Z
2r1g_1_A
2r1g_1_D
2r1g_1_F
3eq4_1_Y
4wkr_1_C
2r1g_1_X
4v99_1_EC
4v99_1_AC
4v99_1_BH
4v99_1_CH
4v99_1_AM
4v99_1_DC
4v99_1_JW
4v99_1_EH
4v99_1_BW
4v99_1_FW
4v99_1_AW
4v99_1_BC
4v99_1_BM
4v99_1_IC
4v99_1_EM
4v99_1_ER
4v99_1_IW
4v99_1_JH
4v99_1_JR
4v99_1_AH
4v99_1_GR
4v99_1_IR
4v99_1_BR
4v99_1_CW
4v99_1_HR
4v99_1_FH
4v99_1_HC
4v99_1_DW
4v99_1_GC
4v99_1_JC
4v99_1_DM
4v99_1_EW
4v99_1_AR
4v99_1_CR
4v99_1_JM
4v99_1_CC
4v99_1_IH
4v99_1_FR
4v99_1_CM
4v99_1_IM
4v99_1_FM
4v99_1_FC
4v99_1_GH
4v99_1_HM
4v99_1_HH
4v99_1_DR
4v99_1_HW
4v99_1_GW
4v99_1_DH
4v99_1_GM
6rt4_1_D
6rt4_1_C
6zvh_1_X
4dwa_1_D
6n6c_1_D
6n6j_1_C
6n6j_1_D
6p7q_1_E
6p7q_1_F
6p7q_1_D
6rcl_1_C
5jju_1_C
4ejt_1_G
6lkq_1_W
3qsu_1_P
3qsu_1_R
2xs7_1_B
1n38_1_B
4qvc_1_G
6q1h_1_D
6q1h_1_H
6p7p_1_F
6p7p_1_E
6p7p_1_D
6vm6_1_J
6vm6_1_G
6wan_1_K
6wan_1_H
6wan_1_G
6wan_1_L
6wan_1_I
6ywo_1_F
6wan_1_J
4oau_1_A
6ywo_1_E
6ywo_1_K
6vm6_1_I
6vm6_1_H
6ywo_1_I
2a1r_1_C
6m6v_1_F
6m6v_1_E
2a1r_1_D
3gpq_1_E
3gpq_1_F
6o79_1_C
6vm6_1_K
6m6v_1_G
6hyu_1_D
1laj_1_R
6ybv_1_K
6mpf_1_W
6spc_1_A
6spe_1_A
6zvk_1_D2
7a01_1_D2
6fti_1_V
6ftj_1_V
6ftg_1_V
4g0a_1_G
4g0a_1_F
4g0a_1_E
2b2d_1_S
5hkc_1_C
4kzy_1_I
4kzz_1_I
4kzx_1_I
1rmv_1_B
4qu7_1_X
4qu7_1_V
4qu7_1_U
4v5z_1_AH
4v5z_1_AA
4v5z_1_AB
4v5z_1_AC
4v5z_1_AD
4v5z_1_AE
4v5z_1_AF
4v5z_1_AG
6pmi_1_3
6pmj_1_3
5hjz_1_C
......
This diff could not be displayed because it is too large.
......@@ -4,6 +4,7 @@ cd /home/lbecquey/Projects/RNANet
rm -rf latest_run.log errors.txt
# Run RNANet
bash -c 'time python3.8 ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 --no-homology --redundant --extract' > latest_run.log 2>&1
bash -c 'time python3.8 ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 --redundant --sina --extract -s --stats-opts="--wadley --distance-matrices" --archive' > latest_run.log 2>&1
echo 'Compressing RNANet.db.gz...' >> latest_run.log
touch results/RNANet.db # update last modification date
......
#!python3
import subprocess, os, sys
from RNAnet import *
# Maintenance script: wipe every trace of a few RNA families (generated files
# and database rows) and then re-run RNANet so that they are rebuilt from scratch.

# Put a list of problematic families here, they will be properly deleted and recomputed
families = [
    "RF00005"
]

# Edit these paths so they point to your data folders and to the RNANet.db file.
# They are hard-coded: this script does not read any command-line argument.
path_to_3D_data = "/home/lbecquey/Data/RNA/3D/"
path_to_seq_data = "/home/lbecquey/Data/RNA/sequences/"
path_to_db = "/home/lbecquey/Projects/RNANet/results/RNANet.db"


def _remove_family_files(folder, fam):
    """Delete every entry of `folder` whose file name contains the accession `fam`."""
    for name in os.listdir(folder):
        if fam in name:
            # "rm -f" keeps going silently if the file vanished in the meantime
            subprocess.run(["rm", "-f", f"{folder}/{name}"])


for fam in families:
    print()
    print()
    print()
    print(f"Removing {fam} files...")

    # Remove the datapoints files
    _remove_family_files(path_to_3D_data + "/datapoints", fam)

    # Remove the alignments
    _remove_family_files(path_to_seq_data + "/realigned", fam)

    # Delete the family from the database, and the associated nucleotides and re_mappings, using foreign keys.
    # The accession is a SQL string literal, so it must be single-quoted: SQLite treats a
    # double-quoted token as an identifier first, and recent sqlite3 shells refuse the
    # legacy fallback to a string.
    command = ["sqlite3", path_to_db, f"PRAGMA foreign_keys=ON; delete from family where rfam_acc='{fam}';"]
    print(' '.join(command))
    subprocess.run(command)

# Now re run RNANet normally.
# No shell is involved when a list is given to subprocess.run(), so the value of
# --stats-opts must not carry quote characters: they would be handed to RNANet
# verbatim. The whole option (spaces included) is already a single argv entry.
# Note that the line printed below is informative only; quote the --stats-opts
# value if you paste it into a shell.
command = ["python3.8", "./RNAnet.py", "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0",
           "--redundant", "--sina", "--extract", "-s", "--stats-opts=--wadley --distance-matrices"]
print(' '.join(command))
subprocess.run(command)
\ No newline at end of file
......@@ -917,7 +917,7 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s):
# Identify the right 3D file
filename = ''
for file in filelist:
if file.startswith(s.id.replace('-', '').replace('[', '_').replace(']', '_')):
if file.startswith(s.id.split("RF")[0].replace('-', '').replace('[', '_').replace(']', '_')):
filename = path_to_3D_data + "rna_mapped_to_Rfam/" + file
break
if not len(filename):
......@@ -954,8 +954,8 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s):
d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j])
# Save the individual distance matrices
if f not in LSU_set and f not in SSU_set:
np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f")
# if f not in LSU_set and f not in SSU_set:
np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f")
# For the average and sd, we want to consider only positions of the consensus model. This means:
# - Add empty space when we have deletions
......@@ -979,11 +979,12 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s):
for i in range(len(s.seq)):
if cm_coords[i] is None:
continue
pos_i = int(cm_coords[i])-1
for j in range(len(s.seq)):
if j >= len(cm_coords):
print(f"Issue with {s.id} mapped to {f} ({label}, {j}/{len(s.seq)}, {len(cm_coords)})")
if cm_coords[j] is None:
continue
c[pos_i, int(cm_coords[j])-1] = d[i,j]
c[int(cm_coords[i])-1, int(cm_coords[j])-1] = d[i,j]
# return the matrices counts, c, c^2
return 1-np.isnan(c).astype(int), np.nan_to_num(c), np.nan_to_num(c*c)
......@@ -1015,9 +1016,16 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False):
r = sql_ask_database(conn, f"SELECT structure_id, '_1_', chain_name, '_', CAST(pdb_start AS TEXT), '-', CAST(pdb_end AS TEXT) FROM chain WHERE rfam_acc='{f}';")
filelist = sorted([ ''.join(list(x))+'.cif' for x in r ])
r = sql_ask_database(conn, f"SELECT cm_coord FROM align_column WHERE rfam_acc = '{f}' AND index_ali > 0 ORDER BY index_ali ASC;")
cm_coords = [ x[0] for x in r ]
cm_coords = [ x[0] for x in r ] # len(cm_coords) is the number of saved columns. There are many None values in the list.
i = len(cm_coords)-1
while cm_coords[i] is None:
if i == 0:
# Issue somewhere. Abort.
warn(f"{f} has no mapping to CM. Ignoring distance matrix.")
if not multithread:
idxQueue.put(thr_idx) # replace the thread index in the queue
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
return 0
i -= 1
family_end = int(cm_coords[i])
counts = np.zeros((family_end, family_end))
......@@ -1309,14 +1317,14 @@ if __name__ == "__main__":
except:
print("Something went wrong")
# # Now process the memory-heavy tasks family by family
# if DO_AVG_DISTANCE_MATRIX:
# for f in LSU_set:
# get_avg_std_distance_matrix(f, True, True)
# get_avg_std_distance_matrix(f, False, True)
# for f in SSU_set:
# get_avg_std_distance_matrix(f, True, True)
# get_avg_std_distance_matrix(f, False, True)
# Now process the memory-heavy tasks family by family
if DO_AVG_DISTANCE_MATRIX:
for f in LSU_set:
get_avg_std_distance_matrix(f, True, True)
get_avg_std_distance_matrix(f, False, True)
for f in SSU_set:
get_avg_std_distance_matrix(f, True, True)
get_avg_std_distance_matrix(f, False, True)
print()
print()
......