Showing
6 changed files
with
82 additions
and
23 deletions
... | @@ -1576,13 +1576,14 @@ class Pipeline: | ... | @@ -1576,13 +1576,14 @@ class Pipeline: |
1576 | subprocess.run(["tar", "-C", path_to_3D_data + "/datapoints", "-czf", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", "."]) | 1576 | subprocess.run(["tar", "-C", path_to_3D_data + "/datapoints", "-czf", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", "."]) |
1577 | subprocess.run(["ln", "-s", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", runDir + f"/archive/RNANET_datapoints_latest.tar.gz"]) | 1577 | subprocess.run(["ln", "-s", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", runDir + f"/archive/RNANET_datapoints_latest.tar.gz"]) |
1578 | 1578 | ||
1579 | - # gather the alignments | 1579 | + if self.HOMOLOGY: |
1580 | - os.makedirs(path_to_seq_data + "realigned/3d_only", exist_ok=True) | 1580 | + # gather the alignments |
1581 | - for f in os.listdir(path_to_seq_data + "realigned"): | 1581 | + os.makedirs(path_to_seq_data + "realigned/3d_only", exist_ok=True) |
1582 | - if "3d_only.afa" in f: | 1582 | + for f in os.listdir(path_to_seq_data + "realigned"): |
1583 | - subprocess.run(["cp", path_to_seq_data + "realigned/" + f, path_to_seq_data + "realigned/3d_only"]) | 1583 | + if "3d_only.afa" in f: |
1584 | - subprocess.run(["rm", "-f", runDir + f"/archive/RNANET_alignments_latest.tar.gz"]) | 1584 | + subprocess.run(["cp", path_to_seq_data + "realigned/" + f, path_to_seq_data + "realigned/3d_only"]) |
1585 | - subprocess.run(["tar", "-C", path_to_seq_data + "realigned/3d_only" , "-czf", runDir + f"/archive/RNANET_alignments_latest.tar.gz", "."]) | 1585 | + subprocess.run(["rm", "-f", runDir + f"/archive/RNANET_alignments_latest.tar.gz"]) |
1586 | + subprocess.run(["tar", "-C", path_to_seq_data + "realigned/3d_only" , "-czf", runDir + f"/archive/RNANET_alignments_latest.tar.gz", "."]) | ||
1586 | 1587 | ||
1587 | def sanitize_database(self): | 1588 | def sanitize_database(self): |
1588 | """Searches for issues in the database and correct them""" | 1589 | """Searches for issues in the database and correct them""" |
... | @@ -2699,7 +2700,15 @@ def work_pssm_remap(f): | ... | @@ -2699,7 +2700,15 @@ def work_pssm_remap(f): |
2699 | cm_coords = [ None for x in range(ncols) ] | 2700 | cm_coords = [ None for x in range(ncols) ] |
2700 | cm_2d = [ None for x in range(ncols) ] | 2701 | cm_2d = [ None for x in range(ncols) ] |
2701 | 2702 | ||
2702 | - data = [(f,j,i,cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + (consensus[j-1], cm_2d[j-1]) for i, j in enumerate(columns)] | 2703 | + # remove columns from the database if they are not supposed to be saved anymore |
2704 | + already_saved = sql_ask_database(conn, f"SELECT index_ali FROM align_column WHERE rfam_acc='{f}';") | ||
2705 | + already_saved = set([ x[0] for x in already_saved ]) | ||
2706 | + to_remove = already_saved - columns_to_save | ||
2707 | + if len(to_remove): | ||
2708 | + sql_execute(conn, f"DELETE FROM align_column WHERE rfam_acc='{f}' AND index_ali = ?;", data=(sorted(to_remove),)) | ||
2709 | + | ||
2710 | + # Now store the columns | ||
2711 | + data = [(f,j,i+1,cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + (consensus[j-1], cm_2d[j-1]) for i, j in enumerate(columns)] | ||
2703 | sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, index_small_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct) | 2712 | sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, index_small_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct) |
2704 | VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO | 2713 | VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO |
2705 | UPDATE SET index_small_ali=excluded.index_small_ali, cm_coord=excluded.cm_coord, freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, | 2714 | UPDATE SET index_small_ali=excluded.index_small_ali, cm_coord=excluded.cm_coord, freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, |
... | @@ -2768,7 +2777,7 @@ def work_save(c, homology=True): | ... | @@ -2768,7 +2777,7 @@ def work_save(c, homology=True): |
2768 | conn.execute('pragma journal_mode=wal') | 2777 | conn.execute('pragma journal_mode=wal') |
2769 | if homology: | 2778 | if homology: |
2770 | df = pd.read_sql_query(f""" | 2779 | df = pd.read_sql_query(f""" |
2771 | - SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, cm_coord, index_small_ali, | 2780 | + SELECT index_chain, cm_coord, index_small_ali, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, |
2772 | is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, | 2781 | is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, |
2773 | gap_percent, consensus, cons_sec_struct, dbn, paired, nb_interact, pair_type_LW, pair_type_DSSR, | 2782 | gap_percent, consensus, cons_sec_struct, dbn, paired, nb_interact, pair_type_LW, pair_type_DSSR, |
2774 | alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, chi, bb_type, glyco_bond, form, ssZp, Dp, | 2783 | alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, chi, bb_type, glyco_bond, form, ssZp, Dp, | ... | ... |
This diff is collapsed. Click to expand it.
This diff could not be displayed because it is too large.
... | @@ -4,6 +4,7 @@ cd /home/lbecquey/Projects/RNANet | ... | @@ -4,6 +4,7 @@ cd /home/lbecquey/Projects/RNANet |
4 | rm -rf latest_run.log errors.txt | 4 | rm -rf latest_run.log errors.txt |
5 | 5 | ||
6 | # Run RNANet | 6 | # Run RNANet |
7 | +bash -c 'time python3.8 ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 --no-homology --redundant --extract' > latest_run.log 2>&1 | ||
7 | bash -c 'time python3.8 ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 --redundant --sina --extract -s --stats-opts="--wadley --distance-matrices" --archive' > latest_run.log 2>&1 | 8 | bash -c 'time python3.8 ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 --redundant --sina --extract -s --stats-opts="--wadley --distance-matrices" --archive' > latest_run.log 2>&1 |
8 | echo 'Compressing RNANet.db.gz...' >> latest_run.log | 9 | echo 'Compressing RNANet.db.gz...' >> latest_run.log |
9 | touch results/RNANet.db # update last modification date | 10 | touch results/RNANet.db # update last modification date | ... | ... |
scripts/recompute_family.py
0 → 100644
1 | +#!python3 | ||
2 | +import subprocess, os, sys | ||
3 | +from RNAnet import * | ||
4 | + | ||
5 | + | ||
6 | +# Put a list of problematic families here, they will be properly deleted and recomputed | ||
7 | +families = [ | ||
8 | + "RF00005" | ||
9 | +] | ||
10 | + | ||
11 | +# provide the path to your data folders, the RNANet.db file, and the RNANet.py file as arguments to this script | ||
12 | +path_to_3D_data = "/home/lbecquey/Data/RNA/3D/" | ||
13 | +path_to_seq_data = "/home/lbecquey/Data/RNA/sequences/" | ||
14 | +path_to_db = "/home/lbecquey/Projects/RNANet/results/RNANet.db" | ||
15 | + | ||
16 | +for fam in families: | ||
17 | + print() | ||
18 | + print() | ||
19 | + print() | ||
20 | + print(f"Removing {fam} files...") | ||
21 | + | ||
22 | + # Remove the datapoints files | ||
23 | + files = [ f for f in os.listdir(path_to_3D_data + "/datapoints") if fam in f ] | ||
24 | + for f in files: | ||
25 | + subprocess.run(["rm", '-f', path_to_3D_data + f"/datapoints/{f}"]) | ||
26 | + | ||
27 | + # Remove the alignments | ||
28 | + files = [ f for f in os.listdir(path_to_seq_data + "/realigned") if fam in f ] | ||
29 | + for f in files: | ||
30 | + subprocess.run(["rm", '-f', path_to_seq_data + f"/realigned/{f}"]) | ||
31 | + | ||
32 | + # Delete the family from the database, and the associated nucleotides and re_mappings, using foreign keys | ||
33 | + command = ["sqlite3", path_to_db, f"PRAGMA foreign_keys=ON; delete from family where rfam_acc=\"{fam}\";"] | ||
34 | + print(' '.join(command)) | ||
35 | + subprocess.run(command) | ||
36 | + | ||
37 | +# Now re run RNANet normally. | ||
38 | +command = ["python3.8", "./RNAnet.py", "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0", | ||
39 | + "--redundant", "--sina", "--extract", "-s", "--stats-opts=\"--wadley --distance-matrices\""] | ||
40 | +print(' '.join(command)) | ||
41 | +subprocess.run(command) | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
... | @@ -917,7 +917,7 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s): | ... | @@ -917,7 +917,7 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s): |
917 | # Identify the right 3D file | 917 | # Identify the right 3D file |
918 | filename = '' | 918 | filename = '' |
919 | for file in filelist: | 919 | for file in filelist: |
920 | - if file.startswith(s.id.replace('-', '').replace('[', '_').replace(']', '_')): | 920 | + if file.startswith(s.id.split("RF")[0].replace('-', '').replace('[', '_').replace(']', '_')): |
921 | filename = path_to_3D_data + "rna_mapped_to_Rfam/" + file | 921 | filename = path_to_3D_data + "rna_mapped_to_Rfam/" + file |
922 | break | 922 | break |
923 | if not len(filename): | 923 | if not len(filename): |
... | @@ -954,8 +954,8 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s): | ... | @@ -954,8 +954,8 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s): |
954 | d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j]) | 954 | d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j]) |
955 | 955 | ||
956 | # Save the individual distance matrices | 956 | # Save the individual distance matrices |
957 | - if f not in LSU_set and f not in SSU_set: | 957 | + # if f not in LSU_set and f not in SSU_set: |
958 | - np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f") | 958 | + np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f") |
959 | 959 | ||
960 | # For the average and sd, we want to consider only positions of the consensus model. This means: | 960 | # For the average and sd, we want to consider only positions of the consensus model. This means: |
961 | # - Add empty space when we have deletions | 961 | # - Add empty space when we have deletions |
... | @@ -979,11 +979,12 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s): | ... | @@ -979,11 +979,12 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s): |
979 | for i in range(len(s.seq)): | 979 | for i in range(len(s.seq)): |
980 | if cm_coords[i] is None: | 980 | if cm_coords[i] is None: |
981 | continue | 981 | continue |
982 | - pos_i = int(cm_coords[i])-1 | ||
983 | for j in range(len(s.seq)): | 982 | for j in range(len(s.seq)): |
983 | + if j >= len(cm_coords): | ||
984 | + print(f"Issue with {s.id} mapped to {f} ({label}, {j}/{len(s.seq)}, {len(cm_coords)})") | ||
984 | if cm_coords[j] is None: | 985 | if cm_coords[j] is None: |
985 | continue | 986 | continue |
986 | - c[pos_i, int(cm_coords[j])-1] = d[i,j] | 987 | + c[int(cm_coords[i])-1, int(cm_coords[j])-1] = d[i,j] |
987 | # return the matrices counts, c, c^2 | 988 | # return the matrices counts, c, c^2 |
988 | return 1-np.isnan(c).astype(int), np.nan_to_num(c), np.nan_to_num(c*c) | 989 | return 1-np.isnan(c).astype(int), np.nan_to_num(c), np.nan_to_num(c*c) |
989 | 990 | ||
... | @@ -1015,9 +1016,16 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False): | ... | @@ -1015,9 +1016,16 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False): |
1015 | r = sql_ask_database(conn, f"SELECT structure_id, '_1_', chain_name, '_', CAST(pdb_start AS TEXT), '-', CAST(pdb_end AS TEXT) FROM chain WHERE rfam_acc='{f}';") | 1016 | r = sql_ask_database(conn, f"SELECT structure_id, '_1_', chain_name, '_', CAST(pdb_start AS TEXT), '-', CAST(pdb_end AS TEXT) FROM chain WHERE rfam_acc='{f}';") |
1016 | filelist = sorted([ ''.join(list(x))+'.cif' for x in r ]) | 1017 | filelist = sorted([ ''.join(list(x))+'.cif' for x in r ]) |
1017 | r = sql_ask_database(conn, f"SELECT cm_coord FROM align_column WHERE rfam_acc = '{f}' AND index_ali > 0 ORDER BY index_ali ASC;") | 1018 | r = sql_ask_database(conn, f"SELECT cm_coord FROM align_column WHERE rfam_acc = '{f}' AND index_ali > 0 ORDER BY index_ali ASC;") |
1018 | - cm_coords = [ x[0] for x in r ] | 1019 | + cm_coords = [ x[0] for x in r ] # len(cm_coords) is the number of saved columns. There are many None values in the list. |
1019 | i = len(cm_coords)-1 | 1020 | i = len(cm_coords)-1 |
1020 | while cm_coords[i] is None: | 1021 | while cm_coords[i] is None: |
1022 | + if i == 0: | ||
1023 | + # Issue somewhere. Abort. | ||
1024 | + warn(f"{f} has no mapping to CM. Ignoring distance matrix.") | ||
1025 | + if not multithread: | ||
1026 | + idxQueue.put(thr_idx) # replace the thread index in the queue | ||
1027 | + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") | ||
1028 | + return 0 | ||
1021 | i -= 1 | 1029 | i -= 1 |
1022 | family_end = int(cm_coords[i]) | 1030 | family_end = int(cm_coords[i]) |
1023 | counts = np.zeros((family_end, family_end)) | 1031 | counts = np.zeros((family_end, family_end)) |
... | @@ -1309,14 +1317,14 @@ if __name__ == "__main__": | ... | @@ -1309,14 +1317,14 @@ if __name__ == "__main__": |
1309 | except: | 1317 | except: |
1310 | print("Something went wrong") | 1318 | print("Something went wrong") |
1311 | 1319 | ||
1312 | - # # Now process the memory-heavy tasks family by family | 1320 | + # Now process the memory-heavy tasks family by family |
1313 | - # if DO_AVG_DISTANCE_MATRIX: | 1321 | + if DO_AVG_DISTANCE_MATRIX: |
1314 | - # for f in LSU_set: | 1322 | + for f in LSU_set: |
1315 | - # get_avg_std_distance_matrix(f, True, True) | 1323 | + get_avg_std_distance_matrix(f, True, True) |
1316 | - # get_avg_std_distance_matrix(f, False, True) | 1324 | + get_avg_std_distance_matrix(f, False, True) |
1317 | - # for f in SSU_set: | 1325 | + for f in SSU_set: |
1318 | - # get_avg_std_distance_matrix(f, True, True) | 1326 | + get_avg_std_distance_matrix(f, True, True) |
1319 | - # get_avg_std_distance_matrix(f, False, True) | 1327 | + get_avg_std_distance_matrix(f, False, True) |
1320 | 1328 | ||
1321 | print() | 1329 | print() |
1322 | print() | 1330 | print() | ... | ... |
-
Please register or login to post a comment