Louis BECQUEY

delete unused cm_coords at updates

@@ -1576,13 +1576,14 @@ class Pipeline:
         subprocess.run(["tar", "-C", path_to_3D_data + "/datapoints", "-czf", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", "."])
         subprocess.run(["ln", "-s", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", runDir + f"/archive/RNANET_datapoints_latest.tar.gz"])
 
-        # gather the alignments
-        os.makedirs(path_to_seq_data + "realigned/3d_only", exist_ok=True)
-        for f in os.listdir(path_to_seq_data + "realigned"):
-            if "3d_only.afa" in f:
-                subprocess.run(["cp", path_to_seq_data + "realigned/" + f, path_to_seq_data + "realigned/3d_only"])
-        subprocess.run(["rm", "-f", runDir + f"/archive/RNANET_alignments_latest.tar.gz"])
-        subprocess.run(["tar", "-C", path_to_seq_data + "realigned/3d_only" , "-czf", runDir + f"/archive/RNANET_alignments_latest.tar.gz", "."])
+        if self.HOMOLOGY:
+            # gather the alignments
+            os.makedirs(path_to_seq_data + "realigned/3d_only", exist_ok=True)
+            for f in os.listdir(path_to_seq_data + "realigned"):
+                if "3d_only.afa" in f:
+                    subprocess.run(["cp", path_to_seq_data + "realigned/" + f, path_to_seq_data + "realigned/3d_only"])
+            subprocess.run(["rm", "-f", runDir + f"/archive/RNANET_alignments_latest.tar.gz"])
+            subprocess.run(["tar", "-C", path_to_seq_data + "realigned/3d_only" , "-czf", runDir + f"/archive/RNANET_alignments_latest.tar.gz", "."])
 
     def sanitize_database(self):
         """Searches for issues in the database and correct them"""
@@ -2699,7 +2700,15 @@ def work_pssm_remap(f):
     cm_coords = [ None for x in range(ncols) ]
     cm_2d = [ None for x in range(ncols) ]
 
-    data = [(f,j,i,cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + (consensus[j-1], cm_2d[j-1]) for i, j in enumerate(columns)]
+    # remove columns from the database if they are not supposed to be saved anymore
+    already_saved = sql_ask_database(conn, f"SELECT index_ali FROM align_column WHERE rfam_acc='{f}';")
+    already_saved = set([ x[0] for x in already_saved ])
+    to_remove = already_saved - columns_to_save
+    if len(to_remove):
+        sql_execute(conn, f"DELETE FROM align_column WHERE rfam_acc='{f}' AND index_ali = ?;", data=(sorted(to_remove),))
+
+    # Now store the columns
+    data = [(f,j,i+1,cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + (consensus[j-1], cm_2d[j-1]) for i, j in enumerate(columns)]
     sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, index_small_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct)
                          VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO
                          UPDATE SET index_small_ali=excluded.index_small_ali, cm_coord=excluded.cm_coord, freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U,
@@ -2768,7 +2777,7 @@ def work_save(c, homology=True):
     conn.execute('pragma journal_mode=wal')
     if homology:
         df = pd.read_sql_query(f"""
-            SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, cm_coord, index_small_ali,
+            SELECT index_chain, cm_coord, index_small_ali, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code,
            is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other,
            gap_percent, consensus, cons_sec_struct, dbn, paired, nb_interact, pair_type_LW, pair_type_DSSR,
            alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, chi, bb_type, glyco_bond, form, ssZp, Dp,
...
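
Note on the work_pssm_remap() hunk above: the new code first asks the database which align_column rows already exist for the family, deletes those whose index_ali is no longer among the columns to save, and only then upserts the remaining columns through the existing INSERT ... ON CONFLICT statement. Below is a minimal sketch of that clean-up-then-upsert pattern using the standard sqlite3 module; RNAnet's sql_ask_database()/sql_execute() helpers are not reproduced, only a few of the real columns are kept for brevity, and it assumes the (rfam_acc, index_ali) unique key that the ON CONFLICT clause relies on.

import sqlite3

def refresh_align_columns(db_path, rfam_acc, columns_to_save, rows):
    """rows: list of (rfam_acc, index_ali, index_small_ali, cm_coord) tuples to keep."""
    with sqlite3.connect(db_path) as conn:
        # Which alignment columns are already stored for this family?
        already_saved = {x[0] for x in conn.execute(
            "SELECT index_ali FROM align_column WHERE rfam_acc=?;", (rfam_acc,))}

        # Delete the ones that should not be kept anymore
        to_remove = already_saved - set(columns_to_save)
        if to_remove:
            conn.executemany("DELETE FROM align_column WHERE rfam_acc=? AND index_ali=?;",
                             [(rfam_acc, i) for i in sorted(to_remove)])

        # Insert or update the columns we want to keep (upsert on the unique key)
        conn.executemany("""INSERT INTO align_column (rfam_acc, index_ali, index_small_ali, cm_coord)
                            VALUES (?, ?, ?, ?)
                            ON CONFLICT(rfam_acc, index_ali) DO UPDATE SET
                                index_small_ali=excluded.index_small_ali, cm_coord=excluded.cm_coord;""",
                         rows)
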
@@ -4,6 +4,7 @@ cd /home/lbecquey/Projects/RNANet
 rm -rf latest_run.log errors.txt
 
 # Run RNANet
+bash -c 'time python3.8 ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 --no-homology --redundant --extract' > latest_run.log 2>&1
 bash -c 'time python3.8 ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 --redundant --sina --extract -s --stats-opts="--wadley --distance-matrices" --archive' > latest_run.log 2>&1
 echo 'Compressing RNANet.db.gz...' >> latest_run.log
 touch results/RNANet.db # update last modification date
...
@@ -0,0 +1,41 @@
+#!python3
+import subprocess, os, sys
+from RNAnet import *
+
+
+# Put a list of problematic families here, they will be properly deleted and recomputed
+families = [
+    "RF00005"
+]
+
+# Set the paths to your data folders and to the RNANet.db file below (RNAnet.py is expected in the working directory)
+path_to_3D_data = "/home/lbecquey/Data/RNA/3D/"
+path_to_seq_data = "/home/lbecquey/Data/RNA/sequences/"
+path_to_db = "/home/lbecquey/Projects/RNANet/results/RNANet.db"
+
+for fam in families:
+    print()
+    print()
+    print()
+    print(f"Removing {fam} files...")
+
+    # Remove the datapoints files
+    files = [ f for f in os.listdir(path_to_3D_data + "/datapoints") if fam in f ]
+    for f in files:
+        subprocess.run(["rm", '-f', path_to_3D_data + f"/datapoints/{f}"])
+
+    # Remove the alignments
+    files = [ f for f in os.listdir(path_to_seq_data + "/realigned") if fam in f ]
+    for f in files:
+        subprocess.run(["rm", '-f', path_to_seq_data + f"/realigned/{f}"])
+
+    # Delete the family from the database, and the associated nucleotides and re_mappings, using foreign keys
+    command = ["sqlite3", path_to_db, f"PRAGMA foreign_keys=ON; delete from family where rfam_acc=\"{fam}\";"]
+    print(' '.join(command))
+    subprocess.run(command)
+
+# Now re-run RNANet normally.
+command = ["python3.8", "./RNAnet.py", "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0",
+           "--redundant", "--sina", "--extract", "-s", "--stats-opts=\"--wadley --distance-matrices\""]
+print(' '.join(command))
+subprocess.run(command)
\ No newline at end of file
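
Aside on the new script above: it deletes each family through the sqlite3 command-line tool with PRAGMA foreign_keys=ON so that the dependent rows (re_mappings, nucleotides, ...) disappear with the family row. The same cascading delete can be issued directly from Python; this is only a sketch and assumes, as the script's comment does, that the schema declares its foreign keys with ON DELETE CASCADE.

import sqlite3

def drop_family(db_path, rfam_acc):
    # Delete one family row; child rows referencing rfam_acc are removed by the
    # cascading foreign keys once the pragma is enabled on this connection.
    with sqlite3.connect(db_path) as conn:
        conn.execute("PRAGMA foreign_keys = ON;")
        conn.execute("DELETE FROM family WHERE rfam_acc = ?;", (rfam_acc,))

# Example: drop_family("/home/lbecquey/Projects/RNANet/results/RNANet.db", "RF00005")
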
@@ -917,7 +917,7 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s):
     # Identify the right 3D file
     filename = ''
     for file in filelist:
-        if file.startswith(s.id.replace('-', '').replace('[', '_').replace(']', '_')):
+        if file.startswith(s.id.split("RF")[0].replace('-', '').replace('[', '_').replace(']', '_')):
             filename = path_to_3D_data + "rna_mapped_to_Rfam/" + file
             break
     if not len(filename):
@@ -954,8 +954,8 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s):
             d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j])
 
     # Save the individual distance matrices
-    if f not in LSU_set and f not in SSU_set:
-        np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f")
+    # if f not in LSU_set and f not in SSU_set:
+    np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f")
 
     # For the average and sd, we want to consider only positions of the consensus model. This means:
     # - Add empty space when we have deletions
@@ -979,11 +979,12 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s):
     for i in range(len(s.seq)):
         if cm_coords[i] is None:
             continue
-        pos_i = int(cm_coords[i])-1
         for j in range(len(s.seq)):
+            if j >= len(cm_coords):
+                print(f"Issue with {s.id} mapped to {f} ({label}, {j}/{len(s.seq)}, {len(cm_coords)})")
             if cm_coords[j] is None:
                 continue
-            c[pos_i, int(cm_coords[j])-1] = d[i,j]
+            c[int(cm_coords[i])-1, int(cm_coords[j])-1] = d[i,j]
     # return the matrices counts, c, c^2
     return 1-np.isnan(c).astype(int), np.nan_to_num(c), np.nan_to_num(c*c)
 
@@ -1015,9 +1016,16 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False):
     r = sql_ask_database(conn, f"SELECT structure_id, '_1_', chain_name, '_', CAST(pdb_start AS TEXT), '-', CAST(pdb_end AS TEXT) FROM chain WHERE rfam_acc='{f}';")
     filelist = sorted([ ''.join(list(x))+'.cif' for x in r ])
     r = sql_ask_database(conn, f"SELECT cm_coord FROM align_column WHERE rfam_acc = '{f}' AND index_ali > 0 ORDER BY index_ali ASC;")
-    cm_coords = [ x[0] for x in r ]
+    cm_coords = [ x[0] for x in r ] # len(cm_coords) is the number of saved columns. There are many None values in the list.
     i = len(cm_coords)-1
     while cm_coords[i] is None:
+        if i == 0:
+            # Issue somewhere. Abort.
+            warn(f"{f} has no mapping to CM. Ignoring distance matrix.")
+            if not multithread:
+                idxQueue.put(thr_idx) # replace the thread index in the queue
+                setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
+            return 0
         i -= 1
     family_end = int(cm_coords[i])
     counts = np.zeros((family_end, family_end))
@@ -1309,14 +1317,14 @@ if __name__ == "__main__":
     except:
         print("Something went wrong")
 
-    # # Now process the memory-heavy tasks family by family
-    # if DO_AVG_DISTANCE_MATRIX:
-    #     for f in LSU_set:
-    #         get_avg_std_distance_matrix(f, True, True)
-    #         get_avg_std_distance_matrix(f, False, True)
-    #     for f in SSU_set:
-    #         get_avg_std_distance_matrix(f, True, True)
-    #         get_avg_std_distance_matrix(f, False, True)
+    # Now process the memory-heavy tasks family by family
+    if DO_AVG_DISTANCE_MATRIX:
+        for f in LSU_set:
+            get_avg_std_distance_matrix(f, True, True)
+            get_avg_std_distance_matrix(f, False, True)
+        for f in SSU_set:
+            get_avg_std_distance_matrix(f, True, True)
+            get_avg_std_distance_matrix(f, False, True)
 
     print()
     print()
...
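
For readers of the par_distance_matrix() hunks above: each chain returns three matrices indexed by consensus-model coordinates, a 0/1 presence mask, the distances, and the squared distances, and get_avg_std_distance_matrix() sums them over chains to derive the family-level average and standard deviation. A sketch of that arithmetic with NumPy follows; the names are illustrative, this is not RNAnet's own aggregation code.

import numpy as np

def combine(per_chain_results, family_end):
    # per_chain_results: iterable of (presence, dist, dist_sq) matrices of shape (family_end, family_end)
    counts = np.zeros((family_end, family_end))
    sums   = np.zeros((family_end, family_end))
    sq     = np.zeros((family_end, family_end))
    for presence, dist, dist_sq in per_chain_results:
        counts += presence
        sums   += dist
        sq     += dist_sq
    with np.errstate(divide="ignore", invalid="ignore"):
        mean = sums / counts                  # E[d] per pair of model positions
        var  = sq / counts - mean * mean      # E[d^2] - E[d]^2
    return mean, np.sqrt(np.maximum(var, 0.0))   # clamp tiny negatives due to rounding
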