Louis BECQUEY

Delete unused cm_coords columns from the database at updates

......@@ -1576,6 +1576,7 @@ class Pipeline:
subprocess.run(["tar", "-C", path_to_3D_data + "/datapoints", "-czf", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", "."])
subprocess.run(["ln", "-s", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", runDir + f"/archive/RNANET_datapoints_latest.tar.gz"])
if self.HOMOLOGY:
# gather the alignments
os.makedirs(path_to_seq_data + "realigned/3d_only", exist_ok=True)
for f in os.listdir(path_to_seq_data + "realigned"):
......@@ -2699,7 +2700,15 @@ def work_pssm_remap(f):
cm_coords = [ None for x in range(ncols) ]
cm_2d = [ None for x in range(ncols) ]
data = [(f,j,i,cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + (consensus[j-1], cm_2d[j-1]) for i, j in enumerate(columns)]
# remove columns from the database if they are not supposed to be saved anymore
already_saved = sql_ask_database(conn, f"SELECT index_ali FROM align_column WHERE rfam_acc='{f}';")
already_saved = set([ x[0] for x in already_saved ])
to_remove = already_saved - columns_to_save
if len(to_remove):
sql_execute(conn, f"DELETE FROM align_column WHERE rfam_acc='{f}' AND index_ali = ?;", data=(sorted(to_remove),))
# Now store the columns
data = [(f,j,i+1,cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + (consensus[j-1], cm_2d[j-1]) for i, j in enumerate(columns)]
sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, index_small_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO
UPDATE SET index_small_ali=excluded.index_small_ali, cm_coord=excluded.cm_coord, freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U,
......@@ -2768,7 +2777,7 @@ def work_save(c, homology=True):
conn.execute('pragma journal_mode=wal')
if homology:
df = pd.read_sql_query(f"""
SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, cm_coord, index_small_ali,
SELECT index_chain, cm_coord, index_small_ali, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code,
is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other,
gap_percent, consensus, cons_sec_struct, dbn, paired, nb_interact, pair_type_LW, pair_type_DSSR,
alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, chi, bb_type, glyco_bond, form, ssZp, Dp,
......
This diff is collapsed. Click to expand it.
This diff could not be displayed because it is too large.
......@@ -4,6 +4,7 @@ cd /home/lbecquey/Projects/RNANet
rm -rf latest_run.log errors.txt
# Run RNANet
bash -c 'time python3.8 ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 --no-homology --redundant --extract' > latest_run.log 2>&1
bash -c 'time python3.8 ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 --redundant --sina --extract -s --stats-opts="--wadley --distance-matrices" --archive' > latest_run.log 2>&1
echo 'Compressing RNANet.db.gz...' >> latest_run.log
touch results/RNANet.db # update last modification date
......
#!/usr/bin/env python3

"""Delete one or more Rfam families from the RNANet results, then recompute them.

For each family listed in `families`, this script removes the family's datapoint
files and alignment files, deletes the family row from the SQLite database
(nucleotides and re_mappings follow through ON DELETE CASCADE foreign keys),
and finally re-runs RNAnet to regenerate everything.
"""

import os
import subprocess

from RNAnet import *

# Put a list of problematic families here, they will be properly deleted and recomputed
families = [
    "RF00005"
]

# Set the paths to your data folders, the RNANet.db file, and the RNAnet.py file here
path_to_3D_data = "/home/lbecquey/Data/RNA/3D/"
path_to_seq_data = "/home/lbecquey/Data/RNA/sequences/"
path_to_db = "/home/lbecquey/Projects/RNANet/results/RNANet.db"

for fam in families:
    print()
    print()
    print()
    print(f"Removing {fam} files...")

    # Remove the datapoints files.
    # os.remove avoids spawning one 'rm' process per file; OSError is ignored
    # to mimic 'rm -f' (file already gone is fine).
    for f in os.listdir(path_to_3D_data + "/datapoints"):
        if fam in f:
            try:
                os.remove(path_to_3D_data + f"/datapoints/{f}")
            except OSError:
                pass

    # Remove the alignments
    for f in os.listdir(path_to_seq_data + "/realigned"):
        if fam in f:
            try:
                os.remove(path_to_seq_data + f"/realigned/{f}")
            except OSError:
                pass

    # Delete the family from the database, and the associated nucleotides and re_mappings, using foreign keys.
    # (fam comes from the hard-coded list above, so the f-string interpolation is safe here.)
    command = ["sqlite3", path_to_db, f"PRAGMA foreign_keys=ON; delete from family where rfam_acc=\"{fam}\";"]
    print(' '.join(command))
    subprocess.run(command)

# Now re-run RNANet normally.
# NOTE: subprocess.run with a list does NOT go through a shell, so the option
# value must not carry literal quote characters — the previous
# --stats-opts=\"...\" form delivered the backslash-escaped quotes verbatim
# to RNAnet.py's argument parser.
command = ["python3.8", "./RNAnet.py", "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0",
           "--redundant", "--sina", "--extract", "-s", "--stats-opts=--wadley --distance-matrices"]
print(' '.join(command))
subprocess.run(command)
\ No newline at end of file
......@@ -917,7 +917,7 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s):
# Identify the right 3D file
filename = ''
for file in filelist:
if file.startswith(s.id.replace('-', '').replace('[', '_').replace(']', '_')):
if file.startswith(s.id.split("RF")[0].replace('-', '').replace('[', '_').replace(']', '_')):
filename = path_to_3D_data + "rna_mapped_to_Rfam/" + file
break
if not len(filename):
......@@ -954,7 +954,7 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s):
d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j])
# Save the individual distance matrices
if f not in LSU_set and f not in SSU_set:
# if f not in LSU_set and f not in SSU_set:
np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f")
# For the average and sd, we want to consider only positions of the consensus model. This means:
......@@ -979,11 +979,12 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s):
for i in range(len(s.seq)):
if cm_coords[i] is None:
continue
pos_i = int(cm_coords[i])-1
for j in range(len(s.seq)):
if j >= len(cm_coords):
print(f"Issue with {s.id} mapped to {f} ({label}, {j}/{len(s.seq)}, {len(cm_coords)})")
if cm_coords[j] is None:
continue
c[pos_i, int(cm_coords[j])-1] = d[i,j]
c[int(cm_coords[i])-1, int(cm_coords[j])-1] = d[i,j]
# return the matrices counts, c, c^2
return 1-np.isnan(c).astype(int), np.nan_to_num(c), np.nan_to_num(c*c)
......@@ -1015,9 +1016,16 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False):
r = sql_ask_database(conn, f"SELECT structure_id, '_1_', chain_name, '_', CAST(pdb_start AS TEXT), '-', CAST(pdb_end AS TEXT) FROM chain WHERE rfam_acc='{f}';")
filelist = sorted([ ''.join(list(x))+'.cif' for x in r ])
r = sql_ask_database(conn, f"SELECT cm_coord FROM align_column WHERE rfam_acc = '{f}' AND index_ali > 0 ORDER BY index_ali ASC;")
cm_coords = [ x[0] for x in r ]
cm_coords = [ x[0] for x in r ] # len(cm_coords) is the number of saved columns. There are many None values in the list.
i = len(cm_coords)-1
while cm_coords[i] is None:
if i == 0:
# Issue somewhere. Abort.
warn(f"{f} has no mapping to CM. Ignoring distance matrix.")
if not multithread:
idxQueue.put(thr_idx) # replace the thread index in the queue
setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
return 0
i -= 1
family_end = int(cm_coords[i])
counts = np.zeros((family_end, family_end))
......@@ -1309,14 +1317,14 @@ if __name__ == "__main__":
except:
print("Something went wrong")
# # Now process the memory-heavy tasks family by family
# if DO_AVG_DISTANCE_MATRIX:
# for f in LSU_set:
# get_avg_std_distance_matrix(f, True, True)
# get_avg_std_distance_matrix(f, False, True)
# for f in SSU_set:
# get_avg_std_distance_matrix(f, True, True)
# get_avg_std_distance_matrix(f, False, True)
# Now process the memory-heavy tasks family by family
if DO_AVG_DISTANCE_MATRIX:
for f in LSU_set:
get_avg_std_distance_matrix(f, True, True)
get_avg_std_distance_matrix(f, False, True)
for f in SSU_set:
get_avg_std_distance_matrix(f, True, True)
get_avg_std_distance_matrix(f, False, True)
print()
print()
......