Louis BECQUEY

delete unused cm_coords at updates

@@ -1576,13 +1576,14 @@ class Pipeline:
         subprocess.run(["tar", "-C", path_to_3D_data + "/datapoints", "-czf", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", "."])
         subprocess.run(["ln", "-s", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", runDir + f"/archive/RNANET_datapoints_latest.tar.gz"])
 
-        # gather the alignments
-        os.makedirs(path_to_seq_data + "realigned/3d_only", exist_ok=True)
-        for f in os.listdir(path_to_seq_data + "realigned"):
-            if "3d_only.afa" in f:
-                subprocess.run(["cp", path_to_seq_data + "realigned/" + f, path_to_seq_data + "realigned/3d_only"])
-        subprocess.run(["rm", "-f", runDir + f"/archive/RNANET_alignments_latest.tar.gz"])
-        subprocess.run(["tar", "-C", path_to_seq_data + "realigned/3d_only" , "-czf", runDir + f"/archive/RNANET_alignments_latest.tar.gz", "."])
+        if self.HOMOLOGY:
+            # gather the alignments
+            os.makedirs(path_to_seq_data + "realigned/3d_only", exist_ok=True)
+            for f in os.listdir(path_to_seq_data + "realigned"):
+                if "3d_only.afa" in f:
+                    subprocess.run(["cp", path_to_seq_data + "realigned/" + f, path_to_seq_data + "realigned/3d_only"])
+            subprocess.run(["rm", "-f", runDir + f"/archive/RNANET_alignments_latest.tar.gz"])
+            subprocess.run(["tar", "-C", path_to_seq_data + "realigned/3d_only" , "-czf", runDir + f"/archive/RNANET_alignments_latest.tar.gz", "."])
 
     def sanitize_database(self):
         """Searches for issues in the database and correct them"""
@@ -2699,7 +2700,15 @@ def work_pssm_remap(f):
     cm_coords = [ None for x in range(ncols) ]
     cm_2d = [ None for x in range(ncols) ]
 
-    data = [(f,j,i,cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + (consensus[j-1], cm_2d[j-1]) for i, j in enumerate(columns)]
+    # remove columns from the database if they are not supposed to be saved anymore
+    already_saved = sql_ask_database(conn, f"SELECT index_ali FROM align_column WHERE rfam_acc='{f}';")
+    already_saved = set([ x[0] for x in already_saved ])
+    to_remove = already_saved - columns_to_save
+    if len(to_remove):
+        sql_execute(conn, f"DELETE FROM align_column WHERE rfam_acc='{f}' AND index_ali = ?;", data=(sorted(to_remove),))
+
+    # Now store the columns
+    data = [(f,j,i+1,cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + (consensus[j-1], cm_2d[j-1]) for i, j in enumerate(columns)]
     sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, index_small_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct)
                          VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO
                          UPDATE SET index_small_ali=excluded.index_small_ali, cm_coord=excluded.cm_coord, freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U,
@@ -2768,7 +2777,7 @@ def work_save(c, homology=True):
     conn.execute('pragma journal_mode=wal')
     if homology:
         df = pd.read_sql_query(f"""
-            SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, cm_coord, index_small_ali,
+            SELECT index_chain, cm_coord, index_small_ali, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code,
            is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other,
            gap_percent, consensus, cons_sec_struct, dbn, paired, nb_interact, pair_type_LW, pair_type_DSSR,
            alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, chi, bb_type, glyco_bond, form, ssZp, Dp,
...
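
Note on the work_pssm_remap() hunk above: the new code first asks the database which align_column rows already exist for the family, deletes those whose index_ali is no longer among the columns to save, and only then upserts the remaining columns through the existing INSERT ... ON CONFLICT statement. Below is a minimal sketch of that clean-up-then-upsert pattern using the standard sqlite3 module; RNAnet's sql_ask_database()/sql_execute() helpers are not reproduced, only a few of the real columns are kept for brevity, and it assumes the (rfam_acc, index_ali) unique key that the ON CONFLICT clause relies on.

import sqlite3

def refresh_align_columns(db_path, rfam_acc, columns_to_save, rows):
    """rows: list of (rfam_acc, index_ali, index_small_ali, cm_coord) tuples to keep."""
    with sqlite3.connect(db_path) as conn:
        # Which alignment columns are already stored for this family?
        already_saved = {x[0] for x in conn.execute(
            "SELECT index_ali FROM align_column WHERE rfam_acc=?;", (rfam_acc,))}

        # Delete the ones that should not be kept anymore
        to_remove = already_saved - set(columns_to_save)
        if to_remove:
            conn.executemany("DELETE FROM align_column WHERE rfam_acc=? AND index_ali=?;",
                             [(rfam_acc, i) for i in sorted(to_remove)])

        # Insert or update the columns we want to keep (upsert on the unique key)
        conn.executemany("""INSERT INTO align_column (rfam_acc, index_ali, index_small_ali, cm_coord)
                            VALUES (?, ?, ?, ?)
                            ON CONFLICT(rfam_acc, index_ali) DO UPDATE SET
                                index_small_ali=excluded.index_small_ali, cm_coord=excluded.cm_coord;""",
                         rows)
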
@@ -4,6 +4,7 @@ cd /home/lbecquey/Projects/RNANet
 rm -rf latest_run.log errors.txt
 
 # Run RNANet
+bash -c 'time python3.8 ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 --no-homology --redundant --extract' > latest_run.log 2>&1
 bash -c 'time python3.8 ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 --redundant --sina --extract -s --stats-opts="--wadley --distance-matrices" --archive' > latest_run.log 2>&1
 echo 'Compressing RNANet.db.gz...' >> latest_run.log
 touch results/RNANet.db # update last modification date
...
@@ -0,0 +1,41 @@
+#!python3
+import subprocess, os, sys
+from RNAnet import *
+
+
+# Put a list of problematic families here, they will be properly deleted and recomputed
+families = [
+    "RF00005"
+]
+
+# Set the paths to your data folders and to the RNANet.db file below (RNAnet.py is expected in the working directory)
+path_to_3D_data = "/home/lbecquey/Data/RNA/3D/"
+path_to_seq_data = "/home/lbecquey/Data/RNA/sequences/"
+path_to_db = "/home/lbecquey/Projects/RNANet/results/RNANet.db"
+
+for fam in families:
+    print()
+    print()
+    print()
+    print(f"Removing {fam} files...")
+
+    # Remove the datapoints files
+    files = [ f for f in os.listdir(path_to_3D_data + "/datapoints") if fam in f ]
+    for f in files:
+        subprocess.run(["rm", '-f', path_to_3D_data + f"/datapoints/{f}"])
+
+    # Remove the alignments
+    files = [ f for f in os.listdir(path_to_seq_data + "/realigned") if fam in f ]
+    for f in files:
+        subprocess.run(["rm", '-f', path_to_seq_data + f"/realigned/{f}"])
+
+    # Delete the family from the database, and the associated nucleotides and re_mappings, using foreign keys
+    command = ["sqlite3", path_to_db, f"PRAGMA foreign_keys=ON; delete from family where rfam_acc=\"{fam}\";"]
+    print(' '.join(command))
+    subprocess.run(command)
+
+# Now re-run RNANet normally.
+command = ["python3.8", "./RNAnet.py", "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0",
+           "--redundant", "--sina", "--extract", "-s", "--stats-opts=\"--wadley --distance-matrices\""]
+print(' '.join(command))
+subprocess.run(command)
\ No newline at end of file
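
Aside on the new script above: it deletes each family through the sqlite3 command-line tool with PRAGMA foreign_keys=ON so that the dependent rows (re_mappings, nucleotides, ...) disappear with the family row. The same cascading delete can be issued directly from Python; this is only a sketch and assumes, as the script's comment does, that the schema declares its foreign keys with ON DELETE CASCADE.

import sqlite3

def drop_family(db_path, rfam_acc):
    # Delete one family row; child rows referencing rfam_acc are removed by the
    # cascading foreign keys once the pragma is enabled on this connection.
    with sqlite3.connect(db_path) as conn:
        conn.execute("PRAGMA foreign_keys = ON;")
        conn.execute("DELETE FROM family WHERE rfam_acc = ?;", (rfam_acc,))

# Example: drop_family("/home/lbecquey/Projects/RNANet/results/RNANet.db", "RF00005")
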
@@ -917,7 +917,7 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s):
     # Identify the right 3D file
     filename = ''
     for file in filelist:
-        if file.startswith(s.id.replace('-', '').replace('[', '_').replace(']', '_')):
+        if file.startswith(s.id.split("RF")[0].replace('-', '').replace('[', '_').replace(']', '_')):
             filename = path_to_3D_data + "rna_mapped_to_Rfam/" + file
             break
     if not len(filename):
@@ -954,8 +954,8 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s):
             d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j])
 
     # Save the individual distance matrices
-    if f not in LSU_set and f not in SSU_set:
-        np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f")
+    # if f not in LSU_set and f not in SSU_set:
+    np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f")
 
     # For the average and sd, we want to consider only positions of the consensus model. This means:
     # - Add empty space when we have deletions
@@ -979,11 +979,12 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s):
     for i in range(len(s.seq)):
         if cm_coords[i] is None:
             continue
-        pos_i = int(cm_coords[i])-1
         for j in range(len(s.seq)):
+            if j >= len(cm_coords):
+                print(f"Issue with {s.id} mapped to {f} ({label}, {j}/{len(s.seq)}, {len(cm_coords)})")
             if cm_coords[j] is None:
                 continue
-            c[pos_i, int(cm_coords[j])-1] = d[i,j]
+            c[int(cm_coords[i])-1, int(cm_coords[j])-1] = d[i,j]
     # return the matrices counts, c, c^2
     return 1-np.isnan(c).astype(int), np.nan_to_num(c), np.nan_to_num(c*c)
 
@@ -1015,9 +1016,16 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False):
     r = sql_ask_database(conn, f"SELECT structure_id, '_1_', chain_name, '_', CAST(pdb_start AS TEXT), '-', CAST(pdb_end AS TEXT) FROM chain WHERE rfam_acc='{f}';")
     filelist = sorted([ ''.join(list(x))+'.cif' for x in r ])
     r = sql_ask_database(conn, f"SELECT cm_coord FROM align_column WHERE rfam_acc = '{f}' AND index_ali > 0 ORDER BY index_ali ASC;")
-    cm_coords = [ x[0] for x in r ]
+    cm_coords = [ x[0] for x in r ] # len(cm_coords) is the number of saved columns. There are many None values in the list.
     i = len(cm_coords)-1
     while cm_coords[i] is None:
+        if i == 0:
+            # Issue somewhere. Abort.
+            warn(f"{f} has no mapping to CM. Ignoring distance matrix.")
+            if not multithread:
+                idxQueue.put(thr_idx) # replace the thread index in the queue
+                setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
+            return 0
         i -= 1
     family_end = int(cm_coords[i])
     counts = np.zeros((family_end, family_end))
@@ -1309,14 +1317,14 @@ if __name__ == "__main__":
     except:
         print("Something went wrong")
 
-    # # Now process the memory-heavy tasks family by family
-    # if DO_AVG_DISTANCE_MATRIX:
-    #     for f in LSU_set:
-    #         get_avg_std_distance_matrix(f, True, True)
-    #         get_avg_std_distance_matrix(f, False, True)
-    #     for f in SSU_set:
-    #         get_avg_std_distance_matrix(f, True, True)
-    #         get_avg_std_distance_matrix(f, False, True)
+    # Now process the memory-heavy tasks family by family
+    if DO_AVG_DISTANCE_MATRIX:
+        for f in LSU_set:
+            get_avg_std_distance_matrix(f, True, True)
+            get_avg_std_distance_matrix(f, False, True)
+        for f in SSU_set:
+            get_avg_std_distance_matrix(f, True, True)
+            get_avg_std_distance_matrix(f, False, True)
 
     print()
     print()
...
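
For readers of the par_distance_matrix() hunks above: each chain returns three matrices indexed by consensus-model coordinates, a 0/1 presence mask, the distances, and the squared distances, and get_avg_std_distance_matrix() sums them over chains to derive the family-level average and standard deviation. A sketch of that arithmetic with NumPy follows; the names are illustrative, this is not RNAnet's own aggregation code.

import numpy as np

def combine(per_chain_results, family_end):
    # per_chain_results: iterable of (presence, dist, dist_sq) matrices of shape (family_end, family_end)
    counts = np.zeros((family_end, family_end))
    sums   = np.zeros((family_end, family_end))
    sq     = np.zeros((family_end, family_end))
    for presence, dist, dist_sq in per_chain_results:
        counts += presence
        sums   += dist
        sq     += dist_sq
    with np.errstate(divide="ignore", invalid="ignore"):
        mean = sums / counts                  # E[d] per pair of model positions
        var  = sq / counts - mean * mean      # E[d^2] - E[d]^2
    return mean, np.sqrt(np.maximum(var, 0.0))   # clamp tiny negatives due to rounding
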