Replaced NULL rfam_acc values with 'unmappd'

Louis BECQUEY
Commit 752ddc4ec04f5e315bf110c64c65caef71bc7922 752ddc4e 1 parent 28c116ce
Showing 2 changed files with 11 additions and 13 deletions
RNAnet.py
statistics.py
--- a/RNAnet.py
View file @752ddc4
+++ b/RNAnet.py
View file @752ddc4
@@ -553,14 +553,14 @@ class Chain:
                                                     AND rfam_acc='{self.mapping.rfam_acc}'
                                                     AND eq_class='{self.eq_class}';""")[0][0]
             else:
-                sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, eq_class, issue) VALUES (?, ?, NULL, ?, ?) 
+                sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, eq_class, issue) VALUES (?, ?, 'unmappd', ?, ?) 
                                    ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue, eq_class=excluded.eq_class;""", 
                             data=(str(self.pdb_id), str(self.pdb_chain_id), str(self.eq_class), int(self.delete_me)))
                 self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain 
                                                     WHERE structure_id='{self.pdb_id}' 
                                                     AND chain_name='{self.pdb_chain_id}' 
                                                     AND eq_class='{self.eq_class}'
-                                                    AND rfam_acc IS NULL;""")[0][0]
+                                                    AND rfam_acc = 'unmappd';""")[0][0]
             # Add the nucleotides if the chain is not an issue
             if df is not None and not self.delete_me:  # double condition is theoretically redundant here, but you never know
@@ -1193,7 +1193,7 @@ class Pipeline:
                     pdb_model = int(nr[1])
                     pdb_chain_id = nr[2].upper()
                     chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}"
-                    res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc IS NULL AND issue=0""")
+                    res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc = 'unmappd' AND issue=0""")
                     if not len(res) or self.REUSE_ALL: # the chain is NOT yet in the database, or this is a known issue
                         self.update.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class))
             conn.close()
--- a/statistics.py
View file @752ddc4
+++ b/statistics.py
View file @752ddc4
@@ -610,22 +610,22 @@ def general_stats():
     with sqlite3.connect("results/RNANet.db") as conn:
         df_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution
                                         FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
-                                        WHERE rfam_acc IS NULL AND ISSUE=0;""", conn)
+                                        WHERE rfam_acc = 'unmappd' AND ISSUE=0;""", conn)
         df_mapped_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution
                                             FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
-                                            WHERE rfam_acc IS NOT NULL AND ISSUE=0;""", conn)
+                                            WHERE rfam_acc != 'unmappd' AND ISSUE=0;""", conn)
         df_mapped_copies = pd.read_sql(f"""SELECT pdb_id, chain_name, inferred, rfam_acc, pdb_start, pdb_end, exp_method, resolution
                                             FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
-                                            WHERE rfam_acc IS NOT NULL AND ISSUE=0;""", conn)
+                                            WHERE rfam_acc != 'unmappd' AND ISSUE=0;""", conn)
         df_inferred_only_unique = pd.read_sql(f"""SELECT DISTINCT pdb_id, c.chain_name, exp_method, resolution
                                                     FROM (SELECT inferred, rfam_acc, pdb_start, pdb_end, chain.structure_id, chain.chain_name, r.redundancy, r.inf_redundancy
                                                             FROM chain 
                                                             JOIN (SELECT structure_id, chain_name, COUNT(distinct rfam_acc) AS redundancy, SUM(inferred) AS inf_redundancy 
                                                                     FROM chain 
-                                                                    WHERE rfam_acc IS NOT NULL AND issue=0 
+                                                                    WHERE rfam_acc != 'unmappd' AND issue=0 
                                                                     GROUP BY structure_id, chain_name
                                                             ) AS r ON chain.structure_id=r.structure_id AND chain.chain_name = r.chain_name 
-                                                            WHERE r.redundancy=r.inf_redundancy AND rfam_acc IS NOT NULL and issue=0
+                                                            WHERE r.redundancy=r.inf_redundancy AND rfam_acc != 'unmappd' and issue=0
                                                     ) AS c
                                                     JOIN structure ON c.structure_id=structure.pdb_id;""", conn)
     print("> found", len(df_inferred_only_unique.index), "chains which are mapped only by inference using BGSU NR Lists.")
@@ -775,9 +775,6 @@ def log_to_pbar(pbar):
 if __name__ == "__main__":
-    general_stats()
-    exit()
-
     # parse options
     try:
         opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "resolution=", "3d-folder=", "seq-folder=" ])
@@ -839,8 +836,8 @@ if __name__ == "__main__":
     # Define the tasks
     joblist = []
-    joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), 4.0)))   # res threshold is 4.0 Angstroms by default
+    # joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), 4.0)))   # res threshold is 4.0 Angstroms by default
-    joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), 4.0)))   #
+    # joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), 4.0)))   #
     joblist.append(Job(function=stats_len)) # Computes figures
     # joblist.append(Job(function=stats_freq)) # updates the database
     # for f in famlist:
@@ -873,3 +870,4 @@ if __name__ == "__main__":
     # per_chain_stats()
     # seq_idty()
     # stats_pairs()
+    general_stats()