Louis BECQUEY

Replaced NULL rfam_acc values with the sentinel string 'unmappd'

...@@ -553,14 +553,14 @@ class Chain: ...@@ -553,14 +553,14 @@ class Chain:
553 AND rfam_acc='{self.mapping.rfam_acc}' 553 AND rfam_acc='{self.mapping.rfam_acc}'
554 AND eq_class='{self.eq_class}';""")[0][0] 554 AND eq_class='{self.eq_class}';""")[0][0]
555 else: 555 else:
556 - sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, eq_class, issue) VALUES (?, ?, NULL, ?, ?) 556 + sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, eq_class, issue) VALUES (?, ?, 'unmappd', ?, ?)
557 ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue, eq_class=excluded.eq_class;""", 557 ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue, eq_class=excluded.eq_class;""",
558 data=(str(self.pdb_id), str(self.pdb_chain_id), str(self.eq_class), int(self.delete_me))) 558 data=(str(self.pdb_id), str(self.pdb_chain_id), str(self.eq_class), int(self.delete_me)))
559 self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain 559 self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain
560 WHERE structure_id='{self.pdb_id}' 560 WHERE structure_id='{self.pdb_id}'
561 AND chain_name='{self.pdb_chain_id}' 561 AND chain_name='{self.pdb_chain_id}'
562 AND eq_class='{self.eq_class}' 562 AND eq_class='{self.eq_class}'
563 - AND rfam_acc IS NULL;""")[0][0] 563 + AND rfam_acc = 'unmappd';""")[0][0]
564 564
565 # Add the nucleotides if the chain is not an issue 565 # Add the nucleotides if the chain is not an issue
566 if df is not None and not self.delete_me: # double condition is theoretically redundant here, but you never know 566 if df is not None and not self.delete_me: # double condition is theoretically redundant here, but you never know
...@@ -1193,7 +1193,7 @@ class Pipeline: ...@@ -1193,7 +1193,7 @@ class Pipeline:
1193 pdb_model = int(nr[1]) 1193 pdb_model = int(nr[1])
1194 pdb_chain_id = nr[2].upper() 1194 pdb_chain_id = nr[2].upper()
1195 chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}" 1195 chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}"
1196 - res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc IS NULL AND issue=0""") 1196 + res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc = 'unmappd' AND issue=0""")
1197 if not len(res) or self.REUSE_ALL: # the chain is NOT yet in the database, or this is a known issue 1197 if not len(res) or self.REUSE_ALL: # the chain is NOT yet in the database, or this is a known issue
1198 self.update.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class)) 1198 self.update.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class))
1199 conn.close() 1199 conn.close()
......
...@@ -610,22 +610,22 @@ def general_stats(): ...@@ -610,22 +610,22 @@ def general_stats():
610 with sqlite3.connect("results/RNANet.db") as conn: 610 with sqlite3.connect("results/RNANet.db") as conn:
611 df_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution 611 df_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution
612 FROM chain JOIN structure ON chain.structure_id = structure.pdb_id 612 FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
613 - WHERE rfam_acc IS NULL AND ISSUE=0;""", conn) 613 + WHERE rfam_acc = 'unmappd' AND ISSUE=0;""", conn)
614 df_mapped_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution 614 df_mapped_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution
615 FROM chain JOIN structure ON chain.structure_id = structure.pdb_id 615 FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
616 - WHERE rfam_acc IS NOT NULL AND ISSUE=0;""", conn) 616 + WHERE rfam_acc != 'unmappd' AND ISSUE=0;""", conn)
617 df_mapped_copies = pd.read_sql(f"""SELECT pdb_id, chain_name, inferred, rfam_acc, pdb_start, pdb_end, exp_method, resolution 617 df_mapped_copies = pd.read_sql(f"""SELECT pdb_id, chain_name, inferred, rfam_acc, pdb_start, pdb_end, exp_method, resolution
618 FROM chain JOIN structure ON chain.structure_id = structure.pdb_id 618 FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
619 - WHERE rfam_acc IS NOT NULL AND ISSUE=0;""", conn) 619 + WHERE rfam_acc != 'unmappd' AND ISSUE=0;""", conn)
620 df_inferred_only_unique = pd.read_sql(f"""SELECT DISTINCT pdb_id, c.chain_name, exp_method, resolution 620 df_inferred_only_unique = pd.read_sql(f"""SELECT DISTINCT pdb_id, c.chain_name, exp_method, resolution
621 FROM (SELECT inferred, rfam_acc, pdb_start, pdb_end, chain.structure_id, chain.chain_name, r.redundancy, r.inf_redundancy 621 FROM (SELECT inferred, rfam_acc, pdb_start, pdb_end, chain.structure_id, chain.chain_name, r.redundancy, r.inf_redundancy
622 FROM chain 622 FROM chain
623 JOIN (SELECT structure_id, chain_name, COUNT(distinct rfam_acc) AS redundancy, SUM(inferred) AS inf_redundancy 623 JOIN (SELECT structure_id, chain_name, COUNT(distinct rfam_acc) AS redundancy, SUM(inferred) AS inf_redundancy
624 FROM chain 624 FROM chain
625 - WHERE rfam_acc IS NOT NULL AND issue=0 625 + WHERE rfam_acc != 'unmappd' AND issue=0
626 GROUP BY structure_id, chain_name 626 GROUP BY structure_id, chain_name
627 ) AS r ON chain.structure_id=r.structure_id AND chain.chain_name = r.chain_name 627 ) AS r ON chain.structure_id=r.structure_id AND chain.chain_name = r.chain_name
628 - WHERE r.redundancy=r.inf_redundancy AND rfam_acc IS NOT NULL and issue=0 628 + WHERE r.redundancy=r.inf_redundancy AND rfam_acc != 'unmappd' and issue=0
629 ) AS c 629 ) AS c
630 JOIN structure ON c.structure_id=structure.pdb_id;""", conn) 630 JOIN structure ON c.structure_id=structure.pdb_id;""", conn)
631 print("> found", len(df_inferred_only_unique.index), "chains which are mapped only by inference using BGSU NR Lists.") 631 print("> found", len(df_inferred_only_unique.index), "chains which are mapped only by inference using BGSU NR Lists.")
...@@ -775,9 +775,6 @@ def log_to_pbar(pbar): ...@@ -775,9 +775,6 @@ def log_to_pbar(pbar):
775 775
776 if __name__ == "__main__": 776 if __name__ == "__main__":
777 777
778 - general_stats()
779 - exit()
780 -
781 # parse options 778 # parse options
782 try: 779 try:
783 opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "resolution=", "3d-folder=", "seq-folder=" ]) 780 opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "resolution=", "3d-folder=", "seq-folder=" ])
...@@ -839,8 +836,8 @@ if __name__ == "__main__": ...@@ -839,8 +836,8 @@ if __name__ == "__main__":
839 836
840 # Define the tasks 837 # Define the tasks
841 joblist = [] 838 joblist = []
842 - joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), 4.0))) # res threshold is 4.0 Angstroms by default 839 + # joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), 4.0))) # res threshold is 4.0 Angstroms by default
843 - joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), 4.0))) # 840 + # joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), 4.0))) #
844 joblist.append(Job(function=stats_len)) # Computes figures 841 joblist.append(Job(function=stats_len)) # Computes figures
845 # joblist.append(Job(function=stats_freq)) # updates the database 842 # joblist.append(Job(function=stats_freq)) # updates the database
846 # for f in famlist: 843 # for f in famlist:
...@@ -873,3 +870,4 @@ if __name__ == "__main__": ...@@ -873,3 +870,4 @@ if __name__ == "__main__":
873 # per_chain_stats() 870 # per_chain_stats()
874 # seq_idty() 871 # seq_idty()
875 # stats_pairs() 872 # stats_pairs()
873 + general_stats()
......