Louis BECQUEY

Inclusion of NR list class in DB

Showing 1 changed file with 31 additions and 23 deletions
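This commit threads the BGSU non-redundant (NR) equivalence class through the pipeline: download_BGSU_NR_list() now returns (class, class_members) tuples, each Chain() carries an eq_class, and the chain table gains an eq_class column. Below is a minimal sketch of the new data flow, not part of the commit; the sample class id, the IFE codes, and the '|' splitting of codes into (pdb_id, model, chain) are assumptions based on the usual BGSU naming, only the '+'/',' splitting and the column names come from the diff itself.

# Sketch only: how the (class, class_members) tuples are expanded into chain codes.
import pandas as pd

nrlist = pd.DataFrame({
    "class":         ["NR_4.0_00001.1"],                # hypothetical equivalence class id
    "class_members": ["1ABC|1|A+1ABC|1|B,2XYZ|1|C"],    # hypothetical IFE codes
})

# Same construction as download_BGSU_NR_list() after this commit:
full_structures_list = [ tuple(i[1]) for i in nrlist[['class', 'class_members']].iterrows() ]

for eq_class, codelist in full_structures_list:
    codes = codelist.replace('+', ',').split(',')       # flatten '+'-joined IFEs
    for c in codes:
        pdb_id, pdb_model, pdb_chain_id = c.split('|')   # assumed "PDB|model|chain" format
        print(eq_class, pdb_id, pdb_model, pdb_chain_id)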
@@ -119,11 +119,13 @@ class BufferingSummaryInfo(AlignInfo.SummaryInfo):
 
 
 class Chain:
-    """ The object which stores all our data and the methods to process it.
+    """
+    The object which stores all our data and the methods to process it.
 
-    Chains accumulate information through this scipt, and are saved to files at the end of major steps."""
+    Chains accumulate information through this script, and are saved to files at the end of major steps.
+    """
 
-    def __init__(self, pdb_id, pdb_model, pdb_chain_id, chain_label, rfam="", inferred=False, pdb_start=None, pdb_end=None):
+    def __init__(self, pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, rfam="", inferred=False, pdb_start=None, pdb_end=None):
         self.pdb_id = pdb_id                    # PDB ID
         self.pdb_model = int(pdb_model)         # model ID, starting at 1
         self.pdb_chain_id = pdb_chain_id        # chain ID (mmCIF), multiple letters
@@ -131,6 +133,7 @@ class Chain:
             self.mapping = Mapping(chain_label, rfam, pdb_start, pdb_end, inferred)
         else:
             self.mapping = None
+        self.eq_class = eq_class                # BGSU NR list class id
         self.chain_label = chain_label          # chain pretty name
         self.file = ""                          # path to the 3D PDB file
         self.seq = ""                           # sequence with modified nts
@@ -509,30 +512,33 @@ class Chain:
         # Register the chain in table chain
         if self.mapping is not None:
             sql_execute(conn, f"""INSERT INTO chain
-                                  (structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, issue)
+                                  (structure_id, chain_name, pdb_start, pdb_end, rfam_acc, eq_class, inferred, issue)
                                   VALUES
-                                  (?, ?, ?, ?, ?, ?, ?)
+                                  (?, ?, ?, ?, ?, ?, ?, ?)
                                   ON CONFLICT(structure_id, chain_name, rfam_acc) DO
                                   UPDATE SET pdb_start=excluded.pdb_start,
                                              pdb_end=excluded.pdb_end,
+                                             eq_class=excluded.eq_class,
                                              inferred=excluded.inferred,
                                              issue=excluded.issue;""",
                         data=(str(self.pdb_id), str(self.pdb_chain_id),
                               int(self.mapping.nt_start), int(self.mapping.nt_end),
-                              str(self.mapping.rfam_acc),
+                              str(self.mapping.rfam_acc), str(self.eq_class),
                               int(self.mapping.inferred), int(self.delete_me)))
             # get the chain id
             self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain
                                                           WHERE structure_id='{self.pdb_id}'
                                                           AND chain_name='{self.pdb_chain_id}'
-                                                          AND rfam_acc='{self.mapping.rfam_acc}';""")[0][0]
+                                                          AND rfam_acc='{self.mapping.rfam_acc}'
+                                                          AND eq_class='{self.eq_class}';""")[0][0]
         else:
-            sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, issue) VALUES (?, ?, NULL, ?)
-                                 ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue;""",
-                        data=(str(self.pdb_id), str(self.pdb_chain_id), int(self.delete_me)))
+            sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, eq_class, issue) VALUES (?, ?, NULL, ?, ?)
+                                 ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue, eq_class=excluded.eq_class;""",
+                        data=(str(self.pdb_id), str(self.pdb_chain_id), str(self.eq_class), int(self.delete_me)))
             self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain
                                                           WHERE structure_id='{self.pdb_id}'
                                                           AND chain_name='{self.pdb_chain_id}'
+                                                          AND eq_class='{self.eq_class}'
                                                           AND rfam_acc IS NULL;""")[0][0]
 
         # Add the nucleotides if the chain is not an issue
@@ -848,14 +854,14 @@ class Downloader:
         if path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"):
             print("\t> Use of the previous version.\t", end = "", flush=True)
         else:
-            return [], []
+            return pd.DataFrame([], columns=["class", "class_members"])
 
         nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv")
-        full_structures_list = nrlist['class_members'].tolist()
+        full_structures_list = [ tuple(i[1]) for i in nrlist[['class','class_members']].iterrows() ]
         print(f"\t{validsymb}", flush=True)
 
         # The beginning of an adventure.
-        return full_structures_list
+        return full_structures_list  # list of ( str (class), str (class_members) )
 
     def download_from_SILVA(self, unit):
         if not path.isfile(path_to_seq_data + f"realigned/{unit}.arb"):
@@ -1060,8 +1066,8 @@ class Pipeline:
             elif opt == "--from-scratch":
                 warn("Deleting previous database and recomputing from scratch.")
                 subprocess.run(["rm", "-rf",
-                                path_to_3D_data + "annotations",
-                                # path_to_3D_data + "RNAcifs",          # DEBUG : keep the cifs !
+                                # path_to_3D_data + "annotations",      # DEBUG : keep the annotations !
+                                # path_to_3D_data + "RNAcifs",          # DEBUG : keep the cifs !
                                 path_to_3D_data + "rna_mapped_to_Rfam",
                                 path_to_3D_data + "rnaonly",
                                 path_to_seq_data + "realigned",
@@ -1095,7 +1101,7 @@ class Pipeline:
         If self.HOMOLOGY is set to False, simply returns a list of Chain() objects with available 3D chains."""
 
         # List all 3D RNA chains below given resolution
-        full_structures_list = self.dl.download_BGSU_NR_list(self.CRYSTAL_RES)
+        full_structures_list = self.dl.download_BGSU_NR_list(self.CRYSTAL_RES)  # list of tuples ( class, class_members )
 
         # Check for a list of known problems:
         if path.isfile(runDir + "/known_issues.txt"):
@@ -1132,8 +1138,8 @@ class Pipeline:
                 exit(1)
             else:
                 conn = sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0)
-                for codelist in tqdm(full_structures_list):
-                    codes = str(codelist).replace('+',',').split(',')
+                for eq_class, codelist in tqdm(full_structures_list):
+                    codes = codelist.replace('+',',').split(',')
 
                     # Simply convert the list of codes to Chain() objects
                     for c in codes:
@@ -1144,7 +1150,7 @@ class Pipeline:
                         chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}"
                         res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc IS NULL AND issue=0""")
                         if not len(res): # the chain is NOT yet in the database, or this is a known issue
-                            self.update.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label))
+                            self.update.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class))
                 conn.close()
 
         if self.SELECT_ONLY is not None:
@@ -1400,7 +1406,7 @@ class Pipeline:
         with sqlite3.connect(runDir+"/results/RNANet.db") as conn:
             pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;",
                               conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
-            pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure
+            pd.read_sql_query("""SELECT eq_class, structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure
                                  JOIN chain ON structure.pdb_id = chain.structure_id
                                  ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
 
@@ -1530,6 +1536,7 @@ def sql_define_tables(conn):
                 chain_id        INTEGER PRIMARY KEY NOT NULL,
                 structure_id    CHAR(4) NOT NULL,
                 chain_name      VARCHAR(2) NOT NULL,
+                eq_class        VARCHAR(10),
                 pdb_start       SMALLINT,
                 pdb_end         SMALLINT,
                 issue           TINYINT,
@@ -1793,7 +1800,8 @@ def work_infer_mappings(update_only, allmappings, codelist):
     known_mappings = pd.DataFrame()
 
     # Split the comma-separated list of chain codes into chain codes:
-    codes = str(codelist).replace('+',',').split(',')
+    eq_class = codelist[0]
+    codes = codelist[1].replace('+',',').split(',')
 
     # Search for mappings that apply to an element of this PDB chains list:
     for c in codes:
@@ -1894,9 +1902,9 @@ def work_infer_mappings(update_only, allmappings, codelist):
         with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn:
             res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc='{rfam}' AND issue=0""")
             if not len(res): # the chain is NOT yet in the database, or this is a known issue
-                newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end))
+                newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end))
             else:
-                newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end))
+                newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end))
 
     return newchains
 
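Once chains are registered with their equivalence class, the new column can be queried directly from the database, for instance to count how many usable chains each BGSU NR class contributes. A small sketch, not part of the commit, assuming a populated results/RNANet.db built with the chain schema above:

# Sketch only: count non-issue chains per BGSU NR equivalence class.
import sqlite3

with sqlite3.connect("results/RNANet.db") as conn:
    rows = conn.execute("""SELECT eq_class, COUNT(chain_id)
                           FROM chain
                           WHERE issue = 0
                           GROUP BY eq_class
                           ORDER BY COUNT(chain_id) DESC;""")
    for eq_class, n in rows:
        print(eq_class, n)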