Showing 1 changed file with 31 additions and 23 deletions
... | @@ -119,11 +119,13 @@ class BufferingSummaryInfo(AlignInfo.SummaryInfo): | ... | @@ -119,11 +119,13 @@ class BufferingSummaryInfo(AlignInfo.SummaryInfo): |
119 | 119 | ||
120 | 120 | ||
121 | class Chain: | 121 | class Chain: |
122 | - """ The object which stores all our data and the methods to process it. | 122 | + """ |
123 | + The object which stores all our data and the methods to process it. | ||
123 | 124 | ||
124 | - """ Chains accumulate information through this script, and are saved to files at the end of major steps.""" | 125 | + Chains accumulate information through this script, and are saved to files at the end of major steps. |
126 | + """ | ||
125 | 127 | ||
126 | - def __init__(self, pdb_id, pdb_model, pdb_chain_id, chain_label, rfam="", inferred=False, pdb_start=None, pdb_end=None): | 128 | + def __init__(self, pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, rfam="", inferred=False, pdb_start=None, pdb_end=None): |
127 | self.pdb_id = pdb_id # PDB ID | 129 | self.pdb_id = pdb_id # PDB ID |
128 | self.pdb_model = int(pdb_model) # model ID, starting at 1 | 130 | self.pdb_model = int(pdb_model) # model ID, starting at 1 |
129 | self.pdb_chain_id = pdb_chain_id # chain ID (mmCIF), multiple letters | 131 | self.pdb_chain_id = pdb_chain_id # chain ID (mmCIF), multiple letters |
... | @@ -131,6 +133,7 @@ class Chain: | ... | @@ -131,6 +133,7 @@ class Chain: |
131 | self.mapping = Mapping(chain_label, rfam, pdb_start, pdb_end, inferred) | 133 | self.mapping = Mapping(chain_label, rfam, pdb_start, pdb_end, inferred) |
132 | else: | 134 | else: |
133 | self.mapping = None | 135 | self.mapping = None |
136 | + self.eq_class = eq_class # BGSU NR list class id | ||
134 | self.chain_label = chain_label # chain pretty name | 137 | self.chain_label = chain_label # chain pretty name |
135 | self.file = "" # path to the 3D PDB file | 138 | self.file = "" # path to the 3D PDB file |
136 | self.seq = "" # sequence with modified nts | 139 | self.seq = "" # sequence with modified nts |
... | @@ -509,30 +512,33 @@ class Chain: | ... | @@ -509,30 +512,33 @@ class Chain: |
509 | # Register the chain in table chain | 512 | # Register the chain in table chain |
510 | if self.mapping is not None: | 513 | if self.mapping is not None: |
511 | sql_execute(conn, f""" INSERT INTO chain | 514 | sql_execute(conn, f""" INSERT INTO chain |
512 | - (structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, issue) | 515 | + (structure_id, chain_name, pdb_start, pdb_end, rfam_acc, eq_class, inferred, issue) |
513 | VALUES | 516 | VALUES |
514 | - (?, ?, ?, ?, ?, ?, ?) | 517 | + (?, ?, ?, ?, ?, ?, ?, ?) |
515 | ON CONFLICT(structure_id, chain_name, rfam_acc) DO | 518 | ON CONFLICT(structure_id, chain_name, rfam_acc) DO |
516 | UPDATE SET pdb_start=excluded.pdb_start, | 519 | UPDATE SET pdb_start=excluded.pdb_start, |
517 | pdb_end=excluded.pdb_end, | 520 | pdb_end=excluded.pdb_end, |
521 | + eq_class=excluded.eq_class, | ||
518 | inferred=excluded.inferred, | 522 | inferred=excluded.inferred, |
519 | issue=excluded.issue;""", | 523 | issue=excluded.issue;""", |
520 | data=(str(self.pdb_id), str(self.pdb_chain_id), | 524 | data=(str(self.pdb_id), str(self.pdb_chain_id), |
521 | int(self.mapping.nt_start), int(self.mapping.nt_end), | 525 | int(self.mapping.nt_start), int(self.mapping.nt_end), |
522 | - str(self.mapping.rfam_acc), | 526 | + str(self.mapping.rfam_acc), str(self.eq_class), |
523 | int(self.mapping.inferred), int(self.delete_me))) | 527 | int(self.mapping.inferred), int(self.delete_me))) |
524 | # get the chain id | 528 | # get the chain id |
525 | self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain | 529 | self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain |
526 | WHERE structure_id='{self.pdb_id}' | 530 | WHERE structure_id='{self.pdb_id}' |
527 | AND chain_name='{self.pdb_chain_id}' | 531 | AND chain_name='{self.pdb_chain_id}' |
528 | - AND rfam_acc='{self.mapping.rfam_acc}';""")[0][0] | 532 | + AND rfam_acc='{self.mapping.rfam_acc}' |
533 | + AND eq_class='{self.eq_class}';""")[0][0] | ||
529 | else: | 534 | else: |
530 | - sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, issue) VALUES (?, ?, NULL, ?) | 535 | + sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, eq_class, issue) VALUES (?, ?, NULL, ?, ?) |
531 | - ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue;""", | 536 | + ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue, eq_class=excluded.eq_class;""", |
532 | - data=(str(self.pdb_id), str(self.pdb_chain_id), int(self.delete_me))) | 537 | + data=(str(self.pdb_id), str(self.pdb_chain_id), str(self.eq_class), int(self.delete_me))) |
533 | self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain | 538 | self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain |
534 | WHERE structure_id='{self.pdb_id}' | 539 | WHERE structure_id='{self.pdb_id}' |
535 | AND chain_name='{self.pdb_chain_id}' | 540 | AND chain_name='{self.pdb_chain_id}' |
541 | + AND eq_class='{self.eq_class}' | ||
536 | AND rfam_acc IS NULL;""")[0][0] | 542 | AND rfam_acc IS NULL;""")[0][0] |
537 | 543 | ||
538 | # Add the nucleotides if the chain is not an issue | 544 | # Add the nucleotides if the chain is not an issue |
... | @@ -848,14 +854,14 @@ class Downloader: | ... | @@ -848,14 +854,14 @@ class Downloader: |
848 | if path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"): | 854 | if path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"): |
849 | print("\t> Use of the previous version.\t", end = "", flush=True) | 855 | print("\t> Use of the previous version.\t", end = "", flush=True) |
850 | else: | 856 | else: |
851 | - return [], [] | 857 | + return pd.DataFrame([], columns=["class", "class_members"]) |
852 | 858 | ||
853 | nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv") | 859 | nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv") |
854 | - full_structures_list = nrlist['class_members'].tolist() | 860 | + full_structures_list = [ tuple(i[1]) for i in nrlist[['class','class_members']].iterrows() ] |
855 | print(f"\t{validsymb}", flush=True) | 861 | print(f"\t{validsymb}", flush=True) |
856 | 862 | ||
857 | # The beginning of an adventure. | 863 | # The beginning of an adventure. |
858 | - return full_structures_list | 864 | + return full_structures_list # list of ( str (class), str (class_members) ) |
859 | 865 | ||
860 | def download_from_SILVA(self, unit): | 866 | def download_from_SILVA(self, unit): |
861 | if not path.isfile(path_to_seq_data + f"realigned/{unit}.arb"): | 867 | if not path.isfile(path_to_seq_data + f"realigned/{unit}.arb"): |
... | @@ -1060,8 +1066,8 @@ class Pipeline: | ... | @@ -1060,8 +1066,8 @@ class Pipeline: |
1060 | elif opt == "--from-scratch": | 1066 | elif opt == "--from-scratch": |
1061 | warn("Deleting previous database and recomputing from scratch.") | 1067 | warn("Deleting previous database and recomputing from scratch.") |
1062 | subprocess.run(["rm", "-rf", | 1068 | subprocess.run(["rm", "-rf", |
1063 | - path_to_3D_data + "annotations", | 1069 | + # path_to_3D_data + "annotations", # DEBUG : keep the annotations ! |
1064 | - # path_to_3D_data + "RNAcifs", # DEBUG : keep the cifs ! | 1070 | + # path_to_3D_data + "RNAcifs", # DEBUG : keep the cifs ! |
1065 | path_to_3D_data + "rna_mapped_to_Rfam", | 1071 | path_to_3D_data + "rna_mapped_to_Rfam", |
1066 | path_to_3D_data + "rnaonly", | 1072 | path_to_3D_data + "rnaonly", |
1067 | path_to_seq_data + "realigned", | 1073 | path_to_seq_data + "realigned", |
... | @@ -1095,7 +1101,7 @@ class Pipeline: | ... | @@ -1095,7 +1101,7 @@ class Pipeline: |
1095 | If self.HOMOLOGY is set to False, simply returns a list of Chain() objects with available 3D chains.""" | 1101 | If self.HOMOLOGY is set to False, simply returns a list of Chain() objects with available 3D chains.""" |
1096 | 1102 | ||
1097 | # List all 3D RNA chains below given resolution | 1103 | # List all 3D RNA chains below given resolution |
1098 | - full_structures_list = self.dl.download_BGSU_NR_list(self.CRYSTAL_RES) | 1104 | + full_structures_list = self.dl.download_BGSU_NR_list(self.CRYSTAL_RES) # list of tuples ( class, class_members ) |
1099 | 1105 | ||
1100 | # Check for a list of known problems: | 1106 | # Check for a list of known problems: |
1101 | if path.isfile(runDir + "/known_issues.txt"): | 1107 | if path.isfile(runDir + "/known_issues.txt"): |
... | @@ -1132,8 +1138,8 @@ class Pipeline: | ... | @@ -1132,8 +1138,8 @@ class Pipeline: |
1132 | exit(1) | 1138 | exit(1) |
1133 | else: | 1139 | else: |
1134 | conn = sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) | 1140 | conn = sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) |
1135 | - for codelist in tqdm(full_structures_list): | 1141 | + for eq_class, codelist in tqdm(full_structures_list): |
1136 | - codes = str(codelist).replace('+',',').split(',') | 1142 | + codes = codelist.replace('+',',').split(',') |
1137 | 1143 | ||
1138 | # Simply convert the list of codes to Chain() objects | 1144 | # Simply convert the list of codes to Chain() objects |
1139 | for c in codes: | 1145 | for c in codes: |
... | @@ -1144,7 +1150,7 @@ class Pipeline: | ... | @@ -1144,7 +1150,7 @@ class Pipeline: |
1144 | chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}" | 1150 | chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}" |
1145 | res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc IS NULL AND issue=0""") | 1151 | res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc IS NULL AND issue=0""") |
1146 | if not len(res): # the chain is NOT yet in the database, or this is a known issue | 1152 | if not len(res): # the chain is NOT yet in the database, or this is a known issue |
1147 | - self.update.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label)) | 1153 | + self.update.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class)) |
1148 | conn.close() | 1154 | conn.close() |
1149 | 1155 | ||
1150 | if self.SELECT_ONLY is not None: | 1156 | if self.SELECT_ONLY is not None: |
... | @@ -1400,7 +1406,7 @@ class Pipeline: | ... | @@ -1400,7 +1406,7 @@ class Pipeline: |
1400 | with sqlite3.connect(runDir+"/results/RNANet.db") as conn: | 1406 | with sqlite3.connect(runDir+"/results/RNANet.db") as conn: |
1401 | pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;", | 1407 | pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;", |
1402 | conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False) | 1408 | conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False) |
1403 | - pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure | 1409 | + pd.read_sql_query("""SELECT eq_class, structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure |
1404 | JOIN chain ON structure.pdb_id = chain.structure_id | 1410 | JOIN chain ON structure.pdb_id = chain.structure_id |
1405 | ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False) | 1411 | ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False) |
1406 | 1412 | ||
... | @@ -1530,6 +1536,7 @@ def sql_define_tables(conn): | ... | @@ -1530,6 +1536,7 @@ def sql_define_tables(conn): |
1530 | chain_id INTEGER PRIMARY KEY NOT NULL, | 1536 | chain_id INTEGER PRIMARY KEY NOT NULL, |
1531 | structure_id CHAR(4) NOT NULL, | 1537 | structure_id CHAR(4) NOT NULL, |
1532 | chain_name VARCHAR(2) NOT NULL, | 1538 | chain_name VARCHAR(2) NOT NULL, |
1539 | + eq_class VARCHAR(10), | ||
1533 | pdb_start SMALLINT, | 1540 | pdb_start SMALLINT, |
1534 | pdb_end SMALLINT, | 1541 | pdb_end SMALLINT, |
1535 | issue TINYINT, | 1542 | issue TINYINT, |
... | @@ -1793,7 +1800,8 @@ def work_infer_mappings(update_only, allmappings, codelist): | ... | @@ -1793,7 +1800,8 @@ def work_infer_mappings(update_only, allmappings, codelist): |
1793 | known_mappings = pd.DataFrame() | 1800 | known_mappings = pd.DataFrame() |
1794 | 1801 | ||
1795 | # Split the comma-separated list of chain codes into chain codes: | 1802 | # Split the comma-separated list of chain codes into chain codes: |
1796 | - codes = str(codelist).replace('+',',').split(',') | 1803 | + eq_class = codelist[0] |
1804 | + codes = codelist[1].replace('+',',').split(',') | ||
1797 | 1805 | ||
1798 | # Search for mappings that apply to an element of this PDB chains list: | 1806 | # Search for mappings that apply to an element of this PDB chains list: |
1799 | for c in codes: | 1807 | for c in codes: |
... | @@ -1894,9 +1902,9 @@ def work_infer_mappings(update_only, allmappings, codelist): | ... | @@ -1894,9 +1902,9 @@ def work_infer_mappings(update_only, allmappings, codelist): |
1894 | with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: | 1902 | with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: |
1895 | res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc='{rfam}' AND issue=0""") | 1903 | res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc='{rfam}' AND issue=0""") |
1896 | if not len(res): # the chain is NOT yet in the database, or this is a known issue | 1904 | if not len(res): # the chain is NOT yet in the database, or this is a known issue |
1897 | - newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end)) | 1905 | + newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end)) |
1898 | else: | 1906 | else: |
1899 | - newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end)) | 1907 | + newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end)) |
1900 | 1908 | ||
1901 | return newchains | 1909 | return newchains |
1902 | 1910 | ... | ... |
-
Please register or login to post a comment