Merge branch 'master' of https://github.com/persalteas/RNANet into master
Showing 1 changed file with 27 additions and 19 deletions
... | @@ -132,9 +132,11 @@ class BufferingSummaryInfo(AlignInfo.SummaryInfo): | ... | @@ -132,9 +132,11 @@ class BufferingSummaryInfo(AlignInfo.SummaryInfo): |
132 | 132 | ||
133 | 133 | ||
134 | class Chain: | 134 | class Chain: |
135 | - """ The object which stores all our data and the methods to process it. | 135 | + """ |
136 | + The object which stores all our data and the methods to process it. | ||
136 | 137 | ||
137 | - Chains accumulate information through this script, and are saved to files at the end of major steps.""" | 138 | + Chains accumulate information through this script, and are saved to files at the end of major steps. |
139 | + """ | ||
138 | 140 | ||
139 | def __init__(self, pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, rfam="", inferred=False, pdb_start=None, pdb_end=None): | 141 | def __init__(self, pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, rfam="", inferred=False, pdb_start=None, pdb_end=None): |
140 | self.pdb_id = pdb_id # PDB ID | 142 | self.pdb_id = pdb_id # PDB ID |
... | @@ -144,6 +146,7 @@ class Chain: | ... | @@ -144,6 +146,7 @@ class Chain: |
144 | self.mapping = Mapping(chain_label, rfam, pdb_start, pdb_end, inferred) | 146 | self.mapping = Mapping(chain_label, rfam, pdb_start, pdb_end, inferred) |
145 | else: | 147 | else: |
146 | self.mapping = None | 148 | self.mapping = None |
149 | + self.eq_class = eq_class # BGSU NR list class id | ||
147 | self.chain_label = chain_label # chain pretty name | 150 | self.chain_label = chain_label # chain pretty name |
148 | self.file = "" # path to the 3D PDB file | 151 | self.file = "" # path to the 3D PDB file |
149 | self.seq = "" # sequence with modified nts | 152 | self.seq = "" # sequence with modified nts |
... | @@ -523,30 +526,33 @@ class Chain: | ... | @@ -523,30 +526,33 @@ class Chain: |
523 | # Register the chain in table chain | 526 | # Register the chain in table chain |
524 | if self.mapping is not None: | 527 | if self.mapping is not None: |
525 | sql_execute(conn, f""" INSERT INTO chain | 528 | sql_execute(conn, f""" INSERT INTO chain |
526 | - (structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, issue) | 529 | + (structure_id, chain_name, pdb_start, pdb_end, rfam_acc, eq_class, inferred, issue) |
527 | VALUES | 530 | VALUES |
528 | - (?, ?, ?, ?, ?, ?, ?) | 531 | + (?, ?, ?, ?, ?, ?, ?, ?) |
529 | ON CONFLICT(structure_id, chain_name, rfam_acc) DO | 532 | ON CONFLICT(structure_id, chain_name, rfam_acc) DO |
530 | UPDATE SET pdb_start=excluded.pdb_start, | 533 | UPDATE SET pdb_start=excluded.pdb_start, |
531 | pdb_end=excluded.pdb_end, | 534 | pdb_end=excluded.pdb_end, |
535 | + eq_class=excluded.eq_class, | ||
532 | inferred=excluded.inferred, | 536 | inferred=excluded.inferred, |
533 | issue=excluded.issue;""", | 537 | issue=excluded.issue;""", |
534 | data=(str(self.pdb_id), str(self.pdb_chain_id), | 538 | data=(str(self.pdb_id), str(self.pdb_chain_id), |
535 | int(self.mapping.nt_start), int(self.mapping.nt_end), | 539 | int(self.mapping.nt_start), int(self.mapping.nt_end), |
536 | - str(self.mapping.rfam_acc), | 540 | + str(self.mapping.rfam_acc), str(self.eq_class), |
537 | int(self.mapping.inferred), int(self.delete_me))) | 541 | int(self.mapping.inferred), int(self.delete_me))) |
538 | # get the chain id | 542 | # get the chain id |
539 | self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain | 543 | self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain |
540 | WHERE structure_id='{self.pdb_id}' | 544 | WHERE structure_id='{self.pdb_id}' |
541 | AND chain_name='{self.pdb_chain_id}' | 545 | AND chain_name='{self.pdb_chain_id}' |
542 | - AND rfam_acc='{self.mapping.rfam_acc}';""")[0][0] | 546 | + AND rfam_acc='{self.mapping.rfam_acc}' |
547 | + AND eq_class='{self.eq_class}';""")[0][0] | ||
543 | else: | 548 | else: |
544 | - sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, issue) VALUES (?, ?, NULL, ?) | 549 | + sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, eq_class, issue) VALUES (?, ?, NULL, ?, ?) |
545 | - ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue;""", | 550 | + ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue, eq_class=excluded.eq_class;""", |
546 | - data=(str(self.pdb_id), str(self.pdb_chain_id), int(self.delete_me))) | 551 | + data=(str(self.pdb_id), str(self.pdb_chain_id), str(self.eq_class), int(self.delete_me))) |
547 | self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain | 552 | self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain |
548 | WHERE structure_id='{self.pdb_id}' | 553 | WHERE structure_id='{self.pdb_id}' |
549 | AND chain_name='{self.pdb_chain_id}' | 554 | AND chain_name='{self.pdb_chain_id}' |
555 | + AND eq_class='{self.eq_class}' | ||
550 | AND rfam_acc IS NULL;""")[0][0] | 556 | AND rfam_acc IS NULL;""")[0][0] |
551 | 557 | ||
552 | # Add the nucleotides if the chain is not an issue | 558 | # Add the nucleotides if the chain is not an issue |
... | @@ -859,14 +865,14 @@ class Downloader: | ... | @@ -859,14 +865,14 @@ class Downloader: |
859 | if path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"): | 865 | if path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"): |
860 | print("\t> Use of the previous version.\t", end = "", flush=True) | 866 | print("\t> Use of the previous version.\t", end = "", flush=True) |
861 | else: | 867 | else: |
862 | - return [], [] | 868 | + return pd.DataFrame([], columns=["class", "class_members"]) |
863 | 869 | ||
864 | nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv") | 870 | nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv") |
865 | - full_structures_list = nrlist['class_members'].tolist() | 871 | + full_structures_list = [ tuple(i[1]) for i in nrlist[['class','class_members']].iterrows() ] |
866 | print(f"\t{validsymb}", flush=True) | 872 | print(f"\t{validsymb}", flush=True) |
867 | 873 | ||
868 | # The beginning of an adventure. | 874 | # The beginning of an adventure. |
869 | - return full_structures_list | 875 | + return full_structures_list # list of ( str (class), str (class_members) ) |
870 | 876 | ||
871 | def download_from_SILVA(self, unit): | 877 | def download_from_SILVA(self, unit): |
872 | if not path.isfile(path_to_seq_data + f"realigned/{unit}.arb"): | 878 | if not path.isfile(path_to_seq_data + f"realigned/{unit}.arb"): |
... | @@ -1068,8 +1074,8 @@ class Pipeline: | ... | @@ -1068,8 +1074,8 @@ class Pipeline: |
1068 | elif opt == "--from-scratch": | 1074 | elif opt == "--from-scratch": |
1069 | warn("Deleting previous database and recomputing from scratch.") | 1075 | warn("Deleting previous database and recomputing from scratch.") |
1070 | subprocess.run(["rm", "-rf", | 1076 | subprocess.run(["rm", "-rf", |
1071 | - path_to_3D_data + "annotations", | 1077 | + # path_to_3D_data + "annotations", # DEBUG : keep the annotations ! |
1072 | - # path_to_3D_data + "RNAcifs", # DEBUG : keep the cifs ! | 1078 | + # path_to_3D_data + "RNAcifs", # DEBUG : keep the cifs ! |
1073 | path_to_3D_data + "rna_mapped_to_Rfam", | 1079 | path_to_3D_data + "rna_mapped_to_Rfam", |
1074 | path_to_3D_data + "rnaonly", | 1080 | path_to_3D_data + "rnaonly", |
1075 | path_to_seq_data + "realigned", | 1081 | path_to_seq_data + "realigned", |
... | @@ -1103,7 +1109,7 @@ class Pipeline: | ... | @@ -1103,7 +1109,7 @@ class Pipeline: |
1103 | If self.HOMOLOGY is set to False, simply returns a list of Chain() objects with available 3D chains.""" | 1109 | If self.HOMOLOGY is set to False, simply returns a list of Chain() objects with available 3D chains.""" |
1104 | 1110 | ||
1105 | # List all 3D RNA chains below given resolution | 1111 | # List all 3D RNA chains below given resolution |
1106 | - full_structures_list = self.dl.download_BGSU_NR_list(self.CRYSTAL_RES) | 1112 | + full_structures_list = self.dl.download_BGSU_NR_list(self.CRYSTAL_RES) # list of tuples ( class, class_members ) |
1107 | 1113 | ||
1108 | # Check for a list of known problems: | 1114 | # Check for a list of known problems: |
1109 | if path.isfile(runDir + "/known_issues.txt"): | 1115 | if path.isfile(runDir + "/known_issues.txt"): |
... | @@ -1140,8 +1146,8 @@ class Pipeline: | ... | @@ -1140,8 +1146,8 @@ class Pipeline: |
1140 | exit(1) | 1146 | exit(1) |
1141 | else: | 1147 | else: |
1142 | conn = sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) | 1148 | conn = sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) |
1143 | - for codelist in tqdm(full_structures_list): | 1149 | + for eq_class, codelist in tqdm(full_structures_list): |
1144 | - codes = str(codelist).replace('+',',').split(',') | 1150 | + codes = codelist.replace('+',',').split(',') |
1145 | 1151 | ||
1146 | # Simply convert the list of codes to Chain() objects | 1152 | # Simply convert the list of codes to Chain() objects |
1147 | for c in codes: | 1153 | for c in codes: |
... | @@ -1408,7 +1414,7 @@ class Pipeline: | ... | @@ -1408,7 +1414,7 @@ class Pipeline: |
1408 | with sqlite3.connect(runDir+"/results/RNANet.db") as conn: | 1414 | with sqlite3.connect(runDir+"/results/RNANet.db") as conn: |
1409 | pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;", | 1415 | pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;", |
1410 | conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False) | 1416 | conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False) |
1411 | - pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure | 1417 | + pd.read_sql_query("""SELECT eq_class, structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure |
1412 | JOIN chain ON structure.pdb_id = chain.structure_id | 1418 | JOIN chain ON structure.pdb_id = chain.structure_id |
1413 | ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False) | 1419 | ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False) |
1414 | 1420 | ||
... | @@ -1522,6 +1528,7 @@ def sql_define_tables(conn): | ... | @@ -1522,6 +1528,7 @@ def sql_define_tables(conn): |
1522 | chain_id INTEGER PRIMARY KEY NOT NULL, | 1528 | chain_id INTEGER PRIMARY KEY NOT NULL, |
1523 | structure_id CHAR(4) NOT NULL, | 1529 | structure_id CHAR(4) NOT NULL, |
1524 | chain_name VARCHAR(2) NOT NULL, | 1530 | chain_name VARCHAR(2) NOT NULL, |
1531 | + eq_class VARCHAR(10), | ||
1525 | pdb_start SMALLINT, | 1532 | pdb_start SMALLINT, |
1526 | pdb_end SMALLINT, | 1533 | pdb_end SMALLINT, |
1527 | issue TINYINT, | 1534 | issue TINYINT, |
... | @@ -1785,7 +1792,8 @@ def work_infer_mappings(update_only, allmappings, codelist): | ... | @@ -1785,7 +1792,8 @@ def work_infer_mappings(update_only, allmappings, codelist): |
1785 | known_mappings = pd.DataFrame() | 1792 | known_mappings = pd.DataFrame() |
1786 | 1793 | ||
1787 | # Split the comma-separated list of chain codes into chain codes: | 1794 | # Split the comma-separated list of chain codes into chain codes: |
1788 | - codes = str(codelist).replace('+',',').split(',') | 1795 | + eq_class = codelist[0] |
1796 | + codes = codelist[1].replace('+',',').split(',') | ||
1789 | 1797 | ||
1790 | # Search for mappings that apply to an element of this PDB chains list: | 1798 | # Search for mappings that apply to an element of this PDB chains list: |
1791 | for c in codes: | 1799 | for c in codes: | ... | ... |
-
Please register or login to post a comment