Louis BECQUEY

Merge branch 'master' of https://github.com/persalteas/RNANet into master

Showing 1 changed file with 27 additions and 19 deletions
...@@ -132,9 +132,11 @@ class BufferingSummaryInfo(AlignInfo.SummaryInfo): ...@@ -132,9 +132,11 @@ class BufferingSummaryInfo(AlignInfo.SummaryInfo):
132 132
133 133
134 class Chain: 134 class Chain:
135 - """ The object which stores all our data and the methods to process it. 135 + """
136 + The object which stores all our data and the methods to process it.
136 137
137 - Chains accumulate information through this scipt, and are saved to files at the end of major steps.""" 138 + Chains accumulate information through this script, and are saved to files at the end of major steps.
139 + """
138 140
139 def __init__(self, pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, rfam="", inferred=False, pdb_start=None, pdb_end=None): 141 def __init__(self, pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, rfam="", inferred=False, pdb_start=None, pdb_end=None):
140 self.pdb_id = pdb_id # PDB ID 142 self.pdb_id = pdb_id # PDB ID
...@@ -144,6 +146,7 @@ class Chain: ...@@ -144,6 +146,7 @@ class Chain:
144 self.mapping = Mapping(chain_label, rfam, pdb_start, pdb_end, inferred) 146 self.mapping = Mapping(chain_label, rfam, pdb_start, pdb_end, inferred)
145 else: 147 else:
146 self.mapping = None 148 self.mapping = None
149 + self.eq_class = eq_class # BGSU NR list class id
147 self.chain_label = chain_label # chain pretty name 150 self.chain_label = chain_label # chain pretty name
148 self.file = "" # path to the 3D PDB file 151 self.file = "" # path to the 3D PDB file
149 self.seq = "" # sequence with modified nts 152 self.seq = "" # sequence with modified nts
...@@ -523,30 +526,33 @@ class Chain: ...@@ -523,30 +526,33 @@ class Chain:
523 # Register the chain in table chain 526 # Register the chain in table chain
524 if self.mapping is not None: 527 if self.mapping is not None:
525 sql_execute(conn, f""" INSERT INTO chain 528 sql_execute(conn, f""" INSERT INTO chain
526 - (structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, issue) 529 + (structure_id, chain_name, pdb_start, pdb_end, rfam_acc, eq_class, inferred, issue)
527 VALUES 530 VALUES
528 - (?, ?, ?, ?, ?, ?, ?) 531 + (?, ?, ?, ?, ?, ?, ?, ?)
529 ON CONFLICT(structure_id, chain_name, rfam_acc) DO 532 ON CONFLICT(structure_id, chain_name, rfam_acc) DO
530 UPDATE SET pdb_start=excluded.pdb_start, 533 UPDATE SET pdb_start=excluded.pdb_start,
531 pdb_end=excluded.pdb_end, 534 pdb_end=excluded.pdb_end,
535 + eq_class=excluded.eq_class,
532 inferred=excluded.inferred, 536 inferred=excluded.inferred,
533 issue=excluded.issue;""", 537 issue=excluded.issue;""",
534 data=(str(self.pdb_id), str(self.pdb_chain_id), 538 data=(str(self.pdb_id), str(self.pdb_chain_id),
535 int(self.mapping.nt_start), int(self.mapping.nt_end), 539 int(self.mapping.nt_start), int(self.mapping.nt_end),
536 - str(self.mapping.rfam_acc), 540 + str(self.mapping.rfam_acc), str(self.eq_class),
537 int(self.mapping.inferred), int(self.delete_me))) 541 int(self.mapping.inferred), int(self.delete_me)))
538 # get the chain id 542 # get the chain id
539 self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain 543 self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain
540 WHERE structure_id='{self.pdb_id}' 544 WHERE structure_id='{self.pdb_id}'
541 AND chain_name='{self.pdb_chain_id}' 545 AND chain_name='{self.pdb_chain_id}'
542 - AND rfam_acc='{self.mapping.rfam_acc}';""")[0][0] 546 + AND rfam_acc='{self.mapping.rfam_acc}'
547 + AND eq_class='{self.eq_class}';""")[0][0]
543 else: 548 else:
544 - sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, issue) VALUES (?, ?, NULL, ?) 549 + sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, eq_class, issue) VALUES (?, ?, NULL, ?, ?)
545 - ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue;""", 550 + ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue, eq_class=excluded.eq_class;""",
546 - data=(str(self.pdb_id), str(self.pdb_chain_id), int(self.delete_me))) 551 + data=(str(self.pdb_id), str(self.pdb_chain_id), str(self.eq_class), int(self.delete_me)))
547 self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain 552 self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain
548 WHERE structure_id='{self.pdb_id}' 553 WHERE structure_id='{self.pdb_id}'
549 AND chain_name='{self.pdb_chain_id}' 554 AND chain_name='{self.pdb_chain_id}'
555 + AND eq_class='{self.eq_class}'
550 AND rfam_acc IS NULL;""")[0][0] 556 AND rfam_acc IS NULL;""")[0][0]
551 557
552 # Add the nucleotides if the chain is not an issue 558 # Add the nucleotides if the chain is not an issue
...@@ -859,14 +865,14 @@ class Downloader: ...@@ -859,14 +865,14 @@ class Downloader:
859 if path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"): 865 if path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"):
860 print("\t> Use of the previous version.\t", end = "", flush=True) 866 print("\t> Use of the previous version.\t", end = "", flush=True)
861 else: 867 else:
862 - return [], [] 868 + return pd.DataFrame([], columns=["class", "class_members"])
863 869
864 nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv") 870 nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv")
865 - full_structures_list = nrlist['class_members'].tolist() 871 + full_structures_list = [ tuple(i[1]) for i in nrlist[['class','class_members']].iterrows() ]
866 print(f"\t{validsymb}", flush=True) 872 print(f"\t{validsymb}", flush=True)
867 873
868 # The beginning of an adventure. 874 # The beginning of an adventure.
869 - return full_structures_list 875 + return full_structures_list # list of ( str (class), str (class_members) )
870 876
871 def download_from_SILVA(self, unit): 877 def download_from_SILVA(self, unit):
872 if not path.isfile(path_to_seq_data + f"realigned/{unit}.arb"): 878 if not path.isfile(path_to_seq_data + f"realigned/{unit}.arb"):
...@@ -1068,8 +1074,8 @@ class Pipeline: ...@@ -1068,8 +1074,8 @@ class Pipeline:
1068 elif opt == "--from-scratch": 1074 elif opt == "--from-scratch":
1069 warn("Deleting previous database and recomputing from scratch.") 1075 warn("Deleting previous database and recomputing from scratch.")
1070 subprocess.run(["rm", "-rf", 1076 subprocess.run(["rm", "-rf",
1071 - path_to_3D_data + "annotations", 1077 + # path_to_3D_data + "annotations", # DEBUG : keep the annotations !
1072 - # path_to_3D_data + "RNAcifs", # DEBUG : keep the cifs ! 1078 + # path_to_3D_data + "RNAcifs", # DEBUG : keep the cifs !
1073 path_to_3D_data + "rna_mapped_to_Rfam", 1079 path_to_3D_data + "rna_mapped_to_Rfam",
1074 path_to_3D_data + "rnaonly", 1080 path_to_3D_data + "rnaonly",
1075 path_to_seq_data + "realigned", 1081 path_to_seq_data + "realigned",
...@@ -1103,7 +1109,7 @@ class Pipeline: ...@@ -1103,7 +1109,7 @@ class Pipeline:
1103 If self.HOMOLOGY is set to False, simply returns a list of Chain() objects with available 3D chains.""" 1109 If self.HOMOLOGY is set to False, simply returns a list of Chain() objects with available 3D chains."""
1104 1110
1105 # List all 3D RNA chains below given resolution 1111 # List all 3D RNA chains below given resolution
1106 - full_structures_list = self.dl.download_BGSU_NR_list(self.CRYSTAL_RES) 1112 + full_structures_list = self.dl.download_BGSU_NR_list(self.CRYSTAL_RES) # list of tuples ( class, class_members )
1107 1113
1108 # Check for a list of known problems: 1114 # Check for a list of known problems:
1109 if path.isfile(runDir + "/known_issues.txt"): 1115 if path.isfile(runDir + "/known_issues.txt"):
...@@ -1140,8 +1146,8 @@ class Pipeline: ...@@ -1140,8 +1146,8 @@ class Pipeline:
1140 exit(1) 1146 exit(1)
1141 else: 1147 else:
1142 conn = sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) 1148 conn = sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0)
1143 - for codelist in tqdm(full_structures_list): 1149 + for eq_class, codelist in tqdm(full_structures_list):
1144 - codes = str(codelist).replace('+',',').split(',') 1150 + codes = codelist.replace('+',',').split(',')
1145 1151
1146 # Simply convert the list of codes to Chain() objects 1152 # Simply convert the list of codes to Chain() objects
1147 for c in codes: 1153 for c in codes:
...@@ -1408,7 +1414,7 @@ class Pipeline: ...@@ -1408,7 +1414,7 @@ class Pipeline:
1408 with sqlite3.connect(runDir+"/results/RNANet.db") as conn: 1414 with sqlite3.connect(runDir+"/results/RNANet.db") as conn:
1409 pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;", 1415 pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;",
1410 conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False) 1416 conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
1411 - pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure 1417 + pd.read_sql_query("""SELECT eq_class, structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure
1412 JOIN chain ON structure.pdb_id = chain.structure_id 1418 JOIN chain ON structure.pdb_id = chain.structure_id
1413 ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False) 1419 ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
1414 1420
...@@ -1522,6 +1528,7 @@ def sql_define_tables(conn): ...@@ -1522,6 +1528,7 @@ def sql_define_tables(conn):
1522 chain_id INTEGER PRIMARY KEY NOT NULL, 1528 chain_id INTEGER PRIMARY KEY NOT NULL,
1523 structure_id CHAR(4) NOT NULL, 1529 structure_id CHAR(4) NOT NULL,
1524 chain_name VARCHAR(2) NOT NULL, 1530 chain_name VARCHAR(2) NOT NULL,
1531 + eq_class VARCHAR(10),
1525 pdb_start SMALLINT, 1532 pdb_start SMALLINT,
1526 pdb_end SMALLINT, 1533 pdb_end SMALLINT,
1527 issue TINYINT, 1534 issue TINYINT,
...@@ -1785,7 +1792,8 @@ def work_infer_mappings(update_only, allmappings, codelist): ...@@ -1785,7 +1792,8 @@ def work_infer_mappings(update_only, allmappings, codelist):
1785 known_mappings = pd.DataFrame() 1792 known_mappings = pd.DataFrame()
1786 1793
1787 # Split the comma-separated list of chain codes into chain codes: 1794 # Split the comma-separated list of chain codes into chain codes:
1788 - codes = str(codelist).replace('+',',').split(',') 1795 + eq_class = codelist[0]
1796 + codes = codelist[1].replace('+',',').split(',')
1789 1797
1790 # Search for mappings that apply to an element of this PDB chains list: 1798 # Search for mappings that apply to an element of this PDB chains list:
1791 for c in codes: 1799 for c in codes:
......