Showing
4 changed files
with
232 additions
and
125 deletions
... | @@ -44,16 +44,15 @@ SSU_set = {"RF00177", "RF02542", "RF02545", "RF01959", "RF01960"} # From Rfam | ... | @@ -44,16 +44,15 @@ SSU_set = {"RF00177", "RF02542", "RF02545", "RF01959", "RF01960"} # From Rfam |
44 | no_nts_set = set() | 44 | no_nts_set = set() |
45 | weird_mappings = set() | 45 | weird_mappings = set() |
46 | 46 | ||
47 | -class NtPortionSelector(object): | 47 | +class SelectivePortionSelector(object): |
48 | """Class passed to MMCIFIO to select some chain portions in an MMCIF file. | 48 | """Class passed to MMCIFIO to select some chain portions in an MMCIF file. |
49 | 49 | ||
50 | Validates every chain, residue, nucleotide, to say if it is in the selection or not. | 50 | Validates every chain, residue, nucleotide, to say if it is in the selection or not. |
51 | """ | 51 | """ |
52 | 52 | ||
53 | - def __init__(self, model_id, chain_id, start, end, khetatm): | 53 | + def __init__(self, model_id, chain_id, valid_resnums, khetatm): |
54 | self.chain_id = chain_id | 54 | self.chain_id = chain_id |
55 | - self.start = start | 55 | + self.resnums = valid_resnums |
56 | - self.end = end | ||
57 | self.pdb_model_id = model_id | 56 | self.pdb_model_id = model_id |
58 | self.hydrogen_regex = re.compile("[123 ]*H.*") | 57 | self.hydrogen_regex = re.compile("[123 ]*H.*") |
59 | self.keep_hetatm = khetatm | 58 | self.keep_hetatm = khetatm |
... | @@ -77,7 +76,10 @@ class NtPortionSelector(object): | ... | @@ -77,7 +76,10 @@ class NtPortionSelector(object): |
77 | # warn(f"icode {icode} at position {resseq}\t\t") | 76 | # warn(f"icode {icode} at position {resseq}\t\t") |
78 | 77 | ||
79 | # Accept the residue if it is in the right interval: | 78 | # Accept the residue if it is in the right interval: |
80 | - return int(self.start <= resseq <= self.end) | 79 | + if len(self.resnums): |
80 | + return int(resseq in self.resnums) | ||
81 | + else: | ||
82 | + return 1 | ||
81 | 83 | ||
82 | def accept_atom(self, atom): | 84 | def accept_atom(self, atom): |
83 | 85 | ||
... | @@ -128,13 +130,12 @@ class Chain: | ... | @@ -128,13 +130,12 @@ class Chain: |
128 | self.pdb_id = pdb_id # PDB ID | 130 | self.pdb_id = pdb_id # PDB ID |
129 | self.pdb_model = int(pdb_model) # model ID, starting at 1 | 131 | self.pdb_model = int(pdb_model) # model ID, starting at 1 |
130 | self.pdb_chain_id = pdb_chain_id # chain ID (mmCIF), multiple letters | 132 | self.pdb_chain_id = pdb_chain_id # chain ID (mmCIF), multiple letters |
131 | - self.pdb_start = pdb_start # if portion of chain, the start number (relative to the chain, not residue numbers) | 133 | + if len(rfam): |
132 | - self.pdb_end = pdb_end # if portion of chain, the start number (relative to the chain, not residue numbers) | 134 | + self.mapping = Mapping(chain_label, rfam, pdb_start, pdb_end, inferred) |
133 | - self.reversed = (pdb_start > pdb_end) if pdb_start is not None else False # wether pdb_start > pdb_end in the Rfam mapping | 135 | + else: |
136 | + self.mapping = None | ||
134 | self.chain_label = chain_label # chain pretty name | 137 | self.chain_label = chain_label # chain pretty name |
135 | self.file = "" # path to the 3D PDB file | 138 | self.file = "" # path to the 3D PDB file |
136 | - self.rfam_fam = rfam # mapping to an RNA family | ||
137 | - self.inferred = inferred # Wether this mapping has been inferred from BGSU's NR list | ||
138 | self.seq = "" # sequence with modified nts | 139 | self.seq = "" # sequence with modified nts |
139 | self.seq_to_align = "" # sequence with modified nts replaced, but gaps can exist | 140 | self.seq_to_align = "" # sequence with modified nts replaced, but gaps can exist |
140 | self.length = -1 # length of the sequence (missing residues are not counted) | 141 | self.length = -1 # length of the sequence (missing residues are not counted) |
... | @@ -156,8 +157,8 @@ class Chain: | ... | @@ -156,8 +157,8 @@ class Chain: |
156 | """ Extract the part which is mapped to Rfam from the main CIF file and save it to another file. | 157 | """ Extract the part which is mapped to Rfam from the main CIF file and save it to another file. |
157 | """ | 158 | """ |
158 | 159 | ||
159 | - if self.pdb_start is not None and (self.pdb_end - self.pdb_start): | 160 | + if self.mapping is not None: |
160 | - status = f"Extract {self.pdb_start}-{self.pdb_end} atoms from {self.pdb_id}-{self.pdb_chain_id}" | 161 | + status = f"Extract {self.mapping.nt_start}-{self.mapping.nt_end} atoms from {self.pdb_id}-{self.pdb_chain_id}" |
161 | self.file = path_to_3D_data+"rna_mapped_to_Rfam/"+self.chain_label+".cif" | 162 | self.file = path_to_3D_data+"rna_mapped_to_Rfam/"+self.chain_label+".cif" |
162 | else: | 163 | else: |
163 | status = f"Extract {self.pdb_id}-{self.pdb_chain_id}" | 164 | status = f"Extract {self.pdb_id}-{self.pdb_chain_id}" |
... | @@ -183,36 +184,19 @@ class Chain: | ... | @@ -183,36 +184,19 @@ class Chain: |
183 | # Extract the desired chain | 184 | # Extract the desired chain |
184 | c = s[model_idx][self.pdb_chain_id] | 185 | c = s[model_idx][self.pdb_chain_id] |
185 | 186 | ||
186 | - if self.pdb_start is not None and (self.pdb_end - self.pdb_start): | 187 | + if self.mapping is not None: |
187 | - # # Pay attention to residue numbering | 188 | + valid_set = set(self.mapping.old_nt_resnums) |
188 | - # first_number = c.child_list[0].get_id()[1] # the chain's first residue is numbered 'first_number' | ||
189 | - # if self.pdb_start < self.pdb_end: | ||
190 | - # start = self.pdb_start + first_number - 1 # shift our start_position by 'first_number' | ||
191 | - # end = self.pdb_end + first_number - 1 # same for the end position | ||
192 | - # else: | ||
193 | - # self.reversed = True # the 3D chain is numbered backwards compared to the Rfam family | ||
194 | - # end = self.pdb_start + first_number - 1 | ||
195 | - # start = self.pdb_end + first_number - 1 | ||
196 | - | ||
197 | - if self.pdb_start < self.pdb_end: | ||
198 | - start = self.pdb_start # shift our start_position by 'first_number' | ||
199 | - end = self.pdb_end # same for the end position | ||
200 | - else: | ||
201 | - self.reversed = True # the 3D chain is numbered backwards compared to the Rfam family | ||
202 | - end = self.pdb_start | ||
203 | - start = self.pdb_end | ||
204 | else: | 189 | else: |
205 | - start = c.child_list[0].get_id()[1] # the chain's first residue is numbered 'first_number' | 190 | + valid_set = set() |
206 | - end = c.child_list[-1].get_id()[1] # the chain's last residue number | 191 | + |
207 | - | ||
208 | # Define a selection | 192 | # Define a selection |
209 | - # sel = ChainSelector(self.pdb_chain_id, start, end, model_id = model_idx) | 193 | + sel = SelectivePortionSelector(model_idx, self.pdb_chain_id, valid_set, khetatm) |
210 | - sel = NtPortionSelector(model_idx, self.pdb_chain_id, start, end, khetatm) | ||
211 | 194 | ||
212 | # Save that selection on the mmCIF object s to file | 195 | # Save that selection on the mmCIF object s to file |
213 | ioobj = MMCIFIO() | 196 | ioobj = MMCIFIO() |
214 | ioobj.set_structure(s) | 197 | ioobj.set_structure(s) |
215 | ioobj.save(self.file, sel) | 198 | ioobj.save(self.file, sel) |
199 | + | ||
216 | 200 | ||
217 | notify(status) | 201 | notify(status) |
218 | 202 | ||
... | @@ -225,7 +209,7 @@ class Chain: | ... | @@ -225,7 +209,7 @@ class Chain: |
225 | try: | 209 | try: |
226 | with open(path_to_3D_data + "annotations/" + self.pdb_id + ".json", 'r') as json_file: | 210 | with open(path_to_3D_data + "annotations/" + self.pdb_id + ".json", 'r') as json_file: |
227 | json_object = json.load(json_file) | 211 | json_object = json.load(json_file) |
228 | - notify(f"Read {self.chain_label} DSSR annotations") | 212 | + notify(f"Read {self.pdb_id} DSSR annotations") |
229 | except json.decoder.JSONDecodeError as e: | 213 | except json.decoder.JSONDecodeError as e: |
230 | warn("Could not load "+self.pdb_id+f".json with JSON package: {e}", error=True) | 214 | warn("Could not load "+self.pdb_id+f".json with JSON package: {e}", error=True) |
231 | self.delete_me = True | 215 | self.delete_me = True |
... | @@ -275,87 +259,87 @@ class Chain: | ... | @@ -275,87 +259,87 @@ class Chain: |
275 | return None | 259 | return None |
276 | 260 | ||
277 | ############################################# | 261 | ############################################# |
278 | - # Solve some common issues and drop ligands | 262 | + # Select the nucleotides we need |
279 | ############################################# | 263 | ############################################# |
280 | 264 | ||
281 | - # Shift numbering when duplicate residue numbers are found. | 265 | + |
266 | + # Remove nucleotides of the chain that are outside the Rfam mapping, if any | ||
267 | + if self.mapping is not None: | ||
268 | + df = self.mapping.filter_df(df) | ||
269 | + | ||
270 | + # Duplicate residue numbers : shift numbering | ||
282 | # Example: 4v9q-DV contains 17 and 17A which are both read 17 by DSSR. | 271 | # Example: 4v9q-DV contains 17 and 17A which are both read 17 by DSSR. |
283 | while True in df.duplicated(['nt_resnum']).values: | 272 | while True in df.duplicated(['nt_resnum']).values: |
284 | i = df.duplicated(['nt_resnum']).values.tolist().index(True) | 273 | i = df.duplicated(['nt_resnum']).values.tolist().index(True) |
274 | + self.mapping.shift_resnum_range(i) | ||
285 | df.iloc[i:, 1] += 1 | 275 | df.iloc[i:, 1] += 1 |
286 | 276 | ||
277 | + # Search for ligands at the end of the selection | ||
287 | # Drop ligands detected as residues by DSSR, by detecting several markers | 278 | # Drop ligands detected as residues by DSSR, by detecting several markers |
288 | - df = df.drop_duplicates("index_chain", keep="first") # drop doublons in index_chain | 279 | + while ( len(df.index_chain) and df.iloc[-1,2] not in ["A", "C", "G", "U"] and ( |
289 | - while (len(df.index_chain) and df.iloc[[-1]].nt_name.tolist()[0] not in ["A", "C", "G", "U"] and | 280 | + (df.iloc[[-1]][["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "v0", "v1", "v2", "v3", "v4"]].isna().values).all() |
290 | - ((df.iloc[[-1]][["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "v0", "v1", "v2", "v3", "v4"]].isna().values).all() | 281 | + or (df.iloc[[-1]].puckering=='').any() |
291 | - or (df.iloc[[-1]].puckering=='').any()) | 282 | + ) |
292 | - or (len(df.index_chain) >= 2 and df.iloc[[-1]].nt_resnum.iloc[0] > 50 + df.iloc[[-2]].nt_resnum.iloc[0])): | 283 | + or ( len(df.index_chain) >= 2 and df.iloc[-1,1] > 50 + df.iloc[-2,1] ) |
284 | + or ( len(df.index_chain) and df.iloc[-1,2] in ["GNG", "E2C", "OHX", "IRI", "MPD", "8UZ"] ) | ||
285 | + ): | ||
286 | + if self.mapping is not None: | ||
287 | + self.mapping.drop_ligand(df.tail(1)) | ||
293 | df = df.head(-1) | 288 | df = df.head(-1) |
294 | 289 | ||
295 | - # drop eventual nts with index_chain < the first residue (usually, ligands) | 290 | + # Duplicates in index_chain : drop, they are ligands |
296 | - df = df.drop(df[df.index_chain < 0].index) | 291 | + # e.g. 3iwn_1_B_1-91, ligand C2E has index_chain 1 (and nt_resnum 601) |
297 | - | 292 | + duplicates = [ index for index, element in enumerate(df.duplicated(['index_chain']).values) if element ] |
298 | - # Assert some nucleotides still exist | 293 | + if len(duplicates): |
299 | - try: | 294 | + for i in duplicates: |
300 | - l = df.iloc[-1,1] - df.iloc[0,1] + 1 # length of chain from nt_resnum point of view | 295 | + warn(f"Found duplicated index_chain {df.iloc[i,0]} in {self.chain_label}. Keeping only the first.") |
301 | - except IndexError: | 296 | + if self.mapping is not None: |
302 | - warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Ignoring chain {self.chain_label}.", error=True) | 297 | + self.mapping.log(f"Found duplicated index_chain {df.iloc[i,0]}. Keeping only the first.") |
303 | - no_nts_set.add(self.pdb_id) | 298 | + with open("duplicates.txt", "a") as f: |
304 | - self.delete_me = True | 299 | + f.write(f"DEBUG: {self.chain_label} has duplicate index_chains !\n") |
305 | - self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. We expect a problem with {self.pdb_id} mmCIF download. Delete it and retry." | 300 | + df = df.drop_duplicates("index_chain", keep="first") # drop doublons in index_chain |
306 | - return None | 301 | + |
307 | - | 302 | + # drop eventual nts with index_chain < the first residue, |
308 | - # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one | 303 | + # now negative because we renumber to 1 (usually, ligands) |
309 | - if df.iloc[0,0] != 1: | 304 | + ligands = df[df.index_chain < 0] |
310 | - st = df.iloc[0,0] -1 | 305 | + if len(ligands.index_chain): |
311 | - df.iloc[:, 0] -= st | 306 | + if self.mapping is not None: |
312 | - | 307 | + for line in ligands.iterrows(): |
313 | - | 308 | + self.mapping.drop_ligand(line) |
314 | - # Find missing index_chain values because of resolved nucleotides that have a strange nt_resnum value | 309 | + df = df.drop(ligands.index) |
310 | + | ||
311 | + # Find missing index_chain values | ||
312 | + # This happens because of resolved nucleotides that have a | ||
313 | + # strange nt_resnum value | ||
315 | # e.g. 4v49-AA, position 5'- 1003 -> 2003 -> 1004 - 3' | 314 | # e.g. 4v49-AA, position 5'- 1003 -> 2003 -> 1004 - 3' |
316 | diff = set(range(df.shape[0])).difference(df['index_chain'] - 1) | 315 | diff = set(range(df.shape[0])).difference(df['index_chain'] - 1) |
317 | - for i in sorted(diff): | 316 | + if len(diff): |
318 | - # check if a nucleotide numbered +1000 exists | 317 | + warn(f"Missing residues regarding index_chain: {[1+i for i in sorted(diff)]}") |
319 | - looked_for = df[df.index_chain == i].nt_resnum.values[0] | 318 | + for i in sorted(diff): |
320 | - found = None | 319 | + # check if a nucleotide numbered +1000 exists in the nts object |
321 | - for nt in nts: | 320 | + found = None |
322 | - if nt['chain_name'] != self.pdb_chain_id: | 321 | + for nt in nts: # nts is the object from the loaded JSON and contains all nts |
323 | - continue | 322 | + if nt['chain_name'] != self.pdb_chain_id: |
324 | - if nt['index_chain'] == i + 1 : | 323 | + continue |
325 | - found = nt | 324 | + if nt['index_chain'] == i + 1 : |
326 | - break | 325 | + found = nt |
327 | - if found: | 326 | + break |
328 | - df_row = pd.DataFrame([found], index=[i])[df.columns.values] | 327 | + if found: |
329 | - df_row.iloc[0,1] = df.iloc[i,1] | 328 | + df_row = pd.DataFrame([found], index=[i])[df.columns.values] |
330 | - df = pd.concat([ df.iloc[:i], df_row, df.iloc[i:] ]) | 329 | + if self.mapping is not None: |
331 | - df.iloc[i+1:, 1] += 1 | 330 | + self.mapping.insert_new(i+1, found['nt_resnum'], df.iloc[i,1]) |
332 | - else: | 331 | + df_row.iloc[0,1] = df.iloc[i,1] |
333 | - warn(f"Missing index_chain {i} in {self.chain_label} !") | 332 | + df = pd.concat([ df.iloc[:i], df_row, df.iloc[i:] ]) |
334 | - | 333 | + df.iloc[i+1:, 1] += 1 |
335 | - # Remove nucleotides of the chain that are outside the Rfam mapping, if any | 334 | + else: |
336 | - if self.pdb_start and self.pdb_end: | 335 | + warn(f"Missing index_chain {i} in {self.chain_label} !") |
337 | - if self.pdb_start < self.pdb_end: | ||
338 | - newdf = df.drop(df[(df.nt_resnum < self.pdb_start) | (df.nt_resnum > self.pdb_end)].index) | ||
339 | - else: | ||
340 | - newdf = df.drop(df[(df.nt_resnum < self.pdb_end) | (df.nt_resnum > self.pdb_start)].index) | ||
341 | 336 | ||
342 | - if len(newdf.index_chain) > 0: | 337 | + # Assert some nucleotides still exist |
343 | - # everything's okay | ||
344 | - df = newdf | ||
345 | - else: | ||
346 | - # There were nucleotides in this chain but we removed them all while | ||
347 | - # filtering the ones outside the Rfam mapping. | ||
348 | - # This probably means that, for this chain, the mapping is relative to | ||
349 | - # index_chain and not nt_resnum. | ||
350 | - warn(f"Assuming {self.chain_label}'s mapping to {self.rfam_fam} is an absolute position interval.") | ||
351 | - weird_mappings.add(self.chain_label + "." + self.rfam_fam) | ||
352 | - df = df.drop(df[(df.index_chain < self.pdb_start) | (df.index_chain > self.pdb_end)].index) | ||
353 | - | ||
354 | try: | 338 | try: |
355 | l = df.iloc[-1,1] - df.iloc[0,1] + 1 # update length of chain from nt_resnum point of view | 339 | l = df.iloc[-1,1] - df.iloc[0,1] + 1 # update length of chain from nt_resnum point of view |
356 | except IndexError: | 340 | except IndexError: |
357 | - warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} between {self.pdb_start} and " | 341 | + warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} between {self.mapping.nt_start} and " |
358 | - f"{self.pdb_end} ({'not ' if not self.inferred else ''}inferred). Ignoring chain {self.chain_label}.") | 342 | + f"{self.mapping.nt_end} ({'not ' if not self.mapping.inferred else ''}inferred). Ignoring chain {self.chain_label}.") |
359 | no_nts_set.add(self.pdb_id) | 343 | no_nts_set.add(self.pdb_id) |
360 | self.delete_me = True | 344 | self.delete_me = True |
361 | self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Either there is a problem with {self.pdb_id} mmCIF download, or the bases are not resolved in the structure. Delete it and retry." | 345 | self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Either there is a problem with {self.pdb_id} mmCIF download, or the bases are not resolved in the structure. Delete it and retry." |
... | @@ -464,7 +448,7 @@ class Chain: | ... | @@ -464,7 +448,7 @@ class Chain: |
464 | df['nb_interact'] = interacts | 448 | df['nb_interact'] = interacts |
465 | df = df.drop(['nt_id'], axis=1) # remove now useless descriptors | 449 | df = df.drop(['nt_id'], axis=1) # remove now useless descriptors |
466 | 450 | ||
467 | - if self.reversed: | 451 | + if self.mapping.reversed: |
468 | # The 3D structure is numbered from 3' to 5' instead of standard 5' to 3' | 452 | # The 3D structure is numbered from 3' to 5' instead of standard 5' to 3' |
469 | # or the sequence that matches the Rfam family is 3' to 5' instead of standard 5' to 3'. | 453 | # or the sequence that matches the Rfam family is 3' to 5' instead of standard 5' to 3'. |
470 | # Anyways, you need to invert the angles. | 454 | # Anyways, you need to invert the angles. |
... | @@ -507,6 +491,10 @@ class Chain: | ... | @@ -507,6 +491,10 @@ class Chain: |
507 | self.error_messages = "Sequence is too short. (< 5 resolved nts)" | 491 | self.error_messages = "Sequence is too short. (< 5 resolved nts)" |
508 | return None | 492 | return None |
509 | 493 | ||
494 | + # Log chain info to file | ||
495 | + if self.mapping is not None: | ||
496 | + self.mapping.to_file(self.chain_label+".log") | ||
497 | + | ||
510 | return df | 498 | return df |
511 | 499 | ||
512 | def register_chain(self, df): | 500 | def register_chain(self, df): |
... | @@ -515,7 +503,7 @@ class Chain: | ... | @@ -515,7 +503,7 @@ class Chain: |
515 | 503 | ||
516 | with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: | 504 | with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: |
517 | # Register the chain in table chain | 505 | # Register the chain in table chain |
518 | - if self.pdb_start is not None: | 506 | + if self.mapping.nt_start is not None: |
519 | sql_execute(conn, f""" INSERT INTO chain | 507 | sql_execute(conn, f""" INSERT INTO chain |
520 | (structure_id, chain_name, pdb_start, pdb_end, reversed, rfam_acc, inferred, issue) | 508 | (structure_id, chain_name, pdb_start, pdb_end, reversed, rfam_acc, inferred, issue) |
521 | VALUES | 509 | VALUES |
... | @@ -527,14 +515,14 @@ class Chain: | ... | @@ -527,14 +515,14 @@ class Chain: |
527 | inferred=excluded.inferred, | 515 | inferred=excluded.inferred, |
528 | issue=excluded.issue;""", | 516 | issue=excluded.issue;""", |
529 | data=(str(self.pdb_id), str(self.pdb_chain_id), | 517 | data=(str(self.pdb_id), str(self.pdb_chain_id), |
530 | - int(self.pdb_start), int(self.pdb_end), | 518 | + int(self.mapping.nt_start), int(self.mapping.nt_end), |
531 | - int(self.reversed), str(self.rfam_fam), | 519 | + int(self.mapping.reversed), str(self.mapping.rfam_acc), |
532 | - int(self.inferred), int(self.delete_me))) | 520 | + int(self.mapping.inferred), int(self.delete_me))) |
533 | # get the chain id | 521 | # get the chain id |
534 | self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain | 522 | self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain |
535 | WHERE structure_id='{self.pdb_id}' | 523 | WHERE structure_id='{self.pdb_id}' |
536 | AND chain_name='{self.pdb_chain_id}' | 524 | AND chain_name='{self.pdb_chain_id}' |
537 | - AND rfam_acc='{self.rfam_fam}';""")[0][0] | 525 | + AND rfam_acc='{self.mapping.rfam_acc}';""")[0][0] |
538 | else: | 526 | else: |
539 | sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, issue) VALUES (?, ?, NULL, ?) | 527 | sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, issue) VALUES (?, ?, NULL, ?) |
540 | ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue;""", | 528 | ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue;""", |
... | @@ -886,6 +874,103 @@ class Downloader: | ... | @@ -886,6 +874,103 @@ class Downloader: |
886 | notify(f"Downloaded and extracted {unit} database from SILVA", "used previous file") | 874 | notify(f"Downloaded and extracted {unit} database from SILVA", "used previous file") |
887 | 875 | ||
888 | 876 | ||
877 | +class Mapping: | ||
878 | + """ | ||
879 | + A custom class to store more information about nucleotide mappings. | ||
880 | + """ | ||
881 | + | ||
882 | + def __init__(self, chain_label, rfam_acc, pdb_start, pdb_end, inferred): | ||
883 | + """ | ||
884 | + Arguments: | ||
885 | + rfam_acc : Rfam family accession number of the mapping | ||
886 | + pdb_start/pdb_end : nt_resnum start and end values in the 3D data that are mapped to the family | ||
887 | + inferred : wether the mapping has been inferred using BGSU's NR list | ||
888 | + """ | ||
889 | + self.chain_label = chain_label | ||
890 | + self.rfam_acc = rfam_acc | ||
891 | + self.nt_start = pdb_start # nt_resnum numbering | ||
892 | + self.nt_end = pdb_end # nt_resnum numbering | ||
893 | + self.reversed = (pdb_start > pdb_end) | ||
894 | + self.inferred = inferred | ||
895 | + self.interval = range(pdb_start, pdb_end+1) | ||
896 | + | ||
897 | + self.old_nt_resnums = [] # to be computed | ||
898 | + self.new_nt_resnums = [] # | ||
899 | + | ||
900 | + self.logs = [] # Events are logged when modifying the mapping | ||
901 | + | ||
902 | + def shift_resnum_range(self, i): | ||
903 | + self.log(f"Shifting nt_resnum numbering because of duplicate residue {self.new_nt_resnums[i]}") | ||
904 | + for j in range(i, len(self.new_nt_resnums)): | ||
905 | + self.new_nt_resnums[j] += 1 | ||
906 | + | ||
907 | + def insert_new(self, index_chain, oldresnum, newresnum): | ||
908 | + # Adds a nt that did not passed the mapping filter at first | ||
909 | + # because it was numbered with a very high nt_resnum value (outside the bounds of the mapping) | ||
910 | + # But, in practice, its index_chain is correct and in the bounds and it belongs to the mapped chain. | ||
911 | + # Those nts are only searched if there are missing index_chain values in the mapping bounds. | ||
912 | + | ||
913 | + # insert the nt_resnum values in the lists | ||
914 | + self.old_nt_resnums.insert(index_chain-1, oldresnum) | ||
915 | + self.new_nt_resnums.insert(index_chain-1, newresnum) | ||
916 | + | ||
917 | + # shift the new_nt_resnum values if needed, to avoid creating a doublon | ||
918 | + if self.new_nt_resnums[index_chain-1] == self.new_nt_resnums[index_chain]: | ||
919 | + for j in range(index_chain, len(self.new_nt_resnums)): | ||
920 | + self.new_nt_resnums[j] += 1 | ||
921 | + # warn(f"Residue {index_chain} has been saved and renumbered {newresnum} instead of {oldresnum}") | ||
922 | + self.log(f"Residue {index_chain} has been saved and renumbered {newresnum} instead of {oldresnum}") | ||
923 | + | ||
924 | + def filter_df(self, df): | ||
925 | + if not self.reversed: | ||
926 | + newdf = df.drop(df[(df.nt_resnum < self.nt_start) | (df.nt_resnum > self.nt_end)].index) | ||
927 | + else: | ||
928 | + newdf = df.drop(df[(df.nt_resnum < self.nt_end) | (df.nt_resnum > self.nt_start)].index) | ||
929 | + | ||
930 | + if len(newdf.index_chain) > 0: | ||
931 | + # everything's okay | ||
932 | + df = newdf | ||
933 | + else: | ||
934 | + # There were nucleotides in this chain but we removed them all while | ||
935 | + # filtering the ones outside the Rfam mapping. | ||
936 | + # This probably means that, for this chain, the mapping is relative to | ||
937 | + # index_chain and not nt_resnum. | ||
938 | + warn(f"Assuming mapping to {self.rfam_acc} is an absolute position interval.") | ||
939 | + weird_mappings.add(self.chain_label + "." + self.rfam_acc) | ||
940 | + df = df.drop(df[(df.index_chain < self.nt_start) | (df.index_chain > self.nt_end)].index) | ||
941 | + | ||
942 | + | ||
943 | + # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one | ||
944 | + if len(df.index_chain) and df.iloc[0,0] != 1: | ||
945 | + st = df.iloc[0,0] -1 | ||
946 | + df.iloc[:, 0] -= st | ||
947 | + | ||
948 | + self.old_nt_resnums = df.nt_resnum.tolist() | ||
949 | + self.new_nt_resnums = df.nt_resnum.tolist() | ||
950 | + | ||
951 | + return df | ||
952 | + | ||
953 | + def drop_ligand(self, df_row): | ||
954 | + self.log("Droping ligand:") | ||
955 | + self.log(df_row) | ||
956 | + i = self.new_nt_resnums.index(df_row.iloc[0,1]) | ||
957 | + self.old_nt_resnums.pop(i) | ||
958 | + self.new_nt_resnums.pop(i) | ||
959 | + | ||
960 | + def log(self, message): | ||
961 | + if isinstance(message, str): | ||
962 | + self.logs.append(message+'\n') | ||
963 | + else: | ||
964 | + self.logs.append(str(message)) | ||
965 | + | ||
966 | + def to_file(self, filename): | ||
967 | + if not path.exists("logs"): | ||
968 | + os.makedirs("logs", exist_ok=True) | ||
969 | + with open("logs/"+filename, "w") as f: | ||
970 | + f.writelines(self.logs) | ||
971 | + | ||
972 | + | ||
973 | + | ||
889 | class Pipeline: | 974 | class Pipeline: |
890 | def __init__(self): | 975 | def __init__(self): |
891 | self.dl = Downloader() | 976 | self.dl = Downloader() |
... | @@ -1928,6 +2013,24 @@ def work_prepare_sequences(dl, rfam_acc, chains): | ... | @@ -1928,6 +2013,24 @@ def work_prepare_sequences(dl, rfam_acc, chains): |
1928 | """Prepares FASTA files of homologous sequences to realign with cmalign or SINA.""" | 2013 | """Prepares FASTA files of homologous sequences to realign with cmalign or SINA.""" |
1929 | 2014 | ||
1930 | if rfam_acc in LSU_set | SSU_set: # rRNA | 2015 | if rfam_acc in LSU_set | SSU_set: # rRNA |
2016 | + if path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"): | ||
2017 | + # Detect doublons and remove them | ||
2018 | + existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta") | ||
2019 | + existing_ids = [ r.id for r in existing_afa ] | ||
2020 | + del existing_afa | ||
2021 | + new_ids = [ str(c) for c in chains ] | ||
2022 | + doublons = [ i for i in existing_ids if i in new_ids ] | ||
2023 | + del existing_ids, new_ids | ||
2024 | + if len(doublons): | ||
2025 | + fasta = path_to_seq_data + f"realigned/{rfam_acc}++.fa" | ||
2026 | + warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.fa and using their newest version") | ||
2027 | + seqfile = SeqIO.parse(fasta, "fasta") | ||
2028 | + os.remove(fasta) | ||
2029 | + with open(fasta, 'w') as f: | ||
2030 | + for rec in seqfile: | ||
2031 | + if rec.id not in doublons: | ||
2032 | + f.write(rec.format("fasta")) | ||
2033 | + | ||
1931 | # Add the new sequences with previous ones, if any | 2034 | # Add the new sequences with previous ones, if any |
1932 | with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "a") as f: | 2035 | with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "a") as f: |
1933 | for c in chains: | 2036 | for c in chains: |
... | @@ -2037,7 +2140,7 @@ def work_realign(rfam_acc): | ... | @@ -2037,7 +2140,7 @@ def work_realign(rfam_acc): |
2037 | existing_stk = AlignIO.read(existing_ali_path, "stockholm") | 2140 | existing_stk = AlignIO.read(existing_ali_path, "stockholm") |
2038 | existing_ids = [ r.id for r in existing_stk ] | 2141 | existing_ids = [ r.id for r in existing_stk ] |
2039 | del existing_stk | 2142 | del existing_stk |
2040 | - new_stk = AlignIO.read(new_ali_path, "stk") | 2143 | + new_stk = AlignIO.read(new_ali_path, "stockholm") |
2041 | new_ids = [ r.id for r in new_stk ] | 2144 | new_ids = [ r.id for r in new_stk ] |
2042 | del new_stk | 2145 | del new_stk |
2043 | doublons = [ i for i in existing_ids if i in new_ids ] | 2146 | doublons = [ i for i in existing_ids if i in new_ids ] |
... | @@ -2046,8 +2149,9 @@ def work_realign(rfam_acc): | ... | @@ -2046,8 +2149,9 @@ def work_realign(rfam_acc): |
2046 | warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version") | 2149 | warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version") |
2047 | with open(path_to_seq_data + "realigned/toremove.txt", "w") as toremove: | 2150 | with open(path_to_seq_data + "realigned/toremove.txt", "w") as toremove: |
2048 | toremove.write('\n'.join(doublons)+'\n') | 2151 | toremove.write('\n'.join(doublons)+'\n') |
2049 | - p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path], | 2152 | + p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path], |
2050 | stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | 2153 | stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) |
2154 | + p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path], stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | ||
2051 | os.remove(path_to_seq_data + "realigned/toremove.txt") | 2155 | os.remove(path_to_seq_data + "realigned/toremove.txt") |
2052 | 2156 | ||
2053 | # And we merge the two alignments | 2157 | # And we merge the two alignments |
... | @@ -2075,7 +2179,7 @@ def work_realign(rfam_acc): | ... | @@ -2075,7 +2179,7 @@ def work_realign(rfam_acc): |
2075 | 2179 | ||
2076 | if len(stderr): | 2180 | if len(stderr): |
2077 | print('', flush=True) | 2181 | print('', flush=True) |
2078 | - warn(f"Error while during sequence alignment: {stderr}", error=True) | 2182 | + warn(f"Error during sequence alignment: {stderr}", error=True) |
2079 | with open(runDir + "/errors.txt", "a") as er: | 2183 | with open(runDir + "/errors.txt", "a") as er: |
2080 | er.write(f"Attempting to realign {rfam_acc}:\n" + stderr + '\n') | 2184 | er.write(f"Attempting to realign {rfam_acc}:\n" + stderr + '\n') |
2081 | return 1 | 2185 | return 1 |
... | @@ -2087,7 +2191,7 @@ def work_realign(rfam_acc): | ... | @@ -2087,7 +2191,7 @@ def work_realign(rfam_acc): |
2087 | subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part. | 2191 | subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part. |
2088 | 2192 | ||
2089 | # Assert everything worked, or save an error | 2193 | # Assert everything worked, or save an error |
2090 | - with open(path_to_seq_data + f"realigned/{rfam_acc}++.afa") as output: | 2194 | + with open(path_to_seq_data + f"realigned/{rfam_acc}++.afa", 'r') as output: |
2091 | if not len(output.readline()): | 2195 | if not len(output.readline()): |
2092 | # The process crashed, probably because of RAM overflow | 2196 | # The process crashed, probably because of RAM overflow |
2093 | warn(f"Failed to realign {rfam_acc} (killed)", error=True) | 2197 | warn(f"Failed to realign {rfam_acc} (killed)", error=True) |
... | @@ -2237,7 +2341,7 @@ def work_save(c, homology=True): | ... | @@ -2237,7 +2341,7 @@ def work_save(c, homology=True): |
2237 | NATURAL JOIN nucleotide | 2341 | NATURAL JOIN nucleotide |
2238 | NATURAL JOIN align_column;""", | 2342 | NATURAL JOIN align_column;""", |
2239 | conn) | 2343 | conn) |
2240 | - filename = path_to_3D_data + "datapoints/" + c.chain_label + '.' + c.rfam_fam | 2344 | + filename = path_to_3D_data + "datapoints/" + c.chain_label + '.' + c.mapping.rfam_acc |
2241 | else: | 2345 | else: |
2242 | df = pd.read_sql_query(f""" | 2346 | df = pd.read_sql_query(f""" |
2243 | SELECT index_chain, nt_resnum, nt_position, nt_name, nt_code, nt_align_code, | 2347 | SELECT index_chain, nt_resnum, nt_position, nt_name, nt_code, nt_align_code, |
... | @@ -2280,7 +2384,6 @@ if __name__ == "__main__": | ... | @@ -2280,7 +2384,6 @@ if __name__ == "__main__": |
2280 | pp.dl_and_annotate(coeff_ncores=0.5) | 2384 | pp.dl_and_annotate(coeff_ncores=0.5) |
2281 | 2385 | ||
2282 | # At this point, the structure table is up to date | 2386 | # At this point, the structure table is up to date |
2283 | - | ||
2284 | pp.build_chains(coeff_ncores=1.0) | 2387 | pp.build_chains(coeff_ncores=1.0) |
2285 | 2388 | ||
2286 | if len(pp.to_retry): | 2389 | if len(pp.to_retry): |
... | @@ -2293,7 +2396,8 @@ if __name__ == "__main__": | ... | @@ -2293,7 +2396,8 @@ if __name__ == "__main__": |
2293 | print(f"Among errors, {len(no_nts_set)} structures seem to contain RNA chains without defined nucleotides:", no_nts_set, flush=True) | 2396 | print(f"Among errors, {len(no_nts_set)} structures seem to contain RNA chains without defined nucleotides:", no_nts_set, flush=True) |
2294 | if len(weird_mappings): | 2397 | if len(weird_mappings): |
2295 | print(f"{len(weird_mappings)} mappings to Rfam were taken as absolute positions instead of residue numbers:", weird_mappings, flush=True) | 2398 | print(f"{len(weird_mappings)} mappings to Rfam were taken as absolute positions instead of residue numbers:", weird_mappings, flush=True) |
2296 | - pp.checkpoint_save_chains() | 2399 | + if pp.SELECT_ONLY is None: |
2400 | + pp.checkpoint_save_chains() | ||
2297 | 2401 | ||
2298 | if not pp.HOMOLOGY: | 2402 | if not pp.HOMOLOGY: |
2299 | # Save chains to file | 2403 | # Save chains to file |
... | @@ -2301,7 +2405,7 @@ if __name__ == "__main__": | ... | @@ -2301,7 +2405,7 @@ if __name__ == "__main__": |
2301 | work_save(c, homology=False) | 2405 | work_save(c, homology=False) |
2302 | print("Completed.") | 2406 | print("Completed.") |
2303 | exit(0) | 2407 | exit(0) |
2304 | - | 2408 | + |
2305 | # At this point, structure, chain and nucleotide tables of the database are up to date. | 2409 | # At this point, structure, chain and nucleotide tables of the database are up to date. |
2306 | # (Modulo some statistics computed by statistics.py) | 2410 | # (Modulo some statistics computed by statistics.py) |
2307 | 2411 | ||
... | @@ -2309,15 +2413,16 @@ if __name__ == "__main__": | ... | @@ -2309,15 +2413,16 @@ if __name__ == "__main__": |
2309 | # Homology information | 2413 | # Homology information |
2310 | # =========================================================================== | 2414 | # =========================================================================== |
2311 | 2415 | ||
2312 | - pp.checkpoint_load_chains() # If your job failed, you can comment all the "3D information" part and start from here. | 2416 | + if pp.SELECT_ONLY is None: |
2417 | + pp.checkpoint_load_chains() # If your job failed, you can comment all the "3D information" part and start from here. | ||
2313 | 2418 | ||
2314 | # Get the list of Rfam families found | 2419 | # Get the list of Rfam families found |
2315 | rfam_acc_to_download = {} | 2420 | rfam_acc_to_download = {} |
2316 | for c in pp.loaded_chains: | 2421 | for c in pp.loaded_chains: |
2317 | - if c.rfam_fam not in rfam_acc_to_download: | 2422 | + if c.mapping.rfam_acc not in rfam_acc_to_download: |
2318 | - rfam_acc_to_download[c.rfam_fam] = [ c ] | 2423 | + rfam_acc_to_download[c.mapping.rfam_acc] = [ c ] |
2319 | else: | 2424 | else: |
2320 | - rfam_acc_to_download[c.rfam_fam].append(c) | 2425 | + rfam_acc_to_download[c.mapping.rfam_acc].append(c) |
2321 | print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences") | 2426 | print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences") |
2322 | pp.fam_list = sorted(rfam_acc_to_download.keys()) | 2427 | pp.fam_list = sorted(rfam_acc_to_download.keys()) |
2323 | 2428 | ... | ... |
... | @@ -3,7 +3,7 @@ | ... | @@ -3,7 +3,7 @@ |
3 | # Run RNANet | 3 | # Run RNANet |
4 | cd /home/lbecquey/Projects/RNANet; | 4 | cd /home/lbecquey/Projects/RNANet; |
5 | rm -f stdout.txt stderr.txt errors.txt; | 5 | rm -f stdout.txt stderr.txt errors.txt; |
6 | -time './RNAnet.py --3d-folder /home/lbequey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/' > stdout.txt 2> stderr.txt; | 6 | +time './RNAnet.py --3d-folder /home/lbequey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -s -r 20.0' > stdout.txt 2> stderr.txt; |
7 | 7 | ||
8 | # Sync in Seafile | 8 | # Sync in Seafile |
9 | seaf-cli start; | 9 | seaf-cli start; | ... | ... |
... | @@ -433,6 +433,7 @@ def to_dist_matrix(f): | ... | @@ -433,6 +433,7 @@ def to_dist_matrix(f): |
433 | notify(f"Computed {f} distance matrix", "loaded from file") | 433 | notify(f"Computed {f} distance matrix", "loaded from file") |
434 | return 0 | 434 | return 0 |
435 | 435 | ||
436 | + notify(f"Computing {f} distance matrix from alignment...") | ||
436 | dm = DistanceCalculator('identity') | 437 | dm = DistanceCalculator('identity') |
437 | with open(path_to_seq_data+"/realigned/"+f+"++.afa") as al_file: | 438 | with open(path_to_seq_data+"/realigned/"+f+"++.afa") as al_file: |
438 | al = AlignIO.read(al_file, "fasta")[-len(mappings_list[f]):] | 439 | al = AlignIO.read(al_file, "fasta")[-len(mappings_list[f]):] |
... | @@ -542,10 +543,10 @@ if __name__ == "__main__": | ... | @@ -542,10 +543,10 @@ if __name__ == "__main__": |
542 | threads = [ | 543 | threads = [ |
543 | th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 1}), | 544 | th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 1}), |
544 | th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 4}), | 545 | th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 4}), |
545 | - # th.Thread(target=stats_len), # computes figures | 546 | + th.Thread(target=stats_len), # computes figures |
546 | - # th.Thread(target=stats_freq), # Updates the database | 547 | + th.Thread(target=stats_freq), # Updates the database |
547 | - # th.Thread(target=seq_idty), # produces .npy files and seq idty figures | 548 | + th.Thread(target=seq_idty), # produces .npy files and seq idty figures |
548 | - # th.Thread(target=per_chain_stats) # Updates the database | 549 | + th.Thread(target=per_chain_stats) # Updates the database |
549 | ] | 550 | ] |
550 | 551 | ||
551 | # Start the threads | 552 | # Start the threads | ... | ... |
-
Please register or login to post a comment