Louis BECQUEY

Nt-specific Biopython PortionSelector

...@@ -3,7 +3,8 @@ nohup.out ...@@ -3,7 +3,8 @@ nohup.out
3 log_of_the_run.sh 3 log_of_the_run.sh
4 4
5 # results 5 # results
6 -results/ 6 +results/*
7 +logs/*
7 8
8 # temporary results files 9 # temporary results files
9 data/ 10 data/
......
...@@ -44,16 +44,15 @@ SSU_set = {"RF00177", "RF02542", "RF02545", "RF01959", "RF01960"} # From Rfam ...@@ -44,16 +44,15 @@ SSU_set = {"RF00177", "RF02542", "RF02545", "RF01959", "RF01960"} # From Rfam
44 no_nts_set = set() 44 no_nts_set = set()
45 weird_mappings = set() 45 weird_mappings = set()
46 46
47 -class NtPortionSelector(object): 47 +class SelectivePortionSelector(object):
48 """Class passed to MMCIFIO to select some chain portions in an MMCIF file. 48 """Class passed to MMCIFIO to select some chain portions in an MMCIF file.
49 49
50 Validates every chain, residue, nucleotide, to say if it is in the selection or not. 50 Validates every chain, residue, nucleotide, to say if it is in the selection or not.
51 """ 51 """
52 52
53 - def __init__(self, model_id, chain_id, start, end, khetatm): 53 + def __init__(self, model_id, chain_id, valid_resnums, khetatm):
54 self.chain_id = chain_id 54 self.chain_id = chain_id
55 - self.start = start 55 + self.resnums = valid_resnums
56 - self.end = end
57 self.pdb_model_id = model_id 56 self.pdb_model_id = model_id
58 self.hydrogen_regex = re.compile("[123 ]*H.*") 57 self.hydrogen_regex = re.compile("[123 ]*H.*")
59 self.keep_hetatm = khetatm 58 self.keep_hetatm = khetatm
...@@ -77,7 +76,10 @@ class NtPortionSelector(object): ...@@ -77,7 +76,10 @@ class NtPortionSelector(object):
77 # warn(f"icode {icode} at position {resseq}\t\t") 76 # warn(f"icode {icode} at position {resseq}\t\t")
78 77
79 # Accept the residue if it is in the right interval: 78 # Accept the residue if it is in the right interval:
80 - return int(self.start <= resseq <= self.end) 79 + if len(self.resnums):
80 + return int(resseq in self.resnums)
81 + else:
82 + return 1
81 83
82 def accept_atom(self, atom): 84 def accept_atom(self, atom):
83 85
...@@ -128,13 +130,12 @@ class Chain: ...@@ -128,13 +130,12 @@ class Chain:
128 self.pdb_id = pdb_id # PDB ID 130 self.pdb_id = pdb_id # PDB ID
129 self.pdb_model = int(pdb_model) # model ID, starting at 1 131 self.pdb_model = int(pdb_model) # model ID, starting at 1
130 self.pdb_chain_id = pdb_chain_id # chain ID (mmCIF), multiple letters 132 self.pdb_chain_id = pdb_chain_id # chain ID (mmCIF), multiple letters
131 - self.pdb_start = pdb_start # if portion of chain, the start number (relative to the chain, not residue numbers) 133 + if len(rfam):
132 - self.pdb_end = pdb_end # if portion of chain, the start number (relative to the chain, not residue numbers) 134 + self.mapping = Mapping(chain_label, rfam, pdb_start, pdb_end, inferred)
133 - self.reversed = (pdb_start > pdb_end) if pdb_start is not None else False # wether pdb_start > pdb_end in the Rfam mapping 135 + else:
136 + self.mapping = None
134 self.chain_label = chain_label # chain pretty name 137 self.chain_label = chain_label # chain pretty name
135 self.file = "" # path to the 3D PDB file 138 self.file = "" # path to the 3D PDB file
136 - self.rfam_fam = rfam # mapping to an RNA family
137 - self.inferred = inferred # Wether this mapping has been inferred from BGSU's NR list
138 self.seq = "" # sequence with modified nts 139 self.seq = "" # sequence with modified nts
139 self.seq_to_align = "" # sequence with modified nts replaced, but gaps can exist 140 self.seq_to_align = "" # sequence with modified nts replaced, but gaps can exist
140 self.length = -1 # length of the sequence (missing residues are not counted) 141 self.length = -1 # length of the sequence (missing residues are not counted)
...@@ -156,8 +157,8 @@ class Chain: ...@@ -156,8 +157,8 @@ class Chain:
156 """ Extract the part which is mapped to Rfam from the main CIF file and save it to another file. 157 """ Extract the part which is mapped to Rfam from the main CIF file and save it to another file.
157 """ 158 """
158 159
159 - if self.pdb_start is not None and (self.pdb_end - self.pdb_start): 160 + if self.mapping is not None:
160 - status = f"Extract {self.pdb_start}-{self.pdb_end} atoms from {self.pdb_id}-{self.pdb_chain_id}" 161 + status = f"Extract {self.mapping.nt_start}-{self.mapping.nt_end} atoms from {self.pdb_id}-{self.pdb_chain_id}"
161 self.file = path_to_3D_data+"rna_mapped_to_Rfam/"+self.chain_label+".cif" 162 self.file = path_to_3D_data+"rna_mapped_to_Rfam/"+self.chain_label+".cif"
162 else: 163 else:
163 status = f"Extract {self.pdb_id}-{self.pdb_chain_id}" 164 status = f"Extract {self.pdb_id}-{self.pdb_chain_id}"
...@@ -183,36 +184,19 @@ class Chain: ...@@ -183,36 +184,19 @@ class Chain:
183 # Extract the desired chain 184 # Extract the desired chain
184 c = s[model_idx][self.pdb_chain_id] 185 c = s[model_idx][self.pdb_chain_id]
185 186
186 - if self.pdb_start is not None and (self.pdb_end - self.pdb_start): 187 + if self.mapping is not None:
187 - # # Pay attention to residue numbering 188 + valid_set = set(self.mapping.old_nt_resnums)
188 - # first_number = c.child_list[0].get_id()[1] # the chain's first residue is numbered 'first_number'
189 - # if self.pdb_start < self.pdb_end:
190 - # start = self.pdb_start + first_number - 1 # shift our start_position by 'first_number'
191 - # end = self.pdb_end + first_number - 1 # same for the end position
192 - # else:
193 - # self.reversed = True # the 3D chain is numbered backwards compared to the Rfam family
194 - # end = self.pdb_start + first_number - 1
195 - # start = self.pdb_end + first_number - 1
196 -
197 - if self.pdb_start < self.pdb_end:
198 - start = self.pdb_start # shift our start_position by 'first_number'
199 - end = self.pdb_end # same for the end position
200 - else:
201 - self.reversed = True # the 3D chain is numbered backwards compared to the Rfam family
202 - end = self.pdb_start
203 - start = self.pdb_end
204 else: 189 else:
205 - start = c.child_list[0].get_id()[1] # the chain's first residue is numbered 'first_number' 190 + valid_set = set()
206 - end = c.child_list[-1].get_id()[1] # the chain's last residue number 191 +
207 -
208 # Define a selection 192 # Define a selection
209 - # sel = ChainSelector(self.pdb_chain_id, start, end, model_id = model_idx) 193 + sel = SelectivePortionSelector(model_idx, self.pdb_chain_id, valid_set, khetatm)
210 - sel = NtPortionSelector(model_idx, self.pdb_chain_id, start, end, khetatm)
211 194
212 # Save that selection on the mmCIF object s to file 195 # Save that selection on the mmCIF object s to file
213 ioobj = MMCIFIO() 196 ioobj = MMCIFIO()
214 ioobj.set_structure(s) 197 ioobj.set_structure(s)
215 ioobj.save(self.file, sel) 198 ioobj.save(self.file, sel)
199 +
216 200
217 notify(status) 201 notify(status)
218 202
...@@ -225,7 +209,7 @@ class Chain: ...@@ -225,7 +209,7 @@ class Chain:
225 try: 209 try:
226 with open(path_to_3D_data + "annotations/" + self.pdb_id + ".json", 'r') as json_file: 210 with open(path_to_3D_data + "annotations/" + self.pdb_id + ".json", 'r') as json_file:
227 json_object = json.load(json_file) 211 json_object = json.load(json_file)
228 - notify(f"Read {self.chain_label} DSSR annotations") 212 + notify(f"Read {self.pdb_id} DSSR annotations")
229 except json.decoder.JSONDecodeError as e: 213 except json.decoder.JSONDecodeError as e:
230 warn("Could not load "+self.pdb_id+f".json with JSON package: {e}", error=True) 214 warn("Could not load "+self.pdb_id+f".json with JSON package: {e}", error=True)
231 self.delete_me = True 215 self.delete_me = True
...@@ -275,87 +259,87 @@ class Chain: ...@@ -275,87 +259,87 @@ class Chain:
275 return None 259 return None
276 260
277 ############################################# 261 #############################################
278 - # Solve some common issues and drop ligands 262 + # Select the nucleotides we need
279 ############################################# 263 #############################################
280 264
281 - # Shift numbering when duplicate residue numbers are found. 265 +
266 + # Remove nucleotides of the chain that are outside the Rfam mapping, if any
267 + if self.mapping is not None:
268 + df = self.mapping.filter_df(df)
269 +
270 + # Duplicate residue numbers : shift numbering
282 # Example: 4v9q-DV contains 17 and 17A which are both read 17 by DSSR. 271 # Example: 4v9q-DV contains 17 and 17A which are both read 17 by DSSR.
283 while True in df.duplicated(['nt_resnum']).values: 272 while True in df.duplicated(['nt_resnum']).values:
284 i = df.duplicated(['nt_resnum']).values.tolist().index(True) 273 i = df.duplicated(['nt_resnum']).values.tolist().index(True)
274 + self.mapping.shift_resnum_range(i)
285 df.iloc[i:, 1] += 1 275 df.iloc[i:, 1] += 1
286 276
277 + # Search for ligands at the end of the selection
287 # Drop ligands detected as residues by DSSR, by detecting several markers 278 # Drop ligands detected as residues by DSSR, by detecting several markers
288 - df = df.drop_duplicates("index_chain", keep="first") # drop doublons in index_chain 279 + while ( len(df.index_chain) and df.iloc[-1,2] not in ["A", "C", "G", "U"] and (
289 - while (len(df.index_chain) and df.iloc[[-1]].nt_name.tolist()[0] not in ["A", "C", "G", "U"] and 280 + (df.iloc[[-1]][["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "v0", "v1", "v2", "v3", "v4"]].isna().values).all()
290 - ((df.iloc[[-1]][["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "v0", "v1", "v2", "v3", "v4"]].isna().values).all() 281 + or (df.iloc[[-1]].puckering=='').any()
291 - or (df.iloc[[-1]].puckering=='').any()) 282 + )
292 - or (len(df.index_chain) >= 2 and df.iloc[[-1]].nt_resnum.iloc[0] > 50 + df.iloc[[-2]].nt_resnum.iloc[0])): 283 + or ( len(df.index_chain) >= 2 and df.iloc[-1,1] > 50 + df.iloc[-2,1] )
284 + or ( len(df.index_chain) and df.iloc[-1,2] in ["GNG", "E2C", "OHX", "IRI", "MPD", "8UZ"] )
285 + ):
286 + if self.mapping is not None:
287 + self.mapping.drop_ligand(df.tail(1))
293 df = df.head(-1) 288 df = df.head(-1)
294 289
295 - # drop eventual nts with index_chain < the first residue (usually, ligands) 290 + # Duplicates in index_chain : drop, they are ligands
296 - df = df.drop(df[df.index_chain < 0].index) 291 + # e.g. 3iwn_1_B_1-91, ligand C2E has index_chain 1 (and nt_resnum 601)
297 - 292 + duplicates = [ index for index, element in enumerate(df.duplicated(['index_chain']).values) if element ]
298 - # Assert some nucleotides still exist 293 + if len(duplicates):
299 - try: 294 + for i in duplicates:
300 - l = df.iloc[-1,1] - df.iloc[0,1] + 1 # length of chain from nt_resnum point of view 295 + warn(f"Found duplicated index_chain {df.iloc[i,0]} in {self.chain_label}. Keeping only the first.")
301 - except IndexError: 296 + if self.mapping is not None:
302 - warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Ignoring chain {self.chain_label}.", error=True) 297 + self.mapping.log(f"Found duplicated index_chain {df.iloc[i,0]}. Keeping only the first.")
303 - no_nts_set.add(self.pdb_id) 298 + with open("duplicates.txt", "a") as f:
304 - self.delete_me = True 299 + f.write(f"DEBUG: {self.chain_label} has duplicate index_chains !\n")
305 - self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. We expect a problem with {self.pdb_id} mmCIF download. Delete it and retry." 300 + df = df.drop_duplicates("index_chain", keep="first") # drop doublons in index_chain
306 - return None 301 +
307 - 302 + # drop eventual nts with index_chain < the first residue,
308 - # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one 303 + # now negative because we renumber to 1 (usually, ligands)
309 - if df.iloc[0,0] != 1: 304 + ligands = df[df.index_chain < 0]
310 - st = df.iloc[0,0] -1 305 + if len(ligands.index_chain):
311 - df.iloc[:, 0] -= st 306 + if self.mapping is not None:
312 - 307 + for line in ligands.iterrows():
313 - 308 + self.mapping.drop_ligand(line)
314 - # Find missing index_chain values because of resolved nucleotides that have a strange nt_resnum value 309 + df = df.drop(ligands.index)
310 +
311 + # Find missing index_chain values
312 + # This happens because of resolved nucleotides that have a
313 + # strange nt_resnum value
315 # e.g. 4v49-AA, position 5'- 1003 -> 2003 -> 1004 - 3' 314 # e.g. 4v49-AA, position 5'- 1003 -> 2003 -> 1004 - 3'
316 diff = set(range(df.shape[0])).difference(df['index_chain'] - 1) 315 diff = set(range(df.shape[0])).difference(df['index_chain'] - 1)
317 - for i in sorted(diff): 316 + if len(diff):
318 - # check if a nucleotide numbered +1000 exists 317 + warn(f"Missing residues regarding index_chain: {[1+i for i in sorted(diff)]}")
319 - looked_for = df[df.index_chain == i].nt_resnum.values[0] 318 + for i in sorted(diff):
320 - found = None 319 + # check if a nucleotide numbered +1000 exists in the nts object
321 - for nt in nts: 320 + found = None
322 - if nt['chain_name'] != self.pdb_chain_id: 321 + for nt in nts: # nts is the object from the loaded JSON and contains all nts
323 - continue 322 + if nt['chain_name'] != self.pdb_chain_id:
324 - if nt['index_chain'] == i + 1 : 323 + continue
325 - found = nt 324 + if nt['index_chain'] == i + 1 :
326 - break 325 + found = nt
327 - if found: 326 + break
328 - df_row = pd.DataFrame([found], index=[i])[df.columns.values] 327 + if found:
329 - df_row.iloc[0,1] = df.iloc[i,1] 328 + df_row = pd.DataFrame([found], index=[i])[df.columns.values]
330 - df = pd.concat([ df.iloc[:i], df_row, df.iloc[i:] ]) 329 + if self.mapping is not None:
331 - df.iloc[i+1:, 1] += 1 330 + self.mapping.insert_new(i+1, found['nt_resnum'], df.iloc[i,1])
332 - else: 331 + df_row.iloc[0,1] = df.iloc[i,1]
333 - warn(f"Missing index_chain {i} in {self.chain_label} !") 332 + df = pd.concat([ df.iloc[:i], df_row, df.iloc[i:] ])
334 - 333 + df.iloc[i+1:, 1] += 1
335 - # Remove nucleotides of the chain that are outside the Rfam mapping, if any 334 + else:
336 - if self.pdb_start and self.pdb_end: 335 + warn(f"Missing index_chain {i} in {self.chain_label} !")
337 - if self.pdb_start < self.pdb_end:
338 - newdf = df.drop(df[(df.nt_resnum < self.pdb_start) | (df.nt_resnum > self.pdb_end)].index)
339 - else:
340 - newdf = df.drop(df[(df.nt_resnum < self.pdb_end) | (df.nt_resnum > self.pdb_start)].index)
341 336
342 - if len(newdf.index_chain) > 0: 337 + # Assert some nucleotides still exist
343 - # everything's okay
344 - df = newdf
345 - else:
346 - # There were nucleotides in this chain but we removed them all while
347 - # filtering the ones outside the Rfam mapping.
348 - # This probably means that, for this chain, the mapping is relative to
349 - # index_chain and not nt_resnum.
350 - warn(f"Assuming {self.chain_label}'s mapping to {self.rfam_fam} is an absolute position interval.")
351 - weird_mappings.add(self.chain_label + "." + self.rfam_fam)
352 - df = df.drop(df[(df.index_chain < self.pdb_start) | (df.index_chain > self.pdb_end)].index)
353 -
354 try: 338 try:
355 l = df.iloc[-1,1] - df.iloc[0,1] + 1 # update length of chain from nt_resnum point of view 339 l = df.iloc[-1,1] - df.iloc[0,1] + 1 # update length of chain from nt_resnum point of view
356 except IndexError: 340 except IndexError:
357 - warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} between {self.pdb_start} and " 341 + warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} between {self.mapping.nt_start} and "
358 - f"{self.pdb_end} ({'not ' if not self.inferred else ''}inferred). Ignoring chain {self.chain_label}.") 342 + f"{self.mapping.nt_end} ({'not ' if not self.mapping.inferred else ''}inferred). Ignoring chain {self.chain_label}.")
359 no_nts_set.add(self.pdb_id) 343 no_nts_set.add(self.pdb_id)
360 self.delete_me = True 344 self.delete_me = True
361 self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Either there is a problem with {self.pdb_id} mmCIF download, or the bases are not resolved in the structure. Delete it and retry." 345 self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Either there is a problem with {self.pdb_id} mmCIF download, or the bases are not resolved in the structure. Delete it and retry."
...@@ -464,7 +448,7 @@ class Chain: ...@@ -464,7 +448,7 @@ class Chain:
464 df['nb_interact'] = interacts 448 df['nb_interact'] = interacts
465 df = df.drop(['nt_id'], axis=1) # remove now useless descriptors 449 df = df.drop(['nt_id'], axis=1) # remove now useless descriptors
466 450
467 - if self.reversed: 451 + if self.mapping.reversed:
468 # The 3D structure is numbered from 3' to 5' instead of standard 5' to 3' 452 # The 3D structure is numbered from 3' to 5' instead of standard 5' to 3'
469 # or the sequence that matches the Rfam family is 3' to 5' instead of standard 5' to 3'. 453 # or the sequence that matches the Rfam family is 3' to 5' instead of standard 5' to 3'.
470 # Anyways, you need to invert the angles. 454 # Anyways, you need to invert the angles.
...@@ -507,6 +491,10 @@ class Chain: ...@@ -507,6 +491,10 @@ class Chain:
507 self.error_messages = "Sequence is too short. (< 5 resolved nts)" 491 self.error_messages = "Sequence is too short. (< 5 resolved nts)"
508 return None 492 return None
509 493
494 + # Log chain info to file
495 + if self.mapping is not None:
496 + self.mapping.to_file(self.chain_label+".log")
497 +
510 return df 498 return df
511 499
512 def register_chain(self, df): 500 def register_chain(self, df):
...@@ -515,7 +503,7 @@ class Chain: ...@@ -515,7 +503,7 @@ class Chain:
515 503
516 with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: 504 with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn:
517 # Register the chain in table chain 505 # Register the chain in table chain
518 - if self.pdb_start is not None: 506 + if self.mapping.nt_start is not None:
519 sql_execute(conn, f""" INSERT INTO chain 507 sql_execute(conn, f""" INSERT INTO chain
520 (structure_id, chain_name, pdb_start, pdb_end, reversed, rfam_acc, inferred, issue) 508 (structure_id, chain_name, pdb_start, pdb_end, reversed, rfam_acc, inferred, issue)
521 VALUES 509 VALUES
...@@ -527,14 +515,14 @@ class Chain: ...@@ -527,14 +515,14 @@ class Chain:
527 inferred=excluded.inferred, 515 inferred=excluded.inferred,
528 issue=excluded.issue;""", 516 issue=excluded.issue;""",
529 data=(str(self.pdb_id), str(self.pdb_chain_id), 517 data=(str(self.pdb_id), str(self.pdb_chain_id),
530 - int(self.pdb_start), int(self.pdb_end), 518 + int(self.mapping.nt_start), int(self.mapping.nt_end),
531 - int(self.reversed), str(self.rfam_fam), 519 + int(self.mapping.reversed), str(self.mapping.rfam_acc),
532 - int(self.inferred), int(self.delete_me))) 520 + int(self.mapping.inferred), int(self.delete_me)))
533 # get the chain id 521 # get the chain id
534 self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain 522 self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain
535 WHERE structure_id='{self.pdb_id}' 523 WHERE structure_id='{self.pdb_id}'
536 AND chain_name='{self.pdb_chain_id}' 524 AND chain_name='{self.pdb_chain_id}'
537 - AND rfam_acc='{self.rfam_fam}';""")[0][0] 525 + AND rfam_acc='{self.mapping.rfam_acc}';""")[0][0]
538 else: 526 else:
539 sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, issue) VALUES (?, ?, NULL, ?) 527 sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, issue) VALUES (?, ?, NULL, ?)
540 ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue;""", 528 ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue;""",
...@@ -886,6 +874,103 @@ class Downloader: ...@@ -886,6 +874,103 @@ class Downloader:
886 notify(f"Downloaded and extracted {unit} database from SILVA", "used previous file") 874 notify(f"Downloaded and extracted {unit} database from SILVA", "used previous file")
887 875
888 876
877 +class Mapping:
878 + """
879 + A custom class to store more information about nucleotide mappings.
880 + """
881 +
882 + def __init__(self, chain_label, rfam_acc, pdb_start, pdb_end, inferred):
883 + """
884 + Arguments:
885 + rfam_acc : Rfam family accession number of the mapping
886 + pdb_start/pdb_end : nt_resnum start and end values in the 3D data that are mapped to the family
887 + inferred : wether the mapping has been inferred using BGSU's NR list
888 + """
889 + self.chain_label = chain_label
890 + self.rfam_acc = rfam_acc
891 + self.nt_start = pdb_start # nt_resnum numbering
892 + self.nt_end = pdb_end # nt_resnum numbering
893 + self.reversed = (pdb_start > pdb_end)
894 + self.inferred = inferred
895 + self.interval = range(pdb_start, pdb_end+1)
896 +
897 + self.old_nt_resnums = [] # to be computed
898 + self.new_nt_resnums = [] #
899 +
900 + self.logs = [] # Events are logged when modifying the mapping
901 +
902 + def shift_resnum_range(self, i):
903 + self.log(f"Shifting nt_resnum numbering because of duplicate residue {self.new_nt_resnums[i]}")
904 + for j in range(i, len(self.new_nt_resnums)):
905 + self.new_nt_resnums[j] += 1
906 +
907 + def insert_new(self, index_chain, oldresnum, newresnum):
908 + # Adds a nt that did not passed the mapping filter at first
909 + # because it was numbered with a very high nt_resnum value (outside the bounds of the mapping)
910 + # But, in practice, its index_chain is correct and in the bounds and it belongs to the mapped chain.
911 + # Those nts are only searched if there are missing index_chain values in the mapping bounds.
912 +
913 + # insert the nt_resnum values in the lists
914 + self.old_nt_resnums.insert(index_chain-1, oldresnum)
915 + self.new_nt_resnums.insert(index_chain-1, newresnum)
916 +
917 + # shift the new_nt_resnum values if needed, to avoid creating a doublon
918 + if self.new_nt_resnums[index_chain-1] == self.new_nt_resnums[index_chain]:
919 + for j in range(index_chain, len(self.new_nt_resnums)):
920 + self.new_nt_resnums[j] += 1
921 + # warn(f"Residue {index_chain} has been saved and renumbered {newresnum} instead of {oldresnum}")
922 + self.log(f"Residue {index_chain} has been saved and renumbered {newresnum} instead of {oldresnum}")
923 +
924 + def filter_df(self, df):
925 + if not self.reversed:
926 + newdf = df.drop(df[(df.nt_resnum < self.nt_start) | (df.nt_resnum > self.nt_end)].index)
927 + else:
928 + newdf = df.drop(df[(df.nt_resnum < self.nt_end) | (df.nt_resnum > self.nt_start)].index)
929 +
930 + if len(newdf.index_chain) > 0:
931 + # everything's okay
932 + df = newdf
933 + else:
934 + # There were nucleotides in this chain but we removed them all while
935 + # filtering the ones outside the Rfam mapping.
936 + # This probably means that, for this chain, the mapping is relative to
937 + # index_chain and not nt_resnum.
938 + warn(f"Assuming mapping to {self.rfam_acc} is an absolute position interval.")
939 + weird_mappings.add(self.chain_label + "." + self.rfam_acc)
940 + df = df.drop(df[(df.index_chain < self.nt_start) | (df.index_chain > self.nt_end)].index)
941 +
942 +
943 + # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one
944 + if len(df.index_chain) and df.iloc[0,0] != 1:
945 + st = df.iloc[0,0] -1
946 + df.iloc[:, 0] -= st
947 +
948 + self.old_nt_resnums = df.nt_resnum.tolist()
949 + self.new_nt_resnums = df.nt_resnum.tolist()
950 +
951 + return df
952 +
953 + def drop_ligand(self, df_row):
954 + self.log("Droping ligand:")
955 + self.log(df_row)
956 + i = self.new_nt_resnums.index(df_row.iloc[0,1])
957 + self.old_nt_resnums.pop(i)
958 + self.new_nt_resnums.pop(i)
959 +
960 + def log(self, message):
961 + if isinstance(message, str):
962 + self.logs.append(message+'\n')
963 + else:
964 + self.logs.append(str(message))
965 +
966 + def to_file(self, filename):
967 + if not path.exists("logs"):
968 + os.makedirs("logs", exist_ok=True)
969 + with open("logs/"+filename, "w") as f:
970 + f.writelines(self.logs)
971 +
972 +
973 +
889 class Pipeline: 974 class Pipeline:
890 def __init__(self): 975 def __init__(self):
891 self.dl = Downloader() 976 self.dl = Downloader()
...@@ -1928,6 +2013,24 @@ def work_prepare_sequences(dl, rfam_acc, chains): ...@@ -1928,6 +2013,24 @@ def work_prepare_sequences(dl, rfam_acc, chains):
1928 """Prepares FASTA files of homologous sequences to realign with cmalign or SINA.""" 2013 """Prepares FASTA files of homologous sequences to realign with cmalign or SINA."""
1929 2014
1930 if rfam_acc in LSU_set | SSU_set: # rRNA 2015 if rfam_acc in LSU_set | SSU_set: # rRNA
2016 + if path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"):
2017 + # Detect doublons and remove them
2018 + existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta")
2019 + existing_ids = [ r.id for r in existing_afa ]
2020 + del existing_afa
2021 + new_ids = [ str(c) for c in chains ]
2022 + doublons = [ i for i in existing_ids if i in new_ids ]
2023 + del existing_ids, new_ids
2024 + if len(doublons):
2025 + fasta = path_to_seq_data + f"realigned/{rfam_acc}++.fa"
2026 + warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.fa and using their newest version")
2027 + seqfile = SeqIO.parse(fasta, "fasta")
2028 + os.remove(fasta)
2029 + with open(fasta, 'w') as f:
2030 + for rec in seqfile:
2031 + if rec.id not in doublons:
2032 + f.write(rec.format("fasta"))
2033 +
1931 # Add the new sequences with previous ones, if any 2034 # Add the new sequences with previous ones, if any
1932 with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "a") as f: 2035 with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "a") as f:
1933 for c in chains: 2036 for c in chains:
...@@ -2037,7 +2140,7 @@ def work_realign(rfam_acc): ...@@ -2037,7 +2140,7 @@ def work_realign(rfam_acc):
2037 existing_stk = AlignIO.read(existing_ali_path, "stockholm") 2140 existing_stk = AlignIO.read(existing_ali_path, "stockholm")
2038 existing_ids = [ r.id for r in existing_stk ] 2141 existing_ids = [ r.id for r in existing_stk ]
2039 del existing_stk 2142 del existing_stk
2040 - new_stk = AlignIO.read(new_ali_path, "stk") 2143 + new_stk = AlignIO.read(new_ali_path, "stockholm")
2041 new_ids = [ r.id for r in new_stk ] 2144 new_ids = [ r.id for r in new_stk ]
2042 del new_stk 2145 del new_stk
2043 doublons = [ i for i in existing_ids if i in new_ids ] 2146 doublons = [ i for i in existing_ids if i in new_ids ]
...@@ -2046,8 +2149,9 @@ def work_realign(rfam_acc): ...@@ -2046,8 +2149,9 @@ def work_realign(rfam_acc):
2046 warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version") 2149 warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version")
2047 with open(path_to_seq_data + "realigned/toremove.txt", "w") as toremove: 2150 with open(path_to_seq_data + "realigned/toremove.txt", "w") as toremove:
2048 toremove.write('\n'.join(doublons)+'\n') 2151 toremove.write('\n'.join(doublons)+'\n')
2049 - p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path], 2152 + p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path],
2050 stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) 2153 stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2154 + p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path], stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2051 os.remove(path_to_seq_data + "realigned/toremove.txt") 2155 os.remove(path_to_seq_data + "realigned/toremove.txt")
2052 2156
2053 # And we merge the two alignments 2157 # And we merge the two alignments
...@@ -2075,7 +2179,7 @@ def work_realign(rfam_acc): ...@@ -2075,7 +2179,7 @@ def work_realign(rfam_acc):
2075 2179
2076 if len(stderr): 2180 if len(stderr):
2077 print('', flush=True) 2181 print('', flush=True)
2078 - warn(f"Error while during sequence alignment: {stderr}", error=True) 2182 + warn(f"Error during sequence alignment: {stderr}", error=True)
2079 with open(runDir + "/errors.txt", "a") as er: 2183 with open(runDir + "/errors.txt", "a") as er:
2080 er.write(f"Attempting to realign {rfam_acc}:\n" + stderr + '\n') 2184 er.write(f"Attempting to realign {rfam_acc}:\n" + stderr + '\n')
2081 return 1 2185 return 1
...@@ -2087,7 +2191,7 @@ def work_realign(rfam_acc): ...@@ -2087,7 +2191,7 @@ def work_realign(rfam_acc):
2087 subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part. 2191 subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part.
2088 2192
2089 # Assert everything worked, or save an error 2193 # Assert everything worked, or save an error
2090 - with open(path_to_seq_data + f"realigned/{rfam_acc}++.afa") as output: 2194 + with open(path_to_seq_data + f"realigned/{rfam_acc}++.afa", 'r') as output:
2091 if not len(output.readline()): 2195 if not len(output.readline()):
2092 # The process crashed, probably because of RAM overflow 2196 # The process crashed, probably because of RAM overflow
2093 warn(f"Failed to realign {rfam_acc} (killed)", error=True) 2197 warn(f"Failed to realign {rfam_acc} (killed)", error=True)
...@@ -2237,7 +2341,7 @@ def work_save(c, homology=True): ...@@ -2237,7 +2341,7 @@ def work_save(c, homology=True):
2237 NATURAL JOIN nucleotide 2341 NATURAL JOIN nucleotide
2238 NATURAL JOIN align_column;""", 2342 NATURAL JOIN align_column;""",
2239 conn) 2343 conn)
2240 - filename = path_to_3D_data + "datapoints/" + c.chain_label + '.' + c.rfam_fam 2344 + filename = path_to_3D_data + "datapoints/" + c.chain_label + '.' + c.mapping.rfam_acc
2241 else: 2345 else:
2242 df = pd.read_sql_query(f""" 2346 df = pd.read_sql_query(f"""
2243 SELECT index_chain, nt_resnum, nt_position, nt_name, nt_code, nt_align_code, 2347 SELECT index_chain, nt_resnum, nt_position, nt_name, nt_code, nt_align_code,
...@@ -2280,7 +2384,6 @@ if __name__ == "__main__": ...@@ -2280,7 +2384,6 @@ if __name__ == "__main__":
2280 pp.dl_and_annotate(coeff_ncores=0.5) 2384 pp.dl_and_annotate(coeff_ncores=0.5)
2281 2385
2282 # At this point, the structure table is up to date 2386 # At this point, the structure table is up to date
2283 -
2284 pp.build_chains(coeff_ncores=1.0) 2387 pp.build_chains(coeff_ncores=1.0)
2285 2388
2286 if len(pp.to_retry): 2389 if len(pp.to_retry):
...@@ -2293,7 +2396,8 @@ if __name__ == "__main__": ...@@ -2293,7 +2396,8 @@ if __name__ == "__main__":
2293 print(f"Among errors, {len(no_nts_set)} structures seem to contain RNA chains without defined nucleotides:", no_nts_set, flush=True) 2396 print(f"Among errors, {len(no_nts_set)} structures seem to contain RNA chains without defined nucleotides:", no_nts_set, flush=True)
2294 if len(weird_mappings): 2397 if len(weird_mappings):
2295 print(f"{len(weird_mappings)} mappings to Rfam were taken as absolute positions instead of residue numbers:", weird_mappings, flush=True) 2398 print(f"{len(weird_mappings)} mappings to Rfam were taken as absolute positions instead of residue numbers:", weird_mappings, flush=True)
2296 - pp.checkpoint_save_chains() 2399 + if pp.SELECT_ONLY is None:
2400 + pp.checkpoint_save_chains()
2297 2401
2298 if not pp.HOMOLOGY: 2402 if not pp.HOMOLOGY:
2299 # Save chains to file 2403 # Save chains to file
...@@ -2301,7 +2405,7 @@ if __name__ == "__main__": ...@@ -2301,7 +2405,7 @@ if __name__ == "__main__":
2301 work_save(c, homology=False) 2405 work_save(c, homology=False)
2302 print("Completed.") 2406 print("Completed.")
2303 exit(0) 2407 exit(0)
2304 - 2408 +
2305 # At this point, structure, chain and nucleotide tables of the database are up to date. 2409 # At this point, structure, chain and nucleotide tables of the database are up to date.
2306 # (Modulo some statistics computed by statistics.py) 2410 # (Modulo some statistics computed by statistics.py)
2307 2411
...@@ -2309,15 +2413,16 @@ if __name__ == "__main__": ...@@ -2309,15 +2413,16 @@ if __name__ == "__main__":
2309 # Homology information 2413 # Homology information
2310 # =========================================================================== 2414 # ===========================================================================
2311 2415
2312 - pp.checkpoint_load_chains() # If your job failed, you can comment all the "3D information" part and start from here. 2416 + if pp.SELECT_ONLY is None:
2417 + pp.checkpoint_load_chains() # If your job failed, you can comment all the "3D information" part and start from here.
2313 2418
2314 # Get the list of Rfam families found 2419 # Get the list of Rfam families found
2315 rfam_acc_to_download = {} 2420 rfam_acc_to_download = {}
2316 for c in pp.loaded_chains: 2421 for c in pp.loaded_chains:
2317 - if c.rfam_fam not in rfam_acc_to_download: 2422 + if c.mapping.rfam_acc not in rfam_acc_to_download:
2318 - rfam_acc_to_download[c.rfam_fam] = [ c ] 2423 + rfam_acc_to_download[c.mapping.rfam_acc] = [ c ]
2319 else: 2424 else:
2320 - rfam_acc_to_download[c.rfam_fam].append(c) 2425 + rfam_acc_to_download[c.mapping.rfam_acc].append(c)
2321 print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences") 2426 print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences")
2322 pp.fam_list = sorted(rfam_acc_to_download.keys()) 2427 pp.fam_list = sorted(rfam_acc_to_download.keys())
2323 2428
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
3 # Run RNANet 3 # Run RNANet
4 cd /home/lbecquey/Projects/RNANet; 4 cd /home/lbecquey/Projects/RNANet;
5 rm -f stdout.txt stderr.txt errors.txt; 5 rm -f stdout.txt stderr.txt errors.txt;
6 -time './RNAnet.py --3d-folder /home/lbequey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/' > stdout.txt 2> stderr.txt; 6 +time './RNAnet.py --3d-folder /home/lbequey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -s -r 20.0' > stdout.txt 2> stderr.txt;
7 7
8 # Sync in Seafile 8 # Sync in Seafile
9 seaf-cli start; 9 seaf-cli start;
......
...@@ -433,6 +433,7 @@ def to_dist_matrix(f): ...@@ -433,6 +433,7 @@ def to_dist_matrix(f):
433 notify(f"Computed {f} distance matrix", "loaded from file") 433 notify(f"Computed {f} distance matrix", "loaded from file")
434 return 0 434 return 0
435 435
436 + notify(f"Computing {f} distance matrix from alignment...")
436 dm = DistanceCalculator('identity') 437 dm = DistanceCalculator('identity')
437 with open(path_to_seq_data+"/realigned/"+f+"++.afa") as al_file: 438 with open(path_to_seq_data+"/realigned/"+f+"++.afa") as al_file:
438 al = AlignIO.read(al_file, "fasta")[-len(mappings_list[f]):] 439 al = AlignIO.read(al_file, "fasta")[-len(mappings_list[f]):]
...@@ -542,10 +543,10 @@ if __name__ == "__main__": ...@@ -542,10 +543,10 @@ if __name__ == "__main__":
542 threads = [ 543 threads = [
543 th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 1}), 544 th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 1}),
544 th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 4}), 545 th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 4}),
545 - # th.Thread(target=stats_len), # computes figures 546 + th.Thread(target=stats_len), # computes figures
546 - # th.Thread(target=stats_freq), # Updates the database 547 + th.Thread(target=stats_freq), # Updates the database
547 - # th.Thread(target=seq_idty), # produces .npy files and seq idty figures 548 + th.Thread(target=seq_idty), # produces .npy files and seq idty figures
548 - # th.Thread(target=per_chain_stats) # Updates the database 549 + th.Thread(target=per_chain_stats) # Updates the database
549 ] 550 ]
550 551
551 # Start the threads 552 # Start the threads
......