Louis BECQUEY

Reordered steps in 3D data extraction + removed inverted numbering support

@@ -52,7 +52,7 @@ class SelectivePortionSelector(object):
 
     def __init__(self, model_id, chain_id, valid_resnums, khetatm):
        self.chain_id = chain_id
-       self.resnums = valid_resnums
+       self.resnums = valid_resnums  # a list of strings, most of which are ints
        self.pdb_model_id = model_id
        self.hydrogen_regex = re.compile("[123 ]*H.*")
        self.keep_hetatm = khetatm
@@ -70,15 +70,12 @@ class SelectivePortionSelector(object):
        if hetatm_flag in ["W", "H_MG"]:
            return int(self.keep_hetatm)
 
-       # I don't really know what this is but the doc said to warn:
-       if icode != " ":
-           pass
-           # warn(f"icode {icode} at position {resseq}\t\t")
-
        # Accept the residue if it is in the right interval:
-       if len(self.resnums):
-           return int(resseq in self.resnums)
-       else:
+       if icode == " " and len(self.resnums):
+           return int(str(resseq) in self.resnums)
+       elif icode != " " and len(self.resnums):
+           return int(str(resseq)+icode in self.resnums)
+       else:  # len(self.resnums) == 0, we don't use mappings (--no-homology option)
            return 1
 
    def accept_atom(self, atom):
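Reviewer note: the new branch structure makes insertion codes first-class. A minimal standalone sketch of the filter (the function name `accept` and the sample values are illustrative, not from this patch): a residue with a blank icode is matched by its plain number, a residue like 17A by number plus icode, and an empty resnum list (the --no-homology case) accepts everything.

    def accept(resseq, icode, valid_resnums):
        # valid_resnums holds strings: "17" for plain residues, "17A" for inserted ones
        if not valid_resnums:
            return 1  # no mapping in use (--no-homology): keep every residue
        key = str(resseq) if icode == " " else str(resseq) + icode
        return int(key in valid_resnums)

    assert accept(17, " ", {"17", "17A"}) == 1   # plain residue 17
    assert accept(17, "A", {"17", "17A"}) == 1   # inserted residue 17A
    assert accept(18, " ", {"17", "17A"}) == 0   # outside the mapped interval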
@@ -153,7 +150,7 @@ class Chain:
    def __hash__(self):
        return hash((self.pdb_id, self.pdb_model, self.pdb_chain_id, self.chain_label))
 
-   def extract(self, khetatm):
+   def extract(self, df, khetatm):
        """ Extract the part which is mapped to Rfam from the main CIF file and save it to another file.
        """
 
@@ -180,12 +177,8 @@ class Chain:
        mmcif_parser = MMCIFParser()
        s = mmcif_parser.get_structure(self.pdb_id, path_to_3D_data + "RNAcifs/"+self.pdb_id+".cif")
 
-
-       # Extract the desired chain
-       c = s[model_idx][self.pdb_chain_id]
-
        if self.mapping is not None:
-           valid_set = set(self.mapping.old_nt_resnums)
+           valid_set = set(df.old_nt_resnum)
        else:
            valid_set = set()
 
@@ -261,18 +254,33 @@ class Chain:
        #############################################
        # Select the nucleotides we need
        #############################################
-
 
        # Remove nucleotides of the chain that are outside the Rfam mapping, if any
        if self.mapping is not None:
+           if self.mapping.nt_start > self.mapping.nt_end:
+               warn(f"Mapping is reversed, this case is not supported (yet). Ignoring chain {self.chain_label}.")
+               self.delete_me = True
+               self.error_messages = f"Mapping is reversed, this case is not supported (yet)."
+               return None
            df = self.mapping.filter_df(df)
-
+
        # Duplicate residue numbers: shift numbering
-       # Example: 4v9q-DV contains 17 and 17A which are both read 17 by DSSR.
        while True in df.duplicated(['nt_resnum']).values:
            i = df.duplicated(['nt_resnum']).values.tolist().index(True)
-           self.mapping.shift_resnum_range(i)
-           df.iloc[i:, 1] += 1
+           resnumlist = df.nt_resnum.tolist()
+           if self.mapping is not None:
+               self.mapping.log(f"Shifting nt_resnum numbering because of duplicate residue {resnumlist[i]}")
+
+           if resnumlist[i] == resnumlist[i-1]:
+               # Common 4v9q-DV case: e.g. the chain contains 17 and 17A, which are both read as 17 by DSSR.
+               # Solution: shift the numbering of 17A (to 18) and of the following residues.
+               df.iloc[i:, 1] += 1
+           else:
+               # 4v9k-DA case: the nt_id does not carry the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ...
+               # Here 163B is read as 163 by DSSR, but there already is a residue 163.
+               # Solution: set nt_resnum[i] to nt_resnum[i-1] + 1, and shift the following residues by 1.
+               df.iloc[i, 1] = 1 + df.iloc[i-1, 1]
+               df.iloc[i+1:, 1] += 1
 
        # Search for ligands at the end of the selection
        # Drop ligands detected as residues by DSSR, by detecting several markers
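Note on the deduplication loop above: the two branches can be checked on a toy frame (hypothetical values, pandas only). When the duplicate directly follows its twin (the 4v9q-DV case), the duplicate and the tail are shifted; otherwise (the 4v9k-DA case) the duplicate is first renumbered to predecessor + 1.

    import pandas as pd

    df = pd.DataFrame({"index_chain": [1, 2, 3, 4],
                       "nt_resnum":   [16, 17, 17, 18]})  # 17 and 17A both read as 17

    while True in df.duplicated(['nt_resnum']).values:
        i = df.duplicated(['nt_resnum']).values.tolist().index(True)
        resnumlist = df.nt_resnum.tolist()
        if resnumlist[i] == resnumlist[i-1]:
            df.iloc[i:, 1] += 1                   # shift the duplicate and the tail
        else:
            df.iloc[i, 1] = 1 + df.iloc[i-1, 1]   # renumber the stray duplicate
            df.iloc[i+1:, 1] += 1                 # then shift the tail

    print(df.nt_resnum.tolist())  # [16, 17, 18, 19]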
@@ -280,11 +288,12 @@ class Chain:
                (df.iloc[[-1]][["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "v0", "v1", "v2", "v3", "v4"]].isna().values).all()
                or (df.iloc[[-1]].puckering=='').any()
              )
-             or ( len(df.index_chain) >= 2 and df.iloc[-1,1] > 50 + df.iloc[-2,1] )
+             or ( len(df.index_chain) >= 2 and df.iloc[-1,1] > 50 + df.iloc[-2,1] )  # large nt_resnum gap between the last two residues
             or ( len(df.index_chain) and df.iloc[-1,2] in ["GNG", "E2C", "OHX", "IRI", "MPD", "8UZ"] )
        ):
            if self.mapping is not None:
-               self.mapping.drop_ligand(df.tail(1))
+               self.mapping.log("Dropping ligand:")
+               self.mapping.log(df.tail(1))
            df = df.head(-1)
 
        # Duplicates in index_chain: drop them, they are ligands
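Note on the ligand heuristic above: it can be exercised on a fabricated tail row (all values hypothetical). A last "residue" whose nt_resnum jumps by more than 50, or whose name is one of the known ligand codes, is dropped:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"index_chain": [1, 2, 3],
                       "nt_resnum":   [10, 11, 99],        # 99: suspicious jump > 50
                       "nt_name":     ["G", "C", "MPD"],   # MPD is one of the listed ligand codes
                       "alpha":       [12.0, 15.0, np.nan]})

    while len(df.index_chain) >= 2 and (df.iloc[-1, 1] > 50 + df.iloc[-2, 1]
                                        or df.iloc[-1, 2] in ["GNG", "E2C", "OHX", "IRI", "MPD", "8UZ"]):
        df = df.head(-1)   # drop the last row, it is not a nucleotide

    print(df.nt_name.tolist())  # ['G', 'C']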
@@ -295,8 +304,6 @@ class Chain:
                warn(f"Found duplicated index_chain {df.iloc[i,0]} in {self.chain_label}. Keeping only the first.")
                if self.mapping is not None:
                    self.mapping.log(f"Found duplicated index_chain {df.iloc[i,0]}. Keeping only the first.")
-               with open("duplicates.txt", "a") as f:
-                   f.write(f"DEBUG: {self.chain_label} has duplicate index_chains !\n")
            df = df.drop_duplicates("index_chain", keep="first")  # drop duplicates in index_chain
 
        # drop any nts with index_chain < the first residue,
@@ -305,30 +312,31 @@ class Chain:
        if len(ligands.index_chain):
            if self.mapping is not None:
                for line in ligands.iterrows():
-                   self.mapping.drop_ligand(line)
+                   self.mapping.log("Dropping ligand:")
+                   self.mapping.log(line)
            df = df.drop(ligands.index)
 
        # Find missing index_chain values
        # This happens because of resolved nucleotides that have a
-       # strange nt_resnum value
+       # strange nt_resnum value. Thanks, biologists! :@ :(
        # e.g. 4v49-AA, position 5'- 1003 -> 2003 -> 1004 - 3'
        diff = set(range(df.shape[0])).difference(df['index_chain'] - 1)
-       if len(diff):
-           warn(f"Missing residues regarding index_chain: {[1+i for i in sorted(diff)]}")
+       if len(diff) and self.mapping is not None:
+           # warn(f"Missing residues in chain numbering: {[1+i for i in sorted(diff)]}")
            for i in sorted(diff):
-               # check if a nucleotide numbered +1000 exists in the nts object
+               # check if a nucleotide with the correct index_chain exists in the nts object
                found = None
                for nt in nts:  # nts is the object from the loaded JSON and contains all nts
                    if nt['chain_name'] != self.pdb_chain_id:
                        continue
-                   if nt['index_chain'] == i + 1 :
+                   if nt['index_chain'] == i + 1 + self.mapping.st:
                        found = nt
                        break
                if found:
+                   self.mapping.log(f"Residue {i+1+self.mapping.st}-{self.mapping.st} = {i+1} has been saved and renumbered {df.iloc[i,1]} instead of {found['nt_id'].replace(found['chain_name']+ '.' + found['nt_name'], '').replace('^','')}")
                    df_row = pd.DataFrame([found], index=[i])[df.columns.values]
-                   if self.mapping is not None:
-                       self.mapping.insert_new(i+1, found['nt_resnum'], df.iloc[i,1])
-                   df_row.iloc[0,1] = df.iloc[i,1]
+                   df_row.iloc[0,0] = i+1           # index_chain
+                   df_row.iloc[0,1] = df.iloc[i,1]  # nt_resnum
                    df = pd.concat([ df.iloc[:i], df_row, df.iloc[i:] ])
                    df.iloc[i+1:, 1] += 1
                else:
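Note on the gap detection above: `df['index_chain'] - 1` converts the 1-based chain indices to 0-based positions, so the set difference against `range(df.shape[0])` yields exactly the missing positions. A toy run (hypothetical values):

    import pandas as pd

    df = pd.DataFrame({"index_chain": [1, 2, 4, 5]})   # residue 3 is absent
    diff = set(range(df.shape[0])).difference(df['index_chain'] - 1)
    print(sorted(diff))   # [2] -> the residue with index_chain 3 is missing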
@@ -369,7 +377,7 @@ class Chain:
            for i in sorted(diff):
                # Add a row at position i
                df = pd.concat([ df.iloc[:i],
-                                pd.DataFrame({"index_chain": i+1, "nt_resnum": i+resnum_start, "nt_code":'-', "nt_name":'-'}, index=[i]),
+                                pd.DataFrame({"index_chain": i+1, "nt_resnum": i+resnum_start, "nt_id":"not resolved", "nt_code":'-', "nt_name":'-'}, index=[i]),
                                 df.iloc[i:] ])
                # Increase the index_chain of all following lines
                df.iloc[i+1:, 0] += 1
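Note: inserting a placeholder row at position i and renumbering the tail, as above, looks like this on a reduced column set (hypothetical values):

    import pandas as pd

    df = pd.DataFrame({"index_chain": [1, 2, 3], "nt_code": ['G', 'C', 'A']})
    i = 1   # hypothetical gap position

    df = pd.concat([ df.iloc[:i],
                     pd.DataFrame({"index_chain": i+1, "nt_id": "not resolved",
                                   "nt_code": '-', "nt_name": '-'}, index=[i]),
                     df.iloc[i:] ])
    df.iloc[i+1:, 0] += 1   # bump index_chain of every following row

    print(df.index_chain.tolist())   # [1, 2, 3, 4]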
@@ -424,11 +432,11 @@ class Chain:
                if paired[nt1_idx] == "":
                    pair_type_LW[nt1_idx] = lw_pair
                    pair_type_DSSR[nt1_idx] = dssr_pair
-                   paired[nt1_idx] = str(nt2_idx + 1)
+                   paired[nt1_idx] = str(nt2_idx + 1)  # index + 1 is actually index_chain.
                else:
                    pair_type_LW[nt1_idx] += ',' + lw_pair
                    pair_type_DSSR[nt1_idx] += ',' + dssr_pair
-                   paired[nt1_idx] += ',' + str(nt2_idx + 1)
+                   paired[nt1_idx] += ',' + str(nt2_idx + 1)  # index + 1 is actually index_chain.
 
                # set nucleotide 2 with the opposite base-pair
                if nt2 in res_ids:
@@ -442,44 +450,15 @@ class Chain:
                    pair_type_DSSR[nt2_idx] += ',' + dssr_pair[0] + dssr_pair[3] + dssr_pair[2] + dssr_pair[1]
                    paired[nt2_idx] += ',' + str(nt1_idx + 1)
 
+       # transform nt_id to shorter values
+       df['old_nt_resnum'] = [ n.replace(self.pdb_chain_id+'.'+name, '').replace('^','') for n, name in zip(df.nt_id, df.nt_name) ]
+
        df['paired'] = paired
        df['pair_type_LW'] = pair_type_LW
        df['pair_type_DSSR'] = pair_type_DSSR
        df['nb_interact'] = interacts
-       df = df.drop(['nt_id'], axis=1)  # remove now useless descriptors
-
-       if self.mapping.reversed:
-           # The 3D structure is numbered from 3' to 5' instead of standard 5' to 3'
-           # or the sequence that matches the Rfam family is 3' to 5' instead of standard 5' to 3'.
-           # Anyways, you need to invert the angles.
-           # TODO: angles alpha, beta, etc
-           warn(f"Has {self.chain_label} been numbered from 3' to 5' ? Inverting pseudotorsions, other angle measures are not corrected.")
-           df = df.reindex(index=df.index[::-1]).reset_index(drop=True)
-           df['index_chain'] = 1 + df.index
-           temp_eta = df['eta']
-           df['eta'] = [ df['theta'][n] for n in range(l) ]              # eta(n) = theta(l-n+1) forall n in ]1, l]
-           df['theta'] = [ temp_eta[n] for n in range(l) ]               # theta(n) = eta(l-n+1) forall n in [1, l[
-           temp_eta = df['eta_prime']
-           df['eta_prime'] = [ df['theta_prime'][n] for n in range(l) ]  # eta(n) = theta(l-n+1) forall n in ]1, l]
-           df['theta_prime'] = [ temp_eta[n] for n in range(l) ]         # theta(n) = eta(l-n+1) forall n in [1, l[
-           temp_eta = df['eta_base']
-           df['eta_base'] = [ df['theta_base'][n] for n in range(l) ]    # eta(n) = theta(l-n+1) forall n in ]1, l]
-           df['theta_base'] = [ temp_eta[n] for n in range(l) ]          # theta(n) = eta(l-n+1) forall n in [1, l[
-           newpairs = []
-           for v in df['paired']:
-               if ',' in v:
-                   temp_v = []
-                   vs = v.split(',')
-                   for _ in vs:
-                       temp_v.append(str(l-int(_)+1) if int(_) else _ )
-                   newpairs.append(','.join(temp_v))
-               else:
-                   if len(v):
-                       newpairs.append(str(l-int(v)+1) if int(v) else v )
-                   else:  # means unpaired
-                       newpairs.append(v)
-           df['paired'] = newpairs
-
+       df = df.drop(['nt_id', 'nt_resnum'], axis=1)  # remove now useless descriptors
+
        self.seq = "".join(df.nt_code)
        self.seq_to_align = "".join(df.nt_align_code)
        self.length = len([ x for x in self.seq_to_align if x != "-" ])
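Note on the new `old_nt_resnum` column: judging from the replace() calls above, DSSR's `nt_id` embeds chain, residue name, number, and (after a `^`) the insertion code; stripping the `chain.name` prefix and the `^` leaves the "string that is mostly an int". A hypothetical round-trip (identifier values assumed, not taken from a real structure):

    pdb_chain_id = "A"
    nt_ids   = ["A.G1629", "A.U163^B"]   # hypothetical DSSR nt_id values
    nt_names = ["G", "U"]

    old_nt_resnum = [ n.replace(pdb_chain_id + '.' + name, '').replace('^', '')
                      for n, name in zip(nt_ids, nt_names) ]
    print(old_nt_resnum)   # ['1629', '163B']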
@@ -503,20 +482,19 @@ class Chain:
 
        with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn:
            # Register the chain in table chain
-           if self.mapping.nt_start is not None:
+           if self.mapping is not None:
                sql_execute(conn, f""" INSERT INTO chain
-                               (structure_id, chain_name, pdb_start, pdb_end, reversed, rfam_acc, inferred, issue)
+                               (structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, issue)
                                VALUES
-                               (?, ?, ?, ?, ?, ?, ?, ?)
+                               (?, ?, ?, ?, ?, ?, ?)
                                ON CONFLICT(structure_id, chain_name, rfam_acc) DO
                                UPDATE SET pdb_start=excluded.pdb_start,
                                           pdb_end=excluded.pdb_end,
-                                          reversed=excluded.reversed,
                                           inferred=excluded.inferred,
                                           issue=excluded.issue;""",
                            data=(str(self.pdb_id), str(self.pdb_chain_id),
                                  int(self.mapping.nt_start), int(self.mapping.nt_end),
-                                 int(self.mapping.reversed), str(self.mapping.rfam_acc),
+                                 str(self.mapping.rfam_acc),
                                  int(self.mapping.inferred), int(self.delete_me)))
            # get the chain id
            self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain
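Note: the INSERT ... ON CONFLICT above is SQLite's upsert syntax (available since SQLite 3.24). Re-registering a chain with the same (structure_id, chain_name, rfam_acc) key updates the mapped interval in place instead of failing. A stripped-down demonstration with a reduced table and made-up values:

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("""CREATE TABLE chain (structure_id TEXT, chain_name TEXT, rfam_acc TEXT,
                                        pdb_start INT, pdb_end INT,
                                        UNIQUE(structure_id, chain_name, rfam_acc))""")
    sql = """INSERT INTO chain (structure_id, chain_name, rfam_acc, pdb_start, pdb_end)
             VALUES (?, ?, ?, ?, ?)
             ON CONFLICT(structure_id, chain_name, rfam_acc) DO
             UPDATE SET pdb_start=excluded.pdb_start, pdb_end=excluded.pdb_end;"""
    conn.execute(sql, ("1ffk", "0", "RF02540", 1, 2900))
    conn.execute(sql, ("1ffk", "0", "RF02540", 5, 2890))   # same key: the row is updated
    print(conn.execute("SELECT * FROM chain").fetchall())
    # [('1ffk', '0', 'RF02540', 5, 2890)]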
@@ -536,10 +514,10 @@ class Chain:
            if df is not None and not self.delete_me:  # double condition is theoretically redundant here, but you never know
                sql_execute(conn, f"""
                    INSERT OR IGNORE INTO nucleotide
-                   (chain_id, index_chain, nt_resnum, nt_name, nt_code, dbn, alpha, beta, gamma, delta, epsilon, zeta,
+                   (chain_id, index_chain, nt_name, nt_code, dbn, alpha, beta, gamma, delta, epsilon, zeta,
                    epsilon_zeta, bb_type, chi, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base,
                    v0, v1, v2, v3, v4, amplitude, phase_angle, puckering, nt_align_code, is_A, is_C, is_G, is_U, is_other, nt_position,
-                   paired, pair_type_LW, pair_type_DSSR, nb_interact)
+                   old_nt_resnum, paired, pair_type_LW, pair_type_DSSR, nb_interact)
                    VALUES ({self.db_chain_id}, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
                            ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
                            ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""",
@@ -890,42 +868,13 @@ class Mapping:
        self.rfam_acc = rfam_acc
        self.nt_start = pdb_start  # nt_resnum numbering
        self.nt_end = pdb_end      # nt_resnum numbering
-       self.reversed = (pdb_start > pdb_end)
-       self.inferred = inferred
-       self.interval = range(pdb_start, pdb_end+1)
-
-       self.old_nt_resnums = []  # to be computed
-       self.new_nt_resnums = []  #
+       self.inferred = inferred
 
        self.logs = []  # Events are logged when modifying the mapping
 
-   def shift_resnum_range(self, i):
-       self.log(f"Shifting nt_resnum numbering because of duplicate residue {self.new_nt_resnums[i]}")
-       for j in range(i, len(self.new_nt_resnums)):
-           self.new_nt_resnums[j] += 1
-
-   def insert_new(self, index_chain, oldresnum, newresnum):
-       # Adds a nt that did not passed the mapping filter at first
-       # because it was numbered with a very high nt_resnum value (outside the bounds of the mapping)
-       # But, in practice, its index_chain is correct and in the bounds and it belongs to the mapped chain.
-       # Those nts are only searched if there are missing index_chain values in the mapping bounds.
-
-       # insert the nt_resnum values in the lists
-       self.old_nt_resnums.insert(index_chain-1, oldresnum)
-       self.new_nt_resnums.insert(index_chain-1, newresnum)
-
-       # shift the new_nt_resnum values if needed, to avoid creating a doublon
-       if self.new_nt_resnums[index_chain-1] == self.new_nt_resnums[index_chain]:
-           for j in range(index_chain, len(self.new_nt_resnums)):
-               self.new_nt_resnums[j] += 1
-       # warn(f"Residue {index_chain} has been saved and renumbered {newresnum} instead of {oldresnum}")
-       self.log(f"Residue {index_chain} has been saved and renumbered {newresnum} instead of {oldresnum}")
-
    def filter_df(self, df):
-       if not self.reversed:
-           newdf = df.drop(df[(df.nt_resnum < self.nt_start) | (df.nt_resnum > self.nt_end)].index)
-       else:
-           newdf = df.drop(df[(df.nt_resnum < self.nt_end) | (df.nt_resnum > self.nt_start)].index)
+
+       newdf = df.drop(df[(df.nt_resnum < self.nt_start) | (df.nt_resnum > self.nt_end)].index)
 
        if len(newdf.index_chain) > 0:
            # everything's okay
@@ -939,24 +888,24 @@ class Mapping:
            weird_mappings.add(self.chain_label + "." + self.rfam_acc)
            df = df.drop(df[(df.index_chain < self.nt_start) | (df.index_chain > self.nt_end)].index)
 
-
        # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one
+       self.st = 0
        if len(df.index_chain) and df.iloc[0,0] != 1:
-           st = df.iloc[0,0] - 1
-           df.iloc[:, 0] -= st
-
-       self.old_nt_resnums = df.nt_resnum.tolist()
-       self.new_nt_resnums = df.nt_resnum.tolist()
+           self.st = df.iloc[0,0] - 1
+           df.iloc[:, 0] -= self.st
+           self.log(f"Shifting index_chain by {self.st}")
+
+       # Check that some residues are not included by mistake:
+       # e.g. 4v4t-AA.RF00382-20-55 contains 4 residues numbered 30 but actually far beyond the mapped part,
+       # because the icodes are not read by DSSR.
+       toremove = df[ df.index_chain > self.nt_end ]
+       if not toremove.empty:
+           df = df.drop(toremove.index)
+           self.log(f"Some nt_resnum values are likely to be wrong, not considering residues:")
+           self.log(str(toremove))
 
        return df
 
-   def drop_ligand(self, df_row):
-       self.log("Droping ligand:")
-       self.log(df_row)
-       i = self.new_nt_resnums.index(df_row.iloc[0,1])
-       self.old_nt_resnums.pop(i)
-       self.new_nt_resnums.pop(i)
-
    def log(self, message):
        if isinstance(message, str):
            self.logs.append(message+'\n')
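Note on the index_chain normalization in filter_df: if the first kept residue does not carry index 1, the whole column is shifted and the offset is remembered (now as self.st) for the later lookup in the nts JSON. A toy frame (hypothetical values):

    import pandas as pd

    df = pd.DataFrame({"index_chain": [3, 4, 5], "nt_resnum": [20, 21, 22]})

    st = 0   # stored as self.st on the Mapping object in the real code
    if len(df.index_chain) and df.iloc[0, 0] != 1:
        st = df.iloc[0, 0] - 1
        df.iloc[:, 0] -= st

    print(st, df.index_chain.tolist())   # 2 [1, 2, 3]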
@@ -964,13 +913,15 @@ class Mapping:
            self.logs.append(str(message))
 
    def to_file(self, filename):
+       if self.logs == []:
+           return  # Do not create a log file if there is nothing to log
+
        if not path.exists("logs"):
            os.makedirs("logs", exist_ok=True)
        with open("logs/"+filename, "w") as f:
            f.writelines(self.logs)
 
 
-
class Pipeline:
    def __init__(self):
        self.dl = Downloader()
@@ -991,6 +942,7 @@ class Pipeline:
        self.EXTRACT_CHAINS = False
        self.REUSE_ALL = False
        self.SELECT_ONLY = None
+       self.ARCHIVE = False
 
    def process_options(self):
        """Sets the paths and options of the pipeline"""
@@ -1002,7 +954,7 @@ class Pipeline:
                                       [ "help", "resolution=", "keep-hetatm=", "from-scratch",
                                         "fill-gaps=", "3d-folder=", "seq-folder=",
                                         "no-homology", "ignore-issues", "extract", "only=", "all",
-                                        "update-homologous" ])
+                                        "archive", "update-homologous" ])
        except getopt.GetoptError as err:
            print(err)
            sys.exit(2)
@@ -1043,9 +995,10 @@ class Pipeline:
                print("--ignore-issues\t\t\tDo not ignore already known issues and attempt to compute them")
                print("--update-homologous\t\tRe-download Rfam and SILVA databases, realign all families, and recompute all CSV files")
                print("--from-scratch\t\t\tDelete database, local 3D and sequence files, and known issues, and recompute.")
+               print("--archive\t\t\tCreate a tar.gz archive of the datapoint text files, and update the link to the latest archive")
                print()
                print("Typical usage:")
-               print(f"nohup bash -c 'time {runDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder /Data/RNA/sequences' &")
+               print(f"nohup bash -c 'time {runDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder /Data/RNA/sequences -s --archive' &")
                sys.exit()
            elif opt == '--version':
                print("RNANet 1.0 alpha ")
@@ -1102,7 +1055,9 @@ class Pipeline:
                self.USE_KNOWN_ISSUES = False
            elif opt == "--extract":
                self.EXTRACT_CHAINS = True
-
+           elif opt == "--archive":
+               self.ARCHIVE = True
+
        if "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data]:
            print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments")
            print("See RNANet.py --help for more information.")
@@ -1420,14 +1375,15 @@ class Pipeline:
        conn = sqlite3.connect(runDir+"/results/RNANet.db")
        pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;",
                          conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
-       pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, reversed, date, exp_method, resolution, issue FROM structure
+       pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure
                             JOIN chain ON structure.pdb_id = chain.structure_id
                             ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
        conn.close()
 
        # Archive the results
-       os.makedirs("results/archive", exist_ok=True)
-       subprocess.run(["tar","-C", path_to_3D_data + "/datapoints","-czf",f"results/archive/RNANET_datapoints_{time_str}.tar.gz","."])
+       if self.SELECT_ONLY is None:
+           os.makedirs("results/archive", exist_ok=True)
+           subprocess.run(["tar","-C", path_to_3D_data + "/datapoints","-czf",f"results/archive/RNANET_datapoints_{time_str}.tar.gz","."])
 
        # Update shortcuts to latest versions
        subprocess.run(["rm", "-f", runDir + "/results/RNANET_datapoints_latest.tar.gz", runDir + "/results/summary_latest.csv", runDir + "/results/families_latest.csv"])
@@ -1549,7 +1505,6 @@ def sql_define_tables(conn):
            chain_name VARCHAR(2) NOT NULL,
            pdb_start SMALLINT,
            pdb_end SMALLINT,
-           reversed TINYINT,
            issue TINYINT,
            rfam_acc CHAR(7),
            inferred TINYINT,
@@ -1578,7 +1533,7 @@ def sql_define_tables(conn):
        CREATE TABLE IF NOT EXISTS nucleotide (
            chain_id INT,
            index_chain SMALLINT,
-           nt_resnum SMALLINT,
+           old_nt_resnum VARCHAR(5),
            nt_position SMALLINT,
            nt_name VARCHAR(5),
            nt_code CHAR(1),
@@ -2004,7 +1959,7 @@ def work_build_chain(c, extract, khetatm, retrying=False):
 
    # extract the portion we want
    if extract and not c.delete_me:
-       c.extract(khetatm)
+       c.extract(df, khetatm)
 
    return c
 
@@ -2331,7 +2286,7 @@ def work_save(c, homology=True):
    conn = sqlite3.connect(runDir + "/results/RNANet.db", timeout=15.0)
    if homology:
        df = pd.read_sql_query(f"""
-           SELECT index_chain, nt_resnum, nt_position, nt_name, nt_code, nt_align_code,
+           SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code,
            is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, dbn,
            paired, nb_interact, pair_type_LW, pair_type_DSSR, alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta,
            chi, bb_type, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base,
@@ -2344,7 +2299,7 @@ def work_save(c, homology=True):
        filename = path_to_3D_data + "datapoints/" + c.chain_label + '.' + c.mapping.rfam_acc
    else:
        df = pd.read_sql_query(f"""
-           SELECT index_chain, nt_resnum, nt_position, nt_name, nt_code, nt_align_code,
+           SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code,
            is_A, is_C, is_G, is_U, is_other, dbn,
            paired, nb_interact, pair_type_LW, pair_type_DSSR, alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta,
            chi, bb_type, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base,
@@ -2405,7 +2360,8 @@ if __name__ == "__main__":
            work_save(c, homology=False)
        print("Completed.")
        exit(0)
-
+
+
    # At this point, structure, chain and nucleotide tables of the database are up to date.
    # (Modulo some statistics computed by statistics.py)
 
...
+#!/usr/bin/env python3
+import subprocess, os, sys
+
+# Put a list of problematic chains here; they will be properly deleted and recomputed
+problems = [
+    "4v7f_1_1_4-3396",
+    "4v7f_1_1_1-3167",
+    "4v7f_1_1_1-3255",
+    "6g90_1_1_1-407",
+    "3q3z_1_A_1-74",
+    "3q3z_1_V_1-74",
+    "4v7f_1_3_1-121"
+]
+
+path_to_3D_data = sys.argv[1]
+path_to_seq_data = sys.argv[2]
+
+for p in problems:
+    print("\n\n\n")  # visual separator between chains
+
+    # Remove the datapoint files and 3D files
+    subprocess.run(["rm", '-f', path_to_3D_data + f"/rna_mapped_to_Rfam/{p}.cif"])
+    files = [ f for f in os.listdir(path_to_3D_data + "/datapoints") if p in f ]
+    for f in files:
+        subprocess.run(["rm", '-f', path_to_3D_data + f"/datapoints/{f}"])
+
+    # Find more information
+    structure = p.split('_')[0]
+    chain = p.split('_')[2]
+    families = [ f.split('.')[1] for f in files ]  # the Rfam families this chain has been mapped onto
+
+    # Delete the chain from the database, and the associated nucleotides and re_mappings, using foreign keys
+    for fam in families:
+        command = ["sqlite3", "results/RNANet.db", f"PRAGMA foreign_keys=ON; DELETE FROM chain WHERE structure_id=\"{structure}\" AND chain_name=\"{chain}\" AND rfam_acc=\"{fam}\";"]
+        print(' '.join(command))
+        subprocess.run(command)
+
+    # Re-run RNANet
+    command = ["python3.8", "RNAnet.py", "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0", "--extract", "--only", p]
+    print('\n', ' '.join(command), '\n')
+    subprocess.run(command)
+
+# run statistics
@@ -139,6 +139,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
            fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}.png")
            if show:
                fig.show()
+           plt.close(fig)
 
            # Smoothed joint distribution
            fig = plt.figure()
@@ -149,6 +150,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
            fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}.png")
            if show:
                fig.show()
+           plt.close(fig)
 
            # 2D Wadley plot
            fig = plt.figure(figsize=(5,5))
@@ -161,6 +163,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)):
            fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}.png")
            if show:
                fig.show()
+           plt.close(fig)
            # print(f"[{worker_nbr}]\tComputed joint distribution of angles (C{carbon}) and saved the figures.")
 
def stats_len():
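Note on the three close calls added above: matplotlib keeps every figure registered in pyplot's global state until it is explicitly closed, so generating many plots in a loop leaks memory without it. (Figure objects have no close() method of their own; pyplot.close(fig) is the call that exists, hence the form used here.) A minimal save-and-close loop, as a sketch with hypothetical values:

    import matplotlib
    matplotlib.use("Agg")   # headless backend for batch figure generation
    import matplotlib.pyplot as plt

    for l in (10, 20, 30):  # hypothetical loop over chain lengths
        fig = plt.figure()
        plt.hist(range(l))
        fig.savefig(f"wadley_hist_eta_{l}.png")
        plt.close(fig)      # deregister the figure and free its memory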
@@ -440,7 +443,7 @@ def to_dist_matrix(f):
    idty = dm.get_distance(al).matrix  # list of lists
    del al
    l = len(idty)
-   np.save("data/"+f+".npy", np.array([ idty[i] + [0]*(l-1-i) if i<l-1 else idty[i] for i in range(l) ]))
+   np.save("data/"+f+".npy", np.array([ idty[i] + [0]*(l-1-i) if i<l-1 else idty[i] for i in range(l) ], dtype=object))
    del idty
    notify(f"Computed {f} distance matrix")
    return 0
...