Louis BECQUEY

resnum redundancy followed by gaps is not correctly parsed

Showing 1 changed file with 25 additions and 7 deletions
...@@ -254,7 +254,7 @@ class Chain: ...@@ -254,7 +254,7 @@ class Chain:
254 ############################################# 254 #############################################
255 # Select the nucleotides we need 255 # Select the nucleotides we need
256 ############################################# 256 #############################################
257 - 257 +
258 # Remove nucleotides of the chain that are outside the Rfam mapping, if any 258 # Remove nucleotides of the chain that are outside the Rfam mapping, if any
259 if self.mapping is not None: 259 if self.mapping is not None:
260 if self.mapping.nt_start > self.mapping.nt_end: 260 if self.mapping.nt_start > self.mapping.nt_end:
...@@ -267,16 +267,34 @@ class Chain: ...@@ -267,16 +267,34 @@ class Chain:
267 # Duplicate residue numbers : shift numbering 267 # Duplicate residue numbers : shift numbering
268 while True in df.duplicated(['nt_resnum']).values: 268 while True in df.duplicated(['nt_resnum']).values:
269 i = df.duplicated(['nt_resnum']).values.tolist().index(True) 269 i = df.duplicated(['nt_resnum']).values.tolist().index(True)
270 - resnumlist = df.nt_resnum.tolist() 270 + duplicates = df[df.nt_resnum == df.iloc[i,1]]
271 + n_dup = len(duplicates.nt_resnum)
272 + index_last_dup = duplicates.index_chain.iloc[-1] - 1
271 if self.mapping is not None: 273 if self.mapping is not None:
272 - self.mapping.log(f"Shifting nt_resnum numbering because of duplicate residue {resnumlist[i]}") 274 + self.mapping.log(f"Shifting nt_resnum numbering because of {n_dup} duplicate residues {df.iloc[i,1]}")
275 +
276 + if df.iloc[i,1] == df.iloc[i-1,1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]:
277 + # The redundant nts are consecutive in the chain (at the begining at least), and there is a gap at the end
273 278
274 - if resnumlist[i] == resnumlist[i-1]: 279 + if duplicates.iloc[n_dup-1, 0] - duplicates.iloc[0, 0] + 1 == n_dup:
275 - # Common 4v9q-DV case : e.g. chains contains 17 and 17A which are both read 17 by DSSR. 280 + # They are all contiguous in the chain
281 + # 4v9n-DA case (and similar ones) : 610-611-611A-611B-611C-611D-611E-611F-611G-617-618...
282 + # there is a redundancy (611) followed by a gap (611-617).
283 + # We want the redundancy to fill the gap.
284 + df.iloc[i:i+n_dup-1, 1] += 1
285 + else:
286 + # We solve the problem continous component by continuous component
287 + for j in range(1, n_dup+1):
288 + if duplicates.iloc[j,0] == 1 + duplicates.iloc[j-1,0]: # continuous
289 + df.iloc[i+j-1,1] += 1
290 + else:
291 + break
292 + elif df.iloc[i,1] == df.iloc[i-1,1]:
293 + # Common 4v9q-DV case (and similar ones) : e.g. chains contains 17 and 17A which are both read 17 by DSSR.
276 # Solution : we shift the numbering of 17A (to 18) and the following residues. 294 # Solution : we shift the numbering of 17A (to 18) and the following residues.
277 df.iloc[i:, 1] += 1 295 df.iloc[i:, 1] += 1
278 else: 296 else:
279 - # 4v9k-DA case : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ... 297 + # 4v9k-DA case (and similar ones) : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ...
280 # Here the 163B is read 163 by DSSR, but there already is a residue 163. 298 # Here the 163B is read 163 by DSSR, but there already is a residue 163.
281 # Solution : set nt_resnum[i] to nt_resnum[i-1] + 1, and shift the following by 1. 299 # Solution : set nt_resnum[i] to nt_resnum[i-1] + 1, and shift the following by 1.
282 df.iloc[i, 1] = 1 + df.iloc[i-1, 1] 300 df.iloc[i, 1] = 1 + df.iloc[i-1, 1]
...@@ -296,6 +314,7 @@ class Chain: ...@@ -296,6 +314,7 @@ class Chain:
296 self.mapping.log(df.tail(1)) 314 self.mapping.log(df.tail(1))
297 df = df.head(-1) 315 df = df.head(-1)
298 316
317 +
299 # Duplicates in index_chain : drop, they are ligands 318 # Duplicates in index_chain : drop, they are ligands
300 # e.g. 3iwn_1_B_1-91, ligand C2E has index_chain 1 (and nt_resnum 601) 319 # e.g. 3iwn_1_B_1-91, ligand C2E has index_chain 1 (and nt_resnum 601)
301 duplicates = [ index for index, element in enumerate(df.duplicated(['index_chain']).values) if element ] 320 duplicates = [ index for index, element in enumerate(df.duplicated(['index_chain']).values) if element ]
...@@ -370,7 +389,6 @@ class Chain: ...@@ -370,7 +389,6 @@ class Chain:
370 # index_chain 1 |-------------|77 83|------------| 154 389 # index_chain 1 |-------------|77 83|------------| 154
371 # expected data point 1 |--------------------------------| 154 390 # expected data point 1 |--------------------------------| 154
372 # 391 #
373 -
374 if l != len(df['index_chain']): # if some residues are missing, len(df['index_chain']) < l 392 if l != len(df['index_chain']): # if some residues are missing, len(df['index_chain']) < l
375 resnum_start = df.iloc[0,1] 393 resnum_start = df.iloc[0,1]
376 diff = set(range(l)).difference(df['nt_resnum'] - resnum_start) # the rowIDs the missing nucleotides would have (rowID = index_chain - 1 = nt_resnum - resnum_start) 394 diff = set(range(l)).difference(df['nt_resnum'] - resnum_start) # the rowIDs the missing nucleotides would have (rowID = index_chain - 1 = nt_resnum - resnum_start)
......