Showing
1 changed file
with
25 additions
and
7 deletions
... | @@ -254,7 +254,7 @@ class Chain: | ... | @@ -254,7 +254,7 @@ class Chain: |
254 | ############################################# | 254 | ############################################# |
255 | # Select the nucleotides we need | 255 | # Select the nucleotides we need |
256 | ############################################# | 256 | ############################################# |
257 | - | 257 | + |
258 | # Remove nucleotides of the chain that are outside the Rfam mapping, if any | 258 | # Remove nucleotides of the chain that are outside the Rfam mapping, if any |
259 | if self.mapping is not None: | 259 | if self.mapping is not None: |
260 | if self.mapping.nt_start > self.mapping.nt_end: | 260 | if self.mapping.nt_start > self.mapping.nt_end: |
... | @@ -267,16 +267,34 @@ class Chain: | ... | @@ -267,16 +267,34 @@ class Chain: |
267 | # Duplicate residue numbers : shift numbering | 267 | # Duplicate residue numbers : shift numbering |
268 | while True in df.duplicated(['nt_resnum']).values: | 268 | while True in df.duplicated(['nt_resnum']).values: |
269 | i = df.duplicated(['nt_resnum']).values.tolist().index(True) | 269 | i = df.duplicated(['nt_resnum']).values.tolist().index(True) |
270 | - resnumlist = df.nt_resnum.tolist() | 270 | + duplicates = df[df.nt_resnum == df.iloc[i,1]] |
271 | + n_dup = len(duplicates.nt_resnum) | ||
272 | + index_last_dup = duplicates.index_chain.iloc[-1] - 1 | ||
271 | if self.mapping is not None: | 273 | if self.mapping is not None: |
272 | - self.mapping.log(f"Shifting nt_resnum numbering because of duplicate residue {resnumlist[i]}") | 274 | + self.mapping.log(f"Shifting nt_resnum numbering because of {n_dup} duplicate residues {df.iloc[i,1]}") |
275 | + | ||
276 | + if df.iloc[i,1] == df.iloc[i-1,1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]: | ||
277 | + # The redundant nts are consecutive in the chain (at the begining at least), and there is a gap at the end | ||
273 | 278 | ||
274 | - if resnumlist[i] == resnumlist[i-1]: | 279 | + if duplicates.iloc[n_dup-1, 0] - duplicates.iloc[0, 0] + 1 == n_dup: |
275 | - # Common 4v9q-DV case : e.g. chains contains 17 and 17A which are both read 17 by DSSR. | 280 | + # They are all contiguous in the chain |
281 | + # 4v9n-DA case (and similar ones) : 610-611-611A-611B-611C-611D-611E-611F-611G-617-618... | ||
282 | + # there is a redundancy (611) followed by a gap (611-617). | ||
283 | + # We want the redundancy to fill the gap. | ||
284 | + df.iloc[i:i+n_dup-1, 1] += 1 | ||
285 | + else: | ||
286 | + # We solve the problem continous component by continuous component | ||
287 | + for j in range(1, n_dup+1): | ||
288 | + if duplicates.iloc[j,0] == 1 + duplicates.iloc[j-1,0]: # continuous | ||
289 | + df.iloc[i+j-1,1] += 1 | ||
290 | + else: | ||
291 | + break | ||
292 | + elif df.iloc[i,1] == df.iloc[i-1,1]: | ||
293 | + # Common 4v9q-DV case (and similar ones) : e.g. chains contains 17 and 17A which are both read 17 by DSSR. | ||
276 | # Solution : we shift the numbering of 17A (to 18) and the following residues. | 294 | # Solution : we shift the numbering of 17A (to 18) and the following residues. |
277 | df.iloc[i:, 1] += 1 | 295 | df.iloc[i:, 1] += 1 |
278 | else: | 296 | else: |
279 | - # 4v9k-DA case : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ... | 297 | + # 4v9k-DA case (and similar ones) : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ... |
280 | # Here the 163B is read 163 by DSSR, but there already is a residue 163. | 298 | # Here the 163B is read 163 by DSSR, but there already is a residue 163. |
281 | # Solution : set nt_resnum[i] to nt_resnum[i-1] + 1, and shift the following by 1. | 299 | # Solution : set nt_resnum[i] to nt_resnum[i-1] + 1, and shift the following by 1. |
282 | df.iloc[i, 1] = 1 + df.iloc[i-1, 1] | 300 | df.iloc[i, 1] = 1 + df.iloc[i-1, 1] |
... | @@ -296,6 +314,7 @@ class Chain: | ... | @@ -296,6 +314,7 @@ class Chain: |
296 | self.mapping.log(df.tail(1)) | 314 | self.mapping.log(df.tail(1)) |
297 | df = df.head(-1) | 315 | df = df.head(-1) |
298 | 316 | ||
317 | + | ||
299 | # Duplicates in index_chain : drop, they are ligands | 318 | # Duplicates in index_chain : drop, they are ligands |
300 | # e.g. 3iwn_1_B_1-91, ligand C2E has index_chain 1 (and nt_resnum 601) | 319 | # e.g. 3iwn_1_B_1-91, ligand C2E has index_chain 1 (and nt_resnum 601) |
301 | duplicates = [ index for index, element in enumerate(df.duplicated(['index_chain']).values) if element ] | 320 | duplicates = [ index for index, element in enumerate(df.duplicated(['index_chain']).values) if element ] |
... | @@ -370,7 +389,6 @@ class Chain: | ... | @@ -370,7 +389,6 @@ class Chain: |
370 | # index_chain 1 |-------------|77 83|------------| 154 | 389 | # index_chain 1 |-------------|77 83|------------| 154 |
371 | # expected data point 1 |--------------------------------| 154 | 390 | # expected data point 1 |--------------------------------| 154 |
372 | # | 391 | # |
373 | - | ||
374 | if l != len(df['index_chain']): # if some residues are missing, len(df['index_chain']) < l | 392 | if l != len(df['index_chain']): # if some residues are missing, len(df['index_chain']) < l |
375 | resnum_start = df.iloc[0,1] | 393 | resnum_start = df.iloc[0,1] |
376 | diff = set(range(l)).difference(df['nt_resnum'] - resnum_start) # the rowIDs the missing nucleotides would have (rowID = index_chain - 1 = nt_resnum - resnum_start) | 394 | diff = set(range(l)).difference(df['nt_resnum'] - resnum_start) # the rowIDs the missing nucleotides would have (rowID = index_chain - 1 = nt_resnum - resnum_start) | ... | ... |
-
Please register or login to post a comment