resnum redundancy followed by gaps is not correctly parsed

Louis BECQUEY
Commit a4b2c505f1a73eb9a19d70dfef19531380bb908d a4b2c505 1 parent fa955118
Showing 1 changed file with 25 additions and 7 deletions
RNAnet.py
--- a/RNAnet.py
View file @a4b2c50
+++ b/RNAnet.py
View file @a4b2c50
@@ -254,7 +254,7 @@ class Chain:
         #############################################
         # Select the nucleotides we need
         #############################################
-        
+
         # Remove nucleotides of the chain that are outside the Rfam mapping, if any
         if self.mapping is not None:
             if self.mapping.nt_start > self.mapping.nt_end:
@@ -267,16 +267,34 @@ class Chain:
         # Duplicate residue numbers : shift numbering
         while True in df.duplicated(['nt_resnum']).values:
             i = df.duplicated(['nt_resnum']).values.tolist().index(True)
-            resnumlist = df.nt_resnum.tolist()
+            duplicates = df[df.nt_resnum == df.iloc[i,1]]
+            n_dup = len(duplicates.nt_resnum)
+            index_last_dup = duplicates.index_chain.iloc[-1] - 1
             if self.mapping is not None:
-                self.mapping.log(f"Shifting nt_resnum numbering because of duplicate residue {resnumlist[i]}")
+                self.mapping.log(f"Shifting nt_resnum numbering because of {n_dup} duplicate residues {df.iloc[i,1]}")
+
+            if df.iloc[i,1] == df.iloc[i-1,1] and df.iloc[index_last_dup + 1, 1] - 1 > df.iloc[index_last_dup, 1]:
+                # The redundant nts are consecutive in the chain (at the begining at least), and there is a gap at the end
-            if resnumlist[i] == resnumlist[i-1]:
+                if duplicates.iloc[n_dup-1, 0] - duplicates.iloc[0, 0] + 1 == n_dup:
-                # Common 4v9q-DV case : e.g. chains contains 17 and 17A which are both read 17 by DSSR.
+                    # They are all contiguous in the chain
+                    # 4v9n-DA case (and similar ones) : 610-611-611A-611B-611C-611D-611E-611F-611G-617-618...
+                    # there is a redundancy (611) followed by a gap (611-617). 
+                    # We want the redundancy to fill the gap.
+                    df.iloc[i:i+n_dup-1, 1] += 1
+                else:
+                    # We solve the problem continous component by continuous component
+                    for j in range(1, n_dup+1):
+                        if duplicates.iloc[j,0] == 1 + duplicates.iloc[j-1,0]: # continuous
+                            df.iloc[i+j-1,1] += 1
+                        else:
+                            break
+            elif df.iloc[i,1] == df.iloc[i-1,1]:
+                # Common 4v9q-DV case (and similar ones) : e.g. chains contains 17 and 17A which are both read 17 by DSSR.
                 # Solution : we shift the numbering of 17A (to 18) and the following residues.
                 df.iloc[i:, 1] += 1
             else:
-                # 4v9k-DA case : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ...
+                # 4v9k-DA case (and similar ones) : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ...
                 # Here the 163B is read 163 by DSSR, but there already is a residue 163.
                 # Solution : set nt_resnum[i] to nt_resnum[i-1] + 1, and shift the following by 1.
                 df.iloc[i, 1] = 1 + df.iloc[i-1, 1]
@@ -296,6 +314,7 @@ class Chain:
                 self.mapping.log(df.tail(1))
             df = df.head(-1) 
+
         # Duplicates in index_chain : drop, they are ligands
         # e.g. 3iwn_1_B_1-91, ligand C2E has index_chain 1 (and nt_resnum 601)
         duplicates = [ index for index, element in enumerate(df.duplicated(['index_chain']).values) if element ]
@@ -370,7 +389,6 @@ class Chain:
         # index_chain            1 |-------------|77 83|------------|  154 
         # expected data point    1 |--------------------------------|  154
         #
-
         if l != len(df['index_chain']):         # if some residues are missing, len(df['index_chain']) < l
             resnum_start = df.iloc[0,1]
             diff = set(range(l)).difference(df['nt_resnum'] - resnum_start)     # the rowIDs the missing nucleotides would have (rowID = index_chain - 1 = nt_resnum - resnum_start)