Louis BECQUEY

Nt-specific Biopython PortionSelector

......@@ -3,7 +3,8 @@ nohup.out
log_of_the_run.sh
# results
results/
results/*
logs/*
# temporary results files
data/
......
......@@ -44,16 +44,15 @@ SSU_set = {"RF00177", "RF02542", "RF02545", "RF01959", "RF01960"} # From Rfam
no_nts_set = set()
weird_mappings = set()
class NtPortionSelector(object):
class SelectivePortionSelector(object):
"""Class passed to MMCIFIO to select some chain portions in an MMCIF file.
Decides, for every model, chain, residue and atom, whether it belongs to the selection or not.
"""
def __init__(self, model_id, chain_id, start, end, khetatm):
def __init__(self, model_id, chain_id, valid_resnums, khetatm):
self.chain_id = chain_id
self.start = start
self.end = end
self.resnums = valid_resnums
self.pdb_model_id = model_id
self.hydrogen_regex = re.compile("[123 ]*H.*")
self.keep_hetatm = khetatm
......@@ -77,7 +76,10 @@ class NtPortionSelector(object):
# warn(f"icode {icode} at position {resseq}\t\t")
# Accept the residue if it is in the right interval:
return int(self.start <= resseq <= self.end)
if len(self.resnums):
return int(resseq in self.resnums)
else:
return 1
def accept_atom(self, atom):
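For context: Biopython writers such as MMCIFIO probe a selector through its accept_model / accept_chain / accept_residue / accept_atom hooks and skip any element for which the hook returns a falsy value, which is why returning int(...) is enough here. A minimal, self-contained sketch of the same resnum-set filter (class name and values are illustrative, not part of this commit):

from Bio.PDB.PDBIO import Select

class ResnumSelector(Select):
    """Keep one chain, and only its residues whose number is in a given set (empty set = keep all)."""
    def __init__(self, chain_id, valid_resnums):
        self.chain_id = chain_id
        self.resnums = set(valid_resnums)

    def accept_chain(self, chain):
        return int(chain.get_id() == self.chain_id)

    def accept_residue(self, residue):
        _hetatm_flag, resseq, _icode = residue.get_id()
        return int(not self.resnums or resseq in self.resnums)

Subclassing Select provides permissive defaults for the hooks that are not overridden.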
......@@ -128,13 +130,12 @@ class Chain:
self.pdb_id = pdb_id # PDB ID
self.pdb_model = int(pdb_model) # model ID, starting at 1
self.pdb_chain_id = pdb_chain_id # chain ID (mmCIF), multiple letters
self.pdb_start = pdb_start # if portion of chain, the start number (relative to the chain, not residue numbers)
self.pdb_end = pdb_end # if portion of chain, the end number (relative to the chain, not residue numbers)
self.reversed = (pdb_start > pdb_end) if pdb_start is not None else False # whether pdb_start > pdb_end in the Rfam mapping
if len(rfam):
self.mapping = Mapping(chain_label, rfam, pdb_start, pdb_end, inferred)
else:
self.mapping = None
self.chain_label = chain_label # chain pretty name
self.file = "" # path to the 3D PDB file
self.rfam_fam = rfam # mapping to an RNA family
self.inferred = inferred # Whether this mapping has been inferred from BGSU's NR list
self.seq = "" # sequence with modified nts
self.seq_to_align = "" # sequence with modified nts replaced, but gaps can exist
self.length = -1 # length of the sequence (missing residues are not counted)
......@@ -156,8 +157,8 @@ class Chain:
""" Extract the part which is mapped to Rfam from the main CIF file and save it to another file.
"""
if self.pdb_start is not None and (self.pdb_end - self.pdb_start):
status = f"Extract {self.pdb_start}-{self.pdb_end} atoms from {self.pdb_id}-{self.pdb_chain_id}"
if self.mapping is not None:
status = f"Extract {self.mapping.nt_start}-{self.mapping.nt_end} atoms from {self.pdb_id}-{self.pdb_chain_id}"
self.file = path_to_3D_data+"rna_mapped_to_Rfam/"+self.chain_label+".cif"
else:
status = f"Extract {self.pdb_id}-{self.pdb_chain_id}"
......@@ -183,36 +184,19 @@ class Chain:
# Extract the desired chain
c = s[model_idx][self.pdb_chain_id]
if self.pdb_start is not None and (self.pdb_end - self.pdb_start):
# # Pay attention to residue numbering
# first_number = c.child_list[0].get_id()[1] # the chain's first residue is numbered 'first_number'
# if self.pdb_start < self.pdb_end:
# start = self.pdb_start + first_number - 1 # shift our start_position by 'first_number'
# end = self.pdb_end + first_number - 1 # same for the end position
# else:
# self.reversed = True # the 3D chain is numbered backwards compared to the Rfam family
# end = self.pdb_start + first_number - 1
# start = self.pdb_end + first_number - 1
if self.pdb_start < self.pdb_end:
start = self.pdb_start # shift our start_position by 'first_number'
end = self.pdb_end # same for the end position
else:
self.reversed = True # the 3D chain is numbered backwards compared to the Rfam family
end = self.pdb_start
start = self.pdb_end
if self.mapping is not None:
valid_set = set(self.mapping.old_nt_resnums)
else:
start = c.child_list[0].get_id()[1] # the chain's first residue is numbered 'first_number'
end = c.child_list[-1].get_id()[1] # the chain's last residue number
valid_set = set()
# Define a selection
# sel = ChainSelector(self.pdb_chain_id, start, end, model_id = model_idx)
sel = NtPortionSelector(model_idx, self.pdb_chain_id, start, end, khetatm)
sel = SelectivePortionSelector(model_idx, self.pdb_chain_id, valid_set, khetatm)
# Save that selection on the mmCIF object s to file
ioobj = MMCIFIO()
ioobj.set_structure(s)
ioobj.save(self.file, sel)
notify(status)
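End to end, the extraction above reduces to: parse the mmCIF, then save it through the selector, which filters while writing. A condensed sketch under the same assumptions (entry name and paths are illustrative):

from Bio.PDB import MMCIFParser
from Bio.PDB.mmcifio import MMCIFIO

s = MMCIFParser().get_structure("1abc", "1abc.cif")               # hypothetical entry
sel = SelectivePortionSelector(0, "A", {5, 6, 7, 8}, khetatm=True)
ioobj = MMCIFIO()
ioobj.set_structure(s)
ioobj.save("1abc_A_portion.cif", sel)                             # writes only the accepted atoms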
......@@ -225,7 +209,7 @@ class Chain:
try:
with open(path_to_3D_data + "annotations/" + self.pdb_id + ".json", 'r') as json_file:
json_object = json.load(json_file)
notify(f"Read {self.chain_label} DSSR annotations")
notify(f"Read {self.pdb_id} DSSR annotations")
except json.decoder.JSONDecodeError as e:
warn("Could not load "+self.pdb_id+f".json with JSON package: {e}", error=True)
self.delete_me = True
......@@ -275,87 +259,87 @@ class Chain:
return None
#############################################
# Solve some common issues and drop ligands
# Select the nucleotides we need
#############################################
# Shift numbering when duplicate residue numbers are found.
# Remove nucleotides of the chain that are outside the Rfam mapping, if any
if self.mapping is not None:
df = self.mapping.filter_df(df)
# Duplicate residue numbers : shift numbering
# Example: 4v9q-DV contains 17 and 17A, which are both read as 17 by DSSR.
while True in df.duplicated(['nt_resnum']).values:
i = df.duplicated(['nt_resnum']).values.tolist().index(True)
self.mapping.shift_resnum_range(i)
df.iloc[i:, 1] += 1
# Search for ligands at the end of the selection
# Drop ligands detected as residues by DSSR, by detecting several markers
df = df.drop_duplicates("index_chain", keep="first") # drop doublons in index_chain
while (len(df.index_chain) and df.iloc[[-1]].nt_name.tolist()[0] not in ["A", "C", "G", "U"] and
((df.iloc[[-1]][["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "v0", "v1", "v2", "v3", "v4"]].isna().values).all()
or (df.iloc[[-1]].puckering=='').any())
or (len(df.index_chain) >= 2 and df.iloc[[-1]].nt_resnum.iloc[0] > 50 + df.iloc[[-2]].nt_resnum.iloc[0])):
while ( len(df.index_chain) and df.iloc[-1,2] not in ["A", "C", "G", "U"] and (
(df.iloc[[-1]][["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "v0", "v1", "v2", "v3", "v4"]].isna().values).all()
or (df.iloc[[-1]].puckering=='').any()
)
or ( len(df.index_chain) >= 2 and df.iloc[-1,1] > 50 + df.iloc[-2,1] )
or ( len(df.index_chain) and df.iloc[-1,2] in ["GNG", "E2C", "OHX", "IRI", "MPD", "8UZ"] )
):
if self.mapping is not None:
self.mapping.drop_ligand(df.tail(1))
df = df.head(-1)
# drop any nts with index_chain < the first residue (usually ligands)
df = df.drop(df[df.index_chain < 0].index)
# Assert some nucleotides still exist
try:
l = df.iloc[-1,1] - df.iloc[0,1] + 1 # length of chain from nt_resnum point of view
except IndexError:
warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Ignoring chain {self.chain_label}.", error=True)
no_nts_set.add(self.pdb_id)
self.delete_me = True
self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. We expect a problem with {self.pdb_id} mmCIF download. Delete it and retry."
return None
# If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one
if df.iloc[0,0] != 1:
st = df.iloc[0,0] -1
df.iloc[:, 0] -= st
# Find missing index_chain values because of resolved nucleotides that have a strange nt_resnum value
# Duplicates in index_chain: drop them, they are ligands
# e.g. 3iwn_1_B_1-91, ligand C2E has index_chain 1 (and nt_resnum 601)
duplicates = [ index for index, element in enumerate(df.duplicated(['index_chain']).values) if element ]
if len(duplicates):
for i in duplicates:
warn(f"Found duplicated index_chain {df.iloc[i,0]} in {self.chain_label}. Keeping only the first.")
if self.mapping is not None:
self.mapping.log(f"Found duplicated index_chain {df.iloc[i,0]}. Keeping only the first.")
with open("duplicates.txt", "a") as f:
f.write(f"DEBUG: {self.chain_label} has duplicate index_chains !\n")
df = df.drop_duplicates("index_chain", keep="first") # drop doublons in index_chain
# drop any nts with index_chain < the first residue,
# now negative after the renumbering to 1 (usually ligands)
ligands = df[df.index_chain < 0]
if len(ligands.index_chain):
if self.mapping is not None:
for line in ligands.iterrows():
self.mapping.drop_ligand(line)
df = df.drop(ligands.index)
# Find missing index_chain values
# This happens because of resolved nucleotides that have a
# strange nt_resnum value
# e.g. 4v49-AA, position 5'- 1003 -> 2003 -> 1004 - 3'
diff = set(range(df.shape[0])).difference(df['index_chain'] - 1)
for i in sorted(diff):
# check if a nucleotide numbered +1000 exists
looked_for = df[df.index_chain == i].nt_resnum.values[0]
found = None
for nt in nts:
if nt['chain_name'] != self.pdb_chain_id:
continue
if nt['index_chain'] == i + 1:
found = nt
break
if found:
df_row = pd.DataFrame([found], index=[i])[df.columns.values]
df_row.iloc[0,1] = df.iloc[i,1]
df = pd.concat([ df.iloc[:i], df_row, df.iloc[i:] ])
df.iloc[i+1:, 1] += 1
else:
warn(f"Missing index_chain {i} in {self.chain_label} !")
# Remove nucleotides of the chain that are outside the Rfam mapping, if any
if self.pdb_start and self.pdb_end:
if self.pdb_start < self.pdb_end:
newdf = df.drop(df[(df.nt_resnum < self.pdb_start) | (df.nt_resnum > self.pdb_end)].index)
else:
newdf = df.drop(df[(df.nt_resnum < self.pdb_end) | (df.nt_resnum > self.pdb_start)].index)
if len(diff):
warn(f"Missing residues regarding index_chain: {[1+i for i in sorted(diff)]}")
for i in sorted(diff):
# check if a nucleotide numbered +1000 exists in the nts object
found = None
for nt in nts: # nts is the object from the loaded JSON and contains all nts
if nt['chain_name'] != self.pdb_chain_id:
continue
if nt['index_chain'] == i + 1:
found = nt
break
if found:
df_row = pd.DataFrame([found], index=[i])[df.columns.values]
if self.mapping is not None:
self.mapping.insert_new(i+1, found['nt_resnum'], df.iloc[i,1])
df_row.iloc[0,1] = df.iloc[i,1]
df = pd.concat([ df.iloc[:i], df_row, df.iloc[i:] ])
df.iloc[i+1:, 1] += 1
else:
warn(f"Missing index_chain {i} in {self.chain_label} !")
if len(newdf.index_chain) > 0:
# everything's okay
df = newdf
else:
# There were nucleotides in this chain but we removed them all while
# filtering the ones outside the Rfam mapping.
# This probably means that, for this chain, the mapping is relative to
# index_chain and not nt_resnum.
warn(f"Assuming {self.chain_label}'s mapping to {self.rfam_fam} is an absolute position interval.")
weird_mappings.add(self.chain_label + "." + self.rfam_fam)
df = df.drop(df[(df.index_chain < self.pdb_start) | (df.index_chain > self.pdb_end)].index)
# Assert some nucleotides still exist
try:
l = df.iloc[-1,1] - df.iloc[0,1] + 1 # update length of chain from nt_resnum point of view
except IndexError:
warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} between {self.pdb_start} and "
f"{self.pdb_end} ({'not ' if not self.inferred else ''}inferred). Ignoring chain {self.chain_label}.")
warn(f"Could not find real nucleotides of chain {self.pdb_chain_id} between {self.mapping.nt_start} and "
f"{self.mapping.nt_end} ({'not ' if not self.mapping.inferred else ''}inferred). Ignoring chain {self.chain_label}.")
no_nts_set.add(self.pdb_id)
self.delete_me = True
self.error_messages = f"Could not find nucleotides of chain {self.pdb_chain_id} in annotation {self.pdb_id}.json. Either there is a problem with {self.pdb_id} mmCIF download, or the bases are not resolved in the structure. Delete it and retry."
......@@ -464,7 +448,7 @@ class Chain:
df['nb_interact'] = interacts
df = df.drop(['nt_id'], axis=1) # remove now useless descriptors
if self.reversed:
if self.mapping.reversed:
# The 3D structure is numbered from 3' to 5' instead of standard 5' to 3'
# or the sequence that matches the Rfam family is 3' to 5' instead of standard 5' to 3'.
# Either way, the angles must be inverted.
......@@ -507,6 +491,10 @@ class Chain:
self.error_messages = "Sequence is too short. (< 5 resolved nts)"
return None
# Log chain info to file
if self.mapping is not None:
self.mapping.to_file(self.chain_label+".log")
return df
def register_chain(self, df):
......@@ -515,7 +503,7 @@ class Chain:
with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn:
# Register the chain in table chain
if self.pdb_start is not None:
if self.mapping.nt_start is not None:
sql_execute(conn, f""" INSERT INTO chain
(structure_id, chain_name, pdb_start, pdb_end, reversed, rfam_acc, inferred, issue)
VALUES
......@@ -527,14 +515,14 @@ class Chain:
inferred=excluded.inferred,
issue=excluded.issue;""",
data=(str(self.pdb_id), str(self.pdb_chain_id),
int(self.pdb_start), int(self.pdb_end),
int(self.reversed), str(self.rfam_fam),
int(self.inferred), int(self.delete_me)))
int(self.mapping.nt_start), int(self.mapping.nt_end),
int(self.mapping.reversed), str(self.mapping.rfam_acc),
int(self.mapping.inferred), int(self.delete_me)))
# get the chain id
self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain
WHERE structure_id='{self.pdb_id}'
AND chain_name='{self.pdb_chain_id}'
AND rfam_acc='{self.rfam_fam}';""")[0][0]
AND rfam_acc='{self.mapping.rfam_acc}';""")[0][0]
else:
sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, issue) VALUES (?, ?, NULL, ?)
ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue;""",
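The registration relies on SQLite's UPSERT syntax (INSERT ... ON CONFLICT ... DO UPDATE, available since SQLite 3.24), which requires a UNIQUE constraint on the conflict columns. A minimal sketch of the same idiom, with an illustrative table instead of the project's schema:

import sqlite3

with sqlite3.connect(":memory:") as conn:
    conn.execute("""CREATE TABLE chain (structure_id TEXT, chain_name TEXT, issue INT,
                                        UNIQUE(structure_id, chain_name))""")
    sql = """INSERT INTO chain (structure_id, chain_name, issue) VALUES (?, ?, ?)
             ON CONFLICT(structure_id, chain_name) DO UPDATE SET issue=excluded.issue;"""
    conn.execute(sql, ("1abc", "A", 0))
    conn.execute(sql, ("1abc", "A", 1))    # updates issue instead of inserting a duplicate row
    print(conn.execute("SELECT * FROM chain").fetchall())   # [('1abc', 'A', 1)]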
......@@ -886,6 +874,103 @@ class Downloader:
notify(f"Downloaded and extracted {unit} database from SILVA", "used previous file")
class Mapping:
"""
A custom class to store more information about nucleotide mappings.
"""
def __init__(self, chain_label, rfam_acc, pdb_start, pdb_end, inferred):
"""
Arguments:
rfam_acc : Rfam family accession number of the mapping
pdb_start/pdb_end : nt_resnum start and end values in the 3D data that are mapped to the family
inferred : whether the mapping has been inferred using BGSU's NR list
"""
self.chain_label = chain_label
self.rfam_acc = rfam_acc
self.nt_start = pdb_start # nt_resnum numbering
self.nt_end = pdb_end # nt_resnum numbering
self.reversed = (pdb_start > pdb_end)
self.inferred = inferred
self.interval = range(min(pdb_start, pdb_end), max(pdb_start, pdb_end)+1) # valid even when the mapping is reversed
self.old_nt_resnums = [] # to be computed
self.new_nt_resnums = [] #
self.logs = [] # Events are logged when modifying the mapping
def shift_resnum_range(self, i):
self.log(f"Shifting nt_resnum numbering because of duplicate residue {self.new_nt_resnums[i]}")
for j in range(i, len(self.new_nt_resnums)):
self.new_nt_resnums[j] += 1
def insert_new(self, index_chain, oldresnum, newresnum):
# Adds a nt that did not pass the mapping filter at first,
# because it was numbered with a very high nt_resnum value (outside the bounds of the mapping).
# In practice, its index_chain is correct, within the bounds, and it belongs to the mapped chain.
# Such nts are only searched for when index_chain values are missing within the mapping bounds.
# insert the nt_resnum values in the lists
self.old_nt_resnums.insert(index_chain-1, oldresnum)
self.new_nt_resnums.insert(index_chain-1, newresnum)
# shift the new_nt_resnum values if needed, to avoid creating a duplicate
if self.new_nt_resnums[index_chain-1] == self.new_nt_resnums[index_chain]:
for j in range(index_chain, len(self.new_nt_resnums)):
self.new_nt_resnums[j] += 1
# warn(f"Residue {index_chain} has been saved and renumbered {newresnum} instead of {oldresnum}")
self.log(f"Residue {index_chain} has been saved and renumbered {newresnum} instead of {oldresnum}")
def filter_df(self, df):
if not self.reversed:
newdf = df.drop(df[(df.nt_resnum < self.nt_start) | (df.nt_resnum > self.nt_end)].index)
else:
newdf = df.drop(df[(df.nt_resnum < self.nt_end) | (df.nt_resnum > self.nt_start)].index)
if len(newdf.index_chain) > 0:
# everything's okay
df = newdf
else:
# There were nucleotides in this chain but we removed them all while
# filtering the ones outside the Rfam mapping.
# This probably means that, for this chain, the mapping is relative to
# index_chain and not nt_resnum.
warn(f"Assuming mapping to {self.rfam_acc} is an absolute position interval.")
weird_mappings.add(self.chain_label + "." + self.rfam_acc)
df = df.drop(df[(df.index_chain < self.nt_start) | (df.index_chain > self.nt_end)].index)
# If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one
if len(df.index_chain) and df.iloc[0,0] != 1:
st = df.iloc[0,0] - 1
df.iloc[:, 0] -= st
self.old_nt_resnums = df.nt_resnum.tolist()
self.new_nt_resnums = df.nt_resnum.tolist()
return df
def drop_ligand(self, df_row):
self.log("Droping ligand:")
self.log(df_row)
i = self.new_nt_resnums.index(df_row.iloc[0,1])
self.old_nt_resnums.pop(i)
self.new_nt_resnums.pop(i)
def log(self, message):
if isinstance(message, str):
self.logs.append(message+'\n')
else:
self.logs.append(str(message))
def to_file(self, filename):
if not path.exists("logs"):
os.makedirs("logs", exist_ok=True)
with open("logs/"+filename, "w") as f:
f.writelines(self.logs)
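A short usage sketch of the new Mapping class on a toy annotation table (all values illustrative; filter_df assumes index_chain and nt_resnum are the first two columns, as in the DSSR DataFrame):

import pandas as pd

df = pd.DataFrame({"index_chain": [1, 2, 3, 4, 5],
                   "nt_resnum":   [900, 1001, 1002, 1003, 1200]})
m = Mapping("toy_chain", "RF00005", 1001, 1003, inferred=False)
df = m.filter_df(df)             # keeps resnums 1001-1003 and renumbers index_chain from 1
m.drop_ligand(df.tail(1))        # forget the last nt in the mapping...
df = df.head(-1)                 # ...and drop it from the table, as done above
m.to_file("toy_chain.log")       # dumps the logged events to logs/toy_chain.log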
class Pipeline:
def __init__(self):
self.dl = Downloader()
......@@ -1928,6 +2013,24 @@ def work_prepare_sequences(dl, rfam_acc, chains):
"""Prepares FASTA files of homologous sequences to realign with cmalign or SINA."""
if rfam_acc in LSU_set | SSU_set: # rRNA
if path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"):
# Detect duplicates and remove them
existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta")
existing_ids = [ r.id for r in existing_afa ]
del existing_afa
new_ids = [ str(c) for c in chains ]
doublons = [ i for i in existing_ids if i in new_ids ]
del existing_ids, new_ids
if len(doublons):
fasta = path_to_seq_data + f"realigned/{rfam_acc}++.fa"
warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.fa and using their newest version")
seqfile = SeqIO.parse(fasta, "fasta")
os.remove(fasta)
with open(fasta, 'w') as f:
for rec in seqfile:
if rec.id not in doublons:
f.write(rec.format("fasta"))
# Add the new sequences with previous ones, if any
with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "a") as f:
for c in chains:
......@@ -2037,7 +2140,7 @@ def work_realign(rfam_acc):
existing_stk = AlignIO.read(existing_ali_path, "stockholm")
existing_ids = [ r.id for r in existing_stk ]
del existing_stk
new_stk = AlignIO.read(new_ali_path, "stk")
new_stk = AlignIO.read(new_ali_path, "stockholm")
new_ids = [ r.id for r in new_stk ]
del new_stk
doublons = [ i for i in existing_ids if i in new_ids ]
......@@ -2046,8 +2149,9 @@ def work_realign(rfam_acc):
warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version")
with open(path_to_seq_data + "realigned/toremove.txt", "w") as toremove:
toremove.write('\n'.join(doublons)+'\n')
p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path],
p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path], stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
os.remove(path_to_seq_data + "realigned/toremove.txt")
# And we merge the two alignments
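The esl-alimanip change above fixes two things at once: the old call never passed the input alignment, and its -o target was the very file being filtered. The new call reads existing_ali_path, writes to a temporary file, then renames it over the original. The same write-then-rename pattern in plain Python (paths illustrative; os.replace would be an in-process alternative to the mv subprocess):

import os, subprocess

src = "realigned/RF00005++.stk"                 # hypothetical alignment
subprocess.run(["esl-alimanip", "--seq-r", "toremove.txt", "-o", src + "2", src],
               stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
os.replace(src + "2", src)                      # atomic rename on POSIX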
......@@ -2075,7 +2179,7 @@ def work_realign(rfam_acc):
if len(stderr):
print('', flush=True)
warn(f"Error while during sequence alignment: {stderr}", error=True)
warn(f"Error during sequence alignment: {stderr}", error=True)
with open(runDir + "/errors.txt", "a") as er:
er.write(f"Attempting to realign {rfam_acc}:\n" + stderr + '\n')
return 1
......@@ -2087,7 +2191,7 @@ def work_realign(rfam_acc):
subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part.
# Assert everything worked, or save an error
with open(path_to_seq_data + f"realigned/{rfam_acc}++.afa") as output:
with open(path_to_seq_data + f"realigned/{rfam_acc}++.afa", 'r') as output:
if not len(output.readline()):
# The process crashed, probably because of RAM overflow
warn(f"Failed to realign {rfam_acc} (killed)", error=True)
......@@ -2237,7 +2341,7 @@ def work_save(c, homology=True):
NATURAL JOIN nucleotide
NATURAL JOIN align_column;""",
conn)
filename = path_to_3D_data + "datapoints/" + c.chain_label + '.' + c.rfam_fam
filename = path_to_3D_data + "datapoints/" + c.chain_label + '.' + c.mapping.rfam_acc
else:
df = pd.read_sql_query(f"""
SELECT index_chain, nt_resnum, nt_position, nt_name, nt_code, nt_align_code,
......@@ -2280,7 +2384,6 @@ if __name__ == "__main__":
pp.dl_and_annotate(coeff_ncores=0.5)
# At this point, the structure table is up to date
pp.build_chains(coeff_ncores=1.0)
if len(pp.to_retry):
......@@ -2293,7 +2396,8 @@ if __name__ == "__main__":
print(f"Among errors, {len(no_nts_set)} structures seem to contain RNA chains without defined nucleotides:", no_nts_set, flush=True)
if len(weird_mappings):
print(f"{len(weird_mappings)} mappings to Rfam were taken as absolute positions instead of residue numbers:", weird_mappings, flush=True)
pp.checkpoint_save_chains()
if pp.SELECT_ONLY is None:
pp.checkpoint_save_chains()
if not pp.HOMOLOGY:
# Save chains to file
......@@ -2301,7 +2405,7 @@ if __name__ == "__main__":
work_save(c, homology=False)
print("Completed.")
exit(0)
# At this point, structure, chain and nucleotide tables of the database are up to date.
# (Modulo some statistics computed by statistics.py)
......@@ -2309,15 +2413,16 @@ if __name__ == "__main__":
# Homology information
# ===========================================================================
pp.checkpoint_load_chains() # If your job failed, you can comment all the "3D information" part and start from here.
if pp.SELECT_ONLY is None:
pp.checkpoint_load_chains() # If your job failed, you can comment all the "3D information" part and start from here.
# Get the list of Rfam families found
rfam_acc_to_download = {}
for c in pp.loaded_chains:
if c.rfam_fam not in rfam_acc_to_download:
rfam_acc_to_download[c.rfam_fam] = [ c ]
if c.mapping.rfam_acc not in rfam_acc_to_download:
rfam_acc_to_download[c.mapping.rfam_acc] = [ c ]
else:
rfam_acc_to_download[c.rfam_fam].append(c)
rfam_acc_to_download[c.mapping.rfam_acc].append(c)
print(f"> Identified {len(rfam_acc_to_download.keys())} families to update and re-align with the crystals' sequences")
pp.fam_list = sorted(rfam_acc_to_download.keys())
......
......@@ -3,7 +3,7 @@
# Run RNANet
cd /home/lbecquey/Projects/RNANet;
rm -f stdout.txt stderr.txt errors.txt;
time './RNAnet.py --3d-folder /home/lbequey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/' > stdout.txt 2> stderr.txt;
time './RNAnet.py --3d-folder /home/lbequey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -s -r 20.0' > stdout.txt 2> stderr.txt;
# Sync in Seafile
seaf-cli start;
......
......@@ -433,6 +433,7 @@ def to_dist_matrix(f):
notify(f"Computed {f} distance matrix", "loaded from file")
return 0
notify(f"Computing {f} distance matrix from alignment...")
dm = DistanceCalculator('identity')
with open(path_to_seq_data+"/realigned/"+f+"++.afa") as al_file:
al = AlignIO.read(al_file, "fasta")[-len(mappings_list[f]):]
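For reference, DistanceCalculator comes from Bio.Phylo.TreeConstruction; a minimal standalone version of this computation (file name and sequence ids illustrative):

from Bio import AlignIO
from Bio.Phylo.TreeConstruction import DistanceCalculator

with open("realigned/RF00005++.afa") as al_file:
    al = AlignIO.read(al_file, "fasta")
dm = DistanceCalculator('identity').get_distance(al)   # pairwise 1 - fraction_identical
print(dm["seq1", "seq2"])                              # lookup by record ids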
......@@ -542,10 +543,10 @@ if __name__ == "__main__":
threads = [
th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 1}),
th.Thread(target=reproduce_wadley_results, kwargs={'carbon': 4}),
# th.Thread(target=stats_len), # computes figures
# th.Thread(target=stats_freq), # Updates the database
# th.Thread(target=seq_idty), # produces .npy files and seq idty figures
# th.Thread(target=per_chain_stats) # Updates the database
th.Thread(target=stats_len), # computes figures
th.Thread(target=stats_freq), # Updates the database
th.Thread(target=seq_idty), # produces .npy files and seq idty figures
th.Thread(target=per_chain_stats) # Updates the database
]
# Start the threads
......