Merge branch 'master' of https://github.com/persalteas/RNANet into master

Louis BECQUEY
Commit 6cc5142fdbb1330de0ccd673554f02c185cffd90 6cc5142f 2 parents 60bd1aec d645ce5e
Showing 1 changed file with 27 additions and 19 deletions
RNAnet.py
--- a/RNAnet.py
View file @6cc5142
+++ b/RNAnet.py
View file @6cc5142
@@ -132,9 +132,11 @@ class BufferingSummaryInfo(AlignInfo.SummaryInfo):
 class Chain:
-    """ The object which stores all our data and the methods to process it.
+    """ 
+    The object which stores all our data and the methods to process it.
-    Chains accumulate information through this scipt, and are saved to files at the end of major steps."""
+    Chains accumulate information through this script, and are saved to files at the end of major steps.
+    """
     def __init__(self, pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, rfam="", inferred=False, pdb_start=None, pdb_end=None):
         self.pdb_id = pdb_id                    # PDB ID
@@ -144,6 +146,7 @@ class Chain:
             self.mapping = Mapping(chain_label, rfam, pdb_start, pdb_end, inferred)
         else:
             self.mapping = None
+        self.eq_class = eq_class                # BGSU NR list class id
         self.chain_label = chain_label          # chain pretty name 
         self.file = ""                          # path to the 3D PDB file
         self.seq = ""                           # sequence with modified nts
@@ -523,30 +526,33 @@ class Chain:
             # Register the chain in table chain
             if self.mapping is not None:
                 sql_execute(conn, f"""  INSERT INTO chain 
-                                        (structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, issue)
+                                        (structure_id, chain_name, pdb_start, pdb_end, rfam_acc, eq_class, inferred, issue)
                                         VALUES 
-                                        (?, ?, ?, ?, ?, ?, ?)
+                                        (?, ?, ?, ?, ?, ?, ?, ?)
                                         ON CONFLICT(structure_id, chain_name, rfam_acc) DO
                                         UPDATE SET  pdb_start=excluded.pdb_start, 
                                                     pdb_end=excluded.pdb_end, 
+                                                    eq_class=excluded.eq_class,
                                                     inferred=excluded.inferred, 
                                                     issue=excluded.issue;""", 
                                         data=(str(self.pdb_id), str(self.pdb_chain_id), 
                                               int(self.mapping.nt_start), int(self.mapping.nt_end), 
-                                              str(self.mapping.rfam_acc), 
+                                              str(self.mapping.rfam_acc), str(self.eq_class),
                                               int(self.mapping.inferred), int(self.delete_me)))
                 # get the chain id
                 self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain 
                                                     WHERE structure_id='{self.pdb_id}' 
                                                     AND chain_name='{self.pdb_chain_id}' 
-                                                    AND rfam_acc='{self.mapping.rfam_acc}';""")[0][0]
+                                                    AND rfam_acc='{self.mapping.rfam_acc}'
+                                                    AND eq_class='{self.eq_class}';""")[0][0]
             else:
-                sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, issue) VALUES (?, ?, NULL, ?) 
+                sql_execute(conn, """INSERT INTO chain (structure_id, chain_name, rfam_acc, eq_class, issue) VALUES (?, ?, NULL, ?, ?) 
-                                   ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue;""", 
+                                   ON CONFLICT(structure_id, chain_name, rfam_acc) DO UPDATE SET issue=excluded.issue, eq_class=excluded.eq_class;""", 
-                            data=(str(self.pdb_id), str(self.pdb_chain_id), int(self.delete_me)))
+                            data=(str(self.pdb_id), str(self.pdb_chain_id), str(self.eq_class), int(self.delete_me)))
                 self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain 
                                                     WHERE structure_id='{self.pdb_id}' 
                                                     AND chain_name='{self.pdb_chain_id}' 
+                                                    AND eq_class='{self.eq_class}'
                                                     AND rfam_acc IS NULL;""")[0][0]
             # Add the nucleotides if the chain is not an issue
@@ -859,14 +865,14 @@ class Downloader:
             if path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"):
                 print("\t> Use of the previous version.\t", end = "", flush=True)
             else:
-                return [], []
+                return pd.DataFrame([], columns=["class", "class_members"])
         nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv")
-        full_structures_list = nrlist['class_members'].tolist()
+        full_structures_list = [ tuple(i[1]) for i in nrlist[['class','class_members']].iterrows() ]
         print(f"\t{validsymb}", flush=True)
         # The beginning of an adventure.
-        return full_structures_list
+        return full_structures_list # list of ( str (class), str (class_members) )
     def download_from_SILVA(self, unit):
         if not path.isfile(path_to_seq_data + f"realigned/{unit}.arb"):
@@ -1068,8 +1074,8 @@ class Pipeline:
             elif opt == "--from-scratch":
                 warn("Deleting previous database and recomputing from scratch.")
                 subprocess.run(["rm", "-rf", 
-                                path_to_3D_data + "annotations",
+                                # path_to_3D_data + "annotations",  # DEBUG : keep the annotations !
-                                # path_to_3D_data + "RNAcifs",  # DEBUG : keep the cifs !
+                                # path_to_3D_data + "RNAcifs",      # DEBUG : keep the cifs !
                                 path_to_3D_data + "rna_mapped_to_Rfam",
                                 path_to_3D_data + "rnaonly",
                                 path_to_seq_data + "realigned",
@@ -1103,7 +1109,7 @@ class Pipeline:
         If self.HOMOLOGY is set to False, simply returns a list of Chain() objects with available 3D chains."""
         # List all 3D RNA chains below given resolution
-        full_structures_list = self.dl.download_BGSU_NR_list(self.CRYSTAL_RES)
+        full_structures_list = self.dl.download_BGSU_NR_list(self.CRYSTAL_RES) # list of tuples ( class, class_members )
         # Check for a list of known problems:
         if path.isfile(runDir + "/known_issues.txt"):
@@ -1140,8 +1146,8 @@ class Pipeline:
                 exit(1)
         else:
             conn = sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0)
-            for codelist in tqdm(full_structures_list):
+            for eq_class, codelist in tqdm(full_structures_list):
-                codes = str(codelist).replace('+',',').split(',')
+                codes = codelist.replace('+',',').split(',')
                 # Simply convert the list of codes to Chain() objects
                 for c in codes:
@@ -1408,7 +1414,7 @@ class Pipeline:
         with sqlite3.connect(runDir+"/results/RNANet.db") as conn:
             pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;", 
                             conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False)
-            pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure 
+            pd.read_sql_query("""SELECT eq_class, structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure 
                                 JOIN chain ON structure.pdb_id = chain.structure_id
                                 ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False)
@@ -1522,6 +1528,7 @@ def sql_define_tables(conn):
                 chain_id        INTEGER PRIMARY KEY NOT NULL,
                 structure_id    CHAR(4) NOT NULL,
                 chain_name      VARCHAR(2) NOT NULL,
+                eq_class        VARCHAR(10),
                 pdb_start       SMALLINT,
                 pdb_end         SMALLINT,
                 issue           TINYINT,
@@ -1785,7 +1792,8 @@ def work_infer_mappings(update_only, allmappings, codelist):
     known_mappings = pd.DataFrame()
     # Split the comma-separated list of chain codes into chain codes:
-    codes = str(codelist).replace('+',',').split(',')
+    eq_class = codelist[0]
+    codes = codelist[1].replace('+',',').split(',')
     # Search for mappings that apply to an element of this PDB chains list:
     for c in codes: