Khodor

Updated RNANet to store only representative members

Showing 1 changed file with 19 additions and 7 deletions
...@@ -845,14 +845,14 @@ class Downloader: ...@@ -845,14 +845,14 @@ class Downloader:
845 if os.path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"): 845 if os.path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"):
846 print("\t> Use of the previous version.\t", end="", flush=True) 846 print("\t> Use of the previous version.\t", end="", flush=True)
847 else: 847 else:
848 - return pd.DataFrame([], columns=["class", "class_members"]) 848 + return pd.DataFrame([], columns=["class","representative","class_members"])
849 849
850 nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv") 850 nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv")
851 - full_structures_list = [ tuple(i[1]) for i in nrlist[['class', 'class_members']].iterrows() ] 851 + full_structures_list = [ tuple(i[1]) for i in nrlist[["class","representative","class_members"]].iterrows() ]
852 print(f"\t{validsymb}", flush=True) 852 print(f"\t{validsymb}", flush=True)
853 853
854 # The beginning of an adventure. 854 # The beginning of an adventure.
855 - return full_structures_list # list of ( str (class), str (class_members) ) 855 + return full_structures_list # list of ( str (class), str (representative), str (class_members) )
856 856
857 def download_from_SILVA(self, unit): 857 def download_from_SILVA(self, unit):
858 858
...@@ -966,6 +966,7 @@ class Pipeline: ...@@ -966,6 +966,7 @@ class Pipeline:
966 self.RUN_STATS = False 966 self.RUN_STATS = False
967 self.EXTRACT_CHAINS = False 967 self.EXTRACT_CHAINS = False
968 self.REUSE_ALL = False 968 self.REUSE_ALL = False
969 + self.REDUNDANT = False
969 self.SELECT_ONLY = None 970 self.SELECT_ONLY = None
970 self.ARCHIVE = False 971 self.ARCHIVE = False
971 self.SAVELOGS = True 972 self.SAVELOGS = True
...@@ -982,7 +983,7 @@ class Pipeline: ...@@ -982,7 +983,7 @@ class Pipeline:
982 983
983 try: 984 try:
984 opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=", "seq-folder=", "keep-hetatm=", "only=", "maxcores=", 985 opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=", "seq-folder=", "keep-hetatm=", "only=", "maxcores=",
985 - "from-scratch", "full-inference", "no-homology", "ignore-issues", "extract", 986 + "from-scratch", "full-inference", "no-homology","redundant", "ignore-issues", "extract",
986 "all", "no-logs", "archive", "update-homologous", "version"]) 987 "all", "no-logs", "archive", "update-homologous", "version"])
987 except getopt.GetoptError as err: 988 except getopt.GetoptError as err:
988 print(err) 989 print(err)
...@@ -1006,6 +1007,7 @@ class Pipeline: ...@@ -1006,6 +1007,7 @@ class Pipeline:
1006 print("--------------------------------------------------------------------------------------------------------------") 1007 print("--------------------------------------------------------------------------------------------------------------")
1007 print("-f [ --full-inference ]\t\tInfer new mappings even if Rfam already provides some. Yields more copies of" 1008 print("-f [ --full-inference ]\t\tInfer new mappings even if Rfam already provides some. Yields more copies of"
1008 "\n\t\t\t\t chains mapped to different families.") 1009 "\n\t\t\t\t chains mapped to different families.")
1010 + print("--redundant\t\t\t\tStore the class members in the database thoughts to be redundant for predictions.")
1009 print("-s\t\t\t\tRun statistics computations after completion") 1011 print("-s\t\t\t\tRun statistics computations after completion")
1010 print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.") 1012 print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.")
1011 print("--keep-hetatm=False\t\t(True | False) Keep ions, waters and ligands in produced mmCIF files. " 1013 print("--keep-hetatm=False\t\t(True | False) Keep ions, waters and ligands in produced mmCIF files. "
...@@ -1103,6 +1105,8 @@ class Pipeline: ...@@ -1103,6 +1105,8 @@ class Pipeline:
1103 ncores = min(ncores, int(arg)) 1105 ncores = min(ncores, int(arg))
1104 elif opt == "-f" or opt == "--full-inference": 1106 elif opt == "-f" or opt == "--full-inference":
1105 self.FULLINFERENCE = True 1107 self.FULLINFERENCE = True
1108 + elif opt=="--redundant":
1109 + self.REDUNDANT=True
1106 1110
1107 if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions": 1111 if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions":
1108 print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments") 1112 print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments")
...@@ -1151,7 +1155,8 @@ class Pipeline: ...@@ -1151,7 +1155,8 @@ class Pipeline:
1151 work_infer_mappings, 1155 work_infer_mappings,
1152 not self.REUSE_ALL, 1156 not self.REUSE_ALL,
1153 allmappings, 1157 allmappings,
1154 - self.FULLINFERENCE 1158 + self.FULLINFERENCE,
1159 + self.REDUNDANT
1155 ), 1160 ),
1156 full_structures_list, 1161 full_structures_list,
1157 chunksize=1)): 1162 chunksize=1)):
...@@ -1905,7 +1910,7 @@ def execute_joblist(fulljoblist): ...@@ -1905,7 +1910,7 @@ def execute_joblist(fulljoblist):
1905 return results 1910 return results
1906 1911
1907 @trace_unhandled_exceptions 1912 @trace_unhandled_exceptions
1908 -def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> list: 1913 +def work_infer_mappings(update_only, allmappings, fullinference,redundant, codelist) -> list:
1909 """Given a list of PDB chains corresponding to an equivalence class from BGSU's NR list, 1914 """Given a list of PDB chains corresponding to an equivalence class from BGSU's NR list,
1910 build a list of Chain() objects mapped to Rfam families, by expanding available mappings 1915 build a list of Chain() objects mapped to Rfam families, by expanding available mappings
1911 of any element of the list to all the list elements. 1916 of any element of the list to all the list elements.
...@@ -1919,7 +1924,7 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li ...@@ -1919,7 +1924,7 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li
1919 # Split the comma-separated list of chain codes into chain codes: 1924 # Split the comma-separated list of chain codes into chain codes:
1920 eq_class = codelist[0] 1925 eq_class = codelist[0]
1921 codes = codelist[1].replace('+', ',').split(',') 1926 codes = codelist[1].replace('+', ',').split(',')
1922 - 1927 + representative=codelist[1].replace('+', ',').split(',')[0]
1923 # Search for mappings that apply to an element of this PDB chains list: 1928 # Search for mappings that apply to an element of this PDB chains list:
1924 for c in codes: 1929 for c in codes:
1925 # search for Rfam mappings with this chain c: 1930 # search for Rfam mappings with this chain c:
...@@ -2008,6 +2013,13 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li ...@@ -2008,6 +2013,13 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li
2008 2013
2009 # Now build Chain() objects for the mapped chains 2014 # Now build Chain() objects for the mapped chains
2010 for c in codes: 2015 for c in codes:
2016 +
2017 + if not redundant and c!=representative:
2018 + '''
2019 + By default, keep only the representative member of the class;
2020 + if redundant is set, keep the chains of all the class members instead.
2021 + '''
2022 + continue
2011 nr = c.split('|') 2023 nr = c.split('|')
2012 pdb_id = nr[0].lower() 2024 pdb_id = nr[0].lower()
2013 pdb_model = int(nr[1]) 2025 pdb_model = int(nr[1])
......