Louis Becquey

Merged Khodor-internship to master + general cmalign options

...@@ -40,7 +40,7 @@ from Bio.SeqIO.FastaIO import FastaIterator, SimpleFastaParser ...@@ -40,7 +40,7 @@ from Bio.SeqIO.FastaIO import FastaIterator, SimpleFastaParser
40 from Bio.Seq import MutableSeq 40 from Bio.Seq import MutableSeq
41 from Bio.SeqRecord import SeqRecord 41 from Bio.SeqRecord import SeqRecord
42 from Bio.Align import MultipleSeqAlignment 42 from Bio.Align import MultipleSeqAlignment
43 - 43 +from pydca.plmdca import plmdca
44 runDir = os.getcwd() 44 runDir = os.getcwd()
45 45
46 def trace_unhandled_exceptions(func): 46 def trace_unhandled_exceptions(func):
...@@ -76,11 +76,11 @@ python_executable = "python"+".".join(platform.python_version().split('.')[:2]) ...@@ -76,11 +76,11 @@ python_executable = "python"+".".join(platform.python_version().split('.')[:2])
76 validsymb = '\U00002705' 76 validsymb = '\U00002705'
77 warnsymb = '\U000026A0' 77 warnsymb = '\U000026A0'
78 errsymb = '\U0000274C' 78 errsymb = '\U0000274C'
79 -
80 LSU_set = {"RF00002", "RF02540", "RF02541", 79 LSU_set = {"RF00002", "RF02540", "RF02541",
81 "RF02543", "RF02546"} # From Rfam CLAN 00112 80 "RF02543", "RF02546"} # From Rfam CLAN 00112
82 SSU_set = {"RF00177", "RF02542", "RF02545", 81 SSU_set = {"RF00177", "RF02542", "RF02545",
83 "RF01959", "RF01960"} # From Rfam CLAN 00111 82 "RF01959", "RF01960"} # From Rfam CLAN 00111
83 +
84 no_nts_set = set() 84 no_nts_set = set()
85 weird_mappings = set() 85 weird_mappings = set()
86 86
...@@ -845,14 +845,14 @@ class Downloader: ...@@ -845,14 +845,14 @@ class Downloader:
845 if os.path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"): 845 if os.path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"):
846 print("\t> Use of the previous version.\t", end="", flush=True) 846 print("\t> Use of the previous version.\t", end="", flush=True)
847 else: 847 else:
848 - return pd.DataFrame([], columns=["class", "class_members"]) 848 + return pd.DataFrame([], columns=["class","representative","class_members"])
849 849
850 nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv") 850 nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv")
851 - full_structures_list = [ tuple(i[1]) for i in nrlist[['class', 'class_members']].iterrows() ] 851 + full_structures_list = [ tuple(i[1]) for i in nrlist[["class","representative","class_members"]].iterrows() ]
852 print(f"\t{validsymb}", flush=True) 852 print(f"\t{validsymb}", flush=True)
853 853
854 # The beginning of an adventure. 854 # The beginning of an adventure.
855 - return full_structures_list # list of ( str (class), str (class_members) ) 855 + return full_structures_list # list of ( str (class), str(representative),str (class_members) )
856 856
857 def download_from_SILVA(self, unit): 857 def download_from_SILVA(self, unit):
858 858
...@@ -966,6 +966,9 @@ class Pipeline: ...@@ -966,6 +966,9 @@ class Pipeline:
966 self.RUN_STATS = False 966 self.RUN_STATS = False
967 self.EXTRACT_CHAINS = False 967 self.EXTRACT_CHAINS = False
968 self.REUSE_ALL = False 968 self.REUSE_ALL = False
969 + self.REDUNDANT = False
970 + self.ALIGNOPTS = None
971 + self.USESINA = False
969 self.SELECT_ONLY = None 972 self.SELECT_ONLY = None
970 self.ARCHIVE = False 973 self.ARCHIVE = False
971 self.SAVELOGS = True 974 self.SAVELOGS = True
...@@ -981,8 +984,9 @@ class Pipeline: ...@@ -981,8 +984,9 @@ class Pipeline:
981 setproctitle("RNANet.py process_options()") 984 setproctitle("RNANet.py process_options()")
982 985
983 try: 986 try:
984 - opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=", "seq-folder=", "keep-hetatm=", "only=", "maxcores=", 987 + opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=", "seq-folder=", "keep-hetatm=",
985 - "from-scratch", "full-inference", "no-homology", "ignore-issues", "extract", 988 + "only=", "cmalign-opts=", "maxcores=", "sina", "from-scratch",
989 + "full-inference", "no-homology", "redundant", "ignore-issues", "extract",
986 "all", "no-logs", "archive", "update-homologous", "version"]) 990 "all", "no-logs", "archive", "update-homologous", "version"])
987 except getopt.GetoptError as err: 991 except getopt.GetoptError as err:
988 print(err) 992 print(err)
...@@ -1006,6 +1010,7 @@ class Pipeline: ...@@ -1006,6 +1010,7 @@ class Pipeline:
1006 print("--------------------------------------------------------------------------------------------------------------") 1010 print("--------------------------------------------------------------------------------------------------------------")
1007 print("-f [ --full-inference ]\t\tInfer new mappings even if Rfam already provides some. Yields more copies of" 1011 print("-f [ --full-inference ]\t\tInfer new mappings even if Rfam already provides some. Yields more copies of"
1008 "\n\t\t\t\t chains mapped to different families.") 1012 "\n\t\t\t\t chains mapped to different families.")
1013 +        print("--redundant\t\t\t\tStore in the database the class members thought to be redundant for predictions.")
1009 print("-s\t\t\t\tRun statistics computations after completion") 1014 print("-s\t\t\t\tRun statistics computations after completion")
1010 print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.") 1015 print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.")
1011 print("--keep-hetatm=False\t\t(True | False) Keep ions, waters and ligands in produced mmCIF files. " 1016 print("--keep-hetatm=False\t\t(True | False) Keep ions, waters and ligands in produced mmCIF files. "
...@@ -1022,9 +1027,12 @@ class Pipeline: ...@@ -1022,9 +1027,12 @@ class Pipeline:
1022 print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files. Subfolders will be:" 1027 print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files. Subfolders will be:"
1023 "\n\t\t\t\t\trfam_sequences/fasta/\tCompressed hits to Rfam families" 1028 "\n\t\t\t\t\trfam_sequences/fasta/\tCompressed hits to Rfam families"
1024 "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family") 1029 "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family")
1030 +        print("--sina\t\t\tForce RNANet to align the large (LSU) and small (SSU) ribosomal subunit RNAs using SINA instead of Infernal;"
1031 +              "\n\t\t\t\t\t all other RNA families are still aligned with Infernal.")
1025 print("--maxcores=…\t\t\tLimit the number of cores to use in parallel portions to reduce the simultaneous" 1032 print("--maxcores=…\t\t\tLimit the number of cores to use in parallel portions to reduce the simultaneous"
1026 "\n\t\t\t\t need of RAM. Should be a number between 1 and your number of CPUs. Note that portions" 1033 "\n\t\t\t\t need of RAM. Should be a number between 1 and your number of CPUs. Note that portions"
1027 "\n\t\t\t\t of the pipeline already limit themselves to 50% or 70% of that number by default.") 1034 "\n\t\t\t\t of the pipeline already limit themselves to 50% or 70% of that number by default.")
1035 + print("--cmalign-opts=…\t\tA string of additional options to pass to cmalign aligner, e.g. \"--nonbanded --mxsize 2048\"")
1028 print("--archive\t\t\tCreate tar.gz archives of the datapoints text files and the alignments," 1036 print("--archive\t\t\tCreate tar.gz archives of the datapoints text files and the alignments,"
1029 "\n\t\t\t\t and update the link to the latest archive. ") 1037 "\n\t\t\t\t and update the link to the latest archive. ")
1030 print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications") 1038 print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications")
...@@ -1042,8 +1050,8 @@ class Pipeline: ...@@ -1042,8 +1050,8 @@ class Pipeline:
1042 print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --no-logs' &") 1050 print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --no-logs' &")
1043 sys.exit() 1051 sys.exit()
1044 elif opt == '--version': 1052 elif opt == '--version':
1045 - print("RNANet v1.3 beta, parallelized, Dockerized") 1053 + print("RNANet v1.4 beta, parallelized, Dockerized")
1046 - print("Last revision : Jan 2021") 1054 + print("Last revision : April 2021")
1047 sys.exit() 1055 sys.exit()
1048 elif opt == "-r" or opt == "--resolution": 1056 elif opt == "-r" or opt == "--resolution":
1049 assert float(arg) > 0.0 and float(arg) <= 20.0 1057 assert float(arg) > 0.0 and float(arg) <= 20.0
...@@ -1089,6 +1097,8 @@ class Pipeline: ...@@ -1089,6 +1097,8 @@ class Pipeline:
1089 path_to_seq_data + "realigned", 1097 path_to_seq_data + "realigned",
1090 path_to_seq_data + "rfam_sequences"]) 1098 path_to_seq_data + "rfam_sequences"])
1091 self.REUSE_ALL = True 1099 self.REUSE_ALL = True
1100 +            elif opt == "--cmalign-opts":
1101 + self.ALIGNOPTS = arg
1092 elif opt == "--all": 1102 elif opt == "--all":
1093 self.REUSE_ALL = True 1103 self.REUSE_ALL = True
1094 self.USE_KNOWN_ISSUES = False 1104 self.USE_KNOWN_ISSUES = False
...@@ -1103,6 +1113,10 @@ class Pipeline: ...@@ -1103,6 +1113,10 @@ class Pipeline:
1103 ncores = min(ncores, int(arg)) 1113 ncores = min(ncores, int(arg))
1104 elif opt == "-f" or opt == "--full-inference": 1114 elif opt == "-f" or opt == "--full-inference":
1105 self.FULLINFERENCE = True 1115 self.FULLINFERENCE = True
1116 + elif opt=="--redundant":
1117 + self.REDUNDANT=True
1118 + elif opt=="--sina":
1119 + self.USESINA=True
1106 1120
1107 if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions": 1121 if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions":
1108 print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments") 1122 print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments")
...@@ -1151,7 +1165,8 @@ class Pipeline: ...@@ -1151,7 +1165,8 @@ class Pipeline:
1151 work_infer_mappings, 1165 work_infer_mappings,
1152 not self.REUSE_ALL, 1166 not self.REUSE_ALL,
1153 allmappings, 1167 allmappings,
1154 - self.FULLINFERENCE 1168 + self.FULLINFERENCE,
1169 + self.REDUNDANT
1155 ), 1170 ),
1156 full_structures_list, 1171 full_structures_list,
1157 chunksize=1)): 1172 chunksize=1)):
...@@ -1340,7 +1355,7 @@ class Pipeline: ...@@ -1340,7 +1355,7 @@ class Pipeline:
1340 joblist = [] 1355 joblist = []
1341 for f in self.fam_list: 1356 for f in self.fam_list:
1342 joblist.append(Job(function=work_prepare_sequences, how_many_in_parallel=ncores, args=[ 1357 joblist.append(Job(function=work_prepare_sequences, how_many_in_parallel=ncores, args=[
1343 - self.dl, f, rfam_acc_to_download[f]])) 1358 + self.dl, self.USESINA, f, rfam_acc_to_download[f]]))
1344 try: 1359 try:
1345 execute_joblist(joblist) 1360 execute_joblist(joblist)
1346 1361
...@@ -1365,7 +1380,7 @@ class Pipeline: ...@@ -1365,7 +1380,7 @@ class Pipeline:
1365 joblist = [] 1380 joblist = []
1366 for f in self.fam_list: 1381 for f in self.fam_list:
1367 # the function already uses all CPUs so launch them one by one (how_many_in_parallel=1) 1382 # the function already uses all CPUs so launch them one by one (how_many_in_parallel=1)
1368 - joblist.append(Job(function=work_realign, args=[f], how_many_in_parallel=1, label=f)) 1383 + joblist.append(Job(function=work_realign, args=[self.USESINA, self.ALIGNOPTS, f], how_many_in_parallel=1, label=f))
1369 1384
1370 # Execute the jobs 1385 # Execute the jobs
1371 try: 1386 try:
...@@ -1753,6 +1768,10 @@ def sql_define_tables(conn): ...@@ -1753,6 +1768,10 @@ def sql_define_tables(conn):
1753 freq_G REAL, 1768 freq_G REAL,
1754 freq_U REAL, 1769 freq_U REAL,
1755 freq_other REAL, 1770 freq_other REAL,
1771 + fields_A REAL,
1772 + fields_C REAL,
1773 + fields_G REAL,
1774 + fields_U REAL,
1756 gap_percent REAL, 1775 gap_percent REAL,
1757 consensus CHAR(1), 1776 consensus CHAR(1),
1758 cons_sec_struct CHAR(1), 1777 cons_sec_struct CHAR(1),
...@@ -1937,7 +1956,7 @@ def execute_joblist(fulljoblist): ...@@ -1937,7 +1956,7 @@ def execute_joblist(fulljoblist):
1937 return results 1956 return results
1938 1957
1939 @trace_unhandled_exceptions 1958 @trace_unhandled_exceptions
1940 -def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> list: 1959 +def work_infer_mappings(update_only, allmappings, fullinference,redundant, codelist) -> list:
1941 """Given a list of PDB chains corresponding to an equivalence class from BGSU's NR list, 1960 """Given a list of PDB chains corresponding to an equivalence class from BGSU's NR list,
1942 build a list of Chain() objects mapped to Rfam families, by expanding available mappings 1961 build a list of Chain() objects mapped to Rfam families, by expanding available mappings
1943 of any element of the list to all the list elements. 1962 of any element of the list to all the list elements.
...@@ -1950,8 +1969,8 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li ...@@ -1950,8 +1969,8 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li
1950 1969
1951 # Split the comma-separated list of chain codes into chain codes: 1970 # Split the comma-separated list of chain codes into chain codes:
1952 eq_class = codelist[0] 1971 eq_class = codelist[0]
1953 - codes = codelist[1].replace('+', ',').split(',') 1972 + codes = codelist[2].replace('+', ',').split(',')
1954 - 1973 + representative=codelist[1].replace('+', ',').split(',')[0]
1955 # Search for mappings that apply to an element of this PDB chains list: 1974 # Search for mappings that apply to an element of this PDB chains list:
1956 for c in codes: 1975 for c in codes:
1957 # search for Rfam mappings with this chain c: 1976 # search for Rfam mappings with this chain c:
...@@ -2040,6 +2059,13 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li ...@@ -2040,6 +2059,13 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li
2040 2059
2041 # Now build Chain() objects for the mapped chains 2060 # Now build Chain() objects for the mapped chains
2042 for c in codes: 2061 for c in codes:
2062 +
2063 + if not redundant and c!=representative:
2064 + '''
2065 + by default save only the representative member
2066 + if redundant is passed then save all the chains of the class members
2067 + '''
2068 + continue
2043 nr = c.split('|') 2069 nr = c.split('|')
2044 pdb_id = nr[0].lower() 2070 pdb_id = nr[0].lower()
2045 pdb_model = int(nr[1]) 2071 pdb_model = int(nr[1])
...@@ -2202,13 +2228,12 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): ...@@ -2202,13 +2228,12 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True):
2202 return c 2228 return c
2203 2229
2204 @trace_unhandled_exceptions 2230 @trace_unhandled_exceptions
2205 -def work_prepare_sequences(dl, rfam_acc, chains): 2231 +def work_prepare_sequences(dl, useSina, rfam_acc, chains):
2206 """Prepares FASTA files of homologous sequences to realign with cmalign or SINA. 2232 """Prepares FASTA files of homologous sequences to realign with cmalign or SINA.
2207 """ 2233 """
2208 2234
2209 setproctitle("RNAnet.py work_prepare_sequences()") 2235 setproctitle("RNAnet.py work_prepare_sequences()")
2210 - 2236 + if useSina and rfam_acc in LSU_set | SSU_set:
2211 - if rfam_acc in LSU_set | SSU_set: # rRNA
2212 if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"): 2237 if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"):
2213 # Detect doublons and remove them 2238 # Detect doublons and remove them
2214 existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta") 2239 existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta")
...@@ -2290,109 +2315,122 @@ def work_prepare_sequences(dl, rfam_acc, chains): ...@@ -2290,109 +2315,122 @@ def work_prepare_sequences(dl, rfam_acc, chains):
2290 notify(status) 2315 notify(status)
2291 2316
2292 @trace_unhandled_exceptions 2317 @trace_unhandled_exceptions
2293 -def work_realign(rfam_acc): 2318 +def use_sina(rfam_acc):
2294 - """ Runs multiple sequence alignements by RNA family.
2295 -
2296 - It aligns the Rfam hits from a RNA family with the sequences from the list of chains.
2297 - Rfam covariance models are used with Infernal tools, except for rRNAs.
2298 - cmalign requires too much RAM for them, so we use SINA, a specifically designed tool for rRNAs.
2299 """ 2319 """
2300 - 2320 + When prompted by the user to use SINA the software will use SINA for rRNA SSU and LSU
2301 - setproctitle(f"RNAnet.py work_realign({rfam_acc})") 2321 + """
2302 - 2322 + if rfam_acc in ["RF00177", "RF01960"]:
2303 - if rfam_acc in LSU_set | SSU_set: 2323 + arbfile = "realigned/SSU.arb"
2304 - # Ribosomal subunits deserve a special treatment.
2305 - # They require too much RAM to be aligned with Infernal.
2306 - # Then we will use SINA instead.
2307 - if rfam_acc in ["RF00177", "RF01960"]:
2308 - arbfile = "realigned/SSU.arb"
2309 - else:
2310 - arbfile = "realigned/LSU.arb"
2311 -
2312 - # Run alignment
2313 - p = subprocess.run(["sina", "-i", path_to_seq_data + f"realigned/{rfam_acc}++.fa",
2314 - "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
2315 - "-r", path_to_seq_data + arbfile,
2316 - "--meta-fmt=csv"])
2317 else: 2324 else:
2318 - # Align using Infernal for most RNA families 2325 + arbfile = "realigned/LSU.arb"
2319 -
2320 - if os.path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"):
2321 - # Alignment exists. We just want to add new sequences into it.
2322 -
2323 - if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"):
2324 - # there are no new sequences to align...
2325 - return
2326 2326
2327 - existing_ali_path = path_to_seq_data + f"realigned/{rfam_acc}++.stk" 2327 + # Run alignment
2328 - new_ali_path = path_to_seq_data + f"realigned/{rfam_acc}_new.stk" 2328 + subprocess.run(["sina", "-i", path_to_seq_data + f"realigned/{rfam_acc}++.fa",
2329 + "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
2330 + "-r", path_to_seq_data + arbfile,
2331 + "--meta-fmt=csv"])
2329 2332
2330 - # Align the new sequences 2333 +@trace_unhandled_exceptions
2331 - with open(new_ali_path, 'w') as o: 2334 +def use_infernal(rfam_acc, alignopts):
2332 - p1 = subprocess.run(["cmalign", "--nonbanded", "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins", 2335 + """
2333 - "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv", 2336 + Infernal is our default alignment tool except if prompted by the user.
2334 - path_to_seq_data + f"realigned/{rfam_acc}.cm", 2337 + Cmalign will be used for all families except when the user prefers to align rRNA with SINA
2335 - path_to_seq_data + f"realigned/{rfam_acc}_new.fa"], 2338 + """
2336 - stdout=o, stderr=subprocess.PIPE) 2339 + if os.path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"):
2337 - notify("Aligned new sequences together") 2340 + # Alignment exists. We just want to add new sequences into it.
2338 -
2339 - # Detect doublons and remove them
2340 - existing_stk = AlignIO.read(existing_ali_path, "stockholm")
2341 - existing_ids = [r.id for r in existing_stk]
2342 - del existing_stk
2343 - new_stk = AlignIO.read(new_ali_path, "stockholm")
2344 - new_ids = [r.id for r in new_stk]
2345 - del new_stk
2346 - doublons = [i for i in existing_ids if i in new_ids]
2347 - del existing_ids, new_ids
2348 - if len(doublons):
2349 - warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version")
2350 - with open(path_to_seq_data + "realigned/toremove.txt", "w") as toremove:
2351 - toremove.write('\n'.join(doublons)+'\n')
2352 - p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path],
2353 - stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2354 - p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path],
2355 - stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2356 - os.remove(path_to_seq_data + "realigned/toremove.txt")
2357 -
2358 - # And we merge the two alignments
2359 - p2 = subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk",
2360 - "--rna", existing_ali_path, new_ali_path],
2361 - stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2362 - stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8')
2363 - subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path])
2364 - notify("Merged alignments into one")
2365 2341
2366 - # remove the partial files 2342 + if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"):
2367 - os.remove(new_ali_path) 2343 + # there are no new sequences to align...
2368 - os.remove(path_to_seq_data + f"realigned/{rfam_acc}_new.fa") 2344 + return
2369 2345
2370 - else: 2346 + existing_ali_path = path_to_seq_data + f"realigned/{rfam_acc}++.stk"
2371 - # Alignment does not exist yet. We need to compute it from scratch. 2347 + new_ali_path = path_to_seq_data + f"realigned/{rfam_acc}_new.stk"
2372 - print(f"\t> Aligning {rfam_acc} sequences together (cmalign) ...", end='', flush=True)
2373 2348
2374 - p = subprocess.run(["cmalign", "--nonbanded", '-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk", 2349 + # Align the new sequences
2375 - "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins", "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv", 2350 + with open(new_ali_path, 'w') as o:
2351 + p1 = subprocess.run(["cmalign", "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins",
2352 + "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv",
2376 path_to_seq_data + f"realigned/{rfam_acc}.cm", 2353 path_to_seq_data + f"realigned/{rfam_acc}.cm",
2377 - path_to_seq_data + f"realigned/{rfam_acc}++.fa"], 2354 + path_to_seq_data + f"realigned/{rfam_acc}_new.fa"],
2378 - stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) 2355 + stdout=o, stderr=subprocess.PIPE)
2379 - stderr = p.stderr.decode("utf-8") 2356 + notify("Aligned new sequences together")
2380 - 2357 +
2381 - if len(stderr): 2358 + # Detect doublons and remove them
2382 - print('', flush=True) 2359 + existing_stk = AlignIO.read(existing_ali_path, "stockholm")
2383 - warn(f"Error during sequence alignment: {stderr}", error=True) 2360 + existing_ids = [r.id for r in existing_stk]
2384 - with open(runDir + "/errors.txt", "a") as er: 2361 + del existing_stk
2385 - er.write(f"Attempting to realign {rfam_acc}:\n" + stderr + '\n') 2362 + new_stk = AlignIO.read(new_ali_path, "stockholm")
2386 - return 1 2363 + new_ids = [r.id for r in new_stk]
2387 - else: 2364 + del new_stk
2388 - print('\t'+validsymb, flush=True) 2365 + doublons = [i for i in existing_ids if i in new_ids]
2366 + del existing_ids, new_ids
2367 + if len(doublons):
2368 + warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version")
2369 + with open(path_to_seq_data + "realigned/toremove.txt", "w") as toremove:
2370 + toremove.write('\n'.join(doublons)+'\n')
2371 + p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path],
2372 + stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2373 + p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path],
2374 + stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2375 + os.remove(path_to_seq_data + "realigned/toremove.txt")
2376 +
2377 + # And we merge the two alignments
2378 + p2 = subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk",
2379 + "--rna", existing_ali_path, new_ali_path],
2380 + stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2381 + stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8')
2382 + subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path])
2383 + notify("Merged alignments into one")
2384 +
2385 + # remove the partial files
2386 + os.remove(new_ali_path)
2387 + os.remove(path_to_seq_data + f"realigned/{rfam_acc}_new.fa")
2388 + else:
2389 + # Alignment does not exist yet. We need to compute it from scratch.
2390 + print(f"\t> Aligning {rfam_acc} sequences together (cmalign) ...", end='', flush=True)
2391 +
2392 + cmd = ["cmalign"]
2393 + if alignopts is not None:
2394 +        cmd += alignopts.split()
2395 + cmd += ['-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk",
2396 + "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins",
2397 + "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv",
2398 + path_to_seq_data + f"realigned/{rfam_acc}.cm",
2399 + path_to_seq_data + f"realigned/{rfam_acc}++.fa"]
2400 +
2401 + p = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2402 + stderr = p.stderr.decode("utf-8")
2403 +
2404 + if len(stderr):
2405 + print('', flush=True)
2406 + warn(f"Error during sequence alignment: {stderr}", error=True)
2407 + with open(runDir + "/errors.txt", "a") as er:
2408 + er.write(f"Attempting to realign {rfam_acc}:\n" + stderr + '\n')
2409 + return 1
2410 + else:
2411 + print('\t'+validsymb, flush=True)
2389 2412
2390 - # Convert Stockholm to aligned FASTA 2413 + # Convert Stockholm to aligned FASTA
2391 - subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa", 2414 + subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
2392 "--informat", "stockholm", 2415 "--informat", "stockholm",
2393 "afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"]) 2416 "afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"])
2394 - subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part. 2417 + subprocess.run(["rm", "-f", "esltmp*"]) # We can use a joker here, because we are not running in parallel for this part.
2418 +
2419 +@trace_unhandled_exceptions
2420 +def work_realign(useSina, alignopts, rfam_acc):
2421 + """ Runs multiple sequence alignements by RNA family.
2395 2422
2423 + It aligns the Rfam hits from a RNA family with the sequences from the list of chains.
2424 + Rfam covariance models are used with Infernal tools or SINA based on the options provided by the user.
2425 + Even if the user prefers to use SINA it will be used only for rRNA and cmalign for other families
2426 + """
2427 +
2428 + setproctitle(f"RNAnet.py work_realign({rfam_acc})")
2429 + if useSina and rfam_acc in LSU_set | SSU_set:
2430 + use_sina(rfam_acc)
2431 + else:
2432 + use_infernal(rfam_acc, alignopts)
2433 +
2396 # Assert everything worked, or save an error 2434 # Assert everything worked, or save an error
2397 with open(path_to_seq_data + f"realigned/{rfam_acc}++.afa", 'r') as output: 2435 with open(path_to_seq_data + f"realigned/{rfam_acc}++.afa", 'r') as output:
2398 if not len(output.readline()): 2436 if not len(output.readline()):
...@@ -2402,6 +2440,73 @@ def work_realign(rfam_acc): ...@@ -2402,6 +2440,73 @@ def work_realign(rfam_acc):
2402 er.write(f"Failed to realign {rfam_acc} (killed)") 2440 er.write(f"Failed to realign {rfam_acc} (killed)")
2403 2441
2404 @trace_unhandled_exceptions 2442 @trace_unhandled_exceptions
2443 +def work_pydca(f, columns_to_save):
2444 + """
2445 + This function writes an alignment file containing only the columns which will be saved to the database,
2446 + converted to uppercase, and without non-ACGU nucleotides.
2447 + This file in then used by pydca to compute DCA features, and finally removed.
2448 + """
2449 +
2450 +    align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")
2451 + for s in align:
2452 + s.seq=s.seq.upper() # Convert to uppercase as needed for pydca
2453 + filtered_alignment = align[:, 1:1] # all the lines, but no columns
2454 + for p in columns_to_save:
2455 + filtered_alignment += align[:, p-1:p] # save columns one by one
2456 +
2457 + # Replace all other letters by a deletion gap just for the
2458 + # aim to use pydca as sites other than ACGU . and - are not accepted
2459 + for s in filtered_alignment:
2460 + for i in range(len(s.seq)):
2461 + if s.seq[i].upper() not in "ACGU-.":
2462 + s.seq[i]='-'
2463 +
2464 + # Create a fasta file to be used by pydca
2465 + with open(path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa", "w") as only_3d:
2466 + try:
2467 + only_3d.write(format(filtered_alignment, "fasta"))
2468 + except ValueError as e:
2469 + warn(e)
2470 +
2471 + # PyDCA instance with options,
2472 + # Here lamda_J is set by pydca to 0.2*(L-1) where L is the length of the sequences
2473 + # The maximum number of iterations is set to 500 for gradient descent
2474 + # Lamda_h is set to 1 and seqid is set to 0.8 as suggested by pydca papers
2475 + # Reference:
2476 + # Zerihun MB, Pucci F, Peter EK, Schug A. pydca v1. 0: a comprehensive software for Direct Coupling Analysis of RNA and Protein Sequences. Bioinformatics.
2477 + # 2020;36(7):2264–2265. 10.1093/bioinformatics/btz892 - DOI - https://pubmed.ncbi.nlm.nih.gov/31778142/
2478 + plmdca_inst = plmdca.PlmDCA(path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa",
2479 + "rna", seqid = 0.8, lambda_h = 1.0, num_threads = 10, max_iterations = 500)
2480 + number_of_sites=len(columns_to_save)*(len(columns_to_save)-1)//2 # L*(L-1)/2 where L=len(columns_to_save)
2481 +
2482 + # Tuple of two list of tuples
2483 + # - the first list contains the fields of sites (nucleotides)
2484 + # - the second contains pairwise fields (2 nucleotides)
2485 + # linear distance is zero in order to keep all possible pairs
2486 + # because if linear dist=x>0 the pydca will return position |i-j|>x
2487 + # which will force us to lose a lot of pairs
2488 + params = plmdca_inst.compute_params(linear_dist=0, num_site_pairs=number_of_sites)
2489 +
2490 + # Fröbenius norm with average product correction
2491 + fn_apc = plmdca_inst.compute_sorted_FN_APC()
2492 +
2493 + # Save to file
2494 + np.savez(path_to_seq_data+f"/realigned/{f}_pydca.npz", PARAMS=params, FNAPC=fn_apc)
2495 +
2496 + # A dictionary to be used in the function where the frequencies are stored in align_column table
2497 + return_dict_fields={}
2498 + for list_fields in params[0]:
2499 + # The element at 0 is the index
2500 + # So taking the value from column to save at that index will give us
2501 + # the fields to be stored at align_column in the table
2502 + return_dict_fields[columns_to_save[list_fields[0]]] = list_fields[1]
2503 +
2504 + # Cleanup
2505 + subprocess.run(["rm", "-f", path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa"])
2506 +
2507 + return return_dict_fields
2508 +
2509 +@trace_unhandled_exceptions
2405 def work_pssm_remap(f): 2510 def work_pssm_remap(f):
2406 """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family. 2511 """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family.
2407 This also remaps the 3D object sequence with the aligned sequence in the MSA. 2512 This also remaps the 3D object sequence with the aligned sequence in the MSA.
...@@ -2477,7 +2582,6 @@ def work_pssm_remap(f): ...@@ -2477,7 +2582,6 @@ def work_pssm_remap(f):
2477 2582
2478 # At this point, pssm_info is a numpy array containing the PSSM and consensus a list of consensus chars. 2583 # At this point, pssm_info is a numpy array containing the PSSM and consensus a list of consensus chars.
2479 2584
2480 -
2481 ########################################################################################## 2585 ##########################################################################################
2482 # Remap sequences of the 3D chains with sequences in the alignment 2586 # Remap sequences of the 3D chains with sequences in the alignment
2483 ########################################################################################## 2587 ##########################################################################################
...@@ -2560,11 +2664,29 @@ def work_pssm_remap(f): ...@@ -2560,11 +2664,29 @@ def work_pssm_remap(f):
2560 # Get a sorted list from the set 2664 # Get a sorted list from the set
2561 columns = sorted(columns_to_save) 2665 columns = sorted(columns_to_save)
2562 2666
2667 + # Save the re_mappings
2668 + conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0)
2669 + conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
2670 + sql_execute(conn, """INSERT INTO re_mapping (chain_id, index_chain, index_ali)
2671 + VALUES (?, ?, ?)
2672 + ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;""",
2673 + many=True, data=re_mappings)
2674 +
2675 +    # Delete from the database the alignment columns that are no longer used
2676 + current_family_columns = [ x[0] for x in sql_ask_database(conn, f"SELECT index_ali FROM align_column WHERE rfam_acc = '{f}';")]
2677 + unused = []
2678 + for col in current_family_columns:
2679 + if col not in columns_to_save:
2680 + unused.append((f, col))
2681 + sql_execute(conn, """DELETE FROM align_column WHERE rfam_acc = ? AND index_ali = ?;""", many=True, data=unused)
2682 + conn.commit()
2683 +
2684 +
2563 ########################################################################################## 2685 ##########################################################################################
2564 - # Save the alignment columns and their mappings to the database 2686 + # Retrieve or compute metadata relative to the MSA columns
2565 ########################################################################################## 2687 ##########################################################################################
2566 2688
2567 - setproctitle(f"RNAnet.py work_pssm_remap({f}) saving") 2689 + setproctitle(f"RNAnet.py work_pssm_remap({f}) insert/match states")
2568 2690
2569 # Get back the information of match/insertion states from the STK file 2691 # Get back the information of match/insertion states from the STK file
2570 alignstk = AlignIO.read(path_to_seq_data + "realigned/" + f + "++.stk", "stockholm") 2692 alignstk = AlignIO.read(path_to_seq_data + "realigned/" + f + "++.stk", "stockholm")
...@@ -2590,32 +2712,22 @@ def work_pssm_remap(f): ...@@ -2590,32 +2712,22 @@ def work_pssm_remap(f):
2590 cm_2d.append(".") 2712 cm_2d.append(".")
2591 cm_coord += 1 2713 cm_coord += 1
2592 2714
2593 - # Save the re_mappings 2715 + setproctitle(f"RNAnet.py work_pssm_remap({f}) Potts model, DCA")
2594 - conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0)
2595 - conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
2596 - sql_execute(conn, """INSERT INTO re_mapping (chain_id, index_chain, index_ali)
2597 - VALUES (?, ?, ?)
2598 - ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;""",
2599 - many=True, data=re_mappings)
2600 2716
2601 - # Delete alignment columns that are not used anymore from the database 2717 + rfam_fields_record = work_pydca(f, columns)
2602 - current_family_columns = [ x[0] for x in sql_ask_database(conn, f"SELECT index_ali FROM align_column WHERE rfam_acc = '{f}';")]
2603 - unused = []
2604 - for col in current_family_columns:
2605 - if col not in columns_to_save:
2606 - unused.append((f, col))
2607 - sql_execute(conn, """DELETE FROM align_column WHERE rfam_acc = ? AND index_ali = ?;""", many=True, data=unused)
2608 - conn.commit()
2609 2718
2610 - # Save the useful columns in the database 2719 + data = [(f, j, cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + tuple(rfam_fields_record[j]) + (consensus[j-1], cm_2d[j-1]) for j in sorted(columns_to_save)]
2611 - data = [(f, j, cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + (consensus[j-1], cm_2d[j-1]) for j in sorted(columns_to_save)] 2720 + sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, fields_A, fields_C, fields_G, fields_U, gap_percent, consensus, cons_sec_struct)
2612 - sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct) 2721 + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO
2613 - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO
2614 UPDATE SET cm_coord=excluded.cm_coord, freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, 2722 UPDATE SET cm_coord=excluded.cm_coord, freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U,
2615 - freq_other=excluded.freq_other, gap_percent=excluded.gap_percent, consensus=excluded.consensus, cons_sec_struct=excluded.cons_sec_struct;""", many=True, data=data) 2723 + freq_other=excluded.freq_other, fields_A=excluded.fields_A, fields_C=excluded.fields_C, fields_G=excluded.fields_G, fields_U=excluded.fields_U,
2724 + gap_percent=excluded.gap_percent, consensus=excluded.consensus, cons_sec_struct=excluded.cons_sec_struct;""", many=True, data=data)
2616 # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment) 2725 # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment)
2617 - sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct) 2726 + sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other,
2618 - VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', "NULL");""", data=(f,)) 2727 + fields_A, fields_C, fields_G, fields_U, gap_percent, consensus, cons_sec_struct)
2728 + VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', NULL);""", data=(f,))
2729 +
2730 +
2619 # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains) 2731 # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains)
2620 sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f)) 2732 sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f))
2621 conn.close() 2733 conn.close()
...@@ -2659,9 +2771,10 @@ def work_save(c, homology=True): ...@@ -2659,9 +2771,10 @@ def work_save(c, homology=True):
2659 if homology: 2771 if homology:
2660 df = pd.read_sql_query(f""" 2772 df = pd.read_sql_query(f"""
2661 SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, cm_coord, 2773 SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, cm_coord,
2662 - is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct, dbn, 2774 + is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, fields_A, fields_C, fields_G,
2663 - paired, nb_interact, pair_type_LW, pair_type_DSSR, alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, 2775 + fields_U, gap_percent, consensus, cons_sec_struct, dbn, paired, nb_interact, pair_type_LW, pair_type_DSSR,
2664 - chi, bb_type, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base, 2776 + alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, chi, bb_type, glyco_bond, form, ssZp, Dp,
2777 + eta, theta, eta_prime, theta_prime, eta_base, theta_base,
2665 v0, v1, v2, v3, v4, amplitude, phase_angle, puckering FROM 2778 v0, v1, v2, v3, v4, amplitude, phase_angle, puckering FROM
2666 (SELECT chain_id, rfam_acc from chain WHERE chain_id = {c.db_chain_id}) 2779 (SELECT chain_id, rfam_acc from chain WHERE chain_id = {c.db_chain_id})
2667 NATURAL JOIN re_mapping 2780 NATURAL JOIN re_mapping
......
...@@ -952,8 +952,8 @@ def par_distance_matrix(filelist, f, label, consider_all_atoms, s): ...@@ -952,8 +952,8 @@ def par_distance_matrix(filelist, f, label, consider_all_atoms, s):
952 else: 952 else:
953 d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j]) 953 d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j])
954 954
955 - if f not in LSU_set and f not in SSU_set: 955 + # if f not in LSU_set and f not in SSU_set:
956 - np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f") 956 + np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f")
957 return 1-np.isnan(d).astype(int), np.nan_to_num(d), np.nan_to_num(d*d) 957 return 1-np.isnan(d).astype(int), np.nan_to_num(d), np.nan_to_num(d*d)
958 958
959 @trace_unhandled_exceptions 959 @trace_unhandled_exceptions
...@@ -1169,7 +1169,7 @@ if __name__ == "__main__": ...@@ -1169,7 +1169,7 @@ if __name__ == "__main__":
1169 1169
1170 sys.exit() 1170 sys.exit()
1171 elif opt == '--version': 1171 elif opt == '--version':
1172 - print("RNANet statistics 1.3 beta") 1172 + print("RNANet statistics 1.4 beta")
1173 sys.exit() 1173 sys.exit()
1174 elif opt == "-r" or opt == "--resolution": 1174 elif opt == "-r" or opt == "--resolution":
1175 assert float(arg) > 0.0 and float(arg) <= 20.0 1175 assert float(arg) > 0.0 and float(arg) <= 20.0
......