Moved to version 1.4 beta by making SINA a command-line option and adding pydca for feature calculation
Showing
2 changed files
with
217 additions
and
105 deletions
... | @@ -40,7 +40,7 @@ from Bio.SeqIO.FastaIO import FastaIterator, SimpleFastaParser | ... | @@ -40,7 +40,7 @@ from Bio.SeqIO.FastaIO import FastaIterator, SimpleFastaParser |
40 | from Bio.Seq import MutableSeq | 40 | from Bio.Seq import MutableSeq |
41 | from Bio.SeqRecord import SeqRecord | 41 | from Bio.SeqRecord import SeqRecord |
42 | from Bio.Align import MultipleSeqAlignment | 42 | from Bio.Align import MultipleSeqAlignment |
43 | - | 43 | +from pydca.plmdca import plmdca |
44 | runDir = os.getcwd() | 44 | runDir = os.getcwd() |
45 | 45 | ||
46 | def trace_unhandled_exceptions(func): | 46 | def trace_unhandled_exceptions(func): |
... | @@ -76,11 +76,11 @@ python_executable = "python"+".".join(platform.python_version().split('.')[:2]) | ... | @@ -76,11 +76,11 @@ python_executable = "python"+".".join(platform.python_version().split('.')[:2]) |
76 | validsymb = '\U00002705' | 76 | validsymb = '\U00002705' |
77 | warnsymb = '\U000026A0' | 77 | warnsymb = '\U000026A0' |
78 | errsymb = '\U0000274C' | 78 | errsymb = '\U0000274C' |
79 | - | ||
80 | LSU_set = {"RF00002", "RF02540", "RF02541", | 79 | LSU_set = {"RF00002", "RF02540", "RF02541", |
81 | "RF02543", "RF02546"} # From Rfam CLAN 00112 | 80 | "RF02543", "RF02546"} # From Rfam CLAN 00112 |
82 | SSU_set = {"RF00177", "RF02542", "RF02545", | 81 | SSU_set = {"RF00177", "RF02542", "RF02545", |
83 | "RF01959", "RF01960"} # From Rfam CLAN 00111 | 82 | "RF01959", "RF01960"} # From Rfam CLAN 00111 |
83 | + | ||
84 | no_nts_set = set() | 84 | no_nts_set = set() |
85 | weird_mappings = set() | 85 | weird_mappings = set() |
86 | 86 | ||
... | @@ -960,6 +960,9 @@ class Pipeline: | ... | @@ -960,6 +960,9 @@ class Pipeline: |
960 | 960 | ||
961 | # Default options: | 961 | # Default options: |
962 | self.CRYSTAL_RES = 4.0 | 962 | self.CRYSTAL_RES = 4.0 |
963 | + self.MXSIZE=48000 | ||
964 | + self.TAU=1e-17 | ||
965 | + self.NONBANDED=False | ||
963 | self.KEEP_HETATM = False | 966 | self.KEEP_HETATM = False |
964 | self.HOMOLOGY = True | 967 | self.HOMOLOGY = True |
965 | self.USE_KNOWN_ISSUES = True | 968 | self.USE_KNOWN_ISSUES = True |
... | @@ -967,6 +970,7 @@ class Pipeline: | ... | @@ -967,6 +970,7 @@ class Pipeline: |
967 | self.EXTRACT_CHAINS = False | 970 | self.EXTRACT_CHAINS = False |
968 | self.REUSE_ALL = False | 971 | self.REUSE_ALL = False |
969 | self.REDUNDANT = False | 972 | self.REDUNDANT = False |
973 | + self.USESINA=False | ||
970 | self.SELECT_ONLY = None | 974 | self.SELECT_ONLY = None |
971 | self.ARCHIVE = False | 975 | self.ARCHIVE = False |
972 | self.SAVELOGS = True | 976 | self.SAVELOGS = True |
... | @@ -982,7 +986,7 @@ class Pipeline: | ... | @@ -982,7 +986,7 @@ class Pipeline: |
982 | setproctitle("RNANet.py process_options()") | 986 | setproctitle("RNANet.py process_options()") |
983 | 987 | ||
984 | try: | 988 | try: |
985 | - opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=", "seq-folder=", "keep-hetatm=", "only=", "maxcores=", | 989 | + opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=","mxsize=","seq-folder=", "keep-hetatm=","tau=","only=", "maxcores=","sina", |
986 | "from-scratch", "full-inference", "no-homology","redundant", "ignore-issues", "extract", | 990 | "from-scratch", "full-inference", "no-homology","redundant", "ignore-issues", "extract", |
987 | "all", "no-logs", "archive", "update-homologous", "version"]) | 991 | "all", "no-logs", "archive", "update-homologous", "version"]) |
988 | except getopt.GetoptError as err: | 992 | except getopt.GetoptError as err: |
... | @@ -1024,9 +1028,13 @@ class Pipeline: | ... | @@ -1024,9 +1028,13 @@ class Pipeline: |
1024 | print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files. Subfolders will be:" | 1028 | print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files. Subfolders will be:" |
1025 | "\n\t\t\t\t\trfam_sequences/fasta/\tCompressed hits to Rfam families" | 1029 | "\n\t\t\t\t\trfam_sequences/fasta/\tCompressed hits to Rfam families" |
1026 | "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family") | 1030 | "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family") |
1031 | + print("--sina\t\t\tForce the RNANet to align large subunit LSU and small subunit SSU ribosomal RNA using sina instead of infernal," | ||
1032 | + "\n\t\t\t\t\t the other RNA families will be aligned using infernal.") | ||
1027 | print("--maxcores=…\t\t\tLimit the number of cores to use in parallel portions to reduce the simultaneous" | 1033 | print("--maxcores=…\t\t\tLimit the number of cores to use in parallel portions to reduce the simultaneous" |
1028 | "\n\t\t\t\t need of RAM. Should be a number between 1 and your number of CPUs. Note that portions" | 1034 | "\n\t\t\t\t need of RAM. Should be a number between 1 and your number of CPUs. Note that portions" |
1029 | "\n\t\t\t\t of the pipeline already limit themselves to 50% or 70% of that number by default.") | 1035 | "\n\t\t\t\t of the pipeline already limit themselves to 50% or 70% of that number by default.") |
1036 | + print("--tau=…\t\t\tThe tail loss probability used during HMM band calculation.") | ||
1037 | + print("--mxsize=…\t\t\tThe maximum allowable total DP matrix size.") | ||
1030 | print("--archive\t\t\tCreate tar.gz archives of the datapoints text files and the alignments," | 1038 | print("--archive\t\t\tCreate tar.gz archives of the datapoints text files and the alignments," |
1031 | "\n\t\t\t\t and update the link to the latest archive. ") | 1039 | "\n\t\t\t\t and update the link to the latest archive. ") |
1032 | print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications") | 1040 | print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications") |
... | @@ -1044,7 +1052,7 @@ class Pipeline: | ... | @@ -1044,7 +1052,7 @@ class Pipeline: |
1044 | print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --no-logs' &") | 1052 | print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --no-logs' &") |
1045 | sys.exit() | 1053 | sys.exit() |
1046 | elif opt == '--version': | 1054 | elif opt == '--version': |
1047 | - print("RNANet v1.3 beta, parallelized, Dockerized") | 1055 | + print("RNANet v1.4 beta, parallelized, Dockerized") |
1048 | print("Last revision : Jan 2021") | 1056 | print("Last revision : Jan 2021") |
1049 | sys.exit() | 1057 | sys.exit() |
1050 | elif opt == "-r" or opt == "--resolution": | 1058 | elif opt == "-r" or opt == "--resolution": |
... | @@ -1091,6 +1099,11 @@ class Pipeline: | ... | @@ -1091,6 +1099,11 @@ class Pipeline: |
1091 | path_to_seq_data + "realigned", | 1099 | path_to_seq_data + "realigned", |
1092 | path_to_seq_data + "rfam_sequences"]) | 1100 | path_to_seq_data + "rfam_sequences"]) |
1093 | self.REUSE_ALL = True | 1101 | self.REUSE_ALL = True |
1102 | + elif opt=="--tau": | ||
1103 | + self.TAU=float(arg) | ||
1104 | + elif opt=="--mxsize": | ||
1105 | + self.MXSIZE=int(arg) | ||
1106 | + self.NONBANDED=True | ||
1094 | elif opt == "--all": | 1107 | elif opt == "--all": |
1095 | self.REUSE_ALL = True | 1108 | self.REUSE_ALL = True |
1096 | self.USE_KNOWN_ISSUES = False | 1109 | self.USE_KNOWN_ISSUES = False |
... | @@ -1107,6 +1120,8 @@ class Pipeline: | ... | @@ -1107,6 +1120,8 @@ class Pipeline: |
1107 | self.FULLINFERENCE = True | 1120 | self.FULLINFERENCE = True |
1108 | elif opt=="--redundant": | 1121 | elif opt=="--redundant": |
1109 | self.REDUNDANT=True | 1122 | self.REDUNDANT=True |
1123 | + elif opt=="--sina": | ||
1124 | + self.USESINA=True | ||
1110 | 1125 | ||
1111 | if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions": | 1126 | if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions": |
1112 | print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments") | 1127 | print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments") |
... | @@ -1731,6 +1746,10 @@ def sql_define_tables(conn): | ... | @@ -1731,6 +1746,10 @@ def sql_define_tables(conn): |
1731 | freq_G REAL, | 1746 | freq_G REAL, |
1732 | freq_U REAL, | 1747 | freq_U REAL, |
1733 | freq_other REAL, | 1748 | freq_other REAL, |
1749 | + fields_A REAL, | ||
1750 | + fields_C REAL, | ||
1751 | + fields_G REAL, | ||
1752 | + fields_U REAL, | ||
1734 | gap_percent REAL, | 1753 | gap_percent REAL, |
1735 | consensus CHAR(1), | 1754 | consensus CHAR(1), |
1736 | PRIMARY KEY (rfam_acc, index_ali), | 1755 | PRIMARY KEY (rfam_acc, index_ali), |
... | @@ -2187,8 +2206,7 @@ def work_prepare_sequences(dl, rfam_acc, chains): | ... | @@ -2187,8 +2206,7 @@ def work_prepare_sequences(dl, rfam_acc, chains): |
2187 | """ | 2206 | """ |
2188 | 2207 | ||
2189 | setproctitle("RNAnet.py work_prepare_sequences()") | 2208 | setproctitle("RNAnet.py work_prepare_sequences()") |
2190 | - | 2209 | + if self.USESINA and rfam_acc in LSU_set | SSU_set:  # BUG(review): 'self' is undefined — work_prepare_sequences() is a module-level function, this line raises NameError at runtime |
2191 | - if rfam_acc in LSU_set | SSU_set: # rRNA | ||
2192 | if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"): | 2210 | if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"): |
2193 | # Detect doublons and remove them | 2211 | # Detect doublons and remove them |
2194 | existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta") | 2212 | existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta") |
... | @@ -2270,107 +2288,126 @@ def work_prepare_sequences(dl, rfam_acc, chains): | ... | @@ -2270,107 +2288,126 @@ def work_prepare_sequences(dl, rfam_acc, chains): |
2270 | notify(status) | 2288 | notify(status) |
2271 | 2289 | ||
2272 | @trace_unhandled_exceptions | 2290 | @trace_unhandled_exceptions |
2273 | -def work_realign(rfam_acc): | 2291 | +def use_sina(rfam_acc): |
2274 | - """ Runs multiple sequence alignements by RNA family. | ||
2275 | - | ||
2276 | - It aligns the Rfam hits from a RNA family with the sequences from the list of chains. | ||
2277 | - Rfam covariance models are used with Infernal tools, except for rRNAs. | ||
2278 | - cmalign requires too much RAM for them, so we use SINA, a specifically designed tool for rRNAs. | ||
2279 | """ | 2292 | """ |
2293 | + When prompted by the user to use SINA the software will use SINA for rRNA SSU and LSU | ||
2294 | + """ | ||
2295 | + if rfam_acc in ["RF00177", "RF01960"]: | ||
2296 | + arbfile = "realigned/SSU.arb" | ||
2297 | + else: | ||
2298 | + arbfile = "realigned/LSU.arb" | ||
2280 | 2299 | ||
2281 | - setproctitle(f"RNAnet.py work_realign({rfam_acc})") | 2300 | + # Run alignment |
2282 | - | 2301 | + p = subprocess.run(["sina", "-i", path_to_seq_data + f"realigned/{rfam_acc}++.fa", |
2283 | - if rfam_acc in LSU_set | SSU_set: | 2302 | + "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa", |
2284 | - # Ribosomal subunits deserve a special treatment. | 2303 | + "-r", path_to_seq_data + arbfile, |
2285 | - # They require too much RAM to be aligned with Infernal. | 2304 | + "--meta-fmt=csv"]) |
2286 | - # Then we will use SINA instead. | 2305 | +@trace_unhandled_exceptions |
2287 | - if rfam_acc in ["RF00177", "RF01960"]: | 2306 | +def use_infernal(rfam_acc): |
2288 | - arbfile = "realigned/SSU.arb" | 2307 | + """ |
2289 | - else: | 2308 | + Infernal is our default alignment tool except if prompted by the user. |
2290 | - arbfile = "realigned/LSU.arb" | 2309 | + Cmalign will be used for all families except when the user prefers to align rRNA with SINA |
2310 | + """ | ||
2311 | + if os.path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"): | ||
2312 | + # Alignment exists. We just want to add new sequences into it. | ||
2291 | 2313 | ||
2292 | - # Run alignment | 2314 | + if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"): |
2293 | - p = subprocess.run(["sina", "-i", path_to_seq_data + f"realigned/{rfam_acc}++.fa", | 2315 | + # there are no new sequences to align... |
2294 | - "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa", | 2316 | + return |
2295 | - "-r", path_to_seq_data + arbfile, | ||
2296 | - "--meta-fmt=csv"]) | ||
2297 | - else: | ||
2298 | - # Align using Infernal for most RNA families | ||
2299 | 2317 | ||
2300 | - if os.path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"): | 2318 | + existing_ali_path = path_to_seq_data + f"realigned/{rfam_acc}++.stk" |
2301 | - # Alignment exists. We just want to add new sequences into it. | 2319 | + new_ali_path = path_to_seq_data + f"realigned/{rfam_acc}_new.stk" |
2320 | + | ||
2321 | + # Align the new sequences | ||
2322 | + with open(new_ali_path, 'w') as o: | ||
2323 | + p1 = subprocess.run(["cmalign", path_to_seq_data + f"realigned/{rfam_acc}.cm", | ||
2324 | + path_to_seq_data + f"realigned/{rfam_acc}_new.fa"], | ||
2325 | + stdout=o, stderr=subprocess.PIPE) | ||
2326 | + notify("Aligned new sequences together") | ||
2327 | + | ||
2328 | + # Detect doublons and remove them | ||
2329 | + existing_stk = AlignIO.read(existing_ali_path, "stockholm") | ||
2330 | + existing_ids = [r.id for r in existing_stk] | ||
2331 | + del existing_stk | ||
2332 | + new_stk = AlignIO.read(new_ali_path, "stockholm") | ||
2333 | + new_ids = [r.id for r in new_stk] | ||
2334 | + del new_stk | ||
2335 | + doublons = [i for i in existing_ids if i in new_ids] | ||
2336 | + del existing_ids, new_ids | ||
2337 | + if len(doublons): | ||
2338 | + warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version") | ||
2339 | + with open(path_to_seq_data + "realigned/toremove.txt", "w") as toremove: | ||
2340 | + toremove.write('\n'.join(doublons)+'\n') | ||
2341 | + p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path], | ||
2342 | + stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | ||
2343 | + p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path], | ||
2344 | + stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | ||
2345 | + os.remove(path_to_seq_data + "realigned/toremove.txt") | ||
2302 | 2346 | ||
2303 | - if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"): | 2347 | + # And we merge the two alignments |
2304 | - # there are no new sequences to align... | 2348 | + p2 = subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", |
2305 | - return | 2349 | + "--rna", existing_ali_path, new_ali_path], |
2350 | + stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | ||
2351 | + stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8') | ||
2352 | + subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path]) | ||
2353 | + notify("Merged alignments into one") | ||
2306 | 2354 | ||
2307 | - existing_ali_path = path_to_seq_data + f"realigned/{rfam_acc}++.stk" | 2355 | + # remove the partial files |
2308 | - new_ali_path = path_to_seq_data + f"realigned/{rfam_acc}_new.stk" | 2356 | + os.remove(new_ali_path) |
2357 | + os.remove(path_to_seq_data + f"realigned/{rfam_acc}_new.fa") | ||
2309 | 2358 | ||
2310 | - # Align the new sequences | 2359 | + else: |
2311 | - with open(new_ali_path, 'w') as o: | 2360 | + # Alignment does not exist yet. We need to compute it from scratch. |
2312 | - p1 = subprocess.run(["cmalign", path_to_seq_data + f"realigned/{rfam_acc}.cm", | 2361 | + print(f"\t> Aligning {rfam_acc} sequences together (cmalign) ...", end='', flush=True) |
2313 | - path_to_seq_data + f"realigned/{rfam_acc}_new.fa"], | 2362 | + |
2314 | - stdout=o, stderr=subprocess.PIPE) | 2363 | + #Here the idea is to run one of the two proposed commands either cmalign --tau <tau val> or cmalign with mxsize |
2315 | - notify("Aligned new sequences together") | 2364 | + |
2365 | + if not self.NONBANDED:  # NOTE(review): 'self' is undefined in use_infernal() — NameError; pass these options in or read module-level config | ||
2366 | + p = subprocess.run(["cmalign", "--tau",f"{self.TAU}", | ||
2367 | + '-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk", | ||
2368 | + path_to_seq_data + f"realigned/{rfam_acc}.cm", | ||
2369 | + path_to_seq_data + f"realigned/{rfam_acc}++.fa"], | ||
2370 | + stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | ||
2371 | + else: | ||
2372 | + p = subprocess.run(["cmalign", "--nonbanded","--noprob","--mxsize",f"{self.MXSIZE}",  # FIX(review): comma was missing after f"{self.MXSIZE}" — implicit string concatenation with '-o' on the next line passed a bogus "48000-o" argument to cmalign | ||
2373 | + '-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk", | ||
2374 | + path_to_seq_data + f"realigned/{rfam_acc}.cm", | ||
2375 | + path_to_seq_data + f"realigned/{rfam_acc}++.fa"], | ||
2376 | + stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | ||
2377 | + stderr = p.stderr.decode("utf-8") | ||
2378 | + | ||
2379 | + if len(stderr): | ||
2380 | + print('', flush=True) | ||
2381 | + warn(f"Error during sequence alignment: {stderr}", error=True) | ||
2382 | + with open(runDir + "/errors.txt", "a") as er: | ||
2383 | + er.write(f"Attempting to realign {rfam_acc}:\n" + stderr + '\n') | ||
2384 | + return 1 | ||
2385 | + else: | ||
2386 | + print('\t'+validsymb, flush=True) | ||
2316 | 2387 | ||
2317 | - # Detect doublons and remove them | 2388 | +@trace_unhandled_exceptions |
2318 | - existing_stk = AlignIO.read(existing_ali_path, "stockholm") | 2389 | +def work_realign(rfam_acc): |
2319 | - existing_ids = [r.id for r in existing_stk] | 2390 | + """ Runs multiple sequence alignements by RNA family. |
2320 | - del existing_stk | ||
2321 | - new_stk = AlignIO.read(new_ali_path, "stockholm") | ||
2322 | - new_ids = [r.id for r in new_stk] | ||
2323 | - del new_stk | ||
2324 | - doublons = [i for i in existing_ids if i in new_ids] | ||
2325 | - del existing_ids, new_ids | ||
2326 | - if len(doublons): | ||
2327 | - warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version") | ||
2328 | - with open(path_to_seq_data + "realigned/toremove.txt", "w") as toremove: | ||
2329 | - toremove.write('\n'.join(doublons)+'\n') | ||
2330 | - p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path], | ||
2331 | - stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | ||
2332 | - p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path], | ||
2333 | - stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | ||
2334 | - os.remove(path_to_seq_data + "realigned/toremove.txt") | ||
2335 | - | ||
2336 | - # And we merge the two alignments | ||
2337 | - p2 = subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", | ||
2338 | - "--rna", existing_ali_path, new_ali_path], | ||
2339 | - stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | ||
2340 | - stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8') | ||
2341 | - subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path]) | ||
2342 | - notify("Merged alignments into one") | ||
2343 | 2391 | ||
2344 | - # remove the partial files | 2392 | + It aligns the Rfam hits from a RNA family with the sequences from the list of chains. |
2345 | - os.remove(new_ali_path) | 2393 | + Rfam covariance models are used with Infernal tools or SINA based on the options provided by the user. |
2346 | - os.remove(path_to_seq_data + f"realigned/{rfam_acc}_new.fa") | 2394 | + Even if the user prefers to use SINA it will be used only for rRNA and cmalign for other families |
2395 | + """ | ||
2347 | 2396 | ||
2397 | + setproctitle(f"RNAnet.py work_realign({rfam_acc})") | ||
2398 | + if self.USESINA:  # BUG(review): 'self' is undefined — work_realign() is a module-level function, this raises NameError | ||
2399 | + if rfam_acc in LSU_set | SSU_set: | ||
2400 | + use_sina(rfam_acc) | ||
2348 | else: | 2401 | else: |
2349 | - # Alignment does not exist yet. We need to compute it from scratch. | 2402 | + use_infernal(rfam_acc) |
2350 | - print(f"\t> Aligning {rfam_acc} sequences together (cmalign) ...", end='', flush=True) | 2403 | + else: |
2351 | - | 2404 | + use_infernal(rfam_acc) |
2352 | - p = subprocess.run(["cmalign", "--small", "--cyk", "--noprob", "--nonbanded", "--notrunc", | 2405 | + # Convert Stockholm to aligned FASTA — NOTE(review): this now runs after the SINA branch too, where no ++.stk exists and SINA already wrote ++.afa directly; restrict the esl-reformat step to the infernal path |
2353 | - '-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk", | 2406 | + subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa", |
2354 | - path_to_seq_data + f"realigned/{rfam_acc}.cm", | ||
2355 | - path_to_seq_data + f"realigned/{rfam_acc}++.fa"], | ||
2356 | - stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | ||
2357 | - stderr = p.stderr.decode("utf-8") | ||
2358 | - | ||
2359 | - if len(stderr): | ||
2360 | - print('', flush=True) | ||
2361 | - warn(f"Error during sequence alignment: {stderr}", error=True) | ||
2362 | - with open(runDir + "/errors.txt", "a") as er: | ||
2363 | - er.write(f"Attempting to realign {rfam_acc}:\n" + stderr + '\n') | ||
2364 | - return 1 | ||
2365 | - else: | ||
2366 | - print('\t'+validsymb, flush=True) | ||
2367 | - | ||
2368 | - # Convert Stockholm to aligned FASTA | ||
2369 | - subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa", | ||
2370 | "--informat", "stockholm", | 2407 | "--informat", "stockholm", |
2371 | "afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"]) | 2408 | "afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"]) |
2372 | - subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part. | 2409 | + subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part. |
2373 | - | 2410 | + |
2374 | # Assert everything worked, or save an error | 2411 | # Assert everything worked, or save an error |
2375 | with open(path_to_seq_data + f"realigned/{rfam_acc}++.afa", 'r') as output: | 2412 | with open(path_to_seq_data + f"realigned/{rfam_acc}++.afa", 'r') as output: |
2376 | if not len(output.readline()): | 2413 | if not len(output.readline()): |
... | @@ -2380,6 +2417,81 @@ def work_realign(rfam_acc): | ... | @@ -2380,6 +2417,81 @@ def work_realign(rfam_acc): |
2380 | er.write(f"Failed to realign {rfam_acc} (killed)") | 2417 | er.write(f"Failed to realign {rfam_acc} (killed)") |
2381 | 2418 | ||
2382 | @trace_unhandled_exceptions | 2419 | @trace_unhandled_exceptions |
2420 | +def compute_from_pydca(f,columns_to_save): | ||
2421 | + | ||
2422 | + align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")  # FIX(review): bare read() is undefined here; the rest of the file uses AlignIO.read, which also requires the format argument | ||
2423 | + #convert to uppercase as needed for pydca | ||
2424 | + for s in align: | ||
2425 | + s.seq=s.seq.upper() | ||
2426 | + filtered_alignment = align[:, 1:1] # all the lines, but no columns | ||
2427 | + for p in columns_to_save: | ||
2428 | + filtered_alignment += align[:, p-1:p] # save columns one by one | ||
2429 | + | ||
2430 | + #replace all other letters by a gap consensus just for the | ||
2431 | + #aim to use pydca as sites other than ACGU . and - are not accepted | ||
2432 | + | ||
2433 | + | ||
2434 | + for s in filtered_alignment: | ||
2435 | + for i in range(len(s.seq)): | ||
2436 | + if s.seq[i].upper() not in "ACGU-.": | ||
2437 | + s.seq[i]='-' | ||
2438 | + #create a fasta file to be used by pydca | ||
2439 | + with open(path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa", "w") as only_3d: | ||
2440 | + try: | ||
2441 | + only_3d.write(format(filtered_alignment, "fasta")) | ||
2442 | + except ValueError as e: | ||
2443 | + warn(e) | ||
2444 | + | ||
2445 | + #pydca instance | ||
2446 | + #here lamda_J is set by pydca to 0.2*(L-1) where L is the length of the sequence | ||
2447 | + #the maximum number of iterations is set to 500 for gradient descent | ||
2448 | + #lamda_h is set to 1 and seqid is set to 0.8 as suggested by pydca papers | ||
2449 | + | ||
2450 | + #Reference: | ||
2451 | + #Zerihun MB, Pucci F, Peter EK, Schug A. pydca v1. 0: a comprehensive software for Direct Coupling Analysis of RNA and Protein Sequences. Bioinformatics. | ||
2452 | + #2020;36(7):2264–2265. 10.1093/bioinformatics/btz892 - DOI - https://pubmed.ncbi.nlm.nih.gov/31778142/ | ||
2453 | + | ||
2454 | + | ||
2455 | + plmdca_inst = plmdca.PlmDCA( | ||
2456 | + path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa", | ||
2457 | + "rna", | ||
2458 | + seqid = 0.8, | ||
2459 | + lambda_h = 1.0, | ||
2460 | + num_threads = 10, | ||
2461 | + max_iterations = 500,) | ||
2462 | + #number of sites=L*(L-1)/2 so L=len(columns_to_Save) | ||
2463 | + number_of_sites=len(columns_to_save)*(len(columns_to_save)-1)//2 | ||
2464 | + | ||
2465 | + #a tuple of two list of tuples | ||
2466 | + #the first list contains the fields of sites (nucleotides) | ||
2467 | + #the second contains | ||
2468 | + #linear distance is zero in order to keep all possible pairs | ||
2469 | + | ||
2470 | + #because if linear dist=x>0 the pydca will return position |i-j|>x | ||
2471 | + #which will force us to lose a lot of pairs | ||
2472 | + | ||
2473 | + | ||
2474 | + params=plmdca_inst.compute_params(linear_dist=0,num_site_pairs=number_of_sites) | ||
2475 | + #frobenius norm with average product correction | ||
2476 | + fn_apc=plmdca_inst.compute_sorted_FN_APC() | ||
2477 | + | ||
2478 | + family_dca_data={"PARAMS":params,"FNAPC":fn_apc} | ||
2479 | + np.savez(path_to_seq_data+f"/realigned/{f}_pydca.npz", **family_dca_data)  # FIX(review): the dict built on the previous line was never passed, so the .npz was written empty | ||
2480 | + | ||
2481 | + #a dictionary to be used in the function where the frequencies are stored in align column table | ||
2482 | + return_dict_fields={} | ||
2483 | + for list_fields in params[0]: | ||
2484 | + | ||
2485 | + #The element at 0 is the index | ||
2486 | + #So taking the value from column to save at that index will give us | ||
2487 | + #the fields to be stored at ali_col in the table | ||
2488 | + | ||
2489 | + return_dict_fields[columns_to_save[list_fields[0]]]=list_fields[1] | ||
2490 | + subprocess.run(["rm", "-f", path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa"]) | ||
2491 | + | ||
2492 | + return return_dict_fields | ||
2493 | + | ||
2494 | +@trace_unhandled_exceptions | ||
2383 | def work_pssm_remap(f): | 2495 | def work_pssm_remap(f): |
2384 | """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family. | 2496 | """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family. |
2385 | This also remaps the 3D object sequence with the aligned sequence in the MSA. | 2497 | This also remaps the 3D object sequence with the aligned sequence in the MSA. |
... | @@ -2560,16 +2672,16 @@ def work_pssm_remap(f): | ... | @@ -2560,16 +2672,16 @@ def work_pssm_remap(f): |
2560 | unused.append((f, col)) | 2672 | unused.append((f, col)) |
2561 | sql_execute(conn, """DELETE FROM align_column WHERE rfam_acc = ? AND index_ali = ?;""", many=True, data=unused) | 2673 | sql_execute(conn, """DELETE FROM align_column WHERE rfam_acc = ? AND index_ali = ?;""", many=True, data=unused) |
2562 | conn.commit() | 2674 | conn.commit() |
2563 | - | 2675 | + rfam_fields_record=compute_from_pydca(f,sorted(columns_to_save)) |
2564 | # Save the useful columns in the database | 2676 | # Save the useful columns in the database |
2565 | - data = [(f, j) + tuple(pssm_info[:,j-1]) + (consensus[j-1],) for j in sorted(columns_to_save)] | 2677 | + data = [(f, j) + tuple(pssm_info[:,j-1]) +tuple(rfam_fields_record[j]) + (consensus[j-1],) for j in sorted(columns_to_save)]  # NOTE(review): if pssm_info rows include gap_percent, this tuple order (…, gap_percent, fields_*) no longer matches the INSERT column order (…, fields_*, gap_percent) — verify |
2566 | - sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus) | 2678 | + sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U,freq_other,fields_A,fields_C,fields_G,fields_U , gap_percent, consensus) |
2567 | - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO | 2679 | + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?,?,?,?,?) ON CONFLICT(rfam_acc, index_ali) DO |
2568 | UPDATE SET freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, | 2680 | UPDATE SET freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, |
2569 | - freq_other=excluded.freq_other, gap_percent=excluded.gap_percent, consensus=excluded.consensus;""", many=True, data=data) | 2681 | + freq_other=excluded.freq_other,fields_A=excluded.fields_A,fields_C=excluded.fields_C,fields_G=excluded.fields_G,fields_U=excluded.fields_U,gap_percent=excluded.gap_percent, consensus=excluded.consensus;""", many=True, data=data) |
2570 | # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment) | 2682 | # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment) |
2571 | - sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus) | 2683 | + sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other,fields_A,fields_C,fields_G,fields_U, gap_percent, consensus) |
2572 | - VALUES (?, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-');""", data=(f,)) | 2684 | + VALUES (?, 0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0,0.0,1.0,'-');""", data=(f,)) |
2573 | # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains) | 2685 | # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains) |
2574 | sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f)) | 2686 | sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f)) |
2575 | conn.close() | 2687 | conn.close() |
... | @@ -2613,7 +2725,7 @@ def work_save(c, homology=True): | ... | @@ -2613,7 +2725,7 @@ def work_save(c, homology=True): |
2613 | if homology: | 2725 | if homology: |
2614 | df = pd.read_sql_query(f""" | 2726 | df = pd.read_sql_query(f""" |
2615 | SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, | 2727 | SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, |
2616 | - is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, dbn, | 2728 | + is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other,fields_A,fields_C,fields_G,fields_U,gap_percent, consensus, dbn, |
2617 | paired, nb_interact, pair_type_LW, pair_type_DSSR, alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, | 2729 | paired, nb_interact, pair_type_LW, pair_type_DSSR, alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, |
2618 | chi, bb_type, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base, | 2730 | chi, bb_type, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base, |
2619 | v0, v1, v2, v3, v4, amplitude, phase_angle, puckering FROM | 2731 | v0, v1, v2, v3, v4, amplitude, phase_angle, puckering FROM | ... | ... |
... | @@ -952,8 +952,8 @@ def par_distance_matrix(filelist, f, label, consider_all_atoms, s): | ... | @@ -952,8 +952,8 @@ def par_distance_matrix(filelist, f, label, consider_all_atoms, s): |
952 | else: | 952 | else: |
953 | d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j]) | 953 | d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j]) |
954 | 954 | ||
955 | - if f not in LSU_set and f not in SSU_set: | 955 | + # if f not in LSU_set and f not in SSU_set: |
956 | - np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f") | 956 | + np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f") |
957 | return 1-np.isnan(d).astype(int), np.nan_to_num(d), np.nan_to_num(d*d) | 957 | return 1-np.isnan(d).astype(int), np.nan_to_num(d), np.nan_to_num(d*d) |
958 | 958 | ||
959 | @trace_unhandled_exceptions | 959 | @trace_unhandled_exceptions |
... | @@ -1173,7 +1173,7 @@ if __name__ == "__main__": | ... | @@ -1173,7 +1173,7 @@ if __name__ == "__main__": |
1173 | 1173 | ||
1174 | sys.exit() | 1174 | sys.exit() |
1175 | elif opt == '--version': | 1175 | elif opt == '--version': |
1176 | - print("RNANet statistics 1.3 beta") | 1176 | + print("RNANet statistics 1.4 beta") |
1177 | sys.exit() | 1177 | sys.exit() |
1178 | elif opt == "-r" or opt == "--resolution": | 1178 | elif opt == "-r" or opt == "--resolution": |
1179 | assert float(arg) > 0.0 and float(arg) <= 20.0 | 1179 | assert float(arg) > 0.0 and float(arg) <= 20.0 | ... | ... |
-
Please register or login to post a comment