Khodor HANNOUSH

Moved to version beta 1.4 by making sina as a command line and adding pydca for feature calculation

...@@ -40,7 +40,7 @@ from Bio.SeqIO.FastaIO import FastaIterator, SimpleFastaParser ...@@ -40,7 +40,7 @@ from Bio.SeqIO.FastaIO import FastaIterator, SimpleFastaParser
40 from Bio.Seq import MutableSeq 40 from Bio.Seq import MutableSeq
41 from Bio.SeqRecord import SeqRecord 41 from Bio.SeqRecord import SeqRecord
42 from Bio.Align import MultipleSeqAlignment 42 from Bio.Align import MultipleSeqAlignment
43 - 43 +from pydca.plmdca import plmdca
44 runDir = os.getcwd() 44 runDir = os.getcwd()
45 45
46 def trace_unhandled_exceptions(func): 46 def trace_unhandled_exceptions(func):
...@@ -76,11 +76,11 @@ python_executable = "python"+".".join(platform.python_version().split('.')[:2]) ...@@ -76,11 +76,11 @@ python_executable = "python"+".".join(platform.python_version().split('.')[:2])
76 validsymb = '\U00002705' 76 validsymb = '\U00002705'
77 warnsymb = '\U000026A0' 77 warnsymb = '\U000026A0'
78 errsymb = '\U0000274C' 78 errsymb = '\U0000274C'
79 -
80 LSU_set = {"RF00002", "RF02540", "RF02541", 79 LSU_set = {"RF00002", "RF02540", "RF02541",
81 "RF02543", "RF02546"} # From Rfam CLAN 00112 80 "RF02543", "RF02546"} # From Rfam CLAN 00112
82 SSU_set = {"RF00177", "RF02542", "RF02545", 81 SSU_set = {"RF00177", "RF02542", "RF02545",
83 "RF01959", "RF01960"} # From Rfam CLAN 00111 82 "RF01959", "RF01960"} # From Rfam CLAN 00111
83 +
84 no_nts_set = set() 84 no_nts_set = set()
85 weird_mappings = set() 85 weird_mappings = set()
86 86
...@@ -960,6 +960,9 @@ class Pipeline: ...@@ -960,6 +960,9 @@ class Pipeline:
960 960
961 # Default options: 961 # Default options:
962 self.CRYSTAL_RES = 4.0 962 self.CRYSTAL_RES = 4.0
963 + self.MXSIZE=48000
964 + self.TAU=1e-17
965 + self.NONBANDED=False
963 self.KEEP_HETATM = False 966 self.KEEP_HETATM = False
964 self.HOMOLOGY = True 967 self.HOMOLOGY = True
965 self.USE_KNOWN_ISSUES = True 968 self.USE_KNOWN_ISSUES = True
...@@ -967,6 +970,7 @@ class Pipeline: ...@@ -967,6 +970,7 @@ class Pipeline:
967 self.EXTRACT_CHAINS = False 970 self.EXTRACT_CHAINS = False
968 self.REUSE_ALL = False 971 self.REUSE_ALL = False
969 self.REDUNDANT = False 972 self.REDUNDANT = False
973 + self.USESINA=False
970 self.SELECT_ONLY = None 974 self.SELECT_ONLY = None
971 self.ARCHIVE = False 975 self.ARCHIVE = False
972 self.SAVELOGS = True 976 self.SAVELOGS = True
...@@ -982,7 +986,7 @@ class Pipeline: ...@@ -982,7 +986,7 @@ class Pipeline:
982 setproctitle("RNANet.py process_options()") 986 setproctitle("RNANet.py process_options()")
983 987
984 try: 988 try:
985 - opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=", "seq-folder=", "keep-hetatm=", "only=", "maxcores=", 989 + opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=","mxsize=","seq-folder=", "keep-hetatm=","tau=","only=", "maxcores=","sina",
986 "from-scratch", "full-inference", "no-homology","redundant", "ignore-issues", "extract", 990 "from-scratch", "full-inference", "no-homology","redundant", "ignore-issues", "extract",
987 "all", "no-logs", "archive", "update-homologous", "version"]) 991 "all", "no-logs", "archive", "update-homologous", "version"])
988 except getopt.GetoptError as err: 992 except getopt.GetoptError as err:
...@@ -1024,9 +1028,13 @@ class Pipeline: ...@@ -1024,9 +1028,13 @@ class Pipeline:
1024 print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files. Subfolders will be:" 1028 print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files. Subfolders will be:"
1025 "\n\t\t\t\t\trfam_sequences/fasta/\tCompressed hits to Rfam families" 1029 "\n\t\t\t\t\trfam_sequences/fasta/\tCompressed hits to Rfam families"
1026 "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family") 1030 "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family")
1031 + print("--sina\t\t\tForce the RNANet to align large subunit LSU and small subunit SSU ribosomal RNA using sina instead of infernal,"
1032 + "\n\t\t\t\t\t the other RNA families will be aligned using infernal.")
1027 print("--maxcores=…\t\t\tLimit the number of cores to use in parallel portions to reduce the simultaneous" 1033 print("--maxcores=…\t\t\tLimit the number of cores to use in parallel portions to reduce the simultaneous"
1028 "\n\t\t\t\t need of RAM. Should be a number between 1 and your number of CPUs. Note that portions" 1034 "\n\t\t\t\t need of RAM. Should be a number between 1 and your number of CPUs. Note that portions"
1029 "\n\t\t\t\t of the pipeline already limit themselves to 50% or 70% of that number by default.") 1035 "\n\t\t\t\t of the pipeline already limit themselves to 50% or 70% of that number by default.")
1036 + print("--tau=…\t\t\tThe tail loss probability used during HMM band calculation.")
1037 + print("--mxsize=…\t\t\tThe maximum allowable total DP matrix size.")
1030 print("--archive\t\t\tCreate tar.gz archives of the datapoints text files and the alignments," 1038 print("--archive\t\t\tCreate tar.gz archives of the datapoints text files and the alignments,"
1031 "\n\t\t\t\t and update the link to the latest archive. ") 1039 "\n\t\t\t\t and update the link to the latest archive. ")
1032 print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications") 1040 print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications")
...@@ -1044,7 +1052,7 @@ class Pipeline: ...@@ -1044,7 +1052,7 @@ class Pipeline:
1044 print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --no-logs' &") 1052 print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --no-logs' &")
1045 sys.exit() 1053 sys.exit()
1046 elif opt == '--version': 1054 elif opt == '--version':
1047 - print("RNANet v1.3 beta, parallelized, Dockerized") 1055 + print("RNANet v1.4 beta, parallelized, Dockerized")
1048 print("Last revision : Jan 2021") 1056 print("Last revision : Jan 2021")
1049 sys.exit() 1057 sys.exit()
1050 elif opt == "-r" or opt == "--resolution": 1058 elif opt == "-r" or opt == "--resolution":
...@@ -1091,6 +1099,11 @@ class Pipeline: ...@@ -1091,6 +1099,11 @@ class Pipeline:
1091 path_to_seq_data + "realigned", 1099 path_to_seq_data + "realigned",
1092 path_to_seq_data + "rfam_sequences"]) 1100 path_to_seq_data + "rfam_sequences"])
1093 self.REUSE_ALL = True 1101 self.REUSE_ALL = True
1102 + elif opt=="--tau":
1103 + self.TAU=float(arg)
1104 + elif opt=="--mxsize":
1105 + self.MXSIZE=int(arg)
1106 + self.NONBANDED=True
1094 elif opt == "--all": 1107 elif opt == "--all":
1095 self.REUSE_ALL = True 1108 self.REUSE_ALL = True
1096 self.USE_KNOWN_ISSUES = False 1109 self.USE_KNOWN_ISSUES = False
...@@ -1107,6 +1120,8 @@ class Pipeline: ...@@ -1107,6 +1120,8 @@ class Pipeline:
1107 self.FULLINFERENCE = True 1120 self.FULLINFERENCE = True
1108 elif opt=="--redundant": 1121 elif opt=="--redundant":
1109 self.REDUNDANT=True 1122 self.REDUNDANT=True
1123 + elif opt=="--sina":
1124 + self.USESINA=True
1110 1125
1111 if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions": 1126 if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions":
1112 print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments") 1127 print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments")
...@@ -1731,6 +1746,10 @@ def sql_define_tables(conn): ...@@ -1731,6 +1746,10 @@ def sql_define_tables(conn):
1731 freq_G REAL, 1746 freq_G REAL,
1732 freq_U REAL, 1747 freq_U REAL,
1733 freq_other REAL, 1748 freq_other REAL,
1749 + fields_A REAL,
1750 + fields_C REAL,
1751 + fields_G REAL,
1752 + fields_U REAL,
1734 gap_percent REAL, 1753 gap_percent REAL,
1735 consensus CHAR(1), 1754 consensus CHAR(1),
1736 PRIMARY KEY (rfam_acc, index_ali), 1755 PRIMARY KEY (rfam_acc, index_ali),
...@@ -2187,8 +2206,7 @@ def work_prepare_sequences(dl, rfam_acc, chains): ...@@ -2187,8 +2206,7 @@ def work_prepare_sequences(dl, rfam_acc, chains):
2187 """ 2206 """
2188 2207
2189 setproctitle("RNAnet.py work_prepare_sequences()") 2208 setproctitle("RNAnet.py work_prepare_sequences()")
2190 - 2209 + if self.USESINA and rfam_acc in LSU_set | SSU_set:
2191 - if rfam_acc in LSU_set | SSU_set: # rRNA
2192 if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"): 2210 if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"):
2193 # Detect doublons and remove them 2211 # Detect doublons and remove them
2194 existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta") 2212 existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta")
...@@ -2270,107 +2288,126 @@ def work_prepare_sequences(dl, rfam_acc, chains): ...@@ -2270,107 +2288,126 @@ def work_prepare_sequences(dl, rfam_acc, chains):
2270 notify(status) 2288 notify(status)
2271 2289
2272 @trace_unhandled_exceptions 2290 @trace_unhandled_exceptions
2273 -def work_realign(rfam_acc): 2291 +def use_sina(rfam_acc):
2274 - """ Runs multiple sequence alignements by RNA family.
2275 -
2276 - It aligns the Rfam hits from a RNA family with the sequences from the list of chains.
2277 - Rfam covariance models are used with Infernal tools, except for rRNAs.
2278 - cmalign requires too much RAM for them, so we use SINA, a specifically designed tool for rRNAs.
2279 """ 2292 """
2293 + When prompted by the user to use SINA the software will use SINA for rRNA SSU and LSU
2294 + """
2295 + if rfam_acc in ["RF00177", "RF01960"]:
2296 + arbfile = "realigned/SSU.arb"
2297 + else:
2298 + arbfile = "realigned/LSU.arb"
2280 2299
2281 - setproctitle(f"RNAnet.py work_realign({rfam_acc})") 2300 + # Run alignment
2282 - 2301 + p = subprocess.run(["sina", "-i", path_to_seq_data + f"realigned/{rfam_acc}++.fa",
2283 - if rfam_acc in LSU_set | SSU_set: 2302 + "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
2284 - # Ribosomal subunits deserve a special treatment. 2303 + "-r", path_to_seq_data + arbfile,
2285 - # They require too much RAM to be aligned with Infernal. 2304 + "--meta-fmt=csv"])
2286 - # Then we will use SINA instead. 2305 +@trace_unhandled_exceptions
2287 - if rfam_acc in ["RF00177", "RF01960"]: 2306 +def use_infernal(rfam_acc):
2288 - arbfile = "realigned/SSU.arb" 2307 + """
2289 - else: 2308 + Infernal is our default alignment tool except if prompted by the user.
2290 - arbfile = "realigned/LSU.arb" 2309 + Cmalign will be used for all families except when the user prefers to align rRNA with SINA
2310 + """
2311 + if os.path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"):
2312 + # Alignment exists. We just want to add new sequences into it.
2291 2313
2292 - # Run alignment 2314 + if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"):
2293 - p = subprocess.run(["sina", "-i", path_to_seq_data + f"realigned/{rfam_acc}++.fa", 2315 + # there are no new sequences to align...
2294 - "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa", 2316 + return
2295 - "-r", path_to_seq_data + arbfile,
2296 - "--meta-fmt=csv"])
2297 - else:
2298 - # Align using Infernal for most RNA families
2299 2317
2300 - if os.path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"): 2318 + existing_ali_path = path_to_seq_data + f"realigned/{rfam_acc}++.stk"
2301 - # Alignment exists. We just want to add new sequences into it. 2319 + new_ali_path = path_to_seq_data + f"realigned/{rfam_acc}_new.stk"
2320 +
2321 + # Align the new sequences
2322 + with open(new_ali_path, 'w') as o:
2323 + p1 = subprocess.run(["cmalign", path_to_seq_data + f"realigned/{rfam_acc}.cm",
2324 + path_to_seq_data + f"realigned/{rfam_acc}_new.fa"],
2325 + stdout=o, stderr=subprocess.PIPE)
2326 + notify("Aligned new sequences together")
2327 +
2328 + # Detect doublons and remove them
2329 + existing_stk = AlignIO.read(existing_ali_path, "stockholm")
2330 + existing_ids = [r.id for r in existing_stk]
2331 + del existing_stk
2332 + new_stk = AlignIO.read(new_ali_path, "stockholm")
2333 + new_ids = [r.id for r in new_stk]
2334 + del new_stk
2335 + doublons = [i for i in existing_ids if i in new_ids]
2336 + del existing_ids, new_ids
2337 + if len(doublons):
2338 + warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version")
2339 + with open(path_to_seq_data + "realigned/toremove.txt", "w") as toremove:
2340 + toremove.write('\n'.join(doublons)+'\n')
2341 + p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path],
2342 + stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2343 + p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path],
2344 + stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2345 + os.remove(path_to_seq_data + "realigned/toremove.txt")
2302 2346
2303 - if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"): 2347 + # And we merge the two alignments
2304 - # there are no new sequences to align... 2348 + p2 = subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk",
2305 - return 2349 + "--rna", existing_ali_path, new_ali_path],
2350 + stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2351 + stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8')
2352 + subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path])
2353 + notify("Merged alignments into one")
2306 2354
2307 - existing_ali_path = path_to_seq_data + f"realigned/{rfam_acc}++.stk" 2355 + # remove the partial files
2308 - new_ali_path = path_to_seq_data + f"realigned/{rfam_acc}_new.stk" 2356 + os.remove(new_ali_path)
2357 + os.remove(path_to_seq_data + f"realigned/{rfam_acc}_new.fa")
2309 2358
2310 - # Align the new sequences 2359 + else:
2311 - with open(new_ali_path, 'w') as o: 2360 + # Alignment does not exist yet. We need to compute it from scratch.
2312 - p1 = subprocess.run(["cmalign", path_to_seq_data + f"realigned/{rfam_acc}.cm", 2361 + print(f"\t> Aligning {rfam_acc} sequences together (cmalign) ...", end='', flush=True)
2313 - path_to_seq_data + f"realigned/{rfam_acc}_new.fa"], 2362 +
2314 - stdout=o, stderr=subprocess.PIPE) 2363 + #Here the idea is to run one of the two proposed commands either cmalign --tau <tau val> or cmalign with mxsize
2315 - notify("Aligned new sequences together") 2364 +
2365 + if not self.NONBANDED:
2366 + p = subprocess.run(["cmalign", "--tau",f"{self.TAU}",
2367 + '-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk",
2368 + path_to_seq_data + f"realigned/{rfam_acc}.cm",
2369 + path_to_seq_data + f"realigned/{rfam_acc}++.fa"],
2370 + stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2371 + else:
2372 + p = subprocess.run(["cmalign", "--nonbanded","--noprob","--mxsize",f"{self.MXSIZE}"
2373 + '-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk",
2374 + path_to_seq_data + f"realigned/{rfam_acc}.cm",
2375 + path_to_seq_data + f"realigned/{rfam_acc}++.fa"],
2376 + stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2377 + stderr = p.stderr.decode("utf-8")
2378 +
2379 + if len(stderr):
2380 + print('', flush=True)
2381 + warn(f"Error during sequence alignment: {stderr}", error=True)
2382 + with open(runDir + "/errors.txt", "a") as er:
2383 + er.write(f"Attempting to realign {rfam_acc}:\n" + stderr + '\n')
2384 + return 1
2385 + else:
2386 + print('\t'+validsymb, flush=True)
2316 2387
2317 - # Detect doublons and remove them 2388 +@trace_unhandled_exceptions
2318 - existing_stk = AlignIO.read(existing_ali_path, "stockholm") 2389 +def work_realign(rfam_acc):
2319 - existing_ids = [r.id for r in existing_stk] 2390 + """ Runs multiple sequence alignements by RNA family.
2320 - del existing_stk
2321 - new_stk = AlignIO.read(new_ali_path, "stockholm")
2322 - new_ids = [r.id for r in new_stk]
2323 - del new_stk
2324 - doublons = [i for i in existing_ids if i in new_ids]
2325 - del existing_ids, new_ids
2326 - if len(doublons):
2327 - warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version")
2328 - with open(path_to_seq_data + "realigned/toremove.txt", "w") as toremove:
2329 - toremove.write('\n'.join(doublons)+'\n')
2330 - p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path],
2331 - stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2332 - p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path],
2333 - stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2334 - os.remove(path_to_seq_data + "realigned/toremove.txt")
2335 -
2336 - # And we merge the two alignments
2337 - p2 = subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk",
2338 - "--rna", existing_ali_path, new_ali_path],
2339 - stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2340 - stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8')
2341 - subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path])
2342 - notify("Merged alignments into one")
2343 2391
2344 - # remove the partial files 2392 + It aligns the Rfam hits from a RNA family with the sequences from the list of chains.
2345 - os.remove(new_ali_path) 2393 + Rfam covariance models are used with Infernal tools or SINA based on the options provided by the user.
2346 - os.remove(path_to_seq_data + f"realigned/{rfam_acc}_new.fa") 2394 + Even if the user prefers to use SINA it will be used only for rRNA and cmalign for other families
2395 + """
2347 2396
2397 + setproctitle(f"RNAnet.py work_realign({rfam_acc})")
2398 + if self.USESINA:
2399 + if rfam_acc in LSU_set | SSU_set:
2400 + use_sina(rfam_acc)
2348 else: 2401 else:
2349 - # Alignment does not exist yet. We need to compute it from scratch. 2402 + use_infernal(rfam_acc)
2350 - print(f"\t> Aligning {rfam_acc} sequences together (cmalign) ...", end='', flush=True) 2403 + else:
2351 - 2404 + use_infernal(rfam_acc)
2352 - p = subprocess.run(["cmalign", "--small", "--cyk", "--noprob", "--nonbanded", "--notrunc", 2405 + # Convert Stockholm to aligned FASTA
2353 - '-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk", 2406 + subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
2354 - path_to_seq_data + f"realigned/{rfam_acc}.cm",
2355 - path_to_seq_data + f"realigned/{rfam_acc}++.fa"],
2356 - stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2357 - stderr = p.stderr.decode("utf-8")
2358 -
2359 - if len(stderr):
2360 - print('', flush=True)
2361 - warn(f"Error during sequence alignment: {stderr}", error=True)
2362 - with open(runDir + "/errors.txt", "a") as er:
2363 - er.write(f"Attempting to realign {rfam_acc}:\n" + stderr + '\n')
2364 - return 1
2365 - else:
2366 - print('\t'+validsymb, flush=True)
2367 -
2368 - # Convert Stockholm to aligned FASTA
2369 - subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
2370 "--informat", "stockholm", 2407 "--informat", "stockholm",
2371 "afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"]) 2408 "afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"])
2372 - subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part. 2409 + subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part.
2373 - 2410 +
2374 # Assert everything worked, or save an error 2411 # Assert everything worked, or save an error
2375 with open(path_to_seq_data + f"realigned/{rfam_acc}++.afa", 'r') as output: 2412 with open(path_to_seq_data + f"realigned/{rfam_acc}++.afa", 'r') as output:
2376 if not len(output.readline()): 2413 if not len(output.readline()):
...@@ -2380,6 +2417,81 @@ def work_realign(rfam_acc): ...@@ -2380,6 +2417,81 @@ def work_realign(rfam_acc):
2380 er.write(f"Failed to realign {rfam_acc} (killed)") 2417 er.write(f"Failed to realign {rfam_acc} (killed)")
2381 2418
2382 @trace_unhandled_exceptions 2419 @trace_unhandled_exceptions
2420 +def compute_from_pydca(f,columns_to_save):
2421 +
2422 + align=read(path_to_seq_data + f"realigned/{f}++.afa")
2423 + #convert to uppercase as needed for pydca
2424 + for s in align:
2425 + s.seq=s.seq.upper()
2426 + filtered_alignment = align[:, 1:1] # all the lines, but no columns
2427 + for p in columns_to_save:
2428 + filtered_alignment += align[:, p-1:p] # save columns one by one
2429 +
2430 + #replace all other letters by a gap consensus just for the
2431 + #aim to use pydca as sites other than ACGU . and - are not accepted
2432 +
2433 +
2434 + for s in filtered_alignment:
2435 + for i in range(len(s.seq)):
2436 + if s.seq[i].upper() not in "ACGU-.":
2437 + s.seq[i]='-'
2438 + #create a fasta file to be used by pydca
2439 + with open(path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa", "w") as only_3d:
2440 + try:
2441 + only_3d.write(format(filtered_alignment, "fasta"))
2442 + except ValueError as e:
2443 + warn(e)
2444 +
2445 + #pydca instance
2446 + #here lamda_J is set by pydca to 0.2*(L-1) where L is the length of the sequence
2447 + #the maximum number of iterations is set to 500 for gradient descent
2448 + #lamda_h is set to 1 and seqid is set to 0.8 as suggested by pydca papers
2449 +
2450 + #Reference:
2451 + #Zerihun MB, Pucci F, Peter EK, Schug A. pydca v1. 0: a comprehensive software for Direct Coupling Analysis of RNA and Protein Sequences. Bioinformatics.
2452 + #2020;36(7):2264–2265. 10.1093/bioinformatics/btz892 - DOI - https://pubmed.ncbi.nlm.nih.gov/31778142/
2453 +
2454 +
2455 + plmdca_inst = plmdca.PlmDCA(
2456 + path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa",
2457 + "rna",
2458 + seqid = 0.8,
2459 + lambda_h = 1.0,
2460 + num_threads = 10,
2461 + max_iterations = 500,)
2462 + #number of sites=L*(L-1)/2 so L=len(columns_to_Save)
2463 + number_of_sites=len(columns_to_save)*(len(columns_to_save)-1)//2
2464 +
2465 + #a tuple of two list of tuples
2466 + #the first list contains the fields of sites (nucleotides)
2467 + #the second contains
2468 + #linear distance is zero in order to keep all possible pairs
2469 +
2470 + #because if linear dist=x>0 the pydca will return position |i-j|>x
2471 + #which will force us to lose a lot of pairs
2472 +
2473 +
2474 + params=plmdca_inst.compute_params(linear_dist=0,num_site_pairs=number_of_sites)
2475 + #frobenius norm with average product correction
2476 + fn_apc=plmdca_inst.compute_sorted_FN_APC()
2477 +
2478 + family_dca_data={"PARAMS":params,"FNAPC":fn_apc}
2479 + np.savez(path_to_seq_data+f"/realigned/{f}_pydca.npz")
2480 +
2481 + #a dictionary to be used in the function where the frequencies are stored in align column table
2482 + return_dict_fields={}
2483 + for list_fields in params[0]:
2484 +
2485 + #The element at 0 is the index
2486 + #So taking the value from column to save at that index will give us
2487 + #the fields to be stored at ali_col in the table
2488 +
2489 + return_dict_fields[columns_to_save[list_fields[0]]]=list_fields[1]
2490 + subprocess.run(["rm", "-f", path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa"])
2491 +
2492 + return return_dict_fields
2493 +
2494 +@trace_unhandled_exceptions
2383 def work_pssm_remap(f): 2495 def work_pssm_remap(f):
2384 """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family. 2496 """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family.
2385 This also remaps the 3D object sequence with the aligned sequence in the MSA. 2497 This also remaps the 3D object sequence with the aligned sequence in the MSA.
...@@ -2560,16 +2672,16 @@ def work_pssm_remap(f): ...@@ -2560,16 +2672,16 @@ def work_pssm_remap(f):
2560 unused.append((f, col)) 2672 unused.append((f, col))
2561 sql_execute(conn, """DELETE FROM align_column WHERE rfam_acc = ? AND index_ali = ?;""", many=True, data=unused) 2673 sql_execute(conn, """DELETE FROM align_column WHERE rfam_acc = ? AND index_ali = ?;""", many=True, data=unused)
2562 conn.commit() 2674 conn.commit()
2563 - 2675 + rfam_fields_record=compute_from_pydca(f,sorted(columns_to_save))
2564 # Save the useful columns in the database 2676 # Save the useful columns in the database
2565 - data = [(f, j) + tuple(pssm_info[:,j-1]) + (consensus[j-1],) for j in sorted(columns_to_save)] 2677 + data = [(f, j) + tuple(pssm_info[:,j-1]) +tuple(rfam_fields_record[j]) + (consensus[j-1],) for j in sorted(columns_to_save)]
2566 - sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus) 2678 + sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U,freq_other,fields_A,fields_C,fields_G,fields_U , gap_percent, consensus)
2567 - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO 2679 + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?,?,?,?,?) ON CONFLICT(rfam_acc, index_ali) DO
2568 UPDATE SET freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, 2680 UPDATE SET freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U,
2569 - freq_other=excluded.freq_other, gap_percent=excluded.gap_percent, consensus=excluded.consensus;""", many=True, data=data) 2681 + freq_other=excluded.freq_other,fields_A=excluded.fields_A,fields_C=excluded.fields_C,fields_G=excluded.fields_G,fields_U=excluded.fields_U,gap_percent=excluded.gap_percent, consensus=excluded.consensus;""", many=True, data=data)
2570 # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment) 2682 # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment)
2571 - sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus) 2683 + sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other,fields_A,fields_C,fields_G,fields_U, gap_percent, consensus)
2572 - VALUES (?, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-');""", data=(f,)) 2684 + VALUES (?, 0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0,0.0,1.0,'-');""", data=(f,))
2573 # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains) 2685 # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains)
2574 sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f)) 2686 sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f))
2575 conn.close() 2687 conn.close()
...@@ -2613,7 +2725,7 @@ def work_save(c, homology=True): ...@@ -2613,7 +2725,7 @@ def work_save(c, homology=True):
2613 if homology: 2725 if homology:
2614 df = pd.read_sql_query(f""" 2726 df = pd.read_sql_query(f"""
2615 SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, 2727 SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code,
2616 - is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, dbn, 2728 + is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other,fields_A,fields_C,fields_G,fields_U,gap_percent, consensus, dbn,
2617 paired, nb_interact, pair_type_LW, pair_type_DSSR, alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, 2729 paired, nb_interact, pair_type_LW, pair_type_DSSR, alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta,
2618 chi, bb_type, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base, 2730 chi, bb_type, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base,
2619 v0, v1, v2, v3, v4, amplitude, phase_angle, puckering FROM 2731 v0, v1, v2, v3, v4, amplitude, phase_angle, puckering FROM
......
...@@ -952,8 +952,8 @@ def par_distance_matrix(filelist, f, label, consider_all_atoms, s): ...@@ -952,8 +952,8 @@ def par_distance_matrix(filelist, f, label, consider_all_atoms, s):
952 else: 952 else:
953 d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j]) 953 d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j])
954 954
955 - if f not in LSU_set and f not in SSU_set: 955 + # if f not in LSU_set and f not in SSU_set:
956 - np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f") 956 + np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f")
957 return 1-np.isnan(d).astype(int), np.nan_to_num(d), np.nan_to_num(d*d) 957 return 1-np.isnan(d).astype(int), np.nan_to_num(d), np.nan_to_num(d*d)
958 958
959 @trace_unhandled_exceptions 959 @trace_unhandled_exceptions
...@@ -1173,7 +1173,7 @@ if __name__ == "__main__": ...@@ -1173,7 +1173,7 @@ if __name__ == "__main__":
1173 1173
1174 sys.exit() 1174 sys.exit()
1175 elif opt == '--version': 1175 elif opt == '--version':
1176 - print("RNANet statistics 1.3 beta") 1176 + print("RNANet statistics 1.4 beta")
1177 sys.exit() 1177 sys.exit()
1178 elif opt == "-r" or opt == "--resolution": 1178 elif opt == "-r" or opt == "--resolution":
1179 assert float(arg) > 0.0 and float(arg) <= 20.0 1179 assert float(arg) > 0.0 and float(arg) <= 20.0
......