Showing
7 changed files
with
96 additions
and
75 deletions
... | @@ -71,11 +71,6 @@ sqlite3.enable_callback_tracebacks(True) | ... | @@ -71,11 +71,6 @@ sqlite3.enable_callback_tracebacks(True) |
71 | sqlite3.register_adapter(np.int64, lambda val: int(val)) # Tell Sqlite what to do with <class numpy.int64> objects ---> convert to int | 71 | sqlite3.register_adapter(np.int64, lambda val: int(val)) # Tell Sqlite what to do with <class numpy.int64> objects ---> convert to int |
72 | sqlite3.register_adapter(np.float64, lambda val: float(val)) # Tell Sqlite what to do with <class numpy.float64> objects ---> convert to float | 72 | sqlite3.register_adapter(np.float64, lambda val: float(val)) # Tell Sqlite what to do with <class numpy.float64> objects ---> convert to float |
73 | 73 | ||
74 | -# m = Manager() | ||
75 | -# running_stats = m.list() | ||
76 | -# running_stats.append(0) # n_launched | ||
77 | -# running_stats.append(0) # n_finished | ||
78 | -# running_stats.append(0) # n_skipped | ||
79 | n_launched = Value('i', 0) | 74 | n_launched = Value('i', 0) |
80 | n_finished = Value('i', 0) | 75 | n_finished = Value('i', 0) |
81 | n_skipped = Value('i', 0) | 76 | n_skipped = Value('i', 0) |
... | @@ -635,12 +630,24 @@ class Chain: | ... | @@ -635,12 +630,24 @@ class Chain: |
635 | if nt2 in res_ids: | 630 | if nt2 in res_ids: |
636 | interacts[nt2_idx] += 1 | 631 | interacts[nt2_idx] += 1 |
637 | if paired[nt2_idx] == "": | 632 | if paired[nt2_idx] == "": |
638 | - pair_type_LW[nt2_idx] = lw_pair[0] + lw_pair[2] + lw_pair[1] | 633 | + if lw_pair != "--": |
639 | - pair_type_DSSR[nt2_idx] = dssr_pair[0] + dssr_pair[3] + dssr_pair[2] + dssr_pair[1] | 634 | + pair_type_LW[nt2_idx] = lw_pair[0] + lw_pair[2] + lw_pair[1] |
635 | + else: | ||
636 | + pair_type_LW[nt2_idx] = "--" | ||
637 | + if dssr_pair != "--": | ||
638 | + pair_type_DSSR[nt2_idx] = dssr_pair[0] + dssr_pair[3] + dssr_pair[2] + dssr_pair[1] | ||
639 | + else: | ||
640 | + pair_type_DSSR[nt2_idx] = "--" | ||
640 | paired[nt2_idx] = str(nt1_idx + 1) | 641 | paired[nt2_idx] = str(nt1_idx + 1) |
641 | else: | 642 | else: |
642 | - pair_type_LW[nt2_idx] += ',' + lw_pair[0] + lw_pair[2] + lw_pair[1] | 643 | + if lw_pair != "--": |
643 | - pair_type_DSSR[nt2_idx] += ',' + dssr_pair[0] + dssr_pair[3] + dssr_pair[2] + dssr_pair[1] | 644 | + pair_type_LW[nt2_idx] += ',' + lw_pair[0] + lw_pair[2] + lw_pair[1] |
645 | + else: | ||
646 | + pair_type_LW[nt2_idx] += ",--" | ||
647 | + if dssr_pair != "--": | ||
648 | + pair_type_DSSR[nt2_idx] += ',' + dssr_pair[0] + dssr_pair[3] + dssr_pair[2] + dssr_pair[1] | ||
649 | + else: | ||
650 | + pair_type_DSSR[nt2_idx] += ",--" | ||
644 | paired[nt2_idx] += ',' + str(nt1_idx + 1) | 651 | paired[nt2_idx] += ',' + str(nt1_idx + 1) |
645 | 652 | ||
646 | # transform nt_id to shorter values | 653 | # transform nt_id to shorter values |
... | @@ -1083,7 +1090,7 @@ class Pipeline: | ... | @@ -1083,7 +1090,7 @@ class Pipeline: |
1083 | self.REUSE_ALL = False | 1090 | self.REUSE_ALL = False |
1084 | self.REDUNDANT = False | 1091 | self.REDUNDANT = False |
1085 | self.ALIGNOPTS = None | 1092 | self.ALIGNOPTS = None |
1086 | - self.RRNAALIGNOPTS = "--mxsize 8192 --cpu 10 --maxtau 0.1" | 1093 | + self.RRNAALIGNOPTS = ["--mxsize", "8192", "--cpu", "10", "--maxtau", "0.1"] |
1087 | self.STATSOPTS = None | 1094 | self.STATSOPTS = None |
1088 | self.USESINA = False | 1095 | self.USESINA = False |
1089 | self.SELECT_ONLY = None | 1096 | self.SELECT_ONLY = None |
... | @@ -1151,6 +1158,7 @@ class Pipeline: | ... | @@ -1151,6 +1158,7 @@ class Pipeline: |
1151 | "\n\t\t\t\t need of RAM. Should be a number between 1 and your number of CPUs. Note that portions" | 1158 | "\n\t\t\t\t need of RAM. Should be a number between 1 and your number of CPUs. Note that portions" |
1152 | "\n\t\t\t\t of the pipeline already limit themselves to 50% or 70% of that number by default.") | 1159 | "\n\t\t\t\t of the pipeline already limit themselves to 50% or 70% of that number by default.") |
1153 | print("--cmalign-opts=…\t\tA string of additional options to pass to cmalign aligner, e.g. \"--nonbanded --mxsize 2048\"") | 1160 | print("--cmalign-opts=…\t\tA string of additional options to pass to cmalign aligner, e.g. \"--nonbanded --mxsize 2048\"") |
1161 | + print("--cmalign-rrna-opts=…\tLike cmalign-opts, but applied for rRNA (large families, memory-heavy jobs).") | ||
1154 | print("--archive\t\t\tCreate tar.gz archives of the datapoints text files and the alignments," | 1162 | print("--archive\t\t\tCreate tar.gz archives of the datapoints text files and the alignments," |
1155 | "\n\t\t\t\t and update the link to the latest archive. ") | 1163 | "\n\t\t\t\t and update the link to the latest archive. ") |
1156 | print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications.") | 1164 | print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications.") |
... | @@ -1219,7 +1227,7 @@ class Pipeline: | ... | @@ -1219,7 +1227,7 @@ class Pipeline: |
1219 | elif opt == "cmalign-opts": | 1227 | elif opt == "cmalign-opts": |
1220 | self.ALIGNOPTS = arg | 1228 | self.ALIGNOPTS = arg |
1221 | elif opt == "cmalign-rrna-opts": | 1229 | elif opt == "cmalign-rrna-opts": |
1222 | - self.RRNAALIGNOPTS = arg | 1230 | + self.RRNAALIGNOPTS = " ".split(arg) |
1223 | elif opt == "stats-opts": | 1231 | elif opt == "stats-opts": |
1224 | self.STATSOPTS = " ".split(arg) | 1232 | self.STATSOPTS = " ".split(arg) |
1225 | elif opt == "--all": | 1233 | elif opt == "--all": |
... | @@ -1436,8 +1444,9 @@ class Pipeline: | ... | @@ -1436,8 +1444,9 @@ class Pipeline: |
1436 | args=[c, self.EXTRACT_CHAINS, self.KEEP_HETATM, retry, self.SAVELOGS])) | 1444 | args=[c, self.EXTRACT_CHAINS, self.KEEP_HETATM, retry, self.SAVELOGS])) |
1437 | try: | 1445 | try: |
1438 | results = execute_joblist(joblist) | 1446 | results = execute_joblist(joblist) |
1439 | - except: | 1447 | + except Exception as e: |
1440 | - print("Exiting", flush=True) | 1448 | + warn(str(e), error=True) |
1449 | + print("Exiting", str(e), flush=True) | ||
1441 | exit(1) | 1450 | exit(1) |
1442 | 1451 | ||
1443 | # If there were newly discovered problems, add this chain to the known issues | 1452 | # If there were newly discovered problems, add this chain to the known issues |
... | @@ -1550,7 +1559,7 @@ class Pipeline: | ... | @@ -1550,7 +1559,7 @@ class Pipeline: |
1550 | align = AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta") | 1559 | align = AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta") |
1551 | nb_3d_chains = len([1 for r in align if '[' in r.id]) | 1560 | nb_3d_chains = len([1 for r in align if '[' in r.id]) |
1552 | if r[0] in SSU_set: # SSU v138.1 is used | 1561 | if r[0] in SSU_set: # SSU v138.1 is used |
1553 | - nb_homologs = 2224740 # source: https://www.arb-silva.de/documentation/release-1381/ | 1562 | + nb_homologs = 2224740 # source: https://www.arb-silva.de/documentation/release-1381/ |
1554 | nb_total_homol = nb_homologs + nb_3d_chains | 1563 | nb_total_homol = nb_homologs + nb_3d_chains |
1555 | elif r[0] in LSU_set: # LSU v138.1 is used | 1564 | elif r[0] in LSU_set: # LSU v138.1 is used |
1556 | nb_homologs = 227331 # source: https://www.arb-silva.de/documentation/release-1381/ | 1565 | nb_homologs = 227331 # source: https://www.arb-silva.de/documentation/release-1381/ |
... | @@ -1794,9 +1803,9 @@ def init_no_tqdm(arg1, arg2, arg3): | ... | @@ -1794,9 +1803,9 @@ def init_no_tqdm(arg1, arg2, arg3): |
1794 | The children progress is followed using stdout text logs (notify(), warn(), etc) | 1803 | The children progress is followed using stdout text logs (notify(), warn(), etc) |
1795 | """ | 1804 | """ |
1796 | global n_launched, n_finished, n_skipped | 1805 | global n_launched, n_finished, n_skipped |
1797 | - n_launched = arg1 | 1806 | + n_launched = arg1 |
1798 | - n_finished = arg2 | 1807 | + n_finished = arg2 |
1799 | - n_skipped = arg3 | 1808 | + n_skipped = arg3 |
1800 | 1809 | ||
1801 | def warn(message, error=False): | 1810 | def warn(message, error=False): |
1802 | """ | 1811 | """ |
... | @@ -2147,7 +2156,7 @@ def execute_job(j, jobcount): | ... | @@ -2147,7 +2156,7 @@ def execute_job(j, jobcount): |
2147 | 2156 | ||
2148 | # increase the counter of running jobs | 2157 | # increase the counter of running jobs |
2149 | with n_launched.get_lock(): | 2158 | with n_launched.get_lock(): |
2150 | - n_launched.value += 1 | 2159 | + n_launched.value += 1 |
2151 | 2160 | ||
2152 | # Monitor this process | 2161 | # Monitor this process |
2153 | m = -1 | 2162 | m = -1 |
... | @@ -2208,7 +2217,8 @@ def execute_job(j, jobcount): | ... | @@ -2208,7 +2217,8 @@ def execute_job(j, jobcount): |
2208 | m = assistant_future.result() | 2217 | m = assistant_future.result() |
2209 | 2218 | ||
2210 | # increase the counter of finished jobs | 2219 | # increase the counter of finished jobs |
2211 | - running_stats[1] += 1 | 2220 | + with n_finished.get_lock(): |
2221 | + n_finished.value += 1 | ||
2212 | 2222 | ||
2213 | # return time and memory statistics, plus the job results | 2223 | # return time and memory statistics, plus the job results |
2214 | t = end_time - start_time | 2224 | t = end_time - start_time |
... | @@ -2223,9 +2233,12 @@ def execute_joblist(fulljoblist): | ... | @@ -2223,9 +2233,12 @@ def execute_joblist(fulljoblist): |
2223 | """ | 2233 | """ |
2224 | 2234 | ||
2225 | # Reset counters | 2235 | # Reset counters |
2226 | - running_stats[0] = 0 # started | 2236 | + with n_launched.get_lock(): |
2227 | - running_stats[1] = 0 # finished | 2237 | + n_launched.value = 0 |
2228 | - running_stats[2] = 0 # failed | 2238 | + with n_skipped.get_lock(): |
2239 | + n_skipped.value = 0 | ||
2240 | + with n_finished.get_lock(): | ||
2241 | + n_finished.value = 0 | ||
2229 | 2242 | ||
2230 | # Sort jobs in a tree structure, first by priority, then by CPU numbers | 2243 | # Sort jobs in a tree structure, first by priority, then by CPU numbers |
2231 | jobs = {} | 2244 | jobs = {} |
... | @@ -2276,10 +2289,6 @@ def execute_joblist(fulljoblist): | ... | @@ -2276,10 +2289,6 @@ def execute_joblist(fulljoblist): |
2276 | j.comp_time = round(r[0], 2) # seconds | 2289 | j.comp_time = round(r[0], 2) # seconds |
2277 | j.max_mem = int(r[1]/1000000) # MB | 2290 | j.max_mem = int(r[1]/1000000) # MB |
2278 | results.append((j.label, r[2], j.comp_time, j.max_mem)) | 2291 | results.append((j.label, r[2], j.comp_time, j.max_mem)) |
2279 | - | ||
2280 | - # Job is finished | ||
2281 | - with n_finished.get_lock(): | ||
2282 | - n_finished.value += 1 | ||
2283 | 2292 | ||
2284 | # throw back the money | 2293 | # throw back the money |
2285 | return results | 2294 | return results |
... | @@ -2672,13 +2681,17 @@ def use_infernal(rfam_acc, alignopts): | ... | @@ -2672,13 +2681,17 @@ def use_infernal(rfam_acc, alignopts): |
2672 | with open(path_to_seq_data + f"realigned/{rfam_acc}_new.log", 'w') as o: | 2681 | with open(path_to_seq_data + f"realigned/{rfam_acc}_new.log", 'w') as o: |
2673 | p1 = subprocess.run(["cmalign", "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins", | 2682 | p1 = subprocess.run(["cmalign", "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins", |
2674 | "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv", | 2683 | "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv", |
2675 | - "-o", path_to_seq_data + f"realigned/{rfam_acc}_new.stk", | 2684 | + "-o", new_ali_path, |
2676 | path_to_seq_data + f"realigned/{rfam_acc}.cm", | 2685 | path_to_seq_data + f"realigned/{rfam_acc}.cm", |
2677 | path_to_seq_data + f"realigned/{rfam_acc}_new.fa"], | 2686 | path_to_seq_data + f"realigned/{rfam_acc}_new.fa"], |
2678 | stdout=o, stderr=subprocess.PIPE) | 2687 | stdout=o, stderr=subprocess.PIPE) |
2679 | - if "--mxsize" in p1.stderr.decode("utf-8"): | 2688 | + align_errors = p1.stderr.decode("utf-8") |
2680 | - # not enough available RAM to allocate the DP matrix | 2689 | + if len(align_errors): |
2681 | - warn(f"Not enough RAM to allocate cmalign DP matrix for family {rfam_acc}. Use --sina or --cmalign-opts.", error=True) | 2690 | + if "--mxsize" in align_errors: |
2691 | + # not enough available RAM to allocate the DP matrix | ||
2692 | + warn(f"Not enough RAM to allocate cmalign DP matrix for family {rfam_acc}. Use --sina or --cmalign-opts.", error=True) | ||
2693 | + else: | ||
2694 | + warn(align_errors, error=True) | ||
2682 | notify("Aligned new sequences together") | 2695 | notify("Aligned new sequences together") |
2683 | 2696 | ||
2684 | # Detect doublons and remove them | 2697 | # Detect doublons and remove them |
... | @@ -2710,8 +2723,8 @@ def use_infernal(rfam_acc, alignopts): | ... | @@ -2710,8 +2723,8 @@ def use_infernal(rfam_acc, alignopts): |
2710 | os.remove(path_to_seq_data + "realigned/toremove.txt") | 2723 | os.remove(path_to_seq_data + "realigned/toremove.txt") |
2711 | 2724 | ||
2712 | # And we merge the two alignments | 2725 | # And we merge the two alignments |
2713 | - p2 = subprocess.run(["cmalign", "--merge" "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", | 2726 | + p2 = subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", |
2714 | - "--rna", path_to_seq_data + f"realigned/{rfam_acc}.cm", existing_ali_path, new_ali_path], | 2727 | + "--rna", existing_ali_path, new_ali_path], |
2715 | stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) | 2728 | stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) |
2716 | alignErrors = p1.stderr.decode('utf-8') | 2729 | alignErrors = p1.stderr.decode('utf-8') |
2717 | mergeErrors = p2.stderr.decode('utf-8') | 2730 | mergeErrors = p2.stderr.decode('utf-8') |
... | @@ -2730,7 +2743,7 @@ def use_infernal(rfam_acc, alignopts): | ... | @@ -2730,7 +2743,7 @@ def use_infernal(rfam_acc, alignopts): |
2730 | 2743 | ||
2731 | cmd = ["cmalign"] | 2744 | cmd = ["cmalign"] |
2732 | if alignopts is not None: | 2745 | if alignopts is not None: |
2733 | - cmd += " ".split(alignopts) | 2746 | + cmd += alignopts |
2734 | cmd += ['-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk", | 2747 | cmd += ['-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk", |
2735 | "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins", | 2748 | "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins", |
2736 | "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv", | 2749 | "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv", |
... | @@ -3166,8 +3179,6 @@ if __name__ == "__main__": | ... | @@ -3166,8 +3179,6 @@ if __name__ == "__main__": |
3166 | for c in pp.loaded_chains: | 3179 | for c in pp.loaded_chains: |
3167 | work_save(c, homology=False) | 3180 | work_save(c, homology=False) |
3168 | print("Completed.") | 3181 | print("Completed.") |
3169 | - exit(0) | ||
3170 | - | ||
3171 | 3182 | ||
3172 | # At this point, structure, chain and nucleotide tables of the database are up to date. | 3183 | # At this point, structure, chain and nucleotide tables of the database are up to date. |
3173 | # (Modulo some statistics computed by statistics.py) | 3184 | # (Modulo some statistics computed by statistics.py) | ... | ... |
... | @@ -35,11 +35,11 @@ nohup bash -c 'time docker run --rm -v /path/to/3D/data/folder:/3D -v /path/to/s | ... | @@ -35,11 +35,11 @@ nohup bash -c 'time docker run --rm -v /path/to/3D/data/folder:/3D -v /path/to/s |
35 | # Method 2 : Classical command line installation (Linux only) | 35 | # Method 2 : Classical command line installation (Linux only) |
36 | 36 | ||
37 | You need to install the dependencies: | 37 | You need to install the dependencies: |
38 | -- DSSR, you need to register to the X3DNA forum [here](http://forum.x3dna.org/site-announcements/download-instructions/) and then download the DSSR binary [on that page](http://forum.x3dna.org/downloads/3dna-download/). Make sure to have the `x3dna-dssr` binary in your $PATH variable so that RNANet.py finds it. | 38 | +- DSSR 1.9.9 or newer: register on the X3DNA forum [here](http://forum.x3dna.org/site-announcements/download-instructions/), then download the DSSR binary [on that page](http://forum.x3dna.org/downloads/3dna-download/). Make sure the `x3dna-dssr` binary is in your $PATH variable so that RNANet.py finds it. |
39 | -- Infernal, to download at [Eddylab](http://eddylab.org/infernal/), several options are available depending on your preferences. Make sure to have the `cmalign`, `cmfetch`, `cmbuild`, `esl-alimanip`, `esl-alipid` and `esl-reformat` binaries in your $PATH variable, so that RNANet.py can find them. | 39 | +- Infernal 1.1.4 or newer, available for download at [Eddylab](http://eddylab.org/infernal/); several installation options are available depending on your preferences. Make sure the `cmalign`, `cmfetch`, `cmbuild`, `esl-alimanip`, `esl-alimerge`, `esl-alipid` and `esl-reformat` binaries are in your $PATH variable, so that RNANet.py can find them. |
40 | -- SINA, follow [these instructions](https://sina.readthedocs.io/en/latest/install.html) for example. Make sure to have the `sina` binary in your $PATH. | 40 | +- SINA (if you plan to use it), follow [these instructions](https://sina.readthedocs.io/en/latest/install.html) for example. Make sure to have the `sina` binary in your $PATH. |
41 | - Sqlite 3, available under the name *sqlite* in every distro's package manager, | 41 | - Sqlite 3, available under the name *sqlite* in every distro's package manager, |
42 | -- Python >= 3.8, (Unfortunately, python3.6 is no longer supported, because of changes in the multiprocessing and Threading packages. Untested with Python 3.7.\*) | 42 | +- Python >= 3.8, (Unfortunately, python3.6 is no longer supported, because of changes in the multiprocessing and Threading packages. Untested with Python 3.7.\*). |
43 | - The following Python packages: `python3.8 -m pip install biopython matplotlib pandas psutil pymysql requests scipy setproctitle sqlalchemy tqdm`. | 43 | - The following Python packages: `python3.8 -m pip install biopython matplotlib pandas psutil pymysql requests scipy setproctitle sqlalchemy tqdm`. |
44 | 44 | ||
45 | Then, run it from the command line, preferably using nohup if your shell will be interrupted: | 45 | Then, run it from the command line, preferably using nohup if your shell will be interrupted: | ... | ... |
... | @@ -19,3 +19,6 @@ | ... | @@ -19,3 +19,6 @@ |
19 | * Use and save Infernal alignment bounds and truncation information | 19 | * Use and save Infernal alignment bounds and truncation information |
20 | * Save if a chain is a representative in BGSU list | 20 | * Save if a chain is a representative in BGSU list |
21 | * Annotate unstructured regions (on a nucleotide basis) | 21 | * Annotate unstructured regions (on a nucleotide basis) |
22 | + | ||
23 | +## Technical to-do list | ||
24 | +* `cmalign --merge` is now deprecated, so we use `esl-alimerge` instead. However, `esl-alimerge` is a single-core process: we should run the merges of alignments of different families in parallel to save some time [TODO]. | ... | ... |
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff could not be displayed because it is too large.
... | @@ -26,7 +26,7 @@ from os import path | ... | @@ -26,7 +26,7 @@ from os import path |
26 | from tqdm import tqdm | 26 | from tqdm import tqdm |
27 | from collections import Counter | 27 | from collections import Counter |
28 | from setproctitle import setproctitle | 28 | from setproctitle import setproctitle |
29 | -from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker, trace_unhandled_exceptions | 29 | +from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_with_tqdm, trace_unhandled_exceptions |
30 | from geometric_stats import * | 30 | from geometric_stats import * |
31 | 31 | ||
32 | np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf, precision=8) | 32 | np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf, precision=8) |
... | @@ -948,7 +948,11 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s): | ... | @@ -948,7 +948,11 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s): |
948 | nb_gap += 1 | 948 | nb_gap += 1 |
949 | coordinates_with_gaps.append(np.nan) | 949 | coordinates_with_gaps.append(np.nan) |
950 | else: | 950 | else: |
951 | - coordinates_with_gaps.append(coordinates[i - nb_gap]) | 951 | + try: |
952 | + coordinates_with_gaps.append(coordinates[i - nb_gap]) | ||
953 | + except IndexError as e: | ||
954 | + warn(f"{filename} : {s.seq} at position {i}, we get {e}.", error=True) | ||
955 | + exit(0) | ||
952 | 956 | ||
953 | # Build the pairwise distances | 957 | # Build the pairwise distances |
954 | d = np.zeros((len(s.seq), len(s.seq)), dtype=np.float32) | 958 | d = np.zeros((len(s.seq), len(s.seq)), dtype=np.float32) |
... | @@ -1055,7 +1059,7 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False): | ... | @@ -1055,7 +1059,7 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False): |
1055 | else: | 1059 | else: |
1056 | # We split the work for one family on multiple workers. | 1060 | # We split the work for one family on multiple workers. |
1057 | 1061 | ||
1058 | - p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers) | 1062 | + p = Pool(initializer=init_with_tqdm, initargs=(tqdm.get_lock(),), processes=nworkers) |
1059 | try: | 1063 | try: |
1060 | fam_pbar = tqdm(total=len(align), desc=f"{f} {label} pair distances", position=0, unit="chain", leave=True) | 1064 | fam_pbar = tqdm(total=len(align), desc=f"{f} {label} pair distances", position=0, unit="chain", leave=True) |
1061 | # Apply work_pssm_remap to each RNA family | 1065 | # Apply work_pssm_remap to each RNA family |
... | @@ -1147,8 +1151,11 @@ def nt_3d_centers(cif_file, consider_all_atoms): | ... | @@ -1147,8 +1151,11 @@ def nt_3d_centers(cif_file, consider_all_atoms): |
1147 | Some chains have no C1' (e.g. 4v7f-3), therefore, an empty result is returned. | 1151 | Some chains have no C1' (e.g. 4v7f-3), therefore, an empty result is returned. |
1148 | """ | 1152 | """ |
1149 | result =[] | 1153 | result =[] |
1150 | - structure = MMCIFParser().get_structure(cif_file, cif_file) | 1154 | + try: |
1151 | - | 1155 | + structure = MMCIFParser().get_structure(cif_file, cif_file) |
1156 | + except Exception as e: | ||
1157 | + warn(f"{cif_file} : {e}", error=True) | ||
1158 | + return result | ||
1152 | for model in structure: | 1159 | for model in structure: |
1153 | for chain in model: | 1160 | for chain in model: |
1154 | for residue in chain: | 1161 | for residue in chain: |
... | @@ -1203,7 +1210,7 @@ def process_jobs(joblist): | ... | @@ -1203,7 +1210,7 @@ def process_jobs(joblist): |
1203 | Starts a Pool to run the Job() objects in joblist. | 1210 | Starts a Pool to run the Job() objects in joblist. |
1204 | """ | 1211 | """ |
1205 | tmp_nworkers = min(len(joblist), nworkers) | 1212 | tmp_nworkers = min(len(joblist), nworkers) |
1206 | - p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=tmp_nworkers) | 1213 | + p = Pool(initializer=init_with_tqdm, initargs=(tqdm.get_lock(),), processes=tmp_nworkers) |
1207 | pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, unit="job", leave=True) | 1214 | pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, unit="job", leave=True) |
1208 | 1215 | ||
1209 | try: | 1216 | try: |
... | @@ -1345,31 +1352,31 @@ if __name__ == "__main__": | ... | @@ -1345,31 +1352,31 @@ if __name__ == "__main__": |
1345 | # Define the tasks | 1352 | # Define the tasks |
1346 | joblist = [] | 1353 | joblist = [] |
1347 | 1354 | ||
1348 | - # # Do eta/theta plots | 1355 | + # Do eta/theta plots |
1349 | - # if n_unmapped_chains and DO_WADLEY_ANALYSIS: | 1356 | + if n_unmapped_chains and DO_WADLEY_ANALYSIS: |
1350 | - # joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr))) | 1357 | + joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr))) |
1351 | - # joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr))) | 1358 | + joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr))) |
1352 | - | 1359 | + |
1353 | - # # Do distance matrices for each family excl. LSU/SSU (will be processed later) | 1360 | + # Do distance matrices for each family excl. LSU/SSU (will be processed later) |
1354 | - # if DO_AVG_DISTANCE_MATRIX: | 1361 | + if DO_AVG_DISTANCE_MATRIX: |
1355 | - # extracted_chains = [] | 1362 | + extracted_chains = [] |
1356 | - # for file in os.listdir(path_to_3D_data + "rna_mapped_to_Rfam"): | 1363 | + for file in os.listdir(path_to_3D_data + "rna_mapped_to_Rfam"): |
1357 | - # if os.path.isfile(os.path.join(path_to_3D_data + "rna_mapped_to_Rfam", file)): | 1364 | + if os.path.isfile(os.path.join(path_to_3D_data + "rna_mapped_to_Rfam", file)): |
1358 | - # e1 = file.split('_')[0] | 1365 | + e1 = file.split('_')[0] |
1359 | - # e2 = file.split('_')[1] | 1366 | + e2 = file.split('_')[1] |
1360 | - # e3 = file.split('_')[2] | 1367 | + e3 = file.split('_')[2] |
1361 | - # extracted_chains.append(e1 + '[' + e2 + ']' + '-' + e3) | 1368 | + extracted_chains.append(e1 + '[' + e2 + ']' + '-' + e3) |
1362 | - # for f in [ x for x in famlist if (x not in LSU_set and x not in SSU_set) ]: # Process the rRNAs later only 3 by 3 | 1369 | + for f in [ x for x in famlist if (x not in LSU_set and x not in SSU_set) ]: # Process the rRNAs later only 3 by 3 |
1363 | - # joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, True, False))) | 1370 | + joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, True, False))) |
1364 | - # joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, False, False))) | 1371 | + joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, False, False))) |
1365 | - | 1372 | + |
1366 | - # # Do general family statistics | 1373 | + # Do general family statistics |
1367 | - # joblist.append(Job(function=stats_len)) # Computes figures about chain lengths | 1374 | + joblist.append(Job(function=stats_len)) # Computes figures about chain lengths |
1368 | - # joblist.append(Job(function=stats_freq)) # updates the database (nucleotide frequencies in families) | 1375 | + joblist.append(Job(function=stats_freq)) # updates the database (nucleotide frequencies in families) |
1369 | - # for f in famlist: | 1376 | + for f in famlist: |
1370 | - # joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database (intra-chain basepair types within a family) | 1377 | + joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database (intra-chain basepair types within a family) |
1371 | - # if f not in ignored: | 1378 | + if f not in ignored: |
1372 | - # joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database (identity matrices of families) | 1379 | + joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database (identity matrices of families) |
1373 | 1380 | ||
1374 | 1381 | ||
1375 | # Do geometric measures | 1382 | # Do geometric measures |
... | @@ -1382,7 +1389,7 @@ if __name__ == "__main__": | ... | @@ -1382,7 +1389,7 @@ if __name__ == "__main__": |
1382 | joblist.append(Job(function=measure_from_structure, args=(f,), how_many_in_parallel=nworkers)) # All-atom distances | 1389 | joblist.append(Job(function=measure_from_structure, args=(f,), how_many_in_parallel=nworkers)) # All-atom distances |
1383 | 1390 | ||
1384 | 1391 | ||
1385 | - # process_jobs(joblist) | 1392 | + process_jobs(joblist) |
1386 | 1393 | ||
1387 | # Now process the memory-heavy tasks family by family | 1394 | # Now process the memory-heavy tasks family by family |
1388 | if DO_AVG_DISTANCE_MATRIX: | 1395 | if DO_AVG_DISTANCE_MATRIX: |
... | @@ -1398,11 +1405,11 @@ if __name__ == "__main__": | ... | @@ -1398,11 +1405,11 @@ if __name__ == "__main__": |
1398 | 1405 | ||
1399 | # finish the work after the parallel portions | 1406 | # finish the work after the parallel portions |
1400 | 1407 | ||
1401 | - # per_chain_stats() # per chain base frequencies and basepair types | 1408 | + per_chain_stats() # per chain base frequencies and basepair types |
1402 | - # seq_idty() # identity matrices from pre-computed .npy matrices | 1409 | + seq_idty() # identity matrices from pre-computed .npy matrices |
1403 | - # stats_pairs() | 1410 | + stats_pairs() |
1404 | if n_unmapped_chains: | 1411 | if n_unmapped_chains: |
1405 | - # general_stats() | 1412 | + general_stats() |
1406 | os.makedirs(runDir+"/results/figures/GMM/", exist_ok=True) | 1413 | os.makedirs(runDir+"/results/figures/GMM/", exist_ok=True) |
1407 | os.makedirs(runDir+"/results/geometry/json/", exist_ok=True) | 1414 | os.makedirs(runDir+"/results/geometry/json/", exist_ok=True) |
1408 | concat_dataframes(runDir + '/results/geometry/all-atoms/distances/', 'dist_atoms.csv') | 1415 | concat_dataframes(runDir + '/results/geometry/all-atoms/distances/', 'dist_atoms.csv') | ... | ... |
-
Please register or login to post a comment