Louis BECQUEY

Fixes + using esl-alimerge again

@@ -71,11 +71,6 @@ sqlite3.enable_callback_tracebacks(True)
 sqlite3.register_adapter(np.int64, lambda val: int(val))      # Tell Sqlite what to do with <class numpy.int64> objects ---> convert to int
 sqlite3.register_adapter(np.float64, lambda val: float(val))  # Tell Sqlite what to do with <class numpy.float64> objects ---> convert to float
 
-# m = Manager()
-# running_stats = m.list()
-# running_stats.append(0) # n_launched
-# running_stats.append(0) # n_finished
-# running_stats.append(0) # n_skipped
 n_launched = Value('i', 0)
 n_finished = Value('i', 0)
 n_skipped = Value('i', 0)
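This hunk completes the move from a Manager-backed `running_stats` list to plain `multiprocessing.Value` counters, which live in shared memory and skip the round-trip to a manager process. A minimal sketch of the pattern, with a hypothetical worker function rather than the pipeline's own jobs:

```python
# Sketch of the shared-counter pattern; do_job is a stand-in for real work.
from multiprocessing import Pool, Value

n_launched = Value('i', 0)    # 'i' = C int, allocated in shared memory
n_finished = Value('i', 0)

def init_worker(launched, finished):
    # Runs once in each child process: rebind the globals to the shared Values.
    global n_launched, n_finished
    n_launched = launched
    n_finished = finished

def do_job(x):
    with n_launched.get_lock():    # get_lock() makes the += atomic across processes
        n_launched.value += 1
    result = x * x
    with n_finished.get_lock():
        n_finished.value += 1
    return result

if __name__ == "__main__":
    with Pool(4, initializer=init_worker, initargs=(n_launched, n_finished)) as p:
        results = p.map(do_job, range(10))
    print(results, n_finished.value)   # n_finished.value == 10
```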
@@ -635,12 +630,24 @@ class Chain:
             if nt2 in res_ids:
                 interacts[nt2_idx] += 1
                 if paired[nt2_idx] == "":
-                    pair_type_LW[nt2_idx] = lw_pair[0] + lw_pair[2] + lw_pair[1]
-                    pair_type_DSSR[nt2_idx] = dssr_pair[0] + dssr_pair[3] + dssr_pair[2] + dssr_pair[1]
+                    if lw_pair != "--":
+                        pair_type_LW[nt2_idx] = lw_pair[0] + lw_pair[2] + lw_pair[1]
+                    else:
+                        pair_type_LW[nt2_idx] = "--"
+                    if dssr_pair != "--":
+                        pair_type_DSSR[nt2_idx] = dssr_pair[0] + dssr_pair[3] + dssr_pair[2] + dssr_pair[1]
+                    else:
+                        pair_type_DSSR[nt2_idx] = "--"
                     paired[nt2_idx] = str(nt1_idx + 1)
                 else:
-                    pair_type_LW[nt2_idx] += ',' + lw_pair[0] + lw_pair[2] + lw_pair[1]
-                    pair_type_DSSR[nt2_idx] += ',' + dssr_pair[0] + dssr_pair[3] + dssr_pair[2] + dssr_pair[1]
+                    if lw_pair != "--":
+                        pair_type_LW[nt2_idx] += ',' + lw_pair[0] + lw_pair[2] + lw_pair[1]
+                    else:
+                        pair_type_LW[nt2_idx] += ",--"
+                    if dssr_pair != "--":
+                        pair_type_DSSR[nt2_idx] += ',' + dssr_pair[0] + dssr_pair[3] + dssr_pair[2] + dssr_pair[1]
+                    else:
+                        pair_type_DSSR[nt2_idx] += ",--"
                     paired[nt2_idx] += ',' + str(nt1_idx + 1)
 
             # transform nt_id to shorter values
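The new branches guard against the `--` placeholder emitted for unclassified basepairs. For a classified pair, the partner nucleotide sees the same pair with its two edges swapped: a 3-character Leontis-Westhof code like `cWH` becomes `cHW`, and a 4-character DSSR code with the edges in positions 1 and 3, like `cW-H`, becomes `cH-W`. A hedged helper capturing just that swap (`reverse_pair_view` is hypothetical, not a pipeline function):

```python
# Equivalent of the swaps above, factored into one function.
def reverse_pair_view(lw_pair: str, dssr_pair: str):
    """Annotation of the same basepair, seen from the partner nucleotide.

    LW codes are 3 characters (cis/trans + two edges): 'cWH' -> 'cHW'.
    DSSR codes are 4 characters with the edges around a separator:
    'cW-H' -> 'cH-W'. '--' marks an unclassified pair and is kept as-is.
    """
    lw = "--" if lw_pair == "--" else lw_pair[0] + lw_pair[2] + lw_pair[1]
    dssr = "--" if dssr_pair == "--" else dssr_pair[0] + dssr_pair[3] + dssr_pair[2] + dssr_pair[1]
    return lw, dssr

assert reverse_pair_view("cWH", "cW-H") == ("cHW", "cH-W")
assert reverse_pair_view("--", "--") == ("--", "--")
```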
@@ -1083,7 +1090,7 @@ class Pipeline:
         self.REUSE_ALL = False
         self.REDUNDANT = False
         self.ALIGNOPTS = None
-        self.RRNAALIGNOPTS = "--mxsize 8192 --cpu 10 --maxtau 0.1"
+        self.RRNAALIGNOPTS = ["--mxsize", "8192", "--cpu", "10", "--maxtau", "0.1"]
         self.STATSOPTS = None
         self.USESINA = False
         self.SELECT_ONLY = None
@@ -1151,6 +1158,7 @@ class Pipeline:
               "\n\t\t\t\t need of RAM. Should be a number between 1 and your number of CPUs. Note that portions"
               "\n\t\t\t\t of the pipeline already limit themselves to 50% or 70% of that number by default.")
         print("--cmalign-opts=…\t\tA string of additional options to pass to cmalign aligner, e.g. \"--nonbanded --mxsize 2048\"")
+        print("--cmalign-rrna-opts=…\tLike --cmalign-opts, but applied to the rRNA families (large, memory-heavy alignment jobs).")
         print("--archive\t\t\tCreate tar.gz archives of the datapoints text files and the alignments,"
               "\n\t\t\t\t and update the link to the latest archive.")
         print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications.")
@@ -1219,7 +1227,7 @@ class Pipeline:
             elif opt == "--cmalign-opts":
                 self.ALIGNOPTS = arg
             elif opt == "--cmalign-rrna-opts":
-                self.RRNAALIGNOPTS = arg
+                self.RRNAALIGNOPTS = arg.split()
             elif opt == "--stats-opts":
                 self.STATSOPTS = arg.split()
             elif opt == "--all":
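Storing the rRNA options as an argv-style list lets them be concatenated directly onto the `cmalign` command later (`cmd += alignopts`), with no shell involved. Note that `split` must be called on the option string itself, not on the delimiter. A sketch of the tokenization step; `shlex.split` is shown as a quote-aware alternative, as an assumption rather than what the pipeline uses:

```python
# Turning a user-supplied option string into subprocess argv tokens.
import shlex

opt_string = "--mxsize 8192 --cpu 10 --maxtau 0.1"
rrna_opts = opt_string.split()         # what the option parser above does
quoted_opts = shlex.split(opt_string)  # alternative that also respects "quoted args"

# Hypothetical output and input paths, for illustration only.
cmd = ["cmalign"] + rrna_opts + ["-o", "out.stk", "family.cm", "new.fa"]
print(cmd)   # ['cmalign', '--mxsize', '8192', '--cpu', '10', '--maxtau', '0.1', ...]
```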
@@ -1436,8 +1444,9 @@ class Pipeline:
                                    args=[c, self.EXTRACT_CHAINS, self.KEEP_HETATM, retry, self.SAVELOGS]))
         try:
             results = execute_joblist(joblist)
-        except:
-            print("Exiting", flush=True)
+        except Exception as e:
+            warn(str(e), error=True)
+            print("Exiting", str(e), flush=True)
             exit(1)
 
         # If there were newly discovered problems, add this chain to the known issues
@@ -1550,7 +1559,7 @@ class Pipeline:
             align = AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta")
             nb_3d_chains = len([1 for r in align if '[' in r.id])
             if r[0] in SSU_set:     # SSU v138.1 is used
-                nb_homologs = 2224740       # source: https://www.arb-silva.de/documentation/release-1381/
+                nb_homologs = 2224740   # source: https://www.arb-silva.de/documentation/release-1381/
                 nb_total_homol = nb_homologs + nb_3d_chains
             elif r[0] in LSU_set:   # LSU v138.1 is used
                 nb_homologs = 227331    # source: https://www.arb-silva.de/documentation/release-1381/
@@ -1794,9 +1803,9 @@ def init_no_tqdm(arg1, arg2, arg3):
     The children progress is followed using stdout text logs (notify(), warn(), etc)
     """
     global n_launched, n_finished, n_skipped
-    n_launched = arg1
-    n_finished = arg2
-    n_skipped = arg3
+    n_launched = arg1
+    n_finished = arg2
+    n_skipped = arg3
 
 def warn(message, error=False):
     """
@@ -2147,7 +2156,7 @@ def execute_job(j, jobcount):
 
     # increase the counter of running jobs
     with n_launched.get_lock():
-        n_launched.value += 1
+        n_launched.value += 1
 
     # Monitor this process
     m = -1
@@ -2208,7 +2217,8 @@ def execute_job(j, jobcount):
         m = assistant_future.result()
 
     # increase the counter of finished jobs
-    running_stats[1] += 1
+    with n_finished.get_lock():
+        n_finished.value += 1
 
     # return time and memory statistics, plus the job results
     t = end_time - start_time
@@ -2223,9 +2233,12 @@ def execute_joblist(fulljoblist):
     """
 
     # Reset counters
-    running_stats[0] = 0  # started
-    running_stats[1] = 0  # finished
-    running_stats[2] = 0  # failed
+    with n_launched.get_lock():
+        n_launched.value = 0
+    with n_skipped.get_lock():
+        n_skipped.value = 0
+    with n_finished.get_lock():
+        n_finished.value = 0
 
     # Sort jobs in a tree structure, first by priority, then by CPU numbers
     jobs = {}
@@ -2276,10 +2289,6 @@ def execute_joblist(fulljoblist):
             j.comp_time = round(r[0], 2)   # seconds
             j.max_mem = int(r[1]/1000000)  # MB
             results.append((j.label, r[2], j.comp_time, j.max_mem))
-
-            # Job is finished
-            with n_finished.get_lock():
-                n_finished.value += 1
 
     # throw back the money
     return results
@@ -2672,13 +2681,17 @@ def use_infernal(rfam_acc, alignopts):
        with open(path_to_seq_data + f"realigned/{rfam_acc}_new.log", 'w') as o:
            p1 = subprocess.run(["cmalign", "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins",
                                 "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv",
-                                "-o", path_to_seq_data + f"realigned/{rfam_acc}_new.stk",
+                                "-o", new_ali_path,
                                 path_to_seq_data + f"realigned/{rfam_acc}.cm",
                                 path_to_seq_data + f"realigned/{rfam_acc}_new.fa"],
                                stdout=o, stderr=subprocess.PIPE)
-       if "--mxsize" in p1.stderr.decode("utf-8"):
-           # not enough available RAM to allocate the DP matrix
-           warn(f"Not enough RAM to allocate cmalign DP matrix for family {rfam_acc}. Use --sina or --cmalign-opts.", error=True)
+       align_errors = p1.stderr.decode("utf-8")
+       if len(align_errors):
+           if "--mxsize" in align_errors:
+               # not enough available RAM to allocate the DP matrix
+               warn(f"Not enough RAM to allocate cmalign DP matrix for family {rfam_acc}. Use --sina or --cmalign-opts.", error=True)
+           else:
+               warn(align_errors, error=True)
        notify("Aligned new sequences together")
 
        # Detect duplicates and remove them
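The rewritten error handling surfaces the whole cmalign stderr stream instead of only the out-of-memory case. A generic sketch of that capture-and-dispatch pattern; `run_and_report` is a hypothetical helper, not part of RNAnet:

```python
# Run a command, report its stderr, and special-case the known RAM hint.
import subprocess

def run_and_report(cmd, family, warn=print):
    p = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
    errors = p.stderr.decode("utf-8")
    if errors:
        if "--mxsize" in errors:   # cmalign mentions --mxsize when the DP matrix does not fit in RAM
            warn(f"Not enough RAM for the cmalign DP matrix of family {family}.")
        else:
            warn(errors)           # anything else: pass the message through verbatim
    return p.returncode
```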
@@ -2710,8 +2723,8 @@ def use_infernal(rfam_acc, alignopts):
         os.remove(path_to_seq_data + "realigned/toremove.txt")
 
     # And we merge the two alignments
-    p2 = subprocess.run(["cmalign", "--merge" "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk",
-                         "--rna", path_to_seq_data + f"realigned/{rfam_acc}.cm", existing_ali_path, new_ali_path],
+    p2 = subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk",
+                         "--rna", existing_ali_path, new_ali_path],
                         stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
     alignErrors = p1.stderr.decode('utf-8')
     mergeErrors = p2.stderr.decode('utf-8')
@@ -2730,7 +2743,7 @@ def use_infernal(rfam_acc, alignopts):
 
     cmd = ["cmalign"]
     if alignopts is not None:
-        cmd += " ".split(alignopts)
+        cmd += alignopts
     cmd += ['-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk",
             "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins",
             "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv",
@@ -3166,8 +3179,6 @@ if __name__ == "__main__":
         for c in pp.loaded_chains:
             work_save(c, homology=False)
         print("Completed.")
-        exit(0)
-
 
     # At this point, structure, chain and nucleotide tables of the database are up to date.
     # (Modulo some statistics computed by statistics.py)
...
@@ -35,11 +35,11 @@ nohup bash -c 'time docker run --rm -v /path/to/3D/data/folder:/3D -v /path/to/s
 # Method 2 : Classical command line installation (Linux only)
 
 You need to install the dependencies:
-- DSSR, you need to register to the X3DNA forum [here](http://forum.x3dna.org/site-announcements/download-instructions/) and then download the DSSR binary [on that page](http://forum.x3dna.org/downloads/3dna-download/). Make sure to have the `x3dna-dssr` binary in your $PATH variable so that RNANet.py finds it.
-- Infernal, to download at [Eddylab](http://eddylab.org/infernal/), several options are available depending on your preferences. Make sure to have the `cmalign`, `cmfetch`, `cmbuild`, `esl-alimanip`, `esl-alipid` and `esl-reformat` binaries in your $PATH variable, so that RNANet.py can find them.
-- SINA, follow [these instructions](https://sina.readthedocs.io/en/latest/install.html) for example. Make sure to have the `sina` binary in your $PATH.
+- DSSR 1.9.9 or newer: register on the X3DNA forum [here](http://forum.x3dna.org/site-announcements/download-instructions/), then download the DSSR binary [on that page](http://forum.x3dna.org/downloads/3dna-download/). Make sure the `x3dna-dssr` binary is in your $PATH so that RNANet.py can find it.
+- Infernal 1.1.4 or newer, available from [Eddylab](http://eddylab.org/infernal/) in several packagings depending on your preferences. Make sure the `cmalign`, `cmfetch`, `cmbuild`, `esl-alimanip`, `esl-alimerge`, `esl-alipid` and `esl-reformat` binaries are in your $PATH so that RNANet.py can find them.
+- SINA (only if you plan to use it): follow [these instructions](https://sina.readthedocs.io/en/latest/install.html), for example. Make sure the `sina` binary is in your $PATH.
 - Sqlite 3, available under the name *sqlite* in every distro's package manager,
-- Python >= 3.8, (Unfortunately, python3.6 is no longer supported, because of changes in the multiprocessing and Threading packages. Untested with Python 3.7.\*)
+- Python >= 3.8 (unfortunately, Python 3.6 is no longer supported because of changes in the multiprocessing and threading packages; untested with Python 3.7.\*).
 - The following Python packages: `python3.8 -m pip install biopython matplotlib pandas psutil pymysql requests scipy setproctitle sqlalchemy tqdm`.
 
 Then, run it from the command line, preferably using nohup if your shell will be interrupted:
...
@@ -19,3 +19,6 @@
 * Use and save Infernal alignment bounds and truncation information
 * Save if a chain is a representative in BGSU list
 * Annotate unstructured regions (on a nucleotide basis)
+
+## Technical to-do list
+* `cmalign --merge` is now deprecated; we use `esl-alimerge` instead. However, esl-alimerge is a single-core process, so we should run the merges of the alignments of different families in parallel to save some time, as sketched below. [TODO]
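A hedged sketch of that parallelization, assuming one `esl-alimerge` invocation per family; `merge_family`, the paths and the accession list are illustrative, not the pipeline's actual API:

```python
# Possible parallel merges: one single-core esl-alimerge process per family.
import subprocess
from multiprocessing import Pool

def merge_family(rfam_acc):
    # esl-alimerge writes the merged alignment to -o; --rna asserts the alphabet.
    p = subprocess.run(["esl-alimerge", "-o", f"realigned/{rfam_acc}_merged.stk", "--rna",
                        f"realigned/{rfam_acc}.stk", f"realigned/{rfam_acc}_new.stk"],
                       stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
    return rfam_acc, p.returncode, p.stderr.decode("utf-8")

if __name__ == "__main__":
    families = ["RF00001", "RF00005", "RF02541"]   # example Rfam accessions
    with Pool(processes=4) as pool:                # 4 merges running at a time
        for acc, rc, err in pool.imap_unordered(merge_family, families):
            if rc != 0:
                print(f"{acc}: esl-alimerge failed: {err}")
```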
...

[Two more diffs are collapsed here, and a third could not be displayed because it is too large.]
@@ -26,7 +26,7 @@ from os import path
 from tqdm import tqdm
 from collections import Counter
 from setproctitle import setproctitle
-from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker, trace_unhandled_exceptions
+from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_with_tqdm, trace_unhandled_exceptions
 from geometric_stats import *
 
 np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf, precision=8)
@@ -948,7 +948,11 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s):
                 nb_gap += 1
                 coordinates_with_gaps.append(np.nan)
             else:
-                coordinates_with_gaps.append(coordinates[i - nb_gap])
+                try:
+                    coordinates_with_gaps.append(coordinates[i - nb_gap])
+                except IndexError as e:
+                    warn(f"{filename} : {s.seq} at position {i}, we get {e}.", error=True)
+                    exit(0)
 
     # Build the pairwise distances
     d = np.zeros((len(s.seq), len(s.seq)), dtype=np.float32)
@@ -1055,7 +1059,7 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False):
     else:
         # We split the work for one family on multiple workers.
 
-        p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
+        p = Pool(initializer=init_with_tqdm, initargs=(tqdm.get_lock(),), processes=nworkers)
         try:
             fam_pbar = tqdm(total=len(align), desc=f"{f} {label} pair distances", position=0, unit="chain", leave=True)
             # Apply work_pssm_remap to each RNA family
@@ -1147,8 +1151,11 @@ def nt_3d_centers(cif_file, consider_all_atoms):
     Some chains have no C1' (e.g. 4v7f-3), therefore, an empty result is returned.
     """
     result = []
-    structure = MMCIFParser().get_structure(cif_file, cif_file)
-
+    try:
+        structure = MMCIFParser().get_structure(cif_file, cif_file)
+    except Exception as e:
+        warn(f"{cif_file} : {e}", error=True)
+        return result
     for model in structure:
         for chain in model:
             for residue in chain:
@@ -1203,7 +1210,7 @@ def process_jobs(joblist):
     Starts a Pool to run the Job() objects in joblist.
     """
     tmp_nworkers = min(len(joblist), nworkers)
-    p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=tmp_nworkers)
+    p = Pool(initializer=init_with_tqdm, initargs=(tqdm.get_lock(),), processes=tmp_nworkers)
     pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, unit="job", leave=True)
 
     try:
@@ -1345,31 +1352,31 @@ if __name__ == "__main__":
     # Define the tasks
     joblist = []
 
-    # # Do eta/theta plots
-    # if n_unmapped_chains and DO_WADLEY_ANALYSIS:
-    #     joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr)))
-    #     joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr)))
-
-    # # Do distance matrices for each family excl. LSU/SSU (will be processed later)
-    # if DO_AVG_DISTANCE_MATRIX:
-    #     extracted_chains = []
-    #     for file in os.listdir(path_to_3D_data + "rna_mapped_to_Rfam"):
-    #         if os.path.isfile(os.path.join(path_to_3D_data + "rna_mapped_to_Rfam", file)):
-    #             e1 = file.split('_')[0]
-    #             e2 = file.split('_')[1]
-    #             e3 = file.split('_')[2]
-    #             extracted_chains.append(e1 + '[' + e2 + ']' + '-' + e3)
-    #     for f in [ x for x in famlist if (x not in LSU_set and x not in SSU_set) ]:  # Process the rRNAs later only 3 by 3
-    #         joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, True, False)))
-    #         joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, False, False)))
-
-    # # Do general family statistics
-    # joblist.append(Job(function=stats_len))   # Computes figures about chain lengths
-    # joblist.append(Job(function=stats_freq))  # updates the database (nucleotide frequencies in families)
-    # for f in famlist:
-    #     joblist.append(Job(function=parallel_stats_pairs, args=(f,)))  # updates the database (intra-chain basepair types within a family)
-    #     if f not in ignored:
-    #         joblist.append(Job(function=to_id_matrix, args=(f,)))  # updates the database (identity matrices of families)
+    # Do eta/theta plots
+    if n_unmapped_chains and DO_WADLEY_ANALYSIS:
+        joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr)))
+        joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr)))
+
+    # Do distance matrices for each family excl. LSU/SSU (will be processed later)
+    if DO_AVG_DISTANCE_MATRIX:
+        extracted_chains = []
+        for file in os.listdir(path_to_3D_data + "rna_mapped_to_Rfam"):
+            if os.path.isfile(os.path.join(path_to_3D_data + "rna_mapped_to_Rfam", file)):
+                e1 = file.split('_')[0]
+                e2 = file.split('_')[1]
+                e3 = file.split('_')[2]
+                extracted_chains.append(e1 + '[' + e2 + ']' + '-' + e3)
+        for f in [ x for x in famlist if (x not in LSU_set and x not in SSU_set) ]:  # Process the rRNAs later only 3 by 3
+            joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, True, False)))
+            joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, False, False)))
+
+    # Do general family statistics
+    joblist.append(Job(function=stats_len))   # Computes figures about chain lengths
+    joblist.append(Job(function=stats_freq))  # updates the database (nucleotide frequencies in families)
+    for f in famlist:
+        joblist.append(Job(function=parallel_stats_pairs, args=(f,)))  # updates the database (intra-chain basepair types within a family)
+        if f not in ignored:
+            joblist.append(Job(function=to_id_matrix, args=(f,)))  # updates the database (identity matrices of families)
 
 
     # Do geometric measures
@@ -1382,7 +1389,7 @@ if __name__ == "__main__":
             joblist.append(Job(function=measure_from_structure, args=(f,), how_many_in_parallel=nworkers))  # All-atom distances
 
 
-    # process_jobs(joblist)
+    process_jobs(joblist)
 
     # Now process the memory-heavy tasks family by family
     if DO_AVG_DISTANCE_MATRIX:
@@ -1398,11 +1405,11 @@ if __name__ == "__main__":
 
     # finish the work after the parallel portions
 
-    # per_chain_stats()  # per chain base frequencies and basepair types
-    # seq_idty()         # identity matrices from pre-computed .npy matrices
-    # stats_pairs()
+    per_chain_stats()  # per chain base frequencies and basepair types
+    seq_idty()         # identity matrices from pre-computed .npy matrices
+    stats_pairs()
     if n_unmapped_chains:
-        # general_stats()
+        general_stats()
         os.makedirs(runDir+"/results/figures/GMM/", exist_ok=True)
         os.makedirs(runDir+"/results/geometry/json/", exist_ok=True)
         concat_dataframes(runDir + '/results/geometry/all-atoms/distances/', 'dist_atoms.csv')
...