Showing
2 changed files
with
57 additions
and
32 deletions
... | @@ -18,6 +18,7 @@ from functools import partial | ... | @@ -18,6 +18,7 @@ from functools import partial |
18 | from os import path, makedirs | 18 | from os import path, makedirs |
19 | from multiprocessing import Pool, cpu_count, Manager | 19 | from multiprocessing import Pool, cpu_count, Manager |
20 | from time import sleep | 20 | from time import sleep |
21 | +from tqdm import tqdm | ||
21 | 22 | ||
22 | if path.isdir("/home/ubuntu/"): # this is the IFB-core cloud | 23 | if path.isdir("/home/ubuntu/"): # this is the IFB-core cloud |
23 | path_to_3D_data = "/mnt/Data/RNA/3D/" | 24 | path_to_3D_data = "/mnt/Data/RNA/3D/" |
... | @@ -267,9 +268,9 @@ class Job: | ... | @@ -267,9 +268,9 @@ class Job: |
267 | self.max_mem = -1 # not executed yet | 268 | self.max_mem = -1 # not executed yet |
268 | self.label = label | 269 | self.label = label |
269 | if not how_many_in_parallel: | 270 | if not how_many_in_parallel: |
270 | - self.nthreads = cpu_count() | 271 | + self.nthreads = read_cpu_number() |
271 | elif how_many_in_parallel == -1: | 272 | elif how_many_in_parallel == -1: |
272 | - self.nthreads = cpu_count() - 1 | 273 | + self.nthreads = read_cpu_number() - 1 |
273 | else: | 274 | else: |
274 | self.nthreads = how_many_in_parallel | 275 | self.nthreads = how_many_in_parallel |
275 | self.useless_bool = False | 276 | self.useless_bool = False |
... | @@ -472,6 +473,13 @@ class Monitor: | ... | @@ -472,6 +473,13 @@ class Monitor: |
472 | sleep(0.1) | 473 | sleep(0.1) |
473 | return max_mem | 474 | return max_mem |
474 | 475 | ||
476 | +def read_cpu_number(): | ||
477 | + # do not use os.cpu_count() on LXC containers | ||
478 | + # it reads info from /sys wich is not the VM resources but the host resources. | ||
479 | + # Read from /proc/cpuinfo instead. | ||
480 | + p = subprocess.run(['grep', '-c', 'Intel(', '/proc/cpuinfo'], stdout=subprocess.PIPE) | ||
481 | + return int(p.stdout.decode('utf-8')[:-1]) | ||
482 | + | ||
475 | def warn(message, error=False): | 483 | def warn(message, error=False): |
476 | if error: | 484 | if error: |
477 | print(f"\t> \033[31mERR: {message}\033[0m{errsymb}", flush=True) | 485 | print(f"\t> \033[31mERR: {message}\033[0m{errsymb}", flush=True) |
... | @@ -499,11 +507,10 @@ def execute_job(j, jobcount): | ... | @@ -499,11 +507,10 @@ def execute_job(j, jobcount): |
499 | 507 | ||
500 | monitor.keep_watching = False | 508 | monitor.keep_watching = False |
501 | m = assistant_future.result() | 509 | m = assistant_future.result() |
502 | - | ||
503 | 510 | ||
504 | elif j.func_ is not None: | 511 | elif j.func_ is not None: |
505 | 512 | ||
506 | - print(f"[{running_stats[0]+running_stats[2]}/{jobcount}]\t{j.func_.__name__}({', '.join([str(a) for a in j.args_ if not ((type(a) == list) and len(a)>3)])})") | 513 | + #print(f"[{running_stats[0]+running_stats[2]}/{jobcount}]\t{j.func_.__name__}({', '.join([str(a) for a in j.args_ if not ((type(a) == list) and len(a)>3)])})") |
507 | 514 | ||
508 | m = -1 | 515 | m = -1 |
509 | monitor = Monitor(os.getpid()) | 516 | monitor = Monitor(os.getpid()) |
... | @@ -558,7 +565,7 @@ def execute_joblist(fulljoblist, printstats=False): | ... | @@ -558,7 +565,7 @@ def execute_joblist(fulljoblist, printstats=False): |
558 | print("using", n, "processes:") | 565 | print("using", n, "processes:") |
559 | 566 | ||
560 | # execute jobs of priority i that should be processed n by n: | 567 | # execute jobs of priority i that should be processed n by n: |
561 | - p = Pool(processes=n, maxtasksperchild=10) | 568 | + p = Pool(processes=n) |
562 | raw_results = p.map(partial(execute_job, jobcount=jobcount), bunch) | 569 | raw_results = p.map(partial(execute_job, jobcount=jobcount), bunch) |
563 | p.close() | 570 | p.close() |
564 | p.join() | 571 | p.join() |
... | @@ -833,23 +840,26 @@ def summarize_position(col): | ... | @@ -833,23 +840,26 @@ def summarize_position(col): |
833 | else: | 840 | else: |
834 | return (0, 0, 0, 0, 0) | 841 | return (0, 0, 0, 0, 0) |
835 | 842 | ||
836 | -def alignment_nt_stats(f, list_of_chains): | 843 | +def alignment_nt_stats(f, list_of_chains) : |
837 | - print("\t>",f,"... ", flush=True) | 844 | + global idxQueue |
845 | + #print("\t>",f,"... ", flush=True) | ||
838 | chains_ids = [ str(c) for c in list_of_chains ] | 846 | chains_ids = [ str(c) for c in list_of_chains ] |
839 | - | 847 | + thr_idx = idxQueue.get() |
848 | + print(thr_idx, flush=True) | ||
849 | + | ||
840 | # Open the alignment | 850 | # Open the alignment |
841 | align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta") | 851 | align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta") |
842 | alilen = align.get_alignment_length() | 852 | alilen = align.get_alignment_length() |
843 | - print("\t>",f,"... loaded", flush=True) | 853 | + #print("\t>",f,"... loaded", flush=True) |
844 | - | 854 | + |
845 | # Compute statistics per column | 855 | # Compute statistics per column |
846 | - results = [ summarize_position(align[:,i]) for i in range(alilen) ] | 856 | + results = [ summarize_position(align[:,i]) for i in tqdm(range(alilen), position=thr_idx) ] |
847 | frequencies = np.array(results).T | 857 | frequencies = np.array(results).T |
848 | - print("\t>",f,"... loaded, computed", flush=True) | 858 | + #print("\t>",f,"... loaded, computed", flush=True) |
849 | 859 | ||
850 | for s in align: | 860 | for s in align: |
851 | if not '[' in s.id: # this is a Rfamseq entry, not PDB | 861 | if not '[' in s.id: # this is a Rfamseq entry, not PDB |
852 | - continue | 862 | + continue |
853 | 863 | ||
854 | # get the right 3D chain: | 864 | # get the right 3D chain: |
855 | idx = chains_ids.index(s.id) | 865 | idx = chains_ids.index(s.id) |
... | @@ -868,16 +878,16 @@ def alignment_nt_stats(f, list_of_chains): | ... | @@ -868,16 +878,16 @@ def alignment_nt_stats(f, list_of_chains): |
868 | i += 1 | 878 | i += 1 |
869 | j += 1 | 879 | j += 1 |
870 | elif c.seq[i] == '-': # gap in the chain, but not in the aligned sequence | 880 | elif c.seq[i] == '-': # gap in the chain, but not in the aligned sequence |
871 | - | 881 | + |
872 | # search for a gap to the consensus nearby | 882 | # search for a gap to the consensus nearby |
873 | k = 0 | 883 | k = 0 |
874 | while j+k<alilen and s.seq[j+k] in ['.','-']: | 884 | while j+k<alilen and s.seq[j+k] in ['.','-']: |
875 | if s.seq[j+k] == '-': | 885 | if s.seq[j+k] == '-': |
876 | break | 886 | break |
877 | k += 1 | 887 | k += 1 |
878 | - | 888 | + |
879 | # if found, set j to that position | 889 | # if found, set j to that position |
880 | - if j+k<alilen and s.seq[j+k] == '-': | 890 | + if j+k<alilen and s.seq[j+k] == '-': |
881 | j = j + k | 891 | j = j + k |
882 | continue | 892 | continue |
883 | 893 | ||
... | @@ -897,8 +907,8 @@ def alignment_nt_stats(f, list_of_chains): | ... | @@ -897,8 +907,8 @@ def alignment_nt_stats(f, list_of_chains): |
897 | else: | 907 | else: |
898 | print(f"You are never supposed to reach this. Comparing {c.chain_label} in {i} ({c.seq[i-1:i+2]}) with seq[{j}] ({s.seq[j-3:j+4]}).\n", c.seq, '\n', s.seq, sep='', flush=True) | 908 | print(f"You are never supposed to reach this. Comparing {c.chain_label} in {i} ({c.seq[i-1:i+2]}) with seq[{j}] ({s.seq[j-3:j+4]}).\n", c.seq, '\n', s.seq, sep='', flush=True) |
899 | exit(1) | 909 | exit(1) |
900 | - if warn_gaps: | 910 | + #if warn_gaps: |
901 | - warn(f"Some gap(s) in {c.chain_label} were not re-found in the aligned sequence... Ignoring them.") | 911 | + #warn(f"Some gap(s) in {c.chain_label} were not re-found in the aligned sequence... Ignoring them.") |
902 | 912 | ||
903 | # Replace masked positions by the consensus sequence: | 913 | # Replace masked positions by the consensus sequence: |
904 | c_seq = c.seq.split() | 914 | c_seq = c.seq.split() |
... | @@ -934,11 +944,11 @@ def alignment_nt_stats(f, list_of_chains): | ... | @@ -934,11 +944,11 @@ def alignment_nt_stats(f, list_of_chains): |
934 | line = [str(x) for x in list(point[i,:]) ] | 944 | line = [str(x) for x in list(point[i,:]) ] |
935 | file.write(','.join(line)+'\n') | 945 | file.write(','.join(line)+'\n') |
936 | file.close() | 946 | file.close() |
937 | - print("\t\tWritten", c.chain_label, f"to file\t{validsymb}", flush=True) | 947 | + #print("\t\tWritten", c.chain_label, f"to file\t{validsymb}", flush=True) |
948 | + | ||
949 | + #print("\t>", f, f"... loaded, computed, saved\t{validsymb}", flush=True) | ||
950 | + return 0 | ||
938 | 951 | ||
939 | - print("\t>", f, f"... loaded, computed, saved\t{validsymb}", flush=True) | ||
940 | - return None | ||
941 | - | ||
942 | if __name__ == "__main__": | 952 | if __name__ == "__main__": |
943 | print("Main process running. (PID", os.getpid(), ")") | 953 | print("Main process running. (PID", os.getpid(), ")") |
944 | 954 | ||
... | @@ -991,15 +1001,15 @@ if __name__ == "__main__": | ... | @@ -991,15 +1001,15 @@ if __name__ == "__main__": |
991 | os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam") | 1001 | os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam") |
992 | if not path.isdir(path_to_3D_data + "RNAcifs"): | 1002 | if not path.isdir(path_to_3D_data + "RNAcifs"): |
993 | os.makedirs(path_to_3D_data + "RNAcifs") | 1003 | os.makedirs(path_to_3D_data + "RNAcifs") |
994 | - | 1004 | + |
995 | - results = execute_joblist(joblist)[1] | 1005 | + results = execute_joblist(joblist)[1] |
996 | loaded_chains = [ c for c in results if not c.delete_me ] | 1006 | loaded_chains = [ c for c in results if not c.delete_me ] |
997 | print(f"> Loaded {len(loaded_chains)} RNA chains ({len(chains_with_mapping) - len(loaded_chains)} errors).") | 1007 | print(f"> Loaded {len(loaded_chains)} RNA chains ({len(chains_with_mapping) - len(loaded_chains)} errors).") |
998 | 1008 | ||
999 | # =========================================================================== | 1009 | # =========================================================================== |
1000 | # Download RNA sequences of the corresponding Rfam families | 1010 | # Download RNA sequences of the corresponding Rfam families |
1001 | # =========================================================================== | 1011 | # =========================================================================== |
1002 | - | 1012 | + |
1003 | # Get the list of Rfam families found | 1013 | # Get the list of Rfam families found |
1004 | rfam_acc_to_download = {} | 1014 | rfam_acc_to_download = {} |
1005 | for c in loaded_chains: | 1015 | for c in loaded_chains: |
... | @@ -1041,18 +1051,18 @@ if __name__ == "__main__": | ... | @@ -1041,18 +1051,18 @@ if __name__ == "__main__": |
1041 | if not path.isdir(path_to_3D_data + "datapoints/"): | 1051 | if not path.isdir(path_to_3D_data + "datapoints/"): |
1042 | os.makedirs(path_to_3D_data + "datapoints/") | 1052 | os.makedirs(path_to_3D_data + "datapoints/") |
1043 | print("Computing nucleotide frequencies in alignments...") | 1053 | print("Computing nucleotide frequencies in alignments...") |
1044 | - families = sorted([f for f in rfam_acc_to_download.keys() if f not in ["RF01960", "RF02540"]]) | 1054 | + families = sorted([f for f in rfam_acc_to_download.keys() ]) |
1045 | - # pool = Pool(processes=cpu_count(), maxtasksperchild=10) | ||
1046 | - # results = pool.map(alignment_nt_stats, families) | ||
1047 | - # pool.close() | ||
1048 | - # pool.join() | ||
1049 | - # loaded_chains = list(itertools.chain.from_iterable(results)) | ||
1050 | 1055 | ||
1051 | # Build job list | 1056 | # Build job list |
1057 | + thr_idx_mgr = multiprocessing.Manager() | ||
1058 | + idxQueue = thr_idx_mgr.Queue() | ||
1059 | + for i in range(10): | ||
1060 | + idxQueue.put(i) | ||
1052 | fulljoblist = [] | 1061 | fulljoblist = [] |
1053 | for f in families: | 1062 | for f in families: |
1054 | label = f"Save {f} PSSMs" | 1063 | label = f"Save {f} PSSMs" |
1055 | list_of_chains = rfam_acc_to_download[f] | 1064 | list_of_chains = rfam_acc_to_download[f] |
1056 | - fulljoblist.append(Job(function=alignment_nt_stats, args=[f, list_of_chains, label], how_many_in_parallel=10, priority=1, label=label)) | 1065 | + fulljoblist.append(Job(function=alignment_nt_stats, args=[f, list_of_chains], how_many_in_parallel=10, priority=1, label=label)) |
1057 | execute_joblist(fulljoblist, printstats=False) | 1066 | execute_joblist(fulljoblist, printstats=False) |
1067 | + | ||
1058 | print("Completed.") | 1068 | print("Completed.") | ... | ... |
kill_rnanet.sh
0 → 100755
1 | +#!/bin/bash | ||
2 | +PROCESS_TO_KILL="RNAnet.py" | ||
3 | +PROCESS_LIST=`ps ax | grep -Ei ${PROCESS_TO_KILL} | grep -Eiv '(grep|vi RNAnet.py)' | awk ' { print $1;}'` | ||
4 | +KILLED= | ||
5 | +for KILLPID in $PROCESS_LIST; do | ||
6 | + if [ ! -z $KILLPID ];then | ||
7 | + kill -9 $KILLPID | ||
8 | + echo "Killed PID ${KILLPID}" | ||
9 | + KILLED=yes | ||
10 | + fi | ||
11 | +done | ||
12 | + | ||
13 | +if [ -z $KILLED ];then | ||
14 | + echo "Didn't kill anything" | ||
15 | +fi |
-
Please register or login to post a comment