Louis BECQUEY

tqdm progress bars

...@@ -18,6 +18,7 @@ from functools import partial ...@@ -18,6 +18,7 @@ from functools import partial
18 from os import path, makedirs 18 from os import path, makedirs
19 from multiprocessing import Pool, cpu_count, Manager 19 from multiprocessing import Pool, cpu_count, Manager
20 from time import sleep 20 from time import sleep
21 +from tqdm import tqdm
21 22
22 if path.isdir("/home/ubuntu/"): # this is the IFB-core cloud 23 if path.isdir("/home/ubuntu/"): # this is the IFB-core cloud
23 path_to_3D_data = "/mnt/Data/RNA/3D/" 24 path_to_3D_data = "/mnt/Data/RNA/3D/"
...@@ -267,9 +268,9 @@ class Job: ...@@ -267,9 +268,9 @@ class Job:
267 self.max_mem = -1 # not executed yet 268 self.max_mem = -1 # not executed yet
268 self.label = label 269 self.label = label
269 if not how_many_in_parallel: 270 if not how_many_in_parallel:
270 - self.nthreads = cpu_count() 271 + self.nthreads = read_cpu_number()
271 elif how_many_in_parallel == -1: 272 elif how_many_in_parallel == -1:
272 - self.nthreads = cpu_count() - 1 273 + self.nthreads = read_cpu_number() - 1
273 else: 274 else:
274 self.nthreads = how_many_in_parallel 275 self.nthreads = how_many_in_parallel
275 self.useless_bool = False 276 self.useless_bool = False
...@@ -472,6 +473,13 @@ class Monitor: ...@@ -472,6 +473,13 @@ class Monitor:
472 sleep(0.1) 473 sleep(0.1)
473 return max_mem 474 return max_mem
474 475
476 +def read_cpu_number():
477 + # do not use os.cpu_count() on LXC containers
478 + # it reads info from /sys which is not the VM resources but the host resources.
479 + # Read from /proc/cpuinfo instead.
480 + p = subprocess.run(['grep', '-c', 'Intel(', '/proc/cpuinfo'], stdout=subprocess.PIPE)
481 + return int(p.stdout.decode('utf-8')[:-1])
482 +
475 def warn(message, error=False): 483 def warn(message, error=False):
476 if error: 484 if error:
477 print(f"\t> \033[31mERR: {message}\033[0m{errsymb}", flush=True) 485 print(f"\t> \033[31mERR: {message}\033[0m{errsymb}", flush=True)
...@@ -499,11 +507,10 @@ def execute_job(j, jobcount): ...@@ -499,11 +507,10 @@ def execute_job(j, jobcount):
499 507
500 monitor.keep_watching = False 508 monitor.keep_watching = False
501 m = assistant_future.result() 509 m = assistant_future.result()
502 -
503 510
504 elif j.func_ is not None: 511 elif j.func_ is not None:
505 512
506 - print(f"[{running_stats[0]+running_stats[2]}/{jobcount}]\t{j.func_.__name__}({', '.join([str(a) for a in j.args_ if not ((type(a) == list) and len(a)>3)])})") 513 + #print(f"[{running_stats[0]+running_stats[2]}/{jobcount}]\t{j.func_.__name__}({', '.join([str(a) for a in j.args_ if not ((type(a) == list) and len(a)>3)])})")
507 514
508 m = -1 515 m = -1
509 monitor = Monitor(os.getpid()) 516 monitor = Monitor(os.getpid())
...@@ -558,7 +565,7 @@ def execute_joblist(fulljoblist, printstats=False): ...@@ -558,7 +565,7 @@ def execute_joblist(fulljoblist, printstats=False):
558 print("using", n, "processes:") 565 print("using", n, "processes:")
559 566
560 # execute jobs of priority i that should be processed n by n: 567 # execute jobs of priority i that should be processed n by n:
561 - p = Pool(processes=n, maxtasksperchild=10) 568 + p = Pool(processes=n)
562 raw_results = p.map(partial(execute_job, jobcount=jobcount), bunch) 569 raw_results = p.map(partial(execute_job, jobcount=jobcount), bunch)
563 p.close() 570 p.close()
564 p.join() 571 p.join()
...@@ -833,23 +840,26 @@ def summarize_position(col): ...@@ -833,23 +840,26 @@ def summarize_position(col):
833 else: 840 else:
834 return (0, 0, 0, 0, 0) 841 return (0, 0, 0, 0, 0)
835 842
836 -def alignment_nt_stats(f, list_of_chains): 843 +def alignment_nt_stats(f, list_of_chains) :
837 - print("\t>",f,"... ", flush=True) 844 + global idxQueue
845 + #print("\t>",f,"... ", flush=True)
838 chains_ids = [ str(c) for c in list_of_chains ] 846 chains_ids = [ str(c) for c in list_of_chains ]
839 - 847 + thr_idx = idxQueue.get()
848 + print(thr_idx, flush=True)
849 +
840 # Open the alignment 850 # Open the alignment
841 align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta") 851 align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")
842 alilen = align.get_alignment_length() 852 alilen = align.get_alignment_length()
843 - print("\t>",f,"... loaded", flush=True) 853 + #print("\t>",f,"... loaded", flush=True)
844 - 854 +
845 # Compute statistics per column 855 # Compute statistics per column
846 - results = [ summarize_position(align[:,i]) for i in range(alilen) ] 856 + results = [ summarize_position(align[:,i]) for i in tqdm(range(alilen), position=thr_idx) ]
847 frequencies = np.array(results).T 857 frequencies = np.array(results).T
848 - print("\t>",f,"... loaded, computed", flush=True) 858 + #print("\t>",f,"... loaded, computed", flush=True)
849 859
850 for s in align: 860 for s in align:
851 if not '[' in s.id: # this is a Rfamseq entry, not PDB 861 if not '[' in s.id: # this is a Rfamseq entry, not PDB
852 - continue 862 + continue
853 863
854 # get the right 3D chain: 864 # get the right 3D chain:
855 idx = chains_ids.index(s.id) 865 idx = chains_ids.index(s.id)
...@@ -868,16 +878,16 @@ def alignment_nt_stats(f, list_of_chains): ...@@ -868,16 +878,16 @@ def alignment_nt_stats(f, list_of_chains):
868 i += 1 878 i += 1
869 j += 1 879 j += 1
870 elif c.seq[i] == '-': # gap in the chain, but not in the aligned sequence 880 elif c.seq[i] == '-': # gap in the chain, but not in the aligned sequence
871 - 881 +
872 # search for a gap to the consensus nearby 882 # search for a gap to the consensus nearby
873 k = 0 883 k = 0
874 while j+k<alilen and s.seq[j+k] in ['.','-']: 884 while j+k<alilen and s.seq[j+k] in ['.','-']:
875 if s.seq[j+k] == '-': 885 if s.seq[j+k] == '-':
876 break 886 break
877 k += 1 887 k += 1
878 - 888 +
879 # if found, set j to that position 889 # if found, set j to that position
880 - if j+k<alilen and s.seq[j+k] == '-': 890 + if j+k<alilen and s.seq[j+k] == '-':
881 j = j + k 891 j = j + k
882 continue 892 continue
883 893
...@@ -897,8 +907,8 @@ def alignment_nt_stats(f, list_of_chains): ...@@ -897,8 +907,8 @@ def alignment_nt_stats(f, list_of_chains):
897 else: 907 else:
898 print(f"You are never supposed to reach this. Comparing {c.chain_label} in {i} ({c.seq[i-1:i+2]}) with seq[{j}] ({s.seq[j-3:j+4]}).\n", c.seq, '\n', s.seq, sep='', flush=True) 908 print(f"You are never supposed to reach this. Comparing {c.chain_label} in {i} ({c.seq[i-1:i+2]}) with seq[{j}] ({s.seq[j-3:j+4]}).\n", c.seq, '\n', s.seq, sep='', flush=True)
899 exit(1) 909 exit(1)
900 - if warn_gaps: 910 + #if warn_gaps:
901 - warn(f"Some gap(s) in {c.chain_label} were not re-found in the aligned sequence... Ignoring them.") 911 + #warn(f"Some gap(s) in {c.chain_label} were not re-found in the aligned sequence... Ignoring them.")
902 912
903 # Replace masked positions by the consensus sequence: 913 # Replace masked positions by the consensus sequence:
904 c_seq = c.seq.split() 914 c_seq = c.seq.split()
...@@ -934,11 +944,11 @@ def alignment_nt_stats(f, list_of_chains): ...@@ -934,11 +944,11 @@ def alignment_nt_stats(f, list_of_chains):
934 line = [str(x) for x in list(point[i,:]) ] 944 line = [str(x) for x in list(point[i,:]) ]
935 file.write(','.join(line)+'\n') 945 file.write(','.join(line)+'\n')
936 file.close() 946 file.close()
937 - print("\t\tWritten", c.chain_label, f"to file\t{validsymb}", flush=True) 947 + #print("\t\tWritten", c.chain_label, f"to file\t{validsymb}", flush=True)
948 +
949 + #print("\t>", f, f"... loaded, computed, saved\t{validsymb}", flush=True)
950 + return 0
938 951
939 - print("\t>", f, f"... loaded, computed, saved\t{validsymb}", flush=True)
940 - return None
941 -
942 if __name__ == "__main__": 952 if __name__ == "__main__":
943 print("Main process running. (PID", os.getpid(), ")") 953 print("Main process running. (PID", os.getpid(), ")")
944 954
...@@ -991,15 +1001,15 @@ if __name__ == "__main__": ...@@ -991,15 +1001,15 @@ if __name__ == "__main__":
991 os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam") 1001 os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam")
992 if not path.isdir(path_to_3D_data + "RNAcifs"): 1002 if not path.isdir(path_to_3D_data + "RNAcifs"):
993 os.makedirs(path_to_3D_data + "RNAcifs") 1003 os.makedirs(path_to_3D_data + "RNAcifs")
994 - 1004 +
995 - results = execute_joblist(joblist)[1] 1005 + results = execute_joblist(joblist)[1]
996 loaded_chains = [ c for c in results if not c.delete_me ] 1006 loaded_chains = [ c for c in results if not c.delete_me ]
997 print(f"> Loaded {len(loaded_chains)} RNA chains ({len(chains_with_mapping) - len(loaded_chains)} errors).") 1007 print(f"> Loaded {len(loaded_chains)} RNA chains ({len(chains_with_mapping) - len(loaded_chains)} errors).")
998 1008
999 # =========================================================================== 1009 # ===========================================================================
1000 # Download RNA sequences of the corresponding Rfam families 1010 # Download RNA sequences of the corresponding Rfam families
1001 # =========================================================================== 1011 # ===========================================================================
1002 - 1012 +
1003 # Get the list of Rfam families found 1013 # Get the list of Rfam families found
1004 rfam_acc_to_download = {} 1014 rfam_acc_to_download = {}
1005 for c in loaded_chains: 1015 for c in loaded_chains:
...@@ -1041,18 +1051,18 @@ if __name__ == "__main__": ...@@ -1041,18 +1051,18 @@ if __name__ == "__main__":
1041 if not path.isdir(path_to_3D_data + "datapoints/"): 1051 if not path.isdir(path_to_3D_data + "datapoints/"):
1042 os.makedirs(path_to_3D_data + "datapoints/") 1052 os.makedirs(path_to_3D_data + "datapoints/")
1043 print("Computing nucleotide frequencies in alignments...") 1053 print("Computing nucleotide frequencies in alignments...")
1044 - families = sorted([f for f in rfam_acc_to_download.keys() if f not in ["RF01960", "RF02540"]]) 1054 + families = sorted([f for f in rfam_acc_to_download.keys() ])
1045 - # pool = Pool(processes=cpu_count(), maxtasksperchild=10)
1046 - # results = pool.map(alignment_nt_stats, families)
1047 - # pool.close()
1048 - # pool.join()
1049 - # loaded_chains = list(itertools.chain.from_iterable(results))
1050 1055
1051 # Build job list 1056 # Build job list
1057 + thr_idx_mgr = multiprocessing.Manager()
1058 + idxQueue = thr_idx_mgr.Queue()
1059 + for i in range(10):
1060 + idxQueue.put(i)
1052 fulljoblist = [] 1061 fulljoblist = []
1053 for f in families: 1062 for f in families:
1054 label = f"Save {f} PSSMs" 1063 label = f"Save {f} PSSMs"
1055 list_of_chains = rfam_acc_to_download[f] 1064 list_of_chains = rfam_acc_to_download[f]
1056 - fulljoblist.append(Job(function=alignment_nt_stats, args=[f, list_of_chains, label], how_many_in_parallel=10, priority=1, label=label)) 1065 + fulljoblist.append(Job(function=alignment_nt_stats, args=[f, list_of_chains], how_many_in_parallel=10, priority=1, label=label))
1057 execute_joblist(fulljoblist, printstats=False) 1066 execute_joblist(fulljoblist, printstats=False)
1067 +
1058 print("Completed.") 1068 print("Completed.")
......
1 +#!/bin/bash
2 +PROCESS_TO_KILL="RNAnet.py"
3 +PROCESS_LIST=`ps ax | grep -Ei ${PROCESS_TO_KILL} | grep -Eiv '(grep|vi RNAnet.py)' | awk ' { print $1;}'`
4 +KILLED=
5 +for KILLPID in $PROCESS_LIST; do
6 + if [ ! -z $KILLPID ];then
7 + kill -9 $KILLPID
8 + echo "Killed PID ${KILLPID}"
9 + KILLED=yes
10 + fi
11 +done
12 +
13 +if [ -z $KILLED ];then
14 + echo "Didn't kill anything"
15 +fi