Louis BECQUEY

More command line options

...@@ -210,7 +210,7 @@ class Chain: ...@@ -210,7 +210,7 @@ class Chain:
210 notify(status) 210 notify(status)
211 211
212 @trace_unhandled_exceptions 212 @trace_unhandled_exceptions
213 - def extract_3D_data(self): 213 + def extract_3D_data(self, save_logs=True):
214 """ Maps DSSR annotations to the chain. """ 214 """ Maps DSSR annotations to the chain. """
215 215
216 ############################################ 216 ############################################
...@@ -513,7 +513,7 @@ class Chain: ...@@ -513,7 +513,7 @@ class Chain:
513 return None 513 return None
514 514
515 # Log chain info to file 515 # Log chain info to file
516 - if self.mapping is not None: 516 + if save_logs and self.mapping is not None:
517 self.mapping.to_file(self.chain_label+".log") 517 self.mapping.to_file(self.chain_label+".log")
518 518
519 return df 519 return df
...@@ -982,6 +982,7 @@ class Pipeline: ...@@ -982,6 +982,7 @@ class Pipeline:
982 self.REUSE_ALL = False 982 self.REUSE_ALL = False
983 self.SELECT_ONLY = None 983 self.SELECT_ONLY = None
984 self.ARCHIVE = False 984 self.ARCHIVE = False
985 + self.SAVELOGS = True
985 986
986 def process_options(self): 987 def process_options(self):
987 """Sets the paths and options of the pipeline""" 988 """Sets the paths and options of the pipeline"""
...@@ -992,7 +993,7 @@ class Pipeline: ...@@ -992,7 +993,7 @@ class Pipeline:
992 opts, _ = getopt.getopt( sys.argv[1:], "r:hs", 993 opts, _ = getopt.getopt( sys.argv[1:], "r:hs",
993 [ "help", "resolution=", "keep-hetatm=", "from-scratch", 994 [ "help", "resolution=", "keep-hetatm=", "from-scratch",
994 "fill-gaps=", "3d-folder=", "seq-folder=", 995 "fill-gaps=", "3d-folder=", "seq-folder=",
995 - "no-homology", "ignore-issues", "extract", "only=", "all", 996 + "no-homology", "ignore-issues", "extract", "only=", "all", "no-logs",
996 "archive", "update-homologous" ]) 997 "archive", "update-homologous" ])
997 except getopt.GetoptError as err: 998 except getopt.GetoptError as err:
998 print(err) 999 print(err)
...@@ -1035,6 +1036,7 @@ class Pipeline: ...@@ -1035,6 +1036,7 @@ class Pipeline:
1035 print("--update-homologous\t\tRe-download Rfam and SILVA databases, realign all families, and recompute all CSV files") 1036 print("--update-homologous\t\tRe-download Rfam and SILVA databases, realign all families, and recompute all CSV files")
1036 print("--from-scratch\t\t\tDelete database, local 3D and sequence files, and known issues, and recompute.") 1037 print("--from-scratch\t\t\tDelete database, local 3D and sequence files, and known issues, and recompute.")
1037 print("--archive\t\t\tCreate a tar.gz archive of the datapoints text files, and update the link to the latest archive") 1038 print("--archive\t\t\tCreate a tar.gz archive of the datapoints text files, and update the link to the latest archive")
1039 + print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications")
1038 print() 1040 print()
1039 print("Typical usage:") 1041 print("Typical usage:")
1040 print(f"nohup bash -c 'time {runDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --archive' &") 1042 print(f"nohup bash -c 'time {runDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --archive' &")
...@@ -1096,6 +1098,8 @@ class Pipeline: ...@@ -1096,6 +1098,8 @@ class Pipeline:
1096 self.EXTRACT_CHAINS = True 1098 self.EXTRACT_CHAINS = True
1097 elif opt == "--archive": 1099 elif opt == "--archive":
1098 self.ARCHIVE = True 1100 self.ARCHIVE = True
1101 + elif opt == "--no-logs":
1102 + self.SAVELOGS = False
1099 1103
1100 if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions": 1104 if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions":
1101 print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments") 1105 print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments")
...@@ -1227,7 +1231,7 @@ class Pipeline: ...@@ -1227,7 +1231,7 @@ class Pipeline:
1227 c.delete_me = False # give a second chance 1231 c.delete_me = False # give a second chance
1228 if (c.chain_label not in self.known_issues) or not self.USE_KNOWN_ISSUES: 1232 if (c.chain_label not in self.known_issues) or not self.USE_KNOWN_ISSUES:
1229 joblist.append(Job(function=work_build_chain, how_many_in_parallel=int(coeff_ncores*ncores), 1233 joblist.append(Job(function=work_build_chain, how_many_in_parallel=int(coeff_ncores*ncores),
1230 - args=[c, self.EXTRACT_CHAINS, self.KEEP_HETATM, retry])) 1234 + args=[c, self.EXTRACT_CHAINS, self.KEEP_HETATM, retry, self.SAVELOGS]))
1231 try: 1235 try:
1232 results = execute_joblist(joblist) 1236 results = execute_joblist(joblist)
1233 except: 1237 except:
...@@ -1957,7 +1961,7 @@ def work_mmcif(pdb_id): ...@@ -1957,7 +1961,7 @@ def work_mmcif(pdb_id):
1957 return 0 1961 return 0
1958 1962
1959 @trace_unhandled_exceptions 1963 @trace_unhandled_exceptions
1960 -def work_build_chain(c, extract, khetatm, retrying=False): 1964 +def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True):
1961 """Reads information from JSON and save it to database. 1965 """Reads information from JSON and save it to database.
1962 If asked, also extracts the 3D chains from their original structure files. 1966 If asked, also extracts the 3D chains from their original structure files.
1963 1967
...@@ -1969,7 +1973,7 @@ def work_build_chain(c, extract, khetatm, retrying=False): ...@@ -1969,7 +1973,7 @@ def work_build_chain(c, extract, khetatm, retrying=False):
1969 1973
1970 # extract the 3D descriptors 1974 # extract the 3D descriptors
1971 if not c.delete_me: 1975 if not c.delete_me:
1972 - df = c.extract_3D_data() 1976 + df = c.extract_3D_data(save_logs)
1973 c.register_chain(df) 1977 c.register_chain(df)
1974 1978
1975 # Small check 1979 # Small check
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
5 # in the database. 5 # in the database.
6 # This should be run from the folder where the file is (to access the database with path "results/RNANet.db") 6 # This should be run from the folder where the file is (to access the database with path "results/RNANet.db")
7 7
8 -import os, pickle, sqlite3, shlex, subprocess, sys 8 +import getopt, os, pickle, sqlite3, shlex, subprocess, sys
9 import numpy as np 9 import numpy as np
10 import pandas as pd 10 import pandas as pd
11 import threading as th 11 import threading as th
...@@ -24,14 +24,9 @@ from tqdm import tqdm ...@@ -24,14 +24,9 @@ from tqdm import tqdm
24 from collections import Counter 24 from collections import Counter
25 from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker 25 from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_worker
26 26
27 -# This sets the paths 27 +path_to_3D_data = "tobedefinedbyoptions"
28 -if len(sys.argv) > 1: 28 +path_to_seq_data = "tobedefinedbyoptions"
29 - path_to_3D_data = path.abspath(sys.argv[1]) 29 +res_thr = 20.0 # default: all structures
30 - path_to_seq_data = path.abspath(sys.argv[2])
31 -else:
32 - print("Please set paths to 3D data using command line arguments:")
33 - print("./statistics.py /path/to/3D/data/ /path/to/sequence/data/")
34 - exit()
35 30
36 LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112 31 LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112
37 SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111 32 SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111
...@@ -54,6 +49,8 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): ...@@ -54,6 +49,8 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
54 This removes noise and cuts too high peaks, to clearly see the clusters. 49 This removes noise and cuts too high peaks, to clearly see the clusters.
55 """ 50 """
56 51
52 + os.makedirs("results/figures/wadley_plots/", exist_ok=True)
53 +
57 if carbon == 4: 54 if carbon == 4:
58 angle = "eta" 55 angle = "eta"
59 xlabel = "$\\eta=C_4'^{i-1}-P^i-C_4'^i-P^{i+1}$" 56 xlabel = "$\\eta=C_4'^{i-1}-P^i-C_4'^i-P^{i+1}$"
...@@ -66,7 +63,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): ...@@ -66,7 +63,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
66 exit("You overestimate my capabilities !") 63 exit("You overestimate my capabilities !")
67 64
68 65
69 - if not path.isfile(f"data/wadley_kernel_{angle}.npz"): 66 + if not path.isfile(f"data/wadley_kernel_{angle}_{res_thr}A.npz"):
70 67
71 # Get a worker number to position the progress bar 68 # Get a worker number to position the progress bar
72 global idxQueue 69 global idxQueue
...@@ -75,10 +72,25 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): ...@@ -75,10 +72,25 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
75 72
76 # Extract the angle values of c2'-endo and c3'-endo nucleotides 73 # Extract the angle values of c2'-endo and c3'-endo nucleotides
77 with sqlite3.connect("results/RNANet.db") as conn: 74 with sqlite3.connect("results/RNANet.db") as conn:
78 - df = pd.read_sql(f"""SELECT {angle}, th{angle} FROM nucleotide WHERE puckering="C2'-endo" AND {angle} IS NOT NULL AND th{angle} IS NOT NULL;""", conn) 75 + df = pd.read_sql(f"""SELECT {angle}, th{angle}
76 + FROM nucleotide JOIN (
77 + SELECT chain_id FROM chain JOIN structure
78 + WHERE structure.resolution <= {res_thr}
79 + ) AS c
80 + WHERE puckering="C2'-endo"
81 + AND {angle} IS NOT NULL
82 + AND th{angle} IS NOT NULL;""", conn)
79 c2_endo_etas = df[angle].values.tolist() 83 c2_endo_etas = df[angle].values.tolist()
80 c2_endo_thetas = df["th"+angle].values.tolist() 84 c2_endo_thetas = df["th"+angle].values.tolist()
81 - df = pd.read_sql(f"""SELECT {angle}, th{angle} FROM nucleotide WHERE form = '.' AND puckering="C3'-endo" AND {angle} IS NOT NULL AND th{angle} IS NOT NULL;""", conn) 85 + df = pd.read_sql(f"""SELECT {angle}, th{angle}
86 + FROM nucleotide JOIN (
87 + SELECT chain_id FROM chain JOIN structure
88 + WHERE structure.resolution <= {res_thr}
89 + ) AS c
90 + WHERE form = '.'
91 + AND puckering="C3'-endo"
92 + AND {angle} IS NOT NULL
93 + AND th{angle} IS NOT NULL;""", conn)
82 c3_endo_etas = df[angle].values.tolist() 94 c3_endo_etas = df[angle].values.tolist()
83 c3_endo_thetas = df["th"+angle].values.tolist() 95 c3_endo_thetas = df["th"+angle].values.tolist()
84 96
...@@ -145,7 +157,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): ...@@ -145,7 +157,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
145 ax.bar3d(xpos.ravel(), ypos.ravel(), 0.0, 0.09, 0.09, hist_cut.ravel(), color=color_values, zorder="max") 157 ax.bar3d(xpos.ravel(), ypos.ravel(), 0.0, 0.09, 0.09, hist_cut.ravel(), color=color_values, zorder="max")
146 ax.set_xlabel(xlabel) 158 ax.set_xlabel(xlabel)
147 ax.set_ylabel(ylabel) 159 ax.set_ylabel(ylabel)
148 - fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}.png") 160 + fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}_{res_thr}A.png")
149 if show: 161 if show:
150 fig.show() 162 fig.show()
151 plt.close() 163 plt.close()
...@@ -156,7 +168,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): ...@@ -156,7 +168,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
156 ax.plot_surface(xx, yy, f_cut, cmap=cm.get_cmap("coolwarm"), linewidth=0, antialiased=True) 168 ax.plot_surface(xx, yy, f_cut, cmap=cm.get_cmap("coolwarm"), linewidth=0, antialiased=True)
157 ax.set_xlabel(xlabel) 169 ax.set_xlabel(xlabel)
158 ax.set_ylabel(ylabel) 170 ax.set_ylabel(ylabel)
159 - fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}.png") 171 + fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}_{res_thr}A.png")
160 if show: 172 if show:
161 fig.show() 173 fig.show()
162 plt.close() 174 plt.close()
...@@ -169,7 +181,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): ...@@ -169,7 +181,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
169 181
170 ax.set_xlabel(xlabel) 182 ax.set_xlabel(xlabel)
171 ax.set_ylabel(ylabel) 183 ax.set_ylabel(ylabel)
172 - fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}.png") 184 + fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}_{res_thr}A.png")
173 if show: 185 if show:
174 fig.show() 186 fig.show()
175 plt.close() 187 plt.close()
...@@ -185,6 +197,21 @@ def stats_len(): ...@@ -185,6 +197,21 @@ def stats_len():
185 global idxQueue 197 global idxQueue
186 thr_idx = idxQueue.get() 198 thr_idx = idxQueue.get()
187 199
200 + # sort the RNA families so that the plot is readable
201 + def family_order(f):
202 + if f in LSU_set:
203 + return 4
204 + elif f in SSU_set:
205 + return 3
206 + elif f in ["RF00001"]: #
207 + return 1 # put tRNAs and 5S rRNAs first,
208 + elif f in ["RF00005"]: # because of the logarithmic scale, otherwise, they look tiny
209 + return 0 #
210 + else:
211 + return 2
212 +
213 + fam_list.sort(key=family_order)
214 +
188 cols = [] 215 cols = []
189 lengths = [] 216 lengths = []
190 217
...@@ -204,8 +231,8 @@ def stats_len(): ...@@ -204,8 +231,8 @@ def stats_len():
204 231
205 # Get the lengths of chains 232 # Get the lengths of chains
206 with sqlite3.connect("results/RNANet.db") as conn: 233 with sqlite3.connect("results/RNANet.db") as conn:
207 - l = [ x[0] for x in sql_ask_database(conn, f"SELECT COUNT(index_chain) FROM (SELECT chain_id FROM chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY chain_id;", warn_every=0) ] 234 + l = [ x[0] for x in sql_ask_database(conn, f"SELECT COUNT(index_chain) FROM (SELECT chain_id FROM chain JOIN structure ON chain.structure_id = structure.pdb_id WHERE rfam_acc='{f}' AND resolution <= {res_thr}) NATURAL JOIN nucleotide GROUP BY chain_id;", warn_every=0) ]
208 - lengths.append(l) 235 + lengths.append(l) # list of chain lengths from the family
209 236
210 # notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths") 237 # notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths")
211 238
...@@ -235,7 +262,7 @@ def stats_len(): ...@@ -235,7 +262,7 @@ def stats_len():
235 ncol=1, fontsize='small', bbox_to_anchor=(1.3, 0.5)) 262 ncol=1, fontsize='small', bbox_to_anchor=(1.3, 0.5))
236 263
237 # Save the figure 264 # Save the figure
238 - fig.savefig("results/figures/lengths.png") 265 + fig.savefig(f"results/figures/lengths_{res_thr}A.png")
239 idxQueue.put(thr_idx) # replace the thread index in the queue 266 idxQueue.put(thr_idx) # replace the thread index in the queue
240 # notify("Computed sequence length statistics and saved the figure.") 267 # notify("Computed sequence length statistics and saved the figure.")
241 268
...@@ -577,8 +604,44 @@ def log_to_pbar(pbar): ...@@ -577,8 +604,44 @@ def log_to_pbar(pbar):
577 604
578 if __name__ == "__main__": 605 if __name__ == "__main__":
579 606
580 - os.makedirs("results/figures/wadley_plots/", exist_ok=True) 607 + # parse options
608 + try:
609 + opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "resolution=", "3d-folder=", "seq-folder=" ])
610 + except getopt.GetoptError as err:
611 + print(err)
612 + sys.exit(2)
613 + for opt, arg in opts:
614 +
615 + if opt == "-h" or opt == "--help":
616 + print( "RNANet statistics, a script to build a multiscale RNA dataset from public data\n"
617 + "Developped by Louis Becquey (louis.becquey@univ-evry.fr), 2020")
618 + print()
619 + print("Options:")
620 + print("-h [ --help ]\t\t\tPrint this help message")
621 + print()
622 + print("-r 20.0 [ --resolution=20.0 ]\tCompute statistics using chains of resolution 20.0A or better.")
623 + print("--3d-folder=…\t\t\tPath to a folder containing the 3D data files. Required subfolders should be:"
624 + "\n\t\t\t\t\tdatapoints/\t\tFinal results in CSV file format.")
625 + print("--seq-folder=…\t\t\tPath to a folder containing the sequence and alignment files. Required subfolder:"
626 + "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family")
627 + sys.exit()
628 + elif opt == '--version':
629 + print("RNANet statistics 1.1 beta")
630 + sys.exit()
631 + elif opt == "-r" or opt == "--resolution":
632 + assert float(arg) > 0.0 and float(arg) <= 20.0
633 + res_thr = float(arg)
634 + elif opt=='--3d-folder':
635 + path_to_3D_data = path.abspath(arg)
636 + if path_to_3D_data[-1] != '/':
637 + path_to_3D_data += '/'
638 + elif opt=='--seq-folder':
639 + path_to_seq_data = path.abspath(arg)
640 + if path_to_seq_data[-1] != '/':
641 + path_to_seq_data += '/'
642 +
581 643
644 + # Load mappings
582 print("Loading mappings list...") 645 print("Loading mappings list...")
583 with sqlite3.connect("results/RNANet.db") as conn: 646 with sqlite3.connect("results/RNANet.db") as conn:
584 fam_list = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;") ] 647 fam_list = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;") ]
...@@ -602,14 +665,14 @@ if __name__ == "__main__": ...@@ -602,14 +665,14 @@ if __name__ == "__main__":
602 665
603 # Define the tasks 666 # Define the tasks
604 joblist = [] 667 joblist = []
605 - joblist.append(Job(function=reproduce_wadley_results, args=(1,))) 668 + # joblist.append(Job(function=reproduce_wadley_results, args=(1,)))
606 - joblist.append(Job(function=reproduce_wadley_results, args=(4,))) 669 + # joblist.append(Job(function=reproduce_wadley_results, args=(4,)))
607 joblist.append(Job(function=stats_len)) # Computes figures 670 joblist.append(Job(function=stats_len)) # Computes figures
608 - joblist.append(Job(function=stats_freq)) # updates the database 671 + # joblist.append(Job(function=stats_freq)) # updates the database
609 - for f in famlist: 672 + # for f in famlist:
610 - joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database 673 + # joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database
611 - if f not in ignored: 674 + # if f not in ignored:
612 - joblist.append(Job(function=to_dist_matrix, args=(f,))) # updates the database 675 + # joblist.append(Job(function=to_dist_matrix, args=(f,))) # updates the database
613 676
614 p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers) 677 p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
615 pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, leave=True) 678 pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, leave=True)
...@@ -633,6 +696,6 @@ if __name__ == "__main__": ...@@ -633,6 +696,6 @@ if __name__ == "__main__":
633 print() 696 print()
634 697
635 # finish the work after the parallel portions 698 # finish the work after the parallel portions
636 - per_chain_stats() 699 + # per_chain_stats()
637 - seq_idty() 700 + # seq_idty()
638 - stats_pairs() 701 + # stats_pairs()
......