Louis BECQUEY

more resolution-specific statistics

RNAnet.py

@@ -1,5 +1,6 @@
 #!/usr/bin/python3.8
 import Bio
+import Bio.PDB as pdb
 import concurrent.futures
 import getopt
 import gzip
@@ -25,7 +26,8 @@ from multiprocessing import Pool, Manager
 from time import sleep
 from tqdm import tqdm
 from setproctitle import setproctitle
-
+from Bio import AlignIO, SeqIO
+from Bio.Align import AlignInfo

 def trace_unhandled_exceptions(func):
     @wraps(func)
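For context: only the first two lines of trace_unhandled_exceptions appear in this excerpt. A minimal sketch of what such a decorator typically looks like, as a reading aid (the actual body is not part of this commit):

# Reading aid, not part of the commit: a sketch of a trace_unhandled_exceptions-style
# decorator. The real body is not shown in this diff.
import traceback
from functools import wraps

def trace_unhandled_exceptions(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            # In a multiprocessing worker, exceptions die silently unless logged here
            traceback.print_exc()
    return wrapper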
@@ -112,7 +114,7 @@ class SelectivePortionSelector(object):
         return 1


-class BufferingSummaryInfo(Bio.Align.AlignInfo.SummaryInfo):
+class BufferingSummaryInfo(AlignInfo.SummaryInfo):

     def get_pssm(self, family, index):
         """Create a position specific score matrix object for the alignment.
@@ -139,7 +141,7 @@ class BufferingSummaryInfo(Bio.Align.AlignInfo.SummaryInfo):
                    score_dict[this_residue] = 1.0
            pssm_info.append(('*', score_dict))

-        return Bio.Align.AlignInfo.PSSM(pssm_info)
+        return AlignInfo.PSSM(pssm_info)


 class Chain:
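The refactored imports follow stock Biopython usage: AlignInfo.SummaryInfo wraps an alignment and can emit a PSSM, which is roughly what BufferingSummaryInfo.get_pssm reimplements with progress reporting. A minimal sketch, with a hypothetical file name:

# Reading aid, not part of the commit. The file name is hypothetical.
from Bio import AlignIO
from Bio.Align import AlignInfo

align = AlignIO.read("RF00005++.afa", "fasta")    # a family alignment
summary = AlignInfo.SummaryInfo(align)
pssm = summary.pos_specific_score_matrix()        # stock Biopython counterpart of get_pssm()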
@@ -198,11 +200,11 @@ class Chain:

         with warnings.catch_warnings():
             # Ignore the PDB problems. This mostly warns that some chain is discontinuous.
-            warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.PDBConstructionWarning)
-            warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning)
+            warnings.simplefilter('ignore', pdb.PDBExceptions.PDBConstructionWarning)
+            warnings.simplefilter('ignore', pdb.PDBExceptions.BiopythonWarning)

             # Load the whole mmCIF into a Biopython structure object:
-            mmcif_parser = Bio.PDB.MMCIFParser()
+            mmcif_parser = pdb.MMCIFParser()
             try:
                 s = mmcif_parser.get_structure(self.pdb_id, path_to_3D_data + "RNAcifs/"+self.pdb_id+".cif")
             except ValueError as e:
...@@ -223,7 +225,7 @@ class Chain: ...@@ -223,7 +225,7 @@ class Chain:
223 sel = SelectivePortionSelector(model_idx, self.pdb_chain_id, valid_set, khetatm) 225 sel = SelectivePortionSelector(model_idx, self.pdb_chain_id, valid_set, khetatm)
224 226
225 # Save that selection on the mmCIF object s to file 227 # Save that selection on the mmCIF object s to file
226 - ioobj = Bio.PDB.mmcifio.MMCIFIO() 228 + ioobj = pdb.MMCIFIO()
227 ioobj.set_structure(s) 229 ioobj.set_structure(s)
228 ioobj.save(self.file, sel) 230 ioobj.save(self.file, sel)
229 231
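The pattern above, a Select subclass handed to MMCIFIO.save, is standard Biopython. A self-contained sketch with a simpler selector (class name and file names hypothetical):

# Reading aid, not part of the commit: the Select + MMCIFIO pattern in isolation.
# ChainSelector and the file names are hypothetical.
import Bio.PDB as pdb

class ChainSelector(pdb.Select):
    def __init__(self, chain_id):
        self.chain_id = chain_id
    def accept_chain(self, chain):
        return chain.get_id() == self.chain_id        # keep only the requested chain

s = pdb.MMCIFParser(QUIET=True).get_structure("1ffk", "1ffk.cif")
io = pdb.MMCIFIO()
io.set_structure(s)
io.save("1ffk_0.cif", ChainSelector("0"))             # writes only the selected portion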
@@ -1115,7 +1117,7 @@ class Pipeline:
            print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &")
            sys.exit()
        elif opt == '--version':
-            print("RNANet 1.1 beta")
+            print("RNANet 1.2, parallelized, Dockerized")
            sys.exit()
        elif opt == "-r" or opt == "--resolution":
            assert float(arg) > 0.0 and float(arg) <= 20.0
@@ -1445,7 +1447,7 @@ class Pipeline:
         # Update the database
         data = []
         for r in results:
-            align = Bio.AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta")
+            align = AlignIO.read(path_to_seq_data + "realigned/" + r[0] + "++.afa", "fasta")
             nb_3d_chains = len([1 for r in align if '[' in r.id])
             if r[0] in SSU_set:  # SSU v138 is used
                 nb_homologs = 2225272  # source: https://www.arb-silva.de/documentation/release-138/
@@ -1535,9 +1537,9 @@ class Pipeline:
         # Run statistics
         if self.RUN_STATS:
             # Remove previous precomputed data
-            subprocess.run(["rm", "-f", runDir + "/data/wadley_kernel_eta.npz",
-                            runDir + "/data/wadley_kernel_eta_prime.npz",
-                            runDir + "/data/pair_counts.csv"])
+            subprocess.run(["rm", "-f", runDir + f"/data/wadley_kernel_eta_{self.CRYSTAL_RES}.npz",
+                            runDir + f"/data/wadley_kernel_eta_prime_{self.CRYSTAL_RES}.npz",
+                            runDir + f"/data/pair_counts_{self.CRYSTAL_RES}.csv"])
             for f in self.fam_list:
                 subprocess.run(["rm", "-f", runDir + f"/data/{f}.npy",
                                 runDir + f"/data/{f}_pairs.csv",
@@ -2124,7 +2126,7 @@ def work_mmcif(pdb_id):
     # if not, read the CIF header and register the structure
     if not len(r):
         # Load the MMCIF file with Biopython
-        mmCif_info = Bio.PDB.MMCIF2Dict.MMCIF2Dict(final_filepath)
+        mmCif_info = pdb.MMCIF2Dict.MMCIF2Dict(final_filepath)

         # Get info about that structure
         try:
@@ -2218,7 +2220,7 @@ def work_prepare_sequences(dl, rfam_acc, chains):
    if rfam_acc in LSU_set | SSU_set:  # rRNA
        if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"):
            # Detect doublons and remove them
-            existing_afa = Bio.AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta")
+            existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta")
            existing_ids = [r.id for r in existing_afa]
            del existing_afa
            new_ids = [str(c) for c in chains]
@@ -2227,7 +2229,7 @@ def work_prepare_sequences(dl, rfam_acc, chains):
            if len(doublons):
                warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.fa and using their newest version")
                fasta = path_to_seq_data + f"realigned/{rfam_acc}++.fa"
-                seqfile = Bio.SeqIO.parse(fasta, "fasta")
+                seqfile = SeqIO.parse(fasta, "fasta")
                # remove it and rewrite it with its own content filtered
                os.remove(fasta)
                with open(fasta, 'w') as f:
@@ -2268,7 +2270,7 @@ def work_prepare_sequences(dl, rfam_acc, chains):
        with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "w") as plusplus:
            ids = set()
            # Remove doublons from the Rfam hits
-            for r in Bio.SeqIO.parse(path_to_seq_data + f"realigned/{rfam_acc}.fa", "fasta"):
+            for r in SeqIO.parse(path_to_seq_data + f"realigned/{rfam_acc}.fa", "fasta"):
                if r.id not in ids:
                    ids.add(r.id)
                    plusplus.write('> '+r.description+'\n'+str(r.seq)+'\n')
@@ -2343,10 +2345,10 @@ def work_realign(rfam_acc):
     notify("Aligned new sequences together")

     # Detect doublons and remove them
-    existing_stk = Bio.AlignIO.read(existing_ali_path, "stockholm")
+    existing_stk = AlignIO.read(existing_ali_path, "stockholm")
     existing_ids = [r.id for r in existing_stk]
     del existing_stk
-    new_stk = Bio.AlignIO.read(new_ali_path, "stockholm")
+    new_stk = AlignIO.read(new_ali_path, "stockholm")
     new_ids = [r.id for r in new_stk]
     del new_stk
     doublons = [i for i in existing_ids if i in new_ids]
@@ -2447,7 +2449,7 @@ def work_pssm(f, fill_gaps):

     # Open the alignment
     try:
-        align = Bio.AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")
+        align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")
     except:
         warn(f"{f}'s alignment is wrong. Recompute it and retry.", error=True)
         with open(runDir + "/errors.txt", "a") as errf:

statistics.py
@@ -70,7 +70,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=2.0):
    thr_idx = idxQueue.get()
    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} reproduce_wadley_results(carbon={carbon})")

-    pbar = tqdm(total=2, desc=f"Worker {thr_idx+1}: eta/theta C{carbon} kernels", position=thr_idx+1, leave=False)
+    pbar = tqdm(total=2, desc=f"Worker {thr_idx+1}: eta/theta C{carbon} kernels", unit="kernel", position=thr_idx+1, leave=False)

    # Extract the angle values of c2'-endo and c3'-endo nucleotides
    with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
@@ -203,25 +203,10 @@ def stats_len():
    global idxQueue
    thr_idx = idxQueue.get()

-    # sort the RNA families so that the plot is readable
-    def family_order(f):
-        if f in LSU_set:
-            return 4
-        elif f in SSU_set:
-            return 3
-        elif f in ["RF00001"]:      #
-            return 1                # put tRNAs and 5S rRNAs first,
-        elif f in ["RF00005"]:      # because of the logarithmic scale, otherwise, they look tiny
-            return 0                #
-        else:
-            return 2
-
-    fam_list.sort(key=family_order)
-
    cols = []
    lengths = []

-    for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Average chain lengths", leave=False):
+    for f in tqdm(famlist, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Average chain lengths", unit="family", leave=False):

        # Define a color for that family in the plot
        if f in LSU_set:
@@ -249,7 +234,7 @@ def stats_len():
    # Plot the figure
    fig = plt.figure(figsize=(10,3))
    ax = fig.gca()
-    ax.hist(lengths, bins=100, stacked=True, log=True, color=cols, label=fam_list)
+    ax.hist(lengths, bins=100, stacked=True, log=True, color=cols, label=famlist)
    ax.set_xlabel("Sequence length (nucleotides)", fontsize=8)
    ax.set_ylabel("Number of 3D chains", fontsize=8)
    ax.set_xlim(left=-150)
@@ -303,18 +288,18 @@ def stats_freq():

    # Initialize a Counter object for each family
    freqs = {}
-    for f in fam_list:
+    for f in famlist:
        freqs[f] = Counter()

    # List all nt_names happening within a RNA family and store the counts in the Counter
-    for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False):
+    for f in tqdm(famlist, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", unit="family", leave=False):
        with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
            counts = dict(sql_ask_database(conn, f"SELECT nt_name, COUNT(nt_name) FROM (SELECT chain_id from chain WHERE rfam_acc='{f}') NATURAL JOIN nucleotide GROUP BY nt_name;", warn_every=0))
        freqs[f].update(counts)

    # Create a pandas DataFrame, and save it to CSV.
    df = pd.DataFrame()
-    for f in tqdm(fam_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", leave=False):
+    for f in tqdm(famlist, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Base frequencies", unit="family", leave=False):
        tot = sum(freqs[f].values())
        df = pd.concat([ df, pd.DataFrame([[ format_percentage(tot, x) for x in freqs[f].values() ]], columns=list(freqs[f]), index=[f]) ])
    df = df.fillna(0)
@@ -322,12 +307,13 @@ def stats_freq():
    idxQueue.put(thr_idx)  # replace the thread index in the queue
    # notify("Saved nucleotide frequencies to CSV file.")

+@trace_unhandled_exceptions
def parallel_stats_pairs(f):
    """Counts occurrences of intra-chain base-pair types in one RNA family

    REQUIRES tables chain, nucleotide up-to-date."""

-    if path.isfile("data/"+f+"_pairs.csv") and path.isfile("data/"+f+"_counts.csv"):
+    if path.isfile(runDir + "/data/"+f+"_pairs.csv") and path.isfile(runDir + "/data/"+f+"_counts.csv"):
        return

    # Get a worker number to position the progress bar
@@ -339,7 +325,7 @@ def parallel_stats_pairs(f):
    chain_id_list = mappings_list[f]
    data = []
    sqldata = []
-    for cid in tqdm(chain_id_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} basepair types", leave=False):
+    for cid in tqdm(chain_id_list, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} basepair types", unit="chain", leave=False):
        with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
            # Get comma separated lists of basepairs per nucleotide
            interactions = pd.DataFrame(
@@ -430,16 +416,19 @@ def parallel_stats_pairs(f):

    idxQueue.put(thr_idx)  # replace the thread index in the queue

-def to_dist_matrix(f):
+def to_id_matrix(f):
+    """
+    Extracts sequences of 3D chains from the family alignments to a distinct STK file,
+    then runs esl-alipid on it to get an identity matrix.
+    """
    if path.isfile("data/"+f+".npy"):
-        # notify(f"Computed {f} distance matrix", "loaded from file")
        return 0

    # Get a worker number to position the progress bar
    global idxQueue
    thr_idx = idxQueue.get()

-    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} to_dist_matrix({f})")
+    setproctitle(f"RNANet statistics.py Worker {thr_idx+1} to_id_matrix({f})")

    # Prepare a file
    with open(path_to_seq_data+f"/realigned/{f}++.afa") as al_file:
@@ -452,14 +441,16 @@ def to_dist_matrix(f):
    except ValueError as e:
        warn(e)
    del al
-    subprocess.run(["esl-reformat", "--informat", "stockholm", "--mingap", "-o", path_to_seq_data+f"/realigned/{f}_3d_only.stk", "stockholm", path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk"])
+    subprocess.run(["esl-reformat", "--informat", "stockholm", "--mingap",            #
+                    "-o", path_to_seq_data+f"/realigned/{f}_3d_only.stk",             # This run just deletes columns of gaps
+                    "stockholm", path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk"]) #
+    subprocess.run(["rm", "-f", f + "_3d_only_tmp.stk"])

    # Prepare the job
    process = subprocess.Popen(shlex.split(f"esl-alipid --rna --noheader --informat stockholm {path_to_seq_data}realigned/{f}_3d_only.stk"),
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    id_matrix = np.zeros((len(names), len(names)))
-
-    pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", leave=False)
+    pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", unit="comparisons", leave=False)
    cnt = 0
    while not cnt or process.poll() is None:
        output = process.stdout.read()
@@ -482,8 +473,8 @@ def to_dist_matrix(f):
        warn("\n".join([ line.decode('utf-8') for line in l ]))
    pbar.close()

-    subprocess.run(["rm", "-f", f + "_3d_only_tmp.stk"])
    np.save("data/"+f+".npy", id_matrix)
+
    idxQueue.put(thr_idx)  # replace the thread index in the queue
    return 0

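For context, esl-alipid with --noheader prints one line per pair of sequences, with the documented columns "seqname1 seqname2 %id nid denomid %match nmatch denommatch"; the loop above accumulates these into id_matrix. A standalone sketch of the parsing idea (names and the captured-output variable are hypothetical):

# Reading aid, not part of the commit. Assumes esl-alipid's documented output columns.
import numpy as np

names = ["chain_A", "chain_B", "chain_C"]              # hypothetical sequence names
idx = {n: i for i, n in enumerate(names)}
id_matrix = np.zeros((len(names), len(names)))
for line in alipid_stdout.splitlines():                # alipid_stdout: captured process output
    fields = line.split()
    i, j = idx[fields[0]], idx[fields[1]]
    id_matrix[i, j] = id_matrix[j, i] = float(fields[2]) / 100.0   # store identity as a fraction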
@@ -545,7 +536,7 @@ def seq_idty():
    fig.tight_layout()
    fig.subplots_adjust(hspace=0.3, wspace=0.1)
    fig.colorbar(im, ax=axs[-4], shrink=0.8)
-    fig.savefig(runDir + f"/results/figures/distances.png")
+    fig.savefig(runDir + f"/results/figures/distances_{res_thr}.png")
    print("> Computed all identity matrices and saved the figure.", flush=True)

def stats_pairs():
@@ -559,10 +550,10 @@ def stats_pairs():
    def line_format(family_data):
        return family_data.apply(partial(format_percentage, sum(family_data)))

-    if not path.isfile("data/pair_counts.csv"):
+    if not path.isfile(runDir + f"/data/pair_counts_{res_thr}.csv"):
        results = []
        allpairs = []
-        for f in fam_list:
+        for f in famlist:
            newpairs = pd.read_csv(runDir + f"/data/{f}_pairs.csv", index_col=0)
            fam_df = pd.read_csv(runDir + f"/data/{f}_counts.csv", index_col=0)
            results.append(fam_df)
...@@ -571,11 +562,11 @@ def stats_pairs(): ...@@ -571,11 +562,11 @@ def stats_pairs():
571 subprocess.run(["rm", "-f", runDir + f"/data/{f}_counts.csv"]) 562 subprocess.run(["rm", "-f", runDir + f"/data/{f}_counts.csv"])
572 all_pairs = pd.concat(allpairs) 563 all_pairs = pd.concat(allpairs)
573 df = pd.concat(results).fillna(0) 564 df = pd.concat(results).fillna(0)
574 - df.to_csv("data/pair_counts.csv") 565 + df.to_csv(runDir + f"/data/pair_counts_{res_thr}.csv")
575 - all_pairs.to_csv("data/all_pairs.csv") 566 + all_pairs.to_csv(runDir + f"/data/all_pairs_{res_thr}.csv")
576 else: 567 else:
577 - df = pd.read_csv("data/pair_counts.csv", index_col=0) 568 + df = pd.read_csv(runDir + f"/data/pair_counts_{res_thr}.csv", index_col=0)
578 - all_pairs = pd.read_csv("data/all_pairs.csv", index_col=0) 569 + all_pairs = pd.read_csv(runDir + f"/data/all_pairs_{res_thr}.csv", index_col=0)
579 570
580 crosstab = pd.crosstab(all_pairs.pair_type_LW, all_pairs.basepair) 571 crosstab = pd.crosstab(all_pairs.pair_type_LW, all_pairs.basepair)
581 col_list = [ x for x in df.columns if '.' in x ] 572 col_list = [ x for x in df.columns if '.' in x ]
@@ -613,7 +604,7 @@ def stats_pairs():
    ax.set_ylabel("Number of observations (millions)", fontsize=13)
    ax.set_xlabel(None)
    plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99)
-    plt.savefig(runDir + "/results/figures/pairings.png")
+    plt.savefig(runDir + f"/results/figures/pairings_{res_thr}.png")

    notify("Computed nucleotide statistics and saved CSV and PNG file.")

@@ -916,8 +907,24 @@ def log_to_pbar(pbar):
        pbar.update(1)
    return update

+def family_order(f):
+    # sort the RNA families so that the plots are readable
+
+    if f in LSU_set:
+        return 4
+    elif f in SSU_set:
+        return 3
+    elif f in ["RF00001"]:      #
+        return 1                # put tRNAs and 5S rRNAs first,
+    elif f in ["RF00005"]:      # because of the logarithmic scale of the lengths figure, otherwise they look tiny
+        return 0                #
+    else:
+        return 2
+
 if __name__ == "__main__":

+    os.makedirs(runDir + "/results/figures/", exist_ok=True)
+
    # parse options
    DELETE_OLD_DATA = False
    DO_WADLEY_ANALYSIS = False
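family_order is now a module-level helper shared by the plotting functions (before this commit, stats_len defined and applied it locally). A quick illustration of the ordering it produces, assuming RF02541 belongs to LSU_set and RF00162 (SAM riboswitch) falls in the default bucket:

# Reading aid, not part of the commit. Set memberships are assumptions.
fams = ["RF02541", "RF00162", "RF00001", "RF00005"]
print(sorted(fams, key=family_order))
# -> ['RF00005', 'RF00001', 'RF00162', 'RF02541']   (tRNA, 5S rRNA, others, LSU)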
@@ -943,7 +950,7 @@ if __name__ == "__main__":
            print("--from-scratch\t\t\tDo not use precomputed results from past runs, recompute everything")
            sys.exit()
        elif opt == '--version':
-            print("RNANet statistics 1.1 beta")
+            print("RNANet statistics 1.2")
            sys.exit()
        elif opt == "-r" or opt == "--resolution":
            assert float(arg) > 0.0 and float(arg) <= 20.0
@@ -959,31 +966,38 @@ if __name__ == "__main__":
        elif opt=='--from-scratch':
            DELETE_OLD_DATA = True
            DO_WADLEY_ANALYSIS = True
-            subprocess.run(["rm","-f", "data/wadley_kernel_eta.npz", "data/wadley_kernel_eta_prime.npz", "data/pair_counts.csv"])
        elif opt=='--wadley':
            DO_WADLEY_ANALYSIS = True


-    # Load mappings
+    # Load mappings. famlist will contain only families with structures at this resolution threshold.
    print("Loading mappings list...")
    with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
-        fam_list = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from family ORDER BY rfam_acc ASC;") ]
-        mappings_list = {}
-        for k in fam_list:
-            mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"SELECT chain_id from chain JOIN structure ON chain.structure_id=structure.pdb_id WHERE rfam_acc='{k}' AND issue=0 AND resolution <= {res_thr};") ]
-
-    # List the families for which we will compute sequence identity matrices
-    with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
-        famlist = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain WHERE issue = 0 GROUP BY rfam_acc) WHERE n_chains > 0 ORDER BY rfam_acc ASC;") ]
-        ignored = [ x[0] for x in sql_ask_database(conn, "SELECT rfam_acc from (SELECT rfam_acc, COUNT(chain_id) as n_chains FROM family NATURAL JOIN chain WHERE issue = 0 GROUP BY rfam_acc) WHERE n_chains < 3 ORDER BY rfam_acc ASC;") ]
        n_unmapped_chains = sql_ask_database(conn, "SELECT COUNT(*) FROM chain WHERE rfam_acc='unmappd' AND issue=0;")[0][0]
+        families = pd.read_sql(f"""SELECT rfam_acc, count(*) as n_chains
+                                   FROM chain JOIN structure
+                                   ON chain.structure_id = structure.pdb_id
+                                   WHERE issue = 0 AND resolution <= {res_thr} AND rfam_acc != 'unmappd'
+                                   GROUP BY rfam_acc;
+                                """, conn)
+        families.drop(families[families.n_chains == 0].index, inplace=True)
+        mappings_list = {}
+        for k in families.rfam_acc:
+            mappings_list[k] = [ x[0] for x in sql_ask_database(conn, f"""SELECT chain_id
+                                                                          FROM chain JOIN structure ON chain.structure_id=structure.pdb_id
+                                                                          WHERE rfam_acc='{k}' AND issue=0 AND resolution <= {res_thr};""") ]
+    famlist = families.rfam_acc.tolist()
+    ignored = families[families.n_chains < 3].rfam_acc.tolist()
+    famlist.sort(key=family_order)
+    print(f"Found {len(famlist)} families with chains of resolution {res_thr}A or better.")
    if len(ignored):
        print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'\n')

    if DELETE_OLD_DATA:
-        for f in fam_list:
+        for f in famlist:
            subprocess.run(["rm","-f", runDir + f"/data/{f}.npy", runDir + f"/data/{f}_pairs.csv", runDir + f"/data/{f}_counts.csv"])
-
+        if DO_WADLEY_ANALYSIS:
+            subprocess.run(["rm","-f", runDir + f"/data/wadley_kernel_eta_{res_thr}.npz", runDir + f"/data/wadley_kernel_eta_prime_{res_thr}.npz", runDir + f"/data/pair_counts_{res_thr}.csv"])

    # Prepare the multiprocessing execution environment
    nworkers = min(read_cpu_number()-1, 32)
@@ -995,17 +1009,17 @@ if __name__ == "__main__":
    # Define the tasks
    joblist = []
    if n_unmapped_chains and DO_WADLEY_ANALYSIS:
-        joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), 20.0)))  # res threshold is 4.0 Angstroms by default
-        joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), 20.0)))  #
+        joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr)))
+        joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr)))
    joblist.append(Job(function=stats_len))   # Computes figures
    joblist.append(Job(function=stats_freq))  # updates the database
    for f in famlist:
        joblist.append(Job(function=parallel_stats_pairs, args=(f,)))  # updates the database
        if f not in ignored:
-            joblist.append(Job(function=to_dist_matrix, args=(f,)))  # updates the database
+            joblist.append(Job(function=to_id_matrix, args=(f,)))  # updates the database

    p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
-    pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, leave=True)
+    pbar = tqdm(total=len(joblist), desc="Stat jobs", position=0, unit="job", leave=True)

    try:
        for j in joblist:
......
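The excerpt cuts off inside the dispatch loop. Given that log_to_pbar above returns an update callback, the truncated tail presumably resembles the following sketch (assumptions: Job exposes .function and .args as used in the joblist above):

# Reading aid, not part of the commit: a plausible shape for the truncated dispatch loop.
results = []
for j in joblist:
    results.append(p.apply_async(j.function, args=j.args, callback=log_to_pbar(pbar)))
for r in results:
    r.get()          # propagate worker exceptions to the main process
pbar.close()
p.close()
p.join()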