Louis BECQUEY

v pre-1.6b


Former-commit-id: 33fafe91
1 ############################################################################################ 1 ############################################################################################
2 +v 1.6 beta, August 2021
3 +
4 +Aglaé Tabot joins the development team. Khodor Hannoush leaves.
5 +
6 +FEATURE CHANGES
7 + - Distinct options --cmalign-opts and --cmalign-rrna-opts allow to adapt the parameters for LSU and SSU families.
8 + The LSU and SSU are now aligned with Infernal options '--cpu 10 --mxsize 8192 --mxtau 0.1', which is slow,
9 + requires up to 100 GB of RAM, and yields a suboptimal alignment (tau=0.1 is quite bad), but is homogenous with the other families.
10 + - The LSU and SSU therefore have defined cm_coords fields, and therefore distance matrices can be computed.
11 + - We now provide for download the renumbered (standardised) 3D MMCIF files, the nucleotides being numbered by their "index_chain" in the database.
12 + - We now provide for download the sequences of the 3D chains aligned by Rfam family (without Rfam sequences, which have been removed).
13 + - statistics.py now computes histograms and a density estimation with Gaussian mixture models for a large set of geometric parameters,
14 + measured on the unmapped data at a given resolution threshold. The parameters include:
15 + * All atom bonded distances and torsion angles
16 + * Distances, flat angles and torsion angles in the Pyle/VFold model
17 + * Distances, flat angles and torsion anfles in the HiRE-RNA model
18 + * Sequence-dependant geometric parameters of the basepairs for all non-canonical basepairs in the HiRE-RNA model.
19 + The data is saved as JSON files of parameters, and numerous figures are produced to illustrate the distributions.
20 + The number of gaussians to use in the GMMs are hard-coded in geometric_stats.py after our first estimation. If you do not want to trust this estimation,
21 + you can ignore it with option --rescan-nmodes. An exploration of the number of Gaussians from 1 to 8 will be performed, and the best GMM will be kept.
22 +
23 +BUG CORRECTIONS
24 + - New code file geometric_stats.py
25 + - New automation script that starts from scratch
26 + - Many small fixes
27 + - Performance tweaks
28 +
29 +TECHNICAL CHANGES
30 + - Switched to DSSR Pro.
31 + - Switched to esl-alimerge instead of cmalign --merge to merge alignments.
32 +
33 +############################################################################################
2 v 1.5 beta, April 2021 34 v 1.5 beta, April 2021
3 35
4 FEATURE CHANGES 36 FEATURE CHANGES
......
1 MIT License 1 MIT License
2 2
3 -Copyright (c) 2019 Louis Becquey 3 +Copyright (c) 2019-2021 IBISC, Université Paris Saclay
4 4
5 Permission is hereby granted, free of charge, to any person obtaining a copy 5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal 6 of this software and associated documentation files (the "Software"), to deal
......
...@@ -57,6 +57,7 @@ def trace_unhandled_exceptions(func): ...@@ -57,6 +57,7 @@ def trace_unhandled_exceptions(func):
57 return func(*args, **kwargs) 57 return func(*args, **kwargs)
58 except: 58 except:
59 s = traceback.format_exc() 59 s = traceback.format_exc()
60 + if not "KeyboardInterrupt" in s:
60 with open(runDir + "/errors.txt", "a") as f: 61 with open(runDir + "/errors.txt", "a") as f:
61 f.write("Exception in "+func.__name__+"\n") 62 f.write("Exception in "+func.__name__+"\n")
62 f.write(s) 63 f.write(s)
...@@ -261,7 +262,7 @@ class Chain: ...@@ -261,7 +262,7 @@ class Chain:
261 new_s.add(new_model) 262 new_s.add(new_model)
262 263
263 # renumber this structure (portion of the original) with the index_chain and save it in a cif file 264 # renumber this structure (portion of the original) with the index_chain and save it in a cif file
264 - t=pdb.Structure.Structure(new_s.get_id()) 265 + t = pdb.Structure.Structure(new_s.get_id())
265 for model in new_s: 266 for model in new_s:
266 new_model_t=pdb.Model.Model(model.get_id()) 267 new_model_t=pdb.Model.Model(model.get_id())
267 for chain in model: 268 for chain in model:
...@@ -288,7 +289,7 @@ class Chain: ...@@ -288,7 +289,7 @@ class Chain:
288 # particular case 6n5s_1_A, residue 201 in the original cif file (resname = G and HETATM = H_G) 289 # particular case 6n5s_1_A, residue 201 in the original cif file (resname = G and HETATM = H_G)
289 if nt == 'A' or (nt == 'G' and (self.chain_label != '6n5s_1_A' or resseq != 201)) or nt == 'C' or nt == 'U' or nt in ['DG', 'DU', 'DC', 'DA', 'DI', 'DT' ] or nt == 'N' or nt == 'I' : 290 if nt == 'A' or (nt == 'G' and (self.chain_label != '6n5s_1_A' or resseq != 201)) or nt == 'C' or nt == 'U' or nt in ['DG', 'DU', 'DC', 'DA', 'DI', 'DT' ] or nt == 'N' or nt == 'I' :
290 res=chain[(' ', resseq, icode_res)] 291 res=chain[(' ', resseq, icode_res)]
291 - else : #modified nucleotides (e.g. chain 5l4o_1_A) 292 + else : # modified nucleotides (e.g. chain 5l4o_1_A)
292 het='H_' + nt 293 het='H_' + nt
293 res=chain[(het, resseq, icode_res)] 294 res=chain[(het, resseq, icode_res)]
294 res_id=res.get_id() 295 res_id=res.get_id()
...@@ -1599,7 +1600,7 @@ class Pipeline: ...@@ -1599,7 +1600,7 @@ class Pipeline:
1599 try: 1600 try:
1600 fam_pbar = tqdm(total=len(self.fam_list), desc="RNA families", position=0, leave=True) 1601 fam_pbar = tqdm(total=len(self.fam_list), desc="RNA families", position=0, leave=True)
1601 # Apply work_pssm_remap to each RNA family 1602 # Apply work_pssm_remap to each RNA family
1602 - for i, _ in enumerate(p.imap_unordered(work_pssm_remap, self.fam_list, chunksize=1)): 1603 + for i, _ in enumerate(p.imap_unordered(partial(work_pssm_remap, useSina=pp.USESINA), self.fam_list, chunksize=1)):
1603 # Everytime the iteration finishes on a family, update the global progress bar over the RNA families 1604 # Everytime the iteration finishes on a family, update the global progress bar over the RNA families
1604 fam_pbar.update(1) 1605 fam_pbar.update(1)
1605 fam_pbar.close() 1606 fam_pbar.close()
...@@ -1654,7 +1655,7 @@ class Pipeline: ...@@ -1654,7 +1655,7 @@ class Pipeline:
1654 p = Pool(initializer=init_with_tqdm, initargs=(tqdm.get_lock(),), processes=3) 1655 p = Pool(initializer=init_with_tqdm, initargs=(tqdm.get_lock(),), processes=3)
1655 try: 1656 try:
1656 pbar = tqdm(total=len(self.loaded_chains), desc="Saving chains to CSV", position=0, leave=True) 1657 pbar = tqdm(total=len(self.loaded_chains), desc="Saving chains to CSV", position=0, leave=True)
1657 - for _, _2 in enumerate(p.imap_unordered(work_save, self.loaded_chains)): 1658 + for _, _2 in enumerate(p.imap_unordered(partial(work_save, homology=pp.HOMOLOGY), self.loaded_chains)):
1658 pbar.update(1) 1659 pbar.update(1)
1659 pbar.close() 1660 pbar.close()
1660 p.close() 1661 p.close()
...@@ -1700,18 +1701,28 @@ class Pipeline: ...@@ -1700,18 +1701,28 @@ class Pipeline:
1700 if self.ARCHIVE: 1701 if self.ARCHIVE:
1701 os.makedirs(runDir + "/archive", exist_ok=True) 1702 os.makedirs(runDir + "/archive", exist_ok=True)
1702 datestr = time.strftime('%Y%m%d') 1703 datestr = time.strftime('%Y%m%d')
1704 +
1705 + # The text files
1703 subprocess.run(["rm", "-f", runDir + f"/archive/RNANET_datapoints_latest.tar.gz"]) 1706 subprocess.run(["rm", "-f", runDir + f"/archive/RNANET_datapoints_latest.tar.gz"])
1704 subprocess.run(["tar", "-C", path_to_3D_data + "/datapoints", "-czf", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", "."]) 1707 subprocess.run(["tar", "-C", path_to_3D_data + "/datapoints", "-czf", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", "."])
1705 subprocess.run(["ln", "-s", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", runDir + f"/archive/RNANET_datapoints_latest.tar.gz"]) 1708 subprocess.run(["ln", "-s", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", runDir + f"/archive/RNANET_datapoints_latest.tar.gz"])
1706 1709
1710 + # The alignments
1707 if self.HOMOLOGY: 1711 if self.HOMOLOGY:
1708 - # gather the alignments
1709 os.makedirs(path_to_seq_data + "realigned/3d_only", exist_ok=True) 1712 os.makedirs(path_to_seq_data + "realigned/3d_only", exist_ok=True)
1710 for f in os.listdir(path_to_seq_data + "realigned"): 1713 for f in os.listdir(path_to_seq_data + "realigned"):
1711 if "3d_only.afa" in f: 1714 if "3d_only.afa" in f:
1712 subprocess.run(["cp", path_to_seq_data + "realigned/" + f, path_to_seq_data + "realigned/3d_only"]) 1715 subprocess.run(["cp", path_to_seq_data + "realigned/" + f, path_to_seq_data + "realigned/3d_only"])
1713 - subprocess.run(["rm", "-f", runDir + f"/archive/RNANET_alignments_latest.tar.gz"]) 1716 + subprocess.run(["rm", "-f", runDir + f"/archive/RNANET_3dOnlyAlignments_latest.tar.gz"])
1714 - subprocess.run(["tar", "-C", path_to_seq_data + "realigned/3d_only" , "-czf", runDir + f"/archive/RNANET_alignments_latest.tar.gz", "."]) 1717 + subprocess.run(["tar", "-C", path_to_seq_data + "realigned/3d_only" , "-czf", runDir + f"/archive/RNANET_3dOnlyAlignments_latest.tar.gz", "."])
1718 +
1719 + # The 3D files
1720 + if os.path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"):
1721 + subprocess.run(["rm", "-f", runDir + f"/archive/RNANET_MMCIFmappedToRfam_latest.tar.gz"])
1722 + subprocess.run(["tar", "-C", path_to_3D_data + "rna_mapped_to_Rfam" , "-czf", runDir + f"/archive/RNANET_MMCIFmappedToRfam_latest.tar.gz", "."])
1723 + if os.path.isdir(path_to_3D_data + "rna_only"):
1724 + subprocess.run(["rm", "-f", runDir + f"/archive/RNANET_MMCIFall_latest.tar.gz"])
1725 + subprocess.run(["tar", "-C", path_to_3D_data + "rna_only" , "-czf", runDir + f"/archive/RNANET_MMCIFall_latest.tar.gz", "."])
1715 1726
1716 def sanitize_database(self): 1727 def sanitize_database(self):
1717 """Searches for issues in the database and correct them""" 1728 """Searches for issues in the database and correct them"""
...@@ -1813,7 +1824,7 @@ def warn(message, error=False): ...@@ -1813,7 +1824,7 @@ def warn(message, error=False):
1813 """ 1824 """
1814 # Cut if too long 1825 # Cut if too long
1815 if len(message) > 66: 1826 if len(message) > 66:
1816 - x = message.find(' ', 50, 66) 1827 + x = message.find(' ', 40, 66)
1817 if x != -1: 1828 if x != -1:
1818 warn(message[:x], error=error) 1829 warn(message[:x], error=error)
1819 warn(message[x+1:], error=error) 1830 warn(message[x+1:], error=error)
...@@ -2809,7 +2820,7 @@ def work_save_pydca(f,alignment): ...@@ -2809,7 +2820,7 @@ def work_save_pydca(f,alignment):
2809 warn(e) 2820 warn(e)
2810 2821
2811 @trace_unhandled_exceptions 2822 @trace_unhandled_exceptions
2812 -def work_pssm_remap(f): 2823 +def work_pssm_remap(f, useSina=False):
2813 """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family. 2824 """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family.
2814 This also remaps the 3D object sequence with the aligned sequence in the MSA. 2825 This also remaps the 3D object sequence with the aligned sequence in the MSA.
2815 If asked, the 3D object sequence is completed by the consensus nucleotide when one of them is missing. 2826 If asked, the 3D object sequence is completed by the consensus nucleotide when one of them is missing.
...@@ -2991,7 +3002,7 @@ def work_pssm_remap(f): ...@@ -2991,7 +3002,7 @@ def work_pssm_remap(f):
2991 setproctitle(f"RNAnet.py work_pssm_remap({f}) insert/match states") 3002 setproctitle(f"RNAnet.py work_pssm_remap({f}) insert/match states")
2992 3003
2993 # Get back the information of match/insertion states from the STK file 3004 # Get back the information of match/insertion states from the STK file
2994 - if (not use_sina) or (f not in SSU_set and f not in LSU_set): 3005 + if (not useSina) or (f not in SSU_set and f not in LSU_set):
2995 alignstk = AlignIO.read(path_to_seq_data + "realigned/" + f + "++.stk", "stockholm") 3006 alignstk = AlignIO.read(path_to_seq_data + "realigned/" + f + "++.stk", "stockholm")
2996 consensus_2d = alignstk.column_annotations["secondary_structure"] 3007 consensus_2d = alignstk.column_annotations["secondary_structure"]
2997 del alignstk 3008 del alignstk
...@@ -3037,8 +3048,6 @@ def work_pssm_remap(f): ...@@ -3037,8 +3048,6 @@ def work_pssm_remap(f):
3037 gap_percent, consensus, cons_sec_struct) 3048 gap_percent, consensus, cons_sec_struct)
3038 VALUES (?, 0, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', NULL);""", data=(f,)) 3049 VALUES (?, 0, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', NULL);""", data=(f,))
3039 3050
3040 -
3041 -
3042 # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains) 3051 # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains)
3043 sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f)) 3052 sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f))
3044 conn.close() 3053 conn.close()
...@@ -3171,15 +3180,9 @@ if __name__ == "__main__": ...@@ -3171,15 +3180,9 @@ if __name__ == "__main__":
3171 print(f"Among errors, {len(no_nts_set)} structures seem to contain RNA chains without defined nucleotides:", no_nts_set, flush=True) 3180 print(f"Among errors, {len(no_nts_set)} structures seem to contain RNA chains without defined nucleotides:", no_nts_set, flush=True)
3172 if len(weird_mappings): 3181 if len(weird_mappings):
3173 print(f"{len(weird_mappings)} mappings to Rfam were taken as absolute positions instead of residue numbers:", weird_mappings, flush=True) 3182 print(f"{len(weird_mappings)} mappings to Rfam were taken as absolute positions instead of residue numbers:", weird_mappings, flush=True)
3174 - if pp.SELECT_ONLY is None: 3183 + if pp.HOMOLOGY and pp.SELECT_ONLY is None:
3175 pp.checkpoint_save_chains() 3184 pp.checkpoint_save_chains()
3176 3185
3177 - if not pp.HOMOLOGY:
3178 - # Save chains to file
3179 - for c in pp.loaded_chains:
3180 - work_save(c, homology=False)
3181 - print("Completed.")
3182 -
3183 # At this point, structure, chain and nucleotide tables of the database are up to date. 3186 # At this point, structure, chain and nucleotide tables of the database are up to date.
3184 # (Modulo some statistics computed by statistics.py) 3187 # (Modulo some statistics computed by statistics.py)
3185 3188
...@@ -3187,6 +3190,7 @@ if __name__ == "__main__": ...@@ -3187,6 +3190,7 @@ if __name__ == "__main__":
3187 # Homology information 3190 # Homology information
3188 # =========================================================================== 3191 # ===========================================================================
3189 3192
3193 + if pp.HOMOLOGY:
3190 if pp.SELECT_ONLY is None: 3194 if pp.SELECT_ONLY is None:
3191 # If your job failed, you can comment all the "3D information" part and start from here. 3195 # If your job failed, you can comment all the "3D information" part and start from here.
3192 pp.checkpoint_load_chains() 3196 pp.checkpoint_load_chains()
......
...@@ -35,7 +35,7 @@ nohup bash -c 'time docker run --rm -v /path/to/3D/data/folder:/3D -v /path/to/s ...@@ -35,7 +35,7 @@ nohup bash -c 'time docker run --rm -v /path/to/3D/data/folder:/3D -v /path/to/s
35 # Method 2 : Classical command line installation (Linux only) 35 # Method 2 : Classical command line installation (Linux only)
36 36
37 You need to install the dependencies: 37 You need to install the dependencies:
38 -- DSSR 1.9.9 or newer, you need to register to the X3DNA forum [here](http://forum.x3dna.org/site-announcements/download-instructions/) and then download the DSSR binary [on that page](http://forum.x3dna.org/downloads/3dna-download/). Make sure to have the `x3dna-dssr` binary in your $PATH variable so that RNANet.py finds it. 38 +- DSSR 1.9.9 or newer, you need to register to ask for a DSSR (academic) license [on that page](http://innovation.columbia.edu/technologies/CU20391). Make sure to have the `x3dna-dssr` binary in your $PATH variable so that RNANet.py finds it.
39 - Infernal 1.1.4 or newer, to download at [Eddylab](http://eddylab.org/infernal/), several options are available depending on your preferences. Make sure to have the `cmalign`, `cmfetch`, `cmbuild`, `esl-alimanip`, `esl-alimerge`, `esl-alipid` and `esl-reformat` binaries in your $PATH variable, so that RNANet.py can find them. 39 - Infernal 1.1.4 or newer, to download at [Eddylab](http://eddylab.org/infernal/), several options are available depending on your preferences. Make sure to have the `cmalign`, `cmfetch`, `cmbuild`, `esl-alimanip`, `esl-alimerge`, `esl-alipid` and `esl-reformat` binaries in your $PATH variable, so that RNANet.py can find them.
40 - SINA (if you plan to use it), follow [these instructions](https://sina.readthedocs.io/en/latest/install.html) for example. Make sure to have the `sina` binary in your $PATH. 40 - SINA (if you plan to use it), follow [these instructions](https://sina.readthedocs.io/en/latest/install.html) for example. Make sure to have the `sina` binary in your $PATH.
41 - Sqlite 3, available under the name *sqlite* in every distro's package manager, 41 - Sqlite 3, available under the name *sqlite* in every distro's package manager,
......
This diff could not be displayed because it is too large.
This diff is collapsed. Click to expand it.
This diff could not be displayed because it is too large.
...@@ -5,7 +5,7 @@ rm -rf latest_run.log errors.txt ...@@ -5,7 +5,7 @@ rm -rf latest_run.log errors.txt
5 5
6 # Run RNANet 6 # Run RNANet
7 bash -c 'time python3.8 ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 --no-homology --redundant --extract' > latest_run.log 2>&1 7 bash -c 'time python3.8 ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 --no-homology --redundant --extract' > latest_run.log 2>&1
8 -bash -c 'time python3.8 ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 --redundant --sina --extract -s --stats-opts="--wadley --distance-matrices" --archive' > latest_run.log 2>&1 8 +bash -c 'time python3.8 ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ -r 20.0 --redundant --extract -s --stats-opts="-r 20.0 --wadley --hire-rna --distance-matrices" --archive' >> latest_run.log 2>&1
9 echo 'Compressing RNANet.db.gz...' >> latest_run.log 9 echo 'Compressing RNANet.db.gz...' >> latest_run.log
10 touch results/RNANet.db # update last modification date 10 touch results/RNANet.db # update last modification date
11 gzip -k /home/lbecquey/Projects/RNANet/results/RNANet.db # compress it 11 gzip -k /home/lbecquey/Projects/RNANet/results/RNANet.db # compress it
......
1 +# This is a script supposed to be run periodically as a cron job
2 +# This one uses argument --from-scratch, so all is recomputed ! /!\
3 +# run it one or twice a year, otherwise, the faster update runs should be enough.
4 +
5 +cd /home/lbecquey/Projects/RNANet
6 +rm -rf latest_run.log errors.txt
7 +
8 +# Run RNANet
9 +bash -c 'time python3.8 ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ --from-scratch --ignore-issues -r 20.0 --no-homology --redundant --extract' > latest_run.log 2>&1
10 +bash -c 'time python3.8 ./RNAnet.py --3d-folder /home/lbecquey/Data/RNA/3D/ --seq-folder /home/lbecquey/Data/RNA/sequences/ --from-scratch --ignore-issues -r 20.0 --redundant --extract -s --stats-opts="-r 20.0 --wadley --hire-rna --distance-matrices" --archive' >> latest_run.log 2>&1
11 +echo 'Compressing RNANet.db.gz...' >> latest_run.log
12 +touch results/RNANet.db # update last modification date
13 +gzip -k /home/lbecquey/Projects/RNANet/results/RNANet.db # compress it
14 +rm -f results/RNANet.db-wal results/RNANet.db-shm # SQLite temporary files
15 +
16 +# Save the latest results
17 +export DATE=`date +%Y%m%d`
18 +echo "Creating new release in ./archive/ folder ($DATE)..." >> latest_run.log
19 +cp /home/lbecquey/Projects/RNANet/results/summary.csv /home/lbecquey/Projects/RNANet/archive/summary_latest.csv
20 +cp /home/lbecquey/Projects/RNANet/results/summary.csv "/home/lbecquey/Projects/RNANet/archive/summary_$DATE.csv"
21 +cp /home/lbecquey/Projects/RNANet/results/families.csv /home/lbecquey/Projects/RNANet/archive/families_latest.csv
22 +cp /home/lbecquey/Projects/RNANet/results/families.csv "/home/lbecquey/Projects/RNANet/archive/families_$DATE.csv"
23 +cp /home/lbecquey/Projects/RNANet/results/frequencies.csv /home/lbecquey/Projects/RNANet/archive/frequencies_latest.csv
24 +cp /home/lbecquey/Projects/RNANet/results/pair_types.csv /home/lbecquey/Projects/RNANet/archive/pair_types_latest.csv
25 +mv /home/lbecquey/Projects/RNANet/results/RNANet.db.gz /home/lbecquey/Projects/RNANet/archive/
26 +
27 +# Init Seafile synchronization between RNANet library and ./archive/ folder (just the first time !)
28 +# seaf-cli sync -l 8e082c6e-b9ed-4b2f-9279-de2177134c57 -s https://entrepot.ibisc.univ-evry.fr -u l****.b*****y@univ-evry.fr -p ****************** -d archive/
29 +
30 +# Sync in Seafile
31 +seaf-cli start >> latest_run.log 2>&1
32 +echo 'Waiting 10m for SeaFile synchronization...' >> latest_run.log
33 +sleep 15m
34 +echo `seaf-cli status` >> latest_run.log
35 +seaf-cli stop >> latest_run.log 2>&1
36 +echo 'We are '`date`', update completed.' >> latest_run.log
37 +
...@@ -36,6 +36,6 @@ for fam in families: ...@@ -36,6 +36,6 @@ for fam in families:
36 36
37 # Now re run RNANet normally. 37 # Now re run RNANet normally.
38 command = ["python3.8", "./RNAnet.py", "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0", 38 command = ["python3.8", "./RNAnet.py", "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0",
39 - "--redundant", "--sina", "--extract", "-s", "--stats-opts=\"--wadley --distance-matrices\""] 39 + "--redundant", "--extract", "-s", "--stats-opts=\"-r 20.0 --wadley --hire-rna --distance-matrices\""]
40 print(' '.join(command)) 40 print(' '.join(command))
41 subprocess.run(command) 41 subprocess.run(command)
...\ No newline at end of file ...\ No newline at end of file
......
...@@ -3,8 +3,9 @@ import subprocess, os, sys ...@@ -3,8 +3,9 @@ import subprocess, os, sys
3 3
4 # Put a list of problematic chains here, they will be properly deleted and recomputed 4 # Put a list of problematic chains here, they will be properly deleted and recomputed
5 problems = [ 5 problems = [
6 - "1k73_1_A", 6 + "7nhm_1_A_1-2923"
7 - "1k73_1_B" 7 + "4wfa_1_X_1-2923"
8 + "4wce_1_X_1-2923"
8 ] 9 ]
9 10
10 # provide the path to your data folders, the RNANet.db file, and the RNANet.py file as arguments to this script 11 # provide the path to your data folders, the RNANet.db file, and the RNANet.py file as arguments to this script
...@@ -22,6 +23,7 @@ for p in problems: ...@@ -22,6 +23,7 @@ for p in problems:
22 23
23 # Remove the datapoints files and 3D files 24 # Remove the datapoints files and 3D files
24 subprocess.run(["rm", '-f', path_to_3D_data + f"/rna_mapped_to_Rfam/{p}.cif"]) 25 subprocess.run(["rm", '-f', path_to_3D_data + f"/rna_mapped_to_Rfam/{p}.cif"])
26 + subprocess.run(["rm", '-f', path_to_3D_data + f"/rna_only/{p}.cif"])
25 files = [ f for f in os.listdir(path_to_3D_data + "/datapoints") if p in f ] 27 files = [ f for f in os.listdir(path_to_3D_data + "/datapoints") if p in f ]
26 for f in files: 28 for f in files:
27 subprocess.run(["rm", '-f', path_to_3D_data + f"/datapoints/{f}"]) 29 subprocess.run(["rm", '-f', path_to_3D_data + f"/datapoints/{f}"])
...@@ -38,14 +40,14 @@ for p in problems: ...@@ -38,14 +40,14 @@ for p in problems:
38 print(' '.join(command)) 40 print(' '.join(command))
39 subprocess.run(command) 41 subprocess.run(command)
40 42
41 - command = ["python3.8", path_to_RNANet, "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0", "--extract", "--only", p] 43 + command = ["python3.8", path_to_RNANet, "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "--redundant", "-r", "20.0", "--extract", "--only", p]
42 else: 44 else:
43 # Delete the chain from the database, and the associated nucleotides and re_mappings, using foreign keys 45 # Delete the chain from the database, and the associated nucleotides and re_mappings, using foreign keys
44 command = ["sqlite3", path_to_db, f"PRAGMA foreign_keys=ON; delete from chain where structure_id=\"{structure}\" and chain_name=\"{chain}\" and rfam_acc is null;"] 46 command = ["sqlite3", path_to_db, f"PRAGMA foreign_keys=ON; delete from chain where structure_id=\"{structure}\" and chain_name=\"{chain}\" and rfam_acc is null;"]
45 print(' '.join(command)) 47 print(' '.join(command))
46 subprocess.run(command) 48 subprocess.run(command)
47 49
48 - command = ["python3.8", path_to_RNANet, "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0", "--no-homology", "--extract", "--only", p] 50 + command = ["python3.8", path_to_RNANet, "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "--redundant", "-r", "20.0", "--no-homology", "--extract", "--only", p]
49 51
50 # Re-run RNANet 52 # Re-run RNANet
51 os.chdir(os.path.dirname(os.path.realpath(path_to_db)) + '/../') 53 os.chdir(os.path.dirname(os.path.realpath(path_to_db)) + '/../')
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
7 # Run this file if you want the base counts, pair-type counts, identity percents, etc 7 # Run this file if you want the base counts, pair-type counts, identity percents, etc
8 # in the database. 8 # in the database.
9 9
10 -import getopt, glob, json, os, sqlite3, shlex, subprocess, sys, warnings 10 +import getopt, json, os, sqlite3, shlex, subprocess, sys, warnings
11 import numpy as np 11 import numpy as np
12 import pandas as pd 12 import pandas as pd
13 import scipy.stats as st 13 import scipy.stats as st
...@@ -27,7 +27,6 @@ from tqdm import tqdm ...@@ -27,7 +27,6 @@ from tqdm import tqdm
27 from collections import Counter 27 from collections import Counter
28 from setproctitle import setproctitle 28 from setproctitle import setproctitle
29 from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_with_tqdm, trace_unhandled_exceptions 29 from RNAnet import Job, read_cpu_number, sql_ask_database, sql_execute, warn, notify, init_with_tqdm, trace_unhandled_exceptions
30 -from geometric_stats import *
31 30
32 np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf, precision=8) 31 np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf, precision=8)
33 path_to_3D_data = "tobedefinedbyoptions" 32 path_to_3D_data = "tobedefinedbyoptions"
...@@ -38,6 +37,8 @@ res_thr = 20.0 # default: all structures ...@@ -38,6 +37,8 @@ res_thr = 20.0 # default: all structures
38 LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112 37 LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112
39 SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111 38 SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111
40 39
40 +from geometric_stats import * # after definition of the variables
41 +
41 @trace_unhandled_exceptions 42 @trace_unhandled_exceptions
42 def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=2.0): 43 def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=2.0):
43 """ 44 """
...@@ -934,7 +935,7 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s): ...@@ -934,7 +935,7 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s):
934 coordinates = nt_3d_centers(filename, consider_all_atoms) 935 coordinates = nt_3d_centers(filename, consider_all_atoms)
935 if not len(coordinates): 936 if not len(coordinates):
936 # there is not nucleotides in the file, or no C1' atoms for example. 937 # there is not nucleotides in the file, or no C1' atoms for example.
937 - warn("No C1' atoms in " + filename) 938 + warn("No C1' atoms in " + filename.split('/')[-1] + ", ignoring")
938 return None, None, None 939 return None, None, None
939 except FileNotFoundError: 940 except FileNotFoundError:
940 return None, None, None 941 return None, None, None
...@@ -951,8 +952,8 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s): ...@@ -951,8 +952,8 @@ def par_distance_matrix(filelist, f, label, cm_coords, consider_all_atoms, s):
951 try: 952 try:
952 coordinates_with_gaps.append(coordinates[i - nb_gap]) 953 coordinates_with_gaps.append(coordinates[i - nb_gap])
953 except IndexError as e: 954 except IndexError as e:
954 - warn(f"{filename} : {s.seq} at position {i}, we get {e}.", error=True) 955 + warn(f"{filename.split('/')[-1]} : {s.seq} at position {i}, we get {e}.", error=True)
955 - exit(0) 956 + return None, None, None
956 957
957 # Build the pairwise distances 958 # Build the pairwise distances
958 d = np.zeros((len(s.seq), len(s.seq)), dtype=np.float32) 959 d = np.zeros((len(s.seq), len(s.seq)), dtype=np.float32)
...@@ -1099,7 +1100,7 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False): ...@@ -1099,7 +1100,7 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False):
1099 std = np.divide(std, counts, where=counts>0, out=np.full_like(std, np.NaN)) 1100 std = np.divide(std, counts, where=counts>0, out=np.full_like(std, np.NaN))
1100 mask = np.invert(np.isnan(std)) 1101 mask = np.invert(np.isnan(std))
1101 value = std[mask] - np.power(avg[mask], 2) 1102 value = std[mask] - np.power(avg[mask], 2)
1102 - if ((value[value<0] < -1e-2).any()): 1103 + if ((value[value < 0] < -1e-2).any()):
1103 warn("Erasing very negative variance value !") 1104 warn("Erasing very negative variance value !")
1104 value[value<0] = 0.0 # floating point problems ! 1105 value[value<0] = 0.0 # floating point problems !
1105 std[mask] = np.sqrt(value) 1106 std[mask] = np.sqrt(value)
...@@ -1127,8 +1128,48 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False): ...@@ -1127,8 +1128,48 @@ def get_avg_std_distance_matrix(f, consider_all_atoms, multithread=False):
1127 if not multithread: 1128 if not multithread:
1128 idxQueue.put(thr_idx) # replace the thread index in the queue 1129 idxQueue.put(thr_idx) # replace the thread index in the queue
1129 setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") 1130 setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
1131 + else:
1132 + # basically, for the rRNAs
1133 + # we delete the unique csv files for each chain, they wheight hundreds of gigabytes together
1134 + warn(f"Removing {f} ({label}) individual distance matrices, they weight too much. keeping the averages and standard deviations.")
1135 + for csv in glob.glob(runDir + '/results/distance_matrices/' + f + '_'+ label + "/*-" + f + ".csv"):
1136 + try:
1137 + os.remove(csv)
1138 + except FileNotFoundError:
1139 + pass
1130 return 0 1140 return 0
1131 1141
1142 +@trace_unhandled_exceptions
1143 +def measure_from_structure(f):
1144 + """
1145 + Do geometric measures required on a given filename
1146 + """
1147 +
1148 + name = f.split('.')[0]
1149 +
1150 + global idxQueue
1151 + thr_idx = idxQueue.get()
1152 + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} measure_from_structure({f})")
1153 +
1154 + # Open the structure
1155 + with warnings.catch_warnings():
1156 + # Ignore the PDB problems. This mostly warns that some chain is discontinuous.
1157 + warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.PDBConstructionWarning)
1158 + warnings.simplefilter('ignore', Bio.PDB.PDBExceptions.BiopythonWarning)
1159 + parser = MMCIFParser()
1160 + s = parser.get_structure(f, os.path.abspath(path_to_3D_data+ "rna_only/" + f))
1161 +
1162 + #pyle_measures(name, s, thr_idx)
1163 + measures_aa(name, s, thr_idx)
1164 + if DO_HIRE_RNA_MEASURES:
1165 + measures_hrna(name, s, thr_idx)
1166 + measures_hrna_basepairs(name, s, path_to_3D_data, thr_idx)
1167 + if DO_WADLEY_ANALYSIS:
1168 + measures_pyle(name, s, thr_idx)
1169 +
1170 + idxQueue.put(thr_idx) # replace the thread index in the queue
1171 + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
1172 +
1132 def family_order(f): 1173 def family_order(f):
1133 # sort the RNA families so that the plots are readable 1174 # sort the RNA families so that the plots are readable
1134 1175
...@@ -1154,7 +1195,11 @@ def nt_3d_centers(cif_file, consider_all_atoms): ...@@ -1154,7 +1195,11 @@ def nt_3d_centers(cif_file, consider_all_atoms):
1154 try: 1195 try:
1155 structure = MMCIFParser().get_structure(cif_file, cif_file) 1196 structure = MMCIFParser().get_structure(cif_file, cif_file)
1156 except Exception as e: 1197 except Exception as e:
1157 - warn(f"{cif_file} : {e}", error=True) 1198 + warn(f"{cif_file.split('/')[-1]} : {e}", error=True)
1199 + with open(runDir + "/errors.txt", "a") as f:
1200 + f.write(f"Exception in nt_3d_centers({cif_file.split('/')[-1]})\n")
1201 + f.write(str(e))
1202 + f.write("\n\n")
1158 return result 1203 return result
1159 for model in structure: 1204 for model in structure:
1160 for chain in model: 1205 for chain in model:
...@@ -1205,6 +1250,7 @@ def log_to_pbar(pbar): ...@@ -1205,6 +1250,7 @@ def log_to_pbar(pbar):
1205 pbar.update(1) 1250 pbar.update(1)
1206 return update 1251 return update
1207 1252
1253 +@trace_unhandled_exceptions
1208 def process_jobs(joblist): 1254 def process_jobs(joblist):
1209 """ 1255 """
1210 Starts a Pool to run the Job() objects in joblist. 1256 Starts a Pool to run the Job() objects in joblist.
...@@ -1302,7 +1348,7 @@ if __name__ == "__main__": ...@@ -1302,7 +1348,7 @@ if __name__ == "__main__":
1302 os.makedirs(runDir + "/results/figures/GMM/HiRE-RNA/angles/", exist_ok=True) 1348 os.makedirs(runDir + "/results/figures/GMM/HiRE-RNA/angles/", exist_ok=True)
1303 os.makedirs(runDir + "/results/figures/GMM/HiRE-RNA/torsions/", exist_ok=True) 1349 os.makedirs(runDir + "/results/figures/GMM/HiRE-RNA/torsions/", exist_ok=True)
1304 os.makedirs(runDir + "/results/figures/GMM/HiRE-RNA/basepairs/", exist_ok=True) 1350 os.makedirs(runDir + "/results/figures/GMM/HiRE-RNA/basepairs/", exist_ok=True)
1305 - elif opt == "rescan-nmodes": 1351 + elif opt == "--rescan-nmodes":
1306 RESCAN_GMM_COMP_NUM = True 1352 RESCAN_GMM_COMP_NUM = True
1307 1353
1308 # Load mappings. famlist will contain only families with structures at this resolution threshold. 1354 # Load mappings. famlist will contain only families with structures at this resolution threshold.
...@@ -1388,7 +1434,6 @@ if __name__ == "__main__": ...@@ -1388,7 +1434,6 @@ if __name__ == "__main__":
1388 if path.isfile(path_to_3D_data + "datapoints/" + f.split('.')[0]): 1434 if path.isfile(path_to_3D_data + "datapoints/" + f.split('.')[0]):
1389 joblist.append(Job(function=measure_from_structure, args=(f,), how_many_in_parallel=nworkers)) # All-atom distances 1435 joblist.append(Job(function=measure_from_structure, args=(f,), how_many_in_parallel=nworkers)) # All-atom distances
1390 1436
1391 -
1392 process_jobs(joblist) 1437 process_jobs(joblist)
1393 1438
1394 # Now process the memory-heavy tasks family by family 1439 # Now process the memory-heavy tasks family by family
...@@ -1412,24 +1457,23 @@ if __name__ == "__main__": ...@@ -1412,24 +1457,23 @@ if __name__ == "__main__":
1412 general_stats() 1457 general_stats()
1413 os.makedirs(runDir+"/results/figures/GMM/", exist_ok=True) 1458 os.makedirs(runDir+"/results/figures/GMM/", exist_ok=True)
1414 os.makedirs(runDir+"/results/geometry/json/", exist_ok=True) 1459 os.makedirs(runDir+"/results/geometry/json/", exist_ok=True)
1415 - concat_dataframes(runDir + '/results/geometry/all-atoms/distances/', 'dist_atoms.csv') 1460 + concat_dataframes(runDir + '/results/geometry/all-atoms/distances/', 'dist_atoms.csv', nworkers)
1416 if DO_HIRE_RNA_MEASURES: 1461 if DO_HIRE_RNA_MEASURES:
1417 - concat_dataframes(runDir + '/results/geometry/HiRE-RNA/distances/', 'distances_HiRERNA.csv') 1462 + concat_dataframes(runDir + '/results/geometry/HiRE-RNA/distances/', 'distances_HiRERNA.csv', nworkers)
1418 - concat_dataframes(runDir + '/results/geometry/HiRE-RNA/angles/', 'angles_HiRERNA.csv') 1463 + concat_dataframes(runDir + '/results/geometry/HiRE-RNA/angles/', 'angles_HiRERNA.csv', nworkers)
1419 - concat_dataframes(runDir + '/results/geometry/HiRE-RNA/torsions/', 'torsions_HiRERNA.csv') 1464 + concat_dataframes(runDir + '/results/geometry/HiRE-RNA/torsions/', 'torsions_HiRERNA.csv', nworkers)
1420 - concat_dataframes(runDir + '/results/geometry/HiRE-RNA/basepairs/', 'basepairs_HiRERNA.csv') 1465 + concat_dataframes(runDir + '/results/geometry/HiRE-RNA/basepairs/', 'basepairs_HiRERNA.csv', nworkers)
1421 if DO_WADLEY_ANALYSIS: 1466 if DO_WADLEY_ANALYSIS:
1422 - concat_dataframes(runDir + '/results/geometry/Pyle/distances/', 'distances_pyle.csv') 1467 + concat_dataframes(runDir + '/results/geometry/Pyle/distances/', 'distances_pyle.csv', nworkers)
1423 - concat_dataframes(runDir + '/results/geometry/Pyle/angles/', 'flat_angles_pyle.csv') 1468 + concat_dataframes(runDir + '/results/geometry/Pyle/angles/', 'flat_angles_pyle.csv', nworkers)
1424 joblist = [] 1469 joblist = []
1425 - joblist.append(Job(function=gmm_aa_dists, args=(RESCAN_GMM_COMP_NUM))) 1470 + joblist.append(Job(function=gmm_aa_dists, args=(RESCAN_GMM_COMP_NUM,)))
1426 - joblist.append(Job(function=gmm_aa_torsions, args=(RESCAN_GMM_COMP_NUM))) 1471 + joblist.append(Job(function=gmm_aa_torsions, args=(RESCAN_GMM_COMP_NUM, res_thr)))
1427 if DO_HIRE_RNA_MEASURES: 1472 if DO_HIRE_RNA_MEASURES:
1428 - joblist.append(Job(function=gmm_hrna, args=(RESCAN_GMM_COMP_NUM))) 1473 + joblist.append(Job(function=gmm_hrna, args=(RESCAN_GMM_COMP_NUM,)))
1429 - joblist.append(Job(function=gmm_hrna_basepairs, args=(RESCAN_GMM_COMP_NUM))) 1474 + joblist.append(Job(function=gmm_hrna_basepairs, args=(RESCAN_GMM_COMP_NUM,)))
1430 if DO_WADLEY_ANALYSIS: 1475 if DO_WADLEY_ANALYSIS:
1431 - joblist.append(Job(function=gmm_pyle, args=(RESCAN_GMM_COMP_NUM))) 1476 + joblist.append(Job(function=gmm_pyle, args=(RESCAN_GMM_COMP_NUM, res_thr)))
1432 - if len(joblist):
1433 process_jobs(joblist) 1477 process_jobs(joblist)
1434 merge_jsons() 1478 merge_jsons()
1435 1479
......