Showing 4 changed files with 363 additions and 84 deletions
... | @@ -18,7 +18,7 @@ from os import path, makedirs | ... | @@ -18,7 +18,7 @@ from os import path, makedirs |
18 | from multiprocessing import Pool, Manager, set_start_method | 18 | from multiprocessing import Pool, Manager, set_start_method |
19 | from time import sleep | 19 | from time import sleep |
20 | from tqdm import tqdm | 20 | from tqdm import tqdm |
21 | -from tqdm.contrib.concurrent import process_map | 21 | +from setproctitle import setproctitle |
22 | 22 | ||
23 | def trace_unhandled_exceptions(func): | 23 | def trace_unhandled_exceptions(func): |
24 | @wraps(func) | 24 | @wraps(func) |
... | @@ -169,7 +169,8 @@ class Chain: | ... | @@ -169,7 +169,8 @@ class Chain: |
169 | def extract(self, df, khetatm): | 169 | def extract(self, df, khetatm): |
170 | """ Extract the part which is mapped to Rfam from the main CIF file and save it to another file. | 170 | """ Extract the part which is mapped to Rfam from the main CIF file and save it to another file. |
171 | """ | 171 | """ |
172 | - | 172 | + setproctitle(f"RNANet.py {self.chain_label} extract()") |
173 | + | ||
173 | if self.mapping is not None: | 174 | if self.mapping is not None: |
174 | status = f"Extract {self.mapping.nt_start}-{self.mapping.nt_end} atoms from {self.pdb_id}-{self.pdb_chain_id}" | 175 | status = f"Extract {self.mapping.nt_start}-{self.mapping.nt_end} atoms from {self.pdb_id}-{self.pdb_chain_id}" |
175 | self.file = path_to_3D_data+"rna_mapped_to_Rfam/"+self.chain_label+".cif" | 176 | self.file = path_to_3D_data+"rna_mapped_to_Rfam/"+self.chain_label+".cif" |
... | @@ -213,6 +214,8 @@ class Chain: | ... | @@ -213,6 +214,8 @@ class Chain: |
213 | def extract_3D_data(self, save_logs=True): | 214 | def extract_3D_data(self, save_logs=True): |
214 | """ Maps DSSR annotations to the chain. """ | 215 | """ Maps DSSR annotations to the chain. """ |
215 | 216 | ||
217 | + setproctitle(f"RNANet.py {self.chain_label} extract_3D_data()") | ||
218 | + | ||
216 | ############################################ | 219 | ############################################ |
217 | # Load the mmCIF annotations from file | 220 | # Load the mmCIF annotations from file |
218 | ############################################ | 221 | ############################################ |
... | @@ -311,6 +314,10 @@ class Chain: | ... | @@ -311,6 +314,10 @@ class Chain: |
311 | # Common 4v9q-DV case (and similar ones) : e.g. chains contains 17 and 17A which are both read 17 by DSSR. | 314 | # Common 4v9q-DV case (and similar ones) : e.g. chains contains 17 and 17A which are both read 17 by DSSR. |
312 | # Solution : we shift the numbering of 17A (to 18) and the following residues. | 315 | # Solution : we shift the numbering of 17A (to 18) and the following residues. |
313 | df.iloc[i:, 1] += 1 | 316 | df.iloc[i:, 1] += 1 |
317 | + elif duplicates.iloc[0,0] == 1 and df.iloc[i,0] == 3: | ||
318 | + # 4wzo_1_1J case, there is a residue numbered -1 and read as 1 before the number 0. | ||
319 | + df.iloc[1:, 1] += 1 | ||
320 | + df.iloc[0, 1] = 0 | ||
314 | else: | 321 | else: |
315 | # 4v9k-DA case (and similar ones) : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ... | 322 | # 4v9k-DA case (and similar ones) : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ... |
316 | # Here the 163B is read 163 by DSSR, but there already is a residue 163. | 323 | # Here the 163B is read 163 by DSSR, but there already is a residue 163. |
... | @@ -323,7 +330,6 @@ class Chain: | ... | @@ -323,7 +330,6 @@ class Chain: |
323 | self.error_messages = f"Error with parsing of duplicate residues numbers." | 330 | self.error_messages = f"Error with parsing of duplicate residues numbers." |
324 | return None | 331 | return None |
325 | 332 | ||
326 | - | ||
327 | # Search for ligands at the end of the selection | 333 | # Search for ligands at the end of the selection |
328 | # Drop ligands detected as residues by DSSR, by detecting several markers | 334 | # Drop ligands detected as residues by DSSR, by detecting several markers |
329 | while ( len(df.index_chain) and df.iloc[-1,2] not in ["A", "C", "G", "U"] and ( | 335 | while ( len(df.index_chain) and df.iloc[-1,2] not in ["A", "C", "G", "U"] and ( |
... | @@ -338,7 +344,6 @@ class Chain: | ... | @@ -338,7 +344,6 @@ class Chain: |
338 | self.mapping.log(df.tail(1)) | 344 | self.mapping.log(df.tail(1)) |
339 | df = df.head(-1) | 345 | df = df.head(-1) |
340 | 346 | ||
341 | - | ||
342 | # Duplicates in index_chain : drop, they are ligands | 347 | # Duplicates in index_chain : drop, they are ligands |
343 | # e.g. 3iwn_1_B_1-91, ligand C2E has index_chain 1 (and nt_resnum 601) | 348 | # e.g. 3iwn_1_B_1-91, ligand C2E has index_chain 1 (and nt_resnum 601) |
344 | duplicates = [ index for index, element in enumerate(df.duplicated(['index_chain']).values) if element ] | 349 | duplicates = [ index for index, element in enumerate(df.duplicated(['index_chain']).values) if element ] |
... | @@ -384,7 +389,7 @@ class Chain: | ... | @@ -384,7 +389,7 @@ class Chain: |
384 | df.iloc[i+1:, 1] += 1 | 389 | df.iloc[i+1:, 1] += 1 |
385 | else: | 390 | else: |
386 | warn(f"Missing index_chain {i} in {self.chain_label} !") | 391 | warn(f"Missing index_chain {i} in {self.chain_label} !") |
387 | - | 392 | + |
388 | # Assert some nucleotides still exist | 393 | # Assert some nucleotides still exist |
389 | try: | 394 | try: |
390 | l = df.iloc[-1,1] - df.iloc[0,1] + 1 # update length of chain from nt_resnum point of view | 395 | l = df.iloc[-1,1] - df.iloc[0,1] + 1 # update length of chain from nt_resnum point of view |
... | @@ -522,6 +527,8 @@ class Chain: | ... | @@ -522,6 +527,8 @@ class Chain: |
522 | """Saves the extracted 3D data to the database. | 527 | """Saves the extracted 3D data to the database. |
523 | """ | 528 | """ |
524 | 529 | ||
530 | + setproctitle(f"RNANet.py {self.chain_label} register_chain()") | ||
531 | + | ||
525 | with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: | 532 | with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: |
526 | # Register the chain in table chain | 533 | # Register the chain in table chain |
527 | if self.mapping is not None: | 534 | if self.mapping is not None: |
... | @@ -575,6 +582,8 @@ class Chain: | ... | @@ -575,6 +582,8 @@ class Chain: |
575 | s_seq: the aligned version of self.seq_to_align | 582 | s_seq: the aligned version of self.seq_to_align |
576 | """ | 583 | """ |
577 | 584 | ||
585 | + setproctitle(f"RNANet.py {self.chain_label} remap()") | ||
586 | + | ||
578 | alilen = len(s_seq) | 587 | alilen = len(s_seq) |
579 | re_mappings = [] | 588 | re_mappings = [] |
580 | 589 | ||
... | @@ -630,6 +639,8 @@ class Chain: | ... | @@ -630,6 +639,8 @@ class Chain: |
630 | REQUIRES align_column and re_mapping up to date | 639 | REQUIRES align_column and re_mapping up to date |
631 | """ | 640 | """ |
632 | 641 | ||
642 | + setproctitle(f"RNANet.py {self.chain_label} replace_gaps()") | ||
643 | + | ||
633 | homology_data = sql_ask_database(conn, f"""SELECT freq_A, freq_C, freq_G, freq_U, freq_other FROM | 644 | homology_data = sql_ask_database(conn, f"""SELECT freq_A, freq_C, freq_G, freq_U, freq_other FROM |
634 | (SELECT chain_id, rfam_acc FROM chain WHERE chain_id={self.db_chain_id}) | 645 | (SELECT chain_id, rfam_acc FROM chain WHERE chain_id={self.db_chain_id}) |
635 | NATURAL JOIN re_mapping | 646 | NATURAL JOIN re_mapping |
... | @@ -741,6 +752,9 @@ class Downloader: | ... | @@ -741,6 +752,9 @@ class Downloader: |
741 | """Query the Rfam public MySQL database for mappings between their RNA families and PDB structures. | 752 | """Query the Rfam public MySQL database for mappings between their RNA families and PDB structures. |
742 | 753 | ||
743 | """ | 754 | """ |
755 | + | ||
756 | + setproctitle(f"RNANet.py download_Rfam_PDB_mappings()") | ||
757 | + | ||
744 | # Download PDB mappings to Rfam family | 758 | # Download PDB mappings to Rfam family |
745 | print("> Fetching latest PDB mappings from Rfam..." + " " * 29, end='', flush=True) | 759 | print("> Fetching latest PDB mappings from Rfam..." + " " * 29, end='', flush=True) |
746 | try: | 760 | try: |
... | @@ -766,6 +780,8 @@ class Downloader: | ... | @@ -766,6 +780,8 @@ class Downloader: |
766 | Does not download if already there. | 780 | Does not download if already there. |
767 | """ | 781 | """ |
768 | 782 | ||
783 | + setproctitle(f"RNANet.py download_Rfam_cm()") | ||
784 | + | ||
769 | print(f"\t> Download Rfam.cm.gz from Rfam..." + " " * 37, end='', flush=True) | 785 | print(f"\t> Download Rfam.cm.gz from Rfam..." + " " * 37, end='', flush=True) |
770 | if not path.isfile(path_to_seq_data + "Rfam.cm"): | 786 | if not path.isfile(path_to_seq_data + "Rfam.cm"): |
771 | try: | 787 | try: |
... | @@ -785,6 +801,9 @@ class Downloader: | ... | @@ -785,6 +801,9 @@ class Downloader: |
785 | Family ID, number of sequences identified, maximum length of those sequences. | 801 | Family ID, number of sequences identified, maximum length of those sequences. |
786 | SETS family in the database (partially) | 802 | SETS family in the database (partially) |
787 | """ | 803 | """ |
804 | + | ||
805 | + setproctitle(f"RNANet.py download_Rfam_family_stats()") | ||
806 | + | ||
788 | try: | 807 | try: |
789 | db_connection = sqlalchemy.create_engine('mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam') | 808 | db_connection = sqlalchemy.create_engine('mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam') |
790 | 809 | ||
... | @@ -829,6 +848,8 @@ class Downloader: | ... | @@ -829,6 +848,8 @@ class Downloader: |
829 | 848 | ||
830 | Actually gets a FASTA archive from the public Rfam FTP. Does not download if already there.""" | 849 | Actually gets a FASTA archive from the public Rfam FTP. Does not download if already there.""" |
831 | 850 | ||
851 | + setproctitle(f"RNANet.py download_Rfam_sequences({rfam_acc})") | ||
852 | + | ||
832 | if not path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"): | 853 | if not path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"): |
833 | for _ in range(10): # retry 100 times if it fails | 854 | for _ in range(10): # retry 100 times if it fails |
834 | try: | 855 | try: |
... | @@ -849,6 +870,9 @@ class Downloader: | ... | @@ -849,6 +870,9 @@ class Downloader: |
849 | 870 | ||
850 | Does not remove structural redundancy. | 871 | Does not remove structural redundancy. |
851 | """ | 872 | """ |
873 | + | ||
874 | + setproctitle(f"RNANet.py download_BGSU_NR_list({res})") | ||
875 | + | ||
852 | nr_code = min([ i for i in [1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 20.0] if i >= res ]) | 876 | nr_code = min([ i for i in [1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 20.0] if i >= res ]) |
853 | print(f"> Fetching latest list of RNA files at {nr_code} A resolution from BGSU website...", end='', flush=True) | 877 | print(f"> Fetching latest list of RNA files at {nr_code} A resolution from BGSU website...", end='', flush=True) |
854 | # Download latest BGSU non-redundant list | 878 | # Download latest BGSU non-redundant list |
... | @@ -875,6 +899,10 @@ class Downloader: | ... | @@ -875,6 +899,10 @@ class Downloader: |
875 | return full_structures_list # list of ( str (class), str (class_members) ) | 899 | return full_structures_list # list of ( str (class), str (class_members) ) |
876 | 900 | ||
877 | def download_from_SILVA(self, unit): | 901 | def download_from_SILVA(self, unit): |
902 | + | ||
903 | + setproctitle(f"RNANet.py download_from_SILVA({unit})") | ||
904 | + | ||
905 | + | ||
878 | if not path.isfile(path_to_seq_data + f"realigned/{unit}.arb"): | 906 | if not path.isfile(path_to_seq_data + f"realigned/{unit}.arb"): |
879 | try: | 907 | try: |
880 | print(f"Downloading {unit} from SILVA...", end='', flush=True) | 908 | print(f"Downloading {unit} from SILVA...", end='', flush=True) |
... | @@ -989,6 +1017,8 @@ class Pipeline: | ... | @@ -989,6 +1017,8 @@ class Pipeline: |
989 | global path_to_3D_data | 1017 | global path_to_3D_data |
990 | global path_to_seq_data | 1018 | global path_to_seq_data |
991 | 1019 | ||
1020 | + setproctitle("RNANet.py process_options()") | ||
1021 | + | ||
992 | try: | 1022 | try: |
993 | opts, _ = getopt.getopt( sys.argv[1:], "r:hs", | 1023 | opts, _ = getopt.getopt( sys.argv[1:], "r:hs", |
994 | [ "help", "resolution=", "keep-hetatm=", "from-scratch", | 1024 | [ "help", "resolution=", "keep-hetatm=", "from-scratch", |
... | @@ -1105,13 +1135,16 @@ class Pipeline: | ... | @@ -1105,13 +1135,16 @@ class Pipeline: |
1105 | print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments") | 1135 | print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments") |
1106 | print("See RNANet.py --help for more information.") | 1136 | print("See RNANet.py --help for more information.") |
1107 | exit(1) | 1137 | exit(1) |
1108 | - | 1138 | + |
1139 | + @trace_unhandled_exceptions | ||
1109 | def list_available_mappings(self): | 1140 | def list_available_mappings(self): |
1110 | """List 3D chains with available Rfam mappings. | 1141 | """List 3D chains with available Rfam mappings. |
1111 | 1142 | ||
1112 | Return a list of Chain() objects with the mappings set up. | 1143 | Return a list of Chain() objects with the mappings set up. |
1113 | If self.HOMOLOGY is set to False, simply returns a list of Chain() objects with available 3D chains.""" | 1144 | If self.HOMOLOGY is set to False, simply returns a list of Chain() objects with available 3D chains.""" |
1114 | 1145 | ||
1146 | + setproctitle("RNANet.py list_available_mappings()") | ||
1147 | + | ||
1115 | # List all 3D RNA chains below given resolution | 1148 | # List all 3D RNA chains below given resolution |
1116 | full_structures_list = self.dl.download_BGSU_NR_list(self.CRYSTAL_RES) # list of tuples ( class, class_members ) | 1149 | full_structures_list = self.dl.download_BGSU_NR_list(self.CRYSTAL_RES) # list of tuples ( class, class_members ) |
1117 | 1150 | ||
... | @@ -1131,11 +1164,11 @@ class Pipeline: | ... | @@ -1131,11 +1164,11 @@ class Pipeline: |
1131 | # Compute the list of mappable structures using NR-list and Rfam-PDB mappings | 1164 | # Compute the list of mappable structures using NR-list and Rfam-PDB mappings |
1132 | # And get Chain() objects | 1165 | # And get Chain() objects |
1133 | print("> Building list of structures...", flush=True) | 1166 | print("> Building list of structures...", flush=True) |
1134 | - p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=ncores) | 1167 | + p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=ncores, maxtasksperchild=1) |
1135 | try: | 1168 | try: |
1136 | 1169 | ||
1137 | - pbar = tqdm(full_structures_list, maxinterval=1.0, miniters=1, bar_format="{percentage:3.0f}%|{bar}|") | 1170 | + pbar = tqdm(full_structures_list, maxinterval=1.0, miniters=1, desc="Eq. classes", bar_format="{percentage:3.0f}%|{bar}|") |
1138 | - for _, newchains in enumerate(p.imap_unordered(partial(work_infer_mappings, not self.REUSE_ALL, allmappings), full_structures_list)): | 1171 | + for _, newchains in enumerate(p.imap_unordered(partial(work_infer_mappings, not self.REUSE_ALL, allmappings), full_structures_list, chunksize=1)): |
1139 | self.update += newchains | 1172 | self.update += newchains |
1140 | pbar.update(1) # Everytime the iteration finishes, update the global progress bar | 1173 | pbar.update(1) # Everytime the iteration finishes, update the global progress bar |
1141 | 1174 | ||
... | @@ -1161,7 +1194,7 @@ class Pipeline: | ... | @@ -1161,7 +1194,7 @@ class Pipeline: |
1161 | pdb_chain_id = nr[2].upper() | 1194 | pdb_chain_id = nr[2].upper() |
1162 | chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}" | 1195 | chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}" |
1163 | res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc IS NULL AND issue=0""") | 1196 | res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc IS NULL AND issue=0""") |
1164 | - if not len(res): # the chain is NOT yet in the database, or this is a known issue | 1197 | + if not len(res) or self.REUSE_ALL: # the chain is NOT yet in the database, or this is a known issue |
1165 | self.update.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class)) | 1198 | self.update.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class)) |
1166 | conn.close() | 1199 | conn.close() |
1167 | 1200 | ||
... | @@ -1179,6 +1212,8 @@ class Pipeline: | ... | @@ -1179,6 +1212,8 @@ class Pipeline: |
1179 | REQUIRES the previous definition of self.update, so call list_available_mappings() before. | 1212 | REQUIRES the previous definition of self.update, so call list_available_mappings() before. |
1180 | SETS table structure""" | 1213 | SETS table structure""" |
1181 | 1214 | ||
1215 | + setproctitle(f"RNANet.py dl_and_annotate(retry={retry})") | ||
1216 | + | ||
1182 | # Prepare the results folders | 1217 | # Prepare the results folders |
1183 | if not path.isdir(path_to_3D_data + "RNAcifs"): | 1218 | if not path.isdir(path_to_3D_data + "RNAcifs"): |
1184 | os.makedirs(path_to_3D_data + "RNAcifs") # for the whole structures | 1219 | os.makedirs(path_to_3D_data + "RNAcifs") # for the whole structures |
... | @@ -1186,15 +1221,15 @@ class Pipeline: | ... | @@ -1186,15 +1221,15 @@ class Pipeline: |
1186 | os.makedirs(path_to_3D_data + "annotations") # for DSSR analysis of the whole structures | 1221 | os.makedirs(path_to_3D_data + "annotations") # for DSSR analysis of the whole structures |
1187 | 1222 | ||
1188 | # Download and annotate | 1223 | # Download and annotate |
1189 | - print("> Downloading and annotating structures...", flush=True) | 1224 | + print("> Downloading and annotating structures (or checking previous results if they exist)...", flush=True) |
1190 | if retry: | 1225 | if retry: |
1191 | mmcif_list = sorted(set([ c.pdb_id for c in self.retry ])) | 1226 | mmcif_list = sorted(set([ c.pdb_id for c in self.retry ])) |
1192 | else: | 1227 | else: |
1193 | - mmcif_list = sorted(set([ c.pdb_id for c in self.update if not path.isfile(path_to_3D_data + "annotations/" + c.pdb_id + ".json") ])) | 1228 | + mmcif_list = sorted(set([ c.pdb_id for c in self.update ])) |
1194 | try: | 1229 | try: |
1195 | - p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=int(coeff_ncores*ncores)) | 1230 | + p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=int(coeff_ncores*ncores), maxtasksperchild=1) |
1196 | pbar = tqdm(mmcif_list, maxinterval=1.0, miniters=1, desc="mmCIF files") | 1231 | pbar = tqdm(mmcif_list, maxinterval=1.0, miniters=1, desc="mmCIF files") |
1197 | - for _ in p.imap_unordered(work_mmcif, mmcif_list): | 1232 | + for _ in p.imap_unordered(work_mmcif, mmcif_list, chunksize=1): |
1198 | pbar.update(1) # Everytime the iteration finishes, update the global progress bar | 1233 | pbar.update(1) # Everytime the iteration finishes, update the global progress bar |
1199 | pbar.close() | 1234 | pbar.close() |
1200 | p.close() | 1235 | p.close() |
... | @@ -1213,6 +1248,8 @@ class Pipeline: | ... | @@ -1213,6 +1248,8 @@ class Pipeline: |
1213 | REQUIRES the previous definition of self.update, so call list_available_mappings() before. | 1248 | REQUIRES the previous definition of self.update, so call list_available_mappings() before. |
1214 | SETS self.loaded_chains""" | 1249 | SETS self.loaded_chains""" |
1215 | 1250 | ||
1251 | + setproctitle(f"RNANet.py build_chains(retry={retry})") | ||
1252 | + | ||
1216 | # Prepare folders | 1253 | # Prepare folders |
1217 | if self.EXTRACT_CHAINS: | 1254 | if self.EXTRACT_CHAINS: |
1218 | if self.HOMOLOGY and not path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"): | 1255 | if self.HOMOLOGY and not path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"): |
... | @@ -1239,19 +1276,25 @@ class Pipeline: | ... | @@ -1239,19 +1276,25 @@ class Pipeline: |
1239 | exit(1) | 1276 | exit(1) |
1240 | 1277 | ||
1241 | # If there were newly discovered problems, add this chain to the known issues | 1278 | # If there were newly discovered problems, add this chain to the known issues |
1279 | + issues = 0 | ||
1280 | + issues_names = [] | ||
1242 | ki = open(runDir + "/known_issues.txt", 'a') | 1281 | ki = open(runDir + "/known_issues.txt", 'a') |
1243 | kir = open(runDir + "/known_issues_reasons.txt", 'a') | 1282 | kir = open(runDir + "/known_issues_reasons.txt", 'a') |
1244 | for c in results: | 1283 | for c in results: |
1245 | if c[1].delete_me and c[1].chain_label not in self.known_issues: | 1284 | if c[1].delete_me and c[1].chain_label not in self.known_issues: |
1246 | if retry or "Could not load existing" not in c[1].error_messages: | 1285 | if retry or "Could not load existing" not in c[1].error_messages: |
1247 | self.known_issues.append(c[1].chain_label) | 1286 | self.known_issues.append(c[1].chain_label) |
1248 | - warn(f"Adding {c[1].chain_label} to known issues.") | 1287 | + issues += 1 |
1288 | + issues_names.append(c[1].chain_label) | ||
1249 | ki.write(c[1].chain_label + '\n') | 1289 | ki.write(c[1].chain_label + '\n') |
1250 | kir.write(c[1].chain_label + '\n' + c[1].error_messages + '\n\n') | 1290 | kir.write(c[1].chain_label + '\n' + c[1].error_messages + '\n\n') |
1251 | with sqlite3.connect(runDir+"/results/RNANet.db") as conn: | 1291 | with sqlite3.connect(runDir+"/results/RNANet.db") as conn: |
1252 | sql_execute(conn, f"UPDATE chain SET issue = 1 WHERE chain_id = ?;", data=(c[1].db_chain_id,)) | 1292 | sql_execute(conn, f"UPDATE chain SET issue = 1 WHERE chain_id = ?;", data=(c[1].db_chain_id,)) |
1253 | ki.close() | 1293 | ki.close() |
1254 | kir.close() | 1294 | kir.close() |
1295 | + if issues: | ||
1296 | + warn("Added newly discovered issues to known issues:") | ||
1297 | + print("\033[33m"+ " ".join(issues_names) + "\033[0m", flush=True) | ||
1255 | 1298 | ||
1256 | # Add successfully built chains to list | 1299 | # Add successfully built chains to list |
1257 | self.loaded_chains += [ c[1] for c in results if not c[1].delete_me ] | 1300 | self.loaded_chains += [ c[1] for c in results if not c[1].delete_me ] |
... | @@ -1276,6 +1319,8 @@ class Pipeline: | ... | @@ -1276,6 +1319,8 @@ class Pipeline: |
1276 | REQUIRES that self.loaded_chains is defined. | 1319 | REQUIRES that self.loaded_chains is defined. |
1277 | SETS family (partially, through call)""" | 1320 | SETS family (partially, through call)""" |
1278 | 1321 | ||
1322 | + setproctitle("RNANet.py prepare_sequences()") | ||
1323 | + | ||
1279 | # Preparing a results folder | 1324 | # Preparing a results folder |
1280 | if not os.access(path_to_seq_data + "realigned/", os.F_OK): | 1325 | if not os.access(path_to_seq_data + "realigned/", os.F_OK): |
1281 | os.makedirs(path_to_seq_data + "realigned/") | 1326 | os.makedirs(path_to_seq_data + "realigned/") |
... | @@ -1308,6 +1353,8 @@ class Pipeline: | ... | @@ -1308,6 +1353,8 @@ class Pipeline: |
1308 | REQUIRES self.fam_list to be defined | 1353 | REQUIRES self.fam_list to be defined |
1309 | SETS family (partially)""" | 1354 | SETS family (partially)""" |
1310 | 1355 | ||
1356 | + setproctitle("RNANet.py realign()") | ||
1357 | + | ||
1311 | # Prepare the job list | 1358 | # Prepare the job list |
1312 | joblist = [] | 1359 | joblist = [] |
1313 | for f in self.fam_list: | 1360 | for f in self.fam_list: |
... | @@ -1345,6 +1392,8 @@ class Pipeline: | ... | @@ -1345,6 +1392,8 @@ class Pipeline: |
1345 | 1392 | ||
1346 | REQUIRES self.fam_list to be defined""" | 1393 | REQUIRES self.fam_list to be defined""" |
1347 | 1394 | ||
1395 | + setproctitle("RNANet.py remap()") | ||
1396 | + | ||
1348 | print("Computing nucleotide frequencies in alignments...\nThis can be very long on slow storage devices (Hard-drive...)") | 1397 | print("Computing nucleotide frequencies in alignments...\nThis can be very long on slow storage devices (Hard-drive...)") |
1349 | print("Check your CPU and disk I/O activity before deciding if the job failed.") | 1398 | print("Check your CPU and disk I/O activity before deciding if the job failed.") |
1350 | nworkers =max(min(ncores, len(self.fam_list)), 1) | 1399 | nworkers =max(min(ncores, len(self.fam_list)), 1) |
... | @@ -1357,11 +1406,11 @@ class Pipeline: | ... | @@ -1357,11 +1406,11 @@ class Pipeline: |
1357 | 1406 | ||
1358 | # Start a process pool to dispatch the RNA families, | 1407 | # Start a process pool to dispatch the RNA families, |
1359 | # over multiple CPUs (one family by CPU) | 1408 | # over multiple CPUs (one family by CPU) |
1360 | - p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers, maxtasksperchild=5) | 1409 | + p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers, maxtasksperchild=1) |
1361 | 1410 | ||
1362 | try: | 1411 | try: |
1363 | fam_pbar = tqdm(total=len(self.fam_list), desc="RNA families", position=0, leave=True) | 1412 | fam_pbar = tqdm(total=len(self.fam_list), desc="RNA families", position=0, leave=True) |
1364 | - for i, _ in enumerate(p.imap_unordered(partial(work_pssm, fill_gaps=self.FILL_GAPS), self.fam_list)): # Apply work_pssm to each RNA family | 1413 | + for i, _ in enumerate(p.imap_unordered(partial(work_pssm, fill_gaps=self.FILL_GAPS), self.fam_list, chunksize=1)): # Apply work_pssm to each RNA family |
1365 | fam_pbar.update(1) # Everytime the iteration finishes on a family, update the global progress bar over the RNA families | 1414 | fam_pbar.update(1) # Everytime the iteration finishes on a family, update the global progress bar over the RNA families |
1366 | fam_pbar.close() | 1415 | fam_pbar.close() |
1367 | p.close() | 1416 | p.close() |
... | @@ -1378,6 +1427,8 @@ class Pipeline: | ... | @@ -1378,6 +1427,8 @@ class Pipeline: |
1378 | 1427 | ||
1379 | REQUIRES self.loaded_chains (to output corresponding CSV files) and self.fam_list (for statistics)""" | 1428 | REQUIRES self.loaded_chains (to output corresponding CSV files) and self.fam_list (for statistics)""" |
1380 | 1429 | ||
1430 | + setproctitle("RNANet.py output_results()") | ||
1431 | + | ||
1381 | time_str = time.strftime("%Y%m%d") | 1432 | time_str = time.strftime("%Y%m%d") |
1382 | 1433 | ||
1383 | #Prepare folders: | 1434 | #Prepare folders: |
... | @@ -1387,10 +1438,10 @@ class Pipeline: | ... | @@ -1387,10 +1438,10 @@ class Pipeline: |
1387 | os.makedirs(runDir + "/results/archive/") | 1438 | os.makedirs(runDir + "/results/archive/") |
1388 | 1439 | ||
1389 | # Save to by-chain CSV files | 1440 | # Save to by-chain CSV files |
1390 | - p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=3) | 1441 | + p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=3, maxtasksperchild=1) |
1391 | try: | 1442 | try: |
1392 | pbar = tqdm(total=len(self.loaded_chains), desc="Saving chains to CSV", position=0, leave=True) | 1443 | pbar = tqdm(total=len(self.loaded_chains), desc="Saving chains to CSV", position=0, leave=True) |
1393 | - for _, _2 in enumerate(p.imap_unordered(work_save, self.loaded_chains)): | 1444 | + for _, _2 in enumerate(p.imap_unordered(work_save, self.loaded_chains, chunksize=2)): |
1394 | pbar.update(1) | 1445 | pbar.update(1) |
1395 | pbar.close() | 1446 | pbar.close() |
1396 | p.close() | 1447 | p.close() |
... | @@ -1439,6 +1490,8 @@ class Pipeline: | ... | @@ -1439,6 +1490,8 @@ class Pipeline: |
1439 | def sanitize_database(self): | 1490 | def sanitize_database(self): |
1440 | """Searches for issues in the database and correct them""" | 1491 | """Searches for issues in the database and correct them""" |
1441 | 1492 | ||
1493 | + setproctitle("RNANet.py sanitize_database()") | ||
1494 | + | ||
1442 | conn = sqlite3.connect(runDir + "/results/RNANet.db") | 1495 | conn = sqlite3.connect(runDir + "/results/RNANet.db") |
1443 | 1496 | ||
1444 | # Assert every structure is used | 1497 | # Assert every structure is used |
... | @@ -1532,7 +1585,7 @@ def sql_define_tables(conn): | ... | @@ -1532,7 +1585,7 @@ def sql_define_tables(conn): |
1532 | chain_id INTEGER PRIMARY KEY NOT NULL, | 1585 | chain_id INTEGER PRIMARY KEY NOT NULL, |
1533 | structure_id CHAR(4) NOT NULL, | 1586 | structure_id CHAR(4) NOT NULL, |
1534 | chain_name VARCHAR(2) NOT NULL, | 1587 | chain_name VARCHAR(2) NOT NULL, |
1535 | - eq_class VARCHAR(10), | 1588 | + eq_class VARCHAR(16), |
1536 | pdb_start SMALLINT, | 1589 | pdb_start SMALLINT, |
1537 | pdb_end SMALLINT, | 1590 | pdb_end SMALLINT, |
1538 | issue TINYINT, | 1591 | issue TINYINT, |
... | @@ -1767,9 +1820,9 @@ def execute_joblist(fulljoblist): | ... | @@ -1767,9 +1820,9 @@ def execute_joblist(fulljoblist): |
1767 | 1820 | ||
1768 | print("using", n, "processes:") | 1821 | print("using", n, "processes:") |
1769 | # execute jobs of priority i that should be processed n by n: | 1822 | # execute jobs of priority i that should be processed n by n: |
1770 | - p = Pool(processes=n, maxtasksperchild=5, initializer=init_worker) | 1823 | + p = Pool(processes=n, maxtasksperchild=1, initializer=init_worker) |
1771 | try: | 1824 | try: |
1772 | - raw_results = p.map(partial(execute_job, jobcount=jobcount), bunch) | 1825 | + raw_results = p.map(partial(execute_job, jobcount=jobcount), bunch, chunksize=2) |
1773 | p.close() | 1826 | p.close() |
1774 | p.join() | 1827 | p.join() |
1775 | except KeyboardInterrupt as e: | 1828 | except KeyboardInterrupt as e: |
... | @@ -1792,6 +1845,9 @@ def work_infer_mappings(update_only, allmappings, codelist): | ... | @@ -1792,6 +1845,9 @@ def work_infer_mappings(update_only, allmappings, codelist): |
1792 | build a list of Chain() objects mapped to Rfam families, by expanding available mappings | 1845 | build a list of Chain() objects mapped to Rfam families, by expanding available mappings |
1793 | of any element of the list to all the list elements. | 1846 | of any element of the list to all the list elements. |
1794 | """ | 1847 | """ |
1848 | + | ||
1849 | + setproctitle("RNAnet.py work_infer_mappings()") | ||
1850 | + | ||
1795 | newchains = [] | 1851 | newchains = [] |
1796 | known_mappings = pd.DataFrame() | 1852 | known_mappings = pd.DataFrame() |
1797 | 1853 | ||
... | @@ -1812,11 +1868,12 @@ def work_infer_mappings(update_only, allmappings, codelist): | ... | @@ -1812,11 +1868,12 @@ def work_infer_mappings(update_only, allmappings, codelist): |
1812 | 1868 | ||
1813 | # Now infer mappings for chains that are not explicitely listed in Rfam-PDB mappings: | 1869 | # Now infer mappings for chains that are not explicitely listed in Rfam-PDB mappings: |
1814 | if len(known_mappings): | 1870 | if len(known_mappings): |
1871 | + | ||
1815 | families = set(known_mappings['rfam_acc']) | 1872 | families = set(known_mappings['rfam_acc']) |
1816 | 1873 | ||
1817 | # generalize | 1874 | # generalize |
1818 | inferred_mappings = known_mappings.drop(['pdb_id','chain'], axis=1).drop_duplicates() | 1875 | inferred_mappings = known_mappings.drop(['pdb_id','chain'], axis=1).drop_duplicates() |
1819 | - | 1876 | + |
1820 | # check for approximative redundancy: | 1877 | # check for approximative redundancy: |
1821 | if len(inferred_mappings) != len(inferred_mappings.drop_duplicates(subset="rfam_acc")): | 1878 | if len(inferred_mappings) != len(inferred_mappings.drop_duplicates(subset="rfam_acc")): |
1822 | # Then, there exists some mapping variants onto the same Rfam family CM, | 1879 | # Then, there exists some mapping variants onto the same Rfam family CM, |
... | @@ -1831,9 +1888,25 @@ def work_infer_mappings(update_only, allmappings, codelist): | ... | @@ -1831,9 +1888,25 @@ def work_infer_mappings(update_only, allmappings, codelist): |
1831 | len(inferred_mappings[thisfam_5_3]) != len(inferred_mappings[ inferred_mappings['rfam_acc'] == rfam ]) | 1888 | len(inferred_mappings[thisfam_5_3]) != len(inferred_mappings[ inferred_mappings['rfam_acc'] == rfam ]) |
1832 | and len(inferred_mappings[thisfam_5_3]) > 0 | 1889 | and len(inferred_mappings[thisfam_5_3]) > 0 |
1833 | ): | 1890 | ): |
1834 | - warn(f"There are mappings for {rfam} in both directions:", error=True) | 1891 | + # there are mappings in both directions... wtf Rfam ?! |
1835 | - print(inferred_mappings) | 1892 | + if (len(inferred_mappings[thisfam_5_3]) == len(inferred_mappings[thisfam_3_5]) == 1 |
1836 | - exit(1) | 1893 | + and int(inferred_mappings[thisfam_5_3].pdb_start) == int(inferred_mappings[thisfam_3_5].pdb_end) |
1894 | + and int(inferred_mappings[thisfam_5_3].pdb_end) == int(inferred_mappings[thisfam_3_5].pdb_start) | ||
1895 | + ): | ||
1896 | + # The two mappings are on the same nucleotide interval, but in each sense. | ||
1897 | + # e.g. RF00254 6v5b and 6v5c... maybe a bug on their side ? | ||
1898 | + # How can a chain match a CM in both senses ? | ||
1899 | + # We keep only the 5->3 sense. | ||
1900 | + inferred_mappings = inferred_mappings.drop(index=inferred_mappings.index[thisfam_3_5]) | ||
1901 | + sel_5_to_3 = (inferred_mappings['pdb_start'] < inferred_mappings['pdb_end']) | ||
1902 | + thisfam_5_3 = (inferred_mappings['rfam_acc'] == rfam ) & sel_5_to_3 | ||
1903 | + thisfam_3_5 = (inferred_mappings['rfam_acc'] == rfam ) & (sel_5_to_3 == False) | ||
1904 | + print() | ||
1905 | + warn(f"Found mappings to {rfam} in both directions on the same interval, keeping only the 5'->3' one.") | ||
1906 | + else: | ||
1907 | + warn(f"There are mappings for {rfam} in both directions:", error=True) | ||
1908 | + print(inferred_mappings) | ||
1909 | + exit(1) | ||
1837 | 1910 | ||
1838 | # Compute consensus for chains in 5' -> 3' sense | 1911 | # Compute consensus for chains in 5' -> 3' sense |
1839 | if len(inferred_mappings[thisfam_5_3]): | 1912 | if len(inferred_mappings[thisfam_5_3]): |
... | @@ -1887,7 +1960,7 @@ def work_infer_mappings(update_only, allmappings, codelist): | ... | @@ -1887,7 +1960,7 @@ def work_infer_mappings(update_only, allmappings, codelist): |
1887 | pdb_start = int(m.pdb_start.min()) | 1960 | pdb_start = int(m.pdb_start.min()) |
1888 | pdb_end = int(m.pdb_end.max()) | 1961 | pdb_end = int(m.pdb_end.max()) |
1889 | inferred = False | 1962 | inferred = False |
1890 | - else: # otherwise, use the inferred mapping | 1963 | + elif not(pdb_id in known_mappings.pdb_id and pdb_chain_id in known_mappings.chain): # if no known mapping on another family, use the inferred mapping |
1891 | pdb_start = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_start) | 1964 | pdb_start = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_start) |
1892 | pdb_end = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_end) | 1965 | pdb_end = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_end) |
1893 | inferred = True | 1966 | inferred = True |
... | @@ -1911,6 +1984,8 @@ def work_mmcif(pdb_id): | ... | @@ -1911,6 +1984,8 @@ def work_mmcif(pdb_id): |
1911 | SETS table structure | 1984 | SETS table structure |
1912 | """ | 1985 | """ |
1913 | 1986 | ||
1987 | + setproctitle(f"RNAnet.py work_mmcif({pdb_id})") | ||
1988 | + | ||
1914 | final_filepath = path_to_3D_data+"RNAcifs/"+pdb_id+".cif" | 1989 | final_filepath = path_to_3D_data+"RNAcifs/"+pdb_id+".cif" |
1915 | 1990 | ||
1916 | # Attempt to download it if not present | 1991 | # Attempt to download it if not present |
... | @@ -1921,43 +1996,52 @@ def work_mmcif(pdb_id): | ... | @@ -1921,43 +1996,52 @@ def work_mmcif(pdb_id): |
1921 | warn(f"Unable to download {pdb_id}.cif. Ignoring it.", error=True) | 1996 | warn(f"Unable to download {pdb_id}.cif. Ignoring it.", error=True) |
1922 | return | 1997 | return |
1923 | 1998 | ||
1924 | - # Load the MMCIF file with Biopython | 1999 | + # check if it exists in database |
1925 | - mmCif_info = MMCIF2Dict(final_filepath) | ||
1926 | - | ||
1927 | - # Get info about that structure | ||
1928 | - exp_meth = mmCif_info["_exptl.method"][0] | ||
1929 | - date = mmCif_info["_pdbx_database_status.recvd_initial_deposition_date"][0] | ||
1930 | - if "_refine.ls_d_res_high" in mmCif_info.keys() and mmCif_info["_refine.ls_d_res_high"][0] not in ['.', '?']: | ||
1931 | - reso = float(mmCif_info["_refine.ls_d_res_high"][0]) | ||
1932 | - elif "_refine.ls_d_res_low" in mmCif_info.keys() and mmCif_info["_refine.ls_d_res_low"][0] not in ['.', '?']: | ||
1933 | - reso = float(mmCif_info["_refine.ls_d_res_low"][0]) | ||
1934 | - elif "_em_3d_reconstruction.resolution" in mmCif_info.keys() and mmCif_info["_em_3d_reconstruction.resolution"][0] not in ['.', '?']: | ||
1935 | - reso = float(mmCif_info["_em_3d_reconstruction.resolution"][0]) | ||
1936 | - else: | ||
1937 | - warn(f"Wtf, structure {pdb_id} has no resolution ?") | ||
1938 | - warn(f"Check https://files.rcsb.org/header/{pdb_id}.cif to figure it out.") | ||
1939 | - reso = 0.0 | ||
1940 | - | ||
1941 | - # Save into the database | ||
1942 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | 2000 | with sqlite3.connect(runDir + "/results/RNANet.db") as conn: |
1943 | - sql_execute(conn, """INSERT OR REPLACE INTO structure (pdb_id, pdb_model, date, exp_method, resolution) | 2001 | + r = sql_ask_database(conn, f"""SELECT * from structure where pdb_id = '{pdb_id}';""") |
1944 | - VALUES (?, ?, DATE(?), ?, ?);""", data = (pdb_id, 1, date, exp_meth, reso)) | 2002 | + |
1945 | - | 2003 | + # if not, read the CIF header and register the structure |
1946 | - # run DSSR (you need to have it in your $PATH, follow x3dna installation instructions) | 2004 | + if not len(r): |
1947 | - output = subprocess.run(["x3dna-dssr", f"-i={final_filepath}", "--json", "--auxfile=no"], | 2005 | + # Load the MMCIF file with Biopython |
1948 | - stdout=subprocess.PIPE, stderr=subprocess.PIPE) | 2006 | + mmCif_info = MMCIF2Dict(final_filepath) |
1949 | - stdout = output.stdout.decode('utf-8') | 2007 | + |
1950 | - stderr = output.stderr.decode('utf-8') | 2008 | + # Get info about that structure |
1951 | - | 2009 | + exp_meth = mmCif_info["_exptl.method"][0] |
1952 | - if "exception" in stderr: | 2010 | + date = mmCif_info["_pdbx_database_status.recvd_initial_deposition_date"][0] |
1953 | - # DSSR is unable to parse the chain. | 2011 | + if "_refine.ls_d_res_high" in mmCif_info.keys() and mmCif_info["_refine.ls_d_res_high"][0] not in ['.', '?']: |
1954 | - warn(f"Exception while running DSSR, ignoring {pdb_id}.", error=True) | 2012 | + reso = float(mmCif_info["_refine.ls_d_res_high"][0]) |
1955 | - return 1 | 2013 | + elif "_refine.ls_d_res_low" in mmCif_info.keys() and mmCif_info["_refine.ls_d_res_low"][0] not in ['.', '?']: |
2014 | + reso = float(mmCif_info["_refine.ls_d_res_low"][0]) | ||
2015 | + elif "_em_3d_reconstruction.resolution" in mmCif_info.keys() and mmCif_info["_em_3d_reconstruction.resolution"][0] not in ['.', '?']: | ||
2016 | + reso = float(mmCif_info["_em_3d_reconstruction.resolution"][0]) | ||
2017 | + else: | ||
2018 | + warn(f"Wtf, structure {pdb_id} has no resolution ?") | ||
2019 | + warn(f"Check https://files.rcsb.org/header/{pdb_id}.cif to figure it out.") | ||
2020 | + reso = 0.0 | ||
2021 | + | ||
2022 | + # Save into the database | ||
2023 | + with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | ||
2024 | + sql_execute(conn, """INSERT OR REPLACE INTO structure (pdb_id, pdb_model, date, exp_method, resolution) | ||
2025 | + VALUES (?, ?, DATE(?), ?, ?);""", data = (pdb_id, 1, date, exp_meth, reso)) | ||
2026 | + | ||
2027 | + if not path.isfile(path_to_3D_data + "annotations/" + pdb_id + ".json"): | ||
2028 | + | ||
2029 | + # run DSSR (you need to have it in your $PATH, follow x3dna installation instructions) | ||
2030 | + output = subprocess.run(["x3dna-dssr", f"-i={final_filepath}", "--json", "--auxfile=no"], | ||
2031 | + stdout=subprocess.PIPE, stderr=subprocess.PIPE) | ||
2032 | + stdout = output.stdout.decode('utf-8') | ||
2033 | + stderr = output.stderr.decode('utf-8') | ||
2034 | + | ||
2035 | + if "exception" in stderr: | ||
2036 | + # DSSR is unable to parse the chain. | ||
2037 | + warn(f"Exception while running DSSR, ignoring {pdb_id}.", error=True) | ||
2038 | + return 1 | ||
2039 | + | ||
2040 | + # save the analysis to file only if we can load it :/ | ||
2041 | + json_file = open(path_to_3D_data + "annotations/" + pdb_id + ".json", "w") | ||
2042 | + json_file.write(stdout) | ||
2043 | + json_file.close() | ||
1956 | 2044 | ||
1957 | - # save the analysis to file only if we can load it :/ | ||
1958 | - json_file = open(path_to_3D_data + "annotations/" + pdb_id + ".json", "w") | ||
1959 | - json_file.write(stdout) | ||
1960 | - json_file.close() | ||
1961 | return 0 | 2045 | return 0 |
1962 | 2046 | ||
1963 | @trace_unhandled_exceptions | 2047 | @trace_unhandled_exceptions |
... | @@ -1966,6 +2050,9 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): | ... | @@ -1966,6 +2050,9 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): |
1966 | If asked, also extracts the 3D chains from their original structure files. | 2050 | If asked, also extracts the 3D chains from their original structure files. |
1967 | 2051 | ||
1968 | """ | 2052 | """ |
2053 | + | ||
2054 | + setproctitle(f"RNAnet.py work_build_chain({c.chain_label})") | ||
2055 | + | ||
1969 | if not path.isfile(path_to_3D_data + "annotations/" + c.pdb_id + ".json"): | 2056 | if not path.isfile(path_to_3D_data + "annotations/" + c.pdb_id + ".json"): |
1970 | warn(f"Could not find annotations for {c.chain_label}, ignoring it.", error=True) | 2057 | warn(f"Could not find annotations for {c.chain_label}, ignoring it.", error=True) |
1971 | c.delete_me = True | 2058 | c.delete_me = True |
... | @@ -1997,6 +2084,8 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): | ... | @@ -1997,6 +2084,8 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): |
1997 | def work_prepare_sequences(dl, rfam_acc, chains): | 2084 | def work_prepare_sequences(dl, rfam_acc, chains): |
1998 | """Prepares FASTA files of homologous sequences to realign with cmalign or SINA.""" | 2085 | """Prepares FASTA files of homologous sequences to realign with cmalign or SINA.""" |
1999 | 2086 | ||
2087 | + setproctitle("RNAnet.py work_prepare_sequences()") | ||
2088 | + | ||
2000 | if rfam_acc in LSU_set | SSU_set: # rRNA | 2089 | if rfam_acc in LSU_set | SSU_set: # rRNA |
2001 | if path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"): | 2090 | if path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"): |
2002 | # Detect doublons and remove them | 2091 | # Detect doublons and remove them |
... | @@ -2087,6 +2176,8 @@ def work_realign(rfam_acc): | ... | @@ -2087,6 +2176,8 @@ def work_realign(rfam_acc): |
2087 | cmalign requires too much RAM for them, so we use SINA, a specifically designed tool for rRNAs. | 2176 | cmalign requires too much RAM for them, so we use SINA, a specifically designed tool for rRNAs. |
2088 | """ | 2177 | """ |
2089 | 2178 | ||
2179 | + setproctitle(f"RNAnet.py work_realign({rfam_acc})") | ||
2180 | + | ||
2090 | if rfam_acc in LSU_set | SSU_set: | 2181 | if rfam_acc in LSU_set | SSU_set: |
2091 | # Ribosomal subunits deserve a special treatment. | 2182 | # Ribosomal subunits deserve a special treatment. |
2092 | # They require too much RAM to be aligned with Infernal. | 2183 | # They require too much RAM to be aligned with Infernal. |
... | @@ -2210,6 +2301,7 @@ def work_pssm(f, fill_gaps): | ... | @@ -2210,6 +2301,7 @@ def work_pssm(f, fill_gaps): |
2210 | Uses only 1 core, so this function can be called in parallel. | 2301 | Uses only 1 core, so this function can be called in parallel. |
2211 | 2302 | ||
2212 | """ | 2303 | """ |
2304 | + setproctitle(f"RNAnet.py work_pssm({f})") | ||
2213 | 2305 | ||
2214 | # Get a worker number to position the progress bar | 2306 | # Get a worker number to position the progress bar |
2215 | global idxQueue | 2307 | global idxQueue |
... | @@ -2223,6 +2315,7 @@ def work_pssm(f, fill_gaps): | ... | @@ -2223,6 +2315,7 @@ def work_pssm(f, fill_gaps): |
2223 | try: | 2315 | try: |
2224 | align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta") | 2316 | align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta") |
2225 | except: | 2317 | except: |
2318 | + warn(f"{f}'s alignment is wrong. Recompute it and retry.", error=True) | ||
2226 | with open(runDir + "/errors.txt", "a") as errf: | 2319 | with open(runDir + "/errors.txt", "a") as errf: |
2227 | errf.write(f"{f}'s alignment is wrong. Recompute it and retry.\n") | 2320 | errf.write(f"{f}'s alignment is wrong. Recompute it and retry.\n") |
2228 | return 1 | 2321 | return 1 |
... | @@ -2313,6 +2406,9 @@ def work_pssm(f, fill_gaps): | ... | @@ -2313,6 +2406,9 @@ def work_pssm(f, fill_gaps): |
2313 | 2406 | ||
2314 | @trace_unhandled_exceptions | 2407 | @trace_unhandled_exceptions |
2315 | def work_save(c, homology=True): | 2408 | def work_save(c, homology=True): |
2409 | + | ||
2410 | + setproctitle(f"RNAnet.py work_save({c.chain_label})") | ||
2411 | + | ||
2316 | conn = sqlite3.connect(runDir + "/results/RNANet.db", timeout=15.0) | 2412 | conn = sqlite3.connect(runDir + "/results/RNANet.db", timeout=15.0) |
2317 | if homology: | 2413 | if homology: |
2318 | df = pd.read_sql_query(f""" | 2414 | df = pd.read_sql_query(f""" | ... | ... |
... | @@ -12,7 +12,6 @@ done | ... | @@ -12,7 +12,6 @@ done |
12 | 12 | ||
13 | PROCESS_TO_KILL="statistics.py" | 13 | PROCESS_TO_KILL="statistics.py" |
14 | PROCESS_LIST=`ps ax | grep -Ei ${PROCESS_TO_KILL} | grep -Eiv '(grep|vi statistics.py)' | awk ' { print $1;}'` | 14 | PROCESS_LIST=`ps ax | grep -Ei ${PROCESS_TO_KILL} | grep -Eiv '(grep|vi statistics.py)' | awk ' { print $1;}'` |
15 | -KILLED= | ||
16 | for KILLPID in $PROCESS_LIST; do | 15 | for KILLPID in $PROCESS_LIST; do |
17 | if [ ! -z $KILLPID ];then | 16 | if [ ! -z $KILLPID ];then |
18 | kill -9 $KILLPID | 17 | kill -9 $KILLPID | ... | ... |
... | @@ -3,8 +3,8 @@ import subprocess, os, sys | ... | @@ -3,8 +3,8 @@ import subprocess, os, sys |
3 | 3 | ||
4 | # Put a list of problematic chains here, they will be properly deleted and recomputed | 4 | # Put a list of problematic chains here, they will be properly deleted and recomputed |
5 | problems = [ | 5 | problems = [ |
6 | -"4v9n_1_DA_1-2879", | 6 | + "1k73_1_A", |
7 | -"4v9n_1_DA_148-2875" | 7 | + "1k73_1_B" |
8 | ] | 8 | ] |
9 | 9 | ||
10 | path_to_3D_data = sys.argv[1] | 10 | path_to_3D_data = sys.argv[1] |
... | @@ -15,6 +15,7 @@ for p in problems: | ... | @@ -15,6 +15,7 @@ for p in problems: |
15 | print() | 15 | print() |
16 | print() | 16 | print() |
17 | print() | 17 | print() |
18 | + homology = ('-' in p) | ||
18 | 19 | ||
19 | # Remove the datapoints files and 3D files | 20 | # Remove the datapoints files and 3D files |
20 | subprocess.run(["rm", '-f', path_to_3D_data + f"/rna_mapped_to_Rfam/{p}.cif"]) | 21 | subprocess.run(["rm", '-f', path_to_3D_data + f"/rna_mapped_to_Rfam/{p}.cif"]) |
... | @@ -25,16 +26,25 @@ for p in problems: | ... | @@ -25,16 +26,25 @@ for p in problems: |
25 | # Find more information | 26 | # Find more information |
26 | structure = p.split('_')[0] | 27 | structure = p.split('_')[0] |
27 | chain = p.split('_')[2] | 28 | chain = p.split('_')[2] |
28 | - families = [ f.split('.')[1] for f in files ] # The RFAM families this chain has been mapped onto | 29 | + if homology: |
29 | - | 30 | + families = [ f.split('.')[1] for f in files ] # The RFAM families this chain has been mapped onto |
30 | - # Delete the chain from the database, and the associated nucleotides and re_mappings, using foreign keys | 31 | + |
31 | - for fam in families: | 32 | + # Delete the chain from the database, and the associated nucleotides and re_mappings, using foreign keys |
32 | - command = ["sqlite3", "results/RNANet.db", f"PRAGMA foreign_keys=ON; delete from chain where structure_id=\"{structure}\" and chain_name=\"{chain}\" and rfam_acc=\"{fam}\";"] | 33 | + for fam in families: |
34 | + command = ["sqlite3", "results/RNANet.db", f"PRAGMA foreign_keys=ON; delete from chain where structure_id=\"{structure}\" and chain_name=\"{chain}\" and rfam_acc=\"{fam}\";"] | ||
35 | + print(' '.join(command)) | ||
36 | + subprocess.run(command) | ||
37 | + | ||
38 | + command = ["python3.8", "RNAnet.py", "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0", "--extract", "--only", p] | ||
39 | + else: | ||
40 | + # Delete the chain from the database, and the associated nucleotides and re_mappings, using foreign keys | ||
41 | + command = ["sqlite3", "results/RNANet.db", f"PRAGMA foreign_keys=ON; delete from chain where structure_id=\"{structure}\" and chain_name=\"{chain}\" and rfam_acc is null;"] | ||
33 | print(' '.join(command)) | 42 | print(' '.join(command)) |
34 | subprocess.run(command) | 43 | subprocess.run(command) |
35 | 44 | ||
45 | + command = ["python3.8", "RNAnet.py", "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0", "--no-homology", "--extract", "--only", p] | ||
46 | + | ||
36 | # Re-run RNANet | 47 | # Re-run RNANet |
37 | - command = ["python3.8", "RNAnet.py", "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0", "--extract", "--only", p] | ||
38 | print('\n',' '.join(command),'\n') | 48 | print('\n',' '.join(command),'\n') |
39 | subprocess.run(command) | 49 | subprocess.run(command) |
40 | 50 | ... | ... |
... | @@ -31,7 +31,7 @@ res_thr = 20.0 # default: all structures | ... | @@ -31,7 +31,7 @@ res_thr = 20.0 # default: all structures |
31 | LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112 | 31 | LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112 |
32 | SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111 | 32 | SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111 |
33 | 33 | ||
34 | -def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): | 34 | +def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0): |
35 | """ | 35 | """ |
36 | Plot the joint distribution of pseudotorsion angles, in a Ramachandran-style graph. | 36 | Plot the joint distribution of pseudotorsion angles, in a Ramachandran-style graph. |
37 | See Wadley & Pyle (2007) | 37 | See Wadley & Pyle (2007) |
... | @@ -63,7 +63,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): | ... | @@ -63,7 +63,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): |
63 | exit("You overestimate my capabilities !") | 63 | exit("You overestimate my capabilities !") |
64 | 64 | ||
65 | 65 | ||
66 | - if not path.isfile(f"data/wadley_kernel_{angle}_{res_thr}A.npz"): | 66 | + if not path.isfile(f"data/wadley_kernel_{angle}_{res}A.npz"): |
67 | 67 | ||
68 | # Get a worker number to position the progress bar | 68 | # Get a worker number to position the progress bar |
69 | global idxQueue | 69 | global idxQueue |
... | @@ -75,7 +75,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): | ... | @@ -75,7 +75,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): |
75 | df = pd.read_sql(f"""SELECT {angle}, th{angle} | 75 | df = pd.read_sql(f"""SELECT {angle}, th{angle} |
76 | FROM nucleotide JOIN ( | 76 | FROM nucleotide JOIN ( |
77 | SELECT chain_id FROM chain JOIN structure | 77 | SELECT chain_id FROM chain JOIN structure |
78 | - WHERE structure.resolution <= {res_thr} | 78 | + WHERE structure.resolution <= {res} |
79 | ) AS c | 79 | ) AS c |
80 | WHERE puckering="C2'-endo" | 80 | WHERE puckering="C2'-endo" |
81 | AND {angle} IS NOT NULL | 81 | AND {angle} IS NOT NULL |
... | @@ -85,7 +85,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): | ... | @@ -85,7 +85,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): |
85 | df = pd.read_sql(f"""SELECT {angle}, th{angle} | 85 | df = pd.read_sql(f"""SELECT {angle}, th{angle} |
86 | FROM nucleotide JOIN ( | 86 | FROM nucleotide JOIN ( |
87 | SELECT chain_id FROM chain JOIN structure | 87 | SELECT chain_id FROM chain JOIN structure |
88 | - WHERE structure.resolution <= {res_thr} | 88 | + WHERE structure.resolution <= {res} |
89 | ) AS c | 89 | ) AS c |
90 | WHERE form = '.' | 90 | WHERE form = '.' |
91 | AND puckering="C3'-endo" | 91 | AND puckering="C3'-endo" |
... | @@ -111,14 +111,14 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): | ... | @@ -111,14 +111,14 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): |
111 | pbar.update(1) | 111 | pbar.update(1) |
112 | 112 | ||
113 | # Save the data to an archive for later use without the need to recompute | 113 | # Save the data to an archive for later use without the need to recompute |
114 | - np.savez(f"data/wadley_kernel_{angle}.npz", | 114 | + np.savez(f"data/wadley_kernel_{angle}_{res}A.npz", |
115 | c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas, | 115 | c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas, |
116 | c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas, | 116 | c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas, |
117 | kernel_c3=f_c3, kernel_c2=f_c2) | 117 | kernel_c3=f_c3, kernel_c2=f_c2) |
118 | pbar.close() | 118 | pbar.close() |
119 | idxQueue.put(thr_idx) | 119 | idxQueue.put(thr_idx) |
120 | else: | 120 | else: |
121 | - f = np.load(f"data/wadley_kernel_{angle}.npz") | 121 | + f = np.load(f"data/wadley_kernel_{angle}_{res}A.npz") |
122 | c2_endo_etas = f["c2_endo_e"] | 122 | c2_endo_etas = f["c2_endo_e"] |
123 | c3_endo_etas = f["c3_endo_e"] | 123 | c3_endo_etas = f["c3_endo_e"] |
124 | c2_endo_thetas = f["c2_endo_t"] | 124 | c2_endo_thetas = f["c2_endo_t"] |
... | @@ -157,7 +157,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): | ... | @@ -157,7 +157,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): |
157 | ax.bar3d(xpos.ravel(), ypos.ravel(), 0.0, 0.09, 0.09, hist_cut.ravel(), color=color_values, zorder="max") | 157 | ax.bar3d(xpos.ravel(), ypos.ravel(), 0.0, 0.09, 0.09, hist_cut.ravel(), color=color_values, zorder="max") |
158 | ax.set_xlabel(xlabel) | 158 | ax.set_xlabel(xlabel) |
159 | ax.set_ylabel(ylabel) | 159 | ax.set_ylabel(ylabel) |
160 | - fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}_{res_thr}A.png") | 160 | + fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}_{res}A.png") |
161 | if show: | 161 | if show: |
162 | fig.show() | 162 | fig.show() |
163 | plt.close() | 163 | plt.close() |
... | @@ -168,7 +168,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): | ... | @@ -168,7 +168,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): |
168 | ax.plot_surface(xx, yy, f_cut, cmap=cm.get_cmap("coolwarm"), linewidth=0, antialiased=True) | 168 | ax.plot_surface(xx, yy, f_cut, cmap=cm.get_cmap("coolwarm"), linewidth=0, antialiased=True) |
169 | ax.set_xlabel(xlabel) | 169 | ax.set_xlabel(xlabel) |
170 | ax.set_ylabel(ylabel) | 170 | ax.set_ylabel(ylabel) |
171 | - fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}_{res_thr}A.png") | 171 | + fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}_{res}A.png") |
172 | if show: | 172 | if show: |
173 | fig.show() | 173 | fig.show() |
174 | plt.close() | 174 | plt.close() |
... | @@ -178,10 +178,9 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): | ... | @@ -178,10 +178,9 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): |
178 | ax = fig.gca() | 178 | ax = fig.gca() |
179 | ax.scatter(x, y, s=1, alpha=0.1) | 179 | ax.scatter(x, y, s=1, alpha=0.1) |
180 | ax.contourf(xx, yy, f_cut, alpha=0.5, cmap=cm.get_cmap("coolwarm"), levels=levels, extend="max") | 180 | ax.contourf(xx, yy, f_cut, alpha=0.5, cmap=cm.get_cmap("coolwarm"), levels=levels, extend="max") |
181 | - | ||
182 | ax.set_xlabel(xlabel) | 181 | ax.set_xlabel(xlabel) |
183 | ax.set_ylabel(ylabel) | 182 | ax.set_ylabel(ylabel) |
184 | - fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}_{res_thr}A.png") | 183 | + fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}_{res}A.png") |
185 | if show: | 184 | if show: |
186 | fig.show() | 185 | fig.show() |
187 | plt.close() | 186 | plt.close() |
... | @@ -231,7 +230,13 @@ def stats_len(): | ... | @@ -231,7 +230,13 @@ def stats_len(): |
231 | 230 | ||
232 | # Get the lengths of chains | 231 | # Get the lengths of chains |
233 | with sqlite3.connect("results/RNANet.db") as conn: | 232 | with sqlite3.connect("results/RNANet.db") as conn: |
234 | - l = [ x[0] for x in sql_ask_database(conn, f"SELECT COUNT(index_chain) FROM (SELECT chain_id FROM chain JOIN structure ON chain.structure_id = structure.pdb_id WHERE rfam_acc='{f}' AND resolution <= {res_thr}) NATURAL JOIN nucleotide GROUP BY chain_id;", warn_every=0) ] | 233 | + l = [ x[0] for x in sql_ask_database(conn, f"""SELECT COUNT(index_chain) |
234 | + FROM ( | ||
235 | + SELECT chain_id | ||
236 | + FROM chain JOIN structure ON chain.structure_id = structure.pdb_id | ||
237 | + WHERE rfam_acc='{f}' AND resolution <= {res_thr} | ||
238 | + ) NATURAL JOIN nucleotide | ||
239 | + GROUP BY chain_id;""", warn_every=0) ] | ||
235 | lengths.append(l) # list of chain lengths from the family | 240 | lengths.append(l) # list of chain lengths from the family |
236 | 241 | ||
237 | # notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths") | 242 | # notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths") |
... | @@ -597,6 +602,172 @@ def per_chain_stats(): | ... | @@ -597,6 +602,172 @@ def per_chain_stats(): |
597 | many=True, data=list(df.to_records(index=False)), warn_every=10) | 602 | many=True, data=list(df.to_records(index=False)), warn_every=10) |
598 | notify("Updated the database with per-chain base frequencies") | 603 | notify("Updated the database with per-chain base frequencies") |
599 | 604 | ||
605 | +def general_stats(): | ||
606 | + """ | ||
607 | + Number of structures as function of the resolution threshold | ||
608 | + Number of Rfam families as function of the resolution threshold | ||
609 | + """ | ||
610 | + with sqlite3.connect("results/RNANet.db") as conn: | ||
611 | + df_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution | ||
612 | + FROM chain JOIN structure ON chain.structure_id = structure.pdb_id | ||
613 | + WHERE rfam_acc IS NULL AND ISSUE=0;""", conn) | ||
614 | + df_mapped_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution | ||
615 | + FROM chain JOIN structure ON chain.structure_id = structure.pdb_id | ||
616 | + WHERE rfam_acc IS NOT NULL AND ISSUE=0;""", conn) | ||
617 | + df_mapped_copies = pd.read_sql(f"""SELECT pdb_id, chain_name, inferred, rfam_acc, pdb_start, pdb_end, exp_method, resolution | ||
618 | + FROM chain JOIN structure ON chain.structure_id = structure.pdb_id | ||
619 | + WHERE rfam_acc IS NOT NULL AND ISSUE=0;""", conn) | ||
620 | + df_inferred_only_unique = pd.read_sql(f"""SELECT DISTINCT pdb_id, c.chain_name, exp_method, resolution | ||
621 | + FROM (SELECT inferred, rfam_acc, pdb_start, pdb_end, chain.structure_id, chain.chain_name, r.redundancy, r.inf_redundancy | ||
622 | + FROM chain | ||
623 | + JOIN (SELECT structure_id, chain_name, COUNT(distinct rfam_acc) AS redundancy, SUM(inferred) AS inf_redundancy | ||
624 | + FROM chain | ||
625 | + WHERE rfam_acc IS NOT NULL AND issue=0 | ||
626 | + GROUP BY structure_id, chain_name | ||
627 | + ) AS r ON chain.structure_id=r.structure_id AND chain.chain_name = r.chain_name | ||
628 | + WHERE r.redundancy=r.inf_redundancy AND rfam_acc IS NOT NULL and issue=0 | ||
629 | + ) AS c | ||
630 | + JOIN structure ON c.structure_id=structure.pdb_id;""", conn) | ||
631 | + print("> found", len(df_inferred_only_unique.index), "chains which are mapped only by inference using BGSU NR Lists.") | ||
632 | + | ||
633 | + ########################################## | ||
634 | + # plot N = f(resolution, exp_method) | ||
635 | + ########################################## | ||
636 | + | ||
637 | + methods = df_unique.exp_method.unique() | ||
638 | + | ||
639 | + fig, axs = plt.subplots(1+len(methods), 3, figsize=(15,5*(1+len(methods))), sharex=True) | ||
640 | + df_unique.sort_values('resolution', inplace=True, ignore_index=True) | ||
641 | + df_mapped_unique.sort_values('resolution', inplace=True, ignore_index=True) | ||
642 | + df_inferred_only_unique.sort_values('resolution', inplace=True, ignore_index=True) | ||
643 | + df_mapped_copies.sort_values('resolution', inplace=True, ignore_index=True) | ||
644 | + max_res = max(df_unique.resolution) | ||
645 | + max_structs = len(df_mapped_copies.index.tolist()) | ||
646 | + colors = np.linspace(0,1,1+len(methods)) | ||
647 | + plt.xticks( np.arange(0, max_res+2, 2.0).tolist(), np.arange(0, max_res+2, 2.0).tolist() ) | ||
648 | + | ||
649 | + axs[0][0].grid(axis='y', ls='dotted', lw=1) | ||
650 | + axs[0][0].hist(df_unique.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 1, colors[0], 1), label='distribution') | ||
651 | + axs[0][0].hist(df_unique.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 0, colors[0], 0.5), cumulative=True, label='cumulative') | ||
652 | + axs[0][0].text(0.95*max_res, 0.95*len(df_unique.resolution), "%d " % len(df_unique.resolution), | ||
653 | + horizontalalignment='right', verticalalignment='top', fontsize=14) | ||
654 | + axs[0][0].set_ylabel("ALL", fontsize=14) | ||
655 | + axs[0][0].set_title("Number of unique RNA chains", fontsize=14) | ||
656 | + axs[0][0].set_ylim((0, max_structs * 1.05)) | ||
657 | + axs[0][0].legend(loc="best", fontsize=14) | ||
658 | + | ||
659 | + axs[0][1].grid(axis='y', ls='dotted', lw=1) | ||
660 | + axs[0][1].set_yticklabels([]) | ||
661 | + axs[0][1].hist(df_mapped_unique.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 1, colors[0], 1), label='distribution') | ||
662 | + axs[0][1].hist(df_mapped_unique.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 0, colors[0], 0.5), cumulative=True, label='cumulative') | ||
663 | + axs[0][1].hist(df_inferred_only_unique.resolution, bins=np.arange(0, max_res, 0.5), fc=(0.2, 0, colors[0], 0.5), cumulative=True, label='only by inference') | ||
664 | + axs[0][1].text(0.95*max_res, 0.95*len(df_mapped_unique.resolution), "%d " % len(df_mapped_unique.resolution), | ||
665 | + horizontalalignment='right', verticalalignment='top', fontsize=14) | ||
666 | + axs[0][1].set_title("Number of unique RNA chains\nmapped to $\geq 1$ family", fontsize=14) | ||
667 | + axs[0][1].set_ylim((0, max_structs * 1.05)) | ||
668 | + axs[0][1].legend(loc="best", fontsize=14) | ||
669 | + | ||
670 | + axs[0][2].grid(axis='y', ls='dotted', lw=1) | ||
671 | + axs[0][2].set_yticklabels([]) | ||
672 | + axs[0][2].hist(df_mapped_copies.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 1, colors[0], 1), label='distribution') | ||
673 | + axs[0][2].hist(df_mapped_copies.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 0, colors[0], 0.5), cumulative=True, label='cumulative') | ||
674 | + axs[0][2].hist(df_mapped_copies[df_mapped_copies.inferred == 1].resolution, bins=np.arange(0, max_res, 0.5), fc=(0.2, 0, colors[0], 0.5), cumulative=True, label='inferred') | ||
675 | + axs[0][2].text(0.95*max_res, 0.95*len(df_mapped_copies.resolution), "%d " % len(df_mapped_copies.resolution), | ||
676 | + horizontalalignment='right', verticalalignment='top', fontsize=14) | ||
677 | + axs[0][2].set_title("Number of RNA chains mapped to a\nfamily (with copies)", fontsize=14) | ||
678 | + axs[0][2].legend(loc="right", fontsize=14) | ||
679 | + axs[0][2].set_ylim((0, max_structs * 1.05)) | ||
680 | + | ||
681 | + for i,m in enumerate(methods): | ||
682 | + df_unique_m = df_unique[df_unique.exp_method == m] | ||
683 | + df_mapped_unique_m = df_mapped_unique[df_mapped_unique.exp_method == m] | ||
684 | + df_inferred_only_unique_m = df_inferred_only_unique[df_inferred_only_unique.exp_method == m] | ||
685 | + df_mapped_copies_m = df_mapped_copies[ df_mapped_copies.exp_method == m] | ||
686 | + max_structs = len(df_mapped_copies_m.resolution.tolist()) | ||
687 | + print("> found", max_structs, "structures with method", m, flush=True) | ||
688 | + | ||
689 | + axs[1+i][0].grid(axis='y', ls='dotted', lw=1) | ||
690 | + axs[1+i][0].hist(df_unique_m.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 1, colors[1+i], 1), label='distribution') | ||
691 | + axs[1+i][0].hist(df_unique_m.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 0, colors[1+i], 0.5), cumulative=True, label='cumulative') | ||
692 | + axs[1+i][0].text(0.95*max_res, 0.95*len(df_unique_m.resolution), "%d " % len(df_unique_m.resolution), | ||
693 | + horizontalalignment='right', verticalalignment='top', fontsize=14) | ||
694 | + axs[1+i][0].set_ylim((0, max_structs * 1.05)) | ||
695 | + axs[1+i][0].set_ylabel(m, fontsize=14) | ||
696 | + axs[1+i][0].legend(loc="best", fontsize=14) | ||
697 | + | ||
698 | + axs[1+i][1].grid(axis='y', ls='dotted', lw=1) | ||
699 | + axs[1+i][1].set_yticklabels([]) | ||
700 | + axs[1+i][1].hist(df_mapped_unique_m.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 1, colors[1+i], 1), label='distribution') | ||
701 | + axs[1+i][1].hist(df_mapped_unique_m.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 0, colors[1+i], 0.5), cumulative=True, label='cumulative') | ||
702 | + axs[1+i][1].hist(df_inferred_only_unique_m.resolution, bins=np.arange(0, max_res, 0.5), fc=(0.2, 0, colors[1+i], 0.5), cumulative=True, label='only by inference') | ||
703 | + axs[1+i][1].text(0.95*max_res, 0.95*len(df_mapped_unique_m.resolution), "%d " % len(df_mapped_unique_m.resolution), | ||
704 | + horizontalalignment='right', verticalalignment='top', fontsize=14) | ||
705 | + axs[1+i][1].set_ylim((0, max_structs * 1.05)) | ||
706 | + axs[1+i][1].legend(loc="best", fontsize=14) | ||
707 | + | ||
708 | + axs[1+i][2].grid(axis='y', ls='dotted', lw=1) | ||
709 | + axs[1+i][2].set_yticklabels([]) | ||
710 | + axs[1+i][2].hist(df_mapped_copies_m.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 1, colors[1+i], 1), label='distribution') | ||
711 | + axs[1+i][2].hist(df_mapped_copies_m.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 0, colors[1+i], 0.5), cumulative=True, label='cumulative') | ||
712 | + axs[1+i][2].hist(df_mapped_copies_m[df_mapped_copies_m.inferred == 1].resolution, bins=np.arange(0, max_res, 0.5), fc=(0.2, 0, colors[1+i], 0.5), cumulative=True, label='inferred') | ||
713 | + axs[1+i][2].text(0.95*max_res, 0.95*len(df_mapped_copies_m.resolution), "%d " % len(df_mapped_copies_m.resolution), | ||
714 | + horizontalalignment='right', verticalalignment='top', fontsize=14) | ||
715 | + axs[1+i][2].set_ylim((0, max_structs * 1.05)) | ||
716 | + axs[1+i][2].legend(loc="right", fontsize=14) | ||
717 | + | ||
718 | + axs[-1][0].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14) | ||
719 | + axs[-1][1].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14) | ||
720 | + axs[-1][2].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14) | ||
721 | + | ||
722 | + fig.suptitle("Number of RNA chains by experimental method and resolution", fontsize=16) | ||
723 | + fig.subplots_adjust(left=0.07, right=0.98, wspace=0.05, | ||
724 | + hspace=0.05, bottom=0.05, top=0.92) | ||
725 | + fig.savefig("results/figures/resolutions.png") | ||
726 | + plt.close() | ||
727 | + | ||
728 | + ########################################## | ||
729 | + # plot Nfam = f(resolution, exp_method) | ||
730 | + ########################################## | ||
731 | + | ||
732 | + df_mapped_copies['n_fam'] = [ len(df_mapped_copies.rfam_acc[:i+1].unique()) for i in range(len(df_mapped_copies.index)) ] | ||
733 | + | ||
734 | + fig, axs = plt.subplots(1, 1+len(methods), figsize=(5*(1+len(methods)), 5)) | ||
735 | + max_res = max(df_mapped_copies.resolution) | ||
736 | + max_fams = max(df_mapped_copies.n_fam) | ||
737 | + colors = np.linspace(0,1,1+len(methods)) | ||
738 | + plt.xticks( np.arange(0, max_res+2, 2.0).tolist(), np.arange(0, max_res+2, 2.0).tolist() ) | ||
739 | + | ||
740 | + axs[0].grid(axis='y', ls='dotted', lw=1) | ||
741 | + axs[0].plot(df_mapped_copies.resolution, df_mapped_copies.n_fam) | ||
742 | + axs[0].text(0.95*max_res, 0.95*df_mapped_copies.n_fam.iloc[-1], "%d " % df_mapped_copies.n_fam.iloc[-1], | ||
743 | + horizontalalignment='right', verticalalignment='top', fontsize=14) | ||
744 | + axs[0].set_title("ALL", fontsize=14) | ||
745 | + axs[0].set_xlabel("Structure resolution (Angströms)", fontsize=14) | ||
746 | + axs[0].set_ylabel("Number of Rfam families", fontsize=14) | ||
747 | + axs[0].set_ylim((0, max_res * 1.05)) | ||
748 | + axs[0].set_ylim((0, max_fams * 1.05)) | ||
749 | + | ||
750 | + for i,m in enumerate(methods): | ||
751 | + df_mapped_copies_m = df_mapped_copies[ df_mapped_copies.exp_method == m].drop("n_fam", axis=1).copy() | ||
752 | + df_mapped_copies_m['n_fam'] = [ len(df_mapped_copies_m.rfam_acc[:i+1].unique()) for i in range(len(df_mapped_copies_m.index)) ] | ||
753 | + print(">", df_mapped_copies_m.n_fam.iloc[-1], "different RNA families have a 3D structure solved by", m) | ||
754 | + | ||
755 | + axs[1+i].grid(axis='y', ls='dotted', lw=1) | ||
756 | + axs[1+i].plot(df_mapped_copies_m.resolution, df_mapped_copies_m.n_fam, ) | ||
757 | + axs[1+i].text(0.95*max(df_mapped_copies_m.resolution), 0.95*df_mapped_copies_m.n_fam.iloc[-1], "%d " % df_mapped_copies_m.n_fam.iloc[-1], | ||
758 | + horizontalalignment='right', verticalalignment='top', fontsize=14) | ||
759 | + axs[1+i].set_xlim((0, max_res * 1.05)) | ||
760 | + axs[1+i].set_ylim((0, max_fams * 1.05)) | ||
761 | + axs[1+i].set_xlabel("Structure resolution (Angströms)", fontsize=14) | ||
762 | + axs[1+i].set_title(m, fontsize=14) | ||
763 | + axs[1+i].set_yticklabels([]) | ||
764 | + | ||
765 | + fig.suptitle("Number of RNA families used by experimental method and resolution", fontsize=16) | ||
766 | + fig.subplots_adjust(left=0.05, right=0.98, wspace=0.05, | ||
767 | + hspace=0.05, bottom=0.12, top=0.84) | ||
768 | + fig.savefig("results/figures/Nfamilies.png") | ||
769 | + plt.close() | ||
770 | + | ||
600 | def log_to_pbar(pbar): | 771 | def log_to_pbar(pbar): |
601 | def update(r): | 772 | def update(r): |
602 | pbar.update(1) | 773 | pbar.update(1) |
... | @@ -604,6 +775,9 @@ def log_to_pbar(pbar): | ... | @@ -604,6 +775,9 @@ def log_to_pbar(pbar): |
604 | 775 | ||
605 | if __name__ == "__main__": | 776 | if __name__ == "__main__": |
606 | 777 | ||
778 | + general_stats() | ||
779 | + exit() | ||
780 | + | ||
607 | # parse options | 781 | # parse options |
608 | try: | 782 | try: |
609 | opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "resolution=", "3d-folder=", "seq-folder=" ]) | 783 | opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "resolution=", "3d-folder=", "seq-folder=" ]) |
... | @@ -665,8 +839,8 @@ if __name__ == "__main__": | ... | @@ -665,8 +839,8 @@ if __name__ == "__main__": |
665 | 839 | ||
666 | # Define the tasks | 840 | # Define the tasks |
667 | joblist = [] | 841 | joblist = [] |
668 | - # joblist.append(Job(function=reproduce_wadley_results, args=(1,))) | 842 | + joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), 4.0))) # res threshold is 4.0 Angstroms by default |
669 | - # joblist.append(Job(function=reproduce_wadley_results, args=(4,))) | 843 | + joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), 4.0))) # |
670 | joblist.append(Job(function=stats_len)) # Computes figures | 844 | joblist.append(Job(function=stats_len)) # Computes figures |
671 | # joblist.append(Job(function=stats_freq)) # updates the database | 845 | # joblist.append(Job(function=stats_freq)) # updates the database |
672 | # for f in famlist: | 846 | # for f in famlist: | ... | ... |
-
Please register or login to post a comment