Louis BECQUEY

Rethink how inferred mappings work

...@@ -18,7 +18,7 @@ from os import path, makedirs ...@@ -18,7 +18,7 @@ from os import path, makedirs
18 from multiprocessing import Pool, Manager, set_start_method 18 from multiprocessing import Pool, Manager, set_start_method
19 from time import sleep 19 from time import sleep
20 from tqdm import tqdm 20 from tqdm import tqdm
21 -from tqdm.contrib.concurrent import process_map 21 +from setproctitle import setproctitle
22 22
23 def trace_unhandled_exceptions(func): 23 def trace_unhandled_exceptions(func):
24 @wraps(func) 24 @wraps(func)
...@@ -169,7 +169,8 @@ class Chain: ...@@ -169,7 +169,8 @@ class Chain:
169 def extract(self, df, khetatm): 169 def extract(self, df, khetatm):
170 """ Extract the part which is mapped to Rfam from the main CIF file and save it to another file. 170 """ Extract the part which is mapped to Rfam from the main CIF file and save it to another file.
171 """ 171 """
172 - 172 + setproctitle(f"RNANet.py {self.chain_label} extract()")
173 +
173 if self.mapping is not None: 174 if self.mapping is not None:
174 status = f"Extract {self.mapping.nt_start}-{self.mapping.nt_end} atoms from {self.pdb_id}-{self.pdb_chain_id}" 175 status = f"Extract {self.mapping.nt_start}-{self.mapping.nt_end} atoms from {self.pdb_id}-{self.pdb_chain_id}"
175 self.file = path_to_3D_data+"rna_mapped_to_Rfam/"+self.chain_label+".cif" 176 self.file = path_to_3D_data+"rna_mapped_to_Rfam/"+self.chain_label+".cif"
...@@ -213,6 +214,8 @@ class Chain: ...@@ -213,6 +214,8 @@ class Chain:
213 def extract_3D_data(self, save_logs=True): 214 def extract_3D_data(self, save_logs=True):
214 """ Maps DSSR annotations to the chain. """ 215 """ Maps DSSR annotations to the chain. """
215 216
217 + setproctitle(f"RNANet.py {self.chain_label} extract_3D_data()")
218 +
216 ############################################ 219 ############################################
217 # Load the mmCIF annotations from file 220 # Load the mmCIF annotations from file
218 ############################################ 221 ############################################
...@@ -311,6 +314,10 @@ class Chain: ...@@ -311,6 +314,10 @@ class Chain:
311 # Common 4v9q-DV case (and similar ones) : e.g. chains contains 17 and 17A which are both read 17 by DSSR. 314 # Common 4v9q-DV case (and similar ones) : e.g. chains contains 17 and 17A which are both read 17 by DSSR.
312 # Solution : we shift the numbering of 17A (to 18) and the following residues. 315 # Solution : we shift the numbering of 17A (to 18) and the following residues.
313 df.iloc[i:, 1] += 1 316 df.iloc[i:, 1] += 1
317 + elif duplicates.iloc[0,0] == 1 and df.iloc[i,0] == 3:
318 + # 4wzo_1_1J case, there is a residue numbered -1 and read as 1 before the number 0.
319 + df.iloc[1:, 1] += 1
320 + df.iloc[0, 1] = 0
314 else: 321 else:
315 # 4v9k-DA case (and similar ones) : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ... 322 # 4v9k-DA case (and similar ones) : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ...
316 # Here the 163B is read 163 by DSSR, but there already is a residue 163. 323 # Here the 163B is read 163 by DSSR, but there already is a residue 163.
...@@ -323,7 +330,6 @@ class Chain: ...@@ -323,7 +330,6 @@ class Chain:
323 self.error_messages = f"Error with parsing of duplicate residues numbers." 330 self.error_messages = f"Error with parsing of duplicate residues numbers."
324 return None 331 return None
325 332
326 -
327 # Search for ligands at the end of the selection 333 # Search for ligands at the end of the selection
328 # Drop ligands detected as residues by DSSR, by detecting several markers 334 # Drop ligands detected as residues by DSSR, by detecting several markers
329 while ( len(df.index_chain) and df.iloc[-1,2] not in ["A", "C", "G", "U"] and ( 335 while ( len(df.index_chain) and df.iloc[-1,2] not in ["A", "C", "G", "U"] and (
...@@ -338,7 +344,6 @@ class Chain: ...@@ -338,7 +344,6 @@ class Chain:
338 self.mapping.log(df.tail(1)) 344 self.mapping.log(df.tail(1))
339 df = df.head(-1) 345 df = df.head(-1)
340 346
341 -
342 # Duplicates in index_chain : drop, they are ligands 347 # Duplicates in index_chain : drop, they are ligands
343 # e.g. 3iwn_1_B_1-91, ligand C2E has index_chain 1 (and nt_resnum 601) 348 # e.g. 3iwn_1_B_1-91, ligand C2E has index_chain 1 (and nt_resnum 601)
344 duplicates = [ index for index, element in enumerate(df.duplicated(['index_chain']).values) if element ] 349 duplicates = [ index for index, element in enumerate(df.duplicated(['index_chain']).values) if element ]
...@@ -384,7 +389,7 @@ class Chain: ...@@ -384,7 +389,7 @@ class Chain:
384 df.iloc[i+1:, 1] += 1 389 df.iloc[i+1:, 1] += 1
385 else: 390 else:
386 warn(f"Missing index_chain {i} in {self.chain_label} !") 391 warn(f"Missing index_chain {i} in {self.chain_label} !")
387 - 392 +
388 # Assert some nucleotides still exist 393 # Assert some nucleotides still exist
389 try: 394 try:
390 l = df.iloc[-1,1] - df.iloc[0,1] + 1 # update length of chain from nt_resnum point of view 395 l = df.iloc[-1,1] - df.iloc[0,1] + 1 # update length of chain from nt_resnum point of view
...@@ -522,6 +527,8 @@ class Chain: ...@@ -522,6 +527,8 @@ class Chain:
522 """Saves the extracted 3D data to the database. 527 """Saves the extracted 3D data to the database.
523 """ 528 """
524 529
530 + setproctitle(f"RNANet.py {self.chain_label} register_chain()")
531 +
525 with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: 532 with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn:
526 # Register the chain in table chain 533 # Register the chain in table chain
527 if self.mapping is not None: 534 if self.mapping is not None:
...@@ -575,6 +582,8 @@ class Chain: ...@@ -575,6 +582,8 @@ class Chain:
575 s_seq: the aligned version of self.seq_to_align 582 s_seq: the aligned version of self.seq_to_align
576 """ 583 """
577 584
585 + setproctitle(f"RNANet.py {self.chain_label} remap()")
586 +
578 alilen = len(s_seq) 587 alilen = len(s_seq)
579 re_mappings = [] 588 re_mappings = []
580 589
...@@ -630,6 +639,8 @@ class Chain: ...@@ -630,6 +639,8 @@ class Chain:
630 REQUIRES align_column and re_mapping up to date 639 REQUIRES align_column and re_mapping up to date
631 """ 640 """
632 641
642 + setproctitle(f"RNANet.py {self.chain_label} replace_gaps()")
643 +
633 homology_data = sql_ask_database(conn, f"""SELECT freq_A, freq_C, freq_G, freq_U, freq_other FROM 644 homology_data = sql_ask_database(conn, f"""SELECT freq_A, freq_C, freq_G, freq_U, freq_other FROM
634 (SELECT chain_id, rfam_acc FROM chain WHERE chain_id={self.db_chain_id}) 645 (SELECT chain_id, rfam_acc FROM chain WHERE chain_id={self.db_chain_id})
635 NATURAL JOIN re_mapping 646 NATURAL JOIN re_mapping
...@@ -741,6 +752,9 @@ class Downloader: ...@@ -741,6 +752,9 @@ class Downloader:
741 """Query the Rfam public MySQL database for mappings between their RNA families and PDB structures. 752 """Query the Rfam public MySQL database for mappings between their RNA families and PDB structures.
742 753
743 """ 754 """
755 +
756 + setproctitle(f"RNANet.py download_Rfam_PDB_mappings()")
757 +
744 # Download PDB mappings to Rfam family 758 # Download PDB mappings to Rfam family
745 print("> Fetching latest PDB mappings from Rfam..." + " " * 29, end='', flush=True) 759 print("> Fetching latest PDB mappings from Rfam..." + " " * 29, end='', flush=True)
746 try: 760 try:
...@@ -766,6 +780,8 @@ class Downloader: ...@@ -766,6 +780,8 @@ class Downloader:
766 Does not download if already there. 780 Does not download if already there.
767 """ 781 """
768 782
783 + setproctitle(f"RNANet.py download_Rfam_cm()")
784 +
769 print(f"\t> Download Rfam.cm.gz from Rfam..." + " " * 37, end='', flush=True) 785 print(f"\t> Download Rfam.cm.gz from Rfam..." + " " * 37, end='', flush=True)
770 if not path.isfile(path_to_seq_data + "Rfam.cm"): 786 if not path.isfile(path_to_seq_data + "Rfam.cm"):
771 try: 787 try:
...@@ -785,6 +801,9 @@ class Downloader: ...@@ -785,6 +801,9 @@ class Downloader:
785 Family ID, number of sequences identified, maximum length of those sequences. 801 Family ID, number of sequences identified, maximum length of those sequences.
786 SETS family in the database (partially) 802 SETS family in the database (partially)
787 """ 803 """
804 +
805 + setproctitle(f"RNANet.py download_Rfam_family_stats()")
806 +
788 try: 807 try:
789 db_connection = sqlalchemy.create_engine('mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam') 808 db_connection = sqlalchemy.create_engine('mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam')
790 809
...@@ -829,6 +848,8 @@ class Downloader: ...@@ -829,6 +848,8 @@ class Downloader:
829 848
830 Actually gets a FASTA archive from the public Rfam FTP. Does not download if already there.""" 849 Actually gets a FASTA archive from the public Rfam FTP. Does not download if already there."""
831 850
851 + setproctitle(f"RNANet.py download_Rfam_sequences({rfam_acc})")
852 +
832 if not path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"): 853 if not path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"):
 833 for _ in range(10): # retry 10 times if it fails 854
834 try: 855 try:
...@@ -849,6 +870,9 @@ class Downloader: ...@@ -849,6 +870,9 @@ class Downloader:
849 870
850 Does not remove structural redundancy. 871 Does not remove structural redundancy.
851 """ 872 """
873 +
874 + setproctitle(f"RNANet.py download_BGSU_NR_list({res})")
875 +
852 nr_code = min([ i for i in [1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 20.0] if i >= res ]) 876 nr_code = min([ i for i in [1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 20.0] if i >= res ])
853 print(f"> Fetching latest list of RNA files at {nr_code} A resolution from BGSU website...", end='', flush=True) 877 print(f"> Fetching latest list of RNA files at {nr_code} A resolution from BGSU website...", end='', flush=True)
854 # Download latest BGSU non-redundant list 878 # Download latest BGSU non-redundant list
...@@ -875,6 +899,10 @@ class Downloader: ...@@ -875,6 +899,10 @@ class Downloader:
875 return full_structures_list # list of ( str (class), str (class_members) ) 899 return full_structures_list # list of ( str (class), str (class_members) )
876 900
877 def download_from_SILVA(self, unit): 901 def download_from_SILVA(self, unit):
902 +
903 + setproctitle(f"RNANet.py download_from_SILVA({unit})")
904 +
905 +
878 if not path.isfile(path_to_seq_data + f"realigned/{unit}.arb"): 906 if not path.isfile(path_to_seq_data + f"realigned/{unit}.arb"):
879 try: 907 try:
880 print(f"Downloading {unit} from SILVA...", end='', flush=True) 908 print(f"Downloading {unit} from SILVA...", end='', flush=True)
...@@ -989,6 +1017,8 @@ class Pipeline: ...@@ -989,6 +1017,8 @@ class Pipeline:
989 global path_to_3D_data 1017 global path_to_3D_data
990 global path_to_seq_data 1018 global path_to_seq_data
991 1019
1020 + setproctitle("RNANet.py process_options()")
1021 +
992 try: 1022 try:
993 opts, _ = getopt.getopt( sys.argv[1:], "r:hs", 1023 opts, _ = getopt.getopt( sys.argv[1:], "r:hs",
994 [ "help", "resolution=", "keep-hetatm=", "from-scratch", 1024 [ "help", "resolution=", "keep-hetatm=", "from-scratch",
...@@ -1105,13 +1135,16 @@ class Pipeline: ...@@ -1105,13 +1135,16 @@ class Pipeline:
1105 print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments") 1135 print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments")
1106 print("See RNANet.py --help for more information.") 1136 print("See RNANet.py --help for more information.")
1107 exit(1) 1137 exit(1)
1108 - 1138 +
1139 + @trace_unhandled_exceptions
1109 def list_available_mappings(self): 1140 def list_available_mappings(self):
1110 """List 3D chains with available Rfam mappings. 1141 """List 3D chains with available Rfam mappings.
1111 1142
1112 Return a list of Chain() objects with the mappings set up. 1143 Return a list of Chain() objects with the mappings set up.
1113 If self.HOMOLOGY is set to False, simply returns a list of Chain() objects with available 3D chains.""" 1144 If self.HOMOLOGY is set to False, simply returns a list of Chain() objects with available 3D chains."""
1114 1145
1146 + setproctitle("RNANet.py list_available_mappings()")
1147 +
1115 # List all 3D RNA chains below given resolution 1148 # List all 3D RNA chains below given resolution
1116 full_structures_list = self.dl.download_BGSU_NR_list(self.CRYSTAL_RES) # list of tuples ( class, class_members ) 1149 full_structures_list = self.dl.download_BGSU_NR_list(self.CRYSTAL_RES) # list of tuples ( class, class_members )
1117 1150
...@@ -1131,11 +1164,11 @@ class Pipeline: ...@@ -1131,11 +1164,11 @@ class Pipeline:
1131 # Compute the list of mappable structures using NR-list and Rfam-PDB mappings 1164 # Compute the list of mappable structures using NR-list and Rfam-PDB mappings
1132 # And get Chain() objects 1165 # And get Chain() objects
1133 print("> Building list of structures...", flush=True) 1166 print("> Building list of structures...", flush=True)
1134 - p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=ncores) 1167 + p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=ncores, maxtasksperchild=1)
1135 try: 1168 try:
1136 1169
1137 - pbar = tqdm(full_structures_list, maxinterval=1.0, miniters=1, bar_format="{percentage:3.0f}%|{bar}|") 1170 + pbar = tqdm(full_structures_list, maxinterval=1.0, miniters=1, desc="Eq. classes", bar_format="{percentage:3.0f}%|{bar}|")
1138 - for _, newchains in enumerate(p.imap_unordered(partial(work_infer_mappings, not self.REUSE_ALL, allmappings), full_structures_list)): 1171 + for _, newchains in enumerate(p.imap_unordered(partial(work_infer_mappings, not self.REUSE_ALL, allmappings), full_structures_list, chunksize=1)):
1139 self.update += newchains 1172 self.update += newchains
1140 pbar.update(1) # Everytime the iteration finishes, update the global progress bar 1173 pbar.update(1) # Everytime the iteration finishes, update the global progress bar
1141 1174
...@@ -1161,7 +1194,7 @@ class Pipeline: ...@@ -1161,7 +1194,7 @@ class Pipeline:
1161 pdb_chain_id = nr[2].upper() 1194 pdb_chain_id = nr[2].upper()
1162 chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}" 1195 chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}"
1163 res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc IS NULL AND issue=0""") 1196 res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc IS NULL AND issue=0""")
 1164 - if not len(res): # the chain is NOT yet in the database, or this is a known issue 1197 + if not len(res) or self.REUSE_ALL: # the chain is NOT yet in the database (or we rebuild everything), or this is a known issue
1165 self.update.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class)) 1198 self.update.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class))
1166 conn.close() 1199 conn.close()
1167 1200
...@@ -1179,6 +1212,8 @@ class Pipeline: ...@@ -1179,6 +1212,8 @@ class Pipeline:
1179 REQUIRES the previous definition of self.update, so call list_available_mappings() before. 1212 REQUIRES the previous definition of self.update, so call list_available_mappings() before.
1180 SETS table structure""" 1213 SETS table structure"""
1181 1214
1215 + setproctitle(f"RNANet.py dl_and_annotate(retry={retry})")
1216 +
1182 # Prepare the results folders 1217 # Prepare the results folders
1183 if not path.isdir(path_to_3D_data + "RNAcifs"): 1218 if not path.isdir(path_to_3D_data + "RNAcifs"):
1184 os.makedirs(path_to_3D_data + "RNAcifs") # for the whole structures 1219 os.makedirs(path_to_3D_data + "RNAcifs") # for the whole structures
...@@ -1186,15 +1221,15 @@ class Pipeline: ...@@ -1186,15 +1221,15 @@ class Pipeline:
1186 os.makedirs(path_to_3D_data + "annotations") # for DSSR analysis of the whole structures 1221 os.makedirs(path_to_3D_data + "annotations") # for DSSR analysis of the whole structures
1187 1222
1188 # Download and annotate 1223 # Download and annotate
1189 - print("> Downloading and annotating structures...", flush=True) 1224 + print("> Downloading and annotating structures (or checking previous results if they exist)...", flush=True)
1190 if retry: 1225 if retry:
1191 mmcif_list = sorted(set([ c.pdb_id for c in self.retry ])) 1226 mmcif_list = sorted(set([ c.pdb_id for c in self.retry ]))
1192 else: 1227 else:
1193 - mmcif_list = sorted(set([ c.pdb_id for c in self.update if not path.isfile(path_to_3D_data + "annotations/" + c.pdb_id + ".json") ])) 1228 + mmcif_list = sorted(set([ c.pdb_id for c in self.update ]))
1194 try: 1229 try:
1195 - p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=int(coeff_ncores*ncores)) 1230 + p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=int(coeff_ncores*ncores), maxtasksperchild=1)
1196 pbar = tqdm(mmcif_list, maxinterval=1.0, miniters=1, desc="mmCIF files") 1231 pbar = tqdm(mmcif_list, maxinterval=1.0, miniters=1, desc="mmCIF files")
1197 - for _ in p.imap_unordered(work_mmcif, mmcif_list): 1232 + for _ in p.imap_unordered(work_mmcif, mmcif_list, chunksize=1):
1198 pbar.update(1) # Everytime the iteration finishes, update the global progress bar 1233 pbar.update(1) # Everytime the iteration finishes, update the global progress bar
1199 pbar.close() 1234 pbar.close()
1200 p.close() 1235 p.close()
...@@ -1213,6 +1248,8 @@ class Pipeline: ...@@ -1213,6 +1248,8 @@ class Pipeline:
1213 REQUIRES the previous definition of self.update, so call list_available_mappings() before. 1248 REQUIRES the previous definition of self.update, so call list_available_mappings() before.
1214 SETS self.loaded_chains""" 1249 SETS self.loaded_chains"""
1215 1250
1251 + setproctitle(f"RNANet.py build_chains(retry={retry})")
1252 +
1216 # Prepare folders 1253 # Prepare folders
1217 if self.EXTRACT_CHAINS: 1254 if self.EXTRACT_CHAINS:
1218 if self.HOMOLOGY and not path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"): 1255 if self.HOMOLOGY and not path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"):
...@@ -1239,19 +1276,25 @@ class Pipeline: ...@@ -1239,19 +1276,25 @@ class Pipeline:
1239 exit(1) 1276 exit(1)
1240 1277
1241 # If there were newly discovered problems, add this chain to the known issues 1278 # If there were newly discovered problems, add this chain to the known issues
1279 + issues = 0
1280 + issues_names = []
1242 ki = open(runDir + "/known_issues.txt", 'a') 1281 ki = open(runDir + "/known_issues.txt", 'a')
1243 kir = open(runDir + "/known_issues_reasons.txt", 'a') 1282 kir = open(runDir + "/known_issues_reasons.txt", 'a')
1244 for c in results: 1283 for c in results:
1245 if c[1].delete_me and c[1].chain_label not in self.known_issues: 1284 if c[1].delete_me and c[1].chain_label not in self.known_issues:
1246 if retry or "Could not load existing" not in c[1].error_messages: 1285 if retry or "Could not load existing" not in c[1].error_messages:
1247 self.known_issues.append(c[1].chain_label) 1286 self.known_issues.append(c[1].chain_label)
1248 - warn(f"Adding {c[1].chain_label} to known issues.") 1287 + issues += 1
1288 + issues_names.append(c[1].chain_label)
1249 ki.write(c[1].chain_label + '\n') 1289 ki.write(c[1].chain_label + '\n')
1250 kir.write(c[1].chain_label + '\n' + c[1].error_messages + '\n\n') 1290 kir.write(c[1].chain_label + '\n' + c[1].error_messages + '\n\n')
1251 with sqlite3.connect(runDir+"/results/RNANet.db") as conn: 1291 with sqlite3.connect(runDir+"/results/RNANet.db") as conn:
1252 sql_execute(conn, f"UPDATE chain SET issue = 1 WHERE chain_id = ?;", data=(c[1].db_chain_id,)) 1292 sql_execute(conn, f"UPDATE chain SET issue = 1 WHERE chain_id = ?;", data=(c[1].db_chain_id,))
1253 ki.close() 1293 ki.close()
1254 kir.close() 1294 kir.close()
1295 + if issues:
1296 + warn("Added newly discovered issues to known issues:")
1297 + print("\033[33m"+ " ".join(issues_names) + "\033[0m", flush=True)
1255 1298
1256 # Add successfully built chains to list 1299 # Add successfully built chains to list
1257 self.loaded_chains += [ c[1] for c in results if not c[1].delete_me ] 1300 self.loaded_chains += [ c[1] for c in results if not c[1].delete_me ]
...@@ -1276,6 +1319,8 @@ class Pipeline: ...@@ -1276,6 +1319,8 @@ class Pipeline:
1276 REQUIRES that self.loaded_chains is defined. 1319 REQUIRES that self.loaded_chains is defined.
1277 SETS family (partially, through call)""" 1320 SETS family (partially, through call)"""
1278 1321
1322 + setproctitle("RNANet.py prepare_sequences()")
1323 +
1279 # Preparing a results folder 1324 # Preparing a results folder
1280 if not os.access(path_to_seq_data + "realigned/", os.F_OK): 1325 if not os.access(path_to_seq_data + "realigned/", os.F_OK):
1281 os.makedirs(path_to_seq_data + "realigned/") 1326 os.makedirs(path_to_seq_data + "realigned/")
...@@ -1308,6 +1353,8 @@ class Pipeline: ...@@ -1308,6 +1353,8 @@ class Pipeline:
1308 REQUIRES self.fam_list to be defined 1353 REQUIRES self.fam_list to be defined
1309 SETS family (partially)""" 1354 SETS family (partially)"""
1310 1355
1356 + setproctitle("RNANet.py realign()")
1357 +
1311 # Prepare the job list 1358 # Prepare the job list
1312 joblist = [] 1359 joblist = []
1313 for f in self.fam_list: 1360 for f in self.fam_list:
...@@ -1345,6 +1392,8 @@ class Pipeline: ...@@ -1345,6 +1392,8 @@ class Pipeline:
1345 1392
1346 REQUIRES self.fam_list to be defined""" 1393 REQUIRES self.fam_list to be defined"""
1347 1394
1395 + setproctitle("RNANet.py remap()")
1396 +
1348 print("Computing nucleotide frequencies in alignments...\nThis can be very long on slow storage devices (Hard-drive...)") 1397 print("Computing nucleotide frequencies in alignments...\nThis can be very long on slow storage devices (Hard-drive...)")
1349 print("Check your CPU and disk I/O activity before deciding if the job failed.") 1398 print("Check your CPU and disk I/O activity before deciding if the job failed.")
1350 nworkers =max(min(ncores, len(self.fam_list)), 1) 1399 nworkers =max(min(ncores, len(self.fam_list)), 1)
...@@ -1357,11 +1406,11 @@ class Pipeline: ...@@ -1357,11 +1406,11 @@ class Pipeline:
1357 1406
1358 # Start a process pool to dispatch the RNA families, 1407 # Start a process pool to dispatch the RNA families,
1359 # over multiple CPUs (one family by CPU) 1408 # over multiple CPUs (one family by CPU)
1360 - p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers, maxtasksperchild=5) 1409 + p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers, maxtasksperchild=1)
1361 1410
1362 try: 1411 try:
1363 fam_pbar = tqdm(total=len(self.fam_list), desc="RNA families", position=0, leave=True) 1412 fam_pbar = tqdm(total=len(self.fam_list), desc="RNA families", position=0, leave=True)
1364 - for i, _ in enumerate(p.imap_unordered(partial(work_pssm, fill_gaps=self.FILL_GAPS), self.fam_list)): # Apply work_pssm to each RNA family 1413 + for i, _ in enumerate(p.imap_unordered(partial(work_pssm, fill_gaps=self.FILL_GAPS), self.fam_list, chunksize=1)): # Apply work_pssm to each RNA family
1365 fam_pbar.update(1) # Everytime the iteration finishes on a family, update the global progress bar over the RNA families 1414 fam_pbar.update(1) # Everytime the iteration finishes on a family, update the global progress bar over the RNA families
1366 fam_pbar.close() 1415 fam_pbar.close()
1367 p.close() 1416 p.close()
...@@ -1378,6 +1427,8 @@ class Pipeline: ...@@ -1378,6 +1427,8 @@ class Pipeline:
1378 1427
1379 REQUIRES self.loaded_chains (to output corresponding CSV files) and self.fam_list (for statistics)""" 1428 REQUIRES self.loaded_chains (to output corresponding CSV files) and self.fam_list (for statistics)"""
1380 1429
1430 + setproctitle("RNANet.py output_results()")
1431 +
1381 time_str = time.strftime("%Y%m%d") 1432 time_str = time.strftime("%Y%m%d")
1382 1433
1383 #Prepare folders: 1434 #Prepare folders:
...@@ -1387,10 +1438,10 @@ class Pipeline: ...@@ -1387,10 +1438,10 @@ class Pipeline:
1387 os.makedirs(runDir + "/results/archive/") 1438 os.makedirs(runDir + "/results/archive/")
1388 1439
1389 # Save to by-chain CSV files 1440 # Save to by-chain CSV files
1390 - p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=3) 1441 + p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=3, maxtasksperchild=1)
1391 try: 1442 try:
1392 pbar = tqdm(total=len(self.loaded_chains), desc="Saving chains to CSV", position=0, leave=True) 1443 pbar = tqdm(total=len(self.loaded_chains), desc="Saving chains to CSV", position=0, leave=True)
1393 - for _, _2 in enumerate(p.imap_unordered(work_save, self.loaded_chains)): 1444 + for _, _2 in enumerate(p.imap_unordered(work_save, self.loaded_chains, chunksize=2)):
1394 pbar.update(1) 1445 pbar.update(1)
1395 pbar.close() 1446 pbar.close()
1396 p.close() 1447 p.close()
...@@ -1439,6 +1490,8 @@ class Pipeline: ...@@ -1439,6 +1490,8 @@ class Pipeline:
1439 def sanitize_database(self): 1490 def sanitize_database(self):
1440 """Searches for issues in the database and correct them""" 1491 """Searches for issues in the database and correct them"""
1441 1492
1493 + setproctitle("RNANet.py sanitize_database()")
1494 +
1442 conn = sqlite3.connect(runDir + "/results/RNANet.db") 1495 conn = sqlite3.connect(runDir + "/results/RNANet.db")
1443 1496
1444 # Assert every structure is used 1497 # Assert every structure is used
...@@ -1532,7 +1585,7 @@ def sql_define_tables(conn): ...@@ -1532,7 +1585,7 @@ def sql_define_tables(conn):
1532 chain_id INTEGER PRIMARY KEY NOT NULL, 1585 chain_id INTEGER PRIMARY KEY NOT NULL,
1533 structure_id CHAR(4) NOT NULL, 1586 structure_id CHAR(4) NOT NULL,
1534 chain_name VARCHAR(2) NOT NULL, 1587 chain_name VARCHAR(2) NOT NULL,
1535 - eq_class VARCHAR(10), 1588 + eq_class VARCHAR(16),
1536 pdb_start SMALLINT, 1589 pdb_start SMALLINT,
1537 pdb_end SMALLINT, 1590 pdb_end SMALLINT,
1538 issue TINYINT, 1591 issue TINYINT,
...@@ -1767,9 +1820,9 @@ def execute_joblist(fulljoblist): ...@@ -1767,9 +1820,9 @@ def execute_joblist(fulljoblist):
1767 1820
1768 print("using", n, "processes:") 1821 print("using", n, "processes:")
1769 # execute jobs of priority i that should be processed n by n: 1822 # execute jobs of priority i that should be processed n by n:
1770 - p = Pool(processes=n, maxtasksperchild=5, initializer=init_worker) 1823 + p = Pool(processes=n, maxtasksperchild=1, initializer=init_worker)
1771 try: 1824 try:
1772 - raw_results = p.map(partial(execute_job, jobcount=jobcount), bunch) 1825 + raw_results = p.map(partial(execute_job, jobcount=jobcount), bunch, chunksize=2)
1773 p.close() 1826 p.close()
1774 p.join() 1827 p.join()
1775 except KeyboardInterrupt as e: 1828 except KeyboardInterrupt as e:
...@@ -1792,6 +1845,9 @@ def work_infer_mappings(update_only, allmappings, codelist): ...@@ -1792,6 +1845,9 @@ def work_infer_mappings(update_only, allmappings, codelist):
1792 build a list of Chain() objects mapped to Rfam families, by expanding available mappings 1845 build a list of Chain() objects mapped to Rfam families, by expanding available mappings
1793 of any element of the list to all the list elements. 1846 of any element of the list to all the list elements.
1794 """ 1847 """
1848 +
1849 + setproctitle("RNAnet.py work_infer_mappings()")
1850 +
1795 newchains = [] 1851 newchains = []
1796 known_mappings = pd.DataFrame() 1852 known_mappings = pd.DataFrame()
1797 1853
...@@ -1812,11 +1868,12 @@ def work_infer_mappings(update_only, allmappings, codelist): ...@@ -1812,11 +1868,12 @@ def work_infer_mappings(update_only, allmappings, codelist):
1812 1868
1813 # Now infer mappings for chains that are not explicitely listed in Rfam-PDB mappings: 1869 # Now infer mappings for chains that are not explicitely listed in Rfam-PDB mappings:
1814 if len(known_mappings): 1870 if len(known_mappings):
1871 +
1815 families = set(known_mappings['rfam_acc']) 1872 families = set(known_mappings['rfam_acc'])
1816 1873
1817 # generalize 1874 # generalize
1818 inferred_mappings = known_mappings.drop(['pdb_id','chain'], axis=1).drop_duplicates() 1875 inferred_mappings = known_mappings.drop(['pdb_id','chain'], axis=1).drop_duplicates()
1819 - 1876 +
1820 # check for approximative redundancy: 1877 # check for approximative redundancy:
1821 if len(inferred_mappings) != len(inferred_mappings.drop_duplicates(subset="rfam_acc")): 1878 if len(inferred_mappings) != len(inferred_mappings.drop_duplicates(subset="rfam_acc")):
1822 # Then, there exists some mapping variants onto the same Rfam family CM, 1879 # Then, there exists some mapping variants onto the same Rfam family CM,
...@@ -1831,9 +1888,25 @@ def work_infer_mappings(update_only, allmappings, codelist): ...@@ -1831,9 +1888,25 @@ def work_infer_mappings(update_only, allmappings, codelist):
1831 len(inferred_mappings[thisfam_5_3]) != len(inferred_mappings[ inferred_mappings['rfam_acc'] == rfam ]) 1888 len(inferred_mappings[thisfam_5_3]) != len(inferred_mappings[ inferred_mappings['rfam_acc'] == rfam ])
1832 and len(inferred_mappings[thisfam_5_3]) > 0 1889 and len(inferred_mappings[thisfam_5_3]) > 0
1833 ): 1890 ):
1834 - warn(f"There are mappings for {rfam} in both directions:", error=True) 1891 + # there are mappings in both directions... wtf Rfam ?!
1835 - print(inferred_mappings) 1892 + if (len(inferred_mappings[thisfam_5_3]) == len(inferred_mappings[thisfam_3_5]) == 1
1836 - exit(1) 1893 + and int(inferred_mappings[thisfam_5_3].pdb_start) == int(inferred_mappings[thisfam_3_5].pdb_end)
1894 + and int(inferred_mappings[thisfam_5_3].pdb_end) == int(inferred_mappings[thisfam_3_5].pdb_start)
1895 + ):
1896 + # The two mappings are on the same nucleotide interval, but in each sense.
1897 + # e.g. RF00254 6v5b and 6v5c... maybe a bug on their side ?
1898 + # How can a chain match a CM in both senses ?
1899 + # We keep only the 5->3 sense.
1900 + inferred_mappings = inferred_mappings.drop(index=inferred_mappings.index[thisfam_3_5])
1901 + sel_5_to_3 = (inferred_mappings['pdb_start'] < inferred_mappings['pdb_end'])
1902 + thisfam_5_3 = (inferred_mappings['rfam_acc'] == rfam ) & sel_5_to_3
1903 + thisfam_3_5 = (inferred_mappings['rfam_acc'] == rfam ) & (sel_5_to_3 == False)
1904 + print()
1905 + warn(f"Found mappings to {rfam} in both directions on the same interval, keeping only the 5'->3' one.")
1906 + else:
1907 + warn(f"There are mappings for {rfam} in both directions:", error=True)
1908 + print(inferred_mappings)
1909 + exit(1)
1837 1910
1838 # Compute consensus for chains in 5' -> 3' sense 1911 # Compute consensus for chains in 5' -> 3' sense
1839 if len(inferred_mappings[thisfam_5_3]): 1912 if len(inferred_mappings[thisfam_5_3]):
...@@ -1887,7 +1960,7 @@ def work_infer_mappings(update_only, allmappings, codelist): ...@@ -1887,7 +1960,7 @@ def work_infer_mappings(update_only, allmappings, codelist):
1887 pdb_start = int(m.pdb_start.min()) 1960 pdb_start = int(m.pdb_start.min())
1888 pdb_end = int(m.pdb_end.max()) 1961 pdb_end = int(m.pdb_end.max())
1889 inferred = False 1962 inferred = False
1890 - else: # otherwise, use the inferred mapping 1963 + elif not(pdb_id in known_mappings.pdb_id and pdb_chain_id in known_mappings.chain): # if no known mapping on another family, use the inferred mapping
1891 pdb_start = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_start) 1964 pdb_start = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_start)
1892 pdb_end = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_end) 1965 pdb_end = int(inferred_mappings.loc[ (inferred_mappings['rfam_acc'] == rfam) ].pdb_end)
1893 inferred = True 1966 inferred = True
...@@ -1911,6 +1984,8 @@ def work_mmcif(pdb_id): ...@@ -1911,6 +1984,8 @@ def work_mmcif(pdb_id):
1911 SETS table structure 1984 SETS table structure
1912 """ 1985 """
1913 1986
1987 + setproctitle(f"RNAnet.py work_mmcif({pdb_id})")
1988 +
1914 final_filepath = path_to_3D_data+"RNAcifs/"+pdb_id+".cif" 1989 final_filepath = path_to_3D_data+"RNAcifs/"+pdb_id+".cif"
1915 1990
1916 # Attempt to download it if not present 1991 # Attempt to download it if not present
...@@ -1921,43 +1996,52 @@ def work_mmcif(pdb_id): ...@@ -1921,43 +1996,52 @@ def work_mmcif(pdb_id):
1921 warn(f"Unable to download {pdb_id}.cif. Ignoring it.", error=True) 1996 warn(f"Unable to download {pdb_id}.cif. Ignoring it.", error=True)
1922 return 1997 return
1923 1998
1924 - # Load the MMCIF file with Biopython 1999 + # check if it exists in database
1925 - mmCif_info = MMCIF2Dict(final_filepath)
1926 -
1927 - # Get info about that structure
1928 - exp_meth = mmCif_info["_exptl.method"][0]
1929 - date = mmCif_info["_pdbx_database_status.recvd_initial_deposition_date"][0]
1930 - if "_refine.ls_d_res_high" in mmCif_info.keys() and mmCif_info["_refine.ls_d_res_high"][0] not in ['.', '?']:
1931 - reso = float(mmCif_info["_refine.ls_d_res_high"][0])
1932 - elif "_refine.ls_d_res_low" in mmCif_info.keys() and mmCif_info["_refine.ls_d_res_low"][0] not in ['.', '?']:
1933 - reso = float(mmCif_info["_refine.ls_d_res_low"][0])
1934 - elif "_em_3d_reconstruction.resolution" in mmCif_info.keys() and mmCif_info["_em_3d_reconstruction.resolution"][0] not in ['.', '?']:
1935 - reso = float(mmCif_info["_em_3d_reconstruction.resolution"][0])
1936 - else:
1937 - warn(f"Wtf, structure {pdb_id} has no resolution ?")
1938 - warn(f"Check https://files.rcsb.org/header/{pdb_id}.cif to figure it out.")
1939 - reso = 0.0
1940 -
1941 - # Save into the database
1942 with sqlite3.connect(runDir + "/results/RNANet.db") as conn: 2000 with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
1943 - sql_execute(conn, """INSERT OR REPLACE INTO structure (pdb_id, pdb_model, date, exp_method, resolution) 2001 + r = sql_ask_database(conn, f"""SELECT * from structure where pdb_id = '{pdb_id}';""")
1944 - VALUES (?, ?, DATE(?), ?, ?);""", data = (pdb_id, 1, date, exp_meth, reso)) 2002 +
1945 - 2003 + # if not, read the CIF header and register the structure
1946 - # run DSSR (you need to have it in your $PATH, follow x3dna installation instructions) 2004 + if not len(r):
1947 - output = subprocess.run(["x3dna-dssr", f"-i={final_filepath}", "--json", "--auxfile=no"], 2005 + # Load the MMCIF file with Biopython
1948 - stdout=subprocess.PIPE, stderr=subprocess.PIPE) 2006 + mmCif_info = MMCIF2Dict(final_filepath)
1949 - stdout = output.stdout.decode('utf-8') 2007 +
1950 - stderr = output.stderr.decode('utf-8') 2008 + # Get info about that structure
1951 - 2009 + exp_meth = mmCif_info["_exptl.method"][0]
1952 - if "exception" in stderr: 2010 + date = mmCif_info["_pdbx_database_status.recvd_initial_deposition_date"][0]
1953 - # DSSR is unable to parse the chain. 2011 + if "_refine.ls_d_res_high" in mmCif_info.keys() and mmCif_info["_refine.ls_d_res_high"][0] not in ['.', '?']:
1954 - warn(f"Exception while running DSSR, ignoring {pdb_id}.", error=True) 2012 + reso = float(mmCif_info["_refine.ls_d_res_high"][0])
1955 - return 1 2013 + elif "_refine.ls_d_res_low" in mmCif_info.keys() and mmCif_info["_refine.ls_d_res_low"][0] not in ['.', '?']:
2014 + reso = float(mmCif_info["_refine.ls_d_res_low"][0])
2015 + elif "_em_3d_reconstruction.resolution" in mmCif_info.keys() and mmCif_info["_em_3d_reconstruction.resolution"][0] not in ['.', '?']:
2016 + reso = float(mmCif_info["_em_3d_reconstruction.resolution"][0])
2017 + else:
2018 + warn(f"Wtf, structure {pdb_id} has no resolution ?")
2019 + warn(f"Check https://files.rcsb.org/header/{pdb_id}.cif to figure it out.")
2020 + reso = 0.0
2021 +
2022 + # Save into the database
2023 + with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
2024 + sql_execute(conn, """INSERT OR REPLACE INTO structure (pdb_id, pdb_model, date, exp_method, resolution)
2025 + VALUES (?, ?, DATE(?), ?, ?);""", data = (pdb_id, 1, date, exp_meth, reso))
2026 +
2027 + if not path.isfile(path_to_3D_data + "annotations/" + pdb_id + ".json"):
2028 +
2029 + # run DSSR (you need to have it in your $PATH, follow x3dna installation instructions)
2030 + output = subprocess.run(["x3dna-dssr", f"-i={final_filepath}", "--json", "--auxfile=no"],
2031 + stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2032 + stdout = output.stdout.decode('utf-8')
2033 + stderr = output.stderr.decode('utf-8')
2034 +
2035 + if "exception" in stderr:
2036 + # DSSR is unable to parse the chain.
2037 + warn(f"Exception while running DSSR, ignoring {pdb_id}.", error=True)
2038 + return 1
2039 +
2040 + # save the analysis to file only if we can load it :/
2041 + json_file = open(path_to_3D_data + "annotations/" + pdb_id + ".json", "w")
2042 + json_file.write(stdout)
2043 + json_file.close()
1956 2044
1957 - # save the analysis to file only if we can load it :/
1958 - json_file = open(path_to_3D_data + "annotations/" + pdb_id + ".json", "w")
1959 - json_file.write(stdout)
1960 - json_file.close()
1961 return 0 2045 return 0
1962 2046
1963 @trace_unhandled_exceptions 2047 @trace_unhandled_exceptions
...@@ -1966,6 +2050,9 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): ...@@ -1966,6 +2050,9 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True):
1966 If asked, also extracts the 3D chains from their original structure files. 2050 If asked, also extracts the 3D chains from their original structure files.
1967 2051
1968 """ 2052 """
2053 +
2054 + setproctitle(f"RNAnet.py work_build_chain({c.chain_label})")
2055 +
1969 if not path.isfile(path_to_3D_data + "annotations/" + c.pdb_id + ".json"): 2056 if not path.isfile(path_to_3D_data + "annotations/" + c.pdb_id + ".json"):
1970 warn(f"Could not find annotations for {c.chain_label}, ignoring it.", error=True) 2057 warn(f"Could not find annotations for {c.chain_label}, ignoring it.", error=True)
1971 c.delete_me = True 2058 c.delete_me = True
...@@ -1997,6 +2084,8 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): ...@@ -1997,6 +2084,8 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True):
1997 def work_prepare_sequences(dl, rfam_acc, chains): 2084 def work_prepare_sequences(dl, rfam_acc, chains):
1998 """Prepares FASTA files of homologous sequences to realign with cmalign or SINA.""" 2085 """Prepares FASTA files of homologous sequences to realign with cmalign or SINA."""
1999 2086
2087 + setproctitle("RNAnet.py work_prepare_sequences()")
2088 +
2000 if rfam_acc in LSU_set | SSU_set: # rRNA 2089 if rfam_acc in LSU_set | SSU_set: # rRNA
2001 if path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"): 2090 if path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"):
2002 # Detect doublons and remove them 2091 # Detect doublons and remove them
...@@ -2087,6 +2176,8 @@ def work_realign(rfam_acc): ...@@ -2087,6 +2176,8 @@ def work_realign(rfam_acc):
2087 cmalign requires too much RAM for them, so we use SINA, a specifically designed tool for rRNAs. 2176 cmalign requires too much RAM for them, so we use SINA, a specifically designed tool for rRNAs.
2088 """ 2177 """
2089 2178
2179 + setproctitle(f"RNAnet.py work_realign({rfam_acc})")
2180 +
2090 if rfam_acc in LSU_set | SSU_set: 2181 if rfam_acc in LSU_set | SSU_set:
2091 # Ribosomal subunits deserve a special treatment. 2182 # Ribosomal subunits deserve a special treatment.
2092 # They require too much RAM to be aligned with Infernal. 2183 # They require too much RAM to be aligned with Infernal.
...@@ -2210,6 +2301,7 @@ def work_pssm(f, fill_gaps): ...@@ -2210,6 +2301,7 @@ def work_pssm(f, fill_gaps):
2210 Uses only 1 core, so this function can be called in parallel. 2301 Uses only 1 core, so this function can be called in parallel.
2211 2302
2212 """ 2303 """
2304 + setproctitle(f"RNAnet.py work_pssm({f})")
2213 2305
2214 # Get a worker number to position the progress bar 2306 # Get a worker number to position the progress bar
2215 global idxQueue 2307 global idxQueue
...@@ -2223,6 +2315,7 @@ def work_pssm(f, fill_gaps): ...@@ -2223,6 +2315,7 @@ def work_pssm(f, fill_gaps):
2223 try: 2315 try:
2224 align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta") 2316 align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")
2225 except: 2317 except:
2318 + warn(f"{f}'s alignment is wrong. Recompute it and retry.", error=True)
2226 with open(runDir + "/errors.txt", "a") as errf: 2319 with open(runDir + "/errors.txt", "a") as errf:
2227 errf.write(f"{f}'s alignment is wrong. Recompute it and retry.\n") 2320 errf.write(f"{f}'s alignment is wrong. Recompute it and retry.\n")
2228 return 1 2321 return 1
...@@ -2313,6 +2406,9 @@ def work_pssm(f, fill_gaps): ...@@ -2313,6 +2406,9 @@ def work_pssm(f, fill_gaps):
2313 2406
2314 @trace_unhandled_exceptions 2407 @trace_unhandled_exceptions
2315 def work_save(c, homology=True): 2408 def work_save(c, homology=True):
2409 +
2410 + setproctitle(f"RNAnet.py work_save({c.chain_label})")
2411 +
2316 conn = sqlite3.connect(runDir + "/results/RNANet.db", timeout=15.0) 2412 conn = sqlite3.connect(runDir + "/results/RNANet.db", timeout=15.0)
2317 if homology: 2413 if homology:
2318 df = pd.read_sql_query(f""" 2414 df = pd.read_sql_query(f"""
......
...@@ -12,7 +12,6 @@ done ...@@ -12,7 +12,6 @@ done
12 12
13 PROCESS_TO_KILL="statistics.py" 13 PROCESS_TO_KILL="statistics.py"
14 PROCESS_LIST=`ps ax | grep -Ei ${PROCESS_TO_KILL} | grep -Eiv '(grep|vi statistics.py)' | awk ' { print $1;}'` 14 PROCESS_LIST=`ps ax | grep -Ei ${PROCESS_TO_KILL} | grep -Eiv '(grep|vi statistics.py)' | awk ' { print $1;}'`
15 -KILLED=
16 for KILLPID in $PROCESS_LIST; do 15 for KILLPID in $PROCESS_LIST; do
17 if [ ! -z $KILLPID ];then 16 if [ ! -z $KILLPID ];then
18 kill -9 $KILLPID 17 kill -9 $KILLPID
......
...@@ -3,8 +3,8 @@ import subprocess, os, sys ...@@ -3,8 +3,8 @@ import subprocess, os, sys
3 3
4 # Put a list of problematic chains here, they will be properly deleted and recomputed 4 # Put a list of problematic chains here, they will be properly deleted and recomputed
5 problems = [ 5 problems = [
6 -"4v9n_1_DA_1-2879", 6 + "1k73_1_A",
7 -"4v9n_1_DA_148-2875" 7 + "1k73_1_B"
8 ] 8 ]
9 9
10 path_to_3D_data = sys.argv[1] 10 path_to_3D_data = sys.argv[1]
...@@ -15,6 +15,7 @@ for p in problems: ...@@ -15,6 +15,7 @@ for p in problems:
15 print() 15 print()
16 print() 16 print()
17 print() 17 print()
18 + homology = ('-' in p)
18 19
19 # Remove the datapoints files and 3D files 20 # Remove the datapoints files and 3D files
20 subprocess.run(["rm", '-f', path_to_3D_data + f"/rna_mapped_to_Rfam/{p}.cif"]) 21 subprocess.run(["rm", '-f', path_to_3D_data + f"/rna_mapped_to_Rfam/{p}.cif"])
...@@ -25,16 +26,25 @@ for p in problems: ...@@ -25,16 +26,25 @@ for p in problems:
25 # Find more information 26 # Find more information
26 structure = p.split('_')[0] 27 structure = p.split('_')[0]
27 chain = p.split('_')[2] 28 chain = p.split('_')[2]
28 - families = [ f.split('.')[1] for f in files ] # The RFAM families this chain has been mapped onto 29 + if homology:
29 - 30 + families = [ f.split('.')[1] for f in files ] # The RFAM families this chain has been mapped onto
30 - # Delete the chain from the database, and the associated nucleotides and re_mappings, using foreign keys 31 +
31 - for fam in families: 32 + # Delete the chain from the database, and the associated nucleotides and re_mappings, using foreign keys
32 - command = ["sqlite3", "results/RNANet.db", f"PRAGMA foreign_keys=ON; delete from chain where structure_id=\"{structure}\" and chain_name=\"{chain}\" and rfam_acc=\"{fam}\";"] 33 + for fam in families:
34 + command = ["sqlite3", "results/RNANet.db", f"PRAGMA foreign_keys=ON; delete from chain where structure_id=\"{structure}\" and chain_name=\"{chain}\" and rfam_acc=\"{fam}\";"]
35 + print(' '.join(command))
36 + subprocess.run(command)
37 +
38 + command = ["python3.8", "RNAnet.py", "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0", "--extract", "--only", p]
39 + else:
40 + # Delete the chain from the database, and the associated nucleotides and re_mappings, using foreign keys
41 + command = ["sqlite3", "results/RNANet.db", f"PRAGMA foreign_keys=ON; delete from chain where structure_id=\"{structure}\" and chain_name=\"{chain}\" and rfam_acc is null;"]
33 print(' '.join(command)) 42 print(' '.join(command))
34 subprocess.run(command) 43 subprocess.run(command)
35 44
45 + command = ["python3.8", "RNAnet.py", "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0", "--no-homology", "--extract", "--only", p]
46 +
36 # Re-run RNANet 47 # Re-run RNANet
37 - command = ["python3.8", "RNAnet.py", "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0", "--extract", "--only", p]
38 print('\n',' '.join(command),'\n') 48 print('\n',' '.join(command),'\n')
39 subprocess.run(command) 49 subprocess.run(command)
40 50
......
...@@ -31,7 +31,7 @@ res_thr = 20.0 # default: all structures ...@@ -31,7 +31,7 @@ res_thr = 20.0 # default: all structures
31 LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112 31 LSU_set = ("RF00002", "RF02540", "RF02541", "RF02543", "RF02546") # From Rfam CLAN 00112
32 SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111 32 SSU_set = ("RF00177", "RF02542", "RF02545", "RF01959", "RF01960") # From Rfam CLAN 00111
33 33
34 -def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): 34 +def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4), res=4.0):
35 """ 35 """
36 Plot the joint distribution of pseudotorsion angles, in a Ramachandran-style graph. 36 Plot the joint distribution of pseudotorsion angles, in a Ramachandran-style graph.
37 See Wadley & Pyle (2007) 37 See Wadley & Pyle (2007)
...@@ -63,7 +63,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): ...@@ -63,7 +63,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
63 exit("You overestimate my capabilities !") 63 exit("You overestimate my capabilities !")
64 64
65 65
66 - if not path.isfile(f"data/wadley_kernel_{angle}_{res_thr}A.npz"): 66 + if not path.isfile(f"data/wadley_kernel_{angle}_{res}A.npz"):
67 67
68 # Get a worker number to position the progress bar 68 # Get a worker number to position the progress bar
69 global idxQueue 69 global idxQueue
...@@ -75,7 +75,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): ...@@ -75,7 +75,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
75 df = pd.read_sql(f"""SELECT {angle}, th{angle} 75 df = pd.read_sql(f"""SELECT {angle}, th{angle}
76 FROM nucleotide JOIN ( 76 FROM nucleotide JOIN (
77 SELECT chain_id FROM chain JOIN structure 77 SELECT chain_id FROM chain JOIN structure
78 - WHERE structure.resolution <= {res_thr} 78 + WHERE structure.resolution <= {res}
79 ) AS c 79 ) AS c
80 WHERE puckering="C2'-endo" 80 WHERE puckering="C2'-endo"
81 AND {angle} IS NOT NULL 81 AND {angle} IS NOT NULL
...@@ -85,7 +85,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): ...@@ -85,7 +85,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
85 df = pd.read_sql(f"""SELECT {angle}, th{angle} 85 df = pd.read_sql(f"""SELECT {angle}, th{angle}
86 FROM nucleotide JOIN ( 86 FROM nucleotide JOIN (
87 SELECT chain_id FROM chain JOIN structure 87 SELECT chain_id FROM chain JOIN structure
88 - WHERE structure.resolution <= {res_thr} 88 + WHERE structure.resolution <= {res}
89 ) AS c 89 ) AS c
90 WHERE form = '.' 90 WHERE form = '.'
91 AND puckering="C3'-endo" 91 AND puckering="C3'-endo"
...@@ -111,14 +111,14 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): ...@@ -111,14 +111,14 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
111 pbar.update(1) 111 pbar.update(1)
112 112
113 # Save the data to an archive for later use without the need to recompute 113 # Save the data to an archive for later use without the need to recompute
114 - np.savez(f"data/wadley_kernel_{angle}.npz", 114 + np.savez(f"data/wadley_kernel_{angle}_{res}A.npz",
115 c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas, 115 c3_endo_e=c3_endo_etas, c3_endo_t=c3_endo_thetas,
116 c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas, 116 c2_endo_e=c2_endo_etas, c2_endo_t=c2_endo_thetas,
117 kernel_c3=f_c3, kernel_c2=f_c2) 117 kernel_c3=f_c3, kernel_c2=f_c2)
118 pbar.close() 118 pbar.close()
119 idxQueue.put(thr_idx) 119 idxQueue.put(thr_idx)
120 else: 120 else:
121 - f = np.load(f"data/wadley_kernel_{angle}.npz") 121 + f = np.load(f"data/wadley_kernel_{angle}_{res}A.npz")
122 c2_endo_etas = f["c2_endo_e"] 122 c2_endo_etas = f["c2_endo_e"]
123 c3_endo_etas = f["c3_endo_e"] 123 c3_endo_etas = f["c3_endo_e"]
124 c2_endo_thetas = f["c2_endo_t"] 124 c2_endo_thetas = f["c2_endo_t"]
...@@ -157,7 +157,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): ...@@ -157,7 +157,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
157 ax.bar3d(xpos.ravel(), ypos.ravel(), 0.0, 0.09, 0.09, hist_cut.ravel(), color=color_values, zorder="max") 157 ax.bar3d(xpos.ravel(), ypos.ravel(), 0.0, 0.09, 0.09, hist_cut.ravel(), color=color_values, zorder="max")
158 ax.set_xlabel(xlabel) 158 ax.set_xlabel(xlabel)
159 ax.set_ylabel(ylabel) 159 ax.set_ylabel(ylabel)
160 - fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}_{res_thr}A.png") 160 + fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}_{res}A.png")
161 if show: 161 if show:
162 fig.show() 162 fig.show()
163 plt.close() 163 plt.close()
...@@ -168,7 +168,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): ...@@ -168,7 +168,7 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
168 ax.plot_surface(xx, yy, f_cut, cmap=cm.get_cmap("coolwarm"), linewidth=0, antialiased=True) 168 ax.plot_surface(xx, yy, f_cut, cmap=cm.get_cmap("coolwarm"), linewidth=0, antialiased=True)
169 ax.set_xlabel(xlabel) 169 ax.set_xlabel(xlabel)
170 ax.set_ylabel(ylabel) 170 ax.set_ylabel(ylabel)
171 - fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}_{res_thr}A.png") 171 + fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}_{res}A.png")
172 if show: 172 if show:
173 fig.show() 173 fig.show()
174 plt.close() 174 plt.close()
...@@ -178,10 +178,9 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)): ...@@ -178,10 +178,9 @@ def reproduce_wadley_results(carbon=4, show=False, sd_range=(1,4)):
178 ax = fig.gca() 178 ax = fig.gca()
179 ax.scatter(x, y, s=1, alpha=0.1) 179 ax.scatter(x, y, s=1, alpha=0.1)
180 ax.contourf(xx, yy, f_cut, alpha=0.5, cmap=cm.get_cmap("coolwarm"), levels=levels, extend="max") 180 ax.contourf(xx, yy, f_cut, alpha=0.5, cmap=cm.get_cmap("coolwarm"), levels=levels, extend="max")
181 -
182 ax.set_xlabel(xlabel) 181 ax.set_xlabel(xlabel)
183 ax.set_ylabel(ylabel) 182 ax.set_ylabel(ylabel)
184 - fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}_{res_thr}A.png") 183 + fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}_{res}A.png")
185 if show: 184 if show:
186 fig.show() 185 fig.show()
187 plt.close() 186 plt.close()
...@@ -231,7 +230,13 @@ def stats_len(): ...@@ -231,7 +230,13 @@ def stats_len():
231 230
232 # Get the lengths of chains 231 # Get the lengths of chains
233 with sqlite3.connect("results/RNANet.db") as conn: 232 with sqlite3.connect("results/RNANet.db") as conn:
234 - l = [ x[0] for x in sql_ask_database(conn, f"SELECT COUNT(index_chain) FROM (SELECT chain_id FROM chain JOIN structure ON chain.structure_id = structure.pdb_id WHERE rfam_acc='{f}' AND resolution <= {res_thr}) NATURAL JOIN nucleotide GROUP BY chain_id;", warn_every=0) ] 233 + l = [ x[0] for x in sql_ask_database(conn, f"""SELECT COUNT(index_chain)
234 + FROM (
235 + SELECT chain_id
236 + FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
237 + WHERE rfam_acc='{f}' AND resolution <= {res_thr}
238 + ) NATURAL JOIN nucleotide
239 + GROUP BY chain_id;""", warn_every=0) ]
235 lengths.append(l) # list of chain lengths from the family 240 lengths.append(l) # list of chain lengths from the family
236 241
237 # notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths") 242 # notify(f"[{i+1}/{len(fam_list)}] Computed {f} chains lengths")
...@@ -597,6 +602,172 @@ def per_chain_stats(): ...@@ -597,6 +602,172 @@ def per_chain_stats():
597 many=True, data=list(df.to_records(index=False)), warn_every=10) 602 many=True, data=list(df.to_records(index=False)), warn_every=10)
598 notify("Updated the database with per-chain base frequencies") 603 notify("Updated the database with per-chain base frequencies")
599 604
605 +def general_stats():
606 + """
607 + Number of structures as function of the resolution threshold
608 + Number of Rfam families as function of the resolution threshold
609 + """
610 + with sqlite3.connect("results/RNANet.db") as conn:
611 + df_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution
612 + FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
613 + WHERE rfam_acc IS NULL AND ISSUE=0;""", conn)
614 + df_mapped_unique = pd.read_sql(f"""SELECT distinct pdb_id, chain_name, exp_method, resolution
615 + FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
616 + WHERE rfam_acc IS NOT NULL AND ISSUE=0;""", conn)
617 + df_mapped_copies = pd.read_sql(f"""SELECT pdb_id, chain_name, inferred, rfam_acc, pdb_start, pdb_end, exp_method, resolution
618 + FROM chain JOIN structure ON chain.structure_id = structure.pdb_id
619 + WHERE rfam_acc IS NOT NULL AND ISSUE=0;""", conn)
620 + df_inferred_only_unique = pd.read_sql(f"""SELECT DISTINCT pdb_id, c.chain_name, exp_method, resolution
621 + FROM (SELECT inferred, rfam_acc, pdb_start, pdb_end, chain.structure_id, chain.chain_name, r.redundancy, r.inf_redundancy
622 + FROM chain
623 + JOIN (SELECT structure_id, chain_name, COUNT(distinct rfam_acc) AS redundancy, SUM(inferred) AS inf_redundancy
624 + FROM chain
625 + WHERE rfam_acc IS NOT NULL AND issue=0
626 + GROUP BY structure_id, chain_name
627 + ) AS r ON chain.structure_id=r.structure_id AND chain.chain_name = r.chain_name
628 + WHERE r.redundancy=r.inf_redundancy AND rfam_acc IS NOT NULL and issue=0
629 + ) AS c
630 + JOIN structure ON c.structure_id=structure.pdb_id;""", conn)
631 + print("> found", len(df_inferred_only_unique.index), "chains which are mapped only by inference using BGSU NR Lists.")
632 +
633 + ##########################################
634 + # plot N = f(resolution, exp_method)
635 + ##########################################
636 +
637 + methods = df_unique.exp_method.unique()
638 +
639 + fig, axs = plt.subplots(1+len(methods), 3, figsize=(15,5*(1+len(methods))), sharex=True)
640 + df_unique.sort_values('resolution', inplace=True, ignore_index=True)
641 + df_mapped_unique.sort_values('resolution', inplace=True, ignore_index=True)
642 + df_inferred_only_unique.sort_values('resolution', inplace=True, ignore_index=True)
643 + df_mapped_copies.sort_values('resolution', inplace=True, ignore_index=True)
644 + max_res = max(df_unique.resolution)
645 + max_structs = len(df_mapped_copies.index.tolist())
646 + colors = np.linspace(0,1,1+len(methods))
647 + plt.xticks( np.arange(0, max_res+2, 2.0).tolist(), np.arange(0, max_res+2, 2.0).tolist() )
648 +
649 + axs[0][0].grid(axis='y', ls='dotted', lw=1)
650 + axs[0][0].hist(df_unique.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 1, colors[0], 1), label='distribution')
651 + axs[0][0].hist(df_unique.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 0, colors[0], 0.5), cumulative=True, label='cumulative')
652 + axs[0][0].text(0.95*max_res, 0.95*len(df_unique.resolution), "%d " % len(df_unique.resolution),
653 + horizontalalignment='right', verticalalignment='top', fontsize=14)
654 + axs[0][0].set_ylabel("ALL", fontsize=14)
655 + axs[0][0].set_title("Number of unique RNA chains", fontsize=14)
656 + axs[0][0].set_ylim((0, max_structs * 1.05))
657 + axs[0][0].legend(loc="best", fontsize=14)
658 +
659 + axs[0][1].grid(axis='y', ls='dotted', lw=1)
660 + axs[0][1].set_yticklabels([])
661 + axs[0][1].hist(df_mapped_unique.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 1, colors[0], 1), label='distribution')
662 + axs[0][1].hist(df_mapped_unique.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 0, colors[0], 0.5), cumulative=True, label='cumulative')
663 + axs[0][1].hist(df_inferred_only_unique.resolution, bins=np.arange(0, max_res, 0.5), fc=(0.2, 0, colors[0], 0.5), cumulative=True, label='only by inference')
664 + axs[0][1].text(0.95*max_res, 0.95*len(df_mapped_unique.resolution), "%d " % len(df_mapped_unique.resolution),
665 + horizontalalignment='right', verticalalignment='top', fontsize=14)
666 + axs[0][1].set_title("Number of unique RNA chains\nmapped to $\geq 1$ family", fontsize=14)
667 + axs[0][1].set_ylim((0, max_structs * 1.05))
668 + axs[0][1].legend(loc="best", fontsize=14)
669 +
670 + axs[0][2].grid(axis='y', ls='dotted', lw=1)
671 + axs[0][2].set_yticklabels([])
672 + axs[0][2].hist(df_mapped_copies.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 1, colors[0], 1), label='distribution')
673 + axs[0][2].hist(df_mapped_copies.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 0, colors[0], 0.5), cumulative=True, label='cumulative')
674 + axs[0][2].hist(df_mapped_copies[df_mapped_copies.inferred == 1].resolution, bins=np.arange(0, max_res, 0.5), fc=(0.2, 0, colors[0], 0.5), cumulative=True, label='inferred')
675 + axs[0][2].text(0.95*max_res, 0.95*len(df_mapped_copies.resolution), "%d " % len(df_mapped_copies.resolution),
676 + horizontalalignment='right', verticalalignment='top', fontsize=14)
677 + axs[0][2].set_title("Number of RNA chains mapped to a\nfamily (with copies)", fontsize=14)
678 + axs[0][2].legend(loc="right", fontsize=14)
679 + axs[0][2].set_ylim((0, max_structs * 1.05))
680 +
681 + for i,m in enumerate(methods):
682 + df_unique_m = df_unique[df_unique.exp_method == m]
683 + df_mapped_unique_m = df_mapped_unique[df_mapped_unique.exp_method == m]
684 + df_inferred_only_unique_m = df_inferred_only_unique[df_inferred_only_unique.exp_method == m]
685 + df_mapped_copies_m = df_mapped_copies[ df_mapped_copies.exp_method == m]
686 + max_structs = len(df_mapped_copies_m.resolution.tolist())
687 + print("> found", max_structs, "structures with method", m, flush=True)
688 +
689 + axs[1+i][0].grid(axis='y', ls='dotted', lw=1)
690 + axs[1+i][0].hist(df_unique_m.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 1, colors[1+i], 1), label='distribution')
691 + axs[1+i][0].hist(df_unique_m.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 0, colors[1+i], 0.5), cumulative=True, label='cumulative')
692 + axs[1+i][0].text(0.95*max_res, 0.95*len(df_unique_m.resolution), "%d " % len(df_unique_m.resolution),
693 + horizontalalignment='right', verticalalignment='top', fontsize=14)
694 + axs[1+i][0].set_ylim((0, max_structs * 1.05))
695 + axs[1+i][0].set_ylabel(m, fontsize=14)
696 + axs[1+i][0].legend(loc="best", fontsize=14)
697 +
698 + axs[1+i][1].grid(axis='y', ls='dotted', lw=1)
699 + axs[1+i][1].set_yticklabels([])
700 + axs[1+i][1].hist(df_mapped_unique_m.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 1, colors[1+i], 1), label='distribution')
701 + axs[1+i][1].hist(df_mapped_unique_m.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 0, colors[1+i], 0.5), cumulative=True, label='cumulative')
702 + axs[1+i][1].hist(df_inferred_only_unique_m.resolution, bins=np.arange(0, max_res, 0.5), fc=(0.2, 0, colors[1+i], 0.5), cumulative=True, label='only by inference')
703 + axs[1+i][1].text(0.95*max_res, 0.95*len(df_mapped_unique_m.resolution), "%d " % len(df_mapped_unique_m.resolution),
704 + horizontalalignment='right', verticalalignment='top', fontsize=14)
705 + axs[1+i][1].set_ylim((0, max_structs * 1.05))
706 + axs[1+i][1].legend(loc="best", fontsize=14)
707 +
708 + axs[1+i][2].grid(axis='y', ls='dotted', lw=1)
709 + axs[1+i][2].set_yticklabels([])
710 + axs[1+i][2].hist(df_mapped_copies_m.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 1, colors[1+i], 1), label='distribution')
711 + axs[1+i][2].hist(df_mapped_copies_m.resolution, bins=np.arange(0, max_res, 0.5), fc=(0, 0, colors[1+i], 0.5), cumulative=True, label='cumulative')
712 + axs[1+i][2].hist(df_mapped_copies_m[df_mapped_copies_m.inferred == 1].resolution, bins=np.arange(0, max_res, 0.5), fc=(0.2, 0, colors[1+i], 0.5), cumulative=True, label='inferred')
713 + axs[1+i][2].text(0.95*max_res, 0.95*len(df_mapped_copies_m.resolution), "%d " % len(df_mapped_copies_m.resolution),
714 + horizontalalignment='right', verticalalignment='top', fontsize=14)
715 + axs[1+i][2].set_ylim((0, max_structs * 1.05))
716 + axs[1+i][2].legend(loc="right", fontsize=14)
717 +
718 + axs[-1][0].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14)
719 + axs[-1][1].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14)
720 + axs[-1][2].set_xlabel("Structure resolution\n(Angströms, lower is better)", fontsize=14)
721 +
722 + fig.suptitle("Number of RNA chains by experimental method and resolution", fontsize=16)
723 + fig.subplots_adjust(left=0.07, right=0.98, wspace=0.05,
724 + hspace=0.05, bottom=0.05, top=0.92)
725 + fig.savefig("results/figures/resolutions.png")
726 + plt.close()
727 +
728 + ##########################################
729 + # plot Nfam = f(resolution, exp_method)
730 + ##########################################
731 +
732 + df_mapped_copies['n_fam'] = [ len(df_mapped_copies.rfam_acc[:i+1].unique()) for i in range(len(df_mapped_copies.index)) ]
733 +
734 + fig, axs = plt.subplots(1, 1+len(methods), figsize=(5*(1+len(methods)), 5))
735 + max_res = max(df_mapped_copies.resolution)
736 + max_fams = max(df_mapped_copies.n_fam)
737 + colors = np.linspace(0,1,1+len(methods))
738 + plt.xticks( np.arange(0, max_res+2, 2.0).tolist(), np.arange(0, max_res+2, 2.0).tolist() )
739 +
740 + axs[0].grid(axis='y', ls='dotted', lw=1)
741 + axs[0].plot(df_mapped_copies.resolution, df_mapped_copies.n_fam)
742 + axs[0].text(0.95*max_res, 0.95*df_mapped_copies.n_fam.iloc[-1], "%d " % df_mapped_copies.n_fam.iloc[-1],
743 + horizontalalignment='right', verticalalignment='top', fontsize=14)
744 + axs[0].set_title("ALL", fontsize=14)
745 + axs[0].set_xlabel("Structure resolution (Angströms)", fontsize=14)
746 + axs[0].set_ylabel("Number of Rfam families", fontsize=14)
747 + axs[0].set_ylim((0, max_res * 1.05))
748 + axs[0].set_ylim((0, max_fams * 1.05))
749 +
750 + for i,m in enumerate(methods):
751 + df_mapped_copies_m = df_mapped_copies[ df_mapped_copies.exp_method == m].drop("n_fam", axis=1).copy()
752 + df_mapped_copies_m['n_fam'] = [ len(df_mapped_copies_m.rfam_acc[:i+1].unique()) for i in range(len(df_mapped_copies_m.index)) ]
753 + print(">", df_mapped_copies_m.n_fam.iloc[-1], "different RNA families have a 3D structure solved by", m)
754 +
755 + axs[1+i].grid(axis='y', ls='dotted', lw=1)
756 + axs[1+i].plot(df_mapped_copies_m.resolution, df_mapped_copies_m.n_fam, )
757 + axs[1+i].text(0.95*max(df_mapped_copies_m.resolution), 0.95*df_mapped_copies_m.n_fam.iloc[-1], "%d " % df_mapped_copies_m.n_fam.iloc[-1],
758 + horizontalalignment='right', verticalalignment='top', fontsize=14)
759 + axs[1+i].set_xlim((0, max_res * 1.05))
760 + axs[1+i].set_ylim((0, max_fams * 1.05))
761 + axs[1+i].set_xlabel("Structure resolution (Angströms)", fontsize=14)
762 + axs[1+i].set_title(m, fontsize=14)
763 + axs[1+i].set_yticklabels([])
764 +
765 + fig.suptitle("Number of RNA families used by experimental method and resolution", fontsize=16)
766 + fig.subplots_adjust(left=0.05, right=0.98, wspace=0.05,
767 + hspace=0.05, bottom=0.12, top=0.84)
768 + fig.savefig("results/figures/Nfamilies.png")
769 + plt.close()
770 +
600 def log_to_pbar(pbar): 771 def log_to_pbar(pbar):
601 def update(r): 772 def update(r):
602 pbar.update(1) 773 pbar.update(1)
...@@ -604,6 +775,9 @@ def log_to_pbar(pbar): ...@@ -604,6 +775,9 @@ def log_to_pbar(pbar):
604 775
605 if __name__ == "__main__": 776 if __name__ == "__main__":
606 777
778 + general_stats()
779 + exit()
780 +
607 # parse options 781 # parse options
608 try: 782 try:
609 opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "resolution=", "3d-folder=", "seq-folder=" ]) 783 opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "resolution=", "3d-folder=", "seq-folder=" ])
...@@ -665,8 +839,8 @@ if __name__ == "__main__": ...@@ -665,8 +839,8 @@ if __name__ == "__main__":
665 839
666 # Define the tasks 840 # Define the tasks
667 joblist = [] 841 joblist = []
668 - # joblist.append(Job(function=reproduce_wadley_results, args=(1,))) 842 + joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), 4.0))) # res threshold is 4.0 Angstroms by default
669 - # joblist.append(Job(function=reproduce_wadley_results, args=(4,))) 843 + joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), 4.0))) #
670 joblist.append(Job(function=stats_len)) # Computes figures 844 joblist.append(Job(function=stats_len)) # Computes figures
671 # joblist.append(Job(function=stats_freq)) # updates the database 845 # joblist.append(Job(function=stats_freq)) # updates the database
672 # for f in famlist: 846 # for f in famlist:
......