Louis BECQUEY

replaced old urllib2 functions by calls to wget

Showing 1 changed file with 28 additions and 38 deletions
...@@ -8,8 +8,6 @@ from Bio.PDB.mmcifio import MMCIFIO ...@@ -8,8 +8,6 @@ from Bio.PDB.mmcifio import MMCIFIO
8 from Bio.PDB.MMCIF2Dict import MMCIF2Dict 8 from Bio.PDB.MMCIF2Dict import MMCIF2Dict
9 from Bio.PDB.PDBExceptions import PDBConstructionWarning, BiopythonWarning 9 from Bio.PDB.PDBExceptions import PDBConstructionWarning, BiopythonWarning
10 from Bio.PDB.Dice import ChainSelector 10 from Bio.PDB.Dice import ChainSelector
11 -from Bio._py3k import urlretrieve as _urlretrieve
12 -from Bio._py3k import urlcleanup as _urlcleanup
13 from Bio.Alphabet import generic_rna 11 from Bio.Alphabet import generic_rna
14 from Bio.Seq import Seq 12 from Bio.Seq import Seq
15 from Bio.SeqRecord import SeqRecord 13 from Bio.SeqRecord import SeqRecord
...@@ -22,6 +20,21 @@ from time import sleep ...@@ -22,6 +20,21 @@ from time import sleep
22 from tqdm import tqdm 20 from tqdm import tqdm
23 from tqdm.contrib.concurrent import process_map 21 from tqdm.contrib.concurrent import process_map
24 22
23 +def trace_unhandled_exceptions(func):
24 + @wraps(func)
25 + def wrapped_func(*args, **kwargs):
26 + try:
27 + return func(*args, **kwargs)
28 + except:
29 + s = traceback.format_exc()
30 + with open(runDir + "/errors.txt", "a") as f:
31 + f.write("Exception in "+func.__name__+"\n")
32 + f.write(s)
33 + f.write("\n\n")
34 +
35 + warn('Exception in '+func.__name__, error=True)
36 + print(s)
37 + return wrapped_func
25 38
26 pd.set_option('display.max_rows', None) 39 pd.set_option('display.max_rows', None)
27 sqlite3.enable_callback_tracebacks(True) 40 sqlite3.enable_callback_tracebacks(True)
...@@ -123,7 +136,7 @@ class Chain: ...@@ -123,7 +136,7 @@ class Chain:
123 136
124 Chains accumulate information through this script, and are saved to files at the end of major steps.""" 137 Chains accumulate information through this script, and are saved to files at the end of major steps."""
125 138
126 - def __init__(self, pdb_id, pdb_model, pdb_chain_id, chain_label, rfam="", inferred=False, pdb_start=None, pdb_end=None): 139 + def __init__(self, pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, rfam="", inferred=False, pdb_start=None, pdb_end=None):
127 self.pdb_id = pdb_id # PDB ID 140 self.pdb_id = pdb_id # PDB ID
128 self.pdb_model = int(pdb_model) # model ID, starting at 1 141 self.pdb_model = int(pdb_model) # model ID, starting at 1
129 self.pdb_chain_id = pdb_chain_id # chain ID (mmCIF), multiple letters 142 self.pdb_chain_id = pdb_chain_id # chain ID (mmCIF), multiple letters
...@@ -193,6 +206,7 @@ class Chain: ...@@ -193,6 +206,7 @@ class Chain:
193 206
194 notify(status) 207 notify(status)
195 208
209 + @trace_unhandled_exceptions
196 def extract_3D_data(self): 210 def extract_3D_data(self):
197 """ Maps DSSR annotations to the chain. """ 211 """ Maps DSSR annotations to the chain. """
198 212
...@@ -749,8 +763,7 @@ class Downloader: ...@@ -749,8 +763,7 @@ class Downloader:
749 print(f"\t> Download Rfam.cm.gz from Rfam..." + " " * 37, end='', flush=True) 763 print(f"\t> Download Rfam.cm.gz from Rfam..." + " " * 37, end='', flush=True)
750 if not path.isfile(path_to_seq_data + "Rfam.cm"): 764 if not path.isfile(path_to_seq_data + "Rfam.cm"):
751 try: 765 try:
752 - _urlcleanup() 766 + subprocess.run(["wget", "ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.cm.gz", "-O", path_to_seq_data + "Rfam.cm.gz"])
753 - _urlretrieve(f'ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.cm.gz', path_to_seq_data + "Rfam.cm.gz")
754 print(f"\t{validsymb}", flush=True) 767 print(f"\t{validsymb}", flush=True)
755 print(f"\t\t> Uncompressing Rfam.cm...", end='', flush=True) 768 print(f"\t\t> Uncompressing Rfam.cm...", end='', flush=True)
756 subprocess.run(["gunzip", path_to_seq_data + "Rfam.cm.gz"], stdout=subprocess.DEVNULL) 769 subprocess.run(["gunzip", path_to_seq_data + "Rfam.cm.gz"], stdout=subprocess.DEVNULL)
...@@ -813,16 +826,14 @@ class Downloader: ...@@ -813,16 +826,14 @@ class Downloader:
813 if not path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"): 826 if not path.isfile(path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"):
814 for _ in range(10): # retry 10 times if it fails 827 for _ in range(10): # retry 10 times if it fails
815 try: 828 try:
816 - _urlcleanup() 829 + subprocess.run(["wget", f'ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/fasta_files/{rfam_acc}.fa.gz', "-O", path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz"], stdout=subprocess.DEVNULL)
817 - _urlretrieve( f'ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/fasta_files/{rfam_acc}.fa.gz',
818 - path_to_seq_data + f"rfam_sequences/fasta/{rfam_acc}.fa.gz")
819 notify(f"Downloaded {rfam_acc}.fa.gz from Rfam") 830 notify(f"Downloaded {rfam_acc}.fa.gz from Rfam")
820 return # if it worked, no need to retry 831 return # if it worked, no need to retry
821 except Exception as e: 832 except Exception as e:
822 warn(f"Error downloading {rfam_acc}.fa.gz: {e}") 833 warn(f"Error downloading {rfam_acc}.fa.gz: {e}")
823 warn("retrying in 0.2s (worker " + str(os.getpid()) + f', try {_+1}/10)') 834 warn("retrying in 0.2s (worker " + str(os.getpid()) + f', try {_+1}/10)')
824 time.sleep(0.2) 835 time.sleep(0.2)
825 - warn("Tried to reach database 100 times and failed. Aborting.", error=True) 836 + warn("Tried to reach Rfam FTP 10 times and failed. Aborting.", error=True)
826 else: 837 else:
827 notify(f"Downloaded {rfam_acc}.fa.gz from Rfam", "already there") 838 notify(f"Downloaded {rfam_acc}.fa.gz from Rfam", "already there")
828 839
...@@ -860,14 +871,11 @@ class Downloader: ...@@ -860,14 +871,11 @@ class Downloader:
860 def download_from_SILVA(self, unit): 871 def download_from_SILVA(self, unit):
861 if not path.isfile(path_to_seq_data + f"realigned/{unit}.arb"): 872 if not path.isfile(path_to_seq_data + f"realigned/{unit}.arb"):
862 try: 873 try:
863 - _urlcleanup()
864 print(f"Downloading {unit} from SILVA...", end='', flush=True) 874 print(f"Downloading {unit} from SILVA...", end='', flush=True)
865 if unit=="LSU": 875 if unit=="LSU":
866 - _urlretrieve('http://www.arb-silva.de/fileadmin/arb_web_db/release_132/ARB_files/SILVA_132_LSURef_07_12_17_opt.arb.gz', 876 + subprocess.run(["wget", "http://www.arb-silva.de/fileadmin/arb_web_db/release_132/ARB_files/SILVA_132_LSURef_07_12_17_opt.arb.gz", "-O", path_to_seq_data + "realigned/LSU.arb.gz"])
867 - path_to_seq_data + "realigned/LSU.arb.gz")
868 else: 877 else:
869 - _urlretrieve('http://www.arb-silva.de/fileadmin/silva_databases/release_138/ARB_files/SILVA_138_SSURef_05_01_20_opt.arb.gz', 878 + subprocess.run(["wget", "http://www.arb-silva.de/fileadmin/silva_databases/release_138/ARB_files/SILVA_138_SSURef_05_01_20_opt.arb.gz", "-O", path_to_seq_data + "realigned/SSU.arb.gz"])
870 - path_to_seq_data + "realigned/SSU.arb.gz")
871 except: 879 except:
872 warn(f"Error downloading the {unit} database from SILVA", error=True) 880 warn(f"Error downloading the {unit} database from SILVA", error=True)
873 exit(1) 881 exit(1)
...@@ -986,7 +994,7 @@ class Pipeline: ...@@ -986,7 +994,7 @@ class Pipeline:
986 994
987 for opt, arg in opts: 995 for opt, arg in opts:
988 996
989 - if opt in ["--from-scratch", "--update-mmcifs", "--update-homologous"] and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data]: 997 + if opt in ["--from-scratch", "--update-homologous"] and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data]:
990 print("Please provide --3d-folder and --seq-folder first, so that we know what to delete and update.") 998 print("Please provide --3d-folder and --seq-folder first, so that we know what to delete and update.")
991 exit() 999 exit()
992 1000
...@@ -1083,7 +1091,7 @@ class Pipeline: ...@@ -1083,7 +1091,7 @@ class Pipeline:
1083 elif opt == "--archive": 1091 elif opt == "--archive":
1084 self.ARCHIVE = True 1092 self.ARCHIVE = True
1085 1093
1086 - if "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data]: 1094 + if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions":
1087 print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments") 1095 print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments")
1088 print("See RNANet.py --help for more information.") 1096 print("See RNANet.py --help for more information.")
1089 exit(1) 1097 exit(1)
...@@ -1144,7 +1152,7 @@ class Pipeline: ...@@ -1144,7 +1152,7 @@ class Pipeline:
1144 chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}" 1152 chain_label = f"{pdb_id}_{str(pdb_model)}_{pdb_chain_id}"
1145 res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc IS NULL AND issue=0""") 1153 res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc IS NULL AND issue=0""")
1146 if not len(res): # the chain is NOT yet in the database, or this is a known issue 1154 if not len(res): # the chain is NOT yet in the database, or this is a known issue
1147 - self.update.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label)) 1155 + self.update.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class))
1148 conn.close() 1156 conn.close()
1149 1157
1150 if self.SELECT_ONLY is not None: 1158 if self.SELECT_ONLY is not None:
...@@ -1476,22 +1484,6 @@ def init_worker(tqdm_lock=None): ...@@ -1476,22 +1484,6 @@ def init_worker(tqdm_lock=None):
1476 if tqdm_lock is not None: 1484 if tqdm_lock is not None:
1477 tqdm.set_lock(tqdm_lock) 1485 tqdm.set_lock(tqdm_lock)
1478 1486
1479 -def trace_unhandled_exceptions(func):
1480 - @wraps(func)
1481 - def wrapped_func(*args, **kwargs):
1482 - try:
1483 - return func(*args, **kwargs)
1484 - except:
1485 - s = traceback.format_exc()
1486 - with open(runDir + "/errors.txt", "a") as f:
1487 - f.write("Exception in "+func.__name__+"\n")
1488 - f.write(s)
1489 - f.write("\n\n")
1490 -
1491 - warn('Exception in '+func.__name__, error=True)
1492 - print(s)
1493 - return wrapped_func
1494 -
1495 def warn(message, error=False): 1487 def warn(message, error=False):
1496 """Pretty-print warnings and error messages. 1488 """Pretty-print warnings and error messages.
1497 """ 1489 """
...@@ -1894,9 +1886,9 @@ def work_infer_mappings(update_only, allmappings, codelist): ...@@ -1894,9 +1886,9 @@ def work_infer_mappings(update_only, allmappings, codelist):
1894 with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: 1886 with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn:
1895 res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc='{rfam}' AND issue=0""") 1887 res = sql_ask_database(conn, f"""SELECT chain_id from chain WHERE structure_id='{pdb_id}' AND chain_name='{pdb_chain_id}' AND rfam_acc='{rfam}' AND issue=0""")
1896 if not len(res): # the chain is NOT yet in the database, or this is a known issue 1888 if not len(res): # the chain is NOT yet in the database, or this is a known issue
1897 - newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end)) 1889 + newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end))
1898 else: 1890 else:
1899 - newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end)) 1891 + newchains.append(Chain(pdb_id, pdb_model, pdb_chain_id, chain_label, eq_class, rfam=rfam, inferred=inferred, pdb_start=pdb_start, pdb_end=pdb_end))
1900 1892
1901 return newchains 1893 return newchains
1902 1894
...@@ -1907,14 +1899,12 @@ def work_mmcif(pdb_id): ...@@ -1907,14 +1899,12 @@ def work_mmcif(pdb_id):
1907 SETS table structure 1899 SETS table structure
1908 """ 1900 """
1909 1901
1910 - url = 'http://files.rcsb.org/download/%s.cif' % (pdb_id)
1911 final_filepath = path_to_3D_data+"RNAcifs/"+pdb_id+".cif" 1902 final_filepath = path_to_3D_data+"RNAcifs/"+pdb_id+".cif"
1912 1903
1913 # Attempt to download it if not present 1904 # Attempt to download it if not present
1914 try: 1905 try:
1915 if not path.isfile(final_filepath): 1906 if not path.isfile(final_filepath):
1916 - _urlcleanup() 1907 + subprocess.run(["wget", f'http://files.rcsb.org/download/{pdb_id}.cif', "-O", final_filepath], stdout=subprocess.DEVNULL)
1917 - _urlretrieve(url, final_filepath)
1918 except: 1908 except:
1919 warn(f"Unable to download {pdb_id}.cif. Ignoring it.", error=True) 1909 warn(f"Unable to download {pdb_id}.cif. Ignoring it.", error=True)
1920 return 1910 return
......