Showing
1 changed file
with
51 additions
and
21 deletions
... | @@ -9,7 +9,6 @@ if a[0] != "3.8": | ... | @@ -9,7 +9,6 @@ if a[0] != "3.8": |
9 | print("Please use version 3.8 or newer.") | 9 | print("Please use version 3.8 or newer.") |
10 | exit(1) | 10 | exit(1) |
11 | 11 | ||
12 | -import Bio | ||
13 | import Bio.PDB as pdb | 12 | import Bio.PDB as pdb |
14 | import concurrent.futures | 13 | import concurrent.futures |
15 | import getopt | 14 | import getopt |
... | @@ -37,7 +36,10 @@ from time import sleep | ... | @@ -37,7 +36,10 @@ from time import sleep |
37 | from tqdm import tqdm | 36 | from tqdm import tqdm |
38 | from setproctitle import setproctitle | 37 | from setproctitle import setproctitle |
39 | from Bio import AlignIO, SeqIO | 38 | from Bio import AlignIO, SeqIO |
40 | -from Bio.Align import AlignInfo | 39 | +from Bio.SeqIO.FastaIO import FastaIterator, SimpleFastaParser |
40 | +from Bio.Seq import MutableSeq | ||
41 | +from Bio.SeqRecord import SeqRecord | ||
42 | +from Bio.Align import MultipleSeqAlignment | ||
41 | 43 | ||
42 | def trace_unhandled_exceptions(func): | 44 | def trace_unhandled_exceptions(func): |
43 | @wraps(func) | 45 | @wraps(func) |
... | @@ -81,6 +83,30 @@ no_nts_set = set() | ... | @@ -81,6 +83,30 @@ no_nts_set = set() |
81 | weird_mappings = set() | 83 | weird_mappings = set() |
82 | 84 | ||
83 | 85 | ||
86 | +class MutableFastaIterator(FastaIterator): | ||
87 | + """ | ||
88 | + Same as Biopython's FastaIterator, but uses Bio.Seq.MutableSeq objects instead of Bio.Seq.Seq. | ||
89 | + """ | ||
90 | + def iterate(self, handle): | ||
91 | + """Parse the file and generate SeqRecord objects.""" | ||
92 | + title2ids = self.title2ids | ||
93 | + if title2ids: | ||
94 | + for title, sequence in SimpleFastaParser(handle): | ||
95 | + id, name, descr = title2ids(title) | ||
96 | + yield SeqRecord(MutableSeq(sequence), id=id, name=name, description=descr) | ||
97 | + else: | ||
98 | + for title, sequence in SimpleFastaParser(handle): | ||
99 | + try: | ||
100 | + first_word = title.split(None, 1)[0] | ||
101 | + except IndexError: | ||
102 | + assert not title, repr(title) | ||
103 | + # Should we use SeqRecord default for no ID? | ||
104 | + first_word = "" | ||
105 | + yield SeqRecord( | ||
106 | + MutableSeq(sequence), id=first_word, name=first_word, description=title, | ||
107 | + ) | ||
108 | + | ||
109 | + | ||
84 | class SelectivePortionSelector(object): | 110 | class SelectivePortionSelector(object): |
85 | """Class passed to MMCIFIO to select some chain portions in an MMCIF file. | 111 | """Class passed to MMCIFIO to select some chain portions in an MMCIF file. |
86 | 112 | ||
... | @@ -1532,13 +1558,11 @@ def read_cpu_number(): | ... | @@ -1532,13 +1558,11 @@ def read_cpu_number(): |
1532 | p = subprocess.run(['grep', '-Ec', '(Intel|AMD)', '/proc/cpuinfo'], stdout=subprocess.PIPE) | 1558 | p = subprocess.run(['grep', '-Ec', '(Intel|AMD)', '/proc/cpuinfo'], stdout=subprocess.PIPE) |
1533 | return int(int(p.stdout.decode('utf-8')[:-1])/2) | 1559 | return int(int(p.stdout.decode('utf-8')[:-1])/2) |
1534 | 1560 | ||
1535 | - | ||
1536 | def init_worker(tqdm_lock=None): | 1561 | def init_worker(tqdm_lock=None): |
1537 | signal.signal(signal.SIGINT, signal.SIG_IGN) | 1562 | signal.signal(signal.SIGINT, signal.SIG_IGN) |
1538 | if tqdm_lock is not None: | 1563 | if tqdm_lock is not None: |
1539 | tqdm.set_lock(tqdm_lock) | 1564 | tqdm.set_lock(tqdm_lock) |
1540 | 1565 | ||
1541 | - | ||
1542 | def warn(message, error=False): | 1566 | def warn(message, error=False): |
1543 | """Pretty-print warnings and error messages. | 1567 | """Pretty-print warnings and error messages. |
1544 | """ | 1568 | """ |
... | @@ -1557,12 +1581,32 @@ def warn(message, error=False): | ... | @@ -1557,12 +1581,32 @@ def warn(message, error=False): |
1557 | else: | 1581 | else: |
1558 | print(f"\t> \033[33mWARN: {message:64s}\033[0m\t{warnsymb}", flush=True) | 1582 | print(f"\t> \033[33mWARN: {message:64s}\033[0m\t{warnsymb}", flush=True) |
1559 | 1583 | ||
1560 | - | ||
1561 | def notify(message, post=''): | 1584 | def notify(message, post=''): |
1562 | if len(post): | 1585 | if len(post): |
1563 | post = '(' + post + ')' | 1586 | post = '(' + post + ')' |
1564 | print(f"\t> {message:70s}\t{validsymb}\t{post}", flush=True) | 1587 | print(f"\t> {message:70s}\t{validsymb}\t{post}", flush=True) |
1565 | 1588 | ||
1589 | +def _mutable_SeqIO_to_alignment_iterator(handle): | ||
1590 | + records = list(MutableFastaIterator(handle)) | ||
1591 | + if records: | ||
1592 | + yield MultipleSeqAlignment(records) | ||
1593 | + | ||
1594 | +def parse(handle): | ||
1595 | + with open(handle, 'r') as fp: | ||
1596 | + yield from _mutable_SeqIO_to_alignment_iterator(fp) | ||
1597 | + | ||
1598 | +def read(handle): | ||
1599 | + iterator = parse(handle) | ||
1600 | + try: | ||
1601 | + alignment = next(iterator) | ||
1602 | + except StopIteration: | ||
1603 | + raise ValueError("No records found in handle") from None | ||
1604 | + try: | ||
1605 | + next(iterator) | ||
1606 | + raise ValueError("More than one record found in handle") | ||
1607 | + except StopIteration: | ||
1608 | + pass | ||
1609 | + return alignment | ||
1566 | 1610 | ||
1567 | def sql_define_tables(conn): | 1611 | def sql_define_tables(conn): |
1568 | conn.executescript( | 1612 | conn.executescript( |
... | @@ -1673,7 +1717,6 @@ def sql_define_tables(conn): | ... | @@ -1673,7 +1717,6 @@ def sql_define_tables(conn): |
1673 | """) | 1717 | """) |
1674 | conn.commit() | 1718 | conn.commit() |
1675 | 1719 | ||
1676 | - | ||
1677 | @trace_unhandled_exceptions | 1720 | @trace_unhandled_exceptions |
1678 | def sql_ask_database(conn, sql, warn_every=10): | 1721 | def sql_ask_database(conn, sql, warn_every=10): |
1679 | """ | 1722 | """ |
... | @@ -1695,7 +1738,6 @@ def sql_ask_database(conn, sql, warn_every=10): | ... | @@ -1695,7 +1738,6 @@ def sql_ask_database(conn, sql, warn_every=10): |
1695 | warn("Tried to reach database 100 times and failed. Aborting.", error=True) | 1738 | warn("Tried to reach database 100 times and failed. Aborting.", error=True) |
1696 | return [] | 1739 | return [] |
1697 | 1740 | ||
1698 | - | ||
1699 | @trace_unhandled_exceptions | 1741 | @trace_unhandled_exceptions |
1700 | def sql_execute(conn, sql, many=False, data=None, warn_every=10): | 1742 | def sql_execute(conn, sql, many=False, data=None, warn_every=10): |
1701 | for _ in range(100): # retry 100 times if it fails | 1743 | for _ in range(100): # retry 100 times if it fails |
... | @@ -1718,7 +1760,6 @@ def sql_execute(conn, sql, many=False, data=None, warn_every=10): | ... | @@ -1718,7 +1760,6 @@ def sql_execute(conn, sql, many=False, data=None, warn_every=10): |
1718 | time.sleep(0.2) | 1760 | time.sleep(0.2) |
1719 | warn("Tried to reach database 100 times and failed. Aborting.", error=True) | 1761 | warn("Tried to reach database 100 times and failed. Aborting.", error=True) |
1720 | 1762 | ||
1721 | - | ||
1722 | @trace_unhandled_exceptions | 1763 | @trace_unhandled_exceptions |
1723 | def execute_job(j, jobcount): | 1764 | def execute_job(j, jobcount): |
1724 | """Run a Job object. | 1765 | """Run a Job object. |
... | @@ -1780,7 +1821,6 @@ def execute_job(j, jobcount): | ... | @@ -1780,7 +1821,6 @@ def execute_job(j, jobcount): |
1780 | t = end_time - start_time | 1821 | t = end_time - start_time |
1781 | return (t, m, r) | 1822 | return (t, m, r) |
1782 | 1823 | ||
1783 | - | ||
1784 | def execute_joblist(fulljoblist): | 1824 | def execute_joblist(fulljoblist): |
1785 | """ Run a list of job objects. | 1825 | """ Run a list of job objects. |
1786 | 1826 | ||
... | @@ -1847,7 +1887,6 @@ def execute_joblist(fulljoblist): | ... | @@ -1847,7 +1887,6 @@ def execute_joblist(fulljoblist): |
1847 | # throw back the money | 1887 | # throw back the money |
1848 | return results | 1888 | return results |
1849 | 1889 | ||
1850 | - | ||
1851 | @trace_unhandled_exceptions | 1890 | @trace_unhandled_exceptions |
1852 | def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> list: | 1891 | def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> list: |
1853 | """Given a list of PDB chains corresponding to an equivalence class from BGSU's NR list, | 1892 | """Given a list of PDB chains corresponding to an equivalence class from BGSU's NR list, |
... | @@ -1999,7 +2038,6 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li | ... | @@ -1999,7 +2038,6 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li |
1999 | 2038 | ||
2000 | return newchains | 2039 | return newchains |
2001 | 2040 | ||
2002 | - | ||
2003 | @trace_unhandled_exceptions | 2041 | @trace_unhandled_exceptions |
2004 | def work_mmcif(pdb_id): | 2042 | def work_mmcif(pdb_id): |
2005 | """ Look for a CIF file (with all chains) from RCSB | 2043 | """ Look for a CIF file (with all chains) from RCSB |
... | @@ -2078,7 +2116,6 @@ def work_mmcif(pdb_id): | ... | @@ -2078,7 +2116,6 @@ def work_mmcif(pdb_id): |
2078 | 2116 | ||
2079 | return 0 | 2117 | return 0 |
2080 | 2118 | ||
2081 | - | ||
2082 | @trace_unhandled_exceptions | 2119 | @trace_unhandled_exceptions |
2083 | def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): | 2120 | def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): |
2084 | """Reads information from JSON and save it to database. | 2121 | """Reads information from JSON and save it to database. |
... | @@ -2115,7 +2152,6 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): | ... | @@ -2115,7 +2152,6 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): |
2115 | 2152 | ||
2116 | return c | 2153 | return c |
2117 | 2154 | ||
2118 | - | ||
2119 | @trace_unhandled_exceptions | 2155 | @trace_unhandled_exceptions |
2120 | def work_prepare_sequences(dl, rfam_acc, chains): | 2156 | def work_prepare_sequences(dl, rfam_acc, chains): |
2121 | """Prepares FASTA files of homologous sequences to realign with cmalign or SINA. | 2157 | """Prepares FASTA files of homologous sequences to realign with cmalign or SINA. |
... | @@ -2204,7 +2240,6 @@ def work_prepare_sequences(dl, rfam_acc, chains): | ... | @@ -2204,7 +2240,6 @@ def work_prepare_sequences(dl, rfam_acc, chains): |
2204 | # print some stats | 2240 | # print some stats |
2205 | notify(status) | 2241 | notify(status) |
2206 | 2242 | ||
2207 | - | ||
2208 | @trace_unhandled_exceptions | 2243 | @trace_unhandled_exceptions |
2209 | def work_realign(rfam_acc): | 2244 | def work_realign(rfam_acc): |
2210 | """ Runs multiple sequence alignements by RNA family. | 2245 | """ Runs multiple sequence alignements by RNA family. |
... | @@ -2315,7 +2350,6 @@ def work_realign(rfam_acc): | ... | @@ -2315,7 +2350,6 @@ def work_realign(rfam_acc): |
2315 | with open(runDir + "/errors.txt", "a") as er: | 2350 | with open(runDir + "/errors.txt", "a") as er: |
2316 | er.write(f"Failed to realign {rfam_acc} (killed)") | 2351 | er.write(f"Failed to realign {rfam_acc} (killed)") |
2317 | 2352 | ||
2318 | - | ||
2319 | @trace_unhandled_exceptions | 2353 | @trace_unhandled_exceptions |
2320 | def work_pssm_remap(f): | 2354 | def work_pssm_remap(f): |
2321 | """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family. | 2355 | """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family. |
... | @@ -2330,10 +2364,6 @@ def work_pssm_remap(f): | ... | @@ -2330,10 +2364,6 @@ def work_pssm_remap(f): |
2330 | global idxQueue | 2364 | global idxQueue |
2331 | thr_idx = idxQueue.get() | 2365 | thr_idx = idxQueue.get() |
2332 | 2366 | ||
2333 | - # get the chains of this family in the update | ||
2334 | - list_of_chains = rfam_acc_to_download[f] | ||
2335 | - chains_ids = [str(c) for c in list_of_chains] | ||
2336 | - | ||
2337 | ########################################################################################## | 2367 | ########################################################################################## |
2338 | # Compute frequencies in the alignment | 2368 | # Compute frequencies in the alignment |
2339 | ########################################################################################## | 2369 | ########################################################################################## |
... | @@ -2342,7 +2372,7 @@ def work_pssm_remap(f): | ... | @@ -2342,7 +2372,7 @@ def work_pssm_remap(f): |
2342 | 2372 | ||
2343 | # Open the alignment | 2373 | # Open the alignment |
2344 | try: | 2374 | try: |
2345 | - align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta") | 2375 | + align = read(path_to_seq_data + f"realigned/{f}++.afa") # This is our custom AlignIO overload which uses MutableSeq instead of Seq |
2346 | except: | 2376 | except: |
2347 | warn(f"{f}'s alignment is wrong. Recompute it and retry.", error=True) | 2377 | warn(f"{f}'s alignment is wrong. Recompute it and retry.", error=True) |
2348 | with open(runDir + "/errors.txt", "a") as errf: | 2378 | with open(runDir + "/errors.txt", "a") as errf: |
... | @@ -2458,6 +2488,7 @@ def work_pssm_remap(f): | ... | @@ -2458,6 +2488,7 @@ def work_pssm_remap(f): |
2458 | if j < ncols and s.seq[j] == '.': | 2488 | if j < ncols and s.seq[j] == '.': |
2459 | re_mappings.append((db_id, i+1, j+1)) | 2489 | re_mappings.append((db_id, i+1, j+1)) |
2460 | columns_to_save.add(j+1) | 2490 | columns_to_save.add(j+1) |
2491 | + s.seq[j] = '-' # We replace the insertion gap by a real gap (thanks to MutableSeqs) | ||
2461 | i += 1 | 2492 | i += 1 |
2462 | j += 1 | 2493 | j += 1 |
2463 | continue | 2494 | continue |
... | @@ -2543,7 +2574,6 @@ def work_pssm_remap(f): | ... | @@ -2543,7 +2574,6 @@ def work_pssm_remap(f): |
2543 | idxQueue.put(thr_idx) # replace the thread index in the queue | 2574 | idxQueue.put(thr_idx) # replace the thread index in the queue |
2544 | return 0 | 2575 | return 0 |
2545 | 2576 | ||
2546 | - | ||
2547 | @trace_unhandled_exceptions | 2577 | @trace_unhandled_exceptions |
2548 | def work_save(c, homology=True): | 2578 | def work_save(c, homology=True): |
2549 | 2579 | ... | ... |
-
Please register or login to post a comment