Louis BECQUEY

MutableSeq overload of AlignIO

Showing 1 changed file with 51 additions and 21 deletions
...@@ -9,7 +9,6 @@ if a[0] != "3.8": ...@@ -9,7 +9,6 @@ if a[0] != "3.8":
9 print("Please use version 3.8 or newer.") 9 print("Please use version 3.8 or newer.")
10 exit(1) 10 exit(1)
11 11
12 -import Bio
13 import Bio.PDB as pdb 12 import Bio.PDB as pdb
14 import concurrent.futures 13 import concurrent.futures
15 import getopt 14 import getopt
...@@ -37,7 +36,10 @@ from time import sleep ...@@ -37,7 +36,10 @@ from time import sleep
37 from tqdm import tqdm 36 from tqdm import tqdm
38 from setproctitle import setproctitle 37 from setproctitle import setproctitle
39 from Bio import AlignIO, SeqIO 38 from Bio import AlignIO, SeqIO
40 -from Bio.Align import AlignInfo 39 +from Bio.SeqIO.FastaIO import FastaIterator, SimpleFastaParser
40 +from Bio.Seq import MutableSeq
41 +from Bio.SeqRecord import SeqRecord
42 +from Bio.Align import MultipleSeqAlignment
41 43
42 def trace_unhandled_exceptions(func): 44 def trace_unhandled_exceptions(func):
43 @wraps(func) 45 @wraps(func)
...@@ -81,6 +83,30 @@ no_nts_set = set() ...@@ -81,6 +83,30 @@ no_nts_set = set()
81 weird_mappings = set() 83 weird_mappings = set()
82 84
83 85
class MutableFastaIterator(FastaIterator):
    """
    Same as Biopython's FastaIterator, but yields SeqRecord objects whose .seq is a
    Bio.Seq.MutableSeq instead of an immutable Bio.Seq.Seq, so that individual
    alignment positions can be edited in place after parsing.
    """

    def iterate(self, handle):
        """Parse the file and generate SeqRecord objects (with MutableSeq sequences).

        handle -- an open text handle over FASTA-formatted data.
        """
        # NOTE(review): title2ids is inherited from the parent FastaIterator —
        # confirm the attribute still exists in the installed Biopython version.
        title2ids = self.title2ids
        if title2ids:
            for title, sequence in SimpleFastaParser(handle):
                # Renamed from `id` to avoid shadowing the builtin id().
                seq_id, name, descr = title2ids(title)
                yield SeqRecord(MutableSeq(sequence), id=seq_id, name=name, description=descr)
        else:
            for title, sequence in SimpleFastaParser(handle):
                try:
                    # Record id = first whitespace-separated word of the title line.
                    first_word = title.split(None, 1)[0]
                except IndexError:
                    assert not title, repr(title)
                    # Should we use SeqRecord default for no ID?
                    first_word = ""
                yield SeqRecord(
                    MutableSeq(sequence), id=first_word, name=first_word, description=title,
                )
109 +
84 class SelectivePortionSelector(object): 110 class SelectivePortionSelector(object):
85 """Class passed to MMCIFIO to select some chain portions in an MMCIF file. 111 """Class passed to MMCIFIO to select some chain portions in an MMCIF file.
86 112
...@@ -1532,13 +1558,11 @@ def read_cpu_number(): ...@@ -1532,13 +1558,11 @@ def read_cpu_number():
1532 p = subprocess.run(['grep', '-Ec', '(Intel|AMD)', '/proc/cpuinfo'], stdout=subprocess.PIPE) 1558 p = subprocess.run(['grep', '-Ec', '(Intel|AMD)', '/proc/cpuinfo'], stdout=subprocess.PIPE)
1533 return int(int(p.stdout.decode('utf-8')[:-1])/2) 1559 return int(int(p.stdout.decode('utf-8')[:-1])/2)
1534 1560
1535 -
1536 def init_worker(tqdm_lock=None): 1561 def init_worker(tqdm_lock=None):
1537 signal.signal(signal.SIGINT, signal.SIG_IGN) 1562 signal.signal(signal.SIGINT, signal.SIG_IGN)
1538 if tqdm_lock is not None: 1563 if tqdm_lock is not None:
1539 tqdm.set_lock(tqdm_lock) 1564 tqdm.set_lock(tqdm_lock)
1540 1565
1541 -
1542 def warn(message, error=False): 1566 def warn(message, error=False):
1543 """Pretty-print warnings and error messages. 1567 """Pretty-print warnings and error messages.
1544 """ 1568 """
...@@ -1557,12 +1581,32 @@ def warn(message, error=False): ...@@ -1557,12 +1581,32 @@ def warn(message, error=False):
1557 else: 1581 else:
1558 print(f"\t> \033[33mWARN: {message:64s}\033[0m\t{warnsymb}", flush=True) 1582 print(f"\t> \033[33mWARN: {message:64s}\033[0m\t{warnsymb}", flush=True)
1559 1583
1560 -
1561 def notify(message, post=''): 1584 def notify(message, post=''):
1562 if len(post): 1585 if len(post):
1563 post = '(' + post + ')' 1586 post = '(' + post + ')'
1564 print(f"\t> {message:70s}\t{validsymb}\t{post}", flush=True) 1587 print(f"\t> {message:70s}\t{validsymb}\t{post}", flush=True)
1565 1588
def _mutable_SeqIO_to_alignment_iterator(handle):
    """Collect every FASTA record from *handle* into a single MultipleSeqAlignment.

    Generator yielding at most one alignment (built from MutableSeq records);
    yields nothing at all when the handle contains no records.
    """
    seqs = list(MutableFastaIterator(handle))
    if not seqs:
        return
    yield MultipleSeqAlignment(seqs)
1593 +
def parse(handle):
    """Open the FASTA file named *handle* and yield MultipleSeqAlignment objects.

    Mirrors Bio.AlignIO.parse(handle, "fasta"), except that the sequences of the
    returned records are MutableSeq objects (editable in place).
    """
    with open(handle, 'r') as alignment_file:
        for alignment in _mutable_SeqIO_to_alignment_iterator(alignment_file):
            yield alignment
1597 +
def read(handle):
    """Read exactly one alignment from the FASTA file named *handle*.

    Mirrors Bio.AlignIO.read(handle, "fasta") but with MutableSeq sequences.
    Raises ValueError when the file holds zero alignments, or more than one.
    """
    alignments = parse(handle)
    missing = object()  # sentinel distinguishing "exhausted" from any real value
    first = next(alignments, missing)
    if first is missing:
        raise ValueError("No records found in handle")
    if next(alignments, missing) is not missing:
        raise ValueError("More than one record found in handle")
    return first
1566 1610
1567 def sql_define_tables(conn): 1611 def sql_define_tables(conn):
1568 conn.executescript( 1612 conn.executescript(
...@@ -1673,7 +1717,6 @@ def sql_define_tables(conn): ...@@ -1673,7 +1717,6 @@ def sql_define_tables(conn):
1673 """) 1717 """)
1674 conn.commit() 1718 conn.commit()
1675 1719
1676 -
1677 @trace_unhandled_exceptions 1720 @trace_unhandled_exceptions
1678 def sql_ask_database(conn, sql, warn_every=10): 1721 def sql_ask_database(conn, sql, warn_every=10):
1679 """ 1722 """
...@@ -1695,7 +1738,6 @@ def sql_ask_database(conn, sql, warn_every=10): ...@@ -1695,7 +1738,6 @@ def sql_ask_database(conn, sql, warn_every=10):
1695 warn("Tried to reach database 100 times and failed. Aborting.", error=True) 1738 warn("Tried to reach database 100 times and failed. Aborting.", error=True)
1696 return [] 1739 return []
1697 1740
1698 -
1699 @trace_unhandled_exceptions 1741 @trace_unhandled_exceptions
1700 def sql_execute(conn, sql, many=False, data=None, warn_every=10): 1742 def sql_execute(conn, sql, many=False, data=None, warn_every=10):
1701 for _ in range(100): # retry 100 times if it fails 1743 for _ in range(100): # retry 100 times if it fails
...@@ -1718,7 +1760,6 @@ def sql_execute(conn, sql, many=False, data=None, warn_every=10): ...@@ -1718,7 +1760,6 @@ def sql_execute(conn, sql, many=False, data=None, warn_every=10):
1718 time.sleep(0.2) 1760 time.sleep(0.2)
1719 warn("Tried to reach database 100 times and failed. Aborting.", error=True) 1761 warn("Tried to reach database 100 times and failed. Aborting.", error=True)
1720 1762
1721 -
1722 @trace_unhandled_exceptions 1763 @trace_unhandled_exceptions
1723 def execute_job(j, jobcount): 1764 def execute_job(j, jobcount):
1724 """Run a Job object. 1765 """Run a Job object.
...@@ -1780,7 +1821,6 @@ def execute_job(j, jobcount): ...@@ -1780,7 +1821,6 @@ def execute_job(j, jobcount):
1780 t = end_time - start_time 1821 t = end_time - start_time
1781 return (t, m, r) 1822 return (t, m, r)
1782 1823
1783 -
1784 def execute_joblist(fulljoblist): 1824 def execute_joblist(fulljoblist):
1785 """ Run a list of job objects. 1825 """ Run a list of job objects.
1786 1826
...@@ -1847,7 +1887,6 @@ def execute_joblist(fulljoblist): ...@@ -1847,7 +1887,6 @@ def execute_joblist(fulljoblist):
1847 # throw back the money 1887 # throw back the money
1848 return results 1888 return results
1849 1889
1850 -
1851 @trace_unhandled_exceptions 1890 @trace_unhandled_exceptions
1852 def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> list: 1891 def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> list:
1853 """Given a list of PDB chains corresponding to an equivalence class from BGSU's NR list, 1892 """Given a list of PDB chains corresponding to an equivalence class from BGSU's NR list,
...@@ -1999,7 +2038,6 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li ...@@ -1999,7 +2038,6 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li
1999 2038
2000 return newchains 2039 return newchains
2001 2040
2002 -
2003 @trace_unhandled_exceptions 2041 @trace_unhandled_exceptions
2004 def work_mmcif(pdb_id): 2042 def work_mmcif(pdb_id):
2005 """ Look for a CIF file (with all chains) from RCSB 2043 """ Look for a CIF file (with all chains) from RCSB
...@@ -2078,7 +2116,6 @@ def work_mmcif(pdb_id): ...@@ -2078,7 +2116,6 @@ def work_mmcif(pdb_id):
2078 2116
2079 return 0 2117 return 0
2080 2118
2081 -
2082 @trace_unhandled_exceptions 2119 @trace_unhandled_exceptions
2083 def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): 2120 def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True):
2084 """Reads information from JSON and save it to database. 2121 """Reads information from JSON and save it to database.
...@@ -2115,7 +2152,6 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): ...@@ -2115,7 +2152,6 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True):
2115 2152
2116 return c 2153 return c
2117 2154
2118 -
2119 @trace_unhandled_exceptions 2155 @trace_unhandled_exceptions
2120 def work_prepare_sequences(dl, rfam_acc, chains): 2156 def work_prepare_sequences(dl, rfam_acc, chains):
2121 """Prepares FASTA files of homologous sequences to realign with cmalign or SINA. 2157 """Prepares FASTA files of homologous sequences to realign with cmalign or SINA.
...@@ -2204,7 +2240,6 @@ def work_prepare_sequences(dl, rfam_acc, chains): ...@@ -2204,7 +2240,6 @@ def work_prepare_sequences(dl, rfam_acc, chains):
2204 # print some stats 2240 # print some stats
2205 notify(status) 2241 notify(status)
2206 2242
2207 -
2208 @trace_unhandled_exceptions 2243 @trace_unhandled_exceptions
2209 def work_realign(rfam_acc): 2244 def work_realign(rfam_acc):
2210 """ Runs multiple sequence alignements by RNA family. 2245 """ Runs multiple sequence alignements by RNA family.
...@@ -2315,7 +2350,6 @@ def work_realign(rfam_acc): ...@@ -2315,7 +2350,6 @@ def work_realign(rfam_acc):
2315 with open(runDir + "/errors.txt", "a") as er: 2350 with open(runDir + "/errors.txt", "a") as er:
2316 er.write(f"Failed to realign {rfam_acc} (killed)") 2351 er.write(f"Failed to realign {rfam_acc} (killed)")
2317 2352
2318 -
2319 @trace_unhandled_exceptions 2353 @trace_unhandled_exceptions
2320 def work_pssm_remap(f): 2354 def work_pssm_remap(f):
2321 """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family. 2355 """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family.
...@@ -2330,10 +2364,6 @@ def work_pssm_remap(f): ...@@ -2330,10 +2364,6 @@ def work_pssm_remap(f):
2330 global idxQueue 2364 global idxQueue
2331 thr_idx = idxQueue.get() 2365 thr_idx = idxQueue.get()
2332 2366
2333 - # get the chains of this family in the update
2334 - list_of_chains = rfam_acc_to_download[f]
2335 - chains_ids = [str(c) for c in list_of_chains]
2336 -
2337 ########################################################################################## 2367 ##########################################################################################
2338 # Compute frequencies in the alignment 2368 # Compute frequencies in the alignment
2339 ########################################################################################## 2369 ##########################################################################################
...@@ -2342,7 +2372,7 @@ def work_pssm_remap(f): ...@@ -2342,7 +2372,7 @@ def work_pssm_remap(f):
2342 2372
2343 # Open the alignment 2373 # Open the alignment
2344 try: 2374 try:
2345 - align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta") 2375 + align = read(path_to_seq_data + f"realigned/{f}++.afa") # This is our custom AlignIO overload which uses MutableSeq instead of Seq
2346 except: 2376 except:
2347 warn(f"{f}'s alignment is wrong. Recompute it and retry.", error=True) 2377 warn(f"{f}'s alignment is wrong. Recompute it and retry.", error=True)
2348 with open(runDir + "/errors.txt", "a") as errf: 2378 with open(runDir + "/errors.txt", "a") as errf:
...@@ -2458,6 +2488,7 @@ def work_pssm_remap(f): ...@@ -2458,6 +2488,7 @@ def work_pssm_remap(f):
2458 if j < ncols and s.seq[j] == '.': 2488 if j < ncols and s.seq[j] == '.':
2459 re_mappings.append((db_id, i+1, j+1)) 2489 re_mappings.append((db_id, i+1, j+1))
2460 columns_to_save.add(j+1) 2490 columns_to_save.add(j+1)
2491 + s.seq[j] = '-' # We replace the insertion gap by a real gap (thanks to MutableSeqs)
2461 i += 1 2492 i += 1
2462 j += 1 2493 j += 1
2463 continue 2494 continue
...@@ -2543,7 +2574,6 @@ def work_pssm_remap(f): ...@@ -2543,7 +2574,6 @@ def work_pssm_remap(f):
2543 idxQueue.put(thr_idx) # replace the thread index in the queue 2574 idxQueue.put(thr_idx) # replace the thread index in the queue
2544 return 0 2575 return 0
2545 2576
2546 -
2547 @trace_unhandled_exceptions 2577 @trace_unhandled_exceptions
2548 def work_save(c, homology=True): 2578 def work_save(c, homology=True):
2549 2579
......