Source code for gecco.hmmer

"""Compatibility wrapper for HMMER binaries and output.
"""
import abc
import atexit
import collections
import configparser
import contextlib
import csv
import errno
import glob
import gzip
import itertools
import os
import re
import subprocess
import tempfile
import typing
from typing import Any, BinaryIO, Callable, Container, Dict, Optional, Iterable, Iterator, List, Mapping, Type, Sequence

import pyhmmer
from Bio import SeqIO
from pyhmmer.hmmer import hmmsearch

from .._meta import requires, UniversalContainer, zopen
from ..model import Gene, Domain
from ..interpro import InterPro

try:
    from importlib.resources import files, as_file
except ImportError:
    from importlib_resources import files, as_file  # type: ignore

__all__ = ["DomainAnnotator", "HMM", "PyHMMER", "embedded_hmms"]


[docs]class HMM(typing.NamedTuple): """A Hidden Markov Model library to use with `~gecco.hmmer.HMMER`. """ id: str version: str url: str path: str size: Optional[int] = None relabel_with: Optional[str] = None md5: Optional[str] = None
[docs] def relabel(self, domain: str) -> str: """Rename a domain obtained by this HMM to the right accession. This method can be used with HMM libraries that have separate HMMs for the same domain, such as Pfam. """ if self.relabel_with is None: return domain before, after = re.match("^s/(.*)/(.*)/$", self.relabel_with).groups() # type: ignore regex = re.compile(before) return regex.sub(after, domain)
[docs]class DomainAnnotator(metaclass=abc.ABCMeta): """An abstract class for annotating genes with protein domains. """
[docs] def __init__(self, hmm: HMM, cpus: Optional[int] = None, whitelist: Optional[Container[str]] = None) -> None: """Prepare a new HMMER annotation handler with the given ``hmms``. Arguments: hmm (str): The path to the file containing the HMMs. cpus (int, optional): The number of CPUs to allocate for the ``hmmsearch`` command. Give ``None`` to use the default. whitelist (container of str): If given, a container containing the accessions of the individual HMMs to annotate with. If `None` is given, annotate with the entire file. """ super().__init__() self.hmm = hmm self.cpus = cpus self.whitelist = UniversalContainer() if whitelist is None else whitelist
[docs] @abc.abstractmethod def run(self, genes: Iterable[Gene]) -> List[Gene]: """Run annotation on proteins of ``genes`` and update their domains. Arguments: genes (iterable of `~gecco.model.Gene`): An iterable that yield genes to annotate with ``self.hmm``. """ return NotImplemented
[docs]class PyHMMER(DomainAnnotator): """A domain annotator that uses `pyhmmer`. """
[docs] def run( self, genes: Iterable[Gene], progress: Optional[Callable[[pyhmmer.plan7.HMM, int], None]] = None, bit_cutoffs: Optional[str] = None, ) -> List[Gene]: # collect genes and keep them in original order gene_index = list(genes) # convert proteins to Easel sequences, namind them after # their location in the original input to ignore any duplicate # protein identifiers esl_abc = pyhmmer.easel.Alphabet.amino() esl_sqs = [ pyhmmer.easel.TextSequence( name=str(i).encode(), sequence=str(gene.protein.seq) ).digitize(esl_abc) for i, gene in enumerate(gene_index) ] with contextlib.ExitStack() as ctx: # decompress the input if needed file: BinaryIO = ctx.enter_context(zopen(self.hmm.path)) # Only retain the HMMs which are in the whitelist hmm_file = ctx.enter_context(pyhmmer.plan7.HMMFile(file)) profiles = ( hmm for hmm in hmm_file if hmm.accession is None or self.hmm.relabel(hmm.accession.decode()) in self.whitelist ) # Run search pipeline using the filtered HMMs cpus = 0 if self.cpus is None else self.cpus hmms_hits = hmmsearch( profiles, esl_sqs, cpus=cpus, callback=progress, # type: ignore Z=self.hmm.size, # type: ignore domZ=self.hmm.size, # type: ignore bit_cutoffs=bit_cutoffs, # type: ignore ) # Load InterPro metadata for the annotation interpro = InterPro.load() # Transcribe HMMER hits to GECCO model for hits in hmms_hits: for hit in hits.reported: target_index = int(hit.name) for domain in hit.domains.reported: # extract name and get InterPro metadata about hit raw_acc = domain.alignment.hmm_accession or domain.alignment.hmm_name accession = self.hmm.relabel(raw_acc.decode('utf-8')) entry = interpro.by_accession.get(accession) # extract coordinates start = domain.alignment.target_from end = domain.alignment.target_to # extract qualifiers and GO terms qualifiers: Dict[str, List[str]] = { "inference": ["protein motif"], "db_xref": ["{}:{}".format(self.hmm.id.upper(), accession)], "note": [ "e-value: {}".format(domain.i_evalue), "p-value: {}".format(domain.pvalue), ], } if entry is not None: qualifiers["function"] = [entry.name] qualifiers["db_xref"].append("InterPro:{}".format(entry.accession)) go_terms = entry.go_terms go_functions = entry.go_functions else: go_terms = [] go_functions = [] # add the domain to the protein domains of the right gene assert domain.env_from < domain.env_to assert domain.i_evalue >= 0 assert domain.pvalue >= 0 gene_index[target_index].protein.domains.append( Domain( accession, start, end, self.hmm.id, domain.i_evalue, domain.pvalue, go_terms=go_terms, go_functions=go_functions, qualifiers=qualifiers, ) ) # return the updated list of genes that was given in argument return gene_index
[docs]def embedded_hmms() -> Iterator[HMM]: """Iterate over the embedded HMMs that are shipped with GECCO. """ for filename in files(__name__).glob("*.ini"): ini_ctx = as_file(filename) ini_path = ini_ctx.__enter__() atexit.register(ini_ctx.__exit__, None, None, None) cfg = configparser.ConfigParser() cfg.read(ini_path) args: Dict[str, Any] = dict(cfg.items("hmm")) size = int(args.pop("size", 0)) hmm_ctx = as_file(filename.with_suffix(".h3m")) hmm_path = hmm_ctx.__enter__() atexit.register(hmm_ctx.__exit__, None, None, None) yield HMM(path=os.fspath(hmm_path), size=size, **args)