Source code for gecco.model

"""Data layer classes storing information needed for gene cluster detection.
"""

import collections
import copy
import csv
import datetime
import enum
import functools
import itertools
import math
import operator
import os
import re
import statistics
import typing
from array import array
from collections import OrderedDict
from collections.abc import Sized
from dataclasses import dataclass, field
from typing import Dict, Iterable, List, Mapping, Optional, Sequence, BinaryIO, NamedTuple, Union, Iterator, Set
from typing import Dict, Iterable, List, Mapping, Optional, Sequence, BinaryIO, NamedTuple, Union, Iterator

import Bio
import numpy
import polars
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation, Reference
from Bio.SeqRecord import SeqRecord

from . import __version__
from .interpro import GOTerm
from ._base import Dumpable, Table, _POLARS_VERSION
from ._meta import patch_locale

if typing.TYPE_CHECKING:
    from numpy.typing import NDArray


__all__ = [
    "ClusterType",
    "Strand",
    "Domain",
    "Protein",
    "Gene",
    "Cluster",
    "FeatureTable",
    "ClusterTable"
]


# fmt: off
[docs]class ClusterType(object):
    """An immutable storage for the type of a gene cluster.
    """

[docs]    def __init__(self, *names: str) -> None:
        """Create a new product type from one or more base types.

        Example:
            >>> t1 = ClusterType()                    # unknown type
            >>> t2 = ClusterType("Polyketide")        # single type
            >>> t3 = ClusterType("Polyketide", "NRP") # multiple types

        """
        self.names = frozenset(names)

    def __repr__(self) -> str:  # noqa: D105
        return "ClusterType({})".format(", ".join(map(repr, sorted(self.names))))

    def __str__(self) -> str: # noqa: D105
        return "Unknown" if not self else ";".join(sorted(self.names))

    def __hash__(self) -> int:  # noqa: D105
        return hash(self.names)

    def __eq__(self, other: object) -> bool:  # noqa: D105
        if not isinstance(other, ClusterType):
            return NotImplemented
        return self.names == other.names

    def __bool__(self) -> bool:  # noqa: D105
        return len(self.names) != 0

[docs]    def unpack(self) -> List["ClusterType"]:
        """Unpack a composite `ClusterType` into a list of individual types.

        Example:
            >>> ty = ClusterType("Polyketide", "Saccharide")
            >>> ty.unpack()
            [ClusterType('Polyketide'), ClusterType('Saccharide')]

        """
        return [ ClusterType(x) for x in sorted(self.names) ]


[docs]class Strand(enum.IntEnum):
    """A flag to declare on which DNA strand a gene is located.
    """

    Coding = 1
    Reverse = -1

    @property
    def sign(self) -> str:
        """`str`: The strand as a single sign (``+`` or ``-``).
        """
        return "+" if self is Strand.Coding else "-"


[docs]@dataclass(frozen=True)
class Domain:
    """A conserved region within a protein.

    Attributes:
        name (`str`): The accession of the protein domain in the source HMM.
        start (`int`): The start coordinate of the domain within the protein
            sequence (first amino-acid at 1).
        end (`int`): The end coordinate of the domain within the protein
            sequence (inclusive).
        hmm (`str`): The name of the HMM library this domain belongs to
            (e.g. ``Pfam``, ``Panther``).
        i_evalue (`float`): The independent e-value reported by ``hmmsearch``
            that measures how reliable the domain annotation is.
        pvalue (`float`): The p-value reported by ``hmmsearch`` that measure
            how likely the domain score is.
        probability (`float`, optional): The probability that this domain
            is part of a gene cluster, or `None` if no prediction has been 
            made yet.
        cluster_weight (`float`, optional): The weight for this domain, 
            measuring its importance as infered from the training clusters
            by the CRF model.
        go_terms (`list` of `GOTerm`): The Gene Ontology terms
            for this particular domain.
        go_functions (`list` of `GOTerm`): The Gene Ontology term families for 
            this particular domain. Term families are extracted by taking the 
            highest superclasses (excluding the root) of each Gene Ontology 
            term in the ``molecular_function`` namespace associated with this 
            domain.
        qualifiers (`dict`): A dictionary of feature qualifiers that
            is added to the `~Bio.SeqFeature.SeqFeature` built from this
            `Domain`.

    """

    name: str
    start: int
    end: int
    hmm: str
    i_evalue: float
    pvalue: float
    probability: Optional[float] = None
    cluster_weight: Optional[float] = None
    go_terms: List[GOTerm] = field(default_factory=list)
    go_functions: List[GOTerm] = field(default_factory=list)
    qualifiers: Dict[str, List[str]] = field(default_factory=dict)

[docs]    def with_probability(self, probability: Optional[float]) -> "Domain":
        """Copy the current domain and assign it a cluster probability.
        """
        return Domain(
            self.name, self.start, self.end, self.hmm, self.i_evalue, self.pvalue,
            probability,
            self.cluster_weight,
            self.go_terms,
            self.go_functions,
            self.qualifiers.copy()
        )

[docs]    def with_cluster_weight(self, cluster_weight: Optional[float]) -> "Domain":
        """Copy the current domain and assign it a cluster weight.
        """
        return Domain(
            self.name, self.start, self.end, self.hmm, self.i_evalue, self.pvalue,
            self.probability,
            cluster_weight,
            self.go_terms,
            self.go_functions,
            self.qualifiers.copy()
        )

[docs]    def to_seq_feature(self, protein_coordinates: bool = False) -> SeqFeature:
        """Convert the domain to a single feature.

        Arguments:
            protein_coordinates (`bool`): Set to `True` for the feature
                coordinates to be given in amino-acids, or to `False` in
                nucleotides.

        """
        stride = 1 if protein_coordinates else 3
        loc = FeatureLocation((self.start-1)*stride, self.end*stride)
        qualifiers = dict(self.qualifiers)
        qualifiers.setdefault("standard_name", [self.name])
        for go_term in self.go_terms:
            qualifiers.setdefault("db_xref", []).append(go_term.accession)
        return SeqFeature(location=loc, type="misc_feature", qualifiers=qualifiers)


[docs]@dataclass(frozen=True)
class Protein:
    """A sequence of amino-acids translated from a gene.

    Attributes:
        id (`str`): The identifier of the protein.
        seq (`~Bio.Seq.Seq`): The sequence of amino-acids of this protein.
        domains (`list` of `~gecco.model.Domain`): A list of domains found
            in the protein sequence.

    """

    id: str
    seq: Seq
    domains: List[Domain] = field(default_factory=list)

[docs]    def to_seq_record(self) -> SeqRecord:
        """Convert the protein to a single record.
        """
        # FIXME: add domains
        record = SeqRecord(self.seq, id=self.id, name=self.id)

        biopython_version = tuple(map(int, Bio.__version__.split(".")))
        if biopython_version < (1, 77):
            from Bio import Alphabet

            record.seq.alphabet = Alphabet.generic_protein

        return record

[docs]    def with_seq(self, seq: Seq) -> "Protein":
        """Copy the current protein and assign it a new sequence.
        """
        return Protein(self.id, seq, copy.deepcopy(self.domains))

[docs]    def with_domains(self, domains: Iterable[Domain]) -> "Protein":
        """Copy the current protein and assign it new domains.
        """
        return Protein(self.id, self.seq, list(domains))


[docs]@dataclass(frozen=True)
class Gene:
    """A nucleotide sequence coding a protein.

    Attributes:
        source (`~Bio.SeqRecord.SeqRecord`): The DNA sequence this gene was
            found in, as a Biopython record.
        start (`int`): The index of the leftmost nucleotide of the gene within
            the source sequence, independent of the strandedness.
        end (`int`): The index of the rightmost nucleotide of the gene within
            the source sequence.
        strand (`~gecco.model.Strand`): The strand where the gene is located.
        protein (`~gecco.model.Protein`): The protein translated from this
            gene.
        qualifiers (`dict`, optional): A dictionary of feature qualifiers that
            is added to the `~Bio.SeqFeature.SeqFeature` built from this
            `Gene`.

    """

    source: SeqRecord
    start: int
    end: int
    strand: Strand
    protein: Protein
    qualifiers: Dict[str, List[str]] = field(default_factory=dict)
    _probability: Optional[float] = field(default_factory=lambda: None)

    @property
    def id(self) -> str:
        """`str`: The identifier of the gene (same as the protein identifier).
        """
        return self.protein.id

    @property
    def average_probability(self) -> Optional[float]:
        """`float`: The average of domain probabilities of being in a cluster.
        """
        if self._probability is not None:
            return self._probability
        p = [d.probability for d in self.protein.domains if d.probability is not None]
        return statistics.mean(p) if p else None

    @property
    def maximum_probability(self) -> Optional[float]:
        """`float`: The highest of domain probabilities of being in a cluster.
        """
        if self._probability is not None:
            return self._probability
        p = [d.probability for d in self.protein.domains if d.probability is not None]
        return max(p) if p else None

    # ---

    # NB(@althonos): Color scheme taken from MIBiG.
    #                This is sorted by priority!
    _FUNCTION_PALETTE = {
        # transporter: blue
        "transporter activity": (0x64, 0x95, 0xed),
        "cargo receptor activity": (0x64, 0x95, 0xed),
        "molecular carrier activity": (0x64, 0x95, 0xed),
        # regulatory: green
        "translation regulator activity": (0x2e, 0x8b, 0x56),
        "molecular function regulator activity": (0x2e, 0x8b, 0x56),
        "transcription regulator activity": (0x2e, 0x8b, 0x56),
        "regulation of molecular function": (0x2e, 0x8b, 0x56),
        "general transcription initiation factor activity": (0x2e, 0x8b, 0x56),
        # core biosynthetic: red
        "toxin activity": (0x81, 0x0e, 0x15),
        "catalytic activity": (0x81, 0x0e, 0x15),
        # additional biosynthetic: pink
        "biosynthetic activity": (0xf1, 0x6d, 0x75),
        # non-biosynthetic: olive green
        "non-biosynthetic activity": (0xbd, 0xb7, 0x6b),
        # unknown: grey
        "unknown": (0x80, 0x80, 0x80),
    }

[docs]    def to_seq_feature(self, color: bool = True) -> SeqFeature:
        """Convert the gene to a single feature.
        """
        # NB(@althonos): we use inclusive 1-based ranges in the data model
        # but Biopython expects 0-based ranges with exclusive ends
        loc = FeatureLocation(start=self.start, end=self.end+1, strand=int(self.strand))
        qualifiers = dict(self.qualifiers)
        qualifiers.setdefault("locus_tag", [self.protein.id])
        qualifiers.setdefault("translation", [str(self.protein.seq)])

        # NB(@althonos): Attempt to assign a function for the gene based on the
        #                domain content.
        functions = self.functions()
        qualifiers.setdefault("function", sorted(functions))
        if color:
            for k, hexcolor in self._FUNCTION_PALETTE.items():
                if k in functions:
                    break
            else:
                hexcolor = self._FUNCTION_PALETTE["unknown"]
            # EasyFig qualifier
            qualifiers.setdefault("colour", [" ".join(str(x) for x in hexcolor)])
            # SnapGene qualifiers
            qualifiers.setdefault("ApEinfo_fwdcolor", ["#{:02x}{:02x}{:02x}".format(*hexcolor)])
            qualifiers.setdefault("ApEinfo_revcolor", ["#{:02x}{:02x}{:02x}".format(*hexcolor)])

        return SeqFeature(location=loc, type="CDS", qualifiers=qualifiers)

[docs]    def with_protein(self, protein: "Protein") -> "Gene":
        """Copy the current gene and assign it a different protein.
        """
        return Gene(
            self.source, self.start, self.end, self.strand, protein,
            self.qualifiers.copy(),
            _probability=self._probability,
        )

[docs]    def with_source(self, source: "SeqRecord") -> "Gene":
        """Copy the current gene and assign it a different source.
        """
        return Gene(
            source, self.start, self.end, self.strand, self.protein,
            self.qualifiers.copy(),
            _probability=self._probability,
        )

[docs]    def with_probability(self, probability: float) -> "Gene":
        """Copy the current gene and assign it a different probability.
        """
        return Gene(
            self.source, self.start, self.end, self.strand,
            self.protein.with_domains([
                domain.with_probability(probability)
                for domain in self.protein.domains
            ]),
            self.qualifiers.copy(),
            _probability=probability
        )

[docs]    def functions(self) -> Set[str]:
        """Predict the function(s) of the gene from its domain annotations.
        """
        functions = {
            term.name
            for domain in self.protein.domains
            for term in domain.go_functions
        }
        if not functions:
            functions.add("unknown")
        return functions


[docs]@dataclass
class Cluster:
    """A sequence of contiguous genes.

    Attributes:
        id (`str`): The identifier of the gene cluster.
        genes (`list` of `~gecco.model.Gene`): A list of the genes belonging
            to this gene cluster.
        types (`gecco.model.ClusterType`): The putative types of this gene 
            cluster, according to similarity in domain composition with 
            curated clusters.
        types_probabilities (`list` of `float`): The probability with which
            each cluster type was identified (same dimension as the ``types``
            attribute).

    """

    id: str
    genes: List[Gene]
    type: Optional[ClusterType]
    type_probabilities: Dict[str, float]

    def __init__(
        self,
        id: str,
        genes: Optional[List[Gene]] = None,
        type: Optional[ClusterType] = None,
        type_probabilities: Optional[Dict[str, float]] = None,
    ):  # noqa: D107
        self.id = id
        self.genes = genes or list()
        self.type = type
        self.type_probabilities = type_probabilities or dict()

    @property
    def source(self) -> SeqRecord:  # type: ignore
        """`~Bio.SeqRecord.SeqRecord`: The sequence this cluster was found in.
        """
        return self.genes[0].source

    @property
    def start(self) -> int:
        """`int`: The start of this cluster in the source sequence.
        """
        return min(gene.start for gene in self.genes)

    @property
    def end(self) -> int:
        """`int`: The end of this cluster in the source sequence.
        """
        return max(gene.end for gene in self.genes)

    @property
    def average_probability(self) -> Optional[float]:
        """`float`: The average of proteins probability of being biosynthetic.
        """
        p = [g.average_probability for g in self.genes if g.average_probability is not None]
        return  statistics.mean(p) if p else None

    @property
    def maximum_probability(self) -> Optional[float]:
        """`float`: The highest of proteins probability of being biosynthetic.
        """
        p = [g.maximum_probability for g in self.genes if g.maximum_probability is not None]
        return max(p) if p else None

    # ---

[docs]    def domain_composition(
        self,
        all_possible: Optional[Sequence[str]] = None,
        normalize: bool = True,
        minlog_weights: bool = False,
        pvalue: bool = True,
    ) -> "NDArray[numpy.double]":
        """Compute weighted domain composition with respect to ``all_possible``.

        Arguments:
            all_possible (sequence of `str`, optional): A sequence containing
                all domain names to consider when computing domain composition
                for the cluster. If `None` given, then only domains within the
                cluster are taken into account.
            normalize (`bool`): Normalize the composition vector so that it
                sums to 1.
            minlog_weights (`bool`): Compute weight for each domain as
                :math:`-log_10(v)` (where :math:`v` is either the ``pvalue``
                or the ``i_evalue``, depending on the value of ``normalize``).
                Use :math:`1 - v` otherwise.
            pvalue (`bool`): Compute composition weights using the ``pvalue``
                of each domain, instead of the ``i_evalue``.

        Returns:
            `~numpy.ndarray`: A numerical array containing the relative domain
            composition of the gene cluster.

        """
        domains = [d for gene in self.genes for d in gene.protein.domains]
        names = numpy.array([domain.name for domain in domains])
        field = operator.attrgetter("pvalue" if pvalue else "i_evalue")
        if minlog_weights:
            weights = numpy.array([- math.log10(field(domain)) for domain in domains])
        else:
            weights = numpy.array([1 - field(domain) for domain in domains])

        unique_names = set(names)
        if all_possible is None:
            all_possible = numpy.unique(names)
        composition = numpy.zeros(len(all_possible))
        for i, dom in enumerate(all_possible):
            if dom in unique_names:
                composition[i] = numpy.sum(weights[names == dom])
        if normalize:
            return composition / (composition.sum() or 1)
        return composition

    # ---

[docs]    def to_seq_record(self) -> SeqRecord:
        """Convert the cluster to a single record.

        Annotations of the source sequence are kept intact if they don't
        overlap with the cluster boundaries. Component genes are added on the
        record as *CDS* features. Annotated protein domains are added as
        *misc_feature*.

        """
        # store time of record creation
        now = datetime.datetime.now()

        # NB(@althonos): we use inclusive 1-based ranges in the data model
        # but slicing expects 0-based ranges with exclusive ends
        cluster = self.source[self.start - 1 : self.end]
        cluster.id = cluster.name = self.id

        # copy sequence annotations
        cluster.annotations = self.source.annotations.copy()
        cluster.annotations["topology"] = "linear"
        cluster.annotations["molecule_type"] = "DNA"
        with patch_locale("C"):
            cluster.annotations['date'] = now.strftime("%d-%b-%Y").upper()

        biopython_version = tuple(map(int, Bio.__version__.split(".")))
        if biopython_version < (1, 77):
            from Bio import Alphabet

            cluster.seq.alphabet = Alphabet.generic_dna

        # add GECCO preprint as a reference
        ref = Reference()
        ref.title = "Accurate de novo identification of biosynthetic gene clusters with GECCO"
        ref.journal = "bioRxiv (2021.05.03.442509)"
        ref.comment = "doi:10.1101/2021.05.03.442509"
        ref.authors = ", ".join([
            "Laura M Carroll",
            "Martin Larralde",
            "Jonas Simon Fleck",
            "Ruby Ponnudurai",
            "Alessio Milanese",
            "Elisa Cappio Barazzone",
            "Georg Zeller"
        ])
        cluster.annotations.setdefault("references", []).append(ref)

        # add GECCO-specific annotations as a structured comment
        if self.type is not None:
            cluster_type = {
                "cluster_type": ";".join(sorted(self.type.names)) or "Unknown"
            }
            probabilities = {
                f"{key.lower()}_probability":f"{value:.3f}"
                for key, value in self.type_probabilities.items()
            }
        else:
            cluster_type = probabilities = {}

        structured_comment = cluster.annotations.setdefault("structured_comment", OrderedDict())
        structured_comment['GECCO-Data'] = {
            "version": f"GECCO v{__version__}",
            "creation_date": now.isoformat(),
            **cluster_type,
            **probabilities,
        }

        # add proteins as CDS features
        for gene in self.genes:
            # write gene as a /cds GenBank record
            cds = gene.to_seq_feature()
            cds.location += -self.start
            cluster.features.append(cds)
            # write domains as /misc_feature annotations
            for domain in gene.protein.domains:
                misc = domain.to_seq_feature(protein_coordinates=False)
                if gene.strand == Strand.Coding:
                    misc.location = FeatureLocation(
                        cds.location.start + misc.location.start,
                        cds.location.start + misc.location.end
                    )
                else:
                    misc.location = FeatureLocation(
                        cds.location.end - misc.location.end,
                        cds.location.end - misc.location.start
                    )
                cluster.features.append(misc)

        # return the complete gene cluster record
        return cluster


class _UnknownSeq(Seq):
    """An unknown sequence that uses limited memory.

    Used by `FeatureTable.to_genes` to fake the `Gene.source.seq` attribute.
    """

    def __init__(self) -> None:
        super().__init__(data="")

    @typing.overload
    def __getitem__(self, index: int) -> str:
        pass

    @typing.overload
    def __getitem__(self, index: slice) -> Seq:
        pass

    def __getitem__(self, index: Union[slice, int]) -> Union[str, Seq]:
        if isinstance(index, slice):
            return Seq("N" * ((index.stop - index.start) // (index.step or 1)) )
        return "N"


[docs]class FeatureTable(Table):
    """A table storing condensed domain annotations from different genes.
    """

    @classmethod
    def _get_columns(cls) -> List["Table.Column"]:
        return [
            Table.Column("sequence_id", polars.Utf8),
            Table.Column("protein_id", polars.Utf8),
            Table.Column("start", polars.Int64),
            Table.Column("end", polars.Int64),
            Table.Column("strand", polars.Utf8),
            Table.Column("domain", polars.Utf8),
            Table.Column("hmm", polars.Utf8),
            Table.Column("i_evalue", polars.Float64),
            Table.Column("pvalue", polars.Float64),
            Table.Column("domain_start", polars.Int64),
            Table.Column("domain_end", polars.Int64),
            Table.Column("cluster_probability", polars.Float64, default=math.nan),
        ]

[docs]    @classmethod
    def from_genes(cls, genes: Iterable[Gene]) -> "FeatureTable":
        """Create a new feature table from an iterable of genes.
        """
        columns = cls._get_columns()
        data = { column.name: [] for column in columns }
        for gene in genes:
            for domain in gene.protein.domains:
                data["sequence_id"].append(gene.source.id)
                data["protein_id"].append(gene.protein.id)
                data["start"].append(gene.start)
                data["end"].append(gene.end)
                data["strand"].append(gene.strand.sign)
                data["domain"].append(domain.name)
                data["hmm"].append(domain.hmm)
                data["i_evalue"].append(domain.i_evalue)
                data["pvalue"].append(domain.pvalue)
                data["domain_start"].append(domain.start)
                data["domain_end"].append(domain.end)
                if domain.probability is not None:
                    data["cluster_probability"].append(domain.probability)
        for name in list(data):
            if not data[name]:
                del data[name]
        return cls(polars.DataFrame(data))

[docs]    def to_genes(self) -> Iterable[Gene]:
        """Convert a feature table to actual genes.

        Since the source sequence cannot be known, a *dummy* sequence is
        built for each gene of size ``gene.end``, so that each gene can still
        be converted to a `~Bio.SeqRecord.SeqRecord` if needed.

        """
        # group rows by protein/gene ID
        protein_indices = collections.defaultdict(list)
        for i, protein_id in enumerate(self.protein_id):
            protein_indices[protein_id].append(i)
        # yield genes in order
        for protein_id in sorted(protein_indices):
            indices = protein_indices[protein_id]
            assert all(self.sequence_id[i] == self.sequence_id[indices[0]] for i in indices)
            assert all(self.protein_id[i] == self.protein_id[indices[0]] for i in indices)
            assert all(self.start[i] == self.start[indices[0]] for i in indices)
            assert all(self.end[i] == self.end[indices[0]] for i in indices)
            source = SeqRecord(id=self.sequence_id[indices[0]], seq=_UnknownSeq())
            strand = Strand.Coding if self.strand[indices[0]] == "+" else Strand.Reverse
            protein = Protein(self.protein_id[indices[0]], seq=_UnknownSeq)
            gene = Gene(source, self.start[indices[0]], self.end[indices[0]], strand, protein)
            for i in indices:
                domain = Domain(
                    self.domain[i], 
                    self.domain_start[i], 
                    self.domain_end[i],
                    self.hmm[i], 
                    self.i_evalue[i], 
                    self.pvalue[i], 
                    self.cluster_probability[i]
                )
                gene.protein.domains.append(domain)
            yield gene


[docs]class ClusterTable(Table):
    """A table storing condensed information from several clusters.
    """

    @classmethod
    def _get_columns(cls) -> List["Table.Column"]:
        return [
            Table.Column("sequence_id", polars.Utf8),
            Table.Column("cluster_id", polars.Utf8),
            Table.Column("start", polars.Int64),
            Table.Column("end", polars.Int64),
            Table.Column("average_p", polars.Float64, default=math.nan),
            Table.Column("max_p", polars.Float64, default=math.nan),
            Table.Column("type", polars.Utf8, default="Unknown"),
            # + possible type columns that are handled in `from_clusters`
            Table.Column("proteins", polars.Utf8, default=""),
            Table.Column("domains", polars.Utf8, default=""),
        ]

[docs]    @classmethod
    def from_clusters(cls, clusters: Iterable[Cluster]) -> "ClusterTable":
        """Create a new cluster table from an iterable of clusters.
        """
        data = collections.defaultdict(list)
        for cluster in clusters:
            data["sequence_id"].append(cluster.source.id)
            data["cluster_id"].append(cluster.id)
            data["start"].append(cluster.start)
            data["end"].append(cluster.end)
            if cluster.average_probability is not None:
                data["average_p"].append(cluster.average_probability)
            if cluster.maximum_probability is not None:
                data["max_p"].append(cluster.maximum_probability)
            if cluster.type is not None:
                data["type"].append(str(cluster.type))
                for type_name in sorted(cluster.type_probabilities, key=str.casefold):
                    data[f"{type_name.lower()}_probability"].append(cluster.type_probabilities[type_name])
            data["proteins"].append(";".join(
                sorted(gene.protein.id for gene in cluster.genes)
            ))
            data["domains"].append(";".join(sorted( 
                domain.name 
                for gene in cluster.genes 
                for domain in gene.protein.domains 
            )))
            # TODO: member proteins
            # TODO: member domains
        return cls(polars.DataFrame(data))

    def dump(self, fh: Union[BinaryIO, str, os.PathLike]) -> None:
        # patch `Table.dump` so that all columns are always written
        data = self.data
        for column_name in data.columns:
            if data[column_name].dtype in (polars.Float64, polars.Float64):
                data = data.with_columns(polars.col(column_name).fill_nan(None))
        if _POLARS_VERSION < (0, 16, 14):
            data.write_csv(fh, sep="\t")
        else:
            data.write_csv(fh, separator="\t")


class GeneTable(Table):
    """A table storing gene coordinates and optional cluster probabilities.
    """

    @classmethod
    def _get_columns(cls) -> List["Table.Column"]:
        return [
            Table.Column("sequence_id", polars.Utf8),
            Table.Column("protein_id", polars.Utf8),
            Table.Column("start", polars.Int64),
            Table.Column("end", polars.Int64),
            Table.Column("strand", polars.Utf8),
            Table.Column("average_p", polars.Float64, default=math.nan),
            Table.Column("max_p", polars.Float64, default=math.nan),
        ]

    @classmethod
    def from_genes(cls, genes: Iterable[Gene]) -> "GeneTable":
        """Create a new gene table from an iterable of genes.
        """
        columns = cls._get_columns()
        data = { column.name: [] for column in columns }
        for gene in genes:
            data["sequence_id"].append(gene.source.id)
            data["protein_id"].append(gene.protein.id)
            data["start"].append(gene.start)
            data["end"].append(gene.end)
            data["strand"].append(gene.strand.sign)
            if gene.average_probability is not None:
                data["average_p"].append(gene.average_probability)
            else:
                data["average_p"].append(math.nan)
            if gene.maximum_probability is not None:
                data["max_p"].append(gene.maximum_probability)
            else:
                data["max_p"].append(math.nan)
        return cls(polars.DataFrame(data))

    def to_genes(self) -> Iterable[Gene]:
        """Convert a gene table to actual genes.

        Since the source sequence cannot be known, a *dummy* sequence is
        built for each gene of size ``gene.end``, so that each gene can still
        be converted to a `~Bio.SeqRecord.SeqRecord` if needed.

        """
        # check if a probability column is available 
        has_probas = "average_p" in self.data.columns
        # yield genes in order
        for i, protein_id in enumerate(self.protein_id):
            source = SeqRecord(id=self.sequence_id[i], seq=_UnknownSeq())
            strand = Strand.Coding if self.strand[i] == "+" else Strand.Reverse
            start = self.start[i]
            end = self.end[i]
            seq = Seq("X" * ((end - start) // 3))
            protein = Protein(self.protein_id[i], seq=seq)
            probability = self.average_p[i] if has_probas else None
            gene = Gene(source, start, end, strand, protein, _probability=probability)
            yield gene