# Source code for enkie.dbs.metanetx

"""Methods for accessing the MetaNetX namespace."""

import logging
from enum import Enum
from typing import Dict, Optional, Set, Tuple

import pandas as pd
import pkg_resources
from path import Path

from ..singleton_meta import SingletonMeta
from ..storage import get_data_path

logger = logging.getLogger(__name__)


class MetaboliteFormat(Enum):
    """Format of a metabolite query."""

    # Metabolites are referenced by identifier.
    IDENTIFIER = 0
    # Metabolites are referenced by name.
    NAME = 1


class MnxFile(str, Enum):
    """The MetaNetX data files accessible in ENKIE.

    Members are also ``str`` instances, so they can be concatenated with a file
    extension or a URL prefix directly.
    """

    # Each value is the base name (without extension) of the MetaNetX file.
    CHEM_XREF = "chem_xref"
    CHEM_PROP = "chem_prop"
    REAC_XREF = "reac_xref"
    REAC_PROP = "reac_prop"


class Metanetx(metaclass=SingletonMeta):
    """Singleton class for mapping metabolite and reaction identifiers using
    MetaNetX."""

    # For each MetaNetX file: the positions of the columns that are retained when
    # the file is downloaded, and the names given to those columns.
    COLUMNS = {
        MnxFile.CHEM_XREF: ([0, 1, 2], ["source", "ID", "description"]),
        MnxFile.CHEM_PROP: ([0, 5, 6], ["ID", "mass", "InChI"]),
        MnxFile.REAC_XREF: ([0, 1], ["source", "ID"]),
        MnxFile.REAC_PROP: ([0, 1, 4], ["ID", "mnx_equation", "is_balanced"]),
    }

    # URL prefix to which "<file>.tsv" is appended to download a missing file.
    MNX_URL_PREFIX = "https://www.metanetx.org/cgi-bin/mnxget/mnxref/"
    # Paths (relative to the "enkie" package) of the curated mapping tables.
    REACTIONS_CURATION_FILE = "data/reaction_mappings_curation.csv"
    METABOLITES_CURATION_FILE = "data/metabolite_mappings_curation.csv"

    # Fixed (lower case) name -> MetaNetX compound identifiers mappings. Names
    # listed here are never extended with identifiers parsed from MetaNetX.
    # NOTE(review): a value of {None} presumably marks a name that must not be
    # resolved to any compound - confirm with the callers of the name map.
    _HARD_MAPPINGS = {
        "o2": {"MNXM735438"},
        "dextran": {None},
        "diacylglycerol": {None},
        "triacylglycerol": {None},
        "lecithin": {None},
        "sphingomyelin": {None},
        "chondroitin 4-sulfate": {None},
        "platelet-activating factor": {None},
        "heparin": {None},
        "heparan sulfate": {None},
        "gm2": {None},
    }

    def __init__(self):
        self._files = {}
        self._name_to_mnx_compound_map = None
        self._mnx_id_to_formula_map = None
        self._mnx_id_to_mass_map = None
        self._rxn_id_to_mnx_id_dict = None
        self._cmp_id_to_mnx_id_dict = None

    @staticmethod
    def get_data_path() -> Path:
        """Get the path to the folder containing the cached mapping files."""
        # Inside a function body the bare name is looked up in the module scope,
        # so this calls the imported ``get_data_path`` helper, not this method.
        return get_data_path()

    @property
[docs]    def chem_xref(self) -> pd.DataFrame:
        """Get the CHEM_XREF MetaNetX table."""
        return self._get_mnx_file(MnxFile.CHEM_XREF)

    @property
[docs]    def chem_prop(self) -> pd.DataFrame:
        """Get the CHEM_PROP MetaNetX table."""
        return self._get_mnx_file(MnxFile.CHEM_PROP)

    @property
[docs]    def reac_xref(self) -> pd.DataFrame:
        """Get the REAC_XREF MetaNetX table."""
        return self._get_mnx_file(MnxFile.REAC_XREF)

    @property
[docs]    def reac_prop(self) -> pd.DataFrame:
        """Get the REAC_PROP MetaNetX table."""
        return self._get_mnx_file(MnxFile.REAC_PROP)

    @property
[docs]    def rxn_id_to_mnx_id(self) -> Dict[str, str]:
        """Dictionary mapping from reaction identifiers to MetaNetX identifiers."""
        if self._rxn_id_to_mnx_id_dict is None:
            self._rxn_id_to_mnx_id_dict = dict(
                zip(self.reac_xref["source"], self.reac_xref["ID"])
            )
            self._rxn_id_to_mnx_id_dict = {
                k: v for (k, v) in self._rxn_id_to_mnx_id_dict.items() if not pd.isna(v)
            }

            # Add curated entries.
            curation_df = pd.read_csv(
                pkg_resources.resource_filename("enkie", self.REACTIONS_CURATION_FILE)
            )
            for rxn_id, mnx_id, _ in curation_df.itertuples(index=False):
                self._rxn_id_to_mnx_id_dict[rxn_id] = mnx_id
        return self._rxn_id_to_mnx_id_dict

    @property
[docs]    def cmp_id_to_mnx_id(self) -> Dict[str, str]:
        """Dictionary mapping from metabolite identifiers to MetaNetX identifiers."""
        if self._cmp_id_to_mnx_id_dict is None:
            self._cmp_id_to_mnx_id_dict = dict(
                zip(self.chem_xref["source"], self.chem_xref["ID"])
            )
            self._cmp_id_to_mnx_id_dict = {
                k: v for (k, v) in self._cmp_id_to_mnx_id_dict.items() if not pd.isna(v)
            }

            # Add curated entries.
            curation_df = pd.read_csv(
                pkg_resources.resource_filename(
                    "enkie", self.METABOLITES_CURATION_FILE
                ),
                header=None,
            )
            for cmp_id, mnx_id, _ in curation_df.itertuples(index=False):
                self._cmp_id_to_mnx_id_dict[cmp_id] = mnx_id
        return self._cmp_id_to_mnx_id_dict

    @property
[docs]    def name_to_mnx_compound_map(self) -> Dict[str, Set[str]]:
        """A dictionary mapping from (lower case) compound names to metanetx compound
        identifiers."""
        if self._name_to_mnx_compound_map is None:
            self._name_to_mnx_compound_map = self._HARD_MAPPINGS.copy()
            defined_chem_xref_df = pd.merge(
                self.chem_xref,
                self.chem_prop[~self.chem_prop["InChI"].isna()],
                on="ID",
                how="inner",
            )[["source", "ID", "description"]]

            for cid, mnx_id, names_string in defined_chem_xref_df.itertuples(
                name=None, index=False
            ):
                if cid.startswith("hmdb:") or cid.startswith("chebi:"):
                    # Skip HMDB entries as they consistently contain incorrect naming.
                    # Skip CHEBI entries as they consistently contain duplicates.
                    continue
                names = names_string.lower().split("||")
                for name in names:
                    if name not in self._HARD_MAPPINGS:
                        self._name_to_mnx_compound_map.setdefault(name, set()).add(
                            mnx_id
                        )

        return self._name_to_mnx_compound_map

    @property
[docs]    def mnx_id_to_mass(self) -> Dict[str, int]:
        """Dictionary mapping from MetaNetX metabolite identifiers to masses."""
        if self._mnx_id_to_mass_map is None:
            self._mnx_id_to_mass_map = dict(
                zip(self.chem_prop["ID"], self.chem_prop["mass"])
            )
        return self._mnx_id_to_mass_map

    @property
[docs]    def mnx_id_to_formula_map(self) -> Dict[str, str]:
        """Dictionary mapping from MetaNetX reaction identifiers to reaction
        formulas."""
        if self._mnx_id_to_formula_map is None:
            self._mnx_id_to_formula_map = dict(
                zip(self.reac_prop["ID"], self.reac_prop["mnx_equation"])
            )
        return self._mnx_id_to_formula_map

[docs]    def to_mnx_reaction(
        self,
        query_id: str,
        substrates: Set[str] = None,
        products: Set[str] = None,
        metabolite_format: MetaboliteFormat = MetaboliteFormat.IDENTIFIER,
        default: str = None,
    ) -> Tuple[str, bool]:
        """Map the given reaction identifier to a MetaNetX identifier, checking whether
        the mapping preserves directionality.

        Parameters
        ----------
        query_id : str
            The query identifier, in the form <namespace>:<identifier>.
        substrates : Set[str], optional
            The reaction substrates, by default None
        products : Set[str], optional
            The reaction products, by default None
        metabolite_format : MetaboliteFormat, optional
            Specifies the format of the substrate and products, by default
            MetaboliteFormat.IDENTIFIER
        default : str, optional
            Value to return if no mapping is found, by default None

        Returns
        -------
        Tuple[str, bool]
            The MetaNetX identifier of the reaction, and a flag denoting whether the
            MetaNetX reaction is defined in the same direction as the input reaction.
        """
        if not query_id.startswith("MNXR"):
            mnx_id = self.rxn_id_to_mnx_id.get(query_id, None)
            if mnx_id is None:
                logger.warning("Could not find MNX ID for: %s.", query_id)
                return (default, True)

            # Verify whether the mapping preserved the directionality.
            if metabolite_format == MetaboliteFormat.IDENTIFIER:
                substrate_ids = {self.to_mnx_compound(s) for s in substrates}
                product_ids = {self.to_mnx_compound(p) for p in products}
            else:
                substrate_ids = {
                    c
                    for s in substrates
                    if s.lower() in self.name_to_mnx_compound_map
                    for c in self.name_to_mnx_compound_map[s.lower()]
                }
                product_ids = {
                    c
                    for p in products
                    if p.lower() in self.name_to_mnx_compound_map
                    for c in self.name_to_mnx_compound_map[p.lower()]
                }
            is_forward = self.is_forward(
                self.mnx_id_to_formula_map[mnx_id], substrate_ids, product_ids
            )
            if is_forward is None:
                logger.warning(
                    "Unable to determine whether the %s-%s mapping preserves the "
                    "direction. Assuming it does.",
                    query_id,
                    mnx_id,
                )
                is_forward = True

            return (mnx_id, is_forward)
        else:
            return (query_id, True)

[docs]    def to_mnx_compound(self, query_id: str, default: str = None) -> Optional[str]:
        """Map the given compound identifier to a MetaNetX compound.

        Parameters
        ----------
        query_id : str
            The query identifier, in the format <namespace>:<identifier>.
        default : str, optional
            Value to return if no mapping is found, by default None

        Returns
        -------
        Optional[str]
            The MetaNetX identifier.
        """
        if not query_id.startswith("MNXM"):
            mnx_id = self.cmp_id_to_mnx_id.get(query_id, None)
            if mnx_id is None:
                logger.warning("Could not find MNX ID for: %s.", query_id)
                return default
            return mnx_id
        else:
            return query_id

[docs]    def get_compound_mass(self, mnx_id: str) -> float:
        """Get the mass of the given compound.

        Parameters
        ----------
        mnx_id : str
            The MetaNetX identifier of the query compound.

        Returns
        -------
        float
            The mass of the compound, in g/mol.
        """
        return self.mnx_id_to_mass[mnx_id]

    def _parse_mnx_half_rxn_metabolite_ids(self, rxn: str) -> Set[str]:
        return [t.strip().split(" ")[1].split("@")[0] for t in rxn.split(" + ")]

    def _get_mnx_rxn_participants(
        self, mnx_rxn_string: str
    ) -> Tuple[Set[str], Set[str]]:
        return (
            self._parse_mnx_half_rxn_metabolite_ids(hr)
            for hr in mnx_rxn_string.split(" = ")
        )

[docs]    def is_forward(
        self,
        reaction_formula: Optional[str],
        substrate_ids: Set[str],
        product_ids: Set[str],
    ) -> Optional[bool]:
        """Detects whether the reaction formula is defined in the forward direction with
        respect to the given substrates and products.

        Parameters
        ----------
        reaction_formula : Optional[str]
            The MetaNetX reaction formula.
        substrate_ids : Set[str]
            The identifiers fo teh substrates.
        product_ids : Set[str]
            The identifiers fo the products.

        Returns
        -------
        Optional[bool]
            True if the formula and the substrate/products show the same direction.
        """
        left_side, right_side = self._get_mnx_rxn_participants(reaction_formula)

        forward_count = len(substrate_ids.intersection(left_side)) + len(
            product_ids.intersection(right_side)
        )
        backward_count = len(substrate_ids.intersection(right_side)) + len(
            product_ids.intersection(left_side)
        )

        if forward_count == backward_count:
            return None
        else:
            return forward_count > backward_count

    def _get_mnx_file(self, file: MnxFile) -> pd.DataFrame:
        if file not in self._files:
            data_dir: Path = Metanetx.get_data_path()
            file_path = data_dir / (file + ".tsv")

            if not file_path.exists():
                # Download file if it is missing.
                logger.debug("MetaNetX cross-reference file is missing: %s.", file_path)
                file_url = self.MNX_URL_PREFIX + file + ".tsv"
                logger.debug("Downloading file from: %s.", file_url)

                # Download the file and only retain the columns we need.
                df = pd.read_csv(
                    file_url,
                    sep="\t",
                    comment="#",
                    usecols=self.COLUMNS[file][0],
                    names=self.COLUMNS[file][1],
                )
                df.to_csv(file_path, sep="\t", index=False)

            # Load the file from disk and add it to the files dictionary.
            self._files[file] = pd.read_csv(file_path, sep="\t")

        return self._files[file]