Source code for pemt.chemical_extractor.experimental_data_extraction

# -*- coding: utf-8 -*-

"""Script for extracting experimental bioassay information from ChEMBL."""

import json
import logging
import os
from collections import defaultdict
from typing import List

import pandas as pd
from chembl_webresource_client.new_client import new_client
from tqdm import tqdm

from pemt.constants import MAPPER_DIR
from pemt.utils import hgnc_to_chembl, uniprot_to_chembl

logger = logging.getLogger(__name__)

# Change logging level for packages
chembl_logger = logging.getLogger("chembl_webresource_client")
chembl_logger.setLevel(logging.WARNING)

activity = new_client.activity

tqdm.pandas()

os.makedirs(MAPPER_DIR, exist_ok=True)


def get_chemical_overview(file_path: str) -> None:
    """Method to report incomplete information in the chemical enrichment.

    :param file_path: Path of the JSON file storing the gene and chemical information.
    """
    gene_chemical_dict = json.load(open(file_path))

    counter_dict = defaultdict(int)

    for gene, counter in tqdm(gene_chemical_dict.items()):
        counter_dict[gene] += len(counter)

    counter_dict = dict(
        sorted(counter_dict.items(), key=lambda item: item[1], reverse=True)
    )

    df = pd.DataFrame(counter_dict, index=[0]).transpose()
    value_count_dict = df[0].value_counts().to_dict()
    logger.warning(
        f"{value_count_dict[0]} genes found with no relevant chemical bioassay information."
    )


def target_to_chemical(
    chemical_mapping: dict,
    protein: str,
    protein_mapping: dict = None,
    is_uniprot: bool = False,
) -> List[dict]:
    """Method to retrieve bioactive chemicals, from proteins, based on biochemical/ functional bioassays.
    A chemical is considered active if it has a pChEMBL > 6.

    :param chemical_mapping: A dictionary mapping the UNIPROT identifiers to ChEMBL identifiers
    :param protein: The protein name or identifier
    :param protein_mapping: A dictionary mapping the HGNC symbols to UNIPROT identifiers.
    By default, the value is set to None.
    :param is_uniprot: Boolean indicating whether the protein is an HGNC symbol or UNIPROT identifier.
    If using UniProt ids for protein, set the value to "True" and the protein_mapping parameter can be omitted.
    If using HGNC symbols, then the protein mapping dictionary needs to be provided.
    """
    chemicals = []

    if not is_uniprot:
        try:
            assert protein_mapping is not None
        except AssertionError:
            raise ValueError(
                f"HGNC symbol given without passing the HGNC to UNIPROT mapping file. \
            Either pass the mapping file to hgnc_mapping variable or set the parameter is_uniprot=True"
            )
        target_chembl = hgnc_to_chembl(
            uniprot_mapper=protein_mapping,
            chemical_mapper=chemical_mapping,
            hgnc_symbol=protein,
        )
    else:
        target_chembl = uniprot_to_chembl(
            chemical_mapper=chemical_mapping, uniprot_id=protein
        )

    if not target_chembl:
        return chemicals

    prot_activity_data = activity.filter(
        target_chembl_id=target_chembl,
        assay_type_iregex="(B|F)",
    ).only(["pchembl_value", "molecule_chembl_id"])

    if len(prot_activity_data) < 1:
        return chemicals

    logger.debug(f"Analysing {len(prot_activity_data)} chemicals")
    for i in prot_activity_data:
        pchembl_val = i["pchembl_value"]

        if pd.isna(pchembl_val):
            continue

        if float(pchembl_val) < 6:
            continue

        chemicals.append(
            i["molecule_chembl_id"],
        )

    return chemicals


[docs]def extract_chemicals(
    analysis_name: str,
    gene_list: list = None,
    gene_file_path: str = None,
    file_separator: str = "comma",
    is_uniprot: bool = False,
    chembl_version: str = "30",
):
    """Enrich genes with chemical data from CheMBL bioassays.

    :param analysis_name: The name of the analysis you want to run. This name would be used to save the resultant file
    :param gene_list: The list of gene you want to extract chemicals for.
    :param gene_file_path: The path of the gene file
    :param file_separator: The separator used within the file. This can be 'comma', 'tab', or 'semicolon'.  By default,
    the file separator is set to csv.
    :param is_uniprot: A boolean value indicating whether the given gene list or file containing uniprot ids or HGNC
    symbols. By default, the value is set to False indicating that a "symbol" column is present with the respective
    HGNC symbols. If set to True, the file with "uniprot" column is expected.
    """

    # Load chembl target mapper files
    chembl_mapper = pd.read_csv(
        "https://raw.githubusercontent.com/Fraunhofer-ITMP/PEMT/main/data/mapper/chembl_uniprot_mapping.txt",
        dtype=str,
        skiprows=1,
        sep="\t",
        names=["uniprot", "chembl_id", "name", "type"],
    )
    chembl_mapper = chembl_mapper[["uniprot", "chembl_id"]]
    chembl_mapper.set_index("uniprot", inplace=True)
    chembl_mapper = chembl_mapper.to_dict()["chembl_id"]

    hgnc_mapper = pd.read_csv(
        "https://raw.githubusercontent.com/Fraunhofer-ITMP/PEMT/main/data/mapper/hgnc_mapper.tsv",
        sep="\t",
        index_col="Approved symbol",
    ).to_dict()["UniProt ID(supplied by UniProt)"]

    # Loop to get and store the genes-chemical information from ChEMBL
    if os.path.exists(f"{MAPPER_DIR}/{analysis_name}_gene_to_chemicals.json"):
        gene_chemical_dict = json.load(
            open(f"{MAPPER_DIR}/{analysis_name}_gene_to_chemicals.json")
        )
    else:
        gene_chemical_dict = defaultdict()

    new_count = 0

    # Extract the gene
    if file_separator == "comma":
        _separator = ","
    elif file_separator == "semicolon":
        _separator = ";"
    else:
        assert file_separator == "tab"
        _separator = "\t"

    if gene_file_path:
        df = pd.read_csv(gene_file_path, sep=_separator)
        if is_uniprot:
            if "uniprot" not in list(df.columns):
                raise ValueError(
                    f'Please rename columns to : "uniprot" in case of uniprot id or "symbol" in case of HGNC symbols'
                )
        elif not is_uniprot:
            if "symbol" not in list(df.columns):
                raise ValueError(
                    f'Please rename columns to : "uniprot" in case of uniprot id or "symbol" in case of HGNC symbols'
                )

    if gene_file_path and _separator and is_uniprot:
        proteins = set(pd.read_csv(gene_file_path, sep=_separator)["uniprot"].tolist())
    elif gene_file_path and _separator and not is_uniprot:
        proteins = set(pd.read_csv(gene_file_path, sep=_separator)["symbol"].tolist())
    else:
        proteins = gene_list

    # Loop to get chemicals related to target
    for identifier in tqdm(proteins, desc="Extracting chemicals for targets"):
        if identifier in gene_chemical_dict:
            continue

        new_count += 1
        # Extract chemical-target data from ChEMBL
        chemical_list = target_to_chemical(
            protein=identifier,
            protein_mapping=hgnc_mapper,
            chemical_mapping=chembl_mapper,
            is_uniprot=is_uniprot,
        )
        gene_chemical_dict[identifier] = chemical_list

        if new_count == 50:
            with open(f"{MAPPER_DIR}/{analysis_name}_gene_to_chemicals.json", "w") as f:
                json.dump(gene_chemical_dict, f, ensure_ascii=False, indent=2)
            new_count = 0

    # Save dict for re-use
    if new_count > 0:
        with open(f"{MAPPER_DIR}/{analysis_name}_gene_to_chemicals.json", "w") as f:
            json.dump(gene_chemical_dict, f, ensure_ascii=False, indent=2)

    # Get genes with no chemical hits
    get_chemical_overview(f"{MAPPER_DIR}/{analysis_name}_gene_to_chemicals.json")

    return gene_chemical_dict