Source code for pemt.chemical_extractor.experimental_data_extraction

# -*- coding: utf-8 -*-

"""Script for extracting experimental bioassay information from ChEMBL."""

import json
import logging
import os
from collections import defaultdict
from typing import List

import pandas as pd
from chembl_webresource_client.new_client import new_client
from tqdm import tqdm

from pemt.constants import MAPPER_DIR
from pemt.utils import hgnc_to_chembl, uniprot_to_chembl

logger = logging.getLogger(__name__)

# Change logging level for packages
chembl_logger = logging.getLogger("chembl_webresource_client")
chembl_logger.setLevel(logging.WARNING)

activity = new_client.activity

tqdm.pandas()

os.makedirs(MAPPER_DIR, exist_ok=True)


def get_chemical_overview(file_path: str) -> None:
    """Method to report incomplete information in the chemical enrichment.

    :param file_path: Path of the JSON file storing the gene and chemical information.
    """
    gene_chemical_dict = json.load(open(file_path))

    counter_dict = defaultdict(int)

    for gene, counter in tqdm(gene_chemical_dict.items()):
        counter_dict[gene] += len(counter)

    counter_dict = dict(
        sorted(counter_dict.items(), key=lambda item: item[1], reverse=True)
    )

    df = pd.DataFrame(counter_dict, index=[0]).transpose()
    value_count_dict = df[0].value_counts().to_dict()
    logger.warning(
        f"{value_count_dict[0]} genes found with no relevant chemical bioassay information."
    )


def target_to_chemical(
    chemical_mapping: dict,
    protein: str,
    protein_mapping: dict = None,
    is_uniprot: bool = False,
) -> List[dict]:
    """Method to retrieve bioactive chemicals, from proteins, based on biochemical/ functional bioassays.
    A chemical is considered active if it has a pChEMBL > 6.

    :param chemical_mapping: A dictionary mapping the UNIPROT identifiers to ChEMBL identifiers
    :param protein: The protein name or identifier
    :param protein_mapping: A dictionary mapping the HGNC symbols to UNIPROT identifiers.
    By default, the value is set to None.
    :param is_uniprot: Boolean indicating whether the protein is an HGNC symbol or UNIPROT identifier.
    If using UniProt ids for protein, set the value to "True" and the protein_mapping parameter can be omitted.
    If using HGNC symbols, then the protein mapping dictionary needs to be provided.
    """
    chemicals = []

    if not is_uniprot:
        try:
            assert protein_mapping is not None
        except AssertionError:
            raise ValueError(
                f"HGNC symbol given without passing the HGNC to UNIPROT mapping file. \
            Either pass the mapping file to hgnc_mapping variable or set the parameter is_uniprot=True"
            )
        target_chembl = hgnc_to_chembl(
            uniprot_mapper=protein_mapping,
            chemical_mapper=chemical_mapping,
            hgnc_symbol=protein,
        )
    else:
        target_chembl = uniprot_to_chembl(
            chemical_mapper=chemical_mapping, uniprot_id=protein
        )

    if not target_chembl:
        return chemicals

    prot_activity_data = activity.filter(
        target_chembl_id=target_chembl,
        assay_type_iregex="(B|F)",
    ).only(["pchembl_value", "molecule_chembl_id"])

    if len(prot_activity_data) < 1:
        return chemicals

    logger.debug(f"Analysing {len(prot_activity_data)} chemicals")
    for i in prot_activity_data:
        pchembl_val = i["pchembl_value"]

        if pd.isna(pchembl_val):
            continue

        if float(pchembl_val) < 6:
            continue

        chemicals.append(
            i["molecule_chembl_id"],
        )

    return chemicals


[docs]def extract_chemicals( analysis_name: str, gene_list: list = None, gene_file_path: str = None, file_separator: str = "comma", is_uniprot: bool = False, chembl_version: str = "30", ): """Enrich genes with chemical data from CheMBL bioassays. :param analysis_name: The name of the analysis you want to run. This name would be used to save the resultant file :param gene_list: The list of gene you want to extract chemicals for. :param gene_file_path: The path of the gene file :param file_separator: The separator used within the file. This can be 'comma', 'tab', or 'semicolon'. By default, the file separator is set to csv. :param is_uniprot: A boolean value indicating whether the given gene list or file containing uniprot ids or HGNC symbols. By default, the value is set to False indicating that a "symbol" column is present with the respective HGNC symbols. If set to True, the file with "uniprot" column is expected. """ # Load chembl target mapper files chembl_mapper = pd.read_csv( "https://raw.githubusercontent.com/Fraunhofer-ITMP/PEMT/main/data/mapper/chembl_uniprot_mapping.txt", dtype=str, skiprows=1, sep="\t", names=["uniprot", "chembl_id", "name", "type"], ) chembl_mapper = chembl_mapper[["uniprot", "chembl_id"]] chembl_mapper.set_index("uniprot", inplace=True) chembl_mapper = chembl_mapper.to_dict()["chembl_id"] hgnc_mapper = pd.read_csv( "https://raw.githubusercontent.com/Fraunhofer-ITMP/PEMT/main/data/mapper/hgnc_mapper.tsv", sep="\t", index_col="Approved symbol", ).to_dict()["UniProt ID(supplied by UniProt)"] # Loop to get and store the genes-chemical information from ChEMBL if os.path.exists(f"{MAPPER_DIR}/{analysis_name}_gene_to_chemicals.json"): gene_chemical_dict = json.load( open(f"{MAPPER_DIR}/{analysis_name}_gene_to_chemicals.json") ) else: gene_chemical_dict = defaultdict() new_count = 0 # Extract the gene if file_separator == "comma": _separator = "," elif file_separator == "semicolon": _separator = ";" else: assert file_separator == "tab" _separator = "\t" if gene_file_path: df = pd.read_csv(gene_file_path, sep=_separator) if is_uniprot: if "uniprot" not in list(df.columns): raise ValueError( f'Please rename columns to : "uniprot" in case of uniprot id or "symbol" in case of HGNC symbols' ) elif not is_uniprot: if "symbol" not in list(df.columns): raise ValueError( f'Please rename columns to : "uniprot" in case of uniprot id or "symbol" in case of HGNC symbols' ) if gene_file_path and _separator and is_uniprot: proteins = set(pd.read_csv(gene_file_path, sep=_separator)["uniprot"].tolist()) elif gene_file_path and _separator and not is_uniprot: proteins = set(pd.read_csv(gene_file_path, sep=_separator)["symbol"].tolist()) else: proteins = gene_list # Loop to get chemicals related to target for identifier in tqdm(proteins, desc="Extracting chemicals for targets"): if identifier in gene_chemical_dict: continue new_count += 1 # Extract chemical-target data from ChEMBL chemical_list = target_to_chemical( protein=identifier, protein_mapping=hgnc_mapper, chemical_mapping=chembl_mapper, is_uniprot=is_uniprot, ) gene_chemical_dict[identifier] = chemical_list if new_count == 50: with open(f"{MAPPER_DIR}/{analysis_name}_gene_to_chemicals.json", "w") as f: json.dump(gene_chemical_dict, f, ensure_ascii=False, indent=2) new_count = 0 # Save dict for re-use if new_count > 0: with open(f"{MAPPER_DIR}/{analysis_name}_gene_to_chemicals.json", "w") as f: json.dump(gene_chemical_dict, f, ensure_ascii=False, indent=2) # Get genes with no chemical hits get_chemical_overview(f"{MAPPER_DIR}/{analysis_name}_gene_to_chemicals.json") return gene_chemical_dict