Source code for pemt.patent_extractor.patent_chemical_harmonizer

# -*- coding: utf-8 -*-

"""Script for harmonizing the ChEMBL chemicals with patent chemicals."""

import json
import logging
import os
from collections import defaultdict

import pandas as pd
from pubchempy import get_synonyms
from tqdm import tqdm

from pemt.constants import MAPPER_DIR, PATENT_DIR
from pemt.utils import get_chemical_names

logger = logging.getLogger(__name__)

os.makedirs(MAPPER_DIR, exist_ok=True)
os.makedirs(PATENT_DIR, exist_ok=True)


def get_surechembl_id(
    chemical_id: str, chemical_name: str, chemical_mapper: dict
) -> str:
    """Method to get SureChEMBL identifier of chemicals.

    :param chemical_id: ChEMBL identifier of the chemical
    :param chemical_name: Name of the chemical
    :param chemical_mapper: Dictionary of identifier mapping between ChEMBL and SureChEMBL
    """
    surechembl_id = chemical_mapper.get(chemical_id)

    if surechembl_id:
        return surechembl_id

    try:
        synm_dict = get_synonyms(chemical_name, namespace="name")[0]
    except IndexError:
        return None

    try:
        surechembl_id = [
            synonym for synonym in synm_dict["Synonym"] if synonym.startswith("SCHEMBL")
        ][0]
    except IndexError:
        return None

    return surechembl_id


[docs]def harmonize_chemicals(analysis_name: str, from_genes: bool = True) -> None: """Method that allows mapping from ChEMBL to SureChEMBL identifiers. :param analysis_name: The name of the analysis you want to run. This name would be used to save the resultant file. :param from_genes: Boolean indicating where the process needs to get chemicals based on genes or not. """ # Load cached data if it exists if os.path.exists(f"{PATENT_DIR}/{analysis_name}_chemicals.tsv"): chemical_df = pd.read_csv( f"{PATENT_DIR}/{analysis_name}_chemicals.tsv", sep="\t", dtype=str, ) cache_dict = pd.read_csv( f"{PATENT_DIR}/{analysis_name}_chemicals.tsv", sep="\t", dtype=str, index_col="chembl", ).to_dict()["schembl_id"] else: chemical_df = pd.DataFrame(columns=["chembl", "schembl_id", "name"]) cache_dict = {} # Load chembl - schembl mapper with open(f"{MAPPER_DIR}/chemical_mapper.json") as f: chemical_mapper = json.load(f) # Add caching system cache = 0 genes_skipped = 0 if os.path.exists(f"{MAPPER_DIR}/{analysis_name}_chemical_names.json"): chemical_names = json.load( open(f"{MAPPER_DIR}/{analysis_name}_chemical_names.json") ) else: chemical_names = defaultdict(str) if from_genes: if os.path.exists(f"{MAPPER_DIR}/{analysis_name}_gene_to_chemicals.json"): gene_chemical_dict = json.load( open(f"{MAPPER_DIR}/{analysis_name}_gene_to_chemicals.json") ) else: raise FileNotFoundError( f"Please ensure that you run the experimental data extractor file first." ) for genes in tqdm( gene_chemical_dict, desc="Harmonizing chemicals for patent retrieval" ): # Extract chemical-target data from ChEMBL chemical_list = gene_chemical_dict[genes] if len(chemical_list) < 1: genes_skipped += 1 continue for chembl_id in chemical_list: # Get name for chemical and store in dict if chembl_id not in chemical_names: chemical_names[chembl_id] = get_chemical_names(chembl_id) # Store chembl to surechembl mapping in separate file if chembl_id not in cache_dict: surechembl_id = get_surechembl_id( chemical_id=chembl_id, chemical_name=chemical_names[chembl_id], chemical_mapper=chemical_mapper, ) if not surechembl_id: continue cache += 1 # add new data only cache_dict[chembl_id] = surechembl_id tmp_df = pd.DataFrame( { "chembl": chembl_id, "schembl_id": surechembl_id, "name": chemical_names[chembl_id], }, index=[0], ) chemical_df = pd.concat([chemical_df, tmp_df], ignore_index=True) if cache == 10: chemical_df.to_csv( f"{PATENT_DIR}/{analysis_name}_chemicals.tsv", sep="\t", index=False, ) cache = 0 # Save chemical mapping dict for re-use with open( f"{MAPPER_DIR}/{analysis_name}_chemical_names.json", "w" ) as f: json.dump(chemical_names, f, ensure_ascii=False, indent=2) else: for _, row in tqdm( chemical_df.iterrows(), total=chemical_df.shape[0], desc="Harmonzing chemicals for patent retrival", ): chembl_id = row["chembl"] # Get name for chemical and store in dict if chembl_id not in chemical_names: chemical_names[chembl_id] = get_chemical_names(chembl_id) # Store chembl to surechembl mapping in separate file if chembl_id not in cache_dict: surechembl_id = get_surechembl_id( chemical_id=chembl_id, chemical_name=chemical_names[chembl_id], chemical_mapper=chemical_mapper, ) if not surechembl_id: continue cache += 1 # add new data only cache_dict[chembl_id] = surechembl_id tmp_df = pd.DataFrame( { "chembl": chembl_id, "schembl_id": surechembl_id, "name": chemical_names[chembl_id], }, index=[0], ) chemical_df = pd.concat([chemical_df, tmp_df], ignore_index=True) if cache == 10: chemical_df.to_csv( f"{PATENT_DIR}/{analysis_name}_chemicals.tsv", sep="\t", index=False, ) cache = 0 # Save chemical mapping dict for re-use with open( f"{MAPPER_DIR}/{analysis_name}_chemical_names.json", "w" ) as f: json.dump(chemical_names, f, ensure_ascii=False, indent=2) chemical_df.dropna(subset=["schembl_id"], inplace=True) chemical_df.to_csv( f"{PATENT_DIR}/{analysis_name}_chemicals.tsv", sep="\t", index=False ) with open(f"{MAPPER_DIR}/{analysis_name}_chemical_names.json", "w") as f: json.dump(chemical_names, f, ensure_ascii=False, indent=2)