Skip to content

Commit

Permalink
ATC Example (#32)
Browse files Browse the repository at this point in the history
* Example for linking against ATC without mention context
  • Loading branch information
phlobo authored Mar 8, 2024
1 parent 8129b17 commit d6c1f90
Show file tree
Hide file tree
Showing 6 changed files with 345 additions and 4 deletions.
263 changes: 263 additions & 0 deletions examples/04_Drug_Names_ATC.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "62ff831a-ce8b-4fc0-89e6-d910b31c5f85",
"metadata": {},
"source": [
"# Linking Drug Mentions in German (without Context) to ATC\n",
"\n",
"## Preparation\n",
"\n",
"- Get German ATC 2023 version from: https://www.wido.de/publikationen-produkte/arzneimittel-klassifikation/ \n",
"- Optional: get access to DrugBank (https://go.drugbank.com/releases/latest) for many more aliases (e.g., trade names)\n",
"- `pip install openpyxl`\n",
"- Prepare xMEN KB and indices:\n",
" - `xmen dict examples/conf/atc.yaml --code examples/dicts/atc2023_de.py`\n",
" - `xmen index examples/conf/atc.yaml --all`"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "2cbdaaa9-1b2d-4f6b-a193-554da8226217",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from xmen import load_kb\n",
"from xmen.linkers import default_ensemble\n",
"import os\n",
"from pathlib import Path"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "0756e7cf-9478-4bb2-bdf0-2cc147c44e55",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"base_path = Path(os.path.expanduser('~/.cache/xmen/atc/'))\n",
"kb = load_kb(base_path / 'atc.jsonl')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "449f6fe0-bcbd-4572-b48a-8f47a98c52f2",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[03/07/24 18:18:49] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> Loading hierarchical faiss index <a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">sap_bert_linker.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">153</span></a>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[2;36m[03/07/24 18:18:49]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading hierarchical faiss index \u001b]8;id=789095;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\u001b\\\u001b[2msap_bert_linker.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=471624;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\u001b\\\u001b[2m153\u001b[0m\u001b]8;;\u001b\\\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> Loading index from <a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">faiss_indexer.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#64\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">64</span></a>\n",
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> <span style=\"color: #800080; text-decoration-color: #800080\">/home/Florian.Borchert/.cache/xmen/atc/index/sapbert/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">embed_faiss_h</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> </span>\n",
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> <span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">ier.pickle</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> </span>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading index from \u001b]8;id=852479;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=39591;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#64\u001b\\\u001b[2m64\u001b[0m\u001b]8;;\u001b\\\n",
"\u001b[2;36m \u001b[0m \u001b[35m/home/Florian.Borchert/.cache/xmen/atc/index/sapbert/\u001b[0m\u001b[95membed_faiss_h\u001b[0m \u001b[2m \u001b[0m\n",
"\u001b[2;36m \u001b[0m \u001b[95mier.pickle\u001b[0m \u001b[2m \u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[03/07/24 18:18:50] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> Loaded index of type <span style=\"font-weight: bold\">&lt;</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">class</span><span style=\"color: #000000; text-decoration-color: #000000\"> </span><span style=\"color: #008000; text-decoration-color: #008000\">'faiss.swigfaiss.IndexHNSWFlat'</span><span style=\"font-weight: bold\">&gt;</span> and <a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">faiss_indexer.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">66</span></a>\n",
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> size <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">470941</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> </span>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[2;36m[03/07/24 18:18:50]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loaded index of type \u001b[1m<\u001b[0m\u001b[1;95mclass\u001b[0m\u001b[39m \u001b[0m\u001b[32m'faiss.swigfaiss.IndexHNSWFlat'\u001b[0m\u001b[1m>\u001b[0m and \u001b]8;id=788991;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=743737;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\u001b\\\u001b[2m66\u001b[0m\u001b]8;;\u001b\\\n",
"\u001b[2;36m \u001b[0m size \u001b[1;36m470941\u001b[0m \u001b[2m \u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"linker = default_ensemble(base_path / 'index')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "89561707-0f5e-480e-b96c-c29229e1fd70",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"drug_mentions = [\n",
" 'Ursodeoxycholsäure, 250 mg - Kapsel',\n",
" 'Propofol 2%, 20 mg/ml 1000 mg/50 ml Injektionslösung',\n",
" 'Norepinephrin 20 µg/ml',\n",
" 'Amphotericin B, 10 mg - Lutschtablette',\n",
" 'Fentanyl (50 µg/ml) i.v.',\n",
" 'Vollelektrolyt-Lösung',\n",
" 'Sufentanil 5µg/ml 250 µg/50 ml Injektionslösung'\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "2c09eda3-462c-4f6b-b055-52ff77a6ba65",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5a4c8dd8146148aaa5694a517d6aa965",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map: 0%| | 0/7 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"predictions = linker.predict_no_context(drug_mentions)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "8158be67-5c2e-4358-b986-c72b9a74e43c",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Input: Ursodeoxycholsäure, 250 mg - Kapsel\n",
"Confidence: 0.7435341477394104\n",
"CUI: A05AA02, Name: Ursodeoxycholsäure\n",
"Definition: None\n",
"TUI(s): \n",
"Aliases (abbreviated, total: 47): \n",
"\t Litursol, Solutrat, Ursochol, Ag-ursodiol, 3alpha,7beta-Dihydroxy-5beta-cholan-24-oic acid, Urso DS, PMS-ursodiol, Ursodeoxycholic acid, (3alpha,5beta,7beta)-3,7-dihydroxycholan-24-oic acid, Urusa\n",
"------\n",
"Input: Propofol 2%, 20 mg/ml 1000 mg/50 ml Injektionslösung\n",
"Confidence: 0.6950462460517883\n",
"CUI: N01AX10, Name: Propofol\n",
"Definition: None\n",
"TUI(s): \n",
"Aliases (abbreviated, total: 45): \n",
"\t Diprivan, Gobbifol, Hypro, Disoprivan, Propofil, Anesthesia S/I-50, Anesthesia S/I-60, Propofol-II Injection, Propoven, Anepol\n",
"------\n",
"Input: Norepinephrin 20 µg/ml\n",
"Confidence: 0.848741888999939\n",
"CUI: C01CA03, Name: Norepinephrin\n",
"Definition: None\n",
"TUI(s): \n",
"Aliases (abbreviated, total: 27): \n",
"\t (R)-norepinephrine, Norepinephrine Bitartrate In 5% Dextrose Injection, (R)-4-(2-amino-1-hydroxyethyl)-1,2-benzenediol, Norepinephrine Bitartrate Injection USP, Levophed(r) Norepinephrine Bitartrate, Norépinéphrine, Norepinephrine, Levophed, (R)-(−)-norepinephrine, L-noradrenaline\n",
"------\n",
"Input: Amphotericin B, 10 mg - Lutschtablette\n",
"Confidence: 0.6953610181808472\n",
"CUI: J02AA01, Name: Amphotericin B\n",
"Definition: None\n",
"TUI(s): \n",
"Aliases (abbreviated, total: 22): \n",
"\t Amphocin, Amphotericinum B, Liposomal amphotericin B, Amphotericin, Fungizone, Amphotec 50 mg, Amphotec 100 mg, Abelect, Amphotericin B, Amphocil\n",
"------\n",
"Input: Fentanyl (50 µg/ml) i.v.\n",
"Confidence: 0.8534790277481079\n",
"CUI: N02AB03, Name: Fentanyl\n",
"Definition: None\n",
"TUI(s): \n",
"Aliases (abbreviated, total: 70): \n",
"\t Instanyl, Duragesic 12, Fentanyl Buccal, Fentanyl Transdermal, Pecfent, N-(1-phenethylpiperidin-4-yl)-N-phenylpropionamide, Fentora, Lazanda, Mylan-fentanyl Matrix Patch, Abstral\n",
"------\n",
"Input: Vollelektrolyt-Lösung\n",
"Confidence: 0.8366699814796448\n",
"CUI: B05XA, Name: Elektrolytlösungen\n",
"Definition: None\n",
"TUI(s): \n",
"Aliases: (total: 0): \n",
"\t \n",
"------\n",
"Input: Sufentanil 5µg/ml 250 µg/50 ml Injektionslösung\n",
"Confidence: 0.7572200894355774\n",
"CUI: N01AH03, Name: Sufentanil\n",
"Definition: None\n",
"TUI(s): \n",
"Aliases (abbreviated, total: 21): \n",
"\t Sufentanyl, N-(4-(Methoxymethyl)-1-(2-(2-thienyl)ethyl)-4-piperidyl)propionanilide, Sufentanilum, Sufentil, Dsuvia, N-(4-(Methoxymethyl)-1-(2-(2-thienyl)ethyl)-4-piperidinyl)-N-phenylpropanamide, Zalviso, Sufentanil Citrate, Sufenta, Chronogesic\n",
"------\n"
]
}
],
"source": [
"for d, p in zip(drug_mentions, predictions):\n",
" print('Input:', d)\n",
" top_candidate = p['normalized'][0]\n",
" print('Confidence:', top_candidate['score'])\n",
" print(kb.cui_to_entity[top_candidate['db_id']])\n",
" print('------')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eb7e79b7-06b9-4fed-a603-8a63cb77b97d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:xmen_notebooks]",
"language": "python",
"name": "conda-env-xmen_notebooks-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
1 change: 1 addition & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
|[01_BRONCO_German.ipynb](01_BRONCO_German.ipynb)|🇩🇪|Candidate generation and supervised re-ranking using the BRONCO corpus.<br>Shows how you can configure multiple dictionaries in the same config file.|
|[02_spaCy_German.ipynb](02_spaCy_German.ipynb)|🇩🇪|Using a spaCy NER model with xMEN<br>Shows how to build a pipeline without labelled data using candidate generation, type filtering and pre-trained re-rankers|
|[03_SNOMED_Linking_German.ipynb](03_SNOMED_Linking_German.ipynb)|🇩🇪|Linking against codes in UMLS source vocabularies (here SNOMED CT)|
|[04_Drug_Names_ATC.ipynb](04_Drug_Names_ATC.ipynb)|🇩🇪|Normalization of medication mentions (without surrounding text) to ATC codes|

## External Links

Expand Down
10 changes: 10 additions & 0 deletions examples/conf/atc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Config for German ATC 2023 version
name : atc

dict:
custom:
# Get 2023 version from: https://www.wido.de/publikationen-produkte/arzneimittel-klassifikation/
atc_path: local_files/atc2023
drug_bank_xml: local_files/drugbank/5.1.8/full-database.xml
lang:
- de
63 changes: 63 additions & 0 deletions examples/dicts/atc2023_de.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import pandas as pd
from pathlib import Path
from collections import defaultdict
import openpyxl
import xml.etree.ElementTree as ET
from xmen.log import logger


def _load_atc_table(path):
    """Read the ATC-sorted WIdO index sheet into a DataFrame with ``code`` and ``text`` columns."""
    wb = openpyxl.load_workbook(f"{path}/ATC GKV-AI_2023.xlsm")
    sheet = wb["WIdO-Index 2023 ATC-sortiert"]
    table = pd.DataFrame(sheet.values)
    table = table.rename(columns={0: "code", 2: "text", 4: "DDD_Info"})
    # The first spreadsheet row is not a data record (presumably the header row).
    table = table.drop(index=0)
    # Rows without a code or without a text cannot become concepts; this also
    # removes rows that are empty in every column.
    table = table.dropna(subset=["code", "text"])
    table = table.assign(code=table["code"].str.rstrip())
    # Only codes longer than three characters are kept.
    return table[table["code"].str.len() > 3]


def _collect_drugbank_aliases(drug_bank_xml):
    """Map every ATC code found in the DrugBank XML file to the set of names of the drugs carrying it."""
    ns = {"": "http://www.drugbank.ca"}
    alias_paths = (
        "synonyms/synonym",
        "products/product/name",
        "mixtures/mixture/name",
        "international-brands/international-brand/name",
    )
    atc2name = defaultdict(set)
    for drug in ET.parse(drug_bank_xml).getroot():
        atc_codes = drug.find("atc-codes", ns)
        if atc_codes is None:
            # Entries without an <atc-codes> element cannot be linked to ATC.
            continue
        aliases = set()
        for path in alias_paths:
            # Empty elements have ``text`` set to None (or ""); they are not usable aliases.
            aliases.update(el.text for el in drug.findall(path, ns) if el.text)
        for atc_code in atc_codes:
            code = atc_code.get("code")
            if code:
                atc2name[code].update(aliases)
    return atc2name


def get_concept_details(cfg) -> dict:
    """Build the concept dictionary for the German ATC 2023 classification.

    Concepts are read from the WIdO Excel workbook located in
    ``cfg.dict.custom.atc_path``. If ``cfg.dict.custom.drug_bank_xml`` is set,
    the names DrugBank lists for a drug (synonyms, products, mixtures,
    international brands) are added as aliases to every ATC code of that drug.

    Args:
        cfg: Configuration object exposing ``cfg.dict.custom.atc_path`` and,
            optionally, ``cfg.dict.custom.drug_bank_xml`` (read via ``.get``).

    Returns:
        A dict mapping each ATC code to a dict with the keys ``concept_id``,
        ``canonical_name``, ``types`` (always an empty list) and ``aliases``.
    """
    atc_table = _load_atc_table(cfg.dict.custom.atc_path)

    atc2name = None
    if drug_bank_xml := cfg.dict.custom.get("drug_bank_xml", None):
        logger.info("Extending ATC by DrugBank synonyms")
        atc2name = _collect_drugbank_aliases(drug_bank_xml)

    logger.info("Building concept dictionary")
    concept_details = {}
    # Per-concept set mirroring the alias list, so the duplicate check is O(1)
    # instead of a linear scan of the list.
    seen_aliases = {}
    for sid, text in zip(atc_table["code"], atc_table["text"]):
        if sid not in concept_details:
            concept_details[sid] = {"concept_id": sid, "canonical_name": text, "types": [], "aliases": []}
            seen_aliases[sid] = set()
        else:
            # Further rows for an already known code contribute their text as an alias.
            concept_details[sid]["aliases"].append(text)
            seen_aliases[sid].add(text)
        if atc2name is not None:
            aliases = concept_details[sid]["aliases"]
            seen = seen_aliases[sid]
            # ``.get`` avoids growing the defaultdict with empty sets for unknown codes.
            for alias in atc2name.get(sid, ()):
                if alias not in seen:
                    seen.add(alias)
                    aliases.append(alias)

    return concept_details
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "xmen"
version = "1.0.5"
version = "1.0.6"
description = "An extensible toolkit for Cross-lingual (x) Medical Entity Normalization."
license = "Apache-2.0"
authors = ["Florian Borchert <[email protected]>"]
Expand Down
10 changes: 7 additions & 3 deletions xmen/linkers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,14 @@ def predict_no_context(
ds = from_spans(entities=spans, sentences=sentences)
result = self.predict_batch(ds, batch_size)
if is_str:
assert len(result["entities"]) == 1
return result["entities"][0]
assert len(result["entities"]) == 1 and len(result["entities"][0]) == 1
return result["entities"][0][0]
else:
return result["entities"]
_result = []
for r in result["entities"]:
assert len(r) == 1
_result.append(r[0])
return _result


class RerankedLinker(EntityLinker):
Expand Down

0 comments on commit d6c1f90

Please sign in to comment.