Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ATC Example #32

Merged
merged 6 commits into from
Mar 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
263 changes: 263 additions & 0 deletions examples/04_Drug_Names_ATC.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "62ff831a-ce8b-4fc0-89e6-d910b31c5f85",
"metadata": {},
"source": [
"# Linking Drug Mentions in German (without Context) to ATC\n",
"\n",
"## Preparation\n",
"\n",
"- Get German ATC 2023 version from: https://www.wido.de/publikationen-produkte/arzneimittel-klassifikation/ \n",
"- Optional: get access to DrugBank (https://go.drugbank.com/releases/latest) for much more aliases (e.g., trade names)\n",
"- `pip install openpyxl`\n",
"- Prepare xMEN KB and indices:\n",
" - `xmen dict examples/conf/atc.yaml --code examples/dicts/atc2023_de.py`\n",
" - `xmen index examples/conf/atc.yaml --all`"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "2cbdaaa9-1b2d-4f6b-a193-554da8226217",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from xmen import load_kb\n",
"from xmen.linkers import default_ensemble\n",
"import os\n",
"from pathlib import Path"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "0756e7cf-9478-4bb2-bdf0-2cc147c44e55",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"base_path = Path(os.path.expanduser('~/.cache/xmen/atc/'))\n",
"kb = load_kb(base_path / 'atc.jsonl')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "449f6fe0-bcbd-4572-b48a-8f47a98c52f2",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[03/07/24 18:18:49] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> Loading hierarchical faiss index <a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">sap_bert_linker.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">153</span></a>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[2;36m[03/07/24 18:18:49]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading hierarchical faiss index \u001b]8;id=789095;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\u001b\\\u001b[2msap_bert_linker.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=471624;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\u001b\\\u001b[2m153\u001b[0m\u001b]8;;\u001b\\\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> Loading index from <a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">faiss_indexer.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#64\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">64</span></a>\n",
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> <span style=\"color: #800080; text-decoration-color: #800080\">/home/Florian.Borchert/.cache/xmen/atc/index/sapbert/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">embed_faiss_h</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> </span>\n",
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> <span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">ier.pickle</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> </span>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading index from \u001b]8;id=852479;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=39591;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#64\u001b\\\u001b[2m64\u001b[0m\u001b]8;;\u001b\\\n",
"\u001b[2;36m \u001b[0m \u001b[35m/home/Florian.Borchert/.cache/xmen/atc/index/sapbert/\u001b[0m\u001b[95membed_faiss_h\u001b[0m \u001b[2m \u001b[0m\n",
"\u001b[2;36m \u001b[0m \u001b[95mier.pickle\u001b[0m \u001b[2m \u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[03/07/24 18:18:50] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> Loaded index of type <span style=\"font-weight: bold\">&lt;</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">class</span><span style=\"color: #000000; text-decoration-color: #000000\"> </span><span style=\"color: #008000; text-decoration-color: #008000\">'faiss.swigfaiss.IndexHNSWFlat'</span><span style=\"font-weight: bold\">&gt;</span> and <a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">faiss_indexer.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">66</span></a>\n",
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> size <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">470941</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> </span>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[2;36m[03/07/24 18:18:50]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loaded index of type \u001b[1m<\u001b[0m\u001b[1;95mclass\u001b[0m\u001b[39m \u001b[0m\u001b[32m'faiss.swigfaiss.IndexHNSWFlat'\u001b[0m\u001b[1m>\u001b[0m and \u001b]8;id=788991;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=743737;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\u001b\\\u001b[2m66\u001b[0m\u001b]8;;\u001b\\\n",
"\u001b[2;36m \u001b[0m size \u001b[1;36m470941\u001b[0m \u001b[2m \u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"linker = default_ensemble(base_path / 'index')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "89561707-0f5e-480e-b96c-c29229e1fd70",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"drug_mentions = [\n",
" 'Ursodeoxycholsäure, 250 mg - Kapsel',\n",
" 'Propofol 2%, 20 mg/ml 1000 mg/50 ml Injektionslösung',\n",
" 'Norepinephrin 20 µg/ml',\n",
" 'Amphotericin B, 10 mg - Lutschtablette',\n",
" 'Fentanyl (50 µg/ml) i.v.',\n",
" 'Vollelektrolyt-Lösung',\n",
" 'Sufentanil 5µg/ml 250 µg/50 ml Injektionslösung'\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "2c09eda3-462c-4f6b-b055-52ff77a6ba65",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5a4c8dd8146148aaa5694a517d6aa965",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map: 0%| | 0/7 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"predictions = linker.predict_no_context(drug_mentions)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "8158be67-5c2e-4358-b986-c72b9a74e43c",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Input: Ursodeoxycholsäure, 250 mg - Kapsel\n",
"Confidence: 0.7435341477394104\n",
"CUI: A05AA02, Name: Ursodeoxycholsäure\n",
"Definition: None\n",
"TUI(s): \n",
"Aliases (abbreviated, total: 47): \n",
"\t Litursol, Solutrat, Ursochol, Ag-ursodiol, 3alpha,7beta-Dihydroxy-5beta-cholan-24-oic acid, Urso DS, PMS-ursodiol, Ursodeoxycholic acid, (3alpha,5beta,7beta)-3,7-dihydroxycholan-24-oic acid, Urusa\n",
"------\n",
"Input: Propofol 2%, 20 mg/ml 1000 mg/50 ml Injektionslösung\n",
"Confidence: 0.6950462460517883\n",
"CUI: N01AX10, Name: Propofol\n",
"Definition: None\n",
"TUI(s): \n",
"Aliases (abbreviated, total: 45): \n",
"\t Diprivan, Gobbifol, Hypro, Disoprivan, Propofil, Anesthesia S/I-50, Anesthesia S/I-60, Propofol-II Injection, Propoven, Anepol\n",
"------\n",
"Input: Norepinephrin 20 µg/ml\n",
"Confidence: 0.848741888999939\n",
"CUI: C01CA03, Name: Norepinephrin\n",
"Definition: None\n",
"TUI(s): \n",
"Aliases (abbreviated, total: 27): \n",
"\t (R)-norepinephrine, Norepinephrine Bitartrate In 5% Dextrose Injection, (R)-4-(2-amino-1-hydroxyethyl)-1,2-benzenediol, Norepinephrine Bitartrate Injection USP, Levophed(r) Norepinephrine Bitartrate, Norépinéphrine, Norepinephrine, Levophed, (R)-(−)-norepinephrine, L-noradrenaline\n",
"------\n",
"Input: Amphotericin B, 10 mg - Lutschtablette\n",
"Confidence: 0.6953610181808472\n",
"CUI: J02AA01, Name: Amphotericin B\n",
"Definition: None\n",
"TUI(s): \n",
"Aliases (abbreviated, total: 22): \n",
"\t Amphocin, Amphotericinum B, Liposomal amphotericin B, Amphotericin, Fungizone, Amphotec 50 mg, Amphotec 100 mg, Abelect, Amphotericin B, Amphocil\n",
"------\n",
"Input: Fentanyl (50 µg/ml) i.v.\n",
"Confidence: 0.8534790277481079\n",
"CUI: N02AB03, Name: Fentanyl\n",
"Definition: None\n",
"TUI(s): \n",
"Aliases (abbreviated, total: 70): \n",
"\t Instanyl, Duragesic 12, Fentanyl Buccal, Fentanyl Transdermal, Pecfent, N-(1-phenethylpiperidin-4-yl)-N-phenylpropionamide, Fentora, Lazanda, Mylan-fentanyl Matrix Patch, Abstral\n",
"------\n",
"Input: Vollelektrolyt-Lösung\n",
"Confidence: 0.8366699814796448\n",
"CUI: B05XA, Name: Elektrolytlösungen\n",
"Definition: None\n",
"TUI(s): \n",
"Aliases: (total: 0): \n",
"\t \n",
"------\n",
"Input: Sufentanil 5µg/ml 250 µg/50 ml Injektionslösung\n",
"Confidence: 0.7572200894355774\n",
"CUI: N01AH03, Name: Sufentanil\n",
"Definition: None\n",
"TUI(s): \n",
"Aliases (abbreviated, total: 21): \n",
"\t Sufentanyl, N-(4-(Methoxymethyl)-1-(2-(2-thienyl)ethyl)-4-piperidyl)propionanilide, Sufentanilum, Sufentil, Dsuvia, N-(4-(Methoxymethyl)-1-(2-(2-thienyl)ethyl)-4-piperidinyl)-N-phenylpropanamide, Zalviso, Sufentanil Citrate, Sufenta, Chronogesic\n",
"------\n"
]
}
],
"source": [
"for d, p in zip(drug_mentions, predictions):\n",
" print('Input:', d)\n",
" top_candidate = p['normalized'][0]\n",
" print('Confidence:', top_candidate['score'])\n",
" print(kb.cui_to_entity[top_candidate['db_id']])\n",
" print('------')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eb7e79b7-06b9-4fed-a603-8a63cb77b97d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:xmen_notebooks]",
"language": "python",
"name": "conda-env-xmen_notebooks-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
1 change: 1 addition & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
|[01_BRONCO_German.ipynb](01_BRONCO_German.ipynb)|🇩🇪|Candidate generation and supervised re-ranking using the BRONCO corpus.<br>Shows how you can configure multiple dictionaries in the same config file.|
|[02_spaCy_German.ipynb](02_spaCy_German.ipynb)|🇩🇪|Using a spaCy NER model with xMEN<br>Shows how to build a pipeline without labelled data using candidate generation, type filtering and pre-trained re-rankers|
|[03_SNOMED_Linking_German.ipynb](03_SNOMED_Linking_German.ipynb)|🇩🇪|Linking against codes in UMLS source vocabularies (here SNOMED CT)|
|[04_Drug_Names_ATC.ipynb](04_Drug_Names_ATC.ipynb)|🇩🇪|Normalization of medication mentions (without surrounding text) to ATC codes|

## External Links

Expand Down
10 changes: 10 additions & 0 deletions examples/conf/atc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Config for German ATC 2023 version
name : atc

dict:
custom:
# Get 2023 version from: https://www.wido.de/publikationen-produkte/arzneimittel-klassifikation/
atc_path: local_files/atc2023
drug_bank_xml: local_files/drugbank/5.1.8/full-database.xml
lang:
- de
63 changes: 63 additions & 0 deletions examples/dicts/atc2023_de.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import pandas as pd
from pathlib import Path
from collections import defaultdict
import openpyxl
import xml.etree.ElementTree as ET
from xmen.log import logger


def get_concept_details(cfg) -> dict:
path = cfg.dict.custom.atc_path
wb = openpyxl.load_workbook(f"{path}/ATC GKV-AI_2023.xlsm")
sheet = wb["WIdO-Index 2023 ATC-sortiert"]
sort = pd.DataFrame(sheet.values)
sort = sort.rename(columns={0: "code", 2: "text", 4: "DDD_Info"})
sort.drop(0, axis=0, inplace=True)
sort.dropna(axis=0, how="all", inplace=True)
sort.drop(sort[sort.text.isnull()].index, inplace=True)
sort.drop(sort[sort.code.isnull()].index, inplace=True)

sort["code"] = sort["code"].str.rstrip()
sort = sort[sort["code"].str.len() > 3]

if drug_bank_xml := cfg.dict.custom.get("drug_bank_xml", None):
logger.info("Extending ATC by DrugBank synonyms")

ns = {"": "http://www.drugbank.ca"}
tree = ET.parse(drug_bank_xml)

atc2name = defaultdict(set)

for drug in tree.getroot():
aliases = set()

syns = [s.text for s in drug.findall("synonyms/synonym", ns)]
aliases.update(syns)

products = [p.text for p in drug.findall("products/product/name", ns)]
aliases.update(products)

mixtures = [p.text for p in drug.findall("mixtures/mixture/name", ns)]
aliases.update(mixtures)

international = [i.text for i in drug.findall("international-brands/international-brand/name", ns)]
aliases.update(international)

atc_codes = drug.find("atc-codes", ns)
for atc_code in atc_codes:
atc2name[atc_code.get("code")].update(aliases)

logger.info("Building concept dictionary")
concept_details = {}
for _, entry in sort.iterrows():
sid = entry.code
if not sid in concept_details:
concept_details[sid] = {"concept_id": sid, "canonical_name": entry.text, "types": [], "aliases": []}
elif sid in concept_details:
concept_details[sid]["aliases"].append(entry.text)
if drug_bank_xml:
for alias in atc2name[sid]:
if not alias in concept_details[sid]["aliases"]:
concept_details[sid]["aliases"].append(alias)

return concept_details
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "xmen"
version = "1.0.5"
version = "1.0.6"
description = "An extensible toolkit for Cross-lingual (x) Medical Entity Normalization."
license = "Apache-2.0"
authors = ["Florian Borchert <[email protected]>"]
Expand Down
10 changes: 7 additions & 3 deletions xmen/linkers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,14 @@ def predict_no_context(
ds = from_spans(entities=spans, sentences=sentences)
result = self.predict_batch(ds, batch_size)
if is_str:
assert len(result["entities"]) == 1
return result["entities"][0]
assert len(result["entities"]) == 1 and len(result["entities"][0]) == 1
return result["entities"][0][0]
else:
return result["entities"]
_result = []
for r in result["entities"]:
assert len(r) == 1
_result.append(r[0])
return _result


class RerankedLinker(EntityLinker):
Expand Down
Loading