diff --git a/edsnlp/pipes/ner/tnm/model.py b/edsnlp/pipes/ner/tnm/model.py index 206930976..1916b2b73 100644 --- a/edsnlp/pipes/ner/tnm/model.py +++ b/edsnlp/pipes/ner/tnm/model.py @@ -79,15 +79,21 @@ class Metastasis(TnmEnum): class TNM(pydantic.BaseModel): - prefix: Optional[Prefix] = None - tumour: Optional[Tumour] = None - tumour_specification: Optional[Specification] = None + tumour_prefix: Optional[str] = None + tumour: Optional[str] = None + tumour_specification: Optional[str] = None tumour_suffix: Optional[str] = None - node: Optional[Node] = None - node_specification: Optional[Specification] = None + node_prefix: Optional[str] = None + node: Optional[str] = None + node_specification: Optional[str] = None node_suffix: Optional[str] = None - metastasis: Optional[Metastasis] = None - resection_completeness: Optional[int] = None + metastasis_prefix: Optional[str] = None + metastasis: Optional[str] = None + metastasis_specification: Optional[str] = None + pleura: Optional[str] = None + resection: Optional[str] = None + resection_specification: Optional[str] = None + resection_loc: Optional[str] = None version: Optional[str] = None version_year: Optional[int] = None @@ -112,32 +118,43 @@ def validate_year(cls, v): def norm(self) -> str: norm = [] - if self.prefix is not None: - norm.append(str(self.prefix)) + if self.tumour_prefix: + norm.append(f"{self.tumour_prefix or ''}") - if ( - (self.tumour is not None) - | (self.tumour_specification is not None) - | (self.tumour_suffix is not None) - ): - norm.append(f"T{str(self.tumour or '')}") - norm.append(f"{str(self.tumour_specification or '')}") - norm.append(f"{str(self.tumour_suffix or '')}") - - if ( - (self.node is not None) - | (self.node_specification is not None) - | (self.node_suffix is not None) - ): - norm.append(f"N{str(self.node or '')}") - norm.append(f"{str(self.node_specification or '')}") - norm.append(f"{str(self.node_suffix or '')}") + if self.tumour: + norm.append(f"T{self.tumour}") + if 
self.tumour_specification: + norm.append(f"{self.tumour_specification or ''}") + if self.tumour_suffix: + norm.append(f"{self.tumour_suffix or ''}") + + if self.node_prefix: + norm.append(f"{self.node_prefix or ''}") - if self.metastasis is not None: + if self.node: + norm.append(f"N{self.node}") + if self.node_specification: + norm.append(f"{self.node_specification or ''}") + if self.node_suffix: + norm.append(f"{self.node_suffix or ''}") + + if self.metastasis_prefix: + norm.append(f"{self.metastasis_prefix or ''}") + + if self.metastasis: norm.append(f"M{self.metastasis}") + if self.metastasis_specification: + norm.append(f"{self.metastasis_specification or ''}") + + if self.pleura: + norm.append(f"PL{self.pleura}") - if self.resection_completeness is not None: - norm.append(f"R{self.resection_completeness}") + if self.resection: + norm.append(f"R{self.resection}") + if self.resection_specification: + norm.append(f"{self.resection_specification or ''}") + if self.resection_loc: + norm.append(f"{self.resection_loc or ''}") if self.version is not None and self.version_year is not None: norm.append(f" ({self.version.upper()} {self.version_year})") @@ -182,14 +199,21 @@ def dict( set_keys = set(d.keys()) for k in set_keys.intersection( { - "prefix", + "tumour_prefix", "tumour", - "node", - "metastasis", "tumour_specification", - "node_specification", "tumour_suffix", + "node_prefix", + "node", + "node_specification", "node_suffix", + "metastasis_prefix", + "metastasis", + "metastasis_specification", + "pleura", + "resection", + "resection_specification", + "resection_loc", } ): v = d[k] diff --git a/edsnlp/pipes/ner/tnm/patterns.py b/edsnlp/pipes/ner/tnm/patterns.py index f99d857d7..f0d4a1b28 100644 --- a/edsnlp/pipes/ner/tnm/patterns.py +++ b/edsnlp/pipes/ner/tnm/patterns.py @@ -1,5 +1,7 @@ -prefix_pattern = r"(?P[cpPyraum]p?)" -tumour_pattern = r"T\s?(?P([0-4o]|is))?(?P[abcdx]|mi)?" 
+prefix_pattern = r"(?P[cpPyraum]p?)" +tumour_pattern = ( + r"T\s?(?P([0-4o]|is|[Xx]))?(?P[abcdx]|mi)?" +) tumour_pattern += r"(?:\((?P[^()]{1,10})\))?" node_pattern = r"(\s{,2}\/?\s{,2}([cpPyraum]p?)?\s{,2}N\s?(?P[0-3o]|x)" node_pattern += ( @@ -7,9 +9,9 @@ ) metastasis_pattern = ( - r"(\s{,2}\/?\s{,2}([cpPyraum]p?)?\s{,2}M\s?(?P([01o]|x))x?)" # noqa: E501 + r"(\s{,2}\/?\s{,2}([cpPyraum]p?)?\s{,2}M\s?(?P([01o]|x))x?)" ) -resection_completeness = r"(\s{,2}\/?\s{,2}R\s?(?P[012]))" +resection_pattern = r"(\s{,2}\/?\s{,2}R\s?(?P[012]))" version_pattern = ( r"\(?(?Puicc|accj|tnm|UICC|ACCJ|TNM)" @@ -23,6 +25,6 @@ tnm_pattern += prefix_pattern + r"\s{,2}?" + f"({tumour_pattern})" tnm_pattern += r"(\s{,2}" + f"{node_pattern})?" tnm_pattern += r"(\s{,2}" + f"{metastasis_pattern})?" -tnm_pattern += r"(\s{,2}" + f"{resection_completeness})?" +tnm_pattern += r"(\s{,2}" + f"{resection_pattern})?" tnm_pattern += f"({spacer}{version_pattern})?" tnm_pattern = r"(?:\b|^)" + tnm_pattern + r"(?:\b|$)" diff --git a/edsnlp/pipes/ner/tnm/patterns_new.py b/edsnlp/pipes/ner/tnm/patterns_new.py new file mode 100644 index 000000000..a3eac8217 --- /dev/null +++ b/edsnlp/pipes/ner/tnm/patterns_new.py @@ -0,0 +1,92 @@ +tumour_pattern = ( + r"(?P[cpyramP]{1,2}\s?)?" # Optional tumour prefix + r"T\s?" # 'T' followed by optional space + r"(?P([0-4]|is|[Xx]))" # Tumour size (required if 'T' is present) + r"(?P[abcdx]|mi)?" # Optional tumour specification + r"(?:\s?\((?P[^()]{1,10})\))?" # Optional tumour suffix +) + +node_pattern = ( + r"(?P[cpyraP]{1,2}\s?)?" # Optional node prefix + r"N\s?" # 'N' followed by optional space + r"(?P[Xx01234\+])" # Node size/status (required if 'N' is present) + r"(?P[abcdx]|mi|sn|i[-,+]|mol[-,+]|\(mi\)|\(sn\)|" + r"\(i[-,+]\)|\(mol[-,+]\)|\(\d+\s*/\s*\d+\))?" # Optional node specification + r"(?:\s?\((?P[^()]{1,10})\))?" # Optional node suffix +) + +metastasis_pattern = ( + r"(?P[cpyraP]{1,2}\s?)?" # Optional metastasis prefix + r"M\s?" 
# 'M' followed by optional space + r"(?P[Xx0123\+])" # Metastasis status (required if 'M' is present) + r"(?P[abcd]|i\+|mol\+|cy\+|\(i\+\)|\(mol\+\)|" + r"\(cy\+\)|PUL|OSS|HEP|BRA|LYM|OTH|MAR|PLE|PER|ADR|SKI)?" # Optional specification +) + +pleura_pattern = ( + r"PL\s?(?P([0123]|x))?" # Optional pleura status (for lung cancer) +) + +resection_pattern = ( + r"R\s?" + r"(?P[Xx012])?" # Optional resection completeness + r"(?P(is|cy\+|\(is\)|\(cy\+\))?)?" # Optional spec + r"(?P(\((?P[a-z]+)\)[,;\s]*)*)?" # Optional localization +) + +version_pattern = ( + r"\(?(?Puicc|accj|tnm|UICC|ACCJ|TNM)" # TNM version + r"\s+([éeE]ditions|[éeE]d\.?)?\s*" + r"(?P\d{4}|\d{2})\)?" # Year of the version +) + +TNM_space = r"(\s*[,\/]?\s*|\n)" # Allow space, comma, or slash as delimiters + +# We need to exclude patterns like 'T1', 'T2' if they are not followed by node or +# metastasis sections. +exclude_pattern = ( + r"(?!T\s*[0-4]\s*[.,\/](?!\s*" + + node_pattern + + "?" + + TNM_space + + "?" + + metastasis_pattern + + "?" + + "))" +) + +tnm_pattern_new = ( + r"(?:\b|^)" + + exclude_pattern + + r"(?:" + + r"(?P" + + tumour_pattern + + ")" + + TNM_space + + "?" + + r"(?P" + + node_pattern + + ")?" + + TNM_space + + "?" + + r"(?P" + + metastasis_pattern + + ")?" + + TNM_space + + "?" + + r"(?P" + + pleura_pattern + + ")?" + + TNM_space + + "?" + + r"(?P" + + resection_pattern + + ")?" + + TNM_space + + "?" + + r"(?P" + + version_pattern + + ")?" 
+ + r")" + + r"(?:\b|$|\n)" +) diff --git a/edsnlp/pipes/ner/tnm/tnm.py b/edsnlp/pipes/ner/tnm/tnm.py index 58a564d73..22574498b 100644 --- a/edsnlp/pipes/ner/tnm/tnm.py +++ b/edsnlp/pipes/ner/tnm/tnm.py @@ -12,7 +12,7 @@ from edsnlp.utils.typing import cast from .model import TNM -from .patterns import tnm_pattern +from .patterns_new import tnm_pattern_new class TNMMatcher(BaseNERComponent): @@ -75,7 +75,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "tnm", *, - pattern: Optional[Union[List[str], str]] = tnm_pattern, + pattern: Optional[Union[List[str], str]] = tnm_pattern_new, attr: str = "TEXT", label: str = "tnm", span_setter: SpanSetterArg = {"ents": True, "tnm": True}, diff --git a/tests/pipelines/ner/test_tnm.py b/tests/pipelines/ner/test_tnm.py index 080bc67a4..9eb2409ba 100644 --- a/tests/pipelines/ner/test_tnm.py +++ b/tests/pipelines/ner/test_tnm.py @@ -1,3 +1,4 @@ +from edsnlp.pipes.ner.tnm.patterns import tnm_pattern from edsnlp.utils.examples import parse_example examples = [ @@ -22,7 +23,7 @@ def test_scores(blank_nlp): - blank_nlp.add_pipe("eds.tnm") + blank_nlp.add_pipe("eds.tnm", config=dict(pattern=tnm_pattern)) for example in examples: text, entities = parse_example(example=example)