diff --git a/chart_review/cli.py b/chart_review/cli.py index 0fd1203..88507b9 100644 --- a/chart_review/cli.py +++ b/chart_review/cli.py @@ -3,7 +3,7 @@ import argparse import sys -from chart_review.commands import accuracy, default, ids, labels +from chart_review.commands import accuracy, default, ids, labels, mentions def define_parser() -> argparse.ArgumentParser: @@ -15,6 +15,7 @@ def define_parser() -> argparse.ArgumentParser: accuracy.make_subparser(subparsers.add_parser("accuracy", help="calculate F1 and Kappa scores")) ids.make_subparser(subparsers.add_parser("ids", help="map Label Studio IDs to FHIR IDs")) labels.make_subparser(subparsers.add_parser("labels", help="show label usage by annotator")) + mentions.make_subparser(subparsers.add_parser("mentions", help="show each mention of a label")) return parser diff --git a/chart_review/cohort.py b/chart_review/cohort.py index 591dbe4..a19071d 100644 --- a/chart_review/cohort.py +++ b/chart_review/cohort.py @@ -44,11 +44,9 @@ def __init__(self, proj_config: config.ProjectConfig): # Calculate the final set of note ranges for each annotator self.note_range, self.ignored_notes = self._collect_note_ranges(self.ls_export) - # Remove any ignored notes from the mentions table, for ease of consuming code - for mentions in self.annotations.mentions.values(): - for note in self.ignored_notes: - if note in mentions: - del mentions[note] + # Remove any ignored notes from the annotations, for ease of consuming code + for note in self.ignored_notes: + self.annotations.remove(note) def _collect_note_ranges( self, exported_json: list[dict] diff --git a/chart_review/commands/labels.py b/chart_review/commands/labels.py index 82a417d..96a4cd7 100644 --- a/chart_review/commands/labels.py +++ b/chart_review/commands/labels.py @@ -29,19 +29,19 @@ def print_labels(args: argparse.Namespace) -> None: label_notes[annotator][name] = note_ids any_annotator_note_sets.setdefault(name, types.NoteSet()).update(note_ids) - label_table = 
cli_utils.create_table("Annotator", "Chart Count", "Label") + label_table = cli_utils.create_table("Annotator", "Label", "Chart Count") # First add summary entries, for counts across the union of all annotators for name in label_names: count = str(len(any_annotator_note_sets.get(name, {}))) - label_table.add_row(rich.text.Text("Any", style="italic"), count, name) + label_table.add_row(rich.text.Text("Any", style="italic"), name, count) # Now do each annotator as their own little boxed section for annotator in sorted(label_notes.keys(), key=str.casefold): label_table.add_section() for name, note_set in label_notes[annotator].items(): count = str(len(note_set)) - label_table.add_row(annotator, count, name) + label_table.add_row(annotator, name, count) if args.csv: cli_utils.print_table_as_csv(label_table) diff --git a/chart_review/commands/mentions.py b/chart_review/commands/mentions.py new file mode 100644 index 0000000..1d8bf9a --- /dev/null +++ b/chart_review/commands/mentions.py @@ -0,0 +1,38 @@ +import argparse + +import rich +import rich.box +import rich.table +import rich.text + +from chart_review import cli_utils, console_utils, types + + +def make_subparser(parser: argparse.ArgumentParser) -> None: + cli_utils.add_project_args(parser) + cli_utils.add_output_args(parser) + parser.set_defaults(func=print_mentions) + + +def print_mentions(args: argparse.Namespace) -> None: + """ + Print Label Studio export's mentions (text associated with the label). 
+ """ + reader = cli_utils.get_cohort_reader(args) + + table = cli_utils.create_table("Annotator", "Chart ID", "Mention", "Label") + + for annotator in sorted(reader.annotations.original_text_mentions, key=str.casefold): + table.add_section() + mentions = reader.annotations.original_text_mentions[annotator] + for note_id, labeled_texts in mentions.items(): + for label_text in labeled_texts: + for label in sorted(label_text.labels, key=str.casefold): + if label in reader.annotations.labels: + table.add_row(annotator, str(note_id), label_text.text, label) + + if args.csv: + cli_utils.print_table_as_csv(table) + else: + rich.get_console().print(table) + console_utils.print_ignored_charts(reader) diff --git a/chart_review/console_utils.py b/chart_review/console_utils.py index 320d179..db9e782 100644 --- a/chart_review/console_utils.py +++ b/chart_review/console_utils.py @@ -39,7 +39,15 @@ def end_range() -> None: def print_ignored_charts(reader: cohort.CohortReader): - """Prints a line about ignored charts, suitable for underlying a table""" + """ + Prints a line about ignored charts, suitable for underlying a table. + + It's recommended that any CLI command that shows individual chart IDs + call this for their normal output view (i.e. not a formatted view like --csv). + + For commands that just show aggregate chart numbers, + use your judgement if it helps or is just confusing extra info. 
+ """ if not reader.ignored_notes: return diff --git a/chart_review/types.py b/chart_review/types.py index e723b74..bebe627 100644 --- a/chart_review/types.py +++ b/chart_review/types.py @@ -39,3 +39,12 @@ class ProjectAnnotations: original_text_mentions: dict[str, dict[int, list[LabeledText]]] = dataclasses.field( default_factory=dict ) + + def remove(self, chart_id: int): + # Remove any instance of this chart ID + for mentions in self.mentions.values(): + if chart_id in mentions: + del mentions[chart_id] + for mentions in self.original_text_mentions.values(): + if chart_id in mentions: + del mentions[chart_id] diff --git a/docs/labels.md b/docs/labels.md index 5f21c87..13b5f9f 100644 --- a/docs/labels.md +++ b/docs/labels.md @@ -8,32 +8,32 @@ nav_order: 7 # The Labels Command -The `labels` prints some statistics on the project labels +The `labels` command prints some statistics on the project labels and how often each annotator used each label. ## Example ```shell $ chart-review labels -╭───────────┬─────────────┬──────────╮ -│ Annotator │ Chart Count │ Label │ -├───────────┼─────────────┼──────────┤ -│ Any │ 2 │ Cough │ -│ Any │ 3 │ Fatigue │ -│ Any │ 3 │ Headache │ -├───────────┼─────────────┼──────────┤ -│ jane │ 1 │ Cough │ -│ jane │ 2 │ Fatigue │ -│ jane │ 2 │ Headache │ -├───────────┼─────────────┼──────────┤ -│ jill │ 2 │ Cough │ -│ jill │ 3 │ Fatigue │ -│ jill │ 0 │ Headache │ -├───────────┼─────────────┼──────────┤ -│ john │ 1 │ Cough │ -│ john │ 2 │ Fatigue │ -│ john │ 2 │ Headache │ -╰───────────┴─────────────┴──────────╯ +╭───────────┬──────────┬─────────────╮ +│ Annotator │ Label │ Chart Count │ +├───────────┼──────────┼─────────────┤ +│ Any │ Cough │ 2 │ +│ Any │ Fatigue │ 3 │ +│ Any │ Headache │ 3 │ +├───────────┼──────────┼─────────────┤ +│ jane │ Cough │ 1 │ +│ jane │ Fatigue │ 2 │ +│ jane │ Headache │ 2 │ +├───────────┼──────────┼─────────────┤ +│ jill │ Cough │ 2 │ +│ jill │ Fatigue │ 3 │ +│ jill │ Headache │ 0 │ 
+├───────────┼──────────┼─────────────┤ +│ john │ Cough │ 1 │ +│ john │ Fatigue │ 2 │ +│ john │ Headache │ 2 │ +╰───────────┴──────────┴─────────────╯ ``` ## Options @@ -49,17 +49,17 @@ $ chart-review labels --csv > labels.csv ```shell $ chart-review labels --csv -annotator,chart_count,label -Any,2,Cough -Any,3,Fatigue -Any,3,Headache -jane,1,Cough -jane,2,Fatigue -jane,2,Headache -jill,2,Cough -jill,3,Fatigue -jill,0,Headache -john,1,Cough -john,2,Fatigue -john,2,Headache +annotator,label,chart_count +Any,Cough,2 +Any,Fatigue,3 +Any,Headache,3 +jane,Cough,1 +jane,Fatigue,2 +jane,Headache,2 +jill,Cough,2 +jill,Fatigue,3 +jill,Headache,0 +john,Cough,1 +john,Fatigue,2 +john,Headache,2 ``` diff --git a/docs/mentions.md b/docs/mentions.md new file mode 100644 index 0000000..99369bd --- /dev/null +++ b/docs/mentions.md @@ -0,0 +1,70 @@ +--- +title: Mentions Command +parent: Chart Review +nav_order: 8 +# audience: lightly technical folks +# type: how-to +--- + +# The Mentions Command + +The `mentions` command prints each time a piece of text was labeled +and with what label. + +## Example + +```shell +$ chart-review mentions +╭───────────┬──────────┬─────────┬──────────╮ +│ Annotator │ Chart ID │ Mention │ Label │ +├───────────┼──────────┼─────────┼──────────┤ +│ jane │ 1 │ achoo │ Cough │ +│ jane │ 1 │ sigh │ Fatigue │ +│ jane │ 1 │ sigh │ Headache │ +│ jane │ 4 │ sleepy │ Fatigue │ +│ jane │ 4 │ pain │ Headache │ +├───────────┼──────────┼─────────┼──────────┤ +│ jill │ 1 │ achoo │ Cough │ +│ jill │ 1 │ sigh │ Fatigue │ +│ jill │ 2 │ ouch │ Fatigue │ +│ jill │ 4 │ sleepy │ Fatigue │ +│ jill │ 4 │ pain │ Cough │ +├───────────┼──────────┼─────────┼──────────┤ +│ john │ 1 │ achoo │ Cough │ +│ john │ 1 │ sigh │ Fatigue │ +│ john │ 2 │ ouch │ Headache │ +│ john │ 4 │ sleepy │ Fatigue │ +│ john │ 4 │ pain │ Headache │ +╰───────────┴──────────┴─────────┴──────────╯ +``` + +## Options + +### --csv + +Print the mentions in a machine-parseable CSV format. 
+ +#### Examples +```shell +$ chart-review mentions --csv > mentions.csv +``` + +```shell +$ chart-review mentions --csv +annotator,chart_id,mention,label +jane,1,achoo,Cough +jane,1,sigh,Fatigue +jane,1,sigh,Headache +jane,4,sleepy,Fatigue +jane,4,pain,Headache +jill,1,achoo,Cough +jill,1,sigh,Fatigue +jill,2,ouch,Fatigue +jill,4,sleepy,Fatigue +jill,4,pain,Cough +john,1,achoo,Cough +john,1,sigh,Fatigue +john,2,ouch,Headache +john,4,sleepy,Fatigue +john,4,pain,Headache +``` diff --git a/tests/test_labels.py b/tests/test_labels.py index 7163f3d..685a602 100644 --- a/tests/test_labels.py +++ b/tests/test_labels.py @@ -13,25 +13,25 @@ def test_labels(self): stdout = self.run_cli("labels", path=f"{self.DATA_DIR}/cold") self.assertEqual( - """╭───────────┬─────────────┬──────────╮ -│ Annotator │ Chart Count │ Label │ -├───────────┼─────────────┼──────────┤ -│ Any │ 2 │ Cough │ -│ Any │ 3 │ Fatigue │ -│ Any │ 3 │ Headache │ -├───────────┼─────────────┼──────────┤ -│ jane │ 1 │ Cough │ -│ jane │ 2 │ Fatigue │ -│ jane │ 2 │ Headache │ -├───────────┼─────────────┼──────────┤ -│ jill │ 2 │ Cough │ -│ jill │ 3 │ Fatigue │ -│ jill │ 0 │ Headache │ -├───────────┼─────────────┼──────────┤ -│ john │ 1 │ Cough │ -│ john │ 2 │ Fatigue │ -│ john │ 2 │ Headache │ -╰───────────┴─────────────┴──────────╯ + """╭───────────┬──────────┬─────────────╮ +│ Annotator │ Label │ Chart Count │ +├───────────┼──────────┼─────────────┤ +│ Any │ Cough │ 2 │ +│ Any │ Fatigue │ 3 │ +│ Any │ Headache │ 3 │ +├───────────┼──────────┼─────────────┤ +│ jane │ Cough │ 1 │ +│ jane │ Fatigue │ 2 │ +│ jane │ Headache │ 2 │ +├───────────┼──────────┼─────────────┤ +│ jill │ Cough │ 2 │ +│ jill │ Fatigue │ 3 │ +│ jill │ Headache │ 0 │ +├───────────┼──────────┼─────────────┤ +│ john │ Cough │ 1 │ +│ john │ Fatigue │ 2 │ +│ john │ Headache │ 2 │ +╰───────────┴──────────┴─────────────╯ """, stdout, ) @@ -53,12 +53,12 @@ def test_labels_grouped(self): stdout = self.run_cli("labels", path=tmpdir) self.assertEqual( - 
"""╭───────────┬─────────────┬──────────╮ -│ Annotator │ Chart Count │ Label │ -├───────────┼─────────────┼──────────┤ -│ Any │ 0 │ recent │ -│ Any │ 0 │ symptoms │ -╰───────────┴─────────────┴──────────╯ + """╭───────────┬──────────┬─────────────╮ +│ Annotator │ Label │ Chart Count │ +├───────────┼──────────┼─────────────┤ +│ Any │ recent │ 0 │ +│ Any │ symptoms │ 0 │ +╰───────────┴──────────┴─────────────╯ """, stdout, ) @@ -90,23 +90,19 @@ def test_labels_ignored(self): def test_labels_csv(self): """Verify that can print in CSV format""" - stdout = self.run_cli("labels", "--csv", path=f"{self.DATA_DIR}/cold") + stdout = self.run_cli("labels", "--csv", path=f"{self.DATA_DIR}/external") self.assertEqual( [ - "annotator,chart_count,label", - "Any,2,Cough", - "Any,3,Fatigue", - "Any,3,Headache", - "jane,1,Cough", - "jane,2,Fatigue", - "jane,2,Headache", - "jill,2,Cough", - "jill,3,Fatigue", - "jill,0,Headache", - "john,1,Cough", - "john,2,Fatigue", - "john,2,Headache", + "annotator,label,chart_count", + "Any,happy,1", + "Any,sad,1", + "human,happy,1", + "human,sad,1", + "icd10-doc,happy,1", + "icd10-doc,sad,1", + "icd10-enc,happy,1", + "icd10-enc,sad,1", ], stdout.splitlines(), ) diff --git a/tests/test_mentions.py b/tests/test_mentions.py new file mode 100644 index 0000000..930553b --- /dev/null +++ b/tests/test_mentions.py @@ -0,0 +1,162 @@ +"""Tests for commands/mentions.py""" + +import tempfile + +from chart_review import common +from tests import base + + +class TestMentions(base.TestCase): + """Test case for the top-level mentions code""" + + def test_basic_output(self): + stdout = self.run_cli("mentions", path=f"{self.DATA_DIR}/cold") + + self.assertEqual( + """╭───────────┬──────────┬─────────┬──────────╮ +│ Annotator │ Chart ID │ Mention │ Label │ +├───────────┼──────────┼─────────┼──────────┤ +│ jane │ 1 │ achoo │ Cough │ +│ jane │ 1 │ sigh │ Fatigue │ +│ jane │ 1 │ sigh │ Headache │ +│ jane │ 4 │ sleepy │ Fatigue │ +│ jane │ 4 │ pain │ Headache │ 
+├───────────┼──────────┼─────────┼──────────┤ +│ jill │ 1 │ achoo │ Cough │ +│ jill │ 1 │ sigh │ Fatigue │ +│ jill │ 2 │ ouch │ Fatigue │ +│ jill │ 4 │ sleepy │ Fatigue │ +│ jill │ 4 │ pain │ Cough │ +├───────────┼──────────┼─────────┼──────────┤ +│ john │ 1 │ achoo │ Cough │ +│ john │ 1 │ sigh │ Fatigue │ +│ john │ 2 │ ouch │ Headache │ +│ john │ 4 │ sleepy │ Fatigue │ +│ john │ 4 │ pain │ Headache │ +╰───────────┴──────────┴─────────┴──────────╯ +""", + stdout, + ) + + def test_ignored(self): + """Verify that we show info on ignored notes""" + stdout = self.run_cli("mentions", path=f"{self.DATA_DIR}/ignore") + + # Blank mentions are correct - the ignore project doesn't list the source text. + # Good to confirm that we still do something reasonable in this edge case. + self.assertEqual( + """╭───────────┬──────────┬─────────┬───────╮ +│ Annotator │ Chart ID │ Mention │ Label │ +├───────────┼──────────┼─────────┼───────┤ +│ adam │ 1 │ │ A │ +│ adam │ 2 │ │ B │ +├───────────┼──────────┼─────────┼───────┤ +│ allison │ 1 │ │ A │ +│ allison │ 2 │ │ B │ +╰───────────┴──────────┴─────────┴───────╯ + Ignoring 3 charts (3–5) +""", + stdout, + ) + + def test_external(self): + """Verify that we don't show external annotators""" + stdout = self.run_cli("mentions", path=f"{self.DATA_DIR}/external") + + self.assertEqual( + """╭───────────┬──────────┬─────────┬───────╮ +│ Annotator │ Chart ID │ Mention │ Label │ +├───────────┼──────────┼─────────┼───────┤ +│ human │ 1 │ woo │ happy │ +│ human │ 1 │ sigh │ sad │ +╰───────────┴──────────┴─────────┴───────╯ +""", + stdout, + ) + + def test_odd_text(self): + """Verify that unusual text like multi-word unicode doesn't trip us up""" + with tempfile.TemporaryDirectory() as tmpdir: + common.write_json(f"{tmpdir}/config.json", {"annotators": {"chris": 1}}) + common.write_json( + f"{tmpdir}/labelstudio-export.json", + [ + { + "id": 1, + "annotations": [ + { + "completed_by": 1, + "result": [ + {"value": {"text": "Cute Li🦁n", "labels": 
["Cat"]}}, + {"value": {"text": "Multi\nLine-on", "labels": ["Cat"]}}, + ], + }, + ], + }, + ], + ) + stdout = self.run_cli("mentions", path=tmpdir) + + self.assertEqual( + """╭───────────┬──────────┬────────────┬───────╮ +│ Annotator │ Chart ID │ Mention │ Label │ +├───────────┼──────────┼────────────┼───────┤ +│ chris │ 1 │ Cute Li🦁n │ Cat │ +│ chris │ 1 │ Multi │ Cat │ +│ │ │ Line-on │ │ +╰───────────┴──────────┴────────────┴───────╯ +""", + stdout, + ) + + def test_unused_labels(self): + """Verify that we don't list mentions for labels that aren't in consideration""" + with tempfile.TemporaryDirectory() as tmpdir: + common.write_json( + f"{tmpdir}/config.json", + { + "annotators": {"chris": 1}, + "labels": ["Valid"], + }, + ) + common.write_json( + f"{tmpdir}/labelstudio-export.json", + [ + { + "id": 1, + "annotations": [ + { + "completed_by": 1, + "result": [ + {"value": {"text": "good", "labels": ["Valid"]}}, + {"value": {"text": "bad", "labels": ["Invalid"]}}, + ], + }, + ], + }, + ], + ) + stdout = self.run_cli("mentions", path=tmpdir) + + self.assertEqual( + """╭───────────┬──────────┬─────────┬───────╮ +│ Annotator │ Chart ID │ Mention │ Label │ +├───────────┼──────────┼─────────┼───────┤ +│ chris │ 1 │ good │ Valid │ +╰───────────┴──────────┴─────────┴───────╯ +""", + stdout, + ) + + def test_csv(self): + """Verify that can print in CSV format""" + stdout = self.run_cli("mentions", "--csv", path=f"{self.DATA_DIR}/external") + + self.assertEqual( + [ + "annotator,chart_id,mention,label", + "human,1,woo,happy", + "human,1,sigh,sad", + ], + stdout.splitlines(), + )