ORNL · Sep 27, 2024 · Jan 2, 2025 · Jan 2, 2025 · Jan 2, 2025 · Jan 14, 2025
diff --git a/README.md b/README.md
@@ -44,6 +44,18 @@ We implemented an ipywidget version of AnchorViz and use it in this project, it
 
 <!-- documentation section -->
 
+## Contributing
+
+Contributions for improving ICAT are welcome! If you run into any problems, find
+bugs, or think of useful improvements and enhancements, feel free to open an
+[issue](https://github.com/ORNL/icat/issues).
+
+If you add a feature or fix a bug yourself and want it considered for
+integration, feel free to open a pull request with the changes. Please provide
+a detailed description of what the pull request is doing and briefly list any
+significant changes made. If it relates to a specific issue, please include
+or link the issue number.
+
 ## Citation
 
 To cite usage of ICAT, please use the following bibtex:

diff --git a/icat/anchorlist.py b/icat/anchorlist.py
@@ -298,6 +298,7 @@ def __init__(
         self._anchor_removed_callbacks: list[Callable] = []
         self._anchor_types_changed_callbacks: list[Callable] = []
         self._default_example_anchor_type_changed_callbacks: list[Callable] = []
+        self._status_event_callbacks: list[Callable] = []
 
         self.add_anchor_types(anchor_types)
 
@@ -430,7 +431,7 @@ def on_anchor_types_changed(self, callback: Callable):
         self._anchor_types_changed_callbacks.append(callback)
 
     def on_default_example_anchor_type_changed(self, callback: Callable):
-        """Register a callback function for the "default example anchor changed
+        """Register a callback function for the "default example anchor changed"
         event.
 
         Callbacks for this event should take the anchor type config dictionary, which
@@ -439,6 +440,20 @@ def on_default_example_anchor_type_changed(self, callback: Callable):
         self._default_example_anchor_type_changed_callbacks.append(callback)
         pass
 
+    def on_status_event(self, callback: Callable):
+        """Register a callback function for the "status" event, fired whenever
+        something occurs that should update a status label.
+
+        Callbacks for this event should take two parameters: the text event
+        description string, and a string with the source of the event
+        (``fire_on_status_event`` on this class passes ``"anchorlist"``).
+        If None is passed as the description, this means any prior event from this
+        source is complete.
+        """
+        self._status_event_callbacks.append(callback)
+
+    def fire_on_status_event(self, event: str):
+        """Trigger the "status" event, notifying every registered callback.
+
+        Each callback is called with ``event`` (the description text) and the
+        source string "anchorlist". Pass None as ``event`` to signal that any
+        prior event from this source is complete.
+        """
+        for listener in self._status_event_callbacks:
+            listener(event, "anchorlist")
+
     def fire_on_anchor_added(self, anchor: Anchor):
         """Trigger the event to notify that a new anchor was added.
 
@@ -1089,9 +1104,11 @@ def featurize(
         self.table.processing = True
         features = []
         for anchor in self.anchors:
+            # self.fire_on_status_event(f"Computing features for {anchor.anchor_name}...")
             self.table._set_anchor_processing(anchor.name, True)
             data[f"_{anchor.anchor_name}"] = anchor.featurize(data) * anchor.weight
             features.append(f"_{anchor.anchor_name}")
+            # self.fire_on_status_event(None)
             self.table._set_anchor_processing(anchor.name, False)
         if normalize:
             if reference_data is not None:

diff --git a/icat/data.py b/icat/data.py
@@ -295,6 +295,7 @@ def __init__(
         self._data_label_callbacks: list[Callable] = []
         self._row_selected_callbacks: list[Callable] = []
         self._sample_changed_callbacks: list[Callable] = []
+        self._data_changed_callbacks: list[Callable] = []
 
         super().__init__(**params)  # required for panel components
         # Note that no widgets can be declared _after_ the above, or their values won't be
@@ -452,6 +453,14 @@ def on_row_selected(self, callback: Callable):
         """
         self._row_selected_callbacks.append(callback)
 
+    def on_data_changed(self, callback: Callable):
+        """Register a callback function for the "data changed" event, fired from
+        ``set_data`` when the active_data dataframe is switched out.
+
+        Callbacks for this event should take no parameters.
+        """
+        self._data_changed_callbacks.append(callback)
+
     @param.depends("sample_indices", watch=True)
     def fire_on_sample_changed(self):
         for callback in self._sample_changed_callbacks:
@@ -465,6 +474,10 @@ def fire_on_row_selected(self, index: int):
         for callback in self._row_selected_callbacks:
             callback(index)
 
+    def fire_on_data_changed(self):
+        """Trigger the "data changed" event, calling every registered callback
+        with no arguments."""
+        for listener in self._data_changed_callbacks:
+            listener()
+
     # ============================================================
     # INTERNAL FUNCTIONS
     # ============================================================
@@ -698,6 +711,8 @@ def set_data(self, data: pd.DataFrame):
         if self.label_col not in self.active_data:
             self.active_data[self.label_col] = -1
 
+        self.fire_on_data_changed()
+
         self.set_random_sample()
         # TODO: seems weird to handle this here
         self._apply_filters()

diff --git a/icat/model.py b/icat/model.py
@@ -5,6 +5,7 @@
 
 import json
 import os
+from collections.abc import Callable
 from datetime import datetime
 
 import joblib
@@ -52,6 +53,8 @@ def __init__(
                 },
             ]
 
+        self._status_event_callbacks: list[Callable] = []
+
         self.training_data: pd.DataFrame = None
         """The rows (and only those rows) of the original data explicitly used for training."""
         self.text_col = text_col
@@ -80,6 +83,7 @@ def __init__(
         self.anchor_list.on_anchor_removed(self._on_anchor_remove)
         self.anchor_list.on_anchor_changed(self._on_anchor_change)
         self.data.on_data_labeled(self._on_data_label)
+        self.data.on_data_changed(self._on_data_changed)
         self.view.on_selected_points_change(self._on_selected_points_change)
 
         self._last_anchor_names: dict[str, str] = []
@@ -88,6 +92,25 @@ def __init__(
 
         self.anchor_list.build_tfidf_features()
 
+    def on_status_event(self, callback: Callable):
+        """Register a callback function for the "status" event, fired whenever
+        something occurs that should update a status label.
+
+        Callbacks for this event should take two parameters: the text event
+        description string, and a string with the source of the event
+        (``fire_on_status_event`` on this class passes ``"model"``).
+        If None is passed as the description, this means any prior event from this
+        source is complete.
+        """
+        self._status_event_callbacks.append(callback)
+
+    def fire_on_status_event(self, event: str):
+        """Trigger the "status" event, notifying every registered callback.
+
+        Each callback is called with ``event`` (the description text) and the
+        source string "model". Pass None as ``event`` to signal that any prior
+        event from this source is complete.
+        """
+        for listener in self._status_event_callbacks:
+            listener(event, "model")
+
+    def _on_data_changed(self):
+        """Event handler for the data manager's "data changed" event, i.e. for when
+        set_data in datamanager is called.
+
+        Re-runs ``fit()`` so the model reflects the newly set data.
+        """
+        # self.data.active_data = self.featurize(self.data.active_data, normalize=False)
+        self.fit()
+
     def _on_data_label(self, index: int | list[int], new_label: int | list[int]):
         """Event handler for datamanager.
 
@@ -182,6 +205,7 @@ def _on_selected_points_change(self, selected_ids: list[str]):
     def _train_model(self):
         """Fits the data to the current training dataset, note that this function
         assumes the data has already been featurized."""
+        self.fire_on_status_event(None)  # reset/clear status
 
         if not self.is_seeded():
             # we short circuit training the model here, but we do still want to show
@@ -200,11 +224,15 @@ def _train_model(self):
 
         if len(self.feature_names(in_model_only=True)) < 1:
             return False
+
+        self.fire_on_status_event("Training model...")
         self.classifier.fit(
             self.training_data[self.feature_names(in_model_only=True)],
             self.training_data[self.data.label_col],
         )
+        self.fire_on_status_event("Predicting on remaining data...")
         self.data.active_data[self.data.prediction_col] = self.predict()
+        self.fire_on_status_event(None)
         coverage_info = self.compute_coverage()
         self.anchor_list.set_coverage(coverage_info)
 
@@ -324,6 +352,7 @@ def fit(self):
         # self.norm_reference = self.featurize(self.data.active_data, normalize=False)[
         #     features
         # ].copy()
+        self.fire_on_status_event("Computing features...")
         self.data.active_data = self.featurize(self.data.active_data, normalize=False)
         if self.training_data is not None:
             self.training_data = self.featurize(
@@ -336,6 +365,7 @@ def fit(self):
         # self.data.active_data.loc[:, features] = self.data.active_data[features].apply(
         #     AnchorList._l1_col_normalize, axis=0
         # )
+        self.fire_on_status_event(None)
 
         self._train_model()
 

diff --git a/icat/view.py b/icat/view.py
@@ -6,6 +6,7 @@
 from collections.abc import Callable
 from typing import Any
 
+import ipyvuetify as v
 import ipywidgets as ipw
 import pandas as pd
 import panel as pn
@@ -43,13 +44,17 @@ def __init__(self, model, **params):
 
         self.histograms = Histograms()
 
+        self.status_label = v.Label(children=["Status: hi!"])
+
         self.debug = ipw.Output()
 
         self._selected_points_change_callbacks: list[Callable] = []
 
         self.layout = pn.Row(
             pn.Column(self.anchorviz, self.model.anchor_list, self.debug),
-            pn.Column(self.model.data.widget, self.histograms, width=700),
+            pn.Column(
+                self.model.data.widget, self.status_label, self.histograms, width=700
+            ),
             height=1150,
         )
 
@@ -73,8 +78,46 @@ def __init__(self, model, **params):
         self.model.data.table.on_point_hover(self._set_anchorviz_selected_point)
         self.model.data.on_sample_changed(self._handle_data_sample_changed)
         self.histograms.on_range_changed(self._histograms_range_changed)
+
+        self.model.anchor_list.on_status_event(self._handle_status_event)
+        self.model.on_status_event(self._handle_status_event)
+
         super().__init__(**params)
         self.refresh_data()
+        self._handle_status_event(None, None)  # reset status label
+
+    def _handle_status_event(self, status_text: str, source: str):
+        """Whenever any component wants to say something in the status line, handle
+        updating that text element here.
+
+        ``status_text`` is the message shown after the "Status: " prefix. A
+        ``status_text`` of None means "done"/ready, in which case the label falls
+        back to describing whether the model is seeded and, if it is not, what is
+        still missing.
+
+        ``source`` is the name of the component that fired the event. It is
+        accepted so this method matches the status callback signature, but it is
+        not currently used.
+        """
+        if status_text is not None:
+            # children is assigned as a list, matching how status_label is built
+            self.status_label.children = [f"Status: {status_text}"]
+            return
+
+        if self.model.is_seeded():
+            self.status_label.children = ["Status: Model ready"]
+            return
+
+        # TODO: replicating logic from model.is_seeded, better way to handle?
+        label_col = self.model.data.label_col
+        training_data = self.model.training_data
+        labeled_df = None
+        if training_data is not None and label_col in training_data.columns:
+            # -1 marks rows that have not been labeled
+            labeled_df = training_data[training_data[label_col] != -1]
+
+        remaining_labels = 10 if labeled_df is None else 10 - len(labeled_df)
+        label_str = "Status: Model isn't seeded yet"
+        if remaining_labels > 0:
+            label_str += f", label at least {remaining_labels} more points."
+        else:
+            # Enough points are labeled but the model still isn't seeded
+            # (presumably a class is missing), so don't ask for a zero or
+            # negative number of additional labels.
+            label_str += "."
+
+        if labeled_df is not None:
+            if not (labeled_df[label_col] == 0).any():
+                label_str += " Need at least one point labeled 'uninteresting'."
+            if not (labeled_df[label_col] == 1).any():
+                label_str += " Need at least one point labeled 'interesting'."
+        self.status_label.children = [label_str]
 
     def _handle_data_sample_changed(self, new_sample_indices: list[int]):
         """When the model's data manager sample_indices changes, it fires the

diff --git a/notebooks/anchors.ipynb b/notebooks/anchors.ipynb
@@ -7,7 +7,7 @@
    "source": [
     "# Anchors Notebook\n",
     "\n",
-    "This notebook walks through how anchors work in ICAT ..."
+    "This notebook walks through how anchors work in ICAT."
    ]
   },
   {
@@ -176,7 +176,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.10.14"
   },
   "toc-autonumbering": false,
   "toc-showmarkdowntxt": false

diff --git a/notebooks/example_indices.json b/notebooks/example_indices.json
@@ -0,0 +1 @@
+[9079, 8660, 6275, 636, 532, 1398, 4805, 10457, 266, 10372, 10852, 2354, 5409, 1809, 4716, 3751, 10260, 8642, 8787, 4013, 1245, 2105, 8981, 919, 7499, 787, 10802, 10087, 6495, 9825, 802, 5707, 3138, 6370, 9012, 4483, 3644, 7232, 4060, 1814, 8963, 5939, 2388, 1482, 2174, 1079, 7718, 7280, 6901, 10682, 1703, 5788, 9344, 1872, 9936, 3209, 6917, 6305, 654, 10242, 7442, 9052, 7, 7130, 2829, 987, 8721, 2624, 7139, 9570, 626, 7223, 3005, 8008, 2533, 9276, 9161, 3701, 9866, 10100, 3858, 169, 1108, 1032, 1100, 9080, 3487, 238, 4103, 10479, 5073, 8830, 1162, 966, 1777, 2498, 7313, 6802, 4153, 6507, 7144, 2579, 5429, 7548, 415, 1031, 6530, 4331, 3408, 2549, 1502, 5934, 3245, 9790, 2063, 1263, 11227, 10142, 5006, 8213, 8398, 9409, 10524, 6414, 6767, 9791, 3985, 1554, 4524, 9122, 909, 8735, 4298, 10603, 3584, 3830, 6156, 10198, 6006, 9631, 186, 2096, 5225, 5197, 1446, 1006, 5822, 9736, 5204, 7268, 10360, 10279, 5611, 8923, 3081, 11094, 10011, 8050, 7357, 6096, 6921, 11169, 5843, 4149, 7387, 7516, 4376, 10433, 1864, 8482, 7435, 6514, 4330, 2536, 7181, 2152, 2370, 6339, 6729, 3376, 8834, 11151, 847, 9258, 6036, 6081, 3478, 9243, 9022, 2646, 11001, 7049, 788, 6060, 8489, 4288, 1697, 149, 773, 5280, 7588, 8910, 9213, 5583, 6817, 5735, 7936, 10197, 505, 8064, 7208, 8638, 10842, 6084, 11216, 7402, 10184, 6777, 1753, 6684, 10722, 3188, 7020, 8345, 1715, 8171, 4617, 4110, 3943, 5497, 3760, 7044, 10519, 3289, 9189, 94, 483, 539, 10796, 10580, 9334, 1085, 11113, 2971, 4709, 10310, 7291, 6208, 6806, 3691, 1355, 6334, 3772, 4681, 6233, 3406, 798, 1159, 2873, 5535, 5569, 446, 6212, 2843, 8475, 8899, 6108, 4395, 11099, 1418, 1233, 409, 346, 2244, 3724, 5182, 11166, 8674, 9583, 5168, 4196, 6366, 5684, 1173, 6092, 2160, 1831, 5753, 2825, 3611, 6178, 9829, 7479, 10483, 10769, 10716, 9617, 10056, 1377, 11251, 11306, 5495, 5973, 8120, 3454, 3592, 10847, 8693, 9210, 2289, 7669, 4443, 308, 3433, 4576, 157, 10058, 10895, 78, 968, 10853, 3486, 3248, 10323, 8197, 6225, 3820, 1416, 1964, 10470, 2591, 
2562, 6331, 1381, 3093, 5928, 4606, 8929, 8175, 2518, 6437, 8234, 3747, 9719, 4881, 7017, 1305, 9127, 6016, 5172, 1161, 5317, 6341, 10387, 694, 8763, 9055, 3685, 7179, 4935, 5464, 1952, 9193, 2452, 4015, 460, 9689, 10870, 5046, 1340, 5118, 1441, 4080, 4463, 2013, 9496, 6273, 5780, 10250, 7847, 9558, 106, 6484, 10587, 4538, 5394, 7808, 6853, 4627, 8248, 2099, 6047, 2424, 7294, 10024, 5144, 10695, 8478, 1706, 4241, 1333, 6987, 6409, 7765, 8775, 6584, 3431, 6192, 7301, 48, 9333, 867, 6392, 9927, 3578, 6659, 2673, 10859, 4619, 5969, 2118, 7079, 6326, 5412, 4687, 6670, 10495, 2246, 10446, 5024, 4097, 4895, 6071, 1289, 4817, 1642, 9995, 9133, 8798, 9605, 7491, 10599, 2577, 6007, 2697, 7361, 6711, 947, 2818, 1244, 4665, 3512, 5997, 8357, 9891, 10711, 7526, 10908, 2342, 7109, 2404, 1496, 1246, 1156, 1058, 4953, 5294, 1461, 2420, 2687, 4253, 2959, 10419, 1394, 4201, 3572, 245, 8972, 3536, 3555, 1754, 4688, 10658, 9567, 5007, 6127, 3954, 2367, 10475, 7853, 2071, 2844, 396, 3479, 6733, 3657, 1888, 5398, 1165, 3865]
diff --git a/notebooks/lm_similarity_example.ipynb b/notebooks/lm_similarity_example.ipynb
@@ -7,7 +7,22 @@
    "source": [
     "# Language Model Similarity Example\n",
     "\n",
-    "This notebook shows how to provide a language model to a similarity anchor, allowing the utilization of knowledge inside embedding spaces as part of the ICAT model."
+    "This notebook shows how to provide a language model to a similarity anchor, allowing the utilization of knowledge inside embedding spaces as part of the ICAT model.\n",
+    "\n",
+    "You will need to install the Hugging Face transformers and PyTorch libraries for this notebook to run. Please use:\n",
+    "```\n",
+    "pip install transformers torch\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bb6a33c6-e0f2-414f-9356-97f6fb47e2b9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch"
    ]
   },
   {
@@ -21,7 +36,7 @@
    "source": [
     "# change these constants as needed based on your hardware constraints\n",
     "BATCH_SIZE = 16\n",
-    "DEVICE = \"cuda\"\n",
+    "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
     "MODEL_NAME = \"bert-base-uncased\""
    ]
   },
@@ -175,7 +190,7 @@
     "\n",
     "dataset = fetch_20newsgroups(subset=\"train\")\n",
     "df = pd.DataFrame({\"text\": dataset[\"data\"], \"category\": [dataset[\"target_names\"][i] for i in dataset[\"target\"]]})\n",
-    "#df = df.iloc[0:1999]\n",
+    "df = df.iloc[0:1999]  # NOTE: the data is limited to a subset to avoid long processing times on first BERT anchor creation when running on CPU or a weaker GPU; comment this line out to use the full dataset.\n",
     "df.head()"
    ]
   },
@@ -196,7 +211,7 @@
    },
    "outputs": [],
    "source": [
-    "icat.initialize(offline=True)"
+    "icat.initialize(offline=False)"
    ]
   },
   {
@@ -279,7 +294,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.10.15"
   }
  },
  "nbformat": 4,

diff --git a/notebooks/simple_example.ipynb b/notebooks/simple_example.ipynb
@@ -126,7 +126,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.10.14"
   }
  },
  "nbformat": 4,
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		[9079, 8660, 6275, 636, 532, 1398, 4805, 10457, 266, 10372, 10852, 2354, 5409, 1809, 4716, 3751, 10260, 8642, 8787, 4013, 1245, 2105, 8981, 919, 7499, 787, 10802, 10087, 6495, 9825, 802, 5707, 3138, 6370, 9012, 4483, 3644, 7232, 4060, 1814, 8963, 5939, 2388, 1482, 2174, 1079, 7718, 7280, 6901, 10682, 1703, 5788, 9344, 1872, 9936, 3209, 6917, 6305, 654, 10242, 7442, 9052, 7, 7130, 2829, 987, 8721, 2624, 7139, 9570, 626, 7223, 3005, 8008, 2533, 9276, 9161, 3701, 9866, 10100, 3858, 169, 1108, 1032, 1100, 9080, 3487, 238, 4103, 10479, 5073, 8830, 1162, 966, 1777, 2498, 7313, 6802, 4153, 6507, 7144, 2579, 5429, 7548, 415, 1031, 6530, 4331, 3408, 2549, 1502, 5934, 3245, 9790, 2063, 1263, 11227, 10142, 5006, 8213, 8398, 9409, 10524, 6414, 6767, 9791, 3985, 1554, 4524, 9122, 909, 8735, 4298, 10603, 3584, 3830, 6156, 10198, 6006, 9631, 186, 2096, 5225, 5197, 1446, 1006, 5822, 9736, 5204, 7268, 10360, 10279, 5611, 8923, 3081, 11094, 10011, 8050, 7357, 6096, 6921, 11169, 5843, 4149, 7387, 7516, 4376, 10433, 1864, 8482, 7435, 6514, 4330, 2536, 7181, 2152, 2370, 6339, 6729, 3376, 8834, 11151, 847, 9258, 6036, 6081, 3478, 9243, 9022, 2646, 11001, 7049, 788, 6060, 8489, 4288, 1697, 149, 773, 5280, 7588, 8910, 9213, 5583, 6817, 5735, 7936, 10197, 505, 8064, 7208, 8638, 10842, 6084, 11216, 7402, 10184, 6777, 1753, 6684, 10722, 3188, 7020, 8345, 1715, 8171, 4617, 4110, 3943, 5497, 3760, 7044, 10519, 3289, 9189, 94, 483, 539, 10796, 10580, 9334, 1085, 11113, 2971, 4709, 10310, 7291, 6208, 6806, 3691, 1355, 6334, 3772, 4681, 6233, 3406, 798, 1159, 2873, 5535, 5569, 446, 6212, 2843, 8475, 8899, 6108, 4395, 11099, 1418, 1233, 409, 346, 2244, 3724, 5182, 11166, 8674, 9583, 5168, 4196, 6366, 5684, 1173, 6092, 2160, 1831, 5753, 2825, 3611, 6178, 9829, 7479, 10483, 10769, 10716, 9617, 10056, 1377, 11251, 11306, 5495, 5973, 8120, 3454, 3592, 10847, 8693, 9210, 2289, 7669, 4443, 308, 3433, 4576, 157, 10058, 10895, 78, 968, 10853, 3486, 3248, 10323, 8197, 6225, 3820, 1416, 1964, 10470, 2591, 
2562, 6331, 1381, 3093, 5928, 4606, 8929, 8175, 2518, 6437, 8234, 3747, 9719, 4881, 7017, 1305, 9127, 6016, 5172, 1161, 5317, 6341, 10387, 694, 8763, 9055, 3685, 7179, 4935, 5464, 1952, 9193, 2452, 4015, 460, 9689, 10870, 5046, 1340, 5118, 1441, 4080, 4463, 2013, 9496, 6273, 5780, 10250, 7847, 9558, 106, 6484, 10587, 4538, 5394, 7808, 6853, 4627, 8248, 2099, 6047, 2424, 7294, 10024, 5144, 10695, 8478, 1706, 4241, 1333, 6987, 6409, 7765, 8775, 6584, 3431, 6192, 7301, 48, 9333, 867, 6392, 9927, 3578, 6659, 2673, 10859, 4619, 5969, 2118, 7079, 6326, 5412, 4687, 6670, 10495, 2246, 10446, 5024, 4097, 4895, 6071, 1289, 4817, 1642, 9995, 9133, 8798, 9605, 7491, 10599, 2577, 6007, 2697, 7361, 6711, 947, 2818, 1244, 4665, 3512, 5997, 8357, 9891, 10711, 7526, 10908, 2342, 7109, 2404, 1496, 1246, 1156, 1058, 4953, 5294, 1461, 2420, 2687, 4253, 2959, 10419, 1394, 4201, 3572, 245, 8972, 3536, 3555, 1754, 4688, 10658, 9567, 5007, 6127, 3954, 2367, 10475, 7853, 2071, 2844, 396, 3479, 6733, 3657, 1888, 5398, 1165, 3865]