Skip to content
Permalink

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks. Learn more about diff comparisons here.
base repository: ORNL/icat
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: v0.7.4
Choose a base ref
...
head repository: ORNL/icat
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: main
Choose a head ref
  • 7 commits
  • 11 files changed
  • 1 contributor

Commits on Sep 27, 2024

  1. Copy the full SHA
    b93c661 View commit details

Commits on Jan 2, 2025

  1. Copy the full SHA
    837f8e0 View commit details
  2. Fix error when switching data on already trained model

    New dataset didn't have the predictions column yet which errored when
    trying to serialize and send to anchorviz. Fixed by adding a
    "data_changed" event that the model class listens for and calls `fit()`
    WarmCyan committed Jan 2, 2025
    Copy the full SHA
    f2532d9 View commit details
  3. Copy the full SHA
    1323233 View commit details

Commits on Jan 14, 2025

  1. Copy the full SHA
    922be02 View commit details

Commits on Jan 16, 2025

  1. Clean up status text

    WarmCyan committed Jan 16, 2025
    Copy the full SHA
    faa1a08 View commit details

Commits on Feb 24, 2025

  1. Copy the full SHA
    1a13e8d View commit details
Showing with 1,016 additions and 11 deletions.
  1. +12 −0 README.md
  2. +18 −1 icat/anchorlist.py
  3. +15 −0 icat/data.py
  4. +30 −0 icat/model.py
  5. +44 −1 icat/view.py
  6. +2 −2 notebooks/anchors.ipynb
  7. +1 −0 notebooks/example_indices.json
  8. +20 −5 notebooks/lm_similarity_example.ipynb
  9. +1 −1 notebooks/simple_example.ipynb
  10. +871 −0 notebooks/usage_walkthrough.ipynb
  11. +2 −1 tests/test_browser.py
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -44,6 +44,18 @@ We implemented an ipywidget version of AnchorViz and use it in this project, it

<!-- documentation section -->

## Contributing

Contributions for improving ICAT are welcome! If you run into any problems, find
bugs, or think of useful improvements and enhancements, feel free to open an
[issue](https://github.com/ORNL/icat/issues).

If you add a feature or fix a bug yourself and want it considered for
integration, feel free to open a pull request with the changes. Please provide
a detailed description of what the pull request is doing and briefly list any
significant changes made. If it relates to a specific issue, please include
or link the issue number.

## Citation

To cite usage of ICAT, please use the following bibtex:
19 changes: 18 additions & 1 deletion icat/anchorlist.py
Original file line number Diff line number Diff line change
@@ -298,6 +298,7 @@ def __init__(
self._anchor_removed_callbacks: list[Callable] = []
self._anchor_types_changed_callbacks: list[Callable] = []
self._default_example_anchor_type_changed_callbacks: list[Callable] = []
self._status_event_callbacks: list[Callable] = []

self.add_anchor_types(anchor_types)

@@ -430,7 +431,7 @@ def on_anchor_types_changed(self, callback: Callable):
self._anchor_types_changed_callbacks.append(callback)

def on_default_example_anchor_type_changed(self, callback: Callable):
"""Register a callback function for the "default example anchor changed
"""Register a callback function for the "default example anchor changed"
event.
Callbacks for this event should take the anchor type config dictionary, which
@@ -439,6 +440,20 @@ def on_default_example_anchor_type_changed(self, callback: Callable):
self._default_example_anchor_type_changed_callbacks.append(callback)
pass

def on_status_event(self, callback: Callable):
    """Subscribe ``callback`` to this component's status events.

    A status event is raised whenever something happens that a status label
    should reflect. Each subscriber is invoked with two arguments: the event
    description text, and a string naming the source of the event. A
    description of ``None`` signals that any earlier event from that source
    has finished.
    """
    subscribers = self._status_event_callbacks
    subscribers.append(callback)

def fire_on_status_event(self, event: str):
for callback in self._status_event_callbacks:
callback(event, "anchorlist")

def fire_on_anchor_added(self, anchor: Anchor):
"""Trigger the event to notify that a new anchor was added.
@@ -1089,9 +1104,11 @@ def featurize(
self.table.processing = True
features = []
for anchor in self.anchors:
# self.fire_on_status_event(f"Computing features for {anchor.anchor_name}...")
self.table._set_anchor_processing(anchor.name, True)
data[f"_{anchor.anchor_name}"] = anchor.featurize(data) * anchor.weight
features.append(f"_{anchor.anchor_name}")
# self.fire_on_status_event(None)
self.table._set_anchor_processing(anchor.name, False)
if normalize:
if reference_data is not None:
15 changes: 15 additions & 0 deletions icat/data.py
Original file line number Diff line number Diff line change
@@ -295,6 +295,7 @@ def __init__(
self._data_label_callbacks: list[Callable] = []
self._row_selected_callbacks: list[Callable] = []
self._sample_changed_callbacks: list[Callable] = []
self._data_changed_callbacks: list[Callable] = []

super().__init__(**params) # required for panel components
# Note that no widgets can be declared _after_ the above, or their values won't be
@@ -452,6 +453,14 @@ def on_row_selected(self, callback: Callable):
"""
self._row_selected_callbacks.append(callback)

def on_data_changed(self, callback: Callable):
    """Subscribe ``callback`` to the "data changed" event.

    The event is raised when the active_data dataframe is swapped out for a
    different one. Subscribers are invoked without any arguments.
    """
    subscribers = self._data_changed_callbacks
    subscribers.append(callback)

@param.depends("sample_indices", watch=True)
def fire_on_sample_changed(self):
for callback in self._sample_changed_callbacks:
@@ -465,6 +474,10 @@ def fire_on_row_selected(self, index: int):
for callback in self._row_selected_callbacks:
callback(index)

def fire_on_data_changed(self):
    """Trigger the "data changed" event.

    Every registered listener is called, in registration order, with no
    arguments.
    """
    for notify in self._data_changed_callbacks:
        notify()

# ============================================================
# INTERNAL FUNCTIONS
# ============================================================
@@ -698,6 +711,8 @@ def set_data(self, data: pd.DataFrame):
if self.label_col not in self.active_data:
self.active_data[self.label_col] = -1

self.fire_on_data_changed()

self.set_random_sample()
# TODO: seems weird to handle this here
self._apply_filters()
30 changes: 30 additions & 0 deletions icat/model.py
Original file line number Diff line number Diff line change
@@ -5,6 +5,7 @@

import json
import os
from collections.abc import Callable
from datetime import datetime

import joblib
@@ -52,6 +53,8 @@ def __init__(
},
]

self._status_event_callbacks: list[Callable] = []

self.training_data: pd.DataFrame = None
"""The rows (and only those rows) of the original data explicitly used for training."""
self.text_col = text_col
@@ -80,6 +83,7 @@ def __init__(
self.anchor_list.on_anchor_removed(self._on_anchor_remove)
self.anchor_list.on_anchor_changed(self._on_anchor_change)
self.data.on_data_labeled(self._on_data_label)
self.data.on_data_changed(self._on_data_changed)
self.view.on_selected_points_change(self._on_selected_points_change)

self._last_anchor_names: dict[str, str] = []
@@ -88,6 +92,25 @@ def __init__(

self.anchor_list.build_tfidf_features()

def on_status_event(self, callback: Callable):
    """Add a listener for status events coming from the model.

    Status events are fired whenever something occurs that a status label
    ought to show. A listener receives the event description string followed
    by a string identifying where the event came from. When the description
    is ``None``, any previous event from that source is considered complete.
    """
    listeners = self._status_event_callbacks
    listeners.append(callback)

def fire_on_status_event(self, event: str):
for callback in self._status_event_callbacks:
callback(event, "model")

def _on_data_changed(self):
    """React to the data manager's "data changed" event (fired from set_data).

    Runs a full ``fit()`` so that the newly supplied data is processed by the
    model rather than left as-is.
    """
    # A featurize-only refresh of active_data was tried here previously; a
    # complete fit() is used instead.
    self.fit()

def _on_data_label(self, index: int | list[int], new_label: int | list[int]):
"""Event handler for datamanager.
@@ -182,6 +205,7 @@ def _on_selected_points_change(self, selected_ids: list[str]):
def _train_model(self):
"""Fits the data to the current training dataset, note that this function
assumes the data has already been featurized."""
self.fire_on_status_event(None) # reset/clear status

if not self.is_seeded():
# we short circuit training the model here, but we do still want to show
@@ -200,11 +224,15 @@ def _train_model(self):

if len(self.feature_names(in_model_only=True)) < 1:
return False

self.fire_on_status_event("Training model...")
self.classifier.fit(
self.training_data[self.feature_names(in_model_only=True)],
self.training_data[self.data.label_col],
)
self.fire_on_status_event("Predicting on remaining data...")
self.data.active_data[self.data.prediction_col] = self.predict()
self.fire_on_status_event(None)
coverage_info = self.compute_coverage()
self.anchor_list.set_coverage(coverage_info)

@@ -324,6 +352,7 @@ def fit(self):
# self.norm_reference = self.featurize(self.data.active_data, normalize=False)[
# features
# ].copy()
self.fire_on_status_event("Computing features...")
self.data.active_data = self.featurize(self.data.active_data, normalize=False)
if self.training_data is not None:
self.training_data = self.featurize(
@@ -336,6 +365,7 @@ def fit(self):
# self.data.active_data.loc[:, features] = self.data.active_data[features].apply(
# AnchorList._l1_col_normalize, axis=0
# )
self.fire_on_status_event(None)

self._train_model()

45 changes: 44 additions & 1 deletion icat/view.py
Original file line number Diff line number Diff line change
@@ -6,6 +6,7 @@
from collections.abc import Callable
from typing import Any

import ipyvuetify as v
import ipywidgets as ipw
import pandas as pd
import panel as pn
@@ -43,13 +44,17 @@ def __init__(self, model, **params):

self.histograms = Histograms()

self.status_label = v.Label(children=["Status: hi!"])

self.debug = ipw.Output()

self._selected_points_change_callbacks: list[Callable] = []

self.layout = pn.Row(
pn.Column(self.anchorviz, self.model.anchor_list, self.debug),
pn.Column(self.model.data.widget, self.histograms, width=700),
pn.Column(
self.model.data.widget, self.status_label, self.histograms, width=700
),
height=1150,
)

@@ -73,8 +78,46 @@ def __init__(self, model, **params):
self.model.data.table.on_point_hover(self._set_anchorviz_selected_point)
self.model.data.on_sample_changed(self._handle_data_sample_changed)
self.histograms.on_range_changed(self._histograms_range_changed)

self.model.anchor_list.on_status_event(self._handle_status_event)
self.model.on_status_event(self._handle_status_event)

super().__init__(**params)
self.refresh_data()
self._handle_status_event(None, None) # reset status label

def _handle_status_event(self, status_text: str, source: str):
"""Whenever any component wants to say something in the status line, handle
updating that text element here."""
# NOTE: status_text of None means "done"/ready
if status_text is not None:
self.status_label.children = f"Status: {status_text}"
else:
if self.model.is_seeded():
self.status_label.children = "Status: Model ready"
else:
# TODO: replicating logic from model.is_seeded, better way to handle?
labeled_df = None
if (
self.model.training_data is None
or self.model.data.label_col not in self.model.training_data.columns
):
remaining_labels = 10
else:
labeled_df = self.model.training_data[
self.model.training_data[self.model.data.label_col] != -1
]

if labeled_df is not None:
remaining_labels = 10 - len(labeled_df)
label_str = f"Status: Model isn't seeded yet, label at least {remaining_labels} more points."

if labeled_df is not None:
if len(labeled_df[labeled_df[self.model.data.label_col] == 0]) == 0:
label_str += " Need at least one point labeled 'uninteresting'."
if len(labeled_df[labeled_df[self.model.data.label_col] == 1]) == 0:
label_str += " Need at least one point labeled 'interesting'."
self.status_label.children = label_str

def _handle_data_sample_changed(self, new_sample_indices: list[int]):
"""When the model's data manager sample_indices changes, it fires the
4 changes: 2 additions & 2 deletions notebooks/anchors.ipynb
Original file line number Diff line number Diff line change
@@ -7,7 +7,7 @@
"source": [
"# Anchors Notebook\n",
"\n",
"This notebook walks through how anchors work in ICAT ..."
"This notebook walks through how anchors work in ICAT"
]
},
{
@@ -176,7 +176,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
"version": "3.10.14"
},
"toc-autonumbering": false,
"toc-showmarkdowntxt": false
1 change: 1 addition & 0 deletions notebooks/example_indices.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[9079, 8660, 6275, 636, 532, 1398, 4805, 10457, 266, 10372, 10852, 2354, 5409, 1809, 4716, 3751, 10260, 8642, 8787, 4013, 1245, 2105, 8981, 919, 7499, 787, 10802, 10087, 6495, 9825, 802, 5707, 3138, 6370, 9012, 4483, 3644, 7232, 4060, 1814, 8963, 5939, 2388, 1482, 2174, 1079, 7718, 7280, 6901, 10682, 1703, 5788, 9344, 1872, 9936, 3209, 6917, 6305, 654, 10242, 7442, 9052, 7, 7130, 2829, 987, 8721, 2624, 7139, 9570, 626, 7223, 3005, 8008, 2533, 9276, 9161, 3701, 9866, 10100, 3858, 169, 1108, 1032, 1100, 9080, 3487, 238, 4103, 10479, 5073, 8830, 1162, 966, 1777, 2498, 7313, 6802, 4153, 6507, 7144, 2579, 5429, 7548, 415, 1031, 6530, 4331, 3408, 2549, 1502, 5934, 3245, 9790, 2063, 1263, 11227, 10142, 5006, 8213, 8398, 9409, 10524, 6414, 6767, 9791, 3985, 1554, 4524, 9122, 909, 8735, 4298, 10603, 3584, 3830, 6156, 10198, 6006, 9631, 186, 2096, 5225, 5197, 1446, 1006, 5822, 9736, 5204, 7268, 10360, 10279, 5611, 8923, 3081, 11094, 10011, 8050, 7357, 6096, 6921, 11169, 5843, 4149, 7387, 7516, 4376, 10433, 1864, 8482, 7435, 6514, 4330, 2536, 7181, 2152, 2370, 6339, 6729, 3376, 8834, 11151, 847, 9258, 6036, 6081, 3478, 9243, 9022, 2646, 11001, 7049, 788, 6060, 8489, 4288, 1697, 149, 773, 5280, 7588, 8910, 9213, 5583, 6817, 5735, 7936, 10197, 505, 8064, 7208, 8638, 10842, 6084, 11216, 7402, 10184, 6777, 1753, 6684, 10722, 3188, 7020, 8345, 1715, 8171, 4617, 4110, 3943, 5497, 3760, 7044, 10519, 3289, 9189, 94, 483, 539, 10796, 10580, 9334, 1085, 11113, 2971, 4709, 10310, 7291, 6208, 6806, 3691, 1355, 6334, 3772, 4681, 6233, 3406, 798, 1159, 2873, 5535, 5569, 446, 6212, 2843, 8475, 8899, 6108, 4395, 11099, 1418, 1233, 409, 346, 2244, 3724, 5182, 11166, 8674, 9583, 5168, 4196, 6366, 5684, 1173, 6092, 2160, 1831, 5753, 2825, 3611, 6178, 9829, 7479, 10483, 10769, 10716, 9617, 10056, 1377, 11251, 11306, 5495, 5973, 8120, 3454, 3592, 10847, 8693, 9210, 2289, 7669, 4443, 308, 3433, 4576, 157, 10058, 10895, 78, 968, 10853, 3486, 3248, 10323, 8197, 6225, 3820, 1416, 1964, 10470, 2591, 
2562, 6331, 1381, 3093, 5928, 4606, 8929, 8175, 2518, 6437, 8234, 3747, 9719, 4881, 7017, 1305, 9127, 6016, 5172, 1161, 5317, 6341, 10387, 694, 8763, 9055, 3685, 7179, 4935, 5464, 1952, 9193, 2452, 4015, 460, 9689, 10870, 5046, 1340, 5118, 1441, 4080, 4463, 2013, 9496, 6273, 5780, 10250, 7847, 9558, 106, 6484, 10587, 4538, 5394, 7808, 6853, 4627, 8248, 2099, 6047, 2424, 7294, 10024, 5144, 10695, 8478, 1706, 4241, 1333, 6987, 6409, 7765, 8775, 6584, 3431, 6192, 7301, 48, 9333, 867, 6392, 9927, 3578, 6659, 2673, 10859, 4619, 5969, 2118, 7079, 6326, 5412, 4687, 6670, 10495, 2246, 10446, 5024, 4097, 4895, 6071, 1289, 4817, 1642, 9995, 9133, 8798, 9605, 7491, 10599, 2577, 6007, 2697, 7361, 6711, 947, 2818, 1244, 4665, 3512, 5997, 8357, 9891, 10711, 7526, 10908, 2342, 7109, 2404, 1496, 1246, 1156, 1058, 4953, 5294, 1461, 2420, 2687, 4253, 2959, 10419, 1394, 4201, 3572, 245, 8972, 3536, 3555, 1754, 4688, 10658, 9567, 5007, 6127, 3954, 2367, 10475, 7853, 2071, 2844, 396, 3479, 6733, 3657, 1888, 5398, 1165, 3865]
25 changes: 20 additions & 5 deletions notebooks/lm_similarity_example.ipynb
Original file line number Diff line number Diff line change
@@ -7,7 +7,22 @@
"source": [
"# Language Model Similarity Example\n",
"\n",
"This notebook shows how to provide a language model to a similarity anchor, allowing the utilization of knowledge inside embedding spaces as part of the ICAT model."
"This notebook shows how to provide a language model to a similarity anchor, allowing the utilization of knowledge inside embedding spaces as part of the ICAT model.\n",
"\n",
"You will need to install the Hugging Face transformers and PyTorch libraries for this notebook to run. To install them, use:\n",
"```\n",
"pip install transformers torch\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bb6a33c6-e0f2-414f-9356-97f6fb47e2b9",
"metadata": {},
"outputs": [],
"source": [
"import torch"
]
},
{
@@ -21,7 +36,7 @@
"source": [
"# change these constants as needed based on your hardware constraints\n",
"BATCH_SIZE = 16\n",
"DEVICE = \"cuda\"\n",
"DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"MODEL_NAME = \"bert-base-uncased\""
]
},
@@ -175,7 +190,7 @@
"\n",
"dataset = fetch_20newsgroups(subset=\"train\")\n",
"df = pd.DataFrame({\"text\": dataset[\"data\"], \"category\": [dataset[\"target_names\"][i] for i in dataset[\"target\"]]})\n",
"#df = df.iloc[0:1999]\n",
"df = df.iloc[0:1999] # NOTE: subsets the data to avoid long processing times on first BERT anchor creation when running on CPU or a weaker GPU; comment this line out to use the full dataset.\n",
"df.head()"
]
},
@@ -196,7 +211,7 @@
},
"outputs": [],
"source": [
"icat.initialize(offline=True)"
"icat.initialize(offline=False)"
]
},
{
@@ -279,7 +294,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
"version": "3.10.15"
}
},
"nbformat": 4,
2 changes: 1 addition & 1 deletion notebooks/simple_example.ipynb
Original file line number Diff line number Diff line change
@@ -126,7 +126,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
"version": "3.10.14"
}
},
"nbformat": 4,
Loading