diff --git a/composeml/conftest.py b/composeml/conftest.py index 7339d36c..d4d72014 100644 --- a/composeml/conftest.py +++ b/composeml/conftest.py @@ -1,7 +1,30 @@ import pandas as pd import pytest -from .label_times import LabelTimes +from composeml import LabelTimes +from composeml.tests.utils import read_csv + + +@pytest.fixture(scope="module") +def total_spent(): + data = [ + 'id,customer_id,cutoff_time,total_spent', + '0,0,2019-01-01 08:00:00,9', + '1,0,2019-01-01 08:30:00,8', + '2,1,2019-01-01 09:00:00,7', + '3,1,2019-01-01 09:30:00,6', + '4,1,2019-01-01 10:00:00,5', + '5,2,2019-01-01 10:30:00,4', + '6,2,2019-01-01 11:00:00,3', + '7,2,2019-01-01 11:30:00,2', + '8,2,2019-01-01 12:00:00,1', + '9,3,2019-01-01 12:30:00,0', + ] + + data = read_csv(data, index_col='id', parse_dates=['cutoff_time']) + lt = LabelTimes(data=data, name='total_spent') + lt.settings.update({'num_examples_per_instance': -1}) + return lt @pytest.fixture(scope="module") diff --git a/composeml/label_maker.py b/composeml/label_maker.py index ca8e6e27..a6a1e5cc 100644 --- a/composeml/label_maker.py +++ b/composeml/label_maker.py @@ -97,7 +97,7 @@ def to_offset(value): class LabelMaker: """Automatically makes labels for prediction problems.""" - def __init__(self, target_entity, time_index, labeling_function, window_size=None): + def __init__(self, target_entity, time_index, labeling_function, window_size=None, label_type=None): """Creates an instance of label maker. Args: @@ -255,6 +255,7 @@ def search(self, minimum_data=None, gap=None, drop_empty=True, + label_type=None, verbose=True, *args, **kwargs): @@ -267,6 +268,7 @@ def search(self, gap (str or int) : Time between examples. Default value is window size. If an integer, search will start on the first event after the minimum data. drop_empty (bool) : Whether to drop empty slices. Default value is True. + label_type (str) : The label type can be "continuous" or "discrete". Default value is the inferred label type.
verbose (bool) : Whether to render progress bar. Default value is True. *args : Positional arguments for labeling function. **kwargs : Keyword arguments for labeling function. @@ -325,16 +327,19 @@ def search(self, progress_bar.update(n=total) progress_bar.close() - labels = LabelTimes(data=labels, name=name, target_entity=self.target_entity) + labels = LabelTimes(data=labels, name=name, target_entity=self.target_entity, label_type=label_type) labels = labels.rename_axis('id', axis=0) - labels = labels._with_plots() if labels.empty: return labels + if labels.is_discrete: + labels[labels.name] = labels[labels.name].astype('category') + labels.settings.update({ + 'labeling_function': name, 'num_examples_per_instance': num_examples_per_instance, - 'minimum_data': minimum_data, + 'minimum_data': str(minimum_data), 'window_size': self.window_size, 'gap': gap, }) diff --git a/composeml/label_plots.py b/composeml/label_plots.py new file mode 100644 index 00000000..efd523b9 --- /dev/null +++ b/composeml/label_plots.py @@ -0,0 +1,88 @@ +import matplotlib as mpl +import pandas as pd +import seaborn as sns + +pd.plotting.register_matplotlib_converters() +sns.set_context('notebook') +sns.set_style('darkgrid') +COLOR = sns.color_palette("Set1", n_colors=100, desat=.75) + + +class LabelPlots: + """Creates plots for Label Times.""" + + def __init__(self, label_times): + """Initializes Label Plots. 
+ + Args: + label_times (LabelTimes) : instance of Label Times + """ + self._label_times = label_times + + def count_by_time(self, ax=None, **kwargs): + """Plots the label distribution across cutoff times.""" + count_by_time = self._label_times.count_by_time + count_by_time.sort_index(inplace=True) + + ax = ax or mpl.pyplot.axes() + vmin = count_by_time.index.min() + vmax = count_by_time.index.max() + ax.set_xlim(vmin, vmax) + + locator = mpl.dates.AutoDateLocator() + formatter = mpl.dates.AutoDateFormatter(locator) + ax.xaxis.set_major_locator(locator) + ax.xaxis.set_major_formatter(formatter) + ax.figure.autofmt_xdate() + + if len(count_by_time.shape) > 1: + ax.stackplot( + count_by_time.index, + count_by_time.values.T, + labels=count_by_time.columns, + colors=COLOR, + alpha=.9, + **kwargs, + ) + + ax.legend( + loc='upper left', + title=self._label_times.name, + facecolor='w', + framealpha=.9, + ) + + ax.set_title('Label Count vs. Cutoff Times') + ax.set_ylabel('Count') + ax.set_xlabel('Time') + + else: + ax.fill_between( + count_by_time.index, + count_by_time.values.T, + color=COLOR[1], + ) + + ax.set_title('Label vs. 
Cutoff Times') + ax.set_ylabel(self._label_times.name) + ax.set_xlabel('Time') + + return ax + + @property + def dist(self): + """Alias for distribution.""" + return self.distribution + + def distribution(self, **kwargs): + """Plots the label distribution.""" + dist = self._label_times[self._label_times.name] + + if self._label_times.is_discrete: + ax = sns.countplot(dist, palette=COLOR, **kwargs) + else: + ax = sns.distplot(dist, kde=True, color=COLOR[1], **kwargs) + + ax.set_title('Label Distribution') + ax.set_ylabel('Count') + return ax diff --git a/composeml/label_times.py b/composeml/label_times.py index cdbf4267..96243fca 100644 --- a/composeml/label_times.py +++ b/composeml/label_times.py @@ -1,73 +1,87 @@ import pandas as pd +from composeml.label_plots import LabelPlots + class LabelTimes(pd.DataFrame): - """ - A data frame containing labels made by a label maker. + """A data frame containing labels made by a label maker. Attributes: name target_entity transforms """ - _metadata = ['name', 'target_entity', 'settings', 'transforms'] - - def __init__(self, data=None, name=None, target_entity=None, settings=None, transforms=None, *args, **kwargs): + _metadata = ['name', 'target_entity', 'settings', 'transforms', 'label_type'] + + def __init__(self, + data=None, + name=None, + target_entity=None, + settings=None, + transforms=None, + label_type=None, + *args, + **kwargs): super().__init__(data=data, *args, **kwargs) self.name = name self.target_entity = target_entity - self.settings = settings or {} self.transforms = transforms or [] + self.plot = LabelPlots(self) + + if label_type is not None: + error = 'label type must be "continuous" or "discrete"' + assert label_type in ['continuous', 'discrete'], error + + self.label_type = label_type + self.settings = settings or {} + self.settings['label_type'] = self.label_type @property def _constructor(self): return LabelTimes @property - def distribution(self): - labels = self.assign(count=1) - labels = 
labels.groupby(self.name) - distribution = labels['count'].count() - return distribution + def is_discrete(self): + """Whether labels are discrete.""" + if self.label_type is None: + self.label_type = self.infer_type() + self.settings['label_type'] = self.label_type - def _plot_distribution(self, **kwargs): - plot = self.distribution.plot(kind='bar', **kwargs) - plot.set_title('Label Distribution') - plot.set_ylabel('count') - return plot + return self.label_type == 'discrete' + + @property + def distribution(self): + """Returns label distribution if labels are discrete.""" + if self.is_discrete: + labels = self.assign(count=1) + labels = labels.groupby(self.name) + distribution = labels['count'].count() + return distribution @property def count_by_time(self): - count = self.assign(count=1) - count = count.sort_values('cutoff_time') - count = count.set_index([self.name, 'cutoff_time']) - count = count.groupby(self.name) - count = count['count'].cumsum() - return count - - def _plot_count_by_time(self, **kwargs): - count = self.count_by_time - count = count.unstack(self.name) - count = count.ffill() - - plot = count.plot(kind='area', **kwargs) - plot.set_title('Label Count vs. 
Time') - plot.set_ylabel('count') - return plot - - def _with_plots(self): - self.plot.count_by_time = self._plot_count_by_time - self.plot.distribution = self._plot_distribution - return self + """Returns label count across cutoff times.""" + if self.is_discrete: + keys = ['cutoff_time', self.name] + value = self.groupby(keys).cutoff_time.count() + value = value.unstack(self.name).fillna(0) + value = value.cumsum() + return value + else: + value = self.groupby('cutoff_time') + value = value[self.name].count() + value = value.cumsum() + return value def describe(self): """Prints out label info with transform settings that reproduce labels.""" - print('Label Distribution\n' + '-' * 18, end='\n') - distribution = self[self.name].value_counts() - distribution.index = distribution.index.astype('str') - distribution['Total:'] = distribution.sum() - print(distribution.to_string(), end='\n\n\n') + if self.is_discrete: + print('Label Distribution\n' + '-' * 18, end='\n') + distribution = self[self.name].value_counts() + distribution.index = distribution.index.astype('str') + distribution['Total:'] = distribution.sum() + print(distribution.to_string(), end='\n\n\n') print('Settings\n' + '-' * 8, end='\n') settings = pd.Series(self.settings) @@ -99,7 +113,7 @@ def copy(self): """ labels = super().copy() labels.transforms = labels.transforms.copy() - return labels._with_plots() + return labels def threshold(self, value, inplace=False): """ @@ -115,6 +129,9 @@ def threshold(self, value, inplace=False): labels = self if inplace else self.copy() labels[self.name] = labels[self.name].gt(value) + labels.label_type = 'discrete' + labels.settings['label_type'] = 'discrete' + transform = {'__name__': 'threshold', 'value': value} labels.transforms.append(transform) @@ -225,6 +242,8 @@ def bin(self, bins, quantiles=False, labels=None, right=True): } label_times.transforms.append(transform) + label_times.label_type = 'discrete' + label_times.settings['label_type'] = 'discrete' return 
label_times def sample(self, n=None, frac=None, random_state=None): @@ -318,3 +337,19 @@ def sample(self, n=None, frac=None, random_state=None): labels = pd.concat(sample_per_label, axis=0, sort=False) return labels + + def infer_type(self): + """Infer label type. + + Returns: + str : Inferred label type. Either "continuous" or "discrete". + """ + dtype = self[self.name].dtype + is_discrete = pd.api.types.is_bool_dtype(dtype) + is_discrete = is_discrete or pd.api.types.is_categorical_dtype(dtype) + is_discrete = is_discrete or pd.api.types.is_object_dtype(dtype) + + if is_discrete: + return 'discrete' + else: + return 'continuous' diff --git a/composeml/tests/test_label_maker.py b/composeml/tests/test_label_maker.py index 882e08cd..3d485c91 100644 --- a/composeml/tests/test_label_maker.py +++ b/composeml/tests/test_label_maker.py @@ -426,3 +426,9 @@ def test_slice_overlap(transactions): start, end = metadata['window'] is_overlap = df.index == end assert not is_overlap.any() + + +def test_label_type(transactions): + lm = LabelMaker(target_entity='customer_id', time_index='time', labeling_function=total_spent) + lt = lm.search(transactions, num_examples_per_instance=1, label_type='discrete', verbose=False) + assert lt.label_type == 'discrete' diff --git a/composeml/tests/test_label_plots.py b/composeml/tests/test_label_plots.py index ad4d8f87..b77629fc 100644 --- a/composeml/tests/test_label_plots.py +++ b/composeml/tests/test_label_plots.py @@ -1,10 +1,21 @@ -def test_distribution_plot(labels): - labels = labels.threshold(200) - plot = labels.plot.distribution() - assert plot.get_title() == 'Label Distribution' +def test_count_by_time_categorical(total_spent): + labels = range(2) + total_spent = total_spent.bin(2, labels=labels) + ax = total_spent.plot.count_by_time() + assert ax.get_title() == 'Label Count vs. 
Cutoff Times' -def test_count_by_time_plot(labels): - labels = labels.threshold(200) - plot = labels.plot.count_by_time() - assert plot.get_title() == 'Label Count vs. Time' +def test_count_by_time_continuous(total_spent): + ax = total_spent.plot.count_by_time() + assert ax.get_title() == 'Label vs. Cutoff Times' + + +def test_distribution_categorical(total_spent): + ax = total_spent.bin(2, labels=range(2)) + ax = ax.plot.dist() + assert ax.get_title() == 'Label Distribution' + + +def test_distribution_continuous(total_spent): + ax = total_spent.plot.dist() + assert ax.get_title() == 'Label Distribution' diff --git a/composeml/tests/test_label_times.py b/composeml/tests/test_label_times.py index b1305a5c..97b10fba 100644 --- a/composeml/tests/test_label_times.py +++ b/composeml/tests/test_label_times.py @@ -1,9 +1,77 @@ -def test_describe(labels): - labels = labels.bin(2) - labels.settings.update(num_examples_per_instance=2) - assert labels.describe() is None +def test_count_by_time_categorical(total_spent): + labels = range(2) + given_answer = total_spent.bin(2, labels=labels).count_by_time + given_answer = given_answer.to_csv(header=True).splitlines() + answer = [ + 'cutoff_time,0,1', + '2019-01-01 08:00:00,0.0,1.0', + '2019-01-01 08:30:00,0.0,2.0', + '2019-01-01 09:00:00,0.0,3.0', + '2019-01-01 09:30:00,0.0,4.0', + '2019-01-01 10:00:00,0.0,5.0', + '2019-01-01 10:30:00,1.0,5.0', + '2019-01-01 11:00:00,2.0,5.0', + '2019-01-01 11:30:00,3.0,5.0', + '2019-01-01 12:00:00,4.0,5.0', + '2019-01-01 12:30:00,5.0,5.0', + ] -def test_describe_empty(labels): - labels.settings.clear() - assert labels.describe() is None + assert given_answer == answer + + +def test_count_by_time_continuous(total_spent): + given_answer = total_spent.count_by_time + given_answer = given_answer.to_csv(header=True).splitlines() + + answer = [ + 'cutoff_time,total_spent', + '2019-01-01 08:00:00,1', + '2019-01-01 08:30:00,2', + '2019-01-01 09:00:00,3', + '2019-01-01 09:30:00,4', + '2019-01-01 
10:00:00,5', + '2019-01-01 10:30:00,6', + '2019-01-01 11:00:00,7', + '2019-01-01 11:30:00,8', + '2019-01-01 12:00:00,9', + '2019-01-01 12:30:00,10', + ] + + assert given_answer == answer + + +def test_describe(total_spent): + assert total_spent.bin(2).describe() is None + + +def test_describe_no_settings(total_spent): + total_spent = total_spent.copy() + total_spent.settings.clear() + assert total_spent.describe() is None + + +def test_distribution_categorical(total_spent): + labels = range(2) + given_answer = total_spent.bin(2, labels=labels).distribution + given_answer = given_answer.to_csv(header=True).splitlines() + + answer = [ + 'total_spent,count', + '0,5', + '1,5', + ] + + assert given_answer == answer + + +def test_distribution_continous(total_spent): + assert total_spent.distribution is None + + +def test_infer_type(total_spent): + assert total_spent.infer_type() == 'continuous' + + total_spent = total_spent.threshold(5) + total_spent.label_type = None + assert total_spent.infer_type() == 'discrete' diff --git a/composeml/tests/test_label_transforms/test_threshold.py b/composeml/tests/test_label_transforms/test_threshold.py index 36a6c937..cffbe49e 100644 --- a/composeml/tests/test_label_transforms/test_threshold.py +++ b/composeml/tests/test_label_transforms/test_threshold.py @@ -1,13 +1,10 @@ -import pandas as pd - - def test_threshold(labels): - given_labels = labels.threshold(200) - transform = given_labels.transforms[0] + labels = labels.threshold(200) + transform = labels.transforms[0] assert transform['__name__'] == 'threshold' assert transform['value'] == 200 answer = [True, False, True, False] - labels = labels.assign(my_labeling_function=answer) - pd.testing.assert_frame_equal(given_labels, labels) + given_answer = labels[labels.name].values.tolist() + assert given_answer == answer diff --git a/composeml/tests/utils.py b/composeml/tests/utils.py index ddc5bb2f..da8cdea1 100644 --- a/composeml/tests/utils.py +++ b/composeml/tests/utils.py @@ 
-3,11 +3,11 @@ import pandas as pd -def read_csv(csv): +def read_csv(csv, **kwargs): if isinstance(csv, list): csv = '\n'.join(csv) with StringIO(csv) as file: - df = pd.read_csv(file) + df = pd.read_csv(file, **kwargs) return df diff --git a/docs/source/api_reference.rst b/docs/source/api_reference.rst index 460c4bcd..69b1c966 100644 --- a/docs/source/api_reference.rst +++ b/docs/source/api_reference.rst @@ -35,14 +35,23 @@ Transform Methods LabelTimes.sample LabelTimes.threshold +.. currentmodule:: composeml.label_plots + +Label Plots +=========== + +.. autosummary:: + :toctree: generated + :template: class.rst + :nosignatures: + + LabelPlots + Plotting Methods ---------------- -.. list-table:: - :widths: 25 75 - :header-rows: 0 +.. autosummary:: + :nosignatures: - * - :mod:`LabelTimes.plot.distribution` - - Plot the label distribution. - * - :mod:`LabelTimes.plot.count_by_time` - - Plot the label count vs. time. + LabelPlots.count_by_time + LabelPlots.distribution diff --git a/docs/source/examples/predict-next-purchase/example.ipynb b/docs/source/examples/predict-next-purchase/example.ipynb index 1157c15c..678e6bfe 100644 --- a/docs/source/examples/predict-next-purchase/example.ipynb +++ b/docs/source/examples/predict-next-purchase/example.ipynb @@ -234,7 +234,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Elapsed: 01:30 | Remaining: 00:00 | Progress: 100%|██████████| user_id: 19477/19477 \n" + "Elapsed: 01:37 | Remaining: 00:00 | Progress: 100%|██████████| user_id: 19477/19477 \n" ] }, { @@ -374,6 +374,73 @@ "source": [ "lt.describe()" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plot Labels\n", + "\n", + "Additionally, there are plots available for insight into the labels.\n", + "\n", + "\n", + "#### Distribution\n", + "\n", + "This plot shows the label distribution."
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZMAAAEXCAYAAABoPamvAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAdLUlEQVR4nO3de5RdZZnn8W9VCBJJAhgKCSICg3lAECM3LzSINjbSeFdIQ+SiArIw4oyKlzZB2oWtoz0qKFEniEFjIz1xUFsI2GorQRowMiAXebRnuHQgNDGCSZRAkqr5Y78Fh0oqOVW7zjlU1fezVpZnP/vdZ7+bdTy/et+9z95dfX19SJJUR3enOyBJGv0ME0lSbYaJJKk2w0SSVJthIkmqzTCRJNW2Tac7ILVKROwJ3JGZk4e4XR/Qk5m/H8I2C8u+/mFA/XzgvcADpTQR+DVwXmb+rrS5FTgqMx8d5L13AK7MzNcMsv5W4CjgzcDbM/P1zfa7bH8ecFtmfj8iPgn8e2Z+cyjvIRkmUutdkZlz+hci4mTgpxGxf2auzsyZW9l+J+CwwVb2bx8Rw+3fa4C7ynudN9w30fhmmGhciogZwMXAFGA6cCswKzPXlSafiohDqaaC52bmD8t27wbOLvVVwJzMvHso+87Mb5VAOQn4av9IiOr/j98Edi5Nr8rMecA3gEllBHIw8Gfg+8BLgNnAL8v2ANMj4hpgN+A+4IzMfCgifgZ8OTMXl+P4GfBl4LnAIcDnImIj8CbKCCsijgA+BzwbeKL8d7gmIk4D3gL0Ai8s/Tk1M38zlP8OGls8Z6Lx6gzgssx8ObAPsBdwXMP6/5eZBwHvAC6LiJ6IeBVwKnBEZr4U+Cxw5TD3fxvw4s30qX+/RwAvLFNc7wQey8yZmbkR2Bb458yMzFw24D1mUAXcgcDtwIVb6kRmXgwsA87NzCePJSKmAYuB95f3OhVYFBF7lSavAt6XmQcANwEfHeLxa4wxTDRefQRYGREfBr5C9Zd847mVrwJk5h1UU0CvoAqbfYAbyijhs8BOEfGcYey/j+ov+kbXAG+LiKuB9wAfzcw/DrL90kHqP87Mfy+vvw68dhh9A3gZ1bmTmwAy807gF1TnZgB+lZnLy+tbgOH8N9AYYphovLocOJNqKugLVF+IXQ3rNza87gbWAxOAb5URwkzgIKopokeGsf9DqUYOT8rMX1KNkP4nsCdwc0QcPMj2awepb67fUIVX4/Ftu5X+TSjbNOqmuoAA4LGG+sD31jhkmGi8Ogb4ZGZeUZZfRvUF2u80gIg4iGo0chNwLXBiREwvbc4CfjLUHZfzLnsD/zSg/hlgXmZ+D3g/cCdwALABmBARzXxhvzoi9mjo35LyeiVV8BERLwIObNhmA0+FRL9/A/aNiMPKNvsDRwI/a6IPGoc8Aa+xbvuIGPhX/CuAvwWujIg/AX8Efk4VGv32joj/Q/VX999k5h+AH0XEfwf+JSJ6gdXAWzOzbytXUs2KiL8o79UNJNWlwOsGtPsi1fmZO4DHqc6rfIfqy/5m4M5yUnxLfg1cGhG7Ar+hmi4DuKC893HA3cB1Ddv8APh0RDw5WsnM30fE8cCXIuLZVCfb35mZv42IV26lDxqHurwFvSSpLqe5JEm1GSaSpNoME0lSbYaJJKm28Xg117OorvFfwdOvyZckDW4C1a2Hfkl1teHTjMcwOZTBfz0sSdqyI4DrBxbHY5isAHjkkT/R2+tl0ZLUjO7uLnbaaXso36EDjccw2QjQ29tnmEjS0G329EBLwyQipgI3AK/PzHsb6nOoHuJzVFneA1gE7EL16+DZmbk2InYEvk1164mVwAnldtrbUt3E7hCqewSdNNT
bgEuSRk7LruaKiJdRzavNGFB/EZverno+MD8z96W6Hfa8Ur8AWJqZ+wELeOp22ucAfyr1/wosbMUxSJKa08pLg8+gelzpg/2FiHgW8DXgvIbaRKobyC0upYXA8eX1cVQjE6ju8npsaf9kPTOvA3oabm4nSWqzlk1zZebpsMmjRD8NXArc01DbGVidmRvK8gpg9/J6t7JMZm6IiNVUT5R7sj5gm/ub7d+0aUN6LLgkaQvadgI+Il4L7JGZH4iIoxpWdbPpcxN6y/8OvOV2V1k3cJuuhm2asmrVWk/AS1KTuru7tvhHeDt/AX8isH95Qt0lwCERcQXwMLBDRPQ/S2I6T02NPQDsChAR21A9r3sVsLy067drwzaSpDZrW5hk5rsyc7/yhLrTgWWZOSsz11P9iHBWaXoKTz3Q5+qyTFm/tLR/sl6eE7EuM5ue4pIkjaxnyu9MzqZ6cM9cqvMeJ5b6PGBhRNwJPArMLvUvAV8r9ceBk9vcX3acsi0Tt3tWu3erZ7j16x7n0TVPdLobUtuNx4dj7QncU/ecSU/PFK495nUj1imNDcdcew0rV67pdDekEddwzmQv4N5N1re7Q5KksccwkSTVZphIkmozTCRJtRkmkqTaDBNJUm2GiSSpNsNEklSbYSJJqs0wkSTVZphIkmozTCRJtRkmkqTaDBNJUm2GiSSpNsNEklSbYSJJqs0wkSTVZphIkmozTCRJtRkmkqTatmn1DiJiKnAD8PrMvDcizgTOAfqAZcB7MvOJiJgJXAJMBa4DzsrMDRGxB7AI2AVIYHZmro2IHYFvA3sDK4ETMvOhVh+PJGlTLR2ZRMTLgOuBGWV5BnAu8ErgwLL/95bmi4A5mTkD6ALOKPX5wPzM3JcqfOaV+gXA0szcD1gAXNjKY5EkDa7V01xnUIXFg2X5ceDszFydmX3A7cAeEfECYFJm3ljaLQSOj4iJwJHA4sZ6eX0c1cgE4HLg2NJektRmLZ3myszTASKif/k+4L5S6wHmAKcBuwErGjZdAewO7AyszswNA+o0blOmw1YDPTwVXJKkNmn5OZPNiYjnAUuAr2fmzyLicKpzKP26gF6qkVPfgM17G9o06mpYt1XTpk0eUp+lZvX0TOl0F6S2a3uYRMS+wLXARZn5P0p5OTC9odmuVCOMh4EdImJCZm4sbfpHHg+UdssjYhtgCrCq2X6sWrWW3t6BOdU8vzA0mJUr13S6C9KI6+7u2uIf4W29NDgipgA/AuY2BEn/9Ne6MkIBOBlYkpnrgaXArFI/hWpEA3B1WaasX1raS5LarN0jk9OB5wIfjIgPltoPMvM8YDawoFxKfAtwUVl/NnBZRMwF7gdOLPV5wMKIuBN4tGwvSeqArr6+4U/1jFJ7AveMxDTXtce8bsQ6pbHhmGuvcZpLY1LDNNdewL2brG93hyRJY49hIkmqzTCRJNVmmEiSajNMJEm1GSaSpNoME0lSbYaJJKk2w0SSVJthIkmqzTCRJNVmmEiSajNMJEm1GSaSpNoME0lSbYaJJKk2w0SSVJthIkmqzTCRJNVmmEiSajNMJEm1bdPqHUTEVOAG4PWZeW9EHA18HpgEXJGZc0u7mcAlwFTgOuCszNwQEXsAi4BdgARmZ+baiNgR+DawN7ASOCEzH2r18UiSNtXSkUlEvAy4HphRlicBlwJvAvYDDo2IY0vzRcCczJwBdAFnlPp8YH5m7gssA+aV+gXA0szcD1gAXNjKY5EkDa7V01xnAO8FHizLhwG/y8x7MnMDVYAcHxEvACZl5o2l3cJSnwgcCSxurJfXx1GNTAAuB44t7SVJbdbSMMnM0zNzaUNpN2BFw/IKYPct1HcGVpfgaaw/7b3K+tVAz0gfgyRp61p+zmSAbqCvYbkL6B1CnVLvb9Ooq2HdVk2bNrnZptKQ9PRM6XQXpLZrd5gsB6Y3LO9KNQU2WP1hYIeImJCZG0ub/imzB0q75RGxDTAFWNVsR1atWktv78Ccap5fGBrMypV
rOt0FacR1d3dt8Y/wdl8afBMQEbFPREwATgKWZOZ9wLqIOLy0O7nU1wNLgVmlfgqwpLy+uixT1i8t7SVJbdbWMMnMdcBpwHeBu4C7eerk+mzgCxFxNzAZuKjUzwbOjIi7gCOAuaU+D3h5RNxZ2ry3HccgSdpUV1/f8Kd6Rqk9gXtGYprr2mNeN2Kd0thwzLXXOM2lMalhmmsv4N5N1re7Q5KksccwkSTVZphIkmozTCRJtRkmkqTaDBNJUm2GiSSpNsNEklSbYSJJqs0wkSTVZphIkmozTCRJtRkmkqTaDBNJUm2GiSSpNsNEklSbYSJJqs0wkSTVZphIkmozTCRJtRkmkqTatunETiPiHcDHyuKSzPxQRMwELgGmAtcBZ2XmhojYA1gE7AIkMDsz10bEjsC3gb2BlcAJmflQu49FktSBkUlEPBu4CHgV8BLgiIg4miow5mTmDKALOKNsMh+Yn5n7AsuAeaV+AbA0M/cDFgAXtu8oJEmNmgqTiPj6ZmqLh7nPCWW/2wMTy7/1wKTMvLG0WQgcHxETgSOBxY318vo4qpEJwOXAsaW9JKnNtjjNFRFfAZ5HNXroaVg1kWp6acgyc01EzAPuBv4M/Bx4AljR0GwFsDuwM7A6MzcMqAPs1r9NmQ5bDfQADw6nX5Kk4dvaOZOvAwdQTUd9t6G+Abhxs1tsRUQcCLwLeAHwR6rprb8C+hqadQG9VCOYvgFv0dvQplFXw7qtmjZtcvOdloagp2dKp7sgtd0WwyQzlwHLIuLHmbl8hPZ5DPCTzHwYICIWAh8Cpje02ZVqhPEwsENETMjMjaVN/8jjgdJueURsA0wBVjXbiVWr1tLbOzCnmucXhgazcuWaTndBGnHd3V1b/CO82RPwz4+If42I2yLi1/3/htmn24CjI2L7iOgC3kA11bUuIg4vbU6musprPbAUmFXqpwBLyuuryzJl/dLSXpLUZs1eGvw1qpPft7DptNOQZOaPIuKlwK+oTrzfDHwGuBJYEBFTy34uKpucDVwWEXOB+4ETS30esDAi7gQeBWbX6Zckafi6+vq2ng0RcUtmHtSG/rTDnsA9IzHNde0xrxuxTmlsOObaa5zm0pjUMM21F3DvJuubfJ87IuLFI9gvSdIY0uw0197AryLiPuCx/mJmHtiSXkmSRpVmw+TjLe2FJGlUazZMbm9pLyRJo1qzYfJ7qqu4unjqaq7GX6NLksaxpsIkM588UR8R2wInAdGqTkmSRpch3zU4M5/IzIXAa0e+O5Kk0aipkUlEPKdhsQs4BNipJT2SJI06wzlnAtU9s85pSY8kSaPOkM+ZSHrmmzJ1O7Z7lo/30dOte3w9a1ava8l7NzvN1U11Z99jqZ5l8iPg7xueMyLpGWS7Z03k7XM3eaadxrnFF7ybNbQmTJodcXwaeA3Vo3E/D7wS+FxLeiRJGnWaPWfyOuCQ/lu8R8RVVLeS/2+t6pgkafRodmTS3fiskMx8nOr28ZIkNT0yuTUivgB8meqqrvcBw304liRpjGl2ZPJeqt+V3ADcBOxMFSiSJG15ZFJunbIA+F5mnlZqVwEbgdUt750kaVTY2sjkk8BU4BcNtTOAHYHzW9QnSdIos7UweT1wUmY+3F/IzAeBU4C3tLJjkqTRY2th8kRmPjawmJmrgcdb0yVJ0miztTDZGBFTBhZLzXs1SJKArV8afDlwSUS8KzP/BBAR2wOXAN8d7k4j4g3AJ4DtgR9l5vsj4miqX9dPAq7IzLml7cyyv6nAdcBZmbkhIvYAFgG7AAnMzsy1w+2TJGn4tjYy+SLwR+ChiLgxIm4GHgIeoTo5P2QRsTfwVeDNwIHAQRFxLHAp8CZgP+DQUoMqMOZk5gyquxafUerzgfmZuS+wDJg3nP5Ikurb4sgkM3uBMyPiU8DBQC9wU2auqLHPt1CNPJYDRMQs4IXA7zLznlJbBBwfEXcBkzLzxrLtQuDvIuIS4EiqQOqv/xz4SI1+SZKGqdlb0N8H3Dd
C+9wHeCIifgDsAfwQuJPqmfL9+p8vv9sg9Z2B1Q13LfZ59JLUQc3eTmWk93kkcBSwFvgB8BjVbVr6dVGNgrqbrFPqTZs2bfJQmktN6+nZ5JoV6RmjVZ/PToTJQ8CPM3MlQERcCRxP9av6frsCDwLLgembqT8M7BAREzJzY2nz4FA6sWrVWnp7B+ZR8/zC0GBWrlzT6S74+dSghvv57O7u2uIf4Z14guIPgWMiYseImED1wK3FQETEPqV2ErCkTK+ti4jDy7Ynl/p6YCkwq9RPAZa09SgkSU9qe5hk5k3AZ4HrgbuozsV8BTiN6nLju4C7qQIGYDbwhYi4G5gMXFTqZ1NdHHAXcAQwt02HIEkaoBPTXGTmpVSXAjf6CfCSzbS9DThsM/X7qM67SJI6rBPTXJKkMcYwkSTVZphIkmozTCRJtRkmkqTaDBNJUm2GiSSpNsNEklSbYSJJqs0wkSTVZphIkmozTCRJtRkmkqTaDBNJUm2GiSSpNsNEklSbYSJJqs0wkSTVZphIkmozTCRJtRkmkqTatunkziPiH4CdM/O0iJgJXAJMBa4DzsrMDRGxB7AI2AVIYHZmro2IHYFvA3sDK4ETMvOhjhyIJI1zHRuZRMRfAqc2lBYBczJzBtAFnFHq84H5mbkvsAyYV+oXAEszcz9gAXBhWzouSdpER8IkIp4DfAr4+7L8AmBSZt5YmiwEjo+IicCRwOLGenl9HNXIBOBy4NjSXpLUZp0amXwN+DjwSFneDVjRsH4FsDuwM7A6MzcMqD9tm7J+NdDT2m5Lkjan7edMIuJ04D8y8ycRcVopdwN9Dc26gN7N1Cn1/jaNuhrWbdW0aZObbSoNSU/PlE53QRpUqz6fnTgBPwuYHhG3As8BJlMFxvSGNrsCDwIPAztExITM3FjaPFjaPFDaLY+IbYApwKpmO7Fq1Vp6ewfmVPP8wtBgVq5c0+ku+PnUoIb7+ezu7triH+Ftn+bKzNdm5gGZORM4D/hBZr4TWBcRh5dmJwNLMnM9sJQqgABOAZaU11eXZcr6paW9JKnNOnpp8ACzgQURMRW4Bbio1M8GLouIucD9wImlPg9YGBF3Ao+W7SVJHdDRMMnMhVRXaJGZtwGHbabNfcBRm6n/AXhjSzsoSWqKv4CXJNVmmEiSajNMJEm1GSaSpNoME0lSbYaJJKk2w0SSVJthIkmqzTCRJNVmmEiSajNMJEm1GSaSpNoME0lSbYaJJKk2w0SSVJthIkmqzTCRJNVmmEiSajNMJEm1GSaSpNoME0lSbYaJJKm2bTqx04j4BHBCWbwqMz8cEUcDnwcmAVdk5tzSdiZwCTAVuA44KzM3RMQewCJgFyCB2Zm5ts2HIkmiAyOTEhp/BbwUmAkcHBEnApcCbwL2Aw6NiGPLJouAOZk5A+gCzij1+cD8zNwXWAbMa99RSJIadWKaawXwwcx8IjPXA78BZgC/y8x7MnMDVYAcHxEvACZl5o1l24WlPhE4EljcWG/jMUiSGrR9misz7+x/HREvpJru+hJVyPRbAewO7DZIfWdgdQmexnrTpk2bPOS+S83o6ZnS6S5Ig2rV57Mj50wAImJ/4CrgXGAD1eikXxfQSzVy6muiTqk3bdWqtfT2DnyL5vmFocGsXLmm013w86lBDffz2d3dtcU/wjtyNVdEHA78BPhoZl4GLAemNzTZFXhwC/WHgR0iYkKpTy91SVIHdOIE/POB7wEnZeZ3SvmmalXsUwLiJGBJZt4HrCvhA3Byqa8HlgKzSv0UYEnbDkKS9DSdmOb6ELAd8PmI6K99FTgN+G5ZdzVPnVyfDSyIiKnALcBFpX42cFlEzAXuB05sR+clSZvqxAn49wPvH2T1SzbT/jbgsM3U7wOOGtHOSZKGxV/AS5JqM0wkSbUZJpKk2gwTSVJthokkqTbDRJJUm2EiSarNMJEk1WaYSJJqM0wkSbUZJpKk2gwTSVJthokkqTbDRJJUm2EiSarNMJEk1WaYSJJqM0w
kSbUZJpKk2gwTSVJthokkqbZtOt2BOiLiJGAuMBH4YmZe3OEuSdK4NGpHJhHxPOBTwF8AM4EzI+JFne2VJI1Po3lkcjTw08z8A0BELAbeDnxyK9tNAOju7qrdge2e+9za76GxZyQ+WyOhZ8fJne6CnoGG+/ls2G7C5taP5jDZDVjRsLwCOKyJ7aYD7LTT9rU78KpvXlb7PTT2TJv2zPgS/8qHZnW6C3oGGoHP53Tg/w4sjuYw6Qb6Gpa7gN4mtvslcARV+GxsQb8kaSyaQBUkv9zcytEcJsupQqHfrsCDTWz3OHB9S3okSWPbJiOSfqM5TH4MnB8RPcCfgLcBZ3a2S5I0Po3aq7ky8wHg48C/ArcC/5iZN3e2V5I0PnX19fVtvZUkSVswakcmkqRnDsNEklSbYSJJqs0wkSTVNpovDVaLRMSewG+BuwasekNm/sdm2p8PkJnnt7pvUkRcDBwObAvsw1Of0wsz8xsd69g4Z5hoMA9m5sxOd0IaKDPfC0/+0fMzP6fPDIaJmhYRBwBfAiYDuwCfzsyvNqyfCFwKHFBK8zNzQUQ8F/ga8HyqW958LDN/3NbOa8wrI+SXA3tQfU5nAedn5s8agmdPP4+t4TkTDWa3iLi14d+5wOnABZl5KPBq4HMDtnkl8JzMfClwHE/d7uZC4NLMPBh4I/C1iJjSnsPQOLNdZr4oM7+yhTZ+HlvAkYkGs8k0V0RMAF4XER8DXkw1Qml0R9UsrgWuBs4t9aOBfSOi//EAE4H/QnXnAmkk3dREGz+PLWCYaCj+CXgE+GfgO8CJjSszc1VE7A+8Fvhr4JayPAF4TcOzZ6YDD7ez4xo3Hmt43Ud1N3GoAqOfn8cWcJpLQ/Fa4LzM/D5wLDw5WqG8fiPwLeAq4BxgLdW89E+Bs0ubF1GNYJ7d1p5rPPo9sH95/eaGup/HFjBMNBTnA9dHxF1U50PuBfZqWL+E6i/DO4GbgUWZeTvwPuDlEfFr4ArgHZm5po391vj0WeDsiLgFmNRQ9/PYAt7oUZJUmyMTSVJthokkqTbDRJJUm2EiSarNMJEk1WaYaFyJiKMi4o427GdhRHxokHXnRcSbWt2HQfZ9R0QcNcxtd4iIn45wlzRGGCZS+72Gp/8ie7TYCTis053QM5O3U9F4NDkiFlM9C+NR4EzgP4GLgZlUt+FYAvxtZm6IiD6gJzN/D9C4HBEfBd4NrAGuA96cmXuW/bwyIm4Ankv1K+uTgNOAQ4DPRcTGzLxysE5GxAbgM1R3G9i+9Od/R8RpZZ/bA3/MzFdHxDyq29tsoHoWzZzMfKj8wvtSql9431226b99+x2ZOXmQ5Y8Bp5b3+13p9zeASRFxK3BwZm4cyn90jW2OTDQePR/4fLmR5T9S3QLmImAV1Q0sDwFeAmx2mqpfRBxD9SV7KHAwMPDOs8+juqngDGB34K2ZeTGwDDh3S0FSTAD+XO5uewJwaUT0lHX7A0eVIHknVeAcmpkHUgXXwtLu28CCUr8QeMFW9tl/W5zTgFdk5gHAPcAc4J3AY5k50yDRQIaJxqNfZ+YN5fVCqvB4I/DlzOzLzMeBr1LuP7YFfw38r8x8NDP7qEY2jb6XmX8uX7x3UD0DZqi+DJCZvwZuB45sOIbV5fWxwDcy809l+ULgL8tzOw4Evlne4xelH1tzdDmuR8p2H8jMTw2j7xpHnObSeDTwr+q+hn/9unn6eY0ugIjYtqG2gafuSru5910/YB9dDN2GAX3q38fahvoENu174/+3G/fb/34D+zPwuJ58v4jYEdhxSL3WuOPIROPRSyKi/1kt7wGupzpHMiciuiLiWVTnUf6ltFlJNXqB6rxHv6uAt0XEDmX53Tz9S30wG2j+BPwpABFxELAv8PPNtLkGeFdEbF+WzwGuy8z/BH5F9VCz/vd4cWnzKLBtOacCT3+cwI+Bt0bE1LJ8PvCB0u8JETGcUNQYZ5hoPPoN8ImIuI1qeutUqi/gXaimkm4HEuif2jkHuLj
cfXY/YAVAZv4UWAD8W0QsA3YA/tzE/n8AfDoiTm2i7eFlv5cCs/qnngb4OlUA3BwRvwEOAmaXdScCfxMRtwPzyrGTmX8EPgwsiYhf0vAckMy8mupk+y/KdrsCHy/HfTNwZ0RMa6LvGke8a7A0TBFxCPDKzLyoLH8AeFlmzhqh93/aVWTSM5nnTKTh+y3wkYg4k2p6636q6bGmRMS5PDWCGOhz9bsntY8jE0lSbZ4zkSTVZphIkmozTCRJtRkmkqTaDBNJUm2GiSSptv8POPOJNC/tUe8AAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "lt.plot.distribution();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Count by Time\n", + "\n", + "This plot shows the label distribution across cutoff times." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "lt.plot.count_by_time();" + ] } ], "metadata": { diff --git a/docs/source/getting_started.ipynb b/docs/source/getting_started.ipynb index 2048871d..43b4c9a5 100644 --- a/docs/source/getting_started.ipynb +++ b/docs/source/getting_started.ipynb @@ -10,8 +10,24 @@ "Getting Started\n", "===============\n", "\n", - "In this example, we will generate labels on a mock dataset of transactions. For each customer, we want to label whether the total purchase amount over the next hour of transactions will exceed $300. Additionally, we want to predict one hour in advance.\n", - "\n", + "In this example, we will generate labels on a mock dataset of transactions. For each customer, we want to label whether the total purchase amount over the next hour of transactions will exceed $300. Additionally, we want to predict one hour in advance." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import composeml as cp" + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext" + }, + "source": [ "Load Data\n", "=========\n", "\n", @@ -24,8 +40,6 @@ "metadata": {}, "outputs": [], "source": [ - "import composeml as cp\n", - "\n", "df = cp.demos.load_transactions()\n", "\n", "df[df.columns[:7]].head()" @@ -192,18 +206,41 @@ "\n", "Also, there are plots available for insight to the labels.\n", "\n", - ".. code-block:: python\n", "\n", - " import matplotlib.pyplot as plt\n", + "Distribution\n", + "------------\n", "\n", - " fig, axs = plt.subplots(1,2)\n", - " fig.subplots_adjust(wspace=.34)\n", + "This plot shows the label distribution." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "labels.plot.distribution();" + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext" + }, + "source": [ + "Count by Time\n", + "-------------\n", "\n", - " color = ['#4285F4', '#DB4437']\n", - " labels.plot.distribution(color=color, ax=axs[0])\n", - " labels.plot.count_by_time(figsize=(12, 4), color=color, ax=axs[1]);\n", - " \n", - ".. image:: images/getting_started_0.0.png\n" + "This plot shows the label distribution across cutoff times." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "labels.plot.count_by_time();" ] } ], diff --git a/docs/source/guides/using_label_transforms.ipynb b/docs/source/guides/using_label_transforms.ipynb index 7711ff48..0fe2106e 100644 --- a/docs/source/guides/using_label_transforms.ipynb +++ b/docs/source/guides/using_label_transforms.ipynb @@ -49,6 +49,7 @@ "labels = label_maker.search(\n", " cp.demos.load_transactions(),\n", " num_examples_per_instance=10,\n", + " label_type='continuous',\n", " minimum_data='2h',\n", " gap='2min',\n", " verbose=True,\n", diff --git a/requirements.txt b/requirements.txt index c1547c10..6ebc7499 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ pandas>=0.23.0 numpy>=1.13.3 tqdm>=4.19.2 -matplotlib>=3.0.2 \ No newline at end of file +matplotlib>=3.0.2 +seaborn>=0.9.0