Skip to content

Commit

Permalink
Seaborn Plots (#62)
Browse files Browse the repository at this point in the history
  • Loading branch information
Jeff Hernandez authored Sep 10, 2019
1 parent 49dca85 commit 50e9f90
Show file tree
Hide file tree
Showing 14 changed files with 442 additions and 94 deletions.
25 changes: 24 additions & 1 deletion composeml/conftest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,30 @@
import pandas as pd
import pytest

from .label_times import LabelTimes
from composeml import LabelTimes
from composeml.tests.utils import read_csv


@pytest.fixture(scope="module")
def total_spent():
    """Label times holding ten ``total_spent`` labels for four customers.

    The labels count down from 9 to 0 at half-hour cutoff times, and the
    settings record ``num_examples_per_instance`` as -1.
    """
    header = 'id,customer_id,cutoff_time,total_spent'
    records = [
        '0,0,2019-01-01 08:00:00,9',
        '1,0,2019-01-01 08:30:00,8',
        '2,1,2019-01-01 09:00:00,7',
        '3,1,2019-01-01 09:30:00,6',
        '4,1,2019-01-01 10:00:00,5',
        '5,2,2019-01-01 10:30:00,4',
        '6,2,2019-01-01 11:00:00,3',
        '7,2,2019-01-01 11:30:00,2',
        '8,2,2019-01-01 12:00:00,1',
        '9,3,2019-01-01 12:30:00,0',
    ]

    frame = read_csv([header, *records], index_col='id', parse_dates=['cutoff_time'])
    label_times = LabelTimes(data=frame, name='total_spent')
    label_times.settings['num_examples_per_instance'] = -1
    return label_times


@pytest.fixture(scope="module")
Expand Down
13 changes: 9 additions & 4 deletions composeml/label_maker.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def __str__(self):
class LabelMaker:
"""Automatically makes labels for prediction problems."""

def __init__(self, target_entity, time_index, labeling_function, window_size=None):
def __init__(self, target_entity, time_index, labeling_function, window_size=None, label_type=None):
"""Creates an instance of label maker.
Args:
Expand Down Expand Up @@ -285,6 +285,7 @@ def search(self,
minimum_data=None,
gap=None,
drop_empty=True,
label_type=None,
verbose=True,
*args,
**kwargs):
Expand All @@ -297,6 +298,7 @@ def search(self,
gap (str or int) : Time between examples. Default value is window size.
If an integer, search will start on the first event after the minimum data.
drop_empty (bool) : Whether to drop empty slices. Default value is True.
label_type (str) : The label type can be "continuous" or "discrete". Default value is the inferred label type.
verbose (bool) : Whether to render progress bar. Default value is True.
*args : Positional arguments for labeling function.
**kwargs : Keyword arguments for labeling function.
Expand Down Expand Up @@ -353,16 +355,19 @@ def search(self,
progress_bar.update(n=total)
progress_bar.close()

labels = LabelTimes(data=labels, name=name, target_entity=self.target_entity)
labels = LabelTimes(data=labels, name=name, target_entity=self.target_entity, label_type=label_type)
labels = labels.rename_axis('id', axis=0)
labels = labels._with_plots()

if labels.empty:
return labels

if labels.is_discrete:
labels[labels.name] = labels[labels.name].astype('category')

labels.settings.update({
'labeling_function': name,
'num_examples_per_instance': num_examples_per_instance,
'minimum_data': minimum_data,
'minimum_data': str(minimum_data),
'window_size': self.window_size,
'gap': gap,
})
Expand Down
88 changes: 88 additions & 0 deletions composeml/label_plots.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import matplotlib as mpl
import pandas as pd
import seaborn as sns

# Module-level setup: these calls run once at import time and change global
# pandas/seaborn plotting state for the whole process.
# Registers pandas' datetime converters with matplotlib (per the pandas API).
pd.plotting.register_matplotlib_converters()
# Seaborn context and style applied to every plot drawn after this import.
sns.set_context('notebook')
sns.set_style('darkgrid')
# Shared palette for all label plots: 100 colors from "Set1", desaturated to 75%.
COLOR = sns.color_palette("Set1", n_colors=100, desat=.75)


class LabelPlots:
    """Creates plots for Label Times.

    An instance wraps one label times object and reads its ``name``,
    ``is_discrete`` and ``count_by_time`` attributes when drawing.
    """

    def __init__(self, label_times):
        """Initializes Label Plots.

        Args:
            label_times (LabelTimes) : instance of Label Times
        """
        self._label_times = label_times

    def count_by_time(self, ax=None, **kwargs):
        """Plots the label distribution across cutoff times.

        When the counts are two-dimensional (one column per label value) a
        stacked area chart with a legend is drawn. Otherwise a single filled
        area is drawn.

        Args:
            ax (matplotlib.axes.Axes) : Axes to draw on. A new axes is created when omitted.
            **kwargs : Keyword arguments forwarded to ``Axes.stackplot`` (two-dimensional
                counts) or ``Axes.fill_between`` (one-dimensional counts). They override
                the default colors and alpha used by this method.

        Returns:
            matplotlib.axes.Axes : The axes containing the plot.
        """
        # Import the submodule explicitly: ``import matplotlib`` alone does not
        # guarantee that ``matplotlib.dates`` has been loaded.
        import matplotlib.dates as mdates

        # Sort a copy instead of sorting in place, so the object returned by
        # the label times is never mutated by plotting.
        count_by_time = self._label_times.count_by_time.sort_index()

        if ax is None:
            # Only pull in pyplot when no axes was supplied by the caller.
            import matplotlib.pyplot as plt
            ax = plt.axes()

        vmin = count_by_time.index.min()
        vmax = count_by_time.index.max()
        ax.set_xlim(vmin, vmax)

        locator = mdates.AutoDateLocator()
        formatter = mdates.AutoDateFormatter(locator)
        ax.xaxis.set_major_locator(locator)
        ax.xaxis.set_major_formatter(formatter)
        ax.figure.autofmt_xdate()

        if count_by_time.ndim > 1:
            # Defaults come first so caller-supplied kwargs win instead of
            # raising a duplicate-keyword error.
            options = {'colors': COLOR, 'alpha': .9, **kwargs}
            ax.stackplot(
                count_by_time.index,
                count_by_time.values.T,
                labels=count_by_time.columns,
                **options,
            )

            ax.legend(
                loc='upper left',
                title=self._label_times.name,
                facecolor='w',
                framealpha=.9,
            )

            ax.set_title('Label Count vs. Cutoff Times')
            ax.set_ylabel('Count')
            ax.set_xlabel('Time')

        else:
            # Forward kwargs here as well; previously they were accepted but
            # silently ignored for one-dimensional counts.
            options = {'color': COLOR[1], **kwargs}
            ax.fill_between(
                count_by_time.index,
                count_by_time.values,
                **options,
            )

            ax.set_title('Label vs. Cutoff Times')
            ax.set_ylabel(self._label_times.name)
            ax.set_xlabel('Time')

        return ax

    @property
    def dist(self):
        """Alias for distribution.

        Returns the bound ``distribution`` method, so ``plot.dist(...)`` is
        equivalent to ``plot.distribution(...)``.
        """
        return self.distribution

    def distribution(self, **kwargs):
        """Plots the label distribution.

        Discrete labels are drawn with ``seaborn.countplot``; other labels are
        drawn with ``seaborn.distplot`` including a kernel density estimate.

        Args:
            **kwargs : Keyword arguments forwarded to the seaborn plotting function.

        Returns:
            matplotlib.axes.Axes : The axes containing the plot.
        """
        label_times = self._label_times
        dist = label_times[label_times.name]

        if label_times.is_discrete:
            ax = sns.countplot(dist, palette=COLOR, **kwargs)
            ylabel = 'Count'
        else:
            ax = sns.distplot(dist, kde=True, color=COLOR[1], **kwargs)
            # With a KDE the histogram is normalized, so the y-axis shows a
            # density rather than raw counts.
            ylabel = 'Density'

        ax.set_title('Label Distribution')
        ax.set_ylabel(ylabel)
        return ax
121 changes: 78 additions & 43 deletions composeml/label_times.py
Original file line number Diff line number Diff line change
@@ -1,73 +1,87 @@
import pandas as pd

from composeml.label_plots import LabelPlots


class LabelTimes(pd.DataFrame):
"""
A data frame containing labels made by a label maker.
"""A data frame containing labels made by a label maker.
Attributes:
name
target_entity
transforms
"""
_metadata = ['name', 'target_entity', 'settings', 'transforms']

def __init__(self, data=None, name=None, target_entity=None, settings=None, transforms=None, *args, **kwargs):
_metadata = ['name', 'target_entity', 'settings', 'transforms', 'label_type']

def __init__(self,
data=None,
name=None,
target_entity=None,
settings=None,
transforms=None,
label_type=None,
*args,
**kwargs):
super().__init__(data=data, *args, **kwargs)

self.name = name
self.target_entity = target_entity
self.settings = settings or {}
self.transforms = transforms or []
self.plot = LabelPlots(self)

if label_type is not None:
error = 'label type must be "continuous" or "discrete"'
assert label_type in ['continuous', 'discrete'], error

self.label_type = label_type
self.settings = settings or {}
self.settings['label_type'] = self.label_type

@property
def _constructor(self):
return LabelTimes

@property
def distribution(self):
labels = self.assign(count=1)
labels = labels.groupby(self.name)
distribution = labels['count'].count()
return distribution
def is_discrete(self):
"""Whether labels are discrete."""
if self.label_type is None:
self.label_type = self.infer_type()
self.settings['label_type'] = self.label_type

def _plot_distribution(self, **kwargs):
plot = self.distribution.plot(kind='bar', **kwargs)
plot.set_title('Label Distribution')
plot.set_ylabel('count')
return plot
return self.label_type == 'discrete'

@property
def distribution(self):
"""Returns label distribution if labels are discrete."""
if self.is_discrete:
labels = self.assign(count=1)
labels = labels.groupby(self.name)
distribution = labels['count'].count()
return distribution

@property
def count_by_time(self):
count = self.assign(count=1)
count = count.sort_values('cutoff_time')
count = count.set_index([self.name, 'cutoff_time'])
count = count.groupby(self.name)
count = count['count'].cumsum()
return count

def _plot_count_by_time(self, **kwargs):
count = self.count_by_time
count = count.unstack(self.name)
count = count.ffill()

plot = count.plot(kind='area', **kwargs)
plot.set_title('Label Count vs. Time')
plot.set_ylabel('count')
return plot

def _with_plots(self):
self.plot.count_by_time = self._plot_count_by_time
self.plot.distribution = self._plot_distribution
return self
"""Returns label count across cutoff times."""
if self.is_discrete:
keys = ['cutoff_time', self.name]
value = self.groupby(keys).cutoff_time.count()
value = value.unstack(self.name).fillna(0)
value = value.cumsum()
return value
else:
value = self.groupby('cutoff_time')
value = value[self.name].count()
value = value.cumsum()
return value

def describe(self):
"""Prints out label info with transform settings that reproduce labels."""
print('Label Distribution\n' + '-' * 18, end='\n')
distribution = self[self.name].value_counts()
distribution.index = distribution.index.astype('str')
distribution['Total:'] = distribution.sum()
print(distribution.to_string(), end='\n\n\n')
if self.is_discrete:
print('Label Distribution\n' + '-' * 18, end='\n')
distribution = self[self.name].value_counts()
distribution.index = distribution.index.astype('str')
distribution['Total:'] = distribution.sum()
print(distribution.to_string(), end='\n\n\n')

print('Settings\n' + '-' * 8, end='\n')
settings = pd.Series(self.settings)
Expand Down Expand Up @@ -99,7 +113,7 @@ def copy(self):
"""
labels = super().copy()
labels.transforms = labels.transforms.copy()
return labels._with_plots()
return labels

def threshold(self, value, inplace=False):
"""
Expand All @@ -115,6 +129,9 @@ def threshold(self, value, inplace=False):
labels = self if inplace else self.copy()
labels[self.name] = labels[self.name].gt(value)

labels.label_type = 'discrete'
labels.settings['label_type'] = 'discrete'

transform = {'__name__': 'threshold', 'value': value}
labels.transforms.append(transform)

Expand Down Expand Up @@ -225,6 +242,8 @@ def bin(self, bins, quantiles=False, labels=None, right=True):
}

label_times.transforms.append(transform)
label_times.label_type = 'discrete'
label_times.settings['label_type'] = 'discrete'
return label_times

def sample(self, n=None, frac=None, random_state=None):
Expand Down Expand Up @@ -318,3 +337,19 @@ def sample(self, n=None, frac=None, random_state=None):

labels = pd.concat(sample_per_label, axis=0, sort=False)
return labels

def infer_type(self):
    """Infer label type.

    The label column (``self[self.name]``) is treated as discrete when its
    dtype is boolean, categorical, or object. Any other dtype is treated as
    continuous.

    Returns:
        str : Inferred label type. Either "continuous" or "discrete".
    """
    dtype = self[self.name].dtype

    # The isinstance check replaces ``pd.api.types.is_categorical_dtype``,
    # which is deprecated in recent pandas releases and warns on every call.
    is_discrete = (
        pd.api.types.is_bool_dtype(dtype)
        or isinstance(dtype, pd.api.types.CategoricalDtype)
        or pd.api.types.is_object_dtype(dtype)
    )

    return 'discrete' if is_discrete else 'continuous'
6 changes: 6 additions & 0 deletions composeml/tests/test_label_maker.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,3 +421,9 @@ def test_slice_overlap(transactions):
start, end = df.context.window
is_overlap = df.index == end
assert not is_overlap.any()


def test_label_type(transactions):
    """The label type passed to ``search`` is kept on the returned label times."""
    label_maker = LabelMaker(
        target_entity='customer_id',
        time_index='time',
        labeling_function=total_spent,
    )
    label_times = label_maker.search(
        transactions,
        num_examples_per_instance=1,
        label_type='discrete',
        verbose=False,
    )
    assert label_times.label_type == 'discrete'
27 changes: 19 additions & 8 deletions composeml/tests/test_label_plots.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,21 @@
def test_distribution_plot(labels):
labels = labels.threshold(200)
plot = labels.plot.distribution()
assert plot.get_title() == 'Label Distribution'
def test_count_by_time_categorical(total_spent):
    """Binned labels are plotted with the label-count title."""
    binned = total_spent.bin(2, labels=range(2))
    axes = binned.plot.count_by_time()
    assert axes.get_title() == 'Label Count vs. Cutoff Times'


def test_count_by_time_plot(labels):
labels = labels.threshold(200)
plot = labels.plot.count_by_time()
assert plot.get_title() == 'Label Count vs. Time'
def test_count_by_time_continuous(total_spent):
    """Unbinned labels are plotted with the plain label title."""
    axes = total_spent.plot.count_by_time()
    title = axes.get_title()
    assert title == 'Label vs. Cutoff Times'


def test_distribution_categorical(total_spent):
    """The distribution plot of binned labels carries the expected title."""
    binned = total_spent.bin(2, labels=range(2))
    axes = binned.plot.dist()
    assert axes.get_title() == 'Label Distribution'


def test_distribution_continuous(total_spent):
    """The distribution plot of unbinned labels carries the expected title."""
    axes = total_spent.plot.dist()
    title = axes.get_title()
    assert title == 'Label Distribution'
Loading

0 comments on commit 50e9f90

Please sign in to comment.