Skip to content

Seaborn Plots #62

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 31 commits into from
Sep 10, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 5 additions & 11 deletions composeml/label_maker.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,25 +327,19 @@ def search(self,
progress_bar.update(n=total)
progress_bar.close()

labels = LabelTimes(data=labels, name=name, target_entity=self.target_entity)
labels = LabelTimes(data=labels, name=name, target_entity=self.target_entity, label_type=label_type)
labels = labels.rename_axis('id', axis=0)

if labels.empty:
return labels

if label_type is not None:
error = 'label type must be "continuous" or "categorical"'
assert label_type in ['continuous', 'categorical'], error

if label_type == 'categorical':
labels[labels.name] = labels[labels.name].astype('category')

else:
labels = labels.infer_type()
if labels.label_type == 'discrete':
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we do this casting before we init the label_times, e.g. above line 330?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Above line 330, the labels are still records (a list of dictionaries). Should I load the records into a pandas data frame and cast the column to categorical before initializing label times?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. It's fine to leave it here, then.

labels[labels.name] = labels[labels.name].astype('category')

labels.settings.update({
'labeling_function': name,
'num_examples_per_instance': num_examples_per_instance,
'minimum_data': minimum_data or 0,
'minimum_data': str(minimum_data),
'window_size': self.window_size,
'gap': gap,
})
Expand Down
2 changes: 1 addition & 1 deletion composeml/label_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def distribution(self, **kwargs):
"""Plots the label distribution."""
dist = self._label_times[self._label_times.name]

if self._label_times.is_categorical:
if self._label_times.label_type == 'discrete':
ax = sns.countplot(dist, palette=COLOR, **kwargs)
else:
ax = sns.distplot(dist, kde=True, color=COLOR[1], **kwargs)
Expand Down
72 changes: 50 additions & 22 deletions composeml/label_times.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,17 @@ class LabelTimes(pd.DataFrame):
target_entity
transforms
"""
_metadata = ['name', 'target_entity', 'settings', 'transforms']

def __init__(self, data=None, name=None, target_entity=None, settings=None, transforms=None, *args, **kwargs):
_metadata = ['name', 'target_entity', 'settings', 'transforms', 'label_type']

def __init__(self,
data=None,
name=None,
target_entity=None,
settings=None,
transforms=None,
label_type=None,
*args,
**kwargs):
super().__init__(data=data, *args, **kwargs)

self.name = name
Expand All @@ -22,20 +30,44 @@ def __init__(self, data=None, name=None, target_entity=None, settings=None, tran
self.transforms = transforms or []
self.plot = LabelPlots(self)

if label_type is not None:
error = 'label type must be "continuous" or "discrete"'
assert label_type in ['continuous', 'discrete'], error

if label_type is None and name in self.columns:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why would name not be in self.columns?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I came across behavior where pandas would initialize label times without passing a value for name. I refactored the logic to infer inside is_discrete only if the label type is None. (link)

label_type = self.infer_type()

self.label_type = label_type
self.settings['label_type'] = self.label_type

@property
def _constructor(self):
return LabelTimes

@property
def is_categorical(self):
"""Whether labels are categorical."""
def is_discrete(self):
"""Whether labels are discrete."""
dtype = self[self.name].dtype
return pd.api.types.is_categorical_dtype(dtype)

is_discrete = pd.api.types.is_bool_dtype(dtype) \
or pd.api.types.is_categorical_dtype(dtype) \
or pd.api.types.is_object_dtype(dtype)

if is_discrete:
return True

labels = self[self.name].iloc[:100]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For now, let's only look at the dtype to infer the type. We can always add this functionality later.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated function

is_discrete = labels.nunique() / len(labels) <= .5

if is_discrete:
return True

return False

@property
def distribution(self):
"""Returns label distribution if labels are discrete."""
if self.is_categorical:
if self.label_type == 'discrete':
labels = self.assign(count=1)
labels = labels.groupby(self.name)
distribution = labels['count'].count()
Expand All @@ -44,7 +76,7 @@ def distribution(self):
@property
def count_by_time(self):
"""Returns label count across cutoff times."""
if self.is_categorical:
if self.label_type == 'discrete':
keys = ['cutoff_time', self.name]
value = self.groupby(keys).cutoff_time.count()
value = value.unstack(self.name).fillna(0)
Expand All @@ -58,7 +90,7 @@ def count_by_time(self):

def describe(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the describe method should say the label type

Copy link
Contributor Author

@jeff-hernandez jeff-hernandez Sep 9, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should label type be under settings? I guess it is a parameter of search.

"""Prints out label info with transform settings that reproduce labels."""
if self.is_categorical:
if self.label_type == 'discrete':
print('Label Distribution\n' + '-' * 18, end='\n')
distribution = self[self.name].value_counts()
distribution.index = distribution.index.astype('str')
Expand Down Expand Up @@ -110,7 +142,9 @@ def threshold(self, value, inplace=False):
"""
labels = self if inplace else self.copy()
labels[self.name] = labels[self.name].gt(value)
labels.infer_type()

labels.label_type = 'discrete'
labels.settings['label_type'] = 'discrete'

transform = {'__name__': 'threshold', 'value': value}
labels.transforms.append(transform)
Expand Down Expand Up @@ -222,6 +256,7 @@ def bin(self, bins, quantiles=False, labels=None, right=True):
}

label_times.transforms.append(transform)
label_times.label_type = 'discrete'
return label_times

def sample(self, n=None, frac=None, random_state=None):
Expand Down Expand Up @@ -320,16 +355,9 @@ def infer_type(self):
"""Infer label type.

Returns:
LabelTimes : Label Times as inferred type.
str : Inferred label type. Can be "continuous" or "discrete".
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intuitively, I'd expect the logic in is_discrete to live here in infer_type, and I'd expect is_discrete to just check whether label_type == "discrete".

Then, everywhere we currently check self.label_type == 'discrete', we'd just replace it with is_discrete.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated logic

"""
if self.is_categorical:
return self

labels = self[self.name].iloc[:100]
is_category_like = pd.api.types.is_bool_dtype(labels.dtype) or pd.api.types.is_object_dtype(labels.dtype)

if is_category_like or labels.nunique() / len(labels) <= .5:
self[self.name] = self[self.name].astype('category')
return self

return self
if self.is_discrete:
return 'discrete'
else:
return 'continuous'
4 changes: 2 additions & 2 deletions composeml/tests/test_label_maker.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,5 +430,5 @@ def test_slice_overlap(transactions):

def test_label_type(transactions):
lm = LabelMaker(target_entity='customer_id', time_index='time', labeling_function=total_spent)
lt = lm.search(transactions, num_examples_per_instance=1, label_type='categorical', verbose=False)
assert lt.is_categorical
lt = lm.search(transactions, num_examples_per_instance=1, label_type='discrete', verbose=False)
assert lt.label_type == 'discrete'
4 changes: 2 additions & 2 deletions composeml/tests/test_label_times.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,5 +70,5 @@ def test_distribution_continous(total_spent):


def test_infer_type(total_spent):
assert total_spent.threshold(5).is_categorical
assert total_spent.bin(2).infer_type().is_categorical
assert total_spent.threshold(5).is_discrete
assert total_spent.bin(2).is_discrete