-
Notifications
You must be signed in to change notification settings - Fork 47
Seaborn Plots #62
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Seaborn Plots #62
Changes from 1 commit
2c39774
1d3a4f8
e0e31e7
c812d00
76d61fe
93e8996
f5148df
b3de01e
ab0caba
3e6fcd2
9ebecf7
f59cbc6
06e952c
fd6f934
9e37e47
8907710
ab9a1df
107ed6a
a2f1bbc
aa4434e
3f997a7
19ad10e
0a58ff3
e7e45c6
4669167
b4e1a56
e3c1beb
dfe7f80
2fe86ea
8371b75
f182cf8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,9 +11,17 @@ class LabelTimes(pd.DataFrame): | |
target_entity | ||
transforms | ||
""" | ||
_metadata = ['name', 'target_entity', 'settings', 'transforms'] | ||
|
||
def __init__(self, data=None, name=None, target_entity=None, settings=None, transforms=None, *args, **kwargs): | ||
_metadata = ['name', 'target_entity', 'settings', 'transforms', 'label_type'] | ||
|
||
def __init__(self, | ||
data=None, | ||
name=None, | ||
target_entity=None, | ||
settings=None, | ||
transforms=None, | ||
label_type=None, | ||
*args, | ||
**kwargs): | ||
super().__init__(data=data, *args, **kwargs) | ||
|
||
self.name = name | ||
|
@@ -22,20 +30,44 @@ def __init__(self, data=None, name=None, target_entity=None, settings=None, tran | |
self.transforms = transforms or [] | ||
self.plot = LabelPlots(self) | ||
|
||
if label_type is not None: | ||
error = 'label type must be "continuous" or "discrete"' | ||
assert label_type in ['continuous', 'discrete'], error | ||
|
||
if label_type is None and name in self.columns: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. why would There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Came across behavior where pandas would initialize label times without passing value for |
||
label_type = self.infer_type() | ||
|
||
self.label_type = label_type | ||
self.settings['label_type'] = self.label_type | ||
|
||
@property | ||
def _constructor(self): | ||
return LabelTimes | ||
|
||
@property | ||
def is_categorical(self): | ||
"""Whether labels are categorical.""" | ||
def is_discrete(self): | ||
"""Whether labels are discrete.""" | ||
dtype = self[self.name].dtype | ||
return pd.api.types.is_categorical_dtype(dtype) | ||
|
||
is_discrete = pd.api.types.is_bool_dtype(dtype) \ | ||
or pd.api.types.is_categorical_dtype(dtype) \ | ||
or pd.api.types.is_object_dtype(dtype) | ||
|
||
if is_discrete: | ||
return True | ||
|
||
labels = self[self.name].iloc[:100] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. For now, let's only look at the dtype to infer type. We can always add this functionality in later. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Updated function. |
||
is_discrete = labels.nunique() / len(labels) <= .5 | ||
|
||
if is_discrete: | ||
return True | ||
|
||
return False | ||
|
||
@property | ||
def distribution(self): | ||
"""Returns label distribution if labels are discrete.""" | ||
if self.is_categorical: | ||
if self.label_type == 'discrete': | ||
labels = self.assign(count=1) | ||
labels = labels.groupby(self.name) | ||
distribution = labels['count'].count() | ||
|
@@ -44,7 +76,7 @@ def distribution(self): | |
@property | ||
def count_by_time(self): | ||
"""Returns label count across cutoff times.""" | ||
if self.is_categorical: | ||
if self.label_type == 'discrete': | ||
keys = ['cutoff_time', self.name] | ||
value = self.groupby(keys).cutoff_time.count() | ||
value = value.unstack(self.name).fillna(0) | ||
|
@@ -58,7 +90,7 @@ def count_by_time(self): | |
|
||
def describe(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The describe method should say the label type. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should label type be under settings? I guess it is a parameter of search. |
||
"""Prints out label info with transform settings that reproduce labels.""" | ||
if self.is_categorical: | ||
if self.label_type == 'discrete': | ||
print('Label Distribution\n' + '-' * 18, end='\n') | ||
distribution = self[self.name].value_counts() | ||
distribution.index = distribution.index.astype('str') | ||
|
@@ -110,7 +142,9 @@ def threshold(self, value, inplace=False): | |
""" | ||
labels = self if inplace else self.copy() | ||
labels[self.name] = labels[self.name].gt(value) | ||
labels.infer_type() | ||
|
||
labels.label_type = 'discrete' | ||
labels.settings['label_type'] = 'discrete' | ||
|
||
transform = {'__name__': 'threshold', 'value': value} | ||
labels.transforms.append(transform) | ||
|
@@ -222,6 +256,7 @@ def bin(self, bins, quantiles=False, labels=None, right=True): | |
} | ||
|
||
label_times.transforms.append(transform) | ||
label_times.label_type = 'discrete' | ||
return label_times | ||
|
||
def sample(self, n=None, frac=None, random_state=None): | ||
|
@@ -320,16 +355,9 @@ def infer_type(self): | |
"""Infer label type. | ||
|
||
Returns: | ||
LabelTimes : Label Times as inferred type. | ||
str : Inferred label type. Can be "continuous" or "discrete". | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think intuitively I'd expect the logic in then every we currently check There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. updated logic |
||
""" | ||
if self.is_categorical: | ||
return self | ||
|
||
labels = self[self.name].iloc[:100] | ||
is_category_like = pd.api.types.is_bool_dtype(labels.dtype) or pd.api.types.is_object_dtype(labels.dtype) | ||
|
||
if is_category_like or labels.nunique() / len(labels) <= .5: | ||
self[self.name] = self[self.name].astype('category') | ||
return self | ||
|
||
return self | ||
if self.is_discrete: | ||
return 'discrete' | ||
else: | ||
return 'continuous' |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we do this casting before we init the label_times, e.g. above line 330?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Labels are records (list of dictionaries) above line 330. Should I pass records to a pandas data frame to make categorical before initializing label times?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see. It's fine to leave it here, then.