Replies: 1 comment
-
Instead of modifying your df and losing categorical columns like "Make", apply the outlier removal only to the numeric columns:
import pandas as pd
import numpy as np
# Select only numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns
# Compute IQR only for numeric columns
Q1 = df[numeric_cols].quantile(0.25)
Q3 = df[numeric_cols].quantile(0.75)
IQR = Q3 - Q1
# Remove outliers only from numeric columns, keeping categorical ones
df = df[~((df[numeric_cols] < (Q1 - 1.5 * IQR)) | (df[numeric_cols] > (Q3 + 1.5 * IQR))).any(axis=1)]
This keeps categorical columns (like "Make") intact while filtering out the rows that contain numerical outliers.
Beta Was this translation helpful? Give feedback.
0 replies
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
-
Excuse me, as I was executing the cell "
df = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]
df.shape
"
I got a TypeError, shown below. I tried removing the columns whose types are not int or float, but once I removed them I could no longer draw the bar chart by the "Make" category in the next step.
Is there a solution to this problem? Thanks a lot!
TypeError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\pandas\core\internals.py in eval(self, func, other, errors, try_cast, mgr)
1414 with np.errstate(all='ignore'):
-> 1415 result = get_result(other)
1416
~\Anaconda3\lib\site-packages\pandas\core\internals.py in get_result(other)
1382 else:
-> 1383 result = func(values, other)
1384
TypeError: '<' not supported between instances of 'str' and 'float'
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
in
----> 1 df = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]
2 df.shape
~\Anaconda3\lib\site-packages\pandas\core\ops.py in f(self, other)
1618 return _combine_series_frame(self, other, func,
1619 fill_value=None, axis=None,
-> 1620 level=None, try_cast=False)
1621 else:
1622
~\Anaconda3\lib\site-packages\pandas\core\ops.py in _combine_series_frame(self, other, func, fill_value, axis, level, try_cast)
1437 # default axis is columns
1438 return self._combine_match_columns(other, func, level=level,
-> 1439 try_cast=try_cast)
1440
1441
~\Anaconda3\lib\site-packages\pandas\core\frame.py in _combine_match_columns(self, other, func, level, try_cast)
4771 new_data = left._data.eval(func=func, other=right,
4772 axes=[left.columns, self.index],
-> 4773 try_cast=try_cast)
4774 return self._constructor(new_data)
4775
~\Anaconda3\lib\site-packages\pandas\core\internals.py in eval(self, **kwargs)
3685
3686 def eval(self, **kwargs):
-> 3687 return self.apply('eval', **kwargs)
3688
3689 def quantile(self, **kwargs):
~\Anaconda3\lib\site-packages\pandas\core\internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
3579
3580 kwargs['mgr'] = self
-> 3581 applied = getattr(b, f)(**kwargs)
3582 result_blocks = _extend_blocks(applied, result_blocks)
3583
~\Anaconda3\lib\site-packages\pandas\core\internals.py in eval(self, func, other, errors, try_cast, mgr)
1420 raise
1421 except Exception as detail:
-> 1422 result = handle_error()
1423
1424 # technically a broadcast error in numpy can 'work' by returning a
~\Anaconda3\lib\site-packages\pandas\core\internals.py in handle_error()
1403 raise TypeError(
1404 'Could not operate {other!r} with block values '
-> 1405 '{detail!s}'.format(other=other, detail=detail)) # noqa
1406 else:
1407 # return the values
TypeError: Could not operate array([nan, nan, nan, nan]) with block values '<' not supported between instances of 'str' and 'float'
Beta Was this translation helpful? Give feedback.
All reactions