@@ -38,14 +38,12 @@ class FSDataFrame:
38
38
"""
39
39
40
40
def __init__ (
41
- self ,
42
- df : pd .DataFrame ,
43
- sample_col : Optional [str ] = None ,
44
- label_col : Optional [str ] = None ,
45
- sparse_threshold : float = 0.7 , # Threshold for sparsity
46
- memory_threshold : Optional [
47
- float
48
- ] = 0.75 , # Proportion of system memory to use for dense arrays
41
+ self ,
42
+ df : pd .DataFrame ,
43
+ sample_col : Optional [str ] = None ,
44
+ label_col : Optional [str ] = None ,
45
+ sparse_threshold : float = 0.7 , # Threshold for sparsity
46
+ memory_threshold : Optional [float ] = 0.75 , # Proportion of system memory to use for dense arrays
49
47
):
50
48
"""
51
49
Create an instance of FSDataFrame.
@@ -61,21 +59,15 @@ def __init__(
61
59
in the feature matrix exceeds this value, the matrix is stored in a sparse format unless enough memory is available to keep it dense.
62
60
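:param memory_threshold: Proportion of the currently available system memory that a dense matrix may occupy. When the data exceeds the sparsity threshold, it is kept dense only if its size is below this proportion; otherwise a sparse format is used.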
63
61
"""
64
- # TODO: We are loading full data into memory, look for other options. Maybe Dask?
65
- self .__df = df .copy ()
66
-
67
- # Check for necessary columns
68
- columns_to_drop = []
62
+ # Keep a reference to the caller's DataFrame (no copy is made)
63
+ self .__df = df
69
64
70
65
# Handle sample column
71
66
if sample_col :
72
67
if sample_col not in df .columns :
73
- raise ValueError (
74
- f"Sample column '{ sample_col } ' not found in DataFrame."
75
- )
68
+ raise ValueError (f"Sample column '{ sample_col } ' not found in DataFrame." )
76
69
self .__sample_col = sample_col
77
70
self .__samples = df [sample_col ].tolist ()
78
- columns_to_drop .append (sample_col )
79
71
else :
80
72
self .__sample_col = None
81
73
self .__samples = []
@@ -90,55 +82,47 @@ def __init__(
90
82
self .__label_col = label_col
91
83
self .__labels = df [label_col ].tolist ()
92
84
93
- # Encode labels
94
- # TODO: Check if labels are categorical or continuous? For now, assume categorical
85
+ # Encode labels (assume categorical for now)
95
86
label_encoder = LabelEncoder ()
96
87
self .__labels_matrix = label_encoder .fit_transform (df [label_col ]).tolist ()
97
- columns_to_drop .append (label_col )
98
88
99
- # Drop both sample and label columns in one step
100
- self .__df = self .__df .drop (columns = columns_to_drop )
89
+ # Select only numerical columns, excluding sample_col and label_col
90
+ feature_columns = df .select_dtypes (include = [np .number ]).columns .tolist ()
91
+ self .__original_features = [col for col in feature_columns if col not in [sample_col , label_col ]]
101
92
102
- # Extract features
103
- self . __original_features = self .__df . columns . tolist ()
93
+ # Select only the feature columns directly (no drop)
94
+ numerical_df = df [ self .__original_features ]
104
95
105
- # Ensure only numerical features are retained
106
- numerical_df = self .__df .select_dtypes (include = [np .number ])
107
96
if numerical_df .empty :
108
97
raise ValueError ("No numerical features found in the DataFrame." )
109
98
110
- # Check sparsity
99
+ # Calculate sparsity
111
100
num_elements = numerical_df .size
112
- num_zeros = np . count_nonzero (numerical_df == 0 )
101
+ num_zeros = (numerical_df == 0 ). sum (). sum ( )
113
102
sparsity = num_zeros / num_elements
114
103
104
+ # Estimate memory usage
115
105
dense_matrix_size = numerical_df .memory_usage (deep = True ).sum () # In bytes
116
106
available_memory = psutil .virtual_memory ().available # In bytes
117
107
108
+ # Handle sparse or dense matrix based on sparsity and available memory
118
109
if sparsity > sparse_threshold :
119
110
if dense_matrix_size < memory_threshold * available_memory :
120
- # Use dense matrix if enough memory is available
121
111
logging .info (
122
112
f"Data is sparse (sparsity={ sparsity :.2f} ) but enough memory available. "
123
113
f"Using a dense matrix."
124
114
)
125
115
self .__matrix = numerical_df .to_numpy (dtype = np .float32 )
126
116
self .__is_sparse = False
127
117
else :
128
- # Use sparse matrix due to memory constraints
129
118
logging .info (
130
119
f"Data is sparse (sparsity={ sparsity :.2f} ), memory insufficient for dense matrix. "
131
120
f"Using a sparse matrix representation."
132
121
)
133
- self .__matrix = sparse .csr_matrix (
134
- numerical_df .to_numpy (dtype = np .float32 )
135
- )
122
+ self .__matrix = sparse .csr_matrix (numerical_df .to_numpy (dtype = np .float32 ))
136
123
self .__is_sparse = True
137
124
else :
138
- # Use dense matrix since it's not sparse
139
- logging .info (
140
- f"Data is not sparse (sparsity={ sparsity :.2f} ), using a dense matrix."
141
- )
125
+ logging .info (f"Data is not sparse (sparsity={ sparsity :.2f} ), using a dense matrix." )
142
126
self .__matrix = numerical_df .to_numpy (dtype = np .float32 )
143
127
self .__is_sparse = False
144
128
0 commit comments