Commit 07cb771

small changes
1 parent 3909487 commit 07cb771

File tree: 3 files changed (+46, −99 lines)


examples/loom2parquetmerge.py

Lines changed: 0 additions & 62 deletions
This file was deleted.

fslite/fs/fdataframe.py

Lines changed: 21 additions & 37 deletions
@@ -38,14 +38,12 @@ class FSDataFrame:
     """
 
     def __init__(
-        self,
-        df: pd.DataFrame,
-        sample_col: Optional[str] = None,
-        label_col: Optional[str] = None,
-        sparse_threshold: float = 0.7,  # Threshold for sparsity
-        memory_threshold: Optional[
-            float
-        ] = 0.75,  # Proportion of system memory to use for dense arrays
+        self,
+        df: pd.DataFrame,
+        sample_col: Optional[str] = None,
+        label_col: Optional[str] = None,
+        sparse_threshold: float = 0.7,  # Threshold for sparsity
+        memory_threshold: Optional[float] = 0.75,  # Proportion of system memory to use for dense arrays
     ):
         """
         Create an instance of FSDataFrame.
@@ -61,21 +59,15 @@ def __init__(
         in the feature matrix exceeds this value, the matrix is stored in a sparse format unless memory allows.
         :param memory_threshold: Proportion of system memory available to use before deciding on sparse/dense.
         """
-        # TODO: We are loading full data into memory, look for other options. Maybe Dask?
-        self.__df = df.copy()
-
-        # Check for necessary columns
-        columns_to_drop = []
+        # Copy the DataFrame for internal usage
+        self.__df = df
 
         # Handle sample column
         if sample_col:
             if sample_col not in df.columns:
-                raise ValueError(
-                    f"Sample column '{sample_col}' not found in DataFrame."
-                )
+                raise ValueError(f"Sample column '{sample_col}' not found in DataFrame.")
             self.__sample_col = sample_col
             self.__samples = df[sample_col].tolist()
-            columns_to_drop.append(sample_col)
         else:
             self.__sample_col = None
             self.__samples = []
@@ -90,55 +82,47 @@ def __init__(
             self.__label_col = label_col
             self.__labels = df[label_col].tolist()
 
-            # Encode labels
-            # TODO: Check if labels are categorical or continuous? For now, assume categorical
+            # Encode labels (assume categorical for now)
             label_encoder = LabelEncoder()
             self.__labels_matrix = label_encoder.fit_transform(df[label_col]).tolist()
-            columns_to_drop.append(label_col)
 
-        # Drop both sample and label columns in one step
-        self.__df = self.__df.drop(columns=columns_to_drop)
+        # Select only numerical columns, excluding sample_col and label_col
+        feature_columns = df.select_dtypes(include=[np.number]).columns.tolist()
+        self.__original_features = [col for col in feature_columns if col not in [sample_col, label_col]]
 
-        # Extract features
-        self.__original_features = self.__df.columns.tolist()
+        # Select only the feature columns directly (no drop)
+        numerical_df = df[self.__original_features]
 
-        # Ensure only numerical features are retained
-        numerical_df = self.__df.select_dtypes(include=[np.number])
         if numerical_df.empty:
             raise ValueError("No numerical features found in the DataFrame.")
 
-        # Check sparsity
+        # Calculate sparsity
         num_elements = numerical_df.size
-        num_zeros = np.count_nonzero(numerical_df == 0)
+        num_zeros = (numerical_df == 0).sum().sum()
         sparsity = num_zeros / num_elements
 
+        # Estimate memory usage
         dense_matrix_size = numerical_df.memory_usage(deep=True).sum()  # In bytes
         available_memory = psutil.virtual_memory().available  # In bytes
 
+        # Handle sparse or dense matrix based on sparsity and available memory
        if sparsity > sparse_threshold:
             if dense_matrix_size < memory_threshold * available_memory:
-                # Use dense matrix if enough memory is available
                 logging.info(
                     f"Data is sparse (sparsity={sparsity:.2f}) but enough memory available. "
                     f"Using a dense matrix."
                 )
                 self.__matrix = numerical_df.to_numpy(dtype=np.float32)
                 self.__is_sparse = False
             else:
-                # Use sparse matrix due to memory constraints
                 logging.info(
                     f"Data is sparse (sparsity={sparsity:.2f}), memory insufficient for dense matrix. "
                     f"Using a sparse matrix representation."
                 )
-                self.__matrix = sparse.csr_matrix(
-                    numerical_df.to_numpy(dtype=np.float32)
-                )
+                self.__matrix = sparse.csr_matrix(numerical_df.to_numpy(dtype=np.float32))
                 self.__is_sparse = True
         else:
-            # Use dense matrix since it's not sparse
-            logging.info(
-                f"Data is not sparse (sparsity={sparsity:.2f}), using a dense matrix."
-            )
+            logging.info(f"Data is not sparse (sparsity={sparsity:.2f}), using a dense matrix.")
             self.__matrix = numerical_df.to_numpy(dtype=np.float32)
             self.__is_sparse = False
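Note on the new heuristic: the constructor now picks a sparse or dense matrix from two numbers, the fraction of zero cells and the estimated dense size relative to available memory. A minimal standalone sketch of the same decision, assuming the diff's default thresholds (this is an illustration with a hypothetical function name, not fslite's actual API):

import numpy as np
import pandas as pd
import psutil
from scipy import sparse


def choose_matrix(df: pd.DataFrame, sparse_threshold: float = 0.7, memory_threshold: float = 0.75):
    """Return (matrix, is_sparse) using the sparsity/memory heuristic from the diff above."""
    numerical_df = df.select_dtypes(include=[np.number])
    # Fraction of zero entries over all numeric cells
    sparsity = (numerical_df == 0).sum().sum() / numerical_df.size
    dense_bytes = numerical_df.memory_usage(deep=True).sum()  # Estimated dense footprint, in bytes
    available = psutil.virtual_memory().available  # Free system memory, in bytes
    if sparsity > sparse_threshold and dense_bytes >= memory_threshold * available:
        # Mostly zeros and too big for the memory budget: compressed sparse rows
        return sparse.csr_matrix(numerical_df.to_numpy(dtype=np.float32)), True
    # Either not sparse, or sparse but small enough to keep dense
    return numerical_df.to_numpy(dtype=np.float32), False

For example, a frame that is 90% zeros still comes back dense on a machine with plenty of free memory; only the combination of high sparsity and a tight memory budget triggers the csr_matrix path.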

fslite/tests/test_univariate_methods.py

Lines changed: 25 additions & 0 deletions
@@ -1,4 +1,5 @@
 import pandas as pd
+import psutil
 
 from fslite.fs.fdataframe import FSDataFrame
 from fslite.fs.univariate import FSUnivariate
@@ -29,6 +30,30 @@ def test_univariate_filter_corr():
     df_filtered = fsdf_filtered.to_pandas()
     df_filtered.to_csv("filtered_tnbc_data.csv", index=False)
 
+def test_univariate_filter_big_corr():
+    # import tsv as pandas DataFrame
+    df = pd.read_parquet(path="../../examples/GSE156793.parquet")
+    df.drop(columns=["development_day", "assay_id"], inplace=True)
+    print(df.shape[1])
+
+    dense_matrix_size = (df.memory_usage(deep=True).sum() / 1e+6)  # In megabytes
+    available_memory = (psutil.virtual_memory().available / 1e+6)  # In megabytes
+
+    # create FSDataFrame instance
+    fs_df = FSDataFrame(df=df, sample_col="sample_id", label_col="cell_cluster_id")
+
+    # create FSUnivariate instance
+    fs_univariate = FSUnivariate(fs_method="u_corr", selection_threshold=0.3)
+
+    fsdf_filtered = fs_univariate.select_features(fs_df)
+
+    assert fs_df.count_features() == 500
+    assert fsdf_filtered.count_features() == 211
+
+    # Export the filtered DataFrame as Pandas DataFrame
+    df_filtered = fsdf_filtered.to_pandas()
+    df_filtered.to_csv("single_cell_output.csv", index=False)
+
 
 # test the univariate_filter method with 'anova' method
 def test_univariate_filter_anova():
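Since the GSE156793 parquet file is referenced by a relative path outside the test tree, a self-contained variant of the same flow can run on synthetic data. A sketch under that assumption (column names, sizes, and the final check are illustrative; the exact number of features surviving u_corr depends on the data):

import numpy as np
import pandas as pd

from fslite.fs.fdataframe import FSDataFrame
from fslite.fs.univariate import FSUnivariate


def test_univariate_filter_corr_synthetic():
    # Build a small random frame: 100 samples x 50 numeric features
    rng = np.random.default_rng(42)
    df = pd.DataFrame(rng.normal(size=(100, 50)), columns=[f"feat_{i}" for i in range(50)])
    df["sample_id"] = [f"s{i}" for i in range(100)]
    df["label"] = rng.integers(0, 2, size=100)

    fs_df = FSDataFrame(df=df, sample_col="sample_id", label_col="label")
    fs_univariate = FSUnivariate(fs_method="u_corr", selection_threshold=0.3)
    fsdf_filtered = fs_univariate.select_features(fs_df)

    # Filtering can only remove features, never add them
    assert fsdf_filtered.count_features() <= fs_df.count_features()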
