
Commit a787707

move from unitests to pytests
1 parent 516b4c6 commit a787707

4 files changed (+66 lines, -33 lines)

fsspark/fs/fdataframe.py

Lines changed: 2 additions & 2 deletions
@@ -1,10 +1,10 @@
 import logging
-from typing import Optional, Union, List, Set, Tuple
+from typing import List, Tuple
 
 import numpy
 import numpy as np
 import pandas as pd
-from pandas import DataFrame, Series
+from pandas import DataFrame
 from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler, RobustScaler, LabelEncoder
 
 logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")

fsspark/tests/generate_big_tests.py

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+import logging
+
+import pandas as pd
+import numpy as np
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+def test_generate_big_dataset():
+    # Parameters for the dataset
+    n_samples = 1200
+    n_features = 10_000
+    chunk_size = 100  # Adjust chunk size for memory efficiency
+
+    # Generate sample IDs and labels
+    sample_ids = np.arange(1, n_samples + 1)
+    labels = np.random.choice(['LV', 'RV', 'LA', 'RA'], size=n_samples)
+
+    # Parquet schema definition
+    schema = pa.schema([pa.field('sample_id', pa.int32()), pa.field('label', pa.string())] +
+                       [pa.field(f'feature{i}', pa.float32()) for i in range(1, n_features + 1)])
+
+    # Create an empty Parquet file
+    output_file = 'large_dataset_optimized_samples_{}_features_{}.parquet'.format(n_samples, n_features)
+    with pq.ParquetWriter(output_file, schema, compression='snappy') as writer:
+        # Process in chunks to reduce memory usage
+        for chunk_start in range(0, n_samples, chunk_size):
+            chunk_end = min(chunk_start + chunk_size, n_samples)
+
+            # Generate chunk of samples and labels
+            chunk_sample_ids = sample_ids[chunk_start:chunk_end]
+            chunk_labels = labels[chunk_start:chunk_end]
+
+            # Generate chunk of features
+            chunk_features = {f'feature{i}': np.random.rand(chunk_end - chunk_start) for i in range(1, n_features + 1)}
+
+            # Create DataFrame chunk
+            chunk_data = {
+                'sample_id': chunk_sample_ids,
+                'label': chunk_labels
+            }
+            chunk_data.update(chunk_features)
+
+            df_chunk = pd.DataFrame(chunk_data)
+
+            # Convert to PyArrow Table and write chunk to Parquet file
+            table_chunk = pa.Table.from_pandas(df_chunk, schema=schema)
+            writer.write_table(table_chunk)
+            logging.info(f'Processed samples {chunk_start + 1} to {chunk_end}')
+
+    print("Optimized Parquet file created successfully!")
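
As context for the chunked-write approach above (not part of this commit), the generated file can be spot-checked without loading all 10,000 feature columns at once by reading it back in record batches. A minimal sketch, assuming the default n_samples=1200 and n_features=10_000 used in the new test:

import pyarrow.parquet as pq

# Sketch only (not in the commit): read the generated Parquet file back in
# batches and check the metadata written by test_generate_big_dataset().
pf = pq.ParquetFile('large_dataset_optimized_samples_1200_features_10000.parquet')
assert pf.metadata.num_rows == 1200          # n_samples
assert pf.metadata.num_columns == 10_002     # sample_id + label + 10_000 features

# iter_batches streams row batches, optionally restricted to a few columns.
for batch in pf.iter_batches(batch_size=100, columns=['sample_id', 'label']):
    labels = batch.to_pandas()['label']
    assert labels.isin(['LV', 'RV', 'LA', 'RA']).all()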

fsspark/tests/test_fsdataframe.py

Lines changed: 0 additions & 1 deletion
@@ -1,4 +1,3 @@
-import pytest
 import pandas as pd
 from fsspark.fs.fdataframe import FSDataFrame
 

Lines changed: 13 additions & 30 deletions
@@ -1,44 +1,27 @@
-import unittest
-
 import pandas as pd
 from fsspark.utils.datasets import get_tnbc_data_path
 from fsspark.fs.fdataframe import FSDataFrame
 
 from fsspark.fs.univariate import univariate_filter
 
-
-class UnivariateMethodsTest(unittest.TestCase):
+def test_univariate_filter_corr():
     """
-    Define testing methods for FSDataFrame class.
+    Test univariate_filter method with 'u_corr' method.
+    :return: None
     """
 
-    def setUp(self) -> None:
-        # import tsv as pandas DataFrame
-        self.df = pd.read_csv(get_tnbc_data_path(), sep='\t')
-
-        # create FSDataFrame instance
-        self.fsdf = FSDataFrame(df=self.df,
-                                sample_col='Sample',
-                                label_col='label')
-
-    def tearDown(self) -> None:
-        pass
+    # import tsv as pandas DataFrame
+    df = pd.read_csv(get_tnbc_data_path(), sep='\t')
 
-    def test_univariate_filter_corr(self):
-        """
-        Test univariate_filter method with 'u_corr' method.
-        :return: None
-        """
+    # create FSDataFrame instance
+    fs_df = FSDataFrame(df=df,sample_col='Sample',label_col='label')
 
-        fsdf = self.fsdf
-        fsdf_filtered = univariate_filter(fsdf,
-                                          univariate_method='u_corr',
-                                          corr_threshold=0.3)
+    fsdf_filtered = univariate_filter(fs_df,univariate_method='u_corr', corr_threshold=0.3)
 
-        self.assertEqual(fsdf.count_features(), 500)
-        self.assertEqual(fsdf_filtered.count_features(), 211)
+    assert fs_df.count_features() == 500
+    assert fsdf_filtered.count_features() == 211
 
-        # Export the filtered DataFrame as Pandas DataFrame
-        df_filtered = fsdf_filtered.to_pandas()
-        df_filtered.to_csv('filtered_tnbc_data.csv', index=False)
+    # Export the filtered DataFrame as Pandas DataFrame
+    df_filtered = fsdf_filtered.to_pandas()
+    df_filtered.to_csv('filtered_tnbc_data.csv', index=False)
 
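A side note on the migration pattern (not part of this commit): the change inlines the old setUp body into the test function, which is the simplest unittest-to-pytest conversion. The same setup could instead be shared across several tests through a pytest fixture; a hypothetical sketch reusing the imports and values from the diff above:

import pandas as pd
import pytest

from fsspark.utils.datasets import get_tnbc_data_path
from fsspark.fs.fdataframe import FSDataFrame


@pytest.fixture
def fs_df():
    # Plays the role of the removed unittest setUp: one FSDataFrame per test.
    df = pd.read_csv(get_tnbc_data_path(), sep='\t')
    return FSDataFrame(df=df, sample_col='Sample', label_col='label')


def test_count_features(fs_df):
    # pytest injects the fixture by argument name; plain assert replaces assertEqual.
    assert fs_df.count_features() == 500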
