Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .ci/linux-steps.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ steps:
displayName: 'Build models'

# Run CTests.
- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -R UtilsTest
- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test .
displayName: 'Run tests via ctest'

# Publish test results to Azure Pipelines
Expand Down
2 changes: 1 addition & 1 deletion .ci/macos-steps.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ steps:
displayName: 'Build models'

# Run CTests.
- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -R UtilsTest
- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test .
displayName: 'Run tests via ctest'

# Publish test results to Azure Pipelines
Expand Down
2 changes: 1 addition & 1 deletion .ci/windows-steps.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ steps:
# Run tests via ctest.
- bash: |
cd build/tests
CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test -C Release -R UtilsTest
CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test -C Release .
displayName: 'Run tests via ctest'

# Publish test results to Azure Pipelines
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,8 @@ xcode*
.idea
cmake-build-*
*.csv
*.tar
*.zip
*.tar.gz
.travis/configs.hpp
Testing/*
36 changes: 28 additions & 8 deletions dataloader/dataloader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,14 @@ class DataLoader
*
* @param datasetPath Path or name of dataset.
* @param shuffle whether or not to shuffle the data.
* @param ratio Ratio for train-test split.
* @param validRatio Ratio of dataset to be used for validation set.
* @param useScaler Use feature scaler for pre-processing the dataset.
* @param augmentation Adds augmentation to training data only.
* @param augmentationProbability Probability of applying augmentation on dataset.
*/
DataLoader(const std::string& dataset,
const bool shuffle,
const double ratio = 0.75,
const double validRatio = 0.25,
const bool useScaler = true,
const std::vector<std::string> augmentation =
std::vector<std::string>(),
Expand All @@ -85,7 +85,7 @@ class DataLoader
* Note: This option sets augmentation to NULL, sets ratio to 1, and
* scaler will be used to only transform the test data.
* @param shuffle Boolean to determine whether or not to shuffle the data.
* @param ratio Ratio for train-test split.
* @param validRatio Ratio of dataset to be used for validation set.
* @param useScaler Fits the scaler on training data and transforms dataset.
* @param dropHeader Drops the first row from CSV.
* @param startInputFeatures First Index which will be fed into the model as input.
Expand All @@ -106,7 +106,7 @@ class DataLoader
void LoadCSV(const std::string& datasetPath,
const bool loadTrainData = true,
const bool shuffle = true,
const double ratio = 0.75,
const double validRatio = 0.25,
const bool useScaler = false,
const bool dropHeader = false,
const int startInputFeatures = -1,
Expand Down Expand Up @@ -179,11 +179,30 @@ class DataLoader
*/
void DownloadDataset(const std::string& dataset)
{
if (datasetMap[dataset].zipFile && (!Utils::PathExists(
datasetMap[dataset].trainPath) ||
!Utils::PathExists(datasetMap[dataset].testPath)))
{
Utils::DownloadFile(datasetMap[dataset].datasetURL,
datasetMap[dataset].datasetPath, dataset + "_training_data.",
false, false, datasetMap[dataset].serverName,
datasetMap[dataset].zipFile);

if (!Utils::CompareCRC32(datasetMap[dataset].datasetPath,
datasetMap[dataset].datasetHash))
{
mlpack::Log::Fatal << "Corrupted Data for " << dataset <<
" downloaded." << std::endl;
}

return;
}

if (!Utils::PathExists(datasetMap[dataset].trainPath))
{
Utils::DownloadFile(datasetMap[dataset].trainDownloadUrl,
Utils::DownloadFile(datasetMap[dataset].trainDownloadURL,
datasetMap[dataset].trainPath, dataset + "_training_data.",
false);
false, false, datasetMap[dataset].serverName);

if (!Utils::CompareCRC32(datasetMap[dataset].trainPath,
datasetMap[dataset].trainHash))
Expand All @@ -192,11 +211,12 @@ class DataLoader
dataset << " downloaded." << std::endl;
}
}

if (!Utils::PathExists(datasetMap[dataset].testPath))
{
Utils::DownloadFile(datasetMap[dataset].trainDownloadUrl,
Utils::DownloadFile(datasetMap[dataset].trainDownloadURL,
datasetMap[dataset].testPath, dataset + "_testing_data.",
false);
false, false, datasetMap[dataset].serverName);

if (!Utils::CompareCRC32(datasetMap[dataset].testPath,
datasetMap[dataset].testHash))
Expand Down
32 changes: 16 additions & 16 deletions dataloader/dataloader_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ template<
DatasetX, DatasetY, ScalerType
>::DataLoader(const std::string& dataset,
const bool shuffle,
const double ratio,
const double validRatio,
const bool useScaler,
const std::vector<std::string> augmentation,
const double augmentationProbability)
Expand All @@ -49,14 +49,14 @@ template<

if (datasetMap[dataset].loadCSV)
{
LoadCSV(datasetMap[dataset].trainPath, true, shuffle, ratio, useScaler,
datasetMap[dataset].dropHeader,
LoadCSV(datasetMap[dataset].trainPath, true, shuffle, validRatio,
useScaler, datasetMap[dataset].dropHeader,
datasetMap[dataset].startTrainingInputFeatures,
datasetMap[dataset].endTrainingInputFeatures,
datasetMap[dataset].endTrainingPredictionFeatures,
datasetMap[dataset].endTrainingPredictionFeatures);

LoadCSV(datasetMap[dataset].testPath, false, false, useScaler,
LoadCSV(datasetMap[dataset].testPath, false, false, validRatio, useScaler,
datasetMap[dataset].dropHeader,
datasetMap[dataset].startTestingInputFeatures,
datasetMap[dataset].endTestingInputFeatures);
Expand Down Expand Up @@ -85,7 +85,7 @@ template<
>::LoadCSV(const std::string& datasetPath,
const bool loadTrainData,
const bool shuffle,
const double ratio,
const double validRatio,
const bool useScaler,
const bool dropHeader,
const int startInputFeatures,
Expand All @@ -104,14 +104,7 @@ template<
if (loadTrainData)
{
arma::mat trainDataset, validDataset;
data::Split(dataset, trainDataset, validDataset, ratio, shuffle);

if (useScaler)
{
scaler.Fit(trainDataset);
scaler.Transform(trainDataset, trainDataset);
scaler.Transform(validDataset, validDataset);
}
data::Split(dataset, trainDataset, validDataset, validRatio, shuffle);

trainFeatures = trainDataset.rows(WrapIndex(startInputFeatures,
trainDataset.n_rows), WrapIndex(endInputFeatures,
Expand All @@ -125,10 +118,16 @@ template<
validDataset.n_rows), WrapIndex(endInputFeatures,
validDataset.n_rows));

validLabels = trainDataset.rows(WrapIndex(startPredictionFeatures,
validLabels = validDataset.rows(WrapIndex(startPredictionFeatures,
validDataset.n_rows), WrapIndex(endPredictionFeatures,
validDataset.n_rows));

if (useScaler)
{
scaler.Fit(trainFeatures);
scaler.Transform(trainFeatures, trainFeatures);
scaler.Transform(validFeatures, validFeatures);
}
// TODO : Add support for augmentation here.
mlpack::Log::Info << "Training Dataset Loaded." << std::endl;
}
Expand All @@ -139,8 +138,9 @@ template<
scaler.Transform(dataset, dataset);
}

testFeatures = dataset.submat(WrapIndex(startInputFeatures, dataset.n_rows),
0, WrapIndex(endInputFeatures, dataset.n_rows), dataset.n_cols - 1);
testFeatures = dataset.rows(WrapIndex(startInputFeatures, dataset.n_rows),
WrapIndex(endInputFeatures, dataset.n_rows));

mlpack::Log::Info << "Testing Dataset Loaded." << std::endl;
}
}
Expand Down
130 changes: 115 additions & 15 deletions dataloader/datasets.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,47 @@ template<
>
struct DatasetDetails
{
//! Locally stored name of dataset used for identification
//! during dataloader call.
std::string datasetName;
std::string trainDownloadUrl;
std::string testDownloadUrl;

//! Locally stored URL for downloading training data.
std::string trainDownloadURL;

//! Locally stored URL for downloading testing data.
std::string testDownloadURL;

//! CRC-32 checksum for training data file.
std::string trainHash;

//! CRC-32 checksum for testing data file.
std::string testHash;

//! Locally stored boolean to determine if dataset is of CSV or similar
//! format.
bool loadCSV;

//! Locally stored path to file / directory for training data.
std::string trainPath;

//! Locally stored path to file / directory for testing data.
std::string testPath;

//! Locally held boolean to determine whether dataset will be in zip format.
bool zipFile;

//! Locally stored URL for downloading dataset.
std::string datasetURL;

//! Locally stored CRC-32 checksum for the dataset.
std::string datasetHash;

//! Locally stored path for saving the archived / zip dataset.
std::string datasetPath;

//! Locally stored server name for download file.
std::string serverName;

// Pre-Process functor.
std::function<void(DatasetX&, DatasetY&,
DatasetX&, DatasetY&, DatasetX&)> PreProcess;
Expand All @@ -61,13 +93,18 @@ struct DatasetDetails
// Default constructor.
DatasetDetails() :
datasetName(""),
trainDownloadUrl(""),
testDownloadUrl(""),
trainDownloadURL(""),
testDownloadURL(""),
trainHash(""),
testHash(""),
loadCSV(false),
trainPath(""),
testPath(""),
zipFile(false),
datasetURL(""),
datasetPath(""),
datasetHash(""),
serverName("www.mlpack.org"),
startTrainingInputFeatures(0),
endTrainingInputFeatures(0),
startTrainingPredictionFeatures(0),
Expand All @@ -77,23 +114,85 @@ struct DatasetDetails
dropHeader(false)
{/* Nothing to do here. */}

// Constructor for initializing object.
/**
* Constructor for initializing an object with separate
* train and test download URLs.
*
* @param datasetName Name of dataset used for identification during
* dataloader call.
* @param trainDownloadURL URL for downloading training data.
* @param testDownloadURL URL for downloading testing data.
* @param trainHash CRC-32 checksum for training data.
* @param testHash CRC-32 checksum for testing data.
* @param loadCSV Determines if the format of dataset is similar to CSV.
* @param trainPath Path for training dataset.
* @param testPath Path for testing dataset.
*/
DatasetDetails(const std::string& datasetName,
const std::string& trainDownloadUrl,
const std::string& testDownloadUrl,
const std::string& trainDownloadURL,
const std::string& testDownloadURL,
const std::string& trainHash,
const std::string& testHash,
const bool loadCSV,
const std::string& trainPath,
const std::string& testPath) :
datasetName(datasetName),
trainDownloadUrl(trainDownloadUrl),
testDownloadUrl(testDownloadUrl),
trainDownloadURL(trainDownloadURL),
testDownloadURL(testDownloadURL),
trainHash(trainHash),
testHash(testHash),
loadCSV(loadCSV),
trainPath(trainPath),
testPath(testPath),
zipFile(false),
datasetURL(""),
datasetHash(""),
serverName("www.mlpack.org"),
startTrainingInputFeatures(0),
endTrainingInputFeatures(0),
startTrainingPredictionFeatures(0),
endTrainingPredictionFeatures(0),
startTestingInputFeatures(0),
endTestingInputFeatures(0),
dropHeader(false)
{
// Nothing to do here.
}

/**
* Constructor for initializing paths for zip files.
*
* @param datasetName Name of dataset used for identification during
* dataloader call.
* @param zipFile Boolean to determine if dataset is stored in zip format.
* @param datasetURL URL for downloading dataset.
* @param datasetPath Path where the dataset will be downloaded.
* @param datasetHash CRC-32 checksum for dataset.
* @param loadCSV Determines if the format of dataset is similar to CSV.
* @param trainPath Path for training dataset.
* @param testPath Path for testing dataset.
*/
DatasetDetails(const std::string& datasetName,
const bool zipFile,
const std::string& datasetURL,
const std::string& datasetPath,
const std::string& datasetHash,
const bool loadCSV,
const std::string& trainPath,
const std::string& testPath) :
datasetName(datasetName),
zipFile(zipFile),
datasetURL(datasetURL),
datasetHash(datasetHash),
datasetPath(datasetPath),
loadCSV(loadCSV),
trainPath(trainPath),
testPath(testPath),
trainDownloadURL(""),
testDownloadURL(""),
trainHash(""),
testHash(""),
serverName("www.mlpack.org"),
startTrainingInputFeatures(0),
endTrainingInputFeatures(0),
startTrainingPredictionFeatures(0),
Expand All @@ -119,17 +218,18 @@ template<
class Datasets
{
public:
//! Get details of MNIST Dataset.
const static DatasetDetails<DatasetX, DatasetY> MNIST()
{
DatasetDetails<DatasetX, DatasetY> mnistDetails(
"mnist",
"/datasets/mnist_train.csv",
"/datasets/mnist_test.csv",
"772495e3",
"8bcdb7e1",
true,
"./../data/mnist_train.csv",
"./../data/mnist_test.csv");
"/datasets/mnist.tar.gz",
"./../data/mnist.tar.gz",
"33470ca3",
true,
"./../data/mnist-dataset/mnist_train.csv",
"./../data/mnist-dataset/mnist_test.csv");

// Set the Pre-Processor Function.
mnistDetails.PreProcess = PreProcessor<DatasetX, DatasetY>::MNIST;
Expand Down
Loading