Skip to content

Commit 5276f84

Browse files
Implementing the logic for Kaggle dataset upload Lambda (#73)
* Logic has been added
* Import has been corrected
1 parent ab7df38 commit 5276f84

File tree

1 file changed

+108
-3
lines changed

1 file changed

+108
-3
lines changed
Lines changed: 108 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,119 @@
1+
import json
12
import logging
2-
from mens_t20i_data_collector._lambdas.utils import exception_handler
3+
import os
4+
import tempfile
5+
import boto3
6+
from mens_t20i_data_collector._lambdas.constants import (
7+
CRICSHEET_DATA_S3_OUTPUT_FOLDER,
8+
DELIVERYWISE_DATA_CSV_FILE_NAME,
9+
MATCHWISE_DATA_CSV_FILE_NAME
10+
)
11+
from mens_t20i_data_collector._lambdas.utils import (
12+
exception_handler,
13+
get_environmental_variable_value
14+
)
315

416
# Set up Logging
517
logger = logging.getLogger(__name__)
618
logger.setLevel(logging.INFO)
719

20+
21+
class KaggleDatasetUploader:

    """Handler to upload dataset to Kaggle.

    Downloads the matchwise and deliverywise CSV files from S3 into a
    temporary folder and publishes them as a new version of the configured
    Kaggle dataset, authenticating via a generated ``kaggle.json`` file.
    """

    def __init__(self):
        # Lambda only permits writes under the system temp dir (/tmp).
        self._temporary_directory = tempfile.gettempdir()
        self._s3_bucket_name = get_environmental_variable_value("DOWNLOAD_BUCKET_NAME")
        self._kaggle_username = get_environmental_variable_value("KAGGLE_USERNAME")
        # Credentials file must exist before the kaggle package is imported,
        # since it reads KAGGLE_CONFIG_DIR at import/authenticate time.
        self._create_kaggle_json_file()
        self._matchwise_data_csv_file_s3_key = f"{CRICSHEET_DATA_S3_OUTPUT_FOLDER}/{MATCHWISE_DATA_CSV_FILE_NAME}"
        self._deliverywise_data_csv_file_s3_key = f"{CRICSHEET_DATA_S3_OUTPUT_FOLDER}/{DELIVERYWISE_DATA_CSV_FILE_NAME}"
        self._folder_to_keep_the_files_to_upload = os.path.join(self._temporary_directory, "files_to_upload_to_kaggle")
        self._s3_client = boto3.client("s3")

    def upload_dataset_to_kaggle(self):
        """
        Uploads the dataset to Kaggle.

        Creates the staging folder, writes the dataset metadata file,
        pulls the CSV files from S3 and pushes a new dataset version.
        """
        os.makedirs(self._folder_to_keep_the_files_to_upload, exist_ok=True)
        self._create_metadata_json_file()
        self._download_dataset_files_from_s3()
        self._authenticate_to_kaggle_and_upload_dataset()

    def _authenticate_to_kaggle_and_upload_dataset(self):
        """
        Authenticates to Kaggle and uploads the dataset.

        Raises:
            Exception: re-raises any failure from the Kaggle client after
                logging it, so the Lambda invocation is marked as failed.
        """
        try:
            logger.info("Authenticating to Kaggle and uploading dataset...")
            # Imported lazily: the kaggle package reads its credentials at
            # import time, so kaggle.json must already be in place.
            from kaggle.api.kaggle_api_extended import \
                KaggleApi  # pylint: disable=import-outside-toplevel
            api = KaggleApi()
            api.authenticate()
            logger.info("Kaggle authentication successful")
            logger.info("Uploading dataset to Kaggle...")
            api.dataset_create_version(
                delete_old_versions=True,
                folder=self._folder_to_keep_the_files_to_upload,
                version_notes="Mens T20I Dataset",
            )
            logger.info("Dataset uploaded to Kaggle successfully")
        except Exception as e:
            # logger.exception records the traceback; lazy %s formatting
            # avoids building the message when the level is disabled.
            logger.exception("Error occurred while uploading dataset to Kaggle: %s", e)
            raise

    def _create_kaggle_json_file(self):
        """
        Creates a kaggle.json file with the username and key for Kaggle API authentication.
        """
        logger.info("Creating kaggle.json file...")
        kaggle_json = {
            "username": self._kaggle_username,
            "key": get_environmental_variable_value("KAGGLE_SECRET_KEY"),
        }
        # os.path.join for consistency with the rest of the class.
        kaggle_json_file_path = os.path.join(self._temporary_directory, "kaggle.json")
        with open(kaggle_json_file_path, "w", encoding="utf-8") as kaggle_json_file:
            json.dump(kaggle_json, kaggle_json_file)
        # The Kaggle client warns unless the credentials file is
        # readable by the owner only.
        os.chmod(kaggle_json_file_path, 0o600)
        os.environ["KAGGLE_CONFIG_DIR"] = self._temporary_directory
        logger.info(f"kaggle.json file created at the temporary path {kaggle_json_file_path}")

    def _create_metadata_json_file(self):
        """
        Creates a metadata.json file with the dataset metadata for Kaggle API.
        """
        logger.info("Creating metadata.json file...")
        metadata = {
            "id": f"{self._kaggle_username}/{get_environmental_variable_value('KAGGLE_DATASET_SLUG')}",
        }
        metadata_file_path = os.path.join(self._folder_to_keep_the_files_to_upload, "dataset-metadata.json")
        with open(metadata_file_path, "w", encoding="utf-8") as metadata_file:
            json.dump(metadata, metadata_file)
        logger.info(f"metadata.json file created at the temporary path {metadata_file_path}")

    def _download_dataset_files_from_s3(self):
        """
        Downloads the dataset files from S3.

        Fetches the matchwise and deliverywise CSVs from the configured
        bucket into the staging folder for upload.
        """
        logger.info("Downloading dataset files from S3...")
        self._s3_client.download_file(
            self._s3_bucket_name,
            self._matchwise_data_csv_file_s3_key,
            os.path.join(self._folder_to_keep_the_files_to_upload, MATCHWISE_DATA_CSV_FILE_NAME)
        )
        self._s3_client.download_file(
            self._s3_bucket_name,
            self._deliverywise_data_csv_file_s3_key,
            os.path.join(self._folder_to_keep_the_files_to_upload, DELIVERYWISE_DATA_CSV_FILE_NAME)
        )
        logger.info("Dataset files downloaded from S3")
110+
111+
8112
@exception_handler  # noqa: Vulture
def handler(_, __):
    """
    Lambda function handler to upload dataset to Kaggle.
    """
    KaggleDatasetUploader().upload_dataset_to_kaggle()
    return "Dataset uploaded to Kaggle successfully"

0 commit comments

Comments
 (0)