Commit ab992fd

Implementation of other NNJA sensors (#134)
* nnjai support
* ruff format
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* changes as per requested
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* test_nnjai.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* NNJAI with different tensors
* Refactor: Updated naming for consistency and clarity
* Refactor: data/__init__.py
* pytest fixture implementation
* nnjai_wrapp removal
* removal of nnjai_wrapp.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 251f4db commit ab992fd

4 files changed: +197 −111 lines changed

graph_weather/__init__.py

File mode changed from 100755 to 100644.
Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 """Main import for the complete models"""

-from .data.nnja_ai import AMSUDataset, collate_fn
+from .data.nnja_ai import SensorDataset, collate_fn
 from .models.analysis import GraphWeatherAssimilator
 from .models.forecast import GraphWeatherForecaster

graph_weather/data/__init__.py

File mode changed from 100755 to 100644.
Lines changed: 1 addition & 1 deletion

@@ -1,3 +1,3 @@
 """Dataloaders and data processing utilities"""

-from .nnja_ai import AMSUDataset, collate_fn
+from .nnja_ai import SensorDataset, collate_fn

graph_weather/data/nnja_ai.py

Lines changed: 41 additions & 23 deletions

@@ -1,7 +1,6 @@
 """
-A custom PyTorch Dataset implementation for AMSU datasets.
+A custom PyTorch Dataset implementation for various sensors like AMSU, ATMS, MHS, IASI, CrIS

-This script defines a custom PyTorch Dataset (`AMSUDataset`) for working with AMSU datasets.
 The dataset is loaded via the nnja library's `DataCatalog` and filtered for specific times and
 variables. Each data point consists of a timestamp, latitude, longitude, and associated metadata.
 """
@@ -18,36 +17,62 @@
 )


-class AMSUDataset(Dataset):
-    """A custom PyTorch Dataset for handling AMSU data.
+class SensorDataset(Dataset):
+    """A custom PyTorch Dataset for handling various sensor data."""

-    This dataset retrieves observations and their metadata, filtered by the provided time and
-    variable descriptors.
-    """
-
-    def __init__(self, dataset_name, time, primary_descriptors, additional_variables):
-        """Initialize the AMSU dataset loader.
+    def __init__(
+        self, dataset_name, time, primary_descriptors, additional_variables, sensor_type="AMSU"
+    ):
+        """Initialize the dataset loader for various sensors.

         Args:
             dataset_name: Name of the dataset to load.
             time: Specific timestamp to filter the data.
-            primary_descriptors: List of primary descriptor variables to include (e.g.,
-                OBS_TIMESTAMP, LAT, LON).
+            primary_descriptors: List of primary descriptor variables to include (e.g., OBS_TIMESTAMP, LAT, LON).
             additional_variables: List of additional variables to include in metadata.
+            sensor_type: Type of sensor (AMSU, ATMS, MHS, IASI, CrIS)
         """
         self.dataset_name = dataset_name
         self.time = time
         self.primary_descriptors = primary_descriptors
         self.additional_variables = additional_variables
+        self.sensor_type = sensor_type  # New argument for selecting sensor type

         # Load data catalog and dataset
         self.catalog = DataCatalog(skip_manifest=True)
         self.dataset = self.catalog[self.dataset_name]
         self.dataset.load_manifest()

-        self.dataset = self.dataset.sel(
-            time=self.time, variables=self.primary_descriptors + self.additional_variables
-        )
+        if self.sensor_type == "AMSU":
+            self.dataset = self.dataset.sel(
+                time=self.time,
+                variables=self.primary_descriptors + [f"TMBR_000{i:02d}" for i in range(1, 16)],
+            )
+        elif self.sensor_type == "ATMS":
+            self.dataset = self.dataset.sel(
+                time=self.time,
+                variables=self.primary_descriptors + [f"TMBR_000{i:02d}" for i in range(1, 23)],
+            )
+        elif self.sensor_type == "MHS":
+            self.dataset = self.dataset.sel(
+                time=self.time,
+                variables=self.primary_descriptors + [f"TMBR_000{i:02d}" for i in range(1, 6)],
+            )
+        elif self.sensor_type == "IASI":
+            self.dataset = self.dataset.sel(
+                time=self.time,
+                variables=self.primary_descriptors
+                + ["SCRA_" + str(i).zfill(5) for i in range(1, 617)],
+            )
+        elif self.sensor_type == "CrIS":
+            self.dataset = self.dataset.sel(
+                time=self.time,
+                variables=self.primary_descriptors
+                + [f"SRAD01_{str(i).zfill(5)}" for i in range(1, 432)],
+            )
+        else:
+            raise ValueError(f"Unsupported sensor type: {self.sensor_type}")
+
         self.dataframe = self.dataset.load_dataset(engine="pandas")

         for col in primary_descriptors:
@@ -63,14 +88,7 @@ def __len__(self):
         return len(self.dataframe)

     def __getitem__(self, index):
-        """Return the observation and metadata for a given index.
-
-        Args:
-            index: Index of the observation to retrieve.
-
-        Returns:
-            A dictionary containing timestamp, latitude, longitude, and metadata.
-        """
+        """Return the observation and metadata for a given index."""
         row = self.dataframe.iloc[index]
         time = row["OBS_TIMESTAMP"].timestamp()
         latitude = row["LAT"]
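
For context, a minimal usage sketch of the new API (not part of this commit; it assumes the nnja DataCatalog is installed and that the ATMS dataset name used in the tests below is available in your environment):

# Hypothetical usage sketch: load ATMS observations with the new sensor_type
# argument and batch them with the module's collate_fn.
from datetime import datetime

from torch.utils.data import DataLoader

from graph_weather.data.nnja_ai import SensorDataset, collate_fn

dataset = SensorDataset(
    dataset_name="atms-atms-NC021203",  # ATMS dataset name from the tests below
    time=datetime(2021, 1, 1, 0, 0),  # timestamp filter passed through to Dataset.sel
    primary_descriptors=["OBS_TIMESTAMP", "LAT", "LON"],
    additional_variables=[f"TMBR_000{i:02d}" for i in range(1, 23)],  # 22 ATMS channels
    sensor_type="ATMS",
)

loader = DataLoader(dataset, batch_size=4, collate_fn=collate_fn)
batch = next(iter(loader))
print(batch["metadata"].shape)  # expected: torch.Size([4, 22])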

tests/test_nnjai.py

File mode changed from 100755 to 100644.
Lines changed: 154 additions & 86 deletions

@@ -1,111 +1,185 @@
 """
-Tests for the nnjai_wrapp module in the graph_weather package.
-
-This file contains unit tests for AMSUDataset and collate_fn functions.
+Unit tests for the `SensorDataset` class, mocking the `DataCatalog` to simulate sensor data loading and validate dataset behavior.
+The tests ensure correct handling of data types, shapes, and batch processing for various sensor types.
 """

 from datetime import datetime
 from unittest.mock import MagicMock, patch
-
+import numpy as np
 import pytest
 import torch
+import pandas as pd
+
+from graph_weather.data.nnja_ai import SensorDataset, collate_fn

-from graph_weather.data.nnja_ai import AMSUDataset, collate_fn
+
+def get_sensor_variables(sensor_type):
+    """Helper function to get the correct variables for each sensor type."""
+    if sensor_type == "AMSU":
+        return [f"TMBR_000{i:02d}" for i in range(1, 16)]  # 15 channels
+    elif sensor_type == "ATMS":
+        return [f"TMBR_000{i:02d}" for i in range(1, 23)]  # 22 channels
+    elif sensor_type == "MHS":
+        return [f"TMBR_000{i:02d}" for i in range(1, 6)]  # 5 channels
+    elif sensor_type == "IASI":
+        return [f"SCRA_{str(i).zfill(5)}" for i in range(1, 617)]  # 616 channels
+    elif sensor_type == "CrIS":
+        return [f"SRAD01_{str(i).zfill(5)}" for i in range(1, 432)]  # 431 channels
+    return []


-# Mock the DataCatalog to avoid actual data loading
 @pytest.fixture
 def mock_datacatalog():
     """
     Fixture to mock the DataCatalog for unit tests to avoid actual data loading.
-
-    This mock provides a mock dataset with predefined columns and values.
     """
-    with patch("graph_weather.data.nnjai_wrapp.DataCatalog") as mock:
-        # Mock dataset structure
-        mock_df = MagicMock()
-        mock_df.columns = ["OBS_TIMESTAMP", "LAT", "LON", "TMBR_00001", "TMBR_00002"]
-
-        # Define a mock row
-        class MockRow:
-            def __getitem__(self, key):
-                data = {
-                    "OBS_TIMESTAMP": datetime.now(),
-                    "LAT": 45.0,
-                    "LON": -120.0,
-                    "TMBR_00001": 250.0,
-                    "TMBR_00002": 260.0,
-                }
-                return data.get(key, None)
-
-        # Configure mock dataset
-        mock_row = MockRow()
-        mock_df.iloc = MagicMock()
-        mock_df.iloc.__getitem__.return_value = mock_row
-        mock_df.__len__.return_value = 100
+    with patch("graph_weather.data.nnja_ai.DataCatalog") as mock:
+        # Create a mock catalog
+        mock_catalog = MagicMock()

+        # Create a mock dataset with direct DataFrame return
         mock_dataset = MagicMock()
-        mock_dataset.load_dataset.return_value = mock_df
-        mock_dataset.sel.return_value = mock_dataset
         mock_dataset.load_manifest = MagicMock()
+        mock_dataset.sel = MagicMock(return_value=mock_dataset)  # Return self to chain calls
+
+        def create_mock_df(engine="pandas"):
+            # Get the sensor type from the mock dataset
+            sensor_vars = get_sensor_variables(mock_dataset.sensor_type)
+
+            # Create DataFrame with required columns
+            df = pd.DataFrame(
+                {
+                    "OBS_TIMESTAMP": pd.date_range(
+                        start=datetime(2021, 1, 1), periods=100, freq="H"
+                    ),
+                    "LAT": np.full(100, 45.0),
+                    "LON": np.full(100, -120.0),
+                }
+            )

-        mock.return_value.__getitem__.return_value = mock_dataset
-        yield mock
+            # Add sensor-specific variables
+            for var in sensor_vars:
+                df[var] = np.full(100, 250.0)

+            return df

-def test_amsu_dataset(mock_datacatalog):
-    """
-    Test the AMSUDataset class to ensure proper data loading and tensor structure.
+        # Set up the mock to return our DataFrame
+        mock_dataset.load_dataset = create_mock_df

-    This test validates the AMSUDataset class for its ability to load the dataset
-    correctly, check for the appropriate tensor properties, and ensure the keys
-    and data types match expectations.
-    """
-    # Initialize dataset parameters
-    dataset_name = "amsua-1bamua-NC021023"
-    time = "2021-01-01 00Z"
+        # Configure the catalog to return our mock dataset
+        def get_mock_dataset(self, name):
+            # Set the sensor type based on the requested dataset name
+            mock_dataset.sensor_type = next(
+                config["sensor_type"] for config in SENSOR_CONFIGS if config["name"] == name
+            )
+            return mock_dataset
+
+        mock_catalog.__getitem__ = get_mock_dataset  # Fix: Explicitly define the method with `self`
+        mock.return_value = mock_catalog
+
+        yield mock
+
+
+# Test configurations
+SENSOR_CONFIGS = [
+    {
+        "name": "amsu-1bamua-NC021023",
+        "sensor_type": "AMSU",
+        "expected_metadata_size": 15,  # 15 TMBR channels
+    },
+    {
+        "name": "atms-atms-NC021203",
+        "sensor_type": "ATMS",
+        "expected_metadata_size": 22,  # 22 TMBR channels
+    },
+    {
+        "name": "mhs-1bmhs-NC021027",
+        "sensor_type": "MHS",
+        "expected_metadata_size": 5,  # 5 TMBR channels
+    },
+    {
+        "name": "iasi-mtiasi-NC021241",
+        "sensor_type": "IASI",
+        "expected_metadata_size": 616,  # 616 SCRA channels
+    },
+    {
+        "name": "cris-crisf4-NC021206",
+        "sensor_type": "CrIS",
+        "expected_metadata_size": 431,  # 431 SRAD channels
+    },
+]
+
+
+@pytest.mark.parametrize("sensor_config", SENSOR_CONFIGS)
+def test_sensor_dataset(mock_datacatalog, sensor_config):
+    """Test the SensorDataset class for different sensor types."""
+    time = datetime(2021, 1, 1, 0, 0)
     primary_descriptors = ["OBS_TIMESTAMP", "LAT", "LON"]
-    additional_variables = ["TMBR_00001", "TMBR_00002"]

-    dataset = AMSUDataset(dataset_name, time, primary_descriptors, additional_variables)
+    dataset = SensorDataset(
+        dataset_name=sensor_config["name"],
+        time=time,
+        primary_descriptors=primary_descriptors,
+        additional_variables=get_sensor_variables(sensor_config["sensor_type"]),
+        sensor_type=sensor_config["sensor_type"],
+    )

     # Test dataset length
-    assert len(dataset) > 0, "Dataset should not be empty."
+    assert len(dataset) > 0, f"Dataset should not be empty for {sensor_config['sensor_type']}"

+    # Test single item structure
     item = dataset[0]
     expected_keys = {"timestamp", "latitude", "longitude", "metadata"}
-    assert set(item.keys()) == expected_keys, "Dataset item keys are not as expected."
+    assert (
+        set(item.keys()) == expected_keys
+    ), f"Dataset item keys are not as expected for {sensor_config['sensor_type']}"

     # Validate tensor properties
-    assert isinstance(item["timestamp"], torch.Tensor), "Timestamp should be a tensor."
-    assert item["timestamp"].dtype == torch.float32, "Timestamp should have dtype float32."
-    assert item["timestamp"].ndim == 0, "Timestamp should be a scalar tensor."
-
-    assert isinstance(item["latitude"], torch.Tensor), "Latitude should be a tensor."
-    assert item["latitude"].dtype == torch.float32, "Latitude should have dtype float32."
-    assert item["latitude"].ndim == 0, "Latitude should be a scalar tensor."
-
-    assert isinstance(item["longitude"], torch.Tensor), "Longitude should be a tensor."
-    assert item["longitude"].dtype == torch.float32, "Longitude should have dtype float32."
-    assert item["longitude"].ndim == 0, "Longitude should be a scalar tensor."
-
-    assert isinstance(item["metadata"], torch.Tensor), "Metadata should be a tensor."
+    assert isinstance(
+        item["timestamp"], torch.Tensor
+    ), f"Timestamp should be a tensor for {sensor_config['sensor_type']}"
+    assert (
+        item["timestamp"].dtype == torch.float32
+    ), f"Timestamp should have dtype float32 for {sensor_config['sensor_type']}"
+    assert (
+        item["timestamp"].ndim == 0
+    ), f"Timestamp should be a scalar tensor for {sensor_config['sensor_type']}"
+
+    assert isinstance(
+        item["latitude"], torch.Tensor
+    ), f"Latitude should be a tensor for {sensor_config['sensor_type']}"
+    assert (
+        item["latitude"].dtype == torch.float32
+    ), f"Latitude should have dtype float32 for {sensor_config['sensor_type']}"
+    assert (
+        item["latitude"].ndim == 0
+    ), f"Latitude should be a scalar tensor for {sensor_config['sensor_type']}"
+
+    assert isinstance(
+        item["longitude"], torch.Tensor
+    ), f"Longitude should be a tensor for {sensor_config['sensor_type']}"
+    assert (
+        item["longitude"].dtype == torch.float32
+    ), f"Longitude should have dtype float32 for {sensor_config['sensor_type']}"
+    assert (
+        item["longitude"].ndim == 0
+    ), f"Longitude should be a scalar tensor for {sensor_config['sensor_type']}"
+
+    assert isinstance(
+        item["metadata"], torch.Tensor
+    ), f"Metadata should be a tensor for {sensor_config['sensor_type']}"
     assert item["metadata"].shape == (
-        len(additional_variables),
-    ), f"Metadata shape mismatch. Expected ({len(additional_variables)},)."
-    assert item["metadata"].dtype == torch.float32, "Metadata should have dtype float32."
+        sensor_config["expected_metadata_size"],
+    ), f"Metadata shape mismatch for {sensor_config['sensor_type']}. Expected ({sensor_config['expected_metadata_size']},)"
+    assert (
+        item["metadata"].dtype == torch.float32
+    ), f"Metadata should have dtype float32 for {sensor_config['sensor_type']}"


 def test_collate_function():
-    """
-    Test the collate_fn function to ensure proper batching of dataset items.
-
-    This test checks that the collate_fn properly batches the timestamp, latitude,
-    longitude, and metadata fields of the dataset, ensuring correct shapes and data types.
-    """
-    # Mock a batch of items
+    """Test the collate_fn function to ensure proper batching of dataset items."""
     batch_size = 4
-    metadata_size = 2
+    metadata_size = 15  # Using AMSU size for this test
     mock_batch = [
         {
             "timestamp": torch.tensor(datetime.now().timestamp(), dtype=torch.float32),
@@ -116,19 +190,13 @@ def test_collate_function():
         for _ in range(batch_size)
     ]

-    # Collate the batch
     batched = collate_fn(mock_batch)

-    # Validate batched shapes and types
-    assert batched["timestamp"].shape == (batch_size,), "Timestamp batch shape mismatch."
-    assert batched["latitude"].shape == (batch_size,), "Latitude batch shape mismatch."
-    assert batched["longitude"].shape == (batch_size,), "Longitude batch shape mismatch."
-    assert batched["metadata"].shape == (
-        batch_size,
-        metadata_size,
-    ), "Metadata batch shape mismatch."
-
-    assert batched["timestamp"].dtype == torch.float32, "Timestamp dtype mismatch."
-    assert batched["latitude"].dtype == torch.float32, "Latitude dtype mismatch."
-    assert batched["longitude"].dtype == torch.float32, "Longitude dtype mismatch."
-    assert batched["metadata"].dtype == torch.float32, "Metadata dtype mismatch."
+    assert batched["timestamp"].shape == (batch_size,), "Timestamp batch shape mismatch"
+    assert batched["latitude"].shape == (batch_size,), "Latitude batch shape mismatch"
+    assert batched["longitude"].shape == (batch_size,), "Longitude batch shape mismatch"
+    assert batched["metadata"].shape == (batch_size, metadata_size), "Metadata batch shape mismatch"
+    assert batched["timestamp"].dtype == torch.float32, "Timestamp dtype mismatch"
+    assert batched["latitude"].dtype == torch.float32, "Latitude dtype mismatch"
+    assert batched["longitude"].dtype == torch.float32, "Longitude dtype mismatch"
+    assert batched["metadata"].dtype == torch.float32, "Metadata dtype mismatch"
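
One design note on the diff above: the per-sensor channel lists now live in two places, the if/elif chain in SensorDataset.__init__ and the get_sensor_variables helper in the tests. A table-driven mapping that both could share is sketched below; this is a suggestion only, not part of the commit, and it simply reproduces the channel lists shown in the diff:

# Sketch: one shared mapping from sensor type to channel variables, mirroring
# the branches in SensorDataset.__init__ (names here are illustrative only).
SENSOR_VARIABLES = {
    "AMSU": [f"TMBR_000{i:02d}" for i in range(1, 16)],  # 15 channels
    "ATMS": [f"TMBR_000{i:02d}" for i in range(1, 23)],  # 22 channels
    "MHS": [f"TMBR_000{i:02d}" for i in range(1, 6)],  # 5 channels
    "IASI": [f"SCRA_{str(i).zfill(5)}" for i in range(1, 617)],  # 616 channels
    "CrIS": [f"SRAD01_{str(i).zfill(5)}" for i in range(1, 432)],  # 431 channels
}


def select_variables(primary_descriptors, sensor_type):
    """Return the full variable list for a sensor, as SensorDataset.__init__ does."""
    if sensor_type not in SENSOR_VARIABLES:
        raise ValueError(f"Unsupported sensor type: {sensor_type}")
    return primary_descriptors + SENSOR_VARIABLES[sensor_type]


print(len(select_variables(["OBS_TIMESTAMP", "LAT", "LON"], "ATMS")))  # 25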
