Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
17ff4c9
refactor(config): Use enum types for package storage and query engine…
junhaoliao Sep 30, 2025
4c01485
refactor(config): Change custom_serialized_fields to a set and pass i…
junhaoliao Sep 30, 2025
3e484e2
refactor(config): Use shared annotated Port type for port fields and …
junhaoliao Sep 30, 2025
d441993
refactor(config): Consolidate host validation by introducing shared H…
junhaoliao Sep 30, 2025
68a3a06
refactor(config): Introduce DatabaseEngine enum, use it for database.…
junhaoliao Sep 30, 2025
fbe91b0
refactor(config): Replace manual non‑empty string validators with sha…
junhaoliao Sep 30, 2025
e829a86
refactor(config): Use PositiveFloat type for jobs_poll_delay fields i…
junhaoliao Sep 30, 2025
79b8043
Refactor(config): Rename jobs_poll_delay to jobs_poll_delay_sec in sc…
junhaoliao Sep 30, 2025
354b019
refactor(config): Replace int fields with PositiveInt and remove redu…
junhaoliao Sep 30, 2025
e60b39a
refactor(config): Update optional string fields to use NonEmptyStr type.
junhaoliao Sep 30, 2025
3372a4b
docs(config): Update comment to specify specific types.
junhaoliao Sep 30, 2025
68740f5
refactor(config): Use ZstdCompressionLevel for compression_level and …
junhaoliao Sep 30, 2025
420f30f
refactor(config): Replace string logging_level fields with LoggingLev…
junhaoliao Sep 30, 2025
6a584e0
revert jobs_poll_delay rename
junhaoliao Sep 30, 2025
79c12e8
Merge branch 'main' into pydantic-impf
junhaoliao Sep 30, 2025
bfca7d7
Merge branch 'main' into pydantic-impf
junhaoliao Oct 1, 2025
8673afd
Merge branch 'main' into pydantic-impf
junhaoliao Oct 1, 2025
cc9a1f4
refactor(config): Rename type `Host` -> `DomainStr`; Add TODO docstri…
junhaoliao Oct 10, 2025
96ac165
Merge branch 'main' into pydantic-impf
junhaoliao Oct 10, 2025
04929ea
Add custom annotation for serialization
sitaowang1998 Oct 10, 2025
57b2789
Remove field without custom serialization
sitaowang1998 Oct 10, 2025
0b6f43c
Bug fix
sitaowang1998 Oct 10, 2025
3a57db7
Merge branch 'main' into pydantic-enum
sitaowang1998 Oct 14, 2025
e487f82
Fix enum
sitaowang1998 Oct 14, 2025
4cac503
Rename file and restructure
sitaowang1998 Oct 18, 2025
da5e3e7
Use before validator
sitaowang1998 Oct 18, 2025
bfa44c6
Bug fix
sitaowang1998 Oct 18, 2025
f6abd55
Bug fix
sitaowang1998 Oct 18, 2025
c80de48
Revert "Use before validator"
sitaowang1998 Oct 18, 2025
1f9bfc9
Fix lint
sitaowang1998 Oct 18, 2025
ca09544
Apply suggestions from code review
sitaowang1998 Oct 18, 2025
60da510
Rename variables
sitaowang1998 Oct 18, 2025
bd07d8e
Merge branch 'pydantic-enum' of github.com:sitaowang1998/clp into pyd…
sitaowang1998 Oct 18, 2025
0a7b157
Rename variable
sitaowang1998 Oct 18, 2025
78c015b
Rename function
sitaowang1998 Oct 18, 2025
7b3e9d7
Use optional serializable path
sitaowang1998 Oct 18, 2025
8e587e3
Merge branch 'main' into pydantic-enum
sitaowang1998 Oct 18, 2025
d1a36d6
Add return type
sitaowang1998 Oct 18, 2025
a6b6c8d
Fix type definition order
sitaowang1998 Oct 18, 2025
80a7164
Replace or pathlib.Path
sitaowang1998 Oct 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 39 additions & 78 deletions components/clp-py-utils/clp_py_utils/clp_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
Field,
field_validator,
model_validator,
PlainSerializer,
PrivateAttr,
)
from strenum import KebabCaseStrEnum, LowercaseStrEnum
Expand All @@ -21,6 +22,7 @@
read_yaml_config_file,
validate_path_could_be_dir,
)
from .serialization_utils import serialize_path, serialize_str_enum

# Constants
# Component names
Expand Down Expand Up @@ -98,6 +100,8 @@
CLP_QUEUE_PASS_ENV_VAR_NAME = "CLP_QUEUE_PASS"
CLP_REDIS_PASS_ENV_VAR_NAME = "CLP_REDIS_PASS"

# Serializer
StrEnumSerializer = PlainSerializer(serialize_str_enum)
# Generic types
NonEmptyStr = Annotated[str, Field(min_length=1)]
PositiveFloat = Annotated[float, Field(gt=0)]
Expand All @@ -106,6 +110,7 @@
# TODO: Replace this with pydantic_extra_types.domain.DomainStr.
DomainStr = NonEmptyStr
Port = Annotated[int, Field(gt=0, lt=2**16)]
SerializablePath = Annotated[pathlib.Path, PlainSerializer(serialize_path)]
ZstdCompressionLevel = Annotated[int, Field(ge=1, le=19)]


Expand All @@ -114,17 +119,26 @@ class StorageEngine(KebabCaseStrEnum):
CLP_S = auto()


StorageEngineStr = Annotated[StorageEngine, StrEnumSerializer]


class DatabaseEngine(KebabCaseStrEnum):
MARIADB = auto()
MYSQL = auto()


DatabaseEngineStr = Annotated[DatabaseEngine, StrEnumSerializer]


class QueryEngine(KebabCaseStrEnum):
CLP = auto()
CLP_S = auto()
PRESTO = auto()


QueryEngineStr = Annotated[QueryEngine, StrEnumSerializer]


class StorageType(LowercaseStrEnum):
FS = auto()
S3 = auto()
Expand All @@ -137,9 +151,12 @@ class AwsAuthType(LowercaseStrEnum):
ec2 = auto()


AwsAuthTypeStr = Annotated[AwsAuthType, StrEnumSerializer]


class Package(BaseModel):
storage_engine: StorageEngine = StorageEngine.CLP
query_engine: QueryEngine = QueryEngine.CLP
storage_engine: StorageEngineStr = StorageEngine.CLP
query_engine: QueryEngineStr = QueryEngine.CLP

@model_validator(mode="after")
def validate_query_engine_package_compatibility(self):
Expand All @@ -163,15 +180,9 @@ def validate_query_engine_package_compatibility(self):

return self

def dump_to_primitive_dict(self):
d = self.model_dump()
d["storage_engine"] = d["storage_engine"].value
d["query_engine"] = d["query_engine"].value
return d


class Database(BaseModel):
type: DatabaseEngine = DatabaseEngine.MARIADB
type: DatabaseEngineStr = DatabaseEngine.MARIADB
host: DomainStr = "localhost"
port: Port = 3306
name: NonEmptyStr = "clp-db"
Expand Down Expand Up @@ -232,7 +243,6 @@ def get_clp_connection_params_and_type(self, disable_localhost_socket_connection

def dump_to_primitive_dict(self):
d = self.model_dump(exclude={"username", "password"})
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in a separate PR, we shall investigate
specifying private attributes like username as https://docs.pydantic.dev/latest/api/fields/#pydantic.fields.PrivateAttr
and
specifying secrets like password as https://docs.pydantic.dev/2.2/usage/types/secrets/

so we can completely get rid of such dump_to_primitive_dict() methods.

@coderabbitai create an issue to track

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash

# Create a GitHub issue to track the investigation
gh issue create \
  --repo y-scope/clp \
  --title "Investigate using Pydantic PrivateAttr and SecretStr to eliminate dump_to_primitive_dict methods" \
  --body "## Context

This issue tracks a follow-up refactoring task identified during PR #1417.

## Task

Investigate using Pydantic features to eliminate the need for custom \`dump_to_primitive_dict()\` methods:

1. Use [PrivateAttr](https://docs.pydantic.dev/latest/api/fields/#pydantic.fields.PrivateAttr) for private attributes like \`username\`
2. Use [SecretStr](https://docs.pydantic.dev/2.2/usage/types/secrets/) for sensitive fields like \`password\`

This would allow complete removal of \`dump_to_primitive_dict()\` methods throughout the codebase.

## References

- PR: https://github.com/y-scope/clp/pull/1417
- Comment: https://github.com/y-scope/clp/pull/1417#discussion_r2442428522
- Requested by: @junhaoliao" \
  --assignee junhaoliao

Length of output: 42


@junhaoliao I've created issue #1440 to track this investigation: #1440

The issue covers exploring Pydantic's PrivateAttr for private attributes like username and SecretStr for sensitive fields like password, which would allow the removal of dump_to_primitive_dict() methods throughout the codebase.


🧬 Code Graph Analysis Results

Filename: components/clp-py-utils/clp_py_utils/clp_config.py
Line numbers: N/A (summary)

  • Summary: Private attributes used for config paths

    • _container_image_id_path: PrivateAttr with default path CLP_PACKAGE_CONTAINER_IMAGE_ID_PATH
    • _version_file_path: PrivateAttr with default path CLP_VERSION_FILE_PATH
    • Purpose: Conceal internal paths from model serialization; supports lazy/controlled access for container image and version resolution.
  • Summary: dump_to_primitive_dict methods (exclusions to avoid leaking sensitive data)

    • Database.dump_to_primitive_dict
      • Returns: dict representation of the Database model excluding username and password
      • Behavior: Ensures credentials are not dumped to primitive form
    • Redis.dump_to_primitive_dict
      • Returns: dict representation of the Redis model excluding password
    • Queue.dump_to_primitive_dict
      • Returns: dict representation of the Queue model excluding username and password
    • CLPConfig.dump_to_primitive_dict
      • Returns: composite dict where database, queue, and redis are dumped via their respective dump_to_primitive_dict methods
      • Also converts aws_config_directory to a primitive value (string or None)
      • Purpose: Centralized primitive-dict export that omits sensitive credentials
  • Summary: Related credential loading methods (context for removal of dump methods)

    • Database.load_credentials_from_file/load_credentials_from_env
    • Redis.load_credentials_from_file/load_credentials_from_env
    • Queue.load_credentials_from_file/load_credentials_from_env
    • Note: These methods indicate sensitivity of credential data, motivating the move to private attributes and excluding credentials in primitive dumps.

d["type"] = d["type"].value
return d

def load_credentials_from_file(self, credentials_file_path: pathlib.Path):
Expand Down Expand Up @@ -360,12 +370,7 @@ class S3Credentials(BaseModel):


class AwsAuthentication(BaseModel):
type: Literal[
AwsAuthType.credentials.value,
AwsAuthType.profile.value,
AwsAuthType.env_vars.value,
AwsAuthType.ec2.value,
]
type: AwsAuthTypeStr
profile: Optional[NonEmptyStr] = None
credentials: Optional[S3Credentials] = None

Expand Down Expand Up @@ -408,13 +413,10 @@ class S3IngestionConfig(BaseModel):
type: Literal[StorageType.S3.value] = StorageType.S3.value
aws_authentication: AwsAuthentication

def dump_to_primitive_dict(self):
return self.model_dump()


class FsStorage(BaseModel):
type: Literal[StorageType.FS.value] = StorageType.FS.value
directory: pathlib.Path
directory: SerializablePath

@field_validator("directory", mode="before")
@classmethod
Expand All @@ -425,16 +427,11 @@ def validate_directory(cls, value):
def make_config_paths_absolute(self, clp_home: pathlib.Path):
self.directory = make_config_path_absolute(clp_home, self.directory)

def dump_to_primitive_dict(self):
d = self.model_dump()
d["directory"] = str(d["directory"])
return d


class S3Storage(BaseModel):
type: Literal[StorageType.S3.value] = StorageType.S3.value
s3_config: S3Config
staging_directory: pathlib.Path
staging_directory: SerializablePath

@field_validator("staging_directory", mode="before")
@classmethod
Expand All @@ -455,30 +452,25 @@ def validate_key_prefix(cls, value):
def make_config_paths_absolute(self, clp_home: pathlib.Path):
self.staging_directory = make_config_path_absolute(clp_home, self.staging_directory)

def dump_to_primitive_dict(self):
d = self.model_dump()
d["staging_directory"] = str(d["staging_directory"])
return d


class FsIngestionConfig(FsStorage):
directory: pathlib.Path = pathlib.Path("/")
directory: SerializablePath = pathlib.Path("/")


class ArchiveFsStorage(FsStorage):
directory: pathlib.Path = CLP_DEFAULT_DATA_DIRECTORY_PATH / "archives"
directory: SerializablePath = CLP_DEFAULT_DATA_DIRECTORY_PATH / "archives"


class StreamFsStorage(FsStorage):
directory: pathlib.Path = CLP_DEFAULT_DATA_DIRECTORY_PATH / "streams"
directory: SerializablePath = CLP_DEFAULT_DATA_DIRECTORY_PATH / "streams"


class ArchiveS3Storage(S3Storage):
staging_directory: pathlib.Path = CLP_DEFAULT_DATA_DIRECTORY_PATH / "staged-archives"
staging_directory: SerializablePath = CLP_DEFAULT_DATA_DIRECTORY_PATH / "staged-archives"


class StreamS3Storage(S3Storage):
staging_directory: pathlib.Path = CLP_DEFAULT_DATA_DIRECTORY_PATH / "staged-streams"
staging_directory: SerializablePath = CLP_DEFAULT_DATA_DIRECTORY_PATH / "staged-streams"


def _get_directory_from_storage_config(
Expand Down Expand Up @@ -520,11 +512,6 @@ def set_directory(self, directory: pathlib.Path):
def get_directory(self) -> pathlib.Path:
return _get_directory_from_storage_config(self.storage)

def dump_to_primitive_dict(self):
d = self.model_dump()
d["storage"] = self.storage.dump_to_primitive_dict()
return d


class StreamOutput(BaseModel):
storage: Union[StreamFsStorage, StreamS3Storage] = StreamFsStorage()
Expand All @@ -536,11 +523,6 @@ def set_directory(self, directory: pathlib.Path):
def get_directory(self) -> pathlib.Path:
return _get_directory_from_storage_config(self.storage)

def dump_to_primitive_dict(self):
d = self.model_dump()
d["storage"] = self.storage.dump_to_primitive_dict()
return d


class WebUi(BaseModel):
host: DomainStr = "localhost"
Expand Down Expand Up @@ -590,24 +572,26 @@ class CLPConfig(BaseModel):
query_worker: QueryWorker = QueryWorker()
webui: WebUi = WebUi()
garbage_collector: GarbageCollector = GarbageCollector()
credentials_file_path: pathlib.Path = CLP_DEFAULT_CREDENTIALS_FILE_PATH
credentials_file_path: SerializablePath = CLP_DEFAULT_CREDENTIALS_FILE_PATH

presto: Optional[Presto] = None

archive_output: ArchiveOutput = ArchiveOutput()
stream_output: StreamOutput = StreamOutput()
data_directory: pathlib.Path = pathlib.Path("var") / "data"
logs_directory: pathlib.Path = pathlib.Path("var") / "log"
aws_config_directory: Optional[pathlib.Path] = None
data_directory: SerializablePath = pathlib.Path("var") / "data"
logs_directory: SerializablePath = pathlib.Path("var") / "log"
aws_config_directory: Optional[SerializablePath] = None

_container_image_id_path: pathlib.Path = PrivateAttr(
_container_image_id_path: SerializablePath = PrivateAttr(
default=CLP_PACKAGE_CONTAINER_IMAGE_ID_PATH
)
_version_file_path: pathlib.Path = PrivateAttr(default=CLP_VERSION_FILE_PATH)
_version_file_path: SerializablePath = PrivateAttr(default=CLP_VERSION_FILE_PATH)

@field_validator("aws_config_directory")
@classmethod
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if we changed the above aws_config_directory type as Optional[SerializablePath], we shall update below value type as Optional[SerializablePath] in below validator prototype for consistency

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I checked other references of pathlib.Path and believe they are fine to be left as pathlib.Path because those are used in public interfaces and the callers do not need to make the paths serializable.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then we have to manually serialize them in the dump_to_primitive_dict.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right. Sorry if i may have confused you, i meant to say

I checked other references of pathlib.Path and believe "all the other references" are fine to be left as pathlib.Path

-> the value argument in the validator of aws_config_directory is the ONLY argument whose type should be updated as SerializablePath
-> that is, let's not change the type from pathlib.Path to SerializablePath in any def make_config_paths_absolute(self, clp_home: pathlib.Path) or def load_credentials_from_file(self, credentials_file_path: pathlib.Path) prototypes

def expand_profile_user_home(cls, value: Optional[pathlib.Path]):
def expand_profile_user_home(
cls, value: Optional[SerializablePath]
) -> Optional[SerializablePath]:
if value is not None:
value = value.expanduser()
return value
Expand Down Expand Up @@ -693,7 +677,7 @@ def validate_aws_config_dir(self):
auth_configs.append(self.stream_output.storage.s3_config.aws_authentication)

for auth in auth_configs:
if AwsAuthType.profile.value == auth.type:
if AwsAuthType.profile == auth.type:
profile_auth_used = True
break

Expand Down Expand Up @@ -735,27 +719,14 @@ def get_runnable_components(self) -> Set[str]:

def dump_to_primitive_dict(self):
custom_serialized_fields = {
"package",
"database",
"queue",
"redis",
"logs_input",
"archive_output",
"stream_output",
}
d = self.model_dump(exclude=custom_serialized_fields)
for key in custom_serialized_fields:
d[key] = getattr(self, key).dump_to_primitive_dict()

# Turn paths into primitive strings
d["credentials_file_path"] = str(self.credentials_file_path)
d["data_directory"] = str(self.data_directory)
d["logs_directory"] = str(self.logs_directory)
if self.aws_config_directory is not None:
d["aws_config_directory"] = str(self.aws_config_directory)
else:
d["aws_config_directory"] = None

return d

@model_validator(mode="after")
Expand All @@ -772,22 +743,12 @@ def validate_presto_config(self):
class WorkerConfig(BaseModel):
package: Package = Package()
archive_output: ArchiveOutput = ArchiveOutput()
data_directory: pathlib.Path = CLPConfig().data_directory
data_directory: SerializablePath = CLPConfig().data_directory

# Only needed by query workers.
stream_output: StreamOutput = StreamOutput()
stream_collection_name: str = ResultsCache().stream_collection_name

def dump_to_primitive_dict(self):
d = self.model_dump()
d["archive_output"] = self.archive_output.dump_to_primitive_dict()

# Turn paths into primitive strings
d["data_directory"] = str(self.data_directory)
d["stream_output"] = self.stream_output.dump_to_primitive_dict()

return d


def get_components_for_target(target: str) -> Set[str]:
if target in TARGET_TO_COMPONENTS:
Expand Down
23 changes: 23 additions & 0 deletions components/clp-py-utils/clp_py_utils/serialization_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import pathlib

from strenum import StrEnum


def serialize_str_enum(member: StrEnum) -> str:
"""
Serializes a `strenum.StrEnum` member to its underlying value.
:param member:
:return: The underlying string value of the enum member.
"""
return member.value


def serialize_path(path: pathlib.Path) -> str:
"""
Serializes a `pathlib.Path` to its string representation.
:param path:
:return: The string representation of the path.
"""
return str(path)
Loading