
Commit 0dcdcf0

ignores native config values if config spec does not implement those (#3233)
* does not fail config resolution if a native value is provided to a config that does not implement native values
* updates databricks docs
* allows replacing hint regexes on a schema
* removes the partition hint on the eth merge test on databricks
* adds pokemon table count consts
* reorgs the databricks dlt fix
* fixes the lancedb custom destination example
* reduces the number of sql_database examples run on CI
* fixes merge
* marks and skips rfam tests
1 parent 7848c91 commit 0dcdcf0

File tree

16 files changed: +103, -67 lines

.github/workflows/test_common.yml

Lines changed: 2 additions & 2 deletions
@@ -63,11 +63,11 @@ jobs:
       - os: windows-latest
         python-version: "3.11"
         shell: cmd
-        pytest_args: '-m "not forked"'
+        pytest_args: '-m "not forked and not rfam"'
       - os: windows-latest
         python-version: "3.13"
         shell: cmd
-        pytest_args: '-m "not forked"'
+        pytest_args: '-m "not forked and not rfam"'
 
     defaults:
       run:

dlt/_workspace/_templates/_core_source_templates/sql_database_pipeline.py

Lines changed: 2 additions & 39 deletions
@@ -9,7 +9,6 @@
 
 from dlt.sources.sql_database import sql_database, sql_table, Table
 
-from sqlalchemy.sql.sqltypes import TypeEngine
 import sqlalchemy as sa
 
 
@@ -105,46 +104,13 @@ def load_standalone_table_resource() -> None:
         defer_table_reflect=True,
     )
 
-    # Run the resources together
-    info = pipeline.extract([family, genome], write_disposition="merge")
+    # Run the resources together (just take one page of results to make it faster)
+    info = pipeline.extract([family.add_limit(1), genome.add_limit(1)], write_disposition="merge")
     print(info)
     # Show inferred columns
     print(pipeline.default_schema.to_pretty_yaml())
 
 
-def select_columns() -> None:
-    """Uses table adapter callback to modify list of columns to be selected"""
-    pipeline = dlt.pipeline(
-        pipeline_name="rfam_database",
-        destination="duckdb",
-        dataset_name="rfam_data_cols",
-        dev_mode=True,
-    )
-
-    def table_adapter(table: Table) -> Table:
-        print(table.name)
-        if table.name == "family":
-            # this is SqlAlchemy table. _columns are writable
-            # let's drop updated column
-            table._columns.remove(table.columns["updated"])  # type: ignore
-        return table
-
-    family = sql_table(
-        credentials="mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
-        table="family",
-        chunk_size=10,
-        reflection_level="full_with_precision",
-        table_adapter_callback=table_adapter,
-    )
-
-    # also we do not want the whole table, so we add limit to get just one chunk (10 records)
-    pipeline.run(family.add_limit(1))
-    # only 10 rows
-    print(pipeline.last_trace.last_normalize_info)
-    # no "updated" column in "family" table
-    print(pipeline.default_schema.to_pretty_yaml())
-
-
 def select_with_end_value_and_row_order() -> None:
     """Gets data from a table withing a specified range and sorts rows descending"""
     pipeline = dlt.pipeline(
@@ -347,9 +313,6 @@ def specify_columns_to_load() -> None:
     # Load selected tables with different settings
     # load_select_tables_from_database()
 
-    # load a table and select columns
-    # select_columns()
-
     # load_entire_database()
     # select_with_end_value_and_row_order()
 

dlt/common/configuration/resolve.py

Lines changed: 3 additions & 1 deletion
@@ -158,8 +158,10 @@ def _maybe_parse_native_value(
             .as_dict_nondefault()
             .items()
         }
-    except (ValueError, NotImplementedError) as v_err:
+    except ValueError as v_err:
         raise InvalidNativeValue(type(config), type(native_value), embedded_sections, v_err)
+    except NotImplementedError:
+        pass
 
     return native_value  # type: ignore[no-any-return]

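What the new behavior looks like from user code, as a minimal hedged sketch (the `SimpleConfig` spec, its `api_key` field, and the env var name are illustrative assumptions, not part of the commit): a spec that does not override `parse_native_representation` no longer aborts resolution with `InvalidNativeValue` when a native value is passed; the value is simply ignored and resolution proceeds from the usual config providers.

```py
# minimal sketch, assuming a dlt build with this fix; SimpleConfig is illustrative
import os

from dlt.common.configuration import configspec, resolve_configuration
from dlt.common.configuration.specs import BaseConfiguration


@configspec
class SimpleConfig(BaseConfiguration):
    # BaseConfiguration.parse_native_representation raises NotImplementedError,
    # so this spec cannot consume a "native" (e.g. connection-string style) value
    api_key: str = None


os.environ["API_KEY"] = "key-from-env"  # assumed env lookup for a spec with no sections

# previously this raised InvalidNativeValue; now the native value is ignored
cfg = resolve_configuration(SimpleConfig(), explicit_value="some native string")
print(cfg.api_key)  # -> "key-from-env"
```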
dlt/common/libs/pydantic.py

Lines changed: 1 addition & 1 deletion
@@ -385,7 +385,7 @@ def validate_and_filter_items(
                 deleted.add(err_idx)
             else:
                 raise NotImplementedError(
-                    f"`{column_mode=:}` not implemented for Pydantic validation"
+                    f"`{data_mode=:}` not implemented for Pydantic validation"
                 )
 
     # validate again with error items removed

dlt/common/schema/schema.py

Lines changed: 5 additions & 3 deletions
@@ -418,13 +418,14 @@ def filter_row_with_hint(
     def merge_hints(
         self,
         new_hints: Mapping[TColumnDefaultHint, Sequence[TSimpleRegex]],
+        replace: bool = False,
         normalize_identifiers: bool = True,
     ) -> None:
-        """Merges existing default hints with `new_hints`. Normalizes names in column regexes if possible. Compiles setting at the end
+        """Merges or replaces existing default hints with `new_hints`. Normalizes names in column regexes if possible. Compiles settings at the end.
 
         NOTE: you can manipulate default hints collection directly via `Schema.settings` as long as you call Schema._compile_settings() at the end.
         """
-        self._merge_hints(new_hints, normalize_identifiers)
+        self._merge_hints(new_hints, replace=replace, normalize_identifiers=normalize_identifiers)
         self._compile_settings()
 
     def update_preferred_types(
@@ -813,6 +814,7 @@ def _infer_hint(self, hint_type: TColumnDefaultHint, col_name: str) -> bool:
     def _merge_hints(
         self,
        new_hints: Mapping[TColumnDefaultHint, Sequence[TSimpleRegex]],
+        replace: bool = False,
         normalize_identifiers: bool = True,
     ) -> None:
         """Used by `merge_hints` method, does not compile settings at the end"""
@@ -829,7 +831,7 @@
         default_hints = self._settings.setdefault("default_hints", {})
         # add `new_hints` to existing hints
         for h, l in new_hints.items():
-            if h in default_hints:
+            if h in default_hints and not replace:
                 extend_list_deduplicated(default_hints[h], l, utils.canonical_simple_regex)
             else:
                 # set new hint type

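A short usage sketch of the new `replace` flag (the hint name and regexes below are illustrative): with `replace=True` the regex list for a given default hint is overwritten instead of being extended and deduplicated.

```py
# minimal sketch: replacing default hint regexes on a schema; hint/regexes are illustrative
from dlt.common.schema import Schema

schema = Schema("example")
# default behavior: new regexes are merged into the existing list for the hint
schema.merge_hints({"not_null": ["re:^important_id$"]})
# replace=True: the previous regex list for "not_null" is discarded first
schema.merge_hints({"not_null": ["re:^id$"]}, replace=True)
print(schema.settings["default_hints"]["not_null"])
```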
docs/examples/custom_destination_lancedb/custom_destination_lancedb.py

Lines changed: 21 additions & 10 deletions
@@ -16,6 +16,7 @@
 We'll learn how to:
 - Use the [custom destination](../dlt-ecosystem/destinations/destination.md)
 - Delegate the embeddings to LanceDB using OpenAI Embeddings
+- Use Pydantic for unified dlt and lancedb schema validation
 """
 
 __source_name__ = "spotify"
@@ -59,10 +60,11 @@
 
 
 class EpisodeSchema(LanceModel):
+    """Used for dlt and lance schema validation"""
+
     id: str  # noqa: A003
     name: str
     description: str = func.SourceField()
-    vector: Vector(func.ndims()) = func.VectorField()  # type: ignore[valid-type]
     release_date: datetime.date
     audio_preview_url: str
     duration_ms: int
@@ -71,6 +73,12 @@ class EpisodeSchema(LanceModel):
     # there is more data but we are not using it ...
 
 
+class EpisodeSchemaVector(EpisodeSchema):
+    """Adds lance vector field"""
+
+    vector: Vector(func.ndims()) = func.VectorField()  # type: ignore[valid-type]
+
+
 @dataclass(frozen=True)
 class Shows:
     monday_morning_data_chat: str = "3Km3lBNzJpc1nOTJUtbtMh"
@@ -120,11 +128,20 @@ def spotify_shows(
         yield dlt.resource(
             client.paginate(url, params={"limit": 50}),
             name=show_name,
-            write_disposition="merge",
             primary_key="id",
             parallelized=True,
             max_table_nesting=0,
-        )
+            # reuse lance model to filter out all non-matching items and extra columns from spotify api
+            # 1. unknown columns are removed ("columns": "discard_value")
+            # 2. non validating items (ie. without id or url) are removed ("data_type": "discard_row")
+            # 3. for some reason None values are returned as well 🤯, add_filter takes care of that
+            columns=EpisodeSchema,
+            schema_contract={
+                "tables": "evolve",
+                "columns": "discard_value",
+                "data_type": "discard_row",
+            },
+        ).add_filter(lambda i: i is not None)
 
 
 @dlt.destination(batch_size=250, name="lancedb")
@@ -135,13 +152,7 @@ def lancedb_destination(items: TDataItems, table: TTableSchema) -> None:
     try:
         tbl = db.open_table(table["name"])
     except ValueError:
-        tbl = db.create_table(table["name"], schema=EpisodeSchema)
-
-    # remove all fields that are not in the schema
-    for item in items:
-        keys_to_remove = [key for key in item.keys() if key not in EpisodeSchema.model_fields]
-        for key in keys_to_remove:
-            del item[key]
+        tbl = db.create_table(table["name"], schema=EpisodeSchemaVector)
 
     tbl.add(items)

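The filtering pattern the example now relies on, as a standalone hedged sketch (the `Item` model, resource name, and sample rows are made up): a Pydantic model passed via `columns=` acts both as the dlt column schema and as a validator, while the `schema_contract` modes decide whether offending columns or whole rows are dropped.

```py
# hedged sketch of columns= plus schema_contract filtering; model and data are illustrative
import dlt
from pydantic import BaseModel


class Item(BaseModel):
    id: str
    name: str


@dlt.resource(
    columns=Item,  # Pydantic model doubles as column schema and validator
    schema_contract={
        "tables": "evolve",
        "columns": "discard_value",  # drop fields not declared on Item
        "data_type": "discard_row",  # drop rows that fail validation (e.g. missing id)
    },
)
def items():
    yield [
        {"id": "1", "name": "kept", "extra": "this column is discarded"},
        {"name": "no id, so this whole row is discarded"},
    ]


pipeline = dlt.pipeline(pipeline_name="contract_demo", destination="duckdb", dev_mode=True)
print(pipeline.run(items()))
```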
docs/tools/prepare_examples_tests.py

Lines changed: 3 additions & 1 deletion
@@ -11,9 +11,11 @@
 
 # settings
 SKIP_FOLDERS = ["archive", ".", "_", "local_cache"]
-SKIP_EXAMPLES: List[str] = []
+# @pytest.mark.rfam
+SKIP_EXAMPLES: List[str] = ["backfill_in_chunks", "connector_x_arrow"]
 SKIP_FORK_EXAMPLES: List[str] = ["custom_destination_lancedb"]
 
+
 # the entry point for the script
 MAIN_CLAUSE = 'if __name__ == "__main__":'

docs/website/docs/dlt-ecosystem/destinations/databricks.md

Lines changed: 31 additions & 2 deletions
@@ -730,7 +730,36 @@ databricks_adapter(
 ## Troubleshooting
 Use the following steps to avoid conflicts with Databricks' built-in Delta Live Tables (DLT) module and enable dltHub integration.
 
-### 1. Add an `init` script
+### Enable dlt on serverless (16.x)
+Delta Live Tables (DLT) are not available on serverless, but the import machinery that patches DLT is still there in the form of import hooks. You
+can temporarily disable this machinery to import `dlt` and use it afterwards. In a notebook cell (assuming that `dlt` is already installed):
+
+```sh
+%restart_python
+```
+
+```py
+import sys
+
+# the dlt patching hook is the first one on the list
+metas = list(sys.meta_path)
+sys.meta_path = metas[1:]
+
+# remove RUNTIME - uncomment on dlt before 1.18.0
+# import os
+# del os.environ["RUNTIME"]
+
+import dlt
+sys.meta_path = metas  # restore post import hooks
+
+# use dlt
+info = dlt.run([1, 2, 3], destination=dlt.destinations.filesystem("_data"), table_name="digits")
+print(info)
+```
+
+### Enable dlt on a cluster
+
+#### 1. Add an `init` script
 To ensure compatibility with the dltHub's dlt package in Databricks, add an `init` script that runs at cluster startup. This script installs the dlt package from dltHub, renames Databricks' built-in DLT module to avoid naming conflicts, and updates internal references to allow continued use under the alias `dlt_dbricks`.
 
 1. In your Databricks workspace directory, create a new file named `init.sh` and add the following content:
@@ -767,7 +796,7 @@ The following locations have been confirmed for the two latest LTS runtime versi
 - 15.4 LTS: /databricks/python_shell/lib/dbruntime/DeltaLiveTablesHook.py
 :::
 
-### 2. Remove preloaded databricks modules in the notebook
+#### 2. Remove preloaded databricks modules in the notebook
 After the cluster starts, Databricks may partially import its built-in Delta Live Tables (DLT) modules, which can interfere with the dlt package from dltHub.
 
 To ensure a clean environment, add the following code at the top of your notebook:

docs/website/docs/getting-started-snippets.py

Lines changed: 2 additions & 1 deletion
@@ -1,4 +1,4 @@
-import os
+import pytest
 from tests.pipeline.utils import assert_load_info
 
 
@@ -91,6 +91,7 @@ def api_snippet() -> None:
     assert_load_info(load_info)
 
 
+@pytest.mark.rfam
 def db_snippet() -> None:
     # @@@DLT_SNIPPET_START db
     import dlt

docs/website/docs/intro-snippets.py

Lines changed: 2 additions & 0 deletions
@@ -1,3 +1,4 @@
+import pytest
 from tests.pipeline.utils import assert_load_info
 
 
@@ -50,6 +51,7 @@ def csv_snippet() -> None:
     assert_load_info(load_info)
 
 
+@pytest.mark.rfam
 def db_snippet() -> None:
     # @@@DLT_SNIPPET_START db
     import dlt

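For context, a hedged sketch of how the new `rfam` marker fits together (the `conftest.py` registration below is an assumption; only the `@pytest.mark.rfam` decorators and the `-m "not forked and not rfam"` expression in `test_common.yml` come from this commit): registering the marker lets CI deselect snippets that depend on the public Rfam MySQL database.

```py
# hypothetical conftest.py sketch; only the marker name comes from this commit
import pytest


def pytest_configure(config: pytest.Config) -> None:
    # register the marker so `pytest -m "not forked and not rfam"` (see test_common.yml)
    # deselects Rfam-dependent snippets without unknown-marker warnings
    config.addinivalue_line("markers", "rfam: test depends on the public Rfam MySQL database")


@pytest.mark.rfam
def test_rfam_db_snippet() -> None:  # illustrative placeholder, not part of the commit
    ...
```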