Commit 2ce2d03
Merge pull request osm-search#3702 from lonvia/remove-tokenizer-dir
Remove automatic setup of tokenizer directory

So far the tokenizer factory would create a directory for private tokenizer data and then hand the directory location to the tokenizer. The ICU tokenizer doesn't need any extra data anymore, so it makes no sense to create a directory that then remains empty.

If a tokenizer needs such a directory in the future, it needs to create it on its own and make sure to handle the situation correctly where no project directory is used at all.
2 parents c5bbeb6 + 186f562
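What that future responsibility could look like in code: the following is a purely hypothetical sketch, not part of this commit (FileBackedTokenizer is an invented name). A tokenizer that still needs private on-disk data would create its directory itself during init_new_db and cope with the case where no project directory is configured at all.

    # Hypothetical sketch, not part of this commit: a tokenizer that still
    # needs private on-disk data now manages the directory itself.
    from pathlib import Path
    from typing import Optional

    class FileBackedTokenizer:
        def __init__(self, dsn: str) -> None:
            self.dsn = dsn
            self.data_dir: Optional[Path] = None

        def init_new_db(self, config, init_db: bool = True) -> None:
            if config.project_dir is None:
                # No project directory in use at all; fall back to
                # database-only operation (or raise a usage error).
                self.data_dir = None
                return
            self.data_dir = Path(config.project_dir) / 'tokenizer'
            # Create the private directory on demand; an existing
            # directory is fine, an existing file raises an error.
            self.data_dir.mkdir(parents=True, exist_ok=True)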

File tree: 8 files changed (+16, −52 lines)

src/nominatim_db/tokenizer/base.py (+2 −3)

@@ -2,15 +2,14 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Abstract class definitions for tokenizers. These base classes are here
 mainly for documentation purposes.
 """
 from abc import ABC, abstractmethod
 from typing import List, Tuple, Dict, Any, Optional, Iterable
-from pathlib import Path
 
 from ..typing import Protocol
 from ..config import Configuration
@@ -232,6 +231,6 @@ class TokenizerModule(Protocol):
         own tokenizer.
     """
 
-    def create(self, dsn: str, data_dir: Path) -> AbstractTokenizer:
+    def create(self, dsn: str) -> AbstractTokenizer:
        """ Factory for new tokenizers.
        """

src/nominatim_db/tokenizer/factory.py (+3 −16)

@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Functions for creating a tokenizer or initialising the right one for an
@@ -52,19 +52,10 @@ def create_tokenizer(config: Configuration, init_db: bool = True,
     if module_name is None:
         module_name = config.TOKENIZER
 
-    # Create the directory for the tokenizer data
-    assert config.project_dir is not None
-    basedir = config.project_dir / 'tokenizer'
-    if not basedir.exists():
-        basedir.mkdir()
-    elif not basedir.is_dir():
-        LOG.fatal("Tokenizer directory '%s' cannot be created.", basedir)
-        raise UsageError("Tokenizer setup failed.")
-
     # Import and initialize the tokenizer.
     tokenizer_module = _import_tokenizer(module_name)
 
-    tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
+    tokenizer = tokenizer_module.create(config.get_libpq_dsn())
     tokenizer.init_new_db(config, init_db=init_db)
 
     with connect(config.get_libpq_dsn()) as conn:
@@ -80,10 +71,6 @@ def get_tokenizer_for_db(config: Configuration) -> AbstractTokenizer:
        and initialises it.
    """
     assert config.project_dir is not None
-    basedir = config.project_dir / 'tokenizer'
-    if not basedir.is_dir():
-        # Directory will be repopulated by tokenizer below.
-        basedir.mkdir()
 
     with connect(config.get_libpq_dsn()) as conn:
         name = properties.get_property(conn, 'tokenizer')
@@ -94,7 +81,7 @@ def get_tokenizer_for_db(config: Configuration) -> AbstractTokenizer:
 
     tokenizer_module = _import_tokenizer(name)
 
-    tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
+    tokenizer = tokenizer_module.create(config.get_libpq_dsn())
     tokenizer.init_from_project(config)
 
     return tokenizer

src/nominatim_db/tokenizer/icu_tokenizer.py (+4 −6)

@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Tokenizer implementing normalisation as used before Nominatim 4 but using
@@ -12,7 +12,6 @@
                     Dict, Set, Iterable
 import itertools
 import logging
-from pathlib import Path
 
 from psycopg.types.json import Jsonb
 from psycopg import sql as pysql
@@ -38,10 +37,10 @@
                 ('housenumbers', 'H'))
 
 
-def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
+def create(dsn: str) -> 'ICUTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
-    return ICUTokenizer(dsn, data_dir)
+    return ICUTokenizer(dsn)
 
 
 class ICUTokenizer(AbstractTokenizer):
@@ -50,9 +49,8 @@ class ICUTokenizer(AbstractTokenizer):
        normalization routines in Nominatim 3.
    """
 
-    def __init__(self, dsn: str, data_dir: Path) -> None:
+    def __init__(self, dsn: str) -> None:
        self.dsn = dsn
-        self.data_dir = data_dir
        self.loader: Optional[ICURuleLoader] = None
 
    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:

test/python/conftest.py (+1 −1)

@@ -234,6 +234,6 @@ def _import_dummy(*args, **kwargs):
     property_table.set('tokenizer', 'dummy')
 
     def _create_tokenizer():
-        return dummy_tokenizer.DummyTokenizer(None, None)
+        return dummy_tokenizer.DummyTokenizer(None)
 
     return _create_tokenizer

test/python/dummy_tokenizer.py (+3 −4)

@@ -11,17 +11,16 @@
 from nominatim_db.config import Configuration
 
 
-def create(dsn, data_dir):
+def create(dsn):
    """ Create a new instance of the tokenizer provided by this module.
    """
-    return DummyTokenizer(dsn, data_dir)
+    return DummyTokenizer(dsn)
 
 
 class DummyTokenizer:
 
-    def __init__(self, dsn, data_dir):
+    def __init__(self, dsn):
        self.dsn = dsn
-        self.data_dir = data_dir
        self.init_state = None
        self.analyser_cache = {}
 

test/python/tokenizer/test_factory.py (−16)

@@ -32,24 +32,9 @@ def test_setup_dummy_tokenizer(self, temp_db_conn):
 
         assert isinstance(tokenizer, DummyTokenizer)
         assert tokenizer.init_state == "new"
-        assert (self.config.project_dir / 'tokenizer').is_dir()
 
         assert properties.get_property(temp_db_conn, 'tokenizer') == 'dummy'
 
-    def test_setup_tokenizer_dir_exists(self):
-        (self.config.project_dir / 'tokenizer').mkdir()
-
-        tokenizer = factory.create_tokenizer(self.config)
-
-        assert isinstance(tokenizer, DummyTokenizer)
-        assert tokenizer.init_state == "new"
-
-    def test_setup_tokenizer_dir_failure(self):
-        (self.config.project_dir / 'tokenizer').write_text("foo")
-
-        with pytest.raises(UsageError):
-            factory.create_tokenizer(self.config)
-
     def test_load_tokenizer(self):
         factory.create_tokenizer(self.config)
 
@@ -64,7 +49,6 @@ def test_load_repopulate_tokenizer_dir(self):
         self.config.project_dir = self.config.project_dir
 
         factory.get_tokenizer_for_db(self.config)
-        assert (self.config.project_dir / 'tokenizer').exists()
 
     def test_load_missing_property(self, temp_db_cursor):
         factory.create_tokenizer(self.config)

test/python/tokenizer/test_icu.py (+2 −5)

@@ -39,12 +39,9 @@ def test_config(project_env, tmp_path):
 
 
 @pytest.fixture
-def tokenizer_factory(dsn, tmp_path, property_table,
-                      sql_preprocessor, place_table, word_table):
-    (tmp_path / 'tokenizer').mkdir()
-
+def tokenizer_factory(dsn, property_table, sql_preprocessor, place_table, word_table):
     def _maker():
-        return icu_tokenizer.create(dsn, tmp_path / 'tokenizer')
+        return icu_tokenizer.create(dsn)
 
     return _maker
 

test/python/tools/test_postcodes.py (+1 −1)

@@ -63,7 +63,7 @@ def row_set(self):
 
 @pytest.fixture
 def tokenizer():
-    return dummy_tokenizer.DummyTokenizer(None, None)
+    return dummy_tokenizer.DummyTokenizer(None)
 
 
 @pytest.fixture
