Skip to content

Commit 157414a

Browse files
authored
Merge pull request #3659 from lonvia/custom-datrie-structure
Replace datrie library with a simple custom Python implementation
2 parents f567ea8 + 18d4996 commit 157414a

File tree

6 files changed

+134
-22
lines changed

6 files changed

+134
-22
lines changed

docs/admin/Installation.md

-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ Furthermore the following Python libraries are required:
3737
* [Jinja2](https://palletsprojects.com/p/jinja/)
3838
* [PyICU](https://pypi.org/project/PyICU/)
3939
* [PyYaml](https://pyyaml.org/) (5.1+)
40-
* [datrie](https://github.com/pytries/datrie)
4140

4241
These will be installed automatically when using pip installation.
4342

docs/develop/Development-Environment.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ To set up the virtual environment with all necessary packages run:
7070
virtualenv ~/nominatim-dev-venv
7171
~/nominatim-dev-venv/bin/pip install\
7272
psutil psycopg[binary] PyICU SQLAlchemy \
73-
python-dotenv jinja2 pyYAML datrie behave \
73+
python-dotenv jinja2 pyYAML behave \
7474
mkdocs mkdocstrings mkdocs-gen-files pytest pytest-asyncio flake8 \
7575
types-jinja2 types-markupsafe types-psutil types-psycopg2 \
7676
types-pygments types-pyyaml types-requests types-ujson \

packaging/nominatim-db/pyproject.toml

-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ dependencies = [
1919
"python-dotenv",
2020
"jinja2",
2121
"pyYAML>=5.1",
22-
"datrie",
2322
"psutil",
2423
"PyICU"
2524
]

src/nominatim_db/tokenizer/token_analysis/generic.py

+12-19
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,19 @@
22
#
33
# This file is part of Nominatim. (https://nominatim.org)
44
#
5-
# Copyright (C) 2024 by the Nominatim developer community.
5+
# Copyright (C) 2025 by the Nominatim developer community.
66
# For a full list of authors see the git log.
77
"""
88
Generic processor for names that creates abbreviation variants.
99
"""
1010
from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast
1111
import itertools
1212

13-
import datrie
14-
1513
from ...errors import UsageError
1614
from ...data.place_name import PlaceName
1715
from .config_variants import get_variant_config
1816
from .generic_mutation import MutationVariantGenerator
17+
from .simple_trie import SimpleTrie
1918

2019
# Configuration section
2120

@@ -25,8 +24,7 @@ def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, An
2524
"""
2625
config: Dict[str, Any] = {}
2726

28-
config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
29-
normalizer)
27+
config['replacements'], _ = get_variant_config(rules.get('variants'), normalizer)
3028
config['variant_only'] = rules.get('mode', '') == 'variant-only'
3129

3230
# parse mutation rules
@@ -68,12 +66,8 @@ def __init__(self, norm: Any, to_ascii: Any, config: Mapping[str, Any]) -> None:
6866
self.variant_only = config['variant_only']
6967

7068
# Set up the replacement trie
71-
if config['replacements']:
72-
self.replacements = datrie.Trie(config['chars'])
73-
for src, repllist in config['replacements']:
74-
self.replacements[src] = repllist
75-
else:
76-
self.replacements = None
69+
self.replacements: Optional[SimpleTrie[List[str]]] = \
70+
SimpleTrie(config['replacements']) if config['replacements'] else None
7771

7872
# set up mutation rules
7973
self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
@@ -116,10 +110,10 @@ def _generate_word_variants(self, norm_name: str) -> Iterable[str]:
116110
pos = 0
117111
force_space = False
118112
while pos < baselen:
119-
full, repl = self.replacements.longest_prefix_item(baseform[pos:],
120-
(None, None))
121-
if full is not None:
122-
done = baseform[startpos:pos]
113+
frm = pos
114+
repl, pos = self.replacements.longest_prefix(baseform, pos)
115+
if repl is not None:
116+
done = baseform[startpos:frm]
123117
partials = [v + done + r
124118
for v, r in itertools.product(partials, repl)
125119
if not force_space or r.startswith(' ')]
@@ -128,11 +122,10 @@ def _generate_word_variants(self, norm_name: str) -> Iterable[str]:
128122
# to be helpful. Only use the original term.
129123
startpos = 0
130124
break
131-
startpos = pos + len(full)
132-
if full[-1] == ' ':
133-
startpos -= 1
125+
if baseform[pos - 1] == ' ':
126+
pos -= 1
134127
force_space = True
135-
pos = startpos
128+
startpos = pos
136129
else:
137130
pos += 1
138131
force_space = False
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
# SPDX-License-Identifier: GPL-3.0-or-later
2+
#
3+
# This file is part of Nominatim. (https://nominatim.org)
4+
#
5+
# Copyright (C) 2025 by the Nominatim developer community.
6+
# For a full list of authors see the git log.
7+
"""
8+
Simple dict-based implementation of a trie structure.
9+
"""
10+
from typing import TypeVar, Generic, Tuple, Optional, List, Dict
11+
from collections import defaultdict
12+
13+
T = TypeVar('T')
14+
15+
16+
class SimpleTrie(Generic[T]):
    """ A simple read-only trie structure.
        This structure supports exactly one lookup operation,
        which is longest-prefix lookup.

        Values may be any object except None: None is used internally
        to mark nodes that carry no value.
    """

    def __init__(self, data: Optional[List[Tuple[str, T]]] = None) -> None:
        """ Create a trie from an optional list of (key, value) tuples.
            When a key appears more than once, the last value wins.
        """
        # Child nodes, indexed by the next character of the key.
        self._tree: Dict[str, 'SimpleTrie[T]'] = defaultdict(SimpleTrie[T])
        # Value saved for the key that ends at this node, if any.
        self._value: Optional[T] = None
        # Characters that must be matched before the value and the
        # subtrees of this node apply (filled in by _make_compact()).
        self._prefix = ''

        if data:
            for key, value in data:
                self._add(key, 0, value)

            self._make_compact()

    def _add(self, word: str, pos: int, value: T) -> None:
        """ (Internal) Add a sub-word to the trie.
            The word is added from index 'pos'. If the sub-word to add
            is empty, then the trie saves the given value.
        """
        node = self
        for idx in range(pos, len(word)):
            # The defaultdict creates missing child nodes on access.
            node = node._tree[word[idx]]
        node._value = value

    def _make_compact(self) -> None:
        """ (Internal) Compress tree where there is exactly one subtree
            and no value.

            Compression works recursively starting at the leaf.
        """
        for subtree in self._tree.values():
            subtree._make_compact()

        if len(self._tree) == 1 and self._value is None:
            assert not self._prefix
            # Merge the only child into this node: its character and
            # prefix become this node's prefix, and its content is taken over.
            key, child = next(iter(self._tree.items()))
            self._prefix = key + child._prefix
            self._tree = child._tree
            self._value = child._value

    def longest_prefix(self, word: str, start: int = 0) -> Tuple[Optional[T], int]:
        """ Return the longest prefix match for the given word starting at
            the position 'start'.

            The function returns a tuple with the value for the longest match and
            the position of the word after the match. If no match was found at
            all, the function returns (None, start).
        """
        cur = self
        pos = start
        result: Tuple[Optional[T], int] = None, start

        while True:
            if cur._prefix:
                if not word.startswith(cur._prefix, pos):
                    return result
                pos += len(cur._prefix)

            # Compare against None explicitly, so that falsy values
            # (0, '', empty lists) still count as a match.
            if cur._value is not None:
                result = cur._value, pos

            # Membership test instead of indexing: indexing the defaultdict
            # would insert a new empty node.
            if pos >= len(word) or word[pos] not in cur._tree:
                return result

            cur = cur._tree[word[pos]]
            pos += 1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# SPDX-License-Identifier: GPL-3.0-or-later
2+
#
3+
# This file is part of Nominatim. (https://nominatim.org)
4+
#
5+
# Copyright (C) 2025 by the Nominatim developer community.
6+
# For a full list of authors see the git log.
7+
"""
8+
Tests for simplified trie structure.
9+
"""
10+
11+
from nominatim_db.tokenizer.token_analysis.simple_trie import SimpleTrie
12+
13+
def test_single_item_trie():
    """ Check lookups against a trie that holds only the key 'foob'.
    """
    trie = SimpleTrie([('foob', 42)])
    lookup = trie.longest_prefix

    # The key does not start at position 0 of the word.
    no_match_at_start = lookup('afoobar')
    assert no_match_at_start == (None, 0)

    # The key starts at position 1 and ends before position 5.
    match_with_offset = lookup('afoobar', start=1)
    assert match_with_offset == (42, 5)

    # The word is exactly the key.
    exact_match = lookup('foob')
    assert exact_match == (42, 4)

    # Only a part of the key is present from position 3 on.
    partial_key_only = lookup('123foofoo', 3)
    assert partial_key_only == (None, 3)
20+
21+
def test_complex_item_tree():
    """ Check lookups against a trie with keys that share prefixes.
    """
    entries = [('a', 1),
               ('b', 2),
               ('auto', 3),
               ('buto', 4),
               ('automat', 5),
               ('bu', 6),
               ('bx', 7)]
    trie = SimpleTrie(entries)

    # Lookups from the beginning of the word: (word, expected result).
    from_start = [('a', (1, 1)),
                  ('au', (1, 1)),
                  ('aut', (1, 1)),
                  ('auto', (3, 4)),
                  ('automat', (5, 7)),
                  ('automatx', (5, 7)),
                  ('butomat', (4, 4))]

    for word, expected in from_start:
        assert trie.longest_prefix(word) == expected

    # No key starts with 'u', so nothing matches from position 1.
    assert trie.longest_prefix('butomat', 1) == (None, 1)

0 commit comments

Comments
 (0)