
Commit 478a874

Merge pull request #3719 from lonvia/query-direction
Estimate query direction
2 parents 1db717b + 7f710d2 commit 478a874

File tree

7 files changed: +210 -108 lines


src/nominatim_api/search/db_search_builder.py

Lines changed: 30 additions & 31 deletions
@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Conversion from token assignment to an abstract DB search.
@@ -146,7 +146,7 @@ def build_special_search(self, sdata: dbf.SearchData,
         if address:
             sdata.lookups = [dbf.FieldLookup('nameaddress_vector',
                                              [t.token for r in address
-                                              for t in self.query.get_partials_list(r)],
+                                              for t in self.query.iter_partials(r)],
                                              lookups.Restrict)]
         yield dbs.PostcodeSearch(penalty, sdata)

@@ -159,7 +159,7 @@ def build_housenumber_search(self, sdata: dbf.SearchData, hnrs: List[qmod.Token]
         expected_count = sum(t.count for t in hnrs)

         partials = {t.token: t.addr_count for trange in address
-                    for t in self.query.get_partials_list(trange)}
+                    for t in self.query.iter_partials(trange)}

         if not partials:
             # can happen when none of the partials is indexed
@@ -203,9 +203,9 @@ def yield_lookups(self, name: qmod.TokenRange, address: List[qmod.TokenRange]
            are and tries to find a lookup that optimizes index use.
        """
        penalty = 0.0  # extra penalty
-        name_partials = {t.token: t for t in self.query.get_partials_list(name)}
+        name_partials = {t.token: t for t in self.query.iter_partials(name)}

-        addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
+        addr_partials = [t for r in address for t in self.query.iter_partials(r)]
        addr_tokens = list({t.token for t in addr_partials})

        exp_count = min(t.count for t in name_partials.values()) / (3**(len(name_partials) - 1))
@@ -282,8 +282,7 @@ def get_name_ranking(self, trange: qmod.TokenRange,
         ranks = [dbf.RankedTokens(t.penalty, [t.token]) for t in name_fulls]
         ranks.sort(key=lambda r: r.penalty)
         # Fallback, sum of penalty for partials
-        name_partials = self.query.get_partials_list(trange)
-        default = sum(t.penalty for t in name_partials) + 0.2
+        default = sum(t.penalty for t in self.query.iter_partials(trange)) + 0.2
         return dbf.FieldRanking(db_field, default, ranks)

     def get_addr_ranking(self, trange: qmod.TokenRange) -> dbf.FieldRanking:
@@ -296,35 +295,35 @@ def get_addr_ranking(self, trange: qmod.TokenRange) -> dbf.FieldRanking:

         while todo:
             neglen, pos, rank = heapq.heappop(todo)
+            # partial node
+            partial = self.query.nodes[pos].partial
+            if partial is not None:
+                if pos + 1 < trange.end:
+                    penalty = rank.penalty + partial.penalty \
+                              + PENALTY_WORDCHANGE[self.query.nodes[pos + 1].btype]
+                    heapq.heappush(todo, (neglen - 1, pos + 1,
+                                          dbf.RankedTokens(penalty, rank.tokens)))
+                else:
+                    ranks.append(dbf.RankedTokens(rank.penalty + partial.penalty,
+                                                  rank.tokens))
+            # full words
             for tlist in self.query.nodes[pos].starting:
-                if tlist.ttype in (qmod.TOKEN_PARTIAL, qmod.TOKEN_WORD):
+                if tlist.ttype == qmod.TOKEN_WORD:
                     if tlist.end < trange.end:
                         chgpenalty = PENALTY_WORDCHANGE[self.query.nodes[tlist.end].btype]
-                        if tlist.ttype == qmod.TOKEN_PARTIAL:
-                            penalty = rank.penalty + chgpenalty \
-                                      + max(t.penalty for t in tlist.tokens)
+                        for t in tlist.tokens:
                             heapq.heappush(todo, (neglen - 1, tlist.end,
-                                                  dbf.RankedTokens(penalty, rank.tokens)))
-                        else:
-                            for t in tlist.tokens:
-                                heapq.heappush(todo, (neglen - 1, tlist.end,
-                                                      rank.with_token(t, chgpenalty)))
+                                                  rank.with_token(t, chgpenalty)))
                     elif tlist.end == trange.end:
-                        if tlist.ttype == qmod.TOKEN_PARTIAL:
-                            ranks.append(dbf.RankedTokens(rank.penalty
-                                                          + max(t.penalty for t in tlist.tokens),
-                                                          rank.tokens))
-                        else:
-                            ranks.extend(rank.with_token(t, 0.0) for t in tlist.tokens)
-                        if len(ranks) >= 10:
-                            # Too many variants, bail out and only add
-                            # Worst-case Fallback: sum of penalty of partials
-                            name_partials = self.query.get_partials_list(trange)
-                            default = sum(t.penalty for t in name_partials) + 0.2
-                            ranks.append(dbf.RankedTokens(rank.penalty + default, []))
-                            # Bail out of outer loop
-                            todo.clear()
-                            break
+                        ranks.extend(rank.with_token(t, 0.0) for t in tlist.tokens)
+
+            if len(ranks) >= 10:
+                # Too many variants, bail out and only add
+                # Worst-case Fallback: sum of penalty of partials
+                default = sum(t.penalty for t in self.query.iter_partials(trange)) + 0.2
+                ranks.append(dbf.RankedTokens(rank.penalty + default, []))
+                # Bail out of outer loop
+                break

         ranks.sort(key=lambda r: len(r.tokens))
         default = ranks[0].penalty + 0.3
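
Note on the API change above: the builder now obtains partial tokens through query.iter_partials(range) instead of the former query.get_partials_list(range), and get_addr_ranking reads a single partial token stored directly on each query node (node.partial) rather than scanning the node's starting token lists. Below is a minimal sketch of what iter_partials() could look like under that assumption; the class and attribute shapes are illustrative stand-ins, not code from this commit.

from dataclasses import dataclass, field
from typing import Iterator, List, Optional


@dataclass
class Token:
    token: int
    penalty: float


@dataclass
class QueryNode:
    # at most one partial token starts at this node (as the diff suggests)
    partial: Optional[Token] = None


@dataclass
class TokenRange:
    start: int
    end: int


@dataclass
class QueryStruct:
    nodes: List[QueryNode] = field(default_factory=list)

    def iter_partials(self, trange: TokenRange) -> Iterator[Token]:
        """ Yield the partial token of every node inside the range,
            skipping nodes that have none.
        """
        for node in self.nodes[trange.start:trange.end]:
            if node.partial is not None:
                yield node.partial

A generator keeps call sites like sum(t.penalty for t in self.query.iter_partials(trange)) from building an intermediate list first, which fits the rename from get_partials_list() to iter_partials().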

src/nominatim_api/search/geocoder.py

Lines changed: 4 additions & 1 deletion
@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Public interface to the search code.
@@ -50,6 +50,9 @@ async def build_searches(self,
             self.query_analyzer = await make_query_analyzer(self.conn)

         query = await self.query_analyzer.analyze_query(phrases)
+        query.compute_direction_penalty()
+        log().var_dump('Query direction penalty',
+                       lambda: f"[{'LR' if query.dir_penalty < 0 else 'RL'}] {query.dir_penalty}")

         searches: List[AbstractSearch] = []
         if query.num_token_slots() > 0:
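
The geocoder now estimates the query direction right after analysis and logs it: the debug lambda reads a negative query.dir_penalty as left-to-right ('LR') and anything else as right-to-left ('RL'). A tiny illustrative helper reproducing that labelling convention (compute_direction_penalty() itself is defined elsewhere and not shown in this excerpt):

def direction_label(dir_penalty: float) -> str:
    """ Mirror the debug output: 'LR' for a negative direction penalty,
        'RL' otherwise.
    """
    return 'LR' if dir_penalty < 0 else 'RL'


print(direction_label(-0.2))  # -> LR
print(direction_label(0.1))   # -> RL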

src/nominatim_api/search/icu_tokenizer.py

Lines changed: 37 additions & 22 deletions
@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Implementation of query analysis for the ICU tokenizer.
@@ -267,32 +267,47 @@ def add_extra_tokens(self, query: qmod.QueryStruct) -> None:
     def rerank_tokens(self, query: qmod.QueryStruct) -> None:
         """ Add penalties to tokens that depend on presence of other token.
         """
-        for i, node, tlist in query.iter_token_lists():
-            if tlist.ttype == qmod.TOKEN_POSTCODE:
-                tlen = len(cast(ICUToken, tlist.tokens[0]).word_token)
-                for repl in node.starting:
-                    if repl.end == tlist.end and repl.ttype != qmod.TOKEN_POSTCODE \
-                       and (repl.ttype != qmod.TOKEN_HOUSENUMBER or tlen > 4):
-                        repl.add_penalty(0.39)
-            elif (tlist.ttype == qmod.TOKEN_HOUSENUMBER
-                  and len(tlist.tokens[0].lookup_word) <= 3):
-                if any(c.isdigit() for c in tlist.tokens[0].lookup_word):
-                    for repl in node.starting:
-                        if repl.end == tlist.end and repl.ttype != qmod.TOKEN_HOUSENUMBER:
-                            repl.add_penalty(0.5 - tlist.tokens[0].penalty)
-            elif tlist.ttype not in (qmod.TOKEN_COUNTRY, qmod.TOKEN_PARTIAL):
-                norm = ' '.join(n.term_normalized for n in query.nodes[i + 1:tlist.end + 1]
-                                if n.btype != qmod.BREAK_TOKEN)
-                if not norm:
-                    # Can happen when the token only covers a partial term
-                    norm = query.nodes[i + 1].term_normalized
-                for token in tlist.tokens:
-                    cast(ICUToken, token).rematch(norm)
+        for start, end, tlist in query.iter_tokens_by_edge():
+            if len(tlist) > 1:
+                # If it looks like a Postcode, give preference.
+                if qmod.TOKEN_POSTCODE in tlist:
+                    for ttype, tokens in tlist.items():
+                        if ttype != qmod.TOKEN_POSTCODE and \
+                           (ttype != qmod.TOKEN_HOUSENUMBER or
+                            start + 1 > end or
+                            len(query.nodes[end].term_lookup) > 4):
+                            for token in tokens:
+                                token.penalty += 0.39
+
+                # If it looks like a simple housenumber, prefer that.
+                if qmod.TOKEN_HOUSENUMBER in tlist:
+                    hnr_lookup = tlist[qmod.TOKEN_HOUSENUMBER][0].lookup_word
+                    if len(hnr_lookup) <= 3 and any(c.isdigit() for c in hnr_lookup):
+                        penalty = 0.5 - tlist[qmod.TOKEN_HOUSENUMBER][0].penalty
+                        for ttype, tokens in tlist.items():
+                            if ttype != qmod.TOKEN_HOUSENUMBER:
+                                for token in tokens:
+                                    token.penalty += penalty
+
+            # rerank tokens against the normalized form
+            norm = ' '.join(n.term_normalized for n in query.nodes[start + 1:end + 1]
+                            if n.btype != qmod.BREAK_TOKEN)
+            if not norm:
+                # Can happen when the token only covers a partial term
+                norm = query.nodes[start + 1].term_normalized
+            for ttype, tokens in tlist.items():
+                if ttype != qmod.TOKEN_COUNTRY:
+                    for token in tokens:
+                        cast(ICUToken, token).rematch(norm)


 def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
     yield ['type', 'from', 'to', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
     for i, node in enumerate(query.nodes):
+        if node.partial is not None:
+            t = cast(ICUToken, node.partial)
+            yield [qmod.TOKEN_PARTIAL, str(i), str(i + 1), t.token,
+                   t.word_token, t.lookup_word, t.penalty, t.count, t.info]
         for tlist in node.starting:
             for token in tlist.tokens:
                 t = cast(ICUToken, token)
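
Judging from its use in rerank_tokens() above, iter_tokens_by_edge() yields (start, end, tlist) triples where tlist maps a token type to all tokens spanning that same start/end pair, so competing readings of one span can be penalised against each other. A hypothetical sketch of such a grouping, using simplified stand-in types rather than the tokenizer's real query model:

from collections import defaultdict
from typing import Any, Dict, Iterator, List, Tuple


class TokenList:
    """ Stand-in for a list of tokens of one type attached to a node. """
    def __init__(self, ttype: str, end: int, tokens: List[Any]) -> None:
        self.ttype = ttype
        self.end = end
        self.tokens = tokens


def iter_tokens_by_edge(starting_lists: List[List[TokenList]]
                        ) -> Iterator[Tuple[int, int, Dict[str, List[Any]]]]:
    """ Group every node's starting token lists by their end node and,
        within one (start, end) edge, by token type.
    """
    for start, starting in enumerate(starting_lists):
        by_edge: Dict[int, Dict[str, List[Any]]] = defaultdict(dict)
        for tlist in starting:
            by_edge[tlist.end].setdefault(tlist.ttype, []).extend(tlist.tokens)
        for end, tmap in sorted(by_edge.items()):
            yield start, end, tmap

For example, an edge that carries both a postcode and a house-number reading would surface as a single tlist with two keys, which is the situation the len(tlist) > 1 branch above handles.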
