Skip to content

Commit f567ea8

Browse files
authored
Merge pull request #3658 from lonvia/minor-query-parsing-optimisations
Minor query parsing optimisations
2 parents adabfee + 3e718e4 commit f567ea8

13 files changed

+486
-471
lines changed

docs/develop/ICU-Tokenizer-Modules.md

+13-7
Original file line numberDiff line numberDiff line change
@@ -60,13 +60,19 @@ The order of phrases matters to Nominatim when doing further processing.
6060
Thus, while you may split or join phrases, you should not reorder them
6161
unless you really know what you are doing.
6262

63-
Phrase types (`nominatim_api.search.PhraseType`) can further help narrowing
64-
down how the tokens in the phrase are interpreted. The following phrase types
65-
are known:
66-
67-
::: nominatim_api.search.PhraseType
68-
options:
69-
heading_level: 6
63+
Phrase types can further help narrow down how the tokens in the phrase
64+
are interpreted. The following phrase types are known:
65+
66+
| Name | Description |
67+
|----------------|-------------|
68+
| PHRASE_ANY | No specific designation (i.e. source is free-form query) |
69+
| PHRASE_AMENITY | Contains name or type of a POI |
70+
| PHRASE_STREET | Contains a street name optionally with a housenumber |
71+
| PHRASE_CITY | Contains the postal city |
72+
| PHRASE_COUNTY | Contains the equivalent of a county |
73+
| PHRASE_STATE | Contains a state or province |
74+
| PHRASE_POSTCODE| Contains a postal code |
75+
| PHRASE_COUNTRY | Contains the country name or code |
7076

7177

7278
## Custom sanitizer modules

src/nominatim_api/core.py

+24-24
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from .status import get_status, StatusResult
2727
from .lookup import get_places, get_detailed_place
2828
from .reverse import ReverseGeocoder
29-
from .search import ForwardGeocoder, Phrase, PhraseType, make_query_analyzer
29+
from . import search as nsearch
3030
from . import types as ntyp
3131
from .results import DetailedResult, ReverseResult, SearchResults
3232

@@ -207,7 +207,7 @@ async def details(self, place: ntyp.PlaceRef, **params: Any) -> Optional[Detaile
207207
async with self.begin() as conn:
208208
conn.set_query_timeout(self.query_timeout)
209209
if details.keywords:
210-
await make_query_analyzer(conn)
210+
await nsearch.make_query_analyzer(conn)
211211
return await get_detailed_place(conn, place, details)
212212

213213
async def lookup(self, places: Sequence[ntyp.PlaceRef], **params: Any) -> SearchResults:
@@ -219,7 +219,7 @@ async def lookup(self, places: Sequence[ntyp.PlaceRef], **params: Any) -> Search
219219
async with self.begin() as conn:
220220
conn.set_query_timeout(self.query_timeout)
221221
if details.keywords:
222-
await make_query_analyzer(conn)
222+
await nsearch.make_query_analyzer(conn)
223223
return await get_places(conn, places, details)
224224

225225
async def reverse(self, coord: ntyp.AnyPoint, **params: Any) -> Optional[ReverseResult]:
@@ -237,7 +237,7 @@ async def reverse(self, coord: ntyp.AnyPoint, **params: Any) -> Optional[Reverse
237237
async with self.begin() as conn:
238238
conn.set_query_timeout(self.query_timeout)
239239
if details.keywords:
240-
await make_query_analyzer(conn)
240+
await nsearch.make_query_analyzer(conn)
241241
geocoder = ReverseGeocoder(conn, details,
242242
self.reverse_restrict_to_country_area)
243243
return await geocoder.lookup(coord)
@@ -251,10 +251,10 @@ async def search(self, query: str, **params: Any) -> SearchResults:
251251

252252
async with self.begin() as conn:
253253
conn.set_query_timeout(self.query_timeout)
254-
geocoder = ForwardGeocoder(conn, ntyp.SearchDetails.from_kwargs(params),
255-
self.config.get_int('REQUEST_TIMEOUT')
256-
if self.config.REQUEST_TIMEOUT else None)
257-
phrases = [Phrase(PhraseType.NONE, p.strip()) for p in query.split(',')]
254+
geocoder = nsearch.ForwardGeocoder(conn, ntyp.SearchDetails.from_kwargs(params),
255+
self.config.get_int('REQUEST_TIMEOUT')
256+
if self.config.REQUEST_TIMEOUT else None)
257+
phrases = [nsearch.Phrase(nsearch.PHRASE_ANY, p.strip()) for p in query.split(',')]
258258
return await geocoder.lookup(phrases)
259259

260260
async def search_address(self, amenity: Optional[str] = None,
@@ -271,22 +271,22 @@ async def search_address(self, amenity: Optional[str] = None,
271271
conn.set_query_timeout(self.query_timeout)
272272
details = ntyp.SearchDetails.from_kwargs(params)
273273

274-
phrases: List[Phrase] = []
274+
phrases: List[nsearch.Phrase] = []
275275

276276
if amenity:
277-
phrases.append(Phrase(PhraseType.AMENITY, amenity))
277+
phrases.append(nsearch.Phrase(nsearch.PHRASE_AMENITY, amenity))
278278
if street:
279-
phrases.append(Phrase(PhraseType.STREET, street))
279+
phrases.append(nsearch.Phrase(nsearch.PHRASE_STREET, street))
280280
if city:
281-
phrases.append(Phrase(PhraseType.CITY, city))
281+
phrases.append(nsearch.Phrase(nsearch.PHRASE_CITY, city))
282282
if county:
283-
phrases.append(Phrase(PhraseType.COUNTY, county))
283+
phrases.append(nsearch.Phrase(nsearch.PHRASE_COUNTY, county))
284284
if state:
285-
phrases.append(Phrase(PhraseType.STATE, state))
285+
phrases.append(nsearch.Phrase(nsearch.PHRASE_STATE, state))
286286
if postalcode:
287-
phrases.append(Phrase(PhraseType.POSTCODE, postalcode))
287+
phrases.append(nsearch.Phrase(nsearch.PHRASE_POSTCODE, postalcode))
288288
if country:
289-
phrases.append(Phrase(PhraseType.COUNTRY, country))
289+
phrases.append(nsearch.Phrase(nsearch.PHRASE_COUNTRY, country))
290290

291291
if not phrases:
292292
raise UsageError('Nothing to search for.')
@@ -309,9 +309,9 @@ async def search_address(self, amenity: Optional[str] = None,
309309
if amenity:
310310
details.layers |= ntyp.DataLayer.POI
311311

312-
geocoder = ForwardGeocoder(conn, details,
313-
self.config.get_int('REQUEST_TIMEOUT')
314-
if self.config.REQUEST_TIMEOUT else None)
312+
geocoder = nsearch.ForwardGeocoder(conn, details,
313+
self.config.get_int('REQUEST_TIMEOUT')
314+
if self.config.REQUEST_TIMEOUT else None)
315315
return await geocoder.lookup(phrases)
316316

317317
async def search_category(self, categories: List[Tuple[str, str]],
@@ -328,15 +328,15 @@ async def search_category(self, categories: List[Tuple[str, str]],
328328
async with self.begin() as conn:
329329
conn.set_query_timeout(self.query_timeout)
330330
if near_query:
331-
phrases = [Phrase(PhraseType.NONE, p) for p in near_query.split(',')]
331+
phrases = [nsearch.Phrase(nsearch.PHRASE_ANY, p) for p in near_query.split(',')]
332332
else:
333333
phrases = []
334334
if details.keywords:
335-
await make_query_analyzer(conn)
335+
await nsearch.make_query_analyzer(conn)
336336

337-
geocoder = ForwardGeocoder(conn, details,
338-
self.config.get_int('REQUEST_TIMEOUT')
339-
if self.config.REQUEST_TIMEOUT else None)
337+
geocoder = nsearch.ForwardGeocoder(conn, details,
338+
self.config.get_int('REQUEST_TIMEOUT')
339+
if self.config.REQUEST_TIMEOUT else None)
340340
return await geocoder.lookup_pois(categories, phrases)
341341

342342

src/nominatim_api/search/__init__.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,12 @@
99
"""
1010
from .geocoder import (ForwardGeocoder as ForwardGeocoder)
1111
from .query import (Phrase as Phrase,
12-
PhraseType as PhraseType)
12+
PHRASE_ANY as PHRASE_ANY,
13+
PHRASE_AMENITY as PHRASE_AMENITY,
14+
PHRASE_STREET as PHRASE_STREET,
15+
PHRASE_CITY as PHRASE_CITY,
16+
PHRASE_COUNTY as PHRASE_COUNTY,
17+
PHRASE_STATE as PHRASE_STATE,
18+
PHRASE_POSTCODE as PHRASE_POSTCODE,
19+
PHRASE_COUNTRY as PHRASE_COUNTRY)
1320
from .query_analyzer_factory import (make_query_analyzer as make_query_analyzer)

src/nominatim_api/search/db_search_builder.py

+32-32
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import heapq
1212

1313
from ..types import SearchDetails, DataLayer
14-
from .query import QueryStruct, Token, TokenType, TokenRange, BreakType
14+
from . import query as qmod
1515
from .token_assignment import TokenAssignment
1616
from . import db_search_fields as dbf
1717
from . import db_searches as dbs
@@ -51,7 +51,7 @@ class SearchBuilder:
5151
""" Build the abstract search queries from token assignments.
5252
"""
5353

54-
def __init__(self, query: QueryStruct, details: SearchDetails) -> None:
54+
def __init__(self, query: qmod.QueryStruct, details: SearchDetails) -> None:
5555
self.query = query
5656
self.details = details
5757

@@ -97,7 +97,7 @@ def build(self, assignment: TokenAssignment) -> Iterator[dbs.AbstractSearch]:
9797
builder = self.build_poi_search(sdata)
9898
elif assignment.housenumber:
9999
hnr_tokens = self.query.get_tokens(assignment.housenumber,
100-
TokenType.HOUSENUMBER)
100+
qmod.TOKEN_HOUSENUMBER)
101101
builder = self.build_housenumber_search(sdata, hnr_tokens, assignment.address)
102102
else:
103103
builder = self.build_special_search(sdata, assignment.address,
@@ -128,7 +128,7 @@ def build_poi_search(self, sdata: dbf.SearchData) -> Iterator[dbs.AbstractSearch
128128
yield dbs.PoiSearch(sdata)
129129

130130
def build_special_search(self, sdata: dbf.SearchData,
131-
address: List[TokenRange],
131+
address: List[qmod.TokenRange],
132132
is_category: bool) -> Iterator[dbs.AbstractSearch]:
133133
""" Build abstract search queries for searches that do not involve
134134
a named place.
@@ -150,8 +150,8 @@ def build_special_search(self, sdata: dbf.SearchData,
150150
lookups.Restrict)]
151151
yield dbs.PostcodeSearch(penalty, sdata)
152152

153-
def build_housenumber_search(self, sdata: dbf.SearchData, hnrs: List[Token],
154-
address: List[TokenRange]) -> Iterator[dbs.AbstractSearch]:
153+
def build_housenumber_search(self, sdata: dbf.SearchData, hnrs: List[qmod.Token],
154+
address: List[qmod.TokenRange]) -> Iterator[dbs.AbstractSearch]:
155155
""" Build a simple address search for special entries where the
156156
housenumber is the main name token.
157157
"""
@@ -173,7 +173,7 @@ def build_housenumber_search(self, sdata: dbf.SearchData, hnrs: List[Token],
173173
list(partials), lookups.LookupAll))
174174
else:
175175
addr_fulls = [t.token for t
176-
in self.query.get_tokens(address[0], TokenType.WORD)]
176+
in self.query.get_tokens(address[0], qmod.TOKEN_WORD)]
177177
if len(addr_fulls) > 5:
178178
return
179179
sdata.lookups.append(
@@ -183,7 +183,7 @@ def build_housenumber_search(self, sdata: dbf.SearchData, hnrs: List[Token],
183183
yield dbs.PlaceSearch(0.05, sdata, expected_count)
184184

185185
def build_name_search(self, sdata: dbf.SearchData,
186-
name: TokenRange, address: List[TokenRange],
186+
name: qmod.TokenRange, address: List[qmod.TokenRange],
187187
is_category: bool) -> Iterator[dbs.AbstractSearch]:
188188
""" Build abstract search queries for simple name or address searches.
189189
"""
@@ -196,7 +196,7 @@ def build_name_search(self, sdata: dbf.SearchData,
196196
sdata.lookups = lookup
197197
yield dbs.PlaceSearch(penalty + name_penalty, sdata, count)
198198

199-
def yield_lookups(self, name: TokenRange, address: List[TokenRange]
199+
def yield_lookups(self, name: qmod.TokenRange, address: List[qmod.TokenRange]
200200
) -> Iterator[Tuple[float, int, List[dbf.FieldLookup]]]:
201201
""" Yield all variants how the given name and address should best
202202
be searched for. This takes into account how frequent the terms
@@ -216,7 +216,7 @@ def yield_lookups(self, name: TokenRange, address: List[TokenRange]
216216

217217
addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 30000
218218
# Partial term too frequent. Try looking up by rare full names first.
219-
name_fulls = self.query.get_tokens(name, TokenType.WORD)
219+
name_fulls = self.query.get_tokens(name, qmod.TOKEN_WORD)
220220
if name_fulls:
221221
fulls_count = sum(t.count for t in name_fulls)
222222

@@ -235,7 +235,7 @@ def yield_lookups(self, name: TokenRange, address: List[TokenRange]
235235
self.get_name_address_ranking(list(name_partials.keys()), addr_partials)
236236

237237
def get_name_address_ranking(self, name_tokens: List[int],
238-
addr_partials: List[Token]) -> List[dbf.FieldLookup]:
238+
addr_partials: List[qmod.Token]) -> List[dbf.FieldLookup]:
239239
""" Create a ranking expression looking up by name and address.
240240
"""
241241
lookup = [dbf.FieldLookup('name_vector', name_tokens, lookups.LookupAll)]
@@ -257,7 +257,7 @@ def get_name_address_ranking(self, name_tokens: List[int],
257257

258258
return lookup
259259

260-
def get_full_name_ranking(self, name_fulls: List[Token], addr_partials: List[Token],
260+
def get_full_name_ranking(self, name_fulls: List[qmod.Token], addr_partials: List[qmod.Token],
261261
use_lookup: bool) -> List[dbf.FieldLookup]:
262262
""" Create a ranking expression with full name terms and
263263
additional address lookup. When 'use_lookup' is true, then
@@ -281,19 +281,19 @@ def get_full_name_ranking(self, name_fulls: List[Token], addr_partials: List[Tok
281281
return dbf.lookup_by_any_name([t.token for t in name_fulls],
282282
addr_restrict_tokens, addr_lookup_tokens)
283283

284-
def get_name_ranking(self, trange: TokenRange,
284+
def get_name_ranking(self, trange: qmod.TokenRange,
285285
db_field: str = 'name_vector') -> dbf.FieldRanking:
286286
""" Create a ranking expression for a name term in the given range.
287287
"""
288-
name_fulls = self.query.get_tokens(trange, TokenType.WORD)
288+
name_fulls = self.query.get_tokens(trange, qmod.TOKEN_WORD)
289289
ranks = [dbf.RankedTokens(t.penalty, [t.token]) for t in name_fulls]
290290
ranks.sort(key=lambda r: r.penalty)
291291
# Fallback, sum of penalty for partials
292292
name_partials = self.query.get_partials_list(trange)
293293
default = sum(t.penalty for t in name_partials) + 0.2
294294
return dbf.FieldRanking(db_field, default, ranks)
295295

296-
def get_addr_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
296+
def get_addr_ranking(self, trange: qmod.TokenRange) -> dbf.FieldRanking:
297297
""" Create a list of ranking expressions for an address term
298298
for the given ranges.
299299
"""
@@ -304,10 +304,10 @@ def get_addr_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
304304
while todo:
305305
neglen, pos, rank = heapq.heappop(todo)
306306
for tlist in self.query.nodes[pos].starting:
307-
if tlist.ttype in (TokenType.PARTIAL, TokenType.WORD):
307+
if tlist.ttype in (qmod.TOKEN_PARTIAL, qmod.TOKEN_WORD):
308308
if tlist.end < trange.end:
309309
chgpenalty = PENALTY_WORDCHANGE[self.query.nodes[tlist.end].btype]
310-
if tlist.ttype == TokenType.PARTIAL:
310+
if tlist.ttype == qmod.TOKEN_PARTIAL:
311311
penalty = rank.penalty + chgpenalty \
312312
+ max(t.penalty for t in tlist.tokens)
313313
heapq.heappush(todo, (neglen - 1, tlist.end,
@@ -317,7 +317,7 @@ def get_addr_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
317317
heapq.heappush(todo, (neglen - 1, tlist.end,
318318
rank.with_token(t, chgpenalty)))
319319
elif tlist.end == trange.end:
320-
if tlist.ttype == TokenType.PARTIAL:
320+
if tlist.ttype == qmod.TOKEN_PARTIAL:
321321
ranks.append(dbf.RankedTokens(rank.penalty
322322
+ max(t.penalty for t in tlist.tokens),
323323
rank.tokens))
@@ -357,11 +357,11 @@ def get_search_data(self, assignment: TokenAssignment) -> Optional[dbf.SearchDat
357357
if assignment.housenumber:
358358
sdata.set_strings('housenumbers',
359359
self.query.get_tokens(assignment.housenumber,
360-
TokenType.HOUSENUMBER))
360+
qmod.TOKEN_HOUSENUMBER))
361361
if assignment.postcode:
362362
sdata.set_strings('postcodes',
363363
self.query.get_tokens(assignment.postcode,
364-
TokenType.POSTCODE))
364+
qmod.TOKEN_POSTCODE))
365365
if assignment.qualifier:
366366
tokens = self.get_qualifier_tokens(assignment.qualifier)
367367
if not tokens:
@@ -386,23 +386,23 @@ def get_search_data(self, assignment: TokenAssignment) -> Optional[dbf.SearchDat
386386

387387
return sdata
388388

389-
def get_country_tokens(self, trange: TokenRange) -> List[Token]:
389+
def get_country_tokens(self, trange: qmod.TokenRange) -> List[qmod.Token]:
390390
""" Return the list of country tokens for the given range,
391391
optionally filtered by the country list from the details
392392
parameters.
393393
"""
394-
tokens = self.query.get_tokens(trange, TokenType.COUNTRY)
394+
tokens = self.query.get_tokens(trange, qmod.TOKEN_COUNTRY)
395395
if self.details.countries:
396396
tokens = [t for t in tokens if t.lookup_word in self.details.countries]
397397

398398
return tokens
399399

400-
def get_qualifier_tokens(self, trange: TokenRange) -> List[Token]:
400+
def get_qualifier_tokens(self, trange: qmod.TokenRange) -> List[qmod.Token]:
401401
""" Return the list of qualifier tokens for the given range,
402402
optionally filtered by the qualifier list from the details
403403
parameters.
404404
"""
405-
tokens = self.query.get_tokens(trange, TokenType.QUALIFIER)
405+
tokens = self.query.get_tokens(trange, qmod.TOKEN_QUALIFIER)
406406
if self.details.categories:
407407
tokens = [t for t in tokens if t.get_category() in self.details.categories]
408408

@@ -415,7 +415,7 @@ def get_near_items(self, assignment: TokenAssignment) -> Optional[dbf.WeightedCa
415415
"""
416416
if assignment.near_item:
417417
tokens: Dict[Tuple[str, str], float] = {}
418-
for t in self.query.get_tokens(assignment.near_item, TokenType.NEAR_ITEM):
418+
for t in self.query.get_tokens(assignment.near_item, qmod.TOKEN_NEAR_ITEM):
419419
cat = t.get_category()
420420
# The category of a near search will be that of near_item.
421421
# Thus, if search is restricted to a category parameter,
@@ -429,11 +429,11 @@ def get_near_items(self, assignment: TokenAssignment) -> Optional[dbf.WeightedCa
429429

430430

431431
PENALTY_WORDCHANGE = {
432-
BreakType.START: 0.0,
433-
BreakType.END: 0.0,
434-
BreakType.PHRASE: 0.0,
435-
BreakType.SOFT_PHRASE: 0.0,
436-
BreakType.WORD: 0.1,
437-
BreakType.PART: 0.2,
438-
BreakType.TOKEN: 0.4
432+
qmod.BREAK_START: 0.0,
433+
qmod.BREAK_END: 0.0,
434+
qmod.BREAK_PHRASE: 0.0,
435+
qmod.BREAK_SOFT_PHRASE: 0.0,
436+
qmod.BREAK_WORD: 0.1,
437+
qmod.BREAK_PART: 0.2,
438+
qmod.BREAK_TOKEN: 0.4
439439
}

0 commit comments

Comments
 (0)