
Commit 478a874

Merge pull request #3719 from lonvia/query-direction
Estimate query direction
2 parents 1db717b + 7f710d2 commit 478a874

File tree

7 files changed: +210 -108 lines


src/nominatim_api/search/db_search_builder.py

Lines changed: 30 additions & 31 deletions
@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Conversion from token assignment to an abstract DB search.
@@ -146,7 +146,7 @@ def build_special_search(self, sdata: dbf.SearchData,
         if address:
             sdata.lookups = [dbf.FieldLookup('nameaddress_vector',
                                              [t.token for r in address
-                                              for t in self.query.get_partials_list(r)],
+                                              for t in self.query.iter_partials(r)],
                                              lookups.Restrict)]
         yield dbs.PostcodeSearch(penalty, sdata)

@@ -159,7 +159,7 @@ def build_housenumber_search(self, sdata: dbf.SearchData, hnrs: List[qmod.Token]
         expected_count = sum(t.count for t in hnrs)

         partials = {t.token: t.addr_count for trange in address
-                    for t in self.query.get_partials_list(trange)}
+                    for t in self.query.iter_partials(trange)}

         if not partials:
             # can happen when none of the partials is indexed
@@ -203,9 +203,9 @@ def yield_lookups(self, name: qmod.TokenRange, address: List[qmod.TokenRange]
            are and tries to find a lookup that optimizes index use.
        """
        penalty = 0.0  # extra penalty
-        name_partials = {t.token: t for t in self.query.get_partials_list(name)}
+        name_partials = {t.token: t for t in self.query.iter_partials(name)}

-        addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
+        addr_partials = [t for r in address for t in self.query.iter_partials(r)]
        addr_tokens = list({t.token for t in addr_partials})

        exp_count = min(t.count for t in name_partials.values()) / (3**(len(name_partials) - 1))
@@ -282,8 +282,7 @@ def get_name_ranking(self, trange: qmod.TokenRange,
         ranks = [dbf.RankedTokens(t.penalty, [t.token]) for t in name_fulls]
         ranks.sort(key=lambda r: r.penalty)
         # Fallback, sum of penalty for partials
-        name_partials = self.query.get_partials_list(trange)
-        default = sum(t.penalty for t in name_partials) + 0.2
+        default = sum(t.penalty for t in self.query.iter_partials(trange)) + 0.2
         return dbf.FieldRanking(db_field, default, ranks)

     def get_addr_ranking(self, trange: qmod.TokenRange) -> dbf.FieldRanking:
@@ -296,35 +295,35 @@ def get_addr_ranking(self, trange: qmod.TokenRange) -> dbf.FieldRanking:

         while todo:
             neglen, pos, rank = heapq.heappop(todo)
+            # partial node
+            partial = self.query.nodes[pos].partial
+            if partial is not None:
+                if pos + 1 < trange.end:
+                    penalty = rank.penalty + partial.penalty \
+                              + PENALTY_WORDCHANGE[self.query.nodes[pos + 1].btype]
+                    heapq.heappush(todo, (neglen - 1, pos + 1,
+                                          dbf.RankedTokens(penalty, rank.tokens)))
+                else:
+                    ranks.append(dbf.RankedTokens(rank.penalty + partial.penalty,
+                                                  rank.tokens))
+            # full words
             for tlist in self.query.nodes[pos].starting:
-                if tlist.ttype in (qmod.TOKEN_PARTIAL, qmod.TOKEN_WORD):
+                if tlist.ttype == qmod.TOKEN_WORD:
                     if tlist.end < trange.end:
                         chgpenalty = PENALTY_WORDCHANGE[self.query.nodes[tlist.end].btype]
-                        if tlist.ttype == qmod.TOKEN_PARTIAL:
-                            penalty = rank.penalty + chgpenalty \
-                                      + max(t.penalty for t in tlist.tokens)
+                        for t in tlist.tokens:
                             heapq.heappush(todo, (neglen - 1, tlist.end,
-                                                  dbf.RankedTokens(penalty, rank.tokens)))
-                        else:
-                            for t in tlist.tokens:
-                                heapq.heappush(todo, (neglen - 1, tlist.end,
-                                                      rank.with_token(t, chgpenalty)))
+                                                  rank.with_token(t, chgpenalty)))
                     elif tlist.end == trange.end:
-                        if tlist.ttype == qmod.TOKEN_PARTIAL:
-                            ranks.append(dbf.RankedTokens(rank.penalty
-                                                          + max(t.penalty for t in tlist.tokens),
-                                                          rank.tokens))
-                        else:
-                            ranks.extend(rank.with_token(t, 0.0) for t in tlist.tokens)
-                        if len(ranks) >= 10:
-                            # Too many variants, bail out and only add
-                            # Worst-case Fallback: sum of penalty of partials
-                            name_partials = self.query.get_partials_list(trange)
-                            default = sum(t.penalty for t in name_partials) + 0.2
-                            ranks.append(dbf.RankedTokens(rank.penalty + default, []))
-                            # Bail out of outer loop
-                            todo.clear()
-                            break
+                        ranks.extend(rank.with_token(t, 0.0) for t in tlist.tokens)
+
+            if len(ranks) >= 10:
+                # Too many variants, bail out and only add
+                # Worst-case Fallback: sum of penalty of partials
+                default = sum(t.penalty for t in self.query.iter_partials(trange)) + 0.2
+                ranks.append(dbf.RankedTokens(rank.penalty + default, []))
+                # Bail out of outer loop
+                break

         ranks.sort(key=lambda r: len(r.tokens))
         default = ranks[0].penalty + 0.3
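
Note on the API change above: the builder now obtains partial tokens through query.iter_partials(range) instead of the former query.get_partials_list(range), and get_addr_ranking reads a single partial token stored directly on each query node (node.partial) rather than scanning the node's starting token lists. Below is a minimal sketch of what iter_partials() could look like under that assumption; the class and attribute shapes are illustrative stand-ins, not code from this commit.

from dataclasses import dataclass, field
from typing import Iterator, List, Optional


@dataclass
class Token:
    token: int
    penalty: float


@dataclass
class QueryNode:
    # at most one partial token starts at this node (as the diff suggests)
    partial: Optional[Token] = None


@dataclass
class TokenRange:
    start: int
    end: int


@dataclass
class QueryStruct:
    nodes: List[QueryNode] = field(default_factory=list)

    def iter_partials(self, trange: TokenRange) -> Iterator[Token]:
        """ Yield the partial token of every node inside the range,
            skipping nodes that have none.
        """
        for node in self.nodes[trange.start:trange.end]:
            if node.partial is not None:
                yield node.partial

A generator keeps call sites like sum(t.penalty for t in self.query.iter_partials(trange)) from building an intermediate list first, which fits the rename from get_partials_list() to iter_partials().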

src/nominatim_api/search/geocoder.py

Lines changed: 4 additions & 1 deletion
@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Public interface to the search code.
@@ -50,6 +50,9 @@ async def build_searches(self,
             self.query_analyzer = await make_query_analyzer(self.conn)

         query = await self.query_analyzer.analyze_query(phrases)
+        query.compute_direction_penalty()
+        log().var_dump('Query direction penalty',
+                       lambda: f"[{'LR' if query.dir_penalty < 0 else 'RL'}] {query.dir_penalty}")

         searches: List[AbstractSearch] = []
         if query.num_token_slots() > 0:
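
The geocoder now estimates the query direction right after analysis and logs it: the debug lambda reads a negative query.dir_penalty as left-to-right ('LR') and anything else as right-to-left ('RL'). A tiny illustrative helper reproducing that labelling convention (compute_direction_penalty() itself is defined elsewhere and not shown in this excerpt):

def direction_label(dir_penalty: float) -> str:
    """ Mirror the debug output: 'LR' for a negative direction penalty,
        'RL' otherwise.
    """
    return 'LR' if dir_penalty < 0 else 'RL'


print(direction_label(-0.2))  # -> LR
print(direction_label(0.1))   # -> RL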

src/nominatim_api/search/icu_tokenizer.py

Lines changed: 37 additions & 22 deletions
@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Implementation of query analysis for the ICU tokenizer.
@@ -267,32 +267,47 @@ def add_extra_tokens(self, query: qmod.QueryStruct) -> None:
     def rerank_tokens(self, query: qmod.QueryStruct) -> None:
         """ Add penalties to tokens that depend on presence of other token.
         """
-        for i, node, tlist in query.iter_token_lists():
-            if tlist.ttype == qmod.TOKEN_POSTCODE:
-                tlen = len(cast(ICUToken, tlist.tokens[0]).word_token)
-                for repl in node.starting:
-                    if repl.end == tlist.end and repl.ttype != qmod.TOKEN_POSTCODE \
-                       and (repl.ttype != qmod.TOKEN_HOUSENUMBER or tlen > 4):
-                        repl.add_penalty(0.39)
-            elif (tlist.ttype == qmod.TOKEN_HOUSENUMBER
-                  and len(tlist.tokens[0].lookup_word) <= 3):
-                if any(c.isdigit() for c in tlist.tokens[0].lookup_word):
-                    for repl in node.starting:
-                        if repl.end == tlist.end and repl.ttype != qmod.TOKEN_HOUSENUMBER:
-                            repl.add_penalty(0.5 - tlist.tokens[0].penalty)
-            elif tlist.ttype not in (qmod.TOKEN_COUNTRY, qmod.TOKEN_PARTIAL):
-                norm = ' '.join(n.term_normalized for n in query.nodes[i + 1:tlist.end + 1]
-                                if n.btype != qmod.BREAK_TOKEN)
-                if not norm:
-                    # Can happen when the token only covers a partial term
-                    norm = query.nodes[i + 1].term_normalized
-                for token in tlist.tokens:
-                    cast(ICUToken, token).rematch(norm)
+        for start, end, tlist in query.iter_tokens_by_edge():
+            if len(tlist) > 1:
+                # If it looks like a Postcode, give preference.
+                if qmod.TOKEN_POSTCODE in tlist:
+                    for ttype, tokens in tlist.items():
+                        if ttype != qmod.TOKEN_POSTCODE and \
+                           (ttype != qmod.TOKEN_HOUSENUMBER or
+                            start + 1 > end or
+                            len(query.nodes[end].term_lookup) > 4):
+                            for token in tokens:
+                                token.penalty += 0.39
+
+                # If it looks like a simple housenumber, prefer that.
+                if qmod.TOKEN_HOUSENUMBER in tlist:
+                    hnr_lookup = tlist[qmod.TOKEN_HOUSENUMBER][0].lookup_word
+                    if len(hnr_lookup) <= 3 and any(c.isdigit() for c in hnr_lookup):
+                        penalty = 0.5 - tlist[qmod.TOKEN_HOUSENUMBER][0].penalty
+                        for ttype, tokens in tlist.items():
+                            if ttype != qmod.TOKEN_HOUSENUMBER:
+                                for token in tokens:
+                                    token.penalty += penalty
+
+            # rerank tokens against the normalized form
+            norm = ' '.join(n.term_normalized for n in query.nodes[start + 1:end + 1]
+                            if n.btype != qmod.BREAK_TOKEN)
+            if not norm:
+                # Can happen when the token only covers a partial term
+                norm = query.nodes[start + 1].term_normalized
+            for ttype, tokens in tlist.items():
+                if ttype != qmod.TOKEN_COUNTRY:
+                    for token in tokens:
+                        cast(ICUToken, token).rematch(norm)


 def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
     yield ['type', 'from', 'to', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
     for i, node in enumerate(query.nodes):
+        if node.partial is not None:
+            t = cast(ICUToken, node.partial)
+            yield [qmod.TOKEN_PARTIAL, str(i), str(i + 1), t.token,
+                   t.word_token, t.lookup_word, t.penalty, t.count, t.info]
         for tlist in node.starting:
             for token in tlist.tokens:
                 t = cast(ICUToken, token)
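
Judging from its use in rerank_tokens() above, iter_tokens_by_edge() yields (start, end, tlist) triples where tlist maps a token type to all tokens spanning that same start/end pair, so competing readings of one span can be penalised against each other. A hypothetical sketch of such a grouping, using simplified stand-in types rather than the tokenizer's real query model:

from collections import defaultdict
from typing import Any, Dict, Iterator, List, Tuple


class TokenList:
    """ Stand-in for a list of tokens of one type attached to a node. """
    def __init__(self, ttype: str, end: int, tokens: List[Any]) -> None:
        self.ttype = ttype
        self.end = end
        self.tokens = tokens


def iter_tokens_by_edge(starting_lists: List[List[TokenList]]
                        ) -> Iterator[Tuple[int, int, Dict[str, List[Any]]]]:
    """ Group every node's starting token lists by their end node and,
        within one (start, end) edge, by token type.
    """
    for start, starting in enumerate(starting_lists):
        by_edge: Dict[int, Dict[str, List[Any]]] = defaultdict(dict)
        for tlist in starting:
            by_edge[tlist.end].setdefault(tlist.ttype, []).extend(tlist.tokens)
        for end, tmap in sorted(by_edge.items()):
            yield start, end, tmap

For example, an edge that carries both a postcode and a house-number reading would surface as a single tlist with two keys, which is the situation the len(tlist) > 1 branch above handles.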
