From a59186732d9600c16fd3d85780e18359f236bb5e Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Thu, 1 Dec 2022 11:59:50 +0000 Subject: [PATCH 1/6] first cut --- .../elasticsearch/search/SearchModule.java | 3 + .../subphase/highlight/HighlightField.java | 4 + .../subphase/highlight/HighlightPhase.java | 1 - .../highlight/MatchesFieldHighlighter.java | 140 ++++++++ .../highlight/MatchesHighlighter.java | 51 +++ .../highlight/MatchesHighlighterState.java | 83 +++++ .../highlight/XOffsetsFromMatchIterator.java | 70 ++++ .../highlight/XOffsetsFromPositions.java | 149 +++++++++ .../subphase/highlight/XPassageSelector.java | 313 ++++++++++++++++++ .../highlight/HighlightFieldTests.java | 3 +- .../highlight/MatchesHighlighterTests.java | 111 +++++++ .../search/fetch/HighlighterTestCase.java | 12 +- 12 files changed, 937 insertions(+), 3 deletions(-) create mode 100644 server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesFieldHighlighter.java create mode 100644 server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighter.java create mode 100644 server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterState.java create mode 100644 server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromMatchIterator.java create mode 100644 server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromPositions.java create mode 100644 server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XPassageSelector.java create mode 100644 server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterTests.java diff --git a/server/src/main/java/org/elasticsearch/search/SearchModule.java b/server/src/main/java/org/elasticsearch/search/SearchModule.java index fa2699a30b897..c8635db570e5d 100644 --- a/server/src/main/java/org/elasticsearch/search/SearchModule.java +++ b/server/src/main/java/org/elasticsearch/search/SearchModule.java @@ -8,6 +8,7 @@ package org.elasticsearch.search; +import org.apache.lucene.search.Matches; import org.elasticsearch.common.CheckedBiConsumer; import org.elasticsearch.common.NamedRegistry; import org.elasticsearch.common.io.stream.NamedWriteableRegistry; @@ -219,6 +220,7 @@ import org.elasticsearch.search.fetch.subphase.highlight.FastVectorHighlighter; import org.elasticsearch.search.fetch.subphase.highlight.HighlightPhase; import org.elasticsearch.search.fetch.subphase.highlight.Highlighter; +import org.elasticsearch.search.fetch.subphase.highlight.MatchesHighlighter; import org.elasticsearch.search.fetch.subphase.highlight.PlainHighlighter; import org.elasticsearch.search.fetch.subphase.highlight.UnifiedHighlighter; import org.elasticsearch.search.internal.ShardSearchRequest; @@ -863,6 +865,7 @@ private static Map setupHighlighters(Settings settings, Lis highlighters.register("fvh", new FastVectorHighlighter(settings)); highlighters.register("plain", new PlainHighlighter()); highlighters.register("unified", new UnifiedHighlighter()); + highlighters.register("matches", new MatchesHighlighter()); highlighters.extractAndRegister(plugins, SearchPlugin::getHighlighters); return unmodifiableMap(highlighters.getRegistry()); diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightField.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightField.java index d4b5234f4e0b2..df1be7092da9b 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightField.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightField.java @@ -54,6 +54,10 @@ public HighlightField(String name, Text[] fragments) { this.fragments = fragments; } + public HighlightField(String name, List fragments) { + this(name, fragments.stream().map(Text::new).toArray(Text[]::new)); + } + /** * The name of the field highlighted. */ diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightPhase.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightPhase.java index 4b60d32d3065f..b9e0200094a34 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightPhase.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightPhase.java @@ -40,7 +40,6 @@ public FetchSubPhaseProcessor getProcessor(FetchContext context) { if (context.highlight() == null) { return null; } - return getProcessor(context, context.highlight(), context.parsedQuery().query()); } diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesFieldHighlighter.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesFieldHighlighter.java new file mode 100644 index 0000000000000..f19fa62909012 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesFieldHighlighter.java @@ -0,0 +1,140 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.search.fetch.subphase.highlight; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.AnalyzerWrapper; +import org.apache.lucene.search.FilterMatchesIterator; +import org.apache.lucene.search.Matches; +import org.apache.lucene.search.MatchesIterator; +import org.apache.lucene.search.highlight.OffsetLimitTokenFilter; +import org.apache.lucene.search.matchhighlight.OffsetRange; +import org.apache.lucene.search.matchhighlight.OffsetsFromTokens; +import org.apache.lucene.search.matchhighlight.OffsetsRetrievalStrategy; +import org.apache.lucene.search.matchhighlight.Passage; +import org.apache.lucene.search.matchhighlight.PassageFormatter; +import org.apache.lucene.search.matchhighlight.PassageSelector; +import org.apache.lucene.search.uhighlight.PassageScorer; +import org.elasticsearch.common.lucene.Lucene; +import org.elasticsearch.index.mapper.TextSearchInfo; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; + +public class MatchesFieldHighlighter { + + private final FieldHighlightContext context; + private final Matches matches; + private final Analyzer analyzer; + private final String field; + + public MatchesFieldHighlighter(FieldHighlightContext context, MatchesHighlighterState state) throws IOException { + this.context = context; + // TODO term vectors and require_field_match=false should intercept things here + this.matches = state.getMatches(context.query, context.hitContext.docId()); + this.analyzer = context.context.getSearchExecutionContext().getIndexAnalyzer(s -> Lucene.STANDARD_ANALYZER); + this.field = context.fieldType.name(); + } + + public MatchesIterator getMatchesIterator() throws IOException { + if (this.matches == null) { + return null; + } + MatchesIterator it = this.matches.getMatches(field); + if (it == null || context.field.fieldOptions().maxAnalyzedOffset() == null) { + return it; + } + int positionCutOff = context.field.fieldOptions().maxAnalyzedOffset() / 5; + return new FilterMatchesIterator(it) { + @Override + public boolean next() throws IOException { + if (it.next() == false) { + return false; + } + return it.startPosition() <= positionCutOff; + } + }; + } + + public List buildHighlights(MatchesIterator it, List sourceValues) throws IOException { + String contiguousSourceText = buildContiguousSourceText(sourceValues); + OffsetsRetrievalStrategy offsetsStrategy = getOffsetStrategy(); + List matchRanges = offsetsStrategy.get(it, f -> sourceValues); + List sourceRanges = computeValueRanges(sourceValues); + XPassageSelector passageSelector = new XPassageSelector(); // TODO word break stuff goes here + PassageFormatter formatter = new PassageFormatter( + "...", + context.field.fieldOptions().preTags()[0], + context.field.fieldOptions().postTags()[0] + ); // TODO multiple field markers a la FVH + List passages = passageSelector.pickBest( + contiguousSourceText, + matchRanges, + context.field.fieldOptions().fragmentCharSize(), + context.field.fieldOptions().numberOfFragments(), + sourceRanges + ); + return formatter.format(contiguousSourceText, passages, sourceRanges); + } + + private OffsetsRetrievalStrategy getOffsetStrategy() { + TextSearchInfo tsi = context.fieldType.getTextSearchInfo(); + // TODO termvectors + return switch (tsi.luceneFieldType().indexOptions()) { + case DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS -> new XOffsetsFromMatchIterator( + field, + new XOffsetsFromPositions(field, analyzer) + ); + case DOCS_AND_FREQS_AND_POSITIONS -> new XOffsetsFromPositions(field, analyzer); + case DOCS_AND_FREQS, DOCS -> + // By default retrieve offsets from individual tokens + // retrieved by the analyzer (possibly narrowed down to + // only those terms that the query hinted at when passed + // a QueryVisitor. + // + // Alternative strategies are also possible and may make sense + // depending on the use case (OffsetsFromValues, for example). + new OffsetsFromTokens(field, analyzer); + case NONE -> (matchesIterator, doc) -> { + throw new IOException( + "Field is indexed without positions and/or offsets: " + + field + + ", " + + tsi.luceneFieldType().indexOptions()); + }; + }; + } + + private String buildContiguousSourceText(List values) { + String value; + if (values.size() == 1) { + value = values.get(0).toString(); + } else { + // TODO: This can be inefficient if offset gap is large but the logic + // of applying offsets would get much more complicated so leaving for now + // (would have to recalculate all offsets to omit gaps). + String fieldGapPadding = " ".repeat(analyzer.getOffsetGap(field)); + value = String.join(fieldGapPadding, values); + } + return value; + } + + private List computeValueRanges(List values) { + ArrayList valueRanges = new ArrayList<>(); + int offset = 0; + for (CharSequence v : values) { + valueRanges.add(new OffsetRange(offset, offset + v.length())); + offset += v.length(); + offset += analyzer.getOffsetGap(field); + } + return valueRanges; + } +} diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighter.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighter.java new file mode 100644 index 0000000000000..b6f35e4dd50a8 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighter.java @@ -0,0 +1,51 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.search.fetch.subphase.highlight; + +import org.apache.lucene.search.MatchesIterator; +import org.elasticsearch.index.mapper.MappedFieldType; + +import java.io.IOException; +import java.util.List; + +public class MatchesHighlighter implements Highlighter { + + private static final String MATCHES_HIGHLIGHTER_CONFIG_KEY = "matches_highlighter_config_key"; + + @Override + public boolean canHighlight(MappedFieldType fieldType) { + return true; + } + + @Override + public HighlightField highlight(FieldHighlightContext fieldContext) throws IOException { + + MatchesHighlighterState state = (MatchesHighlighterState) fieldContext.cache.computeIfAbsent( + MATCHES_HIGHLIGHTER_CONFIG_KEY, + k -> new MatchesHighlighterState(fieldContext) + ); + + MatchesFieldHighlighter fieldHighlighter = new MatchesFieldHighlighter(fieldContext, state); + + MatchesIterator it = fieldHighlighter.getMatchesIterator(); + if (it == null) { + return null; + } + + List sourceValues = HighlightUtils.loadFieldValues( + fieldContext.fieldType, + fieldContext.context.getSearchExecutionContext(), + fieldContext.hitContext, + fieldContext.forceSource + ).stream().map(v -> (CharSequence) v.toString()).toList(); + + List highlights = fieldHighlighter.buildHighlights(it, sourceValues); + return new HighlightField(fieldContext.fieldName, highlights); + } +} diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterState.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterState.java new file mode 100644 index 0000000000000..390638f374086 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterState.java @@ -0,0 +1,83 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.search.fetch.subphase.highlight; + +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Matches; +import org.apache.lucene.search.MatchesIterator; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreMode; +import org.apache.lucene.search.Weight; + +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + +public class MatchesHighlighterState { + + private static final Matches NO_MATCHES = new Matches() { + @Override + public MatchesIterator getMatches(String field) { + return null; + } + + @Override + public Collection getSubMatches() { + return Collections.emptyList(); + } + + @Override + public Iterator iterator() { + return Collections.emptyIterator(); + } + }; + + private final FieldHighlightContext context; + private final IndexSearcher searcher; + private final Map weightCache = new HashMap<>(); + private final Map matchesCache = new HashMap<>(); + + private int currentDoc = -1; + + public MatchesHighlighterState(FieldHighlightContext context) { + this.context = context; + this.searcher = context.context.searcher(); + } + + public Matches getMatches(Query query, int doc) throws IOException { + if (currentDoc != doc) { + matchesCache.clear(); + currentDoc = doc; + } + Weight w = weightCache.get(query); + if (w == null) { + w = searcher.createWeight(searcher.rewrite(query), ScoreMode.COMPLETE_NO_SCORES, 1); + weightCache.put(query, w); + } + Matches m = matchesCache.get(query); + if (m == null) { + m = w.matches(context.hitContext.readerContext(), doc); + if (m == null) { + m = NO_MATCHES; + } + matchesCache.put(query, m); + } + if (m == NO_MATCHES) { + return null; + } + return m; + } + + public MatchesFieldHighlighter getMatchesFieldHighlighter(FieldHighlightContext fieldContext) throws IOException { + return new MatchesFieldHighlighter(fieldContext, this); + } +} diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromMatchIterator.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromMatchIterator.java new file mode 100644 index 0000000000000..030cf275e5f40 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromMatchIterator.java @@ -0,0 +1,70 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.search.fetch.subphase.highlight; + + +import org.apache.lucene.search.MatchesIterator; +import org.apache.lucene.search.matchhighlight.MatchRegionRetriever; +import org.apache.lucene.search.matchhighlight.OffsetRange; +import org.apache.lucene.search.matchhighlight.OffsetsFromPositions; +import org.apache.lucene.search.matchhighlight.OffsetsRetrievalStrategy; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +/** + * This strategy retrieves offsets directly from {@link MatchesIterator}, if they are available, + * otherwise it falls back to using {@link OffsetsFromPositions}. + */ +// https://github.com/apache/lucene/pull/11983 +public final class XOffsetsFromMatchIterator implements OffsetsRetrievalStrategy { + private final String field; + private final XOffsetsFromPositions noOffsetsFallback; + + XOffsetsFromMatchIterator(String field, XOffsetsFromPositions noOffsetsFallback) { + this.field = field; + this.noOffsetsFallback = Objects.requireNonNull(noOffsetsFallback); + } + + @Override + public List get( + MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc) + throws IOException { + ArrayList positionRanges = new ArrayList<>(); + ArrayList offsetRanges = new ArrayList<>(); + while (matchesIterator.next()) { + int fromPosition = matchesIterator.startPosition(); + int toPosition = matchesIterator.endPosition(); + if (fromPosition < 0 || toPosition < 0) { + throw new IOException("Matches API returned negative positions for field: " + field); + } + positionRanges.add(new OffsetRange(fromPosition, toPosition)); + + if (offsetRanges != null) { + int from = matchesIterator.startOffset(); + int to = matchesIterator.endOffset(); + if (from < 0 || to < 0) { + // At least one offset isn't available. Fallback to just positions. + offsetRanges = null; + } else { + offsetRanges.add(new OffsetRange(from, to)); + } + } + } + + // Use the fallback conversion from positions if not all offsets were available. + if (offsetRanges == null) { + return noOffsetsFallback.convertPositionsToOffsets(positionRanges, doc.getValues(field)); + } else { + return offsetRanges; + } + } +} diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromPositions.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromPositions.java new file mode 100644 index 0000000000000..b1abec70e4afa --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromPositions.java @@ -0,0 +1,149 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.search.fetch.subphase.highlight; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.search.MatchesIterator; +import org.apache.lucene.search.matchhighlight.MatchRegionRetriever; +import org.apache.lucene.search.matchhighlight.OffsetRange; +import org.apache.lucene.search.matchhighlight.OffsetsRetrievalStrategy; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +// https://github.com/apache/lucene/pull/11983 +public final class XOffsetsFromPositions implements OffsetsRetrievalStrategy { + private final String field; + private final Analyzer analyzer; + + public XOffsetsFromPositions(String field, Analyzer analyzer) { + this.field = field; + this.analyzer = analyzer; + } + + @Override + public List get( + MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc) + throws IOException { + ArrayList positionRanges = new ArrayList<>(); + while (matchesIterator.next()) { + int from = matchesIterator.startPosition(); + int to = matchesIterator.endPosition(); + if (from < 0 || to < 0) { + throw new IOException("Matches API returned negative positions for field: " + field); + } + positionRanges.add(new OffsetRange(from, to)); + } + + // Convert from positions to offsets. + return convertPositionsToOffsets(positionRanges, doc.getValues(field)); + } + + List convertPositionsToOffsets( + ArrayList positionRanges, List values) throws IOException { + + if (positionRanges.isEmpty()) { + return positionRanges; + } + + class PositionSpan extends OffsetRange { + int leftOffset = Integer.MAX_VALUE; + int rightOffset = Integer.MIN_VALUE; + + PositionSpan(int from, int to) { + super(from, to); + } + + @Override + public String toString() { + return "[from=" + from + ", to=" + to + ", L: " + leftOffset + ", R: " + rightOffset + ']'; + } + } + + List spans = new ArrayList<>(); + int minPosition = Integer.MAX_VALUE; + int maxPosition = Integer.MIN_VALUE; + for (OffsetRange range : positionRanges) { + spans.add(new PositionSpan(range.from, range.to)); + minPosition = Math.min(minPosition, range.from); + maxPosition = Math.max(maxPosition, range.to); + } + + PositionSpan[] spansTable = spans.toArray(PositionSpan[]::new); + int spanCount = spansTable.length; + int position = -1; + int valueOffset = 0; + int convertedMatchCount = 0; + for (int valueIndex = 0, max = values.size(); valueIndex < max; valueIndex++) { + final String value = values.get(valueIndex).toString(); + final boolean lastValue = valueIndex + 1 == max; + + TokenStream ts = analyzer.tokenStream(field, value); + OffsetAttribute offsetAttr = ts.getAttribute(OffsetAttribute.class); + PositionIncrementAttribute posAttr = ts.getAttribute(PositionIncrementAttribute.class); + ts.reset(); + while (ts.incrementToken()) { + position += posAttr.getPositionIncrement(); + + if (position >= minPosition) { + // Correct left and right offsets for each span this position applies to. + int startOffset = valueOffset + offsetAttr.startOffset(); + int endOffset = valueOffset + offsetAttr.endOffset(); + + int j = 0; + for (int i = 0; i < spanCount; i++) { + PositionSpan span = spansTable[j] = spansTable[i]; + if (position >= span.from) { + if (position <= span.to) { + span.leftOffset = Math.min(span.leftOffset, startOffset); + span.rightOffset = Math.max(span.rightOffset, endOffset); + convertedMatchCount++; + } else { + // this span can't intersect with any following position + // so omit it by skipping j++. + continue; + } + } + j++; + } + spanCount = j; + + // Only short-circuit if we're on the last value (which should be the common + // case since most fields would only have a single value anyway). We need + // to make sure of this because otherwise offsetAttr would have incorrect value. + if (position > maxPosition && lastValue) { + break; + } + } + } + ts.end(); + position += posAttr.getPositionIncrement() + analyzer.getPositionIncrementGap(field); + valueOffset += offsetAttr.endOffset() + analyzer.getOffsetGap(field); + ts.close(); + } + + if (convertedMatchCount == 0) { + return Collections.emptyList(); + } + spans = spans.subList(0, convertedMatchCount); // remove any matches beyond the max analyzed offset limit + ArrayList converted = new ArrayList<>(spans.size()); + for (PositionSpan span : spans) { + if (span.leftOffset == Integer.MAX_VALUE || span.rightOffset == Integer.MIN_VALUE) { + throw new RuntimeException("One of the offsets missing for position range: " + span); + } + converted.add(new OffsetRange(span.leftOffset, span.rightOffset)); + } + return converted; + } +} diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XPassageSelector.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XPassageSelector.java new file mode 100644 index 0000000000000..49d8765a4eb5f --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XPassageSelector.java @@ -0,0 +1,313 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.search.fetch.subphase.highlight; + +import org.apache.lucene.search.matchhighlight.OffsetRange; +import org.apache.lucene.search.matchhighlight.Passage; +import org.apache.lucene.search.matchhighlight.PassageAdjuster; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.PriorityQueue; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.RandomAccess; + +public class XPassageSelector { + public static final Comparator DEFAULT_SCORER = + (a, b) -> { + // Compare the number of highlights first. + int v; + v = Integer.compare(a.markers.size(), b.markers.size()); + if (v != 0) { + return v; + } + + // Total number of characters covered by the highlights. + int len1 = 0, len2 = 0; + for (OffsetRange o : a.markers) { + len1 += o.length(); + } + for (OffsetRange o : b.markers) { + len2 += o.length(); + } + if (len1 != len2) { + return Integer.compare(len1, len2); + } + + return Integer.compare(b.from, a.from); + }; + + private final Comparator passageScorer; + private final PassageAdjuster passageAdjuster; + + public XPassageSelector() { + this(DEFAULT_SCORER, null); + } + + public XPassageSelector(Comparator passageScorer, PassageAdjuster passageAdjuster) { + this.passageScorer = passageScorer; + this.passageAdjuster = passageAdjuster; + } + + public List pickBest( + CharSequence value, + List markers, + int maxPassageWindow, + int maxPassages) { + return pickBest( + value, markers, maxPassageWindow, maxPassages, List.of(new OffsetRange(0, value.length()))); + } + + public List pickBest( + CharSequence value, + List markers, + int maxPassageWindow, + int maxPassages, + List permittedPassageRanges) { + assert markers instanceof RandomAccess; + assert permittedPassageRanges instanceof RandomAccess; + assert sortedAndNonOverlapping(permittedPassageRanges); + + // Handle odd special cases early. + if (value.length() == 0 || maxPassageWindow == 0) { + return Collections.emptyList(); + } + + // Best passages so far. + PriorityQueue pq = + new PriorityQueue<>(markers.size()) { + @Override + protected boolean lessThan(Passage a, Passage b) { + return passageScorer.compare(a, b) < 0; + } + }; + + markers = splitOrTruncateToWindows(markers, maxPassageWindow, permittedPassageRanges); + + // Sort markers by their start offset, shortest first. + markers.sort(Comparator.comparingInt(r -> r.from).thenComparingInt(r -> r.to)); + + final int max = markers.size(); + int markerIndex = 0; + nextRange: + for (OffsetRange range : permittedPassageRanges) { + final int rangeTo = Math.min(range.to, value.length()); + + // Skip ranges outside of the value window anyway. + if (range.from >= rangeTo) { + continue; + } + + while (markerIndex < max) { + OffsetRange m = markers.get(markerIndex); + + // Markers are sorted so if the current marker's start is past the range, + // we can advance, but we need to check the same marker against the new range. + if (m.from >= rangeTo) { + continue nextRange; + } + + // Check if current marker falls within the range and is smaller than the largest allowed + // passage window. + if (m.from >= range.from && m.to <= rangeTo && m.length() <= maxPassageWindow) { + + // Adjust the window range to center the highlight marker. + int from = Math.toIntExact(((long) m.from + m.to - maxPassageWindow) / 2); + int to = Math.toIntExact(((long) m.from + m.to + maxPassageWindow) / 2); + if (from < range.from) { + to += range.from - from; + from = range.from; + } + if (to > rangeTo) { + from -= to - rangeTo; + to = rangeTo; + if (from < range.from) { + from = range.from; + } + } + + if (from < to && to <= value.length()) { + // Find other markers that are completely inside the passage window. + ArrayList inside = new ArrayList<>(); + int i = markerIndex; + while (i > 0 && markers.get(i - 1).from >= from) { + i--; + } + + OffsetRange c; + for (; i < max && (c = markers.get(i)).from < to; i++) { + if (c.to <= to) { + inside.add(c); + } + } + + if (!inside.isEmpty()) { + pq.insertWithOverflow(new Passage(from, to, inside)); + } + } + } + + // Advance to the next marker. + markerIndex++; + } + } + + // Collect from the priority queue (reverse the order so that highest-scoring are first). + Passage[] passages; + if (pq.size() > 0) { + passages = new Passage[pq.size()]; + for (int i = pq.size(); --i >= 0; ) { + passages[i] = pq.pop(); + } + } else { + // Handle the default, no highlighting markers case. + passages = pickDefaultPassage(value, maxPassageWindow, maxPassages, permittedPassageRanges); + } + + // Correct passage boundaries from maxExclusive window. Typically shrink boundaries until we're + // on a proper word/sentence boundary. + if (passageAdjuster != null) { + passageAdjuster.currentValue(value); + for (int x = 0; x < passages.length; x++) { + Passage p = passages[x]; + OffsetRange newRange = passageAdjuster.adjust(p); + if (newRange.from != p.from || newRange.to != p.to) { + assert newRange.from >= p.from && newRange.to <= p.to + : "Adjusters must not expand the passage's range: was " + + p + + " => changed to " + + newRange; + passages[x] = new Passage(newRange.from, newRange.to, p.markers); + } + } + } + + // Ensure there are no overlaps on passages. In case of conflicts, better score wins. + int last = 0; + for (int i = 0; i < passages.length; i++) { + Passage a = passages[i]; + if (a != null && a.length() > 0) { + passages[last++] = a; + for (int j = i + 1; j < passages.length; j++) { + Passage b = passages[j]; + if (b != null) { + if (adjecentOrOverlapping(a, b)) { + passages[j] = null; + } + } + } + } + } + + // Remove nullified slots. + last = Math.min(last, maxPassages); + if (passages.length != last) { + passages = ArrayUtil.copyOfSubArray(passages, 0, last); + } + + // Sort in the offset order again. + Arrays.sort(passages, Comparator.comparingInt(a -> a.from)); + + return Arrays.asList(passages); + } + + /** Truncate or split highlight markers that cross permitted value boundaries. */ + private List splitOrTruncateToWindows( + List markers, + int maxPassageWindow, + List permittedPassageRanges) { + // Process markers overlapping with each permitted window. + ArrayList processedMarkers = new ArrayList<>(markers.size()); + for (OffsetRange marker : markers) { + for (OffsetRange permitted : permittedPassageRanges) { + boolean newSlice = false; + + int from = marker.from; + if (from < permitted.from) { + from = permitted.from; + newSlice = true; + } + + int to = marker.to; + if (to > permitted.to) { + to = permitted.to; + newSlice = true; + } + + if (from >= to) { + continue; + } + + if (to - from > maxPassageWindow) { + to = from + maxPassageWindow; + newSlice = true; + } + + processedMarkers.add(newSlice ? marker.slice(from, to) : marker); + } + } + markers = processedMarkers; + return markers; + } + + static boolean sortedAndNonOverlapping(List permittedPassageRanges) { + if (permittedPassageRanges.size() > 1) { + Iterator i = permittedPassageRanges.iterator(); + for (OffsetRange next, previous = i.next(); i.hasNext(); previous = next) { + next = i.next(); + if (previous.to > next.from) { + throw new AssertionError( + "Ranges must be sorted and non-overlapping: " + permittedPassageRanges); + } + } + } + + return true; + } + + /** + * Invoked when no passages could be selected (due to constraints or lack of highlight markers). + */ + protected Passage[] pickDefaultPassage( + CharSequence value, + int maxCharacterWindow, + int maxPassages, + List permittedPassageRanges) { + // Search for the first range that is not empty. + ArrayList defaultPassages = new ArrayList<>(); + for (OffsetRange o : permittedPassageRanges) { + if (defaultPassages.size() >= maxPassages) { + break; + } + + int to = Math.min(value.length(), o.to); + if (o.from < to) { + defaultPassages.add( + new Passage( + o.from, + o.from + Math.min(maxCharacterWindow, o.length()), + Collections.emptyList())); + } + } + + return defaultPassages.toArray(Passage[]::new); + } + + private static boolean adjecentOrOverlapping(Passage a, Passage b) { + if (a.from >= b.from) { + return a.from <= b.to - 1; + } else { + return a.to - 1 >= b.from; + } + } +} diff --git a/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightFieldTests.java b/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightFieldTests.java index f174ae9180522..bcede2a2c3cea 100644 --- a/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightFieldTests.java +++ b/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightFieldTests.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.util.Arrays; +import java.util.Collections; import static org.elasticsearch.test.EqualsHashCodeTestUtils.checkEqualsAndHashCode; @@ -80,7 +81,7 @@ public void testToXContent() throws IOException { ] }""", Strings.toString(builder)); - field = new HighlightField("foo", null); + field = new HighlightField("foo", Collections.emptyList()); builder = JsonXContent.contentBuilder(); builder.prettyPrint(); builder.startObject(); diff --git a/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterTests.java b/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterTests.java new file mode 100644 index 0000000000000..dcc2e78bbdb1b --- /dev/null +++ b/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterTests.java @@ -0,0 +1,111 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.search.fetch.subphase.highlight; + +import org.elasticsearch.index.mapper.MapperService; +import org.elasticsearch.index.mapper.ParsedDocument; +import org.elasticsearch.index.query.QueryBuilders; +import org.elasticsearch.search.builder.SearchSourceBuilder; +import org.elasticsearch.search.fetch.HighlighterTestCase; + +import java.io.IOException; +import java.util.Map; + +public class MatchesHighlighterTests extends HighlighterTestCase { + + public void testSimpleTermHighlighting() throws IOException { + + MapperService mapperService = createMapperService(""" + { "_doc" : { "properties" : { + "field" : { "type" : "text" } + }}} + """); + + ParsedDocument doc = mapperService.documentMapper().parse(source(""" + { "field" : "this is some text" } + """)); + + SearchSourceBuilder search = new SearchSourceBuilder().query(QueryBuilders.termQuery("field", "some")) + .highlighter(new HighlightBuilder().field("field").highlighterType("matches")); + + Map highlights = highlight(mapperService, doc, search); + assertHighlights(highlights, "field", "this is some text"); + } + + public void testScoring() throws Exception { + + MapperService mapperService = createMapperService(""" + { "_doc" : { "properties" : { + "description" : { "type" : "text" } + }}} + """); + + ParsedDocument doc = mapperService.documentMapper().parse(source(""" + { + "description": [ + "Lorem Ipsum string Generator that helps to create dummy text for all layout needs.", + "It has roots in a string of classical Latin literature from 45 BC, making it search string over 2000 years old." + ] + } + """)); + + HighlightBuilder highlight = new HighlightBuilder() + .field("description") + .highlighterType("matches") + .numOfFragments(2) + .fragmentSize(50); + SearchSourceBuilder search = new SearchSourceBuilder().query(QueryBuilders.matchQuery("description", "search string")) + .highlighter(highlight); + + Map highlights = highlight(mapperService, doc, search); + assertHighlights( + highlights, + "description", + "Lorem Ipsum string Generator that helps to create ...", + "...from 45 BC, making it search string over 2000 year..." + ); + } + + public void testAnalyzedOffsetLimit() throws IOException { + MapperService mapperService = createMapperService(""" + { "_doc" : { "properties" : { + "description" : { "type" : "text" } + }}} + """); + + ParsedDocument doc = mapperService.documentMapper().parse(source(""" + { + "description": [ + "Lorem Ipsum string Generator that helps to create dummy text for all layout needs.", + "It has roots in a string of classical Latin literature from 45 BC, making it search string over 2000 years old." + ] + } + """)); + + HighlightBuilder highlight = new HighlightBuilder() + .field("description") + .highlighterType("matches") + .maxAnalyzedOffset(70) + .numOfFragments(2) + .fragmentSize(50); + SearchSourceBuilder search = new SearchSourceBuilder().query(QueryBuilders.matchQuery("description", "search string")) + .highlighter(highlight); + + Map highlights = highlight(mapperService, doc, search); + assertHighlights( + highlights, + "description", + "Lorem Ipsum string Generator that helps to create ..." + ); + } + + // multiple fields + // analyzed offset limit + // matched_fields - use matches from a set of different fields to highlight this one +} diff --git a/test/framework/src/main/java/org/elasticsearch/search/fetch/HighlighterTestCase.java b/test/framework/src/main/java/org/elasticsearch/search/fetch/HighlighterTestCase.java index 11bbdbc9b1013..d7697344dcf47 100644 --- a/test/framework/src/main/java/org/elasticsearch/search/fetch/HighlighterTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/search/fetch/HighlighterTestCase.java @@ -9,7 +9,11 @@ package org.elasticsearch.search.fetch; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.QueryCachingPolicy; +import org.apache.lucene.search.similarities.BM25Similarity; +import org.apache.lucene.search.similarities.Similarity; import org.elasticsearch.common.text.Text; +import org.elasticsearch.index.cache.query.TrivialQueryCachingPolicy; import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.MapperServiceTestCase; import org.elasticsearch.index.mapper.ParsedDocument; @@ -21,8 +25,10 @@ import org.elasticsearch.search.fetch.subphase.highlight.HighlightField; import org.elasticsearch.search.fetch.subphase.highlight.HighlightPhase; import org.elasticsearch.search.fetch.subphase.highlight.Highlighter; +import org.elasticsearch.search.fetch.subphase.highlight.MatchesHighlighter; import org.elasticsearch.search.fetch.subphase.highlight.PlainHighlighter; import org.elasticsearch.search.fetch.subphase.highlight.UnifiedHighlighter; +import org.elasticsearch.search.internal.ContextIndexSearcher; import org.elasticsearch.search.lookup.Source; import java.io.IOException; @@ -44,7 +50,9 @@ protected Map getHighlighters() { "fvh", new FastVectorHighlighter(getIndexSettings()), "plain", - new PlainHighlighter() + new PlainHighlighter(), + "matches", + new MatchesHighlighter() ); } @@ -91,6 +99,8 @@ private static FetchContext fetchContext(SearchExecutionContext context, SearchS when(fetchContext.parsedQuery()).thenReturn(new ParsedQuery(search.query().toQuery(context))); when(fetchContext.getSearchExecutionContext()).thenReturn(context); when(fetchContext.sourceLoader()).thenReturn(context.newSourceLoader(false)); + ContextIndexSearcher cis = new ContextIndexSearcher(context.searcher().getIndexReader(), new BM25Similarity(), null, TrivialQueryCachingPolicy.NEVER, false); + when(fetchContext.searcher()).thenReturn(cis); return fetchContext; } } From 3e402323a3d24dbb75851db68e3ba32510aa7f10 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Fri, 2 Dec 2022 14:26:00 +0000 Subject: [PATCH 2/6] more tests; javadoc; tidy up --- .../highlight/HighlighterSearchIT.java | 6 +- .../subphase/highlight/HighlightPhase.java | 5 +- .../subphase/highlight/HighlightUtils.java | 7 +- .../highlight/MatchesFieldHighlighter.java | 85 +++++++++++-------- .../highlight/MatchesHighlighter.java | 5 +- .../highlight/MatchesHighlighterState.java | 30 ++++--- .../highlight/MatchesHighlighterTests.java | 59 ++++++++++++- 7 files changed, 137 insertions(+), 60 deletions(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java b/server/src/internalClusterTest/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java index c5fdbc5a3973e..7dbb0379f3534 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java @@ -64,6 +64,7 @@ import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Objects; import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; @@ -106,7 +107,7 @@ public class HighlighterSearchIT extends ESIntegTestCase { // TODO as we move analyzers out of the core we need to move some of these into HighlighterWithAnalyzersTests - private static final String[] ALL_TYPES = new String[] { "plain", "fvh", "unified" }; + private static final String[] ALL_TYPES = new String[] { "plain", "fvh", "unified", "matches" }; @Override protected Collection> nodePlugins() { @@ -3508,6 +3509,9 @@ public void testWithNestedQuery() throws Exception { // but we highlight the root text field since nested documents cannot be highlighted with postings nor term vectors // directly. for (String type : ALL_TYPES) { + if (Objects.equals("matches", type)) { + continue; // matches highlighter doesn't support nested fields + } SearchResponse searchResponse = client().prepareSearch() .setQuery(nestedQuery("foo", prefixQuery("foo.text", "bro"), ScoreMode.None)) .highlighter(new HighlightBuilder().field(new Field("text").highlighterType(type).requireFieldMatch(false))) diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightPhase.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightPhase.java index b9e0200094a34..91846c4b609ef 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightPhase.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightPhase.java @@ -160,10 +160,7 @@ private FieldContext contextBuilders( ) ); } - // TODO in future we can load the storedFields in advance here and make use of them, - // but for now they are loaded separately in HighlightUtils so we only return whether - // or not we need source. - storedFieldsSpec = storedFieldsSpec.merge(new StoredFieldsSpec(sourceRequired, false, Set.of())); + storedFieldsSpec = storedFieldsSpec.merge(new StoredFieldsSpec(sourceRequired, false, storedFields)); } return new FieldContext(storedFieldsSpec, builders); } diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightUtils.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightUtils.java index 1bf127df3d403..aec6d531e3cc0 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightUtils.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightUtils.java @@ -43,11 +43,8 @@ public static List loadFieldValues( FetchSubPhase.HitContext hitContext, boolean forceSource ) throws IOException { - if (forceSource == false && fieldType.isStored()) { - CustomFieldsVisitor fieldVisitor = new CustomFieldsVisitor(singleton(fieldType.name()), false); - hitContext.reader().document(hitContext.docId(), fieldVisitor); - List textsToHighlight = fieldVisitor.fields().get(fieldType.name()); - return Objects.requireNonNullElse(textsToHighlight, Collections.emptyList()); + if (forceSource == false && hitContext.loadedFields().containsKey(fieldType.name())) { + return hitContext.loadedFields().get(fieldType.name()); } ValueFetcher fetcher = fieldType.valueFetcher(searchContext, null); fetcher.setNextReader(hitContext.readerContext()); diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesFieldHighlighter.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesFieldHighlighter.java index f19fa62909012..7c1ab54d29b1e 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesFieldHighlighter.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesFieldHighlighter.java @@ -9,61 +9,68 @@ package org.elasticsearch.search.fetch.subphase.highlight; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.AnalyzerWrapper; import org.apache.lucene.search.FilterMatchesIterator; import org.apache.lucene.search.Matches; import org.apache.lucene.search.MatchesIterator; -import org.apache.lucene.search.highlight.OffsetLimitTokenFilter; +import org.apache.lucene.search.MatchesUtils; +import org.apache.lucene.search.matchhighlight.MatchRegionRetriever; import org.apache.lucene.search.matchhighlight.OffsetRange; import org.apache.lucene.search.matchhighlight.OffsetsFromTokens; import org.apache.lucene.search.matchhighlight.OffsetsRetrievalStrategy; import org.apache.lucene.search.matchhighlight.Passage; import org.apache.lucene.search.matchhighlight.PassageFormatter; -import org.apache.lucene.search.matchhighlight.PassageSelector; -import org.apache.lucene.search.uhighlight.PassageScorer; import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.index.mapper.TextSearchInfo; import java.io.IOException; import java.util.ArrayList; -import java.util.Comparator; import java.util.List; +import java.util.Set; -public class MatchesFieldHighlighter { +/** + * Highlights individual fields using components from lucene's match highlighter + */ +class MatchesFieldHighlighter { private final FieldHighlightContext context; private final Matches matches; private final Analyzer analyzer; private final String field; - public MatchesFieldHighlighter(FieldHighlightContext context, MatchesHighlighterState state) throws IOException { + MatchesFieldHighlighter(FieldHighlightContext context, MatchesHighlighterState state) throws IOException { this.context = context; // TODO term vectors and require_field_match=false should intercept things here - this.matches = state.getMatches(context.query, context.hitContext.docId()); + this.matches = state.getMatches(context.query, context.hitContext.readerContext(), context.hitContext.docId()); this.analyzer = context.context.getSearchExecutionContext().getIndexAnalyzer(s -> Lucene.STANDARD_ANALYZER); this.field = context.fieldType.name(); } - public MatchesIterator getMatchesIterator() throws IOException { + /** + * @return a MatchesIterator for this field, based on the field highlighter configuration + */ + MatchesIterator getMatchesIterator() throws IOException { if (this.matches == null) { return null; } - MatchesIterator it = this.matches.getMatches(field); - if (it == null || context.field.fieldOptions().maxAnalyzedOffset() == null) { - return it; + + Set matchFields = context.field.fieldOptions().matchedFields(); + if (matchFields == null || matchFields.isEmpty()) { + matchFields = Set.of(field); } - int positionCutOff = context.field.fieldOptions().maxAnalyzedOffset() / 5; - return new FilterMatchesIterator(it) { - @Override - public boolean next() throws IOException { - if (it.next() == false) { - return false; - } - return it.startPosition() <= positionCutOff; + + List fieldIterators = new ArrayList<>(); + for (String field : matchFields) { + MatchesIterator it = this.matches.getMatches(field); + if (it != null) { + fieldIterators.add(it); } - }; + } + return MatchesUtils.disjunction(fieldIterators); } + /** + * Uses a MatchesIterator to highlight a list of source inputs + */ public List buildHighlights(MatchesIterator it, List sourceValues) throws IOException { String contiguousSourceText = buildContiguousSourceText(sourceValues); OffsetsRetrievalStrategy offsetsStrategy = getOffsetStrategy(); @@ -93,23 +100,31 @@ private OffsetsRetrievalStrategy getOffsetStrategy() { field, new XOffsetsFromPositions(field, analyzer) ); - case DOCS_AND_FREQS_AND_POSITIONS -> new XOffsetsFromPositions(field, analyzer); + case DOCS_AND_FREQS_AND_POSITIONS -> limitOffsets(new XOffsetsFromPositions(field, analyzer)); case DOCS_AND_FREQS, DOCS -> - // By default retrieve offsets from individual tokens - // retrieved by the analyzer (possibly narrowed down to - // only those terms that the query hinted at when passed - // a QueryVisitor. - // - // Alternative strategies are also possible and may make sense - // depending on the use case (OffsetsFromValues, for example). new OffsetsFromTokens(field, analyzer); - case NONE -> (matchesIterator, doc) -> { - throw new IOException( - "Field is indexed without positions and/or offsets: " - + field - + ", " - + tsi.luceneFieldType().indexOptions()); + // This should be unreachable because we won't get a MatchesIterator from an unindexed field + case NONE -> (matchesIterator, doc) -> { throw new IllegalStateException("Field [ " + field + "] is not indexed"); }; + }; + } + + // TODO might be more sensible to push this back into OffsetsFromPositions + private OffsetsRetrievalStrategy limitOffsets(OffsetsRetrievalStrategy in) { + if (context.field.fieldOptions().maxAnalyzedOffset() == null) { + return in; + } + return (matchesIterator, doc) -> { + int positionCutOff = context.field.fieldOptions().maxAnalyzedOffset() / 5; + MatchesIterator wrapped = new FilterMatchesIterator(matchesIterator) { + @Override + public boolean next() throws IOException { + if (matchesIterator.next() == false) { + return false; + } + return matchesIterator.startPosition() <= positionCutOff; + } }; + return in.get(wrapped, doc); }; } diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighter.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighter.java index b6f35e4dd50a8..a03ed7710fb69 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighter.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighter.java @@ -14,6 +14,9 @@ import java.io.IOException; import java.util.List; +/** + * A highlighter that uses the output of a query's Matches to highlight tokens + */ public class MatchesHighlighter implements Highlighter { private static final String MATCHES_HIGHLIGHTER_CONFIG_KEY = "matches_highlighter_config_key"; @@ -28,7 +31,7 @@ public HighlightField highlight(FieldHighlightContext fieldContext) throws IOExc MatchesHighlighterState state = (MatchesHighlighterState) fieldContext.cache.computeIfAbsent( MATCHES_HIGHLIGHTER_CONFIG_KEY, - k -> new MatchesHighlighterState(fieldContext) + k -> new MatchesHighlighterState(fieldContext.context.searcher().getIndexReader()) ); MatchesFieldHighlighter fieldHighlighter = new MatchesFieldHighlighter(fieldContext, state); diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterState.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterState.java index 390638f374086..3a5ddf6de69dd 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterState.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterState.java @@ -8,6 +8,8 @@ package org.elasticsearch.search.fetch.subphase.highlight; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Matches; import org.apache.lucene.search.MatchesIterator; @@ -22,7 +24,14 @@ import java.util.Iterator; import java.util.Map; -public class MatchesHighlighterState { +/** + * Shared state for the matches highlighter + * + * This holds two caches, one for the query's Weight which is global across all documents, + * and one for Matches for each query, which will be cached per document. This avoids having + * to regenerate the weight and matches for each field being highlighted. + */ +class MatchesHighlighterState { private static final Matches NO_MATCHES = new Matches() { @Override @@ -41,22 +50,23 @@ public Iterator iterator() { } }; - private final FieldHighlightContext context; private final IndexSearcher searcher; private final Map weightCache = new HashMap<>(); private final Map matchesCache = new HashMap<>(); private int currentDoc = -1; + private int currentLeafOrd = -1; - public MatchesHighlighterState(FieldHighlightContext context) { - this.context = context; - this.searcher = context.context.searcher(); + MatchesHighlighterState(IndexReader reader) { + this.searcher = new IndexSearcher(reader); + this.searcher.setQueryCache(null); // disable caching } - public Matches getMatches(Query query, int doc) throws IOException { - if (currentDoc != doc) { + Matches getMatches(Query query, LeafReaderContext ctx, int doc) throws IOException { + if (currentDoc != doc || currentLeafOrd != ctx.ord) { matchesCache.clear(); currentDoc = doc; + currentLeafOrd = ctx.ord; } Weight w = weightCache.get(query); if (w == null) { @@ -65,7 +75,7 @@ public Matches getMatches(Query query, int doc) throws IOException { } Matches m = matchesCache.get(query); if (m == null) { - m = w.matches(context.hitContext.readerContext(), doc); + m = w.matches(ctx, doc); if (m == null) { m = NO_MATCHES; } @@ -76,8 +86,4 @@ public Matches getMatches(Query query, int doc) throws IOException { } return m; } - - public MatchesFieldHighlighter getMatchesFieldHighlighter(FieldHighlightContext fieldContext) throws IOException { - return new MatchesFieldHighlighter(fieldContext, this); - } } diff --git a/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterTests.java b/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterTests.java index dcc2e78bbdb1b..838926171750f 100644 --- a/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterTests.java +++ b/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterTests.java @@ -38,6 +38,30 @@ public void testSimpleTermHighlighting() throws IOException { assertHighlights(highlights, "field", "this is some text"); } + public void testMultipleFieldHighlighting() throws IOException { + MapperService mapperService = createMapperService(""" + { "_doc" : { "properties" : { + "title" : { "type" : "text" }, + "description" : { "type" : "text" }, + "category" : { "type" : "keyword" } + }}} + """); + + ParsedDocument doc = mapperService.documentMapper().parse(source(""" + { "title" : "A tale of two cities", + "description" : "It's a story about two cities", + "category" : [ "fiction", "dickens" ] } + """)); + + SearchSourceBuilder search = new SearchSourceBuilder().query( + QueryBuilders.queryStringQuery("dickens OR cities").field("title").field("description").field("category")) + .highlighter(new HighlightBuilder().highlighterType("matches").field("title").field("category")); + + Map highlights = highlight(mapperService, doc, search); + assertHighlights(highlights, "title", "A tale of two cities"); + assertHighlights(highlights, "category", "dickens"); + } + public void testScoring() throws Exception { MapperService mapperService = createMapperService(""" @@ -105,7 +129,38 @@ public void testAnalyzedOffsetLimit() throws IOException { ); } - // multiple fields - // analyzed offset limit // matched_fields - use matches from a set of different fields to highlight this one + public void testMatchedFields() throws IOException { + + // note that this doesn't actually use a different analyzer for the subfield, + // given restrictions on analyzers in unit tests + MapperService mapperService = createMapperService(""" + { "_doc" : { "properties" : { + "description" : { + "type" : "text", + "fields" : { + "stemmed" : { "type" : "text" } + } + } + }}} + """); + + ParsedDocument doc = mapperService.documentMapper().parse(source(""" + { "description" : "Here is some text" } + """)); + + HighlightBuilder highlight = new HighlightBuilder() + .field(new HighlightBuilder.Field("description").matchedFields("description", "description.stemmed")) + .highlighterType("matches"); + SearchSourceBuilder search = new SearchSourceBuilder().query(QueryBuilders.termQuery("description.stemmed", "some")) + .highlighter(highlight); + + Map highlights = highlight(mapperService, doc, search); + assertHighlights( + highlights, + "description", + "Here is some text" + ); + + } } From e78e85e68510a57f6471b2d05ea26defcf0edf38 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Fri, 2 Dec 2022 14:28:18 +0000 Subject: [PATCH 3/6] spotless --- .../elasticsearch/search/SearchModule.java | 1 - .../subphase/highlight/HighlightUtils.java | 5 - .../highlight/MatchesFieldHighlighter.java | 6 +- .../highlight/XOffsetsFromMatchIterator.java | 25 ++-- .../highlight/XOffsetsFromPositions.java | 27 +++-- .../subphase/highlight/XPassageSelector.java | 114 +++++++++--------- .../highlight/MatchesHighlighterTests.java | 28 ++--- .../search/fetch/HighlighterTestCase.java | 10 +- 8 files changed, 106 insertions(+), 110 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/search/SearchModule.java b/server/src/main/java/org/elasticsearch/search/SearchModule.java index c8635db570e5d..95ab45e324186 100644 --- a/server/src/main/java/org/elasticsearch/search/SearchModule.java +++ b/server/src/main/java/org/elasticsearch/search/SearchModule.java @@ -8,7 +8,6 @@ package org.elasticsearch.search; -import org.apache.lucene.search.Matches; import org.elasticsearch.common.CheckedBiConsumer; import org.elasticsearch.common.NamedRegistry; import org.elasticsearch.common.io.stream.NamedWriteableRegistry; diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightUtils.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightUtils.java index aec6d531e3cc0..f8166077859cc 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightUtils.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightUtils.java @@ -10,7 +10,6 @@ import org.apache.lucene.search.highlight.DefaultEncoder; import org.apache.lucene.search.highlight.Encoder; import org.apache.lucene.search.highlight.SimpleHTMLEncoder; -import org.elasticsearch.index.fieldvisitor.CustomFieldsVisitor; import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.ValueFetcher; import org.elasticsearch.index.query.SearchExecutionContext; @@ -18,11 +17,7 @@ import java.io.IOException; import java.util.ArrayList; -import java.util.Collections; import java.util.List; -import java.util.Objects; - -import static java.util.Collections.singleton; public final class HighlightUtils { diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesFieldHighlighter.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesFieldHighlighter.java index 7c1ab54d29b1e..c80da441de3f1 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesFieldHighlighter.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesFieldHighlighter.java @@ -13,7 +13,6 @@ import org.apache.lucene.search.Matches; import org.apache.lucene.search.MatchesIterator; import org.apache.lucene.search.MatchesUtils; -import org.apache.lucene.search.matchhighlight.MatchRegionRetriever; import org.apache.lucene.search.matchhighlight.OffsetRange; import org.apache.lucene.search.matchhighlight.OffsetsFromTokens; import org.apache.lucene.search.matchhighlight.OffsetsRetrievalStrategy; @@ -82,7 +81,7 @@ public List buildHighlights(MatchesIterator it, List sourc context.field.fieldOptions().preTags()[0], context.field.fieldOptions().postTags()[0] ); // TODO multiple field markers a la FVH - List passages = passageSelector.pickBest( + List passages = passageSelector.pickBest( contiguousSourceText, matchRanges, context.field.fieldOptions().fragmentCharSize(), @@ -101,8 +100,7 @@ private OffsetsRetrievalStrategy getOffsetStrategy() { new XOffsetsFromPositions(field, analyzer) ); case DOCS_AND_FREQS_AND_POSITIONS -> limitOffsets(new XOffsetsFromPositions(field, analyzer)); - case DOCS_AND_FREQS, DOCS -> - new OffsetsFromTokens(field, analyzer); + case DOCS_AND_FREQS, DOCS -> new OffsetsFromTokens(field, analyzer); // This should be unreachable because we won't get a MatchesIterator from an unindexed field case NONE -> (matchesIterator, doc) -> { throw new IllegalStateException("Field [ " + field + "] is not indexed"); }; }; diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromMatchIterator.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromMatchIterator.java index 030cf275e5f40..ac70b2fb1a6ac 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromMatchIterator.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromMatchIterator.java @@ -1,14 +1,23 @@ /* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License - * 2.0 and the Server Side Public License, v 1; you may not use this file except - * in compliance with, at your election, the Elastic License 2.0 or the Server - * Side Public License, v 1. + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ package org.elasticsearch.search.fetch.subphase.highlight; - import org.apache.lucene.search.MatchesIterator; import org.apache.lucene.search.matchhighlight.MatchRegionRetriever; import org.apache.lucene.search.matchhighlight.OffsetRange; @@ -35,9 +44,7 @@ public final class XOffsetsFromMatchIterator implements OffsetsRetrievalStrategy } @Override - public List get( - MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc) - throws IOException { + public List get(MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc) throws IOException { ArrayList positionRanges = new ArrayList<>(); ArrayList offsetRanges = new ArrayList<>(); while (matchesIterator.next()) { diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromPositions.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromPositions.java index b1abec70e4afa..621f1fdc22863 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromPositions.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromPositions.java @@ -1,9 +1,19 @@ /* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License - * 2.0 and the Server Side Public License, v 1; you may not use this file except - * in compliance with, at your election, the Elastic License 2.0 or the Server - * Side Public License, v 1. + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ package org.elasticsearch.search.fetch.subphase.highlight; @@ -33,9 +43,7 @@ public XOffsetsFromPositions(String field, Analyzer analyzer) { } @Override - public List get( - MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc) - throws IOException { + public List get(MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc) throws IOException { ArrayList positionRanges = new ArrayList<>(); while (matchesIterator.next()) { int from = matchesIterator.startPosition(); @@ -50,8 +58,7 @@ public List get( return convertPositionsToOffsets(positionRanges, doc.getValues(field)); } - List convertPositionsToOffsets( - ArrayList positionRanges, List values) throws IOException { + List convertPositionsToOffsets(ArrayList positionRanges, List values) throws IOException { if (positionRanges.isEmpty()) { return positionRanges; diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XPassageSelector.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XPassageSelector.java index 49d8765a4eb5f..63b1417eb72bc 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XPassageSelector.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XPassageSelector.java @@ -1,11 +1,20 @@ /* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License - * 2.0 and the Server Side Public License, v 1; you may not use this file except - * in compliance with, at your election, the Elastic License 2.0 or the Server - * Side Public License, v 1. + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ - package org.elasticsearch.search.fetch.subphase.highlight; import org.apache.lucene.search.matchhighlight.OffsetRange; @@ -23,29 +32,28 @@ import java.util.RandomAccess; public class XPassageSelector { - public static final Comparator DEFAULT_SCORER = - (a, b) -> { - // Compare the number of highlights first. - int v; - v = Integer.compare(a.markers.size(), b.markers.size()); - if (v != 0) { - return v; - } + public static final Comparator DEFAULT_SCORER = (a, b) -> { + // Compare the number of highlights first. + int v; + v = Integer.compare(a.markers.size(), b.markers.size()); + if (v != 0) { + return v; + } - // Total number of characters covered by the highlights. - int len1 = 0, len2 = 0; - for (OffsetRange o : a.markers) { - len1 += o.length(); - } - for (OffsetRange o : b.markers) { - len2 += o.length(); - } - if (len1 != len2) { - return Integer.compare(len1, len2); - } + // Total number of characters covered by the highlights. + int len1 = 0, len2 = 0; + for (OffsetRange o : a.markers) { + len1 += o.length(); + } + for (OffsetRange o : b.markers) { + len2 += o.length(); + } + if (len1 != len2) { + return Integer.compare(len1, len2); + } - return Integer.compare(b.from, a.from); - }; + return Integer.compare(b.from, a.from); + }; private final Comparator passageScorer; private final PassageAdjuster passageAdjuster; @@ -59,13 +67,8 @@ public XPassageSelector(Comparator passageScorer, PassageAdjuster passa this.passageAdjuster = passageAdjuster; } - public List pickBest( - CharSequence value, - List markers, - int maxPassageWindow, - int maxPassages) { - return pickBest( - value, markers, maxPassageWindow, maxPassages, List.of(new OffsetRange(0, value.length()))); + public List pickBest(CharSequence value, List markers, int maxPassageWindow, int maxPassages) { + return pickBest(value, markers, maxPassageWindow, maxPassages, List.of(new OffsetRange(0, value.length()))); } public List pickBest( @@ -73,7 +76,8 @@ public List pickBest( List markers, int maxPassageWindow, int maxPassages, - List permittedPassageRanges) { + List permittedPassageRanges + ) { assert markers instanceof RandomAccess; assert permittedPassageRanges instanceof RandomAccess; assert sortedAndNonOverlapping(permittedPassageRanges); @@ -84,13 +88,12 @@ public List pickBest( } // Best passages so far. - PriorityQueue pq = - new PriorityQueue<>(markers.size()) { - @Override - protected boolean lessThan(Passage a, Passage b) { - return passageScorer.compare(a, b) < 0; - } - }; + PriorityQueue pq = new PriorityQueue<>(markers.size()) { + @Override + protected boolean lessThan(Passage a, Passage b) { + return passageScorer.compare(a, b) < 0; + } + }; markers = splitOrTruncateToWindows(markers, maxPassageWindow, permittedPassageRanges); @@ -99,8 +102,7 @@ protected boolean lessThan(Passage a, Passage b) { final int max = markers.size(); int markerIndex = 0; - nextRange: - for (OffsetRange range : permittedPassageRanges) { + nextRange: for (OffsetRange range : permittedPassageRanges) { final int rangeTo = Math.min(range.to, value.length()); // Skip ranges outside of the value window anyway. @@ -151,7 +153,7 @@ protected boolean lessThan(Passage a, Passage b) { } } - if (!inside.isEmpty()) { + if (inside.isEmpty() == false) { pq.insertWithOverflow(new Passage(from, to, inside)); } } @@ -166,7 +168,7 @@ protected boolean lessThan(Passage a, Passage b) { Passage[] passages; if (pq.size() > 0) { passages = new Passage[pq.size()]; - for (int i = pq.size(); --i >= 0; ) { + for (int i = pq.size(); --i >= 0;) { passages[i] = pq.pop(); } } else { @@ -183,10 +185,7 @@ protected boolean lessThan(Passage a, Passage b) { OffsetRange newRange = passageAdjuster.adjust(p); if (newRange.from != p.from || newRange.to != p.to) { assert newRange.from >= p.from && newRange.to <= p.to - : "Adjusters must not expand the passage's range: was " - + p - + " => changed to " - + newRange; + : "Adjusters must not expand the passage's range: was " + p + " => changed to " + newRange; passages[x] = new Passage(newRange.from, newRange.to, p.markers); } } @@ -225,7 +224,8 @@ protected boolean lessThan(Passage a, Passage b) { private List splitOrTruncateToWindows( List markers, int maxPassageWindow, - List permittedPassageRanges) { + List permittedPassageRanges + ) { // Process markers overlapping with each permitted window. ArrayList processedMarkers = new ArrayList<>(markers.size()); for (OffsetRange marker : markers) { @@ -266,8 +266,7 @@ static boolean sortedAndNonOverlapping(List permittedPass for (OffsetRange next, previous = i.next(); i.hasNext(); previous = next) { next = i.next(); if (previous.to > next.from) { - throw new AssertionError( - "Ranges must be sorted and non-overlapping: " + permittedPassageRanges); + throw new AssertionError("Ranges must be sorted and non-overlapping: " + permittedPassageRanges); } } } @@ -282,7 +281,8 @@ protected Passage[] pickDefaultPassage( CharSequence value, int maxCharacterWindow, int maxPassages, - List permittedPassageRanges) { + List permittedPassageRanges + ) { // Search for the first range that is not empty. ArrayList defaultPassages = new ArrayList<>(); for (OffsetRange o : permittedPassageRanges) { @@ -292,11 +292,7 @@ protected Passage[] pickDefaultPassage( int to = Math.min(value.length(), o.to); if (o.from < to) { - defaultPassages.add( - new Passage( - o.from, - o.from + Math.min(maxCharacterWindow, o.length()), - Collections.emptyList())); + defaultPassages.add(new Passage(o.from, o.from + Math.min(maxCharacterWindow, o.length()), Collections.emptyList())); } } diff --git a/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterTests.java b/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterTests.java index 838926171750f..0968c213a2d3b 100644 --- a/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterTests.java +++ b/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/MatchesHighlighterTests.java @@ -54,8 +54,8 @@ public void testMultipleFieldHighlighting() throws IOException { """)); SearchSourceBuilder search = new SearchSourceBuilder().query( - QueryBuilders.queryStringQuery("dickens OR cities").field("title").field("description").field("category")) - .highlighter(new HighlightBuilder().highlighterType("matches").field("title").field("category")); + QueryBuilders.queryStringQuery("dickens OR cities").field("title").field("description").field("category") + ).highlighter(new HighlightBuilder().highlighterType("matches").field("title").field("category")); Map highlights = highlight(mapperService, doc, search); assertHighlights(highlights, "title", "A tale of two cities"); @@ -79,8 +79,7 @@ public void testScoring() throws Exception { } """)); - HighlightBuilder highlight = new HighlightBuilder() - .field("description") + HighlightBuilder highlight = new HighlightBuilder().field("description") .highlighterType("matches") .numOfFragments(2) .fragmentSize(50); @@ -112,8 +111,7 @@ public void testAnalyzedOffsetLimit() throws IOException { } """)); - HighlightBuilder highlight = new HighlightBuilder() - .field("description") + HighlightBuilder highlight = new HighlightBuilder().field("description") .highlighterType("matches") .maxAnalyzedOffset(70) .numOfFragments(2) @@ -122,11 +120,7 @@ public void testAnalyzedOffsetLimit() throws IOException { .highlighter(highlight); Map highlights = highlight(mapperService, doc, search); - assertHighlights( - highlights, - "description", - "Lorem Ipsum string Generator that helps to create ..." - ); + assertHighlights(highlights, "description", "Lorem Ipsum string Generator that helps to create ..."); } // matched_fields - use matches from a set of different fields to highlight this one @@ -149,18 +143,14 @@ public void testMatchedFields() throws IOException { { "description" : "Here is some text" } """)); - HighlightBuilder highlight = new HighlightBuilder() - .field(new HighlightBuilder.Field("description").matchedFields("description", "description.stemmed")) - .highlighterType("matches"); + HighlightBuilder highlight = new HighlightBuilder().field( + new HighlightBuilder.Field("description").matchedFields("description", "description.stemmed") + ).highlighterType("matches"); SearchSourceBuilder search = new SearchSourceBuilder().query(QueryBuilders.termQuery("description.stemmed", "some")) .highlighter(highlight); Map highlights = highlight(mapperService, doc, search); - assertHighlights( - highlights, - "description", - "Here is some text" - ); + assertHighlights(highlights, "description", "Here is some text"); } } diff --git a/test/framework/src/main/java/org/elasticsearch/search/fetch/HighlighterTestCase.java b/test/framework/src/main/java/org/elasticsearch/search/fetch/HighlighterTestCase.java index d7697344dcf47..e1ed7c2aa33d9 100644 --- a/test/framework/src/main/java/org/elasticsearch/search/fetch/HighlighterTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/search/fetch/HighlighterTestCase.java @@ -9,9 +9,7 @@ package org.elasticsearch.search.fetch; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.QueryCachingPolicy; import org.apache.lucene.search.similarities.BM25Similarity; -import org.apache.lucene.search.similarities.Similarity; import org.elasticsearch.common.text.Text; import org.elasticsearch.index.cache.query.TrivialQueryCachingPolicy; import org.elasticsearch.index.mapper.MapperService; @@ -99,7 +97,13 @@ private static FetchContext fetchContext(SearchExecutionContext context, SearchS when(fetchContext.parsedQuery()).thenReturn(new ParsedQuery(search.query().toQuery(context))); when(fetchContext.getSearchExecutionContext()).thenReturn(context); when(fetchContext.sourceLoader()).thenReturn(context.newSourceLoader(false)); - ContextIndexSearcher cis = new ContextIndexSearcher(context.searcher().getIndexReader(), new BM25Similarity(), null, TrivialQueryCachingPolicy.NEVER, false); + ContextIndexSearcher cis = new ContextIndexSearcher( + context.searcher().getIndexReader(), + new BM25Similarity(), + null, + TrivialQueryCachingPolicy.NEVER, + false + ); when(fetchContext.searcher()).thenReturn(cis); return fetchContext; } From aeafa723ccd04e62813d383f6bed2ceba6c973cd Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Fri, 2 Dec 2022 14:51:23 +0000 Subject: [PATCH 4/6] notices --- .../fetch/subphase/highlight/XOffsetsFromMatchIterator.java | 4 +++- .../fetch/subphase/highlight/XOffsetsFromPositions.java | 4 +++- .../search/fetch/subphase/highlight/XPassageSelector.java | 4 ++++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromMatchIterator.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromMatchIterator.java index ac70b2fb1a6ac..fc67ca8f1d26c 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromMatchIterator.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromMatchIterator.java @@ -23,6 +23,7 @@ import org.apache.lucene.search.matchhighlight.OffsetRange; import org.apache.lucene.search.matchhighlight.OffsetsFromPositions; import org.apache.lucene.search.matchhighlight.OffsetsRetrievalStrategy; +import org.elasticsearch.Version; import java.io.IOException; import java.util.ArrayList; @@ -33,7 +34,6 @@ * This strategy retrieves offsets directly from {@link MatchesIterator}, if they are available, * otherwise it falls back to using {@link OffsetsFromPositions}. */ -// https://github.com/apache/lucene/pull/11983 public final class XOffsetsFromMatchIterator implements OffsetsRetrievalStrategy { private final String field; private final XOffsetsFromPositions noOffsetsFallback; @@ -41,6 +41,8 @@ public final class XOffsetsFromMatchIterator implements OffsetsRetrievalStrategy XOffsetsFromMatchIterator(String field, XOffsetsFromPositions noOffsetsFallback) { this.field = field; this.noOffsetsFallback = Objects.requireNonNull(noOffsetsFallback); + // https://github.com/apache/lucene/pull/11983 + assert org.apache.lucene.util.Version.fromBits(9, 5, 0).onOrAfter(Version.CURRENT.luceneVersion) == false; } @Override diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromPositions.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromPositions.java index 621f1fdc22863..dde05714c4033 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromPositions.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromPositions.java @@ -26,13 +26,13 @@ import org.apache.lucene.search.matchhighlight.MatchRegionRetriever; import org.apache.lucene.search.matchhighlight.OffsetRange; import org.apache.lucene.search.matchhighlight.OffsetsRetrievalStrategy; +import org.elasticsearch.Version; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.List; -// https://github.com/apache/lucene/pull/11983 public final class XOffsetsFromPositions implements OffsetsRetrievalStrategy { private final String field; private final Analyzer analyzer; @@ -40,6 +40,8 @@ public final class XOffsetsFromPositions implements OffsetsRetrievalStrategy { public XOffsetsFromPositions(String field, Analyzer analyzer) { this.field = field; this.analyzer = analyzer; + // https://github.com/apache/lucene/pull/11983 + assert org.apache.lucene.util.Version.fromBits(9, 5, 0).onOrAfter(Version.CURRENT.luceneVersion) == false; } @Override diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XPassageSelector.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XPassageSelector.java index 63b1417eb72bc..0a44daa5505ff 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XPassageSelector.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XPassageSelector.java @@ -22,6 +22,7 @@ import org.apache.lucene.search.matchhighlight.PassageAdjuster; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.PriorityQueue; +import org.elasticsearch.Version; import java.util.ArrayList; import java.util.Arrays; @@ -32,6 +33,7 @@ import java.util.RandomAccess; public class XPassageSelector { + public static final Comparator DEFAULT_SCORER = (a, b) -> { // Compare the number of highlights first. int v; @@ -65,6 +67,8 @@ public XPassageSelector() { public XPassageSelector(Comparator passageScorer, PassageAdjuster passageAdjuster) { this.passageScorer = passageScorer; this.passageAdjuster = passageAdjuster; + // https://github.com/apache/lucene/pull/11990 + assert org.apache.lucene.util.Version.fromBits(9, 5, 0).onOrAfter(Version.CURRENT.luceneVersion) == false; } public List pickBest(CharSequence value, List markers, int maxPassageWindow, int maxPassages) { From 1c0be7e3b2dae03cc1065771e49a7931ee2c32b1 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Fri, 2 Dec 2022 15:27:12 +0000 Subject: [PATCH 5/6] Update docs/changelog/92068.yaml --- docs/changelog/92068.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/changelog/92068.yaml diff --git a/docs/changelog/92068.yaml b/docs/changelog/92068.yaml new file mode 100644 index 0000000000000..cee8e23c6c4e2 --- /dev/null +++ b/docs/changelog/92068.yaml @@ -0,0 +1,5 @@ +pr: 92068 +summary: Add a new highlighter based on lucene's match highlighter +area: Highlighting +type: feature +issues: [] From 5b1a245ae9fb7709c0d7985c84b9a36f38da7acf Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Fri, 2 Dec 2022 16:05:54 +0000 Subject: [PATCH 6/6] duh --- .../fetch/subphase/highlight/XOffsetsFromMatchIterator.java | 2 +- .../search/fetch/subphase/highlight/XOffsetsFromPositions.java | 2 +- .../search/fetch/subphase/highlight/XPassageSelector.java | 2 +- .../search/fetch/subphase/highlight/HighlightFieldTests.java | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromMatchIterator.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromMatchIterator.java index fc67ca8f1d26c..74b1e49939ddd 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromMatchIterator.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromMatchIterator.java @@ -42,7 +42,7 @@ public final class XOffsetsFromMatchIterator implements OffsetsRetrievalStrategy this.field = field; this.noOffsetsFallback = Objects.requireNonNull(noOffsetsFallback); // https://github.com/apache/lucene/pull/11983 - assert org.apache.lucene.util.Version.fromBits(9, 5, 0).onOrAfter(Version.CURRENT.luceneVersion) == false; + assert Version.CURRENT.luceneVersion.onOrAfter(org.apache.lucene.util.Version.fromBits(9, 5, 0)) == false; } @Override diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromPositions.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromPositions.java index dde05714c4033..40821ae5cad1c 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromPositions.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XOffsetsFromPositions.java @@ -41,7 +41,7 @@ public XOffsetsFromPositions(String field, Analyzer analyzer) { this.field = field; this.analyzer = analyzer; // https://github.com/apache/lucene/pull/11983 - assert org.apache.lucene.util.Version.fromBits(9, 5, 0).onOrAfter(Version.CURRENT.luceneVersion) == false; + assert Version.CURRENT.luceneVersion.onOrAfter(org.apache.lucene.util.Version.fromBits(9, 5, 0)) == false; } @Override diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XPassageSelector.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XPassageSelector.java index 0a44daa5505ff..b76337cbeb354 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XPassageSelector.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/XPassageSelector.java @@ -68,7 +68,7 @@ public XPassageSelector(Comparator passageScorer, PassageAdjuster passa this.passageScorer = passageScorer; this.passageAdjuster = passageAdjuster; // https://github.com/apache/lucene/pull/11990 - assert org.apache.lucene.util.Version.fromBits(9, 5, 0).onOrAfter(Version.CURRENT.luceneVersion) == false; + assert Version.CURRENT.luceneVersion.onOrAfter(org.apache.lucene.util.Version.fromBits(9, 5, 0)) == false; } public List pickBest(CharSequence value, List markers, int maxPassageWindow, int maxPassages) { diff --git a/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightFieldTests.java b/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightFieldTests.java index bcede2a2c3cea..1183879f33a8f 100644 --- a/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightFieldTests.java +++ b/server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightFieldTests.java @@ -89,7 +89,7 @@ public void testToXContent() throws IOException { builder.endObject(); assertEquals(""" { - "foo" : null + "foo" : [ ] }""", Strings.toString(builder)); }