Skip to content

Commit acae036

Browse files
authored
ESQL: Fix alias removal in regex extraction with JOIN (#127687) (#128204) (#130566)
* Disallow removal of regex extracted fields
1 parent f8bdb35 commit acae036

File tree

5 files changed

+139
-9
lines changed

5 files changed

+139
-9
lines changed

docs/changelog/127687.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 127687
2+
summary: "ESQL: Fix alias removal in regex extraction with JOIN"
3+
area: ES|QL
4+
type: bug
5+
issues:
6+
- 127467

x-pack/plugin/esql/qa/testFixtures/src/main/resources/lookup-join.csv-spec

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1613,3 +1613,72 @@ FROM sample_data_ts_nanos
16131613
2023-10-23T12:27:28.948123456Z | 172.21.2.113 | 2764889 | Connected to 10.1.0.2
16141614
2023-10-23T12:15:03.360123456Z | 172.21.2.162 | 3450233 | Connected to 10.1.0.3
16151615
;
1616+
1617+
1618+
joinMaskingRegex
1619+
// https://github.com/elastic/elasticsearch/issues/127467
1620+
required_capability: union_types
1621+
required_capability: join_lookup_v12
1622+
required_capability: fix_join_masking_regex_extract
1623+
from books,message_*,ul*
1624+
| enrich languages_policy on status
1625+
| drop `language_name`, `bytes_out`, `id`, id
1626+
| dissect book_no "%{type}"
1627+
| dissect author.keyword "%{HZicfARaID}"
1628+
| mv_expand `status`
1629+
| sort HZicfARaID, year DESC NULLS LAST, publisher DESC NULLS FIRST, description DESC, type NULLS LAST, message ASC NULLS LAST, title NULLS FIRST, status NULLS LAST
1630+
| enrich languages_policy on book_no
1631+
| grok message "%{WORD:DiLNyZKNDu}"
1632+
| limit 7972
1633+
| rename year as language_code
1634+
| lookup join languages_lookup on language_code
1635+
| limit 13966
1636+
| stats rcyIZnSOb = min(language_code), `ratings` = min(@timestamp), dgDxwMeFYrD = count(`@timestamp`), ifyZfXigqVN = count(*), qTXdrzSpY = min(language_code) by author.keyword
1637+
| rename author.keyword as message
1638+
| lookup join message_types_lookup on message
1639+
| stats `ratings` = count(*) by type
1640+
| stats `type` = count(type), `ratings` = count(*)
1641+
| keep `ratings`, ratings
1642+
;
1643+
1644+
ratings:long
1645+
1
1646+
;
1647+
1648+
joinMaskingDissect
1649+
// https://github.com/elastic/elasticsearch/issues/127467
1650+
required_capability: join_lookup_v12
1651+
required_capability: fix_join_masking_regex_extract
1652+
from sample_data
1653+
| dissect message "%{type}"
1654+
| drop type
1655+
| lookup join message_types_lookup on message
1656+
| stats count = count(*) by type
1657+
| keep count
1658+
| sort count
1659+
;
1660+
count:long
1661+
1
1662+
3
1663+
3
1664+
;
1665+
1666+
1667+
joinMaskingGrok
1668+
// https://github.com/elastic/elasticsearch/issues/127467
1669+
required_capability: join_lookup_v12
1670+
required_capability: fix_join_masking_regex_extract
1671+
from sample_data
1672+
| grok message "%{WORD:type}"
1673+
| drop type
1674+
| lookup join message_types_lookup on message
1675+
| stats max = max(event_duration) by type
1676+
| keep max
1677+
| sort max
1678+
;
1679+
1680+
max:long
1681+
1232382
1682+
3450233
1683+
8268153
1684+
;

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -740,7 +740,13 @@ public enum Cap {
740740
* Support for keeping `DROP` attributes when resolving field names.
741741
* see <a href="https://github.com/elastic/elasticsearch/issues/126418"> ES|QL: no matches for pattern #126418 </a>
742742
*/
743-
DROP_AGAIN_WITH_WILDCARD_AFTER_EVAL;
743+
DROP_AGAIN_WITH_WILDCARD_AFTER_EVAL,
744+
745+
/**
746+
* During resolution (pre-analysis) we have to consider that joins can override regex extracted values
747+
* see <a href="https://github.com/elastic/elasticsearch/issues/127467"> ES|QL: pruning of JOINs leads to missing fields #127467</a>
748+
*/
749+
FIX_JOIN_MASKING_REGEX_EXTRACT;
744750

745751
private final boolean enabled;
746752

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/session/EsqlSession.java

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141
import org.elasticsearch.xpack.esql.core.expression.Expressions;
4242
import org.elasticsearch.xpack.esql.core.expression.FoldContext;
4343
import org.elasticsearch.xpack.esql.core.expression.MetadataAttribute;
44+
import org.elasticsearch.xpack.esql.core.expression.NamedExpression;
45+
import org.elasticsearch.xpack.esql.core.expression.ReferenceAttribute;
4446
import org.elasticsearch.xpack.esql.core.expression.UnresolvedAttribute;
4547
import org.elasticsearch.xpack.esql.core.expression.UnresolvedStar;
4648
import org.elasticsearch.xpack.esql.core.util.Holder;
@@ -592,11 +594,7 @@ static PreAnalysisResult fieldNames(LogicalPlan parsed, Set<String> enrichPolicy
592594

593595
parsed.forEachDown(p -> {// go over each plan top-down
594596
if (p instanceof RegexExtract re) { // for Grok and Dissect
595-
// remove other down-the-tree references to the extracted fields
596-
for (Attribute extracted : re.extractedFields()) {
597-
references.removeIf(attr -> matchByName(attr, extracted.name(), false));
598-
}
599-
// but keep the inputs needed by Grok/Dissect
597+
// keep the inputs needed by Grok/Dissect
600598
references.addAll(re.input().references());
601599
} else if (p instanceof Enrich enrich) {
602600
AttributeSet enrichRefs = Expressions.references(enrich.enrichFields());
@@ -651,15 +649,19 @@ static PreAnalysisResult fieldNames(LogicalPlan parsed, Set<String> enrichPolicy
651649
// remove any already discovered UnresolvedAttributes that are in fact aliases defined later down in the tree
652650
// for example "from test | eval x = salary | stats max = max(x) by gender"
653651
// remove the UnresolvedAttribute "x", since that is an Alias defined in "eval"
652+
// also remove other down-the-tree references to the extracted fields from "grok" and "dissect"
654653
AttributeSet planRefs = p.references();
655654
Set<String> fieldNames = planRefs.names();
656-
p.forEachExpressionDown(Alias.class, alias -> {
655+
p.forEachExpressionDown(NamedExpression.class, ne -> {
656+
if ((ne instanceof Alias || ne instanceof ReferenceAttribute) == false) {
657+
return;
658+
}
657659
// do not remove the UnresolvedAttribute that has the same name as its alias, ie "rename id AS id"
658660
// or the UnresolvedAttributes that are used in Functions that have aliases "STATS id = MAX(id)"
659-
if (fieldNames.contains(alias.name())) {
661+
if (fieldNames.contains(ne.name())) {
660662
return;
661663
}
662-
references.removeIf(attr -> matchByName(attr, alias.name(), shadowingRefs.contains(attr)));
664+
references.removeIf(attr -> matchByName(attr, ne.name(), shadowingRefs.contains(attr)));
663665
});
664666
}
665667
});

x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/session/IndexResolverFieldNamesTests.java

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1341,6 +1341,53 @@ public void testDissectOverwriteName() {
13411341
assertThat(fieldNames, equalTo(Set.of("emp_no", "emp_no.*", "first_name", "first_name.*")));
13421342
}
13431343

1344+
/**
1345+
* Fix alias removal in regex extraction with JOIN
1346+
* @see <a href="https://github.com/elastic/elasticsearch/issues/127467">ES|QL: pruning of JOINs leads to missing fields</a>
1347+
*/
1348+
public void testAvoidGrokAttributesRemoval() {
1349+
assumeTrue("LOOKUP JOIN available as snapshot only", EsqlCapabilities.Cap.JOIN_LOOKUP_V12.isEnabled());
1350+
Set<String> fieldNames = fieldNames("""
1351+
from message_types
1352+
| eval type = 1
1353+
| lookup join message_types_lookup on message
1354+
| drop message
1355+
| grok type "%{WORD:b}"
1356+
| stats x = max(b)
1357+
| keep x""", Set.of());
1358+
assertThat(fieldNames, equalTo(Set.of("message", "x", "x.*", "message.*")));
1359+
}
1360+
1361+
public void testAvoidGrokAttributesRemoval2() {
1362+
assumeTrue("LOOKUP JOIN available as snapshot only", EsqlCapabilities.Cap.JOIN_LOOKUP_V12.isEnabled());
1363+
Set<String> fieldNames = fieldNames("""
1364+
from sample_data
1365+
| dissect message "%{type}"
1366+
| drop type
1367+
| lookup join message_types_lookup on message
1368+
| stats count = count(*) by type
1369+
| keep count
1370+
| sort count""", Set.of());
1371+
assertThat(fieldNames, equalTo(Set.of("type", "message", "count", "message.*", "type.*", "count.*")));
1372+
}
1373+
1374+
public void testAvoidGrokAttributesRemoval3() {
1375+
assumeTrue("LOOKUP JOIN available as snapshot only", EsqlCapabilities.Cap.JOIN_LOOKUP_V12.isEnabled());
1376+
Set<String> fieldNames = fieldNames("""
1377+
from sample_data
1378+
| grok message "%{WORD:type}"
1379+
| drop type
1380+
| lookup join message_types_lookup on message
1381+
| stats max = max(event_duration) by type
1382+
| keep max
1383+
| sort max""", Set.of());
1384+
assertThat(
1385+
fieldNames,
1386+
equalTo(Set.of("type", "event_duration", "message", "max", "event_duration.*", "message.*", "type.*", "max.*"))
1387+
);
1388+
1389+
}
1390+
13441391
public void testEnrichOnDefaultField() {
13451392
Set<String> fieldNames = fieldNames("""
13461393
from employees

0 commit comments

Comments
 (0)