Skip to content

[8.x] ESQL: Push down StartsWith and EndsWith functions to Lucene (#123381) #124582

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/changelog/123381.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 123381
summary: Push down `StartsWith` and `EndsWith` functions to Lucene
area: ES|QL
type: enhancement
issues:
- 123067
172 changes: 172 additions & 0 deletions x-pack/plugin/esql/qa/testFixtures/src/main/resources/string.csv-spec
Original file line number Diff line number Diff line change
Expand Up @@ -965,6 +965,46 @@ false | null
false | null
;

startsWithLucenePushdown

from hosts
| where starts_with(host, "bet") and starts_with(host_group, "Kuber")
| keep host, host_group
| sort host, host_group;

host:keyword | host_group:text
beta | Kubernetes cluster
beta | Kubernetes cluster
beta | Kubernetes cluster
;

startsWithLuceneDisabledPushdown

from hosts
| where host == "unknown host" or (starts_with(host, "bet") and starts_with(host_group, "Kuber"))
| keep host, host_group
| sort host, host_group;

host:keyword | host_group:text
beta | Kubernetes cluster
beta | Kubernetes cluster
beta | Kubernetes cluster
;

startsWithLucenePushdownIgnoreMultivalues

from hosts
| where starts_with(description, "epsilon")
| keep description
| sort description;

warning:Line 2:9: evaluation of [starts_with(description, \"epsilon\")] failed, treating result as null. Only first 20 failures recorded.
warning:Line 2:9: java.lang.IllegalArgumentException: single-value function encountered multi-value

description:text
epsilon gw instance
;

substringOfText
required_capability: mv_warn

Expand Down Expand Up @@ -1193,6 +1233,138 @@ Bernatsky |false
;


endsWithLucenePushdown

from hosts
| where ends_with(host, "ta") and ends_with(host_group, "cluster")
| keep host, host_group
| sort host, host_group;

host:keyword | host_group:text
beta | Kubernetes cluster
beta | Kubernetes cluster
beta | Kubernetes cluster
;

endsWithLuceneDisabledPushdown

from hosts
| where host == "unknown host" or (ends_with(host, "ta") and ends_with(host_group, "cluster"))
| keep host, host_group
| sort host, host_group;

host:keyword | host_group:text
beta | Kubernetes cluster
beta | Kubernetes cluster
beta | Kubernetes cluster
;

endsWithLucenePushdownIgnoreMultivalues

from hosts
| where ends_with(description, "host")
| keep description
| sort description;

warning:Line 2:9: evaluation of [ends_with(description, \"host\")] failed, treating result as null. Only first 20 failures recorded.
warning:Line 2:9: java.lang.IllegalArgumentException: single-value function encountered multi-value

description:text
;


lucenePushdownMultipleWhere

from hosts
| where starts_with(host, "bet")
| keep host, host_group
| sort host, host_group
| where ends_with(host_group, "cluster");

host:keyword | host_group:text
beta | Kubernetes cluster
beta | Kubernetes cluster
beta | Kubernetes cluster
;

lucenePushdownMultipleIndices

from airports* metadata _index
| where starts_with(name::keyword, "Sahn") and ends_with(abbrev, "UH")
| keep abbrev, name, _index
| sort abbrev, name, _index;

abbrev:keyword | name:text | _index:keyword
LUH | Sahnewal | airports
LUH | Sahnewal | airports_mp
LUH | Sahnewal | airports_no_doc_values
LUH | Sahnewal | airports_not_indexed
LUH | Sahnewal | airports_not_indexed_nor_doc_values
LUH | Sahnewal | airports_web
;

lucenePushdownOr

from airports
| where starts_with(name::keyword, "Sahn") or ends_with(abbrev, "UH")
| keep abbrev, name
| sort abbrev, name;

abbrev:keyword | name:text
AUH | Abu Dhabi Int'l
LUH | Sahnewal
RUH | King Khalid Int'l
;

lucenePushdownMultipleOr

from airports
| where starts_with(name::keyword, "Sahn") or ends_with(abbrev, "UH") or starts_with(abbrev, "OOL")
| keep abbrev, name
| sort abbrev, name;

abbrev:keyword | name:text
AUH | Abu Dhabi Int'l
LUH | Sahnewal
OOL | Gold Coast
RUH | King Khalid Int'l
;

lucenePushdownMultipleAnd

from airports metadata _index
| where starts_with(name::keyword, "Sahn") and ends_with(abbrev, "UH")
| where ends_with(name::keyword, "al")
| keep abbrev, name, _index
| sort abbrev, name, _index;

abbrev:keyword | name:text | _index:keyword
LUH | Sahnewal | airports
;

lucenePushdownMixAndOr

from airports
| where starts_with(name::keyword, "Sahn") and (starts_with(name::keyword, "Abc") or ends_with(abbrev, "UH"))
| keep abbrev, name, scalerank
| sort abbrev, name;

abbrev:keyword | name:text | scalerank:integer
LUH | Sahnewal | 9
;

lucenePushdownMixOrAnd

from airports* metadata _index
| where starts_with(name::keyword, "Sahn") or (starts_with(abbrev, "G") and ends_with(name::keyword, "Falls Int'l"))
| where ends_with(_index, "airports")
| keep abbrev, name, scalerank, _index
| sort abbrev;

abbrev:keyword | name:text | scalerank:integer | _index:keyword
GTF | Great Falls Int'l | 8 | airports
LUH | Sahnewal | 9 | airports
;

toLowerRow#[skip:-8.12.99]
// tag::to_lower[]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,20 @@

package org.elasticsearch.xpack.esql.expression.function.scalar.string;

import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.lucene.BytesRefs;
import org.elasticsearch.compute.ann.Evaluator;
import org.elasticsearch.compute.operator.EvalOperator.ExpressionEvaluator;
import org.elasticsearch.xpack.esql.capabilities.TranslationAware;
import org.elasticsearch.xpack.esql.core.expression.Expression;
import org.elasticsearch.xpack.esql.core.expression.FieldAttribute;
import org.elasticsearch.xpack.esql.core.expression.FoldContext;
import org.elasticsearch.xpack.esql.core.querydsl.query.Query;
import org.elasticsearch.xpack.esql.core.querydsl.query.WildcardQuery;
import org.elasticsearch.xpack.esql.core.tree.NodeInfo;
import org.elasticsearch.xpack.esql.core.tree.Source;
import org.elasticsearch.xpack.esql.core.type.DataType;
Expand All @@ -22,6 +29,8 @@
import org.elasticsearch.xpack.esql.expression.function.Param;
import org.elasticsearch.xpack.esql.expression.function.scalar.EsqlScalarFunction;
import org.elasticsearch.xpack.esql.io.stream.PlanStreamInput;
import org.elasticsearch.xpack.esql.optimizer.rules.physical.local.LucenePushdownPredicates;
import org.elasticsearch.xpack.esql.planner.TranslatorHandler;

import java.io.IOException;
import java.util.Arrays;
Expand All @@ -31,7 +40,7 @@
import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.SECOND;
import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isString;

public class EndsWith extends EsqlScalarFunction {
public class EndsWith extends EsqlScalarFunction implements TranslationAware.SingleValueTranslationAware {
public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry(Expression.class, "EndsWith", EndsWith::new);

private final Expression str;
Expand Down Expand Up @@ -129,6 +138,27 @@ public ExpressionEvaluator.Factory toEvaluator(ToEvaluator toEvaluator) {
return new EndsWithEvaluator.Factory(source(), toEvaluator.apply(str), toEvaluator.apply(suffix));
}

@Override
public boolean translatable(LucenePushdownPredicates pushdownPredicates) {
return pushdownPredicates.isPushableAttribute(str) && suffix.foldable();
}

@Override
public Query asQuery(TranslatorHandler handler) {
LucenePushdownPredicates.checkIsPushableAttribute(str);
var fieldName = handler.nameOf(str instanceof FieldAttribute fa ? fa.exactAttribute() : str);

// TODO: Get the real FoldContext here
var wildcardQuery = "*" + QueryParser.escape(BytesRefs.toString(suffix.fold(FoldContext.small())));

return new WildcardQuery(source(), fieldName, wildcardQuery);
}

@Override
public Expression singleValueField() {
return str;
}

Expression str() {
return str;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,20 @@

package org.elasticsearch.xpack.esql.expression.function.scalar.string;

import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.lucene.BytesRefs;
import org.elasticsearch.compute.ann.Evaluator;
import org.elasticsearch.compute.operator.EvalOperator.ExpressionEvaluator;
import org.elasticsearch.xpack.esql.capabilities.TranslationAware;
import org.elasticsearch.xpack.esql.core.expression.Expression;
import org.elasticsearch.xpack.esql.core.expression.FieldAttribute;
import org.elasticsearch.xpack.esql.core.expression.FoldContext;
import org.elasticsearch.xpack.esql.core.querydsl.query.Query;
import org.elasticsearch.xpack.esql.core.querydsl.query.WildcardQuery;
import org.elasticsearch.xpack.esql.core.tree.NodeInfo;
import org.elasticsearch.xpack.esql.core.tree.Source;
import org.elasticsearch.xpack.esql.core.type.DataType;
Expand All @@ -22,6 +29,8 @@
import org.elasticsearch.xpack.esql.expression.function.Param;
import org.elasticsearch.xpack.esql.expression.function.scalar.EsqlScalarFunction;
import org.elasticsearch.xpack.esql.io.stream.PlanStreamInput;
import org.elasticsearch.xpack.esql.optimizer.rules.physical.local.LucenePushdownPredicates;
import org.elasticsearch.xpack.esql.planner.TranslatorHandler;

import java.io.IOException;
import java.util.Arrays;
Expand All @@ -31,7 +40,7 @@
import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.SECOND;
import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isString;

public class StartsWith extends EsqlScalarFunction {
public class StartsWith extends EsqlScalarFunction implements TranslationAware.SingleValueTranslationAware {
public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry(
Expression.class,
"StartsWith",
Expand Down Expand Up @@ -126,6 +135,27 @@ public ExpressionEvaluator.Factory toEvaluator(ToEvaluator toEvaluator) {
return new StartsWithEvaluator.Factory(source(), toEvaluator.apply(str), toEvaluator.apply(prefix));
}

@Override
public boolean translatable(LucenePushdownPredicates pushdownPredicates) {
return pushdownPredicates.isPushableAttribute(str) && prefix.foldable();
}

@Override
public Query asQuery(TranslatorHandler handler) {
LucenePushdownPredicates.checkIsPushableAttribute(str);
var fieldName = handler.nameOf(str instanceof FieldAttribute fa ? fa.exactAttribute() : str);

// TODO: Get the real FoldContext here
var wildcardQuery = QueryParser.escape(BytesRefs.toString(prefix.fold(FoldContext.small()))) + "*";

return new WildcardQuery(source(), fieldName, wildcardQuery);
}

@Override
public Expression singleValueField() {
return str;
}

Expression str() {
return str;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,21 @@

import org.apache.lucene.util.BytesRef;
import org.elasticsearch.xpack.esql.core.expression.Expression;
import org.elasticsearch.xpack.esql.core.expression.FieldAttribute;
import org.elasticsearch.xpack.esql.core.expression.Literal;
import org.elasticsearch.xpack.esql.core.querydsl.query.WildcardQuery;
import org.elasticsearch.xpack.esql.core.tree.Source;
import org.elasticsearch.xpack.esql.core.type.DataType;
import org.elasticsearch.xpack.esql.core.type.EsField;
import org.elasticsearch.xpack.esql.expression.function.AbstractScalarFunctionTestCase;
import org.elasticsearch.xpack.esql.expression.function.TestCaseSupplier;
import org.elasticsearch.xpack.esql.optimizer.rules.physical.local.LucenePushdownPredicates;
import org.elasticsearch.xpack.esql.planner.TranslatorHandler;
import org.hamcrest.Matcher;

import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.function.Supplier;

import static org.hamcrest.Matchers.equalTo;
Expand Down Expand Up @@ -98,4 +105,38 @@ private static TestCaseSupplier.TestCase testCase(
protected Expression build(Source source, List<Expression> args) {
return new EndsWith(source, args.get(0), args.get(1));
}

public void testLuceneQuery_AllLiterals_NonTranslatable() {
var function = new EndsWith(
Source.EMPTY,
new Literal(Source.EMPTY, "test", DataType.KEYWORD),
new Literal(Source.EMPTY, "test", DataType.KEYWORD)
);

assertThat(function.translatable(LucenePushdownPredicates.DEFAULT), equalTo(false));
}

public void testLuceneQuery_NonFoldableSuffix_NonTranslatable() {
var function = new EndsWith(
Source.EMPTY,
new FieldAttribute(Source.EMPTY, "field", new EsField("field", DataType.KEYWORD, Map.of(), true)),
new FieldAttribute(Source.EMPTY, "field", new EsField("suffix", DataType.KEYWORD, Map.of(), true))
);

assertThat(function.translatable(LucenePushdownPredicates.DEFAULT), equalTo(false));
}

public void testLuceneQuery_NonFoldableSuffix_Translatable() {
var function = new EndsWith(
Source.EMPTY,
new FieldAttribute(Source.EMPTY, "field", new EsField("suffix", DataType.KEYWORD, Map.of(), true)),
new Literal(Source.EMPTY, "a*b?c\\", DataType.KEYWORD)
);

assertThat(function.translatable(LucenePushdownPredicates.DEFAULT), equalTo(true));

var query = function.asQuery(TranslatorHandler.TRANSLATOR_HANDLER);

assertThat(query, equalTo(new WildcardQuery(Source.EMPTY, "field", "*a\\*b\\?c\\\\")));
}
}
Loading