Skip to content

Commit

Permalink
Support for search with metadata filter expressions
Browse files Browse the repository at this point in the history
 - Extend the VectorStore with similaritySearch using metadata filters using internal DSL and external DSL using Antlr
 - Metadata support for Pinecone, Milvus, and pgvector vector stores
 - PGVectorStore uses explicit ::jsonpath casting for the pgvector filter expression to avoid injections
 - Add unit tests for the filter converters, parser and DSL.
 - Add ITs for the 3 vector stores

Resolves: #75
  • Loading branch information
tzolov authored and markpollack committed Nov 7, 2023
1 parent bc12d43 commit 3615667
Show file tree
Hide file tree
Showing 33 changed files with 4,752 additions and 30 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,5 @@ node
node_modules
package-lock.json
package.json
.vscode
.vscode
.antlr
40 changes: 40 additions & 0 deletions spring-ai-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

<properties>
<jsonschema.version>4.31.1</jsonschema.version>
<antlr.version>4.13.1</antlr.version>
</properties>

<dependencies>
Expand All @@ -31,6 +32,13 @@
<version>${stringtemplate.version}</version>
</dependency>

<dependency>
<groupId>org.antlr</groupId>
<artifactId>antlr4-runtime</artifactId>
<version>${antlr.version}</version>
</dependency>


<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-messaging</artifactId>
Expand Down Expand Up @@ -68,4 +76,36 @@

</dependencies>

<profiles>
<!-- Opt-in profile (not active by default) that runs the ANTLR4 Maven plugin over the
grammar files under src/main/resources/antlr4, with visitor generation enabled.
It has to be activated explicitly, e.g. with -Pantlr4. -->
<profile>
<id>antlr4</id>
<activation>
<activeByDefault>false</activeByDefault>
</activation>
<build>
<plugins>
<plugin>
<groupId>org.antlr</groupId>
<artifactId>antlr4-maven-plugin</artifactId>
<version>${antlr.version}</version>
<configuration>
<sourceDirectory>${basedir}/src/main/resources/antlr4</sourceDirectory>
<!-- Generated sources are written straight into src/main/java instead of the
target/generated-sources alternative kept commented out below.
NOTE(review): presumably so that the generated parser classes are checked in and
regular builds do not need this profile; confirm. -->
<outputDirectory>${basedir}/src/main/java</outputDirectory>
<!-- <outputDirectory>${project.build.directory}/generated-sources/antlr4</outputDirectory> -->
<visitor>true</visitor>
</configuration>
<executions>
<execution>
<goals>
<goal>antlr4</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
</profiles>


</project>
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,29 @@

import org.springframework.ai.document.Document;
import org.springframework.ai.document.DocumentWriter;
import org.springframework.ai.vectorstore.filter.Filter;
import org.springframework.ai.vectorstore.filter.FilterExpressionBuilder;
import org.springframework.ai.vectorstore.filter.FilterExpressionTextParser;

public interface VectorStore extends DocumentWriter {

/**
* Adds Documents to the vector store.
* @param documents the list of documents to store Will throw an exception if the
* underlying provider checks for duplicate IDs on add
* Adds list of {@link Document}s to the vector store.
* @param documents the list of documents to store. Throws an exception if the
* underlying provider checks for duplicate IDs.
*/
void add(List<Document> documents);

/**
 * Consumes the given documents by handing them over to {@link #add(List)}.
 * @param documents the documents to store.
 */
@Override
default void accept(List<Document> documents) {
	this.add(documents);
}

/**
* Deletes documents from the vector store.
* @param idList list of document ids for which documents will be removed.
* @return an {@link Optional} boolean result. NOTE(review): presumably present and
* {@code true} when the deletion succeeded; the exact contract is not visible here -
* confirm against the implementations.
*/
Optional<Boolean> delete(List<String> idList);

List<Document> similaritySearch(String query);
Expand All @@ -34,4 +43,94 @@ default void accept(List<Document> documents) {
*/
List<Document> similaritySearch(String query, int k, double threshold);

/**
* Retrieves documents by query embedding similarity and metadata filters to retrieve
* exactly the number of nearest-neighbor results that match the filters.
*
* For example if your {@link Document#getMetadata()} has a schema like:
*
* <pre>{@code
* {
* "country": <Text>,
* "city": <Text>,
* "year": <Number>,
* "price": <Decimal>,
* "isActive": <Boolean>
* }
* }</pre>
*
* then you can constrain the search result with metadata filter expressions
* equivalent to (country == 'UK' AND year >= 2020 AND isActive == true). You can
* build this filter programmatically like this:
*
* <pre>{@code
*
* new Filter.Expression(AND,
* new Expression(EQ, new Key("country"), new Value("UK")),
* new Expression(AND,
* new Expression(GTE, new Key("year"), new Value(2020)),
* new Expression(EQ, new Key("isActive"), new Value(true))));
*
* }</pre>
*
* and it will ensure that the response contains only embeddings that match the
* specified filter criteria. <br/>
*
* The {@link Filter.Expression} is portable across all vector stores that offer
* metadata filtering. The {@link FilterExpressionBuilder} is an expression DSL and
* the {@link FilterExpressionTextParser} is a text expression parser; both build
* {@link Filter.Expression} instances.
*
* This default implementation does not perform any search: it always throws
* {@link UnsupportedOperationException}.
* @param query the query used for the embedding similarity search.
* @param topK the top 'k' similar results to return.
* @param similarityThreshold the lower bound of the similarity score
* @param filterExpression portable metadata filter expression.
* @return similar documents that match the requested similarity threshold and filter.
* @throws UnsupportedOperationException always, unless this method is overridden.
*/
default List<Document> similaritySearch(String query, int topK, double similarityThreshold,
Filter.Expression filterExpression) {
throw new UnsupportedOperationException("This vector store doesn't support search filtering");
}

/**
 * Text-expression variant of the filtered similarity search: retrieves documents by
 * query embedding similarity, restricted to those whose metadata matches the given
 * filter expression.
 *
 * Assuming the {@link Document#getMetadata()} follows a schema such as:
 *
 * <pre>{@code
 * {
 *   "country": <Text>,
 *   "city": <Text>,
 *   "year": <Number>,
 *   "price": <Decimal>,
 *   "isActive": <Boolean>
 * }
 * }</pre>
 *
 * the search result can be constrained with metadata filter expressions like:
 *
 * <pre>{@code
 * country == 'UK' && year >= 2020 && isActive == true
 * Or
 * country == 'BG' && (city NOT IN ['Sofia', 'Plovdiv'] || price < 134.34)
 * }</pre>
 *
 * so that the response contains only embeddings matching the specified filter
 * criteria. <br/>
 *
 * The declarative, SQL like, filter syntax is portable across all vector stores
 * supporting the filter search feature.<br/>
 *
 * The text is converted into a {@link Filter.Expression} by the
 * {@link FilterExpressionTextParser} obtained from {@link Filter#parser()}, and the
 * call is then delegated to the {@link Filter.Expression} based overload.
 * @param query the query used for the embedding similarity search.
 * @param topK the top 'k' similar results to return.
 * @param similarityThreshold the lower bound of the similarity score
 * @param filterExpression portable metadata filter expression, in text form.
 * @return similar documents that match the requested similarity threshold and filter.
 */
default List<Document> similaritySearch(String query, int topK, double similarityThreshold,
		String filterExpression) {
	return similaritySearch(query, topK, similarityThreshold, Filter.parser().parse(filterExpression));
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
/*
* Copyright 2023-2023 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.springframework.ai.vectorstore.filter;

/**
* Portable runtime model for metadata filter expressions. This generic model is used to
* define store agnostic filter expressions that can later be converted into vector-store
* specific, native, expressions.
*
* The expression model supports constant comparison {@code (e.g. ==, !=, <, <=, >, >=) },
* IN/NOT-IN (NIN) checks and AND and OR to compose multiple expressions.
*
* For example:
*
* <pre>{@code
* // 1: country == "BG"
* new Expression(EQ, new Key("country"), new Value("BG"));
*
* // 2: genre == "drama" AND year >= 2020
* new Expression(AND, new Expression(EQ, new Key("genre"), new Value("drama")),
* new Expression(GTE, new Key("year"), new Value(2020)));
*
* // 3: genre in ["comedy", "documentary", "drama"]
* new Expression(IN, new Key("genre"), new Value(List.of("comedy", "documentary", "drama")));
*
* // 4: year >= 2020 OR country == "BG" AND city != "Sofia"
* new Expression(OR, new Expression(GTE, new Key("year"), new Value(2020)),
* new Expression(AND, new Expression(EQ, new Key("country"), new Value("BG")),
* new Expression(NE, new Key("city"), new Value("Sofia"))));
*
* // 5: (country == "BG" OR year >= 2020) AND city NIN ["Sofia", "Varna"]
* new Expression(AND,
* new Group(new Expression(OR, new Expression(EQ, new Key("country"), new Value("BG")),
* new Expression(GTE, new Key("year"), new Value(2020)))),
* new Expression(NIN, new Key("city"), new Value(List.of("Sofia", "Varna"))));
*
* // 6: isOpen == true AND year >= 2020 AND country IN ["BG", "NL", "US"]
* new Expression(AND, new Expression(EQ, new Key("isOpen"), new Value(true)),
* new Expression(AND, new Expression(GTE, new Key("year"), new Value(2020)),
* new Expression(IN, new Key("country"), new Value(List.of("BG", "NL", "US")))));
*
* }</pre>
*
*
* Usually you will not create expressions manually but use either the
* {@link Filter#builder()} DSL or the {@link Filter#parser()} for parsing generic text
* expressions. Follow the {@link FilterExpressionBuilder} and
* {@link FilterExpressionTextParser} documentation for how to use them.
*
* @author Christian Tzolov
*/
public class Filter {

/**
* DSL builder for creating {@link Filter.Expression} programmatically.
* @return a new {@link FilterExpressionBuilder} instance on every call.
*/
public static FilterExpressionBuilder builder() {
return new FilterExpressionBuilder();
}

/**
* Parses a portable filter expression text language into {@link Filter.Expression}.
* @return a new {@link FilterExpressionTextParser} instance on every call.
*/
public static FilterExpressionTextParser parser() {
return new FilterExpressionTextParser();
}

/**
* Marker interface representing the supported expression types: {@link Key},
* {@link Value}, {@link Expression} and {@link Group}.
*/
public interface Operand {

}

/**
* String identifier representing an expression key. (e.g. the country in the country
* == "NL" expression).
*
* @param key the key identifier.
*/
public record Key(String key) implements Operand {
}

/**
* Represents an expression value constant or constant array. Supports Numeric, Boolean
* and String data types.
*
* @param value the constant, or the collection of constants for IN/NIN checks. It is
* typed as {@link Object}; the record itself does not check the data type.
*/
public record Value(Object value) implements Operand {
}

/**
* Filter expression operations. <br/>
*
* - EQ, NE, GT, GTE, LT, LTE operations support "Key ExprType Value"
* expressions.<br/>
*
* - AND, OR are binary operations that support "(Expression|Group) ExprType
* (Expression|Group)" expressions. <br/>
*
* - IN, NIN support "Key (IN|NIN) ArrayValue" expressions. <br/>
*/
public enum ExpressionType {

AND, OR, EQ, NE, GT, GTE, LT, LTE, IN, NIN

}

/**
* Triple that represents a filter boolean expression as
* <code>left type right</code>.
*
* The operand constraints described below are conventions only: both operands are
* declared as {@link Operand} and the record performs no validation of their types.
*
* @param type Specify the expression type.
* @param left For comparison and inclusion expression types, the operand must be of
* type {@link Key} and for the AND|OR expression types the left operand must be
* another {@link Expression}.
* @param right For comparison and inclusion expression types, the operand must be of
* type {@link Value} or array of values. For the AND|OR type the right operand must
* be another {@link Expression}.
*/
public record Expression(ExpressionType type, Operand left, Operand right) implements Operand {
}

/**
* Represents expression grouping (e.g. (...) ) that indicates that the group needs to
* be evaluated with a precedence.
*
* @param content Inner expression to be evaluated as a part of the group.
*/
public record Group(Expression content) implements Operand {
}

}
Loading

0 comments on commit 3615667

Please sign in to comment.