Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(document-search): init document-search module with basic RAG capabilities on text #3

Merged
merged 2 commits into from
Sep 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions packages/ragnarok-document-search/examples/simple_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import asyncio

from ragnarok_document_search import DocumentSearch
from ragnarok_document_search.documents.document import DocumentMeta
from ragnarok_document_search.vector_store.in_memory import InMemoryVectorStore

from ragnarok_common.embeddings.litellm import LiteLLMEmbeddings

# Sample corpus: each literal below is turned into a text document through
# DocumentMeta.create_text_document_from_literal, in the order listed.
documents = [
    DocumentMeta.create_text_document_from_literal(joke)
    for joke in (
        "RIP boiled water. You will be mist.",
        "Why doesn't James Bond fart in bed? Because it would blow his cover.",
        "Why programmers don't like to swim? Because they're scared of the floating points.",
    )
]


async def main():
    """Ingest the sample documents one by one, then print the search results for a single query."""
    search_engine = DocumentSearch(
        embedder=LiteLLMEmbeddings(),
        vector_store=InMemoryVectorStore(),
    )

    # Documents are ingested sequentially, each ingestion awaited before the next starts.
    for sample in documents:
        await search_engine.ingest_document(sample)

    print(await search_engine.search("I'm boiling my water and I need a joke"))


# Run the example only when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())
3 changes: 3 additions & 0 deletions packages/ragnarok-document-search/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[build-system]
requires = ["setuptools >= 40.9.0", "wheel"]
build-backend = "setuptools.build_meta"
43 changes: 43 additions & 0 deletions packages/ragnarok-document-search/setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
[metadata]
name = ragnarok-document-search
# do not change version by hand: use bump_version.sh
version = 0.0.1
description = The ragstack module responsible for fetching data from unstructured data sources.
author = deepsense.ai
author_email = [email protected]
license = Other/Proprietary License
license_files = LICENSE.md
classifiers =
Development Status :: 1 - Planning
Environment :: Console
Intended Audience :: Science/Research
License :: Other/Proprietary License
Natural Language :: English
Operating System :: Independent
Programming Language :: Python :: 3.10
Programming Language :: Python :: 3.11
Programming Language :: Python :: 3.12
Topic :: AI
Private :: Do Not Upload

[options]
package_dir=
=src
packages=find:
zip_safe = False
platforms = any
include_package_data = True
python_requires = >=3.10
install_requires =
mhordynski marked this conversation as resolved.
Show resolved Hide resolved
numpy>=1.24.0
pydantic>=2.8.2

[options.packages.find]
where=src

[bdist_wheel]
universal = 1

[aliases]
# Alias `setup.py test` to `setup.py pytest`
test = pytest
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from ._main import DocumentSearch

# Public API of the package: DocumentSearch is the only name exported from here.
__all__ = ["DocumentSearch"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""Version information."""

# NOTE(review): setup.cfg declares the same version and says it must be changed
# with bump_version.sh rather than by hand - presumably that script updates this
# value as well; confirm before editing it manually.
__version__ = "0.0.1"
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from ragnarok_document_search.documents.document import DocumentMeta
from ragnarok_document_search.documents.element import Element
from ragnarok_document_search.ingestion.document_processor import DocumentProcessor
from ragnarok_document_search.retrieval.rephrasers.base import QueryRephraser
from ragnarok_document_search.retrieval.rephrasers.noop import NoopQueryRephraser
from ragnarok_document_search.retrieval.rerankers.base import Reranker
from ragnarok_document_search.retrieval.rerankers.noop import NoopReranker
from ragnarok_document_search.vector_store.base import VectorStore

from ragnarok_common.embeddings.base import Embeddings


class DocumentSearch:
    """
    A main entrypoint to the DocumentSearch functionality.

    It provides methods for both ingestion and retrieval.

    Retrieval:

    1. Uses QueryRephraser to rephrase the query.
    2. Uses VectorStore to retrieve the most relevant chunks.
    3. Uses Reranker to rerank the chunks.

    Ingestion:

    1. Uses DocumentProcessor to turn the document into elements.
    2. Uses Embeddings to embed the key of every element.
    3. Stores the resulting entries in the VectorStore.
    """

    embedder: Embeddings

    vector_store: VectorStore

    query_rephraser: QueryRephraser
    reranker: Reranker

    def __init__(
        self,
        embedder: Embeddings,
        vector_store: VectorStore,
        query_rephraser: QueryRephraser | None = None,
        reranker: Reranker | None = None,
    ) -> None:
        """
        Create a document search instance.

        Args:
            embedder: Used to embed both element keys (ingestion) and queries (retrieval).
            vector_store: Where entries are stored and retrieved from.
            query_rephraser: Rephraser applied to every query; NoopQueryRephraser is used when omitted.
            reranker: Reranker applied to the retrieved chunks; NoopReranker is used when omitted.
        """
        self.embedder = embedder
        self.vector_store = vector_store
        self.query_rephraser = query_rephraser or NoopQueryRephraser()
        self.reranker = reranker or NoopReranker()

    async def search(self, query: str) -> list[Element]:
        """
        Search for the most relevant chunks for a query.

        Args:
            query: The query to search for.

        Returns:
            A list of chunks.
        """
        queries = self.query_rephraser.rephrase(query)
        chunks: list[Element] = []
        for rephrased_query in queries:
            # The embedder works on batches, so a single query is sent as a
            # one-item list and its vector is read back from position 0.
            search_vector = await self.embedder.embed_text([rephrased_query])
            # TODO: search parameters should be configurable
            entries = await self.vector_store.retrieve(search_vector[0], k=1)
            chunks.extend(Element.from_vector_db_entry(entry) for entry in entries)

        return self.reranker.rerank(chunks)

    async def ingest_document(self, document: DocumentMeta) -> None:
        """
        Ingest a document.

        Args:
            document: The document to ingest.

        Raises:
            ValueError: If the embedder returns a different number of vectors than
                the number of elements it was given.
        """
        # TODO: This is a placeholder implementation. It should be replaced with a real implementation.

        document_processor = DocumentProcessor()
        elements = await document_processor.process(document)
        if not elements:
            # Nothing to embed or store; skip the embedder and vector store calls.
            return

        vectors = await self.embedder.embed_text([element.get_key() for element in elements])
        # strict=True turns a length mismatch into an error instead of silently
        # dropping the elements that did not get a vector.
        entries = [
            element.to_vector_db_entry(vector)
            for element, vector in zip(elements, vectors, strict=True)
        ]
        await self.vector_store.store(entries)
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import tempfile
from enum import Enum
from pathlib import Path
from typing import Union

from pydantic import BaseModel, Field
from ragnarok_document_search.documents.sources import LocalFileSource


class DocumentType(str, Enum):
    """Kinds of documents that can be stored; every member is also a plain string."""

    # Markdown document.
    MD = "md"
    # Plain-text document.
    TXT = "txt"


class DocumentMeta(BaseModel):
    """
    Metadata describing a document: its type and the source it is fetched from.
    """

    # Kind of the document; Document.from_document_meta uses it to pick the class to build.
    document_type: DocumentType
    # Where the document comes from; validated with "source_type" as the discriminator.
    source: Union[LocalFileSource] = Field(..., discriminator="source_type")

    @property
    def id(self) -> str:
        """
        Identifier of the document, as reported by its source.

        Returns:
            The document ID.
        """
        return self.source.get_id()

    async def fetch(self) -> "Document":
        """
        Fetch the document from its source and wrap the result in a Document object.

        Which Document class is instantiated is decided by Document.from_document_meta,
        based on the document type.

        Returns:
            The document.
        """
        fetched_path = await self.source.fetch()
        return Document.from_document_meta(document_meta=self, local_path=fetched_path)

    @classmethod
    def create_text_document_from_literal(cls, content: str) -> "DocumentMeta":
        """
        Build metadata for a TXT document whose content is the given string.

        The content is written to a temporary file which is not removed
        automatically (it is created with delete=False); the returned metadata
        points at that file through a LocalFileSource.

        Args:
            content: The content of the document.

        Returns:
            The document metadata.
        """
        with tempfile.NamedTemporaryFile(delete=False) as scratch_file:
            encoded_content = content.encode()
            scratch_file.write(encoded_content)

        # Built only after the file has been closed, so the content is on disk.
        literal_source = LocalFileSource(path=Path(scratch_file.name))
        return cls(document_type=DocumentType.TXT, source=literal_source)


class Document(BaseModel):
    """
    A document that is stored locally, together with the metadata it was created from.
    """

    # Filesystem location of the document.
    local_path: Path
    # Metadata the document was created from.
    metadata: DocumentMeta

    @classmethod
    def from_document_meta(cls, document_meta: DocumentMeta, local_path: Path) -> "Document":
        """
        Build the document object matching the type declared in the metadata.

        MD and TXT documents become a TextDocument; any other type is built
        with the class this method was called on.

        Args:
            document_meta: The document metadata.
            local_path: The local path to the document.

        Returns:
            The document.
        """
        text_types = (DocumentType.MD, DocumentType.TXT)
        document_cls = TextDocument if document_meta.document_type in text_types else cls
        return document_cls(local_path=local_path, metadata=document_meta)


class TextDocument(Document):
    """
    An object representing a text document.
    """

    @property
    def content(self) -> str:
        """
        Get the content of the document.

        The file is decoded as UTF-8 explicitly rather than with the platform
        default encoding. Documents created from a literal are written with
        ``str.encode()``, i.e. as UTF-8, so reading them back must not depend
        on the locale of the machine.

        Returns:
            The content of the document.
        """
        return self.local_path.read_text(encoding="utf-8")
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from abc import ABC, abstractmethod
from typing import ClassVar

from pydantic import BaseModel
from ragnarok_document_search.documents.document import DocumentMeta
from ragnarok_document_search.vector_store.base import VectorDBEntry


class Element(BaseModel, ABC):
    """
    An object representing an element in a document.

    Every subclass must give ``element_type`` a string default. That value is
    the key under which the subclass is registered, and ``from_vector_db_entry``
    uses it to pick the class to rebuild from stored metadata.
    """

    element_type: str
    document: DocumentMeta

    # Maps an element_type value to the Element subclass registered for it.
    _elements_registry: ClassVar[dict[str, type["Element"]]] = {}

    @abstractmethod
    def get_key(self) -> str:
        """
        Get the key of the element which will be used to generate the vector.

        Returns:
            The key.
        """

    @classmethod
    def __pydantic_init_subclass__(cls, **kwargs):  # pylint: disable=unused-argument
        """
        Register a newly defined subclass under its ``element_type`` default.

        Raises:
            ValueError: If the subclass does not declare a string default for ``element_type``.
        """
        element_type_default = cls.model_fields["element_type"].default

        # A field declared without a default does not hold None here but
        # pydantic's "undefined" sentinel, so a check for None alone would let
        # such a subclass through and register the sentinel as a registry key.
        if not isinstance(element_type_default, str):
            raise ValueError("Element type must be defined")

        Element._elements_registry[element_type_default] = cls

    @classmethod
    def from_vector_db_entry(cls, db_entry: VectorDBEntry) -> "Element":
        """
        Create an element from a vector database entry.

        The entry metadata must contain an "element_type" key naming a
        registered subclass; the whole metadata is passed to that subclass as
        keyword arguments.

        Args:
            db_entry: The vector database entry.

        Returns:
            The element.

        Raises:
            KeyError: If the metadata has no "element_type" or the type is not registered.
        """
        meta = db_entry.metadata
        element_type = meta["element_type"]
        element_cls = Element._elements_registry[element_type]

        return element_cls(**meta)

    def to_vector_db_entry(self, vector: list[float]) -> VectorDBEntry:
        """
        Create a vector database entry from the element.

        The entry key is the element key and the metadata is the full model
        dump, which is what ``from_vector_db_entry`` reads back.

        Args:
            vector: The vector.

        Returns:
            The vector database entry
        """
        return VectorDBEntry(
            key=self.get_key(),
            vector=vector,
            metadata=self.model_dump(),
        )


class TextElement(Element):
    """
    A document element that holds a piece of text.
    """

    # Default used as this class's element type.
    element_type: str = "text"
    # The text carried by the element.
    content: str

    def get_key(self) -> str:
        """
        Return the text content, which serves as the key used to generate the vector.

        Returns:
            The key.
        """
        return self.content
Loading
Loading