Skip to content

Commit

Permalink
feat(document-search): init document-search module with basic RAG cap…
Browse files Browse the repository at this point in the history
…abilities on text
  • Loading branch information
mhordynski committed Sep 11, 2024
1 parent 25d8249 commit e1090a1
Show file tree
Hide file tree
Showing 28 changed files with 670 additions and 0 deletions.
Empty file added packages/__init__.py
Empty file.
33 changes: 33 additions & 0 deletions packages/ragnarok-document-search/examples/simple_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import asyncio

from ragnarok_document_search import DocumentSearch
from ragnarok_document_search.documents.document import DocumentMeta
from ragnarok_document_search.vector_store.simple import SimpleVectorStore

from ragnarok_common.embeddings.litellm import LiteLLMEmbeddings

# Sample corpus for the example: each joke is turned into text-document metadata,
# in the order listed here.
documents = [
    DocumentMeta.create_text_document_from_literal(joke)
    for joke in (
        "RIP boiled water. You will be mist.",
        "Why doesn't James Bond fart in bed? Because it would blow his cover.",
        "Why programmers don't like to swim? Because they're scared of the floating points.",
    )
]


async def main():
    """Ingest the sample documents, run one search query and print what it returns."""
    search_engine = DocumentSearch(
        embedder=LiteLLMEmbeddings(),
        vector_store=SimpleVectorStore(),
    )

    # Documents are ingested one at a time, in list order.
    for doc in documents:
        await search_engine.ingest_document(doc)

    print(await search_engine.search("I'm boiling my water and I need a joke"))


# Run the example on a fresh event loop only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())
3 changes: 3 additions & 0 deletions packages/ragnarok-document-search/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[build-system]
requires = ["setuptools >= 40.9.0", "wheel"]
build-backend = "setuptools.build_meta"
46 changes: 46 additions & 0 deletions packages/ragnarok-document-search/setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
[metadata]
name = ragnarok-document-search
# do not change version by hand: use bump_version.sh
version = 0.0.1
description = "The ragstack module responsible for fetching data from unstructured data sources."
author = deepsense.ai
author_email = [email protected]
license = Other/Proprietary License
license_files = LICENSE.md
classifiers =
Development Status :: 1 - Planning
Environment :: Console
Intended Audience :: Science/Research
License :: Other/Proprietary License
Natural Language :: English
Operating System :: OS Independent
Programming Language :: Python :: 3.10
Programming Language :: Python :: 3.11
Programming Language :: Python :: 3.12
Topic :: AI
Private :: Do Not Upload

[options]
package_dir=
=src
packages=find:
zip_safe = False
platforms = any
include_package_data = True
python_requires = >=3.10
install_requires =
python-dotenv>=0.5.1
litellm>=1.37.9
loguru>=0.7.2
numpy>=1.24.0
pydantic>=2.8.2

[options.packages.find]
where=src

[bdist_wheel]
universal = 1

[aliases]
# Alias `setup.py test` to `setup.py pytest`
test = pytest
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Re-export DocumentSearch from the private ``_main`` module so it can be imported
# directly from the package root.
from ._main import DocumentSearch

# Names exported by ``from <package> import *``.
__all__ = ["DocumentSearch"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""Version information."""

# Version string of the package.
# NOTE(review): setup.cfg declares the same value and says to bump it via bump_version.sh
# rather than by hand — presumably that script updates this constant too; confirm.
__version__ = "0.0.1"
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from ragnarok_document_search.documents.document import DocumentMeta
from ragnarok_document_search.documents.element import Element
from ragnarok_document_search.ingestion.document_processor import DocumentProcessor
from ragnarok_document_search.retrieval.rephrasers.base import QueryRephraser
from ragnarok_document_search.retrieval.rephrasers.noop import NoopQueryRephraser
from ragnarok_document_search.retrieval.rerankers.base import Reranker
from ragnarok_document_search.retrieval.rerankers.noop import NoopReranker
from ragnarok_document_search.vector_store.base import VectorStore

from ragnarok_common.embeddings.base import Embeddings


class DocumentSearch:
    """
    A main entrypoint to the DocumentSearch functionality.

    It provides methods for both ingestion and retrieval.

    Retrieval:

        1. Uses QueryRephraser to rephrase the query.
        2. Uses VectorStore to retrieve the most relevant chunks.
        3. Uses Reranker to rerank the chunks.

    Ingestion:

        1. Uses DocumentProcessor to split the document into elements.
        2. Uses the embedder to compute one vector per element key.
        3. Stores the resulting entries in the VectorStore.
    """

    embedder: Embeddings

    vector_store: VectorStore

    query_rephraser: QueryRephraser
    reranker: Reranker

    def __init__(
        self,
        embedder: Embeddings,
        vector_store: VectorStore,
        query_rephraser: QueryRephraser | None = None,
        reranker: Reranker | None = None,
    ) -> None:
        """
        Build a document search pipeline.

        Args:
            embedder: Embeds both the ingested element keys and the search queries.
            vector_store: Stores and retrieves the embedded entries.
            query_rephraser: Optional query rephraser; a NoopQueryRephraser is used when omitted.
            reranker: Optional reranker; a NoopReranker is used when omitted.
        """
        self.embedder = embedder
        self.vector_store = vector_store
        # Compare against None explicitly so that a user-supplied object which happens to be
        # falsy (e.g. one defining __len__ or __bool__) is not silently replaced by the no-op.
        self.query_rephraser = query_rephraser if query_rephraser is not None else NoopQueryRephraser()
        self.reranker = reranker if reranker is not None else NoopReranker()

    async def search(self, query: str) -> list[Element]:
        """
        Search for the most relevant chunks for a query.

        Args:
            query: The query to search for.

        Returns:
            A list of chunks.
        """
        queries = self.query_rephraser.rephrase(query)
        chunks = []
        for rephrased_query in queries:
            # The embedder works on batches, so wrap the single query and unwrap the single vector.
            search_vector = await self.embedder.embed_text([rephrased_query])
            # TODO: search parameters should be configurable
            entries = await self.vector_store.retrieve(search_vector[0], k=1)
            chunks.extend([Element.from_vector_db_entry(entry) for entry in entries])

        return self.reranker.rerank(chunks)

    async def ingest_document(self, document: DocumentMeta) -> None:
        """
        Ingest a document.

        Args:
            document: The document to ingest.

        Raises:
            ValueError: If the embedder returns a different number of vectors than
                the number of elements it was asked to embed.
        """
        # TODO: This is a placeholder implementation. It should be replaced with a real implementation.

        document_processor = DocumentProcessor()
        elements = await document_processor.process(document)

        # Nothing to embed or store: avoid sending an empty batch to the embedder.
        if not elements:
            return

        vectors = await self.embedder.embed_text([element.get_key() for element in elements])
        # strict=True turns an element/vector count mismatch into an error instead of
        # silently dropping the elements that did not get a vector.
        entries = [
            element.to_vector_db_entry(vector)
            for element, vector in zip(elements, vectors, strict=True)
        ]
        await self.vector_store.store(entries)
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import tempfile
from enum import Enum
from pathlib import Path
from typing import Union

from pydantic import BaseModel, Field
from ragnarok_document_search.documents.sources import LocalFileSource


class DocumentType(str, Enum):
    """Types of documents that can be stored.

    The enum also subclasses ``str``, so each member is itself a string
    equal to its value.
    """

    # Markdown document.
    MD = "md"
    # Plain-text document.
    TXT = "txt"


class DocumentMeta(BaseModel):
    """
    Metadata describing a document: its type and the source it comes from.
    """

    document_type: DocumentType
    source: Union[LocalFileSource] = Field(..., discriminator="source_type")

    @property
    def id(self) -> str:
        """
        Identifier of the document, as reported by its source.

        Returns:
            The document ID.
        """
        return self.source.get_id()

    async def fetch(self) -> "Document":
        """
        Fetch the document from its source (potentially remote) and wrap it in an
        object that gives access to the local copy.

        The concrete class of the returned object depends on the document type.

        Returns:
            The document.
        """
        fetched_path = await self.source.fetch()
        return Document.from_document_meta(self, fetched_path)

    @classmethod
    def create_text_document_from_literal(cls, content: str) -> "DocumentMeta":
        """
        Build metadata for a text document whose content is given as a string.

        The content is written to a temporary file, which then serves as the
        local file source of the document.

        Args:
            content: The content of the document.

        Returns:
            The document metadata.
        """
        # delete=False keeps the file on disk after the handle is closed, so the
        # returned metadata can still point at it.
        with tempfile.NamedTemporaryFile(delete=False) as handle:
            handle.write(content.encode())
            literal_path = Path(handle.name)

        file_source = LocalFileSource(path=literal_path)
        return cls(document_type=DocumentType.TXT, source=file_source)


class Document(DocumentMeta):
    """
    An object representing a document which is downloaded and stored locally.
    """

    # Location of the local copy of the document.
    local_path: Path

    @classmethod
    def from_document_meta(cls, document_meta: DocumentMeta, local_path: Path) -> "Document":
        """
        Create a document from a document metadata.

        Based on the document type, it will return a different object.

        Args:
            document_meta: The document metadata.
            local_path: The local path to the document.

        Returns:
            The document: a TextDocument for MD and TXT types, otherwise an
            instance of the class this method was called on.
        """
        # The explicit local_path is placed last so it always wins, even when
        # document_meta is itself a Document whose dump already has a "local_path" key.
        new_obj = {**document_meta.model_dump(), "local_path": local_path}

        if document_meta.document_type in (DocumentType.MD, DocumentType.TXT):
            return TextDocument.model_validate(new_obj)
        return cls.model_validate(new_obj)


class TextDocument(Document):
    """
    An object representing a text document.
    """

    @property
    def content(self) -> str:
        """
        Get the content of the document.

        The file is decoded as UTF-8 rather than with the platform's locale
        encoding, which matches how DocumentMeta.create_text_document_from_literal
        writes its content (``str.encode()`` defaults to UTF-8).

        Returns:
            The content of the document.

        Raises:
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        return self.local_path.read_text(encoding="utf-8")
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from abc import ABC, abstractmethod
from typing import ClassVar

from pydantic import BaseModel
from ragnarok_document_search.documents.document import DocumentMeta
from ragnarok_document_search.vector_store.base import VectorDBEntry


class Element(BaseModel, ABC):
    """
    An object representing an element in a document.

    Every subclass is registered under the default value of its ``element_type``
    field, which lets ``from_vector_db_entry`` rebuild the right subclass from
    stored metadata.
    """

    element_type: str
    document: DocumentMeta

    # Maps an element type name to the Element subclass that declared it as its default.
    _elements_registry: ClassVar[dict[str, type["Element"]]] = {}

    @abstractmethod
    def get_key(self) -> str:
        """
        Get the key of the element which will be used to generate the vector.

        Returns:
            The key.
        """

    @classmethod
    def __pydantic_init_subclass__(cls, **kwargs):  # pylint: disable=unused-argument
        """
        Register the newly created subclass under its default element type.

        Raises:
            ValueError: If the subclass does not give ``element_type`` a string default.
        """
        element_type_default = cls.model_fields["element_type"].default

        # A field declared without a default reports pydantic's "undefined" sentinel rather
        # than None, so require an actual string instead of only ruling out None; otherwise
        # the sentinel itself would end up as a registry key.
        if not isinstance(element_type_default, str):
            raise ValueError("Element type must be defined")

        Element._elements_registry[element_type_default] = cls

    @classmethod
    def from_vector_db_entry(cls, db_entry: VectorDBEntry) -> "Element":
        """
        Create an element from a vector database entry.

        The concrete class is looked up in the registry by the ``element_type``
        stored in the entry's metadata.

        Args:
            db_entry: The vector database entry.

        Returns:
            The element.

        Raises:
            KeyError: If the metadata has no ``element_type`` or the type is not registered.
        """
        meta = db_entry.metadata
        element_type = meta["element_type"]
        element_cls = Element._elements_registry[element_type]

        return element_cls(**meta)

    def to_vector_db_entry(self, vector: list[float]) -> VectorDBEntry:
        """
        Create a vector database entry from the element.

        Args:
            vector: The vector.

        Returns:
            The vector database entry, with the element's key and the full
            model dump as metadata.
        """
        return VectorDBEntry(
            key=self.get_key(),
            vector=vector,
            metadata=self.model_dump(),
        )


class TextElement(Element):
    """
    An element holding a piece of text from a document.
    """

    element_type: str = "text"
    content: str

    def get_key(self) -> str:
        """
        Return the string from which the element's vector is generated.

        Returns:
            The element's text content.
        """
        text_key = self.content
        return text_key
Loading

0 comments on commit e1090a1

Please sign in to comment.