Skip to content

Add document manipulation #9

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docling_mcp/docling_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@
logger = setup_logger()


def hash_string_md5(input_string: str) -> str:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function already exists in generation.py. Could we move this generally useful function to a new util.py file?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, we need to!

"""Creates an md5 hash-string from the input string."""
return hashlib.md5(input_string.encode()).hexdigest()


def get_cache_dir() -> Path:
"""
Get the cache directory for the application.
Expand Down
6 changes: 6 additions & 0 deletions docling_mcp/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@
open_list_in_docling_document,
save_docling_document,
)
from docling_mcp.tools.manipulation import ( # noqa: F401
delete_document_items_at_anchors,
get_overview_of_document_anchors,
get_text_of_document_item_at_anchor,
update_text_of_document_item_at_anchor,
)

if __name__ == "__main__":
# Create a default project logger
Expand Down
2 changes: 0 additions & 2 deletions docling_mcp/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
from docling_core.types.doc.document import (
DoclingDocument,
NodeItem,
# DocItem,
# GroupItem
)

# Create a single shared FastMCP instance
Expand Down
11 changes: 1 addition & 10 deletions docling_mcp/tools/generation.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,11 @@
import hashlib
from io import BytesIO

# from bs4 import BeautifulSoup # , NavigableString, PageElement, Tag
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
from docling.datamodel.document import (
ConversionResult,
DoclingDocument,
)
from docling.document_converter import DocumentConverter

# from docling.backend.html_backend import HTMLDocumentBackend
from docling_core.types.doc.document import (
ContentLayer,
DoclingDocument,
Expand All @@ -22,19 +18,14 @@
# TableCellLabel
)

from docling_mcp.docling_cache import get_cache_dir
from docling_mcp.docling_cache import get_cache_dir, hash_string_md5
from docling_mcp.logger import setup_logger
from docling_mcp.shared import local_document_cache, local_stack_cache, mcp

# Create a default project logger
logger = setup_logger()


def hash_string_md5(input_string: str) -> str:
    """Return the hexadecimal MD5 digest of ``input_string``.

    The digest is used purely as a short, deterministic identifier; it is
    not a security primitive.

    Args:
        input_string: Text to hash. It is encoded as UTF-8 before hashing.

    Returns:
        The 32-character lowercase hexadecimal MD5 digest.
    """
    # usedforsecurity=False keeps this working on FIPS-restricted OpenSSL
    # builds, where a plain hashlib.md5() call is rejected.
    digest = hashlib.md5(input_string.encode("utf-8"), usedforsecurity=False)
    return digest.hexdigest()


@mcp.tool()
def create_new_docling_document(prompt: str) -> str:
"""
Expand Down
205 changes: 205 additions & 0 deletions docling_mcp/tools/manipulation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
from docling_core.types.doc.document import (
DocItem,
GroupItem,
RefItem,
SectionHeaderItem,
TextItem,
TitleItem,
)

from docling_mcp.logger import setup_logger
from docling_mcp.shared import local_document_cache, mcp

# Create a default project logger
logger = setup_logger()


@mcp.tool()
def get_overview_of_document_anchors(document_key: str) -> str:
    """
    Retrieves a structured overview of a document from the local document cache.

    This tool returns a text representation of the document's structure, showing
    the hierarchy and types of elements within the document. Each line in the
    output includes the document anchor reference and item label.

    Args:
        document_key (str): The unique identifier for the document in the local cache.

    Returns:
        str: A string containing the hierarchical structure of the document with
            indentation to show nesting levels, along with anchor references.

    Raises:
        ValueError: If the specified document_key does not exist in the local cache.

    Example:
        get_overview_of_document_anchors(document_key="doc123")
    """
    # Guard: unknown keys are reported together with the keys that do exist.
    if document_key not in local_document_cache:
        doc_keys = ", ".join(local_document_cache.keys())
        raise ValueError(
            f"document-key: {document_key} is not found. Existing document-keys are: {doc_keys}"
        )

    document = local_document_cache[document_key]

    overview: list[str] = []
    # Level of the most recently seen section header; it shifts the
    # indentation of every entry that follows that header.
    section_level = 0

    for node, depth in document.iterate_items():
        ref = node.get_ref()
        is_doc_item = isinstance(node, DocItem)

        # Items that are neither DocItem nor GroupItem produce no entry.
        if not is_doc_item and not isinstance(node, GroupItem):
            continue

        if is_doc_item and isinstance(node, TitleItem):
            # The title is listed without indentation, with its text.
            entry = f"[anchor:{ref.cref}] {node.label}: {node.text}"
        elif is_doc_item and isinstance(node, SectionHeaderItem):
            section_level = node.level
            padding = " " * (depth + section_level)
            entry = f"{padding}[anchor:{ref.cref}] {node.label}-{depth}: {node.text}"
        else:
            # Remaining document items and groups share one format:
            # anchor plus label, one step deeper than the current header.
            padding = " " * (depth + section_level + 1)
            entry = f"{padding}[anchor:{ref.cref}] {node.label}"

        overview.append(entry)

    return "\n".join(overview)


@mcp.tool()
def get_text_of_document_item_at_anchor(document_key: str, document_anchor: str) -> str:
    """
    Retrieves the text content of a specific document item identified by its anchor.

    This tool extracts the text from a document item at the specified anchor location
    within a document that exists in the local document cache.

    Args:
        document_key (str): The unique identifier for the document in the local cache.
        document_anchor (str): The anchor reference that identifies the specific item
            within the document.

    Returns:
        str: A formatted string containing the text content of the specified item,
            wrapped in code block formatting.

    Raises:
        ValueError: If the specified document_key does not exist in the local cache.
        ValueError: If the item at the specified anchor is not a textual item.

    Example:
        get_text_of_document_item_at_anchor(document_key="doc123", document_anchor="#/texts/2")
    """
    # Guard: unknown keys are reported together with the keys that do exist.
    if document_key not in local_document_cache:
        doc_keys = ", ".join(local_document_cache.keys())
        raise ValueError(
            f"document-key: {document_key} is not found. Existing document-keys are: {doc_keys}"
        )

    document = local_document_cache[document_key]

    # Turn the anchor string into a reference and resolve it in the document.
    target = RefItem(cref=document_anchor).resolve(doc=document)

    # Only text items carry a `.text` payload that can be returned.
    if not isinstance(target, TextItem):
        raise ValueError(
            f"Item at {document_anchor} for document-key: {document_key} is not a textual item."
        )

    content = target.text
    return f"The text of {document_anchor} for document-key with {document_key} is:\n\n```{content}```\n\n"


@mcp.tool()
def update_text_of_document_item_at_anchor(
    document_key: str, document_anchor: str, updated_text: str
) -> str:
    """
    Updates the text content of a specific document item identified by its anchor.

    This tool modifies the text of an existing document item at the specified anchor
    location within a document that exists in the local document cache.

    Args:
        document_key (str): The unique identifier for the document in the local cache.
        document_anchor (str): The anchor reference that identifies the specific item
            within the document.
        updated_text (str): The new text content to replace the existing content.

    Returns:
        str: A confirmation message indicating the text was successfully updated.

    Raises:
        ValueError: If the specified document_key does not exist in the local cache.
        ValueError: If the item at the specified anchor is not a textual item.

    Example:
        update_text_of_document_item_at_anchor(document_key="doc123", document_anchor="#/texts/2", updated_text="This is the new content.")
    """
    # Guard: unknown keys are reported together with the keys that do exist.
    if document_key not in local_document_cache:
        doc_keys = ", ".join(local_document_cache.keys())
        raise ValueError(
            f"document-key: {document_key} is not found. Existing document-keys are: {doc_keys}"
        )

    document = local_document_cache[document_key]

    # Turn the anchor string into a reference and resolve it in the document.
    target = RefItem(cref=document_anchor).resolve(doc=document)

    # Refuse to write to anything that is not a text item.
    if not isinstance(target, TextItem):
        raise ValueError(
            f"Item at {document_anchor} for document-key: {document_key} is not a textual item."
        )

    # The resolved item is modified in place on the cached document.
    target.text = updated_text

    return f"Updated the text at {document_anchor} for document with key {document_key}"


@mcp.tool()
def delete_document_items_at_anchors(
    document_key: str, document_anchors: list[str]
) -> str:
    """
    Deletes multiple document items identified by their anchors.

    This tool removes specified items from a document that exists in the local
    document cache, based on their anchor references.

    Args:
        document_key (str): The unique identifier for the document in the local cache.
        document_anchors (list[str]): A list of anchor references identifying the items
            to be deleted from the document.

    Returns:
        str: A confirmation message indicating the items were successfully deleted.

    Raises:
        ValueError: If the specified document_key does not exist in the local cache.

    Example:
        delete_document_items_at_anchors(document_key="doc123", document_anchors=["#/texts/2", "#/tables/1"])
    """
    # Guard: unknown keys are reported together with the keys that do exist.
    if document_key not in local_document_cache:
        doc_keys = ", ".join(local_document_cache.keys())
        raise ValueError(
            f"document-key: {document_key} is not found. Existing document-keys are: {doc_keys}"
        )

    doc = local_document_cache[document_key]

    # Resolve every anchor before deleting anything, so all items are looked
    # up against the unmodified document.
    # NOTE(review): an anchor that does not resolve presumably raises from
    # RefItem.resolve -- confirm and document the exception type.
    items = [RefItem(cref=anchor).resolve(doc=doc) for anchor in document_anchors]

    doc.delete_items(node_items=items)

    return f"Deleted the {document_anchors} for document with key {document_key}"
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ classifiers = [
requires-python = ">=3.10"
dependencies = [
"docling~=2.25",
"docling-core",
"httpx>=0.28.1",
"mcp[cli]>=1.4.0",
"pydantic~=2.10",
Expand Down Expand Up @@ -170,3 +171,6 @@ branch = "main"
parser_angular_allowed_types = "build,chore,ci,docs,feat,fix,perf,style,refactor,test"
parser_angular_minor_types = "feat"
parser_angular_patch_types = "fix,perf"

[tool.uv.sources]
docling-core = { git = "https://github.com/docling-project/docling-core.git", rev = "dev/add-doclingdocument-manipulation" }
10 changes: 4 additions & 6 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading