Skip to content

feat: add document manipulation #9

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Jun 20, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docling_mcp/docling_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@
logger = setup_logger()


def hash_string_md5(input_string: str) -> str:
    """Return the hex-encoded SHA-256 digest of ``input_string``.

    Despite the ``md5`` in its name, this function hashes with SHA-256.
    The name is left unchanged so existing callers keep working.

    Args:
        input_string: Text to hash. It is converted to bytes with
            ``str.encode()`` (UTF-8 by default) before hashing.

    Returns:
        The 64-character lowercase hexadecimal SHA-256 digest.
    """
    # usedforsecurity=False declares the digest a non-security use
    # (presumably cache keys, given the module -- confirm with callers).
    return hashlib.sha256(input_string.encode(), usedforsecurity=False).hexdigest()

Check warning on line 18 in docling_mcp/docling_cache.py

View check run for this annotation

Codecov / codecov/patch

docling_mcp/docling_cache.py#L18

Added line #L18 was not covered by tests


def get_cache_dir() -> Path:
"""Get the cache directory for the application.

Expand Down
2 changes: 1 addition & 1 deletion docling_mcp/docling_settings.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""This module manages the settings for Docling."""

from docling.datamodel.pipeline_options import AcceleratorDevice
from docling.datamodel.accelerator_options import AcceleratorDevice

Check warning on line 3 in docling_mcp/docling_settings.py

View check run for this annotation

Codecov / codecov/patch

docling_mcp/docling_settings.py#L3

Added line #L3 was not covered by tests
from docling.datamodel.settings import settings

from docling_mcp.logger import setup_logger
Expand Down
2 changes: 1 addition & 1 deletion docling_mcp/tools/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
from mcp.shared.exceptions import McpError
from mcp.types import INTERNAL_ERROR, ErrorData

from docling.datamodel.accelerator_options import AcceleratorDevice

Check warning on line 9 in docling_mcp/tools/conversion.py

View check run for this annotation

Codecov / codecov/patch

docling_mcp/tools/conversion.py#L9

Added line #L9 was not covered by tests
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
Expand Down
203 changes: 203 additions & 0 deletions docling_mcp/tools/manipulation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
"""Tools for manipulating Docling documents."""

from docling_core.types.doc.document import (

Check warning on line 3 in docling_mcp/tools/manipulation.py

View check run for this annotation

Codecov / codecov/patch

docling_mcp/tools/manipulation.py#L3

Added line #L3 was not covered by tests
DocItem,
GroupItem,
RefItem,
SectionHeaderItem,
TextItem,
TitleItem,
)

from docling_mcp.logger import setup_logger
from docling_mcp.shared import local_document_cache, mcp

Check warning on line 13 in docling_mcp/tools/manipulation.py

View check run for this annotation

Codecov / codecov/patch

docling_mcp/tools/manipulation.py#L12-L13

Added lines #L12 - L13 were not covered by tests

# Create a default project logger
logger = setup_logger()

Check warning on line 16 in docling_mcp/tools/manipulation.py

View check run for this annotation

Codecov / codecov/patch

docling_mcp/tools/manipulation.py#L16

Added line #L16 was not covered by tests


@mcp.tool()
def get_overview_of_document_anchors(document_key: str) -> str:
    """Build a textual outline of a cached document, one line per item.

    Walks the document with ``iterate_items()`` and emits, for every
    ``DocItem`` or ``GroupItem``, a line holding the item's anchor
    (``[anchor:<cref>]``) and its label. Titles and section headers also
    include their text. Lines are indented according to the iteration
    level plus the level of the most recently seen section header.

    Args:
        document_key (str): Key of the document in the local document cache.

    Returns:
        str: The outline lines joined with newlines (empty string when no
            item produced a line).

    Raises:
        ValueError: If ``document_key`` is not present in the local cache.

    Example:
        get_overview_of_document_anchors(document_key="doc123")
    """
    if document_key not in local_document_cache:
        known_keys = ", ".join(local_document_cache.keys())
        raise ValueError(
            f"document-key: {document_key} is not found. Existing document-keys are: {known_keys}"
        )

    document = local_document_cache[document_key]

    overview: list[str] = []
    # Level of the latest section header; it keeps shifting the indentation
    # of every following item until another section header replaces it.
    section_depth = 0
    for node, depth in document.iterate_items():
        anchor_ref = node.get_ref()

        if isinstance(node, DocItem):
            if isinstance(node, TitleItem):
                # Titles are never indented.
                entry = f"[anchor:{anchor_ref.cref}] {node.label}: {node.text}"
            elif isinstance(node, SectionHeaderItem):
                section_depth = node.level
                padding = " " * (depth + section_depth)
                entry = (
                    f"{padding}[anchor:{anchor_ref.cref}] {node.label}-{depth}: {node.text}"
                )
            else:
                # Other document items sit one step below the current header.
                padding = " " * (depth + section_depth + 1)
                entry = f"{padding}[anchor:{anchor_ref.cref}] {node.label}"
        elif isinstance(node, GroupItem):
            padding = " " * (depth + section_depth + 1)
            entry = f"{padding}[anchor:{anchor_ref.cref}] {node.label}"
        else:
            # Neither a document item nor a group: contributes no line.
            continue

        overview.append(entry)

    return "\n".join(overview)

Check warning on line 72 in docling_mcp/tools/manipulation.py

View check run for this annotation

Codecov / codecov/patch

docling_mcp/tools/manipulation.py#L72

Added line #L72 was not covered by tests


@mcp.tool()
def get_text_of_document_item_at_anchor(document_key: str, document_anchor: str) -> str:
    """Return the text of the item that an anchor points to in a cached document.

    The anchor is wrapped in a ``RefItem`` and resolved against the cached
    document; only ``TextItem`` instances are accepted.

    Args:
        document_key (str): Key of the document in the local document cache.
        document_anchor (str): Anchor reference of the item inside the document.

    Returns:
        str: A message naming the anchor and document key, followed by the
            item's text enclosed in triple backticks.

    Raises:
        ValueError: If ``document_key`` is not present in the local cache.
        ValueError: If the resolved item is not a ``TextItem``.

    Example:
        get_text_of_document_item_at_anchor(document_key="doc123", document_anchor="#/texts/2")
    """
    if document_key not in local_document_cache:
        known_keys = ", ".join(local_document_cache.keys())
        raise ValueError(
            f"document-key: {document_key} is not found. Existing document-keys are: {known_keys}"
        )

    document = local_document_cache[document_key]
    target = RefItem(cref=document_anchor).resolve(doc=document)

    # Guard clause: anything that is not a text item is rejected.
    if not isinstance(target, TextItem):
        raise ValueError(
            f"Item at {document_anchor} for document-key: {document_key} is not a textual item."
        )

    content = target.text
    return f"The text of {document_anchor} for document-key with {document_key} is:\n\n```{content}```\n\n"

Check warning on line 116 in docling_mcp/tools/manipulation.py

View check run for this annotation

Codecov / codecov/patch

docling_mcp/tools/manipulation.py#L116

Added line #L116 was not covered by tests


@mcp.tool()
def update_text_of_document_item_at_anchor(
    document_key: str, document_anchor: str, updated_text: str
) -> str:
    """Replace the text of the item that an anchor points to in a cached document.

    The anchor is wrapped in a ``RefItem`` and resolved against the cached
    document; when the result is a ``TextItem`` its ``text`` attribute is
    overwritten in place with ``updated_text``.

    Args:
        document_key (str): Key of the document in the local document cache.
        document_anchor (str): Anchor reference of the item inside the document.
        updated_text (str): New text to store on the item.

    Returns:
        str: A confirmation message naming the anchor and the document key.

    Raises:
        ValueError: If ``document_key`` is not present in the local cache.
        ValueError: If the resolved item is not a ``TextItem``.

    Example:
        update_text_of_document_item_at_anchor(document_key="doc123", document_anchor="#/texts/2", updated_text="This is the new content.")
    """
    if document_key not in local_document_cache:
        known_keys = ", ".join(local_document_cache.keys())
        raise ValueError(
            f"document-key: {document_key} is not found. Existing document-keys are: {known_keys}"
        )

    document = local_document_cache[document_key]
    target = RefItem(cref=document_anchor).resolve(doc=document)

    # Guard clause: only text items carry an editable ``text`` attribute here.
    if not isinstance(target, TextItem):
        raise ValueError(
            f"Item at {document_anchor} for document-key: {document_key} is not a textual item."
        )

    target.text = updated_text
    return f"Updated the text at {document_anchor} for document with key {document_key}"

Check warning on line 162 in docling_mcp/tools/manipulation.py

View check run for this annotation

Codecov / codecov/patch

docling_mcp/tools/manipulation.py#L162

Added line #L162 was not covered by tests


@mcp.tool()
def delete_document_items_at_anchors(
    document_key: str, document_anchors: list[str]
) -> str:
    """Delete the items that the given anchors point to in a cached document.

    Every anchor is wrapped in a ``RefItem`` and resolved against the cached
    document first; the resolved items are then passed together to the
    document's ``delete_items`` method.

    Args:
        document_key (str): Key of the document in the local document cache.
        document_anchors (list[str]): Anchor references of the items to delete.

    Returns:
        str: A confirmation message containing the list of anchors and the
            document key.

    Raises:
        ValueError: If ``document_key`` is not present in the local cache.

    Example:
        delete_document_items_at_anchors(document_key="doc123", document_anchors=["#/texts/2", "#/tables/1"])
    """
    if document_key not in local_document_cache:
        doc_keys = ", ".join(local_document_cache.keys())
        raise ValueError(
            f"document-key: {document_key} is not found. Existing document-keys are: {doc_keys}"
        )

    doc = local_document_cache[document_key]

    # Resolve all anchors before deleting anything, so a failing lookup
    # leaves the document untouched.
    # NOTE(review): an anchor that cannot be resolved presumably raises from
    # RefItem.resolve -- confirm the exception type against docling-core.
    items = [RefItem(cref=anchor).resolve(doc=doc) for anchor in document_anchors]

    doc.delete_items(node_items=items)

    return f"Deleted the {document_anchors} for document with key {document_key}"

Check warning on line 203 in docling_mcp/tools/manipulation.py

View check run for this annotation

Codecov / codecov/patch

docling_mcp/tools/manipulation.py#L203

Added line #L203 was not covered by tests
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -203,3 +203,4 @@ branch = "main"
parser_angular_allowed_types = "build,chore,ci,docs,feat,fix,perf,style,refactor,test"
parser_angular_minor_types = "feat"
parser_angular_patch_types = "fix,perf"

Loading