Skip to content

Commit e3a50f7

Browse files
committed
fix(document): save document images page by page
Signed-off-by: Clément Doumouro <[email protected]>
1 parent 1350884 commit e3a50f7

File tree

2 files changed

+53
-9
lines changed

2 files changed

+53
-9
lines changed

docling_core/types/doc/document.py

Lines changed: 11 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -2427,7 +2427,10 @@ def _with_embedded_pictures(self) -> "DoclingDocument":
24272427
return result
24282428

24292429
def _with_pictures_refs(
2430-
self, image_dir: Path, reference_path: Optional[Path] = None
2430+
self,
2431+
image_dir: Path,
2432+
page_no: Optional[int],
2433+
reference_path: Optional[Path] = None,
24312434
) -> "DoclingDocument":
24322435
"""Document with images as refs.
24332436
@@ -2440,7 +2443,7 @@ def _with_pictures_refs(
24402443
image_dir.mkdir(parents=True, exist_ok=True)
24412444

24422445
if image_dir.is_dir():
2443-
for item, level in result.iterate_items(with_groups=False):
2446+
for item, level in result.iterate_items(page_no=page_no, with_groups=False):
24442447
if isinstance(item, PictureItem):
24452448

24462449
if (
@@ -2515,7 +2518,7 @@ def save_as_json(
25152518
os.makedirs(artifacts_dir, exist_ok=True)
25162519

25172520
new_doc = self._make_copy_with_refmode(
2518-
artifacts_dir, image_mode, reference_path=reference_path
2521+
artifacts_dir, image_mode, page_no=None, reference_path=reference_path
25192522
)
25202523

25212524
out = new_doc.export_to_dict()
@@ -2554,7 +2557,7 @@ def save_as_yaml(
25542557
os.makedirs(artifacts_dir, exist_ok=True)
25552558

25562559
new_doc = self._make_copy_with_refmode(
2557-
artifacts_dir, image_mode, reference_path=reference_path
2560+
artifacts_dir, image_mode, page_no=None, reference_path=reference_path
25582561
)
25592562

25602563
out = new_doc.export_to_dict()
@@ -2615,7 +2618,7 @@ def save_as_markdown(
26152618
os.makedirs(artifacts_dir, exist_ok=True)
26162619

26172620
new_doc = self._make_copy_with_refmode(
2618-
artifacts_dir, image_mode, reference_path=reference_path
2621+
artifacts_dir, image_mode, page_no, reference_path=reference_path
26192622
)
26202623

26212624
md_out = new_doc.export_to_markdown(
@@ -2775,7 +2778,7 @@ def save_as_html(
27752778
os.makedirs(artifacts_dir, exist_ok=True)
27762779

27772780
new_doc = self._make_copy_with_refmode(
2778-
artifacts_dir, image_mode, reference_path=reference_path
2781+
artifacts_dir, image_mode, page_no, reference_path=reference_path
27792782
)
27802783

27812784
html_out = new_doc.export_to_html(
@@ -2812,14 +2815,15 @@ def _make_copy_with_refmode(
28122815
self,
28132816
artifacts_dir: Path,
28142817
image_mode: ImageRefMode,
2818+
page_no: Optional[int],
28152819
reference_path: Optional[Path] = None,
28162820
):
28172821
new_doc = None
28182822
if image_mode == ImageRefMode.PLACEHOLDER:
28192823
new_doc = self
28202824
elif image_mode == ImageRefMode.REFERENCED:
28212825
new_doc = self._with_pictures_refs(
2822-
image_dir=artifacts_dir, reference_path=reference_path
2826+
image_dir=artifacts_dir, page_no=page_no, reference_path=reference_path
28232827
)
28242828
elif image_mode == ImageRefMode.EMBEDDED:
28252829
new_doc = self._with_embedded_pictures()

test/test_docling_doc.py

Lines changed: 42 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1108,12 +1108,51 @@ def test_save_pictures():
11081108

11091109
doc: DoclingDocument = _construct_doc()
11101110

1111-
new_doc = doc._with_pictures_refs(image_dir=Path("./test/data/constructed_images/"))
1111+
new_doc = doc._with_pictures_refs(
1112+
image_dir=Path("./test/data/constructed_images/"), page_no=None
1113+
)
11121114

11131115
img_paths = new_doc._list_images_on_disk()
11141116
assert len(img_paths) == 1, "len(img_paths)!=1"
11151117

11161118

1119+
def test_save_pictures_with_page():
1120+
# Given
1121+
doc = DoclingDocument(name="Dummy")
1122+
1123+
doc.add_page(page_no=1, size=Size(width=2000, height=4000), image=None)
1124+
doc.add_page(
1125+
page_no=2,
1126+
size=Size(width=2000, height=4000),
1127+
)
1128+
image = PILImage.new(mode="RGB", size=(200, 400), color=(0, 0, 0))
1129+
doc.add_picture(
1130+
image=ImageRef.from_pil(image=image, dpi=72),
1131+
prov=ProvenanceItem(
1132+
page_no=2,
1133+
bbox=BoundingBox(
1134+
b=0, l=0, r=200, t=400, coord_origin=CoordOrigin.BOTTOMLEFT
1135+
),
1136+
charspan=(1, 2),
1137+
),
1138+
)
1139+
1140+
# When
1141+
with_ref = doc._with_pictures_refs(
1142+
image_dir=Path("./test/data/constructed_images/"), page_no=1
1143+
)
1144+
# Then
1145+
n_images = len(with_ref._list_images_on_disk())
1146+
assert n_images == 0
1147+
# When
1148+
with_ref = with_ref._with_pictures_refs(
1149+
image_dir=Path("./test/data/constructed_images/"), page_no=2
1150+
)
1151+
n_images = len(with_ref._list_images_on_disk())
1152+
# Then
1153+
assert n_images == 1
1154+
1155+
11171156
def _normalise_string_wrt_filepaths(instr: str, paths: List[Path]):
11181157

11191158
for p in paths:
@@ -1160,7 +1199,8 @@ def test_save_to_disk():
11601199
image_dir = Path("./test/data/doc/constructed_images/")
11611200

11621201
doc_with_references = doc._with_pictures_refs(
1163-
image_dir=image_dir # Path("./test/data/constructed_images/")
1202+
image_dir=image_dir, # Path("./test/data/constructed_images/")
1203+
page_no=None,
11641204
)
11651205

11661206
# paths will be different on different machines, so needs to be kept!

0 commit comments

Comments
 (0)