When using pymupdf4llm.to_markdown(), I noticed that hyphenated words spanning two text blocks are not joined correctly. The trailing hyphen is removed, but the word parts remain separated.
Example:
- PDF contains: `under-` at the end of one text block, `standing` at the start of the next
- Raw `page.get_text()`: `under-\nstanding` (hyphen present)
- `pymupdf4llm.to_markdown()` output: `under\n\nstanding` (hyphen removed, words not joined)
- Expected: `understanding`
Environment:
- pymupdf4llm version: 0.2.7
- PyMuPDF version: 1.26.6
This appears to happen when the hyphenated word falls at a text block boundary (as identified by PyMuPDF’s extractDICT()). The dehyphenation works correctly within a single block, but fails across block boundaries.
Reproduction script included below. It creates a minimal PDF demonstrating the issue and shows the incorrect output.
Looking at the pymupdf4llm source code, the issue appears to be in helpers/document_layout.py. The dehyphenation logic around lines 393–408 checks `(old_block, old_line) != (s["block"], s["line"])` and removes the trailing hyphen when crossing lines, but this only works within the same block processing context. When the continuation is in a completely different block, the blocks are processed separately, so the cross-block join never happens; only the hyphen removal occurs.
#!/usr/bin/env python3
import fitz
import pymupdf4llm
def create_test_pdf(path: str) -> None:
    """Create a PDF with a hyphenated word spanning two text blocks.

    Two text boxes are written onto a single 595 x 842 page: the first ends
    with ``under-`` and the second starts with ``standing``.

    Args:
        path: Destination file path for the generated PDF.

    Raises:
        ValueError: If a text box is too small to hold its text, in which
            case the PDF would not demonstrate the issue.
    """
    text_boxes = (
        # Block 1: ends with 'under-'
        (
            fitz.Rect(50, 100, 500, 130),
            "This demonstrates a cross-block hyphenation issue with the word under-",
        ),
        # Block 2: starts with 'standing' (continuation) - separate block due to vertical gap
        (
            fitz.Rect(50, 135, 500, 165),
            'standing which should be joined to form "understanding".',
        ),
    )

    # The context manager closes the document even if inserting or saving fails.
    with fitz.open() as doc:
        page = doc.new_page(width=595, height=842)
        for rect, text in text_boxes:
            # insert_textbox() writes nothing and returns a negative number
            # (the space deficit) when the text does not fit into the rectangle.
            unused = page.insert_textbox(
                rect,
                text,
                fontsize=12,
                fontname="helv",
            )
            if unused < 0:
                raise ValueError(f"Text does not fit into {rect}: {text!r}")
        doc.save(path)
def verify_blocks(path: str) -> None:
    """Print the text of each text block on the first page of the PDF.

    Used to confirm that the two halves of the hyphenated word ended up in
    separate blocks of ``page.get_text("dict")``.

    Args:
        path: Path of the PDF file to inspect.
    """
    # The context manager closes the document even if extraction fails.
    with fitz.open(path) as doc:
        blocks = doc[0].get_text("dict")["blocks"]
        print("Text blocks in PDF:")
        for i, block in enumerate(blocks):
            # Only blocks of type 0 are reported; other block types are skipped.
            if block.get("type") != 0:
                continue
            # Concatenate all span texts of all lines of the block.
            text = "".join(
                span.get("text", "")
                for line in block.get("lines", [])
                for span in line.get("spans", [])
            )
            print(f" Block {i}: {text!r}")
def test_bug(path: str) -> None:
    """Demonstrate the bug.

    Prints the raw PyMuPDF text and the ``pymupdf4llm.to_markdown()`` output
    for the PDF, then reports whether the hyphenated word was joined.

    Args:
        path: Path of the PDF file created by ``create_test_pdf``.
    """
    # The context manager closes the document even if extraction fails.
    with fitz.open(path) as doc:
        print("\n=== Raw PyMuPDF text extraction ===")
        raw_text = doc[0].get_text()
        print(repr(raw_text))

        print("\n=== pymupdf4llm.to_markdown() output ===")
        result = pymupdf4llm.to_markdown(doc)
        print(repr(result))

    # Check for bug
    print("\n=== Bug check ===")
    if "under-" in raw_text:
        print("Raw text has hyphen: under- [OK]")

    if "under\n\nstanding" in result:
        # Hyphen was dropped but the two word parts are still separate blocks.
        print("pymupdf4llm output: under\\n\\nstanding [BUG - hyphen removed but not joined]")
    elif "understanding" in result:
        print("pymupdf4llm output: understanding [FIXED]")
    else:
        print("Unexpected output pattern")
if __name__ == "__main__":
    # Local imports: only the script entry point needs them.
    import os
    import tempfile

    # Use the platform's temp directory instead of a hard-coded "/tmp" so the
    # reproduction also runs on systems without that directory.
    pdf_path = os.path.join(
        tempfile.gettempdir(), "cross_block_hyphenation_test.pdf"
    )

    print(f"pymupdf4llm version: {pymupdf4llm.__version__}")
    print(f"PyMuPDF version: {fitz.version}\n")

    create_test_pdf(pdf_path)
    print(f"Created test PDF: {pdf_path}\n")

    verify_blocks(pdf_path)
    test_bug(pdf_path)