No worries, thanks for your help!
The PDF in question is: https://assets.publishing.service.gov.uk/media/603539438fa8f54816a78968/scho0909bqyv-e-e.pdf
Running the code below produces the following output and error message:
import pymupdf
import pymupdf.layout
import pymupdf4llm
doc = pymupdf.open("/dbfs/mnt/lab/unrestricted/FloodDX/corpora/corpus_1_randd/raw_docs/scho0909bqyv-e-e.pdf")
json = pymupdf4llm.to_markdown(doc, show_progress=True)
print(json)
Parsing 335 pages of '/dbfs/mnt/lab/unrestricted/FloodDX/corpora/corpus_1_randd/raw_docs/scho0909bqyv-e-e.pdf'...
100%|██████████| 335/335 [01:23<00:00, 3.99it/s]
=== Document parser messages ===
Full-page OCR on page.number=0/1.
Full-page OCR on page.number=0/1.
Full-page OCR on page.number=334/335.
Full-page OCR on page.number=353/354.
Full-page OCR on page.number=0/1.
Full-page OCR on page.number=0/1.
Full-page OCR on page.number=334/335.
Full-page OCR on page.number=353/354.
Full-page OCR on page.number=0/1.
Full-page OCR on page.number=0/1.
Full-page OCR on page.number=334/335.
Full-page OCR on page.number=353/354.
Full-page OCR on page.number=0/1.
Full-page OCR on page.number=0/1.
Full-page OCR on page.number=334/335.
Full-page OCR on page.number=353/354.
Full-page OCR on page.number=0/1.
Full-page OCR on page.number=0/1.
Full-page OCR on page.number=334/335.
Full-page OCR on page.number=353/354.
Full-page OCR on page.number=0/1.
Full-page OCR on page.number=0/1.
Full-page OCR on page.number=334/335.
Full-page OCR on page.number=353/354.
Full-page OCR on page.number=0/1.
Full-page OCR on page.number=0/1.
Full-page OCR on page.number=334/335.
Full-page OCR on page.number=353/354.
Full-page OCR on page.number=0/1.
Full-page OCR on page.number=0/1.
Full-page OCR on page.number=334/335.
Full-page OCR on page.number=353/354.
Full-page OCR on page.number=0/1.
Full-page OCR on page.number=0/1.
Full-page OCR on page.number=334/335.
Full-page OCR on page.number=0/1.
Full-page OCR on page.number=334/335.
Generating markdown text...
55%|█████▍ | 184/335 [00:00<00:00, 7512.36it/s]
[Trace ID: 00-f12546ebec2a9d46a722b043f77be33d-45513a062f141b4d-00]
File <command-6632665700678967>, line 8
3 import pymupdf4llm
6 doc = pymupdf.open("/dbfs/mnt/lab/unrestricted/FloodDX/corpora/corpus_1_randd/raw_docs/scho0909bqyv-e-e.pdf")
----> 8 json = pymupdf4llm.to_markdown(doc, show_progress=True)
10 print(json)
File <command-6632665700678967>, line 8
3 import pymupdf4llm
6 doc = pymupdf.open("/dbfs/mnt/lab/unrestricted/FloodDX/corpora/corpus_1_randd/raw_docs/scho0909bqyv-e-e.pdf")
----> 8 json = pymupdf4llm.to_markdown(doc, show_progress=True)
10 print(json)
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-b808ee37-f0d0-4f8b-aff2-28b2ace3fee3/lib/python3.12/site-packages/pymupdf4llm/__init__.py:97, in to_markdown(doc, header, footer, pages, write_images, embed_images, image_path, image_format, filename, force_text, page_chunks, page_separators, dpi, ocr_dpi, page_width, page_height, ignore_code, show_progress, use_ocr, **kwargs)
82 raise ValueError("Cannot both write_images and embed_images")
83 parsed_doc = parse_document(
84 doc,
85 filename=filename,
(...)
95 use_ocr=use_ocr,
96 )
---> 97 return parsed_doc.to_markdown(
98 header=header,
99 footer=footer,
100 write_images=write_images,
101 embed_images=embed_images,
102 ignore_code=ignore_code,
103 show_progress=show_progress,
104 page_separators=page_separators,
105 page_chunks=page_chunks,
106 use_ocr=use_ocr,
107 )
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-b808ee37-f0d0-4f8b-aff2-28b2ace3fee3/lib/python3.12/site-packages/pymupdf4llm/helpers/document_layout.py:718, in ParsedDocument.to_markdown(self, header, footer, write_images, embed_images, ignore_code, show_progress, page_separators, page_chunks, **kwargs)
716 md_string += list_item_to_md(box.textlines, list_item_levels[i])
717 elif btype == "footnote":
--> 718 md_string += footnote_to_md(box.textlines)
719 elif not header and btype == "page-header":
720 continue
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-b808ee37-f0d0-4f8b-aff2-28b2ace3fee3/lib/python3.12/site-packages/pymupdf4llm/helpers/document_layout.py:480, in footnote_to_md(textlines)
470 def footnote_to_md(textlines):
471 """
472 Convert "footnote" bboxes to markdown.
473 The first line is prefixed with "> ". Subsequent lines are appended
(...)
478 one list item is contained in a single bbox.
479 """
--> 480 line = textlines[0]
481 spans = line["spans"]
482 output = "> "