Hi Team,
I am extrating the text from an pdf file and translating to another language. After translating I am creating a new pdf. Issue is when long text during translating it going out of page. But we need to put the text into new line and maintain the font size and font name. using insert_htmlbox it will automatically adjusting the fontsize but I need to maintain the fontsize. I am having widget in the pdf I need to translate those dropdown values as well, during the translated pdf checkboxes and table structure is missing. Please can you suggest how can fix this.
sample.pdf (65.8 KB)
import fitz
import json
def extract_pdf_with_widgets(pdf_path, out_json):
doc = fitz.open(pdf_path)
result = []
for page_num, page in enumerate(doc, start=1):
# --- Full text extraction with font info ---
tp = page.get_textpage()
dict_unsorted = tp.extractDICT(sort=False)
# Collect spans with full font + style info
detailed_text = []
for block in dict_unsorted["blocks"]:
if "lines" not in block:
continue
for line in block["lines"]:
for span in line["spans"]:
detailed_text.append({
"text": span["text"],
"bbox": list(span["bbox"]),
"font": span["font"],
"size": span["size"],
"flags": span["flags"],
"color": span["color"],
"alpha": span["alpha"],
"ascender": span["ascender"],
"descender": span["descender"],
"origin": list(span["origin"]),
"char_flags": span["char_flags"],
"bidi": span["bidi"],
})
# --- Widgets extraction ---
widgets = []
for w in page.widgets():
field_type = None
if w.field_type == fitz.PDF_WIDGET_TYPE_BUTTON:
# button could be checkbox, radio, or push button
if w.field_flags & fitz.PDF_BTN_RADIO:
field_type = "radio"
elif w.field_flags & fitz.PDF_BTN_CHECKBOX:
field_type = "checkbox"
else:
field_type = "button"
elif w.field_type == fitz.PDF_WIDGET_TYPE_TEXT:
field_type = "text"
elif w.field_type == fitz.PDF_WIDGET_TYPE_LISTBOX:
field_type = "listbox"
elif w.field_type == fitz.PDF_WIDGET_TYPE_COMBOBOX:
field_type = "combobox"
elif w.field_type == fitz.PDF_WIDGET_TYPE_SIGNATURE:
field_type = "signature"
widget_data = {
"field_name": w.field_name,
"field_type": field_type,
"rect": list(w.rect),
"value": w.field_value,
"options": getattr(w, "field_choices", None) if field_type in ("listbox", "combobox") else None,
}
# Add checkbox / radio details
if field_type in ("checkbox", "radio"):
widget_data["checked"] = bool(w.field_value)
widget_data["export_value"] = w.button_get_on_state() # symbol like "Yes", "On"
widgets.append(widget_data)
result.append({
"page_number": page_num,
"text_spans": detailed_text,
"widgets": widgets
})
with open(out_json, "w", encoding="utf-8") as f:
json.dump(result, f, indent=2, ensure_ascii=False)
return result
# Example usage
extract_pdf_with_widgets("sample.pdf", "sample.json")
I am converting pdf to json and translate and converted back to pdf
import json
from pathlib import Path
from googletrans import Translator # ✅ Google Translate
import fitz
import pymupdf_fonts
from pymupdf import pymupdf
from collections import defaultdict
font_file1 = r"calibri.ttf"
font_file2 = r"calibrib.ttf"
font_file3 = r"calibrii.ttf"
font_file4 = r"calibril.ttf"
font_file5 = r"calibrili.ttf"
font_file6 = r"calibriz.ttf"
font_file7 = r"times.ttf"
font_file8 = r"timesbd.ttf"
font_file9 = r"timesbi.ttf"
font_file10 = r"timesi.ttf"
# ------------------------------
# Font mapping (JSON → file / built-in)
# ------------------------------
fonts_dir = Path(__file__).parent / "fonts"
font_map_ttf = {
"Calibri": fonts_dir / "calibri.ttf",
"Calibri-Light": fonts_dir / "calibril.ttf",
"Calibri-Bold": fonts_dir / "calibrib.ttf",
"Calibri-Italic": fonts_dir / "calibrii.ttf",
"Calibri-BoldItalic": fonts_dir / "calibriz.ttf",
}
def group_spans_into_lines(spans, y_tolerance=2):
lines = defaultdict(list)
for span in spans:
y = span["origin"][1]
# Round or bucket y value to group close baselines
y_key = round(y / y_tolerance) * y_tolerance
lines[y_key].append(span)
merged_lines = []
for y_key, group in lines.items():
# Sort left to right
group = sorted(group, key=lambda s: s["origin"][0])
# Merge text
line_text = "".join([s["text"] for s in group])
# Merge bbox (min x, min y, max x, max y)
min_x = min(s["bbox"][0] for s in group)
min_y = min(s["bbox"][1] for s in group)
max_x = max(s["bbox"][2] for s in group)
max_y = max(s["bbox"][3] for s in group)
merged_lines.append({
"text": line_text,
"bbox": [min_x, min_y, max_x, max_y],
"origin": [group[0]["origin"][0], group[0]["origin"][1]],
"font": group[0]["font"], # pick first font for now
"size": group[0]["size"], # pick first size for now
"color": group[0]["color"]
})
return merged_lines
def get_font(json_font: str):
font_path = font_map_ttf.get(json_font)
if font_path and font_path.exists():
return str(font_path), None # Use this path as fontfile
else:
# Fallback to built-in Helvetica if not available
return None, "helv"
translator = Translator()
def wrap_translated_text(text, max_width, fontsize=12, json_font="helv"):
# Look up font from JSON
fontfile, builtin = get_font(json_font)
# Create font object correctly
if fontfile:
font = fitz.Font(fontfile=fontfile)
else:
font = fitz.Font(fontname=builtin)
words = text.split()
lines = []
current_line = ""
current_width = 0
space_width = font.text_length(" ", fontsize=fontsize)
for word in words:
word_width = font.text_length(word, fontsize=fontsize)
if current_line and current_width + space_width + word_width > max_width:
lines.append(current_line)
current_line = word
current_width = word_width
else:
if current_line:
current_line += " " + word
current_width += space_width + word_width
else:
current_line = word
current_width = word_width
if current_line:
lines.append(current_line)
return "\n".join(lines)
def json_to_pdf_with_widgets(json_file, output_pdf):
with open(json_file, "r", encoding="utf-8") as f:
data = json.load(f)
doc = fitz.open()
for page_data in data:
page = doc.new_page()
page.insert_font(fontfile=font_file1, fontname="Calibri")
page.insert_font(fontfile=font_file2, fontname="Calibri-Bold")
page.insert_font(fontfile=font_file3, fontname="F0")
page.insert_font(fontfile=font_file4, fontname="Calibri-Light")
page.insert_font(fontfile=font_file5, fontname="F1")
page.insert_font(fontfile=font_file6, fontname="F2")
page.insert_font(fontfile=font_file7, fontname="TimesNewRomanPSMT")
page.insert_font(fontfile=font_file8, fontname="TimesNewRomanPS-BoldMT")
page.insert_font(fontfile=font_file9, fontname="TimesNewRomanPS-BoldItal")
page.insert_font(fontfile=font_file10, fontname="NimbusSans-Regular")
# ------------------------------
# Group spans into lines and draw
# ------------------------------
lines = group_spans_into_lines(page_data.get("text_spans", []))
for line in lines:
x, y = line["origin"]
text = line["text"]
font_size = float(line.get("size", 12))
color = line.get("color", 0)
r = ((color >> 16) & 255) / 255
g = ((color >> 8) & 255) / 255
b = (color & 255) / 255
print('text', text)
try:
translated = translator.translate(text, dest="es").text
except Exception as e:
print("Translation error:", e)
translated = text
if len(translated) > 90:
translated_wrapped = wrap_translated_text(
translated,
line["bbox"][2] - line["bbox"][0], # width of bbox
fontsize=font_size,
json_font=line.get("font", "helv") # pass Calibri-Light, Calibri-Bold, etc.
)
else:
translated_wrapped = translated
page.insert_text((x, y), translated_wrapped, fontname=line.get("font", "helv"), fontsize=font_size,
color=(r, g, b))
#
# # ------------------------------
# # Draw text spans
# # ------------------------------
# for span in page_data.get("text_spans", []):
# x, y = span["origin"]
# text = span["text"]
# font_size = float(span.get("size", 12))
# color = span.get("color", 0)
# r = ((color >> 16) & 255) / 255
# g = ((color >> 8) & 255) / 255
# b = (color & 255) / 255
#
# try:
# translated = translator.translate(text, dest="es").text
# except Exception as e:
# print("Translation error:", e)
# translated = text
#
# page.insert_text((x, y), translated, fontname=span.get("font", ""), fontsize=font_size, color=(r, g, b))
# ------------------------------
# Add interactive widgets
# ------------------------------
for w in page_data.get("widgets", []):
rect = fitz.Rect(w["rect"])
field_type = w.get("field_type")
field_name = w.get("field_name", "Field")
value = w.get("value", "")
options = w.get("options", [])
widget = fitz.Widget()
widget.rect = rect
widget.field_name = field_name
if field_type == "text":
widget.field_type = fitz.PDF_WIDGET_TYPE_TEXT
widget.field_value = value or ""
elif field_type == "combobox":
widget.field_type = fitz.PDF_WIDGET_TYPE_COMBOBOX
if options:
widget.field_choices = options
widget.field_value = value or ""
elif field_type == "listbox":
widget.field_type = fitz.PDF_WIDGET_TYPE_LISTBOX
if options:
widget.field_choices = options
widget.field_value = value or ""
else: # Checkbox (default if field_type is None)
widget.field_type = fitz.PDF_WIDGET_TYPE_BUTTON
widget.button_type = fitz.PDF_WIDGET_TYPE_CHECKBOX
widget.on_state = str(value).lower() in ("1", "true", "checked", "yes")
page.add_widget(widget)
doc.save(output_pdf)
doc.close()
print(f"✅ PDF with interactive widgets created at: {output_pdf}")
# Example usage:
json_to_pdf_with_widgets("sample.json", "recreated_with_widgets.pdf")


