# Source: group-wbl/.venv/lib/python3.13/site-packages/pypdf/generic/_appearance_stream.py
# Snapshot: 2026-01-09 09:48:03 +08:00 (527 lines, 24 KiB, Python)

import re
from enum import IntEnum
from typing import Any, Optional, Union, cast
from .._cmap import build_char_map_from_dict
from .._codecs.core_fontmetrics import CORE_FONT_METRICS
from .._font import FontDescriptor
from .._utils import logger_warning
from ..constants import AnnotationDictionaryAttributes, FieldDictionaryAttributes
from ..generic import (
DecodedStreamObject,
DictionaryObject,
NameObject,
NumberObject,
RectangleObject,
)
from ..generic._base import ByteStringObject, TextStringObject, is_null_or_none
# Starting font size used for multiline fields when the requested font size
# is 0 (automatic sizing); the text may still be scaled down from here to fit.
DEFAULT_FONT_SIZE_IN_MULTILINE = 12
class TextAlignment(IntEnum):
    """Defines the alignment options for text within a form field's appearance stream."""

    # Text starts at the left edge of the field (the default).
    LEFT = 0

    # Text is centered horizontally within the field.
    CENTER = 1

    # Text ends at the right edge of the field.
    RIGHT = 2
class TextStreamAppearance(DecodedStreamObject):
    """
    A class representing the appearance stream for a text-based form field.

    This class generates the content stream (the `ap_stream_data`) that dictates
    how text is rendered within a form field's bounding box. It handles properties
    like font, font size, color, multiline text, and text selection highlighting.
    """

    @staticmethod
    def _wrap_text(
        font_descriptor: FontDescriptor,
        font_size: float,
        field_width: float,
        paragraphs: list[str],
    ) -> list[tuple[float, str]]:
        """
        Word-wrap paragraphs to field_width at a given font size.

        Args:
            font_descriptor: A FontDescriptor for the font to be used.
            font_size: The font size in points.
            field_width: The width available for a single line.
            paragraphs: The text, already split at line breaks.

        Returns:
            A list of (line width, line text) tuples. Blank paragraphs are kept
            as empty lines; a single word wider than the field gets a line of
            its own instead of being split.
        """
        wrapped_lines: list[tuple[float, str]] = []
        space_width = font_descriptor.text_width(" ") * font_size / 1000

        for paragraph in paragraphs:
            if not paragraph.strip():
                wrapped_lines.append((0.0, ""))
                continue

            current_line_words: list[str] = []
            current_line_width: float = 0
            for i, word in enumerate(paragraph.split(" ")):
                word_width = font_descriptor.text_width(word) * font_size / 1000
                test_width = current_line_width + word_width + (space_width if i else 0)
                if test_width > field_width and current_line_words:
                    # The word does not fit any more: flush the line and start a new one with it.
                    wrapped_lines.append((current_line_width, " ".join(current_line_words)))
                    current_line_words = [word]
                    current_line_width = word_width
                elif not current_line_words and word_width > field_width:
                    # A single word that is wider than the field is emitted as-is.
                    wrapped_lines.append((word_width, word))
                    current_line_words = []
                    current_line_width = 0
                else:
                    if current_line_words:
                        current_line_width += space_width
                    current_line_words.append(word)
                    current_line_width += word_width
            if current_line_words:
                wrapped_lines.append((current_line_width, " ".join(current_line_words)))

        return wrapped_lines

    def _scale_text(
        self,
        font_descriptor: FontDescriptor,
        font_size: float,
        field_width: float,
        field_height: float,
        text: str,
        is_multiline: bool,
        min_font_size: float = 4.0,  # Minimum font size to attempt
        font_size_step: float = 0.2  # How much to decrease font size by each step
    ) -> tuple[list[tuple[float, str]], float]:
        """
        Takes a piece of text and scales it to field_width or field_height, given
        font_descriptor and font_size. For multiline fields, wraps the text into lines.

        The font size is lowered in steps of font_size_step (rounded to one decimal)
        until the text fits or the next step would drop below min_font_size. This is
        done iteratively, so the number of steps is not bounded by the interpreter's
        recursion limit.

        Args:
            font_descriptor: A FontDescriptor for the font to be used.
            font_size: The font size in points.
            field_width: The width of the field in which to fit the text.
            field_height: The height of the field in which to fit the text.
            text: The text to fit with the field.
            is_multiline: Whether to scale and wrap the text, or only to scale.
            min_font_size: The minimum font size at which to scale the text.
            font_size_step: The amount by which to decrement font size per step while scaling.

        Returns:
            The text in the form of list of tuples, each tuple containing the length of a line
            and its contents, and the font_size for these lines and lengths.
        """
        # Both values are independent of the font size, so compute them once.
        paragraphs = text.replace("\n", "\r").split("\r") if is_multiline else []
        unscaled_width = 0 if is_multiline else font_descriptor.text_width(text)

        while True:
            if is_multiline:
                lines = self._wrap_text(font_descriptor, font_size, field_width, paragraphs)
                # Estimate total height, assuming a line spacing of 1.4.
                estimated_total_height = font_size + (len(lines) - 1) * 1.4 * font_size
                overflows = estimated_total_height > field_height
            else:
                line_width = unscaled_width * font_size / 1000
                lines = [(line_width, text)]
                overflows = line_width > field_width or font_size > field_height

            if not overflows:
                break

            new_font_size = font_size - font_size_step
            if new_font_size < min_font_size:
                # Cannot shrink any further; accept the overflow.
                break
            new_font_size = round(new_font_size, 1)
            if new_font_size >= font_size:
                # Rounding swallowed the step (or the step is not positive):
                # stop instead of looping forever.
                break
            font_size = new_font_size

        return lines, font_size

    def _generate_appearance_stream_data(
        self,
        text: str = "",
        selection: Optional[list[str]] = None,
        rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0),
        font_descriptor: Optional[FontDescriptor] = None,
        font_glyph_byte_map: Optional[dict[str, bytes]] = None,
        font_name: str = "/Helv",
        font_size: float = 0.0,
        font_color: str = "0 g",
        is_multiline: bool = False,
        alignment: TextAlignment = TextAlignment.LEFT,
        is_comb: bool = False,
        max_length: Optional[int] = None
    ) -> bytes:
        """
        Generates the raw bytes of the PDF appearance stream for a text field.

        This private method assembles the PDF content stream operators to draw
        the provided text within the specified rectangle. It handles text positioning,
        font application, color, and special formatting like selected text.

        Args:
            text: The text to be rendered in the form field.
            selection: An optional list of strings that should be highlighted as selected.
            rectangle: The bounding box of the form field. Can be a `RectangleObject`
                or a tuple of four floats (x1, y1, x2, y2).
            font_descriptor: The FontDescriptor used to measure text widths.
            font_glyph_byte_map: An optional dictionary mapping characters to their
                byte representation for glyph encoding.
            font_name: The name of the font resource to use (e.g., "/Helv").
            font_size: The font size. If 0, it is automatically calculated
                based on whether the field is multiline or not.
            font_color: The color to apply to the font, represented as a PDF
                graphics state string (e.g., "0 g" for black).
            is_multiline: A boolean indicating if the text field is multiline.
            alignment: Text alignment, can be TextAlignment.LEFT, .RIGHT, or .CENTER.
            is_comb: Boolean that designates fixed-length fields, where every character
                fills one "cell", such as in a postcode.
            max_length: Used if is_comb is set. The maximum number of characters for a fixed-
                length field.

        Returns:
            A byte string containing the PDF content stream data.
        """
        font_glyph_byte_map = font_glyph_byte_map or {}
        if isinstance(rectangle, tuple):
            rectangle = RectangleObject(rectangle)
        font_descriptor = cast(FontDescriptor, font_descriptor)

        # If font_size is 0, apply the logic for multiline or large-as-possible font
        if font_size == 0:
            if selection:  # Don't wrap text when dealing with a /Ch field, in order to prevent problems
                is_multiline = False  # with matching "selection" with "line" later on.
            if is_multiline:
                font_size = DEFAULT_FONT_SIZE_IN_MULTILINE
            else:
                font_size = rectangle.height - 2
            lines, font_size = self._scale_text(
                font_descriptor,
                font_size,
                rectangle.width - 3,  # One point margin left and right, and an additional point because the first
                                      # offset takes one extra point (see below, "desired_abs_x_start")
                rectangle.height - 3,  # One point margin for top and bottom, one point extra for the first line
                                       # (see y_offset)
                text,
                is_multiline,
            )
        elif is_comb:
            if max_length and len(text) > max_length:
                logger_warning(
                    f"Length of text {text} exceeds maximum length ({max_length}) of field, input truncated.",
                    __name__
                )
            # We act as if each character is one line, because we draw it separately later on
            lines = [(
                font_descriptor.text_width(char) * font_size / 1000,
                char
            ) for index, char in enumerate(text) if index < (max_length or len(text))]
        else:
            lines = [(
                font_descriptor.text_width(line) * font_size / 1000,
                line
            ) for line in text.replace("\n", "\r").split("\r")]

        # Set the vertical offset
        y_offset = rectangle.height - 1 - font_size
        default_appearance = f"{font_name} {font_size} Tf {font_color}"

        # Collect the stream in pieces and join once at the end.
        stream_parts: list[bytes] = [(
            f"q\n/Tx BMC \nq\n1 1 {rectangle.width - 1} {rectangle.height - 1} "
            f"re\nW\nBT\n{default_appearance}\n"
        ).encode()]

        # In a comb field every character gets a cell of equal width.
        use_comb_cells = bool(is_comb and max_length)
        cell_width = rectangle.width / max_length if use_comb_cells else 0.0

        current_x_pos: float = 0  # Initial virtual position within the text object.
        for line_number, (line_width, line) in enumerate(lines):
            if selection and line in selection:
                # Might be improved, but cannot find how to get fill working => replaced with lined box
                stream_parts.append((
                    f"1 {y_offset - (line_number * font_size * 1.4) - 1} {rectangle.width - 2} {font_size + 2} re\n"
                    f"0.5 0.5 0.5 rg s\n{default_appearance}\n"
                ).encode())

            # Calculate the desired absolute starting X for the current line
            desired_abs_x_start: float = 0
            if use_comb_cells:
                # Space from the left edge of the cell to the character's baseline start;
                # line_width here is the *actual* character width in points for the single character 'line'
                centering_offset_in_cell = (cell_width - line_width) / 2
                # Absolute start X = (Cell Index, i.e., line_number * Cell Width) + Centering Offset
                desired_abs_x_start = (line_number * cell_width) + centering_offset_in_cell
            elif alignment == TextAlignment.RIGHT:
                desired_abs_x_start = rectangle.width - 2 - line_width
            elif alignment == TextAlignment.CENTER:
                desired_abs_x_start = (rectangle.width - line_width) / 2
            else:  # Left aligned; default
                desired_abs_x_start = 2

            # Calculate x_rel_offset: how much to move from the current_x_pos
            # to reach the desired_abs_x_start.
            x_rel_offset = desired_abs_x_start - current_x_pos

            # Y-offset:
            y_rel_offset: float = 0
            if line_number == 0:
                y_rel_offset = y_offset  # Initial vertical position
            elif is_comb:
                y_rel_offset = 0.0  # DO NOT move vertically for subsequent characters
            else:
                y_rel_offset = - font_size * 1.4  # Move down by line height

            # Td is a relative translation (Tx and Ty).
            # It updates the current text position.
            stream_parts.append(f"{x_rel_offset} {y_rel_offset} Td\n".encode())
            # Update current_x_pos based on the Td operation for the next iteration.
            # This is the X position where the *current line* will start.
            current_x_pos = desired_abs_x_start

            encoded_line: list[bytes] = [
                font_glyph_byte_map.get(c, c.encode("utf-16-be")) for c in line
            ]
            # Multi-byte glyph codes are written as a hex string, single-byte ones as a literal string.
            if any(len(c) >= 2 for c in encoded_line):
                stream_parts.append(b"<" + (b"".join(encoded_line)).hex().encode() + b"> Tj\n")
            else:
                stream_parts.append(b"(" + b"".join(encoded_line) + b") Tj\n")

        stream_parts.append(b"ET\nQ\nEMC\nQ\n")
        return b"".join(stream_parts)

    def __init__(
        self,
        text: str = "",
        selection: Optional[list[str]] = None,
        rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0),
        font_resource: Optional[DictionaryObject] = None,
        font_name: str = "/Helv",
        font_size: float = 0.0,
        font_color: str = "0 g",
        is_multiline: bool = False,
        alignment: TextAlignment = TextAlignment.LEFT,
        is_comb: bool = False,
        max_length: Optional[int] = None
    ) -> None:
        """
        Initializes a TextStreamAppearance object.

        This constructor creates a new PDF stream object configured as an XObject
        of subtype Form. It uses the `_generate_appearance_stream_data` method to
        generate the content for the stream.

        Args:
            text: The text to be rendered in the form field.
            selection: An optional list of strings that should be highlighted as selected.
            rectangle: The bounding box of the form field. Can be a `RectangleObject`
                or a tuple of four floats (x1, y1, x2, y2).
            font_resource: An optional variable that represents a PDF font dictionary.
                If omitted, a Helvetica font dictionary is used and a warning is logged.
            font_name: The name of the font resource, e.g., "/Helv".
            font_size: The font size. If 0, it's auto-calculated.
            font_color: The font color string.
            is_multiline: A boolean indicating if the text field is multiline.
            alignment: Text alignment, can be TextAlignment.LEFT, .RIGHT, or .CENTER.
            is_comb: Boolean that designates fixed-length fields, where every character
                fills one "cell", such as in a postcode.
            max_length: Used if is_comb is set. The maximum number of characters for a fixed-
                length field.
        """
        super().__init__()

        # If a font resource was added, get the font character map
        if font_resource:
            font_resource = cast(DictionaryObject, font_resource.get_object())
            font_descriptor = FontDescriptor.from_font_resource(font_resource)
        else:
            logger_warning(f"Font dictionary for {font_name} not found; defaulting to Helvetica.", __name__)
            font_name = "/Helv"
            font_resource = DictionaryObject({
                NameObject("/Subtype"): NameObject("/Type1"),
                NameObject("/Name"): NameObject("/Helv"),
                NameObject("/Type"): NameObject("/Font"),
                NameObject("/BaseFont"): NameObject("/Helvetica"),
                NameObject("/Encoding"): NameObject("/WinAnsiEncoding")
            })
            font_descriptor = CORE_FONT_METRICS["Helvetica"]

        # Get the font glyph data
        _font_subtype, _, font_encoding, font_map = build_char_map_from_dict(
            200, font_resource
        )
        # Remove the width stored in the -1 key, if present
        font_map.pop(-1, None)

        font_glyph_byte_map: dict[str, bytes]
        if isinstance(font_encoding, str):
            font_glyph_byte_map = {
                v: k.encode(font_encoding) for k, v in font_map.items()
            }
        else:
            font_glyph_byte_map = {v: bytes((k,)) for k, v in font_encoding.items()}
            font_encoding_rev = {v: bytes((k,)) for k, v in font_encoding.items()}
            for key, value in font_map.items():
                # NOTE(review): when `key` is not in font_encoding_rev the key itself is stored,
                # which may not be `bytes` - confirm this fallback against build_char_map_from_dict.
                font_glyph_byte_map[value] = font_encoding_rev.get(key, key)

        ap_stream_data = self._generate_appearance_stream_data(
            text,
            selection,
            rectangle,
            font_descriptor,
            font_glyph_byte_map,
            font_name=font_name,
            font_size=font_size,
            font_color=font_color,
            is_multiline=is_multiline,
            alignment=alignment,
            is_comb=is_comb,
            max_length=max_length
        )

        self[NameObject("/Type")] = NameObject("/XObject")
        self[NameObject("/Subtype")] = NameObject("/Form")
        self[NameObject("/BBox")] = RectangleObject(rectangle)
        self.set_data(ByteStringObject(ap_stream_data))
        self[NameObject("/Length")] = NumberObject(len(ap_stream_data))
        # Update Resources with font information
        self[NameObject("/Resources")] = DictionaryObject({
            NameObject("/Font"): DictionaryObject({
                NameObject(font_name): getattr(font_resource, "indirect_reference", font_resource)
            })
        })

    @classmethod
    def from_text_annotation(
        cls,
        acro_form: DictionaryObject,  # _root_object[CatalogDictionary.ACRO_FORM])
        field: DictionaryObject,
        annotation: DictionaryObject,
        user_font_name: str = "",
        user_font_size: float = -1,
    ) -> "TextStreamAppearance":
        """
        Creates a TextStreamAppearance object from a text field annotation.

        This class method is a factory for creating a `TextStreamAppearance`
        instance by extracting all necessary information (bounding box, font,
        text content, etc.) from the PDF field and annotation dictionaries.
        It respects inheritance for properties like default appearance (`/DA`),
        default resources (`/DR`) and maximum length (`/MaxLen`).

        Args:
            acro_form: The root AcroForm dictionary from the PDF catalog.
            field: The field dictionary object.
            annotation: The widget annotation dictionary object associated with the field.
            user_font_name: An optional user-provided font name to override the
                default. Defaults to an empty string.
            user_font_size: An optional user-provided font size to override the
                default. A value of -1 indicates no override.

        Returns:
            A new `TextStreamAppearance` instance configured for the given field.

        Raises:
            ValueError: If the default appearance string contains no `Tf` operator.
        """
        # Calculate rectangle dimensions
        _rectangle = cast(RectangleObject, annotation[AnnotationDictionaryAttributes.Rect])
        rectangle = RectangleObject((0, 0, abs(_rectangle[2] - _rectangle[0]), abs(_rectangle[3] - _rectangle[1])))

        # Get default appearance dictionary from annotation
        default_appearance = annotation.get_inherited(
            AnnotationDictionaryAttributes.DA,
            acro_form.get(AnnotationDictionaryAttributes.DA, None),
        )
        if not default_appearance:
            # Create a default appearance if none was found in the annotation
            default_appearance = TextStringObject("/Helv 0 Tf 0 g")
        else:
            default_appearance = default_appearance.get_object()

        # Derive font name, size and color from the default appearance. Also set
        # user-provided font name and font size in the default appearance, if given.
        # For a font name, this presumes that we can find an associated font resource
        # dictionary. Uses the variable font_properties as an intermediate.
        # As per the PDF spec:
        # "At a minimum, the string [that is, default_appearance] shall include a Tf (text
        # font) operator along with its two operands, font and size" (Section 12.7.4.3
        # "Variable text" of the PDF 2.0 specification).
        font_properties = [prop for prop in re.split(r"\s", default_appearance) if prop]
        font_name = font_properties.pop(font_properties.index("Tf") - 2)
        font_size = float(font_properties.pop(font_properties.index("Tf") - 1))
        font_properties.remove("Tf")
        # Whatever is left after removing the Tf operator and its operands is used as the color.
        font_color = " ".join(font_properties)

        # Determine the font name to use, prioritizing the user's input
        if user_font_name:
            font_name = user_font_name

        # Determine the font size to use, prioritizing the user's input
        if user_font_size > 0:
            font_size = user_font_size

        # Try to find a resource dictionary for the font
        document_resources: Any = cast(
            DictionaryObject,
            cast(
                DictionaryObject,
                annotation.get_inherited(
                    "/DR",
                    acro_form.get("/DR", DictionaryObject()),
                ),
            ).get_object(),
        )
        document_font_resources = document_resources.get("/Font", DictionaryObject()).get_object()
        # CORE_FONT_METRICS is the dict with Standard font metrics
        if font_name not in document_font_resources and font_name.removeprefix("/") not in CORE_FONT_METRICS:
            # ...or AcroForm dictionary. The default must be a DictionaryObject, because
            # get_object() is called on it below and a plain dict does not provide that.
            document_resources = cast(
                DictionaryObject,
                acro_form.get("/DR", DictionaryObject()),
            )
            document_font_resources = document_resources.get_object().get("/Font", DictionaryObject()).get_object()
        font_resource = document_font_resources.get(font_name, None)
        if is_null_or_none(font_resource):
            # Normalise a null entry to None so that the constructor falls back to Helvetica.
            font_resource = None
        else:
            font_resource = cast(DictionaryObject, font_resource.get_object())

        # Retrieve field text and selected values
        field_flags = field.get(FieldDictionaryAttributes.Ff, 0)
        if (
            field.get(FieldDictionaryAttributes.FT, "/Tx") == "/Ch" and
            field_flags & FieldDictionaryAttributes.FfBits.Combo == 0
        ):
            text = "\n".join(annotation.get_inherited(FieldDictionaryAttributes.Opt, []))
            selection = field.get("/V", [])
            if not isinstance(selection, list):
                selection = [selection]
        else:  # /Tx
            text = field.get("/V", "")
            if is_null_or_none(text):
                # A null value means "no text"; avoid failing on the escaping below.
                text = ""
            selection = []

        # Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings)
        # NOTE(review): the escaped text is what later gets measured and, for comb fields,
        # split per character - confirm that the added backslashes cannot end up as glyphs.
        text = text.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)")

        # Retrieve formatting information
        is_comb = False
        max_length = None
        if field_flags & FieldDictionaryAttributes.FfBits.Comb:
            is_comb = True
            # Look the value up through the parents too, so that it is also found
            # when the widget annotation is a kid of the field dictionary.
            max_length = annotation.get_inherited("/MaxLen", None)
        is_multiline = False
        if field_flags & FieldDictionaryAttributes.FfBits.Multiline:
            is_multiline = True
        alignment = field.get("/Q", TextAlignment.LEFT)

        # Create the TextStreamAppearance instance
        new_appearance_stream = cls(
            text,
            selection,
            rectangle,
            font_resource,
            font_name=font_name,
            font_size=font_size,
            font_color=font_color,
            is_multiline=is_multiline,
            alignment=alignment,
            is_comb=is_comb,
            max_length=max_length
        )

        # Carry over the entries of the existing normal appearance, except for those
        # that describe the stream itself and were just regenerated.
        if AnnotationDictionaryAttributes.AP in annotation:
            for key, value in (
                cast(DictionaryObject, annotation[AnnotationDictionaryAttributes.AP]).get("/N", {}).items()
            ):
                if key not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}:
                    new_appearance_stream[key] = value

        return new_appearance_stream