group-wbl/.venv/lib/python3.13/site-packages/pypdf/_font.py

from collections.abc import Sequence
from dataclasses import dataclass, field
from typing import Any, Optional, Union, cast

from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject

from .errors import ParseError


@dataclass(frozen=True)
class FontDescriptor:
    """
    Represents the FontDescriptor dictionary as defined in the PDF specification.
    This contains both descriptive and metric information.

    The defaults are derived from the mean values of the 14 core fonts, rounded
    to 100.

    Use :meth:`from_font_resource` to build an instance from a PDF font
    dictionary; any field the PDF does not provide keeps its default.
    """

    name: str = "Unknown"
    family: str = "Unknown"
    weight: str = "Unknown"

    ascent: float = 700.0
    descent: float = -200.0
    cap_height: float = 600.0
    x_height: float = 500.0
    italic_angle: float = 0.0  # Non-italic
    flags: int = 32  # Non-serif, non-symbolic, not fixed width
    bbox: tuple[float, float, float, float] = field(default_factory=lambda: (-100.0, -200.0, 1000.0, 900.0))

    # Maps a character to its width. A "default" key, when present, is the
    # fallback used by text_width() for characters that have no entry.
    character_widths: dict[str, int] = field(default_factory=dict)

    @staticmethod
    def _parse_font_descriptor(font_kwargs: dict[str, Any], font_descriptor_obj: DictionaryObject) -> dict[str, Any]:
        """
        Copy the entries of a font descriptor into constructor keyword arguments.

        Args:
            font_kwargs: Keyword arguments collected so far. Updated in place.
            font_descriptor_obj: The font descriptor dictionary or an indirect
                reference to it. It may be empty.

        Returns:
            The same ``font_kwargs`` dictionary. Only keys present in the
            descriptor are set, so missing ones fall back to the dataclass
            defaults. ``bbox``, when present, is converted to a tuple of
            four floats.

        Raises:
            AssertionError: If the bounding box does not have four elements.
        """
        font_descriptor_dict: DictionaryObject = (
            font_descriptor_obj.get_object()
            if isinstance(font_descriptor_obj, IndirectObject)
            else font_descriptor_obj
        )
        for source_key, target_key in [
            ("/FontName", "name"),
            ("/FontFamily", "family"),
            ("/FontWeight", "weight"),
            ("/Ascent", "ascent"),
            ("/Descent", "descent"),
            ("/CapHeight", "cap_height"),
            ("/XHeight", "x_height"),
            ("/ItalicAngle", "italic_angle"),
            ("/Flags", "flags"),
            ("/FontBBox", "bbox")
        ]:
            if source_key in font_descriptor_dict:
                font_kwargs[target_key] = font_descriptor_dict[source_key]
        # /FontBBox is a required key in a font descriptor, but this method is
        # also called with an empty dictionary when a font has no
        # /FontDescriptor at all, and malformed files may omit the key. Keep
        # the dataclass default in that case instead of raising a KeyError.
        if "bbox" in font_kwargs:
            bbox_tuple = tuple(map(float, font_kwargs["bbox"]))
            assert len(bbox_tuple) == 4, bbox_tuple
            font_kwargs["bbox"] = bbox_tuple
        return font_kwargs

    @staticmethod
    def _collect_cid_character_widths(
        d_font: DictionaryObject, char_map: dict[Any, Any], current_widths: dict[str, int]
    ) -> None:
        """
        Parses the /W array from a DescendantFont dictionary and updates character widths.

        Args:
            d_font: The descendant font dictionary; a missing /W is treated
                as an empty array.
            char_map: Character map. Only its string keys are used: the code
                point of each key selects the /W entry and the mapped value
                becomes the key in ``current_widths``.
            current_widths: Dictionary that receives the widths. Updated in place.

        Raises:
            ParseError: If an entry matches neither of the two /W formats.
            IndexError: If the array ends while more elements are expected.
        """
        ord_map = {
            ord(_target): _surrogate
            for _target, _surrogate in char_map.items()
            if isinstance(_target, str)
        }
        # /W width definitions have two valid formats which can be mixed and matched:
        #   (1) A character start index followed by a list of widths, e.g.
        #       `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
        #   (2) A character start index, a character stop index, and a width, e.g.
        #       `45 65 500` applies width 500 to characters 45-65.
        # skip_count is the number of upcoming entries that were already
        # consumed as part of the definition handled in a previous iteration.
        skip_count = 0
        _w = d_font.get("/W", [])
        for idx, w_entry in enumerate(_w):
            w_entry = w_entry.get_object()
            if skip_count:
                skip_count -= 1
                continue
            if not isinstance(w_entry, (int, float)):  # pragma: no cover
                # We should never get here due to skip_count above. Add a
                # warning and or use reader's "strict" to force an ex???
                continue
            # check for format (1): `int [int int int int ...]`
            w_next_entry = _w[idx + 1].get_object()
            if isinstance(w_next_entry, Sequence):
                start_idx, width_list = w_entry, w_next_entry
                current_widths.update(
                    {
                        ord_map[_cidx]: _width
                        for _cidx, _width in zip(
                            range(
                                cast(int, start_idx),
                                cast(int, start_idx) + len(width_list),
                                1,
                            ),
                            width_list,
                        )
                        if _cidx in ord_map
                    }
                )
                skip_count = 1
            # check for format (2): `int int int`
            elif isinstance(w_next_entry, (int, float)) and isinstance(
                _w[idx + 2].get_object(), (int, float)
            ):
                start_idx, stop_idx, const_width = (
                    w_entry,
                    w_next_entry,
                    _w[idx + 2].get_object(),
                )
                current_widths.update(
                    {
                        ord_map[_cidx]: const_width
                        for _cidx in range(
                            cast(int, start_idx), cast(int, stop_idx + 1), 1
                        )
                        if _cidx in ord_map
                    }
                )
                skip_count = 2
            else:
                # Note: this doesn't handle the case of out of bounds (reaching the end of the width definitions
                # while expecting more elements). This raises an IndexError which is sufficient.
                raise ParseError(
                    f"Invalid font width definition. Next elements: {w_entry}, {w_next_entry}, {_w[idx + 2]}"
                )  # pragma: no cover

    @classmethod
    def from_font_resource(
        cls,
        pdf_font_dict: DictionaryObject,
        encoding: Optional[Union[str, dict[int, str]]] = None,
        char_map: Optional[dict[Any, Any]] = None
    ) -> "FontDescriptor":
        """
        Build a font descriptor from a PDF font dictionary.

        The sources are tried in this order:

        1. /Type1, /MMType1, /TrueType and /Type3 fonts with a
           /FontDescriptor: widths come from /Widths (only when the encoding
           is a dictionary) and metrics from the descriptor.
        2. Fonts of those subtypes without a /FontDescriptor whose /BaseFont
           name is in ``CORE_FONT_METRICS``: that entry is returned as is.
        3. Fonts with /DescendantFonts: widths come from each descendant's /W
           array and metrics from each descendant's /FontDescriptor.
        4. Anything else: an instance made of defaults with no widths.

        Args:
            pdf_font_dict: The font dictionary. Its /DescendantFonts entries
                are replaced by their resolved objects as a side effect.
            encoding: Encoding of the font. Looked up with ``get_encoding``
                unless both ``encoding`` and ``char_map`` are provided.
            char_map: Character map of the font; see ``encoding``.

        Returns:
            The font descriptor for the font.
        """
        from pypdf._cmap import get_encoding  # noqa: PLC0415
        from pypdf._codecs.core_fontmetrics import CORE_FONT_METRICS  # noqa: PLC0415
        # Prioritize information from the PDF font dictionary
        font_name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")
        font_kwargs: dict[str, Any] = {"character_widths": {}}

        # Deal with fonts by type; Type1, TrueType and certain Type3
        if pdf_font_dict.get("/Subtype") in ("/Type1", "/MMType1", "/TrueType", "/Type3"):
            if "/FontDescriptor" in pdf_font_dict:
                # Collect character widths - TrueType and Type1 fonts
                # have a /Widths array mapping character codes to widths
                if not (encoding and char_map):
                    encoding, char_map = get_encoding(pdf_font_dict)
                if isinstance(encoding, dict) and "/Widths" in pdf_font_dict:
                    # The first /Widths element belongs to character code /FirstChar.
                    first_char = pdf_font_dict.get("/FirstChar", 0)
                    font_kwargs["character_widths"] = {
                        encoding.get(idx + first_char, chr(idx + first_char)): width
                        for idx, width in enumerate(cast(ArrayObject, pdf_font_dict["/Widths"]))
                    }
                # Collect font descriptor
                font_kwargs = cls._parse_font_descriptor(
                    font_kwargs, pdf_font_dict.get("/FontDescriptor", DictionaryObject())
                )
                return cls(**font_kwargs)

            if font_name in CORE_FONT_METRICS:
                return CORE_FONT_METRICS[font_name]

        # Composite font or CID font
        # CID fonts have a /W array mapping character codes to widths stashed in /DescendantFonts
        if "/DescendantFonts" in pdf_font_dict:
            if not (encoding and char_map):
                encoding, char_map = get_encoding(pdf_font_dict)
            d_font: DictionaryObject
            for d_font_idx, d_font in enumerate(
                cast(ArrayObject, pdf_font_dict["/DescendantFonts"])
            ):
                # Resolve the descendant font and store the resolved object
                # back into the array.
                d_font = cast(DictionaryObject, d_font.get_object())
                cast(ArrayObject, pdf_font_dict["/DescendantFonts"])[d_font_idx] = d_font
                # Collect character widths
                cls._collect_cid_character_widths(
                    d_font, char_map, font_kwargs["character_widths"]
                )
                # Collect font descriptor; a descendant font without one
                # contributes nothing and the defaults stay in place.
                font_kwargs = cls._parse_font_descriptor(
                    font_kwargs, d_font.get("/FontDescriptor", DictionaryObject())
                )

        return cls(**font_kwargs)

    def text_width(self, text: str) -> float:
        """
        Sum of character widths specified in PDF font for the supplied text.

        Args:
            text: The text to measure.

        Returns:
            The total width as a float. Characters without an entry in
            ``character_widths`` use the ``"default"`` entry, or 0 if there
            is none.
        """
        widths = self.character_widths
        # The fallback is the same for every character, so look it up once.
        fallback_width = widths.get("default", 0)
        return sum((widths.get(char, fallback_width) for char in text), 0.0)
Add __pycache__ and .venv directories 2026-01-09 09:48:03 +08:00			`from collections.abc import Sequence`
			`from dataclasses import dataclass, field`
			`from typing import Any, Optional, Union, cast`

			`from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject`

			`from .errors import ParseError`


			`@dataclass(frozen=True)`
			`class FontDescriptor:`
			`"""`
			`Represents the FontDescriptor dictionary as defined in the PDF specification.`
			`This contains both descriptive and metric information.`

			`The defaults are derived from the mean values of the 14 core fonts, rounded`
			`to 100.`
			`"""`

			`name: str = "Unknown"`
			`family: str = "Unknown"`
			`weight: str = "Unknown"`

			`ascent: float = 700.0`
			`descent: float = -200.0`
			`cap_height: float = 600.0`
			`x_height: float = 500.0`
			`italic_angle: float = 0.0 # Non-italic`
			`flags: int = 32 # Non-serif, non-symbolic, not fixed width`
			`bbox: tuple[float, float, float, float] = field(default_factory=lambda: (-100.0, -200.0, 1000.0, 900.0))`

			`character_widths: dict[str, int] = field(default_factory=dict)`

			`@staticmethod`
			`def _parse_font_descriptor(font_kwargs: dict[str, Any], font_descriptor_obj: DictionaryObject) -> dict[str, Any]:`
			`font_descriptor_dict: DictionaryObject = (`
			`font_descriptor_obj.get_object()`
			`if isinstance(font_descriptor_obj, IndirectObject)`
			`else font_descriptor_obj`
			`)`
			`for source_key, target_key in [`
			`("/FontName", "name"),`
			`("/FontFamily", "family"),`
			`("/FontWeight", "weight"),`
			`("/Ascent", "ascent"),`
			`("/Descent", "descent"),`
			`("/CapHeight", "cap_height"),`
			`("/XHeight", "x_height"),`
			`("/ItalicAngle", "italic_angle"),`
			`("/Flags", "flags"),`
			`("/FontBBox", "bbox")`
			`]:`
			`if source_key in font_descriptor_dict:`
			`font_kwargs[target_key] = font_descriptor_dict[source_key]`
			`# No need for an if statement here, bbox is a required key in a font descriptor`
			`bbox_tuple = tuple(map(float, font_kwargs["bbox"]))`
			`assert len(bbox_tuple) == 4, bbox_tuple`
			`font_kwargs["bbox"] = bbox_tuple`
			`return font_kwargs`

			`@staticmethod`
			`def _collect_cid_character_widths(`
			`d_font: DictionaryObject, char_map: dict[Any, Any], current_widths: dict[str, int]`
			`) -> None:`
			`"""Parses the /W array from a DescendantFont dictionary and updates character widths."""`
			`ord_map = {`
			`ord(_target): _surrogate`
			`for _target, _surrogate in char_map.items()`
			`if isinstance(_target, str)`
			`}`
			`# /W width definitions have two valid formats which can be mixed and matched:`
			`# (1) A character start index followed by a list of widths, e.g.`
			# `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
			`# (2) A character start index, a character stop index, and a width, e.g.`
			# `45 65 500` applies width 500 to characters 45-65.
			`skip_count = 0`
			`_w = d_font.get("/W", [])`
			`for idx, w_entry in enumerate(_w):`
			`w_entry = w_entry.get_object()`
			`if skip_count:`
			`skip_count -= 1`
			`continue`
			`if not isinstance(w_entry, (int, float)): # pragma: no cover`
			`# We should never get here due to skip_count above. Add a`
			`# warning and or use reader's "strict" to force an ex???`
			`continue`
			# check for format (1): `int [int int int int ...]`
			`w_next_entry = _w[idx + 1].get_object()`
			`if isinstance(w_next_entry, Sequence):`
			`start_idx, width_list = w_entry, w_next_entry`
			`current_widths.update(`
			`{`
			`ord_map[_cidx]: _width`
			`for _cidx, _width in zip(`
			`range(`
			`cast(int, start_idx),`
			`cast(int, start_idx) + len(width_list),`
			`1,`
			`),`
			`width_list,`
			`)`
			`if _cidx in ord_map`
			`}`
			`)`
			`skip_count = 1`
			# check for format (2): `int int int`
			`elif isinstance(w_next_entry, (int, float)) and isinstance(`
			`_w[idx + 2].get_object(), (int, float)`
			`):`
			`start_idx, stop_idx, const_width = (`
			`w_entry,`
			`w_next_entry,`
			`_w[idx + 2].get_object(),`
			`)`
			`current_widths.update(`
			`{`
			`ord_map[_cidx]: const_width`
			`for _cidx in range(`
			`cast(int, start_idx), cast(int, stop_idx + 1), 1`
			`)`
			`if _cidx in ord_map`
			`}`
			`)`
			`skip_count = 2`
			`else:`
			`# Note: this doesn't handle the case of out of bounds (reaching the end of the width definitions`
			`# while expecting more elements). This raises an IndexError which is sufficient.`
			`raise ParseError(`
			`f"Invalid font width definition. Next elements: {w_entry}, {w_next_entry}, {_w[idx + 2]}"`
			`) # pragma: no cover`

			`@classmethod`
			`def from_font_resource(`
			`cls,`
			`pdf_font_dict: DictionaryObject,`
			`encoding: Optional[Union[str, dict[int, str]]] = None,`
			`char_map: Optional[dict[Any, Any]] = None`
			`) -> "FontDescriptor":`
			`from pypdf._cmap import get_encoding # noqa: PLC0415`
			`from pypdf._codecs.core_fontmetrics import CORE_FONT_METRICS # noqa: PLC0415`
			`# Prioritize information from the PDF font dictionary`
			`font_name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")`
			`font_kwargs: dict[str, Any] = {"character_widths": {}}`

			`# Deal with fonts by type; Type1, TrueType and certain Type3`
			`if pdf_font_dict.get("/Subtype") in ("/Type1", "/MMType1", "/TrueType", "/Type3"):`
			`if "/FontDescriptor" in pdf_font_dict:`
			`# Collect character widths - TrueType and Type1 fonts`
			`# have a /Widths array mapping character codes to widths`
			`if not (encoding and char_map):`
			`encoding, char_map = get_encoding(pdf_font_dict)`
			`if isinstance(encoding, dict) and "/Widths" in pdf_font_dict:`
			`first_char = pdf_font_dict.get("/FirstChar", 0)`
			`font_kwargs["character_widths"] = {`
			`encoding.get(idx + first_char, chr(idx + first_char)): width`
			`for idx, width in enumerate(cast(ArrayObject, pdf_font_dict["/Widths"]))`
			`}`
			`# Collect font descriptor`
			`font_kwargs = cls._parse_font_descriptor(`
			`font_kwargs, pdf_font_dict.get("/FontDescriptor", DictionaryObject())`
			`)`
			`return cls(**font_kwargs)`

			`if font_name in CORE_FONT_METRICS:`
			`return CORE_FONT_METRICS[font_name]`

			`# Composite font or CID font`
			`# CID fonts have a /W array mapping character codes to widths stashed in /DescendantFonts`
			`if "/DescendantFonts" in pdf_font_dict:`
			`if not (encoding and char_map):`
			`encoding, char_map = get_encoding(pdf_font_dict)`
			`d_font: DictionaryObject`
			`for d_font_idx, d_font in enumerate(`
			`cast(ArrayObject, pdf_font_dict["/DescendantFonts"])`
			`):`
			`d_font = cast(DictionaryObject, d_font.get_object())`
			`cast(ArrayObject, pdf_font_dict["/DescendantFonts"])[d_font_idx] = d_font`
			`# Collect character widths`
			`cls._collect_cid_character_widths(`
			`d_font, char_map, font_kwargs["character_widths"]`
			`)`
			`# Collect font descriptor`
			`font_kwargs = cls._parse_font_descriptor(`
			`font_kwargs, d_font.get("/FontDescriptor", DictionaryObject())`
			`)`

			`return cls(**font_kwargs)`

			`def text_width(self, text: str) -> float:`
			`"""Sum of character widths specified in PDF font for the supplied text."""`
			`return sum(`
			`[self.character_widths.get(char, self.character_widths.get("default", 0)) for char in text], 0.0`
			`)`