Module fpdf.image_parsing

Functions

def ccitt_payload_location_from_pil(img: PILImage) ‑> tuple[int, int]
Expand source code Browse git
def ccitt_payload_location_from_pil(img: "PILImage") -> tuple[int, int]:
    """
    Locate the CCITT payload inside the TIFF data that `img` was opened from.

    The location is taken from the STRIPOFFSETS and STRIPBYTECOUNTS TIFF tags of `img`.

    Args:
        img: an image exposing its TIFF tags through `tag_v2`.
            The compression scheme is not verified here.

    Returns:
        a tuple `(offset, length)`: the byte offset of the single data strip
        and its size in bytes.

    Raises:
        NotImplementedError: if either tag does not hold exactly one entry.
    """
    tiff_tags = img.tag_v2  # type: ignore[attr-defined]
    strip_offsets = tiff_tags[TiffImagePlugin.STRIPOFFSETS]
    strip_bytes = tiff_tags[TiffImagePlugin.STRIPBYTECOUNTS]

    # PIL always seems to write a single strip when it saves an image, even for
    # very large TIFFs (a test ~10 GPixel image was still encoded as one strip),
    # so only that layout is handled. Anything else is rejected explicitly.
    has_single_strip = (
        len(strip_offsets) == 1  # pyright: ignore[reportUnknownArgumentType]
        and len(strip_bytes) == 1  # pyright: ignore[reportUnknownArgumentType]
    )
    if not has_single_strip:
        raise NotImplementedError(
            "Transcoding multiple strips not supported by the PDF format"
        )

    return (  # pyright: ignore[reportUnknownVariableType]
        strip_offsets[0],
        strip_bytes[0],
    )

returns the byte offset and length of the CCITT payload in the original TIFF data

def clear_table() ‑> tuple[dict[bytes, int], int, int, int]
Expand source code Browse git
def clear_table() -> tuple[dict[bytes, int], int, int, int]:
    """
    Build a fresh LZW encoding state.

    Returns:
        a tuple `(table, next_code, bits_per_code, max_code_value)` where
        `table` maps each of the 256 single-byte sequences to its own value,
        `next_code` is the first code following `LZW_EOD_MARKER`,
        `bits_per_code` is `LZW_INITIAL_BITS_PER_CODE` and
        `max_code_value` is the largest value that fits in `bits_per_code` bits.
    """
    initial_width = LZW_INITIAL_BITS_PER_CODE
    single_byte_codes = {bytes((value,)): value for value in range(256)}
    return (
        single_byte_codes,
        LZW_EOD_MARKER + 1,
        initial_width,
        (1 << initial_width) - 1,
    )

Reset the encoding table and coding state to initial conditions.

def get_img_info(filename: str | BinaryIO | pathlib.Path,
img: str | bytes | BinaryIO | PILImage | pathlib.Path | None = None,
image_filter: Literal['AUTO', 'FlateDecode', 'DCTDecode', 'JPXDecode', 'LZWDecode', 'CCITTFaxDecode'] = 'AUTO',
dims: tuple[float, float] | None = None) ‑> RasterImageInfo
Expand source code Browse git
def get_img_info(
    filename: Union[str, BinaryIO, Path],
    img: Union["PILImage", bytes, BinaryIO, Path, str, None] = None,
    image_filter: ImageFilter = "AUTO",
    dims: Optional[tuple[float, float]] = None,
) -> RasterImageInfo:
    """
    Open a raster image and collect the data and metadata describing it.

    Args:
        filename: in a format that can be passed to load_image
        img: optional `bytes`, `BytesIO` or `PIL.Image.Image` instance
        image_filter (str): one of the SUPPORTED_IMAGE_FILTERS
        dims: optional size passed to `PIL.Image.Image.resize()` before the image is encoded

    Returns:
        a `RasterImageInfo` with the keys "data", "w", "h", "cs", "iccp", "dpn",
        "bpc", "f", "inverted" and "dp" set, plus "smask" and/or "pal"
        for images with an alpha channel and/or a palette.

    Raises:
        EnvironmentError: if Pillow is not available
        ValueError: if the original bytes of a JPEG / group4 TIFF are re-used and
            the JPEG mode, the TIFF photometric interpretation or the TIFF FillOrder
            is not supported
    """
    if Image is None:
        raise EnvironmentError("Pillow not available - fpdf2 cannot insert images")

    is_pil_img = True
    keep_bytes_io_open = False
    # Flag to check whether a cmyk image is jpeg or not, if set to True the decode array
    # is inverted in output.py
    jpeg_inverted = False
    img_raw_data: Optional[BinaryIO] = None
    if not img or isinstance(img, (Path, str)):
        # Nothing usable was passed as `img`: load the bytes designated by `filename`
        img_raw_data = load_image(filename)
        img = Image.open(img_raw_data)
        is_pil_img = False
    elif not _is_pil_image(img):
        # A BytesIO provided by the caller must not be closed on their behalf
        keep_bytes_io_open = isinstance(img, BytesIO)
        if isinstance(img, bytes):
            img_raw_data = BytesIO(img)
        else:
            img_raw_data = img  # type: ignore[assignment]
        assert img_raw_data is not None
        img = Image.open(img_raw_data)
        is_pil_img = False
    assert _is_pil_image(img)

    # Tracks whether the pixels still match the original bytes in img_raw_data
    img_altered = False
    if dims:
        img = img.resize(dims, resample=RESAMPLE)  # type: ignore[arg-type]
        img_altered = True

    if image_filter == "AUTO":
        # Very simple logic for now:
        if img.format == "JPEG":
            image_filter = "DCTDecode"
        elif img.mode == "1" and PIL_features.check("libtiff"):
            # The 2nd condition prevents from running in a bug sometimes,
            # cf. test_transcode_monochrome_and_libtiff_support_custom_tags()
            image_filter = "CCITTFaxDecode"
        else:
            image_filter = "FlateDecode"

    if img.mode in ("P", "PA") and image_filter != "FlateDecode":
        img = img.convert("RGBA")

    if img.mode not in ("1", "L", "LA", "RGB", "RGBA", "P", "PA", "CMYK"):
        img = img.convert("RGBA")
        img_altered = True

    # With LZWDecode, the alpha channel and the palette are dropped:
    if img.mode in ("P", "RGBA") and image_filter == "LZWDecode":
        img = img.convert("RGB")
    elif img.mode == "LA" and image_filter == "LZWDecode":
        # An equality test is required there: `in ("LA")` is a substring test
        # against the string "LA", that would also match the "L" mode.
        img = img.convert("L")

    w, h = img.size
    info = RasterImageInfo()

    iccp = None
    if "icc_profile" in img.info:
        if is_iccp_valid(img.info["icc_profile"], filename):
            iccp = img.info["icc_profile"]

    if img_raw_data is not None and not img_altered:
        # if we can use the original image bytes directly we do (JPEG and group4 TIFF only):
        if img.format == "JPEG" and image_filter == "DCTDecode":
            if img.mode in ("RGB", "RGBA"):
                dpn, bpc, colspace = 3, 8, "DeviceRGB"
            elif img.mode == "CMYK":
                dpn, bpc, colspace = 4, 8, "DeviceCMYK"
                jpeg_inverted = True
            elif img.mode == "L":
                dpn, bpc, colspace = 1, 8, "DeviceGray"
            else:
                raise ValueError(f"Unsupported image mode: {img.mode}")
            img_raw_data.seek(0)
            info.update(
                {
                    "data": img_raw_data.read(),
                    "w": w,
                    "h": h,
                    "cs": colspace,
                    "iccp": iccp,
                    "dpn": dpn,
                    "bpc": bpc,
                    "f": image_filter,
                    "inverted": jpeg_inverted,
                    "dp": f"/Predictor 15 /Colors {dpn} /Columns {w}",
                }
            )
            return info
        # We can directly copy the data out of a CCITT Group 4 encoded TIFF, if it
        # only contains a single strip
        if (
            img.format == "TIFF"
            and image_filter == "CCITTFaxDecode"
            and img.info["compression"] == "group4"
            and len(img.tag_v2[TiffImagePlugin.STRIPOFFSETS]) == 1  # type: ignore[attr-defined]
            and len(img.tag_v2[TiffImagePlugin.STRIPBYTECOUNTS]) == 1  # type: ignore[attr-defined]
        ):
            photo = img.tag_v2[TiffImagePlugin.PHOTOMETRIC_INTERPRETATION]  # type: ignore[attr-defined]
            # `inverted` only drives the /BlackIs1 entry written below
            inverted = False
            if photo == 0:
                inverted = True
            elif photo != 1:
                raise ValueError(
                    f"unsupported photometric interpretation for g4 tiff: {photo}"
                )
            offset, length = ccitt_payload_location_from_pil(img)
            img_raw_data.seek(offset)
            ccittrawdata: bytes | bytearray = img_raw_data.read(length)
            fillorder = img.tag_v2.get(TiffImagePlugin.FILLORDER)  # type: ignore[attr-defined]
            if fillorder is None or fillorder == 1:
                # no FillOrder or msb-to-lsb: nothing to do
                pass
            elif fillorder == 2:
                # lsb-to-msb: reverse bits of each byte
                ccittrawdata = bytearray(ccittrawdata)
                for i, n in enumerate(ccittrawdata):
                    ccittrawdata[i] = TIFFBitRevTable[n]
                ccittrawdata = bytes(ccittrawdata)
            else:
                raise ValueError(f"unsupported FillOrder: {fillorder}")
            dpn, bpc, colspace = 1, 1, "DeviceGray"
            info.update(
                {
                    "data": ccittrawdata,
                    "w": w,
                    "h": h,
                    "iccp": None,
                    "dpn": dpn,
                    "cs": colspace,
                    "bpc": bpc,
                    "f": image_filter,
                    # jpeg_inverted is still False at this point
                    "inverted": jpeg_inverted,
                    "dp": f"/BlackIs1 {str(not inverted).lower()} /Columns {w} /K -1 /Rows {h}",
                }
            )
            return info

    # garbage collection
    img_raw_data = None

    # The original bytes cannot be re-used: encode the pixels, mode by mode.
    if img.mode == "1":
        dpn, bpc, colspace = 1, 1, "DeviceGray"
        info["data"] = _to_data(img, image_filter)
    elif img.mode == "L":
        dpn, bpc, colspace = 1, 8, "DeviceGray"
        info["data"] = _to_data(img, image_filter)
    elif img.mode == "LA":
        dpn, bpc, colspace = 1, 8, "DeviceGray"
        # 2 bytes per pixel: the alpha value is every 2nd byte
        alpha_channel = slice(1, None, 2)
        info["data"] = _to_data(img, image_filter, remove_slice=alpha_channel)
        if _has_alpha(img) and image_filter not in (
            "DCTDecode",
            "JPXDecode",
        ):
            info["smask"] = _to_data(img, image_filter, select_slice=alpha_channel)
    elif img.mode == "P":
        dpn, bpc, colspace = 1, 8, "Indexed"
        info["data"] = _to_data(img, image_filter)
        info["pal"] = img.palette.palette if img.palette is not None else None

        # check if the P image has transparency
        if img.info.get("transparency", None) is not None and image_filter not in (
            "DCTDecode",
            "JPXDecode",
        ):
            # convert to RGBA to get the alpha channel for creating the smask
            info["smask"] = _to_data(
                img.convert("RGBA"), image_filter, select_slice=slice(3, None, 4)
            )
    elif img.mode == "PA":
        dpn, bpc, colspace = 1, 8, "Indexed"
        info["pal"] = img.palette.palette if img.palette is not None else None
        # 2 bytes per pixel: the alpha value is every 2nd byte
        alpha_channel = slice(1, None, 2)
        info["data"] = _to_data(img, image_filter, remove_slice=alpha_channel)
        if _has_alpha(img) and image_filter not in (
            "DCTDecode",
            "JPXDecode",
        ):
            info["smask"] = _to_data(img, image_filter, select_slice=alpha_channel)
    elif img.mode == "CMYK":
        dpn, bpc, colspace = 4, 8, "DeviceCMYK"
        info["data"] = _to_data(img, image_filter)
    elif img.mode == "RGB":
        dpn, bpc, colspace = 3, 8, "DeviceRGB"
        info["data"] = _to_data(img, image_filter)
    else:  # RGBA image
        dpn, bpc, colspace = 3, 8, "DeviceRGB"
        # 4 bytes per pixel: the alpha value is every 4th byte
        alpha_channel = slice(3, None, 4)
        info["data"] = _to_data(img, image_filter, remove_slice=alpha_channel)
        if _has_alpha(img) and image_filter not in (
            "DCTDecode",
            "JPXDecode",
        ):
            info["smask"] = _to_data(img, image_filter, select_slice=alpha_channel)

    dp = f"/Predictor 15 /Colors {dpn} /Columns {w}"

    if img.mode == "1":
        dp = f"/BlackIs1 true /Columns {w} /K -1 /Rows {h}"

    # Only release images opened by this function, never one provided by the caller
    if not is_pil_img:
        if keep_bytes_io_open:
            # detach the caller's BytesIO instead of closing it
            setattr(img, "fp", None)
        else:
            img.close()

    info.update(
        {
            "w": w,
            "h": h,
            "cs": colspace,
            "iccp": iccp,
            "bpc": bpc,
            "dpn": dpn,
            "f": image_filter,
            "inverted": jpeg_inverted,
            "dp": dp,
        }
    )
    return info

Args

filename
in a format that can be passed to load_image
img
optional bytes, BytesIO or PIL.Image.Image instance
image_filter : str
one of the SUPPORTED_IMAGE_FILTERS
def get_svg_info(filename: str,
img: BinaryIO,
image_cache: ImageCache) ‑> tuple[str, SVGObject, VectorImageInfo]
Expand source code Browse git
def get_svg_info(
    filename: str, img: BinaryIO, image_cache: ImageCache
) -> tuple[str, SVGObject, VectorImageInfo]:
    """
    Parse SVG data and gather its dimensions.

    Args:
        filename: name returned unchanged as the first element of the result
        img: stream holding the SVG data; it is rewound and read entirely
        image_cache: forwarded to the `SVGObject` constructor

    Returns:
        a tuple `(filename, svg, info)`, where `svg` is the parsed `SVGObject`
        and `info` a `VectorImageInfo` carrying it along with a width and a height.
    """
    img.seek(0)
    svg = SVGObject(img.read(), image_cache=image_cache)

    # Default to the viewbox size, if there is one...
    w = h = 0.0
    if svg.viewbox:
        _, _, w, h = svg.viewbox
    # ...but a truthy width / height set on the SVG takes precedence:
    w = svg.width or w
    h = svg.height or h

    return filename, svg, VectorImageInfo(data=svg, w=w, h=h)
def is_iccp_valid(iccp: bytes, filename: str | pathlib.Path) ‑> bool
Expand source code Browse git
@no_type_check
def is_iccp_valid(iccp: bytes, filename: str | Path) -> bool:
    """
    Check the validity of an ICC profile.

    Args:
        iccp: raw bytes of the ICC profile
        filename: only used in the log messages

    Returns:
        `True` if the profile can be opened and its color space is GRAY or RGB,
        `False` otherwise (the reason is logged at INFO level).
    """
    try:
        opened_profile = ImageCms.getOpenProfile(BytesIO(iccp))
    except ImageCms.PyCMSError:
        LOGGER.info("Invalid ICC Profile in file %s", filename)
        return False

    color_space = str(opened_profile.profile.xcolor_space).strip()
    if color_space in ("GRAY", "RGB"):
        return True

    LOGGER.info(
        "Unsupported color space %s in ICC Profile of file %s - cf. issue #711",
        color_space,
        filename,
    )
    return False

Checks the validity of an ICC profile

def load_image(filename: str | BinaryIO | pathlib.Path) ‑> BinaryIO
Expand source code Browse git
def load_image(filename: str | Path | BinaryIO) -> BinaryIO:
    """
    This method is used to load external resources, such as images.
    It is automatically called when resource added to document by `fpdf.fpdf.FPDF.image()`.
    It always return a BytesIO buffer.
    """
    # if a file-like object is passed in, use it directly or copy it into a BytesIO buffer
    if isinstance(filename, (BytesIO, io.BufferedIOBase, BinaryIO)):
        return filename
    if hasattr(filename, "read") and not isinstance(filename, (str, Path)):
        # Copy other file-like objects into a BytesIO so downstream code can seek/read freely
        return BytesIO(filename.read())
    if isinstance(filename, Path):
        filename = str(filename)
    # by default loading from network is allowed for all images
    if filename.startswith(("http://", "https://")):
        # disabling bandit & semgrep rules as permitted schemes are whitelisted:
        # nosemgrep: python.lang.security.audit.dynamic-urllib-use-detected.dynamic-urllib-use-detected
        with urlopen(filename) as url_file:  # nosec B310
            return BytesIO(url_file.read())
    elif filename.startswith("data:"):
        return _decode_base64_image(filename)
    with open(filename, "rb") as local_file:
        return BytesIO(local_file.read())

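This method is used to load external resources, such as images. It is automatically called when a resource is added to the document by FPDF.image(). It returns a binary stream: BytesIO and other buffered streams are returned as-is, any other input is read entirely into a new BytesIO buffer.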

def pack_codes_into_bytes(codes: Iterable[int]) ‑> bytes
Expand source code Browse git
def pack_codes_into_bytes(codes: Iterable[int]) -> bytes:
    """
    Convert the list of result codes into a continuous byte stream, with codes packed as per the code bit-width.
    The bit-width starts at `LZW_INITIAL_BITS_PER_CODE` bits and expands as needed,
    up to `LZW_MAX_BITS_PER_CODE`.

    Args:
        codes: the LZW codes to pack, in output order.
            `LZW_CLEAR_TABLE_MARKER` resets the bit-width,
            and `LZW_EOD_MARKER` does not increase the code counter.

    Returns:
        the packed bytes, most significant bit first;
        the last byte is padded with zero bits if needed.
    """

    (
        _,
        next_code,
        bits_per_code,
        max_code_value,
    ) = clear_table()
    buffer = 0
    bits_in_buffer = 0
    output = bytearray()

    if numpy is not None:
        # Using numpy improves the performance significantly there
        # _cf._ https://github.com/py-pdf/fpdf2/issues/1380
        codes = numpy.array(codes, dtype=numpy.uint32)
    for code in codes:
        buffer = (buffer << bits_per_code) | code
        bits_in_buffer += bits_per_code

        # Emit every complete byte, most significant bits first:
        while bits_in_buffer >= 8:
            bits_in_buffer -= 8
            output.append((buffer >> bits_in_buffer) & 0xFF)
        # Only keep the bits not emitted yet. The bits above them are never read again,
        # and without this mask a plain Python int would grow with every code,
        # making each shift more expensive than the previous one.
        buffer &= (1 << bits_in_buffer) - 1

        if code == LZW_CLEAR_TABLE_MARKER:
            _, next_code, bits_per_code, max_code_value = clear_table()
        elif code != LZW_EOD_MARKER:
            # Mirror the growth of the table, in order to widen the codes when needed:
            next_code += 1
            if next_code > max_code_value and bits_per_code < LZW_MAX_BITS_PER_CODE:
                bits_per_code += 1
                max_code_value = (1 << bits_per_code) - 1

    # Flush the remaining bits, left-aligned in a final byte:
    if bits_in_buffer > 0:
        output.append((buffer << (8 - bits_in_buffer)) & 0xFF)

    return bytes(output)

Convert the list of result codes into a continuous byte stream, with codes packed as per the code bit-width. The bit-width starts at 9 bits and expands as needed.

def preload_image(image_cache: ImageCache,
name: str | bytes | BinaryIO | PILImage | pathlib.Path | None,
dims: tuple[float, float] | None = None) ‑> tuple[str, SVGObject | Any | bytes | BinaryIO | pathlib.Path | None, RasterImageInfo | VectorImageInfo]
Expand source code Browse git
def preload_image(
    image_cache: ImageCache,
    name: ImageType,
    dims: Optional[tuple[float, float]] = None,
) -> tuple[
    str,
    Union[SVGObject, "PILImage", bytes, BinaryIO, Path, None],
    RasterImageInfo | VectorImageInfo,
]:
    """
    Read an image and load it into memory.

    For raster images: following this call, the image is inserted in `image_cache.images`,
    and following calls to `fpdf.fpdf.FPDF.image()` will re-use the same cached values, without re-reading the image.

    For vector images: the data is loaded and the metadata extracted.

    Args:
        image_cache: an `ImageCache` instance, usually the `.image_cache` attribute of a `FPDF` instance.
        name: either a string representing a file path to an image, a URL to an image,
            an io.BytesIO, or an instance of `PIL.Image.Image`.
        dims (tuple[int, int]): optional dimensions as a tuple (width, height) to resize the image
            (raster only) before storing it in the PDF.

    Returns: A tuple, consisting of 3 values: the name, the image data,
        and an instance of a subclass of `ImageInfo`.

    Raises:
        ValueError: if `name` ends with ".svg" and loading or parsing it fails.
    """

    def _md5_hexdigest(payload: bytes) -> str:
        # The digest is only used as a cache key, not for security purposes
        digest = hashlib.new("md5", usedforsecurity=False)  # nosec B324
        digest.update(payload)
        return digest.hexdigest()

    # Vector images: SVG designated by a path, or provided as raw data.
    if isinstance(name, (str, Path)):
        svg_location = str(name)
        if svg_location.endswith(".svg"):
            try:
                return get_svg_info(
                    svg_location, load_image(svg_location), image_cache=image_cache
                )
            except Exception as error:
                raise ValueError(f"Could not parse file: {name}") from error
    if isinstance(name, bytes) and _is_svg(name.strip()):
        return get_svg_info("vector_image", io.BytesIO(name), image_cache=image_cache)
    if isinstance(name, io.BytesIO) and _is_svg(name.getvalue().strip()):
        return get_svg_info("vector_image", name, image_cache=image_cache)

    # Raster images: work out the cache key, and the in-memory image if there is one.
    img: Union["PILImage", bytes, BinaryIO, Path, None] = None
    raster_name: str
    if isinstance(name, str):
        raster_name = name
    elif _is_pil_image(name):
        raster_name, img = _md5_hexdigest(name.tobytes()), name
    elif isinstance(name, (bytes, io.BytesIO)):
        payload = name if isinstance(name, bytes) else name.getvalue()
        raster_name, img = _md5_hexdigest(payload.strip()), name
    else:
        raster_name = str(name)

    info: RasterImageInfo | VectorImageInfo | None = image_cache.images.get(raster_name)
    if info is not None:
        # Already cached: only record the additional usage
        info["usages"] += 1  # type: ignore[operator]
        return raster_name, img, info

    info = get_img_info(raster_name, img, image_cache.image_filter, dims)
    info["i"] = len(image_cache.images) + 1
    info["usages"] = 1
    info["iccp_i"] = None
    iccp = info.get("iccp")
    if iccp is not None:
        LOGGER.debug(
            "ICC profile found for image %s - It will be inserted in the PDF document",
            raster_name,
        )
        # Identical ICC profiles share a single index:
        if iccp not in image_cache.icc_profiles:
            image_cache.icc_profiles[iccp] = len(image_cache.icc_profiles)  # type: ignore[index]
        info["iccp_i"] = image_cache.icc_profiles[iccp]  # type: ignore[index]
        # The profile bytes are now referenced through "iccp_i" only
        info["iccp"] = None
    image_cache.images[raster_name] = info
    return raster_name, img, info

Read an image and load it into memory.

For raster images: following this call, the image is inserted in image_cache.images, and following calls to FPDF.image() will re-use the same cached values, without re-reading the image.

For vector images: the data is loaded and the metadata extracted.

Args

image_cache
an ImageCache instance, usually the .image_cache attribute of a FPDF instance.
name
either a string representing a file path to an image, a URL to an image, an io.BytesIO, or an instance of PIL.Image.Image.
dims : tuple[int, int]
optional dimensions as a tuple (width, height) to resize the image (raster only) before storing it in the PDF.

Returns: A tuple, consisting of 3 values: the name, the image data, and an instance of a subclass of ImageInfo.

def transcode_monochrome(img: PILImage) ‑> bytes
Expand source code Browse git
def transcode_monochrome(img: "PILImage") -> bytes:
    """
    Convert the open PIL.Image `img` to compressed CCITT Group4 data.

    The image is saved in memory as a group4-compressed TIFF made of a single strip,
    and the bytes of that strip are returned.

    Args:
        img: the image to transcode

    Returns:
        the raw CCITT Group4 payload
    """
    width, height = img.size

    # we create a whole new PIL image or otherwise it might happen with some
    # input images, that libtiff fails an assert and the whole process is
    # killed by a SIGABRT:
    fresh_img = Image.frombytes(img.mode, img.size, img.tobytes())

    # Since version 8.3.0 Pillow limits strips to 64 KB. Since PDF only
    # supports single strip CCITT Group4 payloads, we have to coerce it back
    # into putting everything into a single strip. Thanks to Andrew Murray for
    # the hack.
    single_strip_size = (width + 7) // 8 * height
    if hasattr(TiffImagePlugin, "STRIP_SIZE"):
        # we are using Pillow 8.4.0 or later,
        # that allows us to modify the strip size explicitly
        single_strip_patch = temp_attr(TiffImagePlugin, "STRIP_SIZE", single_strip_size)
    else:
        # only needed for Pillow 8.3.x but works for versions before that as
        # well
        ifd_class = TiffImagePlugin.ImageFileDirectory_v2
        pillow_getitem = ifd_class.__getitem__

        def _single_strip_getitem(self: Any, tag: int) -> object:
            forced_tags = {
                TiffImagePlugin.ROWSPERSTRIP: height,
                TiffImagePlugin.STRIPBYTECOUNTS: [single_strip_size],
                TiffImagePlugin.STRIPOFFSETS: [0],
            }
            return forced_tags.get(tag, pillow_getitem(self, tag))

        single_strip_patch = temp_attr(ifd_class, "__getitem__", _single_strip_getitem)

    # Convert the image to Group 4 in memory. If libtiff is not installed and
    # Pillow is not compiled against it, .save() will raise an exception.
    tiff_buffer = BytesIO()
    with single_strip_patch:
        fresh_img.save(tiff_buffer, format="TIFF", compression="group4")

    # Re-open the TIFF just written, to find out where its payload is located
    tiff_buffer.seek(0)
    offset, length = ccitt_payload_location_from_pil(Image.open(tiff_buffer))

    tiff_buffer.seek(offset)
    return tiff_buffer.read(length)

Convert the open PIL.Image imgdata to compressed CCITT Group4 data.

Classes

class ImageSettings (compression_level: int = -1)
Expand source code Browse git
@dataclass
class ImageSettings:
    """Settings applied when image data is compressed."""

    # Passed to zlib.compress() - In range 0-9 - Default is currently equivalent to 6:
    # (the default value of -1 is the one the note above refers to)
    compression_level: int = -1

ImageSettings(compression_level: int = -1)

Instance variables

var compression_level : int

Passed to zlib.compress() - in range 0-9 - the default (-1) is currently equivalent to 6.

class temp_attr (obj: Any, field: str, value: Any)
Expand source code Browse git
class temp_attr:
    """
    Context manager that temporarily changes an attribute of an object.

    On exit, the attribute gets its previous value back,
    or is deleted if it did not exist when the context was entered.
    """

    def __init__(self, obj: Any, field: str, value: Any):
        """
        Args:
            obj: the object to alter
            field: name of the attribute to override
            value: value given to the attribute inside the `with` block
        """
        self.obj, self.field, self.value = obj, field, value
        # Both are filled in by __enter__, if the attribute already exists
        self.exists = False
        self.old_value: Any = None

    def __enter__(self) -> None:
        """Remember the current value, if any, then set the temporary one."""
        target, attr_name = self.obj, self.field
        if hasattr(target, attr_name):
            self.old_value = getattr(target, attr_name)
            self.exists = True
        setattr(target, attr_name, self.value)

    def __exit__(self, exctype: Any, excinst: Any, exctb: Any) -> None:
        """Undo the change, whether or not an exception occurred."""
        if not self.exists:
            # The attribute was created by __enter__: remove it
            delattr(self.obj, self.field)
            return
        setattr(self.obj, self.field, self.old_value)

Temporarily change the attribute of an object using a context manager.