PDF 转图片

现实生活中遇到的问题,一个 PDF 虽然页数不多,但是打开之后渲染特别慢,甚至我的电脑翻页都要卡顿 1 秒左右,为此想到将 PDF 提前渲染好,以图片的形式保存,这样打开的时候就跳过了渲染的步骤。

程序依赖 PyMuPDF 和 pdf2image,因为转图像步骤使用 PyMuPDF 会报错,所以额外使用 pdf2image 来渲染图片。可以全局调整保存图片的 dpi,同时压缩图片大小。

snapshot.py
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import os
import pathlib as p
import shutil
import subprocess
import tempfile

import fitz
import pdf2image

from typing import Callable, Iterator, List, Union


PathLike = Union[str, p.Path]


def _pdf2images_fitz(path: p.Path, directory: p.Path) -> Iterator[p.Path]:
    """Render each page of a PDF to a PNG with PyMuPDF and yield the files.

    Pages are written into ``directory`` as ``0.png``, ``1.png``, ... using
    an unscaled ``fitz.Matrix(1, 1)``. A page whose rendering or saving
    raises ``RuntimeError`` is reported on stdout (with its 1-based number)
    and skipped, so fewer paths may be yielded than the document has pages.

    Args:
        path: The PDF file to render.
        directory: Existing directory that receives the PNG files.

    Yields:
        The path of every PNG that was written successfully, in page order.
    """
    with fitz.open(path) as doc:
        for ith, page in enumerate(doc):
            image_path = directory / f'{ith}.png'
            try:
                # ``matrix`` is the documented keyword of ``get_pixmap``;
                # the former ``mat=`` spelling is not a valid argument.
                page.get_pixmap(matrix=fitz.Matrix(1, 1)).save(image_path)
            except RuntimeError as e:
                print(f'Page {ith+1}: {e}')
            else:
                yield image_path

def _pdf2images_poppler(path: p.Path, directory: p.Path) -> Iterator[p.Path]:
    """Render a PDF with pdf2image at 200 dpi and yield the saved PNG files.

    All pages are converted by a single ``pdf2image.convert_from_path``
    call; each resulting image is then stored in ``directory`` as
    ``<index>.png`` (0-based) and its path is yielded in page order.
    """
    rendered_pages = pdf2image.convert_from_path(path, dpi=200)
    for index, rendered in enumerate(rendered_pages):
        target = directory / f'{index}.png'
        rendered.save(target, 'PNG')
        yield target

def snapshot(path: PathLike, pdf2images: Callable, tinified: bool = False) -> None:
    """Rebuild a PDF as a document made of pre-rendered page images.

    The page images are produced by ``pdf2images`` inside a directory that
    sits next to the source file and is named after its stem. Each image
    becomes one page of a new PDF saved next to the source as
    ``<stem>-snapshot<suffix>``. The intermediate image files are left in
    that directory.

    Args:
        path: The PDF file to convert.
        pdf2images: Callable invoked as ``pdf2images(pdf_path, directory)``
            that returns an iterable of image file paths, one per page.
        tinified: When true, every image is first passed through
            ``tinypng`` and the ``<name>-tinypng<suffix>`` copy is embedded
            instead of the original image.
    """
    source = p.Path(path)
    directory = source.parent / source.stem
    directory.mkdir(parents=True, exist_ok=True)
    output = directory.parent / f'{source.stem}-snapshot{source.suffix}'
    with fitz.open() as doc:
        for image_path in pdf2images(source, directory):
            if tinified:
                temp = directory / f'{image_path.stem}-tinypng{image_path.suffix}'
                if tinypng(image_path, temp):
                    image_path = temp
            with fitz.open(image_path) as img:
                rect = img[0].rect
                pdf_bytes = img.convert_to_pdf()
            # Open the one-page PDF in a ``with`` block so that it is closed
            # again once its page has been placed on the new page; it used
            # to stay open for every page of the input.
            with fitz.open('pdf', pdf_bytes) as pdf:
                page = doc.new_page(width=rect.width, height=rect.height)
                page.show_pdf_page(rect, pdf, 0)
        doc.save(output)

def tinypng(input_path: PathLike, output_path: PathLike) -> bool:
    """Write a palette-quantised copy of an image using ``ffmpeg``.

    Two ``ffmpeg`` passes are run: the first generates a palette of at most
    256 colours from the input, the second re-encodes the input with that
    palette (``pal8``) into ``output_path``. Whenever that does not yield a
    smaller-or-equal file -- ``ffmpeg`` is missing or fails, no output is
    produced, or the output is larger than the input -- the input is copied
    to ``output_path`` unchanged, so the output always exists afterwards.

    Args:
        input_path: Image file to compress.
        output_path: Destination file; overwritten if it already exists.

    Returns:
        ``False`` if ``input_path`` does not exist (nothing is written),
        ``True`` otherwise.
    """
    input_path, output_path = p.Path(input_path), p.Path(output_path)
    if not input_path.exists():
        return False
    # Only reserve a unique file name for the palette here. The handle is
    # closed before ffmpeg runs so that ffmpeg can freely overwrite the file.
    with tempfile.NamedTemporaryFile(
        suffix=input_path.suffix, delete=False
    ) as f:
        palette = f.name
    commands = (
        (
            'ffmpeg', '-i', str(input_path),
            '-vf', 'palettegen=max_colors=256:stats_mode=single',
            '-y', palette,
        ), (
            'ffmpeg', '-i', str(input_path), '-i', palette,
            '-lavfi', '[0][1:v] paletteuse', '-pix_fmt', 'pal8',
            '-y', str(output_path),
        )
    )
    try:
        # ``all`` stops at the first failing pass: without a palette the
        # second pass cannot succeed anyway.
        compressed = all(
            subprocess.run(
                command,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            ).returncode == 0
            for command in commands
        )
    except OSError:
        # ffmpeg is not installed or cannot be executed: treat this like
        # any other failed compression instead of propagating the error.
        compressed = False
    finally:
        # Remove the palette even when running ffmpeg raised.
        os.unlink(palette)
    # The exit status is checked as well as the file, so that an output left
    # over from an earlier run is never mistaken for a fresh result.
    if (
        not compressed
        or not output_path.exists()
        or output_path.stat().st_size > input_path.stat().st_size
    ):
        # Compression failed or did not make the file smaller.
        shutil.copy(input_path, output_path)
    return True


if __name__ == '__main__':
    import sys

    # Command-line entry point: the first argument is the PDF to convert.
    # It is rendered with the poppler backend and the page images are
    # compressed. Without an argument the script does nothing.
    cli_args = sys.argv[1:]
    if cli_args:
        snapshot(cli_args[0], _pdf2images_poppler, True)