import os
import pathlib as p
import shutil
import subprocess
import tempfile
import fitz
import pdf2image
from typing import Callable, Iterator, List, Union
# Type alias for path arguments: either a plain string or a ``pathlib.Path``.
PathLike = Union[str, p.Path]
def _pdf2images_fitz(path: p.Path, directory: p.Path) -> Iterator[p.Path]:
    """Render every page of a document to PNG with PyMuPDF and yield the files.

    Page ``i`` (0-based) is saved as ``directory / f'{i}.png'``.  If rendering
    or saving a page raises ``RuntimeError``, the error is printed with the
    1-based page number and that page is skipped; the remaining pages are
    still processed.

    Args:
        path: Document to open with ``fitz.open``.
        directory: Directory the PNG files are written into.  It is not
            created here.

    Yields:
        The path of each PNG that was written successfully, in page order.
    """
    with fitz.open(path) as doc:
        for ith, page in enumerate(doc):
            # Use a separate name so the ``path`` parameter is not rebound
            # while the document opened from it is still being iterated.
            image_path = directory / f'{ith}.png'
            try:
                # ``matrix`` is the documented keyword of ``Page.get_pixmap``
                # (``mat`` is not accepted); Matrix(1, 1) keeps a 1:1 scale.
                page.get_pixmap(matrix=fitz.Matrix(1, 1)).save(image_path)
            except RuntimeError as e:
                print(f'Page {ith+1}: {e}')
            else:
                yield image_path
def _pdf2images_poppler(path: p.Path, directory: p.Path) -> Iterator[p.Path]:
    """Convert a PDF to PNG page images with ``pdf2image`` and yield the files.

    The whole document is converted at 200 DPI by
    ``pdf2image.convert_from_path``; page ``i`` (0-based) is then saved as
    ``directory / f'{i}.png'``.

    Args:
        path: PDF file handed to ``pdf2image.convert_from_path``.
        directory: Directory the PNG files are written into.  It is not
            created here.

    Yields:
        The path of each PNG right after it has been saved, in page order.
    """
    rendered_pages = pdf2image.convert_from_path(path, dpi=200)
    for index, page_image in enumerate(rendered_pages):
        target = directory / f'{index}.png'
        page_image.save(target, 'PNG')
        yield target
def snapshot(path: PathLike, pdf2images: Callable, tinified: bool = False) -> None:
    """Rebuild a PDF as a document made only of page images (a "snapshot").

    For an input ``dir/name.pdf`` the page images are written into
    ``dir/name/`` (created if missing) and the result is saved as
    ``dir/name-snapshot.pdf``.  Every output page has the width and height of
    the image placed on it.

    Args:
        path: Source document.
        pdf2images: Callable invoked as ``pdf2images(path, directory)`` that
            returns an iterable of image file paths, one per page.
        tinified: When true, each image is first passed to ``tinypng``; if
            that call returns true, the ``<stem>-tinypng<suffix>`` file it
            wrote is used instead of the original image.
    """
    source = p.Path(path)
    directory = source.parent / source.stem
    directory.mkdir(parents=True, exist_ok=True)
    output = directory.parent / f'{source.stem}-snapshot{source.suffix}'
    with fitz.open() as doc:
        # A dedicated loop variable keeps the source path intact instead of
        # overwriting it with each page image path.
        for image_path in pdf2images(source, directory):
            if tinified:
                temp = directory / f'{image_path.stem}-tinypng{image_path.suffix}'
                if tinypng(image_path, temp):
                    image_path = temp
            with fitz.open(image_path) as img:
                rect = img[0].rect
                # Wrap the image in a one-page PDF so it can be placed with
                # show_pdf_page; close that PDF as soon as the page has been
                # copied so open documents do not pile up, one per page.
                with fitz.open('pdf', img.convert_to_pdf()) as pdf:
                    new_page = doc.new_page(width=rect.width, height=rect.height)
                    new_page.show_pdf_page(rect, pdf, 0)
        doc.save(output)
def tinypng(input_path: PathLike, output_path: PathLike) -> bool:
    """Write a palette-quantized (at most 256 colours) copy of an image.

    Two ffmpeg passes are used: ``palettegen`` writes a palette image to a
    temporary file, then ``paletteuse`` applies it and writes ``output_path``
    with the ``pal8`` pixel format.  The function is best-effort: whenever
    the compression does not produce a usable, smaller-or-equal file, the
    input is copied to ``output_path`` unchanged.

    Args:
        input_path: Existing image file to compress.
        output_path: Destination file; overwritten if it already exists.

    Returns:
        ``False`` if ``input_path`` does not exist (nothing is written),
        otherwise ``True`` with ``output_path`` holding either the compressed
        image or a plain copy of the input.
    """
    input_path, output_path = p.Path(input_path), p.Path(output_path)
    if not input_path.exists():
        return False

    # Only the file name is needed: reserve it, then close the handle so
    # ffmpeg is the sole writer of the palette file.
    with tempfile.NamedTemporaryFile(
        suffix=input_path.suffix, delete=False
    ) as f:
        palette = f.name

    commands = (
        (
            'ffmpeg', '-i', str(input_path),
            '-vf', 'palettegen=max_colors=256:stats_mode=single',
            '-y', palette,
        ), (
            'ffmpeg', '-i', str(input_path), '-i', palette,
            '-lavfi', '[0][1:v] paletteuse', '-pix_fmt', 'pal8',
            '-y', str(output_path),
        ),
    )
    try:
        # all() stops at the first failing pass, so the second pass is not
        # run against a palette that was never generated.
        compressed = all(
            subprocess.run(
                command,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            ).returncode == 0
            for command in commands
        )
    except OSError:
        # ffmpeg is missing or not executable: treat as a failed compression.
        compressed = False
    finally:
        # The temporary palette must not outlive this call, even on errors.
        os.unlink(palette)

    if (
        not compressed                   # compression failed
        or not output_path.exists()      # ffmpeg produced no file
        # compression did not make the file any smaller
        or output_path.stat().st_size > input_path.stat().st_size
    ):
        shutil.copy(input_path, output_path)
    return True
if __name__ == '__main__':
    # Script entry point: snapshot the file named by the first command-line
    # argument, using the pdf2image converter with image compression enabled.
    # Without an argument nothing happens.
    import sys

    cli_args = sys.argv[1:]
    if cli_args:
        snapshot(cli_args[0], _pdf2images_poppler, True)