所有汉字叠在一起会看到什么

想法来自 oooooohmygosh 的视频《把所有汉字叠在一起,会看到什么?》。代码实现的原理是:渲染字体文件中包含的所有字符,并按像素累加。使用的库为 Pillow 和 fontTools,楷体(simkai.ttf)的结果见文末图片。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import typing as t

import numpy as np

from fontTools.ttLib import TTFont
from PIL import Image, ImageFont


class FontStat:
    """Accumulate the rendered bitmaps of every character in a font file.

    Each character listed in the font's ``cmap`` tables is rendered with
    Pillow at ``fontsize`` pixels, placed in the middle of a
    ``fontsize`` x ``fontsize`` canvas and summed pixel by pixel.

    Args:
        path: Font file to analyse.  It is used both to enumerate the
            characters and to render them.  The value is converted with
            ``str()``, so path-like objects are accepted as well.
        fontsize: Pixel size handed to Pillow and side length of the canvas.
        rmbg: When true, the empty rows/columns surrounding each glyph are
            trimmed before the glyph is centred.
    """

    def __init__(self, path: str, fontsize: int = 64, rmbg: bool = True) -> None:
        self._path = str(path)
        self._size = fontsize
        self._rmbg = rmbg
        # Loaded on first use so that constructing a FontStat stays cheap
        # and the font file is opened once instead of once per character.
        self._font = None

    def chars(self) -> t.Iterator[str]:
        """Yield every code point of every ``cmap`` subtable as a string.

        The same character may be yielded more than once when several
        subtables map it; callers that need uniqueness must deduplicate.
        """
        font = TTFont(self._path)
        try:
            for table in font['cmap'].tables:
                for codepoint in table.cmap:
                    yield chr(codepoint)
        finally:
            # Release the file handle even if the consumer stops early.
            font.close()

    def _renderer(self) -> 'ImageFont.FreeTypeFont':
        """Return the Pillow font for ``self._path``, loading it on first use."""
        if self._font is None:
            self._font = ImageFont.truetype(self._path, self._size)
        return self._font

    @staticmethod
    def _fit(raw: np.ndarray, size: int, rmbg: bool) -> np.ndarray:
        """Centre the 2-D glyph ``raw`` on a ``size`` x ``size`` uint8 canvas.

        Args:
            raw: Glyph pixels as a 2-D array (may be empty).
            size: Side length of the square canvas.
            rmbg: Trim all-zero border rows/columns before centring.

        Returns:
            A new ``(size, size)`` ``uint8`` array.  Blank glyphs give an
            all-zero canvas; glyphs larger than the canvas are
            centre-cropped to fit.
        """
        canvas = np.zeros((size, size), dtype=np.uint8)
        # remove background
        if rmbg:
            rows = np.flatnonzero(raw.any(axis=1))
            cols = np.flatnonzero(raw.any(axis=0))
            if rows.size == 0:
                # Nothing was drawn (space, control character, ...).
                return canvas
            raw = raw[rows[0]: rows[-1] + 1, cols[0]: cols[-1] + 1]
        # Glyphs wider or taller than the canvas keep only their middle part.
        height, width = raw.shape
        top, left = max(height - size, 0) // 2, max(width - size, 0) // 2
        raw = raw[top: top + size, left: left + size]
        # centered in the square matrix
        height, width = raw.shape
        x, y = (size - height) // 2, (size - width) // 2
        canvas[x: x + height, y: y + width] = raw
        return canvas

    def bitmap(self, char: str) -> np.ndarray:
        """Render ``char`` with this font and return it centred on the canvas.

        Returns:
            A ``(fontsize, fontsize)`` ``uint8`` array.
        """
        mask = self._renderer().getmask(char)
        # mask.size is (width, height); the array is laid out row by row.
        raw = np.array(mask, dtype=np.uint8).reshape(mask.size[::-1])
        return self._fit(raw, self._size, self._rmbg)

    def stat(self) -> np.ndarray:
        """Sum the bitmaps of all distinct non-whitespace characters.

        Returns:
            A ``(fontsize, fontsize)`` ``uint64`` array of per-pixel totals.
        """
        stat = np.zeros((self._size, self._size), dtype=np.uint64)
        # strip() turns whitespace characters into '' which filter() drops;
        # set() removes characters listed by more than one cmap subtable.
        for char in set(filter(bool, map(str.strip, self.chars()))):
            stat += self.bitmap(char)  # TODO: self.bitmap(char) != 0
        return stat

    @staticmethod
    def _scale(stat: np.ndarray) -> np.ndarray:
        """Linearly rescale ``stat`` so that its maximum becomes 255 (uint8)."""
        if stat.size == 0 or not stat.any():
            # Avoid 0/0 when no pixel was ever drawn.
            return np.zeros(stat.shape, dtype=np.uint8)
        return (stat / stat.max() * 255).astype(np.uint8)  # TODO: log, etc.

    def stat_uint8(self) -> np.ndarray:
        """Return :meth:`stat` rescaled to the 0-255 range as ``uint8``."""
        return self._scale(self.stat())

    def stat_save(self, path: str) -> None:
        """Compute the statistics and save them as a greyscale image at ``path``."""
        Image.fromarray(self.stat_uint8(), 'L').save(path)


if __name__ == '__main__':
    import pathlib as p
    import tqdm

    # Render every TrueType font installed on Windows to image/<font>.jpg.
    root = p.Path('C:/Windows/Fonts')
    outdir = p.Path('image')
    # Saving fails if the target directory is missing, so create it up front.
    outdir.mkdir(parents=True, exist_ok=True)
    # Materialise the listing so tqdm knows the total and shows real progress.
    fonts = sorted(f for f in root.iterdir() if f.suffix.lower() == '.ttf')
    for path in tqdm.tqdm(fonts):
        output = outdir / f'{path.stem}.jpg'
        # Skip fonts that were already rendered so a stopped run can resume.
        if not output.exists():
            FontStat(str(path), 128, True).stat_save(str(output))

楷体