Skip to content

Numbeo 爬虫

“Numbeo 是一个众包全球数据库，其数据范围包括被各个国家或地区政府汇报的居民消费价格指数、直观犯罪率、医疗保健品质等其他统计数据。”如下代码为该网站的爬虫程序，2022-03-14 运行的结果如 numbeo.7z 所示。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import typing as t

import requests

from bs4 import BeautifulSoup


# Shape of one scraped price table: category heading -> list of rows, where
# each row is a dict with the string keys 'name' and 'value'.
Data = t.Dict[str, t.List[t.Dict[str, str]]]


class Numbeo:
    """Scraper for the cost-of-living pages of numbeo.com for one country.

    The country page is downloaded lazily the first time ``soup``, ``data``
    or ``cities`` is accessed. Per-city pages are downloaded on demand by
    ``city()`` and memoised in ``cache``.
    """

    _COUNTRY_URL = 'https://www.numbeo.com/cost-of-living/country_result.jsp'
    _CITY_URL = 'https://www.numbeo.com/cost-of-living/city_result.jsp'
    _TABLE_CLASS = 'data_wide_table new_bar_table'

    def __init__(self, country: str = 'China', timeout: float = 30.0) -> None:
        """Create a scraper; no network request is made here.

        Args:
            country: Value sent as the ``country`` query parameter.
            timeout: Seconds passed to ``requests.get`` as ``timeout`` so a
                stalled connection cannot block the crawl indefinitely.
        """
        self._country = country
        self._timeout = timeout
        self._soup: t.Optional[BeautifulSoup] = None
        self._data: t.Optional[Data] = None
        self._cities: t.Optional[t.List[str]] = None
        self._cache: t.Dict[str, Data] = {}

    @property
    def soup(self) -> 'BeautifulSoup':
        """Parsed country page, fetched on first access."""
        if self._soup is None:
            self._soup = self._load_soup()
        return self._soup

    @property
    def data(self) -> 'Data':
        """Price table of the country page, parsed on first access."""
        if self._data is None:
            self._data = self._load_data(self.soup)
        return self._data

    @property
    def cities(self) -> t.List[str]:
        """City names offered by the country page's ``#city`` selector."""
        if self._cities is None:
            self._cities = self._load_cities(self.soup)
        return self._cities

    @property
    def cache(self) -> t.Dict[str, 'Data']:
        """Per-city results fetched so far, keyed by city name."""
        return self._cache

    def city(self, name: str) -> 'Data':
        """Return the price table for city ``name``, fetching it at most once.

        Raises:
            requests.RequestException: On connection failure, timeout, or an
                HTTP error status.
        """
        if name not in self._cache:
            soup = self._fetch(
                self._CITY_URL,
                {'country': self._country, 'city': name},
            )
            self._cache[name] = self._load_data(soup)
        return self._cache[name]

    def _fetch(self, url: str, params: t.Dict[str, str]) -> 'BeautifulSoup':
        """Download ``url`` with ``params`` and parse the body as HTML."""
        response = requests.get(url, params=params, timeout=self._timeout)
        # Fail loudly on 4xx/5xx instead of parsing an error page as data.
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')

    def _load_soup(self) -> 'BeautifulSoup':
        """Download and parse the country page."""
        return self._fetch(self._COUNTRY_URL, {'country': self._country})

    def _load_data(self, soup: 'BeautifulSoup') -> 'Data':
        """Extract the price table from ``soup``.

        A row containing a ``<th>`` starts a new category; every following
        row contributes its first two ``<td>`` cells as ``name`` and
        ``value``. Returns an empty dict when the page has no price table.
        """
        data = {}
        table = soup.find(class_=self._TABLE_CLASS)
        if table is None:
            return data

        category = None
        for row in table.find_all('tr'):
            head = row.find('th')
            if head is not None:
                category = self._simplify(head.text)
                data[category] = []
                continue
            cells = row.find_all('td')
            # Skip rows that cannot be placed: no category seen yet, or
            # fewer than the two cells (name, value) that are read.
            if category is None or len(cells) < 2:
                continue
            name, value = cells[:2]
            data[category].append({
                'name': self._simplify(name.text),
                'value': self._simplify(value.text),
            })
        return data

    def _load_cities(self, soup: 'BeautifulSoup') -> t.List[str]:
        """Return the non-empty ``value`` of every ``<option>`` under ``#city``.

        Returns an empty list when the page has no ``#city`` element.
        """
        select = soup.find(id='city')
        if select is None:
            return []
        values = (
            option.attrs.get('value')
            for option in select.find_all('option')
        )
        # Drops options with an empty or missing value.
        return [value for value in values if value]

    def _simplify(self, text: str) -> str:
        """Remove non-breaking spaces and surrounding whitespace."""
        return text.replace('\xa0', '').strip()


if __name__ == '__main__':
    import json

    # Crawl every city listed for the country and collect the per-city
    # price tables keyed by city name.
    numbeo = Numbeo('China')
    data = {
        name: numbeo.city(name)
        for name in numbeo.cities
    }
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding is pinned to UTF-8 instead of relying on the platform
    # default, which may be unable to encode them.
    with open('numbeo.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False)