您的位置:首页 > 编程语言 > Python开发

python多线程爬取-今日头条的街拍数据(附源码加思路注释)

2018-09-01 23:41 483 查看
这里用的是 json + re + requests + beautifulsoup + 多进程（multiprocessing.Pool，并非多线程）

1 import json
2 import re
3 from multiprocessing.pool import Pool
4
5 import requests
6 from bs4 import BeautifulSoup
7 from config import *
8 from requests import RequestException
9
10
def get_page_index(offset, keyword):
    """Fetch one page of search-result JSON from the Toutiao search API.

    :param offset: paging offset (multiples of 20)
    :param keyword: search keyword, e.g. '街拍'
    :return: response body text on HTTP 200, otherwise None
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab'
    }
    url = 'https://www.toutiao.com/search_content/'
    try:
        # timeout keeps a worker from hanging forever on a dead connection
        response = requests.get(url, params=params, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
35
36
def parse_page_index(html):
    """Yield article URLs from a search-result JSON document.

    :param html: JSON text returned by get_page_index; may be None on a
        failed fetch
    :return: generator of 'article_url' values (None for entries missing it)
    """
    # The fetcher returns None on any HTTP/network failure — yield nothing
    # instead of crashing in json.loads.
    if not html:
        return
    try:
        data = json.loads(html)
    except json.JSONDecodeError:
        # Malformed payload (e.g. an HTML error page) — treat as empty.
        return
    if data and 'data' in data:
        for item in data['data']:
            yield item.get('article_url')
43
44
def get_page_detail(url):
    """Fetch the HTML of an article detail page.

    :param url: article URL produced by parse_page_index
    :return: page HTML on HTTP 200, otherwise None
    """
    # A desktop User-Agent so the site serves the regular HTML page.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    try:
        # timeout keeps a worker from hanging forever on a dead connection
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
58
59
def parse_page_detail(html, url):
    """Parse a detail page: extract the title and gallery image URLs.

    Side effect: downloads every extracted image via download_image.

    :param html: detail-page HTML text
    :param url: the page URL (echoed back in the result)
    :return: dict with 'title', 'url', 'images', or None when the page
        contains no gallery data
    """
    soup = BeautifulSoup(html, 'lxml')
    # Default to '' so a page without a <title> tag cannot raise
    # UnboundLocalError below.
    title = ''
    for tag in soup.select('title'):
        title = tag.get_text()

    # The gallery payload is embedded as a backslash-escaped JSON string
    # inside a script: gallery: JSON.parse("...").
    pattern = re.compile(r'gallery: JSON.parse\("(.*?)"\),', re.S)
    result = pattern.search(html)
    if not result:
        return None

    # Strip the backslash escaping so the payload parses as plain JSON.
    payload = re.sub(r'\\', '', result.group(1))
    data = json.loads(payload)
    if data:
        images = [item.get('url') for item in data.get('sub_images')]
        for image in images:
            download_image(image, title)
        return {
            'title': title,
            'url': url,
            'images': images
        }
    return None
86
87
def download_image(url, title):
    """Download one image and hand the raw bytes to save_to_image.

    :param url: image URL to download
    :param title: gallery title, used by save_to_image to build the file name
    :return: None
    """
    print('正在下载', url)
    try:
        # timeout keeps a worker from hanging forever on a dead connection
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            save_to_image(response.content, title)
        return None
    except RequestException:
        return None
103
104
105 count = 0
106
107
108 def save_to_image(content, title):
109     global count
110     '''
111     保存图片文件
112     :param content: 图片文件的内容
113     :return:
114     '''
115     name = title + str(count)
116     file_path = './头条/{}.{}'.format(name, 'jpg')
117     with open(file_path, 'wb') as f:
118         count += 1
119         f.write(content)
120
121
def main(offset):
    """Crawl one result page: fetch its index, then every article's detail.

    :param offset: paging offset passed to get_page_index
    :return: None
    """
    index_html = get_page_index(offset, '街拍')
    # The index fetch returns None on failure — skip this page entirely.
    if not index_html:
        return
    for url in parse_page_index(index_html):
        if not url:
            continue
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        if result:
            print(result)
            # save_to_mongo(result)  # TODO: persistence hook, not implemented
138
139
# Pages to crawl: offsets 20, 40, ..., (GROUP_END - 1) * 20.
GROUP_START = 1
GROUP_END = 20

if __name__ == '__main__':
    offsets = [i * 20 for i in range(GROUP_START, GROUP_END)]
    pool = Pool()
    try:
        pool.map(main, offsets)
    finally:
        # Shut the worker processes down cleanly instead of leaking them.
        pool.close()
        pool.join()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: