
Crawling Baidu Baike with Python

2017-07-15 14:12
Based on an example from the imooc course at http://www.imooc.com/learn/563

What follows is what I ended up with after working through the analysis step by step until it finally ran. The code is modular, with a clear structure. My own analysis was still a bit muddled at the start, since I'm not yet used to this kind of work; more practice will help. Every bit of progress was hard-won, but the payoff makes it worthwhile. With that, on to the program.

0. Crawl targets

1) The title and summary of a Baidu Baike entry;

2) The titles and summaries of the entry pages it links to;

3) 1,000 pages crawled in total.

1. Preparation and environment

PyCharm, Windows 10, Python 3.6, requests + BeautifulSoup + re. The third-party packages can be installed with pip install requests beautifulsoup4 lxml (lxml is the parser handed to BeautifulSoup below).

2. Structure of the crawler

Following the principles of object-oriented programming, the crawler is split into the following modules (a miniature sketch of how they interact follows the list):

1) the main scheduler, spider_main.py;

2) the URL manager, url_manager.py;

3) the HTML downloader, html_downloader.py;

4) the HTML parser (BeautifulSoup + regex), html_parser.py;

5) the result writer, html_outputer.py.
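
One iteration of the crawl loop, schematically (pseudocode only; the real classes follow below):

# url = urls.get_new_url()                # url_manager: hand out an unvisited URL
# page = downloader.download(url)         # html_downloader: fetch the HTML
# links, data = parser.parse(url, page)   # html_parser: extract links plus title/summary
# urls.add_new_urls(links)                # queue the newly found links
# outputer.output_html(data)              # html_outputer: append one table row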

3. Python code

1) The main scheduler, spider_main.py:

# coding=utf-8
import url_manager
import html_downloader
import html_parser
import html_outputer


class SpiderMain(object):
    # Wire up the four collaborators; UrlManager, HtmlDownloader,
    # HtmlParse and HtmlOutputer are defined in the modules below.
    def __init__(self):
        self.urls = url_manager.UrlManager()                 # URL manager
        self.downloader = html_downloader.HtmlDownloader()   # downloader
        self.parser = html_parser.HtmlParse()                # parser
        self.outputer = html_outputer.HtmlOutputer()         # writer

    def craw(self, root_url):
        count = 1
        # seed the URL manager with the root URL
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():   # loops while new_urls is non-empty
            try:
                new_url = self.urls.get_new_url()
                print('craw %d: %s' % (count, new_url))
                # fetch the page behind this URL
                html_page = self.downloader.download(new_url)
                # parse the page into outgoing links and a data dict
                # (new_data holds the url/title/summary keys)
                new_urls, new_data = self.parser.parse(new_url, html_page)
                # queue the extracted links and write the data out
                self.urls.add_new_urls(new_urls)
                self.outputer.output_html(new_data)
                # capped at 10 pages while testing; raise to 1000 for the full crawl
                if count == 10:
                    break
                count = count + 1
            except Exception:
                print('craw failed')


if __name__ == '__main__':
    root_url = 'http://baike.baidu.com/link?url=iKhdUIHVpllyG6H-jGntGa3wfibBxAxWkLxev-Ekt2kNL6Tyte9w5-59CZNbCyCkTB8u5Aqp89j3P9yjYB97pq'
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)


2) The URL manager, url_manager.py:

# coding=utf-8
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    def add_new_url(self, url):
        # add a single URL only if it is in neither new_urls nor old_urls
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        # parsing one page yields many URLs; add them one at a time
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        # True while new_urls is non-empty
        return len(self.new_urls) != 0

    def get_new_url(self):
        # pop a URL from new_urls and move it into old_urls
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
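
A quick sanity check of the de-duplication logic (a minimal sketch, assuming the class above is saved as url_manager.py; the example URLs are placeholders):

# coding=utf-8
from url_manager import UrlManager

manager = UrlManager()
manager.add_new_url('https://baike.baidu.com/item/Python')
manager.add_new_urls([
    'https://baike.baidu.com/item/Python',    # duplicate, silently ignored
    'https://baike.baidu.com/item/PHP',
])

while manager.has_new_url():
    print(manager.get_new_url())    # each URL comes out exactly once

# a URL that has already been crawled is not re-queued
manager.add_new_url('https://baike.baidu.com/item/Python')
print(manager.has_new_url())    # False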


3) The HTML downloader, html_downloader.py:

# coding=utf-8
import random

import requests

# pool of User-Agent strings to rotate through (duplicates removed)
UA_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]


class HtmlDownloader(object):
    # return the page content for a URL, or None on failure
    def download(self, url):
        if url is None:
            return None
        # pick a fresh random User-Agent for each request
        headers = {'User-Agent': random.choice(UA_LIST)}
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            return None
        response.encoding = 'utf-8'
        return response.text
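
A minimal smoke test for the downloader (a sketch, assuming the class above is saved as html_downloader.py; the entry URL is just an example):

from html_downloader import HtmlDownloader

downloader = HtmlDownloader()
page = downloader.download('https://baike.baidu.com/item/Python')
if page is None:
    print('download failed (bad URL or non-200 response)')
else:
    print(page[:200])    # first 200 characters of the HTML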

4) The HTML parser (BeautifulSoup + regex), html_parser.py:

# coding=utf-8
import re

from bs4 import BeautifulSoup


class HtmlParse(object):

    def parse(self, new_url, html_page):
        # returns (set of outgoing entry URLs, dict with url/title/summary)
        if new_url is None or html_page is None:
            return
        soup = BeautifulSoup(html_page, 'lxml')
        new_urls = self.get_new_urls(new_url, soup)
        new_data = self.get_new_data(new_url, soup)
        return new_urls, new_data

    def get_new_urls(self, new_url, soup):
        # collect links to other entries (/item/...) inside the main content area
        base_url = 'https://baike.baidu.com/item/'
        new_urls = set()
        links = soup.find('div', class_='main-content').find_all('a', href=re.compile(r'/item/.*?'))
        for link in links:
            # rebuild each entry's URL from the link text
            new_urls.add(base_url + link.get_text())
        return new_urls

    def get_new_data(self, new_url, soup):
        # extract the entry's title and summary into a dict
        res_data = {}
        res_data['url'] = new_url
        title = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').h1.get_text()
        summary = soup.find('div', class_='lemma-summary').get_text()
        res_data['title'] = title
        res_data['summary'] = summary
        return res_data
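
To see what the parser returns without hitting the network, feed it a hand-written snippet that mimics the Baidu Baike markup the selectors above expect (a self-contained sketch, assuming the class above is saved as html_parser.py):

from html_parser import HtmlParse

SAMPLE = '''
<html><body>
<div class="main-content">
<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
<div class="lemma-summary">Python is a programming language.</div>
<a href="/item/Guido">Guido</a>
</div>
</body></html>
'''

parser = HtmlParse()
new_urls, new_data = parser.parse('https://baike.baidu.com/item/Python', SAMPLE)
print(new_urls)                # {'https://baike.baidu.com/item/Guido'}
print(new_data['title'])       # Python
print(new_data['summary'])     # Python is a programming language.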


5) The result writer, html_outputer.py:

# coding=utf-8
class HtmlOutputer(object):

    def __init__(self):
        self.datas = []   # buffer used by collect_data (a list, since we append)

    def collect_data(self, dataes):
        # optional: buffer parsed records instead of writing them immediately
        if dataes is None:
            return
        for data in dataes:
            self.datas.append(data)

    def output_html(self, new_data):
        # append one record to output.html as a small table
        # (each call appends a standalone fragment; browsers render it fine)
        f = open('output.html', 'a+', encoding='utf-8')
        f.write('<html>')
        f.write('<head><meta charset="utf-8"></head>')   # so the summary text displays correctly
        f.write('<body>')
        f.write('<table>')

        f.write('<thead>')
        f.write('<tr>')
        f.write('<th width="20%">URL</th>')
        f.write('<th width="20%">Title</th>')
        f.write('<th width="60%">Summary</th>')
        f.write('</tr>')
        f.write('</thead>')

        f.write('<tbody>')
        f.write('<tr>')
        f.write('<td style="color:orange;text-align:center">%s</td>' % new_data['url'])
        f.write('<td style="color:blue;text-align:center">%s</td>' % new_data['title'])
        f.write('<td style="color:red;text-align:center">%s</td>' % new_data['summary'])
        f.write('</tr>')
        f.write('</tbody>')
        f.write('</table>')
        f.write('</body>')
        f.write('</html>')
        f.close()
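
A one-record run of the writer (a sketch with placeholder data, assuming the class above is saved as html_outputer.py; open output.html in a browser afterwards to check the table):

from html_outputer import HtmlOutputer

outputer = HtmlOutputer()
outputer.output_html({
    'url': 'https://baike.baidu.com/item/Python',
    'title': 'Python',
    'summary': 'Python is a programming language.',
})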


Result: for some reason the screenshot wouldn't upload, so I'll leave it at that.