python 百度百科的爬虫实例
2018-01-26 13:49
471 查看
本实例用的python版本为3.6。
爬虫启动类 spider_main.py
url管理器 url_manager.py
html下载器 html_downer.py
html解析器 html_parser.py
html输出器 html_outputer.py
爬虫启动类 spider_main.py
url管理器 url_manager.py
html下载器 html_downer.py
html解析器 html_parser.py
html输出器 html_outputer.py
爬虫启动类 spider_main.py
from baike_spider import url_manager, html_downer, html_parser, html_outputer class SpiderMain(object): def __init__(self): self.urls = url_manager.UrlManager() self.downloader = html_downer.HtmlDownloader() self.parser = html_parser.HtmlParser() self.outputer = html_outputer.HtmlOutputer() def craw(self, begin_url): count = 1 self.urls.add_new_url(begin_url) while self.urls.has_new_url(): try: if count > 100: break new_url = self.urls.get_new_url() print("craw %d : %s" % (count, new_url)) html_content = self.downloader.download(new_url) new_urls, new_data = self.parser.parse(new_url, html_content) self.urls.add_new_urls(new_urls) self.outputer.collect_data(new_data) count += 1 except BaseException as e: print(e) print("craw fail") self.outputer.output_html() if __name__ == "__main__": root_url = "https://baike.baidu.com/item/Python/407313" obj_spider = SpiderMain() obj_spider.craw(root_url)
url管理器 url_manager.py
class UrlManager(object): def __init__(self): self.new_urls = set() self.old_urls = set() def add_new_url(self, begin_url): if begin_url is None: return if begin_url in self.new_urls or begin_url in self.old_urls: return self.new_urls.add(begin_url) def has_new_url(self): return len(self.new_urls) != 0 def get_new_url(self): new_url = self.new_urls.pop() self.old_urls.add(new_url) return new_url def add_new_urls(self, new_urls): if new_urls is None or len(new_urls) == 0: return for url in new_urls: self.add_new_url(url)
html下载器 html_downer.py
import ssl from urllib import request class HtmlDownloader(object): def download(self, new_url): ssl._create_default_https_context = ssl._create_unverified_context response = request.urlopen(new_url) print("请求返回码:%d" % response.getcode()) if response.getcode() != 200: return None return response.read()
html解析器 html_parser.py
import re from bs4 import BeautifulSoup from urllib import parse class HtmlParser(object): def parse(self, new_url, html_content): if html_content is None or new_url is None: return soup = BeautifulSoup(html_content, 'html.parser') new_urls = self._get_new_urls(new_url, soup) new_data = self._get_new_data(new_url, soup) return new_urls,new_data def _get_new_urls(self, new_url, soup): full_urls = set() links = soup.find_all("a", href=re.compile(r"/item/")) for link in links: url_href = link["href"] full_url_href = parse.urljoin(new_url, url_href) full_urls.add(full_url_href) return full_urls def _get_new_data(self, new_url, soup): res_data = {"url": new_url} # 获取title class="lemmaWgt-lemmaTitle-title" title = soup.find("dd", class_="lemmaWgt-lemmaTitle-title").find("h1").get_text() res_data["title"] = title # 获取摘要 class = "lemma-summary" summary = soup.find("div", class_="lemma-summary").get_text() res_data["summary"] = summary return res_data
html输出器 html_outputer.py
class HtmlOutputer(object): def __init__(self): self.datas = [] def collect_data(self, new_data): if new_data is None: return self.datas.append(new_data) def output_html(self): fout = open("output.html", 'w') fout.write("<html>") fout.write("<head>") fout.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">") fout.write("</head>") fout.write("<body>") fout.write("<table border = 1>") for data in self.datas: fout.write("<tr>") fout.write("<td>%s<td>" % data['url']) fout.write("<td>%s<td>" % data['title']) fout.write("<td>%s<td>" % data['summary']) fout.write("</tr>") fout.write("</table>") fout.write("</body>") fout.write("</html>") fout.close()
相关文章推荐
- Python爬虫爬取百度百科内容实例
- Python 爬虫实例(爬百度百科词条)
- Python爬虫----实例: 抓取百度百科Python词条相关1000个页面数据
- Python基础爬虫实战实例----爬取1000个Python百度百科词条及相关词条的标题和简介
- Python 爬虫实例(15) 爬取 百度百聘(微信公众号)
- [Python爬虫] Selenium获取百度百科旅游景点的InfoBox消息盒
- Python爬虫完整案例 - 爬取百度百科词条信息
- 第一个python爬虫(python3爬取百度百科1000个页面)
- 实践项目十:爬取百度百科Python词条相关1000个页面数据(慕课简单爬虫实战)
- python爬虫_自动获取seebug的poc实例
- python爬虫"Hello World"级入门实例(二),使用json从中国天气网抓取数据
- Python简易爬虫--抓取任意数目百度百科内容
- Python3.x爬虫下载网页图片的实例讲解
- python爬虫-百度百科词条
- Python爬虫获取图片并下载保存至本地的实例
- 【Python爬虫9】Python网络爬虫实例实战
- Python网络爬虫与信息提取-Day10-(实例)中国大学排名定向爬虫
- python3.5简单爬虫爬取百度百科(参考imooc实战)
- 基于正则表达式(python)对东方财富网上证指数吧爬虫实例
- Python爬虫爬取一个网页上的图片地址实例代码