
Using Eclipse + Python to write a crawler that fetches 1000 entries from the Baidu Baike Python encyclopedia

2016-03-29 21:33

The structure of the crawler



1. The crawler scheduler

Its job is to start and stop the crawler and to monitor how the crawl is going.

It works together with the URL manager, which holds both the URLs waiting to be crawled and the URLs that have already been crawled.

The URL manager hands URLs waiting to be crawled to the web page downloader, which downloads the page a URL points to and stores it as a string. That string is passed to the web page parser, which extracts the valuable data on the one hand and the URLs pointing to other pages on the other; those URLs are fed back into the URL manager. These three modules form a loop, and the crawler keeps running as long as there are relevant URLs left.
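Before the full implementation in step 3 below, here is a minimal sketch of that loop. The class and method names match the code in step 3; root_url stands for the entry URL.

# A minimal sketch of the scheduling loop described above.
url_manager = UrlManager()      # holds URLs to crawl and URLs already crawled
downloader = HtmlDownloader()   # downloads a page as a string
parser = HtmlParser()           # extracts data and outgoing URLs

url_manager.add_new_url(root_url)              # seed the loop with the entry URL
while url_manager.has_new_url():
    url = url_manager.get_new_url()
    html = downloader.download(url)            # the page as a string
    new_urls, data = parser.parse(url, html)   # valuable data + more URLs
    url_manager.add_new_urls(new_urls)         # feed new URLs back into the loop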

2. Steps to build the example crawler



1.  Define the target: Baidu Baike's Python entry and its related entry pages, extracting each page's title and summary.

2.  Analyze the target

1)  URL format

Entry page:
http://baike.baidu.com/view/125370.htm
Entry page URL format: /view/125370.htm
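The links embedded in an entry page are relative, so they have to be resolved against the page they were found on. A small check with Python 2's urlparse (the relative path here is just an example):

import urlparse

page_url = "http://baike.baidu.com/view/125370.htm"
print urlparse.urljoin(page_url, "/view/21087.htm")
# prints: http://baike.baidu.com/view/21087.htm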

2)  Page encoding: UTF-8

3)  Data format

Title format: <dd class="lemmaWgt-lemmaTitle-title"></dd>

Summary format: <div class="lemma-summary" label-module="lemmaSummary">
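A quick way to verify these two selectors, assuming html holds an entry page already fetched by the downloader (a sketch, not part of the original listing):

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, "html.parser", from_encoding="utf-8")
title_node = soup.find("dd", class_="lemmaWgt-lemmaTitle-title").find("h1")
summary_node = soup.find("div", class_="lemma-summary")
print title_node.get_text()
print summary_node.get_text()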

3.  Write the crawler

# coding: utf-8
# from: http://www.imooc.com/learn/563
# By: muqingcai

from bs4 import BeautifulSoup
import re
import urlparse
import urllib2
import os

 

# URL manager
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            raise Exception  # raise if the URL is None
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            raise Exception  # raise if the list is None or empty
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

# HTML downloader
class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()

# HTML parser
class HtmlParser(object):
    def _get_new_urls(self, page_url, soup):
        new_urls = []
        # collect all entry URLs of the form /view/123.htm
        links = soup.find_all("a", href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link["href"]
            # join new_url with page_url to form a complete, absolute URL
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.append(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        # url
        res_data["url"] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>
        title_node = soup.find("dd", class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data["title"] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find("div", class_="lemma-summary")
        res_data["summary"] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, "html.parser", from_encoding="utf-8")
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

 

# HTML outputer
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open("baike_spider_output.html", "w")
        fout.write("<html>")
        fout.write("<head>")
        fout.write('<meta charset="utf-8">')
        fout.write("<title>Data crawled from the Baidu Baike Python pages</title>")
        fout.write("</head>")
        fout.write("<body>")
        fout.write('<h1 style="text-align:center">Data crawled from Baidu Baike</h1>')
        fout.write("<table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data["url"])
            fout.write("<td><a href='%s'>%s</a></td>" % (data["url"].encode("utf-8"), data["title"].encode("utf-8")))
            fout.write("<td>%s</td>" % data["summary"].encode("utf-8"))
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()  # flush the output file to disk

 

 

class SpiderMain(object):
    def __init__(self):
        # wire the four components together
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, root_url, page_counts):
        count = 1  # index of the URL being crawled
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():  # while there are URLs left to crawl
            try:
                # take a new URL out of the manager
                new_url = self.urls.get_new_url()
                # report progress
                print "\ncrawled %d : %s" % (count, new_url)
                # download the page at that URL
                html_cont = self.downloader.download(new_url)
                # parse the page into data and new URLs
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                # feed the new URLs back into the URL manager
                self.urls.add_new_urls(new_urls)
                # collect the extracted data
                self.outputer.collect_data(new_data)

                # stop once page_counts links have been crawled
                if count == page_counts:
                    break
                count = count + 1
            except:
                print "crawl failed"

        # write out the collected data
        self.outputer.output_html()

 

if __name__ == "__main__":
    print "\nWelcome to baike_spider :)"

    root = raw_input("Enter the baike page to crawl: http://baike.baidu.com/view/")
    root_url = "http://baike.baidu.com/view/%s" % root  # crawler entry URL
    page_counts = int(raw_input("Enter how many pages to crawl: "))  # number of pages to crawl

    spider = SpiderMain()
    spider.craw(root_url, page_counts)  # start the crawler

    print "\nCrawl is done, please go to " + os.path.dirname(os.path.abspath(__file__)) + " to see the result in baike_spider_output.html"

4.  Run the crawler

Simply run the script with python; the console prints each crawled URL as it goes.
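A hypothetical session, assuming the listing is saved as baike_spider.py (the URLs crawled after the first depend on the links found on each page):

$ python baike_spider.py

Welcome to baike_spider :)
Enter the baike page to crawl: http://baike.baidu.com/view/125370.htm
Enter how many pages to crawl: 10

crawled 1 : http://baike.baidu.com/view/125370.htm

crawled 2 : http://baike.baidu.com/view/...
...

Crawl is done, please go to ... to see the result in baike_spider_output.html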




Back in Eclipse, select the package and press F5 to refresh it; baike_spider_output.html appears.


Open the file to see the crawled URLs, titles, and summaries laid out in a table.
Tags: python, crawler