Python Crawler -- Web Crawler Exercise 1
2017-12-26 15:52
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# filename : qiushibaike2.py
# author : zoujiameng@aliyun.com.cn

from urllib.request import (urlopen, Request, HTTPBasicAuthHandler,
                            HTTPPasswordMgrWithDefaultRealm, build_opener,
                            ProxyHandler, install_opener)
from urllib.parse import urlencode
from urllib.error import HTTPError, URLError
import socket

class HtmlUtils:
    # How should this be made a singleton?
    def __init__(self, timeout=2):
        self.default_timeout = timeout
        self.the_page = None
        socket.setdefaulttimeout(self.default_timeout)

    def getHtml(self, url):
        # Plain GET without any custom headers.
        response = urlopen(url, timeout=self.default_timeout)
        self.the_page = response.read()
        return self.the_page

    def getHtmlByRequest(self, url, user_agent=None):
        # GET via a Request object, optionally with a custom User-Agent.
        if user_agent is not None:
            request = Request(url, headers={'User-Agent': user_agent})
        else:
            request = Request(url)
        response = urlopen(request, timeout=self.default_timeout)
        self.the_page = response.read()
        return self.the_page

    def __getHtmlBySendDataInternal__(self, url, data, headers=None):
        # POST form data; urlencode() returns str, but Request needs bytes.
        values = urlencode(data).encode('utf-8')
        if headers is None:
            request = Request(url, values)
            request.add_header('Referer', 'http://www.python.org/')
        else:
            request = Request(url, values, headers)
        try:
            response = urlopen(request, timeout=self.default_timeout)
        except HTTPError as e:
            print("Http Error Code:", e.code)
            return None
        except URLError as e:
            if hasattr(e, "code"):
                print("URL Error Code:", e.code)
            if hasattr(e, "reason"):
                print("URL Error Reason:", e.reason)
            return None
        self.the_page = response.read()
        return self.the_page.decode("utf-8")

    def getHtmlBySendDataAndHeaders(self, url, data, user_agent):
        headers = {'User-Agent': user_agent}
        return self.__getHtmlBySendDataInternal__(url, data, headers)

    def getHtmlBySendData(self, url, data):
        return self.__getHtmlBySendDataInternal__(url, data)

    def savePageToFile(self, filename, dc="utf-8"):
        # the_page holds raw bytes, so write in binary mode; dc is kept for
        # callers that want to decode before saving.
        with open(filename, "wb") as f:
            f.write(self.the_page)

    def getHtmlByPwd(self, url, username, pwd, a_url):
        # HTTP Basic auth: register credentials for url, then fetch a_url.
        password_mgr = HTTPPasswordMgrWithDefaultRealm()
        password_mgr.add_password(None, url, username, pwd)
        handler = HTTPBasicAuthHandler(password_mgr)
        # Create an "opener" (OpenerDirector instance) and fetch the URL.
        opener = build_opener(handler)
        self.the_page = opener.open(a_url).read()
        # To make this opener the default for all urlopen() calls:
        #install_opener(opener)
        return self.the_page

    def getHtmlIfSupportProxy(self, url, ip, port):
        # Route the request through an HTTP proxy at ip:port.
        target = str(ip) + ':' + str(port)
        proxy_support = ProxyHandler({'http': target})
        opener = build_opener(proxy_support)
        install_opener(opener)
        self.the_page = urlopen(url, timeout=self.default_timeout).read()
        return self.the_page.decode("utf-8")

# for test...
if __name__ == "__main__":
    ht = HtmlUtils(3)
    URLIndex = "https://www.qiushibaike.com/hot/page/"
    user_agent = 'Mozilla/5.0 (compatible; MSIE 5.5; Ubuntu 14.04)'
    for page in range(1, 10):
        url = URLIndex + str(page)
        ht.getHtmlByRequest(url, user_agent)
        fn = "Myysite_" + str(page) + ".html"
        ht.savePageToFile(fn)
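The test block only exercises getHtmlByRequest. A minimal usage sketch for the other helpers follows; httpbin.org is used purely as a stand-in test endpoint, and the proxy address is a placeholder, neither of which appears in the original script:

# Hypothetical usage of the POST, Basic-auth and proxy helpers.
ht = HtmlUtils(3)
# POST form data with a custom User-Agent; returns the decoded response body.
print(ht.getHtmlBySendDataAndHeaders("https://httpbin.org/post",
                                     {"q": "python"},
                                     "Mozilla/5.0 (compatible)"))
# Fetch a page behind HTTP Basic auth (credentials registered for the site root).
ht.getHtmlByPwd("https://httpbin.org/", "user", "passwd",
                "https://httpbin.org/basic-auth/user/passwd")
# Fetch through a local HTTP proxy (placeholder address and port).
#ht.getHtmlIfSupportProxy("http://example.com/", "127.0.0.1", 8080)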
When the test block runs normally, all nine HTML files (Myysite_1.html through Myysite_9.html) are generated.
When something goes wrong, some of the files are not generated: the failed requests raise an HTTP Error.
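Since these HTTP errors are intermittent, one simple mitigation is to retry a failed page a few times before giving up. This is a sketch, not part of the original script; the retry count and delay are arbitrary choices:

# Retry wrapper around getHtmlByRequest; returns the page bytes, or None
# if every attempt fails.
import socket, time
from urllib.error import HTTPError, URLError

def fetch_with_retry(ht, url, user_agent, retries=3, delay=2):
    for attempt in range(retries):
        try:
            return ht.getHtmlByRequest(url, user_agent)
        except (HTTPError, URLError, socket.timeout) as e:
            print("attempt", attempt + 1, "failed for", url, ":", e)
            time.sleep(delay)
    return None

In the test loop, the direct call would then become:

if fetch_with_retry(ht, url, user_agent) is not None:
    ht.savePageToFile(fn)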