
Work in progress

2017-02-09
Summary: notes on 《用python写网络爬虫》 (Web Scraping with Python)

1. The simplest page-download code
import urllib2                            # use the urllib2 module
from sys import argv
script, urlo = argv                       # the URL is passed on the command line

def download(url):
    html = urllib2.urlopen(url).read()    # open the URL and read the response body into html
    print html

if __name__ == '__main__':
    download(urlo)
Run it from the command line:  python <script filename> <URL (http://……)>
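For example, assuming the script above is saved as download.py (the filename is just an illustration), using the site that the later sections crawl:

python download.py http://example.webscraping.com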
2. When downloading a page produces an error or exception, the code should handle the failure
import urllib2
from sys import argv
script, urlo = argv

def download(url):
    print 'Downloading:', url
    try:                                        # use try ... except to catch download errors
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Downloading Error:', e.reason
        html = None                             # nothing useful to return when the request fails
    print html

if __name__ == '__main__':
    download(urlo)
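To see the except branch in action, pass a hostname that does not resolve (invalid.example below is just a placeholder domain); the error is reported and None is printed instead of the script crashing with a traceback:

download('http://invalid.example')     # prints 'Downloading Error: ...' followed by None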
3. When a download fails, html = None and there is nothing to print. The failure may not be a problem with the request itself but a temporary error on the server side, so it is worth retrying the download. Add a retry-count parameter to download() to control how many times the download is retried, in case the server error has not yet cleared.
import urllib2
from sys import argv
script, urlo = argv

def download(url, num_retries=2):
    print 'Downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Downloading Error:', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:   # hasattr checks whether e carries an HTTP status code; only retry on 5xx server errors
                return download(url, num_retries - 1)        # recursive call to retry the download
    print html

if __name__ == '__main__':
    download(urlo)
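To see the retry logic fire, you need a URL that keeps answering with a 5xx status; a service such as http://httpstat.us/500 (assuming it is reachable) does exactly that. With the default num_retries=2 the function makes one initial attempt plus two retries, three requests in total, before printing None:

download('http://httpstat.us/500')     # 'Downloading:' is printed three times before giving up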
4. Add the ability to set the user agent, so the crawl is not refused or banned for using urllib2's default user agent
import urllib2
from sys import argv
script, urlo = argv

def download(url, user_agent='wswp', num_retries=2):
    print 'Downloading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)      # urllib2.Request lets you do two extra things with the HTTP request: send form data in the body, and send extra metadata about the request as HTTP headers; here it carries the User-agent header
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'Downloading Error:', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries - 1)
    print html

if __name__ == '__main__':
    download(urlo)
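Without the headers argument, urllib2 announces itself with its default 'Python-urllib/2.x' user agent, which some sites refuse outright. A quick check that the custom agent is attached to the request (the URL is simply the example site used later in this post):

request = urllib2.Request('http://example.webscraping.com', headers={'User-agent': 'wswp'})
print request.get_header('User-agent')      # prints 'wswp'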
5. The code above downloads the page at a single link. To download links of a particular type, or a large number of pages, we need a function that crawls a site for links
import re
import urlparse        # uses download() from section 4 (it must return html rather than just print it, as in the final version in section 8)

def link_crawler(seed_url, link_regex):
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            if re.match(link_regex, link):                  # keep only links matching the regular expression
                link = urlparse.urljoin(seed_url, link)     # turn relative links on the page into absolute URLs
                crawl_queue.append(link)                    # queue the matching link for crawling

def get_links(html):
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)   # extract the href values of all <a> tags
    return webpage_regex.findall(html)
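The urljoin call is what turns site-relative links such as /index/1 (the path here is only an illustration) into absolute URLs that can be queued:

>>> import urlparse
>>> urlparse.urljoin('http://example.webscraping.com', '/index/1')
'http://example.webscraping.com/index/1'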
6. A newly downloaded page may contain links to pages that have already been downloaded, which would then be downloaded again. To avoid duplicate downloads, keep a record of the links already seen
import re
import urlparse        # uses download() from section 4 (it must return html, as in the final version in section 8)

def link_crawler(seed_url, link_regex):
    crawl_queue = [seed_url]
    seen = set(crawl_queue)                 # set of links that have already been seen
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            if re.match(link_regex, link):                  # keep only links matching the regular expression
                link = urlparse.urljoin(seed_url, link)     # turn relative links into absolute URLs
                if link not in seen:                        # skip links that have already been queued
                    seen.add(link)
                    crawl_queue.append(link)                # queue the new link for crawling

def get_links(html):
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)
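With the seen set in place, the same call that the final section makes (it relies on a download() that returns the page, as in section 8) visits each matching URL at most once:

link_crawler('http://example.webscraping.com', '/(index|view)')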
7. Add the ability to parse the site's robots.txt file, so the crawler does not download URLs it is forbidden to crawl
import re
import urlparse
import robotparser

def link_crawler(seed_url, link_regex, user_agent='wswp'):   # user_agent is needed for the robots.txt check
    crawl_queue = [seed_url]
    seen = set(crawl_queue)                 # set of links that have already been seen
    rp = get_robots(seed_url)
    while crawl_queue:
        url = crawl_queue.pop()
        if rp.can_fetch(user_agent, url):   # is our user agent allowed to fetch this URL?
            html = download(url)
            for link in get_links(html):
                if re.match(link_regex, link):                  # keep only links matching the regular expression
                    link = urlparse.urljoin(seed_url, link)     # turn relative links into absolute URLs
                    if link not in seen:
                        seen.add(link)
                        crawl_queue.append(link)                # queue the new link for crawling
        else:
            print 'Blocked by robots.txt:', url

def get_links(html):
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)

def get_robots(url):                        # use the robotparser module to parse robots.txt, so disallowed URLs are not downloaded
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))      # robots.txt lives at the root of the site
    rp.read()
    return rp
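As a standalone check of the robots.txt handling; the expected True/False values are an assumption based on the demo at the end of this post, where the example site refuses the 'BadCrawler' user agent:

rp = get_robots('http://example.webscraping.com')
print rp.can_fetch('GoodCrawler', 'http://example.webscraping.com/index/1')   # expected: True
print rp.can_fetch('BadCrawler', 'http://example.webscraping.com/index/1')    # expected: False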
8. The combined code: crawling links and downloading pages
import re
import urllib2
import urlparse
import robotparser
#from sys import argv

#script, urlo = argv

def link_crawler(seed_url, link_regex, user_agent='wswp', proxy=None):
    crawl_queue = [seed_url]
    seen = set(crawl_queue)                     # set of links that have already been seen
    rp = get_robots(seed_url)
    while crawl_queue:
        url = crawl_queue.pop()
        if rp.can_fetch(user_agent, url):       # is our user agent allowed to fetch this URL?
            html = download(url, proxy=proxy)
            if html is None:                    # skip pages that failed to download
                continue
            for link in get_links(html):
                if re.match(link_regex, link):                  # keep only links matching the regular expression
                    link = urlparse.urljoin(seed_url, link)     # turn relative links into absolute URLs
                    if link not in seen:
                        seen.add(link)
                        crawl_queue.append(link)                # queue the new link for crawling
        else:
            print 'Blocked by robots.txt:', url

def get_links(html):
    """Return a list of links found in html"""
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)

def get_robots(url):
    """Initialize the robots.txt parser for this domain"""
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp

def download(url, proxy=None, user_agent='wswp', num_retries=2):
    print 'Downloading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}   # e.g. {'http': proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
    except urllib2.URLError as e:
        print 'Downloading error:', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:      # only retry on 5xx server errors
                return download(url, proxy, user_agent, num_retries - 1)
    return html

if __name__ == '__main__':
    # the example site is expected to allow 'GoodCrawler' ...
    link_crawler('http://example.webscraping.com', '/(index|view)', user_agent='GoodCrawler')
    # ... and its robots.txt to refuse 'BadCrawler', exercising the 'Blocked by robots.txt' branch
    link_crawler('http://example.webscraping.com', '/(index|view)', user_agent='BadCrawler')
Tags: crawler