scraping_Writing the First Web Crawler_Final Version
2017-05-19 14:14
Below is the first web crawler I learned to write: my own version, shown side by side with the example version from the book.
1. The final version I wrote myself
```python
import re                   # regular expressions for link extraction
import time                 # needed by the download throttle (time.sleep)
import datetime             # needed by the download throttle (timestamps)
import urllib.request
import urllib.error
import urllib.parse         # turns relative URLs from the page source into absolute URLs
import urllib.robotparser   # parses the site's robots.txt before crawling, to avoid pages the site forbids or restricts


def download(url, user_agent="brain", proxy=None, num_retries=2):
    """Download the page at url. proxy enables proxy support; it defaults to None and can simply be passed in when needed."""
    print("downloading:", url)
    header = {"user-agent": user_agent}  # set our own user agent instead of the default Python-urllib/3.6
    req = urllib.request.Request(url, headers=header)
    opener = urllib.request.build_opener()  # build an opener so proxy support can be added
    if proxy:  # if a proxy was given, register it so requests go through it
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(req).read()
    except urllib.error.URLError as e:  # something went wrong during the download
        print("download error:", e.reason)
        html = None
        if num_retries > 0:
            # 4XX errors mean the request itself is bad, while 5XX errors are server-side,
            # so only retry the download on 5XX errors
            if hasattr(e, "code") and 500 <= e.code < 600:
                return download(url, user_agent, proxy, num_retries - 1)  # recursively retry 5XX HTTP errors
    return html

# download("http://example.webscraping.com")  # works normally
# download("http://httpstat.us/500")          # test page that always returns a 5XX error


# A crawler that follows links.
# link_crawler() takes two main arguments: the seed URL to crawl and a regular expression for the links to follow.
def link_crawler(seed_url, link_regex, max_depth=2):
    """Download the source of seed_url, extract all link URLs from it, and match each one against link_regex.
    Every matching link is pushed onto the queue, and the next pass of the while crawl_queue: loop repeats
    the same steps on it. This continues until crawl_queue is empty, at which point the function returns.
    """
    crawl_queue = [seed_url]
    # To avoid crawler traps, seen is a dict rather than a set, so it also records the depth at which each link was reached.
    # To disable the depth limit, pass a negative max_depth: the current depth can then never equal it.
    seen = {seed_url: 0}  # the seed URL starts at depth 0
    # seen = set(crawl_queue)  # links may point to one another; recording visited links avoids crawling the same page twice
    while crawl_queue:
        url = crawl_queue.pop()
        # parse the site's robots.txt before crawling, to check whether this URL may be fetched
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url("http://example.webscraping.com/robots.txt")
        rp.read()
        user_agent = "brain"
        if rp.can_fetch(user_agent, url):  # robots.txt allows this fetch, so continue
            # throttle the download rate: wait before every call to download()
            throttle = Throttle(delay=5)  # the example site's robots.txt specifies a crawl delay of 5 seconds
            throttle.wait(url)
            html = download(url)  # headers, proxy=proxy, num_retries=num_retries could also be passed here
            if html is None:
                continue
            html = str(html)
            # filter for links matching our regular expression
            depth = seen[url]  # depth of this URL, used to avoid crawler traps
            if depth != max_depth:
                for link in get_links(html):
                    if re.match(link_regex, link):
                        # join the relative path from the source (e.g. /view/Poland-178) with the seed URL to get an absolute URL
                        link = urllib.parse.urljoin(seed_url, link)
                        if link not in seen:          # skip links that were already crawled
                            seen[link] = depth + 1    # one level deeper than the page it was found on
                            crawl_queue.append(link)  # new link, queue it for further crawling
        else:
            print("Blocked by robots.txt: %s" % url)
            continue


def get_links(html):
    """Return all link URLs found in an HTML page."""
    # webpage_regex matches <a href="xxx"> or <a href='xxx'> and captures the xxx URL.
    # Note that these URLs are usually relative paths from the source (e.g. view/1) and cannot be opened directly.
    webpage_regex = re.compile('<a href=["\'](.*?)["\']', re.IGNORECASE)
    return re.findall(webpage_regex, html)
    # return re.findall('<a[^>]+href=["\'](.*?)["\']', html) also works, but compiling the pattern first is preferable


class Throttle:
    """Add a delay between downloads to the same domain; call wait() before every download()."""
    def __init__(self, delay):
        self.delay = delay   # delay between downloads for each domain
        self.domains = {}    # timestamp of when each domain was last accessed

    def wait(self, url):
        domain = urllib.parse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)  # the domain was accessed recently, so sleep
        self.domains[domain] = datetime.datetime.now()


# only follow links like http://example.webscraping.com/index... or http://example.webscraping.com/view...
link_crawler("http://example.webscraping.com", "/(index|view)")
```
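The comments in link_crawler above note that the links pulled out of the page source are relative paths that first have to match link_regex and are then joined with the seed URL. The short sketch below is not part of the original code (the sample links are made up for illustration); it shows that filtering and conversion step in isolation:

```python
import re
import urllib.parse

seed_url = "http://example.webscraping.com"
link_regex = "/(index|view)"

# Hypothetical links as they might appear in the page source: two relative
# paths that match the pattern, and one fragment link that should be skipped.
sample_links = ["/index/1", "/view/Afghanistan-1", "#pagination"]

for link in sample_links:
    if re.match(link_regex, link):
        # urljoin turns the relative path into an absolute URL on the seed domain
        print(urllib.parse.urljoin(seed_url, link))

# Expected output:
# http://example.webscraping.com/index/1
# http://example.webscraping.com/view/Afghanistan-1
```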
2. The final version provided by the example site (for reading only; the example code is written in Python 2)
```python
import re
import urlparse
import urllib2
import time
from datetime import datetime
import robotparser
import Queue


def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1, headers=None, user_agent='wswp', proxy=None, num_retries=1):
    """Crawl from the given seed URL following links matched by link_regex
    """
    # the queue of URL's that still need to be crawled
    crawl_queue = Queue.deque([seed_url])
    # the URL's that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many URL's have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)
    throttle = Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent

    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url, headers, proxy=proxy, num_retries=num_retries)
            links = []

            depth = seen[url]
            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html) if re.match(link_regex, link))

                for link in links:
                    link = normalize(seed_url, link)
                    # check whether already crawled this link
                    if link not in seen:
                        seen[link] = depth + 1
                        # check link is within same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to queue
                            crawl_queue.append(link)

            # check whether have reached downloaded maximum
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print 'Blocked by robots.txt:', url


class Throttle:
    """Throttle downloading by sleeping between requests to same domain
    """
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                # domain has been accessed recently, so need to sleep
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()


def download(url, headers, proxy, num_retries, data=None):
    print 'Downloading:', url
    request = urllib2.Request(url, data, headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                return download(url, headers, proxy, num_retries-1, data)
        else:
            code = None
    return html


def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)


def same_domain(url1, url2):
    """Return True if both URL's belong to same domain
    """
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc


def get_robots(url):
    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp


def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, user_agent='BadCrawler')
    link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, max_depth=1, user_agent='GoodCrawler')
```
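The book's version is written for Python 2, so it will not run as-is under Python 3. As a rough porting guide (my own assumption, not part of the book's code), the module renames below are the main change; apart from converting the print statements to print() calls, the crawling logic itself can stay the same:

```python
# Approximate Python 3 equivalents of the Python 2 imports used above
# (an assumed mapping for porting, not part of the original example).
import re
import time
from datetime import datetime
from collections import deque              # replaces Queue.deque
import urllib.parse as urlparse            # replaces urlparse
import urllib.request                      # replaces urllib2 (Request, build_opener, ProxyHandler)
import urllib.error                        # urllib2.URLError lives here now
import urllib.robotparser as robotparser   # replaces robotparser

# e.g. the crawl queue then becomes: crawl_queue = deque([seed_url])
```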
3. Testing the crawler
We can set the user agent to BadCrawler, the user agent that, as described earlier in this chapter, is blocked by robots.txt. As the output below shows, the crawler is indeed blocked and exits almost immediately after starting:

```python
>>> seed_url = "http://example.webscraping.com/index"
>>> link_regex = "/(index|view)"
>>> link_crawler(seed_url, link_regex, user_agent="BadCrawler")
Blocked by robots.txt: http://example.webscraping.com/
```

Now let's use the default user agent and set the maximum depth to 1, so that only the links on the home page are downloaded:

```
downloading: http://example.webscraping.com
downloading: http://example.webscraping.com/index/1
downloading: http://example.webscraping.com/index/2
downloading: http://example.webscraping.com/index/0
downloading: http://example.webscraping.com/view/Barbados-20
downloading: http://example.webscraping.com/view/Bangladesh-19
downloading: http://example.webscraping.com/view/Bahrain-18
downloading: http://example.webscraping.com/view/Bahamas-17
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Azerbaijan-16
downloading: http://example.webscraping.com/view/Austria-15
downloading: http://example.webscraping.com/view/Australia-14
downloading: http://example.webscraping.com/view/Aruba-13
downloading: http://example.webscraping.com/view/Armenia-12
downloading: http://example.webscraping.com/view/Argentina-11
downloading: http://example.webscraping.com/view/Antigua-and-Barbuda-10
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Antarctica-9
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Anguilla-8
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Angola-7
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Andorra-6
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/American-Samoa-5
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Algeria-4
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Albania-3
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Aland-Islands-2
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Afghanistan-1
download error: TOO MANY REQUESTS
```
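To see why the BadCrawler user agent is rejected while the default one is allowed, the standalone sketch below parses a robots.txt with urllib.robotparser. The inline robots.txt content is an assumption modeled on the rules described for the example site (the BadCrawler agent is disallowed and a 5-second crawl delay is requested); the live site's file may differ:

```python
import urllib.robotparser

# Assumed robots.txt content modeled on the example site's rules.
robots_txt = """\
User-agent: BadCrawler
Disallow: /

User-agent: *
Crawl-delay: 5
"""

rp = urllib.robotparser.RobotFileParser()
rp.parse(robots_txt.splitlines())

url = "http://example.webscraping.com/index"
print(rp.can_fetch("BadCrawler", url))  # False: blocked, so the crawler exits immediately
print(rp.can_fetch("brain", url))       # True: the default agent may crawl
```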