
A Beginner's Introduction to Python Web Scraping

2017-03-16 17:46

1. How do I collect all the URLs of a website?

#coding:utf-8
import urllib2
import re
import urlparse

#download fetches a page, sending a custom User-agent header and retrying
#up to num_retries times when the server returns a 5xx error
def download(url, user_agent='wswp', num_retries=2):
    print 'DownLoading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'DownLoad error', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries - 1)
    return html
#crawl_sitemap downloads a sitemap page and then downloads every page
#linked from it via <a href="...">
def crawl_sitemap(url):
    sitemap = download(url)
    links = re.findall('<a href="(\S*)">', sitemap)
    for link in links:
        html = download(link)
#crawl_sitemap('http://sitemap.163.com/')

#link_crawler crawls outward from a seed URL, queueing every link that
#matches link_regex and has not been seen before
def link_crawler(seed_url, link_regex):
    crawl_queue = [seed_url]
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urlparse.urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)

#get_links extracts every href value from the anchor tags in a page
def get_links(html):
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)
Finally, call link_crawler('http://map.163.com', link_regex) with a suitable link_regex and the crawler will visit every matching URL reachable from the seed page; a sketch of such a call follows.
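A minimal sketch of such a call, assuming we only want to follow links whose path begins with /news (the regex is a hypothetical filter, not one given in the original post):

#Hypothetical filter: only follow hrefs that start with /news
link_crawler('http://map.163.com', '/news')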

2. How do I extract the links from a page?

1. With a regular expression
import re
import urllib2

#Same download helper as in part 1: fetch the page, retrying on 5xx errors
def download(url, user_agent='wswp', num_retries=2):
    print 'DownLoading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'DownLoad error', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries - 1)
    return html

url = 'http://sitemap.163.com/'
html = download(url)
print re.findall('<a href="(\S*)">', html)
2. With BeautifulSoup
from bs4 import BeautifulSoup

#Read a local HTML file and print every href found in its anchor tags
file_object = open('index.html')
try:
    html = file_object.read()
finally:
    file_object.close()
soup = BeautifulSoup(html, 'html.parser')
for link in soup.find_all('a'):
    print(link.get('href'))
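The same parsing step also works on a page fetched with the download() helper from part 1 instead of a local file; a minimal sketch, reusing the sitemap URL from above:

#Parse a live page rather than a local file
html = download('http://sitemap.163.com/')
if html:
    soup = BeautifulSoup(html, 'html.parser')
    for link in soup.find_all('a'):
        print(link.get('href'))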
3. With lxml, as sketched below.
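The original post gives no code for lxml; the following is only a minimal sketch, assuming the lxml package is installed and reusing the same local index.html file as the BeautifulSoup example:

import lxml.html

#Parse the local file and pull out every href attribute with an XPath query
html = open('index.html').read()
tree = lxml.html.fromstring(html)
for link in tree.xpath('//a/@href'):
    print(link)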