
Python 3.3 lxml + BeautifulSoup crawler notes

2016-08-16 12:55
1. Install Python 3.3.
2. Install pip.
3. Install the bs4 and lxml packages. Install bs4 with `pip install bs4` (or the bs4 .exe installer). For installing lxml, see: http://blog.csdn.net/qq_23438131/article/details/52222489
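Optionally, a quick check that both packages installed cleanly (a minimal sketch, using only the standard version attributes that bs4 and lxml expose):
# Verify that bs4 and lxml are importable and print their versions.
import bs4
import lxml.etree
print("bs4:", bs4.__version__)
print("lxml:", ".".join(map(str, lxml.etree.LXML_VERSION)))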
4. Control the encoding format. Note: the snippet below is a Python 2 idiom; `reload(sys)` and `sys.setdefaultencoding` do not exist in Python 3.3, where `str` is already Unicode and source files default to UTF-8:
#coding:utf-8
# Python 2 only -- under Python 3.3 this raises a NameError;
# in Python 3, declare encodings explicitly when decoding/encoding instead.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
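In Python 3 the equivalent is simply to name the encoding wherever bytes meet text; a minimal sketch (the file name is hypothetical):
# Python 3: specify encodings explicitly instead of changing a global default.
raw = "中文测试".encode("utf-8")               # bytes as they arrive off the wire
text = raw.decode("utf-8", errors="replace")   # explicit decode
with open("out.txt", "wt", encoding="utf-8") as f:
    f.write(text)                              # explicit encode on write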
5. Import bs4:
import bs4
from bs4 import BeautifulSoup as bs
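A quick way to confirm the import and the lxml backend work together (a minimal sketch; the HTML fragment is made up for illustration):
# Parse a small HTML fragment with the lxml backend and pull out a link.
html = '<div id="page"><a href="/ns?word=test&pn=10">Next</a></div>'
soup = bs(html, "lxml")            # parse with the lxml backend
link = soup.select("#page")[0].a   # CSS select, then tag navigation
print(link["href"], link.get_text())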
6. Search Baidu News by keyword:
#coding:utf-8
import bs4
from bs4 import BeautifulSoup as bs
import urllib.parse
import urllib.request
import functools
import re
import time
from time import sleep
#import socket
#socket.setdefaulttimeout(3)

class BaiduSpider(object):
    def __init__(self, word, max_link):
        self._word = word
        self._max_link = max_link
        p = {"word": word}
        self._start_url = "http://www.news.baidu.com/ns?" + urllib.parse.urlencode(p)

    def _get_links(self):
        links = []
        links.append(self._start_url)
        try:
            soup = bs(self._get_html(self._start_url), "lxml")
            links_tag = soup.select("#page")
        except AttributeError as e_Att:
            print(e_Att)
            time.sleep(10)
            return self._get_links()
        if 0 != len(links_tag):
            links_tag = links_tag[0]
            # get the second-page link
            for child in links_tag.children:
                attr = child.attrs
                if attr:
                    links.append("http://www.news.baidu.com" + attr["href"])
                    break
            # get the 20~800 news links by rewriting the &pn= offset
            for i in range(20, 810, 10):
                link_temp = links[1].__str__()
                PatternObj = re.compile('&pn=(\\d)+?&')
                newLink = PatternObj.subn('&pn=' + str(i) + '&', link_temp)
                links.append(str(newLink[0]))
        end = self._max_link if self._max_link < len(links) else len(links)
        return links[:end]

    def _rightTime(self, summary):
        '''Check whether the time in the summary falls between 2016-06-01 and now.
        Summary examples:
        中国基金网  14小时前
        网易新闻  2016年08月12日 16:35
        '''
        # convert 2016-06-01 into a timestamp
        try:
            startDate_str = '2016-06-01'
            startTime = time.mktime(time.strptime(startDate_str, '%Y-%m-%d'))
            a = summary.split()
            time_in_text = a[1]
            if '年' in time_in_text:
                time_in_text = time_in_text.split(" ")[0]
                time_in_text = time_in_text.replace("年", '-').replace("月", '-').replace("日", '')
                textTime = time.mktime(time.strptime(time_in_text, '%Y-%m-%d'))
                if (float(textTime)) <= (float(startTime)):
                    return False
            return True
        except ValueError:
            print(time_in_text)

    def _get_html(self, link):
        res = urllib.request.urlopen(link)
        return res.read().decode("utf-8")

    def _get_html_Content_post(self, link, f_error, retries):
        print(link, 'open the link using the post method:', time.time())
        html_content = ''
        try:
            request = urllib.request.Request(link)
            res = urllib.request.urlopen(request, timeout=3)
            html_content = res.read()
        except Exception as e:  # the crawler hung or hit another error; retry, up to 3 attempts
            print(link + '\n')
            print(e)
            f_error.write(link + '\n')
            if retries:
                return self._get_html_Content_post(link, f_error, retries - 1)
        print('close:', time.time())
        return html_content

    def _get_html_Content(self, link, f_error, retries=2):
        print(link, '\n', 'open the link:', time.time())
        html_content = ''
        try:
            user_agent = 'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)'
            headers = {'User-Agent': user_agent}
            request = urllib.request.Request(link)
            request.add_header('User-Agent', user_agent)
            #timeout=2
            res = urllib.request.urlopen(request, timeout=3)
            html_content = res.read()
        except Exception as e:  # the crawler hung or hit another error; fall back to the "post" helper
            print(link + '\n')
            print(e)
            f_error.write(link + '\n')
            if retries:
                return self._get_html_Content_post(link, f_error, retries=3)
        print('close:', time.time())
        return html_content

    def _get_content(self, content):
        # first convert the bs4.element.NavigableString items to plain strings
        return functools.reduce(lambda x, y: x + y,
                                map(lambda x: x.replace("<em>", "").replace("</em>", ""),
                                    map(lambda x: x.string, content)))

    def _spiderDetail(self, link, f_error, Verbdic):
        '''
        input: link, f_error
        output: the paragraphs that contain one of the target verbs
        The URLs collected in the first step point to news pages on many different
        sites, so there is no single page structure common to all of them; most
        news sites put the article text inside <p> tags, so the content is
        extracted as follows.
        '''
        html_content = self._get_html_Content(link, f_error, retries=2)
        contents = ''
        if html_content != '':
            soup = bs(html_content, "lxml")
            #reg = u".+?带领"
            #Res = re.compile(reg)
            #contents = soup.findAll(name="p", text=Res)
            contents = '<p>'
            iter = []
            nodes_p = soup.find_all(name='p')
            for n in nodes_p:
                p_cont = n.get_text(strip=True)
                for ver in Verbdic:
                    if ver in p_cont:
                        iter.append(p_cont)
                        break
            contents = contents.join(iter)
        return contents

    def _spider(self, f, f_error, Verbdic):
        '''Search the Baidu News list pages by keyword and extract each item's
        title, source and time, link, and the text of the linked page.'''
        total_links = self._get_links()
        print(total_links)
        for i, l in enumerate(total_links):
            print("Page {0}".format(i + 1))
            soup = bs(self._get_html(l), "lxml")
            # find the root node of the left-hand content area
            left_div = soup.select("#content_left")[0]
            # each qualifying child div is one block in the news result list
            for child_div in left_div.children:
                if isinstance(child_div, bs4.element.Tag) and child_div.div and child_div.div.get('class') and 'result' in child_div.div['class']:
                    base_div = child_div
                    childs = base_div.children
                    for child in childs:
                        title = child.select(".c-title")[0]
                        summary = ""
                        summary = summary.join(self._get_content(child.select(".c-summary")[0].p.contents))
                        a_link = title.a["href"]
                        titlename = ""
                        titlename = titlename.join(self._get_content(title.a.contents))
                        # crawl the news content page itself
                        content = ''
                        if self._rightTime(summary):
                            content = self._spiderDetail(a_link, f_error, Verbdic)
                        f.write('标题:' + titlename + '\t来源及时间:' + summary + '\t链接:' + a_link + '\t新闻内容:' + content + "\n")

    def start(self, f, f_error, Verbdic):
        self._spider(f, f_error, Verbdic)

if '__main__' == __name__:
    '''f stores the crawl results;
    f_error stores the links whose news content could not be read'''
    Verbdic = ['协同', '协助']
    with open("links2.txt", 'wt', encoding='utf-8') as f, open("logError2.txt", 'wt') as f_error, open("overVerb.txt", 'wt') as f_over:
        for keyword in Verbdic:
            baidu_spider = BaiduSpider(keyword, 800)
            baidu_spider.start(f, f_error, Verbdic)
            f_over.write(keyword + '\n')
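The paging logic in `_get_links` is worth seeing in isolation: Baidu's result pages differ only in the `&pn=` offset (10 results per page), so the page-2 link can be rewritten into all the others with `re.subn`. A minimal sketch with a made-up URL of the same shape:
import re

# Rewriting the &pn= offset yields the URLs for later pages from the page-2 link.
second_page = "http://www.news.baidu.com/ns?word=test&pn=10&cl=2"
pattern = re.compile(r'&pn=(\d)+?&')
for offset in range(20, 50, 10):                    # first few offsets only
    new_link, n_subs = pattern.subn('&pn=%d&' % offset, second_page)
    print(new_link)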
7. Crawler problems:
1. The Python program hangs. Causes include anti-crawler measures on the linked site, using the wrong GET/POST method, network problems, and so on.
Solution 1: impersonate a browser:
            user_agent = 'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)'
            headers = {'User-Agent': user_agent}
            request = urllib.request.Request(link)
            request.add_header('User-Agent', user_agent)
Solution 2: retry on timeout:
<span style="white-space:pre">	</span>try:        request = urllib.request.Request(link)            
<span style="white-space:pre">	</span>    res =urllib.request.urlopen(request,timeout=3)html_content = res.read()except Exception as e:       #爬虫卡住或其他异常,则再次尝试,尝试用post方式打开print(link+'\n')print(e)f_error.write(link+'\n')if retries:return self._get_html_Content_post(link, f_error,retries=3)
Solution 3: if the page cannot be opened while impersonating a browser (i.e. the GET request fails), fall back to the "post" helper:
    def _get_html_Content_post(self, link, f_error, retries):
        print(link, 'open the link using the post method:', time.time())
        html_content = ''
        try:
            request = urllib.request.Request(link)
            res = urllib.request.urlopen(request, timeout=3)
            html_content = res.read()
        except Exception as e:  # the crawler hung or hit another error; retry, up to 3 attempts
            print(link + '\n')
            print(e)
            f_error.write(link + '\n')
            if retries:
                return self._get_html_Content_post(link, f_error, retries - 1)
        print('close:', time.time())
        return html_content
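Note that `urllib.request.urlopen` only issues a POST when a `data` payload is supplied; the helper above, despite its name, still sends a GET. An actual POST looks like the sketch below (the URL and form fields are hypothetical):
import urllib.parse
import urllib.request

# urlopen switches to POST as soon as a bytes `data` payload is passed.
form = urllib.parse.urlencode({"word": "test"}).encode("utf-8")
req = urllib.request.Request("http://www.example.com/search", data=form)
try:
    body = urllib.request.urlopen(req, timeout=3).read()
except Exception as e:
    print(e)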