用python和BeautifulSoup抓取百度搜索结果10-20页面中的网站链接
2013-05-09 17:43
429 查看
#-*-coding=utf-8-*-
# author: zhangle
"""Crawl Baidu search result pages 10-20 for a keyword, resolve each
result's redirect to its real URL, and append title + URL to info.txt."""
import urllib2
import urllib
import chardet
import thread
import threading
from bs4 import BeautifulSoup as BS

# Serializes appends to info.txt across the worker threads spawned below.
myLock = threading.RLock()


class GetUrls(object):
    """Walks Baidu result pages for one search term; from page 10 onward
    it resolves every result link and records title + URL in info.txt."""

    # 1-based index of the result page currently being fetched.
    pageCount = 1
    # The 'key' placeholder is replaced with the search term in __init__.
    search_url = 'http://www.baidu.com/s?wd=key'
    req_header = {'User-Agent':'Mozilla/5.0(Windows;U;Windows NT 6.1;en-US; rv:1.9.1.6)Gecko/20091201 Firefox/3.5.6'}

    def __init__(self, inputInfo):
        # inputInfo: the search keyword (utf-8 str).
        self.inputInfo = inputInfo
        GetUrls.search_url = GetUrls.search_url.replace('key', self.inputInfo)

    def __detectCode(self, url):
        """Return the lowercase charset of the page at *url*.

        Tries the HTTP Content-Type charset first, then chardet on the
        body, and finally falls back to 'utf-8'."""
        htmlInfo = urllib.urlopen(url).info()
        coding = htmlInfo.getparam('charset')
        if coding is None:
            body = urllib.urlopen(url).read()
            coding = chardet.detect(body)['encoding']
            if coding is None:
                coding = 'utf-8'
        return coding.lower()

    def __getTitle(self, url):
        """Fetch *url* and return its <title> text, or None on any
        HTTP/URL error or when the page has no <title> element."""
        coding = self.__detectCode(url)
        try:
            titleReq = urllib2.Request(url, None, GetUrls.req_header)
            html = urllib2.urlopen(titleReq).read()
            titleSoup = BS(html.decode(coding, 'ignore'))
            # Fix: title-less pages made titleSoup.title None, so
            # .string raised AttributeError past the except clauses.
            if titleSoup.title is None:
                return None
            return titleSoup.title.string
        except (urllib2.HTTPError, urllib2.URLError):
            return None

    def __getInfo(self, redUrl):
        """Worker-thread body: append the page title and URL to info.txt.

        Fix: the original created myLock but left acquire/release
        commented out, so concurrent threads interleaved their writes.
        The fetch happens outside the lock; only the file append is
        serialized."""
        title = self.__getTitle(redUrl)
        if not title:
            return
        myLock.acquire()
        try:
            with open('info.txt', mode='a') as a_file:
                a_file.write(title.encode('gbk', 'ignore') + '\n')
                a_file.write(redUrl + '\n\n')
        finally:
            myLock.release()

    def __searchUrls(self, url):
        """Fetch one result page; on pages >10 spawn one thread per
        result to resolve and record it, then follow the next-page link
        (recursing) until page 20."""
        if GetUrls.pageCount > 20:
            return
        req = urllib2.Request(url, None, GetUrls.req_header)
        html = urllib2.urlopen(req).read()
        soup = BS(html.decode('utf-8', 'ignore'))
        # Collect the ~100 result links spanning pages 10 through 20.
        if GetUrls.pageCount > 10:
            for hh in soup.find_all('h3'):
                # Baidu result anchors are redirectors; resolve the
                # final URL so the real site address is recorded.
                urlInPage = hh.a.get('href')
                try:
                    req = urllib2.Request(urlInPage, None, GetUrls.req_header)
                    redUrl = urllib2.urlopen(req).geturl()
                except (urllib2.HTTPError, urllib2.URLError):
                    # Fix: the original caught only HTTPError, so a
                    # URLError (DNS/connection failure) killed the crawl.
                    redUrl = urlInPage
                thread.start_new_thread(self.__getInfo, (redUrl,))
        GetUrls.pageCount += 1
        # Follow the pager <span> labelled with the next page number.
        # Fix: stop cleanly when it is absent instead of IndexError.
        pNode = soup.find_all('span', text=GetUrls.pageCount)
        if not pNode:
            return
        nextUrl = 'http://www.baidu.com' + pNode[0].parent.get('href')
        self.__searchUrls(nextUrl)

    def UrlParse(self):
        """Public entry point: start crawling from the first result page."""
        self.__searchUrls(GetUrls.search_url)


if __name__ == '__main__':
    getUrlInfo = GetUrls('挖掘机')
    getUrlInfo.UrlParse()
相关文章推荐
- Python实现抓取百度搜索结果页的网站标题信息
- Python实现抓取百度搜索结果页的网站标题信息
- C++和python如何获取百度搜索结果页面下信息对应的真实链接(百度搜索爬虫,可指定页数)
- python 抓取百度搜索结果的快照排名信息
- PHP抓取百度搜索结果对应的第一个百度快照的链接
- PHP抓取百度搜索结果页面的【相关搜索词】并存储
- C#抓取百度和谷歌的搜索结果(标题和链接) 代码整理
- python3.5 爬取bing搜索结果页面标题、链接
- python抓取百度搜索列表的实际网址和网站标题
- PHP实现抓取百度搜索结果页面【相关搜索词】并存储到txt文件示例
- Python抓取百度搜索结果
- 分别使用Python和Java抓取百度搜索结果
- 解析百度搜索结果页面的python脚本(Linux/Win都可以运行)
- python,抓取百度搜索结果
- 百度搜索结果页面的参数 搜索框提示词搜索方式(rsv_sug2)
- Google 会帮你在目标网站用你的关键字搜索,直接给你搜索结果页面?
- python 抓取google搜索结果
- 百度搜索结果页面的参数 相关搜索_语义关联性(rs_src)
- python3 - 通过BeautifulSoup 4抓取百度百科人物相关链接
- Python实现抓取页面上链接的简单爬虫分享