Python爬取百度新闻数据并将时间统计到EXCEL中
2017-02-26 20:30
375 查看
缘起
我的好朋友的毕业论文需要爬取基金经理的新闻数量,并且统计新闻数量与基金的成交率的关系,我当然义不容辞啦。任务描述:爬取三百位基金经理“百度新闻”中的搜索结果,并且将其分别按月和按季度统计新闻数量。
使用到的技术
BeautifulSoup, urllib.request, Python文件I/O。Talk is cheap, show the code.
主函数:GCWspider_main.py
import url_manager, html_downloader, html_parser, html_output
import xlwt
import xlrd
import urllib


class SpiderMain(object):
    """Drives one crawl: fetch result pages, parse news counts per month and
    per quarter, then hand both result lists to the outputer."""

    def __init__(self):
        # Classic url-manager / downloader / parser / outputer decomposition,
        # implemented by the companion modules of this script.
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownoader()
        self.parser = html_parser.HtmlParser()
        self.output = html_output.HtmlOutputer()

    def craw(self, sheet1, sheet2, root_url, num, name):
        """Crawl every result page reachable from *root_url* and write the
        monthly (sheet1) and quarterly (sheet2) counts into row *num*,
        labelled with the fund manager's *name*."""
        page_no = 1
        # One zeroed bucket per month of 2001-2016 and one per quarter.
        monthly = [0] * ((2016 - 2000) * 12)
        quarterly = [0] * ((2016 - 2000) * 4)
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('crawling URL => %d ... : %s' % (page_no, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, monthly, quarterly = self.parser.parse(
                    new_url, html_cont, monthly, quarterly)
                self.urls.add_new_urls(new_urls)
                page_no = page_no + 1
            except Exception as e:
                # Best-effort crawl: report and move on to the next URL.
                print(e)
                print('crawing failure')
        self.output.collect_data(sheet1, monthly, name, num)
        self.output.collect_data(sheet2, quarterly, name, num)


if __name__ == "__main__":
    wb = xlwt.Workbook()
    wsmonth = wb.add_sheet('month')
    wsseason = wb.add_sheet('season')
    # Month headers, newest first: 201612, 201611, ..., 200101.
    months_2016 = list(range(201612, 201600, -1))
    headers = list(months_2016)
    for back in range(1, 16):
        headers = headers + [m - 100 * back for m in months_2016]
    for col, month in enumerate(headers, start=1):
        wsmonth.write(0, col, month)
    # Quarter headers, newest first: 2016年第4季度 ... 2001年第1季度.
    for ygap in range(16):
        for sgap in range(4):
            yy = str(16 - ygap).zfill(2)  # "16" .. "01"
            wsseason.write(0, ygap * 4 + sgap + 1,
                           "20" + yy + "年第" + str(4 - sgap) + "季度")
    keywords = xlrd.open_workbook('keywords.xlsx')  # 295 fund-manager names
    sh = keywords.sheet_by_index(0)
    for idx in range(295):
        name = sh.cell(idx, 0).value
        root_url = ("http://news.baidu.com/ns?cl=2&rn=20&tn=news&word="
                    + urllib.parse.quote(name))
        obj_spider = SpiderMain()
        obj_spider.craw(wsmonth, wsseason, root_url, idx + 1, name)
    wb.save('new_result.xls')
下载器html_downloader.py
import urllib.request
import ssl


class HtmlDownoader(object):
    """Fetches a URL and returns the raw response body bytes."""

    # The crawl hits hosts whose certificates do not verify; disable
    # verification process-wide so urlopen does not raise on them.
    # NOTE(review): this weakens TLS for the whole process — acceptable
    # only for a throwaway scraping script.
    ssl._create_default_https_context = ssl._create_unverified_context

    def download(self, url):
        """Return the page bytes for *url*, or None when *url* is None or
        the server answers with a non-200 status code."""
        if url is None:
            return None
        # Browser-like User-Agent; Baidu rejects the urllib default.
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        req = urllib.request.Request(url, headers=headers)
        # Fix: the original never closed the response (socket leak) and had
        # no timeout, so one stuck host could hang the whole crawl.
        with urllib.request.urlopen(req, timeout=30) as response:
            if response.getcode() != 200:
                print(response.getcode())
                return None
            return response.read()
解析器html_parser.py
import re
import urllib.parse


class HtmlParser(object):
    """Extracts pagination links and per-month / per-quarter news counts
    from a Baidu News search result page.

    Result lists are laid out newest-first to match the sheet headers:
    slot 0 is Dec 2016 / Q4 2016, the last slot is Jan 2001 / Q1 2001.
    """

    def parse(self, page_url, html_cont, resultlistM, resultlistS):
        """Parse *html_cont* (bytes of *page_url*).

        Returns (new_urls, resultlistM, resultlistS), or None when either
        input is missing (callers swallow the resulting unpack error).
        """
        if page_url is None or html_cont is None:
            return
        # Method-local import keeps the hard bs4 dependency out of module
        # import time (the index helpers below are then usable standalone).
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_resultlistM, new_resultlistS = self._get_new_data(
            resultlistM, resultlistS, soup)
        return new_urls, new_resultlistM, new_resultlistS

    def _get_new_urls(self, page_url, soup):
        """Collect absolute URLs of the other result pages (pagination)."""
        new_urls = set()
        # Pagination anchors look like /ns?word=...&pn=20&... ; pn=0 is the
        # current first page, so require pn to start with a non-zero digit.
        links = soup.find_all('a', href=re.compile(r"/ns\?word=.*pn=[^0].*"))
        if not links:
            # Fix: find_all returns an empty list, never None, so the
            # original `if links is None` check was dead code.
            print("no pagination links found on this page")
        for link in links:
            new_url = link['href']
            # Skip "previous page" style links the site emits.
            if new_url.endswith('-1') or new_url.endswith('1'):
                continue
            new_urls.add(urllib.parse.urljoin(page_url, new_url))
        return new_urls

    def data_process(self, data):
        """Return the integer formed by the first six digits found in *data*.

        On a date like "2014年07月29日 15:00" this yields 201407 (YYYYMM);
        on "more news" link text it yields the plain article count.
        """
        digits = re.sub(r"\D", "", data)
        return int(digits[0:6])

    @staticmethod
    def _month_index(yyyymm):
        """Map 200101..201612 to index 0 (Dec 2016) .. 191 (Jan 2001).

        Fix: the original formula (16*12 - raw + 1) was off by one — it
        produced 192 for Jan 2001 (IndexError on a 192-slot list) and
        shifted every month one column away from the sheet headers.
        """
        years = (yyyymm - 200100) // 100   # 0 for 2001 .. 15 for 2016
        month = yyyymm % 100               # 1..12
        return 16 * 12 - 1 - (years * 12 + month - 1)

    @staticmethod
    def _quarter_index(yyyymm):
        """Map 200101..201612 to index 0 (Q4 2016) .. 63 (Q1 2001).

        Fix: the original used int(month/4), which lumps months 4-7 into
        one quarter; the correct 0-based quarter is (month-1)//3.
        """
        years = (yyyymm - 200100) // 100
        quarter = (yyyymm % 100 - 1) // 3  # 0..3
        return 16 * 4 - 1 - (years * 4 + quarter)

    def _get_new_data(self, resultlistM, resultlistS, soup):
        """Accumulate each result item's news count into both bucket lists."""
        nodes = soup.find_all('div', class_="result")
        # Each result carries a <p class="c-author"> like
        # "凤凰网  2014年07月29日 15:00" giving the article date.
        for node in nodes:
            time_node = node.find('p', class_="c-author")
            realdata = self.data_process(time_node.get_text())
            if realdata > 201612 or realdata < 200101:
                print("time out of range")
                continue
            index = self._month_index(realdata)
            index2 = self._quarter_index(realdata)
            # A "c-more_link" anchor carries the total count of similar
            # articles; without it the item stands for a single article.
            more = node.find('a', class_="c-more_link")
            hits = 1 if more is None else self.data_process(more.get_text())
            resultlistM[index] = resultlistM[index] + hits
            # Fix: the original wrote resultlistM[index] + hits here, so the
            # quarterly list never accumulated its own values.
            resultlistS[index2] = resultlistS[index2] + hits
        return resultlistM, resultlistS
输出器html_output.py
class HtmlOutputer(object):
    """Writes one crawl's bucket counts into a worksheet row."""

    def __init__(self):
        # Retained for interface compatibility with the sibling modules.
        self.datas = []

    def collect_data(self, sheet, resultlist, name, num):
        """Write *name* into column 0 of row *num*, then the values of
        *resultlist* into columns 1..len(resultlist)."""
        sheet.write(num, 0, name)
        for column, value in enumerate(resultlist, start=1):
            sheet.write(num, column, value)
相关文章推荐
- python读写sqlite3数据库并将统计数据写入excel
- 用python实现简单EXCEL数据统计的实例
- 小猫统计导入excel数据用于批量证书打印时,时间数据格式的规范化处理
- 用 python实现简单EXCEL数据统计
- Python实现读写sqlite3数据库并将统计数据写入Excel的方法示例
- Python从数据库取数,对时间进行处理,统计数据汇总后画图
- 人工智能:python 实现 第十一章,从时间序列数据中提取统计信息
- C# 操作Excel获取数据、时间、图片
- 导入导出EXCEL数据时有关时间的处理
- excel数据个数统计问题
- 统计函数——汇总统计时间类数据
- 通过Excel分析测试数据同步复制持续时间
- 最土系统统计某段时间的营业数据
- 用Jxl实现将统计数据导出到excel表中
- python 读取excel中的数据
- Aspose.Cells 根据Excel模板导出数据统计
- excel数据统计分析面面观
- [导入]c# 统计方法执行时间,计算缓存读取数据方法的命中率
- 用Python 模块xlrd 操作excel,并将数据导入MySQL
- 在规定的时间范围类,按月统计数据