Scraping computer and big-data job postings from 前程无忧 (51job)
2017-09-04 18:11
These past few days, for a project I'm writing, I needed to scrape computer / big-data job postings. From 前程无忧 (51job) I can grab about 4-5 MB of data per day (roughly forty thousand postings), which is a bit too little, but it will do. I also reworked the scraper for general computer positions (I'm not uploading that version here because it has a bug I can't explain: on its first run it froze after about an hour and a half, alas).
I'm pasting the code along with a detailed explanation below, as a way to push myself to keep at it.
One note: the program runs rather slowly. I believe this is because of the second page-open: we can't get all the data we need from the search-result page alone, so for certain fields the scraper has to follow each listing's URL and open the job's detail page as well. A minimal sketch of this two-pass pattern follows.
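To make the two-pass structure concrete, here is a minimal sketch. The URL and the detail-link regex are illustrative placeholders of my own, not the exact patterns from the full script below:

# Minimal sketch of the two-pass fetch (illustrative only; the real
# regexes and fields are in the full script further down).
import re
import requests

headers = {'User-Agent': 'Mozilla/5.0'}
list_url = 'http://search.51job.com/list/...'  # one result page (URL elided)

# Pass 1: the result page only yields titles and detail-page links.
listing_html = requests.get(list_url, headers=headers).text
detail_links = re.findall(r'href="(http://jobs\.51job\.com/.*?\.html)"', listing_html)

# Pass 2: each detail page needs its own request, which is what makes
# the crawl slow: one extra HTTP round trip per job listing.
for detail_url in detail_links:
    detail_html = requests.get(detail_url, headers=headers).text
    # ...extract experience / education / headcount from detail_html...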
I'm just starting out, so some of this is surely wrong and the wording isn't very professional; please bear with me!
These past few days I've also been studying Hadoop; I'd be glad if other Hadoop learners got in touch so we can spur each other on: QQ: 1755545594
I'm a coding novice, so please forgive any mistakes!!!
# -*- coding:utf-8 -*-
import requests
import re
import sys
import csv
from time import time
from multiprocessing.dummy import Pool as ThreadPool

# Python 2 hack: the pages are GBK-encoded, and the .decode('gbk') calls
# below rely on GBK being the default encoding.
reload(sys)
sys.setdefaultencoding("gbk")


# Second pass: open a job's detail page and pull the fields that the
# result page does not carry (industry, experience, education, headcount).
def forLink1(pro):
    html = zhiwei.getsource(pro['url'])
    html.encoding = 'GBK'
    html = html.text
    if re.search(r'<p class="msg ltype">(.*?)</p>', html, re.S) is not None:
        pro1 = re.search(r'<p class="msg ltype">\s+(.*?)\s+</p>', html, re.S).group(1)
        pro['xinxi'] = pro1.split(' | ')[2].strip()  # industry / field
    else:
        pro['xinxi'] = None
    each = re.search(r'<div class="t1">(.*?)</div>', html, re.S).group(1)
    if re.search(r'<span class="sp4"><em class="i1"></em>(.*?)</span>', each, re.S) is not None:
        pro['jingyan'] = re.search(r'<span class="sp4"><em class="i1"></em>(.*?)</span>', each, re.S).group(1).decode('gbk')  # experience
    else:
        pro['jingyan'] = None
    if re.search(r'<span class="sp4"><em class="i2"></em>(.*?)</span>', each, re.S) is not None:
        pro['xueli'] = re.search(r'<span class="sp4"><em class="i2"></em>(.*?)</span>', each, re.S).group(1).decode('gbk')  # education
    else:
        pro['xueli'] = None
    if re.search(r'<span class="sp4"><em class="i3"></em>(.*?)</span>', each, re.S) is not None:
        pro['renshu'] = re.search(r'<span class="sp4"><em class="i3"></em>(.*?)</span>', each, re.S).group(1).decode('gbk')  # headcount
    else:
        pro['renshu'] = None


class spider(object):
    def __init__(self):
        print u'Starting to crawl....'

    # Build the URL of every result page, from the current page up to total_page.
    def changepage(self, url, total_page):
        now_page = int(re.search(r'25AE,2,(\d+).html', url, re.S).group(1))
        # now_page = int(re.search(r'B9,2,(\d+).html', url, re.S).group(1))  # variant for the "computer" query
        page_group = []
        for i in range(now_page, total_page + 1):
            link = re.sub(r'25AE,2,\d+.html', '25AE,2,%s.html' % i, url)
            # link = re.sub(r'B9,2,\d+.html', 'B9,2,%s.html' % i, url)  # variant for the "computer" query
            page_group.append(link)
        return page_group

    # getsource fetches a page and returns the response object.
    def getsource(self, url):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'}
        html = requests.get(url=url, headers=headers)
        return html

    # Split a result page into one HTML chunk per job listing.
    def geteveryclass(self, html):
        everyclass = re.findall(r'<div class="el">(.*?<span class="t5">.*?</span>)', html, re.S)
        return everyclass

    # Extract one listing's fields into the dict pro, then let
    # forLink1() fill in the detail-page fields.
    def getinfo(self, each):
        pro = {}
        pro['position'] = re.search(r'onmousedown="">\s+(.*?)\s+</a>', each, re.S).group(1).decode('gbk')
        pro['url'] = re.search(r'href="(.*?)" onmousedown="">', each, re.S).group(1)
        pro['city'] = re.search(r'<span class="t3">(.*?)</span>', each, re.S).group(1).decode('gbk')
        pro['money'] = re.search(r'<span class="t4">(.*?)</span>', each, re.S).group(1).decode('gbk')
        pro['company'] = re.search(r'<span class="t2"><a target="_blank" title="(.*?)" href=', each, re.S).group(1).decode('gbk')
        pro['day'] = re.search(r'<span class="t5">(.*?)</span>', each, re.S).group(1).decode('gbk')
        forLink1(pro)
        return pro

    # Save the data to CSV (position, city, salary, company, posting date,
    # headcount, experience, education, industry).
    def saveinfo(self, position, city, money, company, day, renshu, jingyan, xueli, xinxi):
        predictions_file = open(r"C:\Users\root\Desktop\51.csv", "ab")
        open_file_object = csv.writer(predictions_file)
        # open_file_object.writerow(["Position", "Money", "City", "Company", "Day",
        #                            "Employment", "Experience", "Education", "Industry"])
        open_file_object.writerows(zip(position, money, city, company, day, renshu, jingyan, xueli, xinxi))
        predictions_file.close()


# Worker for the thread pool: scrape one result page and push its rows
# onto the shared global lists.
def forLink(link):
    print link
    html = zhiwei.getsource(link)
    html.encoding = 'GBK'
    html = html.text
    everyclass = zhiwei.geteveryclass(html)
    info_position = []
    info_city = []
    info_money = []
    info_company = []
    info_day = []
    info_jingyan = []
    info_renshu = []
    info_xueli = []
    info_xinxi = []
    for each in everyclass:
        try:
            info = zhiwei.getinfo(each)
            info_position.append(info['position'])
            info_city.append(info['city'])
            info_money.append(info['money'])
            info_company.append(info['company'])
            info_day.append(info['day'])
            info_renshu.append(info['renshu'])
            info_jingyan.append(info['jingyan'])
            info_xueli.append(info['xueli'])
            info_xinxi.append(info['xinxi'])
        except Exception:
            print u'Failed to parse this listing! Continuing...'
    infos_money.extend(info_money)
    infos_city.extend(info_city)
    infos_position.extend(info_position)
    infos_company.extend(info_company)
    infos_day.extend(info_day)
    infos_renshu.extend(info_renshu)
    infos_jingyan.extend(info_jingyan)
    infos_xueli.extend(info_xueli)
    infos_xinxi.extend(info_xinxi)


if __name__ == '__main__':  # -------- program entry point --------
    print u"""
    ------------------------
    Program:  job-listing spider
    Version:  2.0
    Author:   王鹏鹏
    Date:     2017-08-28
    Language: Python 2
    ------------------------
    """
    t0 = time()
    # "big data" query (the %25E5%25A4%25A7... segment is the URL-encoded keyword)
    url = 'http://search.51job.com/list/000000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    # "computer" query variant:
    # url = 'http://search.51job.com/list/000000,000000,0000,00,9,99,%25E8%25AE%25A1%25E7%25AE%2597%25E6%259C%25BA%2B-%25E5%259C%25B0%25E7%2582%25B9,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    zhiwei = spider()
    all_links = zhiwei.changepage(url, 840)
    infos_position = []
    infos_city = []
    infos_money = []
    infos_company = []
    infos_day = []
    infos_renshu = []
    infos_jingyan = []
    infos_xueli = []
    infos_xinxi = []
    pool = ThreadPool(10)
    pool.map(forLink, all_links)
    pool.close()
    pool.join()
    print time() - t0
    print 'end'
    zhiwei.saveinfo(infos_position, infos_city, infos_money, infos_company, infos_day,
                    infos_renshu, infos_jingyan, infos_xueli, infos_xinxi)
    print time() - t0
    # print infos_money
    # 1440s  # 385s