
Scraping computer / big-data job postings from 前程无忧 (51job)

2017-09-04 18:11
For a project I'm working on these days, I needed to scrape computer / big-data job listings. From 前程无忧 (51job) I can pull about 4-5 MB of data a day (roughly forty thousand postings), which is a bit little, but it will do for now. I also made a revised version that scrapes general computer positions, but I'm not uploading it here because it has a bug I haven't figured out: on its first run it froze after about an hour and a half.

I'm pasting the code and a detailed explanation below, mostly as encouragement to myself.

One note: the program runs fairly slowly, and I believe the reason is the second round of page loads. The search-result pages don't contain all of the fields we need, so for each posting we also have to open its detail URL and scrape that page separately.
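To make that two-pass structure concrete, here is a minimal sketch (not the full crawler below): fetch one search-result page, split it into posting blocks, then follow one posting's detail URL with a second request. The URL and regular expressions are the same ones used in the full program and matched the 2017 page layout; they are not guaranteed to match whatever 51job serves today.

# -*- coding:utf-8 -*-
# Minimal two-pass sketch (Python 2). Pass 1: fetch one search-result page and
# split it into posting blocks. Pass 2: follow one posting's own URL, because the
# list page lacks experience / education / headcount, so every posting costs an
# extra request -- which is what makes the full crawl slow.
import re
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'}

# Pass 1: the first result page of the big-data search (same URL as in the main program).
list_url = 'http://search.51job.com/list/000000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
resp = requests.get(list_url, headers=headers)
resp.encoding = 'GBK'
blocks = re.findall('<div class="el">(.*?<span class="t5">.*?</span>)', resp.text, re.S)
print len(blocks), 'postings on this page'

# Pass 2: open one posting's own page to get the extra fields.
detail_url = re.search('href="(.*?)" onmousedown="">', blocks[0], re.S).group(1)
detail = requests.get(detail_url, headers=headers)
detail.encoding = 'GBK'
print detail_url, detail.status_code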

I'm still a beginner, so some of this is surely wrong and the wording isn't very professional. Please bear with me!

# -*- coding:utf-8 -*-
import requests
import re
import sys
from time import time
import csv
from multiprocessing.dummy import Pool as ThreadPool

reload(sys)
sys.setdefaultencoding("gbk")

# second pass: open the posting's own page and pull industry, experience, education and headcount
def forLink1(pro):
    html = zhiwei.getsource(pro['url'])
    html.encoding = 'GBK'
    html = html.text
    if re.search('<p class="msg ltype">(.*?)</p>', html, re.S) != None:
        pro1 = re.search('<p class="msg ltype">\s+(.*?)\s+</p>', html, re.S).group(1)
        pro['xinxi'] = pro1.split('  |  ')[2].strip()
    else:
        pro['xinxi'] = None
    each = re.search('<div class="t1">(.*?)</div>', html, re.S).group(1)

    if re.search('<span class="sp4"><em class="i1"></em>(.*?)</span>', each, re.S) != None:
        pro['jingyan'] = re.search('<span class="sp4"><em class="i1"></em>(.*?)</span>', each, re.S).group(1).decode('gbk')
    else:
        pro['jingyan'] = None
    if re.search('<span class="sp4"><em class="i2"></em>(.*?)</span>', each, re.S) != None:
        pro['xueli'] = re.search('<span class="sp4"><em class="i2"></em>(.*?)</span>', each, re.S).group(1).decode('gbk')
    else:
        pro['xueli'] = None
    if re.search('<span class="sp4"><em class="i3"></em>(.*?)</span>', each, re.S) != None:
        pro['renshu'] = re.search('<span class="sp4"><em class="i3"></em>(.*?)</span>', each, re.S).group(1).decode('gbk')
    else:
        pro['renshu'] = None

class spider(object):
    def __init__(self):
        print u'Start crawling....'

    # build the URL of every result page, from the current page up to total_page
    def changepage(self, url, total_page):
        now_page = int(re.search('25AE,2,(\d+).html', url, re.S).group(1))
        # now_page = int(re.search('B9,2,(\d+).html', url, re.S).group(1))
        page_group = []
        for i in range(now_page, total_page+1):
            link = re.sub('25AE,2,\d+.html', '25AE,2,%s.html' % i, url)
            # link = re.sub('B9,2,\d+.html', 'B9,2,%s.html' % i, url)
            page_group.append(link)
        return page_group

    # getsource fetches the page source with a browser User-Agent
    def getsource(self, url):
        headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'}
        html = requests.get(url = url, headers = headers)
        return html

    # split the result page into one block per posting (div class="el")
    def geteveryclass(self, html):
        everyclass = re.findall('<div class="el">(.*?<span class="t5">.*?</span>)', html, re.S)
        return everyclass

    # extract the fields of one posting into the dict pro
    def getinfo(self, each):
        pro = {}
        pro['position'] = re.search('onmousedown="">\s+(.*?)\s+</a>', each, re.S).group(1).decode('gbk')
        pro['url'] = re.search('href="(.*?)" onmousedown="">', each, re.S).group(1)
        pro['city'] = re.search('<span class="t3">(.*?)</span>', each, re.S).group(1).decode('gbk')
        pro['money'] = re.search('<span class="t4">(.*?)</span>', each, re.S).group(1).decode('gbk')
        pro['company'] = re.search('<span class="t2"><a target="_blank" title="(.*?)" href=', each, re.S).group(1).decode('gbk')
        pro['day'] = re.search('<span class="t5">(.*?)</span>', each, re.S).group(1).decode('gbk')
        forLink1(pro)
        # print pro['jingyan']
        return pro

    # save the data: position, city, salary, company, posting date, headcount,
    # experience, education, and which field/industry the job belongs to
    def saceinfo(self, position, city, money, company, day, renshu, jingyan, xueli, xinxi):
        # f = open(r'C:\Users\root\Desktop\info.txt', 'a')
        # h = str(position)
        # f.writelines(h)
        # h = str(city)
        # f.writelines(h)
        # h = str(money)
        # f.writelines(h)
        # f.close()
        ids = range(1, 50*2000+1)
        predictions_file = open(r"C:\Users\root\Desktop\51.csv", "ab")
        open_file_object = csv.writer(predictions_file)
        #open_file_object.writerow(["ImageId", "Position", "Money", "City", "Company", "Day", "Employment", "Experience", "Education", "Industry"])
        #open_file_object.writerows(zip(ids, position, money, city, company, day, renshu, jingyan, xueli, xinxi))
        #open_file_object.writerow(["Position", "Money", "City", "Company", "Day", "Employment", "Experience", "Education","Industry"])
        open_file_object.writerows(zip(position, money, city, company, day, renshu, jingyan, xueli, xinxi))
        predictions_file.close()

# first pass: download one result page and collect the fields of every posting on it
def forLink(link):
    print link
    html = zhiwei.getsource(link)
    html.encoding = 'GBK'
    html = html.text
    everyclass = zhiwei.geteveryclass(html)
    info_position = []
    info_city = []
    info_money = []
    info_company = []
    info_day = []
    info_jingyan = []
    info_renshu = []
    info_xueli = []
    info_xinxi = []
    for each in everyclass:
        try:
            info = zhiwei.getinfo(each)
            h = str(info['position']).encode('utf-8')
            h1 = str(info['city']).encode('utf-8')
            h2 = str(info['money']).encode('utf-8')
            h3 = str(info['company']).encode('utf-8')
            h4 = str(info['day']).encode('utf-8')
            info_position.append(info['position'])
            info_city.append(info['city'])
            info_money.append(info['money'])
            info_company.append(info['company'])
            info_day.append(info['day'])
            info_renshu.append(info['renshu'])
            info_jingyan.append(info['jingyan'])
            info_xueli.append(info['xueli'])
            info_xinxi.append(info['xinxi'])
        except:
            print u'Failed to download this posting! Continuing...'

    infos_money.extend(info_money)
    infos_city.extend(info_city)
    infos_position.extend(info_position)
    infos_company.extend(info_company)
    infos_day.extend(info_day)
    infos_renshu.extend(info_renshu)
    infos_jingyan.extend(info_jingyan)
    infos_xueli.extend(info_xueli)
    infos_xinxi.extend(info_xinxi)

if __name__ == '__main__':

    # -------- program entry point ---------
    print u"""
    ------------------------
    Program : job-posting crawler
    Version : 2.0
    Author  : 王鹏鹏
    Date    : 2017-08-28
    Language: Python 2
    ------------------------
    """
    t0 = time()
    classinfo = []
    url = 'http://search.51job.com/list/000000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    #url = 'http://search.51job.com/list/000000,000000,0000,00,9,99,%25E8%25AE%25A1%25E7%25AE%2597%25E6%259C%25BA%2B-%25E5%259C%25B0%25E7%2582%25B9,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    zhiwei = spider()
    all_links = zhiwei.changepage(url, 840)
    infos_position = []
    infos_city = []
    infos_money = []
    infos_company = []
    infos_day = []
    infos_renshu = []
    infos_jingyan = []
    infos_xueli = []
    infos_xinxi = []
    pool = ThreadPool(10)
    #infos_city, infos_position, infos_money = pool.map(forLink, all_links)
    pool.map(forLink, all_links)
    pool.close()
    pool.join()
    print time()-t0
    print 'end'

    zhiwei.saceinfo(infos_position, infos_city, infos_money, infos_company, infos_day, infos_renshu, infos_jingyan, infos_xueli, infos_xinxi)

    print time()-t0
    #print infos_money
    #1440s
    #385s
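One design note on the threading: forLink returns nothing; each worker thread simply extends the shared infos_* globals, which works here because extending the same built-in list object is safe in practice under CPython's GIL. The commented-out "infos_city, infos_position, infos_money = pool.map(...)" line hints at the other option: have each call return its results and let pool.map collect them. A minimal sketch of that variation; crawl_page and the fake links are illustrative stand-ins, not part of the original program.

# -*- coding:utf-8 -*-
# Sketch of the return-values variant of the thread pool (Python 2).
# crawl_page stands in for forLink: a real version would fetch the page and build
# one dict per posting via getinfo/forLink1; here it only returns a dummy record.
from multiprocessing.dummy import Pool as ThreadPool

def crawl_page(link):
    return [{'position': u'demo', 'page': link}]

all_links = ['page-%d' % i for i in range(1, 4)]   # stand-in for zhiwei.changepage(url, 840)
pool = ThreadPool(10)
results = pool.map(crawl_page, all_links)          # one list of dicts per page, in page order
pool.close()
pool.join()

postings = [pro for page in results for pro in page]
print len(postings), 'postings collected'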

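For what it's worth, the CSV that saceinfo writes has no header row (the header writerow calls are commented out), its columns follow the order of the writerows(zip(...)) call above (position, money, city, company, day, renshu/headcount, jingyan/experience, xueli/education, xinxi/industry), and because of the setdefaultencoding('gbk') hack the cells end up GBK-encoded. A minimal sketch of reading it back, assuming that same path and encoding:

# -*- coding:utf-8 -*-
# Sketch: read back the CSV produced by saceinfo (Python 2).
# Assumes the file path used above and GBK-encoded cells with no header row.
import csv

with open(r"C:\Users\root\Desktop\51.csv", "rb") as f:
    for row in csv.reader(f):
        if not row:
            continue                              # skip any stray empty rows
        row = [cell.decode('gbk') for cell in row]
        position, money, city = row[0], row[1], row[2]
        print position, money, city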

These days I've also been learning Hadoop; I'd be glad to hear from other Hadoop learners so we can keep each other motivated. QQ: 1755545594

I'm still a coding newbie, so please forgive any mistakes!