您的位置:首页 > 编程语言 > Python开发

python爬取拉勾网任意职位数据

2016-07-28 00:02 253 查看
</pre><pre name="code" class="python">
</pre><pre name="code" class="python">
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 27 15:44:14 2016
#python version:3.5.2

@author: mozzielx
"""

import urllib.parse
import urllib.request
import json
from openpyxl import Workbook

def get_content(page=None, keyword='机械', timeout=30):
    """Fetch one page of job postings from lagou.com's position search endpoint.

    Sends a form-encoded POST request and decodes the JSON reply.

    Args:
        page: Page number to request.  When None (the default), the
            module-level ``page_num`` is used, which is how existing
            callers select the page.
        keyword: Job keyword to search for.
        timeout: Seconds to wait on the network before giving up, so a
            stalled connection cannot hang the crawl forever.

    Returns:
        The value stored under ``content -> positionResult -> result`` in
        the decoded JSON reply.

    Raises:
        urllib.error.URLError: If the request fails or times out.
        KeyError: If the reply does not contain the expected nesting.
    """
    if page is None:
        # Backward compatibility: the script sets page_num before each call.
        page = page_num

    url = 'http://www.lagou.com/jobs/positionAjax.json?px=default'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36'
    # Form fields posted to the site.
    values = {
        'first': 'true',
        'kd': keyword,  # job keyword to search for
        'pn': page,     # current page number
    }
    headers = {'User-Agent': user_agent}
    data = urllib.parse.urlencode(values).encode(encoding='utf-8')
    req = urllib.request.Request(url, data, headers)
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        the_page = response.read().decode("utf-8")
    data_json = json.loads(the_page)
    return data_json['content']['positionResult']['result']

# Reset the module-level state shared by the functions in this script.
page_num = ''            # page number used for the next request

# One list per posting field; entries at the same index belong together.
companyLabelList = []
companyFullName = []
positionAdvantage = []
city = []
salary = []
companySize = []
district = []            # district within the city
jobNature = []
workYear = []            # required work experience
education = []
positionName = []        # job title
industryField = []       # company's industry field

# Normalize the data
def standardize_data(tar):
    """Split each posting in ``tar`` across the module-level field lists.

    For every item of ``tar``, the value under each field key is appended
    to the module-level list of the same name, so index ``i`` of every
    list describes the same posting.

    Args:
        tar: Iterable of postings, each indexable by the field names below.
    """
    # Field key paired with the list that collects it, in append order.
    targets = (
        ('companyLabelList', companyLabelList),
        ('companyFullName', companyFullName),
        ('positionAdvantage', positionAdvantage),
        ('city', city),
        ('salary', salary),
        ('companySize', companySize),
        ('district', district),
        ('jobNature', jobNature),
        ('workYear', workYear),
        ('education', education),
        ('positionName', positionName),
        ('industryField', industryField),
    )
    for posting in tar:
        for field, bucket in targets:
            bucket.append(posting[field])

# Save the collected data to an Excel workbook
def excel_data(filename='拉勾网——机械有关职位数据.xlsx'):
    """Write the collected postings to an .xlsx workbook.

    Row 1 holds the column headers; each following row holds one posting,
    read position-by-position from the module-level field lists.  Every
    value is converted to text before it is stored.

    Args:
        filename: Path of the workbook to write.  Defaults to the file
            name this script has always used.
    """
    # Header text paired with the list that fills the column, in sheet
    # order (columns A through K).
    # NOTE(review): companySize is collected but has no column here —
    # confirm whether that omission is intentional.
    columns = (
        ('positionName', positionName),
        ('salary', salary),
        ('jobNature', jobNature),
        ('workYear', workYear),
        ('education', education),
        ('positionAdvantage', positionAdvantage),
        ('city', city),
        ('district', district),
        ('companyFullName', companyFullName),
        ('companyLabelList', companyLabelList),
        ('industryField', industryField),
    )

    wb = Workbook()
    ws = wb.active

    for col, (header, _) in enumerate(columns, start=1):
        ws.cell(row=1, column=col, value=header)

    # Data starts on row 2; the number of postings is taken from ``city``.
    for row in range(2, len(city) + 2):
        for col, (_, values) in enumerate(columns, start=1):
            ws.cell(row=row, column=col, value=str(values[row - 2]))

    wb.save(filename)

if __name__ == '__main__':
    # Crawl result pages 1..99; change the upper bound to crawl more or
    # fewer pages.
    for each in range(1, 100):
        # get_content() reads the page to request from this module global.
        page_num = each
        postings = get_content()
        if not postings:
            # An empty page means the listing is exhausted, so stop issuing
            # further requests.
            break
        standardize_data(postings)
    # Write the workbook once, after all pages have been collected.
    excel_data()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  爬虫 数据