python爬取拉勾网任意职位数据
2016-07-28 00:02
253 查看
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 27 15:44:14 2016

Scrape job postings for a fixed keyword from Lagou's AJAX search endpoint,
accumulate the fields of interest in module-level column lists, and save
everything to an Excel (.xlsx) workbook.

python version: 3.5.2
@author: mozzielx
"""
import urllib.parse
import urllib.request
import json

from openpyxl import Workbook


def get_content():
    """POST one search-page request and return the list of job dicts.

    Reads the module-level ``page_num`` global to know which result page
    to fetch.  The jobs are located at ``content.positionResult.result``
    in the JSON reply.
    """
    url = 'http://www.lagou.com/jobs/positionAjax.json?px=default'
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36')
    # Form data POSTed to the endpoint.
    values = {
        'first': 'true',
        'kd': '机械',      # search keyword; change to the position you want
        'pn': page_num,    # current result-page number
    }
    headers = {'User-Agent': user_agent}
    data = urllib.parse.urlencode(values).encode(encoding='utf-8')
    req = urllib.request.Request(url, data, headers)
    # Context manager closes the HTTP response even if decoding fails
    # (the original leaked the connection).
    with urllib.request.urlopen(req) as response:
        the_page = response.read().decode("utf-8")
    data_json = json.loads(the_page)
    return data_json['content']['positionResult']['result']


# One accumulator list per output column; filled by standardize_data().
page_num = ''          # set by the main loop before each fetch
companyLabelList = []
companyFullName = []
positionAdvantage = []
city = []
salary = []
companySize = []
district = []          # city district
jobNature = []
workYear = []          # required work experience
education = []
positionName = []      # job title
industryField = []     # company's industry / field


def standardize_data(tar):
    """Append each field of every job dict in *tar* to the matching
    module-level column list.

    Missing keys raise KeyError — the endpoint is expected to return
    all of these fields for every job.
    """
    for each in tar:
        companyLabelList.append(each['companyLabelList'])
        companyFullName.append(each['companyFullName'])
        positionAdvantage.append(each['positionAdvantage'])
        city.append(each['city'])
        salary.append(each['salary'])
        companySize.append(each['companySize'])
        district.append(each['district'])
        jobNature.append(each['jobNature'])
        workYear.append(each['workYear'])
        education.append(each['education'])
        positionName.append(each['positionName'])
        industryField.append(each['industryField'])


def excel_data():
    """Write the accumulated columns to an .xlsx workbook, one job per row.

    Row 1 holds the headers; data rows start at row 2.  Every value is
    stringified with ``"%s"`` so non-string fields (e.g. the label list)
    are stored as their text form, matching the original behaviour.
    """
    wb = Workbook()
    ws = wb.active
    # (header, data list) pairs in output-column order A..K.
    columns = [
        ('positionName', positionName),
        ('salary', salary),
        ('jobNature', jobNature),
        ('workYear', workYear),
        ('education', education),
        ('positionAdvantage', positionAdvantage),
        ('city', city),
        ('district', district),
        ('companyFullName', companyFullName),
        ('companyLabelList', companyLabelList),
        ('industryField', industryField),
    ]
    for col, (header, _) in enumerate(columns, start=1):
        ws.cell(column=col, row=1, value=header)
    for idx in range(len(city)):
        for col, (_, data) in enumerate(columns, start=1):
            ws.cell(column=col, row=idx + 2, value="%s" % data[idx])
    wb.save('拉勾网——机械有关职位数据.xlsx')


if __name__ == '__main__':
    for each in range(1, 100):  # upper bound = max number of pages to fetch
        page_num = each
        standardize_data(get_content())
        # Re-save after every page so a mid-run failure still leaves
        # the pages fetched so far on disk.
        excel_data()
相关文章推荐
- Python3写爬虫(四)多线程实现数据爬取
- Scrapy的架构介绍
- 爬虫笔记
- 我是运营,我没有假期
- DB2数据库的安装
- C#实现把指定数据写入串口
- “传奇”图象数据存储方式
- 修复mysql数据库
- 浅析SQL数据操作语句
- SQLServer 数据导入导出的几种方法小结
- 简述MySQL分片中快速数据迁移
- MySQL数据备份之mysqldump的使用详解
- 基于C#实现网页爬虫
- C#实现窗体间传递数据实例
- C#中的委托数据类型简介
- SQL Server删除表及删除表中数据的方法
- SqlServer2008误操作数据(delete或者update)后恢复数据的方法
- 给你的数据库文件减肥
- Oracle数据更改后出错的解决方法
- Oracle数据库数据丢失恢复的几种方法总结