
Python Crawler 3: Scraping All Job Listings from Tencent's Recruitment Site


Part 2 of this crawler series already had a preliminary version of the code; this post optimizes it.

Two fields were added: job responsibilities and job requirements, pulled from each position's detail page.

The data collected for each position: job title, job category, headcount, location, posting date, the detail-page link, responsibilities, and requirements.

The code is as follows:

#!/usr/bin/env python
# -*- coding:utf-8 -*-

from bs4 import BeautifulSoup
import urllib2
import json    # results are stored as JSON

def tengxun(detail, num):
    url = 'https://hr.tencent.com/'
    # e.g. detail = 'position.php?&start=0#a'
    request = urllib2.Request(url + detail)
    response = urllib2.urlopen(request)
    resHtml = response.read()
    soup = BeautifulSoup(resHtml, 'html.parser', from_encoding='utf-8')
    # Job rows on the listing page alternate between the "even" and "odd" CSS classes
    result = soup.select(".even") + soup.select(".odd")
    # Process the listing page
    items = []
    for node in result:
        item = {}
        tds = node.select('td')
        # Job title
        zname = tds[0].get_text()
        # Job category
        ztype = tds[1].get_text()
        # Headcount
        znum = tds[2].get_text()
        # Location
        zlocal = tds[3].get_text()
        # Posting date
        ztime = tds[4].get_text()
        # Link to the detail page
        detailLink = node.select('td a')[0].attrs['href']
        # Fetch the detail page for responsibilities and requirements
        request1 = urllib2.Request(url + detailLink)
        response1 = urllib2.urlopen(request1)
        jobHtml = response1.read()
        soup1 = BeautifulSoup(jobHtml, 'html.parser', from_encoding='utf-8')
        # The detail page has two ul.squareli lists:
        # the first holds the responsibilities, the second the requirements
        jobRes = ''
        for li in soup1.select('ul.squareli')[0].select('li'):
            jobRes += li.get_text() + '\n'
        jobReq = ''
        for li in soup1.select('ul.squareli')[1].select('li'):
            jobReq += li.get_text() + '\n'
        # Store the row in item
        item['zname'] = zname
        item['detailLink'] = detailLink
        item['ztype'] = ztype
        item['znum'] = znum
        item['zlocal'] = zlocal
        item['ztime'] = ztime
        item['jobRes'] = jobRes
        item['jobReq'] = jobReq
        items.append(item)

    print(len(items))

    # Write this page's records to their own JSON file.
    # ensure_ascii=False keeps the Chinese text readable instead of
    # \u escapes; the string is then encoded as UTF-8 before writing.
    output = open('tencent' + str(num) + '.json', 'w')
    line = json.dumps(items, ensure_ascii=False)
    output.write(line.encode('utf-8'))
    output.close()

# The listing had 303 pages at the time of writing, 10 rows per page,
# so each page is fetched with start = page index * 10
for i in range(303):
    print("Processing page " + str(i))
    url = 'position.php?&start=' + str(i * 10) + '#a'
    tengxun(url, i)
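
The code above is Python 2 (urllib2 and byte-oriented file writes). As a side note, here is a minimal Python 3 sketch of the same fetch-parse-dump step using urllib.request, assuming the hr.tencent.com page layout is unchanged since 2018 (the site has been redesigned since, so the selectors may need updating); the detail-page fetch is omitted for brevity:

#!/usr/bin/env python3
# Minimal Python 3 sketch of the listing-page scrape above.
# Assumes the 2018-era hr.tencent.com layout; selectors may be stale.

import json
import urllib.request

from bs4 import BeautifulSoup

BASE = 'https://hr.tencent.com/'

def fetch_page(detail):
    with urllib.request.urlopen(BASE + detail) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    items = []
    # Same even/odd row classes as in the Python 2 version
    for node in soup.select('.even') + soup.select('.odd'):
        tds = [td.get_text() for td in node.select('td')]
        items.append({
            'zname': tds[0],   # job title
            'ztype': tds[1],   # job category
            'znum': tds[2],    # headcount
            'zlocal': tds[3],  # location
            'ztime': tds[4],   # posting date
        })
    return items

if __name__ == '__main__':
    items = fetch_page('position.php?&start=0#a')
    # Open the file as UTF-8 text; no manual .encode('utf-8') needed
    with open('tencent0.json', 'w', encoding='utf-8') as f:
        json.dump(items, f, ensure_ascii=False)

In Python 3, json.dump writes text directly to a file opened with encoding='utf-8', so the manual .encode('utf-8') step from the Python 2 version disappears.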

The extracted JSON data:
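
Each output file holds a list of records with the keys stored above; one record looks roughly like this (the values are placeholders, not real scraped data):

[
    {
        "zname": "<job title>",
        "detailLink": "position_detail.php?id=...",
        "ztype": "<job category>",
        "znum": "<headcount>",
        "zlocal": "<location>",
        "ztime": "<posting date>",
        "jobRes": "<responsibility 1>\n<responsibility 2>\n",
        "jobReq": "<requirement 1>\n<requirement 2>\n"
    }
]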

