您的位置:首页 > 编程语言 > Python开发

python爬虫实战-爬取岗位招聘信息并保存至本地(方法、bs4)

2019-02-23 15:50 756 查看

python爬虫实战-爬取岗位招聘信息并保存至本地(方法、bs4)

from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import json
import time

class ZhiLianSpider(object):
    """Scrape job postings from xiaoyuan.zhaopin.com for a search keyword.

    Listing pages ``start_page`` through ``end_page`` (inclusive) are
    fetched, every posting is converted to a dict and collected in
    ``self.items``, and :meth:`run` finally writes the whole list to
    ``zhaopin.txt`` as JSON.
    """

    # Listing URL prefix. The URL-encoded keyword, the page number and a
    # trailing "_0" are appended to it, e.g.
    # https://xiaoyuan.zhaopin.com/full/0/0_0_0_0_0_-1_%E8%A5%BF%E5%AE%89_3_0
    url = 'https://xiaoyuan.zhaopin.com/full/0/0_0_0_0_0_-1_'

    def __init__(self, kw, start_page, end_page):
        """Store the search parameters.

        Args:
            kw: Search keyword, unencoded (may contain non-ASCII text).
            start_page: First listing page to fetch.
            end_page: Last listing page to fetch (inclusive).
        """
        super().__init__()
        self.kw = kw
        self.start_page = start_page
        self.end_page = end_page
        self.items = []

    @staticmethod
    def _first_text(node, selector):
        """Return the text of the first match of *selector* under *node*, or ''."""
        matches = node.select(selector)
        return matches[0].text if matches else ''

    def parse_content(self, content):
        """Extract every posting from one listing page into ``self.items``.

        Args:
            content: HTML source of a listing page.

        A field whose element is missing from a list item is stored as an
        empty string instead of aborting the whole crawl with IndexError.
        """
        soup = BeautifulSoup(content, 'lxml')
        for entry in soup.select('.searchResultListUl>li'):
            zwmc = self._first_text(entry, '.searchResultJobName > a')
            city = self._first_text(entry, '.searchResultJobCityval')
            gsmc = self._first_text(entry, '.searchResultCompanyname >span')
            zwlb = self._first_text(entry, '.searchResultCompanyIndustry')
            zprs = self._first_text(entry, '.searchResultJobPeopnum')
            fbsj = self._first_text(
                entry, '.pt15 > .searchResultKeyval > span')
            self.items.append({
                '职位名称': zwmc,
                '公司名称': gsmc,
                '工作城市': city,
                '招聘人数': zprs,
                '职位类别': zwlb,
                # Keep only the part after the last ':' of the label text.
                '发布时间': fbsj.split(':')[-1],
            })

    def handle_request(self, page):
        """Build the request object for listing page *page*.

        Args:
            page: Page number to put into the URL.

        Returns:
            A ``urllib.request.Request`` carrying a browser User-Agent.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        }
        # Percent-encode the keyword into a local variable. Writing the
        # encoded value back to self.kw would encode it again on every
        # later page ("%E8" -> "%25E8") and corrupt the URL.
        encoded_kw = urllib.parse.quote(self.kw)
        url_page = self.url + encoded_kw + '_' + str(page) + '_0'
        return urllib.request.Request(url=url_page, headers=headers)

    def run(self):
        """Fetch and parse every requested page, then save the results.

        The collected items are written to ``zhaopin.txt`` (UTF-8) as a
        single JSON array once all pages have been processed.
        """
        for page in range(self.start_page, self.end_page + 1):
            print("正在采集第%s页" % page)
            request = self.handle_request(page)
            # The context manager closes the HTTP response after reading.
            with urllib.request.urlopen(request) as response:
                content = response.read().decode()
            self.parse_content(content)
        string = json.dumps(self.items, ensure_ascii=False)
        with open('zhaopin.txt', 'w', encoding='utf8') as fp:
            fp.write(string)

def main():
    """Prompt for a keyword and a page range, then run the spider."""
    keyword = input('请输入工作关键字:')
    first_page = int(input('请输入起始页码'))
    last_page = int(input('请输入结束页码'))

    # Build the spider and start crawling right away.
    ZhiLianSpider(keyword, first_page, last_page).run()

# Run the interactive crawler only when executed as a script, not on import.
if __name__ == "__main__":
    main()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: