您的位置:首页 > 编程语言 > Python开发

Python3.5.3 urllib简单爬取51job

2017-04-18 00:00 92 查看
使用Python3自带的urllib编写的简单爬虫,适用于反爬措施不强的网站。用Python写爬虫代码量少、开发快,关键在于了解爬虫的基本流程和正则表达式。

import urllib.request
import re

def get_html(url, encoding='gbk'):
    """Download *url* and return the response body decoded as text.

    A browser-like ``User-Agent`` header is sent with the request because
    some websites have an anti-crawler mechanism.

    Args:
        url: Address of the page to download.
        encoding: Codec used to decode the response bytes. Defaults to
            ``'gbk'``, the codec this function has always used.

    Returns:
        The decoded page as ``str``. Bytes that are not valid in
        *encoding* are replaced with U+FFFD instead of raising.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the
            request cannot be completed.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36',
    }
    req = urllib.request.Request(url, headers=headers)

    # The context manager closes the response even if read() raises.
    with urllib.request.urlopen(req) as response:
        raw = response.read()

    # One stray byte should not discard the whole page, so decode leniently.
    html = raw.decode(encoding, errors='replace')
    # Echo the page to stdout, as this function has always done.
    print(html)
    return html

# Matches one job row of the search-result page. The six capture groups are,
# in order: job title, job link, company name, and the text of the "t3",
# "t4" and "t5" spans (printed below as location, salary and post date).
# Compiled once at import time instead of on every call.
_JOB_ROW_PATTERN = re.compile(r'<p class="t1 ">[\s\S]*?<span>\s*?<a target="_blank" title="(.*?)" href="(.*?)"[\s\S]*?<span class="t2"><a target="_blank" title="(.*?)"[\s\S]*?<span class="t3">(.*?)</span>[\s\S]*?<span class="t4">(.*?)</span>[\s\S]*?<span class="t5">(.*?)</span>')


def get_contents(html):
    """Extract job rows from *html*, print them, and return them.

    Args:
        html: Text of a search-result page.

    Returns:
        A list of 6-tuples of strings, one per matched row:
        ``(title, link, company, location, salary, post_date)``.
        The list is empty when nothing matches.
    """
    items = _JOB_ROW_PATTERN.findall(html)

    for item in items:
        # Each item is a 6-tuple, so it fills the six %s slots directly.
        print("职位名:%s   链接:%s   公司名:%s   工作地点:%s   薪资:%s   发布时间:%s" % item)

    return items

# Search-result page to scrape (the site home page is http://www.51job.com/).
# The query string must contain "&degreefrom=99": the "&deg" prefix had been
# turned into a degree sign by HTML-entity decoding, which left a non-ASCII
# character in the request URL.
url = (
    'http://search.51job.com/list/010000,000000,0000,00,9,99,'
    'Python%25E5%25BC%2580%25E5%258F%2591%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588,2,1.html'
    '?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
    '&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1'
    '&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line='
    '&specialarea=00&from=&welfare='
)

# Only hit the network when run as a script, not when imported.
if __name__ == '__main__':
    html = get_html(url)
    get_contents(html)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: