您的位置:首页 > 编程语言 > Python开发

Python爬虫

2016-03-13 16:01 316 查看
之前用来搜职位的Python爬虫
# -*- coding:utf-8 -*-
##GB18030
import urllib
import urllib2
import re
import os
import math

import sqlite3

##import sys
##reload(sys)
##sys.setdefaultencoding('utf8')

import socket
socket.setdefaulttimeout(25)

# In the re module, literal parentheses inside a pattern must be escaped as \( and \).
# Scraped content is GB18030-encoded: convert it (e.g. decode('GB18030').encode('utf8')) before inserting it into the database.
# Open (or create) the local SQLite database that stores the scraped jobs.
conn = sqlite3.connect("jobs.db")

# ``x`` is the id used for the next inserted row.  It starts at 1 for a fresh
# database and continues after the highest stored id otherwise, so running
# the script again does not collide with primary keys from an earlier run.
x = 1

try:
    # "if not exists" makes the setup idempotent: a second run no longer
    # fails with "table jobs already exists".
    conn.execute("create table if not exists jobs(id int primary key,name text,wage text,comment text)")
    last_id = conn.execute("select max(id) from jobs").fetchone()[0]
    if last_id is not None:
        x = last_id + 1
except sqlite3.Error as e:
    # Best effort, as in the original: report the problem and carry on.
    print("create table %s" % e)

# Crawl the 51job search-result pages and store every job description found.
#
# For each result page: download the listing, extract the job-detail links,
# download every detail page, print each description block and insert it as
# one row of the ``jobs`` table through the module-level ``conn``.  The
# module-level ``x`` supplies the row id and is advanced after every insert
# attempt.
#
# NOTE(review): the indentation of this script was lost in this copy.  The
# nesting below (one insert per description block, fetch errors handled per
# result page) is a reconstruction -- confirm it matches the author's intent.

# Search URL with a ``{page}`` placeholder.  The original hard-coded
# ``curr_page=2``, so every iteration fetched the same page; it also carried
# a garbled ``degreefrom`` parameter (the ``&deg`` prefix had been turned
# into a degree sign).  str.format is used rather than %-formatting because
# the URL itself is full of literal percent escapes.
search_url_template = (
    'http://search.51job.com/jobsearch/search_result.php?fromJs=1'
    '&jobarea=190200%2C00&district=000000&funtype=0000&industrytype=00'
    '&issuedate=9&providesalary=99'
    '&keyword=%E5%B5%8C%E5%85%A5%E5%BC%8F%E8%BD%AF%E4%BB%B6&keywordtype=1'
    '&curr_page={page}&lang=c&stype=1&postchannel=0000&workyear=99'
    '&cotype=99&degreefrom=99&jobterm=01&companysize=99&lonlat=0%2C0'
    '&radius=-1&ord_field=0&list_type=1&fromType=14&dibiaoid=-1'
)

user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}

# Loop-invariant patterns, compiled once.
# Captures the detail-page URL of every job link on a listing page.
link_pattern = re.compile(
    r'<a adid=""  onmousedown="return AdsClick\(\)"  href="(.*?)" onclick="zzSearch.acStatRecJob\( 1 \)',
    re.S)
# Captures the body of each description <div> on a detail page.
detail_pattern = re.compile(r'<div style="padding-bottom:30px;">(.*?)</div>', re.S)


def _fetch(url, request_headers):
    """Return the raw body of *url*, always closing the HTTP response."""
    response = urllib2.urlopen(urllib2.Request(url, headers=request_headers))
    try:
        return response.read()
    finally:
        response.close()


for i in range(1, 5):
    try:
        listing = _fetch(search_url_template.format(page=i), headers).decode('GB18030')

        for item in link_pattern.findall(listing):
            print("%d %s" % (i, item))

            # Keep the page as unicode: sqlite3 parameters must be unicode
            # text, not 8-bit byte strings.
            detail = _fetch(item, headers).decode('GB18030')

            for newitem in detail_pattern.findall(detail):
                description = newitem.replace(u"<br>", u"\r\n")
                # Printed as UTF-8 bytes, as the original did.
                print(description.encode('utf8'))

                try:
                    # Placeholders instead of string formatting: the scraped
                    # HTML is untrusted and routinely contains quotes that
                    # would break (or inject into) a hand-built statement.
                    conn.execute(
                        "insert into jobs(id,name,wage,comment) values(?,?,?,?)",
                        (x, item, u'2', description))
                    conn.commit()
                except sqlite3.Error as e:
                    print(e)

                x += 1

    except urllib2.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
            if isinstance(e.reason, socket.timeout):
                print(e)

    except socket.timeout as e:
        print(e)

    print("\n")

# Print every stored job, one column value per line (id, name, wage,
# comment), then release the database connection.
for row in conn.execute("select * from jobs"):
    for column in (0, 1, 2, 3):
        print(row[column])

conn.close()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: