您的位置:首页 > 其它

存档

2016-08-12 16:20 225 查看
# -*- coding: utf-8 -*-
import urllib2,cookielib
import urllib
import cStringIO
import datetime
from PIL import Image
from lxml import etree
import sys
# Python 2-only hack: the interpreter's startup removes
# sys.setdefaultencoding, and reload(sys) makes it available again so the
# process-wide default codec used for implicit str <-> unicode conversions
# can be switched to UTF-8.
# NOTE(review): code in this file that mixes byte strings and unicode text
# may rely on this setting -- confirm before removing it.
reload(sys)
sys.setdefaultencoding('utf8')

def setOpener():
    """Build a cookie-aware urllib2 opener that sends a browser User-Agent.

    Returns:
        An opener created by ``urllib2.build_opener`` with an in-memory
        ``CookieJar`` attached, so cookies received on one request are sent
        on later requests made through the same opener.
    """
    browserAgent = ('Mozilla/5.0 (Windows NT 5.1; rv:25.0) '
                    'Gecko/20100101 Firefox/25.0')
    cookie = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
    # build_opener() pre-populates addheaders with a default
    # ('User-agent', 'Python-urllib/x.y') entry, and default headers are
    # applied first-match-wins, so merely appending a second User-Agent
    # never takes effect. Drop any existing entry before adding ours.
    headers = [(name, value) for (name, value) in opener.addheaders
               if name.lower() != 'user-agent']
    headers.append(('User-Agent', browserAgent))
    opener.addheaders = headers
    return opener

def md5(str):
    """Return the hexadecimal MD5 digest of a string.

    Args:
        str: A byte string, or unicode text (hashed as its UTF-8 bytes).
            The parameter name shadows the builtin; it is kept unchanged
            so existing callers are not affected.

    Returns:
        The 32-character lowercase hex digest, or ``''`` when the argument
        is not a string at all.
    """
    import hashlib

    data = str
    if isinstance(data, type(u'')):
        # Unicode text used to fall through to the '' branch, silently
        # producing an empty hash; encode it so it is hashed like the
        # equivalent UTF-8 byte string.
        data = data.encode('utf-8')
    if not isinstance(data, bytes):
        return ''
    return hashlib.md5(data).hexdigest()
class spider(object):
    """Scraper session for the score-query site at 210.42.121.241.

    Intended call order: getCaptcha(), then set ``studentID``, ``password``
    and ``captcha``, then loginMainPage(), then getAndSaveScore(). All
    requests go through a single cookie-aware opener so the session
    established at login is reused by later requests.
    """

    # Slice bounds of the token inside the ``onclick`` attribute of the
    # ``div#school`` element on the main page (a 36-character span).
    TOKEN_START = 65
    TOKEN_END = 101

    def __init__(self):
        # One opener for the whole session; it keeps the cookie information.
        self.opener = setOpener()
        self.imgUrl = 'http://210.42.121.241/servlet/GenImg'
        self.loginUrl = 'http://210.42.121.241/servlet/Login'
        self.queryScoreUrl = 'http://210.42.121.241/servlet/Svlt_QueryStuScore'
        self.studentID = ''
        self.password = ''
        self.captcha = ''  # captcha text typed in by the user
        self.mainPageContent = ''

    def _readAll(self, request):
        """Open *request* with the session opener and return the raw body."""
        response = self.opener.open(request)
        try:
            return response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()

    def getCaptcha(self):
        """Download the captcha image and return it as a PIL ``Image``.

        The caller is expected to present the image (for example with
        ``im.show()`` or ``im.save(...)``) and store the answer in
        ``self.captcha`` before calling loginMainPage().
        """
        raw = self._readAll(urllib2.Request(self.imgUrl))
        return Image.open(cStringIO.StringIO(raw))

    def loginMainPage(self):
        """POST the login form and keep the decoded reply in ``mainPageContent``."""
        # Data to POST; the password is sent as its MD5 hex digest.
        postdata = urllib.urlencode({
            'id': self.studentID,
            'pwd': md5(self.password),
            'xdvfb': self.captcha,
        })
        req = urllib2.Request(url=self.loginUrl, data=postdata)
        self.mainPageContent = self._readAll(req).decode('gb2312')

    def _extractToken(self):
        """Return the token sliced from the stored main page, or '' if absent."""
        if not self.mainPageContent:
            # Nothing was fetched (or login was never attempted).
            return ''
        page = etree.HTML(self.mainPageContent)
        if page is None:
            # NOTE(review): some lxml versions appear to return None for
            # documents without any markup -- treated as "no token" here.
            return ''
        text = page.xpath('//div[@id="school"]/@onclick')
        if not text:
            return ''
        return text[0][self.TOKEN_START:self.TOKEN_END]

    def getAndSaveScore(self):
        """Fetch the score page and save it as ``inputScore.html``.

        Returns:
            1 when the page was fetched and written, 0 when no token could
            be extracted from ``mainPageContent`` (an error is printed).
        """
        token = self._extractToken()
        if not token:
            print("Error:未能正确打开主页面")
            return 0

        GMT_FORMAT = '%a, %d %b %Y %H:%M:%S GMT'
        GMT_time = datetime.datetime.utcnow().strftime(GMT_FORMAT)
        getParams = urllib.urlencode({
            'csrftoken': token,
            'learnType': '',
            'scoreFlag': '0',
            't': GMT_time,
            'term': '',
            'year': '0',
        })
        fullUrl = self.queryScoreUrl + '?' + getParams

        # The page is gb2312-encoded, so it has to be decoded.
        result = self._readAll(urllib2.Request(fullUrl)).decode('gb2312')

        # Encode explicitly as UTF-8 instead of relying on the process-wide
        # default encoding for the implicit unicode -> bytes conversion, and
        # let the with-block close the file even if the write fails.
        with open('inputScore.html', 'wb') as out:
            out.write(result.encode('utf-8'))
        return 1

#mySpider=spider()
#mySpider.getCaptcha()
#mySpider.loginMainPage()
#mySpider.getAndSaveScore()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: