存档
2016-08-12 16:20
225 查看
# -*- coding: utf-8 -*-
"""Python 2 scraper: fetch the captcha, log in, and save the score page.

The three steps share one cookie-aware opener so the session established
while fetching the captcha is reused for the login and the score query.

Typical use (the caller fills in the credentials and the captcha text
after looking at the captcha image):

    mySpider = spider()
    im = mySpider.getCaptcha()      # PIL image; e.g. im.show() / im.save(...)
    mySpider.studentID = '...'
    mySpider.password = '...'
    mySpider.captcha = '...'
    mySpider.loginMainPage()
    mySpider.getAndSaveScore()
"""
import contextlib
import cookielib
import cStringIO
import datetime
import hashlib
import sys
import urllib
import urllib2

from lxml import etree
from PIL import Image

# NOTE(review): process-wide default-encoding hack, kept only because other
# code in this program may rely on it. Nothing below depends on it any more.
reload(sys)
sys.setdefaultencoding('utf8')

# Character range inside the "school" div's onclick attribute that holds the
# CSRF token (36 characters).
# NOTE(review): fixed offsets are fragile -- confirm against the live page.
_TOKEN_SLICE = slice(65, 101)


def setOpener():
    """Return a urllib2 opener that stores cookies and sends a browser UA."""
    cookie = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
    opener.addheaders.append((
        'User-Agent',
        'Mozilla/5.0 (Windows NT 5.1; rv:25.0) Gecko/20100101 Firefox/25.0',
    ))
    return opener


def md5(str):
    """Return the hex MD5 digest of a byte string, or '' for anything else.

    The parameter keeps its original name (which shadows the builtin) so
    existing callers are unaffected. Non-``str`` input, including
    ``unicode`` objects, yields the empty string.
    """
    # `bytes` is an alias of the byte-string type on Python 2.6+.
    if not isinstance(str, bytes):
        return ''
    return hashlib.md5(str).hexdigest()


class spider:
    """Holds the session opener, the target URLs and the login state."""

    def __init__(self):
        self.opener = setOpener()  # keeps the cookie / session information
        self.imgUrl = 'http://210.42.121.241/servlet/GenImg'
        self.loginUrl = 'http://210.42.121.241/servlet/Login'
        self.queryScoreUrl = 'http://210.42.121.241/servlet/Svlt_QueryStuScore'
        self.studentID = ''
        self.password = ''
        self.captcha = ''  # captcha text typed in by the user
        self.mainPageContent = ''

    def getCaptcha(self):
        """Download the captcha image and return it as a PIL ``Image``."""
        request = urllib2.Request(self.imgUrl)
        # Close the HTTP response as soon as its body has been read.
        with contextlib.closing(self.opener.open(request)) as res:
            raw = res.read()
        return Image.open(cStringIO.StringIO(raw))

    def loginMainPage(self):
        """POST the login form and store the decoded main page.

        Sends ``studentID``, the MD5 hex digest of ``password`` and the
        ``captcha`` text; the gb2312-decoded response body is stored in
        ``self.mainPageContent``.
        """
        # Form fields to POST.
        postdata = urllib.urlencode({
            'id': self.studentID,
            'pwd': md5(self.password),
            'xdvfb': self.captcha
        })
        req = urllib2.Request(url=self.loginUrl, data=postdata)
        with contextlib.closing(self.opener.open(req)) as response:
            self.mainPageContent = response.read().decode('gb2312')

    def getAndSaveScore(self):
        """Query the score page and write it to ``inputScore.html``.

        The CSRF token is cut out of the ``onclick`` attribute of the
        ``<div id="school">`` element of the stored main page.

        Returns 1 on success. Returns 0 (after printing an error) when the
        main page is missing or no token can be extracted from it.
        """
        token = ''
        # Skip parsing entirely if loginMainPage() never stored a page.
        if self.mainPageContent:
            page = etree.HTML(self.mainPageContent)
            onclick = page.xpath('//div[@id="school"]/@onclick')
            if onclick:
                token = onclick[0][_TOKEN_SLICE]
        # An attribute shorter than the slice start yields '' -- treat that
        # like a missing attribute instead of sending an empty token.
        if not token:
            print("Error:未能正确打开主页面")
            return 0

        GMT_FORMAT = '%a, %d %b %Y %H:%M:%S GMT'
        GMT_time = datetime.datetime.utcnow().strftime(GMT_FORMAT)
        getParams = urllib.urlencode({
            'csrftoken': token,
            'learnType': '',
            'scoreFlag': '0',
            't': GMT_time,
            'term': '',
            'year': '0'
        })
        fullUrl = self.queryScoreUrl + '?' + getParams
        req = urllib2.Request(fullUrl)
        with contextlib.closing(self.opener.open(req)) as response:
            # The page is served as gb2312, so decode it to unicode first.
            result = response.read().decode('gb2312')

        # Encode explicitly (UTF-8, the same bytes the implicit conversion
        # produced) and let `with` close the file even if the write fails.
        with open('inputScore.html', 'wb') as out:
            out.write(result.encode('utf-8'))
        return 1
相关文章推荐
- 第四章:Linear Models for Classification exercise 25-26
- 大数据系列修炼-Scala课程13+14
- 理解Cookie和Session机制
- C语言操作符优先级
- CSS盒子模型-盒子模型应用
- OC当中的闭包
- swift学习----记使用NSClassFromString一个坑
- 树
- Android Studio中SVN安装与使用
- java的初始化顺序
- jquery表格datatables实例解析 直接加载和延迟加载
- iOS URL Scheme 第三方跳转
- opencv森林火灾检测-2
- GO2应对千万级访问量的解决之道
- 漫步微积分十七——最大最小值问题(续)
- 经典问题总结
- HTML5中<video>标签的duration属性在IE中为NAN的原因
- 7.正则表达式(可用于在抓取的html源码中,获取要求的数字)
- qmake 常见错误:QApplication: No such file or directory
- json格式null转空串