您的位置:首页 > Web前端 > HTML

利用python脚本抓取AC的代码[爬虫+HTMLParser+handle_entityref+正则表达式+模拟登陆+文件操作]

2012-12-18 17:30 946 查看
#-*- coding=utf-8 -*-
import time,urllib2,urllib,re,HTMLParser,os
from htmlentitydefs import entitydefs

class PageParser(HTMLParser.HTMLParser):#翻译实体
def __init__(self):
self.data=""
self.readcode=0
HTMLParser.HTMLParser.__init__(self)
def handle_starttag(self,tag,attrs):
if tag=='textarea':
self.readcode=1
def handle_data(self,data):
if self.readcode:
self.data+=data
def handle_endtag(self,tag):
if tag=='textarea':
self.readcode=0
def handle_entityref(self,name):
if entitydefs.has_key(name):
self.handle_data(entitydefs[name])
else:
self.handle_data('&'+name+';')
def getdata(self):
return self.data

global res
def getACUrl():
step=1
r=re.compile(unicode("下一页","utf8"))
#.{500}?\"
r2=re.compile("<input type=\"hidden\" name=\"__VIEWSTATE\" id=\"__VIEWSTATE\" value=\".{1,50000}?\" />")
url = "http://algorithm.fzu.edu.cn/OnlineJudgeUserStatus.aspx"
parms = {
'__EVENTTARGET':'ctl00$MainRightHolder$UserStatusGridView',
'ctl00$MainRightHolder$UserIdTextBox':'*****',
}
login1=urllib2.urlopen(url)
pagedata=login1.read()
s1=r2.findall(pagedata)
if len(s1)==0:
return
parms['__VIEWSTATE']=s1[0][64:-4]#form表单的提取
while True:
try:
if step==1:
parms['__EVENTARGUMENT']='Page$First'
else:
parms['__EVENTARGUMENT']='Page$Next'
step+=1
login = urllib2.urlopen(url,urllib.urlencode(parms))
data=(unicode(login.read(),"utf8"))
#            fout=open("c:\\1.html","wb+")
#            fout.write(data.encode("GBK"))
#            fout.close()
findurl(data)
if len(r.findall(data))==0:
break
s1=r2.findall(data)
if len(s1)==0:
return
parms['__VIEWSTATE']=s1[0][64:-4]
data=""
except Exception,e:
print(e)
break
def Login(username,password):#模拟登陆
try:
cookies = urllib2.HTTPCookieProcessor()
opener = urllib2.build_opener(cookies)
urllib2.install_opener(opener)
parms = {
'__VIEWSTATE':r'/wEPDwULLTE2ODk5MTAyOTUPZBYCAgMPZBYCAgUPEA8WAh4LXyFEYXRhQm91bmRnZBAVBgzmnIDmlrDkv6Hmga8kMTLmnIg15pel566X5rOV6K++5YGc5LiK5LiA5qyh77yM6K++EuS8mOengOS9nOS4muWAmemAiSgg5YWz5LqO6aKY55uu55qE5pe26Ze056m66Ze06ZmQ5Yi255qE6ZeuJOWFs+S6jueZu+mZhuezu+e7n+eUqOaIt+WQjeS4juWvhueggQg+PuabtOWkmhUGABdTaG93QnVsbGV0aW4uYXNweD9iaWQ9NRdTaG93QnVsbGV0aW4uYXNweD9iaWQ9NBdTaG93QnVsbGV0aW4uYXNweD9iaWQ9MxdTaG93QnVsbGV0aW4uYXNweD9iaWQ9MhFCdWxsZXRpbkxpc3QuYXNweBQrAwZnZ2dnZ2cWAGQYAgUeX19Db250cm9sc1JlcXVpcmVQb3N0QmFja0tleV9fFgEFEkJhbm5lciRMb2dpbkJ1dHRvbgUgQmFubmVyJFVzZXJDb250cm9sUGFuZWxNdWx0aVZpZXcPD2RmZOAvQzwaH/EzyqdrNO7IO2UefuMIHdnWhg02m4yXus4K',
'Banner$LoginButton.x':'17',
'Banner$LoginButton.y':'5'
}
parms[r"Banner$UserNameText"]=username
parms[r"Banner$Password"]=password

loginUrl = "http://algorithm.fzu.edu.cn/Default.aspx"
login = urllib2.urlopen(loginUrl,urllib.urlencode(parms))
h=(unicode(login.read(),"utf8"))
# loginer = urllib2.urlopen("http://poj.org/")#登录主页
#  print(loginer.read().decode("utf8"))
except Exception,e:
print(e)
def findurl(data):
r=re.compile("<a class=\"underline\" href=\".{1,500}?\" target=\"_blank\">.{1,500}?</a></td><td><a class=\"hover-underline\" href=\".{1,500}?\" target=\"_blank\">AC\|AC\|AC\|AC\|AC\|AC\|AC\|AC\|AC\|AC\|</a>")
h=r.findall(data)
import string
global res
for s in h:
y=string.find(s,"cpp")
if y==-1:
continue
x=y
while(s[x]!='>'):
x-=1
name=s[x+1:y+3]
if name in res.keys():
continue
x=string.find(s,"href=")
if x==-1:
continue
y=string.find(s,"\" target=")
if y!=-1:
ss=s[x+6:y]
x=string.find(ss,"&")
if x!=-1:
ss="http://algorithm.fzu.edu.cn/"+ss[:x+1]+ss[x+5:]
res[name]=ss
def getACUrl2():
r=re.compile("<a target='_blank' href='.{1,500}?'>.{1,500}?</a></td><td>AC\|AC\|AC\|AC\|AC\|AC\|AC\|AC\|AC\|AC\|</td>")
url = "http://algorithm.fzu.edu.cn/StuMain.aspx"
try:
login = urllib2.urlopen(url)
data=(unicode(login.read(),"utf8"))
h=r.findall(data)
import string
global res
for s in h:
x=string.find(s,"cpp")
if x==-1:
continue
y=x
while x>0 and s[x]!='>':
x-=1
if x<0:
continue
name=s[x+1:y+3]
if name in res.keys():
continue
t=y
y=x
x=t
while not( x>4 and s[x]=='=' and s[x-1]=='f' and s[x-2]=='e' and s[x-3]=='r' and s[x-4]=='h'):
x-=1
if x==-1:
continue
while y>=0 and s[y]!='\'':
y-=1
if y<0:
continue
res[name]="http://algorithm.fzu.edu.cn/"+s[x+2:y]
except Exception,e:
print(e)
if __name__ == '__main__':
username=raw_input("请输入用户名:")
password=raw_input("请输入密码:")
global res
res={}
Login(username,password)#模拟登陆
getACUrl2()#获取管理后台中的AC代码
getACUrl()#获取AC代码对应的url
if not os.path.exists("ACCode"):
os.mkdir("ACCode")
for key in res:
print key
u=urllib2.urlopen(res[key])#根据获取的url,逐个访问,并将代码保存到本地
pp=PageParser()
pp.feed(u.read())
data=pp.getdata()
fout=open('ACCode'+'\\'+key,"wb+")
fout.write(data)
fout.close()
print "共%d份AC代码" % len(res)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: