您的位置:首页 > 编程语言 > Python开发

Python 使用 POST 和 GET 登录 Web 后台系统并抓取页面

2013-10-29 21:20 513 查看
#coding=utf8
#! /usr/bin/env python

import httplib
import re
import socket
import urllib

# Process-wide fallback timeout (in seconds) for sockets that are created
# without an explicit timeout of their own.
timeout = 60
socket.setdefaulttimeout(timeout)

def getTable(path='kvpage.html'):
    """Return the inner HTML of a ``<tbody>`` element from a saved page.

    The file is first scanned line by line: the first line that contains a
    complete ``<tbody>...</tbody>`` pair wins, and the text between the tags
    is returned.  If no single line holds a complete pair, the whole file is
    searched so that a table body spread over several lines is found too.

    Args:
        path: File to read.  Defaults to ``'kvpage.html'``.

    Returns:
        The text between ``<tbody>`` and ``</tbody>``, or ``None`` when the
        file contains no such element.

    Raises:
        IOError: If the file cannot be opened or read.
    """
    # ``with`` guarantees the handle is closed even if reading fails.
    with open(path) as f:
        page = f.readlines()

    # First pass: a table body that sits entirely on one physical line.
    pattern = re.compile(r'.*<tbody>(.*?)</tbody>.*')
    for line in page:
        m = pattern.match(line.strip())
        if m is not None:
            return m.group(1)

    # Fallback: '.' does not cross newlines by default, so a body spanning
    # several lines needs DOTALL over the joined file content.
    m = re.search(r'<tbody>(.*?)</tbody>', ''.join(page), re.DOTALL)
    if m is not None:
        return m.group(1)

    return None

def extractKvEvents(content):
    """Split table-body HTML into a list of rows of cell texts.

    Every ``<tr>...</tr>`` in *content* becomes one row; each row is the list
    of texts found inside its ``<td class="confluenceTd">...</td>`` cells.
    A row without any such cell yields an empty list.

    Args:
        content: HTML text to scan.  ``None`` or an empty string is accepted
            and treated as "no rows".

    Returns:
        A list of lists of strings, one inner list per ``<tr>`` element.
    """
    # Nothing to parse: avoids the TypeError re raises on None input.
    if not content:
        return []

    # DOTALL lets rows and cells that contain line breaks match as well.
    patternTR = re.compile(r"<tr>(.*?)</tr>", re.DOTALL)
    patternTD = re.compile(r'<td class="confluenceTd">(.*?)</td>', re.DOTALL)

    # findall always returns a list (possibly empty), never None.
    return [patternTD.findall(row) for row in patternTR.findall(content)]

def outputToExcel(table):
    """Print every row of *table* to standard output, one row per line.

    Despite its name, this function does not write a spreadsheet; it only
    prints each row.

    Args:
        table: An iterable of rows.
    """
    for row in table:
        # Single-argument print(...) produces the same output under Python 2
        # and is also valid syntax under Python 3.
        print(row)

def loginWiki():
    """Log in with a form POST and save the session cookie to ``cookie.txt``.

    Sends the credentials to ``/login.action`` on ``xxx.com:8080``, prints the
    ``Set-Cookie`` header of the response and writes it to ``cookie.txt``.

    The function is best-effort: any exception is printed rather than
    propagated, and the connection is always closed.
    """
    httpClient = None
    try:
        params = urllib.urlencode({'os_username': 'xxxx@xxx.com',
                                   'os_password': 'xxxx',
                                   'login': 'Log In'})

        headers = {"Content-type": "application/x-www-form-urlencoded",
                   "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}

        httpClient = httplib.HTTPConnection("xxx.com", 8080, timeout=30)
        httpClient.request("POST", "/login.action", params, headers)

        response = httpClient.getresponse()
        cookie = response.getheader('Set-Cookie')
        print(cookie)

        # getheader() returns None when the header is absent; report that
        # explicitly instead of relying on write(None) raising TypeError.
        if cookie is None:
            print('login response carried no Set-Cookie header')

        # The file is (re)written even when no cookie was received, so a
        # stale cookie from an earlier run is never left behind.  ``with``
        # closes the file even if the write fails.
        with open('cookie.txt', 'w') as cookieFile:
            cookieFile.write(cookie or '')
    except Exception as e:
        # Deliberate best-effort: report the problem and carry on.
        print(e)
    finally:
        if httpClient:
            httpClient.close()

def catchPage():
    """Fetch a page using the saved cookie and store it as ``kvpage.html``.

    Reads the cookie text from ``cookie.txt``, sends it in the ``Cookie``
    header of a GET request for ``/xxxPath`` on ``xxx.com:8080``, prints the
    response status and reason, and writes the response body to
    ``kvpage.html``.

    The function is best-effort: any exception is printed rather than
    propagated, and the connection is always closed.
    """
    httpClient = None

    try:
        # Read the cookie; ``with`` closes the file even if reading fails.
        with open('cookie.txt') as f:
            cookie = f.read().strip()
        print(cookie)

        # Build the request headers.
        headers = {"Content-type": "application/x-www-form-urlencoded",
                   "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                   'Cookie': cookie}

        # Send the request.
        httpClient = httplib.HTTPConnection('xxx.com', 8080, timeout=30)
        httpClient.request('GET', '/xxxPath', headers=headers)

        # response is an HTTPResponse object.
        response = httpClient.getresponse()
        print(response.status)
        print(response.reason)

        # The body is raw bytes, so write it in binary mode to avoid any
        # newline translation; ``with`` closes the file on every path.
        with open('kvpage.html', 'wb') as htmlPage:
            htmlPage.write(response.read())
    except Exception as e:
        # Deliberate best-effort: report the problem and carry on.
        print(e)
    finally:
        if httpClient:
            httpClient.close()

if __name__ == '__main__':
    # Pipeline: log in, download the page, extract the table, print it.
    loginWiki()
    catchPage()
    tablecontent = getTable()
    if tablecontent is None:
        # getTable() returns None when the page holds no <tbody>; stop here
        # instead of handing None to the parser.
        print('no <tbody> section found in kvpage.html')
    else:
        table = extractKvEvents(tablecontent)
        outputToExcel(table)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: