
Notes on web scraping with python3 urllib

2017-12-01 11:46
# Goal: send a GET request, scrape every article title from a CSDN blog page,
# and save them under a csdn/ directory
import re
import os
from urllib import request

# Fetch the entire page
data = request.urlopen('http://blog.csdn.net/a519395243').read().decode()
# Extract all article titles with a regular expression
ruler = re.compile('<span class="link_title"><a href="/a519395243/article/details/[0-9]{8}">(.*?)</a>', re.S)
match = ruler.findall(data)

# Create the output directory if it does not exist yet
path = 'csdn'
if not os.path.exists(path):
    os.makedirs(path)
# Output file path
file_path = path + '/csdn.txt'

# Iterate over the scraped titles
for x in match:
    # Strip \r\n and spaces
    content = x.replace('\r\n', '').replace(' ', '')
    # Append the title to the file, then close it
    f = open(file_path, 'a+')
    f.write(content)
    f.close()
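
The snippet above assumes the request always succeeds. A minimal sketch of the same fetch with a timeout and error handling (the 10-second timeout is my own choice, not part of the original):

from urllib import request, error

try:
    # timeout is in seconds; urlopen raises URLError on network failure
    resp = request.urlopen('http://blog.csdn.net/a519395243', timeout=10)
    data = resp.read().decode()
except error.HTTPError as e:
    # The server answered with an error status such as 403 or 404
    print('HTTP error:', e.code)
except error.URLError as e:
    # DNS failure, refused connection, timeout, etc.
    print('Failed to reach the server:', e.reason)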


# Simulate a browser sending a GET request: add an HTTP header to the Request
# object so the script masquerades as a browser
from urllib import request

req = request.Request('http://blog.csdn.net/a519395243')
req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')
data = request.urlopen(req).read().decode()
print(data)
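
Equivalently, the headers can be handed to the Request constructor as a dict instead of calling add_header one field at a time:

from urllib import request

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}
req = request.Request('http://blog.csdn.net/a519395243', headers=headers)
data = request.urlopen(req).read().decode()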


# Goal: simulate a login to CSDN
import gzip
import re
import urllib.request
import urllib.parse
import http.cookiejar

def ungzip(data):
    try:
        print("Trying to decompress...")
        data = gzip.decompress(data)
        print("Decompression finished")
    except OSError:
        print("Not compressed, nothing to decompress")
    return data

def getLt(data):
    # Non-greedy capture so the match stops at the first closing quote
    cer = re.compile('name="lt" value="(.*?)"')
    strlist = cer.findall(data)
    return strlist[0]

def getExecution(data):
    cer = re.compile('name="execution" value="(.*?)"')
    strlist = cer.findall(data)
    return strlist[0]

def getOpener(head):
    # Cookie handling: a CookieJar keeps session cookies across requests
    cj = http.cookiejar.CookieJar()
    pro = urllib.request.HTTPCookieProcessor(cj)
    opener = urllib.request.build_opener(pro)
    header = []
    for key, value in head.items():
        header.append((key, value))
    opener.addheaders = header
    return opener
# The header fields below were captured with Firebug (any browser's developer
# tools will do)
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch, br',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Host': 'passport.csdn.net',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Cookie': 'uuid_tt_dd=-6281662822437337065_20171128; __message_district_code=440000; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22160058ffb5850c-0d114986a51ac1-6a11157a-1440000-160058ffb598c9%22%2C%22%24device_id%22%3A%22160058ffb5850c-0d114986a51ac1-6a11157a-1440000-160058ffb598c9%22%2C%22props%22%3A%7B%22%24latest_utm_source%22%3A%22news0%22%7D%7D; kd_user_id=1f003860-eec5-424d-8a20-498a00b6ab73; UM_distinctid=160068870b25ec-07ca748d26f527-6a11157a-15f900-160068870b3750; UN=a519395243; UE="519395243@qq.com"; BT=1512011174110; shown_offset=20; Hm_lvt_3f9df99a208b69b45eb52cfbe2dc3bf8=1511939807,1512007982,1512022346,1512026346; Hm_lpvt_3f9df99a208b69b45eb52cfbe2dc3bf8=1512026346; __message_sys_msg_id=0; __message_gu_msg_id=0; __message_cnel_msg_id=0; __message_in_school=0; JSESSIONID=8669679CFA8B508DD860D5C76BDA9E69.tomcat1; LSSC=LSSC-55438-kdj63iwrBuHfcdst9TBrRIONZeKOQh-passport.csdn.net; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1512011295,1512022345,1512026346,1512029753; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1512032481; dc_tos=p083q9; dc_session_id=1512031760278'
}

url = 'https://passport.csdn.net/account/verify'
opener = getOpener(header)
op = opener.open(url)
data = op.read()
data = ungzip(data)
lt = getLt(data.decode())
execution = getExecution(data.decode())
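# Note: lt and execution are hidden form fields on the login page (see the
# name="lt" / name="execution" patterns above); a CAS-style login rejects any
# POST that does not echo them back, which is why they go into postDict below.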

username = "your username"
password = "your password"
postDict = {
    'lt': lt,
    'username': username,
    'password': password,
    '_eventId': 'submit',
    'execution': execution
}
postData = urllib.parse.urlencode(postDict).encode()
op = opener.open(url,postData)
data = op.read()
data = ungzip(data)

print(data.decode())
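
To keep the login session across runs, the in-memory CookieJar can be swapped for a file-backed MozillaCookieJar; a minimal sketch, where the cookies.txt filename is my own choice:

import http.cookiejar
import urllib.request

cj = http.cookiejar.MozillaCookieJar('cookies.txt')
try:
    # Reuse cookies from a previous run if the file exists
    cj.load(ignore_discard=True, ignore_expires=True)
except FileNotFoundError:
    pass
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
# ... perform the login requests with this opener ...
# Persist session cookies (including ones marked discard) to disk
cj.save(ignore_discard=True, ignore_expires=True)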
Tags: python, web scraping, regex, urllib