python3 urllib爬虫抓取记录
2017-12-01 11:46
288 查看
# 目的:GET请求 抓取csdn博客页面所有文章标题,并保存在csdn目录下 import re import os from urllib import request #抓取整个页面下来 data=request.urlopen('http://blog.csdn.net/a519395243').read().decode() #正则提取所有文章标题, ruler = re.compile('<span class="link_title"><a href="/a519395243/article/details/[1-9]{8}">(.*?)</a>',re.S) match = ruler.findall(data) #把抓取到的数据遍历 for x in match: #把 \r\n 和空格 都去掉 content = x.replace('\r\n','').replace(' ','') #文件保存路径,如果没有,则创建 path = 'csdn' if not os.path.exists(path): os.makedirs(path) #保存文件名 file_path = path+'/csdn.txt' #打开文件 f = open(file_path,'a+') #写入文件 f.write(content) #关闭文件 f.close() pass
#模拟浏览器发送GET请求,通过往Request对象添加HTTP头,伪装成浏览器 from urllib import request req = request.Request('http://blog.csdn.net/a519395243') req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36') data = request.urlopen(req).read().decode() print(data)
# Purpose: simulate a CSDN login — fetch the login form through a
# cookie-aware opener, pull the hidden "lt" and "execution" form tokens,
# then POST the credentials back through the same opener.
import gzip
import http.cookiejar
import re
import urllib.parse
import urllib.request


def ungzip(data):
    """Return *data* gunzipped; pass non-gzip bytes through unchanged.

    FIX: the original used a bare ``except:`` which would also swallow
    KeyboardInterrupt/SystemExit; gzip.decompress raises OSError (and its
    subclass gzip.BadGzipFile) on non-gzip input, so catch only that.
    """
    try:
        print("尝试解压缩...")
        data = gzip.decompress(data)
        print("解压完毕")
    except OSError:
        print("未经压缩,无需解压")
    return data


def _hidden_value(data, name):
    """Extract the value of the hidden form field *name* from HTML *data*."""
    cer = re.compile('name="%s" value="(.*)"' % name)
    return cer.findall(data)[0]


def getLt(data):
    """Return the hidden "lt" token from the login page HTML."""
    return _hidden_value(data, 'lt')


def getExecution(data):
    """Return the hidden "execution" token from the login page HTML."""
    return _hidden_value(data, 'execution')


def getOpener(head):
    """Build a cookie-handling opener whose request headers come from *head*."""
    cj = http.cookiejar.CookieJar()
    pro = urllib.request.HTTPCookieProcessor(cj)
    opener = urllib.request.build_opener(pro)
    # addheaders expects a list of (key, value) tuples.
    opener.addheaders = list(head.items())
    return opener


# Header values captured from a real browser session (e.g. via Firebug).
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch, br',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Host': 'passport.csdn.net',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Cookie': 'uuid_tt_dd=-6281662822437337065_20171128; __message_district_code=440000; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22160058ffb5850c-0d114986a51ac1-6a11157a-1440000-160058ffb598c9%22%2C%22%24device_id%22%3A%22160058ffb5850c-0d114986a51ac1-6a11157a-1440000-160058ffb598c9%22%2C%22props%22%3A%7B%22%24latest_utm_source%22%3A%22news0%22%7D%7D; kd_user_id=1f003860-eec5-424d-8a20-498a00b6ab73; UM_distinctid=160068870b25ec-07ca748d26f527-6a11157a-15f900-160068870b3750; UN=a519395243; UE="519395243@qq.com"; BT=1512011174110; shown_offset=20; Hm_lvt_3f9df99a208b69b45eb52cfbe2dc3bf8=1511939807,1512007982,1512022346,1512026346; Hm_lpvt_3f9df99a208b69b45eb52cfbe2dc3bf8=1512026346; __message_sys_msg_id=0; __message_gu_msg_id=0; __message_cnel_msg_id=0; __message_in_school=0; JSESSIONID=8669679CFA8B508DD860D5C76BDA9E69.tomcat1; LSSC=LSSC-55438-kdj63iwrBuHfcdst9TBrRIONZeKOQh-passport.csdn.net; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1512011295,1512022345,1512026346,1512029753; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1512032481; dc_tos=p083q9; dc_session_id=1512031760278',
}


def main():
    # FIX: original had a duplicated assignment (``url = url = ...``).
    url = 'https://passport.csdn.net/account/verify'
    opener = getOpener(header)
    # First request: fetch the login form to harvest the hidden tokens.
    op = opener.open(url)
    data = ungzip(op.read())
    page = data.decode()  # decode once; the original decoded twice
    lt = getLt(page)
    execution = getExecution(page)
    username = "帐号"
    password = "密码"
    postDict = {
        'lt': lt,
        'username': username,
        'password': password,
        '_eventId': 'submit',
        'execution': execution,
    }
    postData = urllib.parse.urlencode(postDict).encode()
    # Second request: submit the login form with the harvested tokens;
    # the opener's CookieJar carries the session cookies between requests.
    op = opener.open(url, postData)
    data = ungzip(op.read())
    print(data.decode())


if __name__ == '__main__':
    # Guard so importing this module performs no network I/O.
    main()
相关文章推荐
- python爬虫之urllib2登录并抓取HTML页面
- 鱼c笔记——Python爬虫(一):利用urllib进行简单的网页抓取
- Python3.7 爬虫(二)使用 Urllib2 与 BeautifulSoup4 抓取解析网页
- python爬虫入门----用urllib抓取整个网页
- Python3.7 爬虫(一)使用 Urllib2 与正则表达式抓取
- 【Python3.6爬虫学习记录】(六)urllib详细使用方法(header,代理,超时,认证,异常处理)
- Python爬虫学习记录(1)——Xiami全站播放数
- python写的爬虫抓取到的网页是乱码解决
- Python爬虫入门(4):Urllib库的高级用法
- Python爬虫实战(4):抓取淘宝MM照片
- 03—小白学Python爬虫之urllib的基本和进阶使用及Get、Post示例
- Python爬虫: 抓取One网页上的每日一话和图
- 抓取国家统计局区划、城乡划分代码的简易python爬虫实现
- 抓取安居客二手房经纪人数据,python爬虫自动翻页
- 使用 python urllib2 抓取网页时出现乱码的解决方案
- Python爬虫入门_之urllib2&&urllib
- Python写爬虫——抓取网页并解析HTML(修订篇)
- python--爬虫入门(八)体验HTMLParser解析网页,网页抓取解析整合练习
- Python爬虫(urllib2+bs4)+分析找出谁是水贴王(2)--数据分析
- [置顶] [爬虫]用Python抓取非小号网站数字货币(二)