扒照片
2015-08-07 20:45
337 查看
以前写过一些 Python,主要是调用 Linux 命令、把命令封装起来做 watchdog 监控,好像还没用 Python 爬过网页,于是写一个试试:不如就爬一下公司内网征婚帖里 mm 的照片。说干就干,代码如下:
#! -*- coding:utf-8 -*- import urllib2 import urllib import cookielib import os import re # 登录信息 data = {'actionFlag':"loginAuthenticate", "lang":"en", "loginMethod": "login", 'loginPageType':'mix', "redirect":"http%3A%2F%2Fxinsheng.huawei.com", 'uid':'coder_xia', 'password':'xxxxx' } postdata = urllib.urlencode(data) #模拟浏览器信息 headers={ 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.56 Safari/537.17' } # 登录地址 url_login = "https://login.xx.com/login/login.do" #深圳 url_xinsheng = "http://xinsheng.xx.com/cn/index.php?app=forum&mod=List&act=index&class=409&cate=44" # 登录 def login(url_login): #cookie cj = cookielib.CookieJar() opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) urllib2.install_opener(opener) response = urllib2.Request(url_login, postdata,headers) res = opener.open(response).read() regex = re.compile(r"欢迎") resultreg = re.compile(regex); if len(re.findall(resultreg,res))!=0: print "login success" else: print "login fail" def getHtml(url): page = urllib2.urlopen(url) html = page.read() return html #返回图片url实际地址 def getAllPictureLink(html): reg = r'lazyload="(.+?\.jpg)"' imgre = re.compile(reg) imglist = re.findall(imgre,html) return imglist #返回征婚相关连接 def getAllLink(html): reg = r'征.*?"(http://.*?mod=Detail.*?[1-9])"' linkre = re.compile(reg) linklist = re.findall(linkre, html) return linklist #保存图片到本地 def savePicture(pic_name,url): of = open(os.path.join("F:\\xinsheng\\hangzhou", pic_name.split("-")[2]+"-"+pic_name.split("-")[3]), 'w+b') q = urllib.urlopen(url) of.write(q.read()) q.close() of.close() def downloadPitureInURL(url): html = getHtml(url) linklist = getAllLink(html) for url in linklist: print 'url = ' + url; pic_list = getAllPictureLink(getHtml(url)); for img_url in pic_list: print img_url imgarr = img_url.split("/"); if len(imgarr) != 6 or len(imgarr[5].split("-")) != 4: continue savePicture(imgarr[5],img_url) login(url_login); downloadPitureInURL(url_xinsheng) for i in 
range(2,31): url_every_page = url_xinsheng+"&p="+str(i); print "url_every_page = " + url_every_page; downloadPitureInURL(url_every_page)参考网址:http://www.cnblogs.com/sysu-blackbear/p/3629770.html
相关文章推荐
- struts基本开发流程
- poj 2406 Power Strings (kmp)
- Why Docker is Not Yet Succeeding Widely in Production
- BZOJ2911 : [Poi1997]The Number of Symmetrical Choices
- log4net按日志级别(debug,info,warn,error,fatal)生成日志目录,同时每小时生成一个日志文件
- 12、C语言和设计模式(代理模式)
- BZOJ 2049 [Sdoi2008]Cave 洞穴勘测 LCT
- 11gR2更换OCR和VOTE
- python两个整数和浮点的方法来获取值
- 北大POJ_1050_To the Max
- 四大组件之Activity小结
- HDU 1166 树状数组
- 杭电 1686 Oulipo (kmp)
- 四大组件之Activity小结
- 最锋利的VS Web开发工具扩展:Web Essentials详解
- js使用经验之谈
- 概念与原理
- HDOJ 1358 Period(KMP next数组运用)
- poj1113 Wall 凸包
- 将Visual Studio打造成为Node.js IDE