
Scraping Photos

I've written some Python before, but mostly just wrapping Linux commands for watchdog-style monitoring; I hadn't yet used Python to crawl web pages. So here's a first attempt: scraping the photos from the marriage-seeking ("征婚") posts on the company intranet forum. The code is as follows:
# -*- coding: utf-8 -*-
import urllib2
import urllib
import cookielib
import os
import re

# Login form fields (credentials redacted)
data = {
    'actionFlag': 'loginAuthenticate',
    'lang': 'en',
    'loginMethod': 'login',
    'loginPageType': 'mix',
    'redirect': 'http%3A%2F%2Fxinsheng.huawei.com',
    'uid': 'coder_xia',
    'password': 'xxxxx'
}
postdata = urllib.urlencode(data)

# Spoof a browser User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.56 Safari/537.17'
}

# Login URL
url_login = "https://login.xx.com/login/login.do"

# Shenzhen forum board (list page)
url_xinsheng = "http://xinsheng.xx.com/cn/index.php?app=forum&mod=List&act=index&class=409&cate=44"

# Log in and keep the session cookie for subsequent requests
def login(url_login):
    # Cookie jar so the authenticated session is reused by urllib2
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    request = urllib2.Request(url_login, postdata, headers)
    res = opener.open(request).read()
    # A successful login page greets the user with "欢迎" (welcome)
    if re.search(r"欢迎", res):
        print "login success"
    else:
        print "login fail"

def getHtml(url):
    page = urllib2.urlopen(url)
    html = page.read()
    return html

# Return the actual image URLs on a post page (lazy-loaded .jpg attachments)
def getAllPictureLink(html):
    reg = r'lazyload="(.+?\.jpg)"'
    imgre = re.compile(reg)
    imglist = re.findall(imgre, html)
    return imglist

# Return links to the marriage-seeking ("征婚") threads on a list page
def getAllLink(html):
    reg = r'征.*?"(http://.*?mod=Detail.*?[1-9])"'
    linkre = re.compile(reg)
    linklist = re.findall(linkre, html)
    return linklist

# Save an image to local disk, named after parts of the original file name
def savePicture(pic_name, url):
    local_name = pic_name.split("-")[2] + "-" + pic_name.split("-")[3]
    of = open(os.path.join("F:\\xinsheng\\hangzhou", local_name), 'w+b')
    q = urllib.urlopen(url)
    of.write(q.read())
    q.close()
    of.close()

# Download every picture linked from the threads on one list page
def downloadPitureInURL(url):
    html = getHtml(url)
    linklist = getAllLink(html)
    for link in linklist:
        print 'url = ' + link
        pic_list = getAllPictureLink(getHtml(link))
        for img_url in pic_list:
            print img_url
            imgarr = img_url.split("/")
            # Skip URLs that do not match the expected attachment path layout
            if len(imgarr) != 6 or len(imgarr[5].split("-")) != 4:
                continue
            savePicture(imgarr[5], img_url)

# Log in, then walk pages 1-30 of the board
login(url_login)
downloadPitureInURL(url_xinsheng)
for i in range(2, 31):
    url_every_page = url_xinsheng + "&p=" + str(i)
    print "url_every_page = " + url_every_page
    downloadPitureInURL(url_every_page)
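
The script above targets Python 2 (urllib2 and cookielib no longer exist in Python 3). As a rough, untested sketch of how the same login-then-fetch flow could look on Python 3, reusing the same data and headers dicts and the same (assumed) endpoints:

# -*- coding: utf-8 -*-
# Hedged Python 3 sketch of the login + fetch flow above; the endpoints and
# form fields are carried over from the Python 2 script and are assumptions.
import http.cookiejar
import urllib.parse
import urllib.request

def login_py3(url_login, data, headers):
    # Keep the session cookie so later requests stay authenticated
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    urllib.request.install_opener(opener)

    postdata = urllib.parse.urlencode(data).encode('utf-8')
    req = urllib.request.Request(url_login, postdata, headers)
    res = opener.open(req).read().decode('utf-8', 'ignore')
    print("login success" if u"欢迎" in res else "login fail")

def get_html_py3(url):
    # Fetch a page and return its decoded HTML
    return urllib.request.urlopen(url).read().decode('utf-8', 'ignore')

The regex-based link and image extraction would carry over unchanged, since re works the same on the decoded strings.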
Reference: http://www.cnblogs.com/sysu-blackbear/p/3629770.html