您的位置:首页 > 编程语言 > Python开发

Python爬虫——人人好友相册自动下载(一)

2013-12-03 23:20 483 查看


#版本一,COOKIE登陆,不能多线程下载,不能刷新加载更多的好友
#登陆参数,需要你COOKIE,人人ID(可在网址最后的9位数字找到)
# coding=utf8
import os
import re
import urllib2

# Your own renren.com session cookie, pasted from a logged-in browser session.
# NOTE(review): cookie-based login only; expires when the browser session does.
COOKIE = '你自己人人的COOKIE'
# Request headers sent with every urllib2 request so we appear logged in.
HEADERS = {'cookie' : COOKIE}

def find_title(mypage):
    """Return the <title> text of an HTML page, stripped of characters
    that are illegal in Windows file names.

    mypage  -- full HTML source as a (unicode) string.
    Returns u'undefined' when no <title> tag is present.
    """
    myMatch = re.search(r'<title>(.+?)</title>', mypage, re.S)
    if myMatch:
        title = myMatch.group(1)
    else:
        title = u'undefined'
        print(u'find no title')
    # File names may not contain: \ / : * ? " < > |
    # One regex pass replaces the original chain of nine .replace() calls.
    return re.sub(r'[\\/:*?"<>|]', '', title)

def login_renren(url):
    """Fetch *url* with the login cookie attached and return the page
    decoded as UTF-8 unicode.

    Prints the page title as a progress indicator.  On failure returns
    an empty unicode string — callers treat an empty page as "nothing
    found" (deliberate best-effort behaviour).
    """
    try:
        req = urllib2.Request(url, headers=HEADERS)
        page = urllib2.urlopen(req).read()
        page = page.decode('utf-8')
        print(find_title(page))
        return page
    # NOTE(review): original used a bare "except:" that swallowed
    # everything (including KeyboardInterrupt); narrowed to the
    # failures this block can actually produce.
    except (urllib2.URLError, UnicodeDecodeError):
        return u''

def downImage(filePath, savePath):
    """Download every .jpg URL listed (one per line) in *filePath* and
    save the images under the *savePath* prefix.

    The saved name combines a slice of the URL tail with a running
    counter so successive images do not overwrite each other.
    """
    count = 0  # running suffix for generated file names
    with open(filePath, 'r') as sour:
        for line in sour:
            # Strip the trailing newline: the original passed the raw
            # line to urlopen, newline included.
            url = line.strip()
            # Only lines that look like jpg image URLs are fetched.
            if url.find('jpg') > 0:
                data = urllib2.urlopen(url).read()
                path = savePath + str(url[-10:-5]) + str(count) + '.jpg'
                with open(path, 'wb') as f:
                    f.write(data)
                count += 1

# Friend-manage page: http://www.renren.com/<your id>#!//friend/manage
def find_friendId(loginID):
    """Scrape the logged-in user's friend-manage page and write each
    friend's 9-digit renren id to id.txt, one id per line.

    loginID -- your own 9-digit renren id (the number at the end of
               your profile URL).
    """
    friendManager_url = r'http://www.renren.com/' + str(loginID) + '#!//friend/manage'
    print(friendManager_url)
    page = login_renren(friendManager_url)
    # Each friend link carries a namecard="..." attribute.
    cards = re.findall(r'namecard=".*?"\shref', page)
    if not cards:
        print('find no friend id')
    id_pattern = re.compile(r'\d{9}')
    with open('id.txt', 'w') as sour:
        for card in cards:
            found = id_pattern.search(card)
            if found:
                sour.write(found.group())
                sour.write(os.linesep)

# Album index of a friend:  http://photo.renren.com/photo/<friend id>/album/relatives/profile
# A single album:           http://photo.renren.com/photo/<friend id>/album-...?frommyphoto
def find_AblumUrl():
    """For every friend id listed in id.txt, fetch that friend's album
    index page and write every album URL found to ablumlist.txt.

    Both files were never closed in the original; `with` fixes the leak.
    """
    prefix = 'http://photo.renren.com/photo/'
    pattern = re.compile(r'http://photo.renren.com/photo/(.+?)frommyphoto')
    with open('id.txt') as ids, open('ablumlist.txt', 'w') as ablum:
        for line in ids:
            # strip() instead of line[:-1]: robust against \r\n line
            # endings and a missing final newline.
            friend_id = line.strip()
            if not friend_id:
                continue
            photo_url = prefix + friend_id + '/album/relatives/profile'
            print(photo_url)
            data = login_renren(photo_url)
            hits = pattern.findall(data)
            if not hits:
                print('find no ablum id')
            for part in hits:
                print(part)
                album_url = prefix + str(part) + 'frommyphoto'
                print(album_url)
                ablum.write(album_url)
                ablum.write(os.linesep)

# Example match target inside an album page:
#   xLarge:'http://fmn.rrfmn.com/fmn058/20130603/0035/original_....jpg'
# The URL after xLarge: is the full-size image to download.
def getImageUrl(data, filePath):
    """Extract every xLarge:'...jpg' image URL from *data* (album page
    HTML) and write the bare URLs to *filePath*, one per line."""
    hits = re.findall(r'xLarge:.*?\.jpg', data)
    if not hits:
        print('found no image')
    with open(filePath, 'w') as sour:
        for hit in hits:
            # Drop the leading  xLarge:'  (8 characters) to keep only the URL.
            sour.write(hit[8:])
            sour.write(os.linesep)

def searchAlbum(filePath):
    """Walk every album URL in ablumlist.txt: fetch the album page,
    dump its image URLs into *filePath*, then download them.

    NOTE(review): relies on the module-level global ``savePath`` for the
    download target — kept for backward compatibility with the script
    tail, but worth passing as a parameter in a future revision.
    """
    with open('ablumlist.txt') as album_file:
        for line in album_file:
            if not line.strip():
                continue
            data = login_renren(line)
            getImageUrl(data, filePath)
            downImage(filePath, savePath)

# --- script configuration -------------------------------------------------
LOGINID = '你的ID号'  # your own renren id: the 9 digits in http://www.renren.com/XXXXXXXXX
URL = r'http://www.renren.com'
savePath = r''                # directory prefix for downloaded images (cwd by default)
filePath = r'image_list.txt'  # scratch file holding the image URLs of one album

# Guarded entry point so importing this module does not start scraping.
if __name__ == '__main__':
    find_friendId(LOGINID)   # step 1: collect friend ids -> id.txt
    find_AblumUrl()          # step 2: collect album URLs -> ablumlist.txt
    searchAlbum(filePath)    # step 3: per album, extract + download images
    downImage(filePath, savePath)  # re-download URLs left in the scratch file
    print('OK ')
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: