您的位置:首页 > 编程语言 > Python开发

Python爬虫——人人好友相册多线程下载(二)

2013-12-14 21:04 513 查看
改进:加入了多线程下载,提高了照片匹配的准确度,好友相册能够完整下载

一、Cookie获得

Chrome 浏览器:Mac 下按 Command + Alt + I,Windows 下按 F12,打开开发者工具进行抓包,获取 Cookie



二、抓取图片


http://friend.renren.com/groupsdata 从次页面用正则表达式获取全部好友的ID
http://photo.renren.com/photo/好友ID/album/relatives/profile
从此页面可获得好友相册的ID
http://photo.renren.com/photo/好友ID/album-相册ID?frommyphoto 从相册页面获取照片ID

三、多线程下载

Download类继承了threading.Thread,重写了run()方法,传入了一个存放照片URL的set(),遍历集合进行下载

在实际抓取照片中,每一个相册将会开启一个线程进行下载

代码需附上你自己的人人Cookie

# coding=utf8
import os
import re
import threading
import urllib2

COOKIE = '你自己人人的Cookie'  # NOTE: paste your own renren.com cookie string here before running
HEADERS = {'cookie': COOKIE}  # request headers sent with every urllib2 call below

# find title
def find_title(mypage):
    """Return the <title> text of *mypage*, sanitized for use as a file name.

    Returns u'undefined' (and prints a notice) when no <title> tag is found.
    """
    myMatch = re.search(r'<title>(.+?)</title>', mypage, re.S)
    title = u'undefined'
    if myMatch:
        title = myMatch.group(1)
    else:
        print(u'find no title')
    # File names may not contain: \ / : * ? " < > |
    # One re.sub pass replaces the original chain of nine str.replace() calls.
    title = re.sub(r'[\\/:*?"<>|]', '', title)
    return title

def login_renren(url):
    """Fetch *url* with the renren cookie and return the page as unicode.

    Prints the page title on success.  Returns an empty unicode string on
    network or decoding failure so callers can run regexes on it safely.
    """
    try:
        req = urllib2.Request(url, headers=HEADERS)
        page = urllib2.urlopen(req).read().decode('utf-8')
    except (urllib2.URLError, UnicodeDecodeError):
        # Narrowed from a bare `except:`: still best-effort, but no longer
        # hides programming errors such as NameError/AttributeError.
        return u''
    print(find_title(page))
    return page

def find_friendlist():
    """Scrape http://friend.renren.com/groupsdata and write every friend id,
    one per line, to id.txt.  Prints a notice if the cookie is rejected."""
    url_friend = 'http://friend.renren.com/groupsdata'    # friend list
    req = urllib2.Request(url_friend, headers=HEADERS)
    try:
        page = urllib2.urlopen(req).read().decode('utf-8')
    except (urllib2.URLError, UnicodeDecodeError):
        print('cookie is error')
        page = ''
    # Each friend appears as "fid":12345, in the page's embedded JSON.
    # findall is run once (the original ran it twice) and the results no
    # longer shadow the builtins `list` and `id`.
    fid_tokens = re.findall(r'"fid":\d*?,', page)
    if fid_tokens:
        # `with` guarantees id.txt is closed even if a write fails.
        with open('id.txt', 'w') as friend_file:
            for token in fid_tokens:
                friend_file.write(token[6:-1])   # strip leading '"fid":' and trailing ','
                friend_file.write(os.linesep)
    else:
        print('find no friendID')

# Example URLs handled here:
#   http://photo.renren.com/photo/XXXXXXXXX/album/relatives/profile
#   http://photo.renren.com/photo/XXXXXXXXX/album-535947620?frommyphoto
def find_ablumUrl():
    """For every friend id in id.txt, fetch the friend's album index page and
    append each deduplicated album URL (one per line) to albumlist.txt."""
    # Dots escaped so the pattern matches only renren photo URLs.
    album_pattern = re.compile(r'http://photo\.renren\.com/photo/(.+?)frommyphoto')
    # `with` closes both files even if a request raises; the original left
    # them open.  Fixes a bug where a page with no matches silently reused
    # the PREVIOUS friend's match list and wrote its albums again.
    with open('id.txt') as id_file, open('albumlist.txt', 'w') as album_file:
        for line in id_file:
            friend_id = line.strip()   # strip() handles \n and \r\n alike
            if not friend_id:
                continue
            photo_url = 'http://photo.renren.com/photo/' + friend_id + '/album/relatives/profile'
            print(photo_url)
            data = login_renren(photo_url)
            matches = album_pattern.findall(data)
            if not matches:
                print('find no album id')
                continue
            # set() removes duplicate album ids found on the same page.
            for album_id in set(matches):
                album_url = 'http://photo.renren.com/photo/' + album_id + 'frommyphoto'
                print(album_url)
                album_file.write(album_url)
                album_file.write(os.linesep)

def download_album():
    """Read album URLs from albumlist.txt and start one Download thread per
    album to fetch its photos concurrently."""
    # Compiled once, outside the loop (the original recompiled per line).
    image_pattern = re.compile(r'large:.*?\.jpg', re.I)   # large / xlarge
    # `with` guarantees the list file is closed; also stops shadowing the
    # builtins `file` and `list`.
    with open('albumlist.txt') as album_file:
        for line in album_file:
            data = login_renren(line)
            matches = image_pattern.findall(data)
            if not matches:
                print('found no image')
                continue   # no point starting an empty download thread
            photo_url = set()   # dedup photo URLs within the album
            for m in matches:
                url = m[7:]   # strip the leading 'large:"' prefix
                photo_url.add(url)
                print(url)   # test
            try:
                d = Download(photo_url)
                print(d.name)
                d.start()
            except threading.ThreadError:
                # Thread creation can fail under resource pressure; skip
                # this album instead of aborting the whole run.
                print(u'download error   ' + line)

# download by thread
class Download(threading.Thread):
    """Worker thread: fetches every photo URL in *que* and saves each image
    as <chars -15:-5 of its URL>.jpg in the current directory."""

    def __init__(self, que):
        threading.Thread.__init__(self)
        # que: iterable (a set) of photo URLs to download.
        self.que = que

    def run(self):
        for url in self.que:
            try:
                data = urllib2.urlopen(url).read()
            except urllib2.URLError:
                # One bad URL must not abort the rest of this album's photos
                # (the original thread died on the first failure).
                continue
            path = str(url[-15:-5]) + '.jpg'
            # `with` + binary mode: file is flushed and closed even on error.
            with open(path, 'wb') as f:
                f.write(data)

# start: drive the whole scrape pipeline end to end.
def start_photo_grap():
    """Verify the cookie, collect friend ids, resolve album URLs, download."""
    steps = (
        lambda: login_renren(URL),
        find_friendlist,
        find_ablumUrl,
        download_album,
    )
    for step in steps:
        step()

# Landing page fetched first, only to confirm the cookie works.
URL = r'http://www.renren.com'

if __name__ == '__main__':
    start_photo_grap()
    print('success ')
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: