您的位置:首页 > 运维架构 > 网站架构

爬取图片网站上的美女图片

2016-03-18 15:49 316 查看
#coding=utf-8

from bs4 import BeautifulSoup as BS4
import requests
import wget
import sys
import chardet
import os

__author__ = 'Administrator'

# Site being scraped; pages are served as GB2312.
HOST = 'http://www.5442.com/meinv/'
# First page the crawl starts from.
START_URL = 'http://www.5442.com/meinv/'
# HTML attribute names looked up on parsed <a>/<img> tags.
ATTR_HREF = 'href'
ATTR_TITLE = 'title'
ATTR_ALT = 'alt'
ATTR_SRC = 'src'
# Charset forced onto every HTTP response before transcoding to UTF-8.
HOST_CONTENT_ENCODING = 'GB2312'
# Local root directory all albums are saved under (Windows drive letter).
MAIN_DOWNLOAD_FOLDER = 'G:/meinv/'
# Crude resume marker: albums are skipped until one whose name contains this.
NEXT_ALBLUM = '可爱熊吖BOBO'
# Flipped to True once NEXT_ALBLUM is seen (name is misspelled: "already").
ALREDY_TO_DOWNLOAD_ALBLUMN = False
# Upper bound on listing pages fetched per tag theme.
MAX_THEME_PAGE_COUNT = 200

def utf82GBK(s):
    """Re-encode a UTF-8 byte string as GB2312 (for Windows paths/console)."""
    decoded = s.decode('utf-8')
    return decoded.encode('gb2312')

def gbkprint(s):
    """Print a UTF-8 byte string after transcoding it to GB2312."""
    gbk_bytes = utf82GBK(s)
    print(gbk_bytes)

def uni2utf8(s):
    """Encode a unicode string into UTF-8 bytes."""
    utf8_bytes = s.encode('utf-8')
    return utf8_bytes

def gbk2utf8(s):
    """Transcode GB2312 bytes to UTF-8 bytes, silently dropping bad bytes."""
    text = s.decode('gb2312', 'ignore')
    return text.encode('utf-8', 'ignore')

def myprint(s):
    """Console-output helper; a single indirection point for all logging."""
    print(s)

# Short alias used throughout the script.
_ = myprint

def url2bs4(url):
    """Fetch *url* and return a BeautifulSoup tree, or None on a non-200 reply.

    The site serves GB2312, so the payload is transcoded to UTF-8 bytes
    before parsing; downstream code then works entirely in UTF-8.
    """
    response = requests.get(url)
    if response.status_code != 200:
        return None

    response.encoding = HOST_CONTENT_ENCODING
    utf8_content = gbk2utf8(response.content)
    return BS4(utf8_content)

def read_tags(fromURL):
    """Scrape the tag index page; map tag name (UTF-8) -> tag URL (UTF-8)."""
    soup = url2bs4(fromURL)
    anchors = soup.find_all("a", attrs={'class': 'yxtag'})
    return {uni2utf8(a.text): uni2utf8(a[ATTR_HREF]) for a in anchors}

def dump_tags(tags):
    """Debug helper: print every tag name together with its URL."""
    for name, url in tags.items():
        _('名称:%s 地址:%s' % (name, url))

def next_download_addr(album_name, folder_name):
    """Return the next free '<n>.jpg' path inside the album's download folder.

    *folder_name* is accepted for interface compatibility but is ignored;
    images are numbered sequentially directly under the album directory,
    which is created on first use.
    """
    album_dir = utf82GBK(os.path.join(MAIN_DOWNLOAD_FOLDER, album_name))
    if not os.path.isdir(album_dir):
        os.makedirs(album_dir)

    number = 1
    candidate = os.path.join(album_dir, str(number) + '.jpg')
    while os.path.isfile(candidate):
        number = number + 1
        candidate = os.path.join(album_dir, str(number) + '.jpg')

    return candidate

def download_album_one_page(url, album_name):
    """Download every image found on one album page into the album folder."""
    soup = url2bs4(url)
    container = soup.find(name='p', attrs={'id': 'contents',})
    for img in container.find_all(name='img'):
        target = next_download_addr(album_name, uni2utf8(img[ATTR_ALT]))
        wget.download(img[ATTR_SRC], target)

def download_album(url, name):
    """Download a whole album (all of its numbered pages).

    Albums are skipped until one whose name contains NEXT_ALBLUM is seen;
    that album flips the module resume flag, and real downloading begins
    with the following album (crude crash-resume support).
    """
    global ALREDY_TO_DOWNLOAD_ALBLUMN
    if not ALREDY_TO_DOWNLOAD_ALBLUMN:
        _('已经下载 %s' % (name,))
        if NEXT_ALBLUM in name:
            _('准备开始继续下载...')
            ALREDY_TO_DOWNLOAD_ALBLUMN = True
        return

    _('正在下载影集:%s 地址:%s' % (name, url))
    soup = url2bs4(url)
    pager = soup.find(name='div', attrs={'class': 'page'})
    sum_page = 0
    for item in pager.find_all(name='li'):
        label = uni2utf8(item.a.text)
        if label.startswith('共'):
            # Label looks like '共N页…'; the byte slice extracts the count.
            sum_page = int(label[3:-5])
            break

    download_album_one_page(url, name)
    prefix = url[:url.rfind('.')]
    for page_no in range(2, sum_page + 1):
        download_album_one_page('%s_%d.html' % (prefix, page_no,), name)

def process_one_tag_page(tag_name, tag_url):
    """Walk one listing page of a tag and download every album linked on it."""
    _('当前主题:%s' % (tag_name,))
    soup = url2bs4(tag_url)
    bricks = soup.find_all(name='div', attrs={'class': 'item masonry_brick masonry-brick'})
    for brick in bricks:
        link = brick.find(name='a')
        if not link:
            continue
        download_album(uni2utf8(link[ATTR_HREF]), uni2utf8(link[ATTR_TITLE]))

def process_tags(tags):
    """Crawl the tags of interest and download their albums.

    Only tags whose name contains '丝袜' are processed.  For each such tag
    the pager of the first listing page is parsed to learn the total page
    count, a capped (MAX_THEME_PAGE_COUNT) list of listing-page URLs is
    built, and every listing page is handed to process_one_tag_page().
    """
    for tag_name in tags:
        if not '丝袜' in tag_name:
            continue

        first_page = url2bs4(tags[tag_name])
        page_info = first_page.find(name='li', attrs={'class': 'pageinfo',})
        if not page_info:
            continue
        # Pager text looks like '…/<total><trailing char>': take what
        # follows the '/' and drop the last character to get the count.
        info_text = page_info.text
        total_pages = int(info_text[info_text.find('/') + 1:-1])

        prefix_url = tags[tag_name][0:tags[tag_name].rfind('.')]
        url_list = [tags[tag_name],]
        page_no = 2
        # BUG FIX: the original condition was '<', which silently skipped
        # the last listing page; '<=' matches the inclusive page loop used
        # in download_album().  (Local 'next' also shadowed the builtin.)
        while page_no <= total_pages:
            if len(url_list) >= MAX_THEME_PAGE_COUNT:
                break
            url_list.append('%s/%d.html' % (prefix_url, page_no))
            page_no = page_no + 1

        for page_url in url_list:
            process_one_tag_page(tag_name, page_url)

if __name__ == '__main__':
    # Entry point: collect the tag index, then crawl the selected tags.
    all_tags = read_tags(START_URL)
    process_tags(all_tags)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: