爬取图片网站上的美女图片
2016-03-18 15:49
316 查看
#coding=utf-8
"""Crawl tag pages on www.5442.com and download the images of each album.

The script is written for Python 2 byte-string semantics: the helper
functions call ``.decode()`` on plain string values, and scraped text is
converted to UTF-8 byte strings before it is compared, sliced or printed.

Flow: ``read_tags`` collects tag links from ``START_URL``; ``process_tags``
walks the listing pages of every selected tag; ``process_one_tag_page``
finds the albums on one listing page; ``download_album`` walks the pages
of one album and ``download_album_one_page`` saves the images of a page.
"""
from bs4 import BeautifulSoup as BS4
import requests
import wget
import sys
import chardet
import os

__author__ = 'Administrator'

HOST = 'http://www.5442.com/meinv/'
START_URL = 'http://www.5442.com/meinv/'
ATTR_HREF = 'href'
ATTR_TITLE = 'title'
ATTR_ALT = 'alt'
ATTR_SRC = 'src'
HOST_CONTENT_ENCODING = 'GB2312'
MAIN_DOWNLOAD_FOLDER = 'G:/meinv/'
# Albums are skipped until one whose title contains this marker is seen.
NEXT_ALBLUM = '可爱熊吖BOBO'
# Flipped to True by download_album() once the NEXT_ALBLUM marker was seen.
ALREDY_TO_DOWNLOAD_ALBLUMN = False
# Upper bound on the number of listing pages crawled per tag.
MAX_THEME_PAGE_COUNT = 200
# Seconds to wait for the server before a request is abandoned.
REQUEST_TIMEOUT_SECONDS = 30


def utf82GBK(s):
    """Re-encode the UTF-8 byte string *s* as GB2312."""
    return s.decode('utf-8').encode('gb2312')


def gbkprint(s):
    """Print the UTF-8 byte string *s* re-encoded as GB2312."""
    print(utf82GBK(s))


def uni2utf8(s):
    """Encode the unicode string *s* as UTF-8."""
    return s.encode('utf-8')


def gbk2utf8(s):
    """Transcode GB2312 bytes to UTF-8, dropping undecodable bytes."""
    return s.decode('gb2312', 'ignore').encode('utf-8', 'ignore')


def myprint(s):
    """Print *s* unchanged."""
    print(s)


# Short alias used for all progress output.
_ = myprint


def url2bs4(url):
    """Fetch *url* and return it parsed as a BeautifulSoup document.

    Returns ``None`` when the request fails or the HTTP status is not 200,
    so callers must check the result before using it.
    """
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        res = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        _('request failed: %s (%s)' % (url, exc))
        return None
    if res.status_code != 200:
        return None
    # Only influences res.text; the raw bytes are transcoded manually below.
    res.encoding = HOST_CONTENT_ENCODING
    content = gbk2utf8(res.content)
    # The bytes are UTF-8 at this point, so say so explicitly instead of
    # letting the parser guess; naming the parser keeps results stable
    # across machines with different optional parsers installed.
    return BS4(content, 'html.parser', from_encoding='utf-8')


def read_tags(fromURL):
    """Return a ``{tag name: tag url}`` dict scraped from *fromURL*.

    Tag links are the ``<a class="yxtag">`` elements of the page. Names and
    URLs are UTF-8 byte strings. An empty dict is returned when the page
    cannot be fetched.
    """
    soup = url2bs4(fromURL)
    if soup is None:
        return {}
    tags = {}
    for anchor in soup.find_all("a", attrs={'class': 'yxtag'}):
        href = anchor.get(ATTR_HREF)
        if not href:
            # An anchor without a target cannot be crawled.
            continue
        tags[uni2utf8(anchor.text)] = uni2utf8(href)
    return tags


def dump_tags(tags):
    """Print every tag name together with its URL."""
    for name in tags:
        _('名称:%s 地址:%s' % (name, tags[name]))


def next_download_addr(album_name, folder_name):
    """Return the first unused ``<n>.jpg`` path inside the album folder.

    The folder is ``MAIN_DOWNLOAD_FOLDER/album_name`` (converted to GB2312)
    and is created when missing. *folder_name* is accepted for interface
    compatibility but is not used.
    """
    full_name = os.path.join(MAIN_DOWNLOAD_FOLDER, album_name)
    folder_full_name = utf82GBK(full_name)
    if not os.path.isdir(folder_full_name):
        os.makedirs(folder_full_name)
    next_num = 1
    while True:
        file_name = os.path.join(folder_full_name, str(next_num) + '.jpg')
        if not os.path.isfile(file_name):
            return file_name
        next_num += 1


def download_album_one_page(url, album_name):
    """Download every image inside ``<p id="contents">`` of the page at *url*.

    Pages that cannot be fetched or that lack the container are skipped
    with a message instead of aborting the whole crawl.
    """
    soup = url2bs4(url)
    if soup is None:
        _('skipping page (fetch failed): %s' % (url,))
        return
    container = soup.find(name='p', attrs={'id': 'contents'})
    if container is None:
        _('skipping page (no image container): %s' % (url,))
        return
    for img in container.find_all(name='img'):
        src = img.get(ATTR_SRC)
        if not src:
            continue
        alt_text = uni2utf8(img.get(ATTR_ALT, u''))
        wget.download(src, next_download_addr(album_name, alt_text))


def _album_page_count(soup):
    """Return the page count announced by the album pager, or 0 if absent."""
    pager = soup.find(name='div', attrs={'class': 'page'})
    if pager is None:
        return 0
    for li in pager.find_all(name='li'):
        if li.a is None:
            continue
        label = uni2utf8(li.a.text)
        if label.startswith('共'):
            # label is a UTF-8 byte string: the leading character occupies
            # 3 bytes, and the last 5 bytes are a trailing suffix.
            # NOTE(review): exact suffix text is not visible here - confirm.
            return int(label[3:-5])
    return 0


def download_album(url, name):
    """Download all pages of the album at *url* into a folder named *name*.

    While ``ALREDY_TO_DOWNLOAD_ALBLUMN`` is false every album is skipped;
    the album whose name contains ``NEXT_ALBLUM`` is skipped as well but
    switches the flag on, so downloading resumes with the album after it.
    """
    global ALREDY_TO_DOWNLOAD_ALBLUMN
    if not ALREDY_TO_DOWNLOAD_ALBLUMN:
        _('已经下载 %s' % (name,))
        if NEXT_ALBLUM in name:
            _('准备开始继续下载...')
            ALREDY_TO_DOWNLOAD_ALBLUMN = True
        return

    _('正在下载影集:%s 地址:%s' % (name, url))
    soup = url2bs4(url)
    if soup is None:
        _('skipping album (fetch failed): %s' % (url,))
        return
    sum_page = _album_page_count(soup)

    download_album_one_page(url, name)
    # Follow-up pages are named <album>_<n>.html next to the first page.
    prefix = url[:url.rfind('.')]
    for cur_page in range(2, sum_page + 1):
        download_album_one_page('%s_%d.html' % (prefix, cur_page), name)


def process_one_tag_page(tag_name, tag_url):
    """Download every album linked from the listing page *tag_url*."""
    _('当前主题:%s' % (tag_name,))
    soup = url2bs4(tag_url)
    if soup is None:
        _('skipping listing page (fetch failed): %s' % (tag_url,))
        return
    bricks = soup.find_all(
        name='div', attrs={'class': 'item masonry_brick masonry-brick'})
    for brick in bricks:
        album = brick.find(name='a')
        if album is None:
            continue
        href = album.get(ATTR_HREF)
        title = album.get(ATTR_TITLE)
        if not href or title is None:
            continue
        download_album(uni2utf8(href), uni2utf8(title))


def process_tags(tags):
    """Crawl the listing pages of every selected tag in *tags*.

    Only tags whose name contains the hard-coded keyword are processed.
    At most ``MAX_THEME_PAGE_COUNT`` listing pages are visited per tag.
    """
    for tag_name in tags:
        # Hard-coded filter: edit the keyword to crawl other tags.
        if '丝袜' not in tag_name:
            continue
        first_url = tags[tag_name]
        first_page = url2bs4(first_url)
        if first_page is None:
            continue
        page_info = first_page.find(name='li', attrs={'class': 'pageinfo'})
        if page_info is None:
            continue
        info_text = page_info.text
        # The number sits between the '/' and the final character.
        page_num = int(info_text[info_text.find('/') + 1:-1])
        prefix_url = first_url[:first_url.rfind('.')]
        url_list = [first_url]
        # NOTE(review): the upper bound is exclusive, as in the original
        # loop (`start_page_num < page_num`), unlike the inclusive bound in
        # download_album(). Whether the last page should be included depends
        # on what the pageinfo number means - confirm against the site.
        for page_index in range(2, page_num):
            if len(url_list) >= MAX_THEME_PAGE_COUNT:
                break
            url_list.append('%s/%d.html' % (prefix_url, page_index))
        for page_url in url_list:
            process_one_tag_page(tag_name, page_url)


def main():
    """Read the tag index from START_URL and crawl the selected tags."""
    tags = read_tags(START_URL)
    process_tags(tags)


if __name__ == '__main__':
    main()
相关文章推荐
- 技术架构的关注点
- TN21 命令与消息传递
- html5手机网站需要加的那些meta/link标签,html5 meta全解
- 漫谈IM通信架构
- 开发者必备的网站-你都收藏了么?
- Java处理高并发高负载类网站的优化方法
- MySQL高可用浅析
- 大型网站架构不得不考虑的10个问题
- 大型网站架构不得不考虑的10个问题
- SharePoint开发——利用CSOM逐级获取O365中SharePoint网站的List内容
- 大型网站Mysql的演变史
- 如何成为架构师?7个关键的思考、习惯和经验
- 《软件架构设计》学习笔记--0--开篇
- 帝国CMS网站地图sitemap的制作教程,分享2种帝国cms网站地图模板
- 系统架构师所要了解的部分框架
- 论文查重网站
- 用PHP开发购物车网站(第二篇):PDO数据库
- Intel Pin架构
- java 解析网站的标题,简介,图片等
- 架构师都要懂哪些知识