
Python script example: downloading images from Baidu Tieba

2018-10-13 17:26

What it does: downloads the images posted in Baidu Tieba threads.
Python version: 2.7
Libraries used: urllib, requests

Core idea

Use urllib to fetch each page of a thread, extract the image links with regular expressions, and download them with urllib.urlretrieve; requests is only used to check the HTTP status code of a page before fetching it:

urllib.urlopen(url).read()
urllib.urlretrieve(pictures,Path_img)
requests.get(url).status_code
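
Putting those three calls together, the whole fetch-check-save loop can be sketched in a few lines (the thread URL, image URL, and file name below are placeholders for illustration, not values from the script):

# -*- coding: utf-8 -*-
import urllib
import requests

url = 'https://tieba.baidu.com/p/1234567890'  # hypothetical thread URL
if requests.get(url).status_code == 200:      # page is reachable
    html = urllib.urlopen(url).read()         # fetch the raw HTML
    # ...extract an image link from html with a regex (see filter_src below)...
    img = 'https://imgsa.baidu.com/example.jpg'  # placeholder image link
    urllib.urlretrieve(img, 'example.jpg')    # save the image to disk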

The idea is simple enough, so let's go straight to the code.

Code

#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = "Man_ge"

import urllib
import requests
import time, re, os, sys

reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 hack so UTF-8 text can be encoded implicitly

# Local directory the images are saved under
LOCAL_PATH = "C:\\Users\\Administrator\\Desktop\\meinv4\\"

# Basic fetch helpers
class TB_get:
    def __init__(self):
        pass

    # Fetch the raw HTML of a page
    def get_html(self, url):
        page = urllib.urlopen(url).read()
        return page

    # HTTP status code of a URL
    def get_state(self, url):
        code = requests.get(url).status_code
        return code

    # Page <title>
    def get_title(self, url):
        reg = r'<title>(.*?)</title>'
        reger = re.compile(reg)
        data = re.findall(reger, urllib.urlopen(url).read())
        # Re-encode to GBK for the Windows console and file system
        return data[0].decode('UTF-8').encode('GBK')

    # Number of replies in the thread
    def get_Replypost(self, url):
        reg = r'l_reply_num.*?</li>'
        reger = re.compile(reg)
        data = re.findall(reger, urllib.urlopen(url).read())
        info = re.compile(r'<span .*?>(.*?)</span>')
        info_data = re.findall(info, str(data))
        return int(info_data[0])

    # Number of pages in the thread (second <span> in the same block)
    def get_pagenumber(self, url):
        reg = r'l_reply_num.*?</li>'
        reger = re.compile(reg)
        data = re.findall(reger, urllib.urlopen(url).read())
        info = re.compile(r'<span .*?>(.*?)</span>')
        info_data = re.findall(info, str(data))
        return int(info_data[1])
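
# For reference, the thread header these two methods parse looks roughly like
#   <li class="l_reply_num"><span ...>320</span>回复贴,共<span ...>5</span>页</li>
# so info_data[0] is the reply count and info_data[1] is the page count
# (the exact markup is Tieba's and may change; the numbers here are made up).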

class TB_filter:
    def __init__(self, html_page):
        self.data = html_page

    # All href="..." / href='...' values
    def filter_href(self):
        reg = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')"
        reger = re.compile(reg)
        data = re.findall(reger, self.data)
        return data

    # Text of all <a> tags
    def filter_a(self):
        reg = r'<a .*?>(.*?)</a>'
        reger = re.compile(reg)
        data = re.findall(reger, self.data)
        return data

    # All src="..." / src='...' values
    def filter_src(self):
        reg = r"(?<=src=\").+?(?=\")|(?<=src=\').+?(?=\')"
        reger = re.compile(reg)
        data = re.findall(reger, self.data)
        return data
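
# Example of what filter_src() returns: given a fragment such as
#   <img src="https://imgsa.baidu.com/forum/pic/item/abc.jpg">
# the lookbehind (?<=src=") and lookahead (?=") match only the bare URL,
# yielding ['https://imgsa.baidu.com/forum/pic/item/abc.jpg'].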

# Download every png/jpg image in one thread
def download_img(path_html):
    tb = TB_get()
    print "Title : ", tb.get_title(path_html)
    if 'page404' in tb.get_html(path_html):
        print "Sorry, this thread has been deleted."
    else:
        print "state : ", tb.get_state(path_html)
        save_path = LOCAL_PATH + tb.get_title(path_html) + "\\"
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        page_number = tb.get_pagenumber(path_html)  # pages in this thread
        print "pages : ", page_number
        print "replies : ", tb.get_Replypost(path_html)
        download_page = 0
        while download_page < page_number:
            download_html = path_html + '?pn=' + str(download_page + 1)  # one page at a time
            print "\n\nstart access : ", download_html
            state_code = tb.get_state(download_html)
            print "state : ", state_code
            if state_code == 200:  # only download when the page returns 200
                page_data = tb.get_html(download_html)
                fl = TB_filter(page_data)
                data = fl.filter_src()
                pictures_number = 0
                for pictures in data:
                    pictures_number += 1
                    ext = pictures.split(".")[-1]
                    if ext in ["png", "jpg"]:  # keep only png/jpg links
                        if str(pictures.split("/")[0]) == "https:":
                            name = str(pictures.split("/")[-1])
                            # a bare timestamp collides within one second, so append a counter
                            newname = str(int(time.time())) + "_" + str(pictures_number) + "." + ext
                            Path_img = save_path + newname
                            imgname = str(name.split("_")[0])
                            # skip emoticons/avatars ("image_...") and links with query strings
                            if imgname != "image" and '?' not in name:
                                print "\nstart download ====> " + name
                                print "loading......."
                                urllib.urlretrieve(pictures, Path_img)
                                print "download success ====> " + newname
                                time.sleep(1)
            else:
                print "access failed!! state : ", state_code
            download_page += 1

# Downloader: takes a tieba (forum) name and a list-page number (each list page holds 50 threads)
def downloader(tb_path, tb_pg):
    tb_path = 'https://tieba.baidu.com/f?kw=' + tb_path + '&ie=utf-8&pn=' + str((tb_pg - 1) * 50)
    tb = TB_get()
    get_all_tb = tb.get_html(tb_path)
    state_code = tb.get_state(tb_path)  # stored so the failure branch can report it
    if state_code == 200:
        print "\n\nAccess : ", tb_path
        reg = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')"
        reger = re.compile(reg)
        data = re.findall(reger, get_all_tb)
        for tb_link in data:
            reger1 = re.compile(r'//tieba\.baidu\.com/p/.{0,}|/p/.{0,}')
            all_tb_link = re.findall(reger1, tb_link)
            if all_tb_link != []:  # every thread link on this list page
                assign_link = str(all_tb_link[0]).split("/p")[-1]
                download_link = "https://tieba.baidu.com/p" + assign_link
                print download_link
                download_img(download_link)
    else:
        print "access failed!! state : ", state_code

if __name__ == '__main__':
    # Download the images from every thread on list pages 1-10 of the '美女' tieba
    # (50 threads per list page, i.e. up to 500 threads in total)
    n = 0
    while n < 10:
        downloader('美女', n + 1)
        n += 1
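
If you only want the images from a single thread rather than whole list pages, you can call download_img directly with a thread URL; the thread id below is made up for illustration:

# Download one thread only; the thread id is a hypothetical example
download_img('https://tieba.baidu.com/p/1234567890')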

Run / Output

(screenshots of the running script and the downloaded images, omitted)

Copyright notice: this is an original article from the Man_ge blog; reposts must credit the source: https://mp.csdn.net/mdeditor/83040439

Author: Man_ge https://blog.csdn.net/Man_ge
