
Python: a small script for scraping Tieba images

2018-02-08 21:23
I have been learning a bit of Python and wrote a small script to scrape images from Baidu Tieba, so this post is a record of it. I did hit one pitfall: in the downloaded HTML, Baidu applies some kind of special treatment and wraps the content in HTML comments, so at first I could not extract the image URLs no matter what I tried. Only after comparing the pages carefully did I spot it; once I stripped the comment markers in bulk I finally got the URLs.
What a trap!
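
To make the pitfall concrete, here is a minimal sketch of the idea (the HTML fragment is a made-up example, not the exact markup Tieba returns): the useful links sit inside an HTML comment, so XPath finds nothing until the comment delimiters are replaced with harmless tags, which is exactly what the full script does with re.sub further down.

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
from lxml import etree

# hypothetical fragment: the real thread link is hidden inside an HTML comment
html = '<div id="wrap"><!--<a class="j_th_tit " href="/p/123456">a thread</a>--></div>'

# parsed as-is, the link is invisible to XPath because it lives in a comment node
print etree.HTML(html).xpath('//a[@class="j_th_tit "]/@href')   # []

# replace the comment markers with plain tags, then the link becomes visible
html = re.sub(r"<!--", "<div>", html)
html = re.sub(r"-->", "</div>", html)
print etree.HTML(html).xpath('//a[@class="j_th_tit "]/@href')   # ['/p/123456']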

The code is as follows:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import urllib
import urllib2
import ssl
import re
import os
from lxml import etree

totalcount = 0

def mkdir(path):
    # strip leading/trailing whitespace and any trailing backslash
    path = path.strip()
    path = path.rstrip("\\")

    # check whether the directory already exists
    isExists = os.path.exists(path)

    if not isExists:
        # directory does not exist yet: create it and switch into it
        os.makedirs(path)
        os.chdir(path)
        print path + ' created successfully'
        return True
    else:
        # directory already exists: just switch into it
        os.chdir(path)
        print path + ' already exists'
        return False

def loadPage(url):
    """
    Send a request to the given URL and return the page served back.
    :param url: the list-page URL to crawl
    :return: the HTML that was read (with comment markers stripped)
    """

    sslNoVerify = ssl._create_unverified_context()
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.4.7 (KHTML, like Gecko) Version/11.0.2 Safari/604.4.7"}

    request = urllib2.Request(url, headers=headers)
    html = urllib2.urlopen(request, context=sslNoVerify).read()

    # Baidu hides the thread list inside HTML comments; turn the comment
    # markers into harmless tags so lxml can see the real markup
    html = re.sub(r"<!--", "<div>", html)
    html = re.sub(r"-->", "</div>", html)

    content = etree.HTML(html)
    link_list = content.xpath('//div[@class="t_con cleafix"]//a[@class="j_th_tit "]/@href')

    # visit every thread found on this list page
    for a in link_list:
        subfullUrl = "https://tieba.baidu.com/" + a
        gotoSubHtml(subfullUrl)
    return html

def gotoSubHtml(url):
    sslNoVerify = ssl._create_unverified_context()
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.4.7 (KHTML, like Gecko) Version/11.0.2 Safari/604.4.7"}

    request = urllib2.Request(url, headers=headers)
    html = urllib2.urlopen(request, context=sslNoVerify).read()

    # same trick as in loadPage: un-comment the post content
    html = re.sub(r"<!--", "<div>", html)
    html = re.sub(r"-->", "</div>", html)

    content = etree.HTML(html)
    link_list = content.xpath('//div[@class="d_post_content_main "]//img[@class="BDE_Image"]/@src')
    for a in link_list:
        response = urllib2.urlopen(a)
        pic = response.read()

        global totalcount
        with open(str(totalcount) + "_" + a[-8:], 'wb') as f:
            f.write(pic)

        totalcount += 1
        print "--- downloaded image number " + str(totalcount)

def saveImg(url):
    # helper that downloads a single image and returns its raw bytes
    # (not called by the spider itself)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.4.7 (KHTML, like Gecko) Version/11.0.2 Safari/604.4.7"}

    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    return response.read()

def writePage(html, fileName):
    """
    Write HTML content to a local file.
    :param html: the page content returned by the server
    :param fileName: name of the file to save
    :return:
    """

    print "Saving " + fileName
    # write the file
    with open(fileName, "w") as f:
        f.write(html)

    print "-" * 30

def tiebaSpider(url, beginPage, endPage, folderName):
    """
    The spider scheduler: builds and handles the URL for every list page.
    :param url: the fixed front part of the Tieba URL
    :param beginPage: first page to crawl
    :param endPage: last page to crawl
    :param folderName: directory the images are saved into
    :return: nil
    """

    # create the output directory once, before crawling any pages
    mkdir(folderName)

    for page in range(beginPage, endPage + 1):
        pn = (page - 1) * 50
        fileName = "page_" + str(page) + ".html"
        fullUrl = url + "&pn=" + str(pn)
        # print fullUrl

        html = loadPage(fullUrl)
        # writePage(html, fileName)  # uncomment to also keep the raw list-page HTML
    print "Thanks for using this script"
    print "-" * 30

if __name__ == "__main__":
    kw = raw_input("Enter the name of the Tieba forum to crawl: ")
    beginPage = int(raw_input("Enter the start page: "))
    endPage = int(raw_input("Enter the end page: "))

    url = "http://tieba.baidu.com/f?"

    key = urllib.urlencode({"kw": unicode(kw, "utf-8").encode('gb2312')})
    print "key = " + key
    fullUrl = url + key

    tiebaSpider(fullUrl, beginPage, endPage, kw)
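
If you would rather drive the spider without the interactive prompts, a snippet like the following could replace the __main__ block (a minimal sketch; the forum name and page range are just example values, and a plain ASCII keyword does not need the GB2312 re-encoding step):

# -*- coding:utf-8 -*-
import urllib

kw = "python"                       # example forum name
url = "http://tieba.baidu.com/f?"
key = urllib.urlencode({"kw": kw})  # ASCII keyword, so no re-encoding needed
tiebaSpider(url + key, 1, 2, kw)    # crawl list pages 1-2 into a "python" folder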

The scraped results:
Tags: python, crawler