您的位置:首页 > 编程语言 > Python开发

Python爬虫学习记录(1)——百度贴吧图片下载

2015-08-22 20:06 1046 查看
#!/usr/bin/python
#coding=utf-8
import os
from urllib.request import urlopen
from urllib.request import urlretrieve
import re
def getHtml(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address to request; passed straight to ``urlopen``.

    Returns:
        The undecoded bytes read from the response.
    """
    # The context manager closes the response even if read() raises.
    with urlopen(url) as page:
        return page.read()

def getImg(html, id, page_num):
    """Download every image URL found in *html* into the current directory.

    Args:
        html: Page source, either raw bytes or text.
        id: Topic id as a string; becomes part of each saved file name.
        page_num: Page number; used in the saved file names and in the name
            of the debug dump.

    Returns:
        The list of image URLs matched in the page, in order of appearance.
    """
    # Host prefix, then 70-100 arbitrary characters, then "jpg".
    reg = r'http:\/\/imgsrc.baidu.com\/forum\/.{70,100}jpg'
    imgre = re.compile(reg)

    # str() on bytes yields the "b'...'" repr with escaped bytes rather than
    # the page text, so decode real bytes instead.
    if isinstance(html, (bytes, bytearray)):
        html = html.decode("utf-8", errors="replace")
    else:
        html = str(html)

    # Debug dump of the page source. The directory is machine-specific, so
    # skip the dump when it is absent instead of aborting the download.
    dump_dir = "/usr/lxp/python_test/getImg_Python"
    if os.path.isdir(dump_dir):
        with open(dump_dir + "/out_" + str(page_num), "w+", encoding="utf-8") as f:
            f.write(html)

    imglist = imgre.findall(html)
    for x, imgurl in enumerate(imglist):
        save_name = 'topic_' + id + '_' + str(page_num) + '_%s.jpg' % x
        urlretrieve(imgurl, save_name)
        # Report only after the file has actually been saved.
        print('download' + save_name + ' sucessfully from ' + imgurl)
    return imglist

def getAllImg(topic_id):
    """Download the images of a thread page by page into ``topic_<id>/``.

    Pages are requested with increasing ``pn`` until a page has exactly the
    same length as the previous one, which is taken as the end of the thread.

    Args:
        topic_id: Thread id as a string; it is concatenated into the request
            URL and into the ``topic_<id>`` directory name.

    Returns:
        The page number at which the loop stopped.
    """
    page_num = 1
    html_len = 0
    target_dir = 'topic_' + topic_id
    # Use filesystem calls rather than shell command strings: topic_id may be
    # raw user input and must never be interpreted by a shell.
    os.makedirs(target_dir, exist_ok=True)
    while True:
        html = getHtml("http://tieba.baidu.com/p/" + topic_id + '?see_lz=1&pn=' + str(page_num))
        print(str(html_len) + ' ' + str(len(html)))
        # Same size as the previous page: stop.
        if html_len == len(html):
            break
        getImg(html, topic_id, page_num)
        _move_topic_images(topic_id, target_dir)
        html_len = len(html)
        page_num = page_num + 1
    return page_num


def _move_topic_images(topic_id, target_dir):
    """Move ``topic_<id>*.jpg`` files from the current directory into *target_dir*."""
    prefix = 'topic_' + topic_id
    for name in os.listdir('.'):
        if name.startswith(prefix) and name.endswith('.jpg') and os.path.isfile(name):
            # os.replace overwrites an existing destination, like ``mv``.
            os.replace(name, os.path.join(target_dir, name))

if __name__ == "__main__":
    # Run the download only when executed as a script, so importing this
    # module does not block on stdin or start network requests.
    # Surrounding whitespace is stripped because the id goes straight into a URL.
    topic_id = input("topic id:").strip()
    getAllImg(topic_id)

</pre>
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  python 爬虫 图片 百度