python爬虫抓取MM图(www.mmjpg.com)
2017-07-11 19:31
597 查看
运行后依次输入开始抓取的页码和结束抓取的页码即可(注:开始、结束页码应在1-70之间,具体范围可根据www.mmjpg.com的更新情况调整)
(只有不断练习才会进步,之前写过一个,忘得差不多了。这个写起来比较费力,毕竟才开始学不久,还没有视频教程,只能慢慢摸索)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import urllib.request
import random
import os
import re
import parser
# NOTE(review): these lines look like the body of re_html(); its `def` line is
# missing from this copy of the listing -- confirm against the complete listing.
''''''  # modify the header
header = {}
header["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"
''''''
# Fetch the HTML and return it
req = urllib.request.Request(url)
# NOTE(review): this assigns a tuple to the `add_header` attribute instead of
# calling add_header(), so no User-Agent header is added to the request.
req.add_header = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36")
response = urllib.request.urlopen(req)
html = response.read()
return html
def html_dec(html, encoding="utf-8"):
    """Decode raw response bytes into text.

    Args:
        html: The undecoded page content (bytes).
        encoding: Name of the codec used for decoding; defaults to UTF-8,
            which is what the original hard-coded.

    Returns:
        The decoded page as a ``str``.

    Raises:
        UnicodeDecodeError: If the bytes are not valid in ``encoding``.
    """
    return html.decode(encoding)
def get_main_img_adr(html):
    """Extract the first image address and the picture count from a page.

    Args:
        html: Decoded HTML text of one album page.

    Returns:
        A two-item list ``[image_address, picture_count]``. Both items are
        the strings captured by the regular expressions, e.g.
        ``['http://img.mmjpg.com/2015/1/1.jpg', '33']``.

    Raises:
        IndexError: If the page contains no matching image address or no
            matching page count. The exception type is the one the original
            indexing into an empty result list produced; it now carries a
            message saying which part was missing.
    """
    # Image inside the <div class="content"> element.
    img_match = re.search(
        r'<div class=\"content\".+img src="(.+\.jpg)".+</div>', html
    )
    if img_match is None:
        raise IndexError("no image address found in the page HTML")

    # Text of the last link before <em> inside the <div class="page"> element.
    count_match = re.search(r'<div class=\"page\".+>(.+)</a><em.+</div>', html)
    if count_match is None:
        raise IndexError("no picture count found in the page HTML")

    return [img_match.group(1), count_match.group(1)]
def try_it():
    """Ask for a page range and collect the image info of every page in it.

    Reads a start page and a stop page from standard input, fetches
    ``<url>/mm/<page>`` for each page number in ``range(start, stop)`` (the
    stop page itself is not fetched) and parses each page with
    ``get_main_img_adr``.

    Returns:
        A list with one ``[image_address, picture_count]`` entry per page,
        e.g. ``[['http://img.mmjpg.com/2015/1/1.jpg', '33']]``. The number at
        the end of the address can be replaced by any value from 1 up to the
        picture count to get the other images of that page.

    Raises:
        ValueError: If either input is not an integer.
    """
    start_page = int(input("输入开始下载页: "))
    end_page = int(input("输入停止下载页: "))

    # Drop a trailing slash from the base URL so the joined address does not
    # contain "//mm/".
    base = url.rstrip("/")

    picture = []
    for page in range(start_page, end_page):
        page_html = html_dec(re_html(base + "/mm/" + str(page)))
        picture.append(get_main_img_adr(page_html))
    print(picture)
    return picture
def down_img(picture):
    """Download all images described by ``picture`` into the current directory.

    Args:
        picture: Iterable of ``[image_address, picture_count]`` entries. For
            every entry the images numbered ``1`` to ``picture_count`` are
            fetched from the same location as ``image_address``.

    The files are written as ``1.jpg``, ``2.jpg``, ... with one counter that
    keeps running across all entries.
    """
    file_index = 1
    for entry in picture:
        first_address, count = entry[0], entry[1]
        # Split off the file name at the last "/" instead of cutting a fixed
        # five characters, so a multi-digit file name is handled as well.
        head, sep, _ = first_address.rpartition("/")
        extension = first_address[-4:]
        for number in range(1, int(count) + 1):
            adr = head + sep + str(number) + extension
            print(adr)
            # Download first so a failed request does not leave an empty file.
            img = re_html(adr)
            with open(str(file_index) + ".jpg", "wb") as f:
                f.write(img)
            file_index += 1
def catch_mm(folder="mm", main_page=3):
    """Create the download folder, switch into it and download the images.

    Args:
        folder: Directory the images are saved into. It is created when
            missing; an existing directory is reused instead of raising
            ``FileExistsError``.
        main_page: Accepted for compatibility; not used by this function.
    """
    os.makedirs(folder, exist_ok=True)
    # down_img writes to relative paths, so make the folder the working dir.
    os.chdir(folder)
    # List of [image_address, picture_count] entries.
    picture = try_it()
    # Fetch every image of every entry.
    down_img(picture)
# Run the crawler only when the file is executed as a script.
if __name__ == "__main__":
    catch_mm()
(只有不断练习才会进步,之前写过一个,忘得差不多了。这个写起来比较费力,毕竟才开始学不久,还没有视频教程,只能慢慢摸索)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import urllib.request
import random
import os
import re
import parser
url = "http://www.mmjpg.com/"  # base URL of the main site
# Browser-like User-Agent sent with every request.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"
)

# Proxy addresses; one is picked at random for each request.
_PROXY_POOL = (
    "183.62.71.242:3128",
    "157.0.32.175:8998",
    "180.110.18.111:808",
    "112.252.165.21:8889",
)


def re_html(url, timeout=30):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address to request.
        timeout: Seconds to wait on blocking network operations before the
            request fails instead of hanging indefinitely.

    Returns:
        The undecoded response content as ``bytes``.

    Raises:
        urllib.error.URLError: If the request fails.
    """
    # Only the "https" scheme is mapped to a proxy, so other schemes are
    # requested directly.
    proxy_support = urllib.request.ProxyHandler(
        {"https": random.choice(_PROXY_POOL)}
    )
    # Use the opener directly rather than re-installing it globally on
    # every call.
    opener = urllib.request.build_opener(proxy_support)

    req = urllib.request.Request(url)
    # add_header is a method: assigning a tuple to it (as the code did
    # before) never attached the header to the request.
    req.add_header("User-Agent", _USER_AGENT)

    # The context manager closes the response once the body has been read.
    with opener.open(req, timeout=timeout) as response:
        return response.read()
def html_dec(html, encoding="utf-8"):
    """Decode raw response bytes into text.

    Args:
        html: The undecoded page content (bytes).
        encoding: Name of the codec used for decoding; defaults to UTF-8,
            which is what the original hard-coded.

    Returns:
        The decoded page as a ``str``.

    Raises:
        UnicodeDecodeError: If the bytes are not valid in ``encoding``.
    """
    return html.decode(encoding)
def get_main_img_adr(html):
    """Extract the first image address and the picture count from a page.

    Args:
        html: Decoded HTML text of one album page.

    Returns:
        A two-item list ``[image_address, picture_count]``. Both items are
        the strings captured by the regular expressions, e.g.
        ``['http://img.mmjpg.com/2015/1/1.jpg', '33']``.

    Raises:
        IndexError: If the page contains no matching image address or no
            matching page count. The exception type is the one the original
            indexing into an empty result list produced; it now carries a
            message saying which part was missing.
    """
    # Image inside the <div class="content"> element.
    img_match = re.search(
        r'<div class=\"content\".+img src="(.+\.jpg)".+</div>', html
    )
    if img_match is None:
        raise IndexError("no image address found in the page HTML")

    # Text of the last link before <em> inside the <div class="page"> element.
    count_match = re.search(r'<div class=\"page\".+>(.+)</a><em.+</div>', html)
    if count_match is None:
        raise IndexError("no picture count found in the page HTML")

    return [img_match.group(1), count_match.group(1)]
def try_it():
    """Ask for a page range and collect the image info of every page in it.

    Reads a start page and a stop page from standard input, fetches
    ``<url>/mm/<page>`` for each page number in ``range(start, stop)`` (the
    stop page itself is not fetched) and parses each page with
    ``get_main_img_adr``.

    Returns:
        A list with one ``[image_address, picture_count]`` entry per page,
        e.g. ``[['http://img.mmjpg.com/2015/1/1.jpg', '33']]``. The number at
        the end of the address can be replaced by any value from 1 up to the
        picture count to get the other images of that page.

    Raises:
        ValueError: If either input is not an integer.
    """
    start_page = int(input("输入开始下载页: "))
    end_page = int(input("输入停止下载页: "))

    # Drop a trailing slash from the base URL so the joined address does not
    # contain "//mm/".
    base = url.rstrip("/")

    picture = []
    for page in range(start_page, end_page):
        page_html = html_dec(re_html(base + "/mm/" + str(page)))
        picture.append(get_main_img_adr(page_html))
    print(picture)
    return picture
def down_img(picture):
    """Download all images described by ``picture`` into the current directory.

    Args:
        picture: Iterable of ``[image_address, picture_count]`` entries. For
            every entry the images numbered ``1`` to ``picture_count`` are
            fetched from the same location as ``image_address``.

    The files are written as ``1.jpg``, ``2.jpg``, ... with one counter that
    keeps running across all entries.
    """
    file_index = 1
    for entry in picture:
        first_address, count = entry[0], entry[1]
        # Split off the file name at the last "/" instead of cutting a fixed
        # five characters, so a multi-digit file name is handled as well.
        head, sep, _ = first_address.rpartition("/")
        extension = first_address[-4:]
        for number in range(1, int(count) + 1):
            adr = head + sep + str(number) + extension
            print(adr)
            # Download first so a failed request does not leave an empty file.
            img = re_html(adr)
            with open(str(file_index) + ".jpg", "wb") as f:
                f.write(img)
            file_index += 1
def catch_mm(folder="mm", main_page=3):
    """Create the download folder, switch into it and download the images.

    Args:
        folder: Directory the images are saved into. It is created when
            missing; an existing directory is reused instead of raising
            ``FileExistsError``.
        main_page: Accepted for compatibility; not used by this function.
    """
    os.makedirs(folder, exist_ok=True)
    # down_img writes to relative paths, so make the folder the working dir.
    os.chdir(folder)
    # List of [image_address, picture_count] entries.
    picture = try_it()
    # Fetch every image of every entry.
    down_img(picture)
# Run the crawler only when the file is executed as a script.
if __name__ == "__main__":
    catch_mm()
相关文章推荐
- Python爬虫实战:抓取淘宝MM照片
- Python爬虫实战(4):抓取淘宝MM照片
- Python爬虫实战(4):抓取淘宝MM照片
- Python爬虫实战(4):抓取淘宝MM照片
- Python爬虫实战:抓取淘宝MM照片
- Python爬虫小实践:下载妹子图www.mzitu.com网站上所有的妹子图片,并按相册名字建立文件夹分好文件名
- Getting Started Spidering a Site使用Chilkat(python)练习的一个爬虫(from :http://www.example-code.com)
- 芝麻HTTP:Python爬虫实战之抓取淘宝MM照片
- Python爬虫1:简单抓取网页
- python网页爬虫之列车时刻表的抓取-完整的python脚本
- Python爬虫:新浪新闻详情页的数据抓取(函数版)
- www.pythonchanlleges.com
- Python爬虫抓取框架:Scrapy的架构
- python爬虫(抓取百度新闻列表)
- 芝麻HTTP:Python爬虫实战之抓取爱问知识人问题并保存至数据库
- 零基础写python爬虫之抓取糗事百科代码分享
- 零基础写python爬虫之抓取百度贴吧代码分享
- 使用python/casperjs编写终极爬虫-客户端App的抓取
- 【Python爬虫基础】抓取知乎页面所有图片
- Python爬虫之三:抓取猫眼电影TOP100