网络爬虫:抓取XXOO图片
2017-01-20 19:14
225 查看
基本程序
# -*- coding: utf-8 -*- import urllib.request import urllib.parse import os from bs4 import BeautifulSoup import re def url_open(url): req=urllib.request.Request(url) req.add_header("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) " "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36") response=urllib.request.urlopen(req) html=response.read() return html def get_pagenum(url): #获取jandan网站的页面号(2320) html = url_open(url).decode("utf-8") num_re = re.compile(r'<span class="current-comment-page">\[\d{4}\]</span>') num = num_re.search(html) a = re.compile(r'\d{4}') num_b = a.search(num.group()) return num_b.group() def get_images(url): # html=url_open(url).decode("utf-8") # img_list=[] # jpg_re=re.compile(r'<img src="//w.*\.jpg') #有的图片ww开头,有的以wx开头 # numurl=jpg_re.findall(html) # print (numurl) # jpg = re.compile(r'//w.+\.jpg') # for line in numurl: # imgurl=jpg.findall(line) # img_list.append(imgurl[0]) # return img_list html=url_open(url).decode("utf-8") img_list=[] jpg_re=re.compile(r'<img src="//([^"]+\.jpg)"') #当给出的正则表达式中带有一个括号时,列表的元素为字符串, #此字符串的内容与括号中的正则表达式相对应(不是整个正则表达式的匹配内容) imgurl=jpg_re.findall(html) for each in imgurl: img_list.append(each) # print (img_list) return img_list def save_imgs(img_list): i=0 for each in img_list: i+=1 filename=each.split("/")[-1] with open(filename,"wb") as f: img=url_open("http://%s" %each) f.write(img) print ("下载本页的第%s张图片,名称为%s" %(i,filename)) def download__mm(dir,url): if not os.path.isdir(dir): os.mkdir(dir) os.chdir(dir) else: os.chdir(dir) url=url page_num=int(get_pagenum(url)) for i in range(20): page_num -= 1 pageurl = url + "page-" + str(page_num) + "#comments" imgurl = get_images(pageurl) print("下载第%s页图片" % page_num) saveimg = save_imgs(imgurl) if __name__=="__main__": dir="PaPa" url= "http://jandan.net/ooxx/" download__mm(dir,url)
增加代理
**但是存在的问题是:使用代理后出现 `urllib.error.URLError: [WinError 10060]`——由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。**
# -*- coding: utf-8 -*- import urllib.request import urllib.parse import os from bs4 import BeautifulSoup import re import random proxies = [] headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"} def get_proxy(): url="http://www.xicidaili.com" req=urllib.request.Request(url,headers=headers) # req.add_header("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) " # "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36") response=urllib.request.urlopen(req) html=response.read().decode("utf-8") # IP=re.compile(r"\b(([01]?\d?\d|2[0-4]\d|25[0-5])\.){3}([01]?\d?\d|2[0-4]\d|25[0-5])\b") IP = re.compile('<td>(\d+)\.(\d+)\.(\d+)\.(\d+)</td>\s*<td>(\d+)</td>') proxy_ip=IP.findall(html) for each in proxy_ip: proxies.append(":".join([(".".join(each[0:4])),each[4]])) # print (proxies) return proxies def change_proxy(): proxy=random.choice(proxies) if proxy==None: proxy_support=urllib.request.ProxyHandler({}) else: proxy_support = urllib.request.ProxyHandler({"http": proxy}) opener = urllib.request.build_opener(proxy_support) opener.addheaders=[("User-Agent",headers["User-Agent"])] urllib.request.install_opener(opener) print('智能切换代理:%s' % ('本机' if proxy == None else proxy)) def url_open(url): req=urllib.request.Request(url,headers=headers) response=urllib.request.urlopen(req) html=response.read() return html def get_pagenum(url): #获取jandan网站的页面号(2320) html = url_open(url).decode("utf-8") num_re = re.compile(r'<span class="current-comment-page">\[\d{4}\]</span>') num = num_re.search(html) a = re.compile(r'\d{4}') num_b = a.search(num.group()) return num_b.group() def get_images(url): html=url_open(url).decode("utf-8") img_list=[] jpg_re=re.compile(r'<img src="//([^"]+\.jpg)"') #当给出的正则表达式中带有一个括号时,列表的元素为字符串, #此字符串的内容与括号中的正则表达式相对应(不是整个正则表达式的匹配内容) imgurl=jpg_re.findall(html) for each in imgurl: img_list.append(each) # print (img_list) return img_list def save_imgs(img_list): i=0 for 
each in img_list: i+=1 filename=each.split("/")[-1] with open(filename,"wb") as f: img=url_open("http://%s" %each) f.write(img) print ("下载本页的第%s张图片,名称为%s" %(i,filename)) def download__mm(dir,url): if not os.path.isdir(dir): os.mkdir(dir) os.chdir(dir) else: os.chdir(dir) url=url page_num=int(get_pagenum(url)) for i in range(20): page_num -= 1 pageurl = url + "page-" + str(page_num) + "#comments" imgurl = get_images(pageurl) print("下载第%s页图片" % page_num) saveimg = save_imgs(imgurl) if __name__=="__main__": get_proxy() change_proxy() dir="PaPa" url= "http://jandan.net/ooxx/" download__mm(dir,url)
相关文章推荐
- 蜘蛛爬虫网络高像素图片抓取工具[ZSpider.NET]
- 蜘蛛爬虫网络高像素图片抓取工具[搜索引擎]
- Python3 网络爬虫之抓取图片
- android适用爬虫抓取网络图片
- python 网络爬虫抓取图片
- 蜘蛛爬虫网络高像素图片抓取工具[ZSpider.NET]
- 基于python的网络爬虫---抓取p站图片
- 蜘蛛爬虫网络高像素图片抓取工具[搜索引擎]
- 网络爬虫:抓取页面图片
- Python爬虫之抓取网络图片
- Jsoup 爬虫 抓取网络图片
- 网络爬虫内容抓取
- Java抓取网络图片
- [python]第一炮:抓取图片的小爬虫
- 网络爬虫,使用NodeJs抓取RSS新闻
- java抓取、java网络爬虫实例项目jnc
- php爬虫抓取百度贴吧图片
- C#抓取网络图片保存到本地
- python爬虫抓取图片