您的位置:首页 > 编程语言 > Go语言

Google 图片下载工具

2012-07-11 22:06 281 查看
毕设做实验需要从网上下几万张图片,以前用师兄做的Flickr下载器,用Flickr的API完成的。但是Flickr上的图片是用户分享居多,通过指定的关键词去搜索,很多时候无法得到满意的图片。在Google、Bing上虽然能得到比较好的搜索结果,但是Google早早地停用了搜索的SDK,CodeProject上的例子是N年前的,试过都不能用了;Bing虽然现在还有SDK,但是看官方的通告,大约是8月份也要停用了,而且现在提供的下载限制每天一张,木有办法,只能自己想招了。

在查看Google图片搜索页面的源码时,发现在<a>的href属性里面包含了图片原始的url,所以就想到解析搜索结果页面的办法,将原始图片的url切出来,然后从url下载原始的图片。url的切割,可以使用正则表达式来完成。

GoogleImages 1 using System;
2 using System.Collections.Generic;
3 using System.Net;
4 using System.Xml.Linq;
5 using System.IO;
6 using System.Web;
7 using System.Text.RegularExpressions;
8 using System.Threading;
9 using System.Drawing;
10 using System.Text;
11
12 namespace GoogleImageDownload
13 {
14 public class GoogleImages
15 {
/// <summary>
/// Google image-search URL template, filled in with <c>string.Format</c>.
/// {0}: the search keyword.
/// {1}: index of the first search result to return. A result page holds about
/// 21 images, so passing 0, 21, 42, 63, ... walks through successive pages.
/// </summary>
private const string IMG_URL = "http://www.google.com.hk/search?q={0}&hl=zh-CN&newwindow=1&safe=strict&biw=1280&" +
    "bih=699&gbv=2&ie=UTF-8&tbm=isch&ei=2HblT4vrCISwiQeavLhZ&start={1}&sa=N";
/// <summary>
/// Number of result pages requested per keyword (each page is fetched with a plain GET request).
/// </summary>
private const int PAGES = 10;
/// <summary>
/// Names of the four error categories written to the log. The position in this
/// array is the <c>type</c> code passed to <c>PrintException</c>.
/// Declared readonly so the shared array reference cannot be swapped out at runtime.
/// </summary>
public static readonly string[] ERRORS = { "GetDownloadInfo", "CreateImageDownloadLink", "SaveToLocal", "RenameImage" };
/// <summary>Path of the log file for the keyword currently being processed.</summary>
private string logFile = "";
/// <summary>Folder that receives the downloaded images for the current keyword.</summary>
private string downloadFolder = "";
/// <summary>The keyword currently being searched for and downloaded.</summary>
private string downloadObj = "";
34
/// <summary>
/// Downloads the Google image-search results for one keyword.
/// Creates three working locations relative to the current directory:
///   .\Images\{keyword}   - the downloaded images;
///   .\DownloadInfos      - {keyword}_res.txt (raw result pages) and
///                          {keyword}_img.txt ("newFileName&lt;TAB&gt;imageUrl" lines);
///   .\Log                - {keyword}.log with errors and the total elapsed time.
/// It then fetches <c>PAGES</c> result pages, extracts every image URL from the
/// saved HTML with a regular expression, and downloads each image in turn.
/// </summary>
/// <param name="Obj">The search keyword; it is cast to <see cref="string"/>.</param>
public void DownLoadImages(object Obj)
{
    DateTime tStart = DateTime.UtcNow;
    this.downloadObj = (string)Obj;

    // Directory.CreateDirectory is a no-op when the directory already exists,
    // so no separate Directory.Exists check is needed.
    this.downloadFolder = String.Format(".\\Images\\{0}", downloadObj);
    Directory.CreateDirectory(this.downloadFolder);

    Directory.CreateDirectory(".\\DownloadInfos");
    string resHtmlFile = ".\\DownloadInfos\\" + downloadObj + "_res.txt";
    EnsureFileExists(resHtmlFile);
    string resImageList = ".\\DownloadInfos\\" + downloadObj + "_img.txt";
    EnsureFileExists(resImageList);

    Directory.CreateDirectory(".\\Log");
    this.logFile = ".\\Log\\" + downloadObj + ".log";
    EnsureFileExists(this.logFile);

    FetchResultPages(resHtmlFile);
    DownloadImagesFromPages(resHtmlFile, resImageList);

    // Record how long the whole run took.
    TimeSpan cost = DateTime.UtcNow - tStart;
    PrintTime(cost.ToString());
}

/// <summary>
/// Creates an empty file at <paramref name="path"/> if none exists yet.
/// </summary>
private static void EnsureFileExists(string path)
{
    if (!File.Exists(path))
    {
        // File.Create returns an open FileStream. It must be disposed right away,
        // otherwise the handle stays open and later writers cannot open the file.
        using (File.Create(path))
        {
        }
    }
}

/// <summary>
/// Requests <c>PAGES</c> consecutive search-result pages for the current keyword
/// and appends the HTML of every successful response to <paramref name="resHtmlFile"/>.
/// A failed request is logged as error type 0 and the loop moves on to the next page.
/// </summary>
private void FetchResultPages(string resHtmlFile)
{
    // Escape the keyword so spaces, '&', '#' or non-ASCII text cannot corrupt the query string.
    string query = Uri.EscapeDataString(downloadObj ?? "");

    for (int i = 0; i < PAGES; i++)
    {
        // "start" advances by 21 results per page.
        string url = string.Format(IMG_URL, query, 21 * i);
        try
        {
            HttpWebRequest r = (HttpWebRequest)WebRequest.Create(url);
            r.AllowAutoRedirect = true;
            r.CookieContainer = new CookieContainer();

            // Disposing the response releases the connection on every path,
            // including non-OK status codes and exceptions while reading.
            using (HttpWebResponse res = (HttpWebResponse)r.GetResponse())
            {
                if (res.StatusCode == HttpStatusCode.OK)
                {
                    Console.WriteLine("start");
                    using (StreamReader s = new StreamReader(res.GetResponseStream(), Encoding.GetEncoding("GB2312")))
                    using (StreamWriter sw = new StreamWriter(resHtmlFile, true))
                    {
                        sw.Write(s.ReadToEnd());
                    }
                    Console.WriteLine(downloadObj + " " + i);
                }
            }
        }
        catch (Exception ex)
        {
            PrintException(0, downloadObj, ex.ToString());
        }

        Console.WriteLine("end");
    }
}

/// <summary>
/// Scans the saved result pages for URLs, keeps those that look like images,
/// records each as "newFileName&lt;TAB&gt;url" in <paramref name="resImageList"/>
/// and downloads it into the keyword's image folder.
/// </summary>
private void DownloadImagesFromPages(string resHtmlFile, string resImageList)
{
    string result = File.ReadAllText(resHtmlFile);

    // Generic URL pattern (http, https or ftp).
    string strRegex = "(http[s]{0,1}|ftp)://[a-zA-Z0-9\\.\\-]+\\.([a-zA-Z]{2,4})(:\\d+)?(/[a-zA-Z0-9\\.\\-~!@#$%^&*+?:_/=<>]*)?";
    Regex re = new Regex(strRegex);
    MatchCollection matches = re.Matches(result);

    int count = 0;
    using (StreamWriter imageList = new StreamWriter(resImageList, true))
    {
        foreach (Match img in matches)
        {
            string tmp = img.Value;

            // The pattern accepts '&' but not ';', so a URL followed by the HTML
            // entity "&amp;" is captured with a dangling "&amp". Strip only that
            // suffix; URLs that merely contain '&' elsewhere are left intact.
            if (tmp.EndsWith("&amp", StringComparison.Ordinal))
            {
                tmp = tmp.Substring(0, tmp.Length - 4);
            }

            // Keep only URLs that point at an image.
            bool looksLikeImage = tmp.Contains(".jpg") || tmp.Contains(".png") ||
                                  tmp.Contains(".jpeg") || tmp.Contains(".gif");
            if (!looksLikeImage)
            {
                continue;
            }

            string[] split = tmp.Split(new char[] { '/' });
            string remoteName = split[split.Length - 1];
            string newFileName;

            // Assign the image a sequential local name: {keyword}_{NNN}{extension}.
            try
            {
                FileInfo fi = new FileInfo(remoteName);
                newFileName = String.Format("{0}_{1}{2}", this.downloadObj, count.ToString("000"), fi.Extension);
                count++;
                imageList.WriteLine(String.Format("{0}\t{1}", newFileName, tmp));
                // Flush per entry so the list on disk stays current during long downloads.
                imageList.Flush();
            }
            catch (Exception ex)
            {
                // Without a usable local name there is nothing to save to, so log
                // the failure as a rename error (type 3) and skip this URL.
                PrintException(3, remoteName, ex.ToString());
                continue;
            }

            Console.WriteLine(remoteName + " is downloading");
            SavePhotoFromUrl(newFileName, tmp);
            Console.WriteLine(remoteName + " has been downloaded");
        }
    }
}
185
/// <summary>
/// Downloads the resource at <paramref name="Url"/> and stores it in the current
/// download folder under the name <paramref name="FileName"/>.
/// </summary>
/// <param name="FileName">Local file name (without folder) for the image.</param>
/// <param name="Url">Address of the image to download.</param>
/// <returns>
/// true when the file already exists locally or was saved successfully;
/// false when the request or the save failed (the failure is written to the log).
/// </returns>
public bool SavePhotoFromUrl(string FileName, string Url)
{
    bool Value = false;
    string target = this.downloadFolder + "\\" + FileName;

    try
    {
        // An existing file is treated as already downloaded; skip the network
        // round trip entirely.
        if (File.Exists(target))
        {
            return true;
        }

        // Keep the request as a plain WebRequest: the URL pattern used by the
        // caller also matches ftp:// addresses, which are not HttpWebRequest.
        WebRequest request = WebRequest.Create(Url);

        // Disposing the response releases the underlying connection on every path.
        using (WebResponse response = request.GetResponse())
        {
            Value = SaveBinaryFile(response, target);
        }
    }
    catch (Exception ex)
    {
        PrintException(1, Url, ex.ToString());
    }
    return Value;
}
/// <summary>
/// Copies the body of <paramref name="response"/> into a new local file.
/// </summary>
/// <param name="response">The response whose stream holds the image data.</param>
/// <param name="FileName">Full path of the file to create.</param>
/// <returns>
/// true when the file already exists or was written completely;
/// false when an error occurred (logged as error type 2).
/// </returns>
private bool SaveBinaryFile(WebResponse response, string FileName)
{
    bool created = false;

    try
    {
        if (File.Exists(FileName))
        {
            return true;
        }

        byte[] buffer = new byte[1024];

        // Open the network stream first so that a failure there does not leave
        // an empty file behind; both streams are closed even if the copy throws.
        using (Stream inStream = response.GetResponseStream())
        using (Stream outStream = File.Create(FileName))
        {
            created = true;

            int l;
            while ((l = inStream.Read(buffer, 0, buffer.Length)) > 0)
            {
                outStream.Write(buffer, 0, l);
            }
        }

        return true;
    }
    catch (Exception ex)
    {
        PrintException(2, FileName, ex.ToString());

        // A partially written file would be mistaken for a finished download
        // by the File.Exists check on the next run, so remove it.
        if (created)
        {
            try
            {
                File.Delete(FileName);
            }
            catch (Exception deleteEx)
            {
                PrintException(2, FileName, deleteEx.ToString());
            }
        }

        return false;
    }
}
249
/// <summary>
/// Appends an error entry to the log file of the current keyword.
/// Four error categories exist:
/// 0: GetDownloadInfo - requesting the search-result pages from Google failed;
/// 1: CreateImageDownloadLink - connecting to an image URL failed;
/// 2: SaveToLocal - saving the image data to disk failed;
/// 3: RenameImage - building the standard local file name failed.
/// </summary>
/// <param name="type">Error category, an index into <c>ERRORS</c>.</param>
/// <param name="obj">The item (keyword, URL or file name) the error relates to.</param>
/// <param name="exceptionInfo">Text describing the error.</param>
private void PrintException(int type, string obj, string exceptionInfo)
{
    // Fall back to the numeric code so an unexpected type value cannot make
    // the logger itself fail.
    string category = (type >= 0 && type < GoogleImages.ERRORS.Length)
        ? GoogleImages.ERRORS[type]
        : type.ToString();

    try
    {
        using (StreamWriter sPrint = new StreamWriter(this.logFile, true))
        {
            sPrint.WriteLine(String.Format("TYPE:{0}\tOBJECT:[{1,-30}]\nERROR:{2}\n", category, obj, exceptionInfo));
        }
    }
    catch (Exception ex)
    {
        // Logging is best-effort and must never abort a download, but the
        // failure should not vanish silently: report it on standard error.
        Console.Error.WriteLine("Failed to write to log file: " + ex.Message);
        Console.Error.WriteLine(exceptionInfo);
    }
}
274
/// <summary>
/// Appends the total download time to the log file of the current keyword.
/// </summary>
/// <param name="time">Elapsed time, already formatted as text.</param>
private void PrintTime(string time)
{
    try
    {
        using (StreamWriter sPrint = new StreamWriter(this.logFile, true))
        {
            sPrint.WriteLine(String.Format("Download Cost:{0}", time));
        }
    }
    catch (Exception ex)
    {
        // Logging is best-effort; report the failure on standard error
        // instead of discarding it.
        Console.Error.WriteLine("Failed to write to log file: " + ex.Message);
    }
}
292 }
293 }

附:Google url中各个参数的含义(

http://www.4ucode.com/Study/Topic/1060948):

hl(Interface Language):Google搜索的界面语言

q(Query):查询的关键词

start:显示搜索结果的起始端,如果start=1,则从第2个搜索结果开始显示;如果你想直接看搜索结果的第21页,让start=200即可,由于Google只显示1000条搜索结果记录,start理论取值范围在0–999之间

lr(Language Restrict):搜索内容的语言限定,即限定只搜索某种语言的网页。如果lr参数为空,则为搜索所有网页

ie(Input Encoding):查询关键词的编码,缺省设置为utf-8,也就是说请求Google搜索时参数q的值是一段utf-8编码的文字

oe(Output Encoding):搜索结果页面的网页编码,缺省设置oe=utf-8

num(Number):搜索结果显示条数,取值范围在10–100条之间,缺省设置num=10

newwindow:是否开启新窗口以显示查询结果,缺省设置newwindow=1,在新窗口打开搜索结果页面

aq(Ascending Query):判断搜索用户是否是第一次查询,如果用户第一次进行查询,则aq=f(First);如若进行过多次查询,则aq=-1,这个的主要作用应该是统计和防止作弊

as_q(Ascending Search Query):上一次查询关键词

欢迎指教&讨论~
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: