C#winform抓取百度,Google搜索关键词结果
2011-11-07 13:44
771 查看
出于网站 SEO 的需要,我做了一个采集百度和 Google 关键词搜索结果的小工具,在这里与大家分享一下。
先看下效果图,然后是代码(View Code):
/// <summary>
/// Click handler for the Baidu button: requests up to 100 Baidu results for the
/// keyword typed into <c>txtSearch</c> and binds the parsed hits to <c>dataGridView1</c>.
/// </summary>
/// <param name="sender">Control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void baidu_Click(object sender, EventArgs e)
{
    const int num = 100; // number of results requested (the "rn" query parameter)
    string word = txtSearch.Text.Trim();

    // Percent-encode the keyword so that characters such as '&', '#', '+' or
    // spaces cannot truncate or corrupt the query string.
    string url = "http://www.baidu.com/s?wd=" + Uri.EscapeDataString(word) + "&rn=" + num;

    string html = search(url, "gb2312");
    if (string.IsNullOrEmpty(html))
    {
        return;
    }

    BaiduSearch baidu = new BaiduSearch();

    // Only bind results when the page reports a non-zero total hit count.
    int count = baidu.GetSearchCount(html);
    if (count > 0)
    {
        List<Keyword> keywords = baidu.GetKeywords(html, word);
        dataGridView1.DataSource = keywords;
    }
}
/// <summary>
/// Click handler for the Google button: requests up to 100 Google results for the
/// keyword typed into <c>txtSearch</c> and binds the parsed hits to <c>dataGridView1</c>.
/// </summary>
/// <param name="sender">Control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void google_Click(object sender, EventArgs e)
{
    const int num = 100; // number of results requested (the "num" query parameter)
    string word = txtSearch.Text.Trim();

    // Percent-encode the keyword so that characters such as '&', '#', '+' or
    // spaces cannot truncate or corrupt the query string.
    string url = "http://www.google.com.hk/search?hl=zh-CN&source=hp&q=" + Uri.EscapeDataString(word) + "&aq=f&aqi=&aql=&oq=&num=" + num;

    string html = search(url, "utf-8");
    if (string.IsNullOrEmpty(html))
    {
        return;
    }

    googleSearch google = new googleSearch();
    List<Keyword> keywords = google.GetKeywords(html, word);
    dataGridView1.DataSource = keywords;
}
/// <summary>
/// Downloads a page with an HTTP GET request and returns its cleaned-up HTML.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <param name="Chareset">Name of the text encoding used to decode the response body, e.g. "gb2312" or "utf-8".</param>
/// <returns>
/// The response body as produced by <c>readResponseStream</c>, or an empty string
/// when the request or the read fails.
/// </returns>
public string search(string url, string Chareset)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.UseDefaultCredentials = true;
    request.ContentType = "text/html";
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.0; .NET CLR 1.1.4322; .NET CLR 2.0.50215;)";
    request.Method = "GET";
    request.CookieContainer = new CookieContainer();

    try
    {
        // Dispose the response so the underlying connection is released.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            return readResponseStream(response, Chareset);
        }
    }
    catch (Exception ex)
    {
        // Deliberately best-effort: callers test the result with IsNullOrEmpty,
        // so a failure is reported as "no HTML" rather than handing the
        // exception text back as if it were page content.
        System.Diagnostics.Trace.TraceError("search failed for {0}: {1}", url, ex);
        return string.Empty;
    }
}
/// <summary>
/// Reads the entire body of <paramref name="response"/> as text and passes it
/// through <c>formatHTML</c>.
/// </summary>
/// <param name="response">Response whose body stream is read to the end.</param>
/// <param name="Chareset">Name of the encoding used to decode the body.</param>
/// <returns>The decoded body after <c>formatHTML</c> has been applied.</returns>
public string readResponseStream(HttpWebResponse response, string Chareset)
{
    Stream body = response.GetResponseStream();
    Encoding bodyEncoding = Encoding.GetEncoding(Chareset);

    // Disposing the reader also closes the response stream it wraps.
    using (StreamReader reader = new StreamReader(body, bodyEncoding))
    {
        string raw = reader.ReadToEnd();
        return formatHTML(raw);
    }
}
/// <summary>
/// Normalizes downloaded HTML before it is matched with regular expressions:
/// removes a few decorative entities, drops carriage returns, tabs and line
/// feeds so the markup sits on one line, and decodes <c>&amp;amp;</c> to <c>&amp;</c>.
/// </summary>
/// <param name="htmlContent">Raw page source; may be null or empty.</param>
/// <returns>The normalized markup, or an empty string for null or empty input.</returns>
public string formatHTML(string htmlContent)
{
    if (string.IsNullOrEmpty(htmlContent))
    {
        return string.Empty;
    }

    // The entity forms are matched explicitly. Stripping a literal space would
    // also destroy the spaces inside tags (e.g. <h3 class="t">), and "/r", "/t",
    // "/n" would eat parts of URLs; the escape sequences are what is meant.
    return htmlContent
        .Replace("&raquo;", "").Replace("»", "")
        .Replace("&nbsp;", "")
        .Replace("&copy;", "").Replace("©", "")
        .Replace("\r", "")
        .Replace("\t", "")
        .Replace("\n", "")
        .Replace("&amp;", "&");
}
把百度和Google两个类抽取了出来
1.百度Search类
View Code
/// <summary>
/// Parses Baidu result pages: the total hit count reported by the page and the
/// title/link pairs of the individual results.
/// </summary>
class BaiduSearch
{
    // The next three fields are not read by any method of this class.
    protected string uri = "http://www.baidu.com/s?wd=";
    protected Encoding queryEncoding = Encoding.GetEncoding("gb2312");
    protected Encoding pageEncoding = Encoding.GetEncoding("gb2312");

    // Captures the digits and commas between "找到相关结果" (optionally followed by "约") and "个".
    protected string resultPattern = @"(?<=找到相关结果[约]?)[0-9,]*?(?=个)";

    /// <summary>
    /// Extracts the total number of hits reported on a result page.
    /// </summary>
    /// <param name="html">Page source; may be null or empty.</param>
    /// <returns>The reported count, or 0 when it is absent or cannot be parsed as an int.</returns>
    public int GetSearchCount(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return 0;
        }

        Match match = Regex.Match(html, resultPattern);
        if (!match.Success)
        {
            return 0;
        }

        // Drop thousands separators; TryParse leaves result at 0 on failure.
        int result;
        int.TryParse(match.Value.Replace(",", string.Empty), out result);
        return result;
    }

    /// <summary>
    /// Extracts every result heading (<c>&lt;h3 class="t"&gt;&lt;a ...&gt;</c>) from the page.
    /// </summary>
    /// <param name="html">Page source; may be null or empty.</param>
    /// <param name="word">Search keyword; not used by the extraction.</param>
    /// <returns>One <c>Keyword</c> per match, with IDs numbered from 1 in page order; empty when nothing matches.</returns>
    public List<Keyword> GetKeywords(string html, string word)
    {
        List<Keyword> keywords = new List<Keyword>();
        if (string.IsNullOrEmpty(html))
        {
            return keywords;
        }

        const string pattern = "<h3 class=\"t\"><a.*?href=\"(?<url>.*?)\".*?>(?<content>.*?)</a>";
        int id = 1;
        foreach (Match hit in Regex.Matches(html, pattern))
        {
            Keyword keyword = new Keyword();
            keyword.ID = id++;
            // Strip any markup nested inside the anchor text.
            keyword.Title = Regex.Replace(hit.Groups["content"].Value, "<[^>]*>", string.Empty);
            keyword.Link = hit.Groups["url"].Value;
            keywords.Add(keyword);
        }

        return keywords;
    }
}
2 .GoogleSearch类
View Code
/// <summary>
/// Parses Google result pages into title/link pairs.
/// </summary>
class googleSearch
{
    /// <summary>
    /// Extracts every result heading (<c>&lt;h3 class="r"&gt;&lt;a ...&gt;</c>, matched
    /// case-insensitively) from the page.
    /// </summary>
    /// <param name="html">Page source; may be null or empty.</param>
    /// <param name="word">Search keyword; not used by the extraction.</param>
    /// <returns>One <c>Keyword</c> per match, with IDs numbered from 1 in page order; empty when nothing matches.</returns>
    public List<Keyword> GetKeywords(string html, string word)
    {
        List<Keyword> keywords = new List<Keyword>();
        if (string.IsNullOrEmpty(html))
        {
            return keywords;
        }

        Regex regTable = new Regex("<h3 class=\"r\"><a.*?href=\"(?<url>.*?)\".*?>(?<content>.*?)</a>", RegexOptions.IgnoreCase);
        int id = 1;
        foreach (Match hit in regTable.Matches(html))
        {
            Keyword keyword = new Keyword();
            keyword.ID = id++;
            // Strip any markup nested inside the anchor text.
            keyword.Title = Regex.Replace(hit.Groups["content"].Value, "<[^>]*>", string.Empty);
            keyword.Link = hit.Groups["url"].Value;
            keywords.Add(keyword);
        }

        return keywords;
    }
}
忘了.还有个导出Excel,这个友友们应该都有自己的方法,我这里就简单写了一个excel导出.也贴出来吧.
/// <summary>
/// Asks the user for a target file and writes the grid's column headers and cell
/// values to it as tab-separated text encoded as gb2312 (saved with an .xls
/// extension).
/// </summary>
/// <param name="dataGridview1">Grid whose contents are exported.</param>
public void ExportDataGridViewToExcel(DataGridView dataGridview1)
{
    using (SaveFileDialog saveFileDialog = new SaveFileDialog())
    {
        saveFileDialog.Filter = "Execl files (*.xls)|*.xls";
        saveFileDialog.FilterIndex = 0;
        saveFileDialog.RestoreDirectory = true;
        saveFileDialog.CreatePrompt = true;
        saveFileDialog.Title = "导出Excel文件";

        // Default file name: timestamp in the form yyyyMMdd-HHmmss.
        saveFileDialog.FileName = DateTime.Now.ToString("yyyyMMdd-HHmmss", System.Globalization.CultureInfo.InvariantCulture);

        // Do nothing when the user cancels; otherwise OpenFile would still
        // create a file under the pre-filled default name.
        if (saveFileDialog.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        try
        {
            using (Stream myStream = saveFileDialog.OpenFile())
            using (StreamWriter sw = new StreamWriter(myStream, System.Text.Encoding.GetEncoding("gb2312")))
            {
                // Header row.
                for (int i = 0; i < dataGridview1.ColumnCount; i++)
                {
                    if (i > 0)
                    {
                        sw.Write("\t");
                    }
                    sw.Write(dataGridview1.Columns[i].HeaderText);
                }
                sw.WriteLine();

                // Data rows.
                foreach (DataGridViewRow row in dataGridview1.Rows)
                {
                    // The blank "new row" placeholder holds no data.
                    if (row.IsNewRow)
                    {
                        continue;
                    }

                    for (int k = 0; k < dataGridview1.Columns.Count; k++)
                    {
                        if (k > 0)
                        {
                            sw.Write("\t");
                        }
                        // Convert.ToString turns a null cell value into an empty string.
                        sw.Write(Convert.ToString(row.Cells[k].Value));
                    }
                    sw.WriteLine();
                }
            }

            MessageBox.Show("导出成功");
        }
        catch (Exception e)
        {
            MessageBox.Show(e.ToString());
        }
    }
}
我把 HttpState 类(Httpstatus.cs)也贴出来。有需要 demo 的可以发邮件给我,或者留下邮箱。
Httpstatus.cs
KeyWord.cs
鉴于大家都需要 demo,今天整理了一下发上来,并添加了导出 Word、导出 Excel 的功能。不过没找到怎么把文件路径放进来,有需要的可以发 email 给我。
先看下效果图,然后是代码(View Code):
/// <summary>
/// Click handler for the Baidu button: requests up to 100 Baidu results for the
/// keyword typed into <c>txtSearch</c> and binds the parsed hits to <c>dataGridView1</c>.
/// </summary>
/// <param name="sender">Control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void baidu_Click(object sender, EventArgs e)
{
    const int num = 100; // number of results requested (the "rn" query parameter)
    string word = txtSearch.Text.Trim();

    // Percent-encode the keyword so that characters such as '&', '#', '+' or
    // spaces cannot truncate or corrupt the query string.
    string url = "http://www.baidu.com/s?wd=" + Uri.EscapeDataString(word) + "&rn=" + num;

    string html = search(url, "gb2312");
    if (string.IsNullOrEmpty(html))
    {
        return;
    }

    BaiduSearch baidu = new BaiduSearch();

    // Only bind results when the page reports a non-zero total hit count.
    int count = baidu.GetSearchCount(html);
    if (count > 0)
    {
        List<Keyword> keywords = baidu.GetKeywords(html, word);
        dataGridView1.DataSource = keywords;
    }
}
/// <summary>
/// Click handler for the Google button: requests up to 100 Google results for the
/// keyword typed into <c>txtSearch</c> and binds the parsed hits to <c>dataGridView1</c>.
/// </summary>
/// <param name="sender">Control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void google_Click(object sender, EventArgs e)
{
    const int num = 100; // number of results requested (the "num" query parameter)
    string word = txtSearch.Text.Trim();

    // Percent-encode the keyword so that characters such as '&', '#', '+' or
    // spaces cannot truncate or corrupt the query string.
    string url = "http://www.google.com.hk/search?hl=zh-CN&source=hp&q=" + Uri.EscapeDataString(word) + "&aq=f&aqi=&aql=&oq=&num=" + num;

    string html = search(url, "utf-8");
    if (string.IsNullOrEmpty(html))
    {
        return;
    }

    googleSearch google = new googleSearch();
    List<Keyword> keywords = google.GetKeywords(html, word);
    dataGridView1.DataSource = keywords;
}
/// <summary>
/// Downloads a page with an HTTP GET request and returns its cleaned-up HTML.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <param name="Chareset">Name of the text encoding used to decode the response body, e.g. "gb2312" or "utf-8".</param>
/// <returns>
/// The response body as produced by <c>readResponseStream</c>, or an empty string
/// when the request or the read fails.
/// </returns>
public string search(string url, string Chareset)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.UseDefaultCredentials = true;
    request.ContentType = "text/html";
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.0; .NET CLR 1.1.4322; .NET CLR 2.0.50215;)";
    request.Method = "GET";
    request.CookieContainer = new CookieContainer();

    try
    {
        // Dispose the response so the underlying connection is released.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            return readResponseStream(response, Chareset);
        }
    }
    catch (Exception ex)
    {
        // Deliberately best-effort: callers test the result with IsNullOrEmpty,
        // so a failure is reported as "no HTML" rather than handing the
        // exception text back as if it were page content.
        System.Diagnostics.Trace.TraceError("search failed for {0}: {1}", url, ex);
        return string.Empty;
    }
}
/// <summary>
/// Reads the entire body of <paramref name="response"/> as text and passes it
/// through <c>formatHTML</c>.
/// </summary>
/// <param name="response">Response whose body stream is read to the end.</param>
/// <param name="Chareset">Name of the encoding used to decode the body.</param>
/// <returns>The decoded body after <c>formatHTML</c> has been applied.</returns>
public string readResponseStream(HttpWebResponse response, string Chareset)
{
    Stream body = response.GetResponseStream();
    Encoding bodyEncoding = Encoding.GetEncoding(Chareset);

    // Disposing the reader also closes the response stream it wraps.
    using (StreamReader reader = new StreamReader(body, bodyEncoding))
    {
        string raw = reader.ReadToEnd();
        return formatHTML(raw);
    }
}
/// <summary>
/// Normalizes downloaded HTML before it is matched with regular expressions:
/// removes a few decorative entities, drops carriage returns, tabs and line
/// feeds so the markup sits on one line, and decodes <c>&amp;amp;</c> to <c>&amp;</c>.
/// </summary>
/// <param name="htmlContent">Raw page source; may be null or empty.</param>
/// <returns>The normalized markup, or an empty string for null or empty input.</returns>
public string formatHTML(string htmlContent)
{
    if (string.IsNullOrEmpty(htmlContent))
    {
        return string.Empty;
    }

    // The entity forms are matched explicitly. Stripping a literal space would
    // also destroy the spaces inside tags (e.g. <h3 class="t">), and "/r", "/t",
    // "/n" would eat parts of URLs; the escape sequences are what is meant.
    return htmlContent
        .Replace("&raquo;", "").Replace("»", "")
        .Replace("&nbsp;", "")
        .Replace("&copy;", "").Replace("©", "")
        .Replace("\r", "")
        .Replace("\t", "")
        .Replace("\n", "")
        .Replace("&amp;", "&");
}
把百度和Google两个类抽取了出来
1.百度Search类
View Code
/// <summary>
/// Parses Baidu result pages: the total hit count reported by the page and the
/// title/link pairs of the individual results.
/// </summary>
class BaiduSearch
{
    // The next three fields are not read by any method of this class.
    protected string uri = "http://www.baidu.com/s?wd=";
    protected Encoding queryEncoding = Encoding.GetEncoding("gb2312");
    protected Encoding pageEncoding = Encoding.GetEncoding("gb2312");

    // Captures the digits and commas between "找到相关结果" (optionally followed by "约") and "个".
    protected string resultPattern = @"(?<=找到相关结果[约]?)[0-9,]*?(?=个)";

    /// <summary>
    /// Extracts the total number of hits reported on a result page.
    /// </summary>
    /// <param name="html">Page source; may be null or empty.</param>
    /// <returns>The reported count, or 0 when it is absent or cannot be parsed as an int.</returns>
    public int GetSearchCount(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return 0;
        }

        Match match = Regex.Match(html, resultPattern);
        if (!match.Success)
        {
            return 0;
        }

        // Drop thousands separators; TryParse leaves result at 0 on failure.
        int result;
        int.TryParse(match.Value.Replace(",", string.Empty), out result);
        return result;
    }

    /// <summary>
    /// Extracts every result heading (<c>&lt;h3 class="t"&gt;&lt;a ...&gt;</c>) from the page.
    /// </summary>
    /// <param name="html">Page source; may be null or empty.</param>
    /// <param name="word">Search keyword; not used by the extraction.</param>
    /// <returns>One <c>Keyword</c> per match, with IDs numbered from 1 in page order; empty when nothing matches.</returns>
    public List<Keyword> GetKeywords(string html, string word)
    {
        List<Keyword> keywords = new List<Keyword>();
        if (string.IsNullOrEmpty(html))
        {
            return keywords;
        }

        const string pattern = "<h3 class=\"t\"><a.*?href=\"(?<url>.*?)\".*?>(?<content>.*?)</a>";
        int id = 1;
        foreach (Match hit in Regex.Matches(html, pattern))
        {
            Keyword keyword = new Keyword();
            keyword.ID = id++;
            // Strip any markup nested inside the anchor text.
            keyword.Title = Regex.Replace(hit.Groups["content"].Value, "<[^>]*>", string.Empty);
            keyword.Link = hit.Groups["url"].Value;
            keywords.Add(keyword);
        }

        return keywords;
    }
}
2 .GoogleSearch类
View Code
/// <summary>
/// Parses Google result pages into title/link pairs.
/// </summary>
class googleSearch
{
    /// <summary>
    /// Extracts every result heading (<c>&lt;h3 class="r"&gt;&lt;a ...&gt;</c>, matched
    /// case-insensitively) from the page.
    /// </summary>
    /// <param name="html">Page source; may be null or empty.</param>
    /// <param name="word">Search keyword; not used by the extraction.</param>
    /// <returns>One <c>Keyword</c> per match, with IDs numbered from 1 in page order; empty when nothing matches.</returns>
    public List<Keyword> GetKeywords(string html, string word)
    {
        List<Keyword> keywords = new List<Keyword>();
        if (string.IsNullOrEmpty(html))
        {
            return keywords;
        }

        Regex regTable = new Regex("<h3 class=\"r\"><a.*?href=\"(?<url>.*?)\".*?>(?<content>.*?)</a>", RegexOptions.IgnoreCase);
        int id = 1;
        foreach (Match hit in regTable.Matches(html))
        {
            Keyword keyword = new Keyword();
            keyword.ID = id++;
            // Strip any markup nested inside the anchor text.
            keyword.Title = Regex.Replace(hit.Groups["content"].Value, "<[^>]*>", string.Empty);
            keyword.Link = hit.Groups["url"].Value;
            keywords.Add(keyword);
        }

        return keywords;
    }
}
忘了.还有个导出Excel,这个友友们应该都有自己的方法,我这里就简单写了一个excel导出.也贴出来吧.
/// <summary>
/// Asks the user for a target file and writes the grid's column headers and cell
/// values to it as tab-separated text encoded as gb2312 (saved with an .xls
/// extension).
/// </summary>
/// <param name="dataGridview1">Grid whose contents are exported.</param>
public void ExportDataGridViewToExcel(DataGridView dataGridview1)
{
    using (SaveFileDialog saveFileDialog = new SaveFileDialog())
    {
        saveFileDialog.Filter = "Execl files (*.xls)|*.xls";
        saveFileDialog.FilterIndex = 0;
        saveFileDialog.RestoreDirectory = true;
        saveFileDialog.CreatePrompt = true;
        saveFileDialog.Title = "导出Excel文件";

        // Default file name: timestamp in the form yyyyMMdd-HHmmss.
        saveFileDialog.FileName = DateTime.Now.ToString("yyyyMMdd-HHmmss", System.Globalization.CultureInfo.InvariantCulture);

        // Do nothing when the user cancels; otherwise OpenFile would still
        // create a file under the pre-filled default name.
        if (saveFileDialog.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        try
        {
            using (Stream myStream = saveFileDialog.OpenFile())
            using (StreamWriter sw = new StreamWriter(myStream, System.Text.Encoding.GetEncoding("gb2312")))
            {
                // Header row.
                for (int i = 0; i < dataGridview1.ColumnCount; i++)
                {
                    if (i > 0)
                    {
                        sw.Write("\t");
                    }
                    sw.Write(dataGridview1.Columns[i].HeaderText);
                }
                sw.WriteLine();

                // Data rows.
                foreach (DataGridViewRow row in dataGridview1.Rows)
                {
                    // The blank "new row" placeholder holds no data.
                    if (row.IsNewRow)
                    {
                        continue;
                    }

                    for (int k = 0; k < dataGridview1.Columns.Count; k++)
                    {
                        if (k > 0)
                        {
                            sw.Write("\t");
                        }
                        // Convert.ToString turns a null cell value into an empty string.
                        sw.Write(Convert.ToString(row.Cells[k].Value));
                    }
                    sw.WriteLine();
                }
            }

            MessageBox.Show("导出成功");
        }
        catch (Exception e)
        {
            MessageBox.Show(e.ToString());
        }
    }
}
我把 HttpState 类(Httpstatus.cs)也贴出来。有需要 demo 的可以发邮件给我,或者留下邮箱。
Httpstatus.cs
/// <summary>
/// Mutable holder for the state of one HTTP exchange: URLs, cookies, page source,
/// CAPTCHA details and a bag of extra parameters.
/// </summary>
class HttpState
{
    /// <summary>
    /// Creates an instance with an empty cookie container, an empty parameter
    /// table and the default CAPTCHA file name "emptyPic.gif".
    /// </summary>
    public HttpState()
    {
        CookieContainer = new CookieContainer();
        TmpValCodeFileName = "emptyPic.gif";
        OtherParams = new Hashtable();
    }

    public string StatusDescription { get; set; }

    /// <summary>
    /// Callback URL, used in login tests.
    /// </summary>
    public string CallBackUrl { get; set; }

    /// <summary>
    /// Page URL, in absolute form.
    /// </summary>
    public string Url { get; set; }

    /// <summary>
    /// Cookie information as a string.
    /// </summary>
    public string Cookies { get; set; }

    /// <summary>
    /// Cookie information.
    /// </summary>
    public CookieContainer CookieContainer { get; set; }

    /// <summary>
    /// Page HTML source.
    /// </summary>
    public string Html { get; set; }

    /// <summary>
    /// Temporary CAPTCHA image file (absolute path).
    /// </summary>
    public string TmpValCodePic { get; set; }

    /// <summary>
    /// Temporary CAPTCHA image file name (relative path).
    /// </summary>
    public string TmpValCodeFileName { get; set; }

    /// <summary>
    /// Whether a CAPTCHA is present.
    /// </summary>
    public bool IsValCode { get; set; }

    /// <summary>
    /// CAPTCHA URL.
    /// </summary>
    public string ValCodeURL { get; set; }

    /// <summary>
    /// Value of the CAPTCHA after recognition.
    /// </summary>
    public string ValCodeValue { get; set; }

    /// <summary>
    /// Other parameters.
    /// </summary>
    public Hashtable OtherParams { get; set; }

    /// <summary>
    /// Stores <paramref name="value"/> under <paramref name="key"/>, replacing any
    /// value already stored for that key (handles repeated adds; added by fengcj, 09/11/19 PM).
    /// </summary>
    public void addOtherParam(object key, object value)
    {
        // The Hashtable indexer adds a missing key and overwrites an existing
        // one, so no separate ContainsKey lookup is needed.
        this.OtherParams[key] = value;
    }

    /// <summary>
    /// Removes the entry stored under <paramref name="key"/>, if any.
    /// </summary>
    public void removeOtherParam(object key)
    {
        this.OtherParams.Remove(key);
    }

    /// <summary>
    /// Returns the value stored under <paramref name="key"/>, or null when the key is absent.
    /// </summary>
    public object getOtherParam(object key)
    {
        return this.OtherParams[key];
    }
}
KeyWord.cs
/// <summary>
/// One search hit: a sequence number, the result title and the result link.
/// </summary>
class Keyword
{
    /// <summary>
    /// Sequence number of the hit.
    /// </summary>
    public int ID { get; set; }

    /// <summary>
    /// Title text of the hit.
    /// </summary>
    public string Title { get; set; }

    /// <summary>
    /// Link (URL) of the hit.
    /// </summary>
    public string Link { get; set; }
}
鉴于大家都需要 demo,今天整理了一下发上来,并添加了导出 Word、导出 Excel 的功能。不过没找到怎么把文件路径放进来,有需要的可以发 email 给我。
相关文章推荐
- PHP抓取百度搜索结果对应的第一个百度快照的链接
- python,抓取百度搜索结果
- 百度与google的搜索结果比较:汇总 翻译
- PHP多进程抓取百度搜索结果
- C#抓取百度和谷歌的搜索结果(标题和链接) 代码整理
- 抓取 google 搜索结果
- 如何抓取谷歌,百度里面特定的搜索结果
- 绕GOOGLE防抓取搜索结果的方法
- 百度搜索结果页面的参数 关键词(wd|word|kw|keyword)
- java正则表达式的使用-抓取百度搜索的结果
- 抓取百度搜索结果
- 搜索“phpcms 显示当前栏目名称”GOOGLE和百度结果让我又一次震惊了。。。
- 关于python抓取google搜索结果的若干问题
- 百度搜索中关键词顺序对搜索结果排序的影响
- 分别使用Python和Java抓取百度搜索结果
- 关于python抓取google搜索结果的若干问题
- python 抓取google搜索结果
- 使用python抓取百度搜索、百度新闻搜索的关键词个数
- 使用HtmlUnit抓取百度搜索结果
- 抓取百度搜索结果