您的位置:首页 > 其它

在程序中获取百度的搜索结果

2011-11-10 17:23 295 查看
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.Net;
using System.IO;
namespace baiduRobot
{
// One parsed search hit. Every field stays null until the parser fills it in.
struct BaiduEntry
{
    // Text of the result heading, with markup stripped by the parser.
    public string title;

    // Summary snippet shown below the heading.
    public string brief;

    // Target taken from the href attribute of the result's anchor.
    public string link;
}
class Program
{
// Downloads the Baidu result page for the given keyword and returns it as text.
//
// keyword: the search term as typed by the user; it is percent-encoded with
//          code page 936 before being placed in the query string.
// returns: the decoded page text. If the request or the read fails, a message is
//          printed and whatever was read so far (possibly an empty string) is returned.
static string GetHtml(string keyword)
{
    string url = @"http://www.baidu.com/";

    // Original author's note: Baidu takes its query string in code page 936 rather
    // than UTF-8, whereas Google accepts both and declares UTF-8 in its HTTP headers.
    Encoding gbk = Encoding.GetEncoding(936);
    string encodedKeyword = HttpUtility.UrlEncode(keyword, gbk);

    // Fix: the encoded keyword is what must go on the wire. The previous code built
    // the query from the raw keyword, leaving the encoding step without effect.
    string query = "s?wd=" + encodedKeyword;

    StringBuilder sb = new StringBuilder();
    try
    {
        // Fix: the request itself now lives inside the try block, so a failed
        // connection reaches the handler below instead of escaping to the caller.
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url + query);

        // Fix: response, stream and reader are disposed even when reading fails.
        using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, gbk, false))
        {
            Console.WriteLine("正在读取网页{0}的内容……", url + query);

            // Fix: decoding through a StreamReader keeps decoder state between reads.
            // Decoding each byte chunk on its own garbled any double-byte character
            // that happened to straddle two reads.
            char[] buf = new char[8192];
            int count;
            while ((count = reader.Read(buf, 0, buf.Length)) > 0)
            {
                sb.Append(buf, 0, count);
            }
        }
    }
    catch (WebException)
    {
        Console.WriteLine("网络连接失败,请检查网络设置。");
    }
    catch (IOException)
    {
        Console.WriteLine("网络连接失败,请检查网络设置。");
    }

    return sb.ToString();
}
// Writes every parsed entry to the console, numbering them from 1.
// A field that is still null is skipped; a separator line follows each entry.
static void PrintResult(List<BaiduEntry> entries)
{
    int index = 0;
    foreach (BaiduEntry item in entries)
    {
        index++;
        Console.WriteLine("找到了百度的第{0}条搜索结果:", index);

        if (item.link != null)
        {
            Console.WriteLine("找到了一条链接:");
            Console.WriteLine(item.link);
        }

        if (item.title != null)
        {
            Console.WriteLine("标题为:");
            Console.WriteLine(item.title);
        }

        if (item.brief != null)
        {
            Console.WriteLine("下面是摘要:");
            Console.WriteLine(item.brief);
        }

        Cut();
    }
}
// Small manual check: strips the known tags from a fixed HTML sample and prints the result.
static void simpleOutput()
{
    const string sample = "<table><tr><td><font>test</font><a>hello</a><br></td></tr></table>";
    string stripped = RemoveSomeTags(sample);
    Console.WriteLine(stripped);
}
// Removes every literal "<br>" from the given markup (exact, case-sensitive match).
static string RemoveVoidTag(string html)
{
    // "<br>" is the only void tag filtered out, so a single replace suffices.
    return html.Replace("<br>", "");
}
// Strips the <a>, <em>, <b> and <font> tags (opening and closing) from the markup
// while keeping the text between them.
//
// html:    markup to clean; must not be null.
// returns: the markup with those tags removed.
static string ReleaseXmlTags(string html)
{
    // Fix: the opening <a ...> and <font ...> patterns are anchored on the tag name.
    // The previous "<a.*?>" also swallowed the opening tag of <abbr>, <area> and the
    // like, leaving an orphaned closing tag behind. "[^>]*" additionally lets an
    // attribute list wrap onto another line.
    string[] filter =
    {
        "<a\\b[^>]*>", "</a>",
        "<em>", "</em>",
        "<b>", "</b>",
        "<font\\b[^>]*>", "</font>"
    };

    foreach (string tag in filter)
    {
        html = Regex.Replace(html, tag, "");
    }

    return html;
}

// Cleans a fragment in two passes: void tags are dropped first, then the inline tags.
static string RemoveSomeTags(string html)
{
    return ReleaseXmlTags(RemoveVoidTag(html));
}
// Prints a line of tildes that visually separates blocks of console output.
static void Cut()
{
    const string separator = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~";
    Console.WriteLine(separator);
}
// Convenience overload: parses and prints the results with HTML tags stripped from the brief.
static void MainProc(string input)
{
    bool keepTagsInBrief = false;
    MainProc(input, keepTagsInBrief);
}
// Parses the result markup and prints every entry found.
//
// The input is split into <table>...</table> fragments; each <td> cell of a fragment
// is mined for a link (first href), a title (first <font> element) and a brief
// (second <font> element). The collected entries are handed to PrintResult.
//
// input:        HTML fragment holding the result tables; must not be null.
// tagsForBrief: true keeps the inline HTML tags in the brief, false strips them
//               with RemoveSomeTags.
static void MainProc(string input, bool tagsForBrief)
{
    // Fix: the previous pattern "<table*</table>" applied '*' to the letter 'e' only,
    // so it could never match a real table and no result was ever produced.
    // "[\s\S]*?" lazily spans any characters, line breaks included, up to the
    // nearest closing tag.
    Regex tableRegex = new Regex("<table[\\s\\S]*?</table>", RegexOptions.IgnoreCase);

    // The patterns below are unchanged; they are built once here instead of once per cell.
    // NOTE(review): "(.*)" is greedy and '.' stops at line breaks, so several cells on
    // one line are captured as a single cell - confirm against the live page layout.
    Regex cellRegex = new Regex("<td.*?>(.*)</td>", RegexOptions.IgnoreCase);
    Regex linkRegex = new Regex("<a.*?href=\"(.*?)\".*?>", RegexOptions.IgnoreCase);
    // Greedy: grabs everything from the first <font to the last </font> in the cell,
    // i.e. the outer wrapper around the nested font elements.
    Regex outerFontRegex = new Regex("<font.*</font>");
    Regex innerFontRegex = new Regex("<font.*?>(.*?)</font>");

    List<BaiduEntry> entries = new List<BaiduEntry>();

    for (Match table = tableRegex.Match(input); table.Success; table = table.NextMatch())
    {
        for (Match cell = cellRegex.Match(table.Value); cell.Success; cell = cell.NextMatch())
        {
            // Group 1 is the cell's inner HTML, where the actual search result lives.
            string html = cell.Groups[1].Value;
            BaiduEntry baidu = new BaiduEntry();

            Match linkMatch = linkRegex.Match(html);
            if (linkMatch.Success)
            {
                baidu.link = linkMatch.Groups[1].Value;
            }

            // Narrow the cell down to its font block; empty when the cell has none.
            html = outerFontRegex.Match(html).Value;

            Match contentMatch = innerFontRegex.Match(html);
            if (contentMatch.Success)
            {
                baidu.title = RemoveSomeTags(contentMatch.Groups[1].Value);

                contentMatch = contentMatch.NextMatch();
                if (contentMatch.Success)
                {
                    string brief = contentMatch.Groups[1].Value;

                    // Drop anything from a nested font element onwards.
                    int splitIndex = brief.IndexOf("<font", StringComparison.Ordinal);
                    if (splitIndex > -1)
                    {
                        brief = brief.Substring(0, splitIndex);
                    }

                    // Strip the markup unless the caller asked for an HTML-formatted brief.
                    if (!tagsForBrief)
                    {
                        brief = RemoveSomeTags(brief);
                    }

                    baidu.brief = brief;
                }
            }
            else
            {
                // A cell without any font block is not a result; skip it silently.
                if (html == "")
                {
                    continue;
                }

                Console.WriteLine("怪了,这里没有找到任何结果。");
                Console.WriteLine("如果百度已经更改了页面的结构那么程序需要重新设计。");
                Console.WriteLine("Mark:");
                Console.WriteLine(html);
                Cut();
                Cut();
                Cut();
            }

            entries.Add(baidu);
        }
    }

    PrintResult(entries);
}
// Entry point: asks for a keyword, downloads the result page, cuts out the span from
// the first table marked class="result" to the last closing table tag, and prints the
// parsed entries. Waits for a key press before exiting.
public static void Main(string[] args)
{
    Console.WriteLine("请输入一个关键字。");
    string keyword = Console.ReadLine();

    Console.WriteLine("正在从百度上获取结果,请稍等……");
    string page = GetHtml(keyword);

    Regex resultSection = new Regex("<table.*class=\"result\"[\\s\\S]*</table>", RegexOptions.IgnoreCase);
    string resultMarkup = resultSection.Match(page).Value;
    MainProc(resultMarkup);

    Console.ReadKey(true);
}
}
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: