您的位置:首页 > Web前端 > HTML

关于使用HtmlAgilityPack

2011-10-27 16:49 197 查看
请直接看代码:

/// <summary>
/// Downloads the page at <paramref name="url"/>, decodes it as GB2312 and
/// returns the root node of the parsed document.
/// </summary>
/// <param name="url">Absolute address of the page to download.</param>
/// <returns>The root node of the parsed document, or <c>null</c> if any step fails.</returns>
public static HtmlAgilityPack.HtmlNode GetHtmlNodeFromLink(string url)
{
    return GetHtmlNodeFromLink(url, "gb2312");
}

/// <summary>
/// Downloads the page at <paramref name="url"/>, decodes it with the named
/// encoding and returns the root node of the parsed document.
/// </summary>
/// <param name="url">Absolute address of the page to download.</param>
/// <param name="encodingName">Name passed to <c>Encoding.GetEncoding</c>, e.g. "gb2312" or "UTF-8".</param>
/// <returns>The root node of the parsed document, or <c>null</c> if any step fails.</returns>
public static HtmlAgilityPack.HtmlNode GetHtmlNodeFromLink(string url, string encodingName)
{
    try
    {
        Uri uri = new Uri(url);
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);

        string markup;
        // Dispose the response, stream and reader deterministically so the
        // underlying connection is released even when reading throws.
        using (WebResponse response = request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, Encoding.GetEncoding(encodingName)))
        {
            markup = reader.ReadToEnd();
        }

        HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
        html.LoadHtml(markup);
        return html.DocumentNode;
    }
    catch (Exception)
    {
        // Best-effort contract kept from the original: any failure (bad URL,
        // network error, unknown encoding, ...) is reported to the caller as null.
        return null;
    }
}

/// <summary>
/// Reads link, image and title data from every node selected by
/// <paramref name="xPath"/> below <paramref name="htmlNode"/>.
/// </summary>
/// <param name="htmlNode">Node to search from; <c>null</c> makes the method return <c>false</c>.</param>
/// <param name="xPath">XPath expression selecting the nodes to read.</param>
/// <param name="imgs">
/// Receives one image value per node: the text captured from
/// <c>background-image:url(...)</c> in the <c>style</c> attribute when that
/// attribute exists, otherwise the value of the <c>src2</c> attribute.
/// </param>
/// <param name="links">Receives the <c>href</c> attribute value of each node.</param>
/// <param name="title">
/// Receives the <c>title</c> attribute value of each node, or the <c>alt</c>
/// attribute value when <c>title</c> is missing or empty.
/// </param>
/// <returns>
/// <c>true</c> when at least one node matched and every node supplied all three
/// values; otherwise <c>false</c>. The arrays are left untouched when nothing
/// matched, and may be only partly filled when a later node is incomplete.
/// </returns>
public static bool GetGalleryInfo(HtmlAgilityPack.HtmlNode htmlNode,string xPath,ref string[] imgs, ref string[] links,ref string[] title)
{
    if (htmlNode == null)
        return false;

    try
    {
        // SelectNodes can return null (not an empty collection) when nothing matches.
        HtmlNodeCollection hnc = htmlNode.SelectNodes(xPath);//"//div[@class='slideBannerA homeSlideAD1']"
        if (hnc == null || hnc.Count < 1)
            return false;

        links = new string[hnc.Count];
        title = new string[hnc.Count];
        imgs = new string[hnc.Count];

        // Lazy quantifier: stop at the first ')' so a later "...(...)" in the
        // same style attribute is not swallowed into the captured URL.
        string cateDataRegex = @"background-image:url\((?<image>.+?)\)";
        Regex re = new Regex(cateDataRegex, RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);

        int i = 0;
        foreach (HtmlNode node in hnc)
        {
            HtmlAttributeCollection hac = node.Attributes;
            HtmlAgilityPack.HtmlAttribute hrefAttr = hac["href"];
            HtmlAgilityPack.HtmlAttribute styleAttr = hac["style"];
            HtmlAgilityPack.HtmlAttribute src2Attr = hac["src2"];
            HtmlAgilityPack.HtmlAttribute titleAttr = hac["title"];
            HtmlAgilityPack.HtmlAttribute altAttr = hac["alt"];

            // The attribute indexer yields null for a missing attribute, so
            // check explicitly instead of relying on a swallowed exception.
            if (hrefAttr == null)
                return false;
            if (styleAttr == null && src2Attr == null)
                return false;

            bool hasTitle = titleAttr != null && !string.IsNullOrEmpty(titleAttr.Value);
            if (!hasTitle && altAttr == null)
                return false;

            links[i] = hrefAttr.Value;
            imgs[i] = styleAttr == null
                ? src2Attr.Value
                : re.Match(styleAttr.Value).Groups["image"].Value;
            title[i] = hasTitle ? titleAttr.Value : altAttr.Value;
            i++;
        }
        return true;
    }
    catch (Exception)
    {
        // Kept from the original contract: any other failure (for example an
        // invalid XPath expression) is reported as false rather than thrown.
        return false;
    }
}

// Usage
// GetGalleryInfo declares its array parameters as ref, so the arguments must be
// passed with ref and be definitely assigned before the call.
string[] strLink = null;
string[] strLinAlt = null;
string[] strImg = null;
string urls = "http://www.newegg.com.cn";
HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink(urls);
// found is false when the page could not be fetched, nothing matched, or a node
// lacked a required attribute; the arrays may then be null or partly filled.
bool found = GetGalleryInfo(nodes, "//div[@class='slideBannerA homeSlideAD1']/div[1]/div[1]/a", ref strImg, ref strLink, ref strLinAlt);

淘宝今日活动:

/// <summary>
/// Reads the link and image address of each node selected by
/// <paramref name="xPath"/> (written for Taobao's "today's promotions" list).
/// For every selected node, the child at index 1 supplies the <c>href</c> and
/// that child's first child supplies the <c>src</c>.
/// </summary>
/// <param name="htmlNode">Node to search from; <c>null</c> makes the method return <c>false</c>.</param>
/// <param name="xPath">XPath expression selecting the list items.</param>
/// <param name="imgs">Receives the image addresses; empty when nothing matched.</param>
/// <param name="links">Receives the link addresses; empty when nothing matched.</param>
/// <returns>
/// <c>true</c> when at least one node matched and every node could be read;
/// otherwise <c>false</c> (the arrays may then be empty or partly filled).
/// </returns>
/// <example>
/// <code>
/// string[] strLink;
/// string[] strImg;
/// HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://www.taobao.com");
/// GetTaobaoGalleryInfo(nodes, "//div[@class='sub-promotion-content']/div[@class='ks-switchable-content zoom']/ul/li", out strImg, out strLink);
/// </code>
/// </example>
public static bool GetTaobaoGalleryInfo(HtmlAgilityPack.HtmlNode htmlNode, string xPath, out string[] imgs, out string[] links)
{
    // out parameters must be assigned on every path, including early exits.
    links = new string[0];
    imgs = new string[0];

    if (htmlNode == null)
        return false;

    try
    {
        // SelectNodes can return null (not an empty collection) when nothing
        // matches, so it has to be checked before Count is read.
        HtmlNodeCollection hnc = htmlNode.SelectNodes(xPath);
        if (hnc == null || hnc.Count < 1)
            return false;

        links = new string[hnc.Count];
        imgs = new string[hnc.Count];

        int i = 0;
        foreach (HtmlNode node in hnc)
        {
            HtmlNode anchor = node.ChildNodes[1];
            links[i] = anchor.Attributes["href"].Value;
            imgs[i++] = anchor.ChildNodes[0].Attributes["src"].Value;
        }
        return true;
    }
    catch (Exception)
    {
        // A node that does not have the expected children/attributes ends the
        // scan; entries read so far stay in the arrays.
        return false;
    }
}

// Today's featured deal (the "shellShocker" banner on newegg.com.cn)
HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://www.newegg.com.cn/");

// GetHtmlNodeFromLink returns null on failure and SelectSingleNode returns null
// when nothing matches, so both results are checked before use.
HtmlAgilityPack.HtmlNode node = nodes == null
    ? null
    : nodes.SelectSingleNode("//div[@class='colSub']/div[@class='picBanner shellShocker ']/a");

string strImg = null; // despite its name, this receives the anchor's href
string strSrc = null; // src of the anchor's first child
if (node != null)
{
    HtmlAgilityPack.HtmlAttribute hrefAttr = node.Attributes["href"];
    if (hrefAttr != null)
        strImg = hrefAttr.Value;

    if (node.ChildNodes.Count > 0)
    {
        HtmlAgilityPack.HtmlAttribute srcAttr = node.ChildNodes[0].Attributes["src"];
        if (srcAttr != null)
            strSrc = srcAttr.Value;
    }
}

// Taobao category promotions: link and text of every "category-pop" anchor
HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://www.taobao.com");
// GetHtmlNodeFromLink returns null on failure and SelectNodes can return null
// when nothing matches, so neither result is dereferenced unchecked.
HtmlAgilityPack.HtmlNodeCollection node = nodes == null
    ? null
    : nodes.SelectNodes("//span[@class='category-pop']/a");

int count = node == null ? 0 : node.Count;
string[] strLink = new string[count];
string[] strText = new string[count];

if (node != null)
{
    int i = 0;
    foreach (HtmlNode htmlNode in node)
    {
        // A missing href leaves a null link for this entry instead of
        // aborting the whole loop through a swallowed exception.
        HtmlAgilityPack.HtmlAttribute hrefAttr = htmlNode.Attributes["href"];
        strLink[i] = hrefAttr == null ? null : hrefAttr.Value;
        strText[i++] = htmlNode.InnerText;
    }
}

// Taobao apparel channel (fushi.taobao.com): new product recommendations
HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://fushi.taobao.com");
// GetHtmlNodeFromLink returns null on failure and SelectNodes can return null
// when nothing matches, so neither result is dereferenced unchecked.
HtmlAgilityPack.HtmlNodeCollection node = nodes == null
    ? null
    : nodes.SelectNodes("//div[@class='new-product-image-list']/ul[@class='image-list']/li");

int count = node == null ? 0 : node.Count;
string[] strLink = new string[count];
string[] strImg = new string[count];
string[] strAlt = new string[count];

if (node != null)
{
    int i = 0;
    foreach (HtmlNode htmlNode in node)
    {
        try
        {
            // Read everything into locals first so an item that fails half-way
            // never leaves a partly written slot in the result arrays.
            HtmlNode anchor = htmlNode.ChildNodes[0];
            string link = anchor.Attributes["href"].Value;
            string alt = anchor.ChildNodes[1].InnerHtml;
            string img = anchor.ChildNodes[0].Attributes["src"].Value;

            strLink[i] = link;
            strAlt[i] = alt;
            strImg[i] = img;
            i++;
        }
        catch (Exception)
        {
            // Best-effort scrape: an item without the expected child nodes or
            // attributes is skipped, so trailing array slots may remain null.
        }
    }
}

// Knitwear recommendations (rihan.vancl.com product list)
HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://rihan.vancl.com/","UTF-8");
// GetHtmlNodeFromLink returns null on failure and SelectNodes can return null
// when nothing matches, so neither result is dereferenced unchecked.
HtmlAgilityPack.HtmlNodeCollection node = nodes == null
    ? null
    : nodes.SelectNodes("//div[@class='prod_area']/ul/li");

int count = node == null ? 0 : node.Count;
string[] strLink = new string[count];
string[] strImg = new string[count];
string[] strAlt = new string[count];
string[] strPrice = new string[count];
string[] strCurrentPrice = new string[count];

if (node != null)
{
    int i = 0;
    foreach (HtmlNode htmlNode in node)
    {
        try
        {
            // Read everything into locals first so an item that fails half-way
            // never leaves a partly written slot in the result arrays.
            string link = htmlNode.ChildNodes[0].Attributes["href"].Value;
            string alt = htmlNode.ChildNodes[4].ChildNodes[1].InnerHtml.Trim();
            string img = htmlNode.ChildNodes[0].ChildNodes[1].Attributes["src"].Value;
            string price = htmlNode.ChildNodes[6].ChildNodes[1].ChildNodes[1].InnerHtml.Trim().Replace("¥", "");
            string currentPrice = htmlNode.ChildNodes[6].ChildNodes[2].InnerHtml.Trim().Replace("售价¥", "");

            strLink[i] = link;
            strAlt[i] = alt;
            strImg[i] = img;
            strPrice[i] = price;
            strCurrentPrice[i] = currentPrice;
            i++;
        }
        catch (Exception)
        {
            // Best-effort scrape: an item without the expected child nodes or
            // attributes is skipped, so trailing array slots may remain null.
        }
    }
}

// Scrapes the product list on http://www.masamaso.com/ into local arrays
// (link, title, image, current price). The results are not used further here.
private void button8_Click(object sender, EventArgs e)
{
    HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://www.masamaso.com/", "UTF-8");
    if (nodes == null)
        return; // page could not be downloaded or parsed

    // SelectNodes can return null (not an empty collection) when nothing matches.
    HtmlAgilityPack.HtmlNodeCollection node = nodes.SelectNodes("//ul/li/div[@class='goods_case']");
    if (node == null)
        return;

    string[] strLink = new string[node.Count];
    string[] strImg = new string[node.Count];
    string[] strAlt = new string[node.Count];
    string[] strCurrentPrice = new string[node.Count];

    int i = 0;
    foreach (HtmlNode htmlNode in node)
    {
        try
        {
            // Read everything into locals first so an item that fails half-way
            // never leaves a partly written slot in the result arrays.
            HtmlNode anchor = htmlNode.ChildNodes[1].ChildNodes[0];
            string link = "http://www.masamaso.com/" + anchor.Attributes["href"].Value;
            string alt = anchor.Attributes["title"].Value;
            string img = anchor.ChildNodes[0].Attributes["src"].Value;
            string currentPrice = htmlNode.ChildNodes[3].ChildNodes[1].ChildNodes[1].ChildNodes[0].InnerHtml.Trim().Replace("¥", "");

            strLink[i] = link;
            strAlt[i] = alt;
            strImg[i] = img;
            strCurrentPrice[i] = currentPrice;
            i++;
        }
        catch (Exception)
        {
            // Best-effort scrape: an item without the expected child nodes or
            // attributes is skipped, so trailing array slots may remain null.
        }
    }
}

// Scrapes the pop-up advertisement blocks on http://www.masamaso.com/ into
// local arrays (link, image), then runs Func().
private void button9_Click(object sender, EventArgs e)
{
    HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://www.masamaso.com/", "UTF-8");

    // GetHtmlNodeFromLink returns null on failure and SelectNodes can return
    // null when nothing matches, so neither result is dereferenced unchecked.
    HtmlAgilityPack.HtmlNodeCollection node = nodes == null
        ? null
        : nodes.SelectNodes("//div[@class='foot_img tabContainer']/div[@class='tabBox']/div[@class='hd_tp']");

    if (node != null)
    {
        string[] strLink = new string[node.Count];
        string[] strImg = new string[node.Count];

        int i = 0;
        foreach (HtmlNode htmlNode in node)
        {
            try
            {
                // Read both values before storing either, so a failing item
                // never leaves a partly written slot in the result arrays.
                HtmlNode anchor = htmlNode.ChildNodes[0];
                string link = anchor.Attributes["href"].Value;
                string img = anchor.ChildNodes[0].Attributes["src"].Value;

                strLink[i] = link;
                strImg[i] = img;
                i++;
            }
            catch (Exception)
            {
                // Best-effort scrape: an item without the expected child nodes
                // or attributes is skipped.
            }
        }
    }

    // Func() does not use anything computed above, so it runs even when the
    // scrape above found nothing.
    Func();
}

// Scrapes the pop-up advertisement blocks on http://www.vivian.cn/ into local
// arrays (link, image). The results are not used further here.
private void Func()
{
    HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://www.vivian.cn/", "UTF-8");
    if (nodes == null)
        return; // page could not be downloaded or parsed

    // SelectNodes can return null (not an empty collection) when nothing matches.
    HtmlAgilityPack.HtmlNodeCollection node = nodes.SelectNodes("//div[@class='foot_img tabContainer']/div[@class='tabBox']/div[@class='hd_tp']");
    if (node == null)
        return;

    string[] strLink = new string[node.Count];
    string[] strImg = new string[node.Count];

    int i = 0;
    foreach (HtmlNode htmlNode in node)
    {
        try
        {
            // Read both values before storing either, so a failing item never
            // leaves a partly written slot in the result arrays.
            HtmlNode anchor = htmlNode.ChildNodes[0];
            string link = anchor.Attributes["href"].Value;
            string img = anchor.ChildNodes[0].Attributes["src"].Value;

            strLink[i] = link;
            strImg[i] = img;
            i++;
        }
        catch (Exception)
        {
            // Best-effort scrape: an item without the expected child nodes or
            // attributes is skipped, so trailing array slots may remain null.
        }
    }
}

// Scrapes the product list on http://www.vivian.cn/ into local arrays
// (link, title, image, current price). The results are not used further here.
private void button10_Click(object sender, EventArgs e)
{
    HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://www.vivian.cn/", "UTF-8");
    if (nodes == null)
        return; // page could not be downloaded or parsed

    // SelectNodes can return null (not an empty collection) when nothing matches.
    HtmlAgilityPack.HtmlNodeCollection node = nodes.SelectNodes("//div[@class='goods_list']/ul/li");
    if (node == null)
        return;

    string[] strLink = new string[node.Count];
    string[] strImg = new string[node.Count];
    string[] strAlt = new string[node.Count];
    string[] strCurrentPrice = new string[node.Count];

    int i = 0;
    foreach (HtmlNode htmlNode in node)
    {
        try
        {
            // Read everything into locals first so an item that fails half-way
            // never leaves a partly written slot in the result arrays.
            HtmlNode anchor = htmlNode.ChildNodes[1].ChildNodes[1].ChildNodes[0];
            string link = "http://www.vivian.cn/" + anchor.Attributes["href"].Value;
            string alt = anchor.Attributes["title"].Value;
            string img = anchor.ChildNodes[0].Attributes["src"].Value;
            string currentPrice = htmlNode.ChildNodes[1].ChildNodes[3].ChildNodes[3].ChildNodes[0].ChildNodes[1].InnerHtml.Trim().Replace("¥", "");

            strLink[i] = link;
            strAlt[i] = alt;
            strImg[i] = img;
            strCurrentPrice[i] = currentPrice;
            i++;
        }
        catch (Exception)
        {
            // Best-effort scrape: an item without the expected child nodes or
            // attributes is skipped, so trailing array slots may remain null.
        }
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: