您的位置:首页 > Web前端 > HTML

HtmlAgilityPack抓取搜房网数据简单示例

2015-12-07 09:32 671 查看
HtmlAgilityPack是一个开源的解析HTML元素的类库,最大的特点是可以通过XPath来解析HMTL,如果您以前用C#操作过XML,那么使用起HtmlAgilityPack也会得心应手。目前最新版本为1.4.6。

程序示例如下:



代码如下:

using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace WinformHtmlAgilityPack
{
public partial class Form1 : Form
{

public Form1()
{
InitializeComponent();
}
private void Form1_Load(object sender, EventArgs e)
{
Thread thread = new Thread(DownloadData);
thread.Start();
}
private void DownloadData()
{
GZipWebClient c = new GZipWebClient();
List<HouseModel> houseList = new List<HouseModel>();
int pageCount = 50;
int index = 1;
for (int m = 1; m <= pageCount; m++)
{
byte[] data = c.DownloadData(new Uri("http://newhouse.fang.com/house/s/b9" + m + "/"));
string strx = Encoding.Default.GetString(data);

HtmlWeb htmlWeb = new HtmlWeb();
HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
htmlDoc.LoadHtml(strx);

//HtmlNode node = htmlDoc.GetElementbyId("sjina_C01_37");
//HtmlNode node = htmlDoc.DocumentNode.SelectSingleNode("//div[@class='nl_con clearfix']");
//HtmlNode list = node.SelectSingleNode("//ul");

//根据xpath ul下的li   li[2]代表索引   得到 名称
//HtmlNode nodeli_name = htmlDoc.DocumentNode.SelectSingleNode("//*[@id='bx1']/div/div[1]/div[1]/div/div/ul/li[1]/div/div[2]/div[1]/div[1]/a");
//根据xpath ul下的li   li[2]代表索引   得到 价格
//HtmlNode nodeli_price = htmlDoc.DocumentNode.SelectSingleNode("//*[@id='bx1']/div/div[1]/div[1]/div/div/ul/li[3]/div/div[2]/div[4]/span");
for (int i = 1; i <= 20; i++)
{
//通过Xpath选中html的指定元素

HouseModel houseModel = new HouseModel();
HtmlNode nodeli_name = htmlDoc.DocumentNode.SelectSingleNode("//*[@id='bx1']/div/div[1]/div[1]/div/div/ul/li[" + i + "]/div/div[2]/div[1]/div[1]/a");
if (nodeli_name != null)
{
houseModel.楼盘名称 = NoHTML(nodeli_name.InnerHtml);
}
HtmlNode nodeli_price = htmlDoc.DocumentNode.SelectSingleNode("//*[@id='bx1']/div/div[1]/div[1]/div/div/ul/li[" + i + "]/div/div[2]/div[4]/span");
if (nodeli_price != null)
{
houseModel.价格 = NoHTML(nodeli_price.InnerHtml);
}
HtmlNode nodeli_unit = htmlDoc.DocumentNode.SelectSingleNode("//*[@id='bx1']/div/div[1]/div[1]/div/div/ul/li[" + i + "]/div/div[2]/div[4]/em");
if (nodeli_unit != null)
{
houseModel.单位 = NoHTML(nodeli_unit.InnerHtml);
}

HtmlNode nodeli_address = htmlDoc.DocumentNode.SelectSingleNode("//*[@id='bx1']/div/div[1]/div[1]/div/div/ul/li[" + i + "]/div/div[2]/div[2]/div[1]/a");
if (nodeli_address != null)
{
string district = nodeli_address.InnerHtml.Split(']')[0];
houseModel.所在区域 = NoHTML(district).Substring(1);
houseModel.地址 = NoHTML(nodeli_address.InnerHtml);
}
houseModel.序号 = index;
houseList.Add(houseModel);
this.label1.Invoke(new MethodInvoker(delegate
{
this.label1.Text = "已经抓取:"+houseList.Count.ToString()+" 条.....";

}));
index++;

this.dataGridView1.Invoke(new MethodInvoker(delegate
{
this.dataGridView1.DataSource = null;
this.dataGridView1.DataSource = houseList;
this.dataGridView1.Columns[5].Width = 500;

this.dataGridView1.FirstDisplayedScrollingRowIndex = this.dataGridView1.Rows.Count - 1;
}));

Thread.Sleep(100);
}
}
this.label1.Invoke(new MethodInvoker(delegate
{
this.label1.Text = "已经抓取完毕。";

}));
}
public class GZipWebClient : WebClient
{
protected override WebRequest GetWebRequest(Uri address)
{
HttpWebRequest webrequest = (HttpWebRequest)base.GetWebRequest(address);
webrequest.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
return webrequest;
}
}
public class HouseModel
{
public int 序号 { get; set; }
public string 楼盘名称 { get; set; }
public string 所在区域 { get; set; }
public string 价格 { get; set; }
public string 单位 { get; set; }
public string 地址 { get; set; }
}

private void btn_exportexcel_Click(object sender, EventArgs e)
{
DataGridViewToExcel(this.dataGridView1);
}
#region
private void DataGridViewToExcel(DataGridView dgv)
{
SaveFileDialog dlg = new SaveFileDialog();
dlg.Filter = "Execl files (*.xls)|*.xls";
dlg.FilterIndex = 0;
dlg.RestoreDirectory = true;
dlg.CreatePrompt = true;
dlg.Title = "保存为Excel文件";

if (dlg.ShowDialog() == DialogResult.OK)
{
Stream myStream;
myStream = dlg.OpenFile();
StreamWriter sw = new StreamWriter(myStream, System.Text.Encoding.GetEncoding(-0));
string columnTitle = "";
try
{
//写入列标题
for (int i = 0; i < dgv.ColumnCount; i++)
{
if (i > 0)
{
columnTitle += "\t";
}
columnTitle += dgv.Columns[i].HeaderText;
}
sw.WriteLine(columnTitle);

//写入列内容
for (int j = 0; j < dgv.Rows.Count; j++)
{
string columnValue = "";
for (int k = 0; k < dgv.Columns.Count; k++)
{
if (k > 0)
{
columnValue += "\t";
}
if (dgv.Rows[j].Cells[k].Value == null)
columnValue += "";
else
columnValue += dgv.Rows[j].Cells[k].Value.ToString().Trim();
}
sw.WriteLine(columnValue);
}
sw.Close();
myStream.Close();
}
catch (Exception e)
{
MessageBox.Show(e.ToString());
}
finally
{
sw.Close();
myStream.Close();
}
}
}
#endregion
public static string NoHTML(string Htmlstring)
{
//删除脚本
Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase);
//删除HTML
Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", "   ", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

Htmlstring.Replace("<", "");
Htmlstring.Replace(">", "");
Htmlstring.Replace("\r\n", "");
return Htmlstring;
}
}
}


 源码地址:http://files.cnblogs.com/files/sunyj/WinformHtmlAgilityPack.rar
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: