您的位置:首页 > Web前端 > HTML

HTML解析利器HtmlAgilityPack

2011-09-20 15:52 651 查看
using System;

using System.Collections.Generic;

using System.Web;

using System.Web.UI;

using System.Web.UI.WebControls;

using HtmlAgilityPack;

using System.IO;

using System.Text.RegularExpressions;

using Maticsoft.Common;

namespace Maticsoft.Web.PubResources

{

public partial class ImportGroupMessage : System.Web.UI.Page

{

HtmlDocument doc = null;

StreamReader sr = null;

private readonly Maticsoft.BLL.publish_Resources bLL = new Maticsoft.BLL.publish_Resources();

/// <summary>
/// Access gate for the import page. Visitors who are not logged in or not
/// approved are redirected to /index.aspx; approved non-administrators
/// (cookie "cn" equal to "1") are redirected to /add.aspx. Any other value of
/// "cn" is allowed to stay on the page.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
protected void Page_Load(object sender, EventArgs e)
{
    // Read each cookie once instead of once per comparison.
    string approvalState = CookiesManager.GetCookie("cn");
    string loginName = CookiesManager.GetCookie("loginName");

    // Not logged in, or not approved.
    // A missing/empty cookie is treated as "not logged in": comparing only
    // against "" / "0" let a request with no "cn" cookie (or a null
    // "loginName") fall through both branches and stay on this page.
    // NOTE(review): assumes GetCookie may return null for an absent cookie - confirm.
    if (string.IsNullOrEmpty(loginName)
        || string.IsNullOrEmpty(approvalState)
        || approvalState == "0")
    {
        Response.Redirect("/index.aspx");
    }
    // Approved, but not an administrator.
    else if (approvalState == "1")
    {
        Response.Redirect("/add.aspx");
    }
}

/// <summary>
/// Handles the import button. Validates that a .mht file was uploaded, saves
/// it under the site's "upload" folder, calls bLL.Del with today's date
/// (per the original comment, to remove expired records), parses the file
/// with HtmlAgilityPack and hands over to getNode() to store the rows.
/// The uploaded file is removed again whether or not the import succeeds.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
protected void btn_Import_Click(object sender, EventArgs e)
{
    if (!FileUpload1.HasFile)
    {
        MessageBox.Show(this, "请选择要上传的文件!");
        return;
    }

    // Strip any directory part so the client-supplied name cannot point
    // outside the upload folder.
    string fileName = Path.GetFileName(FileUpload1.FileName);

    // File extension check; case-insensitive so "EXPORT.MHT" is accepted too.
    string fileType = Path.GetExtension(fileName);
    if (!string.Equals(fileType, ".mht", StringComparison.OrdinalIgnoreCase))
    {
        MessageBox.Show(this, "文件类型格式不对请重新选择!");
        return;
    }

    // Full server-side path of the uploaded file; built once and reused.
    string filepath = Path.Combine(Server.MapPath("\\upload"), fileName);

    // Save the upload into the server folder.
    FileUpload1.SaveAs(filepath);

    try
    {
        // The old test (filepath == "") could never be true; check the disk instead.
        if (!File.Exists(filepath))
        {
            MessageBox.Show(this, "没有找到服务器上,已上传的文件!");
            return;
        }

        // Delete all existing expired records (return value is not needed).
        bLL.Del(DateTime.Now.ToString("yyyy-MM-dd"));

        // getNode() reads the parsed document from the 'doc' field.
        sr = File.OpenText(filepath);
        doc = new HtmlDocument();
        doc.Load(sr);

        getNode();
    }
    finally
    {
        // 'sr' is still null when we leave before the file was opened, so
        // closing it unconditionally turned every early exit into a
        // NullReferenceException.
        if (sr != null)
        {
            sr.Close();
        }

        // Remove the imported file; done here so a failed import does not
        // leave uploads behind. The reader is closed first so the file is
        // no longer held open by this request.
        if (File.Exists(filepath))
        {
            File.Delete(filepath);
        }
    }

    // No catch block: exceptions propagate with their original stack trace
    // (the previous "throw ex;" reset it).
    MessageBox.Show(this,"导入成功!");
}

/// <summary>
/// Parses the HTML held in the <c>doc</c> field. Every <c>//table/tr</c> row
/// is inspected for three div elements (sender text containing the QQ number,
/// message content, and a header ending in the send time). Rows that pass the
/// length filters and whose text content has not been seen yet are collected,
/// and each collected row with a valid QQ number is inserted via bLL.Add.
/// </summary>
private void getNode()
{
    HtmlNodeCollection repeatNodes = doc.DocumentNode.SelectNodes("//table/tr");

    // HtmlAgilityPack returns null (not an empty collection) when the XPath
    // matches nothing; iterating it directly would throw.
    if (repeatNodes == null)
    {
        return;
    }

    List<Maticsoft.Model.publish_Resources> list = new List<Maticsoft.Model.publish_Resources>();

    // Walk the table rows.
    foreach (HtmlNode node in repeatNodes)
    {
        // Re-parse the row as its own document so the absolute XPath
        // queries below only see this row's markup.
        HtmlDocument d = new HtmlDocument();
        d.LoadHtml(node.InnerHtml);

        HtmlNode title = d.DocumentNode.SelectSingleNode("//td[1]//div[1]//div[1]");
        HtmlNode title2 = d.DocumentNode.SelectSingleNode("//td[1]//div[2]");
        HtmlNode title3 = d.DocumentNode.SelectSingleNode("//td[1]//div[1]");

        // A row is only usable when all three parts are present.
        if (title == null || title2 == null || title3 == null)
        {
            continue;
        }

        // QQ number
        string s = ExtractBracketedQq(title.InnerText);

        // Message content (markup and plain text)
        string s2 = title2.InnerHtml;
        string contentText = title2.InnerText;

        // Time the message was sent
        string s3 = ExtractTrailingTime(title3.InnerText);

        // NOTE(review): "s3.Length > 7" rejects the 7-character times
        // (single-digit hour) that ExtractTrailingTime produces and that
        // CheckIstime accepts - confirm whether ">= 7" was intended.
        // NOTE(review): the reason for the 300-character minimum on the
        // content markup is not visible here - confirm.
        if (s.Length > 7 && s2.Length > 300 && s3.Length > 7)
        {
            Maticsoft.Model.publish_Resources model = new Maticsoft.Model.publish_Resources();
            model.Qq = s;
            model.Content = s2;
            model.Infotime = s3;
            model.ContentText = contentText;

            // Skip rows whose text content is already in the list.
            if (getcontent(model.ContentText, list))
            {
                list.Add(model);
            }
        }
    }

    // Insert the collected rows into the database.
    foreach (Maticsoft.Model.publish_Resources publishResources in list)
    {
        Maticsoft.Model.publish_Resources mod = new Maticsoft.Model.publish_Resources();

        // Only keep the time when it is a well-formed h:mm:ss value.
        if (!string.IsNullOrEmpty(publishResources.Infotime))
        {
            if (CheckIstime(publishResources.Infotime))
            {
                mod.Infotime = publishResources.Infotime;
            }
        }

        // Only keep the QQ number when it is purely numeric.
        if (!string.IsNullOrEmpty(publishResources.Qq))
        {
            if (CheckIsQQNumber(publishResources.Qq))
            {
                mod.Qq = publishResources.Qq;
            }
        }

        mod.Content = publishResources.Content;
        mod.Type = "0";
        mod.Title = "";
        mod.GetDateTime = DateTime.Now.ToString();
        mod.ContentText = publishResources.ContentText;

        // Rows without a valid QQ number are not stored.
        if (mod.Qq != null)
        {
            bLL.Add(mod);
        }
    }
}

// Returns the text between the last "(" and the last ")" of senderText when it
// is longer than 10 characters and the brackets are in order; otherwise returns
// senderText unchanged.
private static string ExtractBracketedQq(string senderText)
{
    if (senderText.Length > 10)
    {
        int close = senderText.LastIndexOf(')');
        int open = senderText.LastIndexOf('(');

        // "close > open" also rules out a missing ")" and avoids a negative
        // Substring length when the last ")" comes before the last "(".
        if (open != -1 && close > open)
        {
            return senderText.Substring(open + 1, close - open - 1);
        }
    }

    return senderText;
}

// Returns the last 8 characters of headerText (7 when the first of those is
// ";" or ")"); text of 7 characters or fewer is returned unchanged.
private static string ExtractTrailingTime(string headerText)
{
    if (headerText.Length <= 7)
    {
        return headerText;
    }

    string tail = headerText.Substring(headerText.Length - 8);

    // A leading ";" or ")" belongs to the text in front of a 7-character time.
    if (tail[0] == ';' || tail[0] == ')')
    {
        tail = tail.Substring(1, 7);
    }

    return tail;
}

/// <summary>
/// Checks whether a string is a time of day written as hours:minutes:seconds
/// (hours 0-23 with one or two digits, minutes and seconds with two digits).
/// </summary>
/// <param name="StrSource">Text to test; null is treated as not a time.</param>
/// <returns>true when the whole string is a valid time, otherwise false.</returns>
private bool CheckIstime(String StrSource)
{
    if (StrSource == null)
    {
        return false;
    }

    // "\z" anchors at the very end of the input; "$" would also accept a
    // trailing newline after the seconds.
    return Regex.IsMatch(StrSource, @"^([0-1]?[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])\z");
}

/// <summary>
/// Checks whether a string is a QQ number: 5 to 12 ASCII digits and nothing else.
/// </summary>
/// <param name="StrSource">Text to test; null is treated as not a QQ number.</param>
/// <returns>true when the whole string is 5-12 digits, otherwise false.</returns>
private bool CheckIsQQNumber(String StrSource)
{
    if (StrSource == null)
    {
        return false;
    }

    // [0-9] instead of \d: in .NET \d also matches non-ASCII Unicode digits
    // (e.g. full-width digits). "\z" instead of "$" so a trailing newline is
    // not accepted.
    return Regex.IsMatch(StrSource, @"^[0-9]{5,12}\z");
}

/// <summary>
/// Tells whether a piece of text content is new, i.e. no record already in
/// the list carries the same ContentText.
/// </summary>
/// <param name="str">Text content of the candidate record.</param>
/// <param name="list">Records collected so far.</param>
/// <returns>true when no record in the list has this content; false when at least one does.</returns>
private bool getcontent(string str,List<Maticsoft.Model.publish_Resources> list)
{
    bool isNew = true;

    // Every entry is compared; a single match is enough to clear the flag.
    for (int index = 0; index < list.Count; index++)
    {
        isNew &= !(str == list[index].ContentText);
    }

    return isNew;
}

}

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: