您的位置:首页 > Web前端 > HTML

C#正则表达式 解析html+table tr td 内容

2017-05-01 08:51 501 查看
aspx页面获取方法:
    var tbZHXX = GetWorldexWyHtml(s, @"id=""tbZHXX""", @"class=""GridCommonItem""", "Worldex"); 

提交参数及隐藏域(hidden)值
 // Build the post-back form: every hidden field of the page that has a non-empty value,
 // followed by the bill number box and the search button, then submit it.
 var postUrl = "http://xxm.cn/glj/querydata/xxSearchOld.aspx";
 List<KeyValuePair<String, String>> paramList = GetViewHiddenData(postUrl)
     .Where(hidden => !string.IsNullOrEmpty(hidden.Value))
     .Select(hidden => new KeyValuePair<string, string>(hidden.Key, hidden.Value))
     .ToList();
 paramList.Add(new KeyValuePair<string, string>("txtBillNo", strBlNo));
 paramList.Add(new KeyValuePair<string, string>("btnSearch", "查询"));
 var s = HttpAspxPostMathHtml(postUrl, paramList);

/// <summary>
/// Extracts the cell values of every matching &lt;tr&gt; that sits inside a matching &lt;table&gt;
/// of an HTML string.
/// </summary>
/// <param name="regexInfo">HTML text to parse.</param>
/// <param name="regexParm">Regex fragment that must occur inside the opening &lt;table&gt; tag (e.g. its class or id attribute). It is inserted into the pattern unescaped.</param>
/// <param name="classParm">Regex fragment that must occur inside the opening &lt;tr&gt; tag (e.g. its class or id attribute). It is inserted into the pattern unescaped.</param>
/// <param name="companyCode">Selects the cell rule: "Worldex" reads &lt;td&gt; contents that are wrapped in CR/LF plus whitespace; any other value reads the contents of &lt;font&gt; elements.</param>
/// <returns>One <see cref="WWyDetails"/> per matched row, holding that row's extracted values in document order.</returns>
public static List<WWyDetails> GetWWyHtml(string regexInfo, string regexParm, string classParm, string companyCode)
{
    // Matches each <tr ...classParm...> ... </tr>; the look-behind requires that the row is preceded
    // by a <table ...regexParm...> opening tag with no other <table>/</table> tag in between.
    var rowRegex = new Regex("(?is)(?<=<table[^>]*?" + regexParm + "[^>]*?>(?:(?!</?table).)*)(?is)<tr[^>]*?" + classParm + "[^>]*?>(?:\\s*<td[^>]*>(.*?)</td>)*\\s*</tr>");

    // The cell rule depends only on companyCode, so it is chosen once instead of once per row.
    bool isWorldex = companyCode == "Worldex";
    var cellRegex = isWorldex
        // <td> or <td scope=...> whose content starts and ends with "\r\n" + whitespace.
        ? new Regex(@"(?<=<td(\s+scope=[^>]+)?>)\r\n\s+.*?\r\n\s+(?=</td>)")
        // e.g. <font color="#333333">SNL7QDJL510757</font>
        : new Regex(@"(?<=<font(\s+color=[^>]+)?>).*?(?=</font>)");

    var resultWdHtml = new List<WWyDetails>();
    foreach (Match row in rowRegex.Matches(regexInfo))
    {
        List<string> cells = cellRegex.Matches(row.Value)
            .Cast<Match>()
            // Only the Worldex rule captures the surrounding line breaks, so only it is trimmed.
            .Select(cell => isWorldex ? cell.Value.Trim() : cell.Value)
            .ToList();
        resultWdHtml.Add(new WWyDetails { WSinotrans = cells });
    }
    return resultWdHtml;
}

/// <summary>
/// Values extracted from one table row by GetWWyHtml.
/// </summary>
public class WWyDetails
{
    /// <summary>Extracted cell values of the row, in document order.</summary>
    public List<string> WSinotrans { get; set; }
}


获取asp页面及解析使用方法:  
// Post the search form of the ASP page (read back as gb2312) for bill number strBlNo.
var sPuci = HttpAspPostMathHtml(
    "http://xxx/index_dt_container.asp",
    "search=true&companyname=&companycode=&container_no=&bill_no=" + strBlNo + "&btn3.x=39&btn3.y=15",
    "gb2312");

// Among the class=tableGrid tables, take the one whose class=gridHeader row mentions "铅封号"
// and read its data rows with the "YTWY" cell rule.
var gdvContainer = GetSYDetails(
    sPuci,
    @"class=tableGrid",
    @"class=gridHeader",
    "铅封号",
    "YTWY");

/// <summary>
/// Finds the first table whose header row contains a keyword and returns the cell values of its
/// data rows.
/// </summary>
/// <param name="regexInfo">HTML text to parse.</param>
/// <param name="classTable">Regex fragment that must occur inside the opening &lt;table&gt; tag (e.g. its class or id attribute). It is inserted into the patterns unescaped.</param>
/// <param name="classParm">Attribute text of the header row; the header must open exactly as <c>&lt;tr classParm&gt;</c>. It is inserted into the pattern unescaped.</param>
/// <param name="compareInfo">Keyword that must appear in the header row for a table to be selected.</param>
/// <param name="companyCode">Selects the cell rule: "YTWY" reads &lt;td&gt; / &lt;td align=...&gt; cells and treats a row containing "查询数据为空。" as a no-data row; any other value reads &lt;td&gt; / &lt;td class=...&gt; cells.</param>
/// <returns>
/// One <see cref="SYDetails"/> per row after the first row of the selected table. A "YTWY" no-data
/// row yields an entry with an empty list. The result is empty when no table qualifies.
/// </returns>
public static List<SYDetails> GetSYDetails(string regexInfo, string classTable, string classParm, string compareInfo, string companyCode)
{
    // All three patterns depend only on the arguments, so they are built once up front.
    // Every <table ...classTable...> ... </table> element.
    var tableRegex = new Regex(@"<table.*?" + classTable + "[^>]*?>[\\s\\S]*?<\\/table>");
    // The header row and its cells, used for the keyword check.
    var headerRegex = new Regex("(?is)<tr " + classParm + ">(?:\\s*<td[^>]*>(.*?)</td>)*\\s*((?!</tr>).)*");
    // Every <tr> ... </tr> directly inside the selected table (no nested table tag in between).
    var rowRegex = new Regex(@"(?is)(?<=<table[^>]*?" + classTable + "[^>]*?>(?:(?!</?table).)*)(?is)<tr[^>]*?>(?:\\s*<td[^>]*>(.*?)</td>)*\\s*</tr>");

    bool isYtwy = companyCode == "YTWY";
    var cellRegex = isYtwy
        ? new Regex(@"(?is)(?<=<td(\s+align=[^>]+)?>).*?(?=\s*</td)")
        : new Regex(@"(?is)(?<=<td(\s+class=[^>]+)?>).*?(?=\s*</td)");

    var lstSTX = new List<SYDetails>();
    foreach (Match table in tableRegex.Matches(regexInfo))
    {
        // Only a table whose header row mentions the keyword is of interest.
        string header = headerRegex.Match(table.Value).Value.Trim();
        if (!header.Contains(compareInfo))
        {
            continue;
        }

        // Skip(1): the first row is the header row.
        foreach (Match row in rowRegex.Matches(table.Value).Cast<Match>().Skip(1))
        {
            List<string> cells;
            if (isYtwy && row.Value.Contains("查询数据为空。"))
            {
                // No-data row: give it its own empty list. Previously the list of the preceding
                // row was reused here, so the entry carried stale values and shared one instance.
                cells = new List<string>();
            }
            else
            {
                cells = cellRegex.Matches(row.Value).Cast<Match>().Select(cell => cell.Value).ToList();
            }
            lstSTX.Add(new SYDetails { SYTwy = cells });
        }

        // Only the first qualifying table is read.
        break;
    }
    return lstSTX;
}
/// <summary>
/// Values extracted from one table row by GetSYDetails.
/// </summary>
public class SYDetails
{
    /// <summary>Extracted cell values of the row, in document order.</summary>
    public List<string> SYTwy { get; set; }
}


#region ASP/ASPX页面  Get/Post获取返回数据
/// <summary>
/// Sends a form-urlencoded POST to an ASP page and returns the cleaned &lt;body&gt; of the response.
/// </summary>
/// <param name="Url">Address the form is posted to.</param>
/// <param name="postDataStr">Already url-encoded form body (name=value pairs joined with &amp;).</param>
/// <param name="encoding">Name of the page's character encoding (e.g. "gb2312"); used for both the request body and the response text.</param>
/// <returns>
/// The first &lt;body&gt;…&lt;/body&gt; element (tags included, trimmed) of the response after DelHTML
/// has been applied; when the response has no such element, the whole cleaned response.
/// </returns>
public static string HttpAspPostMathHtml(string Url, string postDataStr, string encoding)
{
    System.Text.Encoding pageEncoding = System.Text.Encoding.GetEncoding(encoding);

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
    request.Method = "POST";
    request.Accept = "text/html, application/xhtml+xml, image/jxr, */*";
    request.ContentType = "application/x-www-form-urlencoded";
    request.KeepAlive = true;
    request.Headers.Add("Accept-Language", "zh-Hans-CN,zh-Hans;q=0.7,ja;q=0.3");
    // Let the framework negotiate AND decode gzip/deflate. The previous code only sent the
    // Accept-Encoding header by hand, so a compressed reply was read as garbage text.
    request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
    request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko";

    // Encode the body with the page encoding rather than the machine's default code page,
    // so the bytes sent do not depend on where this code runs.
    byte[] bytes = pageEncoding.GetBytes(postDataStr);
    request.ContentLength = bytes.Length;
    using (Stream stream = request.GetRequestStream())
    {
        stream.Write(bytes, 0, bytes.Length);
    }

    string retString;
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream responsestream = response.GetResponseStream())
    using (StreamReader sr = new StreamReader(responsestream, pageEncoding))
    {
        retString = sr.ReadToEnd();
    }

    // Keep only the body element; fall back to the cleaned text instead of throwing on an
    // out-of-range index when the page has no <body>.
    string cleaned = DelHTML(retString);
    Match body = Regex.Match(cleaned, @"(?is)<body[^>]*?>([\s\S].*?)</body>");
    return body.Success ? body.Value.Trim() : cleaned;
}

/// <summary>
/// Sends a GET request to an ASP page and returns the response text unmodified.
/// </summary>
/// <param name="Url">Address of the page, without a query string.</param>
/// <param name="postDataStr">Query string (without the leading '?') that is appended to <paramref name="Url"/>.</param>
/// <param name="encoding">Name of the page's character encoding (e.g. "gb2312") used to decode the response.</param>
/// <returns>The complete response text.</returns>
public static string HttpAspGetMathHtml(string Url, string postDataStr, string encoding)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url + "?" + postDataStr);
    request.Method = "GET";
    request.Accept = "text/html, application/xhtml+xml, image/jxr, */*";
    request.KeepAlive = true;
    request.Headers.Add("Accept-Language", "zh-Hans-CN,zh-Hans;q=0.7,ja;q=0.3");
    // Let the framework negotiate AND decode gzip/deflate. The previous code only sent the
    // Accept-Encoding header by hand, so a compressed reply was read as garbage text.
    request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
    request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko";

    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream responsestream = response.GetResponseStream())
    using (StreamReader sr = new StreamReader(responsestream, System.Text.Encoding.GetEncoding(encoding)))
    {
        return sr.ReadToEnd();
    }
}

/// <summary>
/// Downloads an ASPX page and collects its self-closed hidden input fields (such as the view
/// state) so they can be posted back.
/// </summary>
/// <param name="Url">Address of the page to download.</param>
/// <returns>
/// Map from each hidden input's id attribute to its value attribute. Inputs without a
/// recognisable id are skipped; a value that cannot be read is stored as an empty string.
/// </returns>
public Dictionary<string, string> GetViewHiddenData(string Url)
{
    string html;
    // using blocks release the client and the response even when the request throws.
    using (HttpClient httpClient = new HttpClient())
    {
        httpClient.MaxResponseContentBufferSize = 256000;
        httpClient.DefaultRequestHeaders.Add("user-agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36");
        using (HttpResponseMessage response = httpClient.GetAsync(new Uri(Url)).Result)
        {
            html = response.Content.ReadAsStringAsync().Result;
        }
    }

    Dictionary<string, string> returnHidden = new Dictionary<string, string>();
    foreach (Match input in Regex.Matches(html, @"<input type=""hidden""[^>]*?.*?\/>"))
    {
        string item = input.Value.Trim();

        // NOTE(review): the key is taken from the id attribute; a form post is keyed by the
        // name attribute, so this presumably relies on id and name being identical - confirm.
        var key = Regex.Match(item, @"<input type=""hidden""[^>]*?[^>]+?id=""([\s\S]+?)""[^>]+>").Groups[1].Value;
        var value = Regex.Match(item, @"<input type=""hidden""[^>]*?[^>]+?value=""([\s\S]+?)""[^>]+>").Groups[1].Value;

        if (key.Length == 0)
        {
            // No id found: nothing usable to post back under.
            continue;
        }

        // Indexer instead of Add: a repeated id overwrites rather than throwing ArgumentException.
        returnHidden[key] = value;
    }
    return returnHidden;
}

/// <summary>
/// Posts form fields to an ASPX page and returns the cleaned &lt;body&gt; of the response.
/// </summary>
/// <param name="Url">Address of the page; it is requested with GET first and then posted to.</param>
/// <param name="postDataStr">Form fields to send, url-encoded by FormUrlEncodedContent.</param>
/// <returns>
/// The first &lt;body&gt;…&lt;/body&gt; element (tags included, trimmed) of the response after DelHTML
/// has been applied; when the response has no such element, the whole cleaned response.
/// </returns>
public static string HttpAspxPostMathHtml(string Url, List<KeyValuePair<String, String>> postDataStr)
{
    string retString;
    // using blocks release the client and the responses even when a request throws.
    using (HttpClient httpClient = new HttpClient())
    {
        Uri target = new Uri(Url);

        // The page is fetched once before posting and the reply is discarded.
        // NOTE(review): presumably this establishes session cookies for the POST - confirm.
        httpClient.GetAsync(target).Result.Dispose();

        using (HttpResponseMessage response = httpClient.PostAsync(target, new FormUrlEncodedContent(postDataStr)).Result)
        {
            retString = response.Content.ReadAsStringAsync().Result;
        }
    }

    // Keep only the body element; fall back to the cleaned text instead of throwing on an
    // out-of-range index when the page has no <body>.
    string cleaned = DelHTML(retString);
    Match body = Regex.Match(cleaned, @"(?is)<body[^>]*?>([\s\S].*?)</body>");
    return body.Success ? body.Value.Trim() : cleaned;
}

#region	将HTML去除一些无用数据
/// <summary>
/// Strips content that gets in the way of table parsing from an HTML string: script elements,
/// self-closed hidden inputs and comments. It also decodes a small set of character entities.
/// </summary>
/// <param name="Htmlstring">HTML text to clean; must not be null.</param>
/// <returns>
/// The cleaned text. Line breaks are preserved. The entities quot, amp, lt, gt, iexcl, cent,
/// pound and copy (named or as #34, #38, #60, #62, #161, #162, #163, #169) are decoded, every
/// other decimal numeric entity is removed, and other named entities (e.g. nbsp) are left as is.
/// </returns>
public static string DelHTML(string Htmlstring)
{
    const RegexOptions SingleLine = RegexOptions.IgnoreCase;
    // Singleline lets '.' cross line breaks so multi-line scripts and comments are removed whole.
    const RegexOptions AcrossLines = RegexOptions.IgnoreCase | RegexOptions.Singleline;

    // Script elements, including their (possibly multi-line) content.
    Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", AcrossLines);

    // Self-closed hidden inputs (view state and similar).
    Htmlstring = Regex.Replace(Htmlstring, @"<input type=""hidden""[^>]*?.*?\/>", "", SingleLine);

    // Well-formed comments first, so that text following a comment on the same line survives.
    Htmlstring = Regex.Replace(Htmlstring, @"<!--.*?-->", "", AcrossLines);
    // Then the leftovers of malformed comments: a stray terminator, or an opener that is never
    // closed (removed up to the end of its line).
    Htmlstring = Regex.Replace(Htmlstring, @"-->", "", SingleLine);
    Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", SingleLine);

    // Decode entities in ONE pass. Replacing them one after another decoded "&amp;lt;" twice
    // (first to "&lt;", then to "<").
    Htmlstring = Regex.Replace(
        Htmlstring,
        @"&(quot|amp|lt|gt|iexcl|cent|pound|copy|#\d+);",
        entity =>
        {
            switch (entity.Groups[1].Value.ToLowerInvariant())
            {
                case "quot":
                case "#34":
                    return "\"";
                case "amp":
                case "#38":
                    return "&";
                case "lt":
                case "#60":
                    return "<";
                case "gt":
                case "#62":
                    return ">";
                case "iexcl":
                case "#161":
                    return "\u00A1";
                case "cent":
                case "#162":
                    return "\u00A2";
                case "pound":
                case "#163":
                    return "\u00A3";
                case "copy":
                case "#169":
                    return "\u00A9";
                default:
                    // Any other numeric entity is dropped.
                    return "";
            }
        },
        SingleLine);

    // Line breaks are kept on purpose. The earlier code called Htmlstring.Replace("\r\n", "") but
    // discarded the result, so they were never removed, and GetWWyHtml's "Worldex" cell pattern
    // requires "\r\n" inside the cells of the text returned here.
    return Htmlstring;
}
#endregion
#endregion
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息