C#正则表达式 解析html+table tr td 内容
2017-05-01 08:51
501 查看
aspx页面获取方法:
var tbZHXX = GetWWyHtml(s, @"id=""tbZHXX""", @"class=""GridCommonItem""", "Worldex");

提交参数及隐藏域(hidden)值:
var postUrl = "http://xxm.cn/glj/querydata/xxSearchOld.aspx";
List<KeyValuePair<String, String>> paramList = new List<KeyValuePair<String, String>>();
GetViewHiddenData(postUrl).ToList().ForEach(x => paramList.Add(new KeyValuePair<string, string>(x.Key, x.Value)));
paramList = (from p in paramList where !string.IsNullOrEmpty(p.Value) select p).ToList();
paramList.Add(new KeyValuePair<string, string>("txtBillNo", strBlNo));
paramList.Add(new KeyValuePair<string, string>("btnSearch", "查询"));
var s = HttpAspxPostMathHtml(postUrl, paramList);

/// <summary>
/// Extracts the cell texts of every matching row of one HTML table.
/// </summary>
/// <param name="regexInfo">HTML text to parse. Null or empty input yields an empty list.</param>
/// <param name="regexParm">
/// Regex fragment that must occur inside the opening &lt;table&gt; tag (for example its id or
/// class attribute). It is spliced into the pattern verbatim and is NOT escaped.
/// </param>
/// <param name="classParm">
/// Regex fragment that must occur inside the opening &lt;tr&gt; tag of the rows to extract.
/// Spliced into the pattern verbatim as well.
/// </param>
/// <param name="companyCode">
/// Selects the cell-extraction rule: "Worldex" reads the text of each &lt;td&gt;; any other
/// value reads the text of each &lt;font&gt; element inside the row.
/// </param>
/// <returns>One <see cref="WWyDetails"/> per matched row, in document order.</returns>
public static List<WWyDetails> GetWWyHtml(string regexInfo, string regexParm, string classParm, string companyCode)
{
    List<WWyDetails> rows = new List<WWyDetails>();
    if (string.IsNullOrEmpty(regexInfo))
    {
        return rows;
    }

    // A row qualifies when it is preceded (look-behind) by the opening tag of the wanted table
    // with no other <table>/</table> tag in between, and its own <tr> tag contains classParm.
    string rowPattern = "(?is)(?<=<table[^>]*?" + regexParm + "[^>]*?>(?:(?!</?table).)*)(?is)<tr[^>]*?" + classParm + "[^>]*?>(?:\\s*<td[^>]*>(.*?)</td>)*\\s*</tr>";
    Regex rowRegex = new Regex(rowPattern);
    bool isWorldex = companyCode == "Worldex"; // Worldex (港联捷)

    foreach (Match row in rowRegex.Matches(regexInfo))
    {
        List<string> cells;
        if (isWorldex)
        {
            // The cell text sits on its own line: CRLF + indentation before and after it.
            // The opening tag may be a bare <td> or carry a scope=... attribute; cells whose
            // tag has any other attribute are not matched by this pattern.
            cells = Regex.Matches(row.Value, @"(?<=<td(\s+scope=[^>]+)?>)\r\n\s+.*?\r\n\s+(?=</td>)")
                .Cast<Match>()
                .Select(cell => cell.Value.Trim())
                .ToList();
        }
        else
        {
            // The cell text is wrapped in a font element, e.g.
            // <font color="#333333">SNL7QDJL510757</font>
            cells = Regex.Matches(row.Value, @"(?<=<font(\s+color=[^>]+)?>).*?(?=</font>)")
                .Cast<Match>()
                .Select(cell => cell.Value)
                .ToList();
        }

        rows.Add(new WWyDetails() { WSinotrans = cells });
    }

    return rows;
}

/// <summary>
/// One parsed table row: the extracted cell texts in column order.
/// </summary>
public class WWyDetails
{
    /// <summary>Cell texts of the row.</summary>
    public List<string> WSinotrans { get; set; }
}
获取asp页面及解析使用方法:
var sPuci = HttpAspPostMathHtml("http://xxx/index_dt_container.asp", "search=true&companyname=&companycode=&container_no=&bill_no=" + strBlNo + "&btn3.x=39&btn3.y=15", "gb2312");
var gdvContainer = GetSYDetails(sPuci, @"class=tableGrid", @"class=gridHeader", "铅封号", "YTWY");

/// <summary>
/// Finds the first table whose header row contains a keyword and returns the cell texts of
/// its data rows.
/// </summary>
/// <param name="regexInfo">HTML text to parse. Null or empty input yields an empty list.</param>
/// <param name="classTable">
/// Regex fragment that must occur inside the opening &lt;table&gt; tag (class or id attribute).
/// Spliced into the patterns verbatim, not escaped.
/// </param>
/// <param name="classParm">
/// Attribute text of the header row; the header's opening tag must read exactly
/// "&lt;tr " + classParm + "&gt;". Spliced into the pattern verbatim.
/// </param>
/// <param name="compareInfo">Keyword that the header row must contain for the table to be used.</param>
/// <param name="companyCode">
/// Selects the cell rule: "YTWY" accepts &lt;td&gt; tags that are bare or carry align=...;
/// any other value accepts &lt;td&gt; tags that are bare or carry class=....
/// </param>
/// <returns>
/// One <see cref="SYDetails"/> per row after the first row of the selected table. For "YTWY",
/// a row containing the text "查询数据为空。" produces an entry with an empty cell list.
/// Empty when no table qualifies.
/// </returns>
public static List<SYDetails> GetSYDetails(string regexInfo, string classTable, string classParm, string compareInfo, string companyCode)
{
    List<SYDetails> result = new List<SYDetails>();
    if (string.IsNullOrEmpty(regexInfo))
    {
        return result;
    }

    // Every candidate table, from its opening tag through the nearest </table>.
    Regex tableRegex = new Regex(@"<table.*?" + classTable + "[^>]*?>[\\s\\S]*?<\\/table>");
    // Header row plus whatever follows it up to (not including) its </tr>.
    string headerPattern = "(?is)<tr " + classParm + ">(?:\\s*<td[^>]*>(.*?)</td>)*\\s*((?!</tr>).)*";
    // Every <tr>...</tr> that follows the table's opening tag with no nested table tag between.
    Regex rowRegex = new Regex(@"(?is)(?<=<table[^>]*?" + classTable + "[^>]*?>(?:(?!</?table).)*)(?is)<tr[^>]*?>(?:\\s*<td[^>]*>(.*?)</td>)*\\s*</tr>");

    foreach (Match table in tableRegex.Matches(regexInfo))
    {
        string header = Regex.Match(table.Value, headerPattern).Value.Trim();
        if (!header.Contains(compareInfo))
        {
            continue;
        }

        // Skip(1): the first row of the table is the header row.
        foreach (Match row in rowRegex.Matches(table.Value).Cast<Match>().Skip(1))
        {
            // A fresh list per row. Previously one list variable was shared across rows, so a
            // "no data" row re-added the preceding row's cells (the very same list instance).
            List<string> cells = new List<string>();

            if (companyCode == "YTWY") // Yantai Sinotrans international terminal (烟台外运国际码头)
            {
                if (!row.Value.Contains("查询数据为空。"))
                {
                    cells = Regex.Matches(row.Value, @"(?is)(?<=<td(\s+align=[^>]+)?>).*?(?=\s*</td)")
                        .Cast<Match>()
                        .Select(cell => cell.Value)
                        .ToList();
                }
            }
            else
            {
                cells = Regex.Matches(row.Value, @"(?is)(?<=<td(\s+class=[^>]+)?>).*?(?=\s*</td)")
                    .Cast<Match>()
                    .Select(cell => cell.Value)
                    .ToList();
            }

            result.Add(new SYDetails() { SYTwy = cells });
        }

        // Only the first table whose header matches is processed.
        break;
    }

    return result;
}

/// <summary>
/// One parsed table row: the extracted cell texts in column order.
/// </summary>
public class SYDetails
{
    /// <summary>Cell texts of the row.</summary>
    public List<string> SYTwy { get; set; }
}
#region ASP/ASPX page GET/POST helpers

/// <summary>
/// POSTs a form-encoded body to a classic ASP page and returns the cleaned &lt;body&gt; element
/// of the response.
/// </summary>
/// <param name="Url">URL to post to.</param>
/// <param name="postDataStr">Already URL-encoded form body ("a=1&amp;b=2").</param>
/// <param name="encoding">Name of the encoding used to decode the response (e.g. "gb2312").</param>
/// <returns>
/// The response after <see cref="DelHTML"/>, reduced to its first &lt;body&gt;...&lt;/body&gt;
/// element (tags included, trimmed). If no body element is found the whole cleaned text is
/// returned.
/// </returns>
public static string HttpAspPostMathHtml(string Url, string postDataStr, string encoding)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
    request.Method = "POST";
    request.Accept = "text/html, application/xhtml+xml, image/jxr, */*";
    request.ContentType = "application/x-www-form-urlencoded";
    request.KeepAlive = true;
    request.Headers.Add("Accept-Language", "zh-Hans-CN,zh-Hans;q=0.7,ja;q=0.3");
    // Advertise gzip/deflate AND have the framework decode it. Adding the Accept-Encoding
    // header by hand (as before) lets the server compress the reply without anything
    // decompressing it, so the StreamReader would decode compressed bytes as text.
    request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
    request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko";

    // NOTE(review): the body is encoded with the machine default encoding, not with the
    // `encoding` argument - confirm this is intended for non-ASCII form values.
    byte[] bytes = System.Text.Encoding.Default.GetBytes(postDataStr);
    request.ContentLength = bytes.Length;
    using (Stream stream = request.GetRequestStream())
    {
        stream.Write(bytes, 0, bytes.Length);
    }

    string retString;
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream responsestream = response.GetResponseStream())
    using (StreamReader sr = new StreamReader(responsestream, System.Text.Encoding.GetEncoding(encoding)))
    {
        retString = sr.ReadToEnd();
    }

    return ExtractBodyHtml(retString);
}

/// <summary>
/// Issues a GET request to a classic ASP page and returns the raw response text.
/// </summary>
/// <param name="Url">Page URL without query string.</param>
/// <param name="postDataStr">Query string without the leading '?'; appended as Url + "?" + postDataStr.</param>
/// <param name="encoding">Name of the encoding used to decode the response.</param>
/// <returns>The complete response text; no clean-up is applied.</returns>
public static string HttpAspGetMathHtml(string Url, string postDataStr, string encoding)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url + "?" + postDataStr);
    request.Method = "GET";
    request.Accept = "text/html, application/xhtml+xml, image/jxr, */*";
    request.KeepAlive = true;
    request.Headers.Add("Accept-Language", "zh-Hans-CN,zh-Hans;q=0.7,ja;q=0.3");
    // See HttpAspPostMathHtml: compression is only requested together with decompression.
    request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
    request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko";

    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream responsestream = response.GetResponseStream())
    using (StreamReader sr = new StreamReader(responsestream, System.Text.Encoding.GetEncoding(encoding)))
    {
        return sr.ReadToEnd();
    }
}

/// <summary>
/// Downloads an ASPX page and collects its hidden form fields (view state and the like) so
/// they can be sent back with a later POST.
/// </summary>
/// <param name="Url">Page URL.</param>
/// <returns>
/// Map from each hidden input's id to its value ("" when the value attribute is empty or
/// absent). Inputs without an id are skipped; when an id occurs more than once the last
/// occurrence wins.
/// </returns>
public Dictionary<string, string> GetViewHiddenData(string Url)
{
    string html;
    using (HttpClient httpClient = new HttpClient())
    {
        httpClient.MaxResponseContentBufferSize = 256000;
        httpClient.DefaultRequestHeaders.Add("user-agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36");
        using (HttpResponseMessage response = httpClient.GetAsync(new Uri(Url)).Result)
        {
            html = response.Content.ReadAsStringAsync().Result;
        }
    }

    Dictionary<string, string> returnHidden = new Dictionary<string, string>();
    foreach (Match input in Regex.Matches(html, @"<input type=""hidden""[^>]*?.*?\/>"))
    {
        string item = input.Value;
        string key = Regex.Match(item, @"<input type=""hidden""[^>]*?[^>]+?id=""([\s\S]+?)""[^>]+>").Groups[1].Value;
        string value = Regex.Match(item, @"<input type=""hidden""[^>]*?[^>]+?value=""([\s\S]+?)""[^>]+>").Groups[1].Value;

        if (key.Length == 0)
        {
            continue; // no id attribute: nothing to post this value under
        }

        // Indexer instead of Add: a repeated id must not abort the whole extraction.
        returnHidden[key] = value;
    }

    return returnHidden;
}

/// <summary>
/// POSTs form fields to an ASPX page and returns the cleaned &lt;body&gt; element of the
/// response.
/// </summary>
/// <param name="Url">Page URL; it is requested with GET once before the POST.</param>
/// <param name="postDataStr">Form fields to send, form-url-encoded by this method.</param>
/// <returns>
/// The response after <see cref="DelHTML"/>, reduced to its first &lt;body&gt;...&lt;/body&gt;
/// element (tags included, trimmed). If no body element is found the whole cleaned text is
/// returned.
/// </returns>
public static string HttpAspxPostMathHtml(string Url, List<KeyValuePair<String, String>> postDataStr)
{
    string retString;
    using (HttpClient httpClient = new HttpClient())
    {
        // The GET response itself is not used.
        // NOTE(review): presumably this primes session cookies on the client before the
        // POST - confirm against the target site before removing it.
        using (HttpResponseMessage warmUp = httpClient.GetAsync(new Uri(Url)).Result)
        {
        }

        using (HttpResponseMessage response = httpClient.PostAsync(new Uri(Url), new FormUrlEncodedContent(postDataStr)).Result)
        {
            retString = response.Content.ReadAsStringAsync().Result;
        }
    }

    return ExtractBodyHtml(retString);
}

/// <summary>
/// Cleans a page with <see cref="DelHTML"/> and returns its first body element, or the whole
/// cleaned text when the page has no body element.
/// </summary>
private static string ExtractBodyHtml(string html)
{
    string cleaned = DelHTML(html);
    Match body = Regex.Match(cleaned, @"(?is)<body[^>]*?>([\s\S].*?)</body>");
    return body.Success ? body.Value.Trim() : cleaned;
}

#region HTML clean-up

/// <summary>
/// Strips parts of an HTML page that are irrelevant for table parsing and decodes a few
/// character entities.
/// </summary>
/// <param name="Htmlstring">HTML text; null is treated as empty.</param>
/// <returns>
/// The text with single-line script elements, hidden inputs and comment markers removed,
/// the entities quot/amp/lt/gt/iexcl/cent/pound/copy decoded, and remaining numeric
/// character references deleted. Line breaks are preserved.
/// </returns>
public static string DelHTML(string Htmlstring)
{
    if (string.IsNullOrEmpty(Htmlstring))
    {
        return string.Empty;
    }

    // Scripts. '.' does not match a line feed here, so only scripts that open and close on
    // one line are removed.
    Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase);
    // Hidden form fields.
    Htmlstring = Regex.Replace(Htmlstring, @"<input type=""hidden""[^>]*?.*?\/>", "", RegexOptions.IgnoreCase);
    // Comments: drop the closing marker first, then everything from "<!--" to the end of
    // that line.
    Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);
    // Character entities. &nbsp; is intentionally left as it is.
    Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\u00a1", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\u00a2", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\u00a3", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\u00a9", RegexOptions.IgnoreCase);
    // Any other numeric character reference is dropped. The pattern used to be "(\d+);",
    // which also deleted ordinary page text such as "12;".
    Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

    // CRLF is deliberately kept. The former Htmlstring.Replace("\r\n", "") discarded its
    // result and so never had any effect, and the Worldex cell pattern in GetWWyHtml requires
    // "\r\n" around the cell text - actually stripping line breaks here would break it.
    return Htmlstring;
}

#endregion

#endregion
相关文章推荐
- 正则表达式相关:C# RichTextBox显示html文本内容
- 正则表达式相关:C# RichTextBox显示html文本内容
- 正则表达式相关:C# RichTextBox显示html文本内容
- java根据 正则表达式解析html网页内容
- 正则表达式获取HTML标记中的内容(C#)
- vb.net 使用 Regex Replace 正则 替换 Html字串的table中tbody第一个tr下的td为th
- 正则表达式获取HTML标记中的内容(C#)
- C# 正则表达式匹配多层嵌套的括号里面的内容 百度api逆地址解析
- C# 处理html 标签一些正则表达式 整理收集
- c# 正则表达式对网页进行有效内容抽取
- C#正则表达式提取HTML中IMG标签的SRC地址
- C#中利用正则表达式去除HTML中的格式
- HTML基础 table中的tr中的td标签中的valign属性设置文本靠上,中间,靠下
- C#中利用正则表达式获取字符串中双引号包含的内容
- C#正则表达式提取HTML中IMG标签的SRC地址
- 用正则表达式解析C#文件(Updated)
- c# 正则表达式对网页进行有效内容抽取
- 关于html中table表格tr,td的高度和宽度
- HTML解析之一:正则表达式
- C#正则表达式匹配HTML中的图片路径,图片地址