C# Html格式内容转Csv内容包括table(重点在rowspan和colspan合并),p,div元素
2016-04-01 10:04
1011 查看
将 Html 格式的内容转换为 Csv 内容,支持 table(重点是处理 rowspan 和 colspan 合并单元格)、p、div 元素;注意:不支持嵌套的 table。
html:
csv:
url:http://www.cnblogs.com/dreamman/p/5343924.html
/// <summary>
/// Converts HTML markup into comma-separated text.
/// Every table is emitted as a "#flag_table#," marker line followed by one line per row,
/// with rowspan/colspan merges expanded into a rectangular grid. The p elements and then the
/// div elements are each emitted as a "#flag_text#," marker line followed by one line per element.
/// Nested tables are not supported.
/// </summary>
/// <param name="hrml">HTML markup to convert. Null or empty input yields an empty string.</param>
/// <returns>The generated CSV text; every emitted line ends with a trailing comma.</returns>
private string HtmlToCsv(string hrml)
{
    if (string.IsNullOrEmpty(hrml))
    {
        return string.Empty;
    }

    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(hrml);
    StringBuilder sbLines = new StringBuilder();

    HtmlAgilityPack.HtmlNodeCollection tList = doc.DocumentNode.SelectNodes("//table");
    if (tList != null)
    {
        foreach (HtmlAgilityPack.HtmlNode table in tList)
        {
            sbLines.AppendLine("#flag_table#,");

            // ".//tr" is evaluated relative to this table. A bare "//tr" is an absolute
            // path and would return the rows of every table in the document.
            HtmlAgilityPack.HtmlNodeCollection rows = table.SelectNodes(".//tr");
            if (rows == null)
            {
                continue;
            }

            // Assemble the CSV lines for this table.
            foreach (string[] cols in BuildTableGrid(rows))
            {
                foreach (string col in cols)
                {
                    sbLines.Append(col + ",");
                }

                sbLines.AppendLine(",");
            }

            // Empty the table so its text is not repeated by the p/div passes below.
            table.RemoveAll();
        }
    }

    // Paragraphs are cleared after being read so an enclosing div does not repeat their text.
    AppendTextNodes(sbLines, doc.DocumentNode.SelectNodes("//p"), true);

    // Divs are left in place after being read.
    AppendTextNodes(sbLines, doc.DocumentNode.SelectNodes("//div"), false);

    return sbLines.ToString();
}

/// <summary>
/// Lays the cells of the given rows out on a rectangular grid, expanding rowspan/colspan.
/// The top-left slot of a merged area holds the cell text (a single space when the text is
/// empty); the other slots covered by the merge hold an empty string.
/// </summary>
/// <param name="rows">The tr nodes of one table, in document order.</param>
/// <returns>One string array per row; all arrays have the same length and contain no nulls.</returns>
private static string[][] BuildTableGrid(HtmlAgilityPack.HtmlNodeCollection rows)
{
    int rowCount = rows.Count;

    // A null slot means "not yet claimed by any cell".
    List<string>[] grid = new List<string>[rowCount];
    for (int r = 0; r < rowCount; r++)
    {
        grid[r] = new List<string>();
    }

    for (int r = 0; r < rowCount; r++)
    {
        int cursor = 0;
        foreach (HtmlAgilityPack.HtmlNode cell in rows[r].ChildNodes.Where(IsTableCell))
        {
            int colspan = ReadSpan(cell, "colspan");
            int rowspan = ReadSpan(cell, "rowspan");

            // Skip slots already claimed by a rowspan that started in an earlier row.
            while (cursor < grid[r].Count && grid[r][cursor] != null)
            {
                cursor++;
            }

            string text = CleanCellText(cell.InnerText);
            bool isMerged = colspan > 1 || rowspan > 1;
            if (isMerged && text == string.Empty)
            {
                text = " ";
            }

            // Clamp the row span so a span running past the last row cannot overflow the grid.
            int rowEnd = r + rowspan;
            if (rowEnd > rowCount)
            {
                rowEnd = rowCount;
            }

            int colEnd = cursor + colspan;
            for (int ri = r; ri < rowEnd; ri++)
            {
                List<string> line = grid[ri];
                while (line.Count < colEnd)
                {
                    line.Add(null);
                }

                for (int ci = cursor; ci < colEnd; ci++)
                {
                    line[ci] = (ri == r && ci == cursor) ? text : string.Empty;
                }
            }

            cursor = colEnd;
        }
    }

    // Pad every row to the widest row so the output stays rectangular.
    int width = 0;
    foreach (List<string> line in grid)
    {
        if (line.Count > width)
        {
            width = line.Count;
        }
    }

    string[][] result = new string[rowCount][];
    for (int r = 0; r < rowCount; r++)
    {
        result[r] = new string[width];
        for (int c = 0; c < width; c++)
        {
            string value = c < grid[r].Count ? grid[r][c] : null;
            result[r][c] = value ?? string.Empty;
        }
    }

    return result;
}

/// <summary>
/// Appends a "#flag_text#," marker line and then one line per node to the output.
/// Does nothing when <paramref name="nodes"/> is null.
/// </summary>
/// <param name="sbLines">Output buffer to append to.</param>
/// <param name="nodes">Nodes whose text is exported; may be null when none were found.</param>
/// <param name="clearAfterRead">When true, each node is emptied after its text has been read.</param>
private void AppendTextNodes(StringBuilder sbLines, HtmlAgilityPack.HtmlNodeCollection nodes, bool clearAfterRead)
{
    if (nodes == null)
    {
        return;
    }

    sbLines.AppendLine("#flag_text#,");
    foreach (HtmlAgilityPack.HtmlNode node in nodes)
    {
        // GetTextByHtml is defined outside this block; its result is used as the final text.
        string text = GetTextByHtml(CleanCellText(node.InnerText));
        if (!string.IsNullOrWhiteSpace(text))
        {
            sbLines.Append(text + ",");
        }

        sbLines.AppendLine(",");

        if (clearAfterRead)
        {
            node.RemoveAll();
        }
    }
}

/// <summary>
/// Returns true for td and th nodes (case-insensitive), i.e. the nodes that occupy grid slots.
/// </summary>
private static bool IsTableCell(HtmlAgilityPack.HtmlNode node)
{
    return string.Equals(node.OriginalName, "td", System.StringComparison.OrdinalIgnoreCase)
        || string.Equals(node.OriginalName, "th", System.StringComparison.OrdinalIgnoreCase);
}

/// <summary>
/// Reads a colspan/rowspan attribute. Missing, malformed or non-positive values count as 1,
/// and the result is capped at 1000 so a bogus value cannot allocate an enormous grid.
/// </summary>
private static int ReadSpan(HtmlAgilityPack.HtmlNode cell, string attributeName)
{
    const int maxSpan = 1000;

    HtmlAgilityPack.HtmlAttribute attr = cell.Attributes[attributeName];
    int span;
    if (attr == null
        || !int.TryParse(attr.Value, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out span)
        || span < 1)
    {
        return 1;
    }

    return span > maxSpan ? maxSpan : span;
}

/// <summary>
/// Normalizes element text for use as a single CSV field: removes spaces and line breaks,
/// trims the result, and replaces ASCII commas so the text cannot introduce extra fields.
/// </summary>
private static string CleanCellText(string raw)
{
    if (raw == null)
    {
        return string.Empty;
    }

    // The published code called Replace(",", ","), which changes nothing and lets commas in
    // the text split the field. A fullwidth comma keeps the text readable while leaving the
    // ASCII comma reserved as the delimiter.
    // NOTE(review): the space removal is kept exactly as published; confirm whether the
    // original intent was to strip "&nbsp;" entities instead.
    return raw.Replace(" ", "")
              .Replace(",", "，")
              .Replace("\r", "")
              .Replace("\n", "")
              .Trim();
}
html:
csv:
url:http://www.cnblogs.com/dreamman/p/5343924.html
相关文章推荐
- C#将html转pdf
- 动态HTML和W3C文档对象模型
- 在HTML中URL、src、href分别代表什么?如何使用?
- HTML学习笔记(七)<div>与<span>
- HTML简介
- HTML <td> 标签的 nowrap 属性
- HTML中cellSpace 和 cellPadding
- html学习笔记3
- HTML基础语法(2)
- HTML 作业
- PHP代码为什么不能直接保存HTML文件——>PHP生成静态页面教程
- html基础
- html中dispaly 与 visibility的区别
- html 输入框显示“小叉叉”的清空图标
- 函数放到onload里面,在html里面执行函数会报错-----作用域和闭包相关问题
- html canvas 简单体验
- HTML学习笔记(六)列表
- xhtml与html的区别
- htm、html、shtml区别
- [Web开发] 在HTML代码里面如何判断IE版本