您的位置:首页 > Web前端 > HTML

C# Html格式内容转Csv内容包括table(重点在rowspan和colspan合并),p,div元素

2016-04-01 10:04 1011 查看
Html格式内容转Csv内容,包括table(重点在rowspan和colspan合并),p,div元素,table不能包含嵌套功能。

/// <summary>
/// Html格式内容转Csv内容包括table(重点在rowspan和colspan合并),p,div元素
/// </summary>
/// <param name="hrml"></param>
/// <returns></returns>
private string HtmlToCsv(string hrml)
{
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(hrml);
StringBuilder sbLines = new StringBuilder();
HtmlAgilityPack.HtmlNodeCollection tList = doc.DocumentNode.SelectNodes("//table");
if (tList != null)
{
foreach (HtmlAgilityPack.HtmlNode table in tList)
{
sbLines.AppendLine("#flag_table#,");
HtmlAgilityPack.HtmlNodeCollection rows = table.SelectNodes("//tr");
if (rows != null)
{
int colCount = 0;
StringBuilder sbTable = new StringBuilder();
foreach (HtmlAgilityPack.HtmlNode td in rows[0].ChildNodes.Where(m => m.OriginalName.ToLower() == "td"))
{
HtmlAgilityPack.HtmlAttribute attr = td.Attributes["colspan"];
int colspan = (attr != null) ? int.Parse(attr.Value) : 1;
colCount = colCount + colspan;
}
int rowCount = rows.Count;

string[][] arr = new string[rowCount][];
for (int r = 0; r < rowCount; r++)
{
arr[r] = new string[colCount];
}

//填充区域
for (int r = 0; r < rowCount; r++)
{
HtmlAgilityPack.HtmlNode tr = rows[r];
List<HtmlAgilityPack.HtmlNode> cols = tr.ChildNodes.Where(m => m.OriginalName.ToLower() == "td").ToList();

int colspan = 0;
int rowspan = 0;
for (int c = 0; c < cols.Count; c++)
{
HtmlAgilityPack.HtmlAttribute cAttr = cols[c].Attributes["colspan"];
colspan = (cAttr != null) ? int.Parse(cAttr.Value) : 1;
HtmlAgilityPack.HtmlAttribute rAttr = cols[c].Attributes["rowspan"];
rowspan = (rAttr != null) ? int.Parse(rAttr.Value) : 1;
string text = cols[c].InnerText.Replace(" ", "").Replace(",", ",").Replace("\r", "").Replace("\n", "").Trim();

if (colspan == 1 && rowspan == 1)
{
continue;
}

bool isFirst = true;
int rFill = r + rowspan;
for (int ri = r; ri < rFill; ri++)
{
int cFill = c + colspan;
for (int ci = c; ci < cFill; ci++)
{
if (isFirst)
{
text = (text == string.Empty) ? " " : text;
arr[ri][ci] = text;
isFirst = false;
}
else
{
arr[ri][ci] = string.Empty;
}
}
}
}
}

//填充单元
for (int r = 0; r < rowCount; r++)
{
HtmlAgilityPack.HtmlNode tr = rows[r];
List<HtmlAgilityPack.HtmlNode> cols = tr.ChildNodes.Where(m => m.OriginalName.ToLower() == "td").ToList();
Queue<string> queue = new Queue<string>();
for (int c = 0; c < cols.Count; c++)
{
string text = cols[c].InnerText.Replace(" ", "").Replace(",", ",").Replace("\r", "").Replace("\n", "").Trim();
queue.Enqueue(text);
}
for (int c = 0; c < colCount; c++)
{
if (arr[r][c] == null)
{
string text = queue.Count > 0 ? queue.Dequeue() : string.Empty;
arr[r][c] = text;
}
else
{
if (arr[r][c] != string.Empty)
{
if (queue.Count > 0)
{
queue.Dequeue();
}
}
}
}
}

//组装成cvs格式内容
foreach (string[] cols in arr)
{
foreach (string col in cols)
{
sbLines.Append(col + ",");
}
sbLines.AppendLine(",");
}
table.RemoveAll();
}
}
}

HtmlAgilityPack.HtmlNodeCollection pList = doc.DocumentNode.SelectNodes("//p");
if (pList != null)
{
sbLines.AppendLine("#flag_text#,");
foreach (HtmlAgilityPack.HtmlNode p in pList)
{
string text = p.InnerText.Replace(" ", "").Replace(",", ",").Replace("\r", "").Replace("\n", "").Trim();
text = GetTextByHtml(text);
if (!string.IsNullOrWhiteSpace(text))
{
sbLines.Append(text + ",");
sbLines.AppendLine(",");
}
else
{
sbLines.AppendLine(",");
}
p.RemoveAll();
}
}

HtmlAgilityPack.HtmlNodeCollection dList = doc.DocumentNode.SelectNodes("//div");
if (pList != null)
{
sbLines.AppendLine("#flag_text#,");
foreach (HtmlAgilityPack.HtmlNode div in pList)
{
string text = div.InnerText.Replace(" ", "").Replace(",", ",").Replace("\r", "").Replace("\n", "").Trim();
text = GetTextByHtml(text);
if (!string.IsNullOrWhiteSpace(text))
{
sbLines.Append(text + ",");
sbLines.AppendLine(",");
}
else
{
sbLines.AppendLine(",");
}
//div.RemoveAll();
}
}
return sbLines.ToString();
}


html:



csv:



url:http://www.cnblogs.com/dreamman/p/5343924.html
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: