您的位置:首页 > 编程语言 > ASP

Asp.net对比两网页不同生成修改痕迹----用于编辑器修改内容的历史版本

2013-06-09 13:58 585 查看
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;

namespace Util
{
public class HtmlDiff
{

// Buffer that receives the generated diff markup.
private StringBuilder content;

// The two HTML documents handed to the constructor.
private string oldText;
private string newText;

// Word lists derived from oldText / newText by SplitInputsToWords.
private string[] oldWords;
private string[] newWords;

// For every distinct word in newWords, the positions at which it occurs (built by IndexNewWords).
private Dictionary<string, List<int>> wordIndices;

// Regex patterns for opening inline-formatting tags that InsertTag handles specially.
private string[] specialCaseOpeningTags = new[]
{
    "<strong[\\>\\s]+",
    "<b[\\>\\s]+",
    "<i[\\>\\s]+",
    "<big[\\>\\s]+",
    "<small[\\>\\s]+",
    "<u[\\>\\s]+",
    "<sub[\\>\\s]+",
    "<sup[\\>\\s]+",
    "<strike[\\>\\s]+",
    "<s[\\>\\s]+"
};

// Literal closing tags that correspond to the opening patterns above.
private string[] specialCaseClosingTags = new[]
{
    "</strong>",
    "</b>",
    "</i>",
    "</big>",
    "</small>",
    "</u>",
    "</sub>",
    "</sup>",
    "</strike>",
    "</s>"
};

//private string[] imgOpenTag = new string[] { "<img[\\>\\s]+" };

/// <summary>
/// Initializes a new instance of the <see cref="HtmlDiff"/> class.
/// </summary>
/// <param name="oldText">The old HTML text; <c>null</c> is treated as an empty document.</param>
/// <param name="newText">The new HTML text; <c>null</c> is treated as an empty document.</param>
public HtmlDiff(string oldText, string newText)
{
    // Build() splits both inputs with Regex.Split, which throws on null. Normalising here
    // lets a missing version (e.g. the first revision has no predecessor) diff as "all inserted"
    // or "all deleted" instead of failing later with an unrelated-looking exception.
    this.oldText = oldText ?? string.Empty;
    this.newText = newText ?? string.Empty;

    this.content = new StringBuilder();
}

/// <summary>
/// Builds the HTML diff output: splits both inputs into words, indexes the new words,
/// computes the list of edit operations and renders each of them into the output buffer.
/// </summary>
/// <returns>HTML diff markup</returns>
public string Build()
{
    // The buffer is created once in the constructor and every operation appends to it,
    // so start from an empty buffer; otherwise a second call to Build() on the same
    // instance would return the markup twice.
    this.content.Length = 0;

    this.SplitInputsToWords();

    this.IndexNewWords();

    List<Operation> operations = this.Operations();

    foreach (Operation item in operations)
    {
        this.PerformOperation(item);
    }

    return this.content.ToString();
}

/// <summary>
/// Rebuilds <c>wordIndices</c>: for each distinct word of <c>newWords</c>, the ascending
/// list of positions where that word appears.
/// </summary>
private void IndexNewWords()
{
    this.wordIndices = new Dictionary<string, List<int>>();
    Dictionary<string, List<int>> index = this.wordIndices;

    int position = 0;
    foreach (string word in this.newWords)
    {
        List<int> positions;
        if (!index.TryGetValue(word, out positions))
        {
            // First occurrence of this word: start its position list.
            positions = new List<int>();
            index[word] = positions;
        }

        positions.Add(position);
        position++;
    }
}

/// <summary>
/// Tokenises both input texts and stores the results in <c>oldWords</c> and <c>newWords</c>.
/// </summary>
private void SplitInputsToWords()
{
    string[] oldCharacters = this.Explode(this.oldText);
    this.oldWords = this.ConvertHtmlToListOfWords(oldCharacters);

    string[] newCharacters = this.Explode(this.newText);
    this.newWords = this.ConvertHtmlToListOfWords(newCharacters);
}

/// <summary>
/// Groups a sequence of character strings into the "words" that the diff compares:
/// every complete tag (from "&lt;" up to and including the next "&gt;") is one word, every run
/// of consecutive whitespace is one word, and every other character is a word on its own.
/// </summary>
/// <param name="characterString">The input text split into per-character strings.</param>
/// <returns>The words in input order.</returns>
private string[] ConvertHtmlToListOfWords(string[] characterString)
{
    Mode mode = Mode.character;
    string pending = String.Empty;
    List<string> words = new List<string>();

    foreach (string character in characterString)
    {
        if (mode == Mode.tag)
        {
            if (this.IsEndOfTag(character))
            {
                // The character that closes a tag is always ">", which is never whitespace,
                // so the state after a tag is always Mode.character.
                words.Add(pending + ">");
                pending = String.Empty;
                mode = Mode.character;
            }
            else
            {
                pending += character;
            }

            continue;
        }

        // From here on the state is Mode.character or Mode.whitespace.
        bool startsTag = this.IsStartOfTag(character);
        bool isSpace = !startsTag && this.IsWhiteSpace(character);

        if (isSpace && mode == Mode.whitespace)
        {
            // Extend the current whitespace run; it stays a single word.
            pending += character;
            continue;
        }

        // Every other transition finishes the word collected so far.
        if (pending != String.Empty)
        {
            words.Add(pending);
        }

        if (startsTag)
        {
            pending = "<";
            mode = Mode.tag;
        }
        else
        {
            pending = character;
            mode = isSpace ? Mode.whitespace : Mode.character;
        }
    }

    // Flush whatever was still being collected (this includes an unterminated tag).
    if (pending != String.Empty)
    {
        words.Add(pending);
    }

    return words.ToArray();
}

/// <summary>Returns true when <paramref name="val"/> is exactly the string "&lt;".</summary>
private bool IsStartOfTag(string val)
{
    const string tagOpener = "<";
    return tagOpener.Equals(val);
}

/// <summary>Returns true when <paramref name="val"/> is exactly the string "&gt;".</summary>
private bool IsEndOfTag(string val)
{
    const string tagCloser = ">";
    return tagCloser.Equals(val);
}

/// <summary>
/// Returns true when <paramref name="value"/> contains at least one character matched by
/// the regex class <c>\s</c>.
/// </summary>
private bool IsWhiteSpace(string value)
{
    Regex anyWhitespace = new Regex("\\s", RegexOptions.IgnoreCase);
    return anyWhitespace.IsMatch(value);
}

/// <summary>
/// Splits text into the per-character strings consumed by ConvertHtmlToListOfWords.
/// </summary>
/// <param name="value">The text to split; <c>null</c> or empty yields an empty array.</param>
/// <returns>
/// One string per UTF-16 code unit, except that a valid surrogate pair is returned as a single
/// two-character string so that a character outside the BMP can never be cut in half by the diff.
/// </returns>
private string[] Explode(string value)
{
    // Splitting on an empty regex pattern also produced an empty first and last element and
    // threw on null; walking the string directly avoids both.
    if (string.IsNullOrEmpty(value))
    {
        return new string[0];
    }

    List<string> parts = new List<string>(value.Length);

    for (int i = 0; i < value.Length; i++)
    {
        bool isPair = char.IsHighSurrogate(value[i])
            && i + 1 < value.Length
            && char.IsLowSurrogate(value[i + 1]);

        if (isPair)
        {
            parts.Add(value.Substring(i, 2));
            i++; // the low surrogate has been consumed together with the high one
        }
        else
        {
            parts.Add(value[i].ToString());
        }
    }

    return parts.ToArray();
}

/// <summary>
/// Renders one edit operation into the output buffer by dispatching on its action.
/// </summary>
/// <param name="operation">The operation to render.</param>
private void PerformOperation(Operation operation)
{
    Action kind = operation.Action;

    if (kind == Action.equal)
    {
        this.ProcessEqualOperation(operation);
    }
    else if (kind == Action.delete)
    {
        this.ProcessDeleteOperation(operation, "diffdel");
    }
    else if (kind == Action.insert)
    {
        this.ProcessInsertOperation(operation, "diffins");
    }
    else if (kind == Action.replace)
    {
        this.ProcessReplaceOperation(operation);
    }

    // Action.none, and any value not listed above, produces no output.
}

/// <summary>
/// Renders a replacement as the deleted old words followed by the inserted new words,
/// both marked with the "diffmod" class.
/// </summary>
/// <param name="operation">The replace operation.</param>
private void ProcessReplaceOperation(Operation operation)
{
    const string modifiedClass = "diffmod";

    this.ProcessDeleteOperation(operation, modifiedClass);
    this.ProcessInsertOperation(operation, modifiedClass);
}

/// <summary>
/// Renders the new words in the range [StartInNew, EndInNew) as an insertion.
/// </summary>
/// <param name="operation">Operation whose new-side range is rendered.</param>
/// <param name="cssClass">CSS class passed on to InsertTag.</param>
private void ProcessInsertOperation(Operation operation, string cssClass)
{
    // Positions outside the array simply select nothing, so clamp the range to the bounds.
    int first = Math.Max(operation.StartInNew, 0);
    int limit = Math.Min(operation.EndInNew, this.newWords.Length);

    List<string> inserted = new List<string>();
    for (int i = first; i < limit; i++)
    {
        inserted.Add(this.newWords[i]);
    }

    this.InsertTag("ins", cssClass, inserted);
}

/// <summary>
/// Renders the old words in the range [StartInOld, EndInOld) as a deletion.
/// </summary>
/// <param name="operation">Operation whose old-side range is rendered.</param>
/// <param name="cssClass">CSS class passed on to InsertTag.</param>
private void ProcessDeleteOperation(Operation operation, string cssClass)
{
    // Positions outside the array simply select nothing, so clamp the range to the bounds.
    int first = Math.Max(operation.StartInOld, 0);
    int limit = Math.Min(operation.EndInOld, this.oldWords.Length);

    List<string> removed = new List<string>();
    for (int i = first; i < limit; i++)
    {
        removed.Add(this.oldWords[i]);
    }

    this.InsertTag("del", cssClass, removed);
}

/// <summary>
/// Copies the unchanged words, taken from the new text's range [StartInNew, EndInNew),
/// to the output without any wrapping.
/// </summary>
/// <param name="operation">The equal operation.</param>
private void ProcessEqualOperation(Operation operation)
{
    // Positions outside the array simply select nothing, so clamp the range to the bounds.
    int first = Math.Max(operation.StartInNew, 0);
    int limit = Math.Min(operation.EndInNew, this.newWords.Length);

    for (int i = first; i < limit; i++)
    {
        this.content.Append(this.newWords[i]);
    }
}

/// <summary>
/// Appends <paramref name="words"/> to "content", enclosing runs of non-tag words within the
/// specified tag (ins or del). With a twist: if the words contain tags, it creates multiple
/// ins or del elements so that ordinary tags are written outside of them. This handles cases
/// like (illustrative, class attributes omitted)
/// old: '<p>a</p>'
/// new: '<p>ab</p><p>c</p>'
/// diff result: '<p>a<ins>b</ins></p><p><ins>c</ins></p>'
/// this still doesn't guarantee valid HTML (hint: think about diffing a text containing ins or
/// del tags), but handles correctly more cases than the earlier version.
///
/// The list is consumed: words are removed from it as they are written or discarded.
///
/// P.S.: Spare a thought for people who write HTML browsers. They live in this ... every day.
/// </summary>
/// <param name="tag">Element name of the wrapper; the callers in this class pass "ins" or "del".</param>
/// <param name="cssClass">Value written into the wrapper's class attribute.</param>
/// <param name="words">The words to emit; modified in place.</param>
private void InsertTag(string tag, string cssClass, List<string> words)
{
while (true)
{
if (words.Count == 0)
{
break;
}

// Take (and remove from the list) the leading run of words that are not tags.
var nonTags = ExtractConsecutiveWords(words, x => !this.IsTag(x));

string specialCaseTagInjection = string.Empty;
bool specialCaseTagInjectionIsBefore = false;                       // whether the injected markup goes before (true) or after (false) the tags written below

// Earlier variant, kept for reference: wrap every word, tags included, in a single element.
//string text = this.WrapText(string.Join("", words.ToArray()), tag, cssClass);

//this.content.Append(text);

if (nonTags.Length != 0)
{
// Plain text: wrap the whole run in <tag class='cssClass'>...</tag>.
string text = this.WrapText(string.Join("", nonTags), tag, cssClass);

this.content.Append(text);
}
else
{
// The list starts with a tag. Check the special cases: inline-formatting tags and images.

// Opening formatting tag (matched case-insensitively against specialCaseOpeningTags).
if (this.specialCaseOpeningTags.FirstOrDefault(x => Regex.IsMatch(words[0], x, RegexOptions.IgnoreCase)) != null)
{
specialCaseTagInjection = "<ins class='mod'>";

// Disabled experiment: treat an image like a formatting tag and inject before it.
//if (imgOpenTag.FirstOrDefault(x => Regex.IsMatch(words[0], x)) != null)
//{
//    specialCaseTagInjectionIsBefore = true;
//}

// NOTE(review): this discards ALL remaining words, including the formatting tag itself
// and any text after it, so only "<ins class='mod'>" is written for the rest of this
// operation. A `tag == "del"` guard around this call is commented out in the original.
// Confirm that dropping the remaining content is intended.
words.Clear();
}
// Closing formatting tag (exact, case-sensitive comparison with specialCaseClosingTags).
else if (this.specialCaseClosingTags.Contains(words[0]))
{
specialCaseTagInjection = "</ins>";
specialCaseTagInjectionIsBefore = true;
// NOTE(review): as above, everything still in the list is discarded, so only "</ins>"
// is written; the same `tag == "del"` guard is commented out here. Confirm intent.
words.Clear();
}
else if (Regex.IsMatch(words[0], "<img[\\>\\s]+")) // image tag (case-sensitive match)
{
specialCaseTagInjectionIsBefore = true;
// Write the wrapper's opening tag followed by the image and any tags directly after it ...
specialCaseTagInjection = string.Format("<{0} class='{1}'>", tag, cssClass);
this.content.Append(specialCaseTagInjection + String.Join("", this.ExtractConsecutiveWords(words, x => this.IsTag(x))));
// ... and leave the wrapper's closing tag to be written by the "before" branch below.
specialCaseTagInjection = string.Format("</{0}>", tag);
// NOTE(review): words that follow the extracted tags are discarded here - confirm intent.
words.Clear();
}

}

// Finished when nothing is left to write and no injected markup is pending.
if (words.Count == 0 && specialCaseTagInjection.Length == 0)
{
break;
}

// Write the leading run of tags unwrapped (removing them from the list), with the injected
// markup, if any, placed before or after that run.
if (specialCaseTagInjectionIsBefore)
{
this.content.Append(specialCaseTagInjection + String.Join("", this.ExtractConsecutiveWords(words, x => this.IsTag(x))));
}
else
{
this.content.Append(String.Join("", this.ExtractConsecutiveWords(words, x => this.IsTag(x))) + specialCaseTagInjection);
}
}
}

/// <summary>
/// Encloses <paramref name="text"/> in an element named <paramref name="tagName"/> whose
/// class attribute (single-quoted) is <paramref name="cssClass"/>.
/// </summary>
/// <param name="text">Content placed between the opening and closing tag, written as-is.</param>
/// <param name="tagName">Element name used for both the opening and the closing tag.</param>
/// <param name="cssClass">Value of the class attribute.</param>
/// <returns>The markup <c>&lt;tagName class='cssClass'&gt;text&lt;/tagName&gt;</c>.</returns>
private string WrapText(string text, string tagName, string cssClass)
{
    string opening = "<" + tagName + " class='" + cssClass + "'>";
    string closing = "</" + tagName + ">";

    return opening + text + closing;
}

/// <summary>
/// Removes the longest leading run of words that satisfy <paramref name="condition"/> from
/// <paramref name="words"/> and returns that run.
/// </summary>
/// <param name="words">The list to take from; the returned words are removed from its front.</param>
/// <param name="condition">Predicate a word must satisfy to belong to the run.</param>
/// <returns>The removed words in order; empty when the first word fails the condition.</returns>
private string[] ExtractConsecutiveWords(List<string> words, Func<string, bool> condition)
{
    // Length of the matching prefix: stop at the first word that fails the condition.
    int runLength = 0;
    while (runLength < words.Count && condition(words[runLength]))
    {
        runLength++;
    }

    string[] taken = words.GetRange(0, runLength).ToArray();
    words.RemoveRange(0, runLength);

    return taken;
}

/// <summary>
/// Tells whether a word is a tag, for example &lt;img&gt; or &lt;b&gt;.
/// </summary>
/// <param name="item">The word to test.</param>
/// <returns>True when the word is recognised as an opening or a closing tag.</returns>
private bool IsTag(string item)
{
    if (this.IsOpeningTag(item))
    {
        return true;
    }

    return this.IsClosingTag(item);
}

/// <summary>
/// Returns true when the whole word, ignoring surrounding whitespace, is a single
/// "&lt;...&gt;" element with at least one character between the brackets. The pattern does
/// not exclude a leading "/", so closing tags match as well.
/// </summary>
private bool IsOpeningTag(string item)
{
    Regex singleTag = new Regex("^\\s*<[^>]+>\\s*$", RegexOptions.IgnoreCase);
    return singleTag.IsMatch(item);
}

/// <summary>
/// Returns true when the whole word, ignoring surrounding whitespace, is a single
/// "&lt;/...&gt;" element with at least one character after the slash.
/// </summary>
private bool IsClosingTag(string item)
{
    Regex singleClosingTag = new Regex("^\\s*</[^>]+>\\s*$", RegexOptions.IgnoreCase);
    return singleClosingTag.IsMatch(item);
}

/// <summary>
/// Turns the list of matching blocks into the ordered list of edit operations that
/// transforms the old words into the new words.
/// </summary>
/// <returns>
/// For each gap before a matching block a replace, insert or delete operation, and for each
/// non-empty block an equal operation, in document order.
/// </returns>
private List<Operation> Operations()
{
    List<Match> blocks = this.MatchingBlocks();

    // Zero-length sentinel at the end of both texts so that whatever follows the last real
    // match is handled by the same gap logic as everything else.
    blocks.Add(new Match(this.oldWords.Length, this.newWords.Length, 0));

    List<Operation> result = new List<Operation>();
    int oldCursor = 0;
    int newCursor = 0;

    foreach (Match block in blocks)
    {
        // A "gap" is a stretch of unmatched words between the cursor and the block start.
        bool gapInOld = oldCursor != block.StartInOld;
        bool gapInNew = newCursor != block.StartInNew;

        if (gapInOld || gapInNew)
        {
            Action gapAction;
            if (gapInOld && gapInNew)
            {
                gapAction = Action.replace;
            }
            else if (gapInNew)
            {
                gapAction = Action.insert;
            }
            else
            {
                gapAction = Action.delete;
            }

            result.Add(new Operation(
                gapAction,
                oldCursor,
                block.StartInOld,
                newCursor,
                block.StartInNew));
        }

        if (block.Size != 0)
        {
            result.Add(new Operation(
                Action.equal,
                block.StartInOld,
                block.EndInOld,
                block.StartInNew,
                block.EndInNew));
        }

        oldCursor = block.EndInOld;
        newCursor = block.EndInNew;
    }

    return result;
}

/// <summary>
/// Collects the matching blocks between the complete old and new word lists.
/// </summary>
/// <returns>The blocks in the order FindMatchingBlocks adds them.</returns>
private List<Match> MatchingBlocks()
{
    List<Match> found = new List<Match>();

    int oldLength = this.oldWords.Length;
    int newLength = this.newWords.Length;
    this.FindMatchingBlocks(0, oldLength, 0, newLength, found);

    return found;
}

/// <summary>
/// Recursively finds matching blocks inside the given old/new ranges: takes the best match
/// of the range, then searches the parts before and after it, adding the results to
/// <paramref name="matchingBlocks"/> in left-to-right order.
/// </summary>
/// <param name="startInOld">Inclusive start of the range in the old words.</param>
/// <param name="endInOld">Exclusive end of the range in the old words.</param>
/// <param name="startInNew">Inclusive start of the range in the new words.</param>
/// <param name="endInNew">Exclusive end of the range in the new words.</param>
/// <param name="matchingBlocks">Receives the blocks that are found.</param>
private void FindMatchingBlocks(int startInOld, int endInOld, int startInNew, int endInNew, List<Match> matchingBlocks)
{
    Match best = this.FindMatch(startInOld, endInOld, startInNew, endInNew);
    if (best == null)
    {
        return;
    }

    // Search to the left only when words remain before the match on both sides.
    bool wordsBefore = startInOld < best.StartInOld && startInNew < best.StartInNew;
    if (wordsBefore)
    {
        this.FindMatchingBlocks(startInOld, best.StartInOld, startInNew, best.StartInNew, matchingBlocks);
    }

    matchingBlocks.Add(best);

    // Likewise to the right.
    bool wordsAfter = best.EndInOld < endInOld && best.EndInNew < endInNew;
    if (wordsAfter)
    {
        this.FindMatchingBlocks(best.EndInOld, endInOld, best.EndInNew, endInNew, matchingBlocks);
    }
}

/// <summary>
/// Finds the longest run of consecutive words shared by the old range
/// [startInOld, endInOld) and the new range [startInNew, endInNew).
/// </summary>
/// <param name="startInOld">Inclusive start of the range in the old words.</param>
/// <param name="endInOld">Exclusive end of the range in the old words.</param>
/// <param name="startInNew">Inclusive start of the range in the new words.</param>
/// <param name="endInNew">Exclusive end of the range in the new words.</param>
/// <returns>
/// The longest common run (the first one encountered wins a tie), or null when the ranges
/// share no word.
/// </returns>
private Match FindMatch(int startInOld, int endInOld, int startInNew, int endInNew)
{
    int bestOld = startInOld;
    int bestNew = startInNew;
    int bestSize = 0;

    // previousRun[j] = length of the common run ending at the previous old word and new word j.
    Dictionary<int, int> previousRun = new Dictionary<int, int>();

    for (int oldPos = startInOld; oldPos < endInOld; oldPos++)
    {
        Dictionary<int, int> currentRun = new Dictionary<int, int>();

        List<int> occurrences;
        if (this.wordIndices.TryGetValue(this.oldWords[oldPos], out occurrences))
        {
            foreach (int newPos in occurrences)
            {
                if (newPos < startInNew)
                {
                    continue;
                }

                // Positions were recorded in ascending order, so nothing further can be in range.
                if (newPos >= endInNew)
                {
                    break;
                }

                // Extend the run that ended one word earlier on both sides, if there was one.
                int carried;
                int runLength = previousRun.TryGetValue(newPos - 1, out carried) ? carried + 1 : 1;
                currentRun[newPos] = runLength;

                if (runLength > bestSize)
                {
                    bestOld = oldPos - runLength + 1;
                    bestNew = newPos - runLength + 1;
                    bestSize = runLength;
                }
            }
        }

        // A word with no occurrence in the new text leaves currentRun empty, which breaks every run.
        previousRun = currentRun;
    }

    if (bestSize == 0)
    {
        return null;
    }

    return new Match(bestOld, bestNew, bestSize);
}

}

/// <summary>
/// A run of <see cref="Size"/> consecutive words found in both texts, starting at
/// <see cref="StartInOld"/> in the old words and at <see cref="StartInNew"/> in the new words.
/// </summary>
public class Match
{
    /// <summary>Index of the first matching word in the old words.</summary>
    public int StartInOld { get; set; }

    /// <summary>Index of the first matching word in the new words.</summary>
    public int StartInNew { get; set; }

    /// <summary>Number of words in the run.</summary>
    public int Size { get; set; }

    public Match(int startInOld, int startInNew, int size)
    {
        StartInOld = startInOld;
        StartInNew = startInNew;
        Size = size;
    }

    /// <summary>Index just past the last matching word in the old words.</summary>
    public int EndInOld
    {
        get { return StartInOld + Size; }
    }

    /// <summary>Index just past the last matching word in the new words.</summary>
    public int EndInNew
    {
        get { return StartInNew + Size; }
    }
}

/// <summary>
/// One step of the diff: an action together with the word range it covers in the old text
/// and in the new text. Each range runs from its start index up to, but not including, its
/// end index.
/// </summary>
public class Operation
{
    public Operation(Action action, int startInOld, int endInOld, int startInNew, int endInNew)
    {
        Action = action;

        StartInOld = startInOld;
        EndInOld = endInOld;

        StartInNew = startInNew;
        EndInNew = endInNew;
    }

    /// <summary>What to do with the covered words.</summary>
    public Action Action { get; set; }

    /// <summary>First covered index in the old words.</summary>
    public int StartInOld { get; set; }

    /// <summary>Index just past the last covered old word.</summary>
    public int EndInOld { get; set; }

    /// <summary>First covered index in the new words.</summary>
    public int StartInNew { get; set; }

    /// <summary>Index just past the last covered new word.</summary>
    public int EndInNew { get; set; }
}

/// <summary>
/// State of the tokenizer in HtmlDiff.ConvertHtmlToListOfWords while it walks the input
/// character by character.
/// </summary>
public enum Mode
{
// Initial state, and the state after an ordinary character or a completed tag.
character,
// Inside a tag: a "<" has been read and the closing ">" has not been seen yet.
tag,
// Inside a run of whitespace characters, which is collected into a single word.
whitespace,
}

/// <summary>
/// Kind of edit carried by an Operation; HtmlDiff.PerformOperation dispatches on it.
/// </summary>
public enum Action
{
// The words are the same in both texts; they are written to the output unchanged.
equal,
// Words present only in the old text; rendered inside a del element (class "diffdel").
delete,
// Words present only in the new text; rendered inside an ins element (class "diffins").
insert,
// No gap before a matching block; no operation is created for it and nothing is rendered.
none,
// Old words replaced by new words; rendered as a delete followed by an insert (class "diffmod").
replace
}
}


使用方法:

HtmlDiff html = new HtmlDiff(旧版本, 新版本); // 旧版本、新版本均为 string 类型的 HTML 内容
string 比对后字符 = html.Build();


                                            
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: