c#中过滤html的正则表达式
2006-10-13 00:00
746 查看
实现代码
/// <summary> /// 去除HTML标记 /// </summary> /// <param name=”NoHTML”>包括HTML的源码 </param> /// <returns>已经去除后的文字</returns> public static string NoHTML(string Htmlstring) { //删除脚本 Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase); //删除HTML Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"–>", "", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"<!–.*", "", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"(\d+);", "", RegexOptions.IgnoreCase); Htmlstring.Replace("<", ""); Htmlstring.Replace(">", ""); Htmlstring.Replace("\r\n", ""); Htmlstring = HttpContext.Current.Server.HtmlEncode(Htmlstring).Trim(); return Htmlstring; }
C#过滤Html标签及空格
public static string FilterHTML(string HTMLStr) { if (!string.IsNullOrEmpty(HTMLStr)) return System.Text.RegularExpressions.Regex.Replace(HTMLStr, "<[^>]*>| ", ""); else return ""; }
写一个静态方法移除HTML标签
#region /// <summary> /// 移除HTML标签 /// </summary> /// <param name="HTMLStr">HTMLStr</param> public static string ParseTags(string HTMLStr) { return System.Text.RegularExpressions.Regex.Replace(HTMLStr, "<[^>]*>", ""); } #endregion
取出文本中的图片地址
#region /// <summary> /// 取出文本中的图片地址 /// </summary> /// <param name="HTMLStr">HTMLStr</param> public static string GetImgUrl(string HTMLStr) { string str = string.Empty; string sPattern = @"^<img\s+[^>]*>"; Regex r = new Regex(@"<img\s+[^>]*\s*src\s*=\s*([']?)(?<url>\S+)'?[^>]*>", RegexOptions.Compiled); Match m = r.Match(HTMLStr.ToLower()); if (m.Success) str = m.Result("${url}"); return str; } #endregion
提取HTML代码中文字的C#函数
/// <summary> /// 提取HTML代码中文字的C#函数 /// </summary> /// <param name="strHtml">包括HTML的源码 </param> /// <returns>已经去除后的文字</returns> using System; using System.Text.RegularExpressions; public class StripHTMLTest { public static void Main() { string s = StripHTML( "<HTML><HEAD><TITLE>中国石龙信息平台</TITLE></HEAD><BODY>faddfs龙信息平台</BODY></HTML>"); Console.WriteLine(s); } public static string StripHTML(string strHtml) { string[]aryReg = { @"<script[^>]*?>.*?</script>", @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[" "'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>", @"([\r\n])[\s]+", @ "&(quot|#34);", @"&(amp|#38);", @"&(lt|#60);", @"&(gt|#62);", @ "&(nbsp|#160);", @"&(iexcl|#161);", @"&(cent|#162);", @"&(pound|#163);", @"&(copy|#169);", @"(\d+);", @"-->", @"<!--.*\n" }; string[]aryRep = { "", "", "", "\"", "&", "<", ">", " ", "\xa1", //chr(161), "\xa2", //chr(162), "\xa3", //chr(163), "\xa9", //chr(169), "", "\r\n", "" }; string newReg = aryReg[0]; string strOutput = strHtml; for (int i = 0; i < aryReg.Length; i++) { Regex regex = new Regex(aryReg[i], RegexOptions.IgnoreCase); strOutput = regex.Replace(strOutput, aryRep[i]); } strOutput.Replace("<", ""); strOutput.Replace(">", ""); strOutput.Replace("\r\n", ""); return strOutput; } }
TempContent 表示包含有html的字符串;
TempContent = System.Text.RegularExpressions.Regex.Replace(TempContent,"<[^>]+>","");至少一个
TempContent = System.Text.RegularExpressions.Regex.Replace(TempContent,"<[^>]*>","");任意个
您可能感兴趣的文章:
相关文章推荐
- C# 正则表达式过滤危险HTML
- c#中过滤html的正则表达式
- C# 正则表达式过滤危险HTML
- C#使用正则表达式过滤html标签
- C#中可以使用正则表达式来过滤html字符
- 在C#中使用正则表达式过滤html字符
- c#中过滤html的正则表达式
- C#中使用正则表达式来过滤html字符
- C#使用正则表达式过滤html标签
- C#中可以使用正则表达式来过滤html字符
- asp.net正则表达式提取网页网址、标题、图片实例以及过滤所有HTML标签实例
- php用正则表达式过滤html的超链接及提取链接
- java 正则表达式过滤掉html标签 过滤掉Html代码
- c#正则过滤html标记
- 用正则表达式过滤数据中的html标签
- C#正则表达式之过滤掉双引号...
- 使用正则表达式过滤html标签,属性,样式表,挂马脚本
- .net(C#)从html中提取中文字_正则表达式
- 正则表达式获取HTML标记中的内容(C#)
- C#正则表达式提取HTML中IMG标签的SRC地址