HttpWebRequest获取网页html源代码(并自动获取encoding)
2010-11-19 17:14
513 查看
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

namespace WikiPageCreater.Common
{
    /// <summary>
    /// Best-effort helpers for downloading a web page over HTTP and for
    /// discovering the character encoding the page declares.
    /// Neither public method throws on failure; each returns a fallback value instead.
    /// </summary>
    public class PageHelper
    {
        /// <summary>Timeout, in milliseconds, applied to each request.</summary>
        private const int RequestTimeoutMilliseconds = 20000;

        /// <summary>
        /// Responses whose declared Content-Length is at or above this size (1 MiB) are skipped.
        /// A response without a declared length reports -1 and is therefore still read.
        /// </summary>
        private const long MaxContentLength = 1024 * 1024;

        /// <summary>
        /// Finds the charset declared by a &lt;meta&gt; tag. Handles both
        /// <c>&lt;meta charset="utf-8"&gt;</c> and
        /// <c>&lt;meta http-equiv="Content-Type" content="text/html; charset=utf-8"&gt;</c>,
        /// with double, single or no quotes, in any letter case.
        /// </summary>
        private static readonly Regex MetaCharsetRegex = new Regex(
            @"<meta\b[^>]*?\bcharset\s*=\s*[""']?\s*(?<charset>[A-Za-z0-9_.:\-]+)",
            RegexOptions.IgnoreCase | RegexOptions.CultureInvariant | RegexOptions.Compiled);

        /// <summary>
        /// Determines the character encoding name of the page at <paramref name="url"/>.
        /// </summary>
        /// <param name="url">Absolute http/https URL of the page.</param>
        /// <returns>
        /// The charset declared in a &lt;meta&gt; tag of the page if there is one; otherwise the
        /// charset from the HTTP Content-Type header; otherwise (or when the page cannot be
        /// downloaded) <c>Encoding.Default.BodyName</c>. Never null or empty.
        /// </returns>
        public static string GetEncoding(string url)
        {
            string headerCharset;

            // Charset names are plain ASCII, so decoding the body as ASCII is enough to find
            // the declaration whatever the real encoding of the page is.
            string html = DownloadText(url, Encoding.ASCII, out headerCharset);
            if (html == null)
            {
                return Encoding.Default.BodyName;
            }

            // The in-page declaration is preferred over the HTTP header,
            // as in the original implementation.
            Match match = MetaCharsetRegex.Match(html);
            if (match.Success)
            {
                return match.Groups["charset"].Value;
            }

            if (headerCharset != null)
            {
                // Strip whitespace and stray quotes around the header value.
                string trimmed = headerCharset.Trim(' ', '\t', '"', '\'');
                if (trimmed.Length > 0)
                {
                    return trimmed;
                }
            }

            return Encoding.Default.BodyName;
        }

        /// <summary>
        /// Downloads the HTML source of the page at <paramref name="url"/>, decoding the
        /// response body with <paramref name="encoding"/>.
        /// </summary>
        /// <param name="url">Absolute http/https URL of the page.</param>
        /// <param name="encoding">Encoding used to decode the response body.</param>
        /// <returns>
        /// The page source, or <c>string.Empty</c> when the URL is invalid, the request fails,
        /// the status is not 200 OK (redirects are not followed), or the declared
        /// Content-Length is 1 MiB or more.
        /// </returns>
        public static string GetHtml(string url, Encoding encoding)
        {
            string ignoredCharset;
            string html = DownloadText(url, encoding, out ignoredCharset);
            return html ?? string.Empty;
        }

        /// <summary>
        /// Performs the GET request shared by <see cref="GetEncoding"/> and <see cref="GetHtml"/>.
        /// </summary>
        /// <param name="url">URL to request.</param>
        /// <param name="encoding">Encoding used to decode the (possibly gzip-compressed) body.</param>
        /// <param name="headerCharset">
        /// Receives the charset reported by the response's Content-Type header; null when the
        /// body could not be obtained.
        /// </param>
        /// <returns>The decoded body, or null when the page could not be obtained.</returns>
        private static string DownloadText(string url, Encoding encoding, out string headerCharset)
        {
            headerCharset = null;

            if (string.IsNullOrEmpty(url))
            {
                return null;
            }

            try
            {
                // Non-HTTP schemes (file:, ftp:) yield other request types; treat them as failures.
                HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
                if (request == null)
                {
                    return null;
                }

                request.Timeout = RequestTimeoutMilliseconds;
                request.AllowAutoRedirect = false;

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (response.StatusCode != HttpStatusCode.OK
                        || response.ContentLength >= MaxContentLength)
                    {
                        return null;
                    }

                    headerCharset = response.CharacterSet;

                    Stream body = response.GetResponseStream();
                    string contentEncoding = response.ContentEncoding;
                    if (contentEncoding != null
                        && contentEncoding.Equals("gzip", StringComparison.OrdinalIgnoreCase))
                    {
                        body = new GZipStream(body, CompressionMode.Decompress);
                    }

                    // Disposing the reader disposes the stream chain before the response is closed.
                    using (StreamReader reader = new StreamReader(body, encoding))
                    {
                        return reader.ReadToEnd();
                    }
                }
            }
            catch (Exception ex)
            {
                // Deliberately best-effort: callers get a fallback value instead of an
                // exception, but the failure is traced rather than silently discarded.
                Trace.TraceWarning("PageHelper: request to '{0}' failed: {1}", url, ex.Message);
                headerCharset = null;
                return null;
            }
        }
    }
}
相关文章推荐
- HttpWebRequest获取网页html源代码(并自动获取encoding)
- asp.net 利用HttpWebRequest自动获取网页编码并获取网页源代码
- asp.net 利用HttpWebRequest自动获取网页编码并获取网页源代码
- HttpWebRequest获取网页源代码时自动识别网页编码
- 用asp.net c# HttpWebRequest获取网页源代码
- C# 利用HttpWebRequest模拟登陆获取数据设置Accept-Encoding为gzip,deflate后返回的网页是乱码处理
- ASP.NET使用HttpWebRequest读取远程网页源代码
- c#利用WebClient和WebRequest获取网页源代码的比较
- c#利用WebClient和WebRequest获取网页源代码的比较
- HttpWebRequest获取百度的网页
- HttpWebRequest和Stream获取网页验证码图片
- c#利用WebClient和WebRequest获取网页源代码
- 解决HttpWebRequest和HtmlAgilityPack采集网页中文乱码问题
- 找不到好的方法,如何通过HttpWebRequest获取页面的Encoding。(转)
- 利用HttpWebRequest获取网页内容,由于Gzip压缩导致乱码的情况
- HttpWebRequest 下载网页Html代码 下载文件(Remote和FTP)Get方式
- HttpWebRequest自动登录网站并获取网站内容
- 京东价格监控软件开发技术探讨二:通过HttpWebRequest获取指定网页数据
- HttpWebRequest自动登录网站并获取网站内容
- C# HttpWebRequest 绝技 根据URL地址获取网页信息