C#获取网页内容,解决大部分乱码问题
2015-06-26 10:31
633 查看
思路:根据响应头 Content-Type 中声明的 charset,对抓取到的内容进行解码,从而解决乱码问题
/// <summary>
/// Downloads the page at <paramref name="url"/> with an HTTP GET and decodes the
/// body using the charset announced in the response's Content-Type header.
/// </summary>
/// <param name="url">Absolute HTTP/HTTPS URL of the page to fetch.</param>
/// <returns>
/// The decoded response body. UTF-8 is used when the header is missing, carries no
/// charset, or names a charset this runtime does not know.
/// </returns>
/// <exception cref="System.Net.WebException">The request failed or timed out (30 s).</exception>
public static string GetHtml(string url)
{
    HttpWebRequest webRequest = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
    webRequest.Timeout = 30000;
    webRequest.Method = "GET";
    webRequest.UserAgent = "Mozilla/4.0";
    // Let the framework send Accept-Encoding and transparently undo gzip AND deflate.
    // The hand-written header advertised deflate but only gzip was ever decompressed.
    webRequest.AutomaticDecompression =
        System.Net.DecompressionMethods.GZip | System.Net.DecompressionMethods.Deflate;

    // Dispose the response so the underlying connection is released.
    using (HttpWebResponse webResponse = (System.Net.HttpWebResponse)webRequest.GetResponse())
    {
        Encoding pageEncoding = Encoding.UTF8;

        // Work out the target site's encoding from the Content-Type header.
        // The header may be absent, so guard before handing it to the regex.
        string contentType = webResponse.Headers["Content-Type"];
        if (!string.IsNullOrEmpty(contentType))
        {
            Match charsetMatch = Regex.Match(
                contentType,
                "charset\\s*=\\s*[\\W]?\\s*([\\w-]+)",
                RegexOptions.IgnoreCase);
            if (charsetMatch.Success)
            {
                try
                {
                    pageEncoding = Encoding.GetEncoding(charsetMatch.Groups[1].Value.Trim());
                }
                catch (System.ArgumentException)
                {
                    // Unknown or unsupported charset label: keep the UTF-8 fallback
                    // instead of failing the whole download.
                }
            }
        }

        // The same decoding path is used whether or not the body was compressed,
        // so the announced charset is always honoured.
        using (System.IO.Stream streamReceive = webResponse.GetResponseStream())
        using (StreamReader sr = new System.IO.StreamReader(streamReceive, pageEncoding))
        {
            return sr.ReadToEnd();
        }
    }
}
/// <summary>
/// Downloads the page at <paramref name="url"/> with an HTTP GET and decodes the
/// body using the charset announced in the response's Content-Type header.
/// </summary>
/// <param name="url">Absolute HTTP/HTTPS URL of the page to fetch.</param>
/// <returns>
/// The decoded response body. UTF-8 is used when the header is missing, carries no
/// charset, or names a charset this runtime does not know.
/// </returns>
/// <exception cref="System.Net.WebException">The request failed or timed out (30 s).</exception>
public static string GetHtml(string url)
{
    HttpWebRequest webRequest = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
    webRequest.Timeout = 30000;
    webRequest.Method = "GET";
    webRequest.UserAgent = "Mozilla/4.0";
    // Let the framework send Accept-Encoding and transparently undo gzip AND deflate.
    // The hand-written header advertised deflate but only gzip was ever decompressed.
    webRequest.AutomaticDecompression =
        System.Net.DecompressionMethods.GZip | System.Net.DecompressionMethods.Deflate;

    // Dispose the response so the underlying connection is released.
    using (HttpWebResponse webResponse = (System.Net.HttpWebResponse)webRequest.GetResponse())
    {
        Encoding pageEncoding = Encoding.UTF8;

        // Work out the target site's encoding from the Content-Type header.
        // The header may be absent, so guard before handing it to the regex.
        string contentType = webResponse.Headers["Content-Type"];
        if (!string.IsNullOrEmpty(contentType))
        {
            Match charsetMatch = Regex.Match(
                contentType,
                "charset\\s*=\\s*[\\W]?\\s*([\\w-]+)",
                RegexOptions.IgnoreCase);
            if (charsetMatch.Success)
            {
                try
                {
                    pageEncoding = Encoding.GetEncoding(charsetMatch.Groups[1].Value.Trim());
                }
                catch (System.ArgumentException)
                {
                    // Unknown or unsupported charset label: keep the UTF-8 fallback
                    // instead of failing the whole download.
                }
            }
        }

        // The same decoding path is used whether or not the body was compressed,
        // so the announced charset is always honoured.
        using (System.IO.Stream streamReceive = webResponse.GetResponseStream())
        using (StreamReader sr = new System.IO.StreamReader(streamReceive, pageEncoding))
        {
            return sr.ReadToEnd();
        }
    }
}
相关文章推荐
- C#创建目录,文件写入消息不覆盖原有消息
- C#获取数组中最大最小值的方法
- C#窗口实现最小化到系统托盘
- c# 下实现ping 命令操作
- C#通过模板创建Word文件
- c#删除文件夹(目录)
- [转]C#开发系统服务时用的定时器组件
- C#中文件名或文件路径非法字符判断方法
- C# DataTable中查询指定字段名称的数据
- c#有关udp可靠传输(包传输数据包) 升级
- c# label的内容显示不全
- 基于c# 类、接口、结构的联系与区别详解
- C#学习笔记二(函数高级参数)
- C# WPF TextBox绑定数据的简单应用
- csharp: get Web.Services WebMethod
- C# 操作office
- WP8.1发送Post或Get请求顺带文件上传
- C# 函数式编程
- ArcGIS / C#开发 无法读取Excel(*.xlsx)文件
- C# 只允许运行一个实例