您的位置:首页 > 编程语言 > C#

C#获取网页内容,解决大部分乱码问题

2015-06-26 10:31 633 查看
思路:根据响应头 Content-Type 中 charset 指定的编码来解码抓取到的内容,从而解决乱码问题。

/// <summary>
/// Downloads the resource at <paramref name="url"/> with an HTTP GET request and
/// returns the response body decoded as text.
/// </summary>
/// <remarks>
/// The text encoding is taken from the <c>charset</c> parameter of the response's
/// <c>Content-Type</c> header. When the header is missing, carries no charset, or
/// names an encoding this runtime does not know, UTF-8 is used instead.
/// gzip and deflate content encodings are decompressed transparently.
/// </remarks>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>The decoded response body.</returns>
/// <exception cref="System.Net.WebException">
/// The request timed out (30 seconds) or the server returned an error status.
/// </exception>
public static string GetHtml(string url)
{
    HttpWebRequest webRequest = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
    webRequest.Timeout = 30000;
    webRequest.Method = "GET";
    webRequest.UserAgent = "Mozilla/4.0";

    // Let the framework send Accept-Encoding and undo the compression itself.
    // The previous hand-rolled code advertised "gzip, deflate" but only ever
    // decompressed gzip, so a deflate response came back as garbage.
    webRequest.AutomaticDecompression =
        System.Net.DecompressionMethods.GZip | System.Net.DecompressionMethods.Deflate;

    // Dispose the response as well as the stream so the connection is released.
    using (HttpWebResponse webResponse = (System.Net.HttpWebResponse)webRequest.GetResponse())
    {
        Encoding encoding = DetectResponseEncoding(webResponse.Headers["Content-Type"]);

        using (System.IO.Stream streamReceive = webResponse.GetResponseStream())
        using (StreamReader sr = new System.IO.StreamReader(streamReceive, encoding))
        {
            return sr.ReadToEnd();
        }
    }
}

/// <summary>
/// Resolves the text encoding named by the <c>charset</c> parameter of a
/// <c>Content-Type</c> header value, falling back to UTF-8.
/// </summary>
/// <param name="contentType">Raw header value; may be null or empty.</param>
/// <returns>The matching encoding, or UTF-8 when none can be determined.</returns>
private static Encoding DetectResponseEncoding(string contentType)
{
    if (string.IsNullOrEmpty(contentType))
    {
        return Encoding.UTF8;
    }

    // Accepts forms such as: charset=utf-8, charset = "gb2312", CHARSET='big5'.
    Match match = Regex.Match(
        contentType,
        "charset\\s*=\\s*[\\W]?\\s*([\\w-]+)",
        RegexOptions.IgnoreCase);

    if (!match.Success)
    {
        return Encoding.UTF8;
    }

    try
    {
        return Encoding.GetEncoding(match.Groups[1].Value.Trim());
    }
    catch (ArgumentException)
    {
        // The server named a charset this runtime does not recognise;
        // a best-effort UTF-8 decode is more useful than failing the download.
        return Encoding.UTF8;
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: