您的位置:首页 > 理论基础 > 计算机网络

Get HTML Content Using HttpDownloader

2011-11-27 19:13 525 查看
http://stackoverflow.com/questions/2700638/characters-in-string-changed-after-downloading-html-from-the-internet



Issue

Using the following code, I can download the HTML of a file from the internet:
// Create the client used to fetch the page.
WebClient wc = new WebClient();

// ....

// Download the page body as a string; this result is where the garbled
// characters described below show up.
string downloadedFile = wc.DownloadString("http://www.myurl.com/");


However, sometimes the file contains "interesting" characters that come out garbled after downloading: é turns into Ã©, ← turns into â†, and フシギダネ turns into フシギダãƒ.

Resolution

Here's a wrapper download class that supports gzip and deflate compression and checks both the Content-Type charset header and the HTML meta tags in order to decode the page correctly.

Instantiate the class and call GetPage(), which returns the decoded HTML as a string.


/// <summary>
/// Downloads a page over HTTP and decodes it into a string.
/// The body is decompressed when the response declares gzip or deflate content encoding,
/// decoded with the charset taken from the response header, and decoded again when an
/// HTML meta tag declares a different charset.
/// </summary>
public class HttpDownloader
{
    // Pulls the charset parameter out of a Content-Type header value. The value stops at the
    // next ';' so that parameters following the charset are not swallowed into the name.
    private static readonly Regex HeaderCharsetRegex = new Regex(
        @";\s*charset\s*=\s*(?<charset>[^;]*)",
        RegexOptions.IgnoreCase);

    // Finds a charset declared inside a single <meta ...> tag. "[^>]" keeps the match inside
    // that tag, and the optional quote accepts both charset=utf-8 and charset="utf-8".
    private static readonly Regex MetaCharsetRegex = new Regex(
        @"<meta\s+[^>]*?charset\s*=\s*[""']?\s*(?<charset>[A-Za-z0-9_-]+)",
        RegexOptions.IgnoreCase);

    private readonly string _referer;
    private readonly string _userAgent;

    /// <summary>
    /// Encoding used for the first decoding pass. Starts as ISO-8859-1 and is replaced by the
    /// charset found in the response header, if any. A charset found only in a meta tag is
    /// used for decoding but is not stored here.
    /// </summary>
    public Encoding Encoding { get; set; }

    /// <summary>Headers of the most recent response; assigned by <see cref="GetPage"/>.</summary>
    public WebHeaderCollection Headers { get; set; }

    /// <summary>
    /// Address to download. <see cref="GetPage"/> replaces it with the response's ResponseUri.
    /// </summary>
    public Uri Url { get; set; }

    /// <summary>Creates a downloader for the given address.</summary>
    /// <param name="url">Absolute address of the page; parsed into a <see cref="Uri"/> immediately.</param>
    /// <param name="referer">Referer header value to send; skipped when null or empty.</param>
    /// <param name="userAgent">User-Agent header value to send; skipped when null or empty.</param>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    /// <exception cref="UriFormatException"><paramref name="url"/> is not a valid URI.</exception>
    public HttpDownloader(string url, string referer, string userAgent)
    {
        Encoding = Encoding.GetEncoding("ISO-8859-1");
        Url = new Uri(url); // verify the uri
        _userAgent = userAgent;
        _referer = referer;
    }

    /// <summary>
    /// Requests <see cref="Url"/> and returns the decoded, trimmed response body.
    /// Also updates <see cref="Headers"/> and <see cref="Url"/> from the response.
    /// </summary>
    /// <returns>The response body as text.</returns>
    public string GetPage()
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        if (!string.IsNullOrEmpty(_referer))
        {
            request.Referer = _referer;
        }
        if (!string.IsNullOrEmpty(_userAgent))
        {
            request.UserAgent = _userAgent;
        }

        // Compression is requested explicitly; ProcessContent undoes it by hand.
        request.Headers.Add(HttpRequestHeader.AcceptEncoding, "gzip,deflate");

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            Headers = response.Headers;
            Url = response.ResponseUri;
            return ProcessContent(response);
        }
    }

    /// <summary>
    /// Reads the whole response body into memory, decompressing it when needed, and decodes it.
    /// </summary>
    /// <param name="response">The response whose body is read.</param>
    /// <returns>The decoded, trimmed body.</returns>
    private string ProcessContent(HttpWebResponse response)
    {
        SetEncodingFromHeader(response);

        // The raw bytes are buffered so they can be decoded a second time if a meta tag
        // names a different charset than the one used for the first pass.
        MemoryStream memStream = new MemoryStream();
        using (Stream body = OpenBodyStream(response))
        {
            byte[] buffer = new byte[0x1000];
            int bytesRead;
            while ((bytesRead = body.Read(buffer, 0, buffer.Length)) > 0)
            {
                memStream.Write(buffer, 0, bytesRead);
            }
        }

        string html;
        memStream.Position = 0;
        using (StreamReader r = new StreamReader(memStream, Encoding))
        {
            html = r.ReadToEnd().Trim();
            html = CheckMetaCharSetAndReEncode(memStream, html);
        }

        return html;
    }

    /// <summary>
    /// Returns the response stream, wrapped in a decompressor when the Content-Encoding
    /// header mentions gzip or deflate.
    /// </summary>
    private static Stream OpenBodyStream(HttpWebResponse response)
    {
        Stream raw = response.GetResponseStream();
        string contentEncoding = response.ContentEncoding ?? string.Empty;

        // Ordinal, case-insensitive search: lower-casing with the current culture would turn
        // "GZIP" into "gzıp" under Turkish culture rules and the token would not be found.
        if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            return new GZipStream(raw, CompressionMode.Decompress);
        }
        if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            return new DeflateStream(raw, CompressionMode.Decompress);
        }
        return raw;
    }

    /// <summary>
    /// Sets <see cref="Encoding"/> from the response's CharacterSet, or, when that is empty,
    /// from the charset parameter of its ContentType. Leaves <see cref="Encoding"/> unchanged
    /// when no charset is found or the name is not a known encoding.
    /// </summary>
    /// <param name="response">The response whose headers are inspected.</param>
    private void SetEncodingFromHeader(HttpWebResponse response)
    {
        string charset = response.CharacterSet;
        if (string.IsNullOrEmpty(charset))
        {
            Match m = HeaderCharsetRegex.Match(response.ContentType ?? string.Empty);
            if (m.Success)
            {
                // Strip surrounding whitespace first, then optional quotes.
                charset = m.Groups["charset"].Value.Trim().Trim('\'', '"');
            }
        }

        if (string.IsNullOrEmpty(charset))
        {
            return;
        }

        try
        {
            Encoding = Encoding.GetEncoding(charset);
        }
        catch (ArgumentException)
        {
            // Unknown charset name: deliberately keep the current encoding (best effort).
        }
    }

    /// <summary>
    /// Looks for a charset declared in an HTML meta tag and, when it differs from
    /// <see cref="Encoding"/>, decodes the buffered bytes again with that charset.
    /// </summary>
    /// <param name="memStream">Seekable stream holding the raw body bytes; closed if a re-decode happens.</param>
    /// <param name="html">The body as decoded with <see cref="Encoding"/>.</param>
    /// <returns>The re-decoded, trimmed text, or <paramref name="html"/> unchanged.</returns>
    private string CheckMetaCharSetAndReEncode(Stream memStream, string html)
    {
        Match m = MetaCharsetRegex.Match(html);
        if (!m.Success)
        {
            return html;
        }

        string charset = m.Groups["charset"].Value;

        // "unicode"/"utf-16" declarations are decoded as UTF-8 instead.
        // NOTE(review): presumably because a meta tag that was readable after a byte-oriented
        // first pass cannot really belong to a UTF-16 document - confirm.
        if (string.Equals(charset, "unicode", StringComparison.OrdinalIgnoreCase) ||
            string.Equals(charset, "utf-16", StringComparison.OrdinalIgnoreCase))
        {
            charset = "utf-8";
        }

        try
        {
            Encoding metaEncoding = Encoding.GetEncoding(charset);

            // Value comparison: two Encoding objects for the same charset need not be the
            // same instance.
            if (!metaEncoding.Equals(Encoding))
            {
                memStream.Position = 0L;
                using (StreamReader recodeReader = new StreamReader(memStream, metaEncoding))
                {
                    html = recodeReader.ReadToEnd().Trim();
                }
            }
        }
        catch (ArgumentException)
        {
            // Unknown charset name in the meta tag: keep the text from the first pass.
        }

        return html;
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐