您的位置:首页 > 编程语言 > C#

一段多编码兼容的 C# 网页读取关键代码

2011-07-03 18:06 483 查看
/// <summary>
/// Downloads a web page and returns its content as text.
/// </summary>
/// <param name="url">URL of the web page.</param>
/// <param name="useProxy">True to use the proxy server configured for the program (if any); false to bypass it.</param>
/// <param name="streamEnc">Text encoding to decode the page with. When null, the encoding is looked up in the Content-Type response header and then (if <paramref name="readMetaData"/> is true) in the page's meta tags; when neither yields a usable encoding, Encoding.Default is used.</param>
/// <param name="readMetaData">Whether to scan the page's meta tags to determine the text encoding. This may slightly increase processing time.</param>
/// <returns>The content of the web page, or null when any exception occurred (the exception is handed to Debugger.LogException).</returns>
public static string GetPageContent(
    string url,
    bool useProxy,
    Encoding streamEnc,
    bool readMetaData)
{
    WebResponse rsp = null;
    Stream st = null;
    try
    {
        // Create the request.
        WebRequest wr = WebRequest.Create(url);

        // Route through the configured proxy only when both the global setting and the caller ask for it.
        if (NetConfig.UseProxy && useProxy)
        {
            WebProxy wp = new WebProxy(NetConfig.ProxyServer, NetConfig.ProxyPort);
            wp.BypassProxyOnLocal = true;
            wp.UseDefaultCredentials = false;
            if (NetConfig.UseProxyCredential)
            {
                wp.Credentials = new NetworkCredential(NetConfig.ProxyUser, NetConfig.ProxyPass);
            }
            wr.Proxy = wp;
        }

        // Get the response.
        rsp = wr.GetResponse();
        st = rsp.GetResponseStream();

        // The caller fixed the encoding: no detection needed, decode straight from the stream.
        if (streamEnc != null)
        {
            using (StreamReader reader = new StreamReader(st, streamEnc))
            {
                return reader.ReadToEnd();
            }
        }

        // Buffer the raw bytes first, because the encoding may only be discoverable
        // from the page body itself.
        byte[] cnt = ReadStreamFully(st);
        st.Close();
        st = null;

        // 1) Try the charset parameter of the Content-Type response header.
        Encoding enc = LookupEncodingOrNull(
            ParseCharsetFromContentType(rsp.Headers[HttpResponseHeader.ContentType]));

        // 2) Try the meta tags of the page.
        if (enc == null && readMetaData)
        {
            // Provisional decode, only used to locate the meta tag (and returned as-is
            // when the declared charset turns out to be the default encoding).
            string pgCnt = Encoding.Default.GetString(cnt);
            string metaCharset = ParseCharsetFromMetaTags(pgCnt);
            if (metaCharset.Length > 0)
            {
                if (metaCharset.Equals(Encoding.Default.WebName, StringComparison.OrdinalIgnoreCase))
                {
                    // Already decoded with exactly this encoding; avoid decoding twice.
                    return pgCnt;
                }
                enc = LookupEncodingOrNull(metaCharset);
            }
        }

        // 3) Fall back to the system default.
        if (enc == null)
        {
            enc = Encoding.Default;
        }
        return enc.GetString(cnt);
    }
    catch (Exception ex)
    {
        // Best-effort API: every failure is logged and reported to the caller as null.
        Debugger.LogException(ex);
        return null;
    }
    finally
    {
        if (st != null)
        {
            st.Close();
        }
        if (rsp != null)
        {
            rsp.Close();
        }
    }
}

// Matches both <meta http-equiv="Content-Type" content="text/html; charset=xxx"> and the
// HTML5 short form <meta charset="xxx">, with any letter case and either quote style.
private static readonly Regex MetaCharsetRegex = new Regex(
    @"<meta\b[^<>]*?\bcharset\s*=\s*[""']?\s*(?<charset>[A-Za-z0-9_.:\-]+)",
    RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

// Reads the stream to its end and returns everything that was read.
private static byte[] ReadStreamFully(Stream source)
{
    using (MemoryStream ms = new MemoryStream())
    {
        byte[] buffer = new byte[8192];
        int read;
        while ((read = source.Read(buffer, 0, buffer.Length)) > 0)
        {
            ms.Write(buffer, 0, read);
        }
        return ms.ToArray();
    }
}

// Extracts the charset parameter value from a Content-Type header value; returns "" when absent.
private static string ParseCharsetFromContentType(string contentType)
{
    const string Key = "charset=";
    if (string.IsNullOrEmpty(contentType))
    {
        return "";
    }
    int start = contentType.IndexOf(Key, StringComparison.OrdinalIgnoreCase);
    if (start < 0)
    {
        return "";
    }
    string value = contentType.Substring(start + Key.Length);
    // Other parameters may follow (e.g. "charset=utf-8; boundary=x"): cut at the separator.
    int end = value.IndexOf(';');
    if (end >= 0)
    {
        value = value.Substring(0, end);
    }
    // The value may be a quoted string (charset="utf-8").
    return value.Trim().Trim('"', '\'').Trim();
}

// Extracts the charset declared in the first matching meta tag of the page; returns "" when none is found.
private static string ParseCharsetFromMetaTags(string html)
{
    Match mc = MetaCharsetRegex.Match(html);
    return mc.Success ? mc.Groups["charset"].Value : "";
}

// Resolves an encoding by name; returns null when the name is empty, unknown or unsupported.
private static Encoding LookupEncodingOrNull(string name)
{
    if (string.IsNullOrEmpty(name))
    {
        return null;
    }
    try
    {
        return Encoding.GetEncoding(name);
    }
    catch (ArgumentException)
    {
        // Not a valid/known code page name: let the caller try the next source.
        return null;
    }
    catch (NotSupportedException)
    {
        // Code page not supported on this platform: let the caller try the next source.
        return null;
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: