有关C# httpresponse 404 page not found error 的处理方案
2009-07-29 11:08
645 查看
需求分析:本人最近做一个项目,项目中需要从新闻的索引页(就是上面有很多链接的那种网页),获取新闻正文页源码,并将新闻正文页源码保存到本地数据库中。
但是由于网络稳定性的原因,总会出现 404 page not found 类型的error。(但是网页是确确实实存在的)。而且这种错误,往往是在程序运行一段时间后出现的,觉得很不可思议。我在网络上查这种问题的解决方案时,发现没有一种管用的。本人现在已经成功解决该问题,遂将自己的解决方案写下来和大家分享与探讨。
解决方案核心:一旦出现这种错误,程序中就递归调用下载函数本身。代码说明如下:
/// <summary>
/// Downloads the page at <paramref name="url"/>, trying again whenever the server
/// answers with HTTP 404.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="nRetryTimes">
/// Maximum number of download attempts. Zero or a negative value performs no attempt.
/// </param>
/// <returns>
/// The page source on success. An empty string when no attempt was allowed, when every
/// attempt ended in a 404, or when an attempt failed with any other error.
/// </returns>
public static string GetDataFromUrl(string url, int nRetryTimes)
{
    // A bounded loop replaces the original recursion: a negative budget can no longer
    // recurse without limit, and stack depth does not grow with the retry count.
    for (int attemptsLeft = nRetryTimes; attemptsLeft > 0; attemptsLeft--)
    {
        try
        {
            return GetDataFromUrl(url);
        }
        catch (System.Net.WebException webExc)
        {
            // Decide on the HTTP status code rather than on the exception message text,
            // which can contain "404" for unrelated reasons (for example inside the URL).
            bool notFound = false;
            System.Net.HttpWebResponse errorResponse = webExc.Response as System.Net.HttpWebResponse;
            if (errorResponse != null)
            {
                notFound = errorResponse.StatusCode == System.Net.HttpStatusCode.NotFound;
                // Release the connection held by the error response before retrying.
                errorResponse.Close();
            }

            if (!notFound)
            {
                // Same best-effort contract as before: non-404 failures yield an empty result.
                return string.Empty;
            }
        }
        catch (System.Exception)
        {
            // Any other failure is reported to the caller as an empty result, not retried.
            return string.Empty;
        }
    }

    return string.Empty;
}
其中nRetryTimes 代表出现这种错误后,函数递归调用自己的次数,也可以理解为递归终止的条件。GetDataFromUrl(string url)函数代码如下:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it to text.
/// </summary>
/// <remarks>
/// The body is first decoded with the charset announced in the HTTP response header.
/// If the decoded page declares a different charset in a meta tag, the buffered bytes
/// are decoded a second time with the declared charset.
/// </remarks>
/// <param name="url">Address of the page to download.</param>
/// <returns>The decoded page source.</returns>
/// <exception cref="System.Net.WebException">
/// Thrown by the request when the server reports an error or the request fails.
/// </exception>
public static string GetDataFromUrl(string url)
{
    HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);
    // Configure the HTTP request.
    request.AllowAutoRedirect = true;
    request.AllowWriteStreamBuffering = true;
    request.Referer = "";
    // NOTE(review): Timeout is in milliseconds, so this is roughly 2.8 hours — confirm it is intended.
    request.Timeout = 10000000;
    request.UserAgent = "";
    request.KeepAlive = false;//to avoid the error of time out

    byte[] content;
    string characterSet;
    // The using blocks release the response and its stream even when reading fails.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (MemoryStream mStream = new MemoryStream())
    {
        characterSet = response.CharacterSet;

        // Buffer the whole body so it can be decoded more than once.
        using (Stream receiveStream = response.GetResponseStream())
        {
            byte[] bf = new byte[4096];
            int count;
            while ((count = receiveStream.Read(bf, 0, bf.Length)) > 0)
            {
                mStream.Write(bf, 0, count);
            }
        }

        content = mStream.ToArray();
    }

    // Pick the encoding from the HTTP response header; a missing (null or empty)
    // or unrecognised charset falls back to the system default encoding.
    Encoding encode = Encoding.Default;
    if (!string.IsNullOrEmpty(characterSet))
    {
        // The original author maps ISO-8859-1 to gb2312; presumably the target sites are
        // Chinese pages served without an explicit charset — confirm.
        if (characterSet == "ISO-8859-1")
        {
            characterSet = "gb2312";
        }

        try
        {
            encode = Encoding.GetEncoding(characterSet);
        }
        catch (ArgumentException)
        {
            encode = Encoding.Default;
        }
    }

    // First pass: decode with the header-derived encoding.
    string str;
    using (StreamReader reader = new StreamReader(new MemoryStream(content), encode))
    {
        str = reader.ReadToEnd();
    }

    // Look for a charset declared by the page itself. The pattern accepts both
    // <meta ... content="text/html; charset=gb2312"> and <meta charset="utf-8">,
    // and stays inside a single tag.
    Regex reg = new Regex(@"<meta[^>]+?charset\s*=\s*[""']?([\w.:\-]+)", RegexOptions.IgnoreCase);
    Match declared = reg.Match(str);
    if (declared.Success)
    {
        string tempCharSet = declared.Groups[1].Value;
        if (string.Compare(tempCharSet, characterSet, true) != 0)
        {
            Encoding declaredEncoding = null;
            try
            {
                declaredEncoding = Encoding.GetEncoding(tempCharSet);
            }
            catch (ArgumentException)
            {
                // Unknown charset name in the page: keep the first decoding.
            }

            if (declaredEncoding != null)
            {
                // Second pass: the page's own declaration takes precedence.
                using (StreamReader reader = new StreamReader(new MemoryStream(content), declaredEncoding))
                {
                    str = reader.ReadToEnd();
                }
            }
        }
    }

    return str;
}
值得说明的是:尽管采用了此方法,当你查看数据库的时候,你还是会发现有些正文源码没有下载下来。拿我的数据表单来说:我的数据库表单的各个属性如下 ArticlePageId,--数据表的主键。ArticlePageTitle--新闻标题,ArticlePageUrl,--新闻正文页URL,ArticlePageSource--新闻正文页源码,也就是从ArticlePageUrl下载的源码。如果ArticlePageSource字段为空,则表明,下载失败。于是,我又加了一个打补丁的模块。代码如下:
打补丁的模块代码
/// <summary>
/// Re-downloads the page source for every row of <paramref name="desTable"/> whose
/// ArticlePageSource column is empty and writes the recovered text back to the database.
/// </summary>
/// <param name="desTable">
/// Name of the table to repair. The name is placed into the SQL text as-is, so it must
/// come from a trusted source and never from user input.
/// </param>
public static void PatchingUp(string desTable)
{
    // NOTE(review): the connection string, including credentials, is hard-coded here;
    // consider moving it to configuration.
    string conStr = "server=(local);database=ArticleCollection;uid=sa;pwd=123456";
    string select = string.Format("SELECT ArticlePageId,ArticlePageSource,ArticlePageUrl from {0}", desTable);

    // The using blocks release the connection and related objects even when a step fails.
    using (SqlConnection connection = new SqlConnection(conStr))
    using (SqlDataAdapter adapter = new SqlDataAdapter(select, connection))
    // The builder is not used directly; constructing it on the adapter is what lets
    // adapter.Update generate the UPDATE command.
    using (SqlCommandBuilder builder = new SqlCommandBuilder(adapter))
    using (DataTable table = new DataTable())
    {
        adapter.Fill(table);
        bool isModified = false;
        for (int i = 0; i < table.Rows.Count; i++)
        {
            string currentSource = table.Rows[i]["ArticlePageSource"].ToString();
            if (!string.IsNullOrEmpty(currentSource))
            {
                continue;
            }

            Console.WriteLine("在原数据库第{0}行插入失败", i+1);
            string url = table.Rows[i]["ArticlePageUrl"].ToString().Trim();
            string downloaded = ClawlerAssist.GetDataFromUrl(url, 5);
            if (string.IsNullOrEmpty(downloaded))
            {
                // The download failed again: leave the row untouched and do not
                // report a success that did not happen.
                Console.WriteLine("在原数据库第{0}行更新失败", i+1);
                continue;
            }

            table.Rows[i]["ArticlePageSource"] = downloaded;
            Console.WriteLine("在原数据库第{0}行更新成功", i+1);
            isModified = true;
        }

        // Only touch the database when at least one row was actually repaired.
        if (isModified)
        {
            adapter.Update(table);
        }
    }
}
PS:我是新手,这也是我第一次选择首页发帖和大家分享我的一点收获和见解。如有不对的地方还请各位前辈指正,以免误人子弟。
但是由于网络稳定性的原因,总会出现 404 page not found 类型的error。(但是网页是确确实实存在的)。而且这种错误,往往是在程序运行一段时间后出现的,觉得很不可思议。我在网络上查这种问题的解决方案时,发现没有一种管用的。本人现在已经成功解决该问题,遂将自己的解决方案写下来和大家分享与探讨。
解决方案核心:一旦出现这种错误,程序中就递归调用下载函数本身。代码说明如下:
/// <summary>
/// Downloads the page at <paramref name="url"/>, trying again whenever the server
/// answers with HTTP 404.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="nRetryTimes">
/// Maximum number of download attempts. Zero or a negative value performs no attempt.
/// </param>
/// <returns>
/// The page source on success. An empty string when no attempt was allowed, when every
/// attempt ended in a 404, or when an attempt failed with any other error.
/// </returns>
public static string GetDataFromUrl(string url, int nRetryTimes)
{
    // A bounded loop replaces the original recursion: a negative budget can no longer
    // recurse without limit, and stack depth does not grow with the retry count.
    for (int attemptsLeft = nRetryTimes; attemptsLeft > 0; attemptsLeft--)
    {
        try
        {
            return GetDataFromUrl(url);
        }
        catch (System.Net.WebException webExc)
        {
            // Decide on the HTTP status code rather than on the exception message text,
            // which can contain "404" for unrelated reasons (for example inside the URL).
            bool notFound = false;
            System.Net.HttpWebResponse errorResponse = webExc.Response as System.Net.HttpWebResponse;
            if (errorResponse != null)
            {
                notFound = errorResponse.StatusCode == System.Net.HttpStatusCode.NotFound;
                // Release the connection held by the error response before retrying.
                errorResponse.Close();
            }

            if (!notFound)
            {
                // Same best-effort contract as before: non-404 failures yield an empty result.
                return string.Empty;
            }
        }
        catch (System.Exception)
        {
            // Any other failure is reported to the caller as an empty result, not retried.
            return string.Empty;
        }
    }

    return string.Empty;
}
其中nRetryTimes 代表出现这种错误后,函数递归调用自己的次数,也可以理解为递归终止的条件。GetDataFromUrl(string url)函数代码如下:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it to text.
/// </summary>
/// <remarks>
/// The body is first decoded with the charset announced in the HTTP response header.
/// If the decoded page declares a different charset in a meta tag, the buffered bytes
/// are decoded a second time with the declared charset.
/// </remarks>
/// <param name="url">Address of the page to download.</param>
/// <returns>The decoded page source.</returns>
/// <exception cref="System.Net.WebException">
/// Thrown by the request when the server reports an error or the request fails.
/// </exception>
public static string GetDataFromUrl(string url)
{
    HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);
    // Configure the HTTP request.
    request.AllowAutoRedirect = true;
    request.AllowWriteStreamBuffering = true;
    request.Referer = "";
    // NOTE(review): Timeout is in milliseconds, so this is roughly 2.8 hours — confirm it is intended.
    request.Timeout = 10000000;
    request.UserAgent = "";
    request.KeepAlive = false;//to avoid the error of time out

    byte[] content;
    string characterSet;
    // The using blocks release the response and its stream even when reading fails.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (MemoryStream mStream = new MemoryStream())
    {
        characterSet = response.CharacterSet;

        // Buffer the whole body so it can be decoded more than once.
        using (Stream receiveStream = response.GetResponseStream())
        {
            byte[] bf = new byte[4096];
            int count;
            while ((count = receiveStream.Read(bf, 0, bf.Length)) > 0)
            {
                mStream.Write(bf, 0, count);
            }
        }

        content = mStream.ToArray();
    }

    // Pick the encoding from the HTTP response header; a missing (null or empty)
    // or unrecognised charset falls back to the system default encoding.
    Encoding encode = Encoding.Default;
    if (!string.IsNullOrEmpty(characterSet))
    {
        // The original author maps ISO-8859-1 to gb2312; presumably the target sites are
        // Chinese pages served without an explicit charset — confirm.
        if (characterSet == "ISO-8859-1")
        {
            characterSet = "gb2312";
        }

        try
        {
            encode = Encoding.GetEncoding(characterSet);
        }
        catch (ArgumentException)
        {
            encode = Encoding.Default;
        }
    }

    // First pass: decode with the header-derived encoding.
    string str;
    using (StreamReader reader = new StreamReader(new MemoryStream(content), encode))
    {
        str = reader.ReadToEnd();
    }

    // Look for a charset declared by the page itself. The pattern accepts both
    // <meta ... content="text/html; charset=gb2312"> and <meta charset="utf-8">,
    // and stays inside a single tag.
    Regex reg = new Regex(@"<meta[^>]+?charset\s*=\s*[""']?([\w.:\-]+)", RegexOptions.IgnoreCase);
    Match declared = reg.Match(str);
    if (declared.Success)
    {
        string tempCharSet = declared.Groups[1].Value;
        if (string.Compare(tempCharSet, characterSet, true) != 0)
        {
            Encoding declaredEncoding = null;
            try
            {
                declaredEncoding = Encoding.GetEncoding(tempCharSet);
            }
            catch (ArgumentException)
            {
                // Unknown charset name in the page: keep the first decoding.
            }

            if (declaredEncoding != null)
            {
                // Second pass: the page's own declaration takes precedence.
                using (StreamReader reader = new StreamReader(new MemoryStream(content), declaredEncoding))
                {
                    str = reader.ReadToEnd();
                }
            }
        }
    }

    return str;
}
值得说明的是:尽管采用了此方法,当你查看数据库的时候,你还是会发现有些正文源码没有下载下来。拿我的数据表单来说:我的数据库表单的各个属性如下 ArticlePageId,--数据表的主键。ArticlePageTitle--新闻标题,ArticlePageUrl,--新闻正文页URL,ArticlePageSource--新闻正文页源码,也就是从ArticlePageUrl下载的源码。如果ArticlePageSource字段为空,则表明,下载失败。于是,我又加了一个打补丁的模块。代码如下:
打补丁的模块代码
/// <summary>
/// Re-downloads the page source for every row of <paramref name="desTable"/> whose
/// ArticlePageSource column is empty and writes the recovered text back to the database.
/// </summary>
/// <param name="desTable">
/// Name of the table to repair. The name is placed into the SQL text as-is, so it must
/// come from a trusted source and never from user input.
/// </param>
public static void PatchingUp(string desTable)
{
    // NOTE(review): the connection string, including credentials, is hard-coded here;
    // consider moving it to configuration.
    string conStr = "server=(local);database=ArticleCollection;uid=sa;pwd=123456";
    string select = string.Format("SELECT ArticlePageId,ArticlePageSource,ArticlePageUrl from {0}", desTable);

    // The using blocks release the connection and related objects even when a step fails.
    using (SqlConnection connection = new SqlConnection(conStr))
    using (SqlDataAdapter adapter = new SqlDataAdapter(select, connection))
    // The builder is not used directly; constructing it on the adapter is what lets
    // adapter.Update generate the UPDATE command.
    using (SqlCommandBuilder builder = new SqlCommandBuilder(adapter))
    using (DataTable table = new DataTable())
    {
        adapter.Fill(table);
        bool isModified = false;
        for (int i = 0; i < table.Rows.Count; i++)
        {
            string currentSource = table.Rows[i]["ArticlePageSource"].ToString();
            if (!string.IsNullOrEmpty(currentSource))
            {
                continue;
            }

            Console.WriteLine("在原数据库第{0}行插入失败", i+1);
            string url = table.Rows[i]["ArticlePageUrl"].ToString().Trim();
            string downloaded = ClawlerAssist.GetDataFromUrl(url, 5);
            if (string.IsNullOrEmpty(downloaded))
            {
                // The download failed again: leave the row untouched and do not
                // report a success that did not happen.
                Console.WriteLine("在原数据库第{0}行更新失败", i+1);
                continue;
            }

            table.Rows[i]["ArticlePageSource"] = downloaded;
            Console.WriteLine("在原数据库第{0}行更新成功", i+1);
            isModified = true;
        }

        // Only touch the database when at least one row was actually repaired.
        if (isModified)
        {
            adapter.Update(table);
        }
    }
}
PS:我是新手,这也是我第一次选择首页发帖和大家分享我的一点收获和见解。如有不对的地方还请各位前辈指正,以免误人子弟。
相关文章推荐
- Errore HTTP 404.2 - Not Found" IIS 7.5 请求的内容似乎是脚本,因而将无法由静态文件处理程序来处理
- Errore HTTP 404.2 - Not Found" IIS 7.5 请求的内容似乎是脚本,因而将无法由静态文件处理程序来处理
- sharepoint 404 pagenotfound error
- Openning SharePoint - 80 website gives HTTP 404 Error, The webpage cannot be found ! on SharePoint 2013
- Errore HTTP 404.2 - Not Found" IIS 7.5 请求的内容似乎是脚本,因而将无法由静态文件处理程序来处理
- Errore HTTP 404.2 - Not Found" IIS 7.5 请求的内容似乎是脚本,因而将无法由静态文件处理程序来处理
- Errore HTTP 404.2 - Not Found" IIS 7.5 请求的内容似乎是脚本,因而将无法由静态文件处理程序来处理
- Errore HTTP 404.2 - Not Found" IIS 7.5 请求的内容似乎是脚本,因而将无法由静态文件处理程序来处理
- Page not found (404) Request Method:GET Request URL: http://127.0.0.1:8000/blog/talks Using the URL
- 各种HTTP返回错误代码大全 HTTP 403 Forbidden / 404 Not Found / 500 Internal Server Error / 502 Bad Gateway / 50
- HTTP 404 Not Found Error with .woff or .woff2 Font Files
- Error: HttpServlet was not found on the Java
- Maven install 报Fatal error compiling: tools.jar not found错误的处理
- 使用HttpHanlder处理404: File not found
- HTTP 错误 500.21 - Internal Server Error 处理程序“PageHandlerFactory-Integr
- InternalServerError时 HttpWebRequest 的 GetResponse 方法处理策略
- 【FAQ】Could not extract response: no suitable HttpMessageConverter found for respo
- java.lang.IllegalArgumentException: not found @HttpResponse from class java.lang.Object解决方法
- HTTP错误 404.17 - Not Found" IIS 7.5 请求的内容似乎是脚本,因而将无法由静态文件处理程序来处理
- [org.springframework.web.servlet.PageNotFound] - No mapping found for HTTP request with URI [/ssm/us