您的位置:首页 > 编程语言 > ASP

asp.net C#抓取网页链接

2006-04-13 16:57 405 查看
采用了正则表达式来匹配链接,具体写法可以看这里,不过我用的不是这个。呵呵

代码还有点粗糙,比如还没有实现把结果统一写到 XML 中,然后再显示出来。

还有些东西还要过滤,一点一点来吧。先记录一下,免得以后忘记。

default.aspx

<%@ Page Language="C#" AutoEventWireup="true" CodeFile="Default.aspx.cs" Inherits="_Default" ValidateRequest="false" %>
<%--
    Page with a URL input, two buttons and a multi-line text box.
    The buttons are wired to WebRequestButton_Click and getUrl_Click in the code-behind.
    NOTE(review): ValidateRequest="false" switches off ASP.NET request validation for this page,
    presumably so the fetched markup held in ContentHtml can be posted back - confirm it is still needed.
--%>

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
    <title>Untitled Page</title>
</head>
<body>
    <%-- Attribute values must be quoted in XHTML, hence method="post". --%>
    <form id="aspBuffer" method="post" runat="server">
        <div align="center" style="FONT-WEIGHT: bold">得到任意网页源代码</div>
        <br />
        <div>
            <%-- Address of the page to download. --%>
            <asp:TextBox ID="UrlText" runat="server" Style="z-index: 100; left: 9px; position: absolute; top: 47px" Width="400px"></asp:TextBox>

            <%-- Downloads the page and shows its source in ContentHtml. --%>
            <asp:Button id="WebRequestButton" runat="server" Text="用WebRequest得到" style="z-index: 101; left: 444px; position: absolute; top: 45px" OnClick="WebRequestButton_Click"></asp:Button>

            <%-- Shows the downloaded source, and later the extracted links. --%>
            <asp:TextBox id="ContentHtml" runat="server" Width="100%" Height="360px" TextMode="MultiLine" style="z-index: 102; left: 3px; position: absolute; top: 92px">
</asp:TextBox>

            <%-- Replaces the content of ContentHtml with the links found in it. --%>
            <asp:Button ID="getUrl" runat="server" OnClick="getUrl_Click" Style="z-index: 104; left: 675px; position: absolute; top: 45px" Text="得到网页链接" />
        </div>
    </form>
</body>
</html>

default.aspx.cs

using System;
using System.Data;
using System.Configuration;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;

/// <summary>
/// Code-behind for Default.aspx. One button downloads the page whose address is typed
/// into <c>UrlText</c> and shows its source in <c>ContentHtml</c>; the other replaces the
/// content of <c>ContentHtml</c> with the list of links found in it.
/// </summary>
public partial class _Default : System.Web.UI.Page
{
    // Absolute http/https URL: the scheme followed by a run of non-whitespace characters
    // that contains at least one dot. Built once and shared by every request.
    private static readonly Regex LinkPattern =
        new Regex(@"https?://\S+\.\S+", RegexOptions.IgnoreCase);

    /// <summary>Address most recently submitted through <c>UrlText</c>.</summary>
    public string urlPage = "";

    protected void Page_Load(object sender, EventArgs e)
    {
    }

    /// <summary>
    /// Downloads the page at the address entered in <c>UrlText</c> and places its
    /// HTML-encoded source into <c>ContentHtml</c>.
    /// </summary>
    /// <param name="sender">Control that raised the event.</param>
    /// <param name="e">Event data (unused).</param>
    protected void WebRequestButton_Click(object sender, EventArgs e)
    {
        urlPage = UrlText.Text.Trim();

        // The address comes from the user. WebRequest.Create also accepts other schemes
        // such as file://, so only absolute http/https addresses are let through.
        Uri target;
        if (!Uri.TryCreate(urlPage, UriKind.Absolute, out target)
            || (target.Scheme != Uri.UriSchemeHttp && target.Scheme != Uri.UriSchemeHttps))
        {
            ContentHtml.Text = "Please enter a valid http:// or https:// URL.";
            return;
        }

        WebRequest request = WebRequest.Create(target);

        // using blocks release the response, stream and reader even when reading throws.
        using (WebResponse response = request.GetResponse())
        using (Stream resStream = response.GetResponseStream())
        using (StreamReader sr = new StreamReader(resStream, System.Text.Encoding.Default))
        {
            ContentHtml.Text = Server.HtmlEncode(sr.ReadToEnd());
        }
    }

    /// <summary>
    /// Replaces the content of <c>ContentHtml</c> with the distinct links it contains,
    /// one per line.
    /// </summary>
    /// <param name="sender">Control that raised the event.</param>
    /// <param name="e">Event data (unused).</param>
    protected void getUrl_Click(object sender, EventArgs e)
    {
        ArrayList allLinks = GetHyperLinks(ContentHtml.Text);

        // Every link is written, including the last one, each on its own line.
        StringBuilder output = new StringBuilder();
        foreach (string link in allLinks)
        {
            output.AppendLine(link);
        }

        ContentHtml.Text = output.ToString();
    }

    /// <summary>
    /// Extracts the distinct http/https URLs found in a piece of text.
    /// </summary>
    /// <param name="htmlCode">Text to scan; null or empty yields an empty list.</param>
    /// <returns>The distinct matches, sorted with the default comparer.</returns>
    static ArrayList GetHyperLinks(string htmlCode)
    {
        ArrayList links = new ArrayList();
        if (string.IsNullOrEmpty(htmlCode))
        {
            return links;
        }

        foreach (Match match in LinkPattern.Matches(htmlCode))
        {
            string url = match.Value;

            // Skip URLs that were already collected.
            if (!links.Contains(url))
            {
                links.Add(url);
            }
        }

        links.Sort();
        return links;
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: