您的位置:首页 > 其它

爬虫程序判断是否已抓URL

2009-07-06 09:53 127 查看
/article/4803483.html

看了这篇文章后得到的灵感,不过原作者是用C++实现的,我改用C#实现。不多说了,直接贴代码。

URLIdentity类:用于对已抓取的URL进行标识,并判断某个URL是否已被抓取。

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Collections;
using System.Security.Cryptography;

namespace TestMD5
{
/// <summary>
/// Remembers which URLs a crawler has already visited using one bit per hash value.
/// </summary>
/// <remarks>
/// Each URL is reduced to an integer in the range 0..99,999,999 (see
/// <see cref="GetIntHashCode"/>). That integer addresses a single bit inside a set of
/// lazily allocated <see cref="BitArray"/> segments. Only the bit is stored, not the URL,
/// so two different URLs that produce the same hash value cannot be told apart:
/// <see cref="GetUrlIdentity"/> may report <c>true</c> for a URL that was never marked.
/// </remarks>
public class URLIdentity
{
    // Number of bits held by each segment.
    private const int SegmentSize = 25000;

    // Number of segment slots. Hash values stay below 100,000,000, so the highest
    // index actually used is 99,999,999 / 25,000 = 3999.
    private const int SegmentCount = 4096;

    // How many leading decimal digits of the digest text form the hash value.
    private const int HashDigits = 8;

    // Segments are created on first write so untouched ranges cost no memory.
    private readonly BitArray[] SegmentArray = new BitArray[SegmentCount];

    /// <summary>Returns the index of the segment that holds the bit for <paramref name="hashValue"/>.</summary>
    private int GetSegmentIndex(int hashValue)
    {
        return hashValue / SegmentSize;
    }

    /// <summary>Returns the position of the bit for <paramref name="hashValue"/> inside its segment.</summary>
    private int GetSegmentOffset(int hashValue)
    {
        return hashValue % SegmentSize;
    }

    /// <summary>
    /// Maps a URL to an integer in the range 0..99,999,999.
    /// </summary>
    /// <remarks>
    /// The MD5 digest of the URL's UTF-8 bytes is written out as the decimal value of each
    /// byte, concatenated in order, and the first eight characters of that text are parsed
    /// as the result. UTF-8 is used instead of the platform default encoding so that the
    /// value does not depend on the machine's code page and non-ASCII characters are not
    /// collapsed into a replacement character before hashing.
    /// </remarks>
    /// <param name="url">The URL to hash. Must not be <c>null</c>.</param>
    /// <returns>A non-negative integer smaller than 100,000,000.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is <c>null</c>.</exception>
    public int GetIntHashCode(string url)
    {
        if (url == null)
        {
            throw new ArgumentNullException("url");
        }

        byte[] digest;
        // The hash algorithm object is disposable; release it as soon as the digest is computed.
        using (MD5 md5 = MD5.Create())
        {
            digest = md5.ComputeHash(Encoding.UTF8.GetBytes(url));
        }

        // Every byte contributes at least one digit and the digest has 16 bytes, so the
        // loop always gathers at least HashDigits characters before it stops.
        StringBuilder digits = new StringBuilder();
        for (int i = 0; i < digest.Length && digits.Length < HashDigits; i++)
        {
            digits.Append(digest[i].ToString(System.Globalization.CultureInfo.InvariantCulture));
        }

        return Int32.Parse(
            digits.ToString(0, HashDigits),
            System.Globalization.CultureInfo.InvariantCulture);
    }

    /// <summary>
    /// Marks <paramref name="url"/> as already crawled.
    /// </summary>
    /// <remarks>The method name keeps its original spelling so existing callers continue to compile.</remarks>
    /// <param name="url">The URL to mark. Must not be <c>null</c>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is <c>null</c>.</exception>
    public void SetUrlIndentity(string url)
    {
        int hashValue = GetIntHashCode(url);
        int segmentIndex = GetSegmentIndex(hashValue);
        int segmentOffset = GetSegmentOffset(hashValue);

        if (SegmentArray[segmentIndex] == null)
        {
            SegmentArray[segmentIndex] = new BitArray(SegmentSize);
        }

        SegmentArray[segmentIndex][segmentOffset] = true;
    }

    /// <summary>
    /// Reports whether the bit for <paramref name="url"/> has been set.
    /// </summary>
    /// <param name="url">The URL to look up. Must not be <c>null</c>.</param>
    /// <returns>
    /// <c>true</c> if this URL, or another URL with the same hash value, was marked earlier;
    /// otherwise <c>false</c>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is <c>null</c>.</exception>
    public bool GetUrlIdentity(string url)
    {
        int hashValue = GetIntHashCode(url);
        BitArray segment = SegmentArray[GetSegmentIndex(hashValue)];

        // A segment that was never allocated cannot contain a set bit.
        if (segment == null)
        {
            return false;
        }

        return segment[GetSegmentOffset(hashValue)];
    }
}
}

DEMO:

Code
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Security.Cryptography;
using System.Web.Security;
using System.Runtime.InteropServices;
using System.Collections;
namespace TestMD5
{
/// <summary>
/// Minimal demonstration of <c>URLIdentity</c>: marks one URL and then looks it up.
/// </summary>
class Program
{
    /// <summary>
    /// Marks a sample URL as crawled and prints a message when the lookup confirms it.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args)
    {
        URLIdentity urlIdentity = new URLIdentity();
        string str = "http://www.cnblogs.com";

        // The original listing omitted the terminating semicolon here and did not compile.
        urlIdentity.SetUrlIndentity(str);

        if (urlIdentity.GetUrlIdentity(str))
        {
            Console.WriteLine("this url had been crawled");
        }
    }
}
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐