您的位置:首页 > 运维架构 > 网站架构

Java版PageRank及网站收录情况查询代码

2008-12-31 20:02 746 查看
在Google这个由10的100次方得名的站点中,各种评估网站的算法层出不穷,而PageRank即是其中之一。

Google的PageRank根据网站的外部链接和内部链接的数量和质量来衡量网站的价值。PageRank背后的概念是,每个到页面的链接都是对该页面的一次投票,被链接的越多,就意味着被其他网站投票越多。这个就是所谓的“链接流行度”——衡量多少人愿意将他们的网站和你的网站挂钩。PageRank这个概念引自学术中一篇论文的被引述的频度——即被别人引述的次数越多,一般判断这篇论文的权威性就越高。

通常情况下讲,原创内容越多的站点,PageRank越容易提升,反之则相对比较困难,PageRank最大上限值为10。在Google的评估中,能上10的网站真可谓凤毛麟角,即使算上Google,能成就PageRank 10这“伟业”者,放眼全球也不足40家。一般来说,个人站点评估值达到4即算办得不错,商业网站到6以上便算步入正轨了。

网上虽然有不少现成的查询器及源码,但是光用别人的毕竟不符合程序员风格,所以今天自己用Java重造轮子又写了个PageRank查询实现,捎带着把一些常用搜索引擎的网站链接及反向链接查询也加上了。

源码如下:


GooglePageRank.java
package org.loon.test;

import java.io.IOException;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* Copyright 2008
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0 *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*
* @project loonframework
* @author chenpeng
* @email:ceponline@yahoo.com.cn
* @version 0.1
*/
public class GooglePageRank {

    // Computes the "ch" checksum expected by the Google toolbar PageRank
    // endpoint and queries that endpoint for the rank of a URL.

    // IP addresses of Google PageRank servers. One is picked at random per
    // query because repeatedly querying the same server may get the caller
    // blocked.
    final static String[] GoogleServiceIP = new String[] { "64.233.161.100",
            "64.233.161.101", "64.233.183.91", "64.233.189.44", "66.102.1.103",
            "66.102.9.115", "66.249.89.83", "66.249.91.99", "66.249.93.190" };

    // Initial value of the hash word "c" (Google's identification marker).
    final static private int GOOGLE_MAGIC = 0xE6359A60;

    // Initial value of the hash words "a" and "b".
    private static final int HASH_SEED = 0x9E3779B9;

    // Extracts the rank digits from a response such as "Rank_1:1:5".
    // Compiled once; Pattern instances are immutable and thread-safe.
    private static final Pattern RANK_PATTERN = Pattern
            .compile("Rank_1:[0-9]:([0-9]+)");

    // Shared generator used to pick a server; java.util.Random is thread-safe.
    private static final Random RANDOM = new Random();

    // Holder for the three 32-bit words of the checksum state. Declared
    // static so that it carries no hidden reference to an enclosing instance.
    private static class CHMix {

        int a;

        int b;

        int c;

        public CHMix() {
            this(0, 0, 0);
        }

        public CHMix(int a, int b, int c) {
            this.a = a;
            this.b = b;
            this.c = c;
        }
    }

    /**
     * Runs one mixing round over the three state words, in place.
     *
     * The sequence follows the mix step of Bob Jenkins' lookup2 hash, which is
     * defined on unsigned 32-bit words. Java ints are signed, so every right
     * shift must be the zero-filling {@code >>>}; the sign-propagating
     * {@code >>} yields different results whenever the top bit is set.
     *
     * @param mix state to scramble
     */
    private static void mix(final CHMix mix) {
        mix.a -= mix.b;
        mix.a -= mix.c;
        mix.a ^= mix.c >>> 13;
        mix.b -= mix.c;
        mix.b -= mix.a;
        mix.b ^= mix.a << 8;
        mix.c -= mix.a;
        mix.c -= mix.b;
        mix.c ^= mix.b >>> 13;
        mix.a -= mix.b;
        mix.a -= mix.c;
        mix.a ^= mix.c >>> 12;
        mix.b -= mix.c;
        mix.b -= mix.a;
        mix.b ^= mix.a << 16;
        mix.c -= mix.a;
        mix.c -= mix.b;
        mix.c ^= mix.b >>> 5;
        mix.a -= mix.b;
        mix.a -= mix.c;
        mix.a ^= mix.c >>> 3;
        mix.b -= mix.c;
        mix.b -= mix.a;
        mix.b ^= mix.a << 10;
        mix.c -= mix.a;
        mix.c -= mix.b;
        mix.c ^= mix.b >>> 15;
    }

    /**
     * Creates a fresh, zero-initialised checksum state.
     *
     * @return a new state holder
     */
    public static CHMix getInnerCHMix() {
        return new CHMix();
    }

    /**
     * Computes the "ch" checksum that identifies a page to the PageRank
     * service.
     *
     * @param url page address, without the "info:" prefix
     * @return the character '6' followed by the checksum as an unsigned
     *         decimal number
     */
    public static String GoogleCH(final String url) {
        // The service hashes the "info:<url>" form of the address.
        final char[] urls = ("info:" + url).toCharArray();
        final int length = urls.length;

        final CHMix chMix = getInnerCHMix();
        chMix.a = HASH_SEED;
        chMix.b = HASH_SEED;
        chMix.c = GOOGLE_MAGIC;

        int k = 0;
        int len = length;

        // Consume the input twelve characters at a time, four per state word,
        // least significant byte first.
        while (len >= 12) {
            chMix.a += urls[k] + (urls[k + 1] << 8) + (urls[k + 2] << 16)
                    + (urls[k + 3] << 24);
            chMix.b += urls[k + 4] + (urls[k + 5] << 8) + (urls[k + 6] << 16)
                    + (urls[k + 7] << 24);
            chMix.c += urls[k + 8] + (urls[k + 9] << 8) + (urls[k + 10] << 16)
                    + (urls[k + 11] << 24);
            mix(chMix);
            k += 12;
            len -= 12;
        }

        // The total length is added to c; the trailing characters destined for
        // c are therefore shifted by at least 8 bits below.
        chMix.c += length;

        // Fold in the remaining 0..11 characters. Every case intentionally
        // falls through to the ones beneath it.
        switch (len) {
        case 11:
            chMix.c += urls[k + 10] << 24;
        case 10:
            chMix.c += urls[k + 9] << 16;
        case 9:
            chMix.c += urls[k + 8] << 8;
        case 8:
            chMix.b += urls[k + 7] << 24;
        case 7:
            chMix.b += urls[k + 6] << 16;
        case 6:
            chMix.b += urls[k + 5] << 8;
        case 5:
            chMix.b += urls[k + 4];
        case 4:
            chMix.a += urls[k + 3] << 24;
        case 3:
            chMix.a += urls[k + 2] << 16;
        case 2:
            chMix.a += urls[k + 1] << 8;
        case 1:
            chMix.a += urls[k];
            break;
        default:
            break;
        }
        mix(chMix);

        // The checksum is an unsigned 32-bit quantity; widen it to long so a
        // set top bit is not rendered as a negative number.
        return "6" + (chMix.c & 0xFFFFFFFFL);
    }

    /**
     * Extracts the PageRank value from a service response.
     *
     * @param value raw response text
     * @return the rank digits, or "0" when the response carries no rank
     */
    private static String MatchRank(final String value) {
        final Matcher matcher = RANK_PATTERN.matcher(value);
        return matcher.find() ? matcher.group(1) : "0";
    }

    /**
     * Looks up the PageRank of a page using a randomly chosen server from
     * {@link #GoogleServiceIP}.
     *
     * @param url page address
     * @return the rank as text, "0" when it could not be determined
     */
    public static String GooglePR(final String url) {
        final String rip = GoogleServiceIP[RANDOM
                .nextInt(GoogleServiceIP.length)];
        return GooglePR(url, rip);
    }

    /**
     * Looks up the PageRank of a page on a specific server.
     *
     * @param url page address
     * @param ip host name or IP address of the PageRank server
     * @return the rank as text, "0" when the request failed, the response was
     *         empty or it carried no rank
     */
    public static String GooglePR(final String url, final String ip) {
        final String checksum = GoogleCH(url);
        final String queryUrl = String
                .format(
                        "http://%s/search?client=navclient-auto&ch=%s&features=Rank&q=info:%s",
                        new Object[] { ip, checksum, url });

        String response;
        try {
            response = SimpleWebClient.getRequestHttp(queryUrl);
        } catch (IOException e) {
            // Best effort by design: an unreachable server is reported the
            // same way as an unranked page.
            response = "";
        }
        if (response.length() == 0) {
            return "0";
        }
        return MatchRank(response);
    }

}


SimpleWebClient.java


package org.loon.test;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;

import sun.misc.BASE64Encoder;

/**
* Copyright 2008
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0 *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*
* @project loonframework
* @author chenpeng
* @email:ceponline@yahoo.com.cn
* @version 0.1
*/
public class SimpleWebClient {

/**
* 向指定url发送请求并获得响应数据
*
* @param urlString
* @return
* @throws IOException
*/
public static String getRequestHttp(String urlString) throws IOException {
return getRequestHttp(urlString, "utf-8");
}

/**
* 向指定url发送请求并获得响应数据
*
* @param urlString
* @param encoding
* @return
* @throws IOException
*/
public static String getRequestHttp(String urlString, String encoding)
throws IOException {
return getRequestHttp(urlString, encoding, null, 5000);
}

/**
* 向指定url发送请求并获得响应数据
*
* @param urlString
* @param encoding
* @param parameter
* @return
* @throws IOException
*/
public static String getRequestHttp(final String urlString,
final String encoding, final Map parameter, final int timeout)
throws IOException {

String nURL = (urlString.startsWith("http://") || urlString
.startsWith("https://")) ? urlString : ("http:" + urlString)
.intern();

String user = null;
String password = null;
String method = "GET";
String post = null;
String digest = null;

String responseContent = "ERROR";

boolean foundRedirect = false;

Map headers = new HashMap();

if (parameter != null) {
Set entrySet = parameter.entrySet();

for (Iterator it = entrySet.iterator(); it.hasNext();) {
Entry header = (Entry) it.next();
String key = (String) header.getKey();
String value = (String) header.getValue();
if ("user".equals(key)) {
user = value;
} else if ("pass".equals(key)) {
password = value;
} else if ("method".equals(key)) {
method = value;
} else if ("post".equals(key)) {
post = value;
} else {
headers.put(key, value);
}
}
}
URL url = new URL(nURL);

if (user != null && password != null) {
BASE64Encoder base64 = new BASE64Encoder();
digest = "Basic "
+ base64.encode((user + ":" + password).getBytes());
}

do {

HttpURLConnection urlConnection = (HttpURLConnection) url
.openConnection();
// 添加访问授权
if (digest != null) {
urlConnection.setRequestProperty("Authorization", digest);
}
urlConnection.setDoOutput(true);
urlConnection.setDoInput(true);
urlConnection.setUseCaches(false);
urlConnection.setInstanceFollowRedirects(false);
urlConnection.setRequestMethod(method);
if (timeout > 0) {
urlConnection.setConnectTimeout(timeout);
}
//模拟http头文件
urlConnection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0;)");
urlConnection.setRequestProperty("Accept", "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/msword, application/vnd.ms-excel, application/vnd.ms-powerpoint, */*");
//追加http头文件
Set headersSet = headers.entrySet();
for (Iterator it = headersSet.iterator(); it.hasNext();) {
Entry entry = (Entry) it.next();
urlConnection.setRequestProperty((String) entry.getKey(),
(String) entry.getValue());
}

if (post != null) {
OutputStreamWriter outRemote = new OutputStreamWriter(
urlConnection.getOutputStream());
outRemote.write(post);
outRemote.flush();
}

// 获得响应状态
int responseCode = urlConnection.getResponseCode();

// 获得返回的数据长度
int responseLength = urlConnection.getContentLength();

if (responseCode == 302) {
// 重定向
String location = urlConnection.getHeaderField("Location");
url = new URL(location);
foundRedirect = true;
} else {
BufferedInputStream in;
if (responseCode == 200 || responseCode == 201) {
in = new BufferedInputStream(urlConnection.getInputStream());
} else {
in = new BufferedInputStream(urlConnection.getErrorStream());
}
int size = responseLength == -1 ? 4096 : responseLength;
if (encoding != null) {
responseContent = SimpleWebClient.read(in, size, encoding);
} else {
ByteArrayOutputStream out = new ByteArrayOutputStream();
byte[] bytes = new byte[size];
int read;
while ((read = in.read(bytes)) >= 0) {
out.write(bytes, 0, read);
}
responseContent = new String(out.toByteArray());
in.close();
out.close();
}
foundRedirect = false;
}
// 如果重定向则继续
} while (foundRedirect);

return responseContent;
}

/**
* 转化InputStream为String
*
* @param in
* @param size
* @return
* @throws IOException
*/
private static String read(final InputStream in, final int size,
final String encoding) throws IOException {
StringBuilder sbr = new StringBuilder();
int nSize = size;
if (nSize == 0) {
nSize = 1;
}
char[] buffer = new char[nSize];
int offset = 0;
InputStreamReader isr = new InputStreamReader(in, encoding);
while ((offset = isr.read(buffer)) != -1) {
sbr.append(buffer, 0, offset);
}
in.close();
isr.close();
return sbr.toString();
}
}

WebAppraise.java
package org.loon.test;

import java.io.IOException;

/**
* Copyright 2008
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0 *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*
* @project loonframework
* @author chenpeng
* @email:ceponline@yahoo.com.cn
* @version 0.1
*/
public class WebAppraise {

private String googleSum;

private String baiduSum;

private String msnSum;

private String altaVistaSum;

private String allTheWebSum;

private String yahooSum;

private String testURL;

public WebAppraise(final String url) {

if (url != null && !"".equals(url)) {
this.testURL = url.trim();
if (this.testURL.startsWith("http://")) {
this.testURL = this.testURL.substring(7);
}
if (this.testURL.startsWith("https://")) {
this.testURL = this.testURL.substring(8);
}
} else {
throw new RuntimeException("url is NULL!");
}

}

/**
* 分析指定链接结果,并返回整型数值
*
* @param searchURL
* @param anchor
* @param trail
* @return
*/
private static int getLinks(final String searchURL, final String anchor,
final String trail) {
int count = 0;
String serverResponse;

try {
// 我国特色……
if (searchURL.startsWith("http://www.baidu.com")) {
// 永不离休的gb2312同志(-_-||)
serverResponse = SimpleWebClient.getRequestHttp(searchURL,
"gb2312");
} else {
serverResponse = SimpleWebClient.getRequestHttp(searchURL);
}
} catch (IOException e) {
serverResponse = e.getMessage();
}

int pos = serverResponse.indexOf(anchor);
if (pos > 1) {
serverResponse = serverResponse.substring(pos + anchor.length());
pos = serverResponse.indexOf(trail);
String value = serverResponse.substring(0, pos).trim();
value = value.replace(",", "");
value = value.replace(".", "");
count = Integer.parseInt(value);
}
return count;
}

public String getAllTheWebSite() {
return getAllTheWebSite(false);
}

public String getAllTheWebSite(boolean isDomain) {
try {
String allTheWeb;
if (isDomain) {
allTheWeb = "http://www.alltheweb.com/search?cat=web&cs=utf8&rys=0&itag=crv&_sb_lang=any&q=linkdomain%3A"
+ this.testURL;
} else {
allTheWeb = "http://www.alltheweb.com/search?cat=web&cs=utf-8&q=link%3Ahttp%3A%2F%2F"
+ this.testURL + "&_sb_lang=any";
}
allTheWebSum = ""
+ getLinks(allTheWeb, "<span class=/"ofSoMany/">",
"</span>");
} catch (Exception ex) {
allTheWebSum = ex.getMessage();
}
return allTheWebSum;
}

public String getAltaVistaSite() {
return getAltaVistaSite(false);
}

public String getAltaVistaSite(boolean isDomain) {
try {
String altaVista;
if (isDomain) {
altaVista = "http://www.altavista.com/web/results?itag=ody&q=link%3A"
+ this.testURL + "&kgs=0&kls=0";
} else {
altaVista = "http://www.altavista.com/web/results?itag=ody&kgs=0&kls=0&q=site%3A"
+ this.testURL;
}
altaVistaSum = "" + getLinks(altaVista, "AltaVista found ", " ");
} catch (Exception ex) {
altaVistaSum = ex.getMessage();
}
return altaVistaSum;
}

public String getGooglePR() {
return GooglePageRank.GooglePR(this.testURL);
}

public String getGoogleSite() {
return getGoogleSite(false);
}

public String getGoogleSite(final boolean isDomian) {
try {
String google;
// 反向链接
if (isDomian) {
google = "http://www.google.com/search?hl=en&q=link%3A"
+ this.testURL;
} else {
google = "http://www.google.com/search?hl=en&q=site%3A"
+ this.testURL + "&btnG=Google+Search&aq=f&oq=";
}
googleSum = "" + getLinks(google, "about <b>", "</b>");
} catch (Exception ex) {
googleSum = ex.getMessage();
}
return googleSum;
}

public String getBaiduSite() {
return getBaiduSite(false);
}

public String getBaiduSite(final boolean isDomian) {
try {
String baidu;
if (isDomian) {
baidu = "http://www.baidu.com/s?wd=domain%3A" + this.testURL
+ "&cl=3";
} else {
baidu = "http://www.baidu.com/s?wd=site%3A" + this.testURL;
}
baiduSum = "" + getLinks(baidu, "找到相关网页", "篇");
} catch (Exception ex) {
String baidu;
if (isDomian) {
baidu = "http://www.baidu.com/s?wd=domain%3A" + this.testURL
+ "&cl=3";
} else {
baidu = "http://www.baidu.com/s?wd=site%3A" + this.testURL;
}
baiduSum = "" + getLinks(baidu, "找到相关网页约", "篇");
}
return baiduSum;
}

public String getYahooSite() {
return getYahooSite(false);
}

public String getYahooSite(final boolean isDomian) {
try {
String yahoo;
if (isDomian) {
yahoo = "http://sitemap.cn.yahoo.com/search?p=" + this.testURL
+ "&bwm=i";
yahooSum = "" + getLinks(yahoo, "<strong>", "</strong>");
} else {
yahoo = "http://www.yahoo.cn/s?p=site%3A" + this.testURL
+ "&pid=hp&v=web";
yahooSum = "" + getLinks(yahoo, "找到相关网页约", "条");
}

} catch (Exception ex) {
yahooSum = ex.getMessage();
}
return yahooSum;
}

public String getMsnSite() {
return getMsnSite(false);
}

public String getMsnSite(boolean isDomain) {
try {
String msn;
if (isDomain) {
msn = "http://cnweb.search.live.com/results.aspx?q=link%3A"
+ this.testURL + "&mkt=zh-cn&scope=&FORM=LIVSO";
} else {
msn = "http://cnweb.search.live.com/results.aspx?q=site%3A"
+ this.testURL + "&go=&form=QBRE";
}
msnSum = "" + getLinks(msn, "共", "条搜索结果");
} catch (Exception ex) {
msnSum = ex.getMessage();
}
return msnSum;
}

public String getTestURL() {
return testURL;
}

}

Test.java
package org.loon.test;

/**
* Copyright 2008
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0 *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*
* @project loonframework
* @author chenpeng
* @email:ceponline@yahoo.com.cn
* @version 0.1
*/
public class Test {

    /**
     * Demo entry point: appraises one blog address and prints, line by line,
     * its PageRank followed by the indexed-page and back-link counts reported
     * by each supported search engine.
     */
    public static void main(String[] args) {
        final WebAppraise site = new WebAppraise(
                "http://blog.csdn.net/cping1982");

        // Each value is fetched right before its line is printed, so the
        // output appears progressively as the individual queries complete.
        report("GooglePagerRank值:", site.getGooglePR());
        report("google收录:", site.getGoogleSite());
        report("google反向收录:", site.getGoogleSite(true));
        report("yahoo收录:", site.getYahooSite());
        report("yahoo反向收录:", site.getYahooSite(true));
        report("baidu收录:", site.getBaiduSite());
        report("baidu反向收录:", site.getBaiduSite(true));
        report("msn收录:", site.getMsnSite());
        report("msn反向收录:", site.getMsnSite(true));
        report("AllTheWeb收录:", site.getAllTheWebSite());
        report("AllTheWeb反向收录:", site.getAllTheWebSite(true));
        report("AltaVista收录:", site.getAltaVistaSite());
        report("AltaVista反向收录:", site.getAltaVistaSite(true));
    }

    // Prints one "label + value" line to standard output.
    private static void report(final String label, final String value) {
        System.out.println(label + value);
    }
}

检测http://blog.csdn.net/cping1982运行结果如下图:



源码下载地址:http://download.csdn.net/source/929348
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: