您的位置:首页 > 理论基础 > 计算机网络

从csdn搬家到wordpress 转自:http://youthmemo.com/?p=2082

2011-09-15 10:29 387 查看
转自:http://youthmemo.com/?p=2082


最近开了个blog站点,想把以前在csdn上面的文章都转移过来,但一直没找到合适的工具,于是周末就自己写了个小程序。

本程序可以完成的工作:转移csdn上面的文章(限于文本内容)到wordpress;不能完成的工作:1、不支持在wordpress上创建分类,所以需要提前在wordpress上手工创建分类(保持与csdn一致);2、不能以很好的格式转移文章,转移之后文章格式需要调整。



程序由采集、解析、发帖三部分构成。采集负责将指定url的内容下载下来,解析负责从网页内容中解析出正文链接、标题、发布时间、分类信息,发帖部分负责将解析出来的数据通过rpc发送给wordpress,生成博文。



本程序用到的jar包及其版本如下:
-rw-r--r-- 1 mingyuan mingyuan  46725 2011-09-03 23:05 commons-codec-1.3.jar
-rw-r--r-- 1 mingyuan mingyuan 279781 2011-09-03 23:05 commons-httpclient-3.0.1.jar
-rwxrwxrwx 1 mingyuan mingyuan  52915 2010-05-03 03:39 commons-logging-1.1.jar
-rw-r--r-- 1 mingyuan mingyuan 281579 2011-09-04 01:40 jsoup-1.6.1.jar
-rwxrwxrwx 1 mingyuan mingyuan  34407 2010-05-03 03:39 ws-commons-util-1.0.2.jar
-rwxrwxrwx 1 mingyuan mingyuan  58573 2010-05-03 03:39 xmlrpc-client-3.1.3.jar
-rwxrwxrwx 1 mingyuan mingyuan 109131 2010-05-03 03:39 xmlrpc-common-3.1.3.jar
-rwxrwxrwx 1 mingyuan mingyuan  81555 2010-05-03 03:39 xmlrpc-server-3.1.3.jar




代码很简单,就不解释了,大伙看看即可明白。程序的入口函数是Mover.main



下面先给出主要的类Mover.java

package cn.mingyuan.csdn2wordpress;



import java.io.IOException;

import java.net.MalformedURLException;

import java.net.URL;

import java.text.ParseException;

import java.text.SimpleDateFormat;

import java.util.Date;

import java.util.HashMap;

import java.util.LinkedList;

import java.util.List;

import java.util.Map;

import java.util.concurrent.TimeUnit;



import org.apache.xmlrpc.XmlRpcException;

import org.apache.xmlrpc.client.XmlRpcClient;

import org.apache.xmlrpc.client.XmlRpcClientConfigImpl;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;



/**

* 采集、解析、转移

*

* @author mingyuan

*

*/

public class Mover {

private int totalPages;

private XmlRpcClientConfigImpl config;

private XmlRpcClient client;

private String baseUrl;

private Object userName;

private Object password;

private String csdnUserName;



public Mover(int totalPages, String blogRpcUrl, String csdnUrl, String csdnUserName, String userName,

String password) {

this.totalPages = totalPages;

this.baseUrl = csdnUrl;

this.csdnUserName = csdnUserName;

this.userName = userName;

this.password = password;

config = new XmlRpcClientConfigImpl();

try {

config.setServerURL(new URL(blogRpcUrl));

} catch (MalformedURLException e) {

System.out.println(“请检查url”);

}

client = new XmlRpcClient();

client.setConfig(config);

}



private List<String> getlinks() {

List<String> list = new LinkedList<String>();

for (int i = 1; i <= totalPages; i++) {

System.out.println(“processing page ” + i);

Downloader downloader = new Downloader();

String content = downloader.download(baseUrl + “/” + csdnUserName + “/article/list/” + i);

if (content == null)

continue;

Document doc = Jsoup.parse(content);

Elements first = doc.select(“.link_title”);

for (int j = 0; j < first.size(); j++) {

Element first2 = first.get(j).select(“a”).first();

String link = baseUrl + first2.attr(“href”);

list.add(link);

System.out.println(“get link\t” + link);

}

System.out.println(“page ” + i + “ extractor done,sleep 2s”);

try {

TimeUnit.SECONDS.sleep(1);

} catch (InterruptedException e) {

e.printStackTrace();

}

}

return list;

}



public List<CSDNPost> getPosts() {

List<String> links = getlinks();

List<CSDNPost> posts = new LinkedList<CSDNPost>();

for (String link : links) {

CSDNPost post = getPost(link);

if (post != null) {

posts.add(post);

}

}

return posts;

}



private CSDNPost getPost(String url) {

System.out.println(“url\t” + url);

Downloader downloader = new Downloader();

String html = downloader.download(url);

if (html == null)

return null;

Document doc = Jsoup.parse(html);

String title = doc.select(“.article_title”).first().text();

String categroy = “Uncategorized”;

Elements link_categories = doc.select(“.article_manage .link_categories”);

if (link_categories != null) {

Element first = link_categories.first();

if (first != null) {

Elements href = first.select(“a”);

if (href != null) {

categroy = href.text();

}

}

}

String postdate = doc.select(“.article_manage .link_postdate”).first().text();

String content = doc.select(“.details .article_content”).first().text();

SimpleDateFormat sdf = new SimpleDateFormat(“yyyy-MM-dd HH:mm”);

CSDNPost post = new CSDNPost();

post.setCategories(new String[] { categroy });

post.setTitle(title);

try {

post.setDateCreated(sdf.parse(postdate));

} catch (ParseException e) {

post.setDateCreated(new Date());

}

post.setDescription(content);

return post;

}



public void publish(CSDNPost post) {

Map<String, Object> struct = new HashMap<String, Object>();

struct.put(“dateCreated”, post.getDateCreated());

struct.put(“description”, post.getDescription());

struct.put(“title”, post.getTitle());

struct.put(“categories”, post.getCategories());

Object[] params = new Object[] { userName, userName, password, struct, true };

String blogid = null;

try {

blogid = (String) client.execute(“metaWeblog.newPost”, params);

} catch (XmlRpcException e) {

e.printStackTrace();

System.out.println(“导入出现错误:title=” + post.getTitle());

}

System.out.println(post.getTitle() + “>> 导入完毕,生成博文id为>>” + blogid);

struct.clear();

}



public static void main(String[] args) throws IOException {

Mover extractor = new Mover(19, “http://youthmemo.com/xmlrpc.php”, “http://blog.csdn.net”, “telnetor”, “admin”,

“xxxx”);

List<CSDNPost> posts = extractor.getPosts();

for (CSDNPost post : posts) {

extractor.publish(post);

try {

TimeUnit.SECONDS.sleep(1);

} catch (InterruptedException e) {

e.printStackTrace();

}

System.out.println(post.getTitle());

}

System.out.println(“done!”);

}

}



下面给出下载类Downloader.java

package cn.mingyuan.csdn2wordpress;



import java.io.BufferedReader;

import java.io.IOException;

import java.io.InputStreamReader;



import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;

import org.apache.commons.httpclient.HttpClient;

import org.apache.commons.httpclient.HttpException;

import org.apache.commons.httpclient.HttpMethod;

import org.apache.commons.httpclient.HttpStatus;

import org.apache.commons.httpclient.cookie.CookiePolicy;

import org.apache.commons.httpclient.methods.GetMethod;

import org.apache.commons.httpclient.params.HttpClientParams;

import org.apache.commons.httpclient.params.HttpMethodParams;



/**

* downloader

*

* @author mingyuan

*

*/

public class Downloader {

private HttpClientParams params = null;

private HttpClient client = null;



/**

* 默认构造函数,初始化一系列变量

*/

public Downloader() {

// 构造HttpClientParams参数

params = new HttpClientParams();

params.setParameter(

HttpClientParams.USER_AGENT,

“Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3 GTBDFff GTB7.0 (.NET CLR 3.5.30729)”);

params.setParameter(HttpClientParams.ALLOW_CIRCULAR_REDIRECTS, false);

params.setParameter(HttpClientParams.MAX_REDIRECTS, 4);

params.setParameter(HttpClientParams.CONNECTION_MANAGER_TIMEOUT, (long) 60 * 1000);

params.setParameter(HttpClientParams.SO_TIMEOUT, 60 * 1000);

// 使用系统提供的默认的恢复策略

params.setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler());

client = new HttpClient(params);

}



/**

* 下载网页

*

* @param url

* 网页url

* @return String类型的网页源码

*/

public String download(String url) {

HttpMethod method = new GetMethod(url);

String sourceCode = null;

method.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);

// 读取内容

StringBuilder builder = new StringBuilder();

BufferedReader reader = null;

try {

int statusCode = client.executeMethod(method);

if (statusCode != HttpStatus.SC_OK) {

return null;

}



reader = new BufferedReader(new InputStreamReader(method.getResponseBodyAsStream(), “utf8″));

String line;

while ((line = reader.readLine()) != null) {

builder.append(line + “\r\n”);

}

sourceCode = builder.toString();

} catch (HttpException e) {

e.printStackTrace();

} catch (IOException e) {

e.printStackTrace();

} finally {

try {

reader.close();

} catch (IOException e) {

e.printStackTrace();

}

// 释放连接

method.releaseConnection();

client.getHttpConnectionManager().closeIdleConnections(0);

}

return sourceCode;

}

}

最后给出一个POJO类:CSDNPost.java

package cn.mingyuan.csdn2wordpress;



import java.util.Date;



/**
 * Value holder for one blog post taken from CSDN.
 *
 * @author mingyuan
 */
public class CSDNPost {

    /** Title of the post. */
    private String title;

    /** Body text of the post. */
    private String description;

    /** Categories the post belongs to. */
    private String[] categories;

    /** Creation date of the post. */
    private Date dateCreated;

    /** Creates a post with every property unset ({@code null}). */
    public CSDNPost() {
        this(null, null, null, null);
    }

    /**
     * Creates a fully populated post.
     *
     * @param title       title of the post
     * @param description body text of the post
     * @param categories  categories of the post
     * @param dateCreated creation date of the post
     */
    public CSDNPost(String title, String description, String[] categories, Date dateCreated) {
        this.title = title;
        this.description = description;
        this.categories = categories;
        this.dateCreated = dateCreated;
    }

    /** @return title of the post */
    public String getTitle() {
        return this.title;
    }

    /** @param title title of the post */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return body text of the post */
    public String getDescription() {
        return this.description;
    }

    /** @param description body text of the post */
    public void setDescription(String description) {
        this.description = description;
    }

    /** @return categories of the post (the stored array itself, not a copy) */
    public String[] getCategories() {
        return this.categories;
    }

    /** @param categories categories of the post (stored as given, not copied) */
    public void setCategories(String[] categories) {
        this.categories = categories;
    }

    /** @return creation date of the post */
    public Date getDateCreated() {
        return this.dateCreated;
    }

    /** @param dateCreated creation date of the post */
    public void setDateCreated(Date dateCreated) {
        this.dateCreated = dateCreated;
    }
}



以上是全部源码。

在文章的结尾,我愿意跟大家分享一下这个小程序的开发心得。

一开始写这个程序的时候,觉得会很快搞定,因为这个程序无非就是三个过程:采集、解析、发帖。其实也真是这样的一个过程。

这个程序耗费精力比较多的地方是在解析网页、提取链接、标题、内容、发布时间、分类方面。

一开始想用xpath解析网页,写好的xpath表达式也都在chrome上用xpath helper验证通过了。但在编码阶段发现现有的工具包(比如dom4j)并不支持对html的解析,网上有先通过htmlparser将html转换成xml再解析的方法,但觉得太麻烦。最后发现了JSoup这个非常强大的工具,它可以通过类似jquery和css选择器语法的表达式来提取内容。尝试了下非常方便,于是解析这个问题就解决了(有个小窍门:用chrome浏览器的开发者工具可以查看某节点的css样式,把这个样式直接传递给jsoup就能提取内容)。



wordpress支持MetaWeblog协议,可以通过XML-RPC进行发帖。关于它们的信息可以通过以下链接找到:

http://en.wikipedia.org/wiki/MetaWeblog

http://en.wikipedia.org/wiki/XML-RPC (可以找到各种语言版本的api)

另外JSoup的地址是:

http://jsoup.org/





程序写得太匆忙,肯定有很多不尽如人意的地方,希望各位指出。我的联系方式是:admin#youthmemo.com。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: