从csdn搬家到wordpress 转自:http://youthmemo.com/?p=2082
2011-09-15 10:29
387 查看
转自:http://youthmemo.com/?p=2082
本程序可以完成的工作:转移csdn上面的文章(限于文本内容)到wordpress;不能完成的工作:1、不支持在wordpress上创建分类,所以需要提前在wordpress上手工创建分类(保持与csdn一致);2、不能以很好的格式转移文章,转移之后文章格式需要调整。
程序由采集、解析、发帖三部分构成。采集负责将指定url的内容下载下来,解析负责从网页内容中解析出正文链接、标题、发布时间、分类信息,发帖部分负责将解析出来的数据通过rpc发送给wordpress,生成博文。
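本程序用到的jar包及其版本如下:
commons-codec-1.3.jar
commons-httpclient-3.0.1.jar
commons-logging-1.1.jar
jsoup-1.6.1.jar
ws-commons-util-1.0.2.jar
xmlrpc-client-3.1.3.jar
xmlrpc-common-3.1.3.jar
xmlrpc-server-3.1.3.jar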
代码很简单,就不解释了,大伙看看即可明白。程序的入口函数是Mover.main
下面先给出主要的类Mover.java
package cn.mingyuan.csdn2wordpress;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import org.apache.xmlrpc.XmlRpcException;
import org.apache.xmlrpc.client.XmlRpcClient;
import org.apache.xmlrpc.client.XmlRpcClientConfigImpl;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
* 采集、解析、转移
*
* @author mingyuan
*
*/
public class Mover {
private int totalPages;
private XmlRpcClientConfigImpl config;
private XmlRpcClient client;
private String baseUrl;
private Object userName;
private Object password;
private String csdnUserName;
public Mover(int totalPages, String blogRpcUrl, String csdnUrl, String csdnUserName, String userName,
String password) {
this.totalPages = totalPages;
this.baseUrl = csdnUrl;
this.csdnUserName = csdnUserName;
this.userName = userName;
this.password = password;
config = new XmlRpcClientConfigImpl();
try {
config.setServerURL(new URL(blogRpcUrl));
} catch (MalformedURLException e) {
System.out.println(“请检查url”);
}
client = new XmlRpcClient();
client.setConfig(config);
}
private List<String> getlinks() {
List<String> list = new LinkedList<String>();
for (int i = 1; i <= totalPages; i++) {
System.out.println(“processing page ” + i);
Downloader downloader = new Downloader();
String content = downloader.download(baseUrl + “/” + csdnUserName + “/article/list/” + i);
if (content == null)
continue;
Document doc = Jsoup.parse(content);
Elements first = doc.select(“.link_title”);
for (int j = 0; j < first.size(); j++) {
Element first2 = first.get(j).select(“a”).first();
String link = baseUrl + first2.attr(“href”);
list.add(link);
System.out.println(“get link\t” + link);
}
System.out.println(“page ” + i + “ extractor done,sleep 2s”);
try {
TimeUnit.SECONDS.sleep(1);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
return list;
}
public List<CSDNPost> getPosts() {
List<String> links = getlinks();
List<CSDNPost> posts = new LinkedList<CSDNPost>();
for (String link : links) {
CSDNPost post = getPost(link);
if (post != null) {
posts.add(post);
}
}
return posts;
}
private CSDNPost getPost(String url) {
System.out.println(“url\t” + url);
Downloader downloader = new Downloader();
String html = downloader.download(url);
if (html == null)
return null;
Document doc = Jsoup.parse(html);
String title = doc.select(“.article_title”).first().text();
String categroy = “Uncategorized”;
Elements link_categories = doc.select(“.article_manage .link_categories”);
if (link_categories != null) {
Element first = link_categories.first();
if (first != null) {
Elements href = first.select(“a”);
if (href != null) {
categroy = href.text();
}
}
}
String postdate = doc.select(“.article_manage .link_postdate”).first().text();
String content = doc.select(“.details .article_content”).first().text();
SimpleDateFormat sdf = new SimpleDateFormat(“yyyy-MM-dd HH:mm”);
CSDNPost post = new CSDNPost();
post.setCategories(new String[] { categroy });
post.setTitle(title);
try {
post.setDateCreated(sdf.parse(postdate));
} catch (ParseException e) {
post.setDateCreated(new Date());
}
post.setDescription(content);
return post;
}
public void publish(CSDNPost post) {
Map<String, Object> struct = new HashMap<String, Object>();
struct.put(“dateCreated”, post.getDateCreated());
struct.put(“description”, post.getDescription());
struct.put(“title”, post.getTitle());
struct.put(“categories”, post.getCategories());
Object[] params = new Object[] { userName, userName, password, struct, true };
String blogid = null;
try {
blogid = (String) client.execute(“metaWeblog.newPost”, params);
} catch (XmlRpcException e) {
e.printStackTrace();
System.out.println(“导入出现错误:title=” + post.getTitle());
}
System.out.println(post.getTitle() + “>> 导入完毕,生成博文id为>>” + blogid);
struct.clear();
}
public static void main(String[] args) throws IOException {
Mover extractor = new Mover(19, “http://youthmemo.com/xmlrpc.php”, “http://blog.csdn.net”, “telnetor”, “admin”,
“xxxx”);
List<CSDNPost> posts = extractor.getPosts();
for (CSDNPost post : posts) {
extractor.publish(post);
try {
TimeUnit.SECONDS.sleep(1);
} catch (InterruptedException e) {
e.printStackTrace();
}
System.out.println(post.getTitle());
}
System.out.println(“done!”);
}
}
下面给出下载类Downloader.java
package cn.mingyuan.csdn2wordpress;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpClientParams;
import org.apache.commons.httpclient.params.HttpMethodParams;
/**
* downloader
*
* @author mingyuan
*
*/
public class Downloader {
private HttpClientParams params = null;
private HttpClient client = null;
/**
* 默认构造函数,初始化一系列变量
*/
public Downloader() {
// 构造HttpClientParams参数
params = new HttpClientParams();
params.setParameter(
HttpClientParams.USER_AGENT,
“Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3 GTBDFff GTB7.0 (.NET CLR 3.5.30729)”);
params.setParameter(HttpClientParams.ALLOW_CIRCULAR_REDIRECTS, false);
params.setParameter(HttpClientParams.MAX_REDIRECTS, 4);
params.setParameter(HttpClientParams.CONNECTION_MANAGER_TIMEOUT, (long) 60 * 1000);
params.setParameter(HttpClientParams.SO_TIMEOUT, 60 * 1000);
// 使用系统提供的默认的恢复策略
params.setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler());
client = new HttpClient(params);
}
/**
* 下载网页
*
* @param url
* 网页url
* @return String类型的网页源码
*/
public String download(String url) {
HttpMethod method = new GetMethod(url);
String sourceCode = null;
method.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
// 读取内容
StringBuilder builder = new StringBuilder();
BufferedReader reader = null;
try {
int statusCode = client.executeMethod(method);
if (statusCode != HttpStatus.SC_OK) {
return null;
}
reader = new BufferedReader(new InputStreamReader(method.getResponseBodyAsStream(), “utf8″));
String line;
while ((line = reader.readLine()) != null) {
builder.append(line + “\r\n”);
}
sourceCode = builder.toString();
} catch (HttpException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
// 释放连接
method.releaseConnection();
client.getHttpConnectionManager().closeIdleConnections(0);
}
return sourceCode;
}
}
最后发出一个pojo,CSDNPost.java
package cn.mingyuan.csdn2wordpress;
import java.util.Date;
/**
* csdn post
*
* @author mingyuan
*
*/
/**
 * Plain data holder for one blog post taken from CSDN.
 *
 * @author mingyuan
 */
public class CSDNPost {
    /** Post title. */
    private String title;
    /** Post body. */
    private String description;
    /** Categories the post belongs to. */
    private String[] categories;
    /** Creation date of the post. */
    private Date dateCreated;

    /** Creates a post with every field unset ({@code null}). */
    public CSDNPost() {
        this(null, null, null, null);
    }

    /**
     * Creates a fully populated post.
     *
     * @param title       post title
     * @param description post body
     * @param categories  categories of the post (stored as given, not copied)
     * @param dateCreated creation date (stored as given, not copied)
     */
    public CSDNPost(String title, String description, String[] categories, Date dateCreated) {
        this.title = title;
        this.description = description;
        this.categories = categories;
        this.dateCreated = dateCreated;
    }

    /** @return the post title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the post title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the post body */
    public String getDescription() {
        return this.description;
    }

    /** @param description the post body */
    public void setDescription(String description) {
        this.description = description;
    }

    /** @return the categories array held by this post */
    public String[] getCategories() {
        return this.categories;
    }

    /** @param categories the categories of the post */
    public void setCategories(String[] categories) {
        this.categories = categories;
    }

    /** @return the creation date held by this post */
    public Date getDateCreated() {
        return this.dateCreated;
    }

    /** @param dateCreated the creation date of the post */
    public void setDateCreated(Date dateCreated) {
        this.dateCreated = dateCreated;
    }
}
以上是全部源码。
在文章的结尾,我愿意跟大家分享一下这个小程序的开发心得。
一开始写这个程序的时候,觉得会很快搞定,因为这个程序无非就是三个过程:采集、解析、发帖。其实也真是这样的一个过程。
这个程序耗费精力比较多的地方是在解析网页、提取链接、标题、内容、发布时间、分类方面。
一开始想用xpath解析网页,并且写好的xpath表达式都已经在chrome上用xpath helper验证通过了。但在编码阶段发现现有的工具包,比如dom4j,并不支持对html的解析;网上看到有通过htmlparser将html转换成xml的方法,但觉得太麻烦。最后发现了JSoup这个非常强大的工具,它可以通过类似jquery和css选择器语法的表达式来提取内容。尝试了一下非常方便,于是解析这个问题就解决了(有个小窍门:chrome浏览器开发者工具可以看某节点的css样式,把这个样式直接传递给jsoup就能提取内容)。
wordpress支持MetaWeblog协议,可以通过XML-RPC进行发帖。关于它们的信息可以通过以下链接找到:
http://en.wikipedia.org/wiki/MetaWeblog
http://en.wikipedia.org/wiki/XML-RPC (可以找到各种语言版本的api)
另外JSoup的地址是:
http://jsoup.org/
程序写得太匆忙,肯定有很多不尽如人意的地方,希望各位指出。我的联系方式是:admin#youthmemo.com。
最近开了个blog站点,想把以前在csdn上面的文章都转移过来,但一直没找到合适的工具,于是周末就自己写了个小程序。
本程序可以完成的工作:转移csdn上面的文章(限于文本内容)到wordpress;不能完成的工作:1、不支持在wordpress上创建分类,所以需要提前在wordpress上手工创建分类(保持与csdn一致);2、不能以很好的格式转移文章,转移之后文章格式需要调整。程序由采集、解析、发帖三部分构成。采集负责将指定url的内容下载下来,解析负责从网页内容中解析出正文链接、标题、发布时间、分类信息,发帖部分负责将解析出来的数据通过rpc发送给wordpress,生成博文。
本程序用到的jar包及其版本如下:
-rw-r--r-- 1 mingyuan mingyuan  46725 2011-09-03 23:05 commons-codec-1.3.jar
-rw-r--r-- 1 mingyuan mingyuan 279781 2011-09-03 23:05 commons-httpclient-3.0.1.jar
-rwxrwxrwx 1 mingyuan mingyuan  52915 2010-05-03 03:39 commons-logging-1.1.jar
-rw-r--r-- 1 mingyuan mingyuan 281579 2011-09-04 01:40 jsoup-1.6.1.jar
-rwxrwxrwx 1 mingyuan mingyuan  34407 2010-05-03 03:39 ws-commons-util-1.0.2.jar
-rwxrwxrwx 1 mingyuan mingyuan  58573 2010-05-03 03:39 xmlrpc-client-3.1.3.jar
-rwxrwxrwx 1 mingyuan mingyuan 109131 2010-05-03 03:39 xmlrpc-common-3.1.3.jar
-rwxrwxrwx 1 mingyuan mingyuan  81555 2010-05-03 03:39 xmlrpc-server-3.1.3.jar
代码很简单,就不解释了,大伙看看即可明白。程序的入口函数是Mover.main
下面先给出主要的类Mover.java
package cn.mingyuan.csdn2wordpress;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import org.apache.xmlrpc.XmlRpcException;
import org.apache.xmlrpc.client.XmlRpcClient;
import org.apache.xmlrpc.client.XmlRpcClientConfigImpl;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
* 采集、解析、转移
*
* @author mingyuan
*
*/
public class Mover {
private int totalPages;
private XmlRpcClientConfigImpl config;
private XmlRpcClient client;
private String baseUrl;
private Object userName;
private Object password;
private String csdnUserName;
public Mover(int totalPages, String blogRpcUrl, String csdnUrl, String csdnUserName, String userName,
String password) {
this.totalPages = totalPages;
this.baseUrl = csdnUrl;
this.csdnUserName = csdnUserName;
this.userName = userName;
this.password = password;
config = new XmlRpcClientConfigImpl();
try {
config.setServerURL(new URL(blogRpcUrl));
} catch (MalformedURLException e) {
System.out.println(“请检查url”);
}
client = new XmlRpcClient();
client.setConfig(config);
}
private List<String> getlinks() {
List<String> list = new LinkedList<String>();
for (int i = 1; i <= totalPages; i++) {
System.out.println(“processing page ” + i);
Downloader downloader = new Downloader();
String content = downloader.download(baseUrl + “/” + csdnUserName + “/article/list/” + i);
if (content == null)
continue;
Document doc = Jsoup.parse(content);
Elements first = doc.select(“.link_title”);
for (int j = 0; j < first.size(); j++) {
Element first2 = first.get(j).select(“a”).first();
String link = baseUrl + first2.attr(“href”);
list.add(link);
System.out.println(“get link\t” + link);
}
System.out.println(“page ” + i + “ extractor done,sleep 2s”);
try {
TimeUnit.SECONDS.sleep(1);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
return list;
}
public List<CSDNPost> getPosts() {
List<String> links = getlinks();
List<CSDNPost> posts = new LinkedList<CSDNPost>();
for (String link : links) {
CSDNPost post = getPost(link);
if (post != null) {
posts.add(post);
}
}
return posts;
}
private CSDNPost getPost(String url) {
System.out.println(“url\t” + url);
Downloader downloader = new Downloader();
String html = downloader.download(url);
if (html == null)
return null;
Document doc = Jsoup.parse(html);
String title = doc.select(“.article_title”).first().text();
String categroy = “Uncategorized”;
Elements link_categories = doc.select(“.article_manage .link_categories”);
if (link_categories != null) {
Element first = link_categories.first();
if (first != null) {
Elements href = first.select(“a”);
if (href != null) {
categroy = href.text();
}
}
}
String postdate = doc.select(“.article_manage .link_postdate”).first().text();
String content = doc.select(“.details .article_content”).first().text();
SimpleDateFormat sdf = new SimpleDateFormat(“yyyy-MM-dd HH:mm”);
CSDNPost post = new CSDNPost();
post.setCategories(new String[] { categroy });
post.setTitle(title);
try {
post.setDateCreated(sdf.parse(postdate));
} catch (ParseException e) {
post.setDateCreated(new Date());
}
post.setDescription(content);
return post;
}
public void publish(CSDNPost post) {
Map<String, Object> struct = new HashMap<String, Object>();
struct.put(“dateCreated”, post.getDateCreated());
struct.put(“description”, post.getDescription());
struct.put(“title”, post.getTitle());
struct.put(“categories”, post.getCategories());
Object[] params = new Object[] { userName, userName, password, struct, true };
String blogid = null;
try {
blogid = (String) client.execute(“metaWeblog.newPost”, params);
} catch (XmlRpcException e) {
e.printStackTrace();
System.out.println(“导入出现错误:title=” + post.getTitle());
}
System.out.println(post.getTitle() + “>> 导入完毕,生成博文id为>>” + blogid);
struct.clear();
}
public static void main(String[] args) throws IOException {
Mover extractor = new Mover(19, “http://youthmemo.com/xmlrpc.php”, “http://blog.csdn.net”, “telnetor”, “admin”,
“xxxx”);
List<CSDNPost> posts = extractor.getPosts();
for (CSDNPost post : posts) {
extractor.publish(post);
try {
TimeUnit.SECONDS.sleep(1);
} catch (InterruptedException e) {
e.printStackTrace();
}
System.out.println(post.getTitle());
}
System.out.println(“done!”);
}
}
下面给出下载类Downloader.java
package cn.mingyuan.csdn2wordpress;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpClientParams;
import org.apache.commons.httpclient.params.HttpMethodParams;
/**
* downloader
*
* @author mingyuan
*
*/
public class Downloader {
private HttpClientParams params = null;
private HttpClient client = null;
/**
* 默认构造函数,初始化一系列变量
*/
public Downloader() {
// 构造HttpClientParams参数
params = new HttpClientParams();
params.setParameter(
HttpClientParams.USER_AGENT,
“Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3 GTBDFff GTB7.0 (.NET CLR 3.5.30729)”);
params.setParameter(HttpClientParams.ALLOW_CIRCULAR_REDIRECTS, false);
params.setParameter(HttpClientParams.MAX_REDIRECTS, 4);
params.setParameter(HttpClientParams.CONNECTION_MANAGER_TIMEOUT, (long) 60 * 1000);
params.setParameter(HttpClientParams.SO_TIMEOUT, 60 * 1000);
// 使用系统提供的默认的恢复策略
params.setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler());
client = new HttpClient(params);
}
/**
* 下载网页
*
* @param url
* 网页url
* @return String类型的网页源码
*/
public String download(String url) {
HttpMethod method = new GetMethod(url);
String sourceCode = null;
method.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
// 读取内容
StringBuilder builder = new StringBuilder();
BufferedReader reader = null;
try {
int statusCode = client.executeMethod(method);
if (statusCode != HttpStatus.SC_OK) {
return null;
}
reader = new BufferedReader(new InputStreamReader(method.getResponseBodyAsStream(), “utf8″));
String line;
while ((line = reader.readLine()) != null) {
builder.append(line + “\r\n”);
}
sourceCode = builder.toString();
} catch (HttpException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
// 释放连接
method.releaseConnection();
client.getHttpConnectionManager().closeIdleConnections(0);
}
return sourceCode;
}
}
最后发出一个pojo,CSDNPost.java
package cn.mingyuan.csdn2wordpress;
import java.util.Date;
/**
* csdn post
*
* @author mingyuan
*
*/
/**
 * Plain data holder for one blog post taken from CSDN.
 *
 * @author mingyuan
 */
public class CSDNPost {
    /** Post title. */
    private String title;
    /** Post body. */
    private String description;
    /** Categories the post belongs to. */
    private String[] categories;
    /** Creation date of the post. */
    private Date dateCreated;

    /** Creates a post with every field unset ({@code null}). */
    public CSDNPost() {
        this(null, null, null, null);
    }

    /**
     * Creates a fully populated post.
     *
     * @param title       post title
     * @param description post body
     * @param categories  categories of the post (stored as given, not copied)
     * @param dateCreated creation date (stored as given, not copied)
     */
    public CSDNPost(String title, String description, String[] categories, Date dateCreated) {
        this.title = title;
        this.description = description;
        this.categories = categories;
        this.dateCreated = dateCreated;
    }

    /** @return the post title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the post title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the post body */
    public String getDescription() {
        return this.description;
    }

    /** @param description the post body */
    public void setDescription(String description) {
        this.description = description;
    }

    /** @return the categories array held by this post */
    public String[] getCategories() {
        return this.categories;
    }

    /** @param categories the categories of the post */
    public void setCategories(String[] categories) {
        this.categories = categories;
    }

    /** @return the creation date held by this post */
    public Date getDateCreated() {
        return this.dateCreated;
    }

    /** @param dateCreated the creation date of the post */
    public void setDateCreated(Date dateCreated) {
        this.dateCreated = dateCreated;
    }
}
以上是全部源码。
在文章的结尾,我愿意跟大家分享一下这个小程序的开发心得。
一开始写这个程序的时候,觉得会很快搞定,因为这个程序无非就是三个过程:采集、解析、发帖。其实也真是这样的一个过程。
这个程序耗费精力比较多的地方是在解析网页、提取链接、标题、内容、发布时间、分类方面。
一开始想用xpath解析网页,并且写好的xpath表达式都已经在chrome上用xpath helper验证通过了。但在编码阶段发现现有的工具包,比如dom4j,并不支持对html的解析;网上看到有通过htmlparser将html转换成xml的方法,但觉得太麻烦。最后发现了JSoup这个非常强大的工具,它可以通过类似jquery和css选择器语法的表达式来提取内容。尝试了一下非常方便,于是解析这个问题就解决了(有个小窍门:chrome浏览器开发者工具可以看某节点的css样式,把这个样式直接传递给jsoup就能提取内容)。
wordpress支持MetaWeblog协议,可以通过XML-RPC进行发帖。关于它们的信息可以通过以下链接找到:
http://en.wikipedia.org/wiki/MetaWeblog
http://en.wikipedia.org/wiki/XML-RPC (可以找到各种语言版本的api)
另外JSoup的地址是:
http://jsoup.org/
程序写得太匆忙,肯定有很多不尽如人意的地方,希望各位指出。我的联系方式是:admin#youthmemo.com。
相关文章推荐
- csdn上的博客搬家到cnblogs了,以后大家请访问http://yjmyzz.cnblogs.com/
- 鉴于CSDN的BLOG的不稳定性,打算搬家.新地址为http://www.cnblogs.com/zhucde/
- 我的博客搬家咯http://www.ismailes.com
- 使用SVN管理VC项目(解决无法访问https://code.google.com/hosting/settings)(服务器为Code Google)(转http://blog.csdn.net/xiadasong007/archive/2010/07/
- 博客搬家了,欢迎访问 http://blog.csdn.net/yinpengxiang/
- from http://csdn.fengyuan.com
- 本博客早已搬家到http://overred.cnblogs.com
- 搬家了啊。。。新地址 http://www.cnblogs.com/playerc/
- 介绍几本数学书 转自:http://shaoweicai.wordpress.com/2009/12/21/%E3%80%90%E8%BD%ACmit%E5%A4%A7%E7%89%9B%E5%8D%
- 博客由http://www.cnblogs.com/misterjoker/搬家到这里
- 博客搬家至:http://zyzhang.github.com
- 已经搬家到: http://www.cnblogs.com/realfun
- 本博客已搬家至www.stalvan.com csdn不再更新
- 博客搬家了......http://orz.miyuoo.com
- js快捷键大全 http://www.zhangxinxu.com/wordpress/?p=1667
- 我的博客搬家了! 新地址:http://www.cnblogs.com/jubincn/
- 弄了一个新的wordpress博客 http://mnlm.comyr.com/
- 搬家了,新地址http://www.cnblogs.com/clayman/
- 博客搬家至github: http://zyzhang.github.com
- 祝大家新年快乐,我的博客搬家了,欢迎大家访问(http://blog.fwhyy.com)