Lucene 自动采集代码（我自己测试的）
2009-02-04 13:36
190 查看
Lucene 自动采集代码（我自己测试的）
Urls.java
package com.Test2;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.net.*;
import java.io.*;
import java.util.regex.*;
/*
根据指定的规则,通过构造正则表达式获取网址
*/
/**
 * Downloads a web page and lists the anchor tags found inside a delimited
 * region of it.
 *
 * <p>Typical use (see {@code main}): construct the object with the begin/end
 * markers of the region, set the start URL, download the page, cut out the
 * region, then call the {@code Urls()} method to print every anchor together
 * with its title and href.
 */
public class Urls {
    /** Matches one anchor element, from {@code <a} up to the nearest {@code /a>}. */
    private static final Pattern ANCHOR_PATTERN = Pattern.compile("<a.*?/a>");
    /** Matches the text between a {@code >} and the closing {@code </a>}. */
    private static final Pattern TITLE_PATTERN = Pattern.compile(">.*?</a>");
    /** Matches the href attribute up to the next {@code >}. */
    private static final Pattern HREF_PATTERN = Pattern.compile("href=.*?>");

    private String startUrl; // URL at which crawling starts
    String urlContent; // raw page text; stays null until a download succeeds
    String ContentArea; // part of urlContent between the begin and end markers
    private String strAreaBegin, strAreaEnd; // markers delimiting the region to crawl
    private String stringInUrl, stringNotInUrl; // stored keywords; not used for filtering yet
    String strContent; // collected content
    String[] allUrls; // all collected URLs
    private String regex; // crawl rule; never assigned in this class
    UrlAndTitle urlAndTitle = new UrlAndTitle(); // holder for a URL and its title

    public static void main(String[] args) {
        Urls myurl = new Urls("<body", "/body>");
        myurl.getStartUrl("http://www.qidian.com/");
        myurl.getUrlContent();
        myurl.getContentArea();
        myurl.getStringInUrl("http://www.qidian.com/");
        myurl.getStringNotInUrl("google");
        myurl.Urls();
    }

    /**
     * Creates a crawler for the region delimited by the two markers.
     *
     * @param strAreaBegin text that marks the start of the region
     * @param strAreaEnd text that marks the end of the region
     */
    public Urls(String strAreaBegin, String strAreaEnd) {
        this.strAreaBegin = strAreaBegin;
        this.strAreaEnd = strAreaEnd;
    }

    /**
     * Prints every anchor found in {@code ContentArea}, followed by its title
     * and href, and finally the number of anchors found.
     *
     * <p>A {@code null} content area (nothing extracted yet) is treated as
     * containing no anchors instead of raising a NullPointerException.
     */
    public void Urls() {
        int count = 0;
        if (ContentArea != null) {
            Matcher anchors = ANCHOR_PATTERN.matcher(ContentArea);
            while (anchors.find()) {
                String anchor = anchors.group();
                System.out.println(anchor);
                count++;
                // Title: strip the delimiters that the pattern had to match.
                Matcher title = TITLE_PATTERN.matcher(anchor);
                while (title.find()) {
                    System.out.println("标题:"
                            + title.group().replaceAll(">|</a>", ""));
                }
                // URL: strip the attribute name and the closing bracket.
                Matcher href = HREF_PATTERN.matcher(anchor);
                while (href.find()) {
                    System.out.println("网址:"
                            + href.group().replaceAll("href=|>", ""));
                }
                System.out.println();
            }
        }
        System.out.println("共有" + count + "个符合结果");
    }

    /**
     * Sets the URL at which crawling starts (a setter despite its name).
     *
     * @param startUrl URL of the page to download
     */
    public void getStartUrl(String startUrl) {
        this.startUrl = startUrl;
    }

    /**
     * Downloads the page at the start URL into {@code urlContent}.
     *
     * <p>Lines are concatenated without line separators. On any failure a
     * message and the stack trace are printed and {@code urlContent} is left
     * unchanged. The reader is always closed.
     */
    public void getUrlContent() {
        StringBuilder page = new StringBuilder();
        BufferedReader reader = null;
        try {
            URL myUrl = new URL(startUrl);
            reader = new BufferedReader(new InputStreamReader(myUrl.openStream()));
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line);
            }
            urlContent = page.toString();
        } catch (Exception e) {
            System.out.println("网址文件未能输出");
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
    }

    /**
     * Extracts the text between the begin and end markers into
     * {@code ContentArea}.
     *
     * <p>If the page has not been downloaded, or either marker cannot be
     * found, {@code ContentArea} becomes the empty string rather than the
     * call failing with an exception.
     */
    public void getContentArea() {
        ContentArea = "";
        if (urlContent == null || strAreaBegin == null || strAreaEnd == null) {
            return;
        }
        int beginIndex = urlContent.indexOf(strAreaBegin);
        if (beginIndex < 0) {
            return;
        }
        int areaStart = beginIndex + strAreaBegin.length();
        // The end marker is only searched for after the begin marker.
        int areaEnd = urlContent.indexOf(strAreaEnd, areaStart);
        if (areaEnd < 0) {
            return;
        }
        ContentArea = urlContent.substring(areaStart, areaEnd);
    }

    // The next two methods store the keyword a URL must contain and the keyword
    // it must not contain. This is only a first experiment; later there should
    // be more than one keyword of each kind.

    /**
     * Stores the keyword that accepted URLs must contain (a setter despite its name).
     *
     * @param stringInUrl required keyword
     */
    public void getStringInUrl(String stringInUrl) {
        this.stringInUrl = stringInUrl;
    }

    /**
     * Stores the keyword that accepted URLs must not contain (a setter despite its name).
     *
     * @param stringNotInUrl forbidden keyword
     */
    public void getStringNotInUrl(String stringNotInUrl) {
        this.stringNotInUrl = stringNotInUrl;
    }

    /** Placeholder for URL extraction; currently does nothing. */
    public void getUrl() {
    }

    /**
     * Returns the crawl rule.
     *
     * @return the {@code regex} field, which this class never assigns
     */
    public String getRegex() {
        return regex;
    }

    /** Simple holder for one URL and its title. */
    class UrlAndTitle {
        String myURL;
        String title;
    }
}
Test.java
package com.Test2;
import java.io.*;
/**
 * Minimal helper that downloads the text behind a URL.
 */
public class Test {
    /**
     * Reads everything behind the given URL and returns it as one string.
     *
     * <p>Lines are concatenated without line separators. If reading fails,
     * the exception's {@code toString()} text is appended to whatever was
     * read so far and the error is reported on standard error; no exception
     * is thrown. The reader is closed on both the success and failure paths.
     *
     * @param domain the URL to read, e.g. {@code http://www.qidian.com}
     * @return the concatenated lines, possibly followed by an exception description
     */
    public static String getWebContent(String domain) {
        System.out.println("开始读取内容...(" + domain + ")");
        StringBuilder sb = new StringBuilder();
        BufferedReader in = null;
        try {
            java.net.URL url = new java.net.URL(domain);
            in = new BufferedReader(new InputStreamReader(url.openStream()));
            String line;
            while ((line = in.readLine()) != null) {
                sb.append(line);
            }
        } catch (Exception e) { // Report any errors that arise
            sb.append(e.toString());
            System.err.println(e);
            System.err.println("Failed to read content from: " + domain);
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
        return sb.toString();
    }

    public static void main(String args[]) {
        System.out.println(getWebContent("http://www.qidian.com"));
    }
}
Test4.java
package com.Test2;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Small experiment that pulls all matches of a regular expression out of a
 * downloaded page.
 */
public class Test4 {
    /**
     * Result of the most recent non-empty {@code matcherStr} call. Kept only
     * so existing readers of this field keep working; the matching itself no
     * longer goes through this shared variable.
     */
    static String txt = "";

    /**
     * Finds every case-insensitive match of {@code cp} in {@code str} and
     * returns the matches joined together, each one followed by {@code s}.
     *
     * @param str text to search; {@code null} or empty yields an empty result
     * @param cp regular expression to look for
     * @param s separator appended after every match
     * @return all matches, each followed by the separator, or an empty string
     */
    public static String matcherStr(String str, String cp, String s) {
        if (str == null || str.equals("")) {
            return "";
        }
        Matcher m = Pattern.compile(cp, Pattern.CASE_INSENSITIVE).matcher(str);
        StringBuilder sb = new StringBuilder();
        // Collect each match followed by the separator.
        while (m.find()) {
            sb.append(m.group());
            sb.append(s);
        }
        String result = sb.toString();
        txt = result;
        return result;
    }

    public static void main(String args[]) {
        // Print every match on its own line.
        System.out.println(matcherStr(Test.getWebContent("http://www.qidian.com/"),
                ".*?小说", "\n"));
    }
}
Urls.java
package com.Test2;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.net.*;
import java.io.*;
import java.util.regex.*;
/*
根据指定的规则,通过构造正则表达式获取网址
*/
/**
 * Downloads a web page and lists the anchor tags found inside a delimited
 * region of it.
 *
 * <p>Typical use (see {@code main}): construct the object with the begin/end
 * markers of the region, set the start URL, download the page, cut out the
 * region, then call the {@code Urls()} method to print every anchor together
 * with its title and href.
 */
public class Urls {
    /** Matches one anchor element, from {@code <a} up to the nearest {@code /a>}. */
    private static final Pattern ANCHOR_PATTERN = Pattern.compile("<a.*?/a>");
    /** Matches the text between a {@code >} and the closing {@code </a>}. */
    private static final Pattern TITLE_PATTERN = Pattern.compile(">.*?</a>");
    /** Matches the href attribute up to the next {@code >}. */
    private static final Pattern HREF_PATTERN = Pattern.compile("href=.*?>");

    private String startUrl; // URL at which crawling starts
    String urlContent; // raw page text; stays null until a download succeeds
    String ContentArea; // part of urlContent between the begin and end markers
    private String strAreaBegin, strAreaEnd; // markers delimiting the region to crawl
    private String stringInUrl, stringNotInUrl; // stored keywords; not used for filtering yet
    String strContent; // collected content
    String[] allUrls; // all collected URLs
    private String regex; // crawl rule; never assigned in this class
    UrlAndTitle urlAndTitle = new UrlAndTitle(); // holder for a URL and its title

    public static void main(String[] args) {
        Urls myurl = new Urls("<body", "/body>");
        myurl.getStartUrl("http://www.qidian.com/");
        myurl.getUrlContent();
        myurl.getContentArea();
        myurl.getStringInUrl("http://www.qidian.com/");
        myurl.getStringNotInUrl("google");
        myurl.Urls();
    }

    /**
     * Creates a crawler for the region delimited by the two markers.
     *
     * @param strAreaBegin text that marks the start of the region
     * @param strAreaEnd text that marks the end of the region
     */
    public Urls(String strAreaBegin, String strAreaEnd) {
        this.strAreaBegin = strAreaBegin;
        this.strAreaEnd = strAreaEnd;
    }

    /**
     * Prints every anchor found in {@code ContentArea}, followed by its title
     * and href, and finally the number of anchors found.
     *
     * <p>A {@code null} content area (nothing extracted yet) is treated as
     * containing no anchors instead of raising a NullPointerException.
     */
    public void Urls() {
        int count = 0;
        if (ContentArea != null) {
            Matcher anchors = ANCHOR_PATTERN.matcher(ContentArea);
            while (anchors.find()) {
                String anchor = anchors.group();
                System.out.println(anchor);
                count++;
                // Title: strip the delimiters that the pattern had to match.
                Matcher title = TITLE_PATTERN.matcher(anchor);
                while (title.find()) {
                    System.out.println("标题:"
                            + title.group().replaceAll(">|</a>", ""));
                }
                // URL: strip the attribute name and the closing bracket.
                Matcher href = HREF_PATTERN.matcher(anchor);
                while (href.find()) {
                    System.out.println("网址:"
                            + href.group().replaceAll("href=|>", ""));
                }
                System.out.println();
            }
        }
        System.out.println("共有" + count + "个符合结果");
    }

    /**
     * Sets the URL at which crawling starts (a setter despite its name).
     *
     * @param startUrl URL of the page to download
     */
    public void getStartUrl(String startUrl) {
        this.startUrl = startUrl;
    }

    /**
     * Downloads the page at the start URL into {@code urlContent}.
     *
     * <p>Lines are concatenated without line separators. On any failure a
     * message and the stack trace are printed and {@code urlContent} is left
     * unchanged. The reader is always closed.
     */
    public void getUrlContent() {
        StringBuilder page = new StringBuilder();
        BufferedReader reader = null;
        try {
            URL myUrl = new URL(startUrl);
            reader = new BufferedReader(new InputStreamReader(myUrl.openStream()));
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line);
            }
            urlContent = page.toString();
        } catch (Exception e) {
            System.out.println("网址文件未能输出");
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
    }

    /**
     * Extracts the text between the begin and end markers into
     * {@code ContentArea}.
     *
     * <p>If the page has not been downloaded, or either marker cannot be
     * found, {@code ContentArea} becomes the empty string rather than the
     * call failing with an exception.
     */
    public void getContentArea() {
        ContentArea = "";
        if (urlContent == null || strAreaBegin == null || strAreaEnd == null) {
            return;
        }
        int beginIndex = urlContent.indexOf(strAreaBegin);
        if (beginIndex < 0) {
            return;
        }
        int areaStart = beginIndex + strAreaBegin.length();
        // The end marker is only searched for after the begin marker.
        int areaEnd = urlContent.indexOf(strAreaEnd, areaStart);
        if (areaEnd < 0) {
            return;
        }
        ContentArea = urlContent.substring(areaStart, areaEnd);
    }

    // The next two methods store the keyword a URL must contain and the keyword
    // it must not contain. This is only a first experiment; later there should
    // be more than one keyword of each kind.

    /**
     * Stores the keyword that accepted URLs must contain (a setter despite its name).
     *
     * @param stringInUrl required keyword
     */
    public void getStringInUrl(String stringInUrl) {
        this.stringInUrl = stringInUrl;
    }

    /**
     * Stores the keyword that accepted URLs must not contain (a setter despite its name).
     *
     * @param stringNotInUrl forbidden keyword
     */
    public void getStringNotInUrl(String stringNotInUrl) {
        this.stringNotInUrl = stringNotInUrl;
    }

    /** Placeholder for URL extraction; currently does nothing. */
    public void getUrl() {
    }

    /**
     * Returns the crawl rule.
     *
     * @return the {@code regex} field, which this class never assigns
     */
    public String getRegex() {
        return regex;
    }

    /** Simple holder for one URL and its title. */
    class UrlAndTitle {
        String myURL;
        String title;
    }
}
Test.java
package com.Test2;
import java.io.*;
/**
 * Minimal helper that downloads the text behind a URL.
 */
public class Test {
    /**
     * Reads everything behind the given URL and returns it as one string.
     *
     * <p>Lines are concatenated without line separators. If reading fails,
     * the exception's {@code toString()} text is appended to whatever was
     * read so far and the error is reported on standard error; no exception
     * is thrown. The reader is closed on both the success and failure paths.
     *
     * @param domain the URL to read, e.g. {@code http://www.qidian.com}
     * @return the concatenated lines, possibly followed by an exception description
     */
    public static String getWebContent(String domain) {
        System.out.println("开始读取内容...(" + domain + ")");
        StringBuilder sb = new StringBuilder();
        BufferedReader in = null;
        try {
            java.net.URL url = new java.net.URL(domain);
            in = new BufferedReader(new InputStreamReader(url.openStream()));
            String line;
            while ((line = in.readLine()) != null) {
                sb.append(line);
            }
        } catch (Exception e) { // Report any errors that arise
            sb.append(e.toString());
            System.err.println(e);
            System.err.println("Failed to read content from: " + domain);
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
        return sb.toString();
    }

    public static void main(String args[]) {
        System.out.println(getWebContent("http://www.qidian.com"));
    }
}
Test4.java
package com.Test2;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Small experiment that pulls all matches of a regular expression out of a
 * downloaded page.
 */
public class Test4 {
    /**
     * Result of the most recent non-empty {@code matcherStr} call. Kept only
     * so existing readers of this field keep working; the matching itself no
     * longer goes through this shared variable.
     */
    static String txt = "";

    /**
     * Finds every case-insensitive match of {@code cp} in {@code str} and
     * returns the matches joined together, each one followed by {@code s}.
     *
     * @param str text to search; {@code null} or empty yields an empty result
     * @param cp regular expression to look for
     * @param s separator appended after every match
     * @return all matches, each followed by the separator, or an empty string
     */
    public static String matcherStr(String str, String cp, String s) {
        if (str == null || str.equals("")) {
            return "";
        }
        Matcher m = Pattern.compile(cp, Pattern.CASE_INSENSITIVE).matcher(str);
        StringBuilder sb = new StringBuilder();
        // Collect each match followed by the separator.
        while (m.find()) {
            sb.append(m.group());
            sb.append(s);
        }
        String result = sb.toString();
        txt = result;
        return result;
    }

    public static void main(String args[]) {
        // Print every match on its own line.
        System.out.println(matcherStr(Test.getWebContent("http://www.qidian.com/"),
                ".*?小说", "\n"));
    }
}
相关文章推荐
- lucene的一段测试代码
- IDEA 整合Junit实现自动生成测试代码
- 用NUnit自动测试.NET代码
- Linux主机本地信息自动采集工具(***测试必备)
- 使用Ant+JUnit+Cobertura来实现代码覆盖自动测试
- imx6q led灯驱动及测试代码ioctl(自动创建设备文件v2)
- ASP下实现自动采集程序及入库的代码
- ASP 自动采集实现代码
- 使用Record Espresso Test功能自动生成测试代码
- Lucene 3.5 测试代码
- imx6q led灯驱动及测试代码(自动创建设备文件v1)
- 在linux上一行代码不用写实现自动采集+hadoop分词
- 在开启多线程时,用junit测试,到连接数据库代码时,junit自动停止,而且没有报错误。
- Coveralls自动测试代码覆盖率
- Python基础-文档测试(自动执行注释中的代码)
- [memo]intelij idea 自动生成测试代码junit设置到maven标准测试目录
- PR自动转PO测试代码
- 测试代码记录(我自己看得,未整理)
- c# 自动发送邮件测试代码