您的位置:首页 > 其它

使用正则表达式抓取网易云课堂中的数据

2017-07-22 15:17 274 查看
 要抓取数据的页面如下:



代码:

package com.url;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal line-oriented scraper for a study.163.com course search page.
 *
 * <p>Each {@code Find*} method downloads the page behind the given URL, scans it one
 * line at a time and prints what it finds to standard output. The page is decoded as
 * UTF-8 and every read is bounded by a 10 second timeout. Readers are always closed,
 * even when reading fails part-way through.
 */
public class PaChong {
    /** Links collected by {@link #FindOne(URL)}, in the order they were found. */
    static Vector<String> url1 = new Vector<>();

    /** Prefix prepended to every relative "about" link found on the page. */
    private static final String BASE_URL = "http://study.163.com/";

    /** Substring that identifies a line holding an "about" link. */
    private static final String ABOUT_MARKER = "about/aboutus.htm#/about?";

    /** Read timeout applied to every connection, in milliseconds. */
    private static final int READ_TIMEOUT_MS = 10000;

    /** One or more consecutive characters in the range U+4E00..U+9FA5; compiled once. */
    private static final Pattern CHINESE_RUN = Pattern.compile("([\u4e00-\u9fa5]+)");

    /** Opens the URL with the read timeout applied and wraps its body in a UTF-8 reader. */
    private static BufferedReader openReader(URL url) throws java.io.IOException {
        URLConnection conn = url.openConnection();
        conn.setReadTimeout(READ_TIMEOUT_MS);
        return new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"));
    }

    /**
     * Collects every "about" link on the page into {@link #url1} and prints each one.
     *
     * <p>A link runs from the marker to the next double quote on the same line. When the
     * line holds no double quote after the marker, the rest of the line is used as is.
     *
     * @param url page to scan
     * @throws Exception if the page cannot be opened or read
     */
    public static void FindOne(URL url) throws Exception {
        try (BufferedReader reader = openReader(url)) {
            String line;
            while ((line = reader.readLine()) != null) {
                int markerAt = line.indexOf(ABOUT_MARKER);
                if (markerAt < 0) {
                    continue;
                }
                String link = BASE_URL + line.substring(markerAt);
                // Cut at the closing quote of the attribute; an explicit check replaces
                // the former reliance on a caught StringIndexOutOfBoundsException.
                int quoteAt = link.indexOf('"');
                if (quoteAt >= 0) {
                    link = link.substring(0, quoteAt);
                }
                url1.add(link);
                System.out.println(link);
            }
        }
    }

    /**
     * Prints the Chinese text of every line that contains a {@code <title>} tag.
     *
     * @param url page to scan
     * @throws Exception if the page cannot be opened or read
     */
    public static void FindTitle(URL url) throws Exception {
        try (BufferedReader reader = openReader(url)) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.indexOf("<title>") >= 0) {
                    System.out.println(PaChong.getChinese(line));
                }
            }
        }
    }

    /**
     * Prints the Chinese text of every line that carries a {@code data-name=} attribute.
     *
     * <p>Extraction starts at the first double quote on the line. A line without any
     * double quote is scanned from the attribute onward instead of failing.
     *
     * @param url page to scan
     * @throws Exception if the page cannot be opened or read
     */
    public static void FindContent(URL url) throws Exception {
        try (BufferedReader reader = openReader(url)) {
            String line;
            while ((line = reader.readLine()) != null) {
                int attrAt = line.indexOf("data-name=");
                if (attrAt < 0) {
                    continue;
                }
                // indexOf returns -1 when the line has no quote; substring(-1) would throw.
                int quoteAt = line.indexOf('"');
                String content = line.substring(quoteAt >= 0 ? quoteAt : attrAt);
                System.out.println(PaChong.getChinese(content));
            }
        }
    }

    /**
     * Extracts the runs of Chinese characters from a string.
     *
     * @param paramValue text to scan; must not be {@code null}
     * @return every run of characters in U+4E00..U+9FA5, in order, each followed by two
     *         spaces; the empty string when there is none
     */
    public static String getChinese(String paramValue) {
        StringBuilder found = new StringBuilder();
        Matcher matcher = CHINESE_RUN.matcher(paramValue);
        while (matcher.find()) {
            found.append(matcher.group(0)).append("  ");
        }
        return found.toString();
    }

    /**
     * Runs the three scans against the hard-coded search page for the keyword "JAVA".
     * Section headings go to standard error, results to standard output.
     *
     * @param args ignored
     * @throws Exception if the page cannot be opened or read
     */
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://study.163.com/courses-search?keyword=JAVA");  // page to crawl
        System.err.println("提取的相关介绍网页如下:");
        FindOne(url);
        System.err.println("提取的网页标题如下:");
        FindTitle(url);
        System.err.println("提取的网页内容如下:");
        FindContent(url);
    }

}

截图:

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: