您的位置:首页 > 编程语言 > Java开发

Java 根据正则表达式解析 HTML 网页内容

2017-05-25 12:15 579 查看
仅供参考:

import java.io.Closeable;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.cms.common.entity.HttpRespons;

/**
 * Regex-based scraper for lyric search results on www.lrcgc.com, plus a few generic helpers
 * for pulling links, tag text and attribute values out of an HTML string.
 *
 * <p>Pages are fetched with {@code HttpRequester.sendGet(String)} and read through
 * {@code HttpRespons.getContent()}; both types are declared outside this file.
 *
 * <p>Regular expressions are a pragmatic but fragile way to read HTML: every pattern here
 * assumes reasonably well-formed markup and will silently miss anything unusual.
 */
public class test {

    private static final Logger LOG = Logger.getLogger(test.class.getName());

    /** Site root; always ends with a single slash. */
    private static final String BASE_URL = "http://www.lrcgc.com/";

    /** Search endpoint; the song name is appended as the {@code q} parameter. */
    private static final String SEARCH_URL = BASE_URL + "so/?q=";

    /** Directory prefix used by {@link #downloadFile(String, String)}. */
    private static final String DEFAULT_DOWNLOAD_DIR = "C:\\Users\\Administrator\\Desktop\\";

    private static final int BUFFER_SIZE = 1024;

    /** Search-result anchor of a song, e.g. {@code <a href="/lyric-26228-242158.html" ...>}. */
    private static final String LYRIC_LINK_REGEX = "<a[^>]*href=\"/lyric-[^>]*>.*?</a>";

    /** Search-result anchor of a singer (links to {@code /songlist-...}). */
    private static final String SINGER_LINK_REGEX = "<a[^>]*href=\"/songlist-[^>]*>.*?</a>";

    /** Anchor carrying the lyric download link on a song page. */
    private static final String DOWNLOAD_LINK_REGEX = "<a[^>]*id=\"J_downlrc\"[^>]*>.*?</a>";

    /** Any HTML tag; the negated class also spans line breaks inside a tag. */
    private static final Pattern TAG = Pattern.compile("<[^>]*>");

    private static final Pattern TITLE = Pattern.compile(
            "<title(?:\\s[^>]*)?>(.*?)</title>", Pattern.DOTALL | Pattern.CASE_INSENSITIVE);

    private static final Pattern LINK = Pattern.compile(
            "<a[^>]*href=(\"([^\"]*)\"|\'([^\']*)\'|([^\\s>]*))[^>]*>(.*?)</a>", Pattern.DOTALL);

    /** Demo entry point: runs one search; the result is not printed. */
    public static void main(String[] args) {
        getLyric("逐浪飞花");
    }

    /**
     * Searches the site for a song and collects the downloadable lyric entries.
     *
     * <p>Each returned map has the keys {@code "singerName"}, {@code "songName"} and
     * {@code "songUrl"} (the lyric download link, or an empty string when it could not be
     * resolved). One extra page request is made per search hit to resolve that link.
     *
     * <p>Any failure is logged and the entries collected so far are returned, so callers
     * never see an exception from this method.
     *
     * @param songName song title to search for
     * @return the entries found; possibly empty, never {@code null}
     */
    public static List<Map<String, String>> getLyric(String songName) {
        List<Map<String, String>> reqMap = new ArrayList<Map<String, String>>();
        try {
            HttpRequester request = new HttpRequester();
            // NOTE(review): songName is appended without URL encoding. Confirm whether
            // HttpRequester.sendGet encodes the query before adding encoding here.
            HttpRespons hr = request.sendGet(SEARCH_URL + songName);
            String content = hr.getContent();

            // Read href and text from the same anchor so URL and title cannot drift apart.
            List<String> songUrlList = new ArrayList<String>();
            List<String> songNameList = new ArrayList<String>();
            for (String anchor : getContentByRegex(content, LYRIC_LINK_REGEX)) {
                List<String> hrefs = match(anchor, "a", "href");
                if (hrefs.isEmpty()) {
                    continue;
                }
                songUrlList.add(hrefs.get(0));
                songNameList.add(decodeAmpersand(stripTags(anchor)));
            }

            List<String> singerList = new ArrayList<String>();
            for (String anchor : getContentByRegex(content, SINGER_LINK_REGEX)) {
                singerList.add(decodeAmpersand(stripTags(anchor)));
            }

            // Songs and singers are paired purely by position in the page, so only walk the
            // range both lists cover instead of running off the end of the shorter one.
            int count = Math.min(singerList.size(), songUrlList.size());
            for (int i = 0; i < count; i++) {
                Map<String, String> map = new HashMap<String, String>();
                map.put("singerName", singerList.get(i));
                map.put("songName", songNameList.get(i));
                map.put("songUrl", geciDownlrc(songUrlList.get(i)));
                reqMap.add(map);
            }
        } catch (Exception e) {
            // Best effort: keep whatever was collected before the failure.
            LOG.log(Level.WARNING, "Lyric search failed for: " + songName, e);
        }
        return reqMap;
    }

    /**
     * Opens a song page and extracts the lyric download link from the anchor whose id is
     * {@code J_downlrc}.
     *
     * @param songUrl site-relative path of the song page, e.g. {@code /lyric-26228-242158.html}
     * @return the absolute download URL, or an empty string when the page could not be
     *         fetched or contains no such link
     */
    private static String geciDownlrc(String songUrl) {
        try {
            HttpRequester request = new HttpRequester();
            String urlNameString = toAbsoluteUrl(songUrl);
            HttpRespons hr = request.sendGet(urlNameString);
            String content = hr.getContent();

            for (String anchor : getContentByRegex(content, DOWNLOAD_LINK_REGEX)) {
                List<String> hrefs = match(anchor, "a", "href");
                if (!hrefs.isEmpty()) {
                    // The href is HTML-escaped in the page source; undo "&amp;" before use.
                    return toAbsoluteUrl(decodeAmpersand(hrefs.get(0)));
                }
            }
        } catch (Exception e) {
            LOG.log(Level.WARNING, "Could not resolve lyric download link for: " + songUrl, e);
        }
        return "";
    }

    /**
     * Downloads the resource behind a URL into the default download directory.
     *
     * <p>Failures are logged, not thrown.
     *
     * @param urlString URL of the file to download
     * @param fileName  name of the local file to write
     */
    public static void downloadFile(String urlString, String fileName) {
        copyUrlToFile(urlString, new File(DEFAULT_DOWNLOAD_DIR + fileName));
    }

    /**
     * Downloads the resource behind a URL into the given directory.
     *
     * <p>Failures are logged, not thrown.
     *
     * @param urlString URL of the file to download
     * @param fileName  name of the local file to write
     * @param directory directory the file is written into
     */
    public static void downloadFile(String urlString, String fileName, String directory) {
        copyUrlToFile(urlString, new File(directory, fileName));
    }

    /** Streams the content of {@code urlString} into {@code target}, always closing both ends. */
    private static void copyUrlToFile(String urlString, File target) {
        DataInputStream in = null;
        FileOutputStream out = null;
        try {
            in = new DataInputStream(new URL(urlString).openStream());
            out = new FileOutputStream(target);
            byte[] buffer = new byte[BUFFER_SIZE];
            int length;
            // read() signals end of stream with -1.
            while ((length = in.read(buffer)) != -1) {
                out.write(buffer, 0, length);
            }
        } catch (MalformedURLException e) {
            LOG.log(Level.WARNING, "Invalid download URL: " + urlString, e);
        } catch (IOException e) {
            LOG.log(Level.WARNING, "Download failed: " + urlString + " -> " + target, e);
        } finally {
            closeQuietly(in);
            closeQuietly(out);
        }
    }

    /**
     * Extracts the page title.
     *
     * <p>The text of every {@code <title>} element is concatenated in document order, with
     * nested tags removed. Matching is case-insensitive and spans line breaks.
     *
     * @param html page source; {@code null} is treated as empty
     * @return the title text, or an empty string when there is none
     */
    public static String getTitle(String html) {
        if (html == null) {
            return "";
        }
        StringBuilder title = new StringBuilder();
        Matcher ma = TITLE.matcher(html);
        while (ma.find()) {
            title.append(stripTags(ma.group(1)));
        }
        return title.toString();
    }

    /**
     * Returns the text of every fragment matched by {@code regex}, with all tags removed.
     *
     * <p>The pattern is compiled with {@link Pattern#DOTALL}, the same flag
     * {@link #getContentByRegex(String, String)} uses, so both methods see the same matches.
     *
     * @param html  text to search; {@code null} yields an empty list
     * @param regex regular expression selecting the fragments
     * @return tag-free text of each match, in order of appearance
     */
    public static List<String> getLabelValues(String html, String regex) {
        List<String> values = new ArrayList<String>();
        for (String fragment : getContentByRegex(html, regex)) {
            values.add(stripTags(fragment));
        }
        return values;
    }

    /**
     * Returns every fragment of {@code s} matched by {@code regex}.
     *
     * <p>The pattern is compiled with {@link Pattern#DOTALL}, so {@code .} also matches line
     * terminators.
     *
     * @param s     text to search; {@code null} yields an empty list
     * @param regex regular expression to apply
     * @return the matched fragments, in order of appearance
     */
    public static List<String> getContentByRegex(String s, String regex) {
        List<String> list = new ArrayList<String>();
        if (s == null) {
            return list;
        }
        Matcher ma = Pattern.compile(regex, Pattern.DOTALL).matcher(s);
        while (ma.find()) {
            list.add(ma.group());
        }
        return list;
    }

    /**
     * Returns every complete {@code <a ...>...</a>} element that has an {@code href}.
     *
     * @param s page source; {@code null} yields an empty list
     * @return the matched anchor elements, in order of appearance
     */
    public List<String> getLink(final String s) {
        final List<String> list = new ArrayList<String>();
        if (s == null) {
            return list;
        }
        final Matcher ma = LINK.matcher(s);
        while (ma.find()) {
            list.add(ma.group());
        }
        return list;
    }

    /**
     * Collects the values of one attribute from every occurrence of one HTML element.
     *
     * <p>Tag and attribute names are matched literally and case-insensitively. Values may be
     * double-quoted, single-quoted or unquoted; quoted values may contain whitespace.
     *
     * @param source  text to search; {@code null} yields an empty list
     * @param element tag name, e.g. {@code "a"}
     * @param attr    attribute name, e.g. {@code "href"}
     * @return the attribute values, in order of appearance
     */
    public static List<String> match(String source, String element, String attr) {
        List<String> result = new ArrayList<String>();
        if (source == null) {
            return result;
        }
        // The whitespace right after the tag name keeps "a" from matching "<abbr ...>", and
        // the whitespace before the attribute keeps "href" from matching "data-href".
        String reg = "<" + Pattern.quote(element) + "\\s(?:[^<>]*?\\s)?" + Pattern.quote(attr)
                + "\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'<>]+))";
        Matcher m = Pattern.compile(reg, Pattern.CASE_INSENSITIVE).matcher(source);
        while (m.find()) {
            // Exactly one of the three alternatives took part in the match.
            String value = m.group(1);
            if (value == null) {
                value = m.group(2);
            }
            if (value == null) {
                value = m.group(3);
            }
            result.add(value);
        }
        return result;
    }

    /** Removes every HTML tag from {@code html}, leaving only the text between tags. */
    private static String stripTags(String html) {
        return TAG.matcher(html).replaceAll("");
    }

    /** Turns the HTML entity {@code &amp;} back into a plain ampersand. */
    private static String decodeAmpersand(String text) {
        return text.replace("&amp;", "&");
    }

    /**
     * Resolves a site-relative path against {@link #BASE_URL} without doubling the slash.
     * Paths that already start with {@code http://} or {@code https://} are returned as is.
     */
    private static String toAbsoluteUrl(String path) {
        if (path.startsWith("http://") || path.startsWith("https://")) {
            return path;
        }
        int start = 0;
        while (start < path.length() && path.charAt(start) == '/') {
            start++;
        }
        return BASE_URL + path.substring(start);
    }

    /** Closes {@code resource} if it is non-null; a failure to close is logged, not thrown. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            LOG.log(Level.WARNING, "Failed to close stream", e);
        }
    }

}


上面代码中 `HttpRespons hr = request.sendGet(urlNameString);` 用到的 sendGet 方法,
请参考:http://blog.csdn.net/qq_27292113/article/details/71534346 ,这里面有详细的代码。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: