您的位置:首页 > Web前端 > HTML

网页爬取：String 转为 Html 形式

2015-11-27 17:42 411 查看
/**
 * Rewrites every match of {@code pattern} found in {@code html}.
 *
 * <p>For each match, capture group 1 is copied through unchanged and capture
 * group 2 is handed to {@code canonicalizeUrl} together with {@code url}; the
 * value that call returns is emitted wrapped in double quotes. Text outside
 * the matches is copied verbatim.
 *
 * @param html    markup to scan
 * @param url     second argument passed to {@code canonicalizeUrl} for every match
 * @param pattern regex with at least two capture groups
 * @return the rewritten markup, or the very same {@code html} reference when
 *         the pattern never matches
 */
public static String replaceByPattern(String html, String url, Pattern pattern) {
    Matcher hrefMatcher = pattern.matcher(html);
    if (!hrefMatcher.find()) {
        // Nothing to rewrite: hand the input back untouched.
        return html;
    }

    StringBuilder rewritten = new StringBuilder();
    int copiedUpTo = 0;
    do {
        // Everything between the previous match and this one is kept as-is.
        rewritten.append(html, copiedUpTo, hrefMatcher.start());
        rewritten.append(hrefMatcher.group(1));
        rewritten.append("\"")
                .append(canonicalizeUrl(hrefMatcher.group(2), url))
                .append("\"");
        copiedUpTo = hrefMatcher.end();
    } while (hrefMatcher.find());

    // Tail of the document after the last match.
    rewritten.append(html, copiedUpTo, html.length());
    return rewritten.toString();
}

// Both patterns capture (1) the tag text up to and including "href=" and
// (2) the attribute value. The whitespace required in front of "href" makes
// sure only a standalone attribute name is matched: without it, "data-href="
// or a "?href=" query parameter inside an already quoted URL would be picked
// up, and the second pass would inject a second quoted URL into the first.
private static final Pattern patternForHrefWithQuote = Pattern.compile("(<a[^<>]*\\shref=)[\"']([^\"'<>]*)[\"']", Pattern.CASE_INSENSITIVE);
private static final Pattern patternForHrefWithoutQuote = Pattern.compile("(<a[^<>]*\\shref=)([^\"'<>\\s]+)", Pattern.CASE_INSENSITIVE);

/**
 * Runs {@code replaceByPattern} over the markup twice: first for href values
 * written in single or double quotes, then for unquoted href values.
 *
 * <p>Because {@code replaceByPattern} always emits the value in double quotes,
 * the second pass does not touch links already handled by the first one.
 *
 * @param html markup whose anchor hrefs should be rewritten
 * @param url  passed through to {@code replaceByPattern} for every link
 * @return the markup after both passes
 */
public static String fixAllRelativeHrefs(String html, String url) {
    String quotedFixed = replaceByPattern(html, url, patternForHrefWithQuote);
    return replaceByPattern(quotedFixed, url, patternForHrefWithoutQuote);
}


/**
 * Builds an {@code Html} object from a raw page body.
 *
 * <p>The body is first passed through {@code UrlUtils.fixAllRelativeHrefs}
 * together with {@code url}; the result is given to the {@code Html}
 * constructor.
 *
 * @param body raw page markup
 * @param url  forwarded to {@code UrlUtils.fixAllRelativeHrefs}
 * @return a new {@code Html} wrapping the processed markup
 */
public static Html getHtml(String body, String url) {
    return new Html(UrlUtils.fixAllRelativeHrefs(body, url));
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: