您的位置:首页 > 理论基础 > 计算机网络

根据HTTP和HTML中的字符集下载网页

2016-05-19 14:07 501 查看
参考文章:

http://stulance.iteye.com/blog/1740524

http://blog.csdn.net/it_magician/article/details/9240727

http://hc.apache.org/httpcomponents-client-4.5.x/tutorial/html/fundamentals.html#d5e199

/**
 * Downloads the resource at {@code url}, decodes it to text through the
 * shared response handler {@code rh}, and saves the text to {@code filename}.
 * After writing, {@code BCFileUtils.addBOMHead} is invoked on the saved file.
 *
 * @param url      URL to download
 * @param filename full path of the file the text is saved to
 * @param client   HTTP client used to execute the request; when {@code null}
 *                 nothing is downloaded and an empty string is returned
 * @param context  HTTP context handed to {@code client.execute}
 * @return the text content of the resource; {@code ""} when {@code client}
 *         is {@code null}; {@code null} when the request or the file write fails
 */
public static String downloadFile(String url, String filename,
        CloseableHttpClient client, HttpClientContext context) {
    String content = null;
    try {
        if (client == null) {
            return "";
        }

        HttpGet httpGet = new HttpGet(url);
        try {
            content = client.execute(httpGet, rh, context);
        } catch (HttpResponseException hrex) {
            // rh throws this for status codes >= 300; treated as "no content".
            return null;
        } catch (ClientProtocolException cpex) {
            // Protocol error or response without an entity; treated as "no content".
            return null;
        }

        if (content == null) {
            return null;
        }

        File file = new File(filename);
        // getParentFile() is null for a bare file name (no directory part);
        // in that case there is no directory to create.
        File parentDir = file.getParentFile();
        if (parentDir != null) {
            parentDir.mkdirs();
        }
        BCFileUtils.writeFile(content, filename);

        // Add a BOM header to the file.
        BCFileUtils.addBOMHead(file);
    } catch (HttpHostConnectException exHHCE) {
        CrawlLogger.CrawlInfo("[ERROR]url=" + url + " connection failed");
        return null;
    } catch (UnknownHostException exUHE) {
        CrawlLogger.CrawlInfo("[ERROR]url=" + url + " domain name parsing failed");
        return null;
    } catch (Exception e) {
        // Boundary catch: any other failure is logged and reported as null.
        CrawlLogger.ErrInfo(BCWebUtils.class, e);
        return null;
    }
    return content;
}

// 使用ResponseHandler读取内容
private static ResponseHandler<String> rh = new ResponseHandler<String>() {
@Override
public String handleResponse(
final HttpResponse response) throws IOException {
StatusLine statusLine = response.getStatusLine();
HttpEntity entity = response.getEntity();
if (statusLine.getStatusCode() >= 300) {
throw new HttpResponseException(
statusLine.getStatusCode(),
statusLine.getReasonPhrase());
}
if (entity == null) {
throw new ClientProtocolException("Response contains no content");
}

ByteArrayOutputStream baos = new ByteArrayOutputStream();
byte[] buffer = new byte[4096];
int len;
while ((len = entity.getContent().read(buffer)) > -1 ) {
baos.write(buffer, 0, len);
}
baos.flush();

ContentType contentType = ContentType.getOrDefault(entity);
Charset charset = contentType.getCharse
4000
t();
try {
if(charset == null) {
Reader reader = new InputStreamReader(new ByteArrayInputStream(baos.toByteArray()), "UTF-8");
char[] charBuffer = new char[4096];
int c = 0;
StringBuilder content = new StringBuilder();
while ((c = reader.read(charBuffer)) != -1) {
content.append(charBuffer,0,c);
}
String charsetFromContent = getCharset(content.toString());
charset = Charset.forName(charsetFromContent);
if(charset == null) {
charset = Charset.defaultCharset();
}
}
} catch (Exception ex) {
charset = Charset.defaultCharset();
CrawlLogger.ErrInfo(BCWebUtils.class,ex);
}
Reader reader = new InputStreamReader(new ByteArrayInputStream(baos.toByteArray()), charset);
char[] charBuffer = new char[4096];
int c = 0;
StringBuilder content = new StringBuilder();
while ((c = reader.read(charBuffer)) != -1) {
content.append(charBuffer,0,c);
}

//System.out.print(content.toString());

return content.toString();
}
};

/**
 * Determines the charset name declared inside an HTML document.
 * {@code getCharsetFromContent} is consulted first; only when it yields
 * {@code null} is {@code getCharsetFromMeta} tried.
 *
 * @param content HTML text to inspect
 * @return the declared charset name, or {@code null} when neither lookup finds one
 * @throws Exception propagated from the underlying lookups
 */
public static String getCharset(String content) throws Exception {
    final String fromHttpEquiv = getCharsetFromContent(content);
    return (fromHttpEquiv != null) ? fromHttpEquiv : getCharsetFromMeta(content);
}

// Matches <meta http-equiv="content-type" content="text/html; charset=NAME">.
// NAME must start with a letter or digit and may then contain the characters
// legal in a charset name (letters, digits, '-', '+', '.', ':', '_'), so the
// captured value is never empty and never an illegal name.
private static final Pattern HTTP_EQUIV_CHARSET_PATTERN = Pattern.compile(
        "<meta\\s*http-equiv\\s*=\\s*[\"']content-type[\"']\\s*content\\s*=\\s*[\"']"
                + "text/html\\s*;\\s*charset\\s*=\\s*([a-z\\d][a-z\\d\\-+.:_]*)[\"'>]",
        Pattern.CASE_INSENSITIVE);

/**
 * Extracts the charset from an HTML
 * {@code <meta http-equiv="content-type" content="text/html; charset=...">} tag.
 *
 * @param content HTML text to search
 * @return the charset name exactly as written in the first matching tag, or
 *         {@code null} when no tag matches or the named charset is not
 *         supported by this JVM
 * @throws IOException never thrown by this implementation; kept so existing
 *         callers continue to compile
 */
public static String getCharsetFromContent(String content) throws IOException {
    Matcher matcher = HTTP_EQUIV_CHARSET_PATTERN.matcher(content);
    if (matcher.find()) {
        String charset = matcher.group(1);
        if (Charset.isSupported(charset)) {
            return charset;
        }
    }

    return null;
}

// Matches the HTML5 form <meta charset="NAME"> (quotes optional). A quote
// directly before "charset" is also tolerated so that input accepted by the
// previous pattern (<meta "charset=NAME">) still matches. NAME must start with
// a letter or digit, so the captured value is never empty or illegal.
private static final Pattern META_CHARSET_PATTERN = Pattern.compile(
        "<meta\\s*[\"']?charset\\s*=\\s*[\"']?([a-z\\d][a-z\\d\\-+.:_]*)[\"'\\s/>]",
        Pattern.CASE_INSENSITIVE);

/**
 * Extracts the charset from an HTML {@code <meta charset="...">} tag.
 *
 * @param content HTML text to search
 * @return the charset name exactly as written in the first matching tag, or
 *         {@code null} when no tag matches or the named charset is not
 *         supported by this JVM
 * @throws Exception never thrown by this implementation; kept so existing
 *         callers continue to compile
 */
public static String getCharsetFromMeta(String content) throws Exception {
    Matcher matcher = META_CHARSET_PATTERN.matcher(content);
    if (matcher.find()) {
        String charset = matcher.group(1);
        if (Charset.isSupported(charset)) {
            return charset;
        }
    }

    return null;
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: