根据HTTP和HTML中的字符集下载网页
2016-05-19 14:07
501 查看
参考文章:
http://stulance.iteye.com/blog/1740524
http://blog.csdn.net/it_magician/article/details/9240727
http://hc.apache.org/httpcomponents-client-4.5.x/tutorial/html/fundamentals.html#d5e199
http://stulance.iteye.com/blog/1740524
http://blog.csdn.net/it_magician/article/details/9240727
http://hc.apache.org/httpcomponents-client-4.5.x/tutorial/html/fundamentals.html#d5e199
/** * 根据url下载文件,保存到文件中,文件全路径为filename * @param url 下载URL * @param filename 完整的文件名 * @return 文件的文本内容 */ public static String downloadFile(String url, String filename, CloseableHttpClient client, HttpClientContext context) { String content = null; try { if(client == null) { return ""; } HttpGet httpGet = new HttpGet(url); try { content = client.execute(httpGet,rh,context); } catch (HttpResponseException hrex) { return null; } catch (ClientProtocolException cpex) { return null; } if(content == null) { return null; } File file = new File(filename); file.getParentFile().mkdirs(); BCFileUtils.writeFile(content,filename); // 尾文件增加BOM头 BCFileUtils.addBOMHead(file); } catch (HttpHostConnectException exHHCE) { CrawlLogger.CrawlInfo("[ERROR]url=" + url + " connection failed"); return null; } catch(UnknownHostException exUHE) { CrawlLogger.CrawlInfo("[ERROR]url=" + url + " domain name parsing failed"); return null; } catch (Exception e) { CrawlLogger.ErrInfo(BCWebUtils.class,e); return null; } return content; } // 使用ResponseHandler读取内容 private static ResponseHandler<String> rh = new ResponseHandler<String>() { @Override public String handleResponse( final HttpResponse response) throws IOException { StatusLine statusLine = response.getStatusLine(); HttpEntity entity = response.getEntity(); if (statusLine.getStatusCode() >= 300) { throw new HttpResponseException( statusLine.getStatusCode(), statusLine.getReasonPhrase()); } if (entity == null) { throw new ClientProtocolException("Response contains no content"); } ByteArrayOutputStream baos = new ByteArrayOutputStream(); byte[] buffer = new byte[4096]; int len; while ((len = entity.getContent().read(buffer)) > -1 ) { baos.write(buffer, 0, len); } baos.flush(); ContentType contentType = ContentType.getOrDefault(entity); Charset charset = contentType.getCharse 4000 t(); try { if(charset == null) { Reader reader = new InputStreamReader(new ByteArrayInputStream(baos.toByteArray()), "UTF-8"); char[] charBuffer = new char[4096]; int c = 
0; StringBuilder content = new StringBuilder(); while ((c = reader.read(charBuffer)) != -1) { content.append(charBuffer,0,c); } String charsetFromContent = getCharset(content.toString()); charset = Charset.forName(charsetFromContent); if(charset == null) { charset = Charset.defaultCharset(); } } } catch (Exception ex) { charset = Charset.defaultCharset(); CrawlLogger.ErrInfo(BCWebUtils.class,ex); } Reader reader = new InputStreamReader(new ByteArrayInputStream(baos.toByteArray()), charset); char[] charBuffer = new char[4096]; int c = 0; StringBuilder content = new StringBuilder(); while ((c = reader.read(charBuffer)) != -1) { content.append(charBuffer,0,c); } //System.out.print(content.toString()); return content.toString(); } }; public static String getCharset(String content) throws Exception { String charset = getCharsetFromContent(content); if (charset == null) { charset = getCharsetFromMeta(content); } return charset; } public static String getCharsetFromContent(String content) throws IOException { String pattern = "\\<meta\\s*http-equiv=[\\\"\\']content-type[\\\"\\']\\s*content\\s*=\\s*[\"']" + "text/html\\s*;\\s*charset=([a-z\\d\\-]*)[\\\"\\'\\>]"; Matcher matcher = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher(content); if (matcher.find()) { String charset = matcher.group(1); if (Charset.isSupported(charset)) { return charset; } } return null; } public static String getCharsetFromMeta(String content) throws Exception { String pattern = "\\<meta\\s*[\\\"\\']charset=([a-z\\d\\-]*)[\\\"\\'\\>]"; Matcher matcher = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher(content); if (matcher.find()) { String charset = matcher.group(1); if (Charset.isSupported(charset)) { return charset; } } return null; }
相关文章推荐
- git使用方法
- TCP/IP之大明王朝邮差
- 二,使用框架完成http_python服务器
- Windows网络编程相关概念
- fedora配置网络
- 电脑无法自动获取ip
- Android端通过HttpClient连接Web端
- 太空飞行计划问题[网络流24题之2]
- HTTP 2.0 与 tomcat
- Android设置代理访问网络Proxy
- 【LINUX网络编程】Makefile文件
- Java网络爬虫基础
- [javaEE] http协议详细
- 使用WinINet和WinHTTP实现Http访问
- Android 网络框架
- android http——OkHttp使用详解
- 神经网络数据预处理,正则化与损失函数
- C#模拟Http请求时出现“基础连接已经关闭: 未能为 SSL/TLS 安全通道建立信任关系”
- ../ 路径 如http://www.aa.com/aa/../bb.html
- Loss和神经网络训练