利用cpdetector获取文件编码格式,同时得到网页内容。增加http/https通用方式
2016-06-22 15:20
726 查看
获取网页编码格式,同时得到网页内容。 import info.monitorenter.cpdetector.io.ASCIIDetector; import info.monitorenter.cpdetector.io.CodepageDetectorProxy; import info.monitorenter.cpdetector.io.JChardetFacade; import info.monitorenter.cpdetector.io.ParsingDetector; import info.monitorenter.cpdetector.io.UnicodeDetector; import java.io.BufferedReader; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.URL; public class HtmlContentUtil { private static CodepageDetectorProxy detector = null; static{//获取探测编码器detector对象 detector = CodepageDetectorProxy.getInstance(); detector.add(JChardetFacade.getInstance());// 用到antlr.jar、chardet.jar detector.add(new ParsingDetector(false)); detector.add(ASCIIDetector.getInstance()); detector.add(UnicodeDetector.getInstance()); } /** * @描述: 获取网页内容 * @说明: * @修改时间: 2016年6月22日 下午3:16:55 * @param url * @return * @throws Exception */ public static String getContent(String url) throws Exception { if (!url.contains("http") && !url.contains("https")) { url = "http://" + url; } URL indexUrl = new URL(url); String fileEncode = getFileEncode(indexUrl); if(fileEncode == null){ fileEncode = "utf-8"; } HttpURLConnection httpConn = (HttpURLConnection) indexUrl.openConnection(); InputStreamReader input = new InputStreamReader(httpConn.getInputStream(), fileEncode); BufferedReader bufReader = new BufferedReader(input); String line = ""; StringBuilder contentBuf = new StringBuilder(); while ((line = bufReader.readLine()) != null) { contentBuf.append(line); } String content = contentBuf.toString(); return content; } /** * @描述:利用第三方开源包cpdetector获取文件编码格式 * @说明: * @修改时间: 2016年6月22日 下午3:16:36 * @param indexUrl * @return */ public static String getFileEncode(URL indexUrl) { java.nio.charset.Charset charset = null; try { charset = detector.detectCodepage(indexUrl); } catch (Exception ex) { } if (charset != null) { if (charset.name().equals("void")) { return "GBK";//未知的编码默认为gbk }else{ return charset.name(); } } else{ return null; } } public static void main(String[] args) { try { System.out.println(getContent("www.xjjz.gov.cn")); } catch (Exception e) { e.printStackTrace(); } } }
以上只能获取http协议网站内容,增加一个http、https都能获取的方式
注册http客户端
/**
* 创建httpclient
* @return
*/
public CloseableHttpClient buildHttpClient() {
try {
RegistryBuilder<ConnectionSocketFactory> builder = RegistryBuilder.create();
ConnectionSocketFactory factory = new PlainConnectionSocketFactory();
builder.register("http", factory);
KeyStore trustStore = KeyStore.getInstance(KeyStore.getDefaultType());
SSLContext context = SSLContexts.custom().useTLS().loadTrustMa
4000
terial(trustStore, new TrustStrategy() {
@Override
public boolean isTrusted(X509Certificate[] chain, String authType)
throws CertificateException {
return true;
}
}).build();
LayeredConnectionSocketFactory sslFactory = new SSLConnectionSocketFactory(context, SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
builder.register("https", sslFactory);
Registry<ConnectionSocketFactory> registry = builder.build();
PoolingHttpClientConnectionManager manager = new PoolingHttpClientConnectionManager(registry);
ConnectionConfig connConfig = ConnectionConfig.custom().setCharset(Charset.forName(defaultEncoding)).build();
SocketConfig socketConfig = SocketConfig.custom().setSoTimeout(100000).build();
manager.setDefaultConnectionConfig(connConfig);
manager.setDefaultSocketConfig(socketConfig);
return HttpClientBuilder.create().setConnectionManager(manager).build();
} catch (KeyStoreException e) {
e.printStackTrace();
} catch (KeyManagementException e) {
e.printStackTrace();
} catch (NoSuchAlgorithmException e) {
e.printStackTrace();
}
return null;
}
/** * @描述: 获取网页内容,支持http和https * @说明: * @修改时间: 2016年7月4日 上午10:20:27 * @param url * @return * @throws IOException * @throws ClientProtocolException */ public static String getAllContent(String url) throws ClientProtocolException, IOException { if (!url.contains("http") && !url.contains("https")) { url = "http://" + url; } String fileEncode = getFileEncode(new URL(url)); if(fileEncode == null){ fileEncode = "utf-8"; } CloseableHttpClient buildHttpClient = new HttpUtils().buildHttpClient(); //RequestConfig requestConfig = RequestConfig.custom().setConnectTimeout(5000).setConnectionRequestTimeout(3000).setSocketTimeout(3000).build(); HttpGet httpGet = new HttpGet(url); //httpGet.setConfig(requestConfig); CloseableHttpResponse response = buildHttpClient.execute(httpGet); HttpEntity entity = response.getEntity(); String result = ""; if (entity != null) { result = EntityUtils.toString(entity, Charset.forName(fileEncode)); } return result; }
相关文章推荐
- HttpURLConnection介绍
- C# Http POST请求方法
- volley使用
- 【网络安全】SSL协议(HTTPS) 握手、工作流程详解(双向HTTPS流程)
- Java 十进制和十六制之间的转化(负数的处理)___http://www.cnblogs.com/literoad/archive/2013/01/25/2875908.html
- 卷积神经网络反向传播算法
- 理解 Linux 网络栈(1):Linux 网络协议栈简单总结 图
- centos6.5 使用setup配置网络
- web.xml 中的listener、 filter、servlet 加载顺序及其详解___http://www.cnblogs.com/JesseV/archive/2009/11/17/16050
- OkHttpUtils一个专注于让网络请求更简单的框架
- tcp-backlog配置
- ping(8)命令
- Android Https 双向认证
- Libnids中tcp重组的实现
- mac中apache开启https功能
- Clumsy logo差网络环境模拟工具 Clumsy
- 深度卷积网络CNN与图像语义分割
- 深入理解HTTP Session
- 浅析IMS网络中使用的标识
- java HttpsURLConnection发送https请求