您的位置:首页 > 理论基础 > 计算机网络

利用cpdetector获取文件编码格式,同时得到网页内容。增加http/https通用方式

2016-06-22 15:20 726 查看
获取网页编码格式,同时得到网页内容。

import info.monitorenter.cpdetector.io.ASCIIDetector;
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.JChardetFacade;
import info.monitorenter.cpdetector.io.ParsingDetector;
import info.monitorenter.cpdetector.io.UnicodeDetector;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Utility for downloading a web page as text, using cpdetector to guess the
 * character encoding of the page before decoding it.
 */
public class HtmlContentUtil {
    /** Shared cpdetector proxy; detection strategies are registered in the static initializer. */
    private static final CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();

    static {
        // Register the detection strategies on the shared proxy.
        detector.add(JChardetFacade.getInstance()); // needs antlr.jar and chardet.jar
        detector.add(new ParsingDetector(false));
        detector.add(ASCIIDetector.getInstance());
        detector.add(UnicodeDetector.getInstance());
    }

    /**
     * Downloads the page at {@code url} and returns its content as a single string.
     *
     * <p>If {@code url} does not start with {@code http://} or {@code https://}
     * (case-insensitive), {@code http://} is prepended. The encoding is taken from
     * {@link #getFileEncode(URL)}, falling back to {@code utf-8} when detection
     * returns {@code null}. Lines are concatenated without line separators.
     *
     * @param url page address, with or without a scheme; must not be {@code null}
     * @return the page content with line breaks removed
     * @throws Exception if the URL is malformed or the page cannot be read
     */
    public static String getContent(String url) throws Exception {
        URL indexUrl = new URL(ensureScheme(url));
        String fileEncode = getFileEncode(indexUrl);
        if (fileEncode == null) {
            fileEncode = "utf-8";
        }
        HttpURLConnection httpConn = (HttpURLConnection) indexUrl.openConnection();
        BufferedReader bufReader = null;
        try {
            bufReader = new BufferedReader(
                    new InputStreamReader(httpConn.getInputStream(), fileEncode));
            StringBuilder contentBuf = new StringBuilder();
            String line;
            while ((line = bufReader.readLine()) != null) {
                contentBuf.append(line);
            }
            return contentBuf.toString();
        } finally {
            // Release the stream and the connection even when reading fails.
            try {
                if (bufReader != null) {
                    bufReader.close();
                }
            } finally {
                httpConn.disconnect();
            }
        }
    }

    /**
     * Returns {@code url} unchanged when it already starts with an http/https scheme,
     * otherwise prefixes it with {@code http://}.
     *
     * <p>A prefix test is used rather than {@code contains("http")}, which would
     * wrongly skip hosts whose name merely contains "http".
     */
    private static String ensureScheme(String url) {
        boolean hasHttp = url.regionMatches(true, 0, "http://", 0, 7);
        boolean hasHttps = url.regionMatches(true, 0, "https://", 0, 8);
        return (hasHttp || hasHttps) ? url : "http://" + url;
    }

    /**
     * Detects the character encoding of the resource at {@code indexUrl} using the
     * third-party cpdetector library.
     *
     * @param indexUrl resource to inspect
     * @return the detected charset name; {@code "GBK"} when the detector reports
     *         the charset named {@code "void"}; {@code null} when detection fails
     *         or yields no charset
     */
    public static String getFileEncode(URL indexUrl) {
        java.nio.charset.Charset charset = null;
        try {
            charset = detector.detectCodepage(indexUrl);
        } catch (Exception ignored) {
            // Best effort: a failed detection is reported to the caller as null,
            // and callers fall back to a default encoding.
        }
        if (charset == null) {
            return null;
        }
        if ("void".equals(charset.name())) {
            return "GBK"; // unknown encoding defaults to GBK
        }
        return charset.name();
    }

    /** Small manual check: prints the content of a sample page. */
    public static void main(String[] args) {
        try {
            System.out.println(getContent("www.xjjz.gov.cn"));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

以上方式只能获取 http 协议网站的内容,下面增加一种同时支持 http 和 https 的获取方式。

注册http客户端

/**
* 创建httpclient
* @return
*/
public CloseableHttpClient buildHttpClient() {
try {
RegistryBuilder<ConnectionSocketFactory> builder = RegistryBuilder.create();
ConnectionSocketFactory factory = new PlainConnectionSocketFactory();
builder.register("http", factory);
KeyStore trustStore = KeyStore.getInstance(KeyStore.getDefaultType());
SSLContext context = SSLContexts.custom().useTLS().loadTrustMa
4000
terial(trustStore, new TrustStrategy() {
@Override
public boolean isTrusted(X509Certificate[] chain, String authType)
throws CertificateException {
return true;
}
}).build();
LayeredConnectionSocketFactory sslFactory = new SSLConnectionSocketFactory(context, SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
builder.register("https", sslFactory);
Registry<ConnectionSocketFactory> registry = builder.build();

PoolingHttpClientConnectionManager manager = new PoolingHttpClientConnectionManager(registry);
ConnectionConfig connConfig = ConnectionConfig.custom().setCharset(Charset.forName(defaultEncoding)).build();
SocketConfig socketConfig = SocketConfig.custom().setSoTimeout(100000).build();
manager.setDefaultConnectionConfig(connConfig);
manager.setDefaultSocketConfig(socketConfig);
return HttpClientBuilder.create().setConnectionManager(manager).build();
} catch (KeyStoreException e) {
e.printStackTrace();
} catch (KeyManagementException e) {
e.printStackTrace();
} catch (NoSuchAlgorithmException e) {
e.printStackTrace();
}
return null;
}
/**
 * Downloads the page at {@code url} over http or https and returns its content.
 *
 * <p>If {@code url} does not start with {@code http://} or {@code https://}
 * (case-insensitive), {@code http://} is prepended. The body is decoded with
 * the encoding returned by {@code getFileEncode}, falling back to
 * {@code utf-8} when that returns {@code null}. The response and the client
 * created for this call are closed before returning.
 *
 * @param url page address, with or without a scheme; must not be {@code null}
 * @return the response body, or an empty string when the response has no entity
 * @throws IOException if the client cannot be created or the request fails
 * @throws ClientProtocolException on an HTTP protocol error
 */
public static String getAllContent(String url) throws ClientProtocolException, IOException {
    // Prefix test instead of contains("http"), which would wrongly skip hosts
    // whose name merely contains "http".
    boolean hasScheme = url.regionMatches(true, 0, "http://", 0, 7)
            || url.regionMatches(true, 0, "https://", 0, 8);
    if (!hasScheme) {
        url = "http://" + url;
    }
    String fileEncode = getFileEncode(new URL(url));
    if (fileEncode == null) {
        fileEncode = "utf-8";
    }
    CloseableHttpClient buildHttpClient = new HttpUtils().buildHttpClient();
    if (buildHttpClient == null) {
        // buildHttpClient() returns null when the SSL context could not be created.
        throw new IOException("Failed to create HTTP client for " + url);
    }
    try {
        HttpGet httpGet = new HttpGet(url);
        CloseableHttpResponse response = buildHttpClient.execute(httpGet);
        try {
            HttpEntity entity = response.getEntity();
            String result = "";
            if (entity != null) {
                result = EntityUtils.toString(entity, Charset.forName(fileEncode));
            }
            return result;
        } finally {
            response.close();
        }
    } finally {
        // The client is created per call, so it must be closed per call.
        buildHttpClient.close();
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: