您的位置:首页 > 理论基础 > 计算机网络

使用httpclient获取其他网站数据(含解析验证码)

2016-08-31 15:18 423 查看
**使用httpclient获取其他网站数据**


使用httpclient模拟浏览器请求网站加载个人诉讼记录信息接口;

总结:1. 系统上线后如果运行在 linux 上,则通过调用 python 命令来识别验证码:先将验证码图片保存到本地,识别完成后再删除。这种方式需要一个 python 脚本,代码贴在下文。

2. 如果系统运行在 windows 上,可以使用下文提供的 OCR 封装类,直接调用即可识别验证码;这种方式需要用到一个工具类,下载地址:http://download.csdn.net/download/qq_23339149/9617921

接口类:


import com.alibaba.fastjson.JSONObject;
import com.aweb.platform.util.StringUtils;
import com.dbn.sysmodule.util.IdcardUtils;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Value;

import java.io.*;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
* Created by warming on 2016/8/31 with IntelliJ IDEA.
*/
public class OuterInformationServiceImpl implements com.dbn.remote.service.OuterInformationService {
private Logger log = Logger.getLogger(OuterInformationServiceImpl.class);

@Value("${zhiXingIndexUrl}")
private String zhiXingIndexUrl;
@Value("${verificationCodeUrl}")
private String verificationCodeUrl;
@Value("${zhiXingSearchUrl}")
private String zhiXingSearchUrl;
@Val
4000
ue("${zhiXingSearchUserAgent}")
private String zhiXingSearchUserAgent;

public String getPersonLitigationRecords(String pName, String cardNum) throws Exception {
String jsonStr = null;
try {
if (StringUtils.checkStr(pName) && IdcardUtils.validateCard(cardNum)) {
HttpClient client = new HttpClient();
GetMethod method = null;
loadIndex(method, client);//模拟加载首页
String htmlResponse = getRecords(client, method, pName, cardNum);
int count = 0;
Boolean success = false;
while (count < 5) { //请求次数
if (htmlResponse.contains("验证码错误")) {
htmlResponse = getRecords(client, method, pName, cardNum);
count++;
} else {
success = true;
break;
}
}
log.info("验证码解析错误次数:" + count);
if (!success) {
return "查询失败";
}
jsonStr = getJsonStrByHtml(htmlResponse);
log.info("查询结果:" + jsonStr);
}
} catch (Exception e) {
e.printStackTrace();
throw new Exception("查询个人诉讼记录错误!", e);
}
return jsonStr;
}

private String getRecords(HttpClient client, GetMethod method, String pName, String cardNum) throws Exception {
//加载验证码
method = new GetMethod(verificationCodeUrl);
method.addRequestHeader("User-Agent", zhiXingSearchUserAgent);
client.executeMethod(method);
//通过linux调用python命令执行;
String fileName = "/tmp/" + Long.toString(System.currentTimeMillis()) + String.valueOf(getRandom()) + ".jpeg";
FileOutputStream fout = null;
try {
fout = new FileOutputStream(fileName);
fout.write(method.getResponseBody());
}catch(Exception e){
log.info("将验证码写入本地失败!");
}finally{
if(fout != null){
fout.flush();
fout.close();
}
}
String code = exec(fileName);
log.info("解析验证码为::" + code);

//适用于windows操作系统
//        InputStream bis = new ByteArrayInputStream(get.getResponseBody());
//        String code = ParseJPEG_withOCR.getRecogniseStr(bis);
//        log.info("验证码解析结果:" + code);
//        bis.close();

PostMethod post = new PostMethod(zhiXingSearchUrl);
post.setRequestHeader("Content-Type", "application/x-www-form-urlencoded");
post.setRequestHeader("Referer", zhiXingIndexUrl);
post.addRequestHeader("User-Agent", zhiXingSearchUserAgent);
post.setRequestBody("searchCourtName=" + URLEncoder.encode("全国法院(包含地方各级法院)") + "&selectCourtId=1&selectCourtArrange=1&pname=" +
URLEncoder.encode(pName) + "&cardNum=" + cardNum + "&j_captcha=" + code);
client.executeMethod(post);
InputStream is = null;
ByteArrayOutputStream baos = new ByteArrayOutputStream();
try{
is = post.getResponseBodyAsStream();
int i = -1;
while ((i = is.read()) != -1) {
baos.write(i);
}
}catch (Exception e){
log.info("获取查询返回页面失败!");
}finally {
if(is != null){
is.close();
}
}
return baos.toString();
}

//获取HTML页面中table标签对应的json值
private String getJsonStrByHtml(String htmlResponse) throws Exception {
if (StringUtils.checkStr(htmlResponse)) {
JSONObject data = new JSONObject();
org.jsoup.nodes.Document doc = Jsoup.parse(htmlResponse);
Elements trs = doc.getElementsByTag("tr");
Elements ths = doc.getElementsByTag("th");
Elements tds = doc.getElementsByTag("td");
int trsSize = trs.size();//行数
int thsSize = ths.size();//表头列数
int tdsSize = tds.size();//td数
if (trsSize > 0 && thsSize > 0 && tdsSize > 0) {
List<Object> list = new ArrayList<>();
for (int j = 0; j < trs.size() - 1; j++) {
Map<String, Object> map = new HashMap<>();
for (int i = 0; i < thsSize - 1; i++) {
map.put(ths.get(i).text(), tds.get(thsSize * j + i).text());
}
list.add(map);
}
data.put("data", list);
return data.toJSONString();
} else {
return null;
}
} else {
return null;
}
}

//模拟请求首页
private void loadIndex(GetMethod get, HttpClient client) throws IOException {
get = new GetMethod(zhiXingIndexUrl);
get.addRequestHeader("User-Agent", zhiXingSearchUserAgent);
client.executeMethod(get);
log.info("首页加载完成");
}

//调用linux命令
public String exec(String fileName) {
log.info("验证文件名称:" + fileName);
try {
String cmd ="python /tmp/captcha.py " + fileName;
Process process = Runtime.getRuntime().exec(cmd);
LineNumberReader br = new LineNumberReader(new InputStreamReader(
process.getInputStream()));
StringBuffer sb = new StringBuffer();
String line;
while ((line = br.readLine()) != null) {
System.out.println(line);
sb.append(line).append("\n");
}
//删除生成的验证码图片
Runtime.getRuntime().exec("rm -f " + fileName);
return sb.toString();
} catch (Exception e) {
e.printStackTrace();
}
return null;
}

//随机三位数
public int getRandom() {
int number = 0;
while (true) {
number = (int) (Math.random() * 1000);
if (number >= 100 && number < 1000) {
break;
}
}
return number;
}
}


工具类:

import com.asprise.util.ocr.OCR;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;

public class ParseJPEG_withOCR {
public static String getRecogniseStr(InputStream imageFile) {
String s = "";
try {
BufferedImage image = ImageIO.read(imageFile);
int width = image.getTileWidth();
int height = image.getTileHeight();
image = image.getSubimage(0, 0, width, height);
s = new OCR().recognizeEverything(image);
} catch (IOException e) {
e.printStackTrace();
System.out.println(" 图片识别失败! ");
}
return s;
}
public static String getRecogniseStrByFile(File imageFile) {
String s = "";
try {
BufferedImage image = ImageIO.read(imageFile);
int width = image.getTileWidth();
int height = image.getTileHeight();
image = image.getSubimage(0, 0, width, height);
s = new OCR().recognizeEverything(image);
} catch (IOException e) {
e.printStackTrace();
System.out.println(" 图片识别失败! ");
}
return s;
}

public static void main(String[] args) {
//        for (int i = 0; i < 100; i++) {
//            String code = getRecogniseStrByFile(new File("D:\\pic\\download/" + i + ".jpeg"));
//            System.out.println(code);
//        }
}

}


python脚本(文件名为 captcha.py):保存在 linux 的 /tmp 目录下,验证码图片也保存在同一目录;执行命令 python /tmp/captcha.py <验证码文件名> 即可返回识别结果。pytesseract 类库可百度下载:

from PIL import Image
import sys
import pytesseract

def output(imgfile):
    """Print the text recognised in the image stored at ``imgfile``.

    The image is converted to greyscale and passed to tesseract in
    page-segmentation mode 7.
    """
    img = Image.open(imgfile)
    gray = img.convert('L')
    # NOTE(review): newer tesseract releases spell this option '--psm 7' - confirm
    # against the tesseract version installed on the host.
    print(pytesseract.image_to_string(gray, config='-psm 7'))

if __name__ == "__main__":
    # Expect exactly one argument: the path of the captcha image to recognise.
    if len(sys.argv) < 2:
        sys.exit("usage: python captcha.py <image file>")
    file_name = sys.argv[1]
    output(file_name)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐