httpclient 爬虫实例——爬取中学名(总计6万+)
2018-02-06 16:31
260 查看
本人在使用 httpclient 的过程中,突然想到可以用它爬取一些数据,比如全国的中学名。这个想法并非凭空而来:之前我也做过这方面的爬虫,不过当时是基于 selenium 写的 UI 脚本,效率非常低,而且很不稳定。所以这次改用直接请求接口的方式,效率果然提升了几个档次:一共6万+条数据,用了16分钟左右,其中还包括数据库存储的时间。现在分享代码供大家参考。关键信息已隐去,大家看一下思路就好了。
package practise;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.client.methods.HttpGet;
import net.sf.json.JSONObject;
import source.ApiLibrary;
import source.Concurrent;
/**
 * Crawls school names through four chained list endpoints
 * (province -> city -> district -> school) and collects one output line per
 * school in {@link #total}.
 *
 * <p>Each endpoint returns a JSON object whose {@code content} field is split
 * on {@code </a>}; every resulting fragment is expected to carry a
 * double-quoted numeric id and, after its last {@code >}, a display name.
 *
 * <p>All state lives in public static collections that are overwritten as the
 * crawl descends, so this class is not meant to be used from several threads.
 */
public class Crawler extends ApiLibrary {
    /** Base URL prepended to every endpoint path (left blank in the published code). */
    public static String host = "";
    /** Province name -> province id, filled once by {@link #getCountry1()}. */
    public static Map<String, Integer> countrys = new HashMap<>();
    /** City name -> city id for the province queried most recently. */
    public static Map<String, Integer> citys = new HashMap<>();
    /** District name -> district id for the city queried most recently. */
    public static Map<String, Integer> address = new HashMap<>();
    /** School name -> school id for the district queried most recently. */
    public static Map<String, Integer> school = new HashMap<>();
    /** One line per school: province, city, district and school joined by {@code PART}. */
    public static List<String> total = new ArrayList<>();

    /** First double-quoted run of digits in a fragment; compiled once instead of per call. */
    private static final Pattern CODE_PATTERN = Pattern.compile("\"\\d+\"");
    /** Delimiter that separates one entry from the next in the {@code content} field. */
    private static final String ENTRY_DELIMITER = "</a>";

    /**
     * Walks the whole hierarchy, builds one line per school, then hands the
     * lines to {@code Concurrent.saveRequestTimes} and calls {@code testOver()}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Crawler crawler = new Crawler();
        crawler.getCountry1(); // provinces
        for (Map.Entry<String, Integer> province : countrys.entrySet()) {
            crawler.getCountry2(province.getValue()); // cities
            for (Map.Entry<String, Integer> city : citys.entrySet()) {
                crawler.getCountry3(city.getValue()); // districts
                for (Map.Entry<String, Integer> district : address.entrySet()) {
                    crawler.getCountry4(district.getValue()); // school names
                    for (String schoolName : school.keySet()) {
                        String line = province.getKey() + PART + city.getKey() + PART
                                + district.getKey() + PART + schoolName;
                        total.add(line);
                    }
                }
            }
        }
        Concurrent.saveRequestTimes(total);
        testOver();
    }

    /**
     * Fetches the province list and adds it to {@link #countrys}
     * (existing entries are kept).
     */
    public void getCountry1() {
        loadRegions(host + "/user/editinfo/getSchollCountryList", countrys, false);
    }

    /**
     * Fetches the cities of one province and replaces the content of {@link #citys}.
     *
     * @param id province id sent as {@code region_id}
     */
    public void getCountry2(int id) {
        loadRegions(host + "/user/editinfo/getSchollCityList?region_id=" + id, citys, true);
    }

    /**
     * Fetches the districts of one city and replaces the content of {@link #address}.
     *
     * @param id city id sent as {@code region_id}
     */
    public void getCountry3(int id) {
        loadRegions(host + "/user/editinfo/getSchollAddressList?region_id=" + id, address, true);
    }

    /**
     * Fetches the schools of one district and replaces the content of {@link #school}.
     *
     * @param id district id sent as {@code region_id}
     */
    public void getCountry4(int id) {
        loadRegions(host + "/user/editinfo/getSchoolNameList?region_id=" + id, school, true);
    }

    /**
     * Requests {@code url}, splits the {@code content} field into entries and
     * stores each entry's name and code in {@code target}.
     *
     * <p>The request is issued without extra headers; if the target site needs
     * a Cookie or User-Agent header, add it to the {@code HttpGet} here.
     *
     * @param url         full request URL
     * @param target      map that receives name -> code pairs
     * @param resetTarget whether to empty {@code target} before filling it; the
     *                    reset happens only after the response was fetched and split
     */
    private void loadRegions(String url, Map<String, Integer> target, boolean resetTarget) {
        HttpGet httpGet = getHttpGet(url);
        JSONObject response = getHttpResponseEntityByJson(httpGet);
        String[] fragments = response.getString("content").split(ENTRY_DELIMITER);
        if (resetTarget) {
            target.clear();
        }
        for (String fragment : fragments) {
            String name = getName(fragment);
            // "".split(...) yields a single empty fragment, and markup or
            // whitespace after the last </a> yields a fragment with no name.
            // Storing those would create a blank entry with code 0 and cause a
            // pointless request for region_id=0 one level down.
            if (name.trim().isEmpty()) {
                continue;
            }
            target.put(name, getCode(fragment));
        }
    }

    /**
     * Extracts the numeric code from one entry fragment.
     *
     * @param text fragment of the {@code content} field
     * @return the first double-quoted run of digits converted with
     *         {@code changeStringToInt}, or 0 when the fragment contains none
     */
    public int getCode(String text) {
        Matcher matcher = CODE_PATTERN.matcher(text);
        if (!matcher.find()) {
            return 0;
        }
        return changeStringToInt(matcher.group().replace("\"", ""));
    }

    /**
     * Extracts the display name from one entry fragment.
     *
     * @param text fragment of the {@code content} field
     * @return everything after the last {@code >}; the whole text when it
     *         contains no {@code >}
     */
    public String getName(String text) {
        return text.substring(text.lastIndexOf('>') + 1);
    }
}
中间用到的关键方法可以在我早先的 httpclient 接口源码文章中找到。接口封装源码传送门
下面是爬取到数据截图
末了宣传一下自己的 QQ群:群号:340964272
package practise;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.client.methods.HttpGet;
import net.sf.json.JSONObject;
import source.ApiLibrary;
import source.Concurrent;
/**
 * Crawls school names through four chained list endpoints
 * (province -> city -> district -> school) and collects one output line per
 * school in {@link #total}.
 *
 * <p>Each endpoint returns a JSON object whose {@code content} field is split
 * on {@code </a>}; every resulting fragment is expected to carry a
 * double-quoted numeric id and, after its last {@code >}, a display name.
 *
 * <p>All state lives in public static collections that are overwritten as the
 * crawl descends, so this class is not meant to be used from several threads.
 */
public class Crawler extends ApiLibrary {
    /** Base URL prepended to every endpoint path (left blank in the published code). */
    public static String host = "";
    /** Province name -> province id, filled once by {@link #getCountry1()}. */
    public static Map<String, Integer> countrys = new HashMap<>();
    /** City name -> city id for the province queried most recently. */
    public static Map<String, Integer> citys = new HashMap<>();
    /** District name -> district id for the city queried most recently. */
    public static Map<String, Integer> address = new HashMap<>();
    /** School name -> school id for the district queried most recently. */
    public static Map<String, Integer> school = new HashMap<>();
    /** One line per school: province, city, district and school joined by {@code PART}. */
    public static List<String> total = new ArrayList<>();

    /** First double-quoted run of digits in a fragment; compiled once instead of per call. */
    private static final Pattern CODE_PATTERN = Pattern.compile("\"\\d+\"");
    /** Delimiter that separates one entry from the next in the {@code content} field. */
    private static final String ENTRY_DELIMITER = "</a>";

    /**
     * Walks the whole hierarchy, builds one line per school, then hands the
     * lines to {@code Concurrent.saveRequestTimes} and calls {@code testOver()}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Crawler crawler = new Crawler();
        crawler.getCountry1(); // provinces
        for (Map.Entry<String, Integer> province : countrys.entrySet()) {
            crawler.getCountry2(province.getValue()); // cities
            for (Map.Entry<String, Integer> city : citys.entrySet()) {
                crawler.getCountry3(city.getValue()); // districts
                for (Map.Entry<String, Integer> district : address.entrySet()) {
                    crawler.getCountry4(district.getValue()); // school names
                    for (String schoolName : school.keySet()) {
                        String line = province.getKey() + PART + city.getKey() + PART
                                + district.getKey() + PART + schoolName;
                        total.add(line);
                    }
                }
            }
        }
        Concurrent.saveRequestTimes(total);
        testOver();
    }

    /**
     * Fetches the province list and adds it to {@link #countrys}
     * (existing entries are kept).
     */
    public void getCountry1() {
        loadRegions(host + "/user/editinfo/getSchollCountryList", countrys, false);
    }

    /**
     * Fetches the cities of one province and replaces the content of {@link #citys}.
     *
     * @param id province id sent as {@code region_id}
     */
    public void getCountry2(int id) {
        loadRegions(host + "/user/editinfo/getSchollCityList?region_id=" + id, citys, true);
    }

    /**
     * Fetches the districts of one city and replaces the content of {@link #address}.
     *
     * @param id city id sent as {@code region_id}
     */
    public void getCountry3(int id) {
        loadRegions(host + "/user/editinfo/getSchollAddressList?region_id=" + id, address, true);
    }

    /**
     * Fetches the schools of one district and replaces the content of {@link #school}.
     *
     * @param id district id sent as {@code region_id}
     */
    public void getCountry4(int id) {
        loadRegions(host + "/user/editinfo/getSchoolNameList?region_id=" + id, school, true);
    }

    /**
     * Requests {@code url}, splits the {@code content} field into entries and
     * stores each entry's name and code in {@code target}.
     *
     * <p>The request is issued without extra headers; if the target site needs
     * a Cookie or User-Agent header, add it to the {@code HttpGet} here.
     *
     * @param url         full request URL
     * @param target      map that receives name -> code pairs
     * @param resetTarget whether to empty {@code target} before filling it; the
     *                    reset happens only after the response was fetched and split
     */
    private void loadRegions(String url, Map<String, Integer> target, boolean resetTarget) {
        HttpGet httpGet = getHttpGet(url);
        JSONObject response = getHttpResponseEntityByJson(httpGet);
        String[] fragments = response.getString("content").split(ENTRY_DELIMITER);
        if (resetTarget) {
            target.clear();
        }
        for (String fragment : fragments) {
            String name = getName(fragment);
            // "".split(...) yields a single empty fragment, and markup or
            // whitespace after the last </a> yields a fragment with no name.
            // Storing those would create a blank entry with code 0 and cause a
            // pointless request for region_id=0 one level down.
            if (name.trim().isEmpty()) {
                continue;
            }
            target.put(name, getCode(fragment));
        }
    }

    /**
     * Extracts the numeric code from one entry fragment.
     *
     * @param text fragment of the {@code content} field
     * @return the first double-quoted run of digits converted with
     *         {@code changeStringToInt}, or 0 when the fragment contains none
     */
    public int getCode(String text) {
        Matcher matcher = CODE_PATTERN.matcher(text);
        if (!matcher.find()) {
            return 0;
        }
        return changeStringToInt(matcher.group().replace("\"", ""));
    }

    /**
     * Extracts the display name from one entry fragment.
     *
     * @param text fragment of the {@code content} field
     * @return everything after the last {@code >}; the whole text when it
     *         contains no {@code >}
     */
    public String getName(String text) {
        return text.substring(text.lastIndexOf('>') + 1);
    }
}
中间用到的关键方法可以在我早先的 httpclient 接口源码文章中找到。接口封装源码传送门
下面是爬取到数据截图
末了宣传一下自己的 QQ群:群号:340964272
相关文章推荐
- HttpClient入门实例之简单的pdf文件爬虫
- httpclient 多线程爬虫实例
- HTTP BASIC认证,抢先认证介绍和 HttpClient 4.1.1 实例
- HttpClient post 请求实例
- HttpClient 实例
- 网络爬虫学习 httpclient 包的寻找
- Java爬虫入门简介(一) —— HttpClient请求及其使用方法
- 网页爬虫,HttpClient+Jericho HTML Parser 实现网页的抓取
- JSP开发中Apache-HTTPClient 用户验证的实例详解
- android菜鸟进阶之路—— HttpClient 的实例
- Web Service系列之实例之使用http.client发送SOAP POST请求
- HttpClient,HttpParser实现简易爬虫
- java HttpClient Post实例
- 爬虫简单示例,用httpClient4.2.1实现(转载)
- Java爬虫(八)-- httpClient进阶:HTTPS和证书认证(原理总结篇)
- RxAndroid 与OkHttpClient打造下载实例
- httpclient使用实例
- JAVA 爬虫之httpclient post请求提交表单获取Ajax数据
- HttpAsyncClient 做并发长连接的一个实例
- HttpClient 灵活应用及其实例