java网页数据抓取源代码(抓取电话和身份证信息为例)
2013-04-23 10:44
429 查看
说明:源链接中的代码会报错,这里的代码是修改过的,确定可以运行使用。
对于加密的网站还没去研究,不知道能不能抓取,目前只是对一些没有加密的网站进行网页数据抓取。刚开始写的时候以为很多网站都能抓取,后来发现很多都加密了。本来以为可以通过网页数据检测工具监测出这些地址的数据变化,但它只能监测到一些通过js显示的数据,依然抓取不到加密网站的内容。唉,这个问题以后再说吧。
import java.net.* ;
import java.io.* ;
import java.util.regex.* ;
/**
 * Console demo that runs the mobile-number and ID-card lookups for one
 * hard-coded sample of each and prints every answer on its own line.
 */
public class Capture {
    /**
     * Prints the three mobile-number results followed by the three ID-card results.
     *
     * @param args ignored
     * @throws Exception if any lookup fails
     */
    public static void main(String[] args) throws Exception {
        final String mobile = "18323483580";
        final String idCard = "362203199202243575";
        final GrabMobile mobileLookup = new GrabMobile();
        final GrabIdentity idLookup = new GrabIdentity();

        // Each lookup is performed right before its own line is printed
        System.out.println("*************************手机号查询************************");
        String location = mobileLookup.grabMobileLocation(mobile);
        System.out.println("我的位置是:" + location);
        String cardType = mobileLookup.grabMobileType(mobile);
        System.out.println("手机卡类型是:" + cardType);
        String postCode = mobileLookup.grabMobilePost(mobile);
        System.out.println("我的邮编是:" + postCode);

        System.out.println("*************************身份证查询************************");
        String sex = idLookup.grabIdentitySex(idCard);
        System.out.println("我的性别是:" + sex);
        String birth = idLookup.grabIdentityBirth(idCard);
        System.out.println("我的生日是:" + birth);
        String home = idLookup.grabIdentityHome(idCard);
        System.out.println("我的家乡是:" + home);
    }
}
//1):抓取手机查询信息
/**
 * Scrapes mobile-number lookup results from the ip138.com query page.
 *
 * <p>Every {@code grab*} method downloads the result page for the given number
 * (decoded as GBK) and cuts the answer out of the HTML using fixed marker
 * strings and character offsets, so the extraction depends on the exact page
 * layout. When a marker is missing or the offsets fall outside the page, an
 * {@link IOException} is thrown instead of returning an arbitrary slice.
 */
class GrabMobile {
    /** Lookup URL; the mobile number is appended as the last query parameter. */
    private static final String QUERY_URL =
            "http://www.ip138.com:8080/search.asp?action=mobile&mobile=";

    /** Matches runs of characters in the range U+4E00..U+9FA5; compiled once. */
    private static final Pattern CHINESE_RUN = Pattern.compile("([\u4e00-\u9fa5]+)");

    /**
     * Looks up the home location of a mobile number.
     *
     * @param m the mobile number to query
     * @return the Chinese characters found between the location label and the card-type label
     * @throws Exception if the page cannot be downloaded or lacks the expected markers
     */
    public String grabMobileLocation(String m) throws Exception {
        String cell = sliceBetween(fetchPage(m), "卡号归属地", 42, "卡 类 型", -33);
        return drawChMob(cell);
    }

    /**
     * Looks up the card type of a mobile number.
     *
     * @param m the mobile number to query
     * @return the Chinese characters between the card-type label and the area-code cell,
     *         minus the first character; empty if no Chinese characters were found
     * @throws Exception if the page cannot be downloaded or lacks the expected markers
     */
    public String grabMobileType(String m) throws Exception {
        String cell = sliceBetween(fetchPage(m), "卡 类 型", 12,
                "<TD align=\"center\">区 号</TD>", 0);
        String chinese = drawChMob(cell);
        // The first extracted character is dropped as before; the guard keeps an
        // empty extraction from throwing StringIndexOutOfBoundsException.
        return chinese.isEmpty() ? chinese : chinese.substring(1);
    }

    /**
     * Looks up the postal code associated with a mobile number.
     *
     * @param m the mobile number to query
     * @return the raw page text between the postal-code label and the "more details" link
     * @throws Exception if the page cannot be downloaded or lacks the expected markers
     */
    public String grabMobilePost(String m) throws Exception {
        return sliceBetween(fetchPage(m), "邮 编", 40, "更详细的..", -55);
    }

    /**
     * Concatenates every run of Chinese characters (U+4E00..U+9FA5) found in the input.
     *
     * @param str text to scan
     * @return all matching runs joined in order, or an empty string when there are none
     */
    public String drawChMob(String str) {
        StringBuilder collected = new StringBuilder();
        Matcher matcher = CHINESE_RUN.matcher(str);
        while (matcher.find()) {
            collected.append(matcher.group());
        }
        return collected.toString();
    }

    /**
     * Downloads the lookup page for a number and returns it as one string
     * with the line breaks removed.
     */
    private String fetchPage(String m) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(QUERY_URL + m).openConnection();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), "GBK"))) {
            StringBuilder page = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line);
            }
            return page.toString();
        } finally {
            // Release the connection even when reading fails
            connection.disconnect();
        }
    }

    /**
     * Returns {@code page.substring(indexOf(startMarker) + startOffset,
     * indexOf(endMarker) + endOffset)} after checking that both markers exist
     * and that the resulting range lies inside the page.
     *
     * @throws IOException if a marker is missing or the computed range is invalid
     */
    static String sliceBetween(String page, String startMarker, int startOffset,
                               String endMarker, int endOffset) throws IOException {
        int start = page.indexOf(startMarker);
        int end = page.indexOf(endMarker);
        if (start < 0 || end < 0) {
            throw new IOException("Unexpected page layout: marker not found: "
                    + (start < 0 ? startMarker : endMarker));
        }
        int from = start + startOffset;
        int to = end + endOffset;
        if (from < 0 || to > page.length() || from > to) {
            throw new IOException("Unexpected page layout: range " + from + ".." + to
                    + " is invalid for a page of length " + page.length());
        }
        return page.substring(from, to);
    }
}
//2):抓取身份证查询信息
/**
 * Scrapes ID-card lookup results from the ip138.com ID search page.
 *
 * <p>Every {@code grab*} method downloads the result page for the given ID
 * number (decoded as GBK) and cuts the answer out of the HTML using fixed
 * marker strings and character offsets, so the extraction depends on the exact
 * page layout. When a marker is missing or the offsets fall outside the page,
 * an {@link IOException} is thrown instead of returning an arbitrary slice.
 */
class GrabIdentity {
    /** Lookup URL up to and including the {@code userid=} parameter name. */
    private static final String QUERY_PREFIX =
            "http://qq.ip138.com/idsearch/index.asp?action=idcard&userid=";

    /** Fixed trailing query parameter sent after the ID number. */
    private static final String QUERY_SUFFIX = "&B1=%B2%E9+%D1%AF";

    /** Matches runs of characters in the range U+4E00..U+9FA5; compiled once. */
    private static final Pattern CHINESE_RUN = Pattern.compile("([\u4e00-\u9fa5]+)");

    /**
     * Looks up the sex recorded for an ID number.
     *
     * @param userid the ID-card number to query
     * @return the first run of Chinese characters between the sex label and the
     *         birth-date label, or the raw slice if it contains none
     * @throws Exception if the page cannot be downloaded or lacks the expected markers
     */
    public String grabIdentitySex(String userid) throws Exception {
        String cell = sliceBetween(fetchPage(userid), " 别", 7, "出生日期", 0);
        return drawCh(cell);
    }

    /**
     * Looks up the birth date recorded for an ID number.
     *
     * @param userid the ID-card number to query
     * @return the raw page text of the birth-date cell
     * @throws Exception if the page cannot be downloaded or lacks the expected markers
     */
    public String grabIdentityBirth(String userid) throws Exception {
        return sliceBetween(fetchPage(userid),
                "出生日期:</td><td class=\"tdc2\">", 27,
                "</td><tr><tr><td class=", 0);
    }

    /**
     * Looks up the place of issue recorded for an ID number.
     *
     * @param userid the ID-card number to query
     * @return the raw page text of the issuing-place cell
     * @throws Exception if the page cannot be downloaded or lacks the expected markers
     */
    public String grabIdentityHome(String userid) throws Exception {
        return sliceBetween(fetchPage(userid),
                "证 地:</td><td class=\"tdc2\">", 31,
                "<br/></td></tr><tr><td class=\"tdc3\" valign=\"top\" align=\"right\">部分或", 0);
    }

    /**
     * Returns the first run of Chinese characters (U+4E00..U+9FA5) in the input.
     *
     * @param str text to scan
     * @return the first matching run, or {@code str} unchanged when nothing matches
     */
    public String drawCh(String str) {
        Matcher matcher = CHINESE_RUN.matcher(str);
        return matcher.find() ? matcher.group() : str;
    }

    /**
     * Downloads the lookup page for an ID number and returns it as one string
     * with the line breaks removed.
     */
    private String fetchPage(String userid) throws IOException {
        URL url = new URL(QUERY_PREFIX + userid + QUERY_SUFFIX);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), "GBK"))) {
            StringBuilder page = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line);
            }
            return page.toString();
        } finally {
            // Release the connection even when reading fails
            connection.disconnect();
        }
    }

    /**
     * Returns {@code page.substring(indexOf(startMarker) + startOffset,
     * indexOf(endMarker) + endOffset)} after checking that both markers exist
     * and that the resulting range lies inside the page.
     *
     * @throws IOException if a marker is missing or the computed range is invalid
     */
    static String sliceBetween(String page, String startMarker, int startOffset,
                               String endMarker, int endOffset) throws IOException {
        int start = page.indexOf(startMarker);
        int end = page.indexOf(endMarker);
        if (start < 0 || end < 0) {
            throw new IOException("Unexpected page layout: marker not found: "
                    + (start < 0 ? startMarker : endMarker));
        }
        int from = start + startOffset;
        int to = end + endOffset;
        if (from < 0 || to > page.length() || from > to) {
            throw new IOException("Unexpected page layout: range " + from + ".." + to
                    + " is invalid for a page of length " + page.length());
        }
        return page.substring(from, to);
    }
}
源码:http://www.2cto.com/kf/201211/166624.html
对于加密的网站还没去研究,不知道能不能抓取,目前只是对一些没有加密的网站进行网页数据抓取。刚开始写的时候以为很多网站都能抓取,后来发现很多都加密了。本来以为可以通过网页数据检测工具监测出这些地址的数据变化,但它只能监测到一些通过js显示的数据,依然抓取不到加密网站的内容。唉,这个问题以后再说吧。
import java.net.* ;
import java.io.* ;
import java.util.regex.* ;
/**
 * Command-line entry point: queries one sample mobile number and one sample
 * ID-card number and writes each scraped field to standard output.
 */
public class Capture {
    /**
     * Runs the mobile lookups first, then the ID-card lookups, printing a
     * banner line before each group.
     *
     * @param args ignored
     * @throws Exception if any lookup fails
     */
    public static void main(String[] args) throws Exception {
        String phone = "18323483580";
        String identity = "362203199202243575";

        GrabMobile phoneGrabber = new GrabMobile();
        System.out.println("*************************手机号查询************************");
        System.out.println("我的位置是:".concat(String.valueOf(phoneGrabber.grabMobileLocation(phone))));
        System.out.println("手机卡类型是:".concat(String.valueOf(phoneGrabber.grabMobileType(phone))));
        System.out.println("我的邮编是:".concat(String.valueOf(phoneGrabber.grabMobilePost(phone))));

        GrabIdentity identityGrabber = new GrabIdentity();
        System.out.println("*************************身份证查询************************");
        System.out.println("我的性别是:".concat(String.valueOf(identityGrabber.grabIdentitySex(identity))));
        System.out.println("我的生日是:".concat(String.valueOf(identityGrabber.grabIdentityBirth(identity))));
        System.out.println("我的家乡是:".concat(String.valueOf(identityGrabber.grabIdentityHome(identity))));
    }
}
//1):抓取手机查询信息
/**
 * Scrapes mobile-number lookup results from the ip138.com query page.
 *
 * <p>Every {@code grab*} method downloads the result page for the given number
 * (decoded as GBK) and cuts the answer out of the HTML using fixed marker
 * strings and character offsets, so the extraction depends on the exact page
 * layout. When a marker is missing or the offsets fall outside the page, an
 * {@link IOException} is thrown instead of returning an arbitrary slice.
 */
class GrabMobile {
    /** Lookup URL; the mobile number is appended as the last query parameter. */
    private static final String QUERY_URL =
            "http://www.ip138.com:8080/search.asp?action=mobile&mobile=";

    /** Matches runs of characters in the range U+4E00..U+9FA5; compiled once. */
    private static final Pattern CHINESE_RUN = Pattern.compile("([\u4e00-\u9fa5]+)");

    /**
     * Looks up the home location of a mobile number.
     *
     * @param m the mobile number to query
     * @return the Chinese characters found between the location label and the card-type label
     * @throws Exception if the page cannot be downloaded or lacks the expected markers
     */
    public String grabMobileLocation(String m) throws Exception {
        String cell = sliceBetween(fetchPage(m), "卡号归属地", 42, "卡 类 型", -33);
        return drawChMob(cell);
    }

    /**
     * Looks up the card type of a mobile number.
     *
     * @param m the mobile number to query
     * @return the Chinese characters between the card-type label and the area-code cell,
     *         minus the first character; empty if no Chinese characters were found
     * @throws Exception if the page cannot be downloaded or lacks the expected markers
     */
    public String grabMobileType(String m) throws Exception {
        String cell = sliceBetween(fetchPage(m), "卡 类 型", 12,
                "<TD align=\"center\">区 号</TD>", 0);
        String chinese = drawChMob(cell);
        // The first extracted character is dropped as before; the guard keeps an
        // empty extraction from throwing StringIndexOutOfBoundsException.
        return chinese.isEmpty() ? chinese : chinese.substring(1);
    }

    /**
     * Looks up the postal code associated with a mobile number.
     *
     * @param m the mobile number to query
     * @return the raw page text between the postal-code label and the "more details" link
     * @throws Exception if the page cannot be downloaded or lacks the expected markers
     */
    public String grabMobilePost(String m) throws Exception {
        return sliceBetween(fetchPage(m), "邮 编", 40, "更详细的..", -55);
    }

    /**
     * Concatenates every run of Chinese characters (U+4E00..U+9FA5) found in the input.
     *
     * @param str text to scan
     * @return all matching runs joined in order, or an empty string when there are none
     */
    public String drawChMob(String str) {
        StringBuilder collected = new StringBuilder();
        Matcher matcher = CHINESE_RUN.matcher(str);
        while (matcher.find()) {
            collected.append(matcher.group());
        }
        return collected.toString();
    }

    /**
     * Downloads the lookup page for a number and returns it as one string
     * with the line breaks removed.
     */
    private String fetchPage(String m) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(QUERY_URL + m).openConnection();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), "GBK"))) {
            StringBuilder page = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line);
            }
            return page.toString();
        } finally {
            // Release the connection even when reading fails
            connection.disconnect();
        }
    }

    /**
     * Returns {@code page.substring(indexOf(startMarker) + startOffset,
     * indexOf(endMarker) + endOffset)} after checking that both markers exist
     * and that the resulting range lies inside the page.
     *
     * @throws IOException if a marker is missing or the computed range is invalid
     */
    static String sliceBetween(String page, String startMarker, int startOffset,
                               String endMarker, int endOffset) throws IOException {
        int start = page.indexOf(startMarker);
        int end = page.indexOf(endMarker);
        if (start < 0 || end < 0) {
            throw new IOException("Unexpected page layout: marker not found: "
                    + (start < 0 ? startMarker : endMarker));
        }
        int from = start + startOffset;
        int to = end + endOffset;
        if (from < 0 || to > page.length() || from > to) {
            throw new IOException("Unexpected page layout: range " + from + ".." + to
                    + " is invalid for a page of length " + page.length());
        }
        return page.substring(from, to);
    }
}
//2):抓取身份证查询信息
/**
 * Scrapes ID-card lookup results from the ip138.com ID search page.
 *
 * <p>Every {@code grab*} method downloads the result page for the given ID
 * number (decoded as GBK) and cuts the answer out of the HTML using fixed
 * marker strings and character offsets, so the extraction depends on the exact
 * page layout. When a marker is missing or the offsets fall outside the page,
 * an {@link IOException} is thrown instead of returning an arbitrary slice.
 */
class GrabIdentity {
    /** Lookup URL up to and including the {@code userid=} parameter name. */
    private static final String QUERY_PREFIX =
            "http://qq.ip138.com/idsearch/index.asp?action=idcard&userid=";

    /** Fixed trailing query parameter sent after the ID number. */
    private static final String QUERY_SUFFIX = "&B1=%B2%E9+%D1%AF";

    /** Matches runs of characters in the range U+4E00..U+9FA5; compiled once. */
    private static final Pattern CHINESE_RUN = Pattern.compile("([\u4e00-\u9fa5]+)");

    /**
     * Looks up the sex recorded for an ID number.
     *
     * @param userid the ID-card number to query
     * @return the first run of Chinese characters between the sex label and the
     *         birth-date label, or the raw slice if it contains none
     * @throws Exception if the page cannot be downloaded or lacks the expected markers
     */
    public String grabIdentitySex(String userid) throws Exception {
        String cell = sliceBetween(fetchPage(userid), " 别", 7, "出生日期", 0);
        return drawCh(cell);
    }

    /**
     * Looks up the birth date recorded for an ID number.
     *
     * @param userid the ID-card number to query
     * @return the raw page text of the birth-date cell
     * @throws Exception if the page cannot be downloaded or lacks the expected markers
     */
    public String grabIdentityBirth(String userid) throws Exception {
        return sliceBetween(fetchPage(userid),
                "出生日期:</td><td class=\"tdc2\">", 27,
                "</td><tr><tr><td class=", 0);
    }

    /**
     * Looks up the place of issue recorded for an ID number.
     *
     * @param userid the ID-card number to query
     * @return the raw page text of the issuing-place cell
     * @throws Exception if the page cannot be downloaded or lacks the expected markers
     */
    public String grabIdentityHome(String userid) throws Exception {
        return sliceBetween(fetchPage(userid),
                "证 地:</td><td class=\"tdc2\">", 31,
                "<br/></td></tr><tr><td class=\"tdc3\" valign=\"top\" align=\"right\">部分或", 0);
    }

    /**
     * Returns the first run of Chinese characters (U+4E00..U+9FA5) in the input.
     *
     * @param str text to scan
     * @return the first matching run, or {@code str} unchanged when nothing matches
     */
    public String drawCh(String str) {
        Matcher matcher = CHINESE_RUN.matcher(str);
        return matcher.find() ? matcher.group() : str;
    }

    /**
     * Downloads the lookup page for an ID number and returns it as one string
     * with the line breaks removed.
     */
    private String fetchPage(String userid) throws IOException {
        URL url = new URL(QUERY_PREFIX + userid + QUERY_SUFFIX);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), "GBK"))) {
            StringBuilder page = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line);
            }
            return page.toString();
        } finally {
            // Release the connection even when reading fails
            connection.disconnect();
        }
    }

    /**
     * Returns {@code page.substring(indexOf(startMarker) + startOffset,
     * indexOf(endMarker) + endOffset)} after checking that both markers exist
     * and that the resulting range lies inside the page.
     *
     * @throws IOException if a marker is missing or the computed range is invalid
     */
    static String sliceBetween(String page, String startMarker, int startOffset,
                               String endMarker, int endOffset) throws IOException {
        int start = page.indexOf(startMarker);
        int end = page.indexOf(endMarker);
        if (start < 0 || end < 0) {
            throw new IOException("Unexpected page layout: marker not found: "
                    + (start < 0 ? startMarker : endMarker));
        }
        int from = start + startOffset;
        int to = end + endOffset;
        if (from < 0 || to > page.length() || from > to) {
            throw new IOException("Unexpected page layout: range " + from + ".." + to
                    + " is invalid for a page of length " + page.length());
        }
        return page.substring(from, to);
    }
}
源码:http://www.2cto.com/kf/201211/166624.html
相关文章推荐
- java通过url抓取网页数据-----正则表达式
- C#抓取网页数据 解析标题描述图片等信息 去除HTML标签
- Java抓取网页数据(原网页+Javascript返回数据)
- Java抓取网页数据(原网页+Javascript返回数据)
- Java抓取网页数据(原网页+Javascript返回数据)
- java验证身份证号码是否有效源代码
- java爬取动态加载/js返回数据的网页的源代码
- jsoup 分页抓取网页数据Java HTML Parser
- Java抓取网页信息[例子是抓取双色球某一天的信息]
- Java抓取网页数据(原网页+Javascript返回数据)
- Java抓取网页数据(原网页+Javascript返回数据)
- Java网页数据抓取实例
- Java抓取网页数据
- apache HttpClient 4.3.4自动登录并抓取中国联通网页用户基本信息和账单数据
- java htmlunit 抓取网页数据
- 网页信息抓取进阶 支持Js生成数据 Jsoup的不足之处
- 网页信息抓取进阶 支持Js生成数据 Jsoup的不足之处
- Jsoup一个简短的引论——采用Java抓取网页数据
- Nutch源代码研究 网页抓取 数据结构
- java抓取网页数据获取网页中所有的链接实例分享