您的位置:首页 > 编程语言

简单的网页内容抓取实例(携程酒店)

2017-10-23 10:59 579 查看
网页抓取有很多种,这里介绍一个简单方法,暴力但快速得到有规律的网页内容

比如携程酒店的网页内容,希望得到以下基本信息:

酒店名称

英文名称

城市

省份

地址

纬度

经度

经纬度(String 类型)

电话

酒店星级

这里是代码(带测试样例)

package webTextGrabber;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;

/**
 * Extracts basic hotel facts (names, city, province, address, coordinates,
 * phone number and star rating) from the raw HTML of a hotel page by searching
 * for fixed marker strings and slicing the text that follows them.
 *
 * <p>The parsing is purely positional: every offset below mirrors the layout the
 * original author observed on the target pages. If the page layout changes, the
 * required-field setters fail with an {@link IllegalArgumentException} instead
 * of silently returning garbage.
 *
 * <p>Instances are mutable and not thread-safe.
 */
public class WebContent {
    // hotelId, hotelUrl, cityId can be obtained at CtripUtil class

    /** Maximum time to wait for the TCP connection, in milliseconds. */
    private static final int CONNECT_TIMEOUT_MS = 10000;
    /** Maximum time to wait for data once connected, in milliseconds. */
    private static final int READ_TIMEOUT_MS = 30000;

    /** Marker searched (last occurrence) to locate the street address. */
    private static final String ADDRESS_MARKER = "酒店地址";
    /** Marker searched to locate the phone number; includes the leading "0" of the number. */
    private static final String TEL_MARKER = "电话0";
    /** Number of characters taken as the phone number, starting at its leading "0". */
    private static final int TEL_LENGTH = 12;
    /** Marker searched to locate the star rating; the two characters after it are parsed as an int. */
    private static final String STARS_MARKER = "hotel_stars";
    /** Pages containing this text are skipped by {@link #main}; presumably an anti-bot verification page. */
    private static final String VERIFICATION_MARKER = "验证";

    private String hotelName;
    private String hotelEname;
    private String cityName;
    private String provinceName;
    private String address;
    private double lat;
    private double lng;
    private String coordinates;
    private String tel;
    private int hotelStars;

    /**
     * Downloads the resource at {@code url} and returns it as one string.
     *
     * <p>The body is decoded as UTF-8 and read line by line; line terminators are
     * dropped and the lines are concatenated without a separator.
     *
     * @param url absolute URL of the page to download
     * @return the page content with all line breaks removed
     * @throws IOException if the URL is malformed, the connection fails or times
     *         out, or reading the body fails
     */
    public String getUrlSource(String url) throws IOException {
        URLConnection connection = new URL(url).openConnection();
        // Without timeouts a stalled server would block the caller forever.
        connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
        connection.setReadTimeout(READ_TIMEOUT_MS);

        StringBuilder page = new StringBuilder();
        // try-with-resources closes the stream even when readLine() throws.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
            String line;
            while ((line = in.readLine()) != null) {
                page.append(line);
            }
        }
        return page.toString();
    }

    /**
     * Parses every supported field out of {@code str}.
     *
     * <p>All fields are cleared first, so values from a previously parsed page
     * never leak into the result when an optional marker (English name, phone,
     * stars) is absent from this page.
     *
     * @param str full page source
     * @throws Exception if {@code str} is {@code null}, a required marker is
     *         missing, or a numeric field cannot be parsed (the concrete type is
     *         an {@link IllegalArgumentException})
     */
    public void setAll(String str) throws Exception {
        reset();
        setHotelName(str);
        setHotelEname(str);
        setCityName(str);
        setProvinceName(str);
        setAddress(str);
        setLat(str);
        setLng(str);
        setCoordinates();
        setTel(str);
        setHotelStars(str);
    }

    /**
     * Sets the hotel name to the text between the first {@code >} and the next
     * {@code <} following the first occurrence of {@code cn_n}.
     *
     * @param str full page source
     * @return the extracted hotel name
     * @throws Exception if {@code str} is {@code null} or the marker/delimiters
     *         are missing (an {@link IllegalArgumentException})
     */
    public String setHotelName(String str) throws Exception {
        int index = requireMarker(str, "cn_n");
        hotelName = elementText(str, index, "hotel name");
        return hotelName;
    }

    /**
     * Sets the English hotel name to the text between the first {@code >} and the
     * next {@code <} following the first occurrence of {@code en_n}. When the
     * marker is absent the current value is left untouched.
     *
     * @param str full page source
     * @return the English name, or the previous value if the marker is absent
     * @throws Exception if {@code str} is {@code null} or the delimiters are
     *         missing after the marker (an {@link IllegalArgumentException})
     */
    public String setHotelEname(String str) throws Exception {
        int index = requireSource(str).indexOf("en_n");
        if (index == -1) {
            return hotelEname;
        }
        hotelEname = elementText(str, index, "English hotel name");
        return hotelEname;
    }

    /**
     * Sets the city name from the first occurrence of {@code city}: the value
     * starts 5 characters after the marker start (marker plus one separator
     * character, presumably {@code =}) and ends one character before the next
     * {@code >} (presumably skipping a closing quote).
     *
     * @param str full page source
     * @return the extracted city name
     * @throws IllegalArgumentException if {@code str} is {@code null} or the
     *         marker/delimiter is missing
     */
    public String setCityName(String str) {
        int index = requireMarker(str, "city");
        int end = requireIndexOf(str, ">", index, "city name") - 1;
        cityName = slice(str, index + 5, end, "city name");
        return cityName;
    }

    /**
     * Sets the province name from the first occurrence of {@code province}: the
     * value starts 9 characters after the marker start (marker plus one separator
     * character) and ends at the next {@code ;}.
     *
     * @param str full page source
     * @return the extracted province name
     * @throws IllegalArgumentException if {@code str} is {@code null} or the
     *         marker/delimiter is missing
     */
    public String setProvinceName(String str) {
        int index = requireMarker(str, "province");
        int end = requireIndexOf(str, ";", index, "province name");
        provinceName = slice(str, index + 9, end, "province name");
        return provinceName;
    }

    /**
     * Sets the address to province + city (city omitted when equal to the
     * province) followed by the street part, which starts 5 characters after the
     * last occurrence of the address marker and ends at the next {@code ;}.
     *
     * <p>Province and city are taken from the current state; an unset value is
     * treated as empty rather than causing a failure or a literal "null".
     *
     * @param str full page source
     * @return the assembled address
     * @throws IllegalArgumentException if {@code str} is {@code null} or the
     *         marker/delimiter is missing
     */
    public String setAddress(String str) {
        int index = requireSource(str).lastIndexOf(ADDRESS_MARKER);
        if (index == -1) {
            throw missing(ADDRESS_MARKER);
        }
        int end = requireIndexOf(str, ";", index, "address");
        String street = slice(str, index + 5, end, "address");

        String province = provinceName == null ? "" : provinceName;
        String city = cityName == null ? "" : cityName;
        String region = province.equals(city) ? province : province + city;

        // Assign only after everything succeeded so a failure leaves no partial value.
        address = region + street;
        return address;
    }

    /**
     * Sets the latitude from the {@code content=} attribute that follows the
     * first occurrence of {@code latitude}.
     *
     * @param str full page source
     * @return the parsed latitude
     * @throws IllegalArgumentException if {@code str} is {@code null}, the
     *         marker/delimiters are missing, or the value is not a number
     */
    public double setLat(String str) {
        lat = contentValue(str, "latitude");
        return lat;
    }

    /**
     * Sets the longitude from the {@code content=} attribute that follows the
     * first occurrence of {@code longitude}.
     *
     * @param str full page source
     * @return the parsed longitude
     * @throws IllegalArgumentException if {@code str} is {@code null}, the
     *         marker/delimiters are missing, or the value is not a number
     */
    public double setLng(String str) {
        lng = contentValue(str, "longitude");
        return lng;
    }

    /**
     * Rebuilds the coordinate string as {@code "<lat>, <lng>"} from the current
     * latitude and longitude.
     *
     * @return the coordinate string
     */
    public String setCoordinates() {
        coordinates = "" + lat + ", " + lng;
        return coordinates;
    }

    /**
     * Sets the phone number to the (up to) 12 characters that start at the "0"
     * of the phone marker. When the marker is absent the current value is left
     * untouched. If the page ends before 12 characters are available, the
     * remaining characters are used.
     *
     * @param str full page source
     * @return the phone number, or the previous value if the marker is absent
     * @throws Exception if {@code str} is {@code null} (an
     *         {@link IllegalArgumentException})
     */
    public String setTel(String str) throws Exception {
        int index = requireSource(str).indexOf(TEL_MARKER);
        if (index == -1) {
            return tel;
        }
        // The number begins at the marker's trailing "0", i.e. 2 characters in.
        int begin = index + 2;
        int end = Math.min(begin + TEL_LENGTH, str.length());
        tel = str.substring(begin, end);
        return tel;
    }

    /**
     * Sets the star rating by parsing the (up to) two characters that follow the
     * first occurrence of {@code hotel_stars} as an integer. When the marker is
     * absent the current value is left untouched.
     *
     * @param str full page source
     * @return the star rating, or the previous value if the marker is absent
     * @throws Exception if {@code str} is {@code null} or the characters after
     *         the marker are not an integer (a {@link NumberFormatException})
     */
    public int setHotelStars(String str) throws Exception {
        int index = requireSource(str).indexOf(STARS_MARKER);
        if (index == -1) {
            return hotelStars;
        }
        int begin = index + STARS_MARKER.length();
        int end = Math.min(begin + 2, str.length());
        hotelStars = Integer.parseInt(str.substring(begin, end));
        return hotelStars;
    }

    public String getHotelName() {
        return hotelName;
    }

    public String getHotelEname() {
        return hotelEname;
    }

    public String getCityName() {
        return cityName;
    }

    public String getProvinceName() {
        return provinceName;
    }

    public String getAddress() {
        return address;
    }

    public double getLat() {
        return lat;
    }

    public double getLng() {
        return lng;
    }

    public String getCoordinates() {
        return coordinates;
    }

    public String getTel() {
        return tel;
    }

    public int getHotelStars() {
        return hotelStars;
    }

    /** Clears every parsed field back to its initial (unset) value. */
    private void reset() {
        hotelName = null;
        hotelEname = null;
        cityName = null;
        provinceName = null;
        address = null;
        lat = 0;
        lng = 0;
        coordinates = null;
        tel = null;
        hotelStars = 0;
    }

    /** Returns {@code str}, rejecting {@code null} with a descriptive exception. */
    private static String requireSource(String str) {
        if (str == null) {
            throw new IllegalArgumentException("Page source must not be null");
        }
        return str;
    }

    /** Returns the index of the first occurrence of {@code marker}, failing if it is absent. */
    private static int requireMarker(String str, String marker) {
        int index = requireSource(str).indexOf(marker);
        if (index == -1) {
            throw missing(marker);
        }
        return index;
    }

    /** Returns the index of {@code token} at or after {@code from}, failing if it is absent. */
    private static int requireIndexOf(String str, String token, int from, String field) {
        int index = str.indexOf(token, from);
        if (index == -1) {
            throw new IllegalArgumentException(
                    "Malformed page source: \"" + token + "\" not found while reading " + field);
        }
        return index;
    }

    /** Returns {@code str.substring(begin, end)}, failing with a descriptive message on bad bounds. */
    private static String slice(String str, int begin, int end, String field) {
        if (begin > end || end > str.length()) {
            throw new IllegalArgumentException("Malformed page source: cannot extract " + field);
        }
        return str.substring(begin, end);
    }

    /** Returns the text between the first {@code >} after {@code markerIndex} and the next {@code <}. */
    private static String elementText(String str, int markerIndex, String field) {
        int open = requireIndexOf(str, ">", markerIndex, field);
        int close = requireIndexOf(str, "<", open, field);
        return str.substring(open + 1, close);
    }

    /**
     * Parses the number found after {@code marker}: it starts 9 characters after
     * the next {@code content=} (attribute name, {@code =}, and one more
     * character, presumably an opening quote) and ends 2 characters before the
     * next {@code />} (presumably a closing quote and a space).
     */
    private static double contentValue(String str, String marker) {
        int index = requireMarker(str, marker);
        int begin = requireIndexOf(str, "content=", index, marker) + 9;
        int end = requireIndexOf(str, "/>", index, marker) - 2;
        return Double.parseDouble(slice(str, begin, end, marker));
    }

    /** Builds the exception thrown when a required marker is not present. */
    private static IllegalArgumentException missing(String marker) {
        return new IllegalArgumentException(
                "Marker \"" + marker + "\" not found in page source");
    }

    /**
     * Demo: downloads a fixed list of hotel pages and prints the parsed fields,
     * one per line. A page that cannot be fetched or parsed is reported on
     * standard error and skipped; the remaining pages are still processed.
     *
     * @param args unused
     * @throws IOException declared for backward compatibility; fetch failures are
     *         handled per URL
     */
    public static void main(final String[] args) throws IOException {
        final List<String> urls = new ArrayList<String>();
        urls.add("http://hotels.ctrip.com/hotel/427952.html");
        urls.add("http://hotels.ctrip.com/hotel/671.html");
        urls.add("http://hotels.ctrip.com/hotel/2005959.html");
        urls.add("http://hotels.ctrip.com/hotel/481810.html");
        urls.add("http://hotels.ctrip.com/hotel/2104633.html");
        urls.add("http://hotels.ctrip.com/hotel/1481502.html");
        urls.add("http://hotels.ctrip.com/hotel/1720124.html");
        urls.add("http://hotels.ctrip.com/hotel/2165407.html");
        urls.add("http://hotels.ctrip.com/hotel/1636803.html");
        urls.add("http://hotels.ctrip.com/hotel/371188.html");

        for (String url : urls) {
            // A fresh instance per page guarantees no field carries over between hotels.
            WebContent wc = new WebContent();

            String webinfo;
            try {
                webinfo = wc.getUrlSource(url);
            } catch (IOException e) {
                System.err.println("Failed to fetch " + url + ": " + e);
                continue;
            }
            if (webinfo.isEmpty() || webinfo.indexOf(VERIFICATION_MARKER) != -1) {
                continue;
            }

            try {
                wc.setAll(webinfo);
            } catch (Exception e) {
                // Do not print half-parsed data for a page that failed.
                System.err.println("Failed to parse " + url + ": " + e);
                continue;
            }

            System.out.println(wc.getHotelName());
            System.out.println(wc.getHotelEname());
            System.out.println(wc.getCityName());
            System.out.println(wc.getProvinceName());
            System.out.println(wc.getAddress());
            System.out.println(wc.getLat());
            System.out.println(wc.getLng());
            System.out.println(wc.getCoordinates());
            System.out.println(wc.getTel());
            System.out.println(wc.getHotelStars());
        }
    }
}

这里是输出结果:

北京金隅喜来登酒店
Sheraton Beijing Dongcheng Hotel
北京
北京
北京北三环东路36号
39.97346873
116.4163028
39.97346873, 116.4163028
010-57988888
5
上海大厦
Broadway Mansions Hotel
上海
上海
上海北苏州路20号
31.250007605066
121.49663745291
31.250007605066, 121.49663745291
021-63246260
5
宁国伯爵王朝大酒店
Bojue Dynasty Hotel
宁国
安徽
安徽宁国宁阳西路155号
30.606036618238
118.97454952709
30.606036618238, 118.97454952709
0563-4188888
5
合肥天鹅湖大酒店
Swan Lake Hotel
合肥
安徽
安徽合肥政务文化新区东流路888号
31.823922092928
117.23604371154
31.823922092928, 117.23604371154
0551-6353666
5
宁国都市阳光酒店

宁国
安徽
安徽宁国中溪北路8号
30.633710023238
118.98375012382
30.633710023238, 118.98375012382
0563-4101788
3
达拉特旗东达假日酒店
Dongda Holiday Hotel
达拉特旗
内蒙古
内蒙古达拉特旗树林召西街南侧
40.402906010224
110.00925942075
40.402906010224, 110.00925942075
0477-3963888
3
大理和舀田园度假酒店

大理市
云南
云南大理市城北村(0872-2475995)
25.861913102242
100.14410073281
25.861913102242, 100.14410073281
0872-2475995
3
喀纳斯贾登峪回家休闲酒店(酒店区)
Connectedhome
布尔津
新疆
新疆布尔津喀纳斯贾登峪生活区游客接待基地一区
48.501779520655
87.157369327333
48.501779520655, 87.157369327333
0906-6327598
3
欣得酒店(北京石佛营店)

北京
北京
北京石佛营东里99号
39.941142006731
116.51243647991
39.941142006731, 116.51243647991
010-85814122
3
青岛颐中皇冠假日酒店
Crowne Plaza Qingdao
青岛
山东
山东青岛香港中路76号
36.070022690161
120.40615095949
36.070022690161, 120.40615095949
0532-8571888
3


当然,在具体的工作学习使用中,可以将数据存成相应的数据格式来保存在数据库中。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息