您的位置:首页 > 编程语言 > Java开发

java利用爬虫技术抓取(省、市(区号\邮编)、县)数据

2014-07-18 23:00 573 查看
最近项目需要用到城市的地址信息,但从网上下载的 XML 数据几乎没有最新的,数据太老,导致有些地区不全。所以才想到天气预报官网肯定有最新最全的数据。贴出代码,希望能帮有同样困惑的朋友节省一些时间。

/**
 * Looks up the postal code and telephone area code of a city by scraping the
 * ip138.com "area2zone" search page.
 *
 * <p>Any failure (including a {@code null} name) is reported on standard output
 * and leaves the affected entries {@code null}; no exception reaches the caller.
 *
 * @param var city name
 * @return a two-element array: index 0 is the postal code, index 1 is the area
 *         code; either entry is {@code null} when it could not be found
 */
private String[] getZipCode(String var) {
    String[] code = new String[2];
    String zipCode_S = "邮编:";
    // NOTE(review): the live page may terminate the value with "&nbsp;" rather
    // than a plain space — confirm against the actual HTML.
    String zipCode_E = " ";
    String qhCode_S = "区号:";
    String qhCode_E = "</td>";
    BufferedReader br = null;
    try {
        // Encode the query with an explicit charset instead of the deprecated
        // platform-default overload. GBK matches the charset used to decode the
        // response below; presumably the site expects the same for the query.
        String encode = URLEncoder.encode(var, "GBK");
        URL url = new URL("http://www.ip138.com/post/search.asp?area="
                + encode + "&action=area2zone");
        br = new BufferedReader(new InputStreamReader(url.openStream(), "GBK"));
        for (String line; (line = br.readLine()) != null;) {
            String zip = extractBetween(line, zipCode_S, zipCode_E);
            if (zip != null) {
                code[0] = zip;
            }
            // The area code is the last value of interest: stop reading once
            // its marker has been seen, whether or not the value was parsable.
            if (line.indexOf(qhCode_S) >= 0) {
                code[1] = extractBetween(line, qhCode_S, qhCode_E);
                break;
            }
        }
    } catch (Exception e) {
        System.out.println(var +"\t错误"+e.toString());
    } finally {
        // Always release the HTTP stream, even when parsing or reading failed.
        if (br != null) {
            try {
                br.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return code;
}

/**
 * Returns the text that follows the first {@code start} marker in {@code line},
 * up to the next {@code end} marker.
 *
 * @return the enclosed text, or {@code null} when either marker is missing
 */
private String extractBetween(String line, String start, String end) {
    int from = line.indexOf(start);
    if (from < 0) {
        return null;
    }
    from += start.length();
    int to = line.indexOf(end, from);
    if (to < 0) {
        return null;
    }
    return line.substring(from, to);
}

/**
 * Main routine, run as a test: walks the province, city and district listings
 * published by weather.com.cn, builds one XML tree per level and saves the
 * three trees to disk.
 *
 * @throws Exception declared by the signature
 */
@Test
public void main() throws Exception {
    // Step 1: fetch every province (code -> name).
    TreeMap<String, String> provinces =
            getAddressInfo("http://www.weather.com.cn//data/city3jdata/china.html");

    // Root elements of the three output documents.
    Element provinceRoot = DocumentHelper.createElement("Provinces");
    Element cityRoot = DocumentHelper.createElement("Citys");
    Element districtRoot = DocumentHelper.createElement("Districts");

    // Running ids; cities and districts are numbered across all provinces.
    int provinceId = 1;
    int cityId = 1;
    int districtId = 1;

    for (Entry<String, String> provinceEntry : provinces.entrySet()) {
        String provinceCode = provinceEntry.getKey();
        String provinceName = provinceEntry.getValue();
        String provinceIdText = String.valueOf(provinceId);

        Element province = DocumentHelper.createElement("Province");
        province.addAttribute("ID", provinceIdText)
                .addAttribute("ProvinceName", provinceName)
                .addText(provinceName);

        // Step 2: fetch the cities of this province.
        TreeMap<String, String> cities = getAddressInfo(
                "http://www.weather.com.cn/data/city3jdata/provshi/" + provinceCode + ".html");
        for (Entry<String, String> cityEntry : cities.entrySet()) {
            String cityCode = cityEntry.getKey();
            String cityName = cityEntry.getValue();
            String cityIdText = String.valueOf(cityId);

            Element city = DocumentHelper.createElement("City");

            // Postal code (index 0) and area code (index 1) for the city.
            String[] codes = getZipCode(cityName);
            boolean hasBothCodes = codes[0] != null && codes[1] != null;
            if (!hasBothCodes) {
                System.out.println("缺少"+cityName+"邮政或区号!");
            }
            city.addAttribute("ID", cityIdText)
                    .addAttribute("CityName", cityName)
                    .addAttribute("PID", provinceIdText)
                    .addAttribute("ZipCode", codes[0])
                    .addAttribute("AreaCode", codes[1])
                    .addText(cityName);

            // Step 3: fetch the districts/counties of this city.
            TreeMap<String, String> districts = getAddressInfo(
                    "http://www.weather.com.cn/data/city3jdata/station/"
                            + provinceCode + cityCode + ".html");
            for (Entry<String, String> districtEntry : districts.entrySet()) {
                String districtName = districtEntry.getValue();
                // Entries named exactly like the city itself are skipped.
                if (!districtName.equals(cityName)) {
                    Element district = DocumentHelper.createElement("District");
                    district.addAttribute("ID", String.valueOf(districtId++))
                            .addAttribute("DistrictName", districtName)
                            .addAttribute("CID", cityIdText)
                            .addText(districtName);
                    districtRoot.add(district);
                }
            }

            cityRoot.add(city);
            cityId++;
        }

        provinceRoot.add(province);
        provinceId++;
    }

    // Step 4: write the three documents to local files.
    saveInf("f:\\Provinces.xml", provinceRoot);
    saveInf("f:\\Citys.xml", cityRoot);
    saveInf("f:\\Districts.xml", districtRoot);
}

/**
 * Saves an element as the root of a new XML document, written as UTF-8 with
 * tab indentation and line breaks.
 *
 * @param savePath path of the XML file to write
 * @param varEle   root element of the document
 * @throws RuntimeException wrapping any failure while opening or writing the file
 */
private void saveInf(String savePath, Element varEle) {
    Document varDoc = DocumentHelper.createDocument();
    varDoc.add(varEle);
    FileOutputStream out = null;
    try {
        out = new FileOutputStream(new File(savePath));
        XMLWriter xmlwri = new XMLWriter(out, new OutputFormat("\t", true, "UTF-8"));
        xmlwri.write(varDoc);
        xmlwri.close();
    } catch (Exception e) {
        System.out.println(savePath +"失败,原因如下");
        throw new RuntimeException(e);
    } finally {
        // Release the file handle even when creating the writer or writing the
        // document failed; closing an already-closed stream is harmless.
        if (out != null) {
            try {
                out.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

/**
 * Reads the first line served at the given URL and parses it as a flat list of
 * {@code key:value} pairs (braces and double quotes are stripped, pairs are
 * separated by commas).
 *
 * <p>Read errors are printed to standard output and yield an empty map. A piece
 * that does not split into exactly two parts on {@code ':'} is skipped and the
 * URL is printed instead.
 *
 * @param address URL to read
 * @return sorted map from entry code (key) to entry name (value); never {@code null}
 */
private TreeMap<String, String> getAddressInfo(String address) {
    TreeMap<String, String> entries = new TreeMap<String, String>();
    String firstLine = null;
    BufferedReader reader = null;
    try {
        reader = new BufferedReader(
                new InputStreamReader(new URL(address).openStream(), "UTF-8"));
        firstLine = reader.readLine();
    } catch (Exception e) {
        System.out.println("错误:"+e.getMessage());
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    if (firstLine != null) {
        // Drop the JSON punctuation so only "key:value,key:value" remains.
        String stripped = firstLine.replaceAll("\\{|\\}|\"", "");
        for (String pair : stripped.split(",")) {
            String[] parts = pair.split(":");
            if (parts.length != 2) {
                // Malformed piece: report which URL produced it and move on.
                System.out.println(address);
                continue;
            }
            entries.put(parts[0], parts[1]);
        }
    }
    return entries;
}


下载xml数据
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐