Java网络爬虫基础和抓取网站数据的两个小实例
2017-11-09 17:35
429 查看
前段时间在学习爬虫,并从网络抓取了一些简单的数据,记录一下。
抓取分成下面3个部分:
1、网络请求
2、解析抓取下来的页面,并且处理乱码或者解压代码的问题
3、拿到指定的数据、资源
完整代码如下:
第一个实例:
以下为 Java 代码:
/**
 * Scans the forum's list pages and collects every post title, appending
 * "title + link" lines to a text file on the (Windows) desktop.
 *
 * @return map of post title -> absolute link for every post found
 */
public static Map<String, String> parseClPage(){
    String baseUrl = "http://cl.xxxx/thread0806.php"; // site being parsed
    // Desktop path; assumes the JVM runs on Windows — TODO confirm
    String currentuserdesktop = System.getProperty("user.home")+"\\Desktop";
    Map<String, String> resultMap = new TreeMap<String, String>(); // title -> link
    try {
        for (int i = 0; i < 199; i++) { // page range to scan
            StringBuffer htmlCode = new StringBuffer("");
            HttpMethod httpMethod = new GetMethod(baseUrl + "?fid=7&search=&page="+(i+1));
            HttpClient client = new HttpClient(); // browser-like headers below so the server serves the normal page
            httpMethod.addRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            httpMethod.addRequestHeader("Accept-Encoding", "gzip, deflate, sdch");
            httpMethod.addRequestHeader("Accept-Language", "zh-CN,zh;q=0.8");
            httpMethod.addRequestHeader("Referer", "http://cl.clvv.biz/thread0806.php?fid=7");
            httpMethod.addRequestHeader("HTTPS", "1");
            httpMethod.addRequestHeader("Connection", "keep-alive");
            httpMethod.addRequestHeader("Host", "cl.clvv.biz");
            httpMethod.addRequestHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36ozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36");
            client.setTimeout(3000);
            client.executeMethod(httpMethod);
            BufferedReader reader = null;
            try {
                InputStream inputStream = httpMethod.getResponseBodyAsStream(); // raw HTML stream
                // Response is gzip-compressed (requested via Accept-Encoding)
                GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream);
                reader = new BufferedReader(new InputStreamReader(gzipInputStream, Charset.forName("gb2312"))); // decode
                // FIX: the original called readLine() twice per iteration,
                // silently dropping every other line of the page (and could
                // append the literal "null" at EOF).
                String line;
                while ((line = reader.readLine()) != null) {
                    htmlCode.append(line);
                }
            } finally {
                // FIX: release the reader and connection even when parsing fails
                if (reader != null) {
                    reader.close();
                }
                httpMethod.abort();
                httpMethod.releaseConnection();
            }
            Document doc = Jsoup.parse(htmlCode.toString()); // parse with jsoup
            Elements elementsTr = doc.select("table tr"); // jQuery-style selector
            for (Element element : elementsTr) {
                String title = element.select("td").eq(1).select("h3 a").text();
                if (null != title && !"".equals(title)) {
                    String link = "http://cl.xxxx/" + element.select("td").eq(1).select("h3 a").attr("href");
                    resultMap.put(title, link); // FIX: map was declared but never populated
                    // Append this result to the desktop text file
                    writefiletotxt((new FileWriter(currentuserdesktop+"\\查找结果.txt",true)),("标题:"+title+"\t链接:"+link+"\r\n"));
                }
            }
        }
        System.out.println("done--");
    } catch (Exception e) {
        e.printStackTrace();
    }
    return resultMap;
}
/**
 * Writes {@code result} to the given writer, flushes it, and always closes it.
 *
 * @param fw     writer to append to (closed by this method in all cases)
 * @param result text to write
 */
public static void writefiletotxt(FileWriter fw, String result){
    try {
        fw.write(result);
        fw.flush();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // FIX: original leaked the writer when write()/flush() threw;
        // close in finally so the file handle is always released.
        try {
            fw.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
第二个实例:抓取网站图片。思路和第一个差不多
以下为 Java 代码:
/**
 * Downloads every image referenced by a page's {@code <img>} tags into a
 * "CatchImages" folder on the (Windows) desktop.
 *
 * @author 高攀
 */
public class CatchImages {
    // Target folder on the desktop; created on first download.
    private static String curdesktop = System.getProperty("user.home")+"\\Desktop\\CatchImages\\";

    public static void main(String[] args) {
        doCatch("http://item.jd.com/716240.html");
    }

    /**
     * Fetches the page, extracts all img src attributes and downloads each image.
     *
     * @param site page URL to scan
     * @return always 0 (kept for interface compatibility)
     */
    public static Integer doCatch(String site){
        GetMethod method = new GetMethod(site);
        HttpClient client = new HttpClient();
        try {
            // Browser-like request headers (gzip intentionally disabled so the
            // body can be read as plain text below)
            method.addRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            // method.addRequestHeader("Accept-Encoding", "gzip, deflate, sdch");
            method.addRequestHeader("Accept-Language", "zh-CN,zh;q=0.8");
            method.addRequestHeader("Avail-Dictionary", "XprLfaXG");
            method.addRequestHeader("Cache-Control", "max-age=0");
            method.addRequestHeader("Connection", "keep-alive");
            method.addRequestHeader("Cookie", "");
            method.addRequestHeader("Host", "user.qzone.qq.com");
            method.addRequestHeader("If-Modified-Since", "Thu, 24 Sep 2015 02:55:30 GMT");
            method.addRequestHeader("Upgrade-Insecure-Requests", "1");
            method.addRequestHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36ozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36");
            client.executeMethod(method);
            String htmlCode = method.getResponseBodyAsString();
            // Collect every <img> in the body and resolve each src to an absolute URL
            Document doc = Jsoup.parse(htmlCode);
            Elements elementImg = doc.select("body img");
            for (Element element : elementImg) {
                String src = element.attr("src");
                // FIX: original used contains("http"), which misclassifies
                // relative paths that merely contain "http" (e.g. "/img/http-icon.png").
                if (!src.startsWith("http")) {
                    // Relative path: prepend the site's root URL
                    src = HTMLParserHelper.getRootUrl(site) + src;
                }
                System.out.println(src);
                downloadImage(src);
                System.out.println("ok");
            }
            System.out.println(elementImg.size()+" result catched.");
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release the HTTP connection
            method.abort();
            method.releaseConnection();
        }
        return 0;
    }

    /**
     * Downloads a single image URL into the {@code curdesktop} folder.
     *
     * @param imageUrl absolute URL of the image to fetch
     */
    public static void downloadImage(String imageUrl){
        GetMethod method = new GetMethod(imageUrl);
        HttpClient client = new HttpClient();
        InputStream inputStream = null;
        FileOutputStream outputStream = null;
        try {
            client.executeMethod(method);
            inputStream = method.getResponseBodyAsStream();
            File dir = new File(curdesktop);
            if (!dir.exists()) {
                dir.mkdirs(); // FIX: mkdirs() also creates missing parent folders
            }
            outputStream = new FileOutputStream(new File(curdesktop + HTMLParserHelper.getImageNameAndHouzui(imageUrl)));
            // FIX: original copied through a 1-byte buffer; use 8 KiB chunks
            byte[] buffer = new byte[8192];
            int size;
            while ((size = inputStream.read(buffer)) != -1) {
                outputStream.write(buffer, 0, size);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // FIX: original leaked both streams on failure; close them here
            if (outputStream != null) {
                try {
                    outputStream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // Release the HTTP connection
            method.abort();
            method.releaseConnection();
        }
    }
}
抓取分成下面3个部分:
1、网络请求
2、解析抓取下来的页面,并且处理乱码或者解压代码的问题
3、拿到指定的数据、资源
完整代码如下:
第一个实例:
以下为 Java 代码:
/**
 * Scans the forum's list pages and collects every post title, appending
 * "title + link" lines to a text file on the (Windows) desktop.
 *
 * @return map of post title -> absolute link for every post found
 */
public static Map<String, String> parseClPage(){
    String baseUrl = "http://cl.xxxx/thread0806.php"; // site being parsed
    // Desktop path; assumes the JVM runs on Windows — TODO confirm
    String currentuserdesktop = System.getProperty("user.home")+"\\Desktop";
    Map<String, String> resultMap = new TreeMap<String, String>(); // title -> link
    try {
        for (int i = 0; i < 199; i++) { // page range to scan
            StringBuffer htmlCode = new StringBuffer("");
            HttpMethod httpMethod = new GetMethod(baseUrl + "?fid=7&search=&page="+(i+1));
            HttpClient client = new HttpClient(); // browser-like headers below so the server serves the normal page
            httpMethod.addRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            httpMethod.addRequestHeader("Accept-Encoding", "gzip, deflate, sdch");
            httpMethod.addRequestHeader("Accept-Language", "zh-CN,zh;q=0.8");
            httpMethod.addRequestHeader("Referer", "http://cl.clvv.biz/thread0806.php?fid=7");
            httpMethod.addRequestHeader("HTTPS", "1");
            httpMethod.addRequestHeader("Connection", "keep-alive");
            httpMethod.addRequestHeader("Host", "cl.clvv.biz");
            httpMethod.addRequestHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36ozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36");
            client.setTimeout(3000);
            client.executeMethod(httpMethod);
            BufferedReader reader = null;
            try {
                InputStream inputStream = httpMethod.getResponseBodyAsStream(); // raw HTML stream
                // Response is gzip-compressed (requested via Accept-Encoding)
                GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream);
                reader = new BufferedReader(new InputStreamReader(gzipInputStream, Charset.forName("gb2312"))); // decode
                // FIX: the original called readLine() twice per iteration,
                // silently dropping every other line of the page (and could
                // append the literal "null" at EOF).
                String line;
                while ((line = reader.readLine()) != null) {
                    htmlCode.append(line);
                }
            } finally {
                // FIX: release the reader and connection even when parsing fails
                if (reader != null) {
                    reader.close();
                }
                httpMethod.abort();
                httpMethod.releaseConnection();
            }
            Document doc = Jsoup.parse(htmlCode.toString()); // parse with jsoup
            Elements elementsTr = doc.select("table tr"); // jQuery-style selector
            for (Element element : elementsTr) {
                String title = element.select("td").eq(1).select("h3 a").text();
                if (null != title && !"".equals(title)) {
                    String link = "http://cl.xxxx/" + element.select("td").eq(1).select("h3 a").attr("href");
                    resultMap.put(title, link); // FIX: map was declared but never populated
                    // Append this result to the desktop text file
                    writefiletotxt((new FileWriter(currentuserdesktop+"\\查找结果.txt",true)),("标题:"+title+"\t链接:"+link+"\r\n"));
                }
            }
        }
        System.out.println("done--");
    } catch (Exception e) {
        e.printStackTrace();
    }
    return resultMap;
}
/**
 * Writes {@code result} to the given writer, flushes it, and always closes it.
 *
 * @param fw     writer to append to (closed by this method in all cases)
 * @param result text to write
 */
public static void writefiletotxt(FileWriter fw, String result){
    try {
        fw.write(result);
        fw.flush();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // FIX: original leaked the writer when write()/flush() threw;
        // close in finally so the file handle is always released.
        try {
            fw.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
第二个实例:抓取网站图片。思路和第一个差不多
以下为 Java 代码:
/**
 * Downloads every image referenced by a page's {@code <img>} tags into a
 * "CatchImages" folder on the (Windows) desktop.
 *
 * @author 高攀
 */
public class CatchImages {
    // Target folder on the desktop; created on first download.
    private static String curdesktop = System.getProperty("user.home")+"\\Desktop\\CatchImages\\";

    public static void main(String[] args) {
        doCatch("http://item.jd.com/716240.html");
    }

    /**
     * Fetches the page, extracts all img src attributes and downloads each image.
     *
     * @param site page URL to scan
     * @return always 0 (kept for interface compatibility)
     */
    public static Integer doCatch(String site){
        GetMethod method = new GetMethod(site);
        HttpClient client = new HttpClient();
        try {
            // Browser-like request headers (gzip intentionally disabled so the
            // body can be read as plain text below)
            method.addRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            // method.addRequestHeader("Accept-Encoding", "gzip, deflate, sdch");
            method.addRequestHeader("Accept-Language", "zh-CN,zh;q=0.8");
            method.addRequestHeader("Avail-Dictionary", "XprLfaXG");
            method.addRequestHeader("Cache-Control", "max-age=0");
            method.addRequestHeader("Connection", "keep-alive");
            method.addRequestHeader("Cookie", "");
            method.addRequestHeader("Host", "user.qzone.qq.com");
            method.addRequestHeader("If-Modified-Since", "Thu, 24 Sep 2015 02:55:30 GMT");
            method.addRequestHeader("Upgrade-Insecure-Requests", "1");
            method.addRequestHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36ozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36");
            client.executeMethod(method);
            String htmlCode = method.getResponseBodyAsString();
            // Collect every <img> in the body and resolve each src to an absolute URL
            Document doc = Jsoup.parse(htmlCode);
            Elements elementImg = doc.select("body img");
            for (Element element : elementImg) {
                String src = element.attr("src");
                // FIX: original used contains("http"), which misclassifies
                // relative paths that merely contain "http" (e.g. "/img/http-icon.png").
                if (!src.startsWith("http")) {
                    // Relative path: prepend the site's root URL
                    src = HTMLParserHelper.getRootUrl(site) + src;
                }
                System.out.println(src);
                downloadImage(src);
                System.out.println("ok");
            }
            System.out.println(elementImg.size()+" result catched.");
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release the HTTP connection
            method.abort();
            method.releaseConnection();
        }
        return 0;
    }

    /**
     * Downloads a single image URL into the {@code curdesktop} folder.
     *
     * @param imageUrl absolute URL of the image to fetch
     */
    public static void downloadImage(String imageUrl){
        GetMethod method = new GetMethod(imageUrl);
        HttpClient client = new HttpClient();
        InputStream inputStream = null;
        FileOutputStream outputStream = null;
        try {
            client.executeMethod(method);
            inputStream = method.getResponseBodyAsStream();
            File dir = new File(curdesktop);
            if (!dir.exists()) {
                dir.mkdirs(); // FIX: mkdirs() also creates missing parent folders
            }
            outputStream = new FileOutputStream(new File(curdesktop + HTMLParserHelper.getImageNameAndHouzui(imageUrl)));
            // FIX: original copied through a 1-byte buffer; use 8 KiB chunks
            byte[] buffer = new byte[8192];
            int size;
            while ((size = inputStream.read(buffer)) != -1) {
                outputStream.write(buffer, 0, size);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // FIX: original leaked both streams on failure; close them here
            if (outputStream != null) {
                try {
                    outputStream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // Release the HTTP connection
            method.abort();
            method.releaseConnection();
        }
    }
}
相关文章推荐
- htmlunit做爬虫绕过验证码抓取网站数据
- java实现网络爬虫--抓取网站数据
- Python爬虫----实例: 抓取百度百科Python词条相关1000个页面数据
- Nodejs实现爬虫抓取数据实例解析
- 爬虫抓取5大门户网站和电商数据day1:基础环境搭建
- python爬虫"Hello World"级入门实例(二),使用json从中国天气网抓取数据
- [Python爬虫] 之二十一:Selenium +phantomjs 利用 pyquery抓取36氪网站数据
- 直播网站LiveTV Mining,爬虫抓取数据 python3+selenium+phantomjs
- Python爬虫实例2-多线程爬虫抓取糗事百科数据
- Python selenium爬虫抓取船舶网站数据(动态页面)
- python3.4学习笔记(十四) 网络爬虫实例代码,抓取新浪爱彩双色球开奖数据实例
- 使用爬虫抓取网站异步加载数据
- 四周实现爬虫系统(1)-抓取tripadvisor猫途鹰网站数据信息
- 爬虫 抓取论坛 数据 发邮件 两个邮箱
- [Python爬虫] 之二十二:Selenium +phantomjs 利用 pyquery抓取界面网站数据
- java爬虫爬取网站数据实例
- Scrapy爬虫抓取网站数据
- [ahk]爬虫基础 post数据获取网站上的图片(获取艺术签名)
- 使用java 爬虫 抓取youtube,youku,facebook 等视频网站的视频数据(请求规则的分析)
- 零基础写python爬虫之抓取百度贴吧代码分享