您的位置：首页 > 理论基础 > 计算机网络

httpclient爬虫爬取电影信息和下载地址实例

2019-09-16 15:27 197 查看

本次更新主要解决了三个问题：一是老旧页面的下载链接可能是迅雷（thunder）或 ftp 格式；二是去重——每一页都有一个推荐列表，里面也会有相应的详情链接，会导致重复抓取；三是兼容了另外一种页面格式。主要更新了两个方法，相关代码如下：

/**
 * Crawls one listing page: collects its detail-page links, drops a fixed
 * list of known links, de-duplicates the rest and processes each one with
 * {@link #getMovieInfo(String)}.
 *
 * <p>A failure on a single detail page does not stop the crawl; the failing
 * URL is printed via {@code output} and the loop continues.
 *
 * @param pa listing page number passed to {@link #getPage(int)}
 */
public static void spider(int pa) {
    List<String> links = getPage(pa);
    // Hard-coded links to skip — presumably the "recommended" entries that show up
    // on every listing page (NOTE(review): confirm against the site).
    String[] skipped = "http://www.***.net/ys/20170620/37704.htm, http://www.***.net/ys/20170727/38028.htm, http://www.***.net/ys/20170810/38113.htm, http://www.***.net/ys/20170703/37769.htm, http://www.***.net/ys/20170615/37680.htm, http://www.***.net/ys/20170615/37678.htm, http://www.***.net/ys/20170727/38027.htm, http://www.***.net/ys/20170802/38060.htm, http://www.***.net/ys/20170515/37385.htm, http://www.***.net/ys/20170725/38001.htm, http://www.***.net/ys/20170608/37614.htm, http://www.***.net/ys/20170802/38059.htm, http://www.***.net/ys/20170629/37742.htm, http://www.***.net/ys/20170512/37323.htm, http://www.***.net/ys/20170426/37219.htm, http://www.***.net/ys/20170727/38026.htm, http://www.***.net/ys/20170730/38046.htm, http://www.***.net/ys/20170804/38082.htm, http://www.***.net/ys/20170714/37848.htm, http://www.***.net/ys/20180819/40982.htm, http://www.***.net/ys/20180819/40981.htm, http://www.***.net/ys/20180818/40980.htm, http://www.***.net/ys/20180818/40979.htm, http://www.***.net/ys/20180818/40978.htm, http://www.***.net/ys/20180818/40977.htm, http://www.***.net/ys/20180817/40975.htm, http://www.***.net/ys/20180817/40974.htm".split(", ");
    links.removeAll(Arrays.asList(skipped));
    output(links.size());

    // The same detail link can appear several times on one page; a set removes the repeats.
    Set<String> uniqueLinks = new HashSet<>();
    uniqueLinks.addAll(links);

    for (String link : uniqueLinks) {
        try {
            getMovieInfo(link);
            // Pause between requests: random part from getRandomInt(3) plus a fixed 3.
            sleep(getRandomInt(3) + 3);
        } catch (Exception e) {
            // Best effort: report the URL that failed and move on to the next one.
            output(link);
        }
    }
}

/**
 * Processes every distinct detail-page link found in an already downloaded
 * listing page.
 *
 * <p>A failure on a single detail page does not stop the crawl; the failing
 * URL is printed via {@code output} and the loop continues.
 *
 * @param text content of a listing page, handed to {@link #getPage(String)}
 */
public static void spider(String text) {
    List<String> links = getPage(text);

    // Collapse repeated links so each detail page is visited once.
    Set<String> uniqueLinks = new HashSet<>();
    uniqueLinks.addAll(links);

    for (String link : uniqueLinks) {
        try {
            getMovieInfo(link);
            sleep(getRandomInt(3));
        } catch (Exception e) {
            // Best effort: report the URL that failed and move on to the next one.
            output(link);
        }
    }
}

/**
 * Downloads a listing page and returns every detail-page link found in it.
 *
 * <p>Page 1 lives at the section root; every other page uses the
 * {@code index_<page>.htm} form.
 *
 * @param page listing page number
 * @return all matches of the detail-link pattern, in the order {@code regexAll}
 *         reports them; repeats are not removed here
 */
public static List<String> getPage(int page) {
    String url = page == 1
            ? "http://www.***.net/ys/"
            : "http://www.***.net/ys/index_" + page + ".htm";
    output(url);

    JSONObject response = getHttpResponse(getHttpGet(url));
    String content = response.getString("content");
    // Re-encode the body through UTF-8 before matching, as the original code does.
    String all = new String(content.getBytes(UTF_8), UTF_8);
    return regexAll(all, "http://www.***.net/ys/\\d+/\\d+.htm");
}

/**
 * Extracts every detail-page link from listing-page content that the caller
 * already holds, without issuing a request.
 *
 * @param page content of a listing page
 * @return all matches of the detail-link pattern, in the order {@code regexAll}
 *         reports them; repeats are not removed here
 */
public static List<String> getPage(String page) {
    // Re-encode the text through UTF-8 before matching, as the original code does.
    String all = new String(page.getBytes(UTF_8), UTF_8);
    return regexAll(all, "http://www.***.net/ys/\\d+/\\d+.htm");
}

/**
 * Builds a detail-page URL from its date directory and numeric id and
 * processes it with {@link #getMovieInfo(String)}.
 *
 * <p>Example: {@code getMovieInfo(20180819, 40981)} targets
 * {@code http://www.***.net/ys/20180819/40981.htm}.
 *
 * @param day   date directory of the page, e.g. {@code 20180819}
 * @param index numeric id of the page, e.g. {@code 40981}
 * @return the result of {@link #getMovieInfo(String)} for the built URL
 */
public static boolean getMovieInfo(int day, int index) {
    String url = "http://www.***.net/ys/" + day + "/" + index + ".htm";
    // Propagate the outcome instead of always answering true, so callers can
    // tell when the page did not exist.
    return getMovieInfo(url);
}

/**
 * Downloads a movie detail page, extracts the movie's descriptive fields and
 * download links, and hands an INSERT statement to {@code MySqlTest.sendWork}
 * when at least one ed2k/ftp/thunder link was found.
 *
 * <p>Two page layouts are handled: pages that contain the "◎" marker (fields
 * are read from the block between the first marker and the next
 * {@code <hr}), and pages without it (fields are read from the whole page
 * using the "label: " form, and the name comes from the {@code <title>}).
 *
 * @param url detail page URL
 * @return {@code false} when the page shows the site's "content does not
 *         exist" notice, {@code true} otherwise
 */
public static boolean getMovieInfo(String url) {
    HttpGet httpGet = getHttpGet(url);
    JSONObject response = getHttpResponse(httpGet);
    String s = response.getString("content");
    if (s.contains("您查询的内容不存在，请返回首页重新搜索")) {
        return false;
    }
    byte[] bytes = s.getBytes(UTF_8);
    String all = new String(bytes, UTF_8);

    String name = EMPTY, tname = EMPTY, year = EMPTY, language = EMPTY, date = EMPTY, score = EMPTY, length = EMPTY, author = EMPTY;
    int infoStart = all.indexOf("◎");
    if (infoStart >= 0) {
        // Search for the closing <hr only after the marker: an <hr earlier in the page
        // (or none at all) used to make substring throw.
        int infoEnd = all.indexOf("<hr", infoStart);
        if (infoEnd < 0) {
            infoEnd = all.length();
        }
        // Slice the same string the indices were computed on.
        String info = all.substring(infoStart, infoEnd);
        name = getInfo(info, "片　　名　");
        tname = getInfo(info, "译　　名　");
        year = getInfo(info, "年　　代　");
        language = getInfo(info, "语　　言　");
        date = getInfo(info, "上映日期　");
        score = getInfo(info, "豆瓣评分　");
        length = getInfo(info, "片　　长　");
        author = getInfo(info, "导　　演　");
    } else {
        name = getInfo(all, "<title>");
        if (name.contains("_")) {
            name = name.substring(0, name.indexOf("_"));
        }
        length = getInfo(all, "片长: ");
        date = getInfo(all, "上映日期: ");
        author = getInfo(all, "导演: ");
        language = getInfo(all, "语言: ");
    }

    List<String> magnets = regexAll(all, "magnet:.+?>");
    // Older pages may offer ftp or thunder links instead of ed2k; try them in that order.
    List<String> ed2ks = regexAll(all, "ed2k:.+?>");
    if (ed2ks.isEmpty()) {
        ed2ks = regexAll(all, "ftp://.+?>");
    }
    if (ed2ks.isEmpty()) {
        ed2ks = regexAll(all, "thunder://.+?>");
    }
    List<String> pans = regexAll(all, "http(s)*://pan.baidu.com/.+?</td>");

    // Every value is interpolated into a double-quoted SQL literal, so a stray '"' in
    // scraped text would break the statement. Strip it from all values, not only from
    // the link lists.
    // NOTE(review): this is not full escaping (e.g. backslashes); a parameterized
    // statement would be safer if MySqlTest offers one.
    String[] rawValues = {name, tname, year, language, date, score, length, author,
            magnets.toString(), ed2ks.toString(), pans.toString()};
    Object[] sqlValues = new Object[rawValues.length];
    for (int k = 0; k < rawValues.length; k++) {
        sqlValues[k] = rawValues[k].replace("\"", EMPTY);
    }
    String sql = "INSERT INTO movie (name,tname,year,language,date,score,length,author,magnet,ed2k,pan) VALUES(\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\");";
    sql = String.format(sql, sqlValues);

    // Only rows that carry at least one ed2k/ftp/thunder link are stored.
    if (!ed2ks.isEmpty()) {
        MySqlTest.sendWork(sql);
    }
    output(magnets.toString().length(), ed2ks.toString().length(), pans.toString().length());
    output(sql);
    return true;
}

/**
 * Reads the value that follows a label in a piece of page text.
 *
 * <p>The first match of {@code start + ".+?<"} is taken, then every
 * occurrence of the label and of {@code "<"} is removed from it.
 *
 * @param text  text to search
 * @param start label that precedes the value; it is used as part of a regular
 *              expression as-is
 * @return the extracted value, or {@code EMPTY} when the label is not found
 */
public static String getInfo(String text, String start) {
    List<String> matches = regexAll(text, start + ".+?<");
    if (matches.isEmpty()) {
        return EMPTY;
    }
    String firstMatch = matches.get(0);
    String withoutLabel = firstMatch.replace(start, EMPTY);
    return withoutLabel.replace("<", EMPTY);
}

----------------------------------分割线-------------------------------------------------

本人在使用 httpclient 写爬虫的过程中，想爬取自己关注的一个电影网站的下载地址。经过多次尝试之后，终于成功爬取了几百部热门电影的信息和下载地址（电驴和磁力链接）。中间遇到了编码格式、正则匹配不一致、重复链接过滤等问题，也都一一搞定。附上代码，供大家参考。

关键信息隐藏，思路供大家参考：先去访问列表页，拿到详情页的链接，去重之后去访问详情页，拿到相关信息和下载地址，存储到数据库中。

/**
 * Small crawler for a movie site: visits a listing page, collects the
 * detail-page links, de-duplicates them, visits each detail page, extracts the
 * movie information and download links, and stores them in the database via
 * {@code MySqlTest.sendWork}.
 */
public class MyTest extends ApiLibrary {

    /**
     * Entry point: sets the default charset to GB2312 and runs the crawl.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        DEFAULT_CHARSET = GB2312;
        // NOTE(review): the loop index is unused, so listing page 1 is crawled ten times.
        // If pages 1..10 were intended this should be spider(i + 1) — confirm before changing.
        for (int i = 0; i < 10; i++) {
            spider(1);
        }

        testOver();
    }

    /**
     * Crawls one listing page: collects its detail-page links, drops a fixed
     * list of known links, de-duplicates the rest and processes each one with
     * {@link #getMovieInfo(String)}.
     *
     * <p>A failure on a single detail page does not stop the crawl; the failing
     * URL is printed via {@code output} and the loop continues.
     *
     * @param pa listing page number passed to {@link #getPage(int)}
     */
    public static void spider(int pa) {
        // The original used `page` without declaring it, which does not compile.
        List<String> page = getPage(pa);
        // Hard-coded links to skip — presumably the "recommended" entries that show up
        // on every listing page (NOTE(review): confirm against the site).
        String[] abc = "http://www.***.net/ys/20170620/37704.htm, http://www.***.net/ys/20170727/38028.htm, http://www.***.net/ys/20170810/38113.htm, http://www.***.net/ys/20170703/37769.htm, http://www.***.net/ys/20170615/37680.htm, http://www.***.net/ys/20170615/37678.htm, http://www.***.net/ys/20170727/38027.htm, http://www.***.net/ys/20170802/38060.htm, http://www.***.net/ys/20170515/37385.htm, http://www.***.net/ys/20170725/38001.htm, http://www.***.net/ys/20170608/37614.htm, http://www.***.net/ys/20170802/38059.htm, http://www.***.net/ys/20170629/37742.htm, http://www.***.net/ys/20170512/37323.htm, http://www.***.net/ys/20170426/37219.htm, http://www.***.net/ys/20170727/38026.htm, http://www.***.net/ys/20170730/38046.htm, http://www.***.net/ys/20170804/38082.htm, http://www.***.net/ys/20170714/37848.htm, http://www.***.net/ys/20180819/40982.htm, http://www.***.net/ys/20180819/40981.htm, http://www.***.net/ys/20180818/40980.htm, http://www.***.net/ys/20180818/40979.htm, http://www.***.net/ys/20180818/40978.htm, http://www.***.net/ys/20180818/40977.htm, http://www.***.net/ys/20180817/40975.htm, http://www.***.net/ys/20180817/40974.htm".split(", ");
        List<String> list = Arrays.asList(abc);
        page.removeAll(list);

        // The same detail link can appear several times on one page; a set removes the repeats.
        Set<String> truelist = new HashSet<>();
        truelist.addAll(page);

        for (String p : truelist) {
            try {
                getMovieInfo(p);
                sleep(getRandomInt(3));
            } catch (Exception e) {
                // Best effort: report the URL that failed and move on to the next one.
                output(p);
            }
        }
    }

    /**
     * Downloads a listing page, prints its content, and returns every
     * detail-page link found in it.
     *
     * <p>Page 1 lives at the section root; every other page uses the
     * {@code index_<page>.htm} form.
     *
     * @param page listing page number
     * @return all matches of the detail-link pattern; repeats are not removed here
     */
    public static List<String> getPage(int page) {
        String url = "http://www.***.net/ys/index_" + page + ".htm";
        if (page == 1) {
            url = "http://www.***.net/ys/";
        }
        output(url);
        HttpGet httpGet = getHttpGet(url);
        JSONObject response = getHttpResponse(httpGet);
        String content = response.getString("content");
        output(content);
        byte[] bytes = content.getBytes(UTF_8);
        String all = new String(bytes, UTF_8);
        return regexAll(all, "http://www.***.net/ys/\\d+/\\d+.htm");
    }

    /**
     * Builds a detail-page URL from its date directory and numeric id and
     * processes it with {@link #getMovieInfo(String)}.
     *
     * <p>Example: {@code getMovieInfo(20180819, 40981)} targets
     * {@code http://www.***.net/ys/20180819/40981.htm}.
     *
     * @param day   date directory of the page, e.g. {@code 20180819}
     * @param index numeric id of the page, e.g. {@code 40981}
     * @return the result of {@link #getMovieInfo(String)} for the built URL
     */
    public static boolean getMovieInfo(int day, int index) {
        String url = "http://www.***.net/ys/" + day + "/" + index + ".htm";
        // This overload used to repeat the String overload line for line; delegate instead.
        return getMovieInfo(url);
    }

    /**
     * Downloads a movie detail page, extracts the movie's descriptive fields
     * and download links, and hands an INSERT statement to
     * {@code MySqlTest.sendWork}.
     *
     * <p>Fields are read from the block between the first "◎" marker and the
     * next {@code <hr}. A page without the marker is not supported by this
     * version: {@code substring} throws {@link StringIndexOutOfBoundsException},
     * which {@link #spider(int)} catches and reports.
     *
     * @param url detail page URL
     * @return {@code false} when the page shows the site's "content does not
     *         exist" notice, {@code true} otherwise
     */
    public static boolean getMovieInfo(String url) {
        HttpGet httpGet = getHttpGet(url);
        JSONObject response = getHttpResponse(httpGet);
        String s = response.getString("content");
        if (s.contains("您查询的内容不存在，请返回首页重新搜索")) {
            return false;
        }
        byte[] bytes = s.getBytes(UTF_8);
        String all = new String(bytes, UTF_8);

        int i = all.indexOf("◎");
        // Search for the closing <hr only after the marker: an <hr earlier in the page
        // (or none after the marker) used to make substring throw on otherwise valid pages.
        int i1 = all.indexOf("<hr", i);
        if (i1 < 0) {
            i1 = all.length();
        }
        // Slice the same string the indices were computed on.
        String info = all.substring(i, i1);
        String name = getInfo(info, "片　　名　");
        String tname = getInfo(info, "译　　名　");
        String year = getInfo(info, "年　　代　");
        String language = getInfo(info, "语　　言　");
        String date = getInfo(info, "上映日期　");
        String score = getInfo(info, "豆瓣评分　");
        String length = getInfo(info, "片　　长　");
        String author = getInfo(info, "导　　演　");

        List<String> magnets = regexAll(all, "magnet:.+?>");
        List<String> ed2ks = regexAll(all, "ed2k:.+?>");
        List<String> pans = regexAll(all, "http(s)*://pan.baidu.com/.+?</td>");

        // Every value is interpolated into a double-quoted SQL literal, so a stray '"' in
        // scraped text would break the statement. Strip it from all values, not only from
        // the link lists.
        // NOTE(review): this is not full escaping (e.g. backslashes); a parameterized
        // statement would be safer if MySqlTest offers one.
        String[] rawValues = {name, tname, year, language, date, score, length, author,
                magnets.toString(), ed2ks.toString(), pans.toString()};
        Object[] sqlValues = new Object[rawValues.length];
        for (int k = 0; k < rawValues.length; k++) {
            sqlValues[k] = rawValues[k].replace("\"", EMPTY);
        }
        String sql = "INSERT INTO movie (name,tname,year,language,date,score,length,author,magnet,ed2k,pan) VALUES(\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\");";
        sql = String.format(sql, sqlValues);

        MySqlTest.sendWork(sql);
        output(magnets.toString().length(), ed2ks.toString().length(), pans.toString().length());
        output(sql);
        return true;
    }

    /**
     * Reads the value that follows a label in a piece of page text.
     *
     * <p>The first match of {@code start + ".+?<"} is taken, then every
     * occurrence of the label and of {@code "<"} is removed from it.
     *
     * @param text  text to search
     * @param start label that precedes the value; it is used as part of a
     *              regular expression as-is
     * @return the extracted value, or {@code EMPTY} when the label is not found
     */
    public static String getInfo(String text, String start) {
        String value = EMPTY;
        List<String> nameinfo = regexAll(text, start + ".+?<");
        if (nameinfo.size() > 0) {
            value = nameinfo.get(0).replace(start, EMPTY).replace("<", EMPTY);
        }
        return value;
    }

}

下面是数据库存储的截图：

技术类文章精选

非技术文章精选

点击查看公众号地图

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： HttpClient .Net Framework

相关文章推荐

新的分享

章节导航