您的位置:首页 > 其它

简单爬取百度百科周杰伦所有歌的歌词

2020-06-25 04:26 447 查看

这里写自定义目录标题


这只是学完爬虫后做的一个简单练习。其实还有更简单的思路,不过实现起来比较麻烦,那部分过几天再发。
package HttpClient;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;

public class HttpClient {
public static void main(String[] args) throws IOException {
//其实根据专辑会更简单,不过暂时没想到好的办法
String album = “jay,范特西,八度空间,叶惠美,七里香,十一月的肖邦,依然范特西,我很忙,魔杰座,跨时代,惊叹号,十二新作,哎吆不错哦,周杰伦的床边故事”;
String[] arr = new String[]{“可爱女人”, “完美主义”,“星晴”,“娘子”,“斗牛”,
“黑色幽默”, “龙卷风”, “反方向的钟”, “伊斯坦堡”, “印第安老斑鸠”,“爱在西元前”,
“爸我回来了”,
“简单爱”,
“忍者”,
“开不了口”,
“上海一九四三”,
“对不起”,
“威廉古堡”,
“双截棍”,
“安静”,“半兽人”, “半岛铁盒”, “暗号”, “龙拳”, “火车叨位去”,
“分裂”, “爷爷泡的茶”, “回到过去”, “米兰的小铁匠” ,“最后的战役”,
“以父之名”, “懦夫”, “晴天”, “三年二班”, “东风破”, “你听得到”,
“同一种调调”, “她的睫毛”, “爱情悬崖” ,“梯田”,“双刀”,
“我的地盘”, “七里香”, “借口”, “外婆”, “将军”,
“搁浅”, “乱舞春秋”, “困兽之斗”, “园游会”,“止战之殇”,
“夜曲”, “蓝色风暴”, “发如雪”, “黑色毛衣”, “四面楚歌”, “枫”,
“浪漫手机”, “逆鳞”, “麦芽糖”, “珊瑚海”, “飘移”, “一路向北”,
“夜的第七章”, “听妈妈的话”, “千里之外”, “本草纲目”, “退后”,
“红模仿”, “白色风车”, “迷迭香”, “菊花台”, “心雨”,
“牛仔很忙”, “无双”, “青花瓷”, “阳光宅男”, “蒲公英的约定”,
“我不配”, “扯”, “甜甜的”, “最长的电影”, “彩虹”,
“龙战骑士”, “蛇舞”, “花海”, “魔术先生”, “说好的幸福呢”, “兰亭序”,
“时光机”, “乔克叔叔”, “稻香”, “流浪诗人”, “给我一首歌的时间”,
“超人不会飞”, “跨时代”, “说了再见”,“烟花易冷”, “好久不见” ,“雨下一整晚”,
“嘻哈空姐”, “我落泪·情绪零碎”, “自导自演”, “爱的飞行日记”, “免费教学录影带”,
“惊叹号”, “迷魂曲”, “MineMine” ,“公主病”, “你好吗”, “疗伤烧肉粽”,
“琴伤”, “水手怕水”, “世界未末日”, “皮影戏”, “超跑女神”,
“四季列车”, “手语”, “公公偏头痛”, “明明就”, “傻笑”, “比较大的大提琴”,
“爱你没差”, “红尘客栈”, “梦想启动”, “大笨钟”, “哪里都是你”, “乌克丽丽”,
“阳明山”,
“窃爱”,
“算什么男人”,
“天涯过客”,
“怎么了”,
“一口气全念对”,
“我要夏天”,
“手写的从前”,
“鞋子特大号”,
“听爸爸的话”,
“美人鱼”,
“听见下雨的声音”,
“床边故事”,
“说走就走”,
“一点点”,
“前世情人”,
“英雄”,
“不该”,
“土耳其冰淇淋”,
“告白气球”,
“NowYouSeeMe”,
“爱情废柴”};

System.out.println(arr[0]);
for (int i = 0; i <=150; i++) {
String indexUrl;
if(arr[i].equals("七里香")){ indexUrl = "https://www.baidu.com/s?wd="+"周杰伦"+arr[i]+"歌曲百度百科";}else{
indexUrl = "https://www.baidu.com/s?wd="+"周杰伦"+arr[i]+"百度百科";}
//https://www.bilibili.com/video/av1415480?from=search&seid=12486960560568652116
CloseableHttpClient httpClient = HttpClients.createDefault();
HttpGet httpGet = new HttpGet(indexUrl);

httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36");
CloseableHttpResponse closeableHttpResponse = httpClient.execute(httpGet);
int statusCode = closeableHttpResponse.getStatusLine().getStatusCode();
if (statusCode == 200) {
String html = EntityUtils.toString(closeableHttpResponse.getEntity(), "UTF-8");
// Document document = Jsoup.connect(indexUrl).get();
Document document = Jsoup.parse(html);
System.out.println("111");
//result-op c-container xpath-log
Elements select2 = document.select("div[id=1]");
//包含“t“字段
Elements select1 = select2.select("h3[class~=^t]");
Elements select3 = select1.select("a");
String url1 = select3.attr("href");
System.out.println(url1);

httpGet = new HttpGet(url1);
closeableHttpResponse = httpClient.execute(httpGet);
html = EntityUtils.toString(closeableHttpResponse.getEntity(), "UTF-8");
//entry-item
Document document1 = Jsoup.parse(html);
Elements select4 = document1.select("div[class=para]");
System.out.println("24");
/* Elements select4 = select.select("a[class=link-inner]");
System.out.println("24");
System.out.println(select4.attr("href"));*/
//文件存储位置
File file = new File("D:\\ketangggg\\项目\\jielun\\" + arr[i] + ".txt");
//建立数据的输出通道
// FileInputStream fis=new FileInputStream(file);
// OutputStreamWriter osw=new OutputStreamWriter(fis, UTF8Reader);
//FileWriter fileWriter = new FileWriter(file,true);
//建立缓冲输出流对象
OutputStreamWriter fileWriter = new OutputStreamWriter(new FileOutputStream(file), "UTF-8");
BufferedWriter bufferedWriter = new BufferedWriter(fileWriter);
//关闭资源

for (Element element : select4) {
String text = element.text();
System.out.println(text);
if (text.length() <= 18) {
System.out.println(text);

bufferedWriter.write(text);
bufferedWriter.write("\r\n");
}
bufferedWriter.flush();
}
bufferedWriter.close();
}
closeableHttpResponse.close();
httpClient.close();
}
}

}
接下来看效果


随便看一个爱在西元前

细节上如有不足之处,欢迎指出

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: