简单爬取百度百科周杰伦所有歌的歌词
这里写自定义目录标题
这只是学完爬虫之后的一个简单练习。其实还有更简洁的实现方式(例如按专辑爬取),只是整理起来比较麻烦,更简洁的版本过几天再发。
package HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
public class HttpClient {
public static void main(String[] args) throws IOException {
//其实根据专辑会更简单,不过暂时没想到好的办法
String album = “jay,范特西,八度空间,叶惠美,七里香,十一月的肖邦,依然范特西,我很忙,魔杰座,跨时代,惊叹号,十二新作,哎吆不错哦,周杰伦的床边故事”;
String[] arr = new String[]{“可爱女人”, “完美主义”,“星晴”,“娘子”,“斗牛”,
“黑色幽默”, “龙卷风”, “反方向的钟”, “伊斯坦堡”, “印第安老斑鸠”,“爱在西元前”,
“爸我回来了”,
“简单爱”,
“忍者”,
“开不了口”,
“上海一九四三”,
“对不起”,
“威廉古堡”,
“双截棍”,
“安静”,“半兽人”, “半岛铁盒”, “暗号”, “龙拳”, “火车叨位去”,
“分裂”, “爷爷泡的茶”, “回到过去”, “米兰的小铁匠” ,“最后的战役”,
“以父之名”, “懦夫”, “晴天”, “三年二班”, “东风破”, “你听得到”,
“同一种调调”, “她的睫毛”, “爱情悬崖” ,“梯田”,“双刀”,
“我的地盘”, “七里香”, “借口”, “外婆”, “将军”,
“搁浅”, “乱舞春秋”, “困兽之斗”, “园游会”,“止战之殇”,
“夜曲”, “蓝色风暴”, “发如雪”, “黑色毛衣”, “四面楚歌”, “枫”,
“浪漫手机”, “逆鳞”, “麦芽糖”, “珊瑚海”, “飘移”, “一路向北”,
“夜的第七章”, “听妈妈的话”, “千里之外”, “本草纲目”, “退后”,
“红模仿”, “白色风车”, “迷迭香”, “菊花台”, “心雨”,
“牛仔很忙”, “无双”, “青花瓷”, “阳光宅男”, “蒲公英的约定”,
“我不配”, “扯”, “甜甜的”, “最长的电影”, “彩虹”,
“龙战骑士”, “蛇舞”, “花海”, “魔术先生”, “说好的幸福呢”, “兰亭序”,
“时光机”, “乔克叔叔”, “稻香”, “流浪诗人”, “给我一首歌的时间”,
“超人不会飞”, “跨时代”, “说了再见”,“烟花易冷”, “好久不见” ,“雨下一整晚”,
“嘻哈空姐”, “我落泪·情绪零碎”, “自导自演”, “爱的飞行日记”, “免费教学录影带”,
“惊叹号”, “迷魂曲”, “MineMine” ,“公主病”, “你好吗”, “疗伤烧肉粽”,
“琴伤”, “水手怕水”, “世界未末日”, “皮影戏”, “超跑女神”,
“四季列车”, “手语”, “公公偏头痛”, “明明就”, “傻笑”, “比较大的大提琴”,
“爱你没差”, “红尘客栈”, “梦想启动”, “大笨钟”, “哪里都是你”, “乌克丽丽”,
“阳明山”,
“窃爱”,
“算什么男人”,
“天涯过客”,
“怎么了”,
“一口气全念对”,
“我要夏天”,
“手写的从前”,
“鞋子特大号”,
“听爸爸的话”,
“美人鱼”,
“听见下雨的声音”,
“床边故事”,
“说走就走”,
“一点点”,
“前世情人”,
“英雄”,
“不该”,
“土耳其冰淇淋”,
“告白气球”,
“NowYouSeeMe”,
“爱情废柴”};
System.out.println(arr[0]); for (int i = 0; i <=150; i++) { String indexUrl; if(arr[i].equals("七里香")){ indexUrl = "https://www.baidu.com/s?wd="+"周杰伦"+arr[i]+"歌曲百度百科";}else{ indexUrl = "https://www.baidu.com/s?wd="+"周杰伦"+arr[i]+"百度百科";} //https://www.bilibili.com/video/av1415480?from=search&seid=12486960560568652116 CloseableHttpClient httpClient = HttpClients.createDefault(); HttpGet httpGet = new HttpGet(indexUrl); httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"); CloseableHttpResponse closeableHttpResponse = httpClient.execute(httpGet); int statusCode = closeableHttpResponse.getStatusLine().getStatusCode(); if (statusCode == 200) { String html = EntityUtils.toString(closeableHttpResponse.getEntity(), "UTF-8"); // Document document = Jsoup.connect(indexUrl).get(); Document document = Jsoup.parse(html); System.out.println("111"); //result-op c-container xpath-log Elements select2 = document.select("div[id=1]"); //包含“t“字段 Elements select1 = select2.select("h3[class~=^t]"); Elements select3 = select1.select("a"); String url1 = select3.attr("href"); System.out.println(url1); httpGet = new HttpGet(url1); closeableHttpResponse = httpClient.execute(httpGet); html = EntityUtils.toString(closeableHttpResponse.getEntity(), "UTF-8"); //entry-item Document document1 = Jsoup.parse(html); Elements select4 = document1.select("div[class=para]"); System.out.println("24"); /* Elements select4 = select.select("a[class=link-inner]"); System.out.println("24"); System.out.println(select4.attr("href"));*/ //文件存储位置 File file = new File("D:\\ketangggg\\项目\\jielun\\" + arr[i] + ".txt"); //建立数据的输出通道 // FileInputStream fis=new FileInputStream(file); // OutputStreamWriter osw=new OutputStreamWriter(fis, UTF8Reader); //FileWriter fileWriter = new FileWriter(file,true); //建立缓冲输出流对象 OutputStreamWriter fileWriter = new OutputStreamWriter(new FileOutputStream(file), "UTF-8"); BufferedWriter 
bufferedWriter = new BufferedWriter(fileWriter); //关闭资源 for (Element element : select4) { String text = element.text(); System.out.println(text); if (text.length() <= 18) { System.out.println(text); bufferedWriter.write(text); bufferedWriter.write("\r\n"); } bufferedWriter.flush(); } bufferedWriter.close(); } closeableHttpResponse.close(); httpClient.close(); } }
}
接下来看效果
随便看一个爱在西元前
细节上如有不足之处,欢迎指出。
- 用python写一个爬取周杰伦所有歌词的爬虫
- 十六进制内所有进制转换!!!!速度快,运算简单,java
- Java 简单高效处理字符串-删除所有标点
- 对“求数组中所有和为某固定数的所有数对”的算法的简单思考
- IntelliJ IDEA打开Maven项目,Spring boot所有依赖红名,不可用,简单解决方案
- android初步学习时所有简单例子整合到一个ListView上
- 简单方式打印数组所有元素
- SYSTEM32 下的几乎所有文件的简单说明
- SYSTEM32 下的几乎所有文件的简单说明
- java获取request所有参数简单方法
- Python爬虫网易云歌词及词频统计--(周杰伦top50)
- 简单的3个SQL视图搞定所有SqlServer数据库字典
- 简单T-Sql备份所有数据库
- 对于一般的数组类型,简单使用TYPE_MAIN_VARIANT无法将所有的qualifiers去掉?
- 爬取百度百科上中国所有城市的信息
- [导入][转]简单的3个SQL视图搞定所有SqlServer数据库字典
- Android 超简单音乐播放器(十)歌词的实现
- 简单实现打印输出某目录的所有文件
- [数据库字典]简单的3个SQL视图搞定所有SqlServer数据库字典
- Python简单爬虫爬取自己博客园所有文章