
Scraping Baidu Music with Python (Part 2): Saving the Data to MySQL

2017-08-20 09:46
The previous post only scraped the data from the page and printed it out, but in real applications data needs to be persisted. It can be stored in a relational database such as MySQL, or in a NoSQL database such as MongoDB.

This post saves the data into a MySQL database.

The module used is pymysql; a usage guide is available at http://www.runoob.com/python3/python3-mysql.html
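The spider assumes a MySQL database named music with a table orgmusic already in place. The original post does not show the table schema, so the column types in the sketch below are assumptions; only the column names come from the INSERT statement used later. A minimal setup script:

# coding=utf-8
import pymysql

# Create the orgmusic table if it does not exist yet. Column types are
# assumptions; only the column names are taken from this post's INSERT.
conn = pymysql.connect(host='localhost', port=3306, user='root',
                       passwd='your_password', db='music', charset='utf8')
cursor = conn.cursor()
cursor.execute('''
    CREATE TABLE IF NOT EXISTS orgmusic (
        m_id      VARCHAR(32)  PRIMARY KEY,
        m_name    VARCHAR(255),
        m_link    VARCHAR(512),
        m_type    VARCHAR(64),
        m_singer  VARCHAR(255),
        m_album   VARCHAR(255),
        m_click   INT,
        m_collect INT
    ) DEFAULT CHARSET=utf8
''')
conn.commit()
cursor.close()
conn.close()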

The database-related code of this program:

# Save one music record to the database
def savaMusicToDB(m_id, m_name, m_link, m_type, m_singer, m_album, m_click, m_collect):
    print 'savaMusicToDB start'
    DBConnection = getDBConnection()
    print 'dbconnection=' + str(DBConnection)
    # Create a cursor
    cursor = DBConnection.cursor()
    # Parameterized INSERT: the %s placeholders are filled in by pymysql
    sql = 'insert into orgmusic(m_id,m_name,m_link,m_type,m_singer,m_album,m_click,m_collect) values(%s,%s,%s,%s,%s,%s,%s,%s)'
    cursor.execute(sql, (m_id, m_name, m_link, m_type, m_singer, m_album, m_click, m_collect))
    DBConnection.commit()
    closeDBConnection(DBConnection, cursor)

# Open a database connection
def getDBConnection():
    print 'getDBConnection start'
    host = '182.254.220.188'
    port = 3306
    user = 'root'
    password = 'ldy123456'
    db = 'music'
    charset = 'utf8'
    # Connect to MySQL
    DBConnection = pymysql.connect(host=host, port=port, user=user, passwd=password, db=db, charset=charset)
    return DBConnection

# Close the cursor and the database connection.
# (The original defined two functions named closeDBConnection; Python has
# no overloading, so the second definition silently replaced the first.
# Only the two-argument version, the one actually called, is kept here.)
def closeDBConnection(DBConnection, cursor):
    cursor.close()
    DBConnection.close()
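A quick usage sketch with made-up values (assuming the functions above are defined and the orgmusic table exists). Note that cursor.execute(sql, params) passes the values as query parameters instead of concatenating them into the SQL string, so pymysql escapes them and scraped text cannot inject SQL:

savaMusicToDB('2017082009460042', u'Some Song',
              'http://music.baidu.com/song/123456', u'Pop',
              u'Some Singer', u'Some Album', 0, 0)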


The complete spider code:

# coding=utf-8  # set the source file encoding
# Scrape Baidu Music

import urllib2
from bs4 import BeautifulSoup
import pymysql
import datetime
import random

# Root URL of Baidu Music
baiduMusicRootURL = 'http://music.baidu.com'
# Base URL of the Baidu Music category pages
baiduMusicTagURL = 'http://music.baidu.com/tag'

# Get the music category tags
def getMusicTags(musicTagURL):
    print 'getMusicTag=' + musicTagURL
    musicTags = {}
    htmlContent = getHTML(musicTagURL)
    print 'getMusicTags=' + htmlContent
    # Parse the page and pull out the category tags
    soup = BeautifulSoup(htmlContent, 'lxml')
    Tags = soup.find_all('span', 'tag-list clearfix')
    #print Tags
    for tag in Tags:
        # Text of the link
        tagName = tag.get_text()
        # URL of the link
        aSoup = BeautifulSoup(str(tag), 'lxml')
        a = aSoup.select_one('a[href]')
        tagLink = a.get('href')
        # Store tagName as the key and tagLink as the value
        musicTags[tagName] = tagLink
    return musicTags
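# Example shape of the dict returned by getMusicTags (the actual
# category names come from the live page; these are illustrative only):
#   {u'流行': '/tag/流行', u'摇滚': '/tag/摇滚', ...}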

# Fetch a web page
def getHTML(musicTagURL):
    print 'getHTML= ' + musicTagURL
    headers = {}
    request = urllib2.Request(musicTagURL, headers=headers)
    response = urllib2.urlopen(request)
    htmlContent = response.read()
    return htmlContent
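# Optional: the headers dict above is empty, and some servers reject
# requests that carry no browser User-Agent. A hedged variant (the UA
# string below is just an example, not taken from the original post):
def getHTMLWithUA(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    request = urllib2.Request(url, headers=headers)
    return urllib2.urlopen(request).read()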

# Get every song under one category
def getAllMusic(sourceURL):
    print 'getAllMusic start sourceURL=' + sourceURL
    noData = '#'
    try:
        # Work out how many pages there are
        size = 20
        sURL = sourceURL + '?start=0&size=20&third_type=0'
        htmlContent = getHTML(sURL)
        soup = BeautifulSoup(htmlContent, 'lxml')
        aLists = soup.find_all('div', 'page-inner')
        if aLists:
            aSoup = BeautifulSoup(str(aLists), 'lxml')
            pageNumberLists = aSoup.find_all('a')
            #print pageNumberLists
            if pageNumberLists:
                # The second-to-last <a> in the pager holds the last page number
                aStr = pageNumberLists[len(pageNumberLists) - 2]
                #print 'aStr=' + str(aStr)
                pageASoup = BeautifulSoup(str(aStr), 'lxml')
                pageNumber = int(pageASoup.find('a').get_text())
            else:
                pageNumber = 0
        else:
            pageNumber = 0
        print 'pageNumber=' + str(pageNumber)
        # Fetch every song of this category
        count = 0
        for i in range(0, pageNumber + 1):
            print 'i=' + str(i)
            sURL = ''
            try:
                sURL = sourceURL + '?start=' + str(count) + '&size=20&third_type=0'
                print 'sURL=' + sURL
                # Fetch the html of this page
                htmlContent = getHTML(sURL)
                # Parse the page
                soup = BeautifulSoup(htmlContent, 'lxml')
                # Get the song category
                m_type = soup.find('span', 'title').get_text()
                print 'm_type=' + m_type
                # Get the song list
                musicList = soup.find('div', 'main-body-cont')
                #print 'musicListSoup=' + str(musicList)
                musicListSoup = BeautifulSoup(str(musicList), 'lxml')
                musicsLists = musicListSoup.find_all('div', 'song-item')
                print 'musicsLists=' + str(musicsLists)
                print 'musicsLists len=' + str(len(musicsLists))
                for music in musicsLists:
                    #print 'music=' + str(music)
                    # Generate an id for this record
                    m_id = setMusicID()
                    print 'm_id=' + m_id
                    musicSoup = BeautifulSoup(str(music), 'lxml')
                    # Get the song title and link
                    spanStr = musicSoup.find('span', 'song-title')
                    spanSoup = BeautifulSoup(str(spanStr), 'lxml')
                    # Song title
                    m_name = spanSoup.find('a').get_text()
                    if not m_name:
                        m_name = noData
                    print 'm_name=' + m_name
                    # Song link
                    m_link = baiduMusicRootURL + spanSoup.select_one('a[href]').get('href')
                    if not m_link:
                        m_link = noData
                    print 'm_link=' + m_link
                    # Singer name
                    m_singer = musicSoup.find('span', 'author_list').get_text()
                    # The first character is '\n'
                    if m_singer and len(m_singer) > 1:
                        nStr = m_singer[0:1]
                        if nStr == '\n':
                            m_singer = m_singer[1:len(m_singer)]
                    else:
                        m_singer = noData
                    print 'm_singer len=' + str(len(m_singer))
                    print 'm_singer=' + m_singer
                    # Album title
                    m_album = musicSoup.find('span', 'album-title').get_text()
                    if not m_album:
                        m_album = '#'
                    print 'm_album=' + m_album
                    # Play count (placeholder)
                    m_click = 0
                    # Favorite count (placeholder)
                    m_collect = 0
                    # Save the record to the database
                    savaMusicToDB(m_id, m_name, m_link, m_type, m_singer, m_album, m_click, m_collect)

                count = count + 20
            except:
                pass
    except:
        pass
# Generate a music id (timestamp + random number)
def setMusicID():
    print 'setMusicID start'
    nowTime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    randomNum = random.randint(0, 1000)
    m_id = str(nowTime) + str(randomNum)
    return m_id
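# Note: the timestamp + random-number id above can collide when many
# rows are inserted within the same second. A sketch of an alternative
# using the standard uuid module (an assumption, not part of the
# original post):
import uuid

def setMusicIDByUUID():
    # uuid4().hex is a 32-character hex string, unique in practice
    return uuid.uuid4().hex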

# Save one music record to the database
def savaMusicToDB(m_id, m_name, m_link, m_type, m_singer, m_album, m_click, m_collect):
    print 'savaMusicToDB start'
    DBConnection = getDBConnection()
    print 'dbconnection=' + str(DBConnection)
    # Create a cursor
    cursor = DBConnection.cursor()
    # Parameterized INSERT: the %s placeholders are filled in by pymysql
    sql = 'insert into orgmusic(m_id,m_name,m_link,m_type,m_singer,m_album,m_click,m_collect) values(%s,%s,%s,%s,%s,%s,%s,%s)'
    cursor.execute(sql, (m_id, m_name, m_link, m_type, m_singer, m_album, m_click, m_collect))
    DBConnection.commit()
    closeDBConnection(DBConnection, cursor)

# Open a database connection
def getDBConnection():
    print 'getDBConnection start'
    host = '182.254.220.188'
    port = 3306
    user = 'root'
    password = 'ldy123456'
    db = 'music'
    charset = 'utf8'
    # Connect to MySQL
    DBConnection = pymysql.connect(host=host, port=port, user=user, passwd=password, db=db, charset=charset)
    return DBConnection

# Close the cursor and the database connection
# (only the two-argument version is kept; Python has no overloading,
# so a second def with the same name would shadow this one)
def closeDBConnection(DBConnection, cursor):
    cursor.close()
    DBConnection.close()

# Main program
if __name__ == '__main__':
    print 'Music Spider start'

    # Get the Baidu Music category tags
    musicTags = getMusicTags(baiduMusicTagURL)
    print musicTags
    # Crawl the music category by category
    for k, v in musicTags.items():
        print 'k=' + k
        print 'v=' + str(v)
        httpStr = str(v)[0:7]
        if httpStr == 'http://':
            sourceURL = str(v)
        else:
            sourceURL = baiduMusicRootURL + str(v)
        print 'sourceURL=' + sourceURL
        # Fetch the songs
        getAllMusic(sourceURL)
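A final caveat about the code above: the bare "except: pass" blocks in getAllMusic silently swallow every error, so a broken selector or a network failure just looks like missing data. A small sketch of a friendlier pattern (illustrative, not part of the original post):

import traceback

try:
    getAllMusic(sourceURL)
except Exception:
    # Print the full stack trace instead of discarding the error,
    # then continue with the next category
    traceback.print_exc()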


If you don't like it, please don't flame!!!
Tags: python, web crawler