您的位置:首页 > 其它

[Clojure] Data Collection and Data Analysis on the music of www.xiami.com - Part 5

2014-09-14 10:46 531 查看
Source code

https://github.com/bluesilence/Lisp/tree/master/clojure/projects/xiami-crawler

Data Cooking

For simplicity, I chose SQL to analyse the data. The raw data in .log has to be structured before that.

.log(csv) -> Excel -> SQL Server

3 tables are built: Songs, Albums, Artists.

2 table is created: Genre, GenrePreference. The GenrePreference table ismanually modified to record my flavor on each genre. However, this table is obsolete since the latest data combines multiple genres of an album into
1 string...

INSERT INTO [xiami].[dbo].[Genre]
(GenreName)
SELECT DISTINCT
AlbumGenre
FROM [xiami].[dbo].[Albums$]

INSERT INTO [xiami].[dbo].[GenrePreference]
(UserId, GenreId, Preference)
SELECT
1 AS UserId,
Id,
0 AS Preference
FROM [xiami].[dbo].[Genre]


2 functions are built:

GetAlbumPreferenceByGenre

-- =============================================
-- Author:	<Author,,Name>
-- Create date: <Create Date, ,>
-- Description: Obsolete since multiple genres are considered an independent genre
-- =============================================
CREATE FUNCTION [dbo].[GetAlbumPreferenceByGenre]
(
-- Add the parameters for the function here
@AlbumId decimal(18,4)
)
RETURNS int
AS
BEGIN
DECLARE @genres [nvarchar](255);
DECLARE @preference int = 0;

SELECT @genres = AlbumGenre
FROM [dbo].[Albums$]
WHERE AlbumId = @AlbumId;

SELECT @preference = MAX(Preference)
FROM
(
SELECT Item
FROM [dbo].[split] (@genres, ',')
) a
JOIN
(
SELECT GenreName, Preference
FROM [dbo].[GenrePreference]
) b
ON a.Item = b.GenreName;

RETURN @preference;
END


GetAlbumPreferenceByUser

-- =============================================
-- Author:	<Author,,Name>
-- Create date: <Create Date, ,>
-- Description:	Get Album Preference By UserId
-- =============================================
CREATE FUNCTION [dbo].[GetAlbumPreferenceByUser]
(
-- Add the parameters for the function here
@AlbumId decimal(18,4),
@UserId int
)
RETURNS int
AS
BEGIN

DECLARE @genre [nvarchar](512);

SELECT @genre = AlbumGenre
FROM [dbo].[Albums$]
WHERE AlbumId = @AlbumId;

DECLARE @genreId int;

SELECT @genreId = Id
FROM [dbo].[Genre]
WHERE GenreName = @genre;

DECLARE @preference int = 0;

SELECT @preference = Preference
FROM [dbo].[UserGenrePreference]
WHERE UserId = @UserId
AND GenreId = @genreId

RETURN @preference;
END


Then we can base on the tabular tables to do some primary queries.

Data Analysis

Q: Which artist has the max genres?

SELECT ArtistName, COUNT(DISTINCT AlbumGenre) AS Genres
FROM
(
SELECT ArtistId, ArtistName
FROM dbo.Artists$
) artists
JOIN
(
SELECT AlbumId, AlbumName, AlbumGenre, ArtistId
FROM dbo.Albums$
WHERE AlbumGenre != 'N/A'
) albums
ON artists.ArtistId = albums.ArtistId
GROUP BY ArtistName
ORDER BY Genres DESC, ArtistName


Q: Which artist has the most number of albums?

SELECT ArtistName, COUNT(AlbumId) AS Albums
FROM
(
SELECT ArtistId, ArtistName
FROM dbo.Artists$
GROUP BY ArtistId, ArtistName
) artists
JOIN
(
SELECT AlbumId, AlbumName, ArtistId
FROM dbo.Albums$
WHERE AlbumGenre != 'N/A'
GROUP BY AlbumId, AlbumName, ArtistId
) albums
ON artists.ArtistId = albums.ArtistId
GROUP BY ArtistName
ORDER BY Albums DESC, ArtistName


Q: Hotness of artists?

SELECT a.ArtistId, ArtistName, SUM(Hotness) AS Hotness
FROM
(
SELECT ArtistId, ArtistName
FROM dbo.Artists$
) a
JOIN
(
SELECT ArtistId, albums.AlbumId AS AlbumId, AlbumName, Hotness
FROM
(
SELECT AlbumId, AlbumName, ArtistId
FROM dbo.Albums$
GROUP BY AlbumId, AlbumName, ArtistId
) albums
JOIN
(
SELECT SUM(SongHot) AS Hotness, AlbumId
FROM dbo.Songs$
GROUP BY AlbumId
) songs
ON albums.AlbumId = songs.AlbumId
) b
ON a.ArtistId = b.ArtistId
GROUP BY a.ArtistId, ArtistName
ORDER BY Hotness DESC, ArtistName


Q: Which artist has the max hotness per album?

SELECT
artistHotness.ArtistId,
artistHotness.ArtistName,
Hotness * 1.0 / Albums AS HotnessPerAlbum
FROM
(
SELECT artists.ArtistId, COUNT(AlbumId) AS Albums
FROM
(
SELECT ArtistId, ArtistName
FROM dbo.Artists$
GROUP BY ArtistId, ArtistName
) artists
JOIN
(
SELECT AlbumId, AlbumName, ArtistId
FROM dbo.Albums$
WHERE AlbumGenre != 'N/A'
GROUP BY AlbumId, AlbumName, ArtistId
) albums
ON artists.ArtistId = albums.ArtistId
GROUP BY artists.ArtistId
)
albumCount
JOIN
(
SELECT a.ArtistId, ArtistName, SUM(Hotness) AS Hotness
FROM
(
SELECT ArtistId, ArtistName
FROM dbo.Artists$
) a
JOIN
(
SELECT ArtistId, albums.AlbumId AS AlbumId, AlbumName, Hotness
FROM
(
SELECT AlbumId, AlbumName, ArtistId
FROM dbo.Albums$
GROUP BY AlbumId, AlbumName, ArtistId
) albums
JOIN
(
SELECT SUM(SongHot) AS Hotness, AlbumId
FROM dbo.Songs$
GROUP BY AlbumId
) songs
ON albums.AlbumId = songs.AlbumId
) b
ON a.ArtistId = b.ArtistId
GROUP BY a.ArtistId, ArtistName
)
artistHotness
ON artistHotness.ArtistId = albumCount.ArtistId
ORDER BY HotnessPerAlbum DESC


Note:

Because there
are records at different DataTime,
we need to use GROUP BY to de-dup before JOIN.

Q: Which album makes the biggest comment/collected ratio?

SELECT
*,
AlbumComments * 1.0 / AlbumCollected AS Ratio
FROM [xiami].[dbo].[Albums$]
WHERE AlbumCollected != 0
AND AlbumValue > 9
AND AlbumCollected > 300
ORDER BY Ratio DESC


Q: Which album has the highest hot/collected conversion ratio?

SELECT
a.AlbumId AS AlbumId,
a.AlbumName AS AlbumName,
a.AlbumValue AS AlbumValue,
a.AlbumComments AS AlbumComments,
a.AlbumCollected AS AlbumCollected,
SUM(b.SongHot) AS AlbumHot,
a.AlbumCollected * 1.0 / SUM(b.SongHot) AS ConvertRatio
FROM
(
SELECT
*
FROM [xiami].[dbo].[Albums$]
) a
JOIN
(
SELECT *
FROM [xiami].[dbo].[Songs$]
) b
ON a.AlbumId = b.AlbumId
WHERE SongHot > 0
AND AlbumValue > 0
GROUP BY a.AlbumId, AlbumName, AlbumValue, AlbumComments, AlbumCollected
ORDER BY ConvertRatio DESC


Q: Which genre of music is the most popular?

By SongHot:

SELECT
a.AlbumGenre AS AlbumGenre,
SUM(b.SongHot) AS AlbumHot
FROM
(
SELECT
*
FROM [xiami].[dbo].[Albums$]
) a
JOIN
(
SELECT *
FROM [xiami].[dbo].[Songs$]
) b
ON a.AlbumId = b.AlbumId
WHERE AlbumValue > 0
AND AlbumGenre != 'N/A'
GROUP BY AlbumGenre
ORDER BY AlbumHot DESC


By AlbumCollected:

SELECT
AlbumGenre,
SUM(AlbumCollected) AS Collected
FROM [xiami].[dbo].[Albums$]
WHERE AlbumValue > 0
AND AlbumGenre != 'N/A'
GROUP BY AlbumGenre
ORDER BY Collected DESC


By AlbumComments:

SELECT
AlbumGenre,
SUM(AlbumComments) AS Comments
FROM [xiami].[dbo].[Albums$]
WHERE AlbumValue > 0
AND AlbumGenre != 'N/A'
GROUP BY AlbumGenre
ORDER BY Comments DESC


Q: Get the top 10 representative songs for each genre.

SELECT TOP 50
SongName, ArtistName
FROM
(
SELECT SongId, SongName, SongHot, ArtistId, AlbumGenre, AlbumValue, AlbumCollected, AlbumComments
FROM
(SELECT
SongId,
SongName,
SongHot,
songs.AlbumId AS AlbumId,
albums.ArtistId AS ArtistId,
AlbumGenre,
AlbumValue,
AlbumCollected,
AlbumComments,
Rank() OVER (PARTITION BY AlbumGenre ORDER BY SongHot DESC, AlbumCollected DESC, AlbumValue DESC, AlbumComments DESC) AS r
FROM
(
(
SELECT
SongId,
SongName,
SongHot,
AlbumId
FROM [xiami].[dbo].[Songs$]
) songs
JOIN
(
SELECT
AlbumId,
AlbumGenre,
AlbumValue,
AlbumCollected,
AlbumComments,
ArtistId
FROM [xiami].[dbo].[Albums$]
WHERE AlbumValue > 0
AND AlbumGenre != 'N/A'
) albums
ON songs.AlbumId = albums.AlbumId
)
) results
WHERE r <= 10
) tmp
JOIN
(
SELECT *
FROM [xiami].[dbo].[Artists$]
) artists
ON tmp.ArtistId = artists.ArtistId


Q: Which song is my taste?

A: Save query
results as tsv,
then it can be imported into xiami collection.

Xiami collection
only allows a maximum of 50 songs.

SELECT SongName, ArtistName, Preference
FROM
(
SELECT
SongId,
SongName,
SongHot,
a.AlbumId AS AlbumId,
AlbumName,
AlbumGenre,
c.ArtistName AS ArtistName,
AlbumValue,
AlbumComments,
AlbumCollected,
[dbo].[GetAlbumPreferenceByUser] (a.AlbumId, 1) AS Preference
FROM
(
SELECT *
FROM [xiami].[dbo].[Songs$]
) a
JOIN
(
SELECT *
FROM [xiami].[dbo].[Albums$]
WHERE AlbumValue > 9.3
AND AlbumCollected > 300
AND AlbumComments > 100
AND ((AlbumComments * 1.0) / AlbumCollected) > 0.05
) b
ON a.AlbumId = b.AlbumId
JOIN
(
SELECT *
FROM [xiami].[dbo].[Artists$]
) c
ON b.ArtistId = c.ArtistId
) final
WHERE Preference > 0
ORDER BY Preference DESC


Note:

The constants of the queries above are arbitrary.

Just out of my preference : P
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐