您的位置:首页 > 编程语言 > Python开发

Python数据分析与机器学习-足球赛事数据集

2018-02-03 13:50 507 查看
import pandas as pd
import numpy as np

pd.set_option('display.height', 9999)
pd.set_option('display.max_rows', 9999)
pd.set_option('display.max_columns', 9999)
pd.set_option('display.width', 9999)

df = pd.read_csv('redcard.csv.gz', compression='gzip')
# print(df.shape) #(146028, 28)

# print(df.describe().T)  # 粗略看数据描述  T转置
# print(df.dtypes)  # 看数据类型 object=字符类型
# all_columns =df.columns.tolist()
# print(all_columns)
'''
['playerShort', 'player', 'club', 'leagueCountry', 'birthday', 'height', 'weight', 'position', 'games',
'victories', 'ties', 'defeats', 'goals', 'yellowCards', 'yellowReds', 'redCards', 'photoID', 'rater1', 'rater2',
'refNum', 'refCountry', 'Alpha_3', 'meanIAT', 'nIAT', 'seIAT', 'meanExp', 'nExp', 'seExp']
'''
# print(df['height'].mean())
# print(df['weight'].mean())
# print(df.groupby(by='playerShort')['height'].mean())
# print(np.mean(df.groupby(by='playerShort')['height'].mean()))

'''球员数据集'''
player_index = 'playerShort'
player_cols = [  # 'player', # drop player name, we have unique identifier
'birthday',
'height',
'weight',
'position',
'photoID',
'rater1',
'rater2',
]
all_cols_unique_players = df.groupby(player_index).agg({col: 'nunique' for col in player_cols})

# print(all_cols_unique_players)

# 判断数据集的key值是否有重复
def get_subgroup(dataframe, g_index, g_columns):
"""Helper function that creates a sub-table from the columns and runs a quick uniqueness test."""
g = dataframe.groupby(g_index).agg({col: 'nunique' for col in g_columns})
if g[g > 1].dropna().shape[0] != 0:
print("Warning: you probably assumed this had all unique values but it doesn't.")
return dataframe.groupby(g_index).agg({col: 'max' for col in g_columns})

# players = get_subgroup(df, player_index, player_cols)
# print(players.head())

# 把处理好的数据存储
def save_subgroup(dataframe, g_index, subgroup_name, prefix='raw_'):
save_subgroup_filename = "".join([prefix, subgroup_name, ".csv.gz"])
dataframe.to_csv(save_subgroup_filename, compression='gzip', encoding='UTF-8')
test_df = pd.read_csv(save_subgroup_filename, compression='gzip', index_col=g_index, encoding='UTF-8')
# Test that we recover what we send in
if dataframe.equals(test_df):
print("Test-passed: we recover the equivalent subgroup dataframe.")
else:
print("Warning -- equivalence test!!! Double-check.")

# players = get_subgroup(df, player_index, player_cols)
# save_subgroup(players, player_index, "players")

# club_index = 'club'
# club_cols = ['leagueCountry']
# clubs = get_subgroup(df, club_index, club_cols)
# save_subgroup(clubs, club_index, "clubs", )

# country_index = 'refCountry'
# country_cols = ['Alpha_3',  # rename this name of country
#                 'meanIAT',
#                 'nIAT',
#                 'seIAT',
#                 'meanExp',
#                 'nExp',
#                 'seExp',
#                 ]
# countries = get_subgroup(df, country_index, country_cols)
# rename_columns = {'Alpha_3': 'countryName', }
# countries = countries.rename(columns=rename_columns)
# print(countries.head())
# save_subgroup(countries, country_index, "countries")

# 球员和裁判之间的关系
dyad_index = ['refNum', 'playerShort']
dyad_cols = ['games',
'victories',
'ties',
'defeats',
'goals',
'yellowCards',
'yellowReds',
'redCards',
]
dyads = get_subgroup(df, g_index=dyad_index, g_columns=dyad_cols)
save_subgroup(dyads, dyad_index, "dyads")

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

pd.set_option('display.height', 9999)
pd.set_option('display.max_rows', 9999)
pd.set_option('display.max_columns', 9999)
pd.set_option('display.width', 9999)

def save_subgroup(dataframe, g_index, subgroup_name, prefix='raw_'):
save_subgroup_filename = "".join([prefix, subgroup_name, ".csv.gz"])
dataframe.to_csv(save_subgroup_filename, compression='gzip', encoding='UTF-8')
test_df = pd.read_csv(save_subgroup_filename, compression='gzip', index_col=g_index, encoding='UTF-8')
# Test that we recover what we send in
if dataframe.equals(test_df):
print("Test-passed: we recover the equivalent subgroup dataframe.")
else:
print("Warning -- equivalence test!!! Double-check.")

def load_subgroup(filename, index_col=[0]):
return pd.read_csv(filename, compression='gzip', index_col=index_col, encoding='UTF-8')

players = load_subgroup("raw_players.csv.gz")

'''拿到数据先看数据的概况'''

# print(players.head())
'''
photoID              position  height  rater1    birthday  rater2  weight
playerShort
aaron-hughes              3868.jpg           Center Back   182.0    0.25  08.11.1979    0.00    71.0
aaron-hunt               20136.jpg  Attacking Midfielder   183.0    0.00  04.09.1986    0.25    73.0
aaron-lennon             13515.jpg      Right Midfielder   165.0    0.25  16.04.1987    0.25    63.0
aaron-ramsey             94953.jpg     Center Midfielder   178.0    0.00  26.12.1990    0.00    76.0
abdelhamid-el-kaoutari  124913.jpg           Center Back   180.0    0.25  17.03.1990    0.25    73.0
'''

import missingno as msno

# msno.matrix(players.sample(n=500))
# msno.heatmap(players.sample(n=500))
# msno.bar(players.sample(n=500))

# 看 rater1 rater2 两个人打分的规则
# print(pd.crosstab(players['rater1'], players['rater2']))

# fig, ax = plt.subplots()
# sns.heatmap(pd.crosstab(players['rater1'], players['rater2']), cmap='Blues')
# ax.set_title('Correlation between Rater 1 and Rater 2')
# fig.tight_layout()
# plt.show()

# modifying dataframe rater1 rater2平均值
players['skintone'] = players[['rater1', 'rater2']].mean(axis=1)
# players = players.dropna(axis=0)
# # sns.distplot(players['skintone'], kde=False)
# sns.distplot(players.skintone, kde=False)
# plt.show()

# 踢球位置可视化
# MIDSIZE = (12, 8)
# fig, ax = plt.subplots(figsize=MIDSIZE)
# players.position.value_counts(dropna=False, ascending=True).plot(kind='barh', ax=ax)
# ax.set_ylabel("Position")
# ax.set_xlabel("Counts")
# fig.tight_layout()
# plt.show()

# 自定义球场踢球位置
defense = ['Center Back', 'Defensive Midfielder', 'Left Fullback', 'Right Fullback', ]  # 后卫
midfield = ['Right Midfielder', 'Center Midfielder', 'Left Midfielder', ]  # 中场
forward = ['Attacking Midfielder', 'Left Winger', 'Right Winger', 'Center Forward']  # 前锋
keeper = 'Goalkeeper'  # 守门员

# modifying dataframe -- adding the aggregated position categorical position_agg
players.loc[players['position'].isin(defense), 'position_agg'] = "Defense"
players.loc[players['position'].isin(midfield), 'position_agg'] = "Midfield"
players.loc[players['position'].isin(forward), 'position_agg'] = "Forward"
players.loc[players['position'].eq(keeper), 'position_agg'] = "Keeper"
# print(players.head())
#
MIDSIZE = (12, 8)
# fig, ax = plt.subplots(figsize=MIDSIZE)
# players['position_agg'].value_counts(dropna=False, ascending=True).plot(kind='barh', ax=ax)
# ax.set_ylabel("position_agg")
# ax.set_xlabel("Counts")
# fig.tight_layout()
# plt.show()

'''多变量之间的关系'''
from pandas.plotting import scatter_matrix

# fig, ax = plt.subplots()
# scatter_matrix(players[['height', 'weight', 'skintone']])
# plt.show()

# fig, ax = plt.subplots(figsize=MIDSIZE)
# sns.regplot('weight', 'height', data=players, ax=ax)
# ax.set_ylabel("Height [cm]")
# ax.set_xlabel("Weight [kg]")
# fig.tight_layout()
# plt.show()

# 把连续值变为区间值
weight_categories = ["vlow_weight",
"low_weight",
"mid_weight",
"high_weight",
"vhigh_weight",
]
players['weightclass'] = pd.qcut(players['weight'], len(weight_categories), weight_categories)

height_categories = ["vlow_height",
"low_height",
"mid_height",
"high_height",
"vhigh_height",
]
players['heightclass'] = pd.qcut(players['height'], len(height_categories), height_categories)

# modifying dataframe
players['birth_date'] = pd.to_datetime(players['birthday'], format='%d.%m.%Y')
players['age_years'] = ((pd.to_datetime('2018-02-03') - players['birth_date']).dt.days) / 365.25
# print(players['age_years'])

players_cleaned_variables = players.columns.tolist()
# print(players_cleaned_variables)

players['skintoneclass'] = pd.qcut(players['skintone'], 3)

# 数据项属性选择
players_cleaned_variables = [
'height',
'weight',
'skintone',
'position_agg',
'weightclass',
'heightclass',
'skintoneclass',
'age_years']
players[players_cleaned_variables].to_csv("cleaned_players.csv.gz", compression='gzip')
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息