Python数据分析与机器学习-足球赛事数据集
2018-02-03 13:50
507 查看
import pandas as pd import numpy as np pd.set_option('display.height', 9999) pd.set_option('display.max_rows', 9999) pd.set_option('display.max_columns', 9999) pd.set_option('display.width', 9999) df = pd.read_csv('redcard.csv.gz', compression='gzip') # print(df.shape) #(146028, 28) # print(df.describe().T) # 粗略看数据描述 T转置 # print(df.dtypes) # 看数据类型 object=字符类型 # all_columns =df.columns.tolist() # print(all_columns) ''' ['playerShort', 'player', 'club', 'leagueCountry', 'birthday', 'height', 'weight', 'position', 'games', 'victories', 'ties', 'defeats', 'goals', 'yellowCards', 'yellowReds', 'redCards', 'photoID', 'rater1', 'rater2', 'refNum', 'refCountry', 'Alpha_3', 'meanIAT', 'nIAT', 'seIAT', 'meanExp', 'nExp', 'seExp'] ''' # print(df['height'].mean()) # print(df['weight'].mean()) # print(df.groupby(by='playerShort')['height'].mean()) # print(np.mean(df.groupby(by='playerShort')['height'].mean())) '''球员数据集''' player_index = 'playerShort' player_cols = [ # 'player', # drop player name, we have unique identifier 'birthday', 'height', 'weight', 'position', 'photoID', 'rater1', 'rater2', ] all_cols_unique_players = df.groupby(player_index).agg({col: 'nunique' for col in player_cols}) # print(all_cols_unique_players) # 判断数据集的key值是否有重复 def get_subgroup(dataframe, g_index, g_columns): """Helper function that creates a sub-table from the columns and runs a quick uniqueness test.""" g = dataframe.groupby(g_index).agg({col: 'nunique' for col in g_columns}) if g[g > 1].dropna().shape[0] != 0: print("Warning: you probably assumed this had all unique values but it doesn't.") return dataframe.groupby(g_index).agg({col: 'max' for col in g_columns}) # players = get_subgroup(df, player_index, player_cols) # print(players.head()) # 把处理好的数据存储 def save_subgroup(dataframe, g_index, subgroup_name, prefix='raw_'): save_subgroup_filename = "".join([prefix, subgroup_name, ".csv.gz"]) dataframe.to_csv(save_subgroup_filename, compression='gzip', encoding='UTF-8') test_df = pd.read_csv(save_subgroup_filename, compression='gzip', index_col=g_index, encoding='UTF-8') # Test that we recover what we send in if dataframe.equals(test_df): print("Test-passed: we recover the equivalent subgroup dataframe.") else: print("Warning -- equivalence test!!! Double-check.") # players = get_subgroup(df, player_index, player_cols) # save_subgroup(players, player_index, "players") # club_index = 'club' # club_cols = ['leagueCountry'] # clubs = get_subgroup(df, club_index, club_cols) # save_subgroup(clubs, club_index, "clubs", ) # country_index = 'refCountry' # country_cols = ['Alpha_3', # rename this name of country # 'meanIAT', # 'nIAT', # 'seIAT', # 'meanExp', # 'nExp', # 'seExp', # ] # countries = get_subgroup(df, country_index, country_cols) # rename_columns = {'Alpha_3': 'countryName', } # countries = countries.rename(columns=rename_columns) # print(countries.head()) # save_subgroup(countries, country_index, "countries") # 球员和裁判之间的关系 dyad_index = ['refNum', 'playerShort'] dyad_cols = ['games', 'victories', 'ties', 'defeats', 'goals', 'yellowCards', 'yellowReds', 'redCards', ] dyads = get_subgroup(df, g_index=dyad_index, g_columns=dyad_cols) save_subgroup(dyads, dyad_index, "dyads")
import pandas as pd from matplotlib import pyplot as plt import seaborn as sns pd.set_option('display.height', 9999) pd.set_option('display.max_rows', 9999) pd.set_option('display.max_columns', 9999) pd.set_option('display.width', 9999) def save_subgroup(dataframe, g_index, subgroup_name, prefix='raw_'): save_subgroup_filename = "".join([prefix, subgroup_name, ".csv.gz"]) dataframe.to_csv(save_subgroup_filename, compression='gzip', encoding='UTF-8') test_df = pd.read_csv(save_subgroup_filename, compression='gzip', index_col=g_index, encoding='UTF-8') # Test that we recover what we send in if dataframe.equals(test_df): print("Test-passed: we recover the equivalent subgroup dataframe.") else: print("Warning -- equivalence test!!! Double-check.") def load_subgroup(filename, index_col=[0]): return pd.read_csv(filename, compression='gzip', index_col=index_col, encoding='UTF-8') players = load_subgroup("raw_players.csv.gz") '''拿到数据先看数据的概况''' # print(players.head()) ''' photoID position height rater1 birthday rater2 weight playerShort aaron-hughes 3868.jpg Center Back 182.0 0.25 08.11.1979 0.00 71.0 aaron-hunt 20136.jpg Attacking Midfielder 183.0 0.00 04.09.1986 0.25 73.0 aaron-lennon 13515.jpg Right Midfielder 165.0 0.25 16.04.1987 0.25 63.0 aaron-ramsey 94953.jpg Center Midfielder 178.0 0.00 26.12.1990 0.00 76.0 abdelhamid-el-kaoutari 124913.jpg Center Back 180.0 0.25 17.03.1990 0.25 73.0 ''' import missingno as msno # msno.matrix(players.sample(n=500)) # msno.heatmap(players.sample(n=500)) # msno.bar(players.sample(n=500)) # 看 rater1 rater2 两个人打分的规则 # print(pd.crosstab(players['rater1'], players['rater2'])) # fig, ax = plt.subplots() # sns.heatmap(pd.crosstab(players['rater1'], players['rater2']), cmap='Blues') # ax.set_title('Correlation between Rater 1 and Rater 2') # fig.tight_layout() # plt.show() # modifying dataframe rater1 rater2平均值 players['skintone'] = players[['rater1', 'rater2']].mean(axis=1) # players = players.dropna(axis=0) # # sns.distplot(players['skintone'], kde=False) # sns.distplot(players.skintone, kde=False) # plt.show() # 踢球位置可视化 # MIDSIZE = (12, 8) # fig, ax = plt.subplots(figsize=MIDSIZE) # players.position.value_counts(dropna=False, ascending=True).plot(kind='barh', ax=ax) # ax.set_ylabel("Position") # ax.set_xlabel("Counts") # fig.tight_layout() # plt.show() # 自定义球场踢球位置 defense = ['Center Back', 'Defensive Midfielder', 'Left Fullback', 'Right Fullback', ] # 后卫 midfield = ['Right Midfielder', 'Center Midfielder', 'Left Midfielder', ] # 中场 forward = ['Attacking Midfielder', 'Left Winger', 'Right Winger', 'Center Forward'] # 前锋 keeper = 'Goalkeeper' # 守门员 # modifying dataframe -- adding the aggregated position categorical position_agg players.loc[players['position'].isin(defense), 'position_agg'] = "Defense" players.loc[players['position'].isin(midfield), 'position_agg'] = "Midfield" players.loc[players['position'].isin(forward), 'position_agg'] = "Forward" players.loc[players['position'].eq(keeper), 'position_agg'] = "Keeper" # print(players.head()) # MIDSIZE = (12, 8) # fig, ax = plt.subplots(figsize=MIDSIZE) # players['position_agg'].value_counts(dropna=False, ascending=True).plot(kind='barh', ax=ax) # ax.set_ylabel("position_agg") # ax.set_xlabel("Counts") # fig.tight_layout() # plt.show() '''多变量之间的关系''' from pandas.plotting import scatter_matrix # fig, ax = plt.subplots() # scatter_matrix(players[['height', 'weight', 'skintone']]) # plt.show() # fig, ax = plt.subplots(figsize=MIDSIZE) # sns.regplot('weight', 'height', data=players, ax=ax) # ax.set_ylabel("Height [cm]") # ax.set_xlabel("Weight [kg]") # fig.tight_layout() # plt.show() # 把连续值变为区间值 weight_categories = ["vlow_weight", "low_weight", "mid_weight", "high_weight", "vhigh_weight", ] players['weightclass'] = pd.qcut(players['weight'], len(weight_categories), weight_categories) height_categories = ["vlow_height", "low_height", "mid_height", "high_height", "vhigh_height", ] players['heightclass'] = pd.qcut(players['height'], len(height_categories), height_categories) # modifying dataframe players['birth_date'] = pd.to_datetime(players['birthday'], format='%d.%m.%Y') players['age_years'] = ((pd.to_datetime('2018-02-03') - players['birth_date']).dt.days) / 365.25 # print(players['age_years']) players_cleaned_variables = players.columns.tolist() # print(players_cleaned_variables) players['skintoneclass'] = pd.qcut(players['skintone'], 3) # 数据项属性选择 players_cleaned_variables = [ 'height', 'weight', 'skintone', 'position_agg', 'weightclass', 'heightclass', 'skintoneclass', 'age_years'] players[players_cleaned_variables].to_csv("cleaned_players.csv.gz", compression='gzip')
相关文章推荐
- Python数据分析与机器学习-逻辑回归算法
- Python数据分析与机器学习-使用Kmeans进行图像压缩
- Python机器学习-数据分析系列(1)-引言
- Python数据分析与机器学习-贷款申请最大化利润
- Python数据分析与机器学习-神经网络
- Python数据分析与机器学习-Pandas
- 关于 Python 数据抓取、分析、挖掘、机器学习和Python 分布式计算内容分享
- 大数据分析与机器学习领域Python兵器谱
- Python数据分析与机器学习-贝叶斯实现拼写检查器
- python数据分析与机器学习-用户流失预警
- Python数据分析与机器学习-泰坦尼克获救预测
- Python数据分析与机器学习-Numpy
- Python vs R : 在机器学习和数据分析领域中的对比
- Python数据分析与机器学习-用户流失预警
- Python数据分析与机器学习-scikit-learn模型建立与评估
- 关于 Python 数据抓取 & 分析 & 机器学习 & 挖掘 & 神经网络 内容的分享。
- Python数据分析与机器学习-梯度下降策略
- 2018python数据分析与机器学习实战(视频+源码+课件)
- python数据分析(预测性分析与机器学习)
- Python数据分析与机器学习-线性回归算法原理推导