Python数据分析与机器学习-Pandas
2017-12-26 20:00
465 查看
源码下载地址:
http://download.csdn.net/download/adam_zs/10174600
import pandas
# Basic DataFrame exploration: load a CSV, inspect it, select rows and columns.

# read_csv treats the first row of the file as the column names by default.
food_info = pandas.read_csv("food_info.csv")
print(food_info)
print(type(food_info))  # DataFrame
print(food_info.dtypes)

first_rows = food_info.head()  # head() shows the first 5 rows by default
print(first_rows)
print(food_info.head(3))  # first 3 rows
print(food_info.tail(3))  # last 3 rows
print(food_info.shape)
print(food_info.columns)  # column names
# Column names printed by the line above:
# ['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)',
#  'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)',
#  'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)',
#  'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)',
#  'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)',
#  'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)',
#  'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg',
#  'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)',
#  'Cholestrl_(mg)']

# Row selection with .loc (by index label).
print(food_info.loc[0])  # first row
print(food_info.loc[6])  # seventh row
# food_info.loc[8620] raises
# KeyError: 'the label [8620] is not in the [index]'
print(food_info.loc[3:6])  # slice: rows 3, 4, 5 and 6 (both ends are included)
print(food_info.loc[[2, 5, 10]])  # rows 2, 5 and 10

# Column selection.
print(food_info["NDB_No"])  # the NDB_No column
print(food_info[["Zinc_(mg)", "Copper_(mg)"]])  # the Zinc_(mg) and Copper_(mg) columns

# Keep only the columns whose name ends with "(g)".
col_names = food_info.columns.tolist()
gram_columns = [c for c in col_names if c.endswith("(g)")]
gram_df = food_info[gram_columns]
print(gram_df.head(3))
http://download.csdn.net/download/adam_zs/10174600
import pandas
# Basic DataFrame exploration: load a CSV, inspect it, select rows and columns.

# read_csv treats the first row of the file as the column names by default.
food_info = pandas.read_csv("food_info.csv")
print(food_info)
print(type(food_info))  # DataFrame
print(food_info.dtypes)

first_rows = food_info.head()  # head() shows the first 5 rows by default
print(first_rows)
print(food_info.head(3))  # first 3 rows
print(food_info.tail(3))  # last 3 rows
print(food_info.shape)
print(food_info.columns)  # column names
# Column names printed by the line above:
# ['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)',
#  'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)',
#  'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)',
#  'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)',
#  'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)',
#  'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)',
#  'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg',
#  'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)',
#  'Cholestrl_(mg)']

# Row selection with .loc (by index label).
print(food_info.loc[0])  # first row
print(food_info.loc[6])  # seventh row
# food_info.loc[8620] raises
# KeyError: 'the label [8620] is not in the [index]'
print(food_info.loc[3:6])  # slice: rows 3, 4, 5 and 6 (both ends are included)
print(food_info.loc[[2, 5, 10]])  # rows 2, 5 and 10

# Column selection.
print(food_info["NDB_No"])  # the NDB_No column
print(food_info[["Zinc_(mg)", "Copper_(mg)"]])  # the Zinc_(mg) and Copper_(mg) columns

# Keep only the columns whose name ends with "(g)".
col_names = food_info.columns.tolist()
gram_columns = [c for c in col_names if c.endswith("(g)")]
gram_df = food_info[gram_columns]
print(gram_df.head(3))
import pandas

# Column arithmetic, derived columns and sorting on the food_info data set.
food_info = pandas.read_csv("food_info.csv")
col_names = food_info.columns.tolist()
print(col_names)
# ['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)',
#  'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)',
#  'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)', 'Copper_(mg)', 'Manganese_(mg)',
#  'Selenium_(mcg)', 'Vit_C_(mg)', 'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)',
#  'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg', 'Vit_D_IU', 'Vit_K_(mcg)',
#  'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)', 'Cholestrl_(mg)']
print(food_info[col_names])

# Arithmetic between a column and a scalar is applied to every value.
iron_mg = food_info["Iron_(mg)"]
print(iron_mg)
print(iron_mg / 1000)
print(iron_mg + 100)
print(iron_mg - 100)
print(iron_mg * 2)

# Arithmetic between two columns multiplies / divides the values at
# corresponding positions.
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
iron_grams = iron_mg / 1000
food_info["Iron_(g)"] = iron_grams  # store the result as a new column

protein = food_info["Protein_(g)"]
fat = food_info["Lipid_Tot_(g)"]
weighted_protein = protein * 2
weighted_fat = -0.75 * fat
initial_rating = weighted_protein + weighted_fat

# Scale columns by their own maximum value.
calories = food_info["Energ_Kcal"]
print(calories)
max_calories = calories.max()
normalized_calories = calories / max_calories
normalized_protein = protein / protein.max()
normalized_fat = fat / fat.max()
food_info["Normalized_Protein"] = normalized_protein
food_info["Normalized_Fat"] = normalized_fat

# inplace=True sorts the DataFrame itself instead of returning a new one.
food_info.sort_values("Sodium_(mg)", inplace=True, ascending=True)  # ascending (the default)
print(food_info["Sodium_(mg)"])
food_info.sort_values("Sodium_(mg)", inplace=True, ascending=False)  # descending
print(food_info["Sodium_(mg)"])
import pandas as pd
import numpy as np

# Titanic survival data set. Column meanings:
#   'PassengerId'  passenger id
#   'Survived'     label value
#   'Pclass'       cabin class
#   'Name'         name
#   'Sex'          sex
#   'Age'          age
#   'SibSp'        number of siblings
#   'Parch'        total number of elderly people and children
#   'Ticket'       ticket number
#   'Fare'         ticket price
#   'Cabin'        seat
#   'Embarked'     port of embarkation


def hundredth_row(column):
    """Return the item stored under index label 99 of ``column``.

    With a default 0-based index this is the hundredth item of the Series.
    Raises KeyError when label 99 is not in the index.
    """
    return column.loc[99]


def null_count(column):
    """Return the number of missing (NaN) values in ``column``."""
    return int(pd.isnull(column).sum())


# Backward-compatible alias: the original name said "not null" although the
# function has always counted the missing values.
not_null_count = null_count


def which_class(row):
    """Map the ``Pclass`` value of ``row`` to a readable class name.

    Returns "Unknown" for a missing value and None for any value other
    than 1, 2 or 3.
    """
    pclass = row["Pclass"]
    if pd.isnull(pclass):
        return "Unknown"
    elif pclass == 1:
        return "First Class"
    elif pclass == 2:
        return "Second Class"
    elif pclass == 3:
        return "Third Class"
    return None


def is_minor(row):
    """Return True when the passenger in ``row`` is younger than 18.

    A missing age compares False, so it is reported as not a minor. The
    threshold matches generate_age_label.
    """
    return bool(row["Age"] < 18)


def generate_age_label(row):
    """Return "unknown", "minor" (age < 18) or "adult" for ``row``."""
    age = row["Age"]
    if pd.isnull(age):
        return "unknown"
    elif age < 18:
        return "minor"
    else:
        return "adult"


if __name__ == "__main__":
    # Other display options that can be raised the same way:
    # 'display.max_rows', 'display.max_columns'.
    pd.set_option("display.width", 1000)

    titanic_survival = pd.read_csv("titanic_train.csv")
    print(titanic_survival.columns)
    print(titanic_survival.head())

    # The pandas library uses NaN ("not a number") to indicate a missing value.
    # pd.isnull() takes a Series and returns a Series of True/False values:
    # True where the value is missing, False where it is not.
    age = titanic_survival["Age"]
    print(age)
    age_is_null = pd.isnull(age)
    print(age_is_null)
    age_null_true = age[age_is_null]
    print(len(age_null_true))  # number of missing ages: 177

    # Summing by hand includes the NaN entries, so the result is nan.
    mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
    print(mean_age)  # nan

    good_ages = titanic_survival["Age"][~age_is_null]
    print(sum(good_ages) / len(good_ages))  # 29.6991176471
    # mean() leaves out the rows without an age automatically.
    correct_mean_age = titanic_survival["Age"].mean()
    print(correct_mean_age)  # 29.69911764705882

    # Average fare for each passenger class.
    passenger_classes = [1, 2, 3]
    fares_by_class = {}
    for this_class in passenger_classes:
        pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
        fare_for_class = pclass_rows["Fare"].mean()
        fares_by_class[this_class] = fare_for_class
    print(fares_by_class)

    # pivot_table computes grouped statistics:
    #   index   - the column to group by
    #   values  - the column(s) the calculation is applied to
    #   aggfunc - the calculation to perform
    # String names are used instead of np.mean / np.sum, which current
    # pandas deprecates as aggfunc values.
    passenger_Fare = titanic_survival.pivot_table(index="Pclass", values="Fare", aggfunc="mean")
    print(passenger_Fare)
    passenger_Fare = titanic_survival.pivot_table(index="Pclass", values="Fare")  # mean is the default
    print(passenger_Fare)
    # Group by Embarked and sum Fare and Survived.
    port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")
    print(port_stats)

    print(titanic_survival)
    # Passing axis=1 or axis='columns' to dropna drops every column that
    # contains a missing value.
    print(titanic_survival.fillna(0))  # fill every missing value with 0
    print(titanic_survival.dropna(axis=1))  # drop the columns that contain missing values
    print(titanic_survival.dropna(axis=0))  # drop the rows that contain missing values
    # Drop the rows where "Age" or "Sex" is missing.
    print(titanic_survival.dropna(axis=0, subset=["Age", "Sex"]))

    row_index_83_age = titanic_survival.loc[83, "Age"]  # (row label, column name)
    row_index_766_pclass = titanic_survival.loc[766, "Pclass"]
    print(titanic_survival.loc[83])
    print(row_index_83_age)
    print(titanic_survival.loc[766])
    print(row_index_766_pclass)

    new_titanic_survival = titanic_survival.sort_values("Age", ascending=True)
    print(new_titanic_survival)
    print(new_titanic_survival.loc[:10])
    # Rebuild the index; reset_index returns the re-indexed frame.
    titanic_reindexed = new_titanic_survival.reset_index(drop=True)
    print(titanic_reindexed.loc[:10])

    # apply() runs a user-defined function; the result gets its own name so
    # the hundredth_row function is not overwritten.
    hundredth_values = titanic_survival.apply(hundredth_row)
    print(hundredth_values)

    # Number of missing values.
    print(pd.isnull(titanic_survival))
    print(len(pd.isnull(titanic_survival)))  # 891
    column_null_count = titanic_survival.apply(null_count, axis=0)  # axis=0: columns, axis=1: rows
    print(column_null_count)

    classes = titanic_survival.apply(which_class, axis=1)
    print(classes)

    ages = titanic_survival.apply(is_minor, axis=1)
    print(ages)

    age_labels = titanic_survival.apply(generate_age_label, axis=1)
    print(age_labels)

    titanic_survival["age_labels"] = age_labels
    # Survival rate of adults vs. minors.
    age_group_survival = titanic_survival.pivot_table(index="age_labels", values="Survived")
    print(age_group_survival)
import numpy as np
import pandas as pd
from pandas import Series

# Raise the display limits used when printing.
# 'display.height' is intentionally not set: that option does not exist in
# current pandas and set_option raises OptionError for it.
pd.set_option('display.max_rows', 9999)
pd.set_option('display.max_columns', 9999)
pd.set_option('display.width', 9999)

# Series    (collection of values): one row or one column
# DataFrame (collection of Series objects)
# Panel     (collection of DataFrame objects; no longer available in current pandas)
# A single row or column of a DataFrame is a Series.
# Series.values is an ndarray.

# Movie ratings.
fandango = pd.read_csv('fandango_score_comparison.csv')
print(fandango.head())
series_film = fandango['FILM']
print(type(series_film))  # Series
print(series_film[0:5])
series_rt = fandango['RottenTomatoes']
print(series_rt[0:5])

film_names = series_film.values
print(type(film_names))  # ndarray
print(film_names)
rt_scores = series_rt.values
print(rt_scores)

# Use the film names as the index of the scores.
series_custom = Series(rt_scores, index=film_names)
print(series_custom)
print(series_custom[['Minions (2015)', 'Leviathan (2014)']])

# Integer positions are also available; .iloc makes the positional
# slice explicit on a label-indexed Series.
fiveten = series_custom.iloc[5:10]
print(fiveten)

# Reorder the Series by its sorted index labels.
original_index = series_custom.index.tolist()
print(original_index)
sorted_index = sorted(original_index)
print(sorted_index)
sorted_by_index = series_custom.reindex(sorted_index)
print(sorted_by_index)

sc2 = series_custom.sort_index()
sc3 = series_custom.sort_values()
print(sc2.iloc[:10])
print(sc3.iloc[:10])

print(series_custom.head())
# Add each value with each other
print(np.add(series_custom.head(), series_custom.head()))
# Apply sine function to each value
print(np.sin(series_custom))
# Return the highest value (will return a single value not a Series)
print(np.max(series_custom))

# Boolean filtering.
print(fandango.head())
series_custom = Series(fandango['RottenTomatoes'].values, index=fandango['FILM'].values)
print(series_custom.head())
print(series_custom > 50)
series_greater_than_50 = series_custom[series_custom > 50]

criteria_one = series_custom > 50
criteria_two = series_custom < 75
both_criteria = series_custom[criteria_one & criteria_two]
print(both_criteria)

# Data alignment: both Series use the same index.
rt_critics = Series(fandango['RottenTomatoes'].values, index=fandango['FILM'])
rt_users = Series(fandango['RottenTomatoes_User'].values, index=fandango['FILM'])
rt_mean = (rt_critics + rt_users) / 2
print(rt_mean)
import numpy as np
import pandas as pd

# Raise the display limits used when printing.
# 'display.height' is intentionally not set: that option does not exist in
# current pandas and set_option raises OptionError for it.
pd.set_option('display.max_rows', 9999)
pd.set_option('display.max_columns', 9999)
pd.set_option('display.width', 9999)

fandango = pd.read_csv('fandango_score_comparison.csv')
print(fandango.head())
print(type(fandango))

# Give the DataFrame a custom index: use FILM as the index and drop the
# original FILM column.
fandango_films = fandango.set_index('FILM', drop=True)
print(fandango_films)
print(fandango_films.index)

# Slice using either bracket notation or loc[]
print(fandango_films["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"])
print(fandango_films.loc["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"])

# Specific movie
print(fandango_films.loc['Kumiko, The Treasure Hunter (2015)'])

# Selecting list of movies
movies = ['Kumiko, The Treasure Hunter (2015)', 'Do You Believe? (2015)', 'Ant-Man (2015)']
print(fandango_films.loc[movies])
# When selecting multiple rows, a DataFrame is returned,
# but when selecting an individual row, a Series object is returned instead

print(fandango_films.head())
types = fandango_films.dtypes
print(types)

# Keep only the float64 columns.
float_df = fandango_films.select_dtypes(include=['float64'])
float_columns = float_df.columns
print(float_df)

# `x` is a Series object representing a column; np.std is the standard deviation.
deviations = float_df.apply(lambda x: np.std(x))
print(deviations)

print(float_df.head())
# NOTE(review): 'Metacritic_user_nom' (no "r") is presumably the CSV's own
# header spelling - confirm against the file before changing it.
rt_mt_user = float_df[['RT_user_norm', 'Metacritic_user_nom']]
# With axis=1 the function is applied to each row instead of each column.
print(rt_mt_user.apply(lambda x: np.std(x), axis=1))
相关文章推荐
- Python数据分析与机器学习-神经网络
- 利用Python进行数据分析(12) pandas基础: 数据合并
- python 抓取数据,pandas进行数据分析并可视化展示
- python中添加数据分析工具numpy和pandas
- 【python数据挖掘课程】十二.Pandas、Matplotlib结合SQL语句对比图分析
- Python数据分析库pandas基本操作方法
- 关于 Python 数据抓取、分析、挖掘、机器学习和Python 分布式计算内容分享
- Python数据分析常用手册——Numpy和Pandas
- Python数据分析之pandas学习(二)
- Python数据分析包:pandas 基础
- Python 数据分析-pandas 基础
- [Python数据分析-01]Pandas数据结构之Series
- 用python做数据分析4|pandas库介绍之DataFrame基本操作
- python数据分析(pandas入门)
- python数据分析之pandas里的Series
- Python数据分析模块 | pandas做数据分析(三):统计相关函数
- Python数据分析之真实IP请求Pandas详解
- Python数据分析之pandas学习
- python数据分析入门(一)----安装pandas
- Python数据分析:NumPy、pandas中(axis=0 与axis=1)区分