Machine Learning in Action, Part 2: Decision Trees

2017-12-28
Ways to measure how disordered a dataset is:

Shannon entropy

    H(X) = -Σ p(x_i) · log2 p(x_i), summed over all classes x_i

The more uncertain a variable is, the higher its entropy. For example, a set with two 'yes' and three 'no' labels has H = -(2/5)·log2(2/5) - (3/5)·log2(3/5) ≈ 0.971.
Gini impurity
Pick an item at random from the dataset; Gini impurity measures the probability that it would be classified into the wrong group if it were labeled according to the class distribution of the set.
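As a quick illustration (this helper is not part of the book's code), a minimal sketch of Gini impurity for the same dataset format used below, where the class label is the last column of each row:

def cal_gini_impurity(dataset):
    # Count how often each class label (last column) occurs
    label_count = {}
    for vec in dataset:
        label_count[vec[-1]] = label_count.get(vec[-1], 0) + 1

    # Gini = 1 - sum(p_k^2); 0 means pure, higher means more mixed
    gini = 1.0
    for key in label_count:
        prob = label_count[key] / len(dataset)
        gini -= prob * prob
    return gini

# For the toy dataset below (2 'yes', 3 'no'): 1 - (0.4**2 + 0.6**2) = 0.48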
 
Building the tree recursively:
Start from the original dataset, pick the best feature, and split the dataset on its values. Because a feature can take more than two values, a split may produce more than two branches; each resulting subset is then processed recursively in the same way.

Stopping conditions for the recursion: all features that can split the data have been used up, or every instance in a branch belongs to the same class.

Python implementation:

from math import log
import operator

def create_trees(dataset, labels):
    class_list = [example[-1] for example in dataset]
    feat_num = len(dataset[0])
    # Second stopping condition: every instance in this branch has the same class
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]
    # First stopping condition: all features are used up (only the class label
    # column is left), so return the majority class
    if feat_num == 1:
        return majority_count(class_list)

    best_feat = choose_best_feature(dataset)
    best_feat_label = labels[best_feat]
    del(labels[best_feat])
    # best_feat is a column index; collect every value of that feature
    feat_values = [example[best_feat] for example in dataset]
    uniq_feat_values = set(feat_values)
    my_tree = {best_feat_label: {}}

    for value in uniq_feat_values:
        # Copy labels so the recursive calls do not mutate each other's list
        sub_labels = labels[:]
        sub_dataset = split_dataset(dataset, best_feat, value)
        my_tree[best_feat_label][value] = create_trees(sub_dataset, sub_labels)

    return my_tree

def majority_count(class_list):
    '''
    :param class_list: list of class labels
    :return: the label of the most frequent class
    '''
    class_count = {}
    for cla in class_list:
        if cla not in class_count.keys():
            class_count[cla] = 0
        class_count[cla] += 1

    sorted_list = sorted(class_count.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_list[0][0]
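For reference, the standard library offers the same majority vote in one line (an equivalent alternative, not the book's version):

from collections import Counter

def majority_count_alt(class_list):
    # most_common(1) returns [(label, count)] for the most frequent label
    return Counter(class_list).most_common(1)[0][0]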

def choose_best_feature(dataset):
    # Entropy of the dataset before any split
    base_entropy = cal_shannon_entropy(dataset)
    best_infogain = 0.0
    # The last column is the class label, so it is not a candidate feature
    feature_num = len(dataset[0]) - 1
    best_feat = -1

    for i in range(feature_num):
        new_entropy = 0.0

        # Values of the i-th feature column
        feat_list = [example[i] for example in dataset]
        uniq_feat_list = set(feat_list)

        # Expected entropy after splitting on feature i,
        # weighted by the size of each subset
        for value in uniq_feat_list:
            sub_dataset = split_dataset(dataset, i, value)
            prob = len(sub_dataset) / len(dataset)
            new_entropy += prob * cal_shannon_entropy(sub_dataset)

        info_gain = base_entropy - new_entropy
        if info_gain > best_infogain:
            best_infogain = info_gain
            best_feat = i

    return best_feat
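To make this concrete: on the toy dataset defined below, the base entropy is about 0.971. Splitting on 'no surfacing' (feature 0) leaves a weighted entropy of (3/5)·0.918 + (2/5)·0 ≈ 0.551, an information gain of about 0.420; splitting on 'flippers' (feature 1) leaves (4/5)·1.0 + (1/5)·0 = 0.8, a gain of only about 0.171. choose_best_feature therefore returns 0.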

def split_dataset(dataset, axis, value):
    '''
    :param dataset: dataset to split
    :param axis: index of the feature to split on
    :param value: feature value that a row must match
    :return: the matching rows, with the split column removed
    '''
    ret_dataset = []
    for vec in dataset:
        if vec[axis] == value:
            # Drop the column that was used for the split
            reduced_vec = vec[:axis]
            reduced_vec.extend(vec[axis + 1:])
            ret_dataset.append(reduced_vec)

    return ret_dataset
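For example, once the full listing has been run, splitting the toy dataset on feature 0 with value 1 keeps the three matching rows and drops the split column:

print(split_dataset(dataset, 0, 1))
# [[1, 'yes'], [1, 'yes'], [0, 'no']]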

def cal_shannon_entropy(dataset):
    '''
    :param dataset: dataset whose Shannon entropy is wanted
    :return: the Shannon entropy
    '''
    shannon_entropy = 0.0
    label_count = {}
    # Count the occurrences of each class label (last column)
    for vec in dataset:
        current_label = vec[-1]
        if current_label not in label_count.keys():
            label_count[current_label] = 0
        label_count[current_label] += 1

    # H = -sum(p * log2(p)) over all classes
    for key in label_count:
        prob = float(label_count[key]) / len(dataset)
        shannon_entropy -= prob * log(prob, 2)

    return shannon_entropy

def create_dataset():
    # Toy data: features are 'no surfacing' and 'flippers',
    # the last column says whether the animal is a fish
    dataset = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataset, labels

dataset, labels = create_dataset()
print(dataset)
my_tree = create_trees(dataset, labels)
print(my_tree)


Output:

[[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
Drawing the tree with matplotlib annotations:

import matplotlib.pyplot as plt

decision_node = dict(boxstyle="sawtooth", fc="0.8")
leaf_node = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")

def create_plot(in_tree):
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])  # hide the tick marks
    create_plot.ax1 = plt.subplot(111, frameon=False, **axprops)
    # Layout state stored on plot_tree: total width/depth and current offsets
    plot_tree.totalW = float(get_leaf_num(in_tree))
    plot_tree.totalD = float(get_tree_depth(in_tree))
    plot_tree.x_off = -0.5 / plot_tree.totalW
    plot_tree.y_off = 1.0
    plot_tree(in_tree, (0.5, 1.0), '')

    plt.show()

def plot_node(node_text, center_pt, parent_pt, node_type):
    # Draw a boxed node at center_pt with an arrow coming from parent_pt
    create_plot.ax1.annotate(node_text, xy=parent_pt, xycoords='axes fraction',
                             xytext=center_pt, textcoords='axes fraction',
                             va="center", ha="center",
                             bbox=node_type, arrowprops=arrow_args)

def plot_mid_text(cntr_pt, parent_pt, txt_string):
    # Label the branch between parent and child at its midpoint
    x_mid = (parent_pt[0] - cntr_pt[0]) / 2.0 + cntr_pt[0]
    y_mid = (parent_pt[1] - cntr_pt[1]) / 2.0 + cntr_pt[1]
    create_plot.ax1.text(x_mid, y_mid, txt_string)

def plot_tree(my_tree, parent_pt, node_txt):
    leaf_num = get_leaf_num(my_tree)
    first_str = list(my_tree.keys())[0]
    # Center this node above the leaves of its subtree
    cntr_pt = (plot_tree.x_off + (1.0 + float(leaf_num)) / 2.0 / plot_tree.totalW,
               plot_tree.y_off)
    plot_mid_text(cntr_pt, parent_pt, node_txt)
    plot_node(first_str, cntr_pt, parent_pt, decision_node)
    second_dict = my_tree[first_str]
    # Step one level down before drawing the children
    plot_tree.y_off = plot_tree.y_off - 1.0 / plot_tree.totalD
    for key in second_dict.keys():
        if type(second_dict[key]).__name__ == 'dict':
            plot_tree(second_dict[key], cntr_pt, str(key))
        else:
            plot_tree.x_off = plot_tree.x_off + 1.0 / plot_tree.totalW
            plot_node(second_dict[key], (plot_tree.x_off, plot_tree.y_off), cntr_pt, leaf_node)
            plot_mid_text((plot_tree.x_off, plot_tree.y_off), cntr_pt, str(key))

    # Step back up when this subtree is done
    plot_tree.y_off = plot_tree.y_off + 1.0 / plot_tree.totalD

def retrieve_tree(i):
    # Two hand-built trees for testing the plotting code
    list_of_tree = [{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
                    {'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}}]

    return list_of_tree[i]

def get_leaf_num(my_tree):
    leaf_num = 0
    # The tree has a single root, so there is exactly one key
    first_str = list(my_tree.keys())[0]
    # The subtree hanging below the root
    second_dict = my_tree[first_str]

    # Walk the children of the root
    for key in second_dict.keys():
        # A dict value is a subtree, so recurse; anything else is a leaf
        if type(second_dict[key]).__name__ == 'dict':
            leaf_num += get_leaf_num(second_dict[key])
        else:
            leaf_num += 1

    return leaf_num

def get_tree_depth(my_tree):
    max_depth = 0
    # The tree has a single root, so there is exactly one key
    first_str = list(my_tree.keys())[0]
    second_dict = my_tree[first_str]

    for key in second_dict.keys():
        if type(second_dict[key]).__name__ == 'dict':
            this_depth = 1 + get_tree_depth(second_dict[key])
        else:
            this_depth = 1
        if this_depth > max_depth:
            max_depth = this_depth

    return max_depth
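A quick sanity check of the two helpers on the first test tree (extra lines, not in the original):

tree0 = retrieve_tree(0)
assert get_leaf_num(tree0) == 3    # leaves: 'no', 'no', 'yes'
assert get_tree_depth(tree0) == 2  # levels: 'no surfacing' -> 'flippers'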

my_tree = retrieve_tree(0)
print(my_tree)
create_plot(my_tree)

Output: a matplotlib window with the rendered tree diagram.

matplotlib annotation parameters:

Common boxstyle values: square, circle, round, round4, sawtooth, roundtooth, larrow, rarrow (see the matplotlib documentation for the full table).

Common arrowstyle values: '-', '->', '<-', '<->', '-[', '|-|', 'fancy', 'simple', 'wedge' (again, the matplotlib documentation lists them all).
Using the decision tree to classify:

def classify(input_tree, feat_labels, test_vec):
    '''
    Classification function that walks the decision tree.
    :param input_tree: the decision tree
    :param feat_labels: list of feature names
    :param test_vec: feature vector to classify
    :return: the predicted class label
    '''
    first_str = list(input_tree.keys())[0]
    second_dict = input_tree[first_str]
    # Translate the feature name at this node into a column index
    feat_index = feat_labels.index(first_str)
    for key in second_dict.keys():
        if test_vec[feat_index] == key:
            if type(second_dict[key]).__name__ == 'dict':
                # Internal node: keep walking down
                class_label = classify(second_dict[key], feat_labels, test_vec)
            else:
                # Leaf node: this is the answer
                class_label = second_dict[key]

    return class_label

# retrieve_tree is the same helper defined in the plotting section above

dataset, labels = create_dataset()
print(dataset)
my_tree = retrieve_tree(0)
c = classify(my_tree, labels, [1, 0])
print(c)  # prints 'no'
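A second call, added here as an extra check (not in the original), follows the other branch of 'flippers':

c = classify(my_tree, labels, [1, 1])
print(c)  # prints 'yes'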


______________________________________________________________
Python serialization: pickle and JSON
pickle
Converts a variable to bytes for storage.
pickle is a serialization module specific to Python.
Serialization functions: dump/dumps
Deserialization functions: load/loads
dump(var, file)
load(file)
 
JSON
Serializes to a string (str) instead of bytes, so it is human-readable and language-neutral.
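A small illustration (not in the original) of the difference, using the tree from above; note that json turns the integer keys 0 and 1 into the strings '0' and '1' on a round trip:

import json

tree = {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
s = json.dumps(tree)      # a plain str
restored = json.loads(s)  # integer dict keys come back as strings
print(restored['no surfacing']['1'])  # {'flippers': {'0': 'no', '1': 'yes'}}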

Storing the decision tree:

def store_tree(input_tree, filename):
    import pickle
    # pickle writes bytes, so the file must be opened in binary mode
    fw = open(filename, 'wb')
    pickle.dump(input_tree, fw)
    fw.close()

def grab_tree(filename):
    import pickle
    fr = open(filename, 'rb')
    return pickle.load(fr)
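A minimal usage sketch (the filename is just an example):

store_tree(my_tree, 'classifier_tree.pkl')
print(grab_tree('classifier_tree.pkl'))
# {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}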