Help, experts! In my Python ID3 decision-tree code, why does `bestfeat = labels[bestfeat_index]` keep raising an index-out-of-range error?
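The usual culprit, assuming your createTree follows the common "Machine Learning in Action" pattern (as the code reposted below does): `del labels[bestFeat]` mutates the one shared labels list in place. Each recursive call then sees a list that other branches have already shortened, so an index computed against the current sub-dataset can point past the end of the list, and `labels[bestfeat_index]` raises IndexError. The fix is to delete from a copy, so every call's list stays aligned with the columns of the dataset it was given. A minimal sketch (function names follow the Machine Learning in Action code further down this thread; adapt to yours):

def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # pure node: done
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # features exhausted: vote
    bestFeat = chooseBestFeature(dataSet)
    bestFeatLabel = labels[bestFeat]  # IndexError happens here if `labels`
    myTree = {bestFeatLabel: {}}      # was shortened by another branch
    subLabels = labels[:]    # copy first ...
    del subLabels[bestFeat]  # ... and delete from the copy only
    for value in set(example[bestFeat] for example in dataSet):
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree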

[Machine Learning] Decision Tree (based on ID3) — a python3 implementation
import numpy as np
import matplotlib.pyplot as plt

def calcShannonEnt(dataSet):
    """
    Compute the Shannon entropy of a data set.
    :param dataSet: data set including labels, shape = m*(n+1); m samples, n features
    :return: the Shannon entropy of the data set
    """
    m = len(dataSet)
    d = {}  # counts per class label
    for example in dataSet:
        target = example[-1]
        if target not in d:
            d[target] = 0
        d[target] += 1
    ShannonEnt = 0
    for label in d:
        prob = d[label] / m
        ShannonEnt += -prob * np.log2(prob)
    return ShannonEnt
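A quick sanity check (a sketch; the toy data here is made up): the entropy is H = -Σ p·log2(p) over the class probabilities p, so a two-class 50/50 set should give exactly 1.0 bit.

toy = [[1, 'yes'], [0, 'yes'], [1, 'no'], [0, 'no']]
print(calcShannonEnt(toy))  # 1.0: two classes, each with probability 0.5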
def splitDataSet(dataSet, feat, value):
    """
    Split the data set on one value of one feature.
    :param dataSet: data set including labels, shape = m*(n+1); m samples, n features
    :param feat: index of the feature to split on
    :param value: the feature value that selects the rows to keep
    :return: the split data set, with the split column removed
    """
    splitdata = []
    for i in range(len(dataSet)):
        if dataSet[i][feat] == value:
            reducedfeat = dataSet[i][: feat]
            reducedfeat.extend(dataSet[i][feat + 1:])
            splitdata.append(reducedfeat)
    return splitdata
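What splitDataSet returns, on a made-up three-row set: the rows whose column 0 equals 1, with that column cut out.

toy = [[1, 1, 'yes'], [1, 0, 'yes'], [0, 1, 'no']]
print(splitDataSet(toy, 0, 1))  # [[1, 'yes'], [0, 'yes']]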
def chooseBestFeatureToSplit(dataSet):
    """
    Try every value of every feature and pick the feature with the largest information gain.
    :param dataSet: data set including labels, shape = m*(n+1); m samples, n features
    :return: index i of the best feature
    """
    n = len(dataSet[0]) - 1
    baseEnt = calcShannonEnt(dataSet)
    bestFeature = -1
    bestInfoGain = 0
    for i in range(n):
        values = [sample[i] for sample in dataSet]
        values = set(values)
        newEnt = 0
        for value in values:
            splitdata = splitDataSet(dataSet, i, value)
            prob = len(splitdata) / len(dataSet)
            newEnt += prob * calcShannonEnt(splitdata)
        infoGain = baseEnt - newEnt
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
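A hand check on a made-up set where feature 0 separates the classes perfectly, so its gain is the full base entropy and index 0 wins:

toy = [[1, 1, 'yes'], [1, 0, 'yes'], [0, 1, 'no'], [0, 0, 'no']]
# base H = 1.0; both branches of feature 0 are pure, so gain(0) = 1.0,
# while both branches of feature 1 stay 50/50, so gain(1) = 0.
print(chooseBestFeatureToSplit(toy))  # 0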
def classtarget(targetlist):
    """
    Count each label in the label list and return the most frequent one.
    :param targetlist: list of labels
    :return: the most frequent label
    """
    d = {}  # counts per label
    for target in targetlist:
        if target not in d:
            d[target] = 0
        d[target] += 1
    return max(d, key=d.get)
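An equivalent using the standard library, if you prefer it over the hand-rolled count (tie-breaking behaviour may differ):

from collections import Counter

def classtarget_counter(targetlist):
    # most_common(1) returns [(label, count)] for the top label
    return Counter(targetlist).most_common(1)[0][0]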
def creatTree(dataSet, feature_label):
    """
    Recursively build the decision tree.
    :param dataSet: data set including labels, shape = m*(n+1); m samples, n features
    :param feature_label: the code works with feature indices; this list maps
           index -> human-readable feature name
    :return: the decision tree, as nested dicts
    """
    targetlist = [example[-1] for example in dataSet]
    # collect the labels of all samples into one list
    if len(set(targetlist)) == 1:
        # only one class left in the list: return that class
        return targetlist[0]
    if len(dataSet[0]) == 1:
        # no features left to split on:
        # return the most frequent class
        return classtarget(targetlist)
    bestfeature = chooseBestFeatureToSplit(dataSet)
    # index of the best feature to split on
    bestfeature_label = feature_label[bestfeature]
    # its human-readable name
    featlabel_copy = feature_label.copy()
    del featlabel_copy[bestfeature]
    # this list is handed down to the subtrees, so delete the used entry from a
    # copy (otherwise the indices drift and no longer match the right feature)
    mytree = {bestfeature_label: {}}
    # create the root node
    values = [example[bestfeature] for example in dataSet]
    values = set(values)
    for value in values:
        # build one subtree per value of the best feature
        sublabel = featlabel_copy[:]
        # fresh index -> name list for the subtree
        mytree[bestfeature_label][value] = creatTree(splitDataSet(dataSet, bestfeature, value), sublabel)  # recurse
    return mytree
decisionNode = dict(boxstyle='sawtooth', fc='0.8')
# decision nodes: sawtooth box
leafNode = dict(boxstyle='round4', fc='0.8')
# leaf nodes: rounded box
arrow_args = dict(arrowstyle='<-')
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """
    Draw one node, using the text-box and arrow formats defined above.
    :param nodeTxt: node text
    :param centerPt: arrow end point
    :param parentPt: arrow start point
    :param nodeType: node style (decisionNode or leafNode)
    :return: None
    """
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction', xytext=centerPt, textcoords='axes fraction',
                            va='center', ha='center', bbox=nodeType, arrowprops=arrow_args)
def getNumleafs(myTree):
    """
    Count the leaf nodes of the tree, recursively.
    :param myTree: decision tree
    :return: number of leaf nodes
    """
    numleafs = 0
    rootnode = list(myTree.keys())[0]
    # the root node
    secondnode = myTree[rootnode]
    # the nodes one level below the root
    for node in secondnode.keys():
        if type(secondnode[node]) == dict:
            # another level below this node: recurse
            numleafs += getNumleafs(secondnode[node])
        else:
            # already a leaf: no recursion needed, count it
            numleafs += 1
    return numleafs
def getTreeDepth(myTree):
    """
    Compute the depth of the tree.
    :param myTree: decision tree
    :return: the depth of the tree
    """
    maxdepth = 0
    rootnode = list(myTree.keys())[0]
    secondnode = myTree[rootnode]
    for node in secondnode.keys():
        if type(secondnode[node]) == dict:
            thisDepth = 1 + getTreeDepth(secondnode[node])
        else:
            thisDepth = 1
        if thisDepth > maxdepth:
            maxdepth = thisDepth
    return maxdepth
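Both helpers checked against the tree this thread's sample data produces: three leaves, depth 2.

tree = {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
print(getNumleafs(tree))   # 3
print(getTreeDepth(tree))  # 2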
def plotMidText(cntrPt, parentPt, txtString):
    """
    Put text on the edge between a parent and a child node.
    :param cntrPt: child node position
    :param parentPt: parent node position
    :param txtString: the text
    :return: None
    """
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    # midpoint x
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    # midpoint y
    createPlot.ax1.text(xMid, yMid, txtString)
    # draw the text on ax1
def plotTree(myTree, parentPt, nodeTxt):
    """
    Compute the width, depth and node positions of the plot, then draw the tree.
    :param myTree: decision tree
    :param parentPt: parent node position
    :param nodeTxt: edge text
    """
    numLeafs = getNumleafs(myTree)
    # width of this subtree
    depth = getTreeDepth(myTree)
    firstStr = list(myTree.keys())[0]
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    # label the child nodes
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    # move the y offset down one level
    for key in secondDict.keys():
        if type(secondDict[key]) == dict:
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD
def createPlot(myTree):
    """
    Draw the decision tree.
    :param myTree: decision tree
    :return: None
    """
    fig = plt.figure(1, facecolor='white')
    # create the canvas
    fig.clf()
    # clear the canvas
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumleafs(myTree))
    # total width
    plotTree.totalD = float(getTreeDepth(myTree))
    # total depth
    plotTree.xOff = -0.5 / plotTree.totalW; plotTree.yOff = 1.0
    plotTree(myTree, (0.5, 1.0), '')
    plt.show()
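A side note on style: plotTree and createPlot share state via attributes hung on the function objects themselves (plotTree.totalW, plotTree.xOff, ...), which works because Python functions are objects. A standalone sketch of the trick:

def counter():
    counter.calls += 1  # state lives on the function object, like a global
    return counter.calls

counter.calls = 0
print(counter(), counter())  # 1 2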
def classify(myTree, featLabels, testVec):
    """
    Classify one sample with the decision tree.
    :param myTree: the trained decision tree
    :param featLabels: the full list of feature names
    :param testVec: the test sample
    :return: the predicted class
    """
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    featIndex = featLabels.index(firstStr)
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict):
        classLabel = classify(valueOfFeat, featLabels, testVec)
        # internal node: recurse one level down
    else:
        classLabel = valueOfFeat
        # leaf node: return its value
    return classLabel
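Putting this first implementation together on the same fish data used further down the thread (a sketch; createPlot opens a matplotlib window, so it needs a GUI backend):

dataSet = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'],
           [0, 1, 'no'], [0, 1, 'no']]
feature_label = ['no surfacing', 'flippers']
tree = creatTree(dataSet, feature_label)
print(tree)  # {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
print(classify(tree, feature_label, [1, 0]))  # 'no'
createPlot(tree)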
[Machine Learning in Action, python3] Decision Tree ID3
import matplotlib.pyplot as plt
import numpy as np
import operator as op
import math
import treePlotter

# a weakness of kNN: it gives no insight into the intrinsic meaning of the data
# note that Python passes lists around by reference
# Shannon entropy measures the disorder of a data set;
# dataSet is m*n and the last column is the label
def calShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0
    for key in labelCounts:
        prob = float(labelCounts[key])/numEntries  # probability of each label
        shannonEnt -= prob * math.log(prob, 2)
    return shannonEnt
# split the data set on a given feature; featVec = feature vector
# rows whose feature `axis` equals `value` are kept, with that feature removed
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
# find the feature with the largest information gain (higher entropy = more
# disorder), i.e. the feature whose subsets have the smallest weighted entropy
def chooseBestFeature(dataSet):
    numFeature = len(dataSet[0]) - 1  # the last column is the label
    baseEntropy = calShannonEnt(dataSet)
    bestInfoGain, bestFeature = 0, -1
    for i in range(numFeature):
        # example[i] over all rows gives column i of dataSet
        featList = [example[i] for example in dataSet]
        uniqueVal = set(featList)
        newEntropy = 0
        # split into subsets by value, compute each subset's entropy,
        # combine into the weighted entropy, then compute the gain
        for value in uniqueVal:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob*calShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
# if all features are used up and a node's labels are still mixed,
# vote: the branch takes the most frequent label
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # sort by vote count, descending
    sortClassCount = sorted(classCount.items(), key=op.itemgetter(1), reverse=True)
    return sortClassCount[0][0]
# recursively build the tree; labels holds the human-readable name of every
# feature in the data set, e.g. 'no surfacing' and 'flippers' below
# the chosen bestFeat's label is deleted before recursing, which would shift
# the indices of labels -- hence the copy below
# e.g. {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    # only one class on this branch: a leaf is reached
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # all features used up: leaf reached, vote for the class label
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeature(dataSet)
    bestFeatLabel = labels[bestFeat]
    # the tree is stored as nested dicts
    myTree = {bestFeatLabel: {}}
    # after deleting bestFeat the label indices change, so always work on a
    # copy (Python passes lists by reference)
    sublabel = labels[:]
    del(sublabel[bestFeat])
    # column of the best feature, reduced to its unique values for iteration
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    # rows sharing the same best-feature value go to the same branch
    for value in uniqueVals:
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), sublabel)
    return myTree
# walk the finished tree; return when a leaf is reached
def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]  # the tree's root feature name
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    # scan all values of this feature
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]).__name__ == 'dict':
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel
dataSet = [[1, 1, 'yes'],
           [1, 1, 'yes'],
           [1, 0, 'no'],
           [0, 1, 'no'],
           [0, 1, 'no']]
labels = ['no surfacing', 'flippers']
mytree = createTree(dataSet, labels)
treePlotter.createPlot(mytree)
print(classify(mytree, labels, [0, 0]))
Decision Tree Algorithm Study Notes (2)
# -*- coding: UTF-8 -*-
from math import log
from numpy import *
import operator
import matplotlib.pyplot as plt

def calcShannonEnt(dataSet):
    numEntries = len(dataSet)  # number of samples
    labelCounts = {}  # dict of label counts
    for featVec in dataSet:
        currentLabel = featVec[-1]  # the last column holds the label
        if currentLabel not in labelCounts.keys():  # unseen label: add it
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1  # seen label: increment its count
    shannonEnt = 0.0  # initialise the entropy
    for key in labelCounts:  # accumulate the entropy
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt

# build the toy data set
def createDataSet():
    dataSet = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels

def filematrix(filename):
    data = []
    labels = []
    fr = open(filename)
    arrayLines = fr.readlines()
    for line in arrayLines:
        tokens = line.strip().split(' ')
        data.append([float(tk) for tk in tokens[:-1]])
        labels.append(tokens[-1])
    return data, labels

data, labels = filematrix("highandweight.txt")  # expects this file next to the script
# print(calcShannonEnt(data))
# print(data)
myDat, labels = createDataSet()
print(calcShannonEnt(myDat))

# split the data set
'''
dataSet: the data set to split
axis:    the feature to split on
value:   the feature value whose rows are returned
'''
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

print(splitDataSet(myDat, 0, 1))

# choose the best way to split the data set
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1  # number of features
    baseEntropy = calcShannonEnt(dataSet)  # initial Shannon entropy
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]  # extract column i
        uniqueVals = set(featList)  # deduplicate the feature values
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)  # weighted new entropy
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature

print(chooseBestFeatureToSplit(myDat))

# the most frequent class label
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

# build the decision tree
'''
dataSet: the data set
labels:  the feature-name list
'''
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]  # collect the class labels
    if classList.count(classList[0]) == len(classList):  # all labels identical
        return classList[0]
    if len(dataSet[0]) == 1:  # all features used up
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # copy before deleting: del on the shared `labels` would shrink the
    # caller's list and later labels[bestFeat] lookups go out of range
    subLabels = labels[:]
    del(subLabels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

myTree = createTree(myDat, labels)
print(myTree)

def classify(inputTree, featLables, testVec):
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLables.index(firstStr)
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]).__name__ == 'dict':
                classLabel = classify(secondDict[key], featLables, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel

print(classify(myTree, labels, [1, 1]))