## 1.决策树的构造

### 1.1 信息增益

```from math import log         #we use log function to calculate the entropy
import operator```

```def calcShannonEnt(dataSet):
numEntries = len(dataSet)
labelCounts = {}
for featVec in dataSet: #the the number of unique elements and their occurance
currentLabel = featVec[-1]
if currentLabel not in labelCounts.keys(): labelCounts[currentLabel] = 0
labelCounts[currentLabel] += 1
shannonEnt = 0.0
for key in labelCounts:
prob = float(labelCounts[key])/numEntries
shannonEnt -= prob * log(prob,2)     #log base 2
return shannonEnt```

```def createDataSet():
dataSet = [[1, 1, 'yes'],
[1, 1, 'yes'],
[1, 0, 'no'],
[0, 1, 'no'],
[0, 1, 'no']]
labels = ['no surfacing','flippers']
#change to discrete values
return dataSet, labels```

`cd /home/fangyang/桌面/machinelearninginaction/Ch03`

/home/fangyang/桌面/machinelearninginaction/Ch03

`import trees` `myDat, labels = trees.createDataSet()` `myDat #old data set` [[1, 1, ‘yes’], [1, 1, ‘yes’], [1, 0, ‘no’], [0, 1, ‘no’], [0, 1, ‘no’]] `labels` [‘no surfacing’, ‘flippers’] `trees.calcShannonEnt(myDat) #calculate the entropy` 0.9709505944546686 `myDat[0][-1]='maybe' #change the result ,and look again the entropy` `myDat #new data set` [[1, 1, ‘maybe’], [1, 1, ‘yes’], [1, 0, ‘no’], [0, 1, ‘no’], [0, 1, ‘no’]] `trees.calcShannonEnt(myDat) # the new entropy` 1.3709505944546687

### 1.2 划分数据集

```def splitDataSet(dataSet, axis, value):
retDataSet = []
for featVec in dataSet:
if featVec[axis] == value:
reducedFeatVec = featVec[:axis]     #chop out axis used for splitting
reducedFeatVec.extend(featVec[axis+1:])
retDataSet.append(reducedFeatVec)
return retDataSet```

```a=[1,2,3]
b=[4,5,6]
a.append(b)```

`a`

[1, 2, 3, [4, 5, 6]]

```a=[1,2,3]
a.extend(b)```

`a`

[1, 2, 3, 4, 5, 6] 可见append函数是直接将b的原型导入a中，extend是将b中的元素导入到a中 下面再来测试一下 `myDat, labels = trees.createDataSet() #initialization` `myDat` [[1, 1, ‘yes’], [1, 1, ‘yes’], [1, 0, ‘no’], [0, 1, ‘no’], [0, 1, ‘no’]] `trees.splitDataSet(myDat,0,1) #choose the first character to split the dataset` [[1, ‘yes’], [1, ‘yes’], [0, ‘no’]] `trees.splitDataSet(myDat,0,0)# change the value ,look the difference of previous results` [[1, ‘no’], [1, ‘no’]]

```def chooseBestFeatureToSplit(dataSet):
numFeatures = len(dataSet[0]) - 1      #the last column is used for the labels
baseEntropy = calcShannonEnt(dataSet) #calculate the original entropy
bestInfoGain = 0.0; bestFeature = -1
for i in range(numFeatures):        #iterate over all the features
featList = [example[i] for example in dataSet]#create a list of all the examples of this feature
uniqueVals = set(featList)       #get a set of unique values
newEntropy = 0.0
for value in uniqueVals:
subDataSet = splitDataSet(dataSet, i, value)
prob = len(subDataSet)/float(len(dataSet))
newEntropy += prob * calcShannonEnt(subDataSet)
infoGain = baseEntropy - newEntropy     #calculate the info gain; ie reduction in entropy
if (infoGain > bestInfoGain):       #compare this to the best gain so far
bestInfoGain = infoGain         #if better than current best, set to best
bestFeature = i
return bestFeature                      #returns an integer```

```import trees
myDat, labels = trees.createDataSet()
trees.chooseBestFeatureToSplit(myDat)   #return the index of best character to split```

0

### 1.3 递归构建决策树

```def majorityCnt(classList):
classCount={}
for vote in classList:
if vote not in classCount.keys(): classCount[vote] = 0
classCount[vote] += 1
sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)
return sortedClassCount[0][0]```

```import numpy as np
classList = np.array(myDat).T[-1]```

`classList`

array([‘yes’, ‘yes’, ‘no’, ‘no’, ‘no’], dtype=’|S21′)

`majorityCnt(classList)    #the number of 'no' is 3, 'yes' is 2,so return 'no'`

‘no’ 接下来是创建决策树函数

```def createTree(dataSet,labels):
classList = [example[-1] for example in dataSet]
if classList.count(classList[0]) == len(classList):
return classList[0]#stop splitting when all of the classes are equal
if len(dataSet[0]) == 1: #stop splitting when there are no more features in dataSet
return majorityCnt(classList)
bestFeat = chooseBestFeatureToSplit(dataSet)
bestFeatLabel = labels[bestFeat]
myTree = {bestFeatLabel:{}}
del(labels[bestFeat])              #delete the best feature , so it can find the next best feature
featValues = [example[bestFeat] for example in dataSet]
uniqueVals = set(featValues)
for value in uniqueVals:
subLabels = labels[:]       #copy all of labels, so trees don't mess up existing labels
myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value),subLabels)
return myTree```

`myTree = trees.createTree(myDat,labels)`

`myTree`

{‘no surfacing’: {0: ‘no’, 1: {‘flippers’: {0: ‘no’, 1: ‘yes’}}}}

### 1.4 使用决策树执行分类

```def classify(inputTree,featLabels,testVec):
firstStr = inputTree.keys()[0]
secondDict = inputTree[firstStr]
featIndex = featLabels.index(firstStr)
key = testVec[featIndex]
valueOfFeat = secondDict[key]
if isinstance(valueOfFeat, dict):
classLabel = classify(valueOfFeat, featLabels, testVec)
else: classLabel = valueOfFeat
return classLabel```

{‘no surfacing’: {0: ‘no’, 1: {‘flippers’: {0: ‘no’, 1: ‘yes’}}}}，这是就是我们的inputTree，首先通过函数的第一句话得到它的第一个bestFeat，也就是‘no surfacing’，赋给了firstStr，secondDict就是‘no surfacing’的值，也就是 {0: ‘no’, 1: {‘flippers’: {0: ‘no’, 1: ‘yes’}}}，然后用index函数找到firstStr的标号，结果应该是0，根据下标，把测试向量的值赋给key，然后找到对应secondDict中的值，这里有一个isinstance函数，功能是第一个参数的类型等于后面参数的类型，则返回true，否则返回false，testVec列表第一位是1，则valueOfFeat的值是 {0: ‘no’, 1: ‘yes’}，是dict，则递归调用这个函数，再进行classify，知道不是字典，也就最后的结果了，其实就是将决策树过一遍，找到对应的labels罢了。

`isinstance?     #run it , you will see a below window which is used to introduce this function`

`trees.classify(myTree,labels,[1,0])`

‘no’

`trees.classify(myTree,labels,[1,1])`

‘yes’

### 1.5 决策树的存储

```def storeTree(inputTree,filename):
import pickle
fw = open(filename,'w')
pickle.dump(inputTree,fw)
fw.close()

def grabTree(filename):
import pickle
fr = open(filename)

`trees.storeTree(myTree,'classifierStorage.txt')   #run it ,store the tree`

`trees.grabTree('classifierStorage.txt')`

{‘no surfacing’: {0: ‘no’, 1: {‘flippers’: {0: ‘no’, 1: ‘yes’}}}} 决策树的构造部分结束了，下面介绍怎样绘制决策树

## 2. 使用Matplotlib注解绘制树形图

### 2.1 Matplotlib注解

Matplotlib提供了一个注解工具annotations,它可以在数据图形上添加文本注释。

```import matplotlib.pyplot as plt
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
createPlot.ax1.annotate(nodeTxt, xy=parentPt,  xycoords='axes fraction',\
xytext=centerPt, textcoords='axes fraction',\
va="center", ha="center", bbox=nodeType, arrowprops=arrow_args )

def createPlot1():
fig = plt.figure(1, facecolor='white')
fig.clf()
createPlot.ax1 = plt.subplot(111, frameon=False) #ticks for demo puropses
plotNode('a decision node', (0.5, 0.1), (0.1, 0.5), decisionNode)
plotNode('a leaf node', (0.8, 0.1), (0.3, 0.8), leafNode)
plt.show()```

`reset -f   #clear all the module and data`

`cd 桌面/machinelearninginaction/Ch03`

/home/fangyang/桌面/machinelearninginaction/Ch03

```import treePlotter
import matplotlib.pyplot as plt```

`treePlotter.createPlot1()`

### 2.2 构造注解树

```def getNumLeafs(myTree):
numLeafs = 0
firstStr = myTree.keys()[0]
secondDict = myTree[firstStr]
for key in secondDict.keys():
if type(secondDict[key]).__name__=='dict':#test to see if the nodes are dictonaires, if not they are leaf nodes
numLeafs += getNumLeafs(secondDict[key])
else:   numLeafs +=1
return numLeafs
def getTreeDepth(myTree):
maxDepth = 0
firstStr = myTree.keys()[0]
secondDict = myTree[firstStr]
for key in secondDict.keys():
if type(secondDict[key]).__name__=='dict':#test to see if the nodes are dictonaires, if not they are leaf nodes
thisDepth = 1 + getTreeDepth(secondDict[key])
else:   thisDepth = 1
if thisDepth > maxDepth: maxDepth = thisDepth
return maxDepth```

```def retrieveTree(i):
listOfTrees =[{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
{'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}}
]
return listOfTrees[i]```

```def plotMidText(cntrPt, parentPt, txtString):
xMid = (parentPt[0]-cntrPt[0])/2.0 + cntrPt[0]
yMid = (parentPt[1]-cntrPt[1])/2.0 + cntrPt[1]
createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30)
def plotTree(myTree, parentPt, nodeTxt):#if the first key tells you what feat was split on
numLeafs = getNumLeafs(myTree)  #this determines the x width of this tree
depth = getTreeDepth(myTree)
firstStr = myTree.keys()[0]     #the text label for this node should be this
cntrPt = (plotTree.xOff + (1.0 + float(numLeafs))/2.0/plotTree.totalW, plotTree.yOff)
plotMidText(cntrPt, parentPt, nodeTxt)
plotNode(firstStr, cntrPt, parentPt, decisionNode)
secondDict = myTree[firstStr]
plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD
for key in secondDict.keys():
if type(secondDict[key]).__name__=='dict':#test to see if the nodes are dictonaires, if not they are leaf nodes
plotTree(secondDict[key],cntrPt,str(key))        #recursion
else:   #it's a leaf node print the leaf node
plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW
plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD
#if you do get a dictonary you know it's a tree, and the first element will be another dict
def createPlot(inTree):
fig = plt.figure(1, facecolor='white')
fig.clf()
axprops = dict(xticks=[], yticks=[])
createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
plotTree.totalW = float(getNumLeafs(inTree))
plotTree.totalD = float(getTreeDepth(inTree))
plotTree.xOff = -0.5/plotTree.totalW; plotTree.yOff = 1.0;
plotTree(inTree, (0.5,1.0), '')
plt.show()```

`cd 桌面/machinelearninginaction/Ch03`

/home/fangyang/桌面/machinelearninginaction/Ch03

```import treePlotter
myTree = treePlotter.retrieveTree(0)
treePlotter.createPlot(myTree)```

```myTree['no surfacing'][3] = 'maybe'
treePlotter.createPlot(myTree)```

## 3 使用决策树预测眼睛类型

```import trees
lensesTree = trees.createTree(lenses,lensesLabels)
fr = open('lenses.txt')
lensesTree = trees.createTree(lenses,lensesLabels)
lenses = [inst.strip().split('\t') for inst in fr.readlines()]
lensesLabels = ['age' , 'prescript' , 'astigmatic','tearRate']
lensesTree = trees.createTree(lenses,lensesLabels)```

`lensesTree`

{‘tearRate’: {‘normal’: {‘astigmatic’: {‘no’: {‘age’: {‘pre’: ‘soft’, ‘presbyopic’: {‘prescript’: {‘hyper’: ‘soft’, ‘myope’: ‘no lenses’}}, ‘young’: ‘soft’}}, ‘yes’: {‘prescript’: {‘hyper’: {‘age’: {‘pre’: ‘no lenses’, ‘presbyopic’: ‘no lenses’, ‘young’: ‘hard’}}, ‘myope’: ‘hard’}}}}, ‘reduced’: ‘no lenses’}}

`treePlotter.createPlot(lensesTree)`