AI 决策树ID3 代码(C++)

源代码工程文件(vs2005) http://d.download.csdn.net/down/1018461/cctt_1

过去在网上找了段代码,发现写的代码要改些地方,而且也想顺便练习下自己的c++编码。

首先我要建立一个真正的树形结构。于是使用了自己过去的 GeneralTree.h

(当然这里还是改动些GeneralTree的代码例如增添了些函数,另外把有些私有函数变成了公有函数)。

训练文本格式如下:并命名为decision2.txt 并发在自己的工程目录下。当然你也可以改改相关源代码
概念  颜色 形状 轻重
苹果   红   球   一般
苹果   绿   球   一般
香蕉   黄   弯月  一般
草莓   红   球    轻
草莓   绿   球    轻
西瓜   绿   椭球  重
西瓜   绿   球    重
桔子   桔黄 椭球  轻

测试格式文本格式如下:命名为test.txt并放在工程目录下(试试改改源代码)

颜色 形状 轻重
红   球   一般
绿   球   一般
黄   弯月  一般

这里应该考虑各个类分开的。不过为了看起来方便,就合在一起了。
下面是具体代码:

/* created by chico chen * date 2009/02/02 * 如需转载注明出处 */ #include “stdafx.h”
#include #include #include #include
#include #include #include #include “D://Tools//not
Finished//TreeTest//TreeTest//GeneralTree.h” using namespace std; // this
class is for computing attribute entropy class AttribDiff { public: string
attribName; // 属性名 map<string,int> attribNum; //具体属性和个数对
map<string,map<string,int>> typeNumber; // 第一个string为具体属性名,第二个为类型, //
int是类型在具体属性中的个数. // 例如:是否可见 类型 形状 // 1 西瓜 圆 // 1 冬瓜 扁 // 0 橘子 圆 //
其中具体属性为圆,类型为西瓜等个数为形状为圆的类型为西瓜的个数 AttribDiff(string& attribName) {
this->attribName = attribName; } // in order to computer entropy of an
attribute double AttribDifferComputer(vector<vector> infos,int
i_attrib,int i_types, vector& visible) { double probability = 0; double
entropy = 0; double attribG = 0; map<string,int> temp; int tempNum = 0;
for(int i =0 ; i < infos.size(); i++) { if(visible[i] != 0 ) { tempNum =
attribNum[infos[i][i_attrib]]; attribNum[infos[i][i_attrib]] = tempNum; temp
= typeNumber[infos[i][i_attrib]]; tempNum = temp[infos[i][i_types]];
temp[infos[i][i_types]] = tempNum; typeNumber[infos[i][i_attrib]] = temp; }
} map<string,int>::iterator i_number; map<string,int>::iterator i_type;
for(i_number = attribNum.begin(); i_number != attribNum.end(); i_number
) {
probability = (*i_number).second/(double)infos.size(); cout
<<(*i_number).first <<“概率为:”<< probability<<endl; entropy = 0; for(i_type =
typeNumber[(*i_number).first].begin(); i_type !=
typeNumber[(*i_number).first].end(); i_type
) { entropy +=
ComputerEntropyHelp((*i_type).second/(double)(*i_number).second); } attribG +=
(-1)probability * entropy; } return attribG; } // compute the entropy double
ComputerEntropyHelp(double pi) { return pi
log(pi)/log((double)2); } }; //
this class is create a data struct for general tree node class NodeInfo {
public: // 颜色 // 红 // 蓝 string attribName; // the attribute name, such as 颜色
vector detailAttrib; // all of detail attributes under one of
attribute name, for example, 红 NodeInfo() { attribName = “”; } NodeInfo(string
& attribName) { this->attribName = attribName; } NodeInfo(NodeInfo & ni) {
this->attribName = ni.attribName; this->detailAttrib = ni.detailAttrib; } //
add detail attributes in NodeInfo void NodeDetailInfoAdd(string & detailA) {
if(!CheckIsHas(detailA)) { this->detailAttrib.push_back(detailA); } } // If
detail attribute is in the detailAttrib list, return true; // else return
false; bool CheckIsHas(string & name) { for(int i = 0; i <
detailAttrib.size(); i++) { if(strcmp(name.c_str(),detailAttrib[i].c_str())
==0) { // the same attribute return true; } } return false; } // this is print
control for printing NodeInfo static void Print(NodeInfo& info) { cout <<
info.attribName<< “(”; for(int i = 0; i < info.detailAttrib.size() ; i++) {
cout << info.detailAttrib[i]<<" "; } cout << “)/n”; } }; // this class is
decision tree class DT { protected: const string filename; // the data file
name vector<vector> infos; // the array for storing information
vector attribs; // the array for storing the attributes
GeneralTreegt; // the general tree for storing the decision tree
const int START; // which column is the start attribute, except the type
column const int I_TYPE;// the column index of type const int MAX_ENTROPY; //
set an max entropy to find the minimal entropy private: // to help print void
PrintHelp(int helpPrint) { for(int i = 0; i < helpPrint; i++) { cout << “…”;
} } // to find the index of the attribName in attribs array int Find(string&
attribName,vector& attribs) { for(int i = 0; i < attribs.size(); i++)
{ if(strcmp(attribName.c_str(),attribs[i].c_str()) == 0) { // the same return
i; } } return -1; } // this function is used for detecting if the arithmetic
is over bool CheckOver(vector& visible,string& type) { map<string,int>
types; int temp = 0; for(int i = 0; i < infos.size(); i++) { if(visible[i] !=
0) { type = infos[i][I_TYPE]; temp = types[infos[i][I_TYPE]]; if(temp == 0) {
types[infos[i][I_TYPE]] = 1; } if(types.size() > 1) { return false; // there
are more than one types } } } return true; // there is only one type } // to
create a Decision Tree void DTCreate(GeneralTreeNode *parent,
vector visible,vector visibleA, int i_attrib,string& detailA, int
helpPrint) { if(i_attrib >= START) { for(int i = 0; i < infos.size(); i++) {
if(strcmp(infos[i][i_attrib].c_str(),detailA.c_str()) !=0) { // not same
detail attribute visible[i] = 0; } } } string type = “”;
if(CheckOver(visible,type)) { // the arithmetic is over and add the type node
into the general tree NodeInfo n(type); GeneralTreeNode * node = new
GeneralTreeNode(n); gt.Insert(node,parent); PrintHelp(helpPrint);
cout << “decision type:”<<n.attribName<<endl; return; } map<string,double>
attribGs; // this is for deciding which attrib should be used for(int i =
START; i < attribs.size(); i++) { // iterate attribs if(visibleA[i] != 0) {
AttribDiff ad(attribs[i]); attribGs[attribs[i]] =
ad.AttribDifferComputer(infos,i,I_TYPE,visible); cout <<attribs[i] <<“的G为:”<<
attribGs[attribs[i]]<<endl; } } // to find the decision attribute double min =
MAX_ENTROPY; string attributeName; for(map<string,double>::iterator i =
attribGs.begin(); i != attribGs.end(); i++) { if(min >= (*i).second) {
attributeName = (*i).first; min = (*i).second; } } NodeInfo n(attributeName);
int i_max = Find(attributeName,attribs); for(int i = 0; i<infos.size() ; i++)
{ n.NodeDetailInfoAdd(infos[i][i_max]); } GeneralTreeNode * node =
new GeneralTreeNode(n); gt.Insert(node,parent); visibleA[i_max] = 0;
PrintHelp(helpPrint); cout << “choose attribute:”<< attributeName<<endl;
for(int i = 0; i < node->data.detailAttrib.size(); i++) {
PrintHelp(helpPrint); cout << “go into the
branch:”<data.detailAttrib[i]<<endl; // go to every branch to decision
DTCreate(node,visible,visibleA,i_max,node->data.detailAttrib[i],helpPrint+1);
} } public: // 要注意的一点是这里的decision2.txt要放在工程目录下。当然如果你愿意可以写绝对路径 // 注意文件的格式: //
首先一列为类别,然后是各个属性 // 例如: 类型 形状 // 西瓜 圆 // 冬瓜 扁 // 橘子 圆
DT():filename(“decision2.txt”),START(1),I_TYPE(0),MAX_ENTROPY(10000) {
GetInfo(attribs,infos,filename); DTCreate(); } // this function is used for
read data from the file // and create the attribute array and all information
array // post: attribs has at least one element // infos has at least one
element // pre: filename is not empty and the file is exist void
GetInfo(vector& attribs,vector<vector>& infos,const string&
filename) { ifstream read(filename.c_str()); int start = 0; int end = 0;
string info = “”; getline(read,info); istringstream iss(info); string attrib;
while(iss >> attrib) { attribs.push_back(attrib); } while(true) { info = “”;
getline(read,info); if(info == “” || info.length() <= 1) { break; }
vector infoline; istringstream stream(info); while(stream >> attrib) {
infoline.push_back(attrib); } infos.push_back(infoline); } read.close(); } //
create the DT void DTCreate() { vector visible(infos.size(),1);
vector visibleA(attribs.size(),1); // to judge which attribute is useless
string temp = “”; DTCreate(NULL,visible,visibleA,START-1,temp,0); } // print
the DT void Print() { gt.LevelPrint(NodeInfo::Print); } void Judge(const
string& testFilename,vector& types,const string& testResultFileName) {
vector attribs_test; vector<vector> infos_test;
GetInfo(attribs_test,infos_test,testFilename);
if(!CheckFileFormat(attribs_test)) { throw “file format error”; }
GeneralTreeNode * root = gt.GetRoot(); for(int i = 0; i <
infos_test.size(); i++) {
types.push_back(JudgeType(root,infos_test[i],attribs_test)); }
WriteTestTypesInfo(testResultFileName,types); } void WriteTestTypesInfo(const
string& filename, vector& types) { ofstream out(filename.c_str()); out
<< “类别”<<endl; for(int i = 0 ; i < types.size(); i++) { out << types[i]<<endl;
} out.close(); } string JudgeType(GeneralTreeNode * node,
vector& info,vector& attribs_test) {
if(gt.GetChildNodeNum(node) == 0) { return node->getData().attribName; } int
index = Find(node->getData().attribName,attribs_test); int branch_index =
Find(info[index],node->getData().detailAttrib); if(branch_index == -1) { // is
not find this detail attribute in this node detailAttrib // there are two way
to deal with this situation // 1. every branch has possibility to choose // 2.
no such type and can not judge // the first solution make the correct ratio
low // the second solution has no fault-tolerance. // and here I choose the
second solution. // if I have more free time later, I will write the first
solution throw “no such type”; } GeneralTreeNode * childNode =
gt.GetAChild(node,branch_index); return JudgeType(childNode,
info,attribs_test); } bool CheckFileFormat(vector& attribs_test) {
bool isCorrect = true; for(int j = 0; j < attribs_test.size(); j++) {
if(Find(attribs_test[j],attribs) == -1) { isCorrect = false; } }
if(attribs_test.size() == attribs.size() - 1) { isCorrect = isCorrect && true;
} else { isCorrect = false; } return isCorrect; } };

这里的main函数这样写(自己使用的VS2005):

int _tmain(int argc, _TCHAR* argv[]) { DT dt; //dt.Print(); string testFile =
“test.txt”; string testResult = “testResult.txt”; vectortypes;
dt.Judge(testFile,types,testResult); return 0; }

自己感觉DT 的注释比较详细,所以在我的blog中就不再做太多的解释。另外这段代码会将测试结果放在工程目录下的testResult.txt中。

另外在控制台上会有生成决策树ID3的相关相关的信息显示,例如:

红概率为:0.25
黄概率为:0.125
桔黄概率为:0.125
绿概率为:0.5
颜色的G为:1
球概率为:0.625
椭球概率为:0.25
弯月概率为:0.125
形状的G为:1.20121
轻概率为:0.375
一般概率为:0.375
重概率为:0.25
轻重的G为:0.688722
choose attribute:轻重
go into the branch:一般
红概率为:0.125
黄概率为:0.125
绿概率为:0.125
颜色的G为:0
球概率为:0.25
弯月概率为:0.125
形状的G为:0
…choose attribute:颜色
…go into the branch:红
…decision type:苹果
…go into the branch:绿
…decision type:苹果
…go into the branch:黄
…decision type:香蕉
…go into the branch:桔黄
…decision type:
go into the branch:轻
红概率为:0.125
桔黄概率为:0.125
绿概率为:0.125
颜色的G为:0
球概率为:0.25
椭球概率为:0.125
形状的G为:0
…choose attribute:颜色
…go into the branch:红
…decision type:草莓
…go into the branch:绿
…decision type:草莓
…go into the branch:黄
…decision type:
…go into the branch:桔黄
…decision type:桔子
go into the branch:重
…decision type:西瓜

这一段信息是什么意思呢?

红概率为:0.25
黄概率为:0.125
桔黄概率为:0.125
绿概率为:0.5
颜色的G为:1

红,黄,桔黄,绿的概率是颜色的具体属性。这里没有把entropy打印出来。如果此段代码被中科院的师弟师妹有幸看到,

你们可以在AttribDifferComputer()函数中添加几行代码就可以把每一个entropy打印出来。反正老师也会让你们看代码,这里就当作作业题吧。
(另外老师第十章机器学习ppt上的决策树的这个例子计算结果有错误。如果你认真计算过的话)颜色G的含义是颜色G的决策值,决策值越小,选择此属性的概率就越大。

那决策树是什么样子的呢?

choose attribute:轻重
go into the branch:一般

…choose attribute:颜色
…go into the branch:红


看看上面的这些.这里代表根节点是“轻重”,然后进入“一般”分支,然后进入“一般”分支的节点为颜色…然后进入”红“分支.这里一定要注意”…“,相等的"…
”代表树的相同的层次。

做出这个Decision Tree 的ID3代码主要是为了学弟学妹们在考试中测试用的。因为我只是测试了老师ppt中的例子,不保证对于所有的例子都正确。而且老
师出的考试题比较变态(属性十个左右)…如果手工计算应该需要一个小时左右的时间。

当初后悔没有先编一个程序。祝各位考试顺利…(我想我这段代码可能会在考试之前被搜到)。

同时提醒大家一点, ID3也不是什么很好的算法。当两个属性的G值一致时,如果它并不能给出一个更好的判断标准。而且如果采用顺序选择很有可能生成一个非最小决策树
。这点还值得研究一下。