下面是使用Python编写决策树分类算法的示例代码:

import numpy as np

def entropy(y):
    """Return the Shannon entropy (in bits) of the label collection ``y``.

    Each distinct label's relative frequency ``p`` contributes
    ``-p * log2(p)`` to the result.
    """
    _, label_counts = np.unique(y, return_counts=True)
    p = label_counts / len(y)
    return -np.sum(p * np.log2(p))

def gini_index(y):
    """Return the Gini impurity of the label collection ``y``.

    Computed as one minus the sum of squared relative label frequencies.
    """
    _, label_counts = np.unique(y, return_counts=True)
    p = label_counts / len(y)
    return 1 - np.sum(p * p)

def split_dataset(X, y, feature_index, threshold):
    """Partition the samples by comparing one feature against a threshold.

    Rows whose value in column ``feature_index`` is ``<= threshold`` form the
    left partition; rows whose value is ``> threshold`` form the right one.

    Returns:
        The tuple ``(left_X, left_y, right_X, right_y)``.
    """
    column = X[:, feature_index]
    left_rows = (column <= threshold).nonzero()[0]
    right_rows = (column > threshold).nonzero()[0]
    return X[left_rows], y[left_rows], X[right_rows], y[right_rows]

def get_best_split(X, y, criterion='gini'):
    """Find the feature/threshold pair with the lowest weighted child impurity.

    Every distinct value of every feature column is tried as a threshold
    (samples with ``value <= threshold`` go left, ``value > threshold`` go
    right, the same rule ``split_dataset`` applies). A candidate is scored by
    the size-weighted average impurity of the two children; lower is better
    for both Gini impurity and entropy (minimising weighted entropy is the
    same as maximising information gain).

    Args:
        X: 2-D array of shape (n_samples, n_features).
        y: 1-D array of labels aligned with the rows of ``X``.
        criterion: ``'gini'`` or ``'entropy'``.

    Returns:
        ``(feature_index, threshold)`` of the best split, or ``(None, None)``
        when no threshold separates the samples into two non-empty groups.
        Ties keep the first candidate found (lowest feature index, then
        lowest threshold).

    Raises:
        ValueError: If ``criterion`` is not ``'gini'`` or ``'entropy'``.
    """
    if criterion == 'gini':
        impurity = gini_index
    elif criterion == 'entropy':
        impurity = entropy
    else:
        raise ValueError(
            f"criterion must be 'gini' or 'entropy', got {criterion!r}"
        )

    n_samples = len(y)
    best_score = np.inf
    best_feature_index, best_threshold = None, None
    for feature_index in range(X.shape[1]):
        column = X[:, feature_index]
        for threshold in np.unique(column):
            goes_left = column <= threshold
            goes_right = column > threshold
            # A split that leaves one side empty separates nothing; accepting
            # it would make the tree builder recurse on the same data forever.
            if not goes_left.any() or not goes_right.any():
                continue
            # Only the labels are needed to score a candidate, so X is not
            # sliced here.
            left_y, right_y = y[goes_left], y[goes_right]
            score = (
                len(left_y) * impurity(left_y)
                + len(right_y) * impurity(right_y)
            ) / n_samples
            if score < best_score:
                best_score = score
                best_feature_index, best_threshold = feature_index, threshold
    return best_feature_index, best_threshold

def majority_vote(y):
    """Return the label that occurs most often in ``y``.

    When several labels share the highest count, the one that sorts first
    wins, because ``np.unique`` returns labels in sorted order and
    ``argmax`` picks the first maximum.
    """
    labels, label_counts = np.unique(y, return_counts=True)
    winner = label_counts.argmax()
    return labels[winner]

class DecisionTreeClassifier:
    """Binary decision tree classifier grown by recursive threshold splits.

    After ``fit`` the tree is stored in ``tree_`` as nested dicts. An
    internal node has the keys ``'feature_index'``, ``'threshold'``,
    ``'left'`` and ``'right'``; a leaf has the single key ``'label'``.

    Args:
        max_depth: Maximum number of split levels, or ``None`` to keep
            splitting until every node is pure or cannot be split.
        criterion: Impurity measure passed to ``get_best_split``
            (``'gini'`` or ``'entropy'``).
    """

    def __init__(self, max_depth=None, criterion='gini'):
        self.max_depth = max_depth
        self.criterion = criterion

    def fit(self, X, y):
        """Build the tree from the training data and return ``self``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of labels, one per row of ``X``.

        Raises:
            ValueError: If ``X`` is not 2-D, if ``X`` and ``y`` differ in
                length, or if there are no samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")
        if len(y) == 0:
            raise ValueError("cannot fit a decision tree on an empty dataset")
        self.tree_ = self._build_tree(X, y, depth=0)
        return self

    def _build_tree(self, X, y, depth):
        """Recursively build the subtree for the samples ``(X, y)``."""
        # All samples share one class: this node is a pure leaf.
        if len(np.unique(y)) == 1:
            return {'label': y[0]}

        # Depth limit reached: leaf labelled with the most common class.
        # ``>=`` (not ``==``) so an out-of-range limit still terminates.
        if self.max_depth is not None and depth >= self.max_depth:
            return {'label': majority_vote(y)}

        best_feature_index, best_threshold = get_best_split(X, y, criterion=self.criterion)
        # No usable split (e.g. identical rows carrying different labels).
        if best_feature_index is None:
            return {'label': majority_vote(y)}

        left_X, left_y, right_X, right_y = split_dataset(X, y, best_feature_index, best_threshold)
        # A split with an empty side separates nothing; recursing on it would
        # repeat the same data indefinitely, so stop with a majority leaf.
        if len(left_y) == 0 or len(right_y) == 0:
            return {'label': majority_vote(y)}

        return {
            'feature_index': best_feature_index,
            'threshold': best_threshold,
            'left': self._build_tree(left_X, left_y, depth + 1),
            'right': self._build_tree(right_X, right_y, depth + 1)
        }

    def _predict_single(self, x):
        """Walk the tree from the root to a leaf for one sample."""
        node = self.tree_
        while 'label' not in node:
            if x[node['feature_index']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node['label']

    def predict(self, X):
        """Return an array holding the predicted label for each row of ``X``."""
        return np.array([self._predict_single(x) for x in X])
 

使用示例:

# Create a decision tree classifier instance
clf = DecisionTreeClassifier(max_depth=3, criterion='gini')

# Training data: one row per sample (two features each) and its class label
X_train = np.array([[5, 2], [4, 1], [1, 2], [3, 2], [2, 1], [6, 1], [7, 3]])
y_train = np.array([0, 0, 1, 0, 1, 1, 1])

# Train the model
clf.fit(X_train, y_train)

# Test data
X_test = np.array([[4, 2], [6, 2]])

# Predict
y_pred = clf.predict(X_test)

# The tree splits on feature 0 at <= 2 and then at <= 5, so 4 -> class 0, 6 -> class 1.
print(y_pred)  # prints [0 1]
 

在Python中,您可以使用Scikit-learn库来实现决策树分类算法。以下是一个简单的例子,展示了如何使用Scikit-learn来创建和训练一个决策树分类器。


首先,确保您已经安装了Scikit-learn库。如果没有安装,可以使用pip安装:
```bash
pip install scikit-learn
```
然后,您可以按照以下步骤使用Scikit-learn创建决策树分类器:
```python
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a decision tree classifier limited to depth 2
clf = DecisionTreeClassifier(max_depth=2)

# Train the model on the training set
clf.fit(X_train, y_train)
# Predict labels for the test set
y_pred = clf.predict(X_test)
# Compute the accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)
print(f"模型准确率:{accuracy:.2f}")
```
在这个例子中,我们使用的是鸢尾花数据集,它是一个经典的数据集,用于分类任务。我们首先加载数据集,然后将其分为训练集和测试集,并创建一个`DecisionTreeClassifier`实例。我们设置了`max_depth`为2,以限制决策树的最大深度。

接着,我们使用训练集来训练模型,并使用测试集来测试模型的性能。最后,我们使用`accuracy_score`函数来计算模型的准确率。
请注意,这个例子是非常基础的,实际应用中可能需要调整决策树的参数,并进行更复杂的模型评估和调优。

Logo

腾讯云面向开发者汇聚海量精品云计算使用和开发经验,营造开放的云计算技术生态圈。

更多推荐