当前位置：首页 > C++ > 正文

C++语言决策树实现方法（从零开始构建你的第一个决策树模型）

主机测评网
C++
2025-12-11
611

在机器学习领域，决策树是一种非常直观且易于理解的分类与回归方法。本教程将手把手教你如何使用C++语言实现一个简单的决策树，即使你是编程小白，也能轻松上手！我们将围绕C++决策树实现、决策树算法C++、C++机器学习教程以及决策树代码示例这四个核心关键词展开。

什么是决策树？

决策树是一种树形结构模型，其中每个内部节点表示一个特征上的判断，每个分支代表一个判断结果，而每个叶节点代表一种类别（分类任务）或数值（回归任务）。它的优点包括：可解释性强、无需数据预处理、能处理数值型和类别型数据等。

C++语言决策树实现方法（从零开始构建你的第一个决策树模型） C++决策树实现决策树算法C++ C++机器学习教程决策树代码示例第1张

实现思路概述

我们将实现一个用于二分类问题的简单决策树，基于信息增益（Information Gain）选择最佳分割特征。主要步骤包括：

定义数据结构（样本、节点）
计算信息熵（Entropy）
计算信息增益并选择最优特征
递归构建树
预测新样本

第一步：定义基本数据结构

我们用 vector<vector<double>> 表示特征矩阵，用 vector<int> 表示标签（0 或 1）。

struct TreeNode {    int feature_index = -1;      // 使用的特征索引，-1 表示叶节点    double threshold = 0.0;      // 分割阈值    int prediction = 0;          // 叶节点的预测类别    TreeNode* left = nullptr;    TreeNode* right = nullptr;};using Dataset = std::vector<std::vector<double>>;using Labels = std::vector<int>;

第二步：计算信息熵

信息熵衡量数据集的“混乱程度”。公式为：
\( H(S) = -\sum_{i=1}^{n} p_i \log_2 p_i \)

double calculateEntropy(const Labels& labels) {    if (labels.empty()) return 0.0;        int count0 = 0, count1 = 0;    for (int label : labels) {        if (label == 0) count0++;        else count1++;    }        double total = labels.size();    double p0 = count0 / total;    double p1 = count1 / total;        double entropy = 0.0;    if (p0 > 0) entropy -= p0 * log2(p0);    if (p1 > 0) entropy -= p1 * log2(p1);        return entropy;}

第三步：按特征分割数据集

给定一个特征索引和阈值，将数据分为左右两部分。

std::pair<std::vector<int>, std::vector<int>> splitDataset(    const Dataset& X, const Labels& y,    int feature_index, double threshold) {        std::vector<int> left_indices, right_indices;    for (size_t i = 0; i < X.size(); ++i) {        if (X[i][feature_index] <= threshold) {            left_indices.push_back(i);        } else {            right_indices.push_back(i);        }    }    return {left_indices, right_indices};}

第四步：构建决策树（递归）

我们设定最大深度和最小样本数作为停止条件。

TreeNode* buildTree(const Dataset& X, const Labels& y,                   int depth = 0, int max_depth = 5,                   int min_samples_split = 2) {    // 停止条件    if (y.empty() || depth >= max_depth ||         static_cast<int>(y.size()) < min_samples_split) {        TreeNode* leaf = new TreeNode();        // 多数投票        int count0 = std::count(y.begin(), y.end(), 0);        int count1 = y.size() - count0;        leaf->prediction = (count1 > count0) ? 1 : 0;        return leaf;    }    double best_gain = -1;    int best_feature = -1;    double best_threshold = 0.0;    double parent_entropy = calculateEntropy(y);    // 遍历所有特征    for (size_t f = 0; f < X[0].size(); ++f) {        // 收集该特征的所有唯一值作为候选阈值        std::set<double> thresholds;        for (const auto& row : X) thresholds.insert(row[f]);                for (double t : thresholds) {            auto [left_idx, right_idx] = splitDataset(X, y, f, t);            if (left_idx.empty() || right_idx.empty()) continue;            // 构建左右子集标签            Labels left_y, right_y;            for (int idx : left_idx) left_y.push_back(y[idx]);            for (int idx : right_idx) right_y.push_back(y[idx]);            double left_entropy = calculateEntropy(left_y);            double right_entropy = calculateEntropy(right_y);            double weight_left = static_cast<double>(left_y.size()) / y.size();            double weight_right = static_cast<double>(right_y.size()) / y.size();            double gain = parent_entropy -                           (weight_left * left_entropy + weight_right * right_entropy);            if (gain > best_gain) {                best_gain = gain;                best_feature = f;                best_threshold = t;            }        }    }    // 如果没有有效分割，返回叶节点    if (best_gain <= 0) {        TreeNode* leaf = new TreeNode();        int count0 = std::count(y.begin(), y.end(), 0);        int count1 = y.size() - count0;        leaf->prediction = (count1 > count0) ? 1 : 0;        return leaf;    }    // 分割数据    auto [left_idx, right_idx] = splitDataset(X, y, best_feature, best_threshold);    Dataset X_left, X_right;    Labels y_left, y_right;    for (int idx : left_idx) {        X_left.push_back(X[idx]);        y_left.push_back(y[idx]);    }    for (int idx : right_idx) {        X_right.push_back(X[idx]);        y_right.push_back(y[idx]);    }    // 递归构建子树    TreeNode* node = new TreeNode();    node->feature_index = best_feature;    node->threshold = best_threshold;    node->left = buildTree(X_left, y_left, depth + 1, max_depth, min_samples_split);    node->right = buildTree(X_right, y_right, depth + 1, max_depth, min_samples_split);    return node;}

第五步：预测函数

int predict(const TreeNode* root, const std::vector<double>& sample) {    const TreeNode* current = root;    while (current->feature_index != -1) {        if (sample[current->feature_index] <= current->threshold) {            current = current->left;        } else {            current = current->right;        }    }    return current->prediction;}

完整使用示例

#include <iostream>#include <vector>#include <set>#include <cmath>#include <algorithm>// 上述所有函数和结构体定义...int main() {    // 示例数据：2个特征，4个样本    Dataset X = {{2.771244718, 1.784783929},                 {1.728571309, 1.169761413},                 {3.678319846, 2.812813570},                 {3.961043357, 2.619950320}};    Labels y = {0, 0, 1, 1};    TreeNode* tree = buildTree(X, y);    // 测试预测    std::vector<double> test_sample = {3.0, 2.0};    int result = predict(tree, test_sample);    std::cout << "预测结果: " << result << std::endl; // 输出应为 0 或 1    // 注意：实际项目中需释放内存（略）    return 0;}