当前位置：首页 > C++ > 正文

C++随机森林基础实现（从零开始构建你的第一个随机森林模型）

主机测评网
C++
2025-12-04
354

在机器学习领域，随机森林（Random Forest）是一种强大且易于理解的集成学习算法。它通过组合多棵决策树来提高预测准确性并降低过拟合的风险。本教程将带你使用 C++ 从零开始实现一个简化版的随机森林模型，即使你是编程新手，也能一步步跟上！

C++随机森林基础实现（从零开始构建你的第一个随机森林模型） C++随机森林随机森林实现 C++机器学习随机森林入门教程第1张

什么是随机森林？

随机森林是一种集成学习方法，它通过构建多个决策树并取其预测结果的平均值（回归）或多数投票（分类）来做出最终预测。每棵树在训练时使用数据集的一个随机子集（自助采样，Bootstrap）和特征的随机子集，从而增强模型的泛化能力。

为什么用 C++ 实现？

虽然 Python 是机器学习的主流语言，但 C++ 在性能要求高的场景（如嵌入式系统、高频交易、游戏 AI）中具有显著优势。掌握随机森林的 C++ 实现原理，有助于你深入理解算法的底层逻辑，并为高性能应用打下基础。

项目结构概览

我们将实现以下核心组件：

DecisionTree：单棵决策树
RandomForest：管理多棵树的森林
数据结构：样本（Sample）、数据集（Dataset）

第一步：定义基本数据结构

首先，我们定义样本和数据集的结构：

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <memory>
#include <random>
#include <unordered_map>
#include <utility>
#include <vector>

using namespace std;

// One sample: a feature vector plus its class label.
struct Sample {
    vector<double> features;
    int label; // classification is assumed, so the label is an integer
};

// A dataset is simply a list of samples.
using Dataset = vector<Sample>;

第二步：实现简单的决策树

为了简化，我们使用一个非常基础的决策树——每次随机选择一个特征和阈值进行分割（实际中应使用信息增益等指标，但这里聚焦于森林结构）。

class DecisionTree {private:    int feature_index;    double threshold;    bool is_leaf;    int prediction;    unique_ptr<DecisionTree> left, right;    // 随机数生成器    mt19937 rng{random_device{}()};public:    // 构造函数：训练树    DecisionTree(const Dataset& data, int max_depth = 5) {        build(data, max_depth);    }    void build(const Dataset& data, int depth) {        if (data.empty() || depth == 0) {            is_leaf = true;            prediction = get_majority_label(data);            return;        }        // 简化：随机选一个特征和阈值        uniform_int_distribution<int> feat_dist(0, data[0].features.size() - 1);        feature_index = feat_dist(rng);        uniform_real_distribution<double> thresh_dist(0.0, 1.0);        threshold = thresh_dist(rng);        Dataset left_data, right_data;        for (const auto& sample : data) {            if (sample.features[feature_index] <= threshold)                left_data.push_back(sample);            else                right_data.push_back(sample);        }        if (left_data.empty() || right_data.empty()) {            is_leaf = true;            prediction = get_majority_label(data);            return;        }        is_leaf = false;        left = make_unique<DecisionTree>(left_data, depth - 1);        right = make_unique<DecisionTree>(right_data, depth - 1);    }    int get_majority_label(const Dataset& data) {        if (data.empty()) return 0;        unordered_map<int, int> count;        for (const auto& s : data) count[s.label]++;        return max_element(count.begin(), count.end(),            [](const auto& a, const auto& b) { return a.second < b.second; })->first;    }    int predict(const vector<double>& features) const {        if (is_leaf) return prediction;        if (features[feature_index] <= threshold)            return left->predict(features);        else            return right->predict(features);    }};

第三步：构建随机森林

现在，我们创建 RandomForest 类，它将训练多棵决策树，并对预测结果进行投票。

class RandomForest {private:    vector<unique_ptr<DecisionTree>> trees;    mt19937 rng{random_device{}()};public:    // n_trees: 树的数量    // max_depth: 每棵树的最大深度    void train(const Dataset& data, int n_trees = 10, int max_depth = 5) {        for (int i = 0; i < n_trees; ++i) {            // 自助采样：有放回地随机抽取样本            Dataset bootstrap;            uniform_int_distribution<int> index_dist(0, data.size() - 1);            for (size_t j = 0; j < data.size(); ++j) {                int idx = index_dist(rng);                bootstrap.push_back(data[idx]);            }            trees.push_back(make_unique<DecisionTree>(bootstrap, max_depth));        }    }    int predict(const vector<double>& features) const {        unordered_map<int, int> votes;        for (const auto& tree : trees) {            int pred = tree->predict(features);            votes[pred]++;        }        return max_element(votes.begin(), votes.end(),            [](const auto& a, const auto& b) { return a.second < b.second; })->first;    }};

第四步：测试你的随机森林

让我们用一个小数据集测试模型：

int main() {    // 创建简单数据集：2个特征，2个类别    Dataset data = {        {{0.1, 0.2}, 0},        {{0.2, 0.3}, 0},        {{0.8, 0.9}, 1},        {{0.9, 0.8}, 1},        {{0.15, 0.25}, 0},        {{0.85, 0.88}, 1}    };    RandomForest rf;    rf.train(data, 5, 3); // 5棵树，最大深度3    vector<double> test_sample = {0.82, 0.87};    int result = rf.predict(test_sample);    cout << "预测类别: " << result << endl; // 应输出 1    return 0;}