当前位置：首页 > C > 正文

C语言外部排序详解（手把手教你用C语言实现大数据外部排序算法）

主机测评网
C
2025-12-12
318

在处理大规模数据时，如果数据量超过了计算机内存的容量，我们就无法一次性将所有数据加载到内存中进行排序。这时候就需要使用C语言外部排序算法。本教程将从零开始，详细讲解如何使用C语言实现外部排序，即使你是编程小白也能轻松理解。

什么是外部排序？

外部排序是指当待排序的数据量太大，无法全部装入内存时，利用外部存储（如硬盘）辅助完成排序的一种算法。最常用的外部排序方法是多路归并排序，它分为两个主要阶段：

生成初始归并段（Run）：将大文件分块读入内存，对每一块进行内部排序（如快速排序），然后写回磁盘，形成多个有序的小文件（称为归并段）。
多路归并：将这些有序的小文件合并成一个最终的有序大文件。

C语言外部排序详解（手把手教你用C语言实现大数据外部排序算法） C语言外部排序外部排序算法 C语言大数据排序磁盘排序算法第1张

为什么需要C语言外部排序？

随着数据爆炸式增长，很多应用场景（如日志分析、数据库索引构建、科学计算）都需要处理GB甚至TB级别的数据。此时，传统的内部排序（如qsort）因内存限制而失效。通过外部排序算法，我们可以高效地处理这些超大规模数据集。

实现步骤详解

下面我们用C语言一步步实现一个简化版的外部排序程序。

第一步：生成初始归并段

假设我们有一个大文件 input.txt，里面包含大量整数。我们将每次读取固定数量的整数（比如1000个）到内存，用快速排序排好后，写入临时文件 run0.tmp, run1.tmp 等。

#include <stdio.h>#include <stdlib.h>#define BUFFER_SIZE 1000  // 内存缓冲区大小// 快速排序函数void quick_sort(int arr[], int low, int high) {    if (low < high) {        int pivot = arr[high];        int i = low - 1;        for (int j = low; j < high; j++) {            if (arr[j] <= pivot) {                i++;                int temp = arr[i];                arr[i] = arr[j];                arr[j] = temp;            }        }        int temp = arr[i + 1];        arr[i + 1] = arr[high];        arr[high] = temp;                quick_sort(arr, low, i);        quick_sort(arr, i + 2, high);    }}// 生成初始归并段int generate_runs(const char* input_file) {    FILE* fp = fopen(input_file, "r");    if (!fp) return -1;    int buffer[BUFFER_SIZE];    int run_count = 0;    int count;    while (!feof(fp)) {        count = 0;        while (count < BUFFER_SIZE && fscanf(fp, "%d", &buffer[count]) == 1) {            count++;        }        if (count == 0) break;        quick_sort(buffer, 0, count - 1);        char run_file[50];        sprintf(run_file, "run%d.tmp", run_count);        FILE* run_fp = fopen(run_file, "w");        for (int i = 0; i < count; i++) {            fprintf(run_fp, "%d\n", buffer[i]);        }        fclose(run_fp);        run_count++;    }    fclose(fp);    return run_count;}

第二步：两路归并（简化版）

为简化理解，我们先实现两路归并。实际应用中常用多路归并（如4路、8路）以减少I/O次数。

// 合并两个有序临时文件void merge_two_files(const char* file1, const char* file2, const char* output) {    FILE *fp1 = fopen(file1, "r");    FILE *fp2 = fopen(file2, "r");    FILE *out = fopen(output, "w");    int num1, num2;    int has1 = fscanf(fp1, "%d", &num1);    int has2 = fscanf(fp2, "%d", &num2);    while (has1 == 1 && has2 == 1) {        if (num1 <= num2) {            fprintf(out, "%d\n", num1);            has1 = fscanf(fp1, "%d", &num1);        } else {            fprintf(out, "%d\n", num2);            has2 = fscanf(fp2, "%d", &num2);        }    }    // 处理剩余数据    while (has1 == 1) {        fprintf(out, "%d\n", num1);        has1 = fscanf(fp1, "%d", &num1);    }    while (has2 == 1) {        fprintf(out, "%d\n", num2);        has2 = fscanf(fp2, "%d", &num2);    }    fclose(fp1);    fclose(fp2);    fclose(out);}

第三步：主函数整合

int main() {    const char* input = "input.txt";    int run_count = generate_runs(input);        if (run_count <= 0) {        printf("Error reading input file!\n");        return 1;    }    // 简化：假设只有两个归并段    if (run_count == 2) {        merge_two_files("run0.tmp", "run1.tmp", "output.txt");        printf("排序完成！结果保存在 output.txt\n");    } else {        printf("本示例仅支持2个归并段。实际应用需实现多路归并。\n");    }    // 清理临时文件（可选）    // remove("run0.tmp"); remove("run1.tmp");        return 0;}