External Sorting of Integers
Problem
Given a million integers, each between zero and a billion, in no particular order, how can you sort them efficiently? Assume you cannot allocate an array of a billion elements in memory, which rules out a straightforward counting sort over the full range.
Examples
Example 1
Input: [500000000, 1, 999999999, 250000000, 750000000, ...] // 1 million integers
Memory Constraint: Cannot allocate array of size 1 billion
Output: [1, 250000000, 500000000, 750000000, 999999999, ...] // Sorted array
Example 2
Input: Large file containing 1M integers in the range 0-1B
Available Memory: 100MB (can hold ~25M integers)
Output: Sorted file with same 1M integers
Solutions
Method 1 – External Merge Sort
Intuition
Since we cannot fit a billion-element array in memory, we need external sorting. The key insight is to divide the input into chunks small enough to sort in memory, sort each chunk individually, and then merge the sorted chunks.
External merge sort works in three phases (a toy sketch follows the list):
- Divide phase: Split input into smaller chunks that fit in memory
- Conquer phase: Sort each chunk in memory using any efficient algorithm
- Merge phase: Use k-way merge to combine sorted chunks into final result
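As a toy illustration of these phases, here is a minimal in-memory sketch in Python: short lists stand in for the temporary files on disk, and heapq.merge performs the k-way merge.
import heapq

# Toy data; a real run would stream millions of values from disk
data = [500000000, 1, 999999999, 250000000, 750000000, 42]
chunk = 3  # pretend only three integers fit in memory at once
runs = [sorted(data[i:i + chunk]) for i in range(0, len(data), chunk)]  # divide + conquer
merged = list(heapq.merge(*runs))  # k-way merge of the sorted runs
print(merged)  # [1, 42, 250000000, 500000000, 750000000, 999999999]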
Approach
- Calculate chunk size based on available memory (e.g., if we have 100MB and integers are 4 bytes, we can handle ~25M integers per chunk)
- Read and sort chunks: Read chunks of data that fit in memory, sort them using quicksort/mergesort, and write to temporary files
- K-way merge: Use a min-heap to efficiently merge all sorted chunks
- Write final result: Stream the merged result to output file
Code
C++
#include <algorithm>
#include <cstddef>
#include <cstdio>   // std::remove
#include <fstream>
#include <memory>   // std::unique_ptr
#include <queue>
#include <string>
#include <vector>
class ExternalSorter {
private:
struct FileReader {
std::ifstream file;
int current_value;
bool has_data;
int file_id;
FileReader(const std::string& filename, int id) : file_id(id) {
file.open(filename);
has_data = readNext();
}
bool readNext() {
if (file >> current_value) {
return true;
}
has_data = false;
file.close();
return false;
}
};
struct Compare {
bool operator()(const std::unique_ptr<FileReader>& a,
const std::unique_ptr<FileReader>& b) {
return a->current_value > b->current_value; // Min heap
}
};
std::size_t chunk_size;
std::string temp_dir;
public:
ExternalSorter(int memory_limit_mb = 100) {
// Assuming 4 bytes per integer
chunk_size = (memory_limit_mb * 1024 * 1024) / 4;
temp_dir = "/tmp/";
}
std::vector<std::string> sortAndSaveChunks(const std::string& input_file) {
std::ifstream input(input_file);
std::vector<std::string> temp_files;
std::vector<int> buffer;
buffer.reserve(chunk_size);
int chunk_id = 0;
int value;
while (input >> value) {
buffer.push_back(value);
if (buffer.size() == chunk_size) {
std::string temp_file = temp_dir + "chunk_" + std::to_string(chunk_id++) + ".txt";
std::sort(buffer.begin(), buffer.end());
std::ofstream output(temp_file);
for (int val : buffer) {
output << val << "\n";
}
output.close();
temp_files.push_back(temp_file);
buffer.clear();
}
}
// Handle remaining elements
if (!buffer.empty()) {
std::string temp_file = temp_dir + "chunk_" + std::to_string(chunk_id++) + ".txt";
std::sort(buffer.begin(), buffer.end());
std::ofstream output(temp_file);
for (int val : buffer) {
output << val << "\n";
}
output.close();
temp_files.push_back(temp_file);
}
input.close();
return temp_files;
}
void mergeChunks(const std::vector<std::string>& temp_files,
const std::string& output_file) {
std::priority_queue<std::unique_ptr<FileReader>,
std::vector<std::unique_ptr<FileReader>>,
Compare> min_heap;
// Initialize file readers
for (std::size_t i = 0; i < temp_files.size(); i++) {
    auto reader = std::make_unique<FileReader>(temp_files[i], static_cast<int>(i));
if (reader->has_data) {
min_heap.push(std::move(reader));
}
}
std::ofstream output(output_file);
while (!min_heap.empty()) {
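// top() returns a const reference, so const_cast is needed to move the
// unique_ptr out of the queue before pop() discards the moved-from slot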
auto current = std::move(const_cast<std::unique_ptr<FileReader>&>(min_heap.top()));
min_heap.pop();
output << current->current_value << "\n";
if (current->readNext()) {
min_heap.push(std::move(current));
}
}
output.close();
// Clean up temporary files
for (const auto& file : temp_files) {
std::remove(file.c_str());
}
}
void externalSort(const std::string& input_file, const std::string& output_file) {
auto temp_files = sortAndSaveChunks(input_file);
mergeChunks(temp_files, output_file);
}
};
Go
package main
import (
"bufio"
"container/heap"
"fmt"
"os"
"sort"
"strconv"
"strings"
)
type FileReader struct {
file *os.File
scanner *bufio.Scanner
currentValue int
hasData bool
fileID int
}
func NewFileReader(filename string, id int) (*FileReader, error) {
file, err := os.Open(filename)
if err != nil {
return nil, err
}
fr := &FileReader{
file: file,
scanner: bufio.NewScanner(file),
fileID: id,
}
fr.hasData = fr.readNext()
return fr, nil
}
func (fr *FileReader) readNext() bool {
if fr.scanner.Scan() {
if val, err := strconv.Atoi(strings.TrimSpace(fr.scanner.Text())); err == nil {
fr.currentValue = val
return true
}
}
fr.hasData = false
fr.file.Close()
return false
}
type FileReaderHeap []*FileReader
func (h FileReaderHeap) Len() int { return len(h) }
func (h FileReaderHeap) Less(i, j int) bool { return h[i].currentValue < h[j].currentValue }
func (h FileReaderHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
func (h *FileReaderHeap) Push(x interface{}) {
*h = append(*h, x.(*FileReader))
}
func (h *FileReaderHeap) Pop() interface{} {
old := *h
n := len(old)
item := old[n-1]
*h = old[0 : n-1]
return item
}
type ExternalSorter struct {
chunkSize int
tempDir string
}
func NewExternalSorter(memoryLimitMB int) *ExternalSorter {
return &ExternalSorter{
chunkSize: (memoryLimitMB * 1024 * 1024) / 4, // 4 bytes per int
tempDir: "/tmp/",
}
}
func (es *ExternalSorter) sortAndSaveChunks(inputFile string) ([]string, error) {
file, err := os.Open(inputFile)
if err != nil {
return nil, err
}
defer file.Close()
var tempFiles []string
var buffer []int
chunkID := 0
scanner := bufio.NewScanner(file)
for scanner.Scan() {
if val, err := strconv.Atoi(strings.TrimSpace(scanner.Text())); err == nil {
buffer = append(buffer, val)
if len(buffer) == es.chunkSize {
tempFile := fmt.Sprintf("%schunk_%d.txt", es.tempDir, chunkID)
chunkID++
sort.Ints(buffer)
if err := es.writeChunk(buffer, tempFile); err != nil {
return nil, err
}
tempFiles = append(tempFiles, tempFile)
buffer = buffer[:0] // Clear buffer
}
}
}
// Handle remaining elements
if len(buffer) > 0 {
tempFile := fmt.Sprintf("%schunk_%d.txt", es.tempDir, chunkID)
sort.Ints(buffer)
if err := es.writeChunk(buffer, tempFile); err != nil {
return nil, err
}
tempFiles = append(tempFiles, tempFile)
}
return tempFiles, nil
}
func (es *ExternalSorter) writeChunk(data []int, filename string) error {
file, err := os.Create(filename)
if err != nil {
return err
}
defer file.Close()
writer := bufio.NewWriter(file)
for _, val := range data {
fmt.Fprintln(writer, val)
}
return writer.Flush()
}
func (es *ExternalSorter) mergeChunks(tempFiles []string, outputFile string) error {
output, err := os.Create(outputFile)
if err != nil {
return err
}
defer output.Close()
writer := bufio.NewWriter(output)
defer writer.Flush()
h := &FileReaderHeap{}
heap.Init(h)
// Initialize file readers
for i, tempFile := range tempFiles {
if reader, err := NewFileReader(tempFile, i); err == nil && reader.hasData {
heap.Push(h, reader)
}
}
for h.Len() > 0 {
current := heap.Pop(h).(*FileReader)
fmt.Fprintln(writer, current.currentValue)
if current.readNext() {
heap.Push(h, current)
}
}
// Clean up temporary files
for _, tempFile := range tempFiles {
os.Remove(tempFile)
}
return nil
}
func (es *ExternalSorter) ExternalSort(inputFile, outputFile string) error {
tempFiles, err := es.sortAndSaveChunks(inputFile)
if err != nil {
return err
}
return es.mergeChunks(tempFiles, outputFile)
}
Java
import java.io.*;
import java.util.*;
class ExternalSorter {
private int chunkSize;
private String tempDir;
static class FileReader {
BufferedReader reader;
Integer currentValue;
boolean hasData;
int fileId;
FileReader(String filename, int id) throws IOException {
this.fileId = id;
this.reader = new BufferedReader(new java.io.FileReader(filename));
this.hasData = readNext();
}
boolean readNext() throws IOException {
    String line;
    // Skip blank lines; readLine() returns null only at end of file
    while ((line = reader.readLine()) != null) {
        if (!line.trim().isEmpty()) {
            currentValue = Integer.parseInt(line.trim());
            return true;
        }
    }
    hasData = false;
    reader.close();
    return false;
}
}
public ExternalSorter(int memoryLimitMB) {
this.chunkSize = (memoryLimitMB * 1024 * 1024) / 4; // 4 bytes per integer
this.tempDir = System.getProperty("java.io.tmpdir");
}
private List<String> sortAndSaveChunks(String inputFile) throws IOException {
List<String> tempFiles = new ArrayList<>();
List<Integer> buffer = new ArrayList<>();
try (BufferedReader reader = new BufferedReader(new java.io.FileReader(inputFile))) {
String line;
int chunkId = 0;
while ((line = reader.readLine()) != null) {
if (!line.trim().isEmpty()) {
buffer.add(Integer.parseInt(line.trim()));
if (buffer.size() == chunkSize) {
String tempFile = tempDir + "/chunk_" + chunkId++ + ".txt";
Collections.sort(buffer);
writeChunk(buffer, tempFile);
tempFiles.add(tempFile);
buffer.clear();
}
}
}
// Handle remaining elements
if (!buffer.isEmpty()) {
String tempFile = tempDir + "/chunk_" + chunkId++ + ".txt";
Collections.sort(buffer);
writeChunk(buffer, tempFile);
tempFiles.add(tempFile);
}
}
return tempFiles;
}
private void writeChunk(List<Integer> data, String filename) throws IOException {
try (PrintWriter writer = new PrintWriter(new FileWriter(filename))) {
for (int value : data) {
writer.println(value);
}
}
}
private void mergeChunks(List<String> tempFiles, String outputFile) throws IOException {
PriorityQueue<FileReader> minHeap = new PriorityQueue<>(
Comparator.comparingInt(fr -> fr.currentValue)
);
// Initialize file readers
for (int i = 0; i < tempFiles.size(); i++) {
FileReader reader = new FileReader(tempFiles.get(i), i);
if (reader.hasData) {
minHeap.offer(reader);
}
}
try (PrintWriter writer = new PrintWriter(new FileWriter(outputFile))) {
while (!minHeap.isEmpty()) {
FileReader current = minHeap.poll();
writer.println(current.currentValue);
if (current.readNext()) {
minHeap.offer(current);
}
}
}
// Clean up temporary files
for (String tempFile : tempFiles) {
new File(tempFile).delete();
}
}
public void externalSort(String inputFile, String outputFile) throws IOException {
List<String> tempFiles = sortAndSaveChunks(inputFile);
mergeChunks(tempFiles, outputFile);
}
}
Kotlin
import java.io.*
import java.util.*
class ExternalSorter(memoryLimitMB: Int = 100) {
private val chunkSize = (memoryLimitMB * 1024 * 1024) / 4 // 4 bytes per integer
private val tempDir = System.getProperty("java.io.tmpdir")
data class FileReader(
val reader: BufferedReader,
var currentValue: Int = 0,
var hasData: Boolean = false,
val fileId: Int
) {
init {
hasData = readNext()
}
fun readNext(): Boolean {
    // Skip blank lines; readLine() returns null only at end of file
    var line = reader.readLine()
    while (line != null) {
        if (line.trim().isNotEmpty()) {
            currentValue = line.trim().toInt()
            return true
        }
        line = reader.readLine()
    }
    hasData = false
    reader.close()
    return false
}
}
private fun sortAndSaveChunks(inputFile: String): List<String> {
val tempFiles = mutableListOf<String>()
val buffer = mutableListOf<Int>()
var chunkId = 0
BufferedReader(java.io.FileReader(inputFile)).use { reader ->
reader.lineSequence().forEach { line ->
if (line.trim().isNotEmpty()) {
buffer.add(line.trim().toInt())
if (buffer.size == chunkSize) {
val tempFile = "$tempDir/chunk_${chunkId++}.txt"
buffer.sort()
writeChunk(buffer, tempFile)
tempFiles.add(tempFile)
buffer.clear()
}
}
}
// Handle remaining elements
if (buffer.isNotEmpty()) {
val tempFile = "$tempDir/chunk_${chunkId++}.txt"
buffer.sort()
writeChunk(buffer, tempFile)
tempFiles.add(tempFile)
}
}
return tempFiles
}
private fun writeChunk(data: List<Int>, filename: String) {
PrintWriter(FileWriter(filename)).use { writer ->
data.forEach { writer.println(it) }
}
}
private fun mergeChunks(tempFiles: List<String>, outputFile: String) {
val minHeap = PriorityQueue<FileReader> { a, b -> a.currentValue.compareTo(b.currentValue) }
// Initialize file readers
tempFiles.forEachIndexed { index, tempFile ->
val reader = FileReader(BufferedReader(java.io.FileReader(tempFile)), fileId = index)
if (reader.hasData) {
minHeap.offer(reader)
}
}
PrintWriter(FileWriter(outputFile)).use { writer ->
while (minHeap.isNotEmpty()) {
val current = minHeap.poll()
writer.println(current.currentValue)
if (current.readNext()) {
minHeap.offer(current)
}
}
}
// Clean up temporary files
tempFiles.forEach { File(it).delete() }
}
fun externalSort(inputFile: String, outputFile: String) {
val tempFiles = sortAndSaveChunks(inputFile)
mergeChunks(tempFiles, outputFile)
}
}
Python
import heapq
import os
import tempfile
from typing import List
class FileReader:
    def __init__(self, filename: str, file_id: int):
        self.file = open(filename, 'r')
        self.file_id = file_id
        self.current_value = None
        self.has_data = self._read_next()

    def _read_next(self) -> bool:
        # Skip blank lines; readline() returns '' only at end of file
        line = self.file.readline()
        while line:
            if line.strip():
                self.current_value = int(line.strip())
                return True
            line = self.file.readline()
        self.has_data = False
        self.file.close()
        return False

    def read_next(self) -> bool:
        return self._read_next()

    def __lt__(self, other):
        return self.current_value < other.current_value


class ExternalSorter:
    def __init__(self, memory_limit_mb: int = 100):
        self.chunk_size = (memory_limit_mb * 1024 * 1024) // 4  # 4 bytes per integer
        self.temp_dir = tempfile.gettempdir()

    def _sort_and_save_chunks(self, input_file: str) -> List[str]:
        temp_files = []
        buffer = []
        chunk_id = 0
        with open(input_file, 'r') as f:
            for line in f:
                if line.strip():
                    buffer.append(int(line.strip()))
                    if len(buffer) == self.chunk_size:
                        temp_file = os.path.join(self.temp_dir, f"chunk_{chunk_id}.txt")
                        chunk_id += 1
                        buffer.sort()
                        self._write_chunk(buffer, temp_file)
                        temp_files.append(temp_file)
                        buffer.clear()
        # Handle remaining elements
        if buffer:
            temp_file = os.path.join(self.temp_dir, f"chunk_{chunk_id}.txt")
            buffer.sort()
            self._write_chunk(buffer, temp_file)
            temp_files.append(temp_file)
        return temp_files

    def _write_chunk(self, data: List[int], filename: str) -> None:
        with open(filename, 'w') as f:
            for value in data:
                f.write(f"{value}\n")

    def _merge_chunks(self, temp_files: List[str], output_file: str) -> None:
        min_heap = []
        # Initialize file readers
        for i, temp_file in enumerate(temp_files):
            reader = FileReader(temp_file, i)
            if reader.has_data:
                heapq.heappush(min_heap, reader)
        with open(output_file, 'w') as output:
            while min_heap:
                current = heapq.heappop(min_heap)
                output.write(f"{current.current_value}\n")
                if current.read_next():
                    heapq.heappush(min_heap, current)
        # Clean up temporary files
        for temp_file in temp_files:
            os.remove(temp_file)

    def external_sort(self, input_file: str, output_file: str) -> None:
        """
        Sort a large file using external merge sort.

        Args:
            input_file: Path to the input file containing integers to sort
            output_file: Path to the output file for the sorted integers
        """
        temp_files = self._sort_and_save_chunks(input_file)
        self._merge_chunks(temp_files, output_file)


# Usage example
def sort_large_file(input_filename: str, output_filename: str, memory_limit_mb: int = 100):
    sorter = ExternalSorter(memory_limit_mb)
    sorter.external_sort(input_filename, output_filename)
    print(f"File sorted successfully. Output written to {output_filename}")
Rust
use std::collections::BinaryHeap;
use std::fs::File;
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::cmp::Reverse;
struct FileReader {
reader: BufReader<File>,
current_value: Option<i32>,
file_id: usize,
}
impl FileReader {
fn new(filename: &str, file_id: usize) -> std::io::Result<Self> {
let file = File::open(filename)?;
let mut reader = BufReader::new(file);
let current_value = Self::read_next_value(&mut reader)?;
Ok(FileReader {
reader,
current_value,
file_id,
})
}
fn read_next_value(reader: &mut BufReader<File>) -> std::io::Result<Option<i32>> {
let mut line = String::new();
match reader.read_line(&mut line)? {
0 => Ok(None),
_ => {
let trimmed = line.trim();
if trimmed.is_empty() {
Self::read_next_value(reader)
} else {
Ok(Some(trimmed.parse().map_err(|e| {
std::io::Error::new(std::io::ErrorKind::InvalidData, e)
})?))
}
}
}
}
fn read_next(&mut self) -> std::io::Result<()> {
self.current_value = Self::read_next_value(&mut self.reader)?;
Ok(())
}
fn has_data(&self) -> bool {
self.current_value.is_some()
}
fn current_value(&self) -> i32 {
self.current_value.unwrap()
}
}
impl PartialEq for FileReader {
fn eq(&self, other: &Self) -> bool {
self.current_value == other.current_value
}
}
impl Eq for FileReader {}
impl PartialOrd for FileReader {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl Ord for FileReader {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        // Natural ordering on current_value; merge_chunks wraps each reader in
        // `Reverse`, which turns the max-heap BinaryHeap into a min-heap, so
        // no additional reversal is needed here
        self.current_value.cmp(&other.current_value)
    }
}
pub struct ExternalSorter {
chunk_size: usize,
temp_dir: String,
}
impl ExternalSorter {
pub fn new(memory_limit_mb: usize) -> Self {
ExternalSorter {
chunk_size: (memory_limit_mb * 1024 * 1024) / 4, // 4 bytes per integer
temp_dir: "/tmp".to_string(),
}
}
fn sort_and_save_chunks(&self, input_file: &str) -> std::io::Result<Vec<String>> {
let file = File::open(input_file)?;
let reader = BufReader::new(file);
let mut temp_files = Vec::new();
let mut buffer = Vec::new();
let mut chunk_id = 0;
for line in reader.lines() {
let line = line?;
if let Ok(value) = line.trim().parse::<i32>() {
buffer.push(value);
if buffer.len() == self.chunk_size {
let temp_file = format!("{}/chunk_{}.txt", self.temp_dir, chunk_id);
chunk_id += 1;
buffer.sort_unstable();
self.write_chunk(&buffer, &temp_file)?;
temp_files.push(temp_file);
buffer.clear();
}
}
}
// Handle remaining elements
if !buffer.is_empty() {
let temp_file = format!("{}/chunk_{}.txt", self.temp_dir, chunk_id);
buffer.sort_unstable();
self.write_chunk(&buffer, &temp_file)?;
temp_files.push(temp_file);
}
Ok(temp_files)
}
fn write_chunk(&self, data: &[i32], filename: &str) -> std::io::Result<()> {
let file = File::create(filename)?;
let mut writer = BufWriter::new(file);
for &value in data {
writeln!(writer, "{}", value)?;
}
writer.flush()?;
Ok(())
}
fn merge_chunks(&self, temp_files: &[String], output_file: &str) -> std::io::Result<()> {
let mut min_heap = BinaryHeap::new();
// Initialize file readers
for (i, temp_file) in temp_files.iter().enumerate() {
match FileReader::new(temp_file, i) {
Ok(reader) if reader.has_data() => {
min_heap.push(Reverse(reader));
}
Ok(_) => {} // Empty file
Err(e) => return Err(e),
}
}
let output = File::create(output_file)?;
let mut writer = BufWriter::new(output);
while let Some(Reverse(mut current)) = min_heap.pop() {
writeln!(writer, "{}", current.current_value())?;
if current.read_next().is_ok() && current.has_data() {
min_heap.push(Reverse(current));
}
}
writer.flush()?;
// Clean up temporary files
for temp_file in temp_files {
std::fs::remove_file(temp_file).ok();
}
Ok(())
}
pub fn external_sort(&self, input_file: &str, output_file: &str) -> std::io::Result<()> {
let temp_files = self.sort_and_save_chunks(input_file)?;
self.merge_chunks(&temp_files, output_file)?;
Ok(())
}
}
TypeScript
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import { createReadStream, createWriteStream } from 'fs';
import { createInterface, Interface } from 'readline';
class FileReader {
    private rl: Interface;
    private iterator: AsyncIterator<string>;
    private currentValue: number | null = null;
    public fileId: number;
    private constructor(rl: Interface, fileId: number) {
        this.rl = rl;
        this.iterator = rl[Symbol.asyncIterator]();
        this.fileId = fileId;
    }
    // Async factory: the first value is read before the reader is handed out,
    // so callers never see a half-initialized reader
    static async create(filename: string, fileId: number): Promise<FileReader> {
        const rl = createInterface({
            input: createReadStream(filename),
            crlfDelay: Infinity
        });
        const reader = new FileReader(rl, fileId);
        await reader.readNextValue();
        return reader;
    }
    async readNextValue(): Promise<boolean> {
        // Reuse the single stored iterator; creating a fresh one per call can drop lines
        while (true) {
            const { value, done } = await this.iterator.next();
            if (done) break;
            if (value.trim()) {
                this.currentValue = parseInt(value.trim(), 10);
                return true;
            }
        }
        this.currentValue = null;
        this.rl.close();
        return false;
    }
    get hasData(): boolean {
        return this.currentValue !== null;
    }
    get value(): number {
        return this.currentValue!;
    }
}
class MinHeap<T> {
private items: T[] = [];
private compare: (a: T, b: T) => number;
constructor(compareFunction: (a: T, b: T) => number) {
this.compare = compareFunction;
}
push(item: T): void {
this.items.push(item);
this.heapifyUp(this.items.length - 1);
}
pop(): T | undefined {
if (this.items.length === 0) return undefined;
const min = this.items[0];
const last = this.items.pop()!;
if (this.items.length > 0) {
this.items[0] = last;
this.heapifyDown(0);
}
return min;
}
get size(): number {
return this.items.length;
}
private heapifyUp(index: number): void {
if (index === 0) return;
const parentIndex = Math.floor((index - 1) / 2);
if (this.compare(this.items[index], this.items[parentIndex]) < 0) {
[this.items[index], this.items[parentIndex]] = [this.items[parentIndex], this.items[index]];
this.heapifyUp(parentIndex);
}
}
private heapifyDown(index: number): void {
const leftChild = 2 * index + 1;
const rightChild = 2 * index + 2;
let smallest = index;
if (leftChild < this.items.length &&
this.compare(this.items[leftChild], this.items[smallest]) < 0) {
smallest = leftChild;
}
if (rightChild < this.items.length &&
this.compare(this.items[rightChild], this.items[smallest]) < 0) {
smallest = rightChild;
}
if (smallest !== index) {
[this.items[index], this.items[smallest]] = [this.items[smallest], this.items[index]];
this.heapifyDown(smallest);
}
}
}
class ExternalSorter {
private chunkSize: number;
private tempDir: string;
constructor(memoryLimitMB: number = 100) {
this.chunkSize = Math.floor((memoryLimitMB * 1024 * 1024) / 4); // 4 bytes per integer
this.tempDir = os.tmpdir();
}
private async sortAndSaveChunks(inputFile: string): Promise<string[]> {
const tempFiles: string[] = [];
let buffer: number[] = [];
let chunkId = 0;
const fileStream = createReadStream(inputFile);
const rl = createInterface({
input: fileStream,
crlfDelay: Infinity
});
for await (const line of rl) {
if (line.trim()) {
buffer.push(parseInt(line.trim()));
if (buffer.length === this.chunkSize) {
const tempFile = path.join(this.tempDir, `chunk_${chunkId++}.txt`);
buffer.sort((a, b) => a - b);
await this.writeChunk(buffer, tempFile);
tempFiles.push(tempFile);
buffer = [];
}
}
}
// Handle remaining elements
if (buffer.length > 0) {
const tempFile = path.join(this.tempDir, `chunk_${chunkId++}.txt`);
buffer.sort((a, b) => a - b);
await this.writeChunk(buffer, tempFile);
tempFiles.push(tempFile);
}
return tempFiles;
}
private async writeChunk(data: number[], filename: string): Promise<void> {
return new Promise((resolve, reject) => {
const writeStream = createWriteStream(filename);
writeStream.on('error', reject);
writeStream.on('finish', resolve);
for (const value of data) {
writeStream.write(`${value}\n`);
}
writeStream.end();
});
}
private async mergeChunks(tempFiles: string[], outputFile: string): Promise<void> {
const minHeap = new MinHeap<FileReader>((a, b) => a.value - b.value);
// Initialize file readers; the async factory has already loaded each first value
for (let i = 0; i < tempFiles.length; i++) {
    const reader = await FileReader.create(tempFiles[i], i);
    if (reader.hasData) {
        minHeap.push(reader);
    }
}
const writeStream = createWriteStream(outputFile);
while (minHeap.size > 0) {
const current = minHeap.pop()!;
writeStream.write(`${current.value}\n`);
if (await current.readNextValue()) {
minHeap.push(current);
}
}
// Wait for the output stream to flush before cleaning up the temp files
await new Promise<void>((resolve, reject) => {
    writeStream.on('error', reject);
    writeStream.end(() => resolve());
});
// Clean up temporary files
for (const tempFile of tempFiles) {
try {
fs.unlinkSync(tempFile);
} catch (e) {
// Ignore cleanup errors
}
}
}
async externalSort(inputFile: string, outputFile: string): Promise<void> {
const tempFiles = await this.sortAndSaveChunks(inputFile);
await this.mergeChunks(tempFiles, outputFile);
}
}
Complexity
- ⏰ Time complexity: O(n log n), where n is the total number of elements. Each element is read and written twice (once during chunking, once during merging), and sorting each chunk of size k costs O(k log k).
- 🧺 Space complexity: O(k + m), where k is the chunk size (limited by available memory) and m is the number of chunks (for the priority queue during merging).
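Plugging Example 2's numbers into the chunk arithmetic (a quick sanity check in Python, using the code's 4-bytes-per-integer assumption):
n = 1_000_000                           # integers to sort
chunk_size = (100 * 1024 * 1024) // 4   # ~26.2M integers fit in 100MB
num_chunks = -(-n // chunk_size)        # ceiling division
print(chunk_size, num_chunks)           # 26214400 1

With these sizes everything fits in a single chunk, so the merge phase reduces to copying one sorted run; the multi-chunk machinery only matters for inputs larger than the memory budget.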
Method 2 – Counting Sort with Range Partitioning
Intuition
Since we know the range of values (0 to 1 billion), we can use a hybrid approach that combines counting sort principles with external sorting. We can partition the range into smaller sub-ranges and process each partition separately.
Approach
- Partition the range: Divide [0, 1B] into manageable sub-ranges based on available memory
- Distribute elements: Read the input and write each element to its partition file (see the bucket-mapping sketch after this list)
- Sort each partition: Use counting sort or regular sorting for each partition
- Concatenate results: Simply concatenate sorted partitions to get final result
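Before the full implementation, here is a small sketch of the bucket mapping used in the distribute step (the partition count here is a toy value; the class below derives it from the memory budget):
max_value = 1_000_000_000
num_partitions = 4  # toy value for illustration
partition_size = (max_value + num_partitions - 1) // num_partitions  # 250M per bucket
for v in [1, 250_000_000, 500_000_000, 999_999_999]:
    bucket = min(v // partition_size, num_partitions - 1)
    print(v, "->", bucket)  # buckets 0, 1, 2, 3 respectively

Because the bucket ranges are contiguous and ordered, sorting each bucket independently and concatenating them in order yields a fully sorted output.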
Code
Python
import os
import tempfile
from typing import List
class RangeBasedSorter:
    def __init__(self, memory_limit_mb: int = 100, max_value: int = 1_000_000_000):
        self.memory_limit_mb = memory_limit_mb
        self.max_value = max_value
        self.temp_dir = tempfile.gettempdir()
        # Choose the partition count so that each partition's elements can be
        # sorted in memory (assumes values are spread roughly evenly over the range)
        max_elements_in_memory = (memory_limit_mb * 1024 * 1024) // 4
        self.num_partitions = min(1000, max(10, max_value // max_elements_in_memory))
        self.partition_size = (max_value + self.num_partitions - 1) // self.num_partitions

    def _get_partition_id(self, value: int) -> int:
        return min(value // self.partition_size, self.num_partitions - 1)

    def _distribute_to_partitions(self, input_file: str) -> List[str]:
        # Create partition files
        partition_files = []
        partition_writers = []
        for i in range(self.num_partitions):
            filename = os.path.join(self.temp_dir, f"partition_{i}.txt")
            partition_files.append(filename)
            # One open handle per partition (capped at 1000 by the constructor)
            partition_writers.append(open(filename, 'w'))
        try:
            # Distribute elements to partitions
            with open(input_file, 'r') as f:
                for line in f:
                    if line.strip():
                        value = int(line.strip())
                        partition_id = self._get_partition_id(value)
                        partition_writers[partition_id].write(f"{value}\n")
        finally:
            # Close all partition files
            for writer in partition_writers:
                writer.close()
        return partition_files

    def _sort_partition(self, partition_file: str, output_file: str) -> None:
        # Read all values from the partition
        values = []
        with open(partition_file, 'r') as f:
            for line in f:
                if line.strip():
                    values.append(int(line.strip()))
        # Sort and write back
        values.sort()
        with open(output_file, 'w') as f:
            for value in values:
                f.write(f"{value}\n")

    def _concatenate_partitions(self, sorted_partitions: List[str], output_file: str) -> None:
        with open(output_file, 'w') as output:
            for partition_file in sorted_partitions:
                if os.path.exists(partition_file) and os.path.getsize(partition_file) > 0:
                    with open(partition_file, 'r') as f:
                        output.write(f.read())

    def range_based_sort(self, input_file: str, output_file: str) -> None:
        # Step 1: Distribute elements to partitions
        partition_files = self._distribute_to_partitions(input_file)
        # Step 2: Sort each partition
        sorted_partitions = []
        for i, partition_file in enumerate(partition_files):
            if os.path.exists(partition_file) and os.path.getsize(partition_file) > 0:
                sorted_file = os.path.join(self.temp_dir, f"sorted_partition_{i}.txt")
                self._sort_partition(partition_file, sorted_file)
                sorted_partitions.append(sorted_file)
            # Clean up the original partition file
            if os.path.exists(partition_file):
                os.remove(partition_file)
        # Step 3: Concatenate the sorted partitions
        self._concatenate_partitions(sorted_partitions, output_file)
        # Clean up the sorted partition files
        for sorted_file in sorted_partitions:
            if os.path.exists(sorted_file):
                os.remove(sorted_file)


# Usage example
def range_based_sort_large_file(input_filename: str, output_filename: str, memory_limit_mb: int = 100):
    sorter = RangeBasedSorter(memory_limit_mb)
    sorter.range_based_sort(input_filename, output_filename)
    print(f"File sorted using range-based approach. Output written to {output_filename}")
Complexity
- ⏰ Time complexity: O(n + k) if each partition is sorted with counting sort, where n is the number of elements and k is the range size; with the comparison sort used in the code above it is O(n log p), where p is the size of the largest partition. Either way it can beat merge sort when the range is not too large relative to n.
- 🧺 Space complexity: O(min(n, k)), limited by the size of the largest partition, which must fit in memory.
Analysis
When to use External Merge Sort:
- General-purpose solution that works for any data distribution
- Guaranteed O(n log n) performance
- Works well when you don't know the range of values in advance
When to use Range-Based Sorting:
- When you know the range of possible values
- When the data is somewhat uniformly distributed across the range
- Can be faster than merge sort for certain distributions
- More memory-efficient for sparse datasets
Trade-offs:
- External Merge Sort: More general but requires more I/O operations
- Range-Based Sort: More efficient for known ranges but can degenerate if data is highly skewed
For the given problem (1M integers in the range 0-1B), external merge sort is typically the safer choice: it guarantees good performance regardless of the data distribution, and with the stated 100MB budget the 1M integers fit in a single chunk, so it degenerates gracefully to one in-memory sort.