Find largest 1M numbers in 1B numbers
Problem
Describe an algorithm to find the largest 1 million numbers in 1 billion numbers. Assume that the computer memory can hold all one billion numbers.
Input
- A stream of integers
nums[]of length 1 Billion - An integer M, at max M = 1 Million, representing the number of top largest elements to retrieve.
Examples
Example 1
Input: nums = [1, 2, ... billion of numbers], M = 3
Output: [99999993, 99999992, 99999991] // top 3 largest numbers
Solutions
Method 1 – Full Sorting
Intuition
If we sort all the numbers, the largest M elements will always be at the end of the sorted array. This is a direct and reliable way to get the top M largest numbers.
Approach
- Sort the entire array of N numbers in ascending order.
- After sorting, select the last M elements from the array as the result.
- Return these M elements as the largest numbers (order can be ascending or descending as needed).
Complexity
- ⏰ Time complexity:
O(N log N), because sorting the entire array of N elements dominates the runtime. - 🧺 Space complexity:
O(1)(if sorting in-place), orO(N)if a new array is created for the result.
Method 2 – Quickselect/Partial Partitioning
Intuition
We don't need to fully sort the array to find the largest M numbers. By using a partitioning approach inspired by Quicksort, we can efficiently separate the largest M elements from the rest, without caring about their order.
Approach
- Use a partition function (like in Quicksort) to split the array around a pivot.
- After partitioning, count the number of elements greater than or equal to the pivot (right partition).
- If this count is exactly M, return these elements.
- If the count is more than M, recursively partition the right side.
- If the count is less than M, collect all from the right and recursively find the remaining (M - count) from the left.
- This process continues until exactly M largest elements are found.
Code
C++
#include <vector>
#include <algorithm>
#include <random>
class Solution {
public:
int partition(std::vector<int>& nums, int low, int high) {
int pivotIdx = low + rand() % (high - low + 1);
int pivot = nums[pivotIdx];
std::swap(nums[pivotIdx], nums[low]);
int L = low + 1, R = high;
while (L <= R) {
if (nums[L] < pivot) L++;
else if (nums[R] >= pivot) R--;
else std::swap(nums[L++], nums[R--]);
}
std::swap(nums[low], nums[R]);
return R;
}
void quickSelect(std::vector<int>& nums, int M, int low, int high, std::vector<int>& ans) {
if (low > high) return;
int idx = partition(nums, low, high);
int count = high - idx + 1;
if (count == M) {
ans.insert(ans.end(), nums.begin() + idx, nums.begin() + high + 1);
} else if (count > M) {
quickSelect(nums, M, idx, high, ans);
} else {
ans.insert(ans.end(), nums.begin() + idx, nums.begin() + high + 1);
quickSelect(nums, M - count, low, idx - 1, ans);
}
}
std::vector<int> largestM(std::vector<int>& nums, int M) {
std::vector<int> ans;
quickSelect(nums, M, 0, nums.size() - 1, ans);
return ans;
}
};
Go
import "math/rand"
func partition(nums []int, low, high int) int {
pivotIdx := low + rand.Intn(high-low+1)
pivot := nums[pivotIdx]
nums[pivotIdx], nums[low] = nums[low], nums[pivotIdx]
L, R := low+1, high
for L <= R {
if nums[L] < pivot {
L++
} else if nums[R] >= pivot {
R--
} else {
nums[L], nums[R] = nums[R], nums[L]
L++
R--
}
}
nums[low], nums[R] = nums[R], nums[low]
return R
}
func quickSelect(nums []int, M, low, high int, ans *[]int) {
if low > high { return }
idx := partition(nums, low, high)
count := high - idx + 1
if count == M {
*ans = append(*ans, nums[idx:high+1]...)
} else if count > M {
quickSelect(nums, M, idx, high, ans)
} else {
*ans = append(*ans, nums[idx:high+1]...)
quickSelect(nums, M-count, low, idx-1, ans)
}
}
func largestM(nums []int, M int) []int {
ans := []int{}
quickSelect(nums, M, 0, len(nums)-1, &ans)
return ans
}
Java
import java.util.*;
class Solution {
Random rand = new Random();
int partition(int[] nums, int low, int high) {
int idx = low + rand.nextInt(high - low + 1);
int pivot = nums[idx];
nums[idx] = nums[low]; nums[low] = pivot;
int L = low + 1, R = high;
while (L <= R) {
if (nums[L] < pivot) L++;
else if (nums[R] >= pivot) R--;
else { int t = nums[L]; nums[L++] = nums[R]; nums[R--] = t; }
}
nums[low] = nums[R]; nums[R] = pivot;
return R;
}
void quickSelect(int[] nums, int M, int low, int high, List<Integer> ans) {
if (low > high) return;
int idx = partition(nums, low, high);
int count = high - idx + 1;
if (count == M) {
for (int i = idx; i <= high; ++i) ans.add(nums[i]);
} else if (count > M) {
quickSelect(nums, M, idx, high, ans);
} else {
for (int i = idx; i <= high; ++i) ans.add(nums[i]);
quickSelect(nums, M - count, low, idx - 1, ans);
}
}
public List<Integer> largestM(int[] nums, int M) {
List<Integer> ans = new ArrayList<>();
quickSelect(nums, M, 0, nums.length - 1, ans);
return ans;
}
}
Kotlin
import kotlin.random.Random
class Solution {
fun largestM(nums: IntArray, M: Int): List<Int> {
val ans = mutableListOf<Int>()
fun partition(low: Int, high: Int): Int {
val idx = low + Random.nextInt(high - low + 1)
val pivot = nums[idx]
nums[idx] = nums[low].also { nums[low] = pivot }
var L = low + 1
var R = high
while (L <= R) {
if (nums[L] < pivot) L++
else if (nums[R] >= pivot) R--
else { nums[L] = nums[R].also { nums[R] = nums[L] }; L++; R-- }
}
nums[low] = nums[R].also { nums[R] = pivot }
return R
}
fun quickSelect(M: Int, low: Int, high: Int) {
if (low > high) return
val idx = partition(low, high)
val count = high - idx + 1
if (count == M) {
for (i in idx..high) ans.add(nums[i])
} else if (count > M) {
quickSelect(M, idx, high)
} else {
for (i in idx..high) ans.add(nums[i])
quickSelect(M - count, low, idx - 1)
}
}
quickSelect(M, 0, nums.size - 1)
return ans
}
}
Python
import random
class Solution:
def largestM(self, nums: list[int], M: int) -> list[int]:
ans: list[int] = []
def partition(low: int, high: int) -> int:
idx = random.randint(low, high)
pivot = nums[idx]
nums[idx], nums[low] = nums[low], nums[idx]
L, R = low + 1, high
while L <= R:
if nums[L] < pivot:
L += 1
elif nums[R] >= pivot:
R -= 1
else:
nums[L], nums[R] = nums[R], nums[L]
L += 1
R -= 1
nums[low], nums[R] = nums[R], nums[low]
return R
def quickSelect(M: int, low: int, high: int):
if low > high:
return
idx = partition(low, high)
count = high - idx + 1
if count == M:
ans.extend(nums[idx:high+1])
elif count > M:
quickSelect(M, idx, high)
else:
ans.extend(nums[idx:high+1])
quickSelect(M - count, low, idx - 1)
quickSelect(M, 0, len(nums) - 1)
return ans
Rust
use rand::Rng;
impl Solution {
pub fn largest_m(nums: &mut Vec<i32>, m: usize) -> Vec<i32> {
fn partition(nums: &mut [i32], low: usize, high: usize) -> usize {
let mut rng = rand::thread_rng();
let idx = low + rng.gen_range(0..=high-low);
nums.swap(idx, low);
let pivot = nums[low];
let (mut l, mut r) = (low+1, high);
while l <= r {
if nums[l] < pivot { l += 1; }
else if nums[r] >= pivot { r -= 1; }
else { nums.swap(l, r); l += 1; r -= 1; }
}
nums.swap(low, r);
r
}
fn quick_select(nums: &mut [i32], m: usize, low: usize, high: usize, ans: &mut Vec<i32>) {
if low > high { return; }
let idx = partition(nums, low, high);
let count = high - idx + 1;
if count == m {
ans.extend_from_slice(&nums[idx..=high]);
} else if count > m {
quick_select(nums, m, idx, high, ans);
} else {
ans.extend_from_slice(&nums[idx..=high]);
quick_select(nums, m-count, low, idx-1, ans);
}
}
let mut ans = vec![];
let len = nums.len();
quick_select(nums, m, 0, len-1, &mut ans);
ans
}
}
TypeScript
class Solution {
largestM(nums: number[], M: number): number[] {
function partition(low: number, high: number): number {
const idx = low + Math.floor(Math.random() * (high - low + 1));
const pivot = nums[idx];
[nums[idx], nums[low]] = [nums[low], nums[idx]];
let L = low + 1, R = high;
while (L <= R) {
if (nums[L] < pivot) L++;
else if (nums[R] >= pivot) R--;
else { [nums[L], nums[R]] = [nums[R], nums[L]]; L++; R--; }
}
[nums[low], nums[R]] = [nums[R], nums[low]];
return R;
}
function quickSelect(M: number, low: number, high: number, ans: number[]) {
if (low > high) return;
const idx = partition(low, high);
const count = high - idx + 1;
if (count === M) {
ans.push(...nums.slice(idx, high+1));
} else if (count > M) {
quickSelect(M, idx, high, ans);
} else {
ans.push(...nums.slice(idx, high+1));
quickSelect(M - count, low, idx - 1, ans);
}
}
const ans: number[] = [];
quickSelect(M, 0, nums.length - 1, ans);
return ans;
}
}
Complexity
- ⏰ Time complexity:
O(N), since each partition step reduces the problem size and the expected number of steps is linear. - 🧺 Space complexity:
O(1)(in-place partitioning, ignoring recursion stack and result array).
Java code
public static int quickPartition(int[] array, int low, int high) {
if (high <= low)
return -1;
// randomly choose a pivot
int index = new Random().nextInt(high - low + 1) + low;
int pivot = array[index];
// swap the pivot to the front
int num = array[index];
array[index] = array[low];
array[low] = num;
// partition as in Quicksort
int L = low + 1;
int R = high;
while (R >= L) {
while (R >= L && array[L] < pivot)
L++;
while (R >= L && array[R] >= pivot)
R--;
if (R >= L) {
int temp = array[L];
array[L] = array[R];
array[R] = temp;
L++;
R--;
}
}
// swap the pivot back to appropriate position.
num = array[R];
array[R] = array[low];
array[low] = num;
return R;
}
public static List<Integer> largestM(int[] numbers, int M, int low, int high) {
// RIndex = index of pivot
int RIndex = quickPartition(numbers, low, high);
// mFound = # of the largest numbers
int mFound = high + 1 - RIndex;
List<Integer> results = new LinkedList<Integer>();
if (mFound == M) { // Done!
int i = RIndex;
for (; i <= high; ++i)
results.add(numbers[i]);
return results;
} else if (mFound > M) { // Keep looking
// check if those mFound elements are actually the same
boolean duplicates = true;
for (int i = RIndex; i <= high; ++i) {
if (numbers[i] != numbers[RIndex])
duplicates = false;
}
if (!duplicates)
return largestM(numbers, M, RIndex, high);
else {
// if they are the same, just return M duplicates of them
for (int k = 0; k < M; ++k)
results.add(numbers[RIndex]);
return results;
}
} else { // Has found some, keep looking for the rest
int i = RIndex;
for (; i < numbers.length; ++i)
results.add(numbers[i]);
results.addAll(largestM(numbers, M - mFound, low, RIndex - 1));
return results;
}
}
Method 3 – Min-Heap (Priority Queue)
Intuition
We can maintain a collection of the largest M numbers seen so far by using a min-heap of size M. The heap always contains the current M largest elements, with the smallest at the top. Any new number larger than the heap's minimum replaces it, ensuring only the largest M remain.
Approach
- Initialize a min-heap (priority queue) of size at most M.
- Iterate through each number in the array:
- If the heap has fewer than M elements, add the number.
- If the heap is full and the current number is larger than the smallest in the heap, replace the smallest.
- After processing all numbers, the heap contains the largest M numbers (in any order).
Code
C++
#include <vector>
#include <queue>
class Solution {
public:
std::vector<int> largestM(const std::vector<int>& nums, int M) {
std::priority_queue<int, std::vector<int>, std::greater<int>> minHeap;
for (int num : nums) {
if ((int)minHeap.size() < M) minHeap.push(num);
else if (num > minHeap.top()) {
minHeap.pop();
minHeap.push(num);
}
}
std::vector<int> ans;
while (!minHeap.empty()) {
ans.push_back(minHeap.top());
minHeap.pop();
}
return ans;
}
};
Go
import "container/heap"
type IntHeap []int
func (h IntHeap) Len() int { return len(h) }
func (h IntHeap) Less(i, j int) bool { return h[i] < h[j] }
func (h IntHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
func (h *IntHeap) Push(x interface{}) { *h = append(*h, x.(int)) }
func (h *IntHeap) Pop() interface{} {
old := *h
n := len(old)
x := old[n-1]
*h = old[:n-1]
return x
}
func largestM(nums []int, M int) []int {
h := &IntHeap{}
heap.Init(h)
for _, num := range nums {
if h.Len() < M {
heap.Push(h, num)
} else if num > (*h)[0] {
heap.Pop(h)
heap.Push(h, num)
}
}
return *h
}
Java
import java.util.*;
class Solution {
public List<Integer> largestM(int[] nums, int M) {
PriorityQueue<Integer> minHeap = new PriorityQueue<>();
for (int num : nums) {
if (minHeap.size() < M) minHeap.offer(num);
else if (num > minHeap.peek()) {
minHeap.poll();
minHeap.offer(num);
}
}
return new ArrayList<>(minHeap);
}
}
Kotlin
import java.util.PriorityQueue
class Solution {
fun largestM(nums: IntArray, M: Int): List<Int> {
val minHeap = PriorityQueue<Int>()
for (num in nums) {
if (minHeap.size < M) minHeap.add(num)
else if (num > minHeap.peek()) {
minHeap.poll()
minHeap.add(num)
}
}
return minHeap.toList()
}
}
Python
import heapq
class Solution:
def largestM(self, nums: list[int], M: int) -> list[int]:
if M == 0: return []
minHeap: list[int] = []
for num in nums:
if len(minHeap) < M:
heapq.heappush(minHeap, num)
elif num > minHeap[0]:
heapq.heappushpop(minHeap, num)
return minHeap
Rust
use std::collections::BinaryHeap;
use std::cmp::Reverse;
impl Solution {
pub fn largest_m(nums: &[i32], m: usize) -> Vec<i32> {
let mut heap = std::collections::BinaryHeap::new();
for &num in nums {
if heap.len() < m {
heap.push(Reverse(num));
} else if let Some(&Reverse(top)) = heap.peek() {
if num > top {
heap.pop();
heap.push(Reverse(num));
}
}
}
heap.into_iter().map(|Reverse(x)| x).collect()
}
}
TypeScript
class MinHeap {
private data: number[] = [];
size() { return this.data.length; }
peek() { return this.data[0]; }
push(val: number) {
this.data.push(val);
let i = this.data.length - 1;
while (i > 0) {
let p = (i - 1) >> 1;
if (this.data[p] > this.data[i]) {
[this.data[p], this.data[i]] = [this.data[i], this.data[p]];
i = p;
} else break;
}
}
pop() {
if (this.data.length === 1) return this.data.pop();
const top = this.data[0];
this.data[0] = this.data.pop()!;
let i = 0, n = this.data.length;
while (true) {
let l = 2 * i + 1, r = 2 * i + 2, min = i;
if (l < n && this.data[l] < this.data[min]) min = l;
if (r < n && this.data[r] < this.data[min]) min = r;
if (min === i) break;
[this.data[i], this.data[min]] = [this.data[min], this.data[i]];
i = min;
}
return top;
}
}
class Solution {
largestM(nums: number[], M: number): number[] {
const heap = new MinHeap();
for (const num of nums) {
if (heap.size() < M) heap.push(num);
else if (num > heap.peek()) {
heap.pop();
heap.push(num);
}
}
return heap.data;
}
}
Complexity
- ⏰ Time complexity:
O(N log M), since each insertion or replacement in the heap of size M takesO(log M)and we process N elements. - 🧺 Space complexity:
O(M), for the heap storing the largest M elements.