Problem#
Implement a data structure which carries out the following operations without resizing the underlying array:
add(value)
: Add a value to the set of values.
check(value)
: Check whether a value is in the set.
The check
method may return occasional false positives (in other words, incorrectly identifying an element as part of the set), but should always correctly identify a true element.
Examples#
Example 1#
1
2
3
4
5
6
7
8
9
10
11
12
Input:
BloomFilter bf = new BloomFilter( 10 )
bf. add ( "apple" )
bf. add ( "banana" )
bf. check ( "apple" )
bf. check ( "banana" )
bf. check ( "orange" )
Output:
true ( apple exists)
true ( banana exists)
false or true ( orange doesn' t exist, but false positive possible)
Explanation: After adding "apple" and "banana" , checking for them should return true . "orange" was never added, so it should ideally return false , but false positives are allowed.
Example 2#
1
2
3
4
5
6
7
8
9
10
Input:
BloomFilter bf = new BloomFilter( 5 )
bf. add ( 42 )
bf. add ( 17 )
bf. check ( 42 )
bf. check ( 99 )
Output:
true ( 42 exists)
false or true ( 99 doesn' t exist, but false positive possible)
Explanation: The small size ( 5 ) increases the probability of false positives due to hash collisions.
Example 3#
1
2
3
4
5
6
7
8
9
Input:
BloomFilter bf = new BloomFilter( 100 )
bf. add ( "test" )
bf. check ( "test" )
bf. check ( "missing" )
Output:
true ( test exists)
false or true ( missing doesn' t exist, but false positive possible)
Explanation: Larger size ( 100 ) reduces false positive probability but doesn' t eliminate it completely.
Example 4#
1
2
3
4
5
6
7
Input:
BloomFilter bf = new BloomFilter( 8 )
( empty filter)
bf. check ( "anything" )
Output:
false ( nothing added yet, so definitely false )
Explanation: Before adding any elements, all checks should return false since no bits are set.
Example 5#
1
2
3
4
5
6
7
8
9
Input:
BloomFilter bf = new BloomFilter( 3 )
bf. add ( "a" )
bf. add ( "b" )
bf. add ( "c" )
bf. check ( "d" )
Output:
false or true ( high probability of false positive due to small size)
Explanation: With many elements in a small filter, hash collisions become very likely, increasing false positive rate.
Solution#
Method 1 - Single Hash Function Bloom Filter#
Intuition#
A Bloom filter uses a bit array and hash functions to efficiently test set membership. When adding an element, we hash it to get an index and set that bit to 1. When checking, we hash the element and see if that bit is set. False positives occur when different elements hash to the same position, but false negatives are impossible.
Approach#
Initialize a boolean array of fixed size
Choose a hash function that maps values to array indices
For add(value)
: hash the value and set the corresponding bit to true
For check(value)
: hash the value and return the value of that bit
Use simple hash functions like modulo or built-in hash functions
Handle different data types (strings, integers) appropriately
Code#
Cpp
Go
Java
Python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
class BloomFilter {
private :
vector< bool > bits;
int size;
int hash1 (const string& value) {
int hash = 0 ;
for (char c : value) {
hash = (hash * 31 + c) % size;
}
return hash;
}
int hash1 (int value) {
return value % size;
}
public :
BloomFilter(int capacity) : size(capacity) {
bits.resize(size, false);
}
void add (const string& value) {
int idx = hash1(value);
bits[idx] = true;
}
void add (int value) {
int idx = hash1(value);
bits[idx] = true;
}
bool check (const string& value) {
int idx = hash1(value);
return bits[idx];
}
bool check (int value) {
int idx = hash1(value);
return bits[idx];
}
};
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
type BloomFilter struct {
bits []bool
size int
}
func NewBloomFilter (capacity int ) * BloomFilter {
return & BloomFilter {
bits : make([]bool , capacity ),
size : capacity ,
}
}
func (bf * BloomFilter ) hash1String (value string ) int {
hash := 0
for _ , c := range value {
hash = (hash * 31 + int(c )) % bf .size
}
return hash
}
func (bf * BloomFilter ) hash1Int (value int ) int {
return value % bf .size
}
func (bf * BloomFilter ) AddString (value string ) {
idx := bf .hash1String (value )
bf .bits [idx ] = true
}
func (bf * BloomFilter ) AddInt (value int ) {
idx := bf .hash1Int (value )
bf .bits [idx ] = true
}
func (bf * BloomFilter ) CheckString (value string ) bool {
idx := bf .hash1String (value )
return bf .bits [idx ]
}
func (bf * BloomFilter ) CheckInt (value int ) bool {
idx := bf .hash1Int (value )
return bf .bits [idx ]
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
class BloomFilter {
private boolean [] bits;
private int size;
public BloomFilter (int capacity) {
this .size = capacity;
this .bits = new boolean [ capacity] ;
}
private int hash1 (String value) {
int hash = 0;
for (char c : value.toCharArray ()) {
hash = (hash * 31 + c) % size;
}
return Math.abs (hash);
}
private int hash1 (int value) {
return Math.abs (value % size);
}
public void add (String value) {
int idx = hash1(value);
bits[ idx] = true ;
}
public void add (int value) {
int idx = hash1(value);
bits[ idx] = true ;
}
public boolean check (String value) {
int idx = hash1(value);
return bits[ idx] ;
}
public boolean check (int value) {
int idx = hash1(value);
return bits[ idx] ;
}
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
class BloomFilter :
def __init__ (self, capacity: int):
self. size = capacity
self. bits = [False ] * capacity
def _hash1 (self, value: Union[str, int]) -> int:
if isinstance(value, str):
hash_val = 0
for c in value:
hash_val = (hash_val * 31 + ord(c)) % self. size
return hash_val
else :
return value % self. size
def add (self, value: Union[str, int]) -> None :
idx = self. _hash1(value)
self. bits[idx] = True
def check (self, value: Union[str, int]) -> bool:
idx = self. _hash1(value)
return self. bits[idx]
Complexity#
⏰ Time complexity: O(1)
for both add and check operations, as hash computation and array access are constant time
🧺 Space complexity: O(m)
, where m is the size of the bit array
Method 2 - Multiple Hash Functions Bloom Filter#
Intuition#
Using multiple hash functions reduces the false positive rate significantly. Each element sets multiple bits, and to check membership, all corresponding bits must be set. This makes false positives less likely while maintaining the no-false-negative property.
Approach#
Initialize a boolean array of fixed size
Implement multiple hash functions (typically 2-3 for good balance)
For add(value)
: apply all hash functions and set all corresponding bits
For check(value)
: apply all hash functions and return true only if all bits are set
Use different hash techniques like different seeds or polynomial rolling
Balance number of hash functions vs. false positive rate
Code#
Cpp
Go
Java
Python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
class MultiHashBloomFilter {
private :
vector< bool > bits;
int size;
int numHashes;
int hash1 (const string& value) {
int hash = 0 ;
for (char c : value) {
hash = (hash * 31 + c) % size;
}
return abs(hash);
}
int hash2 (const string& value) {
int hash = 0 ;
for (char c : value) {
hash = (hash * 37 + c) % size;
}
return abs(hash);
}
int hash3 (const string& value) {
int hash = 0 ;
for (char c : value) {
hash = (hash * 41 + c) % size;
}
return abs(hash);
}
int hash1 (int value) {
return abs(value % size);
}
int hash2 (int value) {
return abs((value * 31 ) % size);
}
int hash3 (int value) {
return abs((value * 37 ) % size);
}
vector< int > getHashes(const string& value) {
return {hash1(value), hash2(value), hash3(value)};
}
vector< int > getHashes(int value) {
return {hash1(value), hash2(value), hash3(value)};
}
public :
MultiHashBloomFilter(int capacity, int hashes = 3 ) : size(capacity), numHashes(hashes) {
bits.resize(size, false);
}
void add (const string& value) {
vector< int > indices = getHashes(value);
for (int i = 0 ; i < numHashes && i < indices.size(); i++ ) {
bits[indices[i]] = true;
}
}
void add (int value) {
vector< int > indices = getHashes(value);
for (int i = 0 ; i < numHashes && i < indices.size(); i++ ) {
bits[indices[i]] = true;
}
}
bool check (const string& value) {
vector< int > indices = getHashes(value);
for (int i = 0 ; i < numHashes && i < indices.size(); i++ ) {
if (! bits[indices[i]]) return false;
}
return true;
}
bool check (int value) {
vector< int > indices = getHashes(value);
for (int i = 0 ; i < numHashes && i < indices.size(); i++ ) {
if (! bits[indices[i]]) return false;
}
return true;
}
};
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
type MultiHashBloomFilter struct {
bits []bool
size int
numHashes int
}
func NewMultiHashBloomFilter (capacity , hashes int ) * MultiHashBloomFilter {
return & MultiHashBloomFilter {
bits : make([]bool , capacity ),
size : capacity ,
numHashes : hashes ,
}
}
func (bf * MultiHashBloomFilter ) hash1String (value string ) int {
hash := 0
for _ , c := range value {
hash = (hash * 31 + int(c )) % bf .size
}
if hash < 0 {
hash = - hash
}
return hash
}
func (bf * MultiHashBloomFilter ) hash2String (value string ) int {
hash := 0
for _ , c := range value {
hash = (hash * 37 + int(c )) % bf .size
}
if hash < 0 {
hash = - hash
}
return hash
}
func (bf * MultiHashBloomFilter ) hash3String (value string ) int {
hash := 0
for _ , c := range value {
hash = (hash * 41 + int(c )) % bf .size
}
if hash < 0 {
hash = - hash
}
return hash
}
func (bf * MultiHashBloomFilter ) getHashesString (value string ) []int {
return []int {
bf .hash1String (value ),
bf .hash2String (value ),
bf .hash3String (value ),
}
}
func (bf * MultiHashBloomFilter ) getHashesInt (value int ) []int {
if value < 0 {
value = - value
}
return []int {
value % bf .size ,
(value * 31 ) % bf .size ,
(value * 37 ) % bf .size ,
}
}
func (bf * MultiHashBloomFilter ) AddString (value string ) {
indices := bf .getHashesString (value )
for i := 0 ; i < bf .numHashes && i < len(indices ); i ++ {
bf .bits [indices [i ]] = true
}
}
func (bf * MultiHashBloomFilter ) AddInt (value int ) {
indices := bf .getHashesInt (value )
for i := 0 ; i < bf .numHashes && i < len(indices ); i ++ {
bf .bits [indices [i ]] = true
}
}
func (bf * MultiHashBloomFilter ) CheckString (value string ) bool {
indices := bf .getHashesString (value )
for i := 0 ; i < bf .numHashes && i < len(indices ); i ++ {
if !bf .bits [indices [i ]] {
return false
}
}
return true
}
func (bf * MultiHashBloomFilter ) CheckInt (value int ) bool {
indices := bf .getHashesInt (value )
for i := 0 ; i < bf .numHashes && i < len(indices ); i ++ {
if !bf .bits [indices [i ]] {
return false
}
}
return true
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
class MultiHashBloomFilter {
private boolean [] bits;
private int size;
private int numHashes;
public MultiHashBloomFilter (int capacity, int hashes) {
this .size = capacity;
this .numHashes = hashes;
this .bits = new boolean [ capacity] ;
}
private int hash1 (String value) {
int hash = 0;
for (char c : value.toCharArray ()) {
hash = (hash * 31 + c) % size;
}
return Math.abs (hash);
}
private int hash2 (String value) {
int hash = 0;
for (char c : value.toCharArray ()) {
hash = (hash * 37 + c) % size;
}
return Math.abs (hash);
}
private int hash3 (String value) {
int hash = 0;
for (char c : value.toCharArray ()) {
hash = (hash * 41 + c) % size;
}
return Math.abs (hash);
}
private int [] getHashes (String value) {
return new int [] {hash1(value), hash2(value), hash3(value)};
}
private int [] getHashes (int value) {
return new int [] {
Math.abs (value % size),
Math.abs ((value * 31) % size),
Math.abs ((value * 37) % size)
};
}
public void add (String value) {
int [] indices = getHashes(value);
for (int i = 0; i < numHashes && i < indices.length ; i++ ) {
bits[ indices[ i]] = true ;
}
}
public void add (int value) {
int [] indices = getHashes(value);
for (int i = 0; i < numHashes && i < indices.length ; i++ ) {
bits[ indices[ i]] = true ;
}
}
public boolean check (String value) {
int [] indices = getHashes(value);
for (int i = 0; i < numHashes && i < indices.length ; i++ ) {
if (! bits[ indices[ i]] ) return false ;
}
return true ;
}
public boolean check (int value) {
int [] indices = getHashes(value);
for (int i = 0; i < numHashes && i < indices.length ; i++ ) {
if (! bits[ indices[ i]] ) return false ;
}
return true ;
}
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
class MultiHashBloomFilter :
def __init__ (self, capacity: int, num_hashes: int = 3 ):
self. size = capacity
self. num_hashes = num_hashes
self. bits = [False ] * capacity
def _hash1 (self, value: Union[str, int]) -> int:
if isinstance(value, str):
hash_val = 0
for c in value:
hash_val = (hash_val * 31 + ord(c)) % self. size
return abs(hash_val)
else :
return abs(value % self. size)
def _hash2 (self, value: Union[str, int]) -> int:
if isinstance(value, str):
hash_val = 0
for c in value:
hash_val = (hash_val * 37 + ord(c)) % self. size
return abs(hash_val)
else :
return abs((value * 31 ) % self. size)
def _hash3 (self, value: Union[str, int]) -> int:
if isinstance(value, str):
hash_val = 0
for c in value:
hash_val = (hash_val * 41 + ord(c)) % self. size
return abs(hash_val)
else :
return abs((value * 37 ) % self. size)
def _get_hashes (self, value: Union[str, int]) -> List[int]:
return [self. _hash1(value), self. _hash2(value), self. _hash3(value)]
def add (self, value: Union[str, int]) -> None :
indices = self. _get_hashes(value)
for i in range(min(self. num_hashes, len(indices))):
self. bits[indices[i]] = True
def check (self, value: Union[str, int]) -> bool:
indices = self. _get_hashes(value)
for i in range(min(self. num_hashes, len(indices))):
if not self. bits[indices[i]]:
return False
return True
Complexity#
⏰ Time complexity: O(k)
for both add and check operations, where k is the number of hash functions (typically small constant)
🧺 Space complexity: O(m)
, where m is the size of the bit array, same as single hash but with lower false positive rate