Website Similarity Analysis
Problem
You are given a list of (website, user) pairs that represent users visiting websites. Come up with a program that identifies the top k pairs of websites with the greatest similarity.
For example, suppose k = 1, and the list of tuples is:
[('a', 1), ('a', 3), ('a', 5),
('b', 2), ('b', 6),
('c', 1), ('c', 2), ('c', 3), ('c', 4), ('c', 5),
('d', 4), ('d', 5), ('d', 6), ('d', 7),
('e', 1), ('e', 3), ('e', 5), ('e', 6)]
Then a reasonable similarity metric would most likely conclude that a and e are the most similar, so your program should return [('a', 'e')].
Examples
Example 1
Input: visits = [('a', 1), ('a', 3), ('a', 5), ('b', 2), ('b', 6), ('c', 1), ('c', 2), ('c', 3), ('c', 4), ('c', 5), ('d', 4), ('d', 5), ('d', 6), ('d', 7), ('e', 1), ('e', 3), ('e', 5), ('e', 6)], k = 1
Output: [('a', 'e')]
Explanation: Website 'a' has users {1, 3, 5} and website 'e' has users {1, 3, 5, 6}. They share 3 common users out of 4 total unique users (3/4 = 0.75 Jaccard similarity), which is the highest similarity among all pairs.
Example 2
Input: visits = [('x', 1), ('x', 2), ('y', 1), ('y', 2), ('z', 3)], k = 2
Output: [('x', 'y'), ('x', 'z')]
Explanation: Websites 'x' and 'y' both have users {1, 2} with Jaccard similarity of 1.0 (perfect match). Other pairs have lower similarity, so we return the top 2 pairs.
Example 3
Input: visits = [('a', 1), ('b', 2), ('c', 3)], k = 1
Output: [('a', 'b')]
Explanation: All websites have completely different users, so all pairs have 0 similarity. We return any pair alphabetically first.
Example 4
Input: visits = [], k = 1
Output: []
Explanation: No visits means no websites to compare.
Solution
Method 1 - Jaccard Similarity with Hash Sets
Intuition
Website similarity can be measured using Jaccard similarity, which calculates the ratio of shared users to total unique users between two websites. We need to group users by website, then compute pairwise similarities and find the top k pairs.
Approach
- Group users by website using a hash map where key is website and value is set of users
- Generate all possible pairs of websites
- For each pair, calculate Jaccard similarity: |intersection| / |union|
- Sort all pairs by similarity score in descending order
- Return the top k pairs
- Handle edge cases: empty input, k larger than available pairs
Code
C++
class Solution {
public:
vector<pair<string, string>> topKSimilarWebsites(vector<pair<string, int>>& visits, int k) {
if (visits.empty() || k <= 0) return {};
// Group users by website
unordered_map<string, unordered_set<int>> siteUsers;
for (auto& visit : visits) {
siteUsers[visit.first].insert(visit.second);
}
vector<string> sites;
for (auto& entry : siteUsers) {
sites.push_back(entry.first);
}
// Calculate similarities for all pairs
vector<tuple<double, string, string>> similarities;
for (int i = 0; i < sites.size(); i++) {
for (int j = i + 1; j < sites.size(); j++) {
string site1 = sites[i], site2 = sites[j];
double sim = calculateJaccardSimilarity(siteUsers[site1], siteUsers[site2]);
similarities.push_back({sim, min(site1, site2), max(site1, site2)});
}
}
// Sort by similarity descending, then alphabetically
sort(similarities.begin(), similarities.end(), [](const auto& a, const auto& b) {
if (get<0>(a) != get<0>(b)) return get<0>(a) > get<0>(b);
if (get<1>(a) != get<1>(b)) return get<1>(a) < get<1>(b);
return get<2>(a) < get<2>(b);
});
vector<pair<string, string>> ans;
int limit = min(k, (int)similarities.size());
for (int i = 0; i < limit; i++) {
ans.push_back({get<1>(similarities[i]), get<2>(similarities[i])});
}
return ans;
}
private:
double calculateJaccardSimilarity(const unordered_set<int>& set1, const unordered_set<int>& set2) {
int intersection = 0;
for (int user : set1) {
if (set2.count(user)) intersection++;
}
int unionSize = set1.size() + set2.size() - intersection;
return unionSize == 0 ? 0.0 : (double)intersection / unionSize;
}
};
Go
import "sort"
func topKSimilarWebsites(visits [][]interface{}, k int) [][]string {
if len(visits) == 0 || k <= 0 {
return [][]string{}
}
// Group users by website
siteUsers := make(map[string]map[int]bool)
for _, visit := range visits {
site := visit[0].(string)
user := visit[1].(int)
if siteUsers[site] == nil {
siteUsers[site] = make(map[int]bool)
}
siteUsers[site][user] = true
}
// Get all websites
var sites []string
for site := range siteUsers {
sites = append(sites, site)
}
// Calculate similarities
type similarity struct {
score float64
site1 string
site2 string
}
var similarities []similarity
for i := 0; i < len(sites); i++ {
for j := i + 1; j < len(sites); j++ {
site1, site2 := sites[i], sites[j]
score := calculateJaccardSimilarity(siteUsers[site1], siteUsers[site2])
if site1 > site2 {
site1, site2 = site2, site1
}
similarities = append(similarities, similarity{score, site1, site2})
}
}
// Sort by similarity descending, then alphabetically
sort.Slice(similarities, func(i, j int) bool {
if similarities[i].score != similarities[j].score {
return similarities[i].score > similarities[j].score
}
if similarities[i].site1 != similarities[j].site1 {
return similarities[i].site1 < similarities[j].site1
}
return similarities[i].site2 < similarities[j].site2
})
limit := k
if limit > len(similarities) {
limit = len(similarities)
}
ans := make([][]string, limit)
for i := 0; i < limit; i++ {
ans[i] = []string{similarities[i].site1, similarities[i].site2}
}
return ans
}
func calculateJaccardSimilarity(set1, set2 map[int]bool) float64 {
intersection := 0
for user := range set1 {
if set2[user] {
intersection++
}
}
unionSize := len(set1) + len(set2) - intersection
if unionSize == 0 {
return 0.0
}
return float64(intersection) / float64(unionSize)
}
Java
class Solution {
public List<List<String>> topKSimilarWebsites(List<List<Object>> visits, int k) {
if (visits.isEmpty() || k <= 0) return new ArrayList<>();
// Group users by website
Map<String, Set<Integer>> siteUsers = new HashMap<>();
for (List<Object> visit : visits) {
String site = (String) visit.get(0);
Integer user = (Integer) visit.get(1);
siteUsers.computeIfAbsent(site, x -> new HashSet<>()).add(user);
}
List<String> sites = new ArrayList<>(siteUsers.keySet());
// Calculate similarities
List<Similarity> similarities = new ArrayList<>();
for (int i = 0; i < sites.size(); i++) {
for (int j = i + 1; j < sites.size(); j++) {
String site1 = sites.get(i), site2 = sites.get(j);
double score = calculateJaccardSimilarity(siteUsers.get(site1), siteUsers.get(site2));
String first = site1.compareTo(site2) < 0 ? site1 : site2;
String second = site1.compareTo(site2) < 0 ? site2 : site1;
similarities.add(new Similarity(score, first, second));
}
}
// Sort by similarity descending, then alphabetically
similarities.sort((a, b) -> {
if (Double.compare(a.score, b.score) != 0) {
return Double.compare(b.score, a.score);
}
if (!a.site1.equals(b.site1)) {
return a.site1.compareTo(b.site1);
}
return a.site2.compareTo(b.site2);
});
List<List<String>> ans = new ArrayList<>();
int limit = Math.min(k, similarities.size());
for (int i = 0; i < limit; i++) {
Similarity sim = similarities.get(i);
ans.add(Arrays.asList(sim.site1, sim.site2));
}
return ans;
}
private double calculateJaccardSimilarity(Set<Integer> set1, Set<Integer> set2) {
Set<Integer> intersection = new HashSet<>(set1);
intersection.retainAll(set2);
Set<Integer> union = new HashSet<>(set1);
union.addAll(set2);
return union.isEmpty() ? 0.0 : (double) intersection.size() / union.size();
}
private static class Similarity {
double score;
String site1, site2;
Similarity(double score, String site1, String site2) {
this.score = score;
this.site1 = site1;
this.site2 = site2;
}
}
}
Python
class Solution:
def top_k_similar_websites(self, visits: List[List], k: int) -> List[List[str]]:
if not visits or k <= 0:
return []
# Group users by website
site_users = {}
for site, user in visits:
if site not in site_users:
site_users[site] = set()
site_users[site].add(user)
sites = list(site_users.keys())
# Calculate similarities
similarities = []
for i in range(len(sites)):
for j in range(i + 1, len(sites)):
site1, site2 = sites[i], sites[j]
score = self._calculate_jaccard_similarity(site_users[site1], site_users[site2])
first, second = (site1, site2) if site1 < site2 else (site2, site1)
similarities.append((score, first, second))
# Sort by similarity descending, then alphabetically
similarities.sort(key=lambda x: (-x[0], x[1], x[2]))
ans = []
limit = min(k, len(similarities))
for i in range(limit):
ans.append([similarities[i][1], similarities[i][2]])
return ans
def _calculate_jaccard_similarity(self, set1: set, set2: set) -> float:
intersection = len(set1 & set2)
union = len(set1 | set2)
return intersection / union if union > 0 else 0.0
Complexity
- ⏰ Time complexity:
O(V + W^2 * U), where V is the number of visits, W is the number of websites, and U is the average number of users per website. We process all visits once, then compare all pairs of websites - 🧺 Space complexity:
O(V + W^2), for storing the user sets and similarity results
Method 2 - Optimized with Early Termination
Intuition
We can optimize by using a heap to maintain only the top k similarities instead of computing and storing all pairs. This is especially useful when k is much smaller than the total number of website pairs.
Approach
- Group users by website as before
- Use a min-heap of size k to track the top k similarities
- For each pair of websites, calculate similarity
- If heap size < k, add the pair to heap
- If similarity > minimum in heap, replace the minimum
- Return all pairs from the heap
Code
C++
class Solution {
public:
vector<pair<string, string>> topKSimilarWebsites(vector<pair<string, int>>& visits, int k) {
if (visits.empty() || k <= 0) return {};
// Group users by website
unordered_map<string, unordered_set<int>> siteUsers;
for (auto& visit : visits) {
siteUsers[visit.first].insert(visit.second);
}
vector<string> sites;
for (auto& entry : siteUsers) {
sites.push_back(entry.first);
}
// Use min-heap to maintain top k similarities
priority_queue<tuple<double, string, string>, vector<tuple<double, string, string>>, greater<>> minHeap;
for (int i = 0; i < sites.size(); i++) {
for (int j = i + 1; j < sites.size(); j++) {
string site1 = sites[i], site2 = sites[j];
double sim = calculateJaccardSimilarity(siteUsers[site1], siteUsers[site2]);
string first = min(site1, site2), second = max(site1, site2);
if (minHeap.size() < k) {
minHeap.push({sim, first, second});
} else if (sim > get<0>(minHeap.top())) {
minHeap.pop();
minHeap.push({sim, first, second});
}
}
}
vector<pair<string, string>> ans;
while (!minHeap.empty()) {
auto [sim, site1, site2] = minHeap.top();
minHeap.pop();
ans.push_back({site1, site2});
}
reverse(ans.begin(), ans.end());
return ans;
}
private:
double calculateJaccardSimilarity(const unordered_set<int>& set1, const unordered_set<int>& set2) {
int intersection = 0;
for (int user : set1) {
if (set2.count(user)) intersection++;
}
int unionSize = set1.size() + set2.size() - intersection;
return unionSize == 0 ? 0.0 : (double)intersection / unionSize;
}
};
Go
import (
"container/heap"
"sort"
)
type SimilarityHeap []Similarity
func (h SimilarityHeap) Len() int { return len(h) }
func (h SimilarityHeap) Less(i, j int) bool { return h[i].score < h[j].score }
func (h SimilarityHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
func (h *SimilarityHeap) Push(x interface{}) {
*h = append(*h, x.(Similarity))
}
func (h *SimilarityHeap) Pop() interface{} {
old := *h
n := len(old)
item := old[n-1]
*h = old[0 : n-1]
return item
}
func topKSimilarWebsitesOptimized(visits [][]interface{}, k int) [][]string {
if len(visits) == 0 || k <= 0 {
return [][]string{}
}
// Group users by website
siteUsers := make(map[string]map[int]bool)
for _, visit := range visits {
site := visit[0].(string)
user := visit[1].(int)
if siteUsers[site] == nil {
siteUsers[site] = make(map[int]bool)
}
siteUsers[site][user] = true
}
var sites []string
for site := range siteUsers {
sites = append(sites, site)
}
// Use min-heap for top k
h := &SimilarityHeap{}
heap.Init(h)
for i := 0; i < len(sites); i++ {
for j := i + 1; j < len(sites); j++ {
site1, site2 := sites[i], sites[j]
score := calculateJaccardSimilarity(siteUsers[site1], siteUsers[site2])
first, second := site1, site2
if site1 > site2 {
first, second = site2, site1
}
if h.Len() < k {
heap.Push(h, Similarity{score, first, second})
} else if score > (*h)[0].score {
heap.Pop(h)
heap.Push(h, Similarity{score, first, second})
}
}
}
// Extract results in descending order
ans := make([][]string, h.Len())
for i := len(ans) - 1; i >= 0; i-- {
sim := heap.Pop(h).(Similarity)
ans[i] = []string{sim.site1, sim.site2}
}
return ans
}
Java
class Solution {
public List<List<String>> topKSimilarWebsites(List<List<Object>> visits, int k) {
if (visits.isEmpty() || k <= 0) return new ArrayList<>();
// Group users by website
Map<String, Set<Integer>> siteUsers = new HashMap<>();
for (List<Object> visit : visits) {
String site = (String) visit.get(0);
Integer user = (Integer) visit.get(1);
siteUsers.computeIfAbsent(site, x -> new HashSet<>()).add(user);
}
List<String> sites = new ArrayList<>(siteUsers.keySet());
// Use min-heap for top k
PriorityQueue<Similarity> minHeap = new PriorityQueue<>(
Comparator.comparingDouble(s -> s.score)
);
for (int i = 0; i < sites.size(); i++) {
for (int j = i + 1; j < sites.size(); j++) {
String site1 = sites.get(i), site2 = sites.get(j);
double score = calculateJaccardSimilarity(siteUsers.get(site1), siteUsers.get(site2));
String first = site1.compareTo(site2) < 0 ? site1 : site2;
String second = site1.compareTo(site2) < 0 ? site2 : site1;
if (minHeap.size() < k) {
minHeap.offer(new Similarity(score, first, second));
} else if (score > minHeap.peek().score) {
minHeap.poll();
minHeap.offer(new Similarity(score, first, second));
}
}
}
List<List<String>> ans = new ArrayList<>();
while (!minHeap.isEmpty()) {
Similarity sim = minHeap.poll();
ans.add(Arrays.asList(sim.site1, sim.site2));
}
Collections.reverse(ans);
return ans;
}
private double calculateJaccardSimilarity(Set<Integer> set1, Set<Integer> set2) {
Set<Integer> intersection = new HashSet<>(set1);
intersection.retainAll(set2);
Set<Integer> union = new HashSet<>(set1);
union.addAll(set2);
return union.isEmpty() ? 0.0 : (double) intersection.size() / union.size();
}
private static class Similarity {
double score;
String site1, site2;
Similarity(double score, String site1, String site2) {
this.score = score;
this.site1 = site1;
this.site2 = site2;
}
}
}
Python
import heapq
class Solution:
def top_k_similar_websites(self, visits: List[List], k: int) -> List[List[str]]:
if not visits or k <= 0:
return []
# Group users by website
site_users = {}
for site, user in visits:
if site not in site_users:
site_users[site] = set()
site_users[site].add(user)
sites = list(site_users.keys())
# Use min-heap for top k
min_heap = []
for i in range(len(sites)):
for j in range(i + 1, len(sites)):
site1, site2 = sites[i], sites[j]
score = self._calculate_jaccard_similarity(site_users[site1], site_users[site2])
first, second = (site1, site2) if site1 < site2 else (site2, site1)
if len(min_heap) < k:
heapq.heappush(min_heap, (score, first, second))
elif score > min_heap[0][0]:
heapq.heapreplace(min_heap, (score, first, second))
# Extract results in descending order
ans = []
while min_heap:
score, site1, site2 = heapq.heappop(min_heap)
ans.append([site1, site2])
ans.reverse()
return ans
def _calculate_jaccard_similarity(self, set1: set, set2: set) -> float:
intersection = len(set1 & set2)
union = len(set1 | set2)
return intersection / union if union > 0 else 0.0
Complexity
- ⏰ Time complexity:
O(V + W^2 * U + W^2 * log k), where the additional log k factor comes from heap operations, but this is better when k << W^2 - 🧺 Space complexity:
O(V + k), significantly better space usage when k is small compared to total number of pairs