Problem
Given a string s
, consider all duplicated substrings: (contiguous) substrings of s that occur 2 or more times. The occurrences may overlap.
Return any duplicated substring that has the longest possible length. If s
does not have a duplicated substring, the answer is ""
.
Examples
Example 1:
Input: s = "banana"
Output: "ana"
Example 2:
Input: s = "abcd"
Output: ""
Solution
Method 1 - LCS
One effective way to approach this problem is by using the longest common substring (LCS) technique. We can utilize suffix array construction and longest common prefix (LCP) array to efficiently solve the problem.
Here are the steps:
- Suffix Array: Construct a suffix array for the string
s
. This array contains the starting indices of all suffixes ofs
sorted in lexicographical order. - LCP Array: Construct a longest common prefix (LCP) array which stores the lengths of the longest common prefixes between consecutive suffixes in the suffix array.
- Find Maximum Length: Traverse the LCP array to find the maximum value, which corresponds to the longest duplicated substring in the original string.
Code
Java
public class Solution {
public String longestDupSubstring(String s) {
int n = s.length();
int[] suffixArr = buildSuffixArray(s, n);
int[] lcp = buildLCPArray(s, suffixArr, n);
int maxLen = 0;
int startIndex = 0;
for (int i = 1; i < n; i++) {
if (lcp[i] > maxLen) {
maxLen = lcp[i];
startIndex = suffixArr[i];
}
}
return maxLen > 0 ? s.substring(startIndex, startIndex + maxLen) : "";
}
private int[] buildSuffixArray(String s, int n) {
Integer[] suffixArr = new Integer[n];
int[] rank = new int[n];
int[] temp = new int[n];
for (int i = 0; i < n; i++) {
suffixArr[i] = i;
rank[i] = s.charAt(i) - 'a';
}
for (int k = 1; k < n; k *= 2) {
Arrays.sort(suffixArr, (i, j) -> {
if (rank[i] != rank[j])
return Integer.compare(rank[i], rank[j]);
int ri = i + k < n ? rank[i + k] : -1;
int rj = j + k < n ? rank[j + k] : -1;
return Integer.compare(ri, rj);
});
temp[suffixArr[0]] = 0;
for (int i = 1; i < n; i++) {
temp[suffixArr[i]] = temp[suffixArr[i - 1]];
if (compare(s, suffixArr[i - 1], suffixArr[i], k, rank)) {
temp[suffixArr[i]]++;
}
}
System.arraycopy(temp, 0, rank, 0, n);
}
int[] finalSuffixArr = new int[n];
for (int i = 0; i < n; i++) {
finalSuffixArr[i] = suffixArr[i];
}
return finalSuffixArr;
}
private boolean compare(String s, int i, int j, int k, int[] rank) {
if (rank[i] != rank[j]) return false;
int ri = i + k < s.length() ? rank[i + k] : -1;
int rj = j + k < s.length() ? rank[j + k] : -1;
return ri == rj;
}
private int[] buildLCPArray(String s, int[] suffixArr, int n) {
int[] rank = new int[n];
for (int i = 0; i < n; i++) {
rank[suffixArr[i]] = i;
}
int[] lcp = new int[n];
int h = 0;
for (int i = 0; i < n; i++) {
if (rank[i] > 0) {
int j = suffixArr[rank[i] - 1];
while (i + h < n && j + h < n && s.charAt(i + h) == s.charAt(j + h)) {
h++;
}
lcp[rank[i]] = h;
if (h > 0) h--;
}
}
return lcp;
}
}
Python
class Solution:
def longestDupSubstring(self, s: str) -> str:
suffix_arr = self.build_suffix_array(s)
lcp = self.build_lcp_array(s, suffix_arr)
max_len = 0
start_index = 0
for i in range(1, len(s)):
if lcp[i] > max_len:
max_len = lcp[i]
start_index = suffix_arr[i]
return s[start_index:start_index + max_len] if max_len > 0 else ""
def build_suffix_array(self, s: str) -> List[int]:
n = len(s)
suffix_arr = list(range(n))
rank = [ord(s[i]) for i in range(n)]
k = 1
while k < n:
suffix_arr.sort(key=lambda x: (rank[x], rank[x + k] if x + k < n else -1))
temp_rank = [0] * n
for i in range(1, n):
temp_rank[suffix_arr[i]] = temp_rank[suffix_arr[i - 1]]
if rank[suffix_arr[i]] != rank[suffix_arr[i - 1]] or \
rank[suffix_arr[i] + k if suffix_arr[i] + k < n else 0] != rank[suffix_arr[i - 1] + k if suffix_arr[i - 1] + k < n else 0]:
temp_rank[suffix_arr[i]] += 1
rank = temp_rank
k *= 2
return suffix_arr
def build_lcp_array(self, s: str, suffix_arr: List[int]) -> List[int]:
n = len(s)
rank = [0] * n
lcp = [0] * n
for i, suffix in enumerate(suffix_arr):
rank[suffix] = i
h = 0
for i in range(n):
if rank[i] > 0:
j = suffix_arr[rank[i] - 1]
while i + h < n and j + h < n and s[i + h] == s[j + h]:
h += 1
lcp[rank[i]] = h
if h > 0:
h -= 1
return lcp
Complexity
- ⏰ Time complexity:
O(n * log(n))
for suffix array construction andO(n)
for LCP array construction and maximum length finding. Overall, it isO(n * log(n))
. - 🧺 Space complexity:
O(n)
for storing the suffix array, LCP array, and auxiliary data structures.
Method 2 - Trie + Binary Search
To solve the problem of finding the longest duplicated substring in a given string s
using a Trie and binary search, we can follow these steps:
- Binary Search:
- Use binary search to determine the maximum length of the duplicated substring. The idea is to check for duplicated substrings of varying lengths, starting from half of the string length and narrowing down.
- Trie:
- For each length determined by the binary search, use a Trie to check if there are any duplicated substrings of that length.
- Insert substrings of the given length into the Trie and check for duplicates.
Code
Java
public class Solution {
private class TrieNode {
TrieNode[] children = new TrieNode[26];
boolean isEnd;
}
public String longestDupSubstring(String s) {
int n = s.length();
int left = 1, right = n - 1;
String ans = "";
while (left <= right) {
int mid = left + (right - left) / 2;
String dup = searchDuplicate(s, mid);
if (dup != null) {
ans = dup;
left = mid + 1;
} else {
right = mid - 1;
}
}
return ans;
}
private String searchDuplicate(String s, int length) {
TrieNode root = new TrieNode();
for (int i = 0; i <= s.length() - length; i++) {
String substr = s.substring(i, i + length);
if (insert(root, substr)) {
return substr;
}
}
return null;
}
private boolean insert(TrieNode root, String s) {
TrieNode current = root;
boolean isExisting = true;
for (char c : s.toCharArray()) {
int index = c - 'a';
if (current.children[index] == null) {
current.children[index] = new TrieNode();
isExisting = false;
}
current = current.children[index];
}
if (current.isEnd) {
return true;
} else {
current.isEnd = true;
return false;
}
}
}
Python
class TrieNode:
def __init__(self):
self.children = {}
self.is_end = False
class Solution:
def longestDupSubstring(self, s: str) -> str:
n = len(s)
left, right = 1, n - 1
ans = ""
while left <= right:
mid = (left + right) // 2
dup = self.search_duplicate(s, mid)
if dup:
ans = dup
left = mid + 1
else:
right = mid - 1
return ans
def search_duplicate(self, s: str, length: int) -> str:
root = TrieNode()
for i in range(len(s) - length + 1):
substr = s[i:i + length]
if self.insert(root, substr):
return substr
return None
def insert(self, root: TrieNode, s: str) -> bool:
current = root
is_existing = True
for char in s:
if char not in current.children:
current.children[char] = TrieNode()
is_existing = False
current = current.children[char]
if current.is_end:
return True
else:
current.is_end = True
return False
Complexity
- ⏰ Time complexity:
O(n^2 log n)
, wheren
is the length of the strings
. Inserting and searching in a Trie takesO(n^2)
in the worst case, and binary search adds a logarithmic factor. - 🧺 Space complexity:
O(n^2)
for storing substrings in the Trie.
Method 3 - Rabin Karp Like Rolling Hash + Binary Search
Lets define 3 functions:
hash() :
computes hash for a given stringnextHash() :
computes next hash by removing first letter and adding next lettergetDup() :
returns a duplicate string for a given window size
We have already seen how to calculate hashcode and rolling hashcode here:
But this is a bad hash, as it wil have a lot of collisions, as say string abc
will also have hash as 6. Read more here - Good vs Bad Hash function examples.
We can take stronger has like Rabin-Karp Algorithm Explained#Algorithm with Rolling Hash. This will result in less collisions.
bac
Here i chose 31
as the prime number for helping with hash. How to calculate the hash value. So, suppose we have a string bac
, the
Code
Java
public class Solution {
public String longestDupSubstring(String s) {
int n = s.length();
int left = 1, right = n - 1;
String ans = "";
while (left <= right) {
int mid = left + (right - left) / 2;
String dup = getDup(s, mid);
if (dup != null) {
ans = dup;
left = mid + 1;
} else {
right = mid - 1;
}
}
return ans;
}
private String getDup(String s, int length) {
Set<Long> set = new HashSet<>();
long h = hash(s, length), power = 1;
for (int i = 1; i < length; i++) {
power = power * 31;
}
set.add(h);
for (int i = 1; i <= s.length() - length; i++) {
h = h * 31 - s.charAt(i - 1) * power + s.charAt(i + length - 1);
if (!set.add(h)) {
if (s.substring(i, i + length).equals(s.substring(l - length, l))) {
return s.substring(i, i + length);
}
}
}
return null;
}
private long hash(String s, int length) {
long h = 0;
for (int i = 0; i < length; i++) {
h = h * 31 + s.charAt(i);
}
return h;
}
}
Python
class Solution:
def longestDupSubstring(self, s: str) -> str:
def hash(s: str, size: int) -> int:
h = 0
for i in range(size):
h = h * 31 + ord(s[i])
return h
def nextHash(h: int, start: int, size: int) -> int:
h = h * 31 - ord(s[start - 1]) * power + ord(s[start + size - 1])
return h
def getDup(size: int) -> str:
seen = {}
h = hash(s, size)
seen[h] = 0
for i in range(1, len(s) - size + 1):
h = nextHash(h, i, size)
if h in seen:
if s[i:i + size] == s[seen[h]:seen[h] + size]:
return s[i:i + size]
seen[h] = i
return ""
n = len(s)
left, right = 1, n-1
power = 31**(right)
result = ""
while left <= right:
mid = left + (right - left) // 2
dup = getDup(mid)
if dup:
result = dup
left = mid + 1
else:
right = mid - 1
return result
Method 4 - Ukonnen Suffix Tree
to do later