A string-searching algorithm that uses rolling hash to find an exact match of a pattern string in a text.
The expected time complexity is O(S+T) where S and T are the lengths of the original string s and the pattern string t respectively.
Its worst-case time complexity is O(ST).
Algorithm
First get the hash ht of the pattern string t.
We keep a sliding window of size T and compute the hash hs of the substring of s within the window.
Whenever ht == hs, the part in the window might be the same as t. In case of hash conflict, we linearly check the equity of them.
Implementation
Version 1
The hashes are signed long long.
We need to use a number m to mod the hashes in case of overflow.
We need to do hs += m when hs < 0.
// OJ: https://leetcode.com/problems/implement-strstr/
// Author: github.com/lzl124631x
// Time: average O(S+T), worst O(ST)
// Space: O(1)
class Solution {
typedef long long LL;
public:
int strStr(string s, string t) {
int S = s.size(), T = t.size(), d = 128;
if (!S || !T || T > S) return T ? -1 : 0;
LL m = 1e9+7, p = 1, hs = s[0], ht = t[0];
for (int i = 1; i < T; ++i) {
p = (p * d) % m; // we need d^(T-1)
ht = (ht * d + t[i]) % m;
hs = (hs * d + s[i]) % m;
}
for (int i = 0; i <= S - T; ++i) { // Loop for each start/pop point
if (hs == ht) { // in case of collision, check the equity.
int j = 0;
for (; j < T && s[i + j] == t[j]; ++j);
if (j == T) return i;
}
if (i < S - T) hs = ((hs - s[i] * p) * d + s[i + T]) % m;
if (hs < 0) hs += m; // we might get negative hs, converting it to positive
}
return -1;
}
};
Version 2
This version is translated from standard golang implementation.
The hashes are unsigned int.
No mod number m is needed! It will automatically mod by 2^32 when overflow happens.
// OJ: https://leetcode.com/problems/implement-strstr/
// Author: github.com/lzl124631x
// Time: average O(S+T), worst O(ST)
// Space: O(T)
// Ref: https://github.com/golang/go/blob/ba9e10889976025ee1d027db6b1cad383ec56de8/src/internal/bytealg/bytealg.go#L128
class Solution {
const unsigned primeRK = 16777619;
pair<unsigned, unsigned> hashStr(string &s) {
unsigned h = 0, p = 1, sq = primeRK;
for (char c : s) h = h * primeRK + c;
for (int i = s.size(); i > 0; i >>= 1) { // fast pow
if (i & 1) p *= sq;
sq *= sq;
}
return { h, p };
}
public:
int strStr(string s, string t) {
if (t.empty()) return 0;
unsigned T = t.size(), hs = 0;
auto [ht, p] = hashStr(t);
for (int i = 0; i < T; ++i) hs = hs * primeRK + s[i];
if (hs == ht && s.substr(0, T) == t) return 0;
for (int i = T; i < s.size(); ) {
hs = hs * primeRK + s[i] - p * s[i - T];
++i;
if (hs == ht && s.substr(i - T, T) == t) return i - T;
}
return -1;
}
};
Version 3
No mod number m is needed! It will automatically mod by 2^32 when overflow happens.
// OJ: https://leetcode.com/problems/implement-strstr/
// Author: github.com/lzl124631x
// Time: average O(S + T), worst O(ST)
// Space: O(1)
class Solution {
public:
int strStr(string s, string t) {
unsigned long long S = s.size(), T = t.size(), d = 1099511628211, p = 1, hs = 0, ht = 0; // we can use d = 29 as well or some other prime greater than the size of the character set.
if (!S || !T || T > S) return T ? -1 : 0;
for (int i = 0; i < S; ++i) {
hs = hs * d + s[i] - 'a';
if (i < T) {
p *= d;
ht = ht * d + t[i] - 'a';
}
if (i >= T) hs -= (s[i - T] - 'a') * p;
if (i >= T - 1 && hs == ht) {
int j = 0, start = i - T + 1;
for (; j < T && s[start + j] == t[j]; ++j);
if (j == T) return start;
}
}
return -1;
}
};
Why 1099511628211?
1099511628211 is the FNV(Fowler–Noll–Vo) prime for 64 bit integer. When using FNV prime, the hash values produced are more scattered throughout the hash space.
Rabin Karp algorithm or rolling hash is often used in problems where you need to find repeated subarray/substring in a long array/string.
Examples:
vector<ULL> rolling(vector<int> &A, int len) {
unsigned long long d = 1099511628211, h = 0, p = 1;
vector<ULL> ans;
for (int i = 0; i < A.size(); ++i) {
h = h * d + A[i];
if (i < len) p *= d;
else h -= A[i - len] * p;
if (i >= len - 1) ans.push_back(h);
}
return ans;
}
Prefix Hash Array
// Generate Prefix Hash Array
unsigned long long d = 1099511628211, h[501] = {}, p[501] = {1}, N = s.size();
for (int i = 0; i < N; ++i) {
p[i + 1] = p[i] * d;
h[i + 1] = h[i] * d + s[i];
}
// Access a hash of a substring of s[begin, end)
unsigned long long hash(int begin, int end) {
return h[end] - h[begin] * p[end - begin];
}
// OJ: https://leetcode.com/problems/number-of-distinct-substrings-in-a-string/
// Author: github.com/lzl124631x
// Time: O(N^2)
// Space: O(N^2)
class Solution {
unsigned long long d = 1099511628211, h[501] = {}, p[501] = {1};
unsigned long long hash(int begin, int end) {
return h[end] - h[begin] * p[end - begin];
}
public:
int countDistinct(string s) {
int N = s.size();
for (int i = 0; i < N; ++i) {
p[i + 1] = p[i] * d;
h[i + 1] = h[i] * d + s[i];
}
unordered_set<unsigned long long> seen;
for (int len = 1; len <= N; ++len) {
for (int i = 0; i + len <= N; ++i) {
seen.insert(hash(i, i + len));
}
}
return seen.size();
}
};