KMP String Matching Algorithm
The Knuth-Morris-Pratt (KMP) algorithm is a fundamental string matching algorithm that achieves O(n+m) time complexity by preprocessing the pattern to build a failure function, avoiding redundant comparisons during matching.
KMP Algorithm Implementation
The KMP algorithm uses a failure function (Ο function) to record the longest proper prefix which is also a suffix for each position in the pattern.
KMP Algorithm Basic Implementation
#include <iostream>
#include <vector>
#include <string>
using namespace std;
class KMP {
private:
string pattern;
vector<int> pi; // failure function
public:
KMP(const string& p) : pattern(p) {
computePi();
}
// Build failure function
void computePi() {
int m = pattern.length();
pi.assign(m, 0);
for (int i = 1; i < m; i++) {
int j = pi[i - 1];
// Find longest proper prefix which is also suffix
while (j > 0 && pattern[i] != pattern[j]) {
j = pi[j - 1];
}
if (pattern[i] == pattern[j]) {
j++;
}
pi[i] = j;
}
}
// Find all match positions in text
vector<int> findAll(const string& text) {
vector<int> matches;
int n = text.length();
int m = pattern.length();
if (m == 0) return matches;
int j = 0; // pattern index
for (int i = 0; i < n; i++) {
// Use failure function to skip unnecessary comparisons
while (j > 0 && text[i] != pattern[j]) {
j = pi[j - 1];
}
if (text[i] == pattern[j]) {
j++;
}
// Found complete match
if (j == m) {
matches.push_back(i - m + 1);
j = pi[j - 1]; // continue searching for next match
}
}
return matches;
}
// Get failure function array
vector<int> getPi() const {
return pi;
}
// Show failure function construction process
void showPiConstruction() {
cout << "Pattern: " << pattern << endl;
cout << "Index: ";
for (int i = 0; i < pattern.length(); i++) {
cout << i << " ";
}
cout << endl;
cout << "Ο function: ";
for (int x : pi) {
cout << x << " ";
}
cout << endl;
}
};
int main() {
string text = "ABABDABACDABABCABCABCABABABCAC";
string pattern = "ABABCAC";
cout << "Text: " << text << endl;
cout << "Pattern: " << pattern << endl << endl;
KMP kmp(pattern);
kmp.showPiConstruction();
vector<int> matches = kmp.findAll(text);
cout << "\nMatch positions: ";
for (int pos : matches) {
cout << pos << " ";
}
cout << endl;
// Show matched substrings
cout << "\nMatch details:" << endl;
for (int pos : matches) {
cout << "Position " << pos << ": " << text.substr(pos, pattern.length()) << endl;
}
return 0;
}The KMP algorithm uses the Ο function to record the length of the longest proper prefix which is also a suffix at each position, enabling efficient skipping of known non-matching parts.
Z Algorithm - Extended KMP
The Z algorithm computes the longest common prefix between the string and each of its suffixes, providing an alternative approach to string matching.
Z Algorithm Implementation
#include <iostream>
#include <vector>
#include <string>
#include <algorithm>
using namespace std;
class ZAlgorithm {
public:
// Compute Z array
static vector<int> computeZ(const string& s) {
int n = s.length();
vector<int> z(n);
z[0] = n; // z[0] is defined as string length
int l = 0, r = 0; // Z-box boundaries
for (int i = 1; i < n; i++) {
if (i <= r) {
// i is inside Z-box, use previous information
z[i] = min(r - i + 1, z[i - l]);
}
// Try to extend the match
while (i + z[i] < n && s[z[i]] == s[i + z[i]]) {
z[i]++;
}
// Update Z-box
if (i + z[i] - 1 > r) {
l = i;
r = i + z[i] - 1;
}
}
return z;
}
// String matching using Z algorithm
static vector<int> stringMatch(const string& text, const string& pattern) {
string combined = pattern + "#" + text; // separate with special character
vector<int> z = computeZ(combined);
vector<int> matches;
int patternLen = pattern.length();
for (int i = patternLen + 1; i < combined.length(); i++) {
if (z[i] == patternLen) {
matches.push_back(i - patternLen - 1);
}
}
return matches;
}
// Find all palindromic substrings
static vector<pair<int, int>> findPalindromes(const string& s) {
vector<pair<int, int>> palindromes;
int n = s.length();
// Check odd-length palindromes
for (int center = 0; center < n; center++) {
string left = s.substr(0, center + 1);
reverse(left.begin(), left.end());
string right = s.substr(center);
string combined = left + "#" + right;
vector<int> z = computeZ(combined);
for (int i = left.length() + 1; i < combined.length(); i++) {
int len = z[i];
if (len > 0 && center - len + 1 >= 0 && center + len - 1 < n) {
palindromes.push_back({center - len + 1, center + len - 1});
}
}
}
return palindromes;
}
};
int main() {
string text = "AABAACAADAABAABA";
string pattern = "AABA";
cout << "Text: " << text << endl;
cout << "Pattern: " << pattern << endl;
// Compute Z array for pattern
vector<int> z = ZAlgorithm::computeZ(pattern);
cout << "\nZ array for pattern: ";
for (int x : z) cout << x << " ";
cout << endl;
// Find matches
vector<int> matches = ZAlgorithm::stringMatch(text, pattern);
cout << "\nMatch positions: ";
for (int pos : matches) {
cout << pos << " ";
}
cout << endl;
// Show matched substrings
cout << "\nMatched substrings:" << endl;
for (int pos : matches) {
cout << "Position " << pos << ": " << text.substr(pos, pattern.length()) << endl;
}
return 0;
}The Z algorithm maintains a Z-box [l, r] representing the rightmost segment that matches a prefix. This allows efficient computation of Z values using previously computed information.
Key Takeaways
- KMP achieves O(n+m) time complexity by preprocessing the pattern to build a failure function
- The failure function Ο[i] stores the length of the longest proper prefix that is also a suffix
- Z algorithm computes longest common prefix with the entire string for each position
- Both algorithms are fundamental for advanced string processing and pattern matching problems