KMP String Matching Algorithm

The Knuth-Morris-Pratt (KMP) algorithm is a fundamental string matching algorithm that achieves O(n+m) time complexity by preprocessing the pattern to build a failure function, avoiding redundant comparisons during matching.

KMP Algorithm Implementation

The KMP algorithm uses a failure function (Ο€ function) to record the longest proper prefix which is also a suffix for each position in the pattern.

KMP Algorithm Basic Implementation

#include <iostream>
#include <vector>
#include <string>
using namespace std;

class KMP {
private:
    string pattern;
    vector<int> pi; // failure function
    
public:
    KMP(const string& p) : pattern(p) {
        computePi();
    }
    
    // Build failure function
    void computePi() {
        int m = pattern.length();
        pi.assign(m, 0);
        
        for (int i = 1; i < m; i++) {
            int j = pi[i - 1];
            
            // Find longest proper prefix which is also suffix
            while (j > 0 && pattern[i] != pattern[j]) {
                j = pi[j - 1];
            }
            
            if (pattern[i] == pattern[j]) {
                j++;
            }
            
            pi[i] = j;
        }
    }
    
    // Find all match positions in text
    vector<int> findAll(const string& text) {
        vector<int> matches;
        int n = text.length();
        int m = pattern.length();
        
        if (m == 0) return matches;
        
        int j = 0; // pattern index
        
        for (int i = 0; i < n; i++) {
            // Use failure function to skip unnecessary comparisons
            while (j > 0 && text[i] != pattern[j]) {
                j = pi[j - 1];
            }
            
            if (text[i] == pattern[j]) {
                j++;
            }
            
            // Found complete match
            if (j == m) {
                matches.push_back(i - m + 1);
                j = pi[j - 1]; // continue searching for next match
            }
        }
        
        return matches;
    }
    
    // Get failure function array
    vector<int> getPi() const {
        return pi;
    }
    
    // Show failure function construction process
    void showPiConstruction() {
        cout << "Pattern: " << pattern << endl;
        cout << "Index:   ";
        for (int i = 0; i < pattern.length(); i++) {
            cout << i << " ";
        }
        cout << endl;
        cout << "Ο€ function: ";
        for (int x : pi) {
            cout << x << " ";
        }
        cout << endl;
    }
};

int main() {
    string text = "ABABDABACDABABCABCABCABABABCAC";
    string pattern = "ABABCAC";
    
    cout << "Text: " << text << endl;
    cout << "Pattern: " << pattern << endl << endl;
    
    KMP kmp(pattern);
    kmp.showPiConstruction();
    
    vector<int> matches = kmp.findAll(text);
    
    cout << "\nMatch positions: ";
    for (int pos : matches) {
        cout << pos << " ";
    }
    cout << endl;
    
    // Show matched substrings
    cout << "\nMatch details:" << endl;
    for (int pos : matches) {
        cout << "Position " << pos << ": " << text.substr(pos, pattern.length()) << endl;
    }
    
    return 0;
}

The KMP algorithm uses the Ο€ function to record the length of the longest proper prefix which is also a suffix at each position, enabling efficient skipping of known non-matching parts.

Z Algorithm - Extended KMP

The Z algorithm computes the longest common prefix between the string and each of its suffixes, providing an alternative approach to string matching.

Z Algorithm Implementation

#include <iostream>
#include <vector>
#include <string>
#include <algorithm>
using namespace std;

class ZAlgorithm {
public:
    // Compute Z array
    static vector<int> computeZ(const string& s) {
        int n = s.length();
        vector<int> z(n);
        z[0] = n; // z[0] is defined as string length
        
        int l = 0, r = 0; // Z-box boundaries
        
        for (int i = 1; i < n; i++) {
            if (i <= r) {
                // i is inside Z-box, use previous information
                z[i] = min(r - i + 1, z[i - l]);
            }
            
            // Try to extend the match
            while (i + z[i] < n && s[z[i]] == s[i + z[i]]) {
                z[i]++;
            }
            
            // Update Z-box
            if (i + z[i] - 1 > r) {
                l = i;
                r = i + z[i] - 1;
            }
        }
        
        return z;
    }
    
    // String matching using Z algorithm
    static vector<int> stringMatch(const string& text, const string& pattern) {
        string combined = pattern + "#" + text; // separate with special character
        vector<int> z = computeZ(combined);
        
        vector<int> matches;
        int patternLen = pattern.length();
        
        for (int i = patternLen + 1; i < combined.length(); i++) {
            if (z[i] == patternLen) {
                matches.push_back(i - patternLen - 1);
            }
        }
        
        return matches;
    }
    
    // Find all palindromic substrings
    static vector<pair<int, int>> findPalindromes(const string& s) {
        vector<pair<int, int>> palindromes;
        int n = s.length();
        
        // Check odd-length palindromes
        for (int center = 0; center < n; center++) {
            string left = s.substr(0, center + 1);
            reverse(left.begin(), left.end());
            string right = s.substr(center);
            
            string combined = left + "#" + right;
            vector<int> z = computeZ(combined);
            
            for (int i = left.length() + 1; i < combined.length(); i++) {
                int len = z[i];
                if (len > 0 && center - len + 1 >= 0 && center + len - 1 < n) {
                    palindromes.push_back({center - len + 1, center + len - 1});
                }
            }
        }
        
        return palindromes;
    }
};

int main() {
    string text = "AABAACAADAABAABA";
    string pattern = "AABA";
    
    cout << "Text: " << text << endl;
    cout << "Pattern: " << pattern << endl;
    
    // Compute Z array for pattern
    vector<int> z = ZAlgorithm::computeZ(pattern);
    cout << "\nZ array for pattern: ";
    for (int x : z) cout << x << " ";
    cout << endl;
    
    // Find matches
    vector<int> matches = ZAlgorithm::stringMatch(text, pattern);
    cout << "\nMatch positions: ";
    for (int pos : matches) {
        cout << pos << " ";
    }
    cout << endl;
    
    // Show matched substrings
    cout << "\nMatched substrings:" << endl;
    for (int pos : matches) {
        cout << "Position " << pos << ": " << text.substr(pos, pattern.length()) << endl;
    }
    
    return 0;
}

The Z algorithm maintains a Z-box [l, r] representing the rightmost segment that matches a prefix. This allows efficient computation of Z values using previously computed information.

Key Takeaways

  • KMP achieves O(n+m) time complexity by preprocessing the pattern to build a failure function
  • The failure function Ο€[i] stores the length of the longest proper prefix that is also a suffix
  • Z algorithm computes longest common prefix with the entire string for each position
  • Both algorithms are fundamental for advanced string processing and pattern matching problems