KMP字符串匹配算法

KMP(Knuth-Morris-Pratt)算法是字符串匹配的经典算法,通过预处理模式串构建失配函数,实现O(n+m)时间复杂度的字符串匹配,避免重复比较。

KMP算法基础实现

KMP算法的核心是利用已匹配的信息,避免重复比较。关键在于构建失配函数(π函数),记录模式串的最长相等前后缀。

KMP算法基础实现

#include <iostream>
#include <vector>
#include <string>
using namespace std;

class KMP {
private:
    string pattern;
    vector<int> pi; // 失配函数
    
public:
    KMP(const string& p) : pattern(p) {
        computePi();
    }
    
    // 构建失配函数
    void computePi() {
        int m = pattern.length();
        pi.assign(m, 0);
        
        for (int i = 1; i < m; i++) {
            int j = pi[i - 1];
            
            // 寻找最长相等前后缀
            while (j > 0 && pattern[i] != pattern[j]) {
                j = pi[j - 1];
            }
            
            if (pattern[i] == pattern[j]) {
                j++;
            }
            
            pi[i] = j;
        }
    }
    
    // 在文本中查找所有匹配位置
    vector<int> findAll(const string& text) {
        vector<int> matches;
        int n = text.length();
        int m = pattern.length();
        
        if (m == 0) return matches;
        
        int j = 0; // pattern的索引
        
        for (int i = 0; i < n; i++) {
            // 利用失配函数跳过不必要的比较
            while (j > 0 && text[i] != pattern[j]) {
                j = pi[j - 1];
            }
            
            if (text[i] == pattern[j]) {
                j++;
            }
            
            // 找到完整匹配
            if (j == m) {
                matches.push_back(i - m + 1);
                j = pi[j - 1]; // 继续寻找下一个匹配
            }
        }
        
        return matches;
    }
    
    // 获取失配函数数组
    vector<int> getPi() const {
        return pi;
    }
    
    // 显示失配函数的构建过程
    void showPiConstruction() {
        cout << "模式串: " << pattern << endl;
        cout << "索引:   ";
        for (int i = 0; i < pattern.length(); i++) {
            cout << i << " ";
        }
        cout << endl;
        cout << "π函数:  ";
        for (int x : pi) {
            cout << x << " ";
        }
        cout << endl;
    }
};

int main() {
    string text = "ABABDABACDABABCABCABCABABABCAC";
    string pattern = "ABABCAC";
    
    cout << "文本: " << text << endl;
    cout << "模式: " << pattern << endl << endl;
    
    KMP kmp(pattern);
    kmp.showPiConstruction();
    
    vector<int> matches = kmp.findAll(text);
    
    cout << "\n匹配位置: ";
    for (int pos : matches) {
        cout << pos << " ";
    }
    cout << endl;
    
    // 显示匹配的子串
    cout << "\n匹配详情:" << endl;
    for (int pos : matches) {
        cout << "位置 " << pos << ": " << text.substr(pos, pattern.length()) << endl;
    }
    
    return 0;
}

KMP算法通过π函数记录模式串中每个位置的最长相等前后缀长度,在匹配失败时能够跳过已知不匹配的部分。

Z算法 - 扩展KMP

Z算法计算字符串每个位置与整个字符串的最长公共前缀,是KMP的一个重要扩展。

Z算法实现

#include <iostream>
#include <vector>
#include <string>
#include <algorithm>
using namespace std;

class ZAlgorithm {
public:
    // 计算Z数组
    static vector<int> computeZ(const string& s) {
        int n = s.length();
        vector<int> z(n);
        z[0] = n; // z[0]定义为字符串长度
        
        int l = 0, r = 0; // Z-box的左右边界
        
        for (int i = 1; i < n; i++) {
            if (i <= r) {
                // i在Z-box内,可以利用之前的信息
                z[i] = min(r - i + 1, z[i - l]);
            }
            
            // 尝试扩展匹配
            while (i + z[i] < n && s[z[i]] == s[i + z[i]]) {
                z[i]++;
            }
            
            // 更新Z-box
            if (i + z[i] - 1 > r) {
                l = i;
                r = i + z[i] - 1;
            }
        }
        
        return z;
    }
    
    // 使用Z算法进行字符串匹配
    static vector<int> stringMatch(const string& text, const string& pattern) {
        string combined = pattern + "#" + text; // 用特殊字符分隔
        vector<int> z = computeZ(combined);
        
        vector<int> matches;
        int patternLen = pattern.length();
        
        for (int i = patternLen + 1; i < combined.length(); i++) {
            if (z[i] == patternLen) {
                matches.push_back(i - patternLen - 1);
            }
        }
        
        return matches;
    }
    
    // 找所有回文子串
    static vector<pair<int, int>> findPalindromes(const string& s) {
        vector<pair<int, int>> palindromes;
        int n = s.length();
        
        // 检查奇数长度回文
        for (int center = 0; center < n; center++) {
            string left = s.substr(0, center + 1);
            reverse(left.begin(), left.end());
            string right = s.substr(center);
            
            string combined = left + "#" + right;
            vector<int> z = computeZ(combined);
            
            for (int i = left.length() + 1; i < combined.length(); i++) {
                int len = z[i];
                if (len > 0 && center - len + 1 >= 0 && center + len - 1 < n) {
                    palindromes.push_back({center - len + 1, center + len - 1});
                }
            }
        }
        
        return palindromes;
    }
};

int main() {
    string text = "AABAACAADAABAABA";
    string pattern = "AABA";
    
    cout << "文本: " << text << endl;
    cout << "模式: " << pattern << endl;
    
    // 计算模式串的Z数组
    vector<int> z = ZAlgorithm::computeZ(pattern);
    cout << "\n模式串的Z数组: ";
    for (int x : z) cout << x << " ";
    cout << endl;
    
    // 查找匹配
    vector<int> matches = ZAlgorithm::stringMatch(text, pattern);
    cout << "\n匹配位置: ";
    for (int pos : matches) {
        cout << pos << " ";
    }
    cout << endl;
    
    // 显示匹配的子串
    cout << "\n匹配的子串:" << endl;
    for (int pos : matches) {
        cout << "位置 " << pos << ": " << text.substr(pos, pattern.length()) << endl;
    }
    
    return 0;
}

Z算法维护一个Z-box [l, r]表示最右边与前缀匹配的段。这允许使用之前计算的信息高效计算Z值。

关键要点

  • KMP通过预处理模式串构建失配函数实现O(n+m)时间复杂度
  • 失配函数π[i]存储最长相等前后缀的长度
  • Z算法计算每个位置与整个字符串的最长公共前缀
  • 两种算法都是高级字符串处理和模式匹配问题的基础