KMP字符串匹配算法
KMP(Knuth-Morris-Pratt)算法是字符串匹配的经典算法,通过预处理模式串构建失配函数,实现O(n+m)时间复杂度的字符串匹配,避免重复比较。
KMP算法基础实现
KMP算法的核心是利用已匹配的信息,避免重复比较。关键在于构建失配函数(π函数),记录模式串的最长相等前后缀。
KMP算法基础实现
#include <iostream>
#include <vector>
#include <string>
using namespace std;
class KMP {
private:
string pattern;
vector<int> pi; // 失配函数
public:
KMP(const string& p) : pattern(p) {
computePi();
}
// 构建失配函数
void computePi() {
int m = pattern.length();
pi.assign(m, 0);
for (int i = 1; i < m; i++) {
int j = pi[i - 1];
// 寻找最长相等前后缀
while (j > 0 && pattern[i] != pattern[j]) {
j = pi[j - 1];
}
if (pattern[i] == pattern[j]) {
j++;
}
pi[i] = j;
}
}
// 在文本中查找所有匹配位置
vector<int> findAll(const string& text) {
vector<int> matches;
int n = text.length();
int m = pattern.length();
if (m == 0) return matches;
int j = 0; // pattern的索引
for (int i = 0; i < n; i++) {
// 利用失配函数跳过不必要的比较
while (j > 0 && text[i] != pattern[j]) {
j = pi[j - 1];
}
if (text[i] == pattern[j]) {
j++;
}
// 找到完整匹配
if (j == m) {
matches.push_back(i - m + 1);
j = pi[j - 1]; // 继续寻找下一个匹配
}
}
return matches;
}
// 获取失配函数数组
vector<int> getPi() const {
return pi;
}
// 显示失配函数的构建过程
void showPiConstruction() {
cout << "模式串: " << pattern << endl;
cout << "索引: ";
for (int i = 0; i < pattern.length(); i++) {
cout << i << " ";
}
cout << endl;
cout << "π函数: ";
for (int x : pi) {
cout << x << " ";
}
cout << endl;
}
};
int main() {
string text = "ABABDABACDABABCABCABCABABABCAC";
string pattern = "ABABCAC";
cout << "文本: " << text << endl;
cout << "模式: " << pattern << endl << endl;
KMP kmp(pattern);
kmp.showPiConstruction();
vector<int> matches = kmp.findAll(text);
cout << "\n匹配位置: ";
for (int pos : matches) {
cout << pos << " ";
}
cout << endl;
// 显示匹配的子串
cout << "\n匹配详情:" << endl;
for (int pos : matches) {
cout << "位置 " << pos << ": " << text.substr(pos, pattern.length()) << endl;
}
return 0;
}KMP算法通过π函数记录模式串中每个位置的最长相等前后缀长度,在匹配失败时能够跳过已知不匹配的部分。
Z算法 - 扩展KMP
Z算法计算字符串每个位置与整个字符串的最长公共前缀,是KMP的一个重要扩展。
Z算法实现
#include <iostream>
#include <vector>
#include <string>
#include <algorithm>
using namespace std;
class ZAlgorithm {
public:
// 计算Z数组
static vector<int> computeZ(const string& s) {
int n = s.length();
vector<int> z(n);
z[0] = n; // z[0]定义为字符串长度
int l = 0, r = 0; // Z-box的左右边界
for (int i = 1; i < n; i++) {
if (i <= r) {
// i在Z-box内,可以利用之前的信息
z[i] = min(r - i + 1, z[i - l]);
}
// 尝试扩展匹配
while (i + z[i] < n && s[z[i]] == s[i + z[i]]) {
z[i]++;
}
// 更新Z-box
if (i + z[i] - 1 > r) {
l = i;
r = i + z[i] - 1;
}
}
return z;
}
// 使用Z算法进行字符串匹配
static vector<int> stringMatch(const string& text, const string& pattern) {
string combined = pattern + "#" + text; // 用特殊字符分隔
vector<int> z = computeZ(combined);
vector<int> matches;
int patternLen = pattern.length();
for (int i = patternLen + 1; i < combined.length(); i++) {
if (z[i] == patternLen) {
matches.push_back(i - patternLen - 1);
}
}
return matches;
}
// 找所有回文子串
static vector<pair<int, int>> findPalindromes(const string& s) {
vector<pair<int, int>> palindromes;
int n = s.length();
// 检查奇数长度回文
for (int center = 0; center < n; center++) {
string left = s.substr(0, center + 1);
reverse(left.begin(), left.end());
string right = s.substr(center);
string combined = left + "#" + right;
vector<int> z = computeZ(combined);
for (int i = left.length() + 1; i < combined.length(); i++) {
int len = z[i];
if (len > 0 && center - len + 1 >= 0 && center + len - 1 < n) {
palindromes.push_back({center - len + 1, center + len - 1});
}
}
}
return palindromes;
}
};
int main() {
string text = "AABAACAADAABAABA";
string pattern = "AABA";
cout << "文本: " << text << endl;
cout << "模式: " << pattern << endl;
// 计算模式串的Z数组
vector<int> z = ZAlgorithm::computeZ(pattern);
cout << "\n模式串的Z数组: ";
for (int x : z) cout << x << " ";
cout << endl;
// 查找匹配
vector<int> matches = ZAlgorithm::stringMatch(text, pattern);
cout << "\n匹配位置: ";
for (int pos : matches) {
cout << pos << " ";
}
cout << endl;
// 显示匹配的子串
cout << "\n匹配的子串:" << endl;
for (int pos : matches) {
cout << "位置 " << pos << ": " << text.substr(pos, pattern.length()) << endl;
}
return 0;
}Z算法维护一个Z-box [l, r]表示最右边与前缀匹配的段。这允许使用之前计算的信息高效计算Z值。
关键要点
- KMP通过预处理模式串构建失配函数实现O(n+m)时间复杂度
- 失配函数π[i]存储最长相等前后缀的长度
- Z算法计算每个位置与整个字符串的最长公共前缀
- 两种算法都是高级字符串处理和模式匹配问题的基础