字符串算法

字符串算法简介

字符串算法是文本处理、模式匹配和数据压缩的基础。它们在搜索引擎、生物信息学、文本编辑器和许多其他高效处理文本数据的应用中必不可少。

模式匹配:朴素和优化方法

模式匹配在文本中查找模式的出现。朴素方法的复杂度为O(nm),而像KMP这样的优化算法实现O(n+m)复杂度。

朴素 vs 优化模式匹配

#include <iostream>
#include <vector>
#include <string>
#include <chrono>
using namespace std;
using namespace std::chrono;

class PatternMatching {
public:
    // Naive pattern matching - O(nm)
    vector<int> naiveSearch(const string& text, const string& pattern) {
        vector<int> matches;
        int n = text.length();
        int m = pattern.length();
        
        cout << "Naive Pattern Matching:" << endl;
        cout << "Text: " << text << endl;
        cout << "Pattern: " << pattern << endl;
        cout << "Checking positions:" << endl;
        
        for (int i = 0; i <= n - m; i++) {
            int j;
            cout << "Position " << i << ": ";
            
            for (j = 0; j < m; j++) {
                if (text[i + j] != pattern[j]) {
                    cout << "Mismatch at " << (i + j) << endl;
                    break;
                }
            }
            
            if (j == m) {
                matches.push_back(i);
                cout << "Match found!" << endl;
            }
        }
        
        return matches;
    }
    
    // KMP Algorithm - O(n + m)
    vector<int> KMPSearch(const string& text, const string& pattern) {
        vector<int> matches;
        vector<int> lps = computeLPSArray(pattern);
        
        cout << "\nKMP Pattern Matching:" << endl;
        cout << "LPS Array: ";
        for (int x : lps) cout << x << " ";
        cout << endl;
        
        int n = text.length();
        int m = pattern.length();
        int i = 0; // index for text
        int j = 0; // index for pattern
        
        while (i < n) {
            if (pattern[j] == text[i]) {
                i++;
                j++;
            }
            
            if (j == m) {
                matches.push_back(i - j);
                cout << "Match found at position " << (i - j) << endl;
                j = lps[j - 1];
            } else if (i < n && pattern[j] != text[i]) {
                if (j != 0) {
                    j = lps[j - 1];
                } else {
                    i++;
                }
            }
        }
        
        return matches;
    }
    
private:
    // Compute Longest Proper Prefix which is also Suffix array
    vector<int> computeLPSArray(const string& pattern) {
        int m = pattern.length();
        vector<int> lps(m, 0);
        int len = 0; // length of previous longest prefix suffix
        int i = 1;
        
        while (i < m) {
            if (pattern[i] == pattern[len]) {
                len++;
                lps[i] = len;
                i++;
            } else {
                if (len != 0) {
                    len = lps[len - 1];
                } else {
                    lps[i] = 0;
                    i++;
                }
            }
        }
        
        return lps;
    }
    
public:
    void comparePerformance(const string& text, const string& pattern) {
        cout << "\n=== Performance Comparison ===" << endl;
        
        // Test naive approach
        auto start = high_resolution_clock::now();
        vector<int> naiveResult = naiveSearch(text, pattern);
        auto end = high_resolution_clock::now();
        auto naiveDuration = duration_cast<microseconds>(end - start);
        
        // Test KMP approach
        start = high_resolution_clock::now();
        vector<int> kmpResult = KMPSearch(text, pattern);
        end = high_resolution_clock::now();
        auto kmpDuration = duration_cast<microseconds>(end - start);
        
        cout << "\nResults:" << endl;
        cout << "Naive: " << naiveResult.size() << " matches in " << naiveDuration.count() << " μs" << endl;
        cout << "KMP: " << kmpResult.size() << " matches in " << kmpDuration.count() << " μs" << endl;
    }
};

int main() {
    PatternMatching pm;
    string text = "ABABDABACDABABCABCABCABCAB";
    string pattern = "ABABCAB";
    
    vector<int> matches = pm.naiveSearch(text, pattern);
    cout << "\nNaive matches at positions: ";
    for (int pos : matches) cout << pos << " ";
    cout << endl;
    
    matches = pm.KMPSearch(text, pattern);
    cout << "KMP matches at positions: ";
    for (int pos : matches) cout << pos << " ";
    cout << endl;
    
    return 0;
}

KMP使用LPS数组在不匹配时避免冗余字符比较。

字符串哈希和滚动哈希

字符串哈希将字符串转换为数值以进行快速比较。滚动哈希有效地更新滑动窗口的哈希值,在模式匹配和子字符串搜索中很有用。

滚动哈希实现

#include <iostream>
#include <string>
#include <vector>
#include <cmath>
using namespace std;

class RollingHash {
private:
    static const int BASE = 31;
    static const int MOD = 1e9 + 7;
    
    long long computePower(int exp) {
        long long result = 1;
        long long base = BASE;
        while (exp > 0) {
            if (exp % 2 == 1) {
                result = (result * base) % MOD;
            }
            base = (base * base) % MOD;
            exp /= 2;
        }
        return result;
    }
    
public:
    // Compute hash of a string
    long long computeHash(const string& s) {
        long long hash = 0;
        long long power = 1;
        
        for (char c : s) {
            hash = (hash + (c - 'a' + 1) * power) % MOD;
            power = (power * BASE) % MOD;
        }
        
        return hash;
    }
    
    // Rabin-Karp algorithm using rolling hash
    vector<int> rabinKarpSearch(const string& text, const string& pattern) {
        vector<int> matches;
        int n = text.length();
        int m = pattern.length();
        
        if (m > n) return matches;
        
        long long patternHash = computeHash(pattern);
        long long windowHash = 0;
        long long power = 1;
        
        // Compute hash of first window
        for (int i = 0; i < m; i++) {
            windowHash = (windowHash + (text[i] - 'a' + 1) * power) % MOD;
            if (i < m - 1) power = (power * BASE) % MOD;
        }
        
        cout << "Rabin-Karp Search:" << endl;
        cout << "Pattern: " << pattern << " (hash: " << patternHash << ")" << endl;
        cout << "Text: " << text << endl;
        cout << "Sliding window hashes:" << endl;
        
        // Check first window
        cout << "Window 0-" << (m-1) << ": " << text.substr(0, m) 
             << " (hash: " << windowHash << ")";
        if (windowHash == patternHash) {
            if (text.substr(0, m) == pattern) {
                matches.push_back(0);
                cout << " -> Match!";
            }
        }
        cout << endl;
        
        // Rolling hash for remaining windows
        for (int i = m; i < n; i++) {
            // Remove first character of previous window
            windowHash = (windowHash - (text[i - m] - 'a' + 1) + MOD) % MOD;
            windowHash = (windowHash * computePower(MOD - 2)) % MOD; // Multiply by inverse
            
            // Add new character
            windowHash = (windowHash + (text[i] - 'a' + 1) * power) % MOD;
            
            cout << "Window " << (i-m+1) << "-" << i << ": " 
                 << text.substr(i - m + 1, m) << " (hash: " << windowHash << ")";
            
            if (windowHash == patternHash) {
                if (text.substr(i - m + 1, m) == pattern) {
                    matches.push_back(i - m + 1);
                    cout << " -> Match!";
                }
            }
            cout << endl;
        }
        
        return matches;
    }
    
    // Find longest common substring using rolling hash
    string longestCommonSubstring(const string& s1, const string& s2) {
        int maxLen = 0;
        string result = "";
        
        cout << "\nFinding Longest Common Substring:" << endl;
        cout << "String 1: " << s1 << endl;
        cout << "String 2: " << s2 << endl;
        
        // Try all possible lengths
        for (int len = 1; len <= min(s1.length(), s2.length()); len++) {
            vector<long long> hashes1, hashes2;
            
            // Generate all substrings of length 'len' from s1
            for (int i = 0; i <= (int)s1.length() - len; i++) {
                hashes1.push_back(computeHash(s1.substr(i, len)));
            }
            
            // Check substrings of length 'len' from s2
            for (int i = 0; i <= (int)s2.length() - len; i++) {
                long long hash2 = computeHash(s2.substr(i, len));
                
                for (int j = 0; j < hashes1.size(); j++) {
                    if (hashes1[j] == hash2) {
                        string sub1 = s1.substr(j, len);
                        string sub2 = s2.substr(i, len);
                        if (sub1 == sub2 && len > maxLen) {
                            maxLen = len;
                            result = sub1;
                        }
                    }
                }
            }
        }
        
        return result;
    }
    
    // Check if two strings are anagrams using hash
    bool areAnagrams(const string& s1, const string& s2) {
        if (s1.length() != s2.length()) return false;
        
        // Simple hash: sum of character values
        long long hash1 = 0, hash2 = 0;
        for (char c : s1) hash1 += c;
        for (char c : s2) hash2 += c;
        
        cout << "\nAnagram check:" << endl;
        cout << """ << s1 << "" hash: " << hash1 << endl;
        cout << """ << s2 << "" hash: " << hash2 << endl;
        
        return hash1 == hash2;
    }
};

int main() {
    RollingHash rh;
    
    // Test Rabin-Karp
    string text = "abababcababa";
    string pattern = "abab";
    
    vector<int> matches = rh.rabinKarpSearch(text, pattern);
    cout << "\nMatches found at positions: ";
    for (int pos : matches) {
        cout << pos << " ";
    }
    cout << endl;
    
    // Test longest common substring
    string s1 = "abcdxyz";
    string s2 = "xyzabcd";
    string lcs = rh.longestCommonSubstring(s1, s2);
    cout << "Longest common substring: "" << lcs << """ << endl;
    
    // Test anagram detection
    bool isAnagram = rh.areAnagrams("listen", "silent");
    cout << "Are anagrams: " << (isAnagram ? "Yes" : "No") << endl;
    
    return 0;
}

滚动哈希通过有效地移除旧字符和添加新字符来实现O(1)哈希更新。

最长回文子串

寻找最长回文子串是一个经典的字符串问题。我们将探讨从中心扩展方法和Manacher算法。

最长回文子串

#include <iostream>
#include <string>
#include <vector>
#include <algorithm>
using namespace std;

class PalindromeAlgorithms {
public:
    // Expand around centers approach - O(n²)
    string longestPalindromeExpand(const string& s) {
        if (s.empty()) return "";
        
        int start = 0, maxLen = 1;
        
        cout << "Expand Around Centers Approach:" << endl;
        cout << "String: " << s << endl;
        cout << "Checking centers:" << endl;
        
        for (int i = 0; i < s.length(); i++) {
            // Odd length palindromes (center at i)
            int len1 = expandAroundCenter(s, i, i);
            cout << "Center " << i << " (odd): length " << len1;
            if (len1 > 1) {
                int start_pos = i - len1/2;
                cout << " -> "" << s.substr(start_pos, len1) << """;
            }
            cout << endl;
            
            // Even length palindromes (center between i and i+1)
            int len2 = expandAroundCenter(s, i, i + 1);
            if (len2 > 0) {
                cout << "Center " << i << "-" << (i+1) << " (even): length " << len2;
                int start_pos = i - len2/2 + 1;
                cout << " -> "" << s.substr(start_pos, len2) << """ << endl;
            }
            
            int maxCurrent = max(len1, len2);
            if (maxCurrent > maxLen) {
                maxLen = maxCurrent;
                start = i - (maxCurrent - 1) / 2;
            }
        }
        
        return s.substr(start, maxLen);
    }
    
private:
    int expandAroundCenter(const string& s, int left, int right) {
        while (left >= 0 && right < s.length() && s[left] == s[right]) {
            left--;
            right++;
        }
        return right - left - 1;
    }
    
public:
    // Manacher's Algorithm - O(n)
    string longestPalindromeManacher(const string& s) {
        if (s.empty()) return "";
        
        // Preprocess string: "abc" -> "^#a#b#c#$"
        string processed = "^#";
        for (char c : s) {
            processed += c;
            processed += '#';
        }
        processed += '$';
        
        cout << "\nManacher's Algorithm:" << endl;
        cout << "Original: " << s << endl;
        cout << "Processed: " << processed << endl;
        
        int n = processed.length();
        vector<int> P(n, 0); // P[i] = radius of palindrome centered at i
        int center = 0, right = 0;
        
        cout << "\nBuilding P array:" << endl;
        cout << "i  | char | P[i] | center | right | palindrome" << endl;
        cout << "---|------|------|--------|-------|----------" << endl;
        
        for (int i = 1; i < n - 1; i++) {
            int mirror = 2 * center - i;
            
            if (i < right) {
                P[i] = min(right - i, P[mirror]);
            }
            
            // Expand around center i
            while (processed[i + P[i] + 1] == processed[i - P[i] - 1]) {
                P[i]++;
            }
            
            // Update center and right boundary if expanded past right
            if (i + P[i] > right) {
                center = i;
                right = i + P[i];
            }
            
            cout << i << "  |  " << processed[i] << "   |  " << P[i] 
                 << "   |   " << center << "    |   " << right << "   | ";
            
            if (P[i] > 0) {
                int start_pos = (i - P[i]) / 2;
                int len = P[i];
                cout << s.substr(start_pos, len);
            }
            cout << endl;
        }
        
        // Find the longest palindrome
        int maxLen = 0, centerIndex = 0;
        for (int i = 1; i < n - 1; i++) {
            if (P[i] > maxLen) {
                maxLen = P[i];
                centerIndex = i;
            }
        }
        
        int start = (centerIndex - maxLen) / 2;
        return s.substr(start, maxLen);
    }
    
    // Check if entire string is palindrome
    bool isPalindrome(const string& s) {
        int left = 0, right = s.length() - 1;
        while (left < right) {
            if (s[left] != s[right]) return false;
            left++;
            right--;
        }
        return true;
    }
    
    // Count all palindromic substrings
    int countPalindromes(const string& s) {
        int count = 0;
        
        for (int i = 0; i < s.length(); i++) {
            // Odd length palindromes
            count += countPalindromesExpand(s, i, i);
            // Even length palindromes
            count += countPalindromesExpand(s, i, i + 1);
        }
        
        return count;
    }
    
private:
    int countPalindromesExpand(const string& s, int left, int right) {
        int count = 0;
        while (left >= 0 && right < s.length() && s[left] == s[right]) {
            count++;
            left--;
            right++;
        }
        return count;
    }
};

int main() {
    PalindromeAlgorithms pa;
    string s = "babad";
    
    string result1 = pa.longestPalindromeExpand(s);
    cout << "\nLongest palindrome (Expand): "" << result1 << """ << endl;
    
    string result2 = pa.longestPalindromeManacher(s);
    cout << "\nLongest palindrome (Manacher): "" << result2 << """ << endl;
    
    // Test palindrome checking
    cout << "\nPalindrome tests:" << endl;
    cout << ""racecar" is palindrome: " << pa.isPalindrome("racecar") << endl;
    cout << ""hello" is palindrome: " << pa.isPalindrome("hello") << endl;
    
    // Count all palindromes
    cout << "\nTotal palindromic substrings in "" << s << "": " 
         << pa.countPalindromes(s) << endl;
    
    return 0;
}

Manacher算法预处理字符串并使用先前计算的信息来避免冗余检查。