String Algorithms
Introduction to String Algorithms
String algorithms are fundamental for text processing, pattern matching, and data compression. They are essential in search engines, bioinformatics, text editors, and many other applications that process textual data efficiently.
Pattern Matching: Naive and Optimized
Pattern matching finds occurrences of a pattern within a text. The naive approach has O(nm) complexity, while optimized algorithms like KMP achieve O(n+m).
Naive vs Optimized Pattern Matching
#include <iostream>
#include <vector>
#include <string>
#include <chrono>
using namespace std;
using namespace std::chrono;
class PatternMatching {
public:
// Naive pattern matching - O(nm)
vector<int> naiveSearch(const string& text, const string& pattern) {
vector<int> matches;
int n = text.length();
int m = pattern.length();
cout << "Naive Pattern Matching:" << endl;
cout << "Text: " << text << endl;
cout << "Pattern: " << pattern << endl;
cout << "Checking positions:" << endl;
for (int i = 0; i <= n - m; i++) {
int j;
cout << "Position " << i << ": ";
for (j = 0; j < m; j++) {
if (text[i + j] != pattern[j]) {
cout << "Mismatch at " << (i + j) << endl;
break;
}
}
if (j == m) {
matches.push_back(i);
cout << "Match found!" << endl;
}
}
return matches;
}
// KMP Algorithm - O(n + m)
vector<int> KMPSearch(const string& text, const string& pattern) {
vector<int> matches;
vector<int> lps = computeLPSArray(pattern);
cout << "\nKMP Pattern Matching:" << endl;
cout << "LPS Array: ";
for (int x : lps) cout << x << " ";
cout << endl;
int n = text.length();
int m = pattern.length();
int i = 0; // index for text
int j = 0; // index for pattern
while (i < n) {
if (pattern[j] == text[i]) {
i++;
j++;
}
if (j == m) {
matches.push_back(i - j);
cout << "Match found at position " << (i - j) << endl;
j = lps[j - 1];
} else if (i < n && pattern[j] != text[i]) {
if (j != 0) {
j = lps[j - 1];
} else {
i++;
}
}
}
return matches;
}
private:
// Compute Longest Proper Prefix which is also Suffix array
vector<int> computeLPSArray(const string& pattern) {
int m = pattern.length();
vector<int> lps(m, 0);
int len = 0; // length of previous longest prefix suffix
int i = 1;
while (i < m) {
if (pattern[i] == pattern[len]) {
len++;
lps[i] = len;
i++;
} else {
if (len != 0) {
len = lps[len - 1];
} else {
lps[i] = 0;
i++;
}
}
}
return lps;
}
public:
void comparePerformance(const string& text, const string& pattern) {
cout << "\n=== Performance Comparison ===" << endl;
// Test naive approach
auto start = high_resolution_clock::now();
vector<int> naiveResult = naiveSearch(text, pattern);
auto end = high_resolution_clock::now();
auto naiveDuration = duration_cast<microseconds>(end - start);
// Test KMP approach
start = high_resolution_clock::now();
vector<int> kmpResult = KMPSearch(text, pattern);
end = high_resolution_clock::now();
auto kmpDuration = duration_cast<microseconds>(end - start);
cout << "\nResults:" << endl;
cout << "Naive: " << naiveResult.size() << " matches in " << naiveDuration.count() << " μs" << endl;
cout << "KMP: " << kmpResult.size() << " matches in " << kmpDuration.count() << " μs" << endl;
}
};
int main() {
PatternMatching pm;
string text = "ABABDABACDABABCABCABCABCAB";
string pattern = "ABABCAB";
vector<int> matches = pm.naiveSearch(text, pattern);
cout << "\nNaive matches at positions: ";
for (int pos : matches) cout << pos << " ";
cout << endl;
matches = pm.KMPSearch(text, pattern);
cout << "KMP matches at positions: ";
for (int pos : matches) cout << pos << " ";
cout << endl;
return 0;
}KMP uses the LPS array to avoid redundant character comparisons when a mismatch occurs.
String Hashing and Rolling Hash
String hashing converts strings to numerical values for fast comparison. Rolling hash efficiently updates hash values for sliding windows, useful in pattern matching and substring search.
Rolling Hash Implementation
#include <iostream>
#include <string>
#include <vector>
#include <cmath>
using namespace std;
class RollingHash {
private:
static const int BASE = 31;
static const int MOD = 1e9 + 7;
long long computePower(int exp) {
long long result = 1;
long long base = BASE;
while (exp > 0) {
if (exp % 2 == 1) {
result = (result * base) % MOD;
}
base = (base * base) % MOD;
exp /= 2;
}
return result;
}
public:
// Compute hash of a string
long long computeHash(const string& s) {
long long hash = 0;
long long power = 1;
for (char c : s) {
hash = (hash + (c - 'a' + 1) * power) % MOD;
power = (power * BASE) % MOD;
}
return hash;
}
// Rabin-Karp algorithm using rolling hash
vector<int> rabinKarpSearch(const string& text, const string& pattern) {
vector<int> matches;
int n = text.length();
int m = pattern.length();
if (m > n) return matches;
long long patternHash = computeHash(pattern);
long long windowHash = 0;
long long power = 1;
// Compute hash of first window
for (int i = 0; i < m; i++) {
windowHash = (windowHash + (text[i] - 'a' + 1) * power) % MOD;
if (i < m - 1) power = (power * BASE) % MOD;
}
cout << "Rabin-Karp Search:" << endl;
cout << "Pattern: " << pattern << " (hash: " << patternHash << ")" << endl;
cout << "Text: " << text << endl;
cout << "Sliding window hashes:" << endl;
// Check first window
cout << "Window 0-" << (m-1) << ": " << text.substr(0, m)
<< " (hash: " << windowHash << ")";
if (windowHash == patternHash) {
if (text.substr(0, m) == pattern) {
matches.push_back(0);
cout << " -> Match!";
}
}
cout << endl;
// Rolling hash for remaining windows
for (int i = m; i < n; i++) {
// Remove first character of previous window
windowHash = (windowHash - (text[i - m] - 'a' + 1) + MOD) % MOD;
windowHash = (windowHash * computePower(MOD - 2)) % MOD; // Multiply by inverse
// Add new character
windowHash = (windowHash + (text[i] - 'a' + 1) * power) % MOD;
cout << "Window " << (i-m+1) << "-" << i << ": "
<< text.substr(i - m + 1, m) << " (hash: " << windowHash << ")";
if (windowHash == patternHash) {
if (text.substr(i - m + 1, m) == pattern) {
matches.push_back(i - m + 1);
cout << " -> Match!";
}
}
cout << endl;
}
return matches;
}
// Find longest common substring using rolling hash
string longestCommonSubstring(const string& s1, const string& s2) {
int maxLen = 0;
string result = "";
cout << "\nFinding Longest Common Substring:" << endl;
cout << "String 1: " << s1 << endl;
cout << "String 2: " << s2 << endl;
// Try all possible lengths
for (int len = 1; len <= min(s1.length(), s2.length()); len++) {
vector<long long> hashes1, hashes2;
// Generate all substrings of length 'len' from s1
for (int i = 0; i <= (int)s1.length() - len; i++) {
hashes1.push_back(computeHash(s1.substr(i, len)));
}
// Check substrings of length 'len' from s2
for (int i = 0; i <= (int)s2.length() - len; i++) {
long long hash2 = computeHash(s2.substr(i, len));
for (int j = 0; j < hashes1.size(); j++) {
if (hashes1[j] == hash2) {
string sub1 = s1.substr(j, len);
string sub2 = s2.substr(i, len);
if (sub1 == sub2 && len > maxLen) {
maxLen = len;
result = sub1;
}
}
}
}
}
return result;
}
// Check if two strings are anagrams using hash
bool areAnagrams(const string& s1, const string& s2) {
if (s1.length() != s2.length()) return false;
// Simple hash: sum of character values
long long hash1 = 0, hash2 = 0;
for (char c : s1) hash1 += c;
for (char c : s2) hash2 += c;
cout << "\nAnagram check:" << endl;
cout << """ << s1 << "" hash: " << hash1 << endl;
cout << """ << s2 << "" hash: " << hash2 << endl;
return hash1 == hash2;
}
};
int main() {
RollingHash rh;
// Test Rabin-Karp
string text = "abababcababa";
string pattern = "abab";
vector<int> matches = rh.rabinKarpSearch(text, pattern);
cout << "\nMatches found at positions: ";
for (int pos : matches) {
cout << pos << " ";
}
cout << endl;
// Test longest common substring
string s1 = "abcdxyz";
string s2 = "xyzabcd";
string lcs = rh.longestCommonSubstring(s1, s2);
cout << "Longest common substring: "" << lcs << """ << endl;
// Test anagram detection
bool isAnagram = rh.areAnagrams("listen", "silent");
cout << "Are anagrams: " << (isAnagram ? "Yes" : "No") << endl;
return 0;
}Rolling hash enables O(1) hash updates by efficiently removing old characters and adding new ones.
Longest Palindromic Substring
Finding the longest palindromic substring is a classic string problem. We'll explore both the expand-around-centers approach and Manacher's algorithm.
Longest Palindromic Substring
#include <iostream>
#include <string>
#include <vector>
#include <algorithm>
using namespace std;
class PalindromeAlgorithms {
public:
// Expand around centers approach - O(n²)
string longestPalindromeExpand(const string& s) {
if (s.empty()) return "";
int start = 0, maxLen = 1;
cout << "Expand Around Centers Approach:" << endl;
cout << "String: " << s << endl;
cout << "Checking centers:" << endl;
for (int i = 0; i < s.length(); i++) {
// Odd length palindromes (center at i)
int len1 = expandAroundCenter(s, i, i);
cout << "Center " << i << " (odd): length " << len1;
if (len1 > 1) {
int start_pos = i - len1/2;
cout << " -> "" << s.substr(start_pos, len1) << """;
}
cout << endl;
// Even length palindromes (center between i and i+1)
int len2 = expandAroundCenter(s, i, i + 1);
if (len2 > 0) {
cout << "Center " << i << "-" << (i+1) << " (even): length " << len2;
int start_pos = i - len2/2 + 1;
cout << " -> "" << s.substr(start_pos, len2) << """ << endl;
}
int maxCurrent = max(len1, len2);
if (maxCurrent > maxLen) {
maxLen = maxCurrent;
start = i - (maxCurrent - 1) / 2;
}
}
return s.substr(start, maxLen);
}
private:
int expandAroundCenter(const string& s, int left, int right) {
while (left >= 0 && right < s.length() && s[left] == s[right]) {
left--;
right++;
}
return right - left - 1;
}
public:
// Manacher's Algorithm - O(n)
string longestPalindromeManacher(const string& s) {
if (s.empty()) return "";
// Preprocess string: "abc" -> "^#a#b#c#$"
string processed = "^#";
for (char c : s) {
processed += c;
processed += '#';
}
processed += '$';
cout << "\nManacher's Algorithm:" << endl;
cout << "Original: " << s << endl;
cout << "Processed: " << processed << endl;
int n = processed.length();
vector<int> P(n, 0); // P[i] = radius of palindrome centered at i
int center = 0, right = 0;
cout << "\nBuilding P array:" << endl;
cout << "i | char | P[i] | center | right | palindrome" << endl;
cout << "---|------|------|--------|-------|----------" << endl;
for (int i = 1; i < n - 1; i++) {
int mirror = 2 * center - i;
if (i < right) {
P[i] = min(right - i, P[mirror]);
}
// Expand around center i
while (processed[i + P[i] + 1] == processed[i - P[i] - 1]) {
P[i]++;
}
// Update center and right boundary if expanded past right
if (i + P[i] > right) {
center = i;
right = i + P[i];
}
cout << i << " | " << processed[i] << " | " << P[i]
<< " | " << center << " | " << right << " | ";
if (P[i] > 0) {
int start_pos = (i - P[i]) / 2;
int len = P[i];
cout << s.substr(start_pos, len);
}
cout << endl;
}
// Find the longest palindrome
int maxLen = 0, centerIndex = 0;
for (int i = 1; i < n - 1; i++) {
if (P[i] > maxLen) {
maxLen = P[i];
centerIndex = i;
}
}
int start = (centerIndex - maxLen) / 2;
return s.substr(start, maxLen);
}
// Check if entire string is palindrome
bool isPalindrome(const string& s) {
int left = 0, right = s.length() - 1;
while (left < right) {
if (s[left] != s[right]) return false;
left++;
right--;
}
return true;
}
// Count all palindromic substrings
int countPalindromes(const string& s) {
int count = 0;
for (int i = 0; i < s.length(); i++) {
// Odd length palindromes
count += countPalindromesExpand(s, i, i);
// Even length palindromes
count += countPalindromesExpand(s, i, i + 1);
}
return count;
}
private:
int countPalindromesExpand(const string& s, int left, int right) {
int count = 0;
while (left >= 0 && right < s.length() && s[left] == s[right]) {
count++;
left--;
right++;
}
return count;
}
};
int main() {
PalindromeAlgorithms pa;
string s = "babad";
string result1 = pa.longestPalindromeExpand(s);
cout << "\nLongest palindrome (Expand): "" << result1 << """ << endl;
string result2 = pa.longestPalindromeManacher(s);
cout << "\nLongest palindrome (Manacher): "" << result2 << """ << endl;
// Test palindrome checking
cout << "\nPalindrome tests:" << endl;
cout << ""racecar" is palindrome: " << pa.isPalindrome("racecar") << endl;
cout << ""hello" is palindrome: " << pa.isPalindrome("hello") << endl;
// Count all palindromes
cout << "\nTotal palindromic substrings in "" << s << "": "
<< pa.countPalindromes(s) << endl;
return 0;
}Manacher's algorithm preprocesses the string and uses previously computed information to avoid redundant checks.