rabin_karp
class RabinKarp:
def __init__(self, base=256, prime=101):
"""
Rabin-Karp Algorithm
Time Complexity: O(n+m) average, O(nm) worst case
Space Complexity: O(1)
where n is text length and m is pattern length
"""
self.base = base # Number of characters in input alphabet
self.prime = prime # A prime number
def find_pattern(self, text, pattern):
"""Find all occurrences of pattern in text"""
n = len(text)
m = len(pattern)
occurrences = []
# Calculate hash value for pattern and first window of text
pattern_hash = self._hash(pattern, m)
text_hash = self._hash(text, m)
# Slide pattern over text one by one
for s in range(n - m + 1):
# Check hash values
if pattern_hash == text_hash:
# If hash values match, check characters one by one
if self._check_match(text, pattern, s):
occurrences.append(s)
# Calculate hash value for next window of text
if s < n - m:
text_hash = self._rehash(text, text_hash, s, m)
return occurrences
def _hash(self, string, length):
"""Calculate hash value for string of given length"""
hash_value = 0
for i in range(length):
hash_value = (self.base * hash_value + ord(string[i])) % self.prime
return hash_value
def _rehash(self, text, old_hash, start, length):
"""Calculate hash value for next window"""
# Remove leading digit
old_hash = (old_hash - ord(text[start]) * pow(self.base, length-1, self.prime)) % self.prime
# Add trailing digit
new_hash = (old_hash * self.base + ord(text[start + length])) % self.prime
return new_hash
def _check_match(self, text, pattern, start):
"""Check if pattern matches text starting at position start"""
for i in range(len(pattern)):
if text[start + i] != pattern[i]:
return False
return True
class RabinKarpWithMultiplePatterns(RabinKarp):
def __init__(self, base=256, prime=101):
super().__init__(base, prime)
self.pattern_hashes = {}
def find_patterns(self, text, patterns):
"""Find all occurrences of multiple patterns in text"""
n = len(text)
m = len(patterns[0]) # Assume all patterns have same length
occurrences = {pattern: [] for pattern in patterns}
# Calculate hash values for all patterns
for pattern in patterns:
self.pattern_hashes[pattern] = self._hash(pattern, m)
# Calculate hash value for first window of text
text_hash = self._hash(text, m)
# Slide window over text one by one
for s in range(n - m + 1):
# Check hash values for all patterns
for pattern in patterns:
if self.pattern_hashes[pattern] == text_hash:
if self._check_match(text, pattern, s):
occurrences[pattern].append(s)
# Calculate hash value for next window of text
if s < n - m:
text_hash = self._rehash(text, text_hash, s, m)
return occurrences
# Example usage
if __name__ == "__main__":
# Create sample text and pattern
text = "AABAACAADAABAABA"
pattern = "AABA"
# Basic Rabin-Karp
rk = RabinKarp()
occurrences = rk.find_pattern(text, pattern)
print("Pattern occurrences (basic):", occurrences)
# Rabin-Karp with multiple patterns
rk_multi = RabinKarpWithMultiplePatterns()
patterns = ["AABA", "ACA", "BA"]
occurrences_multi = rk_multi.find_patterns(text, patterns)
print("\nPattern occurrences (multiple):")
for pattern, positions in occurrences_multi.items():
print(f"{pattern}: {positions}")
Explanation
Rabin-Karp algorithm is a string matching algorithm that uses hashing to find patterns in text.
How it works:
- Calculate hash value for pattern and first window of text
- Slide pattern over text one by one
- If hash values match, check characters one by one
- Calculate hash value for next window using rolling hash
- Repeat until end of text
Time Complexity:
- Best Case: O(n+m)
- Average Case: O(n+m)
- Worst Case: O(nm)
Space Complexity: O(1)
Applications:
- Text search
- Plagiarism detection
- DNA sequence matching
- Multiple pattern matching
- Large text processing
Advantages:
- Efficient average case
- Multiple pattern matching
- Rolling hash optimization
- Simple implementation
- Works for any pattern
Disadvantages:
- Poor worst-case performance
- Hash collisions possible
- Sensitive to hash function
- Requires prime number
- May need multiple checks