dbscan
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors import KDTree


class DBSCAN:
    def __init__(self, eps=0.5, min_samples=5):
        """
        DBSCAN Clustering

        Time Complexity: O(n^2) in the worst case, O(n log n) with spatial indexing
        Space Complexity: O(n^2) with the precomputed distance matrix used here,
                          O(n) with a spatial index
        where n is the number of samples
        """
        self.eps = eps
        self.min_samples = min_samples
        self.labels = None
        self.core_samples = None
    def fit(self, X):
        n_samples = X.shape[0]
        self.X = X  # keep the training data so predict() can measure distances to it
        self.labels = np.full(n_samples, -1)
        self.core_samples = np.zeros(n_samples, dtype=bool)
        cluster_id = 0
        # Compute all pairwise distances up front (O(n^2) memory)
        distances = euclidean_distances(X)
        for i in range(n_samples):
            if self.labels[i] != -1:
                continue  # already assigned to a cluster
            # Find neighbors within eps (the point itself is included)
            neighbors = self._find_neighbors(distances, i)
            if len(neighbors) < self.min_samples:
                self.labels[i] = -1  # mark as noise; may still become a border point later
            else:
                self.core_samples[i] = True
                self._expand_cluster(distances, i, neighbors, cluster_id)
                cluster_id += 1
        return self

    def _find_neighbors(self, distances, i):
        return np.where(distances[i] <= self.eps)[0]
    def _expand_cluster(self, distances, seed, neighbors, cluster_id):
        self.labels[seed] = cluster_id
        i = 0
        while i < len(neighbors):
            neighbor = neighbors[i]
            # Only unclaimed points (unvisited, or previously marked noise) are
            # labeled and expanded; expanding each point at most once is what
            # guarantees the loop terminates.
            if self.labels[neighbor] == -1:
                self.labels[neighbor] = cluster_id
                new_neighbors = self._find_neighbors(distances, neighbor)
                if len(new_neighbors) >= self.min_samples:
                    self.core_samples[neighbor] = True
                    neighbors = np.concatenate([neighbors, new_neighbors])
            i += 1
    def predict(self, X):
        if self.labels is None:
            raise ValueError("Model not fitted yet")
        # Distances from the new points to the training data
        distances = euclidean_distances(X, self.X)
        predictions = np.full(X.shape[0], -1)
        core_indices = np.where(self.core_samples)[0]
        if len(core_indices) == 0:
            return predictions  # no core samples: everything is noise
        for i in range(X.shape[0]):
            # Assign the label of the nearest core sample, but only if that
            # core sample lies within eps; otherwise the point is noise
            nearest = core_indices[np.argmin(distances[i, core_indices])]
            if distances[i, nearest] <= self.eps:
                predictions[i] = self.labels[nearest]
        return predictions
class DBSCANWithKDTree(DBSCAN):
    def fit(self, X):
        n_samples = X.shape[0]
        self.X = X  # keep the training data so the inherited predict() works
        self.labels = np.full(n_samples, -1)
        self.core_samples = np.zeros(n_samples, dtype=bool)
        cluster_id = 0
        # Build a KDTree so each radius query costs O(log n) on average
        tree = KDTree(X)
        for i in range(n_samples):
            if self.labels[i] != -1:
                continue  # already assigned to a cluster
            # Find neighbors using the KDTree
            neighbors = tree.query_radius(X[i:i + 1], r=self.eps)[0]
            if len(neighbors) < self.min_samples:
                self.labels[i] = -1  # mark as noise
            else:
                self.core_samples[i] = True
                self._expand_cluster(X, tree, i, neighbors, cluster_id)
                cluster_id += 1
        return self

    def _expand_cluster(self, X, tree, seed, neighbors, cluster_id):
        self.labels[seed] = cluster_id
        i = 0
        while i < len(neighbors):
            neighbor = neighbors[i]
            # Same invariant as the base class: each point is labeled and
            # expanded at most once
            if self.labels[neighbor] == -1:
                self.labels[neighbor] = cluster_id
                new_neighbors = tree.query_radius(X[neighbor:neighbor + 1], r=self.eps)[0]
                if len(new_neighbors) >= self.min_samples:
                    self.core_samples[neighbor] = True
                    neighbors = np.concatenate([neighbors, new_neighbors])
            i += 1
# Example usage
if __name__ == "__main__":
    # Generate sample data
    np.random.seed(42)
    n_samples = 300
    X = np.random.randn(n_samples, 2)
    # Shift the points to form three clusters
    X[:100] += 5
    X[100:200] -= 5
    X[200:] += np.array([5, -5])
    # Add some noise
    X = np.vstack([X, np.random.randn(50, 2) * 10])

    # Basic DBSCAN
    dbscan = DBSCAN(eps=1.0, min_samples=5)
    dbscan.fit(X)
    # Subtract one for the noise label (-1) only if it is present
    n_clusters = len(set(dbscan.labels)) - (1 if -1 in dbscan.labels else 0)
    print("Number of clusters:", n_clusters)
    print("Number of noise points:", np.sum(dbscan.labels == -1))

    # DBSCAN with KDTree
    dbscan_kd = DBSCANWithKDTree(eps=1.0, min_samples=5)
    dbscan_kd.fit(X)
    n_clusters_kd = len(set(dbscan_kd.labels)) - (1 if -1 in dbscan_kd.labels else 0)
    print("\nNumber of clusters (with KDTree):", n_clusters_kd)
    print("Number of noise points (with KDTree):", np.sum(dbscan_kd.labels == -1))
Explanation
DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is a density-based clustering algorithm that groups points that are closely packed and marks points lying alone in low-density regions as outliers.
How it works:
- For each unvisited point, find its neighbors within eps distance
- If a point has at least min_samples neighbors (counting itself), it is a core point
- Grow a cluster from each new core point by repeatedly adding all density-reachable points
- Points not reachable from any core point are labeled noise (a short walkthrough follows below)
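A minimal walkthrough on a toy dataset, using the DBSCAN class defined above; the coordinates and parameter values are illustrative choices, not prescribed by the algorithm:

points = np.array([
    [0.0, 0.0], [0.2, 0.1], [0.1, 0.3],   # dense group A
    [5.0, 5.0], [5.1, 4.9], [4.9, 5.2],   # dense group B
    [10.0, 0.0],                          # isolated point
])
model = DBSCAN(eps=0.5, min_samples=3).fit(points)
print(model.labels)        # [0 0 0 1 1 1 -1]: two clusters plus one noise point
print(model.core_samples)  # all six grouped points are core points

With eps=0.5, each point in a group is within eps of the other two, so every grouped point has three neighbors (itself included) and becomes a core point; the isolated point has only itself as a neighbor and is labeled noise.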
Time Complexity:
- Best Case: O(n log n) with spatial indexing
- Average Case: O(n log n) with spatial indexing
- Worst Case: O(n^2)
Space Complexity: O(n) with a spatial index; O(n^2) if the full pairwise distance matrix is precomputed, as in the basic implementation above
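A rough way to check the scaling claims is to time the two implementations above on the same data. A minimal sketch (absolute timings vary by machine; for small n the vectorized distance matrix of the basic version can still beat the per-point KDTree queries, and the tree's advantage shows up as n grows):

import time

X_bench = np.random.randn(2000, 2)
for Model in (DBSCAN, DBSCANWithKDTree):
    start = time.perf_counter()
    Model(eps=0.3, min_samples=5).fit(X_bench)
    print(Model.__name__, f"{time.perf_counter() - start:.3f}s")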
Applications:
- Anomaly detection
- Image segmentation
- Geographic data analysis
- Customer segmentation
- Document clustering
Advantages:
- Can find clusters of arbitrary shape
- Robust to noise
- No need to specify number of clusters
- Works well with spatial data
- Handles outliers effectively
Disadvantages:
- Sensitive to the eps and min_samples parameters (a common selection heuristic is sketched below)
- Not well suited to high-dimensional data, where distances concentrate
- Results depend on the chosen distance metric
- Memory intensive for large datasets when distances are precomputed
- A single eps setting struggles with clusters of widely varying densities
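A common heuristic for choosing eps is the k-distance plot: sort every point's distance to its k-th nearest neighbor (with k = min_samples) and look for the "elbow" where the curve bends sharply upward; that distance is a reasonable eps candidate. A minimal sketch, assuming matplotlib is available and X is the data array from the example above (reading the elbow is a judgment call, not an exact rule):

from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

k = 5  # typically set to min_samples
nn = NearestNeighbors(n_neighbors=k).fit(X)
dist, _ = nn.kneighbors(X)      # each query point is its own nearest neighbor
k_dist = np.sort(dist[:, -1])   # distance to the k-th neighbor, ascending
plt.plot(k_dist)
plt.xlabel("points sorted by k-distance")
plt.ylabel(f"distance to {k}-th nearest neighbor")
plt.show()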