gradient_descent
import numpy as np
class GradientDescent:
def __init__(self, learning_rate=0.01, max_iter=1000, tol=1e-6):
"""
Gradient Descent
Time Complexity: O(n * d * i)
Space Complexity: O(d)
where n is number of samples, d is number of features,
and i is number of iterations
"""
self.learning_rate = learning_rate
self.max_iter = max_iter
self.tol = tol
self.weights = None
self.loss_history = []
def fit(self, X, y):
n_samples, n_features = X.shape
self.weights = np.zeros(n_features)
for i in range(self.max_iter):
# Compute gradient
gradient = self._compute_gradient(X, y)
# Update weights
new_weights = self.weights - self.learning_rate * gradient
# Check convergence
if np.linalg.norm(new_weights - self.weights) < self.tol:
break
self.weights = new_weights
self.loss_history.append(self._compute_loss(X, y))
def predict(self, X):
return X @ self.weights
def _compute_gradient(self, X, y):
return 2 * X.T @ (X @ self.weights - y) / len(y)
def _compute_loss(self, X, y):
return np.mean((X @ self.weights - y)**2)
class StochasticGradientDescent(GradientDescent):
def __init__(self, learning_rate=0.01, max_iter=1000, tol=1e-6, batch_size=32):
super().__init__(learning_rate, max_iter, tol)
self.batch_size = batch_size
def fit(self, X, y):
n_samples, n_features = X.shape
self.weights = np.zeros(n_features)
for i in range(self.max_iter):
# Randomly sample batch
indices = np.random.choice(n_samples, self.batch_size, replace=False)
X_batch = X[indices]
y_batch = y[indices]
# Compute gradient
gradient = self._compute_gradient(X_batch, y_batch)
# Update weights
new_weights = self.weights - self.learning_rate * gradient
# Check convergence
if np.linalg.norm(new_weights - self.weights) < self.tol:
break
self.weights = new_weights
self.loss_history.append(self._compute_loss(X, y))
class MomentumGradientDescent(GradientDescent):
def __init__(self, learning_rate=0.01, max_iter=1000, tol=1e-6, momentum=0.9):
super().__init__(learning_rate, max_iter, tol)
self.momentum = momentum
self.velocity = None
def fit(self, X, y):
n_samples, n_features = X.shape
self.weights = np.zeros(n_features)
self.velocity = np.zeros(n_features)
for i in range(self.max_iter):
# Compute gradient
gradient = self._compute_gradient(X, y)
# Update velocity
self.velocity = self.momentum * self.velocity - self.learning_rate * gradient
# Update weights
new_weights = self.weights + self.velocity
# Check convergence
if np.linalg.norm(new_weights - self.weights) < self.tol:
break
self.weights = new_weights
self.loss_history.append(self._compute_loss(X, y))
class AdamGradientDescent(GradientDescent):
def __init__(self, learning_rate=0.001, max_iter=1000, tol=1e-6, beta1=0.9, beta2=0.999, epsilon=1e-8):
super().__init__(learning_rate, max_iter, tol)
self.beta1 = beta1
self.beta2 = beta2
self.epsilon = epsilon
self.m = None
self.v = None
self.t = 0
def fit(self, X, y):
n_samples, n_features = X.shape
self.weights = np.zeros(n_features)
self.m = np.zeros(n_features)
self.v = np.zeros(n_features)
for i in range(self.max_iter):
self.t += 1
# Compute gradient
gradient = self._compute_gradient(X, y)
# Update biased first moment estimate
self.m = self.beta1 * self.m + (1 - self.beta1) * gradient
# Update biased second moment estimate
self.v = self.beta2 * self.v + (1 - self.beta2) * gradient**2
# Compute bias-corrected first moment estimate
m_hat = self.m / (1 - self.beta1**self.t)
# Compute bias-corrected second moment estimate
v_hat = self.v / (1 - self.beta2**self.t)
# Update weights
new_weights = self.weights - self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)
# Check convergence
if np.linalg.norm(new_weights - self.weights) < self.tol:
break
self.weights = new_weights
self.loss_history.append(self._compute_loss(X, y))
# Example usage
if __name__ == "__main__":
# Generate sample data
np.random.seed(42)
n_samples = 100
n_features = 5
X = np.random.randn(n_samples, n_features)
true_weights = np.random.randn(n_features)
y = X @ true_weights + 0.1 * np.random.randn(n_samples)
# Basic Gradient Descent
gd = GradientDescent(learning_rate=0.01)
gd.fit(X, y)
print("Basic Gradient Descent Weights:", gd.weights)
# Stochastic Gradient Descent
sgd = StochasticGradientDescent(learning_rate=0.01, batch_size=32)
sgd.fit(X, y)
print("\nStochastic Gradient Descent Weights:", sgd.weights)
# Momentum Gradient Descent
mgd = MomentumGradientDescent(learning_rate=0.01, momentum=0.9)
mgd.fit(X, y)
print("\nMomentum Gradient Descent Weights:", mgd.weights)
# Adam Gradient Descent
adam = AdamGradientDescent(learning_rate=0.001)
adam.fit(X, y)
print("\nAdam Gradient Descent Weights:", adam.weights)
Explanation
Gradient Descent is an optimization algorithm used to minimize a function by iteratively moving in the direction of steepest descent.
How it works:
- Initialize weights randomly
- Compute gradient of loss function
- Update weights in direction of negative gradient
- Repeat until convergence
Time Complexity:
- Best Case: O(n * d * i)
- Average Case: O(n * d * i)
- Worst Case: O(n * d * i)
Space Complexity: O(d)
Applications:
- Linear regression
- Neural networks
- Logistic regression
- Support vector machines
- Deep learning
Advantages:
- Simple implementation
- Works for convex functions
- Easy to understand
- Widely applicable
- Efficient for large datasets
Disadvantages:
- May converge slowly
- Sensitive to learning rate
- May get stuck in local minima
- Requires differentiable function
- Memory intensive for large models