Lévy-Driven Stochastic Differential Equation formulation of SGD¶

Author: Aneesh Jonelagadda Date: Oct 2023 (edited 2025)

Stochastic Gradient Descent (SGD) is a common optimizer in deep learning. This project analyzes SGD through a stochastic differential equation (SDE) lens, treating it as a Lévy-driven SDE with an explicit noise process. We will therefore formulate and implement SGD as an exact gradient term plus a separate noise term. This 'manually noised' optimization is then simulated across varying loss landscape geometries and noise distribution properties, which helps us understand how escape behavior scales with these factors. These scaling behaviors are in turn useful for predicting which optimizers tend toward which types of basins, as we briefly discuss at the end.

It is worth noting that this project draws on the mathematical theory of several papers, chiefly Zhou et al., 2021 and Imkeller et al., 2010. Building on that theory, this project actually simulates basin escape in Python, implements Chambers-Mallows-Stuck simulation of Lévy processes, directly compares Lévy with Gaussian noise escape dynamics, simulates and analyzes basin-geometry scaling differences on a few pretty-looking 2D loss landscapes, and more!

We will first define SGD with no gradient noise plus a separate noise term, then add gradient noise from a Gaussian process to the SGD. Finally, drawing on the finding that gradient noise is in practice heavy-tailed, we will instead add a Lévy $\alpha$-stable noise process to SGD with varying levels of $\alpha$.

We will simulate the escape time $\Gamma$ from a local basin across varying loss landscape basin geometries for these differently noised optimizers. This will show how the dependence of $\Gamma$ on geometry changes when Gaussian noise is replaced with Lévy noise, as well as how $\Gamma$ depends on tail thickness as we modulate the tail index $\alpha$.

Now to build the SGD algorithms, let us first define the traditional form of Stochastic Gradient Descent.

$$\theta_{t+1} = \theta_t - \eta \nabla f_{\mathcal{S}_t}(\theta_t)$$ with $\nabla f_{\mathcal{S}_t}(\theta_t) = \frac{1}{S}\sum_{i \in \mathcal{S}_t}\nabla f_{i}(\theta_t)$ across mini-batches $\mathcal{S}_t$ of size $S$ sampled from the data.
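
For concreteness, here is a minimal NumPy sketch of this mini-batch update (the function `loss_grad` and the `data` array are hypothetical placeholders; the experiments below instead use the noise-decomposed form derived next):

In [ ]:
import numpy as np

def minibatch_sgd_step(theta, data, loss_grad, eta=0.1, batch_size=32, rng=np.random):
    #Sample a mini-batch S_t uniformly from the data
    idx = rng.choice(len(data), size=batch_size, replace=False)
    #Stochastic gradient: average of per-example gradients over the mini-batch
    grad_estimate = np.mean([loss_grad(theta, data[i]) for i in idx], axis=0)
    #Vanilla SGD update
    return theta - eta * grad_estimate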

We can now define the "gradient noise" $u_t$ as the difference between the true gradient at a particular location at time t and the computed/bootstrapped gradient. Thus we can reformulate the above SGD in terms of the exact gradient and a noise term:

$$u_t = \nabla F(\theta_t) - \nabla f_{\mathcal{S}_t}(\theta_t)$$ $$\theta_{t+1} = \theta_t - \eta \nabla F(\theta_t)+\eta u_t \tag{1}$$

Since we are defining the basins analytically, we will be able to compute the true gradient and then add a noise process to the algorithm as per (1). First let's describe our basins, then we will talk about noise.

In [2]:
import numpy as np
import torch
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from typing import Callable

2D Loss landscape formulation¶

Let's formulate the loss landscapes analytically in 2D so that we can directly compute the loss and gradient without any gradient noise. This is important because we will be directly adding gradient noise to the optimization later. We also want analytic landscapes so that we have direct control over the basin geometry. The high-level landscape type we will simulate is a two-basin model with one suboptimal and one optimal minimum. The optimization will start at the suboptimal local minimum, and 'escape time' will be quantified as the number of iterations until the iterate crosses into the basin of the global minimum. We can modulate the geometry of the starting basin independently of the other; this way escape time is easy to quantify for varying basin geometries. $$ \begin{align} F(x, y) &= \min\left\{ F_1(x, y), F_2(x, y) \right\} \\ F_i(x, y) &= h_i \left( \frac{r_i}{w_i} \right)^{p_i}, \qquad r_i = \sqrt{(x - x_i)^2 + (y - y_i)^2} \end{align} \tag{2} $$

where

  • $\mu_i = (x_i, y_i)$ is the center of basin $i$
  • $h_i > 0$ is the height/depth
  • $w_i > 0$ is the width
  • $p_i \geq 2$ is the sharpness (power)
  • $r_i$ is the distance from basin center

We can modulate the $h$, $w$, $p$ parameters across the two basins as follows:

$$ \begin{array}{|c|c|c|c|c|} \hline \text{Basin} & \boldsymbol{\mu} & h & w & p \\ \hline \text{Optimal} & (3, 3) & 1.0 & 2.0 & 2 \\ \hline \text{Suboptimal (Wide)} & (0, 0) & 1.0 & 1.0 & 2 \\ \hline \text{Suboptimal (Medium)} & (0, 0) & 1.0 & 0.75 & 4 \\ \hline \text{Suboptimal (Sharp)} & (0, 0) & 1.0 & 0.35 & 6 \\ \hline \end{array} $$

In [257]:
#Configuration for clear comparison
configs_sharp = {
    'suboptimal': {
        'center': (0, 0),
        'depth': 1.0,    
        'width': 0.35,    
        'power': 6      
    },
    'optimal': {
        'center': (3, 3),  
        'depth': 1.0,     
        'width': 2.0,  
        'power': 2         
    }
}

configs_medium = {
    'suboptimal': {
        'center': (0, 0),
        'depth': 1.0,   
        'width': 0.75,    
        'power': 4       
    },
    'optimal': {
        'center': (3, 3),  
        'depth': 1.0,      
        'width': 2.0,      
        'power': 2       
    }
}

configs_wide = {
    'suboptimal': {
        'center': (0, 0),
        'depth': 1.0,    
        'width': 1.0,   
        'power': 2       
    },
    'optimal': {
        'center': (3, 3),  
        'depth': 1.0,      
        'width': 2.0,     
        'power': 2        
    }
}

def F(x, y, configs=configs_sharp):
    #Suboptimal basin where SGD experiments will begin
    x1, y1 = configs['suboptimal']['center']
    r1 = np.sqrt((x - x1)**2 + (y - y1)**2)
    F1 = configs['suboptimal']['depth'] * \
         (r1 / configs['suboptimal']['width'])**configs['suboptimal']['power']
    
    #Optimal basin
    x2, y2 = configs['optimal']['center']
    r2 = np.sqrt((x - x2)**2 + (y - y2)**2)
    F2 = configs['optimal']['depth'] * \
         (r2 / configs['optimal']['width'])**configs['optimal']['power']
    
    #Pointwise minimum of the two basins
    return np.minimum(F1, F2)
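
As a quick sanity check (an aside added here, not part of the original run), both basin centers should sit at zero loss with a positive ridge between them:

In [ ]:
#Both basin centers should evaluate to 0; a point on the ridge between them should be positive
print(F(0, 0, configs_sharp), F(3, 3, configs_sharp))
print(F(1.5, 1.5, configs_sharp))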

Now let's plot this loss landscape to visualize:

In [20]:
xs = np.linspace(-4,5,1000)
ys = np.linspace(-4,5,1000)
x, y = np.meshgrid(xs, ys)

F_sharp = F(x, y, configs_sharp)
F_medium = F(x, y, configs_medium)
F_wide = F(x, y, configs_wide)

fig = plt.figure(figsize=[18,18])

ax = fig.add_subplot(121, projection='3d')

ax.set_title("Two-basin Loss Landscape for 3 different basin shapes")
ax.plot_surface(x , y, F_sharp, alpha=0.4, color='purple')
ax.plot_surface(x , y, F_medium, alpha=0.3, color='yellow')
ax.plot_surface(x , y, F_wide, alpha=0.2, cmap='viridis')


ax = fig.add_subplot(122, projection='3d')
ax.view_init(elev=0, azim=0)

ax.set_title("Two-basin Loss Landscape (Orthogonal View)")
#ax.plot_surface(x , y, F_elliptical_well(x,y,3,3), alpha=0.7, cmap='viridis')
ax.plot_surface(x , y, F_sharp, alpha=0.4, color='purple')

ax.plot_surface(x , y, F_medium, alpha=0.3, color='yellow')

ax.plot_surface(x , y, F_wide, alpha=0.2, cmap='viridis')


plt.show()
[Figure: 3D surface plots of the two-basin loss landscape for the sharp, medium, and wide suboptimal-basin configurations, shown in perspective and orthogonal views.]

Gradient Formulation¶

Now let's formulate the gradient $\nabla F$ for the loss landscape. For each basin $F_i$, it is simply: $$\nabla F = \begin{bmatrix} \frac{\partial F}{\partial x} \\ \frac{\partial F}{\partial y} \end{bmatrix} $$

$$ \begin{align} \nabla F(x, y) &= \nabla F_i(x, y) \quad \text{where } i = \arg\min_j F_j(x, y) \\[10pt] \frac{\partial F_i}{\partial x} &= \frac{h_i p_i}{w_i^2} \left( \frac{\sqrt{(x - x_i)^2 + (y - y_i)^2}}{w_i} \right)^{p_i - 2} (x - x_i) \\[10pt] \frac{\partial F_i}{\partial y} &= \frac{h_i p_i}{w_i^2} \left( \frac{\sqrt{(x - x_i)^2 + (y - y_i)^2}}{w_i} \right)^{p_i - 2} (y - y_i) \end{align} \tag{3} $$

In [52]:
def grad_F(x, y, configs):
    '''
    Inputs: point(s) (x, y) to evaluate and the loss surface configs
    Outputs:
    gradient (np.array of [dF/dx, dF/dy]),
    escaped (bool): whether the point lies in the optimal basin (basin 2)
    '''
    c_1 = configs['suboptimal']
    c_2 = configs['optimal']
    
    #Centers
    x1, y1 = c_1['center']
    x2, y2 = c_2['center']
    
    #Distances
    r1 = np.sqrt((x-x1)**2 + (y-y1)**2)
    r2 = np.sqrt((x-x2)**2 + (y-y2)**2)
    
    #Detects which basin to use
    F1 = c_1['depth'] * (r1/c_1['width'])**c_1['power']
    F2 = c_2['depth'] * (r2/c_2['width'])**c_2['power']
    use_basin1 = F1 < F2
    
    #Avoid division by zero
    r1 = np.maximum(r1, 1e-10)
    r2 = np.maximum(r2, 1e-10)
    
    #Gradient of basin 1
    coeff1 = (c_1['depth']*c_1['power']/c_1['width']**2)*(r1/c_1['width'])**(c_1['power']-2)
    grad1_x = coeff1 * (x - x1)
    grad1_y = coeff1 * (y - y1)
    
    #Gradient of basin 2
    coeff2 = (c_2['depth']*c_2['power']/c_2['width']**2)*(r2/c_2['width'])**(c_2['power']-2)
    grad2_x = coeff2*(x-x2)
    grad2_y = coeff2*(y-y2)
    
    #Return gradient from active basin and escaped bool
    return np.array([np.where(use_basin1, grad1_x, grad2_x), np.where(use_basin1, grad1_y, grad2_y)]), not use_basin1
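
A finite-difference spot check (again an added aside) is a cheap way to confirm the analytic gradient (3) matches the landscape (2) away from the basin boundary and centers:

In [ ]:
#Compare the analytic gradient to a central finite difference at a point inside basin 1
x0, y0, h = 0.2, 0.1, 1e-6
(analytic_gx, analytic_gy), _ = grad_F(x0, y0, configs_sharp)
numeric_gx = (F(x0 + h, y0, configs_sharp) - F(x0 - h, y0, configs_sharp)) / (2*h)
numeric_gy = (F(x0, y0 + h, configs_sharp) - F(x0, y0 - h, configs_sharp)) / (2*h)
print(analytic_gx - numeric_gx, analytic_gy - numeric_gy)  #Both differences should be ~0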

Noise Processes for $u_t$¶

As stated previously, one commonly used assumption to analyze SGD behavior is to assume gradient noise obeys a Gaussian distribution.

Instead, what is proposed in the literature is the Lévy $\alpha$-stable distribution, a class of distributions with tail parameter $\alpha \in (0,2]$. In the symmetric, zero-centered case ($S\alpha S$), it is characterized by its characteristic function:

$$ \mathbb{E}[e^{i \omega x}] = \exp(-\sigma^\alpha |\omega|^\alpha), $$

These distributions have infinite variance for $\alpha < 2$, and the mean is undefined for $\alpha \le 1$. The breakpoints at $\alpha=1,2$ are two very common distributions: $\alpha=1$ corresponds to the Cauchy (Lorentz) distribution, and $\alpha=2$ is the beloved Gaussian distribution! Thus as $\alpha$ (sometimes called the 'tail index') decreases, the heaviness of the tails increases.
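
To make the infinite-variance point concrete, here is a small illustrative aside: the running sample variance of Cauchy ($\alpha=1$) draws never settles, unlike the Gaussian ($\alpha=2$) case. (The Cauchy draws are generated with the same $\tan U$ transform used in the sampler below.)

In [ ]:
#Running sample variance: settles for Gaussian (alpha=2), keeps jumping for Cauchy (alpha=1)
rng = np.random.default_rng(0)
gauss_draws = rng.standard_normal(100000)
cauchy_draws = np.tan(rng.uniform(-np.pi/2, np.pi/2, 100000))
for n in [10**3, 10**4, 10**5]:
    print(n, np.var(gauss_draws[:n]), np.var(cauchy_draws[:n]))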

Chambers–Mallows–Stuck (CMS) Method for Simulating Stable α-Distributions¶

The Chambers–Mallows–Stuck (CMS) method is a widely used algorithm for simulating random variables from an $\alpha$-stable distribution.

The parametrizations of the class of distributions are:

  • $\alpha$ (tail index/thickness) $0<\alpha\le 2$
  • $\beta$ (skewness/asymmetry), $-1\le \beta \le 1$
  • $\gamma$ (scale) $\gamma>0$
  • $\delta$ (location, shift), $\delta \in \mathbb{R}$

The algorithm proceeds as follows:

  1. First we draw two random numbers $U$ and $W$ from $$ U \sim \mathrm{Uniform}\!\left(-\frac{\pi}{2},\,\frac{\pi}{2}\right), \qquad W \sim \mathrm{Exp}(1). $$

  2. If $\alpha \ne 1$, set $$ \phi \;=\; \frac{1}{\alpha}\arctan\!\big(\beta \tan(\tfrac{\pi \alpha}{2})\big), \qquad S \;=\; \Big(1+\beta^2\tan^2(\tfrac{\pi\alpha}{2})\Big)^{\tfrac{1}{2\alpha}}, $$ and compute $$ X \;=\; \delta \;+\; \gamma \, S \; \frac{\sin\!\big(\alpha\,(U+\phi)\big)}{\big(\cos U\big)^{1/\alpha}}\; \left(\frac{\cos\!\big(U-\alpha\,(U+\phi)\big)}{W}\right)^{\tfrac{1-\alpha}{\alpha}}. $$

    Note if $\beta=0$ then $\phi=0$, thus the above reduces to

    $$ X \;=\; \delta \;+\; \gamma \; \frac{\sin(\alpha U)}{\big(\cos U\big)^{1/\alpha}} \left(\frac{\cos\!\big((1-\alpha)U\big)}{W}\right)^{\tfrac{1-\alpha}{\alpha}}. \tag{4} $$

  3. If $\alpha = 1$, use the special form $$ X \;=\; \delta \;+\; \frac{2}{\pi}\,\gamma\left[ \left(\frac{\pi}{2} + \beta U\right)\tan U \;-\; \beta \,\ln\!\left(\frac{(\tfrac{\pi}{2})\,W\,\cos U}{\tfrac{\pi}{2}+\beta U}\right) \right]. $$

    If $\beta=0$, note this reduces to

    $$ X \;=\; \delta + \gamma \tan U, $$ i.e. a (shifted, scaled) Cauchy draw.

Now let's implement this in Python in a handy function that can draw an arbitrary number of samples (via the size parameter) from the Lévy stable distribution. Let's set the shift, scale, and skewness to convenient values and mainly concern ourselves with the tail index $\alpha$.

We will also use this function to plot the difference between Lévy stable (with $\alpha < 2$) and Gaussian distributions, focusing on the tail heaviness.

In [178]:
def levy_stable_sample(alpha, beta=0, size=1):
    #Handles Gaussian case (alpha=2); a standard normal stands in for the Gaussian endpoint
    if alpha == 2:
        return np.random.randn(size)
    
    #Chambers-Mallows-Stuck method
    U = np.random.uniform(-np.pi/2, np.pi/2, size)
    W = np.random.exponential(1.0, size)
    
    #Handles Cauchy case (alpha=1); this shortcut assumes beta=0
    if alpha == 1:
        return np.tan(U)
    
    const = beta * np.tan(np.pi * alpha / 2)
    B = np.arctan(const) / alpha
    S = (1 + const**2)**(1/(2*alpha))
    
    term1 = np.sin(alpha * (U + B))
    term2 = (np.cos(U))**(1/alpha)
    term3 = (np.cos(U - alpha * (U + B)) / W)**((1 - alpha)/alpha)
    
    samples = S*term1/term2*term3
    
    return samples[0] if size == 1 else samples
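
If SciPy is available, we can optionally cross-check the CMS sampler against scipy.stats.levy_stable, which for $\beta=0$ and unit scale targets the same symmetric $\alpha$-stable law; this check is an addition and its output is not shown here:

In [ ]:
from scipy.stats import levy_stable

#Compare empirical tail probabilities of our CMS sampler vs SciPy's reference sampler
alpha_test = 1.5
ours = levy_stable_sample(alpha=alpha_test, size=50000)
ref = levy_stable.rvs(alpha_test, 0, size=50000)
for t in [1, 5, 10, 20]:
    print(t, np.mean(np.abs(ours) > t), np.mean(np.abs(ref) > t))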
In [30]:
#Generate samples
np.random.seed(42)
gaussian = np.random.randn(10000)
levy = levy_stable_sample(alpha=1.5, size=10000)

#Clip extreme outliers for visualization
levy_clipped = np.clip(levy, -10, 10)

fig, axes = plt.subplots(1, 4, figsize=[18, 5])

#Plot 1: Gaussian
axes[0].hist(gaussian, bins=100, alpha=0.8, color='blue', density=True, edgecolor='black')
axes[0].set_xlim(-4, 4)
axes[0].set_title('Gaussian Distribution', fontsize=14)
axes[0].set_xlabel('Value')
axes[0].set_ylabel('Density')
axes[0].grid(alpha=0.3)

#Plot 2: Lévy (clipped for visualization)
axes[1].hist(levy_clipped, bins=100, alpha=0.8, color='red', density=True, edgecolor='black')
axes[1].set_xlim(-4, 4)
axes[1].set_title('Lévy α=1.5 (Heavy Tails)', fontsize=14)
axes[1].set_xlabel('Value')
axes[1].set_ylabel('Density')
axes[1].grid(alpha=0.3)


#Plot 3: Log scale to see tails
axes[2].hist(gaussian, bins=200, alpha=0.6, label='Gaussian', color='blue', 
             range=(-50, 50), log=True)
axes[2].hist(levy, bins=200, alpha=0.6, label='Lévy (α=1.5)', color='red', 
             range=(-50, 50), log=True)
axes[2].set_xlim(-50, 50)
axes[2].set_title('Heavy Tails (log scale)', fontsize=14)
axes[2].set_xlabel('Value')
axes[2].set_ylabel('Count (log)')
axes[2].legend()


#Plot 4: Comparison showing tail probabilities
tail_thresholds = np.arange(0, 10, 0.5)
gaussian_tail_prob = [np.mean(np.abs(gaussian) > t) for t in tail_thresholds]
levy_tail_prob = [np.mean(np.abs(levy) > t) for t in tail_thresholds]

axes[3].semilogy(tail_thresholds, gaussian_tail_prob, 'b-', linewidth=2, label='Gaussian')
axes[3].semilogy(tail_thresholds, levy_tail_prob, 'r-', linewidth=2, label='Lévy α=1.5')
axes[3].set_xlabel('Threshold', fontsize=12)
axes[3].set_ylabel('P(|X| > threshold)', fontsize=12)
axes[3].set_title('Tail Probability (Heavy Tails!)', fontsize=14)
axes[3].legend(fontsize=12)
axes[3].grid(alpha=0.3)

plt.tight_layout()
plt.show()
[Figure: histograms of Gaussian vs. Lévy (α=1.5) samples, log-scale tail histograms, and tail probability P(|X| > threshold) on a semilog axis.]

Lévy-driven Stochastic Differential Equation¶

Let's assume $u_t$ follows an $S\alpha S$ distribution with a time-dependent scaling matrix $\Sigma_t$ (playing the role of a covariance), which models the structure of the gradient noise. For simplicity we take $\Sigma_t$ to be diagonal (axis-aligned, possibly anisotropic), so each coordinate can have its own scaling but remains independent of the others.

For a small learning rate $\eta$ and $ \varepsilon = \eta^{(\alpha - 1)/\alpha}$, the dynamics of SGD can be approximated by the Lévy-driven stochastic differential equation

$$ d\theta_t = -\nabla F(\theta_t)\,dt + \varepsilon \Sigma_t\, dL_t. \tag{5} $$

The first term $-\nabla F(\theta_t)\,dt$ represents the deterministic drift toward minimizing the loss, while the second term $\varepsilon \Sigma_t\, dL_t$ injects random fluctuations driven by heavy-tailed Lévy noise. Here $L_t \in \mathbb{R}^d$ is a Lévy motion whose coordinates are independent symmetric $\alpha$-stable processes; by self-similarity, an increment over a time step $dt$ is distributed as $(dt)^{1/\alpha}$ times an $S\alpha S(1)$ draw.

When $\alpha = 2$, the $S\alpha S$ distribution reduces to the Gaussian and the SDE simplifies to a Brownian-driven form which corresponds to the typical 'diffusion' description of SGD. In this case, the Lévy motion $L_t$ becomes a standard Brownian motion $B_t$, and the SDE simplifies to

$$ d\theta_t = -\nabla F(\theta_t)dt + \sqrt{\eta}\,\Sigma_t dB_t \tag{6} $$

$B_t$ is Brownian motion, whose increments are Gaussian and have finite variance. The noise term $\sqrt{\eta}\,\Sigma_t\, dB_t$ corresponds to small Gaussian fluctuations. For $\alpha < 2$, the Lévy-driven formulation captures rare but large gradient fluctuations that cannot be modeled by Gaussian noise.
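
To connect (5) back to the discrete update we simulate, here is a minimal Euler–Maruyama-style sketch, assuming $\Sigma_t = I$ and drawing an independent stable increment per coordinate (it reuses grad_F and levy_stable_sample from above):

In [ ]:
#One Euler-Maruyama step of d(theta) = -grad F dt + eps * dL with time step dt = eta
def levy_sde_step(theta, configs, eta, alpha):
    eps = eta**((alpha - 1)/alpha)                  #Noise scale from the SDE formulation
    grad, _ = grad_F(theta[0], theta[1], configs)
    #alpha-stable increment over dt=eta scales like dt**(1/alpha)
    dL = eta**(1/alpha) * levy_stable_sample(alpha=alpha, size=2)
    return theta - eta*grad + eps*dL

Since $\varepsilon\,\eta^{1/\alpha} = \eta^{(\alpha-1)/\alpha}\,\eta^{1/\alpha} = \eta$, a single step of this scheme is exactly the noised update (1) with $u_t$ an $S\alpha S$ draw; the simulation below implements that form directly (with one shared noise value per step rather than one per coordinate).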

Escape Dynamics¶

Before we launch into the experiment of running differently noised SGD, let us first examine the theory of the escape dynamics as per https://arxiv.org/pdf/2010.05627. We will attempt to paraphrase the key theorem and contextualize it to our use-case for this experiment:

Theorem from Zhou et al., 2021: escaping time bounds¶

Let $$ \varepsilon = \eta^{(\alpha-1)/\alpha}, \qquad \Theta(\varepsilon^{-1}) = \tfrac{2}{\alpha}\,\varepsilon^{-\alpha}, $$ let $m(\mathcal{W})$ be the Radon measure of the escape window $\mathcal{W}$, which corresponds to the size of the exit set from a basin, and let $\Gamma$ be the escaping time.

For some small error parameter $\rho$ and any $u>-1$, $$ \frac{1-\rho}{1+u+\rho} \;\le\; \mathbb{E}\!\left[\exp\!\big(-u\,m(\mathcal{W})\,\Theta(\varepsilon^{-1})\,\Gamma\big)\right] \;\le\; \frac{1+\rho}{1+u-\rho}. $$

As a result, the escaping time $\Gamma$ can be related to the escape set Radon measure and the learning rate

$$ \Gamma \sim O\!\left(\frac{1}{m(\mathcal{W})\,\Theta(\varepsilon^{-1})}\right) \tag{7} $$

One main thing to note is the relationship between the escape set Radon measure $m(\mathcal{W})$ and the basin Radon measure $m(\Omega)$. If the basin $\Omega$ has a larger volume, it has a larger Radon measure $m(\Omega)$ as per the definition of Radon measure (Volume can actually be selected as a specific choice of Radon measure). This additionally means that the complementary set $\mathcal{W}^C$ of the exit set $\mathcal{W}$ (the part of the loss landscape that's the "in" set rather than the "exit" set) has a larger Radon measure $m(\mathcal{W}^C)$. Since $\mathcal{W} \cup \mathcal{W^C}$ is constant, a larger $m(\mathcal{W}^C)$ means a smaller $m(\mathcal{W})$.

As a result, the following relationship holds between three basins of increasing width (and thus volume) $\Omega_A$, $\Omega_B$, $\Omega_C$:

$$\begin{aligned} V(\Omega_A) < V(\Omega_B) < V(\Omega_C) \to m(\Omega_A) < m(\Omega_B) < m(\Omega_C)\to \\ m(\mathcal{W_A}^C) < m(\mathcal{W_B}^C) < m(\mathcal{W_C}^C)\to m(\mathcal{W_A}) > m(\mathcal{W_B}) > m(\mathcal{W_C})\to\\ \Gamma_A < \Gamma_B < \Gamma_C \end{aligned} \tag{8} $$

We can use this as a hypothesis for our experiments with the 3 differently shaped basins denoted 'sharp', 'medium', and 'wide'. The 'sharp' configuration should have a smaller basin volume than 'medium', which in turn has a smaller basin volume than 'wide', so we should expect faster escape from the sharper basins. This also helps explain why heavy-tailed Lévy noise dramatically accelerates escape compared to Gaussian noise: escape from wide basins is driven by large jumps, which are far more common under the Lévy process.

Escape Experiment: Lévy escape for different geometry basins and gradient noise tail index $\alpha$¶

Now let us perform SGD escape given the noise term, recalling the noised SGD formulation (1): $$\theta_{t+1} = \theta_t - \eta \nabla F(\theta_t)+\eta u_t$$

We will examine escape properties for $u_t$ obeying Lévy-stable and Gaussian processes, the latter of which corresponds to the Brownian-driven diffusion commonly assumed for SGD. Escape will be quantified simply as the point where the active basin switches from $F_1$ to $F_2$, i.e. the first iteration at which the gradient is no longer taken from the 'suboptimal' basin. Since Gaussian noise simply corresponds to $\alpha=2$, we can use the same Lévy noise function for both analyses.

The escape times will be averaged over 1000 runs per geometry to ensure reliability of results, and the optimization will be run for our 3 different basin geometries. Let's finally define the Python SGD function as per (1) and run our experiment:

In [240]:
def SGD(steps, configs, eta=0.001, noise_alpha=2):
    #Re-seed from system entropy so repeated runs are independent
    np.random.seed()
    #Start at the center of the suboptimal basin
    theta = np.array(configs['suboptimal']['center'], dtype=float)
    thetas = [theta]
    escaped = False
    step_count = 0
    #Pre-generate the whole noise sequence for this run
    levy_noise = levy_stable_sample(alpha=noise_alpha, size=steps)

    while not escaped and step_count < steps: 
        #Exact gradient of the analytic landscape (and whether we've left basin 1)
        grad, escaped = grad_F(x=theta[0], y=theta[1], configs=configs)
        grad_term = eta*grad
        #Noise term drawn from the pre-generated Lévy (or Gaussian) samples
        noise_term = eta*levy_noise[step_count]
        #SGD update as per (1): exact gradient step plus injected gradient noise
        theta = theta - grad_term + noise_term
        thetas.append(theta)
        step_count += 1
    return thetas, step_count
In [211]:
#Averages escape time over 1000 runs per geometry for more robust analysis
s_wide,s_medium,s_sharp=0,0,0
eta=0.05
alpha=1.5
for i in range(1000):
    s_wide += SGD(10000,configs_wide,eta=eta,noise_alpha=alpha)[1]
    s_medium += SGD(10000,configs_medium,eta=eta,noise_alpha=alpha)[1]
    s_sharp += SGD(10000,configs_sharp,eta=eta,noise_alpha=alpha)[1]
s_wide /= 1000
s_medium /= 1000
s_sharp /= 1000
print(f"Average Wide Escape Time: {s_wide}\nAverage Medium Escape Time: {s_medium}\nAverage Sharp Escape Time:{s_sharp}")
Average Wide Escape Time: 328.322
Average Medium Escape Time: 130.378
Average Sharp Escape Time:19.348
In [241]:
s_wide,s_medium,s_sharp=0,0,0
eta=0.2
alpha=2
for i in range(1000):
    s_wide += SGD(10000,configs_wide,eta=eta,noise_alpha=alpha)[1]
    s_medium += SGD(10000,configs_medium,eta=eta,noise_alpha=alpha)[1]
    s_sharp += SGD(10000,configs_sharp,eta=eta,noise_alpha=alpha)[1]
s_wide /= 1000
s_medium /= 1000
s_sharp /= 1000
print(f"Average Wide Escape Time: {s_wide}\nAverage Medium Escape Time: {s_medium}\nAverage Sharp Escape Time:{s_sharp}")
Average Wide Escape Time: 8581.439
Average Medium Escape Time: 143.298
Average Sharp Escape Time:4.331
In [288]:
#Basin widths and the averaged escape times printed above (y_1: Lévy alpha=1.5, y_2: Gaussian)
x=[configs_sharp['suboptimal']['width'],configs_medium['suboptimal']['width'],configs_wide['suboptimal']['width']]
y_1=[19.348,130.378,328.322]
y_2=[4.331,143.298,8581.439]

fig = plt.figure(figsize=[14,5])

#Escape time for Lévy
ax = fig.add_subplot(121)
ax.set_title("Lévy Escape Time vs Basin Width (Radon Measure Proxy)")
ax.set_xlabel("Starting Basin Width")
ax.set_ylabel(r"Escape Time $\Gamma$ (SGD steps)")
ax.scatter(x, y_1, color='blue', label='Lévy Noise')
ax.plot(x, y_1, color='blue')
ax.set_xscale("log")
ax.set_yscale("log")
plt.legend()

ax2 = fig.add_subplot(122)
ax2.set_title("Gaussian Escape Time vs Basin Width (Radon Measure Proxy)")
ax2.set_xlabel("Starting Basin Width")
ax2.set_ylabel(r"Escape Time $\Gamma$ (SGD steps)")
ax2.set_xscale("log")
ax2.set_yscale("log")
ax2.scatter(x, y_2, color='red', label='Gaussian Noise')
ax2.plot(x, y_2, color='red')

plt.legend()
plt.show()
[Figure: log-log plots of escape time Γ vs. starting basin width for Lévy (α=1.5) noise and Gaussian noise.]
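
To probe the tail-index dependence promised in the section title more finely than the two-point comparison $\alpha \in \{1.5, 2\}$ above, one could sweep $\alpha$ directly. A sketch of such a sweep (not run here, so no results are reported) would be:

In [ ]:
#Sketch: sweep the tail index alpha and record mean escape time per basin geometry
alphas = [1.2, 1.4, 1.6, 1.8, 2.0]
escape_vs_alpha = {name: [] for name in ['sharp', 'medium', 'wide']}
for a in alphas:
    for name, cfg in [('sharp', configs_sharp), ('medium', configs_medium), ('wide', configs_wide)]:
        times = [SGD(10000, cfg, eta=0.05, noise_alpha=a)[1] for _ in range(200)]
        escape_vs_alpha[name].append(np.mean(times))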

Analysis¶

First and most importantly, our hypothesis (8) of $\Gamma_A < \Gamma_B < \Gamma_C$ was validated; the wider basins (and thus the basins with larger Radon measure) had higher escape times $\Gamma$. Additionally, the heavier-tailed Lévy noise facilitated much faster exit than Gaussian noise.

In terms of scaling laws, we see that the Lévy escape time is approximately polynomial as a function of basin width, which agrees with the literature (Zhou et al., 2021). We also see that a polynomial fit of the Gaussian escape times is not fully accurate, probably because the curvature also varies across the three test geometries, which relates to the Eyring-Kramers law for Brownian-driven SDEs. To explain fully, the Eyring–Kramers law gives the expected escaping time: $$ \mathbb{E}[\Gamma] \;\approx\; \frac{2\pi}{|\lambda^-|} \sqrt{\frac{|\det \nabla^2 F(\theta^\dagger)|}{\det \nabla^2 F(\theta^\ast)}} \;\exp\!\left(\tfrac{2}{\eta}\,\big[F(\theta^\dagger)-F(\theta^\ast)\big]\right), $$ where $\theta^\ast$ is the minimum, $\theta^\dagger$ is the saddle, and $\lambda^-$ is the negative eigenvalue of the Hessian at the saddle. The prefactor $ \frac{2\pi}{|\lambda^-|} \sqrt{\frac{|\det \nabla^2 F(\theta^\dagger)|}{\det \nabla^2 F(\theta^\ast)}}$ depends on the basin curvature and width, while the exponential term $ \exp\!\left(\tfrac{2}{\eta}\,[F(\theta^\dagger)-F(\theta^\ast)]\right) $ depends on the barrier height.

Thus, as per the theory, the Gaussian escaping time grows exponentially with the barrier height $F(\theta^\dagger)-F(\theta^\ast)$ and polynomially through the Eyring-Kramers prefactor (a measure of curvature/width), while the Lévy escaping time scales only polynomially in $\varepsilon$ as per (7) (with $\varepsilon$ dependent on $\eta$ and $\alpha$) and in the escape-set Radon measure $m(\mathcal{W})$, which shrinks as the basin volume grows.

Conclusion¶

So to wrap things up, this theory for the most part matches what we observe in the Lévy escape figures, since we observe polynomial scaling. For Gaussian noise, as just discussed, we cannot directly compare escape time to basin width, since the curvature also varies across the three geometries; the Eyring-Kramers prefactor is therefore not isolated in a way that would reveal clean polynomial scaling. It is also worth noting that we hold the depth parameter of the suboptimal basin fixed, so barrier-height effects are not what this experiment probes, and we would not expect to observe the exponential scaling here. Some final remarks: since Gaussian noise produces small fluctuations, crossing barriers is exponentially rare throughout the stochastic process, whereas Lévy noise has heavy tails that yield occasional large jumps, making escape much faster and only polynomially limited.

As stated in the introduction, this type of analysis is useful for understanding the basin preferences of different optimizers. SGD has been observed empirically to generalize very well, which corresponds to a preference for wider basins. This makes sense given this analysis (which is also discussed in Zhou et al., 2021)! We assume the SGD process has a quite heavy-tailed gradient noise distribution that can be expressed as Lévy $\alpha$-stable, and increasing basin width (and thus decreasing the escape-set Radon measure) becomes the limiting factor for escape, causing wide loss basins to be quite stable under SGD.

In [1]:
import numpy as np
import copy
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from itertools import cycle
import matplotlib.pyplot as plt
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

Follow Up Study: Empirical Lévy Index $\alpha$ Estimation on Gradient Noise during Training¶

To test one of the opening assumptions of our experiment, namely that the 'real' gradient noise is $\alpha$-stable rather than Gaussian, we will track the gradient noise distribution of a high-dimensional deep neural network during training. Then, using fancy math, we will estimate the Lévy index $\alpha$ of this distribution and track $\alpha$ as the model fits.

Lévy index $\alpha$ estimation - How do we even get $\alpha$ in practice??¶

We can use Mohammadi, Mohammadpour (2014)'s nifty method of estimating Lévy index $\alpha$ using samples generated from the distribution we want to estimate. Since we are estimating the gradient distribution, all we need is samples of gradients which we can directly get from the network during training.

Once we have the gradients, we use Equation (9) from Mohammadi (2014) (which is also, coincidentally, equation (9) in this project). Let $X_i \sim S_{\alpha}(\sigma, \beta, \mu)$, meaning $X_i$ is drawn from an arbitrary $\alpha$-stable distribution, with $X_i$ for $i=1,2,\dots,m$ an i.i.d. sequence: $$X_1 + \dots + X_m \overset{d}{=} m^{1/\alpha}X_1 + \mu(m-m^{1/\alpha}) \tag{9}$$

This implies that the sum of length-$m$ blocks of i.i.d samples from $S_\alpha$ scales like $m^{1/\alpha}$. If we let $\theta_m$ be the scaling parameter for the sum statistic $Y=\Sigma_i X_i$, and each $X_i$ has scale $\theta$, then (9) gives us $\theta_m = m^{1/\alpha}\theta$. Taking the logs of both sides, we get: $$\frac{1}{\alpha} = \frac{\log{\theta_m}- \log{\theta}}{\log{m}}$$
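
As a quick sanity check of this relation, take the Gaussian endpoint $\alpha = 2$: the sum of $m$ i.i.d. Gaussians has scale $\sqrt{m}\,\theta$, so $$\frac{1}{\alpha} = \frac{\log(\sqrt{m}\,\theta) - \log\theta}{\log m} = \frac{\tfrac{1}{2}\log m}{\log m} = \frac{1}{2},$$ recovering $\alpha = 2$ as expected.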

Now we can use the estimators for $\theta_m$ and $\theta$ to get an estimator for $\alpha$. The paper's suggestion is to use two order statistics to get a measure of the 'spread' of the distribution $X_{i:n}, X_{j:n}$ and $Y_{i:n}, Y_{j:n}$. These act as estimators for the scale parameters: $$ \frac{1}{\hat{\alpha}_{i,j}(m,n)} = \frac{ \log\!\left( \frac{Y_{j:n} - Y_{i:n}}{\mathbb{E}(Z_{j:n} - Z_{i:n})} \right) - \log\!\left( \frac{X_{j:n} - X_{i:n}}{\mathbb{E}(Z_{j:n} - Z_{i:n})} \right) }{ \log(m) } = \frac{ \log(Y_{j:n} - Y_{i:n}) - \log(X_{j:n} - X_{i:n}) }{ \log(m) }. $$

In practice, a simpler empirical estimator of the spread is the norm of samples from the raw distribution $X$ and from the block-sum distribution $Y$ (averaged in log space, as in the code below). Hence this becomes:

$$\frac{1}{\hat{\alpha}_{i,j}(m,n)} = \frac{ \log{(||Y||)} - \log{(||X||)}}{\log{(m)}} $$

where $Y$ is our distribution of $n$ binned sums with block length $m$ of $X$, which is the raw distribution of gradients.

In [2]:
def alpha_estimator(grads):
    '''
    Estimates alpha according to Mohammadi, Mohammadpour (2014) Section 3.2
    '''
    #First finds m=sum block size (closest to root N probably best). We pray here that N is not prime.
    N = len(grads)
    
    for i in range(1, int(N**0.5)+1):
        if N%i == 0:
            m=i
    n = int(N/m) #Gets num of blocks n (each with size m)

    #Shifts the µ of the alpha-stable distribution to 0 (since Section 3.2 assumes centered distribution)
    grads = torch.stack(grads)
    grads -= grads.mean()
    
    #Gets the real noise distr. and the binned sum noise distr. for comparison
    X = grads
    Y = torch.sum(X.view(n, m, -1), 1)
    
    #Calculates alpha using inverse-log-scaling of distr summing
    Y_log_norm = torch.log(Y.norm(dim=1) + 1e-15).mean()
    X_log_norm = torch.log(X.norm(dim=1) + 1e-15).mean()
    alpha = 1/((Y_log_norm - X_log_norm) / math.log(m))
    
    return(alpha)    
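
Before pointing this at real gradients, a quick synthetic check (added for illustration) on vectors with a known tail index: Gaussian vectors should give $\hat{\alpha} \approx 2$ and Cauchy vectors $\hat{\alpha} \approx 1$:

In [ ]:
#Synthetic check: feed the estimator i.i.d. vectors with a known tail index
torch.manual_seed(0)
N, d = 10000, 100
gauss_vecs = [torch.randn(d) for _ in range(N)]                                        #alpha = 2
cauchy_vecs = [torch.distributions.Cauchy(0.0, 1.0).sample((d,)) for _ in range(N)]    #alpha = 1
print(alpha_estimator(gauss_vecs))   #Expect roughly 2
print(alpha_estimator(cauchy_vecs))  #Expect roughly 1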

Network and Data Definition¶

For the network, we will use an InceptionNet-derived Convolutional Neural Network built from the Inception modules of Szegedy et al. (2015) and fit it on CIFAR10 to examine the gradient noise distribution properties. All that matters is that the network is 'sufficiently' expressive that the mini-batch approximation of the true gradient is not so poor that Lévy-stable behavior gets obfuscated, and that we give it the necessary tools, such as convolutions, so it does not get stuck in so suboptimal a basin that we cannot fairly gauge basin-escape behavior.

InceptionNet¶

The InceptionNet architecture attempts to address earlier CNNs' dependence on choosing a single filter size per layer by stacking modules called "Inception modules", each comprising 1x1, 3x3, and 5x5 convolutions plus a pooling layer. These branches have varying filter sizes and thus capture different scales of visual feature locality in the module's latent space through filter concatenation. This results in a 'parallel' structure of the inception module, where the 1x1, 3x3, and 5x5 convolutions are all performed on the previous module's output. A pooling branch is added to the parallel scheme as well, since pooling performs important regularization and feature-aggregation tasks.

It is also worth noting that in the optimized version of this network, and in the version we will implement, additional 1x1 convolutions are performed prior to the 3x3 and 5x5 layers as bottlenecks that reduce the channel count, and thus the cost, of the larger convolutions. A 1x1 convolution is additionally added after the pooling layer. An inception module thus looks like:

Previous layer--->(1x1)------------------>Filter Concatenation  
               ↓                      ↑
               |->(1x1)--(3x3)--------|
               |->(1x1)--(5x5)--------|
               |->(3x3m.pool)-(1x1)---|

Our implementation in this project will be slightly different from that in the paper; we simply wish to experiment with a model architecture that contains inception modules. We will thus build a fairly simple inception-based network with a traditional convolutional head, two stacked inception modules as a body, and an average-pooling, dropout, and softmax classification tail.

In terms of how these layers are organized, it is ideal from a memory perspective to start the network as a regular CNN would, perform some max pooling, and then introduce the Inception modules further downstream. In the full architecture additional max pooling follows groups of inception modules, but in our simplified version we go straight to the tail end of the model (the classification tail): global average pooling, which essentially does a more parameter-efficient flattening, followed by a classic softmax output layer.

In [3]:
class InceptionModule(nn.Module):
    def __init__(self, in_channels, sublayer_1, sublayer_2_input, sublayer_2_output, 
                 sublayer_3_input, sublayer_3_output, sublayer_maxpool_output):
        super(InceptionModule, self).__init__()
        
        #1x1 sublayer
        self.branch1 = nn.Sequential(
            nn.Conv2d(in_channels, sublayer_1, kernel_size=1, padding=0),
            nn.ReLU(inplace=True)
        )       
        #3x3 sublayer with 1x1 bottleneck)
        self.branch2 = nn.Sequential(
            nn.Conv2d(in_channels, sublayer_2_input, kernel_size=1, padding=0),
            nn.ReLU(inplace=True),
            nn.Conv2d(sublayer_2_input, sublayer_2_output, kernel_size=3, padding=1),
            nn.ReLU(inplace=True)
        )      
        #5x5 sublayer (with 1x1 bottleneck)
        self.branch3 = nn.Sequential(
            nn.Conv2d(in_channels, sublayer_3_input, kernel_size=1, padding=0),
            nn.ReLU(inplace=True),
            nn.Conv2d(sublayer_3_input, sublayer_3_output, kernel_size=5, padding=2),
            nn.ReLU(inplace=True)
        )       
        #MaxPooling sublayer
        self.branch4 = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
            nn.Conv2d(in_channels, sublayer_maxpool_output, kernel_size=1, padding=0),
            nn.ReLU(inplace=True)
        )
    
    def forward(self, x):
        branch1_out = self.branch1(x)
        branch2_out = self.branch2(x)
        branch3_out = self.branch3(x)
        branch4_out = self.branch4(x)      
        #Concatenate along channel dimension
        output = torch.cat([branch1_out, branch2_out, branch3_out, branch4_out], dim=1)
        return output


class InceptionCNN(nn.Module):
    def __init__(self, num_classes=6):
        super(InceptionCNN, self).__init__()
        
        #Initial conv layers
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )      
        self.conv2 = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=1, padding=0),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 192, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )
        
        #Inception modules
        self.inception1 = InceptionModule(192, 64, 96, 128, 16, 32, 32)
        self.inception2 = InceptionModule(256, 128, 128, 192, 32, 96, 64)
        
        #Output layers
        self.global_avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(480, num_classes)
    
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.inception1(x)
        x = self.inception2(x)
        x = self.global_avg_pool(x)
        x = torch.flatten(x, 1)
        x = self.dropout(x)
        x = self.fc(x)
        return x

Fully-Connected Model Dinosaur Exhibit¶

I originally tried to do this with an FC network, following in the footsteps of literature such as Zhou et al. (2021), but the results didn't tell me much: the FC network was woefully insufficient for CIFAR10 (image classification), and the network itself probably didn't have the capacity to sensibly hop basins. In a way this made the network type and lack of convolutions a confounding variable in the basin-escape study. Introducing a proper CNN with Inception modules, as done above, removes this headache and lets us fairly assess basin escape, stochastic dynamics, and noise distributions. Anyway, I've included the FC model definition here for the fun of it.

In [4]:
class FullyConnectedNetwork(nn.Module):
    def __init__(self, input_dim=[3,32,32], width=30, depth=7, output_dim=10):
        super().__init__()
        self.input_dim = input_dim[0]*input_dim[1]*input_dim[2]
        self.width = width
        self.depth = depth
        self.output_dim = output_dim
        
        #Fresh Linear/ReLU instances per hidden layer (reusing one pre-built pair would share weights)
        fcblocks = []
        for i in range(self.depth - 2):
            fcblocks.append(nn.Linear(self.width, self.width, bias=False) if i % 2 == 0
                            else nn.ReLU(inplace=True))
        self.fc = nn.Sequential(nn.Linear(self.input_dim, self.width, bias=False),
                               nn.ReLU(inplace=True),
                               *fcblocks,
                               nn.Linear(self.width, self.output_dim, bias=False))
        
    def forward(self, x):
        x = x.view(x.size(0), self.input_dim)
        x = self.fc(x)
        return(x)

Data Loading¶

Now we define our data loaders etc for CIFAR10.

In [5]:
#Setup
seed = 0
torch.manual_seed(seed)

#Load CIFAR-10 data
batch_size, num_workers, num_classes = 125, 2, 10
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2470, 0.2435, 0.2616))

tf_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(), normalize])

tf_test = transforms.Compose([transforms.ToTensor(), normalize])

train = datasets.CIFAR10('./data', train=True, download=True, transform=tf_train)
test  = datasets.CIFAR10('./data', train=False, download=True, transform=tf_test)

train_loader = torch.utils.data.DataLoader(train, batch_size, shuffle=True, num_workers=num_workers)
train_loader_eval = torch.utils.data.DataLoader(datasets.CIFAR10('./data', train=True, transform=tf_test),
                                                batch_size, shuffle=False, num_workers=num_workers)
test_loader_eval = torch.utils.data.DataLoader(test, batch_size, shuffle=False, num_workers=num_workers)
Files already downloaded and verified
Files already downloaded and verified

Model Training¶

Finally we will define the parameters for evaluation, initialize the model and training loop, and run (infinite) training. We use an infinite dataloader so that we can keep training by iterations. We train for 7000 iterations. Our output is scrubbed for brevity so that readers don't have to look through a long trace: that's what plots are for!

In [ ]:
# Setup params
iterations = 7000
lr = 0.1
mom = 0.9
wd = 5e-4
print_freq = 100
eval_freq = 100


def eval_model(eval_loader, model, crit, opt):
    """Evaluate model and estimate alpha"""
    model.eval()
    
    total_size = 0
    total_loss = 0
    total_correct = 0
    
    #Get loss and accuracy 
    with torch.no_grad():
        for x, y in eval_loader:
            x, y = x.to(device), y.to(device)
            out = model(x)
            
            # Get predictions
            pred = out.argmax(dim=1)
            correct = (pred == y).sum().item()
            
            # Calculate loss
            loss = crit(out, y)
            
            bs = x.size(0)
            total_size += bs
            total_loss += loss.item() * bs
            total_correct += correct
    
    #Fetch gradients for alpha estimation
    grads = []
    for x, y in eval_loader:
        x, y = x.to(device), y.to(device)
        opt.zero_grad()
        out = model(x)
        loss = crit(out, y)
        loss.backward()
        
        grad = nn.utils.parameters_to_vector(
            [p.grad for p in model.parameters() if p.grad is not None]
        )
        grads.append(grad.cpu())
    
    #Estimate alpha from gradients
    alpha = alpha_estimator(grads)
    
    avg_loss = total_loss / total_size
    avg_acc = 100.0 * total_correct / total_size
    
    return avg_loss, avg_acc, alpha


#Init Inception Conv. neural network model
model = InceptionCNN(num_classes=10).to(device)

#Init optimizer to be SGD and loss to be x-entropy
opt = optim.SGD(model.parameters(), lr=lr, momentum=mom, weight_decay=wd)
crit = nn.CrossEntropyLoss()

#Create infinite data loader so we can iterate indefinitely
circ_train_loader = cycle(train_loader)

training_history = []
evaluation_history_train = []
evaluation_history_val = []
alpha_history_train = []
alpha_history_val = []

#Training starts
print("Starting training")

for i, (x, y) in enumerate(circ_train_loader):
    #Periodic evaluation
    if i % eval_freq == 0:
        print(f"\nEvaluating at iteration {i}")
        val_loss, val_acc, val_alpha = eval_model(test_loader_eval, model, crit, opt)
        train_loss, train_acc, train_alpha = eval_model(train_loader_eval, model, crit, opt)
        
        evaluation_history_val.append([i, val_loss, val_acc])
        evaluation_history_train.append([i, train_loss, train_acc])
        alpha_history_train.append(train_alpha)
        alpha_history_val.append(val_alpha)
        
        print(f"Val   - Loss: {val_loss:.4f}, Acc: {val_acc:.2f}%, Alpha: {val_alpha:.4f}")
        print(f"Train - Loss: {train_loss:.4f}, Acc: {train_acc:.2f}%, Alpha: {train_alpha:.4f}")
    
    #Training step
    model.train()
    x, y = x.to(device), y.to(device)
    
    opt.zero_grad()
    out = model(x)
    loss = crit(out, y)
    loss.backward()
    opt.step()
    
    with torch.no_grad():
        pred = out.argmax(dim=1)
        acc = 100.0 * (pred == y).sum().item() / y.size(0)
    
    training_history.append([i, loss.item(), acc])
    
    if i % print_freq == 0:
        print(f"Iter {i}: Loss={loss.item():.4f}, Acc={acc:.2f}%")
        
    if i >= iterations:
        break

#Final eval
print("\nFinal evaluation:")
val_loss, val_acc, val_alpha = eval_model(test_loader_eval, model, crit, opt)
train_loss, train_acc, train_alpha = eval_model(train_loader_eval, model, crit, opt)

evaluation_history_val.append([i + 1, val_loss, val_acc])
evaluation_history_train.append([i + 1, train_loss, train_acc])
alpha_history_train.append(train_alpha)
alpha_history_val.append(val_alpha)

print(f"Final Val  - Loss: {val_loss:.4f}, Acc: {val_acc:.2f}%, Alpha: {val_alpha:.4f}")
print(f"Final Train - Loss: {train_loss:.4f}, Acc: {train_acc:.2f}%, Alpha: {train_alpha:.4f}")

Now we make a helper function for plotting metrics:

In [19]:
def plot_metrics(train_history, val_history, train_alpha, val_alpha):
    """Plot training and validation metrics."""
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    
    train_data = list(zip(*train_history))
    val_data = list(zip(*val_history))
    
    plots = [
        (axes[0, 0], 'Loss', train_data[1], val_data[1]),
        (axes[0, 1], 'Accuracy (%)', train_data[2], val_data[2]),
        (axes[1, 0], r'Estimated $\alpha$ (Train Set)', None, train_alpha, 'blue'),
        (axes[1, 1], r'Estimated $\alpha$ (Val Set)', None, val_alpha, 'orange')
    ]
    
    for i, plot_info in enumerate(plots):
        ax = plot_info[0]
        title = plot_info[1]
        
        if i < 2:   #Loss plots
            ax.plot(train_data[0], plot_info[2], label='Train', alpha=0.7)
            ax.plot(val_data[0], plot_info[3], label='Val', marker='o', markersize=4)
            ax.legend()
        else:  #Alpha plots
            ax.plot(val_data[0], plot_info[3], marker='o', markersize=4, color=plot_info[4])
        
        ax.set_xlabel('Iteration')
        ax.set_ylabel(title.split('(')[0].strip())
        ax.set_title(f'Training and Validation {title}' if i < 2 else title)
        ax.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()

Results: Gradient noise $\alpha$ and acc/loss during deep network training¶

In [20]:
# Plot results
plot_metrics(training_history, evaluation_history_val, alpha_history_train, alpha_history_val)
[Figure: training/validation loss and accuracy curves, and estimated gradient-noise tail index α on the train and validation sets over 7000 iterations.]

Conclusion:¶

We see that the estimated Lévy index of the gradient noise of a real network during training is quite different from Gaussian: the estimated $\alpha$ always falls below 2, mostly hanging around the 1.1-1.4 range. So our initial hypothesis from the beginning of this project, that Gaussian noise ($\alpha=2$) does not describe true network gradient noise, is supported by this evidence!

During training, $\alpha$ is actually quite dynamic due to the changing landscape and the model's changing ability to estimate the true gradient. As the model learns, it gets better at estimating the true gradient, reducing the gradient-noise spread and increasing the tail index $\alpha$ while settling into some basin. However, if it settles into a local basin, the tail-event-driven Lévy escape studied previously will eventually happen. Once an escape occurs, the model momentarily becomes slightly worse at predicting, causing a small dip in accuracy, and the gradient-noise index drops. We see this during training, particularly in the regime around 2000-3000 iterations, where the model appears to repeatedly settle into a local basin with a higher $\alpha$ and then escape, yielding downward spikes in $\alpha$ and small dips in accuracy. Towards the end of training, we observe the accuracy stabilizing at a relatively high value with a corresponding increase in the estimated tail index $\alpha$ over the last 500 iterations. This can indicate stabilization of the optimizer in a particularly optimal loss basin, which, given the dependence of our escape time $\Gamma$ on the escape set $m(\mathcal{W})$, implies a large basin Radon measure (and small escape-set measure) and thus a wide basin. We further postulate this is a particularly wide basin since previous basins were escaped from quite quickly (corresponding to the many $\alpha$ spikes from 3000-6500 iterations), whereas at the end the tail index keeps increasing and escape no longer occurs. The gradient tails become tighter and leaner as the model becomes more precise at estimating the true gradient within the basin.