# run this cell to reset the kernel or select kernel > restart kernel
%reset -s -f


!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: OK


import logging
import os

import numpy as np

import matplotlib.pyplot as plt
from matplotlib import rc

import torch
import torch.nn as nn
import torch.utils.data as data
import torchvision

import pyro
import pyro.distributions as dist


# Fail early when the installed pyro release is not the one this notebook expects
# (per the message, TyXe needs 1.4.0; the check itself is a prefix match).
assert pyro.__version__.startswith('1.4.0'), f"For TyXe, pyro version must be exactly 1.4.0, not {pyro.__version__}"

# Validation of distribution arguments is switched off; the call that would
# switch it on is kept commented out for reference.
# pyro.enable_validation(True)
pyro.distributions.enable_validation(False)
# Seed the random number generator with a fixed value.
pyro.set_rng_seed(42)

# Emit bare log messages (no level or logger-name prefix) at INFO level.
logging.basicConfig(format='%(message)s', level=logging.INFO)

# Set matplotlib settings
%matplotlib inline
plt.style.use('default')

# Is GPU available? The flag is reused further down when building loaders and models.
USE_CUDA = torch.cuda.is_available()

# bare expression: in a notebook this echoes the flag as the cell output
USE_CUDA

True

def model():
    """Schematic Pyro model: a function that declares a sample site named "z_1".

    The Ellipsis stands where the distribution argument would go, so this is
    presumably an illustration rather than something meant to be executed.
    """
    pyro.sample("z_1", ...)


# ---- VAE example on MNIST's handwritten digit data ----

from vae import Encoder, Decoder # appropriate torch.nn.Modules

class VAE(nn.Module):
    """Variational autoencoder whose model and guide are written as Pyro programs.

    ``model`` and ``guide`` are defined as regular methods. The flattened
    notebook defined them as indented cell-level functions and patched them
    onto the class afterwards, which is not valid in a plain Python file and
    would have bound ``VAE.model`` to an unrelated module-level ``model``.

    Args:
        z_dim: size of the latent code.
        hidden_dim: hidden layer width, forwarded to ``Encoder`` and ``Decoder``.
        use_cuda: when true, move all parameters to the GPU.
    """

    def __init__(self, z_dim=50, hidden_dim=400, use_cuda=False):
        super().__init__()
        # create the encoder and decoder networks
        self.encoder = Encoder(z_dim, hidden_dim)
        self.decoder = Decoder(z_dim, hidden_dim)

        if use_cuda:
            # calling cuda() here will put all the parameters of
            # the encoder and decoder networks into gpu memory
            self.cuda()
        self.use_cuda = use_cuda
        self.z_dim = z_dim

    def model(self, x):
        """Generative model p(x|z)p(z).

        Args:
            x: batch of images; only its first dimension (batch size) and its
                values reshaped to ``(-1, 784)`` are used.

        Returns:
            The decoder output for the sampled latent codes, so that it can be
            visualised later.
        """
        # register PyTorch module `decoder` with Pyro
        pyro.module("decoder", self.decoder)
        batch_size = x.shape[0]

        # this context makes the samples in the batch independent:
        with pyro.plate("data", batch_size):
            # parameters of the standard normal prior p(z), created on x's device/dtype
            z_loc = x.new_zeros((batch_size, self.z_dim))
            z_scale = x.new_ones((batch_size, self.z_dim))

            # sample from prior (value will also be sampled by guide when computing the ELBO)
            # to_event(1) declares the rightmost dimension as part of one event,
            # the remaining dimensions are batch dimensions
            z = pyro.sample("latent", dist.Normal(z_loc, z_scale).to_event(1))

            # decode the latent code z
            img = self.decoder(z)

            # score against actual images
            # (results in reconstruction loss = log likelihood for decoder);
            # Bernoulli because pixels are treated as black/white
            pyro.sample("obs", dist.Bernoulli(img).to_event(1), obs=x.reshape(-1, 784))

        # return the img only so that we can visualize it later
        return img

    def guide(self, x):
        """Variational distribution q(z|x).

        Args:
            x: batch of images, passed to the encoder, which is expected to
                return the pair ``(z_loc, z_scale)``.
        """
        # register PyTorch module `encoder` with Pyro
        pyro.module("encoder", self.encoder)
        with pyro.plate("data", x.shape[0]):
            # use the encoder to get the parameters used to define q(z|x)
            z_loc, z_scale = self.encoder(x)

            # sample the latent code z (same site name as in the model)
            pyro.sample("latent", dist.Normal(z_loc, z_scale).to_event(1))


# preliminaries 

from utils import make_loaders_mnist
from vae import train
from vae import evaluate

# Run options
smoke_test = True # short run: selects the single-epoch setting of NUM_EPOCHS below
LEARNING_RATE = 1.0e-3

# Run only for a single iteration for testing
NUM_EPOCHS = 1 if smoke_test else 100
# the test set is evaluated on every epoch whose index is a multiple of this
TEST_FREQUENCY = 5

# load MNIST
train_loader, test_loader = make_loaders_mnist(
    batch_size=32,
    use_cuda=USE_CUDA
)

# clear pyro's global parameter store
# (presumably so parameters registered by an earlier run do not carry over)
pyro.clear_param_store()


# setup the VAE
vae = VAE(
    z_dim=10,
    hidden_dim=40,    
    use_cuda=USE_CUDA
)

# setup the optimizer
optimizer = pyro.optim.Adam({"lr": LEARNING_RATE}) # pyro.optim wraps torch.optim

# setup the stochastic variational inference algorithm:
# the VAE's model and guide are optimised against the Trace_ELBO loss
svi = pyro.infer.SVI(vae.model, vae.guide, optimizer, loss=pyro.infer.Trace_ELBO())


# training: keep the per-epoch ELBO (the negated loss) of both splits
train_elbo, test_elbo = [], []

for epoch in range(NUM_EPOCHS):
    # one pass over the training data via the `train` helper
    # (per the original note it simply calls svi.step(batch))
    total_epoch_loss_train = train(svi, train_loader, use_cuda=USE_CUDA)
    train_elbo.append(-total_epoch_loss_train)
    print("[epoch %03d] average training loss: %.4f" % (epoch, total_epoch_loss_train))

    # evaluation happens only on epochs that are a multiple of TEST_FREQUENCY
    if epoch % TEST_FREQUENCY != 0:
        continue

    # test pass via the `evaluate` helper
    # (per the original note it simply calls svi.evaluate_loss(batch))
    total_epoch_loss_test = evaluate(svi, test_loader, use_cuda=USE_CUDA)
    test_elbo.append(-total_epoch_loss_test)
    print("[epoch %03d] average test loss: %.4f" % (epoch, total_epoch_loss_test))


# plot example images

# NOTE: plt.subplot takes (nrows, ncols, index). Given the call below, `row` is the
# number of grid columns and `column` the number of grid rows; the names are kept
# unchanged so that anything else referring to them keeps working.
row = 12
column = 3
plt.figure(figsize=(20,5))

mock_input = torch.zeros([1, 784]) # does not matter, check VAE.model

if USE_CUDA:
    mock_input = mock_input.cuda()

for i in range(1, row * column + 1):
    plt.subplot(column, row, i)

    # get img from the model.
    # model samples internally and practically ignores mock_input;
    # plotting needs no gradients, so no autograd graph is recorded
    with torch.no_grad():
        sample_img = vae.model(mock_input) # p(img|z)p(z)

    # detach() instead of the legacy `.data` attribute before converting to numpy
    img = sample_img[0].view(28, 28).detach().cpu().numpy()
    plt.xticks([])
    plt.yticks([])
    plt.imshow(img, cmap="gray")


# time for a coffee pause? :)


# preliminaries

# dataset helper functions
from utils import make_loaders_bnns, make_net

dataset = "cifar10"
model_name = "resnet18"
pretrained = False # usually, start with a net pretrained using MLE!

# load cifar data; the third return value is discarded here.
# Positional arguments, going by the later call of the same helper in this file:
# dataset, data root, train batch size, test batch size, use_cuda, mock_dataset
train_loader, test_loader, _ = make_loaders_bnns(
    dataset, "./data", 32, 32, False, False 
)

# take an existing net:
resnet: torch.nn.Module = make_net(dataset, model_name, pretrained=pretrained)
# make_net only reshapes final layer of torchvision.models.model_name()

# initialize gaussian means using these:
# (captured before the in-place conversion below, while the net is still a plain torch module)
pretrained_weights = resnet.state_dict()

# convert model to pyro module inplace
pyro.nn.module.to_pyro_module_(resnet)


def _standard_normal_prior(tensor):
    """Return a PyroSample holding an elementwise N(0, 1) prior shaped like *tensor*."""
    elementwise = dist.Normal(torch.zeros_like(tensor), torch.ones_like(tensor))
    # to_event() is called without arguments, exactly as for every replaced attribute
    return pyro.nn.PyroSample(elementwise.to_event())


for m in resnet.modules():
    # only fully connected and convolutional modules get priors
    # (do not be bayesian about batchnorms)
    if not isinstance(m, (nn.Linear, nn.Conv2d)):
        continue

    # PyroSample attributes are sampled on the module's forward pass
    m.weight = _standard_normal_prior(m.weight)
    if m.bias is not None:
        m.bias = _standard_normal_prior(m.bias)


def model(x, y=None):
    """Pyro model of the Bayesian ResNet: network forward pass plus likelihood.

    Args:
        x: input batch; its first dimension is used as the plate size.
        y: class labels to condition on, or None to leave the "data" site unobserved.

    Returns:
        The logits produced by ``resnet(x)``, for prediction/testing only.
    """
    # the forward pass draws the network's PyroSample weights
    class_logits = resnet(x)
    batch_size = x.shape[0]

    # likelihood p(data | weights); the plate marks the data points
    # of the batch as IID
    with pyro.plate("data_plate", batch_size):
        # log likelihood reconstruction loss
        pyro.sample("data", dist.Categorical(logits=class_logits), obs=y)

    return class_logits


# Mean-field normal guide over the sample sites of `model`.
guide = pyro.infer.autoguide.AutoNormal(
    model,
    init_scale=1e-4, # initial scale (a standard deviation, not a variance), same value for every site
    init_loc_fn=pyro.infer.autoguide.init_to_value(
        values=pretrained_weights
    ) # init gauss means to the weights captured from the net's state_dict
)


# an optimizer can be taken from pyro.optim (wrapper around torch.optim API)
optim = pyro.optim.Adam({"lr": 1e-3})

# set up stochastic variational inference
# (this rebinds the name `svi` that the VAE section used earlier)
svi = pyro.infer.SVI(
    model,
    guide,
    optim,
    pyro.infer.Trace_ELBO() # objective function
)


# training

# fit the BNN on the first `first_n_batches_train` batches of every epoch
num_epochs = 1
first_n_batches_train = 20

for _ in range(num_epochs):
    for i, (x, y) in enumerate(train_loader):
        # `>=` so that exactly first_n_batches_train batches (indices 0..n-1) are
        # used; the previous `>` let one extra batch through
        if i >= first_n_batches_train:
            break

        # 1. forward guide(x,y), memorize sampled weight values
        # 2. forward model(x,y), using memorized values
        # 3. compute the elbo
        # 4. elbo.backward() # updates guide
        svi.step(x, y)


# prediction
def make_prediction(model, guide, x, y):
    """Classify one batch with a single draw from the guide and report accuracy.

    Args:
        model: Pyro model; called as ``model(x)`` and expected to return logits.
        guide: Pyro guide whose sampled values are replayed into ``model``.
        x: input batch.
        y: labels, compared elementwise against the arg-max predictions.

    Returns:
        The fraction of correct predictions in the batch as a Python float.
        The value is also printed. Earlier versions returned None, so callers
        that ignored the result are unaffected.
    """
    trace = pyro.poutine.trace(guide).get_trace(x) # memorize sampled weight values of guide
    logits = pyro.poutine.replay(model, trace=trace)(x) # use sampled weight values of guide
    predictions = logits.argmax(-1)

    # average in floating point; dividing an integer count tensor with `/`
    # behaves differently across torch versions
    acc = (predictions == y).float().mean().item()
    print(f"Batch acc = {acc}")
    return acc


first_n_batches_test = 10

# zip() against range() stops pulling batches once the limit is reached; filtering
# on the enumerate index kept iterating over the entire test loader.
# The list collects whatever make_prediction returns for each batch.
test_predictions = [
    make_prediction(model, guide, x, y)
    for _, (x, y) in zip(range(first_n_batches_test), test_loader)
]


# first some more imports:
import os
import contextlib
import functools
from typing import List, Optional

import tyxe # bnn library ontop of pyro

from utils import make_loaders_bnns, make_net


# take an existing torch.nn.Module:
resnet: torch.nn.Module = make_net("cifar10", "resnet18", pretrained=False)

# setup bnn:
# standard normal prior on everything except batchnorm modules
prior = tyxe.priors.IIDPrior(dist.Normal(0,1), expose_all=False, hide_module_types=(nn.BatchNorm2d,))
# pass the number of training samples, as the detailed setup further down does;
# len(train_loader) would be the number of batches instead
likelihood = tyxe.likelihoods.Categorical(len(train_loader.sampler))
# guide factory: locations start at the net's current weights and are not trained
guide = functools.partial(
    tyxe.guides.AutoNormal,
    train_loc=False,
    init_loc_fn=tyxe.guides.PretrainedInitializer.from_net(resnet)
)
bnn = tyxe.VariationalBNN(resnet, prior, likelihood, guide)

# train bnn
# (execute with care; very compute intensive, may be too much for cpu!)
# the torch device string for GPUs is "cuda" ("gpu" is not a valid device type)
# bnn.fit(train_loader, optim=pyro.optim.Adam({"lr":1e-3}), num_epochs=1, device="cuda" if USE_CUDA else "cpu")


# Standard Hyperparameters first ...

# MODEL
architecture: str = "resnet18" # checked below against the resnet names in torchvision.models
dataset: str = "cifar10" # checked below against the supported dataset names
pretrained: bool = False # forwarded to make_net
mock_dataset: bool = False # forwarded to make_loaders_bnns

# DATA
train_batch_size: int = 10
test_batch_size: int = 10
num_epochs: int = 1
test_samples: int = 20 # passed as num_predictions when evaluating on the test set

# MISC
root: str = os.environ.get("DATASETS_PATH", "./data") # dataset directory; the env var overrides the default
seed: int = 42
output_dir: Optional[str] = None # not read anywhere in the visible code

# OPTIMIZER
lr: float = 0.001 # initial learning rate
milestones: Optional[List[int]] = None # epochs at which to do scheduler step
gamma: float = 0.1 # scheduler step factor

# every (wide) resnet variant exposed by torchvision.models whose name ends in a digit
resnets = [
    name
    for name in dir(torchvision.models)
    if name.startswith(("resnet", "wide_resnet")) and name[-1].isdigit()
]
assert architecture in resnets, architecture

# dataset names accepted by this notebook
datasets = ["cifar10", "cifar100", "mnist"]
assert dataset in datasets, dataset


# Some BNN Hyperparameters:

inference: str = "mean-field" # must be one of inference_options below
local_reparameterization: bool = False # important: variance reduction for gradients!
flipout: bool = False # important: variance reduction for gradients! (cannot be combined with the above)
max_guide_scale: float = 0.1 # to prevent underfitting: upper bound handed to the mean-field guide for its learned scale
rank: int = 10 # low rank setting for inference == "last-layer-low-rank"
scale_only: bool = False # train the scales only, leaving means at their initial (pretrained) values

# More comments on these inference options in the section 'guide' below
inference_options = [
    "mle", # maximum likelihood estimation: weights = argmax p(data|weights); no guide
    "map", # maximum a posteriori inference: weights = argmax p(data|weights)*p(weights); AutoDelta guide
    "mean-field", # svi with autonormal guide (diagonal covariance)
    "last-layer-mean-field", # svi for last layer only, autonormal guide and diagonal covariance
    "last-layer-full", # svi for last layer only, AutoMultivariateNormal guide (FULL covariance)
    "last-layer-low-rank" # svi for last layer only, AutoLowRankMultivariateNormal guide
]
assert inference in inference_options, inference


# ----- set up pyro & torch -----
use_cuda = torch.cuda.is_available()
# derive the device from the flag defined right here, so this section does not
# silently depend on the separately defined USE_CUDA global
device = torch.device("cuda" if use_cuda else "cpu")

# ----- set up dataset and model -----
# the third loader is bound to `ood_loader`; it is not used in the visible code
train_loader, test_loader, ood_loader = make_loaders_bnns(
    dataset, root, train_batch_size, test_batch_size, use_cuda, mock_dataset
)

# build the network and move it to the selected device
net: torch.nn.Module = make_net(dataset, architecture, pretrained=pretrained).to(device)


# tyxe.likelihoods includes:

# Bernoulli
# Categorical
# HeteroskedasticGaussian
# HomoskedasticGaussian

# The argument is the length of the loader's sampler, not the number of batches;
# presumably this is the training-set size TyXe uses to scale minibatch
# log-likelihoods (TODO confirm against the TyXe docs).
likelihood = tyxe.likelihoods.Categorical(len(train_loader.sampler))

# uncomment for documentation:
# tyxe.likelihoods.HeteroskedasticGaussian?


# Pick the guide factory for the chosen `inference` mode. The branches are
# mutually exclusive because `inference` was checked against inference_options.
if inference == "mle":
    # do maximum likelihood estimation
    test_samples = 1 # weights are deterministic, so a single prediction is used
    guide = None


if inference == "map":
    # maximum a posteriori inference 
    test_samples = 1 # weights are deterministic, so a single prediction is used
    guide = functools.partial(
        pyro.infer.autoguide.AutoDelta, # deterministic weights
        init_loc_fn=tyxe.guides.PretrainedInitializer.from_net(net)
    )


if inference == "mean-field":
    # SVI with diagonal covariances
    guide = functools.partial(
        tyxe.guides.AutoNormal,
        init_loc_fn=tyxe.guides.PretrainedInitializer.from_net(net),
        init_scale=1e-4,
        max_guide_scale=max_guide_scale, # prevent underfitting
        train_loc=not scale_only # train gaussian means?
    )


if inference.startswith("last-layer"):
    # usually only done for pretrained network:
    # if not pretrained:
    #    raise ValueError("Asked to do last-layer inference, but no pre-trained weights were provided.")
    
    # turning parameters except for last layer in buffers to avoid training them
    # this might be avoidable via poutine.block
    for module in net.modules():
        if module is not net.fc:
            # list() snapshots the parameters, since the loop body deletes them
            for param_name, param in list(module.named_parameters(recurse=False)):
                delattr(module, param_name)
                module.register_buffer(param_name, param.detach().data)

    # NOTE: the initializers below are built from `net` after the conversion
    # above, i.e. at a point where only net.fc still has parameters
    if inference == "last-layer-mean-field":
        guide = functools.partial(
            tyxe.guides.AutoNormal, 
            init_loc_fn=tyxe.guides.PretrainedInitializer.from_net(net),
            init_scale=1e-4
        )
        
    elif inference == "last-layer-full":
        guide = functools.partial(
            pyro.infer.autoguide.AutoMultivariateNormal,
            init_loc_fn=tyxe.guides.PretrainedInitializer.from_net(net),
            init_scale=1e-4
        )
        
    elif inference == "last-layer-low-rank":
        guide = functools.partial(
            pyro.infer.autoguide.AutoLowRankMultivariateNormal,
            rank=rank,
            init_loc_fn=tyxe.guides.PretrainedInitializer.from_net(net),
            init_scale=1e-4
        )

Pyro's Automatic Guide Families:
['AutoCallable', 'AutoContinuous', 'AutoDelta', 'AutoDiagonalNormal', 'AutoDiscreteParallel', 'AutoGuide', 'AutoGuideList', 'AutoIAFNormal', 'AutoLaplaceApproximation', 'AutoLowRankMultivariateNormal', 'AutoMultivariateNormal', 'AutoNormal', 'AutoNormalizingFlow']
TyXe's Automatic Guide Families:
['AutoNormal']


def _auto_names(namespace):
    """Return the attribute names of *namespace* that start with "Auto"."""
    return [name for name in dir(namespace) if name.startswith("Auto")]


# list the automatic guide classes each library exposes
print("Pyro's Automatic Guide Families:")
print(_auto_names(pyro.infer.autoguide))

print("TyXe's Automatic Guide Families:")
print(_auto_names(tyxe.guides))

# uncomment for documentation:
# pyro.infer.autoguide.AutoDelta?


# our choice of guide impacts how we need to initialize the Prior:
if inference.startswith("last-layer"):
    # only be bayesian about the final, fully connected layer
    prior_kwargs = {
        "expose_all": False, # do not treat all nn.Modules with pyro
        "expose_modules": [net.fc]
    }
else:
    # it is standard practice to not be bayesian about batchnorm modules
    prior_kwargs = {
        "expose_all": False, # do not treat all nn.Modules with pyro
        "hide_module_types": (nn.BatchNorm2d,) # specifically, ignore batchnorms
    }
    if inference == "mle":
        # we dont want a prior for maximum likelihood estimation
        prior_kwargs["hide_all"] = True

# standard normal prior, with its parameters created on the training device
prior = tyxe.priors.IIDPrior(
    dist.Normal(torch.zeros(1, device=device), torch.ones(1, device=device)),
    **prior_kwargs
)

print("TyXe's Available Prior Distributions:")
print([name for name in dir(tyxe.priors) if name.endswith("Prior")])

# uncomment for documentation:
# tyxe.priors.IIDPrior?

TyXe's Available Prior Distributions:
['DictPrior', 'IIDPrior', 'LambdaPrior', 'LayerwiseNormalPrior', 'Prior']


# Finally set up our VariationalBNN!
# It combines the network with the prior, likelihood and guide factory built above
# (`guide` is None when inference == "mle").
bnn = tyxe.VariationalBNN(
    net, prior, likelihood, guide
)

# uncomment for documentation:
# bnn?


# gradient variance reduction techniques; at most one of them may be enabled
if local_reparameterization and flipout:
    raise RuntimeError("Can't use both local reparameterization and flipout, pick one.")

if local_reparameterization:
    # NOTE(review): as commonly described, local reparameterization samples a
    # layer's outputs rather than its weights -- confirm against the TyXe docs
    train_context = tyxe.poutine.local_reparameterization
elif flipout:
    # usually: use one sampled weight for entire minibatch
    # flipout: efficiently sample pseudo-independent weights along the minibatch dimension
    train_context = tyxe.poutine.flipout
else:
    # no variance reduction: a context manager that does nothing
    train_context = contextlib.nullcontext


# pyro-specific: optimizer must come from pyro.optim
if milestones is not None:
    # step schedule: wrap torch's Adam in pyro's MultiStepLR scheduler
    optimizer = torch.optim.Adam
    optim = pyro.optim.MultiStepLR({
        "optimizer": optimizer,
        "optim_args": {"lr": lr},
        "milestones": milestones,
        "gamma": gamma,
    })
else:
    # no schedule requested: plain Adam with a constant learning rate
    optim = pyro.optim.Adam({"lr": lr})

print("All typical pytorch optimizers & schedulers are supported by pyro.optim:")
print([name for name in dir(pyro.optim) if "_" not in name and name[0] == name[0].upper()])

All typical pytorch optimizers & schedulers are supported by pyro.optim:
['ASGD', 'Adadelta', 'Adagrad', 'AdagradRMSProp', 'Adam', 'AdamW', 'Adamax', 'ChainedScheduler', 'ClippedAdam', 'ConstantLR', 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts', 'CyclicLR', 'DCTAdam', 'ExponentialLR', 'LambdaLR', 'LinearLR', 'MultiStepLR', 'MultiplicativeLR', 'NAdam', 'OneCycleLR', 'PyroLRScheduler', 'PyroOptim', 'RAdam', 'RMSprop', 'ReduceLROnPlateau', 'Rprop', 'SGD', 'SequentialLR', 'SparseAdam', 'StepLR']


# tyXe-specific: evaluation and logging done after every epoch:
def callback(
        b: tyxe.VariationalBNN, # bnn
        i: int, # epoch number
        avg_elbo: float # mean elbo this epoch
    ):
    """Evaluate the BNN on the test set and print a one-line summary.

    Passed as ``callback`` to ``bnn.fit``. Reads the module-level
    ``test_loader``, ``device`` and ``test_samples``.

    Args:
        b: the BNN being trained; its ``evaluate`` method is called per batch.
        i: epoch number (not used in the body).
        avg_elbo: mean ELBO of the epoch, echoed in the printed line.
    """
    # invariant across batches, so computed once
    num_test_points = len(test_loader.sampler)
    avg_err, avg_ll = 0., 0.

    for x, y in test_loader:
        # presumably err and ll are per-batch totals, since they are normalised
        # by the size of the whole test set -- TODO confirm
        err, ll = b.evaluate(x.to(device), y.to(device), num_predictions=test_samples)
        avg_err += err / num_test_points
        avg_ll += ll / num_test_points

    print(f"ELBO={avg_elbo}; test error={100 * avg_err:.2f}%; LL={avg_ll:.4f}")


# ------ TRAIN THE MODEL ------
# train_context holds a context-manager factory (one of the two TyXe variance
# reduction poutines or contextlib.nullcontext), hence the call to enter it.
with train_context():
    # `callback` is handed to fit for the per-epoch evaluation and logging
    bnn.fit(train_loader, optim, num_epochs, callback=callback, device=device)

Introduction to Stochastic Variational Inference and Bayesian Neural Networks¶

Outline¶

Setup¶

Probabilistic Machine Learning¶

Probabilistic Models¶

Bayesian Inference, Learning and Evaluation¶

Some Bayesian Inference Algorithms¶

Inference in Pyro¶

Stochastic Variational Inference (SVI)¶

Evidence Lower Bound (ELBO)¶

Pyro Primitives¶

Pyro's "guide" programs¶

Restrictions on guides¶

Pyro Summary¶

Pyro Example: Variational Autoencoder (VAE)¶

`VAE.model(x)`¶

`VAE.guide(x)`¶

Pyro Example: Model Evaluation¶

Bayesian Neural Networks¶

Bayesian ResNet in Pyro¶

1. Prior¶

2. Likelihood¶

3. Guide¶

Bayesian ResNet in TyXe¶

TyXe Options¶

Initialize our Dataset and Model¶

Set up ResNet to be Bayesian using TyXe¶

1. Likelihood¶

2. Guide¶

Instead of SVI on the full model with full Gaussian covariances, we will do one of:¶

3. Prior¶

Variance Reduction¶

Optimizer¶

Evaluation logic¶

Training¶

That's it!¶

Introduction to Stochastic Variational Inference and Bayesian Neural Networks¶

Outline¶

Setup¶

Probabilistic Machine Learning¶

Probabilistic Models¶

Bayesian Inference, Learning and Evaluation¶

Some Bayesian Inference Algorithms¶

Inference in Pyro¶

Stochastic Variational Inference (SVI)¶

Evidence Lower Bound (ELBO)¶

Pyro Primitives¶

Pyro's "guide" programs¶

Restrictions on guides¶

Pyro Summary¶

Pyro Example: Variational Autoencoder (VAE)¶

VAE.model(x)¶

VAE.guide(x)¶

Pyro Example: Model Evaluation¶

Bayesian Neural Networks¶

Bayesian ResNet in Pyro¶

1. Prior¶

2. Likelihood¶

3. Guide¶

Bayesian ResNet in TyXe¶

TyXe Options¶

Initialize our Dataset and Model¶

Set up ResNet to be Bayesian using TyXe¶

1. Likelihood¶

2. Guide¶

Instead of SVI on the full model with full Gaussian covariances, we will do one of:¶

3. Prior¶

Variance Reduction¶

Optimizer¶

Evaluation logic¶

Training¶

That's it!¶

`VAE.model(x)`¶

`VAE.guide(x)`¶