# Copyright (c) 2022 The BayesFlow Developers
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import tensorflow as tf
from bayesflow.computational_utilities import maximum_mean_discrepancy


def kl_latent_space_gaussian(z, log_det_J):
"""Computes the Kullback-Leibler divergence between true and approximate
posterior assuming a Gaussian latent space as a source distribution.
Parameters
----------
z : tf.Tensor of shape (batch_size, ...)
The (latent transformed) target variables
log_det_J : tf.Tensor of shape (batch_size, ...)
The logartihm of the Jacobian determinant of the transformation.
Returns
-------
loss : tf.Tensor
A single scalar value representing the KL loss, shape (,)
Examples
--------
Parameter estimation
>>> kl_latent_space_gaussian(z, log_det_J)
"""
loss = tf.reduce_mean(0.5 * tf.math.square(tf.norm(z, axis=-1)) - log_det_J)
return loss


def kl_latent_space_student(v, z, log_det_J):
"""Computes the Kullback-Leibler divergence between true and approximate
posterior assuming latent student t-distribution as a source distribution.
Parameters
----------
v : tf Tensor of shape (batch_size, ...)
The degrees of freedom of the latent student t-distribution
z : tf.Tensor of shape (batch_size, ...)
The (latent transformed) target variables
log_det_J : tf.Tensor of shape (batch_size, ...)
The logarithm of the Jacobian determinant of the transformation.
Returns
-------
loss : tf.Tensor
A single scalar value representing the KL loss, shape (,)
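
    Examples
    --------
    A minimal sketch with placeholder inputs (in practice, ``v``, ``z``, and
    ``log_det_J`` are produced by an invertible network):

    >>> v = 5.0 * tf.ones((32,))
    >>> z = tf.random.normal((32, 4))
    >>> log_det_J = tf.random.normal((32,))
    >>> loss = kl_latent_space_student(v, z, log_det_J)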
"""
d = z.shape[-1]
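    # Accumulate the negative log density of a d-dimensional Student-t source
    # distribution term by term (1e-15 guards lgamma/log against zero inputs)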
loss = 0.0
loss -= d * tf.math.lgamma(0.5 * (v + 1))
loss += d * tf.math.lgamma(0.5 * v + 1e-15)
loss += (0.5 * d) * tf.math.log(v + 1e-15)
loss += 0.5 * (v + 1) * tf.reduce_sum(tf.math.log1p(z**2 / tf.expand_dims(v, axis=-1)), axis=-1)
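    # Change of variables: subtract the log Jacobian determinant of the transform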
loss -= log_det_J
mean_loss = tf.reduce_mean(loss)
return mean_loss


def kl_dirichlet(model_indices, alpha):
"""Computes the KL divergence between a Dirichlet distribution with parameter vector alpha and a uniform Dirichlet.
Parameters
----------
model_indices : tf.Tensor of shape (batch_size, n_models)
one-hot-encoded true model indices
alpha : tf.Tensor of shape (batch_size, n_models)
positive network outputs in ``[1, +inf]``
Returns
-------
kl : tf.Tensor
A single scalar representing :math:`D_{KL}(\mathrm{Dir}(\\alpha) | \mathrm{Dir}(1,1,\ldots,1) )`, shape (,)
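
    Examples
    --------
    A minimal sketch with placeholder values (``alpha`` would normally come from
    an evidential network):

    >>> model_indices = tf.one_hot([0, 1, 2], depth=3)
    >>> alpha = tf.constant([[2.3, 1.0, 1.0], [1.0, 3.1, 1.0], [1.0, 1.0, 1.8]])
    >>> loss = kl_dirichlet(model_indices, alpha)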
"""
# Extract number of models
J = int(model_indices.shape[1])
# Set-up ground-truth preserving prior
alpha = alpha * (1 - model_indices) + model_indices
beta = tf.ones((1, J), dtype=tf.float32)
alpha0 = tf.reduce_sum(alpha, axis=1, keepdims=True)
# Computation of KL
kl = (
tf.reduce_sum((alpha - beta) * (tf.math.digamma(alpha) - tf.math.digamma(alpha0)), axis=1, keepdims=True)
+ tf.math.lgamma(alpha0)
- tf.reduce_sum(tf.math.lgamma(alpha), axis=1, keepdims=True)
+ tf.reduce_sum(tf.math.lgamma(beta), axis=1, keepdims=True)
- tf.math.lgamma(tf.reduce_sum(beta, axis=1, keepdims=True))
)
loss = tf.reduce_mean(kl)
return loss


def mmd_summary_space(summary_outputs, z_dist=tf.random.normal, kernel="gaussian"):
"""Computes the MMD(p(summary_otuputs) | z_dist) to re-shape the summary network outputs in
an information-preserving manner.
Parameters
----------
summary_outputs : tf Tensor of shape (batch_size, ...)
The outputs of the summary network.
z_dist : callable, default tf.random.normal
The latent data distribution towards which the summary outputs are optimized.
kernel : str in ('gaussian', 'inverse_multiquadratic'), default 'gaussian'
The kernel function to use for MMD computation.
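
    Examples
    --------
    A minimal sketch with random placeholder outputs standing in for a summary network:

    >>> summary_outputs = tf.random.normal((64, 16))
    >>> loss = mmd_summary_space(summary_outputs)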
"""
z_samples = z_dist(tf.shape(summary_outputs))
mmd_loss = maximum_mean_discrepancy(summary_outputs, z_samples, kernel)
return mmd_loss


def log_loss(model_indices, preds, evidential=False, label_smoothing=0.01):
"""Computes the logarithmic loss given true ``model_indices`` and approximate model
probabilities either according to [1] if ``evidential is True`` or according to [2]
if ``evidential is False``.
[1] Radev, S. T., D'Alessandro, M., Mertens, U. K., Voss, A., Köthe, U., & Bürkner, P. C. (2021).
Amortized bayesian model comparison with evidential deep learning.
IEEE Transactions on Neural Networks and Learning Systems.
[2] Elsemüller, L., Schnuerch, M., Bürkner, P. C., & Radev, S. T. (2023).
A Deep Learning Method for Comparing Bayesian Hierarchical Models.
arXiv preprint arXiv:2301.11873.
Parameters
----------
model_indices : tf.Tensor of shape (batch_size, num_models)
one-hot-encoded true model indices
preds : tf.Tensor of shape (batch_size, num_models)
If ``evidential is True`` these should be the concentration
parameters of a Dirichlet density bounded between ``[1, +inf]``.
Else, these should be normalized probability values.
evidential : boolean, optional, default: False
Whether to first normalize ``preds`` (True) or assume
normalized (False, default)
label_smoothing : float or None, optional, default: 0.01
Optional label smoothing factor.
Returns
-------
loss : tf.Tensor
A single scalar Monte-Carlo approximation of the log-loss, shape (,)
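
    Examples
    --------
    A minimal sketch with normalized predicted model probabilities:

    >>> model_indices = tf.one_hot([0, 1, 1], depth=2)
    >>> preds = tf.constant([[0.9, 0.1], [0.2, 0.8], [0.4, 0.6]])
    >>> loss = log_loss(model_indices, preds)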
"""
# Apply label smoothing to indices, if specified
if label_smoothing is not None:
num_models = tf.cast(tf.shape(model_indices)[1], dtype=tf.float32)
model_indices *= 1.0 - label_smoothing
model_indices += label_smoothing / num_models
# Obtain probs if using an evidential network
if evidential:
preds = preds / tf.reduce_sum(preds, axis=1, keepdims=True)
# Numerical stability
preds = tf.clip_by_value(preds, 1e-15, 1 - 1e-15)
    # Compute the actual log-loss (cross-entropy between true indices and predictions)
loss = -tf.reduce_mean(tf.reduce_sum(model_indices * tf.math.log(preds), axis=1))
return loss


def norm_diff(tensor_a, tensor_b, axis=None, ord='euclidean'):
"""
Wrapper around tf.norm that computes the norm of the difference between two tensors along the specified axis.
Parameters
----------
tensor_a : A Tensor.
tensor_b : A Tensor. Must be the same shape as tensor_a.
axis : Any or None
Axis along which to compute the norm of the difference. Default is None.
ord : int or str
Order of the norm. Supports 'euclidean' and other norms supported by tf.norm. Default is 'euclidean'.
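
    Examples
    --------
    A minimal sketch computing row-wise Euclidean distances:

    >>> a = tf.constant([[1.0, 2.0], [3.0, 4.0]])
    >>> b = tf.zeros((2, 2))
    >>> distances = norm_diff(a, b, axis=-1)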
"""
difference = tensor_a - tensor_b
return tf.norm(difference, ord=ord, axis=axis)