Source code for bayesflow.networks.summary.transformers.fusion_transformer
import keras
from keras import layers
from bayesflow.types import Tensor
from bayesflow.utils import check_lengths_same
from bayesflow.utils.serialization import serializable
from .transformer import Transformer
from .attention import MultiHeadAttention
@serializable("bayesflow.networks")
class FusionTransformer(Transformer):
"""
(SN) Implements a more flexible version of the TimeSeriesTransformer that applies a series of self-attention
layers followed by cross-attention between the representation and a learnable template summarized via a
recurrent net.
Note: This network does not need time embeddings, as the sequence itself is used as a learnable embedding.
"""
def __init__(
self,
summary_dim: int = 16,
embed_dims: tuple = (64, 64),
num_heads: tuple = (4, 4),
dropout: float = 0.05,
expansion_factor: float = 4.0,
glu_variant: str = "swiglu",
kernel_initializer: str = "glorot_uniform",
use_bias: bool = False,
layer_norm: bool = True,
template_type: str = "lstm",
bidirectional: bool = True,
template_dim: int = 128,
**kwargs,
):
"""Creates a fusion transformer used to flexibly compress time series.
Important: This network needs at least 2 transformer blocks and always acts as a many-to-one transform.
Parameters
----------
summary_dim : int, optional
Dimensionality of the final summary output, by default 16.
embed_dims : tuple of int, optional
Embedding dimensionality for each attention block, by default (64, 64).
num_heads : tuple of int, optional
Number of attention heads for each block, by default (4, 4).
        dropout : float, optional
            Dropout rate applied inside the attention blocks and the recurrent template
            network, by default 0.05.
        expansion_factor : float, optional
            Multiplier for the FFN intermediate width, applied before the standard 2/3
            down-scaling used with GLU variants to keep parameter counts comparable,
            by default 4.0.
glu_variant : str, optional
GLU activation variant for the FFN. One of ``"swiglu"``, ``"geglu"``,
``"reglu"``, or ``"liglu"``, by default ``"swiglu"``.
kernel_initializer : str, optional
Initializer for kernel weights, by default ``"glorot_uniform"``.
use_bias : bool, optional
Whether to include bias terms in dense layers, by default False.
layer_norm : bool, optional
Whether to apply Pre-LN RMSNorm before each sublayer, by default True.
template_type : str, optional
Recurrent architecture for the template network: ``"lstm"`` or ``"gru"``,
by default ``"lstm"``.
bidirectional : bool, optional
Whether the template recurrent network is bidirectional, by default True.
template_dim : int, optional
Hidden units of the recurrent template network, by default 128.
**kwargs
Additional keyword arguments passed to the base layer.
"""
super().__init__(**kwargs)
check_lengths_same(embed_dims, num_heads)
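        # One attention block is created per (embed_dim, num_heads) pair, so the
        # two tuples must have the same length (checked above).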
self.attention_blocks = []
for i in range(len(embed_dims)):
block = MultiHeadAttention(
embed_dim=embed_dims[i],
num_heads=num_heads[i],
expansion_factor=expansion_factor,
glu_variant=glu_variant,
dropout=dropout,
kernel_initializer=kernel_initializer,
use_bias=use_bias,
layer_norm=layer_norm,
)
self.attention_blocks.append(block)
template_type_upper = template_type.upper()
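        # Halve the recurrent units when bidirectional so that the concatenated
        # forward and backward states together have width `template_dim`.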
if template_type_upper == "LSTM":
rnn = layers.LSTM(template_dim // 2 if bidirectional else template_dim, dropout=dropout)
elif template_type_upper == "GRU":
rnn = layers.GRU(template_dim // 2 if bidirectional else template_dim, dropout=dropout)
else:
raise ValueError(f"Argument `template_type` must be 'lstm' or 'gru', got '{template_type}'.")
self.template_net = layers.Bidirectional(rnn) if bidirectional else rnn
        self.output_projector = layers.Dense(units=summary_dim)
self.summary_dim = summary_dim
def call(self, x: Tensor, training: bool = False, attention_mask: Tensor = None) -> Tensor:
"""Compresses the input sequence into a summary vector of size ``summary_dim``.
Parameters
----------
x : Tensor
Input of shape ``(batch_size, sequence_length, input_dim)``.
training : bool, optional
Passed to dropout and norm layers, by default False.
        attention_mask : Tensor, optional
            Boolean mask of shape ``(batch_size, sequence_length, sequence_length)``, where 1 means
            "attend" and 0 means "mask out". The mask is used in the self-attention blocks only;
            the final cross-attention step runs unmasked.
Returns
-------
Tensor
Output of shape ``(batch_size, summary_dim)``.
"""
template = self.template_net(x, training=training)
rep = x
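        # All blocks except the last perform self-attention over the (refined) sequence.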
for layer in self.attention_blocks[:-1]:
rep = layer(rep, rep, training=training, attention_mask=attention_mask)
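        # The last block cross-attends the template, expanded to a length-1 query,
        # against the refined sequence, producing shape (batch_size, 1, embed_dims[-1]).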
summary = self.attention_blocks[-1](keras.ops.expand_dims(template, axis=1), rep, training=training)
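        # Drop the singleton sequence axis and project down to `summary_dim`.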
summary = self.output_projector(keras.ops.squeeze(summary, axis=1))
return summary
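

# ------------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the library module): compresses a
# batch of dummy time series into summary vectors. The batch size, sequence
# length, and feature dimension below are arbitrary assumptions.
if __name__ == "__main__":
    import numpy as np

    net = FusionTransformer(summary_dim=16, embed_dims=(64, 64), num_heads=(4, 4))
    # 8 series, 100 time steps, 3 observed variables per step (assumed shapes).
    x = keras.ops.convert_to_tensor(np.random.randn(8, 100, 3).astype("float32"))
    summaries = net(x)
    print(keras.ops.shape(summaries))  # -> (8, 16)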