Source code for bayesflow.networks.summary.transformers.time_series_transformer

import keras

from bayesflow.types import Tensor
from bayesflow.utils import check_lengths_same
from bayesflow.utils.serialization import serializable

from .attention import MultiHeadAttention
from .transformer import Transformer

from ...helpers import Time2Vec, RecurrentEmbedding


@serializable("bayesflow.networks")
class TimeSeriesTransformer(Transformer):
    def __init__(
        self,
        summary_dim: int = 16,
        embed_dims: tuple = (64, 64),
        num_heads: tuple = (4, 4),
        dropout: float = 0.05,
        expansion_factor: float = 4.0,
        glu_variant: str = "swiglu",
        kernel_initializer: str = "glorot_uniform",
        use_bias: bool = False,
        layer_norm: bool = True,
        time_embedding: str = "time2vec",
        time_embed_dim: int = 8,
        time_axis: int = None,
        return_sequences: bool = False,
        **kwargs,
    ):
        """Creates a regular transformer coupled with Time2Vec embeddings of time,
        used to flexibly compress time series. If the time intervals vary across
        batches, it is highly recommended that your simulator also returns a "time"
        vector appended to the simulator outputs and specified via the ``time_axis``
        argument.

        Parameters
        ----------
        summary_dim : int, optional
            Dimensionality of the final summary output, by default 16.
        embed_dims : tuple of int, optional
            Embedding dimensionality for each attention block, by default (64, 64).
        num_heads : tuple of int, optional
            Number of attention heads for each block, by default (4, 4).
        dropout : float, optional
            Dropout rate applied inside the attention sublayers, by default 0.05.
        expansion_factor : float, optional
            FFN intermediate width multiplier (before the 2/3 GLU correction),
            by default 4.0.
        glu_variant : str, optional
            GLU activation variant for the FFN. One of ``"swiglu"``, ``"geglu"``,
            ``"reglu"``, or ``"liglu"``, by default ``"swiglu"``.
        kernel_initializer : str, optional
            Initializer for kernel weights, by default ``"glorot_uniform"``.
        use_bias : bool, optional
            Whether to include bias terms in dense layers, by default False.
        layer_norm : bool, optional
            Whether to apply Pre-LN RMSNorm before each sublayer, by default True.
        time_embedding : str, optional
            The type of time embedding to use. Must be one of
            ``["time2vec", "lstm", "gru"]``, by default ``"time2vec"``.
        time_embed_dim : int, optional
            Dimensionality of the Time2Vec or recurrent embedding, by default 8.
        time_axis : int or None, optional
            The axis from which to grab the time vector for the embedding. If None,
            a uniform time interval between [0, sequence_len] is assumed.
        return_sequences : bool, optional
            If True, acts as a many-to-many encoder. If False (default), acts as a
            many-to-one summary network and ``summary_dim`` is the output dimension.
        **kwargs
            Additional keyword arguments passed to the base layer.
        """

        super().__init__(**kwargs)

        check_lengths_same(embed_dims, num_heads)

        if time_embedding is None:
            self.time_embedding = None
        elif time_embedding == "time2vec":
            self.time_embedding = Time2Vec(num_periodic_features=time_embed_dim - 1)
        elif time_embedding in ["lstm", "gru"]:
            self.time_embedding = RecurrentEmbedding(time_embed_dim, time_embedding)
        else:
            raise ValueError(
                f"Invalid time embedding type: {time_embedding}. "
                f"Expected one of ['time2vec', 'lstm', 'gru']."
            )

        self.attention_blocks = []
        for i in range(len(embed_dims)):
            block = MultiHeadAttention(
                embed_dim=embed_dims[i],
                num_heads=num_heads[i],
                dropout=dropout,
                expansion_factor=expansion_factor,
                glu_variant=glu_variant,
                kernel_initializer=kernel_initializer,
                use_bias=use_bias,
                layer_norm=layer_norm,
            )
            self.attention_blocks.append(block)

        if return_sequences:
            self.pooling = keras.layers.Identity()
        else:
            self.pooling = keras.layers.GlobalAveragePooling1D()

        self.output_projector = keras.layers.Dense(units=summary_dim)
        self.summary_dim = summary_dim
        self.time_axis = time_axis
    def call(self, x: Tensor, training: bool = False, attention_mask: Tensor = None) -> Tensor:
        """Compresses the input sequence into a summary vector of size ``summary_dim``.

        Parameters
        ----------
        x : Tensor
            Input of shape ``(batch_size, sequence_length, input_dim)``.
        training : bool, optional
            Passed to dropout and norm layers, by default False.
        attention_mask : Tensor, optional
            Boolean mask of shape ``(batch_size, sequence_length, sequence_length)``,
            where 1 = attend and 0 = mask.

        Returns
        -------
        Tensor
            Shape ``(batch_size, summary_dim)`` if ``return_sequences=False``,
            otherwise ``(batch_size, sequence_length, summary_dim)``.
        """

        if self.time_axis is not None:
            # Split the explicit time vector off the remaining input features.
            t = x[..., self.time_axis]
            indices = list(range(keras.ops.shape(x)[-1]))
            indices.pop(self.time_axis)
            inp = keras.ops.take(x, indices, axis=-1)
        else:
            t = None
            inp = x

        if self.time_embedding:
            inp = self.time_embedding(inp, t=t)

        for layer in self.attention_blocks:
            inp = layer(inp, inp, training=training, attention_mask=attention_mask)

        summary = self.pooling(inp)
        summary = self.output_projector(summary)

        return summary
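
Below is a minimal usage sketch, not part of the module source. It assumes ``TimeSeriesTransformer`` is exported from ``bayesflow.networks`` (adjust the import path to your installation), that Keras 3 is active, and that the dummy shapes and time values shown are purely illustrative.

import keras
import numpy as np

from bayesflow.networks import TimeSeriesTransformer  # assumed export path

batch_size, seq_len, num_features = 8, 100, 3

# Many-to-one compression with a uniform time grid assumed:
# (8, 100, 3) -> (8, 16)
net = TimeSeriesTransformer(summary_dim=16)
x = keras.random.normal((batch_size, seq_len, num_features))
summary = net(x)  # shape: (8, 16)

# Irregularly sampled series: append the observation times as the last
# feature and point time_axis at it, so the Time2Vec embedding sees the
# true time stamps instead of a uniform grid.
t = np.sort(np.random.uniform(0.0, 1.0, size=(batch_size, seq_len, 1)), axis=1)
x_with_time = keras.ops.concatenate(
    [x, keras.ops.convert_to_tensor(t, dtype="float32")], axis=-1
)
net_time = TimeSeriesTransformer(summary_dim=16, time_axis=-1)
summary_time = net_time(x_with_time)  # shape: (8, 16)

With ``return_sequences=True`` the pooling step is an identity, so the same network instead returns one ``summary_dim``-dimensional vector per time step, i.e. shape ``(8, 100, 16)`` in this sketch.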