Source code for bayesflow.networks.transformers.time_series_transformer
import keras
from keras.saving import register_keras_serializable as serializable
from bayesflow.types import Tensor
from bayesflow.utils import check_lengths_same
from bayesflow.utils.decorators import sanitize_input_shape
from ..embeddings import Time2Vec, RecurrentEmbedding
from ..summary_network import SummaryNetwork
from .mab import MultiHeadAttentionBlock
@serializable(package="bayesflow.networks")
class TimeSeriesTransformer(SummaryNetwork):
def __init__(
self,
summary_dim: int = 16,
embed_dims: tuple = (64, 64),
num_heads: tuple = (4, 4),
mlp_depths: tuple = (2, 2),
mlp_widths: tuple = (128, 128),
dropout: float = 0.05,
mlp_activation: str = "gelu",
kernel_initializer: str = "he_normal",
use_bias: bool = True,
layer_norm: bool = True,
time_embedding: str = "time2vec",
time_embed_dim: int = 8,
time_axis: int = None,
**kwargs,
):
"""Creates a regular transformer coupled with Time2Vec embeddings of time used to flexibly compress time series.
If the time intervals vary across batches, it is highly recommended that your simulator also returns a "time"
vector appended to the simulator outputs and specified via the "time_axis" argument.
Parameters
----------
summary_dim : int, optional (default - 16)
Dimensionality of the final summary output.
embed_dims : tuple of int, optional (default - (64, 64))
Dimensions of the keys, values, and queries for each attention block.
num_heads : tuple of int, optional (default - (4, 4))
Number of attention heads for each embedding dimension.
mlp_depths : tuple of int, optional (default - (2, 2))
Depth of the multi-layer perceptron (MLP) blocks for each component.
mlp_widths : tuple of int, optional (default - (128, 128))
Width of each MLP layer in each block for each component.
dropout : float, optional (default - 0.05)
Dropout rate applied to the attention and MLP layers. If set to None, no dropout is applied.
mlp_activation : str, optional (default - 'gelu')
Activation function used in the dense layers. Common choices include "relu", "elu", and "gelu".
kernel_initializer : str, optional (default - 'he_normal')
Initializer for the kernel weights matrix. Common choices include "glorot_uniform", "he_normal", etc.
use_bias : bool, optional (default - True)
Whether to include a bias term in the dense layers.
layer_norm : bool, optional (default - True)
Whether to apply layer normalization after the attention and MLP layers.
time_embedding : str, optional (default - "time2vec")
The type of time embedding to use. Must be one of ["time2vec", "lstm", "gru"], or None for no time embedding.
time_embed_dim : int, optional (default - 8)
The dimensionality of the Time2Vec or recurrent embedding.
time_axis : int, optional (default - None)
The time axis (e.g., -1 for the last axis) from which to extract the time vector that is passed to the
time embedding. If a time embedding is used and ``time_axis`` is None, uniformly spaced time steps
over [0, sequence_len] will be assumed.
**kwargs : dict
Additional keyword arguments passed to the base layer.
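Examples
--------
A minimal usage sketch; the batch size, sequence length, and number of channels below are purely
illustrative, and the last channel of ``x`` is assumed to hold the time stamps:

>>> import keras
>>> net = TimeSeriesTransformer(summary_dim=16, time_axis=-1)
>>> x = keras.random.normal((32, 128, 3))  # (batch_size, sequence_length, input_dim)
>>> summary = net(x)  # shape: (32, 16)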
"""
super().__init__(**kwargs)
# Ensure all tuple-settings have the same length
check_lengths_same(embed_dims, num_heads, mlp_depths, mlp_widths)
# Initialize the time embedding layer (Time2Vec or recurrent), if any
if time_embedding is None:
self.time_embedding = None
elif time_embedding == "time2vec":
self.time_embedding = Time2Vec(num_periodic_features=time_embed_dim - 1)
elif time_embedding in ["lstm", "gru"]:
self.time_embedding = RecurrentEmbedding(time_embed_dim, time_embedding)
else:
raise ValueError("Embedding not found!")
# Construct a series of set-attention blocks
self.attention_blocks = []
for i in range(len(embed_dims)):
layer_attention_settings = dict(
dropout=dropout,
mlp_activation=mlp_activation,
kernel_initializer=kernel_initializer,
use_bias=use_bias,
layer_norm=layer_norm,
num_heads=num_heads[i],
embed_dim=embed_dims[i],
mlp_depth=mlp_depths[i],
mlp_width=mlp_widths[i],
)
block = MultiHeadAttentionBlock(**layer_attention_settings)
self.attention_blocks.append(block)
# Pooling will be applied as a final step to the abstract representations obtained from set attention
self.pooling = keras.layers.GlobalAvgPool1D()
self.output_projector = keras.layers.Dense(summary_dim)
self.summary_dim = summary_dim
self.time_axis = time_axis
def call(self, input_sequence: Tensor, training: bool = False, **kwargs) -> Tensor:
"""Compresses the input sequence into a summary vector of size `summary_dim`.
Parameters
----------
input_sequence : Tensor
Input of shape (batch_size, sequence_length, input_dim)
training : boolean, optional (default - False)
Passed to the internal dropout layers to distinguish between training and inference behavior.
**kwargs : dict, optional (default - {})
Additional keyword arguments passed to the internal attention layers,
such as ``attention_mask`` or ``return_attention_scores``.
Returns
-------
out : Tensor
Output of shape (batch_size, summary_dim)
"""
if self.time_axis is not None:
t = input_sequence[..., self.time_axis]
indices = list(range(keras.ops.shape(input_sequence)[-1]))
indices.pop(self.time_axis)
inp = keras.ops.take(input_sequence, indices, axis=-1)
else:
t = None
inp = input_sequence
if self.time_embedding:
inp = self.time_embedding(inp, t=t)
# Apply self-attention blocks
for layer in self.attention_blocks:
inp = layer(inp, inp, training=training, **kwargs)
# Global average pooling and output projection
summary = self.pooling(inp)
summary = self.output_projector(summary)
return summary
@sanitize_input_shape
def build(self, input_shape):
super().build(input_shape)
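# Forward pass with zeros so that all sublayers create their weights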
self.call(keras.ops.zeros(input_shape))