From 0b1c8966927ff266f09684de7e871520ad6c7177 Mon Sep 17 00:00:00 2001 From: Simonas <20096648+simjak@users.noreply.github.com> Date: Wed, 3 Jul 2024 16:42:02 +0300 Subject: [PATCH 1/3] feat: statistical chunker improvements --- docs/00-chunkers-intro.ipynb | 1522 ++------------------- semantic_chunkers/chunkers/statistical.py | 135 +- semantic_chunkers/utils/text.py | 48 + 3 files changed, 268 insertions(+), 1437 deletions(-) diff --git a/docs/00-chunkers-intro.ipynb b/docs/00-chunkers-intro.ipynb index ef3d94e..b915a1d 100644 --- a/docs/00-chunkers-intro.ipynb +++ b/docs/00-chunkers-intro.ipynb @@ -42,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -87,14 +87,6 @@ "outputId": "bd606fad-8214-4fd4-cad1-54bb86234575" }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jamesbriggs/opt/anaconda3/envs/semantic-chunkers/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - }, { "data": { "text/plain": [ @@ -104,7 +96,7 @@ "})" ] }, - "execution_count": 1, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -118,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -153,7 +145,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -173,7 +165,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 17, "metadata": { "id": "Mqnc35w85A8L" }, @@ -208,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -219,24 +211,87 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-05-13 15:13:14 INFO semantic_chunkers.utils.logger Single document exceeds the maximum token limit of 300. Splitting to sentences before semantically merging.\u001b[0m\n" + "\u001b[32m2024-07-03 16:41:12 INFO semantic_chunkers.utils.logger Single document exceeds the maximum token limit of 300. Splitting to sentences before semantically merging.\u001b[0m\n", + "100%|██████████| 6/6 [00:05<00:00, 1.08it/s]\n", + "\u001b[32m2024-07-03 16:41:17 INFO semantic_chunkers.utils.logger Single document exceeds the maximum token limit of 300. Splitting to sentences before semantically merging.\u001b[0m\n" ] } ], "source": [ - "chunks = chunker(docs=[content])" + "chunks = chunker(docs=[content])\n", + "chunks_async = await chunker.acall(docs=[content])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print and compare sync and async chunks." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

[Rendered HTML output: "Synchronous Chunks" and "Asynchronous Chunks" shown side by side, each column displaying the Mamba paper text split into chunks, one colored paragraph per chunk]

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import random\n", + "from IPython.display import display, HTML\n", + "\n", + "# Predefined list of colors\n", + "colors = ['000000', 'FF0000', '800080', '008000', '0000FF']\n", + "\n", + "html_str = '
'\n", + "html_str += '
'\n", + "html_str += '

Synchronous Chunks

'\n", + "for chunk in chunks[0]:\n", + " chunk_text = \"\"\n", + " for split in chunk.splits:\n", + " chunk_text += split\n", + " color = random.choice(colors)\n", + " html_str += f'

{chunk_text}

'\n", + "html_str += '
'\n", + "html_str += '
'\n", + "html_str += '

Asynchronous Chunks

'\n", + "for chunk in chunks_async[0]:\n", + " chunk_text = \"\"\n", + " try:\n", + " for split in chunk.splits:\n", + " chunk_text += split\n", + " except AttributeError:\n", + " print(f\"Error Chunk: {chunk}\")\n", + " color = random.choice(colors)\n", + " html_str += f'

{chunk_text}

'\n", + "html_str += '
'\n", + "html_str += '
'\n", + "\n", + "# Display the HTML\n", + "display(HTML(html_str))" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -263,63 +318,63 @@ "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 5, tokens 200, triggered by: 0.34\n", - "\u001b[31mAdditionally, they have principled Equal contribution. 1 mechanisms for modeling long-range dependencies (Gu, Dao, et al. 2020) in certain data modalities, and have dominated benchmarks such as the Long Range Arena (Tay, Dehghani, Abnar, et al. 2021). Many ï¬ avors of SSMs (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Y. Li et al. 2023; Ma et al. 2023; Orvieto et al. 2023; Smith, Warrington, and Linderman 2023) have been successful in domains involving continuous signal data such as audio and vision (Goel et al. 2022; Nguyen, Goel, et al. 2022; Saon, Gupta, and Cui 2023).\u001b[0m\n", + "Split 5, tokens 256, triggered by: 0.26\n", + "\u001b[31mAdditionally, they have principled Equal contribution. 1 mechanisms for modeling long-range dependencies (Gu, Dao, et al. 2020) in certain data modalities, and have dominated benchmarks such as the Long Range Arena (Tay, Dehghani, Abnar, et al. 2021). Many ï¬ avors of SSMs (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Y. Li et al. 2023; Ma et al. 2023; Orvieto et al. 2023; Smith, Warrington, and Linderman 2023) have been successful in domains involving continuous signal data such as audio and vision (Goel et al. 2022; Nguyen, Goel, et al. 2022; Saon, Gupta, and Cui 2023). However, they have been less eï¬ ective at modeling discrete and information-dense data such as text. We propose a new class of selective state space models, that improves on prior work on several axes to achieve the modeling power of Transformers while scaling linearly in sequence length.\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 6, tokens 151, triggered by: 0.22\n", - "\u001b[32mHowever, they have been less eï¬ ective at modeling discrete and information-dense data such as text. We propose a new class of selective state space models, that improves on prior work on several axes to achieve the modeling power of Transformers while scaling linearly in sequence length. Selection Mechanism. First, we identify a key limitation of prior models: the ability to eï¬ ciently select data in an input-dependent manner (i.e. focus on or ignore particular inputs). Building on intuition based on important synthetic tasks such as selective copy and induction heads, we design a simple selection mechanism by parameterizing the SSM parameters based on the input. This allows the model to ï¬ lter out irrelevant information and remember relevant information indeï¬\u001b[0m\n", + "Split 6, tokens 240, triggered by: 0.28\n", + "\u001b[32mSelection Mechanism. First, we identify a key limitation of prior models: the ability to eï¬ ciently select data in an input-dependent manner (i.e. focus on or ignore particular inputs). Building on intuition based on important synthetic tasks such as selective copy and induction heads, we design a simple selection mechanism by parameterizing the SSM parameters based on the input. This allows the model to ï¬ lter out irrelevant information and remember relevant information indeï¬ nitely. Hardware-aware Algorithm. 
This simple change poses a technical challenge for the computation of the model; in fact, all prior SSMs models must be time- and input-invariant in order to be computationally eï¬ cient. We overcome this with a hardware-aware algorithm that computes the model recurrently with a scan instead of convolution, but does not materialize the expanded state in order to avoid IO access between diï¬ erent levels of the GPU memory hierarchy. The resulting implementation is faster than previous methods both in theory (scaling linearly in sequence length, compared to pseudo-linear for all convolution-based SSMs) and on modern hardware (up to 3à faster on A100 GPUs).\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 7, tokens 145, triggered by: 0.28\n", - "\u001b[34mnitely. Hardware-aware Algorithm. This simple change poses a technical challenge for the computation of the model; in fact, all prior SSMs models must be time- and input-invariant in order to be computationally eï¬ cient. We overcome this with a hardware-aware algorithm that computes the model recurrently with a scan instead of convolution, but does not materialize the expanded state in order to avoid IO access between diï¬ erent levels of the GPU memory hierarchy. The resulting implementation is faster than previous methods both in theory (scaling linearly in sequence length, compared to pseudo-linear for all convolution-based SSMs) and on modern hardware (up to 3à faster on A100 GPUs).\u001b[0m\n", + "Split 7, tokens 196, triggered by: token limit\n", + "\u001b[34mArchitecture. We simplify prior deep sequence model architectures by combining the design of prior SSM architectures (Dao, Fu, Saab, et al. 2023) with the MLP block of Transformers into a single block, leading to a simple and homogenous architecture design (Mamba) incorporating selective state spaces. Selective SSMs, and by extension the Mamba architecture, are fully recurrent models with key properties that make them suitable as the backbone of general foundation models operating on sequences. (i) High quality: selectivity brings strong performance on dense modalities such as language and genomics. (ii) Fast training and inference: computation and memory scales linearly in sequence length during training, and unrolling the model autoregressively during inference requires only constant time per step since it does not require a cache of previous elements. (iii) Long context: the quality and eï¬ ciency together yield performance improvements on real data up to sequence length 1M.\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 8, tokens 236, triggered by: 0.26\n", - "\u001b[35mArchitecture. We simplify prior deep sequence model architectures by combining the design of prior SSM architectures (Dao, Fu, Saab, et al. 2023) with the MLP block of Transformers into a single block, leading to a simple and homogenous architecture design (Mamba) incorporating selective state spaces. Selective SSMs, and by extension the Mamba architecture, are fully recurrent models with key properties that make them suitable as the backbone of general foundation models operating on sequences. (i) High quality: selectivity brings strong performance on dense modalities such as language and genomics. 
(ii) Fast training and inference: computation and memory scales linearly in sequence length during training, and unrolling the model autoregressively during inference requires only constant time per step since it does not require a cache of previous elements. (iii) Long context: the quality and eï¬ ciency together yield performance improvements on real data up to sequence length 1M. We empirically validate Mambaâ s potential as a general sequence FM backbone, in both pretraining quality and domain-speciï¬ c task performance, on several types of modalities and settings:\u001b[0m\n", + "Split 8, tokens 179, triggered by: 0.33\n", + "\u001b[35mWe empirically validate Mambaâ s potential as a general sequence FM backbone, in both pretraining quality and domain-speciï¬ c task performance, on several types of modalities and settings: â ¢ Synthetics. On important synthetic tasks such as copying and induction heads that have been proposed as being key to large language models, Mamba not only solves them easily but can extrapolate solutions indeï¬ nitely long (>1M tokens). â ¢ Audio and Genomics. Mamba out-performs prior state-of-the-art models such as SaShiMi, Hyena, and Transform- ers on modeling audio waveforms and DNA sequences, both in pretraining quality and downstream metrics (e.g. reducing FID on a challenging speech generation dataset by more than half). In both settings, its performance improves with longer context up to million-length sequences. â ¢ Language Modeling.\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 9, tokens 139, triggered by: 0.33\n", - "\u001b[31mâ ¢ Synthetics. On important synthetic tasks such as copying and induction heads that have been proposed as being key to large language models, Mamba not only solves them easily but can extrapolate solutions indeï¬ nitely long (>1M tokens). â ¢ Audio and Genomics. Mamba out-performs prior state-of-the-art models such as SaShiMi, Hyena, and Transform- ers on modeling audio waveforms and DNA sequences, both in pretraining quality and downstream metrics (e.g. reducing FID on a challenging speech generation dataset by more than half). In both settings, its performance improves with longer context up to million-length sequences. â ¢ Language Modeling.\u001b[0m\n", + "Split 9, tokens 165, triggered by: 0.21\n", + "\u001b[31mMamba is the ï¬ rst linear-time sequence model that truly achieves Transformer-quality performance, both in pretraining perplexity and downstream evaluations. With scaling laws up to 1B parameters, we show that Mamba exceeds the performance of a large range of baselines, including very strong modern Transformer training recipes based on LLaMa (Touvron et al. 2023). Our Mamba language model has 5à generation throughput compared to Transformers of similar size, and Mamba-3Bâ s quality matches that of Transformers twice its size (e.g. 4 points higher avg. on common sense reasoning compared to Pythia-3B and even exceeding Pythia-7B). Model code and pre-trained checkpoints are open-sourced at https://github.com/state-spaces/mamba.\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 10, tokens 165, triggered by: 0.21\n", - "\u001b[32mMamba is the ï¬ rst linear-time sequence model that truly achieves Transformer-quality performance, both in pretraining perplexity and downstream evaluations. 
With scaling laws up to 1B parameters, we show that Mamba exceeds the performance of a large range of baselines, including very strong modern Transformer training recipes based on LLaMa (Touvron et al. 2023). Our Mamba language model has 5à generation throughput compared to Transformers of similar size, and Mamba-3Bâ s quality matches that of Transformers twice its size (e.g. 4 points higher avg. on common sense reasoning compared to Pythia-3B and even exceeding Pythia-7B). Model code and pre-trained checkpoints are open-sourced at https://github.com/state-spaces/mamba.\u001b[0m\n", + "Split 10, tokens 238, triggered by: 0.26\n", + "\u001b[32m2 # Selective State Space Model # with Hardware-aware State Expansion # A vuvy GPU SRAM Selection Mechanism es Selection Mechanism Figure 1: (Overview.) Structured SSMs independently map each channel (e.g. ð · = 5) of an input ð ¥ to output ð ¦ through a higher dimensional latent state â (e.g. ð = 4). Prior SSMs avoid materializing this large effective state (ð ·ð , times batch size ð µ and sequence length ð ¿) through clever alternate computation paths requiring time-invariance: the (â , A, B, C) parameters are constant across time. Our selection mechanism adds back input-dependent dynamics, which also requires a careful hardware-aware algorithm to only materialize the expanded states in more efficient levels of the GPU memory hierarchy. # 2 State Space Models Structured state space sequence models (S4) are a recent class of sequence models for deep learning that are broadly related to RNNs, and CNNs, and classical state space models. They are inspired by a particular continuous system (1) that maps a 1-dimensional function or sequence ð\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 11, tokens 138, triggered by: 0.35\n", - "\u001b[34m2 # Selective State Space Model # with Hardware-aware State Expansion # A vuvy GPU SRAM Selection Mechanism es Selection Mechanism Figure 1: (Overview.) Structured SSMs independently map each channel (e.g. ð · = 5) of an input ð ¥ to output ð ¦ through a higher dimensional latent state â (e.g. ð = 4). Prior SSMs avoid materializing this large effective state (ð ·ð , times batch size ð µ and sequence length ð ¿) through clever alternate computation paths requiring time-invariance: the (â , A, B, C) parameters are constant across time.\u001b[0m\n", + "Split 11, tokens 83, triggered by: token limit\n", + "\u001b[34m¥(ð ¡) â â â ¦ ð ¦(ð ¡) â â through an implicit latent state â (ð ¡) â â ð . Concretely, S4 models are deï¬ ned with four parameters (â , A, B, C), which deï¬ ne a sequence-to-sequence trans- formation in two stages. â â ²(ð ¡) = Aâ\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 12, tokens 100, triggered by: 0.26\n", - "\u001b[35mOur selection mechanism adds back input-dependent dynamics, which also requires a careful hardware-aware algorithm to only materialize the expanded states in more efficient levels of the GPU memory hierarchy. # 2 State Space Models Structured state space sequence models (S4) are a recent class of sequence models for deep learning that are broadly related to RNNs, and CNNs, and classical state space models. 
They are inspired by a particular continuous system (1) that maps a 1-dimensional function or sequence ð\u001b[0m\n", + "Split 12, tokens 105, triggered by: 0.46\n", + "\u001b[35m(ð ¡) + Bð ¥(ð ¡) ð ¦(ð ¡) = Câ (ð ¡) (1a) (1b) â ð ¡ = Aâ ð ¡â 1 + Bð ¥ð ¡ ð ¦ð ¡ = Câ ð ¡ (2a) (2b) ð ð ² = (Cð ©, Cð ¨ð ©, â ¦ , Cð ¨ ð ¦ = ð ¥ â ð ² ð ©, â\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 13, tokens 198, triggered by: 0.29\n", - "\u001b[31m¥(ð ¡) â â â ¦ ð ¦(ð ¡) â â through an implicit latent state â (ð ¡) â â ð . Concretely, S4 models are deï¬ ned with four parameters (â , A, B, C), which deï¬ ne a sequence-to-sequence trans- formation in two stages. â â ²(ð ¡) = Aâ (ð ¡) + Bð ¥(ð ¡) ð ¦(ð ¡) = Câ (ð ¡) (1a) (1b) â ð ¡ = Aâ ð ¡â 1 + Bð ¥ð ¡ ð ¦ð ¡ = Câ ð ¡ (2a) (2b) ð ð ² = (Cð ©, Cð ¨ð ©, â ¦ , Cð ¨ ð ¦ = ð ¥ â ð ² ð ©, â ¦ ) (3a) (3b)\u001b[0m\n", + "Split 13, tokens 112, triggered by: 0.41\n", + "\u001b[31m¦ ) (3a) (3b) Discretization. The ï¬ rst stage transforms the â continuous parametersâ (â , A, B) to â discrete parametersâ (A, B) through ï¬ xed formulas A = ð ð ´(â , A) and B = ð ð µ(â , A, B), where the pair (ð ð ´, ð ð µ) is called a discretization rule. Various rules can be used such as the zero-order hold (ZOH) deï¬\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 14, tokens 137, triggered by: 0.16\n", - "\u001b[32mDiscretization. The ï¬ rst stage transforms the â continuous parametersâ (â , A, B) to â discrete parametersâ (A, B) through ï¬ xed formulas A = ð ð ´(â , A) and B = ð ð µ(â , A, B), where the pair (ð ð ´, ð ð µ) is called a discretization rule. Various rules can be used such as the zero-order hold (ZOH) deï¬ ned in equation (4). A = exp(â A) B = (â A)â 1(exp(â A) â I) â â B (4)\u001b[0m\n", + "Split 14, tokens 109, triggered by: 0.24\n", + "\u001b[32mned in equation (4). A = exp(â A) B = (â A)â 1(exp(â A) â I) â â B (4) Discretization has deep connections to continuous-time systems which can endow them with additional properties such as resolution invariance (Nguyen, Goel, et al. 2022) and automatically ensuring that the model is properly normalized (Gu, Johnson, Timalsina, et al. 2023; Orvieto et al. 2023).\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 15, tokens 157, triggered by: 0.32\n", - "\u001b[34mDiscretization has deep connections to continuous-time systems which can endow them with additional properties such as resolution invariance (Nguyen, Goel, et al. 2022) and automatically ensuring that the model is properly normalized (Gu, Johnson, Timalsina, et al. 2023; Orvieto et al. 2023). It also has connections to gating mechanisms of RNNs (Gu, Gulcehre, et al. 2020; Tallec and Ollivier 2018) which we will revisit in Section 3.5. However, from a mechanical point of view discretization can simply be viewed as the ï¬ rst step of the computation graph in the forward pass of an SSM.\u001b[0m\n", + "Split 15, tokens 49, triggered by: token limit\n", + "\u001b[34mIt also has connections to gating mechanisms of RNNs (Gu, Gulcehre, et al. 
2020; Tallec and Ollivier 2018) which we will revisit in Section 3.5.\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 16, tokens 191, triggered by: 0.29\n", - "\u001b[35mAlternate ï¬ avors of SSMs can bypass the discretization step and parameterize (A, B) directly instead (Zhang et al. 2023), which may be easier to reason about. Computation. After the parameters have been transformed from (â , A, B, C) â ¦ (A, B, C), the model can be computed in two ways, either as a linear recurrence (2) or a global convolution (3). 3 Commonly, the model uses the convolutional mode (3) for eï¬ cient parallelizable training (where the whole input sequence is seen ahead of time), and switched into recurrent mode (2) for eï¬ cient autoregressive inference (where the inputs are seen one timestep at a time). Linear Time Invariance (LTI). An important property of equations (1) to (3) is that the modelâ s dynamics are constant through time.\u001b[0m\n", + "Split 16, tokens 225, triggered by: 0.29\n", + "\u001b[35mHowever, from a mechanical point of view discretization can simply be viewed as the ï¬ rst step of the computation graph in the forward pass of an SSM. Alternate ï¬ avors of SSMs can bypass the discretization step and parameterize (A, B) directly instead (Zhang et al. 2023), which may be easier to reason about. Computation. After the parameters have been transformed from (â , A, B, C) â ¦ (A, B, C), the model can be computed in two ways, either as a linear recurrence (2) or a global convolution (3). 3 Commonly, the model uses the convolutional mode (3) for eï¬ cient parallelizable training (where the whole input sequence is seen ahead of time), and switched into recurrent mode (2) for eï¬ cient autoregressive inference (where the inputs are seen one timestep at a time). Linear Time Invariance (LTI). An important property of equations (1) to (3) is that the modelâ s dynamics are constant through time.\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", @@ -333,43 +388,58 @@ "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 19, tokens 236, triggered by: 0.26\n", + "Split 19, tokens 236, triggered by: 0.27\n", "\u001b[34m· channels, the SSM is applied independently to each channel. Note that in this case, the total hidden state has dimension ð ·ð per input, and computing it over the sequence length requires ð (ð µð ¿ð ·ð ) time and memory; this is the root of the fundamental eï¬ ciency bottleneck addressed in Section 3.3. General State Space Models. We note that the term state space model has a very broad meaning which simply represents the notion of any recurrent process with a latent state. It has been used to refer to many disparate concepts in diï¬ erent disciplines, including Markov decision processes (MDP) (reinforcement learning (Hafner et al. 2020)), dynamic causal modeling (DCM) (computational neuroscience (Friston, Harrison, and Penny 2003)), Kalman ï¬ lters (controls (Kalman 1960)), hidden Markov models (HMM) and linear dynamical systems (LDS) (machine learning), and recurrent (and sometimes convolutional) models at large (deep learning). 
Throughout this entire paper we use the term â\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 20, tokens 229, triggered by: 0.22\n", - "\u001b[35mSSMâ to refer exclusively to the class of structured SSMs or S4 models (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Hasani et al. 2023; Ma et al. 2023; Smith, Warrington, and Linderman 2023) and use these terms interchangeably. For convenience we may also include derivatives of such models, such as those focusing on either the linear-recurrence or global-convolution viewpoints (Y. Li et al. 2023; Orvieto et al. 2023; Poli et al. 2023), and clarify nuances when necessary. SSM Architectures. SSMs are standalone sequence transformations that can be incorporated into end-to-end neural network architectures. (We also sometimes call SSM architectures SSNNs, which are to SSM layers as CNNs are to linear convolution layers.) We discuss some of the most well-known SSM architectures, many of which will also serve as our primary baselines.\u001b[0m\n", + "Split 20, tokens 3, triggered by: token limit\n", + "\u001b[35mSSMâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 21, tokens 226, triggered by: 0.22\n", + "\u001b[31mto refer exclusively to the class of structured SSMs or S4 models (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Hasani et al. 2023; Ma et al. 2023; Smith, Warrington, and Linderman 2023) and use these terms interchangeably. For convenience we may also include derivatives of such models, such as those focusing on either the linear-recurrence or global-convolution viewpoints (Y. Li et al. 2023; Orvieto et al. 2023; Poli et al. 2023), and clarify nuances when necessary. SSM Architectures. SSMs are standalone sequence transformations that can be incorporated into end-to-end neural network architectures. (We also sometimes call SSM architectures SSNNs, which are to SSM layers as CNNs are to linear convolution layers.) We discuss some of the most well-known SSM architectures, many of which will also serve as our primary baselines.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 22, tokens 158, triggered by: 0.32\n", + "\u001b[32mâ ¢ Linear attention (Katharopoulos et al. 2020) is an approximation of self-attention involving a recurrence which can be viewed as a degenerate linear SSM. â ¢ H3 (Dao, Fu, Saab, et al. 2023) generalized this recurrence to use S4; it can be viewed as an architecture with an SSM sandwiched by two gated connections (Figure 3). H3 also inserts a standard local convolution, which they frame as a shift-SSM, before the main SSM layer. â ¢ Hyena (Poli et al. 2023) uses the same architecture as H3 but replaces the S4 layer with an MLP-parameterized global convolution (Romero et al. 2021). â\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 23, tokens 106, triggered by: 0.26\n", + "\u001b[34m¢ RetNet (Y. Sun et al. 2023) adds an additional gate to the architecture and uses a simpler SSM, allowing an alternative parallelizable computation path, using a variant of multi-head attention (MHA) instead of convolutions. 4 â ¢ RWKV (B. Peng et al. 
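    "\n",
Several of the splits above are cut by the \"token limit\" trigger rather than by a similarity drop, reflecting the 300-token cap reported in the chunker's log message. The snippet below is an illustrative check only: it assumes `chunks` still holds the statistical chunker's output from the earlier cell, that each chunk exposes its sentence fragments via a `splits` list (inferred from the printed output, not a documented attribute), and that `tiktoken`'s `cl100k_base` encoding is a reasonable stand-in for the tokenizer used internally.

```python
# Hypothetical sanity check: count tokens per chunk and flag any over the cap.
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")  # assumed tokenizer, see note above
MAX_TOKENS = 300

for i, chunk in enumerate(chunks[0]):
    text = " ".join(chunk.splits)           # `splits` is an assumed attribute
    n_tokens = len(enc.encode(text))
    flag = " (over cap)" if n_tokens > MAX_TOKENS else ""
    print(f"chunk {i:>3}: {n_tokens} tokens{flag}")
```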
2023) is a recent RNN designed for language modeling based on another linear attention approximation (attention-free Transformer (S. Zhai et al. 2021)). Its main â WKVâ\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 21, tokens 158, triggered by: 0.32\n", - "\u001b[31mâ ¢ Linear attention (Katharopoulos et al. 2020) is an approximation of self-attention involving a recurrence which can be viewed as a degenerate linear SSM. â ¢ H3 (Dao, Fu, Saab, et al. 2023) generalized this recurrence to use S4; it can be viewed as an architecture with an SSM sandwiched by two gated connections (Figure 3). H3 also inserts a standard local convolution, which they frame as a shift-SSM, before the main SSM layer. â ¢ Hyena (Poli et al. 2023) uses the same architecture as H3 but replaces the S4 layer with an MLP-parameterized global convolution (Romero et al. 2021). â\u001b[0m\n", + "Split 24, tokens 172, triggered by: 0.25\n", + "\u001b[35mmechanism involves LTI recurrences and can be viewed as the ratio of two SSMs. Other closely related SSMs and architectures are discussed further in an extended related work (Appendix B). We highlight in particular S5 (Smith, Warrington, and Linderman 2023), QRNN (Bradbury et al. 2016), and SRU (Lei et al. 2017), which we view as the most closely related methods to our core selective SSM. # 3 Selective State Space Models We motivate our selection mechanism using intuition from synthetic tasks (Section 3.1), then explain how to incorporate this mechanism into state space models (Section 3.2). The resulting time-varying SSMs cannot use convolutions, presenting a technical challenge of how to compute them eï¬\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 22, tokens 106, triggered by: 0.26\n", - "\u001b[32m¢ RetNet (Y. Sun et al. 2023) adds an additional gate to the architecture and uses a simpler SSM, allowing an alternative parallelizable computation path, using a variant of multi-head attention (MHA) instead of convolutions. 4 â ¢ RWKV (B. Peng et al. 2023) is a recent RNN designed for language modeling based on another linear attention approximation (attention-free Transformer (S. Zhai et al. 2021)). Its main â WKVâ\u001b[0m\n", + "Split 25, tokens 254, triggered by: 0.33\n", + "\u001b[31mciently. We overcome this with a hardware-aware algorithm that exploits the memory hierarchy on modern hardware (Section 3.3). We then describe a simple SSM architecture without attention or even MLP blocks (Section 3.4). Finally, we discuss some additional properties of selection mechanisms (Section 3.5). # 3.1 Motivation: Selection as a Means of Compression We argue that a fundamental problem of sequence modeling is compressing context into a smaller state. In fact, we can view the tradeoï¬ s of popular sequence models from this point of view. For example, attention is both eï¬ ective and ineï¬ cient because it explicitly does not compress context at all. This can be seen from the fact that autoregressive inference requires explicitly storing the entire context (i.e. the KV cache), which directly causes the slow linear-time inference and quadratic-time training of Transformers. On the other hand, recurrent models are eï¬ cient because they have a ï¬ nite state, implying constant-time inference and linear-time training. However, their eï¬ ectiveness is limited by how well this state has compressed the context. 
To understand this principle, we focus on two running examples of synthetic tasks (Figure 2).\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 23, tokens 172, triggered by: 0.25\n", - "\u001b[34mmechanism involves LTI recurrences and can be viewed as the ratio of two SSMs. Other closely related SSMs and architectures are discussed further in an extended related work (Appendix B). We highlight in particular S5 (Smith, Warrington, and Linderman 2023), QRNN (Bradbury et al. 2016), and SRU (Lei et al. 2017), which we view as the most closely related methods to our core selective SSM. # 3 Selective State Space Models We motivate our selection mechanism using intuition from synthetic tasks (Section 3.1), then explain how to incorporate this mechanism into state space models (Section 3.2). The resulting time-varying SSMs cannot use convolutions, presenting a technical challenge of how to compute them eï¬\u001b[0m\n", + "Split 26, tokens 283, triggered by: 0.35\n", + "\u001b[32mâ ¢ The Selective Copying task modiï¬ es the popular Copying task (Arjovsky, Shah, and Bengio 2016) by varying the position of the tokens to memorize. It requires content-aware reasoning to be able to memorize the relevant tokens (colored) and ï¬ lter out the irrelevant ones (white). â ¢ The Induction Heads task is a well-known mechanism hypothesized to explain the majority of in-context learning abilities of LLMs (Olsson et al. 2022). It requires context-aware reasoning to know when to produce the correct output in the appropriate context (black). These tasks reveal the failure mode of LTI models. From the recurrent view, their constant dynamics (e.g. the (A, B) transitions in (2)) cannot let them select the correct information from their context, or aï¬ ect the hidden state passed along the sequence an in input-dependent way. From the convolutional view, it is known that global convolutions can solve the vanilla Copying task (Romero et al. 2021) because it only requires time-awareness, but that they have diï¬ culty with the Selective Copying task because of lack of content-awareness (Figure 2). More concretely, the spacing between inputs-to-outputs is varying and cannot be modeled by static convolution kernels.\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 24, tokens 254, triggered by: 0.33\n", - "\u001b[35mciently. We overcome this with a hardware-aware algorithm that exploits the memory hierarchy on modern hardware (Section 3.3). We then describe a simple SSM architecture without attention or even MLP blocks (Section 3.4). Finally, we discuss some additional properties of selection mechanisms (Section 3.5). # 3.1 Motivation: Selection as a Means of Compression We argue that a fundamental problem of sequence modeling is compressing context into a smaller state. In fact, we can view the tradeoï¬ s of popular sequence models from this point of view. For example, attention is both eï¬ ective and ineï¬ cient because it explicitly does not compress context at all. This can be seen from the fact that autoregressive inference requires explicitly storing the entire context (i.e. the KV cache), which directly causes the slow linear-time inference and quadratic-time training of Transformers. On the other hand, recurrent models are eï¬ cient because they have a ï¬ nite state, implying constant-time inference and linear-time training. 
However, their eï¬ ectiveness is limited by how well this state has compressed the context. To understand this principle, we focus on two running examples of synthetic tasks (Figure 2).\u001b[0m\n", + "Split 27, tokens 20, triggered by: token limit\n", + "\u001b[34mIn summary, the eï¬ ciency vs. eï¬ ectiveness tradeoï¬\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 25, tokens 297, triggered by: token limit\n", - "\u001b[31mâ ¢ The Selective Copying task modiï¬ es the popular Copying task (Arjovsky, Shah, and Bengio 2016) by varying the position of the tokens to memorize. It requires content-aware reasoning to be able to memorize the relevant tokens (colored) and ï¬ lter out the irrelevant ones (white). â ¢ The Induction Heads task is a well-known mechanism hypothesized to explain the majority of in-context learning abilities of LLMs (Olsson et al. 2022). It requires context-aware reasoning to know when to produce the correct output in the appropriate context (black). These tasks reveal the failure mode of LTI models. From the recurrent view, their constant dynamics (e.g. the (A, B) transitions in (2)) cannot let them select the correct information from their context, or aï¬ ect the hidden state passed along the sequence an in input-dependent way. From the convolutional view, it is known that global convolutions can solve the vanilla Copying task (Romero et al. 2021) because it only requires time-awareness, but that they have diï¬ culty with the Selective Copying task because of lack of content-awareness (Figure 2). More concretely, the spacing between inputs-to-outputs is varying and cannot be modeled by static convolution kernels. In summary, the eï¬ ciency vs. eï¬\u001b[0m\n", + "Split 28, tokens 109, triggered by: 0.41\n", + "\u001b[35mof sequence models is characterized by how well they compress their state: eï¬ cient models must have a small state, while eï¬ ective models must have a state that contains all necessary information from the context. In turn, we propose that a fundamental principle for building sequence models is selectivity: or the context-aware ability to focus on or ï¬ lter out inputs into a sequential state. In particular, a selection mechanism controls how information propagates or interacts along the sequence dimension (see Section 3.5 for more discussion).\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 26, tokens 160, triggered by: final split\n", - "\u001b[32mectiveness tradeoï¬ of sequence models is characterized by how well they compress their state: eï¬ cient models must have a small state, while eï¬ ective models must have a state that contains all necessary information from the context. In turn, we propose that a fundamental principle for building sequence models is selectivity: or the context-aware ability to focus on or ï¬ lter out inputs into a sequential state. In particular, a selection mechanism controls how information propagates or interacts along the sequence dimension (see Section 3.5 for more discussion). # Improving SSMs with Selection One method of incorporating a selection mechanism into models is by letting their parameters that aï¬ ect interactions along the sequence (e.g. 
the recurrent dynamics of an RNN or the c\u001b[0m\n", + "Split 29, tokens 45, triggered by: final split\n", + "\u001b[31m# Improving SSMs with Selection One method of incorporating a selection mechanism into models is by letting their parameters that aï¬ ect interactions along the sequence (e.g. the recurrent dynamics of an RNN or the c\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n" @@ -377,7 +447,7 @@ } ], "source": [ - "chunker.print(chunks[0])" + "chunker.print(chunks_async[0])" ] }, { @@ -400,7 +470,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": { "id": "AaKVbv942kkc" }, @@ -413,7 +483,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -422,702 +492,16 @@ "id": "d3mtF7R66tFJ", "outputId": "be8a0a91-e042-4214-9019-5cb17559c6de" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 6/6 [00:08<00:00, 1.48s/it]\n", - "100%|██████████| 328/328 [00:00<00:00, 36590.56it/s]\n" - ] - } - ], + "outputs": [], "source": [ "chunks = chunker(docs=[content])" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Split 1, tokens None, triggered by: 0.09\n", - "\u001b[31m# Mamba:\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 2, tokens None, triggered by: 0.10\n", - "\u001b[32mLinear-Time Sequence Modeling with Selective State Spaces\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 3, tokens None, triggered by: 0.25\n", - "\u001b[34m# Albert Gu*1 and Tri Dao*2 1Machine Learning Department, Carnegie Mellon University 2Department of Computer Science, Princeton University agu@cs.cmu.edu, tri@tridao.me\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 4, tokens None, triggered by: 0.22\n", - "\u001b[35m# Abstract\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 5, tokens None, triggered by: 0.30\n", - "\u001b[31mFoundation models, now powering most of the exciting applications in deep learning, are almost universally based on the Transformer architecture and its core attention module. Many subquadratic-time architectures such as linear attention, gated convolution and recurrent models, and structured state space models (SSMs) have been developed to address Transformersâ computational ineï¬ ciency on long sequences, but they have not performed as well as attention on important modalities such as language. We identify that a key weakness of such models is their inability to perform content-based reasoning, and make several improvements. 
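    "\n",
A practical note on the async path exercised above: `await chunker.acall(...)` relies on notebook-level `await`, which is only available in IPython/Jupyter. In a plain Python script the coroutine has to be driven by an event loop. A minimal sketch, assuming `chunker` and `content` are constructed as in the cells above and that the chunker exposes `acall` and `print` as used there:

```python
# Sketch: running the async chunking path outside Jupyter.
import asyncio

async def main() -> None:
    chunks_async = await chunker.acall(docs=[content])  # same call as the notebook cell
    chunker.print(chunks_async[0])

asyncio.run(main())
```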
First, simply letting the SSM parameters be functions of the input addresses their weakness with discrete modalities, allowing the model to selectively propagate or forget information along the sequence length dimension depending on the current token.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 6, tokens None, triggered by: 0.22\n", - "\u001b[32mSecond, even though this change prevents the use of eï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 7, tokens None, triggered by: 0.28\n", - "\u001b[34mcient convolutions, we design a hardware-aware parallel algorithm in recurrent mode.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 8, tokens None, triggered by: 0.25\n", - "\u001b[35mWe integrate these selective SSMs into a simpliï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 9, tokens None, triggered by: 0.11\n", - "\u001b[31med end-to-end neural network architecture without attention or even MLP blocks (Mamba). Mamba enjoys fast inference (5à higher throughput than Transformers) and linear scaling in sequence length, and its performance improves on real data up to million-length sequences. As a general sequence model backbone, Mamba achieves state-of-the-art performance across several modalities such as language, audio, and genomics. On language modeling, our Mamba-3B model outperforms Transformers of the same size and matches Transformers twice its size, both in pretraining and downstream evaluation.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 10, tokens None, triggered by: 0.21\n", - "\u001b[32m# 1 Introduction\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 11, tokens None, triggered by: 0.14\n", - "\u001b[34mFoundation models (FMs), or large models pretrained on massive data then adapted for downstream tasks, have emerged as an eï¬ ective paradigm in modern machine learning. The backbone of these FMs are often sequence models, operating on arbitrary sequences of inputs from a wide variety of domains such as language, images, speech, audio, time series, and genomics (Brown et al. 2020; Dosovitskiy et al. 2020; Ismail Fawaz et al. 2019; Oord et al. 2016; Poli et al. 2023; Sutskever, Vinyals, and Quoc V Le 2014). While this concept is agnostic to a particular choice of model architecture, modern FMs are predominantly based on a single type of sequence model: the Transformer (Vaswani et al. 2017) and its core attention layer (Bahdanau, Cho, and Bengio 2015) The eï¬ cacy of self-attention is attributed to its ability to route information densely within a context window, allowing it to model complex data. 
However, this property brings fundamental drawbacks: an inability to model anything outside of a ï¬ nite window, and quadratic scaling with respect to the window length.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 12, tokens None, triggered by: 0.21\n", - "\u001b[35mAn enormous body of research has appeared on more eï¬ cient variants of attention to overcome these drawbacks (Tay, Dehghani, Bahri, et al. 2022), but often at the expense of the very properties that makes it eï¬ ective.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 13, tokens None, triggered by: 0.27\n", - "\u001b[31mAs of yet, none of these variants have been shown to be empirically eï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 14, tokens None, triggered by: 0.26\n", - "\u001b[32mective at scale across domains.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 15, tokens None, triggered by: 0.09\n", - "\u001b[34mRecently, structured state space sequence models (SSMs) (Gu, Goel, and Ré 2022; Gu, Johnson, Goel, et al. 2021) have emerged as a promising class of architectures for sequence modeling. These models can be interpreted as a combination of recurrent neural networks (RNNs) and convolutional neural networks (CNNs), with inspiration from classical state space models (Kalman 1960). This class of models can be computed very eï¬ ciently as either a recurrence or convolution, with linear or near-linear scaling in sequence length.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 16, tokens None, triggered by: 0.28\n", - "\u001b[35mAdditionally, they have principled\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 17, tokens None, triggered by: 0.23\n", - "\u001b[31mEqual contribution.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 18, tokens None, triggered by: 0.07\n", - "\u001b[32m1\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 19, tokens None, triggered by: 0.15\n", - "\u001b[34mmechanisms for modeling long-range dependencies (Gu, Dao, et al. 2020) in certain data modalities, and have dominated benchmarks such as the Long Range Arena (Tay, Dehghani, Abnar, et al. 2021).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 20, tokens None, triggered by: 0.23\n", - "\u001b[35mMany ï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 21, tokens None, triggered by: 0.21\n", - "\u001b[31mavors of SSMs (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Y. Li et al. 2023; Ma et al. 2023; Orvieto et al. 2023; Smith, Warrington, and Linderman 2023) have been successful in domains involving continuous signal data such as audio and vision (Goel et al. 2022; Nguyen, Goel, et al. 
2022; Saon, Gupta, and Cui 2023).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 22, tokens None, triggered by: 0.20\n", - "\u001b[32mHowever, they have been less eï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 23, tokens None, triggered by: 0.20\n", - "\u001b[34mective at modeling discrete and information-dense data such as text. We propose a new class of selective state space models, that improves on prior work on several axes to achieve the modeling power of Transformers while scaling linearly in sequence length.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 24, tokens None, triggered by: 0.18\n", - "\u001b[35mSelection Mechanism.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 25, tokens None, triggered by: 0.25\n", - "\u001b[31mFirst, we identify a key limitation of prior models: the ability to eï¬ ciently select data in an input-dependent manner (i.e. focus on or ignore particular inputs). Building on intuition based on important synthetic tasks such as selective copy and induction heads, we design a simple selection mechanism by parameterizing the SSM parameters based on the input.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 26, tokens None, triggered by: 0.18\n", - "\u001b[32mThis allows the model to ï¬ lter out irrelevant information and remember relevant information indeï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 27, tokens None, triggered by: 0.16\n", - "\u001b[34mnitely.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 28, tokens None, triggered by: 0.27\n", - "\u001b[35mHardware-aware Algorithm. 
This simple change poses a technical challenge for the computation of the model; in fact, all prior SSMs models must be time- and input-invariant in order to be computationally eï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 29, tokens None, triggered by: 0.18\n", - "\u001b[31mcient.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 30, tokens None, triggered by: 0.29\n", - "\u001b[32mWe overcome this with a hardware-aware algorithm that computes the model recurrently with a scan instead of convolution, but does not materialize the expanded state in order to avoid IO access between diï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 31, tokens None, triggered by: 0.28\n", - "\u001b[34merent levels of the GPU memory hierarchy.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 32, tokens None, triggered by: 0.19\n", - "\u001b[35mThe resulting implementation is faster than previous methods both in theory (scaling linearly in sequence length, compared to pseudo-linear for all convolution-based SSMs) and on modern hardware (up to 3à faster on A100 GPUs).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 33, tokens None, triggered by: 0.28\n", - "\u001b[31mArchitecture.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 34, tokens None, triggered by: 0.29\n", - "\u001b[32mWe simplify prior deep sequence model architectures by combining the design of prior SSM architectures (Dao, Fu, Saab, et al. 2023) with the MLP block of Transformers into a single block, leading to a simple and homogenous architecture design (Mamba) incorporating selective state spaces. Selective SSMs, and by extension the Mamba architecture, are fully recurrent models with key properties that make them suitable as the backbone of general foundation models operating on sequences. (i) High quality: selectivity brings strong performance on dense modalities such as language and genomics. (ii) Fast training and inference: computation and memory scales linearly in sequence length during training, and unrolling the model autoregressively during inference requires only constant time per step since it does not require a cache of previous elements. 
(iii) Long context: the quality and eï¬ ciency together yield performance improvements on real data up to sequence length 1M.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 35, tokens None, triggered by: 0.24\n", - "\u001b[34mWe empirically validate Mambaâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 36, tokens None, triggered by: 0.24\n", - "\u001b[35ms potential as a general sequence FM backbone, in both pretraining quality and domain-speciï¬ c task performance, on several types of modalities and settings:\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 37, tokens None, triggered by: 0.19\n", - "\u001b[31mâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 38, tokens None, triggered by: 0.26\n", - "\u001b[32m¢ Synthetics. On important synthetic tasks such as copying and induction heads that have been proposed as being key to large language models, Mamba not only solves them easily but can extrapolate solutions indeï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 39, tokens None, triggered by: 0.20\n", - "\u001b[34mnitely long (>1M tokens).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 40, tokens None, triggered by: 0.24\n", - "\u001b[35mâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 41, tokens None, triggered by: 0.13\n", - "\u001b[31m¢ Audio and Genomics. Mamba out-performs prior state-of-the-art models such as SaShiMi, Hyena, and Transform- ers on modeling audio waveforms and DNA sequences, both in pretraining quality and downstream metrics (e.g. reducing FID on a challenging speech generation dataset by more than half). In both settings, its performance improves with longer context up to million-length sequences.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 42, tokens None, triggered by: 0.24\n", - "\u001b[32mâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 43, tokens None, triggered by: 0.15\n", - "\u001b[34m¢ Language Modeling.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 44, tokens None, triggered by: 0.10\n", - "\u001b[35mMamba is the ï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 45, tokens None, triggered by: 0.20\n", - "\u001b[31mrst linear-time sequence model that truly achieves Transformer-quality performance, both in pretraining perplexity and downstream evaluations. With scaling laws up to 1B parameters, we show that Mamba exceeds the performance of a large range of baselines, including very strong modern Transformer training recipes based on LLaMa (Touvron et al. 2023). 
Our Mamba language model has 5à generation throughput compared to Transformers of similar size, and Mamba-3Bâ s quality matches that of Transformers twice its size (e.g. 4 points higher avg. on common sense reasoning compared to Pythia-3B and even exceeding Pythia-7B).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 46, tokens None, triggered by: 0.08\n", - "\u001b[32mModel code and pre-trained checkpoints are open-sourced at https://github.com/state-spaces/mamba.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 47, tokens None, triggered by: 0.14\n", - "\u001b[34m2\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 48, tokens None, triggered by: 0.19\n", - "\u001b[35m# Selective State Space Model # with Hardware-aware State Expansion # A\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 49, tokens None, triggered by: 0.29\n", - "\u001b[31mvuvy GPU SRAM Selection Mechanism es\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 50, tokens None, triggered by: 0.25\n", - "\u001b[32mSelection Mechanism\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 51, tokens None, triggered by: 0.25\n", - "\u001b[34mFigure 1: (Overview.) Structured SSMs independently map each channel (e.g. ð · = 5) of an input ð ¥ to output ð\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 52, tokens None, triggered by: 0.28\n", - "\u001b[35m¦ through a higher dimensional latent state â\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 53, tokens None, triggered by: 0.23\n", - "\u001b[31m(e.g. 
ð = 4).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 54, tokens None, triggered by: 0.29\n", - "\u001b[32mPrior SSMs avoid materializing this large effective state (ð ·ð , times batch size ð µ and sequence length ð\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 55, tokens None, triggered by: 0.26\n", - "\u001b[34m¿) through clever alternate computation paths requiring time-invariance: the (â , A, B, C) parameters are constant across time.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 56, tokens None, triggered by: 0.26\n", - "\u001b[35mOur selection mechanism adds back input-dependent dynamics, which also requires a careful hardware-aware algorithm to only materialize the expanded states in more efficient levels of the GPU memory hierarchy.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 57, tokens None, triggered by: 0.24\n", - "\u001b[31m# 2 State Space Models Structured state space sequence models (S4) are a recent class of sequence models for deep learning that are broadly related to RNNs, and CNNs, and classical state space models. They are inspired by a particular continuous system (1) that maps a 1-dimensional function or sequence ð ¥(ð ¡) â â â ¦ ð ¦(ð ¡) â â\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 58, tokens None, triggered by: 0.28\n", - "\u001b[32mthrough an implicit latent state â\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 59, tokens None, triggered by: 0.23\n", - "\u001b[34m(ð ¡) â â ð .\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 60, tokens None, triggered by: 0.22\n", - "\u001b[35mConcretely, S4 models are deï¬ ned with four parameters (â , A, B, C), which deï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 61, tokens None, triggered by: 0.18\n", - "\u001b[31mne a sequence-to-sequence trans- formation in two stages.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 62, tokens None, triggered by: 0.27\n", - "\u001b[32mâ â ²(ð ¡) = Aâ (ð ¡) + Bð ¥(ð ¡) ð ¦(ð ¡) = Câ (ð ¡) (1a) (1b) â\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 63, tokens None, triggered by: 0.27\n", - "\u001b[34mð\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 64, tokens None, triggered by: 0.27\n", - "\u001b[35m¡ = Aâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 65, tokens None, triggered by: 0.24\n", - "\u001b[31mð ¡â\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 66, tokens None, triggered by: 0.28\n", - "\u001b[32m1 + Bð ¥ð ¡ ð ¦ð\u001b[0m\n", - 
"----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 67, tokens None, triggered by: 0.30\n", - "\u001b[34m¡ = Câ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 68, tokens None, triggered by: 0.26\n", - "\u001b[35mð ¡ (2a) (2b) ð ð ² = (Cð ©, Cð ¨ð ©, â ¦ , Cð ¨ ð ¦ = ð ¥ â ð ² ð ©, â ¦ ) (3a) (3b)\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 69, tokens None, triggered by: 0.22\n", - "\u001b[31mDiscretization.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 70, tokens None, triggered by: 0.30\n", - "\u001b[32mThe ï¬ rst stage transforms the â\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 71, tokens None, triggered by: 0.23\n", - "\u001b[34mcontinuous parametersâ (â , A, B) to â discrete parametersâ (A, B) through ï¬ xed formulas A = ð ð ´(â , A) and B = ð ð µ(â , A, B), where the pair (ð ð ´, ð ð\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 72, tokens None, triggered by: 0.27\n", - "\u001b[35mµ) is called a discretization rule. Various rules can be used such as the zero-order hold (ZOH) deï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 73, tokens None, triggered by: 0.28\n", - "\u001b[31mned in equation (4).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 74, tokens None, triggered by: 0.13\n", - "\u001b[32mA = exp(â A) B = (â A)â 1(exp(â A) â I) â â B (4)\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 75, tokens None, triggered by: 0.26\n", - "\u001b[34mDiscretization has deep connections to continuous-time systems which can endow them with additional properties such as resolution invariance (Nguyen, Goel, et al. 2022) and automatically ensuring that the model is properly normalized (Gu, Johnson, Timalsina, et al. 2023; Orvieto et al. 2023). It also has connections to gating mechanisms of RNNs (Gu, Gulcehre, et al. 
2020; Tallec and Ollivier 2018) which we will revisit in Section 3.5.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 76, tokens None, triggered by: 0.29\n", - "\u001b[35mHowever, from a mechanical point of view discretization can simply be viewed as the ï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 77, tokens None, triggered by: 0.15\n", - "\u001b[31mrst step of the computation graph in the forward pass of an SSM.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 78, tokens None, triggered by: 0.22\n", - "\u001b[32mAlternate ï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 79, tokens None, triggered by: 0.29\n", - "\u001b[34mavors of SSMs can bypass the discretization step and parameterize (A, B) directly instead (Zhang et al. 2023), which may be easier to reason about. Computation.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 80, tokens None, triggered by: 0.25\n", - "\u001b[35mAfter the parameters have been transformed from (â , A, B, C) â ¦ (A, B, C), the model can be computed in two ways, either as a linear recurrence (2) or a global convolution (3).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 81, tokens None, triggered by: 0.24\n", - "\u001b[31m3\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 82, tokens None, triggered by: 0.14\n", - "\u001b[32mCommonly, the model uses the convolutional mode (3) for eï¬ cient parallelizable training (where the whole input sequence is seen ahead of time), and switched into recurrent mode (2) for eï¬ cient autoregressive inference (where the inputs are seen one timestep at a time). Linear Time Invariance (LTI). An important property of equations (1) to (3) is that the modelâ s dynamics are constant through time.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 83, tokens None, triggered by: 0.21\n", - "\u001b[34mIn other words (â , A, B, C), and consequently (A, B) as well, are ï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 84, tokens None, triggered by: 0.24\n", - "\u001b[35mxed for all time-steps.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 85, tokens None, triggered by: 0.21\n", - "\u001b[31mThis property is called linear time invariance (LTI), which is deeply connected to recurrence and convolutions. Informally, we think of LTI SSMs as being equivalent to any linear recurrence (2a) or convolution (3b), and use LTI as an umbrella term for these classes of models. Thus far, all structured SSMs have been LTI (e.g. computed as convolutions) because of fundamental eï¬ ciency constraints, discussed in Section 3.3. 
However, a core insight of this work is that LTI models have fundamental limitations in modeling certain types of data, and our technical contributions involve removing the LTI constraint while overcoming the eï¬ ciency bottlenecks.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 86, tokens None, triggered by: 0.23\n", - "\u001b[32mStructure and Dimensions. Finally, we note that structured SSMs are so named because computing them eï¬ ciently also requires imposing structure on the A matrix. The most popular form of structure is diagonal (Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Smith, Warrington, and Linderman 2023), which we also use.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 87, tokens None, triggered by: 0.28\n", - "\u001b[34mIn this case, the A â â ð à ð\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 88, tokens None, triggered by: 0.27\n", - "\u001b[35m, B â â ð à 1, C â â 1à ð matrices can all be represented by ð\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 89, tokens None, triggered by: 0.18\n", - "\u001b[31mnumbers.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 90, tokens None, triggered by: 0.10\n", - "\u001b[32mTo operate over an input sequence ð ¥ of batch size ð µ and length ð ¿ with ð\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 91, tokens None, triggered by: 0.28\n", - "\u001b[34m· channels, the SSM is applied independently to each channel.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 92, tokens None, triggered by: 0.28\n", - "\u001b[35mNote that in this case, the total hidden state has dimension ð ·ð per input, and computing it over the sequence length requires ð (ð µð ¿ð ·ð ) time and memory; this is the root of the fundamental eï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 93, tokens None, triggered by: 0.20\n", - "\u001b[31mciency bottleneck addressed in Section 3.3.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 94, tokens None, triggered by: 0.24\n", - "\u001b[32mGeneral State Space Models. We note that the term state space model has a very broad meaning which simply represents the notion of any recurrent process with a latent state.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 95, tokens None, triggered by: 0.23\n", - "\u001b[34mIt has been used to refer to many disparate concepts in diï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 96, tokens None, triggered by: 0.19\n", - "\u001b[35merent disciplines, including Markov decision processes (MDP) (reinforcement learning (Hafner et al. 
2020)), dynamic causal modeling (DCM) (computational neuroscience (Friston, Harrison, and Penny 2003)), Kalman ï¬ lters (controls (Kalman 1960)), hidden Markov models (HMM) and linear dynamical systems (LDS) (machine learning), and recurrent (and sometimes convolutional) models at large (deep learning).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 97, tokens None, triggered by: 0.26\n", - "\u001b[31mThroughout this entire paper we use the term â\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 98, tokens None, triggered by: 0.16\n", - "\u001b[32mSSMâ to refer exclusively to the class of structured SSMs or S4 models (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Hasani et al. 2023; Ma et al. 2023; Smith, Warrington, and Linderman 2023) and use these terms interchangeably. For convenience we may also include derivatives of such models, such as those focusing on either the linear-recurrence or global-convolution viewpoints (Y. Li et al. 2023; Orvieto et al. 2023; Poli et al. 2023), and clarify nuances when necessary.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 99, tokens None, triggered by: 0.09\n", - "\u001b[34mSSM Architectures. SSMs are standalone sequence transformations that can be incorporated into end-to-end neural network architectures. (We also sometimes call SSM architectures SSNNs, which are to SSM layers as CNNs are to linear convolution layers.) We discuss some of the most well-known SSM architectures, many of which will also serve as our primary baselines.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 100, tokens None, triggered by: 0.12\n", - "\u001b[35mâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 101, tokens None, triggered by: 0.12\n", - "\u001b[31m¢ Linear attention (Katharopoulos et al. 2020) is an approximation of self-attention involving a recurrence which can be viewed as a degenerate linear SSM.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 102, tokens None, triggered by: 0.28\n", - "\u001b[32mâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 103, tokens None, triggered by: 0.13\n", - "\u001b[34m¢ H3 (Dao, Fu, Saab, et al. 2023) generalized this recurrence to use S4; it can be viewed as an architecture with an SSM sandwiched by two gated connections (Figure 3). H3 also inserts a standard local convolution, which they frame as a shift-SSM, before the main SSM layer.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 104, tokens None, triggered by: 0.12\n", - "\u001b[35mâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 105, tokens None, triggered by: 0.12\n", - "\u001b[31m¢ Hyena (Poli et al. 2023) uses the same architecture as H3 but replaces the S4 layer with an MLP-parameterized global convolution (Romero et al. 
2021).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 106, tokens None, triggered by: 0.23\n", - "\u001b[32mâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 107, tokens None, triggered by: 0.17\n", - "\u001b[34m¢ RetNet (Y.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 108, tokens None, triggered by: 0.07\n", - "\u001b[35mSun et al. 2023) adds an additional gate to the architecture and uses a simpler SSM, allowing an alternative parallelizable computation path, using a variant of multi-head attention (MHA) instead of convolutions.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 109, tokens None, triggered by: 0.24\n", - "\u001b[31m4 â\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 110, tokens None, triggered by: 0.19\n", - "\u001b[32m¢ RWKV (B.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 111, tokens None, triggered by: 0.16\n", - "\u001b[34mPeng et al. 2023) is a recent RNN designed for language modeling based on another linear attention approximation (attention-free Transformer (S. Zhai et al. 2021)).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 112, tokens None, triggered by: 0.10\n", - "\u001b[35mIts main â WKVâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 113, tokens None, triggered by: 0.29\n", - "\u001b[31mmechanism involves LTI recurrences and can be viewed as the ratio of two SSMs. Other closely related SSMs and architectures are discussed further in an extended related work (Appendix B). We highlight in particular S5 (Smith, Warrington, and Linderman 2023), QRNN (Bradbury et al. 2016), and SRU (Lei et al. 2017), which we view as the most closely related methods to our core selective SSM. # 3 Selective State Space Models We motivate our selection mechanism using intuition from synthetic tasks (Section 3.1), then explain how to incorporate this mechanism into state space models (Section 3.2).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 114, tokens None, triggered by: 0.22\n", - "\u001b[32mThe resulting time-varying SSMs cannot use convolutions, presenting a technical challenge of how to compute them eï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 115, tokens None, triggered by: 0.28\n", - "\u001b[34mciently.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 116, tokens None, triggered by: 0.21\n", - "\u001b[35mWe overcome this with a hardware-aware algorithm that exploits the memory hierarchy on modern hardware (Section 3.3). We then describe a simple SSM architecture without attention or even MLP blocks (Section 3.4). Finally, we discuss some additional properties of selection mechanisms (Section 3.5). 
# 3.1 Motivation:\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 117, tokens None, triggered by: 0.10\n", - "\u001b[31mSelection as a Means of Compression We argue that a fundamental problem of sequence modeling is compressing context into a smaller state.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 118, tokens None, triggered by: 0.20\n", - "\u001b[32mIn fact, we can view the tradeoï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 119, tokens None, triggered by: 0.21\n", - "\u001b[34ms of popular sequence models from this point of view.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 120, tokens None, triggered by: 0.30\n", - "\u001b[35mFor example, attention is both eï¬ ective and ineï¬ cient because it explicitly does not compress context at all. This can be seen from the fact that autoregressive inference requires explicitly storing the entire context (i.e. the KV cache), which directly causes the slow linear-time inference and quadratic-time training of Transformers. On the other hand, recurrent models are eï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 121, tokens None, triggered by: 0.24\n", - "\u001b[31mcient because they have a ï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 122, tokens None, triggered by: 0.16\n", - "\u001b[32mnite state, implying constant-time inference and linear-time training.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 123, tokens None, triggered by: 0.27\n", - "\u001b[34mHowever, their eï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 124, tokens None, triggered by: 0.26\n", - "\u001b[35mectiveness is limited by how well this state has compressed the context.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 125, tokens None, triggered by: 0.12\n", - "\u001b[31mTo understand this principle, we focus on two running examples of synthetic tasks (Figure 2).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 126, tokens None, triggered by: 0.25\n", - "\u001b[32mâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 127, tokens None, triggered by: 0.20\n", - "\u001b[34m¢ The Selective Copying task modiï¬ es the popular Copying task (Arjovsky, Shah, and Bengio 2016) by varying the position of the tokens to memorize. 
It requires content-aware reasoning to be able to memorize the relevant tokens (colored) and ï¬ lter out the irrelevant ones (white).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 128, tokens None, triggered by: 0.12\n", - "\u001b[35mâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 129, tokens None, triggered by: 0.21\n", - "\u001b[31m¢ The Induction Heads task is a well-known mechanism hypothesized to explain the majority of in-context learning abilities of LLMs (Olsson et al. 2022). It requires context-aware reasoning to know when to produce the correct output in the appropriate context (black).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 130, tokens None, triggered by: 0.27\n", - "\u001b[32mThese tasks reveal the failure mode of LTI models. From the recurrent view, their constant dynamics (e.g. the (A, B) transitions in (2)) cannot let them select the correct information from their context, or aï¬ ect the hidden state passed along the sequence an in input-dependent way.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 131, tokens None, triggered by: 0.20\n", - "\u001b[34mFrom the convolutional view, it is known that global convolutions can solve the vanilla Copying task (Romero et al. 2021) because it only requires time-awareness, but that they have diï¬ culty with the Selective Copying task because of lack of content-awareness (Figure 2).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 132, tokens None, triggered by: 0.13\n", - "\u001b[35mMore concretely, the spacing between inputs-to-outputs is varying and cannot be modeled by static convolution kernels.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 133, tokens None, triggered by: 0.20\n", - "\u001b[31mIn summary, the eï¬ ciency vs. eï¬ ectiveness tradeoï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 134, tokens None, triggered by: final split\n", - "\u001b[32mof sequence models is characterized by how well they compress their state: eï¬ cient models must have a small state, while eï¬ ective models must have a state that contains all necessary information from the context. In turn, we propose that a fundamental principle for building sequence models is selectivity: or the context-aware ability to focus on or ï¬ lter out inputs into a sequential state. In particular, a selection mechanism controls how information propagates or interacts along the sequence dimension (see Section 3.5 for more discussion). # Improving SSMs with Selection One method of incorporating a selection mechanism into models is by letting their parameters that aï¬ ect interactions along the sequence (e.g. 
the recurrent dynamics of an RNN or the c\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "chunker.print(chunks[0])" ] @@ -1138,7 +522,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1149,648 +533,18 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 329/329 [04:17<00:00, 1.28it/s]\n" - ] - } - ], + "outputs": [], "source": [ "chunks = chunker(docs=[content])" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Split 1, tokens None, triggered by: 0.09\n", - "\u001b[31m# Mamba:\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 2, tokens None, triggered by: 0.10\n", - "\u001b[32mLinear-Time Sequence Modeling with Selective State Spaces\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 3, tokens None, triggered by: 0.28\n", - "\u001b[34m# Albert Gu*1 and Tri Dao*2 1Machine Learning Department, Carnegie Mellon University 2Department of Computer Science, Princeton University agu@cs.cmu.edu, tri@tridao.me\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 4, tokens None, triggered by: 0.22\n", - "\u001b[35m# Abstract\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 5, tokens None, triggered by: 0.23\n", - "\u001b[31mFoundation models, now powering most of the exciting applications in deep learning, are almost universally based on the Transformer architecture and its core attention module. Many subquadratic-time architectures such as linear attention, gated convolution and recurrent models, and structured state space models (SSMs) have been developed to address Transformersâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 6, tokens None, triggered by: 0.30\n", - "\u001b[32mcomputational ineï¬ ciency on long sequences, but they have not performed as well as attention on important modalities such as language. We identify that a key weakness of such models is their inability to perform content-based reasoning, and make several improvements. 
First, simply letting the SSM parameters be functions of the input addresses their weakness with discrete modalities, allowing the model to selectively propagate or forget information along the sequence length dimension depending on the current token.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 7, tokens None, triggered by: 0.22\n", - "\u001b[34mSecond, even though this change prevents the use of eï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 8, tokens None, triggered by: 0.28\n", - "\u001b[35mcient convolutions, we design a hardware-aware parallel algorithm in recurrent mode.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 9, tokens None, triggered by: 0.25\n", - "\u001b[31mWe integrate these selective SSMs into a simpliï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 10, tokens None, triggered by: 0.17\n", - "\u001b[32med end-to-end neural network architecture without attention or even MLP blocks (Mamba). Mamba enjoys fast inference (5à higher throughput than Transformers) and linear scaling in sequence length, and its performance improves on real data up to million-length sequences. As a general sequence model backbone, Mamba achieves state-of-the-art performance across several modalities such as language, audio, and genomics. On language modeling, our Mamba-3B model outperforms Transformers of the same size and matches Transformers twice its size, both in pretraining and downstream evaluation.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 11, tokens None, triggered by: 0.21\n", - "\u001b[34m# 1 Introduction\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 12, tokens None, triggered by: 0.20\n", - "\u001b[35mFoundation models (FMs), or large models pretrained on massive data then adapted for downstream tasks, have emerged as an eï¬ ective paradigm in modern machine learning. The backbone of these FMs are often sequence models, operating on arbitrary sequences of inputs from a wide variety of domains such as language, images, speech, audio, time series, and genomics (Brown et al. 2020; Dosovitskiy et al. 2020; Ismail Fawaz et al. 2019; Oord et al. 2016; Poli et al. 2023; Sutskever, Vinyals, and Quoc V Le 2014). While this concept is agnostic to a particular choice of model architecture, modern FMs are predominantly based on a single type of sequence model: the Transformer (Vaswani et al. 
2017) and its core attention layer (Bahdanau, Cho, and Bengio 2015) The eï¬ cacy of self-attention is attributed to its ability to route information densely within a context window, allowing it to model complex data.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 13, tokens None, triggered by: 0.25\n", - "\u001b[31mHowever, this property brings fundamental drawbacks: an inability to model anything outside of a ï¬ nite window, and quadratic scaling with respect to the window length.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 14, tokens None, triggered by: 0.28\n", - "\u001b[32mAn enormous body of research has appeared on more eï¬ cient variants of attention to overcome these drawbacks (Tay, Dehghani, Bahri, et al. 2022), but often at the expense of the very properties that makes it eï¬ ective. As of yet, none of these variants have been shown to be empirically eï¬ ective at scale across domains.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 15, tokens None, triggered by: 0.09\n", - "\u001b[34mRecently, structured state space sequence models (SSMs) (Gu, Goel, and Ré 2022; Gu, Johnson, Goel, et al. 2021) have emerged as a promising class of architectures for sequence modeling. These models can be interpreted as a combination of recurrent neural networks (RNNs) and convolutional neural networks (CNNs), with inspiration from classical state space models (Kalman 1960). This class of models can be computed very eï¬ ciently as either a recurrence or convolution, with linear or near-linear scaling in sequence length.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 16, tokens None, triggered by: 0.28\n", - "\u001b[35mAdditionally, they have principled\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 17, tokens None, triggered by: 0.23\n", - "\u001b[31mEqual contribution.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 18, tokens None, triggered by: 0.07\n", - "\u001b[32m1\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 19, tokens None, triggered by: 0.15\n", - "\u001b[34mmechanisms for modeling long-range dependencies (Gu, Dao, et al. 2020) in certain data modalities, and have dominated benchmarks such as the Long Range Arena (Tay, Dehghani, Abnar, et al. 2021).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 20, tokens None, triggered by: 0.23\n", - "\u001b[35mMany ï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 21, tokens None, triggered by: 0.20\n", - "\u001b[31mavors of SSMs (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Y. Li et al. 2023; Ma et al. 2023; Orvieto et al. 2023; Smith, Warrington, and Linderman 2023) have been successful in domains involving continuous signal data such as audio and vision (Goel et al. 2022; Nguyen, Goel, et al. 
2022; Saon, Gupta, and Cui 2023).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 22, tokens None, triggered by: 0.20\n", - "\u001b[32mHowever, they have been less eï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 23, tokens None, triggered by: 0.24\n", - "\u001b[34mective at modeling discrete and information-dense data such as text. We propose a new class of selective state space models, that improves on prior work on several axes to achieve the modeling power of Transformers while scaling linearly in sequence length.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 24, tokens None, triggered by: 0.18\n", - "\u001b[35mSelection Mechanism.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 25, tokens None, triggered by: 0.07\n", - "\u001b[31mFirst, we identify a key limitation of prior models: the ability to eï¬ ciently select data in an input-dependent manner (i.e. focus on or ignore particular inputs). Building on intuition based on important synthetic tasks such as selective copy and induction heads, we design a simple selection mechanism by parameterizing the SSM parameters based on the input. This allows the model to ï¬ lter out irrelevant information and remember relevant information indeï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 26, tokens None, triggered by: 0.16\n", - "\u001b[32mnitely.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 27, tokens None, triggered by: 0.26\n", - "\u001b[34mHardware-aware Algorithm. 
This simple change poses a technical challenge for the computation of the model; in fact, all prior SSMs models must be time- and input-invariant in order to be computationally eï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 28, tokens None, triggered by: 0.18\n", - "\u001b[35mcient.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 29, tokens None, triggered by: 0.29\n", - "\u001b[31mWe overcome this with a hardware-aware algorithm that computes the model recurrently with a scan instead of convolution, but does not materialize the expanded state in order to avoid IO access between diï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 30, tokens None, triggered by: 0.28\n", - "\u001b[32merent levels of the GPU memory hierarchy.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 31, tokens None, triggered by: 0.12\n", - "\u001b[34mThe resulting implementation is faster than previous methods both in theory (scaling linearly in sequence length, compared to pseudo-linear for all convolution-based SSMs) and on modern hardware (up to 3à faster on A100 GPUs).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 32, tokens None, triggered by: 0.28\n", - "\u001b[35mArchitecture.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 33, tokens None, triggered by: 0.23\n", - "\u001b[31mWe simplify prior deep sequence model architectures by combining the design of prior SSM architectures (Dao, Fu, Saab, et al. 2023) with the MLP block of Transformers into a single block, leading to a simple and homogenous architecture design (Mamba) incorporating selective state spaces. Selective SSMs, and by extension the Mamba architecture, are fully recurrent models with key properties that make them suitable as the backbone of general foundation models operating on sequences. (i) High quality: selectivity brings strong performance on dense modalities such as language and genomics. (ii) Fast training and inference: computation and memory scales linearly in sequence length during training, and unrolling the model autoregressively during inference requires only constant time per step since it does not require a cache of previous elements. (iii) Long context: the quality and eï¬ ciency together yield performance improvements on real data up to sequence length 1M. We empirically validate Mambaâ s potential as a general sequence FM backbone, in both pretraining quality and domain-speciï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 34, tokens None, triggered by: 0.24\n", - "\u001b[32mc task performance, on several types of modalities and settings:\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 35, tokens None, triggered by: 0.19\n", - "\u001b[34mâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 36, tokens None, triggered by: 0.27\n", - "\u001b[35m¢ Synthetics. 
On important synthetic tasks such as copying and induction heads that have been proposed as being key to large language models, Mamba not only solves them easily but can extrapolate solutions indeï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 37, tokens None, triggered by: 0.20\n", - "\u001b[31mnitely long (>1M tokens).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 38, tokens None, triggered by: 0.24\n", - "\u001b[32mâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 39, tokens None, triggered by: 0.18\n", - "\u001b[34m¢ Audio and Genomics. Mamba out-performs prior state-of-the-art models such as SaShiMi, Hyena, and Transform- ers on modeling audio waveforms and DNA sequences, both in pretraining quality and downstream metrics (e.g. reducing FID on a challenging speech generation dataset by more than half). In both settings, its performance improves with longer context up to million-length sequences.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 40, tokens None, triggered by: 0.24\n", - "\u001b[35mâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 41, tokens None, triggered by: 0.15\n", - "\u001b[31m¢ Language Modeling.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 42, tokens None, triggered by: 0.10\n", - "\u001b[32mMamba is the ï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 43, tokens None, triggered by: 0.10\n", - "\u001b[34mrst linear-time sequence model that truly achieves Transformer-quality performance, both in pretraining perplexity and downstream evaluations. With scaling laws up to 1B parameters, we show that Mamba exceeds the performance of a large range of baselines, including very strong modern Transformer training recipes based on LLaMa (Touvron et al. 2023). Our Mamba language model has 5à generation throughput compared to Transformers of similar size, and Mamba-3Bâ s quality matches that of Transformers twice its size (e.g. 4 points higher avg. on common sense reasoning compared to Pythia-3B and even exceeding Pythia-7B). 
Model code and pre-trained checkpoints are open-sourced at https://github.com/state-spaces/mamba.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 44, tokens None, triggered by: 0.14\n", - "\u001b[35m2\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 45, tokens None, triggered by: 0.25\n", - "\u001b[31m# Selective State Space Model # with Hardware-aware State Expansion\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 46, tokens None, triggered by: 0.19\n", - "\u001b[32m# A\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 47, tokens None, triggered by: 0.29\n", - "\u001b[34mvuvy GPU SRAM Selection Mechanism es\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 48, tokens None, triggered by: 0.25\n", - "\u001b[35mSelection Mechanism\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 49, tokens None, triggered by: 0.28\n", - "\u001b[31mFigure 1: (Overview.) Structured SSMs independently map each channel (e.g. ð · = 5) of an input ð ¥ to output ð ¦ through a higher dimensional latent state â\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 50, tokens None, triggered by: 0.28\n", - "\u001b[32m(e.g. ð = 4).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 51, tokens None, triggered by: 0.16\n", - "\u001b[34mPrior SSMs avoid materializing this large effective state (ð ·ð , times batch size ð µ and sequence length ð ¿) through clever alternate computation paths requiring time-invariance: the (â , A, B, C) parameters are constant across time. Our selection mechanism adds back input-dependent dynamics, which also requires a careful hardware-aware algorithm to only materialize the expanded states in more efficient levels of the GPU memory hierarchy. # 2 State Space Models Structured state space sequence models (S4) are a recent class of sequence models for deep learning that are broadly related to RNNs, and CNNs, and classical state space models. 
They are inspired by a particular continuous system (1) that maps a 1-dimensional function or sequence ð\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 52, tokens None, triggered by: 0.27\n", - "\u001b[35m¥(ð ¡) â â â ¦ ð ¦(ð ¡) â â\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 53, tokens None, triggered by: 0.28\n", - "\u001b[31mthrough an implicit latent state â\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 54, tokens None, triggered by: 0.26\n", - "\u001b[32m(ð ¡) â â ð .\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 55, tokens None, triggered by: 0.16\n", - "\u001b[34mConcretely, S4 models are deï¬ ned with four parameters (â , A, B, C), which deï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 56, tokens None, triggered by: 0.18\n", - "\u001b[35mne a sequence-to-sequence trans- formation in two stages.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 57, tokens None, triggered by: 0.27\n", - "\u001b[31mâ â ²(ð ¡) = Aâ (ð ¡) + Bð ¥(ð ¡) ð ¦(ð ¡) = Câ (ð ¡) (1a) (1b) â\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 58, tokens None, triggered by: 0.27\n", - "\u001b[32mð\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 59, tokens None, triggered by: 0.27\n", - "\u001b[34m¡ = Aâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 60, tokens None, triggered by: 0.29\n", - "\u001b[35mð ¡â 1 + Bð ¥ð ¡ ð ¦ð ¡ = Câ ð ¡ (2a) (2b) ð ð ² = (Cð ©, Cð ¨ð ©, â ¦ , Cð ¨ ð ¦ = ð ¥ â ð ² ð ©, â ¦ ) (3a) (3b)\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 61, tokens None, triggered by: 0.22\n", - "\u001b[31mDiscretization.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 62, tokens None, triggered by: 0.25\n", - "\u001b[32mThe ï¬ rst stage transforms the â continuous parametersâ (â , A, B) to â discrete parametersâ (A, B) through ï¬ xed formulas A = ð\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 63, tokens None, triggered by: 0.28\n", - "\u001b[34mð ´(â , A) and B = ð ð µ(â , A, B), where the pair (ð ð ´, ð ð µ) is called a discretization rule. Various rules can be used such as the zero-order hold (ZOH) deï¬ ned in equation (4). A = exp(â A) B = (â A)â 1(exp(â A) â I) â â B (4) Discretization has deep connections to continuous-time systems which can endow them with additional properties such as resolution invariance (Nguyen, Goel, et al. 2022) and automatically ensuring that the model is properly normalized (Gu, Johnson, Timalsina, et al. 2023; Orvieto et al. 
2023).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 64, tokens None, triggered by: 0.26\n", - "\u001b[35mIt also has connections to gating mechanisms of RNNs (Gu, Gulcehre, et al. 2020; Tallec and Ollivier 2018) which we will revisit in Section 3.5.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 65, tokens None, triggered by: 0.29\n", - "\u001b[31mHowever, from a mechanical point of view discretization can simply be viewed as the ï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 66, tokens None, triggered by: 0.15\n", - "\u001b[32mrst step of the computation graph in the forward pass of an SSM.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 67, tokens None, triggered by: 0.22\n", - "\u001b[34mAlternate ï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 68, tokens None, triggered by: 0.21\n", - "\u001b[35mavors of SSMs can bypass the discretization step and parameterize (A, B) directly instead (Zhang et al. 2023), which may be easier to reason about. Computation. After the parameters have been transformed from (â , A, B, C) â ¦ (A, B, C), the model can be computed in two ways, either as a linear recurrence (2) or a global convolution (3).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 69, tokens None, triggered by: 0.24\n", - "\u001b[31m3\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 70, tokens None, triggered by: 0.25\n", - "\u001b[32mCommonly, the model uses the convolutional mode (3) for eï¬ cient parallelizable training (where the whole input sequence is seen ahead of time), and switched into recurrent mode (2) for eï¬ cient autoregressive inference (where the inputs are seen one timestep at a time).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 71, tokens None, triggered by: 0.20\n", - "\u001b[34mLinear Time Invariance (LTI). An important property of equations (1) to (3) is that the modelâ s dynamics are constant through time.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 72, tokens None, triggered by: 0.17\n", - "\u001b[35mIn other words (â , A, B, C), and consequently (A, B) as well, are ï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 73, tokens None, triggered by: 0.24\n", - "\u001b[31mxed for all time-steps.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 74, tokens None, triggered by: 0.28\n", - "\u001b[32mThis property is called linear time invariance (LTI), which is deeply connected to recurrence and convolutions. Informally, we think of LTI SSMs as being equivalent to any linear recurrence (2a) or convolution (3b), and use LTI as an umbrella term for these classes of models. Thus far, all structured SSMs have been LTI (e.g. 
computed as convolutions) because of fundamental eï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 75, tokens None, triggered by: 0.23\n", - "\u001b[34mciency constraints, discussed in Section 3.3. However, a core insight of this work is that LTI models have fundamental limitations in modeling certain types of data, and our technical contributions involve removing the LTI constraint while overcoming the eï¬ ciency bottlenecks.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 76, tokens None, triggered by: 0.29\n", - "\u001b[35mStructure and Dimensions. Finally, we note that structured SSMs are so named because computing them eï¬ ciently also requires imposing structure on the A matrix. The most popular form of structure is diagonal (Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Smith, Warrington, and Linderman 2023), which we also use.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 77, tokens None, triggered by: 0.26\n", - "\u001b[31mIn this case, the A â â\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 78, tokens None, triggered by: 0.26\n", - "\u001b[32mð à ð , B â â ð à 1, C â â 1à ð matrices can all be represented by ð\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 79, tokens None, triggered by: 0.18\n", - "\u001b[34mnumbers.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 80, tokens None, triggered by: 0.26\n", - "\u001b[35mTo operate over an input sequence ð ¥ of batch size ð µ and length ð ¿ with ð\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 81, tokens None, triggered by: 0.28\n", - "\u001b[31m· channels, the SSM is applied independently to each channel.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 82, tokens None, triggered by: 0.27\n", - "\u001b[32mNote that in this case, the total hidden state has dimension ð ·ð per input, and computing it over the sequence length requires ð (ð µð\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 83, tokens None, triggered by: 0.27\n", - "\u001b[34m¿ð ·ð\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 84, tokens None, triggered by: 0.28\n", - "\u001b[35m) time and memory; this is the root of the fundamental eï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 85, tokens None, triggered by: 0.20\n", - "\u001b[31mciency bottleneck addressed in Section 3.3.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 86, tokens None, triggered by: 0.19\n", - "\u001b[32mGeneral State Space Models. 
We note that the term state space model has a very broad meaning which simply represents the notion of any recurrent process with a latent state.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 87, tokens None, triggered by: 0.23\n", - "\u001b[34mIt has been used to refer to many disparate concepts in diï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 88, tokens None, triggered by: 0.15\n", - "\u001b[35merent disciplines, including Markov decision processes (MDP) (reinforcement learning (Hafner et al. 2020)), dynamic causal modeling (DCM) (computational neuroscience (Friston, Harrison, and Penny 2003)), Kalman ï¬ lters (controls (Kalman 1960)), hidden Markov models (HMM) and linear dynamical systems (LDS) (machine learning), and recurrent (and sometimes convolutional) models at large (deep learning).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 89, tokens None, triggered by: 0.26\n", - "\u001b[31mThroughout this entire paper we use the term â\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 90, tokens None, triggered by: 0.13\n", - "\u001b[32mSSMâ to refer exclusively to the class of structured SSMs or S4 models (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Hasani et al. 2023; Ma et al. 2023; Smith, Warrington, and Linderman 2023) and use these terms interchangeably. For convenience we may also include derivatives of such models, such as those focusing on either the linear-recurrence or global-convolution viewpoints (Y. Li et al. 2023; Orvieto et al. 2023; Poli et al. 2023), and clarify nuances when necessary. SSM Architectures. SSMs are standalone sequence transformations that can be incorporated into end-to-end neural network architectures. (We also sometimes call SSM architectures SSNNs, which are to SSM layers as CNNs are to linear convolution layers.) We discuss some of the most well-known SSM architectures, many of which will also serve as our primary baselines.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 91, tokens None, triggered by: 0.12\n", - "\u001b[34mâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 92, tokens None, triggered by: 0.12\n", - "\u001b[35m¢ Linear attention (Katharopoulos et al. 2020) is an approximation of self-attention involving a recurrence which can be viewed as a degenerate linear SSM.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 93, tokens None, triggered by: 0.28\n", - "\u001b[31mâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 94, tokens None, triggered by: 0.23\n", - "\u001b[32m¢ H3 (Dao, Fu, Saab, et al. 2023) generalized this recurrence to use S4; it can be viewed as an architecture with an SSM sandwiched by two gated connections (Figure 3). 
H3 also inserts a standard local convolution, which they frame as a shift-SSM, before the main SSM layer.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 95, tokens None, triggered by: 0.12\n", - "\u001b[34mâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 96, tokens None, triggered by: 0.12\n", - "\u001b[35m¢ Hyena (Poli et al. 2023) uses the same architecture as H3 but replaces the S4 layer with an MLP-parameterized global convolution (Romero et al. 2021).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 97, tokens None, triggered by: 0.23\n", - "\u001b[31mâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 98, tokens None, triggered by: 0.18\n", - "\u001b[32m¢ RetNet (Y.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 99, tokens None, triggered by: 0.07\n", - "\u001b[34mSun et al. 2023) adds an additional gate to the architecture and uses a simpler SSM, allowing an alternative parallelizable computation path, using a variant of multi-head attention (MHA) instead of convolutions.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 100, tokens None, triggered by: 0.25\n", - "\u001b[35m4 â\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 101, tokens None, triggered by: 0.19\n", - "\u001b[31m¢ RWKV (B.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 102, tokens None, triggered by: 0.11\n", - "\u001b[32mPeng et al. 2023) is a recent RNN designed for language modeling based on another linear attention approximation (attention-free Transformer (S. Zhai et al. 2021)).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 103, tokens None, triggered by: 0.15\n", - "\u001b[34mIts main â WKVâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 104, tokens None, triggered by: 0.20\n", - "\u001b[35mmechanism involves LTI recurrences and can be viewed as the ratio of two SSMs. Other closely related SSMs and architectures are discussed further in an extended related work (Appendix B). We highlight in particular S5 (Smith, Warrington, and Linderman 2023), QRNN (Bradbury et al. 2016), and SRU (Lei et al. 2017), which we view as the most closely related methods to our core selective SSM. # 3 Selective State Space Models We motivate our selection mechanism using intuition from synthetic tasks (Section 3.1), then explain how to incorporate this mechanism into state space models (Section 3.2). 
The resulting time-varying SSMs cannot use convolutions, presenting a technical challenge of how to compute them eï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 105, tokens None, triggered by: 0.28\n", - "\u001b[31mciently.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 106, tokens None, triggered by: 0.25\n", - "\u001b[32mWe overcome this with a hardware-aware algorithm that exploits the memory hierarchy on modern hardware (Section 3.3). We then describe a simple SSM architecture without attention or even MLP blocks (Section 3.4). Finally, we discuss some additional properties of selection mechanisms (Section 3.5).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 107, tokens None, triggered by: 0.21\n", - "\u001b[34m# 3.1 Motivation:\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 108, tokens None, triggered by: 0.10\n", - "\u001b[35mSelection as a Means of Compression We argue that a fundamental problem of sequence modeling is compressing context into a smaller state.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 109, tokens None, triggered by: 0.20\n", - "\u001b[31mIn fact, we can view the tradeoï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 110, tokens None, triggered by: 0.21\n", - "\u001b[32ms of popular sequence models from this point of view.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 111, tokens None, triggered by: 0.28\n", - "\u001b[34mFor example, attention is both eï¬ ective and ineï¬ cient because it explicitly does not compress context at all. This can be seen from the fact that autoregressive inference requires explicitly storing the entire context (i.e. the KV cache), which directly causes the slow linear-time inference and quadratic-time training of Transformers. 
On the other hand, recurrent models are eï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 112, tokens None, triggered by: 0.25\n", - "\u001b[35mcient because they have a ï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 113, tokens None, triggered by: 0.16\n", - "\u001b[31mnite state, implying constant-time inference and linear-time training.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 114, tokens None, triggered by: 0.27\n", - "\u001b[32mHowever, their eï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 115, tokens None, triggered by: 0.26\n", - "\u001b[34mectiveness is limited by how well this state has compressed the context.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 116, tokens None, triggered by: 0.12\n", - "\u001b[35mTo understand this principle, we focus on two running examples of synthetic tasks (Figure 2).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 117, tokens None, triggered by: 0.25\n", - "\u001b[31mâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 118, tokens None, triggered by: 0.28\n", - "\u001b[32m¢ The Selective Copying task modiï¬ es the popular Copying task (Arjovsky, Shah, and Bengio 2016) by varying the position of the tokens to memorize. It requires content-aware reasoning to be able to memorize the relevant tokens (colored) and ï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 119, tokens None, triggered by: 0.20\n", - "\u001b[34mlter out the irrelevant ones (white).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 120, tokens None, triggered by: 0.12\n", - "\u001b[35mâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 121, tokens None, triggered by: 0.24\n", - "\u001b[31m¢ The Induction Heads task is a well-known mechanism hypothesized to explain the majority of in-context learning abilities of LLMs (Olsson et al. 2022). It requires context-aware reasoning to know when to produce the correct output in the appropriate context (black). These tasks reveal the failure mode of LTI models. From the recurrent view, their constant dynamics (e.g. the (A, B) transitions in (2)) cannot let them select the correct information from their context, or aï¬ ect the hidden state passed along the sequence an in input-dependent way. From the convolutional view, it is known that global convolutions can solve the vanilla Copying task (Romero et al. 2021) because it only requires time-awareness, but that they have diï¬ culty with the Selective Copying task because of lack of content-awareness (Figure 2). 
More concretely, the spacing between inputs-to-outputs is varying and cannot be modeled by static convolution kernels.\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 122, tokens None, triggered by: 0.30\n", - "\u001b[32mIn summary, the eï¬ ciency vs. eï¬ ectiveness tradeoï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 123, tokens None, triggered by: final split\n", - "\u001b[34mof sequence models is characterized by how well they compress their state: eï¬ cient models must have a small state, while eï¬ ective models must have a state that contains all necessary information from the context. In turn, we propose that a fundamental principle for building sequence models is selectivity: or the context-aware ability to focus on or ï¬ lter out inputs into a sequential state. In particular, a selection mechanism controls how information propagates or interacts along the sequence dimension (see Section 3.5 for more discussion). # Improving SSMs with Selection One method of incorporating a selection mechanism into models is by letting their parameters that aï¬ ect interactions along the sequence (e.g. the recurrent dynamics of an RNN or the c\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "chunker.print(chunks[0])" ] diff --git a/semantic_chunkers/chunkers/statistical.py b/semantic_chunkers/chunkers/statistical.py index 97e5629..d0bf0dc 100644 --- a/semantic_chunkers/chunkers/statistical.py +++ b/semantic_chunkers/chunkers/statistical.py @@ -1,3 +1,4 @@ +import asyncio from dataclasses import dataclass from typing import Any, List @@ -10,7 +11,7 @@ from semantic_chunkers.splitters.base import BaseSplitter from semantic_chunkers.splitters.sentence import RegexSplitter from semantic_chunkers.utils.logger import logger -from semantic_chunkers.utils.text import tiktoken_length +from semantic_chunkers.utils.text import async_retry_with_timeout, tiktoken_length, time_it @dataclass @@ -54,7 +55,6 @@ def __init__( enable_statistics=False, ): super().__init__(name=name, encoder=encoder, splitter=splitter) - self.calculated_threshold: float self.encoder = encoder self.threshold_adjustment = threshold_adjustment self.dynamic_threshold = dynamic_threshold @@ -67,6 +67,7 @@ def __init__( self.statistics: ChunkStatistics self.DEFAULT_THRESHOLD = 0.5 + @time_it def _chunk( self, splits: List[Any], batch_size: int = 64, enforce_max_tokens: bool = False ) -> List[Chunk]: @@ -99,44 +100,58 @@ def _chunk( splits = [split for split in new_splits if split and split.strip()] chunks = [] - last_split = None + last_chunk: Chunk | None = None for i in tqdm(range(0, len(splits), batch_size)): batch_splits = splits[i : i + batch_size] - if last_split is not None: - batch_splits = last_split.splits + batch_splits + if last_chunk is not None: + batch_splits = last_chunk.splits + batch_splits encoded_splits = self._encode_documents(batch_splits) similarities = self._calculate_similarity_scores(encoded_splits) + if self.dynamic_threshold: - self._find_optimal_threshold(batch_splits, similarities) + calculated_threshold = self._find_optimal_threshold( + batch_splits, similarities + ) else: - self.calculated_threshold = ( + calculated_threshold = ( self.encoder.score_threshold if self.encoder.score_threshold else self.DEFAULT_THRESHOLD ) - 
split_indices = self._find_split_indices(similarities=similarities) + split_indices = self._find_split_indices( + similarities=similarities, calculated_threshold=calculated_threshold + ) + doc_chunks = self._split_documents( - batch_splits, split_indices, similarities + docs=batch_splits, + split_indices=split_indices, + similarities=similarities, ) if len(doc_chunks) > 1: chunks.extend(doc_chunks[:-1]) - last_split = doc_chunks[-1] + last_chunk = doc_chunks[-1] else: - last_split = doc_chunks[0] + last_chunk = doc_chunks[0] if self.plot_chunks: - self.plot_similarity_scores(similarities, split_indices, doc_chunks) + self.plot_similarity_scores( + similarities=similarities, + split_indices=split_indices, + chunks=doc_chunks, + calculated_threshold=calculated_threshold, + ) if self.enable_statistics: print(self.statistics) - if last_split: - chunks.append(last_split) + if last_chunk: + chunks.append(last_chunk) return chunks + @time_it async def _async_chunk( self, splits: List[Any], batch_size: int = 64, enforce_max_tokens: bool = False ) -> List[Chunk]: @@ -168,45 +183,50 @@ async def _async_chunk( splits = [split for split in new_splits if split and split.strip()] - chunks = [] - last_split = None - for i in tqdm(range(0, len(splits), batch_size)): - batch_splits = splits[i : i + batch_size] - if last_split is not None: - batch_splits = last_split.splits + batch_splits + chunks: list[Chunk] = [] + # Step 1: Define process_batch as a separate coroutine function for parallel + async def _process_batch(batch_splits: List[str]): encoded_splits = await self._async_encode_documents(batch_splits) + return batch_splits, encoded_splits + + # Step 2: Create tasks for parallel execution + tasks = [] + for i in range(0, len(splits), batch_size): + batch_splits = splits[i : i + batch_size] + tasks.append(_process_batch(batch_splits)) + + # Step 3: Await tasks and collect results + encoded_split_results = await asyncio.gather(*tasks) + + # Step 4: Sequentially process results + for batch_splits, encoded_splits in encoded_split_results: similarities = self._calculate_similarity_scores(encoded_splits) if self.dynamic_threshold: - self._find_optimal_threshold(batch_splits, similarities) + calculated_threshold = self._find_optimal_threshold( + batch_splits, similarities + ) else: - self.calculated_threshold = ( + calculated_threshold = ( self.encoder.score_threshold if self.encoder.score_threshold else self.DEFAULT_THRESHOLD ) - split_indices = self._find_split_indices(similarities=similarities) - doc_chunks = self._split_documents( - batch_splits, split_indices, similarities + + split_indices = self._find_split_indices( + similarities=similarities, calculated_threshold=calculated_threshold ) - if len(doc_chunks) > 1: - chunks.extend(doc_chunks[:-1]) - last_split = doc_chunks[-1] - else: - last_split = doc_chunks[0] - - if self.plot_chunks: - self.plot_similarity_scores(similarities, split_indices, doc_chunks) - - if self.enable_statistics: - print(self.statistics) - - if last_split: - chunks.append(last_split) + doc_chunks: list[Chunk] = self._split_documents( + docs=batch_splits, + split_indices=split_indices, + similarities=similarities, + ) + chunks.extend(doc_chunks) return chunks + @time_it def __call__(self, docs: List[str], batch_size: int = 64) -> List[List[Chunk]]: """Split documents into smaller chunks based on semantic similarity. 
@@ -235,6 +255,7 @@ def __call__(self, docs: List[str], batch_size: int = 64) -> List[List[Chunk]]: raise ValueError("The document must be a string.") return all_chunks + @time_it async def acall(self, docs: List[str], batch_size: int = 64) -> List[List[Chunk]]: """Split documents into smaller chunks based on semantic similarity. @@ -263,6 +284,7 @@ async def acall(self, docs: List[str], batch_size: int = 64) -> List[List[Chunk] raise ValueError("The document must be a string.") return all_chunks + @time_it def _encode_documents(self, docs: List[str]) -> np.ndarray: """ Encodes a list of documents into embeddings. If the number of documents @@ -286,6 +308,8 @@ def _encode_documents(self, docs: List[str]) -> np.ndarray: return np.array(embeddings) + @async_retry_with_timeout(retries=3, timeout=5) + @time_it async def _async_encode_documents(self, docs: List[str]) -> np.ndarray: """ Encodes a list of documents into embeddings. If the number of documents @@ -321,14 +345,16 @@ def _calculate_similarity_scores(self, encoded_docs: np.ndarray) -> List[float]: raw_similarities.append(curr_sim_score) return raw_similarities - def _find_split_indices(self, similarities: List[float]) -> List[int]: + def _find_split_indices( + self, similarities: List[float], calculated_threshold: float + ) -> List[int]: split_indices = [] for idx, score in enumerate(similarities): logger.debug(f"Similarity score at index {idx}: {score}") - if score < self.calculated_threshold: + if score < calculated_threshold: logger.debug( f"Adding to split_indices due to score < threshold: " - f"{score} < {self.calculated_threshold}" + f"{score} < {calculated_threshold}" ) # Chunk after the document at idx split_indices.append(idx + 1) @@ -348,11 +374,14 @@ def _find_optimal_threshold(self, docs: List[str], similarity_scores: List[float iteration = 0 median_tokens = 0 + calculated_threshold = 0 while low <= high: - self.calculated_threshold = (low + high) / 2 - split_indices = self._find_split_indices(similarity_scores) + calculated_threshold = (low + high) / 2 + split_indices = self._find_split_indices( + similarity_scores, calculated_threshold + ) logger.debug( - f"Iteration {iteration}: Trying threshold: {self.calculated_threshold}" + f"Iteration {iteration}: Trying threshold: {calculated_threshold}" ) # Calculate the token counts for each split using the cumulative sums @@ -376,20 +405,20 @@ def _find_optimal_threshold(self, docs: List[str], similarity_scores: List[float logger.debug("Median tokens in target range. Stopping iteration.") break elif median_tokens < self.min_split_tokens: - high = self.calculated_threshold - self.threshold_adjustment + high = calculated_threshold - self.threshold_adjustment logger.debug(f"Iteration {iteration}: Adjusting high to {high}") else: - low = self.calculated_threshold + self.threshold_adjustment + low = calculated_threshold + self.threshold_adjustment logger.debug(f"Iteration {iteration}: Adjusting low to {low}") iteration += 1 logger.debug( - f"Optimal threshold {self.calculated_threshold} found " + f"Optimal threshold {calculated_threshold} found " f"with median tokens ({median_tokens}) in target range " f"({self.min_split_tokens}-{self.max_split_tokens})." ) - return self.calculated_threshold + return calculated_threshold def _split_documents( self, docs: List[str], split_indices: List[int], similarities: List[float] @@ -440,7 +469,7 @@ def _split_documents( ) logger.debug( f"Chunk finalized with {current_tokens_count} tokens due to " - f"threshold {self.calculated_threshold}." 
+ f"threshold {triggered_score}." ) current_split, current_tokens_count = [], 0 chunks_by_threshold += 1 @@ -528,6 +557,7 @@ def plot_similarity_scores( similarities: List[float], split_indices: List[int], chunks: list[Chunk], + calculated_threshold: float, ): try: from matplotlib import pyplot as plt @@ -550,7 +580,7 @@ def plot_similarity_scores( label="Chunk" if split_index == split_indices[0] else "", ) axs[0].axhline( - y=self.calculated_threshold, + y=calculated_threshold, color="g", linestyle="-.", label="Threshold Similarity Score", @@ -569,8 +599,7 @@ def plot_similarity_scores( axs[0].set_xlabel("Document Segment Index") axs[0].set_ylabel("Similarity Score") axs[0].set_title( - f"Threshold: {self.calculated_threshold} |" - f" Window Size: {self.window_size}", + f"Threshold: {calculated_threshold} |" f" Window Size: {self.window_size}", loc="right", fontsize=10, ) diff --git a/semantic_chunkers/utils/text.py b/semantic_chunkers/utils/text.py index 8a306b1..b19c723 100644 --- a/semantic_chunkers/utils/text.py +++ b/semantic_chunkers/utils/text.py @@ -1,7 +1,55 @@ +import asyncio +from functools import wraps import tiktoken +import time +from semantic_chunkers.utils.logger import logger def tiktoken_length(text: str) -> int: tokenizer = tiktoken.get_encoding("cl100k_base") tokens = tokenizer.encode(text, disallowed_special=()) return len(tokens) + + +def time_it(func): + async def async_wrapper(*args, **kwargs): + start_time = time.time() + result = await func(*args, **kwargs) # Await the async function + end_time = time.time() + logger.debug(f"{func.__name__} duration: {end_time - start_time:.2f} seconds") + return result + + def sync_wrapper(*args, **kwargs): + start_time = time.time() + result = func(*args, **kwargs) # Call the sync function directly + end_time = time.time() + logger.debug(f"{func.__name__} duration: {end_time - start_time:.2f} seconds") + return result + + if asyncio.iscoroutinefunction(func): + return async_wrapper + else: + return sync_wrapper + + +def async_retry_with_timeout(retries=3, timeout=10): + def decorator(func): + @wraps(func) + async def wrapper(*args, **kwargs): + for attempt in range(retries): + try: + return await asyncio.wait_for(func(*args, **kwargs), timeout) + except asyncio.TimeoutError: + logger.warning( + f"Timeout on attempt {attempt+1} for {func.__name__}" + ) + except Exception as e: + logger.error( + f"Exception on attempt {attempt+1} for {func.__name__}: {e}" + ) + if attempt == retries - 1: + raise + else: + await asyncio.sleep(2**attempt) # Exponential backoff + return wrapper + return decorator From 4252440abe2c1f02af92be4659bf037bde081b9b Mon Sep 17 00:00:00 2001 From: Simonas <20096648+simjak@users.noreply.github.com> Date: Wed, 3 Jul 2024 16:58:16 +0300 Subject: [PATCH 2/3] chore: lint --- docs/00-chunkers-intro.ipynb | 12 ++++++------ semantic_chunkers/chunkers/statistical.py | 8 ++++++-- semantic_chunkers/utils/text.py | 6 +++++- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/docs/00-chunkers-intro.ipynb b/docs/00-chunkers-intro.ipynb index b915a1d..0ca7cfb 100644 --- a/docs/00-chunkers-intro.ipynb +++ b/docs/00-chunkers-intro.ipynb @@ -259,20 +259,20 @@ "from IPython.display import display, HTML\n", "\n", "# Predefined list of colors\n", - "colors = ['000000', 'FF0000', '800080', '008000', '0000FF']\n", + "colors = [\"000000\", \"FF0000\", \"800080\", \"008000\", \"0000FF\"]\n", "\n", "html_str = '
'\n", "html_str += '
'\n", - "html_str += '

Synchronous Chunks</h2>

'\n", + "html_str += \"

Synchronous Chunks</h2>

\"\n", "for chunk in chunks[0]:\n", " chunk_text = \"\"\n", " for split in chunk.splits:\n", " chunk_text += split\n", " color = random.choice(colors)\n", " html_str += f'

{chunk_text}</div>

'\n", - "html_str += '
'\n", + "html_str += \"
\"\n", "html_str += '
'\n", - "html_str += '

Asynchronous Chunks</h2>

'\n", + "html_str += \"

Asynchronous Chunks</h2>

\"\n", "for chunk in chunks_async[0]:\n", " chunk_text = \"\"\n", " try:\n", @@ -282,8 +282,8 @@ " print(f\"Error Chunk: {chunk}\")\n", " color = random.choice(colors)\n", " html_str += f'

{chunk_text}</div>

'\n", - "html_str += '
'\n", - "html_str += ''\n", + "html_str += \"\"\n", + "html_str += \"\"\n", "\n", "# Display the HTML\n", "display(HTML(html_str))" diff --git a/semantic_chunkers/chunkers/statistical.py b/semantic_chunkers/chunkers/statistical.py index d0bf0dc..77e29de 100644 --- a/semantic_chunkers/chunkers/statistical.py +++ b/semantic_chunkers/chunkers/statistical.py @@ -11,7 +11,11 @@ from semantic_chunkers.splitters.base import BaseSplitter from semantic_chunkers.splitters.sentence import RegexSplitter from semantic_chunkers.utils.logger import logger -from semantic_chunkers.utils.text import async_retry_with_timeout, tiktoken_length, time_it +from semantic_chunkers.utils.text import ( + async_retry_with_timeout, + tiktoken_length, + time_it, +) @dataclass @@ -212,7 +216,7 @@ async def _process_batch(batch_splits: List[str]): if self.encoder.score_threshold else self.DEFAULT_THRESHOLD ) - + split_indices = self._find_split_indices( similarities=similarities, calculated_threshold=calculated_threshold ) diff --git a/semantic_chunkers/utils/text.py b/semantic_chunkers/utils/text.py index b19c723..379d454 100644 --- a/semantic_chunkers/utils/text.py +++ b/semantic_chunkers/utils/text.py @@ -1,7 +1,9 @@ import asyncio +import time from functools import wraps + import tiktoken -import time + from semantic_chunkers.utils.logger import logger @@ -51,5 +53,7 @@ async def wrapper(*args, **kwargs): raise else: await asyncio.sleep(2**attempt) # Exponential backoff + return wrapper + return decorator From 5c85749eab14770d47300cec4e25554d190c5e81 Mon Sep 17 00:00:00 2001 From: Simonas <20096648+simjak@users.noreply.github.com> Date: Wed, 3 Jul 2024 20:03:27 +0300 Subject: [PATCH 3/3] chore: pr comment fixes --- docs/00-chunkers-intro.ipynb | 1628 ++++++++++++++++++--- docs/02-chunkers-async.ipynb | 1622 ++++++++++++++++++++ semantic_chunkers/chunkers/statistical.py | 2 +- 3 files changed, 3040 insertions(+), 212 deletions(-) create mode 100644 docs/02-chunkers-async.ipynb diff --git a/docs/00-chunkers-intro.ipynb b/docs/00-chunkers-intro.ipynb index 0ca7cfb..be81a7e 100644 --- a/docs/00-chunkers-intro.ipynb +++ b/docs/00-chunkers-intro.ipynb @@ -40,52 +40,17 @@ "In this example, we will stick with a simple RAG-focused example. We will learn about three different types of chunkers available to us; `StatisticalChunker`, `ConsecutiveChunker`, and `CumulativeChunker`. To begin, we need some data." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note:** by using the [async methods here]([link](https://github.com/aurelio-labs/semantic-chunkers/blob/main/docs/02-chunkers-async.ipynb)) docs can be processed *40x* faster." 
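A rough usage sketch for the async path referenced in the note above, assuming the same `StatisticalChunker` setup and the `OpenAIEncoder` from `semantic_router` used elsewhere in these docs (the sample text and the `main` wrapper are placeholders; inside a notebook you can simply `await chunker.acall(docs=[content])`):

```python
import asyncio

from semantic_chunkers import StatisticalChunker
from semantic_router.encoders import OpenAIEncoder

# Placeholder document; in the notebook this is the `content` string built earlier.
content = "Foundation models are almost universally based on the Transformer architecture. " * 40

encoder = OpenAIEncoder()  # expects OPENAI_API_KEY in the environment
chunker = StatisticalChunker(encoder=encoder)

async def main():
    # acall() encodes the batches concurrently, which is where the speed-up
    # over the synchronous chunker(docs=[content]) call comes from.
    return await chunker.acall(docs=[content])

chunks_async = asyncio.run(main())
print(f"{len(chunks_async[0])} chunks")
```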
+ ] + }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 341, - "referenced_widgets": [ - "1180440b21b74b3da04a8a876a135658", - "8489e5ef0f384d6480e8e2c5c60fdd1d", - "52661abd5e3c460eb54489d1e3515d69", - "f3c981b961eb4185b995dc4e3b5d3919", - "b987413b65ad4b10a7f7759c9dcc1c2f", - "75159166d8254cde933b4131e4627b8f", - "d5c5e31059e94176be64f32dd2496164", - "b59d191a2c0e498e80494f9384de7dfb", - "e57af473700d4bc2bbea39eca36ece8c", - "be2cbe5e3ea94a9f81b9e370957e63b7", - "1707a76c176a4e1d96baa4a4ed9e7d99", - "8b640a38e82e4f0598fa6ed382888499", - "3321078273154d2bb3b6a9189d97806c", - "ce12ce0393464fd7a533123d3c71a3e6", - "0ec1edf815164f5f82001efa06857553", - "f0f1642352f14137a19ee27513f18ca6", - "d51e32f5700b40cea7512aded4d6d019", - "a27eb006ba8c4d69a70d8be4e3c280a2", - "46577f9ba4e142009d0aebb6741d6e84", - "e430ed4d66604027812b1e39e1fc000e", - "272c61542ae8454b9eb616e5e2a858e3", - "2a19c3d693834de786396120d19a65b3", - "c59101e643d34bc5a4ad4fa664064614", - "7b3688583754445ab07bf28254fcc97d", - "f7870f32ab2d4d938fc5ba85aecf7f51", - "bae4b7ef05ea49b6bdb2a0e369e6768b", - "20740edade1a44c29f4d4094ed487f00", - "1bc01711294a49b4a81475e547874514", - "4b5d04efea944bbc8659b4f56f22ed69", - "1d5846c892724b8cab97693ec1c1cd33", - "f4e26036353d452885f364eb8c2b241e", - "29f5562abb0e4383956f18652833defe", - "b6fc89b557b44064b2e17829347bc951" - ] - }, - "id": "aTN4gsdl2WBQ", - "outputId": "bd606fad-8214-4fd4-cad1-54bb86234575" - }, + "execution_count": 22, + "metadata": {}, "outputs": [ { "data": { @@ -96,7 +61,7 @@ "})" ] }, - "execution_count": 14, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -110,7 +75,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 23, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -145,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -165,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 25, "metadata": { "id": "Mqnc35w85A8L" }, @@ -200,7 +165,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -211,22 +176,20 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-07-03 16:41:12 INFO semantic_chunkers.utils.logger Single document exceeds the maximum token limit of 300. Splitting to sentences before semantically merging.\u001b[0m\n", - "100%|██████████| 6/6 [00:05<00:00, 1.08it/s]\n", - "\u001b[32m2024-07-03 16:41:17 INFO semantic_chunkers.utils.logger Single document exceeds the maximum token limit of 300. Splitting to sentences before semantically merging.\u001b[0m\n" + "\u001b[32m2024-07-03 19:57:53 INFO semantic_chunkers.utils.logger Single document exceeds the maximum token limit of 300. Splitting to sentences before semantically merging.\u001b[0m\n", + "100%|██████████| 6/6 [00:04<00:00, 1.46it/s]\n" ] } ], "source": [ - "chunks = chunker(docs=[content])\n", - "chunks_async = await chunker.acall(docs=[content])" + "chunks = chunker(docs=[content])" ] }, { @@ -238,60 +201,7 @@ }, { "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "

Synchronous Chunks

# Mamba:Linear-Time Sequence Modeling with Selective State Spaces# Albert Gu*1 and Tri Dao*21Machine Learning Department, Carnegie Mellon University 2Department of Computer Science, Princeton University agu@cs.cmu.edu, tri@tridao.me# AbstractFoundation models, now powering most of the exciting applications in deep learning, are almost universally based on the Transformer architecture and its core attention module.Many subquadratic-time architectures such as linear attention, gated convolution and recurrent models, and structured state space models (SSMs) have been developed to address Transformersâcomputational ineï¬ciency on long sequences, but they have not performed as well as attention on important modalities such as language.We identify that a key weakness of such models is their inability to perform content-based reasoning, and make several improvements.First, simply letting the SSM parameters be functions of the input addresses their weakness with discrete modalities, allowing the model to selectively propagate or forget information along the sequence length dimension depending on the current token.Second, even though this change prevents the use of eï¬cient convolutions, we design a hardware-aware parallel algorithm in recurrent mode.We integrate these selective SSMs into a simpliï¬ed end-to-end neural network architecture without attention or even MLP blocks (Mamba).Mamba enjoys fast inference (5Ãhigher throughput than Transformers) and linear scaling in sequence length, and its performance improves on real data up to million-length sequences.

As a general sequence model backbone, Mamba achieves state-of-the-art performance across several modalities such as language, audio, and genomics.On language modeling, our Mamba-3B model outperforms Transformers of the same size and matches Transformers twice its size, both in pretraining and downstream evaluation.# 1 IntroductionFoundation models (FMs), or large models pretrained on massive data then adapted for downstream tasks, have emerged as an eï¬ective paradigm in modern machine learning.The backbone of these FMs are often sequence models, operating on arbitrary sequences of inputs from a wide variety of domains such as language, images, speech, audio, time series, and genomics (Brown et al. 2020; Dosovitskiy et al. 2020; Ismail Fawaz et al. 2019; Oord et al. 2016; Poli et al. 2023; Sutskever, Vinyals, and Quoc V Le 2014).While this concept is agnostic to a particular choice of model architecture, modern FMs are predominantly based on a single type of sequence model: the Transformer (Vaswani et al. 2017) and its core attention layer (Bahdanau, Cho, and Bengio 2015) The eï¬cacy of self-attention is attributed to its ability to route information densely within a context window, allowing it to model complex data.

However, this property brings fundamental drawbacks: an inability to model anything outside of a ï¬nite window, and quadratic scaling with respect to the window length.An enormous body of research has appeared on more eï¬cient variants of attention to overcome these drawbacks (Tay, Dehghani, Bahri, et al. 2022), but often at the expense of the very properties that makes it eï¬ective.As of yet, none of these variants have been shown to be empirically eï¬ective at scale across domains.

Recently, structured state space sequence models (SSMs) (Gu, Goel, and Ré 2022; Gu, Johnson, Goel, et al. 2021) have emerged as a promising class of architectures for sequence modeling.These models can be interpreted as a combination of recurrent neural networks (RNNs) and convolutional neural networks (CNNs), with inspiration from classical state space models (Kalman 1960).This class of models can be computed very eï¬ciently as either a recurrence or convolution, with linear or near-linear scaling in sequence length.

Additionally, they have principledEqual contribution.1mechanisms for modeling long-range dependencies (Gu, Dao, et al. 2020) in certain data modalities, and have dominated benchmarks such as the Long Range Arena (Tay, Dehghani, Abnar, et al. 2021).Many ï¬avors of SSMs (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Y.Li et al. 2023; Ma et al. 2023; Orvieto et al. 2023; Smith, Warrington, and Linderman 2023) have been successful in domains involving continuous signal data such as audio and vision (Goel et al. 2022; Nguyen, Goel, et al. 2022; Saon, Gupta, and Cui 2023).However, they have been less eï¬ective at modeling discrete and information-dense data such as text.We propose a new class of selective state space models, that improves on prior work on several axes to achieve the modeling power of Transformers while scaling linearly in sequence length.

Selection Mechanism.First, we identify a key limitation of prior models: the ability to eï¬ciently select data in an input-dependent manner (i.e. focus on or ignore particular inputs).Building on intuition based on important synthetic tasks such as selective copy and induction heads, we design a simple selection mechanism by parameterizing the SSM parameters based on the input.This allows the model to ï¬lter out irrelevant information and remember relevant information indeï¬nitely.Hardware-aware Algorithm.This simple change poses a technical challenge for the computation of the model; in fact, all prior SSMs models must be time- and input-invariant in order to be computationally eï¬cient.We overcome this with a hardware-aware algorithm that computes the model recurrently with a scan instead of convolution, but does not materialize the expanded state in order to avoid IO access between diï¬erent levels of the GPU memory hierarchy.The resulting implementation is faster than previous methods both in theory (scaling linearly in sequence length, compared to pseudo-linear for all convolution-based SSMs) and on modern hardware (up to 3Ãfaster on A100 GPUs).

Architecture.We simplify prior deep sequence model architectures by combining the design of prior SSM architectures (Dao, Fu, Saab, et al. 2023) with the MLP block of Transformers into a single block, leading to a simple and homogenous architecture design (Mamba) incorporating selective state spaces.Selective SSMs, and by extension the Mamba architecture, are fully recurrent models with key properties that make them suitable as the backbone of general foundation models operating on sequences. (i) High quality: selectivity brings strong performance on dense modalities such as language and genomics. (ii) Fast training and inference: computation and memory scales linearly in sequence length during training, and unrolling the model autoregressively during inference requires only constant time per step since it does not require a cache of previous elements. (iii) Long context: the quality and eï¬ciency together yield performance improvements on real data up to sequence length 1M.We empirically validate Mambaâs potential as a general sequence FM backbone, in both pretraining quality and domain-speciï¬c task performance, on several types of modalities and settings:

⢠Synthetics.On important synthetic tasks such as copying and induction heads that have been proposed as being key to large language models, Mamba not only solves them easily but can extrapolate solutions indeï¬nitely long (>1M tokens).⢠Audio and Genomics.Mamba out-performs prior state-of-the-art models such as SaShiMi, Hyena, and Transform- ers on modeling audio waveforms and DNA sequences, both in pretraining quality and downstream metrics (e.g. reducing FID on a challenging speech generation dataset by more than half).In both settings, its performance improves with longer context up to million-length sequences.⢠Language Modeling.

Mamba is the ï¬rst linear-time sequence model that truly achieves Transformer-quality performance, both in pretraining perplexity and downstream evaluations.With scaling laws up to 1B parameters, we show that Mamba exceeds the performance of a large range of baselines, including very strong modern Transformer training recipes based on LLaMa (Touvron et al. 2023).Our Mamba language model has 5Ãgeneration throughput compared to Transformers of similar size, and Mamba-3Bâs quality matches that of Transformers twice its size (e.g. 4 points higher avg. on common sense reasoning compared to Pythia-3B and even exceeding Pythia-7B).Model code and pre-trained checkpoints are open-sourced at https://github.com/state-spaces/mamba.

2# Selective State Space Model# with Hardware-aware State Expansion# Avuvy GPU SRAM Selection Mechanism esSelection MechanismFigure 1: (Overview.) Structured SSMs independently map each channel (e.g. ð· = 5) of an input ð¥ to output ð¦ through a higher dimensional latent state â(e.g. ð= 4).Prior SSMs avoid materializing this large effective state (ð·ð, times batch size ðµ and sequence length ð¿) through clever alternate computation paths requiring time-invariance: the (â, A, B, C) parameters are constant across time.Our selection mechanism adds back input-dependent dynamics, which also requires a careful hardware-aware algorithm to only materialize the expanded states in more efficient levels of the GPU memory hierarchy.# 2 State Space ModelsStructured state space sequence models (S4) are a recent class of sequence models for deep learning that are broadly related to RNNs, and CNNs, and classical state space models.They are inspired by a particular continuous system (1) that maps a 1-dimensional function or sequence ð

¥(ð¡) ââ⦠ð¦(ð¡) ââthrough an implicit latent state â(ð¡) ââð.Concretely, S4 models are deï¬ned with four parameters (â, A, B, C), which deï¬ne a sequence-to-sequence trans- formation in two stages.ââ²(ð¡) = Aâ(ð¡) + Bð¥(ð¡) ð¦(ð¡) = Câ(ð¡)(1a) (1b) âð¡ = Aâð¡â1 + Bð¥ð¡ ð¦ð¡ = Câð¡ (2a) (2b) ðð² = (Cð©, Cð¨ð©, ⦠, Cð¨ ð¦ = ð¥ âð² ð©, â

¦ ) (3a) (3b)Discretization.The ï¬rst stage transforms the âcontinuous parametersâ(â, A, B) to âdiscrete parametersâ(A, B) through ï¬xed formulas A = ðð´(â, A) and B = ððµ(â, A, B), where the pair (ðð´, ððµ) is called a discretization rule.Various rules can be used such as the zero-order hold (ZOH) deï¬

ned in equation (4).A = exp(âA) B = (âA)â1(exp(âA) âI) ââB (4)Discretization has deep connections to continuous-time systems which can endow them with additional properties such as resolution invariance (Nguyen, Goel, et al. 2022) and automatically ensuring that the model is properly normalized (Gu, Johnson, Timalsina, et al. 2023; Orvieto et al. 2023).

It also has connections to gating mechanisms of RNNs (Gu, Gulcehre, et al. 2020; Tallec and Ollivier 2018) which we will revisit in Section 3.5.However, from a mechanical point of view discretization can simply be viewed as the ï¬rst step of the computation graph in the forward pass of an SSM.Alternate ï¬avors of SSMs can bypass the discretization step and parameterize (A, B) directly instead (Zhang et al. 2023), which may be easier to reason about.Computation.After the parameters have been transformed from (â, A, B, C) ⦠(A, B, C), the model can be computed in two ways, either as a linear recurrence (2) or a global convolution (3).3Commonly, the model uses the convolutional mode (3) for eï¬cient parallelizable training (where the whole input sequence is seen ahead of time), and switched into recurrent mode (2) for eï¬cient autoregressive inference (where the inputs are seen one timestep at a time).Linear Time Invariance (LTI).An important property of equations (1) to (3) is that the modelâs dynamics are constant through time.

In other words (â, A, B, C), and consequently (A, B) as well, are ï¬xed for all time-steps.This property is called linear time invariance (LTI), which is deeply connected to recurrence and convolutions.Informally, we think of LTI SSMs as being equivalent to any linear recurrence (2a) or convolution (3b), and use LTI as an umbrella term for these classes of models.Thus far, all structured SSMs have been LTI (e.g. computed as convolutions) because of fundamental eï¬ciency constraints, discussed in Section 3.3.However, a core insight of this work is that LTI models have fundamental limitations in modeling certain types of data, and our technical contributions involve removing the LTI constraint while overcoming the eï¬ciency bottlenecks.

Structure and Dimensions.Finally, we note that structured SSMs are so named because computing them eï¬ciently also requires imposing structure on the A matrix.The most popular form of structure is diagonal (Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Smith, Warrington, and Linderman 2023), which we also use.In this case, the A ââðÃð, B ââðÃ1, C ââ1Ãðmatrices can all be represented by ðnumbers.To operate over an input sequence ð¥ of batch size ðµ and length ð¿ with ð

· channels, the SSM is applied independently to each channel.Note that in this case, the total hidden state has dimension ð·ðper input, and computing it over the sequence length requires ð(ðµð¿ð·ð) time and memory; this is the root of the fundamental eï¬ciency bottleneck addressed in Section 3.3.General State Space Models.We note that the term state space model has a very broad meaning which simply represents the notion of any recurrent process with a latent state.It has been used to refer to many disparate concepts in diï¬erent disciplines, including Markov decision processes (MDP) (reinforcement learning (Hafner et al. 2020)), dynamic causal modeling (DCM) (computational neuroscience (Friston, Harrison, and Penny 2003)), Kalman ï¬lters (controls (Kalman 1960)), hidden Markov models (HMM) and linear dynamical systems (LDS) (machine learning), and recurrent (and sometimes convolutional) models at large (deep learning).Throughout this entire paper we use the term â

SSMâto refer exclusively to the class of structured SSMs or S4 models (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Hasani et al. 2023; Ma et al. 2023; Smith, Warrington, and Linderman 2023) and use these terms interchangeably.For convenience we may also include derivatives of such models, such as those focusing on either the linear-recurrence or global-convolution viewpoints (Y.Li et al. 2023; Orvieto et al. 2023; Poli et al. 2023), and clarify nuances when necessary.SSM Architectures.SSMs are standalone sequence transformations that can be incorporated into end-to-end neural network architectures. (We also sometimes call SSM architectures SSNNs, which are to SSM layers as CNNs are to linear convolution layers.) We discuss some of the most well-known SSM architectures, many of which will also serve as our primary baselines.

⢠Linear attention (Katharopoulos et al. 2020) is an approximation of self-attention involving a recurrence which can be viewed as a degenerate linear SSM.⢠H3 (Dao, Fu, Saab, et al. 2023) generalized this recurrence to use S4; it can be viewed as an architecture with an SSM sandwiched by two gated connections (Figure 3).H3 also inserts a standard local convolution, which they frame as a shift-SSM, before the main SSM layer.⢠Hyena (Poli et al. 2023) uses the same architecture as H3 but replaces the S4 layer with an MLP-parameterized global convolution (Romero et al. 2021).â

¢ RetNet (Y.Sun et al. 2023) adds an additional gate to the architecture and uses a simpler SSM, allowing an alternative parallelizable computation path, using a variant of multi-head attention (MHA) instead of convolutions.4⢠RWKV (B.Peng et al. 2023) is a recent RNN designed for language modeling based on another linear attention approximation (attention-free Transformer (S.Zhai et al. 2021)).Its main âWKVâ

mechanism involves LTI recurrences and can be viewed as the ratio of two SSMs.Other closely related SSMs and architectures are discussed further in an extended related work (Appendix B).We highlight in particular S5 (Smith, Warrington, and Linderman 2023), QRNN (Bradbury et al. 2016), and SRU (Lei et al. 2017), which we view as the most closely related methods to our core selective SSM.# 3 Selective State Space ModelsWe motivate our selection mechanism using intuition from synthetic tasks (Section 3.1), then explain how to incorporate this mechanism into state space models (Section 3.2).The resulting time-varying SSMs cannot use convolutions, presenting a technical challenge of how to compute them eï¬

ciently.We overcome this with a hardware-aware algorithm that exploits the memory hierarchy on modern hardware (Section 3.3).We then describe a simple SSM architecture without attention or even MLP blocks (Section 3.4).Finally, we discuss some additional properties of selection mechanisms (Section 3.5).# 3.1 Motivation:Selection as a Means of CompressionWe argue that a fundamental problem of sequence modeling is compressing context into a smaller state.In fact, we can view the tradeoï¬s of popular sequence models from this point of view.For example, attention is both eï¬ective and ineï¬cient because it explicitly does not compress context at all.This can be seen from the fact that autoregressive inference requires explicitly storing the entire context (i.e. the KV cache), which directly causes the slow linear-time inference and quadratic-time training of Transformers.On the other hand, recurrent models are eï¬cient because they have a ï¬nite state, implying constant-time inference and linear-time training.However, their eï¬ectiveness is limited by how well this state has compressed the context.To understand this principle, we focus on two running examples of synthetic tasks (Figure 2).

⢠The Selective Copying task modiï¬es the popular Copying task (Arjovsky, Shah, and Bengio 2016) by varying the position of the tokens to memorize.It requires content-aware reasoning to be able to memorize the relevant tokens (colored) and ï¬lter out the irrelevant ones (white).⢠The Induction Heads task is a well-known mechanism hypothesized to explain the majority of in-context learning abilities of LLMs (Olsson et al. 2022).It requires context-aware reasoning to know when to produce the correct output in the appropriate context (black).These tasks reveal the failure mode of LTI models.From the recurrent view, their constant dynamics (e.g. the (A, B) transitions in (2)) cannot let them select the correct information from their context, or aï¬ect the hidden state passed along the sequence an in input-dependent way.From the convolutional view, it is known that global convolutions can solve the vanilla Copying task (Romero et al. 2021) because it only requires time-awareness, but that they have diï¬culty with the Selective Copying task because of lack of content-awareness (Figure 2).More concretely, the spacing between inputs-to-outputs is varying and cannot be modeled by static convolution kernels.

In summary, the eï¬ciency vs. eï¬ectiveness tradeoï¬of sequence models is characterized by how well they compress their state: eï¬cient models must have a small state, while eï¬ective models must have a state that contains all necessary information from the context.In turn, we propose that a fundamental principle for building sequence models is selectivity: or the context-aware ability to focus on or ï¬lter out inputs into a sequential state.In particular, a selection mechanism controls how information propagates or interacts along the sequence dimension (see Section 3.5 for more discussion).

# Improving SSMs with SelectionOne method of incorporating a selection mechanism into models is by letting their parameters that aï¬ect interactions along the sequence (e.g. the recurrent dynamics of an RNN or the c

Asynchronous Chunks

# Mamba:Linear-Time Sequence Modeling with Selective State Spaces# Albert Gu*1 and Tri Dao*21Machine Learning Department, Carnegie Mellon University 2Department of Computer Science, Princeton University agu@cs.cmu.edu, tri@tridao.me# AbstractFoundation models, now powering most of the exciting applications in deep learning, are almost universally based on the Transformer architecture and its core attention module.Many subquadratic-time architectures such as linear attention, gated convolution and recurrent models, and structured state space models (SSMs) have been developed to address Transformersâcomputational ineï¬ciency on long sequences, but they have not performed as well as attention on important modalities such as language.We identify that a key weakness of such models is their inability to perform content-based reasoning, and make several improvements.First, simply letting the SSM parameters be functions of the input addresses their weakness with discrete modalities, allowing the model to selectively propagate or forget information along the sequence length dimension depending on the current token.Second, even though this change prevents the use of eï¬cient convolutions, we design a hardware-aware parallel algorithm in recurrent mode.We integrate these selective SSMs into a simpliï¬ed end-to-end neural network architecture without attention or even MLP blocks (Mamba).Mamba enjoys fast inference (5Ãhigher throughput than Transformers) and linear scaling in sequence length, and its performance improves on real data up to million-length sequences.

As a general sequence model backbone, Mamba achieves state-of-the-art performance across several modalities such as language, audio, and genomics.On language modeling, our Mamba-3B model outperforms Transformers of the same size and matches Transformers twice its size, both in pretraining and downstream evaluation.# 1 IntroductionFoundation models (FMs), or large models pretrained on massive data then adapted for downstream tasks, have emerged as an eï¬ective paradigm in modern machine learning.The backbone of these FMs are often sequence models, operating on arbitrary sequences of inputs from a wide variety of domains such as language, images, speech, audio, time series, and genomics (Brown et al. 2020; Dosovitskiy et al. 2020; Ismail Fawaz et al. 2019; Oord et al. 2016; Poli et al. 2023; Sutskever, Vinyals, and Quoc V Le 2014).While this concept is agnostic to a particular choice of model architecture, modern FMs are predominantly based on a single type of sequence model: the Transformer (Vaswani et al. 2017) and its core attention layer (Bahdanau, Cho, and Bengio 2015) The eï¬cacy of self-attention is attributed to its ability to route information densely within a context window, allowing it to model complex data.

However, this property brings fundamental drawbacks: an inability to model anything outside of a ï¬nite window, and quadratic scaling with respect to the window length.An enormous body of research has appeared on more eï¬cient variants of attention to overcome these drawbacks (Tay, Dehghani, Bahri, et al. 2022), but often at the expense of the very properties that makes it eï¬ective.As of yet, none of these variants have been shown to be empirically eï¬ective at scale across domains.

Recently, structured state space sequence models (SSMs) (Gu, Goel, and Ré 2022; Gu, Johnson, Goel, et al. 2021) have emerged as a promising class of architectures for sequence modeling.These models can be interpreted as a combination of recurrent neural networks (RNNs) and convolutional neural networks (CNNs), with inspiration from classical state space models (Kalman 1960).This class of models can be computed very eï¬ciently as either a recurrence or convolution, with linear or near-linear scaling in sequence length.

Additionally, they have principledEqual contribution.1mechanisms for modeling long-range dependencies (Gu, Dao, et al. 2020) in certain data modalities, and have dominated benchmarks such as the Long Range Arena (Tay, Dehghani, Abnar, et al. 2021).Many ï¬avors of SSMs (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Y.Li et al. 2023; Ma et al. 2023; Orvieto et al. 2023; Smith, Warrington, and Linderman 2023) have been successful in domains involving continuous signal data such as audio and vision (Goel et al. 2022; Nguyen, Goel, et al. 2022; Saon, Gupta, and Cui 2023).However, they have been less eï¬ective at modeling discrete and information-dense data such as text.We propose a new class of selective state space models, that improves on prior work on several axes to achieve the modeling power of Transformers while scaling linearly in sequence length.

Selection Mechanism.First, we identify a key limitation of prior models: the ability to eï¬ciently select data in an input-dependent manner (i.e. focus on or ignore particular inputs).Building on intuition based on important synthetic tasks such as selective copy and induction heads, we design a simple selection mechanism by parameterizing the SSM parameters based on the input.This allows the model to ï¬lter out irrelevant information and remember relevant information indeï¬nitely.Hardware-aware Algorithm.This simple change poses a technical challenge for the computation of the model; in fact, all prior SSMs models must be time- and input-invariant in order to be computationally eï¬cient.We overcome this with a hardware-aware algorithm that computes the model recurrently with a scan instead of convolution, but does not materialize the expanded state in order to avoid IO access between diï¬erent levels of the GPU memory hierarchy.The resulting implementation is faster than previous methods both in theory (scaling linearly in sequence length, compared to pseudo-linear for all convolution-based SSMs) and on modern hardware (up to 3Ãfaster on A100 GPUs).

Architecture.We simplify prior deep sequence model architectures by combining the design of prior SSM architectures (Dao, Fu, Saab, et al. 2023) with the MLP block of Transformers into a single block, leading to a simple and homogenous architecture design (Mamba) incorporating selective state spaces.Selective SSMs, and by extension the Mamba architecture, are fully recurrent models with key properties that make them suitable as the backbone of general foundation models operating on sequences. (i) High quality: selectivity brings strong performance on dense modalities such as language and genomics. (ii) Fast training and inference: computation and memory scales linearly in sequence length during training, and unrolling the model autoregressively during inference requires only constant time per step since it does not require a cache of previous elements. (iii) Long context: the quality and eï¬ciency together yield performance improvements on real data up to sequence length 1M.

We empirically validate Mambaâs potential as a general sequence FM backbone, in both pretraining quality and domain-speciï¬c task performance, on several types of modalities and settings:⢠Synthetics.On important synthetic tasks such as copying and induction heads that have been proposed as being key to large language models, Mamba not only solves them easily but can extrapolate solutions indeï¬nitely long (>1M tokens).⢠Audio and Genomics.Mamba out-performs prior state-of-the-art models such as SaShiMi, Hyena, and Transform- ers on modeling audio waveforms and DNA sequences, both in pretraining quality and downstream metrics (e.g. reducing FID on a challenging speech generation dataset by more than half).In both settings, its performance improves with longer context up to million-length sequences.⢠Language Modeling.

Mamba is the ï¬rst linear-time sequence model that truly achieves Transformer-quality performance, both in pretraining perplexity and downstream evaluations.With scaling laws up to 1B parameters, we show that Mamba exceeds the performance of a large range of baselines, including very strong modern Transformer training recipes based on LLaMa (Touvron et al. 2023).Our Mamba language model has 5Ãgeneration throughput compared to Transformers of similar size, and Mamba-3Bâs quality matches that of Transformers twice its size (e.g. 4 points higher avg. on common sense reasoning compared to Pythia-3B and even exceeding Pythia-7B).Model code and pre-trained checkpoints are open-sourced at https://github.com/state-spaces/mamba.

2# Selective State Space Model# with Hardware-aware State Expansion# Avuvy GPU SRAM Selection Mechanism esSelection MechanismFigure 1: (Overview.) Structured SSMs independently map each channel (e.g. ð· = 5) of an input ð¥ to output ð¦ through a higher dimensional latent state â(e.g. ð= 4).Prior SSMs avoid materializing this large effective state (ð·ð, times batch size ðµ and sequence length ð¿) through clever alternate computation paths requiring time-invariance: the (â, A, B, C) parameters are constant across time.Our selection mechanism adds back input-dependent dynamics, which also requires a careful hardware-aware algorithm to only materialize the expanded states in more efficient levels of the GPU memory hierarchy.# 2 State Space ModelsStructured state space sequence models (S4) are a recent class of sequence models for deep learning that are broadly related to RNNs, and CNNs, and classical state space models.They are inspired by a particular continuous system (1) that maps a 1-dimensional function or sequence ð

¥(ð¡) ââ⦠ð¦(ð¡) ââthrough an implicit latent state â(ð¡) ââð.Concretely, S4 models are deï¬ned with four parameters (â, A, B, C), which deï¬ne a sequence-to-sequence trans- formation in two stages.ââ²(ð¡) = Aâ

(ð¡) + Bð¥(ð¡) ð¦(ð¡) = Câ(ð¡)(1a) (1b) âð¡ = Aâð¡â1 + Bð¥ð¡ ð¦ð¡ = Câð¡ (2a) (2b) ðð² = (Cð©, Cð¨ð©, ⦠, Cð¨ ð¦ = ð¥ âð² ð©, â

¦ ) (3a) (3b)Discretization.The ï¬rst stage transforms the âcontinuous parametersâ(â, A, B) to âdiscrete parametersâ(A, B) through ï¬xed formulas A = ðð´(â, A) and B = ððµ(â, A, B), where the pair (ðð´, ððµ) is called a discretization rule.Various rules can be used such as the zero-order hold (ZOH) deï¬

ned in equation (4).A = exp(âA) B = (âA)â1(exp(âA) âI) ââB (4)Discretization has deep connections to continuous-time systems which can endow them with additional properties such as resolution invariance (Nguyen, Goel, et al. 2022) and automatically ensuring that the model is properly normalized (Gu, Johnson, Timalsina, et al. 2023; Orvieto et al. 2023).

It also has connections to gating mechanisms of RNNs (Gu, Gulcehre, et al. 2020; Tallec and Ollivier 2018) which we will revisit in Section 3.5.

However, from a mechanical point of view discretization can simply be viewed as the ï¬rst step of the computation graph in the forward pass of an SSM.Alternate ï¬avors of SSMs can bypass the discretization step and parameterize (A, B) directly instead (Zhang et al. 2023), which may be easier to reason about.Computation.After the parameters have been transformed from (â, A, B, C) ⦠(A, B, C), the model can be computed in two ways, either as a linear recurrence (2) or a global convolution (3).3Commonly, the model uses the convolutional mode (3) for eï¬cient parallelizable training (where the whole input sequence is seen ahead of time), and switched into recurrent mode (2) for eï¬cient autoregressive inference (where the inputs are seen one timestep at a time).Linear Time Invariance (LTI).An important property of equations (1) to (3) is that the modelâs dynamics are constant through time.

In other words (â, A, B, C), and consequently (A, B) as well, are ï¬xed for all time-steps.This property is called linear time invariance (LTI), which is deeply connected to recurrence and convolutions.Informally, we think of LTI SSMs as being equivalent to any linear recurrence (2a) or convolution (3b), and use LTI as an umbrella term for these classes of models.Thus far, all structured SSMs have been LTI (e.g. computed as convolutions) because of fundamental eï¬ciency constraints, discussed in Section 3.3.However, a core insight of this work is that LTI models have fundamental limitations in modeling certain types of data, and our technical contributions involve removing the LTI constraint while overcoming the eï¬ciency bottlenecks.

Structure and Dimensions.Finally, we note that structured SSMs are so named because computing them eï¬ciently also requires imposing structure on the A matrix.The most popular form of structure is diagonal (Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Smith, Warrington, and Linderman 2023), which we also use.In this case, the A ââðÃð, B ââðÃ1, C ââ1Ãðmatrices can all be represented by ðnumbers.To operate over an input sequence ð¥ of batch size ðµ and length ð¿ with ð

· channels, the SSM is applied independently to each channel.Note that in this case, the total hidden state has dimension ð·ðper input, and computing it over the sequence length requires ð(ðµð¿ð·ð) time and memory; this is the root of the fundamental eï¬ciency bottleneck addressed in Section 3.3.General State Space Models.We note that the term state space model has a very broad meaning which simply represents the notion of any recurrent process with a latent state.It has been used to refer to many disparate concepts in diï¬erent disciplines, including Markov decision processes (MDP) (reinforcement learning (Hafner et al. 2020)), dynamic causal modeling (DCM) (computational neuroscience (Friston, Harrison, and Penny 2003)), Kalman ï¬lters (controls (Kalman 1960)), hidden Markov models (HMM) and linear dynamical systems (LDS) (machine learning), and recurrent (and sometimes convolutional) models at large (deep learning).Throughout this entire paper we use the term â

SSMâ

to refer exclusively to the class of structured SSMs or S4 models (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Hasani et al. 2023; Ma et al. 2023; Smith, Warrington, and Linderman 2023) and use these terms interchangeably.For convenience we may also include derivatives of such models, such as those focusing on either the linear-recurrence or global-convolution viewpoints (Y.Li et al. 2023; Orvieto et al. 2023; Poli et al. 2023), and clarify nuances when necessary.SSM Architectures.SSMs are standalone sequence transformations that can be incorporated into end-to-end neural network architectures. (We also sometimes call SSM architectures SSNNs, which are to SSM layers as CNNs are to linear convolution layers.) We discuss some of the most well-known SSM architectures, many of which will also serve as our primary baselines.

⢠Linear attention (Katharopoulos et al. 2020) is an approximation of self-attention involving a recurrence which can be viewed as a degenerate linear SSM.⢠H3 (Dao, Fu, Saab, et al. 2023) generalized this recurrence to use S4; it can be viewed as an architecture with an SSM sandwiched by two gated connections (Figure 3).H3 also inserts a standard local convolution, which they frame as a shift-SSM, before the main SSM layer.⢠Hyena (Poli et al. 2023) uses the same architecture as H3 but replaces the S4 layer with an MLP-parameterized global convolution (Romero et al. 2021).â

¢ RetNet (Y.Sun et al. 2023) adds an additional gate to the architecture and uses a simpler SSM, allowing an alternative parallelizable computation path, using a variant of multi-head attention (MHA) instead of convolutions.4⢠RWKV (B.Peng et al. 2023) is a recent RNN designed for language modeling based on another linear attention approximation (attention-free Transformer (S.Zhai et al. 2021)).Its main âWKVâ

mechanism involves LTI recurrences and can be viewed as the ratio of two SSMs.Other closely related SSMs and architectures are discussed further in an extended related work (Appendix B).We highlight in particular S5 (Smith, Warrington, and Linderman 2023), QRNN (Bradbury et al. 2016), and SRU (Lei et al. 2017), which we view as the most closely related methods to our core selective SSM.# 3 Selective State Space ModelsWe motivate our selection mechanism using intuition from synthetic tasks (Section 3.1), then explain how to incorporate this mechanism into state space models (Section 3.2).The resulting time-varying SSMs cannot use convolutions, presenting a technical challenge of how to compute them eï¬

ciently.We overcome this with a hardware-aware algorithm that exploits the memory hierarchy on modern hardware (Section 3.3).We then describe a simple SSM architecture without attention or even MLP blocks (Section 3.4).Finally, we discuss some additional properties of selection mechanisms (Section 3.5).# 3.1 Motivation:Selection as a Means of CompressionWe argue that a fundamental problem of sequence modeling is compressing context into a smaller state.In fact, we can view the tradeoï¬s of popular sequence models from this point of view.For example, attention is both eï¬ective and ineï¬cient because it explicitly does not compress context at all.This can be seen from the fact that autoregressive inference requires explicitly storing the entire context (i.e. the KV cache), which directly causes the slow linear-time inference and quadratic-time training of Transformers.On the other hand, recurrent models are eï¬cient because they have a ï¬nite state, implying constant-time inference and linear-time training.However, their eï¬ectiveness is limited by how well this state has compressed the context.To understand this principle, we focus on two running examples of synthetic tasks (Figure 2).

⢠The Selective Copying task modiï¬es the popular Copying task (Arjovsky, Shah, and Bengio 2016) by varying the position of the tokens to memorize.It requires content-aware reasoning to be able to memorize the relevant tokens (colored) and ï¬lter out the irrelevant ones (white).⢠The Induction Heads task is a well-known mechanism hypothesized to explain the majority of in-context learning abilities of LLMs (Olsson et al. 2022).It requires context-aware reasoning to know when to produce the correct output in the appropriate context (black).These tasks reveal the failure mode of LTI models.From the recurrent view, their constant dynamics (e.g. the (A, B) transitions in (2)) cannot let them select the correct information from their context, or aï¬ect the hidden state passed along the sequence an in input-dependent way.From the convolutional view, it is known that global convolutions can solve the vanilla Copying task (Romero et al. 2021) because it only requires time-awareness, but that they have diï¬culty with the Selective Copying task because of lack of content-awareness (Figure 2).More concretely, the spacing between inputs-to-outputs is varying and cannot be modeled by static convolution kernels.

In summary, the eï¬ciency vs. eï¬ectiveness tradeoï¬

of sequence models is characterized by how well they compress their state: eï¬cient models must have a small state, while eï¬ective models must have a state that contains all necessary information from the context.In turn, we propose that a fundamental principle for building sequence models is selectivity: or the context-aware ability to focus on or ï¬lter out inputs into a sequential state.In particular, a selection mechanism controls how information propagates or interacts along the sequence dimension (see Section 3.5 for more discussion).

# Improving SSMs with SelectionOne method of incorporating a selection mechanism into models is by letting their parameters that aï¬ect interactions along the sequence (e.g. the recurrent dynamics of an RNN or the c

" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import random\n", - "from IPython.display import display, HTML\n", - "\n", - "# Predefined list of colors\n", - "colors = [\"000000\", \"FF0000\", \"800080\", \"008000\", \"0000FF\"]\n", - "\n", - "html_str = '
'\n", - "html_str += '
'\n", - "html_str += \"

Synchronous Chunks</h2>

\"\n", - "for chunk in chunks[0]:\n", - " chunk_text = \"\"\n", - " for split in chunk.splits:\n", - " chunk_text += split\n", - " color = random.choice(colors)\n", - " html_str += f'

{chunk_text}</div>

'\n", - "html_str += \"
\"\n", - "html_str += '
'\n", - "html_str += \"

Asynchronous Chunks</h2>

\"\n", - "for chunk in chunks_async[0]:\n", - " chunk_text = \"\"\n", - " try:\n", - " for split in chunk.splits:\n", - " chunk_text += split\n", - " except AttributeError:\n", - " print(f\"Error Chunk: {chunk}\")\n", - " color = random.choice(colors)\n", - " html_str += f'

{chunk_text}</div>

'\n", - "html_str += \"
\"\n", - "html_str += \"
\"\n", - "\n", - "# Display the HTML\n", - "display(HTML(html_str))" - ] - }, - { - "cell_type": "code", - "execution_count": 21, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -328,13 +238,13 @@ "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 7, tokens 196, triggered by: token limit\n", - "\u001b[34mArchitecture. We simplify prior deep sequence model architectures by combining the design of prior SSM architectures (Dao, Fu, Saab, et al. 2023) with the MLP block of Transformers into a single block, leading to a simple and homogenous architecture design (Mamba) incorporating selective state spaces. Selective SSMs, and by extension the Mamba architecture, are fully recurrent models with key properties that make them suitable as the backbone of general foundation models operating on sequences. (i) High quality: selectivity brings strong performance on dense modalities such as language and genomics. (ii) Fast training and inference: computation and memory scales linearly in sequence length during training, and unrolling the model autoregressively during inference requires only constant time per step since it does not require a cache of previous elements. (iii) Long context: the quality and eï¬ ciency together yield performance improvements on real data up to sequence length 1M.\u001b[0m\n", + "Split 7, tokens 236, triggered by: 0.26\n", + "\u001b[34mArchitecture. We simplify prior deep sequence model architectures by combining the design of prior SSM architectures (Dao, Fu, Saab, et al. 2023) with the MLP block of Transformers into a single block, leading to a simple and homogenous architecture design (Mamba) incorporating selective state spaces. Selective SSMs, and by extension the Mamba architecture, are fully recurrent models with key properties that make them suitable as the backbone of general foundation models operating on sequences. (i) High quality: selectivity brings strong performance on dense modalities such as language and genomics. (ii) Fast training and inference: computation and memory scales linearly in sequence length during training, and unrolling the model autoregressively during inference requires only constant time per step since it does not require a cache of previous elements. (iii) Long context: the quality and eï¬ ciency together yield performance improvements on real data up to sequence length 1M. We empirically validate Mambaâ s potential as a general sequence FM backbone, in both pretraining quality and domain-speciï¬ c task performance, on several types of modalities and settings:\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 8, tokens 179, triggered by: 0.33\n", - "\u001b[35mWe empirically validate Mambaâ s potential as a general sequence FM backbone, in both pretraining quality and domain-speciï¬ c task performance, on several types of modalities and settings: â ¢ Synthetics. On important synthetic tasks such as copying and induction heads that have been proposed as being key to large language models, Mamba not only solves them easily but can extrapolate solutions indeï¬ nitely long (>1M tokens). â ¢ Audio and Genomics. Mamba out-performs prior state-of-the-art models such as SaShiMi, Hyena, and Transform- ers on modeling audio waveforms and DNA sequences, both in pretraining quality and downstream metrics (e.g. reducing FID on a challenging speech generation dataset by more than half). 
In both settings, its performance improves with longer context up to million-length sequences. â ¢ Language Modeling.\u001b[0m\n", + "Split 8, tokens 139, triggered by: 0.33\n", + "\u001b[35mâ ¢ Synthetics. On important synthetic tasks such as copying and induction heads that have been proposed as being key to large language models, Mamba not only solves them easily but can extrapolate solutions indeï¬ nitely long (>1M tokens). â ¢ Audio and Genomics. Mamba out-performs prior state-of-the-art models such as SaShiMi, Hyena, and Transform- ers on modeling audio waveforms and DNA sequences, both in pretraining quality and downstream metrics (e.g. reducing FID on a challenging speech generation dataset by more than half). In both settings, its performance improves with longer context up to million-length sequences. â ¢ Language Modeling.\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", @@ -348,97 +258,77 @@ "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 11, tokens 83, triggered by: token limit\n", - "\u001b[34m¥(ð ¡) â â â ¦ ð ¦(ð ¡) â â through an implicit latent state â (ð ¡) â â ð . Concretely, S4 models are deï¬ ned with four parameters (â , A, B, C), which deï¬ ne a sequence-to-sequence trans- formation in two stages. â â ²(ð ¡) = Aâ\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 12, tokens 105, triggered by: 0.46\n", - "\u001b[35m(ð ¡) + Bð ¥(ð ¡) ð ¦(ð ¡) = Câ (ð ¡) (1a) (1b) â ð ¡ = Aâ ð ¡â 1 + Bð ¥ð ¡ ð ¦ð ¡ = Câ ð ¡ (2a) (2b) ð ð ² = (Cð ©, Cð ¨ð ©, â ¦ , Cð ¨ ð ¦ = ð ¥ â ð ² ð ©, â\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 13, tokens 112, triggered by: 0.41\n", - "\u001b[31m¦ ) (3a) (3b) Discretization. The ï¬ rst stage transforms the â continuous parametersâ (â , A, B) to â discrete parametersâ (A, B) through ï¬ xed formulas A = ð ð ´(â , A) and B = ð ð µ(â , A, B), where the pair (ð ð ´, ð ð µ) is called a discretization rule. Various rules can be used such as the zero-order hold (ZOH) deï¬\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 14, tokens 109, triggered by: 0.24\n", - "\u001b[32mned in equation (4). A = exp(â A) B = (â A)â 1(exp(â A) â I) â â B (4) Discretization has deep connections to continuous-time systems which can endow them with additional properties such as resolution invariance (Nguyen, Goel, et al. 2022) and automatically ensuring that the model is properly normalized (Gu, Johnson, Timalsina, et al. 2023; Orvieto et al. 2023).\u001b[0m\n", - "----------------------------------------------------------------------------------------\n", - "\n", - "\n", - "Split 15, tokens 49, triggered by: token limit\n", - "\u001b[34mIt also has connections to gating mechanisms of RNNs (Gu, Gulcehre, et al. 2020; Tallec and Ollivier 2018) which we will revisit in Section 3.5.\u001b[0m\n", + "Split 11, tokens 188, triggered by: 0.46\n", + "\u001b[34m¥(ð ¡) â â â ¦ ð ¦(ð ¡) â â through an implicit latent state â (ð ¡) â â ð . Concretely, S4 models are deï¬ ned with four parameters (â , A, B, C), which deï¬ ne a sequence-to-sequence trans- formation in two stages. 
â â ²(ð ¡) = Aâ (ð ¡) + Bð ¥(ð ¡) ð ¦(ð ¡) = Câ (ð ¡) (1a) (1b) â ð ¡ = Aâ ð ¡â 1 + Bð ¥ð ¡ ð ¦ð ¡ = Câ ð ¡ (2a) (2b) ð ð ² = (Cð ©, Cð ¨ð ©, â ¦ , Cð ¨ ð ¦ = ð ¥ â ð ² ð ©, â\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 16, tokens 225, triggered by: 0.29\n", - "\u001b[35mHowever, from a mechanical point of view discretization can simply be viewed as the ï¬ rst step of the computation graph in the forward pass of an SSM. Alternate ï¬ avors of SSMs can bypass the discretization step and parameterize (A, B) directly instead (Zhang et al. 2023), which may be easier to reason about. Computation. After the parameters have been transformed from (â , A, B, C) â ¦ (A, B, C), the model can be computed in two ways, either as a linear recurrence (2) or a global convolution (3). 3 Commonly, the model uses the convolutional mode (3) for eï¬ cient parallelizable training (where the whole input sequence is seen ahead of time), and switched into recurrent mode (2) for eï¬ cient autoregressive inference (where the inputs are seen one timestep at a time). Linear Time Invariance (LTI). An important property of equations (1) to (3) is that the modelâ s dynamics are constant through time.\u001b[0m\n", + "Split 12, tokens 112, triggered by: 0.41\n", + "\u001b[35m¦ ) (3a) (3b) Discretization. The ï¬ rst stage transforms the â continuous parametersâ (â , A, B) to â discrete parametersâ (A, B) through ï¬ xed formulas A = ð ð ´(â , A) and B = ð ð µ(â , A, B), where the pair (ð ð ´, ð ð µ) is called a discretization rule. Various rules can be used such as the zero-order hold (ZOH) deï¬\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 17, tokens 182, triggered by: 0.33\n", - "\u001b[31mIn other words (â , A, B, C), and consequently (A, B) as well, are ï¬ xed for all time-steps. This property is called linear time invariance (LTI), which is deeply connected to recurrence and convolutions. Informally, we think of LTI SSMs as being equivalent to any linear recurrence (2a) or convolution (3b), and use LTI as an umbrella term for these classes of models. Thus far, all structured SSMs have been LTI (e.g. computed as convolutions) because of fundamental eï¬ ciency constraints, discussed in Section 3.3. However, a core insight of this work is that LTI models have fundamental limitations in modeling certain types of data, and our technical contributions involve removing the LTI constraint while overcoming the eï¬ ciency bottlenecks.\u001b[0m\n", + "Split 13, tokens 109, triggered by: 0.24\n", + "\u001b[31mned in equation (4). A = exp(â A) B = (â A)â 1(exp(â A) â I) â â B (4) Discretization has deep connections to continuous-time systems which can endow them with additional properties such as resolution invariance (Nguyen, Goel, et al. 2022) and automatically ensuring that the model is properly normalized (Gu, Johnson, Timalsina, et al. 2023; Orvieto et al. 2023).\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 18, tokens 141, triggered by: 0.24\n", - "\u001b[32mStructure and Dimensions. Finally, we note that structured SSMs are so named because computing them eï¬ ciently also requires imposing structure on the A matrix. The most popular form of structure is diagonal (Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Smith, Warrington, and Linderman 2023), which we also use. 
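For readability, the S4 equations referenced in the excerpt above — the continuous system (1), its discrete recurrence (2), the convolutional form (3), and the zero-order hold discretization (4) — are, in standard notation (a restatement only, nothing beyond what the excerpt states):

```latex
\begin{aligned}
h'(t) &= A\,h(t) + B\,x(t), &\qquad y(t) &= C\,h(t) && \text{(1a, 1b)} \\
h_t   &= \bar{A}\,h_{t-1} + \bar{B}\,x_t, &\qquad y_t &= C\,h_t && \text{(2a, 2b)} \\
\bar{K} &= \big(C\bar{B},\, C\bar{A}\bar{B},\, \dots,\, C\bar{A}^{k}\bar{B},\, \dots\big), &\qquad y &= x * \bar{K} && \text{(3a, 3b)} \\
\bar{A} &= \exp(\Delta A), &\qquad \bar{B} &= (\Delta A)^{-1}\big(\exp(\Delta A) - I\big)\cdot \Delta B && \text{(4)}
\end{aligned}
```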
In this case, the A â â ð à ð , B â â ð à 1, C â â 1à ð matrices can all be represented by ð numbers. To operate over an input sequence ð ¥ of batch size ð µ and length ð ¿ with ð\u001b[0m\n", + "Split 14, tokens 274, triggered by: 0.29\n", + "\u001b[32mIt also has connections to gating mechanisms of RNNs (Gu, Gulcehre, et al. 2020; Tallec and Ollivier 2018) which we will revisit in Section 3.5. However, from a mechanical point of view discretization can simply be viewed as the ï¬ rst step of the computation graph in the forward pass of an SSM. Alternate ï¬ avors of SSMs can bypass the discretization step and parameterize (A, B) directly instead (Zhang et al. 2023), which may be easier to reason about. Computation. After the parameters have been transformed from (â , A, B, C) â ¦ (A, B, C), the model can be computed in two ways, either as a linear recurrence (2) or a global convolution (3). 3 Commonly, the model uses the convolutional mode (3) for eï¬ cient parallelizable training (where the whole input sequence is seen ahead of time), and switched into recurrent mode (2) for eï¬ cient autoregressive inference (where the inputs are seen one timestep at a time). Linear Time Invariance (LTI). An important property of equations (1) to (3) is that the modelâ s dynamics are constant through time.\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 19, tokens 236, triggered by: 0.27\n", - "\u001b[34m· channels, the SSM is applied independently to each channel. Note that in this case, the total hidden state has dimension ð ·ð per input, and computing it over the sequence length requires ð (ð µð ¿ð ·ð ) time and memory; this is the root of the fundamental eï¬ ciency bottleneck addressed in Section 3.3. General State Space Models. We note that the term state space model has a very broad meaning which simply represents the notion of any recurrent process with a latent state. It has been used to refer to many disparate concepts in diï¬ erent disciplines, including Markov decision processes (MDP) (reinforcement learning (Hafner et al. 2020)), dynamic causal modeling (DCM) (computational neuroscience (Friston, Harrison, and Penny 2003)), Kalman ï¬ lters (controls (Kalman 1960)), hidden Markov models (HMM) and linear dynamical systems (LDS) (machine learning), and recurrent (and sometimes convolutional) models at large (deep learning). Throughout this entire paper we use the term â\u001b[0m\n", + "Split 15, tokens 182, triggered by: 0.33\n", + "\u001b[34mIn other words (â , A, B, C), and consequently (A, B) as well, are ï¬ xed for all time-steps. This property is called linear time invariance (LTI), which is deeply connected to recurrence and convolutions. Informally, we think of LTI SSMs as being equivalent to any linear recurrence (2a) or convolution (3b), and use LTI as an umbrella term for these classes of models. Thus far, all structured SSMs have been LTI (e.g. computed as convolutions) because of fundamental eï¬ ciency constraints, discussed in Section 3.3. 
However, a core insight of this work is that LTI models have fundamental limitations in modeling certain types of data, and our technical contributions involve removing the LTI constraint while overcoming the eï¬ ciency bottlenecks.\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 20, tokens 3, triggered by: token limit\n", - "\u001b[35mSSMâ\u001b[0m\n", + "Split 16, tokens 141, triggered by: 0.24\n", + "\u001b[35mStructure and Dimensions. Finally, we note that structured SSMs are so named because computing them eï¬ ciently also requires imposing structure on the A matrix. The most popular form of structure is diagonal (Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Smith, Warrington, and Linderman 2023), which we also use. In this case, the A â â ð à ð , B â â ð à 1, C â â 1à ð matrices can all be represented by ð numbers. To operate over an input sequence ð ¥ of batch size ð µ and length ð ¿ with ð\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 21, tokens 226, triggered by: 0.22\n", - "\u001b[31mto refer exclusively to the class of structured SSMs or S4 models (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Hasani et al. 2023; Ma et al. 2023; Smith, Warrington, and Linderman 2023) and use these terms interchangeably. For convenience we may also include derivatives of such models, such as those focusing on either the linear-recurrence or global-convolution viewpoints (Y. Li et al. 2023; Orvieto et al. 2023; Poli et al. 2023), and clarify nuances when necessary. SSM Architectures. SSMs are standalone sequence transformations that can be incorporated into end-to-end neural network architectures. (We also sometimes call SSM architectures SSNNs, which are to SSM layers as CNNs are to linear convolution layers.) We discuss some of the most well-known SSM architectures, many of which will also serve as our primary baselines.\u001b[0m\n", + "Split 17, tokens 236, triggered by: 0.27\n", + "\u001b[31m· channels, the SSM is applied independently to each channel. Note that in this case, the total hidden state has dimension ð ·ð per input, and computing it over the sequence length requires ð (ð µð ¿ð ·ð ) time and memory; this is the root of the fundamental eï¬ ciency bottleneck addressed in Section 3.3. General State Space Models. We note that the term state space model has a very broad meaning which simply represents the notion of any recurrent process with a latent state. It has been used to refer to many disparate concepts in diï¬ erent disciplines, including Markov decision processes (MDP) (reinforcement learning (Hafner et al. 2020)), dynamic causal modeling (DCM) (computational neuroscience (Friston, Harrison, and Penny 2003)), Kalman ï¬ lters (controls (Kalman 1960)), hidden Markov models (HMM) and linear dynamical systems (LDS) (machine learning), and recurrent (and sometimes convolutional) models at large (deep learning). Throughout this entire paper we use the term â\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 22, tokens 158, triggered by: 0.32\n", - "\u001b[32mâ ¢ Linear attention (Katharopoulos et al. 2020) is an approximation of self-attention involving a recurrence which can be viewed as a degenerate linear SSM. â ¢ H3 (Dao, Fu, Saab, et al. 
2023) generalized this recurrence to use S4; it can be viewed as an architecture with an SSM sandwiched by two gated connections (Figure 3). H3 also inserts a standard local convolution, which they frame as a shift-SSM, before the main SSM layer. â ¢ Hyena (Poli et al. 2023) uses the same architecture as H3 but replaces the S4 layer with an MLP-parameterized global convolution (Romero et al. 2021). â\u001b[0m\n", + "Split 18, tokens 229, triggered by: 0.22\n", + "\u001b[32mSSMâ to refer exclusively to the class of structured SSMs or S4 models (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Hasani et al. 2023; Ma et al. 2023; Smith, Warrington, and Linderman 2023) and use these terms interchangeably. For convenience we may also include derivatives of such models, such as those focusing on either the linear-recurrence or global-convolution viewpoints (Y. Li et al. 2023; Orvieto et al. 2023; Poli et al. 2023), and clarify nuances when necessary. SSM Architectures. SSMs are standalone sequence transformations that can be incorporated into end-to-end neural network architectures. (We also sometimes call SSM architectures SSNNs, which are to SSM layers as CNNs are to linear convolution layers.) We discuss some of the most well-known SSM architectures, many of which will also serve as our primary baselines.\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 23, tokens 106, triggered by: 0.26\n", - "\u001b[34m¢ RetNet (Y. Sun et al. 2023) adds an additional gate to the architecture and uses a simpler SSM, allowing an alternative parallelizable computation path, using a variant of multi-head attention (MHA) instead of convolutions. 4 â ¢ RWKV (B. Peng et al. 2023) is a recent RNN designed for language modeling based on another linear attention approximation (attention-free Transformer (S. Zhai et al. 2021)). Its main â WKVâ\u001b[0m\n", + "Split 19, tokens 158, triggered by: 0.32\n", + "\u001b[34mâ ¢ Linear attention (Katharopoulos et al. 2020) is an approximation of self-attention involving a recurrence which can be viewed as a degenerate linear SSM. â ¢ H3 (Dao, Fu, Saab, et al. 2023) generalized this recurrence to use S4; it can be viewed as an architecture with an SSM sandwiched by two gated connections (Figure 3). H3 also inserts a standard local convolution, which they frame as a shift-SSM, before the main SSM layer. â ¢ Hyena (Poli et al. 2023) uses the same architecture as H3 but replaces the S4 layer with an MLP-parameterized global convolution (Romero et al. 2021). â\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 24, tokens 172, triggered by: 0.25\n", - "\u001b[35mmechanism involves LTI recurrences and can be viewed as the ratio of two SSMs. Other closely related SSMs and architectures are discussed further in an extended related work (Appendix B). We highlight in particular S5 (Smith, Warrington, and Linderman 2023), QRNN (Bradbury et al. 2016), and SRU (Lei et al. 2017), which we view as the most closely related methods to our core selective SSM. # 3 Selective State Space Models We motivate our selection mechanism using intuition from synthetic tasks (Section 3.1), then explain how to incorporate this mechanism into state space models (Section 3.2). 
The resulting time-varying SSMs cannot use convolutions, presenting a technical challenge of how to compute them eï¬\u001b[0m\n", + "Split 20, tokens 106, triggered by: 0.26\n", + "\u001b[35m¢ RetNet (Y. Sun et al. 2023) adds an additional gate to the architecture and uses a simpler SSM, allowing an alternative parallelizable computation path, using a variant of multi-head attention (MHA) instead of convolutions. 4 â ¢ RWKV (B. Peng et al. 2023) is a recent RNN designed for language modeling based on another linear attention approximation (attention-free Transformer (S. Zhai et al. 2021)). Its main â WKVâ\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 25, tokens 254, triggered by: 0.33\n", - "\u001b[31mciently. We overcome this with a hardware-aware algorithm that exploits the memory hierarchy on modern hardware (Section 3.3). We then describe a simple SSM architecture without attention or even MLP blocks (Section 3.4). Finally, we discuss some additional properties of selection mechanisms (Section 3.5). # 3.1 Motivation: Selection as a Means of Compression We argue that a fundamental problem of sequence modeling is compressing context into a smaller state. In fact, we can view the tradeoï¬ s of popular sequence models from this point of view. For example, attention is both eï¬ ective and ineï¬ cient because it explicitly does not compress context at all. This can be seen from the fact that autoregressive inference requires explicitly storing the entire context (i.e. the KV cache), which directly causes the slow linear-time inference and quadratic-time training of Transformers. On the other hand, recurrent models are eï¬ cient because they have a ï¬ nite state, implying constant-time inference and linear-time training. However, their eï¬ ectiveness is limited by how well this state has compressed the context. To understand this principle, we focus on two running examples of synthetic tasks (Figure 2).\u001b[0m\n", + "Split 21, tokens 172, triggered by: 0.25\n", + "\u001b[31mmechanism involves LTI recurrences and can be viewed as the ratio of two SSMs. Other closely related SSMs and architectures are discussed further in an extended related work (Appendix B). We highlight in particular S5 (Smith, Warrington, and Linderman 2023), QRNN (Bradbury et al. 2016), and SRU (Lei et al. 2017), which we view as the most closely related methods to our core selective SSM. # 3 Selective State Space Models We motivate our selection mechanism using intuition from synthetic tasks (Section 3.1), then explain how to incorporate this mechanism into state space models (Section 3.2). The resulting time-varying SSMs cannot use convolutions, presenting a technical challenge of how to compute them eï¬\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 26, tokens 283, triggered by: 0.35\n", - "\u001b[32mâ ¢ The Selective Copying task modiï¬ es the popular Copying task (Arjovsky, Shah, and Bengio 2016) by varying the position of the tokens to memorize. It requires content-aware reasoning to be able to memorize the relevant tokens (colored) and ï¬ lter out the irrelevant ones (white). â ¢ The Induction Heads task is a well-known mechanism hypothesized to explain the majority of in-context learning abilities of LLMs (Olsson et al. 2022). It requires context-aware reasoning to know when to produce the correct output in the appropriate context (black). 
These tasks reveal the failure mode of LTI models. From the recurrent view, their constant dynamics (e.g. the (A, B) transitions in (2)) cannot let them select the correct information from their context, or aï¬ ect the hidden state passed along the sequence an in input-dependent way. From the convolutional view, it is known that global convolutions can solve the vanilla Copying task (Romero et al. 2021) because it only requires time-awareness, but that they have diï¬ culty with the Selective Copying task because of lack of content-awareness (Figure 2). More concretely, the spacing between inputs-to-outputs is varying and cannot be modeled by static convolution kernels.\u001b[0m\n", + "Split 22, tokens 254, triggered by: 0.33\n", + "\u001b[32mciently. We overcome this with a hardware-aware algorithm that exploits the memory hierarchy on modern hardware (Section 3.3). We then describe a simple SSM architecture without attention or even MLP blocks (Section 3.4). Finally, we discuss some additional properties of selection mechanisms (Section 3.5). # 3.1 Motivation: Selection as a Means of Compression We argue that a fundamental problem of sequence modeling is compressing context into a smaller state. In fact, we can view the tradeoï¬ s of popular sequence models from this point of view. For example, attention is both eï¬ ective and ineï¬ cient because it explicitly does not compress context at all. This can be seen from the fact that autoregressive inference requires explicitly storing the entire context (i.e. the KV cache), which directly causes the slow linear-time inference and quadratic-time training of Transformers. On the other hand, recurrent models are eï¬ cient because they have a ï¬ nite state, implying constant-time inference and linear-time training. However, their eï¬ ectiveness is limited by how well this state has compressed the context. To understand this principle, we focus on two running examples of synthetic tasks (Figure 2).\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 27, tokens 20, triggered by: token limit\n", - "\u001b[34mIn summary, the eï¬ ciency vs. eï¬ ectiveness tradeoï¬\u001b[0m\n", + "Split 23, tokens 283, triggered by: 0.35\n", + "\u001b[34mâ ¢ The Selective Copying task modiï¬ es the popular Copying task (Arjovsky, Shah, and Bengio 2016) by varying the position of the tokens to memorize. It requires content-aware reasoning to be able to memorize the relevant tokens (colored) and ï¬ lter out the irrelevant ones (white). â ¢ The Induction Heads task is a well-known mechanism hypothesized to explain the majority of in-context learning abilities of LLMs (Olsson et al. 2022). It requires context-aware reasoning to know when to produce the correct output in the appropriate context (black). These tasks reveal the failure mode of LTI models. From the recurrent view, their constant dynamics (e.g. the (A, B) transitions in (2)) cannot let them select the correct information from their context, or aï¬ ect the hidden state passed along the sequence an in input-dependent way. From the convolutional view, it is known that global convolutions can solve the vanilla Copying task (Romero et al. 2021) because it only requires time-awareness, but that they have diï¬ culty with the Selective Copying task because of lack of content-awareness (Figure 2). 
More concretely, the spacing between inputs-to-outputs is varying and cannot be modeled by static convolution kernels.\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 28, tokens 109, triggered by: 0.41\n", - "\u001b[35mof sequence models is characterized by how well they compress their state: eï¬ cient models must have a small state, while eï¬ ective models must have a state that contains all necessary information from the context. In turn, we propose that a fundamental principle for building sequence models is selectivity: or the context-aware ability to focus on or ï¬ lter out inputs into a sequential state. In particular, a selection mechanism controls how information propagates or interacts along the sequence dimension (see Section 3.5 for more discussion).\u001b[0m\n", + "Split 24, tokens 129, triggered by: 0.41\n", + "\u001b[35mIn summary, the eï¬ ciency vs. eï¬ ectiveness tradeoï¬ of sequence models is characterized by how well they compress their state: eï¬ cient models must have a small state, while eï¬ ective models must have a state that contains all necessary information from the context. In turn, we propose that a fundamental principle for building sequence models is selectivity: or the context-aware ability to focus on or ï¬ lter out inputs into a sequential state. In particular, a selection mechanism controls how information propagates or interacts along the sequence dimension (see Section 3.5 for more discussion).\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", "\n", - "Split 29, tokens 45, triggered by: final split\n", + "Split 25, tokens 45, triggered by: final split\n", "\u001b[31m# Improving SSMs with Selection One method of incorporating a selection mechanism into models is by letting their parameters that aï¬ ect interactions along the sequence (e.g. the recurrent dynamics of an RNN or the c\u001b[0m\n", "----------------------------------------------------------------------------------------\n", "\n", @@ -447,7 +337,7 @@ } ], "source": [ - "chunker.print(chunks_async[0])" + "chunker.print(chunks[0])" ] }, { @@ -470,7 +360,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": { "id": "AaKVbv942kkc" }, @@ -483,7 +373,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -492,59 +382,1375 @@ "id": "d3mtF7R66tFJ", "outputId": "be8a0a91-e042-4214-9019-5cb17559c6de" }, - "outputs": [], - "source": [ - "chunks = chunker(docs=[content])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "chunker.print(chunks[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Cumulative Chunking" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Cumulative chunking is a more compute intensive process, but can often provide more stable results as it is more noise resistant. However, it is _very expensive_ in both time and (if using APIs) money." 
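As a rough sketch of the idea — illustrative only, not the actual `CumulativeChunker` implementation; `embed` and the threshold value are assumptions: each incoming sentence is compared against an embedding of everything accumulated so far, so the growing chunk is re-embedded on every step, which is where the extra compute and API cost come from.

```python
import numpy as np

def cumulative_split(sentences, embed, score_threshold=0.3):
    """Illustrative sketch: compare each new sentence against an embedding of
    the chunk accumulated so far; re-embedding the growing chunk every step is
    what makes this approach slow and (with paid encoder APIs) expensive."""
    chunks = []
    current = [sentences[0]]
    for sent in sentences[1:]:
        chunk_vec = np.asarray(embed(" ".join(current)), dtype=float)  # re-embed whole chunk
        sent_vec = np.asarray(embed(sent), dtype=float)
        sim = float(chunk_vec @ sent_vec /
                    (np.linalg.norm(chunk_vec) * np.linalg.norm(sent_vec)))
        if sim < score_threshold:
            chunks.append(" ".join(current))
            current = []
        current.append(sent)
    chunks.append(" ".join(current))
    return chunks
```

The threshold mirrors the `score_threshold=0.3` passed to `CumulativeChunker` above; the real chunker is considerably more involved than this sketch.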
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from semantic_chunkers import CumulativeChunker\n", - "\n", - "chunker = CumulativeChunker(encoder=encoder, score_threshold=0.3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 6/6 [00:04<00:00, 1.39it/s]\n", + "100%|██████████| 328/328 [00:00<00:00, 92287.63it/s]\n" + ] + } + ], "source": [ "chunks = chunker(docs=[content])" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Split 1, tokens None, triggered by: 0.09\n", + "\u001b[31m# Mamba:\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 2, tokens None, triggered by: 0.10\n", + "\u001b[32mLinear-Time Sequence Modeling with Selective State Spaces\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 3, tokens None, triggered by: 0.25\n", + "\u001b[34m# Albert Gu*1 and Tri Dao*2 1Machine Learning Department, Carnegie Mellon University 2Department of Computer Science, Princeton University agu@cs.cmu.edu, tri@tridao.me\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 4, tokens None, triggered by: 0.22\n", + "\u001b[35m# Abstract\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 5, tokens None, triggered by: 0.30\n", + "\u001b[31mFoundation models, now powering most of the exciting applications in deep learning, are almost universally based on the Transformer architecture and its core attention module. Many subquadratic-time architectures such as linear attention, gated convolution and recurrent models, and structured state space models (SSMs) have been developed to address Transformersâ computational ineï¬ ciency on long sequences, but they have not performed as well as attention on important modalities such as language. We identify that a key weakness of such models is their inability to perform content-based reasoning, and make several improvements. 
First, simply letting the SSM parameters be functions of the input addresses their weakness with discrete modalities, allowing the model to selectively propagate or forget information along the sequence length dimension depending on the current token.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 6, tokens None, triggered by: 0.22\n", + "\u001b[32mSecond, even though this change prevents the use of eï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 7, tokens None, triggered by: 0.28\n", + "\u001b[34mcient convolutions, we design a hardware-aware parallel algorithm in recurrent mode.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 8, tokens None, triggered by: 0.25\n", + "\u001b[35mWe integrate these selective SSMs into a simpliï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 9, tokens None, triggered by: 0.11\n", + "\u001b[31med end-to-end neural network architecture without attention or even MLP blocks (Mamba). Mamba enjoys fast inference (5à higher throughput than Transformers) and linear scaling in sequence length, and its performance improves on real data up to million-length sequences. As a general sequence model backbone, Mamba achieves state-of-the-art performance across several modalities such as language, audio, and genomics. On language modeling, our Mamba-3B model outperforms Transformers of the same size and matches Transformers twice its size, both in pretraining and downstream evaluation.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 10, tokens None, triggered by: 0.21\n", + "\u001b[32m# 1 Introduction\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 11, tokens None, triggered by: 0.14\n", + "\u001b[34mFoundation models (FMs), or large models pretrained on massive data then adapted for downstream tasks, have emerged as an eï¬ ective paradigm in modern machine learning. The backbone of these FMs are often sequence models, operating on arbitrary sequences of inputs from a wide variety of domains such as language, images, speech, audio, time series, and genomics (Brown et al. 2020; Dosovitskiy et al. 2020; Ismail Fawaz et al. 2019; Oord et al. 2016; Poli et al. 2023; Sutskever, Vinyals, and Quoc V Le 2014). While this concept is agnostic to a particular choice of model architecture, modern FMs are predominantly based on a single type of sequence model: the Transformer (Vaswani et al. 2017) and its core attention layer (Bahdanau, Cho, and Bengio 2015) The eï¬ cacy of self-attention is attributed to its ability to route information densely within a context window, allowing it to model complex data. 
However, this property brings fundamental drawbacks: an inability to model anything outside of a ï¬ nite window, and quadratic scaling with respect to the window length.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 12, tokens None, triggered by: 0.21\n", + "\u001b[35mAn enormous body of research has appeared on more eï¬ cient variants of attention to overcome these drawbacks (Tay, Dehghani, Bahri, et al. 2022), but often at the expense of the very properties that makes it eï¬ ective.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 13, tokens None, triggered by: 0.27\n", + "\u001b[31mAs of yet, none of these variants have been shown to be empirically eï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 14, tokens None, triggered by: 0.26\n", + "\u001b[32mective at scale across domains.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 15, tokens None, triggered by: 0.09\n", + "\u001b[34mRecently, structured state space sequence models (SSMs) (Gu, Goel, and Ré 2022; Gu, Johnson, Goel, et al. 2021) have emerged as a promising class of architectures for sequence modeling. These models can be interpreted as a combination of recurrent neural networks (RNNs) and convolutional neural networks (CNNs), with inspiration from classical state space models (Kalman 1960). This class of models can be computed very eï¬ ciently as either a recurrence or convolution, with linear or near-linear scaling in sequence length.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 16, tokens None, triggered by: 0.28\n", + "\u001b[35mAdditionally, they have principled\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 17, tokens None, triggered by: 0.23\n", + "\u001b[31mEqual contribution.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 18, tokens None, triggered by: 0.07\n", + "\u001b[32m1\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 19, tokens None, triggered by: 0.15\n", + "\u001b[34mmechanisms for modeling long-range dependencies (Gu, Dao, et al. 2020) in certain data modalities, and have dominated benchmarks such as the Long Range Arena (Tay, Dehghani, Abnar, et al. 2021).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 20, tokens None, triggered by: 0.23\n", + "\u001b[35mMany ï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 21, tokens None, triggered by: 0.21\n", + "\u001b[31mavors of SSMs (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Y. Li et al. 2023; Ma et al. 2023; Orvieto et al. 2023; Smith, Warrington, and Linderman 2023) have been successful in domains involving continuous signal data such as audio and vision (Goel et al. 2022; Nguyen, Goel, et al. 
2022; Saon, Gupta, and Cui 2023).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 22, tokens None, triggered by: 0.20\n", + "\u001b[32mHowever, they have been less eï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 23, tokens None, triggered by: 0.21\n", + "\u001b[34mective at modeling discrete and information-dense data such as text. We propose a new class of selective state space models, that improves on prior work on several axes to achieve the modeling power of Transformers while scaling linearly in sequence length.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 24, tokens None, triggered by: 0.18\n", + "\u001b[35mSelection Mechanism.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 25, tokens None, triggered by: 0.25\n", + "\u001b[31mFirst, we identify a key limitation of prior models: the ability to eï¬ ciently select data in an input-dependent manner (i.e. focus on or ignore particular inputs). Building on intuition based on important synthetic tasks such as selective copy and induction heads, we design a simple selection mechanism by parameterizing the SSM parameters based on the input.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 26, tokens None, triggered by: 0.18\n", + "\u001b[32mThis allows the model to ï¬ lter out irrelevant information and remember relevant information indeï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 27, tokens None, triggered by: 0.16\n", + "\u001b[34mnitely.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 28, tokens None, triggered by: 0.27\n", + "\u001b[35mHardware-aware Algorithm. 
This simple change poses a technical challenge for the computation of the model; in fact, all prior SSMs models must be time- and input-invariant in order to be computationally eï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 29, tokens None, triggered by: 0.18\n", + "\u001b[31mcient.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 30, tokens None, triggered by: 0.29\n", + "\u001b[32mWe overcome this with a hardware-aware algorithm that computes the model recurrently with a scan instead of convolution, but does not materialize the expanded state in order to avoid IO access between diï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 31, tokens None, triggered by: 0.28\n", + "\u001b[34merent levels of the GPU memory hierarchy.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 32, tokens None, triggered by: 0.19\n", + "\u001b[35mThe resulting implementation is faster than previous methods both in theory (scaling linearly in sequence length, compared to pseudo-linear for all convolution-based SSMs) and on modern hardware (up to 3à faster on A100 GPUs).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 33, tokens None, triggered by: 0.28\n", + "\u001b[31mArchitecture.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 34, tokens None, triggered by: 0.29\n", + "\u001b[32mWe simplify prior deep sequence model architectures by combining the design of prior SSM architectures (Dao, Fu, Saab, et al. 2023) with the MLP block of Transformers into a single block, leading to a simple and homogenous architecture design (Mamba) incorporating selective state spaces. Selective SSMs, and by extension the Mamba architecture, are fully recurrent models with key properties that make them suitable as the backbone of general foundation models operating on sequences. (i) High quality: selectivity brings strong performance on dense modalities such as language and genomics. (ii) Fast training and inference: computation and memory scales linearly in sequence length during training, and unrolling the model autoregressively during inference requires only constant time per step since it does not require a cache of previous elements. 
(iii) Long context: the quality and eï¬ ciency together yield performance improvements on real data up to sequence length 1M.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 35, tokens None, triggered by: 0.24\n", + "\u001b[34mWe empirically validate Mambaâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 36, tokens None, triggered by: 0.24\n", + "\u001b[35ms potential as a general sequence FM backbone, in both pretraining quality and domain-speciï¬ c task performance, on several types of modalities and settings:\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 37, tokens None, triggered by: 0.19\n", + "\u001b[31mâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 38, tokens None, triggered by: 0.26\n", + "\u001b[32m¢ Synthetics. On important synthetic tasks such as copying and induction heads that have been proposed as being key to large language models, Mamba not only solves them easily but can extrapolate solutions indeï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 39, tokens None, triggered by: 0.20\n", + "\u001b[34mnitely long (>1M tokens).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 40, tokens None, triggered by: 0.24\n", + "\u001b[35mâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 41, tokens None, triggered by: 0.13\n", + "\u001b[31m¢ Audio and Genomics. Mamba out-performs prior state-of-the-art models such as SaShiMi, Hyena, and Transform- ers on modeling audio waveforms and DNA sequences, both in pretraining quality and downstream metrics (e.g. reducing FID on a challenging speech generation dataset by more than half). In both settings, its performance improves with longer context up to million-length sequences.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 42, tokens None, triggered by: 0.24\n", + "\u001b[32mâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 43, tokens None, triggered by: 0.15\n", + "\u001b[34m¢ Language Modeling.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 44, tokens None, triggered by: 0.10\n", + "\u001b[35mMamba is the ï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 45, tokens None, triggered by: 0.20\n", + "\u001b[31mrst linear-time sequence model that truly achieves Transformer-quality performance, both in pretraining perplexity and downstream evaluations. With scaling laws up to 1B parameters, we show that Mamba exceeds the performance of a large range of baselines, including very strong modern Transformer training recipes based on LLaMa (Touvron et al. 2023). 
Our Mamba language model has 5à generation throughput compared to Transformers of similar size, and Mamba-3Bâ s quality matches that of Transformers twice its size (e.g. 4 points higher avg. on common sense reasoning compared to Pythia-3B and even exceeding Pythia-7B).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 46, tokens None, triggered by: 0.08\n", + "\u001b[32mModel code and pre-trained checkpoints are open-sourced at https://github.com/state-spaces/mamba.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 47, tokens None, triggered by: 0.14\n", + "\u001b[34m2\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 48, tokens None, triggered by: 0.19\n", + "\u001b[35m# Selective State Space Model # with Hardware-aware State Expansion # A\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 49, tokens None, triggered by: 0.29\n", + "\u001b[31mvuvy GPU SRAM Selection Mechanism es\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 50, tokens None, triggered by: 0.25\n", + "\u001b[32mSelection Mechanism\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 51, tokens None, triggered by: 0.25\n", + "\u001b[34mFigure 1: (Overview.) Structured SSMs independently map each channel (e.g. ð · = 5) of an input ð ¥ to output ð\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 52, tokens None, triggered by: 0.28\n", + "\u001b[35m¦ through a higher dimensional latent state â\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 53, tokens None, triggered by: 0.23\n", + "\u001b[31m(e.g. 
ð = 4).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 54, tokens None, triggered by: 0.29\n", + "\u001b[32mPrior SSMs avoid materializing this large effective state (ð ·ð , times batch size ð µ and sequence length ð\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 55, tokens None, triggered by: 0.26\n", + "\u001b[34m¿) through clever alternate computation paths requiring time-invariance: the (â , A, B, C) parameters are constant across time.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 56, tokens None, triggered by: 0.26\n", + "\u001b[35mOur selection mechanism adds back input-dependent dynamics, which also requires a careful hardware-aware algorithm to only materialize the expanded states in more efficient levels of the GPU memory hierarchy.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 57, tokens None, triggered by: 0.24\n", + "\u001b[31m# 2 State Space Models Structured state space sequence models (S4) are a recent class of sequence models for deep learning that are broadly related to RNNs, and CNNs, and classical state space models. They are inspired by a particular continuous system (1) that maps a 1-dimensional function or sequence ð ¥(ð ¡) â â â ¦ ð ¦(ð ¡) â â\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 58, tokens None, triggered by: 0.28\n", + "\u001b[32mthrough an implicit latent state â\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 59, tokens None, triggered by: 0.23\n", + "\u001b[34m(ð ¡) â â ð .\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 60, tokens None, triggered by: 0.22\n", + "\u001b[35mConcretely, S4 models are deï¬ ned with four parameters (â , A, B, C), which deï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 61, tokens None, triggered by: 0.18\n", + "\u001b[31mne a sequence-to-sequence trans- formation in two stages.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 62, tokens None, triggered by: 0.27\n", + "\u001b[32mâ â ²(ð ¡) = Aâ (ð ¡) + Bð ¥(ð ¡) ð ¦(ð ¡) = Câ (ð ¡) (1a) (1b) â\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 63, tokens None, triggered by: 0.27\n", + "\u001b[34mð\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 64, tokens None, triggered by: 0.27\n", + "\u001b[35m¡ = Aâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 65, tokens None, triggered by: 0.24\n", + "\u001b[31mð ¡â\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 66, tokens None, triggered by: 0.28\n", + "\u001b[32m1 + Bð ¥ð ¡ ð ¦ð\u001b[0m\n", + 
"----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 67, tokens None, triggered by: 0.30\n", + "\u001b[34m¡ = Câ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 68, tokens None, triggered by: 0.26\n", + "\u001b[35mð ¡ (2a) (2b) ð ð ² = (Cð ©, Cð ¨ð ©, â ¦ , Cð ¨ ð ¦ = ð ¥ â ð ² ð ©, â ¦ ) (3a) (3b)\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 69, tokens None, triggered by: 0.22\n", + "\u001b[31mDiscretization.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 70, tokens None, triggered by: 0.30\n", + "\u001b[32mThe ï¬ rst stage transforms the â\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 71, tokens None, triggered by: 0.23\n", + "\u001b[34mcontinuous parametersâ (â , A, B) to â discrete parametersâ (A, B) through ï¬ xed formulas A = ð ð ´(â , A) and B = ð ð µ(â , A, B), where the pair (ð ð ´, ð ð\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 72, tokens None, triggered by: 0.27\n", + "\u001b[35mµ) is called a discretization rule. Various rules can be used such as the zero-order hold (ZOH) deï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 73, tokens None, triggered by: 0.28\n", + "\u001b[31mned in equation (4).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 74, tokens None, triggered by: 0.13\n", + "\u001b[32mA = exp(â A) B = (â A)â 1(exp(â A) â I) â â B (4)\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 75, tokens None, triggered by: 0.26\n", + "\u001b[34mDiscretization has deep connections to continuous-time systems which can endow them with additional properties such as resolution invariance (Nguyen, Goel, et al. 2022) and automatically ensuring that the model is properly normalized (Gu, Johnson, Timalsina, et al. 2023; Orvieto et al. 2023). It also has connections to gating mechanisms of RNNs (Gu, Gulcehre, et al. 
2020; Tallec and Ollivier 2018) which we will revisit in Section 3.5.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 76, tokens None, triggered by: 0.29\n", + "\u001b[35mHowever, from a mechanical point of view discretization can simply be viewed as the ï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 77, tokens None, triggered by: 0.15\n", + "\u001b[31mrst step of the computation graph in the forward pass of an SSM.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 78, tokens None, triggered by: 0.22\n", + "\u001b[32mAlternate ï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 79, tokens None, triggered by: 0.29\n", + "\u001b[34mavors of SSMs can bypass the discretization step and parameterize (A, B) directly instead (Zhang et al. 2023), which may be easier to reason about. Computation.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 80, tokens None, triggered by: 0.25\n", + "\u001b[35mAfter the parameters have been transformed from (â , A, B, C) â ¦ (A, B, C), the model can be computed in two ways, either as a linear recurrence (2) or a global convolution (3).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 81, tokens None, triggered by: 0.24\n", + "\u001b[31m3\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 82, tokens None, triggered by: 0.14\n", + "\u001b[32mCommonly, the model uses the convolutional mode (3) for eï¬ cient parallelizable training (where the whole input sequence is seen ahead of time), and switched into recurrent mode (2) for eï¬ cient autoregressive inference (where the inputs are seen one timestep at a time). Linear Time Invariance (LTI). An important property of equations (1) to (3) is that the modelâ s dynamics are constant through time.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 83, tokens None, triggered by: 0.21\n", + "\u001b[34mIn other words (â , A, B, C), and consequently (A, B) as well, are ï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 84, tokens None, triggered by: 0.24\n", + "\u001b[35mxed for all time-steps.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 85, tokens None, triggered by: 0.21\n", + "\u001b[31mThis property is called linear time invariance (LTI), which is deeply connected to recurrence and convolutions. Informally, we think of LTI SSMs as being equivalent to any linear recurrence (2a) or convolution (3b), and use LTI as an umbrella term for these classes of models. Thus far, all structured SSMs have been LTI (e.g. computed as convolutions) because of fundamental eï¬ ciency constraints, discussed in Section 3.3. 
However, a core insight of this work is that LTI models have fundamental limitations in modeling certain types of data, and our technical contributions involve removing the LTI constraint while overcoming the eï¬ ciency bottlenecks.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 86, tokens None, triggered by: 0.23\n", + "\u001b[32mStructure and Dimensions. Finally, we note that structured SSMs are so named because computing them eï¬ ciently also requires imposing structure on the A matrix. The most popular form of structure is diagonal (Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Smith, Warrington, and Linderman 2023), which we also use.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 87, tokens None, triggered by: 0.28\n", + "\u001b[34mIn this case, the A â â ð à ð\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 88, tokens None, triggered by: 0.27\n", + "\u001b[35m, B â â ð à 1, C â â 1à ð matrices can all be represented by ð\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 89, tokens None, triggered by: 0.18\n", + "\u001b[31mnumbers.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 90, tokens None, triggered by: 0.10\n", + "\u001b[32mTo operate over an input sequence ð ¥ of batch size ð µ and length ð ¿ with ð\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 91, tokens None, triggered by: 0.28\n", + "\u001b[34m· channels, the SSM is applied independently to each channel.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 92, tokens None, triggered by: 0.28\n", + "\u001b[35mNote that in this case, the total hidden state has dimension ð ·ð per input, and computing it over the sequence length requires ð (ð µð ¿ð ·ð ) time and memory; this is the root of the fundamental eï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 93, tokens None, triggered by: 0.20\n", + "\u001b[31mciency bottleneck addressed in Section 3.3.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 94, tokens None, triggered by: 0.24\n", + "\u001b[32mGeneral State Space Models. We note that the term state space model has a very broad meaning which simply represents the notion of any recurrent process with a latent state.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 95, tokens None, triggered by: 0.23\n", + "\u001b[34mIt has been used to refer to many disparate concepts in diï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 96, tokens None, triggered by: 0.19\n", + "\u001b[35merent disciplines, including Markov decision processes (MDP) (reinforcement learning (Hafner et al. 
2020)), dynamic causal modeling (DCM) (computational neuroscience (Friston, Harrison, and Penny 2003)), Kalman ï¬ lters (controls (Kalman 1960)), hidden Markov models (HMM) and linear dynamical systems (LDS) (machine learning), and recurrent (and sometimes convolutional) models at large (deep learning).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 97, tokens None, triggered by: 0.26\n", + "\u001b[31mThroughout this entire paper we use the term â\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 98, tokens None, triggered by: 0.16\n", + "\u001b[32mSSMâ to refer exclusively to the class of structured SSMs or S4 models (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Hasani et al. 2023; Ma et al. 2023; Smith, Warrington, and Linderman 2023) and use these terms interchangeably. For convenience we may also include derivatives of such models, such as those focusing on either the linear-recurrence or global-convolution viewpoints (Y. Li et al. 2023; Orvieto et al. 2023; Poli et al. 2023), and clarify nuances when necessary.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 99, tokens None, triggered by: 0.09\n", + "\u001b[34mSSM Architectures. SSMs are standalone sequence transformations that can be incorporated into end-to-end neural network architectures. (We also sometimes call SSM architectures SSNNs, which are to SSM layers as CNNs are to linear convolution layers.) We discuss some of the most well-known SSM architectures, many of which will also serve as our primary baselines.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 100, tokens None, triggered by: 0.12\n", + "\u001b[35mâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 101, tokens None, triggered by: 0.12\n", + "\u001b[31m¢ Linear attention (Katharopoulos et al. 2020) is an approximation of self-attention involving a recurrence which can be viewed as a degenerate linear SSM.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 102, tokens None, triggered by: 0.28\n", + "\u001b[32mâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 103, tokens None, triggered by: 0.13\n", + "\u001b[34m¢ H3 (Dao, Fu, Saab, et al. 2023) generalized this recurrence to use S4; it can be viewed as an architecture with an SSM sandwiched by two gated connections (Figure 3). H3 also inserts a standard local convolution, which they frame as a shift-SSM, before the main SSM layer.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 104, tokens None, triggered by: 0.12\n", + "\u001b[35mâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 105, tokens None, triggered by: 0.12\n", + "\u001b[31m¢ Hyena (Poli et al. 2023) uses the same architecture as H3 but replaces the S4 layer with an MLP-parameterized global convolution (Romero et al. 
2021).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 106, tokens None, triggered by: 0.23\n", + "\u001b[32mâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 107, tokens None, triggered by: 0.18\n", + "\u001b[34m¢ RetNet (Y.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 108, tokens None, triggered by: 0.07\n", + "\u001b[35mSun et al. 2023) adds an additional gate to the architecture and uses a simpler SSM, allowing an alternative parallelizable computation path, using a variant of multi-head attention (MHA) instead of convolutions.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 109, tokens None, triggered by: 0.24\n", + "\u001b[31m4 â\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 110, tokens None, triggered by: 0.19\n", + "\u001b[32m¢ RWKV (B.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 111, tokens None, triggered by: 0.16\n", + "\u001b[34mPeng et al. 2023) is a recent RNN designed for language modeling based on another linear attention approximation (attention-free Transformer (S. Zhai et al. 2021)).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 112, tokens None, triggered by: 0.10\n", + "\u001b[35mIts main â WKVâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 113, tokens None, triggered by: 0.29\n", + "\u001b[31mmechanism involves LTI recurrences and can be viewed as the ratio of two SSMs. Other closely related SSMs and architectures are discussed further in an extended related work (Appendix B). We highlight in particular S5 (Smith, Warrington, and Linderman 2023), QRNN (Bradbury et al. 2016), and SRU (Lei et al. 2017), which we view as the most closely related methods to our core selective SSM. # 3 Selective State Space Models We motivate our selection mechanism using intuition from synthetic tasks (Section 3.1), then explain how to incorporate this mechanism into state space models (Section 3.2).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 114, tokens None, triggered by: 0.22\n", + "\u001b[32mThe resulting time-varying SSMs cannot use convolutions, presenting a technical challenge of how to compute them eï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 115, tokens None, triggered by: 0.28\n", + "\u001b[34mciently.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 116, tokens None, triggered by: 0.21\n", + "\u001b[35mWe overcome this with a hardware-aware algorithm that exploits the memory hierarchy on modern hardware (Section 3.3). We then describe a simple SSM architecture without attention or even MLP blocks (Section 3.4). Finally, we discuss some additional properties of selection mechanisms (Section 3.5). 
# 3.1 Motivation:\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 117, tokens None, triggered by: 0.10\n", + "\u001b[31mSelection as a Means of Compression We argue that a fundamental problem of sequence modeling is compressing context into a smaller state.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 118, tokens None, triggered by: 0.20\n", + "\u001b[32mIn fact, we can view the tradeoï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 119, tokens None, triggered by: 0.21\n", + "\u001b[34ms of popular sequence models from this point of view.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 120, tokens None, triggered by: 0.30\n", + "\u001b[35mFor example, attention is both eï¬ ective and ineï¬ cient because it explicitly does not compress context at all. This can be seen from the fact that autoregressive inference requires explicitly storing the entire context (i.e. the KV cache), which directly causes the slow linear-time inference and quadratic-time training of Transformers. On the other hand, recurrent models are eï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 121, tokens None, triggered by: 0.24\n", + "\u001b[31mcient because they have a ï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 122, tokens None, triggered by: 0.16\n", + "\u001b[32mnite state, implying constant-time inference and linear-time training.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 123, tokens None, triggered by: 0.27\n", + "\u001b[34mHowever, their eï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 124, tokens None, triggered by: 0.26\n", + "\u001b[35mectiveness is limited by how well this state has compressed the context.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 125, tokens None, triggered by: 0.12\n", + "\u001b[31mTo understand this principle, we focus on two running examples of synthetic tasks (Figure 2).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 126, tokens None, triggered by: 0.25\n", + "\u001b[32mâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 127, tokens None, triggered by: 0.20\n", + "\u001b[34m¢ The Selective Copying task modiï¬ es the popular Copying task (Arjovsky, Shah, and Bengio 2016) by varying the position of the tokens to memorize. 
It requires content-aware reasoning to be able to memorize the relevant tokens (colored) and ï¬ lter out the irrelevant ones (white).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 128, tokens None, triggered by: 0.12\n", + "\u001b[35mâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 129, tokens None, triggered by: 0.21\n", + "\u001b[31m¢ The Induction Heads task is a well-known mechanism hypothesized to explain the majority of in-context learning abilities of LLMs (Olsson et al. 2022). It requires context-aware reasoning to know when to produce the correct output in the appropriate context (black).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 130, tokens None, triggered by: 0.27\n", + "\u001b[32mThese tasks reveal the failure mode of LTI models. From the recurrent view, their constant dynamics (e.g. the (A, B) transitions in (2)) cannot let them select the correct information from their context, or aï¬ ect the hidden state passed along the sequence an in input-dependent way.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 131, tokens None, triggered by: 0.20\n", + "\u001b[34mFrom the convolutional view, it is known that global convolutions can solve the vanilla Copying task (Romero et al. 2021) because it only requires time-awareness, but that they have diï¬ culty with the Selective Copying task because of lack of content-awareness (Figure 2).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 132, tokens None, triggered by: 0.13\n", + "\u001b[35mMore concretely, the spacing between inputs-to-outputs is varying and cannot be modeled by static convolution kernels.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 133, tokens None, triggered by: 0.20\n", + "\u001b[31mIn summary, the eï¬ ciency vs. eï¬ ectiveness tradeoï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 134, tokens None, triggered by: final split\n", + "\u001b[32mof sequence models is characterized by how well they compress their state: eï¬ cient models must have a small state, while eï¬ ective models must have a state that contains all necessary information from the context. In turn, we propose that a fundamental principle for building sequence models is selectivity: or the context-aware ability to focus on or ï¬ lter out inputs into a sequential state. In particular, a selection mechanism controls how information propagates or interacts along the sequence dimension (see Section 3.5 for more discussion). # Improving SSMs with Selection One method of incorporating a selection mechanism into models is by letting their parameters that aï¬ ect interactions along the sequence (e.g. 
the recurrent dynamics of an RNN or the c\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n" + ] + } + ], + "source": [ + "chunker.print(chunks[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cumulative Chunking" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Cumulative chunking is a more compute intensive process, but can often provide more stable results as it is more noise resistant. However, it is _very expensive_ in both time and (if using APIs) money." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "from semantic_chunkers import CumulativeChunker\n", + "\n", + "chunker = CumulativeChunker(encoder=encoder, score_threshold=0.3)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 329/329 [03:22<00:00, 1.63it/s]\n" + ] + } + ], + "source": [ + "chunks = chunker(docs=[content])" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Split 1, tokens None, triggered by: 0.09\n", + "\u001b[31m# Mamba:\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 2, tokens None, triggered by: 0.10\n", + "\u001b[32mLinear-Time Sequence Modeling with Selective State Spaces\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 3, tokens None, triggered by: 0.28\n", + "\u001b[34m# Albert Gu*1 and Tri Dao*2 1Machine Learning Department, Carnegie Mellon University 2Department of Computer Science, Princeton University agu@cs.cmu.edu, tri@tridao.me\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 4, tokens None, triggered by: 0.22\n", + "\u001b[35m# Abstract\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 5, tokens None, triggered by: 0.23\n", + "\u001b[31mFoundation models, now powering most of the exciting applications in deep learning, are almost universally based on the Transformer architecture and its core attention module. Many subquadratic-time architectures such as linear attention, gated convolution and recurrent models, and structured state space models (SSMs) have been developed to address Transformersâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 6, tokens None, triggered by: 0.30\n", + "\u001b[32mcomputational ineï¬ ciency on long sequences, but they have not performed as well as attention on important modalities such as language. We identify that a key weakness of such models is their inability to perform content-based reasoning, and make several improvements. 
First, simply letting the SSM parameters be functions of the input addresses their weakness with discrete modalities, allowing the model to selectively propagate or forget information along the sequence length dimension depending on the current token.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 7, tokens None, triggered by: 0.22\n", + "\u001b[34mSecond, even though this change prevents the use of eï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 8, tokens None, triggered by: 0.28\n", + "\u001b[35mcient convolutions, we design a hardware-aware parallel algorithm in recurrent mode.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 9, tokens None, triggered by: 0.25\n", + "\u001b[31mWe integrate these selective SSMs into a simpliï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 10, tokens None, triggered by: 0.17\n", + "\u001b[32med end-to-end neural network architecture without attention or even MLP blocks (Mamba). Mamba enjoys fast inference (5à higher throughput than Transformers) and linear scaling in sequence length, and its performance improves on real data up to million-length sequences. As a general sequence model backbone, Mamba achieves state-of-the-art performance across several modalities such as language, audio, and genomics. On language modeling, our Mamba-3B model outperforms Transformers of the same size and matches Transformers twice its size, both in pretraining and downstream evaluation.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 11, tokens None, triggered by: 0.21\n", + "\u001b[34m# 1 Introduction\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 12, tokens None, triggered by: 0.20\n", + "\u001b[35mFoundation models (FMs), or large models pretrained on massive data then adapted for downstream tasks, have emerged as an eï¬ ective paradigm in modern machine learning. The backbone of these FMs are often sequence models, operating on arbitrary sequences of inputs from a wide variety of domains such as language, images, speech, audio, time series, and genomics (Brown et al. 2020; Dosovitskiy et al. 2020; Ismail Fawaz et al. 2019; Oord et al. 2016; Poli et al. 2023; Sutskever, Vinyals, and Quoc V Le 2014). While this concept is agnostic to a particular choice of model architecture, modern FMs are predominantly based on a single type of sequence model: the Transformer (Vaswani et al. 
2017) and its core attention layer (Bahdanau, Cho, and Bengio 2015) The eï¬ cacy of self-attention is attributed to its ability to route information densely within a context window, allowing it to model complex data.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 13, tokens None, triggered by: 0.25\n", + "\u001b[31mHowever, this property brings fundamental drawbacks: an inability to model anything outside of a ï¬ nite window, and quadratic scaling with respect to the window length.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 14, tokens None, triggered by: 0.28\n", + "\u001b[32mAn enormous body of research has appeared on more eï¬ cient variants of attention to overcome these drawbacks (Tay, Dehghani, Bahri, et al. 2022), but often at the expense of the very properties that makes it eï¬ ective. As of yet, none of these variants have been shown to be empirically eï¬ ective at scale across domains.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 15, tokens None, triggered by: 0.09\n", + "\u001b[34mRecently, structured state space sequence models (SSMs) (Gu, Goel, and Ré 2022; Gu, Johnson, Goel, et al. 2021) have emerged as a promising class of architectures for sequence modeling. These models can be interpreted as a combination of recurrent neural networks (RNNs) and convolutional neural networks (CNNs), with inspiration from classical state space models (Kalman 1960). This class of models can be computed very eï¬ ciently as either a recurrence or convolution, with linear or near-linear scaling in sequence length.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 16, tokens None, triggered by: 0.28\n", + "\u001b[35mAdditionally, they have principled\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 17, tokens None, triggered by: 0.23\n", + "\u001b[31mEqual contribution.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 18, tokens None, triggered by: 0.07\n", + "\u001b[32m1\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 19, tokens None, triggered by: 0.15\n", + "\u001b[34mmechanisms for modeling long-range dependencies (Gu, Dao, et al. 2020) in certain data modalities, and have dominated benchmarks such as the Long Range Arena (Tay, Dehghani, Abnar, et al. 2021).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 20, tokens None, triggered by: 0.23\n", + "\u001b[35mMany ï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 21, tokens None, triggered by: 0.20\n", + "\u001b[31mavors of SSMs (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Y. Li et al. 2023; Ma et al. 2023; Orvieto et al. 2023; Smith, Warrington, and Linderman 2023) have been successful in domains involving continuous signal data such as audio and vision (Goel et al. 2022; Nguyen, Goel, et al. 
2022; Saon, Gupta, and Cui 2023).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 22, tokens None, triggered by: 0.20\n", + "\u001b[32mHowever, they have been less eï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 23, tokens None, triggered by: 0.24\n", + "\u001b[34mective at modeling discrete and information-dense data such as text. We propose a new class of selective state space models, that improves on prior work on several axes to achieve the modeling power of Transformers while scaling linearly in sequence length.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 24, tokens None, triggered by: 0.18\n", + "\u001b[35mSelection Mechanism.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 25, tokens None, triggered by: 0.07\n", + "\u001b[31mFirst, we identify a key limitation of prior models: the ability to eï¬ ciently select data in an input-dependent manner (i.e. focus on or ignore particular inputs). Building on intuition based on important synthetic tasks such as selective copy and induction heads, we design a simple selection mechanism by parameterizing the SSM parameters based on the input. This allows the model to ï¬ lter out irrelevant information and remember relevant information indeï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 26, tokens None, triggered by: 0.16\n", + "\u001b[32mnitely.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 27, tokens None, triggered by: 0.26\n", + "\u001b[34mHardware-aware Algorithm. 
This simple change poses a technical challenge for the computation of the model; in fact, all prior SSMs models must be time- and input-invariant in order to be computationally eï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 28, tokens None, triggered by: 0.18\n", + "\u001b[35mcient.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 29, tokens None, triggered by: 0.29\n", + "\u001b[31mWe overcome this with a hardware-aware algorithm that computes the model recurrently with a scan instead of convolution, but does not materialize the expanded state in order to avoid IO access between diï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 30, tokens None, triggered by: 0.28\n", + "\u001b[32merent levels of the GPU memory hierarchy.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 31, tokens None, triggered by: 0.12\n", + "\u001b[34mThe resulting implementation is faster than previous methods both in theory (scaling linearly in sequence length, compared to pseudo-linear for all convolution-based SSMs) and on modern hardware (up to 3à faster on A100 GPUs).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 32, tokens None, triggered by: 0.28\n", + "\u001b[35mArchitecture.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 33, tokens None, triggered by: 0.23\n", + "\u001b[31mWe simplify prior deep sequence model architectures by combining the design of prior SSM architectures (Dao, Fu, Saab, et al. 2023) with the MLP block of Transformers into a single block, leading to a simple and homogenous architecture design (Mamba) incorporating selective state spaces. Selective SSMs, and by extension the Mamba architecture, are fully recurrent models with key properties that make them suitable as the backbone of general foundation models operating on sequences. (i) High quality: selectivity brings strong performance on dense modalities such as language and genomics. (ii) Fast training and inference: computation and memory scales linearly in sequence length during training, and unrolling the model autoregressively during inference requires only constant time per step since it does not require a cache of previous elements. (iii) Long context: the quality and eï¬ ciency together yield performance improvements on real data up to sequence length 1M. We empirically validate Mambaâ s potential as a general sequence FM backbone, in both pretraining quality and domain-speciï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 34, tokens None, triggered by: 0.24\n", + "\u001b[32mc task performance, on several types of modalities and settings:\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 35, tokens None, triggered by: 0.19\n", + "\u001b[34mâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 36, tokens None, triggered by: 0.27\n", + "\u001b[35m¢ Synthetics. 
On important synthetic tasks such as copying and induction heads that have been proposed as being key to large language models, Mamba not only solves them easily but can extrapolate solutions indeï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 37, tokens None, triggered by: 0.20\n", + "\u001b[31mnitely long (>1M tokens).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 38, tokens None, triggered by: 0.24\n", + "\u001b[32mâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 39, tokens None, triggered by: 0.18\n", + "\u001b[34m¢ Audio and Genomics. Mamba out-performs prior state-of-the-art models such as SaShiMi, Hyena, and Transform- ers on modeling audio waveforms and DNA sequences, both in pretraining quality and downstream metrics (e.g. reducing FID on a challenging speech generation dataset by more than half). In both settings, its performance improves with longer context up to million-length sequences.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 40, tokens None, triggered by: 0.24\n", + "\u001b[35mâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 41, tokens None, triggered by: 0.15\n", + "\u001b[31m¢ Language Modeling.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 42, tokens None, triggered by: 0.10\n", + "\u001b[32mMamba is the ï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 43, tokens None, triggered by: 0.10\n", + "\u001b[34mrst linear-time sequence model that truly achieves Transformer-quality performance, both in pretraining perplexity and downstream evaluations. With scaling laws up to 1B parameters, we show that Mamba exceeds the performance of a large range of baselines, including very strong modern Transformer training recipes based on LLaMa (Touvron et al. 2023). Our Mamba language model has 5à generation throughput compared to Transformers of similar size, and Mamba-3Bâ s quality matches that of Transformers twice its size (e.g. 4 points higher avg. on common sense reasoning compared to Pythia-3B and even exceeding Pythia-7B). 
Model code and pre-trained checkpoints are open-sourced at https://github.com/state-spaces/mamba.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 44, tokens None, triggered by: 0.14\n", + "\u001b[35m2\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 45, tokens None, triggered by: 0.25\n", + "\u001b[31m# Selective State Space Model # with Hardware-aware State Expansion\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 46, tokens None, triggered by: 0.19\n", + "\u001b[32m# A\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 47, tokens None, triggered by: 0.29\n", + "\u001b[34mvuvy GPU SRAM Selection Mechanism es\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 48, tokens None, triggered by: 0.25\n", + "\u001b[35mSelection Mechanism\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 49, tokens None, triggered by: 0.28\n", + "\u001b[31mFigure 1: (Overview.) Structured SSMs independently map each channel (e.g. ð · = 5) of an input ð ¥ to output ð ¦ through a higher dimensional latent state â\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 50, tokens None, triggered by: 0.28\n", + "\u001b[32m(e.g. ð = 4).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 51, tokens None, triggered by: 0.16\n", + "\u001b[34mPrior SSMs avoid materializing this large effective state (ð ·ð , times batch size ð µ and sequence length ð ¿) through clever alternate computation paths requiring time-invariance: the (â , A, B, C) parameters are constant across time. Our selection mechanism adds back input-dependent dynamics, which also requires a careful hardware-aware algorithm to only materialize the expanded states in more efficient levels of the GPU memory hierarchy. # 2 State Space Models Structured state space sequence models (S4) are a recent class of sequence models for deep learning that are broadly related to RNNs, and CNNs, and classical state space models. 
They are inspired by a particular continuous system (1) that maps a 1-dimensional function or sequence ð\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 52, tokens None, triggered by: 0.27\n", + "\u001b[35m¥(ð ¡) â â â ¦ ð ¦(ð ¡) â â\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 53, tokens None, triggered by: 0.28\n", + "\u001b[31mthrough an implicit latent state â\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 54, tokens None, triggered by: 0.26\n", + "\u001b[32m(ð ¡) â â ð .\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 55, tokens None, triggered by: 0.15\n", + "\u001b[34mConcretely, S4 models are deï¬ ned with four parameters (â , A, B, C), which deï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 56, tokens None, triggered by: 0.18\n", + "\u001b[35mne a sequence-to-sequence trans- formation in two stages.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 57, tokens None, triggered by: 0.27\n", + "\u001b[31mâ â ²(ð ¡) = Aâ (ð ¡) + Bð ¥(ð ¡) ð ¦(ð ¡) = Câ (ð ¡) (1a) (1b) â\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 58, tokens None, triggered by: 0.27\n", + "\u001b[32mð\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 59, tokens None, triggered by: 0.27\n", + "\u001b[34m¡ = Aâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 60, tokens None, triggered by: 0.29\n", + "\u001b[35mð ¡â 1 + Bð ¥ð ¡ ð ¦ð ¡ = Câ ð ¡ (2a) (2b) ð ð ² = (Cð ©, Cð ¨ð ©, â ¦ , Cð ¨ ð ¦ = ð ¥ â ð ² ð ©, â ¦ ) (3a) (3b)\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 61, tokens None, triggered by: 0.22\n", + "\u001b[31mDiscretization.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 62, tokens None, triggered by: 0.25\n", + "\u001b[32mThe ï¬ rst stage transforms the â continuous parametersâ (â , A, B) to â discrete parametersâ (A, B) through ï¬ xed formulas A = ð\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 63, tokens None, triggered by: 0.28\n", + "\u001b[34mð ´(â , A) and B = ð ð µ(â , A, B), where the pair (ð ð ´, ð ð µ) is called a discretization rule. Various rules can be used such as the zero-order hold (ZOH) deï¬ ned in equation (4). A = exp(â A) B = (â A)â 1(exp(â A) â I) â â B (4) Discretization has deep connections to continuous-time systems which can endow them with additional properties such as resolution invariance (Nguyen, Goel, et al. 2022) and automatically ensuring that the model is properly normalized (Gu, Johnson, Timalsina, et al. 2023; Orvieto et al. 
2023).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 64, tokens None, triggered by: 0.26\n", + "\u001b[35mIt also has connections to gating mechanisms of RNNs (Gu, Gulcehre, et al. 2020; Tallec and Ollivier 2018) which we will revisit in Section 3.5.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 65, tokens None, triggered by: 0.29\n", + "\u001b[31mHowever, from a mechanical point of view discretization can simply be viewed as the ï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 66, tokens None, triggered by: 0.15\n", + "\u001b[32mrst step of the computation graph in the forward pass of an SSM.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 67, tokens None, triggered by: 0.22\n", + "\u001b[34mAlternate ï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 68, tokens None, triggered by: 0.21\n", + "\u001b[35mavors of SSMs can bypass the discretization step and parameterize (A, B) directly instead (Zhang et al. 2023), which may be easier to reason about. Computation. After the parameters have been transformed from (â , A, B, C) â ¦ (A, B, C), the model can be computed in two ways, either as a linear recurrence (2) or a global convolution (3).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 69, tokens None, triggered by: 0.24\n", + "\u001b[31m3\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 70, tokens None, triggered by: 0.25\n", + "\u001b[32mCommonly, the model uses the convolutional mode (3) for eï¬ cient parallelizable training (where the whole input sequence is seen ahead of time), and switched into recurrent mode (2) for eï¬ cient autoregressive inference (where the inputs are seen one timestep at a time).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 71, tokens None, triggered by: 0.20\n", + "\u001b[34mLinear Time Invariance (LTI). An important property of equations (1) to (3) is that the modelâ s dynamics are constant through time.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 72, tokens None, triggered by: 0.17\n", + "\u001b[35mIn other words (â , A, B, C), and consequently (A, B) as well, are ï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 73, tokens None, triggered by: 0.24\n", + "\u001b[31mxed for all time-steps.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 74, tokens None, triggered by: 0.28\n", + "\u001b[32mThis property is called linear time invariance (LTI), which is deeply connected to recurrence and convolutions. Informally, we think of LTI SSMs as being equivalent to any linear recurrence (2a) or convolution (3b), and use LTI as an umbrella term for these classes of models. Thus far, all structured SSMs have been LTI (e.g. 
computed as convolutions) because of fundamental eï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 75, tokens None, triggered by: 0.23\n", + "\u001b[34mciency constraints, discussed in Section 3.3. However, a core insight of this work is that LTI models have fundamental limitations in modeling certain types of data, and our technical contributions involve removing the LTI constraint while overcoming the eï¬ ciency bottlenecks.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 76, tokens None, triggered by: 0.29\n", + "\u001b[35mStructure and Dimensions. Finally, we note that structured SSMs are so named because computing them eï¬ ciently also requires imposing structure on the A matrix. The most popular form of structure is diagonal (Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Smith, Warrington, and Linderman 2023), which we also use.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 77, tokens None, triggered by: 0.26\n", + "\u001b[31mIn this case, the A â â\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 78, tokens None, triggered by: 0.26\n", + "\u001b[32mð à ð , B â â ð à 1, C â â 1à ð matrices can all be represented by ð\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 79, tokens None, triggered by: 0.18\n", + "\u001b[34mnumbers.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 80, tokens None, triggered by: 0.25\n", + "\u001b[35mTo operate over an input sequence ð ¥ of batch size ð µ and length ð ¿ with ð\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 81, tokens None, triggered by: 0.28\n", + "\u001b[31m· channels, the SSM is applied independently to each channel.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 82, tokens None, triggered by: 0.27\n", + "\u001b[32mNote that in this case, the total hidden state has dimension ð ·ð per input, and computing it over the sequence length requires ð (ð µð\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 83, tokens None, triggered by: 0.27\n", + "\u001b[34m¿ð ·ð\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 84, tokens None, triggered by: 0.28\n", + "\u001b[35m) time and memory; this is the root of the fundamental eï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 85, tokens None, triggered by: 0.20\n", + "\u001b[31mciency bottleneck addressed in Section 3.3.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 86, tokens None, triggered by: 0.19\n", + "\u001b[32mGeneral State Space Models. 
We note that the term state space model has a very broad meaning which simply represents the notion of any recurrent process with a latent state.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 87, tokens None, triggered by: 0.23\n", + "\u001b[34mIt has been used to refer to many disparate concepts in diï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 88, tokens None, triggered by: 0.15\n", + "\u001b[35merent disciplines, including Markov decision processes (MDP) (reinforcement learning (Hafner et al. 2020)), dynamic causal modeling (DCM) (computational neuroscience (Friston, Harrison, and Penny 2003)), Kalman ï¬ lters (controls (Kalman 1960)), hidden Markov models (HMM) and linear dynamical systems (LDS) (machine learning), and recurrent (and sometimes convolutional) models at large (deep learning).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 89, tokens None, triggered by: 0.26\n", + "\u001b[31mThroughout this entire paper we use the term â\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 90, tokens None, triggered by: 0.13\n", + "\u001b[32mSSMâ to refer exclusively to the class of structured SSMs or S4 models (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Hasani et al. 2023; Ma et al. 2023; Smith, Warrington, and Linderman 2023) and use these terms interchangeably. For convenience we may also include derivatives of such models, such as those focusing on either the linear-recurrence or global-convolution viewpoints (Y. Li et al. 2023; Orvieto et al. 2023; Poli et al. 2023), and clarify nuances when necessary. SSM Architectures. SSMs are standalone sequence transformations that can be incorporated into end-to-end neural network architectures. (We also sometimes call SSM architectures SSNNs, which are to SSM layers as CNNs are to linear convolution layers.) We discuss some of the most well-known SSM architectures, many of which will also serve as our primary baselines.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 91, tokens None, triggered by: 0.12\n", + "\u001b[34mâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 92, tokens None, triggered by: 0.12\n", + "\u001b[35m¢ Linear attention (Katharopoulos et al. 2020) is an approximation of self-attention involving a recurrence which can be viewed as a degenerate linear SSM.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 93, tokens None, triggered by: 0.28\n", + "\u001b[31mâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 94, tokens None, triggered by: 0.23\n", + "\u001b[32m¢ H3 (Dao, Fu, Saab, et al. 2023) generalized this recurrence to use S4; it can be viewed as an architecture with an SSM sandwiched by two gated connections (Figure 3). 
H3 also inserts a standard local convolution, which they frame as a shift-SSM, before the main SSM layer.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 95, tokens None, triggered by: 0.12\n", + "\u001b[34mâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 96, tokens None, triggered by: 0.12\n", + "\u001b[35m¢ Hyena (Poli et al. 2023) uses the same architecture as H3 but replaces the S4 layer with an MLP-parameterized global convolution (Romero et al. 2021).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 97, tokens None, triggered by: 0.23\n", + "\u001b[31mâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 98, tokens None, triggered by: 0.18\n", + "\u001b[32m¢ RetNet (Y.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 99, tokens None, triggered by: 0.07\n", + "\u001b[34mSun et al. 2023) adds an additional gate to the architecture and uses a simpler SSM, allowing an alternative parallelizable computation path, using a variant of multi-head attention (MHA) instead of convolutions.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 100, tokens None, triggered by: 0.25\n", + "\u001b[35m4 â\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 101, tokens None, triggered by: 0.19\n", + "\u001b[31m¢ RWKV (B.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 102, tokens None, triggered by: 0.11\n", + "\u001b[32mPeng et al. 2023) is a recent RNN designed for language modeling based on another linear attention approximation (attention-free Transformer (S. Zhai et al. 2021)).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 103, tokens None, triggered by: 0.15\n", + "\u001b[34mIts main â WKVâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 104, tokens None, triggered by: 0.20\n", + "\u001b[35mmechanism involves LTI recurrences and can be viewed as the ratio of two SSMs. Other closely related SSMs and architectures are discussed further in an extended related work (Appendix B). We highlight in particular S5 (Smith, Warrington, and Linderman 2023), QRNN (Bradbury et al. 2016), and SRU (Lei et al. 2017), which we view as the most closely related methods to our core selective SSM. # 3 Selective State Space Models We motivate our selection mechanism using intuition from synthetic tasks (Section 3.1), then explain how to incorporate this mechanism into state space models (Section 3.2). 
The resulting time-varying SSMs cannot use convolutions, presenting a technical challenge of how to compute them eï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 105, tokens None, triggered by: 0.28\n", + "\u001b[31mciently.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 106, tokens None, triggered by: 0.25\n", + "\u001b[32mWe overcome this with a hardware-aware algorithm that exploits the memory hierarchy on modern hardware (Section 3.3). We then describe a simple SSM architecture without attention or even MLP blocks (Section 3.4). Finally, we discuss some additional properties of selection mechanisms (Section 3.5).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 107, tokens None, triggered by: 0.21\n", + "\u001b[34m# 3.1 Motivation:\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 108, tokens None, triggered by: 0.10\n", + "\u001b[35mSelection as a Means of Compression We argue that a fundamental problem of sequence modeling is compressing context into a smaller state.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 109, tokens None, triggered by: 0.20\n", + "\u001b[31mIn fact, we can view the tradeoï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 110, tokens None, triggered by: 0.21\n", + "\u001b[32ms of popular sequence models from this point of view.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 111, tokens None, triggered by: 0.28\n", + "\u001b[34mFor example, attention is both eï¬ ective and ineï¬ cient because it explicitly does not compress context at all. This can be seen from the fact that autoregressive inference requires explicitly storing the entire context (i.e. the KV cache), which directly causes the slow linear-time inference and quadratic-time training of Transformers. 
On the other hand, recurrent models are eï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 112, tokens None, triggered by: 0.24\n", + "\u001b[35mcient because they have a ï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 113, tokens None, triggered by: 0.16\n", + "\u001b[31mnite state, implying constant-time inference and linear-time training.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 114, tokens None, triggered by: 0.27\n", + "\u001b[32mHowever, their eï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 115, tokens None, triggered by: 0.26\n", + "\u001b[34mectiveness is limited by how well this state has compressed the context.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 116, tokens None, triggered by: 0.12\n", + "\u001b[35mTo understand this principle, we focus on two running examples of synthetic tasks (Figure 2).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 117, tokens None, triggered by: 0.25\n", + "\u001b[31mâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 118, tokens None, triggered by: 0.28\n", + "\u001b[32m¢ The Selective Copying task modiï¬ es the popular Copying task (Arjovsky, Shah, and Bengio 2016) by varying the position of the tokens to memorize. It requires content-aware reasoning to be able to memorize the relevant tokens (colored) and ï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 119, tokens None, triggered by: 0.20\n", + "\u001b[34mlter out the irrelevant ones (white).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 120, tokens None, triggered by: 0.12\n", + "\u001b[35mâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 121, tokens None, triggered by: 0.24\n", + "\u001b[31m¢ The Induction Heads task is a well-known mechanism hypothesized to explain the majority of in-context learning abilities of LLMs (Olsson et al. 2022). It requires context-aware reasoning to know when to produce the correct output in the appropriate context (black). These tasks reveal the failure mode of LTI models. From the recurrent view, their constant dynamics (e.g. the (A, B) transitions in (2)) cannot let them select the correct information from their context, or aï¬ ect the hidden state passed along the sequence an in input-dependent way. From the convolutional view, it is known that global convolutions can solve the vanilla Copying task (Romero et al. 2021) because it only requires time-awareness, but that they have diï¬ culty with the Selective Copying task because of lack of content-awareness (Figure 2). 
More concretely, the spacing between inputs-to-outputs is varying and cannot be modeled by static convolution kernels.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 122, tokens None, triggered by: 0.30\n", + "\u001b[32mIn summary, the eï¬ ciency vs. eï¬ ectiveness tradeoï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 123, tokens None, triggered by: final split\n", + "\u001b[34mof sequence models is characterized by how well they compress their state: eï¬ cient models must have a small state, while eï¬ ective models must have a state that contains all necessary information from the context. In turn, we propose that a fundamental principle for building sequence models is selectivity: or the context-aware ability to focus on or ï¬ lter out inputs into a sequential state. In particular, a selection mechanism controls how information propagates or interacts along the sequence dimension (see Section 3.5 for more discussion). # Improving SSMs with Selection One method of incorporating a selection mechanism into models is by letting their parameters that aï¬ ect interactions along the sequence (e.g. the recurrent dynamics of an RNN or the c\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n" + ] + } + ], "source": [ "chunker.print(chunks[0])" ] diff --git a/docs/02-chunkers-async.ipynb b/docs/02-chunkers-async.ipynb new file mode 100644 index 0000000..3a5dd9f --- /dev/null +++ b/docs/02-chunkers-async.ipynb @@ -0,0 +1,1622 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/aurelio-labs/semantic-chunkers/blob/main/docs/00-chunkers-intro.ipynb) [![Open nbviewer](https://raw.githubusercontent.com/pinecone-io/examples/master/assets/nbviewer-shield.svg)](https://nbviewer.org/github/aurelio-labs/semantic-chunkers/blob/main/docs/00-chunkers-intro.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "iFgZNmSH2Dee", + "outputId": "45754137-cb9c-4e85-9dbc-e139c8a2c9bb" + }, + "outputs": [], + "source": [ + "!pip install -qU \\\n", + " semantic-chunkers \\\n", + " datasets==2.19.1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# An Async version of Semantic Chunkers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Semantic chunkers allow us to build more context aware chunks of information. We can use this for RAG, splitting video, audio, and much more.\n", + "\n", + "In this example, we will stick with a simple RAG-focused example. We will learn about three different types of chunkers available to us; `StatisticalChunker`, `ConsecutiveChunker`, and `CumulativeChunker`. To begin, we need some data." 
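The intro above names three chunkers, but the cells that follow exercise only `StatisticalChunker`. As a point of reference, here is a minimal sketch of driving all three over the same document. The `StatisticalChunker` and `CumulativeChunker` constructor calls mirror ones used elsewhere in this patch; the `ConsecutiveChunker(encoder=..., score_threshold=...)` signature is an assumption modeled on `CumulativeChunker` and should be checked against the semantic-chunkers docs.

```python
# A minimal sketch, not part of the notebook: run the three chunkers named
# above over one document and collect their chunks side by side.
# Assumes the OpenAI encoder configured later in this notebook; the
# ConsecutiveChunker arguments are an assumption modeled on CumulativeChunker.
from semantic_router.encoders import OpenAIEncoder
from semantic_chunkers import ConsecutiveChunker, CumulativeChunker, StatisticalChunker

encoder = OpenAIEncoder(name="text-embedding-3-small")

chunkers = {
    "statistical": StatisticalChunker(encoder=encoder),
    "consecutive": ConsecutiveChunker(encoder=encoder, score_threshold=0.3),  # assumed signature
    "cumulative": CumulativeChunker(encoder=encoder, score_threshold=0.3),
}


def chunk_with_all(text: str) -> dict:
    # Each chunker returns one list of chunks per input doc; we pass a single
    # doc, so index [0] gives that document's chunks.
    return {name: chunker(docs=[text])[0] for name, chunker in chunkers.items()}
```

A single helper like this makes it easier to eyeball how the three strategies split the same text before committing to one for a RAG pipeline.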
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 341, + "referenced_widgets": [ + "1180440b21b74b3da04a8a876a135658", + "8489e5ef0f384d6480e8e2c5c60fdd1d", + "52661abd5e3c460eb54489d1e3515d69", + "f3c981b961eb4185b995dc4e3b5d3919", + "b987413b65ad4b10a7f7759c9dcc1c2f", + "75159166d8254cde933b4131e4627b8f", + "d5c5e31059e94176be64f32dd2496164", + "b59d191a2c0e498e80494f9384de7dfb", + "e57af473700d4bc2bbea39eca36ece8c", + "be2cbe5e3ea94a9f81b9e370957e63b7", + "1707a76c176a4e1d96baa4a4ed9e7d99", + "8b640a38e82e4f0598fa6ed382888499", + "3321078273154d2bb3b6a9189d97806c", + "ce12ce0393464fd7a533123d3c71a3e6", + "0ec1edf815164f5f82001efa06857553", + "f0f1642352f14137a19ee27513f18ca6", + "d51e32f5700b40cea7512aded4d6d019", + "a27eb006ba8c4d69a70d8be4e3c280a2", + "46577f9ba4e142009d0aebb6741d6e84", + "e430ed4d66604027812b1e39e1fc000e", + "272c61542ae8454b9eb616e5e2a858e3", + "2a19c3d693834de786396120d19a65b3", + "c59101e643d34bc5a4ad4fa664064614", + "7b3688583754445ab07bf28254fcc97d", + "f7870f32ab2d4d938fc5ba85aecf7f51", + "bae4b7ef05ea49b6bdb2a0e369e6768b", + "20740edade1a44c29f4d4094ed487f00", + "1bc01711294a49b4a81475e547874514", + "4b5d04efea944bbc8659b4f56f22ed69", + "1d5846c892724b8cab97693ec1c1cd33", + "f4e26036353d452885f364eb8c2b241e", + "29f5562abb0e4383956f18652833defe", + "b6fc89b557b44064b2e17829347bc951" + ] + }, + "id": "aTN4gsdl2WBQ", + "outputId": "bd606fad-8214-4fd4-cad1-54bb86234575" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'content', 'references'],\n", + " num_rows: 2673\n", + "})" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from datasets import load_dataset\n", + "\n", + "data = load_dataset(\"jamescalam/ai-arxiv2\", split=\"train\")\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "nNZSP8iL2dDB", + "outputId": "9615cc01-27f5-4bdd-9cc8-54f7308bea72" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# Mamba: Linear-Time Sequence Modeling with Selective State Spaces\n", + "# Albert Gu*1 and Tri Dao*2\n", + "1Machine Learning Department, Carnegie Mellon University 2Department of Computer Science, Princeton University agu@cs.cmu.edu, tri@tridao.me\n", + "# Abstract\n", + "Foundation models, now powering most of the exciting applications in deep learning, are almost universally based on the Transformer architecture and its core attention module. Many subquadratic-time architectures such as linear attention, gated convolution and recurrent models, and structured state space models (SSMs) have been developed to address Transformers’ computational inefficiency on long sequences, but they have not performed as well as attention on important modalities such as language. We identify that a key weakness of such models is their inability to perform content-based reasoning, and make several improvements. 
First, simply letting the SSM parameters be functions of the input addresses their weakness with discrete modalities\n" + ] + } + ], + "source": [ + "content = data[3][\"content\"]\n", + "print(content[:1000])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will keep a smaller section of content to speed up (and limit cost) for the examples." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "content = content[:20_000]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v4f7BIUT2trY" + }, + "source": [ + "We will experiment with different semantic chunking methods on the above text. Every chunker requires an _encoder_ for which we can choose from open source encoders via `HuggingfaceEncoder` or `FastembedEncoder`, and proprietary API encoders like `OpenAIEncoder` or `CohereEncoder`.\n", + "\n", + "We will use the `OpenAIEncoder` with `text-embedding-3-small`:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "id": "Mqnc35w85A8L" + }, + "outputs": [], + "source": [ + "import os\n", + "from getpass import getpass\n", + "from semantic_router.encoders import OpenAIEncoder\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = os.getenv(\"OPENAI_API_KEY\") or getpass(\n", + " \"OpenAI API key: \"\n", + ")\n", + "\n", + "encoder = OpenAIEncoder(name=\"text-embedding-3-small\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Statistical Chunking" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The statistical chunking method our most robust chunking method, it uses a varying similarity threshold to identify more dynamic and local similarity splits. It offers a good balance between accuracy and efficiency _but_ can only be used for text documents (unlike the multi-modal `ConsecutiveChunker`).\n", + "\n", + "The `StatisticalChunker` can automatically identify a good threshold value to use while chunking our text, so it tends to require less customization than our other chunkers." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "from semantic_chunkers import StatisticalChunker\n", + "\n", + "chunker = StatisticalChunker(encoder=encoder)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m2024-07-03 16:41:12 INFO semantic_chunkers.utils.logger Single document exceeds the maximum token limit of 300. Splitting to sentences before semantically merging.\u001b[0m\n", + "100%|██████████| 6/6 [00:05<00:00, 1.08it/s]\n", + "\u001b[32m2024-07-03 16:41:17 INFO semantic_chunkers.utils.logger Single document exceeds the maximum token limit of 300. Splitting to sentences before semantically merging.\u001b[0m\n" + ] + } + ], + "source": [ + "chunks_async = await chunker.acall(docs=[content])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sync call\n", + "chunks = chunker(docs=[content])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print and compare sync and async chunks." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
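The `acall` cell above relies on notebook-style top-level `await`. As a minimal sketch (not part of the original notebook) of how the same async interface could be driven from a plain Python script, something like the following would work; the wrapper name `run_async_chunking` is made up for illustration, and the `encoder` and `content` objects are assumed to be defined as earlier.

```python
import asyncio

from semantic_chunkers import StatisticalChunker


async def run_async_chunking(encoder, docs):
    # Illustrative wrapper: build the chunker and await its async entry point.
    chunker = StatisticalChunker(encoder=encoder)
    return await chunker.acall(docs=docs)


# In a script there is no running event loop, so asyncio.run drives the coroutine:
# chunks_async = asyncio.run(run_async_chunking(encoder, [content]))
```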

[Rendered HTML output omitted: two columns headed "Synchronous Chunks" and "Asynchronous Chunks", each listing the same color-coded chunks of the Mamba paper; the chunk text itself is reproduced in the `chunker.print` output below.]

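Rather than only eyeballing the two rendered columns, a short check along these lines could confirm that the synchronous and asynchronous chunkers agree. This is a sketch, not part of the original notebook; it only uses the `chunks`, `chunks_async`, and `chunk.splits` objects already shown above.

```python
# Sketch: verify the sync and async chunk sets produced above are equivalent.
assert len(chunks[0]) == len(chunks_async[0]), "different number of chunks"

for i, (sync_chunk, async_chunk) in enumerate(zip(chunks[0], chunks_async[0])):
    sync_text = "".join(sync_chunk.splits)
    async_text = "".join(async_chunk.splits)
    if sync_text != async_text:
        print(f"Chunk {i} differs:")
        print(f"  sync : {sync_text[:80]}...")
        print(f"  async: {async_text[:80]}...")
```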
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import random\n", + "from IPython.display import display, HTML\n", + "\n", + "# Predefined list of colors\n", + "colors = [\"000000\", \"FF0000\", \"800080\", \"008000\", \"0000FF\"]\n", + "\n", + "html_str = '
'\n", + "html_str += '
'\n", + "html_str += \"

Synchronous Chunks

\"\n", + "for chunk in chunks[0]:\n", + " chunk_text = \"\"\n", + " for split in chunk.splits:\n", + " chunk_text += split\n", + " color = random.choice(colors)\n", + " html_str += f'

{chunk_text}

'\n", + "html_str += \"
\"\n", + "html_str += '
'\n", + "html_str += \"

Asynchronous Chunks

\"\n", + "for chunk in chunks_async[0]:\n", + " chunk_text = \"\"\n", + " try:\n", + " for split in chunk.splits:\n", + " chunk_text += split\n", + " except AttributeError:\n", + " print(f\"Error Chunk: {chunk}\")\n", + " color = random.choice(colors)\n", + " html_str += f'

{chunk_text}

'\n", + "html_str += \"
\"\n", + "html_str += \"
\"\n", + "\n", + "# Display the HTML\n", + "display(HTML(html_str))" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Split 1, tokens 300, triggered by: token limit\n", + "\u001b[31m# Mamba: Linear-Time Sequence Modeling with Selective State Spaces # Albert Gu*1 and Tri Dao*2 1Machine Learning Department, Carnegie Mellon University 2Department of Computer Science, Princeton University agu@cs.cmu.edu, tri@tridao.me # Abstract Foundation models, now powering most of the exciting applications in deep learning, are almost universally based on the Transformer architecture and its core attention module. Many subquadratic-time architectures such as linear attention, gated convolution and recurrent models, and structured state space models (SSMs) have been developed to address Transformersâ computational ineï¬ ciency on long sequences, but they have not performed as well as attention on important modalities such as language. We identify that a key weakness of such models is their inability to perform content-based reasoning, and make several improvements. First, simply letting the SSM parameters be functions of the input addresses their weakness with discrete modalities, allowing the model to selectively propagate or forget information along the sequence length dimension depending on the current token. Second, even though this change prevents the use of eï¬ cient convolutions, we design a hardware-aware parallel algorithm in recurrent mode. We integrate these selective SSMs into a simpliï¬ ed end-to-end neural network architecture without attention or even MLP blocks (Mamba). Mamba enjoys fast inference (5à higher throughput than Transformers) and linear scaling in sequence length, and its performance improves on real data up to million-length sequences.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 2, tokens 300, triggered by: token limit\n", + "\u001b[32mAs a general sequence model backbone, Mamba achieves state-of-the-art performance across several modalities such as language, audio, and genomics. On language modeling, our Mamba-3B model outperforms Transformers of the same size and matches Transformers twice its size, both in pretraining and downstream evaluation. # 1 Introduction Foundation models (FMs), or large models pretrained on massive data then adapted for downstream tasks, have emerged as an eï¬ ective paradigm in modern machine learning. The backbone of these FMs are often sequence models, operating on arbitrary sequences of inputs from a wide variety of domains such as language, images, speech, audio, time series, and genomics (Brown et al. 2020; Dosovitskiy et al. 2020; Ismail Fawaz et al. 2019; Oord et al. 2016; Poli et al. 2023; Sutskever, Vinyals, and Quoc V Le 2014). While this concept is agnostic to a particular choice of model architecture, modern FMs are predominantly based on a single type of sequence model: the Transformer (Vaswani et al. 
2017) and its core attention layer (Bahdanau, Cho, and Bengio 2015) The eï¬ cacy of self-attention is attributed to its ability to route information densely within a context window, allowing it to model complex data.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 3, tokens 112, triggered by: 0.32\n", + "\u001b[34mHowever, this property brings fundamental drawbacks: an inability to model anything outside of a ï¬ nite window, and quadratic scaling with respect to the window length. An enormous body of research has appeared on more eï¬ cient variants of attention to overcome these drawbacks (Tay, Dehghani, Bahri, et al. 2022), but often at the expense of the very properties that makes it eï¬ ective. As of yet, none of these variants have been shown to be empirically eï¬ ective at scale across domains.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 4, tokens 121, triggered by: 0.21\n", + "\u001b[35mRecently, structured state space sequence models (SSMs) (Gu, Goel, and Ré 2022; Gu, Johnson, Goel, et al. 2021) have emerged as a promising class of architectures for sequence modeling. These models can be interpreted as a combination of recurrent neural networks (RNNs) and convolutional neural networks (CNNs), with inspiration from classical state space models (Kalman 1960). This class of models can be computed very eï¬ ciently as either a recurrence or convolution, with linear or near-linear scaling in sequence length.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 5, tokens 256, triggered by: 0.26\n", + "\u001b[31mAdditionally, they have principled Equal contribution. 1 mechanisms for modeling long-range dependencies (Gu, Dao, et al. 2020) in certain data modalities, and have dominated benchmarks such as the Long Range Arena (Tay, Dehghani, Abnar, et al. 2021). Many ï¬ avors of SSMs (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Y. Li et al. 2023; Ma et al. 2023; Orvieto et al. 2023; Smith, Warrington, and Linderman 2023) have been successful in domains involving continuous signal data such as audio and vision (Goel et al. 2022; Nguyen, Goel, et al. 2022; Saon, Gupta, and Cui 2023). However, they have been less eï¬ ective at modeling discrete and information-dense data such as text. We propose a new class of selective state space models, that improves on prior work on several axes to achieve the modeling power of Transformers while scaling linearly in sequence length.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 6, tokens 240, triggered by: 0.28\n", + "\u001b[32mSelection Mechanism. First, we identify a key limitation of prior models: the ability to eï¬ ciently select data in an input-dependent manner (i.e. focus on or ignore particular inputs). Building on intuition based on important synthetic tasks such as selective copy and induction heads, we design a simple selection mechanism by parameterizing the SSM parameters based on the input. This allows the model to ï¬ lter out irrelevant information and remember relevant information indeï¬ nitely. Hardware-aware Algorithm. 
This simple change poses a technical challenge for the computation of the model; in fact, all prior SSMs models must be time- and input-invariant in order to be computationally eï¬ cient. We overcome this with a hardware-aware algorithm that computes the model recurrently with a scan instead of convolution, but does not materialize the expanded state in order to avoid IO access between diï¬ erent levels of the GPU memory hierarchy. The resulting implementation is faster than previous methods both in theory (scaling linearly in sequence length, compared to pseudo-linear for all convolution-based SSMs) and on modern hardware (up to 3à faster on A100 GPUs).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 7, tokens 196, triggered by: token limit\n", + "\u001b[34mArchitecture. We simplify prior deep sequence model architectures by combining the design of prior SSM architectures (Dao, Fu, Saab, et al. 2023) with the MLP block of Transformers into a single block, leading to a simple and homogenous architecture design (Mamba) incorporating selective state spaces. Selective SSMs, and by extension the Mamba architecture, are fully recurrent models with key properties that make them suitable as the backbone of general foundation models operating on sequences. (i) High quality: selectivity brings strong performance on dense modalities such as language and genomics. (ii) Fast training and inference: computation and memory scales linearly in sequence length during training, and unrolling the model autoregressively during inference requires only constant time per step since it does not require a cache of previous elements. (iii) Long context: the quality and eï¬ ciency together yield performance improvements on real data up to sequence length 1M.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 8, tokens 179, triggered by: 0.33\n", + "\u001b[35mWe empirically validate Mambaâ s potential as a general sequence FM backbone, in both pretraining quality and domain-speciï¬ c task performance, on several types of modalities and settings: â ¢ Synthetics. On important synthetic tasks such as copying and induction heads that have been proposed as being key to large language models, Mamba not only solves them easily but can extrapolate solutions indeï¬ nitely long (>1M tokens). â ¢ Audio and Genomics. Mamba out-performs prior state-of-the-art models such as SaShiMi, Hyena, and Transform- ers on modeling audio waveforms and DNA sequences, both in pretraining quality and downstream metrics (e.g. reducing FID on a challenging speech generation dataset by more than half). In both settings, its performance improves with longer context up to million-length sequences. â ¢ Language Modeling.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 9, tokens 165, triggered by: 0.21\n", + "\u001b[31mMamba is the ï¬ rst linear-time sequence model that truly achieves Transformer-quality performance, both in pretraining perplexity and downstream evaluations. With scaling laws up to 1B parameters, we show that Mamba exceeds the performance of a large range of baselines, including very strong modern Transformer training recipes based on LLaMa (Touvron et al. 2023). 
Our Mamba language model has 5à generation throughput compared to Transformers of similar size, and Mamba-3Bâ s quality matches that of Transformers twice its size (e.g. 4 points higher avg. on common sense reasoning compared to Pythia-3B and even exceeding Pythia-7B). Model code and pre-trained checkpoints are open-sourced at https://github.com/state-spaces/mamba.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 10, tokens 238, triggered by: 0.26\n", + "\u001b[32m2 # Selective State Space Model # with Hardware-aware State Expansion # A vuvy GPU SRAM Selection Mechanism es Selection Mechanism Figure 1: (Overview.) Structured SSMs independently map each channel (e.g. ð · = 5) of an input ð ¥ to output ð ¦ through a higher dimensional latent state â (e.g. ð = 4). Prior SSMs avoid materializing this large effective state (ð ·ð , times batch size ð µ and sequence length ð ¿) through clever alternate computation paths requiring time-invariance: the (â , A, B, C) parameters are constant across time. Our selection mechanism adds back input-dependent dynamics, which also requires a careful hardware-aware algorithm to only materialize the expanded states in more efficient levels of the GPU memory hierarchy. # 2 State Space Models Structured state space sequence models (S4) are a recent class of sequence models for deep learning that are broadly related to RNNs, and CNNs, and classical state space models. They are inspired by a particular continuous system (1) that maps a 1-dimensional function or sequence ð\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 11, tokens 83, triggered by: token limit\n", + "\u001b[34m¥(ð ¡) â â â ¦ ð ¦(ð ¡) â â through an implicit latent state â (ð ¡) â â ð . Concretely, S4 models are deï¬ ned with four parameters (â , A, B, C), which deï¬ ne a sequence-to-sequence trans- formation in two stages. â â ²(ð ¡) = Aâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 12, tokens 105, triggered by: 0.46\n", + "\u001b[35m(ð ¡) + Bð ¥(ð ¡) ð ¦(ð ¡) = Câ (ð ¡) (1a) (1b) â ð ¡ = Aâ ð ¡â 1 + Bð ¥ð ¡ ð ¦ð ¡ = Câ ð ¡ (2a) (2b) ð ð ² = (Cð ©, Cð ¨ð ©, â ¦ , Cð ¨ ð ¦ = ð ¥ â ð ² ð ©, â\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 13, tokens 112, triggered by: 0.41\n", + "\u001b[31m¦ ) (3a) (3b) Discretization. The ï¬ rst stage transforms the â continuous parametersâ (â , A, B) to â discrete parametersâ (A, B) through ï¬ xed formulas A = ð ð ´(â , A) and B = ð ð µ(â , A, B), where the pair (ð ð ´, ð ð µ) is called a discretization rule. Various rules can be used such as the zero-order hold (ZOH) deï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 14, tokens 109, triggered by: 0.24\n", + "\u001b[32mned in equation (4). A = exp(â A) B = (â A)â 1(exp(â A) â I) â â B (4) Discretization has deep connections to continuous-time systems which can endow them with additional properties such as resolution invariance (Nguyen, Goel, et al. 2022) and automatically ensuring that the model is properly normalized (Gu, Johnson, Timalsina, et al. 2023; Orvieto et al. 
2023).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 15, tokens 49, triggered by: token limit\n", + "\u001b[34mIt also has connections to gating mechanisms of RNNs (Gu, Gulcehre, et al. 2020; Tallec and Ollivier 2018) which we will revisit in Section 3.5.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 16, tokens 225, triggered by: 0.29\n", + "\u001b[35mHowever, from a mechanical point of view discretization can simply be viewed as the ï¬ rst step of the computation graph in the forward pass of an SSM. Alternate ï¬ avors of SSMs can bypass the discretization step and parameterize (A, B) directly instead (Zhang et al. 2023), which may be easier to reason about. Computation. After the parameters have been transformed from (â , A, B, C) â ¦ (A, B, C), the model can be computed in two ways, either as a linear recurrence (2) or a global convolution (3). 3 Commonly, the model uses the convolutional mode (3) for eï¬ cient parallelizable training (where the whole input sequence is seen ahead of time), and switched into recurrent mode (2) for eï¬ cient autoregressive inference (where the inputs are seen one timestep at a time). Linear Time Invariance (LTI). An important property of equations (1) to (3) is that the modelâ s dynamics are constant through time.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 17, tokens 182, triggered by: 0.33\n", + "\u001b[31mIn other words (â , A, B, C), and consequently (A, B) as well, are ï¬ xed for all time-steps. This property is called linear time invariance (LTI), which is deeply connected to recurrence and convolutions. Informally, we think of LTI SSMs as being equivalent to any linear recurrence (2a) or convolution (3b), and use LTI as an umbrella term for these classes of models. Thus far, all structured SSMs have been LTI (e.g. computed as convolutions) because of fundamental eï¬ ciency constraints, discussed in Section 3.3. However, a core insight of this work is that LTI models have fundamental limitations in modeling certain types of data, and our technical contributions involve removing the LTI constraint while overcoming the eï¬ ciency bottlenecks.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 18, tokens 141, triggered by: 0.24\n", + "\u001b[32mStructure and Dimensions. Finally, we note that structured SSMs are so named because computing them eï¬ ciently also requires imposing structure on the A matrix. The most popular form of structure is diagonal (Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Smith, Warrington, and Linderman 2023), which we also use. In this case, the A â â ð à ð , B â â ð à 1, C â â 1à ð matrices can all be represented by ð numbers. To operate over an input sequence ð ¥ of batch size ð µ and length ð ¿ with ð\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 19, tokens 236, triggered by: 0.27\n", + "\u001b[34m· channels, the SSM is applied independently to each channel. 
Note that in this case, the total hidden state has dimension ð ·ð per input, and computing it over the sequence length requires ð (ð µð ¿ð ·ð ) time and memory; this is the root of the fundamental eï¬ ciency bottleneck addressed in Section 3.3. General State Space Models. We note that the term state space model has a very broad meaning which simply represents the notion of any recurrent process with a latent state. It has been used to refer to many disparate concepts in diï¬ erent disciplines, including Markov decision processes (MDP) (reinforcement learning (Hafner et al. 2020)), dynamic causal modeling (DCM) (computational neuroscience (Friston, Harrison, and Penny 2003)), Kalman ï¬ lters (controls (Kalman 1960)), hidden Markov models (HMM) and linear dynamical systems (LDS) (machine learning), and recurrent (and sometimes convolutional) models at large (deep learning). Throughout this entire paper we use the term â\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 20, tokens 3, triggered by: token limit\n", + "\u001b[35mSSMâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 21, tokens 226, triggered by: 0.22\n", + "\u001b[31mto refer exclusively to the class of structured SSMs or S4 models (Gu, Goel, and Ré 2022; Gu, Gupta, et al. 2022; Gupta, Gu, and Berant 2022; Hasani et al. 2023; Ma et al. 2023; Smith, Warrington, and Linderman 2023) and use these terms interchangeably. For convenience we may also include derivatives of such models, such as those focusing on either the linear-recurrence or global-convolution viewpoints (Y. Li et al. 2023; Orvieto et al. 2023; Poli et al. 2023), and clarify nuances when necessary. SSM Architectures. SSMs are standalone sequence transformations that can be incorporated into end-to-end neural network architectures. (We also sometimes call SSM architectures SSNNs, which are to SSM layers as CNNs are to linear convolution layers.) We discuss some of the most well-known SSM architectures, many of which will also serve as our primary baselines.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 22, tokens 158, triggered by: 0.32\n", + "\u001b[32mâ ¢ Linear attention (Katharopoulos et al. 2020) is an approximation of self-attention involving a recurrence which can be viewed as a degenerate linear SSM. â ¢ H3 (Dao, Fu, Saab, et al. 2023) generalized this recurrence to use S4; it can be viewed as an architecture with an SSM sandwiched by two gated connections (Figure 3). H3 also inserts a standard local convolution, which they frame as a shift-SSM, before the main SSM layer. â ¢ Hyena (Poli et al. 2023) uses the same architecture as H3 but replaces the S4 layer with an MLP-parameterized global convolution (Romero et al. 2021). â\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 23, tokens 106, triggered by: 0.26\n", + "\u001b[34m¢ RetNet (Y. Sun et al. 2023) adds an additional gate to the architecture and uses a simpler SSM, allowing an alternative parallelizable computation path, using a variant of multi-head attention (MHA) instead of convolutions. 4 â ¢ RWKV (B. Peng et al. 2023) is a recent RNN designed for language modeling based on another linear attention approximation (attention-free Transformer (S. Zhai et al. 2021)). 
Its main â WKVâ\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 24, tokens 172, triggered by: 0.25\n", + "\u001b[35mmechanism involves LTI recurrences and can be viewed as the ratio of two SSMs. Other closely related SSMs and architectures are discussed further in an extended related work (Appendix B). We highlight in particular S5 (Smith, Warrington, and Linderman 2023), QRNN (Bradbury et al. 2016), and SRU (Lei et al. 2017), which we view as the most closely related methods to our core selective SSM. # 3 Selective State Space Models We motivate our selection mechanism using intuition from synthetic tasks (Section 3.1), then explain how to incorporate this mechanism into state space models (Section 3.2). The resulting time-varying SSMs cannot use convolutions, presenting a technical challenge of how to compute them eï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 25, tokens 254, triggered by: 0.33\n", + "\u001b[31mciently. We overcome this with a hardware-aware algorithm that exploits the memory hierarchy on modern hardware (Section 3.3). We then describe a simple SSM architecture without attention or even MLP blocks (Section 3.4). Finally, we discuss some additional properties of selection mechanisms (Section 3.5). # 3.1 Motivation: Selection as a Means of Compression We argue that a fundamental problem of sequence modeling is compressing context into a smaller state. In fact, we can view the tradeoï¬ s of popular sequence models from this point of view. For example, attention is both eï¬ ective and ineï¬ cient because it explicitly does not compress context at all. This can be seen from the fact that autoregressive inference requires explicitly storing the entire context (i.e. the KV cache), which directly causes the slow linear-time inference and quadratic-time training of Transformers. On the other hand, recurrent models are eï¬ cient because they have a ï¬ nite state, implying constant-time inference and linear-time training. However, their eï¬ ectiveness is limited by how well this state has compressed the context. To understand this principle, we focus on two running examples of synthetic tasks (Figure 2).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 26, tokens 283, triggered by: 0.35\n", + "\u001b[32mâ ¢ The Selective Copying task modiï¬ es the popular Copying task (Arjovsky, Shah, and Bengio 2016) by varying the position of the tokens to memorize. It requires content-aware reasoning to be able to memorize the relevant tokens (colored) and ï¬ lter out the irrelevant ones (white). â ¢ The Induction Heads task is a well-known mechanism hypothesized to explain the majority of in-context learning abilities of LLMs (Olsson et al. 2022). It requires context-aware reasoning to know when to produce the correct output in the appropriate context (black). These tasks reveal the failure mode of LTI models. From the recurrent view, their constant dynamics (e.g. the (A, B) transitions in (2)) cannot let them select the correct information from their context, or aï¬ ect the hidden state passed along the sequence an in input-dependent way. From the convolutional view, it is known that global convolutions can solve the vanilla Copying task (Romero et al. 
2021) because it only requires time-awareness, but that they have diï¬ culty with the Selective Copying task because of lack of content-awareness (Figure 2). More concretely, the spacing between inputs-to-outputs is varying and cannot be modeled by static convolution kernels.\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 27, tokens 20, triggered by: token limit\n", + "\u001b[34mIn summary, the eï¬ ciency vs. eï¬ ectiveness tradeoï¬\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 28, tokens 109, triggered by: 0.41\n", + "\u001b[35mof sequence models is characterized by how well they compress their state: eï¬ cient models must have a small state, while eï¬ ective models must have a state that contains all necessary information from the context. In turn, we propose that a fundamental principle for building sequence models is selectivity: or the context-aware ability to focus on or ï¬ lter out inputs into a sequential state. In particular, a selection mechanism controls how information propagates or interacts along the sequence dimension (see Section 3.5 for more discussion).\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n", + "Split 29, tokens 45, triggered by: final split\n", + "\u001b[31m# Improving SSMs with Selection One method of incorporating a selection mechanism into models is by letting their parameters that aï¬ ect interactions along the sequence (e.g. the recurrent dynamics of an RNN or the c\u001b[0m\n", + "----------------------------------------------------------------------------------------\n", + "\n", + "\n" + ] + } + ], + "source": [ + "chunker.print(chunks_async[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lDIq3kOm3M4U" + }, + "source": [ + "## Consecutive Chunking" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8HmOB5pL3Nim" + }, + "source": [ + "Consecutive chunking is the simplest version of semantic chunking." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "AaKVbv942kkc" + }, + "outputs": [], + "source": [ + "from semantic_chunkers import ConsecutiveChunker\n", + "\n", + "chunker = ConsecutiveChunker(encoder=encoder, score_threshold=0.3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 315 + }, + "id": "d3mtF7R66tFJ", + "outputId": "be8a0a91-e042-4214-9019-5cb17559c6de" + }, + "outputs": [], + "source": [ + "chunks = await chunker.acall(docs=[content])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "chunker.print(chunks[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cumulative Chunking" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Cumulative chunking is a more compute intensive process, but can often provide more stable results as it is more noise resistant. However, it is _very expensive_ in both time and (if using APIs) money." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from semantic_chunkers import CumulativeChunker\n", + "\n", + "chunker = CumulativeChunker(encoder=encoder, score_threshold=0.3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "chunks = await chunker.acall(docs=[content])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "chunker.print(chunks[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}
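The `ConsecutiveChunker` and `CumulativeChunker` cells above share one usage pattern: construct the chunker with the shared `encoder` and a `score_threshold`, chunk asynchronously with `acall(docs=[...])`, and inspect the result with `chunker.print`. A minimal sketch of that pattern, assuming the `encoder` and `content` objects defined earlier in the notebook (the `compare_chunkers` helper below is illustrative, not part of the library):

```python
# Condensed sketch of the chunker usage the notebook cells demonstrate; assumes the
# `encoder` and `content` objects defined earlier in the notebook. The helper name
# `compare_chunkers` is illustrative only, not part of the library.
from semantic_chunkers import ConsecutiveChunker, CumulativeChunker

async def compare_chunkers(encoder, content):
    results = {}
    for chunker_cls in (ConsecutiveChunker, CumulativeChunker):
        chunker = chunker_cls(encoder=encoder, score_threshold=0.3)
        # `acall` takes a list of documents and returns one list of chunks per
        # document, which is why a single document is indexed as chunks[0].
        chunks = await chunker.acall(docs=[content])
        chunker.print(chunks[0])
        results[chunker_cls.__name__] = chunks[0]
    return results

# In a notebook cell: results = await compare_chunkers(encoder, content)
```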
null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1d5846c892724b8cab97693ec1c1cd33": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "20740edade1a44c29f4d4094ed487f00": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "272c61542ae8454b9eb616e5e2a858e3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + 
"align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "29f5562abb0e4383956f18652833defe": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2a19c3d693834de786396120d19a65b3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3321078273154d2bb3b6a9189d97806c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d51e32f5700b40cea7512aded4d6d019", + "placeholder": "​", + "style": "IPY_MODEL_a27eb006ba8c4d69a70d8be4e3c280a2", + "value": "Downloading data: 100%" + } + }, + "46577f9ba4e142009d0aebb6741d6e84": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + 
"_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4b5d04efea944bbc8659b4f56f22ed69": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "52661abd5e3c460eb54489d1e3515d69": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b59d191a2c0e498e80494f9384de7dfb", + "max": 21, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e57af473700d4bc2bbea39eca36ece8c", + "value": 21 + } + }, + "75159166d8254cde933b4131e4627b8f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7b3688583754445ab07bf28254fcc97d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1bc01711294a49b4a81475e547874514", + "placeholder": "​", + "style": "IPY_MODEL_4b5d04efea944bbc8659b4f56f22ed69", + "value": "Generating train split: 100%" + } + }, + "8489e5ef0f384d6480e8e2c5c60fdd1d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_75159166d8254cde933b4131e4627b8f", + "placeholder": "​", + "style": "IPY_MODEL_d5c5e31059e94176be64f32dd2496164", + "value": "Downloading readme: 100%" + } + }, + "8b640a38e82e4f0598fa6ed382888499": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_3321078273154d2bb3b6a9189d97806c", + "IPY_MODEL_ce12ce0393464fd7a533123d3c71a3e6", + "IPY_MODEL_0ec1edf815164f5f82001efa06857553" + ], + "layout": "IPY_MODEL_f0f1642352f14137a19ee27513f18ca6" + } + }, + "a27eb006ba8c4d69a70d8be4e3c280a2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b59d191a2c0e498e80494f9384de7dfb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b6fc89b557b44064b2e17829347bc951": { + 
"model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b987413b65ad4b10a7f7759c9dcc1c2f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bae4b7ef05ea49b6bdb2a0e369e6768b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_29f5562abb0e4383956f18652833defe", + "placeholder": "​", + "style": "IPY_MODEL_b6fc89b557b44064b2e17829347bc951", + "value": " 2673/2673 [00:04<00:00, 993.75 examples/s]" + } + }, + "be2cbe5e3ea94a9f81b9e370957e63b7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, 
+ "visibility": null, + "width": null + } + }, + "c59101e643d34bc5a4ad4fa664064614": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7b3688583754445ab07bf28254fcc97d", + "IPY_MODEL_f7870f32ab2d4d938fc5ba85aecf7f51", + "IPY_MODEL_bae4b7ef05ea49b6bdb2a0e369e6768b" + ], + "layout": "IPY_MODEL_20740edade1a44c29f4d4094ed487f00" + } + }, + "ce12ce0393464fd7a533123d3c71a3e6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_46577f9ba4e142009d0aebb6741d6e84", + "max": 217244045, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e430ed4d66604027812b1e39e1fc000e", + "value": 217244045 + } + }, + "d51e32f5700b40cea7512aded4d6d019": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d5c5e31059e94176be64f32dd2496164": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e430ed4d66604027812b1e39e1fc000e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "e57af473700d4bc2bbea39eca36ece8c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f0f1642352f14137a19ee27513f18ca6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f3c981b961eb4185b995dc4e3b5d3919": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_be2cbe5e3ea94a9f81b9e370957e63b7", + "placeholder": "​", + "style": "IPY_MODEL_1707a76c176a4e1d96baa4a4ed9e7d99", + "value": " 21.0/21.0 [00:00<00:00, 566B/s]" + } + }, + "f4e26036353d452885f364eb8c2b241e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f7870f32ab2d4d938fc5ba85aecf7f51": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_1d5846c892724b8cab97693ec1c1cd33", + "max": 2673, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_f4e26036353d452885f364eb8c2b241e", + "value": 2673 + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/semantic_chunkers/chunkers/statistical.py b/semantic_chunkers/chunkers/statistical.py index 77e29de..922d8ce 100644 --- a/semantic_chunkers/chunkers/statistical.py +++ b/semantic_chunkers/chunkers/statistical.py @@ -378,7 +378,7 @@ def _find_optimal_threshold(self, docs: List[str], similarity_scores: List[float iteration = 0 median_tokens = 0 - calculated_threshold = 0 + calculated_threshold = 0.0 while low <= high: calculated_threshold = (low + high) / 2 split_indices = self._find_split_indices(