From 79c94a1275bfe2f1e4deae927f288425bb9eaa33 Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Tue, 23 Apr 2024 22:05:46 +0000
Subject: [PATCH 01/90] fixed fp8 conflict with aqlm

---
 vllm/model_executor/layers/quantization/fp8.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 8df82e0e18edd..01e494c870e71 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -64,12 +64,13 @@ def create_weights(
         self,
         layer: torch.nn.Module,
         input_size_per_partition: int,
-        output_size_per_partition: int,
+        output_partition_sizes: List[int],
         input_size: int,
         output_size: int,
         params_dtype: torch.dtype,
         **extra_weight_attrs,
     ):
+        output_size_per_partition = sum(output_partition_sizes)
         weight = Parameter(torch.empty(output_size_per_partition,
                                        input_size_per_partition,
                                        dtype=params_dtype),
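A note on the PATCH 01 signature change: fused layers such as MergedColumnParallelLinear and QKVParallelLinear allocate one weight tensor that covers several logical outputs, so create_weights now receives the per-shard output sizes (and sums them) instead of a single pre-summed int. Keeping the per-shard sizes is what later lets a quantization method hold one scale per logical shard. A minimal sketch of the relationship, with illustrative sizes that are assumptions rather than values from this series:

    # Illustrative sketch: how per-shard output sizes relate to the single
    # output_size_per_partition value the old signature received.
    num_heads, num_kv_heads, head_dim = 32, 32, 128  # assumed example config
    output_partition_sizes = [
        num_heads * head_dim,     # q_proj shard
        num_kv_heads * head_dim,  # k_proj shard
        num_kv_heads * head_dim,  # v_proj shard
    ]
    output_size_per_partition = sum(output_partition_sizes)  # what create_weights allocates
    assert output_size_per_partition == 12288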
From f8b57e4320303b6001949fd57ba7ce12892466df Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Tue, 23 Apr 2024 22:10:05 +0000
Subject: [PATCH 02/90] added quantization tests to buildkite

---
 .buildkite/test-pipeline.yaml  | 3 +++
 tests/quantization/test_fp8.py | 6 +++---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index f7c1569696249..11cda053260ec 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -96,6 +96,9 @@ steps:
 - label: Metrics Test
   command: pytest -v -s metrics
 
+- label: Quantization Test
+  command: pytest -v -s quantization
+
 - label: Benchmarks
   working_dir: "/vllm-workspace/.buildkite"
   commands:
diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
index fa10e60de10a7..d643ebd38bb5d 100644
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -12,9 +12,9 @@
 capability = capability[0] * 10 + capability[1]
 
 
-@pytest.mark.skipif(
-    capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
-    reason="FP8 is not supported on this GPU type.")
+# @pytest.mark.skipif(
+#     capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
+#     reason="FP8 is not supported on this GPU type.")
 def test_load_fp16_model(vllm_runner) -> None:
     llm = vllm_runner("facebook/opt-125m", quantization="fp8")

From 7175e5b119e1cebdabe7202a5a0387a77ae80c72 Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Tue, 23 Apr 2024 22:11:40 +0000
Subject: [PATCH 03/90] removed commented out piece

---
 tests/quantization/test_fp8.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
index d643ebd38bb5d..fa10e60de10a7 100644
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -12,9 +12,9 @@
 capability = capability[0] * 10 + capability[1]
 
 
-# @pytest.mark.skipif(
-#     capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
-#     reason="FP8 is not supported on this GPU type.")
+@pytest.mark.skipif(
+    capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
+    reason="FP8 is not supported on this GPU type.")
 def test_load_fp16_model(vllm_runner) -> None:
     llm = vllm_runner("facebook/opt-125m", quantization="fp8")

From 7a7520de4037161c14f6c7acff68c6f86e499fb8 Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Tue, 23 Apr 2024 23:26:05 +0000
Subject: [PATCH 04/90] model loaded!

---
 tests/quantization/test_fp8.py       |  7 +++
 vllm/model_executor/layers/linear.py | 46 +++++++++++++++++++
 .../layers/quantization/__init__.py  |  2 +
 vllm/model_executor/models/llama.py  |  1 +
 4 files changed, 56 insertions(+)

diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
index fa10e60de10a7..c121d25daebe7 100644
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -10,7 +10,14 @@
 capability = torch.cuda.get_device_capability()
 capability = capability[0] * 10 + capability[1]
+print(capability)
 
+@pytest.mark.skipif(
+    capability < QUANTIZATION_METHODS["fp8_static"].get_min_capability(),
+    reason="FP8 is not supported on this GPU type.")
+def test_load_static_model(vllm_runner) -> None:
+    llm = vllm_runner("FriendliAI/Llama-2-7b-chat-hf-fp8", quantization="fp8_static")
+    print(llm)
 
 @pytest.mark.skipif(
     capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index e56af9075e2fd..9da2a6e1287c8 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -285,6 +285,22 @@ def weight_loader(self,
         param_data = param.data
         output_dim = getattr(param, "output_dim", None)
         is_metadata = getattr(param, "is_metadata", False)
+
+        # TODO: document.
+        # TODO: sync with is_metadata.
+        # For loading scales.
+        param_shard_splitter = getattr(param, "shard_splitter", None)
+        if output_dim is not None and param_shard_splitter is not None:
+            raise NotImplementedError(
+                "We do not currently support output_dim != None and "
+                "shard_splitter != None for a parameter. Please open an issue."
+            )
+        if loaded_shard_id is None and param_shard_splitter is not None:
+            raise NotImplementedError(
+                "We do not currently support loaded_shard_id == None and "
+                "shard_splitter != None for a parameter. Please open an issue."
+            )
+
         if loaded_shard_id is None:
             # Loaded weight is already packed.
             if output_dim is None:
@@ -342,6 +358,13 @@ def weight_loader(self,
             shard_size = loaded_weight.shape[0]
             shard_offset = loaded_shard_id * shard_size
             param_data = param_data.narrow(0, shard_offset, shard_size)
+
+        # TODO: sync with is_metadata UX.
+        # If a param_shard_splitter is defined by the LinearMethod, use it.
+        elif param_shard_splitter is not None:
+            param_data, loaded_weight = param_shard_splitter(
+                param_data, loaded_weight, loaded_shard_id)
+
         else:
             ignore_warning = getattr(param, "ignore_warning", False)
             if not ignore_warning:
@@ -423,6 +446,19 @@ def weight_loader(self,
         param_data = param.data
         output_dim = getattr(param, "output_dim", None)
         is_metadata = getattr(param, "is_metadata", False)
+
+        # TODO: sync with is_metadata UX
+        param_shard_splitter = getattr(param, "shard_splitter", None)
+        if output_dim is not None and param_shard_splitter is not None:
+            raise NotImplementedError(
+                "We do not currently support output_dim != None and "
+                "shard_splitter != None for a parameter. Please open an issue."
+            )
+        if loaded_shard_id is None and param_shard_splitter is not None:
+            raise NotImplementedError(
+                "We do not currently support loaded_shard_id == None and "
+                "shard_splitter != None for a parameter. Please open an issue."
+            )
 
         if loaded_shard_id is None:
             # Loaded weight is already packed.
@@ -496,6 +532,11 @@ def weight_loader(self,
             shard_index = ["q", "k", "v"].index(loaded_shard_id)
             param_data = param_data.narrow(0, shard_index * shard_size,
                                            shard_size)
+        # TODO: sync with QKV
+        # If a param_shard_splitter is defined by the LinearMethod, use it.
+        elif param_shard_splitter is not None:
+            param_data, loaded_weight = param_shard_splitter(
+                param_data, loaded_weight, loaded_shard_id)
         else:
             ignore_warning = getattr(param, "ignore_warning", False)
             if not ignore_warning:
@@ -592,6 +633,11 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         start_idx = tp_rank * shard_size
         loaded_weight = loaded_weight.narrow(input_dim, start_idx,
                                              shard_size)
+
+        # TODO: canon
+        # This is for loading scales for fp8, which have no dims.
+        if len(loaded_weight.shape) == 0:
+            loaded_weight = loaded_weight.reshape(1)
+
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index a525add458499..c139331307872 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -5,6 +5,7 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.quantization.fp8 import FP8Config
+from vllm.model_executor.layers.quantization.fp8_static import FP8StaticConfig
 from vllm.model_executor.layers.quantization.gptq import GPTQConfig
 from vllm.model_executor.layers.quantization.marlin import MarlinConfig
 from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
@@ -13,6 +14,7 @@
     "aqlm": AQLMConfig,
     "awq": AWQConfig,
     "fp8": FP8Config,
+    "fp8_static": FP8StaticConfig,
     "gptq": GPTQConfig,
     "squeezellm": SqueezeLLMConfig,
     "marlin": MarlinConfig,
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 016e3b039d1e8..d542d415e6cf7 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -386,6 +386,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         ]
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
+            print(f"----- {name}")
             if "rotary_emb.inv_freq" in name:
                 continue
             if ("rotary_emb.cos_cached" in name
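The shard_splitter hook added above is what lets weight_loader route parameters that cannot simply be narrowed along output_dim, such as per-shard fp8 scales. From the call sites, the contract is param_data, loaded_weight = shard_splitter(param_data, loaded_weight, loaded_shard_id). A rough sketch of a splitter a quantization method might attach to a scale parameter; the helper below is an assumption for illustration, not code from this branch:

    import torch
    from torch.nn import Parameter

    # Assumed sketch: one scale slot per logical shard of a fused linear layer.
    def scale_shard_splitter(param_data: torch.Tensor,
                             loaded_weight: torch.Tensor,
                             loaded_shard_id):
        # QKVParallelLinear passes "q"/"k"/"v"; MergedColumnParallelLinear
        # passes integer shard ids.
        idx = {"q": 0, "k": 1, "v": 2}.get(loaded_shard_id, loaded_shard_id)
        # Checkpoints may store a scale as a 0-dim scalar; give it a dim so the
        # final copy_ lines up (same reason as the RowParallelLinear reshape).
        if loaded_weight.dim() == 0:
            loaded_weight = loaded_weight.reshape(1)
        return param_data.narrow(0, idx, 1), loaded_weight

    scales = Parameter(torch.empty(3), requires_grad=False)
    # vLLM attaches loader hints via set_weight_attrs; plain setattr shows the idea.
    scales.shard_splitter = scale_shard_splitter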
From e0b4d727c011732db0ba9dc62864f354437e0ccd Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Tue, 23 Apr 2024 23:28:21 +0000
Subject: [PATCH 05/90] renamed

---
 vllm/model_executor/layers/linear.py | 28 ++++++++++++++--------------
 vllm/model_executor/models/llama.py  |  1 -
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 9da2a6e1287c8..01af70a34c488 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -289,16 +289,16 @@ def weight_loader(self,
         # TODO: document.
         # TODO: sync with is_metadata.
         # For loading scales.
-        param_shard_splitter = getattr(param, "shard_splitter", None)
-        if output_dim is not None and param_shard_splitter is not None:
+        shard_indexer = getattr(param, "shard_indexer", None)
+        if output_dim is not None and shard_indexer is not None:
             raise NotImplementedError(
                 "We do not currently support output_dim != None and "
-                "shard_splitter != None for a parameter. Please open an issue."
+                "shard_indexer != None for a parameter. Please open an issue."
             )
-        if loaded_shard_id is None and param_shard_splitter is not None:
+        if loaded_shard_id is None and shard_indexer is not None:
             raise NotImplementedError(
                 "We do not currently support loaded_shard_id == None and "
-                "shard_splitter != None for a parameter. Please open an issue."
+                "shard_indexer != None for a parameter. Please open an issue."
             )
 
         if loaded_shard_id is None:
@@ -361,8 +361,8 @@ def weight_loader(self,
 
         # TODO: sync with is_metadata UX.
         # If a param_shard_splitter is defined by the LinearMethod, use it.
-        elif param_shard_splitter is not None:
-            param_data, loaded_weight = param_shard_splitter(
+        elif shard_indexer is not None:
+            param_data, loaded_weight = shard_indexer(
                 param_data, loaded_weight, loaded_shard_id)
 
         else:
@@ -448,16 +448,16 @@ def weight_loader(self,
         is_metadata = getattr(param, "is_metadata", False)
 
         # TODO: sync with is_metadata UX
-        param_shard_splitter = getattr(param, "shard_splitter", None)
-        if output_dim is not None and param_shard_splitter is not None:
+        shard_indexer = getattr(param, "shard_indexer", None)
+        if output_dim is not None and shard_indexer is not None:
             raise NotImplementedError(
                 "We do not currently support output_dim != None and "
-                "shard_splitter != None for a parameter. Please open an issue."
+                "shard_indexer != None for a parameter. Please open an issue."
             )
-        if loaded_shard_id is None and param_shard_splitter is not None:
+        if loaded_shard_id is None and shard_indexer is not None:
             raise NotImplementedError(
                 "We do not currently support loaded_shard_id == None and "
-                "shard_splitter != None for a parameter. Please open an issue."
+                "shard_indexer != None for a parameter. Please open an issue."
             )
 
         if loaded_shard_id is None:
             # Loaded weight is already packed.
@@ -534,8 +534,8 @@ def weight_loader(self,
                                            shard_size)
         # TODO: sync with QKV
         # If a param_shard_splitter is defined by the LinearMethod, use it.
-        elif param_shard_splitter is not None:
-            param_data, loaded_weight = param_shard_splitter(
+        elif shard_indexer is not None:
+            param_data, loaded_weight = shard_indexer(
                 param_data, loaded_weight, loaded_shard_id)
         else:
             ignore_warning = getattr(param, "ignore_warning", False)
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index d542d415e6cf7..016e3b039d1e8 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -386,7 +386,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         ]
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
-            print(f"----- {name}")
             if "rotary_emb.inv_freq" in name:
                 continue
             if ("rotary_emb.cos_cached" in name
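The fp8_static method registered in PATCH 04 comes from an fp8_static.py module that is not part of this excerpt, and the test in the next commit gates on its get_min_capability(). A minimal stand-in for the surface those call sites rely on; the class body and the capability threshold are assumptions, not the branch's actual FP8StaticConfig:

    import torch

    # Assumed stand-in for the QuantizationConfig subclass in fp8_static.py.
    class FP8StaticConfig:

        @classmethod
        def get_name(cls) -> str:
            return "fp8_static"

        @classmethod
        def get_min_capability(cls) -> int:
            # Compared against major * 10 + minor from
            # torch.cuda.get_device_capability(); 89 (Ada/Hopper-class fp8
            # hardware) is an assumed threshold.
            return 89

    QUANTIZATION_METHODS = {"fp8_static": FP8StaticConfig}  # as registered above

    if torch.cuda.is_available():
        major, minor = torch.cuda.get_device_capability()
        supported = (major * 10 + minor
                     >= QUANTIZATION_METHODS["fp8_static"].get_min_capability())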
From f96428e692d2bccefa4799b7815ff63faf49442b Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Wed, 24 Apr 2024 01:58:36 +0000
Subject: [PATCH 06/90] stash

---
 tests/quantization/test_fp8.py      | 9 ++++++---
 vllm/model_executor/models/llama.py | 5 +++++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
index c121d25daebe7..6e38d570f8541 100644
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -5,19 +5,22 @@
 import pytest
 import torch
 
+from vllm import SamplingParams
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
 
 capability = torch.cuda.get_device_capability()
 capability = capability[0] * 10 + capability[1]
-print(capability)
 
 
 @pytest.mark.skipif(
     capability < QUANTIZATION_METHODS["fp8_static"].get_min_capability(),
     reason="FP8 is not supported on this GPU type.")
 def test_load_static_model(vllm_runner) -> None:
-    llm = vllm_runner("FriendliAI/Llama-2-7b-chat-hf-fp8", quantization="fp8_static")
-    print(llm)
+    llm = vllm_runner("FriendliAI/Llama-2-7b-chat-hf-fp8", quantization="fp8_static", enforce_eager=True)
+    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
+    fc1 = model.model.decoder.layers[0].fc1
+    assert isinstance(fc1.linear_method, Fp8LinearMethod)
+    print(llm.generate("Hello my name is", SamplingParams(max_tokens=20)))
 
 @pytest.mark.skipif(
     capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 016e3b039d1e8..22957a78cbc52 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -276,10 +276,13 @@ def forward(
         attn_metadata: AttentionMetadata,
         inputs_embeds: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
+        self.embed_tokens.weight[0] = torch.Tensor([-2.0336052269225574e-36, 3.3207715410729622e-37, -1.5516525430854195e-35, 1.2319180796617573e-35, ...])

[The torch.Tensor literal above continues for several hundred more hardcoded debug values; the excerpt is cut off mid-tensor.]
-1.2131101700486007e-35, 5.900981641127883e-36, 6.817867234769268e-36, 7.85230226349288e-36, 1.2789378536946488e-35, -9.215875710446734e-36, -8.13442090769023e-36, -3.009265538105056e-36, -3.079795199154393e-36, 2.7859216114488214e-36, 3.4236272967699124e-37, -8.87498234870827e-37, 3.855621470697103e-36, -2.621352402333701e-36, 1.8337711872827685e-36, -9.07481638834806e-36, -5.242704804667402e-36, -5.47780367483186e-36, 2.4685381367268038e-36, -4.255289549976681e-36, -1.5140367238591063e-35, -1.0109251417071673e-35, 1.586917373610088e-36, -5.5953531099140885e-36, -9.968192094972998e-36, -1.0109251417071673e-36, -1.0626468931433479e-35, -2.0218502834143345e-36, -5.5542108076353085e-37, -1.3259576277275403e-35, -4.6549576292562585e-36, -5.0311158215193905e-36, 1.9983403963978888e-36, 1.7632415262334313e-36, 5.5953531099140885e-36, 7.288064975098183e-36, 6.6415430821459244e-37, 8.632536638851174e-38, 1.0861567801597937e-35, 1.128474576789396e-35, -1.0579449157400588e-35, -1.88079096131566e-37, 1.570460452698576e-35, 4.208269775943789e-36, -7.617203393328423e-36, -1.6739039555709374e-35, -2.3274788146281293e-36, 1.96307556587322e-36, 9.874152546907215e-36, 2.1276447749883404e-36, -8.745677970117819e-36, -5.524823448864751e-36, -6.864887008802159e-36, 3.2326094647612906e-37, 4.513898307157584e-36, -9.686073450775649e-36, 1.8220162437745456e-36, 1.5234406786656846e-35, -1.5398975995771966e-36, -9.07481638834806e-36, -8.228460455756013e-36, 1.1049646897729503e-35, 1.6574470346594254e-36, -7.335084749131074e-36, -4.984096047486499e-36, 5.47780367483186e-36, 4.9370762734536075e-36, -2.527312854267918e-36, 9.73309322480854e-36, -3.408933617384634e-36, 1.3259576277275403e-35, -6.723827686703485e-36, -1.5398975995771966e-36, 6.700317799687039e-37, 1.1637394073140646e-36, -2.926980933547496e-36, -1.2789378536946488e-35, -9.309915258512517e-36, -1.4752454102819708e-36, 1.9043008483321058e-36, -1.4482090402130582e-35, 4.231779662960235e-37, 7.85230226349288e-36, -9.462729524119414e-37, -1.9395656788567744e-36, 1.610427260626534e-36, -9.07481638834806e-36, -8.13442090769023e-36, -9.545014128676975e-36, 7.899322037525772e-36, -1.2601299440814922e-35, 2.1441016958998524e-35, -4.419858759091801e-36, 6.206610172341678e-36, 4.1436175866485635e-37, -7.170515540015954e-37, 1.598672317118311e-35, 2.0923799444636718e-36, -1.4858248594393714e-35, -2.1276447749883404e-36, 1.7632415262334313e-36, 9.309915258512517e-36, -5.830451980078546e-36, 5.360254239749631e-36, -3.479463278433971e-36, -7.335084749131074e-36, 5.7834322060456545e-36, -2.3274788146281293e-36, 1.1566864412091309e-35, -8.839717518183602e-36, -1.6080762719248893e-35, -1.774996469741654e-36, -9.874152546907215e-36, 8.087401133657338e-36, -3.92615113174644e-36, -3.291384182302405e-36, -8.05213630313267e-37, 4.4668785331246925e-36, 8.999878623483139e-39, -5.172175143618065e-37, -5.242704804667402e-36, -1.8440567628524635e-37, -1.0062231643038781e-35, 1.1566864412091309e-35, 2.5743326283008096e-36, -3.973170905779332e-36, -5.7834322060456545e-36, 9.121836162380951e-36, 4.302309324009572e-36, 1.814963277669612e-35, -5.47780367483186e-36, -4.760752120830264e-37, -9.07481638834806e-36, -4.231779662960235e-36, -5.101645482568728e-36, 4.796016951354933e-36, 9.168855936413843e-36, 1.720923729603829e-35, -4.137740114894452e-36, -4.255289549976681e-36, 9.73309322480854e-36, -2.9740007075803874e-36, 3.76158192263132e-36, 4.419858759091801e-36, 8.9925317837905e-37, -5.948001415160775e-36, -1.3958995416014664e-37, 4.4668785331246925e-36, -1.8337711872827685e-36, 
-1.8337711872827685e-36, -6.018531076210112e-36, 2.915225990039273e-36, -5.3132344657167395e-36, -1.1872492943305104e-36, 4.419858759091801e-36, -1.3635734469538535e-35, 1.6080762719248893e-35, -1.6971199689996776e-37, 6.112570624275895e-36, 2.456783193218581e-36, 5.571843222897643e-36, -1.2695338988880705e-35, -1.4670169498262148e-35, 8.087401133657338e-36, -4.231779662960235e-36, 1.5398975995771966e-36, 3.103305086170839e-36, 8.651638422052036e-36, 8.040381359624447e-36, -4.349329098042464e-36, 5.172175143618065e-36, -1.1108421615270617e-36, -1.2413220344683356e-35, 2.4685381367268038e-36, -9.309915258512517e-36, 5.900981641127883e-36, 2.080625000955449e-36, 1.2131101700486007e-35, 1.3812058622161878e-37, 4.760752120830264e-37, 4.584427968206921e-36, 6.58276836460481e-36, -2.2099293795459005e-35, -1.504632769052528e-36, -2.6801271198748155e-36, -1.1660903960157092e-35, -9.07481638834806e-36, 2.5860875718090325e-36, -9.121836162380951e-36, 1.2283915966092904e-36, 9.93292726444833e-37, -6.723827686703485e-36, -1.0109251417071673e-35, -5.47780367483186e-36, -6.159590398308787e-36, -3.879131357713549e-36, 8.79269774415071e-36, 3.879131357713549e-36, -9.592033902709866e-36, 9.027796614315168e-36, -1.4458580515114136e-36, 6.535748590571919e-36, 6.441709042506136e-36, -1.2989212576586277e-36, -3.0562853121379475e-36, -1.1002627123696611e-35, 3.6205226005326455e-36, 6.676807912670593e-36, 1.598672317118311e-35, 1.5634074865936424e-36, 1.2131101700486007e-35, 7.335084749131074e-36, 8.087401133657338e-36, 1.3635734469538535e-35, -1.2695338988880705e-35, -1.8572810742992143e-36, -6.629788138637702e-36, -1.0297330513203239e-35, -3.92615113174644e-36, 2.527312854267918e-36, 3.6205226005326455e-36, 2.621352402333701e-36, -5.936246471652552e-37, -2.73890183741593e-36, -1.5281426560689738e-36, 8.933757066249385e-36, 3.785091809647766e-36, -9.686073450775649e-36, -1.410593220986745e-36, -8.557598873986253e-36, -7.899322037525772e-36, 1.774996469741654e-36, 1.034435028723613e-35, -4.419858759091801e-36, 1.3729774017604318e-35, 9.450974580611192e-36, 8.369519777854687e-36, 1.6339371476429796e-36, 5.148665256601619e-36, 6.700317799687039e-37, 3.667542374565537e-36, -5.289724578700294e-36, -3.549992939483308e-36, 2.997510594596833e-36, 7.335084749131074e-36, -1.2460240118716248e-36, 8.087401133657338e-36, -1.9936384189945996e-35, 6.159590398308787e-36, -9.545014128676975e-36, -4.760752120830264e-37, -1.9513206223649973e-36, -2.915225990039273e-36, -1.0062231643038781e-35, 1.7021158199906723e-35, -1.516387712560751e-36, -1.5398975995771966e-36, -1.0050476699530558e-36, -1.0461899722318359e-36, -4.8430367253878245e-36, -7.899322037525772e-36, 3.644032487549091e-36, -1.88079096131566e-36, -4.70197740328915e-38, -1.087332274510616e-36, -2.915225990039273e-36, -1.692711865184094e-35, -1.570460452698576e-35, 5.736412432012763e-36, 2.9740007075803874e-36, -9.462729524119414e-37, -6.347669494440353e-36, 6.535748590571919e-36, -1.9983403963978888e-36, -4.349329098042464e-36, 7.85230226349288e-36, -1.1931267660846218e-36, -3.6205226005326455e-36, 1.5234406786656846e-35, 9.110081218872728e-37, -2.3980084756774665e-36, -2.9740007075803874e-36, 2.747718045047097e-37, -2.245194210070569e-36, 1.344765537340697e-35, 1.2789378536946488e-35, -4.419858759091801e-36, -8.322500003821796e-36, -1.7773474584432987e-35, -4.607937855223367e-36, -6.206610172341678e-36, -1.7118136483849562e-37, -5.101645482568728e-36, -2.1393997184965633e-36, 4.0907203408615605e-36, 1.704466808692317e-36, 4.70197740328915e-36, 2.1723135603195873e-35, 
-5.242704804667402e-36, -3.032775425121502e-36, -3.949661018762886e-36, 7.099985878966617e-36, -1.128474576789396e-35, -9.121836162380951e-36, -4.772507064338487e-36, -9.545014128676975e-36, 1.0520674439859473e-36, 6.629788138637702e-36, 1.9043008483321058e-36, 5.830451980078546e-36, -1.128474576789396e-36, 1.8734441216230207e-37, 6.817867234769268e-36, -9.051306501331614e-37, 4.53740819417403e-36, -1.9513206223649973e-36, 2.5155579107596953e-36, 2.82118644197349e-37, 3.3501588998435194e-37, 3.9026412447299945e-36, -2.151154662004786e-36, -7.85230226349288e-36, 2.1041348879718946e-36, 8.087401133657338e-36, -4.86654661240427e-36, -2.715391950399484e-36, -5.360254239749631e-36, 8.040381359624447e-36, 1.7491355940235638e-35, -1.0579449157400588e-35, 2.5155579107596953e-36, 4.419858759091801e-36, 1.946618644961708e-35, 6.171345341817009e-37, 1.0109251417071673e-36, 3.9966807927957775e-36, -1.5140367238591063e-35, -1.1660903960157092e-35, -9.497994354644083e-36, 1.6221822041347568e-36, 7.699487997885983e-37, 2.621352402333701e-36, -3.8086016966642115e-36, -3.6205226005326455e-36, -1.2131101700486007e-35, -2.5860875718090325e-36, -3.408933617384634e-36, 1.0109251417071673e-36, -2.4215183626939123e-36, 9.4039548065783e-36, -4.043700566828669e-36, -4.643202685748036e-37, -1.075577331002393e-36, -1.2989212576586277e-36, 1.3729774017604318e-35, 3.9026412447299945e-36, 7.052966104933725e-36, -3.220854521253068e-36, 1.4199971757933233e-35, -1.974830509381443e-35, 1.7115197747972506e-35, 5.101645482568728e-36, -5.7834322060456545e-36, -9.027796614315168e-36, 6.488728816539027e-36, -6.441709042506136e-36, -1.3635734469538535e-35, -1.3259576277275403e-35, 5.195685030634511e-36, -5.360254239749631e-36, 7.4350017689509685e-37, 5.383764126766077e-36, 9.345180089037186e-37, -7.85230226349288e-36, 8.886737292216494e-36, 6.864887008802159e-36, -1.5634074865936424e-36, 9.545014128676975e-36, 1.5892683623117327e-35, 1.4576129950196365e-35, -1.5610564978919978e-35, 4.419858759091801e-36, 1.034435028723613e-35, 7.52316384526264e-36, 3.9966807927957775e-36, 1.7021158199906723e-35, 6.300649720407461e-36, 2.2804590405952378e-36, 1.189600283032155e-35, -1.0062231643038781e-35, 3.502973165450417e-36, -1.9160557918403286e-36, 1.4294011305999016e-35, 2.550822741284364e-36, 3.549992939483308e-36, -2.6801271198748155e-36, 2.5743326283008096e-36, -1.1660903960157092e-35, -6.629788138637702e-36, -3.291384182302405e-36, 7.85230226349288e-36, -5.948001415160775e-36, -1.5634074865936424e-36, 1.1637394073140646e-36, -3.385423730368188e-36, -5.195685030634511e-36, 2.0218502834143345e-36, 4.6549576292562585e-36, 4.984096047486499e-36, -1.8587504422377421e-37, 1.6174802267314676e-35, -5.5013135618483055e-36, 1.610427260626534e-36, -5.6893926579798715e-36, -1.986585452889666e-36, -1.410593220986745e-35, -7.758262715427098e-36, 2.3274788146281293e-36, 5.348499296241408e-37, 5.571843222897643e-36, 9.545014128676975e-36, -3.479463278433971e-36, -9.07481638834806e-36, 1.4576129950196365e-36, 2.527312854267918e-37, -9.697828394283872e-37, 1.4199971757933233e-35, -1.1707923734189984e-35, 1.2342690683634019e-36, -1.2989212576586277e-36, 3.832111583680657e-36, -2.6918820633830384e-36, 2.865267480129326e-37, 5.995021189193666e-36, 3.5970127135162e-36, -6.300649720407461e-36, 1.0767528253532154e-35, -1.664500000764359e-35, 6.770847460736376e-36, -3.790969281401877e-37, 3.738072035614874e-36, 1.189600283032155e-35, -1.7303276844104072e-35, -4.607937855223367e-36, 1.909002825735395e-35, 5.64237288394698e-36, 6.535748590571919e-36, 
1.0461899722318359e-36, 3.103305086170839e-36, -7.335084749131074e-36, -9.4039548065783e-36, -1.0579449157400588e-35, 9.874152546907215e-36, 1.4870003537901937e-36, -3.92615113174644e-36, -1.3259576277275403e-35, 1.974830509381443e-36, -2.997510594596833e-36, -2.456783193218581e-36, -1.6550960459577808e-35, -5.524823448864751e-36, -2.6918820633830384e-36, 6.676807912670593e-36, -1.2695338988880705e-35, 1.0814548027565045e-36, 6.112570624275895e-36, -4.7489971773220415e-36, -2.456783193218581e-36, 6.159590398308787e-36, -1.3541694921472752e-35, -4.2787994369931265e-36, 7.805282489459989e-36, 6.112570624275895e-36, 9.874152546907215e-36, -3.92615113174644e-36, -1.7491355940235638e-35, -6.065550850243004e-36, -1.4482090402130582e-35, -1.4223481644949679e-36, 5.5953531099140885e-36, -8.510579099953362e-36, 1.1801963282255767e-35, 7.581938562803754e-37, -1.2977457633078054e-35, 2.7624117244323756e-36, -1.4670169498262148e-35, -1.2460240118716248e-36, -2.5978425153172554e-36, 3.6205226005326455e-36, 4.966463632224165e-37, -3.549992939483308e-36, 1.3353615825341186e-35, 5.995021189193666e-36, 1.034435028723613e-35, 9.309915258512517e-36, 4.70197740328915e-36, -4.255289549976681e-36, -1.0062231643038781e-35, -1.357695975199742e-36, 1.2695338988880705e-35, 8.886737292216494e-36, -8.839717518183602e-36, 1.2601299440814922e-35, -6.347669494440353e-36, 2.915225990039273e-36, -7.93458686805044e-37, -3.408933617384634e-36, 6.25362994637457e-36, -1.3259576277275403e-35, 1.4870003537901937e-36, -4.8430367253878245e-36, 1.3635734469538535e-35, -6.817867234769268e-36, -9.827132772874324e-36, 3.832111583680657e-36, -6.112570624275895e-36, 3.079795199154393e-36, 7.005946330900834e-36, 7.617203393328423e-36, -6.065550850243004e-36, -2.3862535321692436e-36, 5.5013135618483055e-36, -5.383764126766077e-36, -1.96307556587322e-36, 5.289724578700294e-36, 1.43880508540648e-35, -1.3077374652897949e-37, -1.6603857705364811e-37, 2.0923799444636718e-36, -2.2005254247393222e-35, -2.621352402333701e-36, 6.676807912670593e-36, -6.723827686703485e-36, 4.70197740328915e-36, -1.189600283032155e-35, -4.6549576292562585e-36, -3.6205226005326455e-36, 2.2804590405952378e-36, 6.25362994637457e-36, -1.3259576277275403e-35, -6.817867234769268e-36, 8.745677970117819e-36, 1.774996469741654e-36, 9.356935032545409e-36, -1.0062231643038781e-35, 5.148665256601619e-36, 1.1990042378387333e-35, -4.3728389850589095e-36, 5.3132344657167395e-36, 5.252990380237097e-38, -6.065550850243004e-36, 4.208269775943789e-36, 1.7021158199906723e-35, -4.53740819417403e-36, -4.984096047486499e-36, 1.288341808501227e-35, 1.786751413249877e-36, 1.4199971757933233e-35, -7.993361585591555e-36, 1.598672317118311e-35, 1.0767528253532154e-35, -2.8682062160063815e-36, -1.692711865184094e-36, -1.3541694921472752e-35, 4.86654661240427e-36, -4.53740819417403e-36, -1.3259576277275403e-35, -8.79269774415071e-36, -1.2107591813469561e-36, 9.592033902709866e-36, 5.3132344657167395e-36, 1.2037062152420224e-35, -3.76158192263132e-36, -2.245194210070569e-36, 2.3603926564511533e-35, 1.0485409609334805e-35, 7.993361585591555e-36, -7.617203393328423e-36, -3.973170905779332e-36, 5.853961867094992e-36, 8.228460455756013e-36, 3.8086016966642115e-36, 2.3274788146281293e-36, 1.1190706219828177e-35, 5.995021189193666e-36, -1.095560734966372e-35, -5.348499296241408e-37, 6.723827686703485e-36, -1.0626468931433479e-35, 9.497994354644083e-36, 5.948001415160775e-36, 7.85230226349288e-36, -5.242704804667402e-36, 3.9966807927957775e-36, -6.018531076210112e-36, 7.147005652999508e-36, 
-2.997510594596833e-36, -6.91190678283505e-36, -8.745677970117819e-36, 3.5970127135162e-36, -4.325819211026018e-36, -6.629788138637702e-36, -1.88079096131566e-35, 1.1707923734189984e-35, -5.524823448864751e-36, -1.9983403963978888e-36, 1.222514124855179e-35, -1.0485409609334805e-35, -3.314894069318851e-36, -1.1990042378387333e-35, 9.686073450775649e-36, 9.262895484479626e-36, -2.3980084756774665e-36, 4.4668785331246925e-36, 5.64237288394698e-36, 6.958926556867942e-36, -6.25362994637457e-36, 3.3384039563352965e-36, -1.189600283032155e-35, -8.275480229788904e-36, -1.680956921675871e-36, -1.7021158199906723e-35, 1.316553672920962e-35, -3.5970127135162e-36, 5.612985525176423e-37, -8.46355932592047e-36, -3.644032487549091e-36, -1.1143686445795286e-35, 3.361913843351742e-36, -2.4920480237432495e-36, 3.7145621485984285e-36, 3.173834747220176e-36, -8.839717518183602e-36, 4.984096047486499e-36, 1.0579449157400588e-35, 6.347669494440353e-36, -8.369519777854687e-36, -4.725487290305596e-36, 1.8619830517025034e-35, -3.949661018762886e-36, -1.3259576277275403e-35, 4.796016951354933e-36, -7.711242941394206e-36, 2.747718045047097e-37, 1.1402295202976189e-36, 1.96307556587322e-36, 2.71833068627654e-37, -1.0579449157400588e-35, -1.2930437859045163e-36, -5.0311158215193905e-36, 1.1931267660846218e-36, -1.222514124855179e-35, 7.946341811558664e-36, 4.86654661240427e-36, -9.827132772874324e-36, 9.545014128676975e-36, -5.5013135618483055e-36, 3.6205226005326455e-36, -9.827132772874324e-36, 1.0203290965137456e-35, -1.626884181538046e-35, -8.369519777854687e-36, -2.644862289350147e-36, -8.557598873986253e-36, -9.686073450775649e-36, 4.255289549976681e-36, -1.570460452698576e-35, -3.96729343402522e-37, 9.051306501331614e-37, 7.993361585591555e-36, -1.034435028723613e-35, 2.245194210070569e-36, -1.0461899722318359e-36, -1.0485409609334805e-35, -6.770847460736376e-36, -5.242704804667402e-36, 3.636685647856452e-38, -6.535748590571919e-36, -1.6574470346594254e-36, -2.7036370068912613e-37, -5.5953531099140885e-36, 1.344765537340697e-35, 2.0124463286077562e-35, -1.1190706219828177e-35, 2.915225990039273e-36, -1.3283086164291849e-36, 1.8431751420893468e-35, -1.476420904632793e-35, -9.686073450775649e-36, -4.419858759091801e-36, -1.3459410316915192e-36, -1.1472824864025526e-35, 1.2131101700486007e-35, 7.85230226349288e-36, 1.1990042378387333e-36, -7.241045201065291e-36, 6.347669494440353e-36, -5.360254239749631e-36, 8.745677970117819e-36, -7.52316384526264e-36, -6.58276836460481e-36, -1.189600283032155e-35, -2.3744985886610208e-36, -9.592033902709866e-36, -2.268704097087015e-36, -2.621352402333701e-36, 9.827132772874324e-36, 7.052966104933725e-36, 3.314894069318851e-36, 1.4294011305999016e-35, 7.570183619295532e-36, 6.817867234769268e-36, 6.817867234769268e-36, 1.8901949161222383e-35, -4.2787994369931265e-36, -1.0109251417071673e-35, -9.686073450775649e-36, -1.3259576277275403e-35, 1.0391370061269022e-35, -6.676807912670593e-36, -9.497994354644083e-36, -8.510579099953362e-36, -1.0203290965137456e-35, -1.3106762011668506e-36, 7.099985878966617e-36, 2.73890183741593e-36, 1.598672317118311e-36, 1.263656427133959e-36, -1.586917373610088e-36, -1.7115197747972506e-35, 4.319941739271907e-37, -1.692711865184094e-36, -1.9043008483321058e-36, -2.621352402333701e-36, 6.171345341817009e-37, -1.7279766957087626e-36, -4.5609180811904755e-36, 9.639053676742758e-37, 1.2037062152420224e-35, -5.195685030634511e-36, -2.1981744360376776e-36, 4.913566386437162e-36, -2.2099293795459005e-36, -3.173834747220176e-36, 4.2787994369931265e-36, 
1.3635734469538535e-35, 1.0579449157400588e-35, 1.2319180796617573e-35, -2.245194210070569e-36, 1.504632769052528e-35, 2.5978425153172554e-36, -4.796016951354933e-36, 5.524823448864751e-36, 6.488728816539027e-36, 1.0109251417071673e-35, -1.1707923734189984e-35, -1.9654265545748647e-35, 7.52316384526264e-36, -1.8713870065090817e-35, 1.1402295202976189e-36, 9.844765188136658e-38, 4.8430367253878245e-36, 9.686073450775649e-36, -1.2977457633078054e-35, 1.3353615825341186e-35, -1.2131101700486007e-35, 6.488728816539027e-36, -2.8446963289899358e-36, 2.8446963289899358e-36, -1.4294011305999016e-35, 7.382104523163966e-36, -9.497994354644083e-36, 1.1660903960157092e-35, -4.678467516272704e-36, 9.07481638834806e-36, -8.322500003821796e-36, 9.356935032545409e-36, -2.997510594596833e-36, 1.3917853113735884e-35, 1.3259576277275403e-35, 2.527312854267918e-36, 4.86654661240427e-36, 5.64237288394698e-36, -8.839717518183602e-36, 1.9395656788567744e-37, -1.1637394073140646e-36, 6.347669494440353e-36, 7.899322037525772e-36, -1.6080762719248893e-35, 4.760752120830264e-37, -4.772507064338487e-36, 8.087401133657338e-36, 1.189600283032155e-35, 6.535748590571919e-36, 1.43880508540648e-35, -9.686073450775649e-36, 1.2037062152420224e-35, 7.805282489459989e-36, 3.009265538105056e-36, 1.3867159919856673e-38, 3.738072035614874e-36, 2.339233758136352e-36, -8.839717518183602e-36, -1.1190706219828177e-35, -1.6550960459577808e-35, 9.07481638834806e-36, 6.629788138637702e-36, 8.698658196084928e-36, 6.488728816539027e-36, -5.524823448864751e-36, 3.855621470697103e-36, 8.46355932592047e-36, -2.8446963289899358e-36, 3.009265538105056e-36, -7.52316384526264e-36, 3.471381754772068e-38, -3.973170905779332e-36, -7.946341811558664e-36, -5.113400426076951e-37, 1.2871663141504048e-36, -2.915225990039273e-36, -2.7506567809241528e-36, -4.8430367253878245e-36, 8.557598873986253e-36, -3.385423730368188e-36, 1.2695338988880705e-35, -3.8086016966642115e-36, -1.1225971050352846e-36, -2.433273306202135e-36, -2.1393997184965633e-36, -1.0109251417071673e-35, -1.1190706219828177e-35, -3.3384039563352965e-36, -2.4215183626939123e-36, -4.255289549976681e-36, 7.099985878966617e-36, 3.5970127135162e-36, 8.886737292216494e-36, -6.065550850243004e-36, -4.984096047486499e-36, 8.632536638851174e-38, 2.0100953399061116e-36, 9.07481638834806e-36, -4.114230227878006e-36, -3.220854521253068e-36, 2.5860875718090325e-36, 5.319111937470851e-37, 3.032775425121502e-36, 7.617203393328423e-36, 7.617203393328423e-36, 6.347669494440353e-36, -8.510579099953362e-36, 5.830451980078546e-36, -4.9370762734536075e-37, 5.900981641127883e-36, 6.065550850243004e-36, -5.583598166405866e-37, 6.864887008802159e-36, -9.356935032545409e-36, 6.958926556867942e-36, -2.268704097087015e-36, 1.1472824864025526e-35, 3.644032487549091e-36, 1.9513206223649973e-36, -7.93458686805044e-37, 8.557598873986253e-36, -1.4199971757933233e-35, -3.832111583680657e-36, 4.396348872075355e-36, -1.692711865184094e-35, -2.8917161030228273e-36, 1.946618644961708e-35, 1.2131101700486007e-35, 5.736412432012763e-36, -7.946341811558664e-36, 8.322500003821796e-36, 5.430783900798968e-36, -1.1461069920517303e-36, -5.289724578700294e-36, 3.361913843351742e-36, 1.3259576277275403e-35, -6.300649720407461e-36, 6.817867234769268e-36, -3.502973165450417e-36, 4.1847598889273435e-36, -2.82118644197349e-36, 9.968192094972998e-36, -1.0485409609334805e-35, 1.570460452698576e-35, 1.3259576277275403e-35, -1.0626468931433479e-35, -1.6362881363446242e-35, -1.4952288142459497e-35, 1.892545904823883e-36, 
-1.0297330513203239e-35, 6.723827686703485e-36, -1.598672317118311e-35, 7.85230226349288e-36, -8.816207631167156e-38, 2.0424214345537245e-37, -1.1002627123696611e-35, -6.906029311080939e-37, 9.697828394283872e-37, 4.2787994369931265e-36, 4.020190679812223e-36, 1.0403125004777244e-36, 3.361913843351742e-36, 7.335084749131074e-36, 5.571843222897643e-36, 1.5140367238591063e-35, 1.0297330513203239e-35, 3.7145621485984285e-36, 1.0626468931433479e-35, -9.592033902709866e-36, -1.6080762719248893e-35, -8.510579099953362e-36, -2.5978425153172554e-36, 1.0297330513203239e-35, 7.335084749131074e-36, 4.114230227878006e-36, -6.629788138637702e-36, -4.9370762734536075e-36, 9.07481638834806e-36, 3.92615113174644e-36, 1.9184067805419732e-35, 5.101645482568728e-36, -3.879131357713549e-36, 4.2787994369931265e-36, 2.915225990039273e-36, 7.85230226349288e-36, -3.220854521253068e-36, -1.1660903960157092e-35, -9.968192094972998e-36, 3.5970127135162e-36, -1.7115197747972506e-35, -6.300649720407461e-36, 3.122406869371701e-39, 7.817037432968212e-37, -6.441709042506136e-36, 8.651638422052036e-36, 1.189600283032155e-35, 7.005946330900834e-36, 5.101645482568728e-36, 4.043700566828669e-36, -6.25362994637457e-36, 2.0100953399061116e-36, 2.550822741284364e-36, 5.054625708535836e-36, 5.360254239749631e-36, -1.9160557918403286e-36, -2.256949153578792e-36, 3.438320976155191e-37, 4.8430367253878245e-36, 4.3728389850589095e-36, 6.018531076210112e-36, 1.128474576789396e-36, -3.314894069318851e-36, 8.816207631167156e-37, -7.946341811558664e-36, 1.4576129950196365e-35, -2.339233758136352e-36, -5.172175143618065e-36, -8.228460455756013e-36, -1.0485409609334805e-35, -5.830451980078546e-36, 6.817867234769268e-36, 1.4576129950196365e-35, -2.277520304718182e-37, -1.1343520485435074e-36, -1.6456920911512025e-35, 2.997510594596833e-36, -1.0767528253532154e-35, 2.4215183626939123e-36, -4.1847598889273435e-36, -6.065550850243004e-36, 4.026068151566335e-37, 2.1276447749883404e-36, -9.844765188136658e-38, 5.736412432012763e-36, -5.054625708535836e-36, -4.020190679812223e-36, 3.0562853121379475e-36, -2.6566172328583698e-36, -4.913566386437162e-36, 9.827132772874324e-36, 1.5234406786656846e-35, 4.231779662960235e-36, 1.680956921675871e-36, -6.629788138637702e-36, -5.995021189193666e-36, 1.5398975995771966e-36, 2.5126191748826395e-37, 1.692711865184094e-36, 3.126814973187285e-36, -8.228460455756013e-36, -4.208269775943789e-36, 3.76158192263132e-36, -4.208269775943789e-36, -4.8430367253878245e-36, -4.86654661240427e-36, -1.410593220986745e-35, 4.1847598889273435e-36, -4.8430367253878245e-36, 2.997510594596833e-36, 7.758262715427098e-36, 7.273371295712904e-38, 2.715391950399484e-36, -4.419858759091801e-36, -5.571843222897643e-36, 4.70197740328915e-36, 1.0109251417071673e-35, -1.3259576277275403e-35, -1.4199971757933233e-35, 6.347669494440353e-36, -3.103305086170839e-36, 3.92615113174644e-36, -1.1378785315959743e-35, -1.128474576789396e-35, 1.2601299440814922e-35, -1.3929608057244107e-36, -9.8153778293661e-37, -1.095560734966372e-35, -1.2342690683634019e-36, -2.2334392665623463e-36, 3.9966807927957775e-37, 2.453844457341525e-37, 5.830451980078546e-36, 2.1158898314801175e-35, 2.0923799444636718e-36, 7.946341811558664e-36, -8.46355932592047e-36, 2.080625000955449e-36, 9.309915258512517e-36, 6.629788138637702e-36, 3.314894069318851e-36, -2.7976765549570443e-36, -2.245194210070569e-36, -2.621352402333701e-36, -6.770847460736376e-36, -1.570460452698576e-35, 1.1049646897729503e-35, 1.288341808501227e-35, 3.032775425121502e-36, 6.91190678283505e-36, 
-1.0203290965137456e-35, 5.430783900798968e-36, 1.6174802267314676e-35, -1.5798644075051544e-35, 6.159590398308787e-36, 1.0720508479499262e-35, -5.64237288394698e-36, 3.291384182302405e-36, -1.6692019781676483e-36, -9.027796614315168e-36, 4.020190679812223e-36, -1.1990042378387333e-35, -9.309915258512517e-36, 9.686073450775649e-36, -1.7021158199906723e-35, -8.595802440387977e-38, 1.1143686445795286e-35, -1.0861567801597937e-35, -4.4668785331246925e-36, 1.0579449157400588e-35, 2.8446963289899358e-36, -2.7976765549570443e-36, -4.437491174354135e-37, -5.995021189193666e-36, 1.2695338988880705e-35, 1.1002627123696611e-35, -4.055455510336892e-37, 7.52316384526264e-36, -5.3132344657167395e-36, 6.394689268473244e-36, -6.112570624275895e-36, -3.549992939483308e-36, -1.3929608057244107e-36, -6.817867234769268e-36, 7.570183619295532e-36, 8.287235173297127e-37, 1.2601299440814922e-35, 2.3274788146281293e-36, 2.8446963289899358e-36, -2.5860875718090325e-36, -1.4294011305999016e-35, -8.980776840282277e-36, 3.314894069318851e-36, 8.651638422052036e-36, -4.043700566828669e-36, 1.6174802267314676e-35, 7.429124297196857e-36, 4.796016951354933e-36, 4.1847598889273435e-36, 1.5610564978919978e-35, -1.1461069920517303e-36, 1.1801963282255767e-35, 2.5155579107596953e-36, 8.698658196084928e-36, 6.553381005834253e-37, 5.172175143618065e-36, 2.0124463286077562e-35, -5.054625708535836e-36, 1.8337711872827685e-35, 2.362743645152798e-36, -5.524823448864751e-36, 7.617203393328423e-36, 1.5610564978919978e-35, 4.231779662960235e-36, -5.47780367483186e-36, -8.745677970117819e-36, -6.876641952310382e-37, 8.963144425019942e-38, -1.1801963282255767e-35, -6.112570624275895e-36, -7.493776486492083e-37, -1.0109251417071673e-35, 8.228460455756013e-36, 2.3980084756774665e-36, -2.0718087933242817e-37, -9.827132772874324e-36, 3.1503248602037305e-36, -6.676807912670593e-36, -2.2804590405952378e-36, -3.408933617384634e-36, -2.82118644197349e-36, -1.3224311446750734e-36, 8.557598873986253e-36, 1.1472824864025526e-35, 9.262895484479626e-36, -3.314894069318851e-36, -7.147005652999508e-36, 1.0861567801597937e-35, -1.3353615825341186e-35, -6.770847460736376e-36, -5.995021189193666e-36, 8.839717518183602e-36, -1.0297330513203239e-35, 1.4199971757933233e-35, 7.617203393328423e-36, -1.6362881363446242e-35, -1.7632415262334313e-36, -1.4670169498262148e-35, 3.502973165450417e-36, -6.37705685321091e-37, 1.5140367238591063e-35, 4.349329098042464e-36, -6.018531076210112e-36, -1.3635734469538535e-35, 8.79269774415071e-36, 5.965633830423109e-37, 5.84808439534088e-37, 1.3635734469538535e-35, 7.946341811558664e-36, -1.189600283032155e-35, 6.58276836460481e-36, -6.958926556867942e-36, -7.147005652999508e-36, 1.4294011305999016e-35, 4.161250001910898e-36, -1.9395656788567744e-36, -1.1707923734189984e-35, 1.1990042378387333e-36, 6.629788138637702e-36, 2.915225990039273e-36, -6.25362994637457e-36, 4.725487290305596e-36, -2.080625000955449e-36, 3.667542374565537e-36, 4.231779662960235e-36, 9.121836162380951e-36, -1.3106762011668506e-36, 8.79269774415071e-36, -4.137740114894452e-36, 9.991701981989444e-38, -5.383764126766077e-36, -5.0311158215193905e-36, -1.0062231643038781e-35, -4.678467516272704e-36, 1.598672317118311e-36, -5.736412432012763e-36, -9.07481638834806e-36, 4.325819211026018e-36, 3.9026412447299945e-36, 2.1041348879718946e-36, -1.1472824864025526e-35, 7.288064975098183e-36, 2.1864194925294548e-36, -1.1801963282255767e-35, 2.82118644197349e-36, -1.6339371476429796e-36, 9.8153778293661e-37, 1.0626468931433479e-35, -9.168855936413843e-36, 
3.408933617384634e-36, 2.8917161030228273e-36, 7.052966104933725e-36, -1.6362881363446242e-35, 2.715391950399484e-36, 9.286405371496071e-37, -8.604618648019145e-36, 3.2443644082695135e-36, -6.723827686703485e-36, -7.899322037525772e-36, 4.607937855223367e-36, 1.476420904632793e-35, 6.629788138637702e-36, 5.5953531099140885e-36, -7.567244883418476e-38, 4.53740819417403e-36, -1.1566864412091309e-35, 1.0109251417071673e-35, -5.054625708535836e-36, 5.242704804667402e-36, 2.4685381367268038e-36, -1.8102613002663228e-36, 4.6549576292562585e-36, 8.698658196084928e-36, 2.3744985886610208e-36, -5.47780367483186e-36, -1.7279766957087626e-36, -3.785091809647766e-36, -6.488728816539027e-36, -1.1801963282255767e-35, -9.91823358506305e-38, 2.7506567809241528e-36, 7.335084749131074e-36, 9.262895484479626e-36, -1.0062231643038781e-35, -6.488728816539027e-36, 1.6550960459577808e-35, 4.1847598889273435e-36, -6.318282135669795e-37, -4.5609180811904755e-36, -8.510579099953362e-36, -1.410593220986745e-36, -3.0562853121379475e-36, 5.148665256601619e-36, -4.775445800215543e-38, -5.6893926579798715e-36, -1.410593220986745e-35, 1.3283086164291849e-36, -3.502973165450417e-36, -1.288341808501227e-35, 1.1002627123696611e-35, -4.443368646108247e-36, -1.704466808692317e-36, 1.4576129950196365e-35, -2.3862535321692436e-36, -2.1535056507064307e-35, 4.631447742239813e-36, -2.433273306202135e-36, -3.785091809647766e-36, 9.827132772874324e-36, -1.504632769052528e-36, -6.159590398308787e-36, 9.4039548065783e-36, -2.3321807920314184e-35, 3.9966807927957775e-36, 2.5860875718090325e-36, 1.0109251417071673e-35, 1.3635734469538535e-35, 1.4670169498262148e-35, 1.5610564978919978e-35, -9.545014128676975e-36, -6.206610172341678e-36, -9.309915258512517e-36, -9.07481638834806e-36, 4.173004945419121e-37, 1.7397316392169855e-36, 2.268704097087015e-36, -2.7976765549570443e-36, -1.1872492943305104e-36, -4.984096047486499e-36, -5.383764126766077e-36, 5.853961867094992e-36, -7.288064975098183e-36, -2.4920480237432495e-36, 9.356935032545409e-36, 4.490388420141138e-36, -7.099985878966617e-36, 1.5610564978919978e-35, -9.07481638834806e-36, -4.678467516272704e-36, 1.0203290965137456e-35, -9.262895484479626e-36, 1.598672317118311e-36, -3.691052261581983e-36, -7.993361585591555e-36, 7.476144071229749e-36, 9.592033902709866e-36, 7.382104523163966e-36, 3.92615113174644e-36, -4.3728389850589095e-36, -5.3132344657167395e-36, -8.639883478543813e-37, 1.0990872180188388e-36, -7.147005652999508e-36, -6.065550850243004e-36, 4.725487290305596e-36, -1.9395656788567744e-36, -6.91190678283505e-36, -4.86654661240427e-36, -8.557598873986253e-36, -4.137740114894452e-36, 5.3132344657167395e-36, -3.92615113174644e-36, 1.0520674439859473e-36, 1.0203290965137456e-35, 2.0923799444636718e-36, 1.626884181538046e-35, 9.827132772874324e-36, 6.906029311080939e-37, -9.93292726444833e-37, 2.2922139841034606e-36, 2.7624117244323756e-36, -5.289724578700294e-36, 3.267874295285959e-36, 9.521504241660529e-37, 3.173834747220176e-36, 2.0594661026406477e-35, -5.3132344657167395e-36, 2.644862289350147e-36, -2.6918820633830384e-36, -5.6893926579798715e-36, 6.817867234769268e-36, -2.8446963289899358e-36, -4.86654661240427e-36, 1.263656427133959e-36, -6.817867234769268e-36, 1.0203290965137456e-35, 1.0767528253532154e-35, 1.2107591813469561e-36, 7.229290257557068e-37, -4.584427968206921e-36, 9.07481638834806e-36, 8.228460455756013e-36, 7.2880649750981825e-37, 4.513898307157584e-36, 7.52316384526264e-37, -2.915225990039273e-36, 1.786751413249877e-36, -3.5970127135162e-36, 
-6.770847460736376e-36, 1.7115197747972506e-35, -1.9983403963978888e-36, 2.433273306202135e-36, 9.73309322480854e-36, -1.1707923734189984e-35, -6.535748590571919e-36, 9.286405371496071e-37, 4.796016951354933e-36, 7.617203393328423e-36, -2.256949153578792e-36, -5.5013135618483055e-36, -7.052966104933725e-36, -2.0923799444636718e-36, -7.335084749131074e-36, -1.664500000764359e-35, 5.6893926579798715e-36, -9.697828394283872e-37, -1.96307556587322e-36, 2.4215183626939123e-36, 5.571843222897643e-36, -4.8430367253878245e-36, -1.1660903960157092e-35, 5.900981641127883e-36, -3.549992939483308e-36, -2.0100953399061116e-36, -1.5516525430854195e-35, -5.7834322060456545e-36, -1.6339371476429796e-36, -8.933757066249385e-36, 1.570460452698576e-35, -4.772507064338487e-36, 1.4199971757933233e-35, 9.168855936413843e-36, 2.7976765549570443e-36, -3.502973165450417e-36, -1.0285575569695016e-36, -5.7834322060456545e-36, -3.408933617384634e-36, 3.667542374565537e-36, 9.545014128676975e-36, 6.91190678283505e-36, 7.664223167361315e-36, -1.4752454102819708e-36, 1.4458580515114136e-36, 3.849743998942992e-37, -2.527312854267918e-36, 4.86654661240427e-36, 7.758262715427098e-36, -1.6574470346594254e-36, 5.383764126766077e-36, 1.3071497181143837e-35, -1.1660903960157092e-35, -7.617203393328423e-36, -4.984096047486499e-36, 5.054625708535836e-37, -6.876641952310382e-37, -9.356935032545409e-36, -1.275411370642182e-36, 7.85230226349288e-36, 4.2787994369931265e-36, 6.612155723375367e-37, -1.316553672920962e-35, -1.2319180796617573e-35, -3.267874295285959e-36, 2.926980933547496e-36, 1.2695338988880705e-35, 5.289724578700294e-36, -1.3541694921472752e-35, 1.2695338988880705e-35, 1.7491355940235638e-35, 4.319941739271907e-37, -2.456783193218581e-36, -1.0391370061269022e-35, -1.88079096131566e-36, -7.899322037525772e-36, 9.545014128676975e-36, 5.995021189193666e-36, 2.9828169152115546e-37, -1.986585452889666e-36, 3.5970127135162e-36, 9.592033902709866e-36, -9.215875710446734e-36, -5.172175143618065e-36, 4.513898307157584e-36, -9.592033902709866e-36, 2.5126191748826395e-37, 2.997510594596833e-36, 1.2695338988880705e-35, -1.2977457633078054e-35, -8.557598873986253e-36, 1.0638223874941702e-36, 1.4011892661801667e-35, 4.6549576292562585e-36, 9.07481638834806e-36, 4.643202685748036e-37, -5.5953531099140885e-36, -2.0218502834143345e-36, 3.855621470697103e-36, -1.288341808501227e-35, 6.441709042506136e-36, 2.5743326283008096e-36, -1.2166366531010676e-36, 7.993361585591555e-36, -1.4011892661801667e-35, 5.64237288394698e-36, -5.995021189193666e-36, -6.770847460736376e-36, -7.335084749131074e-36, -2.3980084756774665e-36, -3.009265538105056e-36, 9.4039548065783e-36, 9.686073450775649e-36, 1.4870003537901937e-36, 9.521504241660529e-37, -3.197344634236622e-36, 4.9370762734536075e-36, -1.2601299440814922e-35, -1.0626468931433479e-35, -4.231779662960235e-36, -7.899322037525772e-36, 1.2166366531010676e-36, -1.3459410316915192e-36, -7.85230226349288e-36, 9.545014128676975e-36, 7.1940254270324e-36, 1.0062231643038781e-35, -5.948001415160775e-36, -4.349329098042464e-36, 8.745677970117819e-36, -2.057115113939003e-36, 1.344765537340697e-35, 1.504632769052528e-36, 1.369450918707965e-36, 3.855621470697103e-36, 3.197344634236622e-36, 8.287235173297127e-37, 7.640713280344869e-37, 7.005946330900834e-36, -8.040381359624447e-36, 3.197344634236622e-36, -5.5953531099140885e-36, 2.7859216114488214e-36, 5.47780367483186e-36, -4.4668785331246925e-37, 4.5609180811904755e-36, 3.103305086170839e-36, 2.256949153578792e-36, -5.64237288394698e-36, 
9.73309322480854e-36, 6.629788138637702e-36, -1.814963277669612e-35, 4.5609180811904755e-36, -6.465218929522581e-38, 5.524823448864751e-36, 3.549992939483308e-36, -5.995021189193666e-36, 1.2166366531010676e-36, 5.383764126766077e-36, 2.527312854267918e-37, -2.915225990039273e-36, 1.504632769052528e-35, 1.6692019781676483e-36, -2.915225990039273e-36, -1.222514124855179e-35, -5.64237288394698e-36, 7.993361585591555e-36, 2.3039689276116835e-36, 1.96307556587322e-36, 2.456783193218581e-36, 2.926980933547496e-36, 7.52316384526264e-36, 2.915225990039273e-36, -6.488728816539027e-36, -6.535748590571919e-36, 5.571843222897643e-36, 1.2131101700486007e-35, 2.4685381367268038e-36, 3.385423730368188e-36, -1.5516525430854195e-35, -8.79269774415071e-36, -3.573502826499754e-36, -1.3729774017604318e-35, 9.309915258512517e-36, 1.4952288142459497e-35, 6.318282135669795e-38, -1.2131101700486007e-35, -1.0720508479499262e-35, 8.369519777854687e-36, 1.3635734469538535e-35, -1.0861567801597937e-35, 1.504632769052528e-35, -5.64237288394698e-36, 7.147005652999508e-36, 3.92615113174644e-36, -1.586917373610088e-36, 1.250725989274914e-35, 5.172175143618065e-36, -3.032775425121502e-36, -3.8086016966642115e-36, -3.032775425121502e-36, 5.430783900798968e-36, 9.168855936413843e-36, 2.5155579107596953e-36, 4.2787994369931265e-36, -1.4011892661801667e-35, 1.598672317118311e-36, -4.5609180811904755e-36, -1.0720508479499262e-35, -2.6918820633830384e-36, -5.195685030634511e-36, 1.7021158199906723e-35, 1.189600283032155e-35, 1.128474576789396e-35, -6.676807912670593e-36, 1.288341808501227e-35, -6.994191387392611e-37, 1.0062231643038781e-35, 8.181440681723121e-36, -6.723827686703485e-36, -2.82118644197349e-36, -6.91190678283505e-36, -1.4294011305999016e-35, -1.814963277669612e-35, 3.6440324875490913e-37, 3.126814973187285e-36, 9.8153778293661e-37, 2.3069076634887392e-37, 9.497994354644083e-36, -5.830451980078546e-36, -5.363192975626687e-38, -1.1660903960157092e-35, -6.018531076210112e-36, -7.617203393328423e-36, 2.5743326283008096e-36, 9.592033902709866e-36, 2.4685381367268038e-36, -9.262895484479626e-36, -9.262895484479626e-36, -1.4482090402130582e-35, 8.980776840282277e-36, -3.785091809647766e-36, 1.3259576277275403e-35, -1.4752454102819708e-36, 3.173834747220176e-36, -2.1276447749883404e-36, 9.874152546907215e-36, 3.314894069318851e-36, -6.347669494440353e-36, 5.853961867094992e-36, -6.535748590571919e-36, -8.485599844998388e-38, 7.241045201065291e-36, -5.830451980078546e-36, -1.1990042378387333e-35, -1.3071497181143837e-35, -9.309915258512517e-36, 9.521504241660529e-37, 1.7632415262334313e-36, -5.6893926579798715e-36, 3.079795199154393e-36, -8.369519777854687e-36, 1.0485409609334805e-35, 4.53740819417403e-36, 1.0062231643038781e-35, 6.864887008802159e-36, 7.335084749131074e-36, 8.181440681723121e-36, 6.58276836460481e-36, 3.4530146555404696e-37, -1.4294011305999016e-35, 1.1990042378387333e-35, 1.0626468931433479e-35, -7.147005652999508e-36, -6.018531076210112e-36, 1.0485409609334805e-35, -1.3812058622161878e-36, 3.4324435044010795e-36, -5.383764126766077e-36, -4.302309324009572e-36, -6.065550850243004e-36, 6.723827686703485e-36, 9.4039548065783e-36, -4.2787994369931265e-36, 1.2977457633078054e-35, -5.195685030634511e-36, 5.172175143618065e-37, 5.0311158215193905e-36, -4.678467516272704e-36, -9.827132772874324e-36, 1.6080762719248893e-35, -9.07481638834806e-36, 2.5155579107596953e-36, 1.7491355940235638e-35, 1.095560734966372e-35, -6.817867234769268e-36, -1.3988382774785221e-36, 7.899322037525772e-36, -4.2787994369931265e-36, 
5.6893926579798715e-36, 6.91190678283505e-36, -4.0907203408615605e-36, 1.1002627123696611e-35, 1.0720508479499262e-35, -1.0932097462647274e-36, 1.0720508479499262e-35, 2.256949153578792e-36, 6.58276836460481e-36, -1.2131101700486007e-35, 6.112570624275895e-36, 3.6205226005326455e-36, 1.1707923734189984e-35, 6.159590398308787e-36, 7.147005652999508e-36, -4.984096047486499e-36, 8.698658196084928e-36, -7.493776486492083e-37, 5.360254239749631e-36, 1.6739039555709374e-35, 4.9370762734536075e-36, 6.553381005834253e-37, 9.168855936413843e-37, -8.040381359624447e-36, -2.5743326283008096e-36, 6.347669494440353e-36, -2.3862535321692436e-36, -3.879131357713549e-36, 8.416539551887579e-36, 3.408933617384634e-36, -1.1472824864025526e-35, -4.678467516272704e-36, -1.0297330513203239e-35, 2.938735877055719e-38, -5.172175143618065e-36, -1.5140367238591063e-35, -7.099985878966617e-36, -9.991701981989444e-37, -6.864887008802159e-36, 1.3738590225235485e-37, -9.592033902709866e-36, 6.394689268473244e-36, -1.4670169498262148e-35, 4.913566386437162e-36, -9.686073450775649e-36, -1.1225971050352846e-36, 1.8220162437745456e-36, 9.07481638834806e-36, -1.5516525430854195e-35, -5.995021189193666e-36, -3.291384182302405e-36, 4.796016951354933e-36, 1.4576129950196365e-36, 1.8055593228630336e-35, -1.8337711872827685e-35, 1.2789378536946488e-35, 9.592033902709866e-36, 6.347669494440353e-36, -7.335084749131074e-36, -7.4350017689509685e-37, 9.309915258512517e-36, 2.5978425153172554e-36, 1.7115197747972506e-35, 6.300649720407461e-36, 3.361913843351742e-36, 6.629788138637702e-36, -2.068870057447226e-36, -4.86654661240427e-36, 4.731364762059707e-37, 1.1049646897729503e-35, -8.698658196084928e-36, -4.86654661240427e-36, -6.58276836460481e-36, 9.051306501331614e-37, -1.0861567801597937e-35, -1.0062231643038781e-35, 7.241045201065291e-36, -1.2319180796617573e-35, 3.92615113174644e-36, 4.3728389850589095e-36, -1.4576129950196365e-36, -1.4576129950196365e-35, 1.6530389308438418e-38, 2.5860875718090325e-36, 2.3744985886610208e-36, -1.0109251417071673e-35, 7.85230226349288e-36, -1.4576129950196365e-36, -7.85230226349288e-36, 1.2131101700486007e-35, 9.874152546907215e-36, 3.032775425121502e-36, -1.986585452889666e-36, 1.2460240118716248e-36, 1.6574470346594254e-36, -1.4670169498262148e-35, 3.314894069318851e-36, -3.4824020143110267e-37, -1.222514124855179e-35, -1.9043008483321058e-36, -3.314894069318851e-36, -9.73309322480854e-36, 1.369450918707965e-36, 4.86654661240427e-36, -2.0453601704307803e-36, -1.9043008483321058e-36, 1.2977457633078054e-35, 8.275480229788904e-36, 7.052966104933725e-36, 9.345180089037186e-37, 1.2131101700486007e-35, 6.906029311080939e-37, 1.6456920911512025e-35, -1.3259576277275403e-35, -1.4670169498262148e-35, -1.7279766957087626e-36, 2.350988701644575e-36, 2.4685381367268038e-36, 5.6893926579798715e-36, 3.8086016966642115e-36, 7.099985878966617e-36, 2.4685381367268038e-36, 5.853961867094992e-36, 4.8430367253878245e-36, -7.241045201065291e-36, 4.678467516272704e-36, 1.3635734469538535e-35, 5.360254239749631e-36, 1.692711865184094e-35, 7.85230226349288e-36, -2.068870057447226e-36, -1.1167196332811731e-36, 4.396348872075355e-36, -6.723827686703485e-36, -4.255289549976681e-36, 3.92615113174644e-36, 7.099985878966617e-36, 6.206610172341678e-36, -2.068870057447226e-35, -4.419858759091801e-36, -2.6889433275059827e-37, 1.2695338988880705e-35, -2.527312854267918e-36, 9.874152546907215e-36, 2.204051907791789e-37, 8.933757066249385e-36, -7.147005652999508e-36, -1.6456920911512025e-36, 1.1225971050352846e-36, 
-7.4350017689509685e-37, -3.502973165450417e-36, -3.667542374565537e-36, -4.7489971773220415e-36, -7.335084749131074e-36, -1.0579449157400588e-35, 7.099985878966617e-36, -1.786751413249877e-35, -4.913566386437162e-36, -8.228460455756013e-36, -1.2166366531010676e-36, -1.3259576277275403e-35, 3.785091809647766e-36, 4.419858759091801e-36, 1.0861567801597937e-35, 9.874152546907215e-36, 1.1990042378387333e-36, -1.1378785315959743e-35, -1.5798644075051544e-35, -1.774996469741654e-36, 6.629788138637702e-36, 3.173834747220176e-36, -1.410593220986745e-36, 3.4530146555404696e-37, -1.2695338988880705e-35, -3.738072035614874e-36, 8.651638422052036e-36, 8.087401133657338e-36, -1.0861567801597937e-35, 2.1535056507064307e-35, -1.986585452889666e-36, 3.197344634236622e-36, 2.1599708696359533e-37, -3.2473031441465692e-37, 3.173834747220176e-36, 1.96307556587322e-36, -4.3728389850589095e-36, -3.291384182302405e-36, 7.617203393328423e-36, 2.0594661026406477e-35, -1.1990042378387333e-35, -5.242704804667402e-36, 1.8337711872827685e-36, -8.087401133657338e-36, -2.256949153578792e-36, -3.92615113174644e-36, 4.161250001910898e-36, 6.112570624275895e-36, -9.309915258512517e-36, -8.181440681723121e-36, 7.617203393328423e-36, -1.3259576277275403e-35, 3.408933617384634e-36, 9.262895484479626e-36, -3.408933617384634e-36, 8.322500003821796e-36, -3.573502826499754e-36, -4.984096047486499e-36, -3.614645128778534e-37, 1.0626468931433479e-35, 1.1566864412091309e-35, -7.570183619295532e-36, 1.275411370642182e-36, 3.6205226005326455e-36, 1.476420904632793e-35, 5.5013135618483055e-36, 2.456783193218581e-36, 9.968192094972998e-36, 8.839717518183602e-36, -9.686073450775649e-36, 3.738072035614874e-36, 3.3384039563352965e-36, -1.8619830517025034e-35, -3.691052261581983e-36, 6.629788138637702e-36, 1.8220162437745456e-36, 1.2789378536946488e-35, -4.2787994369931265e-36, 3.973170905779332e-36, 1.0109251417071673e-35, 3.7145621485984285e-36, 6.958926556867942e-36, -1.692711865184094e-35, -7.85230226349288e-36, -8.228460455756013e-36, 1.7985063567581e-36, 9.4039548065783e-36, -1.0579449157400588e-36, -3.879131357713549e-36, -2.433273306202135e-36, -8.698658196084928e-36, 1.1049646897729503e-35, 1.516387712560751e-36, -6.629788138637702e-36, -6.629788138637702e-36, -4.913566386437162e-36, 8.839717518183602e-36, 1.2413220344683356e-35, -3.197344634236622e-36, -5.524823448864751e-36, -5.830451980078546e-36, 2.0923799444636718e-36, -4.9370762734536075e-36, -7.993361585591555e-36, -5.948001415160775e-36]) + if inputs_embeds is not None: hidden_states = inputs_embeds else: hidden_states = self.get_input_embeddings(input_ids) + residual = None for i in range(len(self.layers)): layer = self.layers[i] @@ -290,7 +293,9 @@ def forward( attn_metadata, residual, ) + print(f"idx: {i}: {hidden_states}") hidden_states, _ = self.norm(hidden_states, residual) + print(hidden_states) return hidden_states From 88ba83b733f7ccfee66a18e591abc2ec40db3a7e Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Wed, 24 Apr 2024 02:03:09 +0000 Subject: [PATCH 07/90] added static fp8 --- run_fp8.py | 5 + .../layers/quantization/fp8_static.py | 218 ++++++++++++++++++ vllm/model_executor/models/llama.py | 3 +- 3 files changed, 224 insertions(+), 2 deletions(-) create mode 100644 run_fp8.py create mode 100644 vllm/model_executor/layers/quantization/fp8_static.py diff --git a/run_fp8.py b/run_fp8.py new file mode 100644 index 0000000000000..4b3833be8337e --- /dev/null +++ b/run_fp8.py @@ -0,0 +1,5 @@ +from vllm import LLM + +model = 
LLM("FriendliAI/Mistral-7B-Instruct-v0.2-fp8", quantization="fp8_static", enforce_eager=True, max_model_len=1024) +# model = LLM("mistralai/Mistral-7B-Instruct-v0.2", enforce_eager=True, max_model_len=1024) +print(model.generate("Hello my name is")) \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/fp8_static.py b/vllm/model_executor/layers/quantization/fp8_static.py new file mode 100644 index 0000000000000..1def8c0f9fcd3 --- /dev/null +++ b/vllm/model_executor/layers/quantization/fp8_static.py @@ -0,0 +1,218 @@ +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +from torch.nn.parameter import Parameter + +from vllm.model_executor.layers.linear import (LinearMethodBase, + set_weight_attrs) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) + + +class FP8StaticConfig(QuantizationConfig): + """Config class for FP8.""" + + @classmethod + def get_name(cls) -> str: + return "fp8_static" + + @classmethod + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.bfloat16, torch.half] + + @classmethod + def get_min_capability(cls) -> int: + return 89 + + @classmethod + def get_config_filenames(cls) -> List[str]: + return [] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "FP8StaticConfig": + return cls() + + def get_linear_method(self) -> "Fp8LinearMethod": + return Fp8LinearMethod(self) + + def get_scaled_act_names(self) -> List[str]: + return [] + + +class Fp8LinearMethod(LinearMethodBase): + """Linear method for StaticFP8 + . + Args: + quant_config: The quantization config. + """ + + def __init__(self, quant_config: FP8StaticConfig): + self.quant_config = quant_config + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + del input_size, output_size + + weight = Parameter(torch.empty(sum(output_partition_sizes), + input_size_per_partition, + dtype=torch.float8_e4m3fn), + requires_grad=False) + layer.register_parameter("weight", weight) + set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) + set_weight_attrs(weight, extra_weight_attrs) + + weight_scale = Parameter( + torch.empty( + len(output_partition_sizes), + device='cuda', dtype=torch.float32, + ), requires_grad=False + ) + layer.register_parameter("weight_scale", weight_scale) + set_weight_attrs(weight_scale, extra_weight_attrs) + set_weight_attrs(weight_scale, { + "shard_indexer": self.scales_shard_indexer, + }) + + in_scale = Parameter( + torch.empty( + len(output_partition_sizes), + device='cuda', dtype=torch.float32, + ), requires_grad=False + ) + layer.register_parameter("in_scale", in_scale) + set_weight_attrs(in_scale, extra_weight_attrs) + set_weight_attrs(in_scale, { + "shard_indexer": self.scales_shard_indexer, + }) + + layer.logical_widths = output_partition_sizes + + def shard_id_as_int( + self, + shard_id: Union[str, int] + ) -> int: + if isinstance(shard_id, int): + return shard_id + assert isinstance(shard_id, str) + qkv_idxs = { "q": 0, "k": 1, "v": 2 } + assert shard_id in qkv_idxs + return qkv_idxs[shard_id] + + # def scales_shard_splitter_NKK( + # self, + # param: torch.Tensor, + # loaded_weight: torch.Tensor, + # shard_id: Union[str, int], + # logical_widths: torch.Tensor + # ) -> Tuple[torch.Tensor, torch.Tensor]: + # shard_id = self.shard_id_as_int(shard_id) + # offset = sum(logical_widths[:shard_id]) + # size = 
diff --git a/vllm/model_executor/layers/quantization/fp8_static.py b/vllm/model_executor/layers/quantization/fp8_static.py
new file mode 100644
index 0000000000000..1def8c0f9fcd3
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/fp8_static.py
@@ -0,0 +1,218 @@
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+from torch.nn.parameter import Parameter
+
+from vllm.model_executor.layers.linear import (LinearMethodBase,
+                                               set_weight_attrs)
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+
+
+class FP8StaticConfig(QuantizationConfig):
+    """Config class for static FP8."""
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "fp8_static"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.bfloat16, torch.half]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 89
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return []
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "FP8StaticConfig":
+        return cls()
+
+    def get_linear_method(self) -> "Fp8LinearMethod":
+        return Fp8LinearMethod(self)
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class Fp8LinearMethod(LinearMethodBase):
+    """Linear method for static FP8.
+
+    Args:
+        quant_config: The quantization config.
+    """
+
+    def __init__(self, quant_config: FP8StaticConfig):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        del input_size, output_size
+
+        weight = Parameter(torch.empty(sum(output_partition_sizes),
+                                       input_size_per_partition,
+                                       dtype=torch.float8_e4m3fn),
+                           requires_grad=False)
+        layer.register_parameter("weight", weight)
+        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
+        set_weight_attrs(weight, extra_weight_attrs)
+
+        # One scale per logical shard (e.g. q/k/v), loaded via shard_indexer.
+        weight_scale = Parameter(
+            torch.empty(
+                len(output_partition_sizes),
+                device='cuda', dtype=torch.float32,
+            ), requires_grad=False
+        )
+        layer.register_parameter("weight_scale", weight_scale)
+        set_weight_attrs(weight_scale, extra_weight_attrs)
+        set_weight_attrs(weight_scale, {
+            "shard_indexer": self.scales_shard_indexer,
+        })
+
+        in_scale = Parameter(
+            torch.empty(
+                len(output_partition_sizes),
+                device='cuda', dtype=torch.float32,
+            ), requires_grad=False
+        )
+        layer.register_parameter("in_scale", in_scale)
+        set_weight_attrs(in_scale, extra_weight_attrs)
+        set_weight_attrs(in_scale, {
+            "shard_indexer": self.scales_shard_indexer,
+        })
+
+        layer.logical_widths = output_partition_sizes
+
+    def shard_id_as_int(
+        self,
+        shard_id: Union[str, int]
+    ) -> int:
+        if isinstance(shard_id, int):
+            return shard_id
+        assert isinstance(shard_id, str)
+        qkv_idxs = {"q": 0, "k": 1, "v": 2}
+        assert shard_id in qkv_idxs
+        return qkv_idxs[shard_id]
+
+    # def scales_shard_splitter_NKK(
+    #     self,
+    #     param: torch.Tensor,
+    #     loaded_weight: torch.Tensor,
+    #     shard_id: Union[str, int],
+    #     logical_widths: torch.Tensor
+    # ) -> Tuple[torch.Tensor, torch.Tensor]:
+    #     shard_id = self.shard_id_as_int(shard_id)
+    #     offset = sum(logical_widths[:shard_id])
+    #     size = logical_widths[shard_id]
+    #     # update loaded weight with copies for broadcast.
+    #     loaded_weight = loaded_weight.repeat(size)
+    #     return param[offset : offset + size], loaded_weight
+
+    def scales_shard_indexer(
+        self,
+        param: torch.Tensor,
+        loaded_weight: torch.Tensor,
+        shard_id: Union[str, int],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Each shard's scalar scale lands at its index in the 1D scale param.
+        return param[self.shard_id_as_int(shard_id)], loaded_weight
+
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
+        logical_widths = layer.logical_widths
+        q_weight = layer.weight
+        w_scales = layer.weight_scale
+        in_scales = layer.in_scale
+
+        # Dequantize each logical shard with its own scale and run the
+        # matmul in the activation dtype (fake quantization for now).
+        output = torch.zeros(x.shape[0], q_weight.shape[0], dtype=x.dtype, device="cuda")
+        start_offset = 0
+        for logical_width, w_scale, in_scale in zip(logical_widths, w_scales, in_scales):
+            end_offset = start_offset + logical_width
+            weight_dq = self._dequantize(q_weight[start_offset:end_offset, :], w_scale, x.dtype)
+            x_dq = self._fake_quantize_static(x, in_scale)
+            output[:, start_offset:end_offset] = torch.nn.functional.linear(x_dq, weight_dq)
+            start_offset = end_offset
+
+        assert end_offset == output.shape[1]
+        return output
+
+    def _quantize_dynamic(self, x: torch.Tensor):
+        finfo = torch.finfo(torch.float8_e4m3fn)
+        min_val, max_val = x.aminmax()
+        amax = min_val.abs().max(max_val.abs())
+        scale = finfo.max / amax.clamp(min=1e-12)
+        # Scale and clamp the tensor to bring it into the representable
+        # range of the float8 data type (the default cast is unsaturated).
+        qweight = (x * scale).clamp(min=finfo.min, max=finfo.max)
+        # Return both the float8 data and the inverse scale (as float),
+        # as both are required as inputs to torch._scaled_mm.
+        return qweight, scale.float().reciprocal()
+
+    def _quantize(self, x: torch.Tensor, inv_scale: torch.Tensor):
+        finfo = torch.finfo(torch.float8_e4m3fn)
+        return (x / inv_scale).clamp(min=finfo.min, max=finfo.max)
+
+    def _dequantize(self, xq: torch.Tensor, inv_scale: torch.Tensor, dtype: torch.dtype):
+        return xq.to(dtype) * inv_scale
+
+    def _fake_quantize_static(self, x: torch.Tensor, inv_scale: torch.Tensor):
+        # Quantize and immediately dequantize with the static input scale.
+        xq = self._quantize(x, inv_scale)
+        xdq = self._dequantize(xq, inv_scale, x.dtype)
+        return xdq
+
+
+def per_tensor_quantize(tensor: torch.Tensor) -> Tuple[torch.Tensor, float]:
+    """Quantize a tensor using a per-tensor static scaling factor.
+
+    Args:
+        tensor: The input tensor.
+    """
+    finfo = torch.finfo(torch.float8_e4m3fn)
+    # Calculate the scale as dtype max divided by absmax.
+    # Since .abs() creates a new tensor, we use aminmax to get
+    # the min and max first and then calculate the absmax.
+    min_val, max_val = tensor.aminmax()
+    amax = min_val.abs().max(max_val.abs())
+    scale = finfo.max / amax.clamp(min=1e-12)
+    # Scale and clamp the tensor to bring it into the representable
+    # range of the float8 data type (the default cast is unsaturated).
+    qweight = (tensor * scale).clamp(min=finfo.min, max=finfo.max)
+    # Return both the float8 data and the inverse scale (as float),
+    # as both are required as inputs to torch._scaled_mm.
+    qweight = qweight.to(torch.float8_e4m3fn)
+    scale = scale.float().reciprocal()
+    return qweight, scale
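The numerics above reduce to one rule: scale = finfo.max / absmax, stored as its reciprocal for dequantization. A self-contained sketch of the quantize/dequantize round trip behind apply_weights (independent of vLLM; note the activation scale is computed on the fly here for illustration, whereas the patch loads a static in_scale from the checkpoint):

import torch

def quantize_per_tensor(t: torch.Tensor):
    # scale = finfo.max / absmax; the reciprocal is kept for dequantization.
    finfo = torch.finfo(torch.float8_e4m3fn)
    min_val, max_val = t.aminmax()
    amax = min_val.abs().max(max_val.abs())
    scale = finfo.max / amax.clamp(min=1e-12)
    qt = (t * scale).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
    return qt, scale.float().reciprocal()

x = torch.randn(4, 8, dtype=torch.float16)
w = torch.randn(16, 8, dtype=torch.float16)
qw, w_inv_scale = quantize_per_tensor(w)

# Dequantize the weight and fake-quantize the activation, as apply_weights does.
w_dq = qw.to(torch.float16) * w_inv_scale
qx, x_inv_scale = quantize_per_tensor(x)
x_dq = qx.to(torch.float16) * x_inv_scale
out = torch.nn.functional.linear(x_dq, w_dq)

# The result should differ from the FP16 matmul only by quantization error.
print((out - torch.nn.functional.linear(x, w)).abs().max())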
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 22957a78cbc52..8b8679b9bc1d5 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -276,13 +276,12 @@ def forward(
         attn_metadata: AttentionMetadata,
         inputs_embeds: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        self.embed_tokens.weight[0] = torch.Tensor([-2.0336052269225574e-36, ...])
[the removed line above was a hardcoded debug assignment; its several thousand float literals are elided here]
9.497994354644083e-36, 5.948001415160775e-36, 7.85230226349288e-36, -5.242704804667402e-36, 3.9966807927957775e-36, -6.018531076210112e-36, 7.147005652999508e-36, -2.997510594596833e-36, -6.91190678283505e-36, -8.745677970117819e-36, 3.5970127135162e-36, -4.325819211026018e-36, -6.629788138637702e-36, -1.88079096131566e-35, 1.1707923734189984e-35, -5.524823448864751e-36, -1.9983403963978888e-36, 1.222514124855179e-35, -1.0485409609334805e-35, -3.314894069318851e-36, -1.1990042378387333e-35, 9.686073450775649e-36, 9.262895484479626e-36, -2.3980084756774665e-36, 4.4668785331246925e-36, 5.64237288394698e-36, 6.958926556867942e-36, -6.25362994637457e-36, 3.3384039563352965e-36, -1.189600283032155e-35, -8.275480229788904e-36, -1.680956921675871e-36, -1.7021158199906723e-35, 1.316553672920962e-35, -3.5970127135162e-36, 5.612985525176423e-37, -8.46355932592047e-36, -3.644032487549091e-36, -1.1143686445795286e-35, 3.361913843351742e-36, -2.4920480237432495e-36, 3.7145621485984285e-36, 3.173834747220176e-36, -8.839717518183602e-36, 4.984096047486499e-36, 1.0579449157400588e-35, 6.347669494440353e-36, -8.369519777854687e-36, -4.725487290305596e-36, 1.8619830517025034e-35, -3.949661018762886e-36, -1.3259576277275403e-35, 4.796016951354933e-36, -7.711242941394206e-36, 2.747718045047097e-37, 1.1402295202976189e-36, 1.96307556587322e-36, 2.71833068627654e-37, -1.0579449157400588e-35, -1.2930437859045163e-36, -5.0311158215193905e-36, 1.1931267660846218e-36, -1.222514124855179e-35, 7.946341811558664e-36, 4.86654661240427e-36, -9.827132772874324e-36, 9.545014128676975e-36, -5.5013135618483055e-36, 3.6205226005326455e-36, -9.827132772874324e-36, 1.0203290965137456e-35, -1.626884181538046e-35, -8.369519777854687e-36, -2.644862289350147e-36, -8.557598873986253e-36, -9.686073450775649e-36, 4.255289549976681e-36, -1.570460452698576e-35, -3.96729343402522e-37, 9.051306501331614e-37, 7.993361585591555e-36, -1.034435028723613e-35, 2.245194210070569e-36, -1.0461899722318359e-36, -1.0485409609334805e-35, -6.770847460736376e-36, -5.242704804667402e-36, 3.636685647856452e-38, -6.535748590571919e-36, -1.6574470346594254e-36, -2.7036370068912613e-37, -5.5953531099140885e-36, 1.344765537340697e-35, 2.0124463286077562e-35, -1.1190706219828177e-35, 2.915225990039273e-36, -1.3283086164291849e-36, 1.8431751420893468e-35, -1.476420904632793e-35, -9.686073450775649e-36, -4.419858759091801e-36, -1.3459410316915192e-36, -1.1472824864025526e-35, 1.2131101700486007e-35, 7.85230226349288e-36, 1.1990042378387333e-36, -7.241045201065291e-36, 6.347669494440353e-36, -5.360254239749631e-36, 8.745677970117819e-36, -7.52316384526264e-36, -6.58276836460481e-36, -1.189600283032155e-35, -2.3744985886610208e-36, -9.592033902709866e-36, -2.268704097087015e-36, -2.621352402333701e-36, 9.827132772874324e-36, 7.052966104933725e-36, 3.314894069318851e-36, 1.4294011305999016e-35, 7.570183619295532e-36, 6.817867234769268e-36, 6.817867234769268e-36, 1.8901949161222383e-35, -4.2787994369931265e-36, -1.0109251417071673e-35, -9.686073450775649e-36, -1.3259576277275403e-35, 1.0391370061269022e-35, -6.676807912670593e-36, -9.497994354644083e-36, -8.510579099953362e-36, -1.0203290965137456e-35, -1.3106762011668506e-36, 7.099985878966617e-36, 2.73890183741593e-36, 1.598672317118311e-36, 1.263656427133959e-36, -1.586917373610088e-36, -1.7115197747972506e-35, 4.319941739271907e-37, -1.692711865184094e-36, -1.9043008483321058e-36, -2.621352402333701e-36, 6.171345341817009e-37, -1.7279766957087626e-36, -4.5609180811904755e-36, 9.639053676742758e-37, 
1.2037062152420224e-35, -5.195685030634511e-36, -2.1981744360376776e-36, 4.913566386437162e-36, -2.2099293795459005e-36, -3.173834747220176e-36, 4.2787994369931265e-36, 1.3635734469538535e-35, 1.0579449157400588e-35, 1.2319180796617573e-35, -2.245194210070569e-36, 1.504632769052528e-35, 2.5978425153172554e-36, -4.796016951354933e-36, 5.524823448864751e-36, 6.488728816539027e-36, 1.0109251417071673e-35, -1.1707923734189984e-35, -1.9654265545748647e-35, 7.52316384526264e-36, -1.8713870065090817e-35, 1.1402295202976189e-36, 9.844765188136658e-38, 4.8430367253878245e-36, 9.686073450775649e-36, -1.2977457633078054e-35, 1.3353615825341186e-35, -1.2131101700486007e-35, 6.488728816539027e-36, -2.8446963289899358e-36, 2.8446963289899358e-36, -1.4294011305999016e-35, 7.382104523163966e-36, -9.497994354644083e-36, 1.1660903960157092e-35, -4.678467516272704e-36, 9.07481638834806e-36, -8.322500003821796e-36, 9.356935032545409e-36, -2.997510594596833e-36, 1.3917853113735884e-35, 1.3259576277275403e-35, 2.527312854267918e-36, 4.86654661240427e-36, 5.64237288394698e-36, -8.839717518183602e-36, 1.9395656788567744e-37, -1.1637394073140646e-36, 6.347669494440353e-36, 7.899322037525772e-36, -1.6080762719248893e-35, 4.760752120830264e-37, -4.772507064338487e-36, 8.087401133657338e-36, 1.189600283032155e-35, 6.535748590571919e-36, 1.43880508540648e-35, -9.686073450775649e-36, 1.2037062152420224e-35, 7.805282489459989e-36, 3.009265538105056e-36, 1.3867159919856673e-38, 3.738072035614874e-36, 2.339233758136352e-36, -8.839717518183602e-36, -1.1190706219828177e-35, -1.6550960459577808e-35, 9.07481638834806e-36, 6.629788138637702e-36, 8.698658196084928e-36, 6.488728816539027e-36, -5.524823448864751e-36, 3.855621470697103e-36, 8.46355932592047e-36, -2.8446963289899358e-36, 3.009265538105056e-36, -7.52316384526264e-36, 3.471381754772068e-38, -3.973170905779332e-36, -7.946341811558664e-36, -5.113400426076951e-37, 1.2871663141504048e-36, -2.915225990039273e-36, -2.7506567809241528e-36, -4.8430367253878245e-36, 8.557598873986253e-36, -3.385423730368188e-36, 1.2695338988880705e-35, -3.8086016966642115e-36, -1.1225971050352846e-36, -2.433273306202135e-36, -2.1393997184965633e-36, -1.0109251417071673e-35, -1.1190706219828177e-35, -3.3384039563352965e-36, -2.4215183626939123e-36, -4.255289549976681e-36, 7.099985878966617e-36, 3.5970127135162e-36, 8.886737292216494e-36, -6.065550850243004e-36, -4.984096047486499e-36, 8.632536638851174e-38, 2.0100953399061116e-36, 9.07481638834806e-36, -4.114230227878006e-36, -3.220854521253068e-36, 2.5860875718090325e-36, 5.319111937470851e-37, 3.032775425121502e-36, 7.617203393328423e-36, 7.617203393328423e-36, 6.347669494440353e-36, -8.510579099953362e-36, 5.830451980078546e-36, -4.9370762734536075e-37, 5.900981641127883e-36, 6.065550850243004e-36, -5.583598166405866e-37, 6.864887008802159e-36, -9.356935032545409e-36, 6.958926556867942e-36, -2.268704097087015e-36, 1.1472824864025526e-35, 3.644032487549091e-36, 1.9513206223649973e-36, -7.93458686805044e-37, 8.557598873986253e-36, -1.4199971757933233e-35, -3.832111583680657e-36, 4.396348872075355e-36, -1.692711865184094e-35, -2.8917161030228273e-36, 1.946618644961708e-35, 1.2131101700486007e-35, 5.736412432012763e-36, -7.946341811558664e-36, 8.322500003821796e-36, 5.430783900798968e-36, -1.1461069920517303e-36, -5.289724578700294e-36, 3.361913843351742e-36, 1.3259576277275403e-35, -6.300649720407461e-36, 6.817867234769268e-36, -3.502973165450417e-36, 4.1847598889273435e-36, -2.82118644197349e-36, 9.968192094972998e-36, 
-1.0485409609334805e-35, 1.570460452698576e-35, 1.3259576277275403e-35, -1.0626468931433479e-35, -1.6362881363446242e-35, -1.4952288142459497e-35, 1.892545904823883e-36, -1.0297330513203239e-35, 6.723827686703485e-36, -1.598672317118311e-35, 7.85230226349288e-36, -8.816207631167156e-38, 2.0424214345537245e-37, -1.1002627123696611e-35, -6.906029311080939e-37, 9.697828394283872e-37, 4.2787994369931265e-36, 4.020190679812223e-36, 1.0403125004777244e-36, 3.361913843351742e-36, 7.335084749131074e-36, 5.571843222897643e-36, 1.5140367238591063e-35, 1.0297330513203239e-35, 3.7145621485984285e-36, 1.0626468931433479e-35, -9.592033902709866e-36, -1.6080762719248893e-35, -8.510579099953362e-36, -2.5978425153172554e-36, 1.0297330513203239e-35, 7.335084749131074e-36, 4.114230227878006e-36, -6.629788138637702e-36, -4.9370762734536075e-36, 9.07481638834806e-36, 3.92615113174644e-36, 1.9184067805419732e-35, 5.101645482568728e-36, -3.879131357713549e-36, 4.2787994369931265e-36, 2.915225990039273e-36, 7.85230226349288e-36, -3.220854521253068e-36, -1.1660903960157092e-35, -9.968192094972998e-36, 3.5970127135162e-36, -1.7115197747972506e-35, -6.300649720407461e-36, 3.122406869371701e-39, 7.817037432968212e-37, -6.441709042506136e-36, 8.651638422052036e-36, 1.189600283032155e-35, 7.005946330900834e-36, 5.101645482568728e-36, 4.043700566828669e-36, -6.25362994637457e-36, 2.0100953399061116e-36, 2.550822741284364e-36, 5.054625708535836e-36, 5.360254239749631e-36, -1.9160557918403286e-36, -2.256949153578792e-36, 3.438320976155191e-37, 4.8430367253878245e-36, 4.3728389850589095e-36, 6.018531076210112e-36, 1.128474576789396e-36, -3.314894069318851e-36, 8.816207631167156e-37, -7.946341811558664e-36, 1.4576129950196365e-35, -2.339233758136352e-36, -5.172175143618065e-36, -8.228460455756013e-36, -1.0485409609334805e-35, -5.830451980078546e-36, 6.817867234769268e-36, 1.4576129950196365e-35, -2.277520304718182e-37, -1.1343520485435074e-36, -1.6456920911512025e-35, 2.997510594596833e-36, -1.0767528253532154e-35, 2.4215183626939123e-36, -4.1847598889273435e-36, -6.065550850243004e-36, 4.026068151566335e-37, 2.1276447749883404e-36, -9.844765188136658e-38, 5.736412432012763e-36, -5.054625708535836e-36, -4.020190679812223e-36, 3.0562853121379475e-36, -2.6566172328583698e-36, -4.913566386437162e-36, 9.827132772874324e-36, 1.5234406786656846e-35, 4.231779662960235e-36, 1.680956921675871e-36, -6.629788138637702e-36, -5.995021189193666e-36, 1.5398975995771966e-36, 2.5126191748826395e-37, 1.692711865184094e-36, 3.126814973187285e-36, -8.228460455756013e-36, -4.208269775943789e-36, 3.76158192263132e-36, -4.208269775943789e-36, -4.8430367253878245e-36, -4.86654661240427e-36, -1.410593220986745e-35, 4.1847598889273435e-36, -4.8430367253878245e-36, 2.997510594596833e-36, 7.758262715427098e-36, 7.273371295712904e-38, 2.715391950399484e-36, -4.419858759091801e-36, -5.571843222897643e-36, 4.70197740328915e-36, 1.0109251417071673e-35, -1.3259576277275403e-35, -1.4199971757933233e-35, 6.347669494440353e-36, -3.103305086170839e-36, 3.92615113174644e-36, -1.1378785315959743e-35, -1.128474576789396e-35, 1.2601299440814922e-35, -1.3929608057244107e-36, -9.8153778293661e-37, -1.095560734966372e-35, -1.2342690683634019e-36, -2.2334392665623463e-36, 3.9966807927957775e-37, 2.453844457341525e-37, 5.830451980078546e-36, 2.1158898314801175e-35, 2.0923799444636718e-36, 7.946341811558664e-36, -8.46355932592047e-36, 2.080625000955449e-36, 9.309915258512517e-36, 6.629788138637702e-36, 3.314894069318851e-36, -2.7976765549570443e-36, 
-2.245194210070569e-36, -2.621352402333701e-36, -6.770847460736376e-36, -1.570460452698576e-35, 1.1049646897729503e-35, 1.288341808501227e-35, 3.032775425121502e-36, 6.91190678283505e-36, -1.0203290965137456e-35, 5.430783900798968e-36, 1.6174802267314676e-35, -1.5798644075051544e-35, 6.159590398308787e-36, 1.0720508479499262e-35, -5.64237288394698e-36, 3.291384182302405e-36, -1.6692019781676483e-36, -9.027796614315168e-36, 4.020190679812223e-36, -1.1990042378387333e-35, -9.309915258512517e-36, 9.686073450775649e-36, -1.7021158199906723e-35, -8.595802440387977e-38, 1.1143686445795286e-35, -1.0861567801597937e-35, -4.4668785331246925e-36, 1.0579449157400588e-35, 2.8446963289899358e-36, -2.7976765549570443e-36, -4.437491174354135e-37, -5.995021189193666e-36, 1.2695338988880705e-35, 1.1002627123696611e-35, -4.055455510336892e-37, 7.52316384526264e-36, -5.3132344657167395e-36, 6.394689268473244e-36, -6.112570624275895e-36, -3.549992939483308e-36, -1.3929608057244107e-36, -6.817867234769268e-36, 7.570183619295532e-36, 8.287235173297127e-37, 1.2601299440814922e-35, 2.3274788146281293e-36, 2.8446963289899358e-36, -2.5860875718090325e-36, -1.4294011305999016e-35, -8.980776840282277e-36, 3.314894069318851e-36, 8.651638422052036e-36, -4.043700566828669e-36, 1.6174802267314676e-35, 7.429124297196857e-36, 4.796016951354933e-36, 4.1847598889273435e-36, 1.5610564978919978e-35, -1.1461069920517303e-36, 1.1801963282255767e-35, 2.5155579107596953e-36, 8.698658196084928e-36, 6.553381005834253e-37, 5.172175143618065e-36, 2.0124463286077562e-35, -5.054625708535836e-36, 1.8337711872827685e-35, 2.362743645152798e-36, -5.524823448864751e-36, 7.617203393328423e-36, 1.5610564978919978e-35, 4.231779662960235e-36, -5.47780367483186e-36, -8.745677970117819e-36, -6.876641952310382e-37, 8.963144425019942e-38, -1.1801963282255767e-35, -6.112570624275895e-36, -7.493776486492083e-37, -1.0109251417071673e-35, 8.228460455756013e-36, 2.3980084756774665e-36, -2.0718087933242817e-37, -9.827132772874324e-36, 3.1503248602037305e-36, -6.676807912670593e-36, -2.2804590405952378e-36, -3.408933617384634e-36, -2.82118644197349e-36, -1.3224311446750734e-36, 8.557598873986253e-36, 1.1472824864025526e-35, 9.262895484479626e-36, -3.314894069318851e-36, -7.147005652999508e-36, 1.0861567801597937e-35, -1.3353615825341186e-35, -6.770847460736376e-36, -5.995021189193666e-36, 8.839717518183602e-36, -1.0297330513203239e-35, 1.4199971757933233e-35, 7.617203393328423e-36, -1.6362881363446242e-35, -1.7632415262334313e-36, -1.4670169498262148e-35, 3.502973165450417e-36, -6.37705685321091e-37, 1.5140367238591063e-35, 4.349329098042464e-36, -6.018531076210112e-36, -1.3635734469538535e-35, 8.79269774415071e-36, 5.965633830423109e-37, 5.84808439534088e-37, 1.3635734469538535e-35, 7.946341811558664e-36, -1.189600283032155e-35, 6.58276836460481e-36, -6.958926556867942e-36, -7.147005652999508e-36, 1.4294011305999016e-35, 4.161250001910898e-36, -1.9395656788567744e-36, -1.1707923734189984e-35, 1.1990042378387333e-36, 6.629788138637702e-36, 2.915225990039273e-36, -6.25362994637457e-36, 4.725487290305596e-36, -2.080625000955449e-36, 3.667542374565537e-36, 4.231779662960235e-36, 9.121836162380951e-36, -1.3106762011668506e-36, 8.79269774415071e-36, -4.137740114894452e-36, 9.991701981989444e-38, -5.383764126766077e-36, -5.0311158215193905e-36, -1.0062231643038781e-35, -4.678467516272704e-36, 1.598672317118311e-36, -5.736412432012763e-36, -9.07481638834806e-36, 4.325819211026018e-36, 3.9026412447299945e-36, 2.1041348879718946e-36, -1.1472824864025526e-35, 
7.288064975098183e-36, 2.1864194925294548e-36, -1.1801963282255767e-35, 2.82118644197349e-36, -1.6339371476429796e-36, 9.8153778293661e-37, 1.0626468931433479e-35, -9.168855936413843e-36, 3.408933617384634e-36, 2.8917161030228273e-36, 7.052966104933725e-36, -1.6362881363446242e-35, 2.715391950399484e-36, 9.286405371496071e-37, -8.604618648019145e-36, 3.2443644082695135e-36, -6.723827686703485e-36, -7.899322037525772e-36, 4.607937855223367e-36, 1.476420904632793e-35, 6.629788138637702e-36, 5.5953531099140885e-36, -7.567244883418476e-38, 4.53740819417403e-36, -1.1566864412091309e-35, 1.0109251417071673e-35, -5.054625708535836e-36, 5.242704804667402e-36, 2.4685381367268038e-36, -1.8102613002663228e-36, 4.6549576292562585e-36, 8.698658196084928e-36, 2.3744985886610208e-36, -5.47780367483186e-36, -1.7279766957087626e-36, -3.785091809647766e-36, -6.488728816539027e-36, -1.1801963282255767e-35, -9.91823358506305e-38, 2.7506567809241528e-36, 7.335084749131074e-36, 9.262895484479626e-36, -1.0062231643038781e-35, -6.488728816539027e-36, 1.6550960459577808e-35, 4.1847598889273435e-36, -6.318282135669795e-37, -4.5609180811904755e-36, -8.510579099953362e-36, -1.410593220986745e-36, -3.0562853121379475e-36, 5.148665256601619e-36, -4.775445800215543e-38, -5.6893926579798715e-36, -1.410593220986745e-35, 1.3283086164291849e-36, -3.502973165450417e-36, -1.288341808501227e-35, 1.1002627123696611e-35, -4.443368646108247e-36, -1.704466808692317e-36, 1.4576129950196365e-35, -2.3862535321692436e-36, -2.1535056507064307e-35, 4.631447742239813e-36, -2.433273306202135e-36, -3.785091809647766e-36, 9.827132772874324e-36, -1.504632769052528e-36, -6.159590398308787e-36, 9.4039548065783e-36, -2.3321807920314184e-35, 3.9966807927957775e-36, 2.5860875718090325e-36, 1.0109251417071673e-35, 1.3635734469538535e-35, 1.4670169498262148e-35, 1.5610564978919978e-35, -9.545014128676975e-36, -6.206610172341678e-36, -9.309915258512517e-36, -9.07481638834806e-36, 4.173004945419121e-37, 1.7397316392169855e-36, 2.268704097087015e-36, -2.7976765549570443e-36, -1.1872492943305104e-36, -4.984096047486499e-36, -5.383764126766077e-36, 5.853961867094992e-36, -7.288064975098183e-36, -2.4920480237432495e-36, 9.356935032545409e-36, 4.490388420141138e-36, -7.099985878966617e-36, 1.5610564978919978e-35, -9.07481638834806e-36, -4.678467516272704e-36, 1.0203290965137456e-35, -9.262895484479626e-36, 1.598672317118311e-36, -3.691052261581983e-36, -7.993361585591555e-36, 7.476144071229749e-36, 9.592033902709866e-36, 7.382104523163966e-36, 3.92615113174644e-36, -4.3728389850589095e-36, -5.3132344657167395e-36, -8.639883478543813e-37, 1.0990872180188388e-36, -7.147005652999508e-36, -6.065550850243004e-36, 4.725487290305596e-36, -1.9395656788567744e-36, -6.91190678283505e-36, -4.86654661240427e-36, -8.557598873986253e-36, -4.137740114894452e-36, 5.3132344657167395e-36, -3.92615113174644e-36, 1.0520674439859473e-36, 1.0203290965137456e-35, 2.0923799444636718e-36, 1.626884181538046e-35, 9.827132772874324e-36, 6.906029311080939e-37, -9.93292726444833e-37, 2.2922139841034606e-36, 2.7624117244323756e-36, -5.289724578700294e-36, 3.267874295285959e-36, 9.521504241660529e-37, 3.173834747220176e-36, 2.0594661026406477e-35, -5.3132344657167395e-36, 2.644862289350147e-36, -2.6918820633830384e-36, -5.6893926579798715e-36, 6.817867234769268e-36, -2.8446963289899358e-36, -4.86654661240427e-36, 1.263656427133959e-36, -6.817867234769268e-36, 1.0203290965137456e-35, 1.0767528253532154e-35, 1.2107591813469561e-36, 7.229290257557068e-37, -4.584427968206921e-36, 
9.07481638834806e-36, 8.228460455756013e-36, 7.2880649750981825e-37, 4.513898307157584e-36, 7.52316384526264e-37, -2.915225990039273e-36, 1.786751413249877e-36, -3.5970127135162e-36, -6.770847460736376e-36, 1.7115197747972506e-35, -1.9983403963978888e-36, 2.433273306202135e-36, 9.73309322480854e-36, -1.1707923734189984e-35, -6.535748590571919e-36, 9.286405371496071e-37, 4.796016951354933e-36, 7.617203393328423e-36, -2.256949153578792e-36, -5.5013135618483055e-36, -7.052966104933725e-36, -2.0923799444636718e-36, -7.335084749131074e-36, -1.664500000764359e-35, 5.6893926579798715e-36, -9.697828394283872e-37, -1.96307556587322e-36, 2.4215183626939123e-36, 5.571843222897643e-36, -4.8430367253878245e-36, -1.1660903960157092e-35, 5.900981641127883e-36, -3.549992939483308e-36, -2.0100953399061116e-36, -1.5516525430854195e-35, -5.7834322060456545e-36, -1.6339371476429796e-36, -8.933757066249385e-36, 1.570460452698576e-35, -4.772507064338487e-36, 1.4199971757933233e-35, 9.168855936413843e-36, 2.7976765549570443e-36, -3.502973165450417e-36, -1.0285575569695016e-36, -5.7834322060456545e-36, -3.408933617384634e-36, 3.667542374565537e-36, 9.545014128676975e-36, 6.91190678283505e-36, 7.664223167361315e-36, -1.4752454102819708e-36, 1.4458580515114136e-36, 3.849743998942992e-37, -2.527312854267918e-36, 4.86654661240427e-36, 7.758262715427098e-36, -1.6574470346594254e-36, 5.383764126766077e-36, 1.3071497181143837e-35, -1.1660903960157092e-35, -7.617203393328423e-36, -4.984096047486499e-36, 5.054625708535836e-37, -6.876641952310382e-37, -9.356935032545409e-36, -1.275411370642182e-36, 7.85230226349288e-36, 4.2787994369931265e-36, 6.612155723375367e-37, -1.316553672920962e-35, -1.2319180796617573e-35, -3.267874295285959e-36, 2.926980933547496e-36, 1.2695338988880705e-35, 5.289724578700294e-36, -1.3541694921472752e-35, 1.2695338988880705e-35, 1.7491355940235638e-35, 4.319941739271907e-37, -2.456783193218581e-36, -1.0391370061269022e-35, -1.88079096131566e-36, -7.899322037525772e-36, 9.545014128676975e-36, 5.995021189193666e-36, 2.9828169152115546e-37, -1.986585452889666e-36, 3.5970127135162e-36, 9.592033902709866e-36, -9.215875710446734e-36, -5.172175143618065e-36, 4.513898307157584e-36, -9.592033902709866e-36, 2.5126191748826395e-37, 2.997510594596833e-36, 1.2695338988880705e-35, -1.2977457633078054e-35, -8.557598873986253e-36, 1.0638223874941702e-36, 1.4011892661801667e-35, 4.6549576292562585e-36, 9.07481638834806e-36, 4.643202685748036e-37, -5.5953531099140885e-36, -2.0218502834143345e-36, 3.855621470697103e-36, -1.288341808501227e-35, 6.441709042506136e-36, 2.5743326283008096e-36, -1.2166366531010676e-36, 7.993361585591555e-36, -1.4011892661801667e-35, 5.64237288394698e-36, -5.995021189193666e-36, -6.770847460736376e-36, -7.335084749131074e-36, -2.3980084756774665e-36, -3.009265538105056e-36, 9.4039548065783e-36, 9.686073450775649e-36, 1.4870003537901937e-36, 9.521504241660529e-37, -3.197344634236622e-36, 4.9370762734536075e-36, -1.2601299440814922e-35, -1.0626468931433479e-35, -4.231779662960235e-36, -7.899322037525772e-36, 1.2166366531010676e-36, -1.3459410316915192e-36, -7.85230226349288e-36, 9.545014128676975e-36, 7.1940254270324e-36, 1.0062231643038781e-35, -5.948001415160775e-36, -4.349329098042464e-36, 8.745677970117819e-36, -2.057115113939003e-36, 1.344765537340697e-35, 1.504632769052528e-36, 1.369450918707965e-36, 3.855621470697103e-36, 3.197344634236622e-36, 8.287235173297127e-37, 7.640713280344869e-37, 7.005946330900834e-36, -8.040381359624447e-36, 3.197344634236622e-36, -5.5953531099140885e-36, 
2.7859216114488214e-36, 5.47780367483186e-36, -4.4668785331246925e-37, 4.5609180811904755e-36, 3.103305086170839e-36, 2.256949153578792e-36, -5.64237288394698e-36, 9.73309322480854e-36, 6.629788138637702e-36, -1.814963277669612e-35, 4.5609180811904755e-36, -6.465218929522581e-38, 5.524823448864751e-36, 3.549992939483308e-36, -5.995021189193666e-36, 1.2166366531010676e-36, 5.383764126766077e-36, 2.527312854267918e-37, -2.915225990039273e-36, 1.504632769052528e-35, 1.6692019781676483e-36, -2.915225990039273e-36, -1.222514124855179e-35, -5.64237288394698e-36, 7.993361585591555e-36, 2.3039689276116835e-36, 1.96307556587322e-36, 2.456783193218581e-36, 2.926980933547496e-36, 7.52316384526264e-36, 2.915225990039273e-36, -6.488728816539027e-36, -6.535748590571919e-36, 5.571843222897643e-36, 1.2131101700486007e-35, 2.4685381367268038e-36, 3.385423730368188e-36, -1.5516525430854195e-35, -8.79269774415071e-36, -3.573502826499754e-36, -1.3729774017604318e-35, 9.309915258512517e-36, 1.4952288142459497e-35, 6.318282135669795e-38, -1.2131101700486007e-35, -1.0720508479499262e-35, 8.369519777854687e-36, 1.3635734469538535e-35, -1.0861567801597937e-35, 1.504632769052528e-35, -5.64237288394698e-36, 7.147005652999508e-36, 3.92615113174644e-36, -1.586917373610088e-36, 1.250725989274914e-35, 5.172175143618065e-36, -3.032775425121502e-36, -3.8086016966642115e-36, -3.032775425121502e-36, 5.430783900798968e-36, 9.168855936413843e-36, 2.5155579107596953e-36, 4.2787994369931265e-36, -1.4011892661801667e-35, 1.598672317118311e-36, -4.5609180811904755e-36, -1.0720508479499262e-35, -2.6918820633830384e-36, -5.195685030634511e-36, 1.7021158199906723e-35, 1.189600283032155e-35, 1.128474576789396e-35, -6.676807912670593e-36, 1.288341808501227e-35, -6.994191387392611e-37, 1.0062231643038781e-35, 8.181440681723121e-36, -6.723827686703485e-36, -2.82118644197349e-36, -6.91190678283505e-36, -1.4294011305999016e-35, -1.814963277669612e-35, 3.6440324875490913e-37, 3.126814973187285e-36, 9.8153778293661e-37, 2.3069076634887392e-37, 9.497994354644083e-36, -5.830451980078546e-36, -5.363192975626687e-38, -1.1660903960157092e-35, -6.018531076210112e-36, -7.617203393328423e-36, 2.5743326283008096e-36, 9.592033902709866e-36, 2.4685381367268038e-36, -9.262895484479626e-36, -9.262895484479626e-36, -1.4482090402130582e-35, 8.980776840282277e-36, -3.785091809647766e-36, 1.3259576277275403e-35, -1.4752454102819708e-36, 3.173834747220176e-36, -2.1276447749883404e-36, 9.874152546907215e-36, 3.314894069318851e-36, -6.347669494440353e-36, 5.853961867094992e-36, -6.535748590571919e-36, -8.485599844998388e-38, 7.241045201065291e-36, -5.830451980078546e-36, -1.1990042378387333e-35, -1.3071497181143837e-35, -9.309915258512517e-36, 9.521504241660529e-37, 1.7632415262334313e-36, -5.6893926579798715e-36, 3.079795199154393e-36, -8.369519777854687e-36, 1.0485409609334805e-35, 4.53740819417403e-36, 1.0062231643038781e-35, 6.864887008802159e-36, 7.335084749131074e-36, 8.181440681723121e-36, 6.58276836460481e-36, 3.4530146555404696e-37, -1.4294011305999016e-35, 1.1990042378387333e-35, 1.0626468931433479e-35, -7.147005652999508e-36, -6.018531076210112e-36, 1.0485409609334805e-35, -1.3812058622161878e-36, 3.4324435044010795e-36, -5.383764126766077e-36, -4.302309324009572e-36, -6.065550850243004e-36, 6.723827686703485e-36, 9.4039548065783e-36, -4.2787994369931265e-36, 1.2977457633078054e-35, -5.195685030634511e-36, 5.172175143618065e-37, 5.0311158215193905e-36, -4.678467516272704e-36, -9.827132772874324e-36, 1.6080762719248893e-35, -9.07481638834806e-36, 
2.5155579107596953e-36, 1.7491355940235638e-35, 1.095560734966372e-35, -6.817867234769268e-36, -1.3988382774785221e-36, 7.899322037525772e-36, -4.2787994369931265e-36, 5.6893926579798715e-36, 6.91190678283505e-36, -4.0907203408615605e-36, 1.1002627123696611e-35, 1.0720508479499262e-35, -1.0932097462647274e-36, 1.0720508479499262e-35, 2.256949153578792e-36, 6.58276836460481e-36, -1.2131101700486007e-35, 6.112570624275895e-36, 3.6205226005326455e-36, 1.1707923734189984e-35, 6.159590398308787e-36, 7.147005652999508e-36, -4.984096047486499e-36, 8.698658196084928e-36, -7.493776486492083e-37, 5.360254239749631e-36, 1.6739039555709374e-35, 4.9370762734536075e-36, 6.553381005834253e-37, 9.168855936413843e-37, -8.040381359624447e-36, -2.5743326283008096e-36, 6.347669494440353e-36, -2.3862535321692436e-36, -3.879131357713549e-36, 8.416539551887579e-36, 3.408933617384634e-36, -1.1472824864025526e-35, -4.678467516272704e-36, -1.0297330513203239e-35, 2.938735877055719e-38, -5.172175143618065e-36, -1.5140367238591063e-35, -7.099985878966617e-36, -9.991701981989444e-37, -6.864887008802159e-36, 1.3738590225235485e-37, -9.592033902709866e-36, 6.394689268473244e-36, -1.4670169498262148e-35, 4.913566386437162e-36, -9.686073450775649e-36, -1.1225971050352846e-36, 1.8220162437745456e-36, 9.07481638834806e-36, -1.5516525430854195e-35, -5.995021189193666e-36, -3.291384182302405e-36, 4.796016951354933e-36, 1.4576129950196365e-36, 1.8055593228630336e-35, -1.8337711872827685e-35, 1.2789378536946488e-35, 9.592033902709866e-36, 6.347669494440353e-36, -7.335084749131074e-36, -7.4350017689509685e-37, 9.309915258512517e-36, 2.5978425153172554e-36, 1.7115197747972506e-35, 6.300649720407461e-36, 3.361913843351742e-36, 6.629788138637702e-36, -2.068870057447226e-36, -4.86654661240427e-36, 4.731364762059707e-37, 1.1049646897729503e-35, -8.698658196084928e-36, -4.86654661240427e-36, -6.58276836460481e-36, 9.051306501331614e-37, -1.0861567801597937e-35, -1.0062231643038781e-35, 7.241045201065291e-36, -1.2319180796617573e-35, 3.92615113174644e-36, 4.3728389850589095e-36, -1.4576129950196365e-36, -1.4576129950196365e-35, 1.6530389308438418e-38, 2.5860875718090325e-36, 2.3744985886610208e-36, -1.0109251417071673e-35, 7.85230226349288e-36, -1.4576129950196365e-36, -7.85230226349288e-36, 1.2131101700486007e-35, 9.874152546907215e-36, 3.032775425121502e-36, -1.986585452889666e-36, 1.2460240118716248e-36, 1.6574470346594254e-36, -1.4670169498262148e-35, 3.314894069318851e-36, -3.4824020143110267e-37, -1.222514124855179e-35, -1.9043008483321058e-36, -3.314894069318851e-36, -9.73309322480854e-36, 1.369450918707965e-36, 4.86654661240427e-36, -2.0453601704307803e-36, -1.9043008483321058e-36, 1.2977457633078054e-35, 8.275480229788904e-36, 7.052966104933725e-36, 9.345180089037186e-37, 1.2131101700486007e-35, 6.906029311080939e-37, 1.6456920911512025e-35, -1.3259576277275403e-35, -1.4670169498262148e-35, -1.7279766957087626e-36, 2.350988701644575e-36, 2.4685381367268038e-36, 5.6893926579798715e-36, 3.8086016966642115e-36, 7.099985878966617e-36, 2.4685381367268038e-36, 5.853961867094992e-36, 4.8430367253878245e-36, -7.241045201065291e-36, 4.678467516272704e-36, 1.3635734469538535e-35, 5.360254239749631e-36, 1.692711865184094e-35, 7.85230226349288e-36, -2.068870057447226e-36, -1.1167196332811731e-36, 4.396348872075355e-36, -6.723827686703485e-36, -4.255289549976681e-36, 3.92615113174644e-36, 7.099985878966617e-36, 6.206610172341678e-36, -2.068870057447226e-35, -4.419858759091801e-36, -2.6889433275059827e-37, 1.2695338988880705e-35, 
-2.527312854267918e-36, 9.874152546907215e-36, 2.204051907791789e-37, 8.933757066249385e-36, -7.147005652999508e-36, -1.6456920911512025e-36, 1.1225971050352846e-36, -7.4350017689509685e-37, -3.502973165450417e-36, -3.667542374565537e-36, -4.7489971773220415e-36, -7.335084749131074e-36, -1.0579449157400588e-35, 7.099985878966617e-36, -1.786751413249877e-35, -4.913566386437162e-36, -8.228460455756013e-36, -1.2166366531010676e-36, -1.3259576277275403e-35, 3.785091809647766e-36, 4.419858759091801e-36, 1.0861567801597937e-35, 9.874152546907215e-36, 1.1990042378387333e-36, -1.1378785315959743e-35, -1.5798644075051544e-35, -1.774996469741654e-36, 6.629788138637702e-36, 3.173834747220176e-36, -1.410593220986745e-36, 3.4530146555404696e-37, -1.2695338988880705e-35, -3.738072035614874e-36, 8.651638422052036e-36, 8.087401133657338e-36, -1.0861567801597937e-35, 2.1535056507064307e-35, -1.986585452889666e-36, 3.197344634236622e-36, 2.1599708696359533e-37, -3.2473031441465692e-37, 3.173834747220176e-36, 1.96307556587322e-36, -4.3728389850589095e-36, -3.291384182302405e-36, 7.617203393328423e-36, 2.0594661026406477e-35, -1.1990042378387333e-35, -5.242704804667402e-36, 1.8337711872827685e-36, -8.087401133657338e-36, -2.256949153578792e-36, -3.92615113174644e-36, 4.161250001910898e-36, 6.112570624275895e-36, -9.309915258512517e-36, -8.181440681723121e-36, 7.617203393328423e-36, -1.3259576277275403e-35, 3.408933617384634e-36, 9.262895484479626e-36, -3.408933617384634e-36, 8.322500003821796e-36, -3.573502826499754e-36, -4.984096047486499e-36, -3.614645128778534e-37, 1.0626468931433479e-35, 1.1566864412091309e-35, -7.570183619295532e-36, 1.275411370642182e-36, 3.6205226005326455e-36, 1.476420904632793e-35, 5.5013135618483055e-36, 2.456783193218581e-36, 9.968192094972998e-36, 8.839717518183602e-36, -9.686073450775649e-36, 3.738072035614874e-36, 3.3384039563352965e-36, -1.8619830517025034e-35, -3.691052261581983e-36, 6.629788138637702e-36, 1.8220162437745456e-36, 1.2789378536946488e-35, -4.2787994369931265e-36, 3.973170905779332e-36, 1.0109251417071673e-35, 3.7145621485984285e-36, 6.958926556867942e-36, -1.692711865184094e-35, -7.85230226349288e-36, -8.228460455756013e-36, 1.7985063567581e-36, 9.4039548065783e-36, -1.0579449157400588e-36, -3.879131357713549e-36, -2.433273306202135e-36, -8.698658196084928e-36, 1.1049646897729503e-35, 1.516387712560751e-36, -6.629788138637702e-36, -6.629788138637702e-36, -4.913566386437162e-36, 8.839717518183602e-36, 1.2413220344683356e-35, -3.197344634236622e-36, -5.524823448864751e-36, -5.830451980078546e-36, 2.0923799444636718e-36, -4.9370762734536075e-36, -7.993361585591555e-36, -5.948001415160775e-36]) - if inputs_embeds is not None: hidden_states = inputs_embeds else: hidden_states = self.get_input_embeddings(input_ids) + print("embeddings: {hidden_states}") residual = None for i in range(len(self.layers)): layer = self.layers[i] From 0848b256ff3023c1dac238c95489ff938265f744 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Wed, 24 Apr 2024 02:22:16 +0000 Subject: [PATCH 08/90] to try with torch.scaled_mm --- run_fp8.py | 5 ++-- .../layers/quantization/fp8_static.py | 28 ++----------------- vllm/model_executor/models/llama.py | 13 +++++++-- 3 files changed, 15 insertions(+), 31 deletions(-) diff --git a/run_fp8.py b/run_fp8.py index 4b3833be8337e..1d7ed0e2e575b 100644 --- a/run_fp8.py +++ b/run_fp8.py @@ -1,5 +1,6 @@ -from vllm import LLM +from vllm import LLM, SamplingParams model = LLM("FriendliAI/Mistral-7B-Instruct-v0.2-fp8", quantization="fp8_static", 
enforce_eager=True, max_model_len=1024) # model = LLM("mistralai/Mistral-7B-Instruct-v0.2", enforce_eager=True, max_model_len=1024) -print(model.generate("Hello my name is")) \ No newline at end of file +sampling_params = SamplingParams(max_tokens=2) +print(model.generate("Hello my name is"), sampling_params) \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/fp8_static.py b/vllm/model_executor/layers/quantization/fp8_static.py index 1def8c0f9fcd3..28470d335bd50 100644 --- a/vllm/model_executor/layers/quantization/fp8_static.py +++ b/vllm/model_executor/layers/quantization/fp8_static.py @@ -149,32 +149,12 @@ def apply_weights( x_dq = self._fake_quantize_static(x, in_scale) # print(f"x_dq[0,0]: {x_dq[0,0]} // weight_dq[0,0]: {weight_dq[0,0]}") - output[:, start_offset:end_offset] = torch.nn.functional.linear(x_dq, weight_dq) + # output[:, start_offset:end_offset] = torch.nn.functional.linear(x_dq, weight_dq) + output[:, start_offset:end_offset] = torch.nn.functional.linear(x, weight_dq) start_offset = end_offset assert end_offset == output.shape[1] - # print(output) - # print(output.dtype) return output - - def _quantize_dynamic(self, x: torch.Tensor): - finfo = torch.finfo(torch.float8_e4m3fn) - min_val, max_val = x.aminmax() - amax = min_val.abs().max(max_val.abs()) - scale = finfo.max / amax.clamp(min=1e-12) - - # print(finfo.max) - # print(amax) - # print(finfo.max / amax.clamp(min=1e-12)) - # assert False - # scale and clamp the tensor to bring it to - # the representative range of float8 data type - # (as default cast is unsaturated) - qweight = (x * scale).clamp(min=finfo.min, max=finfo.max) - # Return both float8 data and the inverse scale (as float), - # as both required as inputs to torch._scaled_mm - # print(scale) - return qweight, scale.float().reciprocal() def _quantize(self, x: torch.Tensor, inv_scale: torch.tensor): finfo = torch.finfo(torch.float8_e4m3fn) @@ -185,12 +165,8 @@ def _dequantize(self, xq: torch.Tensor, inv_scale: torch.tensor, dtype: torch.dt def _fake_quantize_static(self, x: torch.Tensor, inv_scale: torch.Tensor): xq = self._quantize(x, inv_scale) - # xq, inv_scale = self._dynamic_quantize(x) - # print(inv_scale) xdq = self._dequantize(xq, inv_scale, x.dtype) - # print(f"----- inv_scale: {inv_scale} // x[0,0]: {x[0,0]} // xq[0,0]: {xq[0,0]} // xdq[0,0]: {xdq[0,0]}") - return xdq diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 8b8679b9bc1d5..cd4a310ecbcf5 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -281,7 +281,12 @@ def forward( else: hidden_states = self.get_input_embeddings(input_ids) - print("embeddings: {hidden_states}") + is_warmup = kv_caches[0] is None + + if not is_warmup: + # print(f"embeddings: {hidden_states[0]}") + pass + residual = None for i in range(len(self.layers)): layer = self.layers[i] @@ -292,9 +297,11 @@ def forward( attn_metadata, residual, ) - print(f"idx: {i}: {hidden_states}") + if not is_warmup: + pass + print(f"idx: {i}: {hidden_states}") hidden_states, _ = self.norm(hidden_states, residual) - print(hidden_states) + # print("------------") return hidden_states From 15882eaded0ba7101a87b17e7d1bd833254a90e5 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Wed, 24 Apr 2024 12:06:04 +0000 Subject: [PATCH 09/90] stash --- run_fp8.py | 4 +- .../layers/quantization/fp8_static.py | 117 +++++++++++++----- vllm/model_executor/models/llama.py | 11 +- 3 files changed, 95 insertions(+), 37 deletions(-) diff --git 
a/run_fp8.py b/run_fp8.py index 1d7ed0e2e575b..4ffcbf3c0985d 100644 --- a/run_fp8.py +++ b/run_fp8.py @@ -1,6 +1,6 @@ from vllm import LLM, SamplingParams model = LLM("FriendliAI/Mistral-7B-Instruct-v0.2-fp8", quantization="fp8_static", enforce_eager=True, max_model_len=1024) -# model = LLM("mistralai/Mistral-7B-Instruct-v0.2", enforce_eager=True, max_model_len=1024) +# model = LLM("mistralai/Mistral-7B-Instruct-v0.2", enforce_eager=True, max_model_len=1024, quantization="fp8") sampling_params = SamplingParams(max_tokens=2) -print(model.generate("Hello my name is"), sampling_params) \ No newline at end of file +print(model.generate("What is your name"), sampling_params) \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/fp8_static.py b/vllm/model_executor/layers/quantization/fp8_static.py index 28470d335bd50..e84997a07f45b 100644 --- a/vllm/model_executor/layers/quantization/fp8_static.py +++ b/vllm/model_executor/layers/quantization/fp8_static.py @@ -22,7 +22,7 @@ def get_supported_act_dtypes(cls) -> List[torch.dtype]: @classmethod def get_min_capability(cls) -> int: - return 89 + return 90 @classmethod def get_config_filenames(cls) -> List[str]: @@ -130,49 +130,99 @@ def scales_shard_indexer( # print(f"----- loaded_weight: {loaded_weight}") return param[self.shard_id_as_int(shard_id)], loaded_weight + # def apply_weights( + # self, + # layer: torch.nn.Module, + # x: torch.Tensor, + # bias: Optional[torch.Tensor] = None + # ) -> torch.Tensor: + # logical_widths = layer.logical_widths + # q_weight = layer.weight + # w_scales = layer.weight_scale + # in_scales = layer.in_scale + + # output = torch.zeros(x.shape[0], q_weight.shape[0], dtype=x.dtype, device="cuda") + # start_offset = 0 + # for _, (logical_width, w_scale, in_scale) in enumerate(zip(logical_widths, w_scales, in_scales)): + # end_offset = start_offset + logical_width + # weight_dq = self._dequantize(q_weight[start_offset:end_offset, :], w_scale, x.dtype) + # x_dq = self._fake_quantize_static(x, in_scale) + + # # print(f"x_dq[0,0]: {x_dq[0,0]} // weight_dq[0,0]: {weight_dq[0,0]}") + # # output[:, start_offset:end_offset] = torch.nn.functional.linear(x_dq, weight_dq) + # output[:, start_offset:end_offset] = torch.nn.functional.linear(x, weight_dq) + # start_offset = end_offset + + # assert end_offset == output.shape[1] + # return output + def apply_weights( self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None - ) -> torch.Tensor: - logical_widths = layer.logical_widths - q_weight = layer.weight - w_scales = layer.weight_scale - in_scales = layer.in_scale - - output = torch.zeros(x.shape[0], q_weight.shape[0], dtype=x.dtype, device="cuda") + layer, + x, + bias=None + ): + # print(sum(x)) + # assert False + # qinput, x_scale = per_tensor_quantize(x) + # print(qinput) + # assert False + output = torch.zeros(x.shape[0], layer.weight.shape[0], dtype=x.dtype, device="cuda") start_offset = 0 - for _, (logical_width, w_scale, in_scale) in enumerate(zip(logical_widths, w_scales, in_scales)): + print("\n----") + + for _, (logical_width, w_scale, in_scale) in enumerate(zip(layer.logical_widths, layer.weight_scale, layer.in_scale)): end_offset = start_offset + logical_width - weight_dq = self._dequantize(q_weight[start_offset:end_offset, :], w_scale, x.dtype) - x_dq = self._fake_quantize_static(x, in_scale) - - # print(f"x_dq[0,0]: {x_dq[0,0]} // weight_dq[0,0]: {weight_dq[0,0]}") - # output[:, start_offset:end_offset] = torch.nn.functional.linear(x_dq, weight_dq) - output[:, 
start_offset:end_offset] = torch.nn.functional.linear(x, weight_dq) + print(f"(start,end) = ({start_offset}, {end_offset})") + + q_weight = layer.weight[start_offset:end_offset, :].t() + q_input = self._quantize(x, inv_scale=in_scale) + x_scale = in_scale + # print(f"in_scale: {in_scale}") + # print(f"w_scale: {w_scale}") + # print(f"input: {x}") + # print(f"q_input: {q_input}") + # print(f"q_weight: {q_weight}") + # q_input, x_scale = per_tensor_quantize(x) + + assert not torch.isnan(q_input[0,0]) + + out, _ = torch._scaled_mm( + q_input, + q_weight, + out_dtype=x.dtype, + scale_a=x_scale.float(), + scale_b=w_scale.float(), + bias=bias, + ) + print(f"out.norm(): {out.norm()}") + output[:, start_offset:end_offset] = out start_offset = end_offset assert end_offset == output.shape[1] + # print(output.sum(dim=0).shape) + # print(output.sum(dim=1).shape) + + # print(output.norm(), output.norm(dim=0), output.norm(dim=1)) return output - - def _quantize(self, x: torch.Tensor, inv_scale: torch.tensor): + + def _quantize(self, tensor: torch.Tensor, inv_scale: torch.tensor): finfo = torch.finfo(torch.float8_e4m3fn) - return (x / inv_scale).clamp(min=finfo.min, max=finfo.max) + qtensor = (tensor / inv_scale).clamp(min=finfo.min, max=finfo.max) + return qtensor.to(torch.float8_e4m3fn) - def _dequantize(self, xq: torch.Tensor, inv_scale: torch.tensor, dtype: torch.dtype): - return (xq.to(dtype) * inv_scale) + # def _dequantize(self, xq: torch.Tensor, inv_scale: torch.tensor, dtype: torch.dtype): + # return (xq.to(dtype) * inv_scale) - def _fake_quantize_static(self, x: torch.Tensor, inv_scale: torch.Tensor): - xq = self._quantize(x, inv_scale) - xdq = self._dequantize(xq, inv_scale, x.dtype) - # print(f"----- inv_scale: {inv_scale} // x[0,0]: {x[0,0]} // xq[0,0]: {xq[0,0]} // xdq[0,0]: {xdq[0,0]}") - return xdq + # def _fake_quantize_static(self, x: torch.Tensor, inv_scale: torch.Tensor): + # xq = self._quantize(x, inv_scale) + # xdq = self._dequantize(xq, inv_scale, x.dtype) + # # print(f"----- inv_scale: {inv_scale} // x[0,0]: {x[0,0]} // xq[0,0]: {xq[0,0]} // xdq[0,0]: {xdq[0,0]}") + # return xdq -def per_tensor_quantize(tensor: torch.Tensor) -> Tuple[torch.Tensor, float]: +def per_tensor_quantize(tensor: torch.Tensor) -> tuple[torch.Tensor, float]: """Quantize a tensor using per-tensor static scaling factor. - Args: tensor: The input tensor. """ @@ -181,14 +231,19 @@ def per_tensor_quantize(tensor: torch.Tensor) -> Tuple[torch.Tensor, float]: # Since .abs() creates a new tensor, we use aminmax to get # the min and max first and then calculate the absmax. 
min_val, max_val = tensor.aminmax() + print(min_val) + print(max_val) amax = min_val.abs().max(max_val.abs()) + print(amax) scale = finfo.max / amax.clamp(min=1e-12) + print(scale) # scale and clamp the tensor to bring it to # the representative range of float8 data type # (as default cast is unsaturated) qweight = (tensor * scale).clamp(min=finfo.min, max=finfo.max) + print(qweight) # Return both float8 data and the inverse scale (as float), # as both required as inputs to torch._scaled_mm qweight = qweight.to(torch.float8_e4m3fn) scale = scale.float().reciprocal() - return qweight, scale + return qweight, scale \ No newline at end of file diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index cd4a310ecbcf5..c9c2097ce6267 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -283,8 +283,9 @@ def forward( is_warmup = kv_caches[0] is None - if not is_warmup: - # print(f"embeddings: {hidden_states[0]}") + if not is_warmup: + print("NOT IN WARMUP") + print(f"embeddings: {hidden_states[0]}") pass residual = None @@ -298,10 +299,12 @@ def forward( residual, ) if not is_warmup: + assert False + # print(f"idx: {i}: {hidden_states}") pass - print(f"idx: {i}: {hidden_states}") hidden_states, _ = self.norm(hidden_states, residual) - # print("------------") + if not is_warmup: + assert False return hidden_states From 7e6b675cfdc5272735d369e2c29155441a59e31e Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Wed, 24 Apr 2024 12:58:45 +0000 Subject: [PATCH 10/90] added way to do weight quantization --- quantize.ipynb | 250 ++++++++++++++++++ run_fp8.py | 2 +- .../model_executor/layers/quantization/fp8.py | 5 + .../layers/quantization/fp8_static.py | 143 +++++----- 4 files changed, 340 insertions(+), 60 deletions(-) create mode 100644 quantize.ipynb diff --git a/quantize.ipynb b/quantize.ipynb new file mode 100644 index 0000000000000..17efdc17fd39b --- /dev/null +++ b/quantize.ipynb @@ -0,0 +1,250 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/paperspace/env/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00, 1.94it/s]\n" + ] + } + ], + "source": [ + "import torch\n", + "from transformers import AutoModelForCausalLM\n", + "\n", + "model = AutoModelForCausalLM.from_pretrained(\n", + " \"mistralai/Mistral-7B-Instruct-v0.2\",\n", + " torch_dtype=torch.bfloat16,\n", + ")\n", + "model = model.to(\"cuda\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Tuple\n", + "\n", + "def per_tensor_quantize(tensor: torch.Tensor) -> Tuple[torch.Tensor, float]:\n", + " \"\"\"Quantize a tensor using per-tensor static scaling factor.\n", + "\n", + " Args:\n", + " tensor: The input tensor.\n", + " \"\"\"\n", + " finfo = torch.finfo(torch.float8_e4m3fn)\n", + " # Calculate the scale as dtype max divided by absmax.\n", + " # Since .abs() creates a new tensor, we use aminmax to get\n", + " # the min and max first and then calculate the absmax.\n", + " min_val, max_val = tensor.aminmax()\n", + " amax = min_val.abs().max(max_val.abs())\n", + " scale = finfo.max / amax.clamp(min=1e-12)\n", + " # scale and clamp the tensor to bring it to\n", + " # the representative range of float8 data type\n", + " # (as default cast is unsaturated)\n", + " qweight = (tensor * scale).clamp(min=finfo.min, max=finfo.max)\n", + " # Return both float8 data and the inverse scale (as float),\n", + " # as both required as inputs to torch._scaled_mm\n", + " qweight = qweight.to(torch.float8_e4m3fn)\n", + " scale = scale.float().reciprocal()\n", + " return qweight, scale" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "class LinearFP8(torch.nn.Module):\n", + " def __init__(self, qweight, scale):\n", + " super().__init__()\n", + " self.weight = torch.nn.Parameter(qweight, requires_grad=False)\n", + " self.weight_scale = torch.nn.Parameter(scale, requires_grad=False)\n", + " \n", + " def forward(self, x):\n", + " shape = x.shape\n", + " x = x.reshape(-1, shape[-1])\n", + " qinput, x_scale = per_tensor_quantize(x)\n", + " \n", + " output, _ = torch._scaled_mm(\n", + " qinput,\n", + " self.weight.t(),\n", + " out_dtype=x.dtype,\n", + " scale_a=x_scale,\n", + " scale_b=self.weight_scale,\n", + " bias=None,\n", + " )\n", + " return output.reshape(shape[0], shape[1], -1)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "SELF_ATTN_WEIGHTS = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\"]\n", + "MLP_WEIGHTS = [\"gate_proj\", \"up_proj\", \"down_proj\"]\n", + "\n", + "def quantize_proj(module, proj_name):\n", + " proj = getattr(module, proj_name)\n", + " quant_weight, quant_scale = per_tensor_quantize(proj.weight)\n", + " quant_proj = LinearFP8(quant_weight, quant_scale)\n", + " \n", + " del proj\n", + " setattr(module, proj_name, quant_proj)\n", + "\n", + "for layer in model.model.layers:\n", + " for proj_name in SELF_ATTN_WEIGHTS:\n", + " quantize_proj(layer.self_attn, proj_name)\n", + " for proj_name in MLP_WEIGHTS:\n", + " quantize_proj(layer.mlp, proj_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MistralForCausalLM(\n", + " (model): MistralModel(\n", + " (embed_tokens): Embedding(32000, 4096)\n", + " (layers): ModuleList(\n", + " (0-31): 32 x MistralDecoderLayer(\n", 
+ " (self_attn): MistralSdpaAttention(\n", + " (q_proj): LinearFP8()\n", + " (k_proj): LinearFP8()\n", + " (v_proj): LinearFP8()\n", + " (o_proj): LinearFP8()\n", + " (rotary_emb): MistralRotaryEmbedding()\n", + " )\n", + " (mlp): MistralMLP(\n", + " (gate_proj): LinearFP8()\n", + " (up_proj): LinearFP8()\n", + " (down_proj): LinearFP8()\n", + " (act_fn): SiLU()\n", + " )\n", + " (input_layernorm): MistralRMSNorm()\n", + " (post_attention_layernorm): MistralRMSNorm()\n", + " )\n", + " )\n", + " (norm): MistralRMSNorm()\n", + " )\n", + " (lm_head): Linear(in_features=4096, out_features=32000, bias=False)\n", + ")" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoTokenizer\n", + "tokenizer = AutoTokenizer.from_pretrained(\n", + " \"mistralai/Mistral-7B-Instruct-v0.2\"\n", + ")\n", + "tokenizer.pad_token_id = tokenizer.eos_token_id" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "input_ids = tokenizer.apply_chat_template(\n", + " [{\"role\": \"user\", \"content\": \"What is your name?\" }],\n", + " return_tensors=\"pt\"\n", + ").to(\"cuda\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", + "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " [INST] What is your name? [/INST] I don't have a name. 
I'm just a computer program designed to assist with information\n" + ] + } + ], + "source": [ + "output = model.generate(input_ids=input_ids, max_new_tokens=20)\n", + "print(tokenizer.decode(output[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# hacked transformers/modeling_utils/dtype_byte_size to make this work\n", + "model.save_pretrained(\"mistral-fp8-static\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mod" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/run_fp8.py b/run_fp8.py index 4ffcbf3c0985d..afcb0b5e58415 100644 --- a/run_fp8.py +++ b/run_fp8.py @@ -1,6 +1,6 @@ from vllm import LLM, SamplingParams -model = LLM("FriendliAI/Mistral-7B-Instruct-v0.2-fp8", quantization="fp8_static", enforce_eager=True, max_model_len=1024) +model = LLM("nm-testing/mistral-fp8-test", tokenizer="mistralai/Mistral-7B-Instruct-v0.2", quantization="fp8_static", enforce_eager=True, max_model_len=1024) # model = LLM("mistralai/Mistral-7B-Instruct-v0.2", enforce_eager=True, max_model_len=1024, quantization="fp8") sampling_params = SamplingParams(max_tokens=2) print(model.generate("What is your name"), sampling_params) \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 01e494c870e71..6ec3802c6318a 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -104,6 +104,10 @@ def apply_weights(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: qinput, x_scale = per_tensor_quantize(x) + print(f"w_scale: {layer.weight_scaling_factor.item()}") + print(f"x_scale: {x_scale.item()}") + print(f"q_input: {qinput}") + print(f"weight: {layer.weight}") output, _ = torch._scaled_mm( qinput, layer.weight, @@ -112,6 +116,7 @@ def apply_weights(self, scale_b=layer.weight_scaling_factor, bias=bias, ) + return output diff --git a/vllm/model_executor/layers/quantization/fp8_static.py b/vllm/model_executor/layers/quantization/fp8_static.py index e84997a07f45b..9a7c374fde1e4 100644 --- a/vllm/model_executor/layers/quantization/fp8_static.py +++ b/vllm/model_executor/layers/quantization/fp8_static.py @@ -81,17 +81,17 @@ def create_weights( "shard_indexer": self.scales_shard_indexer, }) - in_scale = Parameter( - torch.empty( - len(output_partition_sizes), - device='cuda', dtype=torch.float32, - ), requires_grad=False - ) - layer.register_parameter("in_scale", in_scale) - set_weight_attrs(in_scale, extra_weight_attrs) - set_weight_attrs(in_scale, { - "shard_indexer": self.scales_shard_indexer, - }) + # in_scale = Parameter( + # torch.empty( + # len(output_partition_sizes), + # device='cuda', dtype=torch.float32, + # ), requires_grad=False + # ) + # layer.register_parameter("in_scale", in_scale) + # set_weight_attrs(in_scale, extra_weight_attrs) + # set_weight_attrs(in_scale, { + # "shard_indexer": self.scales_shard_indexer, + # }) layer.logical_widths = output_partition_sizes @@ -130,6 +130,36 @@ def scales_shard_indexer( # print(f"----- 
loaded_weight: {loaded_weight}") return param[self.shard_id_as_int(shard_id)], loaded_weight + def apply_weights(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + logical_widths = layer.logical_widths + q_weight = layer.weight + w_scales = layer.weight_scale + + qinput, x_scale = per_tensor_quantize(x) + + output = torch.zeros(x.shape[0], q_weight.shape[0], dtype=x.dtype, device="cuda") + start_offset = 0 + for _, (logical_width, w_scale) in enumerate(zip(logical_widths, w_scales)): + end_offset = start_offset + logical_width + q_weight = layer.weight[start_offset:end_offset, :].t() + + out, _ = torch._scaled_mm( + qinput, + q_weight, + out_dtype=x.dtype, + scale_a=x_scale, + scale_b=w_scale, + bias=bias, + ) + output[:, start_offset:end_offset] = out + start_offset = end_offset + + assert end_offset == output.shape[1] + return output + # def apply_weights( # self, # layer: torch.nn.Module, @@ -156,55 +186,55 @@ def scales_shard_indexer( # assert end_offset == output.shape[1] # return output - def apply_weights( - self, - layer, - x, - bias=None - ): - # print(sum(x)) - # assert False - # qinput, x_scale = per_tensor_quantize(x) - # print(qinput) - # assert False - output = torch.zeros(x.shape[0], layer.weight.shape[0], dtype=x.dtype, device="cuda") - start_offset = 0 - print("\n----") + # def apply_weights( + # self, + # layer, + # x, + # bias=None + # ): + # # print(sum(x)) + # # assert False + # # qinput, x_scale = per_tensor_quantize(x) + # # print(qinput) + # # assert False + # output = torch.zeros(x.shape[0], layer.weight.shape[0], dtype=x.dtype, device="cuda") + # start_offset = 0 + # print("\n----") - for _, (logical_width, w_scale, in_scale) in enumerate(zip(layer.logical_widths, layer.weight_scale, layer.in_scale)): - end_offset = start_offset + logical_width - print(f"(start,end) = ({start_offset}, {end_offset})") + # for _, (logical_width, w_scale, in_scale) in enumerate(zip(layer.logical_widths, layer.weight_scale, layer.in_scale)): + # end_offset = start_offset + logical_width + # print(f"(start,end) = ({start_offset}, {end_offset})") - q_weight = layer.weight[start_offset:end_offset, :].t() - q_input = self._quantize(x, inv_scale=in_scale) - x_scale = in_scale - # print(f"in_scale: {in_scale}") - # print(f"w_scale: {w_scale}") - # print(f"input: {x}") - # print(f"q_input: {q_input}") - # print(f"q_weight: {q_weight}") - # q_input, x_scale = per_tensor_quantize(x) + # q_weight = layer.weight[start_offset:end_offset, :].t() + # q_input = self._quantize(x, inv_scale=in_scale) + # x_scale = in_scale + # # print(f"in_scale: {in_scale}") + # # print(f"w_scale: {w_scale}") + # # print(f"input: {x}") + # # print(f"q_input: {q_input}") + # # print(f"q_weight: {q_weight}") + # # q_input, x_scale = per_tensor_quantize(x) - assert not torch.isnan(q_input[0,0]) + # assert not torch.isnan(q_input[0,0]) - out, _ = torch._scaled_mm( - q_input, - q_weight, - out_dtype=x.dtype, - scale_a=x_scale.float(), - scale_b=w_scale.float(), - bias=bias, - ) - print(f"out.norm(): {out.norm()}") - output[:, start_offset:end_offset] = out - start_offset = end_offset + # out, _ = torch._scaled_mm( + # q_input, + # q_weight * w_scale, + # out_dtype=x.dtype, + # scale_a=x_scale.float(), + # scale_b=w_scale.float(), + # bias=bias, + # ) + # print(f"out.norm(): {out.norm()}") + # output[:, start_offset:end_offset] = out + # start_offset = end_offset - assert end_offset == output.shape[1] - # print(output.sum(dim=0).shape) - # 
print(output.sum(dim=1).shape) + # assert end_offset == output.shape[1] + # # print(output.sum(dim=0).shape) + # # print(output.sum(dim=1).shape) - # print(output.norm(), output.norm(dim=0), output.norm(dim=1)) - return output + # # print(output.norm(), output.norm(dim=0), output.norm(dim=1)) + # return output def _quantize(self, tensor: torch.Tensor, inv_scale: torch.tensor): finfo = torch.finfo(torch.float8_e4m3fn) @@ -231,17 +261,12 @@ def per_tensor_quantize(tensor: torch.Tensor) -> tuple[torch.Tensor, float]: # Since .abs() creates a new tensor, we use aminmax to get # the min and max first and then calculate the absmax. min_val, max_val = tensor.aminmax() - print(min_val) - print(max_val) amax = min_val.abs().max(max_val.abs()) - print(amax) scale = finfo.max / amax.clamp(min=1e-12) - print(scale) # scale and clamp the tensor to bring it to # the representative range of float8 data type # (as default cast is unsaturated) qweight = (tensor * scale).clamp(min=finfo.min, max=finfo.max) - print(qweight) # Return both float8 data and the inverse scale (as float), # as both required as inputs to torch._scaled_mm qweight = qweight.to(torch.float8_e4m3fn) From cc959ea08db3ea7ba99f894e59ee0ca16168dba9 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Wed, 24 Apr 2024 13:05:45 +0000 Subject: [PATCH 11/90] working! --- run_fp8.py | 14 ++- .../model_executor/layers/quantization/fp8.py | 4 - .../layers/quantization/fp8_static.py | 92 +------------------ vllm/model_executor/models/llama.py | 8 +- 4 files changed, 15 insertions(+), 103 deletions(-) diff --git a/run_fp8.py b/run_fp8.py index afcb0b5e58415..b9fac76e0c1ca 100644 --- a/run_fp8.py +++ b/run_fp8.py @@ -1,6 +1,12 @@ -from vllm import LLM, SamplingParams +from vllm import LLM +from transformers import AutoTokenizer model = LLM("nm-testing/mistral-fp8-test", tokenizer="mistralai/Mistral-7B-Instruct-v0.2", quantization="fp8_static", enforce_eager=True, max_model_len=1024) -# model = LLM("mistralai/Mistral-7B-Instruct-v0.2", enforce_eager=True, max_model_len=1024, quantization="fp8") -sampling_params = SamplingParams(max_tokens=2) -print(model.generate("What is your name"), sampling_params) \ No newline at end of file +tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2") + +prompt = tokenizer.apply_chat_template([{"role": "user", "content": "What is your name"}], tokenize=False, add_generation_prompt=True) +print(f"----- Prompt: {prompt}") + +outputs = model.generate(prompt) +generation = outputs[0].outputs[0].text +print(f"----- Generation: {generation}") \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 6ec3802c6318a..983a63e124ae8 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -104,10 +104,6 @@ def apply_weights(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: qinput, x_scale = per_tensor_quantize(x) - print(f"w_scale: {layer.weight_scaling_factor.item()}") - print(f"x_scale: {x_scale.item()}") - print(f"q_input: {qinput}") - print(f"weight: {layer.weight}") output, _ = torch._scaled_mm( qinput, layer.weight, diff --git a/vllm/model_executor/layers/quantization/fp8_static.py b/vllm/model_executor/layers/quantization/fp8_static.py index 9a7c374fde1e4..24dc498fd2c18 100644 --- a/vllm/model_executor/layers/quantization/fp8_static.py +++ b/vllm/model_executor/layers/quantization/fp8_static.py @@ -141,6 +141,8 @@ def 
apply_weights(self, qinput, x_scale = per_tensor_quantize(x) output = torch.zeros(x.shape[0], q_weight.shape[0], dtype=x.dtype, device="cuda") + + # FOR LOOP TO BE REPLACED BY CUTLASS KERNEL start_offset = 0 for _, (logical_width, w_scale) in enumerate(zip(logical_widths, w_scales)): end_offset = start_offset + logical_width @@ -159,96 +161,6 @@ def apply_weights(self, assert end_offset == output.shape[1] return output - - # def apply_weights( - # self, - # layer: torch.nn.Module, - # x: torch.Tensor, - # bias: Optional[torch.Tensor] = None - # ) -> torch.Tensor: - # logical_widths = layer.logical_widths - # q_weight = layer.weight - # w_scales = layer.weight_scale - # in_scales = layer.in_scale - - # output = torch.zeros(x.shape[0], q_weight.shape[0], dtype=x.dtype, device="cuda") - # start_offset = 0 - # for _, (logical_width, w_scale, in_scale) in enumerate(zip(logical_widths, w_scales, in_scales)): - # end_offset = start_offset + logical_width - # weight_dq = self._dequantize(q_weight[start_offset:end_offset, :], w_scale, x.dtype) - # x_dq = self._fake_quantize_static(x, in_scale) - - # # print(f"x_dq[0,0]: {x_dq[0,0]} // weight_dq[0,0]: {weight_dq[0,0]}") - # # output[:, start_offset:end_offset] = torch.nn.functional.linear(x_dq, weight_dq) - # output[:, start_offset:end_offset] = torch.nn.functional.linear(x, weight_dq) - # start_offset = end_offset - - # assert end_offset == output.shape[1] - # return output - - # def apply_weights( - # self, - # layer, - # x, - # bias=None - # ): - # # print(sum(x)) - # # assert False - # # qinput, x_scale = per_tensor_quantize(x) - # # print(qinput) - # # assert False - # output = torch.zeros(x.shape[0], layer.weight.shape[0], dtype=x.dtype, device="cuda") - # start_offset = 0 - # print("\n----") - - # for _, (logical_width, w_scale, in_scale) in enumerate(zip(layer.logical_widths, layer.weight_scale, layer.in_scale)): - # end_offset = start_offset + logical_width - # print(f"(start,end) = ({start_offset}, {end_offset})") - - # q_weight = layer.weight[start_offset:end_offset, :].t() - # q_input = self._quantize(x, inv_scale=in_scale) - # x_scale = in_scale - # # print(f"in_scale: {in_scale}") - # # print(f"w_scale: {w_scale}") - # # print(f"input: {x}") - # # print(f"q_input: {q_input}") - # # print(f"q_weight: {q_weight}") - # # q_input, x_scale = per_tensor_quantize(x) - - # assert not torch.isnan(q_input[0,0]) - - # out, _ = torch._scaled_mm( - # q_input, - # q_weight * w_scale, - # out_dtype=x.dtype, - # scale_a=x_scale.float(), - # scale_b=w_scale.float(), - # bias=bias, - # ) - # print(f"out.norm(): {out.norm()}") - # output[:, start_offset:end_offset] = out - # start_offset = end_offset - - # assert end_offset == output.shape[1] - # # print(output.sum(dim=0).shape) - # # print(output.sum(dim=1).shape) - - # # print(output.norm(), output.norm(dim=0), output.norm(dim=1)) - # return output - - def _quantize(self, tensor: torch.Tensor, inv_scale: torch.tensor): - finfo = torch.finfo(torch.float8_e4m3fn) - qtensor = (tensor / inv_scale).clamp(min=finfo.min, max=finfo.max) - return qtensor.to(torch.float8_e4m3fn) - - # def _dequantize(self, xq: torch.Tensor, inv_scale: torch.tensor, dtype: torch.dtype): - # return (xq.to(dtype) * inv_scale) - - # def _fake_quantize_static(self, x: torch.Tensor, inv_scale: torch.Tensor): - # xq = self._quantize(x, inv_scale) - # xdq = self._dequantize(xq, inv_scale, x.dtype) - # # print(f"----- inv_scale: {inv_scale} // x[0,0]: {x[0,0]} // xq[0,0]: {xq[0,0]} // xdq[0,0]: {xdq[0,0]}") - # return xdq def 
per_tensor_quantize(tensor: torch.Tensor) -> tuple[torch.Tensor, float]: diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index c9c2097ce6267..58a895ac7de33 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -284,8 +284,8 @@ def forward( is_warmup = kv_caches[0] is None if not is_warmup: - print("NOT IN WARMUP") - print(f"embeddings: {hidden_states[0]}") + # print("NOT IN WARMUP") + # print(f"embeddings: {hidden_states[0]}") pass residual = None @@ -299,12 +299,10 @@ def forward( residual, ) if not is_warmup: - assert False + # assert False # print(f"idx: {i}: {hidden_states}") pass hidden_states, _ = self.norm(hidden_states, residual) - if not is_warmup: - assert False return hidden_states From 8d68dbc6c2848ee2ca6091d75be1965ecafb758d Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Wed, 24 Apr 2024 13:08:06 +0000 Subject: [PATCH 12/90] fixed llama --- vllm/model_executor/models/llama.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 58a895ac7de33..5021b9f66eadd 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -280,13 +280,6 @@ def forward( hidden_states = inputs_embeds else: hidden_states = self.get_input_embeddings(input_ids) - - is_warmup = kv_caches[0] is None - - if not is_warmup: - # print("NOT IN WARMUP") - # print(f"embeddings: {hidden_states[0]}") - pass residual = None for i in range(len(self.layers)): @@ -298,10 +291,7 @@ def forward( attn_metadata, residual, ) - if not is_warmup: - # assert False - # print(f"idx: {i}: {hidden_states}") - pass + hidden_states, _ = self.norm(hidden_states, residual) return hidden_states From 881fc65c8676e9bc76ae628de217a3edaa7494d7 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Wed, 24 Apr 2024 13:08:36 +0000 Subject: [PATCH 13/90] fixed llama again --- vllm/model_executor/models/llama.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 5021b9f66eadd..016e3b039d1e8 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -280,7 +280,6 @@ def forward( hidden_states = inputs_embeds else: hidden_states = self.get_input_embeddings(input_ids) - residual = None for i in range(len(self.layers)): layer = self.layers[i] @@ -291,7 +290,6 @@ def forward( attn_metadata, residual, ) - hidden_states, _ = self.norm(hidden_states, residual) return hidden_states From e6dd46f18a1184277742572268156e9721e71f85 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Wed, 24 Apr 2024 13:16:07 +0000 Subject: [PATCH 14/90] updated names --- run_fp8.py | 2 +- .../layers/quantization/__init__.py | 4 +- .../layers/quantization/fp8_static.py | 186 ------------------ 3 files changed, 3 insertions(+), 189 deletions(-) delete mode 100644 vllm/model_executor/layers/quantization/fp8_static.py diff --git a/run_fp8.py b/run_fp8.py index b9fac76e0c1ca..84bef6578301e 100644 --- a/run_fp8.py +++ b/run_fp8.py @@ -1,7 +1,7 @@ from vllm import LLM from transformers import AutoTokenizer -model = LLM("nm-testing/mistral-fp8-test", tokenizer="mistralai/Mistral-7B-Instruct-v0.2", quantization="fp8_static", enforce_eager=True, max_model_len=1024) +model = LLM("nm-testing/mistral-fp8-test", tokenizer="mistralai/Mistral-7B-Instruct-v0.2", quantization="fp8_serialized", enforce_eager=True, max_model_len=1024) tokenizer = 
AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2") prompt = tokenizer.apply_chat_template([{"role": "user", "content": "What is your name"}], tokenize=False, add_generation_prompt=True) diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index c139331307872..dbbe466dd7a94 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -5,7 +5,7 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.quantization.fp8 import FP8Config -from vllm.model_executor.layers.quantization.fp8_static import FP8StaticConfig +from vllm.model_executor.layers.quantization.fp8_serialized import FP8SerializedConfig from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.marlin import MarlinConfig from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig @@ -14,7 +14,7 @@ "aqlm": AQLMConfig, "awq": AWQConfig, "fp8": FP8Config, - "fp8_static": FP8StaticConfig, + "fp8_serialized": FP8SerializedConfig, "gptq": GPTQConfig, "squeezellm": SqueezeLLMConfig, "marlin": MarlinConfig, diff --git a/vllm/model_executor/layers/quantization/fp8_static.py b/vllm/model_executor/layers/quantization/fp8_static.py deleted file mode 100644 index 24dc498fd2c18..0000000000000 --- a/vllm/model_executor/layers/quantization/fp8_static.py +++ /dev/null @@ -1,186 +0,0 @@ -from typing import Any, Dict, List, Optional, Tuple, Union - -import torch -from torch.nn.parameter import Parameter - -from vllm.model_executor.layers.linear import (LinearMethodBase, - set_weight_attrs) -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) - - -class FP8StaticConfig(QuantizationConfig): - """Config class for FP8.""" - - @classmethod - def get_name(cls) -> str: - return "fp8_static" - - @classmethod - def get_supported_act_dtypes(cls) -> List[torch.dtype]: - return [torch.bfloat16, torch.half] - - @classmethod - def get_min_capability(cls) -> int: - return 90 - - @classmethod - def get_config_filenames(cls) -> List[str]: - return [] - - @classmethod - def from_config(cls, config: Dict[str, Any]) -> "FP8StaticConfig": - return cls() - - def get_linear_method(self) -> "Fp8LinearMethod": - return Fp8LinearMethod(self) - - def get_scaled_act_names(self) -> List[str]: - return [] - - -class Fp8LinearMethod(LinearMethodBase): - """Linear method for StaticFP8 - . - Args: - quant_config: The quantization config. 
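A note on the rename above: the QUANTIZATION_METHODS dict is the only place the user-facing string is resolved, so the key registered here has to match the string passed as quantization= in run_fp8.py exactly. A minimal sketch of the lookup, using only names visible in these diffs (illustrative, not the loader's exact code path):

    from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS

    quant_cls = QUANTIZATION_METHODS["fp8_serialized"]  # -> FP8SerializedConfig
    assert quant_cls.get_min_capability() == 90         # config demands compute capability 9.0
    config = quant_cls.from_config({})                  # this from_config ignores the dict contents
    linear_method = config.get_linear_method()          # -> Fp8SerializedLinearMethod
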
- """ - - def __init__(self, quant_config: FP8StaticConfig): - self.quant_config = quant_config - - def create_weights( - self, - layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: List[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): - del input_size, output_size - - weight = Parameter(torch.empty(sum(output_partition_sizes), - input_size_per_partition, - dtype=torch.float8_e4m3fn), - requires_grad=False) - layer.register_parameter("weight", weight) - set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) - set_weight_attrs(weight, extra_weight_attrs) - - weight_scale = Parameter( - torch.empty( - len(output_partition_sizes), - device='cuda', dtype=torch.float32, - ), requires_grad=False - ) - layer.register_parameter("weight_scale", weight_scale) - set_weight_attrs(weight_scale, extra_weight_attrs) - set_weight_attrs(weight_scale, { - "shard_indexer": self.scales_shard_indexer, - }) - - # in_scale = Parameter( - # torch.empty( - # len(output_partition_sizes), - # device='cuda', dtype=torch.float32, - # ), requires_grad=False - # ) - # layer.register_parameter("in_scale", in_scale) - # set_weight_attrs(in_scale, extra_weight_attrs) - # set_weight_attrs(in_scale, { - # "shard_indexer": self.scales_shard_indexer, - # }) - - layer.logical_widths = output_partition_sizes - - def shard_id_as_int( - self, - shard_id: Union[str, int] - ) -> int: - if isinstance(shard_id, int): - return shard_id - assert isinstance(shard_id, str) - qkv_idxs = { "q": 0, "k": 1, "v": 2 } - assert shard_id in qkv_idxs - return qkv_idxs[shard_id] - - # def scales_shard_splitter_NKK( - # self, - # param: torch.Tensor, - # loaded_weight: torch.Tensor, - # shard_id: Union[str, int], - # logical_widths: torch.Tensor - # ) -> Tuple[torch.Tensor, torch.Tensor]: - # shard_id = self.shard_id_as_int(shard_id) - # offset = sum(logical_widths[:shard_id]) - # size = logical_widths[shard_id] - # # update loaded weight with copies for broadcast. - # loaded_weight = loaded_weight.repeat(size) - # return param[offset : offset + size], loaded_weight - - def scales_shard_indexer( - self, - param: torch.Tensor, - loaded_weight: torch.Tensor, - shard_id: Union[str, int], - ) -> Tuple[torch.Tensor, torch.Tensor]: - # print(f"----- shard_id: {shard_id}") - # print(f"----- loaded_weight: {loaded_weight}") - return param[self.shard_id_as_int(shard_id)], loaded_weight - - def apply_weights(self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - logical_widths = layer.logical_widths - q_weight = layer.weight - w_scales = layer.weight_scale - - qinput, x_scale = per_tensor_quantize(x) - - output = torch.zeros(x.shape[0], q_weight.shape[0], dtype=x.dtype, device="cuda") - - # FOR LOOP TO BE REPLACED BY CUTLASS KERNEL - start_offset = 0 - for _, (logical_width, w_scale) in enumerate(zip(logical_widths, w_scales)): - end_offset = start_offset + logical_width - q_weight = layer.weight[start_offset:end_offset, :].t() - - out, _ = torch._scaled_mm( - qinput, - q_weight, - out_dtype=x.dtype, - scale_a=x_scale, - scale_b=w_scale, - bias=bias, - ) - output[:, start_offset:end_offset] = out - start_offset = end_offset - - assert end_offset == output.shape[1] - return output - - -def per_tensor_quantize(tensor: torch.Tensor) -> tuple[torch.Tensor, float]: - """Quantize a tensor using per-tensor static scaling factor. - Args: - tensor: The input tensor. 
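The per_tensor_quantize helper being deleted here reappears unchanged in the new fp8_serialized module later in the series, so a tiny worked example of the scheme is worth pinning down. This is a sketch with made-up values; it assumes a PyTorch build that ships torch.float8_e4m3fn (maximum representable value 448.0):

    import torch

    x = torch.tensor([[-3.0, 0.5, 2.0]])
    finfo = torch.finfo(torch.float8_e4m3fn)

    # The scale maps the tensor's absmax onto the FP8-E4M3 maximum,
    # exactly as the aminmax-based code here does.
    scale = finfo.max / x.abs().max().clamp(min=1e-12)   # 448.0 / 3.0
    xq = (x * scale).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)

    # torch._scaled_mm consumes the *inverse* scale, which is why the
    # helper returns scale.reciprocal() rather than scale itself.
    inv_scale = scale.reciprocal()
    x_dq = xq.to(torch.float32) * inv_scale              # recovers x up to FP8 rounding
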
- """ - finfo = torch.finfo(torch.float8_e4m3fn) - # Calculate the scale as dtype max divided by absmax. - # Since .abs() creates a new tensor, we use aminmax to get - # the min and max first and then calculate the absmax. - min_val, max_val = tensor.aminmax() - amax = min_val.abs().max(max_val.abs()) - scale = finfo.max / amax.clamp(min=1e-12) - # scale and clamp the tensor to bring it to - # the representative range of float8 data type - # (as default cast is unsaturated) - qweight = (tensor * scale).clamp(min=finfo.min, max=finfo.max) - # Return both float8 data and the inverse scale (as float), - # as both required as inputs to torch._scaled_mm - qweight = qweight.to(torch.float8_e4m3fn) - scale = scale.float().reciprocal() - return qweight, scale \ No newline at end of file From 7e3933b96fb390cbd7845c76b8dc4ad8083401ff Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Wed, 24 Apr 2024 13:16:53 +0000 Subject: [PATCH 15/90] nit --- run_fp8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_fp8.py b/run_fp8.py index 84bef6578301e..16b1fab255293 100644 --- a/run_fp8.py +++ b/run_fp8.py @@ -9,4 +9,4 @@ outputs = model.generate(prompt) generation = outputs[0].outputs[0].text -print(f"----- Generation: {generation}") \ No newline at end of file +print(f"----- Generation: {generation}") From 453a236cd05bdfea722bca52a1777a584954e1ab Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Wed, 24 Apr 2024 13:22:47 +0000 Subject: [PATCH 16/90] cleanup --- .buildkite/test-pipeline.yaml | 3 --- tests/quantization/test_fp8.py | 9 --------- 2 files changed, 12 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 11cda053260ec..f7c1569696249 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -96,9 +96,6 @@ steps: - label: Metrics Test command: pytest -v -s metrics -- label: Quantization Test - command: pytest -v -s quantization - - label: Benchmarks working_dir: "/vllm-workspace/.buildkite" commands: diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 6e38d570f8541..43300a72a86c7 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -12,15 +12,6 @@ capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] -@pytest.mark.skipif( - capability < QUANTIZATION_METHODS["fp8_static"].get_min_capability(), - reason="FP8 is not supported on this GPU type.") -def test_load_static_model(vllm_runner) -> None: - llm = vllm_runner("FriendliAI/Llama-2-7b-chat-hf-fp8", quantization="fp8_static", enforce_eager=True) - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model - fc1 = model.model.decoder.layers[0].fc1 - assert isinstance(fc1.linear_method, Fp8LinearMethod) - print(llm.generate("Hello my name is", SamplingParams(max_tokens=20))) @pytest.mark.skipif( capability < QUANTIZATION_METHODS["fp8"].get_min_capability(), From 310e0a7ad0fb1383a9980ee47d0cb0a8766848da Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Wed, 24 Apr 2024 13:23:06 +0000 Subject: [PATCH 17/90] cleanup --- tests/quantization/test_fp8.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 43300a72a86c7..fa10e60de10a7 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -5,7 +5,6 @@ import pytest import torch -from vllm import SamplingParams from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from 
vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod From ab4cb0209a2425a39b249379dc92ae5727b2d909 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Wed, 24 Apr 2024 13:23:47 +0000 Subject: [PATCH 18/90] missed file :) --- .../layers/quantization/fp8_serialized.py | 166 ++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 vllm/model_executor/layers/quantization/fp8_serialized.py diff --git a/vllm/model_executor/layers/quantization/fp8_serialized.py b/vllm/model_executor/layers/quantization/fp8_serialized.py new file mode 100644 index 0000000000000..933a70a7d6d10 --- /dev/null +++ b/vllm/model_executor/layers/quantization/fp8_serialized.py @@ -0,0 +1,166 @@ +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +from torch.nn.parameter import Parameter + +from vllm.model_executor.layers.linear import (LinearMethodBase, + set_weight_attrs) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) + + +class FP8SerializedConfig(QuantizationConfig): + """Config class for FP8.""" + + @classmethod + def get_name(cls) -> str: + return "fp8_static" + + @classmethod + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.bfloat16, torch.half] + + @classmethod + def get_min_capability(cls) -> int: + return 90 + + @classmethod + def get_config_filenames(cls) -> List[str]: + return [] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "FP8SerializedConfig": + return cls() + + def get_linear_method(self) -> "Fp8SerializedLinearMethod": + return Fp8SerializedLinearMethod(self) + + def get_scaled_act_names(self) -> List[str]: + return [] + + +class Fp8SerializedLinearMethod(LinearMethodBase): + """Linear method for StaticFP8 + . + Args: + quant_config: The quantization config. + """ + + def __init__(self, quant_config: FP8SerializedConfig): + self.quant_config = quant_config + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + del input_size, output_size, params_dtype + + weight = Parameter(torch.empty(sum(output_partition_sizes), + input_size_per_partition, + dtype=torch.float8_e4m3fn), + requires_grad=False) + layer.register_parameter("weight", weight) + set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) + set_weight_attrs(weight, extra_weight_attrs) + + weight_scale = Parameter( + torch.empty( + len(output_partition_sizes), + device='cuda', dtype=torch.float32, + ), requires_grad=False + ) + layer.register_parameter("weight_scale", weight_scale) + set_weight_attrs(weight_scale, extra_weight_attrs) + set_weight_attrs(weight_scale, { + "shard_indexer": self.scales_shard_indexer, + }) + + layer.logical_widths = output_partition_sizes + + def shard_id_as_int( + self, + shard_id: Union[str, int] + ) -> int: + if isinstance(shard_id, int): + return shard_id + assert isinstance(shard_id, str) + qkv_idxs = { "q": 0, "k": 1, "v": 2 } + assert shard_id in qkv_idxs + return qkv_idxs[shard_id] + + # def scales_shard_splitter_NKK( + # self, + # param: torch.Tensor, + # loaded_weight: torch.Tensor, + # shard_id: Union[str, int], + # logical_widths: torch.Tensor + # ) -> Tuple[torch.Tensor, torch.Tensor]: + # shard_id = self.shard_id_as_int(shard_id) + # offset = sum(logical_widths[:shard_id]) + # size = logical_widths[shard_id] + # # update loaded weight with copies for broadcast. 
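Two layouts for the per-shard scales are visible at this point in the file: the commented-out splitter would repeat one scale across every row of its shard, while the scales_shard_indexer kept below stores a single scalar per logical shard. A toy sketch of that bookkeeping for a fused QKV projection, with invented scale values:

    import torch

    qkv_idxs = {"q": 0, "k": 1, "v": 2}                 # mirrors shard_id_as_int
    weight_scale = torch.empty(3, dtype=torch.float32)  # one slot per logical shard

    for shard_id, loaded in [("q", torch.tensor(0.02)),
                             ("k", torch.tensor(0.03)),
                             ("v", torch.tensor(0.05))]:
        weight_scale[qkv_idxs[shard_id]] = loaded       # param[shard_id_as_int(...)] <- scale

Each slot then scales its own slice of the fused output, which is what the per-shard torch._scaled_mm loop in apply_weights below relies on.
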
+ # loaded_weight = loaded_weight.repeat(size) + # return param[offset : offset + size], loaded_weight + + def scales_shard_indexer( + self, + param: torch.Tensor, + loaded_weight: torch.Tensor, + shard_id: Union[str, int], + ) -> Tuple[torch.Tensor, torch.Tensor]: + return param[self.shard_id_as_int(shard_id)], loaded_weight + + def apply_weights(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + + qinput, x_scale = per_tensor_quantize(x) + + # FOR LOOP TO BE REPLACED BY CUTLASS KERNEL W/ EPILOGUE FUSION + output = torch.zeros(x.shape[0], layer.weight.shape[0], dtype=x.dtype, device="cuda") + start_offset = 0 + for _, (logical_width, w_scale) in enumerate(zip(layer.logical_widths, layer.weight_scale)): + end_offset = start_offset + logical_width + + out, _ = torch._scaled_mm( + qinput, + layer.weight[start_offset:end_offset, :].t(), + out_dtype=x.dtype, + scale_a=x_scale, + scale_b=w_scale, + bias=bias, + ) + output[:, start_offset:end_offset] = out + start_offset = end_offset + + return output + + +def per_tensor_quantize(tensor: torch.Tensor) -> tuple[torch.Tensor, float]: + """Quantize a tensor using per-tensor static scaling factor. + Args: + tensor: The input tensor. + """ + finfo = torch.finfo(torch.float8_e4m3fn) + # Calculate the scale as dtype max divided by absmax. + # Since .abs() creates a new tensor, we use aminmax to get + # the min and max first and then calculate the absmax. + min_val, max_val = tensor.aminmax() + amax = min_val.abs().max(max_val.abs()) + scale = finfo.max / amax.clamp(min=1e-12) + # scale and clamp the tensor to bring it to + # the representative range of float8 data type + # (as default cast is unsaturated) + qweight = (tensor * scale).clamp(min=finfo.min, max=finfo.max) + # Return both float8 data and the inverse scale (as float), + # as both required as inputs to torch._scaled_mm + qweight = qweight.to(torch.float8_e4m3fn) + scale = scale.float().reciprocal() + return qweight, scale From 2edd93afe360be2de69f23497b16436ba28673a9 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Wed, 24 Apr 2024 10:07:48 -0400 Subject: [PATCH 19/90] Update fp8.py --- vllm/model_executor/layers/quantization/fp8.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 983a63e124ae8..01e494c870e71 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -112,7 +112,6 @@ def apply_weights(self, scale_b=layer.weight_scaling_factor, bias=bias, ) - return output From ccee5d337df4455587c099890f93e977972d3c4f Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 13:46:35 -0700 Subject: [PATCH 20/90] Implement static scaling for Mixtral --- csrc/ops.h | 7 ++++- csrc/pybind.cpp | 3 +- csrc/quantization/fp8/fp8_cuda_kernels.cu | 25 +++++++++++++++- vllm/_custom_ops.py | 9 ++++-- .../layers/fused_moe/fused_moe.py | 12 +++++--- .../model_executor/layers/quantization/fp8.py | 11 +++++-- vllm/model_executor/models/mixtral.py | 30 ++++++++++++++++++- 7 files changed, 84 insertions(+), 13 deletions(-) diff --git a/csrc/ops.h b/csrc/ops.h index ff7a3de1a0a8c..03bb1e24dc68e 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -146,7 +146,12 @@ void gptq_shuffle( torch::Tensor q_perm, int bit); -void scaled_fp8_quant( +void static_scaled_fp8_quant( + torch::Tensor& out, + torch::Tensor& input, + torch::Tensor& 
scale); + +void dynamic_scaled_fp8_quant( torch::Tensor& out, torch::Tensor& input, torch::Tensor& scale); diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index a5b16c5abc3ed..2250c7f69f0ab 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -73,7 +73,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ"); ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ"); ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM"); - ops.def("scaled_fp8_quant", &scaled_fp8_quant, "Compute FP8 quantized tensor and scaling factor"); + ops.def("static_scaled_fp8_quant", &static_scaled_fp8_quant, "Compute FP8 quantized tensor for given scaling factor"); + ops.def("dynamic_scaled_fp8_quant", &dynamic_scaled_fp8_quant, "Compute FP8 quantized tensor and scaling factor"); ops.def( "moe_align_block_size", &moe_align_block_size, diff --git a/csrc/quantization/fp8/fp8_cuda_kernels.cu b/csrc/quantization/fp8/fp8_cuda_kernels.cu index c3337cede1282..2477051eb60d7 100644 --- a/csrc/quantization/fp8/fp8_cuda_kernels.cu +++ b/csrc/quantization/fp8/fp8_cuda_kernels.cu @@ -74,7 +74,30 @@ __global__ void scaled_fp8_quant_kernel( } // namespace vllm -void scaled_fp8_quant( +void static_scaled_fp8_quant( + torch::Tensor& out, // [..., d] + torch::Tensor& input, // [..., d] + torch::Tensor& scale) // [1] +{ + int64_t num_tokens = input.numel() / input.size(-1); + int64_t num_elems = input.numel(); + dim3 grid(num_tokens); + dim3 block(1024); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), + "scaled_fp8_quant_kernel", + [&] { + vllm::scaled_fp8_quant_kernel<<>>( + out.data_ptr(), + input.data_ptr(), + scale.data_ptr(), + num_elems); + }); +} + +void dynamic_scaled_fp8_quant( torch::Tensor& out, // [..., d] torch::Tensor& input, // [..., d] torch::Tensor& scale) // [1] diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index e4b16ed918d1a..73e7b723edb31 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -154,10 +154,13 @@ def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, # fp8 -def scaled_fp8_quant(input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - scale = torch.zeros(1, device=input.device, dtype=torch.float32) +def scaled_fp8_quant(input: torch.Tensor, scale: Optional[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: output = torch.empty_like(input, dtype=torch.float8_e4m3fn) - vllm_ops.scaled_fp8_quant(output, input, scale) + if scale: + vllm_ops.static_scaled_fp8_quant(output, input, scale) + else: + scale = torch.zeros(1, device=input.device, dtype=torch.float32) + vllm_ops.dynamic_scaled_fp8_quant(output, input, scale) return output, scale diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index ac7c30e2a9727..b77732cff22ad 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -220,8 +220,8 @@ def moe_align_block_size( def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, - B_scale: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.Tensor, + A_scale: Optional[torch.Tensor], B_scale: torch.Tensor, + topk_weights: torch.Tensor, topk_ids: torch.Tensor, sorted_token_ids: torch.Tensor, expert_ids: torch.Tensor, num_tokens_post_padded: torch.Tensor, @@ -232,10 +232,10 @@ def invoke_fused_moe_kernel(A: 
torch.Tensor, B: torch.Tensor, C: torch.Tensor, assert sorted_token_ids.stride(0) == 1 if not use_fp8: - A_scale = None + assert A_scale is None assert B_scale is None else: - A, A_scale = ops.scaled_fp8_quant(A) + A, A_scale = ops.scaled_fp8_quant(A, A_scale) assert B_scale is not None grid = lambda META: (triton.cdiv(sorted_token_ids.shape[0], META[ @@ -318,6 +318,8 @@ def fused_moe( use_fp8: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: """ This function computes a Mixture of Experts (MoE) layer using two sets of @@ -434,6 +436,7 @@ def fused_moe( invoke_fused_moe_kernel(hidden_states, w1, intermediate_cache1, + a1_scale, w1_scale, topk_weights, topk_ids, @@ -451,6 +454,7 @@ def fused_moe( invoke_fused_moe_kernel(intermediate_cache2, w2, intermediate_cache3, + a2_scale, w2_scale, topk_weights, topk_ids, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 01e494c870e71..633021fda894b 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -13,6 +13,12 @@ class FP8Config(QuantizationConfig): """Config class for FP8.""" + def __init__( + self, + act_scaling: str="dynamic", + ) -> None: + self.act_scaling = act_scaling + @classmethod def get_name(cls) -> str: return "fp8" @@ -30,11 +36,12 @@ def get_min_capability(cls) -> int: @classmethod def get_config_filenames(cls) -> List[str]: - return [] + return ["quantize_config.json"] @classmethod def from_config(cls, config: Dict[str, Any]) -> "FP8Config": - return cls() + act_scaling = cls.get_from_keys(config, ["act_scaling"]) + return cls(act_scaling) def get_linear_method(self) -> "Fp8LinearMethod": return Fp8LinearMethod(self) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index a33b795d7088e..951c44ca24622 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -114,12 +114,27 @@ def __init__( self.num_total_experts, device="cuda", dtype=torch.float32), requires_grad=False) if self.use_fp8 else None + # Scaling factors for FP8 activations + static_act_scaling = self.use_fp8 and linear_method.act_scaling == "static" + self.as_scale = nn.Parameter( + torch.zeros(1, device="cuda", dtype=torch.float32), + requires_grad=False) if static_act_scaling else None + self.a2s_scale = nn.Parameter( + torch.zeros(1, device="cuda", dtype=torch.float32), + requires_grad=False) if static_act_scaling else None + set_weight_attrs(self.ws, { "weight_loader": self.weight_loader, }) set_weight_attrs(self.w2s, { "weight_loader": self.weight_loader, }) + set_weight_attrs(self.as_scale, { + "weight_loader": self.weight_loader, + }) + set_weight_attrs(self.a2s_scale, { + "weight_loader": self.weight_loader, + }) def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, weight_name: str, expert_id: int): @@ -134,6 +149,9 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, shard_size:2 * shard_size, :] = loaded_weight[shard, :] if weight_name.endswith("w2.weight"): param_data[expert_id, :, :] = loaded_weight[:, shard] + if "activation_scale" in weight_name: + param_data[:] = param_data[:].max(loaded_weight) + print("loaded scale", weight_name, param_data) def process_weights_after_loading(self): if self.use_fp8: @@ -161,7 +179,9 @@ def forward(self, hidden_states: 
torch.Tensor) -> torch.Tensor: inplace=True, use_fp8=self.use_fp8, w1_scale=self.ws_scale, - w2_scale=self.w2s_scale) + w2_scale=self.w2s_scale, + a1_scale=self.a1_scale, + a2_scale=self.a2_scale) if self.tp_size > 1: final_hidden_states = tensor_model_parallel_all_reduce( @@ -443,11 +463,19 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ] expert_params_mapping = [ + # These are the weights for the experts # (param_name, weight_name, expert_id) ("ws" if weight_name in ["w1", "w3"] else "w2s", f"experts.{expert_id}.{weight_name}.weight", expert_id) for expert_id in range(self.config.num_local_experts) for weight_name in ["w1", "w2", "w3"] + ] + [ + # These are the activation scales for the experts + # (param_name, weight_name, expert_id) + ("a_scale" if activation_name in ["a1", "a3"] else "a2_scale", + f"experts.{expert_id}.{activation_name}.activation_scale", expert_id) + for expert_id in range(self.config.num_local_experts) + for activation_name in ["a1", "a2", "a3"] ] params_dict = dict(self.named_parameters()) From 8f71c795314101187aa4aa65c128a73781ef14a4 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:01:32 -0700 Subject: [PATCH 21/90] fix --- vllm/model_executor/models/mixtral.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 951c44ca24622..6a52db56b50b9 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -115,13 +115,13 @@ def __init__( requires_grad=False) if self.use_fp8 else None # Scaling factors for FP8 activations - static_act_scaling = self.use_fp8 and linear_method.act_scaling == "static" + need_act_scales = not self.use_fp8 or linear_method.act_scaling == "static" self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), - requires_grad=False) if static_act_scaling else None + requires_grad=False) if need_act_scales else None self.a2s_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), - requires_grad=False) if static_act_scaling else None + requires_grad=False) if need_act_scales else None set_weight_attrs(self.ws, { "weight_loader": self.weight_loader, From 6eb01e07977b7a93348dc5b3247a5b3f369f7720 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:07:48 -0700 Subject: [PATCH 22/90] update --- vllm/model_executor/model_loader/weight_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 9995f2afe3cf7..2cf4cd9f5c642 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -147,7 +147,7 @@ def get_quant_config(model_config: ModelConfig, f for f in config_files if any( f.endswith(x) for x in possible_config_filenames) ] - if len(quant_config_files) == 0: + if len(quant_config_files) == 0 and "" not in possible_config_filenames: raise ValueError( f"Cannot find the config file for {model_config.quantization}") if len(quant_config_files) > 1: From dc89cbc364abbcd471c547381de53cb72bd76c1b Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:16:39 -0700 Subject: [PATCH 23/90] fix --- vllm/model_executor/layers/quantization/fp8.py | 2 ++ vllm/model_executor/model_loader/weight_utils.py | 13 ++++++------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py 
b/vllm/model_executor/layers/quantization/fp8.py index 633021fda894b..7c0e0e9e8e45b 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -13,6 +13,8 @@ class FP8Config(QuantizationConfig): """Config class for FP8.""" + config_file_optional = True + def __init__( self, act_scaling: str="dynamic", diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 2cf4cd9f5c642..8fc85da86748c 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -135,19 +135,18 @@ def get_quant_config(model_config: ModelConfig, else: hf_folder = model_name_or_path - possible_config_filenames = quant_cls.get_config_filenames() - - # If the quantization config is not found, use the default config. - if not possible_config_filenames: - return quant_cls() - config_files = glob.glob(os.path.join(hf_folder, "*.json")) quant_config_files = [ f for f in config_files if any( f.endswith(x) for x in possible_config_filenames) ] - if len(quant_config_files) == 0 and "" not in possible_config_filenames: + + possible_config_filenames = quant_cls.get_config_filenames() + # If the quantization config is optional and not provided, use the default config. + if quant_cls.getattr("config_file_optional", False) and not quant_config_files: + return quant_cls() + if len(quant_config_files) == 0: raise ValueError( f"Cannot find the config file for {model_config.quantization}") if len(quant_config_files) > 1: From be60845acd14b0639fc565a7aa954ad98df5c7b3 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:18:08 -0700 Subject: [PATCH 24/90] update --- vllm/model_executor/model_loader/weight_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 8fc85da86748c..df5b8fd80d1a3 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -136,13 +136,13 @@ def get_quant_config(model_config: ModelConfig, hf_folder = model_name_or_path config_files = glob.glob(os.path.join(hf_folder, "*.json")) + possible_config_filenames = quant_cls.get_config_filenames() quant_config_files = [ f for f in config_files if any( f.endswith(x) for x in possible_config_filenames) ] - possible_config_filenames = quant_cls.get_config_filenames() # If the quantization config is optional and not provided, use the default config. if quant_cls.getattr("config_file_optional", False) and not quant_config_files: return quant_cls() From 4613cb562f4df81e651b798d408fb9bfab067640 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:19:52 -0700 Subject: [PATCH 25/90] update --- vllm/model_executor/model_loader/weight_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index df5b8fd80d1a3..f3e0411c54336 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -144,7 +144,7 @@ def get_quant_config(model_config: ModelConfig, ] # If the quantization config is optional and not provided, use the default config. 
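To make the optional-config path above concrete: for FP8 the checkpoint may ship a quantize_config.json as small as {"act_scaling": "static"}, or omit the file entirely. A sketch of both paths, using the FP8Config surface introduced earlier (the one-key schema is all from_config reads; treating the file as a flat JSON dict is an assumption carried over from the other quantization methods):

    from vllm.model_executor.layers.quantization.fp8 import FP8Config

    # Checkpoint ships a quantize_config.json; the parsed dict reaches from_config:
    static_cfg = FP8Config.from_config({"act_scaling": "static"})

    # No config file at all; the fallback constructs the default, which
    # selects dynamic (computed per forward pass) activation scaling:
    default_cfg = FP8Config()
    assert default_cfg.act_scaling == "dynamic"
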
- if quant_cls.getattr("config_file_optional", False) and not quant_config_files: + if getattr(quant_cls, "config_file_optional", False) and not quant_config_files: return quant_cls() if len(quant_config_files) == 0: raise ValueError( From 3d95d86e040fa643f110f952d42376ca26a702ce Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:21:41 -0700 Subject: [PATCH 26/90] fix --- vllm/model_executor/models/mixtral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 6a52db56b50b9..c818764ff6799 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -115,7 +115,7 @@ def __init__( requires_grad=False) if self.use_fp8 else None # Scaling factors for FP8 activations - need_act_scales = not self.use_fp8 or linear_method.act_scaling == "static" + need_act_scales = not self.use_fp8 or linear_method.quant_config.act_scaling == "static" self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None From 642763fd5a54bf442b17fdcebbc27dabc398144a Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:23:42 -0700 Subject: [PATCH 27/90] move --- vllm/model_executor/models/mixtral.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index c818764ff6799..7260d1e7667ea 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -104,6 +104,13 @@ def __init__( device="cuda", dtype=self.params_dtype)) + set_weight_attrs(self.ws, { + "weight_loader": self.weight_loader, + }) + set_weight_attrs(self.w2s, { + "weight_loader": self.weight_loader, + }) + # Scaling factors for FP8 weights self.ws_scale = nn.Parameter( torch.ones( @@ -123,18 +130,13 @@ def __init__( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None - set_weight_attrs(self.ws, { - "weight_loader": self.weight_loader, - }) - set_weight_attrs(self.w2s, { - "weight_loader": self.weight_loader, - }) - set_weight_attrs(self.as_scale, { - "weight_loader": self.weight_loader, - }) - set_weight_attrs(self.a2s_scale, { - "weight_loader": self.weight_loader, - }) + if need_act_scales: + set_weight_attrs(self.as_scale, { + "weight_loader": self.weight_loader, + }) + set_weight_attrs(self.a2s_scale, { + "weight_loader": self.weight_loader, + }) def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, weight_name: str, expert_id: int): From 706e9317ef3c4f3ee715df3b0f0571a832073d11 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:27:04 -0700 Subject: [PATCH 28/90] update --- vllm/model_executor/models/mixtral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 7260d1e7667ea..9f4673b59c673 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -182,8 +182,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: use_fp8=self.use_fp8, w1_scale=self.ws_scale, w2_scale=self.w2s_scale, - a1_scale=self.a1_scale, - a2_scale=self.a2_scale) + a1_scale=self.as_scale, + a2_scale=self.a2s_scale) if self.tp_size > 1: final_hidden_states = tensor_model_parallel_all_reduce( From 9a3c78ccbb06504400709d1148d1b9c34afd3295 Mon Sep 17 
00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:31:18 -0700 Subject: [PATCH 29/90] lol --- vllm/model_executor/models/mixtral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 9f4673b59c673..e4a0ba7902682 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -122,7 +122,7 @@ def __init__( requires_grad=False) if self.use_fp8 else None # Scaling factors for FP8 activations - need_act_scales = not self.use_fp8 or linear_method.quant_config.act_scaling == "static" + need_act_scales = self.use_fp8 and linear_method.quant_config.act_scaling == "static" self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None From 1b6f0201c60c3e75bcfc45d96f9c32d7f1754c72 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 15:15:06 -0700 Subject: [PATCH 30/90] fix cuda graph --- vllm/_custom_ops.py | 14 +++++++------ .../layers/fused_moe/fused_moe.py | 21 +++++++++++-------- vllm/model_executor/models/mixtral.py | 16 +++++++------- 3 files changed, 28 insertions(+), 23 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 73e7b723edb31..0a18714da0e5c 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -154,13 +154,15 @@ def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, # fp8 -def scaled_fp8_quant(input: torch.Tensor, scale: Optional[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: +def static_scaled_fp8_quant(input: torch.Tensor, scale: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: output = torch.empty_like(input, dtype=torch.float8_e4m3fn) - if scale: - vllm_ops.static_scaled_fp8_quant(output, input, scale) - else: - scale = torch.zeros(1, device=input.device, dtype=torch.float32) - vllm_ops.dynamic_scaled_fp8_quant(output, input, scale) + vllm_ops.static_scaled_fp8_quant(output, input, scale) + return output + +def dynamic_scaled_fp8_quant(input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + output = torch.empty_like(input, dtype=torch.float8_e4m3fn) + scale = torch.zeros(1, device=input.device, dtype=torch.float32) + vllm_ops.dynamic_scaled_fp8_quant(output, input, scale) return output, scale diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index b77732cff22ad..556ecc5f30868 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -227,15 +227,18 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, num_tokens_post_padded: torch.Tensor, mul_routed_weight: bool, top_k: int, config: Dict[str, Any], compute_type: tl.dtype, - use_fp8: bool) -> None: + linear_method: Optional[LinearMethodBase]) -> None: assert topk_weights.stride(1) == 1 assert sorted_token_ids.stride(0) == 1 - if not use_fp8: + if not isinstance(linear_method, Fp8LinearMethod): assert A_scale is None assert B_scale is None - else: - A, A_scale = ops.scaled_fp8_quant(A, A_scale) + elif linear_method.quant_config.act_scaling == "static": + A = ops.static_scaled_fp8_quant(A, A_scale) + assert B_scale is not None + elif linear_method.quant_config.act_scaling == "dynamic": + A, A_scale = ops.dynamic_scaled_fp8_quant(A) assert B_scale is not None grid = lambda META: (triton.cdiv(sorted_token_ids.shape[0], META[ @@ -265,7 +268,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: 
torch.Tensor, MUL_ROUTED_WEIGHT=mul_routed_weight, top_k=top_k, compute_type=compute_type, - use_fp8=use_fp8, + use_fp8=isinstance(linear_method, Fp8LinearMethod), **config, ) @@ -315,7 +318,7 @@ def fused_moe( renormalize: bool, inplace: bool = False, override_config: Optional[Dict[str, Any]] = None, - use_fp8: bool = False, + linear_method: Optional[LinearMethodBase] = None, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, @@ -397,7 +400,7 @@ def fused_moe( else: # First try to load optimal config from the file configs = get_moe_configs(E, w2.shape[2], - "float8" if use_fp8 else None) + "float8" if isinstance(linear_method, Fp8LinearMethod) else None) if configs: # If an optimal configuration map has been found, look up the @@ -447,7 +450,7 @@ def fused_moe( topk_ids.shape[1], config, compute_type=tl.float16, - use_fp8=use_fp8) + linear_method=linear_method) ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) @@ -465,7 +468,7 @@ def fused_moe( 1, config, compute_type=tl.float16, - use_fp8=use_fp8) + linear_method=linear_method) if inplace: return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index e4a0ba7902682..ca051e8fa813c 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -77,9 +77,7 @@ def __init__( self.top_k = top_k self.hidden_size = hidden_size self.intermediate_size = intermediate_size // self.tp_size - # FIXME(pcmoritz): Make this more general to support different - # quantization schemes - self.use_fp8 = isinstance(linear_method, Fp8LinearMethod) + self.linear_method = linear_method if params_dtype is None: params_dtype = torch.get_default_dtype() @@ -111,18 +109,20 @@ def __init__( "weight_loader": self.weight_loader, }) + use_fp8 = isinstance(linear_method, Fp8LinearMethod) + # Scaling factors for FP8 weights self.ws_scale = nn.Parameter( torch.ones( self.num_total_experts, device="cuda", dtype=torch.float32), - requires_grad=False) if self.use_fp8 else None + requires_grad=False) if use_fp8 else None self.w2s_scale = nn.Parameter( torch.ones( self.num_total_experts, device="cuda", dtype=torch.float32), - requires_grad=False) if self.use_fp8 else None + requires_grad=False) if use_fp8 else None # Scaling factors for FP8 activations - need_act_scales = self.use_fp8 and linear_method.quant_config.act_scaling == "static" + need_act_scales = use_fp8 and linear_method.quant_config.act_scaling == "static" self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None @@ -156,7 +156,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, print("loaded scale", weight_name, param_data) def process_weights_after_loading(self): - if self.use_fp8: + if isinstance(self.linear_method, Fp8LinearMethod): ws = torch.empty_like(self.ws.data, dtype=torch.float8_e4m3fn) w2s = torch.empty_like(self.w2s.data, dtype=torch.float8_e4m3fn) for expert in range(self.num_total_experts): @@ -179,7 +179,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: self.top_k, renormalize=True, inplace=True, - use_fp8=self.use_fp8, + linear_method=self.linear_method, w1_scale=self.ws_scale, w2_scale=self.w2s_scale, a1_scale=self.as_scale, From b09bcecc6ff019ce5a4b3ece92d9e591832d1f5a Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 15:20:17 
-0700 Subject: [PATCH 31/90] fix --- vllm/model_executor/layers/fused_moe/fused_moe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 556ecc5f30868..dab54a385b119 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -10,6 +10,7 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger +from vllm.model_executor.layers.linear import LinearMethodBase from vllm.utils import is_hip logger = init_logger(__name__) From 052e2b3b83e6760739e2101252f38ed198784029 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 15:22:17 -0700 Subject: [PATCH 32/90] update --- vllm/model_executor/layers/fused_moe/fused_moe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index dab54a385b119..db22c38011c82 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -11,6 +11,7 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.linear import LinearMethodBase +from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod from vllm.utils import is_hip logger = init_logger(__name__) From b33c6d7a5208564735b6f6eeeeaed88b51a7be30 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 18:46:25 -0700 Subject: [PATCH 33/90] update --- vllm/model_executor/models/mixtral.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index ca051e8fa813c..c233cd0d9b065 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -474,10 +474,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ] + [ # These are the activation scales for the experts # (param_name, weight_name, expert_id) - ("a_scale" if activation_name in ["a1", "a3"] else "a2_scale", - f"experts.{expert_id}.{activation_name}.activation_scale", expert_id) + ("as_scale" if weight_name in ["w1", "w3"] else "a2s_scale", + f"experts.{expert_id}.{weight_name}.activation_scale", expert_id) for expert_id in range(self.config.num_local_experts) - for activation_name in ["a1", "a2", "a3"] + for weight_name in ["w1", "w2", "w3"] ] params_dict = dict(self.named_parameters()) From 475f58d9a25a2cc0de04dcfbcba733590aeafd11 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 18:56:02 -0700 Subject: [PATCH 34/90] refactor --- vllm/model_executor/layers/quantization/base_config.py | 6 ++++++ vllm/model_executor/layers/quantization/fp8.py | 6 ++++-- vllm/model_executor/model_loader/weight_utils.py | 2 +- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 6115e7c3be956..08061f83b5a38 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -29,6 +29,12 @@ def get_min_capability(self) -> int: """ raise NotImplementedError + # The following is not an abstract method and returns True by default. 
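To summarize the hook this comment introduces (its pieces are spread over three hunks in this patch), here is a condensed sketch of the resulting contract; the *Sketch class names are invented for illustration, the real classes live in the files named in the diffs:

    class QuantizationConfigSketch:
        @classmethod
        def require_config_file(cls) -> bool:
            return True                  # conservative default for all methods

    class FP8ConfigSketch(QuantizationConfigSketch):
        @classmethod
        def require_config_file(cls) -> bool:
            return False                 # fp8 can run from built-in defaults

    def get_quant_config_sketch(quant_cls, quant_config_files):
        # Mirrors the weight_utils.py change below: having no config file
        # is fine whenever the method does not require one.
        if not quant_cls.require_config_file() and not quant_config_files:
            return quant_cls()
        ...

Compared with the config_file_optional class attribute from a few patches back, an overridable classmethod keeps the loader free of getattr probing and gives every QuantizationConfig subclass the same typed hook.
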
+    @classmethod
+    def require_config_file(cls) -> bool:
+        """Whether this quantization config needs a configuration file."""
+        return True
+
     @staticmethod
     @abstractmethod
     def get_config_filenames() -> List[str]:
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 7c0e0e9e8e45b..d5e37cbd7bd5e 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -13,8 +13,6 @@
 class FP8Config(QuantizationConfig):
     """Config class for FP8."""
 
-    config_file_optional = True
-
     def __init__(
         self,
         act_scaling: str="dynamic",
@@ -36,6 +34,10 @@ def get_min_capability(cls) -> int:
         # be included: https://github.com/pytorch/pytorch/pull/118881
         return 90
 
+    @classmethod
+    def require_config_file(cls) -> bool:
+        return False
+
     @classmethod
     def get_config_filenames(cls) -> List[str]:
         return ["quantize_config.json"]
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index f3e0411c54336..70879f91bdc23 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -144,7 +144,7 @@ def get_quant_config(model_config: ModelConfig,
     ]
 
     # If the quantization config is optional and not provided, use the default config.
-    if getattr(quant_cls, "config_file_optional", False) and not quant_config_files:
+    if not quant_cls.require_config_file() and not quant_config_files:
         return quant_cls()
     if len(quant_config_files) == 0:
         raise ValueError(
             f"Cannot find the config file for {model_config.quantization}")

From 56b4880b47e4911bebaabe788eaeba08812eaa1b Mon Sep 17 00:00:00 2001
From: Philipp Moritz
Date: Wed, 24 Apr 2024 19:16:27 -0700
Subject: [PATCH 35/90] update

---
 vllm/_custom_ops.py                               | 14 ++++++--------
 vllm/model_executor/layers/fused_moe/fused_moe.py |  7 ++-----
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 0a18714da0e5c..b871b8e3a11d9 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -154,15 +154,13 @@ def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
 
 # fp8
-def static_scaled_fp8_quant(input: torch.Tensor, scale: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+def scaled_fp8_quant(input: torch.Tensor, scale: Optional[torch.Tensor]=None) -> Tuple[torch.Tensor, torch.Tensor]:
     output = torch.empty_like(input, dtype=torch.float8_e4m3fn)
-    vllm_ops.static_scaled_fp8_quant(output, input, scale)
-    return output
-
-def dynamic_scaled_fp8_quant(input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-    output = torch.empty_like(input, dtype=torch.float8_e4m3fn)
-    scale = torch.zeros(1, device=input.device, dtype=torch.float32)
-    vllm_ops.dynamic_scaled_fp8_quant(output, input, scale)
+    if scale is None:
+        scale = torch.zeros(1, device=input.device, dtype=torch.float32)
+        vllm_ops.dynamic_scaled_fp8_quant(output, input, scale)
+    else:
+        vllm_ops.static_scaled_fp8_quant(output, input, scale)
     return output, scale

diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index db22c38011c82..6fe94d4e268c3 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -236,11 +236,8 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor,
     if not isinstance(linear_method, Fp8LinearMethod):
         assert A_scale is None
         assert B_scale is None
-    elif linear_method.quant_config.act_scaling == "static":
-        A = ops.static_scaled_fp8_quant(A, A_scale)
-
assert B_scale is not None - elif linear_method.quant_config.act_scaling == "dynamic": - A, A_scale = ops.dynamic_scaled_fp8_quant(A) + else: + A, A_scale = ops.scaled_fp8_quant(A, A_scale) assert B_scale is not None grid = lambda META: (triton.cdiv(sorted_token_ids.shape[0], META[ From be371549899fd3e169bb8d3f1730c27e83ecf760 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 19:27:40 -0700 Subject: [PATCH 36/90] revert --- .../model_executor/layers/fused_moe/fused_moe.py | 16 +++++++--------- vllm/model_executor/models/mixtral.py | 16 ++++++++-------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 6fe94d4e268c3..b77732cff22ad 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -10,8 +10,6 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger -from vllm.model_executor.layers.linear import LinearMethodBase -from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod from vllm.utils import is_hip logger = init_logger(__name__) @@ -229,11 +227,11 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, num_tokens_post_padded: torch.Tensor, mul_routed_weight: bool, top_k: int, config: Dict[str, Any], compute_type: tl.dtype, - linear_method: Optional[LinearMethodBase]) -> None: + use_fp8: bool) -> None: assert topk_weights.stride(1) == 1 assert sorted_token_ids.stride(0) == 1 - if not isinstance(linear_method, Fp8LinearMethod): + if not use_fp8: assert A_scale is None assert B_scale is None else: @@ -267,7 +265,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, MUL_ROUTED_WEIGHT=mul_routed_weight, top_k=top_k, compute_type=compute_type, - use_fp8=isinstance(linear_method, Fp8LinearMethod), + use_fp8=use_fp8, **config, ) @@ -317,7 +315,7 @@ def fused_moe( renormalize: bool, inplace: bool = False, override_config: Optional[Dict[str, Any]] = None, - linear_method: Optional[LinearMethodBase] = None, + use_fp8: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, @@ -399,7 +397,7 @@ def fused_moe( else: # First try to load optimal config from the file configs = get_moe_configs(E, w2.shape[2], - "float8" if isinstance(linear_method, Fp8LinearMethod) else None) + "float8" if use_fp8 else None) if configs: # If an optimal configuration map has been found, look up the @@ -449,7 +447,7 @@ def fused_moe( topk_ids.shape[1], config, compute_type=tl.float16, - linear_method=linear_method) + use_fp8=use_fp8) ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) @@ -467,7 +465,7 @@ def fused_moe( 1, config, compute_type=tl.float16, - linear_method=linear_method) + use_fp8=use_fp8) if inplace: return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index c233cd0d9b065..9baf176fc78ee 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -77,7 +77,9 @@ def __init__( self.top_k = top_k self.hidden_size = hidden_size self.intermediate_size = intermediate_size // self.tp_size - self.linear_method = linear_method + # FIXME(pcmoritz): Make this more general to support different + # quantization schemes + self.use_fp8 = isinstance(linear_method, Fp8LinearMethod) if 
params_dtype is None: params_dtype = torch.get_default_dtype() @@ -109,20 +111,18 @@ def __init__( "weight_loader": self.weight_loader, }) - use_fp8 = isinstance(linear_method, Fp8LinearMethod) - # Scaling factors for FP8 weights self.ws_scale = nn.Parameter( torch.ones( self.num_total_experts, device="cuda", dtype=torch.float32), - requires_grad=False) if use_fp8 else None + requires_grad=False) if self.use_fp8 else None self.w2s_scale = nn.Parameter( torch.ones( self.num_total_experts, device="cuda", dtype=torch.float32), - requires_grad=False) if use_fp8 else None + requires_grad=False) if self.use_fp8 else None # Scaling factors for FP8 activations - need_act_scales = use_fp8 and linear_method.quant_config.act_scaling == "static" + need_act_scales = self.use_fp8 and linear_method.quant_config.act_scaling == "static" self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None @@ -156,7 +156,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, print("loaded scale", weight_name, param_data) def process_weights_after_loading(self): - if isinstance(self.linear_method, Fp8LinearMethod): + if self.use_fp8: ws = torch.empty_like(self.ws.data, dtype=torch.float8_e4m3fn) w2s = torch.empty_like(self.w2s.data, dtype=torch.float8_e4m3fn) for expert in range(self.num_total_experts): @@ -179,7 +179,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: self.top_k, renormalize=True, inplace=True, - linear_method=self.linear_method, + use_fp8=self.use_fp8, w1_scale=self.ws_scale, w2_scale=self.w2s_scale, a1_scale=self.as_scale, From 9c54d1923876d661f93d470a539903415036735f Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 19:35:27 -0700 Subject: [PATCH 37/90] format --- vllm/_custom_ops.py | 5 ++++- vllm/model_executor/layers/fused_moe/fused_moe.py | 5 +++-- vllm/model_executor/layers/quantization/fp8.py | 2 +- vllm/model_executor/model_loader/weight_utils.py | 4 +++- vllm/model_executor/models/mixtral.py | 3 ++- 5 files changed, 13 insertions(+), 6 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index b871b8e3a11d9..c6f330f211e8a 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -154,7 +154,10 @@ def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, # fp8 -def scaled_fp8_quant(input: torch.Tensor, scale: Optional[torch.Tensor]=None) -> Tuple[torch.Tensor, torch.Tensor]: +def scaled_fp8_quant( + input: torch.Tensor, + scale: Optional[torch.Tensor] = None +) -> Tuple[torch.Tensor, torch.Tensor]: output = torch.empty_like(input, dtype=torch.float8_e4m3fn) if scale is None: scale = torch.zeros(1, device=input.device, dtype=torch.float32) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index b77732cff22ad..17b140195676e 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -220,8 +220,9 @@ def moe_align_block_size( def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, - A_scale: Optional[torch.Tensor], B_scale: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, + A_scale: Optional[torch.Tensor], + B_scale: torch.Tensor, topk_weights: torch.Tensor, + topk_ids: torch.Tensor, sorted_token_ids: torch.Tensor, expert_ids: torch.Tensor, num_tokens_post_padded: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/fp8.py 
b/vllm/model_executor/layers/quantization/fp8.py index d5e37cbd7bd5e..ab562288a8f7d 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -15,7 +15,7 @@ class FP8Config(QuantizationConfig): def __init__( self, - act_scaling: str="dynamic", + act_scaling: str = "dynamic", ) -> None: self.act_scaling = act_scaling diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 70879f91bdc23..54fd4f22f0eef 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -143,9 +143,11 @@ def get_quant_config(model_config: ModelConfig, f.endswith(x) for x in possible_config_filenames) ] - # If the quantization config is optional and not provided, use the default config. + # If the quantization config is optional and not provided, use the + # default config. if not quant_cls.require_config_file() and not quant_config_files: return quant_cls() + if len(quant_config_files) == 0: raise ValueError( f"Cannot find the config file for {model_config.quantization}") diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 9baf176fc78ee..f42e1c939d723 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -122,7 +122,8 @@ def __init__( requires_grad=False) if self.use_fp8 else None # Scaling factors for FP8 activations - need_act_scales = self.use_fp8 and linear_method.quant_config.act_scaling == "static" + need_act_scales = (self.use_fp8 and + linear_method.quant_config.act_scaling == "static") self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None From c5155eab2f068dcd49a2fa78b0b198e0fc36582e Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 20:48:55 -0700 Subject: [PATCH 38/90] Update vllm/_custom_ops.py Co-authored-by: Woosuk Kwon --- vllm/_custom_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index c6f330f211e8a..9d40ec4928276 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -155,8 +155,8 @@ def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, # fp8 def scaled_fp8_quant( - input: torch.Tensor, - scale: Optional[torch.Tensor] = None + input: torch.Tensor, + scale: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: output = torch.empty_like(input, dtype=torch.float8_e4m3fn) if scale is None: From 948cca76dff98914d7ad08dea7eed17c10f2b933 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 21:16:41 -0700 Subject: [PATCH 39/90] Update vllm/model_executor/layers/fused_moe/fused_moe.py Co-authored-by: Woosuk Kwon --- vllm/model_executor/layers/fused_moe/fused_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 17b140195676e..8e49c86202eb7 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -221,7 +221,7 @@ def moe_align_block_size( def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, A_scale: Optional[torch.Tensor], - B_scale: torch.Tensor, topk_weights: torch.Tensor, + B_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, topk_ids: torch.Tensor, sorted_token_ids: torch.Tensor, expert_ids: torch.Tensor, From 
3feb88790dd5ab0d9d8244a92015a0ac27525387 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 21:16:52 -0700 Subject: [PATCH 40/90] Update vllm/model_executor/models/mixtral.py Co-authored-by: Woosuk Kwon --- vllm/model_executor/models/mixtral.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index f42e1c939d723..4439a75e5b57f 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -154,7 +154,6 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, param_data[expert_id, :, :] = loaded_weight[:, shard] if "activation_scale" in weight_name: param_data[:] = param_data[:].max(loaded_weight) - print("loaded scale", weight_name, param_data) def process_weights_after_loading(self): if self.use_fp8: From df1631690c706b72df401316858884585167970f Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 21:44:13 -0700 Subject: [PATCH 41/90] format --- vllm/model_executor/layers/fused_moe/fused_moe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 8e49c86202eb7..0cc59ae57d0d3 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -221,8 +221,8 @@ def moe_align_block_size( def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, A_scale: Optional[torch.Tensor], - B_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, - topk_ids: torch.Tensor, + B_scale: Optional[torch.Tensor], + topk_weights: torch.Tensor, topk_ids: torch.Tensor, sorted_token_ids: torch.Tensor, expert_ids: torch.Tensor, num_tokens_post_padded: torch.Tensor, From 7b6b0fa7f9a04e77c4568a66c4da47e3fff1937d Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 25 Apr 2024 13:50:24 +0000 Subject: [PATCH 42/90] support static scales --- run_fp8.py | 2 +- .../layers/quantization/__init__.py | 5 +- .../layers/quantization/fp8_serialized.py | 60 +++++++++++++++---- 3 files changed, 52 insertions(+), 15 deletions(-) diff --git a/run_fp8.py b/run_fp8.py index 16b1fab255293..19f991902d486 100644 --- a/run_fp8.py +++ b/run_fp8.py @@ -1,7 +1,7 @@ from vllm import LLM from transformers import AutoTokenizer -model = LLM("nm-testing/mistral-fp8-test", tokenizer="mistralai/Mistral-7B-Instruct-v0.2", quantization="fp8_serialized", enforce_eager=True, max_model_len=1024) +model = LLM("nm-testing/mistral-fp8-static", enforce_eager=True, max_model_len=1024) tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2") prompt = tokenizer.apply_chat_template([{"role": "user", "content": "What is your name"}], tokenize=False, add_generation_prompt=True) diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index dbbe466dd7a94..060eaad31899b 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -4,8 +4,8 @@ from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.quantization.fp8 import FP8Config -from vllm.model_executor.layers.quantization.fp8_serialized import FP8SerializedConfig +# from vllm.model_executor.layers.quantization.fp8 import FP8Config +from 
vllm.model_executor.layers.quantization.fp8_serialized import FP8Config from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.marlin import MarlinConfig from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig @@ -14,7 +14,6 @@ "aqlm": AQLMConfig, "awq": AWQConfig, "fp8": FP8Config, - "fp8_serialized": FP8SerializedConfig, "gptq": GPTQConfig, "squeezellm": SqueezeLLMConfig, "marlin": MarlinConfig, diff --git a/vllm/model_executor/layers/quantization/fp8_serialized.py b/vllm/model_executor/layers/quantization/fp8_serialized.py index 933a70a7d6d10..c187bed817412 100644 --- a/vllm/model_executor/layers/quantization/fp8_serialized.py +++ b/vllm/model_executor/layers/quantization/fp8_serialized.py @@ -9,12 +9,18 @@ QuantizationConfig) -class FP8SerializedConfig(QuantizationConfig): +class FP8Config(QuantizationConfig): """Config class for FP8.""" + def __init__( + self, + scheme: str, + ) -> None: + assert scheme == "static" or scheme == "dynamic" + self.scheme = scheme @classmethod def get_name(cls) -> str: - return "fp8_static" + return "fp8" @classmethod def get_supported_act_dtypes(cls) -> List[torch.dtype]: @@ -29,24 +35,24 @@ def get_config_filenames(cls) -> List[str]: return [] @classmethod - def from_config(cls, config: Dict[str, Any]) -> "FP8SerializedConfig": - return cls() + def from_config(cls, config: Dict[str, Any]) -> "FP8Config": + scheme = cls.get_from_keys(config, ["scheme"]) + return cls(scheme=scheme) - def get_linear_method(self) -> "Fp8SerializedLinearMethod": - return Fp8SerializedLinearMethod(self) + def get_linear_method(self) -> "FP8LinearMethod": + return FP8LinearMethod(self) def get_scaled_act_names(self) -> List[str]: return [] - -class Fp8SerializedLinearMethod(LinearMethodBase): +class FP8LinearMethod(LinearMethodBase): """Linear method for StaticFP8 . Args: quant_config: The quantization config. """ - def __init__(self, quant_config: FP8SerializedConfig): + def __init__(self, quant_config: FP8Config): self.quant_config = quant_config def create_weights( @@ -81,6 +87,17 @@ def create_weights( "shard_indexer": self.scales_shard_indexer, }) + if self.quant_config.scheme == "static": + act_scale = Parameter( + torch.empty(len(output_partition_sizes), dtype=torch.float32), + requires_grad=False + ) + layer.register_parameter("act_scale", act_scale) + set_weight_attrs(act_scale, extra_weight_attrs) + set_weight_attrs(act_scale, { + "shard_indexer": self.scales_shard_indexer, + }) + layer.logical_widths = output_partition_sizes def shard_id_as_int( @@ -121,7 +138,14 @@ def apply_weights(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - qinput, x_scale = per_tensor_quantize(x) + w_scale = layer.weight_scale.max() + + if self.quant_config.scheme == "dynamic": + qinput, x_scale = per_tensor_quantize_dyanmic(x) + elif self.quant_config.scheme == "static": + # empirically, these are all the same + x_scale = layer.act_scale.max() + qinput = per_tensor_quantize_static(x, x_scale) # FOR LOOP TO BE REPLACED BY CUTLASS KERNEL W/ EPILOGUE FUSION output = torch.zeros(x.shape[0], layer.weight.shape[0], dtype=x.dtype, device="cuda") @@ -143,8 +167,22 @@ def apply_weights(self, return output -def per_tensor_quantize(tensor: torch.Tensor) -> tuple[torch.Tensor, float]: +def per_tensor_quantize_static(tensor: torch.Tensor, inv_scale: float) -> torch.Tensor: """Quantize a tensor using per-tensor static scaling factor. + Args: + tensor: The input tensor. + inv_scale: The scale. 
+ """ + # Scale and clamp the tensor to bring it to + # the representative range of float8 data type + # (as default cast is unsaturated) + finfo = torch.finfo(torch.float8_e4m3fn) + qweight = (tensor / inv_scale).clamp(min=finfo.min, max=finfo.max) + return qweight.to(torch.float8_e4m3fn) + + +def per_tensor_quantize_dyanmic(tensor: torch.Tensor) -> tuple[torch.Tensor, float]: + """Quantize a tensor using per-tensor dynamic scaling factor. Args: tensor: The input tensor. """ From 1a3b2e1ead4ed17a00f8f22f70ce05d3f3b436b6 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 25 Apr 2024 14:12:22 +0000 Subject: [PATCH 43/90] fixed example --- run_fp8.py | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/run_fp8.py b/run_fp8.py index 19f991902d486..1ef12c3f352f9 100644 --- a/run_fp8.py +++ b/run_fp8.py @@ -1,12 +1,33 @@ from vllm import LLM from transformers import AutoTokenizer +import argparse -model = LLM("nm-testing/mistral-fp8-static", enforce_eager=True, max_model_len=1024) -tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2") +parser = argparse.ArgumentParser() +parser.add_argument("--type", choices=["static", "dynamic"]) -prompt = tokenizer.apply_chat_template([{"role": "user", "content": "What is your name"}], tokenize=False, add_generation_prompt=True) -print(f"----- Prompt: {prompt}") +if __name__ == "__main__": + args = parser.parse_args() + + if args.type == "static": + model_name = "nm-testing/mistral-fp8-static" + elif args.type == "dynamic": + model_name = "nm-testing/mistral-fp8-dynamic" + else: + raise ValueError("--type should be `static` or `dynamic`") -outputs = model.generate(prompt) -generation = outputs[0].outputs[0].text -print(f"----- Generation: {generation}") + tokenizer_name = "mistralai/Mistral-7B-Instruct-v0.2" + + model = LLM( + model_name, + tokenizer=tokenizer_name, + enforce_eager=True, + max_model_len=1024) + + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + + prompt = tokenizer.apply_chat_template([{"role": "user", "content": "What is your name"}], tokenize=False, add_generation_prompt=True) + print(f"----- Prompt: {prompt}") + + outputs = model.generate(prompt) + generation = outputs[0].outputs[0].text + print(f"----- Generation: {generation}") From 63ad2ef747d046733fe1b671cded43b1beaba914 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Thu, 25 Apr 2024 10:25:22 -0400 Subject: [PATCH 44/90] Delete quantize.ipynb --- quantize.ipynb | 250 ------------------------------------------------- 1 file changed, 250 deletions(-) delete mode 100644 quantize.ipynb diff --git a/quantize.ipynb b/quantize.ipynb deleted file mode 100644 index 17efdc17fd39b..0000000000000 --- a/quantize.ipynb +++ /dev/null @@ -1,250 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/paperspace/env/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n", - "Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00, 1.94it/s]\n" - ] - } - ], - "source": [ - "import torch\n", - "from transformers import AutoModelForCausalLM\n", - "\n", - "model = AutoModelForCausalLM.from_pretrained(\n", - " \"mistralai/Mistral-7B-Instruct-v0.2\",\n", - " torch_dtype=torch.bfloat16,\n", - ")\n", - "model = model.to(\"cuda\")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Tuple\n", - "\n", - "def per_tensor_quantize(tensor: torch.Tensor) -> Tuple[torch.Tensor, float]:\n", - " \"\"\"Quantize a tensor using per-tensor static scaling factor.\n", - "\n", - " Args:\n", - " tensor: The input tensor.\n", - " \"\"\"\n", - " finfo = torch.finfo(torch.float8_e4m3fn)\n", - " # Calculate the scale as dtype max divided by absmax.\n", - " # Since .abs() creates a new tensor, we use aminmax to get\n", - " # the min and max first and then calculate the absmax.\n", - " min_val, max_val = tensor.aminmax()\n", - " amax = min_val.abs().max(max_val.abs())\n", - " scale = finfo.max / amax.clamp(min=1e-12)\n", - " # scale and clamp the tensor to bring it to\n", - " # the representative range of float8 data type\n", - " # (as default cast is unsaturated)\n", - " qweight = (tensor * scale).clamp(min=finfo.min, max=finfo.max)\n", - " # Return both float8 data and the inverse scale (as float),\n", - " # as both required as inputs to torch._scaled_mm\n", - " qweight = qweight.to(torch.float8_e4m3fn)\n", - " scale = scale.float().reciprocal()\n", - " return qweight, scale" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "class LinearFP8(torch.nn.Module):\n", - " def __init__(self, qweight, scale):\n", - " super().__init__()\n", - " self.weight = torch.nn.Parameter(qweight, requires_grad=False)\n", - " self.weight_scale = torch.nn.Parameter(scale, requires_grad=False)\n", - " \n", - " def forward(self, x):\n", - " shape = x.shape\n", - " x = x.reshape(-1, shape[-1])\n", - " qinput, x_scale = per_tensor_quantize(x)\n", - " \n", - " output, _ = torch._scaled_mm(\n", - " qinput,\n", - " self.weight.t(),\n", - " out_dtype=x.dtype,\n", - " scale_a=x_scale,\n", - " scale_b=self.weight_scale,\n", - " bias=None,\n", - " )\n", - " return output.reshape(shape[0], shape[1], -1)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "SELF_ATTN_WEIGHTS = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\"]\n", - "MLP_WEIGHTS = [\"gate_proj\", \"up_proj\", \"down_proj\"]\n", - "\n", - "def quantize_proj(module, proj_name):\n", - " proj = getattr(module, proj_name)\n", - " quant_weight, quant_scale = per_tensor_quantize(proj.weight)\n", - " quant_proj = LinearFP8(quant_weight, quant_scale)\n", - " \n", - " del proj\n", - " setattr(module, proj_name, quant_proj)\n", - "\n", - "for layer in model.model.layers:\n", - " for proj_name in SELF_ATTN_WEIGHTS:\n", - " quantize_proj(layer.self_attn, proj_name)\n", - " for proj_name in MLP_WEIGHTS:\n", - " quantize_proj(layer.mlp, proj_name)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "MistralForCausalLM(\n", - " (model): MistralModel(\n", - " (embed_tokens): Embedding(32000, 4096)\n", - " (layers): ModuleList(\n", - " (0-31): 32 x MistralDecoderLayer(\n", 
- " (self_attn): MistralSdpaAttention(\n", - " (q_proj): LinearFP8()\n", - " (k_proj): LinearFP8()\n", - " (v_proj): LinearFP8()\n", - " (o_proj): LinearFP8()\n", - " (rotary_emb): MistralRotaryEmbedding()\n", - " )\n", - " (mlp): MistralMLP(\n", - " (gate_proj): LinearFP8()\n", - " (up_proj): LinearFP8()\n", - " (down_proj): LinearFP8()\n", - " (act_fn): SiLU()\n", - " )\n", - " (input_layernorm): MistralRMSNorm()\n", - " (post_attention_layernorm): MistralRMSNorm()\n", - " )\n", - " )\n", - " (norm): MistralRMSNorm()\n", - " )\n", - " (lm_head): Linear(in_features=4096, out_features=32000, bias=False)\n", - ")" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "from transformers import AutoTokenizer\n", - "tokenizer = AutoTokenizer.from_pretrained(\n", - " \"mistralai/Mistral-7B-Instruct-v0.2\"\n", - ")\n", - "tokenizer.pad_token_id = tokenizer.eos_token_id" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "input_ids = tokenizer.apply_chat_template(\n", - " [{\"role\": \"user\", \"content\": \"What is your name?\" }],\n", - " return_tensors=\"pt\"\n", - ").to(\"cuda\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", - "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " [INST] What is your name? [/INST] I don't have a name. 
I'm just a computer program designed to assist with information\n" - ] - } - ], - "source": [ - "output = model.generate(input_ids=input_ids, max_new_tokens=20)\n", - "print(tokenizer.decode(output[0]))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# hacked transformers/modeling_utils/dtype_byte_size to make this work\n", - "model.save_pretrained(\"mistral-fp8-static\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mod" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "env", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 794f1a185e179ed4e3a59e27e1d55cc0064e8283 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 25 Apr 2024 13:45:11 -0700 Subject: [PATCH 45/90] Update vllm/_custom_ops.py Co-authored-by: Woosuk Kwon --- vllm/_custom_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 9d40ec4928276..40fdb2cef380f 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -160,7 +160,7 @@ def scaled_fp8_quant( ) -> Tuple[torch.Tensor, torch.Tensor]: output = torch.empty_like(input, dtype=torch.float8_e4m3fn) if scale is None: - scale = torch.zeros(1, device=input.device, dtype=torch.float32) + scale = torch.empty(1, device=input.device, dtype=torch.float32) vllm_ops.dynamic_scaled_fp8_quant(output, input, scale) else: vllm_ops.static_scaled_fp8_quant(output, input, scale) From c13b6a49ac5ab715e6ca5d38d747ddfc68876018 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 25 Apr 2024 14:08:37 -0700 Subject: [PATCH 46/90] update --- vllm/model_executor/layers/quantization/fp8.py | 8 ++++---- vllm/model_executor/models/mixtral.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index ab562288a8f7d..271cddc1175a5 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -15,9 +15,9 @@ class FP8Config(QuantizationConfig): def __init__( self, - act_scaling: str = "dynamic", + activation_scheme: str = "dynamic", ) -> None: - self.act_scaling = act_scaling + self.activation_scheme = activation_scheme @classmethod def get_name(cls) -> str: @@ -44,8 +44,8 @@ def get_config_filenames(cls) -> List[str]: @classmethod def from_config(cls, config: Dict[str, Any]) -> "FP8Config": - act_scaling = cls.get_from_keys(config, ["act_scaling"]) - return cls(act_scaling) + activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) + return cls(activation_scheme) def get_linear_method(self) -> "Fp8LinearMethod": return Fp8LinearMethod(self) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 4439a75e5b57f..6f6c93b273335 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -123,7 +123,7 @@ def __init__( # Scaling factors for FP8 activations need_act_scales = (self.use_fp8 and - linear_method.quant_config.act_scaling == "static") + linear_method.quant_config.activation_scheme == "static") self.as_scale = nn.Parameter( 
torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None From 5a230ed4794899bc20518f9cf48cc59ed40a97d4 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 25 Apr 2024 14:12:54 -0700 Subject: [PATCH 47/90] update --- vllm/model_executor/models/mixtral.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 6f6c93b273335..c7199338b5525 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -123,7 +123,8 @@ def __init__( # Scaling factors for FP8 activations need_act_scales = (self.use_fp8 and - linear_method.quant_config.activation_scheme == "static") + linear_method.quant_config.activation_scheme + == "static") self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None From 80069c977123957a4e814dcb2485cb14cf5b3bcf Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 25 Apr 2024 14:17:00 -0700 Subject: [PATCH 48/90] format --- vllm/model_executor/models/mixtral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index c7199338b5525..ad6f43fbd4aa0 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -122,8 +122,8 @@ def __init__( requires_grad=False) if self.use_fp8 else None # Scaling factors for FP8 activations - need_act_scales = (self.use_fp8 and - linear_method.quant_config.activation_scheme + need_act_scales = (self.use_fp8 + and linear_method.quant_config.activation_scheme == "static") self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), From 5ce17d0cec2d7f43ebff895536f29ef356009534 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 25 Apr 2024 14:27:41 -0700 Subject: [PATCH 49/90] activation_scale -> act_scale --- vllm/model_executor/models/mixtral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index ad6f43fbd4aa0..dad1c43e1b0da 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -153,7 +153,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, shard_size:2 * shard_size, :] = loaded_weight[shard, :] if weight_name.endswith("w2.weight"): param_data[expert_id, :, :] = loaded_weight[:, shard] - if "activation_scale" in weight_name: + if "act_scale" in weight_name: param_data[:] = param_data[:].max(loaded_weight) def process_weights_after_loading(self): @@ -476,7 +476,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # These are the activation scales for the experts # (param_name, weight_name, expert_id) ("as_scale" if weight_name in ["w1", "w3"] else "a2s_scale", - f"experts.{expert_id}.{weight_name}.activation_scale", expert_id) + f"experts.{expert_id}.{weight_name}.act_scale", expert_id) for expert_id in range(self.config.num_local_experts) for weight_name in ["w1", "w2", "w3"] ] From 5fc033516381b4f36f6e4a2620352789edb8cf1b Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 25 Apr 2024 21:48:33 +0000 Subject: [PATCH 50/90] Update scheme->activation_scheme --- .../layers/quantization/fp8_serialized.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8_serialized.py 
b/vllm/model_executor/layers/quantization/fp8_serialized.py index c187bed817412..d9f21dab8fb27 100644 --- a/vllm/model_executor/layers/quantization/fp8_serialized.py +++ b/vllm/model_executor/layers/quantization/fp8_serialized.py @@ -13,10 +13,10 @@ class FP8Config(QuantizationConfig): """Config class for FP8.""" def __init__( self, - scheme: str, + activation_scheme: str, ) -> None: - assert scheme == "static" or scheme == "dynamic" - self.scheme = scheme + assert activation_scheme == "static" or activation_scheme == "dynamic" + self.activation_scheme = activation_scheme @classmethod def get_name(cls) -> str: @@ -36,8 +36,8 @@ def get_config_filenames(cls) -> List[str]: @classmethod def from_config(cls, config: Dict[str, Any]) -> "FP8Config": - scheme = cls.get_from_keys(config, ["scheme"]) - return cls(scheme=scheme) + activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) + return cls(activation_scheme=activation_scheme) def get_linear_method(self) -> "FP8LinearMethod": return FP8LinearMethod(self) @@ -87,7 +87,7 @@ def create_weights( "shard_indexer": self.scales_shard_indexer, }) - if self.quant_config.scheme == "static": + if self.quant_config.activation_scheme == "static": act_scale = Parameter( torch.empty(len(output_partition_sizes), dtype=torch.float32), requires_grad=False @@ -140,9 +140,9 @@ def apply_weights(self, w_scale = layer.weight_scale.max() - if self.quant_config.scheme == "dynamic": + if self.quant_config.activation_scheme == "dynamic": qinput, x_scale = per_tensor_quantize_dyanmic(x) - elif self.quant_config.scheme == "static": + elif self.quant_config.activation_scheme == "static": # empirically, these are all the same x_scale = layer.act_scale.max() qinput = per_tensor_quantize_static(x, x_scale) From 92d5162ee6298f2557ded542d5157f7bf3db097d Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 25 Apr 2024 14:50:36 -0700 Subject: [PATCH 51/90] fix dynamic scaling -- need init to zero due to atomic update --- vllm/_custom_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 40fdb2cef380f..9d40ec4928276 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -160,7 +160,7 @@ def scaled_fp8_quant( ) -> Tuple[torch.Tensor, torch.Tensor]: output = torch.empty_like(input, dtype=torch.float8_e4m3fn) if scale is None: - scale = torch.empty(1, device=input.device, dtype=torch.float32) + scale = torch.zeros(1, device=input.device, dtype=torch.float32) vllm_ops.dynamic_scaled_fp8_quant(output, input, scale) else: vllm_ops.static_scaled_fp8_quant(output, input, scale) From e1bfe1027e1726de51bb33fc6327623650d538a5 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 25 Apr 2024 22:05:19 +0000 Subject: [PATCH 52/90] Format --- run_fp8.py | 26 ++++++---- vllm/model_executor/layers/linear.py | 32 ++++++------ .../layers/quantization/fp8_serialized.py | 50 ++++++++++--------- 3 files changed, 58 insertions(+), 50 deletions(-) diff --git a/run_fp8.py b/run_fp8.py index 1ef12c3f352f9..113dca8401494 100644 --- a/run_fp8.py +++ b/run_fp8.py @@ -1,13 +1,15 @@ -from vllm import LLM +import argparse + from transformers import AutoTokenizer -import argparse + +from vllm import LLM parser = argparse.ArgumentParser() parser.add_argument("--type", choices=["static", "dynamic"]) if __name__ == "__main__": args = parser.parse_args() - + if args.type == "static": model_name = "nm-testing/mistral-fp8-static" elif args.type == "dynamic": @@ -17,15 +19,19 @@ tokenizer_name = "mistralai/Mistral-7B-Instruct-v0.2" - 
model = LLM( - model_name, - tokenizer=tokenizer_name, - enforce_eager=True, - max_model_len=1024) - + model = LLM(model_name, + tokenizer=tokenizer_name, + enforce_eager=True, + max_model_len=1024) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - prompt = tokenizer.apply_chat_template([{"role": "user", "content": "What is your name"}], tokenize=False, add_generation_prompt=True) + prompt = tokenizer.apply_chat_template([{ + "role": "user", + "content": "What is your name" + }], + tokenize=False, + add_generation_prompt=True) print(f"----- Prompt: {prompt}") outputs = model.generate(prompt) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 01af70a34c488..61670b34e16b4 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -285,7 +285,7 @@ def weight_loader(self, param_data = param.data output_dim = getattr(param, "output_dim", None) is_metadata = getattr(param, "is_metadata", False) - + # TODO: document. # TODO: sync with is_metadata. # For loading scales. @@ -293,14 +293,12 @@ def weight_loader(self, if output_dim is not None and shard_indexer is not None: raise NotImplementedError( "We do not currently support output_dim != None and " - "shard_indexer != None for a parameter. Please open an issue." - ) + "shard_indexer != None for a parameter. Please open an issue.") if loaded_shard_id is None and shard_indexer is not None: raise NotImplementedError( "We do not currently support loaded_shard_id == None and " - "shard_indexer != None for a parameter. Please open an issue." - ) - + "shard_indexer != None for a parameter. Please open an issue.") + if loaded_shard_id is None: # Loaded weight is already packed. if output_dim is None: @@ -358,13 +356,14 @@ def weight_loader(self, shard_size = loaded_weight.shape[0] shard_offset = loaded_shard_id * shard_size param_data = param_data.narrow(0, shard_offset, shard_size) - + # TODO: sync with is_metadata UX. # If a param_shard_splitter is defined by the LinearMethod, use it. elif shard_indexer is not None: - param_data, loaded_weight = shard_indexer( - param_data, loaded_weight, loaded_shard_id) - + param_data, loaded_weight = shard_indexer(param_data, + loaded_weight, + loaded_shard_id) + else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: @@ -446,19 +445,17 @@ def weight_loader(self, param_data = param.data output_dim = getattr(param, "output_dim", None) is_metadata = getattr(param, "is_metadata", False) - + # TODO: sync with is_metadata UX shard_indexer = getattr(param, "shard_indexer", None) if output_dim is not None and shard_indexer is not None: raise NotImplementedError( "We do not currently support output_dim != None and " - "shard_indexer != None for a parameter. Please open an issue." - ) + "shard_indexer != None for a parameter. Please open an issue.") if loaded_shard_id is None and shard_indexer is not None: raise NotImplementedError( "We do not currently support loaded_shard_id == None and " - "shard_indexer != None for a parameter. Please open an issue." - ) + "shard_indexer != None for a parameter. Please open an issue.") if loaded_shard_id is None: # Loaded weight is already packed. @@ -535,8 +532,9 @@ def weight_loader(self, # TODO: sync with QKV # If a param_shard_splitter is defined by the LinearMethod, use it. 
elif shard_indexer is not None: - param_data, loaded_weight = shard_indexer( - param_data, loaded_weight, loaded_shard_id) + param_data, loaded_weight = shard_indexer(param_data, + loaded_weight, + loaded_shard_id) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: diff --git a/vllm/model_executor/layers/quantization/fp8_serialized.py b/vllm/model_executor/layers/quantization/fp8_serialized.py index d9f21dab8fb27..1179dc07a48d6 100644 --- a/vllm/model_executor/layers/quantization/fp8_serialized.py +++ b/vllm/model_executor/layers/quantization/fp8_serialized.py @@ -11,6 +11,7 @@ class FP8Config(QuantizationConfig): """Config class for FP8.""" + def __init__( self, activation_scheme: str, @@ -45,6 +46,7 @@ def get_linear_method(self) -> "FP8LinearMethod": def get_scaled_act_names(self) -> List[str]: return [] + class FP8LinearMethod(LinearMethodBase): """Linear method for StaticFP8 . @@ -66,7 +68,7 @@ def create_weights( **extra_weight_attrs, ): del input_size, output_size, params_dtype - + weight = Parameter(torch.empty(sum(output_partition_sizes), input_size_per_partition, dtype=torch.float8_e4m3fn), @@ -75,12 +77,12 @@ def create_weights( set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) set_weight_attrs(weight, extra_weight_attrs) - weight_scale = Parameter( - torch.empty( - len(output_partition_sizes), - device='cuda', dtype=torch.float32, - ), requires_grad=False - ) + weight_scale = Parameter(torch.empty( + len(output_partition_sizes), + device='cuda', + dtype=torch.float32, + ), + requires_grad=False) layer.register_parameter("weight_scale", weight_scale) set_weight_attrs(weight_scale, extra_weight_attrs) set_weight_attrs(weight_scale, { @@ -88,10 +90,9 @@ def create_weights( }) if self.quant_config.activation_scheme == "static": - act_scale = Parameter( - torch.empty(len(output_partition_sizes), dtype=torch.float32), - requires_grad=False - ) + act_scale = Parameter(torch.empty(len(output_partition_sizes), + dtype=torch.float32), + requires_grad=False) layer.register_parameter("act_scale", act_scale) set_weight_attrs(act_scale, extra_weight_attrs) set_weight_attrs(act_scale, { @@ -100,14 +101,11 @@ def create_weights( layer.logical_widths = output_partition_sizes - def shard_id_as_int( - self, - shard_id: Union[str, int] - ) -> int: + def shard_id_as_int(self, shard_id: Union[str, int]) -> int: if isinstance(shard_id, int): return shard_id assert isinstance(shard_id, str) - qkv_idxs = { "q": 0, "k": 1, "v": 2 } + qkv_idxs = {"q": 0, "k": 1, "v": 2} assert shard_id in qkv_idxs return qkv_idxs[shard_id] @@ -119,12 +117,12 @@ def shard_id_as_int( # logical_widths: torch.Tensor # ) -> Tuple[torch.Tensor, torch.Tensor]: # shard_id = self.shard_id_as_int(shard_id) - # offset = sum(logical_widths[:shard_id]) + # offset = sum(logical_widths[:shard_id]) # size = logical_widths[shard_id] # # update loaded weight with copies for broadcast. 
# loaded_weight = loaded_weight.repeat(size) # return param[offset : offset + size], loaded_weight - + def scales_shard_indexer( self, param: torch.Tensor, @@ -146,11 +144,15 @@ def apply_weights(self, # empirically, these are all the same x_scale = layer.act_scale.max() qinput = per_tensor_quantize_static(x, x_scale) - + # FOR LOOP TO BE REPLACED BY CUTLASS KERNEL W/ EPILOGUE FUSION - output = torch.zeros(x.shape[0], layer.weight.shape[0], dtype=x.dtype, device="cuda") + output = torch.zeros(x.shape[0], + layer.weight.shape[0], + dtype=x.dtype, + device="cuda") start_offset = 0 - for _, (logical_width, w_scale) in enumerate(zip(layer.logical_widths, layer.weight_scale)): + for _, (logical_width, w_scale) in enumerate( + zip(layer.logical_widths, layer.weight_scale)): end_offset = start_offset + logical_width out, _ = torch._scaled_mm( @@ -167,7 +169,8 @@ def apply_weights(self, return output -def per_tensor_quantize_static(tensor: torch.Tensor, inv_scale: float) -> torch.Tensor: +def per_tensor_quantize_static(tensor: torch.Tensor, + inv_scale: float) -> torch.Tensor: """Quantize a tensor using per-tensor static scaling factor. Args: tensor: The input tensor. @@ -181,7 +184,8 @@ def per_tensor_quantize_static(tensor: torch.Tensor, inv_scale: float) -> torch. return qweight.to(torch.float8_e4m3fn) -def per_tensor_quantize_dyanmic(tensor: torch.Tensor) -> tuple[torch.Tensor, float]: +def per_tensor_quantize_dyanmic( + tensor: torch.Tensor) -> tuple[torch.Tensor, float]: """Quantize a tensor using per-tensor dynamic scaling factor. Args: tensor: The input tensor. From 72426000873ab6b07e7cf41b65d38ed915ecf379 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 25 Apr 2024 22:34:55 +0000 Subject: [PATCH 53/90] Fix tuple type --- vllm/model_executor/layers/quantization/fp8_serialized.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/fp8_serialized.py b/vllm/model_executor/layers/quantization/fp8_serialized.py index 1179dc07a48d6..1d0b44f37b155 100644 --- a/vllm/model_executor/layers/quantization/fp8_serialized.py +++ b/vllm/model_executor/layers/quantization/fp8_serialized.py @@ -185,7 +185,7 @@ def per_tensor_quantize_static(tensor: torch.Tensor, def per_tensor_quantize_dyanmic( - tensor: torch.Tensor) -> tuple[torch.Tensor, float]: + tensor: torch.Tensor) -> Tuple[torch.Tensor, float]: """Quantize a tensor using per-tensor dynamic scaling factor. Args: tensor: The input tensor. 
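
Taken together, patches 42 through 53 settle on one per-tensor FP8 recipe: weights are stored as float8_e4m3fn with a float32 scale per logical weight, activations are quantized per tensor with either a calibrated "static" scale shipped in the checkpoint or a "dynamic" scale computed from the runtime absmax, and the GEMM dequantizes on the way out. Below is a minimal sketch of that recipe, mirroring the helpers in fp8_serialized.py; the names are illustrative rather than vLLM's public API, an FP8-capable GPU is assumed, and the call uses the PyTorch 2.2-era torch._scaled_mm, which returns an (output, amax) pair.

import torch

FP8 = torch.float8_e4m3fn
FINFO = torch.finfo(FP8)

def quantize_dynamic(t: torch.Tensor):
    # Dynamic scheme: derive the per-tensor scale from the runtime absmax,
    # clamped so an all-zero tensor cannot produce a zero scale.
    scale = (t.abs().amax().clamp(min=1e-12) / FINFO.max).float()
    q = (t / scale).clamp(min=FINFO.min, max=FINFO.max).to(FP8)
    return q, scale

def quantize_static(t: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Static scheme: the scale was calibrated offline and loaded from the
    # checkpoint (the act_scale parameter created in create_weights).
    return (t / scale).clamp(min=FINFO.min, max=FINFO.max).to(FP8)

def fp8_linear(x, qweight, w_scale, act_scale=None):
    # FP8 x FP8 GEMM; _scaled_mm folds both dequantization scales into the
    # epilogue, so the result comes back in the activation dtype.
    if act_scale is None:
        qx, x_scale = quantize_dynamic(x)
    else:
        qx, x_scale = quantize_static(x, act_scale), act_scale
    out, _ = torch._scaled_mm(qx, qweight.t(), out_dtype=x.dtype,
                              scale_a=x_scale, scale_b=w_scale)
    return out

The static path is what the act_scaling/activation_scheme switch selects: it skips the absmax reduction over the activation on every forward pass, at the cost of an offline calibration step.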
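The static checkpoints used by run_fp8.py (for example nm-testing/mistral-fp8-static) must already contain those act_scale values, and nothing in this series produces them. A rough sketch of such an offline calibration pass follows; it is a hypothetical helper rather than anything in this PR. It tracks the running absmax of each linear layer's input over a handful of batches and converts it to a per-tensor scale with the same absmax / fp8_max convention as the kernels above.

import torch

@torch.no_grad()
def calibrate_act_scales(model, calibration_batches, tracked_modules):
    finfo = torch.finfo(torch.float8_e4m3fn)
    amax = {mod: torch.zeros((), device="cuda") for mod in tracked_modules}

    def make_hook(mod):
        def hook(module, args):
            # args[0] is the activation flowing into the linear layer.
            amax[mod].copy_(torch.maximum(amax[mod], args[0].abs().amax()))
        return hook

    handles = [mod.register_forward_pre_hook(make_hook(mod))
               for mod in tracked_modules]
    for batch in calibration_batches:
        model(batch)
    for handle in handles:
        handle.remove()
    # scale = absmax / fp8_max, so dequantization multiplies by the scale.
    return {mod: (amax[mod] / finfo.max).float() for mod in amax}
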
From 21ddbb4049fef3f2cfaa0d3bfe89fe96b6e5ed92 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 26 Apr 2024 11:09:08 +0000 Subject: [PATCH 54/90] stash tyler's state --- requirements-cuda.txt | 1 + run_fp8.py | 2 +- .../layers/fused_gemm_dq/__init__.py | 5 + vllm/model_executor/layers/linear.py | 8 +- .../layers/quantization/fp8_serialized.py | 112 ++++++++++-------- 5 files changed, 75 insertions(+), 53 deletions(-) create mode 100644 vllm/model_executor/layers/fused_gemm_dq/__init__.py diff --git a/requirements-cuda.txt b/requirements-cuda.txt index 1bddae4c6f40f..b873423911488 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -7,3 +7,4 @@ nvidia-ml-py # for pynvml package vllm-nccl-cu12>=2.18,<2.19 # for downloading nccl library torch == 2.2.1 xformers == 0.0.25 # Requires PyTorch 2.2.1 +nvidia-cutlass diff --git a/run_fp8.py b/run_fp8.py index 113dca8401494..5c4b4529cdbb1 100644 --- a/run_fp8.py +++ b/run_fp8.py @@ -11,7 +11,7 @@ args = parser.parse_args() if args.type == "static": - model_name = "nm-testing/mistral-fp8-static" + model_name = "nm-testing/Meta-Llama-3-8B-Instruct-FP8" elif args.type == "dynamic": model_name = "nm-testing/mistral-fp8-dynamic" else: diff --git a/vllm/model_executor/layers/fused_gemm_dq/__init__.py b/vllm/model_executor/layers/fused_gemm_dq/__init__.py new file mode 100644 index 0000000000000..2acb1c2aad6c7 --- /dev/null +++ b/vllm/model_executor/layers/fused_gemm_dq/__init__.py @@ -0,0 +1,5 @@ +from vllm.model_executor.layers.fused_gemm_dq.fused_gemm_dq_fp8 import fused_gemm_dq_fp8 + +__all__ = [ + "fused_gemm_dq_fp8", +] \ No newline at end of file diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 0f590a98688fb..87b2bb7d9acce 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -300,6 +300,7 @@ def weight_loader(self, # TODO: sync with is_metadata. # For loading scales. 
shard_indexer = getattr(param, "shard_indexer", None) + logical_widths = getattr(param, "logical_widths", None) if output_dim is not None and shard_indexer is not None: raise NotImplementedError( "We do not currently support output_dim != None and " @@ -372,7 +373,8 @@ def weight_loader(self, elif shard_indexer is not None: param_data, loaded_weight = shard_indexer(param_data, loaded_weight, - loaded_shard_id) + loaded_shard_id, + logical_widths) else: ignore_warning = getattr(param, "ignore_warning", False) @@ -458,6 +460,7 @@ def weight_loader(self, # TODO: sync with is_metadata UX shard_indexer = getattr(param, "shard_indexer", None) + logical_widths = getattr(param, "logical_widths", None) if output_dim is not None and shard_indexer is not None: raise NotImplementedError( "We do not currently support output_dim != None and " @@ -544,7 +547,8 @@ def weight_loader(self, elif shard_indexer is not None: param_data, loaded_weight = shard_indexer(param_data, loaded_weight, - loaded_shard_id) + loaded_shard_id, + logical_widths) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: diff --git a/vllm/model_executor/layers/quantization/fp8_serialized.py b/vllm/model_executor/layers/quantization/fp8_serialized.py index 1d0b44f37b155..da062a8a1492b 100644 --- a/vllm/model_executor/layers/quantization/fp8_serialized.py +++ b/vllm/model_executor/layers/quantization/fp8_serialized.py @@ -3,6 +3,7 @@ import torch from torch.nn.parameter import Parameter +from vllm.model_executor.layers.fused_gemm_dq import fused_gemm_dq_fp8 from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import ( @@ -68,7 +69,10 @@ def create_weights( **extra_weight_attrs, ): del input_size, output_size, params_dtype + num_logical_weights = len(output_partition_sizes) + layer.logical_widths = output_partition_sizes + # WEIGHT weight = Parameter(torch.empty(sum(output_partition_sizes), input_size_per_partition, dtype=torch.float8_e4m3fn), @@ -76,19 +80,29 @@ def create_weights( layer.register_parameter("weight", weight) set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) set_weight_attrs(weight, extra_weight_attrs) + + # WEIGHT SCALE + if num_logical_weights == 1: + weight_scale = Parameter( + torch.empty(len(output_partition_sizes), dtype=torch.float32), + requires_grad=False) + + layer.register_parameter("weight_scale", weight_scale) + set_weight_attrs(weight_scale, extra_weight_attrs) + set_weight_attrs(weight_scale, { + "shard_indexer": self.scales_shard_indexer, + }) + else: + weight_scale = Parameter( + torch.empty(sum(output_partition_sizes), dtype=torch.float32), + requires_grad=False) + layer.register_parameter("weight_scale", weight_scale) + set_weight_attrs(weight_scale, extra_weight_attrs) + set_weight_attrs(weight_scale, { + "shard_indexer": self.scales_shard_indexer, + }) - weight_scale = Parameter(torch.empty( - len(output_partition_sizes), - device='cuda', - dtype=torch.float32, - ), - requires_grad=False) - layer.register_parameter("weight_scale", weight_scale) - set_weight_attrs(weight_scale, extra_weight_attrs) - set_weight_attrs(weight_scale, { - "shard_indexer": self.scales_shard_indexer, - }) - + # ACT Scale if self.quant_config.activation_scheme == "static": act_scale = Parameter(torch.empty(len(output_partition_sizes), dtype=torch.float32), @@ -99,8 +113,6 @@ def create_weights( "shard_indexer": self.scales_shard_indexer, }) - layer.logical_widths = output_partition_sizes - def 
shard_id_as_int(self, shard_id: Union[str, int]) -> int: if isinstance(shard_id, int): return shard_id @@ -109,26 +121,28 @@ def shard_id_as_int(self, shard_id: Union[str, int]) -> int: assert shard_id in qkv_idxs return qkv_idxs[shard_id] - # def scales_shard_splitter_NKK( - # self, - # param: torch.Tensor, - # loaded_weight: torch.Tensor, - # shard_id: Union[str, int], - # logical_widths: torch.Tensor - # ) -> Tuple[torch.Tensor, torch.Tensor]: - # shard_id = self.shard_id_as_int(shard_id) - # offset = sum(logical_widths[:shard_id]) - # size = logical_widths[shard_id] - # # update loaded weight with copies for broadcast. - # loaded_weight = loaded_weight.repeat(size) - # return param[offset : offset + size], loaded_weight + def scales_shard_indexer_NKK( + self, + param: torch.Tensor, + loaded_weight: torch.Tensor, + shard_id: Union[str, int], + logical_widths, + ) -> Tuple[torch.Tensor, torch.Tensor]: + shard_id = self.shard_id_as_int(shard_id) + offset = sum(logical_widths[:shard_id]) + size = logical_widths[shard_id] + # update loaded weight with copies for broadcast. + loaded_weight = loaded_weight.repeat(size) + return param[offset:offset + size], loaded_weight def scales_shard_indexer( self, param: torch.Tensor, loaded_weight: torch.Tensor, shard_id: Union[str, int], + logical_widths, ) -> Tuple[torch.Tensor, torch.Tensor]: + del logical_widths return param[self.shard_id_as_int(shard_id)], loaded_weight def apply_weights(self, @@ -136,7 +150,7 @@ def apply_weights(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - w_scale = layer.weight_scale.max() + m = x.shape[0] if self.quant_config.activation_scheme == "dynamic": qinput, x_scale = per_tensor_quantize_dyanmic(x) @@ -145,28 +159,26 @@ def apply_weights(self, x_scale = layer.act_scale.max() qinput = per_tensor_quantize_static(x, x_scale) - # FOR LOOP TO BE REPLACED BY CUTLASS KERNEL W/ EPILOGUE FUSION - output = torch.zeros(x.shape[0], - layer.weight.shape[0], - dtype=x.dtype, - device="cuda") - start_offset = 0 - for _, (logical_width, w_scale) in enumerate( - zip(layer.logical_widths, layer.weight_scale)): - end_offset = start_offset + logical_width - - out, _ = torch._scaled_mm( - qinput, - layer.weight[start_offset:end_offset, :].t(), - out_dtype=x.dtype, - scale_a=x_scale, - scale_b=w_scale, - bias=bias, - ) - output[:, start_offset:end_offset] = out - start_offset = end_offset - - return output + # HACK: CUTLASS hits an illegal instruction for fp8 when m == 1, + # so pad m to 2 + if m == 1: + tmp = torch.zeros(2, + qinput.shape[1], + dtype=qinput.dtype, + device="cuda") + tmp[0, :] = qinput + qinput = tmp + + out_cutlass_qkv = fused_gemm_dq_fp8( + qinput, + layer.weight.t(), + out_dtype=x.dtype, + scale_a=x_scale, + scale_b=layer.weight_scale, + bias=bias, + ) + + return out_cutlass_qkv[:m, :] def per_tensor_quantize_static(tensor: torch.Tensor, From d27015cc9bdd3686b632e7e74629b918b5db8b1a Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 26 Apr 2024 11:20:37 +0000 Subject: [PATCH 55/90] stash --- run_fp8.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/run_fp8.py b/run_fp8.py index 5c4b4529cdbb1..702f8e6df9eea 100644 --- a/run_fp8.py +++ b/run_fp8.py @@ -11,13 +11,14 @@ args = parser.parse_args() if args.type == "static": - model_name = "nm-testing/Meta-Llama-3-8B-Instruct-FP8" + model_name = "nm-testing/mistral-fp8-static" elif args.type == "dynamic": model_name = "nm-testing/mistral-fp8-dynamic" else: raise ValueError("--type should be `static` or `dynamic`") - 
tokenizer_name = "mistralai/Mistral-7B-Instruct-v0.2" + # tokenizer_name = "mistralai/Mistral-7B-Instruct-v0.2" + tokenizer_name = model_name model = LLM(model_name, tokenizer=tokenizer_name, @@ -29,9 +30,7 @@ prompt = tokenizer.apply_chat_template([{ "role": "user", "content": "What is your name" - }], - tokenize=False, - add_generation_prompt=True) + }], tokenize=False, add_generation_prompt=True) print(f"----- Prompt: {prompt}") outputs = model.generate(prompt) From 1111f8789b8008a5263615ee22be66e777312b39 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 26 Apr 2024 11:37:45 +0000 Subject: [PATCH 56/90] cutlass working, but slow jitting on hotpath --- run_fp8.py | 6 +----- vllm/model_executor/layers/quantization/fp8_serialized.py | 7 +++++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/run_fp8.py b/run_fp8.py index 702f8e6df9eea..2c1b071cc5e1b 100644 --- a/run_fp8.py +++ b/run_fp8.py @@ -17,15 +17,11 @@ else: raise ValueError("--type should be `static` or `dynamic`") - # tokenizer_name = "mistralai/Mistral-7B-Instruct-v0.2" - tokenizer_name = model_name - model = LLM(model_name, - tokenizer=tokenizer_name, enforce_eager=True, max_model_len=1024) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) prompt = tokenizer.apply_chat_template([{ "role": "user", diff --git a/vllm/model_executor/layers/quantization/fp8_serialized.py b/vllm/model_executor/layers/quantization/fp8_serialized.py index da062a8a1492b..a3511bdb02fa3 100644 --- a/vllm/model_executor/layers/quantization/fp8_serialized.py +++ b/vllm/model_executor/layers/quantization/fp8_serialized.py @@ -97,9 +97,11 @@ def create_weights( torch.empty(sum(output_partition_sizes), dtype=torch.float32), requires_grad=False) layer.register_parameter("weight_scale", weight_scale) - set_weight_attrs(weight_scale, extra_weight_attrs) + # set_weight_attrs(weight_scale, extra_weight_attrs) set_weight_attrs(weight_scale, { - "shard_indexer": self.scales_shard_indexer, + **extra_weight_attrs, + "shard_indexer": self.scales_shard_indexer_NKK, + "logical_widths": output_partition_sizes }) # ACT Scale @@ -113,6 +115,7 @@ def create_weights( "shard_indexer": self.scales_shard_indexer, }) + def shard_id_as_int(self, shard_id: Union[str, int]) -> int: if isinstance(shard_id, int): return shard_id From f5d32aee06d270215ae4629ff032a45c8bc83c7f Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 26 Apr 2024 12:29:25 +0000 Subject: [PATCH 57/90] first end to end run with mixtral --- run_fp8.py | 15 +++++-- vllm/model_executor/models/mixtral.py | 65 ++++++++++++++++++--------- 2 files changed, 54 insertions(+), 26 deletions(-) diff --git a/run_fp8.py b/run_fp8.py index 2c1b071cc5e1b..0caf538de1fef 100644 --- a/run_fp8.py +++ b/run_fp8.py @@ -4,18 +4,24 @@ from vllm import LLM +choices = ["llama-static", "mistral-static", "mistral-dynamic", "mixtral-static"] + parser = argparse.ArgumentParser() -parser.add_argument("--type", choices=["static", "dynamic"]) +parser.add_argument("--type", choices="mixtral-static") if __name__ == "__main__": args = parser.parse_args() - if args.type == "static": + if args.type == "llama-static": + model_name = "nm-testing/Meta-Llama-3-8B-Instruct-FP8" + elif args.type == "mistral-static": model_name = "nm-testing/mistral-fp8-static" - elif args.type == "dynamic": + elif args.type == "mistral-dynamic": model_name = "nm-testing/mistral-fp8-dynamic" + elif args.type == 'mixtral-static': + model_name = "nm-testing/Mixtral-8x7B-Instruct-v0.1-FP8" 
else: - raise ValueError("--type should be `static` or `dynamic`") + raise ValueError(f"--type should be in {choices}") model = LLM(model_name, enforce_eager=True, @@ -30,5 +36,6 @@ print(f"----- Prompt: {prompt}") outputs = model.generate(prompt) + print(outputs) generation = outputs[0].outputs[0].text print(f"----- Generation: {generation}") diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index dad1c43e1b0da..2f7aca2c3b0e9 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -39,6 +39,7 @@ ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.fp8_serialized import FP8LinearMethod from vllm.model_executor.layers.quantization.fp8 import (Fp8LinearMethod, per_tensor_quantize) from vllm.model_executor.layers.rotary_embedding import get_rope @@ -79,7 +80,8 @@ def __init__( self.intermediate_size = intermediate_size // self.tp_size # FIXME(pcmoritz): Make this more general to support different # quantization schemes - self.use_fp8 = isinstance(linear_method, Fp8LinearMethod) + self.use_fp8 = (isinstance(linear_method, Fp8LinearMethod) or + isinstance(linear_method, FP8LinearMethod)) if params_dtype is None: params_dtype = torch.get_default_dtype() @@ -90,19 +92,22 @@ def __init__( bias=False, params_dtype=self.params_dtype, linear_method=None) + + if self.use_fp8: + params_dtype = torch.float8_e4m3fn self.ws = nn.Parameter( torch.empty(self.num_total_experts, 2 * self.intermediate_size, self.hidden_size, device="cuda", - dtype=self.params_dtype)) + dtype=params_dtype)) self.w2s = nn.Parameter( torch.empty(self.num_total_experts, self.hidden_size, self.intermediate_size, device="cuda", - dtype=self.params_dtype)) + dtype=params_dtype)) set_weight_attrs(self.ws, { "weight_loader": self.weight_loader, @@ -120,16 +125,23 @@ def __init__( torch.ones( self.num_total_experts, device="cuda", dtype=torch.float32), requires_grad=False) if self.use_fp8 else None + + set_weight_attrs(self.ws_scale, { + "weight_loader": self.weight_loader, + }) + set_weight_attrs(self.w2s_scale, { + "weight_loader": self.weight_loader, + }) # Scaling factors for FP8 activations need_act_scales = (self.use_fp8 and linear_method.quant_config.activation_scheme == "static") self.as_scale = nn.Parameter( - torch.zeros(1, device="cuda", dtype=torch.float32), + torch.zeros(self.num_total_experts, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None self.a2s_scale = nn.Parameter( - torch.zeros(1, device="cuda", dtype=torch.float32), + torch.zeros(self.num_total_experts, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None if need_act_scales: @@ -152,27 +164,29 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, param_data[expert_id, shard_size:2 * shard_size, :] = loaded_weight[shard, :] if weight_name.endswith("w2.weight"): - param_data[expert_id, :, :] = loaded_weight[:, shard] - if "act_scale" in weight_name: - param_data[:] = param_data[:].max(loaded_weight) - - def process_weights_after_loading(self): - if self.use_fp8: - ws = torch.empty_like(self.ws.data, dtype=torch.float8_e4m3fn) - w2s = torch.empty_like(self.w2s.data, dtype=torch.float8_e4m3fn) - for expert in range(self.num_total_experts): - ws[expert, :, :], self.ws_scale[expert] = per_tensor_quantize( - self.ws.data[expert, :, :]) - w2s[expert, :, :], self.w2s_scale[ - expert] = 
per_tensor_quantize(self.w2s.data[expert, :, :]) - self.ws = nn.Parameter(ws, requires_grad=False) - self.w2s = nn.Parameter(w2s, requires_grad=False) + param_data[expert_id] = loaded_weight[:, shard] + if "act_scale" in weight_name or "weight_scale" in weight_name: + param_data[expert_id] = loaded_weight + + # def process_weights_after_loading(self): + # if self.use_fp8: + # ws = torch.empty_like(self.ws.data, dtype=torch.float8_e4m3fn) + # w2s = torch.empty_like(self.w2s.data, dtype=torch.float8_e4m3fn) + # for expert in range(self.num_total_experts): + # ws[expert, :, :], self.ws_scale[expert] = per_tensor_quantize( + # self.ws.data[expert, :, :]) + # w2s[expert, :, :], self.w2s_scale[ + # expert] = per_tensor_quantize(self.w2s.data[expert, :, :]) + # self.ws = nn.Parameter(ws, requires_grad=False) + # self.w2s = nn.Parameter(w2s, requires_grad=False) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: num_tokens, hidden_size = hidden_states.shape hidden_states = hidden_states.view(-1, self.hidden_size) # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) + + # TODO: fused MoE kernel might want to take different scales for each expert? final_hidden_states = fused_moe(hidden_states, self.ws, self.w2s, @@ -183,8 +197,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: use_fp8=self.use_fp8, w1_scale=self.ws_scale, w2_scale=self.w2s_scale, - a1_scale=self.as_scale, - a2_scale=self.a2s_scale) + a1_scale=self.as_scale.max(), + a2_scale=self.a2s_scale.max()) if self.tp_size > 1: final_hidden_states = tensor_model_parallel_all_reduce( @@ -466,6 +480,13 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ] expert_params_mapping = [ + # These are the activation scales for the experts + # (param_name, weight_name, expert_id) + ("ws_scale" if weight_name in ["w1", "w3"] else "w2s_scale", + f"experts.{expert_id}.{weight_name}.weight_scale", expert_id) + for expert_id in range(self.config.num_local_experts) + for weight_name in ["w1", "w2", "w3"] + ] + [ # These are the weights for the experts # (param_name, weight_name, expert_id) ("ws" if weight_name in ["w1", "w3"] else "w2s", From 924e8ce4a61a7bc0400325f8cf027a116380a11e Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 26 Apr 2024 13:41:49 +0000 Subject: [PATCH 58/90] added missed file --- .../layers/fused_gemm_dq/fused_gemm_dq_fp8.py | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 vllm/model_executor/layers/fused_gemm_dq/fused_gemm_dq_fp8.py diff --git a/vllm/model_executor/layers/fused_gemm_dq/fused_gemm_dq_fp8.py b/vllm/model_executor/layers/fused_gemm_dq/fused_gemm_dq_fp8.py new file mode 100644 index 0000000000000..66e18e0c4b7b2 --- /dev/null +++ b/vllm/model_executor/layers/fused_gemm_dq/fused_gemm_dq_fp8.py @@ -0,0 +1,87 @@ +import cutlass +from cutlass import Tensor as FakeTensor +import cutlass.epilogue + +import torch +from typing import Optional, Tuple, Dict + + +def setup_dequant_epilogue( + plan: cutlass.op.Gemm, + dq: torch.Tensor, + scale_a: Optional[torch.Tensor], + scale_b: Optional[torch.Tensor], + bias: Optional[torch.Tensor], +) -> Tuple[cutlass.op.Gemm, Dict]: + assert bias is None + + if all([scale_a is None, scale_b is None]): + return plan, None + assert scale_b is not None + + def epilog_with_scale_b(accum, scale_b): + D = scale_b * accum + return D + + def epilog_with_both_scales(accum, scale_a, scale_b): + D = scale_a * (scale_b * accum) + return D + + visitor_args = {"scale_a": scale_a, "scale_b": scale_b, 
"D": dq} + epilogue_tensors = { + "accum": FakeTensor( + element=torch.float32, + shape=dq.shape, + layout_tag=cutlass.LayoutType.RowMajor, + ), + "D": dq, + "scale_b": scale_b, + } + epilog_fn = epilog_with_scale_b + + if scale_a is not None: + epilogue_tensors["scale_a"] = scale_a + visitor_args["scale_a"] = scale_a + epilog_fn = epilog_with_both_scales + + plan.epilogue_visitor = cutlass.epilogue.trace(epilog_fn, epilogue_tensors) + return plan, visitor_args + + +def fused_gemm_dq_fp8( + x_q: torch.Tensor, + w_q: torch.Tensor, + out_dtype: torch.dtype, + scale_a: Optional[torch.Tensor] = None, + scale_b: Optional[torch.Tensor] = None, + bias: Optional[torch.Tensor] = None, +) -> torch.Tensor: + dq = torch.empty((x_q.shape[0], w_q.shape[1]), dtype=out_dtype, device="cuda") + C = torch.zeros((x_q.shape[0], w_q.shape[1]), dtype=out_dtype, device="cuda") + + plan = cutlass.op.Gemm( + element_A=x_q.dtype, + element_B=w_q.dtype, + element_C=dq.dtype, + element_D=dq.dtype, + layout_A=cutlass.LayoutType.RowMajor, + layout_B=cutlass.LayoutType.ColumnMajor, + layout_C=cutlass.LayoutType.RowMajor, + element_accumulator=torch.float32, + kernel_cc=90, + ) + + plan, visitor_args = setup_dequant_epilogue(plan, dq, scale_a, scale_b, bias) + + plan.run( + x_q, + w_q, + C, + dq, + alpha=1, + beta=0, + visitor_args=visitor_args, + print_module=False, + ) + + return dq From 823a2e79ca7e767c9c4fb88b2f506154d6786d19 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 26 Apr 2024 08:55:39 -0600 Subject: [PATCH 59/90] Update run_fp8.py Co-authored-by: Tyler Michael Smith --- run_fp8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_fp8.py b/run_fp8.py index 0caf538de1fef..0ee0a69af91ad 100644 --- a/run_fp8.py +++ b/run_fp8.py @@ -7,7 +7,7 @@ choices = ["llama-static", "mistral-static", "mistral-dynamic", "mixtral-static"] parser = argparse.ArgumentParser() -parser.add_argument("--type", choices="mixtral-static") +parser.add_argument("--type", choices=choices) if __name__ == "__main__": args = parser.parse_args() From 81f42befabf210054ec1a717a8772b08410ac65e Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Sat, 27 Apr 2024 08:00:29 -0400 Subject: [PATCH 60/90] Dynamic FP8 works, but static does not (#213) merging michael's work --- requirements-cuda.txt | 1 - .../layers/fused_gemm_dq/__init__.py | 5 - .../layers/fused_gemm_dq/fused_gemm_dq_fp8.py | 87 ------- vllm/model_executor/layers/linear.py | 5 + .../layers/quantization/__init__.py | 3 +- .../layers/quantization/base_config.py | 6 - .../model_executor/layers/quantization/fp8.py | 195 ++++++++++++--- .../layers/quantization/fp8_serialized.py | 223 ------------------ .../model_loader/weight_utils.py | 12 +- vllm/model_executor/models/mixtral.py | 1 - 10 files changed, 174 insertions(+), 364 deletions(-) delete mode 100644 vllm/model_executor/layers/fused_gemm_dq/__init__.py delete mode 100644 vllm/model_executor/layers/fused_gemm_dq/fused_gemm_dq_fp8.py delete mode 100644 vllm/model_executor/layers/quantization/fp8_serialized.py diff --git a/requirements-cuda.txt b/requirements-cuda.txt index b873423911488..1bddae4c6f40f 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -7,4 +7,3 @@ nvidia-ml-py # for pynvml package vllm-nccl-cu12>=2.18,<2.19 # for downloading nccl library torch == 2.2.1 xformers == 0.0.25 # Requires PyTorch 2.2.1 -nvidia-cutlass diff --git a/vllm/model_executor/layers/fused_gemm_dq/__init__.py 
b/vllm/model_executor/layers/fused_gemm_dq/__init__.py deleted file mode 100644 index 2acb1c2aad6c7..0000000000000 --- a/vllm/model_executor/layers/fused_gemm_dq/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from vllm.model_executor.layers.fused_gemm_dq.fused_gemm_dq_fp8 import fused_gemm_dq_fp8 - -__all__ = [ - "fused_gemm_dq_fp8", -] \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_gemm_dq/fused_gemm_dq_fp8.py b/vllm/model_executor/layers/fused_gemm_dq/fused_gemm_dq_fp8.py deleted file mode 100644 index 66e18e0c4b7b2..0000000000000 --- a/vllm/model_executor/layers/fused_gemm_dq/fused_gemm_dq_fp8.py +++ /dev/null @@ -1,87 +0,0 @@ -import cutlass -from cutlass import Tensor as FakeTensor -import cutlass.epilogue - -import torch -from typing import Optional, Tuple, Dict - - -def setup_dequant_epilogue( - plan: cutlass.op.Gemm, - dq: torch.Tensor, - scale_a: Optional[torch.Tensor], - scale_b: Optional[torch.Tensor], - bias: Optional[torch.Tensor], -) -> Tuple[cutlass.op.Gemm, Dict]: - assert bias is None - - if all([scale_a is None, scale_b is None]): - return plan, None - assert scale_b is not None - - def epilog_with_scale_b(accum, scale_b): - D = scale_b * accum - return D - - def epilog_with_both_scales(accum, scale_a, scale_b): - D = scale_a * (scale_b * accum) - return D - - visitor_args = {"scale_a": scale_a, "scale_b": scale_b, "D": dq} - epilogue_tensors = { - "accum": FakeTensor( - element=torch.float32, - shape=dq.shape, - layout_tag=cutlass.LayoutType.RowMajor, - ), - "D": dq, - "scale_b": scale_b, - } - epilog_fn = epilog_with_scale_b - - if scale_a is not None: - epilogue_tensors["scale_a"] = scale_a - visitor_args["scale_a"] = scale_a - epilog_fn = epilog_with_both_scales - - plan.epilogue_visitor = cutlass.epilogue.trace(epilog_fn, epilogue_tensors) - return plan, visitor_args - - -def fused_gemm_dq_fp8( - x_q: torch.Tensor, - w_q: torch.Tensor, - out_dtype: torch.dtype, - scale_a: Optional[torch.Tensor] = None, - scale_b: Optional[torch.Tensor] = None, - bias: Optional[torch.Tensor] = None, -) -> torch.Tensor: - dq = torch.empty((x_q.shape[0], w_q.shape[1]), dtype=out_dtype, device="cuda") - C = torch.zeros((x_q.shape[0], w_q.shape[1]), dtype=out_dtype, device="cuda") - - plan = cutlass.op.Gemm( - element_A=x_q.dtype, - element_B=w_q.dtype, - element_C=dq.dtype, - element_D=dq.dtype, - layout_A=cutlass.LayoutType.RowMajor, - layout_B=cutlass.LayoutType.ColumnMajor, - layout_C=cutlass.LayoutType.RowMajor, - element_accumulator=torch.float32, - kernel_cc=90, - ) - - plan, visitor_args = setup_dequant_epilogue(plan, dq, scale_a, scale_b, bias) - - plan.run( - x_q, - w_q, - C, - dq, - alpha=1, - beta=0, - visitor_args=visitor_args, - print_module=False, - ) - - return dq diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 87b2bb7d9acce..cfe6eefe35455 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -232,6 +232,11 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): start_idx = tp_rank * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + # TODO: canon + # This is for loading scales for fp8, which have no dims. 
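The weight_loader change here guards against 0-dim tensors: serialized FP8 checkpoints store per-tensor scales as scalars, which would otherwise fail the shape assertion. A minimal standalone sketch of the case handled by the reshape that follows (hypothetical values, plain PyTorch):

    import torch

    param_data = torch.empty(1, dtype=torch.float32)  # scale slot registered on the layer
    loaded_weight = torch.tensor(0.02)                # 0-dim per-tensor scale from the checkpoint
    if len(loaded_weight.shape) == 0:
        loaded_weight = loaded_weight.reshape(1)      # give it a dim so the shapes match
    assert param_data.shape == loaded_weight.shape
    param_data.copy_(loaded_weight)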
+ if len(loaded_weight.shape) == 0: + loaded_weight = loaded_weight.reshape(1) + assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 060eaad31899b..a525add458499 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -4,8 +4,7 @@ from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -# from vllm.model_executor.layers.quantization.fp8 import FP8Config -from vllm.model_executor.layers.quantization.fp8_serialized import FP8Config +from vllm.model_executor.layers.quantization.fp8 import FP8Config from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.marlin import MarlinConfig from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 08061f83b5a38..6115e7c3be956 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -29,12 +29,6 @@ def get_min_capability(self) -> int: """ raise NotImplementedError - # The following is not an abstract method and returns True by default. - @classmethod - def require_config_file(cls) -> bool: - """Whether this quantization config needs a configuration filen.""" - return True - @staticmethod @abstractmethod def get_config_filenames() -> List[str]: diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 271cddc1175a5..29637704d5e4d 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union import torch from torch.nn import Module @@ -9,14 +9,19 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +ACTIVATION_SCHEMES = ["static", "dynamic"] + class FP8Config(QuantizationConfig): """Config class for FP8.""" def __init__( self, + is_serialized: bool = False, activation_scheme: str = "dynamic", ) -> None: + self.is_serialized = is_serialized + assert activation_scheme in ACTIVATION_SCHEMES self.activation_scheme = activation_scheme @classmethod @@ -32,20 +37,19 @@ def get_min_capability(cls) -> int: # TODO: PyTorch 2.3.0+ is required to run FP8 on # SM 89 (e.g. Ada) GPUs. 
Specifically, this PR has to
         # be included: https://github.com/pytorch/pytorch/pull/118881
-        return 90
-
-    @classmethod
-    def require_config_file(cls) -> bool:
-        return False
+        return 89
 
     @classmethod
     def get_config_filenames(cls) -> List[str]:
-        return ["quantize_config.json"]
+        return []
 
     @classmethod
     def from_config(cls, config: Dict[str, Any]) -> "FP8Config":
+        quant_method = cls.get_from_keys(config, ["quant_method"])
+        is_serialized = ("fp8" in quant_method)
         activation_scheme = cls.get_from_keys(config, ["activation_scheme"])
-        return cls(activation_scheme)
+        return cls(is_serialized=is_serialized,
+                   activation_scheme=activation_scheme)
 
     def get_linear_method(self) -> "Fp8LinearMethod":
         return Fp8LinearMethod(self)
@@ -56,8 +60,12 @@ def get_scaled_act_names(self) -> List[str]:
 
 class Fp8LinearMethod(LinearMethodBase):
     """Linear method for FP8.
 
-    We now support common FP16/BF16 model checkpoints ONLY. The weight
-    scaling factor will be initialized after the model weights are loaded.
+    Supports loading FP8 checkpoints with static weight scale and
+    dynamic/static activation scale.
+
+    Also supports loading quantized FP16/BF16 model checkpoints with dynamic
+    activation scaling. The weight scaling factor will be initialized after
+    the model weights are loaded.
 
     Limitations:
     1. Only support per-tensor quantization due to torch._scaled_mm support.
@@ -81,54 +89,175 @@ def create_weights(
         params_dtype: torch.dtype,
         **extra_weight_attrs,
     ):
+        del input_size, output_size
+
+        layer.logical_widths = output_partition_sizes
         output_size_per_partition = sum(output_partition_sizes)
+
+        # WEIGHT
+        weight_dtype = torch.float8_e4m3fn if self.quant_config.is_serialized else params_dtype
         weight = Parameter(torch.empty(output_size_per_partition,
                                        input_size_per_partition,
-                                       dtype=params_dtype),
+                                       dtype=weight_dtype),
                            requires_grad=False)
         layer.register_parameter("weight", weight)
-        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
-        set_weight_attrs(weight, extra_weight_attrs)
+        set_weight_attrs(weight, {
+            **extra_weight_attrs,
+            "input_dim": 1,
+            "output_dim": 0,
+        })
 
-        w_scale = Parameter(
-            torch.empty(1, dtype=torch.float32),
-            requires_grad=False,
-        )
-        layer.register_parameter("weight_scaling_factor", w_scale)
+        # WEIGHT SCALE
+        weight_scale = Parameter(torch.empty(
+            len(output_partition_sizes),
+            dtype=torch.float32,
+        ),
+                                 requires_grad=False)
+        layer.register_parameter("weight_scale", weight_scale)
+        set_weight_attrs(
+            weight_scale, {
+                **extra_weight_attrs,
+                "shard_indexer": self.scales_shard_indexer,
+            })
+
+        # ACTIVATION SCALE
+        if self.quant_config.activation_scheme == "static":
+            act_scale = Parameter(torch.empty(len(output_partition_sizes),
                                              dtype=torch.float32),
+                                  requires_grad=False)
+            layer.register_parameter("act_scale", act_scale)
+            set_weight_attrs(act_scale, {
+                **extra_weight_attrs,
+                "shard_indexer":
+                self.scales_shard_indexer,
+            })
+
+    def shard_id_as_int(self, shard_id: Union[str, int]) -> int:
+        if isinstance(shard_id, int):
+            return shard_id
+        assert isinstance(shard_id, str)
+        qkv_idxs = {"q": 0, "k": 1, "v": 2}
+        assert shard_id in qkv_idxs
+        return qkv_idxs[shard_id]
+
+    def scales_shard_indexer(
+        self,
+        param: torch.Tensor,
+        loaded_weight: torch.Tensor,
+        shard_id: Union[str, int],
+        logical_widths: List[int],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        del logical_widths
+        return param[self.shard_id_as_int(shard_id)], loaded_weight
 
     def process_weights_after_loading(self, layer: Module) -> None:
         # Although the linear_method is propagated to all layers,
         #
only linear layers invoke "create_weights". So we check - # whether "weight_scaling_facor" is registered to determine + # whether "weight_scale" is registered to determine # whether the layer is a linear layer that requires quantization. - if not hasattr(layer, "weight_scaling_factor"): + if not hasattr(layer, "weight_scale"): return - qweight, weight_scale = per_tensor_quantize(layer.weight) + # If we loaded in an FP8 checkpoint, we can skip weight quantization + if self.quant_config.is_serialized: + # torch._scaled_mm requires column-major in the second + # input (weight), so we transpose the quantized weight. + # TODO + return + + qweight, weight_scale = per_tensor_quantize_dynamic(layer.weight) # torch._scaled_mm requires column-major in the second # input (weight), so we transpose the quantized weight. - layer.weight = Parameter(qweight.t(), requires_grad=False) - layer.weight_scaling_factor.data.copy_(weight_scale) + # TODO + # layer.weight = Parameter(qweight.t(), requires_grad=False) + layer.weight = Parameter(qweight, requires_grad=False) + weight_scales = torch.tensor( + [weight_scale for _ in layer.logical_widths], dtype=torch.float32) + layer.weight_scale.data.copy_(weight_scales) def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - qinput, x_scale = per_tensor_quantize(x) - output, _ = torch._scaled_mm( - qinput, - layer.weight, - out_dtype=x.dtype, - scale_a=x_scale, - scale_b=layer.weight_scaling_factor, - bias=bias, - ) + + if self.quant_config.activation_scheme == "static": + # Empirically, these are all the same + x_scale = layer.act_scale.max() + qinput = per_tensor_quantize_static(x, x_scale) + else: + qinput, x_scale = per_tensor_quantize_dynamic(x) + + # # TODO: Inefficient loop over each shard since there is a per-tensor + # # scale for each shard. + # # To be replaced by cutlass gemm with epilogue fusion for performance. + # output = torch.zeros(x.shape[0], + # layer.weight.shape[0], + # dtype=x.dtype, + # device="cuda") + # start_offset = 0 + # for _, (logical_width, w_scale) in enumerate( + # zip(layer.logical_widths, layer.weight_scale)): + # end_offset = start_offset + logical_width + + # cuda_compute_capability = torch.cuda.get_device_capability() + # if cuda_compute_capability >= (9, 0): + # out, _ = torch._scaled_mm( + # qinput, + # layer.weight[start_offset:end_offset, :].t(), + # out_dtype=x.dtype, + # scale_a=x_scale, + # scale_b=w_scale, + # ) + # else: + # out = torch.nn.functional.linear( + # qinput.to(x.dtype) * x_scale.to(x.dtype), + # layer.weight[start_offset:end_offset, :].to(x.dtype) * w_scale.to(x.dtype), + # ) + + # output[:, start_offset:end_offset] = out + # start_offset = end_offset + + w_scale = layer.weight_scale.max() + + cuda_compute_capability = torch.cuda.get_device_capability() + if cuda_compute_capability >= (9, 0): + output, _ = torch._scaled_mm( + qinput, + layer.weight.t(), + out_dtype=x.dtype, + scale_a=x_scale, + scale_b=w_scale, + ) + else: + output = torch.nn.functional.linear( + qinput.to(x.dtype) * x_scale.to(x.dtype), + layer.weight.to(x.dtype) * w_scale.to(x.dtype), + ) + + if bias is not None: + output = output + bias + return output -def per_tensor_quantize(tensor: torch.Tensor) -> Tuple[torch.Tensor, float]: +def per_tensor_quantize_static(tensor: torch.Tensor, + inv_scale: float) -> torch.Tensor: """Quantize a tensor using per-tensor static scaling factor. + Args: + tensor: The input tensor. + inv_scale: The scale. 
+ """ + # Scale and clamp the tensor to bring it to + # the representative range of float8 data type + # (as default cast is unsaturated) + finfo = torch.finfo(torch.float8_e4m3fn) + qweight = (tensor / inv_scale).clamp(min=finfo.min, max=finfo.max) + return qweight.to(torch.float8_e4m3fn) + +def per_tensor_quantize_dynamic( + tensor: torch.Tensor) -> Tuple[torch.Tensor, float]: + """Quantize a tensor using per-tensor dynamic scaling factor. Args: tensor: The input tensor. """ diff --git a/vllm/model_executor/layers/quantization/fp8_serialized.py b/vllm/model_executor/layers/quantization/fp8_serialized.py deleted file mode 100644 index a3511bdb02fa3..0000000000000 --- a/vllm/model_executor/layers/quantization/fp8_serialized.py +++ /dev/null @@ -1,223 +0,0 @@ -from typing import Any, Dict, List, Optional, Tuple, Union - -import torch -from torch.nn.parameter import Parameter - -from vllm.model_executor.layers.fused_gemm_dq import fused_gemm_dq_fp8 -from vllm.model_executor.layers.linear import (LinearMethodBase, - set_weight_attrs) -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) - - -class FP8Config(QuantizationConfig): - """Config class for FP8.""" - - def __init__( - self, - activation_scheme: str, - ) -> None: - assert activation_scheme == "static" or activation_scheme == "dynamic" - self.activation_scheme = activation_scheme - - @classmethod - def get_name(cls) -> str: - return "fp8" - - @classmethod - def get_supported_act_dtypes(cls) -> List[torch.dtype]: - return [torch.bfloat16, torch.half] - - @classmethod - def get_min_capability(cls) -> int: - return 90 - - @classmethod - def get_config_filenames(cls) -> List[str]: - return [] - - @classmethod - def from_config(cls, config: Dict[str, Any]) -> "FP8Config": - activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) - return cls(activation_scheme=activation_scheme) - - def get_linear_method(self) -> "FP8LinearMethod": - return FP8LinearMethod(self) - - def get_scaled_act_names(self) -> List[str]: - return [] - - -class FP8LinearMethod(LinearMethodBase): - """Linear method for StaticFP8 - . - Args: - quant_config: The quantization config. 
- """ - - def __init__(self, quant_config: FP8Config): - self.quant_config = quant_config - - def create_weights( - self, - layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: List[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): - del input_size, output_size, params_dtype - num_logical_weights = len(output_partition_sizes) - layer.logical_widths = output_partition_sizes - - # WEIGHT - weight = Parameter(torch.empty(sum(output_partition_sizes), - input_size_per_partition, - dtype=torch.float8_e4m3fn), - requires_grad=False) - layer.register_parameter("weight", weight) - set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) - set_weight_attrs(weight, extra_weight_attrs) - - # WEIGHT SCALE - if num_logical_weights == 1: - weight_scale = Parameter( - torch.empty(len(output_partition_sizes), dtype=torch.float32), - requires_grad=False) - - layer.register_parameter("weight_scale", weight_scale) - set_weight_attrs(weight_scale, extra_weight_attrs) - set_weight_attrs(weight_scale, { - "shard_indexer": self.scales_shard_indexer, - }) - else: - weight_scale = Parameter( - torch.empty(sum(output_partition_sizes), dtype=torch.float32), - requires_grad=False) - layer.register_parameter("weight_scale", weight_scale) - # set_weight_attrs(weight_scale, extra_weight_attrs) - set_weight_attrs(weight_scale, { - **extra_weight_attrs, - "shard_indexer": self.scales_shard_indexer_NKK, - "logical_widths": output_partition_sizes - }) - - # ACT Scale - if self.quant_config.activation_scheme == "static": - act_scale = Parameter(torch.empty(len(output_partition_sizes), - dtype=torch.float32), - requires_grad=False) - layer.register_parameter("act_scale", act_scale) - set_weight_attrs(act_scale, extra_weight_attrs) - set_weight_attrs(act_scale, { - "shard_indexer": self.scales_shard_indexer, - }) - - - def shard_id_as_int(self, shard_id: Union[str, int]) -> int: - if isinstance(shard_id, int): - return shard_id - assert isinstance(shard_id, str) - qkv_idxs = {"q": 0, "k": 1, "v": 2} - assert shard_id in qkv_idxs - return qkv_idxs[shard_id] - - def scales_shard_indexer_NKK( - self, - param: torch.Tensor, - loaded_weight: torch.Tensor, - shard_id: Union[str, int], - logical_widths, - ) -> Tuple[torch.Tensor, torch.Tensor]: - shard_id = self.shard_id_as_int(shard_id) - offset = sum(logical_widths[:shard_id]) - size = logical_widths[shard_id] - # update loaded weight with copies for broadcast. 
- loaded_weight = loaded_weight.repeat(size) - return param[offset:offset + size], loaded_weight - - def scales_shard_indexer( - self, - param: torch.Tensor, - loaded_weight: torch.Tensor, - shard_id: Union[str, int], - logical_widths, - ) -> Tuple[torch.Tensor, torch.Tensor]: - del logical_widths - return param[self.shard_id_as_int(shard_id)], loaded_weight - - def apply_weights(self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - - m = x.shape[0] - - if self.quant_config.activation_scheme == "dynamic": - qinput, x_scale = per_tensor_quantize_dyanmic(x) - elif self.quant_config.activation_scheme == "static": - # empirically, these are all the same - x_scale = layer.act_scale.max() - qinput = per_tensor_quantize_static(x, x_scale) - - # HACK: CUTLASS hits an illegal instruction for fp8 when m == 1, - # so pad m to 2 - if m == 1: - tmp = torch.zeros(2, - qinput.shape[1], - dtype=qinput.dtype, - device="cuda") - tmp[0, :] = qinput - qinput = tmp - - out_cutlass_qkv = fused_gemm_dq_fp8( - qinput, - layer.weight.t(), - out_dtype=x.dtype, - scale_a=x_scale, - scale_b=layer.weight_scale, - bias=bias, - ) - - return out_cutlass_qkv[:m, :] - - -def per_tensor_quantize_static(tensor: torch.Tensor, - inv_scale: float) -> torch.Tensor: - """Quantize a tensor using per-tensor static scaling factor. - Args: - tensor: The input tensor. - inv_scale: The scale. - """ - # Scale and clamp the tensor to bring it to - # the representative range of float8 data type - # (as default cast is unsaturated) - finfo = torch.finfo(torch.float8_e4m3fn) - qweight = (tensor / inv_scale).clamp(min=finfo.min, max=finfo.max) - return qweight.to(torch.float8_e4m3fn) - - -def per_tensor_quantize_dyanmic( - tensor: torch.Tensor) -> Tuple[torch.Tensor, float]: - """Quantize a tensor using per-tensor dynamic scaling factor. - Args: - tensor: The input tensor. - """ - finfo = torch.finfo(torch.float8_e4m3fn) - # Calculate the scale as dtype max divided by absmax. - # Since .abs() creates a new tensor, we use aminmax to get - # the min and max first and then calculate the absmax. - min_val, max_val = tensor.aminmax() - amax = min_val.abs().max(max_val.abs()) - scale = finfo.max / amax.clamp(min=1e-12) - # scale and clamp the tensor to bring it to - # the representative range of float8 data type - # (as default cast is unsaturated) - qweight = (tensor * scale).clamp(min=finfo.min, max=finfo.max) - # Return both float8 data and the inverse scale (as float), - # as both required as inputs to torch._scaled_mm - qweight = qweight.to(torch.float8_e4m3fn) - scale = scale.float().reciprocal() - return qweight, scale diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 54fd4f22f0eef..c061c9b8ab68c 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -135,19 +135,19 @@ def get_quant_config(model_config: ModelConfig, else: hf_folder = model_name_or_path - config_files = glob.glob(os.path.join(hf_folder, "*.json")) possible_config_filenames = quant_cls.get_config_filenames() + # If the quantization config is not found, use the default config. + if not possible_config_filenames: + return quant_cls() + + config_files = glob.glob(os.path.join(hf_folder, "*.json")) + quant_config_files = [ f for f in config_files if any( f.endswith(x) for x in possible_config_filenames) ] - # If the quantization config is optional and not provided, use the - # default config. 
- if not quant_cls.require_config_file() and not quant_config_files: - return quant_cls() - if len(quant_config_files) == 0: raise ValueError( f"Cannot find the config file for {model_config.quantization}") diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 2f7aca2c3b0e9..3c9b7da426635 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -39,7 +39,6 @@ ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization.fp8_serialized import FP8LinearMethod from vllm.model_executor.layers.quantization.fp8 import (Fp8LinearMethod, per_tensor_quantize) from vllm.model_executor.layers.rotary_embedding import get_rope From 1a4fd8ab466e0cff07fad700d3686522fcf37c65 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sat, 27 Apr 2024 12:10:06 +0000 Subject: [PATCH 61/90] static correctness --- .../model_executor/layers/quantization/fp8.py | 87 +++++++------------ 1 file changed, 30 insertions(+), 57 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 29637704d5e4d..126a530dbe4e2 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -96,29 +96,26 @@ def create_weights( # WEIGHT weight_dtype = torch.float8_e4m3fn if self.quant_config.is_serialized else params_dtype - weight = Parameter(torch.empty(output_size_per_partition, - input_size_per_partition, - dtype=weight_dtype), - requires_grad=False) + weight = Parameter( + torch.empty(output_size_per_partition, + input_size_per_partition, + dtype=weight_dtype), + requires_grad=False) layer.register_parameter("weight", weight) set_weight_attrs(weight, { **extra_weight_attrs, - "input_dim": 1, - "output_dim": 0, + "input_dim": 1, "output_dim": 0, }) # WEIGHT SCALE - weight_scale = Parameter(torch.empty( - len(output_partition_sizes), - dtype=torch.float32, - ), - requires_grad=False) + weight_scale = Parameter( + torch.empty(len(output_partition_sizes), dtype=torch.float32), + requires_grad=False) layer.register_parameter("weight_scale", weight_scale) - set_weight_attrs( - weight_scale, { - **extra_weight_attrs, - "shard_indexer": self.scales_shard_indexer, - }) + set_weight_attrs(weight_scale, { + **extra_weight_attrs, + "shard_indexer": self.scales_shard_indexer, + }) # ACTIVATION SCALE if self.quant_config.activation_scheme == "static": @@ -187,52 +184,28 @@ def apply_weights(self, else: qinput, x_scale = per_tensor_quantize_dynamic(x) - # # TODO: Inefficient loop over each shard since there is a per-tensor - # # scale for each shard. - # # To be replaced by cutlass gemm with epilogue fusion for performance. 
- # output = torch.zeros(x.shape[0], - # layer.weight.shape[0], - # dtype=x.dtype, - # device="cuda") - # start_offset = 0 - # for _, (logical_width, w_scale) in enumerate( - # zip(layer.logical_widths, layer.weight_scale)): - # end_offset = start_offset + logical_width - - # cuda_compute_capability = torch.cuda.get_device_capability() - # if cuda_compute_capability >= (9, 0): - # out, _ = torch._scaled_mm( - # qinput, - # layer.weight[start_offset:end_offset, :].t(), - # out_dtype=x.dtype, - # scale_a=x_scale, - # scale_b=w_scale, - # ) - # else: - # out = torch.nn.functional.linear( - # qinput.to(x.dtype) * x_scale.to(x.dtype), - # layer.weight[start_offset:end_offset, :].to(x.dtype) * w_scale.to(x.dtype), - # ) - - # output[:, start_offset:end_offset] = out - # start_offset = end_offset - - w_scale = layer.weight_scale.max() - - cuda_compute_capability = torch.cuda.get_device_capability() - if cuda_compute_capability >= (9, 0): - output, _ = torch._scaled_mm( + # TODO: Inefficient loop over each shard since there is a per-tensor + # scale for each shard. + # To be replaced by cutlass gemm with epilogue fusion for performance. + output = torch.zeros(x.shape[0], + layer.weight.shape[0], + dtype=x.dtype, + device="cuda") + start_offset = 0 + for _, (logical_width, w_scale) in enumerate( + zip(layer.logical_widths, layer.weight_scale)): + end_offset = start_offset + logical_width + + out, _ = torch._scaled_mm( qinput, - layer.weight.t(), + layer.weight[start_offset:end_offset, :].t(), out_dtype=x.dtype, scale_a=x_scale, scale_b=w_scale, ) - else: - output = torch.nn.functional.linear( - qinput.to(x.dtype) * x_scale.to(x.dtype), - layer.weight.to(x.dtype) * w_scale.to(x.dtype), - ) + + output[:, start_offset:end_offset] = out + start_offset = end_offset if bias is not None: output = output + bias From e48c981cfaa15280be0f7257f210232233faf8f3 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sat, 27 Apr 2024 13:29:26 +0000 Subject: [PATCH 62/90] static fp8 loading --- run_fp8.py | 9 +- .../model_executor/layers/quantization/fp8.py | 182 ++++++++---------- 2 files changed, 85 insertions(+), 106 deletions(-) diff --git a/run_fp8.py b/run_fp8.py index 0ee0a69af91ad..fdea9e66d8b45 100644 --- a/run_fp8.py +++ b/run_fp8.py @@ -4,7 +4,7 @@ from vllm import LLM -choices = ["llama-static", "mistral-static", "mistral-dynamic", "mixtral-static"] +choices = ["llama-static", "mistral-static", "mistral-dynamic", "mixtral-static", "tinyllama-fp16"] parser = argparse.ArgumentParser() parser.add_argument("--type", choices=choices) @@ -20,18 +20,21 @@ model_name = "nm-testing/mistral-fp8-dynamic" elif args.type == 'mixtral-static': model_name = "nm-testing/Mixtral-8x7B-Instruct-v0.1-FP8" + elif args.type == 'tinyllama-fp16': + model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" else: raise ValueError(f"--type should be in {choices}") model = LLM(model_name, enforce_eager=True, - max_model_len=1024) + max_model_len=1024, + quantization="fp8") tokenizer = AutoTokenizer.from_pretrained(model_name) prompt = tokenizer.apply_chat_template([{ "role": "user", - "content": "What is your name" + "content": "What is open source software?" 
}], tokenize=False, add_generation_prompt=True) print(f"----- Prompt: {prompt}") diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 126a530dbe4e2..68a29dc334142 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -4,6 +4,7 @@ from torch.nn import Module from torch.nn.parameter import Parameter +from vllm import _custom_ops as ops from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import ( @@ -37,7 +38,7 @@ def get_min_capability(cls) -> int: # TODO: PyTorch 2.3.0+ is required to run FP8 on # SM 89 (e.g. Ada) GPUs. Specifically, this PR has to # be included: https://github.com/pytorch/pytorch/pull/118881 - return 89 + return 90 @classmethod def get_config_filenames(cls) -> List[str]: @@ -90,10 +91,11 @@ def create_weights( **extra_weight_attrs, ): del input_size, output_size - - layer.logical_widths = output_partition_sizes output_size_per_partition = sum(output_partition_sizes) + layer.process_after_load = True + layer.logical_widths = output_partition_sizes + # WEIGHT weight_dtype = torch.float8_e4m3fn if self.quant_config.is_serialized else params_dtype weight = Parameter( @@ -107,28 +109,31 @@ def create_weights( "input_dim": 1, "output_dim": 0, }) - # WEIGHT SCALE - weight_scale = Parameter( - torch.empty(len(output_partition_sizes), dtype=torch.float32), - requires_grad=False) - layer.register_parameter("weight_scale", weight_scale) - set_weight_attrs(weight_scale, { - **extra_weight_attrs, - "shard_indexer": self.scales_shard_indexer, - }) - - # ACTIVATION SCALE - if self.quant_config.activation_scheme == "static": - act_scale = Parameter(torch.empty(len(output_partition_sizes), - dtype=torch.float32), - requires_grad=False) - layer.register_parameter("act_scale", act_scale) - set_weight_attrs(act_scale, { + # SCALES + # We only need to load scales if the model is serialized FP8. + # Otherwise, scale creation is delayed until `process_weights_after_loading`. + if self.quant_config.is_serialized: + # WEIGHT SCALE + weight_scale = Parameter( + torch.empty(len(output_partition_sizes), dtype=torch.float32), + requires_grad=False) + layer.register_parameter("weight_scale", weight_scale) + set_weight_attrs(weight_scale, { **extra_weight_attrs, - "shard_indexer": - self.scales_shard_indexer, + "shard_indexer": self.scales_shard_indexer, }) + # ACTIVATION SCALE + if self.quant_config.activation_scheme == "static": + act_scale = Parameter( + torch.empty(len(output_partition_sizes), dtype=torch.float32), + requires_grad=False) + layer.register_parameter("act_scale", act_scale) + set_weight_attrs(act_scale, { + **extra_weight_attrs, + "shard_indexer": self.scales_shard_indexer, + }) + def shard_id_as_int(self, shard_id: Union[str, int]) -> int: if isinstance(shard_id, int): return shard_id @@ -152,101 +157,72 @@ def process_weights_after_loading(self, layer: Module) -> None: # only linear layers invoke "create_weights". So we check # whether "weight_scale" is registered to determine # whether the layer is a linear layer that requires quantization. 
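For intuition, here is a standalone sketch of how shard_id_as_int and the scales shard indexer above cooperate when a fused QKV layer loads one checkpoint scale per logical shard (hypothetical scale values, plain PyTorch):

    import torch

    qkv_idxs = {"q": 0, "k": 1, "v": 2}
    weight_scale = torch.empty(3, dtype=torch.float32)  # one slot per logical weight

    # Hypothetical per-shard scales as they would arrive from the checkpoint.
    for shard_id, ckpt_scale in [("q", 0.011), ("k", 0.007), ("v", 0.015)]:
        param = weight_scale[qkv_idxs[shard_id]]  # 0-dim view into the fused parameter
        param.copy_(torch.tensor(ckpt_scale))

    print(weight_scale)  # tensor([0.0110, 0.0070, 0.0150])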
-        if not hasattr(layer, "weight_scale"):
+        if not hasattr(layer, "process_after_load") or not layer.process_after_load:
             return
 
-        # If we loaded in an FP8 checkpoint, we can skip weight quantization
-        if self.quant_config.is_serialized:
-            # torch._scaled_mm requires column-major in the second
-            # input (weight), so we transpose the quantized weight.
-            # TODO
+        # If the model was not serialized, quantize the weights.
+        if not self.quant_config.is_serialized:
+            qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None)
+            layer.weight = Parameter(qweight, requires_grad=False)
+            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+            layer.logical_widths = None
+            layer.act_scale = None
             return
 
-        qweight, weight_scale = per_tensor_quantize_dynamic(layer.weight)
-        # torch._scaled_mm requires column-major in the second
-        # input (weight), so we transpose the quantized weight.
-        # TODO
-        # layer.weight = Parameter(qweight.t(), requires_grad=False)
-        layer.weight = Parameter(qweight, requires_grad=False)
-        weight_scales = torch.tensor(
-            [weight_scale for _ in layer.logical_widths], dtype=torch.float32)
-        layer.weight_scale.data.copy_(weight_scales)
+        # If the model is serialized, cleanup the weight_scales / act_scales.
+        else:
+            if len(layer.logical_widths) == 1:
+                layer.weight_scale = Parameter(layer.weight_scale.max(), requires_grad=False)
+                layer.logical_widths = None
 
+            # ACT_SCALE
+            if self.quant_config.activation_scheme == "dynamic":
+                layer.act_scale = None
+            elif self.quant_config.activation_scheme == "static":
+                # Act_scale for each logical input is the same, so take max().
+                layer.act_scale = Parameter(layer.act_scale.max(), requires_grad=False)
+            else:
+                raise ValueError(f"Unknown activation_scheme {self.quant_config.activation_scheme}")
 
     def apply_weights(self,
                       layer: torch.nn.Module,
                       x: torch.Tensor,
                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-
-        if self.quant_config.activation_scheme == "static":
-            # Empirically, these are all the same
-            x_scale = layer.act_scale.max()
-            qinput = per_tensor_quantize_static(x, x_scale)
-        else:
-            qinput, x_scale = per_tensor_quantize_dynamic(x)
-
-        # TODO: Inefficient loop over each shard since there is a per-tensor
-        # scale for each shard.
-        # To be replaced by cutlass gemm with epilogue fusion for performance.
-        output = torch.zeros(x.shape[0],
-                             layer.weight.shape[0],
-                             dtype=x.dtype,
-                             device="cuda")
-        start_offset = 0
-        for _, (logical_width, w_scale) in enumerate(
-                zip(layer.logical_widths, layer.weight_scale)):
-            end_offset = start_offset + logical_width
-
-            out, _ = torch._scaled_mm(
-                qinput,
-                layer.weight[start_offset:end_offset, :].t(),
-                out_dtype=x.dtype,
-                scale_a=x_scale,
-                scale_b=w_scale,
-            )
-
-            output[:, start_offset:end_offset] = out
-            start_offset = end_offset
+        # ops.scaled_fp8_quant supports both dynamic and static quant.
+        # If dynamic, layer.act_scale is None and x_scale computed from x.
+        # If static, layer.act_scale is scalar and x_scale set to act_scale.
+        qinput, x_scale = ops.scaled_fp8_quant(x, layer.act_scale)
+
+        # Case 1: we have one single scale for N logical weights.
+        if layer.logical_widths is None:
+            output, _ = torch._scaled_mm(
+                qinput,
+                layer.weight.t(),
+                out_dtype=x.dtype,
+                scale_a=x_scale,
+                scale_b=layer.weight_scale,
+            )
+
+        # Case 2: We have N weight_scales for N logical weights.
+        # Current: inefficient for loop to apply each logical GEMM_DQ.
+        # TODO: replace with cutlass gemm_dq with epilogue fusion.
+ else: + output = torch.empty(x.shape[0], layer.weight.shape[0], + dtype=x.dtype, device="cuda") + start = 0 + # Loop over the N logical shards. + for logical_width, w_scale in zip(layer.logical_widths, layer.weight_scale): + end = start + logical_width + out, _ = torch._scaled_mm( + qinput, + layer.weight[start:end, :].t(), + out_dtype=x.dtype, + scale_a=x_scale, + scale_b=w_scale, + ) + output[:, start:end] = out + start = end if bias is not None: - output = output + bias + output.add_(bias) return output - - -def per_tensor_quantize_static(tensor: torch.Tensor, - inv_scale: float) -> torch.Tensor: - """Quantize a tensor using per-tensor static scaling factor. - Args: - tensor: The input tensor. - inv_scale: The scale. - """ - # Scale and clamp the tensor to bring it to - # the representative range of float8 data type - # (as default cast is unsaturated) - finfo = torch.finfo(torch.float8_e4m3fn) - qweight = (tensor / inv_scale).clamp(min=finfo.min, max=finfo.max) - return qweight.to(torch.float8_e4m3fn) - - -def per_tensor_quantize_dynamic( - tensor: torch.Tensor) -> Tuple[torch.Tensor, float]: - """Quantize a tensor using per-tensor dynamic scaling factor. - Args: - tensor: The input tensor. - """ - finfo = torch.finfo(torch.float8_e4m3fn) - # Calculate the scale as dtype max divided by absmax. - # Since .abs() creates a new tensor, we use aminmax to get - # the min and max first and then calculate the absmax. - min_val, max_val = tensor.aminmax() - amax = min_val.abs().max(max_val.abs()) - scale = finfo.max / amax.clamp(min=1e-12) - # scale and clamp the tensor to bring it to - # the representative range of float8 data type - # (as default cast is unsaturated) - qweight = (tensor * scale).clamp(min=finfo.min, max=finfo.max) - # Return both float8 data and the inverse scale (as float), - # as both required as inputs to torch._scaled_mm - qweight = qweight.to(torch.float8_e4m3fn) - scale = scale.float().reciprocal() - return qweight, scale From 02f683e6f4139107d30b088aa6a40da773a14751 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sat, 27 Apr 2024 13:56:29 +0000 Subject: [PATCH 63/90] working for dense models --- run_fp8.py | 10 ++++- .../model_executor/layers/quantization/fp8.py | 38 +++++++++++++++---- 2 files changed, 38 insertions(+), 10 deletions(-) diff --git a/run_fp8.py b/run_fp8.py index fdea9e66d8b45..507f9972a740f 100644 --- a/run_fp8.py +++ b/run_fp8.py @@ -4,7 +4,12 @@ from vllm import LLM -choices = ["llama-static", "mistral-static", "mistral-dynamic", "mixtral-static", "tinyllama-fp16"] +choices = ["llama-static", + "mistral-static", + "mistral-dynamic", + "mixtral-static", + "tinyllama-fp16", + "qwen-fp16"] parser = argparse.ArgumentParser() parser.add_argument("--type", choices=choices) @@ -22,6 +27,8 @@ model_name = "nm-testing/Mixtral-8x7B-Instruct-v0.1-FP8" elif args.type == 'tinyllama-fp16': model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + elif args.type == 'qwen-fp16': + model_name = "Qwen/CodeQwen1.5-7B-Chat" else: raise ValueError(f"--type should be in {choices}") @@ -39,6 +46,5 @@ print(f"----- Prompt: {prompt}") outputs = model.generate(prompt) - print(outputs) generation = outputs[0].outputs[0].text print(f"----- Generation: {generation}") diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 68a29dc334142..f291bc2ff9cad 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -160,25 +160,40 @@ def process_weights_after_loading(self, 
layer: Module) -> None:
         if not hasattr(layer, "process_after_load") or not layer.process_after_load:
             return
 
-        # If the model was not serialized, quantize the weights.
+        # If the checkpoint is fp16/bf16 (not serialized fp8), quantize the weights.
         if not self.quant_config.is_serialized:
             qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None)
-            layer.weight = Parameter(qweight, requires_grad=False)
+            layer.weight = Parameter(qweight.t(), requires_grad=False)
             layer.weight_scale = Parameter(weight_scale, requires_grad=False)
             layer.logical_widths = None
             layer.act_scale = None
             return
 
-        # If the model is serialized, cleanup the weight_scales / act_scales.
-        else:
+        # If the checkpoint is serialized fp8, cleanup state_dict --> apply_weights.
+        # TODO: this will be cleaned up once we have the cutlass kernels.
+        else:
+            # WEIGHT
+            # Transpose weight for passing to torch._scaled_mm
+            weight = layer.weight
+            layer.weight = Parameter(weight.t(), requires_grad=False)
+
+            # WEIGHT_SCALE
+            # If we only have one logical shard, avoid the for loop in apply weights.
+            # TODO: once we have the cutlass_gemm, this will be removed.
             if len(layer.logical_widths) == 1:
                 layer.weight_scale = Parameter(layer.weight_scale.max(), requires_grad=False)
                 layer.logical_widths = None
 
             # ACT_SCALE
+            # Dynamic: set to None (required input to ops.scaled_fp8_quant).
+            # Static: set to max of the act_scales (since they are equal to each other).
             if self.quant_config.activation_scheme == "dynamic":
                 layer.act_scale = None
             elif self.quant_config.activation_scheme == "static":
+                if not all_close_1d(layer.act_scale):
+                    raise ValueError(
+                        "All the act_scales for the logical weights of a layer "
+                        f"must be equal. But got {layer.act_scale}")
-                # Act_scale for each logical input is the same, so take max().
                 layer.act_scale = Parameter(layer.act_scale.max(), requires_grad=False)
             else:
                 raise ValueError(f"Unknown activation_scheme {self.quant_config.activation_scheme}")
@@ -196,7 +211,7 @@ def apply_weights(self,
         if layer.logical_widths is None:
             output, _ = torch._scaled_mm(
                 qinput,
-                layer.weight.t(),
+                layer.weight,
                 out_dtype=x.dtype,
                 scale_a=x_scale,
                 scale_b=layer.weight_scale,
             )
 
@@ -206,7 +221,7 @@ def apply_weights(self,
         # Current: inefficient for loop to apply each logical GEMM_DQ.
         # TODO: replace with cutlass gemm_dq with epilogue fusion.
         else:
-            output = torch.empty(x.shape[0], layer.weight.shape[0],
+            output = torch.empty(x.shape[0], layer.weight.shape[1],
                                  dtype=x.dtype, device="cuda")
             start = 0
             # Loop over the N logical shards.
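For reference, a self-contained sketch of the per-shard GEMM-dequant pattern these hunks converge on (hypothetical shapes and unit scales; assumes an FP8-capable GPU and a PyTorch build of this era, where torch._scaled_mm returns an (output, amax) tuple and expects a column-major second operand with dimensions divisible by 16):

    import torch

    M, K = 4, 64
    logical_widths = [32, 32]  # e.g. two fused output shards

    x = torch.randn(M, K, device="cuda")
    w = torch.randn(sum(logical_widths), K, device="cuda")

    qinput = x.to(torch.float8_e4m3fn)
    qweight = w.to(torch.float8_e4m3fn).t()  # (K, N) column-major, as after the load-time transpose
    x_scale = torch.tensor(1.0, device="cuda")
    w_scales = [torch.tensor(1.0, device="cuda")] * len(logical_widths)

    output = torch.empty(M, sum(logical_widths), dtype=x.dtype, device="cuda")
    start = 0
    for width, w_scale in zip(logical_widths, w_scales):
        end = start + width
        out, _ = torch._scaled_mm(
            qinput,
            qweight[:, start:end],  # column slice stays column-major
            out_dtype=x.dtype,
            scale_a=x_scale,
            scale_b=w_scale,
        )
        output[:, start:end] = out
        start = end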
@@ -214,7 +229,7 @@ def apply_weights(self, end = start + logical_width out, _ = torch._scaled_mm( qinput, - layer.weight[start:end, :].t(), + layer.weight[:, start:end], out_dtype=x.dtype, scale_a=x_scale, scale_b=w_scale, @@ -226,3 +241,10 @@ def apply_weights(self, output.add_(bias) return output + +def all_close_1d(x: torch.Tensor): + assert len(x.shape) == 1 + for i in range(x.shape[0]): + if not torch.allclose(x[0], x[i]): + return False + return True From 81b73ef23d6335476f26b493cbdd35f05ef8f005 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Sat, 27 Apr 2024 09:59:58 -0400 Subject: [PATCH 64/90] Update weight_utils.py --- vllm/model_executor/model_loader/weight_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index c061c9b8ab68c..9995f2afe3cf7 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -147,7 +147,6 @@ def get_quant_config(model_config: ModelConfig, f for f in config_files if any( f.endswith(x) for x in possible_config_filenames) ] - if len(quant_config_files) == 0: raise ValueError( f"Cannot find the config file for {model_config.quantization}") From 58dbe0fae451024659a8f50acbf0b12b69e3f987 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sat, 27 Apr 2024 14:01:51 +0000 Subject: [PATCH 65/90] moving mixtral updates to separate pr --- vllm/model_executor/models/mixtral.py | 105 +++++++++++--------------- 1 file changed, 42 insertions(+), 63 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 3c9b7da426635..c5dd1a63e2f7a 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -27,6 +27,7 @@ from torch import nn from transformers import MixtralConfig +from vllm import _custom_ops as ops from vllm.attention import Attention, AttentionMetadata from vllm.config import LoRAConfig from vllm.distributed import (get_tensor_model_parallel_rank, @@ -34,13 +35,13 @@ tensor_model_parallel_all_reduce) from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - QKVParallelLinear, +from vllm.model_executor.layers.linear import (QKVParallelLinear, ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization.fp8 import (Fp8LinearMethod, - per_tensor_quantize) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.quantization.fp8 import Fp8Config from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -69,7 +70,7 @@ def __init__( intermediate_size: int, params_dtype: Optional[torch.dtype] = None, tp_size: Optional[int] = None, - linear_method: Optional[LinearMethodBase] = None, + quant_config: Optional[QuantizationConfig] = None, ): super().__init__() self.tp_size = tp_size or get_tensor_model_parallel_world_size() @@ -79,8 +80,7 @@ def __init__( self.intermediate_size = intermediate_size // self.tp_size # FIXME(pcmoritz): Make this more general to support different # quantization schemes - self.use_fp8 = (isinstance(linear_method, 
Fp8LinearMethod) or - isinstance(linear_method, FP8LinearMethod)) + self.use_fp8 = isinstance(quant_config, Fp8Config) if params_dtype is None: params_dtype = torch.get_default_dtype() @@ -90,23 +90,20 @@ def __init__( self.num_total_experts, bias=False, params_dtype=self.params_dtype, - linear_method=None) - - if self.use_fp8: - params_dtype = torch.float8_e4m3fn + quant_config=None) self.ws = nn.Parameter( torch.empty(self.num_total_experts, 2 * self.intermediate_size, self.hidden_size, device="cuda", - dtype=params_dtype)) + dtype=self.params_dtype)) self.w2s = nn.Parameter( torch.empty(self.num_total_experts, self.hidden_size, self.intermediate_size, device="cuda", - dtype=params_dtype)) + dtype=self.params_dtype)) set_weight_attrs(self.ws, { "weight_loader": self.weight_loader, @@ -124,23 +121,15 @@ def __init__( torch.ones( self.num_total_experts, device="cuda", dtype=torch.float32), requires_grad=False) if self.use_fp8 else None - - set_weight_attrs(self.ws_scale, { - "weight_loader": self.weight_loader, - }) - set_weight_attrs(self.w2s_scale, { - "weight_loader": self.weight_loader, - }) # Scaling factors for FP8 activations need_act_scales = (self.use_fp8 - and linear_method.quant_config.activation_scheme - == "static") + and quant_config.activation_scheme == "static") self.as_scale = nn.Parameter( - torch.zeros(self.num_total_experts, device="cuda", dtype=torch.float32), + torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None self.a2s_scale = nn.Parameter( - torch.zeros(self.num_total_experts, device="cuda", dtype=torch.float32), + torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None if need_act_scales: @@ -163,29 +152,27 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, param_data[expert_id, shard_size:2 * shard_size, :] = loaded_weight[shard, :] if weight_name.endswith("w2.weight"): - param_data[expert_id] = loaded_weight[:, shard] - if "act_scale" in weight_name or "weight_scale" in weight_name: - param_data[expert_id] = loaded_weight - - # def process_weights_after_loading(self): - # if self.use_fp8: - # ws = torch.empty_like(self.ws.data, dtype=torch.float8_e4m3fn) - # w2s = torch.empty_like(self.w2s.data, dtype=torch.float8_e4m3fn) - # for expert in range(self.num_total_experts): - # ws[expert, :, :], self.ws_scale[expert] = per_tensor_quantize( - # self.ws.data[expert, :, :]) - # w2s[expert, :, :], self.w2s_scale[ - # expert] = per_tensor_quantize(self.w2s.data[expert, :, :]) - # self.ws = nn.Parameter(ws, requires_grad=False) - # self.w2s = nn.Parameter(w2s, requires_grad=False) + param_data[expert_id, :, :] = loaded_weight[:, shard] + if "act_scale" in weight_name: + param_data[:] = param_data[:].max(loaded_weight) + + def process_weights_after_loading(self): + if self.use_fp8: + ws = torch.empty_like(self.ws.data, dtype=torch.float8_e4m3fn) + w2s = torch.empty_like(self.w2s.data, dtype=torch.float8_e4m3fn) + for expert in range(self.num_total_experts): + ws[expert, :, :], self.ws_scale[expert] = ops.scaled_fp8_quant( + self.ws.data[expert, :, :]) + w2s[expert, :, :], self.w2s_scale[ + expert] = ops.scaled_fp8_quant(self.w2s.data[expert, :, :]) + self.ws = nn.Parameter(ws, requires_grad=False) + self.w2s = nn.Parameter(w2s, requires_grad=False) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: num_tokens, hidden_size = hidden_states.shape hidden_states = hidden_states.view(-1, self.hidden_size) # router_logits: (num_tokens, n_experts) 
router_logits, _ = self.gate(hidden_states) - - # TODO: fused MoE kernel might want to take different scales for each expert? final_hidden_states = fused_moe(hidden_states, self.ws, self.w2s, @@ -196,8 +183,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: use_fp8=self.use_fp8, w1_scale=self.ws_scale, w2_scale=self.w2s_scale, - a1_scale=self.as_scale.max(), - a2_scale=self.a2s_scale.max()) + a1_scale=self.as_scale, + a2_scale=self.a2s_scale) if self.tp_size > 1: final_hidden_states = tensor_model_parallel_all_reduce( @@ -214,7 +201,7 @@ def __init__(self, num_kv_heads: int, max_position: int = 4096 * 32, rope_theta: float = 10000, - linear_method: Optional[LinearMethodBase] = None, + quant_config: Optional[QuantizationConfig] = None, sliding_window: Optional[int] = None) -> None: super().__init__() self.hidden_size = hidden_size @@ -239,12 +226,12 @@ def __init__(self, self.rope_theta = rope_theta self.sliding_window = sliding_window - if isinstance(linear_method, Fp8LinearMethod): + if isinstance(quant_config, Fp8Config): print_warning_once( "For Mixtral FP8 quantization, we currently do not quantize " "the attention layers until their FP8 performance is improved." ) - linear_method = None + quant_config = None self.qkv_proj = QKVParallelLinear( hidden_size, @@ -252,13 +239,13 @@ def __init__(self, self.total_num_heads, self.total_num_kv_heads, bias=False, - linear_method=linear_method, + quant_config=quant_config, ) self.o_proj = RowParallelLinear( self.total_num_heads * self.head_dim, hidden_size, bias=False, - linear_method=linear_method, + quant_config=quant_config, ) self.rotary_emb = get_rope( self.head_dim, @@ -295,7 +282,7 @@ class MixtralDecoderLayer(nn.Module): def __init__( self, config: MixtralConfig, - linear_method: Optional[LinearMethodBase] = None, + quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -308,13 +295,13 @@ def __init__( num_kv_heads=config.num_key_value_heads, rope_theta=rope_theta, sliding_window=config.sliding_window, - linear_method=linear_method) + quant_config=quant_config) self.block_sparse_moe = MixtralMoE( num_experts=config.num_local_experts, top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, - linear_method=linear_method) + quant_config=quant_config) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = RMSNorm(config.hidden_size, @@ -354,7 +341,7 @@ class MixtralModel(nn.Module): def __init__( self, config: MixtralConfig, - linear_method: Optional[LinearMethodBase] = None, + quant_config: Optional[QuantizationConfig] = None, lora_config: Optional[LoRAConfig] = None, ) -> None: super().__init__() @@ -370,7 +357,7 @@ def __init__( org_num_embeddings=config.vocab_size, ) self.layers = nn.ModuleList([ - MixtralDecoderLayer(config, linear_method=linear_method) + MixtralDecoderLayer(config, quant_config=quant_config) for _ in range(config.num_hidden_layers) ]) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -420,14 +407,13 @@ class MixtralForCausalLM(nn.Module): def __init__( self, config: MixtralConfig, - linear_method: Optional[LinearMethodBase] = None, + quant_config: Optional[QuantizationConfig] = None, lora_config: Optional[LoRAConfig] = None, ) -> None: super().__init__() self.config = config - self.linear_method = linear_method self.model = MixtralModel(config, - linear_method, + quant_config, lora_config=lora_config) 
self.unpadded_vocab_size = config.vocab_size if lora_config: @@ -479,13 +465,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ] expert_params_mapping = [ - # These are the activation scales for the experts - # (param_name, weight_name, expert_id) - ("ws_scale" if weight_name in ["w1", "w3"] else "w2s_scale", - f"experts.{expert_id}.{weight_name}.weight_scale", expert_id) - for expert_id in range(self.config.num_local_experts) - for weight_name in ["w1", "w2", "w3"] - ] + [ # These are the weights for the experts # (param_name, weight_name, expert_id) ("ws" if weight_name in ["w1", "w3"] else "w2s", From a8d4b334d79a53dcfbcebce5a217ed7a3f49b07c Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sat, 27 Apr 2024 14:15:56 +0000 Subject: [PATCH 66/90] make ./format pass --- run_fp8.py | 21 ++-- .../model_executor/layers/quantization/fp8.py | 117 ++++++++++-------- 2 files changed, 73 insertions(+), 65 deletions(-) diff --git a/run_fp8.py b/run_fp8.py index 507f9972a740f..9ddd687c0e8e4 100644 --- a/run_fp8.py +++ b/run_fp8.py @@ -4,12 +4,10 @@ from vllm import LLM -choices = ["llama-static", - "mistral-static", - "mistral-dynamic", - "mixtral-static", - "tinyllama-fp16", - "qwen-fp16"] +choices = [ + "llama-static", "mistral-static", "mistral-dynamic", "mixtral-static", + "tinyllama-fp16", "qwen-fp16" +] parser = argparse.ArgumentParser() parser.add_argument("--type", choices=choices) @@ -39,10 +37,13 @@ tokenizer = AutoTokenizer.from_pretrained(model_name) - prompt = tokenizer.apply_chat_template([{ - "role": "user", - "content": "What is open source software?" - }], tokenize=False, add_generation_prompt=True) + prompt = tokenizer.apply_chat_template( + [{ + "role": "user", + "content": "What is open source software?" + }], + tokenize=False, + add_generation_prompt=True) print(f"----- Prompt: {prompt}") outputs = model.generate(prompt) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index ddeb95abb19a1..a4debe8aec224 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -5,7 +5,7 @@ from torch.nn.parameter import Parameter from vllm import _custom_ops as ops -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.linear import LinearBase from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.utils import set_weight_attrs @@ -45,7 +45,6 @@ def get_config_filenames(cls) -> List[str]: return [] @classmethod - def from_config(cls, config: Dict[str, Any]) -> "Fp8Config": quant_method = cls.get_from_keys(config, ["quant_method"]) is_serialized = ("fp8" in quant_method) @@ -53,8 +52,7 @@ def from_config(cls, config: Dict[str, Any]) -> "Fp8Config": return cls(is_serialized=is_serialized, activation_scheme=activation_scheme) - def get_quant_method( - self, layer: torch.nn.Module) -> "Fp8LinearMethod": + def get_quant_method(self, layer: torch.nn.Module) -> "Fp8LinearMethod": if isinstance(layer, LinearBase): return Fp8LinearMethod(self) return None @@ -63,7 +61,7 @@ def get_scaled_act_names(self) -> List[str]: return [] -class Fp8LinearMethod(LinearMethodBase): +class Fp8LinearMethod(QuantizeMethodBase): """Linear method for FP8. Supports loading FP8 checkpoints with static weight scale and dynamic/scale activation scale. 
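Before the create_weights hunk below, it may help to see the shape bookkeeping in isolation: a fused layer built from several logical partitions stores one flat fp8 weight plus one fp32 scale slot per partition. A minimal sketch with made-up Llama-style sizes (the Parameter registration and set_weight_attrs plumbing of the real method are omitted):

import torch
from torch.nn import Parameter

output_partition_sizes = [11008, 11008]   # e.g. a fused gate/up projection
input_size_per_partition = 4096

# One flat fp8 weight covering every partition.
weight = Parameter(torch.empty(sum(output_partition_sizes),
                               input_size_per_partition,
                               dtype=torch.float8_e4m3fn),
                   requires_grad=False)

# One scale slot per logical partition; the shard indexer fills slot i
# when checkpoint shard i is loaded.
weight_scale = Parameter(torch.empty(len(output_partition_sizes),
                                     dtype=torch.float32),
                         requires_grad=False)

logical_widths = output_partition_sizes   # consumed later by apply()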
@@ -97,46 +95,49 @@ def create_weights( del input_size, output_size output_size_per_partition = sum(output_partition_sizes) - layer.process_after_load = True + layer.process_after_loading = True layer.logical_widths = output_partition_sizes - + # WEIGHT - weight_dtype = torch.float8_e4m3fn if self.quant_config.is_serialized else params_dtype - weight = Parameter( - torch.empty(output_size_per_partition, - input_size_per_partition, - dtype=weight_dtype), - requires_grad=False) + weight_dtype = (torch.float8_e4m3fn + if self.quant_config.is_serialized else params_dtype) + weight = Parameter(torch.empty(output_size_per_partition, + input_size_per_partition, + dtype=weight_dtype), + requires_grad=False) layer.register_parameter("weight", weight) set_weight_attrs(weight, { **extra_weight_attrs, - "input_dim": 1, "output_dim": 0, + "input_dim": 1, + "output_dim": 0, }) - # SCALES - # We only need to load scales if the model is serialized FP8. - # Otherwise, scale creation is delayed until `process_weights_after_loading`. + # If checkpoint is serialized fp8, load them. + # Otherwise, wait until process_weights_after_loading. if self.quant_config.is_serialized: # WEIGHT SCALE - weight_scale = Parameter( - torch.empty(len(output_partition_sizes), dtype=torch.float32), - requires_grad=False) + weight_scale = Parameter(torch.empty(len(output_partition_sizes), + dtype=torch.float32), + requires_grad=False) layer.register_parameter("weight_scale", weight_scale) set_weight_attrs(weight_scale, { **extra_weight_attrs, - "shard_indexer": self.scales_shard_indexer, + "shard_indexer": + self.scales_shard_indexer, }) # ACTIVATION SCALE if self.quant_config.activation_scheme == "static": - act_scale = Parameter( - torch.empty(len(output_partition_sizes), dtype=torch.float32), - requires_grad=False) + act_scale = Parameter(torch.empty(len(output_partition_sizes), + dtype=torch.float32), + requires_grad=False) layer.register_parameter("act_scale", act_scale) - set_weight_attrs(act_scale, { - **extra_weight_attrs, - "shard_indexer": self.scales_shard_indexer, - }) + set_weight_attrs( + act_scale, { + **extra_weight_attrs, + "shard_indexer": + self.scales_shard_indexer, + }) def shard_id_as_int(self, shard_id: Union[str, int]) -> int: if isinstance(shard_id, int): @@ -161,36 +162,40 @@ def process_weights_after_loading(self, layer: Module) -> None: # only linear layers invoke "create_weights". So we check # whether "weight_scale" is registered to determine # whether the layer is a linear layer that requires quantization. - if not hasattr(layer, "process_after_load") or not layer.process_after_load: + if not hasattr( + layer, + "process_after_loading") or not layer.process_after_load: return - # If the checkpoint is fp16/bf16 (not serialized fp8), quantize the weights. + # If checkpoint is fp1616 (not serialized fp8), quantize the weights. if not self.quant_config.is_serialized: - qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None) + qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, + scale=None) layer.weight = Parameter(qweight.t(), requires_grad=False) layer.weight_scale = Parameter(weight_scale, requires_grad=False) layer.logical_widths = None layer.act_scale = None return - # If the checkpoint is serialized fp8, cleanup state_dict --> apply_weights. - # TODO: this will be cleaned up once we have the cutlass kernels. - else: + # TODO: cutlass kernels will remove the need for much of this logic. + # If the checkpoint is serialized fp8, we already loaded quantized. 
+ # So, just cleanup the Parameters for easier use in apply() + else: # WEIGHT - # Tranpose weight for passing to torch._scaled_mm + # Transpose weight for passing to torch._scaled_mm weight = layer.weight layer.weight = Parameter(weight.t(), requires_grad=False) - + # WEIGHT_SCALE - # If we only have one logical shard, avoid the for loop in apply weights. - # TODO: once we have the cutlass_gemm, this will be removed. + # If we only have one logical shard, avoid the loop in apply(). if len(layer.logical_widths) == 1: - layer.weight_scale = Parameter(layer.weight_scale.max(), requires_grad=False) + layer.weight_scale = Parameter(layer.weight_scale.max(), + requires_grad=False) layer.logical_widths = None - + # ACT_SCALE - # Dyanmic: set to None (required input to ops.scaled_fp8_quant). - # Static: set to max of the act_scales (since they are equal to eachoter). + # Dynamic: set to None (required input to ops.scaled_fp8_quant). + # Static: set to max of the act_scales (since they are equal). if self.quant_config.activation_scheme == "dynamic": layer.act_scale = None elif self.quant_config.activation_scheme == "static": @@ -198,9 +203,11 @@ def process_weights_after_loading(self, layer: Module) -> None: raise ValueError( "All the act_scales for the logical weights of a layer " f"must be equal. But got {layer.act_scale}") - layer.act_scale = Parameter(layer.act_scale.max(), requires_grad=False) + layer.act_scale = Parameter(layer.act_scale.max(), + requires_grad=False) else: - raise ValueError(f"Unknown activation_scheme {self.quant_config.activation_scheme}") + raise ValueError( + f"Unknown scheme {self.quant_config.activation_scheme}") def apply(self, layer: torch.nn.Module, @@ -220,16 +227,18 @@ def apply(self, scale_a=x_scale, scale_b=layer.weight_scale, ) - - # TODO: replace will cutlass gemm_dq with epilogue fusion. + + # TODO: replace naive loop with cutlass gemm_dq w/ epilogue fusion. # Case 2: We have N weigth_scales for N logical weights. - # Current: inefficient for loop to apply each logical GEMM_DQ. else: - output = torch.empty(x.shape[0], layer.weight.shape[1], - dtype=x.dtype, device="cuda") + output = torch.empty(x.shape[0], + layer.weight.shape[1], + dtype=x.dtype, + device="cuda") start = 0 # Loop over the N logical shards. - for logical_width, w_scale in zip(layer.logical_widths, layer.weight_scale): + for logical_width, w_scale in zip(layer.logical_widths, + layer.weight_scale): end = start + logical_width out, _ = torch._scaled_mm( qinput, @@ -245,10 +254,8 @@ def apply(self, output.add_(bias) return output - + + def all_close_1d(x: torch.Tensor): assert len(x.shape) == 1 - for i in range(x.shape[0]): - if not torch.allclose(x[0], x[i]): - return False - return True + return all(torch.allclose(x[0], x[i]) for i in range(x.shape[0])) From 5be09702d205d033863de068d99b074ab05de761 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sat, 27 Apr 2024 14:46:30 +0000 Subject: [PATCH 67/90] better comments in linear.py --- vllm/model_executor/layers/linear.py | 90 ++++++++----------- .../model_executor/layers/quantization/fp8.py | 35 ++++---- 2 files changed, 51 insertions(+), 74 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index dc0723a49f526..9606662f7fea5 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -240,6 +240,9 @@ def __init__( self.register_parameter("bias", None) def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): + # Special case for Fp8 scales. 
+ shard_indexer = getattr(param, "shard_indexer", None) + tp_rank = get_tensor_model_parallel_rank() output_dim = getattr(param, "output_dim", None) param_data = param.data @@ -248,10 +251,10 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): start_idx = tp_rank * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - # TODO: canon - # This is for loading scales for fp8, which have no dims. - if len(loaded_weight.shape) == 0: - loaded_weight = loaded_weight.reshape(1) + # Special case for Fp8 scales. + elif shard_indexer is not None: + param_data, loaded_weight = shard_indexer( + param_data, loaded_weight, shard_id=0) assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) @@ -315,21 +318,10 @@ def weight_loader(self, param_data = param.data output_dim = getattr(param, "output_dim", None) + # Special case for AQLM codebooks. is_metadata = getattr(param, "is_metadata", False) - - # TODO: document. - # TODO: sync with is_metadata. - # For loading scales. + # Special case for Fp8 scales. shard_indexer = getattr(param, "shard_indexer", None) - logical_widths = getattr(param, "logical_widths", None) - if output_dim is not None and shard_indexer is not None: - raise NotImplementedError( - "We do not currently support output_dim != None and " - "shard_indexer != None for a parameter. Please open an issue.") - if loaded_shard_id is None and shard_indexer is not None: - raise NotImplementedError( - "We do not currently support loaded_shard_id == None and " - "shard_indexer != None for a parameter. Please open an issue.") if loaded_shard_id is None: # Loaded weight is already packed. @@ -344,14 +336,13 @@ def weight_loader(self, current_shard_offset += output_size packed_dim = getattr(param, "packed_dim", None) for shard_id, shard_offset, shard_size in shard_offsets: + # Special case for Quantization. # If quantized, we need to adjust the offset and size to account # for the packing. if packed_dim == output_dim: shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - - # If marlin, we need to adjust the offset and size to - # account for the tiling. + # Special case for Marlin. shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) @@ -366,15 +357,14 @@ def weight_loader(self, if output_dim is not None: shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size shard_size = self.output_sizes[loaded_shard_id] // tp_size + # Special case for quantization. # If quantized, we need to adjust the offset and size to account # for the packing. packed_dim = getattr(param, "packed_dim", None) if packed_dim == output_dim: shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - - # If marlin, we need to adjust the offset and size to - # account for the tiling. + # Special case for Marlin. shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) @@ -383,19 +373,16 @@ def weight_loader(self, start_idx = tp_rank * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + # Special case for AQLM codebooks. elif is_metadata: # metadata indicates fixed size concatenated along dim 0 shard_size = loaded_weight.shape[0] shard_offset = loaded_shard_id * shard_size param_data = param_data.narrow(0, shard_offset, shard_size) - - # TODO: sync with is_metadata UX. - # If a param_shard_splitter is defined by the LinearMethod, use it. + # Special case sharding for Fp8 scales. 
elif shard_indexer is not None: - param_data, loaded_weight = shard_indexer(param_data, - loaded_weight, - loaded_shard_id, - logical_widths) + param_data, loaded_weight = shard_indexer( + param_data, loaded_weight, loaded_shard_id) else: ignore_warning = getattr(param, "ignore_warning", False) @@ -477,19 +464,10 @@ def weight_loader(self, loaded_shard_id: Optional[str] = None): param_data = param.data output_dim = getattr(param, "output_dim", None) + # Special case for AQLM codebooks. is_metadata = getattr(param, "is_metadata", False) - - # TODO: sync with is_metadata UX + # Special case for Fp8 scales. shard_indexer = getattr(param, "shard_indexer", None) - logical_widths = getattr(param, "logical_widths", None) - if output_dim is not None and shard_indexer is not None: - raise NotImplementedError( - "We do not currently support output_dim != None and " - "shard_indexer != None for a parameter. Please open an issue.") - if loaded_shard_id is None and shard_indexer is not None: - raise NotImplementedError( - "We do not currently support loaded_shard_id == None and " - "shard_indexer != None for a parameter. Please open an issue.") if loaded_shard_id is None: # Loaded weight is already packed. @@ -507,14 +485,14 @@ def weight_loader(self, ] packed_dim = getattr(param, "packed_dim", None) for shard_id, shard_offset, shard_size in shard_offsets: + # Special case for Quantized Weights. # If quantized, we need to adjust the offset and size to account # for the packing. if packed_dim == output_dim: shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - # If marlin, we need to adjust the offset and size to - # account for the tiling. + # Special case for Marlin. shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) @@ -536,6 +514,7 @@ def weight_loader(self, shard_offset = (self.num_heads + self.num_kv_heads) * self.head_size shard_size = self.num_kv_heads * self.head_size + # Special case for Quantized Weights. # If quantized, we need to adjust the offset and size to account # for the packing. packed_dim = getattr(param, "packed_dim", None) @@ -543,8 +522,7 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - # If marlin, we need to adjust the offset and size to - # account for the tiling. + # Special case for Marlin. shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) @@ -557,19 +535,18 @@ def weight_loader(self, start_idx = shard_id * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + # Special case for for AQLM codebooks. elif is_metadata: # metadata indicates fixed size concatenated along dim 0 shard_size = loaded_weight.shape[0] shard_index = ["q", "k", "v"].index(loaded_shard_id) param_data = param_data.narrow(0, shard_index * shard_size, shard_size) - # TODO: sync with QKV - # If a param_shard_splitter is defined by the LinearMethod, use it. + # Special case for for Fp8 scales. elif shard_indexer is not None: - param_data, loaded_weight = shard_indexer(param_data, - loaded_weight, - loaded_shard_id, - logical_widths) + + param_data, loaded_weight = shard_indexer( + param_data, loaded_weight, loaded_shard_id) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: @@ -650,6 +627,9 @@ def __init__( self.register_parameter("bias", None) def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): + # Special case for Fp8 scales. 
+ shard_indexer = getattr(param, "shard_indexer", None) + tp_rank = get_tensor_model_parallel_rank() input_dim = getattr(param, "input_dim", None) param_data = param.data @@ -658,10 +638,10 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): start_idx = tp_rank * shard_size loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size) - # TODO: canon - # This is for loading scales for fp8, which have no dims. - if len(loaded_weight.shape) == 0: - loaded_weight = loaded_weight.reshape(1) + # Special case for Fp8 scales. + elif shard_indexer is not None: + param_data, loaded_weight = shard_indexer( + param_data, loaded_weight, shard_id=0) assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index a4debe8aec224..37e98b84134d0 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -95,7 +95,7 @@ def create_weights( del input_size, output_size output_size_per_partition = sum(output_partition_sizes) - layer.process_after_loading = True + layer.process_after_load = True layer.logical_widths = output_partition_sizes # WEIGHT @@ -139,32 +139,29 @@ def create_weights( self.scales_shard_indexer, }) - def shard_id_as_int(self, shard_id: Union[str, int]) -> int: - if isinstance(shard_id, int): - return shard_id - assert isinstance(shard_id, str) + def scales_shard_indexer( + self, param: torch.Tensor, loaded_weight: torch.Tensor, + shard_id: Union[str, int]) -> Tuple[torch.Tensor, torch.Tensor]: qkv_idxs = {"q": 0, "k": 1, "v": 2} - assert shard_id in qkv_idxs - return qkv_idxs[shard_id] - def scales_shard_indexer( - self, - param: torch.Tensor, - loaded_weight: torch.Tensor, - shard_id: Union[str, int], - logical_widths: List[int], - ) -> Tuple[torch.Tensor, torch.Tensor]: - del logical_widths - return param[self.shard_id_as_int(shard_id)], loaded_weight + if isinstance(shard_id, int): + pass + elif isinstance(shard_id, str): + if shard_id not in qkv_idxs: + raise ValueError(f"Unknown shard_id: {shard_id}") + shard_id = qkv_idxs[shard_id] + else: + ValueError(f"Shard id must be int or str but got {type(shard_id)}") + + return param[shard_id], loaded_weight def process_weights_after_loading(self, layer: Module) -> None: # Although the quant_method is propagated to all layers, # only linear layers invoke "create_weights". So we check # whether "weight_scale" is registered to determine # whether the layer is a linear layer that requires quantization. - if not hasattr( - layer, - "process_after_loading") or not layer.process_after_load: + if (not hasattr(layer, "process_after_load") or + not layer.process_after_load): return # If checkpoint is fp1616 (not serialized fp8), quantize the weights. 
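To make the new indexer contract concrete, here is a condensed, self-contained version (the real method lives on Fp8LinearMethod and also raises on unknown shard ids): each fused shard id, "q"/"k"/"v" or an integer for merged column layers, selects one slot of the flat scale tensor, and the loader copies the checkpoint scalar into that slot.

from typing import Tuple, Union

import torch

def scales_shard_indexer(
        param: torch.Tensor, loaded_weight: torch.Tensor,
        shard_id: Union[str, int]) -> Tuple[torch.Tensor, torch.Tensor]:
    qkv_idxs = {"q": 0, "k": 1, "v": 2}
    if isinstance(shard_id, str):
        shard_id = qkv_idxs[shard_id]
    return param[shard_id], loaded_weight

act_scale = torch.zeros(3)                 # one slot each for q, k, v
param_data, loaded = scales_shard_indexer(act_scale, torch.tensor(0.02), "k")
param_data.copy_(loaded)                   # act_scale is now [0.00, 0.02, 0.00]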
From ef7992bd018d5aa50a4eff360e1aacccb7ea98f2 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sat, 27 Apr 2024 14:47:50 +0000 Subject: [PATCH 68/90] better comments in linear.py --- vllm/model_executor/layers/quantization/fp8.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 37e98b84134d0..ba83719c73624 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -175,8 +175,8 @@ def process_weights_after_loading(self, layer: Module) -> None: return # TODO: cutlass kernels will remove the need for much of this logic. - # If the checkpoint is serialized fp8, we already loaded quantized. - # So, just cleanup the Parameters for easier use in apply() + # If the checkpoint is serialized fp8, we already loaded quantized, + # so, just cleanup the Parameters for easier use in apply(). else: # WEIGHT # Transpose weight for passing to torch._scaled_mm From 06677919a4fb2f5a4bcbd42655bf8c6a93c0035d Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sat, 27 Apr 2024 15:04:52 +0000 Subject: [PATCH 69/90] fixed opt-125 --- run_fp8.py | 24 +++++++++++++----------- vllm/model_executor/models/opt.py | 3 ++- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/run_fp8.py b/run_fp8.py index 9ddd687c0e8e4..23298999a3050 100644 --- a/run_fp8.py +++ b/run_fp8.py @@ -6,7 +6,7 @@ choices = [ "llama-static", "mistral-static", "mistral-dynamic", "mixtral-static", - "tinyllama-fp16", "qwen-fp16" + "opt-static", "tinyllama-fp16", "qwen-fp16" ] parser = argparse.ArgumentParser() @@ -21,8 +21,8 @@ model_name = "nm-testing/mistral-fp8-static" elif args.type == "mistral-dynamic": model_name = "nm-testing/mistral-fp8-dynamic" - elif args.type == 'mixtral-static': - model_name = "nm-testing/Mixtral-8x7B-Instruct-v0.1-FP8" + elif args.type == "opt-static": + model_name = "nm-testing/opt-125m-fp8-static" elif args.type == 'tinyllama-fp16': model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" elif args.type == 'qwen-fp16': @@ -36,14 +36,16 @@ quantization="fp8") tokenizer = AutoTokenizer.from_pretrained(model_name) - - prompt = tokenizer.apply_chat_template( - [{ - "role": "user", - "content": "What is open source software?" - }], - tokenize=False, - add_generation_prompt=True) + if tokenizer.chat_template is not None: + prompt = tokenizer.apply_chat_template( + [{ + "role": "user", + "content": "What is open source software?" 
+ }], + tokenize=False, + add_generation_prompt=True) + else: + prompt = "The best thing about" print(f"----- Prompt: {prompt}") outputs = model.generate(prompt) diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 838a2f0adc4d1..133ee79260620 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -130,7 +130,7 @@ def __init__( bias=config.enable_bias, quant_config=quant_config, ) - quant_config = getattr(quant_config, "quant_config", None) + # quant_config = getattr(quant_config, "quant_config", None) self.activation_fn = get_act_fn(config.activation_function, quant_config, config.ffn_dim) self.fc2 = RowParallelLinear( @@ -323,6 +323,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) + print(params_dict.keys()) for name, loaded_weight in weights: if "lm_head.weight" in name: continue From d8adf1455f5b1e62e62f383810bd6ad812133d98 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sat, 27 Apr 2024 15:05:08 +0000 Subject: [PATCH 70/90] removed run_fp8.py --- run_fp8.py | 53 ----------------------------------------------------- 1 file changed, 53 deletions(-) delete mode 100644 run_fp8.py diff --git a/run_fp8.py b/run_fp8.py deleted file mode 100644 index 23298999a3050..0000000000000 --- a/run_fp8.py +++ /dev/null @@ -1,53 +0,0 @@ -import argparse - -from transformers import AutoTokenizer - -from vllm import LLM - -choices = [ - "llama-static", "mistral-static", "mistral-dynamic", "mixtral-static", - "opt-static", "tinyllama-fp16", "qwen-fp16" -] - -parser = argparse.ArgumentParser() -parser.add_argument("--type", choices=choices) - -if __name__ == "__main__": - args = parser.parse_args() - - if args.type == "llama-static": - model_name = "nm-testing/Meta-Llama-3-8B-Instruct-FP8" - elif args.type == "mistral-static": - model_name = "nm-testing/mistral-fp8-static" - elif args.type == "mistral-dynamic": - model_name = "nm-testing/mistral-fp8-dynamic" - elif args.type == "opt-static": - model_name = "nm-testing/opt-125m-fp8-static" - elif args.type == 'tinyllama-fp16': - model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" - elif args.type == 'qwen-fp16': - model_name = "Qwen/CodeQwen1.5-7B-Chat" - else: - raise ValueError(f"--type should be in {choices}") - - model = LLM(model_name, - enforce_eager=True, - max_model_len=1024, - quantization="fp8") - - tokenizer = AutoTokenizer.from_pretrained(model_name) - if tokenizer.chat_template is not None: - prompt = tokenizer.apply_chat_template( - [{ - "role": "user", - "content": "What is open source software?" 
- }], - tokenize=False, - add_generation_prompt=True) - else: - prompt = "The best thing about" - print(f"----- Prompt: {prompt}") - - outputs = model.generate(prompt) - generation = outputs[0].outputs[0].text - print(f"----- Generation: {generation}") From 9bb1a2b9ada6ba2586e8991ff7e81cf764385e4e Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sat, 27 Apr 2024 15:06:47 +0000 Subject: [PATCH 71/90] format --- vllm/model_executor/layers/linear.py | 20 +++++++++++-------- .../model_executor/layers/quantization/fp8.py | 4 ++-- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 9606662f7fea5..1b8171ab3ea41 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -253,8 +253,9 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): shard_size) # Special case for Fp8 scales. elif shard_indexer is not None: - param_data, loaded_weight = shard_indexer( - param_data, loaded_weight, shard_id=0) + param_data, loaded_weight = shard_indexer(param_data, + loaded_weight, + shard_id=0) assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) @@ -381,8 +382,9 @@ def weight_loader(self, param_data = param_data.narrow(0, shard_offset, shard_size) # Special case sharding for Fp8 scales. elif shard_indexer is not None: - param_data, loaded_weight = shard_indexer( - param_data, loaded_weight, loaded_shard_id) + param_data, loaded_weight = shard_indexer(param_data, + loaded_weight, + loaded_shard_id) else: ignore_warning = getattr(param, "ignore_warning", False) @@ -545,8 +547,9 @@ def weight_loader(self, # Special case for for Fp8 scales. elif shard_indexer is not None: - param_data, loaded_weight = shard_indexer( - param_data, loaded_weight, loaded_shard_id) + param_data, loaded_weight = shard_indexer(param_data, + loaded_weight, + loaded_shard_id) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: @@ -640,8 +643,9 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): shard_size) # Special case for Fp8 scales. elif shard_indexer is not None: - param_data, loaded_weight = shard_indexer( - param_data, loaded_weight, shard_id=0) + param_data, loaded_weight = shard_indexer(param_data, + loaded_weight, + shard_id=0) assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index ba83719c73624..d651216b14fb0 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -160,8 +160,8 @@ def process_weights_after_loading(self, layer: Module) -> None: # only linear layers invoke "create_weights". So we check # whether "weight_scale" is registered to determine # whether the layer is a linear layer that requires quantization. - if (not hasattr(layer, "process_after_load") or - not layer.process_after_load): + if (not hasattr(layer, "process_after_load") + or not layer.process_after_load): return # If checkpoint is fp1616 (not serialized fp8), quantize the weights. 
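With run_fp8.py deleted a couple of patches above, the quickest way to exercise an fp8 checkpoint end to end is the same flow the script used, condensed here for reference (the model name is one of the test checkpoints already referenced in this series):

from transformers import AutoTokenizer

from vllm import LLM

model_name = "nm-testing/mistral-fp8-static"
model = LLM(model_name,
            enforce_eager=True,
            max_model_len=1024,
            quantization="fp8")

tokenizer = AutoTokenizer.from_pretrained(model_name)
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What is open source software?"}],
    tokenize=False,
    add_generation_prompt=True)

outputs = model.generate(prompt)
print(outputs[0].outputs[0].text)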
From 169c9edf091efd6de980aa39aee886d320a8ccab Mon Sep 17 00:00:00 2001 From: mgoin Date: Sat, 27 Apr 2024 11:17:41 -0400 Subject: [PATCH 72/90] Cleanup opt.py --- vllm/model_executor/models/opt.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 133ee79260620..336f765ababaa 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -130,7 +130,6 @@ def __init__( bias=config.enable_bias, quant_config=quant_config, ) - # quant_config = getattr(quant_config, "quant_config", None) self.activation_fn = get_act_fn(config.activation_function, quant_config, config.ffn_dim) self.fc2 = RowParallelLinear( @@ -323,7 +322,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - print(params_dict.keys()) for name, loaded_weight in weights: if "lm_head.weight" in name: continue From 8ef9c7d1272bb519d3efa4626eb8fcd82de16305 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sat, 27 Apr 2024 16:43:47 +0000 Subject: [PATCH 73/90] added testing --- tests/models/test_fp8.py | 66 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 tests/models/test_fp8.py diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py new file mode 100644 index 0000000000000..c8e8dade2d06e --- /dev/null +++ b/tests/models/test_fp8.py @@ -0,0 +1,66 @@ +"""Compares the outputs of gptq vs gptq_marlin +""" +import os + +import pytest +import torch +from transformers import AutoTokenizer +from vllm import LLM, SamplingParams +from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS + +os.environ["TOKENIZERS_PARALLELISM"] = "true" + +MAX_MODEL_LEN = 1024 + +MODELS = [ + "nm-testing/mistral-fp8-static", + "nm-testing/mistral-fp8-dynamic", + "mistralai/Mistral-7B-Instruct-v0.2", +] + +EXPECTED_STRS_MAP = { + "nm-testing/mistral-fp8-static" : [' VLLM (Vulcan Learning Machine) is a high-performance and memory-efficient', ' 1. 1950s: The Concept of AI is Born: The term', ' Artificial Intelligence (AI) and Human Intelligence (HI) are two distinct ways of processing information.', " A neural network is a type of artificial intelligence model inspired by the human brain's structure and function", ' In the heart of a sprawling industrial city, nestled among the hum of machinery and the rhythm', ' The COVID-19 pandemic has had a profound impact on global economic structures and has forced businesses to', ' The Mona Lisa painting, created by the Italian artist Leonardo da Vinci between 15', ' Japanese: 早く起きる'], # noqa: E501 + "nm-testing/mistral-fp8-dynamic": [' VLLM (Vulcan Learning Machine) is a high-performance and memory-efficient', ' 1. 1950s: The Concept of AI is Born: The term', ' Artificial Intelligence (AI) and Human Intelligence (HI) are two distinct ways of processing information.', " A neural network is a type of artificial intelligence model inspired by the human brain's structure and function", ' In the heart of the bustling city of Neo-Tokyo, nestled among the tow', ' The COVID-19 pandemic has had a profound impact on global economic structures and has forced businesses to', ' The Mona Lisa painting, created by the Italian artist Leonardo da Vinci between 15', ' Japanese: 早く起きる鳥は虫を取る (S'], # noqa: E501 + "mistralai/Mistral-7B-Instruct-v0.2": [' VLLM (Vulcan Learning Machine) is a high-performance and memory-efficient', ' 1. 
1950s: The Concept of AI is Born: The term', ' Artificial Intelligence (AI) and Human Intelligence (HI) are two distinct ways of processing information.', " A neural network is a type of machine learning model inspired by the human brain's structure and function", ' In the heart of the bustling city of Neo-Tokyo, nestled among the tow', ' The COVID-19 pandemic has had a profound impact on global economic structures and has forced businesses to', ' The Mona Lisa painting, created by the Italian artist Leonardo da Vinci between 15', ' Japanese: 早く起きる鳥は虫を取る (S'], # noqa: E501 +} + +capability = torch.cuda.get_device_capability() +capability = capability[0] * 10 + capability[1] +fp8_not_supported = ( + capability < QUANTIZATION_METHODS["fp8"].get_min_capability()) + +@pytest.mark.skipif(fp8_not_supported, + reason="fp8 is not supported on this GPU type.") +@pytest.mark.parametrize("model_name", MODELS) +def test_models( + example_prompts, + model_name, +) -> None: + model = LLM( + model=model_name, + max_model_len=MAX_MODEL_LEN, + enforce_eager=True, + quantization="fp8") + + tokenizer = AutoTokenizer.from_pretrained(model_name) + formatted_prompts = [ + tokenizer.apply_chat_template( + [{ "role": "user", "content": prompt }], + tokenize=False, add_generation_prompt=True + ) for prompt in example_prompts ] + + params = SamplingParams(max_tokens=20, temperature=0) + generations = [] + # Note: these need to be run 1 at a time due to numerical precision, + # since the expected strs were generated this way. + for prompt in formatted_prompts: + outputs = model.generate(prompt, params) + generations.append(outputs[0].outputs[0].text) + del model + + expected_strs = EXPECTED_STRS_MAP[model_name] + for i in range(len(example_prompts)): + generated_str = generations[i] + expected_str = expected_strs[i] + assert expected_str == generated_str, ( + f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}") \ No newline at end of file From c7d6dd665b5f5aab20b7d03bb627632783576d83 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sat, 27 Apr 2024 16:45:27 +0000 Subject: [PATCH 74/90] ./format.sh --- tests/models/test_fp8.py | 67 ++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 17 deletions(-) diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index c8e8dade2d06e..14abe8576f6ff 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -1,10 +1,12 @@ -"""Compares the outputs of gptq vs gptq_marlin +# flake8: noqa +"""Tests fp8 models against ground truth generation """ import os import pytest import torch from transformers import AutoTokenizer + from vllm import LLM, SamplingParams from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS @@ -19,15 +21,43 @@ ] EXPECTED_STRS_MAP = { - "nm-testing/mistral-fp8-static" : [' VLLM (Vulcan Learning Machine) is a high-performance and memory-efficient', ' 1. 
1950s: The Concept of AI is Born: The term', ' Artificial Intelligence (AI) and Human Intelligence (HI) are two distinct ways of processing information.', " A neural network is a type of artificial intelligence model inspired by the human brain's structure and function", ' In the heart of a sprawling industrial city, nestled among the hum of machinery and the rhythm', ' The COVID-19 pandemic has had a profound impact on global economic structures and has forced businesses to', ' The Mona Lisa painting, created by the Italian artist Leonardo da Vinci between 15', ' Japanese: 早く起きる'], # noqa: E501 - "nm-testing/mistral-fp8-dynamic": [' VLLM (Vulcan Learning Machine) is a high-performance and memory-efficient', ' 1. 1950s: The Concept of AI is Born: The term', ' Artificial Intelligence (AI) and Human Intelligence (HI) are two distinct ways of processing information.', " A neural network is a type of artificial intelligence model inspired by the human brain's structure and function", ' In the heart of the bustling city of Neo-Tokyo, nestled among the tow', ' The COVID-19 pandemic has had a profound impact on global economic structures and has forced businesses to', ' The Mona Lisa painting, created by the Italian artist Leonardo da Vinci between 15', ' Japanese: 早く起きる鳥は虫を取る (S'], # noqa: E501 - "mistralai/Mistral-7B-Instruct-v0.2": [' VLLM (Vulcan Learning Machine) is a high-performance and memory-efficient', ' 1. 1950s: The Concept of AI is Born: The term', ' Artificial Intelligence (AI) and Human Intelligence (HI) are two distinct ways of processing information.', " A neural network is a type of machine learning model inspired by the human brain's structure and function", ' In the heart of the bustling city of Neo-Tokyo, nestled among the tow', ' The COVID-19 pandemic has had a profound impact on global economic structures and has forced businesses to', ' The Mona Lisa painting, created by the Italian artist Leonardo da Vinci between 15', ' Japanese: 早く起きる鳥は虫を取る (S'], # noqa: E501 + "nm-testing/mistral-fp8-static": [ + ' VLLM (Vulcan Learning Machine) is a high-performance and memory-efficient', + ' 1. 1950s: The Concept of AI is Born: The term', + ' Artificial Intelligence (AI) and Human Intelligence (HI) are two distinct ways of processing information.', + " A neural network is a type of artificial intelligence model inspired by the human brain's structure and function", + ' In the heart of a sprawling industrial city, nestled among the hum of machinery and the rhythm', + ' The COVID-19 pandemic has had a profound impact on global economic structures and has forced businesses to', + ' The Mona Lisa painting, created by the Italian artist Leonardo da Vinci between 15', + ' Japanese: 早く起きる' + ], + "nm-testing/mistral-fp8-dynamic": [ + ' VLLM (Vulcan Learning Machine) is a high-performance and memory-efficient', + ' 1. 
1950s: The Concept of AI is Born: The term', + ' Artificial Intelligence (AI) and Human Intelligence (HI) are two distinct ways of processing information.', + " A neural network is a type of artificial intelligence model inspired by the human brain's structure and function", + ' In the heart of the bustling city of Neo-Tokyo, nestled among the tow', + ' The COVID-19 pandemic has had a profound impact on global economic structures and has forced businesses to', + ' The Mona Lisa painting, created by the Italian artist Leonardo da Vinci between 15', + ' Japanese: 早く起きる鳥は虫を取る (S' + ], + "mistralai/Mistral-7B-Instruct-v0.2": [ + ' VLLM (Vulcan Learning Machine) is a high-performance and memory-efficient', + ' 1. 1950s: The Concept of AI is Born: The term', + ' Artificial Intelligence (AI) and Human Intelligence (HI) are two distinct ways of processing information.', + " A neural network is a type of machine learning model inspired by the human brain's structure and function", + ' In the heart of the bustling city of Neo-Tokyo, nestled among the tow', + ' The COVID-19 pandemic has had a profound impact on global economic structures and has forced businesses to', + ' The Mona Lisa painting, created by the Italian artist Leonardo da Vinci between 15', + ' Japanese: 早く起きる鳥は虫を取る (S' + ], } capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] -fp8_not_supported = ( - capability < QUANTIZATION_METHODS["fp8"].get_min_capability()) +fp8_not_supported = (capability < + QUANTIZATION_METHODS["fp8"].get_min_capability()) + @pytest.mark.skipif(fp8_not_supported, reason="fp8 is not supported on this GPU type.") @@ -36,18 +66,21 @@ def test_models( example_prompts, model_name, ) -> None: - model = LLM( - model=model_name, - max_model_len=MAX_MODEL_LEN, - enforce_eager=True, - quantization="fp8") + model = LLM(model=model_name, + max_model_len=MAX_MODEL_LEN, + enforce_eager=True, + quantization="fp8") tokenizer = AutoTokenizer.from_pretrained(model_name) formatted_prompts = [ - tokenizer.apply_chat_template( - [{ "role": "user", "content": prompt }], - tokenize=False, add_generation_prompt=True - ) for prompt in example_prompts ] + tokenizer.apply_chat_template([{ + "role": "user", + "content": prompt + }], + tokenize=False, + add_generation_prompt=True) + for prompt in example_prompts + ] params = SamplingParams(max_tokens=20, temperature=0) generations = [] @@ -57,10 +90,10 @@ def test_models( outputs = model.generate(prompt, params) generations.append(outputs[0].outputs[0].text) del model - + expected_strs = EXPECTED_STRS_MAP[model_name] for i in range(len(example_prompts)): generated_str = generations[i] expected_str = expected_strs[i] assert expected_str == generated_str, ( - f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}") \ No newline at end of file + f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}") From 50b58238d883ab7c99b0022a99410dc1a681671c Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sat, 27 Apr 2024 16:47:40 +0000 Subject: [PATCH 75/90] fixed typing --- vllm/model_executor/layers/quantization/fp8.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index d651216b14fb0..a0479e4e034b3 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -5,9 +5,8 @@ from torch.nn.parameter import Parameter from vllm import _custom_ops as ops -from 
vllm.model_executor.layers.linear import LinearBase -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig, QuantizeMethodBase) +from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.utils import set_weight_attrs ACTIVATION_SCHEMES = ["static", "dynamic"] @@ -61,7 +60,7 @@ def get_scaled_act_names(self) -> List[str]: return [] -class Fp8LinearMethod(QuantizeMethodBase): +class Fp8LinearMethod(LinearMethodBase): """Linear method for FP8. Supports loading FP8 checkpoints with static weight scale and dynamic/scale activation scale. From 4156ca9bca702a3b03f884fa9e067a1d5005377b Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sat, 27 Apr 2024 16:48:40 +0000 Subject: [PATCH 76/90] fixed typing --- vllm/model_executor/layers/quantization/fp8.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index a0479e4e034b3..ff1a8cf601be9 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -155,10 +155,6 @@ def scales_shard_indexer( return param[shard_id], loaded_weight def process_weights_after_loading(self, layer: Module) -> None: - # Although the quant_method is propagated to all layers, - # only linear layers invoke "create_weights". So we check - # whether "weight_scale" is registered to determine - # whether the layer is a linear layer that requires quantization. if (not hasattr(layer, "process_after_load") or not layer.process_after_load): return From 3148fc96c797e76ee7e23b209b6bfb60d15aab10 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sat, 27 Apr 2024 17:03:00 +0000 Subject: [PATCH 77/90] added warning format --- vllm/model_executor/layers/quantization/fp8.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index ff1a8cf601be9..79c68f7b8c516 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -5,12 +5,16 @@ from torch.nn.parameter import Parameter from vllm import _custom_ops as ops +from vllm.logger import init_logger from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) from vllm.model_executor.utils import set_weight_attrs ACTIVATION_SCHEMES = ["static", "dynamic"] +logger = init_logger(__name__) + class Fp8Config(QuantizationConfig): """Config class for FP8.""" @@ -21,6 +25,10 @@ def __init__( activation_scheme: str = "dynamic", ) -> None: self.is_serialized = is_serialized + if is_serialized: + logger.warning( + "Detected fp8 checkpoint. 
Please note that the " + "format is experimental and subject to change.") assert activation_scheme in ACTIVATION_SCHEMES self.activation_scheme = activation_scheme From 7846d6731c8b56023fe298888a60b9357960b6e2 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Sat, 27 Apr 2024 13:03:41 -0400 Subject: [PATCH 78/90] Update opt.py re-added opt change, will fix in another PR --- vllm/model_executor/models/opt.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 336f765ababaa..838a2f0adc4d1 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -130,6 +130,7 @@ def __init__( bias=config.enable_bias, quant_config=quant_config, ) + quant_config = getattr(quant_config, "quant_config", None) self.activation_fn = get_act_fn(config.activation_function, quant_config, config.ffn_dim) self.fc2 = RowParallelLinear( From ba408c668bafb8bc8b30afd9963b0abcb707620e Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sat, 27 Apr 2024 17:05:29 +0000 Subject: [PATCH 79/90] formatted --- vllm/model_executor/layers/quantization/fp8.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 79c68f7b8c516..ca58cacfa722d 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -26,9 +26,8 @@ def __init__( ) -> None: self.is_serialized = is_serialized if is_serialized: - logger.warning( - "Detected fp8 checkpoint. Please note that the " - "format is experimental and subject to change.") + logger.warning("Detected fp8 checkpoint. Please note that the " + "format is experimental and subject to change.") assert activation_scheme in ACTIVATION_SCHEMES self.activation_scheme = activation_scheme From 04617fd7f6035d1961e9fabbbaea1965a068dc80 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Sat, 27 Apr 2024 18:44:52 -0400 Subject: [PATCH 80/90] Update vllm/model_executor/layers/quantization/fp8.py Co-authored-by: Michael Goin --- vllm/model_executor/layers/quantization/fp8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index ca58cacfa722d..4f9e213899506 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -70,7 +70,7 @@ def get_scaled_act_names(self) -> List[str]: class Fp8LinearMethod(LinearMethodBase): """Linear method for FP8. Supports loading FP8 checkpoints with static weight scale and - dynamic/scale activation scale. + dynamic/static activation scale. Also supports loading quantized FP16/BF16 model checkpoints with dynamic activation scaling. 
The weight scaling factor will be initialized after From cc3d3955c88c06fb3d6f42e20184b785b5e505de Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Sat, 27 Apr 2024 18:46:45 -0400 Subject: [PATCH 81/90] Update vllm/model_executor/layers/quantization/fp8.py Co-authored-by: Michael Goin --- vllm/model_executor/layers/quantization/fp8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 4f9e213899506..086c34b9689da 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -228,7 +228,7 @@ def apply(self, ) # TODO: replace naive loop with cutlass gemm_dq w/ epilogue fusion. - # Case 2: We have N weigth_scales for N logical weights. + # Case 2: We have N weight_scales for N logical weights. else: output = torch.empty(x.shape[0], layer.weight.shape[1], From f556016cf7391072627fd315f6d82f4ab6bc1db5 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Sun, 28 Apr 2024 13:24:17 -0400 Subject: [PATCH 82/90] auto detect shared scale (#214) --- vllm/model_executor/layers/quantization/fp8.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 086c34b9689da..e5f5a78696cff 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -166,7 +166,7 @@ def process_weights_after_loading(self, layer: Module) -> None: or not layer.process_after_load): return - # If checkpoint is fp1616 (not serialized fp8), quantize the weights. + # If checkpoint is fp/bf16 (not serialized fp8), quantize the weights. if not self.quant_config.is_serialized: qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None) @@ -186,8 +186,8 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.weight = Parameter(weight.t(), requires_grad=False) # WEIGHT_SCALE - # If we only have one logical shard, avoid the loop in apply(). - if len(layer.logical_widths) == 1: + # If all weight_scales are equal, use a single scale to avoid naive loop. + if all_close_1d(layer.weight_scale): layer.weight_scale = Parameter(layer.weight_scale.max(), requires_grad=False) layer.logical_widths = None @@ -217,7 +217,7 @@ def apply(self, # If static, layer.act_scale is scalar and x_scale set to act_scale. qinput, x_scale = ops.scaled_fp8_quant(x, layer.act_scale) - # Case 1: we have one single scale for N logical weights. + # Case 1: we have 1 weight_scale for N logical weights. 
if layer.logical_widths is None: output, _ = torch._scaled_mm( qinput, From 30bfbd81e1e907b58dadf6c56fa525d50ab4f07e Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 28 Apr 2024 18:17:18 +0000 Subject: [PATCH 83/90] ./format.sh --- vllm/model_executor/layers/quantization/fp8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index e5f5a78696cff..554e8fe1546e2 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -186,7 +186,7 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.weight = Parameter(weight.t(), requires_grad=False) # WEIGHT_SCALE - # If all weight_scales are equal, use a single scale to avoid naive loop. + # If all weight_scales are equal, use a single scale. if all_close_1d(layer.weight_scale): layer.weight_scale = Parameter(layer.weight_scale.max(), requires_grad=False) From 572107ab92f3b5dce613ce1d69ecf8fd910f6165 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Sun, 28 Apr 2024 21:07:18 -0400 Subject: [PATCH 84/90] Update vllm/model_executor/layers/quantization/fp8.py Co-authored-by: Cody Yu --- vllm/model_executor/layers/quantization/fp8.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 554e8fe1546e2..eda17ab316322 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -28,7 +28,8 @@ def __init__( if is_serialized: logger.warning("Detected fp8 checkpoint. Please note that the " "format is experimental and subject to change.") - assert activation_scheme in ACTIVATION_SCHEMES + if activation_scheme not in ACTIVATION_SCHEMES: + raise ValueError(f"Unsupported activation scheme {activation_scheme}") self.activation_scheme = activation_scheme @classmethod From 41fbde92035f7cab6369f09081132da40d0c9bf5 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Mon, 29 Apr 2024 02:08:59 +0000 Subject: [PATCH 85/90] ./format.sh --- vllm/model_executor/layers/quantization/fp8.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index eda17ab316322..cdcbab1a515f3 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -29,7 +29,8 @@ def __init__( logger.warning("Detected fp8 checkpoint. 
Please note that the " "format is experimental and subject to change.") if activation_scheme not in ACTIVATION_SCHEMES: - raise ValueError(f"Unsupported activation scheme {activation_scheme}") + raise ValueError( + f"Unsupported activation scheme {activation_scheme}") self.activation_scheme = activation_scheme @classmethod From f2cd5617e0c8144c75f8039f677910296dc77416 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Mon, 29 Apr 2024 02:19:46 +0000 Subject: [PATCH 86/90] addressed cody's comments + format --- vllm/model_executor/layers/linear.py | 47 ++++++++++--------- .../model_executor/layers/quantization/fp8.py | 30 ++++++------ 2 files changed, 40 insertions(+), 37 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 1b8171ab3ea41..6a732f30bfab9 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -241,7 +241,8 @@ def __init__( def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): # Special case for Fp8 scales. - shard_indexer = getattr(param, "shard_indexer", None) + fp8_scales_shard_indexer = getattr(param, "fp8_scales_shard_indexer", + None) tp_rank = get_tensor_model_parallel_rank() output_dim = getattr(param, "output_dim", None) @@ -252,10 +253,10 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) # Special case for Fp8 scales. - elif shard_indexer is not None: - param_data, loaded_weight = shard_indexer(param_data, - loaded_weight, - shard_id=0) + elif fp8_scales_shard_indexer is not None: + param_data, loaded_weight = fp8_scales_shard_indexer(param_data, + loaded_weight, + shard_id=0) assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) @@ -322,7 +323,8 @@ def weight_loader(self, # Special case for AQLM codebooks. is_metadata = getattr(param, "is_metadata", False) # Special case for Fp8 scales. - shard_indexer = getattr(param, "shard_indexer", None) + fp8_scales_shard_indexer = getattr(param, "fp8_scales_shard_indexer", + None) if loaded_shard_id is None: # Loaded weight is already packed. @@ -380,11 +382,10 @@ def weight_loader(self, shard_size = loaded_weight.shape[0] shard_offset = loaded_shard_id * shard_size param_data = param_data.narrow(0, shard_offset, shard_size) - # Special case sharding for Fp8 scales. - elif shard_indexer is not None: - param_data, loaded_weight = shard_indexer(param_data, - loaded_weight, - loaded_shard_id) + # Special case for Fp8 scales. + elif fp8_scales_shard_indexer is not None: + param_data, loaded_weight = fp8_scales_shard_indexer( + param_data, loaded_weight, loaded_shard_id) else: ignore_warning = getattr(param, "ignore_warning", False) @@ -469,7 +470,8 @@ def weight_loader(self, # Special case for AQLM codebooks. is_metadata = getattr(param, "is_metadata", False) # Special case for Fp8 scales. - shard_indexer = getattr(param, "shard_indexer", None) + fp8_scales_shard_indexer = getattr(param, "fp8_scales_shard_indexer", + None) if loaded_shard_id is None: # Loaded weight is already packed. @@ -544,12 +546,10 @@ def weight_loader(self, shard_index = ["q", "k", "v"].index(loaded_shard_id) param_data = param_data.narrow(0, shard_index * shard_size, shard_size) - # Special case for for Fp8 scales. - elif shard_indexer is not None: - - param_data, loaded_weight = shard_indexer(param_data, - loaded_weight, - loaded_shard_id) + # Special case for Fp8 scales. 
+ elif fp8_scales_shard_indexer is not None: + param_data, loaded_weight = fp8_scales_shard_indexer( + param_data, loaded_weight, loaded_shard_id) else: ignore_warning = getattr(param, "ignore_warning", False) if not ignore_warning: @@ -631,7 +631,8 @@ def __init__( def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): # Special case for Fp8 scales. - shard_indexer = getattr(param, "shard_indexer", None) + fp8_scales_shard_indexer = getattr(param, "fp8_scales_shard_indexer", + None) tp_rank = get_tensor_model_parallel_rank() input_dim = getattr(param, "input_dim", None) @@ -642,10 +643,10 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size) # Special case for Fp8 scales. - elif shard_indexer is not None: - param_data, loaded_weight = shard_indexer(param_data, - loaded_weight, - shard_id=0) + elif fp8_scales_shard_indexer is not None: + param_data, loaded_weight = fp8_scales_shard_indexer(param_data, + loaded_weight, + shard_id=0) assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index cdcbab1a515f3..c16fd3acea004 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -21,11 +21,11 @@ class Fp8Config(QuantizationConfig): def __init__( self, - is_serialized: bool = False, + is_checkpoint_fp8_serialized: bool = False, activation_scheme: str = "dynamic", ) -> None: - self.is_serialized = is_serialized - if is_serialized: + self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized + if is_checkpoint_fp8_serialized: logger.warning("Detected fp8 checkpoint. Please note that the " "format is experimental and subject to change.") if activation_scheme not in ACTIVATION_SCHEMES: @@ -55,9 +55,9 @@ def get_config_filenames(cls) -> List[str]: @classmethod def from_config(cls, config: Dict[str, Any]) -> "Fp8Config": quant_method = cls.get_from_keys(config, ["quant_method"]) - is_serialized = ("fp8" in quant_method) + is_checkpoint_fp8_serialized = ("fp8" in quant_method) activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) - return cls(is_serialized=is_serialized, + return cls(is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized, activation_scheme=activation_scheme) def get_quant_method(self, layer: torch.nn.Module) -> "Fp8LinearMethod": @@ -108,7 +108,8 @@ def create_weights( # WEIGHT weight_dtype = (torch.float8_e4m3fn - if self.quant_config.is_serialized else params_dtype) + if self.quant_config.is_checkpoint_fp8_serialized else + params_dtype) weight = Parameter(torch.empty(output_size_per_partition, input_size_per_partition, dtype=weight_dtype), @@ -122,17 +123,18 @@ def create_weights( # If checkpoint is serialized fp8, load them. # Otherwise, wait until process_weights_after_loading. 
- if self.quant_config.is_serialized: + if self.quant_config.is_checkpoint_fp8_serialized: # WEIGHT SCALE weight_scale = Parameter(torch.empty(len(output_partition_sizes), dtype=torch.float32), requires_grad=False) layer.register_parameter("weight_scale", weight_scale) - set_weight_attrs(weight_scale, { - **extra_weight_attrs, - "shard_indexer": - self.scales_shard_indexer, - }) + set_weight_attrs( + weight_scale, { + **extra_weight_attrs, + "fp8_scales_shard_indexer": + self.scales_shard_indexer, + }) # ACTIVATION SCALE if self.quant_config.activation_scheme == "static": @@ -143,7 +145,7 @@ def create_weights( set_weight_attrs( act_scale, { **extra_weight_attrs, - "shard_indexer": + "fp8_scales_shard_indexer": self.scales_shard_indexer, }) @@ -169,7 +171,7 @@ def process_weights_after_loading(self, layer: Module) -> None: return # If checkpoint is fp/bf16 (not serialized fp8), quantize the weights. - if not self.quant_config.is_serialized: + if not self.quant_config.is_checkpoint_fp8_serialized: qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None) layer.weight = Parameter(qweight.t(), requires_grad=False) From 125266e10a5a17a5079a839fc64b6c0b883c7cf9 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Mon, 29 Apr 2024 02:47:16 +0000 Subject: [PATCH 87/90] make mypy happy --- vllm/model_executor/layers/quantization/fp8.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index c16fd3acea004..5952f382f8e8f 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -60,7 +60,8 @@ def from_config(cls, config: Dict[str, Any]) -> "Fp8Config": return cls(is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized, activation_scheme=activation_scheme) - def get_quant_method(self, layer: torch.nn.Module) -> "Fp8LinearMethod": + def get_quant_method( + self, layer: torch.nn.Module) -> Optional["Fp8LinearMethod"]: if isinstance(layer, LinearBase): return Fp8LinearMethod(self) return None From 280a4d5e5dc29d5f8fc9b52f92227d835f76d3b8 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Tue, 30 Apr 2024 01:49:47 +0000 Subject: [PATCH 88/90] test --- tests/models/test_fp8.py | 16 +++--- .../model_executor/layers/quantization/fp8.py | 53 ++++++++++--------- 2 files changed, 38 insertions(+), 31 deletions(-) diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index 14abe8576f6ff..2ed08afc270a4 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -1,5 +1,6 @@ # flake8: noqa """Tests fp8 models against ground truth generation +Note: these tests will only pass on L4 GPU. """ import os @@ -22,27 +23,27 @@ EXPECTED_STRS_MAP = { "nm-testing/mistral-fp8-static": [ - ' VLLM (Vulcan Learning Machine) is a high-performance and memory-efficient', + ' VLLM (Vulcan Language Model) is an open-source inference and serving engine', ' 1. 
1950s: The Concept of AI is Born: The term', ' Artificial Intelligence (AI) and Human Intelligence (HI) are two distinct ways of processing information.', - " A neural network is a type of artificial intelligence model inspired by the human brain's structure and function", - ' In the heart of a sprawling industrial city, nestled among the hum of machinery and the rhythm', + ' A neural network is a type of machine learning model inspired by the structure and function of the human brain', + ' In the heart of the bustling city of Neo-Tokyo, nestled among the tow', ' The COVID-19 pandemic has had a profound impact on global economic structures and has forced businesses to', ' The Mona Lisa painting, created by the Italian artist Leonardo da Vinci between 15', ' Japanese: 早く起きる' ], "nm-testing/mistral-fp8-dynamic": [ - ' VLLM (Vulcan Learning Machine) is a high-performance and memory-efficient', + ' VLLM (Vulcan Language Model) is an open-source, high-throughput', ' 1. 1950s: The Concept of AI is Born: The term', ' Artificial Intelligence (AI) and Human Intelligence (HI) are two distinct ways of processing information.', - " A neural network is a type of artificial intelligence model inspired by the human brain's structure and function", - ' In the heart of the bustling city of Neo-Tokyo, nestled among the tow', + " A neural network is a type of machine learning model inspired by the human brain's structure and function", + ' Once upon a time, in the heart of a bustling city, there was a robot named B', ' The COVID-19 pandemic has had a profound impact on global economic structures and has forced businesses to', ' The Mona Lisa painting, created by the Italian artist Leonardo da Vinci between 15', ' Japanese: 早く起きる鳥は虫を取る (S' ], "mistralai/Mistral-7B-Instruct-v0.2": [ - ' VLLM (Vulcan Learning Machine) is a high-performance and memory-efficient', + ' VLLM (Vulcan Language Model) is an open-source, high-throughput', ' 1. 1950s: The Concept of AI is Born: The term', ' Artificial Intelligence (AI) and Human Intelligence (HI) are two distinct ways of processing information.', " A neural network is a type of machine learning model inspired by the human brain's structure and function", @@ -91,6 +92,7 @@ def test_models( generations.append(outputs[0].outputs[0].text) del model + print(generations) expected_strs = EXPECTED_STRS_MAP[model_name] for i in range(len(example_prompts)): generated_str = generations[i] diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 5952f382f8e8f..5a77193a03541 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -43,10 +43,7 @@ def get_supported_act_dtypes(cls) -> List[torch.dtype]: @classmethod def get_min_capability(cls) -> int: - # TODO: PyTorch 2.3.0+ is required to run FP8 on - # SM 89 (e.g. Ada) GPUs. 
Specifically, this PR has to - # be included: https://github.com/pytorch/pytorch/pull/118881 - return 90 + return 89 @classmethod def get_config_filenames(cls) -> List[str]: @@ -91,6 +88,24 @@ class Fp8LinearMethod(LinearMethodBase): def __init__(self, quant_config: Fp8Config): self.quant_config = quant_config + def _create_scale_param( + self, + scale_name: str, + layer: torch.nn.Module, + output_partition_sizes: List[int], + **extra_weight_attrs, + ) -> None: + scale = Parameter(torch.empty(len(output_partition_sizes), + dtype=torch.float32), + requires_grad=False) + layer.register_parameter(scale_name, scale) + set_weight_attrs( + scale, { + **extra_weight_attrs, + "fp8_scales_shard_indexer": + self.scales_shard_indexer, + }) + def create_weights( self, layer: torch.nn.Module, @@ -126,29 +141,19 @@ def create_weights( # Otherwise, wait until process_weights_after_loading. if self.quant_config.is_checkpoint_fp8_serialized: # WEIGHT SCALE - weight_scale = Parameter(torch.empty(len(output_partition_sizes), - dtype=torch.float32), - requires_grad=False) - layer.register_parameter("weight_scale", weight_scale) - set_weight_attrs( - weight_scale, { - **extra_weight_attrs, - "fp8_scales_shard_indexer": - self.scales_shard_indexer, - }) + self._create_scale_param( + scale_name="weight_scale", + layer=layer, + output_partition_sizes=output_partition_sizes, + **extra_weight_attrs) # ACTIVATION SCALE if self.quant_config.activation_scheme == "static": - act_scale = Parameter(torch.empty(len(output_partition_sizes), - dtype=torch.float32), - requires_grad=False) - layer.register_parameter("act_scale", act_scale) - set_weight_attrs( - act_scale, { - **extra_weight_attrs, - "fp8_scales_shard_indexer": - self.scales_shard_indexer, - }) + self._create_scale_param( + scale_name="act_scale", + layer=layer, + output_partition_sizes=output_partition_sizes, + **extra_weight_attrs) def scales_shard_indexer( self, param: torch.Tensor, loaded_weight: torch.Tensor, From 8e1ede1fc4625f9b35aa29ca21c5404c56987159 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Tue, 30 Apr 2024 20:58:08 +0000 Subject: [PATCH 89/90] cleaned up --- tests/models/test_fp8.py | 51 +++++------- .../model_executor/layers/quantization/fp8.py | 81 +++++++++---------- 2 files changed, 59 insertions(+), 73 deletions(-) diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index 2ed08afc270a4..e87a1783a83f1 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -16,41 +16,30 @@ MAX_MODEL_LEN = 1024 MODELS = [ - "nm-testing/mistral-fp8-static", - "nm-testing/mistral-fp8-dynamic", - "mistralai/Mistral-7B-Instruct-v0.2", + "nm-testing/Meta-Llama-3-8B-Instruct-FP8", + "meta-llama/Meta-Llama-3-8B-Instruct", ] EXPECTED_STRS_MAP = { - "nm-testing/mistral-fp8-static": [ - ' VLLM (Vulcan Language Model) is an open-source inference and serving engine', - ' 1. 
1950s: The Concept of AI is Born: The term', - ' Artificial Intelligence (AI) and Human Intelligence (HI) are two distinct ways of processing information.', - ' A neural network is a type of machine learning model inspired by the structure and function of the human brain', - ' In the heart of the bustling city of Neo-Tokyo, nestled among the tow', - ' The COVID-19 pandemic has had a profound impact on global economic structures and has forced businesses to', - ' The Mona Lisa painting, created by the Italian artist Leonardo da Vinci between 15', - ' Japanese: 早く起きる' + "nm-testing/Meta-Llama-3-8B-Instruct-FP8": [ + 'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (', + 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', + 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', + 'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne', + 'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep', + 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. Here', + 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', + 'Here are the translations:\n\n**Japanese:** (Haya tori, nemuri nemuri)\n\n**' ], - "nm-testing/mistral-fp8-dynamic": [ - ' VLLM (Vulcan Language Model) is an open-source, high-throughput', - ' 1. 1950s: The Concept of AI is Born: The term', - ' Artificial Intelligence (AI) and Human Intelligence (HI) are two distinct ways of processing information.', - " A neural network is a type of machine learning model inspired by the human brain's structure and function", - ' Once upon a time, in the heart of a bustling city, there was a robot named B', - ' The COVID-19 pandemic has had a profound impact on global economic structures and has forced businesses to', - ' The Mona Lisa painting, created by the Italian artist Leonardo da Vinci between 15', - ' Japanese: 早く起きる鳥は虫を取る (S' - ], - "mistralai/Mistral-7B-Instruct-v0.2": [ - ' VLLM (Vulcan Language Model) is an open-source, high-throughput', - ' 1. 1950s: The Concept of AI is Born: The term', - ' Artificial Intelligence (AI) and Human Intelligence (HI) are two distinct ways of processing information.', - " A neural network is a type of machine learning model inspired by the human brain's structure and function", - ' In the heart of the bustling city of Neo-Tokyo, nestled among the tow', - ' The COVID-19 pandemic has had a profound impact on global economic structures and has forced businesses to', - ' The Mona Lisa painting, created by the Italian artist Leonardo da Vinci between 15', - ' Japanese: 早く起きる鳥は虫を取る (S' + "meta-llama/Meta-Llama-3-8B-Instruct": [ + 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', + 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', + 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', + 'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne', + 'In the year 2154, the robotics lab at NeuroSpark Industries was on the cusp of', + 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. 
The', + 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', + 'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu' ], } diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 5a77193a03541..90bb98af9faf4 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -186,22 +186,28 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.act_scale = None return - # TODO: cutlass kernels will remove the need for much of this logic. - # If the checkpoint is serialized fp8, we already loaded quantized, - # so, just cleanup the Parameters for easier use in apply(). + # If checkpoint is fp8, requantize the separately quantized logical + # weights into a single fp8 weight with a single weight scale. else: + # WEIGHT_SCALE / WEIGHT + # Loop over logical weights, requantizing with single scale. + max_w_scale = layer.weight_scale.max() + start = 0 + for idx, logical_width in enumerate(layer.logical_widths): + end = start + logical_width + weight_dq = per_tensor_dequantize(layer.weight[start:end, :], + layer.weight_scale[idx]) + + layer.weight[start:end, :] = per_tensor_quantize( + weight_dq, layer.weight_scale.max()) + start = end + layer.weight_scale = Parameter(max_w_scale, requires_grad=False) + # WEIGHT # Transpose weight for passing to torch._scaled_mm weight = layer.weight layer.weight = Parameter(weight.t(), requires_grad=False) - # WEIGHT_SCALE - # If all weight_scales are equal, use a single scale. - if all_close_1d(layer.weight_scale): - layer.weight_scale = Parameter(layer.weight_scale.max(), - requires_grad=False) - layer.logical_widths = None - # ACT_SCALE # Dynamic: set to None (required input to ops.scaled_fp8_quant). # Static: set to max of the act_scales (since they are equal). @@ -227,37 +233,14 @@ def apply(self, # If static, layer.act_scale is scalar and x_scale set to act_scale. qinput, x_scale = ops.scaled_fp8_quant(x, layer.act_scale) - # Case 1: we have 1 weight_scale for N logical weights. - if layer.logical_widths is None: - output, _ = torch._scaled_mm( - qinput, - layer.weight, - out_dtype=x.dtype, - scale_a=x_scale, - scale_b=layer.weight_scale, - ) - - # TODO: replace naive loop with cutlass gemm_dq w/ epilogue fusion. - # Case 2: We have N weight_scales for N logical weights. - else: - output = torch.empty(x.shape[0], - layer.weight.shape[1], - dtype=x.dtype, - device="cuda") - start = 0 - # Loop over the N logical shards. 
- for logical_width, w_scale in zip(layer.logical_widths, - layer.weight_scale): - end = start + logical_width - out, _ = torch._scaled_mm( - qinput, - layer.weight[:, start:end], - out_dtype=x.dtype, - scale_a=x_scale, - scale_b=w_scale, - ) - output[:, start:end] = out - start = end + # Fused GEMM_DQ + output, _ = torch._scaled_mm( + qinput, + layer.weight, + out_dtype=x.dtype, + scale_a=x_scale, + scale_b=layer.weight_scale, + ) if bias is not None: output.add_(bias) @@ -265,6 +248,20 @@ def apply(self, return output -def all_close_1d(x: torch.Tensor): +def all_close_1d(x: torch.Tensor) -> bool: assert len(x.shape) == 1 return all(torch.allclose(x[0], x[i]) for i in range(x.shape[0])) + + +def per_tensor_quantize(tensor: torch.Tensor, + inv_scale: float) -> torch.Tensor: + finfo = torch.finfo(torch.float8_e4m3fn) + qweight = (tensor / inv_scale).clamp(min=finfo.min, max=finfo.max) + return qweight.to(torch.float8_e4m3fn) + + +def per_tensor_dequantize(tensor: torch.Tensor, + inv_scale: float) -> torch.Tensor: + fake_qweight = tensor.to(torch.float16) + dq_weight = fake_qweight * inv_scale + return dq_weight From d067428c60bf6a1b7cca70f1d6672ebff25135b9 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Tue, 30 Apr 2024 17:08:13 -0400 Subject: [PATCH 90/90] Update vllm/model_executor/layers/quantization/fp8.py Co-authored-by: Michael Goin --- vllm/model_executor/layers/quantization/fp8.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 90bb98af9faf4..b57e1dde81a5f 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -240,11 +240,9 @@ def apply(self, out_dtype=x.dtype, scale_a=x_scale, scale_b=layer.weight_scale, + bias=bias, ) - if bias is not None: - output.add_(bias) - return output
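
Taken together, patches 86-90 leave serialized-fp8 linear layers with a
three-step flow: load each shard's scalar scale through the
"fp8_scales_shard_indexer" hook, requantize the logical shards to one shared
weight scale after loading, then run a single fused GEMM in apply(). The
sketches below are illustrative Python, not part of the diffs.

The body of scales_shard_indexer is not shown in these hunks; a plausible
implementation, assuming integer shard ids for merged column-parallel layers
and "q"/"k"/"v" strings for fused QKV, is:

    from typing import Tuple, Union

    import torch

    def scales_shard_indexer(
            param_data: torch.Tensor, loaded_weight: torch.Tensor,
            shard_id: Union[str, int]) -> Tuple[torch.Tensor, torch.Tensor]:
        # Map a loaded scalar scale to its slot in the per-shard scale tensor.
        qkv_idxs = {"q": 0, "k": 1, "v": 2}
        if isinstance(shard_id, str):
            shard_id = qkv_idxs[shard_id]
        # One scalar scale per logical weight: select that element so the
        # caller's param_data.copy_(loaded_weight) fills exactly one slot.
        return param_data[shard_id], loaded_weight

This matches how patch 86 wires the special cases in linear.py: the unmerged
Column/RowParallelLinear loaders call the indexer with shard_id=0, the merged
column-parallel loader passes the integer loaded_shard_id, and the QKV loader
passes the "q"/"k"/"v" string.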
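
Patch 89 then drops the naive per-shard GEMM loop: inside
process_weights_after_loading it dequantizes each logical shard with its own
scale and requantizes everything against the single max scale. A minimal
standalone sketch of that step (the helper name requantize_to_shared_scale is
hypothetical; the patch inlines this logic via per_tensor_dequantize and
per_tensor_quantize):

    from typing import List, Tuple

    import torch

    FP8_DTYPE = torch.float8_e4m3fn

    def requantize_to_shared_scale(
            weight: torch.Tensor,        # (sum(logical_widths), K), fp8
            weight_scale: torch.Tensor,  # (num_shards,), float32
            logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]:
        finfo = torch.finfo(FP8_DTYPE)
        max_w_scale = weight_scale.max()
        start = 0
        for idx, logical_width in enumerate(logical_widths):
            end = start + logical_width
            # Dequantize shard idx with its own scale...
            weight_dq = weight[start:end, :].to(
                torch.float16) * weight_scale[idx]
            # ...and requantize it with the shared max scale.
            weight[start:end, :] = (weight_dq / max_w_scale).clamp(
                min=finfo.min, max=finfo.max).to(FP8_DTYPE)
            start = end
        return weight, max_w_scale

Requantizing against the max scale gives up a little precision on the shards
whose original scales were smaller, but it collapses N weight scales into one,
which is what lets apply() issue a single fused GEMM instead of N narrow ones.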
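
Finally, patch 90 folds the bias into the torch._scaled_mm epilogue, so the
hot path is one quantize plus one fused GEMM. Sketched end to end in plain
torch (the inline activation quantization stands in for vLLM's
ops.scaled_fp8_quant kernel, and torch._scaled_mm is a private API whose
tuple return and keyword names here match the PyTorch versions these patches
target):

    from typing import Optional

    import torch

    def fp8_linear_apply(
            x: torch.Tensor,             # (M, K), fp16/bf16
            weight_t: torch.Tensor,      # (K, N) fp8 view, i.e. weight.t()
            weight_scale: torch.Tensor,  # scalar, float32
            act_scale: Optional[torch.Tensor] = None,  # scalar or None
            bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        finfo = torch.finfo(torch.float8_e4m3fn)
        if act_scale is None:
            # Dynamic scheme: derive the activation scale from this batch.
            act_scale = x.abs().max().to(torch.float32) / finfo.max
        qinput = (x / act_scale).clamp(
            min=finfo.min, max=finfo.max).to(torch.float8_e4m3fn)
        # Fused GEMM + dequant, with the bias applied in the epilogue.
        output, _ = torch._scaled_mm(qinput, weight_t,
                                     out_dtype=x.dtype,
                                     scale_a=act_scale,
                                     scale_b=weight_scale,
                                     bias=bias)
        return output

Passing the transposed weight matters here: _scaled_mm expects a row-major by
column-major product, which is why process_weights_after_loading stores the
weight.t() view rather than a contiguous transpose.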