Merge branch 'pytorch:main' into Arm-backend-Updated-toolchain-to-arm-gnu-toolchain-13.3.rel1
pytorch · Nov 25, 2024 · a0787ba · a0787ba
2 parents 44e0923 + fbee0c8
commit a0787ba
Show file tree

Hide file tree

Showing 91 changed files with 2,334 additions and 907 deletions.
diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh
@@ -27,6 +27,10 @@ while [[ $# -gt 0 ]]; do
       MODE="$2" # portable or xnnpack+custom or xnnpack+custom+qe
       shift 2
       ;;
+    -pt2e_quantize)
+      PT2E_QUANTIZE="$2"
+      shift 2
+      ;;
     -upload)
       UPLOAD_DIR="$2"
       shift 2
@@ -234,6 +238,10 @@ if [[ "${COREML}" == "ON" ]]; then
 fi
 if [[ "${QNN}" == "ON" ]]; then
   EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape"
+  echo "PT2E_QUANTIZE is ${PT2E_QUANTIZE}"
+  if [[ "${PT2E_QUANTIZE}" == "qnn_16a16w" ]]; then
+    EXPORT_ARGS+=" --tokenizer_path tokenizer.model --pt2e_quantize qnn_16a16w --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --calibration_data Once "
+  fi
 fi
 # Add dynamically linked library location
 $PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS}

diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
@@ -441,3 +441,39 @@ jobs:
 
         cmake-out/examples/models/llama/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is"
         echo "::endgroup::"
+
+
+  test-llama-runner-qnn-linux:
+    name: test-llama-runner-qnn-linux
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    strategy:
+      matrix:
+        dtype: [fp32]
+        pt2e_quantize: [qnn_16a16w, qnn_8a8w]
+        mode: [qnn]
+      fail-fast: false
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-qnn-sdk
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 900
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        BUILD_TOOL="cmake"
+        DTYPE=${{ matrix.dtype }}
+        MODE=${{ matrix.mode }}
+        PT2E_QUANTIZE=${{ matrix.pt2e_quantize }}
+
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
+        PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
+
+        # Setup executorch
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
+        # Install requirements for export_llama
+        PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
+        # Test llama2
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -283,10 +283,15 @@ for basics.
    - If the reviewers have requests or questions, follow up with them.
    - The goal of the reviewer is to ensure that the code in the `main` branch of
      the repo is consistent, maintainable, and of high quality.
-1. Once approved, your reviewer will import the PR into Meta's internal system
-   and merge it from there.
-   - If the PR is approved and not merged within a few business days, please
-     comment on the PR to ask about its status.
+1. Once the PR has been approved,
+   - If you have the "write permission" in this repo, you can merge it yourself
+     by clicking the "Squash and merge" button once it is green and all CI
+     signals are passing.
+   - If you don't have "write permission" in this repo, the reviewer will take
+     care of the PR. The reviewer may import the PR into Meta's internal system
+     to validate it against internal CI.
+   - If the PR is approved but not merged within 5 business days, please comment
+     on the PR to ask about its status.
    - Note that if the `main` [CI](#continuous-integration) jobs are broken, we
      will only merge PRs that fix the broken jobs until all critical jobs are
      fixed.

diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelCompiler.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModelCompiler.mm
@@ -26,25 +26,38 @@ + (nullable NSURL *)compileModelAtURL:(NSURL *)modelURL
 #else
     __block NSError *localError = nil;
     __block NSURL *result = nil;
-
-    dispatch_semaphore_t sema = dispatch_semaphore_create(0);
-    [MLModel compileModelAtURL:modelURL completionHandler:^(NSURL * _Nullable tempURL, NSError * _Nullable compilationError) {
-        result = [tempURL copy];
-        localError = compilationError;
-        dispatch_semaphore_signal(sema);
-    }];
-
-    long status = dispatch_semaphore_wait(sema, dispatch_time(DISPATCH_TIME_NOW, (int64_t)(maxWaitTimeInSeconds * NSEC_PER_SEC)));
-    if (status != 0) {
+
+    if (@available(iOS 16, macOS 13, watchOS 9, tvOS 16, *)) {
+        dispatch_semaphore_t sema = dispatch_semaphore_create(0);
+        [MLModel compileModelAtURL:modelURL completionHandler:^(NSURL * _Nullable tempURL, NSError * _Nullable compilationError) {
+            result = [tempURL copy];
+            localError = compilationError;
+            dispatch_semaphore_signal(sema);
+        }];
+
+        long status = dispatch_semaphore_wait(sema, dispatch_time(DISPATCH_TIME_NOW, (int64_t)(maxWaitTimeInSeconds * NSEC_PER_SEC)));
+        if (status != 0) {
+            ETCoreMLLogErrorAndSetNSError(error,
+                                        ETCoreMLErrorCompilationFailed,
+                                        "%@: Failed to compile model in %f seconds.",
+                                        NSStringFromClass(ETCoreMLModelCompiler.class),
+                                        maxWaitTimeInSeconds);
+            return nil;
+        }
+    } else {
+        result = [MLModel compileModelAtURL:modelURL error:&localError];
+    }
+
+    if (localError) {
         ETCoreMLLogErrorAndSetNSError(error,
-                                      ETCoreMLErrorCompilationFailed,
-                                      "%@: Failed to compile model in %f seconds.",
-                                      NSStringFromClass(ETCoreMLModelCompiler.class),
-                                      maxWaitTimeInSeconds);
+                                    ETCoreMLErrorCompilationFailed,
+                                    "%@: Failed to compile model, error: %@",
+                                    NSStringFromClass(ETCoreMLModelCompiler.class),
+                                    localError);
         return nil;
+    } else {
+        return result;
     }
-
-    return result;
 #endif
 }
 

diff --git a/backends/apple/coreml/scripts/install_requirements.sh b/backends/apple/coreml/scripts/install_requirements.sh
@@ -24,7 +24,7 @@ rm -rf "$COREML_DIR_PATH/third-party"
 mkdir "$COREML_DIR_PATH/third-party"
 
 echo "${green}ExecuTorch: Cloning coremltools."
-git clone --depth 1 --branch 8.0 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH
+git clone --depth 1 --branch 8.1 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH
 cd $COREMLTOOLS_DIR_PATH
 
 STATUS=$?

diff --git a/backends/apple/coreml/test/test_coreml_partitioner.py b/backends/apple/coreml/test/test_coreml_partitioner.py
@@ -71,23 +71,15 @@ def test_vit_skip_conv(self):
             )
         )
 
-        conv_block = ["aten.convolution.default", "executorch_call_delegate"]
-        safe_softmax_block = [
-            "getitem",
-            "getitem",
-            "getitem",
-            "getitem",
-            "aten.any.dim",
-            "executorch_call_delegate",
-        ]
-        final_block = ["getitem"]
-        total = conv_block + 12 * safe_softmax_block + final_block
-
         assert [
             node.target.__name__
             for node in delegated_program_manager.exported_program().graph.nodes
             if node.op == "call_function"
-        ] == total
+        ] == [
+            "aten.convolution.default",
+            "executorch_call_delegate",
+            "getitem",
+        ]
 
     def test_buffer(self):
         embedding_dim = 3

diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py
@@ -52,6 +52,7 @@ def __init__(self):
         self.permute_nhwc = False
         self.quantize_io = False
         self.tosa_version = None
+        self.input_order = None
 
     def ethosu_compile_spec(
         self,
@@ -89,7 +90,7 @@ def ethosu_compile_spec(
             self.compiler_flags.append(extra_flags)
 
         base_tosa_version = "TOSA-0.80.0+BI"
-        if "U55" in config:
+        if "u55" in config:
             # Add the Ethos-U55 extension marker
             base_tosa_version += "+u55"
         self.tosa_version = TosaSpecification.create_from_string(base_tosa_version)
@@ -134,6 +135,14 @@ def set_quantize_io(self, quantize_io: bool = False) -> "ArmCompileSpecBuilder":
         self.quantize_io = quantize_io
         return self
 
+    def set_input_order(self, input_order: str = None) -> "ArmCompileSpecBuilder":
+        """
+        Reorder the inputs coming in. This may be required when there is more
+        than one input and the U55/U85 CompileSpec is used.
+
+        Args:
+            input_order: The input indices, either as a string separated by
+                commas and/or whitespace (e.g. "1,0,2") or as an iterable of
+                integers. None or an empty value clears the order.
+
+        Returns:
+            This builder, so that calls can be chained.
+
+        Raises:
+            ValueError: If an entry cannot be parsed as an integer.
+        """
+        if isinstance(input_order, str):
+            entries = input_order.replace(",", " ").split()
+        else:
+            entries = list(input_order) if input_order is not None else []
+        # Fail here, next to the caller, instead of later inside preprocess().
+        indices = [int(entry) for entry in entries]
+
+        # build() joins the stored items with " " while preprocess() splits the
+        # decoded value on ",". Storing a single, already comma-joined item
+        # passes through both steps unchanged, so list arguments and
+        # multi-digit indices round-trip correctly as well.
+        self.input_order = [",".join(map(str, indices))] if indices else None
+        return self
+
     def build(self) -> List[CompileSpec]:
         """
         Generate a list of compile spec objects from the builder
@@ -163,6 +172,13 @@ def build(self) -> List[CompileSpec]:
                 CompileSpec("permute_memory_format", "nhwc".encode())
             )
 
+        if self.input_order:
+            self.compile_spec.append(
+                CompileSpec(
+                    "input_order", " ".join(map(str, self.input_order)).encode()
+                )
+            )
+
         if self.quantize_io:
             self.compile_spec.append(CompileSpec("quantize_io", "True".encode()))
 
@@ -214,13 +230,16 @@ def preprocess(  # noqa: C901
         artifact_path = None
         output_format = ""
         compile_flags = []
+        input_order = []
         for spec in compile_spec:
             if spec.key == "debug_artifact_path":
                 artifact_path = spec.value.decode()
             if spec.key == "output_format":
                 output_format = spec.value.decode()
             if spec.key == "compile_flags":
                 compile_flags.append(spec.value.decode())
+            if spec.key == "input_order":
+                input_order = list(map(int, spec.value.decode().split(",")))
 
         # Check that the output format is set in the compile spec
         if not output_format:
@@ -246,19 +265,27 @@ def preprocess(  # noqa: C901
         )
 
         node_visitors = get_node_visitors(edge_program, tosa_spec)
-
+        input_count = 0
         for node in graph_module.graph.nodes:
             if node.op == "call_function":
                 process_call_function(node, tosa_graph, node_visitors, tosa_spec)
             elif node.op == "placeholder":
                 process_placeholder(node, tosa_graph, edge_program, tosa_spec)
+                if node.name in edge_program.graph_signature.user_inputs:
+                    input_count += 1
             elif node.op == "output":
                 process_output(node, tosa_graph)
             else:
                 # This will only happen if an unpartitioned graph is passed without
                 # any checking of compatibility.
                 dbg_fail(node, tosa_graph, artifact_path)
 
+        if len(input_order) > 0:
+            if input_count != len(input_order):
+                raise RuntimeError(
+                    "The rank of the input order is not equal to amount of input tensors"
+                )
+
         # TODO: It would be awesome if this dump could somehow be done on top level and not here.
         # Problem is that the desc.json has to be created on the tosa_graph object, which we can't
         # access from top level.
@@ -275,7 +302,7 @@ def preprocess(  # noqa: C901
         # preprocess and some consume TOSA fb directly.
         if output_format == "vela":
             # Emit vela_bin_stream format
-            binary = vela_compile(tosa_graph, compile_flags)
+            binary = vela_compile(tosa_graph, compile_flags, input_order)
         elif output_format == "tosa":
             # Emit TOSA flatbuffer
             binary = bytes(tosa_graph.serialize())

diff --git a/backends/arm/arm_vela.py b/backends/arm/arm_vela.py
@@ -17,10 +17,13 @@
 
 # Pack either input or output tensor block, compose the related arrays into
 # per-io structs to simplify runtime use.
-def vela_bin_pack_io(prefix, data):
-    ios = struct.pack("<i", len(data[prefix + "_shape"]))
-    for i in range(len(data[prefix + "_shape"])):
-        io_shape = data[prefix + "_shape"][i]
+def vela_bin_pack_io(prefix, data, shape_order=None):
+    vela_input_shapes = data[prefix + "_shape"]
+
+    order = shape_order if shape_order else range(len(vela_input_shapes))
+    ios = struct.pack("<i", len(vela_input_shapes))
+    for i in order:
+        io_shape = vela_input_shapes[i]
         io_elem_size = data[prefix + "_elem_size"][i]
         io_offset = data[prefix + "_offset"][i]
         io_region = data[prefix + "_region"][i]
@@ -36,7 +39,7 @@ def vela_bin_pack_io(prefix, data):
 # Output via Vela to binary stream for ArmBackendEthosU
 # WARNING: Do not change this without changing VelaBinStream.cpp as that
 #          function consumes this format and the two need to align.
-def vela_compile(tosa_graph, args: List[str]):
+def vela_compile(tosa_graph, args: List[str], shape_order=None):
     with tempfile.TemporaryDirectory() as tmpdir:
         tosaname = "out.tosa"
         flatbuffer = tosa_graph.serialize()
@@ -78,7 +81,7 @@ def vela_compile(tosa_graph, args: List[str]):
             bin_blocks["scratch_data"] = b"\x00" * block_length
 
             # Capture inputs and outputs
-            bin_blocks["inputs"] = vela_bin_pack_io("input", data)
+            bin_blocks["inputs"] = vela_bin_pack_io("input", data, shape_order)
             bin_blocks["outputs"] = vela_bin_pack_io("output", data)
 
             bin_blocks["vela_end_stream"] = b""

diff --git a/backends/arm/operator_support/__init__.py b/backends/arm/operator_support/__init__.py
@@ -5,4 +5,9 @@
 
 # pyre-unsafe
 
-from . import mean_dim_support, tosa_supported_operators, var_correction_support  # noqa
+from . import (  # noqa
+    mean_dim_support,
+    right_shift_support,
+    tosa_supported_operators,
+    var_correction_support,
+)
diff --git a/backends/arm/operator_support/right_shift_support.py b/backends/arm/operator_support/right_shift_support.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import logging
+
+import torch.fx as fx
+from executorch.backends.arm.operator_support.tosa_supported_operators import (
+    register_tosa_support_check,
+    SupportedTOSAOperatorCheck,
+)
+from executorch.backends.arm.tosa_specification import Tosa_0_80, TosaSpecification
+from executorch.exir.dialects._ops import ops as exir_ops
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.WARNING)
+
+
+@register_tosa_support_check
+class RightShiftSupported(SupportedTOSAOperatorCheck):
+    """
+    Support check for the scalar right-shift edge operator, declared for the
+    TOSA-0.80.0 BI and MI specifications listed in `tosa_specs`.
+    """
+
+    targets = [exir_ops.edge.aten.__rshift__.Scalar]
+
+    tosa_specs = [
+        TosaSpecification.create_from_string("TOSA-0.80.0+BI"),
+        TosaSpecification.create_from_string("TOSA-0.80.0+MI"),
+    ]
+
+    def is_node_supported(self, node: fx.Node, tosa_spec: TosaSpecification):
+        """
+        Report `node` as supported.
+
+        Always returns True. When `tosa_spec` is a Tosa_0_80 specification with
+        `is_U55_subset` set, a warning naming the node target is logged first.
+        """
+
+        # TODO MLETORCH-525 Remove warning
+        if isinstance(tosa_spec, Tosa_0_80) and tosa_spec.is_U55_subset:
+            # Emit through this module's logger so the record carries the
+            # module name and honours the level configured for it, instead of
+            # going through (and implicitly configuring) the root logger.
+            logger.warning(f"{node.target} may introduce one-off errors.")
+        return True
diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py
@@ -27,6 +27,7 @@
     op_reciprocal,
     op_relu,
     op_repeat,
+    op_rshift,
     op_rsqrt,
     op_select,
     op_sigmoid,