From f9057b75cc0ef8b0c5d639ce7d7487b1a8272c8b Mon Sep 17 00:00:00 2001 From: daquexian Date: Sun, 23 Aug 2020 14:25:40 +0800 Subject: [PATCH 01/14] naive moving --- .gitignore | 91 +++++ .gitmodules | 3 + CMakeLists.txt | 110 ++++++ VERSION_NUMBER | 1 + onnx_opt/__init__.py | 59 +++ onnx_opt/cpp2py_export.cc | 37 ++ onnx_opt/optimize.cc | 45 +++ onnx_opt/optimize.h | 56 +++ onnx_opt/pass.cc | 103 ++++++ onnx_opt/pass.h | 212 +++++++++++ onnx_opt/pass_manager.cc | 47 +++ onnx_opt/pass_manager.h | 51 +++ onnx_opt/pass_registry.cc | 18 + onnx_opt/pass_registry.h | 88 +++++ onnx_opt/passes/eliminate_deadend.h | 39 ++ onnx_opt/passes/eliminate_identity.h | 38 ++ onnx_opt/passes/eliminate_nop_dropout.h | 45 +++ .../passes/eliminate_nop_monotone_argmax.h | 66 ++++ onnx_opt/passes/eliminate_nop_pad.h | 77 ++++ onnx_opt/passes/eliminate_nop_transpose.h | 46 +++ .../passes/eliminate_unused_initializer.h | 80 ++++ .../passes/extract_constant_to_initializer.h | 46 +++ onnx_opt/passes/fuse_add_bias_into_conv.h | 157 ++++++++ onnx_opt/passes/fuse_bn_into_conv.h | 190 ++++++++++ onnx_opt/passes/fuse_consecutive_concats.h | 76 ++++ .../passes/fuse_consecutive_log_softmax.h | 49 +++ .../fuse_consecutive_reduce_unsqueeze.h | 65 ++++ onnx_opt/passes/fuse_consecutive_squeezes.h | 80 ++++ onnx_opt/passes/fuse_consecutive_transposes.h | 74 ++++ .../passes/fuse_matmul_add_bias_into_gemm.h | 107 ++++++ onnx_opt/passes/fuse_pad_into_conv.h | 173 +++++++++ onnx_opt/passes/fuse_transpose_into_gemm.h | 46 +++ onnx_opt/passes/lift_lexical_references.h | 231 ++++++++++++ onnx_opt/passes/nop.h | 26 ++ onnx_opt/passes/split.h | 228 ++++++++++++ setup.py | 344 ++++++++++++++++++ third_party/onnx | 1 + 37 files changed, 3205 insertions(+) create mode 100644 .gitignore create mode 100644 .gitmodules create mode 100644 CMakeLists.txt create mode 100644 VERSION_NUMBER create mode 100644 onnx_opt/__init__.py create mode 100644 onnx_opt/cpp2py_export.cc create mode 100644 onnx_opt/optimize.cc create 
mode 100644 onnx_opt/optimize.h create mode 100644 onnx_opt/pass.cc create mode 100644 onnx_opt/pass.h create mode 100644 onnx_opt/pass_manager.cc create mode 100644 onnx_opt/pass_manager.h create mode 100644 onnx_opt/pass_registry.cc create mode 100644 onnx_opt/pass_registry.h create mode 100644 onnx_opt/passes/eliminate_deadend.h create mode 100644 onnx_opt/passes/eliminate_identity.h create mode 100644 onnx_opt/passes/eliminate_nop_dropout.h create mode 100644 onnx_opt/passes/eliminate_nop_monotone_argmax.h create mode 100644 onnx_opt/passes/eliminate_nop_pad.h create mode 100644 onnx_opt/passes/eliminate_nop_transpose.h create mode 100644 onnx_opt/passes/eliminate_unused_initializer.h create mode 100644 onnx_opt/passes/extract_constant_to_initializer.h create mode 100644 onnx_opt/passes/fuse_add_bias_into_conv.h create mode 100644 onnx_opt/passes/fuse_bn_into_conv.h create mode 100644 onnx_opt/passes/fuse_consecutive_concats.h create mode 100644 onnx_opt/passes/fuse_consecutive_log_softmax.h create mode 100644 onnx_opt/passes/fuse_consecutive_reduce_unsqueeze.h create mode 100644 onnx_opt/passes/fuse_consecutive_squeezes.h create mode 100644 onnx_opt/passes/fuse_consecutive_transposes.h create mode 100644 onnx_opt/passes/fuse_matmul_add_bias_into_gemm.h create mode 100644 onnx_opt/passes/fuse_pad_into_conv.h create mode 100644 onnx_opt/passes/fuse_transpose_into_gemm.h create mode 100644 onnx_opt/passes/lift_lexical_references.h create mode 100644 onnx_opt/passes/nop.h create mode 100644 onnx_opt/passes/split.h create mode 100644 setup.py create mode 160000 third_party/onnx diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..512a0b353 --- /dev/null +++ b/.gitignore @@ -0,0 +1,91 @@ +## General + +# Compiled Object files +*.slo +*.lo +*.o +*.cuo + +# Compiled Dynamic libraries +*.so +*.dylib +*.pyd + +# Compiled Static libraries +*.lai +*.la +*.a + +# Compiled python +*.pyc + +# Compiled MATLAB +*.mex* + +# IPython notebook checkpoints 
+.ipynb_checkpoints + +# Editor temporaries +*.swn +*.swo +*.swp +*~ + +# Sublime Text settings +*.sublime-workspace +*.sublime-project + +# Eclipse Project settings +*.*project +.settings + +# QtCreator files +*.user + +# PyCharm files +.idea + +# Visual Studio Code files +.vscode + +# OSX dir files +.DS_Store + +## ONNX + +# build, distribute, and bins (+ python proto bindings) +build +build_* +.build_debug/* +.build_release/* +.setuptools-cmake-build/* + +# setup.py intermediates +.eggs +dist +onnx_opt.egg-info +*.ninja +.ninja_deps +.ninja_log +compile_commands.json + +# generated files +onnx/version.py +compile_commands.json + +# test generated files +.cache +.coverage +onnx/examples/.coverage.nbval +.pytest_cache +test_report + +# autocomplete +.ycm_extra_conf.py + +# test coverage data files +*.gcov + +.mypy_cache +virtualenv +venv diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..d2c597cd6 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "third_party/onnx"] + path = third_party/onnx + url = https://github.com/onnx/onnx diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..2502e9224 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,110 @@ +cmake_minimum_required(VERSION 3.1) + +project(onnx_optimizer C CXX) + +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +set(ONNX_ROOT ${PROJECT_SOURCE_DIR}/third_party/onnx) +option(ONNX_BUILD_OPTIMIZER "" OFF) +add_subdirectory(${ONNX_ROOT}) + +file(GLOB_RECURSE onnx_opt_srcs "onnx_opt/*.cc" + "onnx_opt/*.h" + ) +list(REMOVE_ITEM onnx_opt_srcs "${PROJECT_SOURCE_DIR}/onnx_opt/cpp2py_export.cc") + +add_library(onnx_optimizer ${onnx_opt_srcs}) +target_link_libraries(onnx_optimizer PUBLIC onnx) + +if(BUILD_ONNX_PYTHON) + if("${PY_EXT_SUFFIX}" STREQUAL "") + if(MSVC) + set(PY_EXT_SUFFIX ".pyd") + else() + set(PY_EXT_SUFFIX ".so") + endif() + endif() + + add_library(onnx_opt_cpp2py_export MODULE "onnx_opt/cpp2py_export.cc") + set_target_properties(onnx_opt_cpp2py_export 
PROPERTIES PREFIX "") + set_target_properties(onnx_opt_cpp2py_export + PROPERTIES COMPILE_FLAGS "-fvisibility=hidden") + set_target_properties(onnx_opt_cpp2py_export PROPERTIES SUFFIX ${PY_EXT_SUFFIX}) + set_target_properties(onnx_opt_cpp2py_export + PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + target_include_directories(onnx_opt_cpp2py_export PRIVATE + $ + $ + $ + ${PYTHON_INCLUDE_DIR}) + # pybind11 is a header only lib + find_package(pybind11 2.2) + if(pybind11_FOUND) + target_include_directories(onnx_opt_cpp2py_export PUBLIC + ${pybind11_INCLUDE_DIRS}) + else() + if(EXISTS ${ONNX_ROOT}/third_party/pybind11/include/pybind11/pybind11.h) + target_include_directories(onnx_opt_cpp2py_export PUBLIC + ${ONNX_ROOT}/third_party/pybind11/include) + else() + message(FATAL_ERROR "cannot find pybind") + endif() + endif() + + if(APPLE) + set_target_properties(onnx_opt_cpp2py_export + PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") + target_link_libraries(onnx_opt_cpp2py_export + PRIVATE -Wl,-force_load,$) + elseif(MSVC) + # In MSVC, we will add whole archive in default + target_link_libraries(onnx_opt_cpp2py_export + PRIVATE -WHOLEARCHIVE:$) + elseif(CMAKE_SYSTEM_NAME STREQUAL "AIX") + # whole-archive linker option not available on AIX + target_sources(onnx_opt_cpp2py_export + PRIVATE $) + else() + # Assume everything else is like gcc + target_link_libraries(onnx_opt_cpp2py_export + PRIVATE "-Wl,--whole-archive" $ + "-Wl,--no-whole-archive") + set_target_properties(onnx_opt_cpp2py_export + PROPERTIES LINK_FLAGS "-Wl,--exclude-libs,ALL") + endif() + + target_link_libraries(onnx_opt_cpp2py_export PRIVATE onnx_optimizer) + + if(MSVC) + find_package(PythonInterp ${PY_VERSION} REQUIRED) + find_package(PythonLibs ${PY_VERSION} REQUIRED) + target_link_libraries(onnx_opt_cpp2py_export PRIVATE ${PYTHON_LIBRARIES}) + target_compile_options(onnx_opt_cpp2py_export + PRIVATE /MP + /WX + /wd4800 # disable warning type' : forcing + # value to bool 'true' or 'false' + # 
(performance warning) + /wd4503 # identifier' : decorated name length + # exceeded, name was truncated + /wd4146 # unary minus operator applied to + # unsigned type, result still + # unsigned from include\google\protob + # uf\wire_format_lite.h + /wd4244 # 'argument': conversion from 'google:: + # protobuf::uint64' to 'int', possible + # loss of data + /wd4267 # Conversion from 'size_t' to 'int', + # possible loss of data + /wd4996 # The second parameter is ignored. + ${EXTRA_FLAGS}) + if(ONNX_USE_PROTOBUF_SHARED_LIBS) + target_compile_options(onnx_opt_cpp2py_export + PRIVATE /wd4251 # 'identifier' : class 'type1' needs to + # have dll-interface to be used by + # clients of class 'type2' + ) + endif() + add_msvc_runtime_flag(onnx_opt_cpp2py_export) + endif() +endif() diff --git a/VERSION_NUMBER b/VERSION_NUMBER new file mode 100644 index 000000000..6e8bf73aa --- /dev/null +++ b/VERSION_NUMBER @@ -0,0 +1 @@ +0.1.0 diff --git a/onnx_opt/__init__.py b/onnx_opt/__init__.py new file mode 100644 index 000000000..c0c044827 --- /dev/null +++ b/onnx_opt/__init__.py @@ -0,0 +1,59 @@ +# ATTENTION: The code in this file is highly EXPERIMENTAL. +# Adventurous users should note that the APIs will probably change. + +"""onnx optimizer + +This enables users to optimize their models. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import onnx +import onnx_opt.onnx_opt_cpp2py_export as C +from onnx import ModelProto +from typing import Text, Sequence, Optional + +"""Apply the optimization on the serialized ModelProto. 
+ +Arguments: + input (ModelProto): model + names (list of string): list of optimization names + +Return: + return (ModelProto) optimized model + +Supported pass names: + -- nop + -- eliminate_identity + -- eliminate_nop_transpose + -- eliminate_nop_pad + -- eliminate_unused_initializer + -- fuse_consecutive_squeezes + -- fuse_consecutive_transposes + -- fuse_add_bias_into_conv + -- fuse_transpose_into_gemm +""" + +get_available_passes = C.get_available_passes + + +def optimize(model, passes=None, fixed_point=False): # type: (ModelProto, Optional[Sequence[Text]], bool) -> ModelProto + if passes is None: + passes = ['eliminate_nop_transpose', + 'eliminate_nop_pad', + 'fuse_consecutive_transposes', + 'fuse_transpose_into_gemm'] + if not isinstance(model, ModelProto): + raise ValueError('Optimizer only accepts ModelProto, incorrect type: {}'.format(type(model))) + + model_str = model.SerializeToString() + if fixed_point: + optimized_model_str = C.optimize_fixedpoint(model_str, passes) + else: + optimized_model_str = C.optimize(model_str, passes) + + return onnx.load_from_string(optimized_model_str) + +__all__ = ['optimize', 'get_available_passes'] diff --git a/onnx_opt/cpp2py_export.cc b/onnx_opt/cpp2py_export.cc new file mode 100644 index 000000000..0a4f60af0 --- /dev/null +++ b/onnx_opt/cpp2py_export.cc @@ -0,0 +1,37 @@ +#include +#include + +#include "onnx/optimizer/optimize.h" +#include "onnx/py_utils.h" + +namespace ONNX_NAMESPACE { +namespace py = pybind11; +using namespace pybind11::literals; +PYBIND11_MODULE(onnx_opt_cpp2py_export, onnx_opt_cpp2py_export) { + onnx_opt_cpp2py_export.doc() = "ONNX Optimizer"; + + onnx_opt_cpp2py_export.def( + "optimize", + [](const py::bytes& bytes, const std::vector& names) { + ModelProto proto{}; + ParseProtoFromPyBytes(&proto, bytes); + auto const result = optimization::Optimize(proto, names); + std::string out; + result.SerializeToString(&out); + return py::bytes(out); + }); + + onnx_opt_cpp2py_export.def( + 
"optimize_fixedpoint", + [](const py::bytes& bytes, const std::vector& names) { + ModelProto proto{}; + ParseProtoFromPyBytes(&proto, bytes); + auto const result = + optimization::OptimizeFixed(proto, names); + std::string out; + result.SerializeToString(&out); + return py::bytes(out); + }); + onnx_opt_cpp2py_export.def("get_available_passes", &optimization::GetAvailablePasses); +} +} diff --git a/onnx_opt/optimize.cc b/onnx_opt/optimize.cc new file mode 100644 index 000000000..dbacb1a32 --- /dev/null +++ b/onnx_opt/optimize.cc @@ -0,0 +1,45 @@ +// ATTENTION: The code in this file is highly EXPERIMENTAL. +// Adventurous users should note that the APIs will probably change. + +#include "onnx/optimizer/optimize.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +GlobalPassRegistry Optimizer::passes; + +Optimizer::Optimizer( + const std::vector& names, + const bool fixed_point) { + if (fixed_point) { + this->pass_manager = + std::shared_ptr(new FixedPointPassManager()); + } else { + this->pass_manager = + std::shared_ptr(new GeneralPassManager()); + } + for (const auto& name : names) { + auto pass = passes.find(name); + this->pass_manager->add(pass); + } +} +Optimizer::~Optimizer() {} + +ModelProto Optimize( + const ModelProto& mp_in, + const std::vector& names) { + Optimizer current_opt(names, false); + return current_opt.optimize(mp_in); +} +ModelProto OptimizeFixed( + const ModelProto& mp_in, + const std::vector& names) { + Optimizer current_opt(names, true); + return current_opt.optimize(mp_in); +} +const std::vector GetAvailablePasses() { + return Optimizer::passes.GetAvailablePasses(); +} + +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/optimize.h b/onnx_opt/optimize.h new file mode 100644 index 000000000..018a62f79 --- /dev/null +++ b/onnx_opt/optimize.h @@ -0,0 +1,56 @@ +// ATTENTION: The code in this file is highly EXPERIMENTAL. +// Adventurous users should note that the APIs will probably change. 
+ +#pragma once + +#include "onnx/common/ir.h" +#include "onnx/common/ir_pb_converter.h" +#include "onnx/common/stl_backports.h" +#include "onnx/optimizer/pass_manager.h" +#include "onnx/optimizer/pass_registry.h" +#include "onnx/proto_utils.h" + +#include "vector" + +namespace ONNX_NAMESPACE { +namespace optimization { + +struct Optimizer { + static GlobalPassRegistry passes; + + public: + Optimizer(const std::vector& names, const bool fixed_point); + ~Optimizer(); + + ModelProto optimize(const ModelProto& mp_in) { + std::shared_ptr g(ImportModelProto(mp_in)); + + if (g.get() == nullptr) { + std::cerr << "Warning: onnx optimizer is unable to parse input model. " + << "(The IR version of the ONNX model may be too old.)" + << std::endl; + // If we can't parse the file, just return the input. + return mp_in; + } + + ModelProto mp_out = PrepareOutput(mp_in); + this->pass_manager->run(*g); + ExportModelProto(&mp_out, g); + return mp_out; + } + + private: + std::shared_ptr pass_manager; +}; + +const std::vector GetAvailablePasses(); + +ModelProto Optimize( + const ModelProto& mp_in, + const std::vector& names); + +ModelProto OptimizeFixed( + const ModelProto& mp_in, + const std::vector& names); +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/pass.cc b/onnx_opt/pass.cc new file mode 100644 index 000000000..df0a89b5b --- /dev/null +++ b/onnx_opt/pass.cc @@ -0,0 +1,103 @@ +#include "onnx/optimizer/pass.h" +#include "onnx/common/assertions.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +Pass::Pass( + PassType pass_type, + PassEfficiency pass_efficiency, + PassOptimizationType pass_optimization_type) { + this->pass_type = pass_type; + this->pass_efficiency = pass_efficiency; + this->pass_optimization_type = pass_optimization_type; +} + +Pass::~Pass() {} + +unsigned int Pass::DescendOnGraphAttributesAndCount( + Node* n, + std::function fn) { + unsigned int num_changes = 0; + for (auto name : n->attributeNames()) { + auto kind = 
n->kindOf(name); + if (kind == AttributeKind::g) { + num_changes += fn(*n->g(name)); + } + if (kind == AttributeKind::gs) { + for (auto& g : n->gs(name)) { + num_changes += fn(*g); + } + } + } + return num_changes; +} + +void Pass::DescendOnGraphAttributesUnconstrained( + Node* n, + std::function fn) { + for (auto name : n->attributeNames()) { + auto kind = n->kindOf(name); + if (kind == AttributeKind::g) { + fn(*n->g(name)); + } + if (kind == AttributeKind::gs) { + for (auto& g : n->gs(name)) { + fn(*g); + } + } + } +} + +PredicateBasedPass::~PredicateBasedPass() {} + +unsigned int PredicateBasedPass::_runPassInternal(Graph& graph) { + unsigned int num_changes = false; + for (auto it = graph.begin(); it != graph.end(); ++it) { + auto* n = *it; + num_changes += this->DescendOnGraphAttributesAndCount( + n, [this](Graph& g) { return _runPassInternal(g); }); + if (this->patternMatchPredicate(n)) { + NodeDestroyType destroy_type = NodeDestroyType::DestroyZero; + num_changes += this->runTransform(n, graph, destroy_type); + + if (destroy_type == NodeDestroyType::DestroyOne) { + it.destroyCurrent(); + } + if (destroy_type == NodeDestroyType::DestroyTwo) { + it.destroyCurrent(); + it.destroyCurrent(); + } + } + } + return num_changes; +} + +PassAnalysisType PredicateBasedPass::getPassAnalysisType() const { + return PassAnalysisType::CountBased; +} + +std::shared_ptr PredicateBasedPass::runPass(Graph& graph) { + bool initialized_pass = this->initializePass(graph); + unsigned int touched_optimizations = this->_runPassInternal(graph); + bool finalized_pass = this->finalizePass(graph); + + return std::shared_ptr(new CountBasedPassAnalysis( + this, touched_optimizations, initialized_pass, finalized_pass)); +} + +CountBasedPassAnalysis::CountBasedPassAnalysis( + Pass* pass, + unsigned int num_positive_transforms, + bool initialization_done, + bool finalization_done) { + this->pass = pass; + this->num_positive_transforms = num_positive_transforms; + this->initialization_done = 
initialization_done; + this->finalization_done = finalization_done; +} + +FullGraphBasedPass::~FullGraphBasedPass() {} + +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/pass.h b/onnx_opt/pass.h new file mode 100644 index 000000000..55d21c909 --- /dev/null +++ b/onnx_opt/pass.h @@ -0,0 +1,212 @@ +// ATTENTION: The code in this file is highly EXPERIMENTAL. +// Adventurous users should note that the APIs will probably change. + +#pragma once + +#include +#include "onnx/common/ir.h" +#include "onnx/onnx_pb.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +// Base struct representing result of a pass. +struct PostPassAnalysis { + virtual ~PostPassAnalysis() = default; +}; + +// Enum that represents the type of optimization it is. +enum PassType { + // Class of optimizations that fuses operations. + Fuse = 0, + // Class of optimizations that removes useless operations. + Nop = 1, + // Class of optimizations that includes some form of seperation. + Separate = 2, + // Immutable pass, also sometimes referred to as an analysis pass. + Immutable = 3, + // Other type of pass. + Other = 4 +}; + +// Enum that represents the return type of the analysis. +enum PassAnalysisType { + // An empty analysis is returned. Most likely will return PostPassAnalysis. + Empty = 0, + // A count based analysis is returned. Most likely of type + // CountBasedPassAnalysis + CountBased = 1 +}; + +enum PassEfficiency { + // A partially efficient optimization pass cannot guarantee that running two + // consecutive passes + // will return the same result as running a single pass. + Partial = 0, + // A completely efficient optimization guarantees that running two consecutive + // passes is equivalent + // to running a single pass. + Complete = 1 +}; + +// Describes what the optimization pass is attempting to optimize. +enum PassOptimizationType { + // Is not optimizing anything. Most likely will be used in an immutable pass. 
+ None = 0, + // Optimizes for compute. + Compute = 1, + // Optimizes for memory. + Memory = 2, + // Optimizes for both compute and memory. + ComputeMemory = 3, + // Optimizes for stability (e.g. log-sum-exp trick). + Stability = 4 +}; + +enum NodeDestroyType { + // Does not destroy node + DestroyZero = 0, + // Equivalent to calling it.destroyCurrent() once. + DestroyOne = 1, + // Equivalent to calling it.destroyCurrent() twice. + DestroyTwo = 2 +}; + +// Base class for all optimizations within ONNX. A pass must contain the +// annotations described above. Furthermore each pass is given the ability to +// initialize and finalize it's pass. Each pass must have a unique name that +// pass managers/registry will use as identification. Finally the pass +// implements runPass which completes the pass inplace. +class Pass { + PassType pass_type; + PassEfficiency pass_efficiency; + PassOptimizationType pass_optimization_type; + + public: + Pass( + PassType pass_type, + PassEfficiency pass_efficiency, + PassOptimizationType pass_optimization_type); + virtual ~Pass(); + + PassType getPassType() const { + return this->pass_type; + } + PassEfficiency getPassEfficiency() const { + return this->pass_efficiency; + } + PassOptimizationType getPassOptimizationType() const { + return this->pass_optimization_type; + } + virtual PassAnalysisType getPassAnalysisType() const = 0; + virtual std::string getPassName() const = 0; + + virtual bool initializePass(Graph&) { + return false; + } + virtual bool finalizePass(Graph&) { + return false; + } + virtual std::shared_ptr runPass(Graph& graph) = 0; + + protected: + // Iterates through the elements in the graph and counts the number of times + // the transform is successfully run. + unsigned int DescendOnGraphAttributesAndCount( + Node* n, + std::function fn); + // A more general version of the function above that doesn't constrain the + // return type of fn. 
+ void DescendOnGraphAttributesUnconstrained( + Node* n, + std::function fn); +}; + +class ImmutablePass : Pass { + public: + explicit ImmutablePass() + : Pass( + PassType::Immutable, + PassEfficiency::Complete, + PassOptimizationType::None) {} + ~ImmutablePass() override; +}; + +// Pass Analysis done after a predicate based pass. +struct CountBasedPassAnalysis : PostPassAnalysis { + // Have to use raw pointer here. The idea is that the pass will pass as + // a parameter to the constructor. We could use std::enable_shared_from_this + // but this complicates the memory model. Also since all passes come from + // GlobalPassRegistry which already utilizes smart pointers we don't have to + // worry about memory leaks from passes. + Pass* pass; + unsigned int num_positive_transforms; + bool initialization_done; + bool finalization_done; + + public: + explicit CountBasedPassAnalysis( + Pass* pass, + unsigned int num_positive_transforms, + bool initialization_done, + bool finalization_done); + + bool graphChanged() { + return this->num_positive_transforms > 0; + } + bool numSucceededTransforms() { + return this->num_positive_transforms; + } + + // Whether or not a repeated application of the pass might be useful. + bool fixedPointOptimizationNeeded() { + return this->graphChanged() && + pass->getPassEfficiency() == PassEfficiency::Partial; + } +}; + +// A pass that is based on pattern matching. The majority of passes will +// implement this pass. In order for the pass to work the patternMatchPredicate +// function must be implemented witch matches a subgraph to the respective +// optimization pass. Lastly the runTransform method must also be implemented +// which simply implements the pass on any node which passes +// patternMatchPredicate. 
+class PredicateBasedPass : public Pass { + public: + explicit PredicateBasedPass( + PassType pass_type, + PassEfficiency pass_efficiency, + PassOptimizationType pass_optimization_type) + : Pass(pass_type, pass_efficiency, pass_optimization_type) {} + ~PredicateBasedPass() override; + + virtual bool patternMatchPredicate(Node* node) = 0; + // Run transform is given the current node in the iterator, a reference to the + // current graph as well as a reference describing how to treat the current + // node in the iterator post transform. Run transform is then responsible for + // running the actual transform as well as describing how to treat the + // iterator node. By default the current node will not call destroy. Do not + // internally delete node instead set the correct destroy_current type. + virtual bool + runTransform(Node* node, Graph& graph, NodeDestroyType& destroy_current) = 0; + + std::shared_ptr runPass(Graph& graph) override; + PassAnalysisType getPassAnalysisType() const override; + + private: + unsigned int _runPassInternal(Graph& graph); +}; + +// The most general pass which allows the user to run a pass given only a graph. 
+class FullGraphBasedPass : public Pass { + public: + explicit FullGraphBasedPass( + PassType pass_type, + PassEfficiency pass_efficiency, + PassOptimizationType pass_optimization_type) + : Pass(pass_type, pass_efficiency, pass_optimization_type) {} + ~FullGraphBasedPass() override; +}; + +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/pass_manager.cc b/onnx_opt/pass_manager.cc new file mode 100644 index 000000000..7ad365a06 --- /dev/null +++ b/onnx_opt/pass_manager.cc @@ -0,0 +1,47 @@ +#include "onnx/optimizer/pass_manager.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +PassManager::PassManager() {} +PassManager::~PassManager() {} + +GeneralPassManager::~GeneralPassManager() { + this->passes.clear(); +} +void GeneralPassManager::add(std::shared_ptr pass) { + this->passes.push_back(std::move(pass)); +} + +std::shared_ptr GeneralPassManager::run(Graph& graph) { + for (const std::shared_ptr& pass : this->passes) { + auto pass_analysis = pass->runPass(graph); + } + return std::shared_ptr(new EmptyPassManagerAnalysis()); +} + +std::shared_ptr FixedPointPassManager::run(Graph& graph) { + bool fixed_point_optimization_done; + + do { + fixed_point_optimization_done = false; + for (const std::shared_ptr& pass : this->passes) { + std::shared_ptr analysis = pass->runPass(graph); + if (pass->getPassAnalysisType() == PassAnalysisType::Empty) { + continue; + } + std::shared_ptr count_analysis = + std::static_pointer_cast(analysis); + + while (count_analysis->fixedPointOptimizationNeeded()) { + count_analysis = std::static_pointer_cast( + pass->runPass(graph)); + fixed_point_optimization_done = true; + } + } + } while (fixed_point_optimization_done); + + return std::shared_ptr(new EmptyPassManagerAnalysis()); +} +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/pass_manager.h b/onnx_opt/pass_manager.h new file mode 100644 index 000000000..c7ba35ef0 --- /dev/null +++ b/onnx_opt/pass_manager.h @@ -0,0 
+1,51 @@ +#pragma once +// ATTENTION: The code in this file is highly EXPERIMENTAL. +// Adventurous users should note that the APIs will probably change. + +#include +#include "onnx/optimizer/pass.h" +#include "onnx/optimizer/passes/eliminate_deadend.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +// An analysis returned from the run done by a manager +struct PassManagerAnalysis {}; +struct EmptyPassManagerAnalysis : PassManagerAnalysis {}; + +// Base class of all PassManager's. The class should be able to add new passes +// as well as run the passes given a graph. +class PassManager { + public: + PassManager(); + virtual ~PassManager(); + + virtual void add(std::shared_ptr P) = 0; + virtual std::shared_ptr run(Graph& graph) = 0; +}; + +// The GeneralPassManager has no restriction on type of Pass and runs the passes +// once in a linear fashion. +class GeneralPassManager : public PassManager { + public: + GeneralPassManager() {} + ~GeneralPassManager() override; + + void add(std::shared_ptr pass) override; + std::shared_ptr run(Graph& graph) override; + + protected: + // use vector here to ensure the order of the passes + // for some pass, order is critical, for example, + // split_init and split_predict should be the last in the list + std::vector> passes; +}; + +// Exhibits the same behavior as GeneralPassManager but will instead check +// whether or not fixed point optimization is needed. +class FixedPointPassManager : public GeneralPassManager { + std::shared_ptr run(Graph& graph) override; +}; + +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/pass_registry.cc b/onnx_opt/pass_registry.cc new file mode 100644 index 000000000..c2eee87f8 --- /dev/null +++ b/onnx_opt/pass_registry.cc @@ -0,0 +1,18 @@ +// ATTENTION: The code in this file is highly EXPERIMENTAL. +// Adventurous users should note that the APIs will probably change. 
+ +#include "onnx/optimizer/pass_registry.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +const std::vector GlobalPassRegistry::GetAvailablePasses() { + std::vector names; + for (const auto& pass : this->passes) { + names.push_back(pass.first); + } + return names; +} + +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/pass_registry.h b/onnx_opt/pass_registry.h new file mode 100644 index 000000000..a925b9142 --- /dev/null +++ b/onnx_opt/pass_registry.h @@ -0,0 +1,88 @@ +// ATTENTION: The code in this file is highly EXPERIMENTAL. +// Adventurous users should note that the APIs will probably change. + +#pragma once + +#include "onnx/common/ir.h" +#include "onnx/common/ir_pb_converter.h" +#include "onnx/common/stl_backports.h" +#include "onnx/optimizer/passes/eliminate_deadend.h" +#include "onnx/optimizer/passes/eliminate_identity.h" +#include "onnx/optimizer/passes/eliminate_nop_dropout.h" +#include "onnx/optimizer/passes/eliminate_nop_monotone_argmax.h" +#include "onnx/optimizer/passes/eliminate_nop_pad.h" +#include "onnx/optimizer/passes/eliminate_nop_transpose.h" +#include "onnx/optimizer/passes/eliminate_unused_initializer.h" +#include "onnx/optimizer/passes/extract_constant_to_initializer.h" +#include "onnx/optimizer/passes/fuse_add_bias_into_conv.h" +#include "onnx/optimizer/passes/fuse_bn_into_conv.h" +#include "onnx/optimizer/passes/fuse_consecutive_concats.h" +#include "onnx/optimizer/passes/fuse_consecutive_log_softmax.h" +#include "onnx/optimizer/passes/fuse_consecutive_reduce_unsqueeze.h" +#include "onnx/optimizer/passes/fuse_consecutive_squeezes.h" +#include "onnx/optimizer/passes/fuse_consecutive_transposes.h" +#include "onnx/optimizer/passes/fuse_matmul_add_bias_into_gemm.h" +#include "onnx/optimizer/passes/fuse_pad_into_conv.h" +#include "onnx/optimizer/passes/fuse_transpose_into_gemm.h" +#include "onnx/optimizer/passes/lift_lexical_references.h" +#include "onnx/optimizer/passes/nop.h" +#include 
"onnx/optimizer/passes/split.h" +#include "onnx/proto_utils.h" + +#include +#include + +namespace ONNX_NAMESPACE { +namespace optimization { + +// Registry containing all passes available in ONNX. +struct GlobalPassRegistry { + std::map> passes; + + GlobalPassRegistry() { + // Register the optimization passes to the optimizer. + registerPass(); + registerPass(); + registerPass(); + registerPass(); + registerPass(); + registerPass(); + registerPass(); + registerPass(); + registerPass(); + registerPass(); + registerPass(); + registerPass(); + registerPass(); + registerPass(); + registerPass(); + registerPass(); + registerPass(); + registerPass(); + registerPass(); + registerPass(); + registerPass(); + registerPass(); + } + + ~GlobalPassRegistry() { + this->passes.clear(); + } + + std::shared_ptr find(std::string pass_name) { + auto it = this->passes.find(pass_name); + ONNX_ASSERTM( + it != this->passes.end(), "pass %s is unknown.", pass_name.c_str()); + return it->second; + } + const std::vector GetAvailablePasses(); + + template + void registerPass() { + static_assert(std::is_base_of::value, "T must inherit from Pass"); + std::shared_ptr pass(new T()); + passes[pass->getPassName()] = pass; + } +}; +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/passes/eliminate_deadend.h b/onnx_opt/passes/eliminate_deadend.h new file mode 100644 index 000000000..a70e2ec9d --- /dev/null +++ b/onnx_opt/passes/eliminate_deadend.h @@ -0,0 +1,39 @@ + +// ATTENTION: The code in this file is highly EXPERIMENTAL. +// Adventurous users should note that the APIs will probably change. 
+#pragma once +#include "onnx/optimizer/pass.h" +namespace ONNX_NAMESPACE { +namespace optimization { +struct EliminateDeadEnd final : public FullGraphBasedPass { + explicit EliminateDeadEnd() + : FullGraphBasedPass( + PassType::Nop, + PassEfficiency::Complete, + PassOptimizationType::Compute) {} + std::string getPassName() const override { + return "eliminate_deadend"; + } + PassAnalysisType getPassAnalysisType() const override { + return PassAnalysisType::CountBased; + } + unsigned int EliminateDead(Graph& graph) { + unsigned int nodes_removed = 0; + auto nodes = graph.nodes().reverse(); + for (auto it = nodes.begin(); it != nodes.end(); it++) { + auto node = *it; + if (!node->hasUses()) { + nodes_removed++; + it.destroyCurrent(); + } + } + return nodes_removed; + } + std::shared_ptr runPass(Graph& graph) override { + auto nodes_removed = this->EliminateDead(graph); + return std::shared_ptr( + new CountBasedPassAnalysis(this, nodes_removed, false, false)); + } +}; +} // namespace optimization +} // namespace ONNX_NAMESPACE \ No newline at end of file diff --git a/onnx_opt/passes/eliminate_identity.h b/onnx_opt/passes/eliminate_identity.h new file mode 100644 index 000000000..3f5f8525b --- /dev/null +++ b/onnx_opt/passes/eliminate_identity.h @@ -0,0 +1,38 @@ +// ATTENTION: The code in this file is highly EXPERIMENTAL. +// Adventurous users should note that the APIs will probably change. 
+ +#pragma once + +#include "onnx/optimizer/pass.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +struct EliminateIdentity final : public PredicateBasedPass { + explicit EliminateIdentity() + : PredicateBasedPass( + PassType::Nop, + PassEfficiency::Complete, + PassOptimizationType::Compute) {} + + std::string getPassName() const override { + return "eliminate_identity"; + } + + bool patternMatchPredicate(Node* node) override { + return node->kind() == kIdentity; + } + bool runTransform(Node* node, Graph&, NodeDestroyType& destroy_current) + override { + + if (node->output()->has_sizes()) { + node->input()->setSizes(node->output()->sizes()); + } + node->output()->replaceAllUsesWith(node->input()); + destroy_current = NodeDestroyType::DestroyOne; + return true; + } +}; + +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/passes/eliminate_nop_dropout.h b/onnx_opt/passes/eliminate_nop_dropout.h new file mode 100644 index 000000000..6be190b5e --- /dev/null +++ b/onnx_opt/passes/eliminate_nop_dropout.h @@ -0,0 +1,45 @@ +// ATTENTION: The code in this file is highly EXPERIMENTAL. +// Adventurous users should note that the APIs will probably change. 
+ +#pragma once + +#include "onnx/optimizer/pass.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +struct EliminateNopDropout final : public PredicateBasedPass { + explicit EliminateNopDropout() + : PredicateBasedPass( + PassType::Nop, + PassEfficiency::Complete, + PassOptimizationType::Compute) {} + + std::string getPassName() const override { + return "eliminate_nop_dropout"; + } + + bool patternMatchPredicate(Node* node) override { + // in opset 12, ratio is an input of Dropout rather than an attribute, + // however we don't want to to remove Dropout fro opset 12+, since it + // supports training-friendly models, for which the Dropout ops are required + return (node->kind() == kDropout && node->hasAttribute(kratio)) && + node->f(kratio) == 0.0; + } + + bool runTransform(Node* node, Graph&, NodeDestroyType& destroy_current) + override { + // Don't assume that theres only one output. + for (size_t i = 0; i < node->outputs().size(); ++i) { + node->outputs()[i]->replaceAllUsesWith(node->input()); + } + if (node->outputs()[0]->has_sizes()) { + node->input()->setSizes(node->outputs()[0]->sizes()); + } + destroy_current = NodeDestroyType::DestroyOne; + return true; + } +}; + +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/passes/eliminate_nop_monotone_argmax.h b/onnx_opt/passes/eliminate_nop_monotone_argmax.h new file mode 100644 index 000000000..67f3fdf79 --- /dev/null +++ b/onnx_opt/passes/eliminate_nop_monotone_argmax.h @@ -0,0 +1,66 @@ +// ATTENTION: The code in this file is highly EXPERIMENTAL. +// Adventurous users should note that the APIs will probably change. 
+#pragma once + +#include "onnx/optimizer/pass.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +const std::unordered_set monotone_node_no_axis_kind{kLog, + kExp, + kSqrt}; + +const std::unordered_set monotone_node_axis_kind{kSoftmax, + kLogSoftmax}; + +struct EliminateNopMonotoneArgmax final : public PredicateBasedPass { + explicit EliminateNopMonotoneArgmax() + : PredicateBasedPass( + PassType::Nop, + PassEfficiency::Partial, + PassOptimizationType::Compute) {} + + std::string getPassName() const override { + return "eliminate_nop_monotone_argmax"; + } + + static inline bool satisfies_monotone_condition(int64_t axis, Node* node) { + if (monotone_node_no_axis_kind.find(node->kind()) != + monotone_node_no_axis_kind.end()) { + return true; + } + if (monotone_node_axis_kind.find(node->kind()) != + monotone_node_axis_kind.end()) { + if (node->hasAttribute(kaxis)) { + return axis == node->i(kaxis); + } + } + return false; + } + + bool patternMatchPredicate(Node* node) override { + if (node->kind() == kArgMax) { + if (node->hasAttribute(kaxis)) { + auto node_axis = node->i(kaxis); + return node->inputs().size() == 1 && + satisfies_monotone_condition(node_axis, node->input()->node()); + } + } + return false; + } + + bool runTransform(Node* node, Graph&, NodeDestroyType&) + override { + Node* monotone_node = node->input()->node(); + if (monotone_node->output()->uses().size() == 1) { + monotone_node->output()->replaceAllUsesWith(monotone_node->input()); + monotone_node->destroy(); + return true; + } + return false; + } +}; + +} // namespace optimization +} // namespace ONNX_NAMESPACE \ No newline at end of file diff --git a/onnx_opt/passes/eliminate_nop_pad.h b/onnx_opt/passes/eliminate_nop_pad.h new file mode 100644 index 000000000..23f8cfc6f --- /dev/null +++ b/onnx_opt/passes/eliminate_nop_pad.h @@ -0,0 +1,77 @@ +// ATTENTION: The code in this file is highly EXPERIMENTAL. +// Adventurous users should note that the APIs will probably change. 
+ +#pragma once + +#include "onnx/defs/tensor_util.h" +#include "onnx/optimizer/pass.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +struct EliminateNopPad final : public PredicateBasedPass { + explicit EliminateNopPad() + : PredicateBasedPass( + PassType::Nop, + PassEfficiency::Complete, + PassOptimizationType::Compute) {} + + std::string getPassName() const override { + return "eliminate_nop_pad"; + } + + static bool is_nop_pad(Node* node, Graph& graph) { + if (node->hasAttribute(kpads)) { + // opset 10 and below + const auto& pads = node->is(kpads); + for (size_t i = 0; i < pads.size(); i++) { + // if pad constant_value is non-zero, this is not a nop pad + if (pads[i] != 0) { + return false; + } + } + return true; + } else { + // opset 11 and above + const auto& pads_name = node->inputs()[1]->uniqueName(); + const auto pads_initializer = graph.getInitializer(pads_name); + // 'pad' node has the 'pads' input which has not been initialized - + // can't proceed with elimination + if (pads_initializer == graph.initializers().end()) + return false; + + // validate values within 'pads' + if (pads_initializer->elem_type() == TensorProto::INT64) { + const auto& pads = ParseData(&*pads_initializer); + for (const auto& val : pads) { + // if pad constant_value is non-zero, this is not a nop pad + if (val != 0) { + return false; + } + } + return true; + } + } + + return false; + } + + bool patternMatchPredicate(Node* node) override { + return node->kind() == kPad; + } + + bool runTransform(Node* node, Graph& graph, NodeDestroyType& destroy_current) + override { + if (!is_nop_pad(node, graph)) + return false; + if (node->output()->has_sizes()) { + node->inputs()[0]->setSizes(node->output()->sizes()); + } + node->output()->replaceAllUsesWith(node->inputs()[0]); + destroy_current = NodeDestroyType::DestroyOne; + return true; + } +}; + +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/passes/eliminate_nop_transpose.h 
b/onnx_opt/passes/eliminate_nop_transpose.h new file mode 100644 index 000000000..ba1595dfc --- /dev/null +++ b/onnx_opt/passes/eliminate_nop_transpose.h @@ -0,0 +1,46 @@ +// ATTENTION: The code in this file is highly EXPERIMENTAL. +// Adventurous users should note that the APIs will probably change. + +#pragma once + +#include "onnx/optimizer/pass.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +struct EliminateNopTranspose final : public PredicateBasedPass { + explicit EliminateNopTranspose() + : PredicateBasedPass( + PassType::Nop, + PassEfficiency::Complete, + PassOptimizationType::Compute) {} + + std::string getPassName() const override { + return "eliminate_nop_transpose"; + } + + static bool is_nop_transpose(const std::vector& perm) { + for (size_t i = 0; i < perm.size(); i++) + if (perm[i] != (int)i) + return false; + return true; + } + + bool patternMatchPredicate(Node* node) override { + return (node->kind() == kTranspose && node->hasAttribute(kperm)) && + is_nop_transpose(node->is(kperm)); + } + + bool runTransform(Node* node, Graph&, NodeDestroyType& destroy_current) + override { + if (node->output()->has_sizes()) { + node->input()->setSizes(node->output()->sizes()); + } + node->output()->replaceAllUsesWith(node->input()); + destroy_current = NodeDestroyType::DestroyOne; + return true; + } +}; + +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/passes/eliminate_unused_initializer.h b/onnx_opt/passes/eliminate_unused_initializer.h new file mode 100644 index 000000000..95995610f --- /dev/null +++ b/onnx_opt/passes/eliminate_unused_initializer.h @@ -0,0 +1,80 @@ +// ATTENTION: The code in this file is highly EXPERIMENTAL. +// Adventurous users should note that the APIs will probably change. 
+ +#pragma once + +// Before: +// A, B, C are in the initializer list +// D = Add(B, C) +// After: +// B, C are in the initializer list and A is removed +// D = Add(B, C) +// +// this pass can handle the case satisfy all following conditions: +// condition 1: A is not used as any node's input +// condition 2: A is not an output + +#include "onnx/optimizer/pass.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +struct EliminateUnusedInitializer final : public FullGraphBasedPass { + explicit EliminateUnusedInitializer() + : FullGraphBasedPass( + PassType::Nop, + PassEfficiency::Complete, + PassOptimizationType::Memory) {} + + std::string getPassName() const override { + return "eliminate_unused_initializer"; + } + + PassAnalysisType getPassAnalysisType() const override { + return PassAnalysisType::Empty; + } + + void erase_used_initializers( + Graph& g, + std::unordered_set* initializer_names) { + for (auto output : g.outputs()) { + initializer_names->erase(output->uniqueName()); + } + for (auto it = g.begin(); it != g.end(); ++it) { + auto* n = *it; + DescendOnGraphAttributesUnconstrained( + n, [this, initializer_names](Graph& graph) { + erase_used_initializers(graph, initializer_names); + }); + for (auto* input : n->inputs()) { + initializer_names->erase(input->uniqueName()); + } + } + } + + void eliminate_unused_initializer(Graph& graph) { + std::unordered_set initializer_names( + graph.initializer_names().begin(), graph.initializer_names().end()); + erase_used_initializers(graph, &initializer_names); + + // remove initializer and input if need + for (std::string name : initializer_names) { + graph.eraseInitializer(name); + auto iter = std::find_if( + graph.inputs().begin(), graph.inputs().end(), [&name](Value* input) { + return input->uniqueName() == name; + }); + if (iter != graph.inputs().end()) { + graph.eraseInput(std::distance(graph.inputs().begin(), iter)); + } + } + } + + std::shared_ptr runPass(Graph& graph) override { + 
eliminate_unused_initializer(graph); + return std::shared_ptr(new PostPassAnalysis()); + } +}; + +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/passes/extract_constant_to_initializer.h b/onnx_opt/passes/extract_constant_to_initializer.h new file mode 100644 index 000000000..696f78c6e --- /dev/null +++ b/onnx_opt/passes/extract_constant_to_initializer.h @@ -0,0 +1,46 @@ +// ATTENTION: The code in this file is highly EXPERIMENTAL. +// Adventurous users should note that the APIs will probably change. + +#pragma once + +// Before: +// A = Constant() +// After: +// A is in the initializer list +// +// this pass can handle the case satisfy all following conditions: +// condition 1: A is the output of a Constant node +#include "onnx/common/assertions.h" +#include "onnx/optimizer/pass.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +struct ExtractConstantToInitializer final : public PredicateBasedPass { + explicit ExtractConstantToInitializer() + : PredicateBasedPass( + PassType::Nop, + PassEfficiency::Complete, + PassOptimizationType::Memory) {} + + std::string getPassName() const override { + return "extract_constant_to_initializer"; + } + + bool patternMatchPredicate(Node* node) override { + return node->kind() == kConstant; + } + + bool runTransform(Node* node, Graph& graph, NodeDestroyType& destroy_current) + override { + const auto name = node->output()->uniqueName(); + Tensor t = node->t(kvalue); + Value* new_init = graph.addInitializerAndInput(t, name); + node->output()->replaceAllUsesWith(new_init); + destroy_current = NodeDestroyType::DestroyOne; + return true; + } +}; + +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/passes/fuse_add_bias_into_conv.h b/onnx_opt/passes/fuse_add_bias_into_conv.h new file mode 100644 index 000000000..0af10cd7c --- /dev/null +++ b/onnx_opt/passes/fuse_add_bias_into_conv.h @@ -0,0 +1,157 @@ +// ATTENTION: The code in this file is highly EXPERIMENTAL. 
+// Adventurous users should note that the APIs will probably change. + +#pragma once + +// Before: +// Z = Conv(X, Y) +// B = Z + A +// After: +// B = Conv(X, Y, A) +// +// the pass can handle the following cases: +// case 1: A is 1D tensor and A.dim[0] == Z.dim[1] +// case 2: A is 1-element 1D tensor + +#include + +#include "onnx/common/assertions.h" +#include "onnx/optimizer/pass.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +struct FuseAddBiasIntoConv final : public PredicateBasedPass { + explicit FuseAddBiasIntoConv() + : PredicateBasedPass( + PassType::Fuse, + PassEfficiency::Complete, + PassOptimizationType::Compute) {} + std::string getPassName() const override { + return "fuse_add_bias_into_conv"; + } + bool patternMatchPredicate(Node* node) override { + return node->kind() == kAdd && node->inputs()[0]->node()->kind() == kConv && + node->inputs()[0]->node()->inputs().size() == 2; + } + bool runTransform(Node* n, Graph& graph, NodeDestroyType& destroy_current) + override { + // due to current broadcasting's constraint, Conv has to be the first + // operand + destroy_current = NodeDestroyType::DestroyZero; + auto orig_conv = n->inputs()[0]; + auto orig_bias = n->inputs()[1]; + // check if bias is Const or in graph's initializers + if (orig_bias->node()->kind() != kConstant && + orig_bias->node()->kind() != kParam) { + return false; + } + // check if conv is only used by Add + if (orig_conv->uses().size() > 1) { + return false; + } + auto conv_shape = orig_conv->sizes(); + auto bias_shape = orig_bias->sizes(); + auto weight_shape = orig_conv->node()->inputs()[1]->sizes(); + int64_t M = -1; + int64_t rank = -1; + // try to get feature M and rank from conv_shape + if (conv_shape.size() > 1 && conv_shape[1].is_int) { + M = conv_shape[1].dim; + rank = conv_shape.size(); + } + // try to get feature M and rank from weight_shape + if (weight_shape.size() > 0 && weight_shape[0].is_int) { + ONNX_ASSERT(M == -1 || M == weight_shape[0].dim); + M = 
weight_shape[0].dim; + ONNX_ASSERT( + rank == -1 || rank == static_cast(weight_shape.size())); + rank = weight_shape.size(); + } + int64_t num_el = 1; + for (int i = 0; i < static_cast(bias_shape.size()); ++i) { + if (bias_shape[i].is_int) { + num_el *= bias_shape[i].dim; + } else { + num_el = -1; + return false; + } + } + if (M == -1 || num_el == -1) { + // No enough information, bail out + return false; + } + if (rank < static_cast(bias_shape.size())) { + return false; + } + if (num_el == 1) { + if (orig_bias->node()->kind() != kParam && + orig_conv->node()->isBefore(orig_bias->node())) { + orig_bias->node()->moveBefore(orig_conv->node()); + } + Value* conv_3rd_input = orig_bias; + if (bias_shape.size() > 1) { + Node* squeeze = graph.create(kSqueeze, 1); + std::vector axes(bias_shape.size() - 1); + std::iota(axes.begin(), axes.end(), 0); + squeeze->is_(kaxes, std::move(axes)); + squeeze->addInput(conv_3rd_input); + conv_3rd_input = squeeze->output(); + squeeze->insertBefore(orig_conv->node()); + } + if (M > 1) { + Node* constant = graph.create(kConstant, 1); + Tensor t; + t.sizes().push_back(static_cast(1)); + t.int64s().push_back(M); + t.elem_type() = TensorProto_DataType_INT64; + Symbol sym = Symbol("value"); + constant->t_(sym, t); + std::vector s = {1}; + constant->output()->setSizes(s); + constant->output()->setElemType(TensorProto_DataType_INT64); + constant->insertBefore(orig_conv->node()); + Node* tile = graph.create(kTile, 1); + tile->addInput(conv_3rd_input); + tile->addInput(constant->output()); + conv_3rd_input = tile->output(); + tile->insertBefore(orig_conv->node()); + } + orig_conv->node()->addInput(conv_3rd_input); + } else if (rank > static_cast(bias_shape.size()) + 1) { + return false; + } else if ( + num_el == M && + bias_shape[1 + bias_shape.size() - static_cast(rank)].dim == + M) { + ONNX_ASSERT(bias_shape.size() > 1); + if (orig_bias->node()->kind() != kParam && + orig_conv->node()->isBefore(orig_bias->node())) { + 
orig_bias->node()->moveBefore(orig_conv->node()); + } + Node* squeeze = graph.create(kSqueeze, 1); + std::vector axes(bias_shape.size()); + std::iota(axes.begin(), axes.end(), static_cast(0)); + axes.erase( + axes.begin() + (1 + bias_shape.size() - static_cast(rank))); + squeeze->is_(kaxes, std::move(axes)); + squeeze->addInput(orig_bias); + squeeze->insertBefore(orig_conv->node()); + orig_conv->node()->addInput(squeeze->output()); + } else { + return false; + } + if (orig_conv->sizes().size() == 0 && n->output()->sizes().size() > 0) { + orig_conv->setSizes(n->output()->sizes()); + } + if (n->output()->elemType() != TensorProto_DataType_UNDEFINED) { + orig_conv->setElemType(n->output()->elemType()); + } + n->replaceAllUsesWith(orig_conv->node()); + destroy_current = NodeDestroyType::DestroyOne; + return true; + } +}; + +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/passes/fuse_bn_into_conv.h b/onnx_opt/passes/fuse_bn_into_conv.h new file mode 100644 index 000000000..e4fdddca1 --- /dev/null +++ b/onnx_opt/passes/fuse_bn_into_conv.h @@ -0,0 +1,190 @@ + +// ATTENTION: The code in this file is highly EXPERIMENTAL. +// Adventurous users should note that the APIs will probably change. 
+ +#pragma once + +// Before: +// conv = Conv() +// bn = BatchNormalization() +// +// After: +// bn is deleted +// new inputs/initializers to conv are added to graph +// any no longer used inputs/initializers are erased from graph +// +// this pass can handle the case satisfy all following conditions: +// condition 1: Run in testing mode +// condition 2: Inputs 1 - 4 of bn are all initializer_size +// condition 3: Output of initial conv has no other uses +// condition 3: Currently works for only DOUBLE, FLOAT32 tensor types +// +// Formula for transformation +// $$ X_{bn} = \frac{s(X - m)}{\sqrt{\sigma + \epsilon}} + b_{bn}$$ +// $$ X_{conv} = X * W + b_{conv} $$ +// thus, substituting $X$ with $X_{conv}$ in the BN equation we get: +// $$X_{bn} = X * \frac{sW}{\sqrt{\sigma + \epsilon}} + \frac{s(b_{conv} - +// m)}{\sqrt{\sigma + \epsilon}} + b_{bn}$$ or +// $$ W' = W\frac{s}{\sqrt{\sigma + \epsilon}}$$ +// $$ b' = (b_{conv} - m)\frac{s}{\sqrt{\sigma + \epsilon}} + b_{bn}$$ + +#include "onnx/common/assertions.h" +#include "onnx/optimizer/pass.h" + +namespace ONNX_NAMESPACE { +namespace optimization { +// TODO: Currently broken for complex values and float16 +struct FuseBNIntoConv final : public PredicateBasedPass { + explicit FuseBNIntoConv() + : PredicateBasedPass( + PassType::Fuse, + PassEfficiency::Complete, + PassOptimizationType::Compute) {} + + std::string getPassName() const override { + return "fuse_bn_into_conv"; + } + + void replace_inputs(Tensor& W, Tensor& b, Node* conv, Graph& graph) { + Value* new_W_value = graph.addInitializerAndInput(W); + Value* old_W_value = conv->inputs()[1]; + conv->replaceInput(1, new_W_value); + if (old_W_value->uses().size() == 0) { + graph.eraseInitializerAndInput(old_W_value); + } + + if (conv->inputs().size() == 3) { + Value* new_b_value = graph.addInitializerAndInput(b); + Value* old_b_value = conv->inputs()[2]; + conv->replaceInput(2, new_b_value); + if (old_b_value->uses().size() == 0) { + 
graph.eraseInitializerAndInput(old_b_value); + } + } else { + Value* new_b_value = graph.addInitializerAndInput(b); + conv->addInput(new_b_value); + } + } + + bool modify_conv(Node* conv, Node* bn, Graph& graph) { + const auto& bn_inputs = bn->inputs(); + const auto& conv_inputs = conv->inputs(); + auto end_iter = graph.initializers().end(); + auto s_iter = graph.getInitializer(bn_inputs[1]->uniqueName()); + auto bbn_iter = graph.getInitializer(bn_inputs[2]->uniqueName()); + auto m_iter = graph.getInitializer(bn_inputs[3]->uniqueName()); + auto var_iter = graph.getInitializer(bn_inputs[4]->uniqueName()); + auto W_iter = graph.getInitializer(conv_inputs[1]->uniqueName()); + if (s_iter == end_iter || bbn_iter == end_iter || m_iter == end_iter || + var_iter == end_iter || W_iter == end_iter) { + return false; + } + + ONNX_ASSERT(s_iter->sizes().size() == 1); + ONNX_ASSERT( + bbn_iter->sizes().size() == 1 && + bbn_iter->sizes()[0] == s_iter->sizes()[0]); + ONNX_ASSERT( + m_iter->sizes().size() == 1 && + m_iter->sizes()[0] == s_iter->sizes()[0]); + ONNX_ASSERT( + var_iter->sizes().size() == 1 && + var_iter->sizes()[0] == s_iter->sizes()[0]); + ONNX_ASSERT( + W_iter->sizes().size() > 2 && W_iter->sizes()[0] == s_iter->sizes()[0]); + ONNX_ASSERT( + s_iter->elem_type() == bbn_iter->elem_type() && + s_iter->elem_type() == m_iter->elem_type() && + s_iter->elem_type() == var_iter->elem_type() && + s_iter->elem_type() == W_iter->elem_type()); + if (s_iter->elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT && + s_iter->elem_type() != ONNX_NAMESPACE::TensorProto_DataType_DOUBLE) { + return false; + } + + Tensor bc; + if (conv_inputs.size() == 3) { + auto bc_iter = graph.getInitializer(conv_inputs[2]->uniqueName()); + if (bc_iter == end_iter) { + return false; + } + bc = *bc_iter; + ONNX_ASSERT( + bc.sizes().size() == 1 && bc.sizes()[0] == s_iter->sizes()[0]); + } + + Tensor s = *s_iter; + const Tensor& bbn = *bbn_iter; + const Tensor& m = *m_iter; + Tensor var = 
*var_iter; + Tensor W = *W_iter; + float epsilon = bn->hasAttribute(kepsilon) ? (float)bn->f(kepsilon) : 1e-5f; + Tensor eps; + +#define DO_COMPUTATION(TENSOR_TYPE, vec) \ + eps.sizes().push_back(s.sizes()[0]); \ + eps.elem_type() = ONNX_NAMESPACE::TensorProto_DataType_##TENSOR_TYPE; \ + for (int64_t i = 0; i < eps.sizes()[0]; ++i) { \ + eps.vec().push_back(epsilon); \ + } \ + if (conv_inputs.size() != 3) { \ + bc.sizes().push_back(s.sizes()[0]); \ + bc.elem_type() = ONNX_NAMESPACE::TensorProto_DataType_##TENSOR_TYPE; \ + for (int64_t i = 0; i < eps.sizes()[0]; ++i) { \ + bc.vec().push_back(0.f); \ + } \ + } \ + var.add(eps); \ + var.sqrt(); \ + s.divide(var); \ + W.scale_by_first_dim(s); \ + bc.subtract(m); \ + bc.multiply(s); \ + bc.add(bbn); + + switch (s.elem_type()) { + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: { + DO_COMPUTATION(FLOAT, floats) + break; + } + case ONNX_NAMESPACE::TensorProto_DataType_DOUBLE: { + DO_COMPUTATION(DOUBLE, doubles) + break; + } + default: + return false; + } +#undef DO_COMPUTATION + replace_inputs(W, bc, conv, graph); + return true; + } + + bool patternMatchPredicate(Node* node) override { + return node->kind() == kBatchNormalization && + node->inputs()[0]->node()->kind() == kConv; + } + bool runTransform(Node* n, Graph& graph, NodeDestroyType& destroy_current) + override { + Node* bn = n; + Node* conv = n->inputs()[0]->node(); + auto origInput = bn->inputs()[0]; + if (origInput->uses().size() > 1 || bn->outputs().size() > 1 || + !modify_conv(conv, bn, graph)) { + destroy_current = NodeDestroyType::DestroyZero; + return false; + } + for (int i = 4; i >= 1; --i) { + if (bn->inputs()[i]->uses().size() == 1) { + auto input = bn->inputs()[i]; + bn->removeInput(i); + graph.eraseInitializerAndInput(input); + } + } + bn->output()->replaceAllUsesWith(origInput); + destroy_current = NodeDestroyType::DestroyOne; + return true; + } +}; + +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git 
a/onnx_opt/passes/fuse_consecutive_concats.h b/onnx_opt/passes/fuse_consecutive_concats.h new file mode 100644 index 000000000..6b18413a7 --- /dev/null +++ b/onnx_opt/passes/fuse_consecutive_concats.h @@ -0,0 +1,76 @@ +// ATTENTION: The code in this file is highly EXPERIMENTAL. +// Adventurous users should note that the APIs will probably change. + +#pragma once + +#include "onnx/optimizer/pass.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +struct FuseConsecutiveConcats final : public PredicateBasedPass { + explicit FuseConsecutiveConcats() + : PredicateBasedPass( + PassType::Fuse, + PassEfficiency::Partial, + PassOptimizationType::Compute) {} + + std::string getPassName() const override { + return "fuse_consecutive_concats"; + } + + void insertInput(Node* node, size_t i, Value* value) { + const auto input_size = node->inputs().size(); + if (i == input_size) { + node->addInput(value); + } else { + for (size_t j = input_size - 1; j >= i; j--) { + Value* cur_input = node->input(j); + if (j == input_size - 1) { + node->addInput(cur_input); + } else { + node->replaceInput(j + 1, cur_input); + } + } + node->replaceInput(i, value); + } + } + + bool patternMatchPredicate(Node* node) override { + // we don't check if our concat node has inputs which are also concat nodes + // because this requires a for loop through the inputs. If it turns out + // there is then we still have to do a for loop in the runTransform portion + // of the code. In order not to waste a loop we don't check the real pattern + // match condition. 
+ return node->kind() == kConcat && node->hasAttribute(kaxis); + } + bool runTransform(Node* concat_node, Graph&, NodeDestroyType& destroy_current) + override { + destroy_current = NodeDestroyType::DestroyZero; + bool transform_ran = false; + for (size_t i = 0; i < concat_node->inputs().size(); i++) { + Value* cur_input_value = concat_node->inputs()[i]; + Node* cur_input_node = cur_input_value->node(); + if (cur_input_node->kind() == kConcat && + cur_input_value->uses().size() == 1 && + cur_input_node->hasAttribute(kaxis) && + cur_input_node->i(kaxis) == concat_node->i(kaxis)) { + transform_ran = true; + // Inserts n inputs of cur_input_node at index i+1~i+1+(n-1), + // and remove cur_input_node at index i. + // As a result, cur_input_node is replaced by its inputs inplace, + // instead of always appending its inputs at the end. + for (size_t j = 0; j < cur_input_node->inputs().size(); j++) { + Value* value = cur_input_node->input(j); + insertInput(concat_node, i + 1 + j, value); + } + concat_node->removeInput(i); + cur_input_node->destroy(); + } + } + return transform_ran; + } +}; + +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/passes/fuse_consecutive_log_softmax.h b/onnx_opt/passes/fuse_consecutive_log_softmax.h new file mode 100644 index 000000000..474c0001b --- /dev/null +++ b/onnx_opt/passes/fuse_consecutive_log_softmax.h @@ -0,0 +1,49 @@ +// ATTENTION: The code in this file is highly EXPERIMENTAL. +// Adventurous users should note that the APIs will probably change. 
+ +#pragma once + +#include "onnx/optimizer/pass.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +struct FuseConsecutiveLogSoftmax final : public PredicateBasedPass { + explicit FuseConsecutiveLogSoftmax() + : PredicateBasedPass( + PassType::Fuse, + PassEfficiency::Complete, + PassOptimizationType::Compute) {} + + std::string getPassName() const override { + return "fuse_consecutive_log_softmax"; + } + + bool patternMatchPredicate(Node* node) override { + return node->kind() == kLog && node->input()->node()->kind() == kSoftmax && + node->input()->uses().size() == 1; + } + bool runTransform( + Node* log_node, + Graph& graph, + NodeDestroyType& destroy_current) override { + Value* log_node_output = log_node->output(); + Node* softmax_node = log_node->inputs()[0]->node(); + Node* log_softmax_node = graph.create(kLogSoftmax, 1); + + // log_softmax_node construction + log_softmax_node->i_(kaxis, softmax_node->i(kaxis)); + log_softmax_node->addInput(softmax_node->input()); + log_softmax_node->insertBefore(softmax_node); + log_softmax_node->output()->setSizes(log_node_output->sizes()); + log_softmax_node->output()->setElemType(log_node_output->elemType()); + + log_node->replaceAllUsesWith(log_softmax_node); + log_node->removeAllInputs(); + destroy_current = NodeDestroyType::DestroyTwo; + return true; + } +}; + +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/passes/fuse_consecutive_reduce_unsqueeze.h b/onnx_opt/passes/fuse_consecutive_reduce_unsqueeze.h new file mode 100644 index 000000000..ae6731755 --- /dev/null +++ b/onnx_opt/passes/fuse_consecutive_reduce_unsqueeze.h @@ -0,0 +1,65 @@ +// ATTENTION: The code in this file is highly EXPERIMENTAL. +// Adventurous users should note that the APIs will probably change. 
+ +#pragma once + +#include "onnx/optimizer/pass.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +const std::unordered_set reduction_operators{kReduceL1, + kReduceL2, + kReduceLogSum, + kReduceLogSumExp, + kReduceMax, + kReduceMean, + kReduceMin, + kReduceProd, + kReduceSum, + kReduceSumSquare}; + +struct FuseConsecutiveReduceUnsqueeze final : public PredicateBasedPass { + explicit FuseConsecutiveReduceUnsqueeze() + : PredicateBasedPass( + PassType::Fuse, + PassEfficiency::Complete, + PassOptimizationType::Compute) {} + + std::string getPassName() const override { + return "fuse_consecutive_reduce_unsqueeze"; + } + bool patternMatchPredicate(Node* node) override { + // check that the current node is of type Unsqueeze and has defined axes + bool cur_node_check = + node->kind() == kUnsqueeze && node->hasAttribute(kaxes); + if (cur_node_check) { + Node* prev_node = node->input()->node(); + // check that the previous node a reduction operator and has defined + // axes/keepdims + bool reduction_node_check = reduction_operators.find(prev_node->kind()) != + reduction_operators.end() && + prev_node->hasAttribute(kaxes) && prev_node->hasAttribute(kkeepdims); + if (reduction_node_check) { + // insure that keepdims is set to false currently + return prev_node->i(kkeepdims) == 0 && node->is(kaxes) == prev_node->is(kaxes); + } + } + return false; + } + bool runTransform(Node* node, Graph&, NodeDestroyType& destroy_current) + override { + Node* reduction_op = node->input()->node(); + // set keepdims flag to be true + reduction_op->i_(kkeepdims, 1); + // remove unnecessary unsqueeze + reduction_op->output()->setSizes(node->output()->sizes()); + reduction_op->output()->setElemType(node->output()->elemType()); + node->output()->replaceAllUsesWith(node->input()); + destroy_current = NodeDestroyType::DestroyOne; + return true; + } +}; + +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/passes/fuse_consecutive_squeezes.h 
b/onnx_opt/passes/fuse_consecutive_squeezes.h new file mode 100644 index 000000000..550cadd21 --- /dev/null +++ b/onnx_opt/passes/fuse_consecutive_squeezes.h @@ -0,0 +1,80 @@ +// ATTENTION: The code in this file is highly EXPERIMENTAL. +// Adventurous users should note that the APIs will probably change. + +#pragma once + +// Before: +// X is a tensor with shape=[1, 1, 2, 3, 1, 5, 1] +// Y = Squeeze(X, axes=[1, 4]) -> shape=[1, 2, 3, 5, 1] +// Z = Squeeze(Y, axes=[0, 4]) -> shape=[2, 3, 5] +// After: +// Z = Squeeze(X, axes=[0, 1, 4, 6]) +#include "onnx/optimizer/pass.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +struct FuseConsecutiveSqueezes final : public PredicateBasedPass { + explicit FuseConsecutiveSqueezes() + : PredicateBasedPass( + PassType::Fuse, + PassEfficiency::Complete, + PassOptimizationType::Compute) {} + + std::string getPassName() const override { + return "fuse_consecutive_squeezes"; + } + // returns a vector `ret` such that squeeze by `ret` is equivalent + // to squeeze by `axes_1` and then by `axes_2` + std::vector compose_squeezes( + const std::vector& axes_1, + const std::vector& axes_2) { + std::vector ret; + ret.reserve(axes_1.size() + axes_2.size()); + + std::vector sorted_axes_1(axes_1.begin(), axes_1.end()); + std::sort(sorted_axes_1.begin(), sorted_axes_1.end()); + std::copy( + sorted_axes_1.begin(), sorted_axes_1.end(), std::back_inserter(ret)); + + for (int64_t i : axes_2) { + for (auto iter = sorted_axes_1.begin(); iter != sorted_axes_1.end(); + ++iter) { + // if current axis 1 - prev_num is bigger than axis 2 + // put axis 2 + prev_num as new axis + int64_t prev_num = std::distance(sorted_axes_1.begin(), iter); + if (*iter - prev_num > i) { + ret.push_back(i + prev_num); + break; + } + // if no current axis 1 - prev_num is bigger than axis 2 + // put axis 2 + prev_num + 1 as new axis + if (std::next(iter) == sorted_axes_1.end()) { + ret.push_back(i + prev_num + 1); + } + } + } + std::sort(ret.begin(), ret.end()); + 
return ret; + } + + bool patternMatchPredicate(Node* node) override { + return node->kind() == kSqueeze && + node->input()->node()->kind() == kSqueeze; + } + bool runTransform(Node* n, Graph&, NodeDestroyType& destroy_current) + override { + auto orig_input = n->input(); + n->is_( + kaxes, compose_squeezes(orig_input->node()->is(kaxes), n->is(kaxes))); + n->replaceInput(0, orig_input->node()->input()); + if (orig_input->uses().size() == 0) { + orig_input->node()->destroy(); + } + destroy_current = NodeDestroyType::DestroyZero; + return true; + } +}; + +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/passes/fuse_consecutive_transposes.h b/onnx_opt/passes/fuse_consecutive_transposes.h new file mode 100644 index 000000000..ef2fb664a --- /dev/null +++ b/onnx_opt/passes/fuse_consecutive_transposes.h @@ -0,0 +1,74 @@ +// ATTENTION: The code in this file is highly EXPERIMENTAL. +// Adventurous users should note that the APIs will probably change. + +#pragma once + +#include "onnx/optimizer/pass.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +struct FuseConsecutiveTransposes final : public PredicateBasedPass { + explicit FuseConsecutiveTransposes() + : PredicateBasedPass( + PassType::Fuse, + PassEfficiency::Complete, + PassOptimizationType::Compute) {} + + std::string getPassName() const override { + return "fuse_consecutive_transposes"; + } + + // returns a vector `ret` such that transposing by `ret` is equivalent + // to transposing by `t1` and then by `t2` + std::vector compose_transposes( + const std::vector& t1, + const std::vector& t2) { + ONNX_ASSERT(t1.size() == t2.size()); + std::vector ret; + ret.reserve(t1.size()); + for (size_t i = 0; i < t1.size(); i++) { + ONNX_ASSERT(t2[i] < static_cast(t1.size())); + ONNX_ASSERT( + t1[static_cast(t2[i])] < static_cast(t1.size())); + ret.push_back(t1[static_cast(t2[i])]); + } + return ret; + } + + bool patternMatchPredicate(Node* node) override { + return node->kind() == 
kTranspose && + node->input()->node()->kind() == kTranspose; + } + + bool runTransform(Node* n, Graph&, NodeDestroyType& destroy_current) + override { + auto origInput = n->input(); + if (!n->hasAttribute(kperm) && !origInput->node()->hasAttribute(kperm)) { + // One special case (two consecutive transposes with no perm, + // since we do not have the shape information here, we have + // to eliminate two transpose together. + if (n->output()->has_sizes()) { + origInput->node()->input()->setSizes(n->output()->sizes()); + } + n->replaceAllUsesWith(origInput->node()->input()->node()); + destroy_current = NodeDestroyType::DestroyTwo; + return true; + } + if (!n->hasAttribute(kperm) || !origInput->node()->hasAttribute(kperm)) { + destroy_current = NodeDestroyType::DestroyZero; + return false; + } + n->is_( + kperm, compose_transposes(origInput->node()->is(kperm), n->is(kperm))); + n->replaceInput(0, origInput->node()->input()); + if (origInput->uses().size() == 0) { + origInput->node()->destroy(); + } + destroy_current = NodeDestroyType::DestroyZero; + return false; + } +}; + +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/passes/fuse_matmul_add_bias_into_gemm.h b/onnx_opt/passes/fuse_matmul_add_bias_into_gemm.h new file mode 100644 index 000000000..2ddef7cab --- /dev/null +++ b/onnx_opt/passes/fuse_matmul_add_bias_into_gemm.h @@ -0,0 +1,107 @@ +// ATTENTION: The code in this file is highly EXPERIMENTAL. +// Adventurous users should note that the APIs will probably change. 
+ +#pragma once + +// Before: +// Z = MatMul(X, Y) +// A = Z + Bias +// After: +// A = Gemm(X, Y, Bias) +// +// the pass can handle the case when: +// case 1: Bias is 1D tensor and Bias.dim[0] == Z.dim[1] +// case 2: Bias is 2D tensor and Bias.dim[0] == Z.dim[0] or 1 +// and Bias.dim[1] = Z.dim[1] + +#include + +#include "onnx/common/assertions.h" +#include "onnx/optimizer/pass.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +struct FuseMatMulAddBiasIntoGemm final : public PredicateBasedPass { + explicit FuseMatMulAddBiasIntoGemm() + : PredicateBasedPass( + PassType::Fuse, + PassEfficiency::Complete, + PassOptimizationType::Compute) {} + std::string getPassName() const override { + return "fuse_matmul_add_bias_into_gemm"; + } + bool patternMatchPredicate(Node* node) override { + return node->kind() == kAdd && + node->inputs()[0]->node()->kind() == kMatMul; + } + bool runTransform(Node* n, Graph& graph, NodeDestroyType& destroy_current) + override { + // due to current broadcasting's constraint, MatMul has to be the first + // operand + destroy_current = NodeDestroyType::DestroyZero; + auto orig_matmul = n->inputs()[0]; + auto orig_bias = n->inputs()[1]; + // check if bias is Const or in graph's initializers + if (orig_bias->node()->kind() != kConstant && + orig_bias->node()->kind() != kParam) { + return false; + } + // check if MatMul is only used by Add + if (orig_matmul->uses().size() > 1) { + return false; + } + auto x_shape = orig_matmul->node()->inputs()[0]->sizes(); + auto y_shape = orig_matmul->node()->inputs()[1]->sizes(); + int64_t z_N = -1; + int64_t z_M = -1; + // try to get feature N from x_shape + if (static_cast(x_shape.size()) == 2 && x_shape[0].is_int) { + z_N = x_shape[0].dim; + } else { + return false; + } + // try to get feature M from y_shape + if (static_cast(y_shape.size()) == 2 && y_shape[1].is_int) { + z_M = y_shape[1].dim; + } else { + return false; + } + // check if bias_shape is compatible + auto bias_shape = 
orig_bias->sizes(); + auto bias_dim = static_cast(bias_shape.size()); + int64_t bias_N = -1; + int64_t bias_M = -1; + if (bias_dim == 1 && bias_shape[0].is_int) { + bias_N = 1; + bias_M = bias_shape[0].dim; + } else if (bias_dim == 2 && bias_shape[0].is_int && bias_shape[1].is_int) { + bias_N = bias_shape[0].dim; + bias_M = bias_shape[1].dim; + } else { + return false; + } + if ((bias_N != z_N && bias_N != 1) || bias_M != z_M) { + return false; + } + // proceed to fuse MatMul and Add into Gemm + Node* gemm = graph.create(kGemm, + orig_matmul->node()->inputs(), + n->outputs().size()); + gemm->addInput(n->inputs()[1]); + for (int i = 0; i < static_cast(gemm->outputs().size()); ++i) { + gemm->outputs()[i]->copyMetadata(n->outputs()[i]); + } + gemm->f_(kalpha, 1.0); + gemm->f_(kbeta, 1.0); + gemm->i_(ktransA, 0); + gemm->i_(ktransB, 0); + gemm->insertBefore(orig_matmul->node()); + n->replaceAllUsesWith(gemm); + destroy_current = NodeDestroyType::DestroyTwo; + return true; + } +}; + +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/passes/fuse_pad_into_conv.h b/onnx_opt/passes/fuse_pad_into_conv.h new file mode 100644 index 000000000..4a66aa6e5 --- /dev/null +++ b/onnx_opt/passes/fuse_pad_into_conv.h @@ -0,0 +1,173 @@ +// ATTENTION: The code in this file is highly EXPERIMENTAL. +// Adventurous users should note that the APIs will probably change. + +#pragma once + +// Before: +// P = Pad(X) - opset 10 and below (or) Pad(X, Pads, [Constant_value]) - opset 11 and +// above Z = Conv(P, Y) +// After: +// Z = Conv(X, Y) with "pads" attribute set +// +// the pass handles the case when Pad is zero-padding the input +// (i.e. 
mode=constant and Constant_value=0) + +#include + +#include "onnx/defs/tensor_util.h" +#include "onnx/optimizer/pass.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +struct FusePadIntoConv final : public PredicateBasedPass { + explicit FusePadIntoConv() + : PredicateBasedPass( + PassType::Fuse, + PassEfficiency::Complete, + PassOptimizationType::Compute) {} + std::string getPassName() const override { + return "fuse_pad_into_conv"; + } + bool patternMatchPredicate(Node* node) override { + return node->kind() == kConv && node->inputs()[0]->node()->kind() == kPad; + } + bool runTransform(Node* n, Graph& graph, NodeDestroyType& destroy_current) + override { + destroy_current = NodeDestroyType::DestroyZero; + + // check if Pad is only used by Conv + if (n->inputs()[0]->uses().size() > 1) { + return false; + } + + Node* conv = n; + Node* pad = n->inputs()[0]->node(); + + // Process 'pads' data + std::vector pads; + if (pad->hasAttribute(kpads)) { + // opset 10 and below + pads = pad->is(kpads); + } else { + // opset 11 and above - first check if 'pad' node has 'pads' input + // initialized + const auto& pads_name = pad->inputs()[1]->uniqueName(); + const auto pads_initializer = graph.getInitializer(pads_name); + // 'pad' node has the 'pads' input which has not been initialized - + // can't proceed with fusing + if (pads_initializer == graph.initializers().end()) { + return false; + } + + // make sure the type of 'pads' is INT64 + if (pads_initializer->elem_type() != TensorProto::INT64) { + return false; + } + + // parse 'pads' data from the initialized input + pads = ParseData(&*pads_initializer); + } + + // Process 'mode' + std::string pad_mode; + if (pad->hasAttribute(kmode)) { + pad_mode = pad->s(kmode); + } else { + pad_mode = "constant"; + } + + // cannot fuse if the pad mode is not "Constant" + if (pad_mode != "constant") { + return false; + } + + // Process 'Constant_value' + // opset 10 and below + if (pad->hasAttribute(kvalue) && 
static_cast(pad->f(kvalue)) != 0.0) { + return false; + } else if (pad->inputs().size() == 3) { + // opset 11 and above - check if the 'pad' node has the optional 'Constant_value' + // input check if it has data initialized + const auto& value_name = pad->inputs()[2]->uniqueName(); + const auto value_initializer = graph.getInitializer(value_name); + + // 'pad' node has the 'Constant_value' input which has not been initialized - + // can't proceed with fusing + if (value_initializer == graph.initializers().end()) { + return false; + } + + // parse 'Constant_value' data from the initialized input and stop optimizer if the + // Constant_value is non-zero + switch (value_initializer->elem_type()) { + case TensorProto::FLOAT: + if (ParseData(&*value_initializer)[0] != 0) + return false; // cannot fuse Pad into Conv + else + break; + + case TensorProto::DOUBLE: + if (ParseData(&*value_initializer)[0] != 0) + return false; // cannot fuse Pad into Conv + else + break; + + case TensorProto::INT32: + if (ParseData(&*value_initializer)[0] != 0) + return false; // cannot fuse Pad into Conv + else + break; + + case TensorProto::INT64: + if (ParseData(&*value_initializer)[0] != 0) + return false; // cannot fuse Pad into Conv + else + break; + + // TODO: Support more uncommon but valid types for Pad op (int8, uint8, int16, uint16, etc.) + + default: + return false; // Either type of Constant_value is invalid or not yet supported by data parsing logic. 
+ // Since we canot validate the data present in 'Constant_value', we exit the optimizer + } + } + + // check if some values in 'pads' prevents us from fusing it into 'Conv' node + int pads_size = static_cast(pads.size()); + + // check if padding is applied only on feature dims + if (pads[0] != 0 || pads[1] != 0 || pads[pads_size / 2] != 0 || + pads[pads_size / 2 + 1] != 0) { + return false; + } + + // check if padding is only positive + if (std::any_of(pads.begin(), pads.end(), [](int64_t local_value) { + return local_value < 0; + })) { + return false; + } + + int conv_pads_size = pads_size - 4; + std::vector conv_pads(conv_pads_size, 0); + // Fuse into existing padding, if available + if (conv->hasAttribute(kpads)) { + conv_pads = conv->is(kpads); + } + + for (int i = 2, j = 0; i < pads_size / 2; ++i, ++j) { + conv_pads[j] += pads[i]; + conv_pads[conv_pads_size / 2 + j] += pads[pads_size / 2 + i]; + } + + conv->is_(kpads, std::move(conv_pads)); + conv->replaceInput(0, pad->inputs()[0]); + pad->destroy(); + + return true; + } +}; + +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/passes/fuse_transpose_into_gemm.h b/onnx_opt/passes/fuse_transpose_into_gemm.h new file mode 100644 index 000000000..b7ada112c --- /dev/null +++ b/onnx_opt/passes/fuse_transpose_into_gemm.h @@ -0,0 +1,46 @@ +// ATTENTION: The code in this file is highly EXPERIMENTAL. +// Adventurous users should note that the APIs will probably change. 
+ +#pragma once + +#include "onnx/optimizer/pass.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +struct FuseTransposeIntoGemm final : public PredicateBasedPass { + explicit FuseTransposeIntoGemm() + : PredicateBasedPass( + PassType::Fuse, + PassEfficiency::Complete, + PassOptimizationType::Compute) {} + std::string getPassName() const override { + return "fuse_transpose_into_gemm"; + } + bool patternMatchPredicate(Node* node) override { + return node->kind() == kGemm; + } + bool runTransform(Node* n, Graph&, NodeDestroyType& destroy_current) + override { + const std::vector simple_trans_perm({1, 0}); + destroy_current = NodeDestroyType::DestroyZero; + bool ret_val = false; + for (size_t i : {0, 1}) { + auto inp = n->inputs()[i]; + auto trans = i == 0 ? ktransA : ktransB; + if (inp->node()->kind() == kTranspose && + inp->node()->is(kperm) == simple_trans_perm) { + n->replaceInput(i, inp->node()->input()); + n->i_(trans, n->hasAttribute(trans) ? !n->i(trans) : 1); + if (inp->uses().size() == 0) { + inp->node()->destroy(); + ret_val = true; + } + } + } + return ret_val; + } +}; + +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/passes/lift_lexical_references.h b/onnx_opt/passes/lift_lexical_references.h new file mode 100644 index 000000000..d0a5eeec9 --- /dev/null +++ b/onnx_opt/passes/lift_lexical_references.h @@ -0,0 +1,231 @@ +#pragma once + +#include +#include "onnx/optimizer/pass.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +// Lift lexically-scoped references within control operators to be inputs of the +// ops themselves. This transformation yields a graph that does not conform to +// the ONNX spec. +// +// The purpose of this pass is to expose the data dependencies within control +// blocks for frameworks that use those dependencies to schedule parallel +// execution. e.g. caffe2 graph execution. 
+// +// Example: +// ******************************** Before ************************************* +// graph test (%X[FLOAT, 5]) { +// %Y = Identity(%X) +// %trip_count = Constant[value = ]() +// %condition = Constant[value = ]() +// %Y2, %Y3 = Loop[body = ](%trip_count, %condition, %) +// return %Y, %Y2 +// } +// +// graph body_graph (%i[INT32, scalar], %cond[BOOL, scalar]) { +// %_Y2 = Identity(%X) +// %_Y3 = Identity(%Y) +// return %cond, %_Y2, %_Y3 +// } +// +// ******************************** After ************************************** +// graph test (%X[FLOAT, 5]) { +// %Y = Identity(%X) +// %trip_count = Constant[value = ]() +// %condition = Constant[value = ]() +// %Y2, %Y3 = Loop[__control_inputs = ['X', 'Y'], body = ](%trip_count, %condition, %) +// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +// return %Y, %Y2 +// } +// +// graph body_graph (%i[INT32, scalar], %cond[BOOL, scalar]) { +// %_Y2 = Identity(%X) +// %_Y3 = Identity(%Y) +// return %cond, %_Y2, %_Y3 +// } +// +// ******************************** Continue Docs******************************* +// +// The algorithm is roughly: +// symbol_table_stack = empty stack of symbol tables +// +// liftreferences(graph) +// -> a set of unresolved reference strings: +// unresolved_references = {} +// +// symbol_table_stack.push(new symbol table containing inputs for this +// sub-graph) for each node in the graph: +// for input in node.inputs: +// if input is not in this frame: +// unresolved_references.insert(input) +// if node is a control flow operator: +// for each sub-graph g: +// for each output in g's body: +// if output is defined in current scope: +// control_inputs.insert(output) +// refs = liftreferences(g) +// for each ref in refs: +// if ref is in this frame or any parent frame (control_inputs): +// control_inputs.insert(ref) +// else: +// unresolved_references.insert(ref) +// set the control inputs attribute to the node +// for output in node.outputs: +// symbol_table_stack.top()[output] = Value* +// return 
unresolved_references +struct LiftLexicalReferences : public FullGraphBasedPass { + explicit LiftLexicalReferences() + : FullGraphBasedPass( + PassType::Separate, + PassEfficiency::Complete, + PassOptimizationType::Memory) {} + + std::string getPassName() const override { + return "lift_lexical_references"; + } + PassAnalysisType getPassAnalysisType() const override { + return PassAnalysisType::Empty; + } + + using ValueTable = std::unordered_map; + + // Environment stack, please to store value table and + // controlled inputs + struct Environment { + Environment(std::shared_ptr next = nullptr) : next(next) {} + + std::shared_ptr next; + + Value* findInThisFrame(const std::string& name) { + auto it = value_table.find(name); + if (it != value_table.end()) { + return it->second; + } + return nullptr; + } + + Value* findInParentFrame(const std::string& name) { + return next ? next->findInAnyFrame(name) : nullptr; + } + + Value* findInAnyFrame(const std::string& name) { + for (auto runner = this; runner; runner = runner->next.get()) { + if (auto r = runner->findInThisFrame(name)) { + return r; + } + } + return nullptr; + } + + void setVar(const std::string& name, Value* value) { + value_table[name] = value; + } + + private: + ValueTable value_table; + }; + + std::shared_ptr environment_stack; + + // environment stack helper + void pushFrame() { + environment_stack = std::make_shared(environment_stack); + } + + std::shared_ptr popFrame() { + auto old_frame = environment_stack; + environment_stack = environment_stack->next; + return old_frame; + } + + std::set liftReferences(Graph* g) { + std::set unresolved_references; + pushFrame(); + for (auto& inp : g->inputs()) { + environment_stack->setVar(inp->uniqueName(), inp); + } + + for (auto* n : g->nodes()) { + // Skip optional input/captured value node. 
+ if (n->kind() == ONNX_NAMESPACE::kUndefined || + n->kind() == ONNX_NAMESPACE::kCaptured) { + continue; + } + for (auto* inp : n->inputs()) { + // Empty string is 0-input variadic argument. Skip that one. + if (!inp->uniqueName().empty() && + !environment_stack->findInThisFrame(inp->uniqueName())) { + unresolved_references.insert(inp->uniqueName()); + } + } + + std::set local_unresolved; + + // if a graph body output has already already been emitted outside of the + // subgraph scope, then it must be added as an input to the subgraph + auto add_subgraph_outputs = [&](Graph* body_graph) { + for (auto* out : body_graph->outputs()) { + if (environment_stack->findInAnyFrame(out->uniqueName())) { + local_unresolved.insert(out->uniqueName()); + } + } + }; + + if (n->kind() == ONNX_NAMESPACE::kLoop) { + auto* body_graph = n->g(ONNX_NAMESPACE::kbody).get(); + local_unresolved = liftReferences(body_graph); + add_subgraph_outputs(body_graph); + } else if (n->kind() == ONNX_NAMESPACE::kIf) { + auto* then_graph = n->g(ONNX_NAMESPACE::kthen_branch).get(); + add_subgraph_outputs(then_graph); + auto then_unresolved = liftReferences(then_graph); + local_unresolved.insert(then_unresolved.begin(), then_unresolved.end()); + auto* else_graph = n->g(ONNX_NAMESPACE::kelse_branch).get(); + add_subgraph_outputs(else_graph); + auto else_unresolved = liftReferences(else_graph); + local_unresolved.insert(else_unresolved.begin(), else_unresolved.end()); + } + + std::vector control_inputs; + for (auto& unresolved : local_unresolved) { + if (environment_stack->findInAnyFrame(unresolved)) { + control_inputs.push_back(unresolved); + } else { + unresolved_references.insert(unresolved); + } + } + + // Create this attribute so the backend knows how many of these inputs + // are simply there for control dependencies + if (!control_inputs.empty()) { + n->ss_(ONNX_NAMESPACE::k__control_inputs, std::move(control_inputs)); + } + + for (auto* out : n->outputs()) { + 
environment_stack->setVar(out->uniqueName(), out); + } + } + + popFrame(); + return unresolved_references; + } + + std::shared_ptr runPass(Graph& graph) override { + auto unresolved = liftReferences(&graph); + + if (unresolved.size()) { + std::string errmsg = "Unresolved value references: "; + for (auto& ref : unresolved) { + errmsg += ref + ","; + } + throw std::runtime_error(errmsg); + } + return std::shared_ptr(new PostPassAnalysis()); + } +}; + +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/passes/nop.h b/onnx_opt/passes/nop.h new file mode 100644 index 000000000..a51321e02 --- /dev/null +++ b/onnx_opt/passes/nop.h @@ -0,0 +1,26 @@ +#pragma once + +#include "onnx/optimizer/pass.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +struct NopEmptyPass final : public FullGraphBasedPass { + explicit NopEmptyPass() + : FullGraphBasedPass( + PassType::Nop, + PassEfficiency::Complete, + PassOptimizationType::None) {} + + std::string getPassName() const override { + return "nop"; + } + PassAnalysisType getPassAnalysisType() const override { + return PassAnalysisType::Empty; + } + std::shared_ptr runPass(Graph&) override { + return std::make_shared(); + } +}; +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/onnx_opt/passes/split.h b/onnx_opt/passes/split.h new file mode 100644 index 000000000..f59a8cc1a --- /dev/null +++ b/onnx_opt/passes/split.h @@ -0,0 +1,228 @@ +// ATTENTION: The code in this file is highly EXPERIMENTAL. +// Adventurous users should note that the APIs will probably change. 
+ +#pragma once + +#include "onnx/optimizer/pass.h" + +namespace ONNX_NAMESPACE { +namespace optimization { + +static constexpr const char* impure_operators[] = { + "RandomNormal", + "RandomNormalLike", + "RandomUniform", + "RandomUniformLike", + "Loop", + "If", + "Scan", +}; + +static bool is_pure_operator(Node* n) { + for (auto x : impure_operators) { + if (n->kind() == Symbol(x)) { + return false; + } + } + return true; +} + +// Split the graph into 'init' and 'predict' nets. This is kind of +// like constant folding, except that rather than actually execute the +// constant computations, we simply split them out into a separate +// graph. Nodes that have any transitive dependency on the +// initializers, or on impure operators, must remain in the predict +// net. All others may be moved to the init net. +// +// This function destructively mutates the graph into either the init +// or the predict net. If you want both, which you probably do, +// arrange to call it twice. +// +// NOTE POTENTIAL BREAKAGE: +// +// The ONNX spec provides no guarantees about "staging", i.e. which +// inputs change on every invocation vs which generally stay the same. +// Here we make the assumption that inputs which have an initializer +// value provided for them vary only between invocations of the init +// net, and are constant across runs of the predict net. +// +static void split_init_and_predict(Graph& graph, bool init, bool predict) { + // The first step is to identify which Values are reachable from + // either of + // - inputs without corresponding initializers + // - impure operators + // Any such Values belong to the predict net. Nodes belong to the + // predict net if they are impure or if any of their inputs do. 
+ + std::unordered_set predict_net_values; + + auto value_belongs_to_predict_net = [&](Value* v) { + return predict_net_values.count(v) > 0; + }; + auto node_belongs_to_predict_net = [&](Node* n) { + return !is_pure_operator(n) || + std::any_of( + n->inputs().begin(), + n->inputs().end(), + value_belongs_to_predict_net); + }; + + { + std::unordered_set initializer_names( + graph.initializer_names().begin(), graph.initializer_names().end()); + + for (Value* v : graph.inputs()) { + if (initializer_names.count(v->uniqueName()) == 0) { + predict_net_values.insert(v); + } + } + } + + for (Node* n : graph.nodes()) { + if (node_belongs_to_predict_net(n)) { + for (Value* v : n->outputs()) { + predict_net_values.insert(v); + } + } + } + + // Any Value which is not itself in the predict net, but which + // is used by a Node which is, becomes an output of the init + // graph and an input of the predict net + std::unordered_set new_interface; + for (Node* n : graph.nodes()) { + if (node_belongs_to_predict_net(n)) { + for (Value* v : n->inputs()) { + if (!value_belongs_to_predict_net(v)) { + new_interface.insert(v); + } + } + } + } + + for (Value* v : graph.outputs()) { + if (!value_belongs_to_predict_net(v)) { + new_interface.insert(v); + } + } + + if (init) { + // Add new outputs corresponding to the boundary between init and + // predict nets, ensuring that we don't duplicate outputs. + for (Value* v : graph.outputs()) { + new_interface.erase(v); + } + for (Value* v : new_interface) { + if (v->node()->kind() == kUndefined) { + continue; + } + graph.registerOutput(v); + } + + // Remove outputs that belong to the predict net. + for (auto i = graph.outputs().size(); i--;) { + if (value_belongs_to_predict_net(graph.outputs()[i])) { + graph.return_node()->removeInput(i); + } + } + + // Delete nodes that belong to the predict net, in reverse + // topological order. 
+ for (auto it = graph.nodes().rbegin(); it != graph.nodes().rend(); it++) { + if (node_belongs_to_predict_net(*it)) { + it.destroyCurrent(); + } + } + + // Remove inputs that belong to the predict net. + for (auto i = graph.inputs().size(); i--;) { + if (value_belongs_to_predict_net(graph.inputs()[i])) { + graph.eraseInput(i); + } + } + } else if (predict) { + // When creating the predict net, 'undefined' nodes will + // naturally go into the init net. We need to have a place to + // copy the ones we want to keep in the predict net. + auto* optionalInputDummyNode = graph.create(kUndefined, 1); + graph.appendNode(optionalInputDummyNode); + optionalInputDummyNode->outputs()[0]->setUniqueName(""); + + // Add new inputs, ensuring that we don't introduce duplicates. + // Also cut the boundary between init and predict net by replacing + // the Values along the boundary with replaceAllUsesWith. + for (Value* v : graph.inputs()) { + new_interface.erase(v); + } + for (Value* v : new_interface) { + if (v->node()->kind() == kUndefined) { + v->replaceAllUsesWith(optionalInputDummyNode->outputs()[0]); + } else { + Value* newv = graph.addInput()->copyMetadata(v); + v->replaceAllUsesWith(newv); + } + } + + // Delete nodes that aren't in the predict net, in reverse + // topological order. + for (auto it = graph.nodes().rbegin(); it != graph.nodes().rend(); it++) { + if (*it == optionalInputDummyNode) { + continue; + } + if (node_belongs_to_predict_net(*it)) { + continue; + } + it.destroyCurrent(); + } + + // Remove inputs that aren't used by the predict net. + for (auto i = graph.inputs().size(); i--;) { + if (graph.inputs()[i]->uses().empty()) { + graph.eraseInput(i); + } + } + + // Remove all initializers, they are already in the init net. 
+ graph.clearInitializers(); + } +} + +struct SplitInit final : public FullGraphBasedPass { + explicit SplitInit() + : FullGraphBasedPass( + PassType::Separate, + PassEfficiency::Complete, + PassOptimizationType::Memory) {} + + std::string getPassName() const override { + return "split_init"; + } + PassAnalysisType getPassAnalysisType() const override { + return PassAnalysisType::Empty; + } + std::shared_ptr runPass(Graph& graph) override { + split_init_and_predict(graph, true, false); + return std::shared_ptr(new PostPassAnalysis()); + } +}; + +struct SplitPredict final : public FullGraphBasedPass { + explicit SplitPredict() + : FullGraphBasedPass( + PassType::Separate, + PassEfficiency::Complete, + PassOptimizationType::Memory) {} + std::string getPassName() const override { + return "split_predict"; + } + PassAnalysisType getPassAnalysisType() const override { + return PassAnalysisType::Empty; + } + std::shared_ptr runPass(Graph& graph) override { + split_init_and_predict(graph, false, true); + return std::shared_ptr(new PostPassAnalysis()); + } +}; + +} // namespace optimization +} // namespace ONNX_NAMESPACE diff --git a/setup.py b/setup.py new file mode 100644 index 000000000..720b29389 --- /dev/null +++ b/setup.py @@ -0,0 +1,344 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from distutils.spawn import find_executable +from distutils import sysconfig, log +import setuptools +import setuptools.command.build_py +import setuptools.command.develop +import setuptools.command.build_ext + +from collections import namedtuple +from contextlib import contextmanager +import glob +import os +import shlex +import subprocess +import sys +import platform +from textwrap import dedent +import multiprocessing + + +TOP_DIR = os.path.realpath(os.path.dirname(__file__)) +SRC_DIR = os.path.join(TOP_DIR, 'onnx_opt') +CMAKE_BUILD_DIR = os.path.join(TOP_DIR, 
'.setuptools-cmake-build') + +WINDOWS = (os.name == 'nt') + +CMAKE = find_executable('cmake3') or find_executable('cmake') +MAKE = find_executable('make') + +install_requires = [] +setup_requires = [] +tests_require = [] +extras_require = {} + +################################################################################ +# Global variables for controlling the build variant +################################################################################ + +# Default value is set to TRUE\1 to keep the settings same as the current ones. +# However going forward the recomemded way to is to set this to False\0 +USE_MSVC_STATIC_RUNTIME = bool(os.getenv('USE_MSVC_STATIC_RUNTIME', '1') == '1') +ONNX_ML = not bool(os.getenv('ONNX_ML') == '0') +ONNX_VERIFY_PROTO3 = bool(os.getenv('ONNX_VERIFY_PROTO3') == '1') +ONNX_NAMESPACE = os.getenv('ONNX_NAMESPACE', 'onnx') +ONNX_BUILD_TESTS = bool(os.getenv('ONNX_BUILD_TESTS') == '1') + +DEBUG = bool(os.getenv('DEBUG')) +COVERAGE = bool(os.getenv('COVERAGE')) + +################################################################################ +# Version +################################################################################ + +try: + git_version = subprocess.check_output(['git', 'rev-parse', 'HEAD'], + cwd=TOP_DIR).decode('ascii').strip() +except (OSError, subprocess.CalledProcessError): + git_version = None + +with open(os.path.join(TOP_DIR, 'VERSION_NUMBER')) as version_file: + VersionInfo = namedtuple('VersionInfo', ['version', 'git_version'])( + version=version_file.read().strip(), + git_version=git_version + ) + +################################################################################ +# Pre Check +################################################################################ + +assert CMAKE, 'Could not find "cmake" executable!' 
+ +################################################################################ +# Utilities +################################################################################ + + +@contextmanager +def cd(path): + if not os.path.isabs(path): + raise RuntimeError('Can only cd to absolute path, got: {}'.format(path)) + orig_path = os.getcwd() + os.chdir(path) + try: + yield + finally: + os.chdir(orig_path) + + +################################################################################ +# Customized commands +################################################################################ + + +class ONNXCommand(setuptools.Command): + user_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + +class create_version(ONNXCommand): + def run(self): + with open(os.path.join(SRC_DIR, 'version.py'), 'w') as f: + f.write(dedent('''\ + # This file is generated by setup.py. DO NOT EDIT! + + from __future__ import absolute_import + from __future__ import division + from __future__ import print_function + from __future__ import unicode_literals + + version = '{version}' + git_version = '{git_version}' + '''.format(**dict(VersionInfo._asdict())))) + + +class cmake_build(setuptools.Command): + """ + Compiles everything when `python setupmnm.py build` is run using cmake. + + Custom args can be passed to cmake by specifying the `CMAKE_ARGS` + environment variable. + + The number of CPUs used by `make` can be specified by passing `-j` + to `setup.py build`. By default all CPUs are used. 
+ """ + user_options = [ + (str('jobs='), str('j'), str('Specifies the number of jobs to use with make')) + ] + + built = False + + def initialize_options(self): + self.jobs = None + + def finalize_options(self): + if sys.version_info[0] >= 3: + self.set_undefined_options('build', ('parallel', 'jobs')) + if self.jobs is None and os.getenv("MAX_JOBS") is not None: + self.jobs = os.getenv("MAX_JOBS") + self.jobs = multiprocessing.cpu_count() if self.jobs is None else int(self.jobs) + + def run(self): + if cmake_build.built: + return + cmake_build.built = True + if not os.path.exists(CMAKE_BUILD_DIR): + os.makedirs(CMAKE_BUILD_DIR) + + with cd(CMAKE_BUILD_DIR): + build_type = 'Release' + # configure + cmake_args = [ + CMAKE, + '-DPYTHON_INCLUDE_DIR={}'.format(sysconfig.get_python_inc()), + '-DPYTHON_EXECUTABLE={}'.format(sys.executable), + '-DBUILD_ONNX_PYTHON=ON', + '-DCMAKE_EXPORT_COMPILE_COMMANDS=ON', + '-DONNX_NAMESPACE={}'.format(ONNX_NAMESPACE), + '-DPY_EXT_SUFFIX={}'.format(sysconfig.get_config_var('EXT_SUFFIX') or ''), + ] + if COVERAGE: + cmake_args.append('-DONNX_COVERAGE=ON') + if COVERAGE or DEBUG: + # in order to get accurate coverage information, the + # build needs to turn off optimizations + build_type = 'Debug' + cmake_args.append('-DCMAKE_BUILD_TYPE=%s' % build_type) + if WINDOWS: + cmake_args.extend([ + # we need to link with libpython on windows, so + # passing python version to window in order to + # find python in cmake + '-DPY_VERSION={}'.format('{0}.{1}'.format(*sys.version_info[:2])), + ]) + if USE_MSVC_STATIC_RUNTIME: + cmake_args.append('-DONNX_USE_MSVC_STATIC_RUNTIME=ON') + if platform.architecture()[0] == '64bit': + cmake_args.extend(['-A', 'x64', '-T', 'host=x64']) + else: + cmake_args.extend(['-A', 'Win32', '-T', 'host=x86']) + if ONNX_ML: + cmake_args.append('-DONNX_ML=1') + if ONNX_VERIFY_PROTO3: + cmake_args.append('-DONNX_VERIFY_PROTO3=1') + if ONNX_BUILD_TESTS: + cmake_args.append('-DONNX_BUILD_TESTS=ON') + if 'CMAKE_ARGS' in 
os.environ: + extra_cmake_args = shlex.split(os.environ['CMAKE_ARGS']) + # prevent crossfire with downstream scripts + del os.environ['CMAKE_ARGS'] + log.info('Extra cmake args: {}'.format(extra_cmake_args)) + cmake_args.extend(extra_cmake_args) + cmake_args.append(TOP_DIR) + subprocess.check_call(cmake_args) + + build_args = [CMAKE, '--build', os.curdir] + if WINDOWS: + build_args.extend(['--config', build_type]) + build_args.extend(['--', '/maxcpucount:{}'.format(self.jobs)]) + else: + build_args.extend(['--', '-j', str(self.jobs)]) + subprocess.check_call(build_args) + + +class build_py(setuptools.command.build_py.build_py): + def run(self): + self.run_command('create_version') + self.run_command('cmake_build') + + generated_python_files = \ + glob.glob(os.path.join(CMAKE_BUILD_DIR, 'onnx_opt', '*.py')) + \ + glob.glob(os.path.join(CMAKE_BUILD_DIR, 'onnx_opt', '*.pyi')) + + for src in generated_python_files: + dst = os.path.join( + TOP_DIR, os.path.relpath(src, CMAKE_BUILD_DIR)) + self.copy_file(src, dst) + + return setuptools.command.build_py.build_py.run(self) + + +class develop(setuptools.command.develop.develop): + def run(self): + self.run_command('build_py') + setuptools.command.develop.develop.run(self) + + +class build_ext(setuptools.command.build_ext.build_ext): + def run(self): + self.run_command('cmake_build') + setuptools.command.build_ext.build_ext.run(self) + + def build_extensions(self): + for ext in self.extensions: + fullname = self.get_ext_fullname(ext.name) + filename = os.path.basename(self.get_ext_filename(fullname)) + + lib_path = CMAKE_BUILD_DIR + if os.name == 'nt': + debug_lib_dir = os.path.join(lib_path, "Debug") + release_lib_dir = os.path.join(lib_path, "Release") + if os.path.exists(debug_lib_dir): + lib_path = debug_lib_dir + elif os.path.exists(release_lib_dir): + lib_path = release_lib_dir + src = os.path.join(lib_path, filename) + dst = os.path.join(os.path.realpath(self.build_lib), "onnx_opt", filename) + self.copy_file(src, 
dst) + + +class mypy_type_check(ONNXCommand): + description = 'Run MyPy type checker' + + def run(self): + """Run command.""" + onnx_script = os.path.realpath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "tools/mypy-onnx.py")) + returncode = subprocess.call([sys.executable, onnx_script]) + sys.exit(returncode) + + +cmdclass = { + 'create_version': create_version, + 'cmake_build': cmake_build, + 'build_py': build_py, + 'develop': develop, + 'build_ext': build_ext, + 'typecheck': mypy_type_check, +} + +################################################################################ +# Extensions +################################################################################ + +ext_modules = [ + setuptools.Extension( + name=str('onnx_opt.onnx_opt_cpp2py_export'), + sources=[]) +] + +################################################################################ +# Packages +################################################################################ + +# no need to do fancy stuff so far +packages = setuptools.find_packages() + +install_requires.extend([ + 'protobuf', + 'numpy', + 'six', + 'typing>=3.6.4; python_version < "3.5"', + 'typing-extensions>=3.6.2.1', +]) + +################################################################################ +# Test +################################################################################ + +setup_requires.append('pytest-runner') +tests_require.append('pytest') +tests_require.append('nbval') +tests_require.append('tabulate') + +if sys.version_info[0] == 3: + # Mypy doesn't work with Python 2 + extras_require['mypy'] = ['mypy==0.600'] + +################################################################################ +# Final +################################################################################ + +setuptools.setup( + name="onnx_opt", + version=VersionInfo.version, + description="Open Neural Network Exchange", + ext_modules=ext_modules, + cmdclass=cmdclass, + packages=packages, + 
license='MIT', + include_package_data=True, + install_requires=install_requires, + setup_requires=setup_requires, + tests_require=tests_require, + extras_require=extras_require, + author='ONNX', + author_email='onnx-technical-discuss@lists.lfai.foundation', + url='https://github.com/onnx/onnx', + entry_points={ + 'console_scripts': [ + 'check-model = onnx.bin.checker:check_model', + 'check-node = onnx.bin.checker:check_node', + 'backend-test-tools = onnx.backend.test.cmd_tools:main', + ] + }, +) + diff --git a/third_party/onnx b/third_party/onnx new file mode 160000 index 000000000..c443abd2a --- /dev/null +++ b/third_party/onnx @@ -0,0 +1 @@ +Subproject commit c443abd2acad2411103593600319ff81a676afbc From 4e80d5ba567727a3ff40ef4ed92798493e77d87f Mon Sep 17 00:00:00 2001 From: daquexian Date: Sun, 23 Aug 2020 15:28:37 +0800 Subject: [PATCH 02/14] change include dir, add code about cmake install --- CMakeLists.txt | 32 +++++++++++++- cmake/ONNXOptimizerConfig.cmake.in | 24 +++++++++++ cmake/ONNXOptimizerConfigVersion.cmake.in | 12 ++++++ onnx_opt/cpp2py_export.cc | 2 +- onnx_opt/optimize.cc | 2 +- onnx_opt/optimize.h | 4 +- onnx_opt/pass.cc | 2 +- onnx_opt/pass_manager.cc | 2 +- onnx_opt/pass_manager.h | 4 +- onnx_opt/pass_registry.cc | 2 +- onnx_opt/pass_registry.h | 42 +++++++++---------- onnx_opt/passes/eliminate_deadend.h | 2 +- onnx_opt/passes/eliminate_identity.h | 2 +- onnx_opt/passes/eliminate_nop_dropout.h | 2 +- .../passes/eliminate_nop_monotone_argmax.h | 2 +- onnx_opt/passes/eliminate_nop_pad.h | 2 +- onnx_opt/passes/eliminate_nop_transpose.h | 2 +- .../passes/eliminate_unused_initializer.h | 2 +- .../passes/extract_constant_to_initializer.h | 2 +- onnx_opt/passes/fuse_add_bias_into_conv.h | 2 +- onnx_opt/passes/fuse_bn_into_conv.h | 2 +- onnx_opt/passes/fuse_consecutive_concats.h | 2 +- .../passes/fuse_consecutive_log_softmax.h | 2 +- .../fuse_consecutive_reduce_unsqueeze.h | 2 +- onnx_opt/passes/fuse_consecutive_squeezes.h | 2 +- 
onnx_opt/passes/fuse_consecutive_transposes.h | 2 +- .../passes/fuse_matmul_add_bias_into_gemm.h | 2 +- onnx_opt/passes/fuse_pad_into_conv.h | 2 +- onnx_opt/passes/fuse_transpose_into_gemm.h | 2 +- onnx_opt/passes/lift_lexical_references.h | 2 +- onnx_opt/passes/nop.h | 2 +- onnx_opt/passes/split.h | 2 +- 32 files changed, 117 insertions(+), 53 deletions(-) create mode 100644 cmake/ONNXOptimizerConfig.cmake.in create mode 100644 cmake/ONNXOptimizerConfigVersion.cmake.in diff --git a/CMakeLists.txt b/CMakeLists.txt index 2502e9224..7a4abbf3f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,6 @@ project(onnx_optimizer C CXX) set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(ONNX_ROOT ${PROJECT_SOURCE_DIR}/third_party/onnx) -option(ONNX_BUILD_OPTIMIZER "" OFF) add_subdirectory(${ONNX_ROOT}) file(GLOB_RECURSE onnx_opt_srcs "onnx_opt/*.cc" @@ -15,6 +14,10 @@ list(REMOVE_ITEM onnx_opt_srcs "${PROJECT_SOURCE_DIR}/onnx_opt/cpp2py_export.cc" add_library(onnx_optimizer ${onnx_opt_srcs}) target_link_libraries(onnx_optimizer PUBLIC onnx) +target_include_directories(onnx_optimizer PUBLIC + $ + $ + ) if(BUILD_ONNX_PYTHON) if("${PY_EXT_SUFFIX}" STREQUAL "") @@ -34,7 +37,6 @@ if(BUILD_ONNX_PYTHON) PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) target_include_directories(onnx_opt_cpp2py_export PRIVATE $ - $ $ ${PYTHON_INCLUDE_DIR}) # pybind11 is a header only lib @@ -108,3 +110,29 @@ if(BUILD_ONNX_PYTHON) add_msvc_runtime_flag(onnx_opt_cpp2py_export) endif() endif() + +include(GNUInstallDirs) + +install(DIRECTORY ${PROJECT_SOURCE_DIR}/onnx_opt + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + FILES_MATCHING + PATTERN "*.h") + +configure_file( + ${PROJECT_SOURCE_DIR}/cmake/ONNXOptimizerConfigVersion.cmake.in + ${PROJECT_BINARY_DIR}/ONNXOptimizerConfigVersion.cmake + @ONLY) +configure_file( + ${PROJECT_SOURCE_DIR}/cmake/ONNXOptimizerConfig.cmake.in + ${PROJECT_BINARY_DIR}/ONNXOptimizerConfig.cmake + @ONLY) +install(FILES + 
${PROJECT_BINARY_DIR}/ONNXOptimizerConfigVersion.cmake + ${PROJECT_BINARY_DIR}/ONNXOptimizerConfig.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ONNXOptimizer + COMPONENT dev) +install(EXPORT ONNXOptimizerTargets DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/ONNXOptimizer") +install(TARGETS + onnx_optimizer + EXPORT ONNXOptimizerTargets DESTINATION ${CMAKE_INSTALL_LIBDIR}) + diff --git a/cmake/ONNXOptimizerConfig.cmake.in b/cmake/ONNXOptimizerConfig.cmake.in new file mode 100644 index 000000000..85c56549e --- /dev/null +++ b/cmake/ONNXOptimizerConfig.cmake.in @@ -0,0 +1,24 @@ +# - Config file for the ONNX Optimizer package +# It defines the following variable(s) +# ONNX_OPTIMIZER_INCLUDE_DIRS - include directories for onnx optimizer +# as well as ONNX Optimizer targets for other cmake libraries to use. + +# library version information +set(ONNX_OPTIMIZER_VERSION "@ONNX_OPTIMIZER_VERSION@") + +# import targets +include ("${CMAKE_CURRENT_LIST_DIR}/ONNXOptimizerTargets.cmake") + +# include directory. +# +# Newer versions of CMake set the INTERFACE_INCLUDE_DIRECTORIES property +# of the imported targets. It is hence not necessary to add this path +# manually to the include search path for targets which link to gflags. +# The following lines are here for backward compatibility, in case one +# would like to use the old-style include path. 
+get_filename_component( + CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) +get_filename_component( + _INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/../../../" ABSOLUTE) +set(ONNX_INCLUDE_DIRS "${_INSTALL_PREFIX}/include") + diff --git a/cmake/ONNXOptimizerConfigVersion.cmake.in b/cmake/ONNXOptimizerConfigVersion.cmake.in new file mode 100644 index 000000000..76724ae13 --- /dev/null +++ b/cmake/ONNXOptimizerConfigVersion.cmake.in @@ -0,0 +1,12 @@ +set(PACKAGE_VERSION "@ONNX_OPTIMIZER_VERSION@") + +# Check whether the requested PACKAGE_FIND_VERSION is compatible +if("${PACKAGE_VERSION}" VERSION_LESS "${PACKAGE_FIND_VERSION}") + set(PACKAGE_VERSION_COMPATIBLE FALSE) +else() + set(PACKAGE_VERSION_COMPATIBLE TRUE) + if ("${PACKAGE_VERSION}" VERSION_EQUAL "${PACKAGE_FIND_VERSION}") + set(PACKAGE_VERSION_EXACT TRUE) + endif() +endif() + diff --git a/onnx_opt/cpp2py_export.cc b/onnx_opt/cpp2py_export.cc index 0a4f60af0..6393baa86 100644 --- a/onnx_opt/cpp2py_export.cc +++ b/onnx_opt/cpp2py_export.cc @@ -1,7 +1,7 @@ #include #include -#include "onnx/optimizer/optimize.h" +#include "onnx_opt/optimize.h" #include "onnx/py_utils.h" namespace ONNX_NAMESPACE { diff --git a/onnx_opt/optimize.cc b/onnx_opt/optimize.cc index dbacb1a32..7b27aebb8 100644 --- a/onnx_opt/optimize.cc +++ b/onnx_opt/optimize.cc @@ -1,7 +1,7 @@ // ATTENTION: The code in this file is highly EXPERIMENTAL. // Adventurous users should note that the APIs will probably change. 
-#include "onnx/optimizer/optimize.h" +#include "onnx_opt/optimize.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/optimize.h b/onnx_opt/optimize.h index 018a62f79..459beb337 100644 --- a/onnx_opt/optimize.h +++ b/onnx_opt/optimize.h @@ -6,8 +6,8 @@ #include "onnx/common/ir.h" #include "onnx/common/ir_pb_converter.h" #include "onnx/common/stl_backports.h" -#include "onnx/optimizer/pass_manager.h" -#include "onnx/optimizer/pass_registry.h" +#include "onnx_opt/pass_manager.h" +#include "onnx_opt/pass_registry.h" #include "onnx/proto_utils.h" #include "vector" diff --git a/onnx_opt/pass.cc b/onnx_opt/pass.cc index df0a89b5b..27ac67062 100644 --- a/onnx_opt/pass.cc +++ b/onnx_opt/pass.cc @@ -1,4 +1,4 @@ -#include "onnx/optimizer/pass.h" +#include "onnx_opt/pass.h" #include "onnx/common/assertions.h" namespace ONNX_NAMESPACE { diff --git a/onnx_opt/pass_manager.cc b/onnx_opt/pass_manager.cc index 7ad365a06..0c9cae0a9 100644 --- a/onnx_opt/pass_manager.cc +++ b/onnx_opt/pass_manager.cc @@ -1,4 +1,4 @@ -#include "onnx/optimizer/pass_manager.h" +#include "onnx_opt/pass_manager.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/pass_manager.h b/onnx_opt/pass_manager.h index c7ba35ef0..44a3899b4 100644 --- a/onnx_opt/pass_manager.h +++ b/onnx_opt/pass_manager.h @@ -3,8 +3,8 @@ // Adventurous users should note that the APIs will probably change. #include -#include "onnx/optimizer/pass.h" -#include "onnx/optimizer/passes/eliminate_deadend.h" +#include "onnx_opt/pass.h" +#include "onnx_opt/passes/eliminate_deadend.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/pass_registry.cc b/onnx_opt/pass_registry.cc index c2eee87f8..b1c6cc69d 100644 --- a/onnx_opt/pass_registry.cc +++ b/onnx_opt/pass_registry.cc @@ -1,7 +1,7 @@ // ATTENTION: The code in this file is highly EXPERIMENTAL. // Adventurous users should note that the APIs will probably change. 
-#include "onnx/optimizer/pass_registry.h" +#include "onnx_opt/pass_registry.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/pass_registry.h b/onnx_opt/pass_registry.h index a925b9142..5c4350359 100644 --- a/onnx_opt/pass_registry.h +++ b/onnx_opt/pass_registry.h @@ -6,27 +6,27 @@ #include "onnx/common/ir.h" #include "onnx/common/ir_pb_converter.h" #include "onnx/common/stl_backports.h" -#include "onnx/optimizer/passes/eliminate_deadend.h" -#include "onnx/optimizer/passes/eliminate_identity.h" -#include "onnx/optimizer/passes/eliminate_nop_dropout.h" -#include "onnx/optimizer/passes/eliminate_nop_monotone_argmax.h" -#include "onnx/optimizer/passes/eliminate_nop_pad.h" -#include "onnx/optimizer/passes/eliminate_nop_transpose.h" -#include "onnx/optimizer/passes/eliminate_unused_initializer.h" -#include "onnx/optimizer/passes/extract_constant_to_initializer.h" -#include "onnx/optimizer/passes/fuse_add_bias_into_conv.h" -#include "onnx/optimizer/passes/fuse_bn_into_conv.h" -#include "onnx/optimizer/passes/fuse_consecutive_concats.h" -#include "onnx/optimizer/passes/fuse_consecutive_log_softmax.h" -#include "onnx/optimizer/passes/fuse_consecutive_reduce_unsqueeze.h" -#include "onnx/optimizer/passes/fuse_consecutive_squeezes.h" -#include "onnx/optimizer/passes/fuse_consecutive_transposes.h" -#include "onnx/optimizer/passes/fuse_matmul_add_bias_into_gemm.h" -#include "onnx/optimizer/passes/fuse_pad_into_conv.h" -#include "onnx/optimizer/passes/fuse_transpose_into_gemm.h" -#include "onnx/optimizer/passes/lift_lexical_references.h" -#include "onnx/optimizer/passes/nop.h" -#include "onnx/optimizer/passes/split.h" +#include "onnx_opt/passes/eliminate_deadend.h" +#include "onnx_opt/passes/eliminate_identity.h" +#include "onnx_opt/passes/eliminate_nop_dropout.h" +#include "onnx_opt/passes/eliminate_nop_monotone_argmax.h" +#include "onnx_opt/passes/eliminate_nop_pad.h" +#include "onnx_opt/passes/eliminate_nop_transpose.h" +#include 
"onnx_opt/passes/eliminate_unused_initializer.h" +#include "onnx_opt/passes/extract_constant_to_initializer.h" +#include "onnx_opt/passes/fuse_add_bias_into_conv.h" +#include "onnx_opt/passes/fuse_bn_into_conv.h" +#include "onnx_opt/passes/fuse_consecutive_concats.h" +#include "onnx_opt/passes/fuse_consecutive_log_softmax.h" +#include "onnx_opt/passes/fuse_consecutive_reduce_unsqueeze.h" +#include "onnx_opt/passes/fuse_consecutive_squeezes.h" +#include "onnx_opt/passes/fuse_consecutive_transposes.h" +#include "onnx_opt/passes/fuse_matmul_add_bias_into_gemm.h" +#include "onnx_opt/passes/fuse_pad_into_conv.h" +#include "onnx_opt/passes/fuse_transpose_into_gemm.h" +#include "onnx_opt/passes/lift_lexical_references.h" +#include "onnx_opt/passes/nop.h" +#include "onnx_opt/passes/split.h" #include "onnx/proto_utils.h" #include diff --git a/onnx_opt/passes/eliminate_deadend.h b/onnx_opt/passes/eliminate_deadend.h index a70e2ec9d..db121d694 100644 --- a/onnx_opt/passes/eliminate_deadend.h +++ b/onnx_opt/passes/eliminate_deadend.h @@ -2,7 +2,7 @@ // ATTENTION: The code in this file is highly EXPERIMENTAL. // Adventurous users should note that the APIs will probably change. 
#pragma once -#include "onnx/optimizer/pass.h" +#include "onnx_opt/pass.h" namespace ONNX_NAMESPACE { namespace optimization { struct EliminateDeadEnd final : public FullGraphBasedPass { diff --git a/onnx_opt/passes/eliminate_identity.h b/onnx_opt/passes/eliminate_identity.h index 3f5f8525b..4fae9fc19 100644 --- a/onnx_opt/passes/eliminate_identity.h +++ b/onnx_opt/passes/eliminate_identity.h @@ -3,7 +3,7 @@ #pragma once -#include "onnx/optimizer/pass.h" +#include "onnx_opt/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/eliminate_nop_dropout.h b/onnx_opt/passes/eliminate_nop_dropout.h index 6be190b5e..94ffb9fa3 100644 --- a/onnx_opt/passes/eliminate_nop_dropout.h +++ b/onnx_opt/passes/eliminate_nop_dropout.h @@ -3,7 +3,7 @@ #pragma once -#include "onnx/optimizer/pass.h" +#include "onnx_opt/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/eliminate_nop_monotone_argmax.h b/onnx_opt/passes/eliminate_nop_monotone_argmax.h index 67f3fdf79..0e3334225 100644 --- a/onnx_opt/passes/eliminate_nop_monotone_argmax.h +++ b/onnx_opt/passes/eliminate_nop_monotone_argmax.h @@ -2,7 +2,7 @@ // Adventurous users should note that the APIs will probably change. 
#pragma once -#include "onnx/optimizer/pass.h" +#include "onnx_opt/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/eliminate_nop_pad.h b/onnx_opt/passes/eliminate_nop_pad.h index 23f8cfc6f..04441f983 100644 --- a/onnx_opt/passes/eliminate_nop_pad.h +++ b/onnx_opt/passes/eliminate_nop_pad.h @@ -4,7 +4,7 @@ #pragma once #include "onnx/defs/tensor_util.h" -#include "onnx/optimizer/pass.h" +#include "onnx_opt/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/eliminate_nop_transpose.h b/onnx_opt/passes/eliminate_nop_transpose.h index ba1595dfc..daad9c8d9 100644 --- a/onnx_opt/passes/eliminate_nop_transpose.h +++ b/onnx_opt/passes/eliminate_nop_transpose.h @@ -3,7 +3,7 @@ #pragma once -#include "onnx/optimizer/pass.h" +#include "onnx_opt/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/eliminate_unused_initializer.h b/onnx_opt/passes/eliminate_unused_initializer.h index 95995610f..592dc1cec 100644 --- a/onnx_opt/passes/eliminate_unused_initializer.h +++ b/onnx_opt/passes/eliminate_unused_initializer.h @@ -14,7 +14,7 @@ // condition 1: A is not used as any node's input // condition 2: A is not an output -#include "onnx/optimizer/pass.h" +#include "onnx_opt/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/extract_constant_to_initializer.h b/onnx_opt/passes/extract_constant_to_initializer.h index 696f78c6e..5fafe85a6 100644 --- a/onnx_opt/passes/extract_constant_to_initializer.h +++ b/onnx_opt/passes/extract_constant_to_initializer.h @@ -11,7 +11,7 @@ // this pass can handle the case satisfy all following conditions: // condition 1: A is the output of a Constant node #include "onnx/common/assertions.h" -#include "onnx/optimizer/pass.h" +#include "onnx_opt/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/fuse_add_bias_into_conv.h b/onnx_opt/passes/fuse_add_bias_into_conv.h index 
0af10cd7c..22a8adb9f 100644 --- a/onnx_opt/passes/fuse_add_bias_into_conv.h +++ b/onnx_opt/passes/fuse_add_bias_into_conv.h @@ -16,7 +16,7 @@ #include #include "onnx/common/assertions.h" -#include "onnx/optimizer/pass.h" +#include "onnx_opt/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/fuse_bn_into_conv.h b/onnx_opt/passes/fuse_bn_into_conv.h index e4fdddca1..697f9cbd7 100644 --- a/onnx_opt/passes/fuse_bn_into_conv.h +++ b/onnx_opt/passes/fuse_bn_into_conv.h @@ -29,7 +29,7 @@ // $$ b' = (b_{conv} - m)\frac{s}{\sqrt{\sigma + \epsilon}} + b_{bn}$$ #include "onnx/common/assertions.h" -#include "onnx/optimizer/pass.h" +#include "onnx_opt/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/fuse_consecutive_concats.h b/onnx_opt/passes/fuse_consecutive_concats.h index 6b18413a7..9e42c7530 100644 --- a/onnx_opt/passes/fuse_consecutive_concats.h +++ b/onnx_opt/passes/fuse_consecutive_concats.h @@ -3,7 +3,7 @@ #pragma once -#include "onnx/optimizer/pass.h" +#include "onnx_opt/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/fuse_consecutive_log_softmax.h b/onnx_opt/passes/fuse_consecutive_log_softmax.h index 474c0001b..8f732c17f 100644 --- a/onnx_opt/passes/fuse_consecutive_log_softmax.h +++ b/onnx_opt/passes/fuse_consecutive_log_softmax.h @@ -3,7 +3,7 @@ #pragma once -#include "onnx/optimizer/pass.h" +#include "onnx_opt/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/fuse_consecutive_reduce_unsqueeze.h b/onnx_opt/passes/fuse_consecutive_reduce_unsqueeze.h index ae6731755..550fb5cc1 100644 --- a/onnx_opt/passes/fuse_consecutive_reduce_unsqueeze.h +++ b/onnx_opt/passes/fuse_consecutive_reduce_unsqueeze.h @@ -3,7 +3,7 @@ #pragma once -#include "onnx/optimizer/pass.h" +#include "onnx_opt/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/fuse_consecutive_squeezes.h 
b/onnx_opt/passes/fuse_consecutive_squeezes.h index 550cadd21..2b1c8aa51 100644 --- a/onnx_opt/passes/fuse_consecutive_squeezes.h +++ b/onnx_opt/passes/fuse_consecutive_squeezes.h @@ -9,7 +9,7 @@ // Z = Squeeze(Y, axes=[0, 4]) -> shape=[2, 3, 5] // After: // Z = Squeeze(X, axes=[0, 1, 4, 6]) -#include "onnx/optimizer/pass.h" +#include "onnx_opt/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/fuse_consecutive_transposes.h b/onnx_opt/passes/fuse_consecutive_transposes.h index ef2fb664a..6b7d58978 100644 --- a/onnx_opt/passes/fuse_consecutive_transposes.h +++ b/onnx_opt/passes/fuse_consecutive_transposes.h @@ -3,7 +3,7 @@ #pragma once -#include "onnx/optimizer/pass.h" +#include "onnx_opt/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/fuse_matmul_add_bias_into_gemm.h b/onnx_opt/passes/fuse_matmul_add_bias_into_gemm.h index 2ddef7cab..8d093cee1 100644 --- a/onnx_opt/passes/fuse_matmul_add_bias_into_gemm.h +++ b/onnx_opt/passes/fuse_matmul_add_bias_into_gemm.h @@ -17,7 +17,7 @@ #include #include "onnx/common/assertions.h" -#include "onnx/optimizer/pass.h" +#include "onnx_opt/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/fuse_pad_into_conv.h b/onnx_opt/passes/fuse_pad_into_conv.h index 4a66aa6e5..575a199b2 100644 --- a/onnx_opt/passes/fuse_pad_into_conv.h +++ b/onnx_opt/passes/fuse_pad_into_conv.h @@ -15,7 +15,7 @@ #include #include "onnx/defs/tensor_util.h" -#include "onnx/optimizer/pass.h" +#include "onnx_opt/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/fuse_transpose_into_gemm.h b/onnx_opt/passes/fuse_transpose_into_gemm.h index b7ada112c..b9fab13af 100644 --- a/onnx_opt/passes/fuse_transpose_into_gemm.h +++ b/onnx_opt/passes/fuse_transpose_into_gemm.h @@ -3,7 +3,7 @@ #pragma once -#include "onnx/optimizer/pass.h" +#include "onnx_opt/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff 
--git a/onnx_opt/passes/lift_lexical_references.h b/onnx_opt/passes/lift_lexical_references.h index d0a5eeec9..2082c555c 100644 --- a/onnx_opt/passes/lift_lexical_references.h +++ b/onnx_opt/passes/lift_lexical_references.h @@ -1,7 +1,7 @@ #pragma once #include -#include "onnx/optimizer/pass.h" +#include "onnx_opt/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/nop.h b/onnx_opt/passes/nop.h index a51321e02..de71fab65 100644 --- a/onnx_opt/passes/nop.h +++ b/onnx_opt/passes/nop.h @@ -1,6 +1,6 @@ #pragma once -#include "onnx/optimizer/pass.h" +#include "onnx_opt/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/split.h b/onnx_opt/passes/split.h index f59a8cc1a..c7311201b 100644 --- a/onnx_opt/passes/split.h +++ b/onnx_opt/passes/split.h @@ -3,7 +3,7 @@ #pragma once -#include "onnx/optimizer/pass.h" +#include "onnx_opt/pass.h" namespace ONNX_NAMESPACE { namespace optimization { From 5308d3ed859b467a2de449e573fff19d66156c03 Mon Sep 17 00:00:00 2001 From: daquexian Date: Sun, 23 Aug 2020 15:32:48 +0800 Subject: [PATCH 03/14] move optimizer_test.py --- onnx_opt/test/optimizer_test.py | 1774 +++++++++++++++++++++++++++++++ 1 file changed, 1774 insertions(+) create mode 100644 onnx_opt/test/optimizer_test.py diff --git a/onnx_opt/test/optimizer_test.py b/onnx_opt/test/optimizer_test.py new file mode 100644 index 000000000..6fb1de61a --- /dev/null +++ b/onnx_opt/test/optimizer_test.py @@ -0,0 +1,1774 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import onnx +from onnx import checker, helper, ModelProto, TensorProto, GraphProto, NodeProto, shape_inference +from typing import Sequence, Text, Any, Tuple, List, Callable +from onnx import numpy_helper + +import numpy as np # type: ignore + +import onnx_opt +import unittest + + +class TestOptimizer(unittest.TestCase): + + def 
_optimized(self, graph, opts, fixed_point=False, **kwargs): # type: (GraphProto, Sequence[Text], bool, **Any) -> ModelProto + orig_model = helper.make_model(graph, producer_name='onnx-test', **kwargs) + optimized_model = onnx_opt.optimize(orig_model, opts, fixed_point) + checker.check_model(optimized_model) + return optimized_model + + # input_types and output_types are lists of triples of (name, type, shape) + def _make_fake_loop_op(self, + body_nodes, # type: Sequence[NodeProto] + input_types, # type: Sequence[Tuple[TensorProto.DataType, Sequence[int], Text]] + output_types # type: Sequence[Tuple[TensorProto.DataType, Sequence[int], Text]] + ): # type: (...) -> List[NodeProto] + zero = helper.make_tensor( + "trip_count_value", TensorProto.INT64, (), [10]) + true = helper.make_tensor("condition", TensorProto.BOOL, (), [True]) + # lcd is a dummy loop-carried dependency that only exists because + # right now the schema checker is broken and assumes a variadic + # input needs at least one value. 
+ graph_inputs = [helper.make_tensor_value_info("i", TensorProto.INT64, ()), + helper.make_tensor_value_info("cond", TensorProto.BOOL, ())] + for type, shape, name in input_types: + graph_inputs.append( + helper.make_tensor_value_info("_" + name, type, shape)) + graph_outputs = [helper.make_tensor_value_info( + "cond", TensorProto.BOOL, ())] + for type, shape, name in output_types: + graph_outputs.append( + helper.make_tensor_value_info("_" + name, type, shape)) + body_graph = helper.make_graph(body_nodes, "body_graph", graph_inputs, + graph_outputs) + loop_inputs = ["trip_count", "condition"] + loop_inputs.extend([name for _, _, name in input_types]) + # TODO: fix checker to accept 0-input variadic inputs + if len(loop_inputs) == 2: + loop_inputs.append("") + loop_outputs = [name for _, _, name in output_types] + retval_nodes = [ + helper.make_node("Constant", [], ["trip_count"], value=zero), + helper.make_node("Constant", [], ["condition"], value=true), + helper.make_node("Loop", loop_inputs, loop_outputs, body=body_graph) + ] + return retval_nodes + + def _make_fake_if_op(self, + true_nodes, # type: Sequence[NodeProto] + false_nodes, # type: Sequence[NodeProto] + output_types # type: Sequence[Tuple[TensorProto.DataType, Sequence[int], Text]] + ): # type: (...) 
-> List[NodeProto] + true = helper.make_tensor("condition", TensorProto.BOOL, (), [True]) + true_graph = helper.make_graph(true_nodes, "true_graph", [], []) + false_graph = helper.make_graph(false_nodes, "false_graph", [], []) + if_inputs = ["condition"] + if_outputs = [name for _, _, name in output_types] + retval_nodes = [ + helper.make_node("Constant", [], ["condition"], value=true), + helper.make_node("If", if_inputs, if_outputs, then_branch=true_graph, + else_branch=false_graph) + ] + return retval_nodes + + # fn is a function that takes a single node as argument + def _visit_all_nodes_recursive(self, graph, fn): # type: (GraphProto, Callable[[NodeProto], None]) -> None + for node in graph.node: + fn(node) + for attr in node.attribute: + if attr.g is not None: + self._visit_all_nodes_recursive(attr.g, fn) + if len(attr.graphs): + for gr in attr.graphs: + self._visit_all_nodes_recursive(gr, fn) + + def test_get_available_passes(self): # type: () -> None + # FIXME does not guarantees to be listing all + graph = helper.make_graph([], "dummy_graph", [], []) + list_of_passes = onnx_opt.get_available_passes() + assert isinstance(list_of_passes, (list)) and len(list_of_passes) > 0 + for pass_name in list_of_passes: + # If pass_name is invalid it throws a RuntimeError + self._optimized(graph, [pass_name]) + + def test_eliminate_identity_single_use(self): # type: () -> None + nodes = [helper.make_node("Identity", ["X"], ["Y"])] + nodes.extend(self._make_fake_loop_op( + [helper.make_node("Identity", ["_Y"], ["_Y2"])], + [(TensorProto.FLOAT, (5,), "Y")], + [(TensorProto.FLOAT, (5,), "Y2")])) + graph = helper.make_graph( + nodes, + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (5,))], + [helper.make_tensor_value_info("Y", TensorProto.FLOAT, (5,)), + helper.make_tensor_value_info("Y2", TensorProto.FLOAT, (5,))]) + optimized_model = self._optimized(graph, ["eliminate_identity"]) + + # All identity nodes should have been eliminated + def 
check_identity(node): # type: (NodeProto) -> None + assert node.op_type != "Identity" + self._visit_all_nodes_recursive(optimized_model.graph, check_identity) + # Use of the output from the Identity node in the main graph should + # have been replaced with the input to the identity node + assert len(optimized_model.graph.output) == 2 + assert optimized_model.graph.output[0].name == "X" + # Use of the output from the Identity node in the loop graph should + # have been replaced with the input to that identity node + assert len(optimized_model.graph.node[2].attribute[0].g.output) == 2 + assert optimized_model.graph.node[2].attribute[0].g.output[1].name == "_Y" + + def test_eliminate_identity_graph_output(self): # type: () -> None + add = helper.make_node("Add", ["X", "Y"], ["A"]) + identity = helper.make_node("Identity", ["A"], ["B"]) + graph = helper.make_graph( + [add, identity], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (5,)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (5,))], + [helper.make_tensor_value_info("B", TensorProto.FLOAT, (5,))]) + optimized_model = self._optimized(graph, ["eliminate_identity"]) + + for node in optimized_model.graph.node: + assert node.op_type != "Identity" + assert len(optimized_model.graph.node) == 1 + + def test_eliminate_identity_multiple_uses(self): # type: () -> None + identity = helper.make_node("Identity", ["X"], ["Y"]) + add = helper.make_node("Add", ["Z", "Y"], ["A"]) + mul = helper.make_node("Mul", ["A", "Y"], ["B"]) + graph = helper.make_graph( + [identity, add, mul], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (5,)), + helper.make_tensor_value_info("Z", TensorProto.FLOAT, (5,))], + [helper.make_tensor_value_info("B", TensorProto.FLOAT, (5,))]) + optimized_model = self._optimized(graph, ["eliminate_identity"]) + + for node in optimized_model.graph.node: + assert node.op_type != "Identity" + assert len(optimized_model.graph.node) == 2 + + def 
test_nop_transpose_graph_output(self): # type: () -> None + add = helper.make_node("Add", ["X", "Y"], ["A"]) + trans = helper.make_node("Transpose", ["A"], ["B"], perm=[0, 1]) + graph = helper.make_graph( + [add, trans], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (2, 3)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (2, 3))], + [helper.make_tensor_value_info("B", TensorProto.FLOAT, (2, 3))]) + # The existence of shape infos of graph outputs is checked in _optimized + optimized_model = self._optimized(graph, ["eliminate_nop_transpose"]) + + def check_transpose(node): # type: (NodeProto) -> None + assert node.op_type != "Transpose" + self._visit_all_nodes_recursive(optimized_model.graph, check_transpose) + assert len(optimized_model.graph.node) == 1 + + def test_nop_transpose(self): # type: () -> None + nodes = [helper.make_node("Transpose", ["X"], ["Y"], perm=[0, 1])] + nodes.extend(self._make_fake_loop_op( + [helper.make_node("Transpose", ["_Y"], ["_Y2"], perm=[0, 1])], + [(TensorProto.FLOAT, (2, 3), "Y")], + [(TensorProto.FLOAT, (2, 3), "Y2")])) + graph = helper.make_graph( + nodes, + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (2, 3))], + [helper.make_tensor_value_info("Y", TensorProto.FLOAT, (2, 3)), + helper.make_tensor_value_info("Y2", TensorProto.FLOAT, (2, 3))]) + optimized_model = self._optimized(graph, ["eliminate_nop_transpose"]) + + def check_transpose(node): # type: (NodeProto) -> None + assert node.op_type != "Transpose" + self._visit_all_nodes_recursive(optimized_model.graph, check_transpose) + # Use of the output from the Transpose node in the main graph should + # have been replaced with the input to the identity node + assert len(optimized_model.graph.output) == 2 + assert optimized_model.graph.output[0].name == "X" + # Use of the output from the Transpose node in the loop graph should + # have been replaced with the input to that identity node + assert 
len(optimized_model.graph.node[2].attribute[0].g.output) == 2 + assert optimized_model.graph.node[2].attribute[0].g.output[1].name == "_Y" + + def test_nop_transpose_default(self): # type: () -> None + trans = helper.make_node("Transpose", ["X"], ["Y"]) + graph = helper.make_graph( + [trans], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (2, 3))], + [helper.make_tensor_value_info("Y", TensorProto.FLOAT, (3, 2))]) + optimized_model = self._optimized(graph, ["eliminate_nop_transpose"]) + + assert len(list(optimized_model.graph.node)) == 1 + assert optimized_model.graph.node[0].op_type == "Transpose" + + def test_nop_pad_opset10(self): # type: () -> None + nodes = [helper.make_node("Pad", ["X"], ["Y"], pads=[0, 0])] + graph = helper.make_graph( + nodes, + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (2, 3))], + [helper.make_tensor_value_info("Y", TensorProto.FLOAT, (2, 3))]) + assert len(graph.node) == 1 + optimized_model = self._optimized(graph, ["eliminate_nop_pad"], False, opset_imports=[helper.make_opsetid("", 10)]) + + def check_pad(node): # type: (NodeProto) -> None + assert node.op_type != "Pad" + self._visit_all_nodes_recursive(optimized_model.graph, check_pad) + assert len(optimized_model.graph.output) == 1 + assert optimized_model.graph.output[0].name == "X" + assert len(optimized_model.graph.node) == 0 + + def test_nop_pad_graph_output(self): # type: () -> None + add = helper.make_node("Add", ["X", "Y"], ["A"]) + pad = helper.make_node("Pad", ["A", "Pads"], ["B"]) + graph = helper.make_graph( + [add, pad], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (5,)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (5,)), + helper.make_tensor_value_info("Pads", TensorProto.INT64, (2,))], + [helper.make_tensor_value_info("B", TensorProto.FLOAT, (5,))], + [helper.make_tensor("Pads", TensorProto.INT64, + dims=(2,), + vals=np.array([0, 0]).astype(np.int64).tobytes(), + raw=True)]) + # The existence of 
shape infos of graph outputs is checked in _optimized + optimized_model = self._optimized(graph, ["eliminate_nop_pad"]) + + def check_pad(node): # type: (NodeProto) -> None + assert node.op_type != "Pad" + self._visit_all_nodes_recursive(optimized_model.graph, check_pad) + assert len(optimized_model.graph.node) == 1 + + def test_nop_pad(self): # type: () -> None + nodes = [helper.make_node("Pad", ["X", "Pads"], ["Y"])] + graph = helper.make_graph( + nodes, + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (2, 3)), + helper.make_tensor_value_info("Pads", TensorProto.INT64, (4,))], + [helper.make_tensor_value_info("Y", TensorProto.FLOAT, (2, 3))], + [helper.make_tensor("Pads", TensorProto.INT64, + dims=(4,), + vals=np.array([0, 0, 0, 0]).astype(np.int64).tobytes(), + raw=True)]) + assert len(graph.node) == 1 + optimized_model = self._optimized(graph, ["eliminate_nop_pad"]) + + def check_pad(node): # type: (NodeProto) -> None + assert node.op_type != "Pad" + self._visit_all_nodes_recursive(optimized_model.graph, check_pad) + assert len(optimized_model.graph.output) == 1 + assert optimized_model.graph.output[0].name == "X" + assert len(optimized_model.graph.node) == 0 + + def test_nop_pad_default_opset10(self): # type: () -> None + trans = helper.make_node("Pad", ["X"], ["Y"], pads=[0, 1]) + graph = helper.make_graph( + [trans], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (2, 3))], + [helper.make_tensor_value_info("Y", TensorProto.FLOAT, (2, 4))]) + optimized_model = self._optimized(graph, ["eliminate_nop_pad"], False, opset_imports=[helper.make_opsetid("", 10)]) + + assert len(list(optimized_model.graph.node)) == 1 + assert optimized_model.graph.node[0].op_type == "Pad" + + def test_nop_pad_default(self): # type: () -> None + trans = helper.make_node("Pad", ["X", "Pads"], ["Y"]) + graph = helper.make_graph( + [trans], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (2, 3)), + 
helper.make_tensor_value_info("Pads", TensorProto.INT64, (4,))], + [helper.make_tensor_value_info("Y", TensorProto.FLOAT, (2, 4))], + [helper.make_tensor("Pads", TensorProto.INT64, + dims=(4,), + vals=np.array([0, 1, 0, 0]).astype(np.int64).tobytes(), + raw=True)]) + optimized_model = self._optimized(graph, ["eliminate_nop_pad"]) + + assert len(list(optimized_model.graph.node)) == 1 + assert optimized_model.graph.node[0].op_type == "Pad" + + def test_eliminate_unused_initializer(self): # type: () -> None + add = helper.make_node("Add", ["X", "Y"], ["Z"]) + graph = helper.make_graph( + [add], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 2)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (1, 2))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 2))], + [helper.make_tensor("A", TensorProto.FLOAT, + dims=(2, 3), + vals=np.random.randn(2, 3).astype( + np.float32).tobytes(), + raw=True)]) + optimized_model = self._optimized( + graph, ["eliminate_unused_initializer"]) + + assert len(list(optimized_model.graph.initializer)) == 0 + + def test_eliminate_unused_initializer_input(self): # type: () -> None + add = helper.make_node("Add", ["X", "Y"], ["Z"]) + graph = helper.make_graph( + [add], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 2)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (1, 2)), + helper.make_tensor_value_info("A", TensorProto.FLOAT, (2, 3))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 2))], + [helper.make_tensor("A", TensorProto.FLOAT, + dims=(2, 3), + vals=np.random.randn(2, 3).astype( + np.float32).tobytes(), + raw=True)]) + optimized_model = self._optimized( + graph, ["eliminate_unused_initializer"]) + + assert len(list(optimized_model.graph.initializer)) == 0 + assert len(optimized_model.graph.input) == 2 + + def test_eliminate_unused_initializer_no_eliminate_used_default(self): # type: () -> None + add = helper.make_node("Add", ["X", "A"], ["Z"]) + 
graph = helper.make_graph( + [add], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 2)), + helper.make_tensor_value_info("A", TensorProto.FLOAT, (1, 2))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 2))], + [helper.make_tensor("A", TensorProto.FLOAT, + dims=(1, 2), + vals=np.random.randn(1, 2).astype( + np.float32).tobytes(), + raw=True)]) + optimized_model = self._optimized( + graph, ["eliminate_unused_initializer"]) + + assert len(list(optimized_model.graph.initializer)) == 1 + + def test_eliminate_unused_initializer_no_eliminate_used(self): # type: () -> None + nodes = [helper.make_node("Add", ["X", "A"], ["Z"])] + nodes.extend(self._make_fake_loop_op( + [helper.make_node("Add", ["_X", "_A"], ["_Z2"])], + [(TensorProto.FLOAT, (1, 2), "X"), + (TensorProto.FLOAT, (1, 2), "A")], + [(TensorProto.FLOAT, (1, 2), "Z2")])) + graph = helper.make_graph( + nodes, + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 2)), + helper.make_tensor_value_info("A", TensorProto.FLOAT, (1, 2))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 2))], + [helper.make_tensor("A", TensorProto.FLOAT, + dims=(1, 2), + vals=np.random.randn(1, 2).astype( + np.float32).tobytes(), + raw=True)]) + optimized_model = self._optimized( + graph, ["eliminate_unused_initializer"]) + + # Add, Constant (trip count), Constant (cond), Loop + assert len(list(optimized_model.graph.node)) == 4 + assert optimized_model.graph.node[0].op_type == "Add" + assert optimized_model.graph.output[0].name == "Z" + # Add + assert len(optimized_model.graph.node[3].attribute[0].g.node) == 1 + assert optimized_model.graph.node[3].attribute[0].g.node[0].op_type == 'Add' + assert optimized_model.graph.node[3].attribute[0].g.output[1].name == '_Z2' + + assert len(list(optimized_model.graph.initializer)) == 1 + + def test_eliminate_unused_initializer_no_eliminate_output(self): # type: () -> None + add = helper.make_node("Add", ["X", "Y"], ["Z"]) + graph = 
helper.make_graph( + [add], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 2)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (1, 2)), + helper.make_tensor_value_info("A", TensorProto.FLOAT, (2, 3))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 2)), + helper.make_tensor_value_info("A", TensorProto.FLOAT, (2, 3))], + [helper.make_tensor("A", TensorProto.FLOAT, + dims=(2, 3), + vals=np.random.randn(2, 3).astype( + np.float32).tobytes(), + raw=True)]) + optimized_model = self._optimized( + graph, ["eliminate_unused_initializer"]) + + assert len(list(optimized_model.graph.initializer)) == 1 + assert "Z" in [o.name for o in optimized_model.graph.output] + + def test_extract_constant_to_initializer(self): # type: () -> None + conv = helper.make_node("Conv", ["X", "Y"], ["Z"]) + constant = helper.make_node("Constant", [], ["A"], + value=helper.make_tensor( + name="bias", + data_type=TensorProto.FLOAT, + dims=(16, 1, 1), + vals=np.random.randn(16).astype(np.float32).tolist())) + add = helper.make_node("Add", ["Z", "A"], ["B"]) + graph = helper.make_graph( + [conv, constant, add], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 3, 3)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 3, 3))], + [helper.make_tensor_value_info( + "B", TensorProto.FLOAT, (1, 16, 1, 1))], + ) + optimized_model = self._optimized( + graph, ["extract_constant_to_initializer"]) + self.assertEqual( + set(vi.name for vi in optimized_model.graph.input), + {'X', 'Y', 'A'}) + + self.assertEqual(len(optimized_model.graph.initializer), 1) + init = optimized_model.graph.initializer[0] + self.assertEqual(init.name, 'A') + self.assertEqual(init.dims, [16, 1, 1]) + self.assertEqual(init.data_type, TensorProto.FLOAT) + + self.assertEqual( + [n.op_type for n in optimized_model.graph.node], ['Conv', 'Add']) + + def test_fuse_concats(self): # type: () -> None + nodes = [helper.make_node("Concat", ["A", "B", "C"], ["X"], 
axis=0), + helper.make_node("Concat", ["D", "E", "F"], ["Y"], axis=0), + helper.make_node("Concat", ["X", "G", "Y"], ["Z"], axis=0)] + graph = helper.make_graph( + nodes, + "test", + [helper.make_tensor_value_info("A", TensorProto.FLOAT, (2, 3, 4)), + helper.make_tensor_value_info("B", TensorProto.FLOAT, (4, 3, 4)), + helper.make_tensor_value_info("C", TensorProto.FLOAT, (2, 3, 4)), + helper.make_tensor_value_info("D", TensorProto.FLOAT, (4, 3, 4)), + helper.make_tensor_value_info("E", TensorProto.FLOAT, (2, 3, 4)), + helper.make_tensor_value_info("F", TensorProto.FLOAT, (4, 3, 4)), + helper.make_tensor_value_info("G", TensorProto.FLOAT, (4, 3, 4))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (18, 3, 4))]) + optimized_model = self._optimized( + graph, ["fuse_consecutive_concats"], True) # two passes are needed to simplify the graph to its simplest state. + + assert len(optimized_model.graph.node) == 1 + assert len(optimized_model.graph.node[0].input) == 7 + assert optimized_model.graph.node[0].input == [ + "A", "B", "C", "G", "D", "E", "F"] + assert optimized_model.graph.node[0].op_type == "Concat" + + def test_fuse_concats_different_axis(self): # type: () -> None + nodes = [helper.make_node("Concat", ["A", "B", "C"], ["X"], axis=0), + helper.make_node("Concat", ["D", "E", "F"], ["Y"], axis=1), + helper.make_node("Concat", ["X", "Y"], ["Z"], axis=2)] + graph = helper.make_graph( + nodes, + "test", + [helper.make_tensor_value_info("A", TensorProto.FLOAT, (2, 3, 4)), + helper.make_tensor_value_info("B", TensorProto.FLOAT, (4, 3, 4)), + helper.make_tensor_value_info("C", TensorProto.FLOAT, (2, 3, 4)), + helper.make_tensor_value_info("D", TensorProto.FLOAT, (4, 3, 4)), + helper.make_tensor_value_info("E", TensorProto.FLOAT, (4, 3, 4)), + helper.make_tensor_value_info("F", TensorProto.FLOAT, (4, 3, 4))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (18, 3, 4))]) + optimized_model = self._optimized( + graph, ["fuse_consecutive_concats"], True) # 
two passes are needed to simplify the graph to its simplest state. + + assert optimized_model.graph == graph + + def test_fuse_transpose(self): # type: () -> None + nodes = [helper.make_node("Transpose", ["X"], ["Y"], perm=[1, 0, 2]), + helper.make_node("Transpose", ["Y"], ["Z"], perm=[2, 0, 1]), + helper.make_node("Transpose", ["Z"], ["A"], perm=[2, 0, 1])] + nodes.extend(self._make_fake_loop_op( + [helper.make_node("Transpose", ["_X"], ["_Y2"], perm=[1, 0, 2]), + helper.make_node("Transpose", ["_Y2"], ["_Y3"], perm=[2, 0, 1]), + helper.make_node("Transpose", ["_Y3"], ["_Y4"], perm=[2, 0, 1])], + [(TensorProto.FLOAT, (2, 3, 4), "X")], + [(TensorProto.FLOAT, (2, 4, 3), "Y4")])) + graph = helper.make_graph( + nodes, + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (2, 3, 4))], + [helper.make_tensor_value_info("A", TensorProto.FLOAT, (2, 4, 3)), + helper.make_tensor_value_info("Y4", TensorProto.FLOAT, (4, 3, 2))]) + original_model = helper.make_model(graph) + shape_inference.infer_shapes(original_model) + optimized_model = self._optimized( + graph, ["fuse_consecutive_transposes"]) + shape_inference.infer_shapes(optimized_model) + + # Transpose, Constant (trip count), Constant (cond), Loop + assert len(list(optimized_model.graph.node)) == 4 + # Transpose + assert len(optimized_model.graph.node[3].attribute[0].g.node) == 1 + + def test_fuse_transpose_default_graph_output(self): # type: () -> None + add = helper.make_node("Add", ["X", "Y"], ["A"]) + trans1 = helper.make_node("Transpose", ["A"], ["B"]) + trans2 = helper.make_node("Transpose", ["B"], ["C"]) + graph = helper.make_graph( + [add, trans1, trans2], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (2, 3)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (2, 3))], + [helper.make_tensor_value_info("C", TensorProto.FLOAT, (2, 3))]) + # The existence of shape infos of graph outputs is checked in _optimized + optimized_model = self._optimized(graph, 
["fuse_consecutive_transposes"]) + + def check_transpose(node): # type: (NodeProto) -> None + assert node.op_type != "Transpose" + self._visit_all_nodes_recursive(optimized_model.graph, check_transpose) + assert len(optimized_model.graph.node) == 1 + + def test_fuse_transpose_default(self): # type: () -> None + trans1 = helper.make_node("Transpose", ["X"], ["Y"]) + trans2 = helper.make_node("Transpose", ["Y"], ["Z"]) + graph = helper.make_graph( + [trans1, trans2], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (2, 3, 4))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (2, 3, 4))]) + optimized_model = self._optimized( + graph, ["fuse_consecutive_transposes"]) + + assert len(list(optimized_model.graph.node)) == 0 + + def test_fuse_transpose_default_no_fuse(self): # type: () -> None + trans1 = helper.make_node("Transpose", ["X"], ["Y"]) + trans2 = helper.make_node("Transpose", ["Y"], ["Z"], perm=[0, 1, 2]) + graph = helper.make_graph( + [trans1, trans2], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (2, 3, 4))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (4, 3, 2))]) + optimized_model = self._optimized( + graph, ["fuse_consecutive_transposes"]) + + assert len(list(optimized_model.graph.node)) == 2 + for node in optimized_model.graph.node: + assert node.op_type == "Transpose" + + def test_fuse_transpose_into_gemm(self): # type: () -> None + nodes = [helper.make_node("Transpose", ["X"], ["A"], perm=[1, 0]), + helper.make_node("Transpose", ["Y"], ["B"], perm=[1, 0]), + helper.make_node("Gemm", ["A", "B", "C"], ["Z"])] + nodes.extend(self._make_fake_loop_op( + [helper.make_node("Transpose", ["_X"], ["_A"], perm=[1, 0]), + helper.make_node("Transpose", ["_Y"], ["_B"], perm=[1, 0]), + helper.make_node("Gemm", ["_A", "_B", "_C"], ["_Z2"])], + [(TensorProto.FLOAT, (2, 3), "X"), + (TensorProto.FLOAT, (5, 2), "Y"), + (TensorProto.FLOAT, (3, 5), "C")], + [(TensorProto.FLOAT, (2, 3), "Z2")])) + graph = 
helper.make_graph( + nodes, + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (2, 3)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (5, 2)), + helper.make_tensor_value_info("C", TensorProto.FLOAT, (3, 5))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (3, 5))]) + optimized_model = self._optimized(graph, ["fuse_transpose_into_gemm"]) + + # Gemm, Constant (trip count), Constant (cond), Loop + assert len(list(optimized_model.graph.node)) == 4 + assert optimized_model.graph.node[0].op_type == "Gemm" + # Gemm + assert len(optimized_model.graph.node[3].attribute[0].g.node) == 1 + assert optimized_model.graph.node[3].attribute[0].g.node[0].op_type == "Gemm" + + def test_fuse_add_bias_into_conv_use_weight_shape(self): # type: () -> None + nodes = [helper.make_node("Conv", ["X", "Y"], ["Z"]), + helper.make_node("Add", ["Z", "A"], ["B"])] + nodes.extend(self._make_fake_loop_op( + [helper.make_node("Conv", ["_X", "_Y"], ["_Z"]), + helper.make_node("Add", ["_Z", "_A"], ["_B2"])], + [(TensorProto.FLOAT, (1, 5, 3, 3), "X"), + (TensorProto.FLOAT, (16, 5, 3, 3), "Y"), + (TensorProto.FLOAT, (16, 1, 1), "A")], + [(TensorProto.FLOAT, (1, 16, 3, 3), "B2")])) + graph = helper.make_graph( + nodes, + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 3, 3)), + helper.make_tensor_value_info( + "Y", TensorProto.FLOAT, (16, 5, 3, 3)), + helper.make_tensor_value_info("A", TensorProto.FLOAT, (16, 1, 1))], + [helper.make_tensor_value_info( + "B", TensorProto.FLOAT, (1, 16, 1, 1))], + ) + optimized_model = self._optimized(graph, ["fuse_add_bias_into_conv"]) + + # Squeeze, Conv, Constant (trip count), Constant (condition), Loop + assert len(list(optimized_model.graph.node)) == 5 + assert optimized_model.graph.node[0].op_type == 'Squeeze' + assert optimized_model.graph.node[1].op_type == 'Conv' + assert optimized_model.graph.output[0].name == 'Z' + # Squeeze, Conv + assert len(optimized_model.graph.node[4].attribute[0].g.node) == 2 + 
assert optimized_model.graph.node[4].attribute[0].g.node[0].op_type == 'Squeeze' + assert optimized_model.graph.node[4].attribute[0].g.node[1].op_type == 'Conv' + # Output 1 since 0 is 'cond' + assert optimized_model.graph.node[4].attribute[0].g.output[1].name == '_Z' + + def test_fuse_add_bias_into_conv_use_weight_shape_with_tile(self): # type: () -> None + conv = helper.make_node("Conv", ["X", "Y"], ["Z"]) + add = helper.make_node("Add", ["Z", "A"], ["B"]) + graph = helper.make_graph( + [conv, add], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 3, 3)), + helper.make_tensor_value_info( + "Y", TensorProto.FLOAT, (16, 5, 3, 3)), + helper.make_tensor_value_info("A", TensorProto.FLOAT, (1,))], + [helper.make_tensor_value_info( + "B", TensorProto.FLOAT, (1, 16, 1, 1))], + ) + optimized_model = self._optimized(graph, ["fuse_add_bias_into_conv"]) + + assert len(list(optimized_model.graph.node)) == 3 + assert len(optimized_model.graph.value_info) == 1 + assert optimized_model.graph.value_info[0].type.tensor_type.elem_type == TensorProto.INT64 + assert len( + optimized_model.graph.value_info[0].type.tensor_type.shape.dim) == 1 + assert optimized_model.graph.node[0].op_type == 'Constant' + assert optimized_model.graph.node[1].op_type == 'Tile' + assert optimized_model.graph.node[2].op_type == 'Conv' + assert optimized_model.graph.output[0].name == 'Z' + + def test_fuse_add_bias_into_conv_use_conv_shape(self): # type: () -> None + sub = helper.make_node("Sub", ["M", "N"], ["Y"]) + conv = helper.make_node("Conv", ["X", "Y"], ["Z"]) + add = helper.make_node("Add", ["Z", "A"], ["B"]) + graph = helper.make_graph( + [sub, conv, add], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 3, 3)), + helper.make_tensor_value_info( + "M", TensorProto.FLOAT, (16, 5, 3, 3)), + helper.make_tensor_value_info( + "N", TensorProto.FLOAT, (16, 5, 3, 3)), + helper.make_tensor_value_info("A", TensorProto.FLOAT, (1, 16, 1, 1))], + 
[helper.make_tensor_value_info( + "B", TensorProto.FLOAT, (1, 16, 1, 1))], + value_info=[ + helper.make_tensor_value_info( + "Z", TensorProto.FLOAT, (1, 16, 1, 1)) + ], + ) + optimized_model = self._optimized(graph, ["fuse_add_bias_into_conv"]) + + assert len(optimized_model.graph.node) == 3 + assert optimized_model.graph.node[0].op_type == 'Sub' + assert optimized_model.graph.node[1].op_type == 'Squeeze' + assert optimized_model.graph.node[2].op_type == 'Conv' + assert optimized_model.graph.output[0].name == 'Z' + assert optimized_model.graph.output[0].type.tensor_type.elem_type == TensorProto.FLOAT + assert len( + optimized_model.graph.output[0].type.tensor_type.shape.dim) == 4 + + def test_fuse_add_bias_into_conv_use_move_constant(self): # type: () -> None + conv = helper.make_node("Conv", ["X", "Y"], ["Z"]) + constant = helper.make_node("Constant", [], ["A"], + value=helper.make_tensor( + name="bias", + data_type=TensorProto.FLOAT, + dims=(16, 1, 1), + vals=np.random.randn(16).astype(np.float32).tolist())) + add = helper.make_node("Add", ["Z", "A"], ["B"]) + graph = helper.make_graph( + [conv, constant, add], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 3, 3)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 3, 3))], + [helper.make_tensor_value_info( + "B", TensorProto.FLOAT, (1, 16, 1, 1))], + value_info=[ + helper.make_tensor_value_info( + "A", TensorProto.FLOAT, (16, 1, 1)), + ] + ) + optimized_model = self._optimized(graph, ["fuse_add_bias_into_conv"]) + + assert len(optimized_model.graph.node) == 3 + assert optimized_model.graph.node[0].op_type == 'Constant' + assert optimized_model.graph.node[1].op_type == 'Squeeze' + assert optimized_model.graph.node[2].op_type == 'Conv' + assert optimized_model.graph.output[0].name == 'Z' + assert optimized_model.graph.output[0].type.tensor_type.elem_type == TensorProto.FLOAT + assert len( + optimized_model.graph.output[0].type.tensor_type.shape.dim) == 4 + + def 
test_fuse_add_bias_into_conv_squeeze_1d_bias_no_fuse(self): # type: () -> None + conv = helper.make_node("Conv", ["X", "Y"], ["Z"]) + add = helper.make_node("Add", ["Z", "A"], ["B"]) + graph = helper.make_graph( + [conv, add], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 3, 3)), + helper.make_tensor_value_info( + "Y", TensorProto.FLOAT, (16, 5, 3, 3)), + helper.make_tensor_value_info("A", TensorProto.FLOAT, (3,))], + [helper.make_tensor_value_info( + "B", TensorProto.FLOAT, (1, 16, 1, 3))], + value_info=[ + helper.make_tensor_value_info( + "Z", TensorProto.FLOAT, (1, 16, 1, 1)), + ] + ) + optimized_model = self._optimized(graph, ["fuse_add_bias_into_conv"]) + + assert len(list(optimized_model.graph.node)) == 2 + assert optimized_model.graph.node[0].op_type == 'Conv' + assert optimized_model.graph.node[1].op_type == 'Add' + + def test_fuse_add_bias_into_conv_squeeze_3d_bias_no_fuse(self): # type: () -> None + conv = helper.make_node("Conv", ["X", "Y"], ["Z"]) + add = helper.make_node("Add", ["Z", "A"], ["B"]) + graph = helper.make_graph( + [conv, add], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 3, 3)), + helper.make_tensor_value_info( + "Y", TensorProto.FLOAT, (16, 5, 3, 3)), + helper.make_tensor_value_info("A", TensorProto.FLOAT, (16, 3, 3))], + [helper.make_tensor_value_info( + "B", TensorProto.FLOAT, (1, 16, 3, 3))], + value_info=[ + helper.make_tensor_value_info( + "Z", TensorProto.FLOAT, (1, 16, 1, 1)), + ] + ) + optimized_model = self._optimized(graph, ["fuse_add_bias_into_conv"]) + + assert len(list(optimized_model.graph.node)) == 2 + assert optimized_model.graph.node[0].op_type == 'Conv' + assert optimized_model.graph.node[1].op_type == 'Add' + + def test_fuse_add_bias_into_conv_squeeze_4d_bias_no_fuse(self): # type: () -> None + conv = helper.make_node("Conv", ["X", "Y"], ["Z"]) + add = helper.make_node("Add", ["Z", "A"], ["B"]) + graph = helper.make_graph( + [conv, add], + "test", + 
[helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 3, 3)), + helper.make_tensor_value_info( + "Y", TensorProto.FLOAT, (16, 5, 3, 3)), + helper.make_tensor_value_info("A", TensorProto.FLOAT, (1, 16, 3, 3))], + [helper.make_tensor_value_info( + "B", TensorProto.FLOAT, (1, 16, 3, 3))] + ) + optimized_model = self._optimized(graph, ["fuse_add_bias_into_conv"]) + + assert len(list(optimized_model.graph.node)) == 2 + assert optimized_model.graph.node[0].op_type == 'Conv' + assert optimized_model.graph.node[1].op_type == 'Add' + + def test_fuse_matmul_add_bias_into_gemm(self): # type: () -> None + matmul = helper.make_node("MatMul", ["X", "Y"], ["Z"]) + add = helper.make_node("Add", ["Z", "B"], ["A"]) + graph = helper.make_graph( + [matmul, add], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (32, 10)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (10, 16)), + helper.make_tensor_value_info("B", TensorProto.FLOAT, (16,))], + [helper.make_tensor_value_info("A", TensorProto.FLOAT, (32, 16))] + ) + optimized_model = self._optimized(graph, ["fuse_matmul_add_bias_into_gemm"]) + + assert len(list(optimized_model.graph.node)) == 1 + assert optimized_model.graph.node[0].op_type == "Gemm" + + def test_fuse_matmul_add_bias_into_gemm_2d_bias(self): # type: () -> None + matmul = helper.make_node("MatMul", ["X", "Y"], ["Z"]) + add = helper.make_node("Add", ["Z", "B"], ["A"]) + graph = helper.make_graph( + [matmul, add], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (32, 10)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (10, 16)), + helper.make_tensor_value_info("B", TensorProto.FLOAT, (1, 16))], + [helper.make_tensor_value_info("A", TensorProto.FLOAT, (32, 16))] + ) + optimized_model = self._optimized(graph, ["fuse_matmul_add_bias_into_gemm"]) + + assert len(list(optimized_model.graph.node)) == 1 + assert optimized_model.graph.node[0].op_type == "Gemm" + + def 
test_fuse_matmul_add_bias_into_gemm_2d_bias_same_shape(self): # type: () -> None + matmul = helper.make_node("MatMul", ["X", "Y"], ["Z"]) + add = helper.make_node("Add", ["Z", "B"], ["A"]) + graph = helper.make_graph( + [matmul, add], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (32, 10)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (10, 16)), + helper.make_tensor_value_info("B", TensorProto.FLOAT, (32, 16))], + [helper.make_tensor_value_info("A", TensorProto.FLOAT, (32, 16))] + ) + optimized_model = self._optimized(graph, ["fuse_matmul_add_bias_into_gemm"]) + + assert len(list(optimized_model.graph.node)) == 1 + assert optimized_model.graph.node[0].op_type == "Gemm" + + def test_fuse_matmul_add_bias_into_gemm_2d_bias_bcast_no_fuse(self): # type: () -> None + matmul = helper.make_node("MatMul", ["X", "Y"], ["Z"]) + add = helper.make_node("Add", ["Z", "B"], ["A"]) + graph = helper.make_graph( + [matmul, add], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 10)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (10, 16)), + helper.make_tensor_value_info("B", TensorProto.FLOAT, (16, 16))], + [helper.make_tensor_value_info("A", TensorProto.FLOAT, (16, 16))] + ) + optimized_model = self._optimized(graph, ["fuse_matmul_add_bias_into_gemm"]) + + assert optimized_model.graph == graph + + def test_fuse_matmul_add_bias_into_gemm_3d_matmul_no_fuse(self): # type: () -> None + matmul = helper.make_node("MatMul", ["X", "Y"], ["Z"]) + add = helper.make_node("Add", ["Z", "B"], ["A"]) + graph = helper.make_graph( + [matmul, add], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (2, 3, 4)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (2, 4, 3)), + helper.make_tensor_value_info("B", TensorProto.FLOAT, (3, 3))], + [helper.make_tensor_value_info("A", TensorProto.FLOAT, (2, 3, 3))] + ) + optimized_model = self._optimized(graph, ["fuse_matmul_add_bias_into_gemm"]) + + assert 
optimized_model.graph == graph + + def test_fuse_matmul_add_bias_into_gemm_3d_bias_no_fuse(self): # type: () -> None + matmul = helper.make_node("MatMul", ["X", "Y"], ["Z"]) + add = helper.make_node("Add", ["Z", "B"], ["A"]) + graph = helper.make_graph( + [matmul, add], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (32, 10)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (10, 16)), + helper.make_tensor_value_info("B", TensorProto.FLOAT, (4, 1, 16))], + [helper.make_tensor_value_info("A", TensorProto.FLOAT, (32, 16))] + ) + optimized_model = self._optimized(graph, ["fuse_matmul_add_bias_into_gemm"]) + + assert optimized_model.graph == graph + + def test_fuse_matmul_add_bias_into_gemm_multiple_use_no_fuse(self): # type: () -> None + matmul = helper.make_node("MatMul", ["X", "Y"], ["Z"]) + identity = helper.make_node("Identity", ["Z"], ["A1"]) + add = helper.make_node("Add", ["Z", "B"], ["A2"]) + graph = helper.make_graph( + [matmul, add, identity], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (32, 10)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (10, 16)), + helper.make_tensor_value_info("B", TensorProto.FLOAT, (1, 16))], + [helper.make_tensor_value_info("A1", TensorProto.FLOAT, (32, 16)), + helper.make_tensor_value_info("A2", TensorProto.FLOAT, (32, 16))] + ) + optimized_model = self._optimized(graph, ["fuse_matmul_add_bias_into_gemm"]) + + assert optimized_model.graph == graph + + def test_fuse_pad_into_conv_no_optional_value_opset10(self): # type: () -> None + pad = helper.make_node( + "Pad", + ["X"], + ["P"], + mode="constant", + pads=[0, 0, 0, 0, 0, 0, 1, 1] + ) + conv = helper.make_node("Conv", ["P", "Y"], ["Z"]) + graph = helper.make_graph( + [pad, conv], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 2, 2)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 3, 3))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1, 1))] + ) + 
optimized_model = self._optimized(graph, ["fuse_pad_into_conv"], False, opset_imports=[helper.make_opsetid("", 10)]) + + assert len(list(optimized_model.graph.node)) == 1 + assert optimized_model.graph.node[0].op_type == "Conv" + assert optimized_model.graph.node[0].attribute[0].name == "pads" + assert list(optimized_model.graph.node[0].attribute[0].ints) == [0, 0, 1, 1] + + def test_fuse_pad_into_conv_no_optional_value(self): # type: () -> None + pad = helper.make_node( + "Pad", + ["X", "Pads"], + ["P"], + mode="constant" + ) + conv = helper.make_node("Conv", ["P", "Y"], ["Z"]) + graph = helper.make_graph( + [pad, conv], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 2, 2)), + helper.make_tensor_value_info("Pads", TensorProto.INT64, (8,)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 3, 3))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1, 1))], + [helper.make_tensor("Pads", TensorProto.INT64, + dims=(8,), + vals=np.array([0, 0, 0, 0, 0, 0, 1, 1]).astype(np.int64).tobytes(), + raw=True)]) + optimized_model = self._optimized(graph, ["fuse_pad_into_conv"]) + + assert len(list(optimized_model.graph.node)) == 1 + assert optimized_model.graph.node[0].op_type == "Conv" + assert optimized_model.graph.node[0].attribute[0].name == "pads" + assert list(optimized_model.graph.node[0].attribute[0].ints) == [0, 0, 1, 1] + + def test_fuse_pad_into_conv_with_optional_value(self): # type: () -> None + pad = helper.make_node( + "Pad", + ["X", "Pads", "Constant_value"], + ["P"], + mode="constant" + ) + conv = helper.make_node("Conv", ["P", "Y"], ["Z"]) + graph = helper.make_graph( + [pad, conv], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 2, 2)), + helper.make_tensor_value_info("Pads", TensorProto.INT64, (8,)), + helper.make_tensor_value_info("Constant_value", TensorProto.FLOAT, ()), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 3, 3))], + 
[helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1, 1))], + [helper.make_tensor("Pads", TensorProto.INT64, + dims=(8,), + vals=np.array([0, 0, 0, 0, 0, 0, 1, 1]).astype(np.int64).tobytes(), + raw=True), + helper.make_tensor("Constant_value", TensorProto.FLOAT, + dims=(), + vals=np.array([0]).astype(np.float32).tobytes(), + raw=True)]) + optimized_model = self._optimized(graph, ["fuse_pad_into_conv"]) + + assert len(list(optimized_model.graph.node)) == 1 + assert optimized_model.graph.node[0].op_type == "Conv" + assert optimized_model.graph.node[0].attribute[0].name == "pads" + assert list(optimized_model.graph.node[0].attribute[0].ints) == [0, 0, 1, 1] + + def test_fuse_pad_into_conv_with_nonzero_optional_value(self): # type: () -> None + pad = helper.make_node( + "Pad", + ["X", "Pads", "Constant_value"], + ["P"], + mode="constant" + ) + conv = helper.make_node("Conv", ["P", "Y"], ["Z"]) + graph = helper.make_graph( + [pad, conv], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 2, 2)), + helper.make_tensor_value_info("Pads", TensorProto.INT64, (8,)), + helper.make_tensor_value_info("Constant_value", TensorProto.FLOAT, ()), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 3, 3))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1, 1))], + [helper.make_tensor("Pads", TensorProto.INT64, + dims=(8,), + vals=np.array([0, 0, 0, 0, 0, 0, 1, 1]).astype(np.int64).tobytes(), + raw=True), + helper.make_tensor("Constant_value", TensorProto.FLOAT, + dims=(), + vals=np.array([25]).astype(np.float32).tobytes(), # non-zero Constant_value -> so no pad + raw=True)]) + optimized_model = self._optimized(graph, ["fuse_pad_into_conv"]) + + assert optimized_model.graph == graph + + def test_fuse_pad_into_conv_1d_opset10(self): # type: () -> None + pad = helper.make_node( + "Pad", + ["X"], + ["P"], + mode="constant", + pads=[0, 0, 1, 0, 0, 1] + ) + conv = helper.make_node("Conv", ["P", "Y"], ["Z"]) + graph = 
helper.make_graph( + [pad, conv], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 30)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 32))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1))] + ) + optimized_model = self._optimized(graph, ["fuse_pad_into_conv"], False, opset_imports=[helper.make_opsetid("", 10)]) + + assert len(list(optimized_model.graph.node)) == 1 + assert optimized_model.graph.node[0].op_type == "Conv" + assert optimized_model.graph.node[0].attribute[0].name == "pads" + assert list(optimized_model.graph.node[0].attribute[0].ints) == [1, 1] + + def test_fuse_pad_into_conv_1d(self): # type: () -> None + pad = helper.make_node( + "Pad", + ["X", "Pads"], + ["P"], + mode="constant" + ) + conv = helper.make_node("Conv", ["P", "Y"], ["Z"]) + graph = helper.make_graph( + [pad, conv], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 30)), + helper.make_tensor_value_info("Pads", TensorProto.INT64, (6,)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 32))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1))], + [helper.make_tensor("Pads", TensorProto.INT64, + dims=(6,), + vals=np.array([0, 0, 1, 0, 0, 1]).astype(np.int64).tobytes(), + raw=True)]) + optimized_model = self._optimized(graph, ["fuse_pad_into_conv"]) + + assert len(list(optimized_model.graph.node)) == 1 + assert optimized_model.graph.node[0].op_type == "Conv" + assert optimized_model.graph.node[0].attribute[0].name == "pads" + assert list(optimized_model.graph.node[0].attribute[0].ints) == [1, 1] + + def test_fuse_pad_into_conv_existing_conv_pad_opset10(self): # type: () -> None + pad = helper.make_node( + "Pad", + ["X"], + ["P"], + mode="constant", + pads=[0, 0, 0, 0, 0, 0, 1, 1] + ) + conv = helper.make_node( + "Conv", + ["P", "Y"], + ["Z"], + pads=[1, 1, 0, 0] + ) + graph = helper.make_graph( + [pad, conv], + "test", + [helper.make_tensor_value_info("X", 
TensorProto.FLOAT, (1, 5, 2, 2)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 4, 4))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1, 1))] + ) + optimized_model = self._optimized(graph, ["fuse_pad_into_conv"], False, opset_imports=[helper.make_opsetid("", 10)]) + + assert len(list(optimized_model.graph.node)) == 1 + assert optimized_model.graph.node[0].op_type == "Conv" + assert optimized_model.graph.node[0].attribute[0].name == "pads" + assert list(optimized_model.graph.node[0].attribute[0].ints) == [1, 1, 1, 1] + + def test_fuse_pad_into_conv_existing_conv_pad(self): # type: () -> None + pad = helper.make_node( + "Pad", + ["X", "Pads"], + ["P"], + mode="constant" + ) + conv = helper.make_node( + "Conv", + ["P", "Y"], + ["Z"], + pads=[1, 1, 0, 0] + ) + graph = helper.make_graph( + [pad, conv], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 2, 2)), + helper.make_tensor_value_info("Pads", TensorProto.INT64, (8,)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 4, 4))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1, 1))], + [helper.make_tensor("Pads", TensorProto.INT64, + dims=(8,), + vals=np.array([0, 0, 0, 0, 0, 0, 1, 1]).astype(np.int64).tobytes(), + raw=True)]) + optimized_model = self._optimized(graph, ["fuse_pad_into_conv"]) + + assert len(list(optimized_model.graph.node)) == 1 + assert optimized_model.graph.node[0].op_type == "Conv" + assert optimized_model.graph.node[0].attribute[0].name == "pads" + assert list(optimized_model.graph.node[0].attribute[0].ints) == [1, 1, 1, 1] + + def test_fuse_pad_into_conv_pad_feature_no_fuse_opset10(self): # type: () -> None + pad = helper.make_node( + "Pad", + ["X"], + ["P"], + mode="constant", + pads=[0, 1, 0, 0, 0, 0, 0, 0] + ) + conv = helper.make_node("Conv", ["P", "Y"], ["Z"]) + graph = helper.make_graph( + [pad, conv], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 4, 3, 3)), + 
helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 3, 3))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1, 1))] + ) + optimized_model = self._optimized(graph, ["fuse_pad_into_conv"], False, opset_imports=[helper.make_opsetid("", 10)]) + + assert optimized_model.graph == graph + + def test_fuse_pad_into_conv_pad_feature_no_fuse(self): # type: () -> None + pad = helper.make_node( + "Pad", + ["X", "Pads"], + ["P"], + mode="constant" + ) + conv = helper.make_node("Conv", ["P", "Y"], ["Z"]) + graph = helper.make_graph( + [pad, conv], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 4, 3, 3)), + helper.make_tensor_value_info("Pads", TensorProto.INT64, (8,)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 3, 3))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1, 1))], + [helper.make_tensor("Pads", TensorProto.INT64, + dims=(8,), + vals=np.array([0, 1, 0, 0, 0, 0, 0, 0]).astype(np.int64).tobytes(), + raw=True)]) + optimized_model = self._optimized(graph, ["fuse_pad_into_conv"]) + + assert optimized_model.graph == graph + + def test_fuse_pad_into_conv_negative_pad_no_fuse_opset10(self): # type: () -> None + pad = helper.make_node( + "Pad", + ["X"], + ["P"], + mode="constant", + pads=[0, 0, 0, 0, 0, 0, -1, -1] + ) + conv = helper.make_node("Conv", ["P", "Y"], ["Z"]) + graph = helper.make_graph( + [pad, conv], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 4, 4)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 3, 3))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1, 1))] + ) + optimized_model = self._optimized(graph, ["fuse_pad_into_conv"], False, opset_imports=[helper.make_opsetid("", 10)]) + + assert optimized_model.graph == graph + + def test_fuse_pad_into_conv_negative_pad_no_fuse(self): # type: () -> None + pad = helper.make_node( + "Pad", + ["X", "Pads"], + ["P"], + mode="constant" + ) + conv = helper.make_node("Conv", 
["P", "Y"], ["Z"]) + graph = helper.make_graph( + [pad, conv], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 4, 4)), + helper.make_tensor_value_info("Pads", TensorProto.INT64, (8,)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 3, 3))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1, 1))], + [helper.make_tensor("Pads", TensorProto.INT64, + dims=(8,), + vals=np.array([0, 0, 0, 0, 0, 0, -1, -1]).astype(np.int64).tobytes(), + raw=True)]) + optimized_model = self._optimized(graph, ["fuse_pad_into_conv"]) + + assert optimized_model.graph == graph + + def test_fuse_pad_into_conv_reflection_pad_no_fuse_opset10(self): # type: () -> None + pad = helper.make_node( + "Pad", + ["X"], + ["P"], + mode="reflect", + pads=[0, 0, 0, 0, 0, 0, 1, 1] + ) + conv = helper.make_node("Conv", ["P", "Y"], ["Z"]) + graph = helper.make_graph( + [pad, conv], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 2, 2)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 3, 3))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1, 1))] + ) + optimized_model = self._optimized(graph, ["fuse_pad_into_conv"], False, opset_imports=[helper.make_opsetid("", 10)]) + + assert optimized_model.graph == graph + + def test_fuse_pad_into_conv_reflection_pad_no_fuse(self): # type: () -> None + pad = helper.make_node( + "Pad", + ["X", "Pads"], + ["P"], + mode="reflect" + ) + conv = helper.make_node("Conv", ["P", "Y"], ["Z"]) + graph = helper.make_graph( + [pad, conv], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 2, 2)), + helper.make_tensor_value_info("Pads", TensorProto.INT64, (8,)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 3, 3))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1, 1))], + [helper.make_tensor("Pads", TensorProto.INT64, + dims=(8,), + vals=np.array([0, 0, 0, 0, 0, 0, 1, 1]).astype(np.int64).tobytes(), + 
raw=True)]) + optimized_model = self._optimized(graph, ["fuse_pad_into_conv"]) + + assert optimized_model.graph == graph + + def test_fuse_consecutive_squeezes(self): # type: () -> None + nodes = [helper.make_node("Squeeze", ["X"], ["Y"], axes=[0, 4, 5]), + helper.make_node("Squeeze", ["Y"], ["Z"], axes=[0, 3])] + nodes.extend(self._make_fake_loop_op( + [helper.make_node("Squeeze", ["_X"], ["_Y"], axes=[0, 4, 5]), + helper.make_node("Squeeze", ["_Y"], ["_Z2"], axes=[0, 3])], + [(TensorProto.FLOAT, (1, 1, 2, 3, 1, 1, 1, 1, 8, 9), "X")], + [(TensorProto.FLOAT, (2, 3, 1, 8, 9), "Z2")])) + + graph = helper.make_graph( + nodes, + "test", + [helper.make_tensor_value_info( + "X", TensorProto.FLOAT, (1, 1, 2, 3, 1, 1, 1, 1, 8, 9))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (2, 3, 1, 8, 9))]) + optimized_model = self._optimized(graph, ["fuse_consecutive_squeezes"]) + + # Squeeze, Constant (trip count), Constant (cond), Loop + assert optimized_model.graph.node[0].op_type == "Squeeze" + assert list(optimized_model.graph.node[0].attribute[0].ints) == [ + 0, 1, 4, 5, 6] + assert len(list(optimized_model.graph.node)) == 4 + + def test_fuse_consecutive_squeezes_default(self): # type: () -> None + squeeze1 = helper.make_node("Squeeze", ["X"], ["Y"], axes=[0, 4, 5]) + squeeze2 = helper.make_node("Squeeze", ["Y"], ["Z"], axes=[0, 3]) + squeeze3 = helper.make_node("Squeeze", ["Z"], ["A"], axes=[2]) + nodes = [squeeze1, squeeze2, squeeze3] + graph = helper.make_graph( + nodes, + "test", + [helper.make_tensor_value_info( + "X", TensorProto.FLOAT, (1, 1, 2, 3, 1, 1, 1, 1, 8, 9))], + [helper.make_tensor_value_info("A", TensorProto.FLOAT, (2, 3, 8, 9))]) + optimized_model = self._optimized(graph, ["fuse_consecutive_squeezes"]) + + assert optimized_model.graph.node[0].op_type == "Squeeze" + assert list(optimized_model.graph.node[0].attribute[0].ints) == [ + 0, 1, 4, 5, 6, 7] + assert len(list(optimized_model.graph.node)) == 1 + + def 
test_fuse_consecutive_squeezes_random(self): # type: () -> None + x_shape = [1, 1, 1, 3, 4, 1, 6, 1, 1, 9] + s1_one_indices = [i for i, a in enumerate(x_shape) if a == 1] + s1_axes = np.random.choice(s1_one_indices, size=np.random.randint(low=1, high=len(s1_one_indices) - 1), + replace=False) + s2_x_shape = [a for i, a in enumerate(x_shape) if i not in s1_axes] + s2_one_indices = [i for i, a in enumerate(s2_x_shape) if a == 1] + s2_axes = s2_one_indices + + squeeze1 = helper.make_node("Squeeze", ["X"], ["Y"], axes=s1_axes) + squeeze2 = helper.make_node("Squeeze", ["Y"], ["Z"], axes=s2_axes) + nodes = [squeeze1, squeeze2] + graph = helper.make_graph( + nodes, + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, x_shape)], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (3, 4, 6, 9))]) + optimized_model = self._optimized(graph, ["fuse_consecutive_squeezes"]) + + assert optimized_model.graph.node[0].op_type == "Squeeze" + assert list(optimized_model.graph.node[0].attribute[0].ints) == [ + 0, 1, 2, 5, 7, 8] + assert len(list(optimized_model.graph.node)) == 1 + + def test_fuse_consecutive_squeezes_multi_uses(self): # type: () -> None + squeeze1 = helper.make_node("Squeeze", ["X"], ["Y"], axes=[0, 4, 5]) + add = helper.make_node("Add", ["Y", "A"], ["Z2"]) + squeeze2 = helper.make_node("Squeeze", ["Y"], ["Z"], axes=[0, 3]) + graph = helper.make_graph( + [squeeze1, add, squeeze2], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 1, 2, 3, 1, 1, 1, 1, 8, 9)), + helper.make_tensor_value_info("A", TensorProto.FLOAT, (1,))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (2, 3, 1, 8, 9)), + helper.make_tensor_value_info("Z2", TensorProto.FLOAT, (1, 2, 3, 1, 1, 8, 9))]) + optimized_model = self._optimized(graph, ["fuse_consecutive_squeezes"]) + + assert optimized_model.graph.node[0].op_type == "Squeeze" + assert list(optimized_model.graph.node[0].attribute[0].ints) == [ + 0, 4, 5] + assert 
optimized_model.graph.node[2].op_type == "Squeeze" + assert optimized_model.graph.node[2].input == ["X"] + assert list(optimized_model.graph.node[2].attribute[0].ints) == [ + 0, 1, 4, 5, 6] + assert len(list(optimized_model.graph.node)) == 3 + + def test_fuse_consecutive_softmax_log_axis(self): # type: () -> None + for axis in range(3): + softmax = helper.make_node("Softmax", ["X"], ["Y"], axis=axis) + log = helper.make_node("Log", ["Y"], ["Z"]) + graph = helper.make_graph( + [softmax, log], + "test", + [helper.make_tensor_value_info( + "X", TensorProto.FLOAT, (5, 7, 11))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (5, 7, 11))]) + optimized_model = self._optimized( + graph, ["fuse_consecutive_log_softmax"]) + + assert optimized_model.graph.output[0].type.tensor_type.elem_type == TensorProto.FLOAT + assert len(optimized_model.graph.output) == 1 + assert len(optimized_model.graph.node) == 1 + assert optimized_model.graph.node[0].op_type == "LogSoftmax" + assert optimized_model.graph.node[0].attribute[0].name == "axis" + assert optimized_model.graph.node[0].attribute[0].i == axis + + def test_fuse_consecutive_softmax_log_side_effect(self): # type: () -> None + softmax = helper.make_node("Softmax", ["X"], ["Y"], axis=2) + log = helper.make_node("Log", ["Y"], ["Z"]) + graph = helper.make_graph( + [softmax, log], + "test", + [helper.make_tensor_value_info( + "X", TensorProto.FLOAT, (5, 7, 11))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (5, 7, 11)), + helper.make_tensor_value_info("Y", TensorProto.FLOAT, (5, 7, 11))]) + optimized_model = self._optimized( + graph, ["fuse_consecutive_log_softmax"]) + + assert graph == optimized_model.graph + + def test_fuse_consecutive_softmax_log_multiple_out(self): # type: () -> None + softmax = helper.make_node("Softmax", ["X"], ["Y"], axis=2) + log = helper.make_node("Log", ["Y"], ["Z"]) + exp = helper.make_node("Exp", ["Z"], ["Z1"]) + graph = helper.make_graph( + [softmax, log, exp], + "test", + 
[helper.make_tensor_value_info( + "X", TensorProto.FLOAT, (5, 7, 11))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (5, 7, 11)), + helper.make_tensor_value_info("Z1", TensorProto.FLOAT, (5, 7, 11))]) + optimized_model = self._optimized( + graph, ["fuse_consecutive_log_softmax"]) + + assert len(optimized_model.graph.output) == 2 + assert len(optimized_model.graph.node) == 2 + assert optimized_model.graph.output[0].type.tensor_type.elem_type == TensorProto.FLOAT + assert optimized_model.graph.output[1].type.tensor_type.elem_type == TensorProto.FLOAT + assert optimized_model.graph.node[0].op_type == "LogSoftmax" + assert optimized_model.graph.node[0].attribute[0].name == "axis" + assert optimized_model.graph.node[0].attribute[0].i == 2 + assert optimized_model.graph.node[1].op_type == "Exp" + + def test_preserve_value_info(self): # type: () -> None + trans1 = helper.make_node("Transpose", ["X"], ["Y"], perm=[1, 0, 2]) + trans2 = helper.make_node("Transpose", ["Y"], ["Z"], perm=[2, 0, 1]) + trans3 = helper.make_node("Transpose", ["Z"], ["A"], perm=[2, 0, 1]) + graph = helper.make_graph( + [trans1, trans2, trans3], + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (2, 3, 4))], + [helper.make_tensor_value_info("A", TensorProto.FLOAT, (2, 4, 3))]) + vi = helper.make_tensor_value_info("Y", TensorProto.FLOAT, (3, 2, 4)) + graph.value_info.extend([vi]) + optimized_model = self._optimized(graph, ["nop"]) + assert list(optimized_model.graph.value_info) == [vi] + assert len(list(optimized_model.graph.node)) == 3 + + def test_split(self): # type: () -> None + node = onnx.helper.make_node( + 'Constant', + inputs=[], + outputs=['X'], + value=onnx.helper.make_tensor( + name='X', + data_type=TensorProto.FLOAT, + dims=[1], + vals=[5], + ), + ) + graph = helper.make_graph( + [node], + 'test-optimize-split', + [], + [helper.make_tensor_value_info('X', TensorProto.FLOAT, (1,))]) + + init_model = self._optimized(graph, ['split_init']) + 
self.assertEqual(len(init_model.graph.node), 1) + self.assertEqual(len(init_model.graph.output), 1) + self.assertEqual(init_model.graph.node[0].op_type, 'Constant') + + predict_model = self._optimized(graph, ['split_predict']) + self.assertEqual(len(predict_model.graph.node), 0) + self.assertEqual(len(predict_model.graph.input), 1) + self.assertEqual(predict_model.graph.input[0].name, 'X') + + def test_lift_lex_loop(self): # type: () -> None + nodes = [helper.make_node("Identity", ["X"], ["Y"])] + nodes.extend(self._make_fake_loop_op( + [helper.make_node("Identity", ["X"], ["_Y2"]), + helper.make_node("Identity", ["Y"], ["_Y3"])], + [], + [(TensorProto.FLOAT, (5,), "Y2"), + (TensorProto.FLOAT, (5,), "Y3")])) + graph = helper.make_graph( + nodes, + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (5,))], + [helper.make_tensor_value_info("Y", TensorProto.FLOAT, (5,)), + helper.make_tensor_value_info("Y2", TensorProto.FLOAT, (5,))]) + optimized_model = self._optimized(graph, ["lift_lexical_references"]) + assert len(optimized_model.graph.node) == 4 + # body_graph, __control_inputs + assert len(optimized_model.graph.node[3].attribute) == 2 + assert optimized_model.graph.node[3].attribute[1].name == "__control_inputs" + assert optimized_model.graph.node[3].attribute[1].strings[0] == b"X" + assert optimized_model.graph.node[3].attribute[1].strings[1] == b"Y" + + def test_lift_lex_if(self): # type: () -> None + nodes = [helper.make_node("Identity", ["X"], ["Y"])] + nodes.extend(self._make_fake_if_op( + [helper.make_node("Identity", ["X"], ["_Y2"]), + helper.make_node("Identity", ["Y"], ["_Y3"])], + [helper.make_node("Identity", ["X"], ["_Y2"]), + helper.make_node("Identity", ["X"], ["_Y3"])], + [(TensorProto.FLOAT, (5,), "Y2"), + (TensorProto.FLOAT, (5,), "Y3")])) + graph = helper.make_graph( + nodes, + "test", + [helper.make_tensor_value_info("X", TensorProto.FLOAT, (5,))], + [helper.make_tensor_value_info("Y", TensorProto.FLOAT, (5,)), + 
helper.make_tensor_value_info("Y2", TensorProto.FLOAT, (5,))]) + # "If" node now diverges from ONNX schema. Disable checking. + optimized_model = self._optimized(graph, ["lift_lexical_references"]) + + # Identity, Constant (condition), If + assert len(optimized_model.graph.node) == 3 + # else_branch, then_branch, __control_inputs + assert len(optimized_model.graph.node[2].attribute) == 3 + assert optimized_model.graph.node[2].attribute[2].name == "__control_inputs" + assert optimized_model.graph.node[2].attribute[2].strings[0] == b"X" + assert optimized_model.graph.node[2].attribute[2].strings[1] == b"Y" + + def test_fuse_bn_into_conv_simple(self): # type: () -> None + for (tensor_type, np_type) in [(TensorProto.FLOAT, np.float32), (TensorProto.DOUBLE, np.float64)]: + conv = helper.make_node("Conv", ["X", "W", "B"], ["Y"]) + bn = helper.make_node("BatchNormalization", [ + "Y", "scale", "b", "mean", "var"], ["Z"]) + + W = np.random.randn(3, 2, 5, 5).astype(np_type) + 2 + B = np.random.randn(3,).astype(np_type) + 2 + scale = np.random.randn(3,).astype(np_type) + 2 + b = np.random.randn(3,).astype(np_type) + 2 + mean = np.random.randn(3,).astype(np_type) + 2 + var = np.abs(np.random.randn(3,).astype(np_type)) + 2 + + initializers = [ + helper.make_tensor(name, tensor_type, + npa.shape, npa.tobytes(), raw=True) + for name, npa in [('W', W), ('B', B), ('scale', scale), ('b', b), ('mean', mean), ('var', var)] + ] + graph = helper.make_graph( + [conv, bn], + "test", + [helper.make_tensor_value_info("X", tensor_type, (5, 2, 28, 28)), + helper.make_tensor_value_info("W", tensor_type, (3, 2, 5, 5)), + helper.make_tensor_value_info("B", tensor_type, (3,)), + helper.make_tensor_value_info("scale", tensor_type, (3,)), + helper.make_tensor_value_info("b", tensor_type, (3,)), + helper.make_tensor_value_info("mean", tensor_type, (3,)), + helper.make_tensor_value_info("var", tensor_type, (3,))], + [helper.make_tensor_value_info( + "Z", tensor_type, (5, 3, 24, 24))], + 
initializer=initializers, + value_info=[ + helper.make_tensor_value_info( + "Y", tensor_type, (5, 3, 24, 24)) + ] + ) + optimized_model = self._optimized(graph, ["fuse_bn_into_conv"]) + + self.assertEqual(len(optimized_model.graph.node), 1) + self.assertEqual(optimized_model.graph.node[0].op_type, 'Conv') + self.assertEqual(len(optimized_model.graph.initializer), 2) + new_W = numpy_helper.to_array(optimized_model.graph.initializer[0]) + new_b = numpy_helper.to_array(optimized_model.graph.initializer[1]) + + f = scale / np.sqrt(var + 1e-5) + np.testing.assert_almost_equal((B - mean) * f + b, new_b) + np.testing.assert_almost_equal( + W * f[:, np.newaxis, np.newaxis, np.newaxis], new_W) + + def _internal_test_deadend_elimination(self, fixed): # type: (bool) -> None + softmax = helper.make_node("Softmax", ["X"], ["Y"], axis=2) + log = helper.make_node("Log", ["Y"], ["Z"]) + exp = helper.make_node("Exp", ["Z"], ["Z1"]) + exp1 = helper.make_node("Log", ["Z"], ["Z2"]) + exp2 = helper.make_node("Sqrt", ["Z1"], ["Z3"]) + graph = helper.make_graph( + [softmax, log, exp, exp1, exp2], + "test", + [helper.make_tensor_value_info( + "X", TensorProto.FLOAT, (5, 7, 11))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (5, 7, 11))]) + optimized_model = self._optimized( + graph, ["eliminate_deadend"], fixed) + assert len(optimized_model.graph.output) == 1 + assert len(optimized_model.graph.node) == 2 + assert optimized_model.graph.output[0].type.tensor_type.elem_type == TensorProto.FLOAT + assert optimized_model.graph.node[0].op_type == "Softmax" + assert optimized_model.graph.node[0].attribute[0].name == "axis" + assert optimized_model.graph.node[0].attribute[0].i == 2 + assert optimized_model.graph.node[1].op_type == "Log" + + def test_deadend_elimination_simple(self): # type: () -> None + self._internal_test_deadend_elimination(False) + + def test_deadend_elimination_simple_fixed(self): # type: () -> None + self._internal_test_deadend_elimination(True) + + def 
test_eliminate_nop_monotone_argmax_basic_no_node_axis(self): # type: () -> None + for node_name in ["Log", "Exp", "Sqrt"]: + for axis in range(3): + node = helper.make_node(node_name, ["X"], ["Y"]) + argmax = helper.make_node("ArgMax", ["Y"], ["Z"], axis=axis) + graph = helper.make_graph( + [node, argmax], + "test", + [helper.make_tensor_value_info( + "X", TensorProto.FLOAT, (5, 7, 11))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (5, 7, 11))]) + optimized_model = self._optimized( + graph, ["eliminate_nop_monotone_argmax"]) + assert len(optimized_model.graph.output) == 1 + assert len(optimized_model.graph.node) == 1 + assert optimized_model.graph.output[0].type.tensor_type.elem_type == TensorProto.FLOAT + assert optimized_model.graph.node[0].op_type == "ArgMax" + assert optimized_model.graph.node[0].attribute[0].name == "axis" + assert optimized_model.graph.node[0].attribute[0].i == axis + + def test_eliminate_nop_monotone_argmax_basic_with_node_axis(self): # type: () -> None + for node_name in ["Softmax", "LogSoftmax"]: + for axis_n in range(3): + for axis_max in range(3): + node = helper.make_node(node_name, ["X"], ["Y"], axis=axis_n) + argmax = helper.make_node("ArgMax", ["Y"], ["Z"], axis=axis_max) + graph = helper.make_graph( + [node, argmax], + "test", + [helper.make_tensor_value_info( + "X", TensorProto.FLOAT, (5, 7, 11))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (5, 7, 11))]) + optimized_model = self._optimized( + graph, ["eliminate_nop_monotone_argmax"]) + if axis_max == axis_n: + assert len(optimized_model.graph.output) == 1 + assert len(optimized_model.graph.node) == 1 + assert optimized_model.graph.output[0].type.tensor_type.elem_type == TensorProto.FLOAT + assert optimized_model.graph.node[0].op_type == "ArgMax" + assert optimized_model.graph.node[0].attribute[0].name == "axis" + assert optimized_model.graph.node[0].attribute[0].i == axis_max + else: + assert optimized_model.graph == graph + + def 
test_eliminate_nop_monotone_argmax_multiple_out(self): # type: () -> None + for node_name in ["Log", "Exp", "Sqrt"]: + for axis in range(3): + node = helper.make_node(node_name, ["X"], ["Y"]) + node2 = helper.make_node(node_name, ["Y"], ["Z1"]) + argmax = helper.make_node("ArgMax", ["Y"], ["Z"], axis=axis) + graph = helper.make_graph( + [node, node2, argmax], + "test", + [helper.make_tensor_value_info( + "X", TensorProto.FLOAT, (5, 7, 11))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (5, 7, 11)), + helper.make_tensor_value_info("Z1", TensorProto.FLOAT, (5, 7, 11))]) + optimized_model = self._optimized( + graph, ["eliminate_nop_monotone_argmax"]) + assert optimized_model.graph == graph + + def test_eliminate_nop_monotone_argmax_consecutive(self): # type: () -> None + def _assertion(graph, optimized_model, axis_aligned, true_axis): # type: (GraphProto, ModelProto, bool, int) -> None + if axis_aligned: + assert len(optimized_model.graph.output) == 1 + assert len(optimized_model.graph.node) == 1 + assert optimized_model.graph.output[0].type.tensor_type.elem_type == TensorProto.FLOAT + assert optimized_model.graph.node[0].op_type == "ArgMax" + assert optimized_model.graph.node[0].attribute[0].name == "axis" + assert optimized_model.graph.node[0].attribute[0].i == true_axis + else: + assert optimized_model.graph == graph + # no axis X no axis test + for node_name_0 in ["Log", "Exp", "Sqrt"]: + for node_name_1 in ["Log", "Exp", "Sqrt"]: + for axis in range(3): + node = helper.make_node(node_name_0, ["X"], ["Y"]) + node2 = helper.make_node(node_name_1, ["Y"], ["Y1"]) + argmax = helper.make_node("ArgMax", ["Y1"], ["Z"], axis=axis) + graph = helper.make_graph( + [node, node2, argmax], + "test", + [helper.make_tensor_value_info( + "X", TensorProto.FLOAT, (5, 7, 11))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (5, 7, 11))]) + optimized_model = self._optimized( + graph, ["eliminate_nop_monotone_argmax"], True) + _assertion(graph, optimized_model, 
True, axis) + # no axis X axis test + for node_name_0 in ["Log", "Exp", "Sqrt"]: + for node_name_1 in ["Softmax", "LogSoftmax"]: + for axis_0 in range(3): + for axis_1 in range(3): + node = helper.make_node(node_name_0, ["X"], ["Y"]) + node2 = helper.make_node(node_name_1, ["Y"], ["Y1"], axis=axis_0) + argmax = helper.make_node("ArgMax", ["Y1"], ["Z"], axis=axis_1) + graph = helper.make_graph( + [node, node2, argmax], + "test", + [helper.make_tensor_value_info( + "X", TensorProto.FLOAT, (5, 7, 11))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (5, 7, 11))]) + optimized_model = self._optimized( + graph, ["eliminate_nop_monotone_argmax"], True) + _assertion(graph, optimized_model, axis_0 == axis_1, axis_1) + # axis X axis test + for node_name_0 in ["Softmax", "LogSoftmax"]: + for node_name_1 in ["Softmax", "LogSoftmax"]: + for axis_0 in range(3): + for axis_1 in range(3): + for axis_2 in range(3): + node = helper.make_node(node_name_0, ["X"], ["Y"], axis=axis_0) + node2 = helper.make_node(node_name_1, ["Y"], ["Y1"], axis=axis_1) + argmax = helper.make_node("ArgMax", ["Y1"], ["Z"], axis=axis_2) + graph = helper.make_graph( + [node, node2, argmax], + "test", + [helper.make_tensor_value_info( + "X", TensorProto.FLOAT, (5, 7, 11))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (5, 7, 11))]) + optimized_model = self._optimized( + graph, ["eliminate_nop_monotone_argmax"], True) + if axis_0 == axis_1: # we can reduce both of the monotonic ops + _assertion(graph, optimized_model, axis_1 == axis_2, axis_2) + elif axis_1 == axis_2: # we can reduce one of the monotonic ops + assert len(optimized_model.graph.output) == 1 + assert len(optimized_model.graph.node) == 2 + assert optimized_model.graph.output[0].type.tensor_type.elem_type == TensorProto.FLOAT + assert optimized_model.graph.node[-1].op_type == "ArgMax" + assert optimized_model.graph.node[-1].attribute[0].name == "axis" + assert optimized_model.graph.node[-1].attribute[0].i == axis_2 + else: # 
we can't reduce anything + assert optimized_model.graph == graph + + def test_eliminate_nop_dropout(self): # type: () -> None + node = helper.make_node("Dropout", ["X"], ["Y"]) + node1 = helper.make_node("Log", ["Y"], ["Z"]) + graph = helper.make_graph( + [node, node1], + "test", + [helper.make_tensor_value_info( + "X", TensorProto.FLOAT, (5, 7))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (5, 7))]) + optimized_model = self._optimized( + graph, ["eliminate_nop_dropout"], False) + + # we don't want to eliminate the dropout in opset 12, + # even when it's an optional parameter (defaults to 0) + assert optimized_model.graph == graph + + def test_eliminate_nop_dropout_opset11_graph_output(self): # type: () -> None + node = helper.make_node("Log", ["X"], ["Y"]) + node1 = helper.make_node("Dropout", ["Y"], ["Z"], ratio=0.0) + graph = helper.make_graph( + [node, node1], + "test", + [helper.make_tensor_value_info( + "X", TensorProto.FLOAT, (5, 7))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (5, 7))]) + optimized_model = self._optimized( + graph, ["eliminate_nop_dropout"], False, opset_imports=[helper.make_opsetid("", 11)]) + + assert len(optimized_model.graph.output) == 1 + assert len(optimized_model.graph.node) == 1 + assert optimized_model.graph.node[0].op_type == "Log" + + def test_eliminate_nop_dropout_opset11(self): # type: () -> None + for ratio in [0.0, 0.5]: + node = helper.make_node("Dropout", ["X"], ["Y"], ratio=ratio) + node1 = helper.make_node("Log", ["Y"], ["Z"]) + graph = helper.make_graph( + [node, node1], + "test", + [helper.make_tensor_value_info( + "X", TensorProto.FLOAT, (5, 7))], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (5, 7))]) + optimized_model = self._optimized( + graph, ["eliminate_nop_dropout"], False, opset_imports=[helper.make_opsetid("", 11)]) + + if ratio > 0.0: + assert optimized_model.graph == graph + else: + assert len(optimized_model.graph.output) == 1 + assert len(optimized_model.graph.node) 
== 1 + assert optimized_model.graph.node[0].op_type == "Log" + + def test_fuse_reduction_unsqueeze(self): # type: () -> None + def _calculate_post_transform_shape(input_shape, reduction_axes, unsqueeze_axes, keepdim): # type: (Tuple[int, ...], List[int], List[int], bool) -> Tuple[int, ...] + post_reduce_shape = None + if keepdim: + post_reduce_shape = tuple([(x if i not in reduction_axes else 1) for i, x in enumerate(input_shape)]) + else: + post_reduce_shape = tuple([x for i, x in enumerate(input_shape) if i not in reduction_axes]) + post_unsqueeze_shape = list(post_reduce_shape) + for ax in unsqueeze_axes: + post_unsqueeze_shape.insert(ax, 1) + return tuple(post_unsqueeze_shape) + + for reduction in ["ReduceL1", "ReduceL2", "ReduceLogSum", + "ReduceLogSumExp", "ReduceMax", "ReduceMean", + "ReduceMin", "ReduceProd", "ReduceSum", "ReduceSumSquare"]: + for axes1 in [[1], [1, 2], [2]]: + for axes2 in [[1], [1, 2], [2]]: + for keepdim in [False, True]: + input_shape = (5, 7, 9) + output_shape = _calculate_post_transform_shape(input_shape, axes1, axes2, keepdim) # type: Tuple[int, ...] 
+ node = helper.make_node(reduction, ["X"], ["Y"], axes=axes1, keepdims=keepdim) + node1 = helper.make_node("Unsqueeze", ["Y"], ["Z"], axes=axes2) + graph = helper.make_graph( + [node, node1], + "test", + [helper.make_tensor_value_info( + "X", TensorProto.FLOAT, input_shape)], + [helper.make_tensor_value_info("Z", TensorProto.FLOAT, output_shape)]) + optimized_model = self._optimized( + graph, ["fuse_consecutive_reduce_unsqueeze"], False) + + if keepdim or axes1 != axes2: + assert optimized_model.graph == graph + else: + assert len(optimized_model.graph.output) == 1 + assert len(optimized_model.graph.node) == 1 + assert optimized_model.graph.output[0].type.tensor_type.elem_type == TensorProto.FLOAT + assert optimized_model.graph.node[-1].op_type == reduction + assert optimized_model.graph.node[-1].attribute[0].name == "axes" + assert optimized_model.graph.node[-1].attribute[0].ints == axes1 + optimized_output_shape = tuple(x.dim_value for x in optimized_model.graph.output[0].type.tensor_type.shape.dim) + assert optimized_output_shape == output_shape + + +if __name__ == '__main__': + unittest.main() + From 98d2156bca9202cc704b05b1c2bd1762da0380c5 Mon Sep 17 00:00:00 2001 From: daquexian Date: Sun, 23 Aug 2020 15:49:39 +0800 Subject: [PATCH 04/14] update .gitignore --- .gitignore | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index 512a0b353..13bc1fbe0 100644 --- a/.gitignore +++ b/.gitignore @@ -63,23 +63,16 @@ build_* # setup.py intermediates .eggs dist -onnx_opt.egg-info +*.egg-info *.ninja .ninja_deps .ninja_log compile_commands.json # generated files -onnx/version.py +onnx_opt/version.py compile_commands.json -# test generated files -.cache -.coverage -onnx/examples/.coverage.nbval -.pytest_cache -test_report - # autocomplete .ycm_extra_conf.py From 3dd943b358ee3fba5aaddca3daee1d7ce79bd2a8 Mon Sep 17 00:00:00 2001 From: daquexian Date: Sun, 23 Aug 2020 16:06:45 +0800 Subject: [PATCH 05/14] update setup.py 
--- setup.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/setup.py b/setup.py index 720b29389..2d18a8b54 100644 --- a/setup.py +++ b/setup.py @@ -293,11 +293,7 @@ def run(self): packages = setuptools.find_packages() install_requires.extend([ - 'protobuf', - 'numpy', - 'six', - 'typing>=3.6.4; python_version < "3.5"', - 'typing-extensions>=3.6.2.1', + 'onnx' ]) ################################################################################ @@ -330,15 +326,8 @@ def run(self): setup_requires=setup_requires, tests_require=tests_require, extras_require=extras_require, - author='ONNX', + author='ONNX Optimizer Authors', author_email='onnx-technical-discuss@lists.lfai.foundation', - url='https://github.com/onnx/onnx', - entry_points={ - 'console_scripts': [ - 'check-model = onnx.bin.checker:check_model', - 'check-node = onnx.bin.checker:check_node', - 'backend-test-tools = onnx.backend.test.cmd_tools:main', - ] - }, + url='https://github.com/onnx/optimizer', ) From 2f94057efc6aa97d74b0c0f15d9f0ef92ec67d4a Mon Sep 17 00:00:00 2001 From: daquexian Date: Sun, 23 Aug 2020 16:26:03 +0800 Subject: [PATCH 06/14] fix undefined ONNX_OPTIMIZER_VERSION --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7a4abbf3f..2a832a846 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,9 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(ONNX_ROOT ${PROJECT_SOURCE_DIR}/third_party/onnx) add_subdirectory(${ONNX_ROOT}) +file(READ "${PROJECT_SOURCE_DIR}/VERSION_NUMBER" ONNX_OPTIMIZER_VERSION) +string(STRIP "${ONNX_OPTIMIZER_VERSION}" ONNX_OPTIMIZER_VERSION) + file(GLOB_RECURSE onnx_opt_srcs "onnx_opt/*.cc" "onnx_opt/*.h" ) From d1882615a84b46d69da1b0693422c41bbf5a73bc Mon Sep 17 00:00:00 2001 From: daquexian Date: Sun, 23 Aug 2020 16:28:32 +0800 Subject: [PATCH 07/14] fix undefined ONNX_OPTIMIZER_INCLUDE_DIR --- cmake/ONNXOptimizerConfig.cmake.in | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/cmake/ONNXOptimizerConfig.cmake.in b/cmake/ONNXOptimizerConfig.cmake.in index 85c56549e..72dcc88d6 100644 --- a/cmake/ONNXOptimizerConfig.cmake.in +++ b/cmake/ONNXOptimizerConfig.cmake.in @@ -20,5 +20,5 @@ get_filename_component( CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) get_filename_component( _INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/../../../" ABSOLUTE) -set(ONNX_INCLUDE_DIRS "${_INSTALL_PREFIX}/include") +set(ONNX_OPTIMIZER_INCLUDE_DIRS "${_INSTALL_PREFIX}/include") From 50997e6445610f7c11cbb26e310c1b7a5d600f2a Mon Sep 17 00:00:00 2001 From: daquexian Date: Sun, 23 Aug 2020 16:31:42 +0800 Subject: [PATCH 08/14] refine include order --- onnx_opt/cpp2py_export.cc | 2 +- onnx_opt/optimize.h | 3 ++- onnx_opt/pass.cc | 3 ++- onnx_opt/pass_registry.h | 3 ++- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/onnx_opt/cpp2py_export.cc b/onnx_opt/cpp2py_export.cc index 6393baa86..9cebf841a 100644 --- a/onnx_opt/cpp2py_export.cc +++ b/onnx_opt/cpp2py_export.cc @@ -1,8 +1,8 @@ #include #include +#include "onnx/py_utils.h" #include "onnx_opt/optimize.h" -#include "onnx/py_utils.h" namespace ONNX_NAMESPACE { namespace py = pybind11; diff --git a/onnx_opt/optimize.h b/onnx_opt/optimize.h index 459beb337..e579ba911 100644 --- a/onnx_opt/optimize.h +++ b/onnx_opt/optimize.h @@ -6,9 +6,10 @@ #include "onnx/common/ir.h" #include "onnx/common/ir_pb_converter.h" #include "onnx/common/stl_backports.h" +#include "onnx/proto_utils.h" + #include "onnx_opt/pass_manager.h" #include "onnx_opt/pass_registry.h" -#include "onnx/proto_utils.h" #include "vector" diff --git a/onnx_opt/pass.cc b/onnx_opt/pass.cc index 27ac67062..88ac1ba90 100644 --- a/onnx_opt/pass.cc +++ b/onnx_opt/pass.cc @@ -1,6 +1,7 @@ -#include "onnx_opt/pass.h" #include "onnx/common/assertions.h" +#include "onnx_opt/pass.h" + namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/pass_registry.h b/onnx_opt/pass_registry.h index 5c4350359..5168f2e70 
100644 --- a/onnx_opt/pass_registry.h +++ b/onnx_opt/pass_registry.h @@ -6,6 +6,8 @@ #include "onnx/common/ir.h" #include "onnx/common/ir_pb_converter.h" #include "onnx/common/stl_backports.h" +#include "onnx/proto_utils.h" + #include "onnx_opt/passes/eliminate_deadend.h" #include "onnx_opt/passes/eliminate_identity.h" #include "onnx_opt/passes/eliminate_nop_dropout.h" @@ -27,7 +29,6 @@ #include "onnx_opt/passes/lift_lexical_references.h" #include "onnx_opt/passes/nop.h" #include "onnx_opt/passes/split.h" -#include "onnx/proto_utils.h" #include #include From b4950ec5b99e299c251117c42ad5084a5300c2ba Mon Sep 17 00:00:00 2001 From: daquexian Date: Sun, 23 Aug 2020 18:30:47 +0800 Subject: [PATCH 09/14] rename onnx_opt to onnxoptimizer --- .gitignore | 2 +- CMakeLists.txt | 10 ++--- {onnx_opt => onnxoptimizer}/__init__.py | 2 +- {onnx_opt => onnxoptimizer}/cpp2py_export.cc | 2 +- {onnx_opt => onnxoptimizer}/optimize.cc | 2 +- {onnx_opt => onnxoptimizer}/optimize.h | 4 +- {onnx_opt => onnxoptimizer}/pass.cc | 2 +- {onnx_opt => onnxoptimizer}/pass.h | 0 {onnx_opt => onnxoptimizer}/pass_manager.cc | 2 +- {onnx_opt => onnxoptimizer}/pass_manager.h | 4 +- {onnx_opt => onnxoptimizer}/pass_registry.cc | 2 +- {onnx_opt => onnxoptimizer}/pass_registry.h | 42 +++++++++---------- .../passes/eliminate_deadend.h | 2 +- .../passes/eliminate_identity.h | 2 +- .../passes/eliminate_nop_dropout.h | 2 +- .../passes/eliminate_nop_monotone_argmax.h | 2 +- .../passes/eliminate_nop_pad.h | 2 +- .../passes/eliminate_nop_transpose.h | 2 +- .../passes/eliminate_unused_initializer.h | 2 +- .../passes/extract_constant_to_initializer.h | 2 +- .../passes/fuse_add_bias_into_conv.h | 2 +- .../passes/fuse_bn_into_conv.h | 2 +- .../passes/fuse_consecutive_concats.h | 2 +- .../passes/fuse_consecutive_log_softmax.h | 2 +- .../fuse_consecutive_reduce_unsqueeze.h | 2 +- .../passes/fuse_consecutive_squeezes.h | 2 +- .../passes/fuse_consecutive_transposes.h | 2 +- .../passes/fuse_matmul_add_bias_into_gemm.h | 
2 +- .../passes/fuse_pad_into_conv.h | 2 +- .../passes/fuse_transpose_into_gemm.h | 2 +- .../passes/lift_lexical_references.h | 2 +- {onnx_opt => onnxoptimizer}/passes/nop.h | 2 +- {onnx_opt => onnxoptimizer}/passes/split.h | 2 +- .../test/optimizer_test.py | 6 +-- setup.py | 12 +++--- 35 files changed, 67 insertions(+), 67 deletions(-) rename {onnx_opt => onnxoptimizer}/__init__.py (97%) rename {onnx_opt => onnxoptimizer}/cpp2py_export.cc (96%) rename {onnx_opt => onnxoptimizer}/optimize.cc (97%) rename {onnx_opt => onnxoptimizer}/optimize.h (94%) rename {onnx_opt => onnxoptimizer}/pass.cc (98%) rename {onnx_opt => onnxoptimizer}/pass.h (100%) rename {onnx_opt => onnxoptimizer}/pass_manager.cc (97%) rename {onnx_opt => onnxoptimizer}/pass_manager.h (94%) rename {onnx_opt => onnxoptimizer}/pass_registry.cc (91%) rename {onnx_opt => onnxoptimizer}/pass_registry.h (64%) rename {onnx_opt => onnxoptimizer}/passes/eliminate_deadend.h (97%) rename {onnx_opt => onnxoptimizer}/passes/eliminate_identity.h (96%) rename {onnx_opt => onnxoptimizer}/passes/eliminate_nop_dropout.h (97%) rename {onnx_opt => onnxoptimizer}/passes/eliminate_nop_monotone_argmax.h (98%) rename {onnx_opt => onnxoptimizer}/passes/eliminate_nop_pad.h (98%) rename {onnx_opt => onnxoptimizer}/passes/eliminate_nop_transpose.h (97%) rename {onnx_opt => onnxoptimizer}/passes/eliminate_unused_initializer.h (98%) rename {onnx_opt => onnxoptimizer}/passes/extract_constant_to_initializer.h (97%) rename {onnx_opt => onnxoptimizer}/passes/fuse_add_bias_into_conv.h (99%) rename {onnx_opt => onnxoptimizer}/passes/fuse_bn_into_conv.h (99%) rename {onnx_opt => onnxoptimizer}/passes/fuse_consecutive_concats.h (98%) rename {onnx_opt => onnxoptimizer}/passes/fuse_consecutive_log_softmax.h (97%) rename {onnx_opt => onnxoptimizer}/passes/fuse_consecutive_reduce_unsqueeze.h (98%) rename {onnx_opt => onnxoptimizer}/passes/fuse_consecutive_squeezes.h (98%) rename {onnx_opt => 
onnxoptimizer}/passes/fuse_consecutive_transposes.h (98%) rename {onnx_opt => onnxoptimizer}/passes/fuse_matmul_add_bias_into_gemm.h (99%) rename {onnx_opt => onnxoptimizer}/passes/fuse_pad_into_conv.h (99%) rename {onnx_opt => onnxoptimizer}/passes/fuse_transpose_into_gemm.h (97%) rename {onnx_opt => onnxoptimizer}/passes/lift_lexical_references.h (99%) rename {onnx_opt => onnxoptimizer}/passes/nop.h (95%) rename {onnx_opt => onnxoptimizer}/passes/split.h (99%) rename {onnx_opt => onnxoptimizer}/test/optimizer_test.py (99%) diff --git a/.gitignore b/.gitignore index 13bc1fbe0..2435431ae 100644 --- a/.gitignore +++ b/.gitignore @@ -70,7 +70,7 @@ dist compile_commands.json # generated files -onnx_opt/version.py +onnxoptimizer/version.py compile_commands.json # autocomplete diff --git a/CMakeLists.txt b/CMakeLists.txt index 2a832a846..ef6fcc930 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,10 +10,10 @@ add_subdirectory(${ONNX_ROOT}) file(READ "${PROJECT_SOURCE_DIR}/VERSION_NUMBER" ONNX_OPTIMIZER_VERSION) string(STRIP "${ONNX_OPTIMIZER_VERSION}" ONNX_OPTIMIZER_VERSION) -file(GLOB_RECURSE onnx_opt_srcs "onnx_opt/*.cc" - "onnx_opt/*.h" +file(GLOB_RECURSE onnx_opt_srcs "onnxoptimizer/*.cc" + "onnxoptimizer/*.h" ) -list(REMOVE_ITEM onnx_opt_srcs "${PROJECT_SOURCE_DIR}/onnx_opt/cpp2py_export.cc") +list(REMOVE_ITEM onnx_opt_srcs "${PROJECT_SOURCE_DIR}/onnxoptimizer/cpp2py_export.cc") add_library(onnx_optimizer ${onnx_opt_srcs}) target_link_libraries(onnx_optimizer PUBLIC onnx) @@ -31,7 +31,7 @@ if(BUILD_ONNX_PYTHON) endif() endif() - add_library(onnx_opt_cpp2py_export MODULE "onnx_opt/cpp2py_export.cc") + add_library(onnx_opt_cpp2py_export MODULE "onnxoptimizer/cpp2py_export.cc") set_target_properties(onnx_opt_cpp2py_export PROPERTIES PREFIX "") set_target_properties(onnx_opt_cpp2py_export PROPERTIES COMPILE_FLAGS "-fvisibility=hidden") @@ -116,7 +116,7 @@ endif() include(GNUInstallDirs) -install(DIRECTORY ${PROJECT_SOURCE_DIR}/onnx_opt +install(DIRECTORY 
${PROJECT_SOURCE_DIR}/onnxoptimizer DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} FILES_MATCHING PATTERN "*.h") diff --git a/onnx_opt/__init__.py b/onnxoptimizer/__init__.py similarity index 97% rename from onnx_opt/__init__.py rename to onnxoptimizer/__init__.py index c0c044827..ec8429605 100644 --- a/onnx_opt/__init__.py +++ b/onnxoptimizer/__init__.py @@ -11,7 +11,7 @@ from __future__ import unicode_literals import onnx -import onnx_opt.onnx_opt_cpp2py_export as C +import onnxoptimizer.onnx_opt_cpp2py_export as C from onnx import ModelProto from typing import Text, Sequence, Optional diff --git a/onnx_opt/cpp2py_export.cc b/onnxoptimizer/cpp2py_export.cc similarity index 96% rename from onnx_opt/cpp2py_export.cc rename to onnxoptimizer/cpp2py_export.cc index 9cebf841a..2f92222db 100644 --- a/onnx_opt/cpp2py_export.cc +++ b/onnxoptimizer/cpp2py_export.cc @@ -2,7 +2,7 @@ #include #include "onnx/py_utils.h" -#include "onnx_opt/optimize.h" +#include "onnxoptimizer/optimize.h" namespace ONNX_NAMESPACE { namespace py = pybind11; diff --git a/onnx_opt/optimize.cc b/onnxoptimizer/optimize.cc similarity index 97% rename from onnx_opt/optimize.cc rename to onnxoptimizer/optimize.cc index 7b27aebb8..7ba4bf3fb 100644 --- a/onnx_opt/optimize.cc +++ b/onnxoptimizer/optimize.cc @@ -1,7 +1,7 @@ // ATTENTION: The code in this file is highly EXPERIMENTAL. // Adventurous users should note that the APIs will probably change. 
-#include "onnx_opt/optimize.h" +#include "onnxoptimizer/optimize.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/optimize.h b/onnxoptimizer/optimize.h similarity index 94% rename from onnx_opt/optimize.h rename to onnxoptimizer/optimize.h index e579ba911..aefd03f2d 100644 --- a/onnx_opt/optimize.h +++ b/onnxoptimizer/optimize.h @@ -8,8 +8,8 @@ #include "onnx/common/stl_backports.h" #include "onnx/proto_utils.h" -#include "onnx_opt/pass_manager.h" -#include "onnx_opt/pass_registry.h" +#include "onnxoptimizer/pass_manager.h" +#include "onnxoptimizer/pass_registry.h" #include "vector" diff --git a/onnx_opt/pass.cc b/onnxoptimizer/pass.cc similarity index 98% rename from onnx_opt/pass.cc rename to onnxoptimizer/pass.cc index 88ac1ba90..ece7bde3a 100644 --- a/onnx_opt/pass.cc +++ b/onnxoptimizer/pass.cc @@ -1,6 +1,6 @@ #include "onnx/common/assertions.h" -#include "onnx_opt/pass.h" +#include "onnxoptimizer/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/pass.h b/onnxoptimizer/pass.h similarity index 100% rename from onnx_opt/pass.h rename to onnxoptimizer/pass.h diff --git a/onnx_opt/pass_manager.cc b/onnxoptimizer/pass_manager.cc similarity index 97% rename from onnx_opt/pass_manager.cc rename to onnxoptimizer/pass_manager.cc index 0c9cae0a9..2ea5f65d4 100644 --- a/onnx_opt/pass_manager.cc +++ b/onnxoptimizer/pass_manager.cc @@ -1,4 +1,4 @@ -#include "onnx_opt/pass_manager.h" +#include "onnxoptimizer/pass_manager.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/pass_manager.h b/onnxoptimizer/pass_manager.h similarity index 94% rename from onnx_opt/pass_manager.h rename to onnxoptimizer/pass_manager.h index 44a3899b4..7f925acd3 100644 --- a/onnx_opt/pass_manager.h +++ b/onnxoptimizer/pass_manager.h @@ -3,8 +3,8 @@ // Adventurous users should note that the APIs will probably change. 
#include -#include "onnx_opt/pass.h" -#include "onnx_opt/passes/eliminate_deadend.h" +#include "onnxoptimizer/pass.h" +#include "onnxoptimizer/passes/eliminate_deadend.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/pass_registry.cc b/onnxoptimizer/pass_registry.cc similarity index 91% rename from onnx_opt/pass_registry.cc rename to onnxoptimizer/pass_registry.cc index b1c6cc69d..a3d3fd257 100644 --- a/onnx_opt/pass_registry.cc +++ b/onnxoptimizer/pass_registry.cc @@ -1,7 +1,7 @@ // ATTENTION: The code in this file is highly EXPERIMENTAL. // Adventurous users should note that the APIs will probably change. -#include "onnx_opt/pass_registry.h" +#include "onnxoptimizer/pass_registry.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/pass_registry.h b/onnxoptimizer/pass_registry.h similarity index 64% rename from onnx_opt/pass_registry.h rename to onnxoptimizer/pass_registry.h index 5168f2e70..12e274d77 100644 --- a/onnx_opt/pass_registry.h +++ b/onnxoptimizer/pass_registry.h @@ -8,27 +8,27 @@ #include "onnx/common/stl_backports.h" #include "onnx/proto_utils.h" -#include "onnx_opt/passes/eliminate_deadend.h" -#include "onnx_opt/passes/eliminate_identity.h" -#include "onnx_opt/passes/eliminate_nop_dropout.h" -#include "onnx_opt/passes/eliminate_nop_monotone_argmax.h" -#include "onnx_opt/passes/eliminate_nop_pad.h" -#include "onnx_opt/passes/eliminate_nop_transpose.h" -#include "onnx_opt/passes/eliminate_unused_initializer.h" -#include "onnx_opt/passes/extract_constant_to_initializer.h" -#include "onnx_opt/passes/fuse_add_bias_into_conv.h" -#include "onnx_opt/passes/fuse_bn_into_conv.h" -#include "onnx_opt/passes/fuse_consecutive_concats.h" -#include "onnx_opt/passes/fuse_consecutive_log_softmax.h" -#include "onnx_opt/passes/fuse_consecutive_reduce_unsqueeze.h" -#include "onnx_opt/passes/fuse_consecutive_squeezes.h" -#include "onnx_opt/passes/fuse_consecutive_transposes.h" -#include 
"onnx_opt/passes/fuse_matmul_add_bias_into_gemm.h" -#include "onnx_opt/passes/fuse_pad_into_conv.h" -#include "onnx_opt/passes/fuse_transpose_into_gemm.h" -#include "onnx_opt/passes/lift_lexical_references.h" -#include "onnx_opt/passes/nop.h" -#include "onnx_opt/passes/split.h" +#include "onnxoptimizer/passes/eliminate_deadend.h" +#include "onnxoptimizer/passes/eliminate_identity.h" +#include "onnxoptimizer/passes/eliminate_nop_dropout.h" +#include "onnxoptimizer/passes/eliminate_nop_monotone_argmax.h" +#include "onnxoptimizer/passes/eliminate_nop_pad.h" +#include "onnxoptimizer/passes/eliminate_nop_transpose.h" +#include "onnxoptimizer/passes/eliminate_unused_initializer.h" +#include "onnxoptimizer/passes/extract_constant_to_initializer.h" +#include "onnxoptimizer/passes/fuse_add_bias_into_conv.h" +#include "onnxoptimizer/passes/fuse_bn_into_conv.h" +#include "onnxoptimizer/passes/fuse_consecutive_concats.h" +#include "onnxoptimizer/passes/fuse_consecutive_log_softmax.h" +#include "onnxoptimizer/passes/fuse_consecutive_reduce_unsqueeze.h" +#include "onnxoptimizer/passes/fuse_consecutive_squeezes.h" +#include "onnxoptimizer/passes/fuse_consecutive_transposes.h" +#include "onnxoptimizer/passes/fuse_matmul_add_bias_into_gemm.h" +#include "onnxoptimizer/passes/fuse_pad_into_conv.h" +#include "onnxoptimizer/passes/fuse_transpose_into_gemm.h" +#include "onnxoptimizer/passes/lift_lexical_references.h" +#include "onnxoptimizer/passes/nop.h" +#include "onnxoptimizer/passes/split.h" #include #include diff --git a/onnx_opt/passes/eliminate_deadend.h b/onnxoptimizer/passes/eliminate_deadend.h similarity index 97% rename from onnx_opt/passes/eliminate_deadend.h rename to onnxoptimizer/passes/eliminate_deadend.h index db121d694..ea6a9f767 100644 --- a/onnx_opt/passes/eliminate_deadend.h +++ b/onnxoptimizer/passes/eliminate_deadend.h @@ -2,7 +2,7 @@ // ATTENTION: The code in this file is highly EXPERIMENTAL. // Adventurous users should note that the APIs will probably change. 
#pragma once -#include "onnx_opt/pass.h" +#include "onnxoptimizer/pass.h" namespace ONNX_NAMESPACE { namespace optimization { struct EliminateDeadEnd final : public FullGraphBasedPass { diff --git a/onnx_opt/passes/eliminate_identity.h b/onnxoptimizer/passes/eliminate_identity.h similarity index 96% rename from onnx_opt/passes/eliminate_identity.h rename to onnxoptimizer/passes/eliminate_identity.h index 4fae9fc19..a0e75ee78 100644 --- a/onnx_opt/passes/eliminate_identity.h +++ b/onnxoptimizer/passes/eliminate_identity.h @@ -3,7 +3,7 @@ #pragma once -#include "onnx_opt/pass.h" +#include "onnxoptimizer/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/eliminate_nop_dropout.h b/onnxoptimizer/passes/eliminate_nop_dropout.h similarity index 97% rename from onnx_opt/passes/eliminate_nop_dropout.h rename to onnxoptimizer/passes/eliminate_nop_dropout.h index 94ffb9fa3..44132de96 100644 --- a/onnx_opt/passes/eliminate_nop_dropout.h +++ b/onnxoptimizer/passes/eliminate_nop_dropout.h @@ -3,7 +3,7 @@ #pragma once -#include "onnx_opt/pass.h" +#include "onnxoptimizer/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/eliminate_nop_monotone_argmax.h b/onnxoptimizer/passes/eliminate_nop_monotone_argmax.h similarity index 98% rename from onnx_opt/passes/eliminate_nop_monotone_argmax.h rename to onnxoptimizer/passes/eliminate_nop_monotone_argmax.h index 0e3334225..9bca9048b 100644 --- a/onnx_opt/passes/eliminate_nop_monotone_argmax.h +++ b/onnxoptimizer/passes/eliminate_nop_monotone_argmax.h @@ -2,7 +2,7 @@ // Adventurous users should note that the APIs will probably change. 
#pragma once -#include "onnx_opt/pass.h" +#include "onnxoptimizer/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/eliminate_nop_pad.h b/onnxoptimizer/passes/eliminate_nop_pad.h similarity index 98% rename from onnx_opt/passes/eliminate_nop_pad.h rename to onnxoptimizer/passes/eliminate_nop_pad.h index 04441f983..3c74e4a96 100644 --- a/onnx_opt/passes/eliminate_nop_pad.h +++ b/onnxoptimizer/passes/eliminate_nop_pad.h @@ -4,7 +4,7 @@ #pragma once #include "onnx/defs/tensor_util.h" -#include "onnx_opt/pass.h" +#include "onnxoptimizer/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/eliminate_nop_transpose.h b/onnxoptimizer/passes/eliminate_nop_transpose.h similarity index 97% rename from onnx_opt/passes/eliminate_nop_transpose.h rename to onnxoptimizer/passes/eliminate_nop_transpose.h index daad9c8d9..230d07005 100644 --- a/onnx_opt/passes/eliminate_nop_transpose.h +++ b/onnxoptimizer/passes/eliminate_nop_transpose.h @@ -3,7 +3,7 @@ #pragma once -#include "onnx_opt/pass.h" +#include "onnxoptimizer/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/eliminate_unused_initializer.h b/onnxoptimizer/passes/eliminate_unused_initializer.h similarity index 98% rename from onnx_opt/passes/eliminate_unused_initializer.h rename to onnxoptimizer/passes/eliminate_unused_initializer.h index 592dc1cec..aea24201b 100644 --- a/onnx_opt/passes/eliminate_unused_initializer.h +++ b/onnxoptimizer/passes/eliminate_unused_initializer.h @@ -14,7 +14,7 @@ // condition 1: A is not used as any node's input // condition 2: A is not an output -#include "onnx_opt/pass.h" +#include "onnxoptimizer/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/extract_constant_to_initializer.h b/onnxoptimizer/passes/extract_constant_to_initializer.h similarity index 97% rename from onnx_opt/passes/extract_constant_to_initializer.h rename to 
onnxoptimizer/passes/extract_constant_to_initializer.h index 5fafe85a6..6c6607e28 100644 --- a/onnx_opt/passes/extract_constant_to_initializer.h +++ b/onnxoptimizer/passes/extract_constant_to_initializer.h @@ -11,7 +11,7 @@ // this pass can handle the case satisfy all following conditions: // condition 1: A is the output of a Constant node #include "onnx/common/assertions.h" -#include "onnx_opt/pass.h" +#include "onnxoptimizer/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/fuse_add_bias_into_conv.h b/onnxoptimizer/passes/fuse_add_bias_into_conv.h similarity index 99% rename from onnx_opt/passes/fuse_add_bias_into_conv.h rename to onnxoptimizer/passes/fuse_add_bias_into_conv.h index 22a8adb9f..749916aa8 100644 --- a/onnx_opt/passes/fuse_add_bias_into_conv.h +++ b/onnxoptimizer/passes/fuse_add_bias_into_conv.h @@ -16,7 +16,7 @@ #include #include "onnx/common/assertions.h" -#include "onnx_opt/pass.h" +#include "onnxoptimizer/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/fuse_bn_into_conv.h b/onnxoptimizer/passes/fuse_bn_into_conv.h similarity index 99% rename from onnx_opt/passes/fuse_bn_into_conv.h rename to onnxoptimizer/passes/fuse_bn_into_conv.h index 697f9cbd7..564b26c37 100644 --- a/onnx_opt/passes/fuse_bn_into_conv.h +++ b/onnxoptimizer/passes/fuse_bn_into_conv.h @@ -29,7 +29,7 @@ // $$ b' = (b_{conv} - m)\frac{s}{\sqrt{\sigma + \epsilon}} + b_{bn}$$ #include "onnx/common/assertions.h" -#include "onnx_opt/pass.h" +#include "onnxoptimizer/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/fuse_consecutive_concats.h b/onnxoptimizer/passes/fuse_consecutive_concats.h similarity index 98% rename from onnx_opt/passes/fuse_consecutive_concats.h rename to onnxoptimizer/passes/fuse_consecutive_concats.h index 9e42c7530..e9edb8a81 100644 --- a/onnx_opt/passes/fuse_consecutive_concats.h +++ b/onnxoptimizer/passes/fuse_consecutive_concats.h @@ -3,7 
+3,7 @@ #pragma once -#include "onnx_opt/pass.h" +#include "onnxoptimizer/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/fuse_consecutive_log_softmax.h b/onnxoptimizer/passes/fuse_consecutive_log_softmax.h similarity index 97% rename from onnx_opt/passes/fuse_consecutive_log_softmax.h rename to onnxoptimizer/passes/fuse_consecutive_log_softmax.h index 8f732c17f..521cda3ea 100644 --- a/onnx_opt/passes/fuse_consecutive_log_softmax.h +++ b/onnxoptimizer/passes/fuse_consecutive_log_softmax.h @@ -3,7 +3,7 @@ #pragma once -#include "onnx_opt/pass.h" +#include "onnxoptimizer/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/fuse_consecutive_reduce_unsqueeze.h b/onnxoptimizer/passes/fuse_consecutive_reduce_unsqueeze.h similarity index 98% rename from onnx_opt/passes/fuse_consecutive_reduce_unsqueeze.h rename to onnxoptimizer/passes/fuse_consecutive_reduce_unsqueeze.h index 550fb5cc1..8f20247a4 100644 --- a/onnx_opt/passes/fuse_consecutive_reduce_unsqueeze.h +++ b/onnxoptimizer/passes/fuse_consecutive_reduce_unsqueeze.h @@ -3,7 +3,7 @@ #pragma once -#include "onnx_opt/pass.h" +#include "onnxoptimizer/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/fuse_consecutive_squeezes.h b/onnxoptimizer/passes/fuse_consecutive_squeezes.h similarity index 98% rename from onnx_opt/passes/fuse_consecutive_squeezes.h rename to onnxoptimizer/passes/fuse_consecutive_squeezes.h index 2b1c8aa51..0d0d8a493 100644 --- a/onnx_opt/passes/fuse_consecutive_squeezes.h +++ b/onnxoptimizer/passes/fuse_consecutive_squeezes.h @@ -9,7 +9,7 @@ // Z = Squeeze(Y, axes=[0, 4]) -> shape=[2, 3, 5] // After: // Z = Squeeze(X, axes=[0, 1, 4, 6]) -#include "onnx_opt/pass.h" +#include "onnxoptimizer/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/fuse_consecutive_transposes.h b/onnxoptimizer/passes/fuse_consecutive_transposes.h similarity index 98% rename 
from onnx_opt/passes/fuse_consecutive_transposes.h rename to onnxoptimizer/passes/fuse_consecutive_transposes.h index 6b7d58978..764091f69 100644 --- a/onnx_opt/passes/fuse_consecutive_transposes.h +++ b/onnxoptimizer/passes/fuse_consecutive_transposes.h @@ -3,7 +3,7 @@ #pragma once -#include "onnx_opt/pass.h" +#include "onnxoptimizer/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/fuse_matmul_add_bias_into_gemm.h b/onnxoptimizer/passes/fuse_matmul_add_bias_into_gemm.h similarity index 99% rename from onnx_opt/passes/fuse_matmul_add_bias_into_gemm.h rename to onnxoptimizer/passes/fuse_matmul_add_bias_into_gemm.h index 8d093cee1..19f21d2be 100644 --- a/onnx_opt/passes/fuse_matmul_add_bias_into_gemm.h +++ b/onnxoptimizer/passes/fuse_matmul_add_bias_into_gemm.h @@ -17,7 +17,7 @@ #include #include "onnx/common/assertions.h" -#include "onnx_opt/pass.h" +#include "onnxoptimizer/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/fuse_pad_into_conv.h b/onnxoptimizer/passes/fuse_pad_into_conv.h similarity index 99% rename from onnx_opt/passes/fuse_pad_into_conv.h rename to onnxoptimizer/passes/fuse_pad_into_conv.h index 575a199b2..e7adc9312 100644 --- a/onnx_opt/passes/fuse_pad_into_conv.h +++ b/onnxoptimizer/passes/fuse_pad_into_conv.h @@ -15,7 +15,7 @@ #include #include "onnx/defs/tensor_util.h" -#include "onnx_opt/pass.h" +#include "onnxoptimizer/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/fuse_transpose_into_gemm.h b/onnxoptimizer/passes/fuse_transpose_into_gemm.h similarity index 97% rename from onnx_opt/passes/fuse_transpose_into_gemm.h rename to onnxoptimizer/passes/fuse_transpose_into_gemm.h index b9fab13af..20a9a1e37 100644 --- a/onnx_opt/passes/fuse_transpose_into_gemm.h +++ b/onnxoptimizer/passes/fuse_transpose_into_gemm.h @@ -3,7 +3,7 @@ #pragma once -#include "onnx_opt/pass.h" +#include "onnxoptimizer/pass.h" namespace ONNX_NAMESPACE { 
namespace optimization { diff --git a/onnx_opt/passes/lift_lexical_references.h b/onnxoptimizer/passes/lift_lexical_references.h similarity index 99% rename from onnx_opt/passes/lift_lexical_references.h rename to onnxoptimizer/passes/lift_lexical_references.h index 2082c555c..99cf6342f 100644 --- a/onnx_opt/passes/lift_lexical_references.h +++ b/onnxoptimizer/passes/lift_lexical_references.h @@ -1,7 +1,7 @@ #pragma once #include -#include "onnx_opt/pass.h" +#include "onnxoptimizer/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/nop.h b/onnxoptimizer/passes/nop.h similarity index 95% rename from onnx_opt/passes/nop.h rename to onnxoptimizer/passes/nop.h index de71fab65..edc02bae1 100644 --- a/onnx_opt/passes/nop.h +++ b/onnxoptimizer/passes/nop.h @@ -1,6 +1,6 @@ #pragma once -#include "onnx_opt/pass.h" +#include "onnxoptimizer/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/passes/split.h b/onnxoptimizer/passes/split.h similarity index 99% rename from onnx_opt/passes/split.h rename to onnxoptimizer/passes/split.h index c7311201b..81e8a7c2d 100644 --- a/onnx_opt/passes/split.h +++ b/onnxoptimizer/passes/split.h @@ -3,7 +3,7 @@ #pragma once -#include "onnx_opt/pass.h" +#include "onnxoptimizer/pass.h" namespace ONNX_NAMESPACE { namespace optimization { diff --git a/onnx_opt/test/optimizer_test.py b/onnxoptimizer/test/optimizer_test.py similarity index 99% rename from onnx_opt/test/optimizer_test.py rename to onnxoptimizer/test/optimizer_test.py index 6fb1de61a..3dabd5423 100644 --- a/onnx_opt/test/optimizer_test.py +++ b/onnxoptimizer/test/optimizer_test.py @@ -10,7 +10,7 @@ import numpy as np # type: ignore -import onnx_opt +import onnxoptimizer import unittest @@ -18,7 +18,7 @@ class TestOptimizer(unittest.TestCase): def _optimized(self, graph, opts, fixed_point=False, **kwargs): # type: (GraphProto, Sequence[Text], bool, **Any) -> ModelProto orig_model = helper.make_model(graph, 
producer_name='onnx-test', **kwargs) - optimized_model = onnx_opt.optimize(orig_model, opts, fixed_point) + optimized_model = onnxoptimizer.optimize(orig_model, opts, fixed_point) checker.check_model(optimized_model) return optimized_model @@ -90,7 +90,7 @@ def _visit_all_nodes_recursive(self, graph, fn): # type: (GraphProto, Callable[ def test_get_available_passes(self): # type: () -> None # FIXME does not guarantees to be listing all graph = helper.make_graph([], "dummy_graph", [], []) - list_of_passes = onnx_opt.get_available_passes() + list_of_passes = onnxoptimizer.get_available_passes() assert isinstance(list_of_passes, (list)) and len(list_of_passes) > 0 for pass_name in list_of_passes: # If pass_name is invalid it throws a RuntimeError diff --git a/setup.py b/setup.py index 2d18a8b54..4bd207d90 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ TOP_DIR = os.path.realpath(os.path.dirname(__file__)) -SRC_DIR = os.path.join(TOP_DIR, 'onnx_opt') +SRC_DIR = os.path.join(TOP_DIR, 'onnxoptimizer') CMAKE_BUILD_DIR = os.path.join(TOP_DIR, '.setuptools-cmake-build') WINDOWS = (os.name == 'nt') @@ -216,8 +216,8 @@ def run(self): self.run_command('cmake_build') generated_python_files = \ - glob.glob(os.path.join(CMAKE_BUILD_DIR, 'onnx_opt', '*.py')) + \ - glob.glob(os.path.join(CMAKE_BUILD_DIR, 'onnx_opt', '*.pyi')) + glob.glob(os.path.join(CMAKE_BUILD_DIR, 'onnxoptimizer', '*.py')) + \ + glob.glob(os.path.join(CMAKE_BUILD_DIR, 'onnxoptimizer', '*.pyi')) for src in generated_python_files: dst = os.path.join( @@ -252,7 +252,7 @@ def build_extensions(self): elif os.path.exists(release_lib_dir): lib_path = release_lib_dir src = os.path.join(lib_path, filename) - dst = os.path.join(os.path.realpath(self.build_lib), "onnx_opt", filename) + dst = os.path.join(os.path.realpath(self.build_lib), "onnxoptimizer", filename) self.copy_file(src, dst) @@ -281,7 +281,7 @@ def run(self): ext_modules = [ setuptools.Extension( - name=str('onnx_opt.onnx_opt_cpp2py_export'), + 
name=str('onnxoptimizer.onnx_opt_cpp2py_export'), sources=[]) ] @@ -314,7 +314,7 @@ def run(self): ################################################################################ setuptools.setup( - name="onnx_opt", + name="onnxoptimizer", version=VersionInfo.version, description="Open Neural Network Exchange", ext_modules=ext_modules, From aaa3ea4b4bf0b0bf25af9a9108979daafd7b35bb Mon Sep 17 00:00:00 2001 From: daquexian Date: Sat, 29 Aug 2020 21:03:13 +0800 Subject: [PATCH 10/14] add mypy typecheck and ci (copied from onnx/onnx repo) --- .azure-pipelines/Linux-CI.yml | 105 ++++++++++++++++++++++++++++++++ .azure-pipelines/MacOS-CI.yml | 70 +++++++++++++++++++++ .azure-pipelines/Windows-CI.yml | 66 ++++++++++++++++++++ setup.cfg | 73 ++++++++++++++++++++++ tools/mypy-onnx.py | 22 +++++++ 5 files changed, 336 insertions(+) create mode 100644 .azure-pipelines/Linux-CI.yml create mode 100644 .azure-pipelines/MacOS-CI.yml create mode 100644 .azure-pipelines/Windows-CI.yml create mode 100644 setup.cfg create mode 100644 tools/mypy-onnx.py diff --git a/.azure-pipelines/Linux-CI.yml b/.azure-pipelines/Linux-CI.yml new file mode 100644 index 000000000..3fe5610ea --- /dev/null +++ b/.azure-pipelines/Linux-CI.yml @@ -0,0 +1,105 @@ +trigger: +- master + +jobs: +- job: 'Test' + pool: + vmImage: 'Ubuntu-16.04' + strategy: + matrix: + py27: + python.version: '2.7' + onnx_ml: 0 + onnx_debug: 0 + py36: + python.version: '3.6' + onnx_ml: 0 + onnx_debug: 0 + py37: + python.version: '3.7' + onnx_ml: 0 + onnx_debug: 0 + py37-ml: + python.version: '3.7' + onnx_ml: 1 + onnx_debug: 0 + py37-ml-debug: + python.version: '3.7' + onnx_ml: 1 + onnx_debug: 1 + maxParallel: 5 + + steps: + - script: sudo install -d -m 0777 /home/vsts/.conda/envs + displayName: Fix Conda permissions + + - task: CondaEnvironment@1 + inputs: + createCustomEnvironment: true + environmentName: 'py$(python.version)' + packageSpecs: 'python=$(python.version) protobuf' + + - script: | + python -m pip install --upgrade 
pip + sudo apt-get install -qq -o=Dpkg::Use-Pty=0 -y --no-install-recommends dos2unix + python -m pip install numpy + git submodule update --init --recursive + export ONNX_ML=${onnx_ml} + export DEBUG=${onnx_debug} + export ONNX_BUILD_TESTS=0 + export CMAKE_ARGS="-DONNXIFI_DUMMY_BACKEND=ON" + export ONNX_NAMESPACE=ONNX_NAMESPACE_FOO_BAR_FOR_CI + python setup.py --quiet install + displayName: 'Install ONNX and dependencies' + + + - script: | + # lint python code + pip install --quiet flake8 + flake8 + if [ $? -ne 0 ]; then + echo "flake8 returned failures" + exit 1 + fi + + # check line endings to be UNIX + find . -type f -regextype posix-extended -regex '.*\.(py|cpp|md|h|cc|proto|proto3|in)' | xargs dos2unix --quiet + git status + git diff --exit-code + + # Do not hardcode onnx's namespace in the c++ source code, so that + # other libraries who statically link with onnx can hide onnx symbols + # in a private namespace. + ! grep -R --include='*.cc' --include='*.h' 'namespace onnx' . + ! grep -R --include='*.cc' --include='*.h' 'onnx::' . + + # onnx python api tests + if [ "$(python.version)" == "2.7" ]; then + pip install --quiet pytest nbval + else + # pytest 6.0 made deprecation warnings fail by default, pinning pytest to 5.4.3. + # TODO replace deprecated function with the suggested one. https://docs.pytest.org/en/stable/deprecations.html#id5 + pip install --quiet pytest==5.4.3 nbval + fi + + pytest + if [ $? -ne 0 ]; then + echo "pytest failed" + exit 1 + fi + + # Mypy only works with Python 3 + if [ "$(python.version)" != "2.7" ]; then + # Mypy only works with our generated _pb.py files when we install in develop mode, so let's do that + pip uninstall -y onnxoptimizer + ONNX_NAMESPACE=ONNX_NAMESPACE_FOO_BAR_FOR_CI pip install --no-use-pep517 -e .[mypy] + python setup.py --quiet typecheck + if [ $? 
-ne 0 ]; then + echo "type check failed" + exit 1 + fi + pip uninstall -y onnxoptimizer + rm -rf .setuptools-cmake-build + ONNX_NAMESPACE=ONNX_NAMESPACE_FOO_BAR_FOR_CI pip install . + fi + displayName: 'Run ONNX Optimizer tests' diff --git a/.azure-pipelines/MacOS-CI.yml b/.azure-pipelines/MacOS-CI.yml new file mode 100644 index 000000000..09962e438 --- /dev/null +++ b/.azure-pipelines/MacOS-CI.yml @@ -0,0 +1,70 @@ +trigger: +- master + +jobs: +- job: 'Test' + pool: + vmImage: 'macOS-10.14' + strategy: + matrix: + py27: + python.version: '2.7' + onnx_ml: 0 + py36: + python.version: '3.6' + onnx_ml: 0 + py36-onnx-ml: + python.version: '3.6' + onnx_ml: 1 + maxParallel: 3 + + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: '$(python.version)' + + - script: | + git submodule update --init --recursive + python -m pip install --upgrade setuptools + python -m pip install numpy + conda install -y -c conda-forge pybind11 protobuf + brew update + brew install protobuf + export DEBUG=${onnx_debug} + export ONNX_ML=${onnx_ml} + export CMAKE_ARGS="-DONNX_USE_LITE_PROTO=ON -DONNXIFI_DUMMY_BACKEND=ON" + export ONNX_NAMESPACE=ONNX_NAMESPACE_FOO_BAR_FOR_CI + python setup.py --quiet install + displayName: 'Install dependencies and ONNX' + + - script: | + # lint python code + pip install --quiet flake8 + flake8 + if [ $? -ne 0 ]; then + echo "flake8 returned failures" + exit 1 + fi + + # Do not hardcode onnx's namespace in the c++ source code, so that + # other libraries who statically link with onnx can hide onnx symbols + # in a private namespace. + ! grep -R --include='*.cc' --include='*.h' 'namespace onnx' . + ! grep -R --include='*.cc' --include='*.h' 'onnx::' . + + # onnx python api tests + if [ "$(python.version)" == "2.7" ]; then + pip install --quiet pytest nbval + else + # pytest 6.0 made deprecation warnings fail by default, pinning pytest to 5.4.3. + # TODO replace deprecated function with the suggested one. 
https://docs.pytest.org/en/stable/deprecations.html#id5 + pip install --quiet pytest==5.4.3 nbval + fi + + pytest onnxoptimizer + if [ $? -ne 0 ]; then + echo "pytest failed" + exit 1 + fi + + displayName: 'Run ONNX Optimizer Tests' diff --git a/.azure-pipelines/Windows-CI.yml b/.azure-pipelines/Windows-CI.yml new file mode 100644 index 000000000..49b4b40be --- /dev/null +++ b/.azure-pipelines/Windows-CI.yml @@ -0,0 +1,66 @@ +trigger: +- master + +jobs: + +- job: 'Test' + pool: + vmImage: 'vs2017-win2016' + strategy: + matrix: + py37: + python.version: '3.7' + onnx_ml: 0 + onnx_verify_proto: 0 + py36: + python.version: '3.6' + onnx_ml: 0 + onnx_verify_proto: 0 + py37_onnx_ml: + python.version: '3.7' + onnx_ml: 1 + onnx_verify_proto: 0 + py36_verify_proto: + python.version: '3.6' + onnx_ml: 0 + onnx_verify_proto: 1 + maxParallel: 4 + + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: '$(python.version)' + architecture: 'x64' + + - powershell: Write-Host "##vso[task.prependpath]$env:CONDA\Scripts" + displayName: Add conda to PATH + + - script: conda create --yes --quiet --name py$(python.version) -c conda-forge python=$(python.version) numpy libprotobuf=3.11.3 protobuf + displayName: Create Anaconda environment + + - script: | + call activate py$(python.version) + python -m pip install --upgrade pip + # pytest 6.0 made deprecation warnings fail by default, pinning pytest to 5.4.3. + # TODO replace deprecated function with the suggested one. 
https://docs.pytest.org/en/stable/deprecations.html#id5 + python -m pip install --quiet pytest==5.4.3 nbval numpy + + git submodule update --init --recursive + set ONNX_BUILD_TESTS=1 + set ONNX_ML=$(onnx_ml) + set ONNX_VERIFY_PROTO_3=$(onnx_verify_proto) + set USE_MSVC_STATIC_RUNTIME=0 + set CMAKE_ARGS=-DONNX_USE_PROTOBUF_SHARED_LIBS=ON -DProtobuf_USE_STATIC_LIBS=OFF -DONNX_USE_LITE_PROTO=ON + + python setup.py --quiet install + pytest + IF NOT %ERRORLEVEL% EQU 0 ( + @echo "pytest failed" + EXIT 1 + ) + + rm -rf .setuptools-cmake-build + pip install --quiet -e .[mypy] + python setup.py typecheck + + displayName: Install and test ONNX Optimizer diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 000000000..3f74a5bf6 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,73 @@ +[aliases] +test=pytest + +[tool:pytest] +addopts = --nbval --current-env +testpaths = onnxoptimizer/test/ + +[metadata] +license-file = LICENSE + +[flake8] +select = B,C,E,F,P,T4,W,B9 +max-line-length = 80 +### DEFAULT IGNORES FOR 4-space INDENTED PROJECTS ### +# E127, E128 are hard to silence in certain nested formatting situations. +# E265, E266 talk about comment formatting which is too opinionated. +# E402 warns on imports coming after statements. There are important use cases +# like demandimport (https://fburl.com/demandimport) that require statements +# before imports. +# E501 is not flexible enough, we're using B950 instead. +# E722 is a duplicate of B001. +# F405 is hard to silence since we indeed do star import +# P207 is a duplicate of B003. +# P208 is a duplicate of C403. +# W503 talks about operator formatting which is too opinionated. +# F401 clashes with PEP484 requiring us to import types that are only used in +# type comments. 
+ignore = E127, E128, E265, E266, E402, E501, E722, F405, P207, P208, W503, F401 +exclude = + .git, + __pycache__, + build/*, + third_party/* + *_pb2.py, + .cache/* + .eggs + .setuptools-cmake-build/* + +[mypy] +# follow-imports = silent # TODO remove this +mypy_path = stubs:third_party/onnx/third_party/pybind11 +strict_optional = True +warn_return_any = True +warn_no_return = True +# TODO warn_unused_ignores = True +warn_redundant_casts = True +warn_incomplete_stub = True +# TODO disallow_untyped_calls = True +check_untyped_defs = True +disallow_any_generics = True +no_implicit_optional = True +# TODO disallow_incomplete_defs = True +# TODO disallow_subclassing_any = True +disallow_untyped_decorators = True +warn_unused_configs = True + +[mypy-onnxoptimizer.*] +disallow_untyped_defs = True + +[mypy-onnxoptimizer.onnx_opt_cpp2py_export] +ignore_missing_imports = True + +[mypy-onnx.*] +disallow_untyped_defs = True +ignore_missing_imports = True + +[mypy-tools.*] +disallow_untyped_defs = True + +# Ignore errors in setup.py +[mypy-setup] +ignore_errors = True + diff --git a/tools/mypy-onnx.py b/tools/mypy-onnx.py new file mode 100644 index 000000000..ead74291f --- /dev/null +++ b/tools/mypy-onnx.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python + +import subprocess +import os + + +def main(): # type: () -> None + try: + root_folder = os.path.realpath(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + os.chdir(root_folder) + + subprocess.check_call(["mypy", "."]) + subprocess.check_call(["mypy", "--py2", "."]) + + exit(0) + except subprocess.CalledProcessError: + # Catch this exception because we don't want it to output a backtrace that would clutter the mypy output + exit(1) + + +if __name__ == '__main__': + main() From 3311ebf6d937b85ddc148b54341db31f73404c3a Mon Sep 17 00:00:00 2001 From: daquexian Date: Sat, 29 Aug 2020 21:09:26 +0800 Subject: [PATCH 11/14] fix flake8 error --- onnxoptimizer/__init__.py | 7 +- onnxoptimizer/test/optimizer_test.py | 369 
+++++++++++++++++---------- setup.py | 13 +- 3 files changed, 246 insertions(+), 143 deletions(-) diff --git a/onnxoptimizer/__init__.py b/onnxoptimizer/__init__.py index ec8429605..458b0d725 100644 --- a/onnxoptimizer/__init__.py +++ b/onnxoptimizer/__init__.py @@ -39,14 +39,16 @@ get_available_passes = C.get_available_passes -def optimize(model, passes=None, fixed_point=False): # type: (ModelProto, Optional[Sequence[Text]], bool) -> ModelProto +# type: (ModelProto, Optional[Sequence[Text]], bool) -> ModelProto +def optimize(model, passes=None, fixed_point=False): if passes is None: passes = ['eliminate_nop_transpose', 'eliminate_nop_pad', 'fuse_consecutive_transposes', 'fuse_transpose_into_gemm'] if not isinstance(model, ModelProto): - raise ValueError('Optimizer only accepts ModelProto, incorrect type: {}'.format(type(model))) + raise ValueError( + 'Optimizer only accepts ModelProto, incorrect type: {}'.format(type(model))) model_str = model.SerializeToString() if fixed_point: @@ -56,4 +58,5 @@ def optimize(model, passes=None, fixed_point=False): # type: (ModelProto, Optio return onnx.load_from_string(optimized_model_str) + __all__ = ['optimize', 'get_available_passes'] diff --git a/onnxoptimizer/test/optimizer_test.py b/onnxoptimizer/test/optimizer_test.py index 3dabd5423..e77c4751d 100644 --- a/onnxoptimizer/test/optimizer_test.py +++ b/onnxoptimizer/test/optimizer_test.py @@ -16,8 +16,10 @@ class TestOptimizer(unittest.TestCase): - def _optimized(self, graph, opts, fixed_point=False, **kwargs): # type: (GraphProto, Sequence[Text], bool, **Any) -> ModelProto - orig_model = helper.make_model(graph, producer_name='onnx-test', **kwargs) + # type: (GraphProto, Sequence[Text], bool, **Any) -> ModelProto + def _optimized(self, graph, opts, fixed_point=False, **kwargs): + orig_model = helper.make_model( + graph, producer_name='onnx-test', **kwargs) optimized_model = onnxoptimizer.optimize(orig_model, opts, fixed_point) checker.check_model(optimized_model) return 
optimized_model @@ -25,8 +27,10 @@ def _optimized(self, graph, opts, fixed_point=False, **kwargs): # type: (GraphP # input_types and output_types are lists of triples of (name, type, shape) def _make_fake_loop_op(self, body_nodes, # type: Sequence[NodeProto] - input_types, # type: Sequence[Tuple[TensorProto.DataType, Sequence[int], Text]] - output_types # type: Sequence[Tuple[TensorProto.DataType, Sequence[int], Text]] + # type: Sequence[Tuple[TensorProto.DataType, Sequence[int], Text]] + input_types, + # type: Sequence[Tuple[TensorProto.DataType, Sequence[int], Text]] + output_types ): # type: (...) -> List[NodeProto] zero = helper.make_tensor( "trip_count_value", TensorProto.INT64, (), [10]) @@ -62,7 +66,8 @@ def _make_fake_loop_op(self, def _make_fake_if_op(self, true_nodes, # type: Sequence[NodeProto] false_nodes, # type: Sequence[NodeProto] - output_types # type: Sequence[Tuple[TensorProto.DataType, Sequence[int], Text]] + # type: Sequence[Tuple[TensorProto.DataType, Sequence[int], Text]] + output_types ): # type: (...) 
-> List[NodeProto] true = helper.make_tensor("condition", TensorProto.BOOL, (), [True]) true_graph = helper.make_graph(true_nodes, "true_graph", [], []) @@ -77,7 +82,8 @@ def _make_fake_if_op(self, return retval_nodes # fn is a function that takes a single node as argument - def _visit_all_nodes_recursive(self, graph, fn): # type: (GraphProto, Callable[[NodeProto], None]) -> None + # type: (GraphProto, Callable[[NodeProto], None]) -> None + def _visit_all_nodes_recursive(self, graph, fn): for node in graph.node: fn(node) for attr in node.attribute: @@ -217,7 +223,8 @@ def test_nop_pad_opset10(self): # type: () -> None [helper.make_tensor_value_info("X", TensorProto.FLOAT, (2, 3))], [helper.make_tensor_value_info("Y", TensorProto.FLOAT, (2, 3))]) assert len(graph.node) == 1 - optimized_model = self._optimized(graph, ["eliminate_nop_pad"], False, opset_imports=[helper.make_opsetid("", 10)]) + optimized_model = self._optimized( + graph, ["eliminate_nop_pad"], False, opset_imports=[helper.make_opsetid("", 10)]) def check_pad(node): # type: (NodeProto) -> None assert node.op_type != "Pad" @@ -237,9 +244,10 @@ def test_nop_pad_graph_output(self): # type: () -> None helper.make_tensor_value_info("Pads", TensorProto.INT64, (2,))], [helper.make_tensor_value_info("B", TensorProto.FLOAT, (5,))], [helper.make_tensor("Pads", TensorProto.INT64, - dims=(2,), - vals=np.array([0, 0]).astype(np.int64).tobytes(), - raw=True)]) + dims=(2,), + vals=np.array([0, 0]).astype( + np.int64).tobytes(), + raw=True)]) # The existence of shape infos of graoh outputs is checked in _optimized optimized_model = self._optimized(graph, ["eliminate_nop_pad"]) @@ -257,9 +265,10 @@ def test_nop_pad(self): # type: () -> None helper.make_tensor_value_info("Pads", TensorProto.INT64, (4,))], [helper.make_tensor_value_info("Y", TensorProto.FLOAT, (2, 3))], [helper.make_tensor("Pads", TensorProto.INT64, - dims=(4,), - vals=np.array([0, 0, 0, 0]).astype(np.int64).tobytes(), - raw=True)]) + dims=(4,), + 
vals=np.array([0, 0, 0, 0]).astype( + np.int64).tobytes(), + raw=True)]) assert len(graph.node) == 1 optimized_model = self._optimized(graph, ["eliminate_nop_pad"]) @@ -277,7 +286,8 @@ def test_nop_pad_default_opset10(self): # type: () -> None "test", [helper.make_tensor_value_info("X", TensorProto.FLOAT, (2, 3))], [helper.make_tensor_value_info("Y", TensorProto.FLOAT, (2, 4))]) - optimized_model = self._optimized(graph, ["eliminate_nop_pad"], False, opset_imports=[helper.make_opsetid("", 10)]) + optimized_model = self._optimized( + graph, ["eliminate_nop_pad"], False, opset_imports=[helper.make_opsetid("", 10)]) assert len(list(optimized_model.graph.node)) == 1 assert optimized_model.graph.node[0].op_type == "Pad" @@ -291,9 +301,10 @@ def test_nop_pad_default(self): # type: () -> None helper.make_tensor_value_info("Pads", TensorProto.INT64, (4,))], [helper.make_tensor_value_info("Y", TensorProto.FLOAT, (2, 4))], [helper.make_tensor("Pads", TensorProto.INT64, - dims=(4,), - vals=np.array([0, 1, 0, 0]).astype(np.int64).tobytes(), - raw=True)]) + dims=(4,), + vals=np.array([0, 1, 0, 0]).astype( + np.int64).tobytes(), + raw=True)]) optimized_model = self._optimized(graph, ["eliminate_nop_pad"]) assert len(list(optimized_model.graph.node)) == 1 @@ -337,7 +348,8 @@ def test_eliminate_unused_initializer_input(self): # type: () -> None assert len(list(optimized_model.graph.initializer)) == 0 assert len(optimized_model.graph.input) == 2 - def test_eliminate_unused_initializer_no_eliminate_used_default(self): # type: () -> None + # type: () -> None + def test_eliminate_unused_initializer_no_eliminate_used_default(self): add = helper.make_node("Add", ["X", "A"], ["Z"]) graph = helper.make_graph( [add], @@ -355,7 +367,8 @@ def test_eliminate_unused_initializer_no_eliminate_used_default(self): # type: assert len(list(optimized_model.graph.initializer)) == 1 - def test_eliminate_unused_initializer_no_eliminate_used(self): # type: () -> None + # type: () -> None + def 
test_eliminate_unused_initializer_no_eliminate_used(self): nodes = [helper.make_node("Add", ["X", "A"], ["Z"])] nodes.extend(self._make_fake_loop_op( [helper.make_node("Add", ["_X", "_A"], ["_Z2"])], @@ -387,7 +400,8 @@ def test_eliminate_unused_initializer_no_eliminate_used(self): # type: () -> No assert len(list(optimized_model.graph.initializer)) == 1 - def test_eliminate_unused_initializer_no_eliminate_output(self): # type: () -> None + # type: () -> None + def test_eliminate_unused_initializer_no_eliminate_output(self): add = helper.make_node("Add", ["X", "Y"], ["Z"]) graph = helper.make_graph( [add], @@ -448,12 +462,12 @@ def test_fuse_concats(self): # type: () -> None nodes, "test", [helper.make_tensor_value_info("A", TensorProto.FLOAT, (2, 3, 4)), - helper.make_tensor_value_info("B", TensorProto.FLOAT, (4, 3, 4)), - helper.make_tensor_value_info("C", TensorProto.FLOAT, (2, 3, 4)), - helper.make_tensor_value_info("D", TensorProto.FLOAT, (4, 3, 4)), - helper.make_tensor_value_info("E", TensorProto.FLOAT, (2, 3, 4)), - helper.make_tensor_value_info("F", TensorProto.FLOAT, (4, 3, 4)), - helper.make_tensor_value_info("G", TensorProto.FLOAT, (4, 3, 4))], + helper.make_tensor_value_info("B", TensorProto.FLOAT, (4, 3, 4)), + helper.make_tensor_value_info("C", TensorProto.FLOAT, (2, 3, 4)), + helper.make_tensor_value_info("D", TensorProto.FLOAT, (4, 3, 4)), + helper.make_tensor_value_info("E", TensorProto.FLOAT, (2, 3, 4)), + helper.make_tensor_value_info("F", TensorProto.FLOAT, (4, 3, 4)), + helper.make_tensor_value_info("G", TensorProto.FLOAT, (4, 3, 4))], [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (18, 3, 4))]) optimized_model = self._optimized( graph, ["fuse_consecutive_concats"], True) # two passes are needed to simplify the graph to its simplest state. 
@@ -472,11 +486,11 @@ def test_fuse_concats_different_axis(self): # type: () -> None nodes, "test", [helper.make_tensor_value_info("A", TensorProto.FLOAT, (2, 3, 4)), - helper.make_tensor_value_info("B", TensorProto.FLOAT, (4, 3, 4)), - helper.make_tensor_value_info("C", TensorProto.FLOAT, (2, 3, 4)), - helper.make_tensor_value_info("D", TensorProto.FLOAT, (4, 3, 4)), - helper.make_tensor_value_info("E", TensorProto.FLOAT, (4, 3, 4)), - helper.make_tensor_value_info("F", TensorProto.FLOAT, (4, 3, 4))], + helper.make_tensor_value_info("B", TensorProto.FLOAT, (4, 3, 4)), + helper.make_tensor_value_info("C", TensorProto.FLOAT, (2, 3, 4)), + helper.make_tensor_value_info("D", TensorProto.FLOAT, (4, 3, 4)), + helper.make_tensor_value_info("E", TensorProto.FLOAT, (4, 3, 4)), + helper.make_tensor_value_info("F", TensorProto.FLOAT, (4, 3, 4))], [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (18, 3, 4))]) optimized_model = self._optimized( graph, ["fuse_consecutive_concats"], True) # two passes are needed to simplify the graph to its simplest state. 
@@ -521,7 +535,8 @@ def test_fuse_transpose_default_graph_output(self): # type: () -> None helper.make_tensor_value_info("Y", TensorProto.FLOAT, (2, 3))], [helper.make_tensor_value_info("C", TensorProto.FLOAT, (2, 3))]) # The existence of shape infos of graoh outputs is checked in _optimized - optimized_model = self._optimized(graph, ["fuse_consecutive_transposes"]) + optimized_model = self._optimized( + graph, ["fuse_consecutive_transposes"]) def check_transpose(node): # type: (NodeProto) -> None assert node.op_type != "Transpose" @@ -618,7 +633,8 @@ def test_fuse_add_bias_into_conv_use_weight_shape(self): # type: () -> None # Output 1 since 0 is 'cond' assert optimized_model.graph.node[4].attribute[0].g.output[1].name == '_Z' - def test_fuse_add_bias_into_conv_use_weight_shape_with_tile(self): # type: () -> None + # type: () -> None + def test_fuse_add_bias_into_conv_use_weight_shape_with_tile(self): conv = helper.make_node("Conv", ["X", "Y"], ["Z"]) add = helper.make_node("Add", ["Z", "A"], ["B"]) graph = helper.make_graph( @@ -674,7 +690,8 @@ def test_fuse_add_bias_into_conv_use_conv_shape(self): # type: () -> None assert len( optimized_model.graph.output[0].type.tensor_type.shape.dim) == 4 - def test_fuse_add_bias_into_conv_use_move_constant(self): # type: () -> None + # type: () -> None + def test_fuse_add_bias_into_conv_use_move_constant(self): conv = helper.make_node("Conv", ["X", "Y"], ["Z"]) constant = helper.make_node("Constant", [], ["A"], value=helper.make_tensor( @@ -706,7 +723,8 @@ def test_fuse_add_bias_into_conv_use_move_constant(self): # type: () -> None assert len( optimized_model.graph.output[0].type.tensor_type.shape.dim) == 4 - def test_fuse_add_bias_into_conv_squeeze_1d_bias_no_fuse(self): # type: () -> None + # type: () -> None + def test_fuse_add_bias_into_conv_squeeze_1d_bias_no_fuse(self): conv = helper.make_node("Conv", ["X", "Y"], ["Z"]) add = helper.make_node("Add", ["Z", "A"], ["B"]) graph = helper.make_graph( @@ -729,7 +747,8 @@ def 
test_fuse_add_bias_into_conv_squeeze_1d_bias_no_fuse(self): # type: () -> N assert optimized_model.graph.node[0].op_type == 'Conv' assert optimized_model.graph.node[1].op_type == 'Add' - def test_fuse_add_bias_into_conv_squeeze_3d_bias_no_fuse(self): # type: () -> None + # type: () -> None + def test_fuse_add_bias_into_conv_squeeze_3d_bias_no_fuse(self): conv = helper.make_node("Conv", ["X", "Y"], ["Z"]) add = helper.make_node("Add", ["Z", "A"], ["B"]) graph = helper.make_graph( @@ -752,7 +771,8 @@ def test_fuse_add_bias_into_conv_squeeze_3d_bias_no_fuse(self): # type: () -> N assert optimized_model.graph.node[0].op_type == 'Conv' assert optimized_model.graph.node[1].op_type == 'Add' - def test_fuse_add_bias_into_conv_squeeze_4d_bias_no_fuse(self): # type: () -> None + # type: () -> None + def test_fuse_add_bias_into_conv_squeeze_4d_bias_no_fuse(self): conv = helper.make_node("Conv", ["X", "Y"], ["Z"]) add = helper.make_node("Add", ["Z", "A"], ["B"]) graph = helper.make_graph( @@ -782,7 +802,8 @@ def test_fuse_matmul_add_bias_into_gemm(self): # type: () -> None helper.make_tensor_value_info("B", TensorProto.FLOAT, (16,))], [helper.make_tensor_value_info("A", TensorProto.FLOAT, (32, 16))] ) - optimized_model = self._optimized(graph, ["fuse_matmul_add_bias_into_gemm"]) + optimized_model = self._optimized( + graph, ["fuse_matmul_add_bias_into_gemm"]) assert len(list(optimized_model.graph.node)) == 1 assert optimized_model.graph.node[0].op_type == "Gemm" @@ -798,12 +819,14 @@ def test_fuse_matmul_add_bias_into_gemm_2d_bias(self): # type: () -> None helper.make_tensor_value_info("B", TensorProto.FLOAT, (1, 16))], [helper.make_tensor_value_info("A", TensorProto.FLOAT, (32, 16))] ) - optimized_model = self._optimized(graph, ["fuse_matmul_add_bias_into_gemm"]) + optimized_model = self._optimized( + graph, ["fuse_matmul_add_bias_into_gemm"]) assert len(list(optimized_model.graph.node)) == 1 assert optimized_model.graph.node[0].op_type == "Gemm" - def 
test_fuse_matmul_add_bias_into_gemm_2d_bias_same_shape(self): # type: () -> None + # type: () -> None + def test_fuse_matmul_add_bias_into_gemm_2d_bias_same_shape(self): matmul = helper.make_node("MatMul", ["X", "Y"], ["Z"]) add = helper.make_node("Add", ["Z", "B"], ["A"]) graph = helper.make_graph( @@ -814,12 +837,14 @@ def test_fuse_matmul_add_bias_into_gemm_2d_bias_same_shape(self): # type: () -> helper.make_tensor_value_info("B", TensorProto.FLOAT, (32, 16))], [helper.make_tensor_value_info("A", TensorProto.FLOAT, (32, 16))] ) - optimized_model = self._optimized(graph, ["fuse_matmul_add_bias_into_gemm"]) + optimized_model = self._optimized( + graph, ["fuse_matmul_add_bias_into_gemm"]) assert len(list(optimized_model.graph.node)) == 1 assert optimized_model.graph.node[0].op_type == "Gemm" - def test_fuse_matmul_add_bias_into_gemm_2d_bias_bcast_no_fuse(self): # type: () -> None + # type: () -> None + def test_fuse_matmul_add_bias_into_gemm_2d_bias_bcast_no_fuse(self): matmul = helper.make_node("MatMul", ["X", "Y"], ["Z"]) add = helper.make_node("Add", ["Z", "B"], ["A"]) graph = helper.make_graph( @@ -830,11 +855,13 @@ def test_fuse_matmul_add_bias_into_gemm_2d_bias_bcast_no_fuse(self): # type: () helper.make_tensor_value_info("B", TensorProto.FLOAT, (16, 16))], [helper.make_tensor_value_info("A", TensorProto.FLOAT, (16, 16))] ) - optimized_model = self._optimized(graph, ["fuse_matmul_add_bias_into_gemm"]) + optimized_model = self._optimized( + graph, ["fuse_matmul_add_bias_into_gemm"]) assert optimized_model.graph == graph - def test_fuse_matmul_add_bias_into_gemm_3d_matmul_no_fuse(self): # type: () -> None + # type: () -> None + def test_fuse_matmul_add_bias_into_gemm_3d_matmul_no_fuse(self): matmul = helper.make_node("MatMul", ["X", "Y"], ["Z"]) add = helper.make_node("Add", ["Z", "B"], ["A"]) graph = helper.make_graph( @@ -845,11 +872,13 @@ def test_fuse_matmul_add_bias_into_gemm_3d_matmul_no_fuse(self): # type: () -> helper.make_tensor_value_info("B", 
TensorProto.FLOAT, (3, 3))], [helper.make_tensor_value_info("A", TensorProto.FLOAT, (2, 3, 3))] ) - optimized_model = self._optimized(graph, ["fuse_matmul_add_bias_into_gemm"]) + optimized_model = self._optimized( + graph, ["fuse_matmul_add_bias_into_gemm"]) assert optimized_model.graph == graph - def test_fuse_matmul_add_bias_into_gemm_3d_bias_no_fuse(self): # type: () -> None + # type: () -> None + def test_fuse_matmul_add_bias_into_gemm_3d_bias_no_fuse(self): matmul = helper.make_node("MatMul", ["X", "Y"], ["Z"]) add = helper.make_node("Add", ["Z", "B"], ["A"]) graph = helper.make_graph( @@ -860,11 +889,13 @@ def test_fuse_matmul_add_bias_into_gemm_3d_bias_no_fuse(self): # type: () -> No helper.make_tensor_value_info("B", TensorProto.FLOAT, (4, 1, 16))], [helper.make_tensor_value_info("A", TensorProto.FLOAT, (32, 16))] ) - optimized_model = self._optimized(graph, ["fuse_matmul_add_bias_into_gemm"]) + optimized_model = self._optimized( + graph, ["fuse_matmul_add_bias_into_gemm"]) assert optimized_model.graph == graph - def test_fuse_matmul_add_bias_into_gemm_multiple_use_no_fuse(self): # type: () -> None + # type: () -> None + def test_fuse_matmul_add_bias_into_gemm_multiple_use_no_fuse(self): matmul = helper.make_node("MatMul", ["X", "Y"], ["Z"]) identity = helper.make_node("Identity", ["Z"], ["A1"]) add = helper.make_node("Add", ["Z", "B"], ["A2"]) @@ -877,11 +908,13 @@ def test_fuse_matmul_add_bias_into_gemm_multiple_use_no_fuse(self): # type: () [helper.make_tensor_value_info("A1", TensorProto.FLOAT, (32, 16)), helper.make_tensor_value_info("A2", TensorProto.FLOAT, (32, 16))] ) - optimized_model = self._optimized(graph, ["fuse_matmul_add_bias_into_gemm"]) + optimized_model = self._optimized( + graph, ["fuse_matmul_add_bias_into_gemm"]) assert optimized_model.graph == graph - def test_fuse_pad_into_conv_no_optional_value_opset10(self): # type: () -> None + # type: () -> None + def test_fuse_pad_into_conv_no_optional_value_opset10(self): pad = helper.make_node( 
"Pad", ["X"], @@ -895,14 +928,17 @@ def test_fuse_pad_into_conv_no_optional_value_opset10(self): # type: () -> None "test", [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 2, 2)), helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 3, 3))], - [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1, 1))] + [helper.make_tensor_value_info( + "Z", TensorProto.FLOAT, (1, 16, 1, 1))] ) - optimized_model = self._optimized(graph, ["fuse_pad_into_conv"], False, opset_imports=[helper.make_opsetid("", 10)]) + optimized_model = self._optimized( + graph, ["fuse_pad_into_conv"], False, opset_imports=[helper.make_opsetid("", 10)]) assert len(list(optimized_model.graph.node)) == 1 assert optimized_model.graph.node[0].op_type == "Conv" assert optimized_model.graph.node[0].attribute[0].name == "pads" - assert list(optimized_model.graph.node[0].attribute[0].ints) == [0, 0, 1, 1] + assert list(optimized_model.graph.node[0].attribute[0].ints) == [ + 0, 0, 1, 1] def test_fuse_pad_into_conv_no_optional_value(self): # type: () -> None pad = helper.make_node( @@ -918,17 +954,20 @@ def test_fuse_pad_into_conv_no_optional_value(self): # type: () -> None [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 2, 2)), helper.make_tensor_value_info("Pads", TensorProto.INT64, (8,)), helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 3, 3))], - [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1, 1))], + [helper.make_tensor_value_info( + "Z", TensorProto.FLOAT, (1, 16, 1, 1))], [helper.make_tensor("Pads", TensorProto.INT64, - dims=(8,), - vals=np.array([0, 0, 0, 0, 0, 0, 1, 1]).astype(np.int64).tobytes(), - raw=True)]) + dims=(8,), + vals=np.array([0, 0, 0, 0, 0, 0, 1, 1]).astype( + np.int64).tobytes(), + raw=True)]) optimized_model = self._optimized(graph, ["fuse_pad_into_conv"]) assert len(list(optimized_model.graph.node)) == 1 assert optimized_model.graph.node[0].op_type == "Conv" assert 
optimized_model.graph.node[0].attribute[0].name == "pads" - assert list(optimized_model.graph.node[0].attribute[0].ints) == [0, 0, 1, 1] + assert list(optimized_model.graph.node[0].attribute[0].ints) == [ + 0, 0, 1, 1] def test_fuse_pad_into_conv_with_optional_value(self): # type: () -> None pad = helper.make_node( @@ -943,25 +982,30 @@ def test_fuse_pad_into_conv_with_optional_value(self): # type: () -> None "test", [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 2, 2)), helper.make_tensor_value_info("Pads", TensorProto.INT64, (8,)), - helper.make_tensor_value_info("Constant_value", TensorProto.FLOAT, ()), + helper.make_tensor_value_info( + "Constant_value", TensorProto.FLOAT, ()), helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 3, 3))], - [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1, 1))], + [helper.make_tensor_value_info( + "Z", TensorProto.FLOAT, (1, 16, 1, 1))], [helper.make_tensor("Pads", TensorProto.INT64, - dims=(8,), - vals=np.array([0, 0, 0, 0, 0, 0, 1, 1]).astype(np.int64).tobytes(), - raw=True), + dims=(8,), + vals=np.array([0, 0, 0, 0, 0, 0, 1, 1]).astype( + np.int64).tobytes(), + raw=True), helper.make_tensor("Constant_value", TensorProto.FLOAT, - dims=(), - vals=np.array([0]).astype(np.float32).tobytes(), - raw=True)]) + dims=(), + vals=np.array([0]).astype(np.float32).tobytes(), + raw=True)]) optimized_model = self._optimized(graph, ["fuse_pad_into_conv"]) assert len(list(optimized_model.graph.node)) == 1 assert optimized_model.graph.node[0].op_type == "Conv" assert optimized_model.graph.node[0].attribute[0].name == "pads" - assert list(optimized_model.graph.node[0].attribute[0].ints) == [0, 0, 1, 1] + assert list(optimized_model.graph.node[0].attribute[0].ints) == [ + 0, 0, 1, 1] - def test_fuse_pad_into_conv_with_nonzero_optional_value(self): # type: () -> None + # type: () -> None + def test_fuse_pad_into_conv_with_nonzero_optional_value(self): pad = helper.make_node( "Pad", ["X", "Pads", 
"Constant_value"], @@ -974,17 +1018,22 @@ def test_fuse_pad_into_conv_with_nonzero_optional_value(self): # type: () -> No "test", [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 2, 2)), helper.make_tensor_value_info("Pads", TensorProto.INT64, (8,)), - helper.make_tensor_value_info("Constant_value", TensorProto.FLOAT, ()), + helper.make_tensor_value_info( + "Constant_value", TensorProto.FLOAT, ()), helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 3, 3))], - [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1, 1))], + [helper.make_tensor_value_info( + "Z", TensorProto.FLOAT, (1, 16, 1, 1))], [helper.make_tensor("Pads", TensorProto.INT64, - dims=(8,), - vals=np.array([0, 0, 0, 0, 0, 0, 1, 1]).astype(np.int64).tobytes(), - raw=True), + dims=(8,), + vals=np.array([0, 0, 0, 0, 0, 0, 1, 1]).astype( + np.int64).tobytes(), + raw=True), helper.make_tensor("Constant_value", TensorProto.FLOAT, - dims=(), - vals=np.array([25]).astype(np.float32).tobytes(), # non-zero Constant_value -> so no pad - raw=True)]) + dims=(), + # non-zero Constant_value -> so no pad + vals=np.array([25]).astype( + np.float32).tobytes(), + raw=True)]) optimized_model = self._optimized(graph, ["fuse_pad_into_conv"]) assert optimized_model.graph == graph @@ -1005,7 +1054,8 @@ def test_fuse_pad_into_conv_1d_opset10(self): # type: () -> None helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 32))], [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1))] ) - optimized_model = self._optimized(graph, ["fuse_pad_into_conv"], False, opset_imports=[helper.make_opsetid("", 10)]) + optimized_model = self._optimized( + graph, ["fuse_pad_into_conv"], False, opset_imports=[helper.make_opsetid("", 10)]) assert len(list(optimized_model.graph.node)) == 1 assert optimized_model.graph.node[0].op_type == "Conv" @@ -1028,9 +1078,10 @@ def test_fuse_pad_into_conv_1d(self): # type: () -> None helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 32))], 
[helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1))], [helper.make_tensor("Pads", TensorProto.INT64, - dims=(6,), - vals=np.array([0, 0, 1, 0, 0, 1]).astype(np.int64).tobytes(), - raw=True)]) + dims=(6,), + vals=np.array([0, 0, 1, 0, 0, 1]).astype( + np.int64).tobytes(), + raw=True)]) optimized_model = self._optimized(graph, ["fuse_pad_into_conv"]) assert len(list(optimized_model.graph.node)) == 1 @@ -1038,7 +1089,8 @@ def test_fuse_pad_into_conv_1d(self): # type: () -> None assert optimized_model.graph.node[0].attribute[0].name == "pads" assert list(optimized_model.graph.node[0].attribute[0].ints) == [1, 1] - def test_fuse_pad_into_conv_existing_conv_pad_opset10(self): # type: () -> None + # type: () -> None + def test_fuse_pad_into_conv_existing_conv_pad_opset10(self): pad = helper.make_node( "Pad", ["X"], @@ -1057,14 +1109,17 @@ def test_fuse_pad_into_conv_existing_conv_pad_opset10(self): # type: () -> None "test", [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 2, 2)), helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 4, 4))], - [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1, 1))] + [helper.make_tensor_value_info( + "Z", TensorProto.FLOAT, (1, 16, 1, 1))] ) - optimized_model = self._optimized(graph, ["fuse_pad_into_conv"], False, opset_imports=[helper.make_opsetid("", 10)]) + optimized_model = self._optimized( + graph, ["fuse_pad_into_conv"], False, opset_imports=[helper.make_opsetid("", 10)]) assert len(list(optimized_model.graph.node)) == 1 assert optimized_model.graph.node[0].op_type == "Conv" assert optimized_model.graph.node[0].attribute[0].name == "pads" - assert list(optimized_model.graph.node[0].attribute[0].ints) == [1, 1, 1, 1] + assert list(optimized_model.graph.node[0].attribute[0].ints) == [ + 1, 1, 1, 1] def test_fuse_pad_into_conv_existing_conv_pad(self): # type: () -> None pad = helper.make_node( @@ -1085,19 +1140,23 @@ def test_fuse_pad_into_conv_existing_conv_pad(self): # type: () -> 
None [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 2, 2)), helper.make_tensor_value_info("Pads", TensorProto.INT64, (8,)), helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 4, 4))], - [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1, 1))], + [helper.make_tensor_value_info( + "Z", TensorProto.FLOAT, (1, 16, 1, 1))], [helper.make_tensor("Pads", TensorProto.INT64, - dims=(8,), - vals=np.array([0, 0, 0, 0, 0, 0, 1, 1]).astype(np.int64).tobytes(), - raw=True)]) + dims=(8,), + vals=np.array([0, 0, 0, 0, 0, 0, 1, 1]).astype( + np.int64).tobytes(), + raw=True)]) optimized_model = self._optimized(graph, ["fuse_pad_into_conv"]) assert len(list(optimized_model.graph.node)) == 1 assert optimized_model.graph.node[0].op_type == "Conv" assert optimized_model.graph.node[0].attribute[0].name == "pads" - assert list(optimized_model.graph.node[0].attribute[0].ints) == [1, 1, 1, 1] + assert list(optimized_model.graph.node[0].attribute[0].ints) == [ + 1, 1, 1, 1] - def test_fuse_pad_into_conv_pad_feature_no_fuse_opset10(self): # type: () -> None + # type: () -> None + def test_fuse_pad_into_conv_pad_feature_no_fuse_opset10(self): pad = helper.make_node( "Pad", ["X"], @@ -1111,9 +1170,11 @@ def test_fuse_pad_into_conv_pad_feature_no_fuse_opset10(self): # type: () -> No "test", [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 4, 3, 3)), helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 3, 3))], - [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1, 1))] + [helper.make_tensor_value_info( + "Z", TensorProto.FLOAT, (1, 16, 1, 1))] ) - optimized_model = self._optimized(graph, ["fuse_pad_into_conv"], False, opset_imports=[helper.make_opsetid("", 10)]) + optimized_model = self._optimized( + graph, ["fuse_pad_into_conv"], False, opset_imports=[helper.make_opsetid("", 10)]) assert optimized_model.graph == graph @@ -1131,16 +1192,19 @@ def test_fuse_pad_into_conv_pad_feature_no_fuse(self): # type: () -> None 
[helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 4, 3, 3)), helper.make_tensor_value_info("Pads", TensorProto.INT64, (8,)), helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 3, 3))], - [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1, 1))], + [helper.make_tensor_value_info( + "Z", TensorProto.FLOAT, (1, 16, 1, 1))], [helper.make_tensor("Pads", TensorProto.INT64, - dims=(8,), - vals=np.array([0, 1, 0, 0, 0, 0, 0, 0]).astype(np.int64).tobytes(), - raw=True)]) + dims=(8,), + vals=np.array([0, 1, 0, 0, 0, 0, 0, 0]).astype( + np.int64).tobytes(), + raw=True)]) optimized_model = self._optimized(graph, ["fuse_pad_into_conv"]) assert optimized_model.graph == graph - def test_fuse_pad_into_conv_negative_pad_no_fuse_opset10(self): # type: () -> None + # type: () -> None + def test_fuse_pad_into_conv_negative_pad_no_fuse_opset10(self): pad = helper.make_node( "Pad", ["X"], @@ -1154,9 +1218,11 @@ def test_fuse_pad_into_conv_negative_pad_no_fuse_opset10(self): # type: () -> N "test", [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 4, 4)), helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 3, 3))], - [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1, 1))] + [helper.make_tensor_value_info( + "Z", TensorProto.FLOAT, (1, 16, 1, 1))] ) - optimized_model = self._optimized(graph, ["fuse_pad_into_conv"], False, opset_imports=[helper.make_opsetid("", 10)]) + optimized_model = self._optimized( + graph, ["fuse_pad_into_conv"], False, opset_imports=[helper.make_opsetid("", 10)]) assert optimized_model.graph == graph @@ -1174,16 +1240,19 @@ def test_fuse_pad_into_conv_negative_pad_no_fuse(self): # type: () -> None [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 4, 4)), helper.make_tensor_value_info("Pads", TensorProto.INT64, (8,)), helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 3, 3))], - [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1, 1))], + 
[helper.make_tensor_value_info( + "Z", TensorProto.FLOAT, (1, 16, 1, 1))], [helper.make_tensor("Pads", TensorProto.INT64, - dims=(8,), - vals=np.array([0, 0, 0, 0, 0, 0, -1, -1]).astype(np.int64).tobytes(), - raw=True)]) + dims=(8,), + vals=np.array( + [0, 0, 0, 0, 0, 0, -1, -1]).astype(np.int64).tobytes(), + raw=True)]) optimized_model = self._optimized(graph, ["fuse_pad_into_conv"]) assert optimized_model.graph == graph - def test_fuse_pad_into_conv_reflection_pad_no_fuse_opset10(self): # type: () -> None + # type: () -> None + def test_fuse_pad_into_conv_reflection_pad_no_fuse_opset10(self): pad = helper.make_node( "Pad", ["X"], @@ -1197,13 +1266,16 @@ def test_fuse_pad_into_conv_reflection_pad_no_fuse_opset10(self): # type: () -> "test", [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 2, 2)), helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 3, 3))], - [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1, 1))] + [helper.make_tensor_value_info( + "Z", TensorProto.FLOAT, (1, 16, 1, 1))] ) - optimized_model = self._optimized(graph, ["fuse_pad_into_conv"], False, opset_imports=[helper.make_opsetid("", 10)]) + optimized_model = self._optimized( + graph, ["fuse_pad_into_conv"], False, opset_imports=[helper.make_opsetid("", 10)]) assert optimized_model.graph == graph - def test_fuse_pad_into_conv_reflection_pad_no_fuse(self): # type: () -> None + # type: () -> None + def test_fuse_pad_into_conv_reflection_pad_no_fuse(self): pad = helper.make_node( "Pad", ["X", "Pads"], @@ -1217,11 +1289,13 @@ def test_fuse_pad_into_conv_reflection_pad_no_fuse(self): # type: () -> None [helper.make_tensor_value_info("X", TensorProto.FLOAT, (1, 5, 2, 2)), helper.make_tensor_value_info("Pads", TensorProto.INT64, (8,)), helper.make_tensor_value_info("Y", TensorProto.FLOAT, (16, 5, 3, 3))], - [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (1, 16, 1, 1))], + [helper.make_tensor_value_info( + "Z", TensorProto.FLOAT, (1, 16, 1, 1))], 
[helper.make_tensor("Pads", TensorProto.INT64, - dims=(8,), - vals=np.array([0, 0, 0, 0, 0, 0, 1, 1]).astype(np.int64).tobytes(), - raw=True)]) + dims=(8,), + vals=np.array([0, 0, 0, 0, 0, 0, 1, 1]).astype( + np.int64).tobytes(), + raw=True)]) optimized_model = self._optimized(graph, ["fuse_pad_into_conv"]) assert optimized_model.graph == graph @@ -1348,7 +1422,8 @@ def test_fuse_consecutive_softmax_log_side_effect(self): # type: () -> None assert graph == optimized_model.graph - def test_fuse_consecutive_softmax_log_multiple_out(self): # type: () -> None + # type: () -> None + def test_fuse_consecutive_softmax_log_multiple_out(self): softmax = helper.make_node("Softmax", ["X"], ["Y"], axis=2) log = helper.make_node("Log", ["Y"], ["Z"]) exp = helper.make_node("Exp", ["Z"], ["Z1"]) @@ -1539,7 +1614,8 @@ def test_deadend_elimination_simple(self): # type: () -> None def test_deadend_elimination_simple_fixed(self): # type: () -> None self._internal_test_deadend_elimination(True) - def test_eliminate_nop_monotone_argmax_basic_no_node_axis(self): # type: () -> None + # type: () -> None + def test_eliminate_nop_monotone_argmax_basic_no_node_axis(self): for node_name in ["Log", "Exp", "Sqrt"]: for axis in range(3): node = helper.make_node(node_name, ["X"], ["Y"]) @@ -1559,12 +1635,15 @@ def test_eliminate_nop_monotone_argmax_basic_no_node_axis(self): # type: () -> assert optimized_model.graph.node[0].attribute[0].name == "axis" assert optimized_model.graph.node[0].attribute[0].i == axis - def test_eliminate_nop_monotone_argmax_basic_with_node_axis(self): # type: () -> None + # type: () -> None + def test_eliminate_nop_monotone_argmax_basic_with_node_axis(self): for node_name in ["Softmax", "LogSoftmax"]: for axis_n in range(3): for axis_max in range(3): - node = helper.make_node(node_name, ["X"], ["Y"], axis=axis_n) - argmax = helper.make_node("ArgMax", ["Y"], ["Z"], axis=axis_max) + node = helper.make_node( + node_name, ["X"], ["Y"], axis=axis_n) + argmax = 
helper.make_node( + "ArgMax", ["Y"], ["Z"], axis=axis_max) graph = helper.make_graph( [node, argmax], "test", @@ -1583,7 +1662,8 @@ def test_eliminate_nop_monotone_argmax_basic_with_node_axis(self): # type: () - else: assert optimized_model.graph == graph - def test_eliminate_nop_monotone_argmax_multiple_out(self): # type: () -> None + # type: () -> None + def test_eliminate_nop_monotone_argmax_multiple_out(self): for node_name in ["Log", "Exp", "Sqrt"]: for axis in range(3): node = helper.make_node(node_name, ["X"], ["Y"]) @@ -1600,8 +1680,10 @@ def test_eliminate_nop_monotone_argmax_multiple_out(self): # type: () -> None graph, ["eliminate_nop_monotone_argmax"]) assert optimized_model.graph == graph - def test_eliminate_nop_monotone_argmax_consecutive(self): # type: () -> None - def _assertion(graph, optimized_model, axis_aligned, true_axis): # type: (GraphProto, ModelProto, bool, int) -> None + # type: () -> None + def test_eliminate_nop_monotone_argmax_consecutive(self): + # type: (GraphProto, ModelProto, bool, int) -> None + def _assertion(graph, optimized_model, axis_aligned, true_axis): if axis_aligned: assert len(optimized_model.graph.output) == 1 assert len(optimized_model.graph.node) == 1 @@ -1617,7 +1699,8 @@ def _assertion(graph, optimized_model, axis_aligned, true_axis): # type: (Graph for axis in range(3): node = helper.make_node(node_name_0, ["X"], ["Y"]) node2 = helper.make_node(node_name_1, ["Y"], ["Y1"]) - argmax = helper.make_node("ArgMax", ["Y1"], ["Z"], axis=axis) + argmax = helper.make_node( + "ArgMax", ["Y1"], ["Z"], axis=axis) graph = helper.make_graph( [node, node2, argmax], "test", @@ -1633,8 +1716,10 @@ def _assertion(graph, optimized_model, axis_aligned, true_axis): # type: (Graph for axis_0 in range(3): for axis_1 in range(3): node = helper.make_node(node_name_0, ["X"], ["Y"]) - node2 = helper.make_node(node_name_1, ["Y"], ["Y1"], axis=axis_0) - argmax = helper.make_node("ArgMax", ["Y1"], ["Z"], axis=axis_1) + node2 = helper.make_node( 
+ node_name_1, ["Y"], ["Y1"], axis=axis_0) + argmax = helper.make_node( + "ArgMax", ["Y1"], ["Z"], axis=axis_1) graph = helper.make_graph( [node, node2, argmax], "test", @@ -1643,16 +1728,20 @@ def _assertion(graph, optimized_model, axis_aligned, true_axis): # type: (Graph [helper.make_tensor_value_info("Z", TensorProto.FLOAT, (5, 7, 11))]) optimized_model = self._optimized( graph, ["eliminate_nop_monotone_argmax"], True) - _assertion(graph, optimized_model, axis_0 == axis_1, axis_1) + _assertion(graph, optimized_model, + axis_0 == axis_1, axis_1) # axis X axis test for node_name_0 in ["Softmax", "LogSoftmax"]: for node_name_1 in ["Softmax", "LogSoftmax"]: for axis_0 in range(3): for axis_1 in range(3): for axis_2 in range(3): - node = helper.make_node(node_name_0, ["X"], ["Y"], axis=axis_0) - node2 = helper.make_node(node_name_1, ["Y"], ["Y1"], axis=axis_1) - argmax = helper.make_node("ArgMax", ["Y1"], ["Z"], axis=axis_2) + node = helper.make_node( + node_name_0, ["X"], ["Y"], axis=axis_0) + node2 = helper.make_node( + node_name_1, ["Y"], ["Y1"], axis=axis_1) + argmax = helper.make_node( + "ArgMax", ["Y1"], ["Z"], axis=axis_2) graph = helper.make_graph( [node, node2, argmax], "test", @@ -1662,7 +1751,8 @@ def _assertion(graph, optimized_model, axis_aligned, true_axis): # type: (Graph optimized_model = self._optimized( graph, ["eliminate_nop_monotone_argmax"], True) if axis_0 == axis_1: # we can reduce both of the monotonic ops - _assertion(graph, optimized_model, axis_1 == axis_2, axis_2) + _assertion(graph, optimized_model, + axis_1 == axis_2, axis_2) elif axis_1 == axis_2: # we can reduce one of the monotonic ops assert len(optimized_model.graph.output) == 1 assert len(optimized_model.graph.node) == 2 @@ -1689,7 +1779,8 @@ def test_eliminate_nop_dropout(self): # type: () -> None # even when it';s an optional parameter (defaults to 0) assert optimized_model.graph == graph - def test_eliminate_nop_dropout_opset11_graph_output(self): # type: () -> None + # type: () 
-> None + def test_eliminate_nop_dropout_opset11_graph_output(self): node = helper.make_node("Log", ["X"], ["Y"]) node1 = helper.make_node("Dropout", ["Y"], ["Z"], ratio=0.0) graph = helper.make_graph( @@ -1726,12 +1817,15 @@ def test_eliminate_nop_dropout_opset11(self): # type: () -> None assert optimized_model.graph.node[0].op_type == "Log" def test_fuse_reduction_unsqueeze(self): # type: () -> None - def _calculate_post_transform_shape(input_shape, reduction_axes, unsqueeze_axes, keepdim): # type: (Tuple[int, ...], List[int], List[int], bool) -> Tuple[int, ...] + # type: (Tuple[int, ...], List[int], List[int], bool) -> Tuple[int, ...] + def _calculate_post_transform_shape(input_shape, reduction_axes, unsqueeze_axes, keepdim): post_reduce_shape = None if keepdim: - post_reduce_shape = tuple([(x if i not in reduction_axes else 1) for i, x in enumerate(input_shape)]) + post_reduce_shape = tuple( + [(x if i not in reduction_axes else 1) for i, x in enumerate(input_shape)]) else: - post_reduce_shape = tuple([x for i, x in enumerate(input_shape) if i not in reduction_axes]) + post_reduce_shape = tuple( + [x for i, x in enumerate(input_shape) if i not in reduction_axes]) post_unsqueeze_shape = list(post_reduce_shape) for ax in unsqueeze_axes: post_unsqueeze_shape.insert(ax, 1) @@ -1744,9 +1838,12 @@ def _calculate_post_transform_shape(input_shape, reduction_axes, unsqueeze_axes, for axes2 in [[1], [1, 2], [2]]: for keepdim in [False, True]: input_shape = (5, 7, 9) - output_shape = _calculate_post_transform_shape(input_shape, axes1, axes2, keepdim) # type: Tuple[int, ...] - node = helper.make_node(reduction, ["X"], ["Y"], axes=axes1, keepdims=keepdim) - node1 = helper.make_node("Unsqueeze", ["Y"], ["Z"], axes=axes2) + output_shape = _calculate_post_transform_shape( + input_shape, axes1, axes2, keepdim) # type: Tuple[int, ...] 
+ node = helper.make_node( + reduction, ["X"], ["Y"], axes=axes1, keepdims=keepdim) + node1 = helper.make_node( + "Unsqueeze", ["Y"], ["Z"], axes=axes2) graph = helper.make_graph( [node, node1], "test", @@ -1765,10 +1862,10 @@ def _calculate_post_transform_shape(input_shape, reduction_axes, unsqueeze_axes, assert optimized_model.graph.node[-1].op_type == reduction assert optimized_model.graph.node[-1].attribute[0].name == "axes" assert optimized_model.graph.node[-1].attribute[0].ints == axes1 - optimized_output_shape = tuple(x.dim_value for x in optimized_model.graph.output[0].type.tensor_type.shape.dim) + optimized_output_shape = tuple( + x.dim_value for x in optimized_model.graph.output[0].type.tensor_type.shape.dim) assert optimized_output_shape == output_shape if __name__ == '__main__': unittest.main() - diff --git a/setup.py b/setup.py index 4bd207d90..fbacf560d 100644 --- a/setup.py +++ b/setup.py @@ -164,7 +164,8 @@ def run(self): '-DBUILD_ONNX_PYTHON=ON', '-DCMAKE_EXPORT_COMPILE_COMMANDS=ON', '-DONNX_NAMESPACE={}'.format(ONNX_NAMESPACE), - '-DPY_EXT_SUFFIX={}'.format(sysconfig.get_config_var('EXT_SUFFIX') or ''), + '-DPY_EXT_SUFFIX={}'.format( + sysconfig.get_config_var('EXT_SUFFIX') or ''), ] if COVERAGE: cmake_args.append('-DONNX_COVERAGE=ON') @@ -178,7 +179,8 @@ def run(self): # we need to link with libpython on windows, so # passing python version to window in order to # find python in cmake - '-DPY_VERSION={}'.format('{0}.{1}'.format(*sys.version_info[:2])), + '-DPY_VERSION={}'.format('{0}.{1}'.format(* \ + sys.version_info[:2])), ]) if USE_MSVC_STATIC_RUNTIME: cmake_args.append('-DONNX_USE_MSVC_STATIC_RUNTIME=ON') @@ -252,7 +254,8 @@ def build_extensions(self): elif os.path.exists(release_lib_dir): lib_path = release_lib_dir src = os.path.join(lib_path, filename) - dst = os.path.join(os.path.realpath(self.build_lib), "onnxoptimizer", filename) + dst = os.path.join(os.path.realpath( + self.build_lib), "onnxoptimizer", filename) self.copy_file(src, dst) 
@@ -261,7 +264,8 @@ class mypy_type_check(ONNXCommand): def run(self): """Run command.""" - onnx_script = os.path.realpath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "tools/mypy-onnx.py")) + onnx_script = os.path.realpath(os.path.join( + os.path.dirname(os.path.abspath(__file__)), "tools/mypy-onnx.py")) returncode = subprocess.call([sys.executable, onnx_script]) sys.exit(returncode) @@ -330,4 +334,3 @@ def run(self): author_email='onnx-technical-discuss@lists.lfai.foundation', url='https://github.com/onnx/optimizer', ) - From 755c49d42d4dee6b1776e1ada14f9b3599ee5469 Mon Sep 17 00:00:00 2001 From: daquexian Date: Sat, 29 Aug 2020 21:11:54 +0800 Subject: [PATCH 12/14] update ci yaml --- .azure-pipelines/Linux-CI.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.azure-pipelines/Linux-CI.yml b/.azure-pipelines/Linux-CI.yml index 3fe5610ea..acfb56034 100644 --- a/.azure-pipelines/Linux-CI.yml +++ b/.azure-pipelines/Linux-CI.yml @@ -64,8 +64,6 @@ jobs: # check line endings to be UNIX find . 
-type f -regextype posix-extended -regex '.*\.(py|cpp|md|h|cc|proto|proto3|in)' | xargs dos2unix --quiet - git status - git diff --exit-code # Do not hardcode onnx's namespace in the c++ source code, so that # other libraries who statically link with onnx can hide onnx symbols From e527581167bfb7d69bb43f6e2a4cb562c928a243 Mon Sep 17 00:00:00 2001 From: daquexian Date: Sun, 6 Sep 2020 19:12:05 +0800 Subject: [PATCH 13/14] fix misplaced type annotations --- onnxoptimizer/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/onnxoptimizer/__init__.py b/onnxoptimizer/__init__.py index 458b0d725..c3213be4a 100644 --- a/onnxoptimizer/__init__.py +++ b/onnxoptimizer/__init__.py @@ -39,8 +39,7 @@ get_available_passes = C.get_available_passes -# type: (ModelProto, Optional[Sequence[Text]], bool) -> ModelProto -def optimize(model, passes=None, fixed_point=False): +def optimize(model, passes=None, fixed_point=False): # type: (ModelProto, Optional[Sequence[Text]], bool) -> ModelProto if passes is None: passes = ['eliminate_nop_transpose', 'eliminate_nop_pad', From 0613c2047c1c03c9cc060c1cd4ac9536974f3517 Mon Sep 17 00:00:00 2001 From: daquexian Date: Sun, 6 Sep 2020 20:07:52 +0800 Subject: [PATCH 14/14] set c++ standard to c++11 --- CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index ef6fcc930..2a3038964 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,11 @@ cmake_minimum_required(VERSION 3.1) project(onnx_optimizer C CXX) +# Set C++11 as standard for the whole project +if(NOT MSVC) + set(CMAKE_CXX_STANDARD 11) +endif(NOT MSVC) + set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(ONNX_ROOT ${PROJECT_SOURCE_DIR}/third_party/onnx)