diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap index 889aecfa84b04c..aceec5106c56d1 100644 --- a/aten/src/ATen/Declarations.cwrap +++ b/aten/src/ATen/Declarations.cwrap @@ -3266,7 +3266,9 @@ name: alias return: THTensor* cpu_half: True - variants: [function] + variants: + - method + - function options: - cname: newWithTensor arguments: diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp index d7f04e2f261d7c..21a823cb73701f 100644 --- a/aten/src/ATen/SparseTensorImpl.cpp +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -22,18 +22,18 @@ namespace { // a scalar and have one element) // // Thus, an empty sparse tensor should be a 1-dimensional tensor of size [0]. -// Furthermore, we have dim == sparseDims + denseDims; since this is a sparse -// tensor, let us say that an empty sparse tensor has sparseDims == 1 and -// denseDims == 0. (There is a degree of freedom here, but given that this -// is a sparse dimension, it seems reasonable to demand that sparseDims > 0). +// Furthermore, we have dim == sparse_dim + dense_dim; since this is a sparse +// tensor, let us say that an empty sparse tensor has sparse_dim == 1 and +// dense_dim == 0. (There is a degree of freedom here, but given that this +// is a sparse dimension, it seems reasonable to demand that sparse_dim > 0). // // This means that we allocate a [1,0] size indices tensor and a [0] size // values tensor for such an empty tensor. SparseTensorImpl::SparseTensorImpl(at::TensorTypeId type_id, const caffe2::TypeMeta& data_type) : TensorImpl(type_id, data_type, nullptr, false) , size_{0} - , sparseDims_(1) - , denseDims_(0) + , sparse_dim_(1) + , dense_dim_(0) , indices_(at::empty({1, 0}, at::initialTensorOptions().device(sparseTensorIdToDeviceType(type_id)).dtype(ScalarType::Long))) , values_(at::empty({0}, at::initialTensorOptions().device(sparseTensorIdToDeviceType(type_id)).dtype(dataTypeToScalarType(data_type.id())))) {} @@ -67,7 +67,7 @@ void SparseTensorImpl::set_storage_offset(int64_t storage_offset) { } int64_t SparseTensorImpl::dim() const { - return sparseDims_ + denseDims_; + return sparse_dim_ + dense_dim_; } TensorImpl* SparseTensorImpl::maybe_zero_dim(bool condition_when_zero_dim) { AT_CHECK(condition_when_zero_dim == (dim() == 0), @@ -83,17 +83,22 @@ int64_t SparseTensorImpl::storage_offset() const { AT_ERROR("sparse tensors do not have storage"); } void SparseTensorImpl::set_indices_and_values_unsafe(const Tensor& indices, const Tensor& values) { + AT_ASSERT(!indices.is_variable() && !values.is_variable()); // They should be plain tensors! 
+ + AT_CHECK(!indices.is_sparse(), "expected indices to be a dense tensor, but got indices of layout ", indices.layout()); + AT_CHECK(!values.is_sparse(), "expected values to be a dense tensor, but got values of layout ", values.layout()); + AT_CHECK(values.type().toSparse() == type(), "values type must match sparse tensor type"); AT_CHECK(indices.type().scalarType() == kLong, "indices must be an int64 tensor"); AT_CHECK(indices.type().backend() == values.type().backend(), "backend of indices (", indices.type().backend(), ") must match backend of values (", values.type().backend(), ")"); AT_CHECK(!indices.is_cuda() || indices.get_device() == values.get_device(), "device of indices (", indices.get_device(), ") must match device of values (", values.get_device(), ")"); - AT_CHECK(indices.dim() == 2, "indices must be nDim x nnz, but got: ", indices.sizes()); + AT_CHECK(indices.dim() == 2, "indices must be sparse_dim x nnz, but got: ", indices.sizes()); AT_CHECK(indices.size(1) == values.size(0), "indices and values must have same nnz, but got nnz from indices: ", indices.size(1), ", nnz from values: ", values.size(0)); - AT_CHECK(indices.size(0) == sparseDims_, "indices has incorrect first dimension, expected ", sparseDims_, ", got ", indices.size(0)); - AT_CHECK(values.dim() == denseDims_ + 1, "values has incorrect number of dimensions, expected ", denseDims_ + 1, ", got ", values.dim()); + AT_CHECK(indices.size(0) == sparse_dim_, "indices has incorrect first dimension, expected ", sparse_dim_, ", got ", indices.size(0)); + AT_CHECK(values.dim() == dense_dim_ + 1, "values has incorrect number of dimensions, expected ", dense_dim_ + 1, ", got ", values.dim()); - auto dense_size_original = sizes().slice(sparseDims_); + auto dense_size_original = sizes().slice(sparse_dim_); std::vector expected_values_size_vec = {values.size(0)}; expected_values_size_vec.insert(expected_values_size_vec.end(), dense_size_original.begin(), dense_size_original.end()); IntList expected_values_size(expected_values_size_vec); diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index 9a68f80a912b61..1a8fa8b29baeaf 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -9,18 +9,18 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { // Stored in COO format, indices + values. // INVARIANTS: - // _sparseDims: range [0, len(shape)]; _sparseDims + _denseDims = len(shape) - // _denseDims : range [0, len(shape)]; _sparseDims + _denseDims = len(shape) - // _indices.shape: dimensionality: 2, shape: (_sparseDims, nnz) - // _values.shape: dimensionality: 1 + _denseDims. shape: (nnz, shape[_sparseDims:]) + // sparse_dim: range [0, len(shape)]; sparse_dim + dense_dim = len(shape) + // dense_dim : range [0, len(shape)]; sparse_dim + dense_dim = len(shape) + // _indices.shape: dimensionality: 2, shape: (sparse_dim, nnz) + // _values.shape: dimensionality: 1 + dense_dim. shape: (nnz, shape[sparse_dim:]) // The true size of the sparse tensor (e.g., if you called to_dense() // on it). When THTensor merges into TensorImpl, this field // should move to the parent class. 
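// For example, a sparse tensor with size_ == [3, 4, 5], sparse_dim_ == 2 and
// dense_dim_ == 1 that holds nnz == 6 entries stores indices_ of shape [2, 6]
// and values_ of shape [6, 5].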
std::vector size_; - int64_t sparseDims_ = 0; // number of sparse dimensions - int64_t denseDims_ = 0; // number of dense dimensions + int64_t sparse_dim_ = 0; // number of sparse dimensions + int64_t dense_dim_ = 0; // number of dense dimensions Tensor indices_; // always a LongTensor Tensor values_; @@ -39,8 +39,8 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { explicit SparseTensorImpl(at::TensorTypeId, const caffe2::TypeMeta&); int64_t nnz() const { return values_.size(0); } - int64_t sparseDims() const { return sparseDims_; } - int64_t denseDims() const { return denseDims_; } + int64_t sparse_dim() const { return sparse_dim_; } + int64_t dense_dim() const { return dense_dim_; } bool coalesced() const { return coalesced_; } Tensor indices() const { return indices_; } Tensor values() const { return values_; } @@ -60,16 +60,16 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { const Storage& storage() const override; int64_t storage_offset() const override; - // WARNING: This function does NOT preserve invariants of sparseDims/denseDims with + // WARNING: This function does NOT preserve invariants of sparse_dim/dense_dim with // respect to indices and values - void raw_resize_(int64_t sparseDims, int64_t denseDims, IntList size) { + void raw_resize_(int64_t sparse_dim, int64_t dense_dim, IntList size) { size_ = size.vec(); - sparseDims_ = sparseDims; - denseDims_ = denseDims; + sparse_dim_ = sparse_dim; + dense_dim_ = dense_dim; refresh_numel(); } - // NOTE: This function preserves invariants of sparseDims/denseDims with respect to + // NOTE: This function preserves invariants of sparse_dim/dense_dim with respect to // indices and values. // // NOTE: This function supports the following cases: @@ -91,36 +91,36 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { // and for API consistency we don't support it). // 4. When we attempt to shrink the size of any of the sparse dimensions on a non-empty sparse tensor // (this could make some of the stored indices out-of-bound and thus unsafe). - void resize_(int64_t sparseDims, int64_t denseDims, IntList size) { - AT_CHECK(sparseDims + denseDims == size.size(), "number of dimensions must be sparseDims (", sparseDims, ") + denseDims (", denseDims, "), but got ", size.size()); + void resize_(int64_t sparse_dim, int64_t dense_dim, IntList size) { + AT_CHECK(sparse_dim + dense_dim == size.size(), "number of dimensions must be sparse_dim (", sparse_dim, ") + dense_dim (", dense_dim, "), but got ", size.size()); if (nnz() > 0) { auto alt_options_msg = "You could try the following options:\n\ -1. If you need an empty sparse tensor of this size, call `x=torch.sparse_coo_tensor(size)`.\n\ +1. If you need an empty sparse tensor of this size, call `x = torch.sparse_coo_tensor(size)`.\n\ 2. If you need to resize this tensor, you have the following options:\n\ 1. For both sparse and dense dimensions, keep the number of them constant and the size of them non-shrinking, and then try the same call again.\n\ 2. 
Or, create a new sparse tensor with the correct indices and values from this sparse tensor."; - AT_CHECK(sparseDims == sparseDims_, - "changing the number of sparse dimensions (from ", sparseDims_, " to ", sparseDims, ") on a non-empty sparse tensor is not supported.\n", alt_options_msg); + AT_CHECK(sparse_dim == sparse_dim_, + "changing the number of sparse dimensions (from ", sparse_dim_, " to ", sparse_dim, ") on a non-empty sparse tensor is not supported.\n", alt_options_msg); - AT_CHECK(denseDims == denseDims_, - "changing the number of dense dimensions (from ", denseDims_, " to ", denseDims, ") on a non-empty sparse tensor is not supported.\n", alt_options_msg); + AT_CHECK(dense_dim == dense_dim_, + "changing the number of dense dimensions (from ", dense_dim_, " to ", dense_dim, ") on a non-empty sparse tensor is not supported.\n", alt_options_msg); bool shrinking_sparse_dims = false; - bool shrinking_dense_dims = false; - auto sparse_size_original = sizes().slice(0, sparseDims); - auto sparse_size_new = size.slice(0, sparseDims); - for (int i = 0; i < sparseDims; i++) { + bool shrinking_dense_dim = false; + auto sparse_size_original = sizes().slice(0, sparse_dim); + auto sparse_size_new = size.slice(0, sparse_dim); + for (int i = 0; i < sparse_dim; i++) { if (sparse_size_new[i] < sparse_size_original[i]) { shrinking_sparse_dims = true; break; } } - auto dense_size_original = sizes().slice(sparseDims); - auto dense_size_new = size.slice(sparseDims); - for (int i = 0; i < denseDims; i++) { + auto dense_size_original = sizes().slice(sparse_dim); + auto dense_size_new = size.slice(sparse_dim); + for (int i = 0; i < dense_dim; i++) { if (dense_size_new[i] < dense_size_original[i]) { - shrinking_dense_dims = true; + shrinking_dense_dim = true; break; } } @@ -128,38 +128,36 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { AT_CHECK(!shrinking_sparse_dims, "shrinking the size of sparse dimensions (from ", sparse_size_original, " to ", sparse_size_new, ") on a non-empty sparse tensor is not supported.\n", alt_options_msg); - AT_CHECK(!shrinking_dense_dims, + AT_CHECK(!shrinking_dense_dim, "shrinking the size of dense dimensions (from ", dense_size_original, " to ", dense_size_new, ") on a non-empty sparse tensor is not supported.\n", alt_options_msg); } - if ((!size.equals(size_)) || (sparseDims != sparseDims_) || (denseDims != denseDims_)) { - std::vector values_size = {values().size(0)}; - auto dense_size = size.slice(sparseDims); + if ((!size.equals(size_)) || (sparse_dim != sparse_dim_) || (dense_dim != dense_dim_)) { + auto nnz = values().size(0); + std::vector values_size = {nnz}; + auto dense_size = size.slice(sparse_dim); values_size.insert(values_size.end(), dense_size.begin(), dense_size.end()); values_.resize_(values_size); - - std::vector indices_size = indices().sizes().vec(); - indices_size[0] = sparseDims; - indices_.resize_(indices_size); + indices_.resize_({sparse_dim, nnz}); } size_ = size.vec(); - sparseDims_ = sparseDims; - denseDims_ = denseDims; + sparse_dim_ = sparse_dim; + dense_dim_ = dense_dim; refresh_numel(); } // NOTE: this function will resize the sparse tensor and also set `indices` and `values` to empty. 
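// For example, resize_and_clear_(1, 1, {4, 3}) leaves the tensor with size [4, 3],
// an indices tensor of shape [1, 0], and a values tensor of shape [0, 3].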
- void resize_and_clear_(int64_t sparseDims, int64_t denseDims, IntList size) { - AT_CHECK(sparseDims + denseDims == size.size(), "number of dimensions must be sparseDims (", sparseDims, ") + denseDims (", denseDims, "), but got ", size.size()); + void resize_and_clear_(int64_t sparse_dim, int64_t dense_dim, IntList size) { + AT_CHECK(sparse_dim + dense_dim == size.size(), "number of dimensions must be sparse_dim (", sparse_dim, ") + dense_dim (", dense_dim, "), but got ", size.size()); size_ = size.vec(); - sparseDims_ = sparseDims; - denseDims_ = denseDims; + sparse_dim_ = sparse_dim; + dense_dim_ = dense_dim; - auto empty_indices = at::empty({sparseDims, 0}, indices().options()); + auto empty_indices = at::empty({sparse_dim, 0}, indices().options()); std::vector values_size = {0}; - auto dense_size = sizes().slice(sparseDims); + auto dense_size = sizes().slice(sparse_dim); values_size.insert(values_size.end(), dense_size.begin(), dense_size.end()); auto empty_values = at::empty(values_size, values().options()); set_indices_and_values_unsafe(empty_indices, empty_values); @@ -169,9 +167,10 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { void set_coalesced(bool coalesced) { coalesced_ = coalesced; } // NOTE: this function is only used internally and not exposed to Python frontend - void set_nnz_and_narrow(int64_t nnz) { - indices_ = indices_.narrow(1, 0, nnz); - values_ = values_.narrow(0, 0, nnz); + void set_nnz_and_narrow(int64_t new_nnz) { + AT_ASSERT(new_nnz <= nnz()); + indices_ = indices_.narrow(1, 0, new_nnz); + values_ = values_.narrow(0, 0, new_nnz); } // Takes indices and values and directly puts them into the sparse tensor, no copy. diff --git a/aten/src/ATen/SparseTensorUtils.h b/aten/src/ATen/SparseTensorUtils.h new file mode 100644 index 00000000000000..7bf2e938bc8440 --- /dev/null +++ b/aten/src/ATen/SparseTensorUtils.h @@ -0,0 +1,111 @@ +#include +#include + +namespace at { namespace sparse { + +// Just for documentary purposes +using SparseTensor = Tensor; +using LongTensor = Tensor; +using IntTensor = Tensor; +using SparseType = Type; + +// This is an internal utility function for getting at the SparseTensorImpl, +// so that we can write sparse tensor specific accessors for special fields +// in SparseTensor. You should only use this for writing low level +// setters/getters for SparseTensorImpl fields; otherwise, you should use +// the low level setters/getters that were implemented using this. +// +// This may be called repeatedly, so make sure it's pretty cheap. +inline SparseTensorImpl* get_sparse_impl(const SparseTensor& self) { + AT_ASSERTM(!self.is_variable(), "_internal_get_SparseTensorImpl: should not be a variable"); + AT_ASSERTM(self.is_sparse(), "_internal_get_SparseTensorImpl: not a sparse tensor"); + return static_cast(self.unsafeGetTensorImpl()); +} + +// Port of the old THCSTensor_(checkGPU), but it doesn't really belong here +// because it is more general +// NB: I dropped kernelP2PEnabled support +// NB: This only works if the tensors are KNOWN to be CUDA. +// TODO: Generalize it so it works on CPU as well +inline bool check_device(ArrayRef ts) { + if (ts.empty()) { + return true; + } + int64_t curDevice = current_device(); + for (const Tensor& t : ts) { + if (t.get_device() != curDevice) return false; + } + return true; +} + +// Takes indices and values and directly puts them into the sparse tensor, no +// copy. 
This used to be called THSTensor_(_move)
+inline void alias_into_sparse(const SparseTensor& self, const LongTensor& indices, const Tensor& values) {
+ get_sparse_impl(self)->set_indices_and_values_unsafe(indices, values);
+}
+
+// Takes indices and values and makes a (data) copy of them to put into the sparse
+// indices/values. This used to be called THSTensor_(_set)
+inline void copy_into_sparse(const SparseTensor& self, const LongTensor& indices, const Tensor& values, bool non_blocking) {
+ alias_into_sparse(self, self._indices().type().copy(indices, non_blocking), self._values().type().copy(values, non_blocking));
+}
+
+// TODO: put this into the public API
+inline bool is_same_tensor(const Tensor& lhs, const Tensor& rhs) {
+ return lhs.unsafeGetTensorImpl() == rhs.unsafeGetTensorImpl();
+}
+
+inline bool is_same_density(const SparseTensor& self, const SparseTensor& src) {
+ return self.sparse_dim() == src.sparse_dim() && self.dense_dim() == src.dense_dim();
+}
+
+// Give us a new values tensor, with the same dimensionality
+// as 'values' but with a new number of non-zero elements.
+// TODO: Expose this for real in ATen, some day?
+// NB: Doesn't preserve data.
+inline Tensor new_values_with_size_of(const Tensor& values, int64_t nnz) {
+ std::vector<int64_t> size = values.sizes().vec();
+ size[0] = nnz;
+ return at::empty(size, values.options());
+}
+
+// This helper function flattens a sparse indices tensor (a LongTensor) into a 1D
+// indices tensor. E.g.,
+// input = [[2, 4, 0],
+// [3, 1, 10]]
+// full_size = [2, 12]
+// output = [ 2 * 12 + 3, 4 * 12 + 1, 0 * 12 + 10 ] = [27, 49, 10]
+//
+// In other words, assuming that each `indices[i, :]` is a valid index into a
+// tensor `t` of shape `full_size`, this returns the corresponding indices into
+// the flattened tensor `t.reshape( prod(full_size[:indices.size(0)]), -1 )`.
+// If force_clone is true, the result will be forced to be a clone of self.
+inline LongTensor flatten_indices(const Tensor& indices, IntList full_size, bool force_clone = false) {
+ int64_t sparse_dim = indices.size(0);
+ if (sparse_dim == 1) {
+ if (force_clone) {
+ return indices.squeeze(0).clone();
+ } else {
+ return indices.squeeze(0);
+ }
+ } else {
+ std::vector<int64_t> indices_mult_cpu_vec;
+ indices_mult_cpu_vec.resize(sparse_dim);
+ int64_t mult = 1;
+ for (int64_t i = sparse_dim - 1; i >= 0; i--) {
+ indices_mult_cpu_vec[i] = mult;
+ mult *= full_size[i];
+ }
+ auto indices_mult_cpu = indices.type().cpu()
+ .tensorFromBlob(indices_mult_cpu_vec.data(), /*size=*/{sparse_dim, 1});
+ // NB: must be blocking because this blob may be freed after this closure,
+ // and non_blocking copy will see garbage.
+ auto indices_mult = indices_mult_cpu.to(indices.device(), /*non_blocking=*/false);
+ // Ideally we want matmul but matmul is slow on CPU Long and not implemented
+ // on CUDA Long. So mul is faster.
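+ // For the example above, indices_mult is [[12], [1]], so mul gives
+ // [[24, 48, 0], [3, 1, 10]] and sum(0) gives [27, 49, 10].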
+ return indices.mul(indices_mult).sum(0); + } +} + +}} // namespace at::sparse diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h index e94b2a4152c513..8271e5b1de7e17 100644 --- a/aten/src/ATen/core/Tensor.h +++ b/aten/src/ATen/core/Tensor.h @@ -404,6 +404,7 @@ class CAFFE2_API Tensor { Tensor & log_normal_(double mean=1, double std=2, Generator * generator=nullptr); Tensor & exponential_(double lambd=1, Generator * generator=nullptr); Tensor & geometric_(double p, Generator * generator=nullptr); + Tensor alias() const; Tensor abs() const; Tensor & abs_(); Tensor acos() const; @@ -621,17 +622,22 @@ class CAFFE2_API Tensor { Tensor & sub_(Scalar other, Scalar alpha=1); Tensor addmm(const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; Tensor & addmm_(const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1); - Tensor & sparse_resize_(IntList size, int64_t sparseDims, int64_t denseDims); - Tensor & sparse_resize_and_clear_(IntList size, int64_t sparseDims, int64_t denseDims); + Tensor & sparse_resize_(IntList size, int64_t sparse_dim, int64_t dense_dim); + Tensor & sparse_resize_and_clear_(IntList size, int64_t sparse_dim, int64_t dense_dim); Tensor sparse_mask(SparseTensorRef mask) const; Tensor to_dense() const; - int64_t _sparseDims() const; - int64_t _denseDims() const; + int64_t sparse_dim() const; + int64_t _dimI() const; + int64_t dense_dim() const; + int64_t _dimV() const; int64_t _nnz() const; Tensor coalesce() const; bool is_coalesced() const; Tensor _indices() const; Tensor _values() const; + Tensor & _coalesced_(bool coalesced); + Tensor indices() const; + Tensor values() const; int64_t numel() const; std::vector unbind(int64_t dim=0) const; int64_t get_device() const; diff --git a/aten/src/ATen/core/TensorMethods.h b/aten/src/ATen/core/TensorMethods.h index ba5044963c0fad..27e5a718d16870 100644 --- a/aten/src/ATen/core/TensorMethods.h +++ b/aten/src/ATen/core/TensorMethods.h @@ -515,6 +515,9 @@ inline Tensor & Tensor::exponential_(double lambd, Generator * generator) { inline Tensor & Tensor::geometric_(double p, Generator * generator) { return type().geometric_(*this, p, generator); } +inline Tensor Tensor::alias() const { + return type().alias(*this); +} inline Tensor Tensor::abs() const { return type().abs(*this); } @@ -1166,11 +1169,11 @@ inline Tensor Tensor::addmm(const Tensor & mat1, const Tensor & mat2, Scalar bet inline Tensor & Tensor::addmm_(const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) { return type().addmm_(*this, mat1, mat2, beta, alpha); } -inline Tensor & Tensor::sparse_resize_(IntList size, int64_t sparseDims, int64_t denseDims) { - return type().sparse_resize_(*this, size, sparseDims, denseDims); +inline Tensor & Tensor::sparse_resize_(IntList size, int64_t sparse_dim, int64_t dense_dim) { + return type().sparse_resize_(*this, size, sparse_dim, dense_dim); } -inline Tensor & Tensor::sparse_resize_and_clear_(IntList size, int64_t sparseDims, int64_t denseDims) { - return type().sparse_resize_and_clear_(*this, size, sparseDims, denseDims); +inline Tensor & Tensor::sparse_resize_and_clear_(IntList size, int64_t sparse_dim, int64_t dense_dim) { + return type().sparse_resize_and_clear_(*this, size, sparse_dim, dense_dim); } inline Tensor Tensor::sparse_mask(SparseTensorRef mask) const { return type().sparse_mask(*this, mask); @@ -1178,11 +1181,17 @@ inline Tensor Tensor::sparse_mask(SparseTensorRef mask) const { inline Tensor Tensor::to_dense() const { return type().to_dense(*this); 
} -inline int64_t Tensor::_sparseDims() const { - return type()._sparseDims(*this); +inline int64_t Tensor::sparse_dim() const { + return type().sparse_dim(*this); +} +inline int64_t Tensor::_dimI() const { + return type()._dimI(*this); +} +inline int64_t Tensor::dense_dim() const { + return type().dense_dim(*this); } -inline int64_t Tensor::_denseDims() const { - return type()._denseDims(*this); +inline int64_t Tensor::_dimV() const { + return type()._dimV(*this); } inline int64_t Tensor::_nnz() const { return type()._nnz(*this); @@ -1199,6 +1208,15 @@ inline Tensor Tensor::_indices() const { inline Tensor Tensor::_values() const { return type()._values(*this); } +inline Tensor & Tensor::_coalesced_(bool coalesced) { + return type()._coalesced_(*this, coalesced); +} +inline Tensor Tensor::indices() const { + return type().indices(*this); +} +inline Tensor Tensor::values() const { + return type().values(*this); +} inline int64_t Tensor::numel() const { return type().numel(*this); } diff --git a/aten/src/ATen/core/TensorOptions.h b/aten/src/ATen/core/TensorOptions.h index e830f3b778a726..cf5845c0a0f7ed 100644 --- a/aten/src/ATen/core/TensorOptions.h +++ b/aten/src/ATen/core/TensorOptions.h @@ -181,6 +181,11 @@ struct CAFFE2_API TensorOptions { return has_device_ ? device_ : getDefaultTensorOptions().device(); } + /// Returns whether the device is specified. + bool has_device() const noexcept { + return has_device_; + } + /// Returns the device of the `TensorOptions`, or `c10::nullopt` if /// device is not specified. optional device_opt() const noexcept { @@ -197,6 +202,11 @@ struct CAFFE2_API TensorOptions { return has_dtype_ ? dtype_ : getDefaultTensorOptions().dtype(); } + /// Returns whether the dtype is specified. + bool has_dtype() const noexcept { + return has_dtype_; + } + /// Returns the dtype of the `TensorOptions`, or `c10::nullopt` if /// device is not specified. optional dtype_opt() const noexcept { @@ -208,6 +218,11 @@ struct CAFFE2_API TensorOptions { return has_layout_ ? layout_ : getDefaultTensorOptions().layout(); } + /// Returns whether the layout is specified. + bool has_layout() const noexcept { + return has_layout_; + } + /// Returns the layout of the `TensorOptions`, or `c10::nullopt` if /// layout is not specified. optional layout_opt() const noexcept { @@ -219,6 +234,11 @@ struct CAFFE2_API TensorOptions { return has_requires_grad_ ? requires_grad_ : getDefaultTensorOptions().requires_grad(); } + /// Returns whether the `requires_grad` is specified. + bool has_requires_grad() const noexcept { + return has_requires_grad_; + } + /// Returns the `requires_grad` property of the `TensorOptions`, or /// `c10::nullopt` if `requires_grad` is not specified. optional requires_grad_opt() const noexcept { @@ -231,6 +251,11 @@ struct CAFFE2_API TensorOptions { return has_is_variable_ ? is_variable_ : getDefaultTensorOptions().is_variable(); } + /// Returns whether the `is_variable` is specified. + bool has_is_variable() const noexcept { + return has_is_variable_; + } + /// Returns the `is_variable` property of the `TensorOptions`, or /// `c10::nullopt` if `is_variable` is not specified. 
optional is_variable_opt() const noexcept { diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h index 5f666cdcb61fc1..6ad8913363aa4b 100644 --- a/aten/src/ATen/core/Type.h +++ b/aten/src/ATen/core/Type.h @@ -367,6 +367,7 @@ struct CAFFE2_API Type { virtual Tensor & log_normal_(Tensor & self, double mean, double std, Generator * generator) const = 0; virtual Tensor & exponential_(Tensor & self, double lambd, Generator * generator) const = 0; virtual Tensor & geometric_(Tensor & self, double p, Generator * generator) const = 0; + virtual Tensor alias(const Tensor & self) const = 0; virtual Tensor abs(const Tensor & self) const = 0; virtual Tensor & abs_(Tensor & self) const = 0; virtual Tensor acos(const Tensor & self) const = 0; @@ -584,17 +585,22 @@ struct CAFFE2_API Type { virtual Tensor & sub_(Tensor & self, Scalar other, Scalar alpha) const = 0; virtual Tensor addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const = 0; virtual Tensor & addmm_(Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const = 0; - virtual Tensor & sparse_resize_(Tensor & self, IntList size, int64_t sparseDims, int64_t denseDims) const = 0; - virtual Tensor & sparse_resize_and_clear_(Tensor & self, IntList size, int64_t sparseDims, int64_t denseDims) const = 0; + virtual Tensor & sparse_resize_(Tensor & self, IntList size, int64_t sparse_dim, int64_t dense_dim) const = 0; + virtual Tensor & sparse_resize_and_clear_(Tensor & self, IntList size, int64_t sparse_dim, int64_t dense_dim) const = 0; virtual Tensor sparse_mask(const Tensor & self, SparseTensorRef mask) const = 0; virtual Tensor to_dense(const Tensor & self) const = 0; - virtual int64_t _sparseDims(const Tensor & self) const = 0; - virtual int64_t _denseDims(const Tensor & self) const = 0; + virtual int64_t sparse_dim(const Tensor & self) const = 0; + virtual int64_t _dimI(const Tensor & self) const = 0; + virtual int64_t dense_dim(const Tensor & self) const = 0; + virtual int64_t _dimV(const Tensor & self) const = 0; virtual int64_t _nnz(const Tensor & self) const = 0; virtual Tensor coalesce(const Tensor & self) const = 0; virtual bool is_coalesced(const Tensor & self) const = 0; virtual Tensor _indices(const Tensor & self) const = 0; virtual Tensor _values(const Tensor & self) const = 0; + virtual Tensor & _coalesced_(Tensor & self, bool coalesced) const = 0; + virtual Tensor indices(const Tensor & self) const = 0; + virtual Tensor values(const Tensor & self) const = 0; virtual int64_t numel(const Tensor & self) const = 0; virtual std::vector unbind(const Tensor & self, int64_t dim) const = 0; virtual int64_t get_device(const Tensor & self) const = 0; diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h index e7f487f264f64b..0ee9d337e91583 100644 --- a/aten/src/ATen/core/aten_interned_strings.h +++ b/aten/src/ATen/core/aten_interned_strings.h @@ -96,7 +96,6 @@ _(aten, _logspace) \ _(aten, _masked_scale) \ _(aten, _mm) \ _(aten, _mv) \ -_(aten, _native_sparse_coo_tensor_unsafe) \ _(aten, _nnz) \ _(aten, _pack_padded_sequence) \ _(aten, _pack_padded_sequence_backward) \ @@ -118,6 +117,8 @@ _(aten, _sin) \ _(aten, _sinh) \ _(aten, _sparseDims) \ _(aten, _sparse_add) \ +_(aten, _sparse_coo_tensor_with_dims) \ +_(aten, _sparse_coo_tensor_with_dims_and_tensors) \ _(aten, _sparse_coo_tensor_unsafe) \ _(aten, _sparse_dense_add) \ _(aten, _sparse_div_scalar) \ @@ -364,6 +365,7 @@ _(aten, index_copy) \ _(aten, index_fill) 
\ _(aten, index_put) \ _(aten, index_select) \ +_(aten, indices) \ _(aten, instance_norm) \ _(aten, inverse) \ _(aten, irfft) \ @@ -478,7 +480,6 @@ _(aten, native_get_device) \ _(aten, native_norm) \ _(aten, native_pow) \ _(aten, native_resize_as) \ -_(aten, native_sparse_coo_tensor) \ _(aten, native_tensor) \ _(aten, native_zero) \ _(aten, ne) \ @@ -685,6 +686,7 @@ _(aten, upsample_nearest3d_forward) \ _(aten, upsample_trilinear3d) \ _(aten, upsample_trilinear3d_backward) \ _(aten, upsample_trilinear3d_forward) \ +_(aten, values) \ _(aten, var) \ _(aten, view) \ _(aten, view_as) \ @@ -737,7 +739,7 @@ _(attr, cudnn_enabled) \ _(attr, cx) \ _(attr, cy) \ _(attr, data) \ -_(attr, denseDims) \ +_(attr, dense_dim) \ _(attr, descending) \ _(attr, deterministic) \ _(attr, device) \ @@ -931,7 +933,8 @@ _(attr, some) \ _(attr, sorted) \ _(attr, source) \ _(attr, sparse) \ -_(attr, sparseDims) \ +_(attr, sparse_dim) \ +_(attr, sparse_dtype) \ _(attr, spatialScale) \ _(attr, split_size) \ _(attr, split_sizes) \ diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index 132c036b0358f7..ad600b467758c1 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -110,7 +110,7 @@ def TypedDict(name, attrs, total=True): # type: ignore TYPE_DERIVED_DEFINITION_NATIVE = CodeTemplate("""\ ${return_type} ${Type}::${api_name}(${type_method_formals}) const { ${device_guard_declaration} - ${return_call} at::native::${native_type_method_dispatch}(/* actuals */ ${type_derived_call_actuals}); + ${return_call} at::native::${native_type_method_dispatch}(/* actuals */ ${actuals}); } """) TYPE_DERIVED_DEFINITION_NATIVE_MISSING = CodeTemplate("""\ @@ -499,6 +499,7 @@ def __getitem__(self, x): 'native_type_method_dispatch': str, # options should be List[FunctionOption] 'options': Any, + 'requires_tensor': bool, 'return_call': str, 'return_type': str, 'return': ReturnDecl, @@ -527,6 +528,7 @@ def __getitem__(self, x): ('returns', List[ReturnType]), ('inplace', bool), ('abstract', bool), + ('requires_tensor', bool), ('device_guard', bool), ('with_gil', bool), ('deprecated', bool), @@ -559,7 +561,8 @@ def is_mutable_formal_argument(argument, option): def check_methods_do_not_start_with_underscore(name, is_method): - if name in {'_local_scalar', '_values', '_indices', '_nnz', '_sparseDims', '_denseDims'}: + if name in {'_local_scalar', '_values', '_indices', '_nnz', '_dimI', + '_dimV', '_coalesced_'}: return if is_method and name.startswith('_') and not name.startswith('__'): message = "Function '{}' starts with a single underscore and is ".format(name) @@ -923,6 +926,7 @@ def process_option(option, output_options): inplace=option['inplace'], # See Note [Abstract ATen methods] abstract=abstract, + requires_tensor=option.get('requires_tensor', False), device_guard=option.get('device_guard', True), with_gil=option.get('with_gil', False), deprecated=option.get('deprecated', False) @@ -1040,14 +1044,20 @@ def find_formal(formal_name, formals): return formal return None + assert find_formal('Type', formals) is None, \ + "Found Type argument in {}({}). Use TensorOptions instead.".format( + option['name'], ", ".join(option['method_formals_with_defaults'])) + type_method_dispatch = option['type_method_definition_dispatch'] - dispatch_tensor = find_dispatch_tensor(formals) - # we only dispatch via options if there is backend-specific dispatch (otherwise it's a factory function that - # can dispatch directly to the native function). 
backend_dispatch = isinstance(type_method_dispatch, dict) + + # We only dispatch via options if there is backend-specific dispatch + # (otherwise it's a factory function that can dispatch directly to the + # native function). dispatch_options = (find_formal('TensorOptions', formals) - if not dispatch_tensor and backend_dispatch - else None) + if backend_dispatch else None) + # Only dispatch via tensor if there is no Options argument + dispatch_tensor = None if dispatch_options else find_dispatch_tensor(formals) option['type_method_formals'] = [format_formal(f) for f in formals] option['type_method_actuals'] = [f['name'] for f in formals] @@ -1160,6 +1170,7 @@ def find_formal(formal_name, formals): inplace=option['inplace'], # See Note [Abstract ATen methods] abstract=abstract, + requires_tensor=option.get('requires_tensor', False), device_guard=option.get('device_guard', True), with_gil=option.get('with_gil', False), deprecated=option['deprecated'], @@ -1572,15 +1583,8 @@ def process_native(option): TYPE_DERIVED_DEFINITION_NATIVE_MISSING.substitute(env)) else: option['native_type_method_dispatch'] = native_dispatch - type_derived_call_actuals = [] - for actual, arg in zip(option['actuals'], option['arguments']): - if arg.get('is_type_dispatched', False): - type_derived_call_actuals.append('*this') - else: - type_derived_call_actuals.append(actual) type_object_definitions.append( - TYPE_DERIVED_DEFINITION_NATIVE.substitute( - env, type_derived_call_actuals=type_derived_call_actuals)) + TYPE_DERIVED_DEFINITION_NATIVE.substitute(env)) for declaration in declarations: for option in declaration['options']: diff --git a/aten/src/ATen/native/LegacyBridge.cpp b/aten/src/ATen/native/LegacyBridge.cpp index 930832416db17f..1537fcd2db8c87 100644 --- a/aten/src/ATen/native/LegacyBridge.cpp +++ b/aten/src/ATen/native/LegacyBridge.cpp @@ -134,33 +134,6 @@ Tensor& addmm_(Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta } } -Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values) { - return at::getType(values.options().layout(at::kSparse)).native_sparse_coo_tensor(indices, values); -} - -Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, ArrayRef size) { - return at::getType(values.options().layout(at::kSparse)).native_sparse_coo_tensor(indices, values, size); -} - -Tensor sparse_coo_tensor(ArrayRef size, const TensorOptions& options) { - TensorOptions toptions = TensorOptions(options).layout(at::kSparse); - return at::getType(toptions).native_sparse_coo_tensor(size, toptions); -} - -Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, const TensorOptions& options) { - TensorOptions toptions = options; - return at::getType(toptions.layout(at::kSparse)).native_sparse_coo_tensor(indices, values); -} - -Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, ArrayRef size, const TensorOptions& options) { - TensorOptions toptions = options; - return at::getType(toptions.layout(at::kSparse)).native_sparse_coo_tensor(indices, values, size); -} - -Tensor _sparse_coo_tensor_unsafe(const Tensor& indices, const Tensor& values, ArrayRef size) { - return at::getType(values.options().layout(at::kSparse))._native_sparse_coo_tensor_unsafe(indices, values, size); -} - int64_t get_device(const Tensor& self) { if (_has_native(self)) { return native_get_device(self); diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index e9241c368c34b8..4894c2cecc129e 100644 --- 
a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -153,7 +153,7 @@ Tensor empty_like(const Tensor& self) { Tensor empty_like(const Tensor& self, const TensorOptions& options) { if (options.layout() == kSparse && self.type().is_sparse()) { auto res = at::empty({0}, options); // to be resized - res.sparse_resize_and_clear_(self.sizes(), self._sparseDims(), self._denseDims()); + res.sparse_resize_and_clear_(self.sizes(), self.sparse_dim(), self.dense_dim()); return res; } return at::empty(self.sizes(), options); @@ -525,7 +525,7 @@ Tensor zeros_like(const Tensor& self) { Tensor zeros_like(const Tensor& self, const TensorOptions& options) { if (options.layout() == kSparse && self.type().is_sparse()) { auto res = at::empty({0}, options); // to be resized - res.sparse_resize_and_clear_(self.sizes(), self._sparseDims(), self._denseDims()); + res.sparse_resize_and_clear_(self.sizes(), self.sparse_dim(), self.dense_dim()); return res; } return native::zeros(self.sizes(), options); diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 387bebce3932a1..c8f0d61aee6838 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -1,4 +1,3 @@ -#include #include #include #include @@ -9,6 +8,9 @@ #include "ATen/WrapDimUtils.h" #include "c10/util/Exception.h" #include "c10/util/Optional.h" +#include +#include +#include namespace at { namespace native { @@ -163,7 +165,7 @@ Tensor &as_strided_(Tensor& self, IntList size, IntList stride) { return at::as_strided_(self, size, stride, self.storage_offset()); } -Tensor narrow_copy_sparse(const Tensor& self, int64_t dim, int64_t start, int64_t length){ +Tensor narrow_copy_sparse(const Tensor& self, int64_t dim, int64_t start, int64_t length) { int64_t allDim = self.dim(); int64_t end = start+length; AT_CHECK(allDim > 0, "narrow() cannot be applied to a 0-dim tensor."); @@ -171,31 +173,30 @@ Tensor narrow_copy_sparse(const Tensor& self, int64_t dim, int64_t start, int64_ "Dimension ", dim, " out of range. Expecting 0 <= dim < ", allDim, "."); AT_CHECK(start >= 0 && length >= 0 && end <= self.size(dim), "Invalid range to narrow. 
range(start, start+length) must be a subset of range(0, ", self.size(dim), ").") - LongTensor indices = self._indices(); - int64_t sparseDims = self._sparseDims(); + Tensor indices = self._indices(); + int64_t sparse_dim = self.sparse_dim(); - std::vector newSizes = self.sizes().vec(); - newSizes[dim]=length; + std::vector new_sizes = self.sizes().vec(); + new_sizes[dim] = length; - Tensor newValues; - LongTensor newIndices; - if(dim < sparseDims){ + Tensor new_values; + Tensor new_indices; + if (dim < sparse_dim) { Tensor mask = (indices[dim] >= start).__and__((indices[dim] < end)); - newIndices = indices.masked_select(mask).view({sparseDims, -1}); - newIndices[dim].add_(-start); + new_indices = indices.masked_select(mask).view({sparse_dim, -1}); + new_indices[dim].sub_(start); Tensor nzIndices = mask.nonzero().view(-1); - newValues = self._values().index_select(0, nzIndices); - }else{ + new_values = self._values().index_select(0, nzIndices); + } else { /* This means we are narrowing on a dense dim, which is in effect just a regular narrow on _values() */ - newIndices = indices; - int64_t ddim = dim - sparseDims + 1; - newValues = self._values().narrow_copy(ddim, start, length); + new_indices = indices; + int64_t dense_dim = dim - sparse_dim + 1; + new_values = self._values().narrow_copy(dense_dim, start, length); } - SparseTensor newTensor = at::sparse_coo_tensor(newIndices, newValues, newSizes, self.type().options()); - _get_sparse_impl(newTensor)->set_coalesced(self.is_coalesced()); - return newTensor; + auto newTensor = at::sparse_coo_tensor(new_indices, new_values, new_sizes); + return newTensor._coalesced_(self.is_coalesced()); } Tensor narrow_copy_dense(const Tensor& self, int64_t dim, int64_t start, int64_t length){ @@ -397,16 +398,16 @@ Tensor& stack_out(Tensor& result, TensorList tensors, int64_t dim) { } static inline Tensor & sparse_transpose_(Tensor & self, int64_t dim0, int64_t dim1) { - int64_t nsparseDims = self._sparseDims(); - AT_CHECK(dim0 < nsparseDims && dim1 < nsparseDims, + int64_t nsparse_dim = self.sparse_dim(); + AT_CHECK(dim0 < nsparse_dim && dim1 < nsparse_dim, "sparse transpose: transposed dimensions must be sparse ", - "Got sparseDims: ", nsparseDims, ", d0: ", dim0, ", d1: ", dim1); + "Got sparse_dim: ", nsparse_dim, ", d0: ", dim0, ", d1: ", dim1); if (self._indices().numel() == 0 && self._values().numel() == 0) { auto sizes = self.sizes().vec(); std::swap(sizes[dim0], sizes[dim1]); - _get_sparse_impl(self)->raw_resize_(self._sparseDims(), self._denseDims(), sizes); + at::sparse::get_sparse_impl(self)->raw_resize_(self.sparse_dim(), self.dense_dim(), sizes); } else { auto indices = self._indices(); auto row0 = indices.select(0, dim0); @@ -418,12 +419,12 @@ static inline Tensor & sparse_transpose_(Tensor & self, int64_t dim0, int64_t di row0.copy_(row1); row1.copy_(tmp); - _get_sparse_impl(self)->set_coalesced(false); + self._coalesced_(false); auto sizes = self.sizes().vec(); std::swap(sizes[dim0], sizes[dim1]); - _get_sparse_impl(self)->raw_resize_(self._indices().size(0), self._values().dim() - 1, sizes); + at::sparse::get_sparse_impl(self)->raw_resize_(self._indices().size(0), self._values().dim() - 1, sizes); } return self; } @@ -469,11 +470,11 @@ Tensor transpose(const Tensor & self, int64_t dim0, int64_t dim1) { static void check_t(const Tensor& self, const char *fn) { if (self.is_sparse()) { - int64_t sparseDims = self._sparseDims(); - int64_t denseDims = self._denseDims(); - AT_CHECK(sparseDims == 2 && denseDims == 0, + int64_t sparse_dim = 
self.sparse_dim();
+ int64_t dense_dim = self.dense_dim();
+ AT_CHECK(sparse_dim == 2 && dense_dim == 0,
fn, " expects a tensor with 2 sparse and 0 dense dimensions, but got ",
- sparseDims, " sparse and ", denseDims, " dense dimensions");
+ sparse_dim, " sparse and ", dense_dim, " dense dimensions");
} else if (self.dim() != 2) {
AT_ERROR(fn, " expects a 2D tensor, but self is ", self.dim(), "D");
}
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 49c05ecdf6e78b..a95cfc3d817dfb 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -1929,66 +1929,160 @@
- func: addmm_(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
variants: method
-# NB: I have to decompose sparse_coo_tensor into two functions, because
-# it has custom dispatch logic for which Type to dispatch on (we must
-# use the sparse equivalent of the type of the SECOND argument).
-#
-# The actual dispatcher, native_sparse_coo_tensor, has all of its overloads
-# removed so you don't accidentally trigger the default behavior, which
-# is to infer Type based on the first argument (indices), which is ~never
-# what you want. (I guess hypothetically it would work; you'd
-# just only ever dispatch to CPULongTensor or CUDALongTensor, but that
-# seems a bit too finely balanced.)
-
-- func: native_sparse_coo_tensor(IntList size, TensorOptions options) -> Tensor
- variants: []
- dispatch:
- SparseCPU: new_with_size_sparse
- SparseCUDA: new_with_size_sparse
-
-- func: native_sparse_coo_tensor(IndexTensor indices, Tensor values) -> Tensor
- variants: []
- dispatch:
- SparseCPU: new_with_tensor_sparse
- SparseCUDA: new_with_tensor_sparse
-
-- func: native_sparse_coo_tensor(IndexTensor indices, Tensor values, IntList size) -> Tensor
- variants: []
- dispatch:
- SparseCPU: new_with_tensor_and_size_sparse
- SparseCUDA: new_with_tensor_and_size_sparse
-- func: sparse_coo_tensor(IndexTensor indices, Tensor values) -> Tensor
+# NOTE [ Sparse: autograd and API ]
+#
+#
+# Sparse Tensor Constructors
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# The API entry points to sparse tensor construction should be
+# `sparse_coo_tensor` and `_sparse_coo_tensor_unsafe`. Depending on whether the
+# indices and values tensors are given, they eventually dispatch to either
+# `sparse_coo_tensor_with_dims` or `sparse_coo_tensor_with_dims_and_tensors`.
+#
+# The autograd support for the ctor is implemented on `sparse_coo_tensor_with_dims_and_tensors`.
+#
+# The API methods `sparse_coo_tensor` and `_sparse_coo_tensor_unsafe`
+# **must not** have specific type dispatches because otherwise codegen will
+# consider them as abstract methods (see Note [Abstract ATen methods]), dispatch
+# using **Tensor** type, and thus lose autograd tracking on the actual method
+# they dispatch to, e.g., `sparse_coo_tensor_with_dims_and_tensors`.
+#
+# The actual ctors `sparse_coo_tensor_with_dims` and `sparse_coo_tensor_with_dims_and_tensors`,
+# on the other hand, need to create `SparseTensorImpl` and know nothing about
+# how `VariableType`s work. So they need to be dispatched using Tensor types.
+# We thus put `requires_tensor=True` to ensure that `VariableType` will unwrap
+# the given variables and call with the Tensor type.
+#
+#
+# Sparse Methods API Design
+# ~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# Goals: 1. Flexible API for users to write custom sparse ops
+# 2.
ctor and member accessor with autograd support
+#
+# To achieve 1, we need to provide a set of *dangerous* APIs (dangerous in the
+# sense that misusing them will break sparse tensor invariants and may result in
+# unexpected behavior, e.g., crash). These methods are all prefixed with
+# underscore "_" to indicate that they should be used with care. We provide:
+#
+# + `_indices()`: returns the *raw* indices within the sparse tensor (not just
+# sharing storage). Any inplace operation will change the
+# actual indices, including t_, set_, as_strided_, resize_,
+# etc.
+# + `_values()`: returns the *raw* values within the sparse tensor. Similar
+# semantics as `_indices()`
+# + `_nnz()`: returns the number of non-zero entries. This will always be
+# determined by the shapes of indices and values.
+# + `_coalesced_(bool)`: inplace sets whether the tensor is coalesced, and
+# returns itself.
+#
+# These methods are very useful in writing new operations, e.g., a custom
+# autograd Function.
+#
+# We also provide other public *safe* APIs:
+# + `indices()`: returns a **view** of the indices tensor if the sparse tensor
+# is **coalesced**.
+# + `values()`: returns a **view** of the values tensor if the containing
+# sparse tensor is **coalesced**.
+# + `sparse_dim()`: number of sparse dimensions
+# + `dense_dim()`: number of dense dimensions
+# + `is_coalesced()`: whether the sparse tensor is coalesced
+#
+# `_indices()` and `_values()` should return the raw indices and values dense
+# tensors within a sparse tensor. They can be quite unsafe with inplace
+# operations like `t_()`, and expose uncoalesced indices and values. The public
+# recommended API is `indices()` and `values()`, both of which first check that
+# the tensor is coalesced and return views on those tensors.
+#
+#
+# Autograd Support
+# ~~~~~~~~~~~~~~~~
+#
+# Autograd is supported on `values()` and sparse tensor ctor with indices and
+# values tensors. E.g., `torch.sparse_coo_tensor(i, v).values().sum()` is
+# differentiable w.r.t. `v`.
+#
+# NB: The `values()` and `_values()` operators are special in that they are
+# layout-aware, i.e., the output depends not just on the data it represents, but
+# also on the input layout details (in this case, the `indices` tensor). See
+# NOTE [ as_strided Backward and layout-aware/agnostic autograd ] in Functions.cpp
+# for discussion on layout-aware vs layout-agnostic autograd. Since PyTorch ops
+# operate in the layout-agnostic mode, similar to `as_strided`, backward of
+# these two operators needs to consider them in a layout-agnostic way:
+# + `values()`:
+# Input is coalesced.
+# We just pretend having `input.indices()` as an additional argument
+# `input_indices`, then forward is similar to
+# `input.to(kStrided).index_select(input_indices)` regardless of the layout.
+# Note that `values()` normally is layout-aware even if we constrain
+# ourselves on sparse inputs since it may include all-zero values entries
+# as "present" entries.
+# + `_values()`:
+# Input may be uncoalesced.
+# It is not straightforward to construct a layout-agnostic version because
+# duplicate indices entries may exist and additional parameterization is
+# needed to distribute the value into different values entries. Furthermore,
+# this op is intended to provide ways to write custom sparse ops, rather
+# than being used in the autograd graph, so it is marked as *non-differentiable*
+# in derivatives.yaml.
+# +# Before reading the following, see NOTE [ Autograd Variable Views ] in +# variable.h for details on views that are tracked by autograd, and views that +# are not. +# +# Moreover, these methods return tensors that share storage with inputs, so we +# mark these methods as view ops to support autograd history tracking. +# The sparse tensor ctor output should technically be view of both input indices +# and values tensors, but currently we only support setting as view of a single +# Variable, so it is only view of the values tensor. +# TODO: clone indices in sparse tensor ctor. +# +# For other methods that return outputs that share storage with inputs, i.e., +# `indices()` and `_indices()`. We mark their outputs as non-differentiable, so +# the view relation is not tracked by autograd, but the version counter is still +# shared. In other words, their outputs are non-differentiable views of the +# sparse tensor. -- func: sparse_coo_tensor(IndexTensor indices, Tensor values, IntList size) -> Tensor # FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given # the default would never make sense. - func: sparse_coo_tensor(IntList size, *, TensorOptions options) -> Tensor -- func: sparse_coo_tensor(IndexTensor indices, Tensor values, *, TensorOptions options) -> Tensor +- func: sparse_coo_tensor(IndexTensor indices, Tensor values, *, TensorOptions options={}) -> Tensor + +- func: sparse_coo_tensor(IndexTensor indices, Tensor values, IntList size, *, TensorOptions options={}) -> Tensor + +- func: _sparse_coo_tensor_unsafe(IndexTensor indices, Tensor values, IntList size, *, TensorOptions options={}) -> Tensor + -- func: sparse_coo_tensor(IndexTensor indices, Tensor values, IntList size, *, TensorOptions options) -> Tensor +- func: _sparse_coo_tensor_with_dims(int64_t sparse_dim, int64_t dense_dim, IntList size, *, TensorOptions options) -> Tensor + dispatch: + SparseCPU: new_with_dims_sparse + SparseCUDA: new_with_dims_sparse + requires_tensor: True -- func: _native_sparse_coo_tensor_unsafe(IndexTensor indices, Tensor values, IntList size) -> Tensor - variants: [] +- func: _sparse_coo_tensor_with_dims_and_tensors(int64_t sparse_dim, int64_t dense_dim, IntList size, Tensor indices, Tensor values, *, TensorOptions options) -> Tensor dispatch: - SparseCPU: new_with_tensor_and_size_unsafe_sparse - SparseCUDA: new_with_tensor_and_size_unsafe_sparse + SparseCPU: new_with_dims_and_tensor_sparse + SparseCUDA: new_with_dims_and_tensor_sparse + requires_tensor: True -- func: _sparse_coo_tensor_unsafe(IndexTensor indices, Tensor values, IntList size) -> Tensor -- func: sparse_resize_(Tensor self, IntList size, int64_t sparseDims, int64_t denseDims) -> Tensor +- func: sparse_resize_(Tensor self, IntList size, int64_t sparse_dim, int64_t dense_dim) -> Tensor variants: method dispatch: SparseCPU: sparse_resize_ SparseCUDA: sparse_resize_ + requires_tensor: True -- func: sparse_resize_and_clear_(Tensor self, IntList size, int64_t sparseDims, int64_t denseDims) -> Tensor +- func: sparse_resize_and_clear_(Tensor self, IntList size, int64_t sparse_dim, int64_t dense_dim) -> Tensor variants: method dispatch: SparseCPU: sparse_resize_and_clear_ SparseCUDA: sparse_resize_and_clear_ + requires_tensor: True - func: sparse_mask(Tensor self, SparseTensorRef mask) -> Tensor @@ -1996,6 +2090,7 @@ dispatch: CPU: sparse_mask_cpu CUDA: sparse_mask_cuda + requires_tensor: True - func: to_dense(Tensor self) -> Tensor @@ -2003,33 +2098,38 @@ dispatch: SparseCPU: sparse_to_dense 
SparseCUDA: sparse_to_dense + requires_tensor: True -- func: _sparseDims(Tensor self) -> int64_t +- func: sparse_dim(Tensor self) -> int64_t variants: method dispatch: - SparseCPU: _sparseDims_sparse - SparseCUDA: _sparseDims_sparse + SparseCPU: sparse_dim_sparse + SparseCUDA: sparse_dim_sparse + requires_tensor: True device_guard: False # legacy method - func: _dimI(Tensor self) -> int64_t - variants: function - dispatch: _sparseDims_sparse + variants: method + dispatch: sparse_dim_sparse + requires_tensor: True device_guard: False -- func: _denseDims(Tensor self) -> int64_t +- func: dense_dim(Tensor self) -> int64_t variants: method dispatch: - SparseCPU: _denseDims_sparse - SparseCUDA: _denseDims_sparse + SparseCPU: dense_dim_sparse + SparseCUDA: dense_dim_sparse + requires_tensor: True device_guard: False # legacy method - func: _dimV(Tensor self) -> int64_t - variants: function - dispatch: _denseDims_sparse + variants: method + dispatch: dense_dim_sparse + requires_tensor: True device_guard: False @@ -2038,6 +2138,7 @@ dispatch: SparseCPU: _nnz_sparse SparseCUDA: _nnz_sparse + requires_tensor: True device_guard: False @@ -2046,6 +2147,7 @@ dispatch: SparseCPU: coalesce_sparse_cpu SparseCUDA: coalesce_sparse_cuda + requires_tensor: True - func: is_coalesced(Tensor self) -> bool @@ -2053,6 +2155,7 @@ dispatch: SparseCPU: is_coalesced_sparse SparseCUDA: is_coalesced_sparse + requires_tensor: True device_guard: False @@ -2061,14 +2164,42 @@ dispatch: SparseCPU: _indices_sparse SparseCUDA: _indices_sparse + requires_tensor: True device_guard: False - - func: _values(Tensor self) -> Tensor variants: method dispatch: SparseCPU: _values_sparse SparseCUDA: _values_sparse + requires_tensor: True + device_guard: False + +# This method doesn't do any check but only directly sets the flag. So it can be +# a bit unsafe. Similar to _indices and _values, this is useful for implementing +# custom sparse operations in Python/C++ extension. 
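+# For example, sparse_transpose_ in TensorShape.cpp swaps two rows of _indices()
+# in place and then calls _coalesced_(false), since the swapped indices are no
+# longer guaranteed to be in sorted (coalesced) order.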
+- func: _coalesced_(Tensor self, bool coalesced) -> Tensor + variants: method + dispatch: + SparseCPU: _coalesced_sparse_ + SparseCUDA: _coalesced_sparse_ + requires_tensor: True + device_guard: False + +- func: indices(Tensor self) -> Tensor + variants: method + dispatch: + SparseCPU: indices_sparse + SparseCUDA: indices_sparse + requires_tensor: True + device_guard: False + +- func: values(Tensor self) -> Tensor + variants: method + dispatch: + SparseCPU: values_sparse + SparseCUDA: values_sparse + requires_tensor: True device_guard: False @@ -2076,17 +2207,20 @@ dispatch: SparseCPU: hspmm_out_sparse_cpu SparseCUDA: hspmm_out_sparse_cuda + requires_tensor: True - func: hspmm(Tensor mat1, Tensor mat2) -> Tensor dispatch: SparseCPU: hspmm_sparse_cpu SparseCUDA: hspmm_sparse_cuda + requires_tensor: True - func: copy_sparse_to_sparse_(Tensor self, Tensor src, bool non_blocking=false) -> Tensor variants: function dispatch: SparseCPU: copy_sparse_ SparseCUDA: copy_sparse_ + requires_tensor: True - func: numel(Tensor self) -> int64_t variants: function, method diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index 107233ecfccb48..aba245881b8bfc 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -1,64 +1,74 @@ // Basic functions on sparse tensors #include +#include #include #include #include -#include +#include #include namespace at { namespace native { +using namespace at::sparse; + + /****************************************************************************** * access methods ******************************************************************************/ -int64_t _sparseDims_sparse(const SparseTensor& self) { - return _get_sparse_impl(self)->sparseDims(); +int64_t sparse_dim_sparse(const SparseTensor& self) { + return get_sparse_impl(self)->sparse_dim(); } -int64_t _denseDims_sparse(const SparseTensor& self) { - return _get_sparse_impl(self)->denseDims(); +int64_t dense_dim_sparse(const SparseTensor& self) { + return get_sparse_impl(self)->dense_dim(); } bool is_coalesced_sparse(const SparseTensor& self) { - return _get_sparse_impl(self)->coalesced(); + return get_sparse_impl(self)->coalesced(); } int64_t _nnz_sparse(const SparseTensor& self) { - return _get_sparse_impl(self)->nnz(); + return get_sparse_impl(self)->nnz(); } -// TODO: This is wrong: if nnz == 0 but indices/values is not -// empty then we'll return all the values, even the ones that -// are "masked out" by nnz +// Why are there so many methods to get indices and value? +// See Note [ Sparse: different methods to get indices and values ] in native_functions.yaml Tensor _indices_sparse(const SparseTensor& self) { - auto nnz = self._nnz(); - if (nnz == 0) { - // Narrows don't work on 0-length tensors - // TODO: When we handle zero-size dims correctly, this will work and - // we can remove the special case. 
- return _get_sparse_impl(self)->indices(); - } - return _get_sparse_impl(self)->indices().narrow(1, 0, nnz); + return get_sparse_impl(self)->indices(); } Tensor _values_sparse(const SparseTensor& self) { - // See indices for some relevant notes - auto nnz = self._nnz(); - if (nnz == 0) { - return _get_sparse_impl(self)->values(); - } - return _get_sparse_impl(self)->values().narrow(0, 0, nnz); + return get_sparse_impl(self)->values(); +} + +Tensor &_coalesced_sparse_(SparseTensor& self, bool coalesced) { + get_sparse_impl(self)->set_coalesced(coalesced); + return self; +} + +Tensor indices_sparse(const Tensor& self) { + AT_CHECK(self.is_coalesced(), + "Cannot get indices on an uncoalesced tensor, please call .coalesce() first"); + return get_sparse_impl(self)->indices().alias(); +} + +Tensor values_sparse(const Tensor& self) { + AT_CHECK(self.is_coalesced(), + "Cannot get values on an uncoalesced tensor, please call .coalesce() first"); + return get_sparse_impl(self)->values().alias(); } /****************************************************************************** * creation methods + * See NOTE [ Sparse: autograd and API ] for details ******************************************************************************/ -/* Empty init */ +/*** Helper methods ***/ + SparseTensor new_sparse(const TensorOptions& options) { AT_ASSERT(!options.is_variable()); AT_ASSERT(options.layout() == kSparse); @@ -72,184 +82,201 @@ SparseTensor new_sparse(const TensorOptions& options) { type_id, scalarTypeToTypeMeta(options.dtype())); } -/*** Helper methods ***/ +/** Actual dispatched creation methods ***/ + +SparseTensor new_with_dims_sparse(int64_t sparse_dim, int64_t dense_dim, ArrayRef size, const TensorOptions& options) { + SparseTensor self = new_sparse(options); + AT_CHECK(size.size() != 0, + "cannot construct sparse tensor with 0 dimensions and no values; you must specify at least 1 dimension if you want to create a sparse tensor with no elements, \ +or you must provide a single-element `values` tensor (e.g. 
x = torch.sparse_coo_tensor(torch.zeros(0, 1), 12.3, [])) if you want to create a scalar sparse tensor"); + get_sparse_impl(self)->resize_and_clear_(sparse_dim, dense_dim, size); + return self; +} + +// Does NOT make copies of indices and values +SparseTensor new_with_dims_and_tensor_sparse( + int64_t sparse_dim, + int64_t dense_dim, + ArrayRef size, + const LongTensor& indices, + const Tensor& values, + const TensorOptions& options) { + SparseTensor self = new_sparse(options); + get_sparse_impl(self)->resize_(sparse_dim, dense_dim, size); + alias_into_sparse(self, indices, values); + return self; +} + +/** Public creation API that dispatch to methods above **/ + +/** Empty init **/ +Tensor empty_sparse(IntList size, const TensorOptions& options) { + return new_with_dims_sparse(size.size(), 0, size, options); +} + +/* Shape init */ +Tensor sparse_coo_tensor(ArrayRef size, const TensorOptions& options) { + return at::_sparse_coo_tensor_with_dims(size.size(), 0, size, options.layout(at::kSparse)); +} /* Pointer-copy init */ -SparseTensor new_with_tensor_sparse(const LongTensor& indices, const Tensor& values_) { - Tensor values; - if (values_.dim() == 0) { - // Mimic Numpy behavior here and treat it as a 1D tensor - values = values_.expand({1}); - } else { - values = values_; + +// helper +namespace { + static inline Tensor expand_values_if_needed(const Tensor& values) { + // expand + if (values.dim() == 0) { + // Mimic Numpy behavior here and treat it as a 1D tensor + return values.expand({1}); + } else { + return values; + } } +} + +Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values_, const TensorOptions& options) { + Tensor values = expand_values_if_needed(values_); - const SparseType& dtype = values.type().toSparse(); + // arg checking + AT_CHECK(!options.has_layout() || options.layout() == kSparse, "expected sparse layout, but got layout ", options.layout()); + // the following checks are redundant because they are also checked in SparseTensorImpl::set_indices_and_values_unsafe + // but we need to ensure them in order to infer the shape. + AT_CHECK(indices.dim() == 2, "indices must be sparse_dim x nnz, but got: ", indices.sizes()) + AT_CHECK(!indices.is_sparse(), "expected indices to be a dense tensor, but got indices of layout ", indices.layout()); // If sizes are not given, it is inferred as max index of each dim. - int64_t sparseDims = indices.size(0); - int64_t denseDims = values.dim() - 1; + int64_t sparse_dim = indices.size(0); + int64_t dense_dim = values.dim() - 1; - std::vector computed_sizes(sparseDims + denseDims); + std::vector computed_sizes(sparse_dim + dense_dim); if (indices.numel() > 0) { // If the indices has elements in it, we infer the minimum sparse dimension sizes // as the max value of each dim in indices. // NB: It used to keepdim. I think that was wrong. 
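A hedged sketch of the size-inference path in this hunk: when no size is given, each sparse dimension is inferred as max index + 1, and the trailing dimensions of `values` supply the dense sizes (the lines just below also reject negative indices during this inference).

```python
import torch

i = torch.tensor([[0, 1, 4],
                  [2, 0, 3]])          # sparse_dim = indices.size(0) = 2
v = torch.randn(3, 5)                  # dense_dim = values.dim() - 1 = 1

s = torch.sparse_coo_tensor(i, v)      # no explicit size given
print(s.shape)                         # torch.Size([5, 4, 5]): (4+1, 3+1) + values.size(1)
print(s.sparse_dim(), s.dense_dim())   # 2 1
```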
+ LongTensor min_indices = std::get(indices.min(/* dim */ 1, /* keepdim */ false)); LongTensor computed_indices_sizes = std::get(indices.max(/* dim */ 1, /* keepdim */ false)); computed_indices_sizes.add_(1); // len = max_index + 1 - LongTensor cpu_computed_indices_sizes; - if (computed_indices_sizes.is_cuda()) { - cpu_computed_indices_sizes = at::empty(computed_indices_sizes.sizes(), at::initialTensorOptions().dtype(kLong)); - cpu_computed_indices_sizes.copy_(computed_indices_sizes); - } else { - cpu_computed_indices_sizes = computed_indices_sizes; - } + LongTensor cpu_min_indices = min_indices.to(at::DeviceType::CPU); + LongTensor cpu_computed_indices_sizes = computed_indices_sizes.to(at::DeviceType::CPU); + auto cpu_min_indices_accessor = cpu_min_indices.accessor(); auto cpu_computed_indices_sizes_accessor = cpu_computed_indices_sizes.accessor(); - for (int64_t d = 0; d < sparseDims; d++) { + for (int64_t d = 0; d < sparse_dim; d++) { + int64_t min_index_in_dim = cpu_min_indices_accessor[d]; + AT_CHECK(min_index_in_dim >= 0, + "found negative index ", min_index_in_dim, " for dim ", d); computed_sizes[static_cast(d)] = cpu_computed_indices_sizes_accessor[d]; } } else { // If the indices doesn't have elements in it, there is not enough information // to know what the minimum sparse dimension sizes should be, and in this case // we set them to 0 - for (int64_t d = 0; d < sparseDims; d++) { + for (int64_t d = 0; d < sparse_dim; d++) { computed_sizes[static_cast(d)] = 0; } } - for (int64_t d = 0; d < denseDims; d++) { - computed_sizes[static_cast(sparseDims + d)] = values.size(d+1); + for (int64_t d = 0; d < dense_dim; d++) { + computed_sizes[static_cast(sparse_dim + d)] = values.size(d+1); } - return _new_with_dims_and_tensor_sparse(dtype, sparseDims, denseDims, computed_sizes, indices, values); -} -SparseTensor new_with_dims_and_size_sparse(int64_t sparseDims, int64_t denseDims, ArrayRef size, const TensorOptions& options) { - SparseTensor self = new_sparse(options); - AT_CHECK(size.size() != 0, - "cannot construct sparse tensor with 0 dimensions and no values; you must specify at least 1 dimension if you want to create a sparse tensor with no elements, \ -or you must provide a single-element `values` tensor (e.g. x = torch.sparse_coo_tensor(torch.zeros(0, 1), 12.3, [])) if you want to create a scalar sparse tensor"); - _get_sparse_impl(self)->resize_and_clear_(sparseDims, denseDims, size); - return self; + return at::_sparse_coo_tensor_with_dims_and_tensors( + sparse_dim, dense_dim, computed_sizes, indices, values, values.options().layout(kSparse)); } -Tensor empty_sparse(IntList size, const TensorOptions& options) { - AT_CHECK(size.size() != 0, - "cannot construct sparse tensor with 0 dimensions and no values; you must specify at least 1 dimension if you want to create a sparse tensor with no elements, \ - or you must provide a single-element `values` tensor (e.g. 
x = torch.sparse_coo_tensor(torch.zeros(0, 1), 12.3, [])) if you want to create a scalar sparse tensor"); - AT_ASSERT(!options.is_variable()); - AT_ASSERT(options.layout() == kSparse); - TensorTypeId type_id; - if (options.device().type() == kCUDA) { - type_id = SparseCUDATensorId(); - } else { - type_id = SparseCPUTensorId(); - } - auto tensor = Tensor(c10::make_intrusive(type_id, scalarTypeToTypeMeta(options.dtype()))); - _get_sparse_impl(tensor)->resize_and_clear_(size.size(), 0, size); - return tensor; -} - -SparseTensor new_with_size_sparse(IntList size, const TensorOptions& options) { - return new_with_dims_and_size_sparse(size.size(), 0, size, options); -} - -// NOTE: new_with_tensor_and_size_unsafe_sparse() differs from new_with_tensor_and_size_sparse() -// in that we don't check whether any indices are out of boundaries of `sizes`, thus avoiding a -// copy from CUDA to CPU. However, this function should ONLY be used where we know that the indices -// are guaranteed to be within bounds. // NB: Got rid of the sizes == NULL case -SparseTensor new_with_tensor_and_size_unsafe_sparse(const LongTensor& indices, const Tensor& values_, ArrayRef sizes) { - Tensor values; - if (values_.dim() == 0) { - // Mimic Numpy behavior here and treat it as a 1D tensor - values = values_.expand({1}); - } else { - values = values_; - } - - const SparseType& dtype = values.type().toSparse(); - - int64_t sparseDims = indices.size(0); - int64_t denseDims = values.dim() - 1; - return _new_with_dims_and_tensor_sparse(dtype, sparseDims, denseDims, sizes, indices, values); -} - -// NB: Got rid of the sizes == NULL case -SparseTensor new_with_tensor_and_size_sparse(const LongTensor& indices, const Tensor& values_, ArrayRef sizes) { - Tensor values; - if (values_.dim() == 0) { - // Mimic Numpy behavior here and treat it as a 1D tensor - values = values_.expand({1}); - } else { - values = values_; - } - - const SparseType& dtype = values.type().toSparse(); - - int64_t sparseDims = indices.size(0); - int64_t denseDims = values.dim() - 1; - AT_CHECK(sizes.size() == sparseDims + denseDims, "number of dimensions must be sparseDims (", sparseDims, ") + denseDims (", denseDims, "), but got ", sizes.size()); - - // Check to make sure all indices are within the boundaries of `sizes` +Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values_, ArrayRef size, const TensorOptions& options) { + Tensor values = expand_values_if_needed(values_); + + // arg checking + AT_CHECK(!options.has_layout() || options.layout() == kSparse, "expected sparse layout, but got layout ", options.layout()); + // the following checks are redundant because they are also checked in SparseTensorImpl::set_indices_and_values_unsafe + // but we need to ensure them in order to infer the shape. 
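For the explicit-size overload added here, the dimension bookkeeping looks roughly like this at the Python level (sketch; the error text paraphrases the AT_CHECK just below):

```python
import torch

i = torch.tensor([[0, 2]])                     # sparse_dim = 1
v = torch.randn(2, 3)                          # dense_dim = 1
s = torch.sparse_coo_tensor(i, v, (4, 3))      # len(size) must equal sparse_dim + dense_dim
print(s.shape, s.sparse_dim(), s.dense_dim())  # torch.Size([4, 3]) 1 1

# torch.sparse_coo_tensor(i, v, (4,)) would be rejected, roughly:
#   "number of dimensions must be sparse_dim (1) + dense_dim (1), but got 1"
```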
+ AT_CHECK(indices.dim() == 2, "indices must be sparse_dim x nnz, but got: ", indices.sizes()) + AT_CHECK(!indices.is_sparse(), "expected indices to be a dense tensor, but got indices of layout ", indices.layout()); + int64_t sparse_dim = indices.size(0); + int64_t dense_dim = values.dim() - 1; + AT_CHECK(size.size() == sparse_dim + dense_dim, + "number of dimensions must be sparse_dim (", sparse_dim, ") + dense_dim (", dense_dim, "), but got ", size.size()); + + // Check to make sure all indices are within the boundaries of `size` if (indices.numel() > 0) { LongTensor min_indices = std::get(indices.min(/* dim */ 1, /* keepdim */ false)); LongTensor max_indices = std::get(indices.max(/* dim */ 1, /* keepdim */ false)); LongTensor cpu_min_indices, cpu_max_indices; if (indices.is_cuda()) { - cpu_min_indices = at::CPU(kLong).copy(min_indices); - cpu_max_indices = at::CPU(kLong).copy(max_indices); + cpu_min_indices = min_indices.to(at::DeviceType::CPU); + cpu_max_indices = max_indices.to(at::DeviceType::CPU); } else { cpu_min_indices = min_indices; cpu_max_indices = max_indices; } auto cpu_min_indices_accessor = cpu_min_indices.accessor(); auto cpu_max_indices_accessor = cpu_max_indices.accessor(); - for (int64_t d = 0; d < sparseDims; d++) { + for (int64_t d = 0; d < sparse_dim; d++) { // NB: This used to sync ndim times to access each entry; now we copy // everything to CPU first and then access it. int64_t min_index_in_dim = cpu_min_indices_accessor[d]; AT_CHECK(min_index_in_dim >= 0, "found negative index ", min_index_in_dim, " for dim ", d); int64_t max_index_in_dim = cpu_max_indices_accessor[d]; - int64_t dim_size = sizes[static_cast(d)]; + int64_t dim_size = size[static_cast(d)]; AT_CHECK(max_index_in_dim < dim_size, - "sizes is inconsistent with indices: for dim ", d, ", size is ", dim_size, " but found index ", max_index_in_dim); + "size is inconsistent with indices: for dim ", d, ", size is ", dim_size, " but found index ", max_index_in_dim); } } - return _new_with_dims_and_tensor_sparse(dtype, sparseDims, denseDims, sizes, indices, values); + + return at::_sparse_coo_tensor_with_dims_and_tensors( + sparse_dim, dense_dim, size, indices, values, values.options().layout(kSparse)); +} + +// NOTE: _sparse_coo_tensor_unsafe() differs from sparse_coo_tensor() +// in that we don't check whether any indices are out of boundaries of `size`, thus avoiding a +// copy from CUDA to CPU. However, this function should ONLY be used where we know that the indices +// are guaranteed to be within bounds. 
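The bounds check above (deliberately skipped by the `_sparse_coo_tensor_unsafe` variant noted here, to avoid a CUDA-to-CPU copy) behaves roughly like this; illustrative sketch:

```python
import torch

i = torch.tensor([[0, 5]])          # index 5 is out of range for a size-4 dimension
v = torch.tensor([1.0, 2.0])
try:
    torch.sparse_coo_tensor(i, v, (4,))
except RuntimeError as e:
    print(e)  # ~ "size is inconsistent with indices: for dim 0, size is 4 but found index 5"

# The _sparse_coo_tensor_unsafe() path skips this validation entirely, which is only
# safe when the caller already guarantees in-bounds indices.
```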
+// NB: Got rid of the size == NULL case +Tensor _sparse_coo_tensor_unsafe(const Tensor& indices, const Tensor& values_, ArrayRef size, const TensorOptions& options) { + Tensor values = expand_values_if_needed(values_); + + // arg checking + AT_CHECK(!options.has_layout() || options.layout() == kSparse, "expected sparse layout, but got layout ", options.layout()); + + int64_t sparse_dim = indices.size(0); + int64_t dense_dim = values.dim() - 1; + + return at::_sparse_coo_tensor_with_dims_and_tensors( + sparse_dim, dense_dim, size, indices, values, values.options().layout(kSparse)); } // NB: Deleted newWithSizeNd variants SparseTensor clone_sparse(const SparseTensor& self) { - SparseTensor other = new_with_dims_and_size_sparse(self._sparseDims(), self._denseDims(), self.sizes(), self.options()); - _copy_into_sparse(other, _get_sparse_impl(self)->indices(), _get_sparse_impl(self)->values(), true); - _get_sparse_impl(other)->set_coalesced(self.is_coalesced()); - return other; + SparseTensor other = new_with_dims_sparse(self.sparse_dim(), self.dense_dim(), self.sizes(), self.options()); + copy_into_sparse(other, self._indices(), self._values(), true); + return other._coalesced_(self.is_coalesced()); } /****************************************************************************** * reshaping methods ******************************************************************************/ -SparseTensor& sparse_resize_(SparseTensor& self, ArrayRef size, int64_t sparseDims, int64_t denseDims) { - _get_sparse_impl(self)->resize_(sparseDims, denseDims, size); +SparseTensor& sparse_resize_(SparseTensor& self, ArrayRef size, int64_t sparse_dim, int64_t dense_dim) { + get_sparse_impl(self)->resize_(sparse_dim, dense_dim, size); return self; } -SparseTensor& sparse_resize_and_clear_(SparseTensor& self, ArrayRef size, int64_t sparseDims, int64_t denseDims) { - _get_sparse_impl(self)->resize_and_clear_(sparseDims, denseDims, size); +SparseTensor& sparse_resize_and_clear_(SparseTensor& self, ArrayRef size, int64_t sparse_dim, int64_t dense_dim) { + get_sparse_impl(self)->resize_and_clear_(sparse_dim, dense_dim, size); return self; } namespace { bool _is_same_size_as_sparse(const SparseTensor& self, const SparseTensor& src) { - return self._sparseDims() == src._sparseDims() && self._denseDims() == src._denseDims() && self.sizes().equals(src.sizes()); + return self.sparse_dim() == src.sparse_dim() && self.dense_dim() == src.dense_dim() && self.sizes().equals(src.sizes()); } } SparseTensor& resize_as_sparse_(SparseTensor& self, const SparseTensor& src) { if (!_is_same_size_as_sparse(self, src)) { - sparse_resize_(self, src.sizes(), src._sparseDims(), src._denseDims()); + sparse_resize_(self, src.sizes(), src.sparse_dim(), src.dense_dim()); } return self; } @@ -257,17 +284,15 @@ SparseTensor& resize_as_sparse_(SparseTensor& self, const SparseTensor& src) { // NB: Dropped the resizeNd variants Tensor sparse_to_dense(const SparseTensor& self) { - Tensor dst = at::zeros(self.sizes(), self.type().toDense()); + Tensor dst = at::zeros(self.sizes(), self.options().layout(kStrided)); return dst.add_(self); } SparseTensor& copy_sparse_(SparseTensor& self, const SparseTensor& src, bool non_blocking) { - if (isSameTensor(self, src)) return self; - _get_sparse_impl(self)->resize_(src._sparseDims(), src._denseDims(), src.sizes()); - // NB: This seems to copy the underlying full indices/values buffer - _copy_into_sparse(self, _get_sparse_impl(src)->indices(), _get_sparse_impl(src)->values(), non_blocking); - 
_get_sparse_impl(self)->set_coalesced(src.is_coalesced()); - return self; + if (is_same_tensor(self, src)) return self; + get_sparse_impl(self)->resize_(src.sparse_dim(), src.dense_dim(), src.sizes()); + copy_into_sparse(self, src._indices(), src._values(), non_blocking); + return self._coalesced_(src.is_coalesced()); } SparseTensor coalesce_sparse_cpu(const SparseTensor& self) { @@ -282,31 +307,24 @@ SparseTensor coalesce_sparse_cpu(const SparseTensor& self) { // we should keep the original tensor intact and do coalesce on a copy of the tensor if (self._nnz() < 2) { SparseTensor dst = self.clone(); - _get_sparse_impl(dst)->set_coalesced(true); + dst._coalesced_(true); return dst; } LongTensor indices = self._indices(); Tensor values = self._values().contiguous(); - int64_t sparseDims = self._sparseDims(); - int64_t denseDims = self._denseDims(); + int64_t sparse_dim = self.sparse_dim(); + int64_t dense_dim = self.dense_dim(); int64_t nnz = self._nnz(); - LongTensor indices_scalar = at::zeros({nnz}, kLong); - - int64_t factor = 1; - for (int64_t d = sparseDims - 1; d >= 0; d--) { - LongTensor indices_slice = indices.select(0, d); - indices_scalar.add_(indices_slice, factor); // cadd is swapped args - factor *= self.size(d); - } + LongTensor indices_scalar = flatten_indices(indices, self.sizes()); SparseTensor dst = new_sparse(self.options()); - _get_sparse_impl(dst)->resize_(sparseDims, denseDims, self.sizes()); + get_sparse_impl(dst)->resize_(sparse_dim, dense_dim, self.sizes()); // TODO: is there a more idiomatic way to do this? LongTensor newIndices = at::empty(indices.sizes(), indices.options()); Tensor newValues = at::empty(values.sizes(), values.options()); - _alias_into_sparse(dst, newIndices, newValues); + alias_into_sparse(dst, newIndices, newValues); LongTensor indicesBuffer; LongTensor indicesPermutation; @@ -333,7 +351,7 @@ SparseTensor coalesce_sparse_cpu(const SparseTensor& self) { } } else { ++i; - for (int64_t d = 0; d < sparseDims; d++) { + for (int64_t d = 0; d < sparse_dim; d++) { newIndicesAccessor[d][i] = indicesAccessor[d][pos]; } if (values.numel() > 0) { // if values is an empty tensor, there are no elements to copy @@ -344,8 +362,8 @@ SparseTensor coalesce_sparse_cpu(const SparseTensor& self) { } }); - _get_sparse_impl(dst)->set_coalesced(true); - _get_sparse_impl(dst)->set_nnz_and_narrow(i + 1); + dst._coalesced_(true); + get_sparse_impl(dst)->set_nnz_and_narrow(i + 1); return dst; } @@ -363,14 +381,15 @@ SparseTensor& sparse_mask_out_cpu(SparseTensor& r, const Tensor& t, const Sparse return r; } int64_t dim = t.dim(); - int64_t sparseDims = mask._sparseDims(); + int64_t sparse_dim = mask.sparse_dim(); LongTensor mask_indices = mask._indices(); Tensor mask_values = mask._values(); Tensor r_values = at::empty(mask_values.sizes(), r._values().options()); - _alias_into_sparse(r, mask_indices.clone(), r_values); - _get_sparse_impl(r)->set_coalesced(mask.is_coalesced()); + alias_into_sparse(r, mask_indices.clone(), r_values); + r._coalesced_(mask.is_coalesced()); int64_t r_nnz = mask._nnz(); - _get_sparse_impl(r)->set_nnz_and_narrow(r_nnz); + get_sparse_impl(r)->set_nnz_and_narrow(r_nnz); + if (t.numel() == 0) { // if t is an empty tensor, there is no need to mask its elements return r; } @@ -378,11 +397,11 @@ SparseTensor& sparse_mask_out_cpu(SparseTensor& r, const Tensor& t, const Sparse // NB: Relies on mask._nnz() == 0 test above auto mask_indices_accessor = mask_indices.accessor(); - if (dim > sparseDims) { + if (dim > sparse_dim) { // NB: This used to reuse 
buffers, but I deoptimized it for (int64_t i = 0; i < r_nnz; i++) { Tensor srcBuffer = t; - for (int64_t d = 0; d < sparseDims; d++) { + for (int64_t d = 0; d < sparse_dim; d++) { srcBuffer = srcBuffer.select(0, mask_indices_accessor[d][i]); } Tensor dstBuffer = r_values.select(0, i); @@ -398,7 +417,7 @@ SparseTensor& sparse_mask_out_cpu(SparseTensor& r, const Tensor& t, const Sparse scalar_t* t_ptr = t.data(); for (int64_t i = 0; i < r_nnz; i++) { int64_t idx = 0; - for (int64_t d = 0; d < sparseDims; d++) { + for (int64_t d = 0; d < sparse_dim; d++) { idx += mask_indices_accessor[d][i] * t.stride(d); } scalar_t val = t_ptr[idx]; diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 053535a976c46a..994a3647d74a12 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -3,12 +3,14 @@ #include #include #include -#include +#include #include namespace at { namespace native { +using namespace at::sparse; + // -------------------------------------------------------------------- // Utility functions // -------------------------------------------------------------------- @@ -43,9 +45,8 @@ namespace { // hummu hummu SparseTensor& zero_sparse_(SparseTensor& self) { AT_ASSERT(self.is_sparse()); - at::zeros_out(self, _get_sparse_impl(self)->sizes()); - _get_sparse_impl(self)->set_coalesced(true); // NB: This is new - return self; + at::zeros_out(self, get_sparse_impl(self)->sizes()); + return self._coalesced_(true); } // NB: Don't need zeros, zeros_like, already implemented in TensorFactories @@ -65,16 +66,17 @@ SparseTensor& mul_out_sparse_zerodim(SparseTensor& r, const SparseTensor& t, con AT_ASSERT(t.is_sparse()); AT_ASSERT(value.dim() == 0); - if (isSameTensor(r, t)) { + if (is_same_tensor(r, t)) { r._values().mul_(value); } else { r.resize_as_(t); - r._indices().resize_as_(t._indices()); - r._indices().copy_(t._indices()); + auto indices = r._indices(); + indices.resize_as_(t._indices()); + indices.copy_(t._indices()); Tensor r_values = r._values(); // Sigh... needed because mul_out takes Tensor& at::mul_out(r_values, t._values(), value); - _get_sparse_impl(r)->set_nnz_and_narrow(t._nnz()); - _get_sparse_impl(r)->set_coalesced(t.is_coalesced()); + get_sparse_impl(r)->set_nnz_and_narrow(t._nnz()); + r._coalesced_(t.is_coalesced()); } return r; } @@ -93,7 +95,7 @@ SparseTensor& log1p_out_sparse(SparseTensor& r, const SparseTensor& t) { AT_ASSERT(r.is_sparse()); AT_ASSERT(t.is_sparse()); - if (isSameTensor(r, t)) { + if (is_same_tensor(r, t)) { // don't have in-place log1p for uncoalesced input because coalesce() is not in-place AT_CHECK( r.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported yet!"); @@ -125,14 +127,13 @@ SparseTensor& pow_out_sparse_scalar(SparseTensor& r, const SparseTensor& t_, Sca SparseTensor t = t_.coalesce(); r.resize_as_(t); - r._indices().resize_as_(t._indices()); - r._indices().copy_(t._indices()); + auto indices = r._indices(); + indices.resize_as_(t._indices()); + indices.copy_(t._indices()); Tensor r_values = r._values(); // Sigh... 
needed because pow_out takes Tensor& at::pow_out(r_values, t._values(), value); - _get_sparse_impl(r)->set_nnz_and_narrow(t._nnz()); - _get_sparse_impl(r)->set_coalesced(t.is_coalesced()); - - return r; + get_sparse_impl(r)->set_nnz_and_narrow(t._nnz()); + return r._coalesced_(t.is_coalesced()); } SparseTensor pow_sparse_scalar(const SparseTensor& t, Scalar value) { @@ -150,16 +151,17 @@ SparseTensor& div_out_sparse_zerodim(SparseTensor& r, const SparseTensor& t, con AT_ASSERT(t.is_sparse()); AT_ASSERT(value.dim() == 0); - if (isSameTensor(r, t)) { + if (is_same_tensor(r, t)) { r._values().div_(value); } else { r.resize_as_(t); - r._indices().resize_as_(t._indices()); - r._indices().copy_(t._indices()); + auto indices = r._indices(); + indices.resize_as_(t._indices()); + indices.copy_(t._indices()); Tensor r_values = r._values(); // Sigh... needed because div_out takes Tensor& at::div_out(r_values, t._values(), value); - _get_sparse_impl(r)->set_nnz_and_narrow(t._nnz()); - _get_sparse_impl(r)->set_coalesced(t.is_coalesced()); + get_sparse_impl(r)->set_nnz_and_narrow(t._nnz()); + r._coalesced_(t.is_coalesced()); } return r; } @@ -199,20 +201,20 @@ SparseTensor& add_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const S return mul_out_sparse_scalar(r, src, value); } - AT_CHECK(_is_same_density(t, src), "add: expected 'self' and 'other' to have same density, but 'self' has ", t._sparseDims(), " sparse dimensions while 'other' has ", src._sparseDims(), " sparse dimensions"); + AT_CHECK(is_same_density(t, src), "add: expected 'self' and 'other' to have same density, but 'self' has ", t.sparse_dim(), " sparse dimensions while 'other' has ", src.sparse_dim(), " sparse dimensions"); // saving those because they can be overwritten when doing in-place operations int64_t t_nnz = t._nnz(), s_nnz = src._nnz(), max_nnz = t_nnz + s_nnz; bool t_coalesced = t.is_coalesced(), s_coalesced = src.is_coalesced(); - int64_t sparseDims = src._sparseDims(); + int64_t sparse_dim = src.sparse_dim(); LongTensor t_indices = t._indices(); Tensor t_values = t._values(); LongTensor src_indices = src._indices(); Tensor s_values = src._values(); - LongTensor r_indices = at::empty({sparseDims, max_nnz}, t_indices.options()); - Tensor r_values = _new_values_with_size_of(s_values, max_nnz).zero_(); + LongTensor r_indices = at::empty({sparse_dim, max_nnz}, t_indices.options()); + Tensor r_values = new_values_with_size_of(s_values, max_nnz).zero_(); r.resize_as_(src); - _get_sparse_impl(r)->set_indices_and_values_unsafe(r_indices, r_values); + get_sparse_impl(r)->set_indices_and_values_unsafe(r_indices, r_values); int64_t blockSize = r_values.stride(0); int64_t cmp, d; @@ -236,7 +238,7 @@ SparseTensor& add_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const S cmp = 1; } else { cmp = 0; - for (d = 0; d < sparseDims; d++) { + for (d = 0; d < sparse_dim; d++) { if (t_indices_accessor[d][t_i] < src_indices_accessor[d][s_i]) { cmp = 1; break; @@ -248,7 +250,7 @@ SparseTensor& add_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const S } } if (cmp >= 0) { - for (d = 0; d < sparseDims; d++) { + for (d = 0; d < sparse_dim; d++) { r_indices_accessor[d][r_i] = t_indices_accessor[d][t_i]; } if (t_values.numel() > 0) { // We add all elements from t_values to r_values only if t_values is not an empty tensor @@ -259,7 +261,7 @@ SparseTensor& add_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const S t_i++; } if (cmp <= 0) { - for (d = 0; d < sparseDims; d++) { + for (d = 0; d < sparse_dim; d++) { 
r_indices_accessor[d][r_i] = src_indices_accessor[d][s_i]; } if (s_values.numel() > 0) { // We add all elements from s_values to r_values only if s_values is not an empty tensor @@ -274,14 +276,12 @@ SparseTensor& add_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const S } ); - _get_sparse_impl(r)->set_nnz_and_narrow(r_i); + get_sparse_impl(r)->set_nnz_and_narrow(r_i); // TODO: I think it may be possible to track inside the loop and // detect when we are uncoalesced (e.g., by observing that an // index goes backwards) which may be more precise than using the // coalesced flag here. But this is easy. - _get_sparse_impl(r)->set_coalesced(t_coalesced && s_coalesced); - - return r; + return r._coalesced_(t_coalesced && s_coalesced); } // -------------------------------------------------------------------- @@ -302,7 +302,7 @@ void add_dense_sparse_worker_cpu(Tensor& r, Scalar value, const SparseTensor& sp #pragma omp parallel for private(k) for (k = 0; k < sparse._nnz(); k++) { int64_t index = r.storage_offset(); - for (int64_t d = 0; d < sparse._sparseDims(); d++) { + for (int64_t d = 0; d < sparse.sparse_dim(); d++) { index += r.stride(d) * indices_accessor[d][k]; } r_ptr[index] += cast_value * values_accessor[k]; @@ -329,9 +329,9 @@ Tensor& add_out_dense_sparse_cpu(Tensor& r, const Tensor& dense, SparseTensorRef LongTensor indices = sparse._indices(); Tensor values = sparse._values(); int64_t nDim = dense.dim(); - int64_t nDimI = sparse._sparseDims(); + int64_t nDimI = sparse.sparse_dim(); - if (!isSameTensor(r, dense)) r.copy_(dense); + if (!is_same_tensor(r, dense)) r.copy_(dense); if (sparse._nnz() == 0) return r; // accessors rely on nnz test @@ -339,7 +339,7 @@ Tensor& add_out_dense_sparse_cpu(Tensor& r, const Tensor& dense, SparseTensorRef auto indices_accessor = indices.accessor(); for (int64_t k = 0; k < sparse._nnz(); k++) { Tensor dstBuffer = r; - for (int64_t d = 0; d < sparse._sparseDims(); d++) { + for (int64_t d = 0; d < sparse.sparse_dim(); d++) { dstBuffer = dstBuffer.select(0, indices_accessor[d][k]); } Tensor srcBuffer = values.select(0, k); @@ -383,15 +383,15 @@ SparseTensor& mul_out_sparse_cpu(SparseTensor& r, const Tensor& t_, const Tensor // saving those because they can be overwritten when doing in-place operations int64_t t_nnz = t._nnz(), s_nnz = src._nnz(); int64_t max_nnz = std::min(t_nnz, s_nnz); // multiply by zero is zero, and can be dropped - int64_t sparseDims = src._sparseDims(); + int64_t sparse_dim = src.sparse_dim(); LongTensor t_indices = t._indices(); Tensor t_values = t._values(); LongTensor src_indices = src._indices(); Tensor s_values = src._values(); - LongTensor r_indices = at::empty({sparseDims, max_nnz}, t_indices.options()); - Tensor r_values = _new_values_with_size_of(t_values, max_nnz).zero_(); + LongTensor r_indices = at::empty({sparse_dim, max_nnz}, t_indices.options()); + Tensor r_values = new_values_with_size_of(t_values, max_nnz).zero_(); r.resize_as_(src); - _get_sparse_impl(r)->set_indices_and_values_unsafe(r_indices, r_values); + get_sparse_impl(r)->set_indices_and_values_unsafe(r_indices, r_values); int64_t match, d; int64_t r_i = 0, t_i = 0, s_i = 0; @@ -406,7 +406,7 @@ SparseTensor& mul_out_sparse_cpu(SparseTensor& r, const Tensor& t_, const Tensor // indices were found. 
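End-to-end, the CPU add kernel above merges the two sorted (coalesced) index lists in a single pass, summing values where the indices match, and marks the result coalesced only when both inputs were. A hedged sketch:

```python
import torch

a = torch.sparse_coo_tensor(torch.tensor([[0, 2]]),
                            torch.tensor([1.0, 2.0]), (4,)).coalesce()
b = torch.sparse_coo_tensor(torch.tensor([[2, 3]]),
                            torch.tensor([10.0, 20.0]), (4,)).coalesce()

c = a + b                       # single merge pass over both index lists on CPU
print(c.coalesce().indices())   # tensor([[0, 2, 3]])
print(c.coalesce().values())    # tensor([ 1., 12., 20.])
```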
auto index_preamble = [&]() { match = 1; - for (d = 0; d < sparseDims; d++) { + for (d = 0; d < sparse_dim; d++) { if (t_indices_accessor[d][t_i] < src_indices_accessor[d][s_i]) { t_i++; match = 0; @@ -419,7 +419,7 @@ SparseTensor& mul_out_sparse_cpu(SparseTensor& r, const Tensor& t_, const Tensor } } if (!match) return false; - for (d = 0; d < sparseDims; d++) { + for (d = 0; d < sparse_dim; d++) { r_indices_accessor[d][r_i] = t_indices_accessor[d][t_i]; } return true; @@ -451,10 +451,8 @@ SparseTensor& mul_out_sparse_cpu(SparseTensor& r, const Tensor& t_, const Tensor ); } - _get_sparse_impl(r)->set_nnz_and_narrow(r_i); - _get_sparse_impl(r)->set_coalesced(true); - - return r; + get_sparse_impl(r)->set_nnz_and_narrow(r_i); + return r._coalesced_(true); } // -------------------------------------------------------------------- @@ -472,7 +470,7 @@ void s_addmm_out_sparse_dense_worker(int64_t nnz, int64_t dim_i, int64_t dim_j, if (cast_beta == 0) { r.zero_(); } else if (cast_beta == 1) { - if (!isSameTensor(r, t)) { + if (!is_same_tensor(r, t)) { r.copy_(t); } } else { @@ -523,8 +521,8 @@ Tensor& s_addmm_out_sparse_dense_cpu( AT_CHECK(!sparse_.is_cuda(), "addmm: expected 'mat1' to be a CPU tensor, but got a CUDA tensor"); AT_CHECK(!dense.is_cuda(), "addmm: expected 'mat2' to be a CPU tensor, but got a CUDA tensor"); - AT_CHECK(sparse_._sparseDims() == 2, "addmm: matrices expected, got ", sparse_._sparseDims(), "D tensor"); - AT_CHECK(sparse_._denseDims() == 0, "addmm: scalar values expected, got ", sparse_._denseDims(), "D values"); + AT_CHECK(sparse_.sparse_dim() == 2, "addmm: matrices expected, got ", sparse_.sparse_dim(), "D tensor"); + AT_CHECK(sparse_.dense_dim() == 0, "addmm: scalar values expected, got ", sparse_.dense_dim(), "D values"); AT_CHECK(dense.dim() == 2, "addmm: matrices expected, got ", dense.dim(), "D tensor"); SparseTensor sparse = sparse_.coalesce(); @@ -599,10 +597,10 @@ SparseTensor& hspmm_out_sparse_cpu(SparseTensor& r, const SparseTensor& sparse_, AT_CHECK(!r.is_cuda(), "hspmm: expected 'out' to be CPU tensor, but got CUDA tensor"); AT_CHECK(!dense.is_cuda(), "hspmm: expected 'other' to be a CPU tensor, but got a CUDA tensor"); - AT_CHECK(sparse_._sparseDims() == 2, - "hspmm: Argument #2: matrices expected, got ", sparse_._sparseDims(), "D tensor"); - AT_CHECK(sparse_._denseDims() == 0, - "hspmm: Argument #2: scalar values expected, got ", sparse_._denseDims(), "D values"); + AT_CHECK(sparse_.sparse_dim() == 2, + "hspmm: Argument #2: matrices expected, got ", sparse_.sparse_dim(), "D tensor"); + AT_CHECK(sparse_.dense_dim() == 0, + "hspmm: Argument #2: scalar values expected, got ", sparse_.dense_dim(), "D values"); AT_CHECK(dense.dim() == 2, "hspmm: Argument #3: matrices expected, got ", dense.dim(), "D tensor"); @@ -613,7 +611,7 @@ SparseTensor& hspmm_out_sparse_cpu(SparseTensor& r, const SparseTensor& sparse_, AT_CHECK(dense.size(0) == k, "hspmm: Argument #3: Expected dim 0 size ", k, ", got ", dense.size(0)); - _get_sparse_impl(r)->raw_resize_(1, 1, {m, n}); + get_sparse_impl(r)->raw_resize_(1, 1, {m, n}); SparseTensor sparse = sparse_.coalesce(); @@ -649,13 +647,13 @@ SparseTensor& hspmm_out_sparse_cpu(SparseTensor& r, const SparseTensor& sparse_, indices.resize_({1, outNnz}); Tensor values = at::empty({outNnz, n}, dense.options()); - std::vector new_size = _get_sparse_impl(newSparse)->sizes().vec(); + std::vector new_size = get_sparse_impl(newSparse)->sizes().vec(); new_size[0] = outNnz; - 
_get_sparse_impl(newSparse)->raw_resize_(_get_sparse_impl(newSparse)->sparseDims(), _get_sparse_impl(newSparse)->denseDims(), new_size); + get_sparse_impl(newSparse)->raw_resize_(get_sparse_impl(newSparse)->sparse_dim(), get_sparse_impl(newSparse)->dense_dim(), new_size); // Compute output values tensor with sparse * dense multiplication s_addmm_out_sparse_dense_cpu(values, values, newSparse, dense, 0, alpha); - _get_sparse_impl(r)->set_indices_and_values_unsafe(indices, values); + get_sparse_impl(r)->set_indices_and_values_unsafe(indices, values); return r; } @@ -683,10 +681,10 @@ SparseTensor& _sspaddmm_out_cpu( AT_CHECK(!sparse_.is_cuda(), "sspaddmm: expected 'mat1' to be a CPU tensor, but got a CUDA tensor"); AT_CHECK(!dense.is_cuda(), "sspaddmm: expected 'mat2' to be a CPU tensor, but got a CUDA tensor"); - AT_CHECK(sparse_._sparseDims() == 2, - "sspaddmm: Argument #2: matrices expected, got ", sparse_._sparseDims(), "D tensor"); - AT_CHECK(sparse_._denseDims() == 0, - "sspaddmm: Argument #2: scalar values expected, got ", sparse_._denseDims(), "D values"); + AT_CHECK(sparse_.sparse_dim() == 2, + "sspaddmm: Argument #2: matrices expected, got ", sparse_.sparse_dim(), "D tensor"); + AT_CHECK(sparse_.dense_dim() == 0, + "sspaddmm: Argument #2: scalar values expected, got ", sparse_.dense_dim(), "D values"); AT_CHECK(dense.dim() == 2, "sspaddmm: Argument #2: matrices expected, got ", dense.dim(), "D tensor"); @@ -699,7 +697,7 @@ SparseTensor& _sspaddmm_out_cpu( // NB: This has to occur before the checks, because r may alias t. // See test_saddmm - _get_sparse_impl(r)->raw_resize_(2, 0, {dim_i, dim_k}); + get_sparse_impl(r)->raw_resize_(2, 0, {dim_i, dim_k}); AT_CHECK(dense.size(0) == dim_j, "sspaddmm: Argument #3: Expected dim 0 size ", dim_j, ", got ", dense.size(0)); @@ -774,8 +772,8 @@ SparseTensor& _sspaddmm_out_cpu( ); // to avoid a clone - _get_sparse_impl(r)->set_indices_and_values_unsafe(newi, newv); - _get_sparse_impl(r)->set_nnz_and_narrow(p); + get_sparse_impl(r)->set_indices_and_values_unsafe(newi, newv); + get_sparse_impl(r)->set_nnz_and_narrow(p); return r; } diff --git a/aten/src/ATen/native/sparse/SparseUtils.h b/aten/src/ATen/native/sparse/SparseUtils.h deleted file mode 100644 index 2e48fe41238c22..00000000000000 --- a/aten/src/ATen/native/sparse/SparseUtils.h +++ /dev/null @@ -1,120 +0,0 @@ -#include -#include - -#include - -namespace at { namespace native { - -// Just for documentary purposes -using SparseTensor = Tensor; -using LongTensor = Tensor; -using IntTensor = Tensor; -using SparseType = Type; - -namespace { - -// This is an internal utility function for getting at the SparseTensorImpl, -// so that we can write sparse tensor specific accessors for special fields -// in SparseTensor. You should only use this for writing low level -// setters/getters for SparseTensorImpl fields; otherwise, you should use -// the low level setters/getters that were implemented using this. -// -// This may be called repeatedly, so make sure it's pretty cheap. -SparseTensorImpl* _get_sparse_impl(const SparseTensor& self) { - if (!self.is_sparse()) AT_ERROR("_internal_get_SparseTensorImpl: not a sparse tensor"); - return static_cast(self.unsafeGetTensorImpl()); -} - -// Port of the old THCSTensor_(checkGPU), but it doesn't really belong here -// because it is more general -// NB: I dropped kernelP2PEnabled support -// NB: This only works if the tensors are KNOWN to be CUDA. 
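hspmm keeps the sparse row structure and densifies the columns, which is why the output above is resized to sparse_dim == 1, dense_dim == 1. An illustrative call:

```python
import torch

i = torch.tensor([[0, 2],
                  [1, 0]])
v = torch.tensor([1.0, 2.0])
s = torch.sparse_coo_tensor(i, v, (3, 4)).coalesce()   # 3 x 4 sparse matrix
d = torch.randn(4, 5)                                  # 4 x 5 dense matrix

h = torch.hspmm(s, d)                                  # hybrid sparse/dense result
print(h.shape)                          # torch.Size([3, 5])
print(h.sparse_dim(), h.dense_dim())    # 1 1
```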
-// TODO: Generalize it so it works on CPU as well -inline bool _check_device(ArrayRef ts) { - if (ts.empty()) { - return true; - } - const Tensor& ref_t = ts.front(); - int64_t curDevice = current_device(); - for (const Tensor& t : ts) { - if (t.get_device() != curDevice) return false; - } - return true; -} - -// Takes indices and values and directly puts them into the sparse tensor, no -// copy. This used to be called THSTensor_(_move) -inline void _alias_into_sparse(const SparseTensor& self, const LongTensor& indices, const Tensor& values) { - _get_sparse_impl(self)->set_indices_and_values_unsafe(indices, values); -} - -// Take indices and values and makes a (data) copy of them to put into the sparse -// indices/values. This used to be called THSTensor_(_set) -inline void _copy_into_sparse(const SparseTensor& self, const LongTensor& indices, const Tensor& values, bool non_blocking) { - _alias_into_sparse(self, self._indices().type().copy(indices, non_blocking), self._values().type().copy(values, non_blocking)); -} - -// Does NOT make copies of indices/values -inline SparseTensor _new_with_dims_and_tensor_sparse( - const SparseType& dtype, - int64_t sparseDims, - int64_t denseDims, - ArrayRef sizes, - const LongTensor& indices, - const Tensor& values) { - SparseTensor self = at::empty({0}, dtype.options()); - _get_sparse_impl(self)->resize_(sparseDims, denseDims, sizes); - _alias_into_sparse(self, indices, values); - return self; -} - -// TODO: put this into the public API -inline bool isSameTensor(const Tensor& lhs, const Tensor& rhs) { - return lhs.unsafeGetTensorImpl() == rhs.unsafeGetTensorImpl(); -} - -inline bool _is_same_density(const SparseTensor& self, const SparseTensor& src) { - return self._sparseDims() == src._sparseDims() && self._denseDims() == src._denseDims(); -} - -// if forceClone is true, the result will forced to be a clone of self. -inline LongTensor _newFlattenedIndices(const SparseTensor& self, bool forceClone) { - LongTensor indices = self._indices(); - int64_t sparseDims = self._sparseDims(); - if (sparseDims == 1) { - if (forceClone) { - return indices.clone(); - } else { - return indices; - } - } else { - // FIXME TH_INDEX_BASE - int64_t factor = 1; - LongTensor indices1D = at::empty({1, self._nnz()}, indices.options()); - indices1D.fill_(TH_INDEX_BASE); - for (int64_t d = sparseDims - 1; d >= 0; d--) { - indices1D.add_(indices.select(0, d), factor); - if (TH_INDEX_BASE != 0) { - indices1D.add_(-TH_INDEX_BASE); - } - factor *= self.size(d); - } - return indices1D; - } -} - -// Give us a new values tensor, with the same dimensionality -// as 'values' but with a new number of non-zero elements. -// TODO: Expose this for real in ATen, some day? -// NB: Doesn't preserve data. -inline Tensor _new_values_with_size_of(const Tensor& values, int64_t nnz) { - std::vector size = values.sizes().vec(); - size[0] = nnz; - return at::empty(size, values.options()); -} - - - -} // anonymous namespace - -}} // namespace at::native diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh b/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh index 44bd3ab8f7060b..3de5ced0edfaf4 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh @@ -38,6 +38,8 @@ __device__ void applyOp3( } } +// Assume both dense and values are contiguous. +// Currently only used in add_out_dense_sparse_cuda: add(dense, sparse, scalar). 
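The helpers deleted from this header move into ATen/SparseTensorUtils.h under the at::sparse namespace (for example, `_newFlattenedIndices` becomes `at::sparse::flatten_indices`). The flattening itself is just a row-major linearization of the `(sparse_dim, nnz)` index matrix; a standalone sketch of that arithmetic (the Python function name is only a stand-in):

```python
import torch

def flatten_indices_sketch(indices, size):
    # Row-major linearization of a (sparse_dim, nnz) index matrix, mirroring
    # what the consolidated at::sparse::flatten_indices helper computes.
    flat = torch.zeros(indices.size(1), dtype=torch.long)
    for d in range(indices.size(0)):
        flat = flat * size[d] + indices[d]
    return flat

i = torch.tensor([[0, 1, 1],
                  [2, 0, 3]])
print(flatten_indices_sketch(i, (2, 4)))   # tensor([2, 4, 7])
```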
template __global__ void sparseElementwiseKernel( Op op, @@ -45,23 +47,26 @@ __global__ void sparseElementwiseKernel( TensorInfo indices, TensorInfo values, const IndexType nnz) { - IndexType indskip = indices.strides[0]; - IndexType valueSize = values.strides[0]; + IndexType ind_skip = indices.strides[0]; + IndexType ind_nnz_skip = indices.strides[1]; + IndexType value_size = values.strides[0]; // numel of each slice in values for (IndexType linearId = blockIdx.x; linearId < nnz; linearId += gridDim.x) { IndexType index = 0; for (IndexType d = 0; d < indices.sizes[0]; d++) { - index = dense.sizes[d] * index + indices.data[d * indskip + linearId]; + index = dense.sizes[d] * index + indices.data[d * ind_skip + linearId * ind_nnz_skip]; } - Real *dst = dense.data + index * valueSize; - Real *src = values.data + linearId * valueSize; - for (IndexType linearId2 = threadIdx.x; linearId2 < valueSize; linearId2 += blockDim.x) { + Real *dst = dense.data + index * value_size; + Real *src = values.data + linearId * value_size; + for (IndexType linearId2 = threadIdx.x; linearId2 < value_size; linearId2 += blockDim.x) { op(dst + linearId2, src + linearId2); } } } +// Assume dense is contiguous. +// Currently only used in add_out_dense_sparse_cuda: add(dense, sparse, scalar). template __global__ void sparseElementwiseKernelScalar( Op op, @@ -69,15 +74,17 @@ __global__ void sparseElementwiseKernelScalar( TensorInfo indices, TensorInfo values, const IndexType nnz) { - IndexType indskip = indices.strides[0]; + IndexType ind_skip = indices.strides[0]; + IndexType ind_nnz_skip = indices.strides[1]; + IndexType value_skip = values.strides[0]; for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x; linearId < nnz; linearId += gridDim.x * blockDim.x) { IndexType index = 0; for (IndexType d = 0; d < indices.sizes[0]; d++) { - index = dense.sizes[d] * index + indices.data[d * indskip + linearId]; + index = dense.sizes[d] * index + indices.data[d * ind_skip + linearId * ind_nnz_skip]; } - op(dense.data + index, values.data + linearId); + op(dense.data + index, values.data + linearId * value_skip); } } diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp index ab9fb15c628735..1da0345e3c36f2 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp @@ -1,18 +1,20 @@ #include #include -#include +#include namespace at { namespace native { +using namespace at::sparse; + SparseTensor& sparse_mask_out_cuda(SparseTensor& r, const Tensor& t, const SparseTensor& mask) { AT_CHECK(mask.is_coalesced(), "sparse_mask: mask is uncoalesced"); AT_CHECK(mask.sizes().equals(t.sizes()), "sparse_mask: operands have incompatible sizes; self has size ", t.sizes(), " but mask has size ", mask.sizes()); - AT_ASSERT(t.is_cuda()); // dispatch argument + AT_ASSERT(t.is_cuda()); // dispatch argument AT_CHECK(mask.is_cuda(), "sparse_mask: expected 'mask' to be CUDA, but got CPU"); AT_CHECK(r.is_cuda(), "sparse_mask: expected 'out' to be CUDA, but got CPU"); - AT_CHECK(_check_device({r, t, mask}), + AT_CHECK(check_device({r, t, mask}), "sparse_mask: arguments are located on different devices; self is on device ", t.get_device(), ", mask is on device ", mask.get_device(), ", out is on device ", r.get_device()); resize_as_sparse_(r, mask); @@ -22,25 +24,24 @@ SparseTensor& sparse_mask_out_cuda(SparseTensor& r, const Tensor& t, const Spars LongTensor mask_indices = mask._indices(); Tensor mask_values = 
mask._values(); Tensor r_values = at::empty(mask_values.sizes(), r._values().options()); - _alias_into_sparse(r, mask_indices.clone(), r_values); - _get_sparse_impl(r)->set_coalesced(mask.is_coalesced()); - _get_sparse_impl(r)->set_nnz_and_narrow(mask._nnz()); + alias_into_sparse(r, mask_indices.clone(), r_values); + r._coalesced_(mask.is_coalesced()); if (t.numel() == 0) { // if t is an empty tensor, there is no need to mask its elements return r; } LongTensor indices = at::zeros({mask._nnz()}, mask_indices.options()); - for (int64_t d = 0; d < mask._sparseDims(); d++) { + for (int64_t d = 0; d < mask.sparse_dim(); d++) { indices.mul_(mask.size(d)); // This used to use a buffer but I deoptimized it indices.add_(mask_indices.select(0, d)); } - std::vector view_size(1 + mask._denseDims()); + std::vector view_size(1 + mask.dense_dim()); view_size[0] = -1; - for (int64_t d = 0; d < mask._denseDims(); d++) { - view_size[d + 1] = mask.size(mask._sparseDims() + d); + for (int64_t d = 0; d < mask.dense_dim(); d++) { + view_size[d + 1] = mask.size(mask.sparse_dim() + d); } Tensor t_view = t.view(view_size); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu index 23cc9a944a35c1..401d3833fd4c38 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include #include @@ -24,6 +24,8 @@ namespace at { namespace native { +using namespace at::sparse; + SparseTensor coalesce_sparse_cuda(const SparseTensor& self) { int64_t nnz = self._nnz(); if (self.is_coalesced()) { @@ -33,7 +35,7 @@ SparseTensor coalesce_sparse_cuda(const SparseTensor& self) { // we should keep the original tensor intact and do coalesce on a copy of the tensor if (nnz < 2) { SparseTensor dst = self.clone(); - _get_sparse_impl(dst)->set_coalesced(true); + dst._coalesced_(true); return dst; } @@ -45,15 +47,13 @@ SparseTensor coalesce_sparse_cuda(const SparseTensor& self) { // For indices, a simple sort + unique suffices // For values, we use a custom kernel for segmented reduction (can't use Thrust due to indirection). - // TODO: I'm not sure if this could ever be non-contiguous - LongTensor values = self._values().contiguous(); + Tensor values = self._values(); - int64_t sparseDims = self._sparseDims(); - int64_t stride = values.stride(0); + int64_t sparse_dim = self.sparse_dim(); // indices will be modified by Thrust, so we have to clone or use new storage // here. - LongTensor indices1D = _newFlattenedIndices(self, true); + LongTensor indices1D = flatten_indices(self._indices(), self.sizes(), true); LongTensor origIndices = at::empty({nnz}, self._indices().options()); LongTensor uniqueOffsets = at::empty({nnz}, self._indices().options()); @@ -88,21 +88,26 @@ SparseTensor coalesce_sparse_cuda(const SparseTensor& self) { newValues_size[0] = newNnz; Tensor newValues = at::empty(newValues_size, values.options()); - dim3 grid(THCCeilDiv(newNnz, (int64_t) 4), THCCeilDiv(stride, (int64_t) 128)); - dim3 block(32, 4); - AT_DISPATCH_ALL_TYPES_AND_HALF( - values.type(), "coalesce_sparse_cuda", [&] { - using cuda_accscalar_t = acc_type; - apply::coalesceValuesKernel<<>>( - uniqueOffsets.data(), - origIndices.data(), - values.data(), - newValues.data(), - nnz, - newNnz, - stride - ); - }); + // If there is no values to copy, save running the kernel. 
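For reference, the user-visible contract of the sparse_mask path touched above: the mask must be coalesced, and the result takes the mask's indices with the dense input's values at those positions. Illustrative sketch:

```python
import torch

t = torch.arange(12.0).reshape(3, 4)                    # dense source
mask = torch.sparse_coo_tensor(torch.tensor([[0, 2],
                                             [1, 3]]),
                               torch.zeros(2), (3, 4)).coalesce()

r = t.sparse_mask(mask)        # sparse result: mask's indices, t's values
print(r.coalesce().indices())  # tensor([[0, 2], [1, 3]])
print(r.coalesce().values())   # tensor([ 1., 11.])
```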
+ if (newValues.numel() > 0) { + values = values.contiguous(); + int64_t stride = at::prod_intlist(values.sizes().slice(1)); + dim3 grid(THCCeilDiv(newNnz, (int64_t) 4), THCCeilDiv(stride, (int64_t) 128)); + dim3 block(32, 4); + AT_DISPATCH_ALL_TYPES_AND_HALF( + values.type(), "coalesce_sparse_cuda", [&] { + using cuda_accscalar_t = acc_type; + apply::coalesceValuesKernel<<>>( + uniqueOffsets.data(), + origIndices.data(), + values.data(), + newValues.data(), + nnz, + newNnz, + stride + ); + }); + } // this grid-strided version is slower but probably more flexible // to different sizes @@ -122,14 +127,14 @@ SparseTensor coalesce_sparse_cuda(const SparseTensor& self) { //////////////////////////////////////////////////////////// // unflatten indices if necessary LongTensor newIndices; - if (sparseDims == 1) { + if (sparse_dim == 1) { newIndices = indices1D; } else { - newIndices = at::empty({sparseDims, newNnz}, origIndices.options()); + newIndices = at::empty({sparse_dim, newNnz}, origIndices.options()); if (TH_INDEX_BASE != 0) { indices1D.add_(-1); } - for (int64_t d = sparseDims - 1; d >= 0; d--) { + for (int64_t d = sparse_dim - 1; d >= 0; d--) { // NB: Not a select, so I can preserve the outer dimension LongTensor indicesSlice = newIndices.narrow(0, d, 1); // Note for the porting guide: THCTensor_(copy) does NOT do normal @@ -145,8 +150,7 @@ SparseTensor coalesce_sparse_cuda(const SparseTensor& self) { } //////////////////////////////////////////////////////////// - SparseTensor dst = ::at::native::sparse_coo_tensor(newIndices, newValues, self.sizes()); - _get_sparse_impl(dst)->set_coalesced(true); + SparseTensor dst = ::at::native::sparse_coo_tensor(newIndices, newValues, self.sizes())._coalesced_(true); THCudaCheck(cudaGetLastError()); return dst; diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index d655d3e93289ec..eef84007ec3c02 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include #include @@ -18,6 +18,8 @@ namespace at { namespace native { +using namespace at::sparse; + // -------------------------------------------------------------------- // Utility functions // -------------------------------------------------------------------- @@ -45,12 +47,11 @@ Tensor& s_addmm_out_sparse_dense_cuda(Tensor& r_, const Tensor& t, const SparseT AT_CHECK(sparse_.is_cuda(), "addmm: expected 'mat1' to be CUDA, but got CPU"); AT_CHECK(dense.is_cuda(), "addmm: expected 'mat2' to be CUDA, but got CPU"); - AT_CHECK(_check_device({sparse_, r_, t, dense})); + AT_CHECK(check_device({sparse_, r_, t, dense})); - // TODO: This error message seems awfully opaque - AT_CHECK(sparse_._sparseDims() == 2, "addmm: 2D tensor expected, got ", sparse_._sparseDims(), "D tensor"); - AT_CHECK(sparse_._denseDims() == 0, "addmm: scalar values expected, got ", sparse_._denseDims(), "D values"); AT_CHECK(dense.dim() == 2, "addmm: 2D tensor expected, got ", dense.dim(), "D tensor"); + AT_CHECK(sparse_.sparse_dim() == 2, "addmm: expected first two dims to be sparse (indices has size 2 at first dim), but got ", sparse_.sparse_dim(), " spase dims"); + // no need to check dense_dim because dense_dim + sparse_dim = dim // mxk * kxn = mxn int64_t m = sparse_.size(0); @@ -87,7 +88,7 @@ Tensor& s_addmm_out_sparse_dense_cuda(Tensor& r_, const Tensor& t, const SparseT if (cast_beta == 0) { 
r_.zero_(); } else if (cast_beta == 1) { - if (!isSameTensor(t, r_)) { + if (!is_same_tensor(t, r_)) { r_.copy_(t); } } else { @@ -174,12 +175,12 @@ SparseTensor& hspmm_out_sparse_cuda(SparseTensor& r_, const SparseTensor& sparse AT_CHECK(r_.is_cuda(), "hspmm: expected 'out' to be CUDA, but got CPU"); AT_CHECK(dense.is_cuda(), "hspmm: expected 'mat2' to be CUDA, but got CPU"); - AT_CHECK(_check_device({r_, sparse_, dense})); + AT_CHECK(check_device({r_, sparse_, dense})); - AT_CHECK(sparse_._sparseDims() == 2, - "hspmm: Argument #2: 2D tensor expected, got ", sparse_._sparseDims(), "D tensor"); - AT_CHECK(sparse_._denseDims() == 0, - "hspmm: Argument #2: scalar values expected, got ", sparse_._denseDims(), "D values"); + AT_CHECK(sparse_.sparse_dim() == 2, + "hspmm: Argument #2: 2D tensor expected, got ", sparse_.sparse_dim(), "D tensor"); + AT_CHECK(sparse_.dense_dim() == 0, + "hspmm: Argument #2: scalar values expected, got ", sparse_.dense_dim(), "D values"); AT_CHECK(dense.dim() == 2, "hspmm: Argument #3: 2D tensor expected, got ", dense.dim(), "D tensor"); @@ -190,7 +191,7 @@ SparseTensor& hspmm_out_sparse_cuda(SparseTensor& r_, const SparseTensor& sparse AT_CHECK(dense.size(0) == k, "hspmm: Argument #3: Expected dim 0 size ", k, ", got ", dense.size(0)); - _get_sparse_impl(r_)->resize_and_clear_(1, 1, {m, n}); + get_sparse_impl(r_)->resize_and_clear_(1, 1, {m, n}); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); @@ -217,12 +218,12 @@ SparseTensor& hspmm_out_sparse_cuda(SparseTensor& r_, const SparseTensor& sparse thrust::device_ptr indicesIter(dstIndices.data()); thrust::sequence(policy, indicesIter, indicesIter + nnz); - std::vector new_size = _get_sparse_impl(newSparse)->sizes().vec(); + std::vector new_size = get_sparse_impl(newSparse)->sizes().vec(); new_size[0] = nnz; - _get_sparse_impl(newSparse)->raw_resize_(_get_sparse_impl(newSparse)->sparseDims(), _get_sparse_impl(newSparse)->denseDims(), new_size); + get_sparse_impl(newSparse)->raw_resize_(get_sparse_impl(newSparse)->sparse_dim(), get_sparse_impl(newSparse)->dense_dim(), new_size); s_addmm_out_sparse_dense_cuda(values, values, newSparse, dense, 0, /*alpha*/ 1); - _get_sparse_impl(r_)->set_indices_and_values_unsafe(indices, values); + get_sparse_impl(r_)->set_indices_and_values_unsafe(indices, values); return r_; } @@ -245,7 +246,7 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorR AT_CHECK(sparse.is_cuda(), "add: expected 'other' to be CUDA, but got CPU"); AT_CHECK(r_.is_cuda(), "add: expected 'out' to be CUDA, but got CPU"); - AT_CHECK(_check_device({sparse, r_, dense})); + AT_CHECK(check_device({sparse, r_, dense})); AT_CHECK(dense.sizes().equals(sparse.sizes()), "add: expected 'self' and 'other' to have same size, but self has size ", dense.sizes(), " while other has size ", sparse.sizes(), " (FYI: dense-sparse addition does not currently support broadcasting)"); @@ -258,7 +259,7 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorR } Tensor r = r_; - if (!isSameTensor(r, dense)) { + if (!is_same_tensor(r, dense)) { r_.resize_as_(dense); r_.copy_(dense); } else { @@ -269,7 +270,7 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorR LongTensor indices = sparse._indices(); Tensor values = sparse._values(); int64_t nDim = dense.dim(); - int64_t nDimI = sparse._sparseDims(); + int64_t nDimI = sparse.sparse_dim(); if (sparse._values().numel() == 0) { return 
r_; @@ -282,7 +283,7 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorR int curDevice = -1; cudaGetDevice(&curDevice); cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice); - if (sparse._denseDims() == 0) { + if (sparse.dense_dim() == 0) { AT_CHECK(cuda::getApplyGrid(nnz, grid, curDevice), "add: Argument #0: tensor too large or too many dimensions"); AT_DISPATCH_ALL_TYPES_AND_HALF( @@ -296,6 +297,9 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorR } else { AT_CHECK(cuda::getApplyGrid(nnz * block.x, grid, curDevice), "add: Argument #0: tensor too large or too many dimensions"); + // sparseElementwiseKernel needs values to be contiguous too + values = values.contiguous(); + AT_DISPATCH_ALL_TYPES_AND_HALF( values.type(), "add_out_dense_sparse_cuda", [&] { apply::sparseElementwiseKernel, uint64_t, scalar_t> @@ -306,7 +310,7 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorR }); } } else { - LongTensor indices1D = _newFlattenedIndices(sparse, 0).squeeze_(0).narrow(0, 0, nnz); + LongTensor indices1D = flatten_indices(indices, sparse.sizes(), 0); // FIXME: at some point we can wrap the scale into indexAdd // NB: Purposely not inplace! @@ -327,7 +331,7 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorR } Tensor r_view = r.view({view_rows, view_columns}); - values = values.narrow(0, 0, nnz).reshape({nnz, view_columns}); + values = values.reshape({nnz, view_columns}); r_view.index_add_(0, indices1D, values); } THCudaCheck(cudaGetLastError()); @@ -344,7 +348,7 @@ SparseTensor& add_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t, const AT_CHECK(src.is_cuda(), "add: expected 'other' to be CUDA, but got CPU"); AT_CHECK(r_.is_cuda(), "add: expected 'out' to be CUDA, but got CPU"); - AT_CHECK(_check_device({r_, t, src})); + AT_CHECK(check_device({r_, t, src})); AT_CHECK(t.sizes().equals(src.sizes()), "add: expected 'self' and 'other' to have same size, but ", t.sizes(), " != ", src.sizes()); if (src._nnz() == 0) { @@ -354,7 +358,7 @@ SparseTensor& add_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t, const return mul_out_sparse_scalar(r_, src, value); } - AT_CHECK(_is_same_density(t, src), "add: expected 'self' and 'other' to have same density, but 'self' has ", t._sparseDims(), " sparse dimensions while 'other' has ", src._sparseDims(), " sparse dimensions"); + AT_CHECK(is_same_density(t, src), "add: expected 'self' and 'other' to have same density, but 'self' has ", t.sparse_dim(), " sparse dimensions while 'other' has ", src.sparse_dim(), " sparse dimensions"); // We deliberately choose to simply concat the indices and values tensors // rather than merging them. This removes the need to synchronously fetch nnz @@ -375,7 +379,7 @@ SparseTensor& add_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t, const LongTensor r_indices_ = at::cat({t_indices_, s_indices_}, 1); Tensor r_values_ = at::cat({t_values_, s_values_}, 0); r_.resize_as_(src); - _alias_into_sparse(r_, r_indices_, r_values_); + alias_into_sparse(r_, r_indices_, r_values_); // FIXME: add some heuristic about when to call coalesce() here, so that // tensors don't totally blow up in size by concatenation; e.g. 
@@ -402,7 +406,7 @@ SparseTensor& mul_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t_, cons AT_ASSERT(t_.is_cuda()); // dispatch argument AT_CHECK(src_.is_cuda(), "mul: expected 'other' to be CUDA, but got CPU"); AT_CHECK(r_.is_cuda(), "mul: expected 'out' to be CUDA, but got CPU"); - AT_CHECK(_check_device({r_, t_, src_})); + AT_CHECK(check_device({r_, t_, src_})); AT_CHECK(t_.sizes().equals(src_.sizes()), "mul: expected 'self' and 'other' to have same size, but ", t_.sizes(), " != ", src_.sizes()); SparseTensor t = t_.coalesce(); @@ -416,15 +420,15 @@ SparseTensor& mul_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t_, cons // saving those because they can be overwritten when doing in-place operations int64_t t_nnz = t._nnz(), s_nnz = src._nnz(); int64_t max_nnz = std::min(t_nnz, s_nnz); // multiply by zero is zero, and can be dropped - int64_t sparseDims = src._sparseDims(); + int64_t sparse_dim = src.sparse_dim(); LongTensor t_indices_ = t._indices(); Tensor t_values_ = t._values(); LongTensor s_indices_ = src._indices(); Tensor s_values_ = src._values(); - LongTensor r_indices_ = at::empty({sparseDims, max_nnz}, t_indices_.options()); - Tensor r_values_ = _new_values_with_size_of(t_values_, max_nnz).zero_(); + LongTensor r_indices_ = at::empty({sparse_dim, max_nnz}, t_indices_.options()); + Tensor r_values_ = new_values_with_size_of(t_values_, max_nnz).zero_(); r_.resize_as_(src); - _get_sparse_impl(r_)->set_indices_and_values_unsafe(r_indices_, r_values_); + get_sparse_impl(r_)->set_indices_and_values_unsafe(r_indices_, r_values_); int64_t valueSize = t_values_.stride(0); const dim3 block = dim3(std::min(static_cast(cuda::getApplyBlock().x), valueSize)); @@ -457,10 +461,9 @@ SparseTensor& mul_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t_, cons // sync! (surely there is a more idiomatic way to do this...) 
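Like its CPU counterpart earlier in this patch, the CUDA sparse-sparse mul kernel intersects the two coalesced index lists, so only positions present in both operands survive (anything multiplied by an implicit zero is dropped). Sketch:

```python
import torch

a = torch.sparse_coo_tensor(torch.tensor([[0, 1, 3]]),
                            torch.tensor([2.0, 3.0, 4.0]), (5,))
b = torch.sparse_coo_tensor(torch.tensor([[1, 3, 4]]),
                            torch.tensor([10.0, 10.0, 10.0]), (5,))

c = (a * b).coalesce()      # only shared indices remain
print(c.indices())          # tensor([[1, 3]])
print(c.values())           # tensor([30., 40.])
```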
LongTensor cpu_resultNnz = at::empty({1}, CPU(kLong)); cpu_resultNnz.copy_(resultNnz); - _get_sparse_impl(r_)->set_nnz_and_narrow(cpu_resultNnz.accessor()[0]); - _get_sparse_impl(r_)->set_coalesced(true); + get_sparse_impl(r_)->set_nnz_and_narrow(cpu_resultNnz.accessor()[0]); - return r_; + return r_._coalesced_(true); } }} // namespace at::native diff --git a/aten/src/ATen/native_parse.py b/aten/src/ATen/native_parse.py index 7f34b48e0228f0..3368a9f10178f0 100644 --- a/aten/src/ATen/native_parse.py +++ b/aten/src/ATen/native_parse.py @@ -129,6 +129,7 @@ def run(paths): output_arguments = [x for x in arguments if x.get('output')] declaration['return'] = return_type if len(output_arguments) == 0 else output_arguments declaration['variants'] = func.get('variants', ['function']) + declaration['requires_tensor'] = func.get('requires_tensor', False) declaration['cpu_half'] = func.get('cpu_half', False) declaration['deprecated'] = func.get('deprecated', False) declaration['device_guard'] = func.get('device_guard', True) diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index 76ad9465d6e324..c0fb9ca80bdac8 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -13,6 +13,7 @@ #include "ATen/Tensor.h" #include "ATen/core/TensorOptions.h" #include "ATen/DeviceGuard.h" +#include "ATen/SparseTensorUtils.h" namespace at { diff --git a/aten/src/TH/generic/THTensorEvenMoreMath.cpp b/aten/src/TH/generic/THTensorEvenMoreMath.cpp index 7efee5ee8d1be2..ad6f995b68e962 100644 --- a/aten/src/TH/generic/THTensorEvenMoreMath.cpp +++ b/aten/src/TH/generic/THTensorEvenMoreMath.cpp @@ -12,9 +12,9 @@ void THTensor_(fill)(THTensor *r_, scalar_t value) TH_TENSOR_APPLY(scalar_t, r_, if (r__stride == 1) { THVector_(fill)(r__data, value, r__size); - r__i = r__size; - r__data += r__stride * r__size; - break; + r__i = r__size; + r__data += r__stride * r__size; + break; } else { *r__data = value; } @@ -181,7 +181,7 @@ void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTens tensor_data = tensor->data(); src_data = src->data(); auto src_size0 = THTensor_sizeLegacyNoScalars(src, 0); - ptrdiff_t rowsize = src_size0 == 0 ? 1: THTensor_(nElement)(src) / src_size0; + ptrdiff_t rowsize = src_size0 == 0 ? 1 : THTensor_(nElement)(src) / src_size0; // check that the indices are within range int64_t max = src_size0 - 1 + TH_INDEX_BASE; @@ -192,14 +192,18 @@ void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTens } } - if (src->dim() <= 1) { - #pragma omp parallel for if(numel > TH_OMP_OVERHEAD_THRESHOLD) private(i) - for (i=0; i TH_OMP_OVERHEAD_THRESHOLD) private(i) - for (i=0; i 0) { + if (src->dim() <= 1) { + #pragma omp parallel for if(numel > TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; i TH_OMP_OVERHEAD_THRESHOLD) private(i) + for (i=0; idim() <= 1) diff --git a/test/common_utils.py b/test/common_utils.py index 414c6dd8aae17e..377522b677fd1e 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -290,6 +290,33 @@ def assertTensorsSlowEqual(self, x, y, prec=None, message=''): max_err = max(max_err, abs(x[index] - y[index])) self.assertLessEqual(max_err, prec, message) + def genSparseTensor(self, size, sparse_dim, nnz, is_uncoalesced, device='cpu'): + # Assert not given impossible combination, where the sparse dims have + # empty numel, but nnz > 0 makes the indices containing values. 
+ assert all(size[d] > 0 for d in range(sparse_dim)) or nnz == 0, 'invalid arguments' + + v_size = [nnz] + list(size[sparse_dim:]) + v = torch.randn(*v_size, device=device) + i = torch.rand(sparse_dim, nnz, device=device) + i.mul_(torch.tensor(size[:sparse_dim]).unsqueeze(1).to(i)) + i = i.to(torch.long) + if is_uncoalesced: + v = torch.cat([v, torch.randn_like(v)], 0) + i = torch.cat([i, i], 1) + + x = torch.sparse_coo_tensor(i, v, torch.Size(size)) + + if not is_uncoalesced: + x = x.coalesce() + else: + # FIXME: `x` is a sparse view of `v`. Currently rebase_history for + # sparse views is not implemented, so this workaround is + # needed for inplace operations done on `x`, e.g., copy_(). + # Remove after implementing something equivalent to CopySlice + # for sparse views. + x = x.detach() + return x, x._indices().clone(), x._values().clone() + def safeToDense(self, t): r = self.safeCoalesce(t) return r.to_dense() diff --git a/test/expect/TestCudaSparse.test_print.expect b/test/expect/TestCudaSparse.test_print.expect index cae534bd16e8d6..dd924a8385cc1c 100644 --- a/test/expect/TestCudaSparse.test_print.expect +++ b/test/expect/TestCudaSparse.test_print.expect @@ -1,6 +1,6 @@ # shape: torch.Size([]) # nnz: 2 -# sparseDim: 0 +# sparse_dim: 0 # indices shape: torch.Size([0, 2]) # values shape: torch.Size([2]) ########## torch.int32 ########## @@ -30,15 +30,13 @@ tensor(indices=tensor([], size=(0, 4)), device='cuda:0', size=(), nnz=4, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([], device='cuda:0', size=(0, 2), dtype=torch.int64, - grad_fn=) +tensor([], device='cuda:0', size=(0, 2), dtype=torch.int64) # _values -tensor([0., 1.], device='cuda:0', dtype=torch.float32, - grad_fn=) +tensor([0., 1.], device='cuda:0', dtype=torch.float32) # shape: torch.Size([0]) # nnz: 10 -# sparseDim: 0 +# sparse_dim: 0 # indices shape: torch.Size([0, 10]) # values shape: torch.Size([10, 0]) ########## torch.int32 ########## @@ -68,15 +66,13 @@ tensor(indices=tensor([], size=(0, 20)), device='cuda:0', size=(0,), nnz=20, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([], device='cuda:0', size=(0, 10), dtype=torch.int64, - grad_fn=) +tensor([], device='cuda:0', size=(0, 10), dtype=torch.int64) # _values -tensor([], device='cuda:0', size=(10, 0), dtype=torch.float32, - grad_fn=) +tensor([], device='cuda:0', size=(10, 0), dtype=torch.float32) # shape: torch.Size([2]) # nnz: 3 -# sparseDim: 0 +# sparse_dim: 0 # indices shape: torch.Size([0, 3]) # values shape: torch.Size([3, 2]) ########## torch.int32 ########## @@ -119,17 +115,15 @@ tensor(indices=tensor([], size=(0, 6)), device='cuda:0', size=(2,), nnz=6, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([], device='cuda:0', size=(0, 3), dtype=torch.int64, - grad_fn=) +tensor([], device='cuda:0', size=(0, 3), dtype=torch.int64) # _values tensor([[0.0000, 0.3333], [0.6667, 1.0000], - [1.3333, 1.6667]], device='cuda:0', dtype=torch.float32, - grad_fn=) + [1.3333, 1.6667]], device='cuda:0', dtype=torch.float32) # shape: torch.Size([100, 3]) # nnz: 3 -# sparseDim: 1 +# sparse_dim: 1 # indices shape: torch.Size([1, 3]) # values shape: torch.Size([3, 3]) ########## torch.int32 ########## @@ -172,16 +166,15 @@ tensor(indices=tensor([[0, 1, 2, 0, 1, 2]]), device='cuda:0', size=(100, 3), nnz=6, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([[0, 1, 2]], device='cuda:0', grad_fn=) +tensor([[0, 1, 2]], device='cuda:0') # _values tensor([[0.0000, 0.2222, 0.4444], 
[0.6667, 0.8889, 1.1111], - [1.3333, 1.5556, 1.7778]], device='cuda:0', dtype=torch.float32, - grad_fn=) + [1.3333, 1.5556, 1.7778]], device='cuda:0', dtype=torch.float32) # shape: torch.Size([100, 20, 3]) # nnz: 0 -# sparseDim: 2 +# sparse_dim: 2 # indices shape: torch.Size([2, 0]) # values shape: torch.Size([0, 3]) ########## torch.int32 ########## @@ -211,15 +204,13 @@ tensor(indices=tensor([], size=(2, 0)), device='cuda:0', size=(100, 20, 3), nnz=0, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([], device='cuda:0', size=(2, 0), dtype=torch.int64, - grad_fn=) +tensor([], device='cuda:0', size=(2, 0), dtype=torch.int64) # _values -tensor([], device='cuda:0', size=(0, 3), dtype=torch.float32, - grad_fn=) +tensor([], device='cuda:0', size=(0, 3), dtype=torch.float32) # shape: torch.Size([10, 0, 3]) # nnz: 3 -# sparseDim: 0 +# sparse_dim: 0 # indices shape: torch.Size([0, 3]) # values shape: torch.Size([3, 10, 0, 3]) ########## torch.int32 ########## @@ -249,15 +240,13 @@ tensor(indices=tensor([], size=(0, 6)), device='cuda:0', size=(10, 0, 3), nnz=6, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([], device='cuda:0', size=(0, 3), dtype=torch.int64, - grad_fn=) +tensor([], device='cuda:0', size=(0, 3), dtype=torch.int64) # _values -tensor([], device='cuda:0', size=(3, 10, 0, 3), dtype=torch.float32, - grad_fn=) +tensor([], device='cuda:0', size=(3, 10, 0, 3), dtype=torch.float32) # shape: torch.Size([10, 0, 3]) # nnz: 0 -# sparseDim: 0 +# sparse_dim: 0 # indices shape: torch.Size([0, 0]) # values shape: torch.Size([0, 10, 0, 3]) ########## torch.int32 ########## @@ -287,8 +276,6 @@ tensor(indices=tensor([], size=(0, 0)), device='cuda:0', size=(10, 0, 3), nnz=0, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([], device='cuda:0', size=(0, 0), dtype=torch.int64, - grad_fn=) +tensor([], device='cuda:0', size=(0, 0), dtype=torch.int64) # _values -tensor([], device='cuda:0', size=(0, 10, 0, 3), dtype=torch.float32, - grad_fn=) +tensor([], device='cuda:0', size=(0, 10, 0, 3), dtype=torch.float32) diff --git a/test/expect/TestCudaUncoalescedSparse.test_print.expect b/test/expect/TestCudaUncoalescedSparse.test_print.expect index e6207c6da3d278..e982b059933b98 100644 --- a/test/expect/TestCudaUncoalescedSparse.test_print.expect +++ b/test/expect/TestCudaUncoalescedSparse.test_print.expect @@ -1,6 +1,6 @@ # shape: torch.Size([]) # nnz: 2 -# sparseDim: 0 +# sparse_dim: 0 # indices shape: torch.Size([0, 2]) # values shape: torch.Size([2]) ########## torch.int32 ########## @@ -30,15 +30,13 @@ tensor(indices=tensor([], size=(0, 4)), device='cuda:0', size=(), nnz=4, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([], device='cuda:0', size=(0, 2), dtype=torch.int64, - grad_fn=) +tensor([], device='cuda:0', size=(0, 2), dtype=torch.int64) # _values -tensor([0., 1.], device='cuda:0', dtype=torch.float32, - grad_fn=) +tensor([0., 1.], device='cuda:0', dtype=torch.float32) # shape: torch.Size([0]) # nnz: 10 -# sparseDim: 0 +# sparse_dim: 0 # indices shape: torch.Size([0, 10]) # values shape: torch.Size([10, 0]) ########## torch.int32 ########## @@ -68,15 +66,13 @@ tensor(indices=tensor([], size=(0, 20)), device='cuda:0', size=(0,), nnz=20, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([], device='cuda:0', size=(0, 10), dtype=torch.int64, - grad_fn=) +tensor([], device='cuda:0', size=(0, 10), dtype=torch.int64) # _values -tensor([], device='cuda:0', size=(10, 0), 
dtype=torch.float32, - grad_fn=) +tensor([], device='cuda:0', size=(10, 0), dtype=torch.float32) # shape: torch.Size([2]) # nnz: 3 -# sparseDim: 0 +# sparse_dim: 0 # indices shape: torch.Size([0, 3]) # values shape: torch.Size([3, 2]) ########## torch.int32 ########## @@ -119,17 +115,15 @@ tensor(indices=tensor([], size=(0, 6)), device='cuda:0', size=(2,), nnz=6, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([], device='cuda:0', size=(0, 3), dtype=torch.int64, - grad_fn=) +tensor([], device='cuda:0', size=(0, 3), dtype=torch.int64) # _values tensor([[0.0000, 0.3333], [0.6667, 1.0000], - [1.3333, 1.6667]], device='cuda:0', dtype=torch.float32, - grad_fn=) + [1.3333, 1.6667]], device='cuda:0', dtype=torch.float32) # shape: torch.Size([100, 3]) # nnz: 3 -# sparseDim: 1 +# sparse_dim: 1 # indices shape: torch.Size([1, 3]) # values shape: torch.Size([3, 3]) ########## torch.int32 ########## @@ -172,16 +166,15 @@ tensor(indices=tensor([[0, 1, 0, 0, 1, 0]]), device='cuda:0', size=(100, 3), nnz=6, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([[0, 1, 0]], device='cuda:0', grad_fn=) +tensor([[0, 1, 0]], device='cuda:0') # _values tensor([[0.0000, 0.2222, 0.4444], [0.6667, 0.8889, 1.1111], - [1.3333, 1.5556, 1.7778]], device='cuda:0', dtype=torch.float32, - grad_fn=) + [1.3333, 1.5556, 1.7778]], device='cuda:0', dtype=torch.float32) # shape: torch.Size([100, 20, 3]) # nnz: 0 -# sparseDim: 2 +# sparse_dim: 2 # indices shape: torch.Size([2, 0]) # values shape: torch.Size([0, 3]) ########## torch.int32 ########## @@ -211,15 +204,13 @@ tensor(indices=tensor([], size=(2, 0)), device='cuda:0', size=(100, 20, 3), nnz=0, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([], device='cuda:0', size=(2, 0), dtype=torch.int64, - grad_fn=) +tensor([], device='cuda:0', size=(2, 0), dtype=torch.int64) # _values -tensor([], device='cuda:0', size=(0, 3), dtype=torch.float32, - grad_fn=) +tensor([], device='cuda:0', size=(0, 3), dtype=torch.float32) # shape: torch.Size([10, 0, 3]) # nnz: 3 -# sparseDim: 0 +# sparse_dim: 0 # indices shape: torch.Size([0, 3]) # values shape: torch.Size([3, 10, 0, 3]) ########## torch.int32 ########## @@ -249,15 +240,13 @@ tensor(indices=tensor([], size=(0, 6)), device='cuda:0', size=(10, 0, 3), nnz=6, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([], device='cuda:0', size=(0, 3), dtype=torch.int64, - grad_fn=) +tensor([], device='cuda:0', size=(0, 3), dtype=torch.int64) # _values -tensor([], device='cuda:0', size=(3, 10, 0, 3), dtype=torch.float32, - grad_fn=) +tensor([], device='cuda:0', size=(3, 10, 0, 3), dtype=torch.float32) # shape: torch.Size([10, 0, 3]) # nnz: 0 -# sparseDim: 0 +# sparse_dim: 0 # indices shape: torch.Size([0, 0]) # values shape: torch.Size([0, 10, 0, 3]) ########## torch.int32 ########## @@ -287,8 +276,6 @@ tensor(indices=tensor([], size=(0, 0)), device='cuda:0', size=(10, 0, 3), nnz=0, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([], device='cuda:0', size=(0, 0), dtype=torch.int64, - grad_fn=) +tensor([], device='cuda:0', size=(0, 0), dtype=torch.int64) # _values -tensor([], device='cuda:0', size=(0, 10, 0, 3), dtype=torch.float32, - grad_fn=) +tensor([], device='cuda:0', size=(0, 10, 0, 3), dtype=torch.float32) diff --git a/test/expect/TestSparse.test_print.expect b/test/expect/TestSparse.test_print.expect index c0223c1483741c..d0501228f9262a 100644 --- a/test/expect/TestSparse.test_print.expect +++ 
b/test/expect/TestSparse.test_print.expect @@ -1,6 +1,6 @@ # shape: torch.Size([]) # nnz: 2 -# sparseDim: 0 +# sparse_dim: 0 # indices shape: torch.Size([0, 2]) # values shape: torch.Size([2]) ########## torch.int32 ########## @@ -28,13 +28,13 @@ tensor(indices=tensor([], size=(0, 2)), size=(), nnz=2, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([], size=(0, 2), dtype=torch.int64, grad_fn=) +tensor([], size=(0, 2), dtype=torch.int64) # _values -tensor([0., 1.], dtype=torch.float32, grad_fn=) +tensor([0., 1.], dtype=torch.float32) # shape: torch.Size([0]) # nnz: 10 -# sparseDim: 0 +# sparse_dim: 0 # indices shape: torch.Size([0, 10]) # values shape: torch.Size([10, 0]) ########## torch.int32 ########## @@ -62,13 +62,13 @@ tensor(indices=tensor([], size=(0, 10)), size=(0,), nnz=10, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([], size=(0, 10), dtype=torch.int64, grad_fn=) +tensor([], size=(0, 10), dtype=torch.int64) # _values -tensor([], size=(10, 0), dtype=torch.float32, grad_fn=) +tensor([], size=(10, 0), dtype=torch.float32) # shape: torch.Size([2]) # nnz: 3 -# sparseDim: 0 +# sparse_dim: 0 # indices shape: torch.Size([0, 3]) # values shape: torch.Size([3, 2]) ########## torch.int32 ########## @@ -106,15 +106,15 @@ tensor(indices=tensor([], size=(0, 3)), size=(2,), nnz=3, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([], size=(0, 3), dtype=torch.int64, grad_fn=) +tensor([], size=(0, 3), dtype=torch.int64) # _values tensor([[0.0000, 0.3333], [0.6667, 1.0000], - [1.3333, 1.6667]], dtype=torch.float32, grad_fn=) + [1.3333, 1.6667]], dtype=torch.float32) # shape: torch.Size([100, 3]) # nnz: 3 -# sparseDim: 1 +# sparse_dim: 1 # indices shape: torch.Size([1, 3]) # values shape: torch.Size([3, 3]) ########## torch.int32 ########## @@ -152,16 +152,15 @@ tensor(indices=tensor([[0, 1, 2]]), size=(100, 3), nnz=3, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([[0, 1, 2]], grad_fn=) +tensor([[0, 1, 2]]) # _values tensor([[0.0000, 0.2222, 0.4444], [0.6667, 0.8889, 1.1111], - [1.3333, 1.5556, 1.7778]], dtype=torch.float32, - grad_fn=) + [1.3333, 1.5556, 1.7778]], dtype=torch.float32) # shape: torch.Size([100, 20, 3]) # nnz: 0 -# sparseDim: 2 +# sparse_dim: 2 # indices shape: torch.Size([2, 0]) # values shape: torch.Size([0, 3]) ########## torch.int32 ########## @@ -189,13 +188,13 @@ tensor(indices=tensor([], size=(2, 0)), size=(100, 20, 3), nnz=0, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([], size=(2, 0), dtype=torch.int64, grad_fn=) +tensor([], size=(2, 0), dtype=torch.int64) # _values -tensor([], size=(0, 3), dtype=torch.float32, grad_fn=) +tensor([], size=(0, 3), dtype=torch.float32) # shape: torch.Size([10, 0, 3]) # nnz: 3 -# sparseDim: 0 +# sparse_dim: 0 # indices shape: torch.Size([0, 3]) # values shape: torch.Size([3, 10, 0, 3]) ########## torch.int32 ########## @@ -223,13 +222,13 @@ tensor(indices=tensor([], size=(0, 3)), size=(10, 0, 3), nnz=3, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([], size=(0, 3), dtype=torch.int64, grad_fn=) +tensor([], size=(0, 3), dtype=torch.int64) # _values -tensor([], size=(3, 10, 0, 3), dtype=torch.float32, grad_fn=) +tensor([], size=(3, 10, 0, 3), dtype=torch.float32) # shape: torch.Size([10, 0, 3]) # nnz: 0 -# sparseDim: 0 +# sparse_dim: 0 # indices shape: torch.Size([0, 0]) # values shape: torch.Size([0, 10, 0, 3]) ########## torch.int32 ########## @@ -257,6 +256,6 @@ 
tensor(indices=tensor([], size=(0, 0)), size=(10, 0, 3), nnz=0, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([], size=(0, 0), dtype=torch.int64, grad_fn=) +tensor([], size=(0, 0), dtype=torch.int64) # _values -tensor([], size=(0, 10, 0, 3), dtype=torch.float32, grad_fn=) +tensor([], size=(0, 10, 0, 3), dtype=torch.float32) diff --git a/test/expect/TestUncoalescedSparse.test_print.expect b/test/expect/TestUncoalescedSparse.test_print.expect index 244442de0cc73d..7a2e3c68b9ed12 100644 --- a/test/expect/TestUncoalescedSparse.test_print.expect +++ b/test/expect/TestUncoalescedSparse.test_print.expect @@ -1,6 +1,6 @@ # shape: torch.Size([]) # nnz: 2 -# sparseDim: 0 +# sparse_dim: 0 # indices shape: torch.Size([0, 2]) # values shape: torch.Size([2]) ########## torch.int32 ########## @@ -28,13 +28,13 @@ tensor(indices=tensor([], size=(0, 2)), size=(), nnz=2, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([], size=(0, 2), dtype=torch.int64, grad_fn=) +tensor([], size=(0, 2), dtype=torch.int64) # _values -tensor([0., 1.], dtype=torch.float32, grad_fn=) +tensor([0., 1.], dtype=torch.float32) # shape: torch.Size([0]) # nnz: 10 -# sparseDim: 0 +# sparse_dim: 0 # indices shape: torch.Size([0, 10]) # values shape: torch.Size([10, 0]) ########## torch.int32 ########## @@ -62,13 +62,13 @@ tensor(indices=tensor([], size=(0, 10)), size=(0,), nnz=10, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([], size=(0, 10), dtype=torch.int64, grad_fn=) +tensor([], size=(0, 10), dtype=torch.int64) # _values -tensor([], size=(10, 0), dtype=torch.float32, grad_fn=) +tensor([], size=(10, 0), dtype=torch.float32) # shape: torch.Size([2]) # nnz: 3 -# sparseDim: 0 +# sparse_dim: 0 # indices shape: torch.Size([0, 3]) # values shape: torch.Size([3, 2]) ########## torch.int32 ########## @@ -106,15 +106,15 @@ tensor(indices=tensor([], size=(0, 3)), size=(2,), nnz=3, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([], size=(0, 3), dtype=torch.int64, grad_fn=) +tensor([], size=(0, 3), dtype=torch.int64) # _values tensor([[0.0000, 0.3333], [0.6667, 1.0000], - [1.3333, 1.6667]], dtype=torch.float32, grad_fn=) + [1.3333, 1.6667]], dtype=torch.float32) # shape: torch.Size([100, 3]) # nnz: 3 -# sparseDim: 1 +# sparse_dim: 1 # indices shape: torch.Size([1, 3]) # values shape: torch.Size([3, 3]) ########## torch.int32 ########## @@ -152,16 +152,15 @@ tensor(indices=tensor([[0, 1, 0]]), size=(100, 3), nnz=3, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([[0, 1, 0]], grad_fn=) +tensor([[0, 1, 0]]) # _values tensor([[0.0000, 0.2222, 0.4444], [0.6667, 0.8889, 1.1111], - [1.3333, 1.5556, 1.7778]], dtype=torch.float32, - grad_fn=) + [1.3333, 1.5556, 1.7778]], dtype=torch.float32) # shape: torch.Size([100, 20, 3]) # nnz: 0 -# sparseDim: 2 +# sparse_dim: 2 # indices shape: torch.Size([2, 0]) # values shape: torch.Size([0, 3]) ########## torch.int32 ########## @@ -189,13 +188,13 @@ tensor(indices=tensor([], size=(2, 0)), size=(100, 20, 3), nnz=0, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([], size=(2, 0), dtype=torch.int64, grad_fn=) +tensor([], size=(2, 0), dtype=torch.int64) # _values -tensor([], size=(0, 3), dtype=torch.float32, grad_fn=) +tensor([], size=(0, 3), dtype=torch.float32) # shape: torch.Size([10, 0, 3]) # nnz: 3 -# sparseDim: 0 +# sparse_dim: 0 # indices shape: torch.Size([0, 3]) # values shape: torch.Size([3, 10, 0, 3]) ########## torch.int32 ########## @@ 
-223,13 +222,13 @@ tensor(indices=tensor([], size=(0, 3)), size=(10, 0, 3), nnz=3, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([], size=(0, 3), dtype=torch.int64, grad_fn=) +tensor([], size=(0, 3), dtype=torch.int64) # _values -tensor([], size=(3, 10, 0, 3), dtype=torch.float32, grad_fn=) +tensor([], size=(3, 10, 0, 3), dtype=torch.float32) # shape: torch.Size([10, 0, 3]) # nnz: 0 -# sparseDim: 0 +# sparse_dim: 0 # indices shape: torch.Size([0, 0]) # values shape: torch.Size([0, 10, 0, 3]) ########## torch.int32 ########## @@ -257,6 +256,6 @@ tensor(indices=tensor([], size=(0, 0)), size=(10, 0, 3), nnz=0, dtype=torch.float32, layout=torch.sparse_coo, grad_fn=) # _indices -tensor([], size=(0, 0), dtype=torch.int64, grad_fn=) +tensor([], size=(0, 0), dtype=torch.int64) # _values -tensor([], size=(0, 10, 0, 3), dtype=torch.float32, grad_fn=) +tensor([], size=(0, 10, 0, 3), dtype=torch.float32) diff --git a/test/test_autograd.py b/test/test_autograd.py index b42ab95993a9ef..fa00280ef55715 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -553,14 +553,14 @@ def backward(self, grad_x): [0, 2, 2], ]) v1 = torch.DoubleTensor([[1, 2], [4, 5], [7, 8]]) - sparse_grad1 = Variable(torch.sparse.DoubleTensor(i1, v1, size)) + sparse_grad1 = torch.sparse.DoubleTensor(i1, v1, size) i2 = torch.LongTensor([ [0, 1, 3, 4], [0, 1, 2, 2], ]) v2 = torch.DoubleTensor([[1, 2], [4, 3], [4, 5], [7, 8]]) - sparse_grad2 = Variable(torch.sparse.DoubleTensor(i2, v2, size)) - dense_grad = Variable(torch.rand(size).double()) + sparse_grad2 = torch.sparse.DoubleTensor(i2, v2, size) + dense_grad = torch.rand(size).double() sparse_fn1 = FixedGradientFunction(sparse_grad1) sparse_fn2 = FixedGradientFunction(sparse_grad2) dense_fn = FixedGradientFunction(dense_grad) @@ -578,6 +578,47 @@ def backward(self, grad_x): (sparse_fn1(x) + sparse_fn2(x)).sum().backward() self.assertEqual(x.grad, sparse_grad1 + sparse_grad2) + @skipIfRocm + def test_sparse_ctor_getter_backward(self): + # See NOTE [ Sparse: autograd and API ] on the expected behavior of this test + def test(size, sparse_dim, nnz, device): + v_size = [nnz] + list(size[sparse_dim:]) + i = torch.rand(sparse_dim, nnz) + i.mul_(torch.tensor(size[:sparse_dim]).unsqueeze(1).to(i)) + i = i.to(torch.long) + + inp = torch.randn(v_size, requires_grad=True) + other = self.genSparseTensor(size, sparse_dim, nnz, is_uncoalesced=True)[0] + other = other.to(device) + + def fn(v): + x = torch.sparse_coo_tensor(i, v, size, device=device) + y = (x + other).coalesce() + yv = y.values() + new_v = yv.tanh() + z = torch.sparse_coo_tensor(y.indices(), new_v, y.size()) + return z.coalesce().values() + + gradcheck(fn, (inp,)) + # FIXME: make gradgradcheck work. 
+ # gradgradcheck(fn, (inp,)) + + # assert that _values is non-differentiable + with self.assertRaisesRegex(RuntimeError, "does not have a grad_fn"): + other.detach().requires_grad_()._values().backward(torch.ones_like(other._values())) + + devices = ['cpu'] + + if torch.cuda.is_available(): + devices.append('cuda') + + for empty_i, empty_v, empty_nnz in product([True, False], repeat=3): + sparse_size = [] if empty_i else [2, 1] + dense_size = [1, 0, 2] if empty_v else [1, 2] + nnz = 0 if empty_nnz else 5 + for device in devices: + test(sparse_size + dense_size, len(sparse_size), nnz, device) + def test_multi_backward(self): x = torch.randn(5, 5, requires_grad=True) y = torch.randn(5, 5, requires_grad=True) diff --git a/test/test_numba_integration.py b/test/test_numba_integration.py index 2894d5daaa54e9..ba14916a8fbc7c 100644 --- a/test/test_numba_integration.py +++ b/test/test_numba_integration.py @@ -67,14 +67,15 @@ def test_cuda_array_interface(self): # Sparse CPU/CUDA tensors do not implement the interface if tp not in (torch.HalfTensor,): - sparse_t = torch.sparse_coo_tensor(cput[None, :], cput) + indices_t = torch.empty(1, cput.size(0), dtype=torch.long).clamp_(min=0) + sparse_t = torch.sparse_coo_tensor(indices_t, cput) self.assertFalse(hasattr(sparse_t, "__cuda_array_interface__")) self.assertRaises( AttributeError, lambda: sparse_t.__cuda_array_interface__ ) - sparse_cuda_t = torch.sparse_coo_tensor(cput[None, :], cput).cuda() + sparse_cuda_t = torch.sparse_coo_tensor(indices_t, cput).cuda() self.assertFalse(hasattr(sparse_cuda_t, "__cuda_array_interface__")) self.assertRaises( diff --git a/test/test_sparse.py b/test/test_sparse.py index d32aabb145a65c..b08f3e79a2bb37 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -44,47 +44,16 @@ def setUp(self): self.SparseTensor = torch.sparse.DoubleTensor super(TestSparse, self).setUp() - def _gen_sparse(self, sparse_dims, nnz, with_size): - # TODO: Consider implementing this in the CUDA case by directly - # performing the operations on the GPU. You won't be able to - # use torch.rand/torch.randn in this case because they are - # CPU-only. If you do this, you can remove the is_cuda branch - # at the end. - # - # If you do this, be sure to update assert_uncoalesced too - + def _gen_sparse(self, sparse_dim, nnz, with_size): if isinstance(with_size, Number): - with_size = [with_size] * sparse_dims + with_size = [with_size] * sparse_dim + + x, i, v = self.genSparseTensor(with_size, sparse_dim, nnz, self.is_uncoalesced, self.device) if self.is_uncoalesced: - # We want to generate a tensor with a lot of uncoalesced - # entries to stress test whether or not we handle this - # (subtle) case correctly - v_size = [nnz * 2] + list(with_size[sparse_dims:]) - v = torch.randn(*v_size) - r = torch.rand(sparse_dims, nnz) - # Repeat the indexes, so every position shows up twice - i = torch.cat([r, r], dim=1) - if nnz > 0: - i *= torch.Tensor(with_size[:sparse_dims]).repeat(nnz * 2, 1).transpose(0, 1) - i = i.type(torch.LongTensor) - x = torch.sparse.DoubleTensor(i, v, torch.Size(with_size)) self.assert_uncoalesced(x) - else: - # Generate a sparse tensor with sparse_dims sparse dimensions; the - # rest the dimensions with_size[sparse_dims:] are dense. 
- v_size = [nnz] + list(with_size[sparse_dims:]) - v = torch.randn(*v_size) - i = torch.rand(sparse_dims, nnz) - if nnz > 0: - i *= torch.Tensor(with_size[:sparse_dims]).repeat(nnz, 1).transpose(0, 1) - i = i.type(torch.LongTensor) - x = torch.sparse.DoubleTensor(i, v, torch.Size(with_size)) - if self.is_cuda: - return x.cuda(), i.cuda(), v.cuda() - else: - return x, i.clone(), v.clone() + return x, i, v def assert_uncoalesced(self, x): """ @@ -109,7 +78,7 @@ def randn(self, *args, **kwargs): @skipIfRocm # ROCm stack doesn't like the x + x call def test_print(self): - shape_sparseDim_nnz = [ + shape_sparse_dim_nnz = [ ((), 0, 2), ((0,), 0, 10), ((2,), 0, 3), @@ -120,18 +89,18 @@ def test_print(self): ] printed = [] - for shape, sparseDim, nnz in shape_sparseDim_nnz: - indices_shape = torch.Size((sparseDim, nnz)) - values_shape = torch.Size((nnz,) + shape[sparseDim:]) + for shape, sparse_dim, nnz in shape_sparse_dim_nnz: + indices_shape = torch.Size((sparse_dim, nnz)) + values_shape = torch.Size((nnz,) + shape[sparse_dim:]) printed.append("# shape: {}".format(torch.Size(shape))) printed.append("# nnz: {}".format(nnz)) - printed.append("# sparseDim: {}".format(sparseDim)) + printed.append("# sparse_dim: {}".format(sparse_dim)) printed.append("# indices shape: {}".format(indices_shape)) printed.append("# values shape: {}".format(values_shape)) indices = torch.arange(indices_shape.numel(), dtype=self.IndexTensor.dtype, device=self.device).view(indices_shape) - for d in range(sparseDim): + for d in range(sparse_dim): indices[d].clamp_(max=(shape[d] - 1)) # make it valid index if self.is_uncoalesced and indices.numel() > 0: indices[:, -1] = indices[:, 0] # make it uncoalesced @@ -174,6 +143,16 @@ def test_shape(sparse_dims, nnz, with_size): self.assertEqual(self.safeCoalesce(x)._nnz(), nnz) self.assertEqual(list(x.size()), with_size) + # Test .indices() and .values() + if self.is_uncoalesced: + with self.assertRaisesRegex(RuntimeError, "Cannot get indices on an uncoalesced tensor"): + x.indices() + with self.assertRaisesRegex(RuntimeError, "Cannot get values on an uncoalesced tensor"): + x.values() + else: + self.assertEqual(x.indices(), x._indices()) + self.assertEqual(x.values(), x._values()) + test_shape(3, 10, 100) test_shape(3, 10, [100, 100, 100]) test_shape(3, 10, [100, 100, 100, 5, 5, 5, 0]) @@ -190,6 +169,16 @@ def test_shape(sparse_dims, nnz, with_size): self.assertEqual(x._indices().numel(), 0) self.assertEqual(x._values().numel(), 0) + @skipIfRocm + def test_coalecce(self): + for empty_i, empty_v, empty_nnz in itertools.product([True, False], repeat=3): + sparse_size = [] if empty_i else [2, 1] + dense_size = [1, 0, 2] if empty_v else [1, 2] + nnz = 0 if empty_nnz else 5 + + t, _, _ = self._gen_sparse(len(sparse_size), nnz, sparse_size + dense_size) + self.safeCoalesce(t) # this tests correctness + def test_ctor_size_checks(self): indices = self.IndexTensor([ [0, 0, 0], @@ -569,11 +558,11 @@ def test_cross_device(x1, x2): def test_cuda_empty(self): def test_tensor(x): y = x.cuda(0) - self.assertEqual(x._sparseDims(), y._sparseDims()) - self.assertEqual(x._denseDims(), y._denseDims()) + self.assertEqual(x.sparse_dim(), y.sparse_dim()) + self.assertEqual(x.dense_dim(), y.dense_dim()) x = y.cpu() - self.assertEqual(y._sparseDims(), x._sparseDims()) - self.assertEqual(y._denseDims(), x._denseDims()) + self.assertEqual(y.sparse_dim(), x.sparse_dim()) + self.assertEqual(y.dense_dim(), x.dense_dim()) x = torch.sparse.FloatTensor(2, 3, 4) test_tensor(x) @@ -609,8 +598,9 @@ def test_shape(di, 
dj, dk, nnz): x_coalesced = x.coalesce() self.assertTrue(x_coalesced.is_coalesced()) - x_coalesced_t = x.t() - self.assertFalse(x_coalesced_t.is_coalesced()) + x_coalesced_t = x_coalesced.t() + # Transpose is `colasced`-preserving if the indices tensor is empty. + self.assertEqual(x_coalesced_t.is_coalesced(), di * nnz == 0) res = torch.mm(x_coalesced_t, y) expected = torch.mm(self.safeToDense(x_coalesced_t), y) @@ -629,8 +619,8 @@ def test_in_place(x): self.assertEqual(torch.Size([shape_original[1], shape_original[0]]), x.size()) self.assertEqual(0, x._indices().numel()) self.assertEqual(0, x._values().numel()) - self.assertEqual(x._sparseDims(), 2) - self.assertEqual(x._denseDims(), 0) + self.assertEqual(x.sparse_dim(), 2) + self.assertEqual(x.dense_dim(), 0) def test_not_in_place(x): shape_original = x.shape @@ -638,8 +628,8 @@ def test_not_in_place(x): self.assertEqual(torch.Size([shape_original[1], shape_original[0]]), y.size()) self.assertEqual(0, y._indices().numel()) self.assertEqual(0, y._values().numel()) - self.assertEqual(x._sparseDims(), 2) - self.assertEqual(x._denseDims(), 0) + self.assertEqual(x.sparse_dim(), 2) + self.assertEqual(x.dense_dim(), 0) x = self.SparseTensor(2, 3) test_in_place(x) @@ -886,15 +876,21 @@ def _test_basic_ops_shape(self, nnz_x1, nnz_x2, shape_i, shape_v=None): expected = torch.zeros(x1.size()) self.assertEqual(self.safeToDense(y), expected) - self.assertFalse(x1.is_coalesced()) + self.assertEqual(x1.is_coalesced(), not self.is_uncoalesced) y = x1.coalesce() z = x1.coalesce() - self.assertFalse(x1.is_coalesced()) + self.assertEqual(x1.is_coalesced(), not self.is_uncoalesced) self.assertTrue(y.is_coalesced()) self.assertEqual(x1, y) - # check that coalesce is out of place y._values().add_(1) - self.assertEqual(z._values() + 1, y._values()) + if not x1.is_coalesced(): + # check that coalesce is out of place if the original tensor is not + # coalesced. + self.assertEqual(z._values() + 1, y._values()) + else: + # check that coalesce is in-place if the original tensor is + # coalesced. 
+ self.assertEqual(z._values(), y._values()) @skipIfRocm def test_basic_ops(self): @@ -1051,8 +1047,8 @@ def _test_zeros(self, nnzs, shape, out_shape_i, out_shape_v=None): self.assertEqual(tuple(out.size()), tuple(shape)) self.assertTrue(out._indices().numel() == out._values().numel() == 0) self.assertEqual(out._nnz(), 0) - self.assertEqual(out._sparseDims(), len(shape)) - self.assertEqual(out._denseDims(), 0) + self.assertEqual(out.sparse_dim(), len(shape)) + self.assertEqual(out.dense_dim(), 0) def test_zeros(self): def test_shape(i_shapes, v_shapes, shape, nnzs): @@ -1075,8 +1071,8 @@ def _test_zeros_like(self, nnzs, template_shape_i, template_shape_v=None): self.assertEqual(tuple(res.size()), tuple(template_shape)) self.assertTrue(res._indices().numel() == res._values().numel() == 0) self.assertEqual(res._nnz(), 0) - self.assertEqual(res._sparseDims(), len(template_shape_i)) - self.assertEqual(res._denseDims(), len(template_shape_v)) + self.assertEqual(res.sparse_dim(), len(template_shape_i)) + self.assertEqual(res.dense_dim(), len(template_shape_v)) def test_zeros_like(self): def test_shape(i_shapes, v_shapes, nnzs): @@ -1119,7 +1115,7 @@ def test_narrow(self): self.assertRaises(RuntimeError, lambda: with_dense.narrow_copy(10, 0, 3)) # dim > sparseDim + denseDim def _test_log1p_tensor(self, input, dense_tensor): - expected_output = torch.tensor(dense_tensor).log1p_() + expected_output = dense_tensor.log1p() self.assertEqual(expected_output, input.log1p().to_dense()) self.assertEqual(expected_output, input.coalesce().log1p_().to_dense()) @@ -1143,7 +1139,7 @@ def test_log1p(self): torch.FloatTensor([3, 4, 5]), torch.Size([3]), device=self.device) - self._test_log1p_tensor(input, [3., 4., 5.]) + self._test_log1p_tensor(input, torch.as_tensor([3., 4., 5.])) # test uncoalesced input input_uncoalesced = torch.sparse_coo_tensor( @@ -1151,7 +1147,7 @@ def test_log1p(self): torch.FloatTensor([2, 3, 4, 1, 1, 1]), torch.Size([3]), device=self.device) - self._test_log1p_tensor(input_uncoalesced, [3., 4., 5.]) + self._test_log1p_tensor(input_uncoalesced, torch.as_tensor([3., 4., 5.])) input = torch.sparse_coo_tensor( torch.zeros([2, 0]), @@ -1309,7 +1305,7 @@ def test_factory_size_check(self): [0, 2]]) values = self.ValueTensor([.5, .5]) sizes = torch.Size([2, 3]) - with self.assertRaisesRegex(RuntimeError, "sizes is inconsistent with indices"): + with self.assertRaisesRegex(RuntimeError, "size is inconsistent with indices"): torch.sparse_coo_tensor(indices, values, sizes) indices.fill_(-1) @@ -1320,14 +1316,14 @@ def test_factory_size_check(self): [0, 2]]) values = self.ValueTensor(2, 1, 0) sizes = torch.Size([2, 3, 1, 0]) - with self.assertRaisesRegex(RuntimeError, "sizes is inconsistent with indices"): + with self.assertRaisesRegex(RuntimeError, "size is inconsistent with indices"): torch.sparse_coo_tensor(indices, values, sizes) indices = self.IndexTensor([[1, 2], [0, 2]]) values = self.ValueTensor(2, 2, 2) sizes = torch.Size([0, 0, 2, 2]) - with self.assertRaisesRegex(RuntimeError, "sizes is inconsistent with indices"): + with self.assertRaisesRegex(RuntimeError, "size is inconsistent with indices"): torch.sparse_coo_tensor(indices, values, sizes) indices = self.IndexTensor([[1, 2], @@ -1370,13 +1366,13 @@ def test_factory_empty_indices(self): self.assertEqual(tensor._indices(), expected_indices) def test_factory_nnz(self): - indices = self.IndexTensor([[0]]) # (sparseDims, nnz): (1, 1) + indices = self.IndexTensor([[0]]) # (sparse_dim, nnz): (1, 1) values = self.ValueTensor([[1, 1], [1, 
1]]) # (nnz, ...): (2, 2) sizes = torch.Size([2, 2]) with self.assertRaisesRegex(RuntimeError, "indices and values must have same nnz"): torch.sparse_coo_tensor(indices, values, sizes) - indices = self.IndexTensor([[0]]) # (sparseDims, nnz): (1, 1) + indices = self.IndexTensor([[0]]) # (sparse_dim, nnz): (1, 1) values = self.ValueTensor(2, 0) # (nnz, ...): (2, 0) sizes = torch.Size([2, 0]) with self.assertRaisesRegex(RuntimeError, "indices and values must have same nnz"): @@ -1402,7 +1398,8 @@ def test_shape(i_shape, v_shape, size, expected_size): test_shape([3, 0], [0, 2, 4, 0], [0, 0, 0, 2, 4, 0], [0, 0, 0, 2, 4, 0]) test_shape([3, 0], [0, 2, 4, 0], [1, 2, 3, 2, 4, 0], [1, 2, 3, 2, 4, 0]) - def test_factory_dense_dims(self): + @skipIfRocm + def test_factory_dense_dim(self): indices = self.IndexTensor([[0]]) values = self.ValueTensor([[[1, 1, 1], [1, 1, 1]]]) sizes = torch.Size([1, 3, 4]) @@ -1563,7 +1560,7 @@ def do_test(t): y = t.new().resize_as_(t).zero_() self.assertEqual(y.shape, t.shape) # Check that y can be added to t. Currently, this requires that - # _sparseDims and _denseDims match. + # sparse_dim and dense_dim match. self.assertEqual(t, t + y) do_test(self.SparseTensor()) @@ -1585,8 +1582,8 @@ def _test_resize_shape(self, x_i, x_v, x_size, y_i, y_v, y_size): x.resize_as_(y) x_dense.resize_as_(y_dense) self.assertEqual(x.shape, y.shape) - self.assertEqual(x._sparseDims(), y._sparseDims()) - self.assertEqual(x._denseDims(), y._denseDims()) + self.assertEqual(x.sparse_dim(), y.sparse_dim()) + self.assertEqual(x.dense_dim(), y.dense_dim()) self.assertEqual(x.shape, x_dense.shape) self.assertEqual(y.shape, y_dense.shape) # Here we make sure that the original data are preserved after resizing diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index f0e87e85f3c7a6..e0d46634924278 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -8,6 +8,12 @@ # Note that a single gradient entry can specify the gradient # formula for multiple input names, by specifying a key # "input1, input2" (see atan2 for an example). +# - Optional entry with key 'output_differentiability' and value a list of the +# same length as the number of outputs from the forward function. The list +# should contain only booleans, specifying whether each of the output Tensor +# is differentiable. +# If None of the output is differentiable, you can also add the function +# name to `gen_variable_type.py`'s `DONT_REQUIRE_DERIVATIVE` list. # # If a function has out-of-place and in-place variants, then the derivative # definition for the in-place variant is optional. It will default to the @@ -23,13 +29,16 @@ # # When a function returns multiple *differentiable* outputs, # you can refer to the gradients of each outputs using 'grads', -# e.g., 'grads[0]', 'grads[1]' +# e.g., 'grads[0]', 'grads[1]'. # # When a function returns *one* differentiable output (the # first output) and some more nondifferentiable outputs, # you MUST refer to the gradient of the differentiable output with # 'grad' (this case is special-cased in our code generation). # +# Note that the number of differentibale outputs can be modified by the +# 'output_differentiability' entry (see above). +# # - Any of the input arguments, tensor or non-tensor, including # argument names that only appear in Declarations.cwrap, e.g. 'output'. 
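To make the intent of the sparse entries added below concrete: `coalesce().values()` is meant to be differentiable back to the tensor the sparse tensor was built from, while the raw `_values()` accessor stays out of autograd's reach. A small, hedged sketch of that behaviour (not part of the patch; exact error text may differ):

    import torch

    i = torch.tensor([[0, 1], [1, 0]])
    v = torch.tensor([1., 2.], requires_grad=True)
    x = torch.sparse_coo_tensor(i, v, (2, 2))

    # Differentiable chain: ctor -> coalesce -> values
    x.coalesce().values().sum().backward()
    print(v.grad)   # tensor([1., 1.])

    # Non-differentiable accessor: no grad_fn is recorded, so this would raise
    # x.coalesce()._values().sum().backward()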
# @@ -189,6 +198,9 @@ - name: clone(Tensor self) self: grad +- name: coalesce(Tensor self) + self: grad + - name: cos(Tensor self) self: grad * -self.sin() @@ -321,6 +333,12 @@ self: gesv_backward_self(grad, self, A) A: gesv_backward_A(grad, self, A, result0) +- name: indices(Tensor self) + output_differentiability: [False] + +- name: _indices(Tensor self) + output_differentiability: [False] + - name: grid_sampler_2d(Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) input, grid: grid_sampler_2d_backward(grad, input, grid, interpolation_mode, padding_mode) @@ -785,12 +803,23 @@ self: not_implemented("sparse_mask") mask: not_implemented("sparse_mask") +- name: _sparse_coo_tensor_with_dims_and_tensors(int64_t sparse_dim, int64_t dense_dim, IntList size, Tensor indices, Tensor values, TensorOptions options) + values: sparse_constructor_values_backward(grad, indices, values.sizes()) + - name: _standard_gamma(Tensor self, Generator generator) self: grad * _standard_gamma_grad(self, result) - name: _standard_gamma_grad(Tensor self, Tensor output) self: not_implemented("_standard_gamma_grad") +- name: values(Tensor self) + self: at::_sparse_coo_tensor_unsafe(self.indices(), grad, self.sizes())._coalesced_(true); + +# Why is _values() not differentiable? +# See NOTE [ Sparse: autograd and API ] +- name: _values(Tensor self) + output_differentiability: [False] + # NN - name: _trilinear(Tensor i1, Tensor i2, Tensor i3, IntList expand1, IntList expand2, IntList expand3, IntList sumdim, int64_t unroll_dim) i1, i2, i3: _trilinear_backward(grad, i1, i2, i3, expand1, expand2, expand3, sumdim, unroll_dim, grad_input_mask) @@ -1234,7 +1263,11 @@ save_var: not_implemented("cudnn_batch_norm_backward save_var") input, weight, grad_output: batchnorm_double_backward(input, weight, grads[0], grads[1], grads[2], grad_output, running_mean, running_var, true, epsilon, save_mean, save_var, grad_input_mask) + +# Only frst three of _cudnn_rnn outputs can have gradients. +# _cudnn_rnn outputs: (output, hy, cy, reserve, weight_buf) - name: _cudnn_rnn(Tensor input, TensorList weight, int64_t weight_stride0, Tensor weight_buf, Tensor hx, Tensor cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, Tensor dropout_state) + output_differentiability: [True, True, True, False, False] input, hx, cx, weight: "_cudnn_rnn_backward(input, weight, weight_stride0, result4, hx, cx, result0, grads[0], grads[1], grads[2], mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, retain_variables ? result3.clone() : result3, grad_input_mask)" # miopen @@ -1274,7 +1307,11 @@ tensors: unbind(grad, dim) # fused RNN kernels + +# Only frst two of _thnn_fused_lstm_cell outputs can have gradients. 
+# _thnn_fused_lstm_cell outputs: (hy, cy, workspace) - name: _thnn_fused_lstm_cell(Tensor input_gates, Tensor hidden_gates, Tensor cx, Tensor input_bias, Tensor hidden_bias) + output_differentiability: [True, True, False] input_gates, hidden_gates, cx, input_bias, hidden_bias: _thnn_fused_lstm_cell_backward(grads[0], grads[1], cx, result1, result2, input_bias.defined()) - name: _thnn_fused_gru_cell(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor input_bias, Tensor hidden_bias) diff --git a/tools/autograd/gen_autograd.py b/tools/autograd/gen_autograd.py index af62f0ad489036..e09bc810c3f8ad 100644 --- a/tools/autograd/gen_autograd.py +++ b/tools/autograd/gen_autograd.py @@ -14,29 +14,35 @@ from collections import defaultdict from .utils import YamlLoader, split_name_params +# See NOTE [ Autograd View Variables ] in variable.h for details. +# A map: function name => two options: +# 1. name of the argument that all outputs are view of +# 2. map: output idx => name of the argument that this result is view of VIEW_FUNCTIONS = { - 'alias', 'as_strided', 'diagonal', 'expand', 'narrow', 'permute', 'select', 'slice', - 'squeeze', 't', 'transpose', 'unfold', 'unsqueeze', 'view', 'unbind', -} - -# In principle this should live in derivatives.yaml, but I could not -# think of a good syntax for it -HARDCODED_DIFFERENTIABLE_OUTPUTS = { - # Suppose that 'foo' is a function for which outputs 0 and 1 are - # differentiable, and 2 is not. Then you would write: - # 'foo': (0, 1), - '_cudnn_rnn': (0, 1, 2), - # _cudnn_rnn outputs: - # 0 => output - # 1 => hy - # 2 => cy - # 3 => reserve - # 4 => weight_buf - '_thnn_fused_lstm_cell': (0, 1), - # _thnn_fused_lstm_cell outputs: - # 0 => hy - # 1 => cy - # 2 => workspace + 'alias': 'self', + 'as_strided': 'self', + 'diagonal': 'self', + 'expand': 'self', + 'narrow': 'self', + 'permute': 'self', + 'select': 'self', + 'slice': 'self', + 'squeeze': 'self', + 't': 'self', + 'transpose': 'self', + 'unfold': 'self', + 'unsqueeze': 'self', + 'view': 'self', + 'unbind': 'self', + '_indices': 'self', + '_values': 'self', + 'indices': 'self', + 'values': 'self', + # sparse_coo ctor output should really be views of both indices and values, + # but we only supports making as view of a single varible, and indices is + # discrete anyways. + # FIXME: clone indices on construction. 
+ 'sparse_coo_tensor_with_dims_and_tensors': 'values', } diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index b1746e47bd5113..5376d1b6384907 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -19,8 +19,7 @@ SKIP_PYTHON_BINDINGS = [ 'alias', 'contiguous', 'is_cuda', 'is_sparse', 'size', 'stride', '.*_backward', '.*_backward_(out|input|weight|bias)', '.*_forward', - '.*_forward_out', '_unsafe_view', 'tensor', - 'sparse_coo_tensor', 'th_sparse_coo_tensor', 'native_sparse_coo_tensor', + '.*_forward_out', '_unsafe_view', 'tensor', '_?sparse_coo_tensor.*', '_arange.*', '_range.*', '_linspace.*', '_logspace.*', '_sparse_add.*', '_sparse_div.*', '_sparse_mul.*', '_sparse_sub.*', 'index', diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 26a2a9539efbc2..16af5c65d04a95 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -26,7 +26,7 @@ import os import sys from .utils import CodeTemplate, nested_dict, write, uninplace_api_name -from .gen_autograd import VIEW_FUNCTIONS, HARDCODED_DIFFERENTIABLE_OUTPUTS +from .gen_autograd import VIEW_FUNCTIONS from .gen_autograd_functions import uses_single_grad @@ -43,7 +43,9 @@ DONT_RECORD_TRACE = { 'convolution', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', 'conv_transpose2d', 'conv_transpose3d', 'lstm_cell', 'gru_cell', - 'rnn_tanh_cell', 'rnn_relu_cell', 'linear' + 'rnn_tanh_cell', 'rnn_relu_cell', 'linear', + # FIXME: figure out a better way when we support sparse tensors in jit + '_coalesced_', } # These functions have their names recorded under trace renamed, @@ -71,13 +73,13 @@ # tensors that have requires_grad=False. In-place functions listed here will # not examine or modify requires_grad or grad_fn. DONT_REQUIRE_DERIVATIVE = { - # These only depend on the input Tensor's shape and device, not the data + # These only depend on the input Tensor's shape and device, not the data 'ones_like', 'zeros_like', 'rand_like', 'randn_like', - # Tensor constructors - 'sparse_coo_tensor', 'th_sparse_coo_tensor', 'native_sparse_coo_tensor', # These are only implemented on integral types '__and__', '__iand__', '__ilshift__', '__ior__', '__irshift__', '__ixor__', '__lshift__', '__or__', '__rshift__', '__xor__', + # This is an unsafe method that is meant to be out of reach of autograd. + '_coalesced_', } METHOD_DECLARATION = CodeTemplate("""\ @@ -180,7 +182,7 @@ def should_trace(declaration): return False name = declaration['name'] base_name = name[:-1] if declaration['inplace'] else name[:-4] if name.endswith('_out') else name - if base_name in DONT_RECORD_TRACE: + if base_name in DONT_RECORD_TRACE or name in DONT_RECORD_TRACE: return False # We need to disable these because their inner implementations implement # broadcasting, and if we trace them top level we will lose the expand nodes. 
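The `should_trace` tweak above is presumably needed because `base_name` strips the trailing underscore from in-place ops, so `_coalesced_` would become `_coalesced` and miss the new `DONT_RECORD_TRACE` entry; checking the raw `name` as well keeps it excluded. A tiny standalone sketch of that lookup (hypothetical helper, not the codegen itself):

    def should_skip_trace(name, inplace, dont_record=frozenset({'_coalesced_'})):
        # mirrors the base-name handling in gen_variable_type.should_trace
        base_name = name[:-1] if inplace else name[:-4] if name.endswith('_out') else name
        return base_name in dont_record or name in dont_record

    print(should_skip_trace('_coalesced_', inplace=True))  # True, only via the raw-name check
    print(should_skip_trace('add_', inplace=True))         # False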
@@ -313,7 +315,7 @@ def emit_body(declaration): returns_void = len(returns) == 1 and returns[0]['type'] == 'void' base_name = name[:-1] if inplace else name[:-4] if is_out_fn else name - is_view = base_name in VIEW_FUNCTIONS + view_info = VIEW_FUNCTIONS.get(base_name, None) # These exclude things like BoolTensor, int64_t, and Scalar def is_differentiable(arg): @@ -329,18 +331,19 @@ def is_differentiable(arg): differentiable_inputs = list(filter(is_differentiable, inputs)) candidate_differentiable_outputs = list(filter(is_differentiable, returns)) - hardcoded_diff = HARDCODED_DIFFERENTIABLE_OUTPUTS.get(name) - if hardcoded_diff: + if func is not None and func.get('output_differentiability') is not None: differentiable_outputs = [] - for i in hardcoded_diff: - differentiable_outputs.append(candidate_differentiable_outputs[i]) + output_differentiability = func.get('output_differentiability') + for differentiable, output in zip(output_differentiability, returns): + if differentiable: + differentiable_outputs.append(output) elif uses_single_grad(func): differentiable_outputs = candidate_differentiable_outputs[:1] else: differentiable_outputs = candidate_differentiable_outputs requires_derivative = ( - base_name not in DONT_REQUIRE_DERIVATIVE and + base_name not in DONT_REQUIRE_DERIVATIVE and name not in DONT_REQUIRE_DERIVATIVE and len(differentiable_inputs) > 0 and len(differentiable_outputs) > 0 and strategy == 'use_derived') @@ -455,24 +458,77 @@ def declare_returned_variables(): return '\n'.join(names) def wrap_output(call): + # Returns a 2-tuple `(wrapped_call, extra_wrapping_stmts)`, where + # `wrapped_call` is to drop-in replace `call`, and + # `extra_wrapping_stmts` is a list of extra statements to run after + # `call`. if 'Tensor' not in declaration['return_type']: - return call - elif is_view: - return 'as_view(self, {})'.format(call) + return call, [] + elif view_info is not None: + # See NOTE [ Autograd View Variables ] in variable.h for details. + differentiable_output_vars = {r['name'] for r in differentiable_outputs} + tensor_output_vars = {r['name'] for r in returns if 'Tensor' in r['type']} + if not isinstance(view_info, dict): + if len(differentiable_output_vars) == len(tensor_output_vars): + # all outputs are differentiable + return 'as_view({}, {}, true)'.format(view_info, call), [] + elif len(differentiable_output_vars) == 0: + # no output is differentiable + return 'as_view({}, {}, false)'.format(view_info, call), [] + else: + # some of the outputs are differentiable + # need to expand to dict mode, i.e., one entry per output + base_name = view_info + view_info_dict = {} + for i, return_info in enumerate(returns): + if 'Tensor' in return_info['type']: + view_info_dict[i] = base_name + else: + view_info_dict = view_info + + def wrap_view_single(output_var, base_var): + fmt = '{output_var} = as_view({base_var}, {output_var}, {is_differentiable});' + if output_var in differentiable_output_vars: + # If `GradMode::is_enabled()` is False, this is a + # non-differentiable view. Gradients should not flow through. + is_differentiable = 'true' + else: + # This output is non-differentiable, so it is a + # non-differentiable view. Gradients should not flow through. 
+ is_differentiable = 'false' + return fmt.format(output_var=output_var, base_var=base_var, + is_differentiable=is_differentiable) + + extra_wrapping_stmts = [] + for output_idx, return_info in enumerate(returns): + if 'Tensor' not in return_info['type']: + assert output_idx not in view_info_dict, 'Can not wrap non-Tensor output as a view' + continue + output_var = return_info['name'] + if output_idx in view_info_dict: + stmt = wrap_view_single(output_var, view_info_dict[output_idx]) + elif 'Tensor' in return_info['type']: + stmt = '{output_var} = as_variable({output_var});'.format(output_var=output_var) + extra_wrapping_stmts.append(stmt) + return call, extra_wrapping_stmts else: - return 'as_variable({})'.format(call) + return 'as_variable({})'.format(call), [] def emit_call(env): combined = nested_dict(env, declaration) + extra_wrapping_stmts = [] if strategy == 'use_derived': call = CALL_VIA_DERIVED.substitute(combined) if not modifies_arguments: - call = wrap_output(call) + call, extra_wrapping_stmts = wrap_output(call) else: call = CALL_VIA_TYPE.substitute(declaration) if not modifies_arguments and not returns_void: call = '{} = {}'.format(tie_return_values(), call) - return call + ';' + call = call + ';' + for stmt in extra_wrapping_stmts: + call += '\n' + stmt + return call def tie_return_values(): if len(declaration['returns']) == 1: @@ -497,7 +553,7 @@ def get_return_value(): return 'std::make_tuple({})'.format(', '.join(moved)) def emit_history(): - fn = 'rebase' if modifies_arguments and not is_view else 'set' + fn = 'rebase' if modifies_arguments and view_info is None else 'set' output_names = [r['name'] for r in differentiable_outputs] # TODO: flatten allocates a std::vector, which could be expensive outs = CodeTemplate("flatten_tensor_args( ${outs} )").substitute(outs=output_names) @@ -612,7 +668,8 @@ def dispatch_strategy(declaration): get dispatched back to VariableType (which will ensure that they are differentiable.) """ - if (declaration['abstract'] or declaration['derivative'] is not None): + if (declaration['abstract'] or declaration['requires_tensor'] or + declaration['derivative'] is not None): # If the function is abstract (not implemented on at::Type), we must # call the implementation on the derived type with unpacked tensors. @@ -631,6 +688,6 @@ def dispatch_strategy(declaration): else: # If the function is concrete (we don't have to override it) and we # didn't declare it in derivatives.yaml, we'll assume that it is - # actually implemented out of differentiable functions. (This + # actually implemented out of differentiable functions. (This # assumption might not hold, but then you'll see gradcheck fail.) return 'use_type' diff --git a/tools/autograd/load_derivatives.py b/tools/autograd/load_derivatives.py index ef4ef8d9acbb58..c2ede2d9d7b6d4 100644 --- a/tools/autograd/load_derivatives.py +++ b/tools/autograd/load_derivatives.py @@ -9,7 +9,6 @@ import yaml from .utils import YamlLoader from .utils import IDENT_REGEX, split_name_params -from .gen_autograd import HARDCODED_DIFFERENTIABLE_OUTPUTS def load_derivatives(path, declarations): @@ -30,7 +29,8 @@ def load_derivatives(path, declarations): # How do you feel about pasting declaration inside autograd function... 
-def create_autograd_function(name, derivatives, args_with_gradients, signature, declaration): +def create_autograd_function(name, derivatives, args_with_gradients, signature, + declaration, output_differentiability): op = to_camel_case(name) + 'Backward' op = op.replace('ForwardBackward', 'Backward') return { @@ -42,6 +42,7 @@ def create_autograd_function(name, derivatives, args_with_gradients, signature, 'derivatives': derivatives, 'saved_inputs': all_saved_variables(derivatives, 'saved_inputs'), 'saved_outputs': all_saved_variables(derivatives, 'saved_outputs'), + 'output_differentiability': output_differentiability, } @@ -128,36 +129,6 @@ def check_grad_usage(defn_name, declaration, derivatives): "otherwise, there is a likely error in your derivatives " "declaration.".format(defn_name)) - hardcoded_diff = HARDCODED_DIFFERENTIABLE_OUTPUTS.get(defn_name) - if hardcoded_diff: - if used_grad: - raise RuntimeError("Derivative definition {} has hard-coded differentiable " - "outputs in gen_autograd.py, but used grad (which implies " - "only the first output is differentiable) in its " - "derivative declaration. You likely meant to write " - "grads[i] for some i instead.".format(defn_name)) - if only_used_grads_indices and set(used_grads_indices) != set(hardcoded_diff): - raise RuntimeError("Derivative definition {} has hard-coded differentiable " - "outputs {}, but the used grads in the derivative " - "definitions are only {}. Either your derivatives " - "declaration is wrong, or the value of " - "HARDCODED_DIFFERENTIABLE_OUTPUTS in gen_autograd.py " - "is wrong.".format(defn_name, hardcoded_diff, - used_grads_indices)) - else: - if fully_implemented and not used_grad and \ - used_grads and only_used_grads_indices and \ - set(used_grads_indices) != set(range(len(declaration['returns']))): - raise RuntimeError("Derivative definition of {} in derivatives.yaml does " - "not refer to the gradients of all of its outputs. Either " - "the derivatives declaration is wrong, OR you have some " - "non-differentiable outputs. If you have a single " - "differentiable output, make it the first output in ATen " - "and reference its gradient with 'grad'; otherwise, hard " - "code the list of differentiable outputs in " - "HARDCODED_DIFFERENTIABLE_OUTPUTS in gen_autograd.py." - .format(defn_name)) - def set_up_derivatives(defn_name, defn, declaration): # Determine the set of inputs which have gradients args_with_gradients_set = set() @@ -188,6 +159,9 @@ def unzip(xs): # NB: Removes 'name' from defn dictionary defn_name, params = split_name_params(defn.pop('name')) + # NB: Removes 'output_differentiability' from defn dictionary + # `None` means all differentiable. 
+ output_differentiability = defn.pop('output_differentiability', None) param_types, param_names = unzip([p.split(' ') for p in params if p != '*']) if 'grad_input_mask' in param_names: raise RuntimeError("Signature for {} has an argument named grad_input_mask, " @@ -221,7 +195,8 @@ def unzip(xs): .format(i, defn_name, x, y)) derivatives, args_with_gradients = set_up_derivatives(defn_name, defn, canonical) - return create_autograd_function(defn_name, derivatives, args_with_gradients, signature, canonical) + return create_autograd_function(defn_name, derivatives, args_with_gradients, + signature, canonical, output_differentiability) def ensure_unique_names(autograd_functions): diff --git a/tools/autograd/templates/Functions.cpp b/tools/autograd/templates/Functions.cpp index 68197d7d825151..2752a156b35ecd 100644 --- a/tools/autograd/templates/Functions.cpp +++ b/tools/autograd/templates/Functions.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -874,15 +875,15 @@ Tensor softplus_double_backward(const Tensor & grad, const Tensor & input, Scala } -// NOTE [ as_strided Backward ] +// NOTE [ as_strided Backward and layout-aware/agnostic autograd ] // // `storage_offset` is ignored for simplicity in this note. If you just want the // full algorithm without explanation, scroll down to bottom of this note. // // Implementing the backward of as_strided is tricky because you have to deal -// with mappings that maps one memory location to multiple indices, i.e., the -// output tensor indices pointing to **overlapping** memory addresses. This can -// happen in all in all sorts of weird cases. For example, +// with mappings that map one memory location to multiple indices, i.e., the +// output tensor has multiple indices pointing to **overlapping** memory +// addresses. This can happen in all in all sorts of weird cases. For example, // // x = torch.randn(15) // x.as_strided([3, 3], [1, 0]) # "expand" case @@ -891,7 +892,7 @@ Tensor softplus_double_backward(const Tensor & grad, const Tensor & input, Scala // # res[0, 1] points to 0*3 + 1*6 = 6 // // Here is the general strategy we apply in implementing as_strided backward: -// 0. ??? (optimizaiont step. we will talk about this later) +// 0. ??? (optimization step. we will talk about this later) // 1. Create some underlying flattened tensor as if it is the base tensor // representing the contiguous memory storage for both input and output. // 2. Use the output geometry to scatter (or index_add) the gradients into @@ -902,7 +903,7 @@ Tensor softplus_double_backward(const Tensor & grad, const Tensor & input, Scala // // In step (2), if the output tensor does't have overlapping memory, we can // safely scatter (`storage.as_strided(output_geometry).copy_(grad)`); -// otherwise, we must use `index_add` as gradient at different indices may need +// otherwise, we must use `index_add` as gradients at different indices may need // to be summed to a single location. 
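The note's concrete overlap example continues below; as a quick runnable companion to the scatter-vs-index_add point just made, here is a sketch (sizes and strides are arbitrary) that reproduces the accumulation by hand and compares it with autograd:

    # Sketch: index_add-style accumulation for an overlapping as_strided output.
    import torch

    x = torch.randn(3, requires_grad=True)
    y = x.as_strided([3, 2], [1, 0])          # column stride 0 -> overlapping reads
    g = torch.ones_like(y)                    # incoming gradient

    storage_grad = torch.zeros(3)             # flat "storage" gradient
    idx = torch.arange(3).unsqueeze(1).expand(3, 2)  # storage index of each y element
    storage_grad.index_add_(0, idx.reshape(-1), g.reshape(-1))  # accumulate overlaps

    y.sum().backward()
    print(storage_grad)                       # tensor([2., 2., 2.])
    print(x.grad)                             # matches: each x[i] is read twice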
// // For example, in this case: @@ -933,10 +934,10 @@ Tensor softplus_double_backward(const Tensor & grad, const Tensor & input, Scala // x = t.expand(3, 3) # input with overlapping memory // # size [3, 3] // # stride [0, 1] -// y = x.as_strided([3], [1]) # contiguous output -// # size [3] +// y = x.as_strided([1], [1]) # contiguous output +// # size [1] // # stride [1] -// y.backward() # step (1): contiguous storagte tensor `s` of size 3, which +// y.backward() # step (1): contiguous storage tensor `s` of size 3, which // is large enough to be used as underlying storage // for `x` and `y`. // s = [ 0, 0, 0] @@ -969,17 +970,17 @@ Tensor softplus_double_backward(const Tensor & grad, const Tensor & input, Scala // In the above computation of numerical gradients, they only match the // analytical results because strides and memory locations are considered in the // forward pass, i.e., this op (including both forward and backward) is -// stride-aware. +// layout-aware. // -// However, most (probably all) other ops (forward and backward) are -// stride-agnostic. E.g., +// However, in PyTorch, most (probably all) other ops (forward and backward) are +// layout-agnostic. E.g., // // t = torch.randn(1) // x = t.expand(2) // y = x.sum() // y.backward() // -// Stride-agnostic autograd (as it is currently in PyTorch) will give you +// Layout-agnostic autograd (as it is currently in PyTorch) will give you // // gy = 1 // gx = [ 1, 1] # SumBackward: torch.ones_like(x) @@ -989,28 +990,30 @@ Tensor softplus_double_backward(const Tensor & grad, const Tensor & input, Scala // (the other will also change by `delta`), `y` will change by `2 * delta`. So // the gradients, if strides are taken into consideration, should be 2. // -// Stride-aware autograd should give you +// Layout-aware autograd should give you // // gy = 1 // gx = [ 2, 2] # Because the backward considers the fact that the input `x` // # is already expanded. -// gt = [ 2] # Stride-aware backward of expand is just a slicing because +// gt = [ 2] # Layout-aware backward of expand is just a slicing because // # the previous backward should have already taken care of // # strides and made sure that gradients are the same along the // # expanded dimension. // // As shown above, these two types are not compatible. Therefore, we must either -// make as_strided stride-agnostic, or make all other ops stride-aware. +// make as_strided layout-agnostic, or make all other ops layout-aware. // -// It is unrealisitc to support stride-aware autograd (at least in the current -// structure), because it would mean +// It is difficult to support layout-aware autograd (at least in the current +// codebase structure), because it would mean // 1. storing tensor geometries of every input tensor for backward // 2. depending on input geometry, the gradient computed from backward change // 3. ideally enforcing gradient of T to always have same strides as T // (although these two methods only differ when it comes to overlapping memory) // -// To formulate `as_strided(input, size, stride)` in a stride-agnostic way, we -// consider `input.stride()` as a separate independent arguement `input_stride`: +// Therefore, we must formulate `as_strided` in a layout-agnostic way, i.e., +// giving the same output regardless of the input layout. We consider +// `input.stride()` as a separate independent fixed argument `input_stride`. +// Then, `as_strided(input, size, stride)` can be thought of as: // 1. 
"Scatter" each value of `input` into a "storage" using storage location // computed from the value's index in `input`, `input.size()` and // `input_stride`, but if N values end up in the same location, the value @@ -1020,7 +1023,9 @@ Tensor softplus_double_backward(const Tensor & grad, const Tensor & input, Scala // Denote the set of all input indices that pointing to the same storage // location `storage[n]` as `S(n)`, i.e., // -// S(n) = { index : index @ input_stride == n, index is valid given input.size() } +// S(n) = { index : == n, index is valid given input.size() }, +// +// where `` is the dot product between `x` and `y`. // // Then, the process is: // @@ -1028,7 +1033,9 @@ Tensor softplus_double_backward(const Tensor & grad, const Tensor & input, Scala // // Note that all values in `S(n)` are the same (they point to the same // memory location anyways, so this step doesn't change anything, but -// effectively avoids using `input.stride()`. +// effectively avoids having the denpendency on the layout of `input`. +// I.e., the result holds fixed regardless of the layout of `input`, as +// long as `input_stride` is fixed. // // NOTE: for forward pass, we can equivalently simply selet any one of // `S(n)` as `storage[n]`. However, cosnidering this as an average @@ -1038,7 +1045,7 @@ Tensor softplus_double_backward(const Tensor & grad, const Tensor & input, Scala // 2. As usual, return the as_strided view of `storage` using required output // `size` and `stride`. // -// To backward through this stride-agnostic version, we simply add the following +// To backward through this layout-agnostic version, we simply add the following // step: // .... (scatter gradients into the storage tensor using output geometry) // 3. For all storage location n, `storage[n] /= |S(n)|`. @@ -1311,13 +1318,13 @@ static inline int64_t _min_storage_size(IntList sizes, IntList strides, int64_t return storage_size; } -// See NOTE [ as_strided Backward ] for explanation +// See NOTE [ as_strided Backward and layout-aware/agnostic autograd ] for explanation Tensor as_strided_backward(Tensor grad, TensorGeometry input_geometry, IntList sizes, IntList strides, int64_t storage_offset) { // For output geometry, // check for size 0 dimensions, // skip size 1 dimensions, // reduce grad on expanded dims (stride=0, size>1) - // Step (0) for the algorithm in NOTE [ as_strided Backward ] + // Step (0) for the algorithm in NOTE [ as_strided Backward and layout-aware/agnostic autograd ] // Step (0)~(1) for the algorithm in NOTE [ Detecting Memory Overlap Within A Strided Tensor ] // on output geometry auto odim = grad.dim(); @@ -1368,7 +1375,7 @@ Tensor as_strided_backward(Tensor grad, TensorGeometry input_geometry, IntList s // Rest of this function implements - // Step (1)~(4) for the algorithm in NOTE [ as_strided Backward ] + // Step (1)~(4) for the algorithm in NOTE [ as_strided Backward and layout-aware/agnostic autograd ] // TODO: Raise if not all output values are visible in input geometry. // Technically speaking, if you treat those values as constants, not // raising is fine, and mathematically correct. 
However, these values @@ -1964,8 +1971,8 @@ std::tuple<Tensor, Tensor, Tensor> batchnorm_double_backward( } std::tuple<Tensor, Tensor, Tensor> _trilinear_backward(const Tensor& grad_out, const Tensor& i1, const Tensor& i2, const Tensor& i3, - IntList expand1, IntList expand2, IntList expand3, - IntList sumdim, int64_t unroll_dim, std::array<bool, 3> grad_mask) { + IntList expand1, IntList expand2, IntList expand3, + IntList sumdim, int64_t unroll_dim, std::array<bool, 3> grad_mask) { Tensor grad_i1, grad_i2, grad_i3; if (grad_mask[0]) grad_i1 = at::_trilinear(grad_out, i2, i3, sumdim, expand2, expand3, expand1); @@ -1987,6 +1994,17 @@ Tensor log1p_backward(const Tensor& grad, const Tensor& self) { return grad / (self + 1); } +Tensor sparse_constructor_values_backward(const Tensor& sparse_grad_out, const Tensor& indices, IntList values_shape) { + // TODO: improve this backward by writing a kernel (maybe) + auto dense_grad = sparse_grad_out.is_sparse() ? sparse_grad_out.to_dense() : sparse_grad_out; + auto full_size = sparse_grad_out.sizes(); + auto flattened_grad_shape = values_shape.vec(); + flattened_grad_shape[0] = at::prod_intlist(full_size.slice(0, indices.size(0))); + auto flattened_dense_grad = dense_grad.view(flattened_grad_shape); + auto flattened_indices = at::sparse::flatten_indices(indices, full_size); + return flattened_dense_grad.index_select(0, flattened_indices); +} + } // anonymous namespace ${autograd_function_definitions} diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index d5a876e4ca83e6..3e3114841a8e15 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -662,6 +662,17 @@ def add_docstr_all(method, docstr): Returns the address of the first element of :attr:`self` tensor. """) +add_docstr_all('dense_dim', + r""" +dense_dim() -> int + +If :attr:`self` is a sparse COO tensor (i.e., with ``torch.sparse_coo`` layout), +this returns the number of dense dimensions. Otherwise, this throws an +error. + +See also :meth:`Tensor.sparse_dim`. +""") + add_docstr_all('diag', r""" diag(diagonal=0) -> Tensor @@ -980,9 +991,24 @@ def add_docstr_all(method, docstr): See :func:`torch.gesv` """) +add_docstr_all('indices', + r""" +indices() -> Tensor + +If :attr:`self` is a sparse COO tensor (i.e., with ``torch.sparse_coo`` layout), +this returns a view of the contained indices tensor. Otherwise, this throws an +error. + +See also :meth:`Tensor.values`. + +.. note:: + This method can only be called on a coalesced sparse tensor. See + :meth:`Tensor.coalesce` for details. +""") + add_docstr_all('get_device', r""" -get_device(A) -> Device ordinal (Integer) +get_device() -> Device ordinal (Integer) For CUDA tensors, this function returns the device ordinal of the GPU on which the tensor resides. For CPU tensors, an error is thrown. @@ -995,6 +1021,21 @@ def add_docstr_all(method, docstr): >>> x.cpu().get_device() # RuntimeError: get_device is not implemented for type torch.FloatTensor """) +add_docstr_all('values', + r""" +values() -> Tensor + +If :attr:`self` is a sparse COO tensor (i.e., with ``torch.sparse_coo`` layout), +this returns a view of the contained values tensor. Otherwise, this throws an +error. + +See also :meth:`Tensor.indices`. + +.. note:: + This method can only be called on a coalesced sparse tensor. See + :meth:`Tensor.coalesce` for details. +""") + add_docstr_all('gt', r""" gt(other) -> Tensor @@ -1462,7 +1503,7 @@ def callable(a, b) -> number Same as :meth:`Tensor.narrow` except returning a copy rather than shared storage. This is primarily for sparse tensors, which do not have a shared-storage narrow method.
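The narrow_copy description continues below; meanwhile, the sparse accessors documented above compose as in the following sketch (the particular indices and values are arbitrary):

    # Sketch: sparse_dim/dense_dim and the indices()/values() views on a
    # coalesced sparse COO tensor.
    import torch

    i = torch.tensor([[0, 1, 1],
                      [2, 0, 2]])                     # sparse_dim x nnz = 2 x 3
    v = torch.tensor([[3., 4.], [5., 6.], [7., 8.]])  # nnz x dense part = 3 x 2
    s = torch.sparse_coo_tensor(i, v, (2, 3, 2)).coalesce()

    print(s.sparse_dim())      # 2
    print(s.dense_dim())       # 1
    print(s.indices().shape)   # torch.Size([2, 3])
    print(s.values().shape)    # torch.Size([3, 2])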
Calling ```narrow_copy`` -with ```dimemsion > self._sparseDims()``` will return a copy with the +with ```dimension > self.sparse_dim()``` will return a copy with the relevant dense dimension narrowed, and ```self.shape``` updated accordingly. """) @@ -2094,6 +2135,17 @@ def callable(a, b) -> number See :func:`torch.sort` """) +add_docstr_all('sparse_dim', + r""" +sparse_dim() -> int + +If :attr:`self` is a sparse COO tensor (i.e., with ``torch.sparse_coo`` layout), +this returns the number of sparse dimensions. Otherwise, this throws an +error. + +See also :meth:`Tensor.dense_dim`. +""") + add_docstr_all('sqrt', r""" sqrt() -> Tensor diff --git a/torch/csrc/autograd/VariableTypeUtils.h b/torch/csrc/autograd/VariableTypeUtils.h index d28149e1282571..07cc420e429875 100644 --- a/torch/csrc/autograd/VariableTypeUtils.h +++ b/torch/csrc/autograd/VariableTypeUtils.h @@ -103,21 +103,24 @@ template <typename... Args> inline variable_list flatten_tensor_args(Args&&... args) { return out; // RVO } -inline Tensor as_view(const Tensor & base, Tensor tensor) { +// See NOTE [ Autograd View Variables ] for details. +inline Tensor as_view(const Tensor & base, Tensor tensor, bool is_differentiable = true) { auto base_var = Variable(base); if (base_var.is_view()) { base_var = base_var.base(); } - return make_variable_view(std::move(base_var), std::move(tensor)); + return make_variable_view(std::move(base_var), std::move(tensor), is_differentiable); } -inline std::vector<Tensor> as_view(const Tensor & base, std::vector<Tensor> tensors) { +// See NOTE [ Autograd View Variables ] for details. +inline std::vector<Tensor> as_view(const Tensor & base, std::vector<Tensor> tensors, + bool is_differentiable = true) { auto base_var = Variable(base); if (base_var.is_view()) { base_var = base_var.base(); } for(Tensor &tensor : tensors) { - tensor = make_variable_view(base_var, std::move(tensor)); + tensor = make_variable_view(base_var, std::move(tensor), is_differentiable); } return tensors; } diff --git a/torch/csrc/autograd/functions/tensor.h b/torch/csrc/autograd/functions/tensor.h index 101595368076e9..11c447cb8bae38 100644 --- a/torch/csrc/autograd/functions/tensor.h +++ b/torch/csrc/autograd/functions/tensor.h @@ -22,6 +22,10 @@ struct CopyBackwards : public Function { // Performs grad[idx] = fn(grad[idx]), but out-of-place. The slicing operation // grad[idx] is defined by the relative sizes, strides, and offset of base and // view. +// When an in-place operation is done on a differentiable view, the base's +// grad_fn is updated to become a `CopySlices` wrapping the backward of the +// in-place operation. +// See NOTE [ Autograd View Variables ]. struct CopySlices : public Function { CopySlices( const Variable& base_var, diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp index c36a52a71a279a..74a58f8454bd96 100644 --- a/torch/csrc/autograd/variable.cpp +++ b/torch/csrc/autograd/variable.cpp @@ -115,12 +115,6 @@ std::shared_ptr<Function> Variable::Impl::get_grad_accumulator() { return result; } -Variable Variable::Impl::detach() const { - auto detached = make_variable(data_, /*requires_grad=*/false); - detached.set_version_counter(version_counter_); - return detached; -} - void Variable::Impl::detach_() { if (is_view_) { AT_ERROR("Can't detach views in-place.
Use detach() instead"); } @@ -172,7 +166,7 @@ void Variable::Impl::release_resources() { hooks_.clear(); } -Variable::ViewImpl::ViewImpl(Variable base, at::Tensor data, Edge gradient_edge) +Variable::DifferentiableViewImpl::DifferentiableViewImpl(Variable base, at::Tensor data, Edge gradient_edge) : Variable::Impl(std::move(data), false, std::move(gradient_edge)), base_(std::move(base)) { AT_CHECK(base_.defined(), "base is undefined"); @@ -184,7 +178,7 @@ Variable::ViewImpl::ViewImpl(Variable base, at::Tensor data, Edge gradient_edge) attr_version = version_counter_.current_version(); } -std::shared_ptr<Function>& Variable::ViewImpl::get_grad_fn() { +std::shared_ptr<Function>& Variable::DifferentiableViewImpl::get_grad_fn() { std::lock_guard<std::mutex> lock(mutex_); if (!grad_fn_ && !base_.requires_grad()) { return grad_fn_; @@ -208,7 +202,7 @@ std::shared_ptr<Function>& Variable::ViewImpl::get_grad_fn() { return grad_fn_; } -void Variable::ViewImpl::rebase_history(Edge gradient_edge) { +void Variable::DifferentiableViewImpl::rebase_history(Edge gradient_edge) { AT_ASSERT(gradient_edge.input_nr == 0); AT_ASSERT(gradient_edge.function); AT_CHECK( @@ -221,7 +215,7 @@ void Variable::ViewImpl::rebase_history(Edge gradient_edge) { get_grad_fn(); // trigger an update to the view's grad_fn } -void Variable::ViewImpl::release_resources() { +void Variable::DifferentiableViewImpl::release_resources() { Variable::Impl::release_resources(); base_.reset(); } @@ -229,7 +223,7 @@ void Variable::ViewImpl::release_resources() { void Variable::rebase_history(Edge gradient_edge) { AT_ASSERT(gradient_edge.function != nullptr); if (is_view()) { - auto& impl = static_cast<Variable::ViewImpl&>(*get()); + auto& impl = static_cast<Variable::DifferentiableViewImpl&>(*get()); impl.rebase_history(std::move(gradient_edge)); } else { set_gradient_edge(std::move(gradient_edge)); diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index 804be552392d83..48cc918ca288c9 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -61,8 +61,10 @@ struct Function; /// `Variable`. You can determine whether `Variable` is in fact a view by /// probing its `is_view()` method. Note that the *view* semantics are only /// meaningful for `Variable` relations that are relevant to autograd. For -/// example, if you hide your code from autograd using `.data`, the `Variable`s -/// will not be registered as having view relations, even if they share storage. +/// example, if you hide your code from autograd using `.no_grad()`, the +/// `Variable`s will not be registered as having view relations, even if they +/// share storage. +/// See NOTE [ Autograd View Variables ] for more details. /// /// /// Interface @@ -92,9 +94,13 @@ struct TORCH_API Variable : public at::Tensor { /// Creates a `Variable` that is a *view* of another (*base*) variable. /// The `gradient_edge` is an optional (gradient_function, input_number) pair. + /// `is_differentiable` is a bool that specifies whether this view is + /// differentiable, i.e., whether the relation should be tracked by autograd. + /// See NOTE [ Autograd View Variables ] for details. friend Variable make_variable_view( Variable base, at::Tensor data, + bool is_differentiable, Edge gradient_edge); /// Creates a `Variable` from the given `Tensor`. `requires_grad` should be @@ -263,7 +269,7 @@ struct TORCH_API Variable : public at::Tensor { /// and the `get()` method which exposes it shall forever remain private and /// never be exposed to the public interface of this class.
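In user-facing terms, the shared version counter described above is what lets autograd catch writes made through any alias of a tensor, including the non-differentiable view returned by detach(). A small sketch of that behavior (the exact error text may differ):

    # Sketch: a write through a detached (non-differentiable) view bumps the
    # version counter shared with the base, so a stale saved tensor is caught.
    import torch

    base = torch.ones(3, requires_grad=True)
    out = base * base            # mul saves `base` for its backward
    alias = base.detach()        # shares storage and version counter with `base`

    alias.add_(1)                # in-place write through the alias
    try:
        out.sum().backward()
    except RuntimeError as err:
        print(err)               # complains about an in-place modification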
struct Impl; - struct ViewImpl; + struct DifferentiableViewImpl; // Private Methods //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -330,7 +336,6 @@ struct TORCH_API Variable::Impl : public at::TensorImpl { return grad_; } - Variable detach() const; void detach_(); void set_data(Tensor new_data); @@ -372,15 +377,80 @@ struct TORCH_API Variable::Impl : public at::TensorImpl { }; //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -// Variable::ViewImpl +// Variable::DifferentiableViewImpl //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -/// A Variable that is a view on another Variable. The base and view share the -/// same version_counter. The grad_fn field of the Variable may become stale -/// due to in-place modifications of the shared data. Accesses should go -/// through get_grad_fn(). All other fields are always valid. -struct TORCH_API Variable::ViewImpl : public Variable::Impl { - ViewImpl(Variable base, at::Tensor data, Edge gradient_edge); +/// NOTE [ Autograd View Variables ] +/// +/// Many operations return Variable that shares storage with an input Variable. +/// The returned Variable is called a **view** Variable on the input **base** +/// Variable. +/// +/// In PyTorch, we have two types of views: differentiable views, and +/// non-differentiable views. In either type, to support proper version +/// checking, the base and view Variables must always share the same +/// version_counter. +/// +/// +/// Differentiable Views +/// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +/// Differentiable views are the view variables where you want gradients to flow +/// back to the base variables. Out-of-place operations on views are quite +/// straightforward, but in-place ones are very tricky. Even if the base +/// variable may not require grad when we create the view, we still need to +/// track the view relation because future in-place ops may require back-proping +/// through it. For example, we need to support +/// +/// (1) in-place operation on view, e.g., +/// +/// # Have: +/// # base.requires_grad = False +/// # var.requires_grad = True +/// base[1] = var # i.e., base[1].copy_(var) +/// torch.autograd.grad(base.sum(), var) <- should return an all ones tensor +/// +/// (2) in-place operation on base after view is created, e.g., +/// +/// # Have: +/// # base.requires_grad = False +/// # var.requires_grad = True +/// view = base[1] +/// base.copy_(var) +/// torch.autograd.grad(view.sum(), var) <- should return a tensor with +/// var[1] filled with all ones and +/// zeros everywhere else +/// +/// Variable::DifferentiableViewImpl is created to support gradient tracking of +/// such **in-place** operations. In particular, +/// + if an in-place op is done on base, the grad_fn field of the view may +/// become stale. So accesses should always go through get_grad_fn(), which +/// reconstructs an updated grad_fn if the version_counter has incremented. +/// All other fields are always valid. +/// + if an in-place op is done on view, in rebase_history() of view, which is +/// called after every in-place op in VariableType.cpp, the grad_fn of base +/// is updated. +/// +/// +/// Non-Differentiable Views +/// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +/// In certain cases, although function outputs share storage with inputs, they +/// will **never** require gradient history tracking. 
Instead of registering the +/// view relation via DifferentiableViewImpl in autograd, the views will be +/// using the usual Variable::Impl and just share the version counters with the base +/// Variables. +/// Such views include: +/// 1. Views created from .detach() +/// 2. Views that are non-differentiable by their nature. +/// E.g., `sparse_tensor.indices()` is an integral view on a (possibly) +/// floating point tensor. +/// See top of `derivatives.yaml` on how to specify that outputs of a +/// function are non-differentiable. +/// These are called non-differentiable views as the gradients do not flow +/// through the view relation. +/// Relevant logic for non-differentiable views is implemented in +/// make_variable_view below, and wrap_output of gen_variable_type.py. +struct TORCH_API Variable::DifferentiableViewImpl : public Variable::Impl { + DifferentiableViewImpl(Variable base, at::Tensor data, Edge gradient_edge); /// Gets the up-to-date grad_fn. If the shared data or base was modified, we /// re-create the grad_fn to express the up-to-date view relationship between @@ -414,13 +484,24 @@ struct TORCH_API Variable::ViewImpl : public Variable::Impl { // Factory Functions //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// See NOTE [ Autograd View Variables ] for details. inline Variable make_variable_view( Variable base, at::Tensor data, + bool is_differentiable = true, Edge gradient_edge = Edge()) { if (data.defined()) { - return Variable(c10::make_intrusive<Variable::ViewImpl>( - std::move(base), std::move(data), std::move(gradient_edge))); + if (is_differentiable) { + /// Differentiable view. Track history with DifferentiableViewImpl. + return Variable(c10::make_intrusive<Variable::DifferentiableViewImpl>( + std::move(base), std::move(data), std::move(gradient_edge))); + } else { + /// Non-differentiable view. Just share version counter.
+ auto var = Variable(c10::make_intrusive( + std::move(data), false, std::move(gradient_edge))); + var.set_version_counter(base.version_counter()); + return var; + } } return Variable(); } @@ -500,7 +581,7 @@ inline std::shared_ptr Variable::grad_accumulator() const { } inline Variable Variable::detach() const { - return get()->detach(); + return make_variable_view(*this, get()->data_, /*is_differentiable=*/false); } inline void Variable::detach_() { diff --git a/torch/csrc/cuda/comm.cpp b/torch/csrc/cuda/comm.cpp index d4259a70baeb75..5ba6c55cfe1af8 100644 --- a/torch/csrc/cuda/comm.cpp +++ b/torch/csrc/cuda/comm.cpp @@ -82,6 +82,7 @@ tensor_list2d broadcast_coalesced(TensorList tensors, IntList devices, size_t bu o.reserve(tensors.size()); unique_type_checker type_checker; + at::DeviceGuard device_guard(devices[0]); for (auto & chunk : utils::take_tensors(tensors, buffer_size)) { auto & type = chunk.type(); type_checker.show(type); @@ -92,7 +93,7 @@ tensor_list2d broadcast_coalesced(TensorList tensors, IntList devices, size_t bu std::vector broadcast_values = broadcast(flat_tuple.second, devices); results.reserve(devices.size()); for (size_t i = 1, num_devices = devices.size(); i < num_devices; ++i) { - at::DeviceGuard device_guard(devices[i]); + device_guard.set_index(devices[i]); auto & device_outputs = outputs[i]; auto & inds = broadcast_indices[i]; auto & vals = broadcast_values[i]; @@ -100,7 +101,6 @@ tensor_list2d broadcast_coalesced(TensorList tensors, IntList devices, size_t bu device_outputs.push_back(std::move(t)); } } else { - at::DeviceGuard device_guard(devices[0]); std::vector results = broadcast(utils::flatten_dense_tensors(chunk.tensors), devices); for (size_t i = 1, num_devices = devices.size(); i < num_devices; ++i) { diff --git a/torch/csrc/utils/tensor_flatten.cpp b/torch/csrc/utils/tensor_flatten.cpp index 29414368c7bd50..be01a66b5996bb 100644 --- a/torch/csrc/utils/tensor_flatten.cpp +++ b/torch/csrc/utils/tensor_flatten.cpp @@ -85,8 +85,11 @@ std::vector unflatten_sparse_tensors( std::vector outputs; outputs.reserve(tensors.size()); - for (size_t i = 0, num_tensors = tensors.size(); i < num_tensors; ++i) - outputs.emplace_back(at::_sparse_coo_tensor_unsafe(indices[i], values[i], tensors[i].sizes())); + for (size_t i = 0, num_tensors = tensors.size(); i < num_tensors; ++i) { + auto &ref_t = tensors[i]; + auto t = at::_sparse_coo_tensor_unsafe(indices[i], values[i], ref_t.sizes()); + outputs.emplace_back(t._coalesced_(ref_t.is_coalesced())); + } return outputs; } diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index 628409b18df981..7f2a70827acba3 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -221,8 +221,8 @@ Tensor internal_new_from_data( *torch::getLayout(type.backend()), type_inference_device_type); const auto& type_to_use = type_inference ? type_inference_type : type; - return copy_variables ? new_with_tensor_copy(type_to_use, var, device_index) : - new_with_type_conversion(type_to_use, var, device_index); + return copy_variables ? 
new_with_tensor_copy(type_to_use, var, device_index) + : new_with_type_conversion(type_to_use, var, device_index); } #ifdef USE_NUMPY @@ -291,12 +291,12 @@ Tensor legacy_sparse_tensor_ctor(const Type& type, PyObject* args, PyObject* kwa auto deviceOptional = r.deviceOptional(2); check_legacy_ctor_device(type, deviceOptional); at::DeviceGuard device_guard(deviceOptional); - return at::sparse_coo_tensor(r.tensor(0), r.tensor(1), type.options()); + return at::sparse_coo_tensor(r.tensor(0), r.tensor(1)); } else if (r.idx == 3) { auto deviceOptional = r.deviceOptional(3); check_legacy_ctor_device(type, deviceOptional); at::DeviceGuard device_guard(deviceOptional); - return at::sparse_coo_tensor(r.tensor(0), r.tensor(1), r.intlist(2), type.options()); + return at::sparse_coo_tensor(r.tensor(0), r.tensor(1), r.intlist(2)); } else if (r.idx == 4) { PyObject* arg = r.pyobject(0); auto deviceOptional = r.deviceOptional(1); @@ -335,14 +335,14 @@ Tensor legacy_sparse_tensor_new(const Type& type, PyObject* args, PyObject* kwar auto deviceOptional = r.deviceOptional(2); check_legacy_ctor_device(type, deviceOptional); at::DeviceGuard device_guard(deviceOptional); - return at::sparse_coo_tensor(r.tensor(0), r.tensor(1), type.options()); + return at::sparse_coo_tensor(r.tensor(0), r.tensor(1)); } else if (r.idx == 3) { // Note: this signature doesn't have a dtype, even though it has a device; it probably shouldn't // have a device (we should infer it). auto deviceOptional = r.deviceOptional(3); check_legacy_ctor_device(type, deviceOptional); at::DeviceGuard device_guard(deviceOptional); - return at::sparse_coo_tensor(r.tensor(0), r.tensor(1), r.intlist(2), type.options()); + return at::sparse_coo_tensor(r.tensor(0), r.tensor(1), r.intlist(2)); } else if (r.idx == 4) { PyObject* arg = r.pyobject(0); auto deviceOptional = r.deviceOptional(1); @@ -464,10 +464,7 @@ Tensor legacy_new_from_data( return internal_new_from_data(type, device, data, false, false, false); } -Tensor sparse_coo_tensor_ctor(const Type& type, PyObject* args, PyObject* kwargs) { - const auto sparse_backend = type.is_cuda() ? Backend::SparseCUDA : Backend::SparseCPU; - const auto& default_sparse_type = type.toBackend(sparse_backend); - +Tensor sparse_coo_tensor_ctor(const Type& default_type, PyObject* args, PyObject* kwargs) { static PythonArgParser parser({ "sparse_coo_tensor(PyObject* indices, PyObject* values, *, ScalarType dtype=None, Device? device=None, bool requires_grad=False)", "sparse_coo_tensor(PyObject* indices, PyObject* values, IntList size, *, ScalarType dtype=None, Device? device=None, bool requires_grad=False)", @@ -478,29 +475,27 @@ Tensor sparse_coo_tensor_ctor(const Type& type, PyObject* args, PyObject* kwargs auto r = parser.parse(args, kwargs, parsed_args); if (r.idx == 0) { bool type_inference = r.isNone(2); - const auto& sparse_type = typeWithDefault(r, 2, 3, default_sparse_type); - const auto& dense_type = sparse_type.toBackend(sparse_type.is_cuda() ? Backend::CUDA : Backend::CPU); + const auto& type = typeWithDefault(r, 2, 3, default_type); + const auto& values_type = type.toDense(); at::DeviceGuard device_guard(r.device(3)); - Tensor values = internal_new_from_data(dense_type, r.deviceOptional(3), r.pyobject(1), false, true, type_inference); // if no dtype provided, infer type based on value type. 
- const auto& index_type = values.type().toScalarType(kLong); - Tensor indices = internal_new_from_data(index_type, r.deviceOptional(3), r.pyobject(0), false, true, false); - const auto& sparse_type_to_use = values.type().toBackend(values.type().is_cuda() ? Backend::SparseCUDA : Backend::SparseCPU); - return at::sparse_coo_tensor(indices, values, sparse_type_to_use.options()).set_requires_grad(r.toBool(4)); + Tensor values = internal_new_from_data(values_type, r.deviceOptional(3), r.pyobject(1), false, true, type_inference); + const auto& indices_type = values.type().toScalarType(kLong); + Tensor indices = internal_new_from_data(indices_type, r.deviceOptional(3), r.pyobject(0), false, true, false); + return at::sparse_coo_tensor(indices, values, values.options().layout(at::kSparse)).set_requires_grad(r.toBool(4)); } else if (r.idx == 1) { bool type_inference = r.isNone(3); - const auto& sparse_type = typeWithDefault(r, 3, 4, default_sparse_type); - const auto& dense_type = sparse_type.toBackend(sparse_type.is_cuda() ? Backend::CUDA : Backend::CPU); + const auto& type = typeWithDefault(r, 3, 4, default_type); + const auto& values_type = type.toDense(); at::DeviceGuard device_guard(r.device(4)); - Tensor values = internal_new_from_data(dense_type, r.deviceOptional(4), r.pyobject(1), false, true, type_inference); - const auto& index_type = values.type().toScalarType(kLong); - Tensor indices = internal_new_from_data(index_type, r.deviceOptional(4), r.pyobject(0), false, true, false); - const auto& sparse_type_to_use = values.type().toBackend(values.type().is_cuda() ? Backend::SparseCUDA : Backend::SparseCPU); - return at::sparse_coo_tensor(indices, values, r.intlist(2), sparse_type_to_use.options()).set_requires_grad(r.toBool(5)); + Tensor values = internal_new_from_data(values_type, r.deviceOptional(4), r.pyobject(1), false, true, type_inference); + const auto& indices_type = values.type().toScalarType(kLong); + Tensor indices = internal_new_from_data(indices_type, r.deviceOptional(4), r.pyobject(0), false, true, false); + return at::sparse_coo_tensor(indices, values, r.intlist(2), values.options().layout(at::kSparse)).set_requires_grad(r.toBool(5)); } else if (r.idx == 2) { - const auto& sparse_type_to_use = typeWithDefault(r, 1, 2, default_sparse_type); + const auto& type = typeWithDefault(r, 1, 2, default_type); at::DeviceGuard device_guard(r.device(2)); - return at::sparse_coo_tensor(r.intlist(0), sparse_type_to_use.options()).set_requires_grad(r.toBool(3)); + return at::sparse_coo_tensor(r.intlist(0), type.options().layout(at::kSparse)).set_requires_grad(r.toBool(3)); } throw std::runtime_error("sparse_coo_tensor(): invalid arguments"); }
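Together with sparse_constructor_values_backward in Functions.cpp, the rewritten constructor path is meant to let gradients flow back to the dense values argument while inferring dtype and layout from it. A minimal end-to-end sketch of that intended behavior (each entry contributes once to the sum, hence the all-ones gradient):

    # Sketch: dtype/layout inferred from `values`; gradient flows back through
    # the sparse constructor to the dense values tensor.
    import torch

    i = torch.tensor([[0, 1],
                      [1, 0]])
    v = torch.tensor([2., 3.], requires_grad=True)
    s = torch.sparse_coo_tensor(i, v, (2, 2))

    loss = s.to_dense().sum()
    loss.backward()
    print(v.grad)                # tensor([1., 1.])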