Skip to content

Commit

Permalink
[CUDA] auto detect compatibility when arch is not passed (#490)
Browse files Browse the repository at this point in the history
  • Loading branch information
tqchen authored Sep 26, 2017
1 parent c6a2045 commit c468558
Show file tree
Hide file tree
Showing 16 changed files with 61 additions and 31 deletions.
3 changes: 2 additions & 1 deletion include/tvm/runtime/device_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ namespace runtime {
enum DeviceAttrKind : int {
kExist = 0,
kMaxThreadsPerBlock = 1,
kWarpSize = 2
kWarpSize = 2,
kComputeVersion = 3
};

/*! \brief Number of bytes each allocation must align to */
Expand Down
14 changes: 14 additions & 0 deletions python/tvm/_ffi/runtime_ctypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,20 @@ def warp_size(self):
return _api_internal._GetDeviceAttr(
self.device_type, self.device_id, 2)

@property
def compute_version(self):
"""Get compute version number in string.
Currently used to get compute capability of CUDA device.
Returns
-------
version : str
The version string in `major.minor` format.
"""
return _api_internal._GetDeviceAttr(
self.device_type, self.device_id, 3)

def sync(self):
"""Synchronize until jobs finished at the context."""
check_call(_LIB.TVMSynchronize(self.device_type, self.device_id, None))
Expand Down
5 changes: 1 addition & 4 deletions python/tvm/contrib/cc.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,8 @@ def create_shared(output,
if options:
cmd += options

args = ' '.join(cmd)
proc = subprocess.Popen(
args, shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
(out, _) = proc.communicate()

if proc.returncode != 0:
Expand Down
40 changes: 22 additions & 18 deletions python/tvm/contrib/nvcc.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
# pylint: disable=invalid-name
"""Utility to invoke nvcc compiler in the system"""
from __future__ import absolute_import as _abs
import sys

import subprocess
from . import util
from .. import ndarray as nd

def compile_cuda(code, target="ptx", arch=None,
options=None, path_target=None):
def compile_cuda(code,
target="ptx",
arch=None,
options=None,
path_target=None):
"""Compile cuda code with NVCC from env.
Parameters
Expand Down Expand Up @@ -39,32 +43,32 @@ def compile_cuda(code, target="ptx", arch=None,

with open(temp_code, "w") as out_file:
out_file.write(code)
if target == "cubin" and arch is None:
raise ValueError("arch(sm_xy) must be passed for generating cubin")

if arch is None:
if nd.gpu(0).exist:
# auto detect the compute arch argument
arch = "sm_" + "".join(nd.gpu(0).compute_version.split('.'))
else:
raise ValueError("arch(sm_xy) is not passed, and we cannot detect it from env")

file_target = path_target if path_target else temp_target
cmd = ["nvcc"]
cmd += ["--%s" % target, "-O3"]
if arch:
cmd += ["-arch", arch]
cmd += ["-arch", arch]
cmd += ["-o", file_target]

if options:
cmd += options
cmd += [temp_code]
args = ' '.join(cmd)

proc = subprocess.Popen(
args, shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

(out, _) = proc.communicate()

if proc.returncode != 0:
sys.stderr.write("Compilation error:\n")
sys.stderr.write(str(out))
sys.stderr.flush()
cubin = None
else:
cubin = bytearray(open(file_target, "rb").read())
return cubin
msg = "Compilation error:\n"
msg += out
raise RuntimeError(msg)

return bytearray(open(file_target, "rb").read())
11 changes: 11 additions & 0 deletions src/runtime/cuda/cuda_device_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,17 @@ class CUDADeviceAPI final : public DeviceAPI {
&value, cudaDevAttrWarpSize, ctx.device_id));
break;
}
case kComputeVersion: {
std::ostringstream os;
CUDA_CALL(cudaDeviceGetAttribute(
&value, cudaDevAttrComputeCapabilityMajor, ctx.device_id));
os << value << ".";
CUDA_CALL(cudaDeviceGetAttribute(
&value, cudaDevAttrComputeCapabilityMinor, ctx.device_id));
os << value;
*rv = os.str();
return;
}
}
*rv = value;
}
Expand Down
1 change: 1 addition & 0 deletions src/runtime/metal/metal_device_api.mm
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
*rv = 1;
break;
}
case kComputeVersion: return;
case kExist: break;
}
}
Expand Down
1 change: 1 addition & 0 deletions src/runtime/opencl/opencl_device_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ void OpenCLWorkspace::GetAttr(
*rv = 1;
break;
}
case kComputeVersion: return;
case kExist: break;
}
}
Expand Down
1 change: 1 addition & 0 deletions src/runtime/rocm/rocm_device_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ class ROCMDeviceAPI final : public DeviceAPI {
value = 64;
break;
}
case kComputeVersion: return;
}
*rv = value;
}
Expand Down
2 changes: 1 addition & 1 deletion topi/python/topi/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def _compute(begin, *indices):
begin_ids = [seg_size * i for i in range(indices_or_sections)]
elif isinstance(indices_or_sections, (tuple, list)):
assert tuple(indices_or_sections) == tuple(sorted(indices_or_sections)),\
"Should be sorted, received %s" %str(indices_or_sections)
"Should be sorted, received %s" % str(indices_or_sections)
begin_ids = [0] + list(indices_or_sections)
else:
raise NotImplementedError
Expand Down
2 changes: 1 addition & 1 deletion topi/recipe/broadcast/test_broadcast_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

@tvm.register_func
def tvm_callback_cuda_compile(code):
ptx = nvcc.compile_cuda(code, target="ptx", options=["-arch=sm_52"])
ptx = nvcc.compile_cuda(code, target="ptx")
return ptx


Expand Down
2 changes: 1 addition & 1 deletion topi/recipe/conv/depthwise_conv2d_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

@tvm.register_func
def tvm_callback_cuda_compile(code):
ptx = nvcc.compile_cuda(code, target="ptx", options=["-arch=sm_37"]) # 37 for k80(ec2 instance)
ptx = nvcc.compile_cuda(code, target="ptx")
return ptx

def write_code(code, fname):
Expand Down
2 changes: 1 addition & 1 deletion topi/recipe/conv/test_conv2d_hwcn_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

@tvm.register_func
def tvm_callback_cuda_compile(code):
ptx = nvcc.compile_cuda(code, target="ptx", options=["-arch=sm_37"])
ptx = nvcc.compile_cuda(code, target="ptx")
return ptx

def write_code(code, fname):
Expand Down
2 changes: 1 addition & 1 deletion topi/recipe/gemm/cuda_gemm_square.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

@tvm.register_func
def tvm_callback_cuda_compile(code):
ptx = nvcc.compile_cuda(code, target="ptx", options=["-arch=sm_52"])
ptx = nvcc.compile_cuda(code, target="ptx")
return ptx

def write_code(code, fname):
Expand Down
2 changes: 1 addition & 1 deletion topi/recipe/reduce/test_reduce_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

@tvm.register_func
def tvm_callback_cuda_compile(code):
ptx = nvcc.compile_cuda(code, target="ptx", options=["-arch=sm_52"])
ptx = nvcc.compile_cuda(code, target="ptx")
return ptx


Expand Down
2 changes: 1 addition & 1 deletion topi/recipe/rnn/lstm.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
@tvm.register_func
def tvm_callback_cuda_compile(code):
"""Use nvcc compiler for better perf."""
ptx = nvcc.compile_cuda(code, target="ptx", options=["-arch=sm_52"])
ptx = nvcc.compile_cuda(code, target="ptx")
return ptx

def write_code(code, fname):
Expand Down
2 changes: 1 addition & 1 deletion topi/recipe/rnn/matexp.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
@tvm.register_func
def tvm_callback_cuda_compile(code):
"""Use nvcc compiler for better perf."""
ptx = nvcc.compile_cuda(code, target="ptx", options=["-arch=sm_52"])
ptx = nvcc.compile_cuda(code, target="ptx")
return ptx

def write_code(code, fname):
Expand Down

0 comments on commit c468558

Please sign in to comment.