diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc
index 2ed8d8e3ff78..0ab56a116eab 100644
--- a/src/codegen/codegen_cuda.cc
+++ b/src/codegen/codegen_cuda.cc
@@ -273,6 +273,16 @@ void CodeGenCUDA::VisitExpr_(const Ramp* op, std::ostream& os) {
 }
 
 void CodeGenCUDA::VisitExpr_(const Broadcast* op, std::ostream& os) {   // NOLINT(*)
+  if (op->type.is_int() && op->type.bits() == 8 && op->lanes == 4) {
+    // make_int8x4
+    const int64_t *p = as_const_int(op->value);
+    CHECK(p);
+    int64_t v = *p & 0xFF;
+    v = (v << 24) | (v << 16) | (v << 8) | v;
+    os << "(int)" << v;
+    return;
+  }
+
   std::string v = PrintExpr(op->value);
   os << "make_";
   PrintType(op->type, os);
diff --git a/tests/python/unittest/test_codegen_cuda.py b/tests/python/unittest/test_codegen_cuda.py
index a0b1cf445ba6..d3b770790bdb 100644
--- a/tests/python/unittest/test_codegen_cuda.py
+++ b/tests/python/unittest/test_codegen_cuda.py
@@ -87,7 +87,30 @@ def check_cuda(dtype, n, lanes):
     check_cuda("int8", 64, 8)
     check_cuda("int8", 64, 16)
 
+def test_cuda_make_int8x4():
+    def check_cuda(n, value):
+        if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
+            print("skip because cuda is not enabled..")
+            return
+        lanes = 4
+        dtype = 'int8'
+        ctx = tvm.gpu(0)
+        A = tvm.compute((n, lanes), lambda i,j: tvm.const(value, dtype=dtype))
+        s = tvm.create_schedule(A.op)
+        y, x = s[A].op.axis
+        s[A].vectorize(x)
+        s[A].bind(y, tvm.thread_axis("blockIdx.x"))
+        fun = tvm.build(s, [A], "cuda", name="make_int8x4")
+        np_a = np.full((n, lanes), value, dtype=dtype)
+        a = tvm.nd.empty(np_a.shape, dtype, ctx)
+        fun(a)
+        np.testing.assert_equal(a.asnumpy(), np_a)
+    check_cuda(64, 0xAB)
+    check_cuda(64, 0)
+    check_cuda(64, -3)
+
 if __name__ == "__main__":
     test_cuda_vectorize_add()
     test_cuda_multiply_add()
     test_cuda_vectorize_load()
+    test_cuda_make_int8x4()
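
Note (not part of the patch): for reference, a minimal standalone sketch of the byte-packing the new Broadcast branch performs. The helper name `pack_int8x4` is hypothetical; it simply mirrors the `(v << 24) | (v << 16) | (v << 8) | v` expression added to `CodeGenCUDA::VisitExpr_`, and the printed literals are what that expression yields for the test values 0xAB and -3.

```cpp
// Standalone sketch of the int8x4 broadcast packing. pack_int8x4 is a
// hypothetical helper, not part of the patch; it mirrors the shift-and-or
// expression used in CodeGenCUDA for a constant int8 broadcast with 4 lanes.
#include <cstdint>
#include <iostream>

int64_t pack_int8x4(int64_t value) {
  int64_t v = value & 0xFF;  // keep only the low byte of the broadcast constant
  return (v << 24) | (v << 16) | (v << 8) | v;  // replicate that byte into all four lanes
}

int main() {
  std::cout << "(int)" << pack_int8x4(0xAB) << "\n";  // 0xABABABAB -> (int)2880154539
  std::cout << "(int)" << pack_int8x4(-3) << "\n";    // -3 is 0xFD as int8 -> 0xFDFDFDFD -> (int)4261281277
  return 0;
}
```

With this branch, a broadcast of a constant int8 value across 4 lanes is emitted as a single cast integer literal rather than a per-lane vector constructor, which is what the added `test_cuda_make_int8x4` test exercises.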