diff --git a/java/src/main/java/ai/rapids/cudf/CudaException.java b/java/src/main/java/ai/rapids/cudf/CudaException.java index 2d862b47ef8..ff7ca308f3c 100755 --- a/java/src/main/java/ai/rapids/cudf/CudaException.java +++ b/java/src/main/java/ai/rapids/cudf/CudaException.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,9 @@ */ package ai.rapids.cudf; +import java.util.HashMap; +import java.util.Map; + /** * Exception from the cuda language/library. Be aware that because of how cuda does asynchronous * processing exceptions from cuda can be thrown by method calls that did not cause the exception @@ -28,11 +31,283 @@ * don't switch between threads for different parts of processing that can be retried as a chunk. */ public class CudaException extends RuntimeException { - CudaException(String message) { + CudaException(String message, int errorCode) { super(message); + cudaError = CudaError.parseErrorCode(errorCode); } - CudaException(String message, Throwable cause) { + CudaException(String message, int errorCode, Throwable cause) { super(message, cause); + cudaError = CudaError.parseErrorCode(errorCode); + } + + public final CudaError cudaError; + + /** + * The Java mirror of cudaError, which facilities the tracking of CUDA errors in JVM. + */ + public enum CudaError { + UnknownNativeError(-1), // native CUDA error type which Java doesn't have a representation + cudaErrorInvalidValue(1), + cudaErrorMemoryAllocation(2), + cudaErrorInitializationError(3), + cudaErrorCudartUnloading(4), + cudaErrorProfilerDisabled(5), + cudaErrorProfilerNotInitialized(6), + cudaErrorProfilerAlreadyStarted(7), + cudaErrorProfilerAlreadyStopped(8), + cudaErrorInvalidConfiguration(9), + cudaErrorInvalidPitchValue(12), + cudaErrorInvalidSymbol(13), + cudaErrorInvalidHostPointer(16), + cudaErrorInvalidDevicePointer(17), + cudaErrorInvalidTexture(18), + cudaErrorInvalidTextureBinding(19), + cudaErrorInvalidChannelDescriptor(20), + cudaErrorInvalidMemcpyDirection(21), + cudaErrorAddressOfConstant(22), + cudaErrorTextureFetchFailed(23), + cudaErrorTextureNotBound(24), + cudaErrorSynchronizationError(25), + cudaErrorInvalidFilterSetting(26), + cudaErrorInvalidNormSetting(27), + cudaErrorMixedDeviceExecution(28), + cudaErrorNotYetImplemented(31), + cudaErrorMemoryValueTooLarge(32), + cudaErrorStubLibrary(34), + cudaErrorInsufficientDriver(35), + cudaErrorCallRequiresNewerDriver(36), + cudaErrorInvalidSurface(37), + cudaErrorDuplicateVariableName(43), + cudaErrorDuplicateTextureName(44), + cudaErrorDuplicateSurfaceName(45), + cudaErrorDevicesUnavailable(46), + cudaErrorIncompatibleDriverContext(49), + cudaErrorMissingConfiguration(52), + cudaErrorPriorLaunchFailure(53), + cudaErrorLaunchMaxDepthExceeded(65), + cudaErrorLaunchFileScopedTex(66), + cudaErrorLaunchFileScopedSurf(67), + cudaErrorSyncDepthExceeded(68), + cudaErrorLaunchPendingCountExceeded(69), + cudaErrorInvalidDeviceFunction(98), + cudaErrorNoDevice(100), + cudaErrorInvalidDevice(101), + cudaErrorDeviceNotLicensed(102), + cudaErrorSoftwareValidityNotEstablished(103), + cudaErrorStartupFailure(127), + cudaErrorInvalidKernelImage(200), + cudaErrorDeviceUninitialized(201), + cudaErrorMapBufferObjectFailed(205), + cudaErrorUnmapBufferObjectFailed(206), + cudaErrorArrayIsMapped(207), + cudaErrorAlreadyMapped(208), + cudaErrorNoKernelImageForDevice(209), + cudaErrorAlreadyAcquired(210), + cudaErrorNotMapped(211), + cudaErrorNotMappedAsArray(212), + cudaErrorNotMappedAsPointer(213), + cudaErrorECCUncorrectable(214), + cudaErrorUnsupportedLimit(215), + cudaErrorDeviceAlreadyInUse(216), + cudaErrorPeerAccessUnsupported(217), + cudaErrorInvalidPtx(218), + cudaErrorInvalidGraphicsContext(219), + cudaErrorNvlinkUncorrectable(220), + cudaErrorJitCompilerNotFound(221), + cudaErrorUnsupportedPtxVersion(222), + cudaErrorJitCompilationDisabled(223), + cudaErrorUnsupportedExecAffinity(224), + cudaErrorInvalidSource(300), + cudaErrorFileNotFound(301), + cudaErrorSharedObjectSymbolNotFound(302), + cudaErrorSharedObjectInitFailed(303), + cudaErrorOperatingSystem(304), + cudaErrorInvalidResourceHandle(400), + cudaErrorIllegalState(401), + cudaErrorSymbolNotFound(500), + cudaErrorNotReady(600), + cudaErrorIllegalAddress(700), + cudaErrorLaunchOutOfResources(701), + cudaErrorLaunchTimeout(702), + cudaErrorLaunchIncompatibleTexturing(703), + cudaErrorPeerAccessAlreadyEnabled(704), + cudaErrorPeerAccessNotEnabled(705), + cudaErrorSetOnActiveProcess(708), + cudaErrorContextIsDestroyed(709), + cudaErrorAssert(710), + cudaErrorTooManyPeers(711), + cudaErrorHostMemoryAlreadyRegistered(712), + cudaErrorHostMemoryNotRegistered(713), + cudaErrorHardwareStackError(714), + cudaErrorIllegalInstruction(715), + cudaErrorMisalignedAddress(716), + cudaErrorInvalidAddressSpace(717), + cudaErrorInvalidPc(718), + cudaErrorLaunchFailure(719), + cudaErrorCooperativeLaunchTooLarge(720), + cudaErrorNotPermitted(800), + cudaErrorNotSupported(801), + cudaErrorSystemNotReady(802), + cudaErrorSystemDriverMismatch(803), + cudaErrorCompatNotSupportedOnDevice(804), + cudaErrorMpsConnectionFailed(805), + cudaErrorMpsRpcFailure(806), + cudaErrorMpsServerNotReady(807), + cudaErrorMpsMaxClientsReached(808), + cudaErrorMpsMaxConnectionsReached(809), + cudaErrorStreamCaptureUnsupported(900), + cudaErrorStreamCaptureInvalidated(901), + cudaErrorStreamCaptureMerge(902), + cudaErrorStreamCaptureUnmatched(903), + cudaErrorStreamCaptureUnjoined(904), + cudaErrorStreamCaptureIsolation(905), + cudaErrorStreamCaptureImplicit(906), + cudaErrorCapturedEvent(907), + cudaErrorStreamCaptureWrongThread(908), + cudaErrorTimeout(909), + cudaErrorGraphExecUpdateFailure(910), + cudaErrorExternalDevice(911), + cudaErrorUnknown(999), + cudaErrorApiFailureBase(10000); + + final int code; + + private static Map codeToError = new HashMap(){{ + put(cudaErrorInvalidValue.code, cudaErrorInvalidValue); + put(cudaErrorMemoryAllocation.code, cudaErrorMemoryAllocation); + put(cudaErrorInitializationError.code, cudaErrorInitializationError); + put(cudaErrorCudartUnloading.code, cudaErrorCudartUnloading); + put(cudaErrorProfilerDisabled.code, cudaErrorProfilerDisabled); + put(cudaErrorProfilerNotInitialized.code, cudaErrorProfilerNotInitialized); + put(cudaErrorProfilerAlreadyStarted.code, cudaErrorProfilerAlreadyStarted); + put(cudaErrorProfilerAlreadyStopped.code, cudaErrorProfilerAlreadyStopped); + put(cudaErrorInvalidConfiguration.code, cudaErrorInvalidConfiguration); + put(cudaErrorInvalidPitchValue.code, cudaErrorInvalidPitchValue); + put(cudaErrorInvalidSymbol.code, cudaErrorInvalidSymbol); + put(cudaErrorInvalidHostPointer.code, cudaErrorInvalidHostPointer); + put(cudaErrorInvalidDevicePointer.code, cudaErrorInvalidDevicePointer); + put(cudaErrorInvalidTexture.code, cudaErrorInvalidTexture); + put(cudaErrorInvalidTextureBinding.code, cudaErrorInvalidTextureBinding); + put(cudaErrorInvalidChannelDescriptor.code, cudaErrorInvalidChannelDescriptor); + put(cudaErrorInvalidMemcpyDirection.code, cudaErrorInvalidMemcpyDirection); + put(cudaErrorAddressOfConstant.code, cudaErrorAddressOfConstant); + put(cudaErrorTextureFetchFailed.code, cudaErrorTextureFetchFailed); + put(cudaErrorTextureNotBound.code, cudaErrorTextureNotBound); + put(cudaErrorSynchronizationError.code, cudaErrorSynchronizationError); + put(cudaErrorInvalidFilterSetting.code, cudaErrorInvalidFilterSetting); + put(cudaErrorInvalidNormSetting.code, cudaErrorInvalidNormSetting); + put(cudaErrorMixedDeviceExecution.code, cudaErrorMixedDeviceExecution); + put(cudaErrorNotYetImplemented.code, cudaErrorNotYetImplemented); + put(cudaErrorMemoryValueTooLarge.code, cudaErrorMemoryValueTooLarge); + put(cudaErrorStubLibrary.code, cudaErrorStubLibrary); + put(cudaErrorInsufficientDriver.code, cudaErrorInsufficientDriver); + put(cudaErrorCallRequiresNewerDriver.code, cudaErrorCallRequiresNewerDriver); + put(cudaErrorInvalidSurface.code, cudaErrorInvalidSurface); + put(cudaErrorDuplicateVariableName.code, cudaErrorDuplicateVariableName); + put(cudaErrorDuplicateTextureName.code, cudaErrorDuplicateTextureName); + put(cudaErrorDuplicateSurfaceName.code, cudaErrorDuplicateSurfaceName); + put(cudaErrorDevicesUnavailable.code, cudaErrorDevicesUnavailable); + put(cudaErrorIncompatibleDriverContext.code, cudaErrorIncompatibleDriverContext); + put(cudaErrorMissingConfiguration.code, cudaErrorMissingConfiguration); + put(cudaErrorPriorLaunchFailure.code, cudaErrorPriorLaunchFailure); + put(cudaErrorLaunchMaxDepthExceeded.code, cudaErrorLaunchMaxDepthExceeded); + put(cudaErrorLaunchFileScopedTex.code, cudaErrorLaunchFileScopedTex); + put(cudaErrorLaunchFileScopedSurf.code, cudaErrorLaunchFileScopedSurf); + put(cudaErrorSyncDepthExceeded.code, cudaErrorSyncDepthExceeded); + put(cudaErrorLaunchPendingCountExceeded.code, cudaErrorLaunchPendingCountExceeded); + put(cudaErrorInvalidDeviceFunction.code, cudaErrorInvalidDeviceFunction); + put(cudaErrorNoDevice.code, cudaErrorNoDevice); + put(cudaErrorInvalidDevice.code, cudaErrorInvalidDevice); + put(cudaErrorDeviceNotLicensed.code, cudaErrorDeviceNotLicensed); + put(cudaErrorSoftwareValidityNotEstablished.code, cudaErrorSoftwareValidityNotEstablished); + put(cudaErrorStartupFailure.code, cudaErrorStartupFailure); + put(cudaErrorInvalidKernelImage.code, cudaErrorInvalidKernelImage); + put(cudaErrorDeviceUninitialized.code, cudaErrorDeviceUninitialized); + put(cudaErrorMapBufferObjectFailed.code, cudaErrorMapBufferObjectFailed); + put(cudaErrorUnmapBufferObjectFailed.code, cudaErrorUnmapBufferObjectFailed); + put(cudaErrorArrayIsMapped.code, cudaErrorArrayIsMapped); + put(cudaErrorAlreadyMapped.code, cudaErrorAlreadyMapped); + put(cudaErrorNoKernelImageForDevice.code, cudaErrorNoKernelImageForDevice); + put(cudaErrorAlreadyAcquired.code, cudaErrorAlreadyAcquired); + put(cudaErrorNotMapped.code, cudaErrorNotMapped); + put(cudaErrorNotMappedAsArray.code, cudaErrorNotMappedAsArray); + put(cudaErrorNotMappedAsPointer.code, cudaErrorNotMappedAsPointer); + put(cudaErrorECCUncorrectable.code, cudaErrorECCUncorrectable); + put(cudaErrorUnsupportedLimit.code, cudaErrorUnsupportedLimit); + put(cudaErrorDeviceAlreadyInUse.code, cudaErrorDeviceAlreadyInUse); + put(cudaErrorPeerAccessUnsupported.code, cudaErrorPeerAccessUnsupported); + put(cudaErrorInvalidPtx.code, cudaErrorInvalidPtx); + put(cudaErrorInvalidGraphicsContext.code, cudaErrorInvalidGraphicsContext); + put(cudaErrorNvlinkUncorrectable.code, cudaErrorNvlinkUncorrectable); + put(cudaErrorJitCompilerNotFound.code, cudaErrorJitCompilerNotFound); + put(cudaErrorUnsupportedPtxVersion.code, cudaErrorUnsupportedPtxVersion); + put(cudaErrorJitCompilationDisabled.code, cudaErrorJitCompilationDisabled); + put(cudaErrorUnsupportedExecAffinity.code, cudaErrorUnsupportedExecAffinity); + put(cudaErrorInvalidSource.code, cudaErrorInvalidSource); + put(cudaErrorFileNotFound.code, cudaErrorFileNotFound); + put(cudaErrorSharedObjectSymbolNotFound.code, cudaErrorSharedObjectSymbolNotFound); + put(cudaErrorSharedObjectInitFailed.code, cudaErrorSharedObjectInitFailed); + put(cudaErrorOperatingSystem.code, cudaErrorOperatingSystem); + put(cudaErrorInvalidResourceHandle.code, cudaErrorInvalidResourceHandle); + put(cudaErrorIllegalState.code, cudaErrorIllegalState); + put(cudaErrorSymbolNotFound.code, cudaErrorSymbolNotFound); + put(cudaErrorNotReady.code, cudaErrorNotReady); + put(cudaErrorIllegalAddress.code, cudaErrorIllegalAddress); + put(cudaErrorLaunchOutOfResources.code, cudaErrorLaunchOutOfResources); + put(cudaErrorLaunchTimeout.code, cudaErrorLaunchTimeout); + put(cudaErrorLaunchIncompatibleTexturing.code, cudaErrorLaunchIncompatibleTexturing); + put(cudaErrorPeerAccessAlreadyEnabled.code, cudaErrorPeerAccessAlreadyEnabled); + put(cudaErrorPeerAccessNotEnabled.code, cudaErrorPeerAccessNotEnabled); + put(cudaErrorSetOnActiveProcess.code, cudaErrorSetOnActiveProcess); + put(cudaErrorContextIsDestroyed.code, cudaErrorContextIsDestroyed); + put(cudaErrorAssert.code, cudaErrorAssert); + put(cudaErrorTooManyPeers.code, cudaErrorTooManyPeers); + put(cudaErrorHostMemoryAlreadyRegistered.code, cudaErrorHostMemoryAlreadyRegistered); + put(cudaErrorHostMemoryNotRegistered.code, cudaErrorHostMemoryNotRegistered); + put(cudaErrorHardwareStackError.code, cudaErrorHardwareStackError); + put(cudaErrorIllegalInstruction.code, cudaErrorIllegalInstruction); + put(cudaErrorMisalignedAddress.code, cudaErrorMisalignedAddress); + put(cudaErrorInvalidAddressSpace.code, cudaErrorInvalidAddressSpace); + put(cudaErrorInvalidPc.code, cudaErrorInvalidPc); + put(cudaErrorLaunchFailure.code, cudaErrorLaunchFailure); + put(cudaErrorCooperativeLaunchTooLarge.code, cudaErrorCooperativeLaunchTooLarge); + put(cudaErrorNotPermitted.code, cudaErrorNotPermitted); + put(cudaErrorNotSupported.code, cudaErrorNotSupported); + put(cudaErrorSystemNotReady.code, cudaErrorSystemNotReady); + put(cudaErrorSystemDriverMismatch.code, cudaErrorSystemDriverMismatch); + put(cudaErrorCompatNotSupportedOnDevice.code, cudaErrorCompatNotSupportedOnDevice); + put(cudaErrorMpsConnectionFailed.code, cudaErrorMpsConnectionFailed); + put(cudaErrorMpsRpcFailure.code, cudaErrorMpsRpcFailure); + put(cudaErrorMpsServerNotReady.code, cudaErrorMpsServerNotReady); + put(cudaErrorMpsMaxClientsReached.code, cudaErrorMpsMaxClientsReached); + put(cudaErrorMpsMaxConnectionsReached.code, cudaErrorMpsMaxConnectionsReached); + put(cudaErrorStreamCaptureUnsupported.code, cudaErrorStreamCaptureUnsupported); + put(cudaErrorStreamCaptureInvalidated.code, cudaErrorStreamCaptureInvalidated); + put(cudaErrorStreamCaptureMerge.code, cudaErrorStreamCaptureMerge); + put(cudaErrorStreamCaptureUnmatched.code, cudaErrorStreamCaptureUnmatched); + put(cudaErrorStreamCaptureUnjoined.code, cudaErrorStreamCaptureUnjoined); + put(cudaErrorStreamCaptureIsolation.code, cudaErrorStreamCaptureIsolation); + put(cudaErrorStreamCaptureImplicit.code, cudaErrorStreamCaptureImplicit); + put(cudaErrorCapturedEvent.code, cudaErrorCapturedEvent); + put(cudaErrorStreamCaptureWrongThread.code, cudaErrorStreamCaptureWrongThread); + put(cudaErrorTimeout.code, cudaErrorTimeout); + put(cudaErrorGraphExecUpdateFailure.code, cudaErrorGraphExecUpdateFailure); + put(cudaErrorExternalDevice.code, cudaErrorExternalDevice); + put(cudaErrorUnknown.code, cudaErrorUnknown); + put(cudaErrorApiFailureBase.code, cudaErrorApiFailureBase); + }}; + + CudaError(int errorCode) { + this.code = errorCode; + } + + public static CudaError parseErrorCode(int errorCode) { + if (!codeToError.containsKey(errorCode)) { + return UnknownNativeError; + } + return codeToError.get(errorCode); + } + } } diff --git a/java/src/main/java/ai/rapids/cudf/CudaFatalException.java b/java/src/main/java/ai/rapids/cudf/CudaFatalException.java new file mode 100644 index 00000000000..cf36726aa80 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/CudaFatalException.java @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package ai.rapids.cudf; + +/** + * CudaFatalException is a kind of CudaException which leaves the process in an inconsistent state + * and any further CUDA work will return the same error. + * To continue using CUDA, the process must be terminated and relaunched. + */ +public class CudaFatalException extends CudaException { + CudaFatalException(String message, int errorCode) { + super(message, errorCode); + } + + CudaFatalException(String message, int errorCode, Throwable cause) { + super(message, errorCode, cause); + } +} diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index a45716a89b3..eca424132a5 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -30,6 +30,7 @@ namespace jni { constexpr jint MINIMUM_JNI_VERSION = JNI_VERSION_1_6; constexpr char const *CUDA_ERROR_CLASS = "ai/rapids/cudf/CudaException"; +constexpr char const *CUDA_FATAL_ERROR_CLASS = "ai/rapids/cudf/CudaFatalException"; constexpr char const *CUDF_ERROR_CLASS = "ai/rapids/cudf/CudfException"; constexpr char const *INDEX_OOB_CLASS = "java/lang/ArrayIndexOutOfBoundsException"; constexpr char const *ILLEGAL_ARG_CLASS = "java/lang/IllegalArgumentException"; @@ -737,12 +738,26 @@ class native_jstringArray { * @brief create a cuda exception from a given cudaError_t */ inline jthrowable cuda_exception(JNIEnv *const env, cudaError_t status, jthrowable cause = NULL) { - jclass ex_class = env->FindClass(cudf::jni::CUDA_ERROR_CLASS); + const char *ex_class_name; + + // Calls cudaGetLastError twice. It is nearly certain that a fatal error occurred if the second + // call doesn't return with cudaSuccess. + cudaGetLastError(); + auto const last = cudaGetLastError(); + // Call cudaDeviceSynchronize to ensure `last` did not result from an asynchronous error. + // between two calls. + if (status == last && last == cudaDeviceSynchronize()) { + ex_class_name = cudf::jni::CUDA_FATAL_ERROR_CLASS; + } else { + ex_class_name = cudf::jni::CUDA_ERROR_CLASS; + } + + jclass ex_class = env->FindClass(ex_class_name); if (ex_class == NULL) { return NULL; } jmethodID ctor_id = - env->GetMethodID(ex_class, "", "(Ljava/lang/String;Ljava/lang/Throwable;)V"); + env->GetMethodID(ex_class, "", "(Ljava/lang/String;ILjava/lang/Throwable;)V"); if (ctor_id == NULL) { return NULL; } @@ -752,19 +767,20 @@ inline jthrowable cuda_exception(JNIEnv *const env, cudaError_t status, jthrowab return NULL; } - jobject ret = env->NewObject(ex_class, ctor_id, msg, cause); + jint err_code = static_cast(status); + + jobject ret = env->NewObject(ex_class, ctor_id, msg, err_code, cause); return (jthrowable)ret; } inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { if (cudaSuccess != cuda_status) { - // Clear the last error so it does not propagate. - cudaGetLastError(); jthrowable jt = cuda_exception(env, cuda_status); if (jt != NULL) { env->Throw(jt); - throw jni_exception("CUDA ERROR"); } + throw jni_exception(std::string("CUDA ERROR: code ") + + std::to_string(static_cast(cuda_status))); } } @@ -790,18 +806,26 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { JNI_THROW_NEW(env, class_name, message, ret_val) \ } -#define JNI_CUDA_TRY(env, ret_val, call) \ +// Throw a new exception only if one is not pending then always return with the specified value +#define JNI_CHECK_CUDA_ERROR(env, class_name, e, ret_val) \ { \ - cudaError_t internal_cuda_status = (call); \ - if (cudaSuccess != internal_cuda_status) { \ - /* Clear the last error so it does not propagate.*/ \ - cudaGetLastError(); \ - jthrowable jt = cudf::jni::cuda_exception(env, internal_cuda_status); \ - if (jt != NULL) { \ - env->Throw(jt); \ - } \ + if (env->ExceptionOccurred()) { \ return ret_val; \ } \ + std::string n_msg = e.what() == nullptr ? "" : e.what(); \ + jstring j_msg = env->NewStringUTF(n_msg.c_str()); \ + jint e_code = static_cast(e.error_code()); \ + jclass ex_class = env->FindClass(class_name); \ + if (ex_class != NULL) { \ + jmethodID ctor_id = env->GetMethodID(ex_class, "", "(Ljava/lang/String;I)V"); \ + if (ctor_id != NULL) { \ + jobject cuda_error = env->NewObject(ex_class, ctor_id, j_msg, e_code); \ + if (cuda_error != NULL) { \ + env->Throw((jthrowable)cuda_error); \ + } \ + } \ + } \ + return ret_val; \ } #define JNI_NULL_CHECK(env, obj, error_msg, ret_val) \ @@ -831,6 +855,12 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { std::string("Could not allocate native memory: ") + (e.what() == nullptr ? "" : e.what()); \ JNI_CHECK_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val); \ } \ + catch (const cudf::fatal_cuda_error &e) { \ + JNI_CHECK_CUDA_ERROR(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, e, ret_val); \ + } \ + catch (const cudf::cuda_error &e) { \ + JNI_CHECK_CUDA_ERROR(env, cudf::jni::CUDA_ERROR_CLASS, e, ret_val); \ + } \ catch (const std::exception &e) { \ /* If jni_exception caught then a Java exception is pending and this will not overwrite it. */ \ JNI_CHECK_THROW_NEW(env, class_name, e.what(), ret_val); \ diff --git a/java/src/main/native/src/CudaJni.cpp b/java/src/main/native/src/CudaJni.cpp index 9862c3bface..926521c55f9 100644 --- a/java/src/main/native/src/CudaJni.cpp +++ b/java/src/main/native/src/CudaJni.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include "jni_utils.hpp" @@ -66,7 +67,7 @@ JNIEXPORT jobject JNICALL Java_ai_rapids_cudf_Cuda_memGetInfo(JNIEnv *env, jclas cudf::jni::auto_set_device(env); size_t free, total; - JNI_CUDA_TRY(env, NULL, cudaMemGetInfo(&free, &total)); + CUDF_CUDA_TRY(cudaMemGetInfo(&free, &total)); jclass info_class = env->FindClass("Lai/rapids/cudf/CudaMemInfo;"); if (info_class == NULL) { @@ -90,7 +91,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_hostAllocPinned(JNIEnv *env, jc try { cudf::jni::auto_set_device(env); void *ret = nullptr; - JNI_CUDA_TRY(env, 0, cudaMallocHost(&ret, size)); + CUDF_CUDA_TRY(cudaMallocHost(&ret, size)); return reinterpret_cast(ret); } CATCH_STD(env, 0); @@ -99,7 +100,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_hostAllocPinned(JNIEnv *env, jc JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_freePinned(JNIEnv *env, jclass, jlong ptr) { try { cudf::jni::auto_set_device(env); - JNI_CUDA_TRY(env, , cudaFreeHost(reinterpret_cast(ptr))); + CUDF_CUDA_TRY(cudaFreeHost(reinterpret_cast(ptr))); } CATCH_STD(env, ); } @@ -109,8 +110,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_memset(JNIEnv *env, jclass, jlon JNI_NULL_CHECK(env, dst, "dst memory pointer is null", ); try { cudf::jni::auto_set_device(env); - JNI_CUDA_TRY(env, , cudaMemsetAsync((void *)dst, value, count)); - JNI_CUDA_TRY(env, , cudaStreamSynchronize(0)); + CUDF_CUDA_TRY(cudaMemsetAsync((void *)dst, value, count)); + CUDF_CUDA_TRY(cudaStreamSynchronize(0)); } CATCH_STD(env, ); } @@ -120,7 +121,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_asyncMemset(JNIEnv *env, jclass, JNI_NULL_CHECK(env, dst, "dst memory pointer is null", ); try { cudf::jni::auto_set_device(env); - JNI_CUDA_TRY(env, , cudaMemsetAsync((void *)dst, value, count)); + CUDF_CUDA_TRY(cudaMemsetAsync((void *)dst, value, count)); } CATCH_STD(env, ); } @@ -129,7 +130,7 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getDevice(JNIEnv *env, jclass) { try { cudf::jni::auto_set_device(env); jint dev; - JNI_CUDA_TRY(env, -2, cudaGetDevice(&dev)); + CUDF_CUDA_TRY(cudaGetDevice(&dev)); return dev; } CATCH_STD(env, -2); @@ -139,7 +140,7 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getDeviceCount(JNIEnv *env, jcla try { cudf::jni::auto_set_device(env); jint count; - JNI_CUDA_TRY(env, -2, cudaGetDeviceCount(&count)); + CUDF_CUDA_TRY(cudaGetDeviceCount(&count)); return count; } CATCH_STD(env, -2); @@ -151,7 +152,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_setDevice(JNIEnv *env, jclass, j cudf::jni::throw_java_exception(env, cudf::jni::CUDF_ERROR_CLASS, "Cannot change device after RMM init"); } - JNI_CUDA_TRY(env, , cudaSetDevice(dev)); + CUDF_CUDA_TRY(cudaSetDevice(dev)); } CATCH_STD(env, ); } @@ -167,7 +168,7 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getDriverVersion(JNIEnv *env, jc try { cudf::jni::auto_set_device(env); jint driver_version; - JNI_CUDA_TRY(env, -2, cudaDriverGetVersion(&driver_version)); + CUDF_CUDA_TRY(cudaDriverGetVersion(&driver_version)); return driver_version; } CATCH_STD(env, -2); @@ -177,7 +178,7 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getRuntimeVersion(JNIEnv *env, j try { cudf::jni::auto_set_device(env); jint runtime_version; - JNI_CUDA_TRY(env, -2, cudaRuntimeGetVersion(&runtime_version)); + CUDF_CUDA_TRY(cudaRuntimeGetVersion(&runtime_version)); return runtime_version; } CATCH_STD(env, -2); @@ -187,9 +188,9 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getNativeComputeMode(JNIEnv *env try { cudf::jni::auto_set_device(env); int device; - JNI_CUDA_TRY(env, -2, cudaGetDevice(&device)); + CUDF_CUDA_TRY(cudaGetDevice(&device)); cudaDeviceProp device_prop; - JNI_CUDA_TRY(env, -2, cudaGetDeviceProperties(&device_prop, device)); + CUDF_CUDA_TRY(cudaGetDeviceProperties(&device_prop, device)); return device_prop.computeMode; } CATCH_STD(env, -2); @@ -199,10 +200,9 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getComputeCapabilityMajor(JNIEnv try { cudf::jni::auto_set_device(env); int device; - JNI_CUDA_TRY(env, -2, ::cudaGetDevice(&device)); + CUDF_CUDA_TRY(::cudaGetDevice(&device)); int attribute_value; - JNI_CUDA_TRY( - env, -2, + CUDF_CUDA_TRY( ::cudaDeviceGetAttribute(&attribute_value, ::cudaDevAttrComputeCapabilityMajor, device)); return attribute_value; } @@ -213,10 +213,9 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getComputeCapabilityMinor(JNIEnv try { cudf::jni::auto_set_device(env); int device; - JNI_CUDA_TRY(env, -2, ::cudaGetDevice(&device)); + CUDF_CUDA_TRY(::cudaGetDevice(&device)); int attribute_value; - JNI_CUDA_TRY( - env, -2, + CUDF_CUDA_TRY( ::cudaDeviceGetAttribute(&attribute_value, ::cudaDevAttrComputeCapabilityMinor, device)); return attribute_value; } @@ -226,7 +225,7 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getComputeCapabilityMinor(JNIEnv JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_freeZero(JNIEnv *env, jclass) { try { cudf::jni::auto_set_device(env); - JNI_CUDA_TRY(env, , cudaFree(0)); + CUDF_CUDA_TRY(cudaFree(0)); } CATCH_STD(env, ); } @@ -237,7 +236,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_createStream(JNIEnv *env, jclas cudf::jni::auto_set_device(env); cudaStream_t stream = nullptr; auto flags = isNonBlocking ? cudaStreamNonBlocking : cudaStreamDefault; - JNI_CUDA_TRY(env, 0, cudaStreamCreateWithFlags(&stream, flags)); + CUDF_CUDA_TRY(cudaStreamCreateWithFlags(&stream, flags)); return reinterpret_cast(stream); } CATCH_STD(env, 0); @@ -247,7 +246,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyStream(JNIEnv *env, jclas try { cudf::jni::auto_set_device(env); auto stream = reinterpret_cast(jstream); - JNI_CUDA_TRY(env, , cudaStreamDestroy(stream)); + CUDF_CUDA_TRY(cudaStreamDestroy(stream)); } CATCH_STD(env, ); } @@ -258,7 +257,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamWaitEvent(JNIEnv *env, jcl cudf::jni::auto_set_device(env); auto stream = reinterpret_cast(jstream); auto event = reinterpret_cast(jevent); - JNI_CUDA_TRY(env, , cudaStreamWaitEvent(stream, event, 0)); + CUDF_CUDA_TRY(cudaStreamWaitEvent(stream, event, 0)); } CATCH_STD(env, ); } @@ -268,7 +267,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamSynchronize(JNIEnv *env, j try { cudf::jni::auto_set_device(env); auto stream = reinterpret_cast(jstream); - JNI_CUDA_TRY(env, , cudaStreamSynchronize(stream)); + CUDF_CUDA_TRY(cudaStreamSynchronize(stream)); } CATCH_STD(env, ); } @@ -286,7 +285,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_createEvent(JNIEnv *env, jclass if (blockingSync) { flags = flags | cudaEventBlockingSync; } - JNI_CUDA_TRY(env, 0, cudaEventCreateWithFlags(&event, flags)); + CUDF_CUDA_TRY(cudaEventCreateWithFlags(&event, flags)); return reinterpret_cast(event); } CATCH_STD(env, 0); @@ -296,7 +295,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyEvent(JNIEnv *env, jclass try { cudf::jni::auto_set_device(env); auto event = reinterpret_cast(jevent); - JNI_CUDA_TRY(env, , cudaEventDestroy(event)); + CUDF_CUDA_TRY(cudaEventDestroy(event)); } CATCH_STD(env, ); } @@ -311,7 +310,7 @@ JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_Cuda_eventQuery(JNIEnv *env, jcla } else if (result == cudaErrorNotReady) { return false; } // else - JNI_CUDA_TRY(env, false, result); + CUDF_CUDA_TRY(result); } CATCH_STD(env, false); return false; @@ -323,7 +322,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventRecord(JNIEnv *env, jclass, cudf::jni::auto_set_device(env); auto event = reinterpret_cast(jevent); auto stream = reinterpret_cast(jstream); - JNI_CUDA_TRY(env, , cudaEventRecord(event, stream)); + CUDF_CUDA_TRY(cudaEventRecord(event, stream)); } CATCH_STD(env, ); } @@ -333,7 +332,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventSynchronize(JNIEnv *env, jc try { cudf::jni::auto_set_device(env); auto event = reinterpret_cast(jevent); - JNI_CUDA_TRY(env, , cudaEventSynchronize(event)); + CUDF_CUDA_TRY(cudaEventSynchronize(event)); } CATCH_STD(env, ); } @@ -352,8 +351,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_memcpyOnStream(JNIEnv *env, jcla auto src = reinterpret_cast(jsrc); auto kind = static_cast(jkind); auto stream = reinterpret_cast(jstream); - JNI_CUDA_TRY(env, , cudaMemcpyAsync(dst, src, count, kind, stream)); - JNI_CUDA_TRY(env, , cudaStreamSynchronize(stream)); + CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, count, kind, stream)); + CUDF_CUDA_TRY(cudaStreamSynchronize(stream)); } CATCH_STD(env, ); } @@ -372,7 +371,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_asyncMemcpyOnStream(JNIEnv *env, auto src = reinterpret_cast(jsrc); auto kind = static_cast(jkind); auto stream = reinterpret_cast(jstream); - JNI_CUDA_TRY(env, , cudaMemcpyAsync(dst, src, count, kind, stream)); + CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, count, kind, stream)); } CATCH_STD(env, ); } diff --git a/java/src/test/java/ai/rapids/cudf/CudaTest.java b/java/src/test/java/ai/rapids/cudf/CudaTest.java index 8905c2edd56..1a86dbb374d 100644 --- a/java/src/test/java/ai/rapids/cudf/CudaTest.java +++ b/java/src/test/java/ai/rapids/cudf/CudaTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,7 +18,7 @@ import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.*; public class CudaTest { @@ -32,4 +32,17 @@ public void testGetCudaRuntimeInfo() { assertEquals(Cuda.getNativeComputeMode(), Cuda.getComputeMode().nativeId); } + @Test + public void testCudaException() { + assertThrows(CudaException.class, () -> { + try { + Cuda.memset(Long.MAX_VALUE, (byte) 0, 1024); + } catch (CudaFatalException ignored) { + } catch (CudaException ex) { + assertEquals(CudaException.CudaError.cudaErrorInvalidValue, ex.cudaError); + throw ex; + } + } + ); + } }