Skip to content

Commit

Permalink
Merge branch 'amd-develop' into amd-master
Browse files Browse the repository at this point in the history
Change-Id: I6b59c6f0d187685344e0444f769e19454a2c6ef0
  • Loading branch information
mangupta committed Oct 20, 2016
2 parents 7e8bfba + cd6eb7a commit 6f499ad
Show file tree
Hide file tree
Showing 45 changed files with 858 additions and 157 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
*.Po
hip-amdinternal
HIP-Examples
lib
packages

bin/hipInfo
bin/hipBusBandwidth
bin/hipDispatchLatency
bin/hipify-clang

samples/1_Utils/hipInfo/hipInfo
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ if(HIP_PLATFORM STREQUAL "hcc")
message(FATAL_ERROR "Don't know where to find HCC. Please specify abolute path using -DHCC_HOME")
endif()
add_to_config(_buildInfo HCC_VERSION)
string(REPLACE "-" ";" HCC_VERSION_LIST ${HCC_VERSION})
list(GET HCC_VERSION_LIST 0 HCC_PACKAGE_VERSION)

# Determine HSA_PATH
if(NOT DEFINED HSA_PATH)
Expand Down
12 changes: 12 additions & 0 deletions bin/hipcc
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,18 @@ if ($HIP_PLATFORM eq "hcc") {
$HIPCC=$HCC;
$HIPCXXFLAGS = $HCCFLAGS;

#### GCC system includes workaround ####
$WA = $ENV{'HIP_HCC_SYS_INCLUDES_WA'} // 0;
if (${WA}) {
my $GCC_CUR_VER = `gcc -dumpversion`;
my $GPP_CUR_VER = `g++ -dumpversion`;
$GCC_CUR_VER =~ s/\R//g;
$GPP_CUR_VER =~ s/\R//g;
if (${GCC_CUR_VER} eq ${GPP_CUR_VER}) {
$HIPCXXFLAGS .= "-I/usr/include/x86_64-linux-gnu -I/usr/include/x86_64-linux-gnu/c++/${GCC_CUR_VER} -I/usr/include/c++/${GCC_CUR_VER}";
}
}

$HIPCXXFLAGS .= " -I$HIP_PATH/include/hip/hcc_detail/cuda";
$HIPCXXFLAGS .= " -I$HSA_PATH/include";
$HIPCXXFLAGS .= " -Wno-deprecated-register";
Expand Down
11 changes: 6 additions & 5 deletions docs/markdown/hip_kernel_language.md
Original file line number Diff line number Diff line change
Expand Up @@ -231,10 +231,10 @@ typedef struct dim3 {
```

## Memory-Fence Instructions
HIP support for __threadfence(), __threadfence_block() and __threadfence_system() is under development.
The stubs for the threadfence routines are defined in hcc_details/hip_runtime.h.
Applications that use these threadfence features should disable both of the L1 and L2 caches by:
"export HSA_DISABLE_CACHE=1"
HIP supports __threadfence() and __threadfence_block().

Applications that use threadfence_system can disable the L1 and L2 caches on the GPU by:
"export HSA_DISABLE_CACHE=1". See the hip_porting_guide.md#threadfence_system for more information.

## Synchronization Functions
The __syncthreads() built-in function is supported in HIP. The __syncthreads_count(int), __syncthreads_and(int) and __syncthreads_or(int) functions are under development.
Expand Down Expand Up @@ -602,7 +602,8 @@ The printf function is under development.

## Device-Side Dynamic Global Memory Allocation

Device-side dynamic global memory allocation is not supported.
Device-side dynamic global memory allocation is under development. HIP now includes a preliminary
implementation of malloc and free that can be called from device functions.

## `__launch_bounds__`
GPU multiprocessors have a fixed pool of resources (primarily registers and shared memory) that are shared among the active warps. Using more resources can increase the kernel’s IPC, but it reduces the resources available for other warps and limits the number of warps that can run simultaneously. Thus, GPUs exhibit a complex relationship between resource usage and performance. `__launch_bounds__` allows the application to provide usage hints that influence the resources (primarily registers) employed by the generated code. It’s a function attribute that must be attached to a `__global__` function:
Expand Down
31 changes: 24 additions & 7 deletions include/hip/hcc_detail/hip_runtime_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,12 @@ enum hipLimit_t
#define hipHostRegisterIoMemory 0x4 ///< Not supported.


#define hipDeviceScheduleAuto 0x0
#define hipDeviceScheduleSpin 0x1
#define hipDeviceScheduleYield 0x2
#define hipDeviceBlockingSync 0x4
#define hipDeviceScheduleAuto 0x0 ///< Automatically select between Spin and Yield
#define hipDeviceScheduleSpin 0x1 ///< Dedicate a CPU core to spin-wait. Provides lowest latency, but burns a CPU core and may consume more power.
#define hipDeviceScheduleYield 0x2 ///< Yield the CPU to the operating system when waiting. May increase latency, but lowers power and is friendlier to other threads in the system.
#define hipDeviceScheduleBlockingSync 0x4
#define hipDeviceScheduleMask 0x7

#define hipDeviceMapHost 0x8
#define hipDeviceLmemResizeToMax 0x16

Expand Down Expand Up @@ -383,9 +385,18 @@ hipError_t hipDeviceSetSharedMemConfig ( hipSharedMemConfig config );
*
* @param [in] flags
*
* The schedule flags impact how HIP waits for the completion of a command running on a device.
* hipDeviceScheduleSpin : HIP runtime will actively spin in the thread which submitted the work until the command completes. This offers the lowest latency, but will consume a CPU core and may increase power.
* hipDeviceScheduleYield : The HIP runtime will yield the CPU to system so that other tasks can use it. This may increase latency to detect the completion but will consume less power and is friendlier to other tasks in the system.
* hipDeviceScheduleBlockingSync : On ROCm platform, this is a synonym for hipDeviceScheduleYield.
* hipDeviceScheduleAuto : Use a hueristic to select between Spin and Yield modes. If the number of HIP contexts is greater than the number of logical processors in the system, use Spin scheduling. Else use Yield scheduling.
*
*
* hipDeviceMapHost : Allow mapping host memory. On ROCM, this is always allowed and the flag is ignored.
* hipDeviceLmemResizeToMax : @warning ROCm silently ignores this flag.
*
* @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorSetOnActiveProcess
*
* Note: Only hipDeviceScheduleAuto and hipDeviceMapHost are supported
*
*/
hipError_t hipSetDeviceFlags ( unsigned flags);
Expand Down Expand Up @@ -626,8 +637,12 @@ hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int *flags);
*
* @param[in,out] event Returns the newly created event.
* @param[in] flags Flags to control event behavior. Valid values are #hipEventDefault, #hipEventBlockingSync, #hipEventDisableTiming, #hipEventInterprocess
*
* @warning On HCC platform, flags must be #hipEventDefault.
* #hipEventDefault : Default flag. The event will use active synchronization and will support timing. Blocking synchronization provides lowest possible latency at the expense of dedicating a CPU to poll on the eevent.
* #hipEventBlockingSync : The event will use blocking synchronization : if hipEventSynchronize is called on this event, the thread will block until the event completes. This can increase latency for the synchroniation but can result in lower power and more resources for other CPU threads.
* #hipEventDisableTiming : Disable recording of timing information. On ROCM platform, timing information is always recorded and this flag has no performance benefit.
* @warning On HCC platform, hipEventInterprocess support is under development. Use of this flag will return an error.
*
* @returns #hipSuccess, #hipErrorInitializationError, #hipErrorInvalidValue, #hipErrorLaunchFailure, #hipErrorMemoryAllocation
*
Expand Down Expand Up @@ -688,6 +703,8 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream);
* the function will return immediately and the completion_future resources will be released later, when the hipDevice is synchronized.
*
* @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventSynchronize, hipEventRecord, hipEventElapsedTime
*
* @returns #hipSuccess
*/
hipError_t hipEventDestroy(hipEvent_t event);

Expand Down
8 changes: 4 additions & 4 deletions packaging/hip_hcc.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,19 +28,19 @@ set(CPACK_GENERATOR "TGZ;DEB;RPM")
set(CPACK_BINARY_DEB "ON")
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
if(@COMPILE_HIP_ATP_MARKER@)
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_base (= ${CPACK_PACKAGE_VERSION}), hcc_lc (= @HCC_VERSION@), rocm-profiler")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_base (= ${CPACK_PACKAGE_VERSION}), hcc_lc (= @HCC_PACKAGE_VERSION@), rocm-profiler")
else()
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_base (= ${CPACK_PACKAGE_VERSION}), hcc_lc (= @HCC_VERSION@)")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_base (= ${CPACK_PACKAGE_VERSION}), hcc_lc (= @HCC_PACKAGE_VERSION@)")
endif()
set(CPACK_BINARY_RPM "ON")
set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64")
set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
if(@COMPILE_HIP_ATP_MARKER@)
set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, hcc_lc = @HCC_VERSION@, rocm-profiler")
set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, hcc_lc = @HCC_PACKAGE_VERSION@, rocm-profiler")
else()
set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, hcc_lc = @HCC_VERSION@")
set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, hcc_lc = @HCC_PACKAGE_VERSION@")
endif()
set(CPACK_SOURCE_GENERATOR "TGZ")
include(CPack)
2 changes: 1 addition & 1 deletion samples/2_Cookbook/0_MatrixTranspose/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ TARGET=hcc
SOURCES = MatrixTranspose.cpp
OBJECTS = $(SOURCES:.cpp=.o)

EXECUTABLE=./exe
EXECUTABLE=./MatrixTranspose

.PHONY: test

Expand Down
28 changes: 13 additions & 15 deletions samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,40 +27,38 @@ THE SOFTWARE.


#define WIDTH 1024
#define HEIGHT 1024

#define NUM (WIDTH*HEIGHT)

#define THREADS_PER_BLOCK_X 16
#define THREADS_PER_BLOCK_Y 16
#define NUM (WIDTH*WIDTH)

#define THREADS_PER_BLOCK_X 4
#define THREADS_PER_BLOCK_Y 4
#define THREADS_PER_BLOCK_Z 1

// Device (Kernel) function, it must be void
// hipLaunchParm provides the execution configuration
__global__ void matrixTranspose(hipLaunchParm lp,
float *out,
float *in,
const int width,
const int height)
const int width)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

out[y * width + x] = in[x * height + y];
out[y * width + x] = in[x * width + y];
}

// CPU implementation of matrix transpose
void matrixTransposeCPUReference(
float * output,
float * input,
const unsigned int width,
const unsigned int height)
const unsigned int width)
{
for(unsigned int j=0; j < height; j++)
for(unsigned int j=0; j < width; j++)
{
for(unsigned int i=0; i < width; i++)
{
output[i*height + j] = input[j*width + i];
output[i*width + j] = input[j*width + i];
}
}
}
Expand Down Expand Up @@ -100,22 +98,22 @@ int main() {

// Lauching kernel from host
hipLaunchKernel(matrixTranspose,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, 0,
gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT);
gpuTransposeMatrix , gpuMatrix, WIDTH);

// Memory transfer from device to host
hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);

// CPU MatrixTranspose computation
matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH, HEIGHT);
matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);

// verify the results
errors = 0;
double eps = 1.0E-6;
for (i = 0; i < NUM; i++) {
if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > 0 ) {
if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
errors++;
}
}
Expand Down
2 changes: 1 addition & 1 deletion samples/2_Cookbook/1_hipEvent/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ TARGET=hcc
SOURCES = hipEvent.cpp
OBJECTS = $(SOURCES:.cpp=.o)

EXECUTABLE=./exe
EXECUTABLE=./hipEvent

.PHONY: test

Expand Down
27 changes: 12 additions & 15 deletions samples/2_Cookbook/1_hipEvent/hipEvent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,40 +26,37 @@ THE SOFTWARE.
#include "hip/hip_runtime.h"

#define WIDTH 1024
#define HEIGHT 1024

#define NUM (WIDTH*HEIGHT)
#define NUM (WIDTH*WIDTH)

#define THREADS_PER_BLOCK_X 16
#define THREADS_PER_BLOCK_Y 16
#define THREADS_PER_BLOCK_X 4
#define THREADS_PER_BLOCK_Y 4
#define THREADS_PER_BLOCK_Z 1

// Device (Kernel) function, it must be void
// hipLaunchParm provides the execution configuration
__global__ void matrixTranspose(hipLaunchParm lp,
float *out,
float *in,
const int width,
const int height)
const int width)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

out[y * width + x] = in[x * height + y];
out[y * width + x] = in[x * width + y];
}

// CPU implementation of matrix transpose
void matrixTransposeCPUReference(
float * output,
float * input,
const unsigned int width,
const unsigned int height)
const unsigned int width)
{
for(unsigned int j=0; j < height; j++)
for(unsigned int j=0; j < width; j++)
{
for(unsigned int i=0; i < width; i++)
{
output[i*height + j] = input[j*width + i];
output[i*width + j] = input[j*width + i];
}
}
}
Expand Down Expand Up @@ -118,10 +115,10 @@ int main() {

// Lauching kernel from host
hipLaunchKernel(matrixTranspose,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, 0,
gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT);
gpuTransposeMatrix , gpuMatrix, WIDTH);

// Record the stop event
hipEventRecord(stop, NULL);
Expand All @@ -146,13 +143,13 @@ int main() {
printf ("hipMemcpyDeviceToHost time taken = %6.3fms\n", eventMs);

// CPU MatrixTranspose computation
matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH, HEIGHT);
matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);

// verify the results
errors = 0;
double eps = 1.0E-6;
for (i = 0; i < NUM; i++) {
if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > 0 ) {
if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
errors++;
}
}
Expand Down
2 changes: 1 addition & 1 deletion samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ TARGET=hcc
SOURCES = MatrixTranspose.cpp
OBJECTS = $(SOURCES:.cpp=.o)

EXECUTABLE=./exe
EXECUTABLE=./MatrixTranspose

.PHONY: test

Expand Down
Loading

0 comments on commit 6f499ad

Please sign in to comment.