Skip to content

Commit

Permalink
[BE][MPS] Apply clang-format to mps headers (pytorch#140906)
Browse files Browse the repository at this point in the history
It was a mistake to miss them in the past

All changes in this PR, except the ones to .lintrunner.toml, were generated by running
`lintrunner -a --take CLANGFORMAT --all-files`
Pull Request resolved: pytorch#140906
Approved by: https://github.com/Skylion007
  • Loading branch information
malfet authored and youssef62 committed Nov 23, 2024
1 parent 20e9221 commit 5d94fc9
Show file tree
Hide file tree
Showing 24 changed files with 932 additions and 823 deletions.
2 changes: 2 additions & 0 deletions .lintrunner.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,12 @@ code = 'CLANGFORMAT'
include_patterns = [
'aten/src/ATen/*.h',
'aten/src/ATen/mps/**/*.mm',
'aten/src/ATen/mps/**/*.h',
'aten/src/ATen/xpu/**/*.h',
'aten/src/ATen/xpu/**/*.cpp',
'aten/src/ATen/native/mps/**/*.metal',
'aten/src/ATen/native/mps/**/*.mm',
'aten/src/ATen/native/mps/**/*.h',
'aten/src/ATen/native/vulkan/**/*.h',
'aten/src/ATen/native/vulkan/**/*.cpp',
'aten/src/ATen/native/cuda/MultiTensorApply.cuh',
Expand Down
5 changes: 2 additions & 3 deletions aten/src/ATen/mps/EmptyTensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@ C10_EXPORT TensorBase empty_mps(
std::optional<Device> device_opt,
std::optional<bool> pin_memory_opt,
std::optional<c10::MemoryFormat> memory_format_opt);
C10_EXPORT TensorBase empty_mps(
IntArrayRef size, const TensorOptions &options);
C10_EXPORT TensorBase empty_mps(IntArrayRef size, const TensorOptions& options);

C10_EXPORT TensorBase empty_strided_mps(
IntArrayRef size,
Expand All @@ -24,6 +23,6 @@ C10_EXPORT TensorBase empty_strided_mps(
C10_EXPORT TensorBase empty_strided_mps(
IntArrayRef size,
IntArrayRef stride,
const TensorOptions &options);
const TensorOptions& options);

} // namespace at::detail
4 changes: 2 additions & 2 deletions aten/src/ATen/mps/IndexKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

namespace at::mps {

static const char *SCATTER_OPS_TEMPLATE = R"METAL_SCATTER(
static const char* SCATTER_OPS_TEMPLATE = R"METAL_SCATTER(
struct __attribute__ ((packed)) packed_uint5{{
uint32_t x; uint32_t y; uint32_t z; uint32_t w; uint32_t u;
}};
Expand Down Expand Up @@ -120,7 +120,7 @@ kernel void scatter_kernel_1(uint linear_index [[thread_position_in
}}
)METAL_SCATTER";

static const char *GATHER_OPS_TEMPLATE = R"METAL_GATHER(
static const char* GATHER_OPS_TEMPLATE = R"METAL_GATHER(
struct __attribute__ ((packed)) packed_uint5{{
uint32_t x; uint32_t y; uint32_t z; uint32_t w; uint32_t u;
}};
Expand Down
157 changes: 96 additions & 61 deletions aten/src/ATen/mps/MPSAllocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,45 +6,47 @@
#include <ATen/mps/MPSEvent.h>
#include <ATen/mps/MPSStream.h>

#include <c10/util/flat_hash_map.h>
#include <mach/vm_page_size.h>
#include <cstdio>
#include <mutex>
#include <set>
#include <unordered_set>
#include <mach/vm_page_size.h>
#include <c10/util/flat_hash_map.h>

// this implementation is based on CUDACachingAllocator.
// It utilizes Metal Heaps to improve the performance with buffer allocation.
// Do not include this header. Use MPSAllocatorInterface.h instead.
// TODO: Unify the logic with CUDACachingAllocator and remove redundant code.
namespace at::mps::HeapAllocator {

static const size_t kMaxSmallAlloc = MB(1); // largest "small" allocation is 1 MiB
static const size_t kMinLargeAlloc = MB(10); // allocations between 1 and 10 MiB may use kLargeHeap
static const size_t kRoundLarge = MB(2); // round up large allocations to 2 MiB
static const size_t kSmallHeap = MB(8); // "small" allocations are packed in 8 MiB heaps
static const size_t kLargeHeap = MB(32); // "large" allocations may be packed in 32 MiB heaps
static const size_t kXLargeHeapD = MB(128); // "extra large" allocations on Discrete devices may be packed in 128 MiB heaps
static const size_t kXLargeHeapU = MB(1024); // "extra large" allocations on Unified devices may be packed in 1 GiB heaps
static const size_t kMaxSmallAlloc = MB(1); // largest "small" allocation is 1 MiB
static const size_t kMinLargeAlloc = MB(10); // allocations between 1 and 10 MiB may use kLargeHeap
static const size_t kRoundLarge = MB(2); // round up large allocations to 2 MiB
static const size_t kSmallHeap = MB(8); // "small" allocations are packed in 8 MiB heaps
static const size_t kLargeHeap = MB(32); // "large" allocations may be packed in 32 MiB heaps
static const size_t kXLargeHeapD =
MB(128); // "extra large" allocations on Discrete devices may be packed in 128 MiB heaps
static const size_t kXLargeHeapU =
MB(1024); // "extra large" allocations on Unified devices may be packed in 1 GiB heaps
static const size_t kMaxScalarAlloc = (sizeof(int64_t)); // largest "scalar" allocation

// buffer pools could be customized with a combination of usage flags
enum UsageFlags : uint32_t {
PRIVATE = 0,
SMALL = (1 << 0), // small heaps have sizes of kSmallHeap, and large ones kLargeHeap
SHARED = (1 << 1), // shared pools allocated on devices with unified memory; otherwise, private between host/device
SMALL = (1 << 0), // small heaps have sizes of kSmallHeap, and large ones kLargeHeap
SHARED = (1 << 1), // shared pools allocated on devices with unified memory; otherwise, private between host/device
MANAGED = (1 << 2), // managed storage mode
HAZARD = (1 << 3), // enables Automatic Hazard Tracking for the resources allocated on the pool
SCALAR = (1 << 4), // used to import CPU scalar values to GPU and use them in MPS Stream
HAZARD = (1 << 3), // enables Automatic Hazard Tracking for the resources allocated on the pool
SCALAR = (1 << 4), // used to import CPU scalar values to GPU and use them in MPS Stream
};
// debug verbosity flags
enum DebugVerbosity : uint32_t {
SILENT = 0,
PROFILING = (1 << 0), // print generic profiling data for total system memory usage
SILENT = 0,
PROFILING = (1 << 0), // print generic profiling data for total system memory usage
ALLOCATIONS = (1 << 1), // print buffer allocations
RECYCLES = (1 << 2), // print buffer recycling
RELEASES = (1 << 3), // print buffer releases
LARGE_ONLY = (1 << 4), // only log large buffer pool transactions
RECYCLES = (1 << 2), // print buffer recycling
RELEASES = (1 << 3), // print buffer releases
LARGE_ONLY = (1 << 4), // only log large buffer pool transactions
};

struct HeapBlock;
Expand All @@ -67,10 +69,8 @@ struct BufferBlock {
// Metal events used to sync GPU/CPU operations on the shared-storage buffers
MPSEventPtr event;

BufferBlock(size_t Size, size_t RequestedSize = 0, const id<MTLBuffer> Buffer = nullptr,
HeapBlock* Heap = nullptr) :
buffer(Buffer), size(Size), requested_size(RequestedSize),
heap(Heap), buf_id(Buffer ? ++buffer_counter : 0) { }
BufferBlock(size_t Size, size_t RequestedSize = 0, const id<MTLBuffer> Buffer = nullptr, HeapBlock* Heap = nullptr)
: buffer(Buffer), size(Size), requested_size(RequestedSize), heap(Heap), buf_id(Buffer ? ++buffer_counter : 0) {}

static bool Comparator(const BufferBlock* a, const BufferBlock* b) {
return (a->size != b->size) ? a->size < b->size : (uintptr_t)a->buffer < (uintptr_t)b->buffer;
Expand All @@ -79,15 +79,19 @@ struct BufferBlock {
assert(((Alignment - 1) & Alignment) == 0);
return ((Size + Alignment - 1) & ~(Alignment - 1));
}
uint32_t retainCount() const { return [buffer retainCount]; }
uint32_t retainCount() const {
return [buffer retainCount];
}
};
typedef bool (*BufferComparison)(const BufferBlock*, const BufferBlock*);

struct BufferPool;
struct AllocParams {
AllocParams(size_t Alloc_Size, size_t Requested_Size, BufferPool* Pool) :
search_key(Alloc_Size), pool(Pool), requested_size(Requested_Size) { }
size_t size() const { return search_key.size; }
AllocParams(size_t Alloc_Size, size_t Requested_Size, BufferPool* Pool)
: search_key(Alloc_Size), pool(Pool), requested_size(Requested_Size) {}
size_t size() const {
return search_key.size;
}

BufferBlock search_key;
BufferPool* pool;
Expand All @@ -102,7 +106,9 @@ struct AllocParams {

struct HeapBlock {
id<MTLHeap> heap;
struct { size_t total, available; } size;
struct {
size_t total, available;
} size;
BufferPool* pool;
unsigned int n_buffers = 0;
id_t heap_id;
Expand All @@ -111,9 +117,12 @@ struct HeapBlock {
// counter to assign unique ids to heap blocks
static uint64_t heap_counter;

HeapBlock(size_t Size, const id<MTLHeap> Heap = nullptr, BufferPool *Pool = nullptr) :
heap(Heap), size({.total = Size, .available = Size}), pool(Pool),
heap_id(Heap ? ++heap_counter : 0), is_split(true) { }
HeapBlock(size_t Size, const id<MTLHeap> Heap = nullptr, BufferPool* Pool = nullptr)
: heap(Heap),
size({.total = Size, .available = Size}),
pool(Pool),
heap_id(Heap ? ++heap_counter : 0),
is_split(true) {}

static MTLResourceOptions getOptions(uint32_t usage) {
// TODO: check the caching performance of write-combined mode
Expand All @@ -126,16 +135,17 @@ struct HeapBlock {
else
options |= MTLResourceStorageModePrivate;

options |= (usage & UsageFlags::HAZARD) ? MTLResourceHazardTrackingModeTracked : MTLResourceHazardTrackingModeUntracked;
options |=
(usage & UsageFlags::HAZARD) ? MTLResourceHazardTrackingModeTracked : MTLResourceHazardTrackingModeUntracked;

return options;
}

static HeapBlock* createHeapBlock(AllocParams& params, id<MTLDevice> device, uint32_t usage) {
HeapBlock *heapBlock = nullptr;
HeapBlock* heapBlock = nullptr;
bool is_split = true;
const size_t size = params.size();
MTLHeapDescriptor *d = [MTLHeapDescriptor new];
MTLHeapDescriptor* d = [MTLHeapDescriptor new];
if (d) {
const size_t kXLargeHeap = params.has_unified_memory ? kXLargeHeapU : kXLargeHeapD;
if (size <= kMaxSmallAlloc) {
Expand All @@ -152,10 +162,11 @@ struct HeapBlock {
d.cpuCacheMode = MTLCPUCacheModeDefaultCache;
// this automatically handles Metal buffer access synchronizations at the
// cost of slightly lower performance.
d.hazardTrackingMode = (usage & UsageFlags::HAZARD) ? MTLHazardTrackingModeTracked : MTLHazardTrackingModeUntracked;
d.hazardTrackingMode =
(usage & UsageFlags::HAZARD) ? MTLHazardTrackingModeTracked : MTLHazardTrackingModeUntracked;
d.resourceOptions = getOptions(usage);
d.type = MTLHeapTypeAutomatic;
id<MTLHeap> heap = [device newHeapWithDescriptor: d];
id<MTLHeap> heap = [device newHeapWithDescriptor:d];
if (heap) {
[heap setPurgeableState:MTLPurgeableStateNonVolatile];
const size_t heap_size = heapAvailableSize(heap);
Expand All @@ -169,8 +180,8 @@ struct HeapBlock {
return heapBlock;
}
static bool Comparator(const HeapBlock* a, const HeapBlock* b) {
return (a->size.available != b->size.available) ? a->size.available < b->size.available :
(uintptr_t)a->heap < (uintptr_t)b->heap;
return (a->size.available != b->size.available) ? a->size.available < b->size.available
: (uintptr_t)a->heap < (uintptr_t)b->heap;
}
static NSUInteger heapAvailableSize(id<MTLHeap> heap, size_t Alignment = vm_page_size) {
return [heap maxAvailableSizeWithAlignment:Alignment];
Expand Down Expand Up @@ -205,8 +216,12 @@ struct HeapBlock {
size.available = 0;
return retainCount;
}
uint32_t retainCount() const { return [heap retainCount]; }
void updateAvailableSize() { size.available = heapAvailableSize(heap); }
uint32_t retainCount() const {
return [heap retainCount];
}
void updateAvailableSize() {
size.available = heapAvailableSize(heap);
}
};
typedef bool (*HeapComparison)(const HeapBlock*, const HeapBlock*);

Expand All @@ -219,9 +234,8 @@ struct BufferPool {
SCALAR,
};

BufferPool(const id<MTLDevice> Device, uint32_t Usage) :
device(Device), usage(Usage),
heaps(HeapBlock::Comparator), available_buffers(BufferBlock::Comparator) { }
BufferPool(const id<MTLDevice> Device, uint32_t Usage)
: device(Device), usage(Usage), heaps(HeapBlock::Comparator), available_buffers(BufferBlock::Comparator) {}

const id<MTLDevice> device;
// usage flags to customize the pool for various purposes (see UsageFlags enum)
Expand All @@ -248,12 +262,12 @@ struct BufferPool {
};

class MPSHeapAllocatorImpl {
public:
explicit MPSHeapAllocatorImpl() :
m_device(at::mps::MPSDevice::getInstance()->device()),
m_max_buffer_size([m_device maxBufferLength]),
m_stream(getDefaultMPSStream()),
m_event_pool(getMPSEventPool()) {
public:
explicit MPSHeapAllocatorImpl()
: m_device(at::mps::MPSDevice::getInstance()->device()),
m_max_buffer_size([m_device maxBufferLength]),
m_stream(getDefaultMPSStream()),
m_event_pool(getMPSEventPool()) {
init_allocator();
}
~MPSHeapAllocatorImpl() {
Expand Down Expand Up @@ -298,34 +312,50 @@ class MPSHeapAllocatorImpl {
// (see m_high_watermark_ratio for description)
void setHighWatermarkRatio(double ratio);
// (see m_low_watermark_limit for description)
size_t getLowWatermarkLimit() const { return m_low_watermark_limit; }
size_t getLowWatermarkLimit() const {
return m_low_watermark_limit;
}
// (see m_max_total_allowed_size for description)
size_t getHighWatermarkLimit() const { return m_max_total_allowed_size; }
size_t getHighWatermarkLimit() const {
return m_max_total_allowed_size;
}
// (see m_total_allocated_memory for description)
size_t getTotalAllocatedMemory() const { return m_total_allocated_memory; }
size_t getTotalAllocatedMemory() const {
return m_total_allocated_memory;
}
// (see m_current_allocated_memory for description)
size_t getCurrentAllocatedMemory() const { return m_current_allocated_memory; }
size_t getCurrentAllocatedMemory() const {
return m_current_allocated_memory;
}
// total GPU memory allocated in the process by Metal driver; including
// implicit allocations from MPS/MPSGraph frameworks and MPSHeapAllocatorImpl.
size_t getDriverAllocatedMemory() const { return current_allocated_size(); }
size_t getDriverAllocatedMemory() const {
return current_allocated_size();
}
// recommended Max memory for Metal
size_t getRecommendedMaxMemory() const { return max_device_size(); }
size_t getRecommendedMaxMemory() const {
return max_device_size();
}
// (see enum DebugVerbosity for description)
uint32_t getDebugVerbosity() const { return m_debug_verbosity; }
uint32_t getDebugVerbosity() const {
return m_debug_verbosity;
}
// returns the device that we allocate from
inline id<MTLDevice> Device() const { return m_device; }
inline id<MTLDevice> Device() const {
return m_device;
}

// TODO: make a common function to do size unit conversions in PyTorch.
inline std::string format_size(uint64_t size) const;

private:
private:
// (see m_high_watermark_ratio for description)
constexpr static double default_high_watermark_ratio = 1.7;
// we set the allowed upper bound to twice the size of recommendedMaxWorkingSetSize.
constexpr static double default_high_watermark_upper_bound = 2.0;
// (see m_low_watermark_ratio for description)
// on unified memory, we could allocate beyond the recommendedMaxWorkingSetSize
constexpr static double default_low_watermark_ratio_unified = 1.4;
constexpr static double default_low_watermark_ratio_unified = 1.4;
constexpr static double default_low_watermark_ratio_discrete = 1.0;

const id<MTLDevice> m_device;
Expand Down Expand Up @@ -387,14 +417,19 @@ class MPSHeapAllocatorImpl {
size_t get_allocation_size(size_t size, uint32_t usage) const;
// maximum size of device memory available for allocation in current process
// Note: the recommendedMaxWorkingSetSize is typically 75% of the total system memory.
size_t max_device_size() const { return [m_device recommendedMaxWorkingSetSize]; }
size_t max_device_size() const {
return [m_device recommendedMaxWorkingSetSize];
}
// there are implicit allocations from MPS backend, so we need to query the 'device' for
// total allocated size instead of manually tracking in MPSAllocator
size_t current_allocated_size() const { return [m_device currentAllocatedSize]; }
size_t current_allocated_size() const {
return [m_device currentAllocatedSize];
}

bool trigger_memory_callbacks(BufferBlock* buffer_block, IMpsAllocatorCallback::EventType event) const {
for (const auto& name : MPSAllocatorCallbacksRegistry()->Keys()) {
MPSAllocatorCallbacksRegistry()->Create(name)->executeMPSAllocatorCallback(buffer_block ? buffer_block->buffer : nullptr, event);
MPSAllocatorCallbacksRegistry()->Create(name)->executeMPSAllocatorCallback(
buffer_block ? buffer_block->buffer : nullptr, event);
}
return true;
}
Expand Down
Loading

0 comments on commit 5d94fc9

Please sign in to comment.