video_core: Improve handling of image buffer aliases #757

Merged (15 commits, Sep 5, 2024)
8 changes: 1 addition & 7 deletions src/core/libraries/kernel/thread_management.cpp
@@ -414,11 +414,6 @@ ScePthreadMutex* createMutex(ScePthreadMutex* addr) {
     if (addr == nullptr || *addr != nullptr) {
         return addr;
     }
-    static std::mutex mutex;
-    std::scoped_lock lk{mutex};
-    if (*addr != nullptr) {
-        return addr;
-    }
     const VAddr vaddr = reinterpret_cast<VAddr>(addr);
     std::string name = fmt::format("mutex{:#x}", vaddr);
     scePthreadMutexInit(addr, nullptr, name.c_str());
@@ -584,8 +579,7 @@ int PS4_SYSV_ABI scePthreadMutexLock(ScePthreadMutex* mutex) {
 }
 
 int PS4_SYSV_ABI scePthreadMutexUnlock(ScePthreadMutex* mutex) {
-    mutex = createMutex(mutex);
-    if (mutex == nullptr) {
+    if (mutex == nullptr || *mutex == nullptr) {
         return SCE_KERNEL_ERROR_EINVAL;
     }
 
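Note on this change: the unlock path no longer lazily creates a mutex through createMutex(); a null or never-initialized handle is now rejected up front, and createMutex() drops its internal double-checked locking guard. A minimal sketch of the new unlock contract, with simplified stand-in types (the real code uses ScePthreadMutex and SCE_KERNEL_ERROR_EINVAL):

```cpp
#include <cerrno>
#include <mutex>

// Sketch only: a plain std::mutex* stands in for the ScePthreadMutex wrapper.
using MutexHandle = std::mutex*;

int UnlockSketch(MutexHandle* mutex) {
    // Reject a null pointer or a never-initialized handle instead of lazily
    // creating the mutex first, as the removed createMutex() call used to do.
    if (mutex == nullptr || *mutex == nullptr) {
        return EINVAL; // stands in for SCE_KERNEL_ERROR_EINVAL
    }
    (*mutex)->unlock();
    return 0;
}
```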
36 changes: 12 additions & 24 deletions src/video_core/buffer_cache/buffer.cpp
@@ -91,10 +91,10 @@ void UniqueBuffer::Create(const vk::BufferCreateInfo& buffer_ci, MemoryUsage usa
     buffer = vk::Buffer{unsafe_buffer};
 }
 
-Buffer::Buffer(const Vulkan::Instance& instance_, MemoryUsage usage_, VAddr cpu_addr_,
-               vk::BufferUsageFlags flags, u64 size_bytes_)
-    : cpu_addr{cpu_addr_}, size_bytes{size_bytes_}, instance{&instance_}, usage{usage_},
-      buffer{instance->GetDevice(), instance->GetAllocator()} {
+Buffer::Buffer(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, MemoryUsage usage_,
+               VAddr cpu_addr_, vk::BufferUsageFlags flags, u64 size_bytes_)
+    : cpu_addr{cpu_addr_}, size_bytes{size_bytes_}, instance{&instance_}, scheduler{&scheduler_},
+      usage{usage_}, buffer{instance->GetDevice(), instance->GetAllocator()} {
     // Create buffer object.
     const vk::BufferCreateInfo buffer_ci = {
         .size = size_bytes,
@@ -117,13 +117,6 @@ Buffer::Buffer(const Vulkan::Instance& instance_, MemoryUsage usage_, VAddr cpu_
 
 vk::BufferView Buffer::View(u32 offset, u32 size, bool is_written, AmdGpu::DataFormat dfmt,
                             AmdGpu::NumberFormat nfmt) {
-    const auto it{std::ranges::find_if(views, [=](const BufferView& view) {
-        return offset == view.offset && size == view.size && is_written == view.is_written &&
-               dfmt == view.dfmt && nfmt == view.nfmt;
-    })};
-    if (it != views.end()) {
-        return *it->handle;
-    }
     const vk::BufferUsageFlags2CreateInfoKHR usage_flags = {
         .usage = is_written ? vk::BufferUsageFlagBits2KHR::eStorageTexelBuffer
                             : vk::BufferUsageFlagBits2KHR::eUniformTexelBuffer,
@@ -135,23 +128,18 @@ vk::BufferView Buffer::View(u32 offset, u32 size, bool is_written, AmdGpu::DataF
         .offset = offset,
         .range = size,
     };
-    views.push_back({
-        .offset = offset,
-        .size = size,
-        .is_written = is_written,
-        .dfmt = dfmt,
-        .nfmt = nfmt,
-        .handle = instance->GetDevice().createBufferViewUnique(view_ci),
-    });
-    return *views.back().handle;
+    const auto view = instance->GetDevice().createBufferView(view_ci);
+    scheduler->DeferOperation(
+        [view, device = instance->GetDevice()] { device.destroyBufferView(view); });
+    return view;
 }
 
 constexpr u64 WATCHES_INITIAL_RESERVE = 0x4000;
 constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000;
 
-StreamBuffer::StreamBuffer(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler_,
+StreamBuffer::StreamBuffer(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
                            MemoryUsage usage, u64 size_bytes)
-    : Buffer{instance, usage, 0, AllFlags, size_bytes}, scheduler{scheduler_} {
+    : Buffer{instance, scheduler, usage, 0, AllFlags, size_bytes} {
     ReserveWatches(current_watches, WATCHES_INITIAL_RESERVE);
     ReserveWatches(previous_watches, WATCHES_INITIAL_RESERVE);
     const auto device = instance.GetDevice();
@@ -206,7 +194,7 @@ void StreamBuffer::Commit() {
 
     auto& watch = current_watches[current_watch_cursor++];
     watch.upper_bound = offset;
-    watch.tick = scheduler.CurrentTick();
+    watch.tick = scheduler->CurrentTick();
 }
 
 void StreamBuffer::ReserveWatches(std::vector<Watch>& watches, std::size_t grow_size) {
@@ -220,7 +208,7 @@ void StreamBuffer::WaitPendingOperations(u64 requested_upper_bound) {
     while (requested_upper_bound > wait_bound && wait_cursor < *invalidation_mark) {
         auto& watch = previous_watches[wait_cursor];
         wait_bound = watch.upper_bound;
-        scheduler.Wait(watch.tick);
+        scheduler->Wait(watch.tick);
         ++wait_cursor;
     }
 }
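The View() rewrite above drops the per-buffer view cache: every call now creates a fresh vk::BufferView and hands its destruction to the scheduler, so the view stays alive until the GPU finishes the work recorded in the current tick. A minimal sketch of that defer-until-tick-completes pattern, assuming a tick-based scheduler as the call site implies (the queue layout is illustrative, not the project's actual Vulkan::Scheduler):

```cpp
#include <cstdint>
#include <deque>
#include <functional>
#include <utility>

// Sketch: cleanup callbacks are tagged with the tick that was current when
// they were queued and run once the GPU has finished that tick.
class TickScheduler {
public:
    // Queue a cleanup callback for the work currently being recorded.
    void DeferOperation(std::function<void()> callback) {
        pending.emplace_back(current_tick, std::move(callback));
    }

    // Called after a fence wait reports that the GPU reached `completed_tick`.
    void ProcessDeferred(uint64_t completed_tick) {
        while (!pending.empty() && pending.front().first <= completed_tick) {
            pending.front().second(); // Safe: the GPU can no longer use the resource.
            pending.pop_front();
        }
    }

private:
    uint64_t current_tick = 1;
    std::deque<std::pair<uint64_t, std::function<void()>>> pending;
};
```

The trade-off is a little view churn per call in exchange for dropping the linear search through `views` and keeping destruction correct when buffers are aliased or recreated mid-frame.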
18 changes: 5 additions & 13 deletions src/video_core/buffer_cache/buffer.h
@@ -73,8 +73,9 @@ struct UniqueBuffer {
 
 class Buffer {
 public:
-    explicit Buffer(const Vulkan::Instance& instance, MemoryUsage usage, VAddr cpu_addr_,
-                    vk::BufferUsageFlags flags, u64 size_bytes_);
+    explicit Buffer(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
+                    MemoryUsage usage, VAddr cpu_addr_, vk::BufferUsageFlags flags,
+                    u64 size_bytes_);
 
     Buffer& operator=(const Buffer&) = delete;
     Buffer(const Buffer&) = delete;
@@ -144,20 +145,12 @@ class Buffer {
     int stream_score = 0;
     size_t size_bytes = 0;
     std::span<u8> mapped_data;
-    const Vulkan::Instance* instance{};
+    const Vulkan::Instance* instance;
+    Vulkan::Scheduler* scheduler;
     MemoryUsage usage;
     UniqueBuffer buffer;
     vk::AccessFlagBits2 access_mask{vk::AccessFlagBits2::eNone};
     vk::PipelineStageFlagBits2 stage{vk::PipelineStageFlagBits2::eNone};
-    struct BufferView {
-        u32 offset;
-        u32 size;
-        bool is_written;
-        AmdGpu::DataFormat dfmt;
-        AmdGpu::NumberFormat nfmt;
-        vk::UniqueBufferView handle;
-    };
-    std::vector<BufferView> views;
 };
 
 class StreamBuffer : public Buffer {
@@ -196,7 +189,6 @@ class StreamBuffer : public Buffer {
     void WaitPendingOperations(u64 requested_upper_bound);
 
 private:
-    Vulkan::Scheduler& scheduler;
     u64 offset{};
     u64 mapped_size{};
     std::vector<Watch> current_watches;
107 changes: 91 additions & 16 deletions src/video_core/buffer_cache/buffer_cache.cpp
@@ -10,20 +10,24 @@
 #include "video_core/renderer_vulkan/liverpool_to_vk.h"
 #include "video_core/renderer_vulkan/vk_instance.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
+#include "video_core/texture_cache/texture_cache.h"
 
 namespace VideoCore {
 
+static constexpr size_t NumVertexBuffers = 32;
 static constexpr size_t StagingBufferSize = 512_MB;
 static constexpr size_t UboStreamBufferSize = 64_MB;
 
 BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
-                         const AmdGpu::Liverpool* liverpool_, PageManager& tracker_)
-    : instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_}, tracker{tracker_},
+                         const AmdGpu::Liverpool* liverpool_, TextureCache& texture_cache_,
+                         PageManager& tracker_)
+    : instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_},
+      texture_cache{texture_cache_}, tracker{tracker_},
       staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize},
       stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize},
       memory_tracker{&tracker} {
     // Ensure the first slot is used for the null buffer
-    void(slot_buffers.insert(instance, MemoryUsage::DeviceLocal, 0, ReadFlags, 1));
+    void(slot_buffers.insert(instance, scheduler, MemoryUsage::DeviceLocal, 0, ReadFlags, 1));
 }
 
 BufferCache::~BufferCache() = default;
@@ -100,9 +104,9 @@ bool BufferCache::BindVertexBuffers(const Shader::Info& vs_info) {
         return false;
     }
 
-    std::array<vk::Buffer, NUM_VERTEX_BUFFERS> host_buffers;
-    std::array<vk::DeviceSize, NUM_VERTEX_BUFFERS> host_offsets;
-    boost::container::static_vector<AmdGpu::Buffer, NUM_VERTEX_BUFFERS> guest_buffers;
+    std::array<vk::Buffer, NumVertexBuffers> host_buffers;
+    std::array<vk::DeviceSize, NumVertexBuffers> host_offsets;
+    boost::container::static_vector<AmdGpu::Buffer, NumVertexBuffers> guest_buffers;
 
     struct BufferRange {
         VAddr base_address;
@@ -117,7 +121,7 @@ bool BufferCache::BindVertexBuffers(const Shader::Info& vs_info) {
 
     // Calculate buffers memory overlaps
     bool has_step_rate = false;
-    boost::container::static_vector<BufferRange, NUM_VERTEX_BUFFERS> ranges{};
+    boost::container::static_vector<BufferRange, NumVertexBuffers> ranges{};
     for (const auto& input : vs_info.vs_inputs) {
         if (input.instance_step_rate == Shader::Info::VsInput::InstanceIdType::OverStepRate0 ||
             input.instance_step_rate == Shader::Info::VsInput::InstanceIdType::OverStepRate1) {
@@ -152,7 +156,7 @@ bool BufferCache::BindVertexBuffers(const Shader::Info& vs_info) {
         return lhv.base_address < rhv.base_address;
     });
 
-    boost::container::static_vector<BufferRange, NUM_VERTEX_BUFFERS> ranges_merged{ranges[0]};
+    boost::container::static_vector<BufferRange, NumVertexBuffers> ranges_merged{ranges[0]};
     for (auto range : ranges) {
         auto& prev_range = ranges_merged.back();
         if (prev_range.end_address < range.base_address) {
@@ -232,7 +236,7 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
                                                   bool is_texel_buffer) {
     static constexpr u64 StreamThreshold = CACHING_PAGESIZE;
     const bool is_gpu_dirty = memory_tracker.IsRegionGpuModified(device_addr, size);
-    if (!is_written && !is_texel_buffer && size <= StreamThreshold && !is_gpu_dirty) {
+    if (!is_written && size <= StreamThreshold && !is_gpu_dirty) {
         // For small uniform buffers that have not been modified by gpu
         // use device local stream buffer to reduce renderpass breaks.
         const u64 offset = stream_buffer.Copy(device_addr, size, instance.UniformMinAlignment());
@@ -241,7 +245,7 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
 
     const BufferId buffer_id = FindBuffer(device_addr, size);
     Buffer& buffer = slot_buffers[buffer_id];
-    SynchronizeBuffer(buffer, device_addr, size);
+    SynchronizeBuffer(buffer, device_addr, size, is_texel_buffer);
     if (is_written) {
         memory_tracker.MarkRegionAsGpuModified(device_addr, size);
     }
@@ -420,8 +424,8 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
     wanted_size = static_cast<u32>(device_addr_end - device_addr);
    const OverlapResult overlap = ResolveOverlaps(device_addr, wanted_size);
     const u32 size = static_cast<u32>(overlap.end - overlap.begin);
-    const BufferId new_buffer_id =
-        slot_buffers.insert(instance, MemoryUsage::DeviceLocal, overlap.begin, AllFlags, size);
+    const BufferId new_buffer_id = slot_buffers.insert(
+        instance, scheduler, MemoryUsage::DeviceLocal, overlap.begin, AllFlags, size);
     auto& new_buffer = slot_buffers[new_buffer_id];
     const size_t size_bytes = new_buffer.SizeBytes();
     const auto cmdbuf = scheduler.CommandBuffer();
@@ -459,7 +463,8 @@ void BufferCache::ChangeRegister(BufferId buffer_id) {
     }
 }
 
-bool BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size) {
+void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size,
+                                    bool is_texel_buffer) {
     std::scoped_lock lk{mutex};
     boost::container::small_vector<vk::BufferCopy, 4> copies;
     u64 total_size_bytes = 0;
@@ -479,8 +484,13 @@ bool BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size)
         // Prevent uploading to gpu modified regions.
         // gpu_modified_ranges.ForEachNotInRange(device_addr_out, range_size, add_copy);
     });
+    SCOPE_EXIT {
+        if (is_texel_buffer) {
+            SynchronizeBufferFromImage(buffer, device_addr, size);
+        }
+    };
     if (total_size_bytes == 0) {
-        return true;
+        return;
     }
     vk::Buffer src_buffer = staging_buffer.Handle();
     if (total_size_bytes < StagingBufferSize) {
@@ -496,7 +506,11 @@ bool BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size)
     } else {
         // For large one time transfers use a temporary host buffer.
         // RenderDoc can lag quite a bit if the stream buffer is too large.
-        Buffer temp_buffer{instance, MemoryUsage::Upload, 0, vk::BufferUsageFlagBits::eTransferSrc,
+        Buffer temp_buffer{instance,
+                           scheduler,
+                           MemoryUsage::Upload,
+                           0,
+                           vk::BufferUsageFlagBits::eTransferSrc,
                            total_size_bytes};
         src_buffer = temp_buffer.Handle();
         u8* const staging = temp_buffer.mapped_data.data();
@@ -524,7 +538,68 @@ bool BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size)
     cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer,
                            vk::PipelineStageFlagBits::eAllCommands,
                            vk::DependencyFlagBits::eByRegion, WRITE_BARRIER, {}, {});
-    return false;
 }
 
+bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size) {
+    boost::container::small_vector<ImageId, 8> image_ids;
+    const u32 inv_size = std::min(size, MaxInvalidateDist);
+    texture_cache.ForEachImageInRegion(device_addr, inv_size, [&](ImageId image_id, Image& image) {
+        // Only consider GPU modified images, i.e render targets or storage images.
+        // Also avoid any CPU modified images as the image data is likely to be stale.
+        if (True(image.flags & ImageFlagBits::CpuModified) ||
+            False(image.flags & ImageFlagBits::GpuModified)) {
+            return;
+        }
+        // Image must fully overlap with the provided buffer range.
+        if (image.cpu_addr < device_addr || image.cpu_addr_end > device_addr + size) {
+            return;
+        }
+        image_ids.push_back(image_id);
+    });
+    if (image_ids.empty()) {
+        return false;
+    }
+    // Sort images by modification tick. If there are overlaps we want to
+    // copy from least to most recently modified.
+    std::ranges::sort(image_ids, [&](ImageId lhs_id, ImageId rhs_id) {
+        const Image& lhs = texture_cache.GetImage(lhs_id);
+        const Image& rhs = texture_cache.GetImage(rhs_id);
+        return lhs.tick_accessed_last < rhs.tick_accessed_last;
+    });
+    boost::container::small_vector<vk::BufferImageCopy, 8> copies;
+    for (const ImageId image_id : image_ids) {
+        copies.clear();
+        Image& image = texture_cache.GetImage(image_id);
+        u32 offset = buffer.Offset(image.cpu_addr);
+        const u32 num_layers = image.info.resources.layers;
+        for (u32 m = 0; m < image.info.resources.levels; m++) {
+            const u32 width = std::max(image.info.size.width >> m, 1u);
+            const u32 height = std::max(image.info.size.height >> m, 1u);
+            const u32 depth =
+                image.info.props.is_volume ? std::max(image.info.size.depth >> m, 1u) : 1u;
+            const auto& [mip_size, mip_pitch, mip_height, mip_ofs] = image.info.mips_layout[m];
+            copies.push_back({
+                .bufferOffset = offset,
+                .bufferRowLength = static_cast<u32>(mip_pitch),
+                .bufferImageHeight = static_cast<u32>(mip_height),
+                .imageSubresource{
+                    .aspectMask = image.aspect_mask & ~vk::ImageAspectFlagBits::eStencil,
+                    .mipLevel = m,
+                    .baseArrayLayer = 0,
+                    .layerCount = num_layers,
+                },
+                .imageOffset = {0, 0, 0},
+                .imageExtent = {width, height, depth},
+            });
+            offset += mip_ofs * num_layers;
+        }
+        scheduler.EndRendering();
+        image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits::eTransferRead);
+        const auto cmdbuf = scheduler.CommandBuffer();
+        cmdbuf.copyImageToBuffer(image.image, vk::ImageLayout::eTransferSrcOptimal, buffer.buffer,
+                                 copies);
+    }
+    return true;
+}
+
 void BufferCache::DeleteBuffer(BufferId buffer_id, bool do_not_mark) {
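The core of the new SynchronizeBufferFromImage() is the linear mip layout: one vk::BufferImageCopy per mip covers all array layers, and the buffer offset advances by `mip_ofs * num_layers` each level. A standalone sketch of that offset walk (MipLayout is a simplified stand-in for the entries of `image.info.mips_layout`):

```cpp
#include <cstdint>
#include <vector>

// Simplified stand-in for one entry of image.info.mips_layout.
struct MipLayout {
    uint64_t pitch;  // texels per row (bufferRowLength)
    uint64_t height; // rows per layer (bufferImageHeight)
    uint64_t offset; // per-layer byte size contribution (mip_ofs in the diff)
};

// Buffer offset of each mip when every mip stores all of its array layers
// contiguously, matching `offset += mip_ofs * num_layers` in the loop above.
std::vector<uint64_t> MipBufferOffsets(const std::vector<MipLayout>& mips, uint32_t num_layers,
                                       uint64_t base_offset) {
    std::vector<uint64_t> offsets;
    uint64_t offset = base_offset;
    for (const MipLayout& mip : mips) {
        offsets.push_back(offset);
        offset += mip.offset * num_layers;
    }
    return offsets;
}
```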
10 changes: 7 additions & 3 deletions src/video_core/buffer_cache/buffer_cache.h
@@ -28,7 +28,7 @@ using BufferId = Common::SlotId;
 
 static constexpr BufferId NULL_BUFFER_ID{0};
 
-static constexpr u32 NUM_VERTEX_BUFFERS = 32;
+class TextureCache;
 
 class BufferCache {
 public:
@@ -53,7 +53,8 @@ class BufferCache {
 
 public:
     explicit BufferCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
-                         const AmdGpu::Liverpool* liverpool, PageManager& tracker);
+                         const AmdGpu::Liverpool* liverpool, TextureCache& texture_cache,
+                         PageManager& tracker);
     ~BufferCache();
 
     /// Invalidates any buffer in the logical page range.
@@ -116,13 +117,16 @@ class BufferCache {
     template <bool insert>
     void ChangeRegister(BufferId buffer_id);
 
-    bool SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size);
+    void SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_texel_buffer);
+
+    bool SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size);
 
     void DeleteBuffer(BufferId buffer_id, bool do_not_mark = false);
 
     const Vulkan::Instance& instance;
     Vulkan::Scheduler& scheduler;
     const AmdGpu::Liverpool* liverpool;
+    TextureCache& texture_cache;
     PageManager& tracker;
     StreamBuffer staging_buffer;
     StreamBuffer stream_buffer;
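Note that the header gains a forward declaration (`class TextureCache;`) rather than an include: a reference member only needs a declared type, and buffer_cache.cpp pulls in texture_cache.h where the members are actually used, which keeps the two caches' headers from including each other. A self-contained illustration of the pattern (dummy class bodies, not the real caches):

```cpp
// Forward declaration is enough to declare a reference member.
class TextureCache;

class BufferCache {
public:
    explicit BufferCache(TextureCache& texture_cache_) : texture_cache{texture_cache_} {}

private:
    TextureCache& texture_cache; // bound for the cache's whole lifetime
};

// The full definition is only needed where members are accessed
// (the role played by texture_cache.h in buffer_cache.cpp).
class TextureCache {};

int main() {
    TextureCache textures;
    BufferCache buffers{textures};
}
```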