From 668b1c687c1aa2b8ff3c514167c420bfa72acabe Mon Sep 17 00:00:00 2001
From: Fabio Pellacini <fabio.pellacini@gmail.com>
Date: Tue, 8 Mar 2022 12:57:22 +0100
Subject: [PATCH] Async cuda execution and optix denoiser (#1347)

---
 CMakePresets.json            |   8 +-
 apps/ycutrace/ycutrace.cpp   |   8 +-
 libs/yocto/yocto_cutrace.cpp | 555 ++++++++++++++++++++++++-----------
 libs/yocto/yocto_cutrace.cu  | 125 ++------
 libs/yocto/yocto_cutrace.h   | 141 +++++----
 libs/yocto/yocto_gui.cpp     |  32 +-
 libs/yocto/yocto_trace.cpp   |  75 ++++-
 libs/yocto/yocto_trace.h     |  47 ++-
 8 files changed, 604 insertions(+), 387 deletions(-)
diff --git a/CMakePresets.json b/CMakePresets.json
index edb2d6a2c..4e6a695ca 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -86,8 +86,8 @@
       "cacheVariables": {
         "CMAKE_BUILD_TYPE": "RelWithDebInfo",
         "YOCTO_APPS": "ON",
-        "YOCTO_EMBREE": "OFF",
-        "YOCTO_DENOISE": "OFF",
+        "YOCTO_EMBREE": "ON",
+        "YOCTO_DENOISE": "ON",
         "YOCTO_OPENGL": "ON",
         "YOCTO_CUDA": "ON"
       },
@@ -104,8 +104,8 @@
       "cacheVariables": {
         "CMAKE_BUILD_TYPE": "Debug",
         "YOCTO_APPS": "ON",
-        "YOCTO_EMBREE": "OFF",
-        "YOCTO_DENOISE": "OFF",
+        "YOCTO_EMBREE": "ON",
+        "YOCTO_DENOISE": "ON",
         "YOCTO_OPENGL": "ON",
         "YOCTO_CUDA": "ON"
       },
diff --git a/apps/ycutrace/ycutrace.cpp b/apps/ycutrace/ycutrace.cpp
index c138de883..fb4dd9fe1 100644
--- a/apps/ycutrace/ycutrace.cpp
+++ b/apps/ycutrace/ycutrace.cpp
@@ -124,19 +124,19 @@ void run_render(const render_params& params_) {
 
   // upload scene to the gpu
   timer        = simple_timer{};
-  auto cuscene = make_cutrace_scene(scene, params__);
+  auto cuscene = make_cutrace_scene(context, scene, params__);
   print_info("upload scene: {}", elapsed_formatted(timer));
 
   // build bvh
   timer    = simple_timer{};
-  auto bvh = make_cutrace_bvh(context, cuscene, scene, params__);
+  auto bvh = make_cutrace_bvh(context, cuscene, params__);
   print_info("build bvh: {}", elapsed_formatted(timer));
 
   // init lights
-  auto lights = make_cutrace_lights(scene, params__);
+  auto lights = make_cutrace_lights(context, scene, params__);
 
   // state
-  auto state = make_cutrace_state(scene, params__);
+  auto state = make_cutrace_state(context, scene, params__);
 
   // render
   timer = simple_timer{};
diff --git a/libs/yocto/yocto_cutrace.cpp b/libs/yocto/yocto_cutrace.cpp
index 8cfae089f..ff6351da5 100644
--- a/libs/yocto/yocto_cutrace.cpp
+++ b/libs/yocto/yocto_cutrace.cpp
@@ -46,6 +46,10 @@
 #include <optix_function_table_definition.h>
 #include <optix_stubs.h>
 
+#ifdef YOCTO_DENOISE
+#include <OpenImageDenoise/oidn.hpp>
+#endif
+
 // -----------------------------------------------------------------------------
 // CUDA HELPERS
 // -----------------------------------------------------------------------------
@@ -59,8 +63,8 @@ static void check_result(CUresult result) {
   }
 }
 
-static void check_cusync() {
-  check_result(cuStreamSynchronize(nullptr));  // TODO: cuda_stream
+static void sync_gpu(CUstream stream) {
+  check_result(cuStreamSynchronize(stream));
 }
 
 static void check_result(OptixResult result) {
@@ -71,91 +75,94 @@ static void check_result(OptixResult result) {
 
 // make a buffer
 template <typename T>
-static cubuffer<T> make_buffer(size_t size, const T* data) {
-  auto buffer  = cubuffer<T>{};
+static cuspan<T> make_buffer(CUstream stream, size_t size, const T* data) {
+  auto buffer  = cuspan<T>{};
   buffer._size = size;
   check_result(cuMemAlloc(&buffer._data, buffer.size_in_bytes()));
   if (data) {
-    check_result(
-        cuMemcpyHtoD(buffer.device_ptr(), data, buffer.size_in_bytes()));
+    check_result(cuMemcpyHtoDAsync(
+        buffer.device_ptr(), data, buffer.size_in_bytes(), stream));
   }
   return buffer;
 }
 template <typename T>
-static cubuffer<T> make_buffer(const vector<T>& data) {
+static cuspan<T> make_buffer(CUstream stream, const vector<T>& data) {
   if (data.empty()) return {};
-  return make_buffer(data.size(), data.data());
+  return make_buffer(stream, data.size(), data.data());
 }
 template <typename T>
-static cubuffer<T> make_buffer(const T& data) {
-  return make_buffer(1, &data);
+static cuspan<T> make_buffer(CUstream stream, const T& data) {
+  return make_buffer(stream, 1, &data);
 }
 
 // resize a buffer
 template <typename T>
-static void resize_buffer(cubuffer<T>& buffer, size_t size, const T* data) {
+static void resize_buffer(
+    CUstream stream, cuspan<T>& buffer, size_t size, const T* data) {
   if (buffer._size != size) {
-    check_result(cuMemFree(buffer._data));
+    if (buffer._size != 0) check_result(cuMemFree(buffer._data));
     buffer._size = size;
     check_result(cuMemAlloc(&buffer._data, buffer.size_in_bytes()));
   }
   if (data) {
-    check_result(
-        cuMemcpyHtoD(buffer.device_ptr(), data, buffer.size_in_bytes()));
+    check_result(cuMemcpyHtoDAsync(
+        buffer.device_ptr(), data, buffer.size_in_bytes(), stream));
   }
 }
 
 // update a buffer
 template <typename T>
-static void update_buffer(cubuffer<T>& buffer, size_t size, const T* data) {
+static void update_buffer(
+    CUstream stream, cuspan<T>& buffer, size_t size, const T* data) {
   if (buffer.size() != size) throw std::runtime_error{"Cuda buffer error"};
-  check_result(cuMemcpyHtoD(buffer.device_ptr(), data, buffer.size_in_bytes()));
+  check_result(cuMemcpyHtoDAsync(
+      buffer.device_ptr(), data, buffer.size_in_bytes(), stream));
 }
 template <typename T>
-static void update_buffer(cubuffer<T>& buffer, const vector<T>& data) {
-  return update_buffer(buffer, data.size(), data.data());
+static void update_buffer(
+    CUstream stream, cuspan<T>& buffer, const vector<T>& data) {
+  return update_buffer(stream, buffer, data.size(), data.data());
 }
 template <typename T>
-static void update_buffer(cubuffer<T>& buffer, const T& data) {
-  return update_buffer(buffer, 1, &data);
+static void update_buffer(CUstream stream, cuspan<T>& buffer, const T& data) {
+  return update_buffer(stream, buffer, 1, &data);
 }
 
 // update a buffer
 template <typename T, typename T1>
-static void update_buffer_value(
-    cubuffer<T>& buffer, size_t offset, size_t size, const T1* data) {
-  check_result(
-      cuMemcpyHtoD(buffer.device_ptr() + offset, data, size * sizeof(T1)));
+static void update_buffer_value(CUstream stream, cuspan<T>& buffer,
+    size_t offset, size_t size, const T1* data) {
+  check_result(cuMemcpyHtoDAsync(
+      buffer.device_ptr() + offset, data, size * sizeof(T1), stream));
 }
 template <typename T, typename T1>
 static void update_buffer_value(
-    cubuffer<T>& buffer, size_t offset, const T1& data) {
-  return update_buffer_value(buffer, offset, 1, &data);
+    CUstream stream, cuspan<T>& buffer, size_t offset, const T1& data) {
+  return update_buffer_value(stream, buffer, offset, 1, &data);
 }
 
-// download buffer
+// download buffer --- these use the synchronous cuMemcpyDtoH to avoid errors
 template <typename T>
-static void download_buffer(
-    const cubuffer<T>& buffer, size_t size, void* data) {
+static void download_buffer(const cuspan<T>& buffer, size_t size, void* data) {
   if (buffer.size() != size) throw std::runtime_error{"Cuda download error"};
   check_result(cuMemcpyDtoH(data, buffer.device_ptr(), buffer.size_in_bytes()));
 }
 template <typename T>
-static void download_buffer(const cubuffer<T>& buffer, vector<T>& data) {
+static void download_buffer(const cuspan<T>& buffer, vector<T>& data) {
   return download_buffer(buffer, data.size(), data.data());
 }
 template <typename T>
-static void download_buffer(const cubuffer<T>& buffer, T& data) {
+static void download_buffer(const cuspan<T>& buffer, T& data) {
   return download_buffer(buffer, 1, &data);
 }
 template <typename T>
-static vector<T> download_buffer_vector(const cubuffer<T>& buffer) {
+static vector<T> download_buffer_vector(const cuspan<T>& buffer) {
   auto data = vector<T>(buffer.size());
   download_buffer(buffer, data.size(), data.data());
   return data;
 }
 template <typename T>
-static T download_buffer_value(const cubuffer<T>& buffer) {
+static T download_buffer_value(const cuspan<T>& buffer) {
   if (buffer.size() != 1) throw std::runtime_error{"Cuda download error"};
   auto data = T{};
   download_buffer(buffer, 1, &data);
@@ -164,7 +171,7 @@ static T download_buffer_value(const cubuffer<T>& buffer) {
 
 // free buffer
 template <typename T>
-static void clear_buffer(cubuffer<T>& buffer) {
+static void clear_buffer(cuspan<T>& buffer) {
   if (buffer.device_ptr() == 0) return;
   check_result(cuMemFree(buffer.device_ptr()));
   buffer._data = 0;
@@ -180,9 +187,7 @@ namespace yocto {
 
 extern "C" char yocto_cutrace_ptx[];
 
-cusceneext_data::cusceneext_data(cusceneext_data&& other) {
-  cutextures.swap(other.cutextures);
-  cushapes.swap(other.cushapes);
+cuscene_data::cuscene_data(cuscene_data&& other) {
   cameras.swap(other.cameras);
   textures.swap(other.textures);
   materials.swap(other.materials);
@@ -190,9 +195,7 @@ cusceneext_data::cusceneext_data(cusceneext_data&& other) {
   instances.swap(other.instances);
   environments.swap(other.environments);
 }
-cusceneext_data& cusceneext_data::operator=(cusceneext_data&& other) {
-  cutextures.swap(other.cutextures);
-  cushapes.swap(other.cushapes);
+cuscene_data& cuscene_data::operator=(cuscene_data&& other) {
   cameras.swap(other.cameras);
   textures.swap(other.textures);
   materials.swap(other.materials);
@@ -201,17 +204,23 @@ cusceneext_data& cusceneext_data::operator=(cusceneext_data&& other) {
   environments.swap(other.environments);
   return *this;
 }
-cusceneext_data::~cusceneext_data() {
-  for (auto& cutexture : cutextures) {
-    cuArrayDestroy(cutexture.array);
-    // TODO: texture
+cuscene_data::~cuscene_data() {
+  if (!textures.empty()) {
+    auto textures_ = download_buffer_vector(textures);  // host copy of handles
+    for (auto& texture : textures_) {
+      cuTexObjectDestroy(texture.texture);  // free the object before its array
+      cuArrayDestroy(texture.array);
+    }
   }
-  for (auto& cushape : cushapes) {
-    clear_buffer(cushape.positions);
-    clear_buffer(cushape.normals);
-    clear_buffer(cushape.texcoords);
-    clear_buffer(cushape.colors);
-    clear_buffer(cushape.triangles);
+  if (!shapes.empty()) {
+    auto shapes_ = download_buffer_vector(shapes);
+    for (auto& shape : shapes_) {
+      clear_buffer(shape.positions);
+      clear_buffer(shape.normals);
+      clear_buffer(shape.texcoords);
+      clear_buffer(shape.colors);
+      clear_buffer(shape.triangles);
+    }
   }
   clear_buffer(cameras);
   clear_buffer(textures);
@@ -243,6 +252,7 @@ cubvh_data::~cubvh_data() {
 }
 
 cutrace_context::cutrace_context(cutrace_context&& other) {
+  std::swap(denoiser, other.denoiser);
   globals_buffer.swap(other.globals_buffer);
   raygen_records.swap(other.raygen_records);
   miss_records.swap(other.miss_records);
@@ -258,6 +268,7 @@ cutrace_context::cutrace_context(cutrace_context&& other) {
   std::swap(cuda_context, other.cuda_context);
 }
 cutrace_context& cutrace_context::operator=(cutrace_context&& other) {
+  std::swap(denoiser, other.denoiser);
   globals_buffer.swap(other.globals_buffer);
   raygen_records.swap(other.raygen_records);
   miss_records.swap(other.miss_records);
@@ -274,7 +285,65 @@ cutrace_context& cutrace_context::operator=(cutrace_context&& other) {
   return *this;
 }
 
+cutrace_state::cutrace_state(cutrace_state&& other) {
+  std::swap(width, other.width);
+  std::swap(height, other.height);
+  std::swap(samples, other.samples);
+  image.swap(other.image);
+  albedo.swap(other.albedo);
+  normal.swap(other.normal);
+  hits.swap(other.hits);
+  rngs.swap(other.rngs);
+  denoised.swap(other.denoised);
+  denoiser_state.swap(other.denoiser_state);
+  denoiser_scratch.swap(other.denoiser_scratch);
+}
+cutrace_state& cutrace_state::operator=(cutrace_state&& other) {
+  std::swap(width, other.width);
+  std::swap(height, other.height);
+  std::swap(samples, other.samples);
+  image.swap(other.image);
+  albedo.swap(other.albedo);
+  normal.swap(other.normal);
+  hits.swap(other.hits);
+  rngs.swap(other.rngs);
+  denoised.swap(other.denoised);
+  denoiser_state.swap(other.denoiser_state);
+  denoiser_scratch.swap(other.denoiser_scratch);
+  return *this;
+}
+cutrace_state::~cutrace_state() {
+  clear_buffer(image);
+  clear_buffer(albedo);
+  clear_buffer(normal);
+  clear_buffer(hits);
+  clear_buffer(rngs);
+  clear_buffer(denoised);
+  clear_buffer(denoiser_state);
+  clear_buffer(denoiser_scratch);
+}
+
+cutrace_lights::cutrace_lights(cutrace_lights&& other) {
+  lights.swap(other.lights);
+}
+cutrace_lights& cutrace_lights::operator=(cutrace_lights&& other) {
+  lights.swap(other.lights);
+  return *this;
+}
+cutrace_lights::~cutrace_lights() {
+  if (!lights.empty()) {
+    auto lights_ = download_buffer_vector(lights);
+    for (auto& light : lights_) {
+      clear_buffer(light.elements_cdf);
+    }
+  }
+  clear_buffer(lights);
+}
+
 cutrace_context::~cutrace_context() {
+  // denoiser
+  optixDenoiserDestroy(denoiser);
+
   // global buffer
   clear_buffer(globals_buffer);
 
@@ -298,6 +367,11 @@ cutrace_context::~cutrace_context() {
   cuCtxDestroy(cuda_context);
 }
 
+static void optix_log_callback(
+    unsigned int level, const char* tag, const char* message, void* cbdata) {
+  printf("[%s] %s\n", tag, message);
+}
+
 // init cuda and optix context
 cutrace_context make_cutrace_context(const cutrace_params& params) {
   // context
@@ -314,10 +388,16 @@ cutrace_context make_cutrace_context(const cutrace_params& params) {
   // init cuda device
   check_result(cuStreamCreate(&context.cuda_stream, CU_STREAM_DEFAULT));
 
-  // init optix device
+  // init optix device --- disable logging
+  auto enable_logging          = false;
+  auto ooptions                = OptixDeviceContextOptions{};
+  ooptions.logCallbackFunction = optix_log_callback;
+  ooptions.logCallbackData     = nullptr;
+  ooptions.logCallbackLevel    = 4;
+  ooptions.validationMode      = OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL;
   check_result(cuCtxGetCurrent(&context.cuda_context));
-  check_result(optixDeviceContextCreate(
-      context.cuda_context, 0, &context.optix_context));
+  check_result(optixDeviceContextCreate(context.cuda_context,
+      enable_logging ? &ooptions : nullptr, &context.optix_context));
 
   // options
   auto module_options             = OptixModuleCompileOptions{};
@@ -389,13 +469,13 @@ cutrace_context make_cutrace_context(const cutrace_params& params) {
   auto raygen_record = cutrace_stbrecord{};
   check_result(
       optixSbtRecordPackHeader(context.raygen_program, &raygen_record));
-  context.raygen_records             = make_buffer(raygen_record);
+  context.raygen_records = make_buffer(context.cuda_stream, raygen_record);
   context.binding_table.raygenRecord = context.raygen_records.device_ptr();
 
   // stb miss
   auto miss_record = cutrace_stbrecord{};
   check_result(optixSbtRecordPackHeader(context.miss_program, &miss_record));
-  context.miss_records                 = make_buffer(miss_record);
+  context.miss_records = make_buffer(context.cuda_stream, miss_record);
   context.binding_table.missRecordBase = context.miss_records.device_ptr();
   context.binding_table.missRecordStrideInBytes = sizeof(cutrace_stbrecord);
   context.binding_table.missRecordCount         = 1;
@@ -404,14 +484,28 @@ cutrace_context make_cutrace_context(const cutrace_params& params) {
   auto hitgroup_record = cutrace_stbrecord{};
   check_result(
       optixSbtRecordPackHeader(context.hitgroup_program, &hitgroup_record));
-  context.hitgroup_records = make_buffer(hitgroup_record);
+  context.hitgroup_records = make_buffer(context.cuda_stream, hitgroup_record);
   context.binding_table.hitgroupRecordBase =
       context.hitgroup_records.device_ptr();
   context.binding_table.hitgroupRecordStrideInBytes = sizeof(cutrace_stbrecord);
   context.binding_table.hitgroupRecordCount         = 1;
 
   // globals
-  context.globals_buffer = make_buffer(cutrace_globals{});
+  context.globals_buffer = make_buffer(context.cuda_stream, cutrace_globals{});
+
+  // denoiser
+  auto doptions        = OptixDenoiserOptions{};
+  doptions.guideAlbedo = (uint) true;
+  doptions.guideNormal = (uint) true;
+  check_result(optixDenoiserCreate(context.optix_context,
+      OPTIX_DENOISER_MODEL_KIND_HDR, &doptions, &context.denoiser));
+
+  auto denoiser_sizes = OptixDenoiserSizes{};  // NOTE(review): never read below
+  check_result(optixDenoiserComputeMemoryResources(
+      context.denoiser, 1280, 1280, &denoiser_sizes));  // fixed-size query
+
+  // sync gpu
+  sync_gpu(context.cuda_stream);
 
   return context;
 }
@@ -422,18 +516,18 @@ void trace_start(cutrace_context& context, cutrace_state& state,
     const cutrace_lights& lights, const scene_data& scene,
     const cutrace_params& params) {
   auto globals = cutrace_globals{};
-  update_buffer_value(
-      context.globals_buffer, offsetof(cutrace_globals, state), state);
-  update_buffer_value(
-      context.globals_buffer, offsetof(cutrace_globals, scene), cuscene);
-  update_buffer_value(context.globals_buffer, offsetof(cutrace_globals, bvh),
-      bvh.instances_bvh.handle);
-  update_buffer_value(
-      context.globals_buffer, offsetof(cutrace_globals, lights), lights);
-  update_buffer_value(
-      context.globals_buffer, offsetof(cutrace_globals, params), params);
-  // sync so we can get the frame
-  check_cusync();
+  update_buffer_value(context.cuda_stream, context.globals_buffer,
+      offsetof(cutrace_globals, state), state);
+  update_buffer_value(context.cuda_stream, context.globals_buffer,
+      offsetof(cutrace_globals, scene), cuscene);
+  update_buffer_value(context.cuda_stream, context.globals_buffer,
+      offsetof(cutrace_globals, bvh), bvh.instances_bvh.handle);
+  update_buffer_value(context.cuda_stream, context.globals_buffer,
+      offsetof(cutrace_globals, lights), lights);
+  update_buffer_value(context.cuda_stream, context.globals_buffer,
+      offsetof(cutrace_globals, params), params);
+  // sync to avoid errors
+  sync_gpu(context.cuda_stream);
 }
 
 // render a batch of samples
@@ -443,7 +537,7 @@ void trace_samples(cutrace_context& context, cutrace_state& state,
     const cutrace_params& params) {
   if (state.samples >= params.samples) return;
   auto nsamples = params.batch;
-  update_buffer_value(context.globals_buffer,
+  update_buffer_value(context.cuda_stream, context.globals_buffer,
       offsetof(cutrace_globals, state) + offsetof(cutrace_state, samples),
       state.samples);
   check_result(optixLaunch(context.optix_pipeline, context.cuda_stream,
@@ -451,13 +545,16 @@ void trace_samples(cutrace_context& context, cutrace_state& state,
       context.globals_buffer.size_in_bytes(), &context.binding_table,
       state.width, state.height, 1));
   state.samples += nsamples;
-  // sync so we can get the frame
-  check_cusync();
+  if (params.denoise) {
+    denoise_image(context, state);
+  }
+  // sync so we can get the image
+  sync_gpu(context.cuda_stream);
 }
 
-cusceneext_data make_cutrace_scene(
+cuscene_data make_cutrace_scene(cutrace_context& context,
     const scene_data& scene, const cutrace_params& params) {
-  auto cuscene = cusceneext_data{};
+  auto cuscene = cuscene_data{};
 
   auto cucameras = vector<cucamera_data>{};
   for (auto& camera : scene.cameras) {
@@ -470,23 +567,27 @@ cusceneext_data make_cutrace_scene(
     cucamera.focus        = camera.focus;
     cucamera.orthographic = camera.orthographic;
   }
-  cuscene.cameras = make_buffer(cucameras);
+  cuscene.cameras = make_buffer(context.cuda_stream, cucameras);
 
   // shapes
+  auto cushapes = vector<cushape_data>{};
   for (auto& shape : scene.shapes) {
-    auto& cushape     = cuscene.cushapes.emplace_back();
-    cushape.positions = make_buffer(shape.positions);
-    cushape.triangles = make_buffer(shape.triangles);
-    if (!shape.normals.empty()) cushape.normals = make_buffer(shape.normals);
+    auto& cushape     = cushapes.emplace_back();
+    cushape.positions = make_buffer(context.cuda_stream, shape.positions);
+    cushape.triangles = make_buffer(context.cuda_stream, shape.triangles);
+    if (!shape.normals.empty())
+      cushape.normals = make_buffer(context.cuda_stream, shape.normals);
     if (!shape.texcoords.empty())
-      cushape.texcoords = make_buffer(shape.texcoords);
-    if (!shape.colors.empty()) cushape.colors = make_buffer(shape.colors);
+      cushape.texcoords = make_buffer(context.cuda_stream, shape.texcoords);
+    if (!shape.colors.empty())
+      cushape.colors = make_buffer(context.cuda_stream, shape.colors);
   }
-  cuscene.shapes = make_buffer(cuscene.cushapes);
+  cuscene.shapes = make_buffer(context.cuda_stream, cushapes);
 
   // textures
+  auto cutextures = vector<cutexture_data>{};
   for (auto& texture : scene.textures) {
-    auto& cutexture  = cuscene.cutextures.emplace_back();
+    auto& cutexture  = cutextures.emplace_back();
     cutexture.width  = texture.width;
     cutexture.height = texture.height;
     cutexture.linear = texture.linear;
@@ -542,7 +643,7 @@ cusceneext_data make_cutrace_scene(
     check_result(cuTexObjectCreate(&cutexture.texture, &resource_descriptor,
         &texture_descriptor, nullptr));
   }
-  cuscene.textures = make_buffer(cuscene.cutextures);
+  cuscene.textures = make_buffer(context.cuda_stream, cutextures);
 
   auto materials = vector<cumaterial_data>{};
   for (auto& material : scene.materials) {
@@ -563,7 +664,7 @@ cusceneext_data make_cutrace_scene(
     cumaterial.scattering_tex = material.scattering_tex;
     cumaterial.normal_tex     = material.normal_tex;
   }
-  cuscene.materials = make_buffer(materials);
+  cuscene.materials = make_buffer(context.cuda_stream, materials);
 
   auto instances = vector<cuinstance_data>{};
   for (auto& instance : scene.instances) {
@@ -572,7 +673,7 @@ cusceneext_data make_cutrace_scene(
     cuinstance.shape    = instance.shape;
     cuinstance.material = instance.material;
   }
-  cuscene.instances = make_buffer(instances);
+  cuscene.instances = make_buffer(context.cuda_stream, instances);
 
   auto environments = vector<cuenvironment_data>{};
   for (auto& environment : scene.environments) {
@@ -581,16 +682,16 @@ cusceneext_data make_cutrace_scene(
     cuenvironment.emission     = environment.emission;
     cuenvironment.emission_tex = environment.emission_tex;
   }
-  cuscene.environments = make_buffer(environments);
+  cuscene.environments = make_buffer(context.cuda_stream, environments);
 
   // sync gpu
-  check_cusync();
+  sync_gpu(context.cuda_stream);
 
   return cuscene;
 }
 
-void update_cutrace_cameras(cusceneext_data& cuscene, const scene_data& scene,
-    const cutrace_params& params) {
+void update_cutrace_cameras(cutrace_context& context, cuscene_data& cuscene,
+    const scene_data& scene, const cutrace_params& params) {
   auto cucameras = vector<cucamera_data>{};
   for (auto& camera : scene.cameras) {
     auto& cucamera        = cucameras.emplace_back();
@@ -602,18 +703,24 @@ void update_cutrace_cameras(cusceneext_data& cuscene, const scene_data& scene,
     cucamera.focus        = camera.focus;
     cucamera.orthographic = camera.orthographic;
   }
-  update_buffer(cuscene.cameras, cucameras);
+  update_buffer(context.cuda_stream, cuscene.cameras, cucameras);
+  sync_gpu(context.cuda_stream);
 }
 
-cubvh_data make_cutrace_bvh(cutrace_context& context, cusceneext_data& cuscene,
-    const scene_data& scene, const cutrace_params& params) {
+cubvh_data make_cutrace_bvh(cutrace_context& context, const cuscene_data& scene,
+    const cutrace_params& params) {
   auto bvh = cubvh_data{};
 
+  // download shapes and instances
+  // this is not efficient, but keeps the API very clean
+  // in the future, we might want to merge scene and bvh creation
+  auto shapes_data    = download_buffer_vector(scene.shapes);
+  auto instances_data = download_buffer_vector(scene.instances);
+
   // shapes
   bvh.shapes_bvhs.resize(scene.shapes.size());
   for (auto shape_id = (size_t)0; shape_id < scene.shapes.size(); shape_id++) {
-    auto& shape   = scene.shapes[shape_id];
-    auto& cushape = cuscene.cushapes[shape_id];
+    auto& shape = shapes_data[shape_id];
 
     // input
     auto built_input                       = OptixBuildInput{};
@@ -621,16 +728,16 @@ cubvh_data make_cutrace_bvh(cutrace_context& context, cusceneext_data& cuscene,
     built_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
     built_input.triangleArray.vertexStrideInBytes = sizeof(vec3f);
     built_input.triangleArray.numVertices         = (int)shape.positions.size();
-    auto vertex_buffer                      = cushape.positions.device_ptr();
+    auto vertex_buffer                      = shape.positions.device_ptr();
     built_input.triangleArray.vertexBuffers = &vertex_buffer;
     built_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
     built_input.triangleArray.indexStrideInBytes = sizeof(vec3i);
     built_input.triangleArray.numIndexTriplets   = (int)shape.triangles.size();
-    auto index_buffer                       = cushape.triangles.device_ptr();
-    built_input.triangleArray.indexBuffer   = index_buffer;
-    auto input_flags                        = (unsigned int)0;
-    built_input.triangleArray.flags         = &input_flags;
-    built_input.triangleArray.numSbtRecords = 1;
+    auto index_buffer                            = shape.triangles.device_ptr();
+    built_input.triangleArray.indexBuffer        = index_buffer;
+    auto input_flags                             = (unsigned int)0;
+    built_input.triangleArray.flags              = &input_flags;
+    built_input.triangleArray.numSbtRecords      = 1;
     built_input.triangleArray.sbtIndexOffsetBuffer        = 0;
     built_input.triangleArray.sbtIndexOffsetSizeInBytes   = 0;
     built_input.triangleArray.sbtIndexOffsetStrideInBytes = 0;
@@ -646,31 +753,37 @@ cubvh_data make_cutrace_bvh(cutrace_context& context, cusceneext_data& cuscene,
     check_result(optixAccelComputeMemoryUsage(context.optix_context,
         &accelerator_options, &built_input, (int)1, &accelerator_sizes));
 
-    auto compacted_size_buffer = make_buffer(1, (uint64_t*)nullptr);
+    auto compacted_size_buffer = make_buffer(
+        context.cuda_stream, 1, (uint64_t*)nullptr);
     auto readback_descriptor   = OptixAccelEmitDesc{};
     readback_descriptor.type   = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
     readback_descriptor.result = compacted_size_buffer.device_ptr();
 
     // build
     auto temporary_buffer = make_buffer(
-        accelerator_sizes.tempSizeInBytes, (byte*)nullptr);
-    auto bvh_buffer = make_buffer(
-        accelerator_sizes.outputSizeInBytes, (byte*)nullptr);
-    auto& sbvh = bvh.shapes_bvhs[shape_id];
+        context.cuda_stream, accelerator_sizes.tempSizeInBytes, (byte*)nullptr);
+    auto  bvh_buffer = make_buffer(context.cuda_stream,
+         accelerator_sizes.outputSizeInBytes, (byte*)nullptr);
+    auto& sbvh       = bvh.shapes_bvhs[shape_id];
     check_result(optixAccelBuild(context.optix_context,
         /* cuda_stream */ 0, &accelerator_options, &built_input, (int)1,
         temporary_buffer.device_ptr(), temporary_buffer.size_in_bytes(),
         bvh_buffer.device_ptr(), bvh_buffer.size_in_bytes(), &sbvh.handle,
         &readback_descriptor, 1));
-    check_cusync();
+
+    // sync
+    sync_gpu(context.cuda_stream);
 
     // compact
     auto compacted_size = download_buffer_value(compacted_size_buffer);
-    sbvh.buffer         = make_buffer(compacted_size, (byte*)nullptr);
+    sbvh.buffer         = make_buffer(
+                context.cuda_stream, compacted_size, (byte*)nullptr);
     check_result(optixAccelCompact(context.optix_context,
         /*cuda_stream:*/ 0, sbvh.handle, sbvh.buffer.device_ptr(),
         sbvh.buffer.size_in_bytes(), &sbvh.handle));
-    check_cusync();
+
+    // sync
+    sync_gpu(context.cuda_stream);
 
     // cleanup
     clear_buffer(bvh_buffer);
@@ -681,20 +794,20 @@ cubvh_data make_cutrace_bvh(cutrace_context& context, cusceneext_data& cuscene,
   // instances
   {
     // upload data
-    auto instances = vector<OptixInstance>(scene.instances.size());
+    auto opinstances = vector<OptixInstance>(scene.instances.size());
     for (auto instance_id = 0; instance_id < scene.instances.size();
          instance_id++) {
-      auto& instance   = scene.instances[instance_id];
-      auto& cuinstance = instances[instance_id];
+      auto& instance   = instances_data[instance_id];
+      auto& opinstance = opinstances[instance_id];
       auto  transform  = transpose(frame_to_mat(instance.frame));
-      memcpy(cuinstance.transform, &transform, sizeof(float) * 12);
-      cuinstance.sbtOffset         = 0;
-      cuinstance.instanceId        = instance_id;
-      cuinstance.traversableHandle = bvh.shapes_bvhs[instance.shape].handle;
-      cuinstance.flags             = OPTIX_INSTANCE_FLAG_NONE;
-      cuinstance.visibilityMask    = 0xff;
+      memcpy(opinstance.transform, &transform, sizeof(float) * 12);
+      opinstance.sbtOffset         = 0;
+      opinstance.instanceId        = instance_id;
+      opinstance.traversableHandle = bvh.shapes_bvhs[instance.shape].handle;
+      opinstance.flags             = OPTIX_INSTANCE_FLAG_NONE;
+      opinstance.visibilityMask    = 0xff;
     }
-    bvh.instances = make_buffer(instances);
+    bvh.instances = make_buffer(context.cuda_stream, opinstances);
 
     // config
     auto build_input                       = OptixBuildInput{};
@@ -712,15 +825,16 @@ cubvh_data make_cutrace_bvh(cutrace_context& context, cusceneext_data& cuscene,
     check_result(optixAccelComputeMemoryUsage(context.optix_context,
         &accelerator_options, &build_input, (int)1, &accelerator_sizes));
 
-    auto compacted_size_buffer = make_buffer(1, (uint64_t*)nullptr);
+    auto compacted_size_buffer = make_buffer(
+        context.cuda_stream, 1, (uint64_t*)nullptr);
     auto readback_descriptor   = OptixAccelEmitDesc{};
     readback_descriptor.type   = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
     readback_descriptor.result = compacted_size_buffer.device_ptr();
 
     // build
     auto temporary_buffer = make_buffer(
-        accelerator_sizes.tempSizeInBytes, (byte*)nullptr);
-    auto bvh_buffer = make_buffer(
+        context.cuda_stream, accelerator_sizes.tempSizeInBytes, (byte*)nullptr);
+    auto bvh_buffer = make_buffer(context.cuda_stream,
         accelerator_sizes.outputSizeInBytes, (byte*)nullptr);
 
     auto& ibvh = bvh.instances_bvh;
@@ -729,16 +843,20 @@ cubvh_data make_cutrace_bvh(cutrace_context& context, cusceneext_data& cuscene,
         temporary_buffer.device_ptr(), temporary_buffer.size_in_bytes(),
         bvh_buffer.device_ptr(), bvh_buffer.size_in_bytes(), &ibvh.handle,
         &readback_descriptor, 1));
-    check_cusync();
+
+    // sync gpu
+    sync_gpu(context.cuda_stream);
 
     // compact
     auto compacted_size = download_buffer_value(compacted_size_buffer);
-
-    ibvh.buffer = make_buffer(compacted_size, (byte*)nullptr);
+    ibvh.buffer         = make_buffer(
+                context.cuda_stream, compacted_size, (byte*)nullptr);
     check_result(optixAccelCompact(context.optix_context,
         /*cuda_stream:*/ 0, ibvh.handle, ibvh.buffer.device_ptr(),
         ibvh.buffer.size_in_bytes(), &ibvh.handle));
-    check_cusync();
+
+    // sync gpu
+    sync_gpu(context.cuda_stream);
 
     // cleanup
     clear_buffer(bvh_buffer);
@@ -747,14 +865,14 @@ cubvh_data make_cutrace_bvh(cutrace_context& context, cusceneext_data& cuscene,
   }
 
   // sync gpu
-  check_cusync();
+  sync_gpu(context.cuda_stream);
 
   // done
   return bvh;
 }
 
 // Initialize state.
-cutrace_state make_cutrace_state(
+cutrace_state make_cutrace_state(cutrace_context& context,
     const scene_data& scene, const cutrace_params& params) {
   auto& camera = scene.cameras[params.camera];
   auto  state  = cutrace_state{};
@@ -766,17 +884,33 @@ cutrace_state make_cutrace_state(
     state.width  = (int)round(params.resolution * camera.aspect);
   }
   state.samples = 0;
-  state.image   = make_buffer(state.width * state.height, (vec4f*)nullptr);
-  state.albedo  = make_buffer(state.width * state.height, (vec3f*)nullptr);
-  state.normal  = make_buffer(state.width * state.height, (vec3f*)nullptr);
-  state.hits    = make_buffer(state.width * state.height, (int*)nullptr);
-  state.rngs    = make_buffer(state.width * state.height, (rng_state*)nullptr);
-  state.display = make_buffer(state.width * state.height, (vec4f*)nullptr);
+  state.image   = make_buffer(
+        context.cuda_stream, state.width * state.height, (vec4f*)nullptr);
+  state.albedo = make_buffer(
+      context.cuda_stream, state.width * state.height, (vec3f*)nullptr);
+  state.normal = make_buffer(
+      context.cuda_stream, state.width * state.height, (vec3f*)nullptr);
+  state.hits = make_buffer(
+      context.cuda_stream, state.width * state.height, (int*)nullptr);
+  state.rngs = make_buffer(
+      context.cuda_stream, state.width * state.height, (rng_state*)nullptr);
+  if (params.denoise) {
+    auto denoiser_sizes = OptixDenoiserSizes{};
+    check_result(optixDenoiserComputeMemoryResources(
+        context.denoiser, state.width, state.height, &denoiser_sizes));
+    state.denoised = make_buffer(
+        context.cuda_stream, state.width * state.height, (vec4f*)nullptr);
+    state.denoiser_state = make_buffer(
+        context.cuda_stream, denoiser_sizes.stateSizeInBytes, (byte*)nullptr);
+    state.denoiser_scratch = make_buffer(context.cuda_stream,
+        denoiser_sizes.withoutOverlapScratchSizeInBytes, (byte*)nullptr);
+  }
+  sync_gpu(context.cuda_stream);
   return state;
 };
 
-void reset_cutrace_state(cutrace_state& state, const scene_data& scene,
-    const cutrace_params& params) {
+void reset_cutrace_state(cutrace_context& context, cutrace_state& state,
+    const scene_data& scene, const cutrace_params& params) {
   auto& camera = scene.cameras[params.camera];
   if (camera.aspect >= 1) {
     state.width  = params.resolution;
@@ -786,16 +920,36 @@ void reset_cutrace_state(cutrace_state& state, const scene_data& scene,
     state.width  = (int)round(params.resolution * camera.aspect);
   }
   state.samples = 0;
-  resize_buffer(state.image, state.width * state.height, (vec4f*)nullptr);
-  resize_buffer(state.albedo, state.width * state.height, (vec3f*)nullptr);
-  resize_buffer(state.normal, state.width * state.height, (vec3f*)nullptr);
-  resize_buffer(state.hits, state.width * state.height, (int*)nullptr);
-  resize_buffer(state.rngs, state.width * state.height, (rng_state*)nullptr);
-  resize_buffer(state.display, state.width * state.height, (vec4f*)nullptr);
+  resize_buffer(context.cuda_stream, state.image, state.width * state.height,
+      (vec4f*)nullptr);
+  resize_buffer(context.cuda_stream, state.albedo, state.width * state.height,
+      (vec3f*)nullptr);
+  resize_buffer(context.cuda_stream, state.normal, state.width * state.height,
+      (vec3f*)nullptr);
+  resize_buffer(context.cuda_stream, state.hits, state.width * state.height,
+      (int*)nullptr);
+  resize_buffer(context.cuda_stream, state.rngs, state.width * state.height,
+      (rng_state*)nullptr);
+  if (params.denoise) {
+    auto denoiser_sizes = OptixDenoiserSizes{};
+    check_result(optixDenoiserComputeMemoryResources(
+        context.denoiser, state.width, state.height, &denoiser_sizes));
+    resize_buffer(context.cuda_stream, state.denoised,
+        state.width * state.height, (vec4f*)nullptr);
+    resize_buffer(context.cuda_stream, state.denoiser_state,
+        denoiser_sizes.stateSizeInBytes, (byte*)nullptr);
+    resize_buffer(context.cuda_stream, state.denoiser_scratch,
+        denoiser_sizes.withoutOverlapScratchSizeInBytes, (byte*)nullptr);
+  } else {
+    clear_buffer(state.denoised);
+    clear_buffer(state.denoiser_state);
+    clear_buffer(state.denoiser_scratch);
+  }
+  sync_gpu(context.cuda_stream);
 }
 
 // Init trace lights
-cutrace_lights make_cutrace_lights(
+cutrace_lights make_cutrace_lights(cutrace_context& context,
     const scene_data& scene, const cutrace_params& params) {
   auto lights    = make_trace_lights(scene, (const trace_params&)params);
   auto culights_ = vector<cutrace_light>{};
@@ -803,10 +957,11 @@ cutrace_lights make_cutrace_lights(
     auto& culight        = culights_.emplace_back();
     culight.instance     = light.instance;
     culight.environment  = light.environment;
-    culight.elements_cdf = make_buffer(light.elements_cdf);
+    culight.elements_cdf = make_buffer(context.cuda_stream, light.elements_cdf);
   }
   auto culights   = cutrace_lights{};
-  culights.lights = make_buffer(culights_);
+  culights.lights = make_buffer(context.cuda_stream, culights_);
+  sync_gpu(context.cuda_stream);
   return culights;
 }
 
@@ -815,10 +970,10 @@ image_data cutrace_image(
     const scene_data& scene, const cutrace_params& params) {
   // initialization
   auto context = make_cutrace_context(params);
-  auto cuscene = make_cutrace_scene(scene, params);
-  auto bvh     = make_cutrace_bvh(context, cuscene, scene, params);
-  auto state   = make_cutrace_state(scene, params);
-  auto lights  = make_cutrace_lights(scene, params);
+  auto cuscene = make_cutrace_scene(context, scene, params);
+  auto bvh     = make_cutrace_bvh(context, cuscene, params);
+  auto state   = make_cutrace_state(context, scene, params);
+  auto lights  = make_cutrace_lights(context, scene, params);
 
   // rendering
   trace_start(context, state, cuscene, bvh, lights, scene, params);
@@ -827,7 +982,21 @@ image_data cutrace_image(
   }
 
   // copy back image and return
-  return get_rendered_image(state);
+  return get_image(state);
+}
+
+// Get resulting render; the denoised buffer is used when the state has one
+image_data get_image(const cutrace_state& state) {
+  auto image = make_image(state.width, state.height, true);  // linear image
+  get_image(image, state);  // in-place overload picks the source buffer
+  return image;
+}
+void get_image(image_data& image, const cutrace_state& state) {
+  if (state.denoised.empty()) {  // no denoised buffer: download the raw render
+    download_buffer(state.image, image.pixels);  // only image.pixels is passed
+  } else {  // denoised buffer present: download it instead
+    download_buffer(state.denoised, image.pixels);
+  }
 }
 
 // Get resulting render
@@ -856,12 +1025,8 @@ void get_denoised_image(image_data& image, const cutrace_state& state) {
   get_rendered_image(image, state);
 
   // get albedo and normal
-  auto albedo = vector<vec3f>(image.pixels.size()),
-       normal = vector<vec3f>(image.pixels.size());
-  for (auto idx = 0; idx < state.width * state.height; idx++) {
-    albedo[idx] = state.albedo[idx];
-    normal[idx] = state.normal[idx];
-  }
+  auto albedo = download_buffer_vector(state.albedo);
+  auto normal = download_buffer_vector(state.normal);
 
   // Create a denoising filter
   oidn::FilterRef filter = device.newFilter("RT");  // ray tracing filter
@@ -910,6 +1075,41 @@ void get_normal_image(image_data& image, const cutrace_state& state) {
   }
 }
 
+// Denoise state.image into state.denoised, guided by albedo and normal
+void denoise_image(cutrace_context& context, cutrace_state& state) {
+  // denoiser setup; NOTE(review): repeated on every call - confirm intended
+  check_result(optixDenoiserSetup(context.denoiser, context.cuda_stream,
+      state.width, state.height, state.denoiser_state.device_ptr(),
+      state.denoiser_state.size_in_bytes(), state.denoiser_scratch.device_ptr(),
+      state.denoiser_scratch.size_in_bytes()));
+
+  // params, value-initialized
+  auto dparams = OptixDenoiserParams{};
+
+  // layers: float3 albedo/normal guides, float4 input and output
+  auto guides   = OptixDenoiserGuideLayer{};
+  guides.albedo = OptixImage2D{state.albedo.device_ptr(), (uint)state.width,
+      (uint)state.height, (uint)state.width * sizeof(vec3f), sizeof(vec3f),
+      OPTIX_PIXEL_FORMAT_FLOAT3};
+  guides.normal = OptixImage2D{state.normal.device_ptr(), (uint)state.width,
+      (uint)state.height, (uint)state.width * sizeof(vec3f), sizeof(vec3f),
+      OPTIX_PIXEL_FORMAT_FLOAT3};
+  auto layers   = OptixDenoiserLayer{};
+  layers.input  = OptixImage2D{state.image.device_ptr(), (uint)state.width,
+      (uint)state.height, (uint)state.width * sizeof(vec4f), sizeof(vec4f),
+      OPTIX_PIXEL_FORMAT_FLOAT4};
+  layers.output = OptixImage2D{state.denoised.device_ptr(), (uint)state.width,
+      (uint)state.height, (uint)state.width * sizeof(vec4f), sizeof(vec4f),
+      OPTIX_PIXEL_FORMAT_FLOAT4};
+
+  // denoiser execution on context.cuda_stream; no sync_gpu is done here
+  check_result(optixDenoiserInvoke(context.denoiser, context.cuda_stream,
+      &dparams, state.denoiser_state.device_ptr(),
+      state.denoiser_state.size_in_bytes(), &guides, &layers, 1, 0, 0,
+      state.denoiser_scratch.device_ptr(),
+      state.denoiser_scratch.size_in_bytes()));
+}
+
 bool is_display(const cutrace_context& context) {
   auto device = 0, is_display = 0;
   // check_result(cuDevice(&current_device));
@@ -927,14 +1127,19 @@ bool is_display(const cutrace_context& context) {
 // -----------------------------------------------------------------------------
 namespace yocto {
 
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4722)
+#endif
+
 static void exit_nocuda() { throw std::runtime_error{"Cuda not linked"}; }
 
-cusceneext_data::cusceneext_data(cusceneext_data&& other) { exit_nocuda(); }
-cusceneext_data& cusceneext_data::operator=(cusceneext_data&& other) {
+cuscene_data::cuscene_data(cuscene_data&& other) { exit_nocuda(); }
+cuscene_data& cuscene_data::operator=(cuscene_data&& other) {
   exit_nocuda();
   return *this;
 }
-cusceneext_data::~cusceneext_data() { exit_nocuda(); };
+cuscene_data::~cuscene_data() { exit_nocuda(); };
 
 cubvh_data::cubvh_data(cubvh_data&& other) { exit_nocuda(); }
 cubvh_data& cubvh_data::operator=(cubvh_data&& other) {
@@ -950,6 +1155,13 @@ cutrace_context& cutrace_context::operator=(cutrace_context&& other) {
 }
 cutrace_context::~cutrace_context() { exit_nocuda(); }
 
+cutrace_state::~cutrace_state() { exit_nocuda(); }  // NOTE(review): dtor throws
+cutrace_lights::~cutrace_lights() { exit_nocuda(); }  // NOTE(review): dtor throws
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
 image_data cutrace_image(
     const scene_data& scene, const cutrace_params& params) {
   exit_nocuda();
@@ -963,38 +1175,38 @@ cutrace_context make_cutrace_context(const cutrace_params& params) {
 }
 
 // Upload the scene to the GPU.
-cusceneext_data make_cutrace_scene(
+cuscene_data make_cutrace_scene(cutrace_context& context,
     const scene_data& scene, const cutrace_params& params) {
   exit_nocuda();
   return {};
 }
 
 // Update cameras
-void update_cutrace_cameras(cusceneext_data& cuscene, const scene_data& scene,
-    const cutrace_params& params) {
+void update_cutrace_cameras(cutrace_context& context, cuscene_data& cuscene,
+    const scene_data& scene, const cutrace_params& params) {
   exit_nocuda();
 }
 
 // Build the bvh acceleration structure.
-cubvh_data make_cutrace_bvh(cutrace_context& context, cusceneext_data& cuscene,
-    const scene_data& scene, const cutrace_params& params) {
+cubvh_data make_cutrace_bvh(cutrace_context& context,
+    const cuscene_data& cuscene, const cutrace_params& params) {
   exit_nocuda();
   return {};
 }
 
 // Initialize state.
-cutrace_state make_cutrace_state(
+cutrace_state make_cutrace_state(cutrace_context& context,
     const scene_data& scene, const cutrace_params& params) {
   exit_nocuda();
   return {};
 }
-void reset_cutrace_state(cutrace_state& state, const scene_data& scene,
-    const cutrace_params& params) {
+void reset_cutrace_state(cutrace_context& context, cutrace_state& state,
+    const scene_data& scene, const cutrace_params& params) {
   exit_nocuda();
 }
 
 // Initialize lights.
-cutrace_lights make_cutrace_lights(
+cutrace_lights make_cutrace_lights(cutrace_context& context,
     const scene_data& scene, const cutrace_params& params) {
   exit_nocuda();
   return {};
@@ -1016,6 +1228,13 @@ void trace_samples(cutrace_context& context, cutrace_state& state,
   exit_nocuda();
 }
 
+// Get render; this version always throws through exit_nocuda
+image_data get_image(const cutrace_state& state) {
+  exit_nocuda();
+  return {};  // not reached: exit_nocuda throws
+}
+void get_image(image_data& image, const cutrace_state& state) { exit_nocuda(); }  // always throws
+
 // Get resulting render
 image_data get_rendered_image(const cutrace_state& state) {
   exit_nocuda();
diff --git a/libs/yocto/yocto_cutrace.cu b/libs/yocto/yocto_cutrace.cu
index ac129b03f..8456bc1ae 100644
--- a/libs/yocto/yocto_cutrace.cu
+++ b/libs/yocto/yocto_cutrace.cu
@@ -1038,15 +1038,8 @@ namespace yocto {
 struct rng_state {
   uint64_t state = 0x853c49e6748fea9bULL;
   uint64_t inc   = 0xda3e39cb94b95bdbULL;
-
-  rng_state() = default;
-  rng_state(uint64_t state, uint64_t inc);
 };
 
-// PCG random numbers from http://www.pcg-random.org/
-inline rng_state::rng_state(uint64_t state, uint64_t inc)
-    : state{state}, inc{inc} {}
-
 // Next random number, used internally only.
 inline uint32_t _advance_rng(rng_state& rng) {
   uint64_t oldstate = rng.state;
@@ -2051,7 +2044,7 @@ inline float sample_phasefunction_pdf(
 namespace yocto {
 
 template <typename T>
-struct cubuffer {
+struct cuspan {
   inline bool     empty() const { return _size == 0; }
   inline size_t   size() const { return _size; }
   inline T&       operator[](int idx) { return _data[idx]; }
@@ -2126,15 +2119,17 @@ namespace yocto {
 constexpr int invalidid = -1;
 
 struct cutrace_state {
-  int                 width   = 0;
-  int                 height  = 0;
-  int                 samples = 0;
-  cubuffer<vec4f>     image   = {};
-  cubuffer<vec3f>     albedo  = {};
-  cubuffer<vec3f>     normal  = {};
-  cubuffer<int>       hits    = {};
-  cubuffer<rng_state> rngs    = {};
-  cubuffer<vec4f>     display = {};
+  int               width            = 0;
+  int               height           = 0;
+  int               samples          = 0;
+  cuspan<vec4f>     image            = {};
+  cuspan<vec3f>     albedo           = {};
+  cuspan<vec3f>     normal           = {};
+  cuspan<int>       hits             = {};
+  cuspan<rng_state> rngs             = {};
+  cuspan<vec4f>     denoised         = {};
+  cuspan<byte>      denoiser_state   = {};
+  cuspan<byte>      denoiser_scratch = {};
 };
 
 struct cucamera_data {
@@ -2188,11 +2183,11 @@ struct cuinstance_data {
 };
 
 struct cushape_data {
-  cubuffer<vec3f> positions = {};
-  cubuffer<vec3f> normals   = {};
-  cubuffer<vec2f> texcoords = {};
-  cubuffer<vec4f> colors    = {};
-  cubuffer<vec3i> triangles = {};
+  cuspan<vec3f> positions = {};
+  cuspan<vec3f> normals   = {};
+  cuspan<vec2f> texcoords = {};
+  cuspan<vec4f> colors    = {};
+  cuspan<vec3i> triangles = {};
 };
 
 struct cuenvironment_data {
@@ -2202,12 +2197,12 @@ struct cuenvironment_data {
 };
 
 struct cuscene_data {
-  cubuffer<cucamera_data>      cameras      = {};
-  cubuffer<cutexture_data>     textures     = {};
-  cubuffer<cumaterial_data>    materials    = {};
-  cubuffer<cushape_data>       shapes       = {};
-  cubuffer<cuinstance_data>    instances    = {};
-  cubuffer<cuenvironment_data> environments = {};
+  cuspan<cucamera_data>      cameras      = {};
+  cuspan<cutexture_data>     textures     = {};
+  cuspan<cumaterial_data>    materials    = {};
+  cuspan<cushape_data>       shapes       = {};
+  cuspan<cuinstance_data>    instances    = {};
+  cuspan<cuenvironment_data> environments = {};
 };
 
 // Type of tracing algorithm
@@ -2260,14 +2255,14 @@ using cutrace_bvh = OptixTraversableHandle;
 
 // light
 struct cutrace_light {
-  int             instance     = invalidid;
-  int             environment  = invalidid;
-  cubuffer<float> elements_cdf = {};
+  int           instance     = invalidid;
+  int           environment  = invalidid;
+  cuspan<float> elements_cdf = {};
 };
 
 // lights
 struct cutrace_lights {
-  cubuffer<cutrace_light> lights = {};
+  cuspan<cutrace_light> lights = {};
 };
 
 struct cutrace_globals {
@@ -2346,52 +2341,6 @@ struct material_point {
   float         trdepth      = 0.01f;
 };
 
-// Evaluate material
-static material_point eval_material(const scene_data& scene,
-    const material_data& material, const vec2f& texcoord,
-    const vec4f& color_shp) {
-  // evaluate textures
-  auto emission_tex = eval_texture(
-      scene, material.emission_tex, texcoord, true);
-  auto color_tex     = eval_texture(scene, material.color_tex, texcoord, true);
-  auto roughness_tex = eval_texture(
-      scene, material.roughness_tex, texcoord, false);
-  auto scattering_tex = eval_texture(
-      scene, material.scattering_tex, texcoord, true);
-
-  // material point
-  auto point         = material_point{};
-  point.type         = material.type;
-  point.emission     = material.emission * xyz(emission_tex);
-  point.color        = material.color * xyz(color_tex) * xyz(color_shp);
-  point.opacity      = material.opacity * color_tex.w * color_shp.w;
-  point.metallic     = material.metallic * roughness_tex.z;
-  point.roughness    = material.roughness * roughness_tex.y;
-  point.roughness    = point.roughness * point.roughness;
-  point.ior          = material.ior;
-  point.scattering   = material.scattering * xyz(scattering_tex);
-  point.scanisotropy = material.scanisotropy;
-  point.trdepth      = material.trdepth;
-
-  // volume density
-  if (material.type == material_type::refractive ||
-      material.type == material_type::volumetric ||
-      material.type == material_type::subsurface) {
-    point.density = -log(clamp(point.color, 0.0001f, 1.0f)) / point.trdepth;
-  } else {
-    point.density = {0, 0, 0};
-  }
-
-  // fix roughness
-  if (point.type == material_type::matte ||
-      point.type == material_type::gltfpbr ||
-      point.type == material_type::glossy) {
-    point.roughness = clamp(point.roughness, min_roughness, 1.0f);
-  }
-
-  return point;
-}
-
 // Eval position
 static vec3f eval_position(const scene_data& scene,
     const instance_data& instance, int element, const vec2f& uv) {
@@ -4013,26 +3962,6 @@ static trace_result trace_falsecolor(const scene_data& scene,
   return {srgb_to_rgb(result), true, material.color, normal};
 }
 
-// Trace a single ray from the camera using the given algorithm.
-using sampler_func = trace_result (*)(const scene_data& scene,
-    const trace_bvh& bvh, const trace_lights& lights, const ray3f& ray,
-    rng_state& rng, const trace_params& params);
-static sampler_func get_trace_sampler_func(const trace_params& params) {
-  switch (params.sampler) {
-    case trace_sampler_type::path: return trace_path;
-    case trace_sampler_type::pathdirect: return trace_pathdirect;
-    case trace_sampler_type::pathmis: return trace_pathmis;
-    case trace_sampler_type::naive: return trace_naive;
-    case trace_sampler_type::eyelight: return trace_eyelight;
-    case trace_sampler_type::eyelightao: return trace_eyelightao;
-    case trace_sampler_type::furnace: return trace_furnace;
-    case trace_sampler_type::falsecolor: return trace_falsecolor;
-    default: {
-      return nullptr;
-    }
-  }
-}
-
 static trace_result trace_sampler(const scene_data& scene, const trace_bvh& bvh,
     const trace_lights& lights, const ray3f& ray, rng_state& rng,
     const trace_params& params) {
diff --git a/libs/yocto/yocto_cutrace.h b/libs/yocto/yocto_cutrace.h
index 8aba21ad1..20f1c0029 100644
--- a/libs/yocto/yocto_cutrace.h
+++ b/libs/yocto/yocto_cutrace.h
@@ -85,7 +85,7 @@ namespace yocto {
 
 // forward declarations
 struct cuscene_data;
-struct cusceneext_data;
+struct cuscene_data;
 struct cubvh_data;
 struct cutrace_state;
 struct cutrace_lights;
@@ -95,23 +95,23 @@ struct cutrace_context;
 cutrace_context make_cutrace_context(const cutrace_params& params);
 
 // Upload the scene to the GPU.
-cusceneext_data make_cutrace_scene(
+cuscene_data make_cutrace_scene(cutrace_context& context,
+    const scene_data& scene, const cutrace_params& params);
+void update_cutrace_cameras(cutrace_context& context, cuscene_data& cuscene,
     const scene_data& scene, const cutrace_params& params);
-void update_cutrace_cameras(cusceneext_data& cuscene, const scene_data& scene,
-    const cutrace_params& params);
 
 // Build the bvh acceleration structure.
-cubvh_data make_cutrace_bvh(cutrace_context& context, cusceneext_data& cuscene,
-    const scene_data& scene, const cutrace_params& params);
+cubvh_data make_cutrace_bvh(cutrace_context& context,
+    const cuscene_data& cuscene, const cutrace_params& params);
 
 // Initialize state.
-cutrace_state make_cutrace_state(
+cutrace_state make_cutrace_state(cutrace_context& context,
+    const scene_data& scene, const cutrace_params& params);
+void reset_cutrace_state(cutrace_context& context, cutrace_state& state,
     const scene_data& scene, const cutrace_params& params);
-void reset_cutrace_state(cutrace_state& state, const scene_data& scene,
-    const cutrace_params& params);
 
 // Initialize lights.
-cutrace_lights make_cutrace_lights(
+cutrace_lights make_cutrace_lights(cutrace_context& context,
     const scene_data& scene, const cutrace_params& params);
 
 // Start rendering an image.
@@ -126,20 +126,23 @@ void trace_samples(cutrace_context& context, cutrace_state& state,
     const cutrace_lights& lights, const scene_data& scene,
     const cutrace_params& params);
 
-// Get resulting render
+// Get resulting render, denoised if requested
+image_data get_image(const cutrace_state& state);
+void       get_image(image_data& image, const cutrace_state& state);
+
+// Get internal images from state
 image_data get_rendered_image(const cutrace_state& state);
 void       get_rendered_image(image_data& image, const cutrace_state& state);
-
-// Get denoised result
 image_data get_denoised_image(const cutrace_state& state);
 void       get_denoised_image(image_data& image, const cutrace_state& state);
-
-// Get denoising buffers
 image_data get_albedo_image(const cutrace_state& state);
 void       get_albedo_image(image_data& image, const cutrace_state& state);
 image_data get_normal_image(const cutrace_state& state);
 void       get_normal_image(image_data& image, const cutrace_state& state);
 
+// Denoise the rendered image into the state's denoised buffer
+void denoise_image(cutrace_context& context, cutrace_state& state);
+
 // check if display
 bool is_display(const cutrace_context& context);
 
@@ -194,6 +197,7 @@ using OptixModule             = void*;
 using OptixShaderBindingTable = void*;
 using CUarray                 = void*;
 using CUtexObject             = void*;
+using OptixDenoiser           = void*;
 
 #endif
 
@@ -204,11 +208,12 @@ namespace yocto {
 
 // cuda buffer
 template <typename T>
-struct cubuffer {
+struct cuspan {
+  bool        empty() const { return _size == 0; }
   size_t      size() const { return _size; }
   CUdeviceptr device_ptr() const { return _data; }
   size_t      size_in_bytes() const { return _size * sizeof(T); }
-  void        swap(cubuffer& other) {
+  void        swap(cuspan& other) {
     std::swap(_data, other._data);
     std::swap(_size, other._size);
   }
@@ -263,17 +268,17 @@ struct cumaterial_data {
 };
 
 struct cuinstance_data {
-  frame3f frame;
-  int     shape;
-  int     material;
+  frame3f frame    = {{1, 0, 0}, {0, 1, 0}, {0, 0, 1}, {0, 0, 0}};
+  int     shape    = invalidid;
+  int     material = invalidid;
 };
 
 struct cushape_data {
-  cubuffer<vec3f> positions = {};
-  cubuffer<vec3f> normals   = {};
-  cubuffer<vec2f> texcoords = {};
-  cubuffer<vec4f> colors    = {};
-  cubuffer<vec3i> triangles = {};
+  cuspan<vec3f> positions = {};
+  cuspan<vec3f> normals   = {};
+  cuspan<vec2f> texcoords = {};
+  cuspan<vec4f> colors    = {};
+  cuspan<vec3i> triangles = {};
 };
 
 struct cuenvironment_data {
@@ -283,33 +288,28 @@ struct cuenvironment_data {
 };
 
 struct cuscene_data {
-  cubuffer<cucamera_data>      cameras      = {};
-  cubuffer<cutexture_data>     textures     = {};
-  cubuffer<cumaterial_data>    materials    = {};
-  cubuffer<cushape_data>       shapes       = {};
-  cubuffer<cuinstance_data>    instances    = {};
-  cubuffer<cuenvironment_data> environments = {};
-};
-
-struct cusceneext_data : cuscene_data {
-  vector<cutexture_data> cutextures = {};
-  vector<cushape_data>   cushapes   = {};
-
-  cusceneext_data() {}
-  cusceneext_data(cusceneext_data&&);
-  cusceneext_data& operator=(cusceneext_data&&);
-  ~cusceneext_data();
+  cuspan<cucamera_data>      cameras      = {};
+  cuspan<cutexture_data>     textures     = {};
+  cuspan<cumaterial_data>    materials    = {};
+  cuspan<cushape_data>       shapes       = {};
+  cuspan<cuinstance_data>    instances    = {};
+  cuspan<cuenvironment_data> environments = {};
+
+  cuscene_data() {}
+  cuscene_data(cuscene_data&&);
+  cuscene_data& operator=(cuscene_data&&);
+  ~cuscene_data();
 };
 
 struct cubvh_tree {
-  cubuffer<byte>         buffer = {};
+  cuspan<byte>           buffer = {};
   OptixTraversableHandle handle = 0;
 };
 
 struct cubvh_data {
-  cubuffer<OptixInstance> instances     = {};
-  cubvh_tree              instances_bvh = {};
-  vector<cubvh_tree>      shapes_bvhs   = {};
+  cuspan<OptixInstance> instances     = {};
+  cubvh_tree            instances_bvh = {};
+  vector<cubvh_tree>    shapes_bvhs   = {};
 
   cubvh_data() {}
   cubvh_data(cubvh_data&&);
@@ -319,27 +319,39 @@ struct cubvh_data {
 
 // state
 struct cutrace_state {
-  int                 width   = 0;
-  int                 height  = 0;
-  int                 samples = 0;
-  cubuffer<vec4f>     image   = {};
-  cubuffer<vec3f>     albedo  = {};
-  cubuffer<vec3f>     normal  = {};
-  cubuffer<int>       hits    = {};
-  cubuffer<rng_state> rngs    = {};
-  cubuffer<vec4f>     display = {};
+  int               width            = 0;
+  int               height           = 0;
+  int               samples          = 0;
+  cuspan<vec4f>     image            = {};
+  cuspan<vec3f>     albedo           = {};
+  cuspan<vec3f>     normal           = {};
+  cuspan<int>       hits             = {};
+  cuspan<rng_state> rngs             = {};
+  cuspan<vec4f>     denoised         = {};
+  cuspan<byte>      denoiser_state   = {};
+  cuspan<byte>      denoiser_scratch = {};
+
+  cutrace_state() {}
+  cutrace_state(cutrace_state&&);
+  cutrace_state& operator=(cutrace_state&&);
+  ~cutrace_state();
 };
 
 // light
 struct cutrace_light {
-  int             instance     = invalidid;
-  int             environment  = invalidid;
-  cubuffer<float> elements_cdf = {};
+  int           instance     = invalidid;
+  int           environment  = invalidid;
+  cuspan<float> elements_cdf = {};
 };
 
 // lights
 struct cutrace_lights {
-  cubuffer<cutrace_light> lights = {};
+  cuspan<cutrace_light> lights = {};
+
+  cutrace_lights() {}
+  cutrace_lights(cutrace_lights&&);
+  cutrace_lights& operator=(cutrace_lights&&);
+  ~cutrace_lights();
 };
 
 // device params
@@ -382,13 +394,16 @@ struct cutrace_context {
   OptixProgramGroup hitgroup_program = nullptr;
 
   // stb
-  cubuffer<cutrace_stbrecord> raygen_records   = {};
-  cubuffer<cutrace_stbrecord> miss_records     = {};
-  cubuffer<cutrace_stbrecord> hitgroup_records = {};
-  OptixShaderBindingTable     binding_table    = {};
+  cuspan<cutrace_stbrecord> raygen_records   = {};
+  cuspan<cutrace_stbrecord> miss_records     = {};
+  cuspan<cutrace_stbrecord> hitgroup_records = {};
+  OptixShaderBindingTable   binding_table    = {};
 
   // global buffer
-  cubuffer<cutrace_globals> globals_buffer = {};
+  cuspan<cutrace_globals> globals_buffer = {};
+
+  // denoiser
+  OptixDenoiser denoiser = nullptr;
 
   cutrace_context() {}
   cutrace_context(cutrace_context&&);
diff --git a/libs/yocto/yocto_gui.cpp b/libs/yocto/yocto_gui.cpp
index 6a18cf4eb..5616031d0 100644
--- a/libs/yocto/yocto_gui.cpp
+++ b/libs/yocto/yocto_gui.cpp
@@ -712,14 +712,14 @@ void show_trace_gui(const string& title, const string& name, scene_data& scene,
           }
          });
         state.samples += params.batch;
+        if (params.denoise && !state.denoised.empty()) {
+          denoise_image(state.denoised, state.width, state.height, state.image,
+               state.albedo, state.normal);
+        }
         if (!render_stop) {
           auto lock      = std::lock_guard{render_mutex};
           render_current = state.samples;
-          if (!params.denoise || render_stop) {
-            get_rendered_image(render, state);
-          } else {
-            get_denoised_image(render, state);
-          }
+          get_image(render, state);
           image         = render;
           render_update = true;
         }
@@ -830,13 +830,13 @@ void show_cutrace_gui(const string& title, const string& name,
   auto context = make_cutrace_context(params);
 
   // upload scene to the gpu
-  auto cuscene = make_cutrace_scene(scene, params);
+  auto cuscene = make_cutrace_scene(context, scene, params);
 
   // build bvh
-  auto bvh = make_cutrace_bvh(context, cuscene, scene, params);
+  auto bvh = make_cutrace_bvh(context, cuscene, params);
 
   // init lights
-  auto lights = make_cutrace_lights(scene, params);
+  auto lights = make_cutrace_lights(context, scene, params);
 
   // fix renderer type if no lights
   // if (lights.lights.empty() && is_sampler_lit(params)) {
@@ -844,13 +844,13 @@ void show_cutrace_gui(const string& title, const string& name,
   // }
 
   // state
-  auto state = make_cutrace_state(scene, params);
+  auto state = make_cutrace_state(context, scene, params);
 
   // preview state
   auto pparams = params;
   pparams.resolution /= params.pratio;
   pparams.samples = 1;
-  auto pstate     = make_cutrace_state(scene, pparams);
+  auto pstate     = make_cutrace_state(context, scene, pparams);
 
   // init state
   auto image = make_image(state.width, state.height, true);
@@ -877,7 +877,7 @@ void show_cutrace_gui(const string& title, const string& name,
     auto pparams = params;
     pparams.resolution /= params.pratio;
     pparams.samples = 1;
-    reset_cutrace_state(pstate, scene, pparams);
+    reset_cutrace_state(context, pstate, scene, pparams);
     trace_start(context, pstate, cuscene, bvh, lights, scene, pparams);
     trace_samples(context, pstate, cuscene, bvh, lights, scene, pparams);
     auto preview = get_rendered_image(pstate);
@@ -887,7 +887,7 @@ void show_cutrace_gui(const string& title, const string& name,
            pj           = clamp(j / params.pratio, 0, preview.height - 1);
       image.pixels[idx] = preview.pixels[pj * preview.width + pi];
     }
-    reset_cutrace_state(state, scene, params);
+    reset_cutrace_state(context, state, scene, params);
     return true;
   };
 
@@ -898,11 +898,7 @@ void show_cutrace_gui(const string& title, const string& name,
       trace_start(context, state, cuscene, bvh, lights, scene, params);
     }
     trace_samples(context, state, cuscene, bvh, lights, scene, params);
-    if (!params.denoise) {
-      get_rendered_image(image, state);
-    } else {
-      get_denoised_image(image, state);
-    }
+    get_image(image, state);
     return true;
   };
 
@@ -966,7 +962,7 @@ void show_cutrace_gui(const string& title, const string& name,
     auto camera = scene.cameras[params.camera];
     if (uiupdate_camera_params(input, camera)) {
       scene.cameras[params.camera] = camera;
-      update_cutrace_cameras(cuscene, scene, params);
+      update_cutrace_cameras(context, cuscene, scene, params);
       if (render_preview()) set_image(glimage, image);
     }
   };
diff --git a/libs/yocto/yocto_trace.cpp b/libs/yocto/yocto_trace.cpp
index dad3920ff..fd133cfe0 100644
--- a/libs/yocto/yocto_trace.cpp
+++ b/libs/yocto/yocto_trace.cpp
@@ -1437,6 +1437,9 @@ trace_state make_trace_state(
   for (auto& rng : state.rngs) {
     rng = make_rng(params.seed, rand1i(rng_, 1 << 31) / 2 + 1);
   }
+  if (params.denoise) {
+    state.denoised.assign(state.width * state.height, {0, 0, 0, 0});
+  }
   return state;
 }
 
@@ -1509,7 +1512,7 @@ image_data trace_image(const scene_data& scene, const trace_params& params) {
   for (auto sample = 0; sample < params.samples; sample++) {
     trace_samples(state, scene, bvh, lights, params);
   }
-  return get_rendered_image(state);
+  return get_image(state);
 }
 
 // Progressively compute an image by calling trace_samples multiple times.
@@ -1533,6 +1536,10 @@ void trace_samples(trace_state& state, const scene_data& scene,
     });
   }
   state.samples += params.batch;
+  if (params.denoise && !state.denoised.empty()) {
+    denoise_image(state.denoised, state.width, state.height, state.image,
+        state.albedo, state.normal);
+  }
 }
 
 // Check image type
@@ -1544,6 +1551,28 @@ static void check_image(
     throw std::invalid_argument{
         linear ? "expected linear image" : "expected srgb image"};
 }
+template <typename T>  // size check for raw pixel buffers of any pixel type T
+static void check_image(const vector<T>& image, int width, int height) {
+  if (width < 0 || height < 0 || image.size() != (size_t)width * (size_t)height)
+    throw std::invalid_argument{"image should have the same size"};
+}
+
+// Build a new linear image holding the final result (denoised if present)
+image_data get_image(const trace_state& state) {
+  auto result = make_image(state.width, state.height, true);
+  get_image(result, state);
+  return result;
+}
+void get_image(image_data& image, const trace_state& state) {
+  // Use the denoised buffer when the state has one; an empty buffer means
+  // there is no denoised result, so the raw render is returned instead.
+  const auto& source = state.denoised.empty() ? state.image : state.denoised;
+  // The result has the size of the state and is always tagged as linear.
+  image.width  = state.width;
+  image.height = state.height;
+  image.linear = true;
+  image.pixels = source;
+}
 
 // Get resulting render
 image_data get_rendered_image(const trace_state& state) {
@@ -1629,13 +1658,13 @@ void get_normal_image(image_data& normal, const trace_state& state) {
 }
 
 // Denoise image
-image_data denoise_rendered_image(const image_data& render,
-    const image_data& albedo, const image_data& normal) {
+image_data denoise_image(const image_data& render, const image_data& albedo,
+    const image_data& normal) {
   auto denoised = make_image(render.width, render.height, render.linear);
-  denoise_rendered_image(denoised, render, albedo, normal);
+  denoise_image(denoised, render, albedo, normal);
   return denoised;
 }
-void denoise_rendered_image(image_data& denoised, const image_data& render,
+void denoise_image(image_data& denoised, const image_data& render,
     const image_data& albedo, const image_data& normal) {
   check_image(denoised, render.width, render.height, render.linear);
   check_image(albedo, render.width, render.height, albedo.linear);
@@ -1673,4 +1702,40 @@ void denoise_rendered_image(image_data& denoised, const image_data& render,
 #endif
 }
 
+void denoise_image(vector<vec4f>& denoised, int width, int height,
+    const vector<vec4f>& render, const vector<vec3f>& albedo,
+    const vector<vec3f>& normal) {
+  check_image(denoised, width, height);
+  check_image(render, width, height);
+  check_image(albedo, width, height);
+  check_image(normal, width, height);
+#if YOCTO_DENOISE
+  // Byte strides: color and output pixels are vec4f, feature pixels are vec3f.
+  const auto rgba_pixel = sizeof(vec4f), rgba_row = sizeof(vec4f) * width;
+  const auto rgb_pixel = sizeof(vec3f), rgb_row = sizeof(vec3f) * width;
+  // Intel Open Image Denoise device that owns the filter.
+  auto device = oidn::newDevice();
+  device.commit();
+  // Copy the render first: the output is declared Float3 at a vec4f stride.
+  denoised = render;
+  // Configure the "RT" (ray tracing) filter over the shared buffers.
+  auto filter = device.newFilter("RT");
+  filter.setImage("color", const_cast<vec4f*>(render.data()),
+      oidn::Format::Float3, width, height, 0, rgba_pixel, rgba_row);
+  filter.setImage("albedo", const_cast<vec3f*>(albedo.data()),
+      oidn::Format::Float3, width, height, 0, rgb_pixel, rgb_row);
+  filter.setImage("normal", const_cast<vec3f*>(normal.data()),
+      oidn::Format::Float3, width, height, 0, rgb_pixel, rgb_row);
+  filter.setImage("output", denoised.data(), oidn::Format::Float3, width,
+      height, 0, rgba_pixel, rgba_row);
+  filter.set("inputScale", 1.0f);  // keep the input scale fixed
+  filter.set("hdr", true);         // the render is HDR
+  filter.commit();
+  // Run the denoiser; the result lands in denoised.
+  filter.execute();
+#else
+  denoised = render;  // no denoiser in this build: pass the render through
+#endif
+}
+
 }  // namespace yocto
diff --git a/libs/yocto/yocto_trace.h b/libs/yocto/yocto_trace.h
index 04576a6f5..efcf7095d 100644
--- a/libs/yocto/yocto_trace.h
+++ b/libs/yocto/yocto_trace.h
@@ -144,14 +144,15 @@ bool is_sampler_lit(const trace_params& params);
 
 // Trace state
 struct trace_state {
-  int               width   = 0;
-  int               height  = 0;
-  int               samples = 0;
-  vector<vec4f>     image   = {};
-  vector<vec3f>     albedo  = {};
-  vector<vec3f>     normal  = {};
-  vector<int>       hits    = {};
-  vector<rng_state> rngs    = {};
+  int               width    = 0;   // image width in pixels
+  int               height   = 0;   // image height in pixels
+  int               samples  = 0;   // advanced by params.batch in trace_samples
+  vector<vec4f>     image    = {};  // raw render, returned when not denoising
+  vector<vec3f>     albedo   = {};  // albedo buffer passed to the denoiser
+  vector<vec3f>     normal   = {};  // normal buffer passed to the denoiser
+  vector<int>       hits     = {};  // presumably hit counts - TODO confirm
+  vector<rng_state> rngs     = {};  // generators seeded in make_trace_state
+  vector<vec4f>     denoised = {};  // denoised render; empty when unused
 };
 
 // Initialize state.
@@ -173,25 +174,28 @@ void trace_sample(trace_state& state, const scene_data& scene,
     const trace_bvh& bvh, const trace_lights& lights, int i, int j, int sample,
     const trace_params& params);
 
-// Get resulting render
+// Get the resulting image: the denoised buffer if present, else the raw render
+image_data get_image(const trace_state& state);
+void       get_image(image_data& image, const trace_state& state);
+
+// Get internal images from state
 image_data get_rendered_image(const trace_state& state);
 void       get_rendered_image(image_data& image, const trace_state& state);
-
-// Get denoised result
 image_data get_denoised_image(const trace_state& state);
 void       get_denoised_image(image_data& image, const trace_state& state);
-
-// Get denoising buffers
 image_data get_albedo_image(const trace_state& state);
 void       get_albedo_image(image_data& image, const trace_state& state);
 image_data get_normal_image(const trace_state& state);
 void       get_normal_image(image_data& image, const trace_state& state);
 
 // Denoise image
-image_data denoise_rendered_image(const image_data& render,
-    const image_data& albedo, const image_data& normal);
-void       denoise_rendered_image(image_data& image, const image_data& render,
+image_data denoise_image(const image_data& render, const image_data& albedo,
+    const image_data& normal);
+void       denoise_image(image_data& image, const image_data& render,
           const image_data& albedo, const image_data& normal);
+void       denoise_image(vector<vec4f>& denoised, int width, int height,
+          const vector<vec4f>& render, const vector<vec3f>& albedo,
+          const vector<vec3f>& normal);
 
 }  // namespace yocto
 
@@ -302,17 +306,6 @@ namespace yocto {
   return get_normal_image(image, state);
 }
 
-// Denoise image
-[[deprecated]] inline image_data denoise_render(const image_data& render,
-    const image_data& albedo, const image_data& normal) {
-  return denoise_rendered_image(render, albedo, normal);
-}
-[[deprecated]] inline void denoise_render(image_data& image,
-    const image_data& render, const image_data& albedo,
-    const image_data& normal) {
-  return denoise_rendered_image(image, render, albedo, normal);
-}
-
 }  // namespace yocto
 
 #endif