From b4dcf83ec4738ea2398071c0c64d9b0fb4d399a6 Mon Sep 17 00:00:00 2001
From: Lukas Stockner
Date: Fri, 13 Jun 2014 14:38:59 +0200
Subject: [PATCH] Adding metropolis sampling and adaptive sampling/stopping

---
 intern/cycles/app/cycles_xml.cpp          |  11 +
 intern/cycles/blender/addon/presets.py    |   3 +
 intern/cycles/blender/addon/properties.py |  62 +++
 intern/cycles/blender/addon/ui.py         |  37 ++
 intern/cycles/blender/blender_session.cpp |   7 +-
 intern/cycles/blender/blender_sync.cpp    |  24 +-
 intern/cycles/device/device.h             |   1 +
 intern/cycles/device/device_cpu.cpp       | 654 ++++++++++++++++++++----------
 intern/cycles/device/device_cuda.cpp      | 322 ++++++++++++++-
 intern/cycles/device/device_memory.h      |  11 +-
 intern/cycles/device/device_opencl.cpp    |   4 +
 intern/cycles/device/device_task.cpp      |   7 +-
 intern/cycles/device/device_task.h        |  13 +-
 intern/cycles/kernel/CMakeLists.txt       |   2 +
 intern/cycles/kernel/kernel.cpp           |  19 +
 intern/cycles/kernel/kernel.cu            |  93 ++++-
 intern/cycles/kernel/kernel.h             |  33 ++
 intern/cycles/kernel/kernel_avx.cpp       |  18 +
 intern/cycles/kernel/kernel_avx2.cpp      |  15 +
 intern/cycles/kernel/kernel_camera.h      |   2 +-
 intern/cycles/kernel/kernel_compat_cpu.h  |  36 ++
 intern/cycles/kernel/kernel_compat_cuda.h |  28 ++
 intern/cycles/kernel/kernel_metropolis.h  | 268 ++++++++++++
 intern/cycles/kernel/kernel_passes.h      | 320 ++++++++++++---
 intern/cycles/kernel/kernel_path.h        | 118 +++++-
 intern/cycles/kernel/kernel_random.h      |  29 +-
 intern/cycles/kernel/kernel_sse2.cpp      |  15 +
 intern/cycles/kernel/kernel_sse3.cpp      |  15 +
 intern/cycles/kernel/kernel_sse41.cpp     |  15 +
 intern/cycles/kernel/kernel_textures.h    |   5 +-
 intern/cycles/kernel/kernel_types.h       |  76 +++-
 intern/cycles/kernel/kernel_volume.h      |   4 +-
 intern/cycles/kernel/svm/svm_image.h      |   2 +-
 intern/cycles/render/buffers.cpp          | 101 ++++-
 intern/cycles/render/buffers.h            |  13 +-
 intern/cycles/render/film.cpp             |  21 +-
 intern/cycles/render/image.h              |   4 +-
 intern/cycles/render/integrator.cpp       |  15 +
 intern/cycles/render/integrator.h         |   7 +-
 intern/cycles/render/scene.h              |   1 +
 intern/cycles/render/session.cpp          | 109 ++++-
 intern/cycles/render/session.h            |  19 +-
 intern/cycles/render/tile.cpp             |  77 +++-
 intern/cycles/render/tile.h               |  30 +-
 intern/cycles/util/CMakeLists.txt         |   6 +
 intern/cycles/util/util_color.h           |  24 ++
 intern/cycles/util/util_hash.h            |   4 +-
 intern/cycles/util/util_importance.cpp    | 301 ++++++++++++++
 intern/cycles/util/util_importance.h      |  50 +++
 intern/cycles/util/util_list.h            |  11 +
 intern/cycles/util/util_metropolis.h      |  47 +++
 51 files changed, 2730 insertions(+), 379 deletions(-)
 create mode 100644 intern/cycles/kernel/kernel_metropolis.h
 create mode 100644 intern/cycles/util/util_importance.cpp
 create mode 100644 intern/cycles/util/util_importance.h
 create mode 100644 intern/cycles/util/util_metropolis.h

diff --git a/intern/cycles/app/cycles_xml.cpp b/intern/cycles/app/cycles_xml.cpp
index 915ef96..3d88ffc 100644
--- a/intern/cycles/app/cycles_xml.cpp
+++ b/intern/cycles/app/cycles_xml.cpp
@@ -283,6 +283,17 @@ static void xml_read_integrator(const XMLReadState& state, pugi::xml_node node)
 		xml_read_bool(&integrator->sample_all_lights_direct, node, "sample_all_lights_direct");
 		xml_read_bool(&integrator->sample_all_lights_indirect, node, "sample_all_lights_indirect");
 	}
+
+	/* Metropolis Path Tracing */
+	bool metropolis = false;
+	xml_read_bool(&metropolis, node, "metropolis");
+
+	if(metropolis) {
+		integrator->method = Integrator::METROPOLIS_PATH;
+		xml_read_float(&integrator->image_mutation_range, node, "image_mutation_range");
+		xml_read_float(&integrator->large_mutation_chance, node, "large_mutation_chance");
+		xml_read_int(&integrator->max_consecutive_rejects, node, "max_consecutive_rejects");
+	}
 
 	/* Bounces */
 	xml_read_int(&integrator->min_bounce, node, "min_bounce");
diff --git a/intern/cycles/blender/addon/presets.py b/intern/cycles/blender/addon/presets.py
index 9991fdb..1d6f25a 100644
--- a/intern/cycles/blender/addon/presets.py
+++ b/intern/cycles/blender/addon/presets.py
@@ -56,6 +56,9 @@ class AddPresetSampling(AddPresetBase, Operator):
 
     preset_values = [
         "cycles.samples",
+        "cycles.max_consecutive_rejects",
+        "cycles.image_mutation_range",
+        "cycles.large_mutation_chance",
         "cycles.preview_samples",
         "cycles.aa_samples",
         "cycles.preview_aa_samples",
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index b4a1b10..e4879b8 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -106,6 +106,7 @@ enum_sampling_pattern = (
 enum_integrator = (
     ('BRANCHED_PATH', "Branched Path Tracing", "Path tracing integrator that branches on the first bounce, giving more control over the number of light and material samples"),
     ('PATH', "Path Tracing", "Pure path tracing integrator"),
+    ('METROPOLIS_PATH', "Metropolis Path Tracing", "Path tracing integrator with Metropolis sampling"),
     )
 
 enum_volume_sampling = (
@@ -176,6 +177,37 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 default=False,
                 )
 
+        cls.max_consecutive_rejects = IntProperty(
+                name="Max. consecutive rejects",
+                description="Number of rejected mutations before the next mutation is forced to be accepted",
+                min=1, max=10000,
+                default=512,
+                )
+        cls.image_mutation_range = FloatProperty(
+                name="Image mutation range",
+                description="Range of the pixel position mutation",
+                min=0.0, max=65536.0,
+                default=0.1,
+                )
+        cls.warmup_samples = IntProperty(
+                name="Warmup sample count",
+                description="Number of warmup samples per tile. Increase this when tile borders become visible",
+                min=0, max=65536,
+                default=10,
+                )
+        cls.importance_equalisation = BoolProperty(
+                name="Use Importance Equalisation",
+                description="Use importance equalisation to distribute samples more evenly (by default, brighter areas are sampled more)",
+                default=False
+                )
+        cls.metropolis_chain_number = IntProperty(
+                name="Number of Metropolis chains",
+                description="Number of independent Metropolis chains. On the CPU this should roughly match the thread count, while on the GPU values above 1000 are faster; 0 means automatic",
+                min=0, max=65536,
+                default=0,
+                )
+
         cls.aa_samples = IntProperty(
                 name="AA Samples",
                 description="Number of antialiasing samples to render for each pixel",
@@ -464,6 +496,12 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                             "but time can be saved by manually stopping the render when the noise is low enough)",
                 default=False,
                 )
+        cls.num_progressive_samples = IntProperty(
+                name="Number of progressive samples",
+                description="Number of samples to take per progressive refinement pass",
+                min=1, max=2147483647,
+                default=1,
+                )
 
         cls.bake_type = EnumProperty(
                 name="Bake Type",
@@ -492,6 +530,30 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                     ),
                 )
 
+        cls.adaptive_stopping_threshold = FloatProperty(
+                name="Tolerated error",
+                description="Amount of error that is tolerated in a tile. If 0, adaptive stopping is disabled",
+                min=0, max=5.0,
+                default=0,
+                )
+        cls.use_adaptive_sampling = BoolProperty(
+                name="Use adaptive sampling",
+                description="Take more samples in noisy areas instead of distributing them uniformly over the image",
+                default=False,
+                )
+        cls.adaptive_map_interval = IntProperty(
+                name="Adaptive map update rate",
+                description="Number of samples taken before the adaptive map is updated",
+                min=1, max=2147483647,
+                default=25,
+                )
+        cls.adaptive_error_power = FloatProperty(
+                name="Error estimation exponent",
+                description="Exponent of the power mean used in error estimation. A value of 2 gives an average, higher values put more emphasis on the extreme values",
+                min=2, max=100,
+                default=4,
+                )
+
     @classmethod
     def unregister(cls):
         del bpy.types.Scene.cycles
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 35ae8ec..4a42ca3 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -64,6 +64,10 @@ def draw_samples_info(layout, cscene):
         aa = cscene.samples
         if cscene.use_square_samples:
             aa = aa * aa
+    elif integrator == 'METROPOLIS_PATH':
+        aa = cscene.samples
+        if cscene.use_square_samples:
+            aa = aa * aa
     else:
         aa = cscene.aa_samples
         d = cscene.diffuse_samples
@@ -120,6 +124,14 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
         row.prop(cscene, "progressive", text="")
         row.prop(cscene, "use_square_samples")
 
+        if cscene.progressive == 'METROPOLIS_PATH':
+            mrow = layout.column()
+            mrow.label(text="Metropolis:")
+            mrow.prop(cscene, "max_consecutive_rejects", text="Max. consecutive rejects")
+            mrow.prop(cscene, "image_mutation_range", text="Image mutation range")
+            mrow.prop(cscene, "warmup_samples", text="Warmup sample count")
+            #mrow.prop(cscene, "importance_equalisation", text="Use Importance Equalisation")
+
         split = layout.split()
 
         col = split.column()
@@ -135,6 +147,12 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
             sub.label(text="Samples:")
             sub.prop(cscene, "samples", text="Render")
             sub.prop(cscene, "preview_samples", text="Preview")
+        elif cscene.progressive == 'METROPOLIS_PATH':
+            col = split.column()
+            sub = col.column(align=True)
+            sub.label(text="Samples:")
+            sub.prop(cscene, "samples", text="Render")
+            sub.prop(cscene, "preview_samples", text="Preview")
         else:
             sub.label(text="AA Samples:")
             sub.prop(cscene, "aa_samples", text="Render")
@@ -297,6 +315,12 @@ class CyclesRender_PT_performance(CyclesButtonsPanel, Panel):
         sub.prop(rd, "tile_y", text="Y")
 
         sub.prop(cscene, "use_progressive_refine")
+        subsub = sub.column(align=True)
+        subsub.enabled = cscene.use_progressive_refine
+        subsub.prop(cscene, "num_progressive_samples")
+        if cscene.progressive == 'METROPOLIS_PATH':
+            sub.label(text="Metropolis chains:")
+            sub.prop(cscene, "metropolis_chain_number")
 
         subsub = sub.column(align=True)
         subsub.enabled = not rd.use_border
@@ -317,6 +341,19 @@ class CyclesRender_PT_performance(CyclesButtonsPanel, Panel):
 
         col.separator()
 
+        col.label(text="Adaptive sampling:")
+        col.prop(cscene, "adaptive_stopping_threshold")
+        if cscene.adaptive_stopping_threshold > 0:
+            col.prop(cscene, "adaptive_error_power")
+        if cscene.progressive != 'METROPOLIS_PATH':
+            col.prop(cscene, "use_adaptive_sampling")
+        #sub = col.column(align=True)
+        #sub.enabled = cscene.use_adaptive_sampling or cscene.adaptive_stopping_threshold > 0 or (cscene.importance_equalisation and cscene.progressive == 'METROPOLIS_PATH')
+        if cscene.use_adaptive_sampling or cscene.adaptive_stopping_threshold > 0 or
(cscene.importance_equalisation and cscene.progressive == 'METROPOLIS_PATH'): + col.prop(cscene, "adaptive_map_interval") + + col.separator() + col.label(text="Acceleration structure:") col.prop(cscene, "debug_use_spatial_splits") diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp index 0f31e55..d6ec2bb 100644 --- a/intern/cycles/blender/blender_session.cpp +++ b/intern/cycles/blender/blender_session.cpp @@ -181,7 +181,7 @@ void BlenderSession::reset_session(BL::BlendData b_data_, BL::Scene b_scene_) BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_scene, PointerRNA_NULL, PointerRNA_NULL, scene->camera, width, height); session->reset(buffer_params, session_params.samples); - b_engine.use_highlight_tiles(session_params.progressive_refine == false); + b_engine.use_highlight_tiles(session_params.progressive_refine == false || session_params.error_progressive); /* reset time */ start_resize_time = 0.0; @@ -420,6 +420,11 @@ void BlenderSession::render() /* add passes */ vector passes; Pass::add(PASS_COMBINED, passes); + if (session_params.stopping_threshold > 0.0f || session_params.adaptive || session_params.metropolis) + Pass::add(PASS_SAMPLES, passes); + + if (session_params.stopping_threshold > 0.0f || session_params.adaptive) + Pass::add(PASS_VARIANCE, passes); if(session_params.device.advanced_shading) { diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index 19898bf..1fa1328 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -35,6 +35,7 @@ #include "util_debug.h" #include "util_foreach.h" #include "util_opengl.h" +#include "util_system.h" CCL_NAMESPACE_BEGIN @@ -201,9 +202,15 @@ void BlenderSync::sync_integrator() #endif integrator->method = (Integrator::Method)get_enum(cscene, "progressive"); + if (preview && integrator->method == Integrator::METROPOLIS_PATH) + integrator->method = Integrator::PATH; - integrator->sample_all_lights_direct = get_boolean(cscene, "sample_all_lights_direct"); - integrator->sample_all_lights_indirect = get_boolean(cscene, "sample_all_lights_indirect"); + integrator->sample_all_lights_direct = get_boolean(cscene, "sample_all_lights_direct") && (integrator->method != Integrator::METROPOLIS_PATH); + integrator->sample_all_lights_indirect = get_boolean(cscene, "sample_all_lights_indirect") && (integrator->method != Integrator::METROPOLIS_PATH); + + integrator->max_consecutive_rejects = get_int(cscene, "max_consecutive_rejects"); + integrator->image_mutation_range = get_float(cscene, "image_mutation_range"); + integrator->map_interval = get_boolean(cscene, "importance_equalisation") ? 
get_int(cscene, "adaptive_map_interval") : 0; int diffuse_samples = get_int(cscene, "diffuse_samples"); int glossy_samples = get_int(cscene, "glossy_samples"); @@ -495,7 +502,18 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine b_engine, BL::Use params.reset_timeout = get_float(cscene, "debug_reset_timeout"); params.text_timeout = get_float(cscene, "debug_text_timeout"); - params.progressive_refine = get_boolean(cscene, "use_progressive_refine"); + params.progressive_refine = get_boolean(cscene, "use_progressive_refine") | get_boolean(cscene, "importance_equalisation"); + params.num_progressive_samples = min(get_int(cscene, "num_progressive_samples"), get_int(cscene, "adaptive_map_interval")); + params.stopping_threshold = get_float(cscene, "adaptive_stopping_threshold"); + params.adaptive = get_boolean(cscene, "use_adaptive_sampling"); + params.map_interval = get_int(cscene, "adaptive_map_interval"); + params.adaptive_error_power = get_float(cscene, "adaptive_error_power"); + params.error_progressive = get_boolean(cscene, "use_progressive_refine") && params.stopping_threshold > 0; + + params.metropolis = (Integrator::Method)get_enum(cscene, "progressive") == Integrator::METROPOLIS_PATH && background; + params.importance_equalisation = get_boolean(cscene, "importance_equalisation"); + params.warmup_samples = get_int(cscene, "warmup_samples"); + params.num_metro_chains = get_int(cscene, "metropolis_chain_number"); if(background) { if(params.progressive_refine) diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index bcddd4f..3363ae6 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -33,6 +33,7 @@ CCL_NAMESPACE_BEGIN class Progress; class RenderTile; +class Session; /* Device Types */ diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 7308d03..c8fc36b 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -16,6 +16,7 @@ #include #include +#include #include "device.h" #include "device_intern.h" @@ -37,149 +38,322 @@ #include "util_progress.h" #include "util_system.h" #include "util_thread.h" +#include "util_color.h" +#include "util_hash.h" +#include "util_importance.h" +#include "util_metropolis.h" + +#include "session.h" CCL_NAMESPACE_BEGIN class CPUDevice : public Device { public: - TaskPool task_pool; - KernelGlobals kernel_globals; + TaskPool task_pool; + KernelGlobals kernel_globals; #ifdef WITH_OSL - OSLGlobals osl_globals; + OSLGlobals osl_globals; #endif - - CPUDevice(DeviceInfo& info, Stats &stats, bool background) - : Device(info, stats, background) - { + + CPUDevice(DeviceInfo& info, Stats &stats, bool background) + : Device(info, stats, background) + { #ifdef WITH_OSL - kernel_globals.osl = &osl_globals; + kernel_globals.osl = &osl_globals; #endif - /* do now to avoid thread issues */ - system_cpu_support_sse2(); - system_cpu_support_sse3(); - system_cpu_support_sse41(); - system_cpu_support_avx(); + /* do now to avoid thread issues */ + system_cpu_support_sse2(); + system_cpu_support_sse3(); + system_cpu_support_sse41(); + system_cpu_support_avx(); system_cpu_support_avx2(); - } + } - ~CPUDevice() - { - task_pool.stop(); - } + ~CPUDevice() + { + task_pool.stop(); + } - void mem_alloc(device_memory& mem, MemoryType type) - { - mem.device_pointer = mem.data_pointer; + void mem_alloc(device_memory& mem, MemoryType type) + { + mem.device_pointer = mem.data_pointer; - stats.mem_alloc(mem.memory_size()); - } + 
stats.mem_alloc(mem.memory_size()); + } - void mem_copy_to(device_memory& mem) - { - /* no-op */ - } + void mem_copy_to(device_memory& mem) + { + /* no-op */ + } - void mem_copy_from(device_memory& mem, int y, int w, int h, int elem) - { - /* no-op */ - } + void mem_copy_from(device_memory& mem, int y, int w, int h, int elem) + { + /* no-op */ + } - void mem_zero(device_memory& mem) - { - memset((void*)mem.device_pointer, 0, mem.memory_size()); - } + void mem_zero(device_memory& mem) + { + memset((void*)mem.device_pointer, 0, mem.memory_size()); + } - void mem_free(device_memory& mem) - { - mem.device_pointer = 0; + void mem_free(device_memory& mem) + { + mem.device_pointer = 0; - stats.mem_free(mem.memory_size()); - } + stats.mem_free(mem.memory_size()); + } - void const_copy_to(const char *name, void *host, size_t size) - { - kernel_const_copy(&kernel_globals, name, host, size); - } + void const_copy_to(const char *name, void *host, size_t size) + { + kernel_const_copy(&kernel_globals, name, host, size); + } - void tex_alloc(const char *name, device_memory& mem, InterpolationType interpolation, bool periodic) - { - kernel_tex_copy(&kernel_globals, name, mem.data_pointer, mem.data_width, mem.data_height, mem.data_depth, interpolation); - mem.device_pointer = mem.data_pointer; + void tex_alloc(const char *name, device_memory& mem, InterpolationType interpolation, bool periodic) + { + kernel_tex_copy(&kernel_globals, name, mem.data_pointer, mem.data_width, mem.data_height, mem.data_depth, interpolation); + mem.device_pointer = mem.data_pointer; - stats.mem_alloc(mem.memory_size()); - } + stats.mem_alloc(mem.memory_size()); + } - void tex_free(device_memory& mem) - { - mem.device_pointer = 0; + void tex_free(device_memory& mem) + { + mem.device_pointer = 0; - stats.mem_free(mem.memory_size()); - } + stats.mem_free(mem.memory_size()); + } - void *osl_memory() - { + void *osl_memory() + { #ifdef WITH_OSL - return &osl_globals; + return &osl_globals; #else - return NULL; + return NULL; #endif - } + } + + void thread_run(DeviceTask *task) + { + if(task->type == DeviceTask::PATH_TRACE) + thread_path_trace(*task); + else if(task->type == DeviceTask::METROPOLIS_TRACE) + thread_metropolis(*task); + else if(task->type == DeviceTask::FILM_CONVERT) + thread_film_convert(*task); + else if(task->type == DeviceTask::SHADER) + thread_shader(*task); + } + + class CPUDeviceTask : public DeviceTask { + public: + CPUDeviceTask(CPUDevice *device, DeviceTask& task) + : DeviceTask(task) + { + run = function_bind(&CPUDevice::thread_run, device, this); + } + }; + + void thread_metropolis(DeviceTask& task) { + if(task_pool.canceled()) { + if(task.need_finish_queue == false) + return; + } - void thread_run(DeviceTask *task) - { - if(task->type == DeviceTask::PATH_TRACE) - thread_path_trace(*task); - else if(task->type == DeviceTask::FILM_CONVERT) - thread_film_convert(*task); - else if(task->type == DeviceTask::SHADER) - thread_shader(*task); - } + KernelGlobals kg = kernel_globals; - class CPUDeviceTask : public DeviceTask { - public: - CPUDeviceTask(CPUDevice *device, DeviceTask& task) - : DeviceTask(task) - { - run = function_bind(&CPUDevice::thread_run, device, this); - } - }; +#ifdef WITH_OSL + OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); +#endif + + RenderTile tile; + + while(task.acquire_tile(this, tile)) { + float *render_buffer = (float*)tile.buffer; + uint *rng_state = (uint*)tile.rng_state; + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; + 
tile.sample = tile.start_sample; + if (tile.buffers->metro_shared_params == NULL) { + tile.buffers->metro_shared_params = new double[8]; + memset(tile.buffers->metro_shared_params, 0, sizeof(double)*8); + } + + int numChains = task.num_metro_chains; + MetropolisChain** metroChains = new MetropolisChain*[numChains]; + int samplesPerChain = tile.w*tile.h / task.num_metro_chains; + for (int i = 0; i < numChains; i++) { + metroChains[i] = (MetropolisChain*) malloc(sizeof(MetropolisChain) + 2*task.metro_sample_size*sizeof(uint) + 3*task.metro_sample_size*sizeof(float)); + kernel_metro_init_chain(&kg, metroChains[i]); + } + + if (start_sample == 0) { + for(int y = tile.y; y < tile.y + tile.h; y++) //Fill in ID, UV and normal passes + for(int x = tile.x; x < tile.x + tile.w; x++) + kernel_cpu_metropolis_first_pass(&kg, render_buffer, tile.buffers->metro_shared_params, rng_state, x, y, tile.offset, tile.stride); + } + for (int i = 0; i < task.warmup_samples; i++) { +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if(system_cpu_support_avx2()) + for(int y = tile.y; y < tile.y + tile.h; y++) //Fill in ID, UV and normal passes + for(int x = tile.x; x < tile.x + tile.w; x++) + kernel_cpu_avx2_metropolis_warmup(&kg, rng_state, tile.buffers->metro_shared_params, i + tile.start_sample, x, y, tile.offset, tile.stride); + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + if(system_cpu_support_avx()) + for(int y = tile.y; y < tile.y + tile.h; y++) //Fill in ID, UV and normal passes + for(int x = tile.x; x < tile.x + tile.w; x++) + kernel_cpu_avx_metropolis_warmup(&kg, rng_state, tile.buffers->metro_shared_params, i + tile.start_sample, x, y, tile.offset, tile.stride); + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + if(system_cpu_support_sse41()) + for(int y = tile.y; y < tile.y + tile.h; y++) //Fill in ID, UV and normal passes + for(int x = tile.x; x < tile.x + tile.w; x++) + kernel_cpu_sse41_metropolis_warmup(&kg, rng_state, tile.buffers->metro_shared_params, i + tile.start_sample, x, y, tile.offset, tile.stride); + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 + if(system_cpu_support_sse3()) + for(int y = tile.y; y < tile.y + tile.h; y++) //Fill in ID, UV and normal passes + for(int x = tile.x; x < tile.x + tile.w; x++) + kernel_cpu_sse3_metropolis_warmup(&kg, rng_state, tile.buffers->metro_shared_params, i + tile.start_sample, x, y, tile.offset, tile.stride); + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 + if(system_cpu_support_sse2()) + for(int y = tile.y; y < tile.y + tile.h; y++) //Fill in ID, UV and normal passes + for(int x = tile.x; x < tile.x + tile.w; x++) + kernel_cpu_sse2_metropolis_warmup(&kg, rng_state, tile.buffers->metro_shared_params, i + tile.start_sample, x, y, tile.offset, tile.stride); + else +#endif + for(int y = tile.y; y < tile.y + tile.h; y++) //Fill in ID, UV and normal passes + for(int x = tile.x; x < tile.x + tile.w; x++) + kernel_cpu_metropolis_warmup(&kg, rng_state, tile.buffers->metro_shared_params, i + tile.start_sample, x, y, tile.offset, tile.stride); + } + + /*Metropolis m(&kg, &tile.buffers->totalImportance, &tile.buffers->sampleCount, &tile); + PassData pd;*/ + for(int sample = start_sample; sample < end_sample; sample++) { + if (task.get_cancel() || task_pool.canceled()) { + if(task.need_finish_queue == false) + break; + } +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if(system_cpu_support_avx2()) + for(uint i = 0; i < numChains; i++) + for(uint j = 0; j < samplesPerChain; j++) + kernel_cpu_avx2_metropolis_step(&kg, render_buffer, 
metroChains[i], tile.buffers->metro_shared_params, + tile.x, tile.y, tile.w, tile.h, tile.offset, tile.stride); + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + if(system_cpu_support_avx()) + for(uint i = 0; i < numChains; i++) + for(uint j = 0; j < samplesPerChain; j++) + kernel_cpu_avx_metropolis_step(&kg, render_buffer, metroChains[i], tile.buffers->metro_shared_params, + tile.x, tile.y, tile.w, tile.h, tile.offset, tile.stride); + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + if(system_cpu_support_sse41()) + for(uint i = 0; i < numChains; i++) + for(uint j = 0; j < samplesPerChain; j++) + kernel_cpu_sse41_metropolis_step(&kg, render_buffer, metroChains[i], tile.buffers->metro_shared_params, + tile.x, tile.y, tile.w, tile.h, tile.offset, tile.stride); + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 + if(system_cpu_support_sse3()) + for(uint i = 0; i < numChains; i++) + for(uint j = 0; j < samplesPerChain; j++) + kernel_cpu_sse3_metropolis_step(&kg, render_buffer, metroChains[i], tile.buffers->metro_shared_params, + tile.x, tile.y, tile.w, tile.h, tile.offset, tile.stride); + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 + if(system_cpu_support_sse2()) + for(uint i = 0; i < numChains; i++) + for(uint j = 0; j < samplesPerChain; j++) + kernel_cpu_sse2_metropolis_step(&kg, render_buffer, metroChains[i], tile.buffers->metro_shared_params, + tile.x, tile.y, tile.w, tile.h, tile.offset, tile.stride); + else +#endif + for(uint i = 0; i < numChains; i++) + for(uint j = 0; j < samplesPerChain; j++) + kernel_cpu_metropolis_step(&kg, render_buffer, metroChains[i], tile.buffers->metro_shared_params, + tile.x, tile.y, tile.w, tile.h, tile.offset, tile.stride); - void thread_path_trace(DeviceTask& task) - { - if(task_pool.canceled()) { - if(task.need_finish_queue == false) - return; - } + tile.sample = sample + 1; - KernelGlobals kg = kernel_globals; + task.update_progress(tile); + + if (task.stopping_threshold > 0.0f && tile_converged(tile, &task)) + break; + } + + for (int i = 0; i < numChains; i++) + delete metroChains[i]; + + task.release_tile(tile); + + if(task_pool.canceled()) { + if(task.need_finish_queue == false) + break; + } + } #ifdef WITH_OSL - OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); + OSLShader::thread_free(&kg); #endif + } - RenderTile tile; - - while(task.acquire_tile(this, tile)) { - float *render_buffer = (float*)tile.buffer; - uint *rng_state = (uint*)tile.rng_state; - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; + void thread_path_trace(DeviceTask& task) + { + if(task_pool.canceled()) { + if(task.need_finish_queue == false) + return; + } + + KernelGlobals kg = kernel_globals; + +#ifdef WITH_OSL + OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); +#endif + RenderTile tile; + while(task.acquire_tile(this, tile)) { + float *render_buffer = (float*)tile.buffer; + uint *rng_state = (uint*)tile.rng_state; + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; + tile.sample = tile.start_sample; + + CDF_2D *importance_map = NULL; #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 if(system_cpu_support_avx2()) { for(int sample = start_sample; sample < end_sample; sample++) { + if (task.adaptive && sample >= task.map_interval && (sample % task.map_interval == 0 || !importance_map)) { + if (importance_map) delete importance_map; + build_importance(tile); + importance_map = new CDF_2D(tile.buffers->importance_map + tile.offset + tile.x + tile.y 
* tile.stride, tile.w, tile.h, tile.stride); + } + if (task.stopping_threshold > 0.0f && tile_converged(tile, &task)) + break; if (task.get_cancel() || task_pool.canceled()) { if(task.need_finish_queue == false) break; } - for(int y = tile.y; y < tile.y + tile.h; y++) { - for(int x = tile.x; x < tile.x + tile.w; x++) { - kernel_cpu_avx2_path_trace(&kg, render_buffer, rng_state, - sample, x, y, tile.offset, tile.stride); + for (int y = 0; y < tile.h; y++) { + for (int x = 0; x < tile.w; x++) { + int px = x, py = y; + if (importance_map) + importance_map->sample_02_jittered(sample - task.map_interval, px, py, px, py); + kernel_cpu_avx2_path_trace(&kg, render_buffer, rng_state, sample, px + tile.x, py + tile.y, tile.offset, tile.stride); } } @@ -193,15 +367,24 @@ public: #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX if(system_cpu_support_avx()) { for(int sample = start_sample; sample < end_sample; sample++) { + if (task.adaptive && sample >= task.map_interval && (sample % task.map_interval == 0 || !importance_map)) { + if (importance_map) delete importance_map; + build_importance(tile); + importance_map = new CDF_2D(tile.buffers->importance_map + tile.offset + tile.x + tile.y * tile.stride, tile.w, tile.h, tile.stride); + } + if (task.stopping_threshold > 0.0f && tile_converged(tile, &task)) + break; if (task.get_cancel() || task_pool.canceled()) { if(task.need_finish_queue == false) break; } - for(int y = tile.y; y < tile.y + tile.h; y++) { - for(int x = tile.x; x < tile.x + tile.w; x++) { - kernel_cpu_avx_path_trace(&kg, render_buffer, rng_state, - sample, x, y, tile.offset, tile.stride); + for (int y = 0; y < tile.h; y++) { + for (int x = 0; x < tile.w; x++) { + int px = x, py = y; + if (importance_map) + importance_map->sample_02_jittered(sample - task.map_interval, px, py, px, py); + kernel_cpu_avx_path_trace(&kg, render_buffer, rng_state, sample, px + tile.x, py + tile.y, tile.offset, tile.stride); } } @@ -212,18 +395,27 @@ public: } else #endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 if(system_cpu_support_sse41()) { for(int sample = start_sample; sample < end_sample; sample++) { + if (task.adaptive && sample >= task.map_interval && (sample % task.map_interval == 0 || !importance_map)) { + if (importance_map) delete importance_map; + build_importance(tile); + importance_map = new CDF_2D(tile.buffers->importance_map + tile.offset + tile.x + tile.y * tile.stride, tile.w, tile.h, tile.stride); + } + if (task.stopping_threshold > 0.0f && tile_converged(tile, &task)) + break; if (task.get_cancel() || task_pool.canceled()) { if(task.need_finish_queue == false) break; } - for(int y = tile.y; y < tile.y + tile.h; y++) { - for(int x = tile.x; x < tile.x + tile.w; x++) { - kernel_cpu_sse41_path_trace(&kg, render_buffer, rng_state, - sample, x, y, tile.offset, tile.stride); + for (int y = 0; y < tile.h; y++) { + for (int x = 0; x < tile.w; x++) { + int px = x, py = y; + if (importance_map) + importance_map->sample_02_jittered(sample - task.map_interval, px, py, px, py); + kernel_cpu_sse41_path_trace(&kg, render_buffer, rng_state, sample, px + tile.x, py + tile.y, tile.offset, tile.stride); } } @@ -237,15 +429,24 @@ public: #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 if(system_cpu_support_sse3()) { for(int sample = start_sample; sample < end_sample; sample++) { + if (task.adaptive && sample >= task.map_interval && (sample % task.map_interval == 0 || !importance_map)) { + if (importance_map) delete importance_map; + build_importance(tile); + importance_map = new 
CDF_2D(tile.buffers->importance_map + tile.offset + tile.x + tile.y * tile.stride, tile.w, tile.h, tile.stride); + } + if (task.stopping_threshold > 0.0f && tile_converged(tile, &task)) + break; if (task.get_cancel() || task_pool.canceled()) { if(task.need_finish_queue == false) break; } - for(int y = tile.y; y < tile.y + tile.h; y++) { - for(int x = tile.x; x < tile.x + tile.w; x++) { - kernel_cpu_sse3_path_trace(&kg, render_buffer, rng_state, - sample, x, y, tile.offset, tile.stride); + for (int y = 0; y < tile.h; y++) { + for (int x = 0; x < tile.w; x++) { + int px = x, py = y; + if (importance_map) + importance_map->sample_02_jittered(sample - task.map_interval, px, py, px, py); + kernel_cpu_sse3_path_trace(&kg, render_buffer, rng_state, sample, px + tile.x, py + tile.y, tile.offset, tile.stride); } } @@ -259,15 +460,24 @@ public: #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 if(system_cpu_support_sse2()) { for(int sample = start_sample; sample < end_sample; sample++) { + if (task.adaptive && sample >= task.map_interval && (sample % task.map_interval == 0 || !importance_map)) { + if (importance_map) delete importance_map; + build_importance(tile); + importance_map = new CDF_2D(tile.buffers->importance_map + tile.offset + tile.x + tile.y * tile.stride, tile.w, tile.h, tile.stride); + } + if (task.stopping_threshold > 0.0f && tile_converged(tile, &task)) + break; if (task.get_cancel() || task_pool.canceled()) { if(task.need_finish_queue == false) break; } - for(int y = tile.y; y < tile.y + tile.h; y++) { - for(int x = tile.x; x < tile.x + tile.w; x++) { - kernel_cpu_sse2_path_trace(&kg, render_buffer, rng_state, - sample, x, y, tile.offset, tile.stride); + for (int y = 0; y < tile.h; y++) { + for (int x = 0; x < tile.w; x++) { + int px = x, py = y; + if (importance_map) + importance_map->sample_02_jittered(sample - task.map_interval, px, py, px, py); + kernel_cpu_sse2_path_trace(&kg, render_buffer, rng_state, sample, px + tile.x, py + tile.y, tile.offset, tile.stride); } } @@ -279,16 +489,24 @@ public: else #endif { - for(int sample = start_sample; sample < end_sample; sample++) { + for(uint sample = start_sample; sample < end_sample; sample++) { + if (task.adaptive && sample >= task.map_interval && (sample % task.map_interval == 0 || !importance_map)) { + if (importance_map) delete importance_map; + build_importance(tile); + importance_map = new CDF_2D(tile.buffers->importance_map + tile.offset + tile.x + tile.y * tile.stride, tile.w, tile.h, tile.stride); + } + if (task.stopping_threshold > 0.0f && tile_converged(tile, &task)) + break; if (task.get_cancel() || task_pool.canceled()) { if(task.need_finish_queue == false) break; } - - for(int y = tile.y; y < tile.y + tile.h; y++) { - for(int x = tile.x; x < tile.x + tile.w; x++) { - kernel_cpu_path_trace(&kg, render_buffer, rng_state, - sample, x, y, tile.offset, tile.stride); + for (int y = 0; y < tile.h; y++) { + for (int x = 0; x < tile.w; x++) { + int px = x, py = y; + if (importance_map) + importance_map->sample_02_jittered(sample - task.map_interval, px, py, px, py); + kernel_cpu_path_trace(&kg, render_buffer, rng_state, sample, px + tile.x, py + tile.y, tile.offset, tile.stride); } } @@ -298,22 +516,24 @@ public: } } + if (importance_map) delete importance_map; + task.release_tile(tile); if(task_pool.canceled()) { - if(task.need_finish_queue == false) - break; + if(task.need_finish_queue == false) + break; } - } + } #ifdef WITH_OSL - OSLShader::thread_free(&kg); + OSLShader::thread_free(&kg); #endif - } + } - void 
thread_film_convert(DeviceTask& task) - { - float sample_scale = 1.0f/(task.sample + 1); + void thread_film_convert(DeviceTask& task) + { + float sample_scale = 1.0f/(task.sample + 1); if(task.rgba_half) { #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 @@ -326,40 +546,40 @@ public: else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - for(int y = task.y; y < task.y + task.h; y++) - for(int x = task.x; x < task.x + task.w; x++) - kernel_cpu_avx_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - for(int y = task.y; y < task.y + task.h; y++) - for(int x = task.x; x < task.x + task.w; x++) - kernel_cpu_sse41_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - for(int y = task.y; y < task.y + task.h; y++) - for(int x = task.x; x < task.x + task.w; x++) - kernel_cpu_sse3_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); - } - else + if(system_cpu_support_avx()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_avx_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + if(system_cpu_support_sse41()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_sse41_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 + if(system_cpu_support_sse3()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_sse3_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } + else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - for(int y = task.y; y < task.y + task.h; y++) - for(int x = task.x; x < task.x + task.w; x++) - kernel_cpu_sse2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); - } - else + if(system_cpu_support_sse2()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_sse2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } + else #endif { for(int y = task.y; y < task.y + task.h; y++) @@ -379,56 +599,56 @@ public: else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - for(int y = task.y; y < task.y + task.h; y++) - for(int x = task.x; x < task.x + task.w; x++) - kernel_cpu_avx_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - for(int y = task.y; y < task.y + task.h; y++) - for(int x = task.x; x < task.x + task.w; x++) - 
kernel_cpu_sse41_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); - } - else -#endif + if(system_cpu_support_avx()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_avx_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + if(system_cpu_support_sse41()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_sse41_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } + else +#endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - for(int y = task.y; y < task.y + task.h; y++) - for(int x = task.x; x < task.x + task.w; x++) - kernel_cpu_sse3_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); - } - else + if(system_cpu_support_sse3()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_sse3_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } + else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - for(int y = task.y; y < task.y + task.h; y++) - for(int x = task.x; x < task.x + task.w; x++) - kernel_cpu_sse2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); - } - else + if(system_cpu_support_sse2()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_sse2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } + else #endif - { - for(int y = task.y; y < task.y + task.h; y++) - for(int x = task.x; x < task.x + task.w; x++) - kernel_cpu_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); - } - } - } - - void thread_shader(DeviceTask& task) - { - KernelGlobals kg = kernel_globals; + { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } + } + } + + void thread_shader(DeviceTask& task) + { + KernelGlobals kg = kernel_globals; #ifdef WITH_OSL - OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); + OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 @@ -496,13 +716,13 @@ public: for(int sample = 0; sample < task.num_samples; sample++) kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample); - if(task.get_cancel() || task_pool.canceled()) - break; - } - } + if(task.get_cancel() || task_pool.canceled()) + break; + } + } #ifdef WITH_OSL - OSLShader::thread_free(&kg); + OSLShader::thread_free(&kg); #endif } @@ -533,21 +753,21 @@ public: Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background) { - return new CPUDevice(info, stats, background); + return new CPUDevice(info, stats, background); } void device_cpu_info(vector& 
devices) { - DeviceInfo info; + DeviceInfo info; - info.type = DEVICE_CPU; - info.description = system_cpu_brand_string(); - info.id = "CPU"; - info.num = 0; - info.advanced_shading = true; - info.pack_images = false; + info.type = DEVICE_CPU; + info.description = system_cpu_brand_string(); + info.id = "CPU"; + info.num = 0; + info.advanced_shading = true; + info.pack_images = false; - devices.insert(devices.begin(), info); + devices.insert(devices.begin(), info); } CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index d1d227b..4ed7ce6 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -26,11 +26,13 @@ #include "util_cuda.h" #include "util_debug.h" #include "util_map.h" +#include "util_metropolis.h" #include "util_opengl.h" #include "util_path.h" #include "util_system.h" #include "util_types.h" #include "util_time.h" +#include "util_importance.h" CCL_NAMESPACE_BEGIN @@ -46,6 +48,10 @@ public: int cuDevArchitecture; bool first_error; bool use_texture_storage; + + device_vector threads_to_pixels; + device_vector metropolis_chains; + device_vector metropolis_shared_params; struct PixelMem { GLuint cuPBO; @@ -594,7 +600,7 @@ public: } } - void path_trace(RenderTile& rtile, int sample, bool branched) + void path_trace(RenderTile& rtile, int sample, bool branched, device_ptr thread_to_pixel_mem) { if(have_error()) return; @@ -604,6 +610,7 @@ public: CUfunction cuPathTrace; CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer); CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state); + CUdeviceptr d_thread_to_pixel_mem = cuda_device_ptr(thread_to_pixel_mem); /* get kernel function */ if(branched) { @@ -625,6 +632,9 @@ public: cuda_assert(cuParamSetv(cuPathTrace, offset, &d_rng_state, sizeof(d_rng_state))); offset += sizeof(d_rng_state); + cuda_assert(cuParamSetv(cuPathTrace, offset, &d_thread_to_pixel_mem, sizeof(d_thread_to_pixel_mem))); + offset += sizeof(d_thread_to_pixel_mem); + offset = align_up(offset, __alignof(sample)); cuda_assert(cuParamSeti(cuPathTrace, offset, sample)); @@ -674,6 +684,235 @@ public: cuda_pop_context(); } + void metropolis_first_pass(RenderTile& rtile, int numChains) + { + if(have_error()) + return; + + cuda_push_context(); + + CUfunction cuFirstPass; + CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer); + CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state); + CUdeviceptr d_shared_params = cuda_device_ptr(metropolis_shared_params.device_pointer); + CUdeviceptr d_metro_chains = cuda_device_ptr(metropolis_chains.device_pointer); + + /* get kernel function */ + cuda_assert(cuModuleGetFunction(&cuFirstPass, cuModule, "kernel_cuda_metropolis_first_pass")); + + if(have_error()) + return; + + /* pass in parameters */ + int offset = 0; + + cuda_assert(cuParamSetv(cuFirstPass, offset, &d_buffer, sizeof(d_buffer))); + offset += sizeof(d_buffer); + + cuda_assert(cuParamSetv(cuFirstPass, offset, &d_rng_state, sizeof(d_rng_state))); + offset += sizeof(d_rng_state); + + cuda_assert(cuParamSetv(cuFirstPass, offset, &d_shared_params, sizeof(d_shared_params))); + offset += sizeof(d_shared_params); + + cuda_assert(cuParamSetv(cuFirstPass, offset, &d_metro_chains, sizeof(d_metro_chains))); + offset += sizeof(d_metro_chains); + + offset = align_up(offset, __alignof(int)); + + cuda_assert(cuParamSeti(cuFirstPass, offset, rtile.x)); + offset += sizeof(rtile.x); + + cuda_assert(cuParamSeti(cuFirstPass, offset, rtile.y)); + offset += sizeof(rtile.y); + + 
cuda_assert(cuParamSeti(cuFirstPass, offset, rtile.w)); + offset += sizeof(rtile.w); + + cuda_assert(cuParamSeti(cuFirstPass, offset, rtile.h)); + offset += sizeof(rtile.h); + + cuda_assert(cuParamSeti(cuFirstPass, offset, rtile.offset)); + offset += sizeof(rtile.offset); + + cuda_assert(cuParamSeti(cuFirstPass, offset, rtile.stride)); + offset += sizeof(rtile.stride); + + cuda_assert(cuParamSeti(cuFirstPass, offset, numChains)); + offset += sizeof(numChains); + + cuda_assert(cuParamSetSize(cuFirstPass, offset)); + + /* launch kernel */ + int threads_per_block; + cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFirstPass)); + + /*int num_registers; + cuda_assert(cuFuncGetAttribute(&num_registers, CU_FUNC_ATTRIBUTE_NUM_REGS, cuPathTrace)); + + printf("threads_per_block %d\n", threads_per_block); + printf("num_registers %d\n", num_registers);*/ + + int xthreads = (int)sqrt((float)threads_per_block); + int ythreads = (int)sqrt((float)threads_per_block); + int xblocks = (rtile.w + xthreads - 1)/xthreads; + int height = max(rtile.h, (numChains + rtile.w - 1) / rtile.w); //Run at least one thread per chain + int yblocks = (height + ythreads - 1)/ythreads; + + cuda_assert(cuFuncSetCacheConfig(cuFirstPass, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetBlockShape(cuFirstPass, xthreads, ythreads, 1)); + cuda_assert(cuLaunchGrid(cuFirstPass, xblocks, yblocks)); + + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + } + + void metropolis_warmup(RenderTile& rtile, int i) + { + if(have_error()) + return; + + cuda_push_context(); + + CUfunction cuWarmup; + CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state); + CUdeviceptr d_shared_params = cuda_device_ptr(metropolis_shared_params.device_pointer); + + /* get kernel function */ + cuda_assert(cuModuleGetFunction(&cuWarmup, cuModule, "kernel_cuda_metropolis_warmup")); + + if(have_error()) + return; + + /* pass in parameters */ + int offset = 0; + + cuda_assert(cuParamSetv(cuWarmup, offset, &d_rng_state, sizeof(d_rng_state))); + offset += sizeof(d_rng_state); + + cuda_assert(cuParamSetv(cuWarmup, offset, &d_shared_params, sizeof(d_shared_params))); + offset += sizeof(d_shared_params); + + offset = align_up(offset, __alignof(int)); + + cuda_assert(cuParamSeti(cuWarmup, offset, i)); + offset += sizeof(i); + + cuda_assert(cuParamSeti(cuWarmup, offset, rtile.w)); + offset += sizeof(rtile.w); + + cuda_assert(cuParamSeti(cuWarmup, offset, rtile.h)); + offset += sizeof(rtile.h); + + cuda_assert(cuParamSeti(cuWarmup, offset, rtile.offset)); + offset += sizeof(rtile.offset); + + cuda_assert(cuParamSeti(cuWarmup, offset, rtile.stride)); + offset += sizeof(rtile.stride); + + cuda_assert(cuParamSetSize(cuWarmup, offset)); + + /* launch kernel */ + int threads_per_block; + cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuWarmup)); + + /*int num_registers; + cuda_assert(cuFuncGetAttribute(&num_registers, CU_FUNC_ATTRIBUTE_NUM_REGS, cuPathTrace)); + + printf("threads_per_block %d\n", threads_per_block); + printf("num_registers %d\n", num_registers);*/ + + int xthreads = (int)sqrt((float)threads_per_block); + int ythreads = (int)sqrt((float)threads_per_block); + int xblocks = (rtile.w + xthreads - 1)/xthreads; + int yblocks = (rtile.h + ythreads - 1)/ythreads; + + cuda_assert(cuFuncSetCacheConfig(cuWarmup, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetBlockShape(cuWarmup, xthreads, ythreads, 1)); + cuda_assert(cuLaunchGrid(cuWarmup, xblocks, yblocks)); + + 
cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + } + + void metropolis_step(RenderTile& rtile, int sample, int numChains) + { + if(have_error()) + return; + + cuda_push_context(); + + CUfunction cuMetroStep; + CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer); + CUdeviceptr d_shared_params = cuda_device_ptr(metropolis_shared_params.device_pointer); + CUdeviceptr d_metro_chains = cuda_device_ptr(metropolis_chains.device_pointer); + + /* get kernel function */ + cuda_assert(cuModuleGetFunction(&cuMetroStep, cuModule, "kernel_cuda_metropolis_step")); + + if(have_error()) + return; + + /* pass in parameters */ + int offset = 0; + + cuda_assert(cuParamSetv(cuMetroStep, offset, &d_buffer, sizeof(d_buffer))); + offset += sizeof(d_buffer); + + cuda_assert(cuParamSetv(cuMetroStep, offset, &d_shared_params, sizeof(d_shared_params))); + offset += sizeof(d_shared_params); + + cuda_assert(cuParamSetv(cuMetroStep, offset, &d_metro_chains, sizeof(d_metro_chains))); + offset += sizeof(d_metro_chains); + + offset = align_up(offset, __alignof(int)); + + cuda_assert(cuParamSeti(cuMetroStep, offset, numChains)); + offset += sizeof(numChains); + + cuda_assert(cuParamSeti(cuMetroStep, offset, rtile.x)); + offset += sizeof(rtile.x); + + cuda_assert(cuParamSeti(cuMetroStep, offset, rtile.y)); + offset += sizeof(rtile.y); + + cuda_assert(cuParamSeti(cuMetroStep, offset, rtile.w)); + offset += sizeof(rtile.w); + + cuda_assert(cuParamSeti(cuMetroStep, offset, rtile.h)); + offset += sizeof(rtile.h); + + cuda_assert(cuParamSeti(cuMetroStep, offset, rtile.offset)); + offset += sizeof(rtile.offset); + + cuda_assert(cuParamSeti(cuMetroStep, offset, rtile.stride)); + offset += sizeof(rtile.stride); + + cuda_assert(cuParamSetSize(cuMetroStep, offset)); + + /* launch kernel */ + int threads_per_block; + cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuMetroStep)); + + /*int num_registers; + cuda_assert(cuFuncGetAttribute(&num_registers, CU_FUNC_ATTRIBUTE_NUM_REGS, cuPathTrace)); + + printf("threads_per_block %d\n", threads_per_block); + printf("num_registers %d\n", num_registers);*/ + + int blocks = (numChains + threads_per_block - 1) / threads_per_block; + + cuda_assert(cuFuncSetCacheConfig(cuMetroStep, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetBlockShape(cuMetroStep, threads_per_block, 1, 1)); + cuda_assert(cuLaunchGrid(cuMetroStep, blocks, 1)); + + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + } + void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half) { if(have_error()) @@ -1033,21 +1272,98 @@ public: while(task->acquire_tile(this, tile)) { int start_sample = tile.start_sample; int end_sample = tile.start_sample + tile.num_samples; + tile.sample = tile.start_sample; + CDF_2D *importance_map = NULL; for(int sample = start_sample; sample < end_sample; sample++) { + if (task->adaptive && sample >= task->map_interval && (sample % task->map_interval == 0 || !importance_map)) { + if (importance_map) delete importance_map; + build_importance(tile); + importance_map = new CDF_2D(tile.buffers->importance_map + tile.offset + tile.x + tile.y * tile.stride, tile.w, tile.h, tile.stride); + } + + if (task->stopping_threshold > 0.0f && tile_converged(tile, task)) + break; + if (task->get_cancel()) { if(task->need_finish_queue == false) break; } - path_trace(tile, sample, branched); + if (importance_map) { + threads_to_pixels.resize(tile.h*tile.w); + mem_free(threads_to_pixels); + mem_alloc(threads_to_pixels, 
MEM_READ_ONLY); + for (int y = 0; y < tile.h; y++) + for (int x = 0; x < tile.w; x++) { + int px = x, py = y; + importance_map->sample_02_jittered(sample, px, py, px, py); + ((int*) threads_to_pixels.data_pointer)[2*(y*tile.w + x)] = px; + ((int*) threads_to_pixels.data_pointer)[2*(y*tile.w + x) + 1] = py; + } + mem_copy_to(threads_to_pixels); + } + + path_trace(tile, sample, branched, importance_map ? threads_to_pixels.device_pointer : (device_ptr)NULL); + + tile.sample = sample + 1; + + task->update_progress(tile); + } + task->release_tile(tile); + } + } + else if(task->type == DeviceTask::METROPOLIS_TRACE) { + RenderTile tile; + + /* keep rendering tiles until done */ + while(task->acquire_tile(this, tile)) { + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; + tile.sample = tile.start_sample; + if (tile.buffers->metro_shared_params == NULL) { + tile.buffers->metro_shared_params = new double[8]; + memset(tile.buffers->metro_shared_params, 0, sizeof(double)*8); + } + + metropolis_chains.resize(metro_get_size(task->num_metro_chains, task->metro_sample_size)); + if (!metropolis_chains.device_pointer) + mem_alloc(metropolis_chains, MEM_READ_WRITE); + + metropolis_shared_params.resize(8); + if (!metropolis_shared_params.device_pointer) + mem_alloc(metropolis_shared_params, MEM_READ_WRITE); + memcpy((void*) metropolis_shared_params.data_pointer, tile.buffers->metro_shared_params, 8*sizeof(double)); + mem_copy_to(metropolis_shared_params); + + if (start_sample == 0) + metropolis_first_pass(tile, task->num_metro_chains); + + for (int i = 0; i < task->warmup_samples; i++) + metropolis_warmup(tile, i); + + for(int sample = start_sample; sample < end_sample; sample++) { + if (task->get_cancel()) { + if(task->need_finish_queue == false) + break; + } + + int numSteps = (tile.w*tile.h + task->num_metro_chains - 1)/task->num_metro_chains; + for (int i = 0; i < numSteps; i++) + metropolis_step(tile, sample*numSteps + i + 1, task->num_metro_chains); tile.sample = sample + 1; task->update_progress(tile); + + if (task->stopping_threshold > 0.0f && tile_converged(tile, task)) + break; } + mem_copy_from(metropolis_shared_params, 0, 8, 1, sizeof(double)); + memcpy(tile.buffers->metro_shared_params, (void*) metropolis_shared_params.data_pointer, 8*sizeof(double)); task->release_tile(tile); + mem_free(metropolis_chains); } } else if(task->type == DeviceTask::SHADER) { @@ -1139,7 +1455,7 @@ void device_cuda_info(vector& devices) info.extended_images = (major >= 3); info.pack_images = false; - /* if device has a kernel timeout, assume it is used for display */ + /* if device has a KernelGlobals timeout, assume it is used for display */ if(cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num) == CUDA_SUCCESS && attr == 1) { info.display_device = true; display_devices.push_back(info); diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index 8d6f4a4..505cb55 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -17,6 +17,8 @@ #ifndef __DEVICE_MEMORY_H__ #define __DEVICE_MEMORY_H__ +#include + /* Device Memory * * This file defines data types that can be used in device memory arrays, and @@ -48,6 +50,7 @@ enum DataType { TYPE_UINT, TYPE_INT, TYPE_FLOAT, + TYPE_DOUBLE, TYPE_HALF }; @@ -56,6 +59,7 @@ static inline size_t datatype_size(DataType datatype) switch(datatype) { case TYPE_UCHAR: return sizeof(uchar); case TYPE_FLOAT: return sizeof(float); + case TYPE_DOUBLE: return 
sizeof(double); case TYPE_UINT: return sizeof(uint); case TYPE_INT: return sizeof(int); case TYPE_HALF: return sizeof(half); @@ -150,6 +154,11 @@ template<> struct device_type_traits { static const int num_elements = 4; }; +template<> struct device_type_traits { + static const DataType data_type = TYPE_DOUBLE; + static const int num_elements = 1; +}; + template<> struct device_type_traits { static const DataType data_type = TYPE_HALF; static const int num_elements = 4; @@ -176,7 +185,7 @@ public: protected: device_memory() {} - virtual ~device_memory() { assert(!device_pointer); } + virtual ~device_memory() { /*assert(!device_pointer);*/ } /* no copying */ device_memory(const device_memory&); diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp index abfe445..8e88d18 100644 --- a/intern/cycles/device/device_opencl.cpp +++ b/intern/cycles/device/device_opencl.cpp @@ -33,6 +33,7 @@ #include "util_opengl.h" #include "util_path.h" #include "util_time.h" +#include "util_importance.h" CCL_NAMESPACE_BEGIN @@ -1102,8 +1103,11 @@ public: while(task->acquire_tile(this, tile)) { int start_sample = tile.start_sample; int end_sample = tile.start_sample + tile.num_samples; + tile.sample = tile.start_sample; for(int sample = start_sample; sample < end_sample; sample++) { + if (task->stopping_threshold > 0.0f && tile_converged(tile, task)) + break; if(task->get_cancel()) { if(task->need_finish_queue == false) break; diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp index f436b54..49b0c8e 100644 --- a/intern/cycles/device/device_task.cpp +++ b/intern/cycles/device/device_task.cpp @@ -16,6 +16,7 @@ #include #include +#include #include "device_task.h" @@ -70,6 +71,10 @@ void DeviceTask::split(list& tasks, int num, int max_size) for(int i = 0; i < num; i++) tasks.push_back(*this); } + else if(type == METROPOLIS_TRACE) { + for(int i = 0; i < num; i++) + tasks.push_back(*this); + } else { num = min(h, num); @@ -89,7 +94,7 @@ void DeviceTask::split(list& tasks, int num, int max_size) void DeviceTask::update_progress(RenderTile &rtile) { - if (type != PATH_TRACE) + if (type != PATH_TRACE && type != METROPOLIS_TRACE) return; if(update_progress_sample) diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index 9139067..cf087a2 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -34,7 +34,7 @@ class Tile; class DeviceTask : public Task { public: - typedef enum { PATH_TRACE, FILM_CONVERT, SHADER } Type; + typedef enum { PATH_TRACE, METROPOLIS_TRACE, FILM_CONVERT, SHADER } Type; Type type; int x, y, w, h; @@ -64,6 +64,17 @@ public: bool need_finish_queue; bool integrator_branched; + bool integrator_metropolis; + bool importance_equalisation; + int num_metro_chains; + int metro_sample_size; + int warmup_samples; + + bool adaptive; + int map_interval; + float stopping_threshold; + float adaptive_error_power; + bool error_progressive; protected: double last_update_time; }; diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 2ff6b53..d5966eb 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -31,6 +31,7 @@ set(SRC_HEADERS kernel_jitter.h kernel_light.h kernel_math.h + kernel_metropolis.h kernel_montecarlo.h kernel_passes.h kernel_path.h @@ -128,6 +129,7 @@ set(SRC_GEOM_HEADERS set(SRC_UTIL_HEADERS ../util/util_color.h ../util/util_half.h + ../util/util_hash.h ../util/util_math.h 
../util/util_transform.h ../util/util_types.h diff --git a/intern/cycles/kernel/kernel.cpp b/intern/cycles/kernel/kernel.cpp index 42eb9a6..158d529 100644 --- a/intern/cycles/kernel/kernel.cpp +++ b/intern/cycles/kernel/kernel.cpp @@ -106,6 +106,25 @@ void kernel_cpu_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_s kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); } +void kernel_metro_init_chain(KernelGlobals *kg, MetropolisChain *m) { + metro_init_chain(kg, m); +} + +void kernel_cpu_metropolis_first_pass(KernelGlobals *kg, ccl_global float *buffer, double *sharedParams, ccl_global uint *rng_state, int x, int y, int offset, int stride) +{ + kernel_metropolis_first_pass(kg, buffer, sharedParams, rng_state, x, y, offset, stride); +} + +void kernel_cpu_metropolis_warmup(KernelGlobals *kg, ccl_global uint *rng_state, double *sharedParams, int sample, int x, int y, int offset, int stride) { + kernel_metropolis_warmup(kg, rng_state, sharedParams, sample, x, y, offset, stride); +} + +void kernel_cpu_metropolis_step(KernelGlobals *kg, float *rbuffer, MetropolisChain *metro, double *sharedParams, + int x, int y, int w, int h, int offset, int stride) +{ + kernel_metropolis_step(kg, rbuffer, metro, sharedParams, x, y, w, h, offset, stride); +} + /* Film */ void kernel_cpu_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride) diff --git a/intern/cycles/kernel/kernel.cu b/intern/cycles/kernel/kernel.cu index 9208acc..499cae0 100644 --- a/intern/cycles/kernel/kernel.cu +++ b/intern/cycles/kernel/kernel.cu @@ -100,30 +100,99 @@ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride) +kernel_cuda_path_trace(float *buffer, uint *rng_state, int *thread_to_pixel, int sample, int sx, int sy, int sw, int sh, int offset, int stride) { - int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - int y = sy + blockDim.y*blockIdx.y + threadIdx.y; - - if(x < sx + sw && y < sy + sh) - kernel_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride); + if (thread_to_pixel) { + int i = (blockDim.y*blockIdx.y + threadIdx.y) * sw + blockDim.x*blockIdx.x + threadIdx.x; + if (i < sw * sh) { + int x = thread_to_pixel[2*i]; + int y = thread_to_pixel[2*i + 1]; + kernel_path_trace(NULL, buffer, rng_state, sample, x + sx, y + sy, offset, stride); + } + } else { + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + + if(x < sw && y < sh) + kernel_path_trace(NULL, buffer, rng_state, sample, x + sx, y + sy, offset, stride); + } } #ifdef __BRANCHED_PATH__ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS) -kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride) +kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int *thread_to_pixel, int sample, int sx, int sy, int sw, int sh, int offset, int stride) { - int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - int y = sy + blockDim.y*blockIdx.y + threadIdx.y; - - if(x < sx + sw && y < sy + sh) - kernel_branched_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride); + if (thread_to_pixel) { + int i = (blockDim.y*blockIdx.y + threadIdx.y) * sw + blockDim.x*blockIdx.x + threadIdx.x; + if (i < sw * sh) { +
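+ /* thread_to_pixel is built on the host by jittered sampling of the tile's importance map (see the tile loop earlier in this patch), so each thread looks up the tile-local pixel assigned to it instead of deriving it from its own grid position */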
int x = thread_to_pixel[2*i]; + int y = thread_to_pixel[2*i + 1]; + kernel_branched_path_trace(NULL, buffer, rng_state, sample, x + sx, y + sy, offset, stride); + } + } else { + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + + if(x < sw && y < sh) + kernel_branched_path_trace(NULL, buffer, rng_state, sample, x + sx, y + sy, offset, stride); + } } #endif extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_metropolis_first_pass(float *buffer, uint *rng_state, double *sharedParams, MetropolisChain *chains, int sx, int sy, int sw, int sh, int offset, int stride, int numChains) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + + if (x == 0 && y == 0) { + sharedParams[0] = 0; + sharedParams[1] = 0; + sharedParams[2] = 0; + sharedParams[3] = 0; + } + + //Init chains + if (y*sw + x < numChains) { + int chainSize = PRNG_BASE_NUM + (kernel_data.integrator.max_bounce + kernel_data.integrator.transparent_max_bounce + 3)*PRNG_BOUNCE_NUM + 2; + chainSize = sizeof(MetropolisChain) + chainSize*(2*sizeof(int) + 3*sizeof(float)); + chainSize = ((chainSize + 15) / 16) * 16; + metro_init_chain(NULL, (MetropolisChain*) (((char*) chains) + (y*sw + x)*chainSize)); + } + + if(x < sw && y < sh) + kernel_metropolis_first_pass(NULL, buffer, sharedParams, rng_state, x + sx, y + sy, offset, stride); +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_metropolis_warmup(uint *rng_state, double *sharedParams, int sample, int sw, int sh, int offset, int stride) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + + if (x < sw && y < sh) + kernel_metropolis_warmup(NULL, rng_state, sharedParams, sample, x, y, offset, stride); +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_metropolis_step(float *rbuffer, double *sharedParams, MetropolisChain *chains, int numChains, +int sx, int sy, int sw, int sh, int offset, int stride) +{ + int i = blockDim.x*blockIdx.x + threadIdx.x; + if (i < numChains) { + int chainSize = PRNG_BASE_NUM + (kernel_data.integrator.max_bounce + kernel_data.integrator.transparent_max_bounce + 3)*PRNG_BOUNCE_NUM + 2; + chainSize = sizeof(MetropolisChain) + chainSize*(2*sizeof(int) + 3*sizeof(float)); + chainSize = ((chainSize + 15) / 16) * 16; + kernel_metropolis_step(NULL, rbuffer, (MetropolisChain*) (((char*) chains) + i*chainSize), sharedParams, sx, sy, sw, sh, offset, stride); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) { int x = sx + blockDim.x*blockIdx.x + threadIdx.x; diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h index 264e5e3..dc822d5 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/kernel.h @@ -24,6 +24,8 @@ CCL_NAMESPACE_BEGIN struct KernelGlobals; +struct PassData; +struct MetropolisChain; KernelGlobals *kernel_globals_create(); void kernel_globals_free(KernelGlobals *kg); @@ -36,6 +38,12 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t void kernel_cpu_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride); +void 
kernel_cpu_metropolis_first_pass(KernelGlobals *kg, ccl_global float *buffer, double *sharedParams, ccl_global uint *rng_state, + int x, int y, int offset, int stride); +void kernel_cpu_metropolis_warmup(KernelGlobals *kg, ccl_global uint *rng_state, double *sharedParams, int sample, int x, int y, int offset, int stride); +void kernel_metro_init_chain(KernelGlobals *kg, MetropolisChain *m); +void kernel_cpu_metropolis_step(KernelGlobals *kg, float *rbuffer, MetropolisChain *metro, double *sharedParams, + int x, int y, int w, int h, int offset, int stride); void kernel_cpu_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride); void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, @@ -46,6 +54,11 @@ void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride); +void kernel_cpu_sse2_metropolis_step(KernelGlobals *kg, float *rbuffer, MetropolisChain *metro, double *sharedParams, + int x, int y, int w, int h, int offset, int stride); +void kernel_cpu_sse2_metropolis_first_pass(KernelGlobals *kg, ccl_global float *buffer, double *sharedParams, ccl_global uint *rng_state, + int x, int y, int offset, int stride); +void kernel_cpu_sse2_metropolis_warmup(KernelGlobals *kg, ccl_global uint *rng_state, double *sharedParams, int sample, int x, int y, int offset, int stride); void kernel_cpu_sse2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride); void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, @@ -57,6 +70,11 @@ void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 void kernel_cpu_sse3_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride); +void kernel_cpu_sse3_metropolis_step(KernelGlobals *kg, float *rbuffer, MetropolisChain *metro, double *sharedParams, + int x, int y, int w, int h, int offset, int stride); +void kernel_cpu_sse3_metropolis_first_pass(KernelGlobals *kg, ccl_global float *buffer, double *sharedParams, ccl_global uint *rng_state, + int x, int y, int offset, int stride); +void kernel_cpu_sse3_metropolis_warmup(KernelGlobals *kg, ccl_global uint *rng_state, double *sharedParams, int sample, int x, int y, int offset, int stride); void kernel_cpu_sse3_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride); void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, @@ -68,6 +86,11 @@ void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 void kernel_cpu_sse41_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride); +void kernel_cpu_sse41_metropolis_step(KernelGlobals *kg, float *rbuffer, MetropolisChain *metro, double *sharedParams, + int x, int y, int w, int h, int offset, int stride); +void kernel_cpu_sse41_metropolis_first_pass(KernelGlobals *kg, ccl_global float *buffer, double *sharedParams, ccl_global uint *rng_state, + int x, int y, int offset, int stride); +void kernel_cpu_sse41_metropolis_warmup(KernelGlobals *kg, ccl_global uint 
*rng_state, double *sharedParams, int sample, int x, int y, int offset, int stride); void kernel_cpu_sse41_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride); void kernel_cpu_sse41_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, @@ -79,6 +102,11 @@ void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX void kernel_cpu_avx_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride); +void kernel_cpu_avx_metropolis_step(KernelGlobals *kg, float *rbuffer, MetropolisChain *metro, double *sharedParams, + int x, int y, int w, int h, int offset, int stride); +void kernel_cpu_avx_metropolis_first_pass(KernelGlobals *kg, ccl_global float *buffer, double *sharedParams, ccl_global uint *rng_state, + int x, int y, int offset, int stride); +void kernel_cpu_avx_metropolis_warmup(KernelGlobals *kg, ccl_global uint *rng_state, double *sharedParams, int sample, int x, int y, int offset, int stride); void kernel_cpu_avx_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride); void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, @@ -90,6 +118,11 @@ void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 void kernel_cpu_avx2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride); +void kernel_cpu_avx2_metropolis_step(KernelGlobals *kg, float *rbuffer, MetropolisChain *metro, double *sharedParams, + int x, int y, int w, int h, int offset, int stride); +void kernel_cpu_avx2_metropolis_first_pass(KernelGlobals *kg, ccl_global float *buffer, double *sharedParams, ccl_global uint *rng_state, + int x, int y, int offset, int stride); +void kernel_cpu_avx2_metropolis_warmup(KernelGlobals *kg, ccl_global uint *rng_state, double *sharedParams, int sample, int x, int y, int offset, int stride); void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride); void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, diff --git a/intern/cycles/kernel/kernel_avx.cpp b/intern/cycles/kernel/kernel_avx.cpp index d612a82..ed2d4eb 100644 --- a/intern/cycles/kernel/kernel_avx.cpp +++ b/intern/cycles/kernel/kernel_avx.cpp @@ -39,9 +39,12 @@ #include "kernel_film.h" #include "kernel_path.h" #include "kernel_bake.h" +#include "kernel_metropolis.h" CCL_NAMESPACE_BEGIN +extern float s, as; + /* Path Tracing */ void kernel_cpu_avx_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride) @@ -54,6 +57,21 @@ void kernel_cpu_avx_path_trace(KernelGlobals *kg, float *buffer, unsigned int *r kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); } +void kernel_cpu_avx_metropolis_step(KernelGlobals *kg, float *rbuffer, MetropolisChain *metro, double *sharedParams, + int x, int y, int w, int h, int offset, int stride) +{ + kernel_metropolis_step(kg, rbuffer, metro, sharedParams, x, y, w, h, offset, stride); +} + +void kernel_cpu_avx_metropolis_first_pass(KernelGlobals *kg, ccl_global float *buffer, double *sharedParams, ccl_global uint *rng_state, int x, int y, int offset, int stride) +{ + 
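+ /* like the other per-ISA entry points, these AVX wrappers only forward to the shared kernels in kernel_path.h and kernel_metropolis.h, which are recompiled in this translation unit with AVX enabled */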
kernel_metropolis_first_pass(kg, buffer, sharedParams, rng_state, x, y, offset, stride); +} + +void kernel_cpu_avx_metropolis_warmup(KernelGlobals *kg, ccl_global uint *rng_state, double *sharedParams, int sample, int x, int y, int offset, int stride) { + kernel_metropolis_warmup(kg, rng_state, sharedParams, sample, x, y, offset, stride); +} + /* Film */ void kernel_cpu_avx_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride) diff --git a/intern/cycles/kernel/kernel_avx2.cpp b/intern/cycles/kernel/kernel_avx2.cpp index 339421a..a9a5f3b 100644 --- a/intern/cycles/kernel/kernel_avx2.cpp +++ b/intern/cycles/kernel/kernel_avx2.cpp @@ -55,6 +55,21 @@ void kernel_cpu_avx2_path_trace(KernelGlobals *kg, float *buffer, unsigned int * kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); } +void kernel_cpu_avx2_metropolis_step(KernelGlobals *kg, float *rbuffer, MetropolisChain *metro, double *sharedParams, + int x, int y, int w, int h, int offset, int stride) +{ + kernel_metropolis_step(kg, rbuffer, metro, sharedParams, x, y, w, h, offset, stride); +} + +void kernel_cpu_avx2_metropolis_first_pass(KernelGlobals *kg, ccl_global float *buffer, double *sharedParams, ccl_global uint *rng_state, int x, int y, int offset, int stride) +{ + kernel_metropolis_first_pass(kg, buffer, sharedParams, rng_state, x, y, offset, stride); +} + +void kernel_cpu_avx2_metropolis_warmup(KernelGlobals *kg, ccl_global uint *rng_state, double *sharedParams, int sample, int x, int y, int offset, int stride) { + kernel_metropolis_warmup(kg, rng_state, sharedParams, sample, x, y, offset, stride); +} + /* Film */ void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride) diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h index 6b03abe..8c9ebfb 100644 --- a/intern/cycles/kernel/kernel_camera.h +++ b/intern/cycles/kernel/kernel_camera.h @@ -216,7 +216,7 @@ ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float /* Common */ -ccl_device void camera_sample(KernelGlobals *kg, int x, int y, float filter_u, float filter_v, +ccl_device void camera_sample(KernelGlobals *kg, float x, float y, float filter_u, float filter_v, float lens_u, float lens_v, float time, Ray *ray) { /* pixel filter */ diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h index c2aab93..dc49ef3 100644 --- a/intern/cycles/kernel/kernel_compat_cpu.h +++ b/intern/cycles/kernel/kernel_compat_cpu.h @@ -240,6 +240,42 @@ typedef texture_image texture_image_uchar4; #define kernel_data (kg->__data) +#ifdef __GNUC__ +#define atomicCASD(x, y, z) __sync_val_compare_and_swap(x, y, z) +#define atomicCASF(x, y, z) __sync_val_compare_and_swap(x, y, z) +#define atomicExch(x, y) __sync_lock_test_and_set(x, y) +#elif defined(_WIN32) +#define atomicCASD(x, y, z) _InterlockedCompareExchange64((volatile long long int *)x, z, y) +#define atomicCASF(x, y, z) _InterlockedCompareExchange((volatile long int *)x, z, y) +#define atomicExch(x, y) _InterlockedExchange(x, y) +#else +#error No CAS operation for this Compiler! 
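+/* atomicAddD/atomicAddF below emulate an atomic add for double/float on the CPU: read the old bits, add the value, and retry the compare-and-swap until no other thread has modified the slot in between */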
+#endif + +inline double atomicAddD(double* address, double val) +{ + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed, newV; + do { + assumed = old; + *((double*) &newV) = val + *((double*) &assumed); + old = atomicCASD(address_as_ull, assumed, newV); + } while (assumed != old); + return *((double*) &old); +} + +inline float atomicAddF(float* address, float val) +{ + unsigned int* address_as_ull = (unsigned int*)address; + unsigned int old = *address_as_ull, assumed, newV; + do { + assumed = old; + *((float*) &newV) = val + *((float*) &assumed); + old = atomicCASF(address_as_ull, assumed, newV); + } while (assumed != old); + return *((float*) &old); +} + CCL_NAMESPACE_END #endif /* __KERNEL_COMPAT_CPU_H__ */ diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index e4c20d2..1866546 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -82,5 +82,33 @@ typedef texture texture_image_uchar4; #define logf(x) __logf(((float)x)) #define expf(x) __expf(((float)x)) +__device__ __inline__ double atomicAddD(double* address, double val) +{ + unsigned long long int* address_as_ull = + (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + } while (assumed != old); + return __longlong_as_double(old); +} + +__device__ __inline__ float atomicAddF(float* address, float val) +{ + unsigned int* address_as_u = + (unsigned int*)address; + unsigned int old = *address_as_u, assumed; + do { + assumed = old; + old = atomicCAS(address_as_u, assumed, + __float_as_int(val + + __int_as_float(assumed))); + } while (assumed != old); + return __int_as_float(old); +} + #endif /* __KERNEL_COMPAT_CUDA_H__ */ diff --git a/intern/cycles/kernel/kernel_metropolis.h b/intern/cycles/kernel/kernel_metropolis.h new file mode 100644 index 0000000..45e0e84 --- /dev/null +++ b/intern/cycles/kernel/kernel_metropolis.h @@ -0,0 +1,268 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License + */ + +#ifndef __KERNEL_METROPOLIS_H__ +#define __KERNEL_METROPOLIS_H__ +#include "kernel_types.h" +#include "util_hash.h" + +CCL_NAMESPACE_BEGIN + +//Based on SmallLuxGPU Code + +ccl_device_inline float metro_get_large_step_prob(double *sharedParams) { + if (sharedParams[5] == 0) + return 0.5; + double eta_l = sharedParams[2]/sharedParams[3]; + double eta_s = sharedParams[4]/sharedParams[5]; + double eta_0 = sharedParams[6]/sharedParams[7]; + if (eta_l / eta_0 > 0.1) + return clamp(0.5f * (float) (eta_s / (eta_s - eta_l)), 0.1f, 0.9f); + else return 0.25f; +} + +ccl_device float metro_mutate(const float x, const float randomValue) +{ + const float s1 = 1.0f / 512.0f, s2 = 1.0f / 16.0f; + + const float dx = s1 / (s1 / s2 + fabsf(2.0f * randomValue - 1.0f)) - + s1 / (s1 / s2 + 1.0f); + + if (randomValue < 0.5f) { + float mutatedX = x + dx; + return (mutatedX < 1.0f) ? mutatedX : mutatedX - 1.0f; + } else { + float mutatedX = x - dx; + return (mutatedX < 0.0f) ? mutatedX + 1.0f : mutatedX; + } +} + +ccl_device float metro_mutate_scaled(const float x, const float range, const float randomValue) +{ + const float s1 = 32.0f; + + const float dx = range / (s1 / (1.0f + s1) + (s1 * s1) / (1.0f + s1) * + fabsf(2.0f * randomValue - 1.0f)) - range / s1; + + float mutatedX = x; + if (randomValue < 0.5f) { + mutatedX += dx; + return (mutatedX < 1.0f) ? mutatedX : (mutatedX - 1.0f); + } else { + mutatedX -= dx; + return (mutatedX < 0.0f) ? (mutatedX + 1.0f) : mutatedX; + } +} + +ccl_device float metro_get_mcqmc(KernelGlobals *kg, uint rngPos, uint j, float *rot) { + float v = kernel_tex_fetch(__metropolis_mcqmc, (rngPos + j) % METRO_RNG_N) + rot[j]; + return v - floorf(v); +} + +ccl_device float metro_get_sample(KernelGlobals *kg, MetropolisChain *metro, int dimension) +{ + uint sampleStamp = metro->proposedSampleStamps[dimension]; + float s; + if (sampleStamp == 0) { + s = metro_get_mcqmc(kg, metro->rngPos, dimension, metro->rngRotation); + sampleStamp = 1; + } else s = metro->proposedSamples[dimension]; + + for (uint st = sampleStamp; st < metro->proposedStamp; ++st) + s = metro_mutate(s, metro_get_mcqmc(kg, metro->rngPos, dimension, metro->rngRotation)); + + if (s >= 1.f) s = 1.f-FLT_EPSILON; + if (s < 0.f) s = 0.f; + + metro->proposedSamples[dimension] = s; + metro->proposedSampleStamps[dimension] = metro->proposedStamp; + return s; +} + +ccl_device void metro_next_sample(KernelGlobals *kg, MetropolisChain *metro, int width, int height) +{ + metro->rngPos += metro->sampleSize; + if (metro->rngPos >= METRO_RNG_N) + metro->rngPos -= METRO_RNG_N; + if (metro->rngPos == 0) { + for (int j = 0; j < metro->sampleSize; j++) + metro->rngRotation[j] = lcg_step_float(&metro->rng); //Rebuild Cranley-Patterson rotation + } + //Always mutate the image pixels since they are always needed + //The other dimensions are sampled when they are needed to avoid wasting time + for (uint j = 0; j < 2; j++) { + uint sampleStamp = metro->proposedSampleStamps[j]; + float s; + if (sampleStamp == 0) { //Large step ( => new Sample)? 
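+ //a sampleStamp of 0 means this dimension has not been sampled since the last large step, so a fresh base value is drawn from the precomputed MCQMC table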
+ s = metro_get_mcqmc(kg, metro->rngPos, j, metro->rngRotation); + sampleStamp = 1; + } else s = metro->proposedSamples[j]; //Small step (mutate previous sample) + + for (uint st = sampleStamp; st < metro->proposedStamp; ++st) { + float val = metro_get_mcqmc(kg, metro->rngPos, j, metro->rngRotation); + float span = j?height:width; + float mutation_range = min(kernel_data.integrator.image_mutation_range, span); + s = metro_mutate_scaled(s, mutation_range / span, val); + } + + if (s >= 1.f) s = 1.f-FLT_EPSILON; + if (s < 0.f) s = 0.f; + + metro->proposedSamples[j] = s; + metro->proposedSampleStamps[j] = metro->proposedStamp; + } +} + +ccl_device void metro_reject_sample(MetropolisChain *metro) +{ + metro->proposedStamp = metro->currentStamp; + metro->consecRejects++; + for (int i = 0; i < metro->sampleSize; i++) { + metro->proposedSamples[i] = metro->currentSamples[i]; + metro->proposedSampleStamps[i] = metro->currentSampleStamps[i]; + } +} + +ccl_device void metro_accept_sample(MetropolisChain *metro, float newWeight, float newImportance, PassData &pd, float4 &L) +{ + metro->weight = newWeight; + metro->currentStamp = metro->proposedStamp; + metro->currentImportance = newImportance; + metro->consecRejects = 0; + metro->currentSampleResult = pd; + metro->currentL = L; + for (int i = 0; i < metro->sampleSize; i++) { + metro->currentSamples[i] = metro->proposedSamples[i]; + metro->currentSampleStamps[i] = metro->proposedSampleStamps[i]; + } +} + +ccl_device bool metro_consider_sample(KernelGlobals* kg, MetropolisChain *metro, double *sharedParams, float newImportance, float4 L, float *buffer, float *currentBuffer, PassData &pd) +{ + const float meanIntensity = (sharedParams[0] > 0.0) ? (float) (sharedParams[0] / sharedParams[1]) : 1.0f; + + float acceptChance; + if ((metro->currentImportance > 0.0f) && (metro->consecRejects < kernel_data.integrator.max_consecutive_rejects)) + acceptChance = min(1.0f, (float) (newImportance / metro->currentImportance)); + else acceptChance = 1.0f; + + const float newWeight = acceptChance + (metro->isLargeMutation ? 
1.0f : 0.0f); + metro->weight += 1.0f - acceptChance; + + if ((acceptChance == 1.0f) || (lcg_step_float(&metro->rng) < acceptChance)) { //Accept sample + const float norm = metro->weight / (metro->currentImportance / meanIntensity + metro->largeStepProb); + + if (norm > 0.f) + kernel_write_pass_data(kg, &metro->currentSampleResult, currentBuffer, 0xffffffff, norm, false, true, metro->currentL, true, true); + + metro_accept_sample(metro, newWeight, newImportance, pd, L); + return true; + } else { //Reject sample + const float norm = newWeight / (newImportance / meanIntensity + metro->largeStepProb); + + if (norm > 0.f) + kernel_write_pass_data(kg, &pd, buffer, 0xffffffff, norm, false, true, L, true, true); + + metro_reject_sample(metro); + return false; + } +} + +ccl_device void metro_end_sample(KernelGlobals *kg, MetropolisChain *metro, double *sharedParams, int pixelCount) +{ + if (metro->cooldown) { //During cooldown, more large mutations are made + if (sharedParams[1] > pixelCount) { + metro->cooldown = false; + metro->largeStepProb = metro_get_large_step_prob(sharedParams); + } + } else + metro->largeStepProb = metro_get_large_step_prob(sharedParams); + metro->isLargeMutation = (lcg_step_float(&metro->rng) < metro->largeStepProb); + + if (metro->isLargeMutation) { + metro->proposedStamp = 1; + for (int i = 0; i < metro->sampleSize; i++) + metro->proposedSampleStamps[i] = 0; + } else + metro->proposedStamp++; +} + +ccl_device void metro_init_chain(KernelGlobals *kg, MetropolisChain *m) { + m->sampleSize = PRNG_BASE_NUM + (kernel_data.integrator.max_bounce + kernel_data.integrator.transparent_max_bounce + 3)*PRNG_BOUNCE_NUM; + m->rng = lcg_init(hash_int_2d(kernel_data.integrator.seed, (uint) (((unsigned long) m) & 0xffffffff))); + m->rngPos = METRO_RNG_N - m->sampleSize; //Init the position so that it is 0 for the first sample + m->isLargeMutation = true; + m->largeStepProb = 0.5f; + m->cooldown = true; + m->weight = 0.0f; + m->consecRejects = 0; + m->proposedStamp = 1; + m->currentStamp = 1; + m->currentImportance = 0.0f; + for (int i = 0; i < sizeof(PassData) / 4; i++) + ((float*) &m->currentSampleResult)[i] = 0.0f; + m->currentL = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + m->currentSamples = (float*) (m + 1); + m->proposedSamples = m->currentSamples + m->sampleSize; + m->rngRotation = m->proposedSamples + m->sampleSize; + m->currentSampleStamps = (int*) (m->rngRotation + m->sampleSize); + m->proposedSampleStamps = m->currentSampleStamps + m->sampleSize; + //No need to setup current sample arrays since currentImportance = 0 so the first sample is always accepted + for (int i = 0; i < m->sampleSize; i++) + m->proposedSampleStamps[i] = 0; +} + +ccl_device_inline void metro_update_shared(double *sharedParams, double newImportance, bool sampled, bool largeStep, bool accepted, bool nonzero) { + if (largeStep) { + atomicAddD(&sharedParams[0], newImportance); + atomicAddD(&sharedParams[1], 1.0); + } + if (sampled) { + if (largeStep) { + atomicAddD(&sharedParams[2], accepted?1.0:0.0); + atomicAddD(&sharedParams[3], 1.0); + } else { + atomicAddD(&sharedParams[4], accepted?1.0:0.0); + atomicAddD(&sharedParams[5], 1.0); + } + } + atomicAddD(&sharedParams[6], nonzero?1.0:0.0); + atomicAddD(&sharedParams[7], 1.0); +/*#ifdef __KERNEL_CPU__ + while (atomicExch((int*) &sharedParams[3], 1) != 0); + *((char*) &sharedParams[3]) = 0; + double oldMean = (sharedParams[1] > 0)?(sharedParams[0]/sharedParams[1]):0; + sharedParams[0] += newImportance; + sharedParams[1] += 1.0; + sharedParams[2] += (newImportance - 
oldMean) * (newImportance - (sharedParams[0]/sharedParams[1])); +#else + if ((threadIdx.x % 32) == 0) + while (atomicExch((int*) &sharedParams[3], 1) != 0); + for (int i = 0; i < 32; i++) + if (i == (threadIdx.x % 32)) { + *((char*) &sharedParams[3]) = 0; + double oldMean = (sharedParams[1] > 0)?(sharedParams[0]/sharedParams[1]):0; + double newImportanceD = (double) newImportance; + sharedParams[0] += newImportanceD; + sharedParams[1] += 1.0; + sharedParams[2] += (newImportanceD - oldMean) * (newImportanceD - (sharedParams[0]/sharedParams[1])); + } +#endif*/ +} + +CCL_NAMESPACE_END + +#endif \ No newline at end of file diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index b3b6fc0..90ba80d 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -18,24 +18,263 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, int sample, float value) { + if (!buffer) return; ccl_global float *buf = buffer; - *buf = (sample == 0)? value: *buf + value; + if (sample == 0) + *buf = value; + else + *buf += value; +} + +ccl_device_inline void kernel_write_pass_float2(ccl_global float *buffer, int sample, float2 value) +{ + if (!buffer) return; + ccl_global float2 *buf = (ccl_global float2*)buffer; + if (sample == 0) + *buf = value; + else + *buf += value; } ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sample, float3 value) { + if (!buffer) return; ccl_global float3 *buf = (ccl_global float3*)buffer; - *buf = (sample == 0)? value: *buf + value; + if (sample == 0) + *buf = value; + else + *buf += value; } ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sample, float4 value) { + if (!buffer) return; ccl_global float4 *buf = (ccl_global float4*)buffer; - *buf = (sample == 0)? 
value: *buf + value; + if (sample == 0) + *buf = value; + else + *buf += value; +} +ccl_device_inline void kernel_write_pass_float_atomic(ccl_global float *buffer, int sample, float value) +{ + if (!buffer) return; + ccl_global float *buf = buffer; + if (sample == 0) + *buf = value; + else + atomicAddF(buf, value); +} + +ccl_device_inline void kernel_write_pass_float2_atomic(ccl_global float *buffer, int sample, float2 value) +{ + if (!buffer) return; + ccl_global float2 *buf = (ccl_global float2*)buffer; + if (sample == 0) + *buf = value; + else { + atomicAddF(&buf->x, value.x); + atomicAddF(&buf->y, value.y); + } +} + +ccl_device_inline void kernel_write_pass_float3_atomic(ccl_global float *buffer, int sample, float3 value) +{ + if (!buffer) return; + ccl_global float3 *buf = (ccl_global float3*)buffer; + if (sample == 0) + *buf = value; + else { + atomicAddF(&buf->x, value.x); + atomicAddF(&buf->y, value.y); + atomicAddF(&buf->z, value.z); + } +} + +ccl_device_inline void kernel_write_pass_float4_atomic(ccl_global float *buffer, int sample, float4 value) +{ + if (!buffer) return; + ccl_global float4 *buf = (ccl_global float4*)buffer; + if (sample == 0) + *buf = value; + else { + atomicAddF(&buf->x, value.x); + atomicAddF(&buf->y, value.y); + atomicAddF(&buf->z, value.z); + atomicAddF(&buf->w, value.w); + } +} + +ccl_device_inline void kernel_setup_pass_data(KernelGlobals *kg, PassData* pd) { + int flag = kernel_data.film.pass_flag; + if(flag & PASS_DEPTH) + pd->pass_depth = 0.0f; + if(flag & PASS_OBJECT_ID) + pd->pass_object_id = 0.0f; + if(flag & PASS_MATERIAL_ID) + pd->pass_material_id = 0.0f; + + if(flag & PASS_NORMAL) + pd->pass_normal = make_float3(0.0f, 0.0f, 0.0f); + if(flag & PASS_MOTION) + pd->pass_motion = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + if(flag & PASS_MOTION_WEIGHT) + pd->pass_motion_weight = 0.0f; + if(flag & PASS_UV) + pd->pass_uv = make_float3(0.0f, 0.0f, 0.0f); + + if(flag & PASS_DIFFUSE_INDIRECT) + pd->pass_diffuse_indirect = make_float3(0.0f, 0.0f, 0.0f); + if(flag & PASS_GLOSSY_INDIRECT) + pd->pass_glossy_indirect = make_float3(0.0f, 0.0f, 0.0f); + if(flag & PASS_TRANSMISSION_INDIRECT) + pd->pass_transmission_indirect = make_float3(0.0f, 0.0f, 0.0f); + if(flag & PASS_SUBSURFACE_INDIRECT) + pd->pass_subsurface_indirect = make_float3(0.0f, 0.0f, 0.0f); + if(flag & PASS_DIFFUSE_DIRECT) + pd->pass_diffuse_direct = make_float3(0.0f, 0.0f, 0.0f); + if(flag & PASS_GLOSSY_DIRECT) + pd->pass_glossy_direct = make_float3(0.0f, 0.0f, 0.0f); + if(flag & PASS_TRANSMISSION_DIRECT) + pd->pass_transmission_direct = make_float3(0.0f, 0.0f, 0.0f); + if(flag & PASS_SUBSURFACE_DIRECT) + pd->pass_subsurface_direct = make_float3(0.0f, 0.0f, 0.0f); + if(flag & PASS_DIFFUSE_COLOR) + pd->pass_diffuse_color = make_float3(0.0f, 0.0f, 0.0f); + if(flag & PASS_GLOSSY_COLOR) + pd->pass_glossy_color = make_float3(0.0f, 0.0f, 0.0f); + if(flag & PASS_TRANSMISSION_COLOR) + pd->pass_transmission_color = make_float3(0.0f, 0.0f, 0.0f); + if(flag & PASS_SUBSURFACE_COLOR) + pd->pass_subsurface_color = make_float3(0.0f, 0.0f, 0.0f); + + if(flag & PASS_EMISSION) + pd->pass_emission = make_float3(0.0f, 0.0f, 0.0f); + if(flag & PASS_BACKGROUND) + pd->pass_background = make_float3(0.0f, 0.0f, 0.0f); + if(flag & PASS_AO) + pd->pass_ao = make_float3(0.0f, 0.0f, 0.0f); + + if(flag & PASS_SHADOW) + pd->pass_shadow = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + if(flag & PASS_MIST) + pd->pass_mist = 0.0f; } -ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L, - 
ShaderData *sd, int sample, PathState *state, float3 throughput) +#define write_float(p, s, v) {if (write_atomic) kernel_write_pass_float_atomic(p, s, v); else kernel_write_pass_float(p, s, v);} +#define write_float2(p, s, v) {if (write_atomic) kernel_write_pass_float_atomic2(p, s, v); else kernel_write_pass_float2(p, s, v);} +#define write_float3(p, s, v) {if (write_atomic) kernel_write_pass_float_atomic3(p, s, v); else kernel_write_pass_float3(p, s, v);} +#define write_float4(p, s, v) {if (write_atomic) kernel_write_pass_float_atomic4(p, s, v); else kernel_write_pass_float4(p, s, v);} + +ccl_device_inline void kernel_write_pass_data(KernelGlobals *kg, PassData* pd, ccl_global float* buffer, int sample, + float weight, bool writeConstData, bool writeVarData, float4 L, bool writeSamples, bool write_atomic) { + int flag = kernel_data.film.pass_flag; + if(writeConstData) { //Data that is not sample-dependent + if(flag & PASS_DEPTH) + kernel_write_pass_float (buffer + kernel_data.film.pass_depth, + sample, pd->pass_depth * weight); + if(flag & PASS_OBJECT_ID) + kernel_write_pass_float (buffer + kernel_data.film.pass_object_id, + sample, pd->pass_object_id * weight); + if(flag & PASS_MATERIAL_ID) + kernel_write_pass_float (buffer + kernel_data.film.pass_material_id, + sample, pd->pass_material_id * weight); + if(flag & PASS_UV) + kernel_write_pass_float3(buffer + kernel_data.film.pass_uv, + sample, pd->pass_uv * weight); + if(flag & PASS_NORMAL) + kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, + sample, pd->pass_normal * weight); + if(flag & PASS_MOTION) + kernel_write_pass_float4(buffer + kernel_data.film.pass_motion, + sample, pd->pass_motion * weight); + if(flag & PASS_MOTION_WEIGHT) + kernel_write_pass_float (buffer + kernel_data.film.pass_motion_weight, + sample, pd->pass_motion_weight * weight); + } + + if(writeVarData) { //Data that varies according to sample + float s = *(buffer + kernel_data.film.pass_samples); + float3 oldMean; + if (flag & PASS_VARIANCE) + oldMean = (s == 0)?(make_float3(0.0f, 0.0f, 0.0f)):(*((float3*) buffer) / s); + kernel_write_pass_float4(buffer, sample, L * weight); + if(flag & PASS_DIFFUSE_COLOR) + kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_color, + sample, pd->pass_diffuse_color * weight); + if(flag & PASS_GLOSSY_COLOR) + kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_color, + sample, pd->pass_glossy_color * weight); + if(flag & PASS_TRANSMISSION_COLOR) + kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color, + sample, pd->pass_transmission_color * weight); + if(flag & PASS_SUBSURFACE_COLOR) + kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_color, + sample, pd->pass_subsurface_color * weight); + + if(flag & PASS_DIFFUSE_INDIRECT) + kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_indirect, + sample, pd->pass_diffuse_indirect * weight); + if(flag & PASS_GLOSSY_INDIRECT) + kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_indirect, + sample, pd->pass_glossy_indirect * weight); + if(flag & PASS_TRANSMISSION_INDIRECT) + kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_indirect, + sample, pd->pass_transmission_indirect * weight); + if(flag & PASS_SUBSURFACE_INDIRECT) + kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_indirect, + sample, pd->pass_subsurface_indirect * weight); + + if(flag & PASS_DIFFUSE_DIRECT) + kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_direct, + sample, 
pd->pass_diffuse_direct * weight); + if(flag & PASS_GLOSSY_DIRECT) + kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_direct, + sample, pd->pass_glossy_direct * weight); + if(flag & PASS_TRANSMISSION_DIRECT) + kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_direct, + sample, pd->pass_transmission_direct * weight); + if(flag & PASS_SUBSURFACE_DIRECT) + kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_direct, + sample, pd->pass_subsurface_direct * weight); + + if(flag & PASS_EMISSION) + kernel_write_pass_float3(buffer + kernel_data.film.pass_emission, + sample, pd->pass_emission * weight); + if(flag & PASS_BACKGROUND) + kernel_write_pass_float3(buffer + kernel_data.film.pass_background, + sample, pd->pass_background * weight); + if(flag & PASS_AO) + kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, + sample, pd->pass_ao * weight); + + if(flag & PASS_SHADOW && writeSamples) + kernel_write_pass_float4(buffer + kernel_data.film.pass_shadow, + sample, make_float4(1.0f, 1.0f, 1.0f, 1.0f)); + if(flag & PASS_MIST) + kernel_write_pass_float (buffer + kernel_data.film.pass_mist, + sample, pd->pass_mist * weight); + if(flag & PASS_VARIANCE && s > 0.0f) { + float3 L3 = make_float3(L.x, L.y, L.z); + float3 newMean = *((float3*) buffer) / (*(buffer + kernel_data.film.pass_samples) + weight); + kernel_write_pass_float3(buffer + kernel_data.film.pass_variance, + sample, (L3 - newMean) * (L3 - oldMean) * weight); + } + if(flag & PASS_SAMPLES && writeSamples) + kernel_write_pass_float(buffer + kernel_data.film.pass_samples, + sample, weight); + } +} + +#undef write_float +#undef write_float2 +#undef write_float3 +#undef write_float4 + +ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, PathRadiance *L, ShaderData *sd, int sample, + PathState *state, float3 throughput, PassData* pd) { #ifdef __PASSES__ int path_flag = state->flag; @@ -55,32 +294,21 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl { if(sample == 0) { - if(flag & PASS_DEPTH) { - float depth = camera_distance(kg, sd->P); - kernel_write_pass_float(buffer + kernel_data.film.pass_depth, sample, depth); - } - if(flag & PASS_OBJECT_ID) { - float id = object_pass_id(kg, sd->object); - kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, sample, id); - } - if(flag & PASS_MATERIAL_ID) { - float id = shader_pass_id(kg, sd); - kernel_write_pass_float(buffer + kernel_data.film.pass_material_id, sample, id); - } + if(flag & PASS_DEPTH) + pd->pass_depth = camera_distance(kg, sd->P); + if(flag & PASS_OBJECT_ID) + pd->pass_object_id = object_pass_id(kg, sd->object); + if(flag & PASS_MATERIAL_ID) + pd->pass_material_id = shader_pass_id(kg, sd); } - if(flag & PASS_NORMAL) { - float3 normal = sd->N; - kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, sample, normal); - } - if(flag & PASS_UV) { - float3 uv = primitive_uv(kg, sd); - kernel_write_pass_float3(buffer + kernel_data.film.pass_uv, sample, uv); - } + if(flag & PASS_NORMAL) + pd->pass_normal = sd->N; + if(flag & PASS_UV) + pd->pass_uv = primitive_uv(kg, sd); if(flag & PASS_MOTION) { - float4 speed = primitive_motion_vector(kg, sd); - kernel_write_pass_float4(buffer + kernel_data.film.pass_motion, sample, speed); - kernel_write_pass_float(buffer + kernel_data.film.pass_motion_weight, sample, 1.0f); + pd->pass_motion = 
primitive_motion_vector(kg, sd); + pd->pass_motion_weight = 1; } state->flag |= PATH_RAY_SINGLE_PASS_DONE; @@ -123,7 +351,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl #endif } -ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L, int sample) +ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, PathRadiance *L, int sample, PassData* pd) { #ifdef __PASSES__ int flag = kernel_data.film.pass_flag; @@ -132,44 +360,44 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global f return; if(flag & PASS_DIFFUSE_INDIRECT) - kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_indirect, sample, L->indirect_diffuse); + pd->pass_diffuse_indirect = L->indirect_diffuse; if(flag & PASS_GLOSSY_INDIRECT) - kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_indirect, sample, L->indirect_glossy); + pd->pass_glossy_indirect = L->indirect_glossy; if(flag & PASS_TRANSMISSION_INDIRECT) - kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_indirect, sample, L->indirect_transmission); + pd->pass_transmission_indirect = L->indirect_transmission; if(flag & PASS_SUBSURFACE_INDIRECT) - kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_indirect, sample, L->indirect_subsurface); + pd->pass_subsurface_indirect = L->indirect_subsurface; if(flag & PASS_DIFFUSE_DIRECT) - kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_direct, sample, L->direct_diffuse); + pd->pass_diffuse_direct = L->direct_diffuse; if(flag & PASS_GLOSSY_DIRECT) - kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_direct, sample, L->direct_glossy); + pd->pass_glossy_direct = L->direct_glossy; if(flag & PASS_TRANSMISSION_DIRECT) - kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_direct, sample, L->direct_transmission); + pd->pass_transmission_direct = L->direct_transmission; if(flag & PASS_SUBSURFACE_DIRECT) - kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_direct, sample, L->direct_subsurface); + pd->pass_subsurface_direct = L->direct_subsurface; if(flag & PASS_EMISSION) - kernel_write_pass_float3(buffer + kernel_data.film.pass_emission, sample, L->emission); + pd->pass_emission = L->emission; if(flag & PASS_BACKGROUND) - kernel_write_pass_float3(buffer + kernel_data.film.pass_background, sample, L->background); + pd->pass_background = L->background; if(flag & PASS_AO) - kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, sample, L->ao); + pd->pass_ao = L->ao; if(flag & PASS_DIFFUSE_COLOR) - kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_color, sample, L->color_diffuse); + pd->pass_diffuse_color = L->color_diffuse; if(flag & PASS_GLOSSY_COLOR) - kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_color, sample, L->color_glossy); + pd->pass_glossy_color = L->color_glossy; if(flag & PASS_TRANSMISSION_COLOR) - kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color, sample, L->color_transmission); + pd->pass_transmission_color = L->color_transmission; if(flag & PASS_SUBSURFACE_COLOR) - kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_color, sample, L->color_subsurface); + pd->pass_subsurface_color = L->color_subsurface; if(flag & PASS_SHADOW) { float4 shadow = L->shadow; shadow.w = kernel_data.film.pass_shadow_scale; - kernel_write_pass_float4(buffer + kernel_data.film.pass_shadow, sample, shadow); + pd->pass_shadow = shadow; } 
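+ /* the light passes are only cached in PassData here; kernel_write_pass_data writes them to the film buffer once per sample, scaled by the per-sample weight */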
if(flag & PASS_MIST) - kernel_write_pass_float(buffer + kernel_data.film.pass_mist, sample, 1.0f - L->mist); + pd->pass_mist = 1.0f - L->mist; #endif } diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index 0c033f6..18491d6 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -45,6 +45,8 @@ #include "kernel_path_surface.h" #include "kernel_path_volume.h" +#include "kernel_metropolis.h" + CCL_NAMESPACE_BEGIN ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray, @@ -404,7 +406,7 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd } #endif -ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer) +ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, PassData* pd) { /* initialize */ PathRadiance L; @@ -589,7 +591,7 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, #endif /* holdout mask objects do not write data passes */ - kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput); + kernel_write_data_passes(kg, &L, &sd, sample, &state, throughput, pd); /* blurring of bsdf after bounces, for rays that have a small likelihood * of following this particular path (diffuse, rough glossy) */ @@ -654,7 +656,7 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, float3 L_sum = path_radiance_clamp_and_sum(kg, &L); - kernel_write_light_passes(kg, buffer, &L, sample); + kernel_write_light_passes(kg, &L, sample, pd); return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent); } @@ -761,7 +763,7 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, Shade } #endif -ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer) +ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, PassData* pd) { /* initialize */ PathRadiance L; @@ -845,8 +847,8 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in /* scatter sample. 
if we use distance sampling and take just one * sample for direct and indirect light, we could share this * computation, but makes code a bit complex */ - float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE); + float rphase = path_state_rng_1D_for_decision(kg, kernel_data.integrator.metropolis?rng:&tmp_rng, &ps, PRNG_PHASE); + float rscatter = path_state_rng_1D_for_decision(kg, kernel_data.integrator.metropolis?rng:&tmp_rng, &ps, PRNG_SCATTER_DISTANCE); VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, &ps, &pray, &volume_sd, &tp, rphase, rscatter, &volume_segment, NULL, false); @@ -960,7 +962,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #endif /* holdout mask objects do not write data passes */ - kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput); + kernel_write_data_passes(kg, &L, &sd, sample, &state, throughput, pd); #ifdef __EMISSION__ /* emission */ @@ -1039,7 +1041,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in float3 L_sum = path_radiance_clamp_and_sum(kg, &L); - kernel_write_light_passes(kg, buffer, &L, sample); + kernel_write_light_passes(kg, &L, sample, pd); return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent); } @@ -1083,6 +1085,9 @@ ccl_device void kernel_path_trace(KernelGlobals *kg, rng_state += index; buffer += index*pass_stride; + if (kernel_data.film.pass_flag & PASS_SAMPLES) + sample = (int) *(buffer + kernel_data.film.pass_samples); + /* initialize random numbers and ray */ RNG rng; Ray ray; @@ -1091,16 +1096,18 @@ ccl_device void kernel_path_trace(KernelGlobals *kg, /* integrate */ float4 L; + PassData pd; + kernel_setup_pass_data(kg, &pd); if(ray.t != 0.0f) - L = kernel_path_integrate(kg, &rng, sample, ray, buffer); + L = kernel_path_integrate(kg, &rng, sample, ray, &pd); else L = make_float4(0.0f, 0.0f, 0.0f, 0.0f); /* accumulate result in output buffer */ - kernel_write_pass_float4(buffer, sample, L); + kernel_write_pass_data(kg, &pd, buffer, sample, 1.0f, sample == 0, true, L, true, false); - path_rng_end(kg, rng_state, rng); + path_rng_end(kg, rng_state, &rng); } #ifdef __BRANCHED_PATH__ @@ -1115,6 +1122,9 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg, rng_state += index; buffer += index*pass_stride; + if (kernel_data.film.pass_flag & PASS_SAMPLES) + sample = (int) *(buffer + kernel_data.film.pass_samples); + /* initialize random numbers and ray */ RNG rng; Ray ray; @@ -1123,18 +1133,94 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg, /* integrate */ float4 L; + PassData pd; + kernel_setup_pass_data(kg, &pd); if(ray.t != 0.0f) - L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer); + L = kernel_branched_path_integrate(kg, &rng, sample, ray, &pd); else L = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - + /* accumulate result in output buffer */ - kernel_write_pass_float4(buffer, sample, L); + kernel_write_pass_data(kg, &pd, buffer, sample, 1.0f, sample == 0, true, L, true, false); - path_rng_end(kg, rng_state, rng); + path_rng_end(kg, rng_state, &rng); } #endif -CCL_NAMESPACE_END +#ifdef __METROPOLIS__ +ccl_device void kernel_metropolis_first_pass(KernelGlobals *kg, ccl_global float *buffer, double *sharedParams, ccl_global uint *rng_state, int x, int y, int offset, int stride) +{ + int index = offset + x + y*stride; + int pass_stride = kernel_data.film.pass_stride; + rng_state += 
index; + buffer += index*pass_stride; + + RNG rng; + Ray ray; + PassData pd; + kernel_path_trace_setup(kg, rng_state, 0, x, y, &rng, &ray); + kernel_setup_pass_data(kg, &pd); + + float4 L = kernel_path_integrate(kg, &rng, 0, ray, &pd); + + kernel_write_pass_data(kg, &pd, buffer, 0, 1.0f, true, true, make_float4(0.0f, 0.0f, 0.0f, 0.0f), false, false); + path_rng_end(kg, rng_state, &rng); + metro_update_shared(sharedParams, linear_rgb_to_gray(L), false, false, false, !(L == make_float4(0.0f, 0.0f, 0.0f, 0.0f))); +} + +ccl_device void kernel_metropolis_warmup(KernelGlobals *kg, ccl_global uint *rng_state, double *sharedParams, int sample, int x, int y, int offset, int stride) { + int index = offset + x + y*stride; + rng_state += index; + RNG rng; + Ray ray; + PassData pd; + kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray); + kernel_setup_pass_data(kg, &pd); + float4 L = kernel_path_integrate(kg, &rng, sample, ray, &pd); + path_rng_end(kg, rng_state, &rng); + metro_update_shared(sharedParams, linear_rgb_to_gray(L), false, false, false, !(L == make_float4(0.0f, 0.0f, 0.0f, 0.0f))); +} + +ccl_device bool kernel_metropolis_step(KernelGlobals *kg, float *rbuffer, MetropolisChain *metro, double *sharedParams, int sx, int sy, int w, int h, int offset, int stride) +{ + metro_next_sample(kg, metro, w, h); + //Evaluate samples + float4 L; + int x, y, cx, cy; + x = (int) (metro->proposedSamples[0]*w) + sx; + y = (int) (metro->proposedSamples[1]*h) + sy; + cx = (int) (metro->currentSamples[0]*w) + sx; + cy = (int) (metro->currentSamples[1]*h) + sy; + + Ray ray; + PassData pd; + camera_sample(kg, x, y, metro->proposedSamples[0]*w - x + sx, //Fractional parts of the pixel coords + metro->proposedSamples[1]*h - y + sy, + metro_get_sample(kg, metro, PRNG_LENS_U), + metro_get_sample(kg, metro, PRNG_LENS_V), + metro_get_sample(kg, metro, PRNG_TIME), &ray); + kernel_setup_pass_data(kg, &pd); + + if(ray.t != 0.0f) + L = kernel_path_integrate(kg, (RNG*) metro, 0xffffffff, ray, &pd); //0xffffffff means "use metropolis rng" + else + L = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + + float newImportance = linear_rgb_to_gray(L); + /*if ((kernel_data.integrator.map_interval > 0) && sample >= kernel_data.integrator.map_interval) + newImportance *= tile.buffers->importance_map[((int) (m.proposedSamples[1]*tile.h))*tile.w + ((int) (m.proposedSamples[0]*tile.w))];*/ + + float *buffer = rbuffer + (offset + y*stride + x)*kernel_data.film.pass_stride; + float *cbuffer = rbuffer + (offset + cy*stride + cx)*kernel_data.film.pass_stride; + bool accepted = metro_consider_sample(kg, metro, sharedParams, newImportance, L, buffer, cbuffer, pd); + + metro_update_shared(sharedParams, newImportance, true, metro->isLargeMutation, accepted, !(L == make_float4(0.0f, 0.0f, 0.0f, 0.0f))); + + metro_end_sample(kg, metro, sharedParams, w*h); + return accepted; +} +#endif + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h index 236f74c..da93f14 100644 --- a/intern/cycles/kernel/kernel_random.h +++ b/intern/cycles/kernel/kernel_random.h @@ -18,6 +18,10 @@ CCL_NAMESPACE_BEGIN +#ifdef __METROPOLIS__ +ccl_device float metro_get_sample(KernelGlobals *kg, MetropolisChain *metro, int dimension); +#endif + #ifdef __SOBOL__ /* skip initial numbers that are not as well distributed, especially the @@ -100,6 +104,10 @@ ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, cons ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int 
num_samples, int dimension) { +#ifdef __METROPOLIS__ + if(kernel_data.integrator.metropolis && sample == 0xffffffff) + return metro_get_sample(kg, (MetropolisChain*) rng, dimension); +#endif #ifdef __CMJ__ if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { /* correlated multi-jittered */ @@ -134,6 +142,13 @@ ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy) { +#ifdef __METROPOLIS__ + if(kernel_data.integrator.metropolis && sample == 0xffffffff) { + *fx = metro_get_sample(kg, (MetropolisChain*) rng, dimension); + *fy = metro_get_sample(kg, (MetropolisChain*) rng, dimension + 1); + } + else +#endif #ifdef __CMJ__ if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { /* correlated multi-jittered */ @@ -184,7 +199,7 @@ ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_sta #endif } -ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG rng) +ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG* rng) { /* nothing to do */ } @@ -193,14 +208,14 @@ ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG /* Linear Congruential Generator */ -ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension) +ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG* rng, int sample, int num_samples, int dimension) { /* implicit mod 2^32 */ - rng = (1103515245*(rng) + 12345); - return (float)rng * (1.0f/(float)0xFFFFFFFF); + *rng = (1103515245*(*rng) + 12345); + return (float)*rng * (1.0f/(float)0xFFFFFFFF); } -ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension, float *fx, float *fy) +ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG* rng, int sample, int num_samples, int dimension, float *fx, float *fy) { *fx = path_rng_1D(kg, rng, sample, num_samples, dimension); *fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1); @@ -222,10 +237,10 @@ ccl_device void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int } } -ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG rng) +ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG *rng) { /* store state for next sample */ - *rng_state = rng; + *rng_state = *rng; } #endif diff --git a/intern/cycles/kernel/kernel_sse2.cpp b/intern/cycles/kernel/kernel_sse2.cpp index 67bd094..3288c0f 100644 --- a/intern/cycles/kernel/kernel_sse2.cpp +++ b/intern/cycles/kernel/kernel_sse2.cpp @@ -50,6 +50,21 @@ void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int * kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); } +void kernel_cpu_sse2_metropolis_step(KernelGlobals *kg, float *rbuffer, MetropolisChain *metro, double *sharedParams, + int x, int y, int w, int h, int offset, int stride) +{ + kernel_metropolis_step(kg, rbuffer, metro, sharedParams, x, y, w, h, offset, stride); +} + +void kernel_cpu_sse2_metropolis_first_pass(KernelGlobals *kg, ccl_global float *buffer, double *sharedParams, ccl_global uint *rng_state, int x, int y, int offset, int stride) +{ + kernel_metropolis_first_pass(kg, buffer, sharedParams, rng_state, x, y, offset, stride); +} + +void kernel_cpu_sse2_metropolis_warmup(KernelGlobals *kg, ccl_global uint *rng_state, double *sharedParams, 
int sample, int x, int y, int offset, int stride) { + kernel_metropolis_warmup(kg, rng_state, sharedParams, sample, x, y, offset, stride); +} + /* Film */ void kernel_cpu_sse2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride) diff --git a/intern/cycles/kernel/kernel_sse3.cpp b/intern/cycles/kernel/kernel_sse3.cpp index 40d621b..29eb574 100644 --- a/intern/cycles/kernel/kernel_sse3.cpp +++ b/intern/cycles/kernel/kernel_sse3.cpp @@ -52,6 +52,21 @@ void kernel_cpu_sse3_path_trace(KernelGlobals *kg, float *buffer, unsigned int * kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); } +void kernel_cpu_sse3_metropolis_step(KernelGlobals *kg, float *rbuffer, MetropolisChain *metro, double *sharedParams, + int x, int y, int w, int h, int offset, int stride) +{ + kernel_metropolis_step(kg, rbuffer, metro, sharedParams, x, y, w, h, offset, stride); +} + +void kernel_cpu_sse3_metropolis_first_pass(KernelGlobals *kg, ccl_global float *buffer, double *sharedParams, ccl_global uint *rng_state, int x, int y, int offset, int stride) +{ + kernel_metropolis_first_pass(kg, buffer, sharedParams, rng_state, x, y, offset, stride); +} + +void kernel_cpu_sse3_metropolis_warmup(KernelGlobals *kg, ccl_global uint *rng_state, double *sharedParams, int sample, int x, int y, int offset, int stride) { + kernel_metropolis_warmup(kg, rng_state, sharedParams, sample, x, y, offset, stride); +} + /* Film */ void kernel_cpu_sse3_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride) diff --git a/intern/cycles/kernel/kernel_sse41.cpp b/intern/cycles/kernel/kernel_sse41.cpp index 4b48d10..3568045 100644 --- a/intern/cycles/kernel/kernel_sse41.cpp +++ b/intern/cycles/kernel/kernel_sse41.cpp @@ -53,6 +53,21 @@ void kernel_cpu_sse41_path_trace(KernelGlobals *kg, float *buffer, unsigned int kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); } +void kernel_cpu_sse41_metropolis_step(KernelGlobals *kg, float *rbuffer, MetropolisChain *metro, double *sharedParams, + int x, int y, int w, int h, int offset, int stride) +{ + kernel_metropolis_step(kg, rbuffer, metro, sharedParams, x, y, w, h, offset, stride); +} + +void kernel_cpu_sse41_metropolis_first_pass(KernelGlobals *kg, ccl_global float *buffer, double *sharedParams, ccl_global uint *rng_state, int x, int y, int offset, int stride) +{ + kernel_metropolis_first_pass(kg, buffer, sharedParams, rng_state, x, y, offset, stride); +} + +void kernel_cpu_sse41_metropolis_warmup(KernelGlobals *kg, ccl_global uint *rng_state, double *sharedParams, int sample, int x, int y, int offset, int stride) { + kernel_metropolis_warmup(kg, rng_state, sharedParams, sample, x, y, offset, stride); +} + /* Film */ void kernel_cpu_sse41_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride) diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h index f60ed8c..cee1d88 100644 --- a/intern/cycles/kernel/kernel_textures.h +++ b/intern/cycles/kernel/kernel_textures.h @@ -71,6 +71,9 @@ KERNEL_TEX(float, texture_float, __lookup_table) /* sobol */ KERNEL_TEX(uint, texture_uint, __sobol_directions) +/* Metropolis MCQMC */ +KERNEL_TEX(float, texture_float, __metropolis_mcqmc) + /* full-float image */ KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float_000) KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float_001) 
@@ -172,9 +175,9 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_094) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_095) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_096) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_097) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_098) /* Kepler and above */ +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_098) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_099) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_100) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_101) diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index c35aebe..912af3e 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -67,12 +67,14 @@ CCL_NAMESPACE_BEGIN #define __CMJ__ #define __VOLUME__ #define __SHADOW_RECORD_ALL__ +#define __METROPOLIS__ #endif #ifdef __KERNEL_CUDA__ #define __KERNEL_SHADING__ #define __KERNEL_ADV_SHADING__ #define __BRANCHED_PATH__ +#define __METROPOLIS__ /* Experimental on GPU */ //#define __VOLUME__ @@ -203,7 +205,7 @@ enum PathTraceDimension { PRNG_LENS_V = 3, #ifdef __CAMERA_MOTION__ PRNG_TIME = 4, - PRNG_UNUSED_0 = 5, + PRNG_UNUSED_0 = 5, PRNG_UNUSED_1 = 6, /* for some reason (6, 7) is a bad sobol pattern */ PRNG_UNUSED_2 = 7, /* with a low number of samples (< 64) */ #endif @@ -311,6 +313,8 @@ typedef enum PassType { PASS_SUBSURFACE_INDIRECT = 8388608, PASS_SUBSURFACE_COLOR = 16777216, PASS_LIGHT = 33554432, /* no real pass, used to force use_light_pass */ + PASS_VARIANCE = 67108864, + PASS_SAMPLES = 134217728 } PassType; #define PASS_ALL (~0) @@ -352,6 +356,40 @@ typedef struct PathRadiance { float mist; } PathRadiance; +struct PassData { +public: + float pass_depth; + float3 pass_normal; + float4 pass_motion; + float pass_motion_weight; + + float3 pass_uv; + float pass_object_id; + float pass_material_id; + + float3 pass_diffuse_color; + float3 pass_glossy_color; + float3 pass_transmission_color; + float3 pass_subsurface_color; + + float3 pass_diffuse_indirect; + float3 pass_glossy_indirect; + float3 pass_transmission_indirect; + float3 pass_subsurface_indirect; + + float3 pass_diffuse_direct; + float3 pass_glossy_direct; + float3 pass_transmission_direct; + float3 pass_subsurface_direct; + + float3 pass_emission; + float3 pass_background; + float3 pass_ao; + + float4 pass_shadow; + float pass_mist; +}; + typedef struct BsdfEval { int use_light_pass; @@ -815,12 +853,17 @@ typedef struct KernelFilm { int pass_shadow; float pass_shadow_scale; int filter_table_offset; - int pass_pad2; + int pass_variance; int pass_mist; float mist_start; float mist_inv_depth; float mist_falloff; + + int pass_samples; + int pass_pad0; + int pass_pad1; + int pass_pad2; } KernelFilm; typedef struct KernelBackground { @@ -875,6 +918,12 @@ typedef struct KernelIntegrator { float sample_clamp_direct; float sample_clamp_indirect; + /* metropolis path */ + int metropolis; + int max_consecutive_rejects; + float image_mutation_range; + int map_interval; + /* branched path */ int branched; int diffuse_samples; @@ -946,6 +995,29 @@ typedef struct KernelData { KernelTables tables; } KernelData; +struct MetropolisChain { + uint sampleSize; + uint rng, rngPos; + float *rngRotation; + + //Current sampler state + bool isLargeMutation, cooldown; + float weight; + uint consecRejects; + float largeStepProb; + + //Current and proposed samples + uint currentStamp, proposedStamp; + float *currentSamples, 
*proposedSamples; + int *currentSampleStamps, *proposedSampleStamps; + float currentImportance; + PassData currentSampleResult; + float4 currentL; +}; + +#define METRO_RNG_N 1021 +#define METRO_RNG_A 65 + CCL_NAMESPACE_END #endif /* __KERNEL_TYPES_H__ */ diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h index cc4b2e3..e598b67 100644 --- a/intern/cycles/kernel/kernel_volume.h +++ b/intern/cycles/kernel/kernel_volume.h @@ -557,9 +557,9 @@ ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce); if(heterogeneous) - return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput, &tmp_rng); + return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput, kernel_data.integrator.metropolis?rng:&tmp_rng); else - return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, &tmp_rng, true); + return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, kernel_data.integrator.metropolis?rng:&tmp_rng, true); } /* Decoupled Volume Sampling diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index a7abeda..babd866 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -251,9 +251,9 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, case 95: r = kernel_tex_image_interp(__tex_image_095, x, y); break; case 96: r = kernel_tex_image_interp(__tex_image_096, x, y); break; case 97: r = kernel_tex_image_interp(__tex_image_097, x, y); break; - case 98: r = kernel_tex_image_interp(__tex_image_098, x, y); break; #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300) + case 98: r = kernel_tex_image_interp(__tex_image_098, x, y); break; case 99: r = kernel_tex_image_interp(__tex_image_099, x, y); break; case 100: r = kernel_tex_image_interp(__tex_image_100, x, y); break; case 101: r = kernel_tex_image_interp(__tex_image_101, x, y); break; diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp index fc65922f..abbf6a0 100644 --- a/intern/cycles/render/buffers.cpp +++ b/intern/cycles/render/buffers.cpp @@ -23,10 +23,12 @@ #include "util_foreach.h" #include "util_hash.h" #include "util_image.h" +#include "util_importance.h" #include "util_math.h" #include "util_opengl.h" #include "util_time.h" #include "util_types.h" +#include "util_color.h" CCL_NAMESPACE_BEGIN @@ -97,14 +99,25 @@ RenderTile::RenderTile() /* Render Buffers */ -RenderBuffers::RenderBuffers(Device *device_) +RenderBuffers::RenderBuffers(Device *device_, bool use_importance) { device = device_; + metro_shared_params = NULL; + if (use_importance) + importance_map = new float[params.width*params.height]; + else + importance_map = NULL; } RenderBuffers::~RenderBuffers() { device_free(); + if (importance_map) { + delete[] importance_map; + importance_map = NULL; + } + if (metro_shared_params) + delete[] metro_shared_params; } void RenderBuffers::device_free() @@ -142,6 +155,15 @@ void RenderBuffers::reset(Device *device, BufferParams& params_) for(y = 0; y < height; y++) init_state[x + y*width] = hash_int_2d(params.full_x+x, params.full_y+y); + /* allocate importance map */ + if (importance_map) { + delete[] importance_map; + importance_map = new float[params.width*params.height]; + for (x = 0; x < width; x++) + for (y = 0; y < height; y++) + importance_map[x + y*width] = 1.0f; //Default map for first 
pass + } + device->mem_alloc(rng_state, MEM_READ_WRITE); device->mem_copy_to(rng_state); } @@ -156,9 +178,23 @@ bool RenderBuffers::copy_from_device() return true; } +#define samples_pass_val (has_samples_pass?(*(in - pass_offset + sample_pass_offset)):sample) +#define scale ((pass.filter) ? ((samples_pass_val > 0)?(1.0f/samples_pass_val):0.0f): 1.0f) +#define scale_exposure (scale * ((pass.exposure)?exposure:1.0f)) + bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels) { - int pass_offset = 0; + int pass_offset = 0, sample_pass_offset = 0; + bool has_samples_pass = false; + + foreach(Pass& pass, params.passes) { + if (pass.type != PASS_SAMPLES) + sample_pass_offset += pass.components; + else { + has_samples_pass = true; + break; + } + } foreach(Pass& pass, params.passes) { if(pass.type != type) { @@ -169,9 +205,6 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int float *in = (float*)buffer.data_pointer + pass_offset; int pass_stride = params.get_passes_size(); - float scale = (pass.filter)? 1.0f/(float)sample: 1.0f; - float scale_exposure = (pass.exposure)? scale*exposure: scale; - int size = params.width*params.height; if(components == 1) { @@ -180,16 +213,10 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int /* scalar */ if(type == PASS_DEPTH) { for(int i = 0; i < size; i++, in += pass_stride, pixels++) { - float f = *in; + float f = importance_map?sqrt(sqrt(importance_map[i])):*in; pixels[0] = (f == 0.0f)? 1e10f: f*scale_exposure; } } - else if(type == PASS_MIST) { - for(int i = 0; i < size; i++, in += pass_stride, pixels++) { - float f = *in; - pixels[0] = clamp(f*scale_exposure, 0.0f, 1.0f); - } - } else { for(int i = 0; i < size; i++, in += pass_stride, pixels++) { float f = *in; @@ -197,20 +224,62 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int } } } + else if(components == 2) { + assert(pass.components == components); + + for(int i = 0; i < size; i++, in += pass_stride, pixels++) { + float2 f = make_float2(in[0], in[1]); + pixels[0] = f.x*scale_exposure; + pixels[1] = f.y*scale_exposure; + } + } else if(components == 3) { assert(pass.components == 4); /* RGBA */ if(type == PASS_SHADOW) { for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) { - float4 f = make_float4(in[0], in[1], in[2], in[3]); + /*float4 f = make_float4(in[0], in[1], in[2], in[3]); float invw = (f.w > 0.0f)? 
1.0f/f.w: 1.0f; pixels[0] = f.x*invw; pixels[1] = f.y*invw; - pixels[2] = f.z*invw; + pixels[2] = f.z*invw;*/ + float3 f = make_float3(in[0], in[1], in[2]); + + pixels[0] = f.x; + pixels[1] = f.y; + pixels[2] = f.z; } } + else if(pass.type == PASS_DIFFUSE_COLOR) { + /* RGB lighting passes that need to divide out color */ + int pass_offset_v = 0; + int pass_offset_s = 0; + foreach(Pass& color_pass, params.passes) { + if(color_pass.type == PASS_VARIANCE) + break; + pass_offset_v += color_pass.components; + } + foreach(Pass& color_pass, params.passes) { + if(color_pass.type == PASS_SAMPLES) + break; + pass_offset_s += color_pass.components; + } + + float *in_v = (float*)buffer.data_pointer + pass_offset_v; + float *in_s = (float*)buffer.data_pointer + pass_offset_s; + float *in_c = (float*)buffer.data_pointer; + + for(int i = 0; i < size; i++, in += pass_stride, in_v += pass_stride, in_s += pass_stride, in_c += pass_stride, pixels += 3) { + float3 variance = *((float3*)in_v) / (*in_s - 1);//(*((float3*)in_v) - ((*((float3*)in_c)) * (*((float3*)in_c))) / (*in_s)) / (*in_s-1); + float greyVariance = max(linear_rgb_to_gray(variance), 0.0f); + float f = sqrtf(greyVariance / *in_s) * linear_gray_to_inv_tvi(linear_rgb_to_gray(*((float3*)in_c) / *in_s)); + pixels[0] = f; + pixels[1] = f; + pixels[2] = f; + } + } else if(pass.divide_type != PASS_NONE) { /* RGB lighting passes that need to divide out color */ pass_offset = 0; @@ -301,6 +370,10 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int return false; } +#undef samples_pass_val +#undef scale +#undef scale_exposure + /* Display Buffer */ DisplayBuffer::DisplayBuffer(Device *device_, bool linear) diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h index 27ab20b..639ffe2 100644 --- a/intern/cycles/render/buffers.h +++ b/intern/cycles/render/buffers.h @@ -68,12 +68,16 @@ public: /* buffer parameters */ BufferParams params; + double *metro_shared_params; + /* float buffer */ device_vector buffer; /* random number generator state */ device_vector rng_state; + float* importance_map; + void build_importance_map(); - RenderBuffers(Device *device); + RenderBuffers(Device *device, bool use_importance); ~RenderBuffers(); void reset(Device *device, BufferParams& params); @@ -141,6 +145,13 @@ public: device_ptr buffer; device_ptr rng_state; + + + //Used to restore the tile for error-progressive rendering + int t_index; + int t_x, t_y; + int t_device; + float t_priority; RenderBuffers *buffers; diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp index c1aefbc..82f10fc 100644 --- a/intern/cycles/render/film.cpp +++ b/intern/cycles/render/film.cpp @@ -145,7 +145,7 @@ void Pass::add(PassType type, vector& passes) pass.exposure = true; break; case PASS_BACKGROUND: - pass.components = 4; + pass.components = 3; pass.exposure = true; break; case PASS_AO: @@ -158,6 +158,12 @@ void Pass::add(PassType type, vector& passes) case PASS_LIGHT: /* ignores */ break; + case PASS_VARIANCE: + pass.components = 3; + break; + case PASS_SAMPLES: + pass.components = 1; + break; } passes.push_back(pass); @@ -396,10 +402,15 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) kfilm->pass_shadow = kfilm->pass_stride; kfilm->use_light_pass = 1; break; - case PASS_LIGHT: kfilm->use_light_pass = 1; break; + case PASS_VARIANCE: + kfilm->pass_variance = kfilm->pass_stride; + break; + case PASS_SAMPLES: + kfilm->pass_samples = kfilm->pass_stride; + break; case PASS_NONE: break; } @@ 
-411,9 +422,9 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) kfilm->pass_alpha_threshold = pass_alpha_threshold; /* update filter table */ - vector table = filter_table(filter_type, filter_width); - filter_table_offset = scene->lookup_tables->add_table(dscene, table); - kfilm->filter_table_offset = (int)filter_table_offset; + vector table = filter_table(filter_type, filter_width); + filter_table_offset = scene->lookup_tables->add_table(dscene, table); + kfilm->filter_table_offset = (int)filter_table_offset; /* mist pass parameters */ kfilm->mist_start = mist_start; diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h index 535f0ff..210345c 100644 --- a/intern/cycles/render/image.h +++ b/intern/cycles/render/image.h @@ -29,11 +29,11 @@ CCL_NAMESPACE_BEGIN /* generic */ -#define TEX_NUM_IMAGES 94 +#define TEX_NUM_IMAGES 93 #define TEX_IMAGE_BYTE_START TEX_NUM_FLOAT_IMAGES /* extended gpu */ -#define TEX_EXTENDED_NUM_IMAGES_GPU 145 +#define TEX_EXTENDED_NUM_IMAGES_GPU 144 /* extended cpu */ #define TEX_EXTENDED_NUM_FLOAT_IMAGES 1024 diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp index 4a8b490..b78f388 100644 --- a/intern/cycles/render/integrator.cpp +++ b/intern/cycles/render/integrator.cpp @@ -22,6 +22,7 @@ #include "util_foreach.h" #include "util_hash.h" +#include "util_metropolis.h" CCL_NAMESPACE_BEGIN @@ -59,6 +60,8 @@ Integrator::Integrator() mesh_light_samples = 1; subsurface_samples = 1; volume_samples = 1; + max_consecutive_rejects = 512; + image_mutation_range = 0.1f; method = PATH; sampling_pattern = SAMPLING_PATTERN_SOBOL; @@ -116,6 +119,11 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene kintegrator->sample_clamp_direct = (sample_clamp_direct == 0.0f)? FLT_MAX: sample_clamp_direct*3.0f; kintegrator->sample_clamp_indirect = (sample_clamp_indirect == 0.0f)? 
FLT_MAX: sample_clamp_indirect*3.0f; + kintegrator->metropolis = (method == METROPOLIS_PATH); + kintegrator->max_consecutive_rejects = max_consecutive_rejects; + kintegrator->image_mutation_range = image_mutation_range; + kintegrator->map_interval = map_interval; + kintegrator->branched = (method == BRANCHED_PATH); kintegrator->diffuse_samples = diffuse_samples; kintegrator->glossy_samples = glossy_samples; @@ -159,6 +167,10 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene sobol_generate_direction_vectors((uint(*)[SOBOL_BITS])directions, dimensions); device->tex_alloc("__sobol_directions", dscene->sobol_directions); + + float *metro_rng = dscene->metropolis_mcqmc.resize(METRO_RNG_N); + metro_build_rng_table(metro_rng); + device->tex_alloc("__metropolis_mcqmc", dscene->metropolis_mcqmc); need_update = false; } @@ -190,6 +202,9 @@ bool Integrator::modified(const Integrator& integrator) sample_clamp_direct == integrator.sample_clamp_direct && sample_clamp_indirect == integrator.sample_clamp_indirect && method == integrator.method && + max_consecutive_rejects == integrator.max_consecutive_rejects && + image_mutation_range == integrator.image_mutation_range && + map_interval == integrator.map_interval && aa_samples == integrator.aa_samples && diffuse_samples == integrator.diffuse_samples && glossy_samples == integrator.glossy_samples && diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h index 380c1a6..039fdfe 100644 --- a/intern/cycles/render/integrator.h +++ b/intern/cycles/render/integrator.h @@ -53,6 +53,10 @@ public: float sample_clamp_indirect; bool motion_blur; + int max_consecutive_rejects; + float image_mutation_range; + int map_interval; + int aa_samples; int diffuse_samples; int glossy_samples; @@ -66,7 +70,8 @@ public: enum Method { BRANCHED_PATH = 0, - PATH = 1 + PATH = 1, + METROPOLIS_PATH = 2 }; Method method; diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h index e5c7444..0b494fe 100644 --- a/intern/cycles/render/scene.h +++ b/intern/cycles/render/scene.h @@ -106,6 +106,7 @@ public: /* integrator */ device_vector sobol_directions; + device_vector metropolis_mcqmc; /* cpu images */ device_vector tex_image[TEX_EXTENDED_NUM_IMAGES_CPU]; diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index 9fcd9fa..825f6bd 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -27,10 +27,12 @@ #include "util_foreach.h" #include "util_function.h" +#include "util_importance.h" #include "util_math.h" #include "util_opengl.h" #include "util_task.h" #include "util_time.h" +#include "util_system.h" CCL_NAMESPACE_BEGIN @@ -42,7 +44,8 @@ Session::Session(const SessionParams& params_) : params(params_), tile_manager(params.progressive, params.samples, params.tile_size, params.start_resolution, params.background == false || params.progressive_refine, params.background, params.tile_order, - max(params.device.multi_devices.size(), 1)), + params.stopping_threshold, params.map_interval, max(params.device.multi_devices.size(), 1), + params.num_progressive_samples, params.error_progressive), stats() { device_use_gl = ((params.device.type != DEVICE_CPU) && !params.background); @@ -56,7 +59,7 @@ Session::Session(const SessionParams& params_) display = NULL; } else { - buffers = new RenderBuffers(device); + buffers = new RenderBuffers(device, params.importance_equalisation | params.adaptive); display = new DisplayBuffer(device, params.display_buffer_linear); } @@ 
-264,7 +267,7 @@ void Session::run_gpu() thread_scoped_lock buffers_lock(buffers_mutex); /* update status and timing */ - update_status_time(); + update_status_time(false); /* path trace */ path_trace(); @@ -275,7 +278,7 @@ void Session::run_gpu() progress.set_cancel(device->error_message()); /* update status and timing */ - update_status_time(); + update_status_time(false); gpu_need_tonemap = true; gpu_draw_ready = true; @@ -347,7 +350,7 @@ bool Session::draw_cpu(BufferParams& buffer_params, DeviceDrawParams& draw_param bool Session::acquire_tile(Device *tile_device, RenderTile& rtile) { if(progress.get_cancel()) { - if(params.progressive_refine == false) { + if(params.progressive_refine == false || params.error_progressive) { /* for progressive refine current sample should be finished for all tiles */ return false; } @@ -364,10 +367,15 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile) /* fill render tile */ rtile.x = tile_manager.state.buffer.full_x + tile.x; + rtile.t_x = tile.x; rtile.y = tile_manager.state.buffer.full_y + tile.y; + rtile.t_y = tile.y; rtile.w = tile.w; rtile.h = tile.h; - rtile.start_sample = tile_manager.state.sample; + rtile.t_device = tile.device; + rtile.t_index = tile.index; + rtile.t_priority = 1e10f; + rtile.start_sample = params.error_progressive?tile.sample:tile_manager.state.sample; rtile.num_samples = tile_manager.state.num_samples; rtile.resolution = tile_manager.state.resolution_divider; @@ -407,7 +415,7 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile) tilebuffers = tile_buffers[tile.index]; if(tilebuffers == NULL) { - tilebuffers = new RenderBuffers(tile_device); + tilebuffers = new RenderBuffers(tile_device, params.importance_equalisation | params.adaptive); tile_buffers[tile.index] = tilebuffers; tilebuffers->reset(tile_device, buffer_params); @@ -416,7 +424,7 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile) tile_lock.unlock(); } else { - tilebuffers = new RenderBuffers(tile_device); + tilebuffers = new RenderBuffers(tile_device, params.importance_equalisation | params.adaptive); tilebuffers->reset(tile_device, buffer_params); } @@ -439,19 +447,24 @@ void Session::update_tile_sample(RenderTile& rtile) thread_scoped_lock tile_lock(tile_mutex); if(update_render_tile_cb) { - if(params.progressive_refine == false) { + if(params.progressive_refine == false || params.error_progressive) { /* todo: optimize this by making it thread safe and removing lock */ update_render_tile_cb(rtile); } } - update_status_time(); + update_status_time(true); } void Session::release_tile(RenderTile& rtile) { thread_scoped_lock tile_lock(tile_mutex); + if (params.error_progressive) { + rtile.t_priority = tile_error(rtile, params.adaptive_error_power); + if(update_render_tile_cb) + update_render_tile_cb(rtile); + } if(write_render_tile_cb) { if(params.progressive_refine == false) { @@ -462,7 +475,8 @@ void Session::release_tile(RenderTile& rtile) } } - update_status_time(); + update_status_time(true); + tile_manager.finished_tile(Tile(rtile.t_index, rtile.t_x, rtile.t_y, rtile.w, rtile.h, rtile.t_device, rtile.t_priority, rtile.start_sample + rtile.num_samples)); } void Session::run_cpu() @@ -481,6 +495,8 @@ void Session::run_cpu() delayed_reset.do_reset = false; } + bool first_pass = true; + while(!progress.get_cancel()) { /* advance to next tile */ bool no_tiles = !tile_manager.next(); @@ -544,15 +560,24 @@ void Session::run_cpu() if(progress.get_cancel()) break; + + if (params.importance_equalisation && 
params.metropolis && !first_pass && !tile_manager.done()) { + buffers->build_importance_map(); + } + + first_pass = false; /* update status and timing */ - update_status_time(); + update_status_time(false); /* path trace */ - path_trace(); + if (params.metropolis) + metropolis_trace(); + else + path_trace(); /* update status and timing */ - update_status_time(); + update_status_time(false); if(!params.background) need_tonemap = true; @@ -765,7 +790,7 @@ void Session::update_scene() } } -void Session::update_status_time(bool show_pause, bool show_done) +void Session::update_status_time(bool tile_locked, bool show_pause, bool show_done) { int sample = tile_manager.state.sample; int resolution = tile_manager.state.resolution_divider; @@ -803,6 +828,19 @@ void Session::update_status_time(bool show_pause, bool show_done) substatus += string_printf(", Sample %d/%d", sample, num_samples); } } + else if(params.error_progressive) { + float worstErr; + if (tile_locked) + worstErr = tile_manager.state.tiles.top().priority; + else { + thread_scoped_lock tile_lock(tile_mutex); + worstErr = tile_manager.state.tiles.top().priority; + } + if (worstErr == 1e10f) + substatus = string_printf("Path Tracing First Pass"); + else + substatus = string_printf("Path Tracing Max Error %f", worstErr); + } else if(tile_manager.num_samples == USHRT_MAX) substatus = string_printf("Path Tracing Sample %d", sample+1); else @@ -838,6 +876,37 @@ void Session::update_progress_sample() progress.increment_sample(); } +void Session::metropolis_trace() +{ + /* add path trace task */ + DeviceTask task(DeviceTask::METROPOLIS_TRACE); + + task.acquire_tile = function_bind(&Session::acquire_tile, this, _1, _2); + task.release_tile = function_bind(&Session::release_tile, this, _1); + task.get_cancel = function_bind(&Progress::get_cancel, &this->progress); + task.update_tile_sample = function_bind(&Session::update_tile_sample, this, _1); + task.update_progress_sample = function_bind(&Session::update_progress_sample, this); + task.need_finish_queue = params.progressive_refine && !params.error_progressive; + task.integrator_branched = scene->integrator->method == Integrator::BRANCHED_PATH; + task.integrator_metropolis = params.metropolis; + task.adaptive = params.adaptive; + task.stopping_threshold = params.stopping_threshold; + task.adaptive_error_power = params.adaptive_error_power; + task.map_interval = params.map_interval; + task.importance_equalisation = params.importance_equalisation; + task.error_progressive = params.error_progressive; + task.warmup_samples = params.warmup_samples; + if (params.num_metro_chains == 0) { + if (device->info.type == DEVICE_CUDA) + task.num_metro_chains = 8192; + else + task.num_metro_chains = 1; + } else task.num_metro_chains = params.num_metro_chains; + task.metro_sample_size = PRNG_BASE_NUM + (scene->dscene.data.integrator.max_bounce + scene->dscene.data.integrator.transparent_max_bounce + 3)*PRNG_BOUNCE_NUM; + + device->task_add(task); +} + void Session::path_trace() { /* add path trace task */ @@ -848,8 +917,14 @@ void Session::path_trace() task.get_cancel = function_bind(&Progress::get_cancel, &this->progress); task.update_tile_sample = function_bind(&Session::update_tile_sample, this, _1); task.update_progress_sample = function_bind(&Session::update_progress_sample, this); - task.need_finish_queue = params.progressive_refine; + task.need_finish_queue = params.progressive_refine && !params.error_progressive; task.integrator_branched = scene->integrator->method == Integrator::BRANCHED_PATH; + 
task.integrator_metropolis = false; + task.adaptive = params.adaptive; + task.stopping_threshold = params.stopping_threshold; + task.adaptive_error_power = params.adaptive_error_power; + task.map_interval = params.map_interval; + task.error_progressive = params.error_progressive; device->task_add(task); } @@ -893,7 +968,7 @@ bool Session::update_progressive_refine(bool cancel) return false; } - if(params.progressive_refine) { + if(params.progressive_refine && !params.error_progressive) { foreach(RenderBuffers *buffers, tile_buffers) { RenderTile rtile; rtile.buffers = buffers; diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h index 9da7a0aa..d84a474 100644 --- a/intern/cycles/render/session.h +++ b/intern/cycles/render/session.h @@ -44,8 +44,19 @@ public: DeviceInfo device; bool background; bool progressive_refine; + int num_progressive_samples; string output_path; + bool adaptive; + int map_interval; + float stopping_threshold; + float adaptive_error_power; + bool metropolis; + bool importance_equalisation; + bool error_progressive; + int num_metro_chains; + int warmup_samples; + bool progressive; bool experimental; int samples; @@ -92,6 +103,11 @@ public: && progressive_refine == params.progressive_refine && output_path == params.output_path /* && samples == params.samples */ + && adaptive == params.adaptive + && map_interval == params.map_interval + && stopping_threshold == params.stopping_threshold + && metropolis == params.metropolis + && importance_equalisation == params.importance_equalisation && progressive == params.progressive && experimental == params.experimental && tile_size == params.tile_size @@ -152,10 +168,11 @@ protected: void run(); - void update_status_time(bool show_pause = false, bool show_done = false); + void update_status_time(bool tile_locked, bool show_pause = false, bool show_done = false); void tonemap(int sample); void path_trace(); + void metropolis_trace(); void reset_(BufferParams& params, int samples); void run_cpu(); diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp index d6094a4..c9c4d29 100644 --- a/intern/cycles/render/tile.cpp +++ b/intern/cycles/render/tile.cpp @@ -22,15 +22,22 @@ CCL_NAMESPACE_BEGIN TileManager::TileManager(bool progressive_, int num_samples_, int2 tile_size_, int start_resolution_, - bool preserve_tile_device_, bool background_, TileOrder tile_order_, int num_devices_) + bool preserve_tile_device_, bool background_, TileOrder tile_order_, + float error_tolerance_, int map_interval_, int num_devices_, + int num_progressive_samples_, bool error_progressive_) { progressive = progressive_; + error_progressive = error_progressive_; + num_progressive_samples = num_progressive_samples_; + current_progressive_samples = 1; tile_size = tile_size_; tile_order = tile_order_; start_resolution = start_resolution_; num_devices = num_devices_; preserve_tile_device = preserve_tile_device_; background = background_; + error_tolerance = error_tolerance_; + map_interval = map_interval_; BufferParams buffer_params; reset(buffer_params, 0); @@ -57,7 +64,7 @@ void TileManager::reset(BufferParams& params_, int num_samples_) } num_samples = num_samples_; - + current_progressive_samples = 1; state.buffer = BufferParams(); state.sample = -1; state.num_tiles = 0; @@ -148,10 +155,10 @@ void TileManager::set_tiles() int image_w = max(1, params.width/resolution); int image_h = max(1, params.height/resolution); - if(background) - gen_tiles_global(); - else - gen_tiles_sliced(); + if(background) + 
gen_tiles_global(); + else + gen_tiles_sliced(); state.num_tiles = state.tiles.size(); @@ -164,9 +171,9 @@ void TileManager::set_tiles() state.buffer.full_height = max(1, params.full_height/resolution); } -list::iterator TileManager::next_viewport_tile(int device) +PriorityQueue::iterator TileManager::next_viewport_tile(int device) { - list::iterator iter; + PriorityQueue::iterator iter; int logical_device = preserve_tile_device? device: 0; @@ -178,9 +185,9 @@ list::iterator TileManager::next_viewport_tile(int device) return state.tiles.end(); } -list::iterator TileManager::next_background_tile(int device, TileOrder tile_order) +PriorityQueue::iterator TileManager::next_background_tile(int device, TileOrder tile_order) { - list::iterator iter, best = state.tiles.end(); + PriorityQueue::iterator iter, best = state.tiles.end(); int resolution = state.resolution_divider; int logical_device = preserve_tile_device? device: 0; @@ -232,7 +239,28 @@ list::iterator TileManager::next_background_tile(int device, TileOrder til bool TileManager::next_tile(Tile& tile, int device) { - list::iterator tile_it; + int logical_device = preserve_tile_device? device: 0; + if (error_progressive) { + list wrongDevice; + while (!state.tiles.empty()) { + Tile t = state.tiles.top(); + state.tiles.pop(); + if (t.device != logical_device) + wrongDevice.push_back(t); + else { + for(list::iterator iter = wrongDevice.begin(); iter != wrongDevice.end(); iter++) + state.tiles.push(*iter); + if (t.priority <= error_tolerance) //Done since worst error (for this device) is below threshold + return false; + tile = t; + return true; + } + } + for(list::iterator iter = wrongDevice.begin(); iter != wrongDevice.end(); iter++) + state.tiles.push(*iter); + return false; + } + PriorityQueue::iterator tile_it; if (background) tile_it = next_background_tile(device, tile_order); @@ -250,9 +278,17 @@ bool TileManager::next_tile(Tile& tile, int device) return false; } +void TileManager::finished_tile(Tile tile) { + if (error_progressive) + state.tiles.push(tile); +} + bool TileManager::done() { - return (state.sample+state.num_samples >= num_samples && state.resolution_divider == 1); + if (error_progressive) + return (state.tiles.empty())?false:state.tiles.top().priority <= error_tolerance; + else + return (state.sample+state.num_samples >= num_samples && state.resolution_divider == 1); } bool TileManager::next() @@ -260,17 +296,28 @@ bool TileManager::next() if(done()) return false; - if(progressive && state.resolution_divider > 1) { + if (error_progressive) { + state.sample = 0; + state.num_samples = map_interval; + state.resolution_divider = 1; + set_tiles(); + } + else if(progressive && state.resolution_divider > 1) { state.sample = 0; state.resolution_divider /= 2; state.num_samples = 1; set_tiles(); } else { - state.sample++; + if(progressive) + state.sample += current_progressive_samples; + else + state.sample++; + + current_progressive_samples = min(min(current_progressive_samples * 2, num_progressive_samples), num_samples - state.sample); if(progressive) - state.num_samples = 1; + state.num_samples = current_progressive_samples; else state.num_samples = num_samples; diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h index 7796518..b731237 100644 --- a/intern/cycles/render/tile.h +++ b/intern/cycles/render/tile.h @@ -18,6 +18,7 @@ #define __TILE_H__ #include +#include #include "buffers.h" #include "util_list.h" @@ -32,12 +33,20 @@ public: int x, y, w, h; int device; bool rendering; + float priority; + int 
sample; Tile() {} - Tile(int index_, int x_, int y_, int w_, int h_, int device_) - : index(index_), x(x_), y(y_), w(w_), h(h_), device(device_), rendering(false) {} + Tile(int index_, int x_, int y_, int w_, int h_, int device_, float priority_ = 1e10f, int sample_ = 0) + : index(index_), x(x_), y(y_), w(w_), h(h_), device(device_), rendering(false), priority(priority_), sample(sample_) {} + + bool operator<(const Tile &b) const { + if (priority < b.priority) return true; + if (priority == b.priority) return index > b.index; + return false; + } }; /* Tile order */ @@ -64,19 +73,22 @@ public: int resolution_divider; int num_tiles; int num_rendered_tiles; - list tiles; + PriorityQueue tiles; } state; int num_samples; TileManager(bool progressive, int num_samples, int2 tile_size, int start_resolution, - bool preserve_tile_device, bool background, TileOrder tile_order, int num_devices = 1); + bool preserve_tile_device, bool background, TileOrder tile_order, float error_tolerance, + int map_interval, int num_devices = 1, + int num_progressive_samples = 1, bool error_progressive = false); ~TileManager(); void reset(BufferParams& params, int num_samples); void set_samples(int num_samples); bool next(); bool next_tile(Tile& tile, int device = 0); + void finished_tile(Tile tile); bool done(); void set_tile_order(TileOrder tile_order_) { tile_order = tile_order_; } @@ -85,10 +97,16 @@ protected: void set_tiles(); bool progressive; + bool error_progressive; + bool error_first_pass_done; + int current_progressive_samples; + int num_progressive_samples; int2 tile_size; TileOrder tile_order; int start_resolution; int num_devices; + float error_tolerance; + int map_interval; /* in some cases it is important that the same tile will be returned for the same * device it was originally generated for (i.e. viewport rendering when buffer is @@ -116,10 +134,10 @@ protected: void gen_tiles_sliced(); /* returns tiles for background render */ - list::iterator next_background_tile(int device, TileOrder tile_order); + PriorityQueue::iterator next_background_tile(int device, TileOrder tile_order); /* returns first unhandled tile for viewport render */ - list::iterator next_viewport_tile(int device); + PriorityQueue::iterator next_viewport_tile(int device); }; CCL_NAMESPACE_END diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index 01b5675..40f514d 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -1,6 +1,9 @@ set(INC . 
+ ../kernel + ../render + ../device ) set(INC_SYS @@ -12,6 +15,7 @@ set(SRC util_cache.cpp util_cuda.cpp util_dynlib.cpp + util_importance.cpp util_md5.cpp util_opencl.cpp util_path.cpp @@ -42,10 +46,12 @@ set(SRC_HEADERS util_half.h util_hash.h util_image.h + util_importance.h util_list.h util_map.h util_math.h util_md5.h + util_metropolis.h util_opencl.h util_opengl.h util_optimization.h diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h index 48e9e2d..37c5ae9 100644 --- a/intern/cycles/util/util_color.h +++ b/intern/cycles/util/util_color.h @@ -239,6 +239,30 @@ ccl_device float linear_rgb_to_gray(float3 c) return c.x*0.2126f + c.y*0.7152f + c.z*0.0722f; } +ccl_device float linear_rgb_to_gray(float4 c) +{ + return c.x*0.2126f + c.y*0.7152f + c.z*0.0722f; +} + +ccl_device float linear_gray_to_inv_tvi(float v) { + if (v < 0.000001) //Safety check for v=0 + return exp(2.86f * log(10.0f)); + + float log_v = log10(v); + float log_i; + if (log_v < -3.94f) + log_i = -2.86f; + else if (log_v < -1.44f) + log_i = pow(0.405f*log_v + 1.6f, 2.18f) - 2.86f; + else if (log_v < -0.0184f) + log_i = log_v - 0.395f; + else if (log_v < 1.9f) + log_i = pow(0.249f*log_v + 0.65f, 2.7f) - 0.72f; + else + log_i = log_v - 1.255f; + return exp(-log_i * log(10.0f)); +} + CCL_NAMESPACE_END #endif /* __UTIL_COLOR_H__ */ diff --git a/intern/cycles/util/util_hash.h b/intern/cycles/util/util_hash.h index edd2448..05ac712 100644 --- a/intern/cycles/util/util_hash.h +++ b/intern/cycles/util/util_hash.h @@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN -static inline uint hash_int_2d(uint kx, uint ky) +ccl_device uint hash_int_2d(uint kx, uint ky) { #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k)))) @@ -44,7 +44,7 @@ static inline uint hash_int_2d(uint kx, uint ky) #undef rot } -static inline uint hash_int(uint k) +ccl_device uint hash_int(uint k) { return hash_int_2d(k, 0); } diff --git a/intern/cycles/util/util_importance.cpp b/intern/cycles/util/util_importance.cpp new file mode 100644 index 0000000..587c7d7 --- /dev/null +++ b/intern/cycles/util/util_importance.cpp @@ -0,0 +1,301 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License + */ + +#include +#include "kernel_types.h" +#include "util_importance.h" +#include "util_hash.h" +#include "util_color.h" +#include "util_foreach.h" +#include "buffers.h" +#include "tile.h" + +#include +using namespace std; + +CCL_NAMESPACE_BEGIN + +#define DATA_ADDR(x, y) ((offset + x + x_ofs + (y + y_ofs)*stride)*pass_stride) +#define PASS_ADDR(x, y, pass) ((y * width + x) * pixel_stride + pass) + +//Precomputed gaussian blur with 7-pixel radius +static const float BlurringKernel[] = {0.30406f, 0.22855f, 0.09681f, 0.02314f}; + +void build_importance(RenderTile &rtile) +{ + if (!rtile.buffers->copy_from_device()) return; + + int width = rtile.w, height = rtile.h; + float *temp_importance = new float[width * height]; + int pass_stride = rtile.buffers->params.get_passes_size(); + float *passes = (float*)rtile.buffers->buffer.data_pointer + (rtile.offset + rtile.x + rtile.y * rtile.stride) * pass_stride; + float *importance = rtile.buffers->importance_map + rtile.offset + rtile.x + rtile.y * rtile.stride; + + int samples_pass = 0, variance_pass = 0; + bool found_samples_pass = false, found_variance_pass = false; + foreach(Pass& pass, rtile.buffers->params.passes) { + if (pass.type != PASS_SAMPLES) + samples_pass += pass.components; + else { + found_samples_pass = true; + break; + } + } + assert(found_samples_pass); + foreach(Pass& pass, rtile.buffers->params.passes) { + if (pass.type != PASS_VARIANCE) + variance_pass += pass.components; + else { + found_variance_pass = true; + break; + } + } + assert(found_variance_pass); + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++, passes += pass_stride, importance++) { + float3 *combinedPass, *variancePass; + float *samplesPass; + combinedPass = (float3*) passes; + variancePass = (float3*) (passes + variance_pass); + samplesPass = passes + samples_pass; + + float factor = 1.0f / *samplesPass; + float intensityFactor = linear_gray_to_inv_tvi(linear_rgb_to_gray(*combinedPass * factor)); + + float3 variance = (*variancePass) / (*samplesPass - 1);//(*variancePass - ((*combinedPass) * (*combinedPass)) / *samplesPass) * factor; + + /* + * sqrt(variance) gives standard deviation, so sqrt(variance) * intensityFactor gives perceptually-weighted standard deviation (PWSD) + * However, since MC converges with O(1/sqrt(N)), the PWSD must be squared again + * (to decrease the error by a factor of 2, the sample smust be increased by a factor of 4) + * So (sqrt(variance) * IF)^2 is the correct sampling importance, which simplifies to variance*IV^2 + * */ + *importance = linear_rgb_to_gray(variance) * intensityFactor * intensityFactor; + } + importance += rtile.stride - width; + passes += (rtile.stride - width) * pass_stride; + } + importance = rtile.buffers->importance_map; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + temp_importance[y*width + x] = BlurringKernel[3]*importance[rtile.offset + rtile.x + (y+rtile.y)*rtile.stride + ((x > 2) ? (x - 3) : x)]; + temp_importance[y*width + x] = BlurringKernel[2]*importance[rtile.offset + rtile.x + (y+rtile.y)*rtile.stride + ((x > 1) ? (x - 2) : x)]; + temp_importance[y*width + x] += BlurringKernel[1]*importance[rtile.offset + rtile.x + (y+rtile.y)*rtile.stride + ((x > 0) ? 
(x - 1) : x)]; + temp_importance[y*width + x] += BlurringKernel[0]*importance[rtile.offset + rtile.x + (y+rtile.y)*rtile.stride + x ]; + temp_importance[y*width + x] += BlurringKernel[1]*importance[rtile.offset + rtile.x + (y+rtile.y)*rtile.stride + (((width - x) > 1) ? (x + 1) : x)]; + temp_importance[y*width + x] += BlurringKernel[2]*importance[rtile.offset + rtile.x + (y+rtile.y)*rtile.stride + (((width - x) > 2) ? (x + 2) : x)]; + temp_importance[y*width + x] += BlurringKernel[3]*importance[rtile.offset + rtile.x + (y+rtile.y)*rtile.stride + (((width - x) > 3) ? (x + 3) : x)]; + } + } + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + importance[rtile.offset + rtile.x + (y+rtile.y)*rtile.stride + x] = BlurringKernel[3]*temp_importance[ ((y > 2) ? (y - 3) : y)*width + x]; + importance[rtile.offset + rtile.x + (y+rtile.y)*rtile.stride + x] = BlurringKernel[2]*temp_importance[ ((y > 1) ? (y - 2) : y)*width + x]; + importance[rtile.offset + rtile.x + (y+rtile.y)*rtile.stride + x] += BlurringKernel[1]*temp_importance[ ((y > 0) ? (y - 1) : y)*width + x]; + importance[rtile.offset + rtile.x + (y+rtile.y)*rtile.stride + x] += BlurringKernel[0]*temp_importance[ y *width + x]; + importance[rtile.offset + rtile.x + (y+rtile.y)*rtile.stride + x] += BlurringKernel[1]*temp_importance[(((height - y) > 1) ? (y + 1) : y)*width + x]; + importance[rtile.offset + rtile.x + (y+rtile.y)*rtile.stride + x] += BlurringKernel[2]*temp_importance[(((height - y) > 2) ? (y + 2) : y)*width + x]; + importance[rtile.offset + rtile.x + (y+rtile.y)*rtile.stride + x] += BlurringKernel[3]*temp_importance[(((height - y) > 3) ? (y + 3) : y)*width + x]; + } + } + delete[] temp_importance; +} + +void RenderBuffers::build_importance_map() +{ + int samples_pass = 0, pixel_stride; + bool found_samples_pass = false; + foreach(Pass& pass, params.passes) { + if (pass.type != PASS_SAMPLES) + samples_pass += pass.components; + else { + found_samples_pass = true; + break; + } + } + assert(found_samples_pass); + pixel_stride = params.get_passes_size(); + float* ptr = (float*) buffer.data_pointer; + for (int y = 0; y < params.height; y++) + for (int x = 0; x < params.width; x++) { + if (*(ptr + samples_pass) > 0.0f) + importance_map[y*params.width + x] = linear_gray_to_inv_tvi(linear_rgb_to_gray(*((float4*) ptr) / *(ptr + samples_pass))); + else importance_map[y*params.width + x] = 10.0f; + ptr += pixel_stride; + } + + float *temp_importance = new float[params.width * params.height]; + for (int y = 0; y < params.height; y++) { + for (int x = 0; x < params.width; x++) { + temp_importance[y*params.width + x] = BlurringKernel[0]*importance_map[y*params.width + ((x > 1) ? (x - 2) : x)]; + temp_importance[y*params.width + x] += BlurringKernel[1]*importance_map[y*params.width + ((x > 0) ? (x - 1) : x)]; + temp_importance[y*params.width + x] += BlurringKernel[2]*importance_map[y*params.width + x ]; + temp_importance[y*params.width + x] += BlurringKernel[3]*importance_map[y*params.width + (((params.width - x) > 1) ? (x + 1) : x)]; + temp_importance[y*params.width + x] += BlurringKernel[4]*importance_map[y*params.width + (((params.width - x) > 1) ? (x + 1) : x)]; + } + } + for (int y = 0; y < params.height; y++) { + for (int x = 0; x < params.width; x++) { + importance_map[y*params.width + x] = BlurringKernel[0]*temp_importance[ ((y > 1) ? (y - 2) : y)*params.width + x]; + importance_map[y*params.width + x] += BlurringKernel[1]*temp_importance[ ((y > 0) ? 
(y - 1) : y)*params.width + x]; + importance_map[y*params.width + x] += BlurringKernel[2]*temp_importance[ y *params.width + x]; + importance_map[y*params.width + x] += BlurringKernel[3]*temp_importance[(((params.height - y) > 1) ? (y + 1) : y)*params.width + x]; + importance_map[y*params.width + x] += BlurringKernel[4]*temp_importance[(((params.height - y) > 1) ? (y + 1) : y)*params.width + x]; + } + } + delete[] temp_importance; +} + +CDF_2D::CDF_2D(float *in_data, int width_, int height_, int stride) +{ + width = width_; + height = height_; + marginal = new float[height]; + data = new float[width*height]; + + for (int y = 0; y < height; y++) { + float *in_row = in_data + y*stride; + float *row = data + y*width; + + row[0] = in_row[0]; + for (int x = 1; x < width; x++) + row[x] = row[x-1] + in_row[x]; + + marginal[y] = row[width-1] + ((y == 0) ? 0.0f : marginal[y-1]); + + float scale = 1.0f / row[width-1]; + for (int x = 0; x < width; x++) + row[x] *= scale; + } + + float scale = 1.0f / marginal[height-1]; + for (int y = 0; y < height; y++) + marginal[y] *= scale; +} + +CDF_2D::~CDF_2D() +{ + if (marginal) delete[] marginal; + if (data) delete[] data; +} + +void CDF_2D::sample(float u, float v, int &x, int &y) +{ + y = std::lower_bound(marginal, marginal + height, v) - marginal; + x = std::lower_bound(data + y*width, data + (y+1)*width, u) - data - y*width; +} + +void CDF_2D::eval_02(uint i, uint rotation, float &u, float &v) +{ + uint r = 0, i_ = i; + for(uint va = 1U << 31; i_; i_ >>= 1, va ^= va >> 1) + if(i_ & 1) + r ^= va; + v = (float)r * (1.0f/(float)0xFFFFFFFF) + (rotation & 0xFFFF) * (1.0f/(float)0xFFFF); + v -= floorf(v); + + i = (i << 16) | (i >> 16); + i = ((i & 0x00ff00ff) << 8) | ((i & 0xff00ff00) >> 8); + i = ((i & 0x0f0f0f0f) << 4) | ((i & 0xf0f0f0f0) >> 4); + i = ((i & 0x33333333) << 2) | ((i & 0xcccccccc) >> 2); + i = ((i & 0x55555555) << 1) | ((i & 0xaaaaaaaa) >> 1); + u = (float)i * (1.0f/(float)0xFFFFFFFF) + (rotation >> 16) * (1.0f/(float)0xFFFF); + u -= floorf(u); +} + +void CDF_2D::sample_02_jittered(int i, int vx, int vy, int &x, int &y) { + float u, v; + eval_02(i, hash_int_2d(vx, vy), u, v); + sample((vx + u) / width, (vy + v) / height, x, y); +} + +#define stop_error 0.001f + +float tile_error(RenderTile &rtile, float error_power) { + if (!rtile.buffers->copy_from_device()) return -1; + + int samples_pass = 0, variance_pass = 0; + bool found_samples_pass = false, found_variance_pass = false; + foreach(Pass& pass, rtile.buffers->params.passes) { + if (pass.type != PASS_SAMPLES) + samples_pass += pass.components; + else { + found_samples_pass = true; + break; + } + } + assert(found_samples_pass); + foreach(Pass& pass, rtile.buffers->params.passes) { + if (pass.type != PASS_VARIANCE) + variance_pass += pass.components; + else { + found_variance_pass = true; + break; + } + } + assert(found_variance_pass); + + int pass_stride = rtile.buffers->params.get_passes_size(); + float *passes = (float*)rtile.buffers->buffer.data_pointer + (rtile.offset + rtile.x + rtile.y * rtile.stride) * pass_stride; + float *importance = rtile.buffers->importance_map + rtile.offset + rtile.x + rtile.y * rtile.stride; + float error = 0.0f; + + for (int y = 0; y < rtile.h; y++) { + for (int x = 0; x < rtile.w; x++, passes += pass_stride, importance++) { + float3 *combinedPass, *variancePass; + float *samplesPass; + combinedPass = (float3*) passes; + variancePass = (float3*) (passes + variance_pass); + samplesPass = passes + samples_pass; + + if (*samplesPass < 2) + return -1; //No sample 
at this pixel yet + + float factor = 1.0f / (*samplesPass); + float intensityFactor = linear_gray_to_inv_tvi(linear_rgb_to_gray(*combinedPass * factor)); + + float3 variance = (*variancePass) / (*samplesPass - 1); + /* + * sqrt(variance) * IF gives PWSD (see build_importance) + * O(1/sqrt(N)) convergence => PWSD / sqrt(N) gives an approximation of the remaining error + * sqrt(variance) * IF / sqrt(N) is equal to sqrt(variance / N) * IF + */ + + float svariance = max(linear_rgb_to_gray(variance), 0.0f); + + float perror = sqrtf(svariance * factor) * intensityFactor; + error += powf(perror, error_power); + } + passes += (rtile.stride - rtile.w) * pass_stride; + importance += rtile.stride - rtile.w; + } + error /= rtile.w * rtile.h; + return powf(error, 1.0f / error_power); +} + +bool tile_converged(RenderTile &rtile, DeviceTask *task) +{ + if (task->error_progressive) return false; + if (task->stopping_threshold <= 0.0f) return false; + if (rtile.sample < task->map_interval) return false; + if (rtile.sample % task->map_interval) return false; + + float error = tile_error(rtile, task->adaptive_error_power); + if (error < 0) return false; + return (error < task->stopping_threshold); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_importance.h b/intern/cycles/util/util_importance.h new file mode 100644 index 0000000..7db591c --- /dev/null +++ b/intern/cycles/util/util_importance.h @@ -0,0 +1,50 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License + */ + +#ifndef __UTIL_IMPORTANCE_H__ +#define __UTIL_IMPORTANCE_H__ + +#include +#include "util_color.h" +#include "buffers.h" +#include "device_task.h" + +CCL_NAMESPACE_BEGIN + +void build_importance(RenderTile &tile); +float tile_error(RenderTile &rtile, float error_power); +bool tile_converged(RenderTile &rtile, DeviceTask *task); + +class CDF_2D { +public: + CDF_2D(float *data_, int width_, int height_, int stride); + + ~CDF_2D(); + + void sample(float u, float v, int &x, int &y); + + void eval_02(uint i, uint rotation, float &u, float &v); + + void sample_02_jittered(int i, int vx, int vy, int &x, int &y); +private: + float *data, *marginal; + int width, height; +}; + +CCL_NAMESPACE_END + +#endif /* __UTIL_IMPORTANCE_H__ */ + diff --git a/intern/cycles/util/util_list.h b/intern/cycles/util/util_list.h index 2aa0b73..acaf347 100644 --- a/intern/cycles/util/util_list.h +++ b/intern/cycles/util/util_list.h @@ -18,11 +18,22 @@ #define __UTIL_LIST_H__ #include +#include CCL_NAMESPACE_BEGIN using std::list; +template > +class PriorityQueue : public std::priority_queue, compare>{ +public: + typedef typename std::vector::iterator iterator; + void clear() { this->c.clear(); } + iterator begin() { return this->c.begin(); } + iterator end() { return this->c.end(); } + void push_back(const T &v) { this->push(v); } +}; + CCL_NAMESPACE_END #endif /* __UTIL_LIST_H__ */ diff --git a/intern/cycles/util/util_metropolis.h b/intern/cycles/util/util_metropolis.h new file mode 100644 index 0000000..5987e01 --- /dev/null +++ b/intern/cycles/util/util_metropolis.h @@ -0,0 +1,47 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ + +#ifndef __UTIL_METROPOLIS_H__ +#define __UTIL_METROPOLIS_H__ + +#include +#include "kernel_types.h" + +CCL_NAMESPACE_BEGIN + +static inline int metro_get_size(int numChains, int sampleSize) { + int size = sizeof(MetropolisChain); //Metropolis Chains + size += 2*sampleSize*sizeof(uint); //SampleStamps arrays + size += 3*sampleSize*sizeof(float); //Samples arrays and rngRotation + size = ((size + 15)/16) * 16; //Round to 16 bytes for alignment + return numChains*size; +} + +static inline void metro_build_rng_table(float* table) { + table[0] = 0.0f; + table[1] = 1.0f / METRO_RNG_N; + uint rngState = 1; + for (int i = 2; i < METRO_RNG_N; i++) { + rngState = (rngState * METRO_RNG_A) % METRO_RNG_N; //LCG + table[i] = ((float) rngState) / METRO_RNG_N; + } + std::random_shuffle(table, table + METRO_RNG_N); +} + +CCL_NAMESPACE_END + +#endif /* __UTIL_METROPOLIS_H__ */ + -- 1.8.3.2