diff --git a/build_files/cmake/macros.cmake b/build_files/cmake/macros.cmake index 6998595..4d26069 100644 --- a/build_files/cmake/macros.cmake +++ b/build_files/cmake/macros.cmake @@ -593,6 +593,7 @@ function(SETUP_BLENDER_SORTED_LIBS) bf_editor_io bf_render + bf_processing bf_python bf_python_ext bf_python_mathutils diff --git a/release/scripts/startup/bl_ui/space_userpref.py b/release/scripts/startup/bl_ui/space_userpref.py index 5ba1f0b..c1e9cd8 100644 --- a/release/scripts/startup/bl_ui/space_userpref.py +++ b/release/scripts/startup/bl_ui/space_userpref.py @@ -504,6 +504,9 @@ class USERPREF_PT_system(Panel): col.label(text="OpenSubdiv compute:") col.row().prop(system, "opensubdiv_compute_type", text="") + col.label(text="Processing device:") + col.row().prop(system, "processing_type", text="") + # 2. Column column = split.column() colsplit = column.split(percentage=0.85) diff --git a/source/blender/CMakeLists.txt b/source/blender/CMakeLists.txt index 818d4cd..2702b6a 100644 --- a/source/blender/CMakeLists.txt +++ b/source/blender/CMakeLists.txt @@ -107,6 +107,7 @@ add_subdirectory(blenloader) add_subdirectory(depsgraph) add_subdirectory(ikplugin) add_subdirectory(physics) +add_subdirectory(processing) add_subdirectory(gpu) add_subdirectory(imbuf) add_subdirectory(nodes) diff --git a/source/blender/blenkernel/CMakeLists.txt b/source/blender/blenkernel/CMakeLists.txt index 96c653d..f35e5fa 100644 --- a/source/blender/blenkernel/CMakeLists.txt +++ b/source/blender/blenkernel/CMakeLists.txt @@ -39,6 +39,7 @@ set(INC ../modifiers ../nodes ../physics + ../processing ../render/extern/include ../../../intern/ghost ../../../intern/guardedalloc diff --git a/source/blender/blenkernel/intern/seqeffects.c b/source/blender/blenkernel/intern/seqeffects.c index a2c4505..5744943 100644 --- a/source/blender/blenkernel/intern/seqeffects.c +++ b/source/blender/blenkernel/intern/seqeffects.c @@ -53,6 +53,8 @@ #include "IMB_imbuf.h" #include "IMB_colormanagement.h" +#include "BPL_processing.h" + #include "RNA_access.h" #include "RE_pipeline.h" @@ -3027,6 +3029,8 @@ static void *render_effect_execute_do_y_thread(void *thread_data_v) return NULL; } +double PIL_check_seconds_timer(); + static ImBuf *do_gaussian_blur_effect(const SeqRenderData *context, Sequence *seq, float UNUSED(cfra), @@ -3038,32 +3042,64 @@ static ImBuf *do_gaussian_blur_effect(const SeqRenderData *context, { ImBuf *out = prepare_effect_imbufs(context, ibuf1, NULL, NULL); - RenderGaussianBlurEffectInitData init_data; + double start = PIL_check_seconds_timer(); + + if (U.processing_type == USER_BPL_COMPUTE_LEGACY) { + RenderGaussianBlurEffectInitData init_data; + + init_data.context = context; + init_data.seq = seq; + init_data.ibuf = ibuf1; + init_data.out = out; - init_data.context = context; - init_data.seq = seq; - init_data.ibuf = ibuf1; - init_data.out = out; + IMB_processor_apply_threaded(out->y, + sizeof(RenderGaussianBlurEffectThread), + &init_data, + render_effect_execute_init_handle, + render_effect_execute_do_x_thread); + + ibuf1 = out; + init_data.ibuf = ibuf1; + out = prepare_effect_imbufs(context, ibuf1, NULL, NULL); + init_data.out = out; + + IMB_processor_apply_threaded(out->y, + sizeof(RenderGaussianBlurEffectThread), + &init_data, + render_effect_execute_init_handle, + render_effect_execute_do_y_thread); + + IMB_freeImBuf(ibuf1); - IMB_processor_apply_threaded(out->y, - sizeof(RenderGaussianBlurEffectThread), - &init_data, - render_effect_execute_init_handle, - render_effect_execute_do_x_thread); + double end = PIL_check_seconds_timer(); + printf("Legacy time: %f\n", end - start); + } + else { + ProcessOperation *op = NULL; + void *src = NULL; + void *dst = NULL; - ibuf1 = out; - init_data.ibuf = ibuf1; - out = prepare_effect_imbufs(context, ibuf1, NULL, NULL); - init_data.out = out; + if (ibuf1->rect_float) { + src = ibuf1->rect_float; + dst = out->rect_float; + op = BPL_process_image_float(src, ibuf1->x, ibuf1->y, ibuf1->channels); + } + else { + src = ibuf1->rect; + dst = out->rect; + op = BPL_process_image_8bit(src, ibuf1->x, ibuf1->y, ibuf1->channels); + } - IMB_processor_apply_threaded(out->y, - sizeof(RenderGaussianBlurEffectThread), - &init_data, - render_effect_execute_init_handle, - render_effect_execute_do_y_thread); + GaussianBlurVars *data = seq->effectdata; - IMB_freeImBuf(ibuf1); + BPL_op_image_blur(op, data->size_x, data->size_y); + BPL_op_image_copy(op, dst); + BPL_end_operation(op); + double end = PIL_check_seconds_timer(); + printf("BPL time: %f\n", end - start); + } + return out; } diff --git a/source/blender/makesdna/DNA_userdef_types.h b/source/blender/makesdna/DNA_userdef_types.h index cddb1e0..81873f8 100644 --- a/source/blender/makesdna/DNA_userdef_types.h +++ b/source/blender/makesdna/DNA_userdef_types.h @@ -578,8 +578,9 @@ typedef struct UserDef { struct WalkNavigation walk_navigation; + short processing_type; short opensubdiv_compute_type; - char pad5[6]; + char pad5[4]; } UserDef; extern UserDef U; /* from blenkernel blender.c */ @@ -907,6 +908,12 @@ typedef enum eUserpref_VirtualPixel { VIRTUAL_PIXEL_DOUBLE = 1, } eUserpref_VirtualPixel; +typedef enum eProcessing_Type { + USER_BPL_COMPUTE_LEGACY = 0, + USER_BPL_COMPUTE_CPU = 1, + USER_BPL_COMPUTE_OPENCL = 2, +} eProcessing_Type; + typedef enum eOpensubdiv_Computee_Type { USER_OPENSUBDIV_COMPUTE_NONE = 0, USER_OPENSUBDIV_COMPUTE_CPU = 1, diff --git a/source/blender/makesrna/intern/rna_userdef.c b/source/blender/makesrna/intern/rna_userdef.c index f031360..8707895 100644 --- a/source/blender/makesrna/intern/rna_userdef.c +++ b/source/blender/makesrna/intern/rna_userdef.c @@ -65,6 +65,13 @@ static const EnumPropertyItem opensubdiv_compute_type_items[] = { }; #endif +static const EnumPropertyItem bpl_processing_type_items[] = { + { USER_BPL_COMPUTE_LEGACY, "LEGACY", 0, "Legacy", "" }, + { USER_BPL_COMPUTE_CPU, "CPU", 0, "CPU", "" }, + { USER_BPL_COMPUTE_OPENCL, "OPENCL", 0, "OpenCL", "" }, + { 0, NULL, 0, NULL, NULL } +}; + static const EnumPropertyItem audio_device_items[] = { {0, "NONE", 0, "None", "Null device - there will be no audio output"}, #ifdef WITH_SDL @@ -4196,6 +4203,12 @@ static void rna_def_userdef_system(BlenderRNA *brna) "Draw tool/property regions over the main region, when using Triple Buffer"); RNA_def_property_update(prop, 0, "rna_userdef_dpi_update"); + prop = RNA_def_property(srna, "processing_type", PROP_ENUM, PROP_NONE); + RNA_def_property_flag(prop, PROP_ENUM_NO_CONTEXT); + RNA_def_property_enum_sdna(prop, NULL, "processing_type"); + RNA_def_property_enum_items(prop, bpl_processing_type_items); + RNA_def_property_ui_text(prop, "BPL Processing Type", ""); + #ifdef WITH_OPENSUBDIV prop = RNA_def_property(srna, "opensubdiv_compute_type", PROP_ENUM, PROP_NONE); RNA_def_property_flag(prop, PROP_ENUM_NO_CONTEXT); diff --git a/source/blender/processing/BPL_defines.h b/source/blender/processing/BPL_defines.h new file mode 100644 index 0000000..2aa9c64 --- /dev/null +++ b/source/blender/processing/BPL_defines.h @@ -0,0 +1,21 @@ + +#ifndef __BPL_DEFINES_H__ +#define __BPL_DEFINES_H__ + +typedef enum BPLFormatType { + BPL_FORMAT_UINT8 = 0, + BPL_FORMAT_FLOAT = 1, +} BPLFormatType; + +inline int BPL_get_format_byte_size(BPLFormatType format) +{ + if (format == BPL_FORMAT_FLOAT) { + return sizeof(float); + } + else if (format == BPL_FORMAT_UINT8) { + return sizeof(unsigned char); + } + return 0; +} + +#endif diff --git a/source/blender/processing/BPL_processing.h b/source/blender/processing/BPL_processing.h new file mode 100644 index 0000000..0931c34 --- /dev/null +++ b/source/blender/processing/BPL_processing.h @@ -0,0 +1,65 @@ +/* + * Copyright 2017, Blender Foundation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + */ + +#ifndef __BPL_PROCESSING_H__ +#define __BPL_PROCESSING_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "BPL_defines.h" + +typedef struct ProcessOperation ProcessOperation; + +void BPL_init(void); +void BPL_exit(void); + +/* + Begin an image operation using the given image as a source. Image + data is copied to an internal buffer. +*/ +ProcessOperation *BPL_process_image_8bit(const unsigned char *data, int width, int height, int channels); +ProcessOperation *BPL_process_image_float(const float *data, int width, int height, int channels); + +/* + End image operation. Do not use the operation after calling this. +*/ +void BPL_end_operation(ProcessOperation *op); + +/* + Perform a blur on the operation. +*/ +void BPL_op_image_blur(ProcessOperation *op, int w, int h); + +/* + Copy current image data to destination buffer. +*/ +void BPL_op_image_copy(ProcessOperation *op, void *dst); + +/* + Generate a gauss table for blur operations. +*/ +float *BPL_make_gausstab(float rad, int size); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/source/blender/processing/CMakeLists.txt b/source/blender/processing/CMakeLists.txt new file mode 100644 index 0000000..42288fe --- /dev/null +++ b/source/blender/processing/CMakeLists.txt @@ -0,0 +1,71 @@ +# ***** BEGIN GPL LICENSE BLOCK ***** +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software Foundation, +# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +# +# The Original Code is Copyright (C) 2014, Blender Foundation +# All rights reserved. +# +# The Original Code is: all of this file. +# +# ***** END GPL LICENSE BLOCK ***** + +set(INC + . + intern + ../blenkernel + ../blenlib + ../imbuf + ../makesdna + ../makesrna + ../windowmanager + ../render/extern/include + ../render/intern/include + ../../../extern/clew/include + ../../../intern/guardedalloc + ../../../intern/atomic +) + +set(INC_SYS + +) + +set(SRC + BPL_processing.h + BPL_defines.h + + intern/BPL_processing.cpp + intern/BPL_opencl.cpp + intern/BPL_cpu.h + intern/BPL_cpu.cpp + intern/BPL_cpu_compat.h + + intern/BPL_DeviceBase.h + intern/BPL_DeviceBase.cpp + intern/BPL_DeviceCPU.h + intern/BPL_DeviceCPU.cpp + intern/BPL_DeviceOpenCL.h + intern/BPL_DeviceOpenCL.cpp +) + +list(APPEND INC + ${CMAKE_CURRENT_BINARY_DIR}/intern/kernels +) + +data_to_c(${CMAKE_CURRENT_SOURCE_DIR}/intern/kernels/BPL_opencl_kernels.cl + ${CMAKE_CURRENT_BINARY_DIR}/intern/kernels/BPL_opencl_kernels.cl.h SRC) + +add_definitions(-DCL_USE_DEPRECATED_OPENCL_1_1_APIS) + +blender_add_lib(bf_processing "${SRC}" "${INC}" "${INC_SYS}") diff --git a/source/blender/processing/intern/BPL_DeviceBase.cpp b/source/blender/processing/intern/BPL_DeviceBase.cpp new file mode 100644 index 0000000..78ff967 --- /dev/null +++ b/source/blender/processing/intern/BPL_DeviceBase.cpp @@ -0,0 +1,12 @@ + +#include "BPL_DeviceBase.h" + +BPLDeviceBase::~BPLDeviceBase() +{ + +} + +BPLDeviceBase::BPLDeviceBase() +{ + +} diff --git a/source/blender/processing/intern/BPL_DeviceBase.h b/source/blender/processing/intern/BPL_DeviceBase.h new file mode 100644 index 0000000..a2f3398 --- /dev/null +++ b/source/blender/processing/intern/BPL_DeviceBase.h @@ -0,0 +1,23 @@ + +#ifndef __BPL_DEVICEBASE_H__ +#define __BPL_DEVICEBASE_H__ + +#include "BPL_defines.h" + +class BPLDeviceBase { +public: + virtual ~BPLDeviceBase(); + BPLDeviceBase(); + + virtual bool is_gpu() { return false; } + + virtual bool start_operation(const void *data, BPLFormatType format, int width, int height, int channels) = 0; + + virtual void blur(int horizontal, int vertical) = 0; + + virtual void copy_data(void* dst) = 0; + + virtual void wait() { } //TODO run multiple operations in parallel +}; + +#endif diff --git a/source/blender/processing/intern/BPL_DeviceCPU.cpp b/source/blender/processing/intern/BPL_DeviceCPU.cpp new file mode 100644 index 0000000..0546913 --- /dev/null +++ b/source/blender/processing/intern/BPL_DeviceCPU.cpp @@ -0,0 +1,96 @@ + +#include +#include + +extern "C" { +#include "MEM_guardedalloc.h" +} + +#include "BPL_DeviceCPU.h" +#include "BPL_cpu.h" + +BPLDeviceCPU::~BPLDeviceCPU() +{ + if (m_buffer1) { + MEM_freeN(m_buffer1); + } + + if (m_buffer2) { + MEM_freeN(m_buffer2); + } +} + +BPLDeviceCPU::BPLDeviceCPU() +{ + m_image = NULL; + m_format = BPL_FORMAT_UINT8; + m_width = 0; + m_height = 0; + m_channels = 0; + + m_buffer1 = NULL; + m_buffer2 = NULL; + m_read_ptr = NULL; + m_write_ptr = NULL; +} + +void BPLDeviceCPU::swap_buffers() +{ + void *tmp = m_read_ptr; + m_read_ptr = m_write_ptr; + m_write_ptr = tmp; +} + +int BPLDeviceCPU::image_size() +{ + return (m_width * m_height) * BPL_get_format_byte_size(m_format) * m_channels; +} + +bool BPLDeviceCPU::start_operation(const void *data, BPLFormatType format, int width, int height, int channels) +{ + m_image = data; + m_format = format; + m_width = width; + m_height = height; + m_channels = channels; + + const int size = image_size(); + + //Always align for SSE2 + m_buffer1 = MEM_mallocN_aligned(size, 16, "bpl buffer 1"); + m_buffer2 = MEM_mallocN_aligned(size, 16, "bpl buffer 2"); + + bool ok = false; + if (m_buffer1 && m_buffer2) { + ok = true; + + m_read_ptr = m_buffer1; + m_write_ptr = m_buffer2; + + memcpy(m_read_ptr, m_image, size); + } + return ok; +} + +void BPLDeviceCPU::blur(int horizontal, int vertical) +{ + BPLImageCPU source; + source.data = m_read_ptr; + source.width = m_width; + source.height = m_height; + source.width_clamp = m_width - 1; + source.height_clamp = m_height - 1; + source.channels = m_channels; + source.format = m_format; + + BPLImageCPU target = source; + target.data = m_write_ptr; + + // Blur ping-pongs buffers + bpl_cpu_blur(&source, &target, horizontal, vertical); +} + +void BPLDeviceCPU::copy_data(void* dst) +{ + memcpy(dst, m_read_ptr, image_size()); +} diff --git a/source/blender/processing/intern/BPL_DeviceCPU.h b/source/blender/processing/intern/BPL_DeviceCPU.h new file mode 100644 index 0000000..c3e527e --- /dev/null +++ b/source/blender/processing/intern/BPL_DeviceCPU.h @@ -0,0 +1,36 @@ + +#ifndef __BPL_DEVICECPU_H__ +#define __BPL_DEVICECPU_H__ + +#include "BPL_DeviceBase.h" + +class BPLDeviceCPU : public BPLDeviceBase { + + const void *m_image; + BPLFormatType m_format; + int m_width; + int m_height; + int m_channels; + + void *m_buffer1; + void *m_buffer2; + void *m_read_ptr; + void *m_write_ptr; + +private: + + void swap_buffers(); + int image_size(); + +public: + ~BPLDeviceCPU(); + BPLDeviceCPU(); + + bool start_operation(const void *data, BPLFormatType format, int width, int height, int channels) override; + + void blur(int horizontal, int vertical) override; + + void copy_data(void* dst) override; +}; + +#endif diff --git a/source/blender/processing/intern/BPL_DeviceOpenCL.cpp b/source/blender/processing/intern/BPL_DeviceOpenCL.cpp new file mode 100644 index 0000000..821d713 --- /dev/null +++ b/source/blender/processing/intern/BPL_DeviceOpenCL.cpp @@ -0,0 +1,253 @@ + +#include + +#include "MEM_guardedalloc.h" +#include "BLI_math.h" + +#include "BPL_DeviceOpenCL.h" + +/* + BPLOpenCLDevice needs to share some resources between it's instances, + so use a mutex to control access to them. +*/ +class BPLShareLock { +private: + ThreadMutex *m_mutex; +public: + ~BPLShareLock() { + BLI_mutex_unlock(m_mutex); + } + + BPLShareLock(ThreadMutex *mutex) : m_mutex(mutex) { + BLI_mutex_lock(m_mutex); + } +}; + +void check_status(cl_int status) +{ + if (status != CL_SUCCESS) { + const char* msg = clewErrorString(status); + printf("OpenCL error: %s\n", msg); + } +} + +BPLOpenCLDevice::~BPLOpenCLDevice() +{ + if (m_buffer1 != 0) { + clReleaseMemObject(m_buffer1); + m_buffer1 = 0; + } + + if (m_buffer2 != 0) { + clReleaseMemObject(m_buffer2); + m_buffer2 = 0; + } + + if (m_queue != 0) { + clReleaseCommandQueue(m_queue); + m_queue = 0; + } +} + +BPLOpenCLDevice::BPLOpenCLDevice(cl_context context, cl_device_id device, cl_program program, cl_int vendor_id) +{ + //These are technically shared, but we don't need to manage them + m_device = device; + m_context = context; + m_program = program; + m_vendor_id = vendor_id; + + m_image = NULL; + m_format = BPL_FORMAT_UINT8; + m_width = 0; + m_height = 0; + m_channels = 0; + + m_queue = NULL; + m_buffer1 = NULL; + m_buffer2 = NULL; + m_read_buffer = NULL; + m_write_buffer = NULL; + + //Manage these shared resources between the devices + m_shared_kernels = NULL; + m_shared_mutex = NULL; +} + +cl_kernel BPLOpenCLDevice::get_shared_kernel(const char* name) +{ + auto iter = m_shared_kernels->find(name); + if (iter != m_shared_kernels->end()) { + return iter->second; + } + + printf("Loading kernel: %s\n", name); + + cl_int status = 0; + cl_kernel kernel = clCreateKernel(m_program, name, &status); + if (kernel) { + (*m_shared_kernels)[name] = kernel; + } + return kernel; +} + +BPLOpenCLDevice *BPLOpenCLDevice::clone_shared() +{ + BPLOpenCLDevice *clone = new BPLOpenCLDevice(m_context, m_device, m_program, m_vendor_id); + clone->m_shared_kernels = m_shared_kernels; + clone->m_shared_mutex = m_shared_mutex; + return clone; +} + +void BPLOpenCLDevice::create_shared_resources() +{ + BLI_assert(m_shared_kernels == NULL); + + m_shared_kernels = new std::map(); + m_shared_mutex = BLI_mutex_alloc(); +} + +void BPLOpenCLDevice::destroy_shared_resources() +{ + BLI_assert(m_shared_kernels != NULL); + BLI_assert(m_shared_mutex != NULL); + + int count = m_shared_kernels->size(); + printf("Destroying %i kernels...\n", count); + + for (auto iter = m_shared_kernels->begin(); iter != m_shared_kernels->end(); ++iter) { + clReleaseKernel(iter->second); + } + + delete m_shared_kernels; + m_shared_kernels = NULL; + + BLI_mutex_free(m_shared_mutex); + m_shared_mutex = NULL; +} + +void BPLOpenCLDevice::swap_buffers() +{ + cl_mem temp = m_read_buffer; + m_read_buffer = m_write_buffer; + m_write_buffer = temp; +} + +bool BPLOpenCLDevice::start_operation(const void *data, BPLFormatType format, int width, int height, int channels) +{ + m_image = data; + m_format = format; + m_width = width; + m_height = height; + m_channels = channels; + + bool ok = false; + + BLI_assert(m_queue == NULL); + + cl_int status = 0; + m_queue = clCreateCommandQueue(m_context, m_device, 0, &status); + check_status(status); + + if (m_queue) { + const int orders[4] = { CL_R, CL_RG, CL_RGB, CL_RGBA }; + + cl_image_format cl_format = { 0 }; + cl_format.image_channel_order = orders[channels - 1]; + if (m_format == BPL_FORMAT_FLOAT) { + cl_format.image_channel_data_type = CL_FLOAT; + } + else { + cl_format.image_channel_data_type = CL_UNORM_INT8; + } + + m_buffer1 = clCreateImage2D(m_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, + &cl_format, width, height, 0, const_cast(data), &status); + check_status(status); + + if (m_buffer1) { + m_buffer2 = clCreateImage2D(m_context, CL_MEM_READ_WRITE, &cl_format, width, height, 0, NULL, &status); + check_status(status); + + if (m_buffer2) { + ok = true; + + m_read_buffer = m_buffer1; + m_write_buffer = m_buffer2; + } + } + } + + return ok; +} + +int bpl_gauss_filter_size(int size) +{ + float rad = max_ff(size, 0.0f); + int filtersize = min_ii(ceil(rad), 30000); + return filtersize; +} + +cl_mem bpl_gauss_tab(cl_context context, int size) +{ + int filter = bpl_gauss_filter_size(size); + float *gausstab = BPL_make_gausstab(max_ff(size, 0.0f), filter); + + cl_int status = 0; + cl_mem gauss_buffer = clCreateBuffer(context, + CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(float) * (filter * 2 + 1), + gausstab, + &status); + check_status(status); + + MEM_freeN(gausstab); + + return gauss_buffer; +} + +void BPLOpenCLDevice::blur_pass(const char *name, int size, cl_mem src, cl_mem dst, cl_mem gausstab) +{ + cl_kernel kernel = get_shared_kernel(name); + + cl_int status = 0; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &src); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &dst); + status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &gausstab); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), &size); + + const size_t global_work_size[] = { (size_t)m_width, (size_t)m_height }; + status = clEnqueueNDRangeKernel(m_queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); + check_status(status); +} + +void BPLOpenCLDevice::blur(int horizontal, int vertical) +{ + BPLShareLock lock(m_shared_mutex); + + //TODO could skip passes if size is 0... + int filter_x = bpl_gauss_filter_size(max_ii(horizontal, 1)); + int filter_y = bpl_gauss_filter_size(max_ii(vertical, 1)); + + cl_mem gausstab_x = bpl_gauss_tab(m_context, filter_x); + cl_mem gausstab_y = bpl_gauss_tab(m_context, filter_y); + + blur_pass("gaussian_blur_horizontal", filter_x, m_read_buffer, m_write_buffer, gausstab_x); + + swap_buffers(); + + blur_pass("gaussian_blur_vertical", filter_y, m_read_buffer, m_write_buffer, gausstab_y); + + swap_buffers(); + + clReleaseMemObject(gausstab_x); + clReleaseMemObject(gausstab_y); +} + +void BPLOpenCLDevice::copy_data(void* dst) +{ + const size_t origin[3] = { 0, 0, 0 }; + const size_t region[3] = { (size_t)m_width, (size_t)m_height, 1 }; + cl_int status = clEnqueueReadImage(m_queue, m_read_buffer, CL_TRUE, origin, region, 0, 0, dst, 0, NULL, NULL); + check_status(status); +} diff --git a/source/blender/processing/intern/BPL_DeviceOpenCL.h b/source/blender/processing/intern/BPL_DeviceOpenCL.h new file mode 100644 index 0000000..b8e0242 --- /dev/null +++ b/source/blender/processing/intern/BPL_DeviceOpenCL.h @@ -0,0 +1,62 @@ + +#ifndef __BPL_OPENCLDEVICE_H__ +#define __BPL_OPENCLDEVICE_H__ + +#include "BPL_processing.h" +#include "BPL_DeviceBase.h" +#include "BLI_threads.h" + +#include +#include + +#include "clew.h" + +class BPLOpenCLDevice : public BPLDeviceBase { +private: + + cl_context m_context; + cl_device_id m_device; + cl_program m_program; + cl_int m_vendor_id; + + const void *m_image; + BPLFormatType m_format; + int m_width; + int m_height; + int m_channels; + + cl_command_queue m_queue; + cl_mem m_buffer1; + cl_mem m_buffer2; + cl_mem m_read_buffer; + cl_mem m_write_buffer; + + std::map *m_shared_kernels; + ThreadMutex *m_shared_mutex; + +private: + + cl_kernel get_shared_kernel(const char* name); + + void swap_buffers(); + + void blur_pass(const char *name, int size, cl_mem src, cl_mem dst, cl_mem gausstab); + +public: + ~BPLOpenCLDevice(); + BPLOpenCLDevice(cl_context context, cl_device_id device, cl_program program, cl_int vendor_id); + + void create_shared_resources(); + void destroy_shared_resources(); + BPLOpenCLDevice *clone_shared(); + + bool is_gpu() override { return true; } + + bool start_operation(const void *data, BPLFormatType format, int width, int height, int channels) override; + + void blur(int horizontal, int vertical) override; + + void copy_data(void* dst) override; +}; + +#endif diff --git a/source/blender/processing/intern/BPL_cpu.cpp b/source/blender/processing/intern/BPL_cpu.cpp new file mode 100644 index 0000000..a3acc4c --- /dev/null +++ b/source/blender/processing/intern/BPL_cpu.cpp @@ -0,0 +1,80 @@ + +#include "BPL_cpu.h" +#include "BPL_cpu_compat.h" + +namespace { + +#include "kernels\BPL_opencl_kernels.cl" + +} + +//Some includes after the kernels to avoid polluting the namespace with too many variables + +extern "C" { + +#include "MEM_guardedalloc.h" +#include "IMB_imbuf.h" + +float *BPL_make_gausstab(float, int); + +} + +int bpl_gauss_filter_size(int); + +struct BlurThreadData { + BPLImageCPU *source; + BPLImageCPU *target; + float *gausstab_x; + float *gausstab_y; + int blur_x; + int blur_y; +}; + +BPL_THREAD_CALLBACK +void blur_callback_x(void *custom_data, int start_scanline, int num_scanlines) +{ + BlurThreadData* data = (BlurThreadData*)custom_data; + BPL_EXECUTE_IMAGE_KERNEL(data->source, start_scanline, num_scanlines, gaussian_blur_horizontal, + data->source, data->target, data->gausstab_x, data->blur_x); +} + +BPL_THREAD_CALLBACK +void blur_callback_y(void *custom_data, int start_scanline, int num_scanlines) +{ + BlurThreadData* data = (BlurThreadData*)custom_data; + BPL_EXECUTE_IMAGE_KERNEL(data->source, start_scanline, num_scanlines, gaussian_blur_vertical, + data->target, data->source, data->gausstab_y, data->blur_y); +} + +void bpl_cpu_blur(BPLImageCPU *source, BPLImageCPU *target, int horizontal, int vertical) +{ + if (horizontal < 1) { + horizontal = 1; + } + if (vertical < 1) { + vertical = 1; + } + + int size_x = bpl_gauss_filter_size(horizontal); + float *gausstab_x = BPL_make_gausstab(horizontal, size_x); + + int size_y = bpl_gauss_filter_size(vertical); + float *gausstab_y = BPL_make_gausstab(vertical, size_y); + + BlurThreadData custom; + custom.source = source; + custom.target = target; + custom.gausstab_x = gausstab_x; + custom.gausstab_y = gausstab_y; + custom.blur_x = size_x; + custom.blur_y = size_y; + + //From source to target image + IMB_processor_apply_threaded_scanlines(source->height, blur_callback_x, &custom); + + //From target to source image + IMB_processor_apply_threaded_scanlines(source->height, blur_callback_y, &custom); + + MEM_freeN(gausstab_x); + MEM_freeN(gausstab_y); +} diff --git a/source/blender/processing/intern/BPL_cpu.h b/source/blender/processing/intern/BPL_cpu.h new file mode 100644 index 0000000..2251a6f --- /dev/null +++ b/source/blender/processing/intern/BPL_cpu.h @@ -0,0 +1,19 @@ + +#ifndef __BPL_OPENCL_CPU_H__ +#define __BPL_OPENCL_CPU_H__ + +#include "BPL_defines.h" + +struct BPLImageCPU { + void *data; + int width; + int width_clamp; //Micro optimization + int height; + int height_clamp; //Micro optimization + int channels; + BPLFormatType format; +}; + +void bpl_cpu_blur(BPLImageCPU *source, BPLImageCPU *target, int horizontal, int vertical); + +#endif diff --git a/source/blender/processing/intern/BPL_cpu_compat.h b/source/blender/processing/intern/BPL_cpu_compat.h new file mode 100644 index 0000000..7b704f0 --- /dev/null +++ b/source/blender/processing/intern/BPL_cpu_compat.h @@ -0,0 +1,367 @@ + +#ifndef __BPL_OPENCL_CPUCOMPAT_H__ +#define __BPL_OPENCL_CPUCOMPAT_H__ + +#include +#include +#include "clew.h" + +#include "BLI_compiler_compat.h" +#include "BLI_compiler_attrs.h" + +//OpenCL compatiblity macros and classes + +#define __constant +#define __kernel template BLI_INLINE +#define __read_only const +#define __write_only +#define __global + +#define sampler_t int +#define image2d_t BPLImageCPU* + +#define CLK_NORMALIZED_COORDS_FALSE 0 +#define CLK_ADDRESS_CLAMP_TO_EDGE 0 +#define CLK_FILTER_NEAREST 0 + +#define BPL_STATIC_FAIL(message) char invalid_array[0] + +#ifdef __SSE2__ +#define BPL_SSE2 +#endif + +ATTR_ALIGN(16) +struct float4 { + +#ifdef BPL_SSE2 + union { + __m128 m128; + struct { float x, y, z, w; }; + }; +#else + float x, y, z, w; +#endif + + float4 operator+(const float4& v) const { +#ifdef BPL_SSE2 + return float4(_mm_add_ps(m128, v.m128)); +#else + return float4(x + v.x, y + v.y, z + v.z, w + v.w); +#endif + } + + float4& operator+=(const float4& v) { + *this = *this + v; + return *this; + } + + float4 operator-(const float4& v) const { +#ifdef BPL_SSE2 + return float4(_mm_sub_ps(m128, v.m128)); +#else + return float4(x - v.x, y - v.y, z - v.z, w - v.w); +#endif + } + + float4 operator*(float f) const { +#ifdef BPL_SSE2 + return float4(_mm_mul_ps(m128, _mm_set1_ps(f))); +#else + return float4(x * f, y * f, z * f, w * f); +#endif + } + + float4& operator*=(float f) { + *this = *this * f; + return *this; + } + + float4 operator/(float f) const { +#ifdef BPL_SSE2 + return float4(_mm_div_ps(m128, _mm_set1_ps(f))); +#else + return float4(x / f, y / f, z / f, w / f); +#endif + } + + //Uninitialized by default - beware + float4() { } + +#ifdef BPL_SSE2 + float4(float x, float y, float z, float w) : m128(_mm_set_ps(x, y, z, w)) { } +#else + float4(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) { } +#endif + +#ifdef BPL_SSE2 + float4(__m128 v) : m128(v) { } +#endif +}; + +struct int2 { + int x; + int y; + + int2 operator*(float f) const { + return int2(x * f, y * f); + } + + int2 operator/(float f) const { + return int2(x / f, y / f); + } + + int2() { } + int2(int x, int y) : x(x), y(y) { } +}; + +/* +Clamping image pixel lookup coordinates adds a branch in the most +inner loop and slows things down quite a bit. We can increase +performance if we mandate that all lookups must be inside the image... +But indexing wrongly would blow things up or show incorrect results. +*/ +template +BLI_INLINE T clamp(T value, T min_value, T max_value) +{ + if (value < min_value) { + value = min_value; + } + else if (value > max_value) { + value = max_value; + } + return value; +} + +//Dangerous macros... + +#define max(a, b) (((a) > (b)) ? (a) : (b)) +#define min(a, b) (((a) < (b)) ? (a) : (b)) + +//Check image access in debug builds +#ifdef NDEBUG +#define check_coordinates(image, x, y) +#else +void check_coordinates(const image2d_t image, int x, int y) +{ + if (x < 0 || x >= image->width || y < 0 || y >= image->height) { + //We *really* want to make this clear in debug builds + printf("Image access out of bounds, aborting!\n"); + abort(); + } +} +#endif + +template +BLI_INLINE int get_pixel_index(const image2d_t image, int x, int y) +{ + check_coordinates(image, x, y); + //x = clamp(x, 0, image->width_clamp); + //y = clamp(y, 0, image->height_clamp); + return ((y * image->width) + x) * Channels; +} + +BLI_INLINE int get_pixel_index_slow(const image2d_t image, int x, int y) +{ + check_coordinates(image, x, y); + //x = clamp(x, 0, image->width_clamp); + //y = clamp(y, 0, image->height_clamp); + return ((y * image->width) + x) * image->channels; +} + +template +BLI_INLINE T *get_pixel(const image2d_t image, const int2& pos) +{ + return ((T*)image->data) + get_pixel_index(image, pos.x, pos.y); +} + +template +BLI_INLINE T *get_pixel_slow(const image2d_t image, const int2& pos) +{ + return ((T*)image->data) + get_pixel_index_slow(image, pos.x, pos.y); +} + +/* +Image reading functions are crucial for performance, most kernels read a lot more +than they write. +*/ +template +float4 read_imagef_base(const image2d_t image, sampler_t sampler, int2 pos) +{ + BPL_STATIC_FAIL("Invalid template specialization"); + return float4(); +} + +template<> +BLI_INLINE float4 read_imagef_base(const image2d_t image, sampler_t sampler, int2 pos) +{ + //Use multiply instead of division by 255.0f for a minor speed boost + const unsigned char *data = get_pixel(image, pos); + return float4(data[0], data[1], data[2], data[3]) * (1.0f / 255.0f); +} + +template<> +BLI_INLINE float4 read_imagef_base(const image2d_t image, sampler_t sampler, int2 pos) +{ + const unsigned char *data = get_pixel(image, pos); + return float4(data[0], data[1], data[2], 0.0f) * (1.0f / 255.0f); +} + +template<> +BLI_INLINE float4 read_imagef_base(const image2d_t image, sampler_t sampler, int2 pos) +{ + const unsigned char *data = get_pixel(image, pos); + return float4(data[0], data[1], 0.0f, 0.0f) * (1.0f / 255.0f); +} + +template<> +BLI_INLINE float4 read_imagef_base(const image2d_t image, sampler_t sampler, int2 pos) +{ + const unsigned char *data = get_pixel(image, pos); + return float4(data[0], 0.0f, 0.0f, 0.0f) * (1.0f / 255.0f); +} + +template<> +BLI_INLINE float4 read_imagef_base(const image2d_t image, sampler_t sampler, int2 pos) +{ +#ifdef BPL_SSE2 + return float4(*get_pixel<__m128, 1>(image, pos)); +#else + const float *data = get_pixel(image, pos); + return float4(data[0], data[1], data[2], data[3]); +#endif +} + +template<> +BLI_INLINE float4 read_imagef_base(const image2d_t image, sampler_t sampler, int2 pos) +{ + const float *data = get_pixel(image, pos); + return float4(data[0], data[1], data[2], 0.0f); +} + +template<> +BLI_INLINE float4 read_imagef_base(const image2d_t image, sampler_t sampler, int2 pos) +{ + const float *data = get_pixel(image, pos); + return float4(data[0], data[1], 0.0f, 0.0f); +} + +template<> +BLI_INLINE float4 read_imagef_base(const image2d_t image, sampler_t sampler, int2 pos) +{ + const float *data = get_pixel(image, pos); + return float4(data[0], 0.0f, 0.0f, 0.0f); +} + +#define read_imagef read_imagef_base + +//Don't use template channel count specialization for image writing functions. Usually reading is much more speed critical. + +template +inline void write_imagef_base(const image2d_t image, int2 pos, const float4& color) +{ + BPL_STATIC_FAIL("Invalid template specialization"); +} + +template<> +BLI_INLINE void write_imagef_base(const image2d_t image, int2 pos, const float4& color) +{ + unsigned char *data = get_pixel_slow(image, pos); + const float4 norm = color * 255.0f; + + switch (image->channels) { + case 4: + data[3] = norm.w; + // fall-through + case 3: + data[2] = norm.z; + // fall-through + case 2: + data[1] = norm.y; + // fall-through + case 1: + data[0] = norm.x; + // fall-through + } +} + +template<> +BLI_INLINE void write_imagef_base(const image2d_t image, int2 pos, const float4& color) +{ + float *data = get_pixel_slow(image, pos); + + switch (image->channels) { + case 4: + data[3] = color.w; + // fall-through + case 3: + data[2] = color.z; + // fall-through + case 2: + data[1] = color.y; + // fall-through + case 1: + data[0] = color.x; + // fall-through + } +} + +#define write_imagef write_imagef_base + +struct BPLKernelContext { + int global_ids[2]; +}; + +#define get_global_id(index) kernel_context->global_ids[index] +#define get_image_width(image) image->width +#define get_image_height(image) image->height +#define get_image_dim(image) int2(image->width, image->height) + +#define _BPL_IMAGE_PASS(image, scanline, lines, kernel, type, channels, ...) \ + { \ + const int width = image->width; \ + const int height = scanline + lines; \ + for (int y = scanline; y < height; ++y) { \ + BPLKernelContext context; \ + context.global_ids[1] = y; \ + for (int x = 0; x < width; ++x) { \ + context.global_ids[0] = x; \ + kernel(&context, __VA_ARGS__); \ + } \ + } \ + } + +//Support all four channels for now, in practive it's rare to have 2-component colors... + +#define BPL_EXECUTE_IMAGE_KERNEL(image, scanline, lines, kernel, ...) \ + if (image->format == BPL_FORMAT_UINT8) { \ + if (image->channels == 1) { \ + _BPL_IMAGE_PASS(image, scanline, lines, kernel, unsigned char, 1, __VA_ARGS__); \ + } \ + else if (image->channels == 2) { \ + _BPL_IMAGE_PASS(image, scanline, lines, kernel, unsigned char, 2, __VA_ARGS__); \ + } \ + else if (image->channels == 3) { \ + _BPL_IMAGE_PASS(image, scanline, lines, kernel, unsigned char, 3, __VA_ARGS__); \ + } \ + else if (image->channels == 4) { \ + _BPL_IMAGE_PASS(image, scanline, lines, kernel, unsigned char, 4, __VA_ARGS__); \ + } \ + } \ + else if (image->format == BPL_FORMAT_FLOAT) { \ + if (image->channels == 1) { \ + _BPL_IMAGE_PASS(image, scanline, lines, kernel, float, 1, __VA_ARGS__); \ + } \ + else if (image->channels == 2) { \ + _BPL_IMAGE_PASS(image, scanline, lines, kernel, float, 2, __VA_ARGS__); \ + } \ + else if (image->channels == 3) { \ + _BPL_IMAGE_PASS(image, scanline, lines, kernel, float, 3, __VA_ARGS__); \ + } \ + else if (image->channels == 4) { \ + _BPL_IMAGE_PASS(image, scanline, lines, kernel, float, 4, __VA_ARGS__); \ + } \ + } + +#define BPL_THREAD_CALLBACK extern "C" static + +#endif diff --git a/source/blender/processing/intern/BPL_opencl.cpp b/source/blender/processing/intern/BPL_opencl.cpp new file mode 100644 index 0000000..fc46abf --- /dev/null +++ b/source/blender/processing/intern/BPL_opencl.cpp @@ -0,0 +1,172 @@ + +#include +#include + +#include "clew.h" +#include "BPL_DeviceOpenCL.h" +#include "BPL_opencl_kernels.cl.h" + +void BPL_log(const char*); + +cl_context BPL_cl_context = 0; +cl_program BPL_cl_program = 0; +std::vector BPL_devices; + +BPLOpenCLDevice *bpl_get_device() +{ + //TODO Try to selecte the "best" device somehow... Or let user decide? + + if (BPL_devices.size() > 0) { + return BPL_devices[0]; + } + return NULL; +} + +static void CL_CALLBACK BPL_cl_context_error(const char *errinfo, + const void * /*private_info*/, + size_t /*cb*/, + void * /*user_data*/) +{ + printf("OpenCL error: %s\n", errinfo); +} + +cl_program bpl_build_program(cl_context context, std::vector *devices) +{ + const char *cl_str[2] = { datatoc_BPL_opencl_kernels_cl, NULL }; + cl_int status = 0; + cl_program program = clCreateProgramWithSource(context, 1, cl_str, 0, &status); + if (status == CL_SUCCESS) { + status = clBuildProgram(program, devices->size(), devices->data(), 0, 0, 0); + if (status == CL_SUCCESS) { + BPL_log("Program build OK"); + } + else { + BPL_log("Program build failed"); + + size_t ret_val_size = 0; + status = clGetProgramBuildInfo(program, devices->at(0), CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); + + std::vector build_log; + build_log.resize(ret_val_size + 1); + status = clGetProgramBuildInfo(program, devices->at(0), CL_PROGRAM_BUILD_LOG, ret_val_size, build_log.data(), NULL); + build_log[ret_val_size] = '\0'; + + std::string log = "Program build log: "; + log += build_log.data(); + BPL_log(log.c_str()); + + clReleaseProgram(program); + program = NULL; + } + } + else { + BPL_log("Program creation failed"); + } + + return program; +} + +void bpl_log_device_name(cl_device_id device) +{ + std::vector device_name; + + size_t length = 0; + cl_int status = clGetDeviceInfo(device, CL_DEVICE_NAME, 0, NULL, &length); + if (status == CL_SUCCESS) { + device_name.resize(length + 1); + + status = clGetDeviceInfo(device, CL_DEVICE_NAME, length, device_name.data(), NULL); + if (status == CL_SUCCESS) { + BPL_log((std::string("OpenCL device: ") + device_name.data()).c_str()); + } + } +} + +void BPL_opencl_init() +{ + if (clewInit() != CLEW_SUCCESS) { + BPL_log("CLEW init failed"); + return; + } + + if (clCreateContextFromType == NULL) { + BPL_log("OpenCL API pointers are NULL"); + return; + } + + cl_uint platform_count = 0; + cl_int status = clGetPlatformIDs(0, 0, &platform_count); + if (status == CL_SUCCESS) { + BPL_log(("OpenCL platforms: " + std::to_string(platform_count)).c_str()); + + std::vector platform_ids; + platform_ids.resize(platform_count); + status = clGetPlatformIDs(platform_count, platform_ids.data(), 0); + if (status == CL_SUCCESS) { + BPL_log("Platform IDs OK"); + + for (int i = 0; i < platform_count; i++) { + cl_platform_id platform = platform_ids[i]; + cl_uint device_count = 0; + clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, 0, &device_count); + if (device_count <= 0) + continue; + + std::vector device_ids; + device_ids.resize(device_count); + clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, device_count, device_ids.data(), 0); + + BPL_cl_context = clCreateContext(NULL, device_count, device_ids.data(), BPL_cl_context_error, NULL, &status); + if (status == CL_SUCCESS) { + BPL_log("OpenCL context OK"); + + BPL_cl_program = bpl_build_program(BPL_cl_context, &device_ids); + if (BPL_cl_program != 0) { + for (int i = 0; i < device_count; i++) { + cl_device_id device = device_ids[i]; + bpl_log_device_name(device); + + cl_int vendor_id = 0; + status = clGetDeviceInfo(device, CL_DEVICE_VENDOR_ID, sizeof(vendor_id), &vendor_id, NULL); + + BPLOpenCLDevice *bpl_device = new BPLOpenCLDevice(BPL_cl_context, device, BPL_cl_program, vendor_id); + bpl_device->create_shared_resources(); + BPL_devices.push_back(bpl_device); + } + } + } + else { + BPL_log("OpenCL context error"); + } + } + } + else { + BPL_log("OpenCL platform id error"); + } + } + else { + BPL_log("OpenCL platform error"); + } + + BPL_log(("OpenCL device count: " + std::to_string(BPL_devices.size())).c_str()); +} + +void BPL_opencl_close() +{ + while (BPL_devices.size() > 0) { + BPLOpenCLDevice *device = BPL_devices.back(); + device->destroy_shared_resources(); + delete device; + BPL_devices.pop_back(); + } + + if (BPL_cl_program != 0) { + clReleaseProgram(BPL_cl_program); + BPL_cl_program = 0; + } + + if (BPL_cl_context != 0) { + clReleaseContext(BPL_cl_context); + BPL_cl_context = 0; + } +} diff --git a/source/blender/processing/intern/BPL_processing.cpp b/source/blender/processing/intern/BPL_processing.cpp new file mode 100644 index 0000000..5133204 --- /dev/null +++ b/source/blender/processing/intern/BPL_processing.cpp @@ -0,0 +1,155 @@ + +#include +#include + +#include "BLI_math.h" + +extern "C" { +#include "MEM_guardedalloc.h" + +#include "BLI_threads.h" + +#include "IMB_imbuf.h" + +#include "DNA_userdef_types.h" +} + +#include + +#include "BPL_processing.h" +#include "BPL_DeviceOpenCL.h" +#include "BPL_DeviceCPU.h" + +void BPL_opencl_init(); +void BPL_opencl_close(); +BPLOpenCLDevice *bpl_get_device(); + +struct ProcessOperation { + BPLDeviceBase *device; +}; + +void BPL_log(const char *message) +{ + printf("%s", message); + printf("\n"); +} + +void BPL_init() +{ + BPL_log("BPL init"); + + BPL_opencl_init(); +} + +void BPL_exit() +{ + BPL_log("BPL exit"); + + BPL_opencl_close(); +} + +BPLDeviceBase *bpl_create_operation_device() +{ + BPLDeviceBase *device = NULL; + + if (U.processing_type == USER_BPL_COMPUTE_OPENCL) { + // Try to use OpenCL if available and enabled in settings + BPLOpenCLDevice *cl_device = bpl_get_device(); + if (cl_device) { + device = cl_device->clone_shared(); + } + } + + if (device == NULL) { + // Use a software device by default + device = new BPLDeviceCPU(); + } + + return device; +} + +ProcessOperation *bpl_process_image(const void *data, BPLFormatType format, int width, int height, int channels) +{ + ProcessOperation *op = new ProcessOperation(); + op->device = bpl_create_operation_device(); + + if (!op->device->start_operation(data, format, width, height, channels)) { + //Start failed, probably could not allocate buffers etc. + + if (op->device->is_gpu()) { + BPL_log("Hardware device start failed, falling back to software"); + delete op->device; + + op->device = new BPLDeviceCPU(); + if (!op->device->start_operation(data, format, width, height, channels)) { + BPL_log("CPU Device fallback start failed, aborting"); + abort(); + } + } + else { + BPL_log("CPU Device start failed, aborting"); + abort(); + } + } + + return op; +} + +ProcessOperation *BPL_process_image_8bit(const unsigned char *data, int width, int height, int channels) +{ + return bpl_process_image(data, BPL_FORMAT_UINT8, width, height, channels); +} + +ProcessOperation *BPL_process_image_float(const float *data, int width, int height, int channels) +{ + return bpl_process_image(data, BPL_FORMAT_FLOAT, width, height, channels); +} + +void BPL_end_operation(ProcessOperation *op) +{ + delete op->device; + op->device = NULL; + + delete op; +} + +void BPL_op_image_blur(ProcessOperation *op, int w, int h) +{ + op->device->blur(w, h); +} + +void BPL_op_image_copy(ProcessOperation *op, void *dst) +{ + op->device->wait(); + op->device->copy_data(dst); +} + +//TODO duplicated in GaussianXBlurOperation... + +extern "C" float RE_filter_value(int type, float x); + +#define R_FILTER_GAUSS 5 + +float *BPL_make_gausstab(float rad, int size) +{ + float *gausstab, sum, val; + int i, n; + + n = 2 * size + 1; + + gausstab = (float *)MEM_mallocN(sizeof(float) * n, __func__); + + sum = 0.0f; + float fac = (rad > 0.0f ? 1.0f / rad : 0.0f); + for (i = -size; i <= size; i++) { + val = RE_filter_value(R_FILTER_GAUSS, (float)i * fac); + sum += val; + gausstab[i + size] = val; + } + + sum = 1.0f / sum; + for (i = 0; i < n; i++) + gausstab[i] *= sum; + + return gausstab; +} diff --git a/source/blender/processing/intern/kernels/BPL_opencl_kernels.cl b/source/blender/processing/intern/kernels/BPL_opencl_kernels.cl new file mode 100644 index 0000000..e5ab5be --- /dev/null +++ b/source/blender/processing/intern/kernels/BPL_opencl_kernels.cl @@ -0,0 +1,70 @@ + +/* + BPL Kernels. This file is compiled to C++ by using a compatibility shim and also run by OpenCL. + If you need to add new features that must work on both environments, see BPL_cpu.cpp. +*/ + +//OpenCL / C++ compatibility defines +#ifdef __OPENCL_VERSION__ + #define KERNEL_CONTEXT +#else + #define KERNEL_CONTEXT BPLKernelContext *kernel_context, +#endif + +// This sampler must be used always as the CPU version will match it. +// Support for other samplers can be added if required. +__constant sampler_t SAMPLER_NEAREST = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; + +__kernel void gaussian_blur_horizontal( + KERNEL_CONTEXT + __read_only image2d_t image_in, + __write_only image2d_t image_out, + __global float *gauss_table, + int filter_size) +{ + float4 color = {0.0f, 0.0f, 0.0f, 0.0f}; + float weight = 0.0f; + + const int2 pos = {get_global_id(0), get_global_id(1)}; + const int xmin = max(pos.x - filter_size, 0); + const int xmax = min(pos.x + filter_size + 1, get_image_width(image_in)); + + int i = max(filter_size - pos.x, 0); + for (int x = xmin; x < xmax; x++, i++) { + float w = gauss_table[i]; + int2 src = {x, pos.y}; + color += read_imagef(image_in, SAMPLER_NEAREST, src) * w; + weight += w; + } + + color *= (1.0f / weight); + + write_imagef(image_out, pos, color); +} + +__kernel void gaussian_blur_vertical( + KERNEL_CONTEXT + __read_only image2d_t image_in, + __write_only image2d_t image_out, + __global float *gauss_table, + int filter_size) +{ + float4 color = {0.0f, 0.0f, 0.0f, 0.0f}; + float weight = 0.0f; + + const int2 pos = {get_global_id(0), get_global_id(1)}; + const int ymin = max(pos.y - filter_size, 0); + const int ymax = min(pos.y + filter_size + 1, get_image_height(image_in)); + + int i = max(filter_size - pos.y, 0); + for (int y = ymin; y < ymax; y++, i++) { + float w = gauss_table[i]; + int2 src = {pos.x, y}; + color += read_imagef(image_in, SAMPLER_NEAREST, src) * w; + weight += w; + } + + color *= (1.0f / weight); + + write_imagef(image_out, pos, color); +} diff --git a/source/blender/windowmanager/CMakeLists.txt b/source/blender/windowmanager/CMakeLists.txt index a8b3c99..cc7121c 100644 --- a/source/blender/windowmanager/CMakeLists.txt +++ b/source/blender/windowmanager/CMakeLists.txt @@ -31,6 +31,7 @@ set(INC ../blenloader ../blentranslation ../compositor + ../processing ../editors/include ../gpu ../imbuf diff --git a/source/blender/windowmanager/intern/wm_init_exit.c b/source/blender/windowmanager/intern/wm_init_exit.c index e73ec2b..4eecdc2 100644 --- a/source/blender/windowmanager/intern/wm_init_exit.c +++ b/source/blender/windowmanager/intern/wm_init_exit.c @@ -121,6 +121,7 @@ #include "BKE_depsgraph.h" #include "BKE_sound.h" +#include "BPL_processing.h" #include "COM_compositor.h" #ifdef WITH_OPENSUBDIV @@ -159,6 +160,8 @@ void WM_init(bContext *C, int argc, const char **argv) } GHOST_CreateSystemPaths(); + BPL_init(); + BKE_addon_pref_type_init(); wm_operatortype_init(); @@ -513,6 +516,8 @@ void WM_exit_ext(bContext *C, const bool do_python) #ifdef WITH_COMPOSITOR COM_deinitialize(); #endif + + BPL_exit(); BKE_blender_free(); /* blender.c, does entire library and spacetypes */ // free_matcopybuf(); diff --git a/source/blenderplayer/bad_level_call_stubs/stubs.c b/source/blenderplayer/bad_level_call_stubs/stubs.c index b8e8826..79f7e04 100644 --- a/source/blenderplayer/bad_level_call_stubs/stubs.c +++ b/source/blenderplayer/bad_level_call_stubs/stubs.c @@ -148,6 +148,7 @@ struct wmWindowManager; #include "../blender/blenkernel/BKE_paint.h" #include "../blender/collada/collada.h" #include "../blender/compositor/COM_compositor.h" +#include "../blender/processing/BPL_processing.h" #include "../blender/editors/include/ED_armature.h" #include "../blender/editors/include/ED_anim_api.h" #include "../blender/editors/include/ED_buttons.h" @@ -793,6 +794,15 @@ void COM_execute(RenderData *rd, Scene *scene, bNodeTree *editingtree, int rende const ColorManagedViewSettings *viewSettings, const ColorManagedDisplaySettings *displaySettings, const char *viewName) RET_NONE +/* processing */ +void BPL_init() RET_NONE +void BPL_exit() RET_NONE +ProcessOperation* BPL_process_image_8bit(const unsigned char *data, int width, int height, int channels) RET_NULL +ProcessOperation* BPL_process_image_float(const float *data, int width, int height, int channels) RET_NULL +void BPL_end_operation(ProcessOperation *op) RET_NONE +void BPL_op_image_blur(ProcessOperation *op, int w, int h) RET_NONE +void BPL_op_image_copy(ProcessOperation *op, void *dst) RET_NONE + /*multiview*/ bool RE_RenderResult_is_stereo(RenderResult *res) RET_ZERO void uiTemplateImageViews(uiLayout *layout, struct PointerRNA *imfptr) RET_NONE