commit 8b5b3c1e2ec0f944e2ed8333634bf4dada2e006c Author: Brecht Van Lommel Date: Tue Jun 4 13:14:13 2013 +0200 Tmp. diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index f32c6dd..27978b9 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -46,6 +46,7 @@ public: map tex_interp_map; int cuDevId; bool first_error; + vector cuStreams; struct PixelMem { GLuint cuPBO; @@ -205,6 +206,12 @@ public: if(cuda_error_(result, "cuCtxCreate")) return; + const int num_streams = 8; + cuStreams.resize(num_streams); + + for(int i = 0; i < num_streams; i++) + cuStreamCreate(&cuStreams[i], 0); + cuda_pop_context(); } @@ -212,6 +219,9 @@ public: { task_pool.stop(); + for(int i = 0; i < cuStreams.size(); i++) + cuStreamDestroy(cuStreams[i]); + cuda_push_context(); cuda_assert(cuCtxDetach(cuContext)) } @@ -514,7 +524,7 @@ public: } } - void path_trace(RenderTile& rtile, int sample) + void path_trace(RenderTile& rtile, int sample, CUstream stream) { if(have_error()) return; @@ -575,9 +585,9 @@ public: cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)) cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1)) - cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks)) + cuda_assert(cuLaunchGridAsync(cuPathTrace, xblocks, yblocks, stream)) - cuda_assert(cuCtxSynchronize()) + //cuda_assert(cuCtxSynchronize()) cuda_pop_context(); } @@ -882,12 +892,35 @@ public: void thread_run(DeviceTask *task) { if(task->type == DeviceTask::PATH_TRACE) { - RenderTile tile; + vector concurrent_tiles(cuStreams.size()); + vector have_tile(cuStreams.size()); /* keep rendering tiles until done */ - while(task->acquire_tile(this, tile)) { - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; + while(1) { + int start_sample = -1; + int end_sample = -1; + + for(int i = 0; i < concurrent_tiles.size(); i++) { + RenderTile& tile = concurrent_tiles[i]; + + if(task->acquire_tile(this, tile)) { + have_tile[i] = true; + + if(start_sample == -1) { + start_sample = tile.start_sample; + end_sample = tile.start_sample + tile.num_samples; + } + else { + start_sample = min(start_sample, tile.start_sample); + end_sample = max(end_sample, tile.start_sample + tile.num_samples); + } + } + else + have_tile[i] = false; + } + + if(start_sample == -1) + break; for(int sample = start_sample; sample < end_sample; sample++) { if (task->get_cancel()) { @@ -895,21 +928,35 @@ public: break; } - path_trace(tile, sample); + for(int i = 0; i < concurrent_tiles.size(); i++) { + if(have_tile[i]) { + RenderTile& tile = concurrent_tiles[i]; + int tile_end_sample = tile.start_sample + tile.num_samples; - tile.sample = sample + 1; + if(sample > tile.start_sample && sample < tile_end_sample) { + path_trace(tile, sample, cuStreams[i]); + tile.sample = sample + 1; - task->update_progress(tile); + if(i == 0) + task->update_progress(tile); + } + } + } } - task->release_tile(tile); + for(int i = 0; i < concurrent_tiles.size(); i++) { + if(have_tile[i]) { + RenderTile& tile = concurrent_tiles[i]; + task->release_tile(tile); + } + } } } else if(task->type == DeviceTask::SHADER) { shader(*task); cuda_push_context(); - cuda_assert(cuCtxSynchronize()) + //cuda_assert(cuCtxSynchronize()) cuda_pop_context(); } } @@ -930,7 +977,7 @@ public: tonemap(task, task.buffer, task.rgba); cuda_push_context(); - cuda_assert(cuCtxSynchronize()) + //cuda_assert(cuCtxSynchronize()) cuda_pop_context(); } else {