From af4391300c4ce8c59e85b42725d767b13240408c Mon Sep 17 00:00:00 2001 From: jacob Date: Thu, 26 Jun 2025 00:47:12 -0500 Subject: [PATCH] gpu evictor thread --- src/gp_dx12.c | 308 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 253 insertions(+), 55 deletions(-) diff --git a/src/gp_dx12.c b/src/gp_dx12.c index 3a956836..3f1e393c 100644 --- a/src/gp_dx12.c +++ b/src/gp_dx12.c @@ -57,6 +57,21 @@ #define DX12_NUM_RTV_DESCRIPTORS (1024 * 1) #define DX12_COMMAND_BUFFER_MIN_SIZE (1024 * 64) +#define DX12_MULTI_QUEUE 1 +#if DX12_MULTI_QUEUE +# define DX12_QUEUE_DIRECT 0 +# define DX12_QUEUE_COMPUTE 1 +# define DX12_QUEUE_COPY_CRITICAL 2 +# define DX12_QUEUE_COPY_BACKGROUND 3 +# define DX12_NUM_QUEUES 4 +#else +# define DX12_QUEUE_DIRECT 0 +# define DX12_QUEUE_COMPUTE 0 +# define DX12_QUEUE_COPY_CRITICAL 0 +# define DX12_QUEUE_COPY_BACKGROUND 0 +# define DX12_NUM_QUEUES 1 +#endif + #if RTC # define DX12_DEBUG 1 # define DX12_SHADER_DEBUG 1 @@ -200,8 +215,8 @@ struct descriptor { }; struct dx12_resource { - ID3D12Resource *resource; enum D3D12_RESOURCE_STATES state; + ID3D12Resource *resource; struct descriptor *cbv_descriptor; struct descriptor *srv_descriptor; struct descriptor *uav_descriptor; @@ -242,6 +257,17 @@ struct cpu_descriptor_heap { struct D3D12_CPU_DESCRIPTOR_HANDLE handle; }; +enum fenced_release_kind { + FENCED_RELEASE_KIND_NONE, + FENCED_RELEASE_KIND_RESOURCE, + FENCED_RELEASE_KIND_PIPELINE +}; + +struct fenced_release_data { + enum fenced_release_kind kind; + void *ptr; +}; + enum handle_kind { DX12_HANDLE_KIND_NONE, DX12_HANDLE_KIND_RESOURCE, @@ -293,6 +319,11 @@ GLOBAL struct { struct dict *top_successful_pipelines; /* Latest pipelines that successfully compiled */ struct pipeline_scope *first_free_pipeline_scope; + /* Fenced release queue */ + struct sys_mutex *fenced_releases_mutex; + struct arena *fenced_releases_arena; + u64 fenced_release_targets[DX12_NUM_QUEUES]; + /* Factory */ IDXGIFactory6 *factory; @@ -311,16 +342,17 @@ 
GLOBAL struct { struct cpu_descriptor_heap *rtv_heap; /* Command queues */ - /* TODO: Add optional mode to route everything to direct queue */ struct sys_mutex *global_command_list_record_mutex; - struct sys_mutex *global_command_list_submit_mutex; - struct command_queue *cq_direct; - struct command_queue *cq_compute; - struct command_queue *cq_copy_critical; - struct command_queue *cq_copy_background; + struct sys_mutex *global_submit_mutex; + struct command_queue *command_queues[DX12_NUM_QUEUES]; /* Swapchain */ struct swapchain swapchain; + + /* Evictor thread */ + struct atomic_i32 evictor_thread_shutdown; + HANDLE evictor_thread_wake_event; + struct sys_thread *evictor_thread; } G = ZI, DEBUG_ALIAS(G, G_gp_dx12); /* ========================== * @@ -334,7 +366,8 @@ INTERNAL void dx12_init_pipelines(void); INTERNAL struct cpu_descriptor_heap *cpu_descriptor_heap_alloc(enum D3D12_DESCRIPTOR_HEAP_TYPE type); INTERNAL struct command_queue *command_queue_alloc(enum D3D12_COMMAND_LIST_TYPE type, enum D3D12_COMMAND_QUEUE_PRIORITY priority, struct string dbg_name); INTERNAL void command_queue_release(struct command_queue *cq); -INTERNAL void dx12_resource_release(struct dx12_resource *resource); +INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(evictor_thread_entry_point, arg); +INTERNAL void fenced_release(void *data, enum fenced_release_kind kind); #if RESOURCE_RELOADING INTERNAL RESOURCE_WATCH_CALLBACK_FUNC_DEF(pipeline_resource_watch_callback, name); @@ -368,6 +401,10 @@ struct gp_startup_receipt gp_startup(struct work_startup_receipt *work_sr) G.top_pipelines = dict_init(G.pipelines_arena, 1024); G.top_successful_pipelines = dict_init(G.pipelines_arena, 1024); + /* Initialize fenced releases queue */ + G.fenced_releases_mutex = sys_mutex_alloc(); + G.fenced_releases_arena = arena_alloc(GIGABYTE(64)); + /* Initialize dx12 */ dx12_init_device(); dx12_init_objects(); @@ -379,6 +416,10 @@ struct gp_startup_receipt gp_startup(struct work_startup_receipt *work_sr) #endif 
app_register_exit_callback(gp_shutdown); + /* Start evictor thread */ + G.evictor_thread_wake_event = CreateEvent(NULL, false, false, NULL); + G.evictor_thread = sys_thread_alloc(evictor_thread_entry_point, NULL, LIT("[P2] GPU evictor thread")); + struct gp_startup_receipt res = ZI; return res; } @@ -386,25 +427,27 @@ struct gp_startup_receipt gp_startup(struct work_startup_receipt *work_sr) INTERNAL APP_EXIT_CALLBACK_FUNC_DEF(gp_shutdown) { __prof; -#if DX12_DEBUG +#if 0 /* Release objects to make live object reporting less noisy */ //IDXGISwapChain3_Release(G.swapchain); - command_queue_release(G.cq_copy_background); - command_queue_release(G.cq_copy_critical); - command_queue_release(G.cq_compute); - command_queue_release(G.cq_direct); + for (u32 i = 0; i < ARRAY_COUNT(G.command_queues); ++i) { + struct command_queue *cq = G.command_queues[i]; + command_queue_release(cq); + } ID3D12Device_Release(G.device); #else (UNUSED)command_queue_release; #endif + + atomic_i32_eval_exchange(&G.evictor_thread_shutdown, 1); + SetEvent(G.evictor_thread_wake_event); + sys_thread_wait_release(G.evictor_thread); } /* ========================== * * Handle * ========================== */ -INTERNAL void dx12_resource_release(struct dx12_resource *t); - INTERNAL struct gp_handle handle_alloc(enum handle_kind kind, void *data) { u64 old_gen = 0; @@ -502,7 +545,7 @@ void gp_release(struct gp_handle handle) case DX12_HANDLE_KIND_RESOURCE: { - dx12_resource_release(data); + fenced_release(data, FENCED_RELEASE_KIND_RESOURCE); } break; } } @@ -684,11 +727,18 @@ INTERNAL void dx12_init_objects(void) /* Create command queues */ G.global_command_list_record_mutex = sys_mutex_alloc(); - G.global_command_list_submit_mutex = sys_mutex_alloc(); - G.cq_direct = command_queue_alloc(D3D12_COMMAND_LIST_TYPE_DIRECT, D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, LIT("Direct queue")); - G.cq_compute = command_queue_alloc(D3D12_COMMAND_LIST_TYPE_COMPUTE, D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, LIT("Compute queue")); 
- G.cq_copy_critical = command_queue_alloc(D3D12_COMMAND_LIST_TYPE_COPY, D3D12_COMMAND_QUEUE_PRIORITY_HIGH, LIT("High priority copy queue")); - G.cq_copy_background = command_queue_alloc(D3D12_COMMAND_LIST_TYPE_COPY, D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, LIT("Background copy queue")); + G.global_submit_mutex = sys_mutex_alloc(); + for (u32 i = 0; i < DX12_NUM_QUEUES; ++i) { + if (i == DX12_QUEUE_DIRECT) { + G.command_queues[i] = command_queue_alloc(D3D12_COMMAND_LIST_TYPE_DIRECT, D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, LIT("Direct queue")); + } else if (i == DX12_QUEUE_COMPUTE) { + G.command_queues[i] = command_queue_alloc(D3D12_COMMAND_LIST_TYPE_COMPUTE, D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, LIT("Compute queue")); + } else if (i == DX12_QUEUE_COPY_CRITICAL) { + G.command_queues[i] = command_queue_alloc(D3D12_COMMAND_LIST_TYPE_COPY, D3D12_COMMAND_QUEUE_PRIORITY_HIGH, LIT("High priority copy queue")); + } else if (i == DX12_QUEUE_COPY_BACKGROUND) { + G.command_queues[i] = command_queue_alloc(D3D12_COMMAND_LIST_TYPE_COPY, D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, LIT("Background copy queue")); + } + } } /* ========================== * @@ -696,7 +746,6 @@ INTERNAL void dx12_init_objects(void) * ========================== */ INTERNAL void pipeline_alloc_from_desc(u64 num_pipelines, struct pipeline_desc *descs, struct pipeline **pipelines_out); -INTERNAL void pipeline_release(struct pipeline *pipeline); INTERNAL void pipeline_register(u64 num_pipelines, struct pipeline **pipelines); INTERNAL void dx12_init_pipelines(void) @@ -1205,17 +1254,13 @@ INTERNAL void pipeline_alloc_from_desc(u64 num_pipelines, struct pipeline_desc * work_wait(work); } -INTERNAL void pipeline_release(struct pipeline *pipeline) +INTERNAL void pipeline_release_now(struct pipeline *pipeline) { __prof; - /* FIXME: Delayed release based on queue fence */ - (UNUSED)pipeline; -#if 0 if (pipeline->pso) { ID3D12PipelineState_Release(pipeline->pso); } arena_release(pipeline->arena); -#endif } /* 
========================== * @@ -1255,7 +1300,7 @@ INTERNAL void pipeline_scope_end(struct pipeline_scope *scope) for (struct dict_entry *entry = scope->refs->first; entry; entry = entry->next) { struct pipeline *pipeline = (struct pipeline *)entry->value; if (--pipeline->refcount <= 0) { - pipeline_release(pipeline); + fenced_release(pipeline, FENCED_RELEASE_KIND_PIPELINE); } } scope->next_free = G.first_free_pipeline_scope; @@ -1304,7 +1349,7 @@ INTERNAL void pipeline_register(u64 num_pipelines, struct pipeline **pipelines) { struct pipeline *old_pipeline = (struct pipeline *)dict_get(G.top_pipelines, hash); if (old_pipeline && --old_pipeline->refcount <= 0) { - pipeline_release(old_pipeline); + fenced_release(old_pipeline, FENCED_RELEASE_KIND_PIPELINE); } dict_set(G.pipelines_arena, G.top_pipelines, hash, (u64)pipeline); ++pipeline->refcount; @@ -1313,7 +1358,7 @@ INTERNAL void pipeline_register(u64 num_pipelines, struct pipeline **pipelines) if (pipeline->success) { struct pipeline *old_pipeline = (struct pipeline *)dict_get(G.top_successful_pipelines, hash); if (old_pipeline && --old_pipeline->refcount <= 0) { - pipeline_release(old_pipeline); + fenced_release(old_pipeline, FENCED_RELEASE_KIND_PIPELINE); } dict_set(G.pipelines_arena, G.top_successful_pipelines, hash, (u64)pipeline); ++pipeline->refcount; @@ -1614,6 +1659,38 @@ i32 gp_push_cmd(struct gp_flow *gp_flow, struct gp_cmd_desc *cmd_desc) return ret; } +/* ========================== * + * Fenced release + * ========================== */ + +INTERNAL void fenced_release(void *data, enum fenced_release_kind kind) +{ + struct fenced_release_data fr = ZI; + fr.kind = kind; + fr.ptr = data; + + u64 fr_targets[ARRAY_COUNT(G.fenced_release_targets)] = ZI; + + /* Read fence values */ + for (u32 i = 0; i < ARRAY_COUNT(G.command_queues); ++i) { + struct command_queue *cq = G.command_queues[i]; + struct sys_lock lock = sys_mutex_lock_s(cq->submit_fence_mutex); + fr_targets[i] = cq->submit_fence_target; + 
sys_mutex_unlock(&lock); + } + + /* Push data to release queue */ + { + struct sys_lock lock = sys_mutex_lock_e(G.fenced_releases_mutex); + *arena_push(G.fenced_releases_arena, struct fenced_release_data) = fr; + MEMCPY(G.fenced_release_targets, fr_targets, sizeof(fr_targets)); + sys_mutex_unlock(&lock); + } + + /* Wake evictor */ + SetEvent(G.evictor_thread_wake_event); +} + /* ========================== * * Resource * ========================== */ @@ -1681,11 +1758,33 @@ INTERNAL struct dx12_resource *dx12_resource_alloc(D3D12_HEAP_PROPERTIES heap_pr return r; } -INTERNAL void dx12_resource_release(struct dx12_resource *t) +INTERNAL void dx12_resource_release_now(struct dx12_resource *t) { __prof; - /* TODO */ - (UNUSED)t; + + /* Release descriptors */ + /* TODO: Batch lock heaps */ + if (t->cbv_descriptor) { + descriptor_release(t->cbv_descriptor); + } + if (t->srv_descriptor) { + descriptor_release(t->srv_descriptor); + } + if (t->uav_descriptor) { + descriptor_release(t->uav_descriptor); + } + if (t->rtv_descriptor) { + descriptor_release(t->rtv_descriptor); + } + + /* Release resource */ + ID3D12Resource_Release(t->resource); + + /* Add to free list */ + struct sys_lock lock = sys_mutex_lock_e(G.resources_mutex); + t->next_free = G.first_free_resource; + G.first_free_resource = t; + sys_mutex_unlock(&lock); } INTERNAL enum D3D12_RESOURCE_STATES dx12_resource_barrier(ID3D12GraphicsCommandList *cl, struct dx12_resource *resource, enum D3D12_RESOURCE_STATES state) @@ -1755,7 +1854,7 @@ INTERNAL void command_queue_release(struct command_queue *cq) __prof; /* TODO */ (UNUSED)cq; - //ID3D12CommandQueue_Release(G.cq_copy_background->cq); + //ID3D12CommandQueue_Release(cq->cq); } /* ========================== * @@ -1875,7 +1974,7 @@ INTERNAL u64 command_list_close(struct command_list *cl) u64 submit_fence_target = 0; { __profscope(Execute); - struct sys_lock global_lock = sys_mutex_lock_s(G.global_command_list_submit_mutex); + struct sys_lock submit_lock = 
sys_mutex_lock_s(G.global_submit_mutex); struct sys_lock fence_lock = sys_mutex_lock_e(cq->submit_fence_mutex); { submit_fence_target = ++cq->submit_fence_target; @@ -1883,7 +1982,7 @@ INTERNAL u64 command_list_close(struct command_list *cl) ID3D12CommandQueue_Signal(cq->cq, cq->submit_fence, submit_fence_target); } sys_mutex_unlock(&fence_lock); - sys_mutex_unlock(&global_lock); + sys_mutex_unlock(&submit_lock); } /* Add descriptor heaps to submitted list */ @@ -2333,7 +2432,7 @@ struct gp_handle gp_texture_alloc(enum gp_texture_format format, u32 flags, stru } /* Copy from upload heap to texture */ - struct command_queue *cq = G.cq_copy_background; + struct command_queue *cq = G.command_queues[DX12_QUEUE_COPY_BACKGROUND]; struct command_list *cl = command_list_open(cq->cl_pool); { __profscope_dx12(cl->cq->prof, cl->cl, Upload texture, RGB32_F(0.2, 0.5, 0.2)); @@ -2359,7 +2458,7 @@ struct gp_handle gp_texture_alloc(enum gp_texture_format format, u32 flags, stru /* TODO: Return async waitable to caller */ { __profscope(Wait for upload); - HANDLE event = CreateEvent(NULL, FALSE, FALSE, NULL); + HANDLE event = CreateEvent(NULL, false, false, NULL); ID3D12Fence_SetEventOnCompletion(cq->submit_fence, fence_target, event); WaitForSingleObject(event, INFINITE); CloseHandle(event); @@ -2401,7 +2500,8 @@ void gp_dispatch(struct gp_dispatch_params params) struct pipeline_scope *pipeline_scope = pipeline_scope_begin(); struct pipeline *material_pipeline = pipeline_from_name(pipeline_scope, LIT("material")); struct pipeline *shape_pipeline = pipeline_from_name(pipeline_scope, LIT("shape")); - struct command_list *cl = command_list_open(G.cq_direct->cl_pool); + struct command_queue *cq = G.command_queues[DX12_QUEUE_DIRECT]; + struct command_list *cl = command_list_open(cq->cl_pool); { __profscope_dx12(cl->cq->prof, cl->cl, Dispatch, RGB32_F(0.5, 0.2, 0.2)); struct dx12_resource *target = handle_get_data(params.draw_target, DX12_HANDLE_KIND_RESOURCE); @@ -2616,21 +2716,21 @@ 
INTERNAL struct swapchain_buffer *update_swapchain(struct swapchain *swapchain, b32 should_rebuild = !v2i32_eq(swapchain->resolution, resolution); if (should_rebuild) { HRESULT hr = 0; - struct command_queue *cq = G.cq_direct; + struct command_queue *cq = G.command_queues[DX12_QUEUE_DIRECT]; HWND hwnd = (HWND)sys_window_get_internal_handle(window); if (swapchain->swapchain) { ASSERT(hwnd == swapchain->hwnd); /* Lock direct queue submissions (in case any write to backbuffer) */ /* TODO: Less overkill approach - Only flush present_blit since we know it's the only operation targeting backbuffer */ - //struct sys_lock lock = sys_mutex_lock_e(cq->submit_fence_mutex); - DEBUGBREAKABLE; - struct sys_lock lock = sys_mutex_lock_e(G.global_command_list_record_mutex); + struct sys_lock lock = sys_mutex_lock_e(cq->submit_fence_mutex); + //DEBUGBREAKABLE; + //struct sys_lock lock = sys_mutex_lock_e(G.global_command_list_record_mutex); { /* Flush direct queue */ //ID3D12CommandQueue_Signal(cq->cq, cq->submit_fence, ++cq->submit_fence_target); { - HANDLE event = CreateEvent(NULL, FALSE, FALSE, NULL); + HANDLE event = CreateEvent(NULL, false, false, NULL); ID3D12Fence_SetEventOnCompletion(cq->submit_fence, cq->submit_fence_target, event); WaitForSingleObject(event, INFINITE); CloseHandle(event); @@ -2720,7 +2820,8 @@ INTERNAL void present_blit(struct swapchain_buffer *dst, struct dx12_resource *s struct pipeline_scope *pipeline_scope = pipeline_scope_begin(); struct pipeline *blit_pipeline = pipeline_from_name(pipeline_scope, LIT("blit")); if (blit_pipeline->success) { - struct command_list *cl = command_list_open(G.cq_direct->cl_pool); + struct command_queue *cq = G.command_queues[DX12_QUEUE_DIRECT]; + struct command_list *cl = command_list_open(cq->cl_pool); { __profscope_dx12(cl->cq->prof, cl->cl, Blit, RGB32_F(0.5, 0.2, 0.2)); struct swapchain *swapchain = dst->swapchain; @@ -2838,18 +2939,18 @@ void gp_present(struct sys_window *window, struct v2i32 backresolution, struct g 
__profscope(Mark queue frames); /* Lock because frame marks shouldn't occur while command lists are recording */ struct sys_lock lock = sys_mutex_lock_e(G.global_command_list_record_mutex); - __prof_dx12_new_frame(G.cq_direct->prof); - __prof_dx12_new_frame(G.cq_compute->prof); - __prof_dx12_new_frame(G.cq_copy_critical->prof); - __prof_dx12_new_frame(G.cq_copy_background->prof); + for (u32 i = 0; i < ARRAY_COUNT(G.command_queues); ++i) { + struct command_queue *cq = G.command_queues[i]; + __prof_dx12_new_frame(cq->prof); + } sys_mutex_unlock(&lock); } { __profscope(Collect queues); - __prof_dx12_collect(G.cq_direct->prof); - __prof_dx12_collect(G.cq_compute->prof); - __prof_dx12_collect(G.cq_copy_critical->prof); - __prof_dx12_collect(G.cq_copy_background->prof); + for (u32 i = 0; i < ARRAY_COUNT(G.command_queues); ++i) { + struct command_queue *cq = G.command_queues[i]; + __prof_dx12_collect(cq->prof); + } } #endif @@ -2860,6 +2961,105 @@ void gp_present(struct sys_window *window, struct v2i32 backresolution, struct g (UNUSED)vsync; } +/* ========================== * + * Evictor thread + * ========================== */ + +INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(evictor_thread_entry_point, arg) +{ + __prof; + (UNUSED)arg; + struct arena_temp scratch = scratch_begin_no_conflict(); + + HANDLE event = CreateEvent(NULL, false, false, NULL); + HANDLE events[2] = ZI; + events[0] = G.evictor_thread_wake_event; + events[1] = event; + + u64 completed_targets[DX12_NUM_QUEUES] = ZI; + + b32 shutdown = atomic_i32_eval(&G.evictor_thread_shutdown); + while (!shutdown) { + struct arena_temp temp = arena_temp_begin(scratch.arena); + { + __profscope(Run); + + u64 targets[ARRAY_COUNT(completed_targets)] = ZI; + + /* Copy queued data */ + u32 num_fenced_releases = 0; + struct fenced_release_data *fenced_releases = NULL; + { + __profscope(Copy queued releases); + struct sys_lock lock = sys_mutex_lock_e(G.fenced_releases_mutex); + num_fenced_releases = G.fenced_releases_arena->pos / 
sizeof(struct fenced_release_data); + fenced_releases = arena_push_array_no_zero(temp.arena, struct fenced_release_data, num_fenced_releases); + MEMCPY(fenced_releases, arena_base(G.fenced_releases_arena), G.fenced_releases_arena->pos); + arena_reset(G.fenced_releases_arena); + MEMCPY(targets, G.fenced_release_targets, sizeof(targets)); + sys_mutex_unlock(&lock); + } + + /* Wait until fences reach target */ + { + __profscope(Check fences); + for (u32 i = 0; i < ARRAY_COUNT(targets) && !shutdown; ++i) { + while (completed_targets[i] < targets[i] && !shutdown) { + struct command_queue *cq = G.command_queues[i]; + completed_targets[i] = ID3D12Fence_GetCompletedValue(cq->submit_fence); + if (completed_targets[i] < targets[i]) { + ID3D12Fence_SetEventOnCompletion(cq->submit_fence, targets[i], event); + { + __profscope(Wait on fence); + WaitForMultipleObjects(2, events, false, INFINITE); + shutdown = atomic_i32_eval(&G.evictor_thread_shutdown); + } + } + } + } + } + + /* Process releases */ + if (!shutdown) { + __profscope(Release); + for (u32 i = 0; i < num_fenced_releases; ++i) { + struct fenced_release_data *fr = &fenced_releases[i]; + switch (fr->kind) { + default: + { + /* Unknown handle type */ + ASSERT(false); + } break; + + case FENCED_RELEASE_KIND_RESOURCE: + { + struct dx12_resource *resource = (struct dx12_resource *)fr->ptr; + dx12_resource_release_now(resource); + } break; + + case FENCED_RELEASE_KIND_PIPELINE: + { + struct pipeline *pipeline = (struct pipeline *)fr->ptr; + pipeline_release_now(pipeline); + } break; + } + } + } + } + arena_temp_end(temp); + { + __profscope(Sleep); + WaitForSingleObject(G.evictor_thread_wake_event, INFINITE); + shutdown = atomic_i32_eval(&G.evictor_thread_shutdown); + } + } + + /* Release event */ + CloseHandle(event); + + scratch_end(scratch); +} + @@ -2879,8 +3079,6 @@ void gp_present(struct sys_window *window, struct v2i32 backresolution, struct g - -