gpu evictor thread

This commit is contained in:
jacob 2025-06-26 00:47:12 -05:00
parent 130b90bf7a
commit af4391300c

View File

@ -57,6 +57,21 @@
#define DX12_NUM_RTV_DESCRIPTORS (1024 * 1)
#define DX12_COMMAND_BUFFER_MIN_SIZE (1024 * 64)
#define DX12_MULTI_QUEUE 1
#if DX12_MULTI_QUEUE
# define DX12_QUEUE_DIRECT 0
# define DX12_QUEUE_COMPUTE 1
# define DX12_QUEUE_COPY_CRITICAL 2
# define DX12_QUEUE_COPY_BACKGROUND 3
# define DX12_NUM_QUEUES 4
#else
# define DX12_QUEUE_DIRECT 0
# define DX12_QUEUE_COMPUTE 0
# define DX12_QUEUE_COPY_CRITICAL 0
# define DX12_QUEUE_COPY_BACKGROUND 0
# define DX12_NUM_QUEUES 1
#endif
#if RTC
# define DX12_DEBUG 1
# define DX12_SHADER_DEBUG 1
@ -200,8 +215,8 @@ struct descriptor {
};
struct dx12_resource {
ID3D12Resource *resource;
enum D3D12_RESOURCE_STATES state;
ID3D12Resource *resource;
struct descriptor *cbv_descriptor;
struct descriptor *srv_descriptor;
struct descriptor *uav_descriptor;
@ -242,6 +257,17 @@ struct cpu_descriptor_heap {
struct D3D12_CPU_DESCRIPTOR_HANDLE handle;
};
enum fenced_release_kind {
FENCED_RELEASE_KIND_NONE,
FENCED_RELEASE_KIND_RESOURCE,
FENCED_RELEASE_KIND_PIPELINE
};
struct fenced_release_data {
enum fenced_release_kind kind;
void *ptr;
};
enum handle_kind {
DX12_HANDLE_KIND_NONE,
DX12_HANDLE_KIND_RESOURCE,
@ -293,6 +319,11 @@ GLOBAL struct {
struct dict *top_successful_pipelines; /* Latest pipelines that successfully compiled */
struct pipeline_scope *first_free_pipeline_scope;
/* Fenced release queue */
struct sys_mutex *fenced_releases_mutex;
struct arena *fenced_releases_arena;
u64 fenced_release_targets[DX12_NUM_QUEUES];
/* Factory */
IDXGIFactory6 *factory;
@ -311,16 +342,17 @@ GLOBAL struct {
struct cpu_descriptor_heap *rtv_heap;
/* Command queues */
/* TODO: Add optional mode to route everything to direct queue */
struct sys_mutex *global_command_list_record_mutex;
struct sys_mutex *global_command_list_submit_mutex;
struct command_queue *cq_direct;
struct command_queue *cq_compute;
struct command_queue *cq_copy_critical;
struct command_queue *cq_copy_background;
struct sys_mutex *global_submit_mutex;
struct command_queue *command_queues[DX12_NUM_QUEUES];
/* Swapchain */
struct swapchain swapchain;
/* Evictor thread */
struct atomic_i32 evictor_thread_shutdown;
HANDLE evictor_thread_wake_event;
struct sys_thread *evictor_thread;
} G = ZI, DEBUG_ALIAS(G, G_gp_dx12);
/* ========================== *
@ -334,7 +366,8 @@ INTERNAL void dx12_init_pipelines(void);
INTERNAL struct cpu_descriptor_heap *cpu_descriptor_heap_alloc(enum D3D12_DESCRIPTOR_HEAP_TYPE type);
INTERNAL struct command_queue *command_queue_alloc(enum D3D12_COMMAND_LIST_TYPE type, enum D3D12_COMMAND_QUEUE_PRIORITY priority, struct string dbg_name);
INTERNAL void command_queue_release(struct command_queue *cq);
INTERNAL void dx12_resource_release(struct dx12_resource *resource);
INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(evictor_thread_entry_point, arg);
INTERNAL void fenced_release(void *data, enum fenced_release_kind kind);
#if RESOURCE_RELOADING
INTERNAL RESOURCE_WATCH_CALLBACK_FUNC_DEF(pipeline_resource_watch_callback, name);
@ -368,6 +401,10 @@ struct gp_startup_receipt gp_startup(struct work_startup_receipt *work_sr)
G.top_pipelines = dict_init(G.pipelines_arena, 1024);
G.top_successful_pipelines = dict_init(G.pipelines_arena, 1024);
/* Initialize fenced releases queue */
G.fenced_releases_mutex = sys_mutex_alloc();
G.fenced_releases_arena = arena_alloc(GIGABYTE(64));
/* Initialize dx12 */
dx12_init_device();
dx12_init_objects();
@ -379,6 +416,10 @@ struct gp_startup_receipt gp_startup(struct work_startup_receipt *work_sr)
#endif
app_register_exit_callback(gp_shutdown);
/* Start evictor thread */
G.evictor_thread_wake_event = CreateEvent(NULL, false, false, NULL);
G.evictor_thread = sys_thread_alloc(evictor_thread_entry_point, NULL, LIT("[P2] GPU evictor thread"));
struct gp_startup_receipt res = ZI;
return res;
}
@ -386,25 +427,27 @@ struct gp_startup_receipt gp_startup(struct work_startup_receipt *work_sr)
INTERNAL APP_EXIT_CALLBACK_FUNC_DEF(gp_shutdown)
{
__prof;
#if DX12_DEBUG
#if 0
/* Release objects to make live object reporting less noisy */
//IDXGISwapChain3_Release(G.swapchain);
command_queue_release(G.cq_copy_background);
command_queue_release(G.cq_copy_critical);
command_queue_release(G.cq_compute);
command_queue_release(G.cq_direct);
for (u32 i = 0; i < ARRAY_COUNT(G.command_queues); ++i) {
struct command_queue *cq = G.command_queues[i];
command_queue_release(cq);
}
ID3D12Device_Release(G.device);
#else
(UNUSED)command_queue_release;
#endif
atomic_i32_eval_exchange(&G.evictor_thread_shutdown, 1);
SetEvent(G.evictor_thread_wake_event);
sys_thread_wait_release(G.evictor_thread);
}
/* ========================== *
* Handle
* ========================== */
INTERNAL void dx12_resource_release(struct dx12_resource *t);
INTERNAL struct gp_handle handle_alloc(enum handle_kind kind, void *data)
{
u64 old_gen = 0;
@ -502,7 +545,7 @@ void gp_release(struct gp_handle handle)
case DX12_HANDLE_KIND_RESOURCE:
{
dx12_resource_release(data);
fenced_release(data, FENCED_RELEASE_KIND_RESOURCE);
} break;
}
}
@ -684,11 +727,18 @@ INTERNAL void dx12_init_objects(void)
/* Create command queues */
G.global_command_list_record_mutex = sys_mutex_alloc();
G.global_command_list_submit_mutex = sys_mutex_alloc();
G.cq_direct = command_queue_alloc(D3D12_COMMAND_LIST_TYPE_DIRECT, D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, LIT("Direct queue"));
G.cq_compute = command_queue_alloc(D3D12_COMMAND_LIST_TYPE_COMPUTE, D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, LIT("Compute queue"));
G.cq_copy_critical = command_queue_alloc(D3D12_COMMAND_LIST_TYPE_COPY, D3D12_COMMAND_QUEUE_PRIORITY_HIGH, LIT("High priority copy queue"));
G.cq_copy_background = command_queue_alloc(D3D12_COMMAND_LIST_TYPE_COPY, D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, LIT("Background copy queue"));
G.global_submit_mutex = sys_mutex_alloc();
for (u32 i = 0; i < DX12_NUM_QUEUES; ++i) {
if (i == DX12_QUEUE_DIRECT) {
G.command_queues[i] = command_queue_alloc(D3D12_COMMAND_LIST_TYPE_DIRECT, D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, LIT("Direct queue"));
} else if (i == DX12_QUEUE_COMPUTE) {
G.command_queues[i] = command_queue_alloc(D3D12_COMMAND_LIST_TYPE_COMPUTE, D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, LIT("Compute queue"));
} else if (i == DX12_QUEUE_COPY_CRITICAL) {
G.command_queues[i] = command_queue_alloc(D3D12_COMMAND_LIST_TYPE_COPY, D3D12_COMMAND_QUEUE_PRIORITY_HIGH, LIT("High priority copy queue"));
} else if (i == DX12_QUEUE_COPY_BACKGROUND) {
G.command_queues[i] = command_queue_alloc(D3D12_COMMAND_LIST_TYPE_COPY, D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, LIT("Background copy queue"));
}
}
}
/* ========================== *
@ -696,7 +746,6 @@ INTERNAL void dx12_init_objects(void)
* ========================== */
INTERNAL void pipeline_alloc_from_desc(u64 num_pipelines, struct pipeline_desc *descs, struct pipeline **pipelines_out);
INTERNAL void pipeline_release(struct pipeline *pipeline);
INTERNAL void pipeline_register(u64 num_pipelines, struct pipeline **pipelines);
INTERNAL void dx12_init_pipelines(void)
@ -1205,17 +1254,13 @@ INTERNAL void pipeline_alloc_from_desc(u64 num_pipelines, struct pipeline_desc *
work_wait(work);
}
INTERNAL void pipeline_release(struct pipeline *pipeline)
INTERNAL void pipeline_release_now(struct pipeline *pipeline)
{
__prof;
/* FIXME: Delayed release based on queue fence */
(UNUSED)pipeline;
#if 0
if (pipeline->pso) {
ID3D12PipelineState_Release(pipeline->pso);
}
arena_release(pipeline->arena);
#endif
}
/* ========================== *
@ -1255,7 +1300,7 @@ INTERNAL void pipeline_scope_end(struct pipeline_scope *scope)
for (struct dict_entry *entry = scope->refs->first; entry; entry = entry->next) {
struct pipeline *pipeline = (struct pipeline *)entry->value;
if (--pipeline->refcount <= 0) {
pipeline_release(pipeline);
fenced_release(pipeline, FENCED_RELEASE_KIND_PIPELINE);
}
}
scope->next_free = G.first_free_pipeline_scope;
@ -1304,7 +1349,7 @@ INTERNAL void pipeline_register(u64 num_pipelines, struct pipeline **pipelines)
{
struct pipeline *old_pipeline = (struct pipeline *)dict_get(G.top_pipelines, hash);
if (old_pipeline && --old_pipeline->refcount <= 0) {
pipeline_release(old_pipeline);
fenced_release(old_pipeline, FENCED_RELEASE_KIND_PIPELINE);
}
dict_set(G.pipelines_arena, G.top_pipelines, hash, (u64)pipeline);
++pipeline->refcount;
@ -1313,7 +1358,7 @@ INTERNAL void pipeline_register(u64 num_pipelines, struct pipeline **pipelines)
if (pipeline->success) {
struct pipeline *old_pipeline = (struct pipeline *)dict_get(G.top_successful_pipelines, hash);
if (old_pipeline && --old_pipeline->refcount <= 0) {
pipeline_release(old_pipeline);
fenced_release(old_pipeline, FENCED_RELEASE_KIND_PIPELINE);
}
dict_set(G.pipelines_arena, G.top_successful_pipelines, hash, (u64)pipeline);
++pipeline->refcount;
@ -1614,6 +1659,38 @@ i32 gp_push_cmd(struct gp_flow *gp_flow, struct gp_cmd_desc *cmd_desc)
return ret;
}
/* ========================== *
* Fenced release
* ========================== */
/* Defer releasing a GPU object until all work currently submitted to every
 * command queue has retired on the GPU.  Snapshots each queue's
 * submit_fence_target, enqueues a {kind, ptr} record on the fenced-release
 * queue, then wakes the evictor thread, which performs the actual release
 * (dx12_resource_release_now / pipeline_release_now) once the fences pass
 * the snapshot.
 *
 * data: object to release, interpreted per `kind`
 *       (struct dx12_resource * or struct pipeline *).
 * kind: which release path the evictor should take. */
INTERNAL void fenced_release(void *data, enum fenced_release_kind kind)
{
struct fenced_release_data fr = ZI;
fr.kind = kind;
fr.ptr = data;
u64 fr_targets[ARRAY_COUNT(G.fenced_release_targets)] = ZI;
/* Read fence values */
/* Each queue's target is read under its own submit_fence_mutex so we never
 * see a torn value while command_list_close is incrementing it. */
for (u32 i = 0; i < ARRAY_COUNT(G.command_queues); ++i) {
struct command_queue *cq = G.command_queues[i];
struct sys_lock lock = sys_mutex_lock_s(cq->submit_fence_mutex);
fr_targets[i] = cq->submit_fence_target;
sys_mutex_unlock(&lock);
}
/* Push data to release queue */
{
struct sys_lock lock = sys_mutex_lock_e(G.fenced_releases_mutex);
*arena_push(G.fenced_releases_arena, struct fenced_release_data) = fr;
/* Overwrites the shared per-queue targets, so every already-queued release
 * now waits for this (latest) snapshot as well: conservative — releases may
 * happen later than strictly necessary, but never too early. */
MEMCPY(G.fenced_release_targets, fr_targets, sizeof(fr_targets));
sys_mutex_unlock(&lock);
}
/* Wake evictor */
SetEvent(G.evictor_thread_wake_event);
}
/* ========================== *
* Resource
* ========================== */
@ -1681,11 +1758,33 @@ INTERNAL struct dx12_resource *dx12_resource_alloc(D3D12_HEAP_PROPERTIES heap_pr
return r;
}
INTERNAL void dx12_resource_release(struct dx12_resource *t)
INTERNAL void dx12_resource_release_now(struct dx12_resource *t)
{
__prof;
/* TODO */
(UNUSED)t;
/* Release descriptors */
/* TODO: Batch lock heaps */
if (t->cbv_descriptor) {
descriptor_release(t->cbv_descriptor);
}
if (t->srv_descriptor) {
descriptor_release(t->srv_descriptor);
}
if (t->uav_descriptor) {
descriptor_release(t->uav_descriptor);
}
if (t->rtv_descriptor) {
descriptor_release(t->rtv_descriptor);
}
/* Release resource */
ID3D12Resource_Release(t->resource);
/* Add to free list */
struct sys_lock lock = sys_mutex_lock_e(G.resources_mutex);
t->next_free = G.first_free_resource;
G.first_free_resource = t;
sys_mutex_unlock(&lock);
}
INTERNAL enum D3D12_RESOURCE_STATES dx12_resource_barrier(ID3D12GraphicsCommandList *cl, struct dx12_resource *resource, enum D3D12_RESOURCE_STATES state)
@ -1755,7 +1854,7 @@ INTERNAL void command_queue_release(struct command_queue *cq)
__prof;
/* TODO */
(UNUSED)cq;
//ID3D12CommandQueue_Release(G.cq_copy_background->cq);
//ID3D12CommandQueue_Release(cq->cq);
}
/* ========================== *
@ -1875,7 +1974,7 @@ INTERNAL u64 command_list_close(struct command_list *cl)
u64 submit_fence_target = 0;
{
__profscope(Execute);
struct sys_lock global_lock = sys_mutex_lock_s(G.global_command_list_submit_mutex);
struct sys_lock submit_lock = sys_mutex_lock_s(G.global_submit_mutex);
struct sys_lock fence_lock = sys_mutex_lock_e(cq->submit_fence_mutex);
{
submit_fence_target = ++cq->submit_fence_target;
@ -1883,7 +1982,7 @@ INTERNAL u64 command_list_close(struct command_list *cl)
ID3D12CommandQueue_Signal(cq->cq, cq->submit_fence, submit_fence_target);
}
sys_mutex_unlock(&fence_lock);
sys_mutex_unlock(&global_lock);
sys_mutex_unlock(&submit_lock);
}
/* Add descriptor heaps to submitted list */
@ -2333,7 +2432,7 @@ struct gp_handle gp_texture_alloc(enum gp_texture_format format, u32 flags, stru
}
/* Copy from upload heap to texture */
struct command_queue *cq = G.cq_copy_background;
struct command_queue *cq = G.command_queues[DX12_QUEUE_COPY_BACKGROUND];
struct command_list *cl = command_list_open(cq->cl_pool);
{
__profscope_dx12(cl->cq->prof, cl->cl, Upload texture, RGB32_F(0.2, 0.5, 0.2));
@ -2359,7 +2458,7 @@ struct gp_handle gp_texture_alloc(enum gp_texture_format format, u32 flags, stru
/* TODO: Return async waitable to caller */
{
__profscope(Wait for upload);
HANDLE event = CreateEvent(NULL, FALSE, FALSE, NULL);
HANDLE event = CreateEvent(NULL, false, false, NULL);
ID3D12Fence_SetEventOnCompletion(cq->submit_fence, fence_target, event);
WaitForSingleObject(event, INFINITE);
CloseHandle(event);
@ -2401,7 +2500,8 @@ void gp_dispatch(struct gp_dispatch_params params)
struct pipeline_scope *pipeline_scope = pipeline_scope_begin();
struct pipeline *material_pipeline = pipeline_from_name(pipeline_scope, LIT("material"));
struct pipeline *shape_pipeline = pipeline_from_name(pipeline_scope, LIT("shape"));
struct command_list *cl = command_list_open(G.cq_direct->cl_pool);
struct command_queue *cq = G.command_queues[DX12_QUEUE_DIRECT];
struct command_list *cl = command_list_open(cq->cl_pool);
{
__profscope_dx12(cl->cq->prof, cl->cl, Dispatch, RGB32_F(0.5, 0.2, 0.2));
struct dx12_resource *target = handle_get_data(params.draw_target, DX12_HANDLE_KIND_RESOURCE);
@ -2616,21 +2716,21 @@ INTERNAL struct swapchain_buffer *update_swapchain(struct swapchain *swapchain,
b32 should_rebuild = !v2i32_eq(swapchain->resolution, resolution);
if (should_rebuild) {
HRESULT hr = 0;
struct command_queue *cq = G.cq_direct;
struct command_queue *cq = G.command_queues[DX12_QUEUE_DIRECT];
HWND hwnd = (HWND)sys_window_get_internal_handle(window);
if (swapchain->swapchain) {
ASSERT(hwnd == swapchain->hwnd);
/* Lock direct queue submissions (in case any write to backbuffer) */
/* TODO: Less overkill approach - Only flush present_blit since we know it's the only operation targeting backbuffer */
//struct sys_lock lock = sys_mutex_lock_e(cq->submit_fence_mutex);
DEBUGBREAKABLE;
struct sys_lock lock = sys_mutex_lock_e(G.global_command_list_record_mutex);
struct sys_lock lock = sys_mutex_lock_e(cq->submit_fence_mutex);
//DEBUGBREAKABLE;
//struct sys_lock lock = sys_mutex_lock_e(G.global_command_list_record_mutex);
{
/* Flush direct queue */
//ID3D12CommandQueue_Signal(cq->cq, cq->submit_fence, ++cq->submit_fence_target);
{
HANDLE event = CreateEvent(NULL, FALSE, FALSE, NULL);
HANDLE event = CreateEvent(NULL, false, false, NULL);
ID3D12Fence_SetEventOnCompletion(cq->submit_fence, cq->submit_fence_target, event);
WaitForSingleObject(event, INFINITE);
CloseHandle(event);
@ -2720,7 +2820,8 @@ INTERNAL void present_blit(struct swapchain_buffer *dst, struct dx12_resource *s
struct pipeline_scope *pipeline_scope = pipeline_scope_begin();
struct pipeline *blit_pipeline = pipeline_from_name(pipeline_scope, LIT("blit"));
if (blit_pipeline->success) {
struct command_list *cl = command_list_open(G.cq_direct->cl_pool);
struct command_queue *cq = G.command_queues[DX12_QUEUE_DIRECT];
struct command_list *cl = command_list_open(cq->cl_pool);
{
__profscope_dx12(cl->cq->prof, cl->cl, Blit, RGB32_F(0.5, 0.2, 0.2));
struct swapchain *swapchain = dst->swapchain;
@ -2838,18 +2939,18 @@ void gp_present(struct sys_window *window, struct v2i32 backresolution, struct g
__profscope(Mark queue frames);
/* Lock because frame marks shouldn't occur while command lists are recording */
struct sys_lock lock = sys_mutex_lock_e(G.global_command_list_record_mutex);
__prof_dx12_new_frame(G.cq_direct->prof);
__prof_dx12_new_frame(G.cq_compute->prof);
__prof_dx12_new_frame(G.cq_copy_critical->prof);
__prof_dx12_new_frame(G.cq_copy_background->prof);
for (u32 i = 0; i < ARRAY_COUNT(G.command_queues); ++i) {
struct command_queue *cq = G.command_queues[i];
__prof_dx12_new_frame(cq->prof);
}
sys_mutex_unlock(&lock);
}
{
__profscope(Collect queues);
__prof_dx12_collect(G.cq_direct->prof);
__prof_dx12_collect(G.cq_compute->prof);
__prof_dx12_collect(G.cq_copy_critical->prof);
__prof_dx12_collect(G.cq_copy_background->prof);
for (u32 i = 0; i < ARRAY_COUNT(G.command_queues); ++i) {
struct command_queue *cq = G.command_queues[i];
__prof_dx12_collect(cq->prof);
}
}
#endif
@ -2860,7 +2961,104 @@ void gp_present(struct sys_window *window, struct v2i32 backresolution, struct g
(UNUSED)vsync;
}
/* ========================== *
* Evictor thread
* ========================== */
/* Background GPU evictor thread.  Repeats: drain the fenced-release queue
 * under G.fenced_releases_mutex, block until each command queue's submit
 * fence reaches the snapshotted target, then free the drained resources and
 * pipelines.  Woken by fenced_release() via G.evictor_thread_wake_event
 * (auto-reset: CreateEvent with bManualReset=false); shut down by
 * gp_shutdown setting G.evictor_thread_shutdown and signaling the same
 * event.  Records still queued when shutdown is requested are deliberately
 * not processed (the process is exiting). */
INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(evictor_thread_entry_point, arg)
{
__prof;
(UNUSED)arg;
struct arena_temp scratch = scratch_begin_no_conflict();
/* Private auto-reset event used for ID3D12Fence_SetEventOnCompletion waits. */
HANDLE event = CreateEvent(NULL, false, false, NULL);
/* Wait set: [0] external wake/shutdown, [1] fence completion. */
HANDLE events[2] = ZI;
events[0] = G.evictor_thread_wake_event;
events[1] = event;
/* Cached last-seen completed value per queue, to skip redundant waits. */
u64 completed_targets[DX12_NUM_QUEUES] = ZI;
b32 shutdown = atomic_i32_eval(&G.evictor_thread_shutdown);
while (!shutdown) {
struct arena_temp temp = arena_temp_begin(scratch.arena);
{
__profscope(Run);
u64 targets[ARRAY_COUNT(completed_targets)] = ZI;
/* Copy queued data */
u32 num_fenced_releases = 0;
struct fenced_release_data *fenced_releases = NULL;
{
__profscope(Copy queued releases);
struct sys_lock lock = sys_mutex_lock_e(G.fenced_releases_mutex);
/* The arena holds only tightly packed fenced_release_data records
 * (pushed by fenced_release), so pos / sizeof gives the count. */
num_fenced_releases = G.fenced_releases_arena->pos / sizeof(struct fenced_release_data);
fenced_releases = arena_push_array_no_zero(temp.arena, struct fenced_release_data, num_fenced_releases);
MEMCPY(fenced_releases, arena_base(G.fenced_releases_arena), G.fenced_releases_arena->pos);
arena_reset(G.fenced_releases_arena);
MEMCPY(targets, G.fenced_release_targets, sizeof(targets));
sys_mutex_unlock(&lock);
}
/* Wait until fences reach target */
{
__profscope(Check fences);
for (u32 i = 0; i < ARRAY_COUNT(targets) && !shutdown; ++i) {
while (completed_targets[i] < targets[i] && !shutdown) {
struct command_queue *cq = G.command_queues[i];
completed_targets[i] = ID3D12Fence_GetCompletedValue(cq->submit_fence);
if (completed_targets[i] < targets[i]) {
ID3D12Fence_SetEventOnCompletion(cq->submit_fence, targets[i], event);
{
__profscope(Wait on fence);
/* Wakes on either fence completion or an external wake/shutdown
 * signal; re-check shutdown so teardown can't hang on a fence. */
WaitForMultipleObjects(2, events, false, INFINITE);
shutdown = atomic_i32_eval(&G.evictor_thread_shutdown);
}
}
}
}
}
/* Process releases */
if (!shutdown) {
__profscope(Release);
for (u32 i = 0; i < num_fenced_releases; ++i) {
struct fenced_release_data *fr = &fenced_releases[i];
switch (fr->kind) {
default:
{
/* Unknown handle type */
ASSERT(false);
} break;
case FENCED_RELEASE_KIND_RESOURCE:
{
struct dx12_resource *resource = (struct dx12_resource *)fr->ptr;
dx12_resource_release_now(resource);
} break;
case FENCED_RELEASE_KIND_PIPELINE:
{
struct pipeline *pipeline = (struct pipeline *)fr->ptr;
pipeline_release_now(pipeline);
} break;
}
}
}
}
arena_temp_end(temp);
{
__profscope(Sleep);
/* NOTE(review): the inner WaitForMultipleObjects above consumes the
 * auto-reset wake event, so releases enqueued DURING the fence-wait phase
 * may sit until the NEXT fenced_release() call signals the event again —
 * a latency window, not a deadlock (shutdown also signals it).  Confirm
 * this latency is acceptable. */
WaitForSingleObject(G.evictor_thread_wake_event, INFINITE);
shutdown = atomic_i32_eval(&G.evictor_thread_shutdown);
}
}
/* Release event */
CloseHandle(event);
scratch_end(scratch);
}