From 74609cdb3ceee6792d88ae77eb91ff48d86f0a67 Mon Sep 17 00:00:00 2001 From: jacob Date: Sat, 7 Jun 2025 20:20:21 -0500 Subject: [PATCH] shader PSO creation --- res/shaders/grid.hlsl | 10 +- res/shaders/mesh.hlsl | 4 +- res/shaders/test.hlsl | 10 +- res/shaders/texture.hlsl | 59 +++++---- src/app.c | 9 +- src/gpu_dx12.c | 270 +++++++++++++++++++++++++-------------- src/work.c | 6 +- 7 files changed, 229 insertions(+), 139 deletions(-) diff --git a/res/shaders/grid.hlsl b/res/shaders/grid.hlsl index 19ef928c..6106473b 100644 --- a/res/shaders/grid.hlsl +++ b/res/shaders/grid.hlsl @@ -28,12 +28,12 @@ struct ps_input { * Globals * ========================== */ -StructuredBuffer G_instance_buffer : register(t0); +StructuredBuffer g_instance_buffer : register(t0); cbuffer constants : register(b0) { - float4x4 G_projection; - uint G_instance_offset; + float4x4 g_projection; + uint g_instance_offset; }; /* ========================== * @@ -49,12 +49,12 @@ static const float2 G_quad_verts[4] = { ps_input vs_main(uint instance_id : SV_InstanceID, uint vertex_id : SV_VertexID) { - vs_instance instance = G_instance_buffer[G_instance_offset + instance_id]; + vs_instance instance = g_instance_buffer[g_instance_offset + instance_id]; float2 vert = G_quad_verts[vertex_id]; float2 world_pos = mul(instance.xf, float3(vert, 1)).xy; ps_input output; - output.screen_pos = mul(G_projection, float4(world_pos, 0, 1)); + output.screen_pos = mul(g_projection, float4(world_pos, 0, 1)); output.line_thickness = instance.line_thickness; output.line_spacing = instance.line_spacing; output.offset = instance.offset; diff --git a/res/shaders/mesh.hlsl b/res/shaders/mesh.hlsl index d583837f..20ec6461 100644 --- a/res/shaders/mesh.hlsl +++ b/res/shaders/mesh.hlsl @@ -16,7 +16,7 @@ struct ps_input { cbuffer constants : register(b0) { - float4x4 G_projection; + float4x4 g_projection; }; /* ========================== * @@ -26,7 +26,7 @@ cbuffer constants : register(b0) ps_input vs_main(vs_input input) { ps_input output; - output.screen_pos = mul(G_projection, float4(input.pos.xy, 0.f, 1.f)); + output.screen_pos = mul(g_projection, float4(input.pos.xy, 0.f, 1.f)); output.color_lin = linear_from_srgb(input.color_srgb); return output; diff --git a/res/shaders/test.hlsl b/res/shaders/test.hlsl index da522318..076735ca 100644 --- a/res/shaders/test.hlsl +++ b/res/shaders/test.hlsl @@ -12,12 +12,12 @@ struct ps_input { * Globals * ========================== */ -StructuredBuffer G_instance_buffer : register(t0); +StructuredBuffer g_instance_buffer : register(t0); cbuffer constants : register(b0) { - float4x4 G_projection; - uint G_instance_offset; + float4x4 g_projection; + uint g_instance_offset; }; /* ========================== * @@ -33,11 +33,11 @@ static const float2 G_quad_verts[4] = { ps_input vs_main(uint instance_id : SV_InstanceID, uint vertex_id : SV_VertexID) { - vs_instance instance = G_instance_buffer[G_instance_offset + instance_id]; + vs_instance instance = g_instance_buffer[g_instance_offset + instance_id]; float2 vert = G_quad_verts[vertex_id]; float2 world_pos = mul(instance.xf, float3(vert, 1)).xy; ps_input output; - output.screen_pos = mul(G_projection, float4(world_pos, 0, 1)); + output.screen_pos = mul(g_projection, float4(world_pos, 0, 1)); return output; } diff --git a/res/shaders/texture.hlsl b/res/shaders/texture.hlsl index 18e0c717..7f152dbb 100644 --- a/res/shaders/texture.hlsl +++ b/res/shaders/texture.hlsl @@ -1,6 +1,6 @@ #include "shaders/common.hlsl" -struct vs_instance { +struct instance { float2x3 xf; float2 uv0; float2 uv1; @@ -8,55 +8,65 @@ struct vs_instance { float emittance; }; -struct ps_input { - DESV(float4, screen_pos, SV_POSITION); - DECL(float2, uv); - DECL(float4, tint_lin); +struct constants { + float4x4 projection; + uint instance_offset; }; /* ========================== * - * Globals + * Root Signature * ========================== */ -StructuredBuffer G_instance_buffer : register(t0); +#define SIG \ + "CBV(b0), " \ + "DescriptorTable(SRV(t0), SRV(t1)), " \ + "DescriptorTable(Sampler(s0))" -Texture2D G_texture : register(t1); - -SamplerState G_sampler : register(s0); - -cbuffer constants : register(b0) +cbuffer c : register(b0) { - float4x4 G_projection; - uint G_instance_offset; + struct constants g_constants; }; +StructuredBuffer g_instance_buffer : register(t0); + +Texture2D g_texture : register(t1); + +SamplerState g_sampler : register(s0); + /* ========================== * * Vertex shader * ========================== */ -static const float2 G_quad_verts[4] = { +static const float2 g_quad_verts[4] = { float2(-0.5f, -0.5f), float2( 0.5f, -0.5f), float2( 0.5f, 0.5f), float2(-0.5f, 0.5f) }; -static const int2 G_uv_factors[4] = { +static const int2 g_uv_factors[4] = { int2(0, 0), int2(1, 0), int2(1, 1), int2(0, 1) }; -ps_input vs_main(uint instance_id : SV_InstanceID, uint vertex_id : SV_VertexID) +struct vs_output { + DESV(float4, screen_pos, SV_POSITION); + DECL(float2, uv); + DECL(float4, tint_lin); +}; + +[RootSignature(SIG)] +vs_output vs_main(uint instance_id : SV_InstanceID, uint vertex_id : SV_VertexID) { - vs_instance instance = G_instance_buffer[G_instance_offset + instance_id]; - float2 vert = G_quad_verts[vertex_id]; - float2 uv_factor = G_uv_factors[vertex_id]; + instance instance = g_instance_buffer[g_constants.instance_offset + instance_id]; + float2 vert = g_quad_verts[vertex_id]; + float2 uv_factor = g_uv_factors[vertex_id]; float2 world_pos = mul(instance.xf, float3(vert, 1)).xy; - ps_input output; - output.screen_pos = mul(G_projection, float4(world_pos, 0, 1)); + vs_output output; + output.screen_pos = mul(g_constants.projection, float4(world_pos, 0, 1)); output.uv = instance.uv0 + uv_factor * (instance.uv1 - instance.uv0); output.tint_lin = linear_from_srgb32(instance.tint_srgb); @@ -67,8 +77,9 @@ ps_input vs_main(uint instance_id : SV_InstanceID, uint vertex_id : SV_VertexID) * Pixel shader * ========================== */ -float4 ps_main(ps_input input) : SV_TARGET +[RootSignature(SIG)] +float4 ps_main(vs_output input) : SV_TARGET { - float4 color = G_texture.Sample(G_sampler, input.uv) * input.tint_lin; + float4 color = g_texture.Sample(g_sampler, input.uv) * input.tint_lin; return color; } diff --git a/src/app.c b/src/app.c index f5a2d46e..68f00de0 100644 --- a/src/app.c +++ b/src/app.c @@ -248,13 +248,16 @@ void app_entry_point(struct string args_str) /* Ideally these layers should have cores "reserved" for them * 1. User thread * 2. Sim thread - * 3. Audio mixing/playback thread + * 3. Audio mixing / playback thread + * 4. Networking thread */ - i32 num_reserved_cores = 3; + i32 num_reserved_cores = 4; + i32 num_logical_cores = (i32)sys_num_logical_processors(); + //num_logical_cores = min(num_logical_cores, 8) + (max(num_logical_cores - 8, 0) / 2); /* Dumb heuristic to try and lessen e-core usage */ i32 min_worker_count = 2; i32 max_worker_count = 128; - i32 target_worker_count = (i32)sys_num_logical_processors() - num_reserved_cores; + i32 target_worker_count = num_logical_cores - num_reserved_cores; worker_count = (u32)clamp_i32(target_worker_count, min_worker_count, max_worker_count); #endif } diff --git a/src/gpu_dx12.c b/src/gpu_dx12.c index babb3ff7..4ea61eb9 100644 --- a/src/gpu_dx12.c +++ b/src/gpu_dx12.c @@ -56,6 +56,7 @@ struct dx12_shader_desc { struct dx12_shader { struct dx12_shader_desc desc; + ID3D12PipelineState *pso; }; struct dx12_shader_result { @@ -162,6 +163,90 @@ INTERNAL APP_EXIT_CALLBACK_FUNC_DEF(gpu_shutdown) #endif } +/* ========================== * + * Handle + * ========================== */ + +INTERNAL void dx12_texture_release(struct dx12_texture *t); + +INTERNAL struct gpu_handle handle_alloc(enum dx12_handle_kind kind, void *data) +{ + u64 old_gen = 0; + u64 idx = 0; + struct dx12_handle_entry *entry = NULL; + { + struct sys_lock lock = sys_mutex_lock_e(&G.handle_entries_mutex); + if (G.first_free_handle_entry) { + entry = G.first_free_handle_entry; + G.first_free_handle_entry = entry->next_free; + old_gen = entry->gen; + idx = entry->idx; + } else { + entry = arena_push_no_zero(&G.handle_entries_arena, struct dx12_handle_entry); + idx = G.num_handle_entries_reserved++; + } + sys_mutex_unlock(&lock); + } + MEMZERO_STRUCT(entry); + entry->kind = kind; + entry->gen = old_gen + 1; + entry->idx = idx; + entry->data = data; + + struct gpu_handle res = ZI; + res.gen = entry->gen; + res.idx = entry->idx; + return res; +} + +INTERNAL struct dx12_handle_entry *handle_get_entry(struct gpu_handle handle, struct sys_lock *lock) +{ + sys_assert_locked_e_or_s(lock, &G.handle_entries_mutex); + struct dx12_handle_entry *res = NULL; + if (handle.idx > 0 && handle.idx < G.num_handle_entries_reserved) { + struct dx12_handle_entry *tmp = &((struct dx12_handle_entry *)G.handle_entries_arena.base)[handle.idx]; + if (tmp->gen == handle.gen) { + res = tmp; + } + } + return res; +} + +/* TODO: The GPU api should ensure that resources freed by the caller will not cause issues on the GPU (via fencing), + * however the caller is responsible for managing resource lifetimes on the CPU side (e.g. using sprites w/ sprite scopes + * to ensure freed textures aren't being used in pending command lists. */ +void gpu_release(struct gpu_handle handle) +{ + enum dx12_handle_kind kind = NULL; + void *data = NULL; + + /* Release handle entry */ + struct sys_lock lock = sys_mutex_lock_e(&G.handle_entries_mutex); + { + struct dx12_handle_entry *entry = handle_get_entry(handle, &lock); + if (entry) { + kind = entry->kind; + data = entry->data; + } + ++entry->gen; + entry->next_free = G.first_free_handle_entry; + G.first_free_handle_entry = entry; + } + sys_mutex_unlock(&lock); + + /* Release data */ + if (data) { + switch (kind) { + default: break; + + case DX12_HANDLE_KIND_TEXTURE: + { + dx12_texture_release(data); + } break; + } + } +} + /* ========================== * * Dx12 base initialization * ========================== */ @@ -433,7 +518,7 @@ PACK(struct dx12_texture_shader_instance { /* Init shaders */ INTERNAL struct dx12_shader_result *shader_alloc_from_descs(struct arena *arena, u64 num_shaders, struct dx12_shader_desc *descs); -INTERNAL void shader_release(struct dx12_shader *shader); +INTERNAL void dx12_shader_release(struct dx12_shader *shader); INTERNAL void dx12_init_shaders(void) { @@ -442,18 +527,21 @@ INTERNAL void dx12_init_shaders(void) struct dx12_shader_desc shader_descs[] = { /* Texture shader */ { - .name = "shaders/texture.hlsl", - .flags = DX12_SHADER_DESC_FLAG_VS | DX12_SHADER_DESC_FLAG_PS + .name = "shaders/texture.hlsl", + .flags = DX12_SHADER_DESC_FLAG_VS | + DX12_SHADER_DESC_FLAG_PS } }; struct dx12_shader_result *results = shader_alloc_from_descs(scratch.arena, ARRAY_COUNT(shader_descs), shader_descs); for (u64 i = 0; i < ARRAY_COUNT(shader_descs); ++i) { struct dx12_shader_result *result = &results[i]; - if (result->errors_text_len <= 0) { - /* TODO */ + if (result->errors_text_len > 0) { + struct string msg = STRING(result->errors_text_len, result->errors_text); + sys_panic(msg); + dx12_shader_release(&result->shader); } else { - shader_release(&result->shader); + /* TODO */ } } @@ -538,7 +626,8 @@ INTERNAL void dx12_include_handler_release(struct dx12_include_handler *handler) * Shader compilation * ========================== */ - /* TODO: Compile shaders offline w/ dxc */ + /* TODO: Compile shaders offline w/ dxc. + * Will also allow for some hlsl language features like static_assert */ enum shader_compile_task_kind { SHADER_COMPILE_TASK_KIND_VS, @@ -673,7 +762,7 @@ INTERNAL WORK_TASK_FUNC_DEF(shader_load_task, load_arg_raw) struct string shader_name = string_from_cstr_no_limit(desc.name); logf_info("Loading shader '%F'", FMT_STR(shader_name)); struct resource src_res = resource_open(shader_name); - (UNUSED)result; + struct string error_str = LIT("Unknown error"); struct shader_compile_task_arg vs = ZI; vs.kind = SHADER_COMPILE_TASK_KIND_VS; @@ -696,9 +785,77 @@ INTERNAL WORK_TASK_FUNC_DEF(shader_load_task, load_arg_raw) work_wait(work); b32 success = vs.success && ps.success; + + /* FIXME: Validate root signature blob exists in bytecode */ + + /* Create PSO */ + ID3D12PipelineState *pso = NULL; if (success) { - } else { - struct string error_str = LIT("Unknown error"); + /* Default rasterizer state */ + D3D12_RASTERIZER_DESC raster_desc = { + .FillMode = D3D12_FILL_MODE_SOLID, + .CullMode = D3D12_CULL_MODE_BACK, + .FrontCounterClockwise = FALSE, + .DepthBias = D3D12_DEFAULT_DEPTH_BIAS, + .DepthBiasClamp = D3D12_DEFAULT_DEPTH_BIAS_CLAMP, + .SlopeScaledDepthBias = D3D12_DEFAULT_SLOPE_SCALED_DEPTH_BIAS, + .DepthClipEnable = TRUE, + .MultisampleEnable = FALSE, + .AntialiasedLineEnable = FALSE, + .ForcedSampleCount = 0, + .ConservativeRaster = D3D12_CONSERVATIVE_RASTERIZATION_MODE_OFF + }; + + /* No input layout */ + D3D12_INPUT_LAYOUT_DESC input_layout_desc = { + .pInputElementDescs = NULL, + .NumElements = 0 + }; + + /* Opaque blend state */ + D3D12_BLEND_DESC blend_desc = { + .AlphaToCoverageEnable = FALSE, + .IndependentBlendEnable = FALSE + }; + blend_desc.RenderTarget[0].BlendEnable = FALSE; + blend_desc.RenderTarget[0].RenderTargetWriteMask = D3D12_COLOR_WRITE_ENABLE_ALL; + + /* Disable depth stencil */ + D3D12_DEPTH_STENCIL_DESC depth_stencil_desc = { + .DepthEnable = FALSE, + .StencilEnable = FALSE + }; + + /* PSO */ + D3D12_GRAPHICS_PIPELINE_STATE_DESC pso_desc = { 0 }; + pso_desc.pRootSignature = NULL; /* Use embedded root signature */ + if (vs.success) { + pso_desc.VS.pShaderBytecode = ID3D10Blob_GetBufferPointer(vs.blob); + pso_desc.VS.BytecodeLength = ID3D10Blob_GetBufferSize(vs.blob); + } + if (ps.success) { + pso_desc.PS.pShaderBytecode = ID3D10Blob_GetBufferPointer(ps.blob); + pso_desc.PS.BytecodeLength = ID3D10Blob_GetBufferSize(ps.blob); + } + pso_desc.BlendState = blend_desc; + pso_desc.SampleMask = UINT_MAX; + pso_desc.RasterizerState = raster_desc; + pso_desc.DepthStencilState = depth_stencil_desc; + pso_desc.InputLayout = input_layout_desc; + pso_desc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE; + pso_desc.NumRenderTargets = 1; + pso_desc.RTVFormats[0] = DXGI_FORMAT_R8G8B8A8_UNORM; + pso_desc.SampleDesc.Count = 1; + HRESULT hr = ID3D12Device_CreateGraphicsPipelineState(G.device, &pso_desc, &IID_ID3D12PipelineState, (void **)&pso); + if (FAILED(hr)) { + error_str = LIT("Failed to create pipeline state object"); + success = false; + ASSERT(false); + } + } + + /* Copy error */ + if (!success) { ID3D10Blob *error_blob = vs.error_blob ? vs.error_blob : ps.error_blob; if (error_blob) { u64 error_blob_cstr_len = ID3D10Blob_GetBufferSize(error_blob); @@ -712,10 +869,12 @@ INTERNAL WORK_TASK_FUNC_DEF(shader_load_task, load_arg_raw) error_str = error_blob_str; } } - result->errors_text_len = max_u64(error_str.len, ARRAY_COUNT(result->errors_text)); + result->errors_text_len = min_u64(error_str.len, ARRAY_COUNT(result->errors_text)); MEMCPY(result->errors_text, error_str.text, result->errors_text_len); } + shader->pso = pso; + if (vs.blob) { ID3D10Blob_Release(vs.blob); } @@ -761,94 +920,11 @@ INTERNAL struct dx12_shader_result *shader_alloc_from_descs(struct arena *arena, return results; } -INTERNAL void shader_release(struct dx12_shader *shader) +INTERNAL void dx12_shader_release(struct dx12_shader *shader) { __prof; - /* TODO */ - (UNUSED)shader; -} - -/* ========================== * - * Handle - * ========================== */ - -INTERNAL void dx12_texture_release(struct dx12_texture *t); - -INTERNAL struct gpu_handle handle_alloc(enum dx12_handle_kind kind, void *data) -{ - u64 old_gen = 0; - u64 idx = 0; - struct dx12_handle_entry *entry = NULL; - { - struct sys_lock lock = sys_mutex_lock_e(&G.handle_entries_mutex); - if (G.first_free_handle_entry) { - entry = G.first_free_handle_entry; - G.first_free_handle_entry = entry->next_free; - old_gen = entry->gen; - idx = entry->idx; - } else { - entry = arena_push_no_zero(&G.handle_entries_arena, struct dx12_handle_entry); - idx = G.num_handle_entries_reserved++; - } - sys_mutex_unlock(&lock); - } - MEMZERO_STRUCT(entry); - entry->kind = kind; - entry->gen = old_gen + 1; - entry->idx = idx; - entry->data = data; - - struct gpu_handle res = ZI; - res.gen = entry->gen; - res.idx = entry->idx; - return res; -} - -INTERNAL struct dx12_handle_entry *handle_get_entry(struct gpu_handle handle, struct sys_lock *lock) -{ - sys_assert_locked_e_or_s(lock, &G.handle_entries_mutex); - struct dx12_handle_entry *res = NULL; - if (handle.idx > 0 && handle.idx < G.num_handle_entries_reserved) { - struct dx12_handle_entry *tmp = &((struct dx12_handle_entry *)G.handle_entries_arena.base)[handle.idx]; - if (tmp->gen == handle.gen) { - res = tmp; - } - } - return res; -} - -/* TODO: The GPU api should ensure that resources freed by the caller will not cause issues on the GPU (via fencing), - * however the caller is responsible for managing resource lifetimes on the CPU side (e.g. using sprites w/ sprite scopes - * to ensure freed textures aren't being used in pending command lists. */ -void gpu_release(struct gpu_handle handle) -{ - enum dx12_handle_kind kind = NULL; - void *data = NULL; - - /* Release handle entry */ - struct sys_lock lock = sys_mutex_lock_e(&G.handle_entries_mutex); - { - struct dx12_handle_entry *entry = handle_get_entry(handle, &lock); - if (entry) { - kind = entry->kind; - data = entry->data; - } - ++entry->gen; - entry->next_free = G.first_free_handle_entry; - G.first_free_handle_entry = entry; - } - sys_mutex_unlock(&lock); - - /* Release data */ - if (data) { - switch (kind) { - default: break; - - case DX12_HANDLE_KIND_TEXTURE: - { - dx12_texture_release(data); - } break; - } + if (shader->pso) { + ID3D12PipelineState_Release(shader->pso); } } diff --git a/src/work.c b/src/work.c index a6e02147..e950dd08 100644 --- a/src/work.c +++ b/src/work.c @@ -448,9 +448,9 @@ INTERNAL struct work_handle work_push_from_slate_locked(struct sys_lock *lock, s * there would be no remaining workers to complete the child work, meaning * there is a deadlock. * - * By forcing workers to do their own child work, we can guarantee that this - * does not occur. However it is not ideal since it creates situations in - * which work is not done asynchronously. + * By forcing workers to do their own child work in this scenario, we can + * guarantee that this does not occur. However it is not ideal since it + * creates situations in which work is not done asynchronously. */ struct worker_ctx *ctx = thread_local_var_eval(&tl_worker_ctx); if (ctx->is_worker) {