From 74609cdb3ceee6792d88ae77eb91ff48d86f0a67 Mon Sep 17 00:00:00 2001
From: jacob <jacob@cagori.com>
Date: Sat, 7 Jun 2025 20:20:21 -0500
Subject: [PATCH] shader PSO creation

---
 res/shaders/grid.hlsl    |  10 +-
 res/shaders/mesh.hlsl    |   4 +-
 res/shaders/test.hlsl    |  10 +-
 res/shaders/texture.hlsl |  59 +++++----
 src/app.c                |   9 +-
 src/gpu_dx12.c           | 270 +++++++++++++++++++++++++--------------
 src/work.c               |   6 +-
 7 files changed, 229 insertions(+), 139 deletions(-)

diff --git a/res/shaders/grid.hlsl b/res/shaders/grid.hlsl
index 19ef928c..6106473b 100644
--- a/res/shaders/grid.hlsl
+++ b/res/shaders/grid.hlsl
@@ -28,12 +28,12 @@ struct ps_input {
  * Globals
  * ========================== */
 
-StructuredBuffer<vs_instance> G_instance_buffer : register(t0);
+StructuredBuffer<vs_instance> g_instance_buffer : register(t0);
 
 cbuffer constants : register(b0)
 {
-    float4x4 G_projection;
-    uint G_instance_offset;
+    float4x4 g_projection;
+    uint g_instance_offset;
 };
 
 /* ========================== *
@@ -49,12 +49,12 @@ static const float2 G_quad_verts[4] = {
 
 ps_input vs_main(uint instance_id : SV_InstanceID, uint vertex_id : SV_VertexID)
 {
-    vs_instance instance = G_instance_buffer[G_instance_offset + instance_id];
+    vs_instance instance = g_instance_buffer[g_instance_offset + instance_id];
     float2 vert = G_quad_verts[vertex_id];
     float2 world_pos = mul(instance.xf, float3(vert, 1)).xy;
 
     ps_input output;
-    output.screen_pos = mul(G_projection, float4(world_pos, 0, 1));
+    output.screen_pos = mul(g_projection, float4(world_pos, 0, 1));
     output.line_thickness = instance.line_thickness;
     output.line_spacing = instance.line_spacing;
     output.offset = instance.offset;
diff --git a/res/shaders/mesh.hlsl b/res/shaders/mesh.hlsl
index d583837f..20ec6461 100644
--- a/res/shaders/mesh.hlsl
+++ b/res/shaders/mesh.hlsl
@@ -16,7 +16,7 @@ struct ps_input {
 
 cbuffer constants : register(b0)
 {
-    float4x4 G_projection;
+    float4x4 g_projection;
 };
 
 /* ========================== *
@@ -26,7 +26,7 @@ cbuffer constants : register(b0)
 ps_input vs_main(vs_input input)
 {
     ps_input output;
-    output.screen_pos = mul(G_projection, float4(input.pos.xy, 0.f, 1.f));
+    output.screen_pos = mul(g_projection, float4(input.pos.xy, 0.f, 1.f));
     output.color_lin = linear_from_srgb(input.color_srgb);
 
     return output;
diff --git a/res/shaders/test.hlsl b/res/shaders/test.hlsl
index da522318..076735ca 100644
--- a/res/shaders/test.hlsl
+++ b/res/shaders/test.hlsl
@@ -12,12 +12,12 @@ struct ps_input {
  * Globals
  * ========================== */
 
-StructuredBuffer<vs_instance> G_instance_buffer : register(t0);
+StructuredBuffer<vs_instance> g_instance_buffer : register(t0);
 
 cbuffer constants : register(b0)
 {
-    float4x4 G_projection;
-    uint G_instance_offset;
+    float4x4 g_projection;
+    uint g_instance_offset;
 };
 
 /* ========================== *
@@ -33,11 +33,11 @@ static const float2 G_quad_verts[4] = {
 
 ps_input vs_main(uint instance_id : SV_InstanceID, uint vertex_id : SV_VertexID)
 {
-    vs_instance instance = G_instance_buffer[G_instance_offset + instance_id];
+    vs_instance instance = g_instance_buffer[g_instance_offset + instance_id];
     float2 vert = G_quad_verts[vertex_id];
     float2 world_pos = mul(instance.xf, float3(vert, 1)).xy;
     ps_input output;
-    output.screen_pos = mul(G_projection, float4(world_pos, 0, 1));
+    output.screen_pos = mul(g_projection, float4(world_pos, 0, 1));
     return output;
 }
 
diff --git a/res/shaders/texture.hlsl b/res/shaders/texture.hlsl
index 18e0c717..7f152dbb 100644
--- a/res/shaders/texture.hlsl
+++ b/res/shaders/texture.hlsl
@@ -1,6 +1,6 @@
 #include "shaders/common.hlsl"
 
-struct vs_instance {
+struct instance {
     float2x3 xf;
     float2 uv0;
     float2 uv1;
@@ -8,55 +8,65 @@ struct vs_instance {
     float emittance;
 };
 
-struct ps_input {
-    DESV(float4, screen_pos, SV_POSITION);
-    DECL(float2, uv);
-    DECL(float4, tint_lin);
+struct constants {
+    float4x4 projection;
+    uint instance_offset;
 };
 
 /* ========================== *
- * Globals
+ * Root Signature
  * ========================== */
 
-StructuredBuffer<vs_instance> G_instance_buffer : register(t0);
+#define SIG \
+    "CBV(b0), " \
+    "DescriptorTable(SRV(t0), SRV(t1)), " \
+    "DescriptorTable(Sampler(s0))"
 
-Texture2D G_texture : register(t1);
-
-SamplerState G_sampler : register(s0);
-
-cbuffer constants : register(b0)
+cbuffer c : register(b0)
 {
-    float4x4 G_projection;
-    uint G_instance_offset;
+    struct constants g_constants;
 };
 
+StructuredBuffer<instance> g_instance_buffer : register(t0);
+
+Texture2D g_texture : register(t1);
+
+SamplerState g_sampler : register(s0);
+
 /* ========================== *
  * Vertex shader
  * ========================== */
 
-static const float2 G_quad_verts[4] = {
+static const float2 g_quad_verts[4] = {
     float2(-0.5f, -0.5f),
     float2( 0.5f, -0.5f),
     float2( 0.5f,  0.5f),
     float2(-0.5f,  0.5f)
 };
 
-static const int2 G_uv_factors[4] = {
+static const int2 g_uv_factors[4] = {
     int2(0, 0),
     int2(1, 0),
     int2(1, 1),
     int2(0, 1)
 };
 
-ps_input vs_main(uint instance_id : SV_InstanceID, uint vertex_id : SV_VertexID)
+struct vs_output {
+    DESV(float4, screen_pos, SV_POSITION);
+    DECL(float2, uv);
+    DECL(float4, tint_lin);
+};
+
+[RootSignature(SIG)]
+vs_output vs_main(uint instance_id : SV_InstanceID, uint vertex_id : SV_VertexID)
 {
-    vs_instance instance = G_instance_buffer[G_instance_offset + instance_id];
-    float2 vert = G_quad_verts[vertex_id];
-    float2 uv_factor  = G_uv_factors[vertex_id];
+    instance instance = g_instance_buffer[g_constants.instance_offset + instance_id];
+    float2 vert = g_quad_verts[vertex_id];
+    float2 uv_factor  = g_uv_factors[vertex_id];
     float2 world_pos = mul(instance.xf, float3(vert, 1)).xy;
 
-    ps_input output;
-    output.screen_pos = mul(G_projection, float4(world_pos, 0, 1));
+    vs_output output;
+    output.screen_pos = mul(g_constants.projection, float4(world_pos, 0, 1));
     output.uv = instance.uv0 + uv_factor * (instance.uv1 - instance.uv0);
     output.tint_lin = linear_from_srgb32(instance.tint_srgb);
 
@@ -67,8 +77,9 @@ ps_input vs_main(uint instance_id : SV_InstanceID, uint vertex_id : SV_VertexID)
  * Pixel shader
  * ========================== */
 
-float4 ps_main(ps_input input) : SV_TARGET
+[RootSignature(SIG)]
+float4 ps_main(vs_output input) : SV_TARGET
 {
-    float4 color = G_texture.Sample(G_sampler, input.uv) * input.tint_lin;
+    float4 color = g_texture.Sample(g_sampler, input.uv) * input.tint_lin;
     return color;
 }
diff --git a/src/app.c b/src/app.c
index f5a2d46e..68f00de0 100644
--- a/src/app.c
+++ b/src/app.c
@@ -248,13 +248,16 @@ void app_entry_point(struct string args_str)
         /* Ideally these layers should have cores "reserved" for them
          * 1. User thread
          * 2. Sim thread
-         * 3. Audio mixing/playback thread
+         * 3. Audio mixing / playback thread
+         * 4. Networking thread
          */
-        i32 num_reserved_cores = 3;
+        i32 num_reserved_cores = 4;
 
+        i32 num_logical_cores = (i32)sys_num_logical_processors();
+        //num_logical_cores = min(num_logical_cores, 8) + (max(num_logical_cores - 8, 0) / 2);  /* Dumb heuristic to try and lessen e-core usage */
         i32 min_worker_count = 2;
         i32 max_worker_count = 128;
-        i32 target_worker_count = (i32)sys_num_logical_processors() - num_reserved_cores;
+        i32 target_worker_count = num_logical_cores - num_reserved_cores;
         worker_count = (u32)clamp_i32(target_worker_count, min_worker_count, max_worker_count);
 #endif
     }
diff --git a/src/gpu_dx12.c b/src/gpu_dx12.c
index babb3ff7..4ea61eb9 100644
--- a/src/gpu_dx12.c
+++ b/src/gpu_dx12.c
@@ -56,6 +56,7 @@ struct dx12_shader_desc {
 
 struct dx12_shader {
     struct dx12_shader_desc desc;
+    ID3D12PipelineState *pso;
 };
 
 struct dx12_shader_result {
@@ -162,6 +163,90 @@ INTERNAL APP_EXIT_CALLBACK_FUNC_DEF(gpu_shutdown)
 #endif
 }
 
+/* ========================== *
+ * Handle
+ * ========================== */
+
+INTERNAL void dx12_texture_release(struct dx12_texture *t);
+
+INTERNAL struct gpu_handle handle_alloc(enum dx12_handle_kind kind, void *data)
+{
+    /* Hands out a handle for `data`: recycles a free-list entry when one exists, otherwise pushes a new entry. */
+    u64 prev_gen = 0;
+    u64 slot = 0;
+    struct sys_lock lock = sys_mutex_lock_e(&G.handle_entries_mutex);
+    struct dx12_handle_entry *e = G.first_free_handle_entry;
+    if (e) {
+        G.first_free_handle_entry = e->next_free;
+        prev_gen = e->gen;
+        slot = e->idx;
+    } else {
+        e = arena_push_no_zero(&G.handle_entries_arena, struct dx12_handle_entry);
+        slot = G.num_handle_entries_reserved++;
+    }
+    sys_mutex_unlock(&lock);
+
+    /* Entry is reset outside the mutex; generation moves one past the recycled value (new entries get 1) */
+    MEMZERO_STRUCT(e);
+    e->kind = kind;
+    e->gen = prev_gen + 1;
+    e->idx = slot;
+    e->data = data;
+
+    struct gpu_handle res = ZI;
+    res.gen = e->gen;
+    res.idx = e->idx;
+    return res;
+}
+
+INTERNAL struct dx12_handle_entry *handle_get_entry(struct gpu_handle handle, struct sys_lock *lock)
+{
+    /* Resolves `handle` to its entry; yields NULL when the index is out of range or the generation differs. */
+    sys_assert_locked_e_or_s(lock, &G.handle_entries_mutex);
+    if (!(handle.idx > 0 && handle.idx < G.num_handle_entries_reserved)) {
+        return NULL;
+    }
+    /* Entries are indexed directly from the arena base */
+    struct dx12_handle_entry *entries = (struct dx12_handle_entry *)G.handle_entries_arena.base;
+    struct dx12_handle_entry *found = entries + handle.idx;
+    return (found->gen == handle.gen) ? found : NULL;
+}
+
+/* Retires `handle` and releases the data behind it (textures only, for now); stale or invalid handles are ignored.
+ * TODO: The GPU api should ensure that freed resources will not cause issues on the GPU (via fencing), however the caller
+ * manages CPU-side lifetimes (e.g. using sprites w/ sprite scopes so freed textures aren't in pending command lists). */
+void gpu_release(struct gpu_handle handle)
+{
+    enum dx12_handle_kind kind = (enum dx12_handle_kind)0;
+    void *data = NULL;
+
+    /* Release handle entry */
+    struct sys_lock lock = sys_mutex_lock_e(&G.handle_entries_mutex);
+    {
+        struct dx12_handle_entry *entry = handle_get_entry(handle, &lock);
+        if (entry) { /* NULL when the index is out of range or the generation no longer matches */
+            kind = entry->kind;
+            data = entry->data;
+            ++entry->gen; /* Outstanding copies of this handle stop resolving */
+            entry->next_free = G.first_free_handle_entry;
+            G.first_free_handle_entry = entry;
+        }
+    }
+    sys_mutex_unlock(&lock);
+
+    /* Release data */
+    if (data) {
+        switch (kind) {
+            default: break;
+
+            case DX12_HANDLE_KIND_TEXTURE:
+            {
+                dx12_texture_release(data);
+            } break;
+        }
+    }
+}
+
 /* ========================== *
  * Dx12 base initialization
  * ========================== */
@@ -433,7 +518,7 @@ PACK(struct dx12_texture_shader_instance {
 /* Init shaders */
 
 INTERNAL struct dx12_shader_result *shader_alloc_from_descs(struct arena *arena, u64 num_shaders, struct dx12_shader_desc *descs);
-INTERNAL void shader_release(struct dx12_shader *shader);
+INTERNAL void dx12_shader_release(struct dx12_shader *shader);
 
 INTERNAL void dx12_init_shaders(void)
 {
@@ -442,18 +527,21 @@ INTERNAL void dx12_init_shaders(void)
     struct dx12_shader_desc shader_descs[] = {
         /* Texture shader */
         {
-            .name = "shaders/texture.hlsl",
-            .flags = DX12_SHADER_DESC_FLAG_VS | DX12_SHADER_DESC_FLAG_PS
+            .name   =   "shaders/texture.hlsl",
+            .flags  =   DX12_SHADER_DESC_FLAG_VS |
+                        DX12_SHADER_DESC_FLAG_PS
         }
     };
 
     struct dx12_shader_result *results = shader_alloc_from_descs(scratch.arena, ARRAY_COUNT(shader_descs), shader_descs);
     for (u64 i = 0; i < ARRAY_COUNT(shader_descs); ++i) {
         struct dx12_shader_result *result = &results[i];
-        if (result->errors_text_len <= 0) {
-            /* TODO */
+        if (result->errors_text_len > 0) {
+            struct string msg = STRING(result->errors_text_len, result->errors_text);
+            sys_panic(msg);
+            dx12_shader_release(&result->shader);
         } else {
-            shader_release(&result->shader);
+            /* TODO */
         }
     }
 
@@ -538,7 +626,8 @@ INTERNAL void dx12_include_handler_release(struct dx12_include_handler *handler)
  * Shader compilation
  * ========================== */
 
- /* TODO: Compile shaders offline w/ dxc */
+ /* TODO: Compile shaders offline w/ dxc.
+  * Will also allow for some hlsl language features like static_assert */
 
 enum shader_compile_task_kind {
     SHADER_COMPILE_TASK_KIND_VS,
@@ -673,7 +762,7 @@ INTERNAL WORK_TASK_FUNC_DEF(shader_load_task, load_arg_raw)
         struct string shader_name = string_from_cstr_no_limit(desc.name);
         logf_info("Loading shader '%F'", FMT_STR(shader_name));
         struct resource src_res = resource_open(shader_name);
-        (UNUSED)result;
+        struct string error_str = LIT("Unknown error");
 
         struct shader_compile_task_arg vs = ZI;
         vs.kind = SHADER_COMPILE_TASK_KIND_VS;
@@ -696,9 +785,77 @@ INTERNAL WORK_TASK_FUNC_DEF(shader_load_task, load_arg_raw)
         work_wait(work);
 
         b32 success = vs.success && ps.success;
+
+        /* FIXME: Validate root signature blob exists in bytecode */
+
+        /* Create PSO */
+        ID3D12PipelineState *pso = NULL;
         if (success) {
-        } else {
-            struct string error_str = LIT("Unknown error");
+            /* Default rasterizer state */
+            D3D12_RASTERIZER_DESC raster_desc = {
+                .FillMode = D3D12_FILL_MODE_SOLID,
+                .CullMode = D3D12_CULL_MODE_BACK,
+                .FrontCounterClockwise = FALSE,
+                .DepthBias = D3D12_DEFAULT_DEPTH_BIAS,
+                .DepthBiasClamp = D3D12_DEFAULT_DEPTH_BIAS_CLAMP,
+                .SlopeScaledDepthBias = D3D12_DEFAULT_SLOPE_SCALED_DEPTH_BIAS,
+                .DepthClipEnable = TRUE,
+                .MultisampleEnable = FALSE,
+                .AntialiasedLineEnable = FALSE,
+                .ForcedSampleCount = 0,
+                .ConservativeRaster = D3D12_CONSERVATIVE_RASTERIZATION_MODE_OFF
+            };
+
+            /* No input layout */
+            D3D12_INPUT_LAYOUT_DESC input_layout_desc = {
+                .pInputElementDescs = NULL,
+                .NumElements = 0
+            };
+
+            /* Opaque blend state */
+            D3D12_BLEND_DESC blend_desc = {
+                .AlphaToCoverageEnable = FALSE,
+                .IndependentBlendEnable = FALSE
+            };
+            blend_desc.RenderTarget[0].BlendEnable = FALSE;
+            blend_desc.RenderTarget[0].RenderTargetWriteMask = D3D12_COLOR_WRITE_ENABLE_ALL;
+
+            /* Disable depth stencil */
+            D3D12_DEPTH_STENCIL_DESC depth_stencil_desc = {
+                .DepthEnable = FALSE,
+                .StencilEnable = FALSE
+            };
+
+            /* PSO */
+            D3D12_GRAPHICS_PIPELINE_STATE_DESC pso_desc = { 0 };
+            pso_desc.pRootSignature = NULL;  /* Use embedded root signature */
+            if (vs.success) {
+                pso_desc.VS.pShaderBytecode = ID3D10Blob_GetBufferPointer(vs.blob);
+                pso_desc.VS.BytecodeLength = ID3D10Blob_GetBufferSize(vs.blob);
+            }
+            if (ps.success) {
+                pso_desc.PS.pShaderBytecode = ID3D10Blob_GetBufferPointer(ps.blob);
+                pso_desc.PS.BytecodeLength = ID3D10Blob_GetBufferSize(ps.blob);
+            }
+            pso_desc.BlendState = blend_desc;
+            pso_desc.SampleMask = UINT_MAX;
+            pso_desc.RasterizerState = raster_desc;
+            pso_desc.DepthStencilState = depth_stencil_desc;
+            pso_desc.InputLayout = input_layout_desc;
+            pso_desc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE;
+            pso_desc.NumRenderTargets = 1;
+            pso_desc.RTVFormats[0] = DXGI_FORMAT_R8G8B8A8_UNORM;
+            pso_desc.SampleDesc.Count = 1;
+            HRESULT hr = ID3D12Device_CreateGraphicsPipelineState(G.device, &pso_desc, &IID_ID3D12PipelineState, (void **)&pso);
+            if (FAILED(hr)) {
+                error_str = LIT("Failed to create pipeline state object");
+                success = false;
+                ASSERT(false);
+            }
+        }
+
+        /* Copy error */
+        if (!success) {
             ID3D10Blob *error_blob = vs.error_blob ? vs.error_blob : ps.error_blob;
             if (error_blob) {
                 u64 error_blob_cstr_len = ID3D10Blob_GetBufferSize(error_blob);
@@ -712,10 +869,12 @@ INTERNAL WORK_TASK_FUNC_DEF(shader_load_task, load_arg_raw)
                     error_str = error_blob_str;
                 }
             }
-            result->errors_text_len = max_u64(error_str.len, ARRAY_COUNT(result->errors_text));
+            result->errors_text_len = min_u64(error_str.len, ARRAY_COUNT(result->errors_text));
             MEMCPY(result->errors_text, error_str.text, result->errors_text_len);
         }
 
+        shader->pso = pso;
+
         if (vs.blob) {
             ID3D10Blob_Release(vs.blob);
         }
@@ -761,94 +920,11 @@ INTERNAL struct dx12_shader_result *shader_alloc_from_descs(struct arena *arena,
     return results;
 }
 
-INTERNAL void shader_release(struct dx12_shader *shader)
+INTERNAL void dx12_shader_release(struct dx12_shader *shader)
 {
     __prof;
-    /* TODO */
-    (UNUSED)shader;
-}
-
-/* ========================== *
- * Handle
- * ========================== */
-
-INTERNAL void dx12_texture_release(struct dx12_texture *t);
-
-INTERNAL struct gpu_handle handle_alloc(enum dx12_handle_kind kind, void *data)
-{
-    u64 old_gen = 0;
-    u64 idx = 0;
-    struct dx12_handle_entry *entry = NULL;
-    {
-        struct sys_lock lock = sys_mutex_lock_e(&G.handle_entries_mutex);
-        if (G.first_free_handle_entry) {
-            entry = G.first_free_handle_entry;
-            G.first_free_handle_entry = entry->next_free;
-            old_gen = entry->gen;
-            idx = entry->idx;
-        } else {
-            entry = arena_push_no_zero(&G.handle_entries_arena, struct dx12_handle_entry);
-            idx = G.num_handle_entries_reserved++;
-        }
-        sys_mutex_unlock(&lock);
-    }
-    MEMZERO_STRUCT(entry);
-    entry->kind = kind;
-    entry->gen = old_gen + 1;
-    entry->idx = idx;
-    entry->data = data;
-
-    struct gpu_handle res = ZI;
-    res.gen = entry->gen;
-    res.idx = entry->idx;
-    return res;
-}
-
-INTERNAL struct dx12_handle_entry *handle_get_entry(struct gpu_handle handle, struct sys_lock *lock)
-{
-    sys_assert_locked_e_or_s(lock, &G.handle_entries_mutex);
-    struct dx12_handle_entry *res = NULL;
-    if (handle.idx > 0 && handle.idx < G.num_handle_entries_reserved) {
-        struct dx12_handle_entry *tmp = &((struct dx12_handle_entry *)G.handle_entries_arena.base)[handle.idx];
-        if (tmp->gen == handle.gen) {
-            res = tmp;
-        }
-    }
-    return res;
-}
-
-/* TODO: The GPU api should ensure that resources freed by the caller will not cause issues on the GPU (via fencing),
- * however the caller is responsible for managing resource lifetimes on the CPU side (e.g. using sprites w/ sprite scopes
- * to ensure freed textures aren't being used in pending command lists. */
-void gpu_release(struct gpu_handle handle)
-{
-    enum dx12_handle_kind kind = NULL;
-    void *data = NULL;
-
-    /* Release handle entry */
-    struct sys_lock lock = sys_mutex_lock_e(&G.handle_entries_mutex);
-    {
-        struct dx12_handle_entry *entry = handle_get_entry(handle, &lock);
-        if (entry) {
-            kind = entry->kind;
-            data = entry->data;
-        }
-        ++entry->gen;
-        entry->next_free = G.first_free_handle_entry;
-        G.first_free_handle_entry = entry;
-    }
-    sys_mutex_unlock(&lock);
-
-    /* Release data */
-    if (data) {
-        switch (kind) {
-            default: break;
-
-            case DX12_HANDLE_KIND_TEXTURE:
-            {
-                dx12_texture_release(data);
-            } break;
-        }
+    if (shader->pso) {
+        ID3D12PipelineState_Release(shader->pso);
     }
 }
 
diff --git a/src/work.c b/src/work.c
index a6e02147..e950dd08 100644
--- a/src/work.c
+++ b/src/work.c
@@ -448,9 +448,9 @@ INTERNAL struct work_handle work_push_from_slate_locked(struct sys_lock *lock, s
          * there would be no remaining workers to complete the child work, meaning
          * there is a deadlock.
          *
-         * By forcing workers to do their own child work, we can guarantee that this
-         * does not occur. However it is not ideal since it creates situations in
-         * which work is not done asynchronously.
+         * By forcing workers to do their own child work in this scenario, we can
+         * guarantee that this does not occur. However it is not ideal since it
+         * creates situations in which work is not done asynchronously.
          */
         struct worker_ctx *ctx = thread_local_var_eval(&tl_worker_ctx);
         if (ctx->is_worker) {