allow color & values for profiling

This commit is contained in:
jacob 2025-07-06 14:37:17 -05:00
parent 60613815d7
commit a9bcab1b78
16 changed files with 211 additions and 191 deletions

View File

@ -324,7 +324,7 @@ void sys_app_entry(struct string args_str)
* forcing process exit (to prevent process hanging in the background
* if something gets stuck) */
{
__profscope(Run exit callbacks);
__profn("Run exit callbacks");
struct sys_lock lock = sys_mutex_lock_e(G.exit_callbacks_mutex);
for (struct exit_callback *callback = G.exit_callbacks_head; callback; callback = callback->next) {
callback->func();
@ -334,7 +334,7 @@ void sys_app_entry(struct string args_str)
/* Write window settings to file */
{
__profscope(Write settings file);
__profn("Write settings file");
struct arena_temp temp = arena_temp_begin(scratch.arena);
struct string window_settings_path = app_write_path_cat(temp.arena, settings_file_name);

View File

@ -76,7 +76,7 @@ void *arena_push_bytes_no_zero(struct arena *arena, u64 size, u64 align)
u64 new_pos = aligned_start_pos + size;
if (new_pos > arena->committed) {
__profscope(Arena commit);
__profn("Arena commit");
/* Commit new block(s) */
u64 blocks_needed = (new_pos - arena->committed + ARENA_BLOCK_SIZE - 1) / ARENA_BLOCK_SIZE;
u64 commit_bytes = blocks_needed * ARENA_BLOCK_SIZE;

View File

@ -731,7 +731,7 @@ struct ase_decode_image_result ase_decode_image(struct arena *arena, struct stri
}
{
__profscope(Build image from cels);
__profn("Build image from cels");
/* Assemble image from cels */
for (struct cel *cel = cel_head; cel; cel = cel->next) {

View File

@ -534,7 +534,7 @@ INTERNAL void dx12_init_device(void)
/* Enable stable power state */
{
b32 success = true;
__profscope(Set stable power state);
__profn("Set stable power state");
HKEY key = 0;
success = RegOpenKeyExW(HKEY_LOCAL_MACHINE, L"SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\AppModelUnlock", 0, KEY_READ, &key) == ERROR_SUCCESS;
if (success) {
@ -942,7 +942,7 @@ INTERNAL SYS_JOB_DEF(pipeline_init_job, job)
* root signature exists and matches between shaders. */
ID3D10Blob *rootsig_blob = NULL;
if (success) {
__profscope(Validate root signatures);
__profn("Validate root signatures");
char *vs_rootsig_data = NULL;
char *ps_rootsig_data = NULL;
u32 vs_rootsig_data_len = 0;
@ -979,7 +979,7 @@ INTERNAL SYS_JOB_DEF(pipeline_init_job, job)
/* Create root signature */
ID3D12RootSignature *rootsig = NULL;
if (success) {
__profscope(Create root signature);
__profn("Create root signature");
hr = ID3D12Device_CreateRootSignature(G.device, 0, ID3D10Blob_GetBufferPointer(rootsig_blob), ID3D10Blob_GetBufferSize(rootsig_blob), &IID_ID3D12RootSignature, (void **)&rootsig);
if (FAILED(hr)) {
error_str = LIT("Failed to create root signature");
@ -991,7 +991,7 @@ INTERNAL SYS_JOB_DEF(pipeline_init_job, job)
ID3D12PipelineState *pso = NULL;
if (success) {
/* Default rasterizer state */
__profscope(Create PSO);
__profn("Create PSO");
D3D12_RASTERIZER_DESC raster_desc = {
.FillMode = D3D12_FILL_MODE_SOLID,
.CullMode = D3D12_CULL_MODE_NONE,
@ -1845,7 +1845,7 @@ INTERNAL u64 command_list_close(struct command_list *cl)
/* Close */
{
__profscope(Close DX12 command list);
__profn("Close DX12 command list");
HRESULT hr = ID3D12GraphicsCommandList_Close(cl->cl);
if (FAILED(hr)) {
/* TODO: Don't panic */
@ -1856,7 +1856,7 @@ INTERNAL u64 command_list_close(struct command_list *cl)
/* Submit */
u64 submit_fence_target = 0;
{
__profscope(Execute);
__profn("Execute");
struct sys_lock submit_lock = sys_mutex_lock_s(G.global_submit_mutex);
struct sys_lock fence_lock = sys_mutex_lock_e(cq->submit_fence_mutex);
{
@ -2318,7 +2318,7 @@ struct gp_resource *gp_texture_alloc(enum gp_texture_format format, u32 flags, s
struct command_queue *cq = G.command_queues[DX12_QUEUE_COPY_BACKGROUND];
struct command_list *cl = command_list_open(cq->cl_pool);
{
__profscope_dx12(cl->cq->prof, cl->cl, Upload texture, RGB32_F(0.2, 0.5, 0.2));
__profnc_dx12(cl->cq->prof, cl->cl, "Upload texture", RGB32_F(0.2, 0.5, 0.2));
D3D12_TEXTURE_COPY_LOCATION dst_loc = {
.pResource = r->resource,
.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX,
@ -2340,7 +2340,7 @@ struct gp_resource *gp_texture_alloc(enum gp_texture_format format, u32 flags, s
/* Wait */
/* TODO: Return async waitable to caller */
{
__profscope(Wait for upload);
__profn("Wait for upload");
HANDLE event = CreateEvent(NULL, false, false, NULL);
ID3D12Fence_SetEventOnCompletion(cq->submit_fence, fence_target, event);
WaitForSingleObject(event, INFINITE);
@ -2383,7 +2383,7 @@ void gp_dispatch(struct gp_dispatch_params params)
struct command_queue *cq = G.command_queues[DX12_QUEUE_DIRECT];
struct command_list *cl = command_list_open(cq->cl_pool);
{
__profscope_dx12(cl->cq->prof, cl->cl, Dispatch, RGB32_F(0.5, 0.2, 0.2));
__profnc_dx12(cl->cq->prof, cl->cl, "Dispatch", RGB32_F(0.5, 0.2, 0.2));
struct mat4x4 vp_matrix = calculate_vp(params.draw_target_view, params.draw_target_viewport.width, params.draw_target_viewport.height);
/* Upload dummy vert & index buffer */
@ -2397,11 +2397,11 @@ void gp_dispatch(struct gp_dispatch_params params)
struct sh_material_instance *material_instances = arena_push_array_no_zero(scratch.arena, struct sh_material_instance, flow->num_material_instance_descs);
struct sh_material_grid *grids = arena_push_array_no_zero(scratch.arena, struct sh_material_grid, flow->num_material_grid_descs);
{
__profscope(Process flow data);
__profn("Process flow data");
/* Process material instances */
{
__profscope(Process material instances);
__profn("Process material instances");
for (u32 i = 0; i < flow->num_material_instance_descs; ++i) {
struct material_instance_desc *desc = &((struct material_instance_desc *)arena_base(flow->material_instance_descs_arena))[i];
struct sh_material_instance *instance = &material_instances[i];
@ -2427,7 +2427,7 @@ void gp_dispatch(struct gp_dispatch_params params)
/* Process grids */
{
__profscope(Process grids);
__profn("Process grids");
for (u32 i = 0; i < flow->num_material_grid_descs; ++i) {
struct material_grid_desc *desc = &((struct material_grid_desc *)arena_base(flow->material_grid_descs_arena))[i];
struct sh_material_grid *grid = &grids[i];
@ -2464,8 +2464,8 @@ void gp_dispatch(struct gp_dispatch_params params)
/* Material pass */
if (material_pipeline->success) {
__profscope(Material pass);
__profscope_dx12(cl->cq->prof, cl->cl, Material pass, RGB32_F(0.5, 0.2, 0.2));
__profn("Material pass");
__profnc_dx12(cl->cq->prof, cl->cl, "Material pass", RGB32_F(0.5, 0.2, 0.2));
/* Bind pipeline */
ID3D12GraphicsCommandList_SetPipelineState(cl->cl, material_pipeline->pso);
@ -2505,8 +2505,8 @@ void gp_dispatch(struct gp_dispatch_params params)
/* Shape pass */
if (shape_pipeline->success) {
__profscope(Shape pass);
__profscope_dx12(cl->cq->prof, cl->cl, Shape pass, RGB32_F(0.5, 0.2, 0.2));
__profn("Shape pass");
__profnc_dx12(cl->cq->prof, cl->cl, "Shape pass", RGB32_F(0.5, 0.2, 0.2));
/* Bind pipeline */
ID3D12GraphicsCommandList_SetPipelineState(cl->cl, shape_pipeline->pso);
@ -2696,7 +2696,7 @@ INTERNAL void present_blit(struct swapchain_buffer *dst, struct dx12_resource *s
struct command_queue *cq = G.command_queues[DX12_QUEUE_DIRECT];
struct command_list *cl = command_list_open(cq->cl_pool);
{
__profscope_dx12(cl->cq->prof, cl->cl, Blit, RGB32_F(0.5, 0.2, 0.2));
__profnc_dx12(cl->cq->prof, cl->cl, "Blit", RGB32_F(0.5, 0.2, 0.2));
struct swapchain *swapchain = dst->swapchain;
/* Upload dummy vert & index buffer */
@ -2804,7 +2804,7 @@ void gp_present(struct sys_window *window, struct v2i32 backresolution, struct g
/* Present */
/* FIXME: Resource barrier */
{
__profscope(Present);
__profn("Present");
HRESULT hr = IDXGISwapChain3_Present(swapchain->swapchain, vsync, present_flags);
if (!SUCCEEDED(hr)) {
ASSERT(false);
@ -2814,7 +2814,7 @@ void gp_present(struct sys_window *window, struct v2i32 backresolution, struct g
#if PROFILING_D3D
{
__profscope(Mark queue frames);
__profn("Mark queue frames");
/* Lock because frame marks shouldn't occur while command lists are recording */
struct sys_lock lock = sys_mutex_lock_e(G.global_command_list_record_mutex);
for (u32 i = 0; i < countof(G.command_queues); ++i) {
@ -2824,7 +2824,7 @@ void gp_present(struct sys_window *window, struct v2i32 backresolution, struct g
sys_mutex_unlock(&lock);
}
{
__profscope(Collect queues);
__profn("Collect queues");
for (u32 i = 0; i < countof(G.command_queues); ++i) {
struct command_queue *cq = G.command_queues[i];
__prof_dx12_collect(cq->prof);
@ -2854,7 +2854,7 @@ INTERNAL SYS_THREAD_DEF(evictor_thread_entry_point, arg)
while (!shutdown) {
struct arena_temp temp = arena_temp_begin(scratch.arena);
{
__profscope(Run);
__profn("Run");
u64 targets[countof(completed_targets)] = ZI;
@ -2862,7 +2862,7 @@ INTERNAL SYS_THREAD_DEF(evictor_thread_entry_point, arg)
u32 num_fenced_releases = 0;
struct fenced_release_data *fenced_releases = NULL;
{
__profscope(Copy queued releases);
__profn("Copy queued releases");
struct sys_lock lock = sys_mutex_lock_e(G.fenced_releases_mutex);
num_fenced_releases = G.fenced_releases_arena->pos / sizeof(struct fenced_release_data);
fenced_releases = arena_push_array_no_zero(temp.arena, struct fenced_release_data, num_fenced_releases);
@ -2874,7 +2874,7 @@ INTERNAL SYS_THREAD_DEF(evictor_thread_entry_point, arg)
/* Wait until fences reach target */
{
__profscope(Check fences);
__profn("Check fences");
for (u32 i = 0; i < countof(targets) && !shutdown; ++i) {
while (completed_targets[i] < targets[i] && !shutdown) {
struct command_queue *cq = G.command_queues[i];
@ -2882,7 +2882,7 @@ INTERNAL SYS_THREAD_DEF(evictor_thread_entry_point, arg)
if (completed_targets[i] < targets[i]) {
ID3D12Fence_SetEventOnCompletion(cq->submit_fence, targets[i], event);
{
__profscope(Wait on fence);
__profn("Wait on fence");
WaitForMultipleObjects(2, events, false, INFINITE);
shutdown = atomic_i32_fetch(&G.evictor_thread_shutdown);
}
@ -2893,7 +2893,7 @@ INTERNAL SYS_THREAD_DEF(evictor_thread_entry_point, arg)
/* Process releases */
for (u32 i = 0; i < num_fenced_releases; ++i) {
__profscope(Release);
__profn("Release");
struct fenced_release_data *fr = &fenced_releases[i];
switch (fr->kind) {
default:
@ -2918,7 +2918,7 @@ INTERNAL SYS_THREAD_DEF(evictor_thread_entry_point, arg)
}
arena_temp_end(temp);
{
__profscope(Sleep);
__profn("Sleep");
WaitForSingleObject(G.evictor_thread_wake_event, INFINITE);
shutdown = atomic_i32_fetch(&G.evictor_thread_shutdown);
}

View File

@ -652,7 +652,7 @@ struct host_event_list host_update_begin(struct arena *arena, struct host *host)
i64 now_ns = sys_time_ns();
{
__profscope(Read host packets);
__profn("Read host packets");
struct string read_buff = ZI;
read_buff.len = PACKET_DATA_MAX_LEN;
read_buff.text = arena_push_array_no_zero(scratch.arena, u8, read_buff.len);
@ -828,7 +828,7 @@ struct host_event_list host_update_begin(struct arena *arena, struct host *host)
/* Update channels */
{
__profscope(Update host channels);
__profn("Update host channels");
for (u64 i = 0; i < host->num_channels_reserved; ++i) {
struct host_channel *channel = &host->channels[i];
if (channel->valid) {
@ -902,7 +902,7 @@ void host_update_end(struct host *host)
/* Process cmds into sendable packets */
/* TODO: Unreliable packets don't need to be allocated into unreliable packet queue, should just send them and forget */
{
__profscope(Process host cmds);
__profn("Process host cmds");
for (struct host_cmd *cmd = host->first_cmd; cmd; cmd = cmd->next) {
enum host_cmd_kind kind = cmd->kind;
struct host_channel_id channel_id = cmd->channel_id;
@ -1017,7 +1017,7 @@ void host_update_end(struct host *host)
/* Send packets */
/* TODO: Aggregate small packets */
{
__profscope(Send host packets);
__profn("Send host packets");
for (u64 i = 0; i < host->num_channels_reserved; ++i) {
struct sock *sock = host->sock;
struct host_channel *channel = &host->channels[i];

View File

@ -216,7 +216,7 @@ void _log(i32 level, struct string msg)
struct sys_lock lock = sys_mutex_lock_s(G.callbacks_mutex);
for (struct log_event_callback *callback = G.first_callback; callback; callback = callback->next) {
if (level <= callback->level) {
__profscope(Run log callback);
__profn("Run log callback");
callback->func(event);
}
}

View File

@ -290,7 +290,7 @@ struct mixed_pcm_f32 mixer_update(struct arena *arena, u64 frame_count)
/* Update & read mixes */
mixes = arena_push_array_no_zero(scratch.arena, struct mix *, G.track_playing_count);
for (struct track *track = G.track_first_playing; track; track = track->next) {
__profscope(Prepare track);
__profn("Prepare track");
struct mix *mix = &track->mix;
mix->desc = track->desc;
mixes[mixes_count++] = mix;
@ -300,7 +300,7 @@ struct mixed_pcm_f32 mixer_update(struct arena *arena, u64 frame_count)
}
for (u64 mix_index = 0; mix_index < mixes_count; ++mix_index) {
__profscope(Mix track);
__profn("Mix track");
struct mix *mix = mixes[mix_index];
if (mix->source->pcm.count <= 0) {
@ -353,7 +353,7 @@ struct mixed_pcm_f32 mixer_update(struct arena *arena, u64 frame_count)
/* Transform 16 bit source -> 32 bit stereo at output duration */
{
__profscope(Resample);
__profn("Resample");
f32 *out_samples = mix_pcm.samples;
u64 out_frames_count = mix_pcm.count / 2;
@ -407,7 +407,7 @@ struct mixed_pcm_f32 mixer_update(struct arena *arena, u64 frame_count)
* ========================== */
if (desc.flags & MIXER_FLAG_SPATIALIZE) {
__profscope(Spatialize);
__profn("Spatialize");
/* Algorithm constants */
const f32 rolloff_height = 1.2f;
@ -468,7 +468,7 @@ struct mixed_pcm_f32 mixer_update(struct arena *arena, u64 frame_count)
}
{
__profscope(Update track effect data);
__profn("Update track effect data");
struct sys_lock lock = sys_mutex_lock_e(G.mutex);
for (u64 i = 0; i < mixes_count; ++i) {
struct mix *mix = mixes[i];

View File

@ -1254,7 +1254,7 @@ void phys_step(struct phys_step_ctx *ctx, f32 timestep)
f32 remaining_dt = timestep;
while (remaining_dt > 0) {
__profscope(Step part);
__profn("Step part");
++phys_iteration;
struct arena_temp scratch = scratch_begin_no_conflict();
@ -1282,7 +1282,7 @@ void phys_step(struct phys_step_ctx *ctx, f32 timestep)
f32 substep_dt = step_dt / SIM_PHYSICS_SUBSTEPS;
for (u32 i = 0; i < SIM_PHYSICS_SUBSTEPS; ++i) {
__profscope(Substep);
__profn("Substep");
/* Warm start */
#if SIM_PHYSICS_ENABLE_WARM_STARTING

View File

@ -174,7 +174,7 @@ INTERNAL struct wasapi_buffer wasapi_update_begin(void)
/* Wait */
{
__profscope(wasapi_wait_on_event);
__profn("Wasapi wait");
WaitForSingleObject(G.event, INFINITE);
}

View File

@ -10,10 +10,10 @@
#define PROFILING_SYSTEM_TRACE 0
#define PROFILING_CAPTURE_FRAME_IMAGE 0
#define PROFILING_LOCKS 0
#define PROFILING_D3D 0
#define PROFILING_D3D 1
#define PROFILING_FILE_WSTR L".tracy"
//#define PROFILING_CMD_WSTR L"cmd /C start \"\" /wait tracy-capture.exe -o .tracy -a 127.0.0.1 && start \"\" tracy-profiler.exe .tracy"
#define PROFILING_CMD_WSTR L"tracy-profiler.exe -a 127.0.0.1"
#define PROFILING_CMD_WSTR L"cmd /C start \"\" /wait tracy-capture.exe -o .tracy -a 127.0.0.1 && start \"\" tracy-profiler.exe .tracy"
//#define PROFILING_CMD_WSTR L"tracy-profiler.exe -a 127.0.0.1"
/* Tracy defines */
#define TRACY_ENABLE
@ -31,11 +31,12 @@
#pragma clang diagnostic ignored "-Wincompatible-pointer-types-discards-qualifiers"
#include TRACY_CLIENT_HEADER_PATH
/* Clang/GCC cleanup macros */
#define __prof static const struct ___tracy_source_location_data CAT(__tracy_source_location,__LINE__) = { NULL, __func__, __FILE__, (uint32_t)__LINE__, 0 }; __attribute((cleanup(__prof_zone_cleanup_func))) TracyCZoneCtx __tracy_zone_ctx = ___tracy_emit_zone_begin( &CAT(__tracy_source_location,__LINE__), true )
#define __profscope(name) static const struct ___tracy_source_location_data CAT(__tracy_source_location,__LINE__) = { #name, __func__, __FILE__, (uint32_t)__LINE__, 0 }; __attribute((cleanup(__prof_zone_cleanup_func))) TracyCZoneCtx __tracy_zone_ctx = ___tracy_emit_zone_begin( &CAT(__tracy_source_location,__LINE__), true )
INLINE void __prof_zone_cleanup_func(TracyCZoneCtx *ctx) { TracyCZoneEnd(*ctx) }
#define __profnc(name, color) static const struct ___tracy_source_location_data CAT(__tracy_source_location,__LINE__) = { (name), __func__, __FILE__, (uint32_t)__LINE__, BGR32(color) }; __attribute((cleanup(__prof_zone_cleanup_func))) TracyCZoneCtx __tracy_zone_ctx = ___tracy_emit_zone_begin( &CAT(__tracy_source_location,__LINE__), true )
#define __profn(name) __profnc(name, 0)
#define __prof __profnc(NULL, 0)
#define __profvalue(v) TracyCZoneValue(__tracy_zone_ctx, (v))
#define __profalloc(ptr, size) TracyCAlloc((ptr), (size))
#define __proffree(ptr) TracyCFree((ptr))
#define __profmsg(txt, len, col) TracyCMessageC((txt), (len), BGR32(col))
@ -59,8 +60,11 @@ enum __prof_plot_type {
#define PROFILING_LOCKS 0
#define PROFILING_D3D 0
#define __profnc(name, color)
#define __profn(name)
#define __prof
#define __profscope(name)
#define __profvalue(v)
#define __profalloc(ptr, size)
#define __proffree(ptr)
#define __profmsg(txt, len, col)
@ -105,25 +109,25 @@ enum __prof_plot_type {
#if PROFILING_D3D
/* Dx11 */
INLINE void __prof_dx11_zone_cleanup_func(TracyCD3D11ZoneCtx *ctx) { ___tracy_d3d11_emit_zone_end(*ctx); }
# define __profscope_dx11(dx11_ctx, name, color) static const struct ___tracy_source_location_data CAT(__tracy_gpu_d3d11_source_location,__LINE__) = { #name, __func__, __FILE__, (uint32_t)__LINE__, BGR32(color) }; __attribute((cleanup(__prof_dx11_zone_cleanup_func))) TracyCD3D11ZoneCtx __tracy_d3d11_zone_ctx; ___tracy_d3d11_emit_zone_begin( dx11_ctx, &__tracy_d3d11_zone_ctx, &CAT(__tracy_gpu_d3d11_source_location,__LINE__), true)
# define __profnc_dx11(dx11_ctx, name, color) static const struct ___tracy_source_location_data CAT(__tracy_gpu_d3d11_source_location,__LINE__) = { name, __func__, __FILE__, (uint32_t)__LINE__, BGR32(color) }; __attribute((cleanup(__prof_dx11_zone_cleanup_func))) TracyCD3D11ZoneCtx __tracy_d3d11_zone_ctx; ___tracy_d3d11_emit_zone_begin( dx11_ctx, &__tracy_d3d11_zone_ctx, &CAT(__tracy_gpu_d3d11_source_location,__LINE__), true)
# define __prof_dx11_ctx(name) struct TracyCD3D11Ctx *name
# define __prof_dx11_ctx_alloc(ctx, device, device_ctx, name, name_len) ctx = ___tracy_d3d11_context_announce(device, device_ctx, name, name_len)
# define __prof_dx11_ctx_release(ctx) ___tracy_d3d11_context_terminate(ctx)
# define __prof_dx11_collect(ctx) ___tracy_d3d11_context_collect(ctx)
/* Dx12 */
INLINE void __prof_dx12_zone_cleanup_func(TracyCD3D12ZoneCtx *ctx) { ___tracy_d3d12_emit_zone_end(*ctx); }
# define __profscope_dx12(dx12_ctx, cmd_list, name, color) static const struct ___tracy_source_location_data CAT(__tracy_gpu_d3d12_source_location,__LINE__) = { #name, __func__, __FILE__, (uint32_t)__LINE__, BGR32(color) }; __attribute((cleanup(__prof_dx12_zone_cleanup_func))) TracyCD3D12ZoneCtx __tracy_d3d12_zone_ctx; ___tracy_d3d12_emit_zone_begin( dx12_ctx, cmd_list, &__tracy_d3d12_zone_ctx, &CAT(__tracy_gpu_d3d12_source_location,__LINE__), true)
# define __profnc_dx12(dx12_ctx, cmd_list, name, color) static const struct ___tracy_source_location_data CAT(__tracy_gpu_d3d12_source_location,__LINE__) = { name, __func__, __FILE__, (uint32_t)__LINE__, BGR32(color) }; __attribute((cleanup(__prof_dx12_zone_cleanup_func))) TracyCD3D12ZoneCtx __tracy_d3d12_zone_ctx; ___tracy_d3d12_emit_zone_begin( dx12_ctx, cmd_list, &__tracy_d3d12_zone_ctx, &CAT(__tracy_gpu_d3d12_source_location,__LINE__), true)
# define __prof_dx12_ctx(name) struct TracyCD3D12Ctx *name
# define __prof_dx12_ctx_alloc(ctx, device, queue, name, name_len) ctx = ___tracy_d3d12_context_announce(device, queue, name, name_len)
# define __prof_dx12_ctx_release(ctx) ___tracy_d3d12_context_terminate(ctx)
# define __prof_dx12_new_frame(ctx) ___tracy_d3d12_context_new_frame(ctx)
# define __prof_dx12_collect(ctx) ___tracy_d3d12_context_collect(ctx)
#else
# define __profscope_dx11(dx11_ctx, name, color)
# define __profnc_dx11(dx11_ctx, name, color)
# define __prof_dx11_ctx_alloc(ctx, device, device_ctx, name, name_len)
# define __prof_dx11_ctx_release(ctx)
# define __prof_dx11_collect(ctx)
# define __profscope_dx12(dx11_ctx, queue, name, color)
# define __profnc_dx12(dx11_ctx, queue, name, color)
# define __prof_dx12_ctx_alloc(ctx, device, queue, name, name_len)
# define __prof_dx12_ctx_release(ctx)
# define __prof_dx12_new_frame(ctx)
@ -137,7 +141,7 @@ INLINE void __prof_dx12_zone_cleanup_func(TracyCD3D12ZoneCtx *ctx) { ___tracy_d3
#endif /* PROFILING_CAPTURE_FRAME_IMAGE */
#ifdef TRACY_FIBERS
/* Tracy fiber methods are wrapped in NO_INLINE because otherwise issues arise
/* Tracy fiber methods are wrapped in NO_INLINE because otherwise issues can arise
* across fiber context boundaries during optimization */
NO_INLINE INLINE void __prof_fiber_enter(char *fiber_name, i32 profiler_group) { TracyCFiberEnterWithHint(fiber_name, profiler_group); }
NO_INLINE INLINE void __prof_fiber_leave(void) { TracyCFiberLeave; }

View File

@ -241,10 +241,10 @@ INTERNAL SYS_THREAD_DEF(resource_watch_dispatcher_thread_entry_point, _)
while (!atomic_i32_fetch(&G.watch_shutdown)) {
sys_condition_variable_wait(G.watch_dispatcher_cv, &watch_dispatcher_lock);
if (!atomic_i32_fetch(&G.watch_shutdown) && G.watch_dispatcher_info_arena->pos > 0) {
__profscope(Dispatch resource watch callbacks);
__profn("Dispatch resource watch callbacks");
/* Unlock and sleep a bit so duplicate events pile up */
{
__profscope(Delay);
__profn("Delay");
sys_mutex_unlock(&watch_dispatcher_lock);
sys_sleep(WATCH_DISPATCHER_DELAY_SECONDS);
watch_dispatcher_lock = sys_mutex_lock_e(G.watch_dispatcher_mutex);
@ -275,7 +275,7 @@ INTERNAL SYS_THREAD_DEF(resource_watch_dispatcher_thread_entry_point, _)
{
struct dict *dedup_dict = dict_init(temp.arena, WATCH_DISPATCHER_DEDUP_DICT_BINS);
for (struct sys_watch_info *info = watch_info_list.first; info; info = info->next) {
__profscope(Dispatch);
__profn("Dispatch");
/* Do not run callbacks for the same file more than once */
b32 skip = false;
u64 hash = hash_fnv64(HASH_FNV64_BASIS, info->name);

View File

@ -649,7 +649,7 @@ struct sim_snapshot *sim_snapshot_alloc_from_lerp(struct sim_client *client, str
/* Blend entities */
{
__profscope(Lerp snapshot entities);
__profn("Lerp snapshot entities");
u64 num_entities = min_u64(ss0->num_ents_reserved, ss1->num_ents_reserved);
for (u64 i = 0; i < num_entities; ++i) {
struct sim_ent *e = &ss->ents[i];

View File

@ -427,7 +427,7 @@ INTERNAL struct sprite_sheet init_sheet_from_ase_result(struct arena *arena, str
/* Init frames */
{
__profscope(Init frames);
__profn("Init frames");
sheet.image_size = ase.image_size;
sheet.frame_size = ase.frame_size;
sheet.frames = arena_push_array(arena, struct sprite_sheet_frame, ase.num_frames);
@ -449,7 +449,7 @@ INTERNAL struct sprite_sheet init_sheet_from_ase_result(struct arena *arena, str
/* Init spans */
sheet.spans_count = ase.num_spans;
if (ase.num_spans > 0) {
__profscope(Init spans);
__profn("Init spans");
sheet.spans = arena_push_array(arena, struct sprite_sheet_span, sheet.spans_count);
sheet.spans_dict = dict_init(arena, (u64)(ase.num_spans * SHEET_SPAN_LOOKUP_TABLE_BIN_RATIO));
u64 index = 0;
@ -467,7 +467,7 @@ INTERNAL struct sprite_sheet init_sheet_from_ase_result(struct arena *arena, str
/* Init slices */
if (ase.num_slice_keys > 0) {
__profscope(Init slices);
__profn("Init slices");
struct arena_temp scratch = scratch_begin(arena);
struct temp_ase_slice_key_node {
@ -1241,7 +1241,7 @@ INTERNAL SYS_JOB_DEF(sprite_evictor_job, _)
/* Scan for evictable nodes */
b32 cache_over_budget_threshold = atomic_u64_fetch(&G.cache.memory_usage) > CACHE_MEMORY_BUDGET_THRESHOLD;
if (cache_over_budget_threshold || RESOURCE_RELOADING) {
__profscope(Evictor scan);
__profn("Evictor scan");
for (u64 i = 0; i < CACHE_BINS_COUNT; ++i) {
struct cache_bin *bin = &G.cache.bins[i];
struct sys_lock bin_lock = sys_mutex_lock_s(bin->mutex);
@ -1282,14 +1282,14 @@ INTERNAL SYS_JOB_DEF(sprite_evictor_job, _)
/* Sort evict nodes */
{
__profscope(Evictor sort);
__profn("Evictor sort");
merge_sort(evict_array, evict_array_count, sizeof(*evict_array), evict_sort, NULL);
}
/* Remove evictable nodes from cache until under budget */
struct evict_node *first_evicted = NULL;
{
__profscope(Evictor cache removal);
__profn("Evictor cache removal");
b32 stop_evicting = false;
for (u64 i = 0; i < evict_array_count && !stop_evicting; ++i) {
struct evict_node *en = &evict_array[i];
@ -1335,7 +1335,7 @@ INTERNAL SYS_JOB_DEF(sprite_evictor_job, _)
if (first_evicted) {
/* Release evicted node memory */
{
__profscope(Evictor memory release);
__profn("Evictor memory release");
for (struct evict_node *en = first_evicted; en; en = en->next_evicted) {
struct cache_entry *n = en->cache_entry;
if (n->kind == CACHE_ENTRY_KIND_TEXTURE && n->texture->valid) {
@ -1347,7 +1347,7 @@ INTERNAL SYS_JOB_DEF(sprite_evictor_job, _)
/* Add evicted nodes to free list */
{
__profscope(Evictor free list append);
__profn("Evictor free list append");
struct sys_lock pool_lock = sys_mutex_lock_e(G.cache.entry_pool_mutex);
for (struct evict_node *en = first_evicted; en; en = en->next_evicted) {
struct cache_entry *n = en->cache_entry;

View File

@ -403,7 +403,7 @@ INTERNAL void job_fiber_yield(struct fiber *fiber, struct fiber *parent_fiber);
void sys_wait(void *addr, void *cmp, u32 size)
{
//__prof;
__prof;
#if 0
WaitOnAddress(addr, cmp, size, INFINITE);
#else
@ -756,7 +756,7 @@ INTERNAL void job_fiber_entry(void *id_ptr)
while (true) {
/* Run job */
{
//__profscope(Run job);
__profn("Run job");
volatile struct yield_param *yield_param = fiber->yield_param;
yield_param->kind = YIELD_KIND_NONE;
struct sys_job_data data = ZI;
@ -783,10 +783,23 @@ INTERNAL void job_fiber_entry(void *id_ptr)
INTERNAL SYS_THREAD_DEF(worker_entry, worker_ctx_arg)
{
__prof;
struct worker_ctx *ctx = worker_ctx_arg;
(UNUSED)ctx;
{
HANDLE thread_handle = GetCurrentThread();
b32 success = false;
(UNUSED)success;
i32 priority = THREAD_PRIORITY_TIME_CRITICAL;
success = SetThreadPriority(thread_handle, priority);
ASSERT(success);
u64 affinity_mask = 1 << (ctx->id * 2);
success = !!SetThreadAffinityMask(thread_handle, affinity_mask);
ASSERT(success);
}
i32 worker_fiber_id = sys_current_fiber_id();
struct job_queue *queues[countof(G.job_queues)] = ZI;
@ -806,7 +819,7 @@ INTERNAL SYS_THREAD_DEF(worker_entry, worker_ctx_arg)
void *job_sig = 0;
struct counter *job_counter = 0;
{
//__profscope(Pull job);
//__profnc("Pull job", RGB32_F(0.75, 0.75, 0));
for (u32 queue_index = 0; queue_index < countof(queues) && !job_func; ++queue_index) {
struct job_queue *queue = queues[queue_index];
if (queue) {
@ -868,10 +881,12 @@ INTERNAL SYS_THREAD_DEF(worker_entry, worker_ctx_arg)
/* Run fiber */
if (job_func) {
//__profscope(Run fiber);
if (!job_fiber) {
job_fiber = fiber_alloc(FIBER_KIND_JOB_WORKER);
}
{
__profnc("Run fiber", RGB32_F(0.25, 0.75, 0));
__profvalue(job_fiber->id);
struct yield_param yield = ZI;
job_fiber->job_func = job_func;
job_fiber->job_sig = job_sig;
@ -987,6 +1002,7 @@ INTERNAL SYS_THREAD_DEF(worker_entry, worker_ctx_arg)
}
}
}
}
/* ========================== *
* Test entry
@ -1042,7 +1058,7 @@ struct sys_scratch_ctx *sys_scratch_ctx_from_fiber_id(i32 id)
struct fiber_ctx *fiber_ctx = fiber_ctx_from_id(id);
struct sys_scratch_ctx *scratch_ctx = &fiber_ctx->scratch_ctx;
if (!scratch_ctx->arenas[0]) {
//__profscope(Initialize scratch context);
__profn("Initialize scratch context");
for (u32 i = 0; i < countof(scratch_ctx->arenas); ++i) {
scratch_ctx->arenas[i] = arena_alloc(GIGABYTE(64));
}
@ -1323,7 +1339,7 @@ struct sys_file sys_file_open_read_wait(struct string path)
HANDLE handle;
while ((handle = CreateFileW(path_wstr, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL)) == INVALID_HANDLE_VALUE) {
if (GetLastError() == ERROR_SHARING_VIOLATION) {
__profscope(File share conflict delay);
__profn("File share conflict delay");
Sleep(delay_ms);
if (delay_ms < 1024) {
delay_ms *= 2;
@ -1869,7 +1885,7 @@ INTERNAL SYS_THREAD_DEF(window_thread_entry_point, arg)
GetMessageW(&msg, 0, 0, 0);
}
{
__profscope(Process window message);
__profn("Process window message");
if (atomic_i32_fetch(&window->event_thread_shutdown)) {
break;
}
@ -3035,7 +3051,7 @@ INTERNAL void win32_precise_sleep_timer(HANDLE timer, f64 seconds)
i64 max_ticks = (i64)scheduler_period_ms * 9500;
while (true) {
__profscope(Sleep part);
__profn("Sleep part");
/* Break sleep up into parts that are lower than scheduler period */
f64 remaining_seconds = (f64)(target_qpc - qpc.QuadPart) / (f64)qpc_per_second;
i64 sleep_ticks = (i64)((remaining_seconds - tolerance) * 10000000);
@ -3051,7 +3067,7 @@ INTERNAL void win32_precise_sleep_timer(HANDLE timer, f64 seconds)
/* Spin for any remaining time */
{
__profscope(Sleep spin);
__profn("Sleep spin");
while (qpc.QuadPart < target_qpc) {
YieldProcessor();
QueryPerformanceCounter(&qpc);
@ -3079,14 +3095,14 @@ INTERNAL void win32_precise_sleep_legacy(f64 seconds)
f64 sleep_ms = (seconds * 1000) - tolerance;
i32 sleep_slices = (i32)(sleep_ms / scheduler_period_ms);
if (sleep_slices > 0) {
__profscope(Legacy sleep part);
__profn("Legacy sleep part");
Sleep((DWORD)sleep_slices * scheduler_period_ms);
}
QueryPerformanceCounter(&qpc);
/* Spin for any remaining time */
{
__profscope(Legacy sleep spin);
__profn("Legacy sleep spin");
while (qpc.QuadPart < target_qpc) {
YieldProcessor();
QueryPerformanceCounter(&qpc);
@ -3162,7 +3178,7 @@ int CALLBACK wWinMain(_In_ HINSTANCE instance, _In_opt_ HINSTANCE prev_instance,
#if PROFILING
{
__profscope(Launch profiler);
__profn("Launch profiler");
STARTUPINFO si = ZI;
si.cb = sizeof(si);
PROCESS_INFORMATION pi = ZI;

View File

@ -174,7 +174,7 @@ struct ttf_decode_result ttf_decode(struct arena *arena, struct string encoded,
u32 out_offset_y = 0;
u32 row_height = 0;
{
__profscope(Build atlas);
__profn("Build atlas");
for (u16 i = 0; i < glyph_count; ++i) {
/* Render glyph to target */
DWRITE_GLYPH_RUN glyph_run = ZI;

View File

@ -1147,7 +1147,7 @@ INTERNAL void user_update(void)
{
/* Copy valid entities */
{
__profscope(Build ents list for sorting);
__profn("Build ents list for sorting");
for (u64 ent_index = 0; ent_index < G.ss_blended->num_ents_reserved; ++ent_index) {
struct sim_ent *ent = &G.ss_blended->ents[ent_index];
if (sim_ent_is_valid_and_active(ent)) {
@ -1158,7 +1158,7 @@ INTERNAL void user_update(void)
}
/* Sort */
{
__profscope(Sort ents);
__profn("Sort ents");
merge_sort(sorted, sorted_count, sizeof(*sorted), ent_draw_order_cmp, NULL);
}
}
@ -1168,7 +1168,7 @@ INTERNAL void user_update(void)
* ========================== */
{
__profscope(Draw entities);
__profn("Draw entities");
for (u64 sorted_index = 0; sorted_index < sorted_count; ++sorted_index) {
struct sim_ent *ent = sorted[sorted_index];
if (!sim_ent_is_valid_and_active(ent)) continue;
@ -1694,7 +1694,7 @@ INTERNAL void user_update(void)
/* Draw crosshair or show cursor */
if (!G.debug_camera) {
__profscope(Draw crosshair);
__profn("Draw crosshair");
struct v2 crosshair_pos = G.user_cursor;
struct sprite_tag crosshair = sprite_tag_from_path(LIT("sprite/crosshair.ase"));
struct sprite_texture *t = sprite_texture_from_tag_async(sprite_frame_scope, crosshair);
@ -1706,7 +1706,7 @@ INTERNAL void user_update(void)
/* FIXME: Enable this */
#if 0
{
__profscope(Update window cursor);
__profn("Update window cursor");
if (G.debug_camera) {
sys_window_cursor_disable_clip(G.window);
sys_window_cursor_show(G.window);
@ -1924,7 +1924,7 @@ INTERNAL void user_update(void)
* ========================== */
if (G.debug_draw) {
__profscope(Draw debug info);
__profn("Draw debug info");
struct font *font = font_load_async(LIT("font/fixedsys.ttf"), 12.0f);
if (font) {
struct arena_temp temp = arena_temp_begin(scratch.arena);
@ -2047,7 +2047,7 @@ INTERNAL void user_update(void)
* ========================== */
{
__profscope(Render);
__profn("Render");
struct rect user_viewport = RECT_FROM_V2(V2(0, 0), G.user_size);
struct v2i32 user_resolution = v2_round_to_int(user_viewport.size);
@ -2107,7 +2107,7 @@ INTERNAL SYS_JOB_DEF(user_job, _)
while (!atomic_i32_fetch(&G.shutdown)) {
{
__profscope(User sleep);
__profn("User sleep");
sleep_frame(last_frame_ns, target_dt_ns);
}
last_frame_ns = sys_time_ns();
@ -2268,11 +2268,11 @@ INTERNAL SYS_JOB_DEF(local_sim_job, _)
while (!atomic_i32_fetch(&G.shutdown)) {
struct arena_temp scratch = scratch_begin_no_conflict();
{
__profscope(Sim sleep);
__profn("Sim sleep");
sleep_frame(real_time_ns, step_dt_ns * compute_timescale);
}
{
__profscope(Sim update);
__profn("Sim update");
real_dt_ns = sys_time_ns() - real_time_ns;
real_time_ns += real_dt_ns;