diff --git a/src/gp_dx12.c b/src/gp_dx12.c
index 1d8df737..0767845e 100644
--- a/src/gp_dx12.c
+++ b/src/gp_dx12.c
@@ -2768,8 +2768,6 @@ INTERNAL void present_blit(struct swapchain_buffer *dst, struct dx12_resource *s
 void gp_present(struct sys_window *window, struct v2i32 backresolution, struct gp_resource *texture, struct xform texture_xf, i32 vsync)
 {
 	__prof;
-	//sys_sleep(0.1);
-
 	struct swapchain *swapchain = &G.swapchain;
 	struct swapchain_buffer *swapchain_buffer = update_swapchain(swapchain, window, backresolution);
 	struct dx12_resource *texture_resource = (struct dx12_resource *)texture;
@@ -2777,8 +2775,6 @@ void gp_present(struct sys_window *window, struct v2i32 backresolution, struct g
 	/* Blit */
 	present_blit(swapchain_buffer, texture_resource, texture_xf);
 
-	//sys_sleep(0.1);
-
 	u32 present_flags = 0;
 	if (!vsync) {
 		present_flags |= (DXGI_PRESENT_ALLOW_TEARING * DX12_ALLOW_TEARING);
diff --git a/src/prof_tracy.h b/src/prof_tracy.h
index bc12e952..415e696d 100644
--- a/src/prof_tracy.h
+++ b/src/prof_tracy.h
@@ -7,7 +7,7 @@
 
 #if PROFILING
 
-#define PROFILING_SYSTEM_TRACE 0
+#define PROFILING_SYSTEM_TRACE 1
 #define PROFILING_CAPTURE_FRAME_IMAGE 0
 #define PROFILING_LOCKS 0
 #define PROFILING_D3D 1
diff --git a/src/resource.c b/src/resource.c
index d47f25f0..e6b4d51f 100644
--- a/src/resource.c
+++ b/src/resource.c
@@ -247,7 +247,7 @@ INTERNAL SYS_THREAD_DEF(resource_watch_dispatcher_thread_entry_point, _)
 		{
 			__profn("Delay");
 			snc_unlock(&watch_dispatcher_lock);
-			sys_sleep(WATCH_DISPATCHER_DELAY_SECONDS);
+			sys_wait(NULL, NULL, 0, NS_FROM_SECONDS(WATCH_DISPATCHER_DELAY_SECONDS));
 			watch_dispatcher_lock = snc_lock_e(&G.watch_dispatcher_mutex);
 		}
 		if (!atomic_i32_fetch(&G.watch_shutdown)) {
diff --git a/src/sprite.c b/src/sprite.c
index 91aa7eb3..98e4a213 100644
--- a/src/sprite.c
+++ b/src/sprite.c
@@ -152,6 +152,7 @@ GLOBAL struct {
 
 	/* Evictor */
 	struct atomic_i32 evictor_cycle;
+	struct snc_counter shutdown_counter;
 	b32 evictor_scheduler_shutdown;
 	struct snc_mutex evictor_scheduler_mutex;
 	struct snc_cv evictor_scheduler_shutdown_cv;
@@ -248,7 +249,7 @@ struct sprite_startup_receipt sprite_startup(struct gp_startup_receipt *gp_sr,
 
 	G.scopes_arena = arena_alloc(GIBI(64));
 
-	sys_run(1, sprite_evictor_job, NULL, SYS_PRIORITY_BACKGROUND, NULL);
+	sys_run(1, sprite_evictor_job, NULL, SYS_PRIORITY_BACKGROUND, &G.shutdown_counter);
 
 	app_register_exit_callback(&sprite_shutdown);
 	resource_register_watch_callback(&sprite_resource_watch_callback);
@@ -266,6 +267,8 @@ INTERNAL APP_EXIT_CALLBACK_FUNC_DEF(sprite_shutdown)
 		snc_cv_broadcast(&G.evictor_scheduler_shutdown_cv);
 		snc_unlock(&lock);
 	}
+	/* Wait for evictor shutdown */
+	snc_counter_wait(&G.shutdown_counter);
 }
 
 /* ========================== *
diff --git a/src/sys.h b/src/sys.h
index f9107df4..8ca5892e 100644
--- a/src/sys.h
+++ b/src/sys.h
@@ -416,17 +416,6 @@
 u32 sys_num_logical_processors(void);
 void sys_exit(void);
 void sys_panic(struct string msg);
-/* ========================== *
- * Sleep
- * ========================== */
-
-/* Sleep for precisely the amount of time specified (more cpu intensive) */
-void sys_sleep_precise(f64 seconds);
-
-/* Sleep for the amount of time specified rounded to the OS scheduler period
- * (less cpu intensive) */
-void sys_sleep(f64 seconds);
-
 /* ========================== *
  * Command line
  * ========================== */
diff --git a/src/sys_win32.c b/src/sys_win32.c
index 22747d5c..9d174471 100644
--- a/src/sys_win32.c
+++ b/src/sys_win32.c
@@ -109,7 +109,7 @@ struct win32_window {
  * NOTE: This is not the actual rate that the scheduler runs at, just the
  * minimum amount of time that it can refer to. Smaller values mean that the
  * scheduler has to process a greater number of wait lists upon waking up. */
-#define SCHEDULER_MIN_INTERVAL_NS (KIBI(256)) /* ~256 microseconds */
+#define SCHEDULER_MIN_INTERVAL_NS (KIBI(256)) /* ~262 microseconds */
 
 struct alignas(64) wait_list {
 	/* =================================================== */
@@ -307,7 +307,7 @@ GLOBAL struct {
 
 
 	/* Scheduler */
-	struct atomic_i64 current_scheduler_interval; /* TODO: Prevent false sharing */
+	struct atomic_i64 last_scheduler_interval; /* TODO: Prevent false sharing */
 
 	/* Wait lists */
 	struct atomic_u64 waiter_wake_gen; /* TODO: Prevent false sharing */
@@ -330,6 +330,7 @@ GLOBAL struct {
 
 	/* Workers */
 	struct atomic_i64 workers_wake_gen; /* TODO: Prevent false sharing */
+	struct atomic_i64 num_jobs_in_queue; /* TODO: Prevent false sharing */
 	struct snc_mutex workers_wake_mutex;
 	struct snc_cv workers_wake_cv;
 
@@ -391,7 +392,11 @@ void sys_wait(void *addr, void *cmp, u32 size, i64 timeout_ns)
 			timeout_ms = timeout_ns / 1000000;
 			timeout_ms += (timeout_ms == 0) * math_fsign(timeout_ns);
 		}
-		WaitOnAddress(addr, cmp, size, timeout_ms);
+		if (addr == NULL) {
+			Sleep(timeout_ms);
+		} else {
+			WaitOnAddress(addr, cmp, size, timeout_ms);
+		}
 	}
 }
 
@@ -587,6 +592,7 @@ void sys_wake_all(void *addr)
 	if (num_waiters > 0) {
 		struct snc_lock lock = snc_lock_e(&G.workers_wake_mutex);
 		{
+			atomic_i64_fetch_add(&G.num_jobs_in_queue, num_waiters);
 			if (atomic_i64_fetch(&G.workers_wake_gen) >= 0) {
 				atomic_i64_fetch_add(&G.workers_wake_gen, 1);
 				snc_cv_broadcast(&G.workers_wake_cv);
@@ -755,6 +761,7 @@ void sys_run(i32 count, sys_job_func *func, void *sig, enum sys_priority priorit
 	/* TODO: Only wake necessary amount of workers */
 	struct snc_lock lock = snc_lock_e(&G.workers_wake_mutex);
 	{
+		atomic_i64_fetch_add(&G.num_jobs_in_queue, count);
 		if (atomic_i64_fetch(&G.workers_wake_gen) >= 0) {
 			atomic_i64_fetch_add(&G.workers_wake_gen, 1);
 			snc_cv_broadcast(&G.workers_wake_cv);
@@ -870,7 +877,6 @@ INTERNAL SYS_THREAD_DEF(job_worker_entry, worker_ctx_arg)
 	i64 last_seen_wake_gen = 0;
 	while (last_seen_wake_gen >= 0) {
 		/* Pull job from queue */
-		b32 queues_empty = true;
 		enum sys_priority job_priority = 0;
 		i16 job_fiber_id = 0;
 		i32 job_id = 0;
@@ -878,7 +884,7 @@ INTERNAL SYS_THREAD_DEF(job_worker_entry, worker_ctx_arg)
 		void *job_sig = 0;
 		struct snc_counter *job_counter = 0;
 		{
-			__profnc("Pull job", RGB32_F(0.75, 0.75, 0));
+			//__profnc("Pull job", RGB32_F(0.75, 0.75, 0));
 			for (u32 queue_index = 0; queue_index < countof(queues) && !job_func; ++queue_index) {
 				struct job_queue *queue = queues[queue_index];
 				if (queue) {
@@ -894,6 +900,7 @@ INTERNAL SYS_THREAD_DEF(job_worker_entry, worker_ctx_arg)
 							job_id = info->num_dispatched++;
 							if (job_id < info->count) {
 								/* Pick job */
+								atomic_i64_fetch_add(&G.num_jobs_in_queue, -1);
 								job_func = info->func;
 								job_sig = info->sig;
 								job_counter = info->counter;
@@ -901,21 +908,16 @@ INTERNAL SYS_THREAD_DEF(job_worker_entry, worker_ctx_arg)
 									/* We're picking up the last dispatch, so dequeue the job */
 									dequeue = true;
 								}
-								if (!next) {
-									queues_empty = queue_index >= ((i32)countof(queues) - 1);
-								}
 							}
 						} else {
 							/* This job is to be resumed from a yield */
+							atomic_i64_fetch_add(&G.num_jobs_in_queue, -1);
 							job_fiber_id = info->fiber_id;
 							job_id = info->num_dispatched;
 							job_func = info->func;
 							job_sig = info->sig;
 							job_counter = info->counter;
 							dequeue = true;
-							if (!next) {
-								queues_empty = queue_index >= ((i32)countof(queues) - 1);
-							}
 						}
 						if (dequeue) {
 							if (!next) {
@@ -978,7 +980,7 @@ INTERNAL SYS_THREAD_DEF(job_worker_entry, worker_ctx_arg)
 					i64 wait_timeout_ns = yield.wait.timeout_ns;
 					i64 wait_time = 0;
 					if (wait_timeout_ns > 0 && wait_timeout_ns < I64_MAX) {
-						wait_time = atomic_i64_fetch(&G.current_scheduler_interval) + (wait_timeout_ns / SCHEDULER_MIN_INTERVAL_NS);
+						wait_time = (sys_time_ns() + wait_timeout_ns) / SCHEDULER_MIN_INTERVAL_NS - 1;
 					}
 
 					u64 wait_addr_bin_index = (u64)wait_addr % NUM_WAIT_ADDR_BINS;
@@ -990,7 +992,7 @@ INTERNAL SYS_THREAD_DEF(job_worker_entry, worker_ctx_arg)
 					{
 						if (wait_time != 0) while (atomic_i32_fetch_test_set(&wait_time_bin->lock, 0, 1) != 0) ix_pause();
 						{
-							b32 cancel_wait = true;
+							b32 cancel_wait = wait_addr == 0 && wait_time == 0;
 							if (wait_addr != 0) {
 								switch (wait_size) {
 								case 1: cancel_wait = (u8)_InterlockedCompareExchange8(wait_addr, 0, 0) != *(u8 *)wait_cmp; break;
@@ -1001,7 +1003,7 @@ INTERNAL SYS_THREAD_DEF(job_worker_entry, worker_ctx_arg)
 								}
 							}
 							if (wait_time != 0 && !cancel_wait) {
-								cancel_wait = atomic_i64_fetch(&G.current_scheduler_interval) > wait_time;
+								cancel_wait = wait_time <= atomic_i64_fetch(&G.last_scheduler_interval);
 							}
 							if (!cancel_wait) {
 								if (wait_addr != 0) {
@@ -1095,76 +1097,6 @@ INTERNAL SYS_THREAD_DEF(job_worker_entry, worker_ctx_arg)
 							if (wait_time != 0) atomic_i32_fetch_set(&wait_time_bin->lock, 0);
 						}
 						if (wait_addr != 0) atomic_i32_fetch_set(&wait_addr_bin->lock, 0);
-
-
-
-
-
-
-
-
-#if 0
-					while (atomic_i32_fetch_test_set(&wait_addr_bin->lock, 0, 1) != 0) ix_pause();
-					{
-						/* Load and compare values now that bin is locked */
-						b32 cancel_wait;
-						switch (wait_size) {
-						case 1: cancel_wait = (u8)_InterlockedCompareExchange8(wait_addr, 0, 0) == *(u8 *)wait_cmp; break;
-						case 2: cancel_wait = (u16)_InterlockedCompareExchange16(wait_addr, 0, 0) == *(u16 *)wait_cmp; break;
-						case 4: cancel_wait = (u32)_InterlockedCompareExchange(wait_addr, 0, 0) == *(u32 *)wait_cmp; break;
-						case 8: cancel_wait = (u64)_InterlockedCompareExchange64(wait_addr, 0, 0) == *(u64 *)wait_cmp; break;
-						default: cancel_wait = true; ASSERT(false); break; /* Invalid wait size */
-						}
-						if (!cancel_wait) {
-							/* Search addr wait list in bin */
-							struct wait_list *wait_addr_list = NULL;
-							for (struct wait_list *tmp = wait_addr_bin->first_wait_list; tmp && !wait_addr_list; tmp = tmp->next_in_bin) {
-								if (tmp->value == (u64)wait_addr) {
-									wait_addr_list = tmp;
-								}
-							}
-
-							/* Allocate new wait list */
-							if (!wait_addr_list) {
-								if (wait_addr_bin->first_free_wait_list) {
-									wait_addr_list = wait_addr_bin->first_free_wait_list;
-									wait_addr_bin->first_free_wait_list = wait_addr_list->next_in_bin;
-								} else {
-									while (atomic_i32_fetch_test_set(&G.wait_lists_arena_lock, 0, 1) != 0) ix_pause();
-									{
-										wait_addr_list = arena_push_no_zero(G.wait_lists_arena, struct wait_list);
-									}
-									atomic_i32_fetch_set(&G.wait_lists_arena_lock, 0);
-								}
-								MEMZERO_STRUCT(wait_addr_list);
-								wait_addr_list->value = wait_addr;
-								if (wait_addr_bin->last_wait_list) {
-									wait_addr_bin->last_wait_list->next_in_bin = wait_addr_list;
-									wait_addr_list->prev_in_bin = wait_addr_bin->last_wait_list;
-								} else {
-									wait_addr_bin->first_wait_list = wait_addr_list;
-								}
-								wait_addr_bin->last_wait_list = wait_addr_list;
-							}
-
-							/* Insert fiber into wait list */
-							job_fiber->wait_addr = wait_addr;
-							if (wait_addr_list->last_waiter) {
-								fiber_from_id(wait_addr_list->last_waiter)->next_addr_waiter = job_fiber_id;
-								job_fiber->prev_addr_waiter = wait_addr_list->last_waiter;
-							} else {
-								wait_addr_list->first_waiter = job_fiber_id;
-							}
-							wait_addr_list->last_waiter = job_fiber_id;
-							++wait_addr_list->num_waiters;
-
-							/* Pop worker's job fiber */
-							job_fiber = NULL;
-							done = true;
-						}
-					}
-					atomic_i32_fetch_set(&wait_addr_bin->lock, 0);
-#endif
 				} break;
 
 				case YIELD_KIND_DONE:
@@ -1182,7 +1114,7 @@ INTERNAL SYS_THREAD_DEF(job_worker_entry, worker_ctx_arg)
 		/* Wait */
 		struct snc_lock wake_lock = snc_lock_s(&G.workers_wake_mutex);
 		{
-			if (queues_empty) {
+			if (atomic_i64_fetch(&G.num_jobs_in_queue) <= 0) {
 				i64 new_wake_gen = atomic_i64_fetch(&G.workers_wake_gen);
 				while (new_wake_gen == last_seen_wake_gen) {
 					__profnc("Wait for job", RGB32_F(0.75, 0.75, 0));
@@ -1224,14 +1156,15 @@ INTERNAL SYS_THREAD_DEF(job_scheduler_entry, _)
 		{
 			__profn("Job scheduler wait");
 			LARGE_INTEGER due = ZI;
-			due.QuadPart = -(SCHEDULER_MIN_INTERVAL_NS / 100);
+			//due.QuadPart = -(SCHEDULER_MIN_INTERVAL_NS / 100);
+			due.QuadPart = 0;
 			SetWaitableTimerEx(timer, &due, 0, NULL, NULL, NULL, 0);
 			WaitForSingleObject(timer, INFINITE);
 		}
 
 		u64 wake_gen = atomic_u64_fetch_add_u64(&G.waiter_wake_gen, 1);
 		i64 new_interval = sys_time_ns() / SCHEDULER_MIN_INTERVAL_NS;
-		atomic_i64_fetch_set(&G.current_scheduler_interval, new_interval);
+		atomic_i64_fetch_set(&G.last_scheduler_interval, new_interval);
 		{
 			__profn("Job scheduler run");
 			struct arena_temp temp = arena_temp_begin(scratch.arena);
@@ -1405,6 +1338,7 @@ INTERNAL SYS_THREAD_DEF(job_scheduler_entry, _)
 		if (num_waiters > 0) {
 			struct snc_lock lock = snc_lock_e(&G.workers_wake_mutex);
 			{
+				atomic_i64_fetch_add(&G.num_jobs_in_queue, num_waiters);
 				if (atomic_i64_fetch(&G.workers_wake_gen) >= 0) {
 					atomic_i64_fetch_add(&G.workers_wake_gen, 1);
 					snc_cv_broadcast(&G.workers_wake_cv);
@@ -1433,9 +1367,10 @@ INTERNAL SYS_THREAD_DEF(test_entry, _)
 
 	/* Start scheduler */
 	struct sys_thread *scheduler_thread = sys_thread_alloc(job_scheduler_entry, NULL, LIT("Scheduler thread"), PROF_THREAD_GROUP_SCHEDULER);
-	while (atomic_i64_fetch(&G.current_scheduler_interval) == 0) ix_pause();
+	while (atomic_i64_fetch(&G.last_scheduler_interval) == 0) ix_pause();
 
 	/* Start workers */
+	//G.num_worker_threads = 1;
 	G.num_worker_threads = 6;
 	G.worker_threads_arena = arena_alloc(GIBI(64));
 	G.worker_threads = arena_push_array(G.worker_threads_arena, struct sys_thread *, G.num_worker_threads);
@@ -3241,119 +3176,6 @@ void sys_panic(struct string msg)
 	}
 }
 
-/* ========================== *
- * Sleep
- * ========================== */
-
-/* https://blog.bearcats.nl/perfect-sleep-function/ */
-
-INTERNAL void win32_precise_sleep_timer(HANDLE timer, f64 seconds)
-{
-	__prof;
-
-	/* TODO: Does the high frequency timer even require setting / scaling of
-	 * timeBeginPeriod/scheduler_period_ms? There isn't much documentation. */
-
-	i64 qpc_per_second = G.qpc_per_second;
-	i32 scheduler_period_ms = G.scheduler_period_ms;
-
-	LARGE_INTEGER qpc;
-	QueryPerformanceCounter(&qpc);
-	i64 target_qpc = (i64)(qpc.QuadPart + seconds * qpc_per_second);
-
-	/* TODO: Maybe increase tolerance for higher precision but more power usage */
-	//const f64 tolerance = scheduler_period_ms * 0.001200;
-	const f64 tolerance = scheduler_period_ms * 0.000520;
-	//const f64 tolerance = scheduler_period_ms * 1;
-
-	i64 max_ticks = (i64)scheduler_period_ms * 9500;
-	while (true) {
-		__profn("Sleep part");
-		/* Break sleep up into parts that are lower than scheduler period */
-		f64 remaining_seconds = (f64)(target_qpc - qpc.QuadPart) / (f64)qpc_per_second;
-		i64 sleep_ticks = (i64)((remaining_seconds - tolerance) * 10000000);
-		if (sleep_ticks <= 0) {
-			break;
-		}
-		LARGE_INTEGER due;
-		due.QuadPart = -(sleep_ticks > max_ticks ? max_ticks : sleep_ticks);
-		SetWaitableTimerEx(timer, &due, 0, NULL, NULL, NULL, 0);
-		WaitForSingleObject(timer, INFINITE);
-		QueryPerformanceCounter(&qpc);
-	}
-
-	/* Spin for any remaining time */
-	{
-		__profn("Sleep spin");
-		while (qpc.QuadPart < target_qpc) {
-			YieldProcessor();
-			QueryPerformanceCounter(&qpc);
-		}
-	}
-}
-
-INTERNAL void win32_precise_sleep_legacy(f64 seconds)
-{
-	__prof;
-	i64 qpc_per_second = G.qpc_per_second;
-	i32 scheduler_period_ms = G.scheduler_period_ms;
-
-	LARGE_INTEGER qpc;
-	QueryPerformanceCounter(&qpc);
-	i64 target_qpc = (i64)(qpc.QuadPart + seconds * qpc_per_second);
-
-	/* TODO: Calculate tolerance */
-
-	/* TODO: Maybe increase tolerance for higher precision but more power usage */
-	//const double tolerance = 1.02;
-	const double tolerance = 0.52 * scheduler_period_ms;
-
-	/* Sleep */
-	f64 sleep_ms = (seconds * 1000) - tolerance;
-	i32 sleep_slices = (i32)(sleep_ms / scheduler_period_ms);
-	if (sleep_slices > 0) {
-		__profn("Legacy sleep part");
-		Sleep((DWORD)sleep_slices * scheduler_period_ms);
-	}
-	QueryPerformanceCounter(&qpc);
-
-	/* Spin for any remaining time */
-	{
-		__profn("Legacy sleep spin");
-		while (qpc.QuadPart < target_qpc) {
-			YieldProcessor();
-			QueryPerformanceCounter(&qpc);
-		}
-	}
-}
-
-void sys_sleep_precise(f64 seconds)
-{
-	__prof;
-	/* FIXME: Enable this */
-#if 0
-	HANDLE timer = ctx->sleep_timer;
-	if (timer) {
-		/* Use newer sleeping method */
-		win32_precise_sleep_timer(timer, seconds);
-	} else {
-		/* Fall back to older sleep method if CREATE_WAITABLE_TIMER_HIGH_RESOLUTION
-		 * is not available due to older windows version */
-		win32_precise_sleep_legacy(seconds);
-	}
-#else
-	(UNUSED)win32_precise_sleep_timer;
-	win32_precise_sleep_legacy(seconds);
-#endif
-}
-
-void sys_sleep(f64 seconds)
-{
-	__prof;
-	u32 ms = max_u32(1, math_round_to_int((f32)(seconds * 1000.0)));
-	Sleep(ms);
-}
-
 /* ========================== *
  * Command line
  * ========================== */
diff --git a/src/user.c b/src/user.c
index ccbdc424..11e0acc1 100644
--- a/src/user.c
+++ b/src/user.c
@@ -49,6 +49,7 @@ struct console_log {
 
 GLOBAL struct {
 	struct atomic_i32 shutdown;
+	struct snc_counter shutdown_job_counters;
 
 	struct sim_ctx *local_sim_ctx;
 
@@ -253,8 +254,8 @@ struct user_startup_receipt user_startup(struct gp_startup_receipt *gp_sr,
 	sys_window_register_event_callback(G.window, &window_event_callback);
 
 	/* Start jobs */
-	sys_run(1, local_sim_job, NULL, SYS_PRIORITY_HIGH, NULL);
-	sys_run(1, user_job, NULL, SYS_PRIORITY_HIGH, NULL);
+	sys_run(1, local_sim_job, NULL, SYS_PRIORITY_HIGH, &G.shutdown_job_counters);
+	sys_run(1, user_job, NULL, SYS_PRIORITY_HIGH, &G.shutdown_job_counters);
 
 	app_register_exit_callback(&user_shutdown);
 	return (struct user_startup_receipt) { 0 };
@@ -263,9 +264,11 @@ struct user_startup_receipt user_startup(struct gp_startup_receipt *gp_sr,
 INTERNAL APP_EXIT_CALLBACK_FUNC_DEF(user_shutdown)
 {
 	__prof;
-	sys_window_unregister_event_callback(G.window, &window_event_callback);
 
+	/* Signal shutdown */
 	atomic_i32_fetch_set(&G.shutdown, true);
+	/* Wait for jobs shutdown */
+	snc_counter_wait(&G.shutdown_job_counters);
 }
 
 /* ========================== *
diff --git a/src/util.h b/src/util.h
index f4e46bc6..684fa4a6 100644
--- a/src/util.h
+++ b/src/util.h
@@ -261,6 +261,33 @@ INLINE void dict_remove_entry(struct dict *dict, struct dict_entry *entry)
  * Sleep frame
  * ========================== */
 
+INLINE void sleep_precise(i64 sleep_time_ns)
+{
+	__prof;
+
+	i64 tolerance = 200000;
+	i64 big_sleep = 500000;
+
+	i64 now_ns = sys_time_ns();
+	i64 target_ns = now_ns + sleep_time_ns;
+
+	/* Sleep */
+	while (now_ns < target_ns - big_sleep - tolerance) {
+		__profn("Sleep part");
+		sys_wait(NULL, NULL, 0, big_sleep);
+		now_ns = sys_time_ns();
+	}
+
+	/* Spin */
+	{
+		__profn("Sleep spin");
+		while (now_ns < target_ns) {
+			ix_pause();
+			now_ns = sys_time_ns();
+		}
+	}
+}
+
 INLINE void sleep_frame(i64 last_frame_time_ns, i64 target_dt_ns)
 {
 	if (last_frame_time_ns != 0 && target_dt_ns > 0) {
@@ -268,7 +295,7 @@ INLINE void sleep_frame(i64 last_frame_time_ns, i64 target_dt_ns)
 		i64 last_frame_dt_ns = now_ns - last_frame_time_ns;
 		i64 sleep_time_ns = target_dt_ns - last_frame_dt_ns;
 		if (sleep_time_ns > 0) {
-			sys_sleep_precise(SECONDS_FROM_NS(sleep_time_ns));
+			sleep_precise(sleep_time_ns);
 		}
 	}
 }
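
Editor's note (not part of the patch): the new sleep_precise() in util.h replaces the old Win32-only sys_sleep_precise() with a simpler pattern — take coarse ~500 us sleeps through sys_wait(NULL, NULL, 0, ...) while far from the target time, then spin with ix_pause() for the last ~200 us. The sketch below illustrates the same coarse-sleep-then-spin shape in portable C, substituting POSIX clock_gettime()/nanosleep() for the project's sys_time_ns()/sys_wait()/ix_pause() helpers; only the 500 us step and 200 us tolerance values are taken from the patch, everything else is an assumption for illustration.

/* Illustrative sketch only -- mirrors sleep_precise() from the patch,
 * but with portable POSIX calls instead of the project's helpers. */
#include <stdint.h>
#include <time.h>

static int64_t sketch_now_ns(void)
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
}

static void sketch_sleep_precise(int64_t sleep_time_ns)
{
	const int64_t tolerance = 200000; /* 200 us of slack, as in the patch */
	const int64_t big_sleep = 500000; /* 500 us coarse sleep step, as in the patch */
	int64_t now = sketch_now_ns();
	int64_t target = now + sleep_time_ns;

	/* Coarse sleeps while comfortably far from the target */
	while (now < target - big_sleep - tolerance) {
		struct timespec step = { 0, (long)big_sleep };
		nanosleep(&step, NULL);
		now = sketch_now_ns();
	}
	/* Spin for the remainder to land close to the target */
	while (now < target) {
		now = sketch_now_ns();
	}
}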