#include "work.h"
#include "intrinsics.h"
#include "sys.h"
#include "arena.h"
#include "scratch.h"
#include "memory.h"
#include "string.h"
#include "log.h"

/* Terminology:
 *
 * Task: Single unit of stuff to be done (a function with a data pointer)
 *
 * Work: A group of tasks (doesn't have to be homogeneous) bundled together.
 * Work is "complete" when all of its tasks are complete.
 *
 * Work Slate: A list of tasks used as a building-tool for constructing work.
 *
 * Worker: A thread that can do work. "work_startup" will create a certain
 * amount of dedicated worker threads. Note that non-worker threads can
 * also do work themselves (IE: callers of "work_wait")
 */

/* NOTE:
 * Functions suffixed with "assume_locked" require `L.mutex` to be
 * locked & unlocked by the caller.
 */

/* A dedicated worker thread; node of the singly-linked `L.worker_head` list. */
struct worker {
    struct sys_thread thread;
    struct worker *next;
};

struct work_task;

struct work {
    enum work_priority priority;
    enum work_status status;
    u32 workers;                 /* Threads currently executing a task of this work */
    struct sys_condition_variable condition_variable_finished;
    /* Doubly-linked scheduled-work list, ordered by priority */
    struct work *prev_scheduled;
    struct work *next_scheduled;
    struct work *next_free;      /* Free-list link (valid only while released) */
    struct work_task *task_head; /* Unstarted task head */
    u32 tasks_incomplete;        /* Tasks not yet finished (started or not) */
    u64 gen;                     /* Generation counter; validates work handles */
};

struct work_task {
    void *data;
    work_task_func *func;
    struct work *work;
    struct work_task *next_in_work;
    struct work_task *next_free; /* Free-list link (valid only while released) */
};

/* ==========================
 * Global state
 * ========================== */

GLOBAL struct {
    b32 shutdown;
    struct arena arena;
    struct sys_mutex mutex;
    struct sys_semaphore semaphore;
    u32 worker_count;
    u32 idle_worker_count;
    struct worker *worker_head;
    /* TODO: Make below pointers volatile? */
    struct work_task *free_task_head;
    struct work *free_work_head;
    struct work *scheduled_work_head;
    /* Pointers to the last piece of work of each priority in the scheduled
     * work list (used for O(1) insertion) */
    struct work *scheduled_work_priority_tails[NUM_WORK_PRIORITIES];
} L = { 0 }, DEBUG_LVAR(L_work);

INTERNAL void worker_thread_entry_point(void *thread_data);

/* ==========================
 * Startup
 * ========================== */

/* Allocates the pool arena, synchronization primitives and
 * `num_worker_threads` dedicated worker threads. Panics on a zero count. */
void work_startup(u32 num_worker_threads)
{
    struct temp_arena scratch = scratch_begin_no_conflict();
    /* `num_worker_threads` is unsigned, so zero is the only invalid value */
    if (num_worker_threads == 0) {
        sys_panic(STR("Tried to start up worker pool with 0 threads"));
    }
    L.arena = arena_alloc(GIGABYTE(64));
    L.mutex = sys_mutex_alloc();
    L.semaphore = sys_semaphore_alloc(num_worker_threads);
    L.worker_count = num_worker_threads;
    L.idle_worker_count = num_worker_threads;
    /* Initialize threads */
    {
        sys_mutex_lock(&L.mutex);
        {
            struct worker *prev = NULL;
            for (u32 i = 0; i < num_worker_threads; ++i) {
                struct string thread_name = string_format(scratch.arena,
                                                          STR("[P0] Worker %F"),
                                                          FMT_UINT(i));
                struct worker *worker = arena_push_zero(&L.arena, struct worker);
                worker->thread = sys_thread_init(&worker_thread_entry_point, NULL,
                                                 thread_name);
                if (prev) {
                    prev->next = worker;
                } else {
                    L.worker_head = worker;
                }
                prev = worker;
            }
        }
        sys_mutex_unlock(&L.mutex);
    }
    scratch_end(scratch);
}

/* Signals every worker to exit and joins them all. */
void work_shutdown(void)
{
    L.shutdown = true;
    WRITE_BARRIER();
    sys_semaphore_signal(&L.semaphore, L.worker_count);
    /* FIX: the previous loop advanced to `worker->next` in the loop condition
     * before the first iteration, so the head worker was never joined. */
    for (struct worker *worker = L.worker_head; worker; worker = worker->next) {
        sys_thread_join(&worker->thread);
    }
}

/* ==========================
 * Internal work / task allocation
 * ========================== */

INTERNAL struct work *work_alloc_assume_locked(void)
{
    __prof;
    sys_mutex_assert_locked(&L.mutex);
    struct work *work = NULL;
    /* Allocate work */
    if (L.free_work_head) {
        /* Reuse from free list: keep the already-allocated condition variable
         * and the generation counter, zero everything else */
        work = L.free_work_head;
        L.free_work_head = work->next_free;
        *work = (struct work) {
            .condition_variable_finished = work->condition_variable_finished,
            .gen = work->gen
        };
    } else {
        /* Make new */
        work = arena_push(&L.arena, struct work);
        *work = (struct work) {
            .condition_variable_finished = sys_condition_variable_alloc(),
            .gen = 1
        };
    }
    return work;
}

INTERNAL void work_release_assume_locked(struct work *work)
{
    sys_mutex_assert_locked(&L.mutex);
    work->next_free = L.free_work_head;
    L.free_work_head = work;
    /* Bump generation so outstanding handles to this work become invalid */
    ++work->gen;
}

INTERNAL struct work_handle work_to_handle_assume_locked(struct work *work)
{
    sys_mutex_assert_locked(&L.mutex);
    return (struct work_handle) { .work = work, .gen = work->gen };
}

INTERNAL struct work_task *task_alloc_assume_locked(void)
{
    sys_mutex_assert_locked(&L.mutex);
    struct work_task *task = NULL;
    /* Allocate task */
    if (L.free_task_head) {
        /* Reuse from free list */
        task = L.free_task_head;
        L.free_task_head = task->next_free;
        *task = (struct work_task) { 0 };
    } else {
        /* Make new */
        task = arena_push_zero(&L.arena, struct work_task);
    }
    return task;
}

INTERNAL void task_release_assume_locked(struct work_task *task)
{
    sys_mutex_assert_locked(&L.mutex);
    task->next_free = L.free_task_head;
    L.free_task_head = task;
}

/* ==========================
 * Work scheduling / insertion
 * ========================== */

INTERNAL void work_schedule_assume_locked(struct work *work)
{
    __prof;
    sys_mutex_assert_locked(&L.mutex);
    enum work_priority priority = work->priority;
    if (L.scheduled_work_head) {
        struct work *head = L.scheduled_work_head;
        if (head->priority >= priority) {
            /* Head is lower (or equal) priority, insert work as new head */
            L.scheduled_work_head = work;
            work->next_scheduled = head;
            head->prev_scheduled = work;
        } else {
            /* Find the tail of the nearest higher-or-equal priority and
             * insert after it */
            struct work *tail = NULL;
            for (i32 i = priority; i >= 0; --i) {
                tail = L.scheduled_work_priority_tails[i];
                if (tail) {
                    break;
                }
            }
            /* Hook work.
             * FIX: previously the back-link of the successor was never
             * updated, leaving `tail->next_scheduled->prev_scheduled` pointing
             * at `tail`; a later unschedule of that successor would then also
             * unlink this work, losing its tasks. */
            work->next_scheduled = tail->next_scheduled;
            work->prev_scheduled = tail;
            if (tail->next_scheduled) {
                tail->next_scheduled->prev_scheduled = work;
            }
            tail->next_scheduled = work;
        }
    } else {
        L.scheduled_work_head = work;
    }
    /* NOTE(review): when work of the same priority as the head is inserted
     * via the head branch above, this overwrite points the tail at the FIRST
     * item of that priority run rather than the last, so FIFO order within a
     * priority is not strictly maintained. Priority ordering itself is
     * unaffected — left as-is, confirm if within-priority FIFO matters. */
    L.scheduled_work_priority_tails[priority] = work;
    WRITE_BARRIER();
    /* Wake at most one worker per unstarted task */
    sys_semaphore_signal(&L.semaphore, min_u32(work->tasks_incomplete, L.worker_count));
}

INTERNAL void work_unschedule_assume_locked(struct work *work)
{
    __prof;
    sys_mutex_assert_locked(&L.mutex);
    struct work *prev = (struct work *)work->prev_scheduled;
    struct work *next = (struct work *)work->next_scheduled;
    /* Remove from priority tails array.
     * FIX: the tail was previously only updated when `prev` was NULL or
     * shared this priority; when `prev` had a different priority the tails
     * entry kept pointing at the removed (soon recycled) work, leaving a
     * dangling pointer that later insertions would use. */
    enum work_priority priority = work->priority;
    struct work *priority_tail = L.scheduled_work_priority_tails[priority];
    if (priority_tail == work) {
        L.scheduled_work_priority_tails[priority] =
            (prev && prev->priority == priority) ? prev : NULL;
    }
    /* Unhook work */
    if (prev) {
        prev->next_scheduled = next;
    }
    if (next) {
        next->prev_scheduled = prev;
    }
    if (work == L.scheduled_work_head) {
        L.scheduled_work_head = next;
    }
    /* Hygiene: clear stale links; nothing reads them after unscheduling */
    work->prev_scheduled = NULL;
    work->next_scheduled = NULL;
}

/* ==========================
 * Task dequeuing
 * ========================== */

INTERNAL struct work_task *work_dequeue_task_assume_locked(struct work *work)
{
    __prof;
    sys_mutex_assert_locked(&L.mutex);
    struct work_task *task = work->task_head;
    if (task) {
        work->task_head = task->next_in_work;
        if (!work->task_head) {
            /* Unschedule work if last task */
            work_unschedule_assume_locked(work);
        }
    }
    return task;
}

/* ==========================
 * Work doing
 * ========================== */

/* NOTE: This function will release `work` once all of its tasks have
 * completed.
 * Returns `true` if more unstarted tasks remained at the moment this task was
 * dequeued (IE: the caller may keep calling to help).
 */
INTERNAL b32 work_exec_single_task_maybe_release_assume_locked(struct work *work)
{
    __prof;
    sys_mutex_assert_locked(&L.mutex);
    struct work_task *task = work_dequeue_task_assume_locked(work);
    /* Sampled before the mutex is dropped below; reflects the queue state at
     * dequeue time, not after the task has run */
    b32 more_tasks = work->task_head != NULL;
    if (task) {
        work->status = WORK_STATUS_IN_PROGRESS;
        ++work->workers;
        /* Do task (temporarily unlock). Other threads may dequeue/execute
         * tasks of this same work while the mutex is released. */
        sys_mutex_unlock(&L.mutex);
        {
            task->func(task->data);
        }
        sys_mutex_lock(&L.mutex);
        --work->workers;
        --work->tasks_incomplete;
        task_release_assume_locked(task);
        if (work->tasks_incomplete == 0) {
            /* Signal finished */
            work->status = WORK_STATUS_DONE;
            sys_condition_variable_signal(&work->condition_variable_finished);
            /* Release: `work` is recycled and its generation bumped; callers
             * must re-validate handles after this returns */
            work_release_assume_locked(work);
        }
    }
    return more_tasks;
}

/* Keeps executing (and thereby draining) tasks of `work` until none remain;
 * may release `work` via the single-task helper above. */
INTERNAL void work_exec_remaining_tasks_maybe_release_assume_locked(struct work *work)
{
    __prof;
    sys_mutex_assert_locked(&L.mutex);
    b32 more_tasks = true;
    while (more_tasks) {
        more_tasks = work_exec_single_task_maybe_release_assume_locked(work);
    }
}

/* ==========================
 * Work thread proc
 * ========================== */

/* Worker loop: sleep on the semaphore, then drain scheduled work until the
 * queue looks empty; exits when `L.shutdown` is set. */
INTERNAL void worker_thread_entry_point(void *thread_data)
{
    (UNUSED)thread_data;
    struct worker_context *ctx = sys_thread_get_worker_context();
    *ctx = (struct worker_context) { .is_worker = true };
    while (true) {
        sys_semaphore_wait(&L.semaphore);
        /* NOTE(review): `L.shutdown` and `L.scheduled_work_head` are read
         * without holding `L.mutex` (see the "volatile?" TODO on the global
         * state); the head is re-checked under the lock below, so a stale
         * read only costs an extra lock round-trip — confirm intent. */
        if (L.shutdown) {
            /* Exit thread */
            break;
        }
        while (L.scheduled_work_head) {
            /* Do work from top */
            sys_mutex_lock(&L.mutex);
            {
                struct work *work = L.scheduled_work_head;
                if (work) {
                    __profscope(work_pool_task);
                    --L.idle_worker_count;
                    work_exec_single_task_maybe_release_assume_locked((struct work *)work);
                    ++L.idle_worker_count;
                }
            }
            sys_mutex_unlock(&L.mutex);
        }
    }
}

/* ==========================
 * Work pushing interface
 * ========================== */

/* Builds work from the slate's task list, schedules it and returns a handle.
 * If `help` is true, then the calling thread will start picking up tasks
 * immediately (before other workers can see it) */
INTERNAL struct work_handle work_push_from_slate_assume_locked(struct work_slate *ws, b32 help, enum work_priority priority)
{
    __prof;
    sys_mutex_assert_locked(&L.mutex);
    struct work *work = work_alloc_assume_locked();
    /* Handle taken before scheduling, while `work->gen` is still current */
    struct work_handle wh = work_to_handle_assume_locked(work);
    work->priority = priority;
    work->status = WORK_STATUS_IN_PROGRESS;
    work->task_head = ws->task_head;
    work->tasks_incomplete = ws->num_tasks;
    work_schedule_assume_locked(work);
    if (help) {
        work_exec_remaining_tasks_maybe_release_assume_locked(work);
    } else {
        /* When work is submitted from a worker thread, we want the worker to pick
         * up the tasks itself when idle workers = 0 and work.workers = 0
         * (work.workers will always = 0 when work is first pushed).
         *
         * This is not ideal, however it is necessary to prevent
         * a scenario in which all workers are waiting on child work to complete in
         * a subtle way (IE: outside of work_wait). Since all workers are waiting,
         * there would be no remaining workers to complete the child work, meaning
         * there is a deadlock.
         *
         * By forcing workers to do their own child work, we can guarantee that this
         * does not occur. However it is not ideal since it creates situations in
         * which work is not done asynchronously.
         */
        struct worker_context *ctx = sys_thread_get_worker_context();
        if (ctx->is_worker) {
            b32 more_tasks = true;
            while (L.idle_worker_count == 0 && work->workers == 0 && more_tasks) {
                more_tasks = work_exec_single_task_maybe_release_assume_locked(work);
            }
        }
    }
    return wh;
}

/* Wraps a single (func, data) pair in a one-task slate and pushes it. */
INTERNAL struct work_handle work_push_task_internal(work_task_func *func, void *data, b32 help, enum work_priority priority)
{
    struct work_handle handle;
    sys_mutex_lock(&L.mutex);
    {
        struct work_task *task = task_alloc_assume_locked();
        task->data = data;
        task->func = func;
        struct work_slate ws = {
            .task_head = task,
            .task_tail = task,
            .num_tasks = 1
        };
        handle = work_push_from_slate_assume_locked(&ws, help, priority);
    }
    sys_mutex_unlock(&L.mutex);
    return handle;
}

/* Push work that contains a single task */
struct work_handle work_push_task(work_task_func *func, void *data, enum work_priority priority)
{
    __prof;
    struct work_handle handle = work_push_task_internal(func, data, false, priority);
    return handle;
}

/* Like work_push_task, but the calling thread helps execute immediately. */
struct work_handle work_push_task_and_help(work_task_func *func, void *data, enum work_priority priority)
{
    __prof;
    struct work_handle handle = work_push_task_internal(func, data, true, priority);
    return handle;
}

/* Starts an empty work slate (a local task list; no locking needed yet). */
struct work_slate work_slate_begin(void)
{
    __prof;
    struct work_slate ws = { 0 };
    return ws;
}

/* Appends a task to the slate. Only the task allocation touches global
 * state; the slate list itself is caller-local until work_slate_end. */
void work_slate_push_task(struct work_slate *ws, work_task_func *func, void *data)
{
    __prof;
    struct work_task *task = NULL;
    sys_mutex_lock(&L.mutex);
    {
        task = task_alloc_assume_locked();
    }
    sys_mutex_unlock(&L.mutex);
    task->data = data;
    task->func = func;
    if (ws->task_tail) {
        ws->task_tail->next_in_work = task;
    } else {
        ws->task_head = task;
    }
    ws->task_tail = task;
    ++ws->num_tasks;
}

/* Push work that contains multiple tasks (work slate) */
struct work_handle work_slate_end(struct work_slate *ws, enum work_priority priority)
{
    __prof;
    struct work_handle handle;
    sys_mutex_lock(&L.mutex);
    {
        handle = work_push_from_slate_assume_locked(ws, false, priority);
    }
    sys_mutex_unlock(&L.mutex);
    return handle;
}

/* Like work_slate_end, but the calling thread helps execute immediately. */
struct work_handle work_slate_end_and_help(struct work_slate *ws, enum work_priority priority)
{
    __prof;
    sys_mutex_lock(&L.mutex);
    struct work_handle handle = work_push_from_slate_assume_locked(ws, true, priority);
    sys_mutex_unlock(&L.mutex);
    return handle;
}

/* ==========================
 * Work intervention interface
 * ========================== */

/* Resolves a handle back to its work; returns NULL if the work has since
 * been released and recycled (generation mismatch). */
INTERNAL struct work *work_from_handle_assume_locked(struct work_handle handle)
{
    sys_mutex_assert_locked(&L.mutex);
    struct work *work = handle.work;
    if (work->gen != handle.gen) {
        work = NULL;
    }
    return work;
}

/* Wait for all tasks in work to be completed. Will also pick up any unstarted
 * tasks in the work since the caller will be idle while waiting anyway. */
void work_wait(struct work_handle handle)
{
    __prof;
    sys_mutex_lock(&L.mutex);
    {
        struct work *work = work_from_handle_assume_locked(handle);
        if (work) {
            /* Help with tasks */
            work_exec_remaining_tasks_maybe_release_assume_locked(work);
            /* Wait for work completion */
            work = work_from_handle_assume_locked(handle);
            /* Re-checking work is still valid here in case helping above
             * caused work to release */
            if (work) {
                while (work->status != WORK_STATUS_DONE) {
                    sys_condition_variable_wait(&work->condition_variable_finished, &L.mutex);
                }
            }
        }
    }
    sys_mutex_unlock(&L.mutex);
}

/* Try to pick up any scheduled tasks */
void work_help(struct work_handle handle)
{
    __prof;
    sys_mutex_lock(&L.mutex);
    struct work *work = work_from_handle_assume_locked(handle);
    if (work) {
        work_exec_remaining_tasks_maybe_release_assume_locked(work);
    }
    sys_mutex_unlock(&L.mutex);
}

/* ==========================
 * Worker context interface
 * ========================== */

/* Contexts are plain value structs; nothing to allocate. */
struct worker_context worker_context_alloc(void)
{
    return (struct worker_context) { 0 };
}

void worker_context_release(struct worker_context *ctx)
{
    /* Do nothing */
    (UNUSED)ctx;
}