revert d3d12 to enhanced barriers w/ explicit layouts. use independent-device agility sdk.

This commit is contained in:
jacob 2026-02-23 16:50:14 -06:00
parent 6f35da3fa6
commit 27885ead8a
20 changed files with 1467 additions and 2425 deletions

1
.gitattributes vendored
View File

@ -17,6 +17,7 @@
*.exe filter=lfs diff=lfs merge=lfs -text *.exe filter=lfs diff=lfs merge=lfs -text
*.dll filter=lfs diff=lfs merge=lfs -text *.dll filter=lfs diff=lfs merge=lfs -text
*.lib filter=lfs diff=lfs merge=lfs -text *.lib filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.tga filter=lfs diff=lfs merge=lfs -text *.tga filter=lfs diff=lfs merge=lfs -text
*.ase filter=lfs diff=lfs merge=lfs -text *.ase filter=lfs diff=lfs merge=lfs -text
*.ttf filter=lfs diff=lfs merge=lfs -text *.ttf filter=lfs diff=lfs merge=lfs -text

View File

@ -501,7 +501,7 @@ i32 W32_Main(void)
// Create app dir // Create app dir
{ {
String path = PathFromString(perm, appdir_path, '\\'); String path = PathFromString(perm, appdir_path, '\\');
wchar_t *path_wstr = WstrFromString(perm, appdir_path); wchar_t *path_wstr = WstrFromString(perm, path);
i32 err_code = SHCreateDirectoryExW(0, path_wstr, 0); i32 err_code = SHCreateDirectoryExW(0, path_wstr, 0);
String err = StringF(perm, "Error code %F", FmtSint(err_code)); String err = StringF(perm, "Error code %F", FmtSint(err_code));
switch (err_code) switch (err_code)
@ -519,6 +519,10 @@ i32 W32_Main(void)
{ {
err = Lit("User canceled the operation"); err = Lit("User canceled the operation");
} break; } break;
case ERROR_PATH_NOT_FOUND:
{
err = Lit("The system cannot find the path specified.");
} break;
} }
if (err_code != ERROR_SUCCESS && err_code != ERROR_ALREADY_EXISTS && err_code != ERROR_FILE_EXISTS) if (err_code != ERROR_SUCCESS && err_code != ERROR_ALREADY_EXISTS && err_code != ERROR_FILE_EXISTS)
{ {

View File

@ -25,6 +25,7 @@
#include <dwmapi.h> #include <dwmapi.h>
#include <avrt.h> #include <avrt.h>
#include <shellapi.h> #include <shellapi.h>
#include <compressapi.h>
// #pragma warning(pop) // #pragma warning(pop)
#ifndef BCRYPT_RNG_ALG_HANDLE #ifndef BCRYPT_RNG_ALG_HANDLE
@ -37,6 +38,8 @@
#pragma comment(lib, "kernel32") #pragma comment(lib, "kernel32")
#pragma comment(lib, "user32") #pragma comment(lib, "user32")
#pragma comment(lib, "bcrypt") #pragma comment(lib, "bcrypt")
#pragma comment(lib, "gdi32")
#pragma comment(lib, "cabinet")
#pragma comment(lib, "shell32") #pragma comment(lib, "shell32")
#pragma comment(lib, "ole32") #pragma comment(lib, "ole32")
#pragma comment(lib, "winmm") #pragma comment(lib, "winmm")

View File

@ -316,6 +316,7 @@ void GC_TickAsync(WaveLaneCtx *lane, AsyncFrameLaneCtx *base_async_lane_frame)
gpu_perm, cl, gpu_perm, cl,
G_Format_R8G8B8A8_Unorm_Srgb, G_Format_R8G8B8A8_Unorm_Srgb,
atlas->dims, atlas->dims,
G_Layout_Simultaneous,
.name = Lit("Glyph atlas") .name = Lit("Glyph atlas")
); );
atlas->tex = G_PushTexture2DRef(gpu_perm, atlas->tex_res); atlas->tex = G_PushTexture2DRef(gpu_perm, atlas->tex_res);

View File

@ -25,6 +25,7 @@ void G_BootstrapCommon(void)
gpu_perm, cl, gpu_perm, cl,
G_Format_R8G8B8A8_Uint, G_Format_R8G8B8A8_Uint,
VEC2I32(8, 8), VEC2I32(8, 8),
G_Layout_Common,
.flags = G_ResourceFlag_ZeroMemory .flags = G_ResourceFlag_ZeroMemory
); );
G.blank_tex = G_PushTexture2DRef(gpu_perm, blank_tex); G.blank_tex = G_PushTexture2DRef(gpu_perm, blank_tex);
@ -42,7 +43,8 @@ void G_BootstrapCommon(void)
noise_tex = G_PushTexture3D( noise_tex = G_PushTexture3D(
gpu_perm, cl, gpu_perm, cl,
G_Format_R16_Uint, G_Format_R16_Uint,
noise_dims noise_dims,
G_Layout_Common
); );
G_CopyCpuToTexture( G_CopyCpuToTexture(
cl, cl,

View File

@ -188,6 +188,117 @@ Enum(G_Format)
G_Format_COUNT = 192 G_Format_COUNT = 192
}; };
////////////////////////////////////////////////////////////
//~ Memory sync types
// Pipeline stage bit flags. G_MemoryBarrierDesc uses these for `stage_prev` and
// `stage_next` to name the work a barrier executes after and before.
// Individual stages occupy one bit each and can be OR'd together.
// Note that bit 0 (1 << 0) is not assigned to any stage.
Enum(G_Stage)
{
G_Stage_None = 0,
// Compute stages
G_Stage_ComputeShading = (1 << 1),
// Draw stages
G_Stage_IndexAssembly = (1 << 2),
G_Stage_VertexShading = (1 << 3),
G_Stage_PixelShading = (1 << 4),
G_Stage_DepthStencil = (1 << 5),
G_Stage_RenderTarget = (1 << 6),
// Copy stages
G_Stage_Copy = (1 << 7),
// Indirect stages
G_Stage_Indirect = (1 << 8),
// Aggregate stages
// Drawing = every draw stage listed above
G_Stage_Drawing = G_Stage_IndexAssembly |
G_Stage_VertexShading |
G_Stage_PixelShading |
G_Stage_DepthStencil |
G_Stage_RenderTarget,
// Shading = every stage that runs shader code (compute, vertex, pixel)
G_Stage_Shading = G_Stage_ComputeShading |
G_Stage_VertexShading |
G_Stage_PixelShading,
// All bits set, including bits that have no named stage
G_Stage_All = 0xFFFFFFFF
};
// Memory access bit flags. G_MemoryBarrierDesc uses these for `access_prev` and
// `access_next` to describe how memory was used before the barrier and how it
// will be used after. Flags occupy one bit each and can be OR'd together.
// Note that bit 0 (1 << 0) is not assigned to any access type.
Enum(G_Access)
{
G_Access_None = 0,
G_Access_ShaderReadWrite = (1 << 1),
G_Access_ShaderRead = (1 << 2),
G_Access_CopyWrite = (1 << 3),
G_Access_CopyRead = (1 << 4),
G_Access_DepthStencilRead = (1 << 5),
G_Access_DepthStencilWrite = (1 << 6),
G_Access_RenderTargetWrite = (1 << 7),
G_Access_IndexBuffer = (1 << 8),
G_Access_IndirectArgument = (1 << 9),
G_Access_All = 0xFFFFFFFF // Represents all accesses relevant to the stage specified in the barrier
};
// Texture layouts. Used as the initial layout when creating a texture
// (G_TextureDesc.initial_layout) and as the target layout of a barrier
// (G_MemoryBarrierDesc.layout). The trailing comment on each enumerator lists
// the accesses it permits and the D3D12 barrier layout it corresponds to.
Enum(G_Layout)
{
// First enumerator, so a barrier desc that leaves `layout` unset selects this
// (assuming Enum() expands to a plain C enum starting at 0 -- TODO confirm)
G_Layout_NoChange,
G_Layout_Undefined, // No access <-- D3D12_BARRIER_LAYOUT_UNDEFINED
// Simultaneous layout allows a resource to be used on any queue with any
// access type (except depth-stencil). Resources cannot transition to/from
// this layout, they must be created with it. Allows concurrent reads
// while up to 1 write is occurring to non-overlapping regions.
G_Layout_Simultaneous, // Any access except depth-stencil <-- D3D12_BARRIER_LAYOUT_COMMON + D3D12_RESOURCE_FLAG_ALLOW_SIMULTANEOUS_ACCESS
G_Layout_Common, // ShaderRead/CopyRead/CopyWrite/Present <-- D3D12_BARRIER_LAYOUT_COMMON
//////////////////////////////
//- Direct queue
G_Layout_DirectQueue_General, // ShaderRead/ShaderReadWrite/CopyRead/CopyWrite <-- D3D12_BARRIER_LAYOUT_DIRECT_QUEUE_COMMON
G_Layout_DirectQueue_Read, // ShaderRead/CopyRead/DepthStencilRead <-- D3D12_BARRIER_LAYOUT_DIRECT_QUEUE_GENERIC_READ
G_Layout_DirectQueue_DepthStencil, // DepthStencilRead/DepthStencilWrite <-- D3D12_BARRIER_LAYOUT_DEPTH_STENCIL_WRITE
G_Layout_DirectQueue_RenderTarget, // RenderTargetWrite <-- D3D12_BARRIER_LAYOUT_RENDER_TARGET
//////////////////////////////
//- Compute queue
G_Layout_ComputeQueue_General, // ShaderRead/ShaderReadWrite/CopyRead/CopyWrite <-- D3D12_BARRIER_LAYOUT_COMPUTE_QUEUE_COMMON
//////////////////////////////
//- Direct & Compute queue
G_Layout_DirectComputeQueue_Read, // ShaderRead/CopyRead <-- D3D12_BARRIER_LAYOUT_GENERIC_READ
G_Layout_DirectComputeQueue_ShaderReadWrite, // ShaderReadWrite <-- D3D12_BARRIER_LAYOUT_UNORDERED_ACCESS
G_Layout_DirectComputeQueue_CopyWrite, // CopyWrite <-- D3D12_BARRIER_LAYOUT_COPY_DEST
};
// Describes a single memory barrier passed to G_MemorySyncEx.
// Barrier will execute after previous stages specified by `stage_prev`, and before next stages specified by `stage_next`.
// When barrier executes:
// - Necessary resource flushes will occur based on `access_prev` & `access_next`
// - Texture layout will transition based on `layout` (if specified)
Struct(G_MemoryBarrierDesc)
{
G_ResourceHandle resource; // Resource to sync; the global sync macro leaves this unset
b32 is_global; // Set to 1 by G_GlobalMemorySync, which names no specific resource
G_Stage stage_prev; // Stages the barrier executes after
G_Stage stage_next; // Stages the barrier executes before
G_Access access_prev; // Accesses performed before the barrier
G_Access access_next; // Accesses performed after the barrier
G_Layout layout; // Target texture layout; zero value is G_Layout_NoChange
RngI32 mips; // Inclusive range of texture mip indices to sync
};
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
//~ Filter types //~ Filter types
@ -303,6 +414,7 @@ Struct(G_TextureDesc)
G_ResourceFlag flags; G_ResourceFlag flags;
G_Format format; G_Format format;
Vec3I32 dims; Vec3I32 dims;
G_Layout initial_layout;
Vec4 clear_color; Vec4 clear_color;
i32 max_mips; // Will be clamped to range [1, max mips] i32 max_mips; // Will be clamped to range [1, max mips]
String name; String name;
@ -424,34 +536,37 @@ G_ResourceHandle G_PushResource(G_ArenaHandle arena, G_CommandListHandle cl, G_R
} \ } \
) )
#define G_PushTexture1D(arena, cl, _format, _size, ...) G_PushResource((arena), (cl), \ #define G_PushTexture1D(arena, cl, _format, _size, _initial_layout, ...) G_PushResource((arena), (cl), \
(G_ResourceDesc) { \ (G_ResourceDesc) { \
.kind = G_ResourceKind_Texture1D, \ .kind = G_ResourceKind_Texture1D, \
.texture = { \ .texture = { \
.format = (_format), \ .format = (_format), \
.dims = VEC3I32((_size), 1, 1), \ .dims = VEC3I32((_size), 1, 1), \
.initial_layout = (_initial_layout), \
__VA_ARGS__ \ __VA_ARGS__ \
} \ } \
} \ } \
) )
#define G_PushTexture2D(arena, cl, _format, _size, ...) G_PushResource((arena), (cl), \ #define G_PushTexture2D(arena, cl, _format, _size, _initial_layout, ...) G_PushResource((arena), (cl), \
(G_ResourceDesc) { \ (G_ResourceDesc) { \
.kind = G_ResourceKind_Texture2D, \ .kind = G_ResourceKind_Texture2D, \
.texture = { \ .texture = { \
.format = (_format), \ .format = (_format), \
.dims = VEC3I32((_size).x, (_size).y, 1), \ .dims = VEC3I32((_size).x, (_size).y, 1), \
.initial_layout = (_initial_layout), \
__VA_ARGS__ \ __VA_ARGS__ \
} \ } \
} \ } \
) )
#define G_PushTexture3D(arena, cl, _format, _size, ...) G_PushResource((arena), (cl), \ #define G_PushTexture3D(arena, cl, _format, _size, _initial_layout, ...) G_PushResource((arena), (cl), \
(G_ResourceDesc) { \ (G_ResourceDesc) { \
.kind = G_ResourceKind_Texture3D, \ .kind = G_ResourceKind_Texture3D, \
.texture = { \ .texture = { \
.format = (_format), \ .format = (_format), \
.dims = (_size), \ .dims = (_size), \
.initial_layout = (_initial_layout), \
__VA_ARGS__ \ __VA_ARGS__ \
} \ } \
} \ } \
@ -608,7 +723,50 @@ void G_SetConstantEx(G_CommandListHandle cl, i32 slot, void *src_32bit, u32 size
//- Memory sync //- Memory sync
void G_Barrier(G_CommandListHandle cl); void G_MemorySyncEx(G_CommandListHandle cl, G_MemoryBarrierDesc desc);
// Pushes a barrier for a single resource without requesting a layout change
// (`.layout` is left zero-initialized). `.mips.max` defaults to 64.
// Extra designated initializers passed through `...` are appended last, so they
// can override any of the defaults set here.
// Every macro argument is parenthesized so that arbitrary expressions expand safely.
#define G_MemorySync(_cl, _resource, _stage_prev, _access_prev, _stage_next, _access_next, ...) \
  G_MemorySyncEx((_cl), (G_MemoryBarrierDesc) { \
    .resource = (_resource), \
    .stage_prev = (_stage_prev), \
    .access_prev = (_access_prev), \
    .stage_next = (_stage_next), \
    .access_next = (_access_next), \
    .mips.max = 64, \
    __VA_ARGS__ \
  })
// Pushes a barrier for a single resource and requests a transition to `_layout`.
// `.mips.max` defaults to 64. Extra designated initializers passed through `...`
// are appended last, so they can override any of the defaults set here.
// Every macro argument is parenthesized so that arbitrary expressions expand safely.
#define G_MemoryLayoutSync(_cl, _resource, _stage_prev, _access_prev, _stage_next, _access_next, _layout, ...) \
  G_MemorySyncEx((_cl), (G_MemoryBarrierDesc) { \
    .resource = (_resource), \
    .stage_prev = (_stage_prev), \
    .access_prev = (_access_prev), \
    .stage_next = (_stage_next), \
    .access_next = (_access_next), \
    .layout = (_layout), \
    .mips.max = 64, \
    __VA_ARGS__ \
  })
// Pushes a global barrier: `.is_global` is set and no resource is named.
// `.mips.max` defaults to 64. Extra designated initializers passed through `...`
// are appended last, so they can override any of the defaults set here.
// Every macro argument is parenthesized so that arbitrary expressions expand safely.
#define G_GlobalMemorySync(_cl, _stage_prev, _access_prev, _stage_next, _access_next, ...) \
  G_MemorySyncEx((_cl), (G_MemoryBarrierDesc) { \
    .is_global = 1, \
    .stage_prev = (_stage_prev), \
    .access_prev = (_access_prev), \
    .stage_next = (_stage_next), \
    .access_next = (_access_next), \
    .mips.max = 64, \
    __VA_ARGS__ \
  })
// "Dumb" sync helpers: forward to the corresponding sync macro with G_Stage_All /
// G_Access_All for both the previous and next sides, so the caller does not have
// to name specific stages or accesses.

// Per-resource sync with no layout change
#define G_DumbMemorySync(cl, resource, ...) \
G_MemorySync((cl), (resource), G_Stage_All, G_Access_All, G_Stage_All, G_Access_All, __VA_ARGS__)
// Per-resource sync that also transitions the resource to `layout`
#define G_DumbMemoryLayoutSync(cl, resource, layout, ...) \
G_MemoryLayoutSync((cl), (resource), G_Stage_All, G_Access_All, G_Stage_All, G_Access_All, (layout), __VA_ARGS__)
// Global sync (no specific resource)
#define G_DumbGlobalMemorySync(cl, ...) \
G_GlobalMemorySync((cl), G_Stage_All, G_Access_All, G_Stage_All, G_Access_All, __VA_ARGS__)
//- Compute //- Compute

View File

@ -1,5 +1,10 @@
@Layer gpu_dx12 @Layer gpu_dx12
//////////////////////////////
//- Resources
@EmbedDir G_D12_Resources gpu_dx12_res
////////////////////////////// //////////////////////////////
//- Api //- Api

File diff suppressed because it is too large Load Diff

View File

@ -72,7 +72,7 @@ Struct(G_D12_Resource)
u64 uid; u64 uid;
// D3D12 resource // D3D12 resource
D3D12_RESOURCE_DESC d3d_desc; D3D12_RESOURCE_DESC1 d3d_desc;
ID3D12Resource *d3d_resource; ID3D12Resource *d3d_resource;
D3D12_GPU_VIRTUAL_ADDRESS buffer_gpu_address; D3D12_GPU_VIRTUAL_ADDRESS buffer_gpu_address;
void *mapped; void *mapped;
@ -86,6 +86,7 @@ Struct(G_D12_Resource)
G_Format texture_format; G_Format texture_format;
Vec3I32 texture_dims; Vec3I32 texture_dims;
i32 texture_mips; i32 texture_mips;
D3D12_BARRIER_LAYOUT cmdlist_texture_layouts[G_D12_MaxMips];
// Sampler info // Sampler info
G_SamplerDesc sampler_desc; G_SamplerDesc sampler_desc;
@ -261,7 +262,7 @@ Struct(G_D12_RawCommandList)
u64 commit_fence_target; u64 commit_fence_target;
ID3D12CommandAllocator *d3d_ca; ID3D12CommandAllocator *d3d_ca;
ID3D12GraphicsCommandList *d3d_cl; ID3D12GraphicsCommandList7 *d3d_cl;
// Direct queue command lists keep a constant list of CPU-only descriptors // Direct queue command lists keep a constant list of CPU-only descriptors
G_D12_Descriptor *rtv_descriptors[G_MaxRenderTargets]; G_D12_Descriptor *rtv_descriptors[G_MaxRenderTargets];
@ -323,7 +324,10 @@ Struct(G_D12_Cmd)
struct struct
{ {
G_MemoryBarrierDesc desc;
// Post-batch data // Post-batch data
b32 is_end_of_batch;
u64 batch_gen; u64 batch_gen;
} barrier; } barrier;
@ -474,7 +478,7 @@ Struct(G_D12_Ctx)
// Device // Device
IDXGIFactory6 *factory; IDXGIFactory6 *factory;
IDXGIAdapter3 *adapter; IDXGIAdapter3 *adapter;
ID3D12Device1 *device; ID3D12Device10 *device;
// Release-queue // Release-queue
Mutex pending_releases_mutex; Mutex pending_releases_mutex;
@ -505,7 +509,10 @@ G_D12_Resource *G_D12_ResourceFromHandle(G_ResourceHandle handle);
G_D12_Swapchain *G_D12_SwapchainFromHandle(G_SwapchainHandle handle); G_D12_Swapchain *G_D12_SwapchainFromHandle(G_SwapchainHandle handle);
DXGI_FORMAT G_D12_DxgiFormatFromGpuFormat(G_Format format); DXGI_FORMAT G_D12_DxgiFormatFromGpuFormat(G_Format format);
b32 G_D12_IsSimultaneous(G_D12_Resource *resource); D3D12_BARRIER_SYNC G_D12_BarrierSyncFromStages(G_Stage stages);
D3D12_BARRIER_ACCESS G_D12_BarrierAccessFromAccesses(G_Access accesses);
D3D12_BARRIER_LAYOUT G_D12_BarrierLayoutFromLayout(G_Layout layout);
String G_D12_NameFromBarrierLayout(D3D12_BARRIER_LAYOUT layout);
void G_D12_InitRtv(G_D12_Resource *resource, D3D12_CPU_DESCRIPTOR_HANDLE rtv_handle, i32 mip); void G_D12_InitRtv(G_D12_Resource *resource, D3D12_CPU_DESCRIPTOR_HANDLE rtv_handle, i32 mip);

Binary file not shown.

Binary file not shown.

View File

@ -26,6 +26,19 @@ Struct(PLT_FileMap)
b32 valid; b32 valid;
}; };
////////////////////////////////////////////////////////////
//~ Compression types
// Compression level selector accepted by PLT_Compress / PLT_Decompress.
// Lower levels are faster. The same level is passed to both functions;
// presumably it must match between compressing and decompressing a given
// buffer -- TODO confirm against the platform implementation.
Enum(PLT_CompressionLevel)
{
PLT_CompressionLevel_0, // Fastest
PLT_CompressionLevel_1,
PLT_CompressionLevel_2,
PLT_CompressionLevel_3,
PLT_CompressionLevel_COUNT // Number of levels; not a valid level itself
};
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
//~ Message box types //~ Message box types
@ -74,6 +87,12 @@ PLT_FileMap PLT_OpenFileMap(PLT_File file);
void PLT_CloseFileMap(PLT_FileMap map); void PLT_CloseFileMap(PLT_FileMap map);
String PLT_GetFileMapData(PLT_FileMap map); String PLT_GetFileMapData(PLT_FileMap map);
////////////////////////////////////////////////////////////
//~ @hookdecl Compression
String PLT_Compress(Arena *arena, String data, PLT_CompressionLevel level);
String PLT_Decompress(Arena *arena, String data, PLT_CompressionLevel level);
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
//~ @hookdecl Utils //~ @hookdecl Utils

View File

@ -12,6 +12,19 @@ void PLT_Bootstrap(void)
DispatchWave(Lit("Win32 timer sync"), 1, PLT_W32_SyncTimerForever, 0); DispatchWave(Lit("Win32 timer sync"), 1, PLT_W32_SyncTimerForever, 0);
} }
// Maps a platform-agnostic compression level to a Win32 Compression API
// algorithm identifier.
//
// Out-of-range levels (negative values, PLT_CompressionLevel_COUNT, or anything
// larger) are clamped to the nearest valid table entry instead of indexing
// outside the table.
DWORD PLT_W32_CompressionAlgorithmFromLevel(PLT_CompressionLevel level)
{
  // Win32 compression algorithms from fastest -> slowest
  PERSIST Readonly DWORD algos[] = {
    COMPRESS_ALGORITHM_XPRESS,
    COMPRESS_ALGORITHM_XPRESS_HUFF,
    COMPRESS_ALGORITHM_MSZIP,
    COMPRESS_ALGORITHM_LZMS,
  };
  // The upper clamp bound must be the last valid index, not the element count
  i32 algo_idx = ClampI32(level, 0, (i32)countof(algos) - 1);
  return algos[algo_idx];
}
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
//~ Time //~ Time
@ -258,50 +271,41 @@ void PLT_CloseFile(PLT_File file)
String PLT_ReadFile(Arena *arena, PLT_File file) String PLT_ReadFile(Arena *arena, PLT_File file)
{ {
i64 size = 0; String result = Zi;
GetFileSizeEx((HANDLE)file.handle, (PLARGE_INTEGER)&size); HANDLE handle = (HANDLE)file.handle;
u32 chunk_size = Kibi(64);
String result; result.text = ArenaNext(arena, u8);
result.len = size; for (;;)
if (size > 0)
{ {
// ReadFile returns non-zero on success u8 *chunk = PushStructsNoZero(arena, u8, chunk_size);
// TODO: error checking DWORD chunk_bytes_read = 0;
result.text = PushStructsNoZero(arena, u8, size); ReadFile(handle, chunk, chunk_size, &chunk_bytes_read, 0);
ReadFile( result.len += chunk_bytes_read;
(HANDLE)file.handle, if (chunk_bytes_read < chunk_size)
result.text, {
(DWORD)result.len, PopStructsNoCopy(arena, u8, chunk_size - chunk_bytes_read);
0, break;
0 }
);
} }
return result; return result;
} }
void PLT_WriteFile(PLT_File file, String data) void PLT_WriteFile(PLT_File file, String data)
{ {
// TODO: Check what the real data limit is and chunk sequentially based on u32 chunk_size = Kibi(64);
// that (rather than failing) u32 pos = 0;
if (data.len >= 0x7FFF) while (pos < data.len)
{ {
TempArena scratch = BeginScratchNoConflict(); u32 part_size = MinU32(chunk_size, data.len - pos);
Panic(StringF(
scratch.arena,
"Tried to write too many bytes to disk (%F)",
FmtUint(data.len)
));
EndScratch(scratch);
}
// WriteFile returns TRUE on success
WriteFile( WriteFile(
(HANDLE)file.handle, (HANDLE)file.handle,
data.text, data.text + pos,
(DWORD)data.len, part_size,
0, 0,
0 0
); );
pos += part_size;
}
} }
u64 PLT_GetFileSize(PLT_File file) u64 PLT_GetFileSize(PLT_File file)
@ -412,6 +416,97 @@ String PLT_GetFileMapData(PLT_FileMap map)
return map.mapped_memory; return map.mapped_memory;
} }
////////////////////////////////////////////////////////////
//~ @hookimpl compression
// Compresses `data` with the Win32 Compression API (buffer mode) using the
// algorithm selected by `level`.
//
// arena: receives the compressed bytes; on success the allocation is trimmed to
//        the exact compressed size, on failure nothing is left allocated.
// data:  bytes to compress.
// level: compression level; the same level must be given to PLT_Decompress.
//
// Returns the compressed bytes, or a zeroed String on failure.
String PLT_Compress(Arena *arena, String data, PLT_CompressionLevel level)
{
  String result = Zi;
  b32 ok = 1;
  DWORD algo = PLT_W32_CompressionAlgorithmFromLevel(level);

  COMPRESSOR_HANDLE compressor = 0;
  if (ok)
  {
    ok = CreateCompressor(algo, 0, &compressor);
  }

  // Query the required output capacity. With a null output buffer the call is
  // expected to fail with ERROR_INSUFFICIENT_BUFFER and report the needed size
  // through `compressed_cap`; any other failure is a real error.
  SIZE_T compressed_cap = data.len;
  if (ok)
  {
    b32 query_ok = Compress(compressor, data.text, data.len, 0, 0, &compressed_cap);
    if (!query_ok && GetLastError() != ERROR_INSUFFICIENT_BUFFER)
    {
      ok = 0;
    }
  }

  if (ok)
  {
    SIZE_T written_count = 0;
    u8 *compressed = PushStructsNoZero(arena, u8, compressed_cap);
    ok = Compress(compressor, data.text, data.len, compressed, compressed_cap, &written_count);
    if (ok)
    {
      result.text = compressed;
      result.len = written_count;
      // Give back the unused tail of the output buffer
      PopBytesNoCopy(arena, compressed_cap - written_count);
    }
    else
    {
      // Compression failed: release the whole output buffer so it doesn't leak
      PopBytesNoCopy(arena, compressed_cap);
    }
  }

  if (compressor)
  {
    CloseCompressor(compressor);
  }

  return result;
}
// Decompresses `data` with the Win32 Compression API (buffer mode) using the
// algorithm selected by `level`.
//
// arena: receives the decompressed bytes; on success the allocation is trimmed
//        to the exact decompressed size, on failure nothing is left allocated.
// data:  compressed bytes; empty input is treated as a failure.
// level: must be the level that was used to compress `data`, since it selects
//        the algorithm.
//
// The decompressed size is not known up front, so the output buffer starts at
// a guess and doubles each time the API reports it is too small.
//
// Returns the decompressed bytes, or a zeroed String on failure.
String PLT_Decompress(Arena *arena, String data, PLT_CompressionLevel level)
{
  String result = Zi;
  b32 ok = data.len > 0;
  DWORD algo = PLT_W32_CompressionAlgorithmFromLevel(level);

  DECOMPRESSOR_HANDLE decompressor = 0;
  if (ok)
  {
    ok = CreateDecompressor(algo, 0, &decompressor);
  }

  SIZE_T out_cap = MaxI64(NextPow2U64(data.len * 8), Kibi(4));
  while (ok)
  {
    u8 *out = PushStructsNoZero(arena, u8, out_cap);
    SIZE_T written_count = 0;
    b32 decompress_ok = Decompress(decompressor, data.text, data.len, out, out_cap, &written_count);
    if (decompress_ok)
    {
      result.text = out;
      result.len = written_count;
      // Give back the unused tail of the output buffer
      PopBytesNoCopy(arena, out_cap - written_count);
      break;
    }
    else
    {
      // Read the error code before touching the arena
      DWORD err = GetLastError();

      // Release this attempt's buffer so failed attempts don't pile up on the arena
      PopBytesNoCopy(arena, out_cap);

      if (err == ERROR_INSUFFICIENT_BUFFER)
      {
        out_cap *= 2;
      }
      else
      {
        ok = 0;
      }
    }
  }

  if (decompressor)
  {
    CloseDecompressor(decompressor);
  }

  return result;
}
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
//~ @hookimpl Utils //~ @hookimpl Utils

View File

@ -29,6 +29,11 @@ Struct(PLT_W32_Ctx)
extern PLT_W32_Ctx PLT_W32; extern PLT_W32_Ctx PLT_W32;
////////////////////////////////////////////////////////////
//~ Helpers
DWORD PLT_W32_CompressionAlgorithmFromLevel(PLT_CompressionLevel level);
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
//~ Time //~ Time

View File

@ -416,6 +416,7 @@ void V_TickForever(WaveLaneCtx *lane)
gpu_perm, cl, gpu_perm, cl,
G_Format_R8_Uint, G_Format_R8_Uint,
tiles_dims, tiles_dims,
G_Layout_DirectQueue_General,
.flags = G_ResourceFlag_ZeroMemory, .flags = G_ResourceFlag_ZeroMemory,
.name = Lit("Tiles") .name = Lit("Tiles")
); );
@ -440,6 +441,7 @@ void V_TickForever(WaveLaneCtx *lane)
gpu_perm, cl, gpu_perm, cl,
G_Format_R32_Uint, G_Format_R32_Uint,
cells_dims, cells_dims,
G_Layout_DirectQueue_General,
.flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite, .flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite,
.name = StringF(perm, "Particle cells - layer %F", FmtSint(layer)) .name = StringF(perm, "Particle cells - layer %F", FmtSint(layer))
); );
@ -452,6 +454,7 @@ void V_TickForever(WaveLaneCtx *lane)
gpu_perm, cl, gpu_perm, cl,
G_Format_R32_Uint, G_Format_R32_Uint,
cells_dims, cells_dims,
G_Layout_DirectQueue_General,
.flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite, .flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite,
.name = StringF(perm, "Particle densities - layer %F", FmtSint(layer)) .name = StringF(perm, "Particle densities - layer %F", FmtSint(layer))
); );
@ -466,6 +469,7 @@ void V_TickForever(WaveLaneCtx *lane)
gpu_perm, cl, gpu_perm, cl,
G_Format_R16G16B16A16_Float, G_Format_R16G16B16A16_Float,
cells_dims, cells_dims,
G_Layout_DirectQueue_General,
.flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite, .flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite,
.name = Lit("Stains") .name = Lit("Stains")
); );
@ -477,6 +481,7 @@ void V_TickForever(WaveLaneCtx *lane)
gpu_perm, cl, gpu_perm, cl,
G_Format_R16G16B16A16_Float, G_Format_R16G16B16A16_Float,
cells_dims, cells_dims,
G_Layout_DirectQueue_General,
.flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite, .flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite,
.name = Lit("Dry stains") .name = Lit("Dry stains")
); );
@ -488,6 +493,7 @@ void V_TickForever(WaveLaneCtx *lane)
gpu_perm, cl, gpu_perm, cl,
G_Format_R32_Float, G_Format_R32_Float,
cells_dims, cells_dims,
G_Layout_DirectQueue_General,
.flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite, .flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite,
.name = Lit("Drynesses") .name = Lit("Drynesses")
); );
@ -499,6 +505,7 @@ void V_TickForever(WaveLaneCtx *lane)
gpu_perm, cl, gpu_perm, cl,
G_Format_R32_Uint, G_Format_R32_Uint,
cells_dims, cells_dims,
G_Layout_DirectQueue_General,
.flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite, .flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite,
.name = Lit("Occluders cells") .name = Lit("Occluders cells")
); );
@ -2494,9 +2501,9 @@ void V_TickForever(WaveLaneCtx *lane)
////////////////////////////// //////////////////////////////
//- Push test emitter //- Push test emitter
if (frame->held_buttons[Button_F]) // if (frame->held_buttons[Button_F])
// if (frame->held_buttons[Button_F] && !prev_frame->held_buttons[Button_F]) // if (frame->held_buttons[Button_F] && !prev_frame->held_buttons[Button_F])
// if (0) if (0)
{ {
{ {
V_Emitter emitter = Zi; V_Emitter emitter = Zi;
@ -2556,9 +2563,9 @@ void V_TickForever(WaveLaneCtx *lane)
////////////////////////////// //////////////////////////////
//- Push test explosion //- Push test explosion
if (frame->held_buttons[Button_G]) // if (frame->held_buttons[Button_G])
// if (frame->held_buttons[Button_G] && !prev_frame->held_buttons[Button_G]) // if (frame->held_buttons[Button_G] && !prev_frame->held_buttons[Button_G])
// if (0) if (0)
{ {
// Fire // Fire
{ {
@ -4784,6 +4791,7 @@ void V_TickForever(WaveLaneCtx *lane)
frame->gpu_arena, frame->cl, frame->gpu_arena, frame->cl,
G_Format_R16G16B16A16_Float, G_Format_R16G16B16A16_Float,
frame->screen_dims, frame->screen_dims,
G_Layout_DirectQueue_RenderTarget,
.flags = G_ResourceFlag_AllowShaderReadWrite | G_ResourceFlag_AllowRenderTarget, .flags = G_ResourceFlag_AllowShaderReadWrite | G_ResourceFlag_AllowRenderTarget,
.name = StringF(frame->arena, "Screen target [%F]", FmtSint(frame->tick)) .name = StringF(frame->arena, "Screen target [%F]", FmtSint(frame->tick))
); );
@ -4797,6 +4805,7 @@ void V_TickForever(WaveLaneCtx *lane)
frame->gpu_arena, frame->cl, frame->gpu_arena, frame->cl,
G_Format_R16G16B16A16_Float, G_Format_R16G16B16A16_Float,
G_DimsFromMip2D(G_Count2D(screen_target), 1), G_DimsFromMip2D(G_Count2D(screen_target), 1),
G_Layout_DirectQueue_General,
.flags = G_ResourceFlag_AllowShaderReadWrite | G_ResourceFlag_AllowRenderTarget, .flags = G_ResourceFlag_AllowShaderReadWrite | G_ResourceFlag_AllowRenderTarget,
.name = StringF(frame->arena, "Bloom target [%F]", FmtSint(frame->tick)), .name = StringF(frame->arena, "Bloom target [%F]", FmtSint(frame->tick)),
.max_mips = 64 .max_mips = 64
@ -4812,6 +4821,7 @@ void V_TickForever(WaveLaneCtx *lane)
frame->gpu_arena, frame->cl, frame->gpu_arena, frame->cl,
G_Format_R16G16B16A16_Float, G_Format_R16G16B16A16_Float,
frame->screen_dims, frame->screen_dims,
G_Layout_DirectQueue_RenderTarget,
.flags = G_ResourceFlag_AllowRenderTarget, .flags = G_ResourceFlag_AllowRenderTarget,
.name = StringF(frame->arena, "Albedo target [%F]", FmtSint(frame->tick)) .name = StringF(frame->arena, "Albedo target [%F]", FmtSint(frame->tick))
); );
@ -4822,6 +4832,7 @@ void V_TickForever(WaveLaneCtx *lane)
frame->gpu_arena, frame->cl, frame->gpu_arena, frame->cl,
G_Format_R16G16B16A16_Float, G_Format_R16G16B16A16_Float,
frame->shade_dims, frame->shade_dims,
G_Layout_DirectQueue_General,
.flags = G_ResourceFlag_AllowShaderReadWrite, .flags = G_ResourceFlag_AllowShaderReadWrite,
.name = StringF(frame->arena, "Shade target [%F]", FmtSint(frame->tick)) .name = StringF(frame->arena, "Shade target [%F]", FmtSint(frame->tick))
); );
@ -4888,7 +4899,7 @@ void V_TickForever(WaveLaneCtx *lane)
G_SetConstant(frame->cl, V_GpuConst_NoiseTex, G_BasicNoiseTexture()); G_SetConstant(frame->cl, V_GpuConst_NoiseTex, G_BasicNoiseTexture());
// Sync // Sync
G_Barrier(frame->cl); G_DumbGlobalMemorySync(frame->cl);
////////////////////////////// //////////////////////////////
//- Initialization pass //- Initialization pass
@ -4907,12 +4918,14 @@ void V_TickForever(WaveLaneCtx *lane)
V.particle_seq = 0; V.particle_seq = 0;
} }
// Prepare albedo RT // Prepare RTs
G_DiscardRenderTarget(frame->cl, screen_target, 0);
G_ClearRenderTarget(frame->cl, albedo_target, VEC4(0, 0, 0, 0), 0); G_ClearRenderTarget(frame->cl, albedo_target, VEC4(0, 0, 0, 0), 0);
}
// Sync // Sync
G_Barrier(frame->cl); G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_General);
G_DumbGlobalMemorySync(frame->cl);
}
////////////////////////////// //////////////////////////////
//- Quads & emitters pass //- Quads & emitters pass
@ -4932,7 +4945,10 @@ void V_TickForever(WaveLaneCtx *lane)
G_Compute(frame->cl, V_EmitParticlesCS, V_ThreadGroupSizeFromBufferSize(frame->emitters_count)); G_Compute(frame->cl, V_EmitParticlesCS, V_ThreadGroupSizeFromBufferSize(frame->emitters_count));
// Sync particles & occluders // Sync particles & occluders
G_Barrier(frame->cl); G_DumbGlobalMemorySync(frame->cl);
// Transition albedo
G_DumbMemoryLayoutSync(frame->cl, albedo_target, G_Layout_DirectQueue_General);
} }
////////////////////////////// //////////////////////////////
@ -4943,7 +4959,7 @@ void V_TickForever(WaveLaneCtx *lane)
G_Compute(frame->cl, V_SimParticlesCS, V_ThreadGroupSizeFromBufferSize(V_ParticlesCap)); G_Compute(frame->cl, V_SimParticlesCS, V_ThreadGroupSizeFromBufferSize(V_ParticlesCap));
// Sync cells // Sync cells
G_Barrier(frame->cl); G_DumbGlobalMemorySync(frame->cl);
} }
////////////////////////////// //////////////////////////////
@ -4962,7 +4978,7 @@ void V_TickForever(WaveLaneCtx *lane)
G_Compute(frame->cl, V_CompositeCS, V_ThreadGroupSizeFromTexSize(frame->screen_dims)); G_Compute(frame->cl, V_CompositeCS, V_ThreadGroupSizeFromTexSize(frame->screen_dims));
// Sync screen tex // Sync screen tex
G_Barrier(frame->cl); G_DumbGlobalMemorySync(frame->cl);
} }
////////////////////////////// //////////////////////////////
@ -4984,7 +5000,7 @@ void V_TickForever(WaveLaneCtx *lane)
G_SetConstant(frame->cl, V_GpuConst_MipIdx, mip_idx); G_SetConstant(frame->cl, V_GpuConst_MipIdx, mip_idx);
G_Compute(frame->cl, V_BloomDownCS, V_ThreadGroupSizeFromTexSize(down_dims)); G_Compute(frame->cl, V_BloomDownCS, V_ThreadGroupSizeFromTexSize(down_dims));
G_Barrier(frame->cl); G_DumbGlobalMemorySync(frame->cl);
} }
//- Upsample passes //- Upsample passes
@ -4995,7 +5011,7 @@ void V_TickForever(WaveLaneCtx *lane)
G_SetConstant(frame->cl, V_GpuConst_MipIdx, mip_idx); G_SetConstant(frame->cl, V_GpuConst_MipIdx, mip_idx);
G_Compute(frame->cl, V_BloomUpCS, V_ThreadGroupSizeFromTexSize(up_dims)); G_Compute(frame->cl, V_BloomUpCS, V_ThreadGroupSizeFromTexSize(up_dims));
G_Barrier(frame->cl); G_DumbGlobalMemorySync(frame->cl);
} }
} }
@ -5005,13 +5021,15 @@ void V_TickForever(WaveLaneCtx *lane)
{ {
G_Compute(frame->cl, V_FinalizeCS, V_ThreadGroupSizeFromTexSize(frame->screen_dims)); G_Compute(frame->cl, V_FinalizeCS, V_ThreadGroupSizeFromTexSize(frame->screen_dims));
G_Barrier(frame->cl); G_DumbGlobalMemorySync(frame->cl);
} }
////////////////////////////// //////////////////////////////
//- Debug shapes pass //- Debug shapes pass
{ {
G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_RenderTarget);
G_Rasterize( G_Rasterize(
frame->cl, frame->cl,
V_DVertVS, V_DVertPS, V_DVertVS, V_DVertPS,
@ -5021,7 +5039,7 @@ void V_TickForever(WaveLaneCtx *lane)
G_RasterMode_TriangleList G_RasterMode_TriangleList
); );
G_Barrier(frame->cl); G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_General);
} }
////////////////////////////// //////////////////////////////

View File

@ -439,6 +439,7 @@ void SPR_TickAsync(WaveLaneCtx *lane, AsyncFrameLaneCtx *base_async_lane_frame)
gpu_perm, cl, gpu_perm, cl,
G_Format_R8G8B8A8_Unorm_Srgb, G_Format_R8G8B8A8_Unorm_Srgb,
atlas->dims, atlas->dims,
G_Layout_Simultaneous,
.name = Lit("Sprite atlas") .name = Lit("Sprite atlas")
); );
atlas->tex = G_PushTexture2DRef(gpu_perm, atlas->tex_res); atlas->tex = G_PushTexture2DRef(gpu_perm, atlas->tex_res);

View File

@ -2,7 +2,6 @@
//~ Win32 libs //~ Win32 libs
#pragma comment(lib, "dwrite") #pragma comment(lib, "dwrite")
#pragma comment(lib, "gdi32")
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
//~ DirectWrite types //~ DirectWrite types

View File

@ -1703,6 +1703,7 @@ void UI_EndFrame(UI_Frame *frame, i32 vsync)
frame->gpu_arena, frame->cl, frame->gpu_arena, frame->cl,
G_Format_R16G16B16A16_Float, G_Format_R16G16B16A16_Float,
monitor_size, monitor_size,
G_Layout_DirectQueue_RenderTarget,
.flags = G_ResourceFlag_AllowRenderTarget, .flags = G_ResourceFlag_AllowRenderTarget,
.name = Lit("UI draw target") .name = Lit("UI draw target")
); );
@ -1738,7 +1739,7 @@ void UI_EndFrame(UI_Frame *frame, i32 vsync)
G_SetConstant(frame->cl, UI_GpuConst_Params, params_ro); G_SetConstant(frame->cl, UI_GpuConst_Params, params_ro);
// Sync // Sync
G_Barrier(frame->cl); G_DumbGlobalMemorySync(frame->cl);
////////////////////////////// //////////////////////////////
//- Dispatch shaders //- Dispatch shaders
@ -1780,7 +1781,8 @@ void UI_EndFrame(UI_Frame *frame, i32 vsync)
//- Backbuffer blit pass //- Backbuffer blit pass
G_Barrier(frame->cl); G_DumbMemoryLayoutSync(frame->cl, draw_target, G_Layout_DirectQueue_General);
G_DumbMemoryLayoutSync(frame->cl, backbuffer, G_Layout_DirectQueue_RenderTarget);
{ {
G_Rasterize( G_Rasterize(
@ -1792,6 +1794,8 @@ void UI_EndFrame(UI_Frame *frame, i32 vsync)
G_RasterMode_TriangleList G_RasterMode_TriangleList
); );
} }
G_DumbMemoryLayoutSync(frame->cl, backbuffer, G_Layout_Common);
} }
////////////////////////////// //////////////////////////////

View File

@ -1,10 +1,5 @@
WND_W32_Ctx WND_W32 = Zi; WND_W32_Ctx WND_W32 = Zi;
////////////////////////////////////////////////////////////
//~ Win32 libs
#pragma comment(lib, "gdi32")
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
//~ @hookimpl Bootstrap //~ @hookimpl Bootstrap

926
tatus
View File

@ -1,926 +0,0 @@
diff --git a/src/gpu/gpu_common.c b/src/gpu/gpu_common.c
index a9686d87..43835793 100644
--- a/src/gpu/gpu_common.c
+++ b/src/gpu/gpu_common.c
@@ -25,7 +25,7 @@ void G_BootstrapCommon(void)
gpu_perm, cl,
G_Format_R8G8B8A8_Uint,
VEC2I32(8, 8),
- G_Layout_AnyQueue_ShaderRead_CopyRead_CopyWrite_Present,
+ G_Layout_Simultaneous,
.flags = G_ResourceFlag_ZeroMemory
);
G.blank_tex = G_PushTexture2DRef(gpu_perm, blank_tex);
@@ -44,7 +44,7 @@ void G_BootstrapCommon(void)
gpu_perm, cl,
G_Format_R16_Uint,
noise_dims,
- G_Layout_AnyQueue_ShaderRead_CopyRead_CopyWrite_Present
+ G_Layout_Simultaneous
);
G_CopyCpuToTexture(
cl,
@@ -143,30 +143,54 @@ G_ResourceHandle G_PushBufferFromCpuCopy_(G_ArenaHandle gpu_arena, G_CommandList

//- Mip

-i32 G_DimsFromMip1D(i32 texture_dims, i32 mip)
+i32 G_DimsFromMip1D(i32 mip0_dims, i32 mip)
{
- mip = ClampI32(mip, 0, 31);
+ mip = ClampI32(mip, -31, 31);
i32 result = 0;
- result = MaxI32(result >> mip, 1);
+ if (mip >= 0)
+ {
+ result = MaxI32(result >> mip, 1);
+ }
+ else
+ {
+ result = MaxI32(result << -mip, 1);
+ }
return result;
}

-Vec2I32 G_DimsFromMip2D(Vec2I32 texture_dims, i32 mip)
+Vec2I32 G_DimsFromMip2D(Vec2I32 mip0_dims, i32 mip)
{
- mip = ClampI32(mip, 0, 31);
+ mip = ClampI32(mip, -31, 31);
Vec2I32 result = Zi;
- result.x = MaxI32(texture_dims.x >> mip, 1);
- result.y = MaxI32(texture_dims.y >> mip, 1);
+ if (mip >= 0)
+ {
+ result.x = MaxI32(mip0_dims.x >> mip, 1);
+ result.y = MaxI32(mip0_dims.y >> mip, 1);
+ }
+ else
+ {
+ result.x = MaxI32(mip0_dims.x << -mip, 1);
+ result.y = MaxI32(mip0_dims.y << -mip, 1);
+ }
return result;
}

-Vec3I32 G_DimsFromMip3D(Vec3I32 texture_dims, i32 mip)
+Vec3I32 G_DimsFromMip3D(Vec3I32 mip0_dims, i32 mip)
{
- mip = ClampI32(mip, 0, 31);
+ mip = ClampI32(mip, -31, 31);
Vec3I32 result = Zi;
- result.x = MaxI32(texture_dims.x >> mip, 1);
- result.y = MaxI32(texture_dims.y >> mip, 1);
- result.z = MaxI32(texture_dims.z >> mip, 1);
+ if (mip >= 0)
+ {
+ result.x = MaxI32(mip0_dims.x >> mip, 1);
+ result.y = MaxI32(mip0_dims.y >> mip, 1);
+ result.z = MaxI32(mip0_dims.z >> mip, 1);
+ }
+ else
+ {
+ result.x = MaxI32(mip0_dims.x << -mip, 1);
+ result.y = MaxI32(mip0_dims.y << -mip, 1);
+ result.z = MaxI32(mip0_dims.z << -mip, 1);
+ }
return result;
}

diff --git a/src/gpu/gpu_common.h b/src/gpu/gpu_common.h
index eb3ee6d2..03927040 100644
--- a/src/gpu/gpu_common.h
+++ b/src/gpu/gpu_common.h
@@ -35,9 +35,9 @@ G_ResourceHandle G_PushBufferFromCpuCopy_(G_ArenaHandle gpu_arena, G_CommandList
G_PushBufferFromCpuCopy_((_arena), (_cl), (_src), (G_BufferDesc) { .size = (_src).len, __VA_ARGS__ })

//- Mip
-i32 G_DimsFromMip1D(i32 texture_dims, i32 mip);
-Vec2I32 G_DimsFromMip2D(Vec2I32 texture_dims, i32 mip);
-Vec3I32 G_DimsFromMip3D(Vec3I32 texture_dims, i32 mip);
+i32 G_DimsFromMip1D(i32 mip0_dims, i32 mip);
+Vec2I32 G_DimsFromMip2D(Vec2I32 mip0_dims, i32 mip);
+Vec3I32 G_DimsFromMip3D(Vec3I32 mip0_dims, i32 mip);

//- Viewport / scissor
Rng3 G_ViewportFromTexture(G_ResourceHandle texture);
diff --git a/src/gpu/gpu_core.h b/src/gpu/gpu_core.h
index 7e1b329a..bed18c93 100644
--- a/src/gpu/gpu_core.h
+++ b/src/gpu/gpu_core.h
@@ -242,18 +242,16 @@ Enum(G_Access)
G_Access_IndexBuffer = (1 << 8),
G_Access_IndirectArgument = (1 << 9),

- G_Access_All = 0xFFFFFFFF
+ G_Access_All = 0xFFFFFFFF // Represents all accesses relevant to the specified sync stage
};

Enum(G_Layout)
{
G_Layout_NoChange,

- // "Simultaneous" allows a resource to be used on any queue with any access
- // type, as long as there is only one writer at a time, and the writer is not
- // writing to any texels currently being read.
- // Resources cannot transition to/from this layout. They must be created
- // with it and are locked to it.
+ // Simultaneous layout allows a resource to be used on any queue with any
+ // access type (except depth-stencil). Resources cannot transition to/from
+ // this layout; they must be created with it.
G_Layout_Simultaneous, // D3D12_BARRIER_LAYOUT_COMMON + D3D12_RESOURCE_FLAG_ALLOW_SIMULTANEOUS_ACCESS

G_Layout_Undefined, // D3D12_BARRIER_LAYOUT_UNDEFINED
diff --git a/src/pp/pp_vis/pp_vis.lay b/src/pp/pp_vis/pp_vis.lay
index f72dc528..2d916376 100644
--- a/src/pp/pp_vis/pp_vis.lay
+++ b/src/pp/pp_vis/pp_vis.lay
@@ -26,7 +26,7 @@
@ComputeShader V_CompositeCS
@ComputeShader V_BloomDownCS
@ComputeShader V_BloomUpCS
-@ComputeShader V_PostProcessCS
+@ComputeShader V_FinalizeCS
@VertexShader V_DVertVS
@PixelShader V_DVertPS

diff --git a/src/pp/pp_vis/pp_vis_core.c b/src/pp/pp_vis/pp_vis_core.c
index f2f5e6b5..338036ba 100644
--- a/src/pp/pp_vis/pp_vis_core.c
+++ b/src/pp/pp_vis/pp_vis_core.c
@@ -416,7 +416,7 @@ void V_TickForever(WaveLaneCtx *lane)
gpu_perm, cl,
G_Format_R8_Uint,
tiles_dims,
- G_Layout_DirectQueue_ShaderRead,
+ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite,
.flags = G_ResourceFlag_ZeroMemory,
.name = Lit("Tiles")
);
@@ -441,7 +441,7 @@ void V_TickForever(WaveLaneCtx *lane)
gpu_perm, cl,
G_Format_R32_Uint,
cells_dims,
- G_Layout_DirectQueue_ShaderReadWrite,
+ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite,
.flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite,
.name = StringF(perm, "Particle cells - layer %F", FmtSint(layer))
);
@@ -454,7 +454,7 @@ void V_TickForever(WaveLaneCtx *lane)
gpu_perm, cl,
G_Format_R32_Uint,
cells_dims,
- G_Layout_DirectQueue_ShaderReadWrite,
+ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite,
.flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite,
.name = StringF(perm, "Particle densities - layer %F", FmtSint(layer))
);
@@ -469,7 +469,7 @@ void V_TickForever(WaveLaneCtx *lane)
gpu_perm, cl,
G_Format_R16G16B16A16_Float,
cells_dims,
- G_Layout_DirectQueue_ShaderReadWrite,
+ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite,
.flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite,
.name = Lit("Stains")
);
@@ -481,7 +481,7 @@ void V_TickForever(WaveLaneCtx *lane)
gpu_perm, cl,
G_Format_R16G16B16A16_Float,
cells_dims,
- G_Layout_DirectQueue_ShaderReadWrite,
+ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite,
.flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite,
.name = Lit("Dry stains")
);
@@ -493,7 +493,7 @@ void V_TickForever(WaveLaneCtx *lane)
gpu_perm, cl,
G_Format_R32_Float,
cells_dims,
- G_Layout_DirectQueue_ShaderReadWrite,
+ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite,
.flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite,
.name = Lit("Drynesses")
);
@@ -505,7 +505,7 @@ void V_TickForever(WaveLaneCtx *lane)
gpu_perm, cl,
G_Format_R32_Uint,
cells_dims,
- G_Layout_DirectQueue_ShaderReadWrite,
+ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite,
.flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite,
.name = Lit("Occluders cells")
);
@@ -614,6 +614,8 @@ void V_TickForever(WaveLaneCtx *lane)
frame->dt = SecondsFromNs(frame->dt_ns);
frame->rand = prev_frame->rand;

+ frame->should_tone_map = TweakBool("Tone mapping enabled", 1);
+
if (P_IsEntKeyNil(V.player_key))
{
TrueRand(StringFromStruct(&V.player_key));
@@ -4918,18 +4920,17 @@ void V_TickForever(WaveLaneCtx *lane)
frame->tile_descs[tile_kind] = tile_desc;
}
}
+
// Upload tiles
if (frame->tiles_dirty)
{
// LogDebugF("Uploading tiles to gpu");
- G_DumbMemoryLayoutSync(frame->cl, gpu_tiles_res, G_Layout_DirectQueue_CopyWrite);
G_CopyCpuToTexture(
frame->cl,
gpu_tiles_res, VEC3I32(0, 0, 0),
local_world->tiles, VEC3I32(tiles_dims.x, tiles_dims.y, 1),
RNG3I32(VEC3I32(0, 0, 0), VEC3I32(tiles_dims.x, tiles_dims.y, 1))
);
- G_DumbMemoryLayoutSync(frame->cl, gpu_tiles_res, G_Layout_DirectQueue_ShaderRead);
}

// Screen texture
@@ -4937,7 +4938,7 @@ void V_TickForever(WaveLaneCtx *lane)
frame->gpu_arena, frame->cl,
G_Format_R16G16B16A16_Float,
frame->screen_dims,
- G_Layout_DirectQueue_ShaderReadWrite,
+ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite,
.flags = G_ResourceFlag_AllowShaderReadWrite | G_ResourceFlag_AllowRenderTarget,
.name = StringF(frame->arena, "Screen target [%F]", FmtSint(frame->tick))
);
@@ -4951,11 +4952,10 @@ void V_TickForever(WaveLaneCtx *lane)
frame->gpu_arena, frame->cl,
G_Format_R16G16B16A16_Float,
G_DimsFromMip2D(G_Count2D(screen_target), 1),
- G_Layout_DirectQueue_ShaderReadWrite,
+ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite,
.flags = G_ResourceFlag_AllowShaderReadWrite | G_ResourceFlag_AllowRenderTarget,
.name = StringF(frame->arena, "Bloom target [%F]", FmtSint(frame->tick)),
- // .max_mips = 4
- .max_mips = 8
+ .max_mips = 64
);
for (i32 mip_idx = 0; mip_idx < G_CountMips(bloom_target); ++mip_idx)
{
@@ -4979,7 +4979,7 @@ void V_TickForever(WaveLaneCtx *lane)
frame->gpu_arena, frame->cl,
G_Format_R16G16B16A16_Float,
frame->shade_dims,
- G_Layout_DirectQueue_ShaderReadWrite,
+ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite,
.flags = G_ResourceFlag_AllowShaderReadWrite,
.name = StringF(frame->arena, "Shade target [%F]", FmtSint(frame->tick))
);
@@ -5091,6 +5091,9 @@ void V_TickForever(WaveLaneCtx *lane)

// Sync particles & occluders
G_DumbGlobalMemorySync(frame->cl);
+
+ // Transition albedo
+ G_DumbMemoryLayoutSync(frame->cl, albedo_target, G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite);
}

//////////////////////////////
@@ -5113,83 +5116,63 @@ void V_TickForever(WaveLaneCtx *lane)
G_Compute(frame->cl, V_ShadeCS, V_ThreadGroupSizeFromTexSize(frame->shade_dims));
}

- //////////////////////////////
- //- Transition G-buffers to readonly
-
- {
- G_DumbMemoryLayoutSync(frame->cl, albedo_target, G_Layout_DirectQueue_ShaderRead);
- G_DumbMemoryLayoutSync(frame->cl, shade_target, G_Layout_DirectQueue_ShaderRead);
- }
-
//////////////////////////////
//- Composite pass

{
G_Compute(frame->cl, V_CompositeCS, V_ThreadGroupSizeFromTexSize(frame->screen_dims));

- G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_ShaderRead);
+ // Sync screen tex
+ G_DumbGlobalMemorySync(frame->cl);
}

//////////////////////////////
//- Bloom passes

{
- i32 mips_count = G_CountMips(bloom_target);
+ i32 mips_count = G_CountMips(bloom_target) + 1;
+ G_SetConstant(frame->cl, V_GpuConst_MipsCount, mips_count);
+
+ // NOTE: Because the bloom mip chain starts at half screen size, mip_idx 0
+ // represents the screen texture itself, while mip_idx N (N >= 1) maps to
+ // index N - 1 in the bloom mip chain

//- Downsample + blur passes
- for (i32 mip_idx = 0; mip_idx < mips_count; ++mip_idx)
+ for (i32 mip_idx = 1; mip_idx < mips_count; ++mip_idx)
{
- Vec2I32 dims = G_DimsFromMip2D(G_Count2D(bloom_target), mip_idx);
- if (mip_idx == 0)
- {
- // Init bloom pyramid from screen target on first pass (prefilter)
- gpu_flags |= V_GpuFlag_InitBloom;
- G_SetConstant(frame->cl, V_GpuConst_Flags, gpu_flags);
- G_SetConstant(frame->cl, V_GpuConst_BloomRead, frame->screen_ro);
- }
- else
- {
- G_DumbMemoryLayoutSync(frame->cl, bloom_target, G_Layout_DirectQueue_ShaderRead, .mips = RNGI32(mip_idx - 1, mip_idx - 1));
- G_SetConstant(frame->cl, V_GpuConst_BloomRead, frame->bloom_mips_ro[mip_idx - 1]);
- }
- G_SetConstant(frame->cl, V_GpuConst_BloomWrite, frame->bloom_mips_rw[mip_idx]);
- {
- G_Compute(frame->cl, V_BloomDownCS, V_ThreadGroupSizeFromTexSize(dims));
- }
- gpu_flags &= ~V_GpuFlag_InitBloom;
- G_SetConstant(frame->cl, V_GpuConst_Flags, gpu_flags);
+ Vec2I32 down_dims = G_DimsFromMip2D(G_Count2D(screen_target), mip_idx);
+
+ G_SetConstant(frame->cl, V_GpuConst_MipIdx, mip_idx);
+ G_Compute(frame->cl, V_BloomDownCS, V_ThreadGroupSizeFromTexSize(down_dims));
+
+ G_DumbGlobalMemorySync(frame->cl);
}

//- Upsample passes
for (i32 mip_idx = mips_count - 2; mip_idx >= 0; --mip_idx)
{
- Vec2I32 dims = G_DimsFromMip2D(G_Count2D(bloom_target), mip_idx);
-
- G_DumbMemoryLayoutSync(frame->cl, bloom_target, G_Layout_DirectQueue_ShaderReadWrite, .mips = RNGI32(mip_idx, mip_idx));
- G_DumbMemoryLayoutSync(frame->cl, bloom_target, G_Layout_DirectQueue_ShaderRead, .mips = RNGI32(mip_idx + 1, mip_idx + 1));
+ Vec2I32 up_dims = G_DimsFromMip2D(G_Count2D(screen_target), mip_idx);

- G_SetConstant(frame->cl, V_GpuConst_BloomRead, frame->bloom_mips_ro[mip_idx + 1]);
- G_SetConstant(frame->cl, V_GpuConst_BloomWrite, frame->bloom_mips_rw[mip_idx]);
+ G_SetConstant(frame->cl, V_GpuConst_MipIdx, mip_idx);
+ G_Compute(frame->cl, V_BloomUpCS, V_ThreadGroupSizeFromTexSize(up_dims));

- G_Compute(frame->cl, V_BloomUpCS, V_ThreadGroupSizeFromTexSize(dims));
- }
+ G_DumbGlobalMemorySync(frame->cl);
+ }
}

//////////////////////////////
- //- Post process pass
+ //- Finalization pass

{
- G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_ShaderReadWrite);
- G_DumbMemoryLayoutSync(frame->cl, bloom_target, G_Layout_DirectQueue_ShaderRead, .mips = RNGI32(0, 0));
- G_Compute(frame->cl, V_PostProcessCS, V_ThreadGroupSizeFromTexSize(frame->screen_dims));
+ G_Compute(frame->cl, V_FinalizeCS, V_ThreadGroupSizeFromTexSize(frame->screen_dims));
}

//////////////////////////////
//- Debug shapes pass

- G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_RenderTargetWrite);
-
{
+ G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_RenderTargetWrite);
+
G_Rasterize(
frame->cl,
V_DVertVS, V_DVertPS,
@@ -5198,12 +5181,13 @@ void V_TickForever(WaveLaneCtx *lane)
screen_viewport, screen_scissor,
G_RasterMode_TriangleList
);
+
+ G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite);
}

//////////////////////////////
//- Finalize screen target

- G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_ShaderRead);
{
Rng2 uv = Zi;
uv.p0 = Vec2FromVec(screen_viewport.p0);
diff --git a/src/pp/pp_vis/pp_vis_gpu.g b/src/pp/pp_vis/pp_vis_gpu.g
index f8a254de..c0a9e47d 100644
--- a/src/pp/pp_vis/pp_vis_gpu.g
+++ b/src/pp/pp_vis/pp_vis_gpu.g
@@ -53,13 +53,6 @@ Vec4 V_ColorFromParticle(V_ParticleDesc desc, u32 particle_idx, u32 density)
return result;
}

-// ACES approximation by Krzysztof Narkowicz
-// https://knarkowicz.wordpress.com/2016/01/06/aces-filmic-tone-mapping-curve/
-Vec3 V_ToneMap(Vec3 v)
-{
- return saturate((v * (2.51f * v + 0.03f)) / (v * (2.43f * v + 0.59f) + 0.14f));
-}
-
////////////////////////////////////////////////////////////
//~ Prepare frame

@@ -142,11 +135,11 @@ ComputeShader2D(V_PrepareCellsCS, 8, 8)
}
else if (over_stain.a > 0)
{
- Vec4 stain = dry_stains[cell_pos];
Vec4 dry_stain = max(dry_stains[cell_pos], 0);
+ Vec4 stain = dry_stain;

- stain = BlendPremul(over_stain, stain);
dry_stain = BlendPremul(over_dry_stain, dry_stain);
+ stain = BlendPremul(over_stain, stain);

stains[cell_pos] = stain;
dry_stains[cell_pos] = dry_stain;
@@ -483,7 +476,7 @@ ComputeShader(V_SimParticlesCS, 64)
particle.prev_occluder = occluder;
}

- if (!AnyBit(desc.flags, V_ParticleFlag_NoPruneWhenStill) && dot(particle.velocity, particle.velocity) < 0.0001)
+ if (dot(particle.velocity, particle.velocity) < (desc.prune_speed_threshold * desc.prune_speed_threshold))
{
prune = 1;
}
@@ -723,7 +716,6 @@ ComputeShader2D(V_CompositeCS, 8, 8)
Vec4 ground_particle_color = 0;
Vec4 air_particle_color = 0;

-
for (V_ParticleLayer layer = (V_ParticleLayer)0; layer < V_ParticleLayer_COUNT; layer += (V_ParticleLayer)1)
{
RWTexture2D<u32> cells = G_Dereference<u32>(frame.particle_cells[layer]);
@@ -752,9 +744,9 @@ ComputeShader2D(V_CompositeCS, 8, 8)
// Darken wall particles / stains
if (tile == P_TileKind_Wall)
{
- ground_particle_color *= 0.25;
- air_particle_color *= 0.25;
- stain_color *= 0.25;
+ ground_particle_color *= 0.5;
+ air_particle_color *= 0.5;
+ stain_color *= 0.5;
}

//////////////////////////////
@@ -972,57 +964,73 @@ ComputeShader2D(V_CompositeCS, 8, 8)
////////////////////////////////////////////////////////////
//~ Bloom

+//////////////////////////////
+//- Downsample
+
ComputeShader2D(V_BloomDownCS, 8, 8)
{
+ i32 mips_count = V_GpuConst_MipsCount;
+ i32 mip_idx = V_GpuConst_MipIdx;
+
V_SharedFrame frame = G_Dereference<V_SharedFrame>(V_GpuConst_Frame)[0];
- Texture2D<Vec4> bloom_up = G_Dereference<Vec4>(V_GpuConst_BloomRead);
- RWTexture2D<Vec4> bloom_down = G_Dereference<Vec4>(V_GpuConst_BloomWrite);
SamplerState sampler = G_Dereference(frame.basic_samplers[G_BasicSamplerKind_BilinearClamp]);
+ RWTexture2D<Vec4> bloom_down = G_Dereference<Vec4>(frame.bloom_mips_rw[mip_idx - 1]);
+
+ Texture2D<Vec4> bloom_up;
+ b32 is_first_pass = mip_idx == 1;
+ if (is_first_pass)
+ {
+ bloom_up = G_Dereference<Vec4>(frame.screen_ro);
+ }
+ else
+ {
+ bloom_up = G_Dereference<Vec4>(frame.bloom_mips_ro[mip_idx - 2]);
+ }

- Vec2 up_dims = countof(bloom_up);
Vec2 down_dims = countof(bloom_down);

Vec2 bloom_pos = SV_DispatchThreadID + 0.5;
Vec2 bloom_uv = bloom_pos / down_dims;
Vec2 off_uv = 0.5 / down_dims;
- b32 is_first_pass = !!(V_GpuConst_Flags & V_GpuFlag_InitBloom);

- Struct(SampleDesc) { Vec2 uv; f32 weight; };
- SampleDesc samples[] = {
- { bloom_uv + Vec2(0, 0), 0.5 },
- { bloom_uv + Vec2(-off_uv.x, -off_uv.y), 0.125 },
- { bloom_uv + Vec2(off_uv.x, -off_uv.y), 0.125 },
- { bloom_uv + Vec2(off_uv.x, off_uv.y), 0.125 },
- { bloom_uv + Vec2(-off_uv.x, off_uv.y), 0.125 },
- };
+ f32 threshold = 0.25;
+ f32 knee = 0.75;

Vec4 result = 0;
- for (u32 sample_idx = 0; sample_idx < countof(samples); ++sample_idx)
{
- SampleDesc desc = samples[sample_idx];
- Vec4 src = bloom_up.SampleLevel(sampler, desc.uv, 0);
-
- f32 knee_weight = 1;
- if (is_first_pass)
+ Struct(SampleDesc) { Vec2 uv; f32 weight; };
+ SampleDesc samples[] = {
+ { bloom_uv + Vec2(0, 0), 0.5 },
+ { bloom_uv + Vec2(-off_uv.x, -off_uv.y), 0.125 },
+ { bloom_uv + Vec2(off_uv.x, -off_uv.y), 0.125 },
+ { bloom_uv + Vec2(off_uv.x, off_uv.y), 0.125 },
+ { bloom_uv + Vec2(-off_uv.x, off_uv.y), 0.125 },
+ };
+ for (u32 sample_idx = 0; sample_idx < countof(samples); ++sample_idx)
{
- f32 luminance = LuminanceFromColor(src);
- f32 max_rgb = max(max(src.r, src.g), src.b); // So that we can get bloom on colors with high rgb, not just high luminance
- f32 bright = max(luminance, (max_rgb - 1.0) * 0.5);
- if (bright > 0)
- {
- f32 threshold = 1.0;
- f32 knee = 0.5;
- f32 over_threshold = max(bright - threshold, 0.0);
- f32 ramp = saturate(over_threshold / knee);
- knee_weight = (over_threshold * ramp * ramp) / bright;
- }
- else
+ SampleDesc desc = samples[sample_idx];
+ Vec4 src = bloom_up.SampleLevel(sampler, desc.uv, 0);
+
+ f32 knee_weight = 1;
+ if (is_first_pass)
{
- knee_weight = 0;
+ f32 luminance = LuminanceFromColor(src);
+ f32 max_rgb = max(max(src.r, src.g), src.b); // So that we can get bloom on colors with high rgb, not just high luminance
+ f32 bright = max(luminance, (max_rgb - 1.0) * 0.5);
+ if (bright > 0)
+ {
+ f32 over_threshold = max(bright - threshold, 0.0);
+ f32 ramp = saturate(over_threshold / knee);
+ knee_weight = (over_threshold * ramp * ramp) / bright;
+ }
+ else
+ {
+ knee_weight = 0;
+ }
}
- }

- result += src * desc.weight * knee_weight;
+ result += src * desc.weight * knee_weight;
+ }
}

if (IsInside(bloom_pos, down_dims))
@@ -1031,52 +1039,77 @@ ComputeShader2D(V_BloomDownCS, 8, 8)
}
}

+//////////////////////////////
+//- Upsample
+
ComputeShader2D(V_BloomUpCS, 8, 8)
{
+ i32 mips_count = V_GpuConst_MipsCount;
+ i32 mip_idx = V_GpuConst_MipIdx;
+
V_SharedFrame frame = G_Dereference<V_SharedFrame>(V_GpuConst_Frame)[0];
- Texture2D<Vec4> bloom_down = G_Dereference<Vec4>(V_GpuConst_BloomRead);
- RWTexture2D<Vec4> bloom_up = G_Dereference<Vec4>(V_GpuConst_BloomWrite);
SamplerState sampler = G_Dereference(frame.basic_samplers[G_BasicSamplerKind_BilinearClamp]);
+ Texture2D<Vec4> bloom_down = G_Dereference<Vec4>(frame.bloom_mips_ro[mip_idx]);
+
+ b32 is_last_pass = mip_idx == 0;
+ RWTexture2D<Vec4> bloom_up;
+ if (is_last_pass)
+ {
+ bloom_up = G_Dereference<Vec4>(frame.screen_rw);
+ }
+ else
+ {
+ bloom_up = G_Dereference<Vec4>(frame.bloom_mips_rw[mip_idx - 1]);
+ }

- Vec2 up_dims = countof(bloom_up);
Vec2 down_dims = countof(bloom_down);
+ Vec2 up_dims = countof(bloom_up);

Vec2 bloom_pos = SV_DispatchThreadID + 0.5;
Vec2 bloom_uv = bloom_pos / up_dims;
- Vec2 off_uv = 1 / up_dims;
+ Vec2 off_uv0 = 1 / down_dims;
+ Vec2 off_uv1 = off_uv0 * 2;

Vec4 result = 0;
{
// Center
- result += bloom_down.SampleLevel(sampler, bloom_uv, 0) * 4;
- // Edges
+ result += bloom_down.SampleLevel(sampler, bloom_uv, 0) * 9.0f / 41.0f;
+
+ // Outer Edges
result += (
- bloom_down.SampleLevel(sampler, bloom_uv + Vec2(0, -off_uv.y), 0) +
- bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv.x, 0), 0) +
- bloom_down.SampleLevel(sampler, bloom_uv + Vec2(0, off_uv.y), 0) +
- bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv.x, 0), 0)
- ) * 2;
- // Corners
+ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(0, -off_uv1.y), 0) +
+ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv1.x, 0), 0) +
+ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(0, off_uv1.y), 0) +
+ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv1.x, 0), 0)
+ ) * 3.0f / 41.0f;
+
+ // Inner corners
+ result += (
+ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv0.x, -off_uv0.y), 0) +
+ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv0.x, -off_uv0.y), 0) +
+ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv0.x, off_uv0.y), 0) +
+ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv0.x, off_uv0.y), 0)
+ ) * 4.0f / 41.0f;
+
+ // Outer corners
result += (
- bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv.x, -off_uv.y), 0) +
- bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv.x, -off_uv.y), 0) +
- bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv.x, off_uv.y), 0) +
- bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv.x, off_uv.y), 0)
- );
- // Normalize
- result /= 16;
+ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv1.x, -off_uv1.y), 0) +
+ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv1.x, -off_uv1.y), 0) +
+ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv1.x, off_uv1.y), 0) +
+ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv1.x, off_uv1.y), 0)
+ ) * 1.0f / 41.0f;
}

if (IsInside(bloom_pos, up_dims))
{
- bloom_up[bloom_pos] += result;
+ bloom_up[bloom_pos] += result * 0.75;
}
}

////////////////////////////////////////////////////////////
-//~ Post process
+//~ Finalize

-ComputeShader2D(V_PostProcessCS, 8, 8)
+ComputeShader2D(V_FinalizeCS, 8, 8)
{
V_SharedFrame frame = G_Dereference<V_SharedFrame>(V_GpuConst_Frame)[0];
SamplerState bilinear_sampler = G_Dereference(frame.basic_samplers[G_BasicSamplerKind_BilinearClamp]);
@@ -1084,42 +1117,21 @@ ComputeShader2D(V_PostProcessCS, 8, 8)
RWTexture2D<Vec4> screen_tex = G_Dereference<Vec4>(frame.screen_rw);

Vec2 screen_pos = SV_DispatchThreadID + 0.5;
- Vec2 screen_uv = screen_pos / frame.screen_dims;
b32 is_in_screen = IsInside(screen_pos, frame.screen_dims);
-
- //////////////////////////////
- //- Original
-
- Vec4 original = 0;
if (is_in_screen)
{
- original = screen_tex[screen_pos];
- original.rgb *= original.a;
- }
+ Vec4 result = screen_tex[screen_pos];

+ //- Tone map
+ if (frame.should_tone_map)
+ {
+ // ACES approximation by Krzysztof Narkowicz
+ // https://knarkowicz.wordpress.com/2016/01/06/aces-filmic-tone-mapping-curve/
+ result.rgb = saturate((result.rgb * (2.51f * result.rgb + 0.03f)) / (result.rgb * (2.43f * result.rgb + 0.59f) + 0.14f));
+ }

- //////////////////////////////
- //- Bloom
-
- Vec4 bloom = 0;
- if (is_in_screen)
- {
- bloom = bloom_tex.SampleLevel(bilinear_sampler, screen_uv, 0);
- // bloom.rgb *= bloom.a;
- }
-
- //////////////////////////////
- //- Compose
-
- Vec4 result = Vec4(0, 0, 0, 1);
- result = BlendPremul(original, result);
- result += bloom;
- // result.rgb = V_ToneMap(result);
+ result = Unpremul(result);

- result = Unpremul(result);
-
- if (is_in_screen)
- {
screen_tex[screen_pos] = result;
}
}
diff --git a/src/pp/pp_vis/pp_vis_gpu.gh b/src/pp/pp_vis/pp_vis_gpu.gh
index a47a2335..f176f2f8 100644
--- a/src/pp/pp_vis/pp_vis_gpu.gh
+++ b/src/pp/pp_vis/pp_vis_gpu.gh
@@ -46,7 +46,6 @@ Struct(V_DVertPSOutput)

f32 V_RandFromPos(Vec3 pos);
Vec4 V_ColorFromParticle(V_ParticleDesc desc, u32 particle_idx, u32 density);
-Vec3 V_ToneMap(Vec3 v);

////////////////////////////////////////////////////////////
//~ Shaders
@@ -73,8 +72,8 @@ ComputeShader2D(V_CompositeCS, 8, 8);
ComputeShader2D(V_BloomDownCS, 8, 8);
ComputeShader2D(V_BloomUpCS, 8, 8);

-//- Post process
-ComputeShader2D(V_PostProcessCS, 8, 8);
+//- Finalize
+ComputeShader2D(V_FinalizeCS, 8, 8);

//- Debug shapes
VertexShader(V_DVertVS, V_DVertPSInput);
diff --git a/src/pp/pp_vis/pp_vis_shared.cg b/src/pp/pp_vis/pp_vis_shared.cg
index 2419a6f2..72f6ae8d 100644
--- a/src/pp/pp_vis/pp_vis_shared.cg
+++ b/src/pp/pp_vis/pp_vis_shared.cg
@@ -11,37 +11,42 @@ V_ParticleDesc V_DescFromParticleKind(V_ParticleKind kind)
V_ParticleDesc result;
{
PERSIST Readonly V_ParticleFlag flags[V_ParticleKind_COUNT] = {
- #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) flags,
+ #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) flags,
V_ParticlesXList(X)
#undef X
};
PERSIST Readonly V_ParticleLayer layers[V_ParticleKind_COUNT] = {
- #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) layer,
+ #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) layer,
V_ParticlesXList(X)
#undef X
};
PERSIST Readonly f32 stain_rates[V_ParticleKind_COUNT] = {
- #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) stain_rate,
+ #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) stain_rate,
V_ParticlesXList(X)
#undef X
};
PERSIST Readonly f32 pen_rates[V_ParticleKind_COUNT] = {
- #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) pen_rate,
+ #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) pen_rate,
V_ParticlesXList(X)
#undef X
};
PERSIST Readonly f32 lifetimes[V_ParticleKind_COUNT] = {
- #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) lifetime,
+ #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) lifetime,
+ V_ParticlesXList(X)
+ #undef X
+ };
+ PERSIST Readonly f32 prune_speed_thresholds[V_ParticleKind_COUNT] = {
+ #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) prune_speed_threshold,
V_ParticlesXList(X)
#undef X
};
PERSIST Readonly Vec4 base_colors[V_ParticleKind_COUNT] = {
- #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) base_color,
+ #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) base_color,
V_ParticlesXList(X)
#undef X
};
PERSIST Readonly Vec4 dry_factor[V_ParticleKind_COUNT] = {
- #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) dry_factor,
+ #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) dry_factor,
V_ParticlesXList(X)
#undef X
};
@@ -51,6 +56,7 @@ V_ParticleDesc V_DescFromParticleKind(V_ParticleKind kind)
result.stain_rate = stain_rates[kind];
result.pen_rate = pen_rates[kind];
result.lifetime = lifetimes[kind];
+ result.prune_speed_threshold = prune_speed_thresholds[kind];
result.base_color = LinearFromSrgb(base_colors[kind]);
result.dry_factor = LinearFromSrgb(dry_factor[kind]);
}
diff --git a/src/pp/pp_vis/pp_vis_shared.cgh b/src/pp/pp_vis/pp_vis_shared.cgh
index 16ca6419..71d88ea5 100644
--- a/src/pp/pp_vis/pp_vis_shared.cgh
+++ b/src/pp/pp_vis/pp_vis_shared.cgh
@@ -9,14 +9,13 @@
Enum(V_GpuFlag)
{
V_GpuFlag_None = 0,
- V_GpuFlag_InitBloom = (1 << 0),
};

G_DeclConstant(V_GpuFlag, V_GpuConst_Flags, 0);
G_DeclConstant(G_StructuredBufferRef, V_GpuConst_Frame, 1);
G_DeclConstant(G_Texture3DRef, V_GpuConst_NoiseTex, 2);
-G_DeclConstant(G_Texture2DRef, V_GpuConst_BloomRead, 3);
-G_DeclConstant(G_RWTexture2DRef, V_GpuConst_BloomWrite, 4);
+G_DeclConstant(i32, V_GpuConst_MipsCount, 3);
+G_DeclConstant(i32, V_GpuConst_MipIdx, 4);

////////////////////////////////////////////////////////////
//~ Particle types
@@ -29,7 +28,6 @@ G_DeclConstant(G_RWTexture2DRef, V_GpuConst_BloomWrite, 4);
Enum(V_ParticleFlag)
{
V_ParticleFlag_None = 0,
- V_ParticleFlag_NoPruneWhenStill = (1 << 0),
V_ParticleFlag_StainWhenPruned = (1 << 1),
V_ParticleFlag_NoReflect = (1 << 2),
V_ParticleFlag_OnlyCollideWithWalls = (1 << 3),
@@ -53,6 +51,7 @@ Enum(V_ParticleLayer)
/* Layer */ V_ParticleLayer_Ground, \
/* Stain rate, pen chance */ 30, 0, \
/* Lifetime */ Inf, \
+ /* Prune speed threshold */ 0.01, \
/* Base color */ CompVec4(0, 0, 0, 0), \
/* Dry color factor */ CompVec4(1, 1, 1, 1) \
) \
@@ -64,8 +63,9 @@ Enum(V_ParticleLayer)
/* Layer */ V_ParticleLayer_Ground, \
/* Stain rate, pen chance */ 100, 0.25, \
/* Lifetime */ Inf, \
- /* Base color */ CompVec4(0.5, 0.1, 0.1, 0.05), \
- /* Dry color factor */ CompVec4(0.5, 0.5, 0.5, 1) \
+ /* Prune speed threshold */ 0.5, \
+ /* Base color */ CompVec4(0.6, 0.1, 0.1, 0.05), \
+ /* Dry color factor */ CompVec4(0.4, 0.4, 0.4, 1) \
) \
X( \
/* Name */ BloodDebris, \
@@ -73,6 +73,7 @@ Enum(V_ParticleLayer)
/* Layer */ V_ParticleLayer_Mid, \
/* Stain rate, pen chance */ 30, 0, \
/* Lifetime */ Inf, \
+ /* Prune speed threshold */ 0.01, \
/* Base color */ CompVec4(0.5, 0.1, 0.1, 0.8), \
/* Dry color factor */ CompVec4(1, 1, 1, 1) \
) \
@@ -82,6 +83,7 @@ Enum(V_ParticleLayer)
/* Layer */ V_ParticleLayer_Mid, \
/* Stain rate, pen chance */ 0, 0, \
/* Lifetime */ Inf, \
+ /* Prune speed threshold */ 0.01, \
/* Base color */ CompVec4(0.4, 0.3, 0.2, 1), \
/* Dry color factor */ CompVec4(1, 1, 1, 1) \
) \
@@ -91,6 +93,7 @@ Enum(V_ParticleLayer)
/* Layer */ V_ParticleLayer_Mid, \
/* Stain rate, pen chance */ 0, 0, \
/* Lifetime */ Inf, \
+ /* Prune speed threshold */ 0.1, \
/* Base color */ CompVec4(2, 0.5, 0, 1), \
/* Dry color factor */ CompVec4(0.2, 0.1, 0.0, 1) \
) \
@@ -102,6 +105,7 @@ Enum(V_ParticleLayer)
/* Layer */ V_ParticleLayer_Mid, \
/* Stain rate, pen chance */ 0, 0, \
/* Lifetime */ 0.075, \
+ /* Prune speed threshold */ 0.01, \
/* Base color */ CompVec4(0.8, 0.6, 0.2, 0.25), \
/* Dry color factor */ CompVec4(1, 1, 1, 1) \
) \
@@ -111,6 +115,7 @@ Enum(V_ParticleLayer)
/* Layer */ V_ParticleLayer_Air, \
/* Stain rate, pen chance */ 0, 0, \
/* Lifetime */ Inf, \
+ /* Prune speed threshold */ 0.01, \
/* Base color */ CompVec4(0.25, 0.25, 0.25, 0.75), \
/* Dry color factor */ CompVec4(1, 1, 1, 1) \
) \
@@ -122,6 +127,7 @@ Enum(V_ParticleLayer)
/* Layer */ V_ParticleLayer_Mid, \
/* Stain rate, pen chance */ 0, 0, \
/* Lifetime */ Inf, \
+ /* Prune speed threshold */ 0.01, \
/* Base color */ CompVec4(1, 1, 0, 1), \
/* Dry color factor */ CompVec4(1, 1, 1, 1) \
) \
@@ -168,6 +174,7 @@ Struct(V_ParticleDesc)
f32 stain_rate;
f32 pen_rate;
f32 lifetime;
+ f32 prune_speed_threshold;
Vec4 base_color;
Vec4 dry_factor;
};
@@ -264,6 +271,7 @@ Struct(V_SharedFrame)

b32 tiles_dirty;
b32 should_clear_particles;
+ b32 should_tone_map;

b32 is_looking;
b32 is_moving;