JBR-7725 Vulkan: low performance in SwingMark

Removed extra synchronization in blits and an extra copy

(cherry picked from commit 5411f2a1df5e6a6ed60c27837b414799b80d6fa5)
Alexey Ushakov
2025-03-24 11:11:34 +01:00
parent e948e68a1c
commit f14bfe0de9
5 changed files with 165 additions and 48 deletions
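In outline, the fix below replaces the per-blit flush/sync/destroy sequence with disposal deferred to the renderer's cleanup queue. A minimal before/after sketch using the function names from the diff (the surrounding variables are as in VKBlitSwToTextureViaPooledTexture):

// Before: each blit stalled the CPU until the GPU finished, so that
// the pooled texture and staging buffer could be freed immediately.
VKRenderer_Flush(device->renderer);
VKRenderer_Sync(device->renderer);
VKTexturePoolHandle_ReleaseTexture(hnd);
VKBuffer_Destroy(device, buffer);

// After: no sync; resources are queued and reclaimed once the
// submitted batch completes on the GPU.
VKRenderer_DisposeOnCleanup(device->renderer, VKTexturePoolTexture_Dispose, hnd);
VKRenderer_DisposeOnCleanup(device->renderer, VKBuffer_Dispose, buffer);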


@@ -95,6 +95,11 @@ static AlphaType getSrcAlphaType(jshort srctype) {
ALPHA_TYPE_PRE_MULTIPLIED : ALPHA_TYPE_STRAIGHT;
}
static void VKTexturePoolTexture_Dispose(VKDevice* device, void* ctx) {
VKTexturePoolHandle* hnd = (VKTexturePoolHandle*) ctx;
VKTexturePoolHandle_ReleaseTexture(hnd);
}
static void VKBlitSwToTextureViaPooledTexture(VKRenderingContext* context,
VKSDOps *dstOps,
const SurfaceDataRasInfo *srcInfo, jshort srctype, jint hint,
@@ -104,8 +109,6 @@ static void VKBlitSwToTextureViaPooledTexture(VKRenderingContext* context,
const int sw = srcInfo->bounds.x2 - srcInfo->bounds.x1;
const int sh = srcInfo->bounds.y2 - srcInfo->bounds.y1;
const int dw = dx2 - dx1;
const int dh = dy2 - dy1;
ARRAY(VKTxVertex) vertices = ARRAY_ALLOC(VKTxVertex, 4);
/*
@@ -129,19 +132,18 @@ static void VKBlitSwToTextureViaPooledTexture(VKRenderingContext* context,
VKBuffer* renderVertexBuffer = ARRAY_TO_VERTEX_BUF(device, vertices);
ARRAY_FREE(vertices);
const char *raster = srcInfo->rasBase;
raster += (uint32_t)srcInfo->bounds.y1 * (uint32_t)srcInfo->scanStride + (uint32_t)srcInfo->bounds.x1 * (uint32_t)srcInfo->pixelStride;
J2dTraceLn4(J2D_TRACE_VERBOSE, "replaceTextureRegion src (dw, dh) : [%d, %d] dest (dx1, dy1) =[%d, %d]",
dw, dh, dx1, dy1);
uint32_t dataSize = sw * sh * srcInfo->pixelStride;
char* data = malloc(dataSize);
// copy src pixels inside src bounds to buffer
for (int row = 0; row < sh; row++) {
memcpy(data + (row * sw * srcInfo->pixelStride), raster, sw * srcInfo->pixelStride);
raster += (uint32_t)srcInfo->scanStride;
}
VKBuffer *buffer = VKBuffer_CreateFromData(device, data, dataSize);
free(data);
(dx2 - dx1), (dy2 - dy1), dx1, dy1);
VKBuffer *buffer =
VKBuffer_CreateFromRaster(device, (VKBuffer_RasterInfo){
.data = srcInfo->rasBase,
.x1 = srcInfo->bounds.x1,
.y1 = srcInfo->bounds.y1,
.w = sw,
.h = sh,
.pixelStride = srcInfo->pixelStride,
.scanStride = srcInfo->scanStride
}, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_READ_BIT);
VkCommandBuffer cb = VKRenderer_Record(device->renderer);
{
@@ -181,15 +183,9 @@ static void VKBlitSwToTextureViaPooledTexture(VKRenderingContext* context,
VkDescriptorSet srcDescriptorSet = VKImage_GetDescriptorSet(device, src, type.format, type.swizzle);
VKRenderer_TextureRender(srcDescriptorSet, renderVertexBuffer->handle, 4, hint, SAMPLER_WRAP_BORDER);
// TODO: Not optimal but required for releasing raster buffer. Such Buffers should also be managed by special pools
VKRenderer_FlushSurface(dstOps);
VKRenderer_Flush(device->renderer);
VKRenderer_Sync(device->renderer);
// TODO: Track lifecycle of the texture to avoid reuse of occupied texture
VKTexturePoolHandle_ReleaseTexture(hnd);
VKBuffer_Destroy(device, buffer);
// TODO: Add proper sync for renderVertexBuffer
// VKBuffer_Destroy(device, renderVertexBuffer);
VKRenderer_DisposeOnCleanup(device->renderer, VKTexturePoolTexture_Dispose, hnd);
VKRenderer_DisposeOnCleanup(device->renderer, VKBuffer_Dispose, buffer);
}
static void VKBlitTextureToTexture(VKRenderingContext* context, VKImage* src, VkBool32 srcOpaque, jint hint,
@@ -246,12 +242,8 @@ static void VKBlitTextureToTexture(VKRenderingContext* context, VKImage* src, Vk
VkDescriptorSet srcDescriptorSet = VKImage_GetDescriptorSet(device, src, src->format, srcOpaque ? OPAQUE_SWIZZLE : 0);
VKRenderer_TextureRender(srcDescriptorSet, renderVertexBuffer->handle, 4, hint, SAMPLER_WRAP_BORDER);
// TODO: Not optimal but required for releasing raster buffer. Such Buffers should also be managed by special pools
// TODO: Also, consider using VKRenderer_FlushRenderPass here to process pending command
VKRenderer_Flush(device->renderer);
VKRenderer_Sync(device->renderer);
// TODO: Add proper sync for renderVertexBuffer
// VKBuffer_Destroy(device, renderVertexBuffer);
VKRenderer_FlushSurface(context->surface);
VKRenderer_DisposeOnCleanup(device->renderer, VKBuffer_Dispose, renderVertexBuffer);
}
static jboolean clipDestCoords(


@@ -30,6 +30,11 @@
#include "VKAllocator.h"
#include "VKBuffer.h"
#include "VKDevice.h"
#include "VKRenderer.h"
#define VK_BUFFER_HOST_COHERENT_MEMORY
const size_t VK_BUFFER_CREATE_THRESHOLD = 0xDC000;
static VKMemory VKBuffer_DestroyBuffersOnFailure(VKDevice* device, VKMemory page, uint32_t bufferCount, VKBuffer* buffers) {
assert(device != NULL && device->allocator != NULL);
@@ -220,41 +225,82 @@ VKBuffer* VKBuffer_Create(VKDevice* device, VkDeviceSize size,
VKBuffer_Destroy(device, buffer);
return NULL;
}
buffer->lastStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
buffer->lastAccess = 0;
return buffer;
}
VKBuffer* VKBuffer_CreateFromData(VKDevice* device, void* vertices, VkDeviceSize bufferSize)
void VKBuffer_Dispose(VKDevice* device, void* data) {
VKBuffer* buffer = (VKBuffer*) data;
VKBuffer_Destroy(device, buffer);
}
VKBuffer *VKBuffer_CreateFromRaster(VKDevice *device,
VKBuffer_RasterInfo info,
VkPipelineStageFlags stage,
VkAccessFlags access)
{
VKBuffer* buffer = VKBuffer_Create(device, bufferSize,
VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
uint32_t dataSize = info.w * info.h * info.pixelStride;
VKBuffer *buffer = VKBuffer_Create(device, dataSize,
VK_BUFFER_USAGE_TRANSFER_DST_BIT |
VK_BUFFER_USAGE_VERTEX_BUFFER_BIT |
VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT
#ifdef VK_BUFFER_HOST_COHERENT_MEMORY
| VK_MEMORY_PROPERTY_HOST_COHERENT_BIT
#endif
);
void* data;
VK_IF_ERROR(device->vkMapMemory(device->handle, buffer->range.memory, 0, VK_WHOLE_SIZE, 0, &data)) {
VKBuffer_Destroy(device, buffer);
return NULL;
}
memcpy(data, vertices, bufferSize);
VkMappedMemoryRange memoryRange = {
.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
.pNext = NULL,
.memory = buffer->range.memory,
.offset = 0,
.size = VK_WHOLE_SIZE
};
char* raster = (char*)info.data + info.y1 * info.scanStride + info.x1 * info.pixelStride;
VK_IF_ERROR(device->vkFlushMappedMemoryRanges(device->handle, 1, &memoryRange)) {
VKBuffer_Destroy(device, buffer);
return NULL;
// copy src pixels inside src bounds to buffer
for (size_t row = 0; row < info.h; row++) {
memcpy((char*)data + (row * info.w * info.pixelStride), raster, info.w * info.pixelStride);
raster += (uint32_t) info.scanStride;
}
device->vkUnmapMemory(device->handle, buffer->range.memory);
#ifndef VK_BUFFER_HOST_COHERENT_MEMORY
device->vkFlushMappedMemoryRanges(device->handle, 1, &buffer->range);
#endif
device->vkUnmapMemory(device->handle, buffer->range.memory);
{
VkCommandBuffer cb = VKRenderer_Record(device->renderer);
VkBufferMemoryBarrier barrier;
VKBarrierBatch barrierBatch = {};
VKRenderer_AddBufferBarrier(&barrier, &barrierBatch, buffer,
stage, access);
if (barrierBatch.barrierCount > 0) {
device->vkCmdPipelineBarrier(cb, barrierBatch.srcStages,
barrierBatch.dstStages,
0, 0, NULL,
barrierBatch.barrierCount, &barrier,
0, NULL);
}
}
return buffer;
}
VKBuffer* VKBuffer_CreateFromData(VKDevice* device, void* data, VkDeviceSize dataSize,
VkPipelineStageFlags stage, VkAccessFlags access) {
return VKBuffer_CreateFromRaster(device, (VKBuffer_RasterInfo) {
.data = data,
.w = dataSize,
.h = 1,
.scanStride = dataSize,
.pixelStride = 1
}, stage, access);
}
void VKBuffer_Destroy(VKDevice* device, VKBuffer* buffer) {
if (buffer != NULL) {
if (buffer->handle != VK_NULL_HANDLE) {

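For context on VKBuffer_CreateFromRaster above: the row-by-row copy exists because the source raster may be padded, i.e. scanStride can exceed w * pixelStride, so copying the whole region with one memcpy would drag in padding between rows. A hedged usage sketch with hypothetical dimensions (pixels is an assumed caller-provided raster base):

// Hypothetical: upload a 100x50 RGBA sub-region at (10, 20) of a
// wider image whose rows span 4096 bytes, padding included.
VKBuffer_RasterInfo info = {
    .data = pixels,       // assumption: raster base pointer
    .x1 = 10, .y1 = 20,   // sub-region origin within the raster
    .w = 100, .h = 50,
    .pixelStride = 4,     // bytes per RGBA pixel
    .scanStride = 4096    // bytes per full source row
};
VKBuffer* staging = VKBuffer_CreateFromRaster(device, info,
        VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_READ_BIT);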

@@ -30,10 +30,13 @@
#include "VKTypes.h"
#define ARRAY_TO_VERTEX_BUF(device, vertices) \
VKBuffer_CreateFromData(device, vertices, ARRAY_SIZE(vertices)*sizeof (vertices[0]))
VKBuffer_CreateFromData(device, vertices, ARRAY_SIZE(vertices)*sizeof (vertices[0]),\
VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT)
struct VKBuffer {
VkBuffer handle;
VkPipelineStageFlagBits lastStage;
VkAccessFlagBits lastAccess;
// Buffer has no ownership over its memory.
// Provided memory, offset and size must only be used to flush memory writes.
// Allocation and freeing is done in pages.
@@ -48,6 +51,13 @@ struct VKTexelBuffer {
VkDescriptorSet descriptorSet;
};
typedef struct {
void* data;
size_t x1, y1, w, h;
size_t scanStride;
size_t pixelStride;
} VKBuffer_RasterInfo;
/**
* Create buffers, allocate a memory page and bind them together.
* 'pageSize' can be 0, meaning that page size is calculated based on buffer memory requirements.
@@ -75,9 +85,15 @@ VKBuffer* VKBuffer_Create(VKDevice* device, VkDeviceSize size,
VkBufferUsageFlags usage, VkMemoryPropertyFlags properties);
// TODO usage of this function is suboptimal, we need to avoid creating one-time buffers.
VKBuffer* VKBuffer_CreateFromData(VKDevice* device, void* vertices, VkDeviceSize bufferSize);
VKBuffer* VKBuffer_CreateFromData(VKDevice* device, void* data, VkDeviceSize dataSize,
VkPipelineStageFlags stage, VkAccessFlags access);
VKBuffer* VKBuffer_CreateFromRaster(VKDevice* device, VKBuffer_RasterInfo info,
VkPipelineStageFlags stage, VkAccessFlags access);
// TODO usage of this function is suboptimal, we need to avoid destroying individual buffers.
void VKBuffer_Destroy(VKDevice* device, VKBuffer* buffer);
void VKBuffer_Dispose(VKDevice* device, void* ctx);
#endif // VKBuffer_h_Included
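A short usage sketch of the updated API via the ARRAY_TO_VERTEX_BUF macro above; the vertex type and count are illustrative, mirroring the blit code earlier in this commit:

// The macro expands to VKBuffer_CreateFromData with the vertex-input
// stage/access flags, so the returned buffer already has a barrier
// recorded for vertex attribute reads.
ARRAY(VKTxVertex) vertices = ARRAY_ALLOC(VKTxVertex, 4);
// ... append four vertices ...
VKBuffer* renderVertexBuffer = ARRAY_TO_VERTEX_BUF(device, vertices);
ARRAY_FREE(vertices);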


@@ -53,6 +53,12 @@ RING_BUFFER(struct PoolEntry_ ## NAME { \
(VAR) = RING_BUFFER_FRONT((RENDERER)->NAME)->value; RING_BUFFER_POP_FRONT((RENDERER)->NAME); \
}} while(0)
/**
* Check if there are available items in the pool.
*/
#define POOL_NOT_EMPTY(RENDERER, NAME) \
(VKRenderer_CheckPoolEntryAvailable((RENDERER), RING_BUFFER_FRONT((RENDERER)->NAME)))
/**
* Return an item to the pool. It will only become available again
* after the next submitted batch of work completes execution on GPU.
@@ -83,6 +89,11 @@ RING_BUFFER(struct PoolEntry_ ## NAME { \
*/
#define POOL_FREE(RENDERER, NAME) RING_BUFFER_FREE((RENDERER)->NAME)
typedef struct {
VKCleanupHandler handler;
void* data;
} VKCleanupEntry;
/**
* Renderer attached to device.
*/
@@ -96,6 +107,7 @@ struct VKRenderer {
POOL(VKBuffer, vertexBufferPool);
POOL(VKTexelBuffer, maskFillBufferPool);
POOL(VkFramebuffer, framebufferDestructionQueue);
POOL(VKCleanupEntry, cleanupQueue);
ARRAY(VKMemory) bufferMemoryPages;
ARRAY(VkDescriptorPool) descriptorPools;
ARRAY(VkDescriptorPool) imageDescriptorPools;
@@ -436,6 +448,13 @@ void VKRenderer_Destroy(VKRenderer* renderer) {
static void VKRenderer_CleanupPendingResources(VKRenderer* renderer) {
VKDevice* device = renderer->device;
while (POOL_NOT_EMPTY(renderer, cleanupQueue)) {
VKCleanupEntry entry;
POOL_TAKE(renderer, cleanupQueue, entry);
entry.handler(device, entry.data);
}
for (;;) {
VkFramebuffer framebuffer = VK_NULL_HANDLE;
POOL_TAKE(renderer, framebufferDestructionQueue, framebuffer);
@@ -579,6 +598,33 @@ void VKRenderer_AddImageBarrier(VkImageMemoryBarrier* barriers, VKBarrierBatch*
}
}
/**
* Prepare buffer barrier info to be executed in batch, if needed.
*/
void VKRenderer_AddBufferBarrier(VkBufferMemoryBarrier* barriers, VKBarrierBatch* batch,
VKBuffer* buffer, VkPipelineStageFlags stage,
VkAccessFlags access)
{
assert(barriers != NULL && batch != NULL && buffer != NULL);
if (stage != buffer->lastStage || access != buffer->lastAccess) {
barriers[batch->barrierCount] = (VkBufferMemoryBarrier) {
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
.srcAccessMask = buffer->lastAccess,
.dstAccessMask = access,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.buffer = buffer->handle,
.offset = 0,
.size = VK_WHOLE_SIZE
};
batch->barrierCount++;
batch->srcStages |= buffer->lastStage;
batch->dstStages |= stage;
buffer->lastStage = stage;
buffer->lastAccess = access;
}
}
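Because lastStage/lastAccess are updated on the buffer, a repeated transition to the same stage/access pair adds nothing to the batch. A brief sketch of the intended call pattern, assuming a recorded command buffer cb and a buffer as in VKBuffer_CreateFromRaster:

VkBufferMemoryBarrier barriers[1];
VKBarrierBatch batch = {};
// First call records a transition from the buffer's previous state.
VKRenderer_AddBufferBarrier(barriers, &batch, buffer,
        VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_READ_BIT);
// A second call with identical flags is skipped: barrierCount stays 1.
VKRenderer_AddBufferBarrier(barriers, &batch, buffer,
        VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_READ_BIT);
if (batch.barrierCount > 0) {
    device->vkCmdPipelineBarrier(cb, batch.srcStages, batch.dstStages,
            0, 0, NULL, batch.barrierCount, barriers, 0, NULL);
}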
/**
* Get Color RGBA components in a format suitable for the current render pass.
*/
@@ -1178,6 +1224,12 @@ static void VKRenderer_SetupStencil(const VKRenderingContext* context) {
renderPass->state.shader = NO_SHADER;
}
void VKRenderer_DisposeOnCleanup(VKRenderer* renderer, VKCleanupHandler hnd, void* data) {
if (renderer == NULL) return;
VKCleanupEntry entry = {hnd, data};
POOL_RETURN(renderer, cleanupQueue, entry);
}
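Together with VKRenderer_CleanupPendingResources above, this completes the deferred-destruction loop: an entry waits in cleanupQueue until the pool reports that the submitted batch finished on the GPU, then its handler runs. A minimal handler following the VKCleanupHandler signature (MyBuffer_Dispose is hypothetical, mirroring VKBuffer_Dispose from this commit):

// Cast the opaque pointer back and destroy the resource only after
// the GPU is known to be done with it.
static void MyBuffer_Dispose(VKDevice* device, void* data) {
    VKBuffer_Destroy(device, (VKBuffer*) data);
}
// Queued instead of being destroyed inline at the blit call site:
VKRenderer_DisposeOnCleanup(device->renderer, MyBuffer_Dispose, buffer);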
/**
* Setup pipeline for drawing. Returns FALSE if surface is not yet ready for drawing.
*/


@@ -61,6 +61,8 @@ typedef struct {
VkPipelineStageFlags dstStages;
} VKBarrierBatch;
typedef void (*VKCleanupHandler)(VKDevice* device, void* data);
VKRenderer* VKRenderer_Create(VKDevice* device);
/**
@@ -80,6 +82,10 @@ VkCommandBuffer VKRenderer_Record(VKRenderer* renderer);
void VKRenderer_AddImageBarrier(VkImageMemoryBarrier* barriers, VKBarrierBatch* batch,
VKImage* image, VkPipelineStageFlags stage, VkAccessFlags access, VkImageLayout layout);
void VKRenderer_AddBufferBarrier(VkBufferMemoryBarrier* barriers, VKBarrierBatch* batch,
VKBuffer* buffer, VkPipelineStageFlags stage,
VkAccessFlags access);
void VKRenderer_CreateImageDescriptorSet(VKRenderer* renderer, VkDescriptorPool* descriptorPool, VkDescriptorSet* set);
void VKRenderer_Destroy(VKRenderer* renderer);
@@ -105,6 +111,11 @@ void VKRenderer_DestroyRenderPass(VKSDOps* surface);
*/
VkBool32 VKRenderer_FlushRenderPass(VKSDOps* surface);
/**
* Register a handler to be called at the cleanup phase of the renderer.
*/
void VKRenderer_DisposeOnCleanup(VKRenderer* renderer, VKCleanupHandler hnd, void* data);
/**
* Flush pending render pass and queue surface for presentation (if applicable).
*/