Greedy Meshing with Vertex Pulling is SLOWER to render?
I finally got down to implementing vertex pulling.
My current setup (pre-VP) uploads already greedy-meshed chunks to the GPU and stores vertex data as follows:
/// One vertex of a greedy-meshed quad, packed into four 16-bit unsigned fields.
struct PackedVoxelVertex {
    // Position local to the chunk. Only the low 9 bits of each axis are used,
    // which is enough to represent positions from 0 to 256.
    uint16_t x;
    uint16_t y;
    uint16_t z;
    // Only the low 8 bits are used (normal and block type); the rest is padding.
    uint16_t w;
};
I know the layout is far from ideal and I am wasting some memory here, but I thought it's temporary anyway as I would eventually switch to vertex pulling. The actual upload code in OpenGL looks like this:
// OpenGL object names used by the standard (non vertex-pulling) path.
// Zero until setupStandardGreedy() generates them.
GLuint VAO = 0;  // vertex array object
GLuint VBO = 0;  // vertex buffer
GLuint IBO = 0;  // index buffer
/**
 * Creates the VAO, vertex buffer and index buffer for the standard greedy-mesh
 * path and uploads the given mesh into them.
 *
 * @param verts   Packed vertices; each one is exposed to the shader as a single
 *                4-component unsigned-short integer attribute at location 0.
 * @param indices 32-bit triangle indices into `verts`.
 *
 * Leaves no vertex array bound on return.
 */
void setupStandardGreedy(const std::vector<PackedVoxelVertex>& verts, const std::vector<uint32_t>& indices) {
    const auto vertexBytes = verts.size() * sizeof(PackedVoxelVertex);
    const auto indexBytes = indices.size() * sizeof(uint32_t);

    glGenVertexArrays(1, &VAO);
    glGenBuffers(1, &VBO);
    glGenBuffers(1, &IBO);

    // Everything below is recorded into the VAO.
    glBindVertexArray(VAO);

    glBindBuffer(GL_ARRAY_BUFFER, VBO);
    glBufferData(GL_ARRAY_BUFFER, vertexBytes, verts.data(), GL_STATIC_DRAW);

    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, IBO);
    glBufferData(GL_ELEMENT_ARRAY_BUFFER, indexBytes, indices.data(), GL_STATIC_DRAW);

    // Integer attribute (the "I" variant): the shader reads raw uvec4 values,
    // not values converted to float.
    glVertexAttribIPointer(0, 4, GL_UNSIGNED_SHORT, sizeof(PackedVoxelVertex), nullptr);
    glEnableVertexAttribArray(0);

    glBindVertexArray(0);
}
/**
 * Draws the mesh uploaded by the standard path as indexed triangles.
 *
 * @param indexCount Number of 32-bit indices to draw, starting at offset 0 of
 *                   the index buffer attached to the VAO.
 */
void drawStandardGreedy(GLsizei indexCount) {
    glBindVertexArray(VAO);
    // The last argument is a byte offset into the bound index buffer.
    glDrawElements(GL_TRIANGLES, indexCount, GL_UNSIGNED_INT, nullptr);
    glBindVertexArray(0);
}
and vertex code reading the data just unpacks it:
#version 430 core
// Vertex shader for the standard path: every vertex arrives fully formed as a
// packed uvec4 attribute and only needs to be unpacked and transformed.

// xyz = chunk-local position, w = packed normal direction and block type.
layout (location = 0) in uvec4 aVertex;

layout (location = 0) out vec3 WorldPos;
layout (location = 1) flat out uint vNormalDir;
layout (location = 2) flat out uint vBlockType;

layout (std140, row_major) uniform SceneData {
    mat4 view;
    mat4 proj;
    vec4 lodColor;
    vec4 cameraPos;
    vec4 normalBlendParams;
    vec4 chunkOrigin;   // xyz = chunk origin, w = scale applied to local positions
};

void main() {
    // Bits [0,3) hold the normal direction, bits [3,8) the block type.
    uint packedAttribs = aVertex.w;

    vec3 worldPos = chunkOrigin.xyz + vec3(aVertex.xyz) * chunkOrigin.w;

    // Row-vector convention: the position is multiplied on the left.
    gl_Position = vec4(worldPos, 1.0) * view * proj;

    WorldPos = worldPos;
    vNormalDir = packedAttribs & 0x7u;
    vBlockType = (packedAttribs >> 3u) & 0x1Fu;
}
Now I read about vertex pulling and decided to try and adapt my code to it. Most examples online were about drawing a single side of the voxel, not a greedy meshed face, so I had to adapt. In the end, instead of sending 4 vertices × 8 bytes = 32 bytes per face, I started sending just 8 bytes:
/// One greedy-meshed face for the vertex-pulling path, packed into eight bytes.
struct PackedVPFace {
    // Face origin, local to the chunk (0-255 per axis).
    uint8_t posX;
    uint8_t posY;
    uint8_t posZ;

    // Face extent, stored minus one so that 0-255 covers sizes 1-256.
    uint8_t dimW;  // width - 1
    uint8_t dimH;  // height - 1

    uint8_t normalDir;  // normal direction (0-5)
    uint8_t blockType;
    uint8_t padding;    // unused
};
The process of uploading that data to the GPU is as follows:
// OpenGL object names used by the vertex-pulling path.
// Zero until setupGreedyVP() generates them.
GLuint dummyVAO = 0;   // VAO with no vertex attributes; bound only so draws are valid
GLuint SSBO = 0;       // shader storage buffer holding the packed faces
GLuint sharedIBO = 0;  // index buffer with six indices per face
void setupGreedyVP(const std::vector<uint64_t>& faces) {
glGenVertexArrays(1, &dummyVAO);
glGenBuffers(1, &SSBO);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO);
glBufferData(GL_SHADER_STORAGE_BUFFER, faces.size() * sizeof(uint64_t), faces.data(), GL_STATIC_DRAW);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
glGenBuffers(1, &sharedIBO);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, sharedIBO);
std::vector<uint32_t> vpIB;
vpIB.reserve(faces.size() * 6);
for (uint32_t f = 0; f < faces.size(); ++f) {
uint32_t v = f * 4;
vpIB.push_back(v); vpIB.push_back(v+2);
vpIB.push_back(v+1); vpIB.push_back(v+1);
vpIB.push_back(v+2); vpIB.push_back(v+3);
}
glBufferData(GL_ELEMENT_ARRAY_BUFFER, vpIB.size() * sizeof(uint32_t), vpIB.data(), GL_STATIC_DRAW);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
}
/**
 * Draws `faceCount` faces through the vertex-pulling path.
 *
 * @param faceCount Number of faces to draw; six indices are consumed per face.
 *
 * Binds the face buffer to shader-storage binding point 0, binds the dummy VAO
 * and the index buffer, issues one indexed draw, then unbinds the VAO.
 */
void drawGreedyVP(GLsizei faceCount) {
    const GLsizei indexCount = faceCount * 6;

    glBindVertexArray(dummyVAO);
    // Must come after the VAO bind: the element-array binding belongs to the VAO.
    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, sharedIBO);
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, SSBO);

    glDrawElements(GL_TRIANGLES, indexCount, GL_UNSIGNED_INT, nullptr);

    glBindVertexArray(0);
}
And the process of recreating vertices on the GPU is:
#version 430 core
// Vertex-pulling vertex shader: there are no vertex attributes. Each group of
// four consecutive gl_VertexID values is one face; the face record is fetched
// from the storage buffer and the corner position is rebuilt from it.

// One record per face, read as two 32-bit words:
//   x: posX | posY << 8 | posZ << 16 | (width  - 1) << 24
//   y: (height - 1) | normalDir << 8 | blockType << 16
layout (std430, binding = 0) buffer FaceBuffer { uvec2 faces[]; };

layout (location = 0) out vec3 WorldPos;
layout (location = 1) flat out uint vNormalDir;
layout (location = 2) flat out uint vBlockType;

layout (std140, row_major) uniform SceneData {
    mat4 view;
    mat4 proj;
    vec4 lodColor;
    vec4 cameraPos;
    vec4 normalBlendParams;
    vec4 chunkOrigin;   // xyz = chunk origin, w = scale applied to local positions
};

// Offset that pushes the quad onto the positive side of the voxel for the
// +Y / +X / +Z directions. Indexed by normalDir.
const vec3 BASE_OFFSETS[6] = vec3[](
    vec3(0,1,0), vec3(0,0,0), vec3(1,0,0),
    vec3(0,0,0), vec3(0,0,1), vec3(0,0,0)
);

// Axis along which the face width (U) and height (V) are applied for each
// corner, or zero if that corner sits at the start of the axis.
// Both tables are indexed by i = normalDir*4 + corner.
//
// With the index pattern (v, v+2, v+1), (v+1, v+2, v+3), every row is ordered
// so that the geometric normal of the quad points in the labelled direction.
// Row 1 (-Y) is therefore mirrored in U relative to row 0 (+Y); with identical
// rows the -Y quads would be wound as +Y.
const vec3 U_SCALES[24] = vec3[](
    vec3(0,0,0), vec3(0,0,0), vec3(1,0,0), vec3(1,0,0), // 0 (+Y)
    vec3(1,0,0), vec3(1,0,0), vec3(0,0,0), vec3(0,0,0), // 1 (-Y)
    vec3(0,0,1), vec3(0,0,1), vec3(0,0,0), vec3(0,0,0), // 2 (+X)
    vec3(0,0,0), vec3(0,0,0), vec3(0,0,1), vec3(0,0,1), // 3 (-X)
    vec3(0,0,0), vec3(1,0,0), vec3(0,0,0), vec3(1,0,0), // 4 (+Z)
    vec3(1,0,0), vec3(0,0,0), vec3(1,0,0), vec3(0,0,0)  // 5 (-Z)
);
const vec3 V_SCALES[24] = vec3[](
    vec3(0,0,1), vec3(0,0,0), vec3(0,0,1), vec3(0,0,0), // 0 (+Y)
    vec3(0,0,1), vec3(0,0,0), vec3(0,0,1), vec3(0,0,0), // 1 (-Y)
    vec3(0,0,0), vec3(0,1,0), vec3(0,0,0), vec3(0,1,0), // 2 (+X)
    vec3(0,0,0), vec3(0,1,0), vec3(0,0,0), vec3(0,1,0), // 3 (-X)
    vec3(0,1,0), vec3(0,1,0), vec3(0,0,0), vec3(0,0,0), // 4 (+Z)
    vec3(0,1,0), vec3(0,1,0), vec3(0,0,0), vec3(0,0,0)  // 5 (-Z)
);

void main() {
    uint vertexId = uint(gl_VertexID);
    uint faceIndex = vertexId >> 2u;   // four virtual vertices per face
    uint corner = vertexId & 3u;

    uvec2 face = faces[faceIndex];

    // Decode the four bytes of the first word with integer shifts and masks.
    // Going through unpackUnorm4x8(...) * 255.0 divides by 255 and multiplies
    // back in floating point, which is not guaranteed to yield exact integers.
    uvec4 lowBytes = (uvec4(face.x) >> uvec4(0u, 8u, 16u, 24u)) & 0xFFu;
    vec3 localPos = vec3(lowBytes.xyz);
    float W = float(lowBytes.w) + 1.0;            // stored as width - 1
    float H = float(face.y & 0xFFu) + 1.0;        // stored as height - 1

    // NOTE(review): the 3-bit mask admits 6 and 7, which would index past the
    // end of the tables; this relies on the mesher only emitting 0-5.
    uint normalDir = (face.y >> 8u) & 0x7u;
    // NOTE(review): only the low 5 bits of the block-type byte are kept.
    uint blockType = (face.y >> 16u) & 0x1Fu;

    int lutIndex = int(normalDir * 4u + corner);
    vec3 cornerOffset = BASE_OFFSETS[normalDir]
                      + U_SCALES[lutIndex] * W
                      + V_SCALES[lutIndex] * H;
    vec3 worldPos = chunkOrigin.xyz + (localPos + cornerOffset) * chunkOrigin.w;

    // Row-vector convention: the position is multiplied on the left.
    gl_Position = vec4(worldPos, 1.0) * view * proj;

    WorldPos = worldPos;
    vNormalDir = normalDir;
    vBlockType = blockType;
}
I went through several iterations of the shader code, initially involving a shitton of branching, and eventually arrived at this layout, which abuses unpackUnorm and vector multiplication. Given that the fragment shader is identical, this is as good as I could get (I could get rid of the base offset LUT by sending 9 bits per position axis, but that would need extra ALU work, so I am not sure it would make any meaningful difference).
I benchmarked both methods switched in runtime to see the FPS on three devices, high end, middle end and lower end. Same scene, same resolution, same everything, just different code executing to send the vertices to the GPU and read them there. My engine is GPU bound so any changes in FPS are equivalent to changes in GPU times. Results are as follows:
- On my high end machine (4090) vertex pulling gave about 5-7% improvement in raw FPS, giving me 1690 FPS instead of 1610. Not that I needed it, but just to note that the algorithm did work on some hardware.
- On a 3060, the difference was within noise (1-2%), it was not obvious whether vertex pulling was winning or not.
- On an integrated GPU (i3-10110U's UHD Graphics) vertex pulling resulted in ~15% REDUCTION in raw FPS compared to just sending vertices directly.
I always hear vertex pulling mentioned as an optimization, and it makes sense on paper - I am sending 1/4th of the data per face, and even with 2.7x more instructions, I should be saving in total, but as measurements show, this is clearly not the case.
Can someone explain to me what might be going on here, and whether I can do something to make VP perform better on lower-end hardware?