u/SurDno

Greedy Meshing with Vertex Pulling is SLOWER to render?

I finally got down to implementing vertex pulling.

My current setup (pre-VP) uploads already greedy-meshed chunks to the GPU and stores vertex data as follows:

    // One vertex of a greedy-meshed chunk (pre-vertex-pulling path).
    // Uploaded verbatim and read by the vertex shader as a single uvec4 attribute
    // (4 x GL_UNSIGNED_SHORT), so field order and size are part of the GPU contract.
    struct PackedVoxelVertex {
        // Local position within the chunk; only the low 9 bits of each are used (0..256).
        uint16_t x;
        uint16_t y;
        uint16_t z;
        // Low 8 bits only: the shader reads the normal direction from bits 0-2 and the
        // block type from bits 3-7. The high 8 bits are padding.
        uint16_t w;
    };
    // The upload code passes sizeof(PackedVoxelVertex) as the attribute stride; fail the build
    // instead of silently mis-striding if a field is ever added or padding appears.
    static_assert(sizeof(PackedVoxelVertex) == 8, "PackedVoxelVertex must stay exactly 4 x uint16_t");

I know the layout is far from ideal and I am wasting some memory here, but I thought it's temporary anyway as I would eventually switch to vertex pulling. The actual upload code in OpenGL looks like that:

    // GL object names for the standard (non-vertex-pulling) path; filled in by setupStandardGreedy.
    GLuint VAO = 0;  // vertex array object holding the attribute layout and index-buffer binding
    GLuint VBO = 0;  // vertex buffer of PackedVoxelVertex
    GLuint IBO = 0;  // index buffer of uint32_t
    // Creates VAO/VBO/IBO and uploads an already greedy-meshed chunk.
    //   verts   - packed vertices, exposed to the shader as integer attribute 0 (4 x unsigned short)
    //   indices - 32-bit triangle-list indices into verts
    // Leaves no vertex array bound on return.
    void setupStandardGreedy(const std::vector<PackedVoxelVertex>& verts, const std::vector<uint32_t>& indices) {
        const auto vertexBytes = verts.size() * sizeof(PackedVoxelVertex);
        const auto indexBytes  = indices.size() * sizeof(uint32_t);

        glGenVertexArrays(1, &VAO);
        glGenBuffers(1, &VBO);
        glGenBuffers(1, &IBO);

        glBindVertexArray(VAO);

        // Vertex data, plus the single integer attribute that reads it. The "I" variant keeps the
        // values as integers instead of converting them to float.
        glBindBuffer(GL_ARRAY_BUFFER, VBO);
        glBufferData(GL_ARRAY_BUFFER, vertexBytes, verts.data(), GL_STATIC_DRAW);
        glVertexAttribIPointer(0, 4, GL_UNSIGNED_SHORT, sizeof(PackedVoxelVertex), nullptr);
        glEnableVertexAttribArray(0);

        // Index data; bound while the VAO is bound.
        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, IBO);
        glBufferData(GL_ELEMENT_ARRAY_BUFFER, indexBytes, indices.data(), GL_STATIC_DRAW);

        glBindVertexArray(0);
    }

    // Issues one indexed triangle-list draw for the chunk uploaded by setupStandardGreedy.
    //   indexCount - number of 32-bit indices to draw, starting at the beginning of the index buffer
    void drawStandardGreedy(GLsizei indexCount) {
        glBindVertexArray(VAO);
        glDrawElements(GL_TRIANGLES, indexCount, GL_UNSIGNED_INT, nullptr);  // offset 0 into the bound IBO
        glBindVertexArray(0);
    }

and vertex code reading the data just unpacks it:

#version 430 core

// Packed vertex from the VAO: xyz = chunk-local position, w = normal direction and block type bits.
layout (location = 0) in uvec4 aVertex;

layout (location = 0) out vec3 WorldPos;
layout (location = 1) flat out uint vNormalDir;
layout (location = 2) flat out uint vBlockType;

layout (std140, row_major) uniform SceneData {
    mat4 view;
    mat4 proj;
    vec4 lodColor;
    vec4 cameraPos;
    vec4 normalBlendParams;
    vec4 chunkOrigin; // xyz = chunk origin, w = scale applied to chunk-local positions
};

void main() {
    // Chunk-local integer position -> scaled and offset position.
    vec3 chunkLocal = vec3(aVertex.xyz);
    vec3 positionWS = chunkOrigin.xyz + chunkLocal * chunkOrigin.w;

    // Bits 0-2 of w hold the normal direction, bits 3-7 the block type.
    uint packedAttribs = aVertex.w;

    // Row-vector convention: position * view, then * proj.
    vec4 positionVS = vec4(positionWS, 1.0) * view;
    gl_Position = positionVS * proj;

    WorldPos   = positionWS;
    vNormalDir = packedAttribs & 0x7u;
    vBlockType = (packedAttribs >> 3u) & 0x1Fu;
}

Now I read about vertex pulling and decided to try and adapt my code to it. Most examples online were about drawing a single side of the voxel, not a greedy meshed face, so I had to adapt. In the end, instead of sending 4 vertices × 8 bytes = 32 bytes of vertex data per face, I started sending just 8 bytes per face:

// CPU-side layout of one greedy-meshed face for the vertex-pulling path: 8 bytes per face.
// The vertex shader reads each face as two 32-bit words and decodes them as
//   word 0 = posX | posY << 8 | posZ << 16 | dimW << 24
//   word 1 = dimH | normalDir << 8 | blockType << 16
// which matches this field order on a little-endian host.
struct PackedVPFace {
    uint8_t posX;       // local X (0-255)
    uint8_t posY;       // local Y (0-255)
    uint8_t posZ;       // local Z (0-255)
    uint8_t dimW;       // width-1 (0-255)
    uint8_t dimH;       // height-1 (0-255)
    uint8_t normalDir;  // normal dir (0-5); the shader reads the low 3 bits
    uint8_t blockType;  // the shader reads the low 5 bits
    uint8_t padding;    // unused; keeps the face at 8 bytes
};
// Faces are uploaded as raw 64-bit values, so the struct must stay exactly 8 bytes.
static_assert(sizeof(PackedVPFace) == 8, "PackedVPFace must stay exactly 8 bytes");

The process of uploading that data to the GPU is as follows:

// GL object names for the vertex-pulling path; filled in by setupGreedyVP.
GLuint dummyVAO  = 0;  // attribute-less VAO bound for the draw call
GLuint SSBO      = 0;  // shader storage buffer holding one 64-bit packed face per element
GLuint sharedIBO = 0;  // index buffer with 6 indices per face
void setupGreedyVP(const std::vector<uint64_t>& faces) {
    glGenVertexArrays(1, &dummyVAO);

    glGenBuffers(1, &SSBO);
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO);
    glBufferData(GL_SHADER_STORAGE_BUFFER, faces.size() * sizeof(uint64_t), faces.data(), GL_STATIC_DRAW);
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);

    glGenBuffers(1, &sharedIBO);
    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, sharedIBO);
    
    std::vector<uint32_t> vpIB;
    vpIB.reserve(faces.size() * 6);
    for (uint32_t f = 0; f < faces.size(); ++f) {
        uint32_t v = f * 4;
        vpIB.push_back(v);   vpIB.push_back(v+2);
        vpIB.push_back(v+1); vpIB.push_back(v+1);
        vpIB.push_back(v+2); vpIB.push_back(v+3);
    }
    glBufferData(GL_ELEMENT_ARRAY_BUFFER, vpIB.size() * sizeof(uint32_t), vpIB.data(), GL_STATIC_DRAW);
    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
}

// Draws `faceCount` greedy-meshed faces via vertex pulling: no vertex attributes are read; the
// vertex shader fetches face data from the SSBO at binding point 0.
void drawGreedyVP(GLsizei faceCount) {
    const GLsizei indexCount = faceCount * 6;  // two triangles per face

    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, SSBO);

    glBindVertexArray(dummyVAO);
    // Re-attach the index buffer on every draw so this does not depend on what setup left bound.
    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, sharedIBO);

    glDrawElements(GL_TRIANGLES, indexCount, GL_UNSIGNED_INT, nullptr);
    glBindVertexArray(0);
}

And the process of recreating vertices on the GPU is:

#version 430 core

// One packed greedy-meshed face per element, two 32-bit words each:
//   faces[i].x = posX | posY << 8 | posZ << 16 | (width - 1) << 24
//   faces[i].y = (height - 1) | normalDir << 8 | blockType << 16
layout (std430, binding = 0) buffer FaceBuffer { uvec2 faces[]; };

layout (location = 0) out vec3 WorldPos;
layout (location = 1) flat out uint vNormalDir;
layout (location = 2) flat out uint vBlockType;

layout (std140, row_major) uniform SceneData {
    mat4 view;
    mat4 proj;
    vec4 lodColor;
    vec4 cameraPos;
    vec4 normalBlendParams;
    vec4 chunkOrigin; // xyz = chunk origin, w = scale applied to chunk-local positions
};

// Indexed by normalDir: pushes the quad onto the far side of the voxel for the +Y/+X/+Z directions.
const vec3 BASE_OFFSETS[6] = vec3[](
    vec3(0,1,0), vec3(0,0,0), vec3(1,0,0),
    vec3(0,0,0), vec3(0,0,1), vec3(0,0,0)
);

// U_SCALES / V_SCALES are indexed by i = normalDir*4 + corner. They select which axis the face
// width (U) and height (V) extend along for each of the 4 corners.
// With the index pattern (v, v+2, v+1), (v+1, v+2, v+3), each row must wind so that the
// geometric normal matches normalDir.
const vec3 U_SCALES[24] = vec3[](
    vec3(0,0,0), vec3(0,0,0), vec3(1,0,0), vec3(1,0,0), // 0 (+Y)
    vec3(0,0,0), vec3(0,0,0), vec3(1,0,0), vec3(1,0,0), // 1 (-Y)
    vec3(0,0,1), vec3(0,0,1), vec3(0,0,0), vec3(0,0,0), // 2 (+X)
    vec3(0,0,0), vec3(0,0,0), vec3(0,0,1), vec3(0,0,1), // 3 (-X)
    vec3(0,0,0), vec3(1,0,0), vec3(0,0,0), vec3(1,0,0), // 4 (+Z)
    vec3(1,0,0), vec3(0,0,0), vec3(1,0,0), vec3(0,0,0)  // 5 (-Z)
);
const vec3 V_SCALES[24] = vec3[](
    vec3(0,0,1), vec3(0,0,0), vec3(0,0,1), vec3(0,0,0), // 0 (+Y)
    // -Y is the mirror of +Y, like -X/+X and -Z/+Z. The row used to be a copy of +Y, which wound
    // bottom faces towards +Y. NOTE(review): only visible with back-face culling enabled - confirm.
    vec3(0,0,0), vec3(0,0,1), vec3(0,0,0), vec3(0,0,1), // 1 (-Y)
    vec3(0,0,0), vec3(0,1,0), vec3(0,0,0), vec3(0,1,0), // 2 (+X)
    vec3(0,0,0), vec3(0,1,0), vec3(0,0,0), vec3(0,1,0), // 3 (-X)
    vec3(0,1,0), vec3(0,1,0), vec3(0,0,0), vec3(0,0,0), // 4 (+Z)
    vec3(0,1,0), vec3(0,1,0), vec3(0,0,0), vec3(0,0,0)  // 5 (-Z)
);

void main() {
    // Four consecutive vertex IDs belong to one face.
    uint vertexId  = uint(gl_VertexID);
    uint faceIndex = vertexId >> 2u;
    uint corner    = vertexId & 3u;

    uvec2 face = faces[faceIndex];

    // Decode with integer shifts/masks so positions and sizes are exact integers.
    // unpackUnorm4x8(x) * 255.0 goes through a float divide and multiply and is not guaranteed
    // to round-trip exactly, which can open hairline cracks between neighbouring faces.
    uvec4 p0      = (uvec4(face.x) >> uvec4(0u, 8u, 16u, 24u)) & 0xFFu;
    vec3 localPos = vec3(p0.xyz);
    float W       = float(p0.w + 1u);               // stored as width - 1
    float H       = float((face.y & 0xFFu) + 1u);   // stored as height - 1
    uint normalDir = (face.y >> 8u) & 0x7u;
    uint blockType = (face.y >> 16u) & 0x1Fu;

    int lutIndex  = (int(normalDir) << 2) + int(corner);
    vec3 worldPos = chunkOrigin.xyz + (localPos + BASE_OFFSETS[normalDir] + U_SCALES[lutIndex] * W + V_SCALES[lutIndex] * H) * chunkOrigin.w;

    gl_Position = vec4(worldPos, 1.0) * view * proj;
    WorldPos   = worldPos;
    vNormalDir = normalDir;
    vBlockType = blockType;
}

I went through several iterations with the shader code, initially involving a shitton of branching and eventually coming to this layout to abuse unpackUnorm and vector multiplication. Given fragment shader is identical, this is as good as I got (I could get rid of base offset LUT to send 9 bits per location axis but I would need to do extra ALU so I am not sure if it would make any meaningful difference).

I benchmarked both methods switched in runtime to see the FPS on three devices, high end, middle end and lower end. Same scene, same resolution, same everything, just different code executing to send the vertices to the GPU and read them there. My engine is GPU bound so any changes in FPS are equivalent to changes in GPU times. Results are as follows:

  • On my high end machine (4090) vertex pulling gave about 5-7% improvement in raw FPS, giving me 1690 FPS instead of 1610. Not that I needed it, but just to note that the algorithm did work on some hardware.
  • On a 3060, the difference was within noise (1-2%), it was not obvious whether vertex pulling was winning or not.
  • On an integrated GPU (i3-10110U's UHD Graphics) vertex pulling resulted in ~15% REDUCTION in raw FPS compared to just sending vertices directly.

I always hear vertex pulling mentioned as an optimization, and it makes sense on paper - I am sending 1/4th of the data per face, and even with 2.7x more instructions, I should be saving in total, but as measurements show, this is clearly not the case.

Can someone explain to me what might be going on here, and whether I can do something about it to make VP actually perform better on lower-end hardware?

reddit.com
u/SurDno — 1 day ago

How to balance shader work vs bandwidth?

I am working on a small scale voxel engine and currently just trying to push rendering distance to its absolute limits.

One of the optimisations I hear often is reducing the amount of data sent to the GPU. So I reduced my vertex buffer 7x to 4 bytes (32 bits) by storing local chunk coordinates instead of float global coord, packing normal vector into first 3 bits of a byte (as it can only ever have 6 values) and using the rest for block type.

But the work I had to do in a shader to decode those values ended up resulting in (slightly but still) worse performance than when sending all the data raw, at least on my high end GPU.

Is there a rule of thumb somewhere about how much to send vs what to delegate to a shader? Is less bandwidth always better or does it only start to become an issue once you reach certain amount of data sent? Is this balance any different on lower end GPUs, and I will feel the optimisation if I benchmark on a different machine?

Sorry if the question is stupid, I’m just a beginner.

reddit.com
u/SurDno — 3 days ago

How to balance shader work vs bandwidth?

I am working on a small scale voxel engine and currently just trying to push rendering distance to its absolute limits.

One of the optimisations I hear often is reducing the amount of data sent to the GPU. So I reduced my vertex buffer 7x to 4 bytes (32 bits) by storing local chunk coordinates instead of float global coord, packing normal vector into first 3 bits of a byte (as it can only ever have 6 values) and using the rest for block type.

But the work I had to do in a shader to decode those values ended up resulting in (slightly but still) worse performance than when sending all the data raw, at least on my high end GPU.

Is there a rule of thumb somewhere about how much to send vs what to delegate to a shader? Is less bandwidth always better or does it only start to become an issue once you reach certain amount of data sent? Is this balance any different on lower end GPUs, and I will feel the optimisation if I benchmark on a different machine?

Sorry if the question is stupid, I’m just a beginner.

reddit.com
u/SurDno — 3 days ago

How to combat extreme Moire pattern when generating terrain with extremely small voxels?

This is on a 4k screen. MSAA helps a bit, using LOD chunks with larger voxels helps further but if I decrease the LOD distance to the point where Moire disappears, the pop-in of LODS becomes obvious. Any other solutions I am not thinking of?

u/SurDno — 4 days ago
▲ 51 r/DotA2

Texture filtering got broken in Spoon Man

In an absolutely disheartening turn of events, a recent update seems to have enabled anisotropic filtering instead of point (nearest neighbor) for the pixel art sprites used in Spoon Man minigame. Some textures are unaffected (the background looks fine!) but character sprites, items, VFX are all blurry.

This makes the minigame literally unplayable, it looks like shit. I am only bringing this up in hopes that someone from Valve sees that post, because this *has* to be addressed as soon as possible. An issue like that cannot stay in DotA.

u/SurDno — 7 days ago
▲ 392 r/DotA2

My first green wall after 5 years of Dota! So proud.

u/SurDno — 8 days ago
▲ 55 r/DotA2

At this point I get a lot more excited for a bunch of QoL changes over number changes for a couple heroes. Was really looking forward to Spring Cleaning 2026, but apparently it ain't coming.

Some changes I would personally appreciate:

  • Getting rid of 2 pings in a short time frame limit when you are pinging different things in quick succession (e.g. bkb ready, refresher ready, ult ready). Or better yet, not having any ping limit at all for 12k behaviour score.
  • Replacing "I will stack" message with "This camp is empty" when pinging an empty camp, so that you can communicate an area has been farmed or the camp is blocked.
  • Adding an extra "potential enemy vision here" ping, because neither "We need vision" nor "Enemy has vision" communicate that you want a sentry but aren't 100% sure.
  • "On my way" ping minimap pathfinding taking the twin gates into account, so if you're standing next to a gate and pinging the opposite lane, it does not show that the shortest path is you going through the entire map.
  • "On my way" pings still working on stuff like Wisdom Shrines instead of just displaying when it activates without the info that you are going there.
  • "Wisdom Shrine is active" ping to differentiate between having it in vision (and thus knowing it's active) and not having it in vision (and thus "might be active").
  • Rune pings when not seeing them directly to work on pinging the actual rune spot, not just the minimap.
  • "Enemy Tower has X% HP" saying which tower specifically is getting pinged (e.g. "Bottom Tier 2 Tower").
reddit.com
u/SurDno — 23 days ago