OpenGL中的延迟平铺着色(Tiled Deferred Shading):图块视锥体平面的计算


11

我正在尝试使用计算着色器在OpenGL中实现延迟平铺着色,但是在为每个图块(tile)构建视锥体时遇到了障碍。我以AMD的Forward+演示(用D3D编写)作为参考,但灯光似乎在不应该被剔除的时候被剔除了。

更新

阅读下面的更新。

这是我的(完整)计算着色器:

    #version 430 core

#define MAX_LIGHTS 1024
#define MAX_LIGHTS_PER_TILE 40

#define WORK_GROUP_SIZE 16

// One point light as stored in the pointLights[] shader storage buffer.
struct PointLight
{
    // Light position; main() multiplies it by `view` before culling and
    // CalculateLighting() subtracts a world-space position from it.
    vec3 position;
    // Cut-off distance: CalculateLighting() returns zero beyond it, and the
    // culling tests use it as the sphere radius.
    float radius;
    // RGB color, scaled by the diffuse factor and by `intensity`.
    vec3 color;
    // Scalar multiplier applied to the light's final contribution.
    float intensity;
};

layout (binding = 0, rgba32f) uniform writeonly image2D outTexture;
layout (binding = 1, rgba32f) uniform readonly image2D normalDepth;
layout (binding = 2, rgba32f) uniform readonly image2D diffuse;
layout (binding = 3, rgba32f) uniform readonly image2D specular;
layout (binding = 4, rgba32f) uniform readonly image2D glowMatID;

layout (std430, binding = 5) buffer BufferObject
{
    PointLight pointLights[];
};

uniform mat4 view;
uniform mat4 proj;
uniform mat4 viewProj;
uniform mat4 invViewProj;
uniform mat4 invProj;
uniform vec2 framebufferDim;

layout (local_size_x = WORK_GROUP_SIZE, local_size_y = WORK_GROUP_SIZE) in;

// Scratch state shared by all invocations of one work group (one screen tile).
// NOTE(review): the GLSL specification does not permit initializers on `shared`
// variables and leaves their initial contents undefined; these values probably
// need to be written by one invocation, followed by a barrier, before use —
// confirm against the target compiler.
// Smallest quantized depth seen in the tile (updated with atomicMin in main).
shared uint minDepth = 0xFFFFFFFF;
// Largest quantized depth seen in the tile (updated with atomicMax in main).
shared uint maxDepth = 0;
// Indices into pointLights[] of the lights that survived culling for this tile.
shared uint pointLightIndex[MAX_LIGHTS];
// Number of valid entries in pointLightIndex (incremented with atomicAdd).
shared uint pointLightCount = 0;

// Rebuilds a position from a stored depth value and a screen UV.
// The UV is remapped from [0,1] to [-1,1], combined with z unchanged, and
// un-projected through invViewProj; the result is divided by w.
vec3 ReconstructWP(float z, vec2 uv_f)
{
    vec2 ndcXY = uv_f * 2.0 - 1.0;
    vec4 unprojected = invViewProj * vec4(ndcXY, z, 1.0);

    // Perspective divide.
    return unprojected.xyz / unprojected.w;
}

// Transforms a projection-space point through invProj and divides by w,
// so the returned vector has w == 1.
vec4 ConvertProjToView( vec4 p )
{
    vec4 viewSpace = invProj * p;
    return viewSpace / viewSpace.w;
}

// Horizontal tile count: the hard-coded width of 1280 pixels divided by
// WORK_GROUP_SIZE, rounded up to a whole number of tiles.
uint GetNumTilesX()
{
    const int paddedWidth = 1280 + WORK_GROUP_SIZE - 1;
    return uint(paddedWidth / WORK_GROUP_SIZE);
}

// Vertical tile count: the hard-coded height of 720 pixels divided by
// WORK_GROUP_SIZE, rounded up to a whole number of tiles.
uint GetNumTilesY()
{
    const int paddedHeight = 720 + WORK_GROUP_SIZE - 1;
    return uint(paddedHeight / WORK_GROUP_SIZE);
}


// Builds the plane that passes through the origin and the points b and c.
// Returns (unit normal, d); d is always 0 because the plane contains the origin.
vec4 CreatePlaneEquation( vec4 b, vec4 c )
{
    // The third point "a" is the origin, so the edge vectors (b - a) and
    // (c - a) reduce to b and c themselves.
    vec3 unitNormal = normalize(cross( b.xyz, c.xyz ));

    // d = -(n . a) vanishes when a is the origin.
    return vec4(unitNormal, 0.0);
}

// Signed distance from p to a plane through the origin.
// Only eqn.xyz is used: eqn.w is ignored because CreatePlaneEquation always
// sets it to zero.
float GetSignedDistanceFromPlane( vec4 p, vec4 eqn )
{
    vec3 planeNormal = eqn.xyz;
    vec3 samplePos = p.xyz;
    return dot( planeNormal, samplePos );
}

// Diffuse contribution of point light p at surface position wPos with normal
// wNormal. Attenuation falls off linearly from 1 at the light to 0 at
// p.radius; positions beyond the radius receive nothing. The returned alpha
// is always 0. wSpec and wGlow are accepted but not used.
vec4 CalculateLighting( PointLight p, vec3 wPos, vec3 wNormal, vec4 wSpec, vec4 wGlow)
{
    vec3 toLight = p.position - wPos;
    float dist = length(toLight);

    // Outside the light's range: no contribution.
    if (dist > p.radius)
        return vec4(0.0f);

    float falloff = 1.0f - dist / (p.radius);
    float nDotL = max(0.0f, dot(normalize(toLight), wNormal));
    float diffuseFactor = nDotL * falloff;
    return vec4(p.color.xyz, 0.0f) * diffuseFactor * p.intensity;
}


// Compute-shader entry point. Each work group handles one screen tile and each
// invocation handles one pixel (pixelPos comes from gl_GlobalInvocationID).
// Stages:
//   1. reduce the tile's min/max depth through shared atomics,
//   2. build the four side planes of the tile's view-space frustum,
//   3. cull the point lights against the tile and collect the survivors,
//   4. shade the pixel with the surviving lights and write it to outTexture.
void main()
{
        ivec2 pixelPos = ivec2(gl_GlobalInvocationID.xy);
        // Only referenced by the commented-out debug output at the end of main.
        vec2 tilePos = vec2(gl_WorkGroupID.xy * gl_WorkGroupSize.xy) / vec2(1280, 720);

        // xyz holds the normal, w holds the depth value used below.
        vec4 normalColor = imageLoad(normalDepth, pixelPos);

        float d = normalColor.w;

        // Quantize the depth to an unsigned integer so atomicMin/atomicMax can be used.
        uint depth = uint(d * 0xFFFFFFFF);

        atomicMin(minDepth, depth);
        atomicMax(maxDepth, depth);

        // Wait until every invocation of the tile has contributed its depth.
        barrier();

        // Convert the reduced integer depths back to the scale of `d`.
        float minDepthZ = float(minDepth / float(0xFFFFFFFF));
        float maxDepthZ = float(maxDepth / float(0xFFFFFFFF));

        vec4 frustumEqn[4];
        // Pixel coordinates of the tile's min (m) and max (p) edges in x and y.
        uint pxm = WORK_GROUP_SIZE * gl_WorkGroupID.x;
        uint pym = WORK_GROUP_SIZE * gl_WorkGroupID.y;
        uint pxp = WORK_GROUP_SIZE * (gl_WorkGroupID.x + 1);
        uint pyp = WORK_GROUP_SIZE * (gl_WorkGroupID.y + 1);

        // Screen size rounded to a whole number of tiles.
        uint uWindowWidthEvenlyDivisibleByTileRes = WORK_GROUP_SIZE * GetNumTilesX();
        uint uWindowHeightEvenlyDivisibleByTileRes = WORK_GROUP_SIZE * GetNumTilesY();

        // The four tile corners at projection-space z = 1, taken to view space.
        // X maps pixel -> [-1,1] directly; Y is flipped (height - py) first.
        // NOTE(review): the Y flip follows the D3D reference this was ported
        // from; confirm it matches the OpenGL image orientation used here.
        vec4 frustum[4];
        frustum[0] = ConvertProjToView( vec4( pxm / float(uWindowWidthEvenlyDivisibleByTileRes) * 2.0f - 1.0f, (uWindowHeightEvenlyDivisibleByTileRes - pym) / float(uWindowHeightEvenlyDivisibleByTileRes) * 2.0f - 1.0f, 1.0f, 1.0f) );
        frustum[1] = ConvertProjToView( vec4( pxp / float(uWindowWidthEvenlyDivisibleByTileRes) * 2.0f - 1.0f, (uWindowHeightEvenlyDivisibleByTileRes - pym) / float(uWindowHeightEvenlyDivisibleByTileRes) * 2.0f - 1.0f, 1.0f, 1.0f) );
        frustum[2] = ConvertProjToView( vec4( pxp / float(uWindowWidthEvenlyDivisibleByTileRes) * 2.0f - 1.0f, (uWindowHeightEvenlyDivisibleByTileRes - pyp) / float(uWindowHeightEvenlyDivisibleByTileRes) * 2.0f - 1.0f, 1.0f ,1.0f) );
        frustum[3] = ConvertProjToView( vec4( pxm / float(uWindowWidthEvenlyDivisibleByTileRes) * 2.0f - 1.0f, (uWindowHeightEvenlyDivisibleByTileRes - pyp) / float(uWindowHeightEvenlyDivisibleByTileRes) * 2.0f - 1.0f, 1.0f, 1.0f) );

        // One side plane per pair of adjacent corners (plus the view-space origin).
        for (int i = 0; i < 4; i++)
            frustumEqn[i] = CreatePlaneEquation(frustum[i], frustum[(i+1) & 3]);

        barrier();

        int threadsPerTile = WORK_GROUP_SIZE * WORK_GROUP_SIZE;

        // Each invocation tests lights il = localIndex, localIndex + threadsPerTile, ...
        // All MAX_LIGHTS slots are read, regardless of how many lights the buffer holds.
        for (uint i = 0; i < MAX_LIGHTS; i+= threadsPerTile)
        {
            uint il = gl_LocalInvocationIndex + i;

            if (il < MAX_LIGHTS)
            {
                PointLight p = pointLights[il];

                vec4 viewPos = view * vec4(p.position, 1.0f);
                float r = p.radius;

                // Depth-range test.
                // NOTE(review): minDepthZ/maxDepthZ come from normalDepth.w while
                // viewPos.z is a view-space coordinate — confirm both sides of
                // this comparison are in the same units.
                if (viewPos.z + minDepthZ < r && viewPos.z - maxDepthZ < r)
                {

                // Sphere-vs-plane test against the four side planes.
                if( ( GetSignedDistanceFromPlane( viewPos, frustumEqn[0] ) < r ) &&
                    ( GetSignedDistanceFromPlane( viewPos, frustumEqn[1] ) < r ) &&
                    ( GetSignedDistanceFromPlane( viewPos, frustumEqn[2] ) < r ) &&
                    ( GetSignedDistanceFromPlane( viewPos, frustumEqn[3] ) < r) )

                    {
                        // Reserve a slot in the shared list and record the light.
                        uint id = atomicAdd(pointLightCount, 1);
                        pointLightIndex[id] = il;
                    }
                }

            }
        }

        // The light list must be complete before any invocation starts shading.
        barrier();

        vec4 diffuseColor = imageLoad(diffuse, pixelPos);
        vec4 specularColor = imageLoad(specular, pixelPos);
        vec4 glowColor = imageLoad(glowMatID, pixelPos);

        // Screen UV, using the hard-coded 1280x720 resolution.
        vec2 uv = vec2(pixelPos.x / 1280.0f, pixelPos.y / 720.0f);

        vec3 wp = ReconstructWP(d, uv);
        vec4 color = vec4(0.0f, 0.0f, 0.0f, 1.0f);

        // Accumulate the contribution of every light that survived culling.
        for (int i = 0; i < pointLightCount; i++)
        {
            color += CalculateLighting( pointLights[pointLightIndex[i]], wp, normalColor.xyz, specularColor, glowColor);
        }

        barrier();

        // Debug grid: paint tile edges gray. With a local size of WORK_GROUP_SIZE
        // (16), gl_LocalInvocationID components range over 0..15, so the `== 16`
        // comparisons never match; only the x == 0 / y == 0 edges are painted.
        if (gl_LocalInvocationID.x == 0 || gl_LocalInvocationID.y == 0 || gl_LocalInvocationID.x == 16 || gl_LocalInvocationID.y == 16)
            imageStore(outTexture, pixelPos, vec4(.2f, .2f, .2f, 1.0f));
        else
        {
            imageStore(outTexture, pixelPos, color);
            //imageStore(outTexture, pixelPos, vec4(maxDepthZ));
            //imageStore(outTexture, pixelPos, vec4(pointLightCount / 128.0f));
            //imageStore(outTexture, pixelPos, vec4(vec2(tilePos.xy), 0.0f, 1.0f));
        }
}

这是我认为是问题的部分,剔除部分:

        // Excerpt from main(): tile frustum construction and light culling.
        // Wait until every invocation has contributed to minDepth/maxDepth.
        barrier();

    // Convert the reduced integer depths back to floating point.
    float minDepthZ = float(minDepth / float(0xFFFFFFFF));
    float maxDepthZ = float(maxDepth / float(0xFFFFFFFF));

    vec4 frustumEqn[4];
    // Pixel coordinates of the tile's min (m) and max (p) edges in x and y.
    uint pxm = WORK_GROUP_SIZE * gl_WorkGroupID.x;
    uint pym = WORK_GROUP_SIZE * gl_WorkGroupID.y;
    uint pxp = WORK_GROUP_SIZE * (gl_WorkGroupID.x + 1);
    uint pyp = WORK_GROUP_SIZE * (gl_WorkGroupID.y + 1);

    // Screen size rounded to a whole number of tiles.
    uint uWindowWidthEvenlyDivisibleByTileRes = WORK_GROUP_SIZE * GetNumTilesX();
    uint uWindowHeightEvenlyDivisibleByTileRes = WORK_GROUP_SIZE * GetNumTilesY();

    // The four tile corners at projection-space z = 1, taken to view space.
    // Y is flipped (height - py) before being mapped to [-1,1].
    vec4 frustum[4];
    frustum[0] = ConvertProjToView( vec4( pxm / float(uWindowWidthEvenlyDivisibleByTileRes) * 2.0f - 1.0f, (uWindowHeightEvenlyDivisibleByTileRes - pym) / float(uWindowHeightEvenlyDivisibleByTileRes) * 2.0f - 1.0f, 1.0f, 1.0f) );
    frustum[1] = ConvertProjToView( vec4( pxp / float(uWindowWidthEvenlyDivisibleByTileRes) * 2.0f - 1.0f, (uWindowHeightEvenlyDivisibleByTileRes - pym) / float(uWindowHeightEvenlyDivisibleByTileRes) * 2.0f - 1.0f, 1.0f, 1.0f) );
    frustum[2] = ConvertProjToView( vec4( pxp / float(uWindowWidthEvenlyDivisibleByTileRes) * 2.0f - 1.0f, (uWindowHeightEvenlyDivisibleByTileRes - pyp) / float(uWindowHeightEvenlyDivisibleByTileRes) * 2.0f - 1.0f, 1.0f ,1.0f) );
    frustum[3] = ConvertProjToView( vec4( pxm / float(uWindowWidthEvenlyDivisibleByTileRes) * 2.0f - 1.0f, (uWindowHeightEvenlyDivisibleByTileRes - pyp) / float(uWindowHeightEvenlyDivisibleByTileRes) * 2.0f - 1.0f, 1.0f, 1.0f) );

    // One side plane per pair of adjacent corners (plus the view-space origin).
    for (int i = 0; i < 4; i++)
        frustumEqn[i] = CreatePlaneEquation(frustum[i], frustum[(i+1) & 3]);

    barrier();

    int threadsPerTile = WORK_GROUP_SIZE * WORK_GROUP_SIZE;

    // Each invocation tests lights il = localIndex, localIndex + threadsPerTile, ...
    for (uint i = 0; i < MAX_LIGHTS; i+= threadsPerTile)
    {
        uint il = gl_LocalInvocationIndex + i;

        if (il < MAX_LIGHTS)
        {
            PointLight p = pointLights[il];

            vec4 viewPos = view * vec4(p.position, 1.0f);
            float r = p.radius;

            // Depth-range test.
            // NOTE(review): minDepthZ/maxDepthZ come from the stored depth
            // while viewPos.z is a view-space coordinate — confirm the units match.
            if (viewPos.z + minDepthZ < r && viewPos.z - maxDepthZ < r)
            {

            // Sphere-vs-plane test against the four side planes.
            if( ( GetSignedDistanceFromPlane( viewPos, frustumEqn[0] ) < r ) &&
                ( GetSignedDistanceFromPlane( viewPos, frustumEqn[1] ) < r ) &&
                ( GetSignedDistanceFromPlane( viewPos, frustumEqn[2] ) < r ) &&
                ( GetSignedDistanceFromPlane( viewPos, frustumEqn[3] ) < r) )

                {
                    // Reserve a slot in the shared list and record the light.
                    uint id = atomicAdd(pointLightCount, 1);
                    pointLightIndex[id] = il;
                }
            }

        }
    }

    // The light list must be complete before shading starts.
    barrier();

奇怪的是,当我把每个图块的灯光数量可视化时,结果显示所有图块都受到了某些灯光的影响(第一张图片)。

第二张图片显示了最终的输出:屏幕中间只有一条细细的光带,其上方和下方什么都没有。去掉剔除(GetSignedDistanceFromPlane())可以得到期望的结果,但帧率会直线下降。

在此处输入图片说明

在此处输入图片说明

我的猜测是视锥体的构造有误,但是我不太确定其背后的数学原理,希望能得到一些帮助。

编辑:添加了另一个图像,显示预期的输出。

在此处输入图片说明

更新1

我们已经更改了剔除的方式,现在的代码如下所示:

// Revised culling: tile frustum planes are derived from the projection matrix
// terms proj[0][0] and proj[1][1] and tested against each light's bounding sphere.
barrier();

// Convert the reduced integer depths back to floating point.
float minDepthZ = float(minDepth / float(0xFFFFFFFF));
float maxDepthZ = float(maxDepth / float(0xFFFFFFFF));

//total tiles = tileScale * 2
vec2 tileScale = vec2(1280, 720) * (1.0f / float(2*WORK_GROUP_SIZE));
vec2 tileBias = tileScale - vec2(gl_WorkGroupID.xy);

// Scaled and biased vectors built from the projection terms, restricted to this tile.
vec4 c1 = vec4(-proj[0][0] * tileScale.x, 0.0f, tileBias.x, 0.0f);
vec4 c2 = vec4(0.0f, -proj[1][1] * tileScale.y, tileBias.y, 0.0f);
vec4 c4 = vec4(0.0f, 0.0f, 1.0f, 0.0f);

 // Derive frustum planes
vec4 frustumPlanes[6];
// Sides
//right
frustumPlanes[0] = c4 - c1;
//left
frustumPlanes[1] = c4 + c1;
//bottom
frustumPlanes[2] = c4 - c2;
//top
frustumPlanes[3] = c4 + c2;
// Near/far
// Planes 4 and 5 are built here but the culling loop below only tests 0..3.
frustumPlanes[4] = vec4(0.0f, 0.0f,  1.0f, -minDepthZ);
frustumPlanes[5] = vec4(0.0f, 0.0f, -1.0f,  maxDepthZ);

// Normalize the side planes so the dot product below is a true distance.
for(int i = 0; i < 4; i++)
{
    frustumPlanes[i] *= 1.0f / length(frustumPlanes[i].xyz);
}

//DO CULLING HERE
// NOTE(review): assuming the 16x16 local size of the full shader,
// gl_LocalInvocationIndex spans WORK_GROUP_SIZE * WORK_GROUP_SIZE invocations
// but the stride is only WORK_GROUP_SIZE, so most light indices are visited by
// several invocations — confirm whether the stride should be the full
// invocation count.
for (uint lightIndex = gl_LocalInvocationIndex; lightIndex < numActiveLights; lightIndex += WORK_GROUP_SIZE)
{
    // NOTE(review): `radius_length` below does not match the `radius` field of
    // the PointLight struct in the full shader — presumably the struct was
    // changed in this revision; verify.
    PointLight p = pointLights[lightIndex];

    // Always true here: the loop condition already guarantees it.
    if (lightIndex < numActiveLights)
    {
        bool inFrustum = true;
        for (uint i = 0; i < 4; i++)
        {
            // Signed distance of the view-space light position from plane i;
            // the sphere is kept when it is no further than its radius outside.
            float dd = dot(frustumPlanes[i], view * vec4(p.position, 1.0f));
            inFrustum = inFrustum && (dd >= -p.radius_length);
        }

        if (inFrustum)
        {
            // Reserve a slot in the shared list and record the light.
            uint id = atomicAdd(pointLightCount, 1);
            pointLightIndex[id] = lightIndex;
        }
    }
}

// The light list must be complete before shading starts.
barrier();

这样效果更好,现在灯光可以针对各个图块被正确剔除(最小/最大深度除外,因为尚未正确实现)。到目前为止一切都好,但是!灯光的边缘处存在问题:图块没有覆盖灯光的整个半径,而且性能非常糟糕。1024盏灯最多只能达到40fps,并且伴随大量卡顿。

该视频显示了边缘处发生的情况:灰色图块是受该灯光(单个点光源)影响的图块,红色部分是被着色的几何体。

http://www.youtube.com/watch?v=PiwGcFb9rWk&feature=youtu.be

在剔除时把半径放大可以让结果“正确”,但会让性能下降得更厉害。

Answers:


5

最终答案,性能问题解决了!我把剔除循环改成了下面这样(基于Dice在BF3中使用的剔除循环):

// Light-culling loop. The lights are processed in passes of threadCount lights;
// in each pass every invocation of the tile tests one light, so each index
// below numActiveLights is visited by exactly one invocation.
// p, pos, rad, inFrustum, dist and id are expected to be declared by the
// surrounding shader code.
uint threadCount = WORK_GROUP_SIZE * WORK_GROUP_SIZE;
uint passCount = (numActiveLights + threadCount - 1) / threadCount;
for (uint passIt = 0; passIt < passCount; ++passIt)
{
    uint lightIndex = passIt * threadCount + gl_LocalInvocationIndex;

    // Invocations that run past the end of the list are clamped to index
    // numActiveLights (one past the last active light).
    // NOTE(review): presumably a null/sentinel light is stored at that index —
    // verify against the code that fills the buffer.
    lightIndex = min(lightIndex, numActiveLights);

    p = pointLights[lightIndex];
    pos = view * vec4(p.position, 1.0f);
    rad = p.radius_length;

    // Soft cap: this is a plain read of the shared counter, so it can be
    // slightly exceeded when several invocations append at the same time.
    if (pointLightCount < MAX_LIGHTS_PER_TILE)
    {
        inFrustum = true;
        // Signed counter on purpose: with an unsigned counter `i >= 0` is
        // always true, so after plane 0 the index would wrap around and read
        // outside frustumPlanes.
        for (int i = 3; i >= 0 && inFrustum; i--)
        {
            // Keep the light while its sphere is no further than `rad`
            // outside plane i.
            dist = dot(frustumPlanes[i], pos);
            inFrustum = (-rad <= dist);
        }

        if (inFrustum)
        {
            // Reserve a slot in the shared list and record the light.
            id = atomicAdd(pointLightCount, 1);
            pointLightIndex[id] = lightIndex;
        }
    }
}

我现在可以以80 fps的帧率渲染4096盏灯,我感到非常高兴。


2

部分解决了问题。这是新的剔除代码,适用于除远近平面以外的所有内容。性能仍然很差,因此如果有人可以看到导致它的原因,我们将不胜感激。

        // Partial fix: per-tile depth reduction, frustum planes derived from the
        // projection matrix, then sphere-vs-plane culling of the lights.
        ivec2 pixel = ivec2(gl_GlobalInvocationID.xy);

    // w holds the depth value used below.
    vec4 normalColor = imageLoad(normalDepth, pixel);

    float d = normalColor.w;

    // Quantize the depth to an unsigned integer so atomicMin/atomicMax can be used.
    uint depth = uint(d * 0xFFFFFFFF);

    atomicMin(minDepth, depth);
    atomicMax(maxDepth, depth);

    // Wait until every invocation of the tile has contributed its depth.
    barrier();

    float minDepthZ = float(minDepth / float(0xFFFFFFFF));
    float maxDepthZ = float(maxDepth / float(0xFFFFFFFF));

    vec2 tileScale = vec2(1280, 720) * (1.0f / float( 2 * WORK_GROUP_SIZE));
    vec2 tileBias = tileScale - vec2(gl_WorkGroupID.xy);

    // Scaled and biased vectors built from the projection matrix entries,
    // restricted to this tile.
    vec4 col1 = vec4(-proj[0][0]  * tileScale.x, proj[0][1], tileBias.x, proj[0][3]); 

    vec4 col2 = vec4(proj[1][0], -proj[1][1] * tileScale.y, tileBias.y, proj[1][3]);

    vec4 col4 = vec4(proj[3][0], proj[3][1],  -1.0f, proj[3][3]); 

    vec4 frustumPlanes[6];

    //Left plane
    frustumPlanes[0] = col4 + col1;

    //right plane
    frustumPlanes[1] = col4 - col1;

    //top plane
    frustumPlanes[2] = col4 - col2;

    //bottom plane
    frustumPlanes[3] = col4 + col2;

    // Planes 4 and 5 are built here but the culling loop below only tests 3..0.
    //near
    frustumPlanes[4] =vec4(0.0f, 0.0f, -1.0f,  -minDepthZ);

    //far
    frustumPlanes[5] = vec4(0.0f, 0.0f, -1.0f,  maxDepthZ);

    // Normalize the side planes so the dot product below is a true distance.
    for(int i = 0; i < 4; i++)
    {
        frustumPlanes[i] *= 1.0f / length(frustumPlanes[i].xyz);
    }

    //DO CULLING HERE
    // NOTE(review): assuming the 16x16 local size of the full shader,
    // gl_LocalInvocationIndex spans WORK_GROUP_SIZE * WORK_GROUP_SIZE
    // invocations but the stride is only WORK_GROUP_SIZE, so most light
    // indices are visited by several invocations — confirm whether the stride
    // should be the full invocation count.
    for (uint lightIndex = gl_LocalInvocationIndex; lightIndex < numActiveLights; lightIndex += WORK_GROUP_SIZE)
    {
        PointLight p = pointLights[lightIndex];

        // Plain read of the shared counter, used as a soft cap on the list size.
        if (pointLightCount < MAX_LIGHTS_PER_TILE)
        {
            bool inFrustum = true;
            // NOTE(review): i is unsigned, so `i >= 0` is always true; when a
            // light passes all four planes, i-- wraps around and the loop keeps
            // indexing frustumPlanes out of range until inFrustum becomes
            // false — confirm, and consider a signed counter.
            for (uint i = 3; i >= 0 && inFrustum; i--)
            {
                float dd = dot(frustumPlanes[i], view * vec4(p.position, 1.0f));
                inFrustum = (dd >= -p.radius_length);
            }

            if (inFrustum)
            {
                // Reserve a slot in the shared list and record the light.
                uint id = atomicAdd(pointLightCount, 1);
                pointLightIndex[id] = lightIndex;
            }
        }
    }

    // The light list must be complete before shading starts.
    barrier();

实际效果:

http://www.youtube.com/watch?v=8SnvYya1Jn8&feature=youtu.be


1
我有一些实现灯光索引渲染/延迟渲染的经验。至于灯光的边缘,您可以看一下 imdoingitwrong.wordpress.com/2011/01/31/light-attenuation ,它可以让您指定一个截断灯光的阈值,并给出了一个公式,用于计算需要传递给着色器的缩放比例。至于近平面和远平面,我在灯光索引渲染中也遇到过很多麻烦。我发现最好的办法是:对与近平面相交的灯光绘制全屏四边形。至于远平面,您可以了解一下深度钳制(GL_ARB_depth_clamp)。
ashleysmithgpu 2013年

1
抱歉,空间不足:)。至于性能,您可能要分析您的应用程序。我可以想象将照明计算移到if(inFrustum)测试内部会有所帮助,因为您避免需要写入内存,循环和从内存读取来计算照明。
ashleysmithgpu 2013年

谢谢您的帮助!我一直在尝试进行性能分析,目前拖垮性能的正是剔除阶段。具体来说,似乎是对inFrustum的写入(inFrustum = (dd >= -p.radius_length);)出于某种原因彻底破坏了性能,我不知道为什么。它应该位于本地内存中,而不是在线程之间共享;也许是导致了过多的分支?我也不太确定如何把光照计算移到if(inFrustum)条件内部,因为每个线程都需要完整的灯光列表,不是吗?
Bentebent 2013年
By using our site, you acknowledge that you have read and understand our Cookie Policy and Privacy Policy.
Licensed under cc by-sa 3.0 with attribution required.