基础

Compute Shaders是在GPU运行却又在普通渲染管线之外的程序。用于运行GPGPU program。
并行算法被拆分成很多线程组，每个线程组又包含很多线程，例如可以让一个线程处理一个像素点。需要注意线程的执行顺序是不确定的，并不保证按固定顺序处理，例如不一定是从左到右逐个处理像素点。

线程组

一个线程组（Thread Group）运行在一个GPU计算单元（a single multiprocessor）上。如果GPU有16个
multiprocessor，那么程序至少要分成16个线程组，才能使每个multiprocessor都参与计算。
组之间不分享内存。

线程

一个线程组包含n个线程，每32个thread称为一个warp（nvidia：warp=32 ,ati:wavefront=64,因此未来此数字可能会更高）。
从效率考虑，一个线程组包含的线程数最好是warp的倍数，256是一个比较合适的数字。

语法

numthreads定义单个线程组的大小，例如numthreads(8,8,1)表示该线程组是一个包含 8 * 8 * 1 个线程的三维线程矩阵
在外部调用computeShader.Dispatch(2,2,1)，表示启动 2 * 2 * 1 个线程组
SV_DispatchThreadID表示当前线程Id，取值范围为(0,0,0)~(threadx * thread_groupx - 1, thready * thread_groupy - 1, threadz * thread_groupz - 1)
因此在使用时要根据实际处理图片的分辨率来设置thread以及thread_group，使SV_DispatchThreadID的xy取值可以覆盖图片分辨率，这样才能处理到每个像素
打个比方：图片像素为512*512，那么thread写成(8,8,1), thread_group写成(512/8,512/8,1)
ComputeShader如下，相当于对图片每个像素写入红色，因为id.xy的范围是(0~511, 0~511)
一维调度：
DispatchIndex = DispatchThreadID.x + DispatchThreadID.y * (numthreads.x * Dispatch.x) + DispatchThreadID.z * (numthreads.x * Dispatch.x) * (numthreads.y * Dispatch.y)

#pragma kernel FillWithRed

// Output texture; each dispatched thread writes one texel of it.
RWTexture2D<float4> res;

// numthreads declares the size of a single thread group: here a 3-D grid of 8*8*1 threads.
// Calling computeShader.Dispatch(2,2,1) on the host side launches 2*2*1 such groups.
// SV_DispatchThreadID is the global id of the current thread and ranges from (0,0,0) to
// (threadx*thread_groupx-1, thready*thread_groupy-1, threadz*thread_groupz-1).
[numthreads(8, 8, 1)]
void FillWithRed(uint3 id : SV_DispatchThreadID)
{
    // Opaque red, written to the texel addressed by the thread's x/y id.
    const float4 red = float4(1, 0, 0, 1);
    uint2 texel = id.xy;
    res[texel] = red;
}

测试代码

Shader

// Draws the elements of the "Result" structured buffer as vertices.
// The vertex stage takes no mesh attributes: it looks up position and colour
// in the buffer by SV_VertexID, so the shader suits procedural draw calls.
Shader "Unlit/ComputTest"
{
    Properties
    {
        // Exposed on the material; the pass below outputs the per-element buffer colour instead.
        _Color ("Color", Color) = (1,1,1,1)
    }
    SubShader
    {
        Tags { "RenderType" = "Opaque" "RenderPipeline" = "UniversalPipeline"}

        HLSLINCLUDE

        #include "Packages/com.unity.render-pipelines.universal/ShaderLibrary/Core.hlsl"
        #include "Packages/com.unity.render-pipelines.universal/ShaderLibrary/Lighting.hlsl"

        CBUFFER_START(UnityPerMaterial)
        float4 _Color;
        CBUFFER_END

        // One buffer element: 3 floats of position followed by 4 floats of colour.
        struct ParticleData
        {
            float3 position;
            float4 color;
        };
        // Bound from script under the name "Result".
        StructuredBuffer<ParticleData> Result;

        ENDHLSL

        Pass
        {
            // Standard alpha blending, stated once for the pass.
            Blend SrcAlpha OneMinusSrcAlpha
            HLSLPROGRAM
            // Reading a StructuredBuffer in the vertex stage needs a compute-capable target.
            #pragma target 4.5
            #pragma vertex vert
            #pragma fragment frag

            struct v2f
            {
                float4 vertex : SV_POSITION;
                float4 color : COLOR;
            };

            // Fetches element `id` of Result and transforms its object-space position to clip space.
            v2f vert (uint id : SV_VertexID)
            {
                ParticleData particle = Result[id];

                v2f o;
                // TransformObjectToHClip takes a float3 position; w = 1 is applied internally.
                o.vertex = TransformObjectToHClip(particle.position);
                o.color = particle.color;
                return o;
            }

            // Outputs the interpolated per-element colour unchanged.
            float4 frag (v2f i) : SV_Target
            {
                return i.color;
            }
            ENDHLSL
        }
    }
}

using System.Collections;
using System.Collections.Generic;
using UnityEngine;

/// <summary>
/// CPU-side element of the particle buffer: a <see cref="Vector3"/> position
/// followed by a <see cref="Color"/>. The fields are private and are never
/// assigned here, so every instance holds default values.
/// </summary>
public struct ParticleData
{
    private Vector3 position;
    private Color color;
}
/// <summary>
/// Runs the "CSMain" compute kernel over a particle buffer and draws that
/// buffer as points with <see cref="material"/> whenever the object is rendered.
/// </summary>
public class ComputeTest : MonoBehaviour
{
    // Threads in one group of the CSMain kernel, i.e. its [numthreads(10,10,10)].
    private const int ThreadsPerGroup = 10 * 10 * 10;

    public ComputeShader computeShader;

    ParticleData[] particleDatas;

    // Number of particles to simulate and draw; read once in Start.
    public int count = 64;
    public Color color;
    public int size = 1;

    public Material material;

    private ComputeBuffer _buffer;
    private int id;

    // Thread groups dispatched along x; y and z always use a single group.
    private int _groupCount;

    // Points actually drawn; the buffer itself may be larger because of padding.
    private int _drawCount;

    /// <summary>
    /// Allocates the particle buffer and binds it to both the compute kernel and the material.
    /// </summary>
    void Start()
    {
        if (count <= 0)
        {
            // A ComputeBuffer cannot be created with a non-positive element count.
            Debug.LogError("ComputeTest: count must be greater than zero.", this);
            enabled = false;
            return;
        }

        _drawCount = count;

        // Round up to whole thread groups and pad the buffer to match, so every
        // dispatched thread maps to a valid element and none indexes out of range.
        _groupCount = (count + ThreadsPerGroup - 1) / ThreadsPerGroup;
        int capacity = _groupCount * ThreadsPerGroup;

        int vec3Stride = sizeof(float) * 3;
        int colorStride = sizeof(float) * 4;
        _buffer = new ComputeBuffer(capacity, vec3Stride + colorStride);
        id = computeShader.FindKernel("CSMain");

        // A freshly allocated struct array is already zero-initialised.
        particleDatas = new ParticleData[capacity];
        _buffer.SetData(particleDatas);
        computeShader.SetBuffer(id, "Result", _buffer);
        material.SetBuffer("Result", _buffer);
    }

    /// <summary>
    /// Updates the kernel constants, dispatches the kernel, and then draws the
    /// first <c>count</c> buffer elements as points.
    /// </summary>
    void OnRenderObject()
    {
        if (_buffer == null)
        {
            return;
        }

        computeShader.SetFloat("time", Time.time);
        computeShader.SetFloat("size", size);
        computeShader.SetVector("color", color);

        // The kernel flattens its 3-D thread id using the dispatched group counts,
        // so the value passed here must match the Dispatch call below.
        computeShader.SetVector("_threadGroup", new Vector3(_groupCount, 1, 1));
        computeShader.Dispatch(id, _groupCount, 1, 1);

        material.SetPass(0);
        Graphics.DrawProceduralNow(MeshTopology.Points, _drawCount);
    }

    /// <summary>
    /// Frees the GPU buffer; a single Release is sufficient.
    /// </summary>
    private void OnDestroy()
    {
        if (_buffer != null)
        {
            _buffer.Release();
            _buffer = null;
        }
    }
}

// Each #kernel tells which function to compile; you can have many kernels
#pragma kernel CSMain

// Thread-group dimensions, shared by [numthreads] and the index flattening below
// so the two cannot drift apart.
#define GROUP_SIZE_X 10
#define GROUP_SIZE_Y 10
#define GROUP_SIZE_Z 10

// Constants supplied by the host.
float time;     // added to the particle index inside the sin/cos terms
float4 color;   // written to every particle
float size;     // scales the x and z position terms

// One buffer element: 3 floats of position followed by 4 floats of colour.
struct ParticleData
{
    float3 position;
    float4 color;
};


RWStructuredBuffer<ParticleData> Result;

// Number of thread groups dispatched along x, y and z (expected to hold whole numbers).
float3 _threadGroup;

// Writes colour and an animated position for the particle owned by this thread.
// NOTE(review): there is no bounds check. The host must not dispatch more
// threads than Result has elements, or must pass the element count as a constant.
[numthreads(GROUP_SIZE_X, GROUP_SIZE_Y, GROUP_SIZE_Z)]
void CSMain (uint3 id : SV_DispatchThreadID)
{
    // Flatten the 3-D thread id into a linear index using integer arithmetic;
    // going through float would lose exactness for large indices.
    uint rowWidth = GROUP_SIZE_X * (uint)_threadGroup.x;      // threads along x in the whole dispatch
    uint columnHeight = GROUP_SIZE_Y * (uint)_threadGroup.y;  // threads along y in the whole dispatch
    uint index = id.x + id.y * rowWidth + id.z * rowWidth * columnHeight;

    float phase = index + time;

    // Every field is overwritten, so the previous element value is not read back.
    ParticleData data;
    data.color = color;
    data.position = float3(size * sin(phase), index * 0.002, size * cos(phase));
    data.position.xy *= abs(sin(data.position.y + time * 0.3));
    Result[index] = data;
}

作用

只要有涉及大量数据的处理都可以放在ComputeShader中计算

Unity新版VFX
布料/头发模拟
光追
后处理

参考链接

Compute Shader介绍（一）
Compute Shader介绍（二）
初识ComputeShader
Shader第二十八讲 Compute Shaders