用于 double 数组运算的 C# OpenCL GPU 实现

Mar*_*o M -4 .net c# gpu opencl

如何让这个函数中的 for 循环通过 OpenCL 在 GPU 上运行?

    /// <summary>
    /// Computes a running exponentially weighted average over <paramref name="num"/>.
    /// </summary>
    /// <param name="num">Input samples. The first element seeds the running value.</param>
    /// <param name="period">Smoothing period; each new sample is weighted by 2 / (1 + period).</param>
    /// <returns>
    /// A new array with the same length as <paramref name="num"/> holding the running value
    /// after each sample. An empty input yields an empty array.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="num"/> is null.</exception>
    public static double[] Calculate(double[] num, int period)
    {
        if (num == null)
        {
            throw new ArgumentNullException(nameof(num));
        }

        var final = new double[num.Length];
        if (num.Length == 0)
        {
            // Nothing to seed the running value with; previously this case crashed on num[0].
            return final;
        }

        double sum = num[0];
        double coeff = 2.0 / (1.0 + period);

        // Each element depends on the previous running value, so the loop is inherently sequential.
        for (int i = 0; i < num.Length; i++)
        {
            sum += coeff * (num[i] - sum);
            final[i] = sum;
        }

        return final;
    }
Run Code Online (Sandbox Code Playgroud)

Aar*_*. S 5

正如评论者 Cory 所说,请参阅此链接进行设置。

如何在.NET中使用GPU

以下是如何使用此项目:

  1. 添加Nuget Package Cloo
  2. 添加对OpenCLlib.dll的引用
  3. 下载OpenCLLib.zip
  4. 添加 using OpenCL 指令

    // Sample entry point: hands the OpenCL source returned by IsPrime to EasyCL
    // and invokes the "GetIfPrime" kernel with the input array.
    static void Main(string[] args)
    {
        int[] candidates = { 1,2,3,4,5,6,7 };

        // Select the GPU as the accelerator before the kernel source is loaded.
        EasyCL runner = new EasyCL
        {
            Accelerator = AcceleratorDevice.GPU
        };
        runner.LoadKernel(IsPrime);

        // Presumably 0 and candidates.Length are the work offset and work size — confirm against EasyCL.
        // NOTE(review): the kernel declares `int period` but a double literal (1.0) is passed here;
        // confirm that EasyCL converts the value to the kernel's parameter type.
        runner.Invoke("GetIfPrime", 0, candidates.Length, candidates, 1.0);
    }
    
    /// <summary>
    /// OpenCL C source for the <c>GetIfPrime</c> kernel.
    /// Despite the name, the kernel performs no primality test: for each work item it prints
    /// (2.0 / (1.0 + period)) * (num[index] - num[0]) truncated to an int.
    /// </summary>
    static string IsPrime => @"
            kernel void GetIfPrime(global int* num, int period)
            {
                int index = get_global_id(0);
    
                int sum = (2.0 / (1.0 + period)) * (num[index] - num[0]);
                printf("" %d \n"",sum);
            }";
    
    Run Code Online (Sandbox Code Playgroud)


Min*_*neR 5

您描述的问题并不适合在 GPU 上运行。对单个数组的运算无法以能提升性能的方式并行化,因为第 n 个元素的结果依赖于第 1 到第 n 个元素。不过,您可以利用 GPU 同时处理多个数组,让每个 GPU 核心各自处理一个独立的数组。

解决方案的完整代码在答案的末尾,但测试结果,计算 10,000 个数组,每个数组有 10,000 个元素,生成以下结果(在 GTX1080M 和 i7 7700k 上,具有 32GB RAM):

Task Generating Data: 1096.4583ms
Task CPU Single Thread: 596.2624ms
Task CPU Parallel: 179.1717ms
GPU CPU->GPU: 89ms
GPU Execute: 86ms
GPU GPU->CPU: 29ms
Task Running GPU: 921.4781ms
Finished
Run Code Online (Sandbox Code Playgroud)

在此测试中,我们测量了使用具有一个线程的 CPU、具有所有线程的 CPU,最后使用所有内核的 GPU 将结果生成到托管 C# 数组中的速度。我们使用函数 AreTheSame 验证每个测试的结果是否相同。

最快的时间是使用所有线程处理 CPU 上的数组(任务 CPU 并行:179 毫秒)。

GPU 实际上是最慢的(Task Running GPU:约 921 毫秒),但这是因为需要花时间重新整理 C# 数组的格式,才能把它们传输到 GPU 上。

如果消除了这个瓶颈(这很有可能,取决于您的用例),GPU 可能是最快的。如果数据已经以可以立即传输到 GPU 的方式格式化,则 GPU 的总处理时间将为 204 毫秒(CPU->GPU:89 毫秒 + 执行:86 毫秒 + GPU->CPU:29 毫秒 = 204 毫秒)。这仍然比并行 CPU 选项慢,但在不同类型的数据集上,它可能更快。

为了从 GPU 取回数据(实际使用 GPU 的最重要部分),我们使用函数 ComputeCommandQueue.Read。这会将 GPU 上更改的数组传输回 CPU。

要运行以下代码,请引用 Cloo NuGet 包(我使用的是 0.9.1),并确保以 x64 方式编译(因为需要较多内存)。如果找不到 OpenCL 设备,您可能还需要更新显卡驱动程序。

/// <summary>
/// Benchmark that computes a running exponentially weighted average over many independent
/// arrays in three ways (single-threaded CPU, Parallel.For on the CPU, and an OpenCL kernel
/// driven through Cloo) and checks that all three produce the same values.
/// </summary>
class Program
{
    /// <summary>Maximum absolute difference tolerated by <see cref="AreTheSame"/>.</summary>
    private const double Tolerance = 0.0000001;

    /// <summary>
    /// OpenCL C source of the <c>Calc</c> kernel. Each work item processes one logical array,
    /// located inside the flat <c>doubles</c> buffer by <c>offsets[id]</c> and <c>lengths[id]</c>,
    /// and overwrites it in place with its running average.
    /// </summary>
    static string CalculateKernel
    {
        get
        {
            return @"
            kernel void Calc(global int* offsets, global int* lengths, global double* doubles, double periodFactor) 
            {
                int id = get_global_id(0);
                int start = offsets[id];
                int length = lengths[id];
                int end = start + length;
                double sum = doubles[start];

                for(int i = start; i < end; i++)
                {
                    sum = sum + periodFactor * ( doubles[i] - sum );
                    doubles[i] = sum;
                }
            }";
        }
    }

    /// <summary>
    /// CPU reference implementation: running exponentially weighted average of <paramref name="num"/>.
    /// </summary>
    /// <param name="num">Input samples. The first element seeds the running value.</param>
    /// <param name="period">Smoothing period; each new sample is weighted by 2 / (1 + period).</param>
    /// <returns>
    /// A new array of the same length holding the running value after each sample.
    /// An empty input yields an empty array.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="num"/> is null.</exception>
    public static double[] Calculate(double[] num, int period)
    {
        if (num == null)
        {
            throw new ArgumentNullException(nameof(num));
        }

        var final = new double[num.Length];
        if (num.Length == 0)
        {
            // Nothing to seed the running value with; previously this case crashed on num[0].
            return final;
        }

        double sum = num[0];
        double coeff = 2.0 / (1.0 + period);

        for (int i = 0; i < num.Length; i++)
        {
            sum += coeff * (num[i] - sum);
            final[i] = sum;
        }

        return final;
    }

    /// <summary>
    /// Generates the test data, runs the three implementations under a timer each, and
    /// verifies that the parallel and GPU results match the single-threaded results.
    /// </summary>
    static void Main(string[] args)
    {
        int maxElements = 10000;
        int numArrays = 10000;
        int computeCores = 2048; // passed to CalculateGPU as the number of arrays per kernel launch

        double[][] sets = new double[numArrays][];

        using (Timer("Generating Data"))
        {
            Random elementRand = new Random(1);
            for (int i = 0; i < numArrays; i++)
            {
                // Array lengths vary between 90% and 100% of maxElements.
                sets[i] = GetRandomDoubles(elementRand.Next((int)(maxElements * 0.9), maxElements), randomSeed: i);
            }
        }

        int period = 14;

        double[][] singleResults;
        using (Timer("CPU Single Thread"))
        {
            singleResults = CalculateCPU(sets, period);
        }

        double[][] parallelResults;
        using (Timer("CPU Parallel"))
        {
            parallelResults = CalculateCPUParallel(sets, period);
        }

        if (!AreTheSame(singleResults, parallelResults))
        {
            throw new InvalidOperationException("Parallel CPU results differ from the single-threaded results.");
        }

        double[][] gpuResults;
        using (Timer("Running GPU"))
        {
            gpuResults = CalculateGPU(computeCores, sets, period);
        }

        if (!AreTheSame(singleResults, gpuResults))
        {
            throw new InvalidOperationException("GPU results differ from the single-threaded results.");
        }

        Console.WriteLine("Finished");
        Console.ReadKey();
    }

    /// <summary>
    /// Compares two jagged arrays element by element using an absolute tolerance.
    /// </summary>
    /// <returns>
    /// True when both have the same shape and every pair of elements differs by at most
    /// <see cref="Tolerance"/>; otherwise false.
    /// </returns>
    public static bool AreTheSame(double[][] a1, double[][] a2)
    {
        if (a1.Length != a2.Length)
        {
            return false;
        }

        for (int i = 0; i < a1.Length; i++)
        {
            var ar1 = a1[i];
            var ar2 = a2[i];
            if (ar1.Length != ar2.Length)
            {
                return false;
            }

            for (int j = 0; j < ar1.Length; j++)
            {
                if (Math.Abs(ar1[j] - ar2[j]) > Tolerance)
                {
                    return false;
                }
            }
        }

        return true;
    }

    /// <summary>
    /// Runs the <c>Calc</c> kernel over <paramref name="sets"/>, processing
    /// <paramref name="partitionSize"/> arrays per kernel launch, and prints the accumulated
    /// upload, execute and download times.
    /// </summary>
    /// <param name="partitionSize">Number of arrays merged into one buffer per launch; must be positive.</param>
    /// <param name="sets">Input arrays; they are not modified.</param>
    /// <param name="period">Smoothing period; the kernel receives 2 / (1 + period).</param>
    /// <returns>One result array per input array, in the same order.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="sets"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="partitionSize"/> is not positive.</exception>
    public static double[][] CalculateGPU(int partitionSize, double[][] sets, int period)
    {
        if (sets == null)
        {
            throw new ArgumentNullException(nameof(sets));
        }

        if (partitionSize <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(partitionSize), "Partition size must be positive.");
        }

        double[][] results = new double[sets.Length][];
        double periodFactor = 2d / (1d + period);

        Stopwatch sendStopwatch = new Stopwatch();
        Stopwatch executeStopwatch = new Stopwatch();
        Stopwatch receiveStopwatch = new Stopwatch();

        ComputeContextPropertyList cpl = new ComputeContextPropertyList(ComputePlatform.Platforms[0]);

        // All OpenCL objects are released deterministically once the work is done.
        using (ComputeContext context = new ComputeContext(ComputeDeviceTypes.Gpu, cpl, null, IntPtr.Zero))
        using (ComputeProgram program = new ComputeProgram(context, new string[] { CalculateKernel }))
        {
            program.Build(null, null, null, IntPtr.Zero);

            using (ComputeCommandQueue commands = new ComputeCommandQueue(context, context.Devices[0], ComputeCommandQueueFlags.None))
            using (ComputeKernel kernel = program.CreateKernel("Calc"))
            {
                ComputeEventList events = new ComputeEventList();

                int first = 0;
                while (first < sets.Length)
                {
                    int length = Math.Min(partitionSize, sets.Length - first);
                    MergedResults merged = Merge(sets, first, length);

                    // A partition made only of empty arrays has nothing to upload; a zero-sized
                    // buffer cannot be created, so the GPU round trip is skipped for it.
                    if (merged.Doubles.Length > 0)
                    {
                        RunPartition(context, commands, kernel, events, merged, periodFactor,
                            sendStopwatch, executeStopwatch, receiveStopwatch);
                    }

                    // Split the flat buffer back into one result array per input array.
                    for (int i = 0; i < merged.Lengths.Length; i++)
                    {
                        int len = merged.Lengths[i];
                        int off = merged.Offsets[i];

                        var res = new double[len];
                        Array.Copy(merged.Doubles, off, res, 0, len);

                        results[first + i] = res;
                    }

                    first += length;
                }
            }
        }

        // The send stopwatch covers buffer creation (host to device); the receive stopwatch
        // covers the read back (device to host).
        Console.WriteLine("GPU CPU->GPU: " + sendStopwatch.ElapsedMilliseconds + "ms");
        Console.WriteLine("GPU Execute: " + executeStopwatch.ElapsedMilliseconds + "ms");
        Console.WriteLine("GPU GPU->CPU: " + receiveStopwatch.ElapsedMilliseconds + "ms");

        return results;
    }

    // Uploads one merged partition, launches the kernel over it and reads the processed values
    // back into merged.Doubles, accumulating the time of each phase in the given stopwatches.
    private static void RunPartition(
        ComputeContext context,
        ComputeCommandQueue commands,
        ComputeKernel kernel,
        ComputeEventList events,
        MergedResults merged,
        double periodFactor,
        Stopwatch sendStopwatch,
        Stopwatch executeStopwatch,
        Stopwatch receiveStopwatch)
    {
        // CopyHostPointer makes OpenCL copy the array contents when the buffer is created.
        // UseHostPointer would require the managed arrays to stay pinned for the whole
        // lifetime of the buffer, which this code does not guarantee.
        const ComputeMemoryFlags bufferFlags = ComputeMemoryFlags.ReadWrite | ComputeMemoryFlags.CopyHostPointer;

        sendStopwatch.Start();

        using (ComputeBuffer<int> offsetBuffer = new ComputeBuffer<int>(context, bufferFlags, merged.Offsets))
        using (ComputeBuffer<int> lengthsBuffer = new ComputeBuffer<int>(context, bufferFlags, merged.Lengths))
        using (ComputeBuffer<double> doublesBuffer = new ComputeBuffer<double>(context, bufferFlags, merged.Doubles))
        {
            kernel.SetMemoryArgument(0, offsetBuffer);
            kernel.SetMemoryArgument(1, lengthsBuffer);
            kernel.SetMemoryArgument(2, doublesBuffer);
            kernel.SetValueArgument(3, periodFactor);

            sendStopwatch.Stop();

            // One work item per logical array in this partition.
            // NOTE(review): Execute presumably only enqueues the kernel, so part of the real
            // execution time may be attributed to the Finish() call in the receive phase.
            executeStopwatch.Start();
            commands.Execute(kernel, null, new long[] { merged.Lengths.Length }, null, events);
            executeStopwatch.Stop();

            // The destination array must not move while the non-blocking read is in flight.
            using (var pin = Pinned(merged.Doubles))
            {
                receiveStopwatch.Start();
                commands.Read(doublesBuffer, false, 0, merged.Doubles.Length, pin.Address, events);
                commands.Finish();
                receiveStopwatch.Stop();
            }
        }
    }

    /// <summary>Pins <paramref name="obj"/> until the returned handle is disposed.</summary>
    public static PinnedHandle Pinned(object obj) => new PinnedHandle(obj);

    /// <summary>
    /// Disposable wrapper around a pinned <see cref="GCHandle"/>; exposes the address of the
    /// pinned object and frees the handle on dispose.
    /// </summary>
    public class PinnedHandle : IDisposable
    {
        public IntPtr Address => handle.AddrOfPinnedObject();
        private GCHandle handle;
        public PinnedHandle(object val)
        {
            handle = GCHandle.Alloc(val, GCHandleType.Pinned);
        }
        public void Dispose()
        {
            // Freeing an already freed handle throws, so a second Dispose is made a no-op.
            if (handle.IsAllocated)
            {
                handle.Free();
            }
        }
    }

    /// <summary>
    /// Several arrays flattened into one buffer, with the start offset and length of each
    /// original array inside <see cref="Doubles"/>.
    /// </summary>
    public class MergedResults
    {
        public double[] Doubles { get; set; }
        public int[] Lengths { get; set; }
        public int[] Offsets { get; set; }
    }

    /// <summary>
    /// Concatenates <paramref name="length"/> arrays of <paramref name="sets"/>, starting at
    /// index <paramref name="offset"/>, into a single flat buffer.
    /// </summary>
    /// <returns>The flat buffer together with each array's offset and length inside it.</returns>
    /// <exception cref="OverflowException">The combined length does not fit in an int.</exception>
    public static MergedResults Merge(double[][] sets, int offset, int length)
    {
        int[] lengths = new int[length];
        int[] offsets = new int[length];

        int totalLength = 0;
        for (int i = 0; i < length; i++)
        {
            int len = sets[i + offset].Length;
            lengths[i] = len;
            offsets[i] = totalLength;
            totalLength = checked(totalLength + len);
        }

        double[] doubles = new double[totalLength];
        for (int i = 0; i < length; i++)
        {
            Array.Copy(sets[i + offset], 0, doubles, offsets[i], lengths[i]);
        }

        return new MergedResults()
        {
            Doubles = doubles,
            Lengths = lengths,
            Offsets = offsets,
        };
    }

    /// <summary>
    /// Starts a stopwatch that prints "Task {name}: {elapsed}ms" when the returned object is disposed.
    /// </summary>
    public static IDisposable Timer(string name)
    {
        return new SWTimer(name);
    }

    /// <summary>Stopwatch that starts on construction and reports its elapsed time on dispose.</summary>
    public class SWTimer : IDisposable
    {
        private Stopwatch _sw;
        private string _name;
        public SWTimer(string name)
        {
            _name = name;
            _sw = Stopwatch.StartNew();
        }
        public void Dispose()
        {
            _sw.Stop();
            Console.WriteLine("Task " + _name + ": " + _sw.Elapsed.TotalMilliseconds + "ms");
        }

    }

    /// <summary>Applies <see cref="Calculate"/> to every array sequentially on the calling thread.</summary>
    public static double[][] CalculateCPU(double[][] arrays, int period)
    {
        double[][] results = new double[arrays.Length][];
        for (var index = 0; index < arrays.Length; index++)
        {
            var arr = arrays[index];
            results[index] = Calculate(arr, period);
        }
        return results;
    }

    /// <summary>Applies <see cref="Calculate"/> to every array using <c>Parallel.For</c>.</summary>
    public static double[][] CalculateCPUParallel(double[][] arrays, int period)
    {
        double[][] results = new double[arrays.Length][];

        // Each iteration writes only its own slot of results, so no locking is needed.
        Parallel.For(0, arrays.Length, i =>
         {
             var arr = arrays[i];
             results[i] = Calculate(arr, period);
         });
        return results;
    }

    /// <summary>
    /// Returns <paramref name="num"/> pseudo-random values, each computed as
    /// NextDouble() * 0.9 + 0.05, from a generator seeded with <paramref name="randomSeed"/>.
    /// </summary>
    static double[] GetRandomDoubles(int num, int randomSeed)
    {
        Random r = new Random(randomSeed);
        var res = new double[num];
        for (int i = 0; i < num; i++)
        {
            res[i] = r.NextDouble() * 0.9 + 0.05;
        }
        return res;
    }
}
Run Code Online (Sandbox Code Playgroud)