iOS上数组的Swift Metal并行和计算

Mar*_*vec 2 arrays posix-api ios swift metal

基于@Kametrixom 答案,我已经做了一些测试应用程序来并行计算数组中的总和。

我的测试应用程序如下所示:

import UIKit
import Metal

class ViewController: UIViewController {

// Data type, has to be the same as in the shader
typealias DataType = CInt

override func viewDidLoad() {
    super.viewDidLoad()

    let data = (0..<10000000).map{ _ in DataType(200) } // Our data, randomly generated


    var start, end : UInt64


    var result:DataType = 0
    start = mach_absolute_time()
    data.withUnsafeBufferPointer { buffer in
        for elem in buffer {
            result += elem
        }
    }
    end = mach_absolute_time()

    print("CPU result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")

    result = 0


    start = mach_absolute_time()
    result = sumParallel4(data)
    end = mach_absolute_time()

    print("Metal result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")


    result = 0

    start = mach_absolute_time()
    result = sumParralel(data)
    end = mach_absolute_time()

    print("Metal result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")

    result = 0

    start = mach_absolute_time()
    result = sumParallel3(data)
    end = mach_absolute_time()

    print("Metal result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")





}

func sumParralel(data : Array<DataType>) -> DataType {

    let count = data.count
    let elementsPerSum: Int = Int(sqrt(Double(count)))

    let device = MTLCreateSystemDefaultDevice()!
    let parsum = device.newDefaultLibrary()!.newFunctionWithName("parsum")!
    let pipeline = try! device.newComputePipelineStateWithFunction(parsum)


    var dataCount = CUnsignedInt(count)
    var elementsPerSumC = CUnsignedInt(elementsPerSum)
    let resultsCount = (count + elementsPerSum - 1) / elementsPerSum // Number of individual results = count / elementsPerSum (rounded up)


    let dataBuffer = device.newBufferWithBytes(data, length: strideof(DataType) * count, options: []) // Our data in a buffer (copied)
    let resultsBuffer = device.newBufferWithLength(strideof(DataType) * resultsCount, options: []) // A buffer for individual results (zero initialized)
    let results = UnsafeBufferPointer<DataType>(start: UnsafePointer(resultsBuffer.contents()), count: resultsCount) // Our results in convenient form to compute the actual result later

    let queue = device.newCommandQueue()
    let cmds = queue.commandBuffer()
    let encoder = cmds.computeCommandEncoder()

    encoder.setComputePipelineState(pipeline)

    encoder.setBuffer(dataBuffer, offset: 0, atIndex: 0)
    encoder.setBytes(&dataCount, length: sizeofValue(dataCount), atIndex: 1)
    encoder.setBuffer(resultsBuffer, offset: 0, atIndex: 2)
    encoder.setBytes(&elementsPerSumC, length: sizeofValue(elementsPerSumC), atIndex: 3)

    // We have to calculate the sum `resultCount` times => amount of threadgroups is `resultsCount` / `threadExecutionWidth` (rounded up) because each threadgroup will process `threadExecutionWidth` threads
    let threadgroupsPerGrid = MTLSize(width: (resultsCount + pipeline.threadExecutionWidth - 1) / pipeline.threadExecutionWidth, height: 1, depth: 1)

    // Here we set that each threadgroup should process `threadExecutionWidth` threads, the only important thing for performance is that this number is a multiple of `threadExecutionWidth` (here 1 times)
    let threadsPerThreadgroup = MTLSize(width: pipeline.threadExecutionWidth, height: 1, depth: 1)

    encoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)
    encoder.endEncoding()


    var result : DataType = 0


    cmds.commit()
    cmds.waitUntilCompleted()
    for elem in results {
        result += elem
    }


    return result
}



func sumParralel1(data : Array<DataType>) -> UnsafeBufferPointer<DataType> {

    let count = data.count
    let elementsPerSum: Int = Int(sqrt(Double(count)))

    let device = MTLCreateSystemDefaultDevice()!
    let parsum = device.newDefaultLibrary()!.newFunctionWithName("parsum")!
    let pipeline = try! device.newComputePipelineStateWithFunction(parsum)


    var dataCount = CUnsignedInt(count)
    var elementsPerSumC = CUnsignedInt(elementsPerSum)
    let resultsCount = (count + elementsPerSum - 1) / elementsPerSum // Number of individual results = count / elementsPerSum (rounded up)

    let dataBuffer = device.newBufferWithBytes(data, length: strideof(DataType) * count, options: []) // Our data in a buffer (copied)
    let resultsBuffer = device.newBufferWithLength(strideof(DataType) * resultsCount, options: []) // A buffer for individual results (zero initialized)
    let results = UnsafeBufferPointer<DataType>(start: UnsafePointer(resultsBuffer.contents()), count: resultsCount) // Our results in convenient form to compute the actual result later

    let queue = device.newCommandQueue()
    let cmds = queue.commandBuffer()
    let encoder = cmds.computeCommandEncoder()

    encoder.setComputePipelineState(pipeline)

    encoder.setBuffer(dataBuffer, offset: 0, atIndex: 0)
    encoder.setBytes(&dataCount, length: sizeofValue(dataCount), atIndex: 1)
    encoder.setBuffer(resultsBuffer, offset: 0, atIndex: 2)
    encoder.setBytes(&elementsPerSumC, length: sizeofValue(elementsPerSumC), atIndex: 3)

    // We have to calculate the sum `resultCount` times => amount of threadgroups is `resultsCount` / `threadExecutionWidth` (rounded up) because each threadgroup will process `threadExecutionWidth` threads
    let threadgroupsPerGrid = MTLSize(width: (resultsCount + pipeline.threadExecutionWidth - 1) / pipeline.threadExecutionWidth, height: 1, depth: 1)

    // Here we set that each threadgroup should process `threadExecutionWidth` threads, the only important thing for performance is that this number is a multiple of `threadExecutionWidth` (here 1 times)
    let threadsPerThreadgroup = MTLSize(width: pipeline.threadExecutionWidth, height: 1, depth: 1)

    encoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)
    encoder.endEncoding()


    cmds.commit()
    cmds.waitUntilCompleted()



    return results
}

func sumParallel3(data : Array<DataType>) -> DataType {

    var results = sumParralel1(data)

    repeat {
        results = sumParralel1(Array(results))
    } while results.count >= 100

    var result : DataType = 0

    for elem in results {
        result += elem
    }


    return result
}

func sumParallel4(data : Array<DataType>) -> DataType {

    let queue = NSOperationQueue()
    queue.maxConcurrentOperationCount = 4

    var a0 : DataType = 0
    var a1 : DataType = 0
    var a2 : DataType = 0
    var a3 : DataType = 0

    let op0 = NSBlockOperation( block : {

        for i in 0..<(data.count/4) {
            a0 = a0 + data[i]
        }

    })

    let op1 = NSBlockOperation( block : {
        for i in (data.count/4)..<(data.count/2) {
            a1 = a1 + data[i]
        }
    })

    let op2 = NSBlockOperation( block : {
        for i in (data.count/2)..<(3 * data.count/4) {
            a2 = a2 + data[i]
        }
    })

    let op3 = NSBlockOperation( block : {
        for i in (3 * data.count/4)..<(data.count) {
            a3 = a3 + data[i]
        }
    })



    queue.addOperation(op0)
    queue.addOperation(op1)
    queue.addOperation(op2)
    queue.addOperation(op3)

    queue.suspended = false
    queue.waitUntilAllOperationsAreFinished()

    let aaa: DataType = a0 + a1 + a2 + a3

    return aaa
 }
}
Run Code Online (Sandbox Code Playgroud)

我有一个看起来像这样的着色器:

kernel void parsum(const device DataType* data [[ buffer(0) ]],
               const device uint& dataLength [[ buffer(1) ]],
               device DataType* sums [[ buffer(2) ]],
               const device uint& elementsPerSum [[ buffer(3) ]],

               const uint tgPos [[ threadgroup_position_in_grid ]],
               const uint tPerTg [[ threads_per_threadgroup ]],
               const uint tPos [[ thread_position_in_threadgroup ]]) {

    uint resultIndex = tgPos * tPerTg + tPos; // This is the index of the individual result, this var is unique to this thread
    uint dataIndex = resultIndex * elementsPerSum; // Where the summation should begin
    uint endIndex = dataIndex + elementsPerSum < dataLength ? dataIndex + elementsPerSum : dataLength; // The index where summation should end

    for (; dataIndex < endIndex; dataIndex++)
        sums[resultIndex] += data[dataIndex];
}
Run Code Online (Sandbox Code Playgroud)

对我而言,惊喜功能sumParallel4是最快的,我认为不应该这样。我注意到,当我调用函数 sumParralel和时sumParallel3,即使更改函数的顺序,第一个函数也总是较慢。(因此,如果我先调用sumParralel,则速度较慢;如果我调用sumParallel3,则速度较慢。)

为什么是这样?为什么sumParallel3没有比sumParallel快很多?为什么sumParallel4是最快的,尽管它是在CPU上计算的?


如何使用更新我的GPU功能posix_memalign?我知道它应该工作得更快,因为它将在GPU和CPU之间共享内存,但是我不知道应该以这种方式分配巫婆数组(数据或结果),如果数据是函数中传递的参数,我该如何使用posix_memalign分配数据?

war*_*enm 6

在iPhone 6上运行这些测试时,我发现Metal版本的运行速度比单纯的CPU总和慢3倍至2倍。通过下面介绍的修改,它始终保持更快的速度。

我发现运行Metal版本的很多成本不仅可以归因于缓冲区的分配(尽管这很重要),还可以归因于设备的首次创建和计算管道状态。这些操作通常在应用程序初始化时执行一次,因此将它们包括在时间中并不完全公平。

还应该注意的是,如果您在启用了Metal验证层和GPU帧捕获的情况下通过Xcode运行这些测试,则这将花费大量的运行时间,并且会使结果偏向CPU。

有了这些警告,这就是您可能posix_memalign用来分配可用于支持的内存的方法MTLBuffer。技巧是确保请求的内存实际上是页面对齐的(即,其地址是的倍数getpagesize()),这可能需要将内存量舍入到超出实际需要存储的数据量之外:

let dataCount = 1_000_000
let dataSize = dataCount * strideof(DataType)
let pageSize = Int(getpagesize())
let pageCount = (dataSize + (pageSize - 1)) / pageSize
var dataPointer: UnsafeMutablePointer<Void> = nil
posix_memalign(&dataPointer, pageSize, pageCount * pageSize)
let data = UnsafeMutableBufferPointer(start: UnsafeMutablePointer<DataType>(dataPointer),
                                      count: (pageCount * pageSize) / strideof(DataType))

for i in 0..<dataCount {
    data[i] = 200
}
Run Code Online (Sandbox Code Playgroud)

这确实需要制作data一个UnsafeMutableBufferPointer<DataType>,而不是[DataType],因为Swift会Array分配自己的后备存储。您还需要传递数据项的数量以进行操作,因为count可变缓冲区指针的s已四舍五入以使缓冲区页面对齐。

要实际MTLBuffer使用此数据创建支持,请使用newBufferWithBytesNoCopy(_:length:options:deallocator:)API。同样重要的是,您提供的长度必须是页面大小的倍数;否则,此方法返回nil

let roundedUpDataSize = strideof(DataType) * data.count
let dataBuffer = device.newBufferWithBytesNoCopy(data.baseAddress, length: roundedUpDataSize, options: [], deallocator: nil)
Run Code Online (Sandbox Code Playgroud)

在这里,我们没有提供解除分配器,但是您应该在使用完baseAddress之后通过将缓冲区指针传递到来释放内存free()