使用 Numba 进行矩阵乘法时出现 CUDA 内存不足错误

sec*_*ive -3 cuda matrix-multiplication pycuda numba

我需要将矩阵与其转置相乘,但我的 GPU 内存不足并出现错误消息numba.cuda.cudadrv.driver.CudaAPIError: [2] Call to cuMemAlloc results in CUDA_ERROR_OUT_OF_MEMORY

我预计矩阵的大小约为 10k 行和 100k 列,因此将其与其 trnspose 相乘将得到 10k 行和 10k 列的方阵的结果。矩阵只包含0和1。

这是我正在运行的脚本。

from numba import cuda, uint16
import numba
import numpy
import math
import time

TPB = 16

@cuda.jit()
def matmul_shared_mem(A, B, C):
    sA = cuda.shared.array((TPB, TPB), dtype=uint16)
    sB = cuda.shared.array((TPB, TPB), dtype=uint16)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    if x >= C.shape[0] and y >= C.shape[1]:
        return
    tmp = 0.
    for i in range(int(A.shape[1] / TPB)):
        sA[tx, ty] = A[x, ty + i * TPB]
        sB[tx, ty] = B[tx + i * TPB, y]
        cuda.syncthreads()
        for j in range(TPB):
            tmp += sA[tx, j] * sB[j, ty]
        cuda.syncthreads()
    C[x, y] = tmp

A = numpy.random.randint(2, size=(TPB * 625, 50000))

B = A.transpose()

C_shared_mem = cuda.device_array((A.shape[0], B.shape[1]))

threads_per_block = (TPB, TPB)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(math.ceil(B.shape[1] / threads_per_block[1]))
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)

start_gpu_shared_memory = time.time()
matmul_shared_mem[blocks_per_grid, threads_per_block](A, B, C_shared_mem)
cuda.synchronize()
end_gpu_shared_memory = time.time()

time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
Run Code Online (Sandbox Code Playgroud)

更新1:

根据您的建议,我做了一些更改,但我的内存仍然不足。

import numpy as np
import numba as nb
colm = int(200000/8)
rows = 100000
cols = int(colm*8)
AU = np.random.randint(2,size=(rows, cols),dtype=np.int8)
A = np.empty((rows,colm), dtype=np.uint8)

@nb.njit('void(uint8[:,:],int8[:,:])', parallel=True)
def compute(A, AU):
    for i in nb.prange(A.shape[0]):
        for j in range(A.shape[1]):
            offset = j * 8
            res = AU[i,offset] << 7
            res |= AU[i,offset+1] << 6
            res |= AU[i,offset+2] << 5
            res |= AU[i,offset+3] << 4
            res |= AU[i,offset+4] << 3
            res |= AU[i,offset+5] << 2
            res |= AU[i,offset+6] << 1
            res |= AU[i,offset+7]
            A[i,j] = res

compute(A, AU)

from numba import cuda, uint8, int32
import numba
import numpy as np
import math
import time

TPB = 8
TPB1 = 9

@cuda.jit()
def bit_A_AT(A, C):
    sA = cuda.shared.array((TPB, TPB), dtype=uint8)
    sB = cuda.shared.array((TPB, TPB1), dtype=uint8)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    if bx >= by:
        tmp = int32(0)
        for i in range((A.shape[1]+TPB-1) // TPB):
            if y < A.shape[0] and (i*TPB+tx) < A.shape[1]:
                sA[ty, tx] = A[y, i*TPB+tx]
            else:
                sA[ty, tx] = 0
            if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
                sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
            else:
                sB[ty, tx] = 0
            cuda.syncthreads()
            for j in range(TPB):
                tmp1 = sA[ty,j] & sB[tx, j]
                test = uint8(1)
                for k in range(8):
                    if (tmp1 & test) > 0:
                        tmp += 1
                    test <<= 1
            cuda.syncthreads()
        if y < C.shape[0] and x < C.shape[1]:
            C[y, x] = tmp

C = np.empty((A.shape[0], A.shape[0]), dtype=np.int32)
threads_per_block = (TPB, TPB)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(math.ceil(A.shape[0] / threads_per_block[1]))
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)

start_gpu_shared_memory = time.time()
bit_A_AT[blocks_per_grid, threads_per_block](A, C)
cuda.synchronize()
end_gpu_shared_memory = time.time()

time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
Run Code Online (Sandbox Code Playgroud)

知道如何解决这个问题吗?

Rob*_*lla 5

以下方法应减少计算 A x AT 所需的设备内存量。我们将使用以下想法:

  • 由于输入数组 ( A) 仅采用 0,1 值,因此我们将该数组的存储空间减少到最小方便大小 ,int8即每个元素一个字节
  • 由于B数组只是A数组的转置,因此不需要显式处理它。我们可以从A数组中导出它,有点类似于这里,尽管它正在执行 AT x A
  • A x AT 的矩阵乘法涉及矩阵 A 各行的点积,如下所示
  • sB我们将使用调整后的索引在数组中提供 A 转置版本
  • 您的代码还有一系列其他更改,以解决各种错误并提高加载/存储效率,例如一般反转 x,y 索引的使用
  • 我还修复了您的用法syncthreads并修改了代码以允许行和列尺寸的任意值

这是一个有效的例子:

$ cat t62.py
from numba import cuda, int32, int8
import numba
import numpy as np
import math
import time

TPB = 32
TPB1 = TPB+1
@cuda.jit()
def byte_A_AT(A, C):
    sA = cuda.shared.array((TPB, TPB),  dtype=int8)
    sB = cuda.shared.array((TPB, TPB1), dtype=int8)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
# uncomment and indent remainder of kernel to only do the "symmetric half" of calculation
#    if bx >= by:
    tmp = int32(0)
    for i in range((A.shape[1]+TPB-1)// TPB):
        if y < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sA[ty, tx] = A[y, i*TPB+tx]
        else:
            sA[ty, tx] = 0
        if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
        else:
            sB[ty, tx] = 0
        cuda.syncthreads()
        for j in range(TPB):
            tmp += int32(sA[ty,j]) * int32(sB[tx, j])
        cuda.syncthreads()
    if y < C.shape[0] and x < C.shape[1]:
        C[y, x] = tmp
rows = 1041
cols = 1043
print('host mem: ', (rows*cols*2+rows*rows*4*2)//1048576, 'MB device mem: ', (rows*cols+rows*rows*4)//1048576, 'MB')
A = np.random.randint(2,size=(rows, cols),dtype=np.int8)
AT = A.transpose()
CU = np.matmul(A,AT, dtype = np.int32)
C = np.empty((A.shape[0], A.shape[0]), dtype=np.int32)
threads_per_block = (TPB, TPB)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(math.ceil(A.shape[0] / threads_per_block[1]))
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
byte_A_AT[blocks_per_grid, threads_per_block](A, C)
cuda.synchronize()
start_gpu_shared_memory = time.time()
byte_A_AT[blocks_per_grid, threads_per_block](A, C)
cuda.synchronize()
end_gpu_shared_memory = time.time()

time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
test = np.array_equal(C, CU)
print(test)
if test == False:
    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            if C[i,j] != CU[i,j]:
                print(i, ' ' , j ,' ' , C[i,j] , ' ' , CU[i,j])
$ python t62.py
host mem:  10 MB device mem:  5 MB
GPU time(shared memory):0.019593000411987305
True
$
Run Code Online (Sandbox Code Playgroud)

笔记:

  • 上述代码的大部分运行时间将花费在python中(在操作中np.matmul(),这实际上只是用于验证结果,实际实现中应该没有必要),而不是在GPU部分。随着矩阵变大,代码运行速度会变慢。
  • 正如评论中提到的,A x AT 的结果是一个对称矩阵。我的代码没有利用这一点,但是我们可以粗略地利用它,方法是在内核开头取消注释 if 测试,然后缩进内核的其余部分。然而,这当然会导致主机代码np.array_equal测试失败。
  • 设备内存消耗是在代码中计算的。对于注释中的最大值(行= 30k,列= 200k),这将达到大约10GB,因此它仍然无法在您的8GB GPU 中运行。
  • 我创建了此代码的一个版本,其中为矩阵打包了每个字节 8 个元素A,这将进一步减少内存需求,但是编写此代码来处理任意列维度(相对于 8 的倍数)的情况被证明是相当混乱的。然而,对于 30k 行和 200k 列的情况,该代码可以将总设备内存消耗降至约 5GB。

这是每元素一位版本,它要求列数是能被 8 整除的整数:

$ cat t61.py
from numba import cuda, uint8, int32
import numba
import numpy as np
import math
import time

TPB = 32
TPB1 = 33

@cuda.jit()
def bit_A_AT(A, C):
    sA = cuda.shared.array((TPB, TPB), dtype=uint8)
    sB = cuda.shared.array((TPB, TPB1), dtype=uint8)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    tmp = int32(0)
    for i in range((A.shape[1]+TPB-1) // TPB):
        if y < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sA[ty, tx] = A[y, i*TPB+tx]
        else:
            sA[ty, tx] = 0
        if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
        else:
            sB[ty, tx] = 0
        cuda.syncthreads()
        for j in range(TPB):
            tmp1 = sA[ty,j] & sB[tx, j]
            test = uint8(1)
            for k in range(8):
                if (tmp1 & test) > 0:
                    tmp += 1
                test <<= 1
        cuda.syncthreads()
    if y < C.shape[0] and x < C.shape[1]:
        C[y, x] = tmp
colm = 131
rows = 1041
cols = int(colm*8)
print('host mem: ', (rows*cols*2+rows*rows*4*2)//1048576, 'MB device mem: ', (((rows*cols)//8)+rows*rows*4)//1048576, 'MB')
AU = np.random.randint(2,size=(rows, cols),dtype=np.int8)
AUT = AU.transpose()
CU = np.matmul(AU,AUT,dtype=np.int32)
A = np.empty((rows,colm), dtype=np.uint8)
for i in range(A.shape[0]):
    for j in range(A.shape[1]):
        A[i,j] = 0
        for k in range(8):
            if AU[i,(j*8)+k] == 1:
                A[i,j] = A[i,j] | (1<<(7-k))
C = np.empty((A.shape[0], A.shape[0]), dtype=np.int32)
threads_per_block = (TPB, TPB)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(math.ceil(A.shape[0] / threads_per_block[1]))
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
bit_A_AT[blocks_per_grid, threads_per_block](A, C)
cuda.synchronize()

start_gpu_shared_memory = time.time()
bit_A_AT[blocks_per_grid, threads_per_block](A, C)
cuda.synchronize()
end_gpu_shared_memory = time.time()

time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
test = np.array_equal(C, CU)
print(test)
if test == False:
    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            if C[i,j] != CU[i,j]:
                print(i, ' ' , j ,' ' , C[i,j] , ' ' , CU[i,j])
                break
$ python t61.py
host mem:  10 MB device mem:  4 MB
GPU time(shared memory):0.009343624114990234
True
$
Run Code Online (Sandbox Code Playgroud)

编辑:回应评论、更新中的一些问题,现在考虑到 A 矩阵可能有明显超过 30k 行,这也会导致 C 矩阵增加。如果A矩阵可以放入GPU内存中,我们可以通过分段计算来减少C矩阵的内存需求。这些片段将是一起计算的一组行,我将其称为row_sliceC 矩阵的 a。以下代码演示了只需对上述代码进行相对较小的更改即可实现此目的:

$ cat t63.py
from numba import cuda, uint8, int32
import numba as nb
import numpy as np
import math
import time

TPB = 32
TPB1 = 33

@cuda.jit()
def bit_slice_A_AT(A, C, row_offset):
    sA = cuda.shared.array((TPB, TPB), dtype=uint8)
    sB = cuda.shared.array((TPB, TPB1), dtype=uint8)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    tmp = int32(0)
    for i in range((A.shape[1]+TPB-1) // TPB):
        if y+row_offset < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sA[ty, tx] = A[y+row_offset, i*TPB+tx]
        else:
            sA[ty, tx] = 0
        if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
        else:
            sB[ty, tx] = 0
        cuda.syncthreads()
        for j in range(TPB):
            tmp1 = sA[ty,j] & sB[tx, j]
            test = uint8(1)
            for k in range(8):
                if (tmp1 & test) > 0:
                    tmp += 1
                test <<= 1
        cuda.syncthreads()
    if y < C.shape[0] and x < C.shape[1]:
        C[y, x] = tmp

@nb.njit('void(uint8[:,:],int8[:,:])', parallel=True)
def bitpack(A, AU):
    for i in nb.prange(A.shape[0]):
        for j in range(A.shape[1]):
            offset = j * 8
            res = AU[i,offset] << 7
            res |= AU[i,offset+1] << 6
            res |= AU[i,offset+2] << 5
            res |= AU[i,offset+3] << 4
            res |= AU[i,offset+4] << 3
            res |= AU[i,offset+5] << 2
            res |= AU[i,offset+6] << 1
            res |= AU[i,offset+7]
            A[i,j] = res

colm = 131
rows = 1535
cols = int(colm*8)
row_slice = 512
print('host mem: ', (rows*cols*2+rows*rows*4*2)//1048576, 'MB device mem: ', (((rows*cols)//8)+row_slice*rows*4)//1048576, 'MB')
AU = np.random.randint(2,size=(rows, cols),dtype=np.int8)
CU = np.matmul(AU,AU.T,dtype=np.int32)
A = np.empty((rows,colm), dtype=np.uint8)
bitpack(A, AU)
A_dev = cuda.to_device(A)
threads_per_block = (TPB, TPB)
C = np.empty((row_slice, A.shape[0]), dtype=np.int32)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(row_slice / threads_per_block[1])
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
for i in range((A.shape[0]+row_slice-1)//row_slice):
    bit_slice_A_AT[blocks_per_grid, threads_per_block](A_dev, C, i*row_slice)
    lower = i*row_slice
    upper = min(lower+row_slice, CU.shape[0])
    width = upper-lower
    test = np.array_equal(C[:width,:], CU[i*row_slice:i*row_slice+width,:])
    print(test)
cuda.synchronize()
C_dev = cuda.device_array_like(C)
start_gpu_shared_memory = time.time()
for i in range((A.shape[0]+row_slice-1)//row_slice):
    bit_slice_A_AT[blocks_per_grid, threads_per_block](A_dev, C_dev, i*row_slice)
cuda.synchronize()
end_gpu_shared_memory = time.time()

time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
$ python t63.py
host mem:  21 MB device mem:  3 MB
True
True
True
GPU time(shared memory):0.010116815567016602
$
Run Code Online (Sandbox Code Playgroud)

这意味着,正如建议的那样,对于问题最新更新中给出的行 = 100k 和列 = 200k 的情况,我们应该能够将 C 矩阵划分为 5k 行的块。A 矩阵的内存使用量为 2.5GB,但对于 C 矩阵,由于我们一次仅计算 5k 行切片,因此所需的设备内存存储将为100k*5k*4字节,因此本例为 2GB。

经过进一步研究,我们可以matmul通过切换数据int8类型来加速主机代码操作float32。这使得该操作速度相当快,但 GPU 代码似乎仍然比这快约 4 倍:

$ cat t64.py
from numba import cuda, uint8, int32
import numba as nb
import numpy as np
import math
import time

TPB = 32
TPB1 = 33

@cuda.jit()
def bit_slice_A_AT(A, C, row_offset):
    sA = cuda.shared.array((TPB, TPB), dtype=uint8)
    sB = cuda.shared.array((TPB, TPB1), dtype=uint8)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    tmp = int32(0)
    for i in range((A.shape[1]+TPB-1) // TPB):
        if y+row_offset < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sA[ty, tx] = A[y+row_offset, i*TPB+tx]
        else:
            sA[ty, tx] = 0
        if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
        else:
            sB[ty, tx] = 0
        cuda.syncthreads()
        for j in range(TPB):
            tmp1 = sA[ty,j] & sB[tx, j]
            test = uint8(1)
            for k in range(8):
                if (tmp1 & test) > 0:
                    tmp += 1
                test <<= 1
        cuda.syncthreads()
    if y < C.shape[0] and x < C.shape[1]:
        C[y, x] = tmp

@nb.njit('void(uint8[:,:],float32[:,:])', parallel=True)
def bitpack(A, AU):
    for i in nb.prange(A.shape[0]):
        for j in range(A.shape[1]):
            offset = j * 8
            res = int(AU[i,offset]) << 7
            res |= int(AU[i,offset+1]) << 6
            res |= int(AU[i,offset+2]) << 5
            res |= int(AU[i,offset+3]) << 4
            res |= int(AU[i,offset+4]) << 3
            res |= int(AU[i,offset+5]) << 2
            res |= int(AU[i,offset+6]) << 1
            res |= int(AU[i,offset+7])
            A[i,j] = res

colm = 1000
rows = 6000
cols = int(colm*8)
row_slice = 512
print('host mem: ', (rows*cols*4+rows*colm+rows*rows*4+rows*row_slice*4)//1048576, 'MB device mem: ', (((rows*cols)//8)+row_slice*rows*4)//1048576, 'MB')
t1 = time.time()
AU = np.random.randint(2,size=(rows, cols),dtype=np.int8)
AU = AU.astype(np.float32)
print("randint:" + str(time.time()-t1))
t1 = time.time()
#CU = np.empty((rows, rows), dtype=np.int32)
CU = np.matmul(AU,AU.T,dtype=np.float32)
print("matmul:" + str(time.time()-t1))
t1 = time.time()
A = np.empty((rows,colm), dtype=np.uint8)
print("np.empty:" + str(time.time()-t1))
t1 = time.time()
bitpack(A, AU)
print("bitpack:" + str(time.time()-t1))
t1 = time.time()
A_dev = cuda.to_device(A)
threads_per_block = (TPB, TPB)
C = np.empty((row_slice, A.shape[0]), dtype=np.int32)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(row_slice / threads_per_block[1])
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
for i in range((A.shape[0]+row_slice-1)//row_slice):
    bit_slice_A_AT[blocks_per_grid, threads_per_block](A_dev, C, i*row_slice)
    lower = i*row_slice
    upper = min(lower+row_slice, CU.shape[0])
    width = upper-lower
    test = np.array_equal(C[:width,:], CU[i*row_slice:i*row_slice+width,:])
    print(test)
cuda.synchronize()
C_dev = cuda.device_array_like(C)
start_gpu_shared_memory = time.time()
for i in range((A.shape[0]+row_slice-1)//row_slice):
    bit_slice_A_AT[blocks_per_grid, threads_per_block](A_dev, C_dev, i*row_slice)
cuda.synchronize()
end_gpu_shared_memory = time.time()

time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
$ python t64.py
host mem:  337 MB device mem:  17 MB
randint:0.1817936897277832
matmul:3.498671531677246
np.empty:7.62939453125e-05
bitpack:0.03707313537597656
True
True
True
True
True
True
True
True
True
True
True
True
GPU time(shared memory):0.8318064212799072
$
Run Code Online (Sandbox Code Playgroud)

我还没有彻底测试这些代码。可能存在错误。使用风险自负。

对于归属,numba 位打包代码似乎来自这里