sec*_*ive -3 cuda matrix-multiplication pycuda numba
我需要将矩阵与其转置相乘,但我的 GPU 内存不足并出现错误消息numba.cuda.cudadrv.driver.CudaAPIError: [2] Call to cuMemAlloc results in CUDA_ERROR_OUT_OF_MEMORY
我预计矩阵的大小约为 10k 行和 100k 列,因此将其与其 trnspose 相乘将得到 10k 行和 10k 列的方阵的结果。矩阵只包含0和1。
这是我正在运行的脚本。
from numba import cuda, uint16
import numba
import numpy
import math
import time
TPB = 16
@cuda.jit()
def matmul_shared_mem(A, B, C):
sA = cuda.shared.array((TPB, TPB), dtype=uint16)
sB = cuda.shared.array((TPB, TPB), dtype=uint16)
x, y = cuda.grid(2)
tx = cuda.threadIdx.x
ty = cuda.threadIdx.y
if x >= C.shape[0] and y >= C.shape[1]:
return
tmp = 0.
for i in range(int(A.shape[1] / TPB)):
sA[tx, ty] = A[x, ty + i * TPB]
sB[tx, ty] = B[tx + i * TPB, y]
cuda.syncthreads()
for j in range(TPB):
tmp += sA[tx, j] * sB[j, ty]
cuda.syncthreads()
C[x, y] = tmp
A = numpy.random.randint(2, size=(TPB * 625, 50000))
B = A.transpose()
C_shared_mem = cuda.device_array((A.shape[0], B.shape[1]))
threads_per_block = (TPB, TPB)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(math.ceil(B.shape[1] / threads_per_block[1]))
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
start_gpu_shared_memory = time.time()
matmul_shared_mem[blocks_per_grid, threads_per_block](A, B, C_shared_mem)
cuda.synchronize()
end_gpu_shared_memory = time.time()
time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
Run Code Online (Sandbox Code Playgroud)
更新1:
根据您的建议,我做了一些更改,但我的内存仍然不足。
import numpy as np
import numba as nb
colm = int(200000/8)
rows = 100000
cols = int(colm*8)
AU = np.random.randint(2,size=(rows, cols),dtype=np.int8)
A = np.empty((rows,colm), dtype=np.uint8)
@nb.njit('void(uint8[:,:],int8[:,:])', parallel=True)
def compute(A, AU):
for i in nb.prange(A.shape[0]):
for j in range(A.shape[1]):
offset = j * 8
res = AU[i,offset] << 7
res |= AU[i,offset+1] << 6
res |= AU[i,offset+2] << 5
res |= AU[i,offset+3] << 4
res |= AU[i,offset+4] << 3
res |= AU[i,offset+5] << 2
res |= AU[i,offset+6] << 1
res |= AU[i,offset+7]
A[i,j] = res
compute(A, AU)
from numba import cuda, uint8, int32
import numba
import numpy as np
import math
import time
TPB = 8
TPB1 = 9
@cuda.jit()
def bit_A_AT(A, C):
sA = cuda.shared.array((TPB, TPB), dtype=uint8)
sB = cuda.shared.array((TPB, TPB1), dtype=uint8)
x, y = cuda.grid(2)
tx = cuda.threadIdx.x
ty = cuda.threadIdx.y
bx = cuda.blockIdx.x
by = cuda.blockIdx.y
if bx >= by:
tmp = int32(0)
for i in range((A.shape[1]+TPB-1) // TPB):
if y < A.shape[0] and (i*TPB+tx) < A.shape[1]:
sA[ty, tx] = A[y, i*TPB+tx]
else:
sA[ty, tx] = 0
if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
else:
sB[ty, tx] = 0
cuda.syncthreads()
for j in range(TPB):
tmp1 = sA[ty,j] & sB[tx, j]
test = uint8(1)
for k in range(8):
if (tmp1 & test) > 0:
tmp += 1
test <<= 1
cuda.syncthreads()
if y < C.shape[0] and x < C.shape[1]:
C[y, x] = tmp
C = np.empty((A.shape[0], A.shape[0]), dtype=np.int32)
threads_per_block = (TPB, TPB)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(math.ceil(A.shape[0] / threads_per_block[1]))
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
start_gpu_shared_memory = time.time()
bit_A_AT[blocks_per_grid, threads_per_block](A, C)
cuda.synchronize()
end_gpu_shared_memory = time.time()
time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
Run Code Online (Sandbox Code Playgroud)
知道如何解决这个问题吗?
以下方法应减少计算 A x AT 所需的设备内存量。我们将使用以下想法:
A) 仅采用 0,1 值,因此我们将该数组的存储空间减少到最小方便大小 ,int8即每个元素一个字节B数组只是A数组的转置,因此不需要显式处理它。我们可以从A数组中导出它,有点类似于这里,尽管它正在执行 AT x AsB我们将使用调整后的索引在数组中提供 A 转置版本syncthreads并修改了代码以允许行和列尺寸的任意值这是一个有效的例子:
$ cat t62.py
from numba import cuda, int32, int8
import numba
import numpy as np
import math
import time
TPB = 32
TPB1 = TPB+1
@cuda.jit()
def byte_A_AT(A, C):
sA = cuda.shared.array((TPB, TPB), dtype=int8)
sB = cuda.shared.array((TPB, TPB1), dtype=int8)
x, y = cuda.grid(2)
tx = cuda.threadIdx.x
ty = cuda.threadIdx.y
bx = cuda.blockIdx.x
by = cuda.blockIdx.y
# uncomment and indent remainder of kernel to only do the "symmetric half" of calculation
# if bx >= by:
tmp = int32(0)
for i in range((A.shape[1]+TPB-1)// TPB):
if y < A.shape[0] and (i*TPB+tx) < A.shape[1]:
sA[ty, tx] = A[y, i*TPB+tx]
else:
sA[ty, tx] = 0
if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
else:
sB[ty, tx] = 0
cuda.syncthreads()
for j in range(TPB):
tmp += int32(sA[ty,j]) * int32(sB[tx, j])
cuda.syncthreads()
if y < C.shape[0] and x < C.shape[1]:
C[y, x] = tmp
rows = 1041
cols = 1043
print('host mem: ', (rows*cols*2+rows*rows*4*2)//1048576, 'MB device mem: ', (rows*cols+rows*rows*4)//1048576, 'MB')
A = np.random.randint(2,size=(rows, cols),dtype=np.int8)
AT = A.transpose()
CU = np.matmul(A,AT, dtype = np.int32)
C = np.empty((A.shape[0], A.shape[0]), dtype=np.int32)
threads_per_block = (TPB, TPB)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(math.ceil(A.shape[0] / threads_per_block[1]))
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
byte_A_AT[blocks_per_grid, threads_per_block](A, C)
cuda.synchronize()
start_gpu_shared_memory = time.time()
byte_A_AT[blocks_per_grid, threads_per_block](A, C)
cuda.synchronize()
end_gpu_shared_memory = time.time()
time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
test = np.array_equal(C, CU)
print(test)
if test == False:
for i in range(C.shape[0]):
for j in range(C.shape[1]):
if C[i,j] != CU[i,j]:
print(i, ' ' , j ,' ' , C[i,j] , ' ' , CU[i,j])
$ python t62.py
host mem: 10 MB device mem: 5 MB
GPU time(shared memory):0.019593000411987305
True
$
Run Code Online (Sandbox Code Playgroud)
笔记:
np.matmul(),这实际上只是用于验证结果,实际实现中应该没有必要),而不是在GPU部分。随着矩阵变大,代码运行速度会变慢。np.array_equal测试失败。A,这将进一步减少内存需求,但是编写此代码来处理任意列维度(相对于 8 的倍数)的情况被证明是相当混乱的。然而,对于 30k 行和 200k 列的情况,该代码可以将总设备内存消耗降至约 5GB。这是每元素一位版本,它要求列数是能被 8 整除的整数:
$ cat t61.py
from numba import cuda, uint8, int32
import numba
import numpy as np
import math
import time
TPB = 32
TPB1 = 33
@cuda.jit()
def bit_A_AT(A, C):
sA = cuda.shared.array((TPB, TPB), dtype=uint8)
sB = cuda.shared.array((TPB, TPB1), dtype=uint8)
x, y = cuda.grid(2)
tx = cuda.threadIdx.x
ty = cuda.threadIdx.y
bx = cuda.blockIdx.x
by = cuda.blockIdx.y
tmp = int32(0)
for i in range((A.shape[1]+TPB-1) // TPB):
if y < A.shape[0] and (i*TPB+tx) < A.shape[1]:
sA[ty, tx] = A[y, i*TPB+tx]
else:
sA[ty, tx] = 0
if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
else:
sB[ty, tx] = 0
cuda.syncthreads()
for j in range(TPB):
tmp1 = sA[ty,j] & sB[tx, j]
test = uint8(1)
for k in range(8):
if (tmp1 & test) > 0:
tmp += 1
test <<= 1
cuda.syncthreads()
if y < C.shape[0] and x < C.shape[1]:
C[y, x] = tmp
colm = 131
rows = 1041
cols = int(colm*8)
print('host mem: ', (rows*cols*2+rows*rows*4*2)//1048576, 'MB device mem: ', (((rows*cols)//8)+rows*rows*4)//1048576, 'MB')
AU = np.random.randint(2,size=(rows, cols),dtype=np.int8)
AUT = AU.transpose()
CU = np.matmul(AU,AUT,dtype=np.int32)
A = np.empty((rows,colm), dtype=np.uint8)
for i in range(A.shape[0]):
for j in range(A.shape[1]):
A[i,j] = 0
for k in range(8):
if AU[i,(j*8)+k] == 1:
A[i,j] = A[i,j] | (1<<(7-k))
C = np.empty((A.shape[0], A.shape[0]), dtype=np.int32)
threads_per_block = (TPB, TPB)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(math.ceil(A.shape[0] / threads_per_block[1]))
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
bit_A_AT[blocks_per_grid, threads_per_block](A, C)
cuda.synchronize()
start_gpu_shared_memory = time.time()
bit_A_AT[blocks_per_grid, threads_per_block](A, C)
cuda.synchronize()
end_gpu_shared_memory = time.time()
time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
test = np.array_equal(C, CU)
print(test)
if test == False:
for i in range(C.shape[0]):
for j in range(C.shape[1]):
if C[i,j] != CU[i,j]:
print(i, ' ' , j ,' ' , C[i,j] , ' ' , CU[i,j])
break
$ python t61.py
host mem: 10 MB device mem: 4 MB
GPU time(shared memory):0.009343624114990234
True
$
Run Code Online (Sandbox Code Playgroud)
编辑:回应评论、更新中的一些问题,现在考虑到 A 矩阵可能有明显超过 30k 行,这也会导致 C 矩阵增加。如果A矩阵可以放入GPU内存中,我们可以通过分段计算来减少C矩阵的内存需求。这些片段将是一起计算的一组行,我将其称为row_sliceC 矩阵的 a。以下代码演示了只需对上述代码进行相对较小的更改即可实现此目的:
$ cat t63.py
from numba import cuda, uint8, int32
import numba as nb
import numpy as np
import math
import time
TPB = 32
TPB1 = 33
@cuda.jit()
def bit_slice_A_AT(A, C, row_offset):
sA = cuda.shared.array((TPB, TPB), dtype=uint8)
sB = cuda.shared.array((TPB, TPB1), dtype=uint8)
x, y = cuda.grid(2)
tx = cuda.threadIdx.x
ty = cuda.threadIdx.y
bx = cuda.blockIdx.x
by = cuda.blockIdx.y
tmp = int32(0)
for i in range((A.shape[1]+TPB-1) // TPB):
if y+row_offset < A.shape[0] and (i*TPB+tx) < A.shape[1]:
sA[ty, tx] = A[y+row_offset, i*TPB+tx]
else:
sA[ty, tx] = 0
if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
else:
sB[ty, tx] = 0
cuda.syncthreads()
for j in range(TPB):
tmp1 = sA[ty,j] & sB[tx, j]
test = uint8(1)
for k in range(8):
if (tmp1 & test) > 0:
tmp += 1
test <<= 1
cuda.syncthreads()
if y < C.shape[0] and x < C.shape[1]:
C[y, x] = tmp
@nb.njit('void(uint8[:,:],int8[:,:])', parallel=True)
def bitpack(A, AU):
for i in nb.prange(A.shape[0]):
for j in range(A.shape[1]):
offset = j * 8
res = AU[i,offset] << 7
res |= AU[i,offset+1] << 6
res |= AU[i,offset+2] << 5
res |= AU[i,offset+3] << 4
res |= AU[i,offset+4] << 3
res |= AU[i,offset+5] << 2
res |= AU[i,offset+6] << 1
res |= AU[i,offset+7]
A[i,j] = res
colm = 131
rows = 1535
cols = int(colm*8)
row_slice = 512
print('host mem: ', (rows*cols*2+rows*rows*4*2)//1048576, 'MB device mem: ', (((rows*cols)//8)+row_slice*rows*4)//1048576, 'MB')
AU = np.random.randint(2,size=(rows, cols),dtype=np.int8)
CU = np.matmul(AU,AU.T,dtype=np.int32)
A = np.empty((rows,colm), dtype=np.uint8)
bitpack(A, AU)
A_dev = cuda.to_device(A)
threads_per_block = (TPB, TPB)
C = np.empty((row_slice, A.shape[0]), dtype=np.int32)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(row_slice / threads_per_block[1])
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
for i in range((A.shape[0]+row_slice-1)//row_slice):
bit_slice_A_AT[blocks_per_grid, threads_per_block](A_dev, C, i*row_slice)
lower = i*row_slice
upper = min(lower+row_slice, CU.shape[0])
width = upper-lower
test = np.array_equal(C[:width,:], CU[i*row_slice:i*row_slice+width,:])
print(test)
cuda.synchronize()
C_dev = cuda.device_array_like(C)
start_gpu_shared_memory = time.time()
for i in range((A.shape[0]+row_slice-1)//row_slice):
bit_slice_A_AT[blocks_per_grid, threads_per_block](A_dev, C_dev, i*row_slice)
cuda.synchronize()
end_gpu_shared_memory = time.time()
time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
$ python t63.py
host mem: 21 MB device mem: 3 MB
True
True
True
GPU time(shared memory):0.010116815567016602
$
Run Code Online (Sandbox Code Playgroud)
这意味着,正如建议的那样,对于问题最新更新中给出的行 = 100k 和列 = 200k 的情况,我们应该能够将 C 矩阵划分为 5k 行的块。A 矩阵的内存使用量为 2.5GB,但对于 C 矩阵,由于我们一次仅计算 5k 行切片,因此所需的设备内存存储将为100k*5k*4字节,因此本例为 2GB。
经过进一步研究,我们可以matmul通过切换数据int8类型来加速主机代码操作float32。这使得该操作速度相当快,但 GPU 代码似乎仍然比这快约 4 倍:
$ cat t64.py
from numba import cuda, uint8, int32
import numba as nb
import numpy as np
import math
import time
TPB = 32
TPB1 = 33
@cuda.jit()
def bit_slice_A_AT(A, C, row_offset):
sA = cuda.shared.array((TPB, TPB), dtype=uint8)
sB = cuda.shared.array((TPB, TPB1), dtype=uint8)
x, y = cuda.grid(2)
tx = cuda.threadIdx.x
ty = cuda.threadIdx.y
bx = cuda.blockIdx.x
by = cuda.blockIdx.y
tmp = int32(0)
for i in range((A.shape[1]+TPB-1) // TPB):
if y+row_offset < A.shape[0] and (i*TPB+tx) < A.shape[1]:
sA[ty, tx] = A[y+row_offset, i*TPB+tx]
else:
sA[ty, tx] = 0
if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
sB[ty, tx] = A[TPB*bx+ty, i*TPB+tx]
else:
sB[ty, tx] = 0
cuda.syncthreads()
for j in range(TPB):
tmp1 = sA[ty,j] & sB[tx, j]
test = uint8(1)
for k in range(8):
if (tmp1 & test) > 0:
tmp += 1
test <<= 1
cuda.syncthreads()
if y < C.shape[0] and x < C.shape[1]:
C[y, x] = tmp
@nb.njit('void(uint8[:,:],float32[:,:])', parallel=True)
def bitpack(A, AU):
for i in nb.prange(A.shape[0]):
for j in range(A.shape[1]):
offset = j * 8
res = int(AU[i,offset]) << 7
res |= int(AU[i,offset+1]) << 6
res |= int(AU[i,offset+2]) << 5
res |= int(AU[i,offset+3]) << 4
res |= int(AU[i,offset+4]) << 3
res |= int(AU[i,offset+5]) << 2
res |= int(AU[i,offset+6]) << 1
res |= int(AU[i,offset+7])
A[i,j] = res
colm = 1000
rows = 6000
cols = int(colm*8)
row_slice = 512
print('host mem: ', (rows*cols*4+rows*colm+rows*rows*4+rows*row_slice*4)//1048576, 'MB device mem: ', (((rows*cols)//8)+row_slice*rows*4)//1048576, 'MB')
t1 = time.time()
AU = np.random.randint(2,size=(rows, cols),dtype=np.int8)
AU = AU.astype(np.float32)
print("randint:" + str(time.time()-t1))
t1 = time.time()
#CU = np.empty((rows, rows), dtype=np.int32)
CU = np.matmul(AU,AU.T,dtype=np.float32)
print("matmul:" + str(time.time()-t1))
t1 = time.time()
A = np.empty((rows,colm), dtype=np.uint8)
print("np.empty:" + str(time.time()-t1))
t1 = time.time()
bitpack(A, AU)
print("bitpack:" + str(time.time()-t1))
t1 = time.time()
A_dev = cuda.to_device(A)
threads_per_block = (TPB, TPB)
C = np.empty((row_slice, A.shape[0]), dtype=np.int32)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(row_slice / threads_per_block[1])
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
for i in range((A.shape[0]+row_slice-1)//row_slice):
bit_slice_A_AT[blocks_per_grid, threads_per_block](A_dev, C, i*row_slice)
lower = i*row_slice
upper = min(lower+row_slice, CU.shape[0])
width = upper-lower
test = np.array_equal(C[:width,:], CU[i*row_slice:i*row_slice+width,:])
print(test)
cuda.synchronize()
C_dev = cuda.device_array_like(C)
start_gpu_shared_memory = time.time()
for i in range((A.shape[0]+row_slice-1)//row_slice):
bit_slice_A_AT[blocks_per_grid, threads_per_block](A_dev, C_dev, i*row_slice)
cuda.synchronize()
end_gpu_shared_memory = time.time()
time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
$ python t64.py
host mem: 337 MB device mem: 17 MB
randint:0.1817936897277832
matmul:3.498671531677246
np.empty:7.62939453125e-05
bitpack:0.03707313537597656
True
True
True
True
True
True
True
True
True
True
True
True
GPU time(shared memory):0.8318064212799072
$
Run Code Online (Sandbox Code Playgroud)
我还没有彻底测试这些代码。可能存在错误。使用风险自负。
对于归属,numba 位打包代码似乎来自这里。
| 归档时间: |
|
| 查看次数: |
1542 次 |
| 最近记录: |