使用CUDA并行实现多个SVD

Log*_*ang 6 parallel-processing cuda gpu svd

我是使用GPU进行并行编程的新手,所以如果问题广泛或模糊,我会道歉.我知道CULA库中有一些并行的SVD功能,但是如果我有大量相对较小的矩阵来分解,应该采取什么策略呢?例如,我有n尺寸矩阵d,n大而d小.如何并行化这个过程?谁能给我一个提示?

Jac*_*ern 7

我之前的回答现在已经过时了.截至2015年2月,CUDA 7(目前处于候选版本)在其cuSOLVER库中提供完整的SVD功能.下面,我将提供一个使用CUDA cuSOLVER生成奇异值分解的示例.

关于您正在上升的具体问题(计算几个小尺寸矩阵的SVD),您应该使用流来调整我在下面提供的示例.将流关联到您可以使用的每个任务

cudaStreamCreate()
Run Code Online (Sandbox Code Playgroud)

cusolverDnSetStream()
Run Code Online (Sandbox Code Playgroud)

kernel.cu

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include<iostream>
#include<iomanip>
#include<stdlib.h>
#include<stdio.h>
#include<assert.h>
#include<math.h>

#include <cusolverDn.h>
#include <cuda_runtime_api.h>

#include "Utilities.cuh"

/********/
/* MAIN */
/********/
int main(){

    // --- gesvd only supports Nrows >= Ncols
    // --- column major memory ordering

    const int Nrows = 7;
    const int Ncols = 5;

    // --- cuSOLVE input/output parameters/arrays
    int work_size = 0;
    int *devInfo;           gpuErrchk(cudaMalloc(&devInfo,          sizeof(int)));

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle;
    cusolverDnCreate(&solver_handle);

    // --- Setting the host, Nrows x Ncols matrix
    double *h_A = (double *)malloc(Nrows * Ncols * sizeof(double));
    for(int j = 0; j < Nrows; j++)
        for(int i = 0; i < Ncols; i++)
            h_A[j + i*Nrows] = (i + j*j) * sqrt((double)(i + j));

    // --- Setting the device matrix and moving the host matrix to the device
    double *d_A;            gpuErrchk(cudaMalloc(&d_A,      Nrows * Ncols * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_A, h_A, Nrows * Ncols * sizeof(double), cudaMemcpyHostToDevice));

    // --- host side SVD results space
    double *h_U = (double *)malloc(Nrows * Nrows     * sizeof(double));
    double *h_V = (double *)malloc(Ncols * Ncols     * sizeof(double));
    double *h_S = (double *)malloc(min(Nrows, Ncols) * sizeof(double));

    // --- device side SVD workspace and matrices
    double *d_U;            gpuErrchk(cudaMalloc(&d_U,  Nrows * Nrows     * sizeof(double)));
    double *d_V;            gpuErrchk(cudaMalloc(&d_V,  Ncols * Ncols     * sizeof(double)));
    double *d_S;            gpuErrchk(cudaMalloc(&d_S,  min(Nrows, Ncols) * sizeof(double)));

    // --- CUDA SVD initialization
    cusolveSafeCall(cusolverDnDgesvd_bufferSize(solver_handle, Nrows, Ncols, &work_size));
    double *work;   gpuErrchk(cudaMalloc(&work, work_size * sizeof(double)));

    // --- CUDA SVD execution
    cusolveSafeCall(cusolverDnDgesvd(solver_handle, 'A', 'A', Nrows, Ncols, d_A, Nrows, d_S, d_U, Nrows, d_V, Ncols, work, work_size, NULL, devInfo));
    int devInfo_h = 0;  gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h != 0) std::cout   << "Unsuccessful SVD execution\n\n";

    // --- Moving the results from device to host
    gpuErrchk(cudaMemcpy(h_S, d_S, min(Nrows, Ncols) * sizeof(double), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_U, d_U, Nrows * Nrows     * sizeof(double), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_V, d_V, Ncols * Ncols     * sizeof(double), cudaMemcpyDeviceToHost));

    std::cout << "Singular values\n";
    for(int i = 0; i < min(Nrows, Ncols); i++) 
        std::cout << "d_S["<<i<<"] = " << std::setprecision(15) << h_S[i] << std::endl;

    std::cout << "\nLeft singular vectors - For y = A * x, the columns of U span the space of y\n";
    for(int j = 0; j < Nrows; j++) {
        printf("\n");
        for(int i = 0; i < Nrows; i++)
            printf("U[%i,%i]=%f\n",i,j,h_U[j*Nrows + i]);
    }

    std::cout << "\nRight singular vectors - For y = A * x, the columns of V span the space of x\n";
    for(int i = 0; i < Ncols; i++) {
        printf("\n");
        for(int j = 0; j < Ncols; j++)
            printf("V[%i,%i]=%f\n",i,j,h_V[j*Ncols + i]);
    }

    cusolverDnDestroy(solver_handle);

    return 0;

}
Run Code Online (Sandbox Code Playgroud)

Utilities.cuh

#ifndef UTILITIES_CUH
#define UTILITIES_CUH

extern "C" int iDivUp(int, int);
extern "C" void gpuErrchk(cudaError_t);
extern "C" void cusolveSafeCall(cusolverStatus_t);

#endif
Run Code Online (Sandbox Code Playgroud)

Utilities.cu

#include <stdio.h>
#include <assert.h>

#include "cuda_runtime.h"
#include <cuda.h>

#include <cusolverDn.h>

/*******************/
/* iDivUp FUNCTION */
/*******************/
extern "C" int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) { exit(code); }
   }
}

extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }

/**************************/
/* CUSOLVE ERROR CHECKING */
/**************************/
static const char *_cudaGetErrorEnum(cusolverStatus_t error)
{
    switch (error)
    {
        case CUSOLVER_STATUS_SUCCESS:
            return "CUSOLVER_SUCCESS";

        case CUSOLVER_STATUS_NOT_INITIALIZED:
            return "CUSOLVER_STATUS_NOT_INITIALIZED";

        case CUSOLVER_STATUS_ALLOC_FAILED:
            return "CUSOLVER_STATUS_ALLOC_FAILED";

        case CUSOLVER_STATUS_INVALID_VALUE:
            return "CUSOLVER_STATUS_INVALID_VALUE";

        case CUSOLVER_STATUS_ARCH_MISMATCH:
            return "CUSOLVER_STATUS_ARCH_MISMATCH";

        case CUSOLVER_STATUS_EXECUTION_FAILED:
            return "CUSOLVER_STATUS_EXECUTION_FAILED";

        case CUSOLVER_STATUS_INTERNAL_ERROR:
            return "CUSOLVER_STATUS_INTERNAL_ERROR";

        case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
            return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";

    }

    return "<unknown>";
}

inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
    if(CUSOLVER_STATUS_SUCCESS != err) {
        fprintf(stderr, "CUSOLVE error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
                                _cudaGetErrorEnum(err)); \
        cudaDeviceReset(); assert(0); \
    }
}

extern "C" void cusolveSafeCall(cusolverStatus_t err) { __cusolveSafeCall(err, __FILE__, __LINE__); }
Run Code Online (Sandbox Code Playgroud)


Jac*_*ern 4

您可以查看CULA 博客的批量操作帖子来讨论您的问题。

编辑

根据我从下面的评论中了解到的,您希望每个线程计算一个单独的 SVD。因此,基本上每个线程都应该执行标准的、顺序的 SVD 方案。为此,一些可能有用的参考:

数字食谱

Golub、Van Loan、矩阵计算

但是,如果您使用这种方法,恐怕您将无法再使用 cuBLAS,因为这些函数host无法从 中调用device(除非您没有计算能力>3.5,请参阅示例simpleDevLibCUBLAS。)。但基本上通过这种方式,我认为您正在以某种方式自己实现批处理概念。

如果您决定采用更标准的并行 GPU 实现,下面的参考可能会感兴趣:

使用 CUDA 在 GPU 上进行奇异值分解