使用快速英特尔随机生成器(SSE2)失败,堆栈周围...已损坏

Bra*_* Ds 5 c++ random sse simd

我需要非常快(最快)的随机发生器.我从英特尔发现了这个:快速英特尔随机数发生器

看起来不错.所以我在MS Visual Studio 2013上创建了项目:

//FastRandom.h:
#pragma once
#include "emmintrin.h"
#include <time.h>
//define this if you wish to return values similar to the standard rand();
#define COMPATABILITY

namespace Brans
{
        __declspec(align(16)) static __m128i cur_seed;

        // uncoment this if you are using intel compiler
        // for MS CL the vectorizer is on by default and jumps in if you
        // compile with /O2 ...
        //#pragma intel optimization_parameter target_arch=avx
        //__declspec(cpu_dispatch(core_2nd_gen_avx, core_i7_sse4_2, core_2_duo_ssse3, generic )
        inline void rand_sse(unsigned int* result)
        {
            __declspec(align(16)) __m128i cur_seed_split;

            __declspec(align(16)) __m128i multiplier;

            __declspec(align(16)) __m128i adder;

            __declspec(align(16)) __m128i mod_mask;

            __declspec(align(16)) __m128i sra_mask;

            __declspec(align(16)) __m128i sseresult;

            __declspec(align(16)) static const unsigned int mult[4] =

            { 214013, 17405, 214013, 69069 };

            __declspec(align(16)) static const unsigned int gadd[4] =

            { 2531011, 10395331, 13737667, 1 };

            __declspec(align(16)) static const unsigned int mask[4] =

            { 0xFFFFFFFF, 0, 0xFFFFFFFF, 0 };

            __declspec(align(16)) static const unsigned int masklo[4] =

            { 0x00007FFF, 0x00007FFF, 0x00007FFF, 0x00007FFF };



            adder = _mm_load_si128((__m128i*) gadd);

            multiplier = _mm_load_si128((__m128i*) mult);

            mod_mask = _mm_load_si128((__m128i*) mask);

            sra_mask = _mm_load_si128((__m128i*) masklo);

            cur_seed_split = _mm_shuffle_epi32(cur_seed, _MM_SHUFFLE(2, 3, 0, 1));



            cur_seed = _mm_mul_epu32(cur_seed, multiplier);

            multiplier = _mm_shuffle_epi32(multiplier, _MM_SHUFFLE(2, 3, 0, 1));

            cur_seed_split = _mm_mul_epu32(cur_seed_split, multiplier);


            cur_seed = _mm_and_si128(cur_seed, mod_mask);

            cur_seed_split = _mm_and_si128(cur_seed_split, mod_mask);

            cur_seed_split = _mm_shuffle_epi32(cur_seed_split, _MM_SHUFFLE(2, 3, 0, 1));

            cur_seed = _mm_or_si128(cur_seed, cur_seed_split);

            cur_seed = _mm_add_epi32(cur_seed, adder);


#ifdef COMPATABILITY



            // Add the lines below if you wish to reduce your results to 16-bit vals...

            sseresult = _mm_srai_epi32(cur_seed, 16);

            sseresult = _mm_and_si128(sseresult, sra_mask);

            _mm_storeu_si128((__m128i*) result, sseresult);

            return;

#endif


            _mm_storeu_si128((__m128i*) result, cur_seed);

            return;
        }

        inline void srand_sse(unsigned int seed)
        {
            cur_seed = _mm_set_epi32(seed, seed + 1, seed, seed + 1);
        }

        inline void srand_sse()
        {
            unsigned int seed = (unsigned int)time(0);
            cur_seed = _mm_set_epi32(seed, seed + 1, seed, seed + 1);
        }

        inline unsigned int GetRandom(unsigned int low, unsigned int high)
        {
            unsigned int ret = 0;
            rand_sse(&ret);
            return ret % (high - low + 1) + low;
        }

    };


// Test.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"
#include "FastRandom.h"
#include <iostream>

using namespace Brans;

int _tmain(int argc, _TCHAR* argv[])
{
    srand_sse();
    unsigned int result = 0;
    for (size_t i = 0; i < 10000; i++)
    {
        result += GetRandom(1, 50);
        result -= GetRandom(1, 50);
    }

    std::cout << result << std::endl;
    return 0;
}
Run Code Online (Sandbox Code Playgroud)

我期望0结果+ - 50.但是当我在Debug中运行程序时,我得到: 运行时检查失败#2 - 变量'ret'周围的堆栈已损坏.在GetRandom(...).当我在发布中运行它时,我得到了未定义的结果,最多为max unsigned int.(我使用的是intel i5处理器).

怎么了?

=========添加到接受的答案,我也错了,我应该使用long而不是unsigned int,因为否定结果对unsigned来说变大.

Iva*_*nko 5

来自英特尔快速随机生成器的文档:

rand_sse()函数实现了此fast_rand()函数的矢量化版本,其中整数数学运算使用SIMD架构以四个方式完成.

这意味着rand_sse使用sse2一次生成4个随机数.

所以你需要给它的数组unsigned int:

unsigned int result[4];
rand_sse( result );
Run Code Online (Sandbox Code Playgroud)