在给定字节序列开始的流中查找位置的最佳方法

sh0*_*ged 11 c# algorithm bytearray stream find

您如何看待在给定字节序列开始的System.Stream中找到位置的最佳方法是什么(第一次出现):

public static long FindPosition(Stream stream, byte[] byteSequence)
{
    long position = -1;

    /// ???
    return position;
}
Run Code Online (Sandbox Code Playgroud)

PS最简单但最快速的解决方案是优先考虑的.:)

dha*_*rga 5

如果您将流视为另一个字节序列,则可以像进行字符串搜索一样搜索它。 维基百科对此有一篇很棒的文章。Boyer-Moore 是一个很好且简单的算法。

这是我用 Java 编写的一个快速技巧。它有效,而且即使不是 Boyer-Moore,也非常接近。希望能帮助到你 ;)

public static final int BUFFER_SIZE = 32;

public static int [] buildShiftArray(byte [] byteSequence){
    int [] shifts = new int[byteSequence.length];
    int [] ret;
    int shiftCount = 0;
    byte end = byteSequence[byteSequence.length-1];
    int index = byteSequence.length-1;
    int shift = 1;

    while(--index >= 0){
        if(byteSequence[index] == end){
            shifts[shiftCount++] = shift;
            shift = 1;
        } else {
            shift++;
        }
    }
    ret = new int[shiftCount];
    for(int i = 0;i < shiftCount;i++){
        ret[i] = shifts[i];
    }
    return ret;
}

public static byte [] flushBuffer(byte [] buffer, int keepSize){
    byte [] newBuffer = new byte[buffer.length];
    for(int i = 0;i < keepSize;i++){
        newBuffer[i] = buffer[buffer.length - keepSize + i];
    }
    return newBuffer;
}

public static int findBytes(byte [] haystack, int haystackSize, byte [] needle, int [] shiftArray){
    int index = needle.length;
    int searchIndex, needleIndex, currentShiftIndex = 0, shift;
    boolean shiftFlag = false;

    index = needle.length;
    while(true){
        needleIndex = needle.length-1;
        while(true){
            if(index >= haystackSize)
                return -1;
            if(haystack[index] == needle[needleIndex])
                break;
            index++;
        }
        searchIndex = index;
        needleIndex = needle.length-1;
        while(needleIndex >= 0 && haystack[searchIndex] == needle[needleIndex]){
            searchIndex--;
            needleIndex--;
        }
        if(needleIndex < 0)
            return index-needle.length+1;
        if(shiftFlag){
            shiftFlag = false;
            index += shiftArray[0];
            currentShiftIndex = 1;
        } else if(currentShiftIndex >= shiftArray.length){
            shiftFlag = true;
            index++;
        } else{
            index += shiftArray[currentShiftIndex++];
        }           
    }
}

public static int findBytes(InputStream stream, byte [] needle){
    byte [] buffer = new byte[BUFFER_SIZE];
    int [] shiftArray = buildShiftArray(needle);
    int bufferSize, initBufferSize;
    int offset = 0, init = needle.length;
    int val;

    try{
        while(true){
            bufferSize = stream.read(buffer, needle.length-init, buffer.length-needle.length+init);
            if(bufferSize == -1)
                return -1;
            if((val = findBytes(buffer, bufferSize+needle.length-init, needle, shiftArray)) != -1)
                return val+offset;
            buffer = flushBuffer(buffer, needle.length);
            offset += bufferSize-init;
            init = 0;
        }
    } catch (IOException e){
        e.printStackTrace();
    }
    return -1;
}
Run Code Online (Sandbox Code Playgroud)


bru*_*nde 5

我已经达成了这个解决方案.

我做了一些基准测试与是一个ASCII文件3.050 KB38803 lines.与搜索byte array22 bytes在文件的最后一行,我得到了在有关结果2.28秒(在慢/旧机).

public static long FindPosition(Stream stream, byte[] byteSequence)
{
    if (byteSequence.Length > stream.Length)
        return -1;

    byte[] buffer = new byte[byteSequence.Length];

    using (BufferedStream bufStream = new BufferedStream(stream, byteSequence.Length))
    {
        int i;
        while ((i = bufStream.Read(buffer, 0, byteSequence.Length)) == byteSequence.Length)
        {
            if (byteSequence.SequenceEqual(buffer))
                return bufStream.Position - byteSequence.Length;
            else
                bufStream.Position -= byteSequence.Length - PadLeftSequence(buffer, byteSequence);
        }
    }

    return -1;
}

private static int PadLeftSequence(byte[] bytes, byte[] seqBytes)
{
    int i = 1;
    while (i < bytes.Length)
    {
        int n = bytes.Length - i;
        byte[] aux1 = new byte[n];
        byte[] aux2 = new byte[n];
        Array.Copy(bytes, i, aux1, 0, n);
        Array.Copy(seqBytes, aux2, n);
        if (aux1.SequenceEqual(aux2))
            return i;
        i++;
    }
    return i;
}
Run Code Online (Sandbox Code Playgroud)