用于Win64 Release Target的Delphi 10.2下的FillChar和StringOfChar

Max*_*tin 14 windows delphi x86-64

我对Delphi 10.2 Pascal编程语言中的特定编程问题有疑问.

StringOfChar和FillChar在2012年之前发布的CPU上的Win64 Release版本下无法正常工作.

  • FillChar的预期结果只是在给定的内存缓冲区中重复8位字符的简单序列.

  • StringOfChar的预期结果是相同的,但结果存储在字符串类型中.

但实际上,当我在10.2版本的Delphi中编译我们在10.2之前的Delphi中运行的应用程序时,我们为Win64编译的应用程序在2012年之前发布的CPU上停止正常工作.

StringOfChar和FillChar不能正常工作 - 它们返回一串不同的字符,虽然是重复的模式 - 而不仅仅是它们应该具有相同字符的序列.

这是足以证明问题的最小代码.请注意,序列的长度应至少为16个字符,并且字符不应为nul(#0).代码如下:

procedure TestStringOfChar;
var
  a: AnsiString;
  ac: AnsiChar;
begin
  ac := #1;
  a := StringOfChar(ac, 43);
  if a <> #1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1#1 then
  begin
    raise Exception.Create('ANSI StringOfChar Failed!!');
  end;
end;
Run Code Online (Sandbox Code Playgroud)

我知道StackOverflow上有很多Delphi程序员.你遇到同样的问题吗?如果是,您如何解决?解决办法是什么?顺便说一句,我已经联系了Delphi的开发人员,但到目前为止他们还没有确认也没有否认这个问题.我正在使用Embarcadero Delphi 10.2版本25.0.26309.314.

更新:

如果您的CPU是在2012年或之后生产的,则在调用StringOfChar之前还要包含以下行以重现该问题:

const
  ERMSBBit    = 1 shl 9; //$0200
begin
  CPUIDTable[7].EBX := CPUIDTable[7].EBX and not ERMSBBit;
Run Code Online (Sandbox Code Playgroud)

至于2017年4月的RAD Studio 10.2工具链问题修补程序 - 尝试过它而没有它 - 它没有帮助.无论Hotfix如何,问题都存在.

Joh*_*ica 11

StringOfChar(A: AnsiChar, count) 在引擎盖下使用FillChar.

您可以使用以下代码来解决此问题:

(*******************************************************
 System.FastSystem
 A fast drop-in addition to speed up function in system.pas
 It should compile and run in XE2 and beyond.
 Alpha version 0.5, fully tested in Win64
 (c) Copyright 2016 J. Bontes
   This Source Code Form is subject to the terms of the
   Mozilla Public License, v. 2.0.
   If a copy of the MPL was not distributed with this file,
   You can obtain one at http://mozilla.org/MPL/2.0/.
********************************************************
FillChar code is an altered version FillCharsse2 SynCommons.pas
which is part of Synopse framework by Arnaud Bouchez
********************************************************
Changelog
0.5 Initial version:
********************************************************)

unit FastSystem;

interface

procedure FillChar(var Dest; Count: NativeInt; Value: ansichar); inline; overload;
procedure FillChar(var Dest; Count: NativeInt; Value: Byte); overload;
procedure FillMemory(Destination: Pointer; Length: NativeUInt; Fill: Byte); inline;
{$EXTERNALSYM FillMemory}
procedure ZeroMemory(Destination: Pointer; Length: NativeUInt); inline;
{$EXTERNALSYM ZeroMemory}

implementation

procedure FillChar(var Dest; Count: NativeInt; Value: ansichar); inline; overload;
begin
  FillChar(Dest, Count, byte(Value));
end;

procedure FillMemory(Destination: Pointer; Length: NativeUInt; Fill: Byte);
begin
  FillChar(Destination^, Length, Fill);
end;

procedure ZeroMemory(Destination: Pointer; Length: NativeUInt); inline;
begin
  FillChar(Destination^, Length, 0);
end;

//This code is 3x faster than System.FillChar on x64.

{$ifdef CPUX64}
procedure FillChar(var Dest; Count: NativeInt; Value: Byte);
//rcx = dest
//rdx=count
//r8b=value
asm
              .noframe
              .align 16
              movzx r8,r8b           //There's no need to optimize for count <= 3
              mov rax,$0101010101010101
              mov r9d,edx
              imul rax,r8            //fill rax with value.
              cmp rdx,59             //Use simple code for small blocks.
              jl  @Below32
@Above32:     mov r11,rcx
              mov r8b,7              //code shrink to help alignment.
              lea r9,[rcx+rdx]       //r9=end of array
              sub rdx,8
              rep mov [rcx],rax
              add rcx,8
              and r11,r8             //and 7 See if dest is aligned
              jz @tail
@NotAligned:  xor rcx,r11            //align dest
              lea rdx,[rdx+r11]
@tail:        test r9,r8             //and 7 is tail aligned?
              jz @alignOK
@tailwrite:   mov [r9-8],rax         //no, we need to do a tail write
              and r9,r8              //and 7
              sub rdx,r9             //dec(count, tailcount)
@alignOK:     mov r10,rdx
              and edx,(32+16+8)      //count the partial iterations of the loop
              mov r8b,64             //code shrink to help alignment.
              mov r9,rdx
              jz @Initloop64
@partialloop: shr r9,1              //every instruction is 4 bytes
              lea r11,[rip + @partial +(4*7)] //start at the end of the loop
              sub r11,r9            //step back as needed
              add rcx,rdx            //add the partial loop count to dest
              cmp r10,r8             //do we need to do more loops?
              jmp r11                //do a partial loop
@Initloop64:  shr r10,6              //any work left?
              jz @done               //no, return
              mov rdx,r10
              shr r10,(19-6)         //use non-temporal move for > 512kb
              jnz @InitFillHuge
@Doloop64:    add rcx,r8
              dec edx
              mov [rcx-64+00H],rax
              mov [rcx-64+08H],rax
              mov [rcx-64+10H],rax
              mov [rcx-64+18H],rax
              mov [rcx-64+20H],rax
              mov [rcx-64+28H],rax
              mov [rcx-64+30H],rax
              mov [rcx-64+38H],rax
              jnz @DoLoop64
@done:        rep ret
              //db $66,$66,$0f,$1f,$44,$00,$00 //nop7
@partial:     mov [rcx-64+08H],rax
              mov [rcx-64+10H],rax
              mov [rcx-64+18H],rax
              mov [rcx-64+20H],rax
              mov [rcx-64+28H],rax
              mov [rcx-64+30H],rax
              mov [rcx-64+38H],rax
              jge @Initloop64        //are we done with all loops?
              rep ret
              db $0F,$1F,$40,$00
@InitFillHuge:
@FillHuge:    add rcx,r8
              dec rdx
              db $48,$0F,$C3,$41,$C0 // movnti  [rcx-64+00H],rax
              db $48,$0F,$C3,$41,$C8 // movnti  [rcx-64+08H],rax
              db $48,$0F,$C3,$41,$D0 // movnti  [rcx-64+10H],rax
              db $48,$0F,$C3,$41,$D8 // movnti  [rcx-64+18H],rax
              db $48,$0F,$C3,$41,$E0 // movnti  [rcx-64+20H],rax
              db $48,$0F,$C3,$41,$E8 // movnti  [rcx-64+28H],rax
              db $48,$0F,$C3,$41,$F0 // movnti  [rcx-64+30H],rax
              db $48,$0F,$C3,$41,$F8 // movnti  [rcx-64+38H],rax
              jnz @FillHuge
@donefillhuge:mfence
              rep ret
              db $0F,$1F,$44,$00,$00  //db $0F,$1F,$40,$00
@Below32:     and  r9d,not(3)
              jz @SizeIs3
@FillTail:    sub   edx,4
              lea   r10,[rip + @SmallFill + (15*4)]
              sub   r10,r9
              jmp   r10
@SmallFill:   rep mov [rcx+56], eax
              rep mov [rcx+52], eax
              rep mov [rcx+48], eax
              rep mov [rcx+44], eax
              rep mov [rcx+40], eax
              rep mov [rcx+36], eax
              rep mov [rcx+32], eax
              rep mov [rcx+28], eax
              rep mov [rcx+24], eax
              rep mov [rcx+20], eax
              rep mov [rcx+16], eax
              rep mov [rcx+12], eax
              rep mov [rcx+08], eax
              rep mov [rcx+04], eax
              mov [rcx],eax
@Fallthough:  mov [rcx+rdx],eax  //unaligned write to fix up tail
              rep ret

@SizeIs3:     shl edx,2           //r9 <= 3  r9*4
              lea r10,[rip + @do3 + (4*3)]
              sub r10,rdx
              jmp r10
@do3:         rep mov [rcx+2],al
@do2:         mov [rcx],ax
              ret
@do1:         mov [rcx],al
              rep ret
@do0:         rep ret
end;
{$endif}
Run Code Online (Sandbox Code Playgroud)

解决问题的最简单方法是下载Mormot并将SynCommon.pas包含到您的项目中.这将修补System.FillChar到上面的代码,并包括其他一些性能改进.

请注意,您不需要所有Mormot,只需要SynCommons.

  • 好的,我查了一下.这比新的System.FillChar快.大约2-3倍的速度. (2认同)
  • 再次感谢Johan,他将您的x64 asm代码贡献给了SynCommons.pas! (2认同)