unit FastMove;
(*
Copyright (c) 2006, John O'Harrow (john@elmcrest.demon.co.uk)
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the
use of this software.
Permission is granted to anyone to use this software for any purpose, including
commercial applications, and to alter it and redistribute it freely, subject to
the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim
that you wrote the original software. If you use this software in a product,
an acknowledgment in the product documentation would be appreciated but is
not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
-------------------------------------------------------------------------------
Version: 3.03 : 06-MAR-2006
How to use:
Include this unit in any uses clause - That's It!
What it Does:
This unit replaces all calls to system.move with calls to a faster move
procedure (Up to 3 times faster). The code will automatically detect and
use MMX, SSE, SSE2 or SSE3 where available.
Version Change
------- ------
2.00 Updated all Moves to use Move_JOH_XXX_6 (better code alignment).
2.10 VirtualProtect modified to use PAGE_EXECUTE_READWRITE to prevent
error under WinXP SP2 with AMD64 running DEP.
2.20 Added Check for "Already Patched" to prevent error when using
Packages in both DLL's and EXE.
2.30 PrefetchLimit initialization moved outside of ReplaceMove.
3.00 Updated all Moves to use latest Move_JOH_XXX_10 procedures and added
SSE2 and SSE3 optimized moves.
Extra VirtualProtect removed. PatchSystemMove Directive added.
Local Move Procedure added for direct calling of fastest Move.
3.01 {$D-} Added to prevent stepping into the Move BASM Code
Badly Named PrefetchLimit renamed CacheLimit
3.02 Incorrect DB equivalents of lddqu instruction corrected
3.03 Compiler Directives for D2006 Added
Contact Email address corrected
*)
{.$D-} {Prevent Stepping into Move Code}
{$DEFINE PatchSystemMove} {Enable Patching of System.Move}
{$IFDEF VER170}
{$DEFINE SSE2Basm}
{$WARN UNSAFE_CODE OFF}
{$ENDIF}
{$IFDEF CONDITIONALEXPRESSIONS}
{$IF CompilerVersion >= 18.0}
{$DEFINE SSE2Basm}
{$WARN UNSAFE_CODE OFF}
{$IFEND}
{$ENDIF}
interface
var
Move : procedure(const Source; var Dest; Count : Integer); {Fastest Move}
{Procedures interfaced only for testing and validation purposes}
procedure MoveJOH_PAS_10 (const Source; var Dest; Count : Integer);
procedure MoveJOH_IA32_10(const Source; var Dest; Count : Integer);
procedure MoveJOH_MMX_10 (const Source; var Dest; Count : Integer);
procedure MoveJOH_SSE_10 (const Source; var Dest; Count : Integer);
procedure MoveJOH_SSE2_10(const Source; var Dest; Count : Integer);
procedure MoveJOH_SSE3_10(const Source; var Dest; Count : Integer);
implementation
uses
Windows, SysUtils, FastcodeCPUID;
var
CacheLimit : Integer; {Used within SSE Moves}
{-------------------------------------------------------------------------}
{Move without using any BASM Code}
{Pure Pascal Move with System.Move-compatible semantics:
 - No-op when Source = Dest or Count <= 0.
 - Handles overlapping Source/Dest in both directions.
 Counts 1..4 are copied directly; larger counts are copied as DWORDs,
 with the first (or last) DWORD saved up front and written last so the
 overlapping region is never clobbered before it is read.
 NOTE(review): 32-bit only - pointers are truncated through Cardinal/Integer.}
procedure MoveJOH_PAS_10(const Source; var Dest; Count : Integer);
var
  S, D : Cardinal;      {Source and Dest addresses as unsigned integers}
  Temp, C, I : Integer; {Temp: DWORD saved before the loop; I: its final address}
  L : PInteger;         {Running DWORD write pointer}
begin
  S := Cardinal(@Source);
  D := Cardinal(@Dest);
  if S = D then
    Exit;
  if Count <= 4 then
    case Count of
      1 : PByte(@Dest)^ := PByte(S)^;
      2 : PWord(@Dest)^ := PWord(S)^;
      3 : if D > S then
            begin {Dest above Source: copy the high byte first (overlap-safe)}
              PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
              PWord(@Dest)^ := PWord(S)^;
            end
          else
            begin {Dest below Source: copy the low word first (overlap-safe)}
              PWord(@Dest)^ := PWord(S)^;
              PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
            end;
      4 : PInteger(@Dest)^ := PInteger(S)^
      else Exit; {Count <= 0}
    end
  else
    if D > S then
      begin {Dest above Source: copy DWORDs backwards from the end}
        Temp := PInteger(S)^; {Save first DWORD before it can be overwritten}
        I := Integer(@Dest);
        C := Count - 4;
        L := PInteger(Integer(@Dest) + C); {Last DWORD of Dest}
        Inc(S, C);                         {Last DWORD of Source}
        repeat
          L^ := PInteger(S)^;
          if Count <= 8 then
            Break; {Remaining head is covered by the saved first DWORD}
          Dec(Count, 4);
          Dec(S, 4);
          Dec(L);
        until False;
        PInteger(I)^ := Temp; {Write the saved first DWORD last}
      end
    else
      begin {Dest below Source: copy DWORDs forwards from the start}
        C := Count - 4;
        Temp := PInteger(S + Cardinal(C))^; {Save last DWORD before overwrite}
        I := Integer(@Dest) + C;
        L := @Dest;
        repeat
          L^ := PInteger(S)^;
          if Count <= 8 then
            Break; {Remaining tail is covered by the saved last DWORD}
          Dec(Count, 4);
          Inc(S, 4);
          Inc(L);
        until False;
        PInteger(I)^ := Temp; {Write the saved last DWORD last}
      end;
end; {MoveJOH_PAS}
const
TINYSIZE = 36;
{-------------------------------------------------------------------------}
{Perform Forward Move of 0..36 Bytes}
{On Entry, ECX = Count, EAX = Source+Count, EDX = Dest+Count. Destroys ECX}
{Forward move of 0..36 bytes.
 On Entry: ECX = Count, EAX = Source+Count, EDX = Dest+Count (both point
 one past the end). Destroys ECX.
 A jump table dispatches on Count; the four fall-through chains below each
 handle one residue class of Count mod 4, copying 4 leading bytes per label.}
procedure SmallForwardMove_10;
asm
  jmp dword ptr [@@FwdJumpTable+ecx*4]
  nop {Align Jump Table}
@@FwdJumpTable:
  dd @@Done {Removes need to test for zero size move}
  dd @@Fwd01, @@Fwd02, @@Fwd03, @@Fwd04, @@Fwd05, @@Fwd06, @@Fwd07, @@Fwd08
  dd @@Fwd09, @@Fwd10, @@Fwd11, @@Fwd12, @@Fwd13, @@Fwd14, @@Fwd15, @@Fwd16
  dd @@Fwd17, @@Fwd18, @@Fwd19, @@Fwd20, @@Fwd21, @@Fwd22, @@Fwd23, @@Fwd24
  dd @@Fwd25, @@Fwd26, @@Fwd27, @@Fwd28, @@Fwd29, @@Fwd30, @@Fwd31, @@Fwd32
  dd @@Fwd33, @@Fwd34, @@Fwd35, @@Fwd36
{Chain for Count mod 4 = 0: pure DWORD copies}
@@Fwd36:
  mov ecx, [eax-36]
  mov [edx-36], ecx
@@Fwd32:
  mov ecx, [eax-32]
  mov [edx-32], ecx
@@Fwd28:
  mov ecx, [eax-28]
  mov [edx-28], ecx
@@Fwd24:
  mov ecx, [eax-24]
  mov [edx-24], ecx
@@Fwd20:
  mov ecx, [eax-20]
  mov [edx-20], ecx
@@Fwd16:
  mov ecx, [eax-16]
  mov [edx-16], ecx
@@Fwd12:
  mov ecx, [eax-12]
  mov [edx-12], ecx
@@Fwd08:
  mov ecx, [eax-8]
  mov [edx-8], ecx
@@Fwd04:
  mov ecx, [eax-4]
  mov [edx-4], ecx
  ret
  nop
{Chain for Count mod 4 = 3: DWORD copies, last 3 bytes covered by an
 overlapping DWORD copy at [eax-4]}
@@Fwd35:
  mov ecx, [eax-35]
  mov [edx-35], ecx
@@Fwd31:
  mov ecx, [eax-31]
  mov [edx-31], ecx
@@Fwd27:
  mov ecx, [eax-27]
  mov [edx-27], ecx
@@Fwd23:
  mov ecx, [eax-23]
  mov [edx-23], ecx
@@Fwd19:
  mov ecx, [eax-19]
  mov [edx-19], ecx
@@Fwd15:
  mov ecx, [eax-15]
  mov [edx-15], ecx
@@Fwd11:
  mov ecx, [eax-11]
  mov [edx-11], ecx
@@Fwd07:
  mov ecx, [eax-7]
  mov [edx-7], ecx
  mov ecx, [eax-4] {Overlapping tail DWORD: one byte copied twice, harmless}
  mov [edx-4], ecx
  ret
  nop
@@Fwd03: {3 bytes cannot use an overlapping DWORD: word + byte}
  movzx ecx, word ptr [eax-3]
  mov [edx-3], cx
  movzx ecx, byte ptr [eax-1]
  mov [edx-1], cl
  ret
{Chain for Count mod 4 = 2: DWORD copies ending in a WORD copy}
@@Fwd34:
  mov ecx, [eax-34]
  mov [edx-34], ecx
@@Fwd30:
  mov ecx, [eax-30]
  mov [edx-30], ecx
@@Fwd26:
  mov ecx, [eax-26]
  mov [edx-26], ecx
@@Fwd22:
  mov ecx, [eax-22]
  mov [edx-22], ecx
@@Fwd18:
  mov ecx, [eax-18]
  mov [edx-18], ecx
@@Fwd14:
  mov ecx, [eax-14]
  mov [edx-14], ecx
@@Fwd10:
  mov ecx, [eax-10]
  mov [edx-10], ecx
@@Fwd06:
  mov ecx, [eax-6]
  mov [edx-6], ecx
@@Fwd02:
  movzx ecx, word ptr [eax-2]
  mov [edx-2], cx
  ret
  nop
  nop
  nop
{Chain for Count mod 4 = 1: DWORD copies ending in a single BYTE copy}
@@Fwd33:
  mov ecx, [eax-33]
  mov [edx-33], ecx
@@Fwd29:
  mov ecx, [eax-29]
  mov [edx-29], ecx
@@Fwd25:
  mov ecx, [eax-25]
  mov [edx-25], ecx
@@Fwd21:
  mov ecx, [eax-21]
  mov [edx-21], ecx
@@Fwd17:
  mov ecx, [eax-17]
  mov [edx-17], ecx
@@Fwd13:
  mov ecx, [eax-13]
  mov [edx-13], ecx
@@Fwd09:
  mov ecx, [eax-9]
  mov [edx-9], ecx
@@Fwd05:
  mov ecx, [eax-5]
  mov [edx-5], ecx
@@Fwd01:
  movzx ecx, byte ptr [eax-1]
  mov [edx-1], cl
  ret
@@Done:
end; {SmallForwardMove}
{-------------------------------------------------------------------------}
{Perform Backward Move of 0..36 Bytes}
{On Entry, ECX = Count, EAX = Source, EDX = Dest. Destroys ECX}
{Backward move of 0..36 bytes (safe when Dest overlaps above Source).
 On Entry: ECX = Count, EAX = Source, EDX = Dest (both point at the start).
 Destroys ECX.
 Mirror image of SmallForwardMove_10: jump table plus four fall-through
 chains, one per residue class of Count mod 4, copying from high to low.}
procedure SmallBackwardMove_10;
asm
  jmp dword ptr [@@BwdJumpTable+ecx*4]
  nop {Align Jump Table}
@@BwdJumpTable:
  dd @@Done {Removes need to test for zero size move}
  dd @@Bwd01, @@Bwd02, @@Bwd03, @@Bwd04, @@Bwd05, @@Bwd06, @@Bwd07, @@Bwd08
  dd @@Bwd09, @@Bwd10, @@Bwd11, @@Bwd12, @@Bwd13, @@Bwd14, @@Bwd15, @@Bwd16
  dd @@Bwd17, @@Bwd18, @@Bwd19, @@Bwd20, @@Bwd21, @@Bwd22, @@Bwd23, @@Bwd24
  dd @@Bwd25, @@Bwd26, @@Bwd27, @@Bwd28, @@Bwd29, @@Bwd30, @@Bwd31, @@Bwd32
  dd @@Bwd33, @@Bwd34, @@Bwd35, @@Bwd36
{Chain for Count mod 4 = 0: pure DWORD copies, highest first}
@@Bwd36:
  mov ecx, [eax+32]
  mov [edx+32], ecx
@@Bwd32:
  mov ecx, [eax+28]
  mov [edx+28], ecx
@@Bwd28:
  mov ecx, [eax+24]
  mov [edx+24], ecx
@@Bwd24:
  mov ecx, [eax+20]
  mov [edx+20], ecx
@@Bwd20:
  mov ecx, [eax+16]
  mov [edx+16], ecx
@@Bwd16:
  mov ecx, [eax+12]
  mov [edx+12], ecx
@@Bwd12:
  mov ecx, [eax+8]
  mov [edx+8], ecx
@@Bwd08:
  mov ecx, [eax+4]
  mov [edx+4], ecx
@@Bwd04:
  mov ecx, [eax]
  mov [edx], ecx
  ret
  nop
  nop
  nop
{Chain for Count mod 4 = 3: DWORD copies, first 3 bytes covered by an
 overlapping DWORD copy at [eax]}
@@Bwd35:
  mov ecx, [eax+31]
  mov [edx+31], ecx
@@Bwd31:
  mov ecx, [eax+27]
  mov [edx+27], ecx
@@Bwd27:
  mov ecx, [eax+23]
  mov [edx+23], ecx
@@Bwd23:
  mov ecx, [eax+19]
  mov [edx+19], ecx
@@Bwd19:
  mov ecx, [eax+15]
  mov [edx+15], ecx
@@Bwd15:
  mov ecx, [eax+11]
  mov [edx+11], ecx
@@Bwd11:
  mov ecx, [eax+7]
  mov [edx+7], ecx
@@Bwd07:
  mov ecx, [eax+3]
  mov [edx+3], ecx
  mov ecx, [eax] {Overlapping head DWORD: one byte copied twice, harmless}
  mov [edx], ecx
  ret
  nop
  nop
  nop
@@Bwd03: {3 bytes cannot use an overlapping DWORD: word + byte}
  movzx ecx, word ptr [eax+1]
  mov [edx+1], cx
  movzx ecx, byte ptr [eax]
  mov [edx], cl
  ret
  nop
  nop
{Chain for Count mod 4 = 2: DWORD copies ending in a WORD copy}
@@Bwd34:
  mov ecx, [eax+30]
  mov [edx+30], ecx
@@Bwd30:
  mov ecx, [eax+26]
  mov [edx+26], ecx
@@Bwd26:
  mov ecx, [eax+22]
  mov [edx+22], ecx
@@Bwd22:
  mov ecx, [eax+18]
  mov [edx+18], ecx
@@Bwd18:
  mov ecx, [eax+14]
  mov [edx+14], ecx
@@Bwd14:
  mov ecx, [eax+10]
  mov [edx+10], ecx
@@Bwd10:
  mov ecx, [eax+6]
  mov [edx+6], ecx
@@Bwd06:
  mov ecx, [eax+2]
  mov [edx+2], ecx
@@Bwd02:
  movzx ecx, word ptr [eax]
  mov [edx], cx
  ret
  nop
{Chain for Count mod 4 = 1: DWORD copies ending in a single BYTE copy}
@@Bwd33:
  mov ecx, [eax+29]
  mov [edx+29], ecx
@@Bwd29:
  mov ecx, [eax+25]
  mov [edx+25], ecx
@@Bwd25:
  mov ecx, [eax+21]
  mov [edx+21], ecx
@@Bwd21:
  mov ecx, [eax+17]
  mov [edx+17], ecx
@@Bwd17:
  mov ecx, [eax+13]
  mov [edx+13], ecx
@@Bwd13:
  mov ecx, [eax+9]
  mov [edx+9], ecx
@@Bwd09:
  mov ecx, [eax+5]
  mov [edx+5], ecx
@@Bwd05:
  mov ecx, [eax+1]
  mov [edx+1], ecx
@@Bwd01:
  movzx ecx, byte ptr[eax]
  mov [edx], cl
  ret
@@Done:
end; {SmallBackwardMove}
{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (TINYSIZE)}
{Forward move of ECX bytes from EAX to EDX, EAX > EDX, ECX > TINYSIZE.
 Uses the x87 FPU to move 8 bytes per fild/fistp pair (a 64-bit integer
 load/store is an exact bit copy). The first and last QWORDs are loaded
 onto the FPU stack up front and stored after the loop, so the loop body
 only ever writes QWORD-aligned destination addresses.}
procedure Forwards_IA32_10;
asm
  fild qword ptr [eax] {First 8}
  lea eax, [eax+ecx-8]
  lea ecx, [edx+ecx-8]
  push edx              {Save Dest start for the final 'First 8' store}
  push ecx              {Save Dest+Count-8 for the final 'Last 8' store}
  fild qword ptr [eax] {Last 8}
  neg ecx {QWORD Align Writes}
  and edx, -8
  lea ecx, [ecx+edx+8] {ECX = negative loop counter, reaches 0 at the end}
  pop edx
@@Loop:
  fild qword ptr [eax+ecx]
  fistp qword ptr [edx+ecx]
  add ecx, 8
  jl @@Loop
  pop eax
  fistp qword ptr [edx] {Last 8}
  fistp qword ptr [eax] {First 8}
end; {Forwards_IA32}
{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (TINYSIZE)}
{Backward move of ECX bytes from EAX to EDX, EAX < EDX, ECX > TINYSIZE
 (overlap-safe for Dest above Source). x87 counterpart of Forwards_IA32_10:
 the first and last QWORDs are preloaded and stored after the loop, which
 copies QWORD-aligned destination addresses from high to low.}
procedure Backwards_IA32_10;
asm
  sub ecx, 8
  fild qword ptr [eax+ecx] {Last 8}
  fild qword ptr [eax] {First 8}
  add ecx, edx {QWORD Align Writes}
  push ecx     {Save Dest+Count-8 for the final 'Last 8' store}
  and ecx, -8
  sub ecx, edx {ECX = aligned offset loop counter, counts down to <= 0}
@@Loop:
  fild qword ptr [eax+ecx]
  fistp qword ptr [edx+ecx]
  sub ecx, 8
  jg @@Loop
  pop eax
  fistp qword ptr [edx] {First 8}
  fistp qword ptr [eax] {Last 8}
end; {Backwards_IA32}
{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (TINYSIZE)}
{Forward move of ECX bytes from EAX to EDX using MMX, EAX > EDX,
 ECX > TINYSIZE. Three size regimes:
 - below SMALLSIZE: delegate to the x87 Forwards_IA32_10;
 - up to LARGESIZE: 32-bytes-per-iteration MMX loop with QWORD-aligned
   writes, tail finished by SmallForwardMove_10;
 - LARGESIZE and above: head bytes to 16-byte-align Dest, then 64-byte
   MMX block loop, then rep movsd/movsb for the remainder.}
procedure Forwards_MMX_10;
const
  SMALLSIZE = 64;
  LARGESIZE = 2048;
asm
  cmp ecx, SMALLSIZE {Size at which using MMX becomes worthwhile}
  jl Forwards_IA32_10
  cmp ecx, LARGESIZE
  jge @@FwdLargeMove
  push ebx
  mov ebx, edx          {EBX = Dest start, for the deferred first-8 store}
  movq mm0, [eax] {First 8 Bytes}
  add eax, ecx {QWORD Align Writes}
  add ecx, edx
  and edx, -8
  add edx, 40
  sub ecx, edx
  add edx, ecx
  neg ecx               {ECX = negative offset from end of 32-byte blocks}
  nop {Align Loop}
@@FwdLoopMMX:
  movq mm1, [eax+ecx-32]
  movq mm2, [eax+ecx-24]
  movq mm3, [eax+ecx-16]
  movq mm4, [eax+ecx- 8]
  movq [edx+ecx-32], mm1
  movq [edx+ecx-24], mm2
  movq [edx+ecx-16], mm3
  movq [edx+ecx- 8], mm4
  add ecx, 32
  jle @@FwdLoopMMX
  movq [ebx], mm0 {First 8 Bytes}
  emms                  {Leave the FPU usable again}
  pop ebx
  neg ecx
  add ecx, 32           {ECX = remaining tail bytes for the small move}
  jmp SmallForwardMove_10
  nop {Align Loop}
  nop
@@FwdLargeMove:
  push ebx
  mov ebx, ecx
  test edx, 15
  jz @@FwdAligned
  lea ecx, [edx+15] {16 byte Align Destination}
  and ecx, -16
  sub ecx, edx
  add eax, ecx
  add edx, ecx
  sub ebx, ecx
  call SmallForwardMove_10 {Copy the unaligned head bytes}
@@FwdAligned:
  mov ecx, ebx
  and ecx, -16
  sub ebx, ecx {EBX = Remainder}
  push esi
  push edi
  mov esi, eax {ESI = Source}
  mov edi, edx {EDI = Dest}
  mov eax, ecx {EAX = Count}
  and eax, -64 {EAX = No of Bytes to Blocks Moves}
  and ecx, $3F {ECX = Remaining Bytes to Move (0..63)}
  add esi, eax
  add edi, eax
  neg eax      {EAX = negative block offset, reaches 0 at the end}
@@MMXcopyloop:
  movq mm0, [esi+eax ]
  movq mm1, [esi+eax+ 8]
  movq mm2, [esi+eax+16]
  movq mm3, [esi+eax+24]
  movq mm4, [esi+eax+32]
  movq mm5, [esi+eax+40]
  movq mm6, [esi+eax+48]
  movq mm7, [esi+eax+56]
  movq [edi+eax ], mm0
  movq [edi+eax+ 8], mm1
  movq [edi+eax+16], mm2
  movq [edi+eax+24], mm3
  movq [edi+eax+32], mm4
  movq [edi+eax+40], mm5
  movq [edi+eax+48], mm6
  movq [edi+eax+56], mm7
  add eax, 64
  jnz @@MMXcopyloop
  emms {Empty MMX State}
  add ecx, ebx          {Total remainder = sub-64 part + sub-16 part}
  shr ecx, 2
  rep movsd             {Copy remainder as DWORDs...}
  mov ecx, ebx
  and ecx, 3
  rep movsb             {...then the final 0..3 bytes}
  pop edi
  pop esi
  pop ebx
end; {Forwards_MMX}
{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (TINYSIZE)}
{Backward move of ECX bytes from EAX to EDX using MMX, EAX < EDX,
 ECX > TINYSIZE (overlap-safe for Dest above Source).
 The last QWORD is saved in MM0 and stored after the loop so the 32-byte
 MMX loop can keep its writes QWORD-aligned; the remaining head bytes are
 finished by SmallBackwardMove_10.}
procedure Backwards_MMX_10;
const
  SMALLSIZE = 64;
asm
  cmp ecx, SMALLSIZE {Size at which using MMX becomes worthwhile}
  jl Backwards_IA32_10
  push ebx
  movq mm0, [eax+ecx-8] {Get Last QWORD}
  lea ebx, [edx+ecx] {QWORD Align Writes}
  and ebx, 7
  sub ecx, ebx          {Drop the misaligned tail from the loop count}
  add ebx, ecx          {EBX = offset of the saved last QWORD's slot}
  sub ecx, 32
@@BwdLoopMMX:
  movq mm1, [eax+ecx ]
  movq mm2, [eax+ecx+ 8]
  movq mm3, [eax+ecx+16]
  movq mm4, [eax+ecx+24]
  movq [edx+ecx+24], mm4
  movq [edx+ecx+16], mm3
  movq [edx+ecx+ 8], mm2
  movq [edx+ecx ], mm1
  sub ecx, 32
  jge @@BwdLoopMMX
  movq [edx+ebx-8], mm0 {Last QWORD}
  emms                  {Leave the FPU usable again}
  add ecx, 32           {ECX = remaining head bytes for the small move}
  pop ebx
  jmp SmallBackwardMove_10
end; {Backwards_MMX}
{-------------------------------------------------------------------------}
{128-bytes-per-iteration SSE copy loop for 16-byte-aligned Source AND Dest.
 On Entry (set up by LargeSSEMove): EAX/EDX point past the end of the
 blocks, ECX = negative byte count; the loop runs until ECX reaches 0.}
procedure LargeAlignedSSEMove;
asm
@@Loop:
  movaps xmm0, [eax+ecx]
  movaps xmm1, [eax+ecx+16]
  movaps xmm2, [eax+ecx+32]
  movaps xmm3, [eax+ecx+48]
  movaps [edx+ecx], xmm0
  movaps [edx+ecx+16], xmm1
  movaps [edx+ecx+32], xmm2
  movaps [edx+ecx+48], xmm3
  movaps xmm4, [eax+ecx+64]
  movaps xmm5, [eax+ecx+80]
  movaps xmm6, [eax+ecx+96]
  movaps xmm7, [eax+ecx+112]
  movaps [edx+ecx+64], xmm4
  movaps [edx+ecx+80], xmm5
  movaps [edx+ecx+96], xmm6
  movaps [edx+ecx+112], xmm7
  add ecx, 128
  js @@Loop
end; {LargeAlignedSSEMove}
{-------------------------------------------------------------------------}
{As LargeAlignedSSEMove, but for an unaligned Source: loads use movups,
 stores still use movaps (Dest is guaranteed 16-byte aligned by the caller).
 On Entry: EAX/EDX point past the end of the blocks, ECX = negative count.}
procedure LargeUnalignedSSEMove;
asm
@@Loop:
  movups xmm0, [eax+ecx]
  movups xmm1, [eax+ecx+16]
  movups xmm2, [eax+ecx+32]
  movups xmm3, [eax+ecx+48]
  movaps [edx+ecx], xmm0
  movaps [edx+ecx+16], xmm1
  movaps [edx+ecx+32], xmm2
  movaps [edx+ecx+48], xmm3
  movups xmm4, [eax+ecx+64]
  movups xmm5, [eax+ecx+80]
  movups xmm6, [eax+ecx+96]
  movups xmm7, [eax+ecx+112]
  movaps [edx+ecx+64], xmm4
  movaps [edx+ecx+80], xmm5
  movaps [edx+ecx+96], xmm6
  movaps [edx+ecx+112], xmm7
  add ecx, 128
  js @@Loop
end; {LargeUnalignedSSEMove}
{-------------------------------------------------------------------------}
{Cache-bypassing variant of LargeAlignedSSEMove for copies bigger than the
 CacheLimit threshold: prefetchnta pulls Source ahead of use and movntps
 stores non-temporally so a huge copy does not evict the whole cache.
 The trailing sfence orders the non-temporal stores.
 On Entry: EAX/EDX point past the end of the blocks, ECX = negative count.}
procedure HugeAlignedSSEMove;
const
  Prefetch = 512;
asm
@@Loop:
  prefetchnta [eax+ecx+Prefetch]
  prefetchnta [eax+ecx+Prefetch+64]
  movaps xmm0, [eax+ecx]
  movaps xmm1, [eax+ecx+16]
  movaps xmm2, [eax+ecx+32]
  movaps xmm3, [eax+ecx+48]
  movntps [edx+ecx], xmm0
  movntps [edx+ecx+16], xmm1
  movntps [edx+ecx+32], xmm2
  movntps [edx+ecx+48], xmm3
  movaps xmm4, [eax+ecx+64]
  movaps xmm5, [eax+ecx+80]
  movaps xmm6, [eax+ecx+96]
  movaps xmm7, [eax+ecx+112]
  movntps [edx+ecx+64], xmm4
  movntps [edx+ecx+80], xmm5
  movntps [edx+ecx+96], xmm6
  movntps [edx+ecx+112], xmm7
  add ecx, 128
  js @@Loop
  sfence
end; {HugeAlignedSSEMove}
{-------------------------------------------------------------------------}
{As HugeAlignedSSEMove but with movups loads for an unaligned Source;
 non-temporal movntps stores require the 16-byte-aligned Dest the caller
 guarantees. The trailing sfence orders the non-temporal stores.
 On Entry: EAX/EDX point past the end of the blocks, ECX = negative count.}
procedure HugeUnalignedSSEMove;
const
  Prefetch = 512;
asm
@@Loop:
  prefetchnta [eax+ecx+Prefetch]
  prefetchnta [eax+ecx+Prefetch+64]
  movups xmm0, [eax+ecx]
  movups xmm1, [eax+ecx+16]
  movups xmm2, [eax+ecx+32]
  movups xmm3, [eax+ecx+48]
  movntps [edx+ecx], xmm0
  movntps [edx+ecx+16], xmm1
  movntps [edx+ecx+32], xmm2
  movntps [edx+ecx+48], xmm3
  movups xmm4, [eax+ecx+64]
  movups xmm5, [eax+ecx+80]
  movups xmm6, [eax+ecx+96]
  movups xmm7, [eax+ecx+112]
  movntps [edx+ecx+64], xmm4
  movntps [edx+ecx+80], xmm5
  movntps [edx+ecx+96], xmm6
  movntps [edx+ecx+112], xmm7
  add ecx, 128
  js @@Loop
  sfence
end; {HugeUnalignedSSEMove}
{-------------------------------------------------------------------------}
{Dest MUST be 16-Byte Aligned, Count MUST be multiple of 16 }
{SSE bulk copy dispatcher. Dest MUST be 16-byte aligned, Count MUST be a
 multiple of 16. Splits Count into 128-byte blocks (delegated to one of the
 four loops above, chosen by copy size vs. CacheLimit and by Source
 alignment) plus a 0..112-byte remainder copied 16 bytes at a time.
 Register convention: EAX = Source, EDX = Dest, ECX = Count.}
procedure LargeSSEMove(const Source; var Dest; Count: Integer);
asm
  push ebx
  mov ebx, ecx
  and ecx, -128 {No of Bytes to Block Move (Multiple of 128)}
  add eax, ecx {End of Source Blocks}
  add edx, ecx {End of Dest Blocks}
  neg ecx       {Loops run ECX from -blockbytes up to 0}
  cmp ecx, CacheLimit {Count > Limit - Use Prefetch}
  jl @@Huge     {-blockbytes < -(L2/2) means copy exceeds half the L2 cache}
  test eax, 15 {Check if Both Source/Dest are Aligned}
  jnz @@LargeUnaligned
  call LargeAlignedSSEMove {Both Source and Dest 16-Byte Aligned}
  jmp @@Remainder
@@LargeUnaligned: {Source Not 16-Byte Aligned}
  call LargeUnalignedSSEMove
  jmp @@Remainder
@@Huge:
  test eax, 15 {Check if Both Source/Dest Aligned}
  jnz @@HugeUnaligned
  call HugeAlignedSSEMove {Both Source and Dest 16-Byte Aligned}
  jmp @@Remainder
@@HugeUnaligned: {Source Not 16-Byte Aligned}
  call HugeUnalignedSSEMove
@@Remainder:
  and ebx, $7F {Remainder (0..112 - Multiple of 16)}
  jz @@Done
  add eax, ebx
  add edx, ebx
  neg ebx
@@RemainderLoop:
  movups xmm0, [eax+ebx]
  movaps [edx+ebx], xmm0
  add ebx, 16
  jnz @@RemainderLoop
@@Done:
  pop ebx
end; {LargeSSEMove}
{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (TINYSIZE)}
{Forward move of ECX bytes from EAX to EDX using SSE, EAX > EDX,
 ECX > TINYSIZE. Three size regimes:
 - up to SMALLSIZE: delegate to the x87 Forwards_IA32_10;
 - up to LARGESIZE: 32-bytes-per-iteration SSE loop with 16-byte-aligned
   writes (first 16 bytes held in XMM0 and stored after the loop), tail
   finished by SmallForwardMove_10;
 - LARGESIZE and above: head bytes to 16-byte-align Dest, then
   LargeSSEMove, then SmallForwardMove_10 for the tail.}
procedure Forwards_SSE_10;
const
  SMALLSIZE = 64;
  LARGESIZE = 2048;
asm
  cmp ecx, SMALLSIZE
  jle Forwards_IA32_10
  push ebx
  cmp ecx, LARGESIZE
  jge @@FwdLargeMove
  movups xmm0, [eax] {First 16 Bytes}
  mov ebx, edx       {EBX = Dest start, for the deferred first-16 store}
  add eax, ecx {Align Writes}
  add ecx, edx
  and edx, -16
  add edx, 48
  sub ecx, edx
  add edx, ecx
  neg ecx            {ECX = negative offset from end of 32-byte blocks}
  nop {Align Loop}
@@FwdLoopSSE:
  movups xmm1, [eax+ecx-32]
  movups xmm2, [eax+ecx-16]
  movaps [edx+ecx-32], xmm1
  movaps [edx+ecx-16], xmm2
  add ecx, 32
  jle @@FwdLoopSSE
  movups [ebx], xmm0 {First 16 Bytes}
  neg ecx
  add ecx, 32        {ECX = remaining tail bytes for the small move}
  pop ebx
  jmp SmallForwardMove_10
@@FwdLargeMove:
  mov ebx, ecx
  test edx, 15
  jz @@FwdLargeAligned
  lea ecx, [edx+15] {16 byte Align Destination}
  and ecx, -16
  sub ecx, edx
  add eax, ecx
  add edx, ecx
  sub ebx, ecx
  call SmallForwardMove_10 {Copy the unaligned head bytes}
  mov ecx, ebx
@@FwdLargeAligned:
  and ecx, -16
  sub ebx, ecx {EBX = Remainder}
  push edx     {LargeSSEMove destroys EAX/EDX/ECX: preserve for the tail}
  push eax
  push ecx
  call LargeSSEMove
  pop ecx
  pop eax
  pop edx
  add ecx, ebx
  add eax, ecx {Small move expects Source+Count/Dest+Count}
  add edx, ecx
  mov ecx, ebx
  pop ebx
  jmp SmallForwardMove_10
end; {Forwards_SSE}
{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (TINYSIZE)}
{Backward move of ECX bytes from EAX to EDX using SSE, EAX < EDX,
 ECX > TINYSIZE (overlap-safe for Dest above Source). The last 16 bytes
 are saved in XMM0 and stored after the loop so the 32-byte loop keeps its
 writes 16-byte aligned; the head is finished by SmallBackwardMove_10.}
procedure Backwards_SSE_10;
const
  SMALLSIZE = 64;
asm
  cmp ecx, SMALLSIZE
  jle Backwards_IA32_10
  push ebx
  movups xmm0, [eax+ecx-16] {Last 16 Bytes}
  lea ebx, [edx+ecx] {Align Writes}
  and ebx, 15
  sub ecx, ebx       {Drop the misaligned tail from the loop count}
  add ebx, ecx       {EBX = offset of the saved last-16 slot}
  sub ecx, 32
@@BwdLoop:
  movups xmm1, [eax+ecx]
  movups xmm2, [eax+ecx+16]
  movaps [edx+ecx], xmm1
  movaps [edx+ecx+16], xmm2
  sub ecx, 32
  jge @@BwdLoop
  movups [edx+ebx-16], xmm0 {Last 16 Bytes}
  add ecx, 32        {ECX = remaining head bytes for the small move}
  pop ebx
  jmp SmallBackwardMove_10
end; {Backwards_SSE}
{-------------------------------------------------------------------------}
{128-bytes-per-iteration SSE2 copy loop for 16-byte-aligned Source AND
 Dest (integer movdqa instead of SSE movaps). Also used by the SSE3 move.
 On Entry: EAX/EDX point past the end of the blocks, ECX = negative count.}
procedure LargeAlignedSSE2Move; {Also used in SSE3 Move}
asm
@@Loop:
  movdqa xmm0, [eax+ecx]
  movdqa xmm1, [eax+ecx+16]
  movdqa xmm2, [eax+ecx+32]
  movdqa xmm3, [eax+ecx+48]
  movdqa [edx+ecx], xmm0
  movdqa [edx+ecx+16], xmm1
  movdqa [edx+ecx+32], xmm2
  movdqa [edx+ecx+48], xmm3
  movdqa xmm4, [eax+ecx+64]
  movdqa xmm5, [eax+ecx+80]
  movdqa xmm6, [eax+ecx+96]
  movdqa xmm7, [eax+ecx+112]
  movdqa [edx+ecx+64], xmm4
  movdqa [edx+ecx+80], xmm5
  movdqa [edx+ecx+96], xmm6
  movdqa [edx+ecx+112], xmm7
  add ecx, 128
  js @@Loop
end; {LargeAlignedSSE2Move}
{-------------------------------------------------------------------------}
{As LargeAlignedSSE2Move but with movdqu loads for an unaligned Source;
 movdqa stores rely on the 16-byte-aligned Dest the caller guarantees.
 On Entry: EAX/EDX point past the end of the blocks, ECX = negative count.}
procedure LargeUnalignedSSE2Move;
asm
@@Loop:
  movdqu xmm0, [eax+ecx]
  movdqu xmm1, [eax+ecx+16]
  movdqu xmm2, [eax+ecx+32]
  movdqu xmm3, [eax+ecx+48]
  movdqa [edx+ecx], xmm0
  movdqa [edx+ecx+16], xmm1
  movdqa [edx+ecx+32], xmm2
  movdqa [edx+ecx+48], xmm3
  movdqu xmm4, [eax+ecx+64]
  movdqu xmm5, [eax+ecx+80]
  movdqu xmm6, [eax+ecx+96]
  movdqu xmm7, [eax+ecx+112]
  movdqa [edx+ecx+64], xmm4
  movdqa [edx+ecx+80], xmm5
  movdqa [edx+ecx+96], xmm6
  movdqa [edx+ecx+112], xmm7
  add ecx, 128
  js @@Loop
end; {LargeUnalignedSSE2Move}
{-------------------------------------------------------------------------}
{Cache-bypassing SSE2 loop for copies bigger than the CacheLimit
 threshold: prefetchnta ahead of the loads, movntdq non-temporal stores,
 trailing sfence to order them. Also used by the SSE3 move.
 On Entry: EAX/EDX point past the end of the blocks, ECX = negative count.}
procedure HugeAlignedSSE2Move; {Also used in SSE3 Move}
const
  Prefetch = 512;
asm
@@Loop:
  prefetchnta [eax+ecx+Prefetch]
  prefetchnta [eax+ecx+Prefetch+64]
  movdqa xmm0, [eax+ecx]
  movdqa xmm1, [eax+ecx+16]
  movdqa xmm2, [eax+ecx+32]
  movdqa xmm3, [eax+ecx+48]
  movntdq [edx+ecx], xmm0
  movntdq [edx+ecx+16], xmm1
  movntdq [edx+ecx+32], xmm2
  movntdq [edx+ecx+48], xmm3
  movdqa xmm4, [eax+ecx+64]
  movdqa xmm5, [eax+ecx+80]
  movdqa xmm6, [eax+ecx+96]
  movdqa xmm7, [eax+ecx+112]
  movntdq [edx+ecx+64], xmm4
  movntdq [edx+ecx+80], xmm5
  movntdq [edx+ecx+96], xmm6
  movntdq [edx+ecx+112], xmm7
  add ecx, 128
  js @@Loop
  sfence
end; {HugeAlignedSSE2Move}
{-------------------------------------------------------------------------}
{As HugeAlignedSSE2Move but with movdqu loads for an unaligned Source;
 movntdq stores require the 16-byte-aligned Dest the caller guarantees.
 On Entry: EAX/EDX point past the end of the blocks, ECX = negative count.}
procedure HugeUnalignedSSE2Move;
const
  Prefetch = 512;
asm
@@Loop:
  prefetchnta [eax+ecx+Prefetch]
  prefetchnta [eax+ecx+Prefetch+64]
  movdqu xmm0, [eax+ecx]
  movdqu xmm1, [eax+ecx+16]
  movdqu xmm2, [eax+ecx+32]
  movdqu xmm3, [eax+ecx+48]
  movntdq [edx+ecx], xmm0
  movntdq [edx+ecx+16], xmm1
  movntdq [edx+ecx+32], xmm2
  movntdq [edx+ecx+48], xmm3
  movdqu xmm4, [eax+ecx+64]
  movdqu xmm5, [eax+ecx+80]
  movdqu xmm6, [eax+ecx+96]
  movdqu xmm7, [eax+ecx+112]
  movntdq [edx+ecx+64], xmm4
  movntdq [edx+ecx+80], xmm5
  movntdq [edx+ecx+96], xmm6
  movntdq [edx+ecx+112], xmm7
  add ecx, 128
  js @@Loop
  sfence
end; {HugeUnalignedSSE2Move}
{-------------------------------------------------------------------------}
{Dest MUST be 16-Byte Aligned, Count MUST be multiple of 16 }
{SSE2 bulk copy dispatcher; same structure as LargeSSEMove but using the
 SSE2 loops. Dest MUST be 16-byte aligned, Count MUST be a multiple of 16.
 Register convention: EAX = Source, EDX = Dest, ECX = Count.}
procedure LargeSSE2Move(const Source; var Dest; Count: Integer);
asm
  push ebx
  mov ebx, ecx
  and ecx, -128 {No of Bytes to Block Move (Multiple of 128)}
  add eax, ecx {End of Source Blocks}
  add edx, ecx {End of Dest Blocks}
  neg ecx       {Loops run ECX from -blockbytes up to 0}
  cmp ecx, CacheLimit {Count > Limit - Use Prefetch}
  jl @@Huge     {-blockbytes < -(L2/2) means copy exceeds half the L2 cache}
  test eax, 15 {Check if Both Source/Dest are Aligned}
  jnz @@LargeUnaligned
  call LargeAlignedSSE2Move {Both Source and Dest 16-Byte Aligned}
  jmp @@Remainder
@@LargeUnaligned: {Source Not 16-Byte Aligned}
  call LargeUnalignedSSE2Move
  jmp @@Remainder
@@Huge:
  test eax, 15 {Check if Both Source/Dest Aligned}
  jnz @@HugeUnaligned
  call HugeAlignedSSE2Move {Both Source and Dest 16-Byte Aligned}
  jmp @@Remainder
@@HugeUnaligned: {Source Not 16-Byte Aligned}
  call HugeUnalignedSSE2Move
@@Remainder:
  and ebx, $7F {Remainder (0..112 - Multiple of 16)}
  jz @@Done
  add eax, ebx
  add edx, ebx
  neg ebx
@@RemainderLoop:
  movdqu xmm0, [eax+ebx]
  movdqa [edx+ebx], xmm0
  add ebx, 16
  jnz @@RemainderLoop
@@Done:
  pop ebx
end; {LargeSSE2Move}
{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (TINYSIZE)}
{Forward move of ECX bytes from EAX to EDX using SSE2, EAX > EDX,
 ECX > TINYSIZE. Same structure as Forwards_SSE_10 with integer
 movdqu/movdqa and LargeSSE2Move for the LARGESIZE regime.}
procedure Forwards_SSE2_10;
const
  SMALLSIZE = 64;
  LARGESIZE = 2048;
asm
  cmp ecx, SMALLSIZE
  jle Forwards_IA32_10
  push ebx
  cmp ecx, LARGESIZE
  jge @@FwdLargeMove
  movdqu xmm0, [eax] {First 16 Bytes}
  mov ebx, edx       {EBX = Dest start, for the deferred first-16 store}
  add eax, ecx {Align Writes}
  add ecx, edx
  and edx, -16
  add edx, 48
  sub ecx, edx
  add edx, ecx
  neg ecx            {ECX = negative offset from end of 32-byte blocks}
@@FwdLoopSSE2:
  movdqu xmm1, [eax+ecx-32]
  movdqu xmm2, [eax+ecx-16]
  movdqa [edx+ecx-32], xmm1
  movdqa [edx+ecx-16], xmm2
  add ecx, 32
  jle @@FwdLoopSSE2
  movdqu [ebx], xmm0 {First 16 Bytes}
  neg ecx
  add ecx, 32        {ECX = remaining tail bytes for the small move}
  pop ebx
  jmp SmallForwardMove_10
@@FwdLargeMove:
  mov ebx, ecx
  test edx, 15
  jz @@FwdLargeAligned
  lea ecx, [edx+15] {16 byte Align Destination}
  and ecx, -16
  sub ecx, edx
  add eax, ecx
  add edx, ecx
  sub ebx, ecx
  call SmallForwardMove_10 {Copy the unaligned head bytes}
  mov ecx, ebx
@@FwdLargeAligned:
  and ecx, -16
  sub ebx, ecx {EBX = Remainder}
  push edx     {LargeSSE2Move destroys EAX/EDX/ECX: preserve for the tail}
  push eax
  push ecx
  call LargeSSE2Move
  pop ecx
  pop eax
  pop edx
  add ecx, ebx
  add eax, ecx {Small move expects Source+Count/Dest+Count}
  add edx, ecx
  mov ecx, ebx
  pop ebx
  jmp SmallForwardMove_10
end; {Forwards_SSE2}
{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (TINYSIZE)}
{Backward move of ECX bytes from EAX to EDX using SSE2, EAX < EDX,
 ECX > TINYSIZE (overlap-safe for Dest above Source). Same structure as
 Backwards_SSE_10 with integer movdqu/movdqa.}
procedure Backwards_SSE2_10;
const
  SMALLSIZE = 64;
asm
  cmp ecx, SMALLSIZE
  jle Backwards_IA32_10
  push ebx
  movdqu xmm0, [eax+ecx-16] {Last 16 Bytes}
  lea ebx, [edx+ecx] {Align Writes}
  and ebx, 15
  sub ecx, ebx       {Drop the misaligned tail from the loop count}
  add ebx, ecx       {EBX = offset of the saved last-16 slot}
  sub ecx, 32
  add edi, 0 {3-Byte NOP Equivalent to Align Loop}
@@BwdLoop:
  movdqu xmm1, [eax+ecx]
  movdqu xmm2, [eax+ecx+16]
  movdqa [edx+ecx], xmm1
  movdqa [edx+ecx+16], xmm2
  sub ecx, 32
  jge @@BwdLoop
  movdqu [edx+ebx-16], xmm0 {Last 16 Bytes}
  add ecx, 32        {ECX = remaining head bytes for the small move}
  pop ebx
  jmp SmallBackwardMove_10
end; {Backwards_SSE2}
{-------------------------------------------------------------------------}
{SSE3 variant of LargeUnalignedSSE2Move: lddqu loads tolerate (and are
 optimized for) an unaligned Source. The DB sequences are the lddqu
 opcode bytes for compilers whose BASM predates SSE3 (no SSE2Basm define).
 On Entry: EAX/EDX point past the end of the blocks, ECX = negative count.}
procedure LargeUnalignedSSE3Move;
asm
@@Loop:
{$IFDEF SSE2Basm}
  lddqu xmm0, [eax+ecx]
  lddqu xmm1, [eax+ecx+16]
  lddqu xmm2, [eax+ecx+32]
  lddqu xmm3, [eax+ecx+48]
{$ELSE}
  DB $F2,$0F,$F0,$04,$01       {lddqu xmm0, [eax+ecx]}
  DB $F2,$0F,$F0,$4C,$01,$10   {lddqu xmm1, [eax+ecx+16]}
  DB $F2,$0F,$F0,$54,$01,$20   {lddqu xmm2, [eax+ecx+32]}
  DB $F2,$0F,$F0,$5C,$01,$30   {lddqu xmm3, [eax+ecx+48]}
{$ENDIF}
  movdqa [edx+ecx], xmm0
  movdqa [edx+ecx+16], xmm1
  movdqa [edx+ecx+32], xmm2
  movdqa [edx+ecx+48], xmm3
{$IFDEF SSE2Basm}
  lddqu xmm4, [eax+ecx+64]
  lddqu xmm5, [eax+ecx+80]
  lddqu xmm6, [eax+ecx+96]
  lddqu xmm7, [eax+ecx+112]
{$ELSE}
  DB $F2,$0F,$F0,$64,$01,$40   {lddqu xmm4, [eax+ecx+64]}
  DB $F2,$0F,$F0,$6C,$01,$50   {lddqu xmm5, [eax+ecx+80]}
  DB $F2,$0F,$F0,$74,$01,$60   {lddqu xmm6, [eax+ecx+96]}
  DB $F2,$0F,$F0,$7C,$01,$70   {lddqu xmm7, [eax+ecx+112]}
{$ENDIF}
  movdqa [edx+ecx+64], xmm4
  movdqa [edx+ecx+80], xmm5
  movdqa [edx+ecx+96], xmm6
  movdqa [edx+ecx+112], xmm7
  add ecx, 128
  js @@Loop
end; {LargeUnalignedSSE3Move}
{-------------------------------------------------------------------------}
{SSE3 variant of HugeUnalignedSSE2Move: lddqu unaligned loads, movntdq
 non-temporal stores, prefetchnta ahead, sfence to finish. The DB
 sequences are lddqu opcode bytes for pre-SSE3 BASM compilers.
 On Entry: EAX/EDX point past the end of the blocks, ECX = negative count.}
procedure HugeUnalignedSSE3Move;
const
  Prefetch = 512;
asm
@@Loop:
  prefetchnta [eax+ecx+Prefetch]
  prefetchnta [eax+ecx+Prefetch+64]
{$IFDEF SSE2Basm}
  lddqu xmm0, [eax+ecx]
  lddqu xmm1, [eax+ecx+16]
  lddqu xmm2, [eax+ecx+32]
  lddqu xmm3, [eax+ecx+48]
{$ELSE}
  DB $F2,$0F,$F0,$04,$01       {lddqu xmm0, [eax+ecx]}
  DB $F2,$0F,$F0,$4C,$01,$10   {lddqu xmm1, [eax+ecx+16]}
  DB $F2,$0F,$F0,$54,$01,$20   {lddqu xmm2, [eax+ecx+32]}
  DB $F2,$0F,$F0,$5C,$01,$30   {lddqu xmm3, [eax+ecx+48]}
{$ENDIF}
  movntdq [edx+ecx], xmm0
  movntdq [edx+ecx+16], xmm1
  movntdq [edx+ecx+32], xmm2
  movntdq [edx+ecx+48], xmm3
{$IFDEF SSE2Basm}
  lddqu xmm4, [eax+ecx+64]
  lddqu xmm5, [eax+ecx+80]
  lddqu xmm6, [eax+ecx+96]
  lddqu xmm7, [eax+ecx+112]
{$ELSE}
  DB $F2,$0F,$F0,$64,$01,$40   {lddqu xmm4, [eax+ecx+64]}
  DB $F2,$0F,$F0,$6C,$01,$50   {lddqu xmm5, [eax+ecx+80]}
  DB $F2,$0F,$F0,$74,$01,$60   {lddqu xmm6, [eax+ecx+96]}
  DB $F2,$0F,$F0,$7C,$01,$70   {lddqu xmm7, [eax+ecx+112]}
{$ENDIF}
  movntdq [edx+ecx+64], xmm4
  movntdq [edx+ecx+80], xmm5
  movntdq [edx+ecx+96], xmm6
  movntdq [edx+ecx+112], xmm7
  add ecx, 128
  js @@Loop
  sfence
end; {HugeUnalignedSSE3Move}
{-------------------------------------------------------------------------}
{Dest MUST be 16-Byte Aligned, Count MUST be multiple of 16 }
{SSE3 bulk copy dispatcher; same structure as LargeSSE2Move but using
 the lddqu-based loops for the unaligned-Source cases (the aligned cases
 reuse the SSE2 loops). Dest MUST be 16-byte aligned, Count MUST be a
 multiple of 16. Register convention: EAX = Source, EDX = Dest, ECX = Count.}
procedure LargeSSE3Move(const Source; var Dest; Count: Integer);
asm
  push ebx
  mov ebx, ecx
  and ecx, -128 {No of Bytes to Block Move (Multiple of 128)}
  add eax, ecx {End of Source Blocks}
  add edx, ecx {End of Dest Blocks}
  neg ecx       {Loops run ECX from -blockbytes up to 0}
  cmp ecx, CacheLimit {Count > Limit - Use Prefetch}
  jl @@Huge     {-blockbytes < -(L2/2) means copy exceeds half the L2 cache}
  test eax, 15 {Check if Both Source/Dest are Aligned}
  jnz @@LargeUnaligned
  call LargeAlignedSSE2Move {Both Source and Dest 16-Byte Aligned}
  jmp @@Remainder
@@LargeUnaligned: {Source Not 16-Byte Aligned}
  call LargeUnalignedSSE3Move
  jmp @@Remainder
@@Huge:
  test eax, 15 {Check if Both Source/Dest Aligned}
  jnz @@HugeUnaligned
  call HugeAlignedSSE2Move {Both Source and Dest 16-Byte Aligned}
  jmp @@Remainder
@@HugeUnaligned: {Source Not 16-Byte Aligned}
  call HugeUnalignedSSE3Move
@@Remainder:
  and ebx, $7F {Remainder (0..112 - Multiple of 16)}
  jz @@Done
  add eax, ebx
  add edx, ebx
  neg ebx
@@RemainderLoop:
{$IFDEF SSE2Basm}
  lddqu xmm0, [eax+ebx]
{$ELSE}
  DB $F2,$0F,$F0,$04,$03       {lddqu xmm0, [eax+ebx]}
{$ENDIF}
  movdqa [edx+ebx], xmm0
  add ebx, 16
  jnz @@RemainderLoop
@@Done:
  pop ebx
end; {LargeSSE3Move}
{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (TINYSIZE)}
{Forward move of ECX bytes from EAX to EDX using SSE3, EAX > EDX,
 ECX > TINYSIZE. Same structure as Forwards_SSE2_10, with lddqu for the
 unaligned loads and LargeSSE3Move for the LARGESIZE regime. The DB
 sequences are lddqu opcode bytes for pre-SSE3 BASM compilers.}
procedure Forwards_SSE3_10;
const
  SMALLSIZE = 64;
  LARGESIZE = 2048;
asm
  cmp ecx, SMALLSIZE
  jle Forwards_IA32_10
  push ebx
  cmp ecx, LARGESIZE
  jge @@FwdLargeMove
{$IFDEF SSE2Basm}
  lddqu xmm0, [eax] {First 16 Bytes}
{$ELSE}
  DB $F2,$0F,$F0,$00           {lddqu xmm0, [eax]}
{$ENDIF}
  mov ebx, edx       {EBX = Dest start, for the deferred first-16 store}
  add eax, ecx {Align Writes}
  add ecx, edx
  and edx, -16
  add edx, 48
  sub ecx, edx
  add edx, ecx
  neg ecx            {ECX = negative offset from end of 32-byte blocks}
@@FwdLoopSSE3:
{$IFDEF SSE2Basm}
  lddqu xmm1, [eax+ecx-32]
  lddqu xmm2, [eax+ecx-16]
{$ELSE}
  DB $F2,$0F,$F0,$4C,$01,$E0   {lddqu xmm1, [eax+ecx-32]}
  DB $F2,$0F,$F0,$54,$01,$F0   {lddqu xmm2, [eax+ecx-16]}
{$ENDIF}
  movdqa [edx+ecx-32], xmm1
  movdqa [edx+ecx-16], xmm2
  add ecx, 32
  jle @@FwdLoopSSE3
  movdqu [ebx], xmm0 {First 16 Bytes}
  neg ecx
  add ecx, 32        {ECX = remaining tail bytes for the small move}
  pop ebx
  jmp SmallForwardMove_10
@@FwdLargeMove:
  mov ebx, ecx
  test edx, 15
  jz @@FwdLargeAligned
  lea ecx, [edx+15] {16 byte Align Destination}
  and ecx, -16
  sub ecx, edx
  add eax, ecx
  add edx, ecx
  sub ebx, ecx
  call SmallForwardMove_10 {Copy the unaligned head bytes}
  mov ecx, ebx
@@FwdLargeAligned:
  and ecx, -16
  sub ebx, ecx {EBX = Remainder}
  push edx     {LargeSSE3Move destroys EAX/EDX/ECX: preserve for the tail}
  push eax
  push ecx
  call LargeSSE3Move
  pop ecx
  pop eax
  pop edx
  add ecx, ebx
  add eax, ecx {Small move expects Source+Count/Dest+Count}
  add edx, ecx
  mov ecx, ebx
  pop ebx
  jmp SmallForwardMove_10
end; {Forwards_SSE3}
{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (TINYSIZE)}
{Backward move of ECX bytes from EAX to EDX using SSE3, EAX < EDX,
 ECX > TINYSIZE (overlap-safe for Dest above Source). Same structure as
 Backwards_SSE2_10 with lddqu unaligned loads; the DB sequences are lddqu
 opcode bytes for pre-SSE3 BASM compilers.}
procedure Backwards_SSE3_10;
const
  SMALLSIZE = 64;
asm
  cmp ecx, SMALLSIZE
  jle Backwards_IA32_10
  push ebx
{$IFDEF SSE2Basm}
  lddqu xmm0, [eax+ecx-16] {Last 16 Bytes}
{$ELSE}
  DB $F2,$0F,$F0,$44,$01,$F0   {lddqu xmm0, [eax+ecx-16]}
{$ENDIF}
  lea ebx, [edx+ecx] {Align Writes}
  and ebx, 15
  sub ecx, ebx       {Drop the misaligned tail from the loop count}
  add ebx, ecx       {EBX = offset of the saved last-16 slot}
  sub ecx, 32
  add edi, 0 {3-Byte NOP Equivalent to Align Loop}
@@BwdLoop:
{$IFDEF SSE2Basm}
  lddqu xmm1, [eax+ecx]
  lddqu xmm2, [eax+ecx+16]
{$ELSE}
  DB $F2,$0F,$F0,$0C,$01       {lddqu xmm1, [eax+ecx]}
  DB $F2,$0F,$F0,$54,$01,$10   {lddqu xmm2, [eax+ecx+16]}
{$ENDIF}
  movdqa [edx+ecx], xmm1
  movdqa [edx+ecx+16], xmm2
  sub ecx, 32
  jge @@BwdLoop
  movdqu [edx+ebx-16], xmm0 {Last 16 Bytes}
  add ecx, 32        {ECX = remaining head bytes for the small move}
  pop ebx
  jmp SmallBackwardMove_10
end; {Backwards_SSE3}
{-------------------------------------------------------------------------}
{Move using IA32 Instruction Set Only}
{Top-level Move using only IA32 instructions. System.Move-compatible:
 Source/Dest may overlap; Count <= 0 and Source = Dest are no-ops.
 Register convention (Delphi fastcall): EAX = @Source, EDX = @Dest,
 ECX = Count. Dispatches to the small/forward/backward workers.}
procedure MoveJOH_IA32_10(const Source; var Dest; Count : Integer);
asm
  cmp ecx, TINYSIZE
  ja @@Large {Count > TINYSIZE or Count < 0}
  cmp eax, edx
  jbe @@SmallCheck
  add eax, ecx {Small forward move expects Source+Count and Dest+Count}
  add edx, ecx
  jmp SmallForwardMove_10
@@SmallCheck:
  jne SmallBackwardMove_10
  ret {For Compatibility with Delphi's move for Source = Dest}
@@Large:
  jng @@Done {For Compatibility with Delphi's move for Count < 0}
  cmp eax, edx
  ja Forwards_IA32_10
  je @@Done {For Compatibility with Delphi's move for Source = Dest}
  sub edx, ecx {Overlap test: forward is safe when Source+Count <= Dest}
  cmp eax, edx
  lea edx, [edx+ecx]
  jna Forwards_IA32_10
  jmp Backwards_IA32_10 {Source/Dest Overlap}
@@Done:
end; {MoveJOH_IA32}
{-------------------------------------------------------------------------}
{Move using MMX Instruction Set}
{Top-level Move using the MMX instruction set. System.Move-compatible:
 Source/Dest may overlap; Count <= 0 and Source = Dest are no-ops.
 Register convention (Delphi fastcall): EAX = @Source, EDX = @Dest,
 ECX = Count. Dispatches to the small/forward/backward workers.}
procedure MoveJOH_MMX_10(const Source; var Dest; Count : Integer);
asm
  cmp ecx, TINYSIZE
  ja @@Large {Count > TINYSIZE or Count < 0}
  cmp eax, edx
  jbe @@SmallCheck
  add eax, ecx {Small forward move expects Source+Count and Dest+Count}
  add edx, ecx
  jmp SmallForwardMove_10
@@SmallCheck:
  jne SmallBackwardMove_10
  ret {For Compatibility with Delphi's move for Source = Dest}
@@Large:
  jng @@Done {For Compatibility with Delphi's move for Count < 0}
  cmp eax, edx
  ja Forwards_MMX_10
  je @@Done {For Compatibility with Delphi's move for Source = Dest}
  sub edx, ecx {Overlap test: forward is safe when Source+Count <= Dest}
  cmp eax, edx
  lea edx, [edx+ecx]
  jna Forwards_MMX_10
  jmp Backwards_MMX_10 {Source/Dest Overlap}
@@Done:
end; {MoveJOH_MMX}
{-------------------------------------------------------------------------}
{Move using SSE Instruction Set}
{Top-level Move using the SSE instruction set. System.Move-compatible:
 Source/Dest may overlap; Count <= 0 and Source = Dest are no-ops.
 Register convention (Delphi fastcall): EAX = @Source, EDX = @Dest,
 ECX = Count. Dispatches to the small/forward/backward workers.}
procedure MoveJOH_SSE_10(const Source; var Dest; Count : Integer);
asm
  cmp ecx, TINYSIZE
  ja @@Large {Count > TINYSIZE or Count < 0}
  cmp eax, edx
  jbe @@SmallCheck
  add eax, ecx {Small forward move expects Source+Count and Dest+Count}
  add edx, ecx
  jmp SmallForwardMove_10
@@SmallCheck:
  jne SmallBackwardMove_10
  ret {For Compatibility with Delphi's move for Source = Dest}
@@Large:
  jng @@Done {For Compatibility with Delphi's move for Count < 0}
  cmp eax, edx
  ja Forwards_SSE_10
  je @@Done {For Compatibility with Delphi's move for Source = Dest}
  sub edx, ecx {Overlap test: forward is safe when Source+Count <= Dest}
  cmp eax, edx
  lea edx, [edx+ecx]
  jna Forwards_SSE_10
  jmp Backwards_SSE_10 {Source/Dest Overlap}
@@Done:
end; {MoveJOH_SSE}
{-------------------------------------------------------------------------}
{Move using SSE2 Instruction Set}
{Top-level Move using the SSE2 instruction set. System.Move-compatible:
 Source/Dest may overlap; Count <= 0 and Source = Dest are no-ops.
 Register convention (Delphi fastcall): EAX = @Source, EDX = @Dest,
 ECX = Count. Dispatches to the small/forward/backward workers.}
procedure MoveJOH_SSE2_10(const Source; var Dest; Count : Integer);
asm
  cmp ecx, TINYSIZE
  ja @@Large {Count > TINYSIZE or Count < 0}
  cmp eax, edx
  jbe @@SmallCheck
  add eax, ecx {Small forward move expects Source+Count and Dest+Count}
  add edx, ecx
  jmp SmallForwardMove_10
@@SmallCheck:
  jne SmallBackwardMove_10
  ret {For Compatibility with Delphi's move for Source = Dest}
@@Large:
  jng @@Done {For Compatibility with Delphi's move for Count < 0}
  cmp eax, edx
  ja Forwards_SSE2_10
  je @@Done {For Compatibility with Delphi's move for Source = Dest}
  sub edx, ecx {Overlap test: forward is safe when Source+Count <= Dest}
  cmp eax, edx
  lea edx, [edx+ecx]
  jna Forwards_SSE2_10
  jmp Backwards_SSE2_10 {Source/Dest Overlap}
@@Done:
end; {MoveJOH_SSE2}
{-------------------------------------------------------------------------}
{Move using SSE3 Instruction Set}
{Top-level Move using the SSE3 instruction set. System.Move-compatible:
 Source/Dest may overlap; Count <= 0 and Source = Dest are no-ops.
 Register convention (Delphi fastcall): EAX = @Source, EDX = @Dest,
 ECX = Count. Dispatches to the small/forward/backward workers.}
procedure MoveJOH_SSE3_10(const Source; var Dest; Count : Integer);
asm
  cmp ecx, TINYSIZE
  ja @@Large {Count > TINYSIZE or Count < 0}
  cmp eax, edx
  jbe @@SmallCheck
  add eax, ecx {Small forward move expects Source+Count and Dest+Count}
  add edx, ecx
  jmp SmallForwardMove_10
@@SmallCheck:
  jne SmallBackwardMove_10
  ret {For Compatibility with Delphi's move for Source = Dest}
@@Large:
  jng @@Done {For Compatibility with Delphi's move for Count < 0}
  cmp eax, edx
  ja Forwards_SSE3_10
  je @@Done {For Compatibility with Delphi's move for Source = Dest}
  sub edx, ecx {Overlap test: forward is safe when Source+Count <= Dest}
  cmp eax, edx
  lea edx, [edx+ecx]
  jna Forwards_SSE3_10
  jmp Backwards_SSE3_10 {Source/Dest Overlap}
@@Done:
end; {MoveJOH_SSE3}
{-------------------------------------------------------------------------}
{$IFDEF PurePascal}
{Patch System.Move to divert calls to the New Move procedure: writes a
 5-byte relative JMP ($E9 + 32-bit displacement) over System.Move's entry
 point, targeting the Move procedure variable's current destination.
 Memory is made writable with PAGE_EXECUTE_READWRITE (required under DEP)
 and the instruction cache is flushed afterwards.
 NOTE(review): VirtualProtect results are not checked - failure would
 cause an access violation on the write; presumably acceptable here.}
procedure PatchMove; {Patch System.Move to Divert Calls to New Move Procedure}
const
  JumpFarId = $E9; {Opcode of JMP rel32}
var
  Protect, OldProtect : DWORD;
begin
  VirtualProtect(@System.Move, 256, PAGE_EXECUTE_READWRITE, @OldProtect);
  if PByte(@System.Move)^ <> JumpFarId then {Check if Already Patched}
  begin
    PByte(@System.Move)^ := JumpFarId;
    PInteger(Integer(@System.Move)+1)^ :=
      Integer(@Move) - Integer(@System.Move)-5; {Change Destination}
  end;
  VirtualProtect(@System.Move, 256, OldProtect, @Protect);
  FlushInstructionCache(GetCurrentProcess, @System.Move, 256);
end; {PatchMove}
{$ELSE}
{Overwrite System.Move with the main procedure of the New Move.
 Two cases:
 - If System.Move starts with JMP DWORD PTR [mem] ($25FF) the code lives
   in a runtime package: replace it with a 5-byte relative JMP to Move.
 - Otherwise copy the selected Move dispatcher's machine code (56 bytes)
   directly over System.Move (64 bytes available), fixing up the five
   relative jump displacements whose offsets are hard-coded in
   NewMoveType to account for the code's new address.
 NOTE(review): VirtualProtect results are not checked; the NewMoveType
 layout must exactly match the dispatcher procedures above.}
procedure PatchMove; {Overwrite System.Move with Main Procedure of New Move}
const
  JumpFarId = $E9;   {Opcode of JMP rel32}
  JumpPtrId = $25FF; {Opcode of JMP DWORD PTR [mem] (package import stub)}
type {Jump Positions in Move Procedures Above - Really Horrible but it Works}
  NewMoveType = packed record {Size = 56 Bytes, System.Move Size = 64 Bytes}
    Padding1 : array[1..14] of Byte;
    Jump1Dest : Integer; {jmp SmallForwardMove}
    Padding2 : array[1.. 2] of Byte;
    Jump2Dest : Integer; {jmp SmallBackwardMove}
    Padding3 : array[1.. 7] of Byte;
    Jump3Dest : Integer; {jg Forwards_XXX}
    Padding4 : array[1..11] of Byte;
    Jump4Dest : Integer; {jg Backwards_XXX}
    Padding5 : array[1.. 1] of Byte;
    Jump5Dest : Integer; {jmp Forwards_XXX}
    Padding6 : array[1.. 1] of Byte;
  end;
var
  I, Offset : Integer;
  Src, Dest : PByteArray;
  NewMove : NewMoveType;
  Protect,
  OldProtect : DWORD;
begin
  VirtualProtect(@System.Move, 256, PAGE_EXECUTE_READWRITE, @OldProtect);
  if PByte(@System.Move)^ <> JumpFarId then {Check if Already Patched}
    if PWord(@System.Move)^ = JumpPtrId then
    begin {System.Move Starts JMP DWORD PTR [XXXXXXXX] (ie. Using Packages)}
      PByte(@System.Move)^ := JumpFarId;
      PInteger(Integer(@System.Move)+1)^ :=
        Integer(@Move) - Integer(@System.Move)-5; {Change Destination}
    end
    else
    begin {Patch System.Move. Adjust Jump Destinations in Copied Procedure}
      Move(Pointer(@Move)^, NewMove, SizeOf(NewMove)); {Copy dispatcher code}
      Offset := Integer(@Move) - Integer(@System.Move);
      {Relative jumps must be adjusted by the distance the code moved}
      Inc(NewMove.Jump1Dest, Offset);
      Inc(NewMove.Jump2Dest, Offset);
      Inc(NewMove.Jump3Dest, Offset);
      Inc(NewMove.Jump4Dest, Offset);
      Inc(NewMove.Jump5Dest, Offset);
      Src := @NewMove;
      Dest := @System.Move;
      for I := 0 to SizeOf(NewMove) - 1 do
        Dest[I] := Src[I]; {Overwrite System.Move}
    end;
  VirtualProtect(@System.Move, 256, OldProtect, @Protect);
  FlushInstructionCache(GetCurrentProcess, @System.Move, SizeOf(NewMove));
end; {PatchMove}
{$ENDIF}
initialization
{$IFDEF PurePascal}
  Move := MoveJOH_PAS_10; {No BASM available: use the pure Pascal move}
{$ELSE}
  {Bind the Move variable to the fastest variant this CPU supports}
  if isSSE3 in CPU.InstructionSupport then
    Move := MoveJOH_SSE3_10 {Processor Supports SSE3}
  else
    if isSSE2 in CPU.InstructionSupport then
      Move := MoveJOH_SSE2_10 {Processor Supports SSE2}
    else
      if isSSE in CPU.InstructionSupport then
        Move := MoveJOH_SSE_10 {Processor Supports SSE}
      else
        if isMMX in CPU.InstructionSupport then
          Move := MoveJOH_MMX_10 {Processor Supports MMX}
        else
          Move := MoveJOH_IA32_10; {Processor does not Support MMX or SSE}
  {L2CacheSize is in KB, so * -512 = -(half the L2 size in bytes); the SSE
   moves compare this negative threshold against their negated byte count}
  CacheLimit := CPU.L2CacheSize * -512; {Used within SSE Based Moves}
{$ENDIF}
{$IFDEF PatchSystemMove}
  PatchMove; {Patch Delphi's System.Move}
{$ENDIF}
end.
oooO ↘┏━┓ ↙ Oooo
( 踩)→┃你┃ ←(死 )
\ ( →┃√┃ ← ) /
\_)↗┗━┛ ↖(_/
