BaseMemoryLibSse2: Take advantage of write combining buffers

The current SSE2 implementation of the ZeroMem(), SetMem(),
SetMem16(), SetMem32() and SetMem64() functions writes only 16 bytes
per loop iteration. This hurts performance so badly that it is even
slower than a simple 'rep stos' (by about 4%) in regular DRAM.
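
For illustration only (this C sketch is mine, not part of the patch),
the untouched inner loop behaves roughly as follows with SSE2
intrinsics. The function name is hypothetical, and the destination is
assumed to be 16-byte aligned with a length that is a multiple of 16:

#include <emmintrin.h>  // SSE2 intrinsics: _mm_stream_si128 maps to movntdq
#include <stddef.h>

static void
ZeroMemOneDqwordPerIteration (void *Buffer, size_t Length)
{
  __m128i  Zero = _mm_setzero_si128 ();
  __m128i  *P   = (__m128i *)Buffer;   // assumed 16-byte aligned

  // One 16-byte non-temporal store per iteration: a write-combining
  // buffer is typically flushed before it ever holds a full 64-byte line.
  for (size_t Index = 0; Index < Length / 16; Index++) {
    _mm_stream_si128 (P + Index, Zero);
  }

  _mm_mfence ();   // the assembly version ends its loop with 'mfence'
}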

To take full advantage of the 'movntdq' instruction, it is better to
"queue" a total of 64 bytes in the write combining buffers, so that
each buffer is flushed as one full line. This patch implements such a
change. Below is a table where I measured (with 'rdtsc') the time to
write an entire 100MB RAM buffer; with this change, these functions
operate almost two times faster.

| Function | Arch | Untouched (cycles) | 64-byte writes (cycles) | Speedup |
|----------+------+--------------------+-------------------------+---------|
| ZeroMem  | Ia32 |           17765947 |                 9136062 |  1.945x |
| ZeroMem  | X64  |           17525170 |                 9233391 |  1.898x |
| SetMem   | Ia32 |           17522291 |                 9137272 |  1.918x |
| SetMem   | X64  |           17949261 |                 9176978 |  1.956x |
| SetMem16 | Ia32 |           18219673 |                 9372062 |  1.944x |
| SetMem16 | X64  |           17523331 |                 9275184 |  1.889x |
| SetMem32 | Ia32 |           18495036 |                 9273053 |  1.994x |
| SetMem32 | X64  |           17368864 |                 9285885 |  1.870x |
| SetMem64 | Ia32 |           18564473 |                 9241362 |  2.009x |
| SetMem64 | X64  |           17506951 |                 9280148 |  1.886x |
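
To make the table concrete, below is a hedged sketch of the patched
pattern together with an 'rdtsc' timing harness. The C function, the
buffer allocation, and the benchmark scaffolding are mine; only the
four-stores-then-advance-by-64 structure comes from the patch:

#include <emmintrin.h>   // _mm_stream_si128, _mm_mfence
#include <x86intrin.h>   // __rdtsc (GCC/Clang)
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static void
ZeroMemFourDqwordsPerIteration (void *Buffer, size_t Length)
{
  __m128i  Zero = _mm_setzero_si128 ();
  __m128i  *P   = (__m128i *)Buffer;   // assumed 64-byte aligned

  // Four consecutive non-temporal stores fill one complete 64-byte
  // write-combining buffer before it is flushed.
  for (size_t Index = 0; Index < Length / 64; Index++, P += 4) {
    _mm_stream_si128 (P + 0, Zero);
    _mm_stream_si128 (P + 1, Zero);
    _mm_stream_si128 (P + 2, Zero);
    _mm_stream_si128 (P + 3, Zero);
  }

  _mm_mfence ();
}

int
main (void)
{
  size_t    Length = 100 * 1024 * 1024;   // 100MB, as in the table above
  void      *Buffer = aligned_alloc (64, Length);
  uint64_t  Start;

  if (Buffer == NULL) {
    return 1;
  }

  Start = __rdtsc ();
  ZeroMemFourDqwordsPerIteration (Buffer, Length);
  printf ("%llu cycles\n", (unsigned long long)(__rdtsc () - Start));
  free (Buffer);
  return 0;
}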

Signed-off-by: Jeremy Compostella <jeremy.compostella@intel.com>
Reviewed-by: Liming Gao <gaoliming@byosoft.com.cn>

--- a/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem.nasm
+++ b/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem.nasm

@@ -34,7 +34,7 @@ ASM_PFX(InternalMemSetMem):
     mov     al, [esp + 16]             ; al <- Value
     xor     ecx, ecx
     sub     ecx, edi
-    and     ecx, 15                    ; ecx + edi aligns on 16-byte boundary
+    and     ecx, 63                    ; ecx + edi aligns on 16-byte boundary
     jz      .0
     cmp     ecx, edx
     cmova   ecx, edx
@@ -42,8 +42,8 @@ ASM_PFX(InternalMemSetMem):
     rep     stosb
 .0:
     mov     ecx, edx
-    and     edx, 15
-    shr     ecx, 4                     ; ecx <- # of DQwords to set
+    and     edx, 63
+    shr     ecx, 6                     ; ecx <- # of DQwords to set
     jz      @SetBytes
     mov     ah, al                     ; ax <- Value | (Value << 8)
     add     esp, -16
@@ -53,7 +53,10 @@ ASM_PFX(InternalMemSetMem):
     movlhps xmm0, xmm0                 ; xmm0 <- Value repeats 16 times
 .1:
     movntdq [edi], xmm0                ; edi should be 16-byte aligned
-    add     edi, 16
+    movntdq [edi + 16], xmm0
+    movntdq [edi + 32], xmm0
+    movntdq [edi + 48], xmm0
+    add     edi, 64
     loop    .1
     mfence
     movdqu  xmm0, [esp]                ; restore xmm0

--- a/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem16.nasm
+++ b/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem16.nasm

@@ -33,7 +33,7 @@ ASM_PFX(InternalMemSetMem16):
     mov     edi, [esp + 8]
     xor     ecx, ecx
     sub     ecx, edi
-    and     ecx, 15                    ; ecx + edi aligns on 16-byte boundary
+    and     ecx, 63                    ; ecx + edi aligns on 16-byte boundary
     mov     eax, [esp + 16]
     jz      .0
     shr     ecx, 1
@@ -43,15 +43,18 @@ ASM_PFX(InternalMemSetMem16):
     rep     stosw
 .0:
     mov     ecx, edx
-    and     edx, 7
-    shr     ecx, 3
+    and     edx, 31
+    shr     ecx, 5
     jz      @SetWords
     movd    xmm0, eax
     pshuflw xmm0, xmm0, 0
     movlhps xmm0, xmm0
 .1:
     movntdq [edi], xmm0                ; edi should be 16-byte aligned
-    add     edi, 16
+    movntdq [edi + 16], xmm0
+    movntdq [edi + 32], xmm0
+    movntdq [edi + 48], xmm0
+    add     edi, 64
     loop    .1
     mfence
 @SetWords:

--- a/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem32.nasm
+++ b/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem32.nasm

@@ -43,14 +43,17 @@ ASM_PFX(InternalMemSetMem32):
     rep     stosd
 .0:
     mov     ecx, edx
-    and     edx, 3
-    shr     ecx, 2
+    and     edx, 15
+    shr     ecx, 4
     jz      @SetDwords
     movd    xmm0, eax
     pshufd  xmm0, xmm0, 0
 .1:
     movntdq [edi], xmm0
-    add     edi, 16
+    movntdq [edi + 16], xmm0
+    movntdq [edi + 32], xmm0
+    movntdq [edi + 48], xmm0
+    add     edi, 64
     loop    .1
     mfence
 @SetDwords:

--- a/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem64.nasm
+++ b/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem64.nasm

@@ -38,17 +38,29 @@ ASM_PFX(InternalMemSetMem64):
     add     edx, 8
     dec     ecx
 .0:
-    shr     ecx, 1
+    push    ebx
+    mov     ebx, ecx
+    and     ebx, 7
+    shr     ecx, 3
     jz      @SetQwords
     movlhps xmm0, xmm0
 .1:
     movntdq [edx], xmm0
-    lea     edx, [edx + 16]
+    movntdq [edx + 16], xmm0
+    movntdq [edx + 32], xmm0
+    movntdq [edx + 48], xmm0
+    lea     edx, [edx + 64]
     loop    .1
     mfence
 @SetQwords:
-    jnc     .2
+    test    ebx, ebx
+    jz      .3
+    mov     ecx, ebx
+.2:
     movq    qword [edx], xmm0
-.2:
+    lea     edx, [edx + 8]
+    loop    .2
+.3:
+    pop     ebx
     ret
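
The SetMem64 hunks are the most invasive ones: the old code used the
carry flag left by 'shr ecx, 1' to write at most one leftover qword,
whereas the new code keeps the remainder count (0..7 qwords) in
ebx/rbx and drains it after the streaming loop, with a movq loop here
on Ia32 and with 'rep stosq' in the X64 version further below. A
hedged C rendering of the new flow (names are mine; the buffer is
assumed 16-byte aligned, which the real code establishes in its
prologue):

#include <emmintrin.h>
#include <stddef.h>

static void
SetMem64Sketch (unsigned long long *Buffer,   // assumed 16-byte aligned
                size_t             Count,     // number of qwords
                unsigned long long Value)
{
  __m128i  Pattern = _mm_set1_epi64x ((long long)Value);
  size_t   Rest    = Count & 7;               // what ebx/rbx holds

  // Stream 8 qwords (64 bytes) per iteration.
  for (size_t Index = 0; Index < Count >> 3; Index++, Buffer += 8) {
    _mm_stream_si128 ((__m128i *)Buffer + 0, Pattern);
    _mm_stream_si128 ((__m128i *)Buffer + 1, Pattern);
    _mm_stream_si128 ((__m128i *)Buffer + 2, Pattern);
    _mm_stream_si128 ((__m128i *)Buffer + 3, Pattern);
  }

  _mm_mfence ();

  while (Rest-- > 0) {                        // the '.2' loop / 'rep stosq'
    *Buffer++ = Value;
  }
}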

--- a/MdePkg/Library/BaseMemoryLibSse2/Ia32/ZeroMem.nasm
+++ b/MdePkg/Library/BaseMemoryLibSse2/Ia32/ZeroMem.nasm

@@ -33,7 +33,7 @@ ASM_PFX(InternalMemZeroMem):
     xor     ecx, ecx
     sub     ecx, edi
     xor     eax, eax
-    and     ecx, 15
+    and     ecx, 63
     jz      .0
     cmp     ecx, edx
     cmova   ecx, edx
@@ -41,13 +41,16 @@ ASM_PFX(InternalMemZeroMem):
     rep     stosb
 .0:
     mov     ecx, edx
-    and     edx, 15
-    shr     ecx, 4
+    and     edx, 63
+    shr     ecx, 6
     jz      @ZeroBytes
     pxor    xmm0, xmm0
 .1:
     movntdq [edi], xmm0
-    add     edi, 16
+    movntdq [edi + 16], xmm0
+    movntdq [edi + 32], xmm0
+    movntdq [edi + 48], xmm0
+    add     edi, 64
     loop    .1
     mfence
 @ZeroBytes:

--- a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem.nasm
+++ b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem.nasm

@@ -42,8 +42,8 @@ ASM_PFX(InternalMemSetMem):
     rep     stosb
 .0:
     mov     rcx, rdx
-    and     rdx, 15
-    shr     rcx, 4
+    and     rdx, 63
+    shr     rcx, 6
     jz      @SetBytes
     mov     ah, al                     ; ax <- Value repeats twice
     movdqa  [rsp + 0x10], xmm0         ; save xmm0
@@ -52,7 +52,10 @@ ASM_PFX(InternalMemSetMem):
     movlhps xmm0, xmm0                 ; xmm0 <- Value repeats 16 times
 .1:
     movntdq [rdi], xmm0                ; rdi should be 16-byte aligned
-    add     rdi, 16
+    movntdq [rdi + 16], xmm0
+    movntdq [rdi + 32], xmm0
+    movntdq [rdi + 48], xmm0
+    add     rdi, 64
     loop    .1
     mfence
     movdqa  xmm0, [rsp + 0x10]         ; restore xmm0

--- a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem16.nasm
+++ b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem16.nasm

@@ -33,7 +33,7 @@ ASM_PFX(InternalMemSetMem16):
     mov     r9, rdi
     xor     rcx, rcx
     sub     rcx, rdi
-    and     rcx, 15
+    and     rcx, 63
     mov     rax, r8
     jz      .0
     shr     rcx, 1
@@ -43,15 +43,18 @@ ASM_PFX(InternalMemSetMem16):
     rep     stosw
 .0:
     mov     rcx, rdx
-    and     edx, 7
-    shr     rcx, 3
+    and     edx, 31
+    shr     rcx, 5
     jz      @SetWords
     movd    xmm0, eax
     pshuflw xmm0, xmm0, 0
     movlhps xmm0, xmm0
 .1:
     movntdq [rdi], xmm0
-    add     rdi, 16
+    movntdq [rdi + 16], xmm0
+    movntdq [rdi + 32], xmm0
+    movntdq [rdi + 48], xmm0
+    add     rdi, 64
     loop    .1
     mfence
 @SetWords:

--- a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem32.nasm
+++ b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem32.nasm

@@ -43,14 +43,17 @@ ASM_PFX(InternalMemSetMem32):
     rep     stosd
 .0:
     mov     rcx, rdx
-    and     edx, 3
-    shr     rcx, 2
+    and     edx, 15
+    shr     rcx, 4
     jz      @SetDwords
     movd    xmm0, eax
     pshufd  xmm0, xmm0, 0
 .1:
     movntdq [rdi], xmm0
-    add     rdi, 16
+    movntdq [rdi + 16], xmm0
+    movntdq [rdi + 32], xmm0
+    movntdq [rdi + 48], xmm0
+    add     rdi, 64
     loop    .1
     mfence
 @SetDwords:

--- a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem64.nasm
+++ b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem64.nasm

@@ -37,17 +37,28 @@ ASM_PFX(InternalMemSetMem64):
     add     rdx, 8
     dec     rcx
 .0:
-    shr     rcx, 1
+    push    rbx
+    mov     rbx, rcx
+    and     rbx, 7
+    shr     rcx, 3
     jz      @SetQwords
     movlhps xmm0, xmm0
 .1:
     movntdq [rdx], xmm0
-    lea     rdx, [rdx + 16]
+    movntdq [rdx + 16], xmm0
+    movntdq [rdx + 32], xmm0
+    movntdq [rdx + 48], xmm0
+    lea     rdx, [rdx + 64]
     loop    .1
     mfence
 @SetQwords:
-    jnc     .2
-    mov     [rdx], r8
+    push    rdi
+    mov     rcx, rbx
+    mov     rax, r8
+    mov     rdi, rdx
+    rep     stosq
+    pop     rdi
 .2:
+    pop     rbx
     ret

--- a/MdePkg/Library/BaseMemoryLibSse2/X64/ZeroMem.nasm
+++ b/MdePkg/Library/BaseMemoryLibSse2/X64/ZeroMem.nasm

@@ -32,7 +32,7 @@ ASM_PFX(InternalMemZeroMem):
     xor     rcx, rcx
     xor     eax, eax
     sub     rcx, rdi
-    and     rcx, 15
+    and     rcx, 63
     mov     r8, rdi
     jz      .0
     cmp     rcx, rdx
@@ -41,13 +41,16 @@ ASM_PFX(InternalMemZeroMem):
     rep     stosb
 .0:
     mov     rcx, rdx
-    and     edx, 15
-    shr     rcx, 4
+    and     edx, 63
+    shr     rcx, 6
     jz      @ZeroBytes
     pxor    xmm0, xmm0
 .1:
-    movntdq [rdi], xmm0                ; rdi should be 16-byte aligned
-    add     rdi, 16
+    movntdq [rdi], xmm0
+    movntdq [rdi + 16], xmm0
+    movntdq [rdi + 32], xmm0
+    movntdq [rdi + 48], xmm0
+    add     rdi, 64
     loop    .1
     mfence
 @ZeroBytes: