audk/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S

141 lines
4.2 KiB
ArmAsm

//
// Copyright (c) 2014, ARM Limited
// All rights Reserved.
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
// Assumptions:
//
// ARMv8-a, AArch64
// Neon Available.
//
// Arguments and results.
#define srcin x0
#define cntin x1
#define chrin w2
#define result x0
#define src x3
#define tmp x4
#define wtmp2 w5
#define synd x6
#define soff x9
#define cntrem x10
#define vrepchr v0
#define vdata1 v1
#define vdata2 v2
#define vhas_chr1 v3
#define vhas_chr2 v4
#define vrepmask v5
#define vend v6
//
// Core algorithm:
//
// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
// per byte. For each tuple, bit 0 is set if the relevant byte matched the
// requested character and bit 1 is not used (faster than using a 32bit
// syndrome). Since the bits in the syndrome reflect exactly the order in which
// things occur in the original string, counting trailing zeros allows to
// identify exactly which byte has matched.
//
ASM_GLOBAL ASM_PFX(InternalMemScanMem8)
ASM_PFX(InternalMemScanMem8):
AARCH64_BTI(c)
// Do not dereference srcin if no bytes to compare.
cbz cntin, .Lzero_length
//
// Magic constant 0x40100401 allows us to identify which lane matches
// the requested byte.
//
mov wtmp2, #0x0401
movk wtmp2, #0x4010, lsl #16
dup vrepchr.16b, chrin
// Work with aligned 32-byte chunks
bic src, srcin, #31
dup vrepmask.4s, wtmp2
ands soff, srcin, #31
and cntrem, cntin, #31
b.eq .Lloop
//
// Input string is not 32-byte aligned. We calculate the syndrome
// value for the aligned 32 bytes block containing the first bytes
// and mask the irrelevant part.
//
ld1 {vdata1.16b, vdata2.16b}, [src], #32
sub tmp, soff, #32
adds cntin, cntin, tmp
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
addp vend.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
addp vend.16b, vend.16b, vend.16b // 128->64
mov synd, vend.d[0]
// Clear the soff*2 lower bits
lsl tmp, soff, #1
lsr synd, synd, tmp
lsl synd, synd, tmp
// The first block can also be the last
b.ls .Lmasklast
// Have we found something already?
cbnz synd, .Ltail
.Lloop:
ld1 {vdata1.16b, vdata2.16b}, [src], #32
subs cntin, cntin, #32
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
// If we're out of data we finish regardless of the result
b.ls .Lend
// Use a fast check for the termination condition
orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
addp vend.2d, vend.2d, vend.2d
mov synd, vend.d[0]
// We're not out of data, loop if we haven't found the character
cbz synd, .Lloop
.Lend:
// Termination condition found, let's calculate the syndrome value
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
addp vend.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
addp vend.16b, vend.16b, vend.16b // 128->64
mov synd, vend.d[0]
// Only do the clear for the last possible block
b.hi .Ltail
.Lmasklast:
// Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits
add tmp, cntrem, soff
and tmp, tmp, #31
sub tmp, tmp, #32
neg tmp, tmp, lsl #1
lsl synd, synd, tmp
lsr synd, synd, tmp
.Ltail:
// Count the trailing zeros using bit reversing
rbit synd, synd
// Compensate the last post-increment
sub src, src, #32
// Check that we have found a character
cmp synd, #0
// And count the leading zeros
clz synd, synd
// Compute the potential result
add result, src, synd, lsr #1
// Select result or NULL
csel result, xzr, result, eq
ret
.Lzero_length:
mov result, #0
ret