//
// Copyright (c) 2014, ARM Limited
// All rights Reserved.
// SPDX-License-Identifier: BSD-2-Clause-Patent
//

// Assumptions:
//
// ARMv8-a, AArch64
// Neon Available.
//

// Arguments and results.
#define srcin         x0
#define cntin         x1
#define chrin         w2

#define result        x0

#define src           x3
#define tmp           x4
#define wtmp2         w5
#define synd          x6
#define soff          x9
#define cntrem        x10

#define vrepchr       v0
#define vdata1        v1
#define vdata2        v2
#define vhas_chr1     v3
#define vhas_chr2     v4
#define vrepmask      v5
#define vend          v6

//
// Core algorithm:
//
// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
// per byte. For each tuple, bit 0 is set if the relevant byte matched the
// requested character and bit 1 is not used (faster than using a 32-bit
// syndrome). Since the bits in the syndrome reflect exactly the order in which
// things occur in the original string, counting trailing zeros identifies
// exactly which byte has matched.
//
// A rough C sketch of this syndrome scheme appears in a comment at the end of
// this file.
//

ASM_GLOBAL ASM_PFX(InternalMemScanMem8)
ASM_PFX(InternalMemScanMem8):
    AARCH64_BTI(c)
    // Do not dereference srcin if no bytes to compare.
    cbz     cntin, .Lzero_length

    //
    // Magic constant 0x40100401 allows us to identify which lane matches
    // the requested byte.
    //
    mov     wtmp2, #0x0401
    movk    wtmp2, #0x4010, lsl #16
    dup     vrepchr.16b, chrin
    // Work with aligned 32-byte chunks
    bic     src, srcin, #31
    dup     vrepmask.4s, wtmp2
    ands    soff, srcin, #31
    and     cntrem, cntin, #31
    b.eq    .Lloop

    //
    // Input string is not 32-byte aligned. We calculate the syndrome
    // value for the aligned 32-byte block containing the first bytes
    // and mask the irrelevant part.
    //
    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    sub     tmp, soff, #32
    adds    cntin, cntin, tmp
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b      // 256->128
    addp    vend.16b, vend.16b, vend.16b                // 128->64
    mov     synd, vend.d[0]
    // Clear the soff*2 lower bits
    lsl     tmp, soff, #1
    lsr     synd, synd, tmp
    lsl     synd, synd, tmp
    // The first block can also be the last
    b.ls    .Lmasklast
    // Have we found something already?
    cbnz    synd, .Ltail

.Lloop:
    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    subs    cntin, cntin, #32
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    // If we're out of data, we finish regardless of the result
    b.ls    .Lend
    // Use a fast check for the termination condition
    orr     vend.16b, vhas_chr1.16b, vhas_chr2.16b
    addp    vend.2d, vend.2d, vend.2d
    mov     synd, vend.d[0]
    // We're not out of data, loop if we haven't found the character
    cbz     synd, .Lloop

.Lend:
    // Termination condition found, let's calculate the syndrome value
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b      // 256->128
    addp    vend.16b, vend.16b, vend.16b                // 128->64
    mov     synd, vend.d[0]
    // Only do the clear for the last possible block
    b.hi    .Ltail

.Lmasklast:
    // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits
    add     tmp, cntrem, soff
    and     tmp, tmp, #31
    sub     tmp, tmp, #32
    neg     tmp, tmp, lsl #1
    lsl     synd, synd, tmp
    lsr     synd, synd, tmp

.Ltail:
    // Count the trailing zeros using bit reversing
    rbit    synd, synd
    // Compensate the last post-increment
    sub     src, src, #32
    // Check that we have found a character
    cmp     synd, #0
    // And count the leading zeros
    clz     synd, synd
    // Compute the potential result
    add     result, src, synd, lsr #1
    // Select result or NULL
    csel    result, xzr, result, eq
    ret

.Lzero_length:
    mov     result, #0
    ret
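
//
// For reference only: a minimal, unoptimized C sketch of the syndrome scheme
// described in the "Core algorithm" comment above. It is not part of this
// library; the helper name ScanChunkSyndrome and the use of __builtin_ctzll
// (standing in for the rbit/clz pair in the assembly) are illustrative
// assumptions.
//
//   // Build the 64-bit syndrome for one 32-byte chunk: bit (2*Index) is set
//   // when Chunk[Index] equals Value; the odd bits stay clear.
//   static unsigned long long
//   ScanChunkSyndrome (const unsigned char *Chunk, unsigned char Value)
//   {
//     unsigned long long  Syndrome;
//     unsigned int        Index;
//
//     Syndrome = 0;
//     for (Index = 0; Index < 32; Index++) {
//       if (Chunk[Index] == Value) {
//         Syndrome |= 1ULL << (2 * Index);
//       }
//     }
//     return Syndrome;
//   }
//
//   // A non-zero syndrome locates the first match within the chunk:
//   //   Offset = __builtin_ctzll (Syndrome) / 2;
//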