//
// Copyright (c) 2014, ARM Limited
// All rights Reserved.
// SPDX-License-Identifier: BSD-2-Clause-Patent
//

// Assumptions:
//
// ARMv8-a, AArch64
// Neon Available.
//

// Arguments and results.
#define srcin         x0
#define cntin         x1
#define chrin         w2

#define result        x0

#define src           x3
#define tmp           x4
#define wtmp2         w5
#define synd          x6
#define soff          x9
#define cntrem        x10

#define vrepchr       v0
#define vdata1        v1
#define vdata2        v2
#define vhas_chr1     v3
#define vhas_chr2     v4
#define vrepmask      v5
#define vend          v6

//
// Core algorithm:
//
// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
// per byte. For each tuple, bit 0 is set if the relevant byte matched the
// requested character and bit 1 is not used (faster than using a 32-bit
// syndrome). Since the bits in the syndrome reflect exactly the order in which
// things occur in the original string, counting trailing zeros identifies
// exactly which byte has matched.
//
// A rough C sketch of this syndrome scheme appears in a comment at the end of
// this file.
//

ASM_GLOBAL ASM_PFX(InternalMemScanMem8)
ASM_PFX(InternalMemScanMem8):
    AARCH64_BTI(c)
    // Do not dereference srcin if no bytes to compare.
    cbz     cntin, .Lzero_length

    //
    // Magic constant 0x40100401 allows us to identify which lane matches
    // the requested byte.
    //
    mov     wtmp2, #0x0401
    movk    wtmp2, #0x4010, lsl #16
    dup     vrepchr.16b, chrin
    // Work with aligned 32-byte chunks
    bic     src, srcin, #31
    dup     vrepmask.4s, wtmp2
    ands    soff, srcin, #31
    and     cntrem, cntin, #31
    b.eq    .Lloop

    //
    // Input string is not 32-byte aligned. We calculate the syndrome
    // value for the aligned 32-byte block containing the first bytes
    // and mask the irrelevant part.
    //
    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    sub     tmp, soff, #32
    adds    cntin, cntin, tmp
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b      // 256->128
    addp    vend.16b, vend.16b, vend.16b                // 128->64
    mov     synd, vend.d[0]
    // Clear the soff*2 lower bits
    lsl     tmp, soff, #1
    lsr     synd, synd, tmp
    lsl     synd, synd, tmp
    // The first block can also be the last
    b.ls    .Lmasklast
    // Have we found something already?
    cbnz    synd, .Ltail

.Lloop:
    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    subs    cntin, cntin, #32
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    // If we're out of data, we finish regardless of the result
    b.ls    .Lend
    // Use a fast check for the termination condition
    orr     vend.16b, vhas_chr1.16b, vhas_chr2.16b
    addp    vend.2d, vend.2d, vend.2d
    mov     synd, vend.d[0]
    // We're not out of data, loop if we haven't found the character
    cbz     synd, .Lloop

.Lend:
    // Termination condition found, let's calculate the syndrome value
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b      // 256->128
    addp    vend.16b, vend.16b, vend.16b                // 128->64
    mov     synd, vend.d[0]
    // Only do the clear for the last possible block
    b.hi    .Ltail

.Lmasklast:
    // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits
    add     tmp, cntrem, soff
    and     tmp, tmp, #31
    sub     tmp, tmp, #32
    neg     tmp, tmp, lsl #1
    lsl     synd, synd, tmp
    lsr     synd, synd, tmp

.Ltail:
    // Count the trailing zeros using bit reversing
    rbit    synd, synd
    // Compensate the last post-increment
    sub     src, src, #32
    // Check that we have found a character
    cmp     synd, #0
    // And count the leading zeros
    clz     synd, synd
    // Compute the potential result
    add     result, src, synd, lsr #1
    // Select result or NULL
    csel    result, xzr, result, eq
    ret

.Lzero_length:
    mov     result, #0
    ret
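
//
// For reference only: a minimal, unoptimized C sketch of the syndrome scheme
// described in the "Core algorithm" comment above. It is not part of this
// library; the helper name ScanChunkSyndrome and the use of __builtin_ctzll
// (standing in for the rbit/clz pair in the assembly) are illustrative
// assumptions.
//
//   // Build the 64-bit syndrome for one 32-byte chunk: bit (2*Index) is set
//   // when Chunk[Index] equals Value; the odd bits stay clear.
//   static unsigned long long
//   ScanChunkSyndrome (const unsigned char *Chunk, unsigned char Value)
//   {
//     unsigned long long  Syndrome;
//     unsigned int        Index;
//
//     Syndrome = 0;
//     for (Index = 0; Index < 32; Index++) {
//       if (Chunk[Index] == Value) {
//         Syndrome |= 1ULL << (2 * Index);
//       }
//     }
//     return Syndrome;
//   }
//
//   // A non-zero syndrome locates the first match within the chunk:
//   //   Offset = __builtin_ctzll (Syndrome) / 2;
//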