// audk/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CopyMem.S

//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the Linaro nor the
//       names of its contributors may be used to endorse or promote products
//       derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

//
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the company may not be used to endorse or promote
//    products derived from this software without specific prior written
//    permission.
//
// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses.
//
//

#define dstin     x0
#define src       x1
#define count     x2
#define dst       x3
#define srcend    x4
#define dstend    x5
#define A_l       x6
#define A_lw      w6
#define A_h       x7
#define A_hw      w7
#define B_l       x8
#define B_lw      w8
#define B_h       x9
#define C_l       x10
#define C_h       x11
#define D_l       x12
#define D_h       x13
#define E_l       x14
#define E_h       x15
#define F_l       srcend
#define F_h       dst
#define tmp1      x9
#define tmp2      x3

#define L(l) .L ## l

// Copies are split into 3 main cases: small copies of up to 16 bytes,
// medium copies of 17..96 bytes which are fully unrolled. Large copies
// of more than 96 bytes align the destination and use an unrolled loop
// processing 64 bytes per iteration.
// Small and medium copies read all data before writing, allowing any
// kind of overlap, and memmove tailcalls memcpy for these cases as
// well as non-overlapping copies.

__memcpy:
    prfm    PLDL1KEEP, [src]
    add     srcend, src, count
    add     dstend, dstin, count
    cmp     count, 16
    b.ls    L(copy16)
    cmp     count, 96
    b.hi    L(copy_long)

    // Medium copies: 17..96 bytes.
    sub     tmp1, count, 1
    ldp     A_l, A_h, [src]
    tbnz    tmp1, 6, L(copy96)
    ldp     D_l, D_h, [srcend, -16]
    tbz     tmp1, 5, 1f
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [srcend, -32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstend, -32]
1:
    stp     A_l, A_h, [dstin]
    stp     D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    // Small copies: 0..16 bytes.
L(copy16):
    cmp     count, 8
    b.lo    1f
    ldr     A_l, [src]
    ldr     A_h, [srcend, -8]
    str     A_l, [dstin]
    str     A_h, [dstend, -8]
    ret
    .p2align 4
1:
    tbz     count, 2, 1f
    ldr     A_lw, [src]
    ldr     A_hw, [srcend, -4]
    str     A_lw, [dstin]
    str     A_hw, [dstend, -4]
    ret

    // Copy 0..3 bytes.  Use a branchless sequence that copies the same
    // byte 3 times if count==1, or the 2nd byte twice if count==2.
1:
    cbz     count, 2f
    lsr     tmp1, count, 1
    ldrb    A_lw, [src]
    ldrb    A_hw, [srcend, -1]
    ldrb    B_lw, [src, tmp1]
    strb    A_lw, [dstin]
    strb    B_lw, [dstin, tmp1]
    strb    A_hw, [dstend, -1]
2:  ret

    .p2align 4
    // Copy 64..96 bytes.  Copy 64 bytes from the start and
    // 32 bytes from the end.
L(copy96):
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [src, 32]
    ldp     D_l, D_h, [src, 48]
    ldp     E_l, E_h, [srcend, -32]
    ldp     F_l, F_h, [srcend, -16]
    stp     A_l, A_h, [dstin]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin, 32]
    stp     D_l, D_h, [dstin, 48]
    stp     E_l, E_h, [dstend, -32]
    stp     F_l, F_h, [dstend, -16]
    ret

    // Align DST to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores. There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align.	The loop
    // copies 64 bytes per iteration and prefetches one iteration ahead.

    .p2align 4
L(copy_long):
    and     tmp1, dstin, 15
    bic     dst, dstin, 15
    ldp     D_l, D_h, [src]
    sub     src, src, tmp1
    add     count, count, tmp1      // Count is now 16 too large.
    ldp     A_l, A_h, [src, 16]
    stp     D_l, D_h, [dstin]
    ldp     B_l, B_h, [src, 32]
    ldp     C_l, C_h, [src, 48]
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 128 + 16  // Test and readjust count.
    b.ls    2f
1:
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [src, 16]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [src, 32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [src, 48]
    stp     D_l, D_h, [dst, 64]!
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes.	 The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the end even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [srcend, -64]
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [srcend, -48]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [srcend, -16]
    stp     D_l, D_h, [dst, 64]
    stp     E_l, E_h, [dstend, -64]
    stp     A_l, A_h, [dstend, -48]
    stp     B_l, B_h, [dstend, -32]
    stp     C_l, C_h, [dstend, -16]
    ret


//
// All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
// Larger backwards copies are also handled by memcpy. The only remaining
// case is forward large copies.  The destination is aligned, and an
// unrolled loop processes 64 bytes per iteration.
//

ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
ASM_PFX(InternalMemCopyMem):
    sub     tmp2, dstin, src
    cmp     count, 96
    ccmp    tmp2, count, 2, hi
    b.hs    __memcpy

    cbz     tmp2, 3f
    add     dstend, dstin, count
    add     srcend, src, count

    // Align dstend to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores. There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align. The loop
    // copies 64 bytes per iteration and prefetches one iteration ahead.

    and     tmp2, dstend, 15
    ldp     D_l, D_h, [srcend, -16]
    sub     srcend, srcend, tmp2
    sub     count, count, tmp2
    ldp     A_l, A_h, [srcend, -16]
    stp     D_l, D_h, [dstend, -16]
    ldp     B_l, B_h, [srcend, -32]
    ldp     C_l, C_h, [srcend, -48]
    ldp     D_l, D_h, [srcend, -64]!
    sub     dstend, dstend, tmp2
    subs    count, count, 128
    b.ls    2f
    nop
1:
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [srcend, -16]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [srcend, -48]
    stp     D_l, D_h, [dstend, -64]!
    ldp     D_l, D_h, [srcend, -64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the start even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [src, 48]
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [src, 32]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [src, 16]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [src]
    stp     D_l, D_h, [dstend, -64]
    stp     E_l, E_h, [dstin, 48]
    stp     A_l, A_h, [dstin, 32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin]
3:  ret
// MdePkg/BaseMemoryLibOptDxe: add accelerated AARCH64 routines
//
// This adds AARCH64 support to BaseMemoryLibOptDxe, based on the
// cortex-strings library. All string routines are accelerated except
// ScanMem16, ScanMem32, ScanMem64 and IsZeroBuffer, which can wait for
// another day. (Very few occurrences exist in the codebase.)
//
// Contributed-under: TianoCore Contribution Agreement 1.0
// Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
// Reviewed-by: Liming Gao <liming.gao@intel.com>
// 2016-09-02 13:34:22 +02:00
			`//`
			`// Copyright (c) 2012 - 2016, Linaro Limited`
			`// All rights reserved.`
			`//`
			`// Redistribution and use in source and binary forms, with or without`
			`// modification, are permitted provided that the following conditions are met:`
			`// * Redistributions of source code must retain the above copyright`
			`// notice, this list of conditions and the following disclaimer.`
			`// * Redistributions in binary form must reproduce the above copyright`
			`// notice, this list of conditions and the following disclaimer in the`
			`// documentation and/or other materials provided with the distribution.`
			`// * Neither the name of the Linaro nor the`
			`// names of its contributors may be used to endorse or promote products`
			`// derived from this software without specific prior written permission.`
			`//`
			`// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS`
			`// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT`
			`// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR`
			`// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT`
			`// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,`
			`// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT`
			`// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,`
			`// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY`
			`// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`//`

			`//`
			`// Copyright (c) 2015 ARM Ltd`
			`// All rights reserved.`
			`//`
			`// Redistribution and use in source and binary forms, with or without`
			`// modification, are permitted provided that the following conditions`
			`// are met:`
			`// 1. Redistributions of source code must retain the above copyright`
			`// notice, this list of conditions and the following disclaimer.`
			`// 2. Redistributions in binary form must reproduce the above copyright`
			`// notice, this list of conditions and the following disclaimer in the`
			`// documentation and/or other materials provided with the distribution.`
			`// 3. The name of the company may not be used to endorse or promote`
			`// products derived from this software without specific prior written`
			`// permission.`
			`//`
			// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
			`// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF`
			`// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.`
			`// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,`
			`// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED`
			`// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF`
			`// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING`
			`// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS`
			`// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`//`

			`// Assumptions:`
			`//`
			`// ARMv8-a, AArch64, unaligned accesses.`
			`//`
			`//`

			`#define dstin x0`
			`#define src x1`
			`#define count x2`
			`#define dst x3`
			`#define srcend x4`
			`#define dstend x5`
			`#define A_l x6`
			`#define A_lw w6`
			`#define A_h x7`
			`#define A_hw w7`
			`#define B_l x8`
			`#define B_lw w8`
			`#define B_h x9`
			`#define C_l x10`
			`#define C_h x11`
			`#define D_l x12`
			`#define D_h x13`
			`#define E_l x14`
			`#define E_h x15`
			`#define F_l srcend`
			`#define F_h dst`
			`#define tmp1 x9`
			`#define tmp2 x3`

			`#define L(l) .L ## l`

			`// Copies are split into 3 main cases: small copies of up to 16 bytes,`
			`// medium copies of 17..96 bytes which are fully unrolled. Large copies`
			`// of more than 96 bytes align the destination and use an unrolled loop`
			`// processing 64 bytes per iteration.`
			`// Small and medium copies read all data before writing, allowing any`
			`// kind of overlap, and memmove tailcalls memcpy for these cases as`
			`// well as non-overlapping copies.`

			`__memcpy:`
			`prfm PLDL1KEEP, [src]`
			`add srcend, src, count`
			`add dstend, dstin, count`
			`cmp count, 16`
			`b.ls L(copy16)`
			`cmp count, 96`
			`b.hi L(copy_long)`

			`// Medium copies: 17..96 bytes.`
			`sub tmp1, count, 1`
			`ldp A_l, A_h, [src]`
			`tbnz tmp1, 6, L(copy96)`
			`ldp D_l, D_h, [srcend, -16]`
			`tbz tmp1, 5, 1f`
			`ldp B_l, B_h, [src, 16]`
			`ldp C_l, C_h, [srcend, -32]`
			`stp B_l, B_h, [dstin, 16]`
			`stp C_l, C_h, [dstend, -32]`
			`1:`
			`stp A_l, A_h, [dstin]`
			`stp D_l, D_h, [dstend, -16]`
			`ret`

			`.p2align 4`
			`// Small copies: 0..16 bytes.`
			`L(copy16):`
			`cmp count, 8`
			`b.lo 1f`
			`ldr A_l, [src]`
			`ldr A_h, [srcend, -8]`
			`str A_l, [dstin]`
			`str A_h, [dstend, -8]`
			`ret`
			`.p2align 4`
			`1:`
			`tbz count, 2, 1f`
			`ldr A_lw, [src]`
			`ldr A_hw, [srcend, -4]`
			`str A_lw, [dstin]`
			`str A_hw, [dstend, -4]`
			`ret`

			`// Copy 0..3 bytes. Use a branchless sequence that copies the same`
			`// byte 3 times if count==1, or the 2nd byte twice if count==2.`
			`1:`
			`cbz count, 2f`
			`lsr tmp1, count, 1`
			`ldrb A_lw, [src]`
			`ldrb A_hw, [srcend, -1]`
			`ldrb B_lw, [src, tmp1]`
			`strb A_lw, [dstin]`
			`strb B_lw, [dstin, tmp1]`
			`strb A_hw, [dstend, -1]`
			`2: ret`

			`.p2align 4`
			`// Copy 64..96 bytes. Copy 64 bytes from the start and`
			`// 32 bytes from the end.`
			`L(copy96):`
			`ldp B_l, B_h, [src, 16]`
			`ldp C_l, C_h, [src, 32]`
			`ldp D_l, D_h, [src, 48]`
			`ldp E_l, E_h, [srcend, -32]`
			`ldp F_l, F_h, [srcend, -16]`
			`stp A_l, A_h, [dstin]`
			`stp B_l, B_h, [dstin, 16]`
			`stp C_l, C_h, [dstin, 32]`
			`stp D_l, D_h, [dstin, 48]`
			`stp E_l, E_h, [dstend, -32]`
			`stp F_l, F_h, [dstend, -16]`
			`ret`

			`// Align DST to 16 byte alignment so that we don't cross cache line`
			`// boundaries on both loads and stores. There are at least 96 bytes`
			`// to copy, so copy 16 bytes unaligned and then align. The loop`
			`// copies 64 bytes per iteration and prefetches one iteration ahead.`

			`.p2align 4`
			`L(copy_long):`
			`and tmp1, dstin, 15`
			`bic dst, dstin, 15`
			`ldp D_l, D_h, [src]`
			`sub src, src, tmp1`
			`add count, count, tmp1 // Count is now 16 too large.`
			`ldp A_l, A_h, [src, 16]`
			`stp D_l, D_h, [dstin]`
			`ldp B_l, B_h, [src, 32]`
			`ldp C_l, C_h, [src, 48]`
			`ldp D_l, D_h, [src, 64]!`
			`subs count, count, 128 + 16 // Test and readjust count.`
			`b.ls 2f`
			`1:`
			`stp A_l, A_h, [dst, 16]`
			`ldp A_l, A_h, [src, 16]`
			`stp B_l, B_h, [dst, 32]`
			`ldp B_l, B_h, [src, 32]`
			`stp C_l, C_h, [dst, 48]`
			`ldp C_l, C_h, [src, 48]`
			`stp D_l, D_h, [dst, 64]!`
			`ldp D_l, D_h, [src, 64]!`
			`subs count, count, 64`
			`b.hi 1b`

			`// Write the last full set of 64 bytes. The remainder is at most 64`
			`// bytes, so it is safe to always copy 64 bytes from the end even if`
			`// there is just 1 byte left.`
			`2:`
			`ldp E_l, E_h, [srcend, -64]`
			`stp A_l, A_h, [dst, 16]`
			`ldp A_l, A_h, [srcend, -48]`
			`stp B_l, B_h, [dst, 32]`
			`ldp B_l, B_h, [srcend, -32]`
			`stp C_l, C_h, [dst, 48]`
			`ldp C_l, C_h, [srcend, -16]`
			`stp D_l, D_h, [dst, 64]`
			`stp E_l, E_h, [dstend, -64]`
			`stp A_l, A_h, [dstend, -48]`
			`stp B_l, B_h, [dstend, -32]`
			`stp C_l, C_h, [dstend, -16]`
			`ret`


			`//`
			`// All memmoves up to 96 bytes are done by memcpy as it supports overlaps.`
			`// Larger backwards copies are also handled by memcpy. The only remaining`
			`// case is forward large copies. The destination is aligned, and an`
			`// unrolled loop processes 64 bytes per iteration.`
			`//`

			`ASM_GLOBAL ASM_PFX(InternalMemCopyMem)`
			`ASM_PFX(InternalMemCopyMem):`
			`sub tmp2, dstin, src`
			`cmp count, 96`
			`ccmp tmp2, count, 2, hi`
			`b.hs __memcpy`

			`cbz tmp2, 3f`
			`add dstend, dstin, count`
			`add srcend, src, count`

			`// Align dstend to 16 byte alignment so that we don't cross cache line`
			`// boundaries on both loads and stores. There are at least 96 bytes`
			`// to copy, so copy 16 bytes unaligned and then align. The loop`
			`// copies 64 bytes per iteration and prefetches one iteration ahead.`

			`and tmp2, dstend, 15`
			`ldp D_l, D_h, [srcend, -16]`
			`sub srcend, srcend, tmp2`
			`sub count, count, tmp2`
			`ldp A_l, A_h, [srcend, -16]`
			`stp D_l, D_h, [dstend, -16]`
			`ldp B_l, B_h, [srcend, -32]`
			`ldp C_l, C_h, [srcend, -48]`
			`ldp D_l, D_h, [srcend, -64]!`
			`sub dstend, dstend, tmp2`
			`subs count, count, 128`
			`b.ls 2f`
			`nop`
			`1:`
			`stp A_l, A_h, [dstend, -16]`
			`ldp A_l, A_h, [srcend, -16]`
			`stp B_l, B_h, [dstend, -32]`
			`ldp B_l, B_h, [srcend, -32]`
			`stp C_l, C_h, [dstend, -48]`
			`ldp C_l, C_h, [srcend, -48]`
			`stp D_l, D_h, [dstend, -64]!`
			`ldp D_l, D_h, [srcend, -64]!`
			`subs count, count, 64`
			`b.hi 1b`

			`// Write the last full set of 64 bytes. The remainder is at most 64`
			`// bytes, so it is safe to always copy 64 bytes from the start even if`
			`// there is just 1 byte left.`
			`2:`
			`ldp E_l, E_h, [src, 48]`
			`stp A_l, A_h, [dstend, -16]`
			`ldp A_l, A_h, [src, 32]`
			`stp B_l, B_h, [dstend, -32]`
			`ldp B_l, B_h, [src, 16]`
			`stp C_l, C_h, [dstend, -48]`
			`ldp C_l, C_h, [src]`
			`stp D_l, D_h, [dstend, -64]`
			`stp E_l, E_h, [dstin, 48]`
			`stp A_l, A_h, [dstin, 32]`
			`stp B_l, B_h, [dstin, 16]`
			`stp C_l, C_h, [dstin]`
			`3: ret`