audk/ArmPkg/Library/CompilerIntrinsicsLib/AArch64/memcpy.S


/*
* Copyright (c) 2011 - 2013, ARM Ltd
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the company may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
  .text
  .align  2
  GCC_ASM_EXPORT(memcpy)

// Taken from the Newlib BSD implementation.
ASM_PFX(memcpy):
  // Copy dst to x6, so we can preserve the return value.
  mov     x6, x0
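  // (x6 is a caller-saved scratch register in the AAPCS64, so it
  // needs no save/restore, and x0 must still hold dst on return.)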
  // NOTE: although size_t is unsigned, this code uses signed
  // comparisons on x2, so it relies on nb never having its top
  // bit set. In practice this is not going to be a real problem.
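  // (A count with the top bit set would compare as negative, fall
  // straight through to the tail path, and copy at most 15 bytes.)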
  // Require at least 64 bytes to be worth aligning.
  cmp     x2, #64
  blt     qwordcopy

  // Compute offset to align destination to 16 bytes.
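  // (-dst) & 15 is the distance from dst up to the next 16-byte
  // boundary: for example, dst = 0x1009 gives (-0x1009) & 15 = 7,
  // and 0x1009 + 7 = 0x1010 is 16-byte aligned.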
  neg     x3, x0
  and     x3, x3, #15
  cbz     x3, blockcopy         // offset == 0 is likely
  // We know there are at least 64 bytes to be done, so we do a
  // 16-byte misaligned copy first and then do all of the 16-byte
  // aligned copies. Some bytes will be copied twice, but there is
  // no harm in that since memcpy does not guarantee correctness
  // on overlap.
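  // (E.g. with offset 7, the unaligned ldp/stp below copies bytes
  // 0-15 of the destination, and the aligned copies then resume
  // at byte 7, so bytes 7-15 are written twice with the same data.)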
  sub     x2, x2, x3            // nb -= offset
  ldp     x4, x5, [x1]
  add     x1, x1, x3
  stp     x4, x5, [x6]
  add     x6, x6, x3
  // The destination pointer is now qword (16-byte) aligned.
  // (The source pointer may or may not be.)
blockcopy:
  // Copy 64 bytes at a time.
  subs    x2, x2, #64
  blt     3f
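  // From here the count runs 64 below the bytes actually left, so
  // bge keeps looping while at least 64 more bytes remain, and the
  // add at label 3 restores the true residual of 0-63 bytes.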
2:
  subs    x2, x2, #64
  ldp     x4, x5, [x1, #0]
  ldp     x8, x9, [x1, #16]
  ldp     x10, x11, [x1, #32]
  ldp     x12, x13, [x1, #48]
  add     x1, x1, #64
  stp     x4, x5, [x6, #0]
  stp     x8, x9, [x6, #16]
  stp     x10, x11, [x6, #32]
  stp     x12, x13, [x6, #48]
  add     x6, x6, #64
  bge     2b
  // Unwind pre-decrement.
3:
  add     x2, x2, #64
qwordcopy:
  // Copy 0-48 bytes, 16 bytes at a time.
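  // (At most 63 bytes remain here, either because nb was below 64
  // from the start or as the residue of the 64-byte loop, so this
  // loop runs at most three times.)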
  subs    x2, x2, #16
  blt     tailcopy
2:
  ldp     x4, x5, [x1], #16
  subs    x2, x2, #16
  stp     x4, x5, [x6], #16
  bge     2b
  // No need to unwind the pre-decrement; it would not change the
  // low 4 bits of the count. But how likely is it for the byte
  // count to be a multiple of 16? Is it worth the overhead of
  // testing for x2 == -16?
tailcopy:
  // Copy trailing 0-15 bytes.
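  // tbz branches when the tested bit of x2 is zero, so checking
  // bits 3, 2, 1 and 0 in turn copies 8, 4, 2 and 1 bytes as
  // needed; every remainder from 0 to 15 is handled with at most
  // four loads and four stores.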
  tbz     x2, #3, 1f
  ldr     x4, [x1], #8          // copy 8 bytes
  str     x4, [x6], #8
1:
  tbz     x2, #2, 1f
  ldr     w4, [x1], #4          // copy 4 bytes
  str     w4, [x6], #4
1:
  tbz     x2, #1, 1f
  ldrh    w4, [x1], #2          // copy 2 bytes
  strh    w4, [x6], #2
1:
  tbz     x2, #0, return
  ldrb    w4, [x1]              // copy 1 byte
  strb    w4, [x6]
return:
  // This is the only return point of memcpy.
  ret