您的位置:首页 > 运维架构

ARM Memory Copy

2015-07-16 14:55 465 查看
; Module header for the memory helper routines defined below.
; NOTE(review): directive syntax (MODULE / PUBLIC / SECTION ...:CODE:NOROOT)
; looks like the IAR assembler for ARM - confirm the toolchain.
MODULE  ARM_MEMORY

; Entry points exported to other modules.
; NOTE(review): the arm_memxor, arm_memxor8, arm_memxor16 and arm_memxor32
; labels defined later in this file are NOT listed here, so they are not
; exported - confirm whether that is intentional.
PUBLIC  ARM_MEMCPY
PUBLIC  ARM_MEMSET
PUBLIC  ARM_MEMSET8
PUBLIC  ARM_MEMSET16
PUBLIC  ARM_MEMSET32

; Code section; CODE32 selects 32-bit ARM (not Thumb) instruction encoding.
; NOTE(review): the "(2)" is presumably a 2^2 = 4-byte alignment - confirm.
SECTION .text:CODE:NOROOT(2)
CODE32

;-------------------------------------------------------------------------------
; void ARM_MEMCPY(void* pDest, void* pSrc, U32 NumBytes)
;
; Function description
;   Copy NumBytes bytes from pSrc to pDest. All accesses use ascending
;   addresses, so there is no special handling for overlapping regions.
;
;   Strategy:
;     1. Fewer than 4 bytes: copy them one byte at a time and return.
;     2. Copy 1..3 single bytes until the destination is word aligned.
;     3. Source word aligned as well: move 32-byte blocks with LDM/STM, then
;        16/8/4-byte remainders, then up to 3 trailing bytes.
;     4. Source not word aligned: read aligned source words and build each
;        destination word from two neighbouring source words with shifts.
;        NOTE(review): the LSR/LSL pairing assumes little-endian byte order.
;
; Register usage:
;
;   R0    pDest (advanced as data is stored)
;   R1    pSrc  (advanced as data is loaded)
;   R2    NumBytes (remaining count; reused as a data register at the very end)
;
;   R3    Used for data transfers
;   R4    Used for data transfers (saved/restored on the stack, bulk path only)
;   R12   Used for data transfers
;   R14   Used for data transfers (saved/restored on the stack, bulk path only)
;
;   R13   SP
;   R14   LR (contains return address)
;   R15   PC
;
;-------------------------------------------------------------------------------
ARM_MEMCPY:
;-------------------------------------------------------------------------------
cmp         R2, #+3                           ; R2 = NumBytes
bls         ARM_MEMCPY_HandleTrailingBytes    ; Fewer than 4 bytes: use single byte transfers only

ands        R12, R0, #+3                      ; R12 = destination mis-alignment (0..3)
beq         ARM_MEMCPY_DestIsDWordAligned     ; Destination already word aligned: skip the byte prologue

;-------------------------------------------------------------------------------
; Copy as many bytes as necessary (1..3) to word align the destination address
;
ldrb        R3, [R1], #+1                     ; At least one byte is always needed, so read it unconditionally
cmp         R12, #+2                          ; Flags: LS if mis-alignment is 1 or 2, CC if it is 1
add         R2, R2, R12                       ; R2 = NumBytes + mis-alignment (flags untouched)
ldrbls      R12, [R1], #+1                    ; LS: mis-alignment 1 or 2 -> a second byte is needed
strb        R3, [R0], #+1                     ; Store first byte
ldrbcc      R3, [R1], #+1                     ; CC: mis-alignment 1 -> a third byte is needed
strbls      R12, [R0], #+1                    ; Store second byte (if any)
sub         R2, R2, #+4                       ; R2 = NumBytes - (4 - mis-alignment) = bytes still to copy
strbcc      R3, [R0], #+1                     ; Store third byte (if any); destination is now word aligned

;-------------------------------------------------------------------------------
; Choose the transfer method depending on the source alignment
;
ARM_MEMCPY_DestIsDWordAligned:
ands        R3, R1, #+3                       ; R3 = source mis-alignment (0..3)
beq         ARM_MEMCPY_HandleBulkWordData     ; Source and destination both aligned: use bulk word transfer

subs        R2, R2, #+4                       ; Pre-decrement the count by one word for the shift loops
bcc         ARM_MEMCPY_HandleTrailingBytes    ; Less than one word left: the low 2 bits of R2 still hold the byte count

ldr         R12, [R1, -R3]!                   ; Read first source word from the aligned address below R1; R1 becomes word aligned
cmp         R3, #+2                           ; Dispatch on source mis-alignment: 2 -> 16-bit, 3 -> 24-bit, 1 -> 8-bit shift
beq         ARM_MEMCPY_Loop16BitShift

bhi         ARM_MEMCPY_Loop24BitShift

;-------------------------------------------------------------------------------
; Copy data in units of one word from a mis-aligned source
;
; Each iteration takes the not yet copied upper part of the previously loaded
; source word, loads the next aligned source word and merges its lower part in
; to form one complete destination word. On loop exit R1 still points at the
; last aligned word loaded, so the mis-alignment is added back before the
; trailing bytes are handled.
;
ARM_MEMCPY_Loop8BitShift:
mov         R3, R12, LSR #+8           ; Keep the upper 3 bytes of the previous word
ldr         R12, [R1, #+4]!            ; Load the next aligned source word
subs        R2, R2, #+4                ; Decrement NumBytes; carry stays set while at least one more word remains
orr         R3, R3, R12, LSL #+24      ; Merge in the lowest byte of the new word
str         R3, [R0], #+4              ; Store the complete word
bcs         ARM_MEMCPY_Loop8BitShift

add         R1, R1, #+1                ; Restore the byte offset of the source address
b           ARM_MEMCPY_HandleTrailingBytes         ; Handle trailing bytes

ARM_MEMCPY_Loop16BitShift:
mov         R3, R12, LSR #+16          ; Keep the upper 2 bytes of the previous word
ldr         R12, [R1, #+4]!            ; Load the next aligned source word
subs        R2, R2, #+4                ; Decrement NumBytes; carry stays set while at least one more word remains
orr         R3, R3, R12, LSL #+16      ; Merge in the lower 2 bytes of the new word
str         R3, [R0], #+4              ; Store the complete word
bcs         ARM_MEMCPY_Loop16BitShift

add         R1, R1, #+2                ; Restore the byte offset of the source address
b           ARM_MEMCPY_HandleTrailingBytes         ; Handle trailing bytes

ARM_MEMCPY_Loop24BitShift:
mov         R3, R12, LSR #+24          ; Keep the uppermost byte of the previous word
ldr         R12, [R1, #+4]!            ; Load the next aligned source word
subs        R2, R2, #+4                ; Decrement NumBytes; carry stays set while at least one more word remains
orr         R3, R3, R12, LSL #+8       ; Merge in the lower 3 bytes of the new word
str         R3, [R0], #+4              ; Store the complete word
bcs         ARM_MEMCPY_Loop24BitShift

add         R1, R1, #+3                ; Restore the byte offset of the source address
b           ARM_MEMCPY_HandleTrailingBytes         ; Handle trailing bytes

;-------------------------------------------------------------------------------
; Copy bulk data in blocks of 8 words (32 bytes); source and destination are
; both word aligned here
;
ARM_MEMCPY_HandleBulkWordData:
subs        R2, R2, #+0x20             ; Pre-decrement by one block; carry clear if less than 32 bytes remain
stmdb       SP!, {R4, LR}              ; Save R4 and LR, both are used as data registers below (flags untouched)
bcc         ARM_MEMCPY_HandleTrailingWords

ARM_MEMCPY_LoopHandleBulkWord:
ldm         R1!, {R3, R4, R12, LR}     ; Transfer 16 bytes at once
stm         R0!, {R3, R4, R12, LR}
ldm         R1!, {R3, R4, R12, LR}     ; Transfer another 16 bytes
stm         R0!, {R3, R4, R12, LR}
subs        R2, R2, #+0x20             ; Carry stays set while another full block remains
bcs         ARM_MEMCPY_LoopHandleBulkWord

;-------------------------------------------------------------------------------
; Copy up to 7 trailing words. R2 may be negative here, but its low 5 bits
; still equal the number of bytes left (32 was subtracted).
;
ARM_MEMCPY_HandleTrailingWords:
movs        R12, R2, LSL #28           ; C = bit 4 of NumBytes, N = bit 3 of NumBytes

ldmcs       R1!, {R3, R4, R12, LR}     ; C set: transfer 16 bytes
stmcs       R0!, {R3, R4, R12, LR}
ldmmi       R1!, {R3, R4}              ; N set: transfer 8 bytes
stmmi       R0!, {R3, R4}

movs        R12, R2, LSL #+30          ; C = bit 2 of NumBytes, Z = 1 if bits 1..0 are both zero

ldmia       SP!, {R4, LR}              ; Restore R4 and the return address (flags untouched)
ldrcs       R3, [R1], #+4              ; C set: transfer 4 bytes
strcs       R3, [R0], #+4
bxeq        LR                         ; Z set: no trailing bytes left, return

;-------------------------------------------------------------------------------
; Copy up to 3 trailing bytes
;
; Flag reminder for "movs Rd, Rm, LSL #31":
;   N = bit 31 of the result  = bit 0 of NumBytes
;   C = last bit shifted out  = bit 1 of NumBytes
;   NumBytes & 3 == 2 (binary 10): N=0, C=1
;   NumBytes & 3 == 1 (binary 01): N=1, C=0
; MI executes when N=1, CS executes when C=1.
; (For ADD/CMN, C=1 means carry out; for SUB/CMP, C=0 means a borrow occurred.)
ARM_MEMCPY_HandleTrailingBytes:
movs        R2, R2, LSL #+31           ; Move bits 1..0 of NumBytes into C and N; R2 is free for data afterwards

ldrbmi      R2, [R1], #+1              ; N set (bit 0): transfer 1 byte
ldrbcs      R3, [R1], #+1              ; C set (bit 1): transfer 2 bytes
ldrbcs      R12, [R1], #+1
strbmi      R2, [R0], #+1
strbcs      R3, [R0], #+1
strbcs      R12, [R0], #+1
bx          LR

;-------------------------------------------------------------------------------
; void ARM_MEMSET(void* pDest, U32 c, U32 NumBytes)
;
; Function description
;   Fill NumBytes bytes of memory starting at pDest with the byte value c.
;   Only the low 8 bits of c are used; they are replicated into all four
;   byte lanes of R1 so that word stores and byte stores write the same value.
;
;   Strategy:
;     1. Fewer than 4 bytes: store them one byte at a time and return.
;     2. Store 1..3 single bytes until the destination is word aligned.
;     3. Store 32-byte blocks with STM, then 16/8/4-byte remainders,
;        then up to 3 trailing bytes.
;
; Register usage:
;
;   R0    pDest (advanced as data is stored)
;   R1    c, replicated to 32 bits
;   R2    NumBytes (remaining count)
;
;   R3    Fill pattern copy / destination mis-alignment
;   R4    Fill pattern copy (saved/restored on the stack)
;   R5    Fill pattern copy (saved/restored on the stack)
;   R12   Throw-away destination for flag-setting shifts
;
;   R13   SP
;   R14   LR (contains return address)
;   R15   PC
;
;-------------------------------------------------------------------------------
ARM_MEMSET:
;-------------------------------------------------------------------------------
        and         R1, R1, #+0xFF                    ; Use only the low byte of c
        orr         R1, R1, R1, LSL #+8               ; Replicate to 16 bits
        orr         R1, R1, R1, LSL #+16              ; Replicate to 32 bits

        cmp         R2, #+3                           ; R2 = NumBytes
        bls         ARM_MEMSET_HandleTrailingBytes    ; Fewer than 4 bytes: use single byte stores only

        ands        R3, R0, #+3                       ; R3 = destination mis-alignment (0..3)
        beq         ARM_MEMSET_DestIsAligned          ; Destination already word aligned

        ; Store as many bytes as necessary (1..3) to word align the destination

        strb        R1, [R0], #+1                     ; At least one byte is always needed
        cmp         R3, #+2                           ; Flags: LS if mis-alignment is 1 or 2, CC if it is 1
        add         R2, R2, R3                        ; R2 = NumBytes + mis-alignment (flags untouched)
        strbls      R1, [R0], #+1                     ; LS: a second byte is needed
        sub         R2, R2, #+4                       ; R2 = NumBytes - (4 - mis-alignment)
        strbcc      R1, [R0], #+1                     ; CC: a third byte is needed

        ; Destination is word aligned from here on

ARM_MEMSET_DestIsAligned:

        ; Store bulk data in blocks of 8 words (32 bytes)

ARM_MEMSET_HandleBulkWordData:
        stmdb       SP!, {R4, R5}                     ; Two registers keep the stack adjustment at 8 bytes

        mov         R3, R1                            ; Four registers hold the pattern: 16 bytes per STM
        mov         R4, R1
        mov         R5, R1

        subs        R2, R2, #+0x20                    ; Pre-decrement by one block; carry clear if < 32 bytes remain
        bcc         ARM_MEMSET_HandleTrailingWords

ARM_MEMSET_LoopHandleBulkWord:
        stm         R0!, {R1, R3, R4, R5}
        stm         R0!, {R1, R3, R4, R5}
        subs        R2, R2, #+0x20                    ; Carry stays set while another full block remains
        bcs         ARM_MEMSET_LoopHandleBulkWord

        ; Store up to 7 trailing words. R2 may be negative here, but its low
        ; 5 bits still equal the number of bytes left (32 was subtracted).

ARM_MEMSET_HandleTrailingWords:
        movs        R12, R2, LSL #28                  ; C = bit 4 of NumBytes, N = bit 3 of NumBytes
        stmcs       R0!, {R1, R3, R4, R5}             ; C set: store 16 bytes
        stmmi       R0!, {R1, R3}                     ; N set: store 8 bytes

        movs        R12, R2, LSL #+30                 ; C = bit 2 of NumBytes, Z = 1 if bits 1..0 are both zero
        strcs       R1, [R0], #+4                     ; C set: store 4 bytes

        ldmia       SP!, {R4, R5}                     ; Restore saved registers (flags untouched)
        bxeq        LR                                ; Z set: no trailing bytes left, return

        ; Store up to 3 trailing bytes

ARM_MEMSET_HandleTrailingBytes:
        movs        R2, R2, LSL #+31                  ; N = bit 0 of NumBytes, C = bit 1 of NumBytes
        strbmi      R1, [R0], #+1                     ; N set: store 1 byte
        strbcs      R1, [R0], #+1                     ; C set: store 2 bytes
        strbcs      R1, [R0], #+1
        bx          LR

; int ARM_MEMSET8(void* pDest, U32 c, U32 NumBytes);
;
; Fill NumBytes bytes starting at pDest with the low 8 bits of c.
; The destination may have any alignment: it is stepped to a 16-bit, then a
; 32-bit and (for 16 or more remaining bytes) a 128-bit boundary before the
; main loop stores 16 bytes per iteration.
;
;   R0    pDest (advanced as data is stored)
;   R1    c, widened step by step to a 32-bit fill pattern
;   R2    NumBytes (remaining count, compared as a signed value)
;   R3    Fill pattern copy
;   R4,R5 Fill pattern copies (saved/restored on the stack)
;
; The prototype declares an int result, but no value is explicitly placed in
; R0; on return R0 holds the destination pointer advanced past the last byte
; written.
;-------------------------------------------------------------------------------
ARM_MEMSET8:
;-------------------------------------------------------------------------------
        stmdb       SP!, {R4, R5}
        and         R1, R1, #0xFF             ; Use only the low byte of c so the replication below is exact
        cmp         R2, #4
        blt         ARM_MEMSET8_loop3         ; Fewer than 4 bytes: byte stores only

        ; Alignment is unknown: store one byte if the address is odd
        tst         R0, #1
        strneb      R1, [R0], #1
        subne       R2, R2, #1

        ; Now we are 16-bit aligned (widen 'c' to 16 bits)
        orr         R1, R1, R1, LSL #8
        tst         R0, #2
        strneh      R1, [R0], #2
        subne       R2, R2, #2

        ; Now we are 32-bit aligned (widen 'c' to 32 bits)
        orr         R1, R1, R1, LSL #16
        mov         R3, R1
        cmp         R2, #16
        blt         ARM_MEMSET8_loop2         ; Fewer than 16 bytes left: skip the 128-bit alignment steps
        tst         R0, #4
        strne       R1, [R0], #4
        subne       R2, R2, #4
        tst         R0, #8
        stmneia     R0!, {R1, R3}
        subne       R2, R2, #8

        ; Now we are 128-bit aligned
        mov         R4, R1
        mov         R5, R1
ARM_MEMSET8_loop1:
        ; Store 4 32-bit values per loop iteration
        subs        R2, R2, #16
        stmgeia     R0!, {R1, R3, R4, R5}
        bge         ARM_MEMSET8_loop1
        add         R2, R2, #16               ; Undo the final subtraction: R2 = bytes left (0..15)

ARM_MEMSET8_loop2:
        ; Store up to 3 remaining 32-bit values
        tst         R2, #8
        stmneia     R0!, {R1, R3}
        tst         R2, #4
        strne       R1, [R0], #4
        and         R2, R2, #3

ARM_MEMSET8_loop3:
        ; Store up to 3 remaining bytes
        subs        R2, R2, #1
        strgeb      R1, [R0], #1
        subs        R2, R2, #1
        strgeb      R1, [R0], #1
        subs        R2, R2, #1
        strgeb      R1, [R0], #1
        ldmia       SP!, {R4, R5}
        bx          LR

; int ARM_MEMSET16(void* pDest, U32 c, U32 NumHalfWords);
;
; Fill NumHalfWords 16-bit items starting at pDest with the low 16 bits of c.
; pDest is expected to be at least 16-bit aligned (halfword and word stores
; are used without any byte-alignment step).
;
;   R0    pDest (advanced as data is stored)
;   R1    c, widened to a 32-bit fill pattern
;   R2    NumHalfWords (remaining count, compared as a signed value)
;   R3    Fill pattern copy
;   R4,R5 Fill pattern copies (saved/restored on the stack)
;
; The prototype declares an int result, but no value is explicitly placed in
; R0; on return R0 holds the destination pointer advanced past the last item
; written.
;-------------------------------------------------------------------------------
ARM_MEMSET16:
;-------------------------------------------------------------------------------
        stmdb       SP!, {R4, R5}

        cmp         R2, #2
        blt         ARM_MEMSET16_HandleTrailingHalfWord    ; 1 or 0 items: a single halfword store at most

        ; Alignment is known to be at least 16-bit
        tst         R0, #2
        strneh      R1, [R0], #2              ; xxxx-xx10 --->
        subne       R2, R2, #1                ; xxxx-xx00

        ; Now we are 32-bit aligned (widen 'c' to 32 bits).
        ; Shifting the low halfword up and OR-ing it back down discards any
        ; upper bits of c, so both halves hold exactly the low 16 bits.
        mov         R1, R1, LSL #16
        orr         R1, R1, R1, LSR #16
        mov         R4, R1

        cmp         R2, #8
        blt         ARM_MEMSET16_HandleTrailingWords       ; 7, 6, ... 0

        tst         R0, #4
        strne       R1, [R0], #4              ; xxxx-x100 --->
        subne       R2, R2, #2                ; xxxx-x000 --->

        ; Now we are 64-bit aligned
        tst         R0, #8
        stmneia     R0!, {R1, R4}             ; xxxx-1000 --->
        subne       R2, R2, #4                ; xxxx-0000 --->

ARM_MEMSET16_HandleBulkWordData:
        ; Now we are 128-bit aligned
        mov         R5, R1
        mov         R3, R1

ARM_MEMSET16_LoopHandleBulkWord:
        ; Store 4 32-bit values (8 halfwords) per loop iteration
        subs        R2, R2, #8
        stmgeia     R0!, {R1, R3, R4, R5}
        bge         ARM_MEMSET16_LoopHandleBulkWord
        add         R2, R2, #8                ; Undo the final subtraction: R2 = halfwords left (0..7)

ARM_MEMSET16_HandleTrailingWords:
        ; Store up to 3 remaining 32-bit values
        tst         R2, #4
        stmneia     R0!, {R1, R4}

        tst         R2, #2
        strne       R1, [R0], #4

        and         R2, R2, #1

ARM_MEMSET16_HandleTrailingHalfWord:
        ; Store up to 1 remaining 16-bit value
        subs        R2, R2, #1
        strgeh      R1, [R0], #2

        ldmia       SP!, {R4, R5}
        bx          LR

; int ARM_MEMSET32(void* pDest, U32 c, U32 NumWords);
;
; Fill NumWords 32-bit items starting at pDest with the value c.
; Word stores are used throughout, so pDest is expected to be word aligned.
;
;   R0        pDest (advanced as data is stored)
;   R1        c
;   R2        NumWords (remaining count, compared as a signed value)
;   R3-R5     Copies of c for the multi-word stores (R4, R5 are saved and
;             restored on the stack)
;-------------------------------------------------------------------------------
ARM_MEMSET32:
;-------------------------------------------------------------------------------
        stmfd       SP!, {R4, R5}             ; Preserve the two callee-owned registers used below

        cmp         R2, #4
        blt         ARM_MEMSET32_Tail         ; Fewer than 4 words: go straight to the single stores

        ; Load the pattern into every register used by the block stores
        mov         R3, R1
        mov         R4, R1
        mov         R5, R1

        ; Step to a 64-bit boundary if the address has bit 2 set
        tst         R0, #4
        strne       R1, [R0], #4
        subne       R2, R2, #1

        ; Step to a 128-bit boundary if the address has bit 3 set
        tst         R0, #8
        stmneia     R0!, {R1, R3}
        subne       R2, R2, #2

ARM_MEMSET32_Fill16:
        ; Main loop: four words (16 bytes) per pass while at least four remain
        subs        R2, R2, #4
        stmgeia     R0!, {R1, R3, R4, R5}
        bge         ARM_MEMSET32_Fill16
        add         R2, R2, #4                ; Count went below zero: add the block size back

ARM_MEMSET32_Tail:
        ; At most three words remain; each store runs only while the count
        ; has not gone negative
        subs        R2, R2, #1
        strge       R1, [R0], #4
        subs        R2, R2, #1
        strge       R1, [R0], #4
        subs        R2, R2, #1
        strge       R1, [R0], #4

        ldmfd       SP!, {R4, R5}
        bx          LR

; void arm_memxor(void* pDest, U32 c, U32 NumBytes);
;                      r0         r1     r2
;
; XOR every one of the NumBytes bytes starting at pDest with the low 8 bits
; of c, in place. The byte is replicated into all four lanes of R1 so that
; whole words can be processed once the destination is word aligned.
;
;   R0        pDest (advanced as data is written back)
;   R1        c, replicated to 32 bits
;   R2        NumBytes (remaining count; reused as a data register at the end)
;   R3        Destination mis-alignment, then data
;   R12       Data (alignment prologue)
;   R4-R6     Data (saved/restored on the stack)
;   R7        Throw-away destination for flag-setting shifts (saved/restored)
;-------------------------------------------------------------------------------
arm_memxor:
;-------------------------------------------------------------------------------
        and         R1, R1, #+0xFF              ; Use only the low byte of c
        orr         R1, R1, R1, LSL #+8         ; Replicate to 16 bits
        orr         R1, R1, R1, LSL #+16        ; Replicate to 32 bits

        cmp         R2, #+3                     ; R2 = NumBytes
        bls         arm_memxor_HandleTrailingBytes        ; Fewer than 4 bytes: single byte accesses only

        ands        R3, R0, #+3                 ; R3 = destination mis-alignment (0..3)
        beq         arm_memxor_DestIsAligned              ; Destination already word aligned

        ;-
        ; Process as many bytes as necessary (1..3) to word align the destination
        ;-
        ldrb        R12, [R0]                   ; At least one byte is always needed
        eor         R12, R12, R1
        strb        R12, [R0], #+1

        cmp         R3, #+2                     ; Flags: LS if mis-alignment is 1 or 2, CC if it is 1
        add         R2, R2, R3                  ; R2 = NumBytes + mis-alignment (flags untouched)

        ldrbls      R3, [R0]                    ; LS: a second byte is needed (R3 is free after the add)
        eorls       R3, R3, R1
        strbls      R3, [R0], #+1

        sub         R2, R2, #+4                 ; R2 = NumBytes - (4 - mis-alignment)

        ldrbcc      R3, [R0]                    ; CC: a third byte is needed
        eorcc       R3, R3, R1
        strbcc      R3, [R0], #+1

        ;-
        ; Destination is word aligned from here on
        ;-
arm_memxor_DestIsAligned:
        ;-
        ; Process bulk data in blocks of 8 words (32 bytes)
        ;-
arm_memxor_HandleBulkWordData:
        stmdb       SP!, {R4, R5, R6, R7}

        subs        R2, R2, #+0x20              ; Pre-decrement by one block; carry clear if < 32 bytes remain
        bcc         arm_memxor_HandleTrailingWords

arm_memxor_LoopHandleBulkWord:
        ldm         R0,  {R3, R4, R5, R6}       ; Load 16 bytes without advancing R0 ...
        eor         R3, R3, R1
        eor         R4, R4, R1
        eor         R5, R5, R1
        eor         R6, R6, R1
        stm         R0!, {R3, R4, R5, R6}       ; ... and advance it on the write-back

        ldm         R0,  {R3, R4, R5, R6}
        eor         R3, R3, R1
        eor         R4, R4, R1
        eor         R5, R5, R1
        eor         R6, R6, R1
        stm         R0!, {R3, R4, R5, R6}

        subs        R2, R2, #+0x20              ; Carry stays set while another full block remains
        bcs         arm_memxor_LoopHandleBulkWord

        ;-
        ; Process up to 7 trailing words. R2 may be negative here, but its low
        ; 5 bits still equal the number of bytes left (32 was subtracted).
        ;-
arm_memxor_HandleTrailingWords:
        movs        R7, R2, LSL #28             ; C = bit 4 of NumBytes, N = bit 3 of NumBytes

        ldmcs       R0,  {R3, R4, R5, R6}       ; C set: process 16 bytes
        eorcs       R3, R3, R1
        eorcs       R4, R4, R1
        eorcs       R5, R5, R1
        eorcs       R6, R6, R1
        stmcs       R0!, {R3, R4, R5, R6}

        ldmmi       R0,  {R3, R4}               ; N set: process 8 bytes
        eormi       R3, R3, R1
        eormi       R4, R4, R1
        stmmi       R0!, {R3, R4}

        movs        R7, R2, LSL #+30            ; C = bit 2 of NumBytes, Z = 1 if bits 1..0 are both zero

        ldrcs       R3, [R0]                    ; C set: process 4 bytes
        eorcs       R3, R3, R1
        strcs       R3, [R0], #+4

        ldmia       SP!, {R4, R5, R6, R7}       ; Restore saved registers (flags untouched)
        bxeq        LR                          ; Z set: no trailing bytes left, return

        ;-
        ; Process up to 3 trailing bytes.
        ; Byte loads are used here: R0 may have any alignment on this path, and
        ; a word load would also read beyond the end of the buffer.
        ;-
arm_memxor_HandleTrailingBytes:
        movs        R2, R2, LSL #+31            ; N = bit 0 of NumBytes, C = bit 1 of NumBytes; R2 is free afterwards

        ldrbmi      R2, [R0]                    ; N set: process 1 byte
        eormi       R2, R2, R1
        strbmi      R2, [R0], #+1

        ldrbcs      R2, [R0]                    ; C set: process 2 bytes
        eorcs       R2, R2, R1
        strbcs      R2, [R0], #+1

        ldrbcs      R2, [R0]
        eorcs       R2, R2, R1
        strbcs      R2, [R0], #+1

        bx          LR

; int arm_memxor8(void* pDest, U32 c, U32 NumBytes);
;                      r0         r1     r2
;
; XOR every one of the NumBytes bytes starting at pDest with the low 8 bits
; of c, in place. The destination may have any alignment: it is stepped to a
; 16-bit, then a 32-bit and (for 16 or more remaining bytes) a 128-bit
; boundary before the main loop processes 16 bytes per iteration.
;
;   R0        pDest (advanced as data is written back)
;   R1        c, replicated to 32 bits
;   R2        NumBytes (remaining count, compared as a signed value)
;   R3        Data
;   R4-R6     Data (saved/restored on the stack)
;
; The prototype declares an int result, but no value is explicitly placed in
; R0; on return R0 holds the destination pointer advanced past the last byte
; processed.
;-------------------------------------------------------------------------------
arm_memxor8:
;-------------------------------------------------------------------------------
        stmdb       SP!, {R4, R5, R6}

        and         R1, R1, #0xFF             ; Use only the low byte of c
        orr         R1, R1, R1, LSL #+8       ; Replicate to 16 bits
        orr         R1, R1, R1, LSL #+16      ; Replicate to 32 bits

        cmp         R2, #4
        blt         arm_memxor8_loop3         ; Fewer than 4 bytes: byte accesses only

        ; Alignment is unknown: process one byte if the address is odd
        tst         R0, #1

        ldrneb      R6, [R0]
        eorne       R6, R6, R1
        strneb      R6, [R0], #1

        subne       R2, R2, #1

        ; Now we are 16-bit aligned: process one halfword if bit 1 is set
        tst         R0, #2

        ldrneh      R6, [R0]
        eorne       R6, R6, R1
        strneh      R6, [R0], #2

        subne       R2, R2, #2

        ; Now we are 32-bit aligned
        cmp         R2, #16
        blt         arm_memxor8_loop2         ; Fewer than 16 bytes left: skip the 128-bit alignment steps
        tst         R0, #4

        ldrne       R6, [R0]
        eorne       R6, R6, R1
        strne       R6, [R0], #4
        ; Now we are 64-bit aligned
        subne       R2, R2, #4
        tst         R0, #8

        ldmneia     R0, {R3, R6}
        eorne       R3, R3, R1
        eorne       R6, R6, R1
        stmneia     R0!, {R3, R6}

        subne       R2, R2, #8

        ; Now we are 128-bit aligned
arm_memxor8_loop1:
        ; Process 4 32-bit values per loop iteration
        subs        R2, R2, #16

        ldmgeia     R0,  {R3, R4, R5, R6}
        eorge       R3, R3, R1
        eorge       R4, R4, R1
        eorge       R5, R5, R1
        eorge       R6, R6, R1
        stmgeia     R0!, {R3, R4, R5, R6}

        bge         arm_memxor8_loop1
        add         R2, R2, #16               ; Undo the final subtraction: R2 = bytes left (0..15)

arm_memxor8_loop2:
        ; Process up to 3 remaining 32-bit values
        tst         R2, #8

        ldmneia     R0, {R3, R4}
        eorne       R3, R3, R1
        eorne       R4, R4, R1
        stmneia     R0!, {R3, R4}

        tst         R2, #4

        ldrne       R3, [R0]
        eorne       R3, R3, R1
        strne       R3, [R0], #4

        and         R2, R2, #3

arm_memxor8_loop3:
        ; Process up to 3 remaining bytes. Each step writes back R3, the
        ; XOR result, never the pattern register R1.
        subs        R2, R2, #1

        ldrgeb      R3, [R0]
        eorge       R3, R3, R1
        strgeb      R3, [R0], #1

        subs        R2, R2, #1

        ldrgeb      R3, [R0]
        eorge       R3, R3, R1
        strgeb      R3, [R0], #1

        subs        R2, R2, #1

        ldrgeb      R3, [R0]
        eorge       R3, R3, R1
        strgeb      R3, [R0], #1

        ldmia       SP!, {R4, R5, R6}
        bx          LR

; int arm_memxor16(void* pDest, U32 c, U32 NumHalfWords);
;                      r0         r1     r2
;
; XOR every one of the NumHalfWords 16-bit items starting at pDest with the
; low 16 bits of c, in place. pDest is expected to be at least 16-bit aligned
; (halfword and word accesses are used without any byte-alignment step).
;
;   R0        pDest (advanced as data is written back)
;   R1        c, replicated to 32 bits
;   R2        NumHalfWords (remaining count, compared as a signed value)
;   R3        Data
;   R4-R6     Data (saved/restored on the stack)
;
; The prototype declares an int result, but no value is explicitly placed in
; R0; on return R0 holds the destination pointer advanced past the last item
; processed.
;-------------------------------------------------------------------------------
arm_memxor16:
;-------------------------------------------------------------------------------
        stmdb       SP!, {R4, R5, R6}

        ; Replicate the low halfword of c into both halves of R1. Shifting it
        ; up and OR-ing it back down discards any upper bits of c.
        mov         R1, R1, LSL #16
        orr         R1, R1, R1, LSR #16

        cmp         R2, #2
        blt         arm_memxor16_HandleTrailingHalfWord    ; 1 or 0

        ; Alignment is known to be at least 16-bit
        tst         R0, #2

        ldrneh      R6, [R0]
        eorne       R6, R6, R1
        strneh      R6, [R0], #2              ; xxxx-xx10 --->

        subne       R2, R2, #1                ; xxxx-xx00

        ; Now we are 32-bit aligned
        cmp         R2, #8
        blt         arm_memxor16_HandleTrailingWords       ; 7, 6, ... 0

        tst         R0, #4

        ldrne       R3, [R0]
        eorne       R3, R3, R1
        strne       R3, [R0], #4              ; xxxx-x100 --->

        subne       R2, R2, #2                ; xxxx-x000 --->

        ; Now we are 64-bit aligned
        tst         R0, #8

        ldmneia     R0, {R3, R4}
        eorne       R3, R3, R1
        eorne       R4, R4, R1
        stmneia     R0!, {R3, R4}             ; xxxx-1000 --->

        subne       R2, R2, #4                ; xxxx-0000 --->

arm_memxor16_HandleBulkWordData:
        ; Now we are 128-bit aligned

arm_memxor16_LoopHandleBulkWord:
        ; Process 4 32-bit values (8 halfwords) per loop iteration
        subs        R2, R2, #8

        ldmgeia     R0,  {R3, R4, R5, R6}
        eorge       R3, R3, R1
        eorge       R4, R4, R1
        eorge       R5, R5, R1
        eorge       R6, R6, R1
        stmgeia     R0!, {R3, R4, R5, R6}

        bge         arm_memxor16_LoopHandleBulkWord
        add         R2, R2, #8                ; Undo the final subtraction: R2 = halfwords left (0..7)

arm_memxor16_HandleTrailingWords:
        ; Process up to 3 remaining 32-bit values
        tst         R2, #4

        ldmneia     R0, {R3, R4}
        eorne       R3, R3, R1
        eorne       R4, R4, R1
        stmneia     R0!, {R3, R4}

        tst         R2, #2

        ldrne       R3, [R0]
        eorne       R3, R3, R1
        strne       R3, [R0], #4

        and         R2, R2, #1

arm_memxor16_HandleTrailingHalfWord:
        ; Process up to 1 remaining 16-bit value
        subs        R2, R2, #1

        ldrgeh      R3, [R0]
        eorge       R3, R3, R1
        strgeh      R3, [R0], #2

        ldmia       SP!, {R4, R5, R6}
        bx          LR

; int arm_memxor32(void* pDest, U32 c, U32 NumWords);
;                      r0         r1     r2
;
; XOR every one of the NumWords 32-bit items starting at pDest with c, in
; place. Word accesses are used throughout, so pDest is expected to be word
; aligned.
;
;   R0        pDest (advanced as data is written back)
;   R1        c
;   R2        NumWords (remaining count, compared as a signed value)
;   R3        Data
;   R4-R6     Data (saved/restored on the stack)
;
; The prototype declares an int result, but no value is explicitly placed in
; R0; on return R0 holds the destination pointer advanced past the last word
; processed.
;-------------------------------------------------------------------------------
arm_memxor32:
;-------------------------------------------------------------------------------
        stmdb       SP!, {R4, R5, R6}

        cmp         R2, #4
        blt         arm_memxor32_loop2        ; Fewer than 4 words: single word accesses only

        ; Alignment is known to be at least 32-bit, is it 64-bit aligned ?
        tst         R0, #4
        ; No, it is 32-bit aligned: process one word
        ldrne       R3, [R0]
        eorne       R3, R3, R1
        strne       R3, [R0], #4
        subne       R2, R2, #1

        ; Now we are 64-bit aligned, is it 128-bit aligned ?
        tst         R0, #8
        ; No, it is 64-bit aligned: process two words
        ldmneia     R0, {R3, R4}
        eorne       R3, R3, R1
        eorne       R4, R4, R1
        stmneia     R0!, {R3, R4}             ; xxxx-1000 --->
        subne       R2, R2, #2

        ; Now we are 128-bit aligned
arm_memxor32_loop1:
        ; Process 4 32-bit values per loop iteration
        subs        R2, R2, #4

        ldmgeia     R0,  {R3, R4, R5, R6}
        eorge       R3, R3, R1
        eorge       R4, R4, R1
        eorge       R5, R5, R1
        eorge       R6, R6, R1
        stmgeia     R0!, {R3, R4, R5, R6}

        bge         arm_memxor32_loop1
        add         R2, R2, #4                ; Undo the final subtraction: R2 = words left (0..3)

arm_memxor32_loop2:
        ; Process up to 3 remaining 32-bit values

        subs        R2, R2, #1
        ldrge       R3, [R0]
        eorge       R3, R3, R1
        strge       R3, [R0], #4

        subs        R2, R2, #1
        ldrge       R3, [R0]
        eorge       R3, R3, R1
        strge       R3, [R0], #4

        subs        R2, R2, #1
        ldrge       R3, [R0]
        eorge       R3, R3, R1
        strge       R3, [R0], #4

        ldmia       SP!, {R4, R5, R6}
        bx          LR

END
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: