ARM Memory Copy
2015-07-16 14:55
465 查看
;===============================================================================
; Module ARM_MEMORY
;
; Hand-written ARM (A32) memory primitives, IAR assembler syntax:
;
;   ARM_MEMCPY                     copy bytes
;   ARM_MEMSET   / ARM_MEMSET8     fill with an  8-bit pattern (count in bytes)
;   ARM_MEMSET16                   fill with a  16-bit pattern (count in half-words)
;   ARM_MEMSET32                   fill with a  32-bit pattern (count in words)
;   arm_memxor   / arm_memxor8     XOR  with an  8-bit pattern (count in bytes)
;   arm_memxor16                   XOR  with a  16-bit pattern (count in half-words)
;   arm_memxor32                   XOR  with a  32-bit pattern (count in words)
;
; Arguments arrive in R0, R1, R2. R3 and R12 are used as scratch without
; being saved; R4-R7 and LR are pushed/popped wherever a routine uses them.
; No routine loads R0 with a result: on return R0 holds the destination
; pointer advanced past the last element processed.
;
; Recurring idiom: "movs Rx, Rcount, LSL #k" moves two bits of the remaining
; count into the C and N flags so the following conditional loads/stores
; transfer exactly the remaining amount without branching:
;       LSL #28 : C = bit 4 (16 bytes)   N = bit 3 (8 bytes)
;       LSL #30 : C = bit 2 ( 4 bytes)   Z = 1 when bits 1..0 are both zero
;       LSL #31 : C = bit 1 ( 2 bytes)   N = bit 0 (1 byte)
;===============================================================================

        MODULE  ARM_MEMORY

        PUBLIC  ARM_MEMCPY
        PUBLIC  ARM_MEMSET
        PUBLIC  ARM_MEMSET8
        PUBLIC  ARM_MEMSET16
        PUBLIC  ARM_MEMSET32
        PUBLIC  arm_memxor              ; the memxor entry points were not exported
        PUBLIC  arm_memxor8             ; before, which made them unreachable from
        PUBLIC  arm_memxor16            ; other modules
        PUBLIC  arm_memxor32

        SECTION .text:CODE:NOROOT(2)
        CODE32

;-------------------------------------------------------------------------------
; void ARM_MEMCPY(void* pDest, void* pSrc, U32 NumBytes)
;
; Function description
;   Copy NumBytes bytes from pSrc to pDest. The copy runs from low to high
;   addresses.
;
; Register usage:
;   R0      pDest    (advanced while copying)
;   R1      pSrc     (advanced while copying)
;   R2      NumBytes (remaining count; only its low bits matter at the end)
;   R3      Data / source mis-alignment
;   R4      Data (saved on the stack in the bulk path)
;   R12     Data / destination mis-alignment
;   R14     Data in the bulk path (LR is saved on the stack there)
;-------------------------------------------------------------------------------
ARM_MEMCPY:
        cmp     R2, #+3
        bls     ARM_MEMCPY_HandleTrailingBytes  ; Less than one word: byte copy only
        ands    R12, R0, #+3                    ; R12 = pDest & 3
        beq     ARM_MEMCPY_DestIsDWordAligned

        ;
        ; Copy 4 - R12 bytes so that the destination becomes word aligned.
        ; After "cmp R12, #2":  LS holds for R12 = 1 or 2 (a 2nd byte is needed),
        ;                       CC holds for R12 = 1      (a 3rd byte is needed).
        ;
        ldrb    R3, [R1], #+1                   ; 1st byte is always needed
        cmp     R12, #+2
        add     R2, R2, R12                     ; NumBytes -= 4 - R12 (part 1)
        ldrbls  R12, [R1], #+1                  ; 2nd byte
        strb    R3, [R0], #+1
        ldrbcc  R3, [R1], #+1                   ; 3rd byte
        strbls  R12, [R0], #+1
        sub     R2, R2, #+4                     ; NumBytes -= 4 - R12 (part 2)
        strbcc  R3, [R0], #+1

        ;
        ; Destination is word aligned. Pick the transfer method from the
        ; source alignment.
        ;
ARM_MEMCPY_DestIsDWordAligned:
        ands    R3, R1, #+3                     ; R3 = pSrc & 3
        beq     ARM_MEMCPY_HandleBulkWordData   ; Both aligned: multi-word transfers
        subs    R2, R2, #+4
        bcc     ARM_MEMCPY_HandleTrailingBytes  ; Less than one word left
        ldr     R12, [R1, -R3]!                 ; Word-align pSrc and read the word holding the first source bytes
        cmp     R3, #+2
        beq     ARM_MEMCPY_Loop16BitShift
        bhi     ARM_MEMCPY_Loop24BitShift

        ;
        ; Mis-aligned source: each destination word is assembled from the
        ; unused upper part of the previously loaded aligned word and the
        ; lower part of the next one. On loop exit R1 is moved forward by
        ; the number of bytes already consumed from the last loaded word.
        ;
ARM_MEMCPY_Loop8BitShift:                       ; pSrc & 3 == 1
        mov     R3, R12, LSR #+8
        ldr     R12, [R1, #+4]!
        subs    R2, R2, #+4
        orr     R3, R3, R12, LSL #+24
        str     R3, [R0], #+4
        bcs     ARM_MEMCPY_Loop8BitShift
        add     R1, R1, #+1
        b       ARM_MEMCPY_HandleTrailingBytes

ARM_MEMCPY_Loop16BitShift:                      ; pSrc & 3 == 2
        mov     R3, R12, LSR #+16
        ldr     R12, [R1, #+4]!
        subs    R2, R2, #+4
        orr     R3, R3, R12, LSL #+16
        str     R3, [R0], #+4
        bcs     ARM_MEMCPY_Loop16BitShift
        add     R1, R1, #+2
        b       ARM_MEMCPY_HandleTrailingBytes

ARM_MEMCPY_Loop24BitShift:                      ; pSrc & 3 == 3
        mov     R3, R12, LSR #+24
        ldr     R12, [R1, #+4]!
        subs    R2, R2, #+4
        orr     R3, R3, R12, LSL #+8
        str     R3, [R0], #+4
        bcs     ARM_MEMCPY_Loop24BitShift
        add     R1, R1, #+3
        b       ARM_MEMCPY_HandleTrailingBytes

        ;
        ; Source and destination aligned: blocks of 8 words (32 bytes).
        ; stmdb does not touch the flags set by the preceding subs.
        ;
ARM_MEMCPY_HandleBulkWordData:
        subs    R2, R2, #+0x20
        stmdb   SP!, {R4, LR}
        bcc     ARM_MEMCPY_HandleTrailingWords
ARM_MEMCPY_LoopHandleBulkWord:
        ldm     R1!, {R3, R4, R12, LR}          ; 16 bytes
        stm     R0!, {R3, R4, R12, LR}
        ldm     R1!, {R3, R4, R12, LR}          ; 16 bytes
        stm     R0!, {R3, R4, R12, LR}
        subs    R2, R2, #+0x20
        bcs     ARM_MEMCPY_LoopHandleBulkWord

        ;
        ; Up to 7 trailing words, selected by bits 4..2 of the count
        ;
ARM_MEMCPY_HandleTrailingWords:
        movs    R12, R2, LSL #28                ; C = bit 4, N = bit 3
        ldmcs   R1!, {R3, R4, R12, LR}          ; 16 bytes
        stmcs   R0!, {R3, R4, R12, LR}
        ldmmi   R1!, {R3, R4}                   ;  8 bytes
        stmmi   R0!, {R3, R4}
        movs    R12, R2, LSL #+30               ; C = bit 2, Z = no trailing bytes
        ldmia   SP!, {R4, LR}
        ldrcs   R3, [R1], #+4                   ;  4 bytes
        strcs   R3, [R0], #+4
        bxeq    LR

        ;
        ; Up to 3 trailing bytes, selected by bits 1..0 of the count
        ;
ARM_MEMCPY_HandleTrailingBytes:
        movs    R2, R2, LSL #+31                ; C = bit 1, N = bit 0
        ldrbmi  R2, [R1], #+1                   ; 1 byte
        ldrbcs  R3, [R1], #+1                   ; 2 bytes
        ldrbcs  R12, [R1], #+1
        strbmi  R2, [R0], #+1
        strbcs  R3, [R0], #+1
        strbcs  R12, [R0], #+1
        bx      LR

;-------------------------------------------------------------------------------
; void ARM_MEMSET(void* pDest, U32 c, U32 NumBytes)
;
; Function description
;   Fill NumBytes bytes starting at pDest with the low 8 bits of c.
;   c is masked to 8 bits before it is replicated, so word stores and byte
;   stores always write the same value.
;
; Register usage:
;   R0      pDest    (advanced while filling)
;   R1      c        (replicated into all four byte lanes)
;   R2      NumBytes
;   R3      Destination mis-alignment, then copy of the pattern
;   R4, R5  Copies of the pattern (saved on the stack)
;   R6      Scratch target for the flag-setting shifts (saved on the stack)
;-------------------------------------------------------------------------------
ARM_MEMSET:
        and     R1, R1, #0xFF                   ; Only the low byte of c is the fill value
        orr     R1, R1, R1, LSL #+8
        orr     R1, R1, R1, LSL #+16            ; R1 = c in every byte lane
        cmp     R2, #+3
        bls     ARM_MEMSET_HandleTrailingBytes  ; Less than one word: byte stores only
        ands    R3, R0, #+3                     ; R3 = pDest & 3
        beq     ARM_MEMSET_DestIsAligned

        ;
        ; Store 4 - R3 bytes so that the destination becomes word aligned.
        ; LS: R3 = 1 or 2 (2nd byte needed), CC: R3 = 1 (3rd byte needed).
        ;
        strb    R1, [R0], #+1
        cmp     R3, #+2
        add     R2, R2, R3                      ; NumBytes -= 4 - R3 (part 1)
        strbls  R1, [R0], #+1
        sub     R2, R2, #+4                     ; NumBytes -= 4 - R3 (part 2)
        strbcc  R1, [R0], #+1

        ;
        ; Destination aligned: blocks of 8 words (32 bytes)
        ;
ARM_MEMSET_DestIsAligned:
ARM_MEMSET_HandleBulkWordData:
        stmdb   SP!, {R4, R5, R6}
        mov     R3, R1
        mov     R4, R1
        mov     R5, R1
        subs    R2, R2, #+0x20
        bcc     ARM_MEMSET_HandleTrailingWords
ARM_MEMSET_LoopHandleBulkWord:
        stm     R0!, {R1, R3, R4, R5}           ; 16 bytes
        stm     R0!, {R1, R3, R4, R5}           ; 16 bytes
        subs    R2, R2, #+0x20
        bcs     ARM_MEMSET_LoopHandleBulkWord

        ;
        ; Up to 7 trailing words
        ;
ARM_MEMSET_HandleTrailingWords:
        movs    R6, R2, LSL #28                 ; C = bit 4, N = bit 3
        stmcs   R0!, {R1, R3, R4, R5}           ; 16 bytes
        stmmi   R0!, {R1, R3}                   ;  8 bytes
        movs    R6, R2, LSL #+30                ; C = bit 2, Z = no trailing bytes
        strcs   R1, [R0], #+4                   ;  4 bytes
        ldmia   SP!, {R4, R5, R6}
        bxeq    LR

        ;
        ; Up to 3 trailing bytes
        ;
ARM_MEMSET_HandleTrailingBytes:
        movs    R2, R2, LSL #+31                ; C = bit 1, N = bit 0
        strbmi  R1, [R0], #+1                   ; 1 byte
        strbcs  R1, [R0], #+1                   ; 2 bytes
        strbcs  R1, [R0], #+1
        bx      LR

;-------------------------------------------------------------------------------
; int ARM_MEMSET8(void* pDest, U32 c, U32 NumBytes)
;
; Function description
;   Fill NumBytes bytes starting at pDest with the low 8 bits of c.
;   pDest may have any alignment. The destination is aligned step by step
;   (byte, half-word, word, double word) before 16-byte blocks are stored.
;   NOTE(review): the prototype declares an int result, but no result is
;   loaded into R0 - confirm what callers expect.
;
; Register usage:
;   R0              pDest (advanced while filling)
;   R1              c     (widened to 16, then 32 bits on the way)
;   R2              NumBytes
;   R3, R4, R5      Copies of the pattern (R4, R5 saved on the stack)
;-------------------------------------------------------------------------------
ARM_MEMSET8:
        stmdb   SP!, {R4, R5}
        and     R1, R1, #0xFF                   ; Only the low byte of c is the fill value
        cmp     R2, #4
        blt     ARM_MEMSET8_loop3

        tst     R0, #1                          ; Odd address: one byte
        strneb  R1, [R0], #1
        subne   R2, R2, #1

        orr     R1, R1, R1, LSL #8              ; 16-bit aligned now; widen c to 16 bits
        tst     R0, #2
        strneh  R1, [R0], #2
        subne   R2, R2, #2

        orr     R1, R1, R1, LSL #16             ; 32-bit aligned now; widen c to 32 bits
        mov     R3, R1
        cmp     R2, #16
        blt     ARM_MEMSET8_loop2

        tst     R0, #4
        strne   R1, [R0], #4
        subne   R2, R2, #4
        tst     R0, #8
        stmneia R0!, {R1, R3}
        subne   R2, R2, #8

        mov     R4, R1                          ; 128-bit aligned now
        mov     R5, R1
ARM_MEMSET8_loop1:                              ; 16 bytes per iteration
        subs    R2, R2, #16
        stmgeia R0!, {R1, R3, R4, R5}
        bge     ARM_MEMSET8_loop1
        add     R2, R2, #16                     ; Undo the last subtraction

ARM_MEMSET8_loop2:                              ; Up to 3 remaining words
        tst     R2, #8
        stmneia R0!, {R1, R3}
        tst     R2, #4
        strne   R1, [R0], #4
        and     R2, R2, #3

ARM_MEMSET8_loop3:                              ; Up to 3 remaining bytes
        subs    R2, R2, #1
        strgeb  R1, [R0], #1
        subs    R2, R2, #1
        strgeb  R1, [R0], #1
        subs    R2, R2, #1
        strgeb  R1, [R0], #1
        ldmia   SP!, {R4, R5}
        bx      LR

;-------------------------------------------------------------------------------
; int ARM_MEMSET16(void* pDest, U32 c, U32 NumHalfWords)
;
; Function description
;   Fill NumHalfWords 16-bit units starting at pDest with the low 16 bits
;   of c. The code uses half-word and word stores on pDest without checking
;   bit 0, i.e. it relies on pDest being at least 16-bit aligned.
;   NOTE(review): the prototype declares an int result, but no result is
;   loaded into R0 - confirm what callers expect.
;
; Register usage:
;   R0              pDest (advanced while filling)
;   R1              c     (widened to 32 bits)
;   R2              NumHalfWords
;   R3, R4, R5      Copies of the pattern (R4, R5 saved on the stack)
;-------------------------------------------------------------------------------
ARM_MEMSET16:
        stmdb   SP!, {R4, R5}
        cmp     R2, #2
        blt     ARM_MEMSET16_HandleTrailingHalfWord     ; 0 or 1 half-word

        tst     R0, #2                          ; xxxx-xx10 -> one half-word
        strneh  R1, [R0], #2
        subne   R2, R2, #1

        mov     R1, R1, LSL #16                 ; 32-bit aligned now; widen c to 32 bits.
        orr     R1, R1, R1, LSR #16             ; Bits 31..16 of the argument are dropped.
        mov     R4, R1
        cmp     R2, #8
        blt     ARM_MEMSET16_HandleTrailingWords        ; 0..7 half-words

        tst     R0, #4                          ; xxxx-x100 -> one word
        strne   R1, [R0], #4
        subne   R2, R2, #2
        tst     R0, #8                          ; xxxx-1000 -> two words
        stmneia R0!, {R1, R4}
        subne   R2, R2, #4

ARM_MEMSET16_HandleBulkWordData:                ; 128-bit aligned now
        mov     R5, R1
        mov     R3, R1
ARM_MEMSET16_LoopHandleBulkWord:                ; 8 half-words per iteration
        subs    R2, R2, #8
        stmgeia R0!, {R1, R3, R4, R5}
        bge     ARM_MEMSET16_LoopHandleBulkWord
        add     R2, R2, #8                      ; Undo the last subtraction

ARM_MEMSET16_HandleTrailingWords:               ; Up to 3 remaining words
        tst     R2, #4
        stmneia R0!, {R1, R4}
        tst     R2, #2
        strne   R1, [R0], #4
        and     R2, R2, #1

ARM_MEMSET16_HandleTrailingHalfWord:            ; Up to 1 remaining half-word
        subs    R2, R2, #1
        strgeh  R1, [R0], #2
        ldmia   SP!, {R4, R5}
        bx      LR

;-------------------------------------------------------------------------------
; int ARM_MEMSET32(void* pDest, U32 c, U32 NumWords)
;
; Function description
;   Fill NumWords 32-bit units starting at pDest with c. The code uses word
;   stores on pDest without checking bits 1..0, i.e. it relies on pDest
;   being at least 32-bit aligned.
;   NOTE(review): the prototype declares an int result, but no result is
;   loaded into R0 - confirm what callers expect.
;
; Register usage:
;   R0              pDest (advanced while filling)
;   R1              c
;   R2              NumWords
;   R3, R4, R5      Copies of the pattern (R4, R5 saved on the stack)
;-------------------------------------------------------------------------------
ARM_MEMSET32:
        stmdb   SP!, {R4, R5}
        cmp     R2, #4
        blt     ARM_MEMSET32_loop2

        mov     R3, R1
        tst     R0, #4                          ; Not 64-bit aligned: one word
        strne   R1, [R0], #4
        subne   R2, R2, #1
        tst     R0, #8                          ; Not 128-bit aligned: two words
        stmneia R0!, {R1, R3}
        subne   R2, R2, #2

        mov     R4, R1                          ; 128-bit aligned now
        mov     R5, R1
ARM_MEMSET32_loop1:                             ; 4 words per iteration
        subs    R2, R2, #4
        stmgeia R0!, {R1, R3, R4, R5}
        bge     ARM_MEMSET32_loop1
        add     R2, R2, #4                      ; Undo the last subtraction

ARM_MEMSET32_loop2:                             ; Up to 3 remaining words
        subs    R2, R2, #1
        strge   R1, [R0], #4
        subs    R2, R2, #1
        strge   R1, [R0], #4
        subs    R2, R2, #1
        strge   R1, [R0], #4
        ldmia   SP!, {R4, R5}
        bx      LR

;-------------------------------------------------------------------------------
; __arm void arm_memxor(void* pDest, U32 c, U32 NumBytes)
;
; Function description
;   XOR NumBytes bytes starting at pDest, in place, with the low 8 bits
;   of c. Same structure as ARM_MEMSET, with a load/eor/store sequence in
;   place of every store.
;
; Register usage:
;   R0          pDest (advanced while processing)
;   R1          c     (replicated into all four byte lanes)
;   R2          NumBytes
;   R3          Destination mis-alignment, then data
;   R12         Data while aligning the destination
;   R4, R5, R6  Data (saved on the stack)
;   R7          Scratch target for the flag-setting shifts (saved on the stack)
;-------------------------------------------------------------------------------
arm_memxor:
        and     R1, R1, #0xFF                   ; Only the low byte of c is the XOR value
        orr     R1, R1, R1, LSL #+8
        orr     R1, R1, R1, LSL #+16            ; R1 = c in every byte lane
        cmp     R2, #+3
        bls     arm_memxor_HandleTrailingBytes  ; Less than one word: bytes only
        ands    R3, R0, #+3                     ; R3 = pDest & 3
        beq     arm_memxor_DestIsAligned

        ;
        ; Process 4 - R3 bytes so that the destination becomes word aligned.
        ; LS: R3 = 1 or 2 (2nd byte needed), CC: R3 = 1 (3rd byte needed).
        ; eor without the S suffix leaves the flags from cmp intact.
        ;
        ldrb    R12, [R0]                       ; 1st byte is always needed
        eor     R12, R12, R1
        strb    R12, [R0], #+1
        cmp     R3, #+2
        add     R2, R2, R3                      ; NumBytes -= 4 - R3 (part 1)
        ldrbls  R12, [R0]                       ; 2nd byte
        eorls   R12, R12, R1
        strbls  R12, [R0], #+1
        sub     R2, R2, #+4                     ; NumBytes -= 4 - R3 (part 2)
        ldrbcc  R12, [R0]                       ; 3rd byte
        eorcc   R12, R12, R1
        strbcc  R12, [R0], #+1

        ;
        ; Destination aligned: blocks of 8 words (32 bytes)
        ;
arm_memxor_DestIsAligned:
arm_memxor_HandleBulkWordData:
        stmdb   SP!, {R4, R5, R6, R7}
        subs    R2, R2, #+0x20
        bcc     arm_memxor_HandleTrailingWords
arm_memxor_LoopHandleBulkWord:
        ldm     R0, {R3, R4, R5, R6}            ; 16 bytes
        eor     R3, R3, R1
        eor     R4, R4, R1
        eor     R5, R5, R1
        eor     R6, R6, R1
        stm     R0!, {R3, R4, R5, R6}
        ldm     R0, {R3, R4, R5, R6}            ; 16 bytes
        eor     R3, R3, R1
        eor     R4, R4, R1
        eor     R5, R5, R1
        eor     R6, R6, R1
        stm     R0!, {R3, R4, R5, R6}
        subs    R2, R2, #+0x20
        bcs     arm_memxor_LoopHandleBulkWord

        ;
        ; Up to 7 trailing words
        ;
arm_memxor_HandleTrailingWords:
        movs    R7, R2, LSL #28                 ; C = bit 4, N = bit 3
        ldmcs   R0, {R3, R4, R5, R6}            ; 16 bytes
        eorcs   R3, R3, R1
        eorcs   R4, R4, R1
        eorcs   R5, R5, R1
        eorcs   R6, R6, R1
        stmcs   R0!, {R3, R4, R5, R6}
        ldmmi   R0, {R3, R4}                    ;  8 bytes
        eormi   R3, R3, R1
        eormi   R4, R4, R1
        stmmi   R0!, {R3, R4}
        movs    R7, R2, LSL #+30                ; C = bit 2, Z = no trailing bytes
        ldrcs   R3, [R0]                        ;  4 bytes
        eorcs   R3, R3, R1
        strcs   R3, [R0], #+4
        ldmia   SP!, {R4, R5, R6, R7}
        bxeq    LR

        ;
        ; Up to 3 trailing bytes.
        ; Byte loads are required here: R0 may be unaligned (it always is
        ; after the first byte), and a word load would also read beyond the
        ; bytes this call is allowed to touch.
        ;
arm_memxor_HandleTrailingBytes:
        movs    R2, R2, LSL #+31                ; C = bit 1, N = bit 0
        ldrbmi  R3, [R0]                        ; 1 byte
        eormi   R3, R3, R1
        strbmi  R3, [R0], #+1
        ldrbcs  R3, [R0]                        ; 2 bytes
        eorcs   R3, R3, R1
        strbcs  R3, [R0], #+1
        ldrbcs  R3, [R0]
        eorcs   R3, R3, R1
        strbcs  R3, [R0], #+1
        bx      LR

;-------------------------------------------------------------------------------
; __arm int arm_memxor8(void* pDest, U32 c, U32 NumBytes)
;
; Function description
;   XOR NumBytes bytes starting at pDest, in place, with the low 8 bits
;   of c. pDest may have any alignment. Same structure as ARM_MEMSET8.
;   NOTE(review): the prototype declares an int result, but no result is
;   loaded into R0 - confirm what callers expect.
;
; Register usage:
;   R0              pDest (advanced while processing)
;   R1              c     (replicated into all four byte lanes)
;   R2              NumBytes
;   R3              Data
;   R4, R5, R6      Data (saved on the stack)
;-------------------------------------------------------------------------------
arm_memxor8:
        stmdb   SP!, {R4, R5, R6}
        and     R1, R1, #0xFF                   ; Only the low byte of c is the XOR value
        orr     R1, R1, R1, LSL #+8
        orr     R1, R1, R1, LSL #+16            ; R1 = c in every byte lane
        cmp     R2, #4
        blt     arm_memxor8_loop3

        tst     R0, #1                          ; Odd address: one byte
        ldrneb  R6, [R0]
        eorne   R6, R6, R1
        strneb  R6, [R0], #1
        subne   R2, R2, #1

        tst     R0, #2                          ; 16-bit aligned now: one half-word
        ldrneh  R6, [R0]
        eorne   R6, R6, R1
        strneh  R6, [R0], #2
        subne   R2, R2, #2

        cmp     R2, #16                         ; 32-bit aligned now
        blt     arm_memxor8_loop2

        tst     R0, #4
        ldrne   R6, [R0]
        eorne   R6, R6, R1
        strne   R6, [R0], #4
        subne   R2, R2, #4
        tst     R0, #8
        ldmneia R0, {R3, R6}
        eorne   R3, R3, R1
        eorne   R6, R6, R1
        stmneia R0!, {R3, R6}
        subne   R2, R2, #8

arm_memxor8_loop1:                              ; 128-bit aligned now; 16 bytes per iteration
        subs    R2, R2, #16
        ldmgeia R0, {R3, R4, R5, R6}
        eorge   R3, R3, R1
        eorge   R4, R4, R1
        eorge   R5, R5, R1
        eorge   R6, R6, R1
        stmgeia R0!, {R3, R4, R5, R6}
        bge     arm_memxor8_loop1
        add     R2, R2, #16                     ; Undo the last subtraction

arm_memxor8_loop2:                              ; Up to 3 remaining words
        tst     R2, #8
        ldmneia R0, {R3, R4}
        eorne   R3, R3, R1
        eorne   R4, R4, R1
        stmneia R0!, {R3, R4}
        tst     R2, #4
        ldrne   R3, [R0]
        eorne   R3, R3, R1
        strne   R3, [R0], #4
        and     R2, R2, #3

arm_memxor8_loop3:                              ; Up to 3 remaining bytes
        subs    R2, R2, #1
        ldrgeb  R3, [R0]
        eorge   R3, R3, R1
        strgeb  R3, [R0], #1
        subs    R2, R2, #1
        ldrgeb  R3, [R0]
        eorge   R3, R3, R1
        strgeb  R3, [R0], #1                    ; Store the XOR result (R3), not the pattern (R1)
        subs    R2, R2, #1
        ldrgeb  R3, [R0]
        eorge   R3, R3, R1
        strgeb  R3, [R0], #1                    ; Store the XOR result (R3), not the pattern (R1)
        ldmia   SP!, {R4, R5, R6}
        bx      LR

;-------------------------------------------------------------------------------
; __arm int arm_memxor16(void* pDest, U32 c, U32 NumHalfWords)
;
; Function description
;   XOR NumHalfWords 16-bit units starting at pDest, in place, with the low
;   16 bits of c. The code uses half-word and word accesses on pDest without
;   checking bit 0, i.e. it relies on pDest being at least 16-bit aligned.
;   NOTE(review): the prototype declares an int result, but no result is
;   loaded into R0 - confirm what callers expect.
;
; Register usage:
;   R0              pDest (advanced while processing)
;   R1              c     (replicated into both half-word lanes)
;   R2              NumHalfWords
;   R3              Data
;   R4, R5, R6      Data (saved on the stack)
;-------------------------------------------------------------------------------
arm_memxor16:
        stmdb   SP!, {R4, R5, R6}
        mov     R1, R1, LSL #16                 ; Replicate the low 16 bits of c;
        orr     R1, R1, R1, LSR #16             ; bits 31..16 of the argument are dropped
        cmp     R2, #2
        blt     arm_memxor16_HandleTrailingHalfWord     ; 0 or 1 half-word

        tst     R0, #2                          ; xxxx-xx10 -> one half-word
        ldrneh  R6, [R0]
        eorne   R6, R6, R1
        strneh  R6, [R0], #2
        subne   R2, R2, #1

        cmp     R2, #8                          ; 32-bit aligned now
        blt     arm_memxor16_HandleTrailingWords        ; 0..7 half-words

        tst     R0, #4                          ; xxxx-x100 -> one word
        ldrne   R3, [R0]
        eorne   R3, R3, R1
        strne   R3, [R0], #4
        subne   R2, R2, #2
        tst     R0, #8                          ; xxxx-1000 -> two words
        ldmneia R0, {R3, R4}
        eorne   R3, R3, R1
        eorne   R4, R4, R1
        stmneia R0!, {R3, R4}
        subne   R2, R2, #4

arm_memxor16_HandleBulkWordData:                ; 128-bit aligned now
arm_memxor16_LoopHandleBulkWord:                ; 8 half-words per iteration
        subs    R2, R2, #8
        ldmgeia R0, {R3, R4, R5, R6}
        eorge   R3, R3, R1
        eorge   R4, R4, R1
        eorge   R5, R5, R1
        eorge   R6, R6, R1
        stmgeia R0!, {R3, R4, R5, R6}
        bge     arm_memxor16_LoopHandleBulkWord
        add     R2, R2, #8                      ; Undo the last subtraction

arm_memxor16_HandleTrailingWords:               ; Up to 3 remaining words
        tst     R2, #4
        ldmneia R0, {R3, R4}
        eorne   R3, R3, R1
        eorne   R4, R4, R1
        stmneia R0!, {R3, R4}
        tst     R2, #2
        ldrne   R3, [R0]
        eorne   R3, R3, R1
        strne   R3, [R0], #4
        and     R2, R2, #1

arm_memxor16_HandleTrailingHalfWord:            ; Up to 1 remaining half-word
        subs    R2, R2, #1
        ldrgeh  R3, [R0]
        eorge   R3, R3, R1
        strgeh  R3, [R0], #2
        ldmia   SP!, {R4, R5, R6}
        bx      LR

;-------------------------------------------------------------------------------
; __arm int arm_memxor32(void* pDest, U32 c, U32 NumWords)
;
; Function description
;   XOR NumWords 32-bit units starting at pDest, in place, with c. The code
;   uses word accesses on pDest without checking bits 1..0, i.e. it relies
;   on pDest being at least 32-bit aligned.
;   NOTE(review): the prototype declares an int result, but no result is
;   loaded into R0 - confirm what callers expect.
;
; Register usage:
;   R0              pDest (advanced while processing)
;   R1              c
;   R2              NumWords
;   R3              Data
;   R4, R5, R6      Data (saved on the stack)
;-------------------------------------------------------------------------------
arm_memxor32:
        stmdb   SP!, {R4, R5, R6}
        cmp     R2, #4
        blt     arm_memxor32_loop2

        tst     R0, #4                          ; Not 64-bit aligned: one word
        ldrne   R3, [R0]
        eorne   R3, R3, R1
        strne   R3, [R0], #4
        subne   R2, R2, #1
        tst     R0, #8                          ; Not 128-bit aligned: two words
        ldmneia R0, {R3, R4}
        eorne   R3, R3, R1
        eorne   R4, R4, R1
        stmneia R0!, {R3, R4}
        subne   R2, R2, #2

arm_memxor32_loop1:                             ; 128-bit aligned now; 4 words per iteration
        subs    R2, R2, #4
        ldmgeia R0, {R3, R4, R5, R6}
        eorge   R3, R3, R1
        eorge   R4, R4, R1
        eorge   R5, R5, R1
        eorge   R6, R6, R1
        stmgeia R0!, {R3, R4, R5, R6}
        bge     arm_memxor32_loop1
        add     R2, R2, #4                      ; Undo the last subtraction

arm_memxor32_loop2:                             ; Up to 3 remaining words
        subs    R2, R2, #1
        ldrge   R3, [R0]
        eorge   R3, R3, R1
        strge   R3, [R0], #4
        subs    R2, R2, #1
        ldrge   R3, [R0]
        eorge   R3, R3, R1
        strge   R3, [R0], #4
        subs    R2, R2, #1
        ldrge   R3, [R0]
        eorge   R3, R3, R1
        strge   R3, [R0], #4
        ldmia   SP!, {R4, R5, R6}
        bx      LR

        END
相关文章推荐
- 用shell脚本编写区别两个文件夹内文件的不同
- linux命令说明
- CentOS-6.5系统基础优化篇,附带优化脚本 推荐
- 快速理解Docker - 容器级虚拟化解决方案
- MMORPG服务器架构
- linux系统各种终端命令
- 构建高并发高可用的电商平台架构实践
- 使用Apache POI API读写Excel
- 【动态规划】[USACO2011 OPEN]修剪草坪
- shell对比文件内容脚本分享
- linux 查看系统信息命令
- linux 查看系统信息命令
- hyper-v 安装 openwrt x86 squashfs
- linux修改文件所属用户和组
- linux下自动获取并安装软件包 apt-get 的命令介绍
- ORA-1092 : opitsk aborting process---killed by oom killer
- 基于xmpp openfire smack开发之openfire介绍和部署[1]
- AVRO文件结构分析
- XShell配置Socket5代理
- Powershell查看AD 组成员的变化