;
; wcslen.asm
;
; Copyright (c) Microsoft Corporation. All rights reserved.
;
; Optimized wcslen and wcsnlen implementations for ARM64.
;

#include "ksarm64.h"

; size_t wcslen(const wchar_t *str);
; size_t wcsnlen(const wchar_t *str, size_t numberOfElements);
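
; For reference, the portable C semantics that these routines implement (a minimal
; sketch of the contract, not the optimized algorithm used below):
;
;   size_t wcslen(const wchar_t *str)
;   {
;       const wchar_t *p = str;
;       while (*p != L'\0')
;           ++p;
;       return (size_t)(p - str);                   /* length in wchar_t elements */
;   }
;
;   size_t wcsnlen(const wchar_t *str, size_t numberOfElements)
;   {
;       size_t n = 0;
;       while (n < numberOfElements && str[n] != L'\0')
;           ++n;
;       return n;                                   /* never reads past the buffer */
;   }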

; This file could also define wcsnlen_s. wcsnlen_s is currently defined in the headers (string.h & wchar.h) in C
; using a check for null and a call to wcsnlen. This avoids making the call in the case where the string is null,
; which should be infrequent. However it makes code larger by inlining that check everywhere wcsnlen_s is called.
; An alternative would be to modify the standard headers and define wcsnlen_s here. It would be just one instruction:
;
; LEAF_ENTRY wcsnlen_s
;     cbz     x0, AnyRet              ; AnyRet would be a label in front of any ret instruction. Return value in x0 is already 0.
;                                     ; fallthrough into wcsnlen code
; ALTERNATE_ENTRY wcsnlen             ; change LEAF_ENTRY for wcsnlen to ALTERNATE_ENTRY
;     ...
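
; The current header definition has roughly the following shape (a sketch of the
; scheme described above, not the exact CRT header text):
;
;   static __inline size_t wcsnlen_s(const wchar_t *str, size_t numberOfElements)
;   {
;       return (str == NULL) ? 0 : wcsnlen(str, numberOfElements);
;   }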

; Note: this code assumes that the input parameter is always aligned to an even byte boundary.

#if !defined(_M_ARM64EC)

        EXPORT A64NAME(wcslen) [FUNC]
        EXPORT A64NAME(wcsnlen) [FUNC]

#endif

        SET_COMDAT_ALIGNMENT 6

; With wcslen we will usually read some chars past the end of the string. To avoid getting an AV
; when a char-by-char implementation would not, we have to ensure that we never cross a page boundary with a
; vector load, so we must align the vector loads to 16-byte-aligned boundaries.
;
; For wcsnlen we know the buffer length and so we won't read any chars beyond the end of the buffer. This means
; we have a choice whether to arrange our vector loads to be 16-byte aligned. (Note that on arm64 a vector load
; only produces an alignment fault when the vector *elements* are misaligned, so an "8H" vector load will never
; give an alignment fault on any even address.) Aligning the vector loads on 16-byte boundaries saves one cycle
; per vector load instruction. The cost of forcing 16-byte aligned loads is the 10 instructions preceding the
; 'NoNeedToAlign' label below. On Cortex-A57, the execution latency of those 10 instructions is 26 cycles
; (one less than the strnlen case because uminv is 1 cycle faster for halfwords than it is for bytes),
; assuming no branch mispredict on the 'beq'. To account for the cost of an occasional mispredict we guess a
; mispredict rate of 2% and a mispredict cost of 50 cycles, or 1 cycle per call amortized, 27 cycles total.
; Since aligning saves one cycle per vector load and each load covers 8 wchar_t's, alignment breaks even at
; 27 loads, i.e. 27 * 8 = 216 elements, which is the threshold value below.
; In this analysis we are ignoring the chance of extra cache misses due to loads crossing cache lines when
; they are not 16-byte aligned. When the vector loads span cache line boundaries each cache line is referenced
; one more time than it is when the loads are aligned. But we assume that the cache line stays loaded for the
; short time we need to do all the references to it, and so one extra reference won't matter.
; It is expected that the number of cycles (27) will stay about the same for future processor models. If it
; changes radically, it will be worth converting the EQU to a global, using ldr to load it instead of a
; mov-immediate, and dynamically setting the global during CRT startup based on processor model.

__wcsnlen_forceAlignThreshold EQU 216       ; code logic below assumes >= 16

        ARM64EC_ENTRY_THUNK A64NAME(wcslen),1,0
        LEAF_ENTRY_COMDAT A64NAME(wcslen)

        ; check for empty string to avoid huge perf degradation in this case
        ldrh    w2, [x0], #0                ; load the first wchar_t (the #0 post-index leaves x0 unchanged)
        cbz     w2, EmptyStr

        mov     x5, x0                      ; keep original x0 value for the final 'sub'
        tbnz    x0, #0, WCharAtATime        ; check for misaligned characters. Must go char-by-char when
                                            ; misaligned so that if there's an access violation it gets
                                            ; generated on the correct address (one byte into a new page
                                            ; instead of up to 15 bytes in if we had loaded a vector
                                            ; from the last byte of the previous page)

        ; calculate number of bytes until first 16-byte alignment point

        ands    x1, x5, #15                 ; x1 = (addr mod 16)
        beq     WcslenMainLoop              ; no need to force alignment if already aligned

        ; we need to align; check whether we are within 16 bytes of the end of the page.
        ; branch if ((address mod PAGESIZE) > (PAGESIZE - 16))

        and     x2, x5, #4095               ; x2 = address AND (PAGESIZE - 1) = address mod PAGESIZE
        cmp     x2, #4080                   ; compare x2 to (PAGESIZE - 16)
        bgt     AlignSlowly                 ; too close to end of page, must align one wchar at a time
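
        ; For example (hypothetical address): a string starting at page offset 0xFF8
        ; gives (0xFF8 AND 0xFFF) = 4088 > 4080, and a 16-byte load from there would
        ; span offsets 4088..4103, crossing into the next page, so we take the slow
        ; path. At offset 4080 the load covers 4080..4095 and stays inside the page.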

        ; AlignFast: safe to do one 2-byte aligned vector load to force alignment to a 16-byte boundary

        ld1     v0.8h, [x5]                 ; don't post-increment x5
        uminv   h1, v0.8h
        fmov    w2, s1                      ; fmov is sometimes 1 cycle faster than "umov w2, v1.h[0]"
        cbz     w2, FindWideNullInVector    ; jump when the null is in these first 8 wchar_t's (string <= 7 wchar_t's) & not near end of page
        add     x5, x5, #16                 ; move x5 forward only to aligned address. (Assumes even address)
        and     x5, x5, 0xFFFFFFFFFFFFFFF0  ; first iter of WcslenMainLoop will retest some wchar_t's we already tested
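
        ; For example (hypothetical address): if x5 mod 16 == 6, adding 16 and then
        ; clearing the low 4 bits advances x5 by only 10 bytes, so the first aligned
        ; load re-tests the last 3 wchar_t's of the 8 we just checked.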

        ; The code at WcslenMainLoop should be 64-byte aligned for best performance.
        ; Due to VSO#1106651, automatic padding with NOPs when in code areas is
        ; broken. The workaround is to specify the padding value explicitly:
        ; -721215457 is 0xD503201F, the A64 NOP encoding, as a signed 32-bit decimal.
        ; MSFT:21876224 tracks removal of this workaround.
        ALIGN 64,0,-721215457,4

WcslenMainLoop                              ; test 8 wchar_t's at a time until we find it
        ld1     v0.8h, [x5], #16
        uminv   h1, v0.8h                   ; use unsigned min to look for a zero wchar_t; too bad it doesn't set CC
        fmov    w2, s1                      ; need to move min wchar_t into gpr to test it
        cbnz    w2, WcslenMainLoop          ; fall through when any one of the wchar_t's in v0 is zero

        sub     x5, x5, #16                 ; undo the last #16 post-increment of x5

FindWideNullInVector
        ldr     q1, ReverseBytePos          ; load the position indicator mask

        cmeq    v0.8h, v0.8h, #0            ; +----
        and     v0.16b, v0.16b, v1.16b      ; |
        umaxv   h0, v0.8h                   ; | see big comment below
        fmov    w2, s0                      ; |
        eor     w2, w2, #7                  ; +----

        sub     x0, x5, x0                  ; subtract ptr to first char from ptr to the vector holding the null; x0 = length in bytes up to that vector
        add     x0, x2, x0, ASR #1          ; divide x0 by 2 to get the number of wide chars and then add in the final vector char pos
        ret

AlignSlowly
        sub     x1, x1, #16                 ; x1 = (addr mod 16) - 16
        sub     x1, xzr, x1, ASR #1         ; x1 = -(((addr mod 16) - 16) / 2) = (16 - (addr mod 16)) / 2 = num wchar_t's
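
        ; For example (hypothetical address): if addr mod 16 == 6, then
        ; x1 = -((6 - 16) >> 1) = 5, i.e. 5 wchar_t's (10 bytes) must be tested one
        ; at a time before x5 reaches the next 16-byte boundary.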

AlignLoop                                   ; test one wchar_t at a time until we are 16-byte aligned
        ldrh    w2, [x5], #2
        cbz     w2, OneByOneFoundIt         ; branch if found the null
        subs    x1, x1, #1
        bgt     AlignLoop                   ; fall through when not found and reached 16-byte alignment
        b       WcslenMainLoop

WCharAtATime
        ldrh    w2, [x5], #2
        cbnz    w2, WCharAtATime            ; when found, use same exit sequence as when found during slow alignment

OneByOneFoundIt
        sub     x5, x5, #2                  ; undo the final post-increment that happened on the load of the null wchar_t
        sub     x0, x5, x0                  ; with x5 pointing at the null char, x5-x0 is the length in bytes
        asr     x0, x0, #1                  ; divide by 2 to get length in wchar_t's
        ret

EmptyStr
        mov     x0, 0
        ret

; The challenge is to find a way to efficiently determine which of the 8 wchar_t's we loaded is the end of the string.
; The trick is to load a position indicator mask and generate the position of the rightmost null from that.
; Little-endian order means when we load the mask below v1.8h[0] has 7, and v0.8h[0] is the wchar_t of the string
; that comes first of the 8 we loaded. We do a cmeq, mapping all the wchar_t's we loaded to either 0xFFFF (for nulls)
; or 0x0000 (for non-nulls). Then we and with the mask below. SIMD lanes corresponding to a non-null wchar_t will be 0x0000,
; and SIMD lanes corresponding to a null wchar_t will have a halfword from the mask. We take the max across the halfwords
; of the vector to find the highest position value that corresponds to a null wchar_t. The numbering order means we find the
; rightmost null in the vector, which is the null that occurred first in memory due to little-endian loading.
; Exclusive-oring that position value with 7 inverts the order, which gives us the character position of the null
; counting from the first wchar_t we loaded into the v0 SIMD reg.
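;
; Worked example (hypothetical input): suppose v0 was loaded from the tail L"AB",
; so v0.8h = { 'A', 'B', 0, x, x, x, x, x } with lane 0 first in memory and the
; x's non-null. Then:
;   cmeq v0.8h, v0.8h, #0  ->  { 0, 0, 0xFFFF, 0, 0, 0, 0, 0 }
;   and with the mask      ->  { 0, 0, 5, 0, 0, 0, 0, 0 }      (lane 2 of the mask is 5)
;   umaxv                  ->  5
;   eor 5, #7              ->  2, the index of the null among the 8 wchar_t's loaded,
; so the returned length is (bytes from the original x0 to this vector) / 2 + 2.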

ReverseBytePos \
        dcw 7, 6, 5, 4, 3, 2, 1, 0          ; vector of halfwords

        LEAF_END

        ARM64EC_ENTRY_THUNK A64NAME(wcsnlen),1,0
        LEAF_ENTRY_COMDAT A64NAME(wcsnlen)

        mov     x5, x0                      ; keep original x0 value for the final 'sub'

        tbnz    x0, #0, ShortWcsnlen        ; check for misaligned characters; must go char-by-char if misaligned

        cmp     x1, #8                      ; x1 has length. When x1 < 8 we have to go char-by-char
        blo     ShortWcsnlen                ; only do char-by-char for 0 to 7 characters

        ands    x3, x5, #15                 ; x3 = start address mod 16
        beq     NoNeedToAlign               ; branch on x3 == 0 because it's already aligned

        ; we need to align; check whether we are within 16 bytes of the end of the page.
        ; branch if ((address mod PAGESIZE) > (PAGESIZE - 16))

        and     x2, x5, #4095               ; x2 = address AND (PAGESIZE - 1) = address mod PAGESIZE
        cmp     x2, #4080                   ; compare x2 to (PAGESIZE - 16)
        bgt     AlignSlowly_Wcsnlen         ; too close to end of page, must align one wchar at a time

        ; force vector loads in the main loop to be 16-byte aligned
        sub     x3, x3, #16                 ; x3 = (start address mod 16) - 16
        neg     x3, x3                      ; x3 = 16 - (start address mod 16) = number of *bytes* to advance to get aligned
        ld1     v0.8h, [x5]                 ; don't post-increment x5
        uminv   h1, v0.8h
        fmov    w2, s1                      ; fmov is sometimes 1 cycle faster than "umov w2, v1.h[0]"
        cbz     w2, FindWideNullInVector_Wcsnlen ; jump when found null within first 8 wchar_t's
        sub     x1, x1, x3, ASR #1          ; reduce elements remaining by number of wchar_t's needed to get aligned (bytes/2)
        add     x5, x5, x3                  ; move x5 forward by x3 bytes, so x5 is now a 16-byte aligned address
ResumeAfterAlignSlowly
        cmp     x1, #8                      ; check for size < 8 after alignment adjustment
        blo     ShortWcsnlen
NoNeedToAlign
        asr     x3, x1, #3                  ; set up iterations remaining after alignment point reached (8 wchar_t's per iteration)
                                            ; no need to check here for x3 == 0 because:
                                            ; - if we didn't align it, it is at least 16 bytes long
                                            ; - if we did align it, we checked for <16 before coming here
WcsNlenMainLoop                             ; test 8 wchar_t's at a time until we find it
        ld1     v0.8h, [x5], #16
        uminv   h1, v0.8h                   ; use unsigned min to look for a zero wchar_t
        fmov    w2, s1                      ; need to move min wchar_t into gpr to test it
        cbz     w2, UndoPI_FindNullInVector ; exit the loop when any one of the wchar_t's in v0 is zero
        subs    x3, x3, #1
        bne     WcsNlenMainLoop

        ands    x1, x1, #7                  ; check for remainder
        beq     WcsNLenOverrun              ; orig buffer size was a multiple of 8 wchar_t's so no remainder; goto overrun case

        ; We're less than 8 wchar_t's from the end of the buffer and haven't found a '\0\0' yet. We know the buffer was
        ; originally longer than 16 bytes, so we can do a 2-byte aligned vector compare of the last 8 wchar_t's of the
        ; buffer, overlapping with some wchar_t's we already know are non-zero, without fear of underrunning the original
        ; front of the buffer. This avoids a more costly char-by-char comparison for the remainder (which would average
        ; 32 instructions executed and two branch mispredicts).
        ; At this point:
        ;   x5 points at one of the last 7 wchar_t's of the buffer
        ;   x1 has the number of wchar_t's remaining in the buffer, 1 <= x1 <= 7
        ;   8 - x1 is the number of wchar_t's we have to 'back up'; LSL that by 1 to get bytes
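        ;
        ; Worked example (hypothetical sizes): for a 21-element buffer the main loop
        ; runs twice (16 elements), leaving x1 = 21 AND 7 = 5 and x5 at element 16.
        ; We back up 8 - 5 = 3 wchar_t's (6 bytes) and reload elements 13..20, the
        ; last 8 of the buffer, re-testing 3 elements already known to be non-null.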
FastRemainderHandling
        sub     x1, x1, #8
        neg     x1, x1                      ; x1 = (8 - number of chars remaining); the number of wchar_t's to back up
        sub     x5, x5, x1, LSL #1          ; x5 = x5 - (2*x1); back up number of bytes equivalent to x1 wchar_t's
        ld1     v0.8h, [x5], #16            ; load all of remainder and some already-checked wchar_t's
        uminv   h1, v0.8h
        fmov    w2, s1                      ; fmov is sometimes 1 cycle faster than "umov w2, v1.h[0]"
        cbz     w2, UndoPI_FindNullInVector ; found a '\0\0' within the last 8 elements of the buffer
        b       WcsNLenOverrun              ; else x5 points one past end of buffer, and we're all set for the overrun exit

ShortWcsnlen
        cbz     x1, WcsNLenOverrun          ; if original number of elements was zero, we must return 0 without touching the buffer

ShortWcsNLenLoop
        ldrh    w2, [x5], #2
        cbz     w2, OneByOneFoundIt_Wcsnlen ; branch to the shared exit sequence when the null is found
        subs    x1, x1, #1
        bhi     ShortWcsNLenLoop

WcsNLenOverrun
        sub     x0, x5, x0                  ; x5 points one past the end of the buffer, x5-x0 is original buffer size in bytes
        asr     x0, x0, #1                  ; adjust return value from bytes to wchar_t elements
                                            ; as an alternative to the above two instructions, we could save the original x1 value
                                            ; and just move that to x0 here, but that would add an instruction to all paths in order
                                            ; to save one here that's only on the overrun path. So we reconstruct the value instead.
        ret

AlignSlowly_Wcsnlen
        sub     x3, x3, #16                 ; x3 = (addr mod 16) - 16
        sub     x3, xzr, x3, ASR #1         ; x3 = -(((addr mod 16) - 16) / 2) = (16 - (addr mod 16)) / 2 = num wchar_t's

AlignLoop_Wcsnlen                           ; test one wchar_t at a time until we are 16-byte aligned
        ldrh    w2, [x5], #2
        cbz     w2, OneByOneFoundIt_Wcsnlen ; branch if found the null
        subs    x1, x1, #1
        beq     OneByOneReachedMax_Wcsnlen  ; branch if wchar-at-a-time testing reached end of buffer count
        subs    x3, x3, #1
        bgt     AlignLoop_Wcsnlen           ; fall through when not found and reached 16-byte alignment
        b       ResumeAfterAlignSlowly

OneByOneFoundIt_Wcsnlen
        sub     x5, x5, #2                  ; undo the final post-increment that happened on the load of the null wchar_t
OneByOneReachedMax_Wcsnlen
        sub     x0, x5, x0                  ; x5 points at the null (found) or one past the last element (reached max); x5-x0 is the length in bytes
        asr     x0, x0, #1                  ; divide by 2 to get length in wchar_t's
        ret

UndoPI_FindNullInVector
        sub     x5, x5, #16                 ; undo the last #16 post-increment of x5

FindWideNullInVector_Wcsnlen
        ldr     q1, ReverseBytePos_Wcsnlen  ; load the position indicator mask

        cmeq    v0.8h, v0.8h, #0            ; +----
        and     v0.16b, v0.16b, v1.16b      ; |
        umaxv   h0, v0.8h                   ; | see big comment below
        fmov    w2, s0                      ; |
        eor     w2, w2, #7                  ; +----

        sub     x0, x5, x0                  ; subtract ptr to first char from ptr to the vector holding the null; x0 = length in bytes up to that vector
        add     x0, x2, x0, ASR #1          ; divide x0 by 2 to get the number of wide chars and then add in the final vector char pos
        ret

; The challenge is to find a way to efficiently determine which of the 8 wchar_t's we loaded is the end of the string.
; The trick is to load a position indicator mask and generate the position of the rightmost null from that.
; Little-endian order means when we load the mask below v1.8h[0] has 7, and v0.8h[0] is the wchar_t of the string
; that comes first of the 8 we loaded. We do a cmeq, mapping all the wchar_t's we loaded to either 0xFFFF (for nulls)
; or 0x0000 (for non-nulls). Then we and with the mask below. SIMD lanes corresponding to a non-null wchar_t will be 0x0000,
; and SIMD lanes corresponding to a null wchar_t will have a halfword from the mask. We take the max across the halfwords
; of the vector to find the highest position value that corresponds to a null wchar_t. The numbering order means we find the
; rightmost null in the vector, which is the null that occurred first in memory due to little-endian loading.
; Exclusive-oring that position value with 7 inverts the order, which gives us the character position of the null
; counting from the first wchar_t we loaded into the v0 SIMD reg.
; (This is the same scheme used in wcslen above; see the worked example there.)

ReverseBytePos_Wcsnlen \
        dcw 7, 6, 5, 4, 3, 2, 1, 0          ; vector of halfwords

        LEAF_END

        END