diff options
| author | Pádraig Brady <P@draigBrady.com> | 2025-10-19 13:11:46 +0100 |
|---|---|---|
| committer | Pádraig Brady <P@draigBrady.com> | 2025-10-20 13:13:25 +0100 |
| commit | e58f9390f1ae30d1855e40ae734dac35bd793a92 (patch) | |
| tree | 5c18dbee1265692d00435cc52a476ec6ccd1be1c /src | |
| parent | numfmt: support multi-byte --delimiter (diff) | |
| download | coreutils-e58f9390f1ae30d1855e40ae734dac35bd793a92.tar.gz coreutils-e58f9390f1ae30d1855e40ae734dac35bd793a92.zip | |
numfmt: optimize multi-byte --delimiter search
* src/numfmt.c (is_utf8_charset): A new function to efficiently
determine if running with a UTF-8 charset.
(mbsmbchr): A new function to efficiently search for
a (multi-byte) character in a multi-byte string.
(next_field): Use mbsmbchr() rather than mbsstr() directly.
Diffstat (limited to 'src')
| -rw-r--r-- | src/numfmt.c | 32 |
1 files changed, 31 insertions, 1 deletions
```diff
diff --git a/src/numfmt.c b/src/numfmt.c
index 0f0a8770b..5a1e94570 100644
--- a/src/numfmt.c
+++ b/src/numfmt.c
@@ -1363,6 +1363,36 @@ process_suffixed_number (char *text, long double *result,
   return (e == SSE_OK || e == SSE_OK_PRECISION_LOSS);
 }
 
+/* Return true if the current charset is UTF-8. */
+static bool
+is_utf8_charset (void)
+{
+  static int is_utf8 = -1;
+  if (is_utf8 == -1)
+    {
+      char32_t w;
+      mbstate_t mbs; mbszero (&mbs);
+      is_utf8 = mbrtoc32 (&w, "\xe2\x9f\xb8", 3, &mbs) == 3 && w == 0x27F8;
+    }
+  return is_utf8;
+}
+
+/* Search for multi-byte character C in multi-byte string S.
+   Return a pointer to the character, or nullptr if not found. */
+ATTRIBUTE_PURE
+static char *
+mbsmbchr (char const* s, char const* c)
+{
+  unsigned char uc = *c;
+  /* GB18030 is the most restrictive for the 0x30 optimization below. */
+  if (uc < 0x30 || MB_CUR_MAX == 1)
+    return strchr (s, uc);
+  else if (is_utf8_charset ())
+    return uc < 0x80 ? strchr (s, uc) : strstr (s, c);
+  else
+    return *(c + 1) == '\0' ? mbschr (s, uc) : (char *) mbsstr (s, c);
+}
+
 /* Return a pointer to the beginning of the next field in line.
    The line pointer is moved to the end of the next field. */
 static char*
@@ -1373,7 +1403,7 @@ next_field (char **line)
 
   if (delimiter)
     {
-      if (! *delimiter || ! (field_end = mbsstr (field_start, delimiter)))
+      if (! (field_end = mbsmbchr (field_start, delimiter)))
         field_end = strchr (field_start, '\0');
     }
   else
```
