aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorPádraig Brady <P@draigBrady.com>2025-10-19 13:11:46 +0100
committerPádraig Brady <P@draigBrady.com>2025-10-20 13:13:25 +0100
commite58f9390f1ae30d1855e40ae734dac35bd793a92 (patch)
tree5c18dbee1265692d00435cc52a476ec6ccd1be1c /src
parentnumfmt: support multi-byte --delimiter (diff)
downloadcoreutils-e58f9390f1ae30d1855e40ae734dac35bd793a92.tar.gz
coreutils-e58f9390f1ae30d1855e40ae734dac35bd793a92.zip
numfmt: optimize multi-byte --delimiter search
* src/numfmt.c (is_utf8_charset): A new function to efficiently determine if running with a UTF-8 charset. (mbsmbchr): A new function to efficiently search for a (multi-byte) character in a multi-byte string. (next_field): Use mbsmbchr() rather than mbsstr() directly.
Diffstat (limited to 'src')
-rw-r--r-- src/numfmt.c | 32
1 files changed, 31 insertions, 1 deletions
diff --git a/src/numfmt.c b/src/numfmt.c
index 0f0a8770b..5a1e94570 100644
--- a/src/numfmt.c
+++ b/src/numfmt.c
@@ -1363,6 +1363,36 @@ process_suffixed_number (char *text, long double *result,
return (e == SSE_OK || e == SSE_OK_PRECISION_LOSS);
}
+/* Return true if the current charset is UTF-8.
+   Detection is by probing rather than by name: the three bytes
+   E2 9F B8 are passed to mbrtoc32 (), and the charset is taken to be
+   UTF-8 only if all three bytes are consumed and they decode to U+27F8.
+   The answer is computed on the first call and cached in a
+   function-local static, so a charset change made after that first
+   call would not be noticed.  */
+static bool
+is_utf8_charset (void)
+{
+ /* Tri-state cache: -1 means not probed yet, otherwise the 0/1 result.  */
+ static int is_utf8 = -1;
+ if (is_utf8 == -1)
+ {
+ char32_t w;
+ /* NOTE(review): mbszero is not defined in this file; presumably it
+    resets MBS to the initial conversion state -- confirm.  */
+ mbstate_t mbs; mbszero (&mbs);
+ is_utf8 = mbrtoc32 (&w, "\xe2\x9f\xb8", 3, &mbs) == 3 && w == 0x27F8;
+ }
+ return is_utf8;
+}
+
+/* Search for multi-byte character C in multi-byte string S.
+   Return a pointer to the character, or nullptr if not found.
+
+   S is the NUL-terminated string to search; C is a NUL-terminated
+   string holding the character to look for.  The strchr branches below
+   inspect only the first byte of C, so C is assumed to hold a single
+   character -- TODO confirm that callers validate this.
+
+   If C is the empty string its first byte is NUL, so the first branch
+   applies and the result is a pointer to the terminating NUL of S
+   rather than nullptr.  */
+ATTRIBUTE_PURE
+static char *
+mbsmbchr (char const* s, char const* c)
+{
+ unsigned char uc = *c;
+ /* GB18030 is the most restrictive for the 0x30 optimization below.
+    Bytes below 0x30 are searched for bytewise, as is everything when
+    every character is a single byte (MB_CUR_MAX == 1).
+    NOTE(review): 0x30 appears to be chosen because GB18030 permits
+    bytes 0x30..0x39 as trailing bytes of a multi-byte character, so
+    only smaller values are safe to match without decoding -- confirm
+    against the encoding definition.  */
+ if (uc < 0x30 || MB_CUR_MAX == 1)
+ return strchr (s, uc);
+ /* In UTF-8, bytes below 0x80 never occur inside a multi-byte sequence
+    and lead bytes are distinct from continuation bytes, so a bytewise
+    strchr/strstr cannot report a match that straddles characters.  */
+ else if (is_utf8_charset ())
+ return uc < 0x80 ? strchr (s, uc) : strstr (s, c);
+ /* Any other multi-byte charset: use mbschr when C is one byte long,
+    otherwise mbsstr.  Neither is defined in this file; presumably they
+    are the character-aware gnulib searches -- confirm.  The cast
+    converts the mbsstr result to this function's char * return type.  */
+ else
+ return *(c + 1) == '\0' ? mbschr (s, uc) : (char *) mbsstr (s, c);
+}
+
/* Return a pointer to the beginning of the next field in line.
The line pointer is moved to the end of the next field. */
static char*
@@ -1373,7 +1403,7 @@ next_field (char **line)
if (delimiter)
{
- if (! *delimiter || ! (field_end = mbsstr (field_start, delimiter)))
+ if (! (field_end = mbsmbchr (field_start, delimiter)))
field_end = strchr (field_start, '\0');
}
else