/* wc_neon - Count the number of newlines with neon instructions.
Copyright (C) 2026 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see . */
/* Written by Collin Funk , 2026. */
#include
#include "wc.h"
#include "system.h"
#include "ioblksize.h"
#include
/* Read FD and return a summary. */
extern struct wc_lines
wc_lines_neon (int fd)
{
intmax_t lines = 0;
intmax_t bytes = 0;
const uint8x16_t endlines = vdupq_n_u8 ('\n');
while (true)
{
unsigned char neon_buf[IO_BUFSIZE];
ssize_t bytes_read = read (fd, neon_buf, sizeof neon_buf);
if (bytes_read <= 0)
return (struct wc_lines) { bytes_read == 0 ? 0 : errno, lines, bytes };
bytes += bytes_read;
unsigned char *datap = neon_buf;
while (8192 <= bytes_read)
{
/* Accumulator. */
int8x16_t acc0 = vdupq_n_s8 (0);
int8x16_t acc1 = vdupq_n_s8 (0);
int8x16_t acc2 = vdupq_n_s8 (0);
int8x16_t acc3 = vdupq_n_s8 (0);
/* Process all 8192 bytes in 64 byte chunks. */
for (int i = 0; i < 128; ++i)
{
/* Load 64 bytes from DATAP. */
uint8x16_t v0 = vld1q_u8 (datap);
uint8x16_t v1 = vld1q_u8 (datap + 16);
uint8x16_t v2 = vld1q_u8 (datap + 32);
uint8x16_t v3 = vld1q_u8 (datap + 48);
/* Bitwise equal with ENDLINES. We use a reinterpret cast to
convert the 0xff if a newline is found into -1. */
int8x16_t c0 = vreinterpretq_s8_u8 (vceqq_u8 (v0, endlines));
int8x16_t c1 = vreinterpretq_s8_u8 (vceqq_u8 (v1, endlines));
int8x16_t c2 = vreinterpretq_s8_u8 (vceqq_u8 (v2, endlines));
int8x16_t c3 = vreinterpretq_s8_u8 (vceqq_u8 (v3, endlines));
/* Increment the accumulator. */
acc0 = vaddq_s8 (acc0, c0);
acc1 = vaddq_s8 (acc1, c1);
acc2 = vaddq_s8 (acc2, c2);
acc3 = vaddq_s8 (acc3, c3);
datap += 64;
}
/* Pairwise sum the vectors. */
int16x8_t a0 = vpaddlq_s8 (acc0);
int16x8_t a1 = vpaddlq_s8 (acc1);
int16x8_t a2 = vpaddlq_s8 (acc2);
int16x8_t a3 = vpaddlq_s8 (acc3);
int32x4_t b0 = vpaddlq_s16 (a0);
int32x4_t b1 = vpaddlq_s16 (a1);
int32x4_t b2 = vpaddlq_s16 (a2);
int32x4_t b3 = vpaddlq_s16 (a3);
int64x2_t c0 = vpaddlq_s32 (b0);
int64x2_t c1 = vpaddlq_s32 (b1);
int64x2_t c2 = vpaddlq_s32 (b2);
int64x2_t c3 = vpaddlq_s32 (b3);
/* Extract the lane sums. Since each newline was counted as -1, we
subtract the sum of them from LINES to get the total number of
lines. */
lines -= (vgetq_lane_s64 (c0, 0) + vgetq_lane_s64 (c0, 1)
+ vgetq_lane_s64 (c1, 0) + vgetq_lane_s64 (c1, 1)
+ vgetq_lane_s64 (c2, 0) + vgetq_lane_s64 (c2, 1)
+ vgetq_lane_s64 (c3, 0) + vgetq_lane_s64 (c3, 1));
bytes_read -= 8192;
}
/* Finish up any left over bytes. */
unsigned char *end = (unsigned char *) datap + bytes_read;
for (unsigned char *p = (unsigned char *) datap; p < end; p++)
lines += *p == '\n';
}
}