#include #include #include #include _Static_assert(sizeof(input_t)*INPUT_VECTOR_STEP == sizeof(v128_t), "invalid vector size mul"); #define INPUT_VPROC_STEP 15 __attribute__((pure)) u64 vproc(usize len, const input_t input[const restrict len]) { static const v128_t SINGLE = { .as.i16 = { -1, -1, -1, -1, -1, -1, -1, -1 } }; static const v128_t ONES = { .as.i16 = { 1, 1, 1, 1, 1, 1, 1, 1 } }; #if DEBUG assert(len % INPUT_VPROC_STEP == 0); #endif #define LAST(j) input[i + (j)] #define NEXT(j) input[i + ((j) * 2)] __m128i end = _mm_setzero_si128(); for(usize i = 0;i