Code Poetry and Text Adventures

by catid, posted 1:34am Mon. Oct 12th 2015 PDT
You can of course tweak this a bit more to suit the expected size of the data; unrolling it once more to 80 bytes at a time (five lanes instead of four) seems to be marginally faster, but at that point it comes down to tuning.

#include <emmintrin.h> // SSE2 intrinsics: _mm_loadu_si128, _mm_xor_si128, _mm_storeu_si128

// XOR `bytes` bytes of vsrc into vdest, working 64 bytes at a time.
void bulk_xor(void * __restrict vdest,
                  const void * __restrict vsrc, int bytes)
{
   __m128i * __restrict dest16 = reinterpret_cast<__m128i *>(vdest);
   const __m128i * __restrict src16 = reinterpret_cast<const __m128i *>(vsrc);

   // Main loop: process four 16-byte lanes (64 bytes) per iteration.
   while (bytes >= 64)
   {
      // Unaligned loads from the source...
      __m128i s0, s1, s2, s3;
      s0 = _mm_loadu_si128(src16);
      s1 = _mm_loadu_si128(src16 + 1);
      s2 = _mm_loadu_si128(src16 + 2);
      s3 = _mm_loadu_si128(src16 + 3);
      // ...and from the destination.
      __m128i d0, d1, d2, d3;
      d0 = _mm_loadu_si128(dest16);
      d1 = _mm_loadu_si128(dest16 + 1);
      d2 = _mm_loadu_si128(dest16 + 2);
      d3 = _mm_loadu_si128(dest16 + 3);

      // XOR each lane and store the result back into the destination.
      _mm_storeu_si128(dest16,
           _mm_xor_si128(s0, d0));
      _mm_storeu_si128(dest16 + 1,
           _mm_xor_si128(s1, d1));
      _mm_storeu_si128(dest16 + 2,
           _mm_xor_si128(s2, d2));
      _mm_storeu_si128(dest16 + 3,
           _mm_xor_si128(s3, d3));

      // Advance both pointers by four lanes and consume 64 bytes.
      dest16 += 4;
      src16 += 4;
      bytes -= 64;
   }

... And the rest (mopping up whatever is left over after the 64-byte loop) is pretty straightforward; one way it might look is sketched below.
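For completeness, a minimal sketch of one way to finish the function (not necessarily the exact code elided above): drain any leftover 16-byte lanes, then XOR the last few bytes one at a time.

   // Sketch only: one possible tail for the bytes left by the 64-byte loop.
   while (bytes >= 16)
   {
      _mm_storeu_si128(dest16,
           _mm_xor_si128(_mm_loadu_si128(src16),
                         _mm_loadu_si128(dest16)));
      ++dest16;
      ++src16;
      bytes -= 16;
   }

   // Plain byte XORs for the final 0-15 bytes.
   unsigned char * dest1 = reinterpret_cast<unsigned char *>(dest16);
   const unsigned char * src1 = reinterpret_cast<const unsigned char *>(src16);

   while (bytes > 0)
   {
      *dest1++ ^= *src1++;
      --bytes;
   }
}

Since the byte loop runs at most 15 times, it shouldn't matter for the large buffers this is aimed at.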
last edited by catid, 1:34am Fri. Oct 16th 2015 PDT