BitMagic-C++
bmsse2.h
#ifndef BMSSE2__H__INCLUDED__
#define BMSSE2__H__INCLUDED__
/*
Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

For more information please visit: http://bitmagic.io
*/

/*! \file bmsse2.h
    \brief Compute functions for SSE2 SIMD instruction set (internal)
*/

#if !defined(__arm64__) && !defined(__arm__)
#ifndef BMWASMSIMDOPT
#include<mmintrin.h>
#endif
#include<emmintrin.h>
#endif

#include "bmdef.h"
#include "bmutil.h"
#include "bmsse_util.h"


#ifdef __GNUG__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"
#endif

namespace bm
{


/*!
    SSE2-optimized bit-counting function: implements the parallel bit-counting
    algorithm below using the SSE2 instruction set.

<pre>
unsigned CalcBitCount32(unsigned b)
{
    b = (b & 0x55555555) + (b >> 1 & 0x55555555);
    b = (b & 0x33333333) + (b >> 2 & 0x33333333);
    b = (b + (b >> 4)) & 0x0F0F0F0F;
    b = b + (b >> 8);
    b = (b + (b >> 16)) & 0x0000003F;
    return b;
}
</pre>

    @ingroup SSE2

*/
inline
bm::id_t sse2_bit_count(const __m128i* block, const __m128i* block_end)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Loading masks
    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // cnt = 0

    __m128i tmp1, tmp2;
    do
    {
        __m128i b = _mm_load_si128(block);
        ++block;

        // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
        tmp1 = _mm_srli_epi32(b, 1);      // tmp1 = (b >> 1 & 0x55555555)
        tmp1 = _mm_and_si128(tmp1, m1);
        tmp2 = _mm_and_si128(b, m1);      // tmp2 = (b & 0x55555555)
        b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

        // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
        tmp1 = _mm_srli_epi32(b, 2);      // (b >> 2 & 0x33333333)
        tmp1 = _mm_and_si128(tmp1, m2);
        tmp2 = _mm_and_si128(b, m2);      // (b & 0x33333333)
        b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

        // b = (b + (b >> 4)) & 0x0F0F0F0F;
        tmp1 = _mm_srli_epi32(b, 4);      // tmp1 = b >> 4
        b = _mm_add_epi32(b, tmp1);       // b = b + (b >> 4)
        b = _mm_and_si128(b, m3);         // & 0x0F0F0F0F

        // b = b + (b >> 8);
        tmp1 = _mm_srli_epi32 (b, 8);     // tmp1 = b >> 8
        b = _mm_add_epi32(b, tmp1);       // b = b + (b >> 8)

        // b = (b + (b >> 16)) & 0x0000003F;
        tmp1 = _mm_srli_epi32 (b, 16);    // b >> 16
        b = _mm_add_epi32(b, tmp1);       // b + (b >> 16)
        b = _mm_and_si128(b, m4);         // (b >> 16) & 0x0000003F;

        mcnt = _mm_add_epi32(mcnt, b);    // mcnt += b

    } while (block < block_end);


    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
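
/*
    Usage sketch (illustrative): counting all bits of one aligned bit-block.
    16-byte alignment is required by the aligned loads (_mm_load_si128) above;
    BM_ALIGN16/BM_ALIGN16ATTR and bm::set_block_size come from bmdef.h/bmconst.h.
<pre>
    bm::word_t BM_ALIGN16 blk[bm::set_block_size] BM_ALIGN16ATTR = {0,};
    blk[0] = 0xFFu; // 8 bits set
    bm::id_t cnt = bm::sse2_bit_count((const __m128i*)blk,
                         (const __m128i*)(blk + bm::set_block_size));
    // cnt == 8
</pre>
*/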


template<class Func>
bm::id_t sse2_bit_count_op(const __m128i* BMRESTRICT block,
                           const __m128i* BMRESTRICT block_end,
                           const __m128i* BMRESTRICT mask_block,
                           Func sse2_func)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Loading masks
    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // cnt = 0
    do
    {
        __m128i tmp1, tmp2;
        __m128i b = _mm_load_si128(block++);

        tmp1 = _mm_load_si128(mask_block++);

        b = sse2_func(b, tmp1);

        // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
        tmp1 = _mm_srli_epi32(b, 1);      // tmp1 = (b >> 1 & 0x55555555)
        tmp1 = _mm_and_si128(tmp1, m1);
        tmp2 = _mm_and_si128(b, m1);      // tmp2 = (b & 0x55555555)
        b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

        // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
        tmp1 = _mm_srli_epi32(b, 2);      // (b >> 2 & 0x33333333)
        tmp1 = _mm_and_si128(tmp1, m2);
        tmp2 = _mm_and_si128(b, m2);      // (b & 0x33333333)
        b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

        // b = (b + (b >> 4)) & 0x0F0F0F0F;
        tmp1 = _mm_srli_epi32(b, 4);      // tmp1 = b >> 4
        b = _mm_add_epi32(b, tmp1);       // b = b + (b >> 4)
        b = _mm_and_si128(b, m3);         // & 0x0F0F0F0F

        // b = b + (b >> 8);
        tmp1 = _mm_srli_epi32 (b, 8);     // tmp1 = b >> 8
        b = _mm_add_epi32(b, tmp1);       // b = b + (b >> 8)

        // b = (b + (b >> 16)) & 0x0000003F;
        tmp1 = _mm_srli_epi32 (b, 16);    // b >> 16
        b = _mm_add_epi32(b, tmp1);       // b + (b >> 16)
        b = _mm_and_si128(b, m4);         // (b >> 16) & 0x0000003F;

        mcnt = _mm_add_epi32(mcnt, b);    // mcnt += b

    } while (block < block_end);

    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
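
/*
    Usage sketch (illustrative): sse2_bit_count_op() computes the population
    count of an on-the-fly logical operation between two blocks, so no
    temporary result block is needed. The functor can be one of the SSE2
    operation functors from bmsse_util.h (e.g. sse2_and), as wired up by the
    VECT_BITCOUNT_* macros at the end of this file.
<pre>
    bm::id_t c = bm::sse2_bit_count_op((const __m128i*)blk1,
                         (const __m128i*)(blk1 + bm::set_block_size),
                         (const __m128i*)blk2, bm::sse2_and);
    // c == popcount(blk1 AND blk2)
</pre>
*/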

/*!
    @brief check if block is all zero bits
    @ingroup SSE2
*/
inline
bool sse2_is_all_zero(const __m128i* BMRESTRICT block) BMNOEXCEPT
{
    __m128i w;
    const __m128i maskz = _mm_setzero_si128();
    const __m128i* BMRESTRICT block_end =
        (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);

    do
    {
        w = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
        auto m1 = _mm_movemask_epi8(_mm_cmpeq_epi8(w, maskz));
        w = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
        auto m2 = _mm_movemask_epi8(_mm_cmpeq_epi8(w, maskz));
        if (m1 != 0xFFFF || m2 != 0xFFFF)
            return false;
        block += 4;
    } while (block < block_end);
    return true;
}

/*!
    @brief check if block is all ONE bits
    @ingroup SSE2
*/
inline
bool sse2_is_all_one(const __m128i* BMRESTRICT block) BMNOEXCEPT
{
    __m128i w;
    const __m128i mask1 = _mm_set_epi32 (~0u, ~0u, ~0u, ~0u);
    const __m128i* BMRESTRICT block_end =
        (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);

    do
    {
        w = _mm_and_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
        auto m1 = _mm_movemask_epi8(_mm_cmpeq_epi8(w, mask1));
        w = _mm_and_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
        auto m2 = _mm_movemask_epi8(_mm_cmpeq_epi8(w, mask1));
        if (m1 != 0xFFFF || m2 != 0xFFFF)
            return false;
        block+=4;
    } while (block < block_end);
    return true;
}

/*!
    @brief check if digest stride is all zero bits
    @ingroup SSE2
*/
BMFORCEINLINE
bool sse2_is_digest_zero(const __m128i* BMRESTRICT block) BMNOEXCEPT
{
    const __m128i maskz = _mm_setzero_si128();

    __m128i wA = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
    __m128i wB = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
    wA = _mm_or_si128(wA, wB);
    auto m1 = _mm_movemask_epi8(_mm_cmpeq_epi8(wA, maskz));

    wA = _mm_or_si128(_mm_load_si128(block+4), _mm_load_si128(block+5));
    wB = _mm_or_si128(_mm_load_si128(block+6), _mm_load_si128(block+7));
    wA = _mm_or_si128(wA, wB);
    auto m2 = _mm_movemask_epi8(_mm_cmpeq_epi8(wA, maskz));

    if (m1 != 0xFFFF || m2 != 0xFFFF)
        return false;
    return true;
}

/*!
    @brief set digest stride to 0xFF.. or 0x0 value
    @ingroup SSE2
*/
BMFORCEINLINE
void sse2_block_set_digest(__m128i* dst, unsigned value) BMNOEXCEPT
{
    __m128i mV = _mm_set1_epi32(int(value));
    _mm_store_si128(dst, mV);     _mm_store_si128(dst + 1, mV);
    _mm_store_si128(dst + 2, mV); _mm_store_si128(dst + 3, mV);
    _mm_store_si128(dst + 4, mV); _mm_store_si128(dst + 5, mV);
    _mm_store_si128(dst + 6, mV); _mm_store_si128(dst + 7, mV);
}


/**
    Build partial XOR product of 2 bit-blocks using digest mask

    @param target_block - target := block ^ xor_block
    @param block - arg1
    @param xor_block - arg2
    @param digest - mask for each block wave to XOR (1) or just copy (0)

    @ingroup SSE2
*/
inline
void sse2_bit_block_xor(bm::word_t* target_block,
                        const bm::word_t* block,
                        const bm::word_t* xor_block,
                        bm::id64_t digest) BMNOEXCEPT
{
    for (unsigned i = 0; i < bm::block_waves; ++i)
    {
        const bm::id64_t mask = (1ull << i);
        unsigned off = (i * bm::set_block_digest_wave_size);
        const __m128i* sub_block = (__m128i*) (block + off);
        __m128i* t_sub_block = (__m128i*)(target_block + off);

        if (digest & mask) // XOR filtered sub-block
        {
            const __m128i* xor_sub_block = (__m128i*) (xor_block + off);
            __m128i mA, mB, mC, mD;
            mA = _mm_xor_si128(_mm_load_si128(sub_block),
                               _mm_load_si128(xor_sub_block));
            mB = _mm_xor_si128(_mm_load_si128(sub_block+1),
                               _mm_load_si128(xor_sub_block+1));
            mC = _mm_xor_si128(_mm_load_si128(sub_block+2),
                               _mm_load_si128(xor_sub_block+2));
            mD = _mm_xor_si128(_mm_load_si128(sub_block+3),
                               _mm_load_si128(xor_sub_block+3));

            _mm_store_si128(t_sub_block, mA);
            _mm_store_si128(t_sub_block+1, mB);
            _mm_store_si128(t_sub_block+2, mC);
            _mm_store_si128(t_sub_block+3, mD);

            mA = _mm_xor_si128(_mm_load_si128(sub_block+4),
                               _mm_load_si128(xor_sub_block+4));
            mB = _mm_xor_si128(_mm_load_si128(sub_block+5),
                               _mm_load_si128(xor_sub_block+5));
            mC = _mm_xor_si128(_mm_load_si128(sub_block+6),
                               _mm_load_si128(xor_sub_block+6));
            mD = _mm_xor_si128(_mm_load_si128(sub_block+7),
                               _mm_load_si128(xor_sub_block+7));

            _mm_store_si128(t_sub_block+4, mA);
            _mm_store_si128(t_sub_block+5, mB);
            _mm_store_si128(t_sub_block+6, mC);
            _mm_store_si128(t_sub_block+7, mD);

        }
        else // just copy source
        {
            _mm_store_si128(t_sub_block ,  _mm_load_si128(sub_block));
            _mm_store_si128(t_sub_block+1, _mm_load_si128(sub_block+1));
            _mm_store_si128(t_sub_block+2, _mm_load_si128(sub_block+2));
            _mm_store_si128(t_sub_block+3, _mm_load_si128(sub_block+3));

            _mm_store_si128(t_sub_block+4, _mm_load_si128(sub_block+4));
            _mm_store_si128(t_sub_block+5, _mm_load_si128(sub_block+5));
            _mm_store_si128(t_sub_block+6, _mm_load_si128(sub_block+6));
            _mm_store_si128(t_sub_block+7, _mm_load_si128(sub_block+7));
        }
    } // for i
}
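
/*
    Usage sketch (illustrative): per-wave XOR "filter". Bit i of 'digest'
    selects, for wave i, either target := block ^ xor_block (bit set) or a
    plain copy of 'block' (bit clear).
<pre>
    bm::sse2_bit_block_xor(t_blk, blk, xor_blk, 1ull); // XOR wave 0, copy the rest
</pre>
*/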

/**
    Build partial XOR product of 2 bit-blocks using digest mask

    @param target_block - target ^= xor_block
    @param xor_block - arg1
    @param digest - mask for each block wave to XOR (if 1)

    @ingroup SSE2
    @internal
*/
inline
void sse2_bit_block_xor_2way(bm::word_t* target_block,
                             const bm::word_t* xor_block,
                             bm::id64_t digest) BMNOEXCEPT
{
    while (digest)
    {
        bm::id64_t t = bm::bmi_blsi_u64(digest); // d & -d;
        unsigned wave = bm::word_bitcount64(t - 1);
        unsigned off = wave * bm::set_block_digest_wave_size;

        const __m128i* sub_block = (const __m128i*) (xor_block + off);
        __m128i* t_sub_block = (__m128i*)(target_block + off);

        __m128i mA, mB, mC, mD;
        mA = _mm_xor_si128(_mm_load_si128(sub_block),
                           _mm_load_si128(t_sub_block));
        mB = _mm_xor_si128(_mm_load_si128(sub_block+1),
                           _mm_load_si128(t_sub_block+1));
        mC = _mm_xor_si128(_mm_load_si128(sub_block+2),
                           _mm_load_si128(t_sub_block+2));
        mD = _mm_xor_si128(_mm_load_si128(sub_block+3),
                           _mm_load_si128(t_sub_block+3));

        _mm_store_si128(t_sub_block, mA);
        _mm_store_si128(t_sub_block+1, mB);
        _mm_store_si128(t_sub_block+2, mC);
        _mm_store_si128(t_sub_block+3, mD);

        mA = _mm_xor_si128(_mm_load_si128(sub_block+4),
                           _mm_load_si128(t_sub_block+4));
        mB = _mm_xor_si128(_mm_load_si128(sub_block+5),
                           _mm_load_si128(t_sub_block+5));
        mC = _mm_xor_si128(_mm_load_si128(sub_block+6),
                           _mm_load_si128(t_sub_block+6));
        mD = _mm_xor_si128(_mm_load_si128(sub_block+7),
                           _mm_load_si128(t_sub_block+7));

        _mm_store_si128(t_sub_block+4, mA);
        _mm_store_si128(t_sub_block+5, mB);
        _mm_store_si128(t_sub_block+6, mC);
        _mm_store_si128(t_sub_block+7, mD);

        digest = bm::bmi_bslr_u64(digest); // d &= d - 1;
    } // while
}
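
/*
    Note: the loop above visits only the set bits of 'digest'.
    bm::bmi_blsi_u64() isolates the lowest set bit (d & -d),
    bm::word_bitcount64(t - 1) converts it to a wave index, and
    bm::bmi_bslr_u64() clears it (d &= d - 1). The same scalar idiom,
    shown standalone (illustrative):
<pre>
    for (bm::id64_t d = digest; d; d = bm::bmi_bslr_u64(d))
    {
        bm::id64_t t = bm::bmi_blsi_u64(d);
        unsigned wave = bm::word_bitcount64(t - 1); // index of this 1-bit
        // ... process wave ...
    }
</pre>
*/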


/*!
    @brief AND block digest stride
    *dst &= *src
    @return true if stride is all zero
    @ingroup SSE2
*/
BMFORCEINLINE
bool sse2_and_digest(__m128i* BMRESTRICT dst,
                     const __m128i* BMRESTRICT src) BMNOEXCEPT
{
    __m128i m1A, m1B, m1C, m1D;
    const __m128i maskz = _mm_setzero_si128();

    m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
    m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
    m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
    m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z1 = _mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF;

    m1A = _mm_and_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
    m1B = _mm_and_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
    m1C = _mm_and_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
    m1D = _mm_and_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = _mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF;

    return z1 & z2;
}
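
/*
    Usage sketch (illustrative; 'wave', 'dst', 'src' and 'digest' here are
    hypothetical caller-side variables): digest functions process one wave of
    bm::set_block_digest_wave_size words and report whether it became all
    zero, so the caller can clear the matching bit of its digest mask.
<pre>
    unsigned off = wave * bm::set_block_digest_wave_size;
    bool all_z = bm::sse2_and_digest((__m128i*)(dst + off),
                                     (const __m128i*)(src + off));
    if (all_z)
        digest &= ~(1ull << wave);
</pre>
*/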

/*!
    @brief AND-OR block digest stride
    *dst |= *src1 & src2

    @return true if stride is all zero
    @ingroup SSE2
*/
BMFORCEINLINE
bool sse2_and_or_digest_2way(__m128i* BMRESTRICT dst,
                             const __m128i* BMRESTRICT src1,
                             const __m128i* BMRESTRICT src2) BMNOEXCEPT
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i mACC1;
    const __m128i maskz = _mm_setzero_si128();

    m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
    m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
    m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
    m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));

    mACC1 = _mm_or_si128(_mm_or_si128(m1A, m1B), _mm_or_si128(m1C, m1D));
    bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mACC1, maskz)) == 0xFFFF);

    m1A = _mm_or_si128(_mm_load_si128(dst+0), m1A);
    m1B = _mm_or_si128(_mm_load_si128(dst+1), m1B);
    m1C = _mm_or_si128(_mm_load_si128(dst+2), m1C);
    m1D = _mm_or_si128(_mm_load_si128(dst+3), m1D);

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);


    m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
    m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
    m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
    m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));

    mACC1 = _mm_or_si128(_mm_or_si128(m1A, m1B), _mm_or_si128(m1C, m1D));
    bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mACC1, maskz)) == 0xFFFF);

    m1A = _mm_or_si128(_mm_load_si128(dst+4), m1A);
    m1B = _mm_or_si128(_mm_load_si128(dst+5), m1B);
    m1C = _mm_or_si128(_mm_load_si128(dst+6), m1C);
    m1D = _mm_or_si128(_mm_load_si128(dst+7), m1D);

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    return z1 & z2;
}


/*!
    @brief AND block digest stride
    @return true if stride is all zero
    @ingroup SSE2
*/
inline
bool sse2_and_digest_5way(__m128i* BMRESTRICT dst,
                          const __m128i* BMRESTRICT src1,
                          const __m128i* BMRESTRICT src2,
                          const __m128i* BMRESTRICT src3,
                          const __m128i* BMRESTRICT src4) BMNOEXCEPT
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i m1E, m1F, m1G, m1H;

    m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
    m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
    m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
    m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));

    m1E = _mm_and_si128(_mm_load_si128(src3+0), _mm_load_si128(src4+0));
    m1F = _mm_and_si128(_mm_load_si128(src3+1), _mm_load_si128(src4+1));
    m1G = _mm_and_si128(_mm_load_si128(src3+2), _mm_load_si128(src4+2));
    m1H = _mm_and_si128(_mm_load_si128(src3+3), _mm_load_si128(src4+3));

    m1A = _mm_and_si128(m1A, m1E);
    m1B = _mm_and_si128(m1B, m1F);
    m1C = _mm_and_si128(m1C, m1G);
    m1D = _mm_and_si128(m1D, m1H);

    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, _mm_setzero_si128())) == 0xFFFF);

    m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
    m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
    m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
    m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));

    m1E = _mm_and_si128(_mm_load_si128(src3+4), _mm_load_si128(src4+4));
    m1F = _mm_and_si128(_mm_load_si128(src3+5), _mm_load_si128(src4+5));
    m1G = _mm_and_si128(_mm_load_si128(src3+6), _mm_load_si128(src4+6));
    m1H = _mm_and_si128(_mm_load_si128(src3+7), _mm_load_si128(src4+7));

    m1A = _mm_and_si128(m1A, m1E);
    m1B = _mm_and_si128(m1B, m1F);
    m1C = _mm_and_si128(m1C, m1G);
    m1D = _mm_and_si128(m1D, m1H);

    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, _mm_setzero_si128())) == 0xFFFF);

    return z1 & z2;
}

/*!
    @brief AND block digest stride
    @return true if stride is all zero
    @ingroup SSE2
*/
inline
bool sse2_and_digest_3way(__m128i* BMRESTRICT dst,
                          const __m128i* BMRESTRICT src1,
                          const __m128i* BMRESTRICT src2) BMNOEXCEPT
{
    __m128i m1A, m1B, m1C, m1D;
//    __m128i m1E, m1F, m1G, m1H;

    m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
    m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
    m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
    m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
/*
    m1E = _mm_and_si128(_mm_load_si128(src3+0), _mm_load_si128(src4+0));
    m1F = _mm_and_si128(_mm_load_si128(src3+1), _mm_load_si128(src4+1));
    m1G = _mm_and_si128(_mm_load_si128(src3+2), _mm_load_si128(src4+2));
    m1H = _mm_and_si128(_mm_load_si128(src3+3), _mm_load_si128(src4+3));

    m1A = _mm_and_si128(m1A, m1E);
    m1B = _mm_and_si128(m1B, m1F);
    m1C = _mm_and_si128(m1C, m1G);
    m1D = _mm_and_si128(m1D, m1H);
*/
    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, _mm_setzero_si128())) == 0xFFFF);

    m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
    m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
    m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
    m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
/*
    m1E = _mm_and_si128(_mm_load_si128(src3+4), _mm_load_si128(src4+4));
    m1F = _mm_and_si128(_mm_load_si128(src3+5), _mm_load_si128(src4+5));
    m1G = _mm_and_si128(_mm_load_si128(src3+6), _mm_load_si128(src4+6));
    m1H = _mm_and_si128(_mm_load_si128(src3+7), _mm_load_si128(src4+7));

    m1A = _mm_and_si128(m1A, m1E);
    m1B = _mm_and_si128(m1B, m1F);
    m1C = _mm_and_si128(m1C, m1G);
    m1D = _mm_and_si128(m1D, m1H);
*/
    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, _mm_setzero_si128())) == 0xFFFF);

    return z1 & z2;
}



/*!
    @brief AND block digest stride
    *dst = *src1 & src2

    @return true if stride is all zero
    @ingroup SSE2
*/
BMFORCEINLINE
bool sse2_and_digest_2way(__m128i* BMRESTRICT dst,
                          const __m128i* BMRESTRICT src1,
                          const __m128i* BMRESTRICT src2) BMNOEXCEPT
{
    __m128i m1A, m1B, m1C, m1D;

    m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
    m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
    m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
    m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    const __m128i maskz = _mm_setzero_si128();
    bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);

    m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
    m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
    m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
    m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);

    return z1 & z2;
}

/*!
    @brief SUB (AND NOT) block digest stride
    *dst &= ~*src

    @return true if stride is all zero
    @ingroup SSE2
*/
BMFORCEINLINE
bool sse2_sub_digest(__m128i* BMRESTRICT dst,
                     const __m128i* BMRESTRICT src) BMNOEXCEPT
{
    __m128i m1A, m1B, m1C, m1D;
    const __m128i maskz = _mm_setzero_si128();

    m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
    m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
    m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
    m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);

    m1A = _mm_andnot_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
    m1B = _mm_andnot_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
    m1C = _mm_andnot_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
    m1D = _mm_andnot_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);

    return z1 & z2;
}

/*!
    @brief 2-operand SUB (AND NOT) block digest stride
    *dst = src1 & ~*src2

    @return true if stride is all zero
    @ingroup SSE2
*/
BMFORCEINLINE
bool sse2_sub_digest_2way(__m128i* BMRESTRICT dst,
                          const __m128i* BMRESTRICT src1,
                          const __m128i* BMRESTRICT src2) BMNOEXCEPT
{
    __m128i m1A, m1B, m1C, m1D;
    const __m128i maskz = _mm_setzero_si128();

    m1A = _mm_andnot_si128(_mm_load_si128(src2+0), _mm_load_si128(src1+0));
    m1B = _mm_andnot_si128(_mm_load_si128(src2+1), _mm_load_si128(src1+1));
    m1C = _mm_andnot_si128(_mm_load_si128(src2+2), _mm_load_si128(src1+2));
    m1D = _mm_andnot_si128(_mm_load_si128(src2+3), _mm_load_si128(src1+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);

    m1A = _mm_andnot_si128(_mm_load_si128(src2+4), _mm_load_si128(src1+4));
    m1B = _mm_andnot_si128(_mm_load_si128(src2+5), _mm_load_si128(src1+5));
    m1C = _mm_andnot_si128(_mm_load_si128(src2+6), _mm_load_si128(src1+6));
    m1D = _mm_andnot_si128(_mm_load_si128(src2+7), _mm_load_si128(src1+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);

    return z1 & z2;
}

/*!
    @brief SUB block digest stride
    @return true if stride is all zero
    @ingroup SSE2
*/
inline
bool sse2_sub_digest_5way(__m128i* BMRESTRICT dst,
                          const __m128i* BMRESTRICT src1,
                          const __m128i* BMRESTRICT src2,
                          const __m128i* BMRESTRICT src3,
                          const __m128i* BMRESTRICT src4) BMNOEXCEPT
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i m1E, m1F, m1G, m1H;
    __m128i maskFF = _mm_set1_epi32(~0u);

    m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+0)), _mm_xor_si128(maskFF,_mm_load_si128(src2+0)));
    m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+1)), _mm_xor_si128(maskFF,_mm_load_si128(src2+1)));
    m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+2)), _mm_xor_si128(maskFF,_mm_load_si128(src2+2)));
    m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+3)), _mm_xor_si128(maskFF,_mm_load_si128(src2+3)));

    m1E = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+0)), _mm_xor_si128(maskFF,_mm_load_si128(src4+0)));
    m1F = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+1)), _mm_xor_si128(maskFF,_mm_load_si128(src4+1)));
    m1G = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+2)), _mm_xor_si128(maskFF,_mm_load_si128(src4+2)));
    m1H = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+3)), _mm_xor_si128(maskFF,_mm_load_si128(src4+3)));

    m1A = _mm_and_si128(m1A, m1E);
    m1B = _mm_and_si128(m1B, m1F);
    m1C = _mm_and_si128(m1C, m1G);
    m1D = _mm_and_si128(m1D, m1H);

    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    const __m128i maskz = _mm_setzero_si128();
    bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);

    m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+4)), _mm_xor_si128(maskFF,_mm_load_si128(src2+4)));
    m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+5)), _mm_xor_si128(maskFF,_mm_load_si128(src2+5)));
    m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+6)), _mm_xor_si128(maskFF,_mm_load_si128(src2+6)));
    m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+7)), _mm_xor_si128(maskFF,_mm_load_si128(src2+7)));

    m1E = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+4)), _mm_xor_si128(maskFF,_mm_load_si128(src4+4)));
    m1F = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+5)), _mm_xor_si128(maskFF,_mm_load_si128(src4+5)));
    m1G = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+6)), _mm_xor_si128(maskFF,_mm_load_si128(src4+6)));
    m1H = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+7)), _mm_xor_si128(maskFF,_mm_load_si128(src4+7)));

    m1A = _mm_and_si128(m1A, m1E);
    m1B = _mm_and_si128(m1B, m1F);
    m1C = _mm_and_si128(m1C, m1G);
    m1D = _mm_and_si128(m1D, m1H);

    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);

    return z1 & z2;
}


/*!
    @brief SUB block digest stride
    @return true if stride is all zero
    @ingroup SSE2
*/
inline
bool sse2_sub_digest_3way(__m128i* BMRESTRICT dst,
                          const __m128i* BMRESTRICT src1,
                          const __m128i* BMRESTRICT src2) BMNOEXCEPT
{
    __m128i m1A, m1B, m1C, m1D;
//    __m128i m1E, m1F, m1G, m1H;
    __m128i maskFF = _mm_set1_epi32(~0u);

    m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+0)), _mm_xor_si128(maskFF,_mm_load_si128(src2+0)));
    m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+1)), _mm_xor_si128(maskFF,_mm_load_si128(src2+1)));
    m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+2)), _mm_xor_si128(maskFF,_mm_load_si128(src2+2)));
    m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+3)), _mm_xor_si128(maskFF,_mm_load_si128(src2+3)));
/*
    m1E = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+0)), _mm_xor_si128(maskFF,_mm_load_si128(src4+0)));
    m1F = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+1)), _mm_xor_si128(maskFF,_mm_load_si128(src4+1)));
    m1G = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+2)), _mm_xor_si128(maskFF,_mm_load_si128(src4+2)));
    m1H = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+3)), _mm_xor_si128(maskFF,_mm_load_si128(src4+3)));

    m1A = _mm_and_si128(m1A, m1E);
    m1B = _mm_and_si128(m1B, m1F);
    m1C = _mm_and_si128(m1C, m1G);
    m1D = _mm_and_si128(m1D, m1H);
*/
    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    const __m128i maskz = _mm_setzero_si128();
    bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);

    m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+4)), _mm_xor_si128(maskFF,_mm_load_si128(src2+4)));
    m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+5)), _mm_xor_si128(maskFF,_mm_load_si128(src2+5)));
    m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+6)), _mm_xor_si128(maskFF,_mm_load_si128(src2+6)));
    m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+7)), _mm_xor_si128(maskFF,_mm_load_si128(src2+7)));
/*
    m1E = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+4)), _mm_xor_si128(maskFF,_mm_load_si128(src4+4)));
    m1F = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+5)), _mm_xor_si128(maskFF,_mm_load_si128(src4+5)));
    m1G = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+6)), _mm_xor_si128(maskFF,_mm_load_si128(src4+6)));
    m1H = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+7)), _mm_xor_si128(maskFF,_mm_load_si128(src4+7)));

    m1A = _mm_and_si128(m1A, m1E);
    m1B = _mm_and_si128(m1B, m1F);
    m1C = _mm_and_si128(m1C, m1G);
    m1D = _mm_and_si128(m1D, m1H);
*/
    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
    return z1 & z2;
}




/*!
    \brief Find first non-zero bit
    @ingroup SSE2
*/
inline
bool sse2_bit_find_first(const __m128i* BMRESTRICT block, unsigned off,
                         unsigned* pos) BMNOEXCEPT
{
    unsigned BM_ALIGN32 simd_buf[4] BM_ALIGN32ATTR;

    block = (const __m128i*)((bm::word_t*)(block) + off);
    const __m128i* block_end =
        (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
    const __m128i maskZ = _mm_setzero_si128();
    __m128i mA, mB;
    unsigned simd_lane = 0;
    int bsf;
    do
    {
        mA = _mm_load_si128(block); mB = _mm_load_si128(block+1);
        __m128i mOR = _mm_or_si128(mA, mB);
        bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mOR, maskZ)) == 0xFFFF);
        if (!z1) // test 2x128 lanes
        {
            z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mA, maskZ)) == 0xFFFF);
            if (!z1)
            {
                unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mA, maskZ));
                mask = ~mask; // invert to find (w != 0)
                BM_ASSERT(mask);
                bsf = bm::bit_scan_forward32(mask); // find first != 0
                _mm_store_si128 ((__m128i*)simd_buf, mA);
                unsigned widx = bsf >> 2; // (bsf / 4);
                unsigned w = simd_buf[widx];
                bsf = bm::bit_scan_forward32(w); // find first bit != 0
                *pos = (off * 32) + (simd_lane * 128) + (widx * 32) + bsf;
                return true;
            }
            unsigned mask = (_mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ)));
            mask = ~mask; // invert to find (w != 0)
            BM_ASSERT(mask);
            bsf = bm::bit_scan_forward32(mask); // find first != 0
            _mm_store_si128 ((__m128i*)simd_buf, mB);
            unsigned widx = bsf >> 2; // (bsf / 4);
            unsigned w = simd_buf[widx];
            bsf = bm::bit_scan_forward32(w); // find first bit != 0
            *pos = (off * 32) + ((++simd_lane) * 128) + (widx * 32) + bsf;
            return true;
        }
        simd_lane+=2;
        block+=2;
    } while (block < block_end);

    return false;
}

/*!
    \brief Find first bit which is different between two bit-blocks
    @ingroup SSE2
*/
inline
bool sse2_bit_find_first_diff(const __m128i* BMRESTRICT block1,
                              const __m128i* BMRESTRICT block2,
                              unsigned* pos) BMNOEXCEPT
{
    unsigned BM_ALIGN32 simd_buf[4] BM_ALIGN32ATTR;

    const __m128i* block1_end =
        (const __m128i*)((bm::word_t*)(block1) + bm::set_block_size);
    const __m128i maskZ = _mm_setzero_si128();
    __m128i mA, mB;
    unsigned simd_lane = 0;
    do
    {
        mA = _mm_xor_si128(_mm_load_si128(block1), _mm_load_si128(block2));
        mB = _mm_xor_si128(_mm_load_si128(block1+1), _mm_load_si128(block2+1));
        __m128i mOR = _mm_or_si128(mA, mB);
        bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mOR, maskZ)) == 0xFFFF);
        if (!z1) // test 2x128 lanes
        {
            z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mA, maskZ)) == 0xFFFF);
            if (!z1)
            {
                unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mA, maskZ));
                mask = ~mask; // invert to find (w != 0)
                BM_ASSERT(mask);
                int bsf = bm::bit_scan_forward32(mask); // find first != 0
                _mm_store_si128 ((__m128i*)simd_buf, mA);
                unsigned widx = bsf >> 2; // (bsf / 4);
                unsigned w = simd_buf[widx]; // _mm_extract_epi32 (mA, widx);
                bsf = bm::bit_scan_forward32(w); // find first bit != 0
                *pos = (simd_lane * 128) + (widx * 32) + bsf;
                return true;
            }
            unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ));
            mask = ~mask; // invert to find (w != 0)
            BM_ASSERT(mask);
            int bsf = bm::bit_scan_forward32(mask); // find first != 0
            _mm_store_si128 ((__m128i*)simd_buf, mB);
            unsigned widx = bsf >> 2; // (bsf / 4);
            unsigned w = simd_buf[widx]; // _mm_extract_epi32 (mB, widx);
            bsf = bm::bit_scan_forward32(w); // find first bit != 0
            *pos = ((++simd_lane) * 128) + (widx * 32) + bsf;
            return true;
        }
        simd_lane+=2;
        block1+=2; block2+=2;
    } while (block1 < block1_end);
    return false;
}
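
/*
    Usage sketch (illustrative): locate the first mismatch between two blocks.
<pre>
    unsigned pos;
    bool found = bm::sse2_bit_find_first_diff((const __m128i*)blk1,
                                              (const __m128i*)blk2, &pos);
    // if found == true, blk1 and blk2 first differ at bit index 'pos'
</pre>
*/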

/*
Snippets to extract32 in SSE2:

inline int get_x(const __m128i& vec){return _mm_cvtsi128_si32 (vec);}
inline int get_y(const __m128i& vec){return _mm_cvtsi128_si32 (_mm_shuffle_epi32(vec,0x55));}
inline int get_z(const __m128i& vec){return _mm_cvtsi128_si32 (_mm_shuffle_epi32(vec,0xAA));}
inline int get_w(const __m128i& vec){return _mm_cvtsi128_si32 (_mm_shuffle_epi32(vec,0xFF));}
*/

/*!
    @brief block shift right by 1
    @ingroup SSE2
*/
inline
bool sse2_shift_r1(__m128i* block, unsigned* empty_acc, unsigned co1) BMNOEXCEPT
{
    __m128i* block_end =
        ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
    __m128i m1COshft, m2COshft;
    __m128i mAcc = _mm_set1_epi32(0);

    __m128i mMask0 = _mm_set_epi32(-1,-1,-1, 0);

    unsigned co2;
    for (;block < block_end; block += 2)
    {
        __m128i m1A = _mm_load_si128(block);
        __m128i m2A = _mm_load_si128(block+1);

        __m128i m1CO = _mm_srli_epi32(m1A, 31);
        __m128i m2CO = _mm_srli_epi32(m2A, 31);

        co2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(m1CO, 0xFF));

        m1A = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
        m2A = _mm_slli_epi32(m2A, 1);

        m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift-l by 1 int32
        m2COshft = _mm_slli_si128 (m2CO, 4);

        m1COshft = _mm_and_si128(m1COshft, mMask0); // clear the vec[0]
        m1COshft = _mm_or_si128(m1COshft, _mm_set_epi32(0, 0, 0, co1)); // vec[0] = co1

        m2COshft = _mm_and_si128(m2COshft, mMask0); // clear the vec[0]
        m2COshft = _mm_or_si128(m2COshft, _mm_set_epi32(0, 0, 0, co2)); // vec[0] = co2

        m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
        m2A = _mm_or_si128(m2A, m2COshft);

        co1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(m2CO, 0xFF));

        _mm_store_si128(block, m1A);
        _mm_store_si128(block+1, m2A);

        mAcc = _mm_or_si128(mAcc, m1A);
        mAcc = _mm_or_si128(mAcc, m2A);
    }
    bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mAcc, _mm_set1_epi32(0))) == 0xFFFF);
    *empty_acc = !z1;
    return co1;
}
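
/*
    Carry chaining note (illustrative): 'co1' is the bit carried in from the
    previous block (it becomes bit 0 here), and the return value is the bit
    shifted out of this block, so shifting a longer bit-vector is a chain of
    per-block calls:
<pre>
    unsigned acc;
    unsigned co = 0;
    co = bm::sse2_shift_r1((__m128i*)blk0, &acc, co); // acc != 0 if result non-empty
    co = bm::sse2_shift_r1((__m128i*)blk1, &acc, co);
</pre>
*/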

/*!
    @brief block shift left by 1
    @ingroup SSE2
*/
inline
bool sse2_shift_l1(__m128i* block, unsigned* empty_acc, unsigned co1) BMNOEXCEPT
{
    __m128i* block_end =
        ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
    __m128i mAcc = _mm_set1_epi32(0);
    __m128i mMask1 = _mm_set1_epi32(1);
    __m128i mMask0 = _mm_set_epi32(0, -1, -1, -1);

    unsigned co2;
    for (--block_end; block_end >= block; block_end -= 2)
    {
        __m128i m1A = _mm_load_si128(block_end);
        __m128i m2A = _mm_load_si128(block_end-1);

        __m128i m1CO = _mm_and_si128(m1A, mMask1);
        __m128i m2CO = _mm_and_si128(m2A, mMask1);

        co2 = _mm_cvtsi128_si32 (m1CO); // get vec[0]

        m1A = _mm_srli_epi32(m1A, 1); // (block[i] >> 1u)
        m2A = _mm_srli_epi32(m2A, 1);

        __m128i m1COshft = _mm_srli_si128 (m1CO, 4); // byte shift-r by 1 int32
        __m128i m2COshft = _mm_srli_si128 (m2CO, 4);

        // m1COshft = _mm_insert_epi32 (m1COshft, co1, 3);
        // m2COshft = _mm_insert_epi32 (m2COshft, co2, 3);
        m1COshft = _mm_and_si128(m1COshft, mMask0); // clear the vec[3]
        m1COshft = _mm_or_si128(m1COshft, _mm_set_epi32(co1, 0, 0, 0)); // vec[3] = co1
        m2COshft = _mm_and_si128(m2COshft, mMask0); // clear the vec[3]
        m2COshft = _mm_or_si128(m2COshft, _mm_set_epi32(co2, 0, 0, 0)); // vec[3] = co2


        m1COshft = _mm_slli_epi32(m1COshft, 31);
        m2COshft = _mm_slli_epi32(m2COshft, 31);

        m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
        m2A = _mm_or_si128(m2A, m2COshft);

        co1 = _mm_cvtsi128_si32 (m2CO); // get vec[0]

        _mm_store_si128(block_end, m1A);
        _mm_store_si128(block_end-1, m2A);

        mAcc = _mm_or_si128(mAcc, m1A);
        mAcc = _mm_or_si128(mAcc, m2A);
    } // for

    bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mAcc, _mm_set1_epi32(0))) == 0xFFFF);
    *empty_acc = !z1; // !_mm_testz_si128(mAcc, mAcc);
    return co1;
}



inline
bm::id_t sse2_bit_block_calc_count_change(const __m128i* BMRESTRICT block,
                                          const __m128i* BMRESTRICT block_end,
                                          unsigned* BMRESTRICT bit_count)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Loading masks
    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
    __m128i mcnt;//, ccnt;
    mcnt = _mm_xor_si128(m1, m1); // bit_cnt = 0
    //ccnt = _mm_xor_si128(m1, m1); // change_cnt = 0

    __m128i tmp1, tmp2;

    int count = (int)(block_end - block)*4; //0;//1;

    bm::word_t w, w0, w_prev;//, w_l;
    const int w_shift = sizeof(w) * 8 - 1;
    bool first_word = true;

    // first word
    {
        const bm::word_t* blk = (const bm::word_t*) block;
        w = w0 = blk[0];
        w ^= (w >> 1);
        count += bm::word_bitcount(w);
        count -= (w_prev = (w0 >> w_shift)); // negative value correction
    }

    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;

    do
    {
        // compute bit-count
        // ---------------------------------------------------------------------
        {
            __m128i b = _mm_load_si128(block);

            // w ^(w >> 1)
            tmp1 = _mm_srli_epi32(b, 1);   // tmp1 = b >> 1
            tmp2 = _mm_xor_si128(b, tmp1); // tmp2 = tmp1 ^ b;
            _mm_store_si128((__m128i*)tcnt, tmp2);


            // compare with zero
            // SSE4: _mm_test_all_zero()
            {
                // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
                //tmp1 = _mm_srli_epi32(b, 1); // tmp1 = (b >> 1 & 0x55555555)
                tmp1 = _mm_and_si128(tmp1, m1);
                tmp2 = _mm_and_si128(b, m1);      // tmp2 = (b & 0x55555555)
                b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

                // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
                tmp1 = _mm_srli_epi32(b, 2);      // (b >> 2 & 0x33333333)
                tmp1 = _mm_and_si128(tmp1, m2);
                tmp2 = _mm_and_si128(b, m2);      // (b & 0x33333333)
                b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

                // b = (b + (b >> 4)) & 0x0F0F0F0F;
                tmp1 = _mm_srli_epi32(b, 4);      // tmp1 = b >> 4
                b = _mm_add_epi32(b, tmp1);       // b = b + (b >> 4)
                b = _mm_and_si128(b, m3);         // & 0x0F0F0F0F

                // b = b + (b >> 8);
                tmp1 = _mm_srli_epi32 (b, 8);     // tmp1 = b >> 8
                b = _mm_add_epi32(b, tmp1);       // b = b + (b >> 8)

                // b = (b + (b >> 16)) & 0x0000003F;
                tmp1 = _mm_srli_epi32 (b, 16);    // b >> 16
                b = _mm_add_epi32(b, tmp1);       // b + (b >> 16)
                b = _mm_and_si128(b, m4);         // (b >> 16) & 0x0000003F;

                mcnt = _mm_add_epi32(mcnt, b);    // mcnt += b
            }

        }
        // ---------------------------------------------------------------------
        {
            //__m128i b = _mm_load_si128(block);
            // TODO: SSE4...
            //w = _mm_extract_epi32(b, i);

            const bm::word_t* BMRESTRICT blk = (const bm::word_t*) block;

            if (first_word)
            {
                first_word = false;
            }
            else
            {
                if (0!=(w0=blk[0]))
                {
                    count += bm::word_bitcount(tcnt[0]);
                    count -= !(w_prev ^ (w0 & 1));
                    count -= w_prev = (w0 >> w_shift);
                }
                else
                {
                    count -= !w_prev; w_prev ^= w_prev;
                }
            }
            if (0!=(w0=blk[1]))
            {
                count += bm::word_bitcount(tcnt[1]);
                count -= !(w_prev ^ (w0 & 1));
                count -= w_prev = (w0 >> w_shift);
            }
            else
            {
                count -= !w_prev; w_prev ^= w_prev;
            }
            if (0!=(w0=blk[2]))
            {
                count += bm::word_bitcount(tcnt[2]);
                count -= !(w_prev ^ (w0 & 1));
                count -= w_prev = (w0 >> w_shift);
            }
            else
            {
                count -= !w_prev; w_prev ^= w_prev;
            }
            if (0!=(w0=blk[3]))
            {
                count += bm::word_bitcount(tcnt[3]);
                count -= !(w_prev ^ (w0 & 1));
                count -= w_prev = (w0 >> w_shift);
            }
            else
            {
                count -= !w_prev; w_prev ^= w_prev;
            }
        }
    } while (++block < block_end);

    _mm_store_si128((__m128i*)tcnt, mcnt);
    *bit_count = tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];

    return unsigned(count);
}
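
/*
    Note on the change-count trick above (illustrative): for a 32-bit word w,
    bm::word_bitcount(w ^ (w >> 1)) equals the number of adjacent bit
    transitions inside w plus its top bit; the w_prev terms stitch the count
    across word boundaries. Scalar sketch mirroring the per-word step:
<pre>
    w = blk[i];
    count += bm::word_bitcount(w ^ (w >> 1)); // in-word transitions (+ top bit)
    count -= !(w_prev ^ (w & 1));             // boundary with the previous word
    count -= w_prev = (w >> 31);              // top-bit correction, carried over
</pre>
*/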

#ifdef __GNUG__
// necessary measure to silence a false warning from GCC about negative pointer arithmetic
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
#endif

/*!
SSE2 check of one to two (variable length) 128-bit SSE lanes for GAP search results (8 elements)
\internal
*/
inline
unsigned sse2_gap_find(const bm::gap_word_t* BMRESTRICT pbuf,
                       const bm::gap_word_t pos, unsigned size)
{
    BM_ASSERT(size <= 16);
    BM_ASSERT(size);

    const unsigned unroll_factor = 8;
    if (size < 4) // for a very short vector use conventional scan
    {
        if (pbuf[0] >= pos) { size = 0; }
        else if (pbuf[1] >= pos) { size = 1; }
        else { size = 2; BM_ASSERT(pbuf[2] >= pos); }
        return size;
    }

    __m128i m1, mz, maskF, maskFL;

    mz = _mm_setzero_si128();
    m1 = _mm_loadu_si128((__m128i*)(pbuf)); // load first 8 elements

    maskF = _mm_cmpeq_epi32(mz, mz);        // set all FF
    maskFL = _mm_slli_si128(maskF, 4 * 2);  // byte shift to make [0000 FFFF]
    int shiftL = (64 - (unroll_factor - size) * 16);
    maskFL = _mm_slli_epi64(maskFL, shiftL); // additional bit shift to [0000 00FF]

    m1 = _mm_andnot_si128(maskFL, m1); // m1 = (~mask) & m1
    m1 = _mm_or_si128(m1, maskFL);

    __m128i mp = _mm_set1_epi16(pos);  // broadcast pos into all elements of a SIMD vector
    __m128i mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // unsigned m1 >= mp
    int mi = _mm_movemask_epi8(mge_mask); // collect flag bits
    if (mi)
    {
        int bsr_i = bm::bit_scan_fwd(mi) >> 1;
        return bsr_i; // index of the first matching element (target)
    }
    if (size == 8)
        return size;

    // inspect the next lane with a possible step back (to avoid reading past the block boundary)
    // GCC gives a false warning for "- unroll_factor" here
    const bm::gap_word_t* BMRESTRICT pbuf2 = pbuf + size - unroll_factor;
    BM_ASSERT(pbuf2 > pbuf); // assert in place to make sure the GCC warning is indeed false

    m1 = _mm_loadu_si128((__m128i*)(pbuf2)); // load the next elements (with possible overlap)
    mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // m1 >= mp
    mi = _mm_movemask_epi8(mge_mask);
    if (mi)
    {
        int bsr_i = bm::bit_scan_fwd(mi) >> 1;
        return size - (unroll_factor - bsr_i);
    }
    return size;
}
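
/*
    Padding note (illustrative): the maskFL blend above overwrites the vector
    elements at index >= size with the 0xFFFF sentinel, so they always satisfy
    (element >= pos) and can never report a match beyond the valid prefix.
    Equivalent scalar idea (buf16 is a hypothetical name for the loaded lane):
<pre>
    for (unsigned i = size; i < unroll_factor; ++i)
        buf16[i] = 0xFFFF; // sentinel: always >= pos
</pre>
*/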

/**
    Hybrid binary search, starts as binary, then switches to linear scan

    \param buf - GAP buffer pointer.
    \param pos - index of the element.
    \param is_set - output. GAP value (0 or 1).
    \return GAP index.

    @ingroup SSE2
*/
inline
unsigned sse2_gap_bfind(const unsigned short* BMRESTRICT buf,
                        unsigned pos, unsigned* BMRESTRICT is_set)
{
    unsigned start = 1;
    unsigned end = 1 + ((*buf) >> 3);

    const unsigned arr_end = end;
    BM_ASSERT(start != end);
    unsigned size = end - start;

    for (; size >= 64; size = end - start)
    {
        unsigned mid = (start + end) >> 1;
        if (buf[mid] < pos)
            start = mid+1;
        else
            end = mid;
        if (buf[mid = (start + end) >> 1] < pos)
            start = mid+1;
        else
            end = mid;
        if (buf[mid = (start + end) >> 1] < pos)
            start = mid+1;
        else
            end = mid;
        if (buf[mid = (start + end) >> 1] < pos)
            start = mid+1;
        else
            end = mid;
    } // for

    for (; size >= 16; size = end - start)
    {
        if (unsigned mid = (start + end) >> 1; buf[mid] < pos)
            start = mid + 1;
        else
            end = mid;
        if (unsigned mid = (start + end) >> 1; buf[mid] < pos)
            start = mid + 1;
        else
            end = mid;
    } // for

    size += (end != arr_end);
    start += bm::sse2_gap_find(buf + start, (bm::gap_word_t)pos, size);
    BM_ASSERT(buf[start] >= pos);
    BM_ASSERT(buf[start - 1] < pos || (start == 1));

    *is_set = ((*buf) & 1) ^ ((start-1) & 1);
    return start;
}
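
/*
    GAP layout note (illustrative, following the conventions used above):
    buf[0] is a header word whose low bit is the value of the first run and
    whose upper bits (buf[0] >> 3) give the index of the last element;
    buf[1..] hold ascending run-end coordinates.
<pre>
    // runs: bits [0..9] = 0, [10..19] = 1, [20..65535] = 0
    bm::gap_word_t g[] = { (3 << 3) | 0, 9, 19, 65535 };
    unsigned is_set;
    unsigned idx = bm::sse2_gap_bfind(g, 15, &is_set);
    // is_set == 1, idx == 2 (the run covering position 15)
</pre>
*/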

/**
    Hybrid binary search, starts as binary, then switches to scan
    @ingroup SSE2
*/
inline
unsigned sse2_gap_test(const unsigned short* BMRESTRICT buf, unsigned pos)
{
    unsigned is_set;
    bm::sse2_gap_bfind(buf, pos, &is_set);
    return is_set;
}




#ifdef __GNUG__
#pragma GCC diagnostic pop
#endif


#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)

#define VECT_BITCOUNT(first, last) \
    sse2_bit_count((__m128i*) (first), (__m128i*) (last))

#define VECT_BITCOUNT_AND(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)

#define VECT_BITCOUNT_OR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)

#define VECT_BITCOUNT_XOR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)

#define VECT_BITCOUNT_SUB(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)

#define VECT_INVERT_BLOCK(first) \
    sse2_invert_block((__m128i*)first);

#define VECT_AND_BLOCK(dst, src) \
    sse2_and_block((__m128i*) dst, (__m128i*) (src))

#define VECT_AND_DIGEST(dst, src) \
    sse2_and_digest((__m128i*) dst, (const __m128i*) (src))

#define VECT_AND_OR_DIGEST_2WAY(dst, src1, src2) \
    sse2_and_or_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
    sse2_and_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))

#define VECT_AND_DIGEST_3WAY(dst, src1, src2) \
    sse2_and_digest_3way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
    sse2_and_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_OR_BLOCK(dst, src) \
    sse2_or_block((__m128i*) dst, (__m128i*) (src))

#define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
    sse2_or_block_2way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2))

#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
    sse2_or_block_3way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2))

#define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
    sse2_or_block_5way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2), (__m128i*) (src3), (__m128i*) (src4))

#define VECT_SUB_BLOCK(dst, src) \
    sse2_sub_block((__m128i*) dst, (__m128i*) (src))

#define VECT_SUB_DIGEST(dst, src) \
    sse2_sub_digest((__m128i*) dst, (const __m128i*) (src))

#define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
    sse2_sub_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_SUB_DIGEST_5WAY(dst, src1, src2, src3, src4) \
    sse2_sub_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))

#define VECT_SUB_DIGEST_3WAY(dst, src1, src2) \
    sse2_sub_digest_3way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_XOR_BLOCK(dst, src) \
    sse2_xor_block((__m128i*) dst, (__m128i*) (src))

#define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
    sse2_xor_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_COPY_BLOCK(dst, src) \
    sse2_copy_block((__m128i*) dst, (__m128i*) (src))

#define VECT_COPY_BLOCK_UNALIGN(dst, src) \
    sse2_copy_block_unalign((__m128i*) dst, (__m128i*) (src))

#define VECT_STREAM_BLOCK(dst, src) \
    sse2_stream_block((__m128i*) dst, (__m128i*) (src))

#define VECT_STREAM_BLOCK_UNALIGN(dst, src) \
    sse2_stream_block_unalign((__m128i*) dst, (__m128i*) (src))

#define VECT_SET_BLOCK(dst, value) \
    sse2_set_block((__m128i*) dst, value)

#define VECT_IS_ZERO_BLOCK(dst) \
    sse2_is_all_zero((__m128i*) dst)

#define VECT_IS_ONE_BLOCK(dst) \
    sse2_is_all_one((__m128i*) dst)

#define VECT_IS_DIGEST_ZERO(start) \
    sse2_is_digest_zero((__m128i*)start)

#define VECT_BLOCK_SET_DIGEST(dst, val) \
    sse2_block_set_digest((__m128i*)dst, val)

#define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
    sse2_lower_bound_scan_u32(arr, target, from, to)

#define VECT_SHIFT_R1(b, acc, co) \
    sse2_shift_r1((__m128i*)b, acc, co)


#define VECT_BIT_FIND_FIRST(src, off, pos) \
    sse2_bit_find_first((__m128i*) src, off, pos)

#define VECT_BIT_FIND_DIFF(src1, src2, pos) \
    sse2_bit_find_first_diff((__m128i*) src1, (__m128i*) (src2), pos)

#define VECT_BIT_BLOCK_XOR(t, src, src_xor, d) \
    sse2_bit_block_xor(t, src, src_xor, d)

#define VECT_BIT_BLOCK_XOR_2WAY(t, src_xor, d) \
    sse2_bit_block_xor_2way(t, src_xor, d)

#define VECT_GAP_BFIND(buf, pos, is_set) \
    sse2_gap_bfind(buf, pos, is_set)

#define VECT_GAP_TEST(buf, pos) \
    sse2_gap_test(buf, pos)

} // namespace


#ifdef __GNUG__
#pragma GCC diagnostic pop
#endif


#endif