BitMagic-C++
bmsse4.h
1#ifndef BMSSE4__H__INCLUDED__
2#define BMSSE4__H__INCLUDED__
3/*
4Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5
6Licensed under the Apache License, Version 2.0 (the "License");
7you may not use this file except in compliance with the License.
8You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12Unless required by applicable law or agreed to in writing, software
13distributed under the License is distributed on an "AS IS" BASIS,
14WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15See the License for the specific language governing permissions and
16limitations under the License.
17
18For more information please visit: http://bitmagic.io
19*/
20
21/*! \file bmsse4.h
22 \brief Compute functions for SSE4.2 SIMD instruction set (internal)
23
24 Aside from SSE4.2, it also compiles in WASM SIMD mode
25 for the 128-bit portable SIMD target.
26*/
27
28#ifndef BMWASMSIMDOPT
29#include<mmintrin.h>
30#endif
31#include<emmintrin.h>
32#include<smmintrin.h>
33#include<nmmintrin.h>
34#include<immintrin.h>
35
36#include "bmdef.h"
37#include "bmutil.h"
38#include "bmsse_util.h"
39
40namespace bm
41{
42
43/** @defgroup SSE4 SSE4.2 functions (internal)
44 Processor specific optimizations for SSE4.2 instructions (internals)
45 @internal
46 @ingroup bvector
47 */
48
49#ifdef __GNUG__
50#pragma GCC diagnostic push
51#pragma GCC diagnostic ignored "-Wconversion"
52#endif
53
54#ifdef _MSC_VER
55#pragma warning( push )
56#pragma warning( disable : 4146)
57#endif
58
59
60// WASM build: define missing POPCNT intrinsics via GCC built-ins
61#ifdef BMWASMSIMDOPT
62# define _mm_popcnt_u32 __builtin_popcount
63# define _mm_popcnt_u64 __builtin_popcountll
64# define BM_BSF32 __builtin_ctz
65#else
66# define BM_BSF32 bm::bsf_asm32
67#endif
68
69
70/*
71inline
72void sse2_print128(const char* prefix, const __m128i & value)
73{
74 const size_t n = sizeof(__m128i) / sizeof(unsigned);
75 unsigned buffer[n];
76 _mm_storeu_si128((__m128i*)buffer, value);
77 std::cout << prefix << " [ ";
78 for (int i = n-1; 1; --i)
79 {
80 std::cout << buffer[i] << " ";
81 if (i == 0)
82 break;
83 }
84 std::cout << "]" << std::endl;
85}
86*/
87
88/*!
89 SSE4.2 optimized bit counting.
90 @ingroup SSE4
91*/
92inline
93bm::id_t sse4_bit_count(const __m128i* block, const __m128i* block_end) BMNOEXCEPT
94{
95 bm::id_t count = 0;
96#ifdef BM64_SSE4
97 const bm::id64_t* b = (bm::id64_t*) block;
98 const bm::id64_t* b_end = (bm::id64_t*) block_end;
99 do
100 {
101 count += unsigned( _mm_popcnt_u64(b[0]) +
102 _mm_popcnt_u64(b[1]) +
103 _mm_popcnt_u64(b[2]) +
104 _mm_popcnt_u64(b[3]));
105 b += 4;
106 } while (b < b_end);
107#else
108 do
109 {
110 const unsigned* b = (unsigned*) block;
111 count += _mm_popcnt_u32(b[0]) +
112 _mm_popcnt_u32(b[1]) +
113 _mm_popcnt_u32(b[2]) +
114 _mm_popcnt_u32(b[3]);
115 } while (++block < block_end);
116#endif
117 return count;
118}
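/*
 Usage sketch (not part of the original header; assumes a 16-byte aligned
 bit-block of bm::set_block_size 32-bit words, as BitMagic block allocators
 guarantee):

 bm::word_t BM_ALIGN16 blk[bm::set_block_size] BM_ALIGN16ATTR = {0,};
 blk[0] = 0x5u; // bits 0 and 2
 bm::id_t cnt = bm::sse4_bit_count((const __m128i*)blk,
 (const __m128i*)(blk + bm::set_block_size)); // cnt == 2
*/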
119
120#ifdef BM64_SSE4
121
122/*!
123 SSE4.2 optimized bit counting; uses the digest for positioning
124 @ingroup SSE4
125*/
126inline
127bm::id_t sse4_bit_block_count(const bm::word_t* const block,
128 bm::id64_t digest) BMNOEXCEPT
129{
130 BM_ASSERT(digest);
131
132 bm::id_t count = 0;
133 bm::id64_t d = digest;
134 while (d)
135 {
136 const bm::id64_t t = bm::bmi_blsi_u64(d); // d & -d;
137 const unsigned wave = (unsigned)_mm_popcnt_u64(t - 1);
138 const unsigned off = wave * bm::set_block_digest_wave_size;
139
140 const bm::bit_block_t::bunion_t* BMRESTRICT src_u =
141 (const bm::bit_block_t::bunion_t*)(&block[off]);
142 unsigned j = 0;
143 do
144 {
145 count +=
146 unsigned( _mm_popcnt_u64(src_u->w64[j]) +
147 _mm_popcnt_u64(src_u->w64[j+1]) +
148 _mm_popcnt_u64(src_u->w64[j+2]) +
149 _mm_popcnt_u64(src_u->w64[j+3]));
150 } while ((j+=4) < bm::set_block_digest_wave_size/2);
151
152 d = bm::bmi_bslr_u64(d); // d &= d - 1;
153 } // while (d);
154 return count;
155}
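/*
 Digest semantics, scalar sketch of the wave walk above: bit i of the 64-bit
 digest flags that wave i (bm::set_block_digest_wave_size words) may contain
 set bits, so only flagged waves are popcounted:

 for (bm::id64_t d = digest; d; d = bm::bmi_bslr_u64(d)) // d &= d - 1
 {
 bm::id64_t t = bm::bmi_blsi_u64(d); // d & -d
 unsigned wave = (unsigned)_mm_popcnt_u64(t - 1); // wave index
 unsigned off = wave * bm::set_block_digest_wave_size; // word offset
 // ... popcount block[off .. off + wave size) ...
 }
*/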
156
157#endif
158
159/*!
160\internal
161*/
162BMFORCEINLINE
163unsigned op_xor(unsigned a, unsigned b) BMNOEXCEPT
164{
165 unsigned ret = (a ^ b);
166 return ret;
167}
168
169/*!
170\internal
171*/
172BMFORCEINLINE
173unsigned op_or(unsigned a, unsigned b) BMNOEXCEPT
174{
175 return (a | b);
176}
177
178/*!
179\internal
180*/
181BMFORCEINLINE
182unsigned op_and(unsigned a, unsigned b) BMNOEXCEPT
183{
184 return (a & b);
185}
186
187
188template<class Func>
189bm::id_t sse4_bit_count_op(const __m128i* BMRESTRICT block,
190 const __m128i* BMRESTRICT block_end,
191 const __m128i* BMRESTRICT mask_block,
192 Func sse2_func) BMNOEXCEPT
193{
194 bm::id_t count = 0;
195#ifdef BM64_SSE4
196 bm::id64_t BM_ALIGN16 tcnt[2] BM_ALIGN16ATTR;
197 do
198 {
199 __m128i b = sse2_func(_mm_load_si128(block), _mm_load_si128(mask_block));
200 _mm_store_si128((__m128i*)tcnt, b);
201 count += unsigned(_mm_popcnt_u64(tcnt[0]) + _mm_popcnt_u64(tcnt[1]));
202
203 b = sse2_func(_mm_load_si128(block+1), _mm_load_si128(mask_block+1));
204 _mm_store_si128((__m128i*)tcnt, b);
205 count += unsigned(_mm_popcnt_u64(tcnt[0]) + _mm_popcnt_u64(tcnt[1]));
206 block+=2; mask_block+=2;
207 } while (block < block_end);
208#else
209 do
210 {
211 __m128i tmp0 = _mm_load_si128(block);
212 __m128i tmp1 = _mm_load_si128(mask_block);
213 __m128i b = sse2_func(tmp0, tmp1);
214
215 count += _mm_popcnt_u32(_mm_extract_epi32(b, 0));
216 count += _mm_popcnt_u32(_mm_extract_epi32(b, 1));
217 count += _mm_popcnt_u32(_mm_extract_epi32(b, 2));
218 count += _mm_popcnt_u32(_mm_extract_epi32(b, 3));
219
220 ++block; ++mask_block;
221 } while (block < block_end);
222#endif
223
224 return count;
225}
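/*
 Sketch: count the population of a logical operation without materializing
 the result block, e.g. the XOR distance of two aligned blocks (blk_a and
 blk_b are assumed names; any binary __m128i functor can be passed):

 bm::id_t xor_cnt = bm::sse4_bit_count_op(
 (const __m128i*)blk_a,
 (const __m128i*)(blk_a + bm::set_block_size),
 (const __m128i*)blk_b,
 _mm_xor_si128);
*/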
226
227/*!
228 @brief check if block is all zero bits
229 @ingroup SSE4
230*/
231inline
232bool sse4_is_all_zero(const __m128i* BMRESTRICT block) BMNOEXCEPT
233{
234 __m128i w;
235 __m128i maskz = _mm_setzero_si128();
236 const __m128i* BMRESTRICT block_end =
237 (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
238
239 do
240 {
241 w = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
242 if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz))) // (w0 | w1) != maskz
243 return false;
244 w = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
245 if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz))) // (w0 | w1) != maskz
246 return false;
247 block += 4;
248 } while (block < block_end);
249 return true;
250}
251
252/*!
253 @brief check if digest stride is all zero bits
254 @ingroup SSE4
255*/
256BMFORCEINLINE
257bool sse4_is_digest_zero(const __m128i* BMRESTRICT block) BMNOEXCEPT
258{
259 __m128i wA = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
260 __m128i wB = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
261 wA = _mm_or_si128(wA, wB);
262 bool z1 = _mm_test_all_zeros(wA, wA);
263
264 wA = _mm_or_si128(_mm_load_si128(block+4), _mm_load_si128(block+5));
265 wB = _mm_or_si128(_mm_load_si128(block+6), _mm_load_si128(block+7));
266 wA = _mm_or_si128(wA, wB);
267 bool z2 = _mm_test_all_zeros(wA, wA);
268 return z1 & z2;
269}
270
271/*!
272 @brief set digest stride to 0xFF.. or 0x0 value
273 @ingroup SSE4
274*/
275BMFORCEINLINE
276void sse4_block_set_digest(__m128i* dst, unsigned value) BMNOEXCEPT
277{
278 __m128i mV = _mm_set1_epi32(int(value));
279 _mm_store_si128(dst, mV); _mm_store_si128(dst + 1, mV);
280 _mm_store_si128(dst + 2, mV); _mm_store_si128(dst + 3, mV);
281 _mm_store_si128(dst + 4, mV); _mm_store_si128(dst + 5, mV);
282 _mm_store_si128(dst + 6, mV); _mm_store_si128(dst + 7, mV);
283}
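/*
 Sketch: clear one digest stride and verify it went empty ('blk' and 'off'
 are assumed names; off must be a multiple of
 bm::set_block_digest_wave_size):

 bm::sse4_block_set_digest((__m128i*)&blk[off], 0u);
 bool z = bm::sse4_is_digest_zero((const __m128i*)&blk[off]); // z == true
*/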
284
285
286/*!
287 @brief AND blocks2
288 *dst &= *src
289
290 @return 0 if no bits were set
291 @ingroup SSE4
292*/
293inline
294unsigned sse4_and_block(__m128i* BMRESTRICT dst,
295 const __m128i* BMRESTRICT src) BMNOEXCEPT
296{
297 __m128i m1A, m1B, m1C, m1D;
298 __m128i accA, accB, accC, accD;
299
300 const __m128i* BMRESTRICT src_end =
301 (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);
302
303 accA = accB = accC = accD = _mm_setzero_si128();
304
305 do
306 {
307 m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
308 m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
309 m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
310 m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
311
312 _mm_store_si128(dst+0, m1A);
313 _mm_store_si128(dst+1, m1B);
314 _mm_store_si128(dst+2, m1C);
315 _mm_store_si128(dst+3, m1D);
316
317 accA = _mm_or_si128(accA, m1A);
318 accB = _mm_or_si128(accB, m1B);
319 accC = _mm_or_si128(accC, m1C);
320 accD = _mm_or_si128(accD, m1D);
321
322 src += 4; dst += 4;
323 } while (src < src_end);
324
325 accA = _mm_or_si128(accA, accB); // A = A | B
326 accC = _mm_or_si128(accC, accD); // C = C | D
327 accA = _mm_or_si128(accA, accC); // A = A | C
328
329 return !_mm_testz_si128(accA, accA);
330}
331
332
333/*!
334 @brief AND block digest stride
335 *dst &= *src
336
337 @return true if stride is all zero
338 @ingroup SSE4
339*/
340BMFORCEINLINE
341bool sse4_and_digest(__m128i* BMRESTRICT dst,
342 const __m128i* BMRESTRICT src) BMNOEXCEPT
343{
344 __m128i m1A, m1B, m1C, m1D;
345
346 m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
347 m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
348 m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
349 m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
350
351 _mm_store_si128(dst+0, m1A);
352 _mm_store_si128(dst+1, m1B);
353 _mm_store_si128(dst+2, m1C);
354 _mm_store_si128(dst+3, m1D);
355
356 m1A = _mm_or_si128(m1A, m1B);
357 m1C = _mm_or_si128(m1C, m1D);
358 m1A = _mm_or_si128(m1A, m1C);
359
360 bool z1 = _mm_testz_si128(m1A, m1A);
361
362 m1A = _mm_and_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
363 m1B = _mm_and_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
364 m1C = _mm_and_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
365 m1D = _mm_and_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
366
367 _mm_store_si128(dst+4, m1A);
368 _mm_store_si128(dst+5, m1B);
369 _mm_store_si128(dst+6, m1C);
370 _mm_store_si128(dst+7, m1D);
371
372 m1A = _mm_or_si128(m1A, m1B);
373 m1C = _mm_or_si128(m1C, m1D);
374 m1A = _mm_or_si128(m1A, m1C);
375
376 bool z2 = _mm_testz_si128(m1A, m1A);
377
378 return z1 & z2;
379}
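/*
 Typical digest-maintenance pattern (sketch; dst_blk, src_blk, wave and
 digest are assumed names): AND one stride in place, then drop its digest
 bit if nothing survived:

 unsigned off = wave * bm::set_block_digest_wave_size;
 bool all_z = bm::sse4_and_digest((__m128i*)&dst_blk[off],
 (const __m128i*)&src_blk[off]);
 if (all_z)
 digest &= ~(1ull << wave); // stride is now all zero
*/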
380
381/*!
382 @brief AND block digest stride
383 *dst = *src1 & src2
384
385 @return true if stride is all zero
386 @ingroup SSE4
387*/
388BMFORCEINLINE
389bool sse4_and_digest_2way(__m128i* BMRESTRICT dst,
390 const __m128i* BMRESTRICT src1,
391 const __m128i* BMRESTRICT src2) BMNOEXCEPT
392{
393 __m128i m1A, m1B, m1C, m1D;
394
395 m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
396 m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
397 m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
398 m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
399
400 _mm_store_si128(dst+0, m1A);
401 _mm_store_si128(dst+1, m1B);
402 _mm_store_si128(dst+2, m1C);
403 _mm_store_si128(dst+3, m1D);
404
405 m1A = _mm_or_si128(m1A, m1B);
406 m1C = _mm_or_si128(m1C, m1D);
407 m1A = _mm_or_si128(m1A, m1C);
408
409 bool z1 = _mm_testz_si128(m1A, m1A);
410
411 m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
412 m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
413 m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
414 m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
415
416 _mm_store_si128(dst+4, m1A);
417 _mm_store_si128(dst+5, m1B);
418 _mm_store_si128(dst+6, m1C);
419 _mm_store_si128(dst+7, m1D);
420
421 m1A = _mm_or_si128(m1A, m1B);
422 m1C = _mm_or_si128(m1C, m1D);
423 m1A = _mm_or_si128(m1A, m1C);
424
425 bool z2 = _mm_testz_si128(m1A, m1A);
426
427 return z1 & z2;
428}
429
430/*!
431 @brief AND-OR block digest stride
432 *dst |= *src1 & src2
433
434 @return true if stride is all zero
435 @ingroup SSE4
436*/
437inline
438bool sse4_and_or_digest_2way(__m128i* BMRESTRICT dst,
439 const __m128i* BMRESTRICT src1,
440 const __m128i* BMRESTRICT src2) BMNOEXCEPT
441{
442 __m128i m1A, m1B, m1C, m1D;
443 __m128i mACC1;
444
445 m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
446 m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
447 m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
448 m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
449
450 mACC1 = _mm_or_si128(_mm_or_si128(m1A, m1B), _mm_or_si128(m1C, m1D));
451 bool z1 = _mm_testz_si128(mACC1, mACC1);
452
453 m1A = _mm_or_si128(_mm_load_si128(dst+0), m1A);
454 m1B = _mm_or_si128(_mm_load_si128(dst+1), m1B);
455 m1C = _mm_or_si128(_mm_load_si128(dst+2), m1C);
456 m1D = _mm_or_si128(_mm_load_si128(dst+3), m1D);
457
458 _mm_store_si128(dst+0, m1A);
459 _mm_store_si128(dst+1, m1B);
460 _mm_store_si128(dst+2, m1C);
461 _mm_store_si128(dst+3, m1D);
462
463
464 m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
465 m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
466 m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
467 m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
468
469 mACC1 = _mm_or_si128(_mm_or_si128(m1A, m1B), _mm_or_si128(m1C, m1D));
470 bool z2 = _mm_testz_si128(mACC1, mACC1);
471
472 m1A = _mm_or_si128(_mm_load_si128(dst+4), m1A);
473 m1B = _mm_or_si128(_mm_load_si128(dst+5), m1B);
474 m1C = _mm_or_si128(_mm_load_si128(dst+6), m1C);
475 m1D = _mm_or_si128(_mm_load_si128(dst+7), m1D);
476
477 _mm_store_si128(dst+4, m1A);
478 _mm_store_si128(dst+5, m1B);
479 _mm_store_si128(dst+6, m1C);
480 _mm_store_si128(dst+7, m1D);
481
482 return z1 & z2;
483}
484
485/*!
486 @brief AND block digest stride
488 @return true if stride is all zero
488 @ingroup SSE4
489*/
490inline
491bool sse4_and_digest_3way(__m128i* BMRESTRICT dst,
492 const __m128i* BMRESTRICT src1,
493 const __m128i* BMRESTRICT src2) BMNOEXCEPT
494{
495 __m128i m1A, m1B, m1C, m1D;
496
497 m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
498 m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
499 m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
500 m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
501
502
503 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
504 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
505 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
506 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
507
508 _mm_store_si128(dst+0, m1A);
509 _mm_store_si128(dst+1, m1B);
510 _mm_store_si128(dst+2, m1C);
511 _mm_store_si128(dst+3, m1D);
512
513 m1A = _mm_or_si128(m1A, m1B);
514 m1C = _mm_or_si128(m1C, m1D);
515 m1A = _mm_or_si128(m1A, m1C);
516
517 bool z1 = _mm_testz_si128(m1A, m1A);
518
519 m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
520 m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
521 m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
522 m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
523
524
525 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
526 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
527 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
528 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
529
530 _mm_store_si128(dst+4, m1A);
531 _mm_store_si128(dst+5, m1B);
532 _mm_store_si128(dst+6, m1C);
533 _mm_store_si128(dst+7, m1D);
534
535 m1A = _mm_or_si128(m1A, m1B);
536 m1C = _mm_or_si128(m1C, m1D);
537 m1A = _mm_or_si128(m1A, m1C);
538
539 bool z2 = _mm_testz_si128(m1A, m1A);
540
541 return z1 & z2;
542}
543
544
545
546/*!
547 @brief AND block digest stride
549 @return true if stride is all zero
549 @ingroup SSE4
550*/
551inline
552bool sse4_and_digest_5way(__m128i* BMRESTRICT dst,
553 const __m128i* BMRESTRICT src1,
554 const __m128i* BMRESTRICT src2,
555 const __m128i* BMRESTRICT src3,
556 const __m128i* BMRESTRICT src4) BMNOEXCEPT
557{
558 __m128i m1A, m1B, m1C, m1D;
559 __m128i m1E, m1F, m1G, m1H;
560
561 m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
562 m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
563 m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
564 m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
565
566 m1E = _mm_and_si128(_mm_load_si128(src3+0), _mm_load_si128(src4+0));
567 m1F = _mm_and_si128(_mm_load_si128(src3+1), _mm_load_si128(src4+1));
568 m1G = _mm_and_si128(_mm_load_si128(src3+2), _mm_load_si128(src4+2));
569 m1H = _mm_and_si128(_mm_load_si128(src3+3), _mm_load_si128(src4+3));
570
571 m1A = _mm_and_si128(m1A, m1E);
572 m1B = _mm_and_si128(m1B, m1F);
573 m1C = _mm_and_si128(m1C, m1G);
574 m1D = _mm_and_si128(m1D, m1H);
575
576 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
577 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
578 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
579 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
580
581 _mm_store_si128(dst+0, m1A);
582 _mm_store_si128(dst+1, m1B);
583 _mm_store_si128(dst+2, m1C);
584 _mm_store_si128(dst+3, m1D);
585
586 m1A = _mm_or_si128(m1A, m1B);
587 m1C = _mm_or_si128(m1C, m1D);
588 m1A = _mm_or_si128(m1A, m1C);
589
590 bool z1 = _mm_testz_si128(m1A, m1A);
591
592 m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
593 m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
594 m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
595 m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
596
597 m1E = _mm_and_si128(_mm_load_si128(src3+4), _mm_load_si128(src4+4));
598 m1F = _mm_and_si128(_mm_load_si128(src3+5), _mm_load_si128(src4+5));
599 m1G = _mm_and_si128(_mm_load_si128(src3+6), _mm_load_si128(src4+6));
600 m1H = _mm_and_si128(_mm_load_si128(src3+7), _mm_load_si128(src4+7));
601
602 m1A = _mm_and_si128(m1A, m1E);
603 m1B = _mm_and_si128(m1B, m1F);
604 m1C = _mm_and_si128(m1C, m1G);
605 m1D = _mm_and_si128(m1D, m1H);
606
607 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
608 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
609 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
610 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
611
612 _mm_store_si128(dst+4, m1A);
613 _mm_store_si128(dst+5, m1B);
614 _mm_store_si128(dst+6, m1C);
615 _mm_store_si128(dst+7, m1D);
616
617 m1A = _mm_or_si128(m1A, m1B);
618 m1C = _mm_or_si128(m1C, m1D);
619 m1A = _mm_or_si128(m1A, m1C);
620
621 bool z2 = _mm_testz_si128(m1A, m1A);
622
623 return z1 & z2;
624}
625
626
627
628/*!
629 @brief SUB (AND NOT) block digest stride
630 *dst &= ~*src
631
632 @return true if stride is all zero
633 @ingroup SSE4
634*/
635BMFORCEINLINE
636bool sse4_sub_digest(__m128i* BMRESTRICT dst,
637 const __m128i* BMRESTRICT src) BMNOEXCEPT
638{
639 __m128i m1A, m1B, m1C, m1D;
640
641 m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
642 m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
643 m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
644 m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
645
646 _mm_store_si128(dst+0, m1A);
647 _mm_store_si128(dst+1, m1B);
648 _mm_store_si128(dst+2, m1C);
649 _mm_store_si128(dst+3, m1D);
650
651 m1A = _mm_or_si128(m1A, m1B);
652 m1C = _mm_or_si128(m1C, m1D);
653 m1A = _mm_or_si128(m1A, m1C);
654
655 bool z1 = _mm_testz_si128(m1A, m1A);
656
657 m1A = _mm_andnot_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
658 m1B = _mm_andnot_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
659 m1C = _mm_andnot_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
660 m1D = _mm_andnot_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
661
662 _mm_store_si128(dst+4, m1A);
663 _mm_store_si128(dst+5, m1B);
664 _mm_store_si128(dst+6, m1C);
665 _mm_store_si128(dst+7, m1D);
666
667 m1A = _mm_or_si128(m1A, m1B);
668 m1C = _mm_or_si128(m1C, m1D);
669 m1A = _mm_or_si128(m1A, m1C);
670
671 bool z2 = _mm_testz_si128(m1A, m1A);
672
673 return z1 & z2;
674}
675
676
677/*!
678 @brief 2-operand SUB (AND NOT) block digest stride
679 *dst = src1 & ~*src2
680
681 @return true if stride is all zero
682 @ingroup SSE4
683*/
684BMFORCEINLINE
685bool sse4_sub_digest_2way(__m128i* BMRESTRICT dst,
686 const __m128i* BMRESTRICT src1,
687 const __m128i* BMRESTRICT src2) BMNOEXCEPT
688{
689 __m128i m1A, m1B, m1C, m1D;
690
691 m1A = _mm_andnot_si128(_mm_load_si128(src2+0), _mm_load_si128(src1+0));
692 m1B = _mm_andnot_si128(_mm_load_si128(src2+1), _mm_load_si128(src1+1));
693 m1C = _mm_andnot_si128(_mm_load_si128(src2+2), _mm_load_si128(src1+2));
694 m1D = _mm_andnot_si128(_mm_load_si128(src2+3), _mm_load_si128(src1+3));
695
696 _mm_store_si128(dst+0, m1A);
697 _mm_store_si128(dst+1, m1B);
698 _mm_store_si128(dst+2, m1C);
699 _mm_store_si128(dst+3, m1D);
700
701 m1A = _mm_or_si128(m1A, m1B);
702 m1C = _mm_or_si128(m1C, m1D);
703 m1A = _mm_or_si128(m1A, m1C);
704
705 bool z1 = _mm_testz_si128(m1A, m1A);
706
707 m1A = _mm_andnot_si128(_mm_load_si128(src2+4), _mm_load_si128(src1+4));
708 m1B = _mm_andnot_si128(_mm_load_si128(src2+5), _mm_load_si128(src1+5));
709 m1C = _mm_andnot_si128(_mm_load_si128(src2+6), _mm_load_si128(src1+6));
710 m1D = _mm_andnot_si128(_mm_load_si128(src2+7), _mm_load_si128(src1+7));
711
712 _mm_store_si128(dst+4, m1A);
713 _mm_store_si128(dst+5, m1B);
714 _mm_store_si128(dst+6, m1C);
715 _mm_store_si128(dst+7, m1D);
716
717 m1A = _mm_or_si128(m1A, m1B);
718 m1C = _mm_or_si128(m1C, m1D);
719 m1A = _mm_or_si128(m1A, m1C);
720
721 bool z2 = _mm_testz_si128(m1A, m1A);
722
723 return z1 & z2;
724}
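/*
 Same pattern for SUB (sketch): *dst = *src1 & ~*src2 over one stride, with
 the digest bit cleared when the stride goes empty (names are assumed):

 bool all_z = bm::sse4_sub_digest_2way((__m128i*)&dst_blk[off],
 (const __m128i*)&src1_blk[off],
 (const __m128i*)&src2_blk[off]);
 if (all_z)
 digest &= ~(1ull << wave);
*/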
725
726/*!
727 @brief SUB block digest stride
728 @return true if stride is all zero
729 @ingroup SSE4
730*/
731inline
732bool sse4_sub_digest_5way(__m128i* BMRESTRICT dst,
733 const __m128i* BMRESTRICT src1,
734 const __m128i* BMRESTRICT src2,
735 const __m128i* BMRESTRICT src3,
736 const __m128i* BMRESTRICT src4) BMNOEXCEPT
737{
738 __m128i m1A, m1B, m1C, m1D;
739 __m128i m1E, m1F, m1G, m1H;
740 __m128i maskFF = _mm_set1_epi32(~0u);
741
742 m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+0)), _mm_xor_si128(maskFF,_mm_load_si128(src2+0)));
743 m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+1)), _mm_xor_si128(maskFF,_mm_load_si128(src2+1)));
744 m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+2)), _mm_xor_si128(maskFF,_mm_load_si128(src2+2)));
745 m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+3)), _mm_xor_si128(maskFF,_mm_load_si128(src2+3)));
746
747 m1E = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+0)), _mm_xor_si128(maskFF,_mm_load_si128(src4+0)));
748 m1F = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+1)), _mm_xor_si128(maskFF,_mm_load_si128(src4+1)));
749 m1G = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+2)), _mm_xor_si128(maskFF,_mm_load_si128(src4+2)));
750 m1H = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+3)), _mm_xor_si128(maskFF,_mm_load_si128(src4+3)));
751
752 m1A = _mm_and_si128(m1A, m1E);
753 m1B = _mm_and_si128(m1B, m1F);
754 m1C = _mm_and_si128(m1C, m1G);
755 m1D = _mm_and_si128(m1D, m1H);
756
757 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
758 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
759 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
760 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
761
762 _mm_store_si128(dst+0, m1A);
763 _mm_store_si128(dst+1, m1B);
764 _mm_store_si128(dst+2, m1C);
765 _mm_store_si128(dst+3, m1D);
766
767 m1A = _mm_or_si128(m1A, m1B);
768 m1C = _mm_or_si128(m1C, m1D);
769 m1A = _mm_or_si128(m1A, m1C);
770
771 bool z1 = _mm_testz_si128(m1A, m1A);
772
773 m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+4)), _mm_xor_si128(maskFF,_mm_load_si128(src2+4)));
774 m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+5)), _mm_xor_si128(maskFF,_mm_load_si128(src2+5)));
775 m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+6)), _mm_xor_si128(maskFF,_mm_load_si128(src2+6)));
776 m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+7)), _mm_xor_si128(maskFF,_mm_load_si128(src2+7)));
777
778 m1E = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+4)), _mm_xor_si128(maskFF,_mm_load_si128(src4+4)));
779 m1F = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+5)), _mm_xor_si128(maskFF,_mm_load_si128(src4+5)));
780 m1G = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+6)), _mm_xor_si128(maskFF,_mm_load_si128(src4+6)));
781 m1H = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+7)), _mm_xor_si128(maskFF,_mm_load_si128(src4+7)));
782
783 m1A = _mm_and_si128(m1A, m1E);
784 m1B = _mm_and_si128(m1B, m1F);
785 m1C = _mm_and_si128(m1C, m1G);
786 m1D = _mm_and_si128(m1D, m1H);
787
788 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
789 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
790 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
791 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
792
793 _mm_store_si128(dst+4, m1A);
794 _mm_store_si128(dst+5, m1B);
795 _mm_store_si128(dst+6, m1C);
796 _mm_store_si128(dst+7, m1D);
797
798 m1A = _mm_or_si128(m1A, m1B);
799 m1C = _mm_or_si128(m1C, m1D);
800 m1A = _mm_or_si128(m1A, m1C);
801
802 bool z2 = _mm_testz_si128(m1A, m1A);
803
804 return z1 & z2;
805}
806
807
808/*!
809 @brief SUB block digest stride
810 @return true if stride is all zero
811 @ingroup SSE4
812*/
813inline
814bool sse4_sub_digest_3way(__m128i* BMRESTRICT dst,
815 const __m128i* BMRESTRICT src1,
816 const __m128i* BMRESTRICT src2) BMNOEXCEPT
817{
818 __m128i m1A, m1B, m1C, m1D;
819 __m128i maskFF = _mm_set1_epi32(~0u);
820
821 m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+0)), _mm_xor_si128(maskFF,_mm_load_si128(src2+0)));
822 m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+1)), _mm_xor_si128(maskFF,_mm_load_si128(src2+1)));
823 m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+2)), _mm_xor_si128(maskFF,_mm_load_si128(src2+2)));
824 m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+3)), _mm_xor_si128(maskFF,_mm_load_si128(src2+3)));
825
826 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
827 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
828 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
829 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
830
831 _mm_store_si128(dst+0, m1A);
832 _mm_store_si128(dst+1, m1B);
833 _mm_store_si128(dst+2, m1C);
834 _mm_store_si128(dst+3, m1D);
835
836 m1A = _mm_or_si128(m1A, m1B);
837 m1C = _mm_or_si128(m1C, m1D);
838 m1A = _mm_or_si128(m1A, m1C);
839
840 bool z1 = _mm_testz_si128(m1A, m1A);
841
842 m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+4)), _mm_xor_si128(maskFF,_mm_load_si128(src2+4)));
843 m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+5)), _mm_xor_si128(maskFF,_mm_load_si128(src2+5)));
844 m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+6)), _mm_xor_si128(maskFF,_mm_load_si128(src2+6)));
845 m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+7)), _mm_xor_si128(maskFF,_mm_load_si128(src2+7)));
846
847 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
848 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
849 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
850 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
851
852 _mm_store_si128(dst+4, m1A);
853 _mm_store_si128(dst+5, m1B);
854 _mm_store_si128(dst+6, m1C);
855 _mm_store_si128(dst+7, m1D);
856
857 m1A = _mm_or_si128(m1A, m1B);
858 m1C = _mm_or_si128(m1C, m1D);
859 m1A = _mm_or_si128(m1A, m1C);
860
861 bool z2 = _mm_testz_si128(m1A, m1A);
862
863 return z1 & z2;
864}
865
866
867
868
869/*!
870 @brief check if block is all ONE bits
871 @ingroup SSE4
872*/
873inline
874bool sse4_is_all_one(const __m128i* BMRESTRICT block) BMNOEXCEPT
875{
876 __m128i w;
877 const __m128i* BMRESTRICT block_end =
878 (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
879
880 do
881 {
882 w = _mm_and_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
883 if (!_mm_test_all_ones(w))
884 return false;
885 w = _mm_and_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
886 if (!_mm_test_all_ones(w))
887 return false;
888
889 block+=4;
890 } while (block < block_end);
891 return true;
892}
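/*
 Sketch: cheap block classification before detailed processing (blk is an
 assumed 16-byte aligned full block):

 if (bm::sse4_is_all_zero((const __m128i*)blk))
 ; // block can be dropped (equivalent to a NULL block)
 else if (bm::sse4_is_all_one((const __m128i*)blk))
 ; // block can be replaced by the FULL-block sentinel
*/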
893
894/*!
895 @brief check if SSE wave is all 0xFFFF...FFF
896 @ingroup SSE4
897*/
898BMFORCEINLINE
899bool sse42_test_all_one_wave(const void* ptr) BMNOEXCEPT
900{
901 return _mm_test_all_ones(_mm_loadu_si128((__m128i*)ptr));
902}
903
904
905/*!
906 @brief check if wave of pointers is all NULL
907 @ingroup SSE4
908*/
909BMFORCEINLINE
910bool sse42_test_all_zero_wave(const void* ptr) BMNOEXCEPT
911{
912 __m128i w0 = _mm_loadu_si128((__m128i*)ptr);
913 return _mm_testz_si128(w0, w0);
914}
915
916/*!
917 @brief check if 2 waves of pointers are all NULL
918 @ingroup SSE4
919*/
920BMFORCEINLINE
921bool sse42_test_all_zero_wave2(const void* ptr0, const void* ptr1) BMNOEXCEPT
922{
923 __m128i w0 = _mm_loadu_si128((__m128i*)ptr0);
924 __m128i w1 = _mm_loadu_si128((__m128i*)ptr1);
925 w0 = _mm_or_si128(w0, w1);
926 return _mm_testz_si128(w0, w0);
927}
928
929/*!
930 @brief check if a wave of 2 pointers is the same (null or FULL)
931 @ingroup SSE4
932*/
933BMFORCEINLINE
934bool sse42_test_all_eq_wave2(const void* ptr0, const void* ptr1) BMNOEXCEPT
935{
936 __m128i w0 = _mm_loadu_si128((__m128i*)ptr0);
937 __m128i w1 = _mm_loadu_si128((__m128i*)ptr1);
938 w0 = _mm_xor_si128(w0, w1);
939 return _mm_testz_si128(w0, w0);
940}
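/*
 Sketch: the wave tests look at 128-bit spans, e.g. pairs of 64-bit block
 pointers in a sub-block pointer array:

 bm::word_t* parr[8] = {0,};
 bool all_null = bm::sse42_test_all_zero_wave(parr); // true
 bool same = bm::sse42_test_all_eq_wave2(parr, parr + 2); // true
*/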
941
942
943/*!
944 SSE4.2 calculate number of bit changes from 0 to 1
945 @ingroup SSE4
946*/
947inline
948unsigned sse42_bit_block_calc_change(const __m128i* BMRESTRICT block,
949 unsigned size) BMNOEXCEPT
950{
951 bm::id64_t BM_ALIGN16 tcnt[2] BM_ALIGN16ATTR;
952
953 const __m128i* block_end =
954 ( __m128i*)((bm::word_t*)(block) + size); // bm::set_block_size
955 __m128i m1COshft, m2COshft;
956
957 unsigned w0 = *((bm::word_t*)(block));
958 unsigned count = 1;
959
960 unsigned co2, co1 = 0;
961 for (;block < block_end; block += 2)
962 {
963 __m128i m1A = _mm_load_si128(block);
964 __m128i m2A = _mm_load_si128(block+1);
965
966 __m128i m1CO = _mm_srli_epi32(m1A, 31);
967 __m128i m2CO = _mm_srli_epi32(m2A, 31);
968
969 co2 = _mm_extract_epi32(m1CO, 3);
970
971 __m128i m1As = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
972 __m128i m2As = _mm_slli_epi32(m2A, 1);
973
974 m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
975 m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
976
977 co1 = co2;
978
979 co2 = _mm_extract_epi32(m2CO, 3);
980
981 m2COshft = _mm_slli_si128 (m2CO, 4);
982 m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
983
984 m1As = _mm_or_si128(m1As, m1COshft); // block[i] |= co_flag
985 m2As = _mm_or_si128(m2As, m2COshft);
986
987 co1 = co2;
988
989 // we now have two shifted SSE4 regs with carry-over
990 m1A = _mm_xor_si128(m1A, m1As); // w ^= (w >> 1);
991 m2A = _mm_xor_si128(m2A, m2As);
992
993#ifdef BM64_SSE4
994 _mm_store_si128((__m128i*)tcnt, m1A);
995 count += unsigned(_mm_popcnt_u64(tcnt[0]) + _mm_popcnt_u64(tcnt[1]));
996 _mm_store_si128((__m128i*)tcnt, m2A);
997 count += unsigned(_mm_popcnt_u64(tcnt[0]) + _mm_popcnt_u64(tcnt[1]));
998#else
999 bm::id_t m0 = _mm_extract_epi32(m1A, 0);
1000 bm::id_t m1 = _mm_extract_epi32(m1A, 1);
1001 bm::id_t m2 = _mm_extract_epi32(m1A, 2);
1002 bm::id_t m3 = _mm_extract_epi32(m1A, 3);
1003 count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
1004 _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
1005
1006 m0 = _mm_extract_epi32(m2A, 0);
1007 m1 = _mm_extract_epi32(m2A, 1);
1008 m2 = _mm_extract_epi32(m2A, 2);
1009 m3 = _mm_extract_epi32(m2A, 3);
1010 count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
1011 _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
1012#endif
1013
1014 }
1015 count -= (w0 & 1u); // correct initial carry-in error
1016 return count;
1017}
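/*
 Scalar reference (sketch) of what the loop above computes: the number of
 adjacent-bit transitions plus one, i.e. the GAP-form length of the block
 (wblk is an assumed bm::word_t view of the same data):

 unsigned count = 1;
 unsigned co = 0; // top bit carried over from the previous word
 for (unsigned i = 0; i < size; ++i)
 {
 bm::word_t w = wblk[i];
 bm::word_t d = w ^ ((w << 1) | co); // 1s where bit != left neighbor
 count += (unsigned)_mm_popcnt_u32(d);
 co = w >> 31;
 }
 count -= (wblk[0] & 1u); // bit 0 has no left neighbor
*/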
1018
1019
1020/*!
1021 SSE4.2 calculate number of bit changes from 0 to 1 of an XOR product
1022 @ingroup SSE4
1023*/
1024inline
1025void sse42_bit_block_calc_xor_change(const __m128i* BMRESTRICT block,
1026 const __m128i* BMRESTRICT xor_block,
1027 unsigned size,
1028 unsigned* BMRESTRICT gc,
1029 unsigned* BMRESTRICT bc) BMNOEXCEPT
1030{
1031#ifdef BM64_SSE4
1032 bm::id64_t BM_ALIGN16 simd_buf0[2] BM_ALIGN16ATTR;
1033 bm::id64_t BM_ALIGN16 simd_buf1[2] BM_ALIGN16ATTR;
1034#endif
1035
1036 const __m128i* block_end =
1037 ( __m128i*)((bm::word_t*)(block) + size);
1038 __m128i m1COshft, m2COshft;
1039
1040 unsigned w0 = *((bm::word_t*)(block));
1041 unsigned gap_count = 1;
1042 unsigned bit_count = 0;
1043
1044 unsigned co2, co1 = 0;
1045 for (;block < block_end; block += 2, xor_block += 2)
1046 {
1047 __m128i m1A = _mm_load_si128(block);
1048 __m128i m2A = _mm_load_si128(block+1);
1049 __m128i m1B = _mm_load_si128(xor_block);
1050 __m128i m2B = _mm_load_si128(xor_block+1);
1051
1052 m1A = _mm_xor_si128(m1A, m1B);
1053 m2A = _mm_xor_si128(m2A, m2B);
1054
1055 {
1056#ifdef BM64_SSE4
1057 _mm_store_si128 ((__m128i*)simd_buf0, m1A);
1058 _mm_store_si128 ((__m128i*)simd_buf1, m2A);
1059 bit_count += unsigned(_mm_popcnt_u64(simd_buf0[0]) + _mm_popcnt_u64(simd_buf0[1]));
1060 bit_count += unsigned(_mm_popcnt_u64(simd_buf1[0]) + _mm_popcnt_u64(simd_buf1[1]));
1061#else
1062 bm::id_t m0 = _mm_extract_epi32(m1A, 0);
1063 bm::id_t m1 = _mm_extract_epi32(m1A, 1);
1064 bm::id_t m2 = _mm_extract_epi32(m1A, 2);
1065 bm::id_t m3 = _mm_extract_epi32(m1A, 3);
1066 bit_count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
1067 _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
1068
1069 m0 = _mm_extract_epi32(m2A, 0);
1070 m1 = _mm_extract_epi32(m2A, 1);
1071 m2 = _mm_extract_epi32(m2A, 2);
1072 m3 = _mm_extract_epi32(m2A, 3);
1073 bit_count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
1074 _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
1075#endif
1076 }
1077
1078 __m128i m1CO = _mm_srli_epi32(m1A, 31);
1079 __m128i m2CO = _mm_srli_epi32(m2A, 31);
1080
1081 co2 = _mm_extract_epi32(m1CO, 3);
1082
1083 __m128i m1As = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
1084 __m128i m2As = _mm_slli_epi32(m2A, 1);
1085
1086 m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
1087 m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
1088
1089 co1 = co2;
1090
1091 co2 = _mm_extract_epi32(m2CO, 3);
1092
1093 m2COshft = _mm_slli_si128 (m2CO, 4);
1094 m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
1095
1096 m1As = _mm_or_si128(m1As, m1COshft); // block[i] |= co_flag
1097 m2As = _mm_or_si128(m2As, m2COshft);
1098
1099 co1 = co2;
1100
1101 // we now have two shifted SSE4 regs with carry-over
1102 m1A = _mm_xor_si128(m1A, m1As); // w ^= (w >> 1);
1103 m2A = _mm_xor_si128(m2A, m2As);
1104
1105#ifdef BM64_SSE4
1106 _mm_store_si128 ((__m128i*)simd_buf0, m1A);
1107 _mm_store_si128 ((__m128i*)simd_buf1, m2A);
1108 gap_count += unsigned(_mm_popcnt_u64(simd_buf0[0]) + _mm_popcnt_u64(simd_buf0[1]));
1109 gap_count += unsigned(_mm_popcnt_u64(simd_buf1[0]) + _mm_popcnt_u64(simd_buf1[1]));
1110#else
1111 bm::id_t m0 = _mm_extract_epi32(m1A, 0);
1112 bm::id_t m1 = _mm_extract_epi32(m1A, 1);
1113 bm::id_t m2 = _mm_extract_epi32(m1A, 2);
1114 bm::id_t m3 = _mm_extract_epi32(m1A, 3);
1115 gap_count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
1116 _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
1117
1118 m0 = _mm_extract_epi32(m2A, 0);
1119 m1 = _mm_extract_epi32(m2A, 1);
1120 m2 = _mm_extract_epi32(m2A, 2);
1121 m3 = _mm_extract_epi32(m2A, 3);
1122 gap_count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
1123 _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
1124#endif
1125
1126 }
1127 gap_count -= (w0 & 1u); // correct initial carry-in error
1128 if (!gap_count)
1129 ++gap_count; // must be >0
1130 *gc = gap_count;
1131 *bc = bit_count;
1132}
1133
1134
1135
1136#ifdef BM64_SSE4
1137
1138/*!
1139 SSE4.2 calculate number of bit changes from 0 to 1
1140 @ingroup SSE4
1141*/
1142inline
1143void sse42_bit_block_calc_change_bc(const __m128i* BMRESTRICT block,
1144 unsigned* gc, unsigned* bc) BMNOEXCEPT
1145{
1146 const __m128i* block_end =
1147 ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
1148 __m128i m1COshft, m2COshft;
1149
1150 unsigned w0 = *((bm::word_t*)(block));
1151 unsigned bit_count = 0;
1152 unsigned gap_count = 1;
1153
1154 unsigned co2, co1 = 0;
1155 for (;block < block_end; block += 2)
1156 {
1157 __m128i m1A = _mm_load_si128(block);
1158 __m128i m2A = _mm_load_si128(block+1);
1159 {
1160 bm::id64_t m0 = _mm_extract_epi64(m1A, 0);
1161 bm::id64_t m1 = _mm_extract_epi64(m1A, 1);
1162 bit_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
1163 m0 = _mm_extract_epi64(m2A, 0);
1164 m1 = _mm_extract_epi64(m2A, 1);
1165 bit_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
1166 }
1167
1168 __m128i m1CO = _mm_srli_epi32(m1A, 31);
1169 __m128i m2CO = _mm_srli_epi32(m2A, 31);
1170
1171 co2 = _mm_extract_epi32(m1CO, 3);
1172
1173 __m128i m1As = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
1174 __m128i m2As = _mm_slli_epi32(m2A, 1);
1175
1176 m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
1177 m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
1178
1179 co1 = co2;
1180
1181 co2 = _mm_extract_epi32(m2CO, 3);
1182
1183 m2COshft = _mm_slli_si128 (m2CO, 4);
1184 m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
1185
1186 m1As = _mm_or_si128(m1As, m1COshft); // block[i] |= co_flag
1187 m2As = _mm_or_si128(m2As, m2COshft);
1188
1189 co1 = co2;
1190
1191 // we now have two shifted SSE4 regs with carry-over
1192 m1A = _mm_xor_si128(m1A, m1As); // w ^= (w >> 1);
1193 m2A = _mm_xor_si128(m2A, m2As);
1194 {
1195 bm::id64_t m0 = _mm_extract_epi64(m1A, 0);
1196 bm::id64_t m1 = _mm_extract_epi64(m1A, 1);
1197 gap_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
1198 }
1199
1200 bm::id64_t m0 = _mm_extract_epi64(m2A, 0);
1201 bm::id64_t m1 = _mm_extract_epi64(m2A, 1);
1202 gap_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
1203
1204 }
1205 gap_count -= (w0 & 1u); // correct initial carry-in error
1206 *gc = gap_count;
1207 *bc = bit_count;
1208}
1209
1210#endif
1211
1212
1213/*!
1214 \brief Find first bit which is different between two bit-blocks
1215 @ingroup SSE4
1216*/
1217inline
1218bool sse42_bit_find_first_diff(const __m128i* BMRESTRICT block1,
1219 const __m128i* BMRESTRICT block2,
1220 unsigned* pos) BMNOEXCEPT
1221{
1222 unsigned BM_ALIGN32 simd_buf[4] BM_ALIGN32ATTR;
1223
1224 const __m128i* block1_end =
1225 (const __m128i*)((bm::word_t*)(block1) + bm::set_block_size);
1226 const __m128i maskZ = _mm_setzero_si128();
1227 __m128i mA, mB;
1228 unsigned simd_lane = 0;
1229 do
1230 {
1231 mA = _mm_xor_si128(_mm_load_si128(block1), _mm_load_si128(block2));
1232 mB = _mm_xor_si128(_mm_load_si128(block1+1), _mm_load_si128(block2+1));
1233 __m128i mOR = _mm_or_si128(mA, mB);
1234 if (!_mm_test_all_zeros(mOR, mOR)) // test 2x128 lanes
1235 {
1236 if (!_mm_test_all_zeros(mA, mA))
1237 {
1238 unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mA, maskZ));
1239 mask = ~mask; // invert to find (w != 0)
1240 BM_ASSERT(mask);
1241 int bsf = BM_BSF32(mask); // find first !=0 (could use lzcnt())
1242 _mm_store_si128 ((__m128i*)simd_buf, mA);
1243 unsigned widx = bsf >> 2; // (bsf / 4);
1244 unsigned w = simd_buf[widx]; // _mm_extract_epi32 (mA, widx);
1245 bsf = BM_BSF32(w); // find first bit != 0
1246 *pos = (simd_lane * 128) + (widx * 32) + bsf;
1247 return true;
1248 }
1249 unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ));
1250 mask = ~mask; // invert to find (w != 0)
1251 BM_ASSERT(mask);
1252 int bsf = BM_BSF32(mask); // find first !=0 (could use lzcnt())
1253 _mm_store_si128 ((__m128i*)simd_buf, mB);
1254 unsigned widx = bsf >> 2; // (bsf / 4);
1255 unsigned w = simd_buf[widx]; // _mm_extract_epi32 (mB, widx);
1256 bsf = BM_BSF32(w); // find first bit != 0
1257 *pos = ((++simd_lane) * 128) + (widx * 32) + bsf;
1258 return true;
1259 }
1260
1261 simd_lane+=2;
1262 block1+=2; block2+=2;
1263
1264 } while (block1 < block1_end);
1265 return false;
1266}
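/*
 Usage sketch (blk_a and blk_b are assumed aligned blocks):

 unsigned pos;
 if (bm::sse42_bit_find_first_diff((const __m128i*)blk_a,
 (const __m128i*)blk_b, &pos))
 ; // blocks first differ at bit index 'pos' (0..65535)
*/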
1267
1268
1269/*!
1270 \brief Find first non-zero bit
1271 @ingroup SSE4
1272*/
1273inline
1274bool sse42_bit_find_first(const __m128i* BMRESTRICT block,
1275 unsigned off,
1276 unsigned* pos) BMNOEXCEPT
1277{
1278 unsigned BM_ALIGN32 simd_buf[4] BM_ALIGN32ATTR;
1279
1280 block = (const __m128i*)((const bm::word_t*)(block) + off);
1281 const __m128i* block_end =
1282 (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
1283 const __m128i maskZ = _mm_setzero_si128();
1284 __m128i mA, mB;
1285 unsigned simd_lane = 0;
1286 do
1287 {
1288 mA = _mm_load_si128(block); mB = _mm_load_si128(block+1);
1289 __m128i mOR = _mm_or_si128(mA, mB);
1290 if (!_mm_test_all_zeros(mOR, mOR)) // test 2x128 lanes
1291 {
1292 if (!_mm_test_all_zeros(mA, mA))
1293 {
1294 unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mA, maskZ));
1295 mask = ~mask; // invert to find (w != 0)
1296 BM_ASSERT(mask);
1297 int bsf = BM_BSF32(mask); // find first !=0 (could use lzcnt())
1298 _mm_store_si128 ((__m128i*)simd_buf, mA);
1299 unsigned widx = bsf >> 2; // (bsf / 4);
1300 unsigned w = simd_buf[widx];
1301 bsf = BM_BSF32(w); // find first bit != 0
1302 *pos = (off * 32) + (simd_lane * 128) + (widx * 32) + bsf;
1303 return true;
1304 }
1305 unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ));
1306 mask = ~mask; // invert to find (w != 0)
1307 BM_ASSERT(mask);
1308 int bsf = BM_BSF32(mask); // find first !=0 (could use lzcnt())
1309 _mm_store_si128 ((__m128i*)simd_buf, mB);
1310 unsigned widx = bsf >> 2; // (bsf / 4);
1311 unsigned w = simd_buf[widx];
1312 bsf = BM_BSF32(w); // find first bit != 0
1313 *pos = (off * 32) + ((++simd_lane) * 128) + (widx * 32) + bsf;
1314 return true;
1315 }
1316
1317 simd_lane+=2;
1318 block+=2;
1319
1320 } while (block < block_end);
1321 return false;
1322}
1323
1324
1325
1326
1327#ifdef __GNUG__
1328// necessary measure to silence a false GCC warning about negative pointer arithmetic
1329#pragma GCC diagnostic push
1330#pragma GCC diagnostic ignored "-Warray-bounds"
1331#endif
1332
1333/*!
1334 SSE4.2 check of one to two (variable length) 128-bit SSE lanes
1335 for GAP search results (8 elements)
1336 @ingroup SSE4
1337 \internal
1338*/
1339inline
1340unsigned sse4_gap_find(const bm::gap_word_t* BMRESTRICT pbuf,
1341 const bm::gap_word_t pos, const unsigned size) BMNOEXCEPT
1342{
1343 BM_ASSERT(size <= 16);
1344 BM_ASSERT(size >= 4);
1345 const unsigned unroll_factor = 8;
1346
1347 __m128i m1, mz, maskF, maskFL;
1348
1349 mz = _mm_setzero_si128();
1350 m1 = _mm_loadu_si128((__m128i*)(pbuf)); // load first 8 elements
1351
1352 maskF = _mm_cmpeq_epi64(mz, mz); // set all FF
1353 maskFL = _mm_slli_si128(maskF, 4 * 2); // byte shift to make [0000 FFFF]
1354 int shiftL= (64 - (unroll_factor - size) * 16);
1355 maskFL = _mm_slli_epi64(maskFL, shiftL); // additional bit shift to [0000 00FF]
1356
1357 m1 = _mm_andnot_si128(maskFL, m1); // m1 = (~mask) & m1
1358 m1 = _mm_or_si128(m1, maskFL);
1359
1360 __m128i mp = _mm_set1_epi16(pos); // broadcast pos into all elements of a SIMD vector
1361 __m128i mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // unsigned m1 >= mp
1362 __m128i c_mask = _mm_slli_epi16(mge_mask, 15); // clear not needed flag bits by shift
1363 int mi = _mm_movemask_epi8(c_mask); // collect flag bits
1364 if (unsigned bc = _mm_popcnt_u32(mi)) // gives us number of elements >= pos
1365 return unroll_factor - bc; // address of first one element (target)
1366 // inspect the next lane with a possible step back (to avoid over-reading the block boundary)
1367 // GCC gives a false warning for "- unroll_factor" here
1368 const bm::gap_word_t* BMRESTRICT pbuf2 = pbuf + size - unroll_factor;
1369 BM_ASSERT(pbuf2 > pbuf || size == 8); // assert in place to make sure GCC warning is indeed false
1370
1371 m1 = _mm_loadu_si128((__m128i*)(pbuf2)); // load next elements (with possible overlap)
1372 mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // m1 >= mp
1373 mi = _mm_movemask_epi8(_mm_slli_epi16(mge_mask, 15));
1374 unsigned bc = _mm_popcnt_u32(mi);
1375
1376 return size - bc;
1377}
1378
1379/**
1380 Hybrid binary search, starts as binary, then switches to linear scan
1381
1382 \param buf - GAP buffer pointer.
1383 \param pos - index of the element.
1384 \param is_set - output. GAP value (0 or 1).
1385 \return GAP index.
1386
1387 @ingroup SSE4
1388*/
1389inline
1390unsigned sse42_gap_bfind(const unsigned short* BMRESTRICT buf,
1391 unsigned pos, unsigned* BMRESTRICT is_set) BMNOEXCEPT
1392{
1393 unsigned start = 1;
1394// unsigned end = 1 + ((*buf) >> 3);
1395 unsigned end = ((*buf) >> 3);
1396 BM_ASSERT(buf[end] == 65535);
1397
1398// const unsigned arr_end = end+1;
1399 unsigned size = end - start;
1400 for (; size >= 64; size = end - start)
1401 {
1402 unsigned mid = (start + end) >> 1;
1403 if (buf[mid] < pos)
1404 start = mid+1;
1405 else
1406 end = mid;
1407 if (buf[mid = (start + end) >> 1] < pos)
1408 start = mid+1;
1409 else
1410 end = mid;
1411 if (buf[mid = (start + end) >> 1] < pos)
1412 start = mid+1;
1413 else
1414 end = mid;
1415 if (buf[mid = (start + end) >> 1] < pos)
1416 start = mid+1;
1417 else
1418 end = mid;
1419 } // for
1420 BM_ASSERT(buf[end] >= pos);
1421
1422 for (; size >= 16; size = end - start)
1423 {
1424 if (unsigned mid = (start + end) >> 1; buf[mid] < pos)
1425 start = mid + 1;
1426 else
1427 end = mid;
1428 if (unsigned mid = (start + end) >> 1; buf[mid] < pos)
1429 start = mid + 1;
1430 else
1431 end = mid;
1432 } // for
1433// size += (end != arr_end);
1434 ++size;
1435 if (size < 4) // for very short vector use conventional scan
1436 {
1437 const unsigned short* BMRESTRICT pbuf = buf + start;
1438 if (pbuf[0] >= pos) { }
1439 else if (pbuf[1] >= pos) { start++; }
1440 else
1441 {
1442 BM_ASSERT(pbuf[2] >= pos);
1443 start+=2;
1444 }
1445 }
1446 else
1447 {
1448 start += bm::sse4_gap_find(buf+start, (bm::gap_word_t)pos, size);
1449 }
1450 *is_set = ((*buf) & 1) ^ ((start-1) & 1);
1451 return start;
1452}
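/*
 Sketch on a hand-built GAP vector (the header keeps the start value in
 bit 0 and the index of the closing 65535 in bits 3+; level bits are left
 zero here). This one encodes 0s at [0..9], 1s at [10..20], 0s at
 [21..65535]:

 bm::gap_word_t g[] = { bm::gap_word_t(3 << 3), 9, 20, 65535 };
 unsigned is_set;
 unsigned idx = bm::sse42_gap_bfind(g, 15, &is_set); // idx == 2, is_set != 0
*/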
1453
1454
1455/**
1456 Hybrid binary search to test GAP value, starts as binary, then switches to scan
1457 @return test result
1458 @ingroup SSE4
1459*/
1460inline
1461unsigned sse42_gap_test(const unsigned short* BMRESTRICT buf, unsigned pos) BMNOEXCEPT
1462{
1463 unsigned start = 1;
1464// unsigned end = start + ((*buf) >> 3);
1465 unsigned end = ((*buf) >> 3);
1466 unsigned size = end - start;
1467// const unsigned arr_end = end;
1468 for (; size >= 64; size = end - start)
1469 {
1470 unsigned mid = (start + end) >> 1;
1471 if (buf[mid] < pos)
1472 start = mid+1;
1473 else
1474 end = mid;
1475 if (buf[mid = (start + end) >> 1] < pos)
1476 start = mid+1;
1477 else
1478 end = mid;
1479 if (buf[mid = (start + end) >> 1] < pos)
1480 start = mid+1;
1481 else
1482 end = mid;
1483 if (buf[mid = (start + end) >> 1] < pos)
1484 start = mid+1;
1485 else
1486 end = mid;
1487 } // for
1488 for (; size >= 16; size = end - start)
1489 {
1490 if (unsigned mid = (start + end) >> 1; buf[mid] < pos)
1491 start = mid+1;
1492 else
1493 end = mid;
1494 } // for
1495 //size += (end != arr_end);
1496 ++size;
1497 if (size < 4) // for very short vector use conventional scan
1498 {
1499 const unsigned short* BMRESTRICT pbuf = buf + start;
1500 if (pbuf[0] >= pos) { }
1501 else if (pbuf[1] >= pos) { start++; }
1502 else
1503 {
1504 BM_ASSERT(pbuf[2] >= pos);
1505 start+=2;
1506 }
1507 }
1508 else
1509 {
1510 start += bm::sse4_gap_find(buf+start, (bm::gap_word_t)pos, size);
1511 }
1512 BM_ASSERT(buf[start] >= pos);
1513 BM_ASSERT(buf[start - 1] < pos || (start == 1));
1514
1515 return ((*buf) & 1) ^ ((--start) & 1);
1516}
1517
1518
1519/**
1520 Experimental (test) function to do SIMD vector search (lower bound)
1521 in a sorted, growing array
1522 @ingroup SSE4
1523
1524 \internal
1525*/
1526inline
1527int sse42_cmpge_u32(__m128i vect4, unsigned value) BMNOEXCEPT
1528{
1529 // a > b (unsigned, 32-bit) is the same as (a - 0x80000000) > (b - 0x80000000) (signed, 32-bit)
1530 // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
1531 //
1532 __m128i mask0x8 = _mm_set1_epi32(0x80000000);
1533 __m128i mm_val = _mm_set1_epi32(value);
1534
1535 __m128i norm_vect4 = _mm_sub_epi32(vect4, mask0x8); // (signed) vect4 - 0x80000000
1536 __m128i norm_val = _mm_sub_epi32(mm_val, mask0x8); // (signed) mm_val - 0x80000000
1537
1538 __m128i cmp_mask_gt = _mm_cmpgt_epi32 (norm_vect4, norm_val);
1539 __m128i cmp_mask_eq = _mm_cmpeq_epi32 (mm_val, vect4);
1540
1541 __m128i cmp_mask_ge = _mm_or_si128 (cmp_mask_gt, cmp_mask_eq);
1542 int mask = _mm_movemask_epi8(cmp_mask_ge);
1543 if (mask)
1544 {
1545 int bsf = BM_BSF32(mask);//_bit_scan_forward(mask);
1546 return bsf / 4;
1547 }
1548 return -1;
1549}
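/*
 Sketch: lower-bound style probe of 4 sorted unsigned values
 (_mm_set_epi32 lists elements from high lane to low lane):

 __m128i v = _mm_set_epi32(90, 50, 40, 10); // {10, 40, 50, 90}
 int i0 = bm::sse42_cmpge_u32(v, 41); // 2: first element >= 41
 int i1 = bm::sse42_cmpge_u32(v, 100); // -1: none
*/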
1550
1551
1552
1553/*!
1554 SSE4.2 index lookup to check what belongs to the same block (8 elements)
1555 \internal
1556*/
1557inline
1558unsigned sse42_idx_arr_block_lookup(const unsigned* idx, unsigned size,
1559 unsigned nb, unsigned start) BMNOEXCEPT
1560{
1561 const unsigned unroll_factor = 8;
1562 const unsigned len = (size - start);
1563 const unsigned len_unr = len - (len % unroll_factor);
1564 unsigned k;
1565
1566 idx += start;
1567
1568 __m128i nbM = _mm_set1_epi32(nb);
1569
1570 for (k = 0; k < len_unr; k+=unroll_factor)
1571 {
1572 __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
1573 __m128i idxB = _mm_loadu_si128((__m128i*)(idx+k+4));
1574 __m128i nbA = _mm_srli_epi32(idxA, bm::set_block_shift); // idx[k] >> bm::set_block_shift
1575 __m128i nbB = _mm_srli_epi32(idxB, bm::set_block_shift);
1576
1577 if (!_mm_test_all_ones(_mm_cmpeq_epi32(nbM, nbA)) |
1578 !_mm_test_all_ones(_mm_cmpeq_epi32 (nbM, nbB)))
1579 break;
1580
1581 } // for k
1582 for (; k < len; ++k)
1583 {
1584 if (nb != unsigned(idx[k] >> bm::set_block_shift))
1585 break;
1586 }
1587 return start + k;
1588}
1589
1590/*!
1591 SSE4.2 bulk bit set
1592 \internal
1593*/
1594inline
1595void sse42_set_block_bits(bm::word_t* BMRESTRICT block,
1596 const unsigned* BMRESTRICT idx,
1597 unsigned start, unsigned stop ) BMNOEXCEPT
1598{
1599 const unsigned unroll_factor = 4;
1600 const unsigned len = (stop - start);
1601 const unsigned len_unr = len - (len % unroll_factor);
1602
1603 idx += start;
1604
1605 unsigned BM_ALIGN16 mshift_v[4] BM_ALIGN16ATTR;
1606 unsigned BM_ALIGN16 mword_v[4] BM_ALIGN16ATTR;
1607
1608 __m128i sb_mask = _mm_set1_epi32(bm::set_block_mask);
1609 __m128i sw_mask = _mm_set1_epi32(bm::set_word_mask);
1610
1611 unsigned k = 0;
1612 for (; k < len_unr; k+=unroll_factor)
1613 {
1614 __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
1615 __m128i nbitA = _mm_and_si128 (idxA, sb_mask); // nbit = idx[k] & bm::set_block_mask
1616 __m128i nwordA = _mm_srli_epi32 (nbitA, bm::set_word_shift); // nword = nbit >> bm::set_word_shift
1617
1618
1619 nbitA = _mm_and_si128 (nbitA, sw_mask);
1620 _mm_store_si128 ((__m128i*)mshift_v, nbitA);
1621
1622 // check-compare if all 4 bits are in the very same word
1623 //
1624 __m128i nwordA_0 = _mm_shuffle_epi32(nwordA, 0x0); // copy element 0
1625 __m128i cmpA = _mm_cmpeq_epi32(nwordA_0, nwordA); // compare EQ
1626 if (_mm_test_all_ones(cmpA)) // check if all are in one word
1627 {
1628 unsigned nword = _mm_extract_epi32(nwordA, 0);
1629 block[nword] |= (1u << mshift_v[0]) | (1u << mshift_v[1])
1630 |(1u << mshift_v[2]) | (1u << mshift_v[3]);
1631 }
1632 else // bits are in different words, use scalar scatter
1633 {
1634 _mm_store_si128 ((__m128i*)mword_v, nwordA);
1635
1636 block[mword_v[0]] |= (1u << mshift_v[0]);
1637 block[mword_v[1]] |= (1u << mshift_v[1]);
1638 block[mword_v[2]] |= (1u << mshift_v[2]);
1639 block[mword_v[3]] |= (1u << mshift_v[3]);
1640 }
1641
1642 } // for k
1643
1644 for (; k < len; ++k)
1645 {
1646 unsigned n = idx[k];
1647 unsigned nbit = unsigned(n & bm::set_block_mask);
1648 unsigned nword = nbit >> bm::set_word_shift;
1649 nbit &= bm::set_word_mask;
1650 block[nword] |= (1u << nbit);
1651 } // for k
1652}
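/*
 Usage sketch (blk is an assumed zero-initialized bit-block; all indices
 must land in this block, bit position = idx[k] & bm::set_block_mask):

 unsigned idx[] = { 1, 70, 71, 1024, 2000 };
 bm::sse42_set_block_bits(blk, idx, 0, 5); // sets the 5 listed bits
*/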
1653
1654
1655/*!
1656 SSE4.2 bit block gather-scatter
1657
1658 @param arr - destination array to set bits
1659 @param blk - source bit-block
1660 @param idx - gather index array
1661 @param size - gather array size
1662 @param start - gather start index
1663 @param bit_idx - bit to set in the target array
1664
1665 \internal
1666
1667 C algorithm:
1668
1669 for (unsigned k = start; k < size; ++k)
1670 {
1671 nbit = unsigned(idx[k] & bm::set_block_mask);
1672 nword = unsigned(nbit >> bm::set_word_shift);
1673 mask0 = 1u << (nbit & bm::set_word_mask);
1674 arr[k] |= TRGW(bool(blk[nword] & mask0) << bit_idx);
1675 }
1676
1677*/
1678inline
1679void sse4_bit_block_gather_scatter(unsigned* BMRESTRICT arr,
1680 const unsigned* BMRESTRICT blk,
1681 const unsigned* BMRESTRICT idx,
1682 unsigned size,
1683 unsigned start,
1684 unsigned bit_idx) BMNOEXCEPT
1685{
1686 const unsigned unroll_factor = 4;
1687 const unsigned len = (size - start);
1688 const unsigned len_unr = len - (len % unroll_factor);
1689
1690 __m128i sb_mask = _mm_set1_epi32(bm::set_block_mask);
1691 __m128i sw_mask = _mm_set1_epi32(bm::set_word_mask);
1692 __m128i maskFF = _mm_set1_epi32(~0u);
1693 __m128i maskZ = _mm_xor_si128(maskFF, maskFF);
1694
1695 __m128i mask_tmp, mask_0;
1696
1697 unsigned BM_ALIGN16 mshift_v[4] BM_ALIGN16ATTR;
1698 unsigned BM_ALIGN16 mword_v[4] BM_ALIGN16ATTR;
1699
1700 unsigned k = 0;
1701 unsigned base = start + k;
1702 __m128i* idx_ptr = (__m128i*)(idx + base); // idx[base]
1703 __m128i* target_ptr = (__m128i*)(arr + base); // arr[base]
1704 for (; k < len_unr; k+=unroll_factor)
1705 {
1706 __m128i nbitA = _mm_and_si128 (_mm_loadu_si128(idx_ptr), sb_mask); // nbit = idx[base] & bm::set_block_mask
1707 __m128i nwordA = _mm_srli_epi32 (nbitA, bm::set_word_shift); // nword = nbit >> bm::set_word_shift
1708 // (nbit & bm::set_word_mask)
1709 _mm_store_si128 ((__m128i*)mshift_v, _mm_and_si128 (nbitA, sw_mask));
1710 _mm_store_si128 ((__m128i*)mword_v, nwordA);
1711
1712 // mask0 = 1u << (nbit & bm::set_word_mask);
1713 //
1714#if 0
1715 // ifdefed an alternative SHIFT implementation using SSE and masks
1716 // (it is not faster than just doing scalar operations)
1717 {
1718 __m128i am_0 = _mm_set_epi32(0, 0, 0, ~0u);
1719 __m128i mask1 = _mm_srli_epi32 (maskFF, 31);
1720 mask_0 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[0]), am_0);
1721 mask_tmp = _mm_and_si128 (_mm_slli_epi32(mask1, mshift_v[1]), _mm_slli_si128 (am_0, 4));
1722 mask_0 = _mm_or_si128 (mask_0, mask_tmp);
1723
1724 __m128i mask_2 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[2]),
1725 _mm_slli_si128 (am_0, 8));
1726 mask_tmp = _mm_and_si128 (
1727 _mm_slli_epi32(mask1, mshift_v[3]),
1728 _mm_slli_si128 (am_0, 12)
1729 );
1730
1731 mask_0 = _mm_or_si128 (mask_0,
1732 _mm_or_si128 (mask_2, mask_tmp)); // assemble bit-test mask
1733 }
1734#endif
1735 mask_0 = _mm_set_epi32(1 << mshift_v[3], 1 << mshift_v[2], 1 << mshift_v[1], 1 << mshift_v[0]);
1736
1737
1738 // gather for: blk[nword] (.. & mask0 )
1739 //
1740 mask_tmp = _mm_and_si128(_mm_set_epi32(blk[mword_v[3]], blk[mword_v[2]],
1741 blk[mword_v[1]], blk[mword_v[0]]),
1742 mask_0);
1743
1744 // bool(blk[nword] ...)
1745 //maskFF = _mm_set1_epi32(~0u);
1746 mask_tmp = _mm_cmpeq_epi32 (mask_tmp, maskZ); // set 0xFF where == 0
1747 mask_tmp = _mm_xor_si128 (mask_tmp, maskFF); // invert
1748 mask_tmp = _mm_srli_epi32 (mask_tmp, 31); // (bool) 1 only to the 0 pos
1749
1750 mask_tmp = _mm_slli_epi32(mask_tmp, bit_idx); // << bit_idx
1751
1752 _mm_storeu_si128 (target_ptr, // arr[base] |= MASK_EXPR
1753 _mm_or_si128 (mask_tmp, _mm_loadu_si128(target_ptr)));
1754
1755 ++idx_ptr; ++target_ptr;
1756 _mm_prefetch((const char*)target_ptr, _MM_HINT_T0);
1757 }
1758
1759 for (; k < len; ++k)
1760 {
1761 base = start + k;
1762 unsigned nbit = unsigned(idx[base] & bm::set_block_mask);
1763 arr[base] |= unsigned(bool(blk[nbit >> bm::set_word_shift] & (1u << (nbit & bm::set_word_mask))) << bit_idx);
1764 }
1765
1766}
1767
1768/*!
1769 @brief block shift left by 1
1770 @ingroup SSE4
1771*/
1772inline
1773bool sse42_shift_l1(__m128i* block, unsigned* empty_acc, unsigned co1) BMNOEXCEPT
1774{
1775 __m128i* block_end =
1776 ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
1777 __m128i mAcc = _mm_set1_epi32(0);
1778 __m128i mMask1 = _mm_set1_epi32(1);
1779
1780 unsigned co2;
1781 for (--block_end; block_end >= block; block_end -= 2)
1782 {
1783 __m128i m1A = _mm_load_si128(block_end);
1784 __m128i m2A = _mm_load_si128(block_end-1);
1785
1786 __m128i m1CO = _mm_and_si128(m1A, mMask1);
1787 __m128i m2CO = _mm_and_si128(m2A, mMask1);
1788
1789 co2 = _mm_extract_epi32(m1CO, 0);
1790
1791 m1A = _mm_srli_epi32(m1A, 1); // (block[i] >> 1u)
1792 m2A = _mm_srli_epi32(m2A, 1);
1793
1794 __m128i m1COshft = _mm_srli_si128 (m1CO, 4); // byte shift-r by 1 int32
1795 __m128i m2COshft = _mm_srli_si128 (m2CO, 4);
1796 m1COshft = _mm_insert_epi32 (m1COshft, co1, 3);
1797 m2COshft = _mm_insert_epi32 (m2COshft, co2, 3);
1798 m1COshft = _mm_slli_epi32(m1COshft, 31);
1799 m2COshft = _mm_slli_epi32(m2COshft, 31);
1800
1801 m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
1802 m2A = _mm_or_si128(m2A, m2COshft);
1803
1804 co1 = _mm_extract_epi32(m2CO, 0);
1805
1806 _mm_store_si128(block_end, m1A);
1807 _mm_store_si128(block_end-1, m2A);
1808
1809 mAcc = _mm_or_si128(mAcc, m1A);
1810 mAcc = _mm_or_si128(mAcc, m2A);
1811 } // for
1812
1813 *empty_acc = !_mm_testz_si128(mAcc, mAcc);
1814 return co1;
1815}
1816
1817
1818/*!
1819 @brief block shift right by 1
1820 @ingroup SSE4
1821*/
1822inline
1823bool sse42_shift_r1(__m128i* block, unsigned* empty_acc, unsigned co1) BMNOEXCEPT
1824{
1825 __m128i* block_end =
1826 ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
1827 __m128i m1COshft, m2COshft;
1828 __m128i mAcc = _mm_set1_epi32(0);
1829
1830 unsigned co2;
1831 for (;block < block_end; block += 2)
1832 {
1833 __m128i m1A = _mm_load_si128(block);
1834 __m128i m2A = _mm_load_si128(block+1);
1835
1836 __m128i m1CO = _mm_srli_epi32(m1A, 31);
1837 __m128i m2CO = _mm_srli_epi32(m2A, 31);
1838
1839 co2 = _mm_extract_epi32(m1CO, 3);
1840
1841 m1A = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
1842 m2A = _mm_slli_epi32(m2A, 1);
1843
1844 m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift-l by 1 int32
1845 m2COshft = _mm_slli_si128 (m2CO, 4);
1846 m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
1847 m2COshft = _mm_insert_epi32 (m2COshft, co2, 0);
1848
1849 m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
1850 m2A = _mm_or_si128(m2A, m2COshft);
1851
1852 co1 = _mm_extract_epi32(m2CO, 3);
1853
1854 _mm_store_si128(block, m1A);
1855 _mm_store_si128(block+1, m2A);
1856
1857 mAcc = _mm_or_si128(mAcc, m1A);
1858 mAcc = _mm_or_si128(mAcc, m2A);
1859 }
1860 *empty_acc = !_mm_testz_si128(mAcc, mAcc);
1861 return co1;
1862}
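/*
 Usage sketch (editorial; blk0, blk1 and the accumulators are hypothetical
 16-byte aligned bit-blocks): shifting a bit-string that spans two
 consecutive 64Kbit blocks right by 1. The carry returned for the lower
 block becomes bit 0 of the next one.

unsigned acc0, acc1;
unsigned co = 0; // no carry into the first block
co = bm::sse42_shift_r1((__m128i*)blk0, &acc0, co);
co = bm::sse42_shift_r1((__m128i*)blk1, &acc1, co);
// acc0/acc1 are non-zero if the corresponding block is not all-zero
*/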
1863
1864
1865
1866/*!
1867 @brief block shift right by 1 plus AND
1868
1869 @return carry over flag
1870 @ingroup SSE4
1871*/
1872inline
1873bool sse42_shift_r1_and(__m128i* block,
1874 bm::word_t co1,
1875 const __m128i* BMRESTRICT mask_block,
1876 bm::id64_t* digest) BMNOEXCEPT
1877{
1878 bm::word_t* wblock = (bm::word_t*) block;
1879 const bm::word_t* mblock = (const bm::word_t*) mask_block;
1880
1881 __m128i m1COshft, m2COshft;
1882 __m128i mAcc = _mm_set1_epi32(0);
1883 unsigned co2;
1884
1885 bm::id64_t d, wd;
1886 wd = d = *digest;
1887
1888 unsigned di = 0;
1889 if (!co1)
1890 {
1891 bm::id64_t t = d & -d;
1892#ifdef BM64_SSE4
1893 di = unsigned(_mm_popcnt_u64(t - 1)); // find start bit-index
1894#else
1895 bm::id_t t32 = t & bm::id_max;
1896 if (t32 == 0) {
1897 di = 32;
1898 t32 = t >> 32;
1899 }
1900 di += unsigned(_mm_popcnt_u32(t32 - 1));
1901#endif
1902 }
1903
1904 for (; di < 64 ; ++di)
1905 {
1906 const unsigned d_base = di * bm::set_block_digest_wave_size;
1907 bm::id64_t dmask = (1ull << di);
1908 if (d & dmask) // digest stride NOT empty
1909 {
1910 block = (__m128i*) &wblock[d_base];
1911 mask_block = (__m128i*) &mblock[d_base];
1912 mAcc = _mm_xor_si128(mAcc, mAcc); // mAcc = 0
1913 for (unsigned i = 0; i < 4; ++i, block += 2, mask_block += 2)
1914 {
1915 __m128i m1A = _mm_load_si128(block);
1916 __m128i m2A = _mm_load_si128(block+1);
1917
1918 __m128i m1CO = _mm_srli_epi32(m1A, 31);
1919 __m128i m2CO = _mm_srli_epi32(m2A, 31);
1920
1921 co2 = _mm_extract_epi32(m1CO, 3);
1922
1923 m1A = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
1924 m2A = _mm_slli_epi32(m2A, 1);
1925
1926 m1COshft = _mm_slli_si128 (m1CO, 4); // byte-shift left by 4 = one int32 lane
1927 m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
1928
1929 co1 = co2;
1930
1931 co2 = _mm_extract_epi32(m2CO, 3);
1932
1933 m2COshft = _mm_slli_si128 (m2CO, 4);
1934 m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
1935
1936 m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
1937 m2A = _mm_or_si128(m2A, m2COshft);
1938
1939 m1A = _mm_and_si128(m1A, _mm_load_si128(mask_block)); // block[i] &= mask_block[i]
1940 m2A = _mm_and_si128(m2A, _mm_load_si128(mask_block+1)); // block[i] &= mask_block[i]
1941
1942 mAcc = _mm_or_si128(mAcc, m1A);
1943 mAcc = _mm_or_si128(mAcc, m2A);
1944
1945 _mm_store_si128(block, m1A);
1946 _mm_store_si128(block+1, m2A);
1947
1948 co1 = co2;
1949
1950 } // for i
1951
1952 if (_mm_testz_si128(mAcc, mAcc))
1953 d &= ~dmask; // clear digest bit
1954 wd &= wd - 1;
1955 }
1956 else
1957 {
1958 if (co1)
1959 {
1960 BM_ASSERT(co1 == 1);
1961 BM_ASSERT(wblock[d_base] == 0);
1962
1963 bm::id64_t w0 = wblock[d_base] = co1 & mblock[d_base];
1964 d |= (dmask & (w0 << di)); // branchless: if (w0) d |= dmask
1965 co1 = 0;
1966 }
1967 if (!wd) // digest is empty, no CO -> exit
1968 break;
1969 }
1970 } // for di
1971
1972 *digest = d;
1973 return co1;
1974}
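/*
 Worked example (editorial) of the digest positioning idiom used above:
 t = d & -d isolates the lowest set bit of the digest, and popcount(t - 1)
 converts it to a wave index. E.g. for d = 0x28 (waves 3 and 5 non-empty):

bm::id64_t d  = 0x28;
bm::id64_t t  = d & (0 - d);                      // t == 0x08
unsigned wave = unsigned(_mm_popcnt_u64(t - 1));  // popcount(0x07) == 3
unsigned off  = wave * bm::set_block_digest_wave_size; // word offset of wave 3
*/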
1975
1976/**
1977 Build partial XOR product of 2 bit-blocks using digest mask
1978
1979 @param target_block - target := block ^ xor_block
1980 @param block - first source operand
1981 @param xor_block - second source operand (the XOR filter block)
1982 @param digest - mask for each block wave to XOR (1) or just copy (0)
1983
1984 @ingroup SSE4
1985 @internal
1986*/
1987inline
1988void sse42_bit_block_xor(bm::word_t* target_block,
1989 const bm::word_t* block, const bm::word_t* xor_block,
1990 bm::id64_t digest) BMNOEXCEPT
1991{
1992 for (unsigned i = 0; i < bm::block_waves; ++i)
1993 {
1994 const bm::id64_t mask = (1ull << i);
1995 unsigned off = (i * bm::set_block_digest_wave_size);
1996 const __m128i* sub_block = (__m128i*) (block + off);
1997 __m128i* t_sub_block = (__m128i*)(target_block + off);
1998
1999 if (digest & mask) // XOR filtered sub-block
2000 {
2001 const __m128i* xor_sub_block = (__m128i*) (xor_block + off);
2002 __m128i mA, mB, mC, mD;
2003 mA = _mm_xor_si128(_mm_load_si128(sub_block),
2004 _mm_load_si128(xor_sub_block));
2005 mB = _mm_xor_si128(_mm_load_si128(sub_block+1),
2006 _mm_load_si128(xor_sub_block+1));
2007 mC = _mm_xor_si128(_mm_load_si128(sub_block+2),
2008 _mm_load_si128(xor_sub_block+2));
2009 mD = _mm_xor_si128(_mm_load_si128(sub_block+3),
2010 _mm_load_si128(xor_sub_block+3));
2011
2012 _mm_store_si128(t_sub_block, mA);
2013 _mm_store_si128(t_sub_block+1, mB);
2014 _mm_store_si128(t_sub_block+2, mC);
2015 _mm_store_si128(t_sub_block+3, mD);
2016
2017 mA = _mm_xor_si128(_mm_load_si128(sub_block+4),
2018 _mm_load_si128(xor_sub_block+4));
2019 mB = _mm_xor_si128(_mm_load_si128(sub_block+5),
2020 _mm_load_si128(xor_sub_block+5));
2021 mC = _mm_xor_si128(_mm_load_si128(sub_block+6),
2022 _mm_load_si128(xor_sub_block+6));
2023 mD = _mm_xor_si128(_mm_load_si128(sub_block+7),
2024 _mm_load_si128(xor_sub_block+7));
2025
2026 _mm_store_si128(t_sub_block+4, mA);
2027 _mm_store_si128(t_sub_block+5, mB);
2028 _mm_store_si128(t_sub_block+6, mC);
2029 _mm_store_si128(t_sub_block+7, mD);
2030
2031 }
2032 else // just copy source
2033 {
2034 _mm_store_si128(t_sub_block , _mm_load_si128(sub_block));
2035 _mm_store_si128(t_sub_block+1, _mm_load_si128(sub_block+1));
2036 _mm_store_si128(t_sub_block+2, _mm_load_si128(sub_block+2));
2037 _mm_store_si128(t_sub_block+3, _mm_load_si128(sub_block+3));
2038
2039 _mm_store_si128(t_sub_block+4, _mm_load_si128(sub_block+4));
2040 _mm_store_si128(t_sub_block+5, _mm_load_si128(sub_block+5));
2041 _mm_store_si128(t_sub_block+6, _mm_load_si128(sub_block+6));
2042 _mm_store_si128(t_sub_block+7, _mm_load_si128(sub_block+7));
2043 }
2044 } // for i
2045}
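/*
 Scalar sketch (editorial) of one loop iteration above: depending on
 digest bit i, wave i of the target is either the XOR of the two source
 waves or a plain copy of the first operand.

unsigned off = i * bm::set_block_digest_wave_size;
if (digest & (1ull << i))
    for (unsigned j = 0; j < bm::set_block_digest_wave_size; ++j)
        target_block[off + j] = block[off + j] ^ xor_block[off + j];
else
    for (unsigned j = 0; j < bm::set_block_digest_wave_size; ++j)
        target_block[off + j] = block[off + j];
*/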
2046
2047/**
2048 Build partial XOR product of 2 bit-blocks using digest mask
2049
2050 @param target_block - target ^= xor_block
2051 @param xor_block - source operand (the XOR filter block)
2052 @param digest - mask for each block wave to XOR (if 1)
2053
2054 @ingroup SSE4
2055 @internal
2056*/
2057inline
2058void sse42_bit_block_xor_2way(bm::word_t* target_block,
2059 const bm::word_t* xor_block,
2060 bm::id64_t digest) BMNOEXCEPT
2061{
2062 while (digest)
2063 {
2064 bm::id64_t t = bm::bmi_blsi_u64(digest); // d & -d;
2065 unsigned wave = unsigned(_mm_popcnt_u64(t - 1));
2066 unsigned off = wave * bm::set_block_digest_wave_size;
2067
2068 const __m128i* sub_block = (const __m128i*) (xor_block + off);
2069 __m128i* t_sub_block = (__m128i*)(target_block + off);
2070
2071 __m128i mA, mB, mC, mD;
2072 mA = _mm_xor_si128(_mm_load_si128(sub_block),
2073 _mm_load_si128(t_sub_block));
2074 mB = _mm_xor_si128(_mm_load_si128(sub_block+1),
2075 _mm_load_si128(t_sub_block+1));
2076 mC = _mm_xor_si128(_mm_load_si128(sub_block+2),
2077 _mm_load_si128(t_sub_block+2));
2078 mD = _mm_xor_si128(_mm_load_si128(sub_block+3),
2079 _mm_load_si128(t_sub_block+3));
2080
2081 _mm_store_si128(t_sub_block, mA);
2082 _mm_store_si128(t_sub_block+1, mB);
2083 _mm_store_si128(t_sub_block+2, mC);
2084 _mm_store_si128(t_sub_block+3, mD);
2085
2086 mA = _mm_xor_si128(_mm_load_si128(sub_block+4),
2087 _mm_load_si128(t_sub_block+4));
2088 mB = _mm_xor_si128(_mm_load_si128(sub_block+5),
2089 _mm_load_si128(t_sub_block+5));
2090 mC = _mm_xor_si128(_mm_load_si128(sub_block+6),
2091 _mm_load_si128(t_sub_block+6));
2092 mD = _mm_xor_si128(_mm_load_si128(sub_block+7),
2093 _mm_load_si128(t_sub_block+7));
2094
2095 _mm_store_si128(t_sub_block+4, mA);
2096 _mm_store_si128(t_sub_block+5, mB);
2097 _mm_store_si128(t_sub_block+6, mC);
2098 _mm_store_si128(t_sub_block+7, mD);
2099
2100 digest = bm::bmi_bslr_u64(digest); // d &= d - 1;
2101 } // while
2102}
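/*
 Design note (editorial): unlike sse42_bit_block_xor above, which walks
 all 64 waves and copies the unfiltered ones, the 2-way variant updates
 the target in place and visits only waves whose digest bit is set, using
 the same blsi/popcount positioning. Hypothetical call, XOR-ing waves
 0 and 8 of xor_blk into target_blk:

bm::id64_t d = (1ull << 0) | (1ull << 8);
bm::sse42_bit_block_xor_2way(target_blk, xor_blk, d); // blocks assumed aligned
*/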
2103
2104
2105
2106#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
2107 sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
2108
2109#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
2110 sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
2111
2112#define VECT_BITCOUNT(first, last) \
2113 sse4_bit_count((__m128i*) (first), (__m128i*) (last))
2114/*
2115#ifdef BM64_SSE4
2116#define VECT_BIT_COUNT_DIGEST(src, digest) \
2117 sse42_bit_count_digest(src, digest)
2118#endif
2119*/
2120#define VECT_BITCOUNT_AND(first, last, mask) \
2121 sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)
2122
2123#define VECT_BITCOUNT_OR(first, last, mask) \
2124 sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)
2125
2126#define VECT_BITCOUNT_XOR(first, last, mask) \
2127 sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)
2128
2129#define VECT_BITCOUNT_SUB(first, last, mask) \
2130 sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)
2131
2132#define VECT_INVERT_BLOCK(first) \
2133 sse2_invert_block((__m128i*)first);
2134
2135#define VECT_AND_BLOCK(dst, src) \
2136 sse4_and_block((__m128i*) dst, (__m128i*) (src))
2137
2138#define VECT_AND_DIGEST(dst, src) \
2139 sse4_and_digest((__m128i*) dst, (const __m128i*) (src))
2140
2141#define VECT_AND_OR_DIGEST_2WAY(dst, src1, src2) \
2142 sse4_and_or_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
2143
2144#define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
2145 sse4_and_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))
2146
2147#define VECT_AND_DIGEST_3WAY(dst, src1, src2) \
2148 sse4_and_digest_3way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
2149
2150#define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
2151 sse4_and_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
2152
2153#define VECT_OR_BLOCK(dst, src) \
2154 sse2_or_block((__m128i*) dst, (__m128i*) (src))
2155
2156#define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
2157 sse2_or_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
2158
2159#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
2160 sse2_or_block_3way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
2161
2162#define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
2163 sse2_or_block_5way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2), (__m128i*) (src3), (__m128i*) (src4))
2164
2165#define VECT_SUB_BLOCK(dst, src) \
2166 sse2_sub_block((__m128i*) dst, (const __m128i*) (src))
2167
2168#define VECT_SUB_DIGEST(dst, src) \
2169 sse4_sub_digest((__m128i*) dst, (const __m128i*) (src))
2170
2171#define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
2172 sse4_sub_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
2173
2174#define VECT_SUB_DIGEST_5WAY(dst, src1, src2, src3, src4) \
2175 sse4_sub_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))
2176
2177#define VECT_SUB_DIGEST_3WAY(dst, src1, src2) \
2178 sse4_sub_digest_3way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
2179
2180#define VECT_XOR_BLOCK(dst, src) \
2181 sse2_xor_block((__m128i*) dst, (__m128i*) (src))
2182
2183#define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
2184 sse2_xor_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
2185
2186#define VECT_COPY_BLOCK(dst, src) \
2187 sse2_copy_block((__m128i*) dst, (__m128i*) (src))
2188
2189#define VECT_COPY_BLOCK_UNALIGN(dst, src) \
2190 sse2_copy_block_unalign((__m128i*) dst, (__m128i*) (src))
2191
2192#define VECT_STREAM_BLOCK(dst, src) \
2193 sse2_stream_block((__m128i*) dst, (__m128i*) (src))
2194
2195#define VECT_STREAM_BLOCK_UNALIGN(dst, src) \
2196 sse2_stream_block_unalign((__m128i*) dst, (__m128i*) (src))
2197
2198#define VECT_SET_BLOCK(dst, value) \
2199 sse2_set_block((__m128i*) dst, value)
2200
2201#define VECT_IS_ZERO_BLOCK(dst) \
2202 sse4_is_all_zero((__m128i*) dst)
2203
2204#define VECT_IS_ONE_BLOCK(dst) \
2205 sse4_is_all_one((__m128i*) dst)
2206
2207#define VECT_IS_DIGEST_ZERO(start) \
2208 sse4_is_digest_zero((__m128i*)start)
2209
2210#define VECT_BLOCK_SET_DIGEST(dst, val) \
2211 sse4_block_set_digest((__m128i*)dst, val)
2212
2213#define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
2214 sse2_lower_bound_scan_u32(arr, target, from, to)
2215
2216#define VECT_SHIFT_L1(b, acc, co) \
2217 sse42_shift_l1((__m128i*)b, acc, co)
2218
2219#define VECT_SHIFT_R1(b, acc, co) \
2220 sse42_shift_r1((__m128i*)b, acc, co)
2221
2222#define VECT_SHIFT_R1_AND(b, co, m, digest) \
2223 sse42_shift_r1_and((__m128i*)b, co, (__m128i*)m, digest)
2224
2225#define VECT_ARR_BLOCK_LOOKUP(idx, size, nb, start) \
2226 sse42_idx_arr_block_lookup(idx, size, nb, start)
2227
2228#define VECT_SET_BLOCK_BITS(block, idx, start, stop) \
2229 sse42_set_block_bits(block, idx, start, stop)
2230
2231#define VECT_BLOCK_CHANGE(block, size) \
2232 sse42_bit_block_calc_change((__m128i*)block, size)
2233
2234#define VECT_BLOCK_XOR_CHANGE(block, xor_block, size, gc, bc) \
2235 sse42_bit_block_calc_xor_change((__m128i*)block, (__m128i*)xor_block, size, gc, bc)
2236
2237#ifdef BM64_SSE4
2238#define VECT_BLOCK_CHANGE_BC(block, gc, bc) \
2239 sse42_bit_block_calc_change_bc((__m128i*)block, gc, bc)
2240#endif
2241
2242#define VECT_BIT_FIND_FIRST(src, off, pos) \
2243 sse42_bit_find_first((__m128i*) src, off, pos)
2244
2245#define VECT_BIT_FIND_DIFF(src1, src2, pos) \
2246 sse42_bit_find_first_diff((__m128i*) src1, (__m128i*) (src2), pos)
2247
2248#define VECT_BIT_BLOCK_XOR(t, src, src_xor, d) \
2249 sse42_bit_block_xor(t, src, src_xor, d)
2250
2251#define VECT_BIT_BLOCK_XOR_2WAY(t, src_xor, d) \
2252 sse42_bit_block_xor_2way(t, src_xor, d)
2253
2254
2255#define VECT_GAP_BFIND(buf, pos, is_set) \
2256 sse42_gap_bfind(buf, pos, is_set)
2257
2258#define VECT_GAP_TEST(buf, pos) \
2259 sse42_gap_test(buf, pos)
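/*
 Editorial note: the VECT_* macros above are the dispatch points the
 portable algorithms compile against. For example, a call site written as
 VECT_SHIFT_R1(blk, &acc, co) expands in this translation unit to
 sse42_shift_r1((__m128i*)blk, &acc, co); other SIMD back-end headers
 (e.g. the AVX2 build) map the same macro names to their own kernels.
*/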
2260
2261#ifdef __GNUG__
2262#pragma GCC diagnostic pop
2263#endif
2264
2265
2266// undefine local defines to avoid pre-proc space pollution
2267//
2268#undef BM_BSF32
2269
2270#ifdef _MSC_VER
2271#pragma warning( pop )
2272#endif
2273
2274} // namespace
2275
2276
2277
2278
2279#endif