BitMagic-C++
bmsparsevec_algo.h
Go to the documentation of this file.
1#ifndef BMSPARSEVEC_ALGO__H__INCLUDED__
2#define BMSPARSEVEC_ALGO__H__INCLUDED__
3/*
4Copyright(c) 2002-2022 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5
6Licensed under the Apache License, Version 2.0 (the "License");
7you may not use this file except in compliance with the License.
8You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12Unless required by applicable law or agreed to in writing, software
13distributed under the License is distributed on an "AS IS" BASIS,
14WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15See the License for the specific language governing permissions and
16limitations under the License.
17
18For more information please visit: http://bitmagic.io
19*/
20/*! \file bmsparsevec_algo.h
21 \brief Algorithms for bm::sparse_vector
22*/
23
24#ifndef BM__H__INCLUDED__
25// BitMagic utility headers do not include main "bm.h" declaration
26// #include "bm.h" or "bm64.h" explicitly
27# error missing include (bm.h or bm64.h)
28#endif
29
30#include <limits>
31#include <string.h>
32
33#include "bmdef.h"
34#include "bmsparsevec.h"
35#include "bmaggregator.h"
36#include "bmbuffer.h"
37#include "bmalgo.h"
38#include "bmdef.h"
39
40#ifdef _MSC_VER
41# pragma warning( disable: 4146 )
42#endif
43
44
45
46/** \defgroup svalgo Sparse vector algorithms
47 Sparse vector algorithms
48 \ingroup svector
49 */
50
51
52namespace bm
53{
54
55
56/*!
57 \brief Clip dynamic range for signal higher than specified
58
59 \param svect - sparse vector to do clipping
60 \param high_bit - max bit (inclusive) allowed for this signal vector
61
62
63 \ingroup svalgo
64 \sa dynamic_range_clip_low
65*/
66template<typename SV>
67void dynamic_range_clip_high(SV& svect, unsigned high_bit)
68{
69 unsigned sv_planes = svect.slices();
70
71 BM_ASSERT(sv_planes > high_bit && high_bit > 0);
72
73 typename SV::bvector_type bv_acc;
74 unsigned i;
75
76 // combine all the high bits into accumulator vector
77 for (i = high_bit+1; i < sv_planes; ++i)
78 {
79 if (typename SV::bvector_type* bv = svect.slice(i))
80 {
81 bv_acc.bit_or(*bv);
82 svect.free_slice(i);
83 }
84 } // for i
85
86 // set all bits ON for all low vectors, which happen to be clipped
87 for (i = high_bit; true; --i)
88 {
89 typename SV::bvector_type* bv = svect.get_create_slice(i);
90 bv->bit_or(bv_acc);
91 if (!i)
92 break;
93 } // for i
94}
95
96
97/*!
98 \brief Clip dynamic range for signal lower than specified (boost)
99
100 \param svect - sparse vector to do clipping
101 \param low_bit - low bit (inclusive) allowed for this signal vector
102
103 \ingroup svalgo
104 \sa dynamic_range_clip_high
105*/
106template<typename SV>
107void dynamic_range_clip_low(SV& svect, unsigned low_bit)
108{
109 if (low_bit == 0) return; // nothing to do
110 BM_ASSERT(svect.slices() > low_bit);
111
112 unsigned sv_planes = svect.slices();
113 typename SV::bvector_type bv_acc1;
114 unsigned i;
115
116 // combine all the high bits into accumulator vector
117 for (i = low_bit+1; i < sv_planes; ++i)
118 {
119 if (typename SV::bvector_type* bv = svect.slice(i))
120 bv_acc1.bit_or(*bv);
121 } // for i
122
123 // accumulate all vectors below the clipping point
124 typename SV::bvector_type bv_acc2;
125 typename SV::bvector_type* bv_low_plane = svect.get_create_slice(low_bit);
126
127 for (i = low_bit-1; true; --i)
128 {
129 if (typename SV::bvector_type* bv = svect.slice(i))
130 {
131 bv_acc2.bit_or(*bv);
132 svect.free_slice(i);
133 if (i == 0)
134 break;
135 }
136 } // for i
137
138 // now we want to set 1 in the clipping low plane based on
139 // exclusive or (XOR) between upper and lower parts)
140 // as a result high signal (any bits in the upper planes) gets
141 // slightly lower value (due to clipping) low signal gets amplified
142 // (lower contrast algorithm)
143
144 bv_acc1.bit_xor(bv_acc2);
145 bv_low_plane->bit_or(bv_acc1);
146}
147
148/**
149 Find first mismatch (element which is different) between two sparse vectors
150 (uses linear scan in bit-vector planes)
151
152 Function works with both NULL and NOT NULL vectors
153 NULL means unassigned (uncertainty), so first mismatch NULL is a mismatch
154 even if values in vectors can be formally the same (i.e. 0)
155
156 @param sv1 - vector 1
157 @param sv2 - vector 2
158 @param midx - mismatch index
159 @param null_proc - defines if we want to include (not) NULL
160 vector into comparison (bm::use_null) or not.
161 By default search takes NULL vector into account
162
163 @return true if mismatch found
164
165 @sa sparse_vector_find_mismatch
166
167 \ingroup svalgo
168*/
169template<typename SV>
171 const SV& sv2,
172 typename SV::size_type& midx,
173 bm::null_support null_proc = bm::use_null)
174{
175 typename SV::size_type mismatch = bm::id_max;
176 bool found = false;
177
178 unsigned sv_idx = 0;
179
180 unsigned planes1 = (unsigned)sv1.get_bmatrix().rows();
181 BM_ASSERT(planes1);
182
183 typename SV::bvector_type_const_ptr bv_null1 = sv1.get_null_bvector();
184 typename SV::bvector_type_const_ptr bv_null2 = sv2.get_null_bvector();
185
186 // for RSC vector do NOT compare NULL planes
187
189 {
190 //--planes1;
191 }
192 else // regular sparse vector - may have NULL planes
193 {
194 if (null_proc == bm::use_null)
195 {
196 if (bv_null1 && bv_null2) // both (not) NULL vectors present
197 {
198 bool f = bv_null1->find_first_mismatch(*bv_null2, midx, mismatch);
199 if (f && (midx < mismatch)) // better mismatch found
200 {
201 found = f; mismatch = midx;
202 }
203 }
204 else // one or both NULL vectors are not present
205 {
206 if (bv_null1)
207 {
208 typename SV::bvector_type bv_tmp; // TODO: get rid of temp bv
209 bv_tmp.resize(sv2.size());
210 bv_tmp.invert(); // turn into true NULL vector
211
212 // find first NULL value (mismatch)
213 bool f = bv_null1->find_first_mismatch(bv_tmp, midx, mismatch);
214 if (f && (midx < mismatch)) // better mismatch found
215 {
216 found = f; mismatch = midx;
217 }
218 }
219 if (bv_null2)
220 {
221 typename SV::bvector_type bv_tmp; // TODO: get rid of temp bv
222 bv_tmp.resize(sv1.size());
223 bv_tmp.invert();
224
225 bool f = bv_null2->find_first_mismatch(bv_tmp, midx, mismatch);
226 if (f && (midx < mismatch)) // better mismatch found
227 {
228 found = f; mismatch = midx;
229 }
230 }
231 }
232 } // null_proc
233 }
234
235 for (unsigned i = 0; mismatch && (i < planes1); ++i)
236 {
237 typename SV::bvector_type_const_ptr bv1 = sv1.get_slice(i);
238 if (bv1 == bv_null1)
239 bv1 = 0;
240 typename SV::bvector_type_const_ptr bv2 = sv2.get_slice(i);
241 if (bv2 == bv_null2)
242 bv2 = 0;
243
244 if (!bv1)
245 {
246 if (!bv2)
247 continue;
248 bool f = bv2->find(midx);
249 if (f && (midx < mismatch))
250 {
251 found = f; sv_idx = 2; mismatch = midx;
252 }
253 continue;
254 }
255 if (!bv2)
256 {
257 BM_ASSERT(bv1);
258 bool f = bv1->find(midx);
259 if (f && (midx < mismatch))
260 {
261 found = f; sv_idx = 1; mismatch = midx;
262 }
263 continue;
264 }
265 // both planes are not NULL
266 //
267 bool f = bv1->find_first_mismatch(*bv2, midx, mismatch);
268 if (f && (midx < mismatch)) // better mismatch found
269 {
270 found = f; mismatch = midx;
271 // which vector has '1' at mismatch position?
273 sv_idx = (bv1->test(mismatch)) ? 1 : 2;
274 }
275
276 } // for i
277
278 // RSC address translation here
279 //
281 {
282 if (found) // RSC address translation
283 {
284 BM_ASSERT(sv1.is_compressed());
285 BM_ASSERT(sv2.is_compressed());
286
287 switch (sv_idx)
288 {
289 case 1:
290 found = sv1.find_rank(midx + 1, mismatch);
291 break;
292 case 2:
293 found = sv2.find_rank(midx + 1, mismatch);
294 break;
295 default: // unknown, try both
296 BM_ASSERT(0);
297 }
298 BM_ASSERT(found);
299 midx = mismatch;
300 }
301 if (null_proc == bm::use_null)
302 {
303 // no need for address translation in this case
304 bool f = bv_null1->find_first_mismatch(*bv_null2, midx, mismatch);
305 if (f && (midx < mismatch)) // better mismatch found
306 {
307 found = f; mismatch = midx;
308 }
309 }
310
311/*
312 else // search for mismatch in the NOT NULL vectors
313 {
314 if (null_proc == bm::use_null)
315 {
316 // no need for address translation in this case
317 typename SV::bvector_type_const_ptr bv_null1 = sv1.get_null_bvector();
318 typename SV::bvector_type_const_ptr bv_null2 = sv2.get_null_bvector();
319 found = bv_null1->find_first_mismatch(*bv_null2, mismatch);
320 }
321 }
322*/
323 }
324
325 midx = mismatch; // minimal mismatch
326 return found;
327}
328
329/**
330 Find mismatch vector, indicating positions of mismatch between two sparse vectors
331 (uses linear scan in bit-vector planes)
332
333 Function works with both NULL and NOT NULL vectors
334
335 @param bv - [out] - bit-ector with mismatch positions indicated as 1s
336 @param sv1 - vector 1
337 @param sv2 - vector 2
338 @param null_proc - rules of processing for (not) NULL plane
339 bm::no_null - NULLs from both vectors are treated as uncertainty
340 and NOT included into final result
341 bm::use_null - difference in NULLs accounted into the result
342
343 @sa sparse_vector_find_first_mismatch
344
345 \ingroup svalgo
346*/
347template<typename SV1, typename SV2>
348void sparse_vector_find_mismatch(typename SV1::bvector_type& bv,
349 const SV1& sv1,
350 const SV2& sv2,
351 bm::null_support null_proc)
352{
353 typedef typename SV1::bvector_type bvector_type;
354 typedef typename bvector_type::allocator_type allocator_type;
355 typedef typename allocator_type::allocator_pool_type allocator_pool_type;
356
357 allocator_pool_type pool; // local pool for blocks
358 typename bvector_type::mem_pool_guard mp_guard_bv;
359 mp_guard_bv.assign_if_not_set(pool, bv);
360
361
363 {
364 BM_ASSERT(0); // TODO: fixme
365 }
367 {
368 BM_ASSERT(0); // TODO: fixme
369 }
370
371 bv.clear();
372
373 unsigned planes = sv1.effective_slices();; // slices();
374 if (planes < sv2.slices())
375 planes = sv2.slices();
376
377 for (unsigned i = 0; i < planes; ++i)
378 {
379 typename SV1::bvector_type_const_ptr bv1 = sv1.get_slice(i);
380 typename SV2::bvector_type_const_ptr bv2 = sv2.get_slice(i);
381
382 if (!bv1)
383 {
384 if (!bv2)
385 continue;
386 bv |= *bv2;
387 continue;
388 }
389 if (!bv2)
390 {
391 BM_ASSERT(bv1);
392 bv |= *bv1;
393 continue;
394 }
395
396 // both planes are not NULL, compute XOR diff
397 //
398 bvector_type bv_xor;
399 typename bvector_type::mem_pool_guard mp_guard;
400 mp_guard.assign_if_not_set(pool, bv_xor);
401
402 bv_xor.bit_xor(*bv1, *bv2, SV1::bvector_type::opt_none);
403 bv |= bv_xor;
404
405 } // for i
406
407 // size mismatch check
408 {
409 typename SV1::size_type sz1 = sv1.size();
410 typename SV2::size_type sz2 = sv2.size();
411 if (sz1 != sz2)
412 {
413 if (sz1 < sz2)
414 {
415 }
416 else
417 {
418 bm::xor_swap(sz1, sz2);
419 }
420 bv.set_range(sz1, sz2-1);
421 }
422 }
423
424 // NULL processings
425 //
426 typename SV1::bvector_type_const_ptr bv_null1 = sv1.get_null_bvector();
427 typename SV2::bvector_type_const_ptr bv_null2 = sv2.get_null_bvector();
428
429 switch (null_proc)
430 {
431 case bm::no_null:
432 // NULL correction to exclude all NULL (unknown) values from the result set
433 // (AND with NOT NULL vector)
434 if (bv_null1 && bv_null2)
435 {
436 bvector_type bv_or;
437 typename bvector_type::mem_pool_guard mp_guard;
438 mp_guard.assign_if_not_set(pool, bv_or);
439
440 bv_or.bit_or(*bv_null1, *bv_null2, bvector_type::opt_none);
441 bv &= bv_or;
442 }
443 else
444 {
445 if (bv_null1)
446 bv &= *bv_null1;
447 if (bv_null2)
448 bv &= *bv_null2;
449 }
450 break;
451 case bm::use_null:
452 if (bv_null1 && bv_null2)
453 {
454 bvector_type bv_xor;
455 typename bvector_type::mem_pool_guard mp_guard;
456 mp_guard.assign_if_not_set(pool, bv_xor);
457
458 bv_xor.bit_xor(*bv_null1, *bv_null2, bvector_type::opt_none);
459 bv |= bv_xor;
460 }
461 else
462 {
463 bvector_type bv_null;
464 typename bvector_type::mem_pool_guard mp_guard;
465 mp_guard.assign_if_not_set(pool, bv_null);
466 if (bv_null1)
467 {
468 bv_null = *bv_null1;
469 bv_null.resize(sv1.size());
470 }
471 if (bv_null2)
472 {
473 bv_null = *bv_null2;
474 bv_null.resize(sv2.size());
475 }
476 bv_null.invert();
477 bv |= bv_null;
478 }
479 break;
480 default:
481 BM_ASSERT(0);
482 }
483}
484
485/**
486 \brief Index for SV sorted vectors for approximate range queries
487
488 @internal
489 */
490template<typename SV>
492{
493public:
494 typedef typename SV::value_type value_type;
495 typedef typename SV::size_type size_type;
496 typedef typename SV::bvector_type bvector_type;
498 typedef bm::dynamic_heap_matrix<value_type, allocator_type> heap_matrix_type;
499
501 sv_sample_index(const SV& sv, unsigned s_factor)
502 {
503 construct(sv, s_factor);
504 }
505
506
507
508 /**
509 Build sampling index for the sorted sprase vector
510 @param sv - string sparse vector to index
511 @param s_factor - sampling factor
512 */
513 void construct(const SV& sv, unsigned s_factor);
514
515
516 /// Original SV size
517 size_type sv_size() const BMNOEXCEPT { return sv_size_; }
518
519 /// Index size (number of sampled elements)
520 size_type size() const BMNOEXCEPT { return idx_size_; }
521
522 /// returns true if all index values are unique
523 bool is_unique() const BMNOEXCEPT { return idx_unique_; }
524
525 /// find range (binary)
526 /// @internal
527 bool bfind_range(const value_type* search_str,
528 size_t in_len,
529 size_type& l,
530 size_type& r) const BMNOEXCEPT;
531
532 /// find common prefix between index elements and search string
533 ///
534 size_type common_prefix_length(const value_type* search_str,
535 size_t in_len,
536 size_type l, size_type r) const BMNOEXCEPT;
537
538
539 /**
540 recalculate range into SV coordinates range [from..to)
541 */
542 void recalc_range(const value_type* search_str,
543 size_type& l,
544 size_type& r) const BMNOEXCEPT;
545
546 /// Return length of minimal indexed string
547 size_t get_min_len() const BMNOEXCEPT { return min_key_len_; }
548
549
550
551private:
552 heap_matrix_type s_cache_; ///< cache for SV sampled elements
553 unsigned s_factor_ = 0;
554 size_type sv_size_ = 0; ///< original sv size
555 size_type idx_size_ = 0; ///< index size
556 bool idx_unique_ = true; ///< inx value unique or there are dups?
557 size_t min_key_len_ = 0; ///< minimal key size in index
558};
559
560
561/**
562 \brief algorithms for sparse_vector scan/search
563
564 Scanner uses properties of bit-vector planes to answer questions
565 like "find all sparse vector elements equivalent to XYZ".
566
567 Class uses fast algorithms based on properties of bit-planes.
568 This is NOT a brute force, direct scan, scanner uses search space pruning and cache optimizations
569 to run the search.
570
571 S_FACTOR - Sampling factor for search. Can be: [ 4, 8, 16, 32, 64 ]. Default: 16.
572 Lower sampling facor (4, 8) lowers memory footprint for the scanner class instance
573 Higher - improves search performance (at the expense for memory for sampled elements)
574 Sampling factor is used for binary search in bound string sparse vector, so memory consumption
575 depends on sampling and max string length.
576
577 @ingroup svalgo
578 @ingroup setalgo
579*/
580template<typename SV, unsigned S_FACTOR>
582{
583public:
584 typedef typename SV::bvector_type bvector_type;
587 typedef typename SV::value_type value_type;
588 typedef typename SV::size_type size_type;
589
591 typedef typename allocator_type::allocator_pool_type allocator_pool_type;
592
594 typedef
595 bm::heap_vector<value_type, typename bvector_type::allocator_type, true>
597 typedef
598 bm::heap_vector<bvector_type*, allocator_type, true> bvect_vector_type;
599 typedef
601
602 /**
603 Scanner options to control execution
604 */
605 typedef
606 typename aggregator_type::run_options run_options;
607
608public:
609
610
611 /**
612 Pipeline to run multiple searches against a particular SV faster using cache optimizations.
613 USAGE:
614 - setup the pipeline (add searches)
615 - run all searches at once
616 - get the search results (in the order it was added)
617 */
618 template<class Opt = bm::agg_run_options<> >
620 {
621 public:
622 typedef
623 typename aggregator_type::template pipeline<Opt> aggregator_pipeline_type;
624 public:
625 pipeline(const SV& sv)
626 : sv_(sv), bv_and_mask_(0),
627 eff_slices_(sv_.get_bmatrix().calc_effective_rows_not_null())
628 {}
629
630 /// Set pipeline run options
632 { return agg_pipe_.options(); }
633
634 /// Get pipeline run options
636 { return agg_pipe_.options(); }
637
638 /** Set pipeline mask bit-vector to restrict the search space
639 @param bv_mask - pointer to bit-vector restricting search to vector indexes marked
640 as 1s. Pointer ownership is not transferred, NULL value resets the mask.
641 bv_mask defines the mask for all added searches.
642
643 */
644 void set_search_mask(const bvector_type* bv_mask) BMNOEXCEPT;
645
646 /**
647 Set search limit for results. Requires that bit-counting to be enabled in the template parameters.
648 Warning: search limit is approximate (for performance reasons) so it can occasinally find more
649 than requested. It cannot find less. Search limit works for individual results (not ORed aggregate)
650 @param limit - search limit (target population count to search for)
651 */
653 { agg_pipe_.set_search_count_limit(limit); }
654
655 /**
656 Attach OR (aggregator vector).
657 Pipeline results all will be OR-ed (UNION) into the OR target vector
658 @param bv_or - OR target vector
659 */
661 { agg_pipe_.set_or_target(bv_or); }
662
663 /*! @name pipeline fill-in methods */
664 //@{
665
666 /**
667 Add new search string
668 @param str - zero terminated string pointer to configure a search for
669 */
670 void add(const typename SV::value_type* str);
671
672 /** Prepare pipeline for the execution (resize and init internal structures)
673 Once complete, you cannot add() to it.
674 */
675 void complete() { agg_pipe_.complete(); }
676
678 { return agg_pipe_.is_complete(); }
679
680 size_type size() const BMNOEXCEPT { return agg_pipe_.size(); }
681
682 //@}
683
684 /** Return results vector used for pipeline execution */
686 { return agg_pipe_.get_bv_res_vector(); }
687 /** Return results vector count used for pipeline execution */
688
690 { return agg_pipe_.get_bv_count_vector(); }
691
692
693 /**
694 get aggregator pipeline (access to compute internals)
695 @internal
696 */
699
700 protected:
702
703 pipeline(const pipeline&) = delete;
704 pipeline& operator=(const pipeline&) = delete;
705 protected:
706 const SV& sv_; ///< target sparse vector ref
707 aggregator_pipeline_type agg_pipe_; ///< traget aggregator pipeline
710 size_t eff_slices_; ///< number of effectice NOT NULL value slices
711 };
712
713public:
715
716 /**
717 \brief bind sparse vector for all searches
718
719 \param sv - input sparse vector to bind for searches
720 \param sorted - source index is sorted, build index for binary search
721 */
722 void bind(const SV& sv, bool sorted);
723
724 /**
725 \brief reset sparse vector binding
726 */
728
729
730 /*! @name Find in scalar vector */
731 //@{
732
733 /**
734 \brief find all sparse vector elements EQ to search value
735
736 Find all sparse vector elements equivalent to specified value
737
738 \param sv - input sparse vector
739 \param value - value to search for
740 \param bv_out - search result bit-vector (search result is a vector of 1s when sv[i] == value)
741 */
742 void find_eq(const SV& sv, value_type value, bvector_type& bv_out);
743
744
745 /**
746 \brief find all sparse vector elements EQ to search value
747
748 Find all sparse vector elements equivalent to specified value
749
750 \param sv - input sparse vector
751 \param value - value to search for
752 \param bi - back insert iterator for the search results
753 */
754 template<typename BII>
755 void find_eq(const SV& sv, value_type value, BII bi);
756
757
758 /**
759 \brief find first sparse vector element
760
761 Find all sparse vector elements equivalent to specified value.
762 Works well if sperse vector represents unordered set
763
764 \param sv - input sparse vector
765 \param value - value to search for
766 \param pos - output found sparse vector element index
767
768 \return true if found
769 */
770 bool find_eq(const SV& sv, value_type value, size_type& pos);
771
772 /**
773 \brief binary search for position in the sorted sparse vector
774
775 Method assumes the sparse array is sorted, if value is found pos returns its index,
776 if not found, pos would contain index where it could be inserted to maintain the order
777
778 \param sv - input sparse vector
779 \param val - value to search for
780 \param pos - output sparse vector element index (actual index if found or insertion
781 point if not found)
782
783 \return true if value found
784 */
785 bool bfind(const SV& sv, const value_type val, size_type& pos);
786
787 /**
788 \brief find all elements sparse vector element greater (>) than value
789
790 \param sv - input sparse vector
791 \param value - value to search for
792 \param bv_out - search result bit-vector (search result is a vector of 1s when sv[i] > value)
793 */
794 void find_gt(const SV& sv, value_type val, bvector_type& bv_out);
795
796 /**
797 \brief find all elements sparse vector element greater or equal (>=) than value
798
799 \param sv - input sparse vector
800 \param value - value to search for
801 \param bv_out - search result bit-vector (search result is a vector of 1s when sv[i] >= value)
802 */
803 void find_ge(const SV& sv, value_type val, bvector_type& bv_out);
804
805
806 /**
807 \brief find all elements sparse vector element less (<) than value
808
809 \param sv - input sparse vector
810 \param value - value to search for
811 \param bv_out - search result bit-vector (search result is a vector of 1s when sv[i] < value)
812 */
813 void find_lt(const SV& sv, value_type val, bvector_type& bv_out);
814
815 /**
816 \brief find all elements sparse vector element less or equal (<=) than value
817
818 \param sv - input sparse vector
819 \param value - value to search for
820 \param bv_out - search result bit-vector (search result is a vector of 1s when sv[i] <= value)
821 */
822 void find_le(const SV& sv, value_type val, bvector_type& bv_out);
823
824
825 /**
826 \brief find all elements sparse vector element in closed range [left..right] interval
827
828 \param sv - input sparse vector
829 \param from - range from
830 \param to - range to
831 \param bv_out - search result bit-vector (search result is a vector of 1s when sv[i] <= value)
832 */
833 void find_range(const SV& sv,
834 value_type from, value_type to,
835 bvector_type& bv_out);
836
837
838 //@}
839
840
841 // --------------------------------------------------
842 //
843 /*! @name Find in bit-transposed string vector */
844 //@{
845
846
847 /**
848 \brief find sparse vector elements (string)
849 @param sv - sparse vector of strings to search
850 @param str - string to search for
851 @param bv_out - search result bit-vector (search result masks 1 elements)
852 */
853 bool find_eq_str(const SV& sv, const value_type* str, bvector_type& bv_out);
854
855 /**
856 \brief find sparse vector elementa (string) in the attached SV
857 @param str - string to search for
858 @param bv_out - search result bit-vector (search result masks 1 elements)
859 */
860 bool find_eq_str(const value_type* str, bvector_type& bv_out);
861
862 /**
863 \brief find first sparse vector element (string)
864 @param sv - sparse vector of strings to search
865 @param str - string to search for
866 @param pos - [out] index of the first found
867 */
868 bool find_eq_str(const SV& sv, const value_type* str, size_type& pos);
869
870 /**
871 \brief binary find first sparse vector element (string)
872 Sparse vector must be attached (bind())
873 @param str - string to search for
874 @param pos - [out] index of the first found
875 @sa bind
876 */
877 bool find_eq_str(const value_type* str, size_type& pos);
878
879 /**
880 \brief find sparse vector elements with a given prefix (string)
881 @param sv - sparse vector of strings to search
882 @param str - string prefix to search for
883 "123" is a prefix for "1234567"
884 @param bv_out - search result bit-vector (search result masks 1 elements)
885 */
886 bool find_eq_str_prefix(const SV& sv, const value_type* str,
887 bvector_type& bv_out);
888
889 /**
890 \brief find sparse vector elements using search pipeline
891 @param pipe - pipeline to run
892 */
893 template<class TPipe>
894 void find_eq_str(TPipe& pipe);
895
896 /**
897 \brief binary find first sparse vector element (string). Sparse vector must be sorted.
898
899 @param sv - sparse vector of strings to search
900 @param str - string prefix to search for
901 @param pos - [out] first position found
902 */
903 bool bfind_eq_str(const SV& sv,
904 const value_type* str, size_type& pos);
905
906 /**
907 \brief lower bound search for an array position
908
909 Method assumes the sparse array is sorted
910
911 \param sv - input sparse vector
912 \param str - value to search for
913 \param pos - output sparse vector element index
914
915 \return true if value found
916 */
917 bool lower_bound_str(const SV& sv,
918 const value_type* str,
919 size_type& pos);
920
921 /**
922 \brief binary find first sparse vector element (string)
923 Sparse vector must be sorted and attached (use method bind())
924
925 @param str - string prefix to search for
926 @param pos - [out] first position found
927
928 @sa bind
929 */
930 bool bfind_eq_str(const value_type* str, size_type& pos);
931
932 /**
933 \brief binary find first sparse vector element (string)
934 Sparse vector must be sorted and attached (use method bind())
935
936 @param str - string prefix to search for
937 @param len - string length
938 @param pos - [out] first position found
939
940 @sa bind
941 */
942 bool bfind_eq_str(const value_type* str, size_t len, size_type& pos);
943
944
945 //@}
946
947 /**
948 \brief find all sparse vector elements EQ to 0
949 \param sv - input sparse vector
950 \param bv_out - output bit-vector (search result masks 1 elements)
951 \param null_correct - flag to perform NULL correction
952 */
953 void find_zero(const SV& sv, bvector_type& bv_out, bool null_correct = true);
954
955 /*!
956 \brief Find non-zero elements
957 Output vector is computed as a logical OR (join) of all planes
958
959 \param sv - input sparse vector
960 \param bv_out - output bit-bector of non-zero elements
961 */
962 void find_nonzero(const SV& sv, bvector_type& bv_out);
963
964 /*!
965 \brief Find positive (greter than zero elements)
966 Output vector is computed as a logical OR (join) of all planes
967
968 \param sv - input sparse vector
969 \param bv_out - output bit-bector of non-zero elements
970 */
971 void find_positive(const SV& sv, bvector_type& bv_out);
972
973
974 /**
975 \brief invert search result ("EQ" to "not EQ")
976
977 \param sv - input sparse vector
978 \param bv_out - output bit-bector of non-zero elements
979 */
980 void invert(const SV& sv, bvector_type& bv_out);
981
982 /**
983 \brief find all values A IN (C, D, E, F)
984 \param sv - input sparse vector
985 \param start - start iterator (set to search)
986 \param end - end iterator (set to search)
987 \param bv_out - output bit-bector of non-zero elements
988 */
989 template<typename It>
990 void find_eq(const SV& sv,
991 It start,
992 It end,
993 bvector_type& bv_out)
994 {
995 typename bvector_type::mem_pool_guard mp_guard;
996 mp_guard.assign_if_not_set(pool_, bv_out); // set algorithm-local memory pool to avoid heap contention
997
998 bvector_type bv1;
999 typename bvector_type::mem_pool_guard mp_guard1(pool_, bv1);
1000 bool any_zero = false;
1001 for (; start < end; ++start)
1002 {
1003 value_type v = *start;
1004 any_zero |= (v == 0);
1005 bool found = find_eq_with_nulls(sv, v, bv1);
1006 if (found)
1007 bv_out.bit_or(bv1);
1008 else
1009 {
1010 BM_ASSERT(!bv1.any());
1011 }
1012 } // for
1013 if (any_zero)
1014 correct_nulls(sv, bv_out);
1015 }
1016
1017
1018 /// For testing purposes only
1019 ///
1020 /// @internal
1021 void find_eq_with_nulls_horizontal(const SV& sv, value_type value,
1022 bvector_type& bv_out);
1023
1024 /// For testing purposes only
1025 ///
1026 /// @internal
1027 void find_gt_horizontal(const SV& sv,
1028 value_type value,
1029 bvector_type& bv_out,
1030 bool null_correct = true);
1031
1032
1033 /** Exclude possible NULL values from the result vector
1034 \param sv - input sparse vector
1035 \param bv_out - output bit-bector of non-zero elements
1036 */
1037 void correct_nulls(const SV& sv, bvector_type& bv_out);
1038
1039 /// Return allocator pool for blocks
1040 /// (Can be used to improve performance of repeated searches with the same scanner)
1041 ///
1043
1044protected:
1045
1046 template<bool BOUND>
1047 bool bfind_eq_str_impl(const SV& sv,
1048 const value_type* str, size_t in_len,
1049 bool remaped,
1050 size_type& pos);
1051
1052
1053 /// Remap input value into SV char encodings
1054 static
1055 bool remap_tosv(remap_vector_type& remap_vect_target,
1056 const value_type* str,
1057 const SV& sv);
1058
1059 /// set search boundaries (hint for the aggregator)
1061
1062 /// reset (disable) search range
1064
1065 /// find value (may include NULL indexes)
1066 bool find_eq_with_nulls(const SV& sv,
1067 value_type value,
1068 bvector_type& bv_out,
1069 size_type search_limit = 0);
1070
1071 /// find first value (may include NULL indexes)
1072 bool find_first_eq(const SV& sv,
1073 value_type value,
1074 size_type& idx);
1075
1076 /// find first string value (may include NULL indexes)
1077 bool find_first_eq(const SV& sv,
1078 const value_type* str,
1079 size_t in_len,
1080 size_type& idx,
1081 bool remaped,
1082 unsigned prefix_len);
1083
1084 /// find EQ str / prefix impl
1085 bool find_eq_str_impl(const SV& sv,
1086 const value_type* str,
1087 bvector_type& bv_out,
1088 bool prefix_sub);
1089
1090 /// Prepare aggregator for AND-SUB (EQ) search
1091 bool prepare_and_sub_aggregator(const SV& sv, value_type value);
1092
1093 /// Prepare aggregator for AND-SUB (EQ) search (string)
1094 bool prepare_and_sub_aggregator(const SV& sv,
1095 const value_type* str,
1096 unsigned octet_start,
1097 bool prefix_sub);
1098
1099 /// Rank-Select decompression for RSC vectors
1100 void decompress(const SV& sv, bvector_type& bv_out);
1101
1102 /// compare sv[idx] with input str
1103 template <bool BOUND>
1104 int compare_str(const SV& sv, size_type idx,
1105 const value_type* str) const BMNOEXCEPT;
1106
1107 /// compare sv[idx] with input value
1108 int compare(const SV& sv, size_type idx, const value_type val) BMNOEXCEPT;
1109
1110 /// @internal
1111 void aggregate_OR_slices(bvector_type& bv_target, const SV& sv,
1112 unsigned from, unsigned total_planes);
1113
1114 /// @internal
1115 static
1116 void aggregate_AND_OR_slices(bvector_type& bv_target,
1117 const bvector_type& bv_mask,
1118 const SV& sv,
1119 unsigned from, unsigned total_planes);
1120
1121 /// Return the slice limit for signed/unsigned vector value types
1122 /// @internal
1123 static constexpr int gt_slice_limit() noexcept
1124 {
1125 if constexpr (std::is_signed<value_type>::value)
1126 return 1; // last plane
1127 else
1128 return 0;
1129 }
1130
1132 {
1133 value_vect_.resize_no_copy(effective_str_max_ * 2);
1134 remap_value_vect_.resize_no_copy(effective_str_max_ * 2);
1135 remap_prefix_vect_.resize_no_copy(effective_str_max_ * 2);
1136 }
1137
1138
1139protected:
1141 void operator=(const sparse_vector_scanner&) = delete;
1142
1143protected:
1149
1150 typedef bm::dynamic_heap_matrix<value_type, allocator_type> heap_matrix_type;
1151 typedef bm::dynamic_heap_matrix<value_type, allocator_type> matrix_search_buf_type;
1152
1153 typedef
1154 bm::heap_vector<bm::id64_t, typename bvector_type::allocator_type, true>
1156
1157protected:
1158
1159 /// Addd char to aggregator (AND-SUB)
1160 template<typename AGG>
1161 static
1162 void add_agg_char(AGG& agg,
1163 const SV& sv,
1164 int octet_idx, bm::id64_t planes_mask,
1165 unsigned value)
1166 {
1167 unsigned char bits[64];
1168 unsigned short bit_count_v = bm::bitscan(value, bits);
1169 for (unsigned i = 0; i < bit_count_v; ++i)
1170 {
1171 unsigned bit_idx = bits[i];
1172 BM_ASSERT(value & (value_type(1) << bit_idx));
1173 unsigned plane_idx = (unsigned(octet_idx) * 8) + bit_idx;
1174 const bvector_type* bv = sv.get_slice(plane_idx);
1175 BM_ASSERT(bv != sv.get_null_bvector());
1176 agg.add(bv, 0); // add to the AND group
1177 } // for i
1178 unsigned vinv = unsigned(value);
1179 if constexpr (sizeof(value_type) == 1)
1180 vinv ^= 0xFF;
1181 else // 2-byte char
1182 {
1183 BM_ASSERT(sizeof(value_type) == 2);
1184 vinv ^= 0xFFFF;
1185 }
1186 // exclude the octet bits which are all not set (no vectors)
1187 vinv &= unsigned(planes_mask);
1188 for (unsigned octet_plane = (unsigned(octet_idx) * 8); vinv; vinv &= vinv-1)
1189 {
1190 bm::id_t t = vinv & -vinv;
1191 unsigned bit_idx = bm::word_bitcount(t - 1);
1192 unsigned plane_idx = octet_plane + bit_idx;
1193 const bvector_type* bv = sv.get_slice(plane_idx);
1194 BM_ASSERT(bv);
1195 BM_ASSERT(bv != sv.get_null_bvector());
1196 agg.add(bv, 1); // add to SUB group
1197 } // for
1198 }
1199
1200 enum code
1201 {
1203 sub_block_l1_size = bm::gap_max_bits / S_FACTOR // size in bits/elements
1204 };
1205
1206private:
1207 allocator_pool_type pool_;
1208 bvector_type bv_tmp_;
1209 aggregator_type agg_;
1211
1212 size_type mask_from_;
1213 size_type mask_to_;
1214 bool mask_set_;
1215
1216 const SV* bound_sv_;
1217
1218 bm::sv_sample_index<SV> range_idx_; ///< range index
1219/*
1220 heap_matrix_type block_l0_cache_; ///< cache for elements[0] of each block
1221 heap_matrix_type block_l1_cache_; ///< cache for elements[x]
1222*/
1223 size_type effective_str_max_;
1224
1225 remap_vector_type value_vect_; ///< value buffer
1226 remap_vector_type remap_value_vect_; ///< remap buffer
1227 remap_vector_type remap_prefix_vect_; ///< common prefix buffer
1228 /// masks of allocated bit-planes (1 - means there is a bit-plane)
1229 mask_vector_type vector_plane_masks_;
1230 matrix_search_buf_type hmatr_; ///< heap matrix for string search linear stage
1231};
1232
1233
1234/*!
1235 \brief Integer set to set transformation (functional image in groups theory)
1236 https://en.wikipedia.org/wiki/Image_(mathematics)
1237
1238 Input sets gets translated through the function, which is defined as
1239 "one to one (or NULL)" binary relation object (sparse_vector).
1240 Also works for M:1 relationships.
1241
1242 \ingroup svalgo
1243 \ingroup setalgo
1244*/
1245template<typename SV>
1247{
1248public:
1249 typedef typename SV::bvector_type bvector_type;
1250 typedef typename SV::value_type value_type;
1251 typedef typename SV::size_type size_type;
1253public:
1254
1257
1258
1259 /** Get read access to zero-elements vector
1260 Zero vector gets populated after attach_sv() is called
1261 or as a side-effect of remap() with immediate sv param
1262 */
1263 const bvector_type& get_bv_zero() const { return bv_zero_; }
1264
1265 /** Perform remapping (Image function) (based on attached translation table)
1266
1267 \param bv_in - input set, defined as a bit-vector
1268 \param bv_out - output set as a bit-vector
1269
1270 @sa attach_sv
1271 */
1272 void remap(const bvector_type& bv_in,
1273 bvector_type& bv_out);
1274
1275 /** Perform remapping (Image function)
1276
1277 \param bv_in - input set, defined as a bit-vector
1278 \param sv_brel - binary relation (translation table) sparse vector
1279 \param bv_out - output set as a bit-vector
1280 */
1281 void remap(const bvector_type& bv_in,
1282 const SV& sv_brel,
1283 bvector_type& bv_out);
1284
1285 /** Remap single set element
1286
1287 \param id_from - input value
1288 \param sv_brel - translation table sparse vector
1289 \param id_to - out value
1290
1291 \return - true if value was found and remapped
1292 */
1293 bool remap(size_type id_from, const SV& sv_brel, size_type& id_to);
1294
1295
1296 /** Run remap transformation
1297
1298 \param bv_in - input set, defined as a bit-vector
1299 \param sv_brel - translation table sparse vector
1300 \param bv_out - output set as a bit-vector
1301
1302 @sa remap
1303 */
1304 void run(const bvector_type& bv_in,
1305 const SV& sv_brel,
1306 bvector_type& bv_out)
1307 {
1308 remap(bv_in, sv_brel, bv_out);
1309 }
1310
1311 /** Attach a translation table vector for remapping (Image function)
1312
1313 \param sv_brel - binary relation sparse vector pointer
1314 (pass NULL to detach)
1315 \param compute_stats - flag to perform computation of some statistics
1316 later used in remapping. This only make sense
1317 for series of remappings on the same translation
1318 vector.
1319 @sa remap
1320 */
1321 void attach_sv(const SV* sv_brel, bool compute_stats = false);
1322
1323
1324protected:
1325 void one_pass_run(const bvector_type& bv_in,
1326 const SV& sv_brel,
1327 bvector_type& bv_out);
1328
1329 /// @internal
1330 template<unsigned BSIZE>
1336
1338 {
1339 sv_g_size = 1024 * 8
1340 };
1342
1343
1344protected:
1346 void operator=(const set2set_11_transform&) = delete;
1347
1348protected:
1349 const SV* sv_ptr_; ///< current translation table vector
1350 gather_buffer_type* gb_; ///< intermediate buffers
1351 bvector_type bv_product_;///< temp vector
1352
1353 bool have_stats_; ///< flag of statistics presense
1354 bvector_type bv_zero_; ///< bit-vector for zero elements
1355
1357};
1358
1359
1360
1361//----------------------------------------------------------------------------
1362//
1363//----------------------------------------------------------------------------
1364
1365template<typename SV>
1367: sv_ptr_(0), gb_(0), have_stats_(false)
1368{
1369 // set2set_11_transform for signed int is not implemented yet
1370 static_assert(std::is_unsigned<value_type>::value,
1371 "BM: unsigned sparse vector is required for transform");
1372 gb_ = (gather_buffer_type*)::malloc(sizeof(gather_buffer_type));
1373 if (!gb_)
1374 {
1375 SV::throw_bad_alloc();
1376 }
1377}
1378
1379//----------------------------------------------------------------------------
1380
1381template<typename SV>
1383{
1384 if (gb_)
1385 ::free(gb_);
1386}
1387
1388
1389//----------------------------------------------------------------------------
1390
1391template<typename SV>
1392void set2set_11_transform<SV>::attach_sv(const SV* sv_brel, bool compute_stats)
1393{
1394 sv_ptr_ = sv_brel;
1395 if (!sv_ptr_)
1396 {
1397 have_stats_ = false;
1398 }
1399 else
1400 {
1401 if (sv_brel->empty() || !compute_stats)
1402 return; // nothing to do
1403 const bvector_type* bv_non_null = sv_brel->get_null_bvector();
1404 if (bv_non_null)
1405 return; // already have 0 elements vector
1406
1407
1408 typename bvector_type::mem_pool_guard mp_g_z;
1409 mp_g_z.assign_if_not_set(pool_, bv_zero_);
1410
1412 scanner.find_zero(*sv_brel, bv_zero_);
1413 have_stats_ = true;
1414 }
1415}
1416
1417//----------------------------------------------------------------------------
1418
1419template<typename SV>
1421 const SV& sv_brel,
1422 size_type& id_to)
1423{
1424 if (sv_brel.empty())
1425 return false; // nothing to do
1426
1427 const bvector_type* bv_non_null = sv_brel.get_null_bvector();
1428 if (bv_non_null)
1429 {
1430 if (!bv_non_null->test(id_from))
1431 return false;
1432 }
1433 size_type idx = sv_brel.translate_address(id_from);
1434 id_to = sv_brel.get(idx);
1435 return true;
1436}
1437
1438//----------------------------------------------------------------------------
1439
1440template<typename SV>
1442 const SV& sv_brel,
1443 bvector_type& bv_out)
1444{
1445 typename bvector_type::mem_pool_guard mp_g_out, mp_g_p, mp_g_z;
1446 mp_g_out.assign_if_not_set(pool_, bv_out);
1447 mp_g_p.assign_if_not_set(pool_, bv_product_);
1448 mp_g_z.assign_if_not_set(pool_, bv_zero_);
1449
1450 attach_sv(&sv_brel);
1451
1452 remap(bv_in, bv_out);
1453
1454 attach_sv(0); // detach translation table
1455}
1456
1457template<typename SV>
1459 bvector_type& bv_out)
1460{
1462
1463 bv_out.clear();
1464
1465 if (sv_ptr_ == 0 || sv_ptr_->empty())
1466 return; // nothing to do
1467
1468 bv_out.init(); // just in case to "fast set" later
1469
1470 typename bvector_type::mem_pool_guard mp_g_out, mp_g_p, mp_g_z;
1471 mp_g_out.assign_if_not_set(pool_, bv_out);
1472 mp_g_p.assign_if_not_set(pool_, bv_product_);
1473 mp_g_z.assign_if_not_set(pool_, bv_zero_);
1474
1475
1476 const bvector_type* enum_bv;
1477
1478 const bvector_type * bv_non_null = sv_ptr_->get_null_bvector();
1479 if (bv_non_null)
1480 {
1481 // TODO: optimize with 2-way ops
1482 bv_product_ = bv_in;
1483 bv_product_.bit_and(*bv_non_null);
1484 enum_bv = &bv_product_;
1485 }
1486 else // non-NULL vector is not available
1487 {
1488 enum_bv = &bv_in;
1489 // if we have any elements mapping into "0" on the other end
1490 // we map it once (chances are there are many duplicates)
1491 //
1492
1493 if (have_stats_) // pre-attached translation statistics
1494 {
1495 bv_product_ = bv_in;
1496 size_type cnt1 = bv_product_.count();
1497 bv_product_.bit_sub(bv_zero_);
1498 size_type cnt2 = bv_product_.count();
1499
1500 BM_ASSERT(cnt2 <= cnt1);
1501
1502 if (cnt1 != cnt2) // mapping included 0 elements
1503 bv_out.set_bit_no_check(0);
1504
1505 enum_bv = &bv_product_;
1506 }
1507 }
1508
1509
1510
1511 size_type buf_cnt, nb_old, nb;
1512 buf_cnt = nb_old = 0;
1513
1514 typename bvector_type::enumerator en(enum_bv->first());
1515 for (; en.valid(); ++en)
1516 {
1517 typename SV::size_type idx = *en;
1518 idx = sv_ptr_->translate_address(idx);
1519
1520 nb = unsigned(idx >> bm::set_block_shift);
1521 if (nb != nb_old) // new blocks
1522 {
1523 if (buf_cnt)
1524 {
1525 sv_ptr_->gather(&gb_->buffer_[0], &gb_->gather_idx_[0], buf_cnt, BM_SORTED_UNIFORM);
1526 bv_out.set(&gb_->buffer_[0], buf_cnt, BM_SORTED);
1527 buf_cnt = 0;
1528 }
1529 nb_old = nb;
1530 gb_->gather_idx_[buf_cnt++] = idx;
1531 }
1532 else
1533 {
1534 gb_->gather_idx_[buf_cnt++] = idx;
1535 }
1536
1537 if (buf_cnt == sv_g_size)
1538 {
1539 sv_ptr_->gather(&gb_->buffer_[0], &gb_->gather_idx_[0], buf_cnt, BM_SORTED_UNIFORM);
1540 bv_out.set(&gb_->buffer_[0], buf_cnt, bm::BM_SORTED);
1541 buf_cnt = 0;
1542 }
1543 } // for en
1544 if (buf_cnt)
1545 {
1546 sv_ptr_->gather(&gb_->buffer_[0], &gb_->gather_idx_[0], buf_cnt, BM_SORTED_UNIFORM);
1547 bv_out.set(&gb_->buffer_[0], buf_cnt, bm::BM_SORTED);
1548 }
1549}
1550
1551
1552//----------------------------------------------------------------------------
1553
1554template<typename SV>
1556 const SV& sv_brel,
1557 bvector_type& bv_out)
1558{
1559 if (sv_brel.empty())
1560 return; // nothing to do
1561
1562 bv_out.init();
1563
1564 typename SV::const_iterator it = sv_brel.begin();
1565 for (; it.valid(); ++it)
1566 {
1567 typename SV::value_type t_id = *it;
1568 size_type idx = it.pos();
1569 if (bv_in.test(idx))
1570 {
1571 bv_out.set_bit_no_check(t_id);
1572 }
1573 } // for
1574}
1575
1576
1577//----------------------------------------------------------------------------
1578//
1579//----------------------------------------------------------------------------
1580
1581template<typename SV, unsigned S_FACTOR>
1583{
1584 mask_set_ = false;
1585 mask_from_ = mask_to_ = bm::id_max;
1586
1587 bound_sv_ = 0;
1588 effective_str_max_ = 0;
1589}
1590
1591//----------------------------------------------------------------------------
1592
1593template<typename SV, unsigned S_FACTOR>
1594void sparse_vector_scanner<SV, S_FACTOR>::bind(const SV& sv, bool sorted)
1595{
1596 static_assert(S_FACTOR == 4 || S_FACTOR == 8 || S_FACTOR == 16
1597 || S_FACTOR == 32 || S_FACTOR == 64,
1598 "BM: sparse_vector_scanner<> incorrect sampling factor template parameter");
1599
1600 (void)sorted; // MSVC warning over if constexpr variable "not-referenced"
1601 bound_sv_ = &sv;
1602
1603 if constexpr (SV::is_str()) // bindings for the string sparse vector
1604 {
1605 effective_str_max_ = sv.effective_vector_max();
1607
1608 if (sorted)
1609 {
1610 range_idx_.construct(sv, S_FACTOR);
1611 }
1612 // pre-calculate vector plane masks
1613 //
1614 vector_plane_masks_.resize(effective_str_max_);
1615 for (unsigned i = 0; i < effective_str_max_; ++i)
1616 {
1617 vector_plane_masks_[i] = sv.get_slice_mask(i);
1618 } // for i
1619 }
1620}
1621
1622//----------------------------------------------------------------------------
1623
1624template<typename SV, unsigned S_FACTOR>
1626{
1627 bound_sv_ = 0;
1628 effective_str_max_ = 0;
1629}
1630
1631//----------------------------------------------------------------------------
1632
1633template<typename SV, unsigned S_FACTOR>
1635 bvector_type& bv_out,
1636 bool null_correct)
1637{
1638 if (sv.size() == 0)
1639 {
1640 bv_out.clear();
1641 return;
1642 }
1643 find_nonzero(sv, bv_out);
1644 if constexpr (SV::is_compressed())
1645 {
1646 bv_out.invert();
1647 bv_out.set_range(sv.effective_size(), bm::id_max - 1, false);
1648 decompress(sv, bv_out);
1649 }
1650 else
1651 {
1652 invert(sv, bv_out);
1653 }
1654 if (null_correct)
1655 correct_nulls(sv, bv_out);
1656}
1657
1658//----------------------------------------------------------------------------
1659
1660template<typename SV, unsigned S_FACTOR>
1662{
1663 if (sv.size() == 0)
1664 {
1665 bv_out.clear();
1666 return;
1667 }
1668 // TODO: find a better algorithm (NAND?)
1669 auto old_sz = bv_out.size();
1670 bv_out.resize(sv.size());
1671 bv_out.invert();
1672 bv_out.resize(old_sz);
1673 correct_nulls(sv, bv_out);
1674}
1675
1676//----------------------------------------------------------------------------
1677
1678template<typename SV, unsigned S_FACTOR>
1680 bvector_type& bv_out)
1681{
1682 const bvector_type* bv_null = sv.get_null_bvector();
1683 if (bv_null) // correct result to only use not NULL elements
1684 bv_out.bit_and(*bv_null);
1685}
1686
1687//----------------------------------------------------------------------------
1688
1689template<typename SV, unsigned S_FACTOR>
1691 value_type value,
1692 bvector_type& bv_out,
1693 size_type search_limit)
1694{
1695 if (sv.empty())
1696 return false; // nothing to do
1697
1698 if (!value)
1699 {
1700 find_zero(sv, bv_out);
1701 return bv_out.any();
1702 }
1703 agg_.reset();
1704
1705 bool found = prepare_and_sub_aggregator(sv, value);
1706 if (!found)
1707 {
1708 bv_out.clear();
1709 return found;
1710 }
1711
1712 bool any = (search_limit == 1);
1713 found = agg_.combine_and_sub(bv_out, any);
1714 agg_.reset();
1715 return found;
1716}
1717
1718//----------------------------------------------------------------------------
1719
1720template<typename SV, unsigned S_FACTOR>
1722 value_type value,
1723 size_type& idx)
1724{
1725 if (sv.empty())
1726 return false; // nothing to do
1727
1728 BM_ASSERT(value); // cannot handle zero values
1729 if (!value)
1730 return false;
1731
1732 agg_.reset();
1733 bool found = prepare_and_sub_aggregator(sv, value);
1734 if (!found)
1735 return found;
1736 found = agg_.find_first_and_sub(idx);
1737 agg_.reset();
1738 return found;
1739}
1740
1741
1742//----------------------------------------------------------------------------
1743
1744template<typename SV, unsigned S_FACTOR>
1746 const SV& sv,
1747 const value_type* str,
1748 size_t in_len,
1749 size_type& idx,
1750 bool remaped,
1751 unsigned prefix_len)
1752{
1753 BM_ASSERT(*str && in_len);
1754 BM_ASSERT(in_len == ::strlen(str));
1755
1756 if (!*str)
1757 return false;
1758 agg_.reset();
1759 unsigned common_prefix_len = 0;
1760
1761 value_type* pref = remap_prefix_vect_.data();
1762 if (mask_set_) // it is assumed that the sv is SORTED so common prefix check
1763 {
1764 // if in range is exactly one block
1765 if (/*bool one_nb = */agg_.set_range_hint(mask_from_, mask_to_))
1766 {
1767 if (prefix_len == ~0u) // not valid (uncalculated) prefix len
1768 {
1769 common_prefix_len =
1770 sv.template common_prefix_length<true>(mask_from_, mask_to_, pref);
1771 if (common_prefix_len)
1772 {
1773 if (remaped)
1774 str = remap_value_vect_.data();
1775 // next comparison is in the remapped form
1776 for (unsigned i = 0; i < common_prefix_len; ++i)
1777 if (str[i] != pref[i])
1778 return false;
1779 }
1780 }
1781 else
1782 {
1783 unsigned pl; (void)pl;
1784 BM_ASSERT(prefix_len <=
1785 (pl=sv.template common_prefix_length<true>(
1786 mask_from_, mask_to_, pref)));
1787 common_prefix_len = prefix_len;
1788 }
1789 } // if one block hit
1790 else
1791 {
1792 if (prefix_len != ~0u) // not valid (uncalculated) prefix len
1793 {
1794 unsigned pl; (void)pl;
1795 BM_ASSERT(prefix_len <=
1796 (pl=sv.template common_prefix_length<true>(
1797 mask_from_, mask_to_, pref)));
1798 common_prefix_len = prefix_len;
1799 }
1800 }
1801 }
1802
1803 // prefix len checks
1804 if (common_prefix_len && (in_len <= common_prefix_len))
1805 {
1806 if (in_len == common_prefix_len)
1807 --common_prefix_len;
1808 else // (in_len < common_prefix_len)
1809 return false;
1810 }
1811
1812 const value_type* search_str = str;
1813 if (remaped)
1814 str = remap_value_vect_.data();
1815 else
1816 {
1817 if (sv.is_remap() && (str != remap_value_vect_.data()))
1818 {
1819 bool r = sv.remap_tosv(
1820 remap_value_vect_.data(), sv.effective_max_str(), str);
1821 if (!r)
1822 return r;
1823 str = remap_value_vect_.data();
1824 }
1825 }
1826
1827 bool found = prepare_and_sub_aggregator(sv, str, common_prefix_len, true);
1828 if (found)
1829 {
1830 found = agg_.find_first_and_sub(idx);
1831 if (found && idx > mask_to_) // out of bounds? may be false positive
1832 {
1833 int cmp = sv.compare(idx, search_str);
1834 found = (cmp == 0);
1835 }
1836 }
1837
1838 agg_.reset();
1839 return found;
1840}
1841
1842//----------------------------------------------------------------------------
1843
1844template<typename SV, unsigned S_FACTOR>
1846 const SV& sv,
1847 const value_type* str,
1848 unsigned octet_start,
1849 bool prefix_sub)
1850{
1851 int len = 0;
1852 for (; str[len] != 0; ++len)
1853 {}
1854 BM_ASSERT(len);
1855
1856 // use reverse order (faster for sorted arrays)
1857 // octet_start is the common prefix length (end index)
1858 for (int octet_idx = len-1; octet_idx >= int(octet_start); --octet_idx)
1859 {
1860 unsigned value = unsigned(str[octet_idx]) & 0xFFu;
1861 BM_ASSERT(value != 0);
1862 bm::id64_t planes_mask;
1863 if (&sv == bound_sv_)
1864 planes_mask = vector_plane_masks_[unsigned(octet_idx)];
1865 else
1866 planes_mask = sv.get_slice_mask(unsigned(octet_idx));
1867
1868 if ((value & planes_mask) != value) // pre-screen for impossible values
1869 return false; // found non-existing plane
1870
1871 add_agg_char(agg_, sv, octet_idx, planes_mask, value); // AND-SUB aggregator
1872 } // for octet_idx
1873
1874 // add all vectors above string len to the SUB aggregation group
1875 //
1876 if (prefix_sub)
1877 {
1878 typename SV::size_type planes = sv.get_bmatrix().rows_not_null();
1879 for (unsigned plane_idx = unsigned(len * 8);
1880 plane_idx < planes; ++plane_idx)
1881 {
1882 if (bvector_type_const_ptr bv = sv.get_slice(plane_idx))
1883 agg_.add(bv, 1); // agg to SUB group
1884 } // for
1885 }
1886 return true;
1887}
1888
1889//----------------------------------------------------------------------------
1890
1891template<typename SV, unsigned S_FACTOR>
1893 const SV& sv,
1894 value_type value)
1895{
1896 using unsigned_value_type = typename SV::unsigned_value_type;
1897
1898 agg_.reset();
1899
1900 unsigned char bits[sizeof(value) * 8];
1901 unsigned_value_type uv = sv.s2u(value);
1902
1903 unsigned short bit_count_v = bm::bitscan(uv, bits);
1904 BM_ASSERT(bit_count_v);
1905 const unsigned_value_type mask1 = 1;
1906
1907 // prep the lists for combined AND-SUB aggregator
1908 // (backward order has better chance for bit reduction on AND)
1909 //
1910 for (unsigned i = bit_count_v; i > 0; --i)
1911 {
1912 unsigned bit_idx = bits[i-1];
1913 BM_ASSERT(uv & (mask1 << bit_idx));
1914 if (const bvector_type* bv = sv.get_slice(bit_idx))
1915 agg_.add(bv);
1916 else
1917 return false;
1918 }
1919 unsigned sv_planes = sv.effective_slices();
1920 for (unsigned i = 0; i < sv_planes; ++i)
1921 {
1922 unsigned_value_type mask = mask1 << i;
1923 if (bvector_type_const_ptr bv = sv.get_slice(i); bv && !(uv & mask))
1924 agg_.add(bv, 1); // agg to SUB group
1925 } // for i
1926 return true;
1927}
1928
1929
1930//----------------------------------------------------------------------------
1931
1932template<typename SV, unsigned S_FACTOR>
1934 const SV& sv,
1935 value_type value,
1936 bvector_type& bv_out)
1937{
1938 bv_out.clear();
1939 if (sv.empty())
1940 return; // nothing to do
1941
1942 if (!value)
1943 {
1944 find_zero(sv, bv_out);
1945 return;
1946 }
1947
1948 unsigned char bits[sizeof(value) * 8];
1949 typename SV::unsigned_value_type uv = sv.s2u(value);
1950 unsigned short bit_count_v = bm::bitscan(uv, bits);
1951 BM_ASSERT(bit_count_v);
1952
1953 // aggregate AND all matching vectors
1954 if (const bvector_type* bv_plane = sv.get_slice(bits[--bit_count_v]))
1955 bv_out = *bv_plane;
1956 else // plane not found
1957 {
1958 bv_out.clear();
1959 return;
1960 }
1961 for (unsigned i = 0; i < bit_count_v; ++i)
1962 {
1963 const bvector_type* bv_plane = sv.get_slice(bits[i]);
1964 if (bv_plane)
1965 bv_out &= *bv_plane;
1966 else // mandatory plane not found - empty result!
1967 {
1968 bv_out.clear();
1969 return;
1970 }
1971 } // for i
1972
1973 // SUB all other planes
1974 unsigned sv_planes = sv.effective_slices();
1975 for (unsigned i = 0; (i < sv_planes) && uv; ++i)
1976 {
1977 if (const bvector_type* bv = sv.get_slice(i); bv && !(uv & (1u << i)))
1978 bv_out -= *bv;
1979 } // for i
1980}
1981
1982//----------------------------------------------------------------------------
1983
1984template<typename SV, unsigned S_FACTOR>
1986 const SV& sv,
1987 value_type val,
1988 bvector_type& bv_out)
1989{
1990 // this is not very optimal but should be good enough
1991 find_gt_horizontal(sv, val, bv_out, true /*NULL correction */);
1992}
1993
1994//----------------------------------------------------------------------------
1995
1996template<typename SV, unsigned S_FACTOR>
1998 const SV& sv,
1999 value_type val,
2000 bvector_type& bv_out)
2001{
2002 if constexpr (std::is_signed<value_type>::value)
2003 {
2004 if (val == std::numeric_limits<int>::min())
2005 {
2006 bvector_type bv_min;
2007 find_eq(sv, val, bv_min);
2008 find_gt_horizontal(sv, val, bv_out, true /*NULL correction */);
2009 bv_out.merge(bv_min);
2010 }
2011 else
2012 {
2013 --val;
2014 find_gt_horizontal(sv, val, bv_out, true /*NULL correction */);
2015 }
2016 }
2017 else // unsigned
2018 {
2019 if (val)
2020 {
2021 --val;
2022 find_gt_horizontal(sv, val, bv_out, true /*NULL correction */);
2023 }
2024 else // val == 0
2025 {
2026 // result set is ALL elements minus possible NULL values
2027 bv_out.set_range(0, sv.size()-1);
2028 correct_nulls(sv, bv_out);
2029 }
2030 }
2031}
2032
2033//----------------------------------------------------------------------------
2034
2035template<typename SV, unsigned S_FACTOR>
2037 const SV& sv,
2038 value_type val,
2039 bvector_type& bv_out)
2040{
2041 find_ge(sv, val, bv_out);
2042 invert(sv, bv_out);
2043}
2044
2045//----------------------------------------------------------------------------
2046
2047template<typename SV, unsigned S_FACTOR>
2049 value_type val,
2050 bvector_type& bv_out)
2051{
2052 find_gt(sv, val, bv_out);
2053 invert(sv, bv_out);
2054}
2055
2056//----------------------------------------------------------------------------
2057
2058template<typename SV, unsigned S_FACTOR>
2060 value_type from, value_type to,
2061 bvector_type& bv_out)
2062{
2063 if (to < from)
2064 bm::xor_swap(from, to);
2065
2066 find_ge(sv, from, bv_out);
2067
2068 bvector_type bv_gt;
2069 find_gt(sv, to, bv_gt);
2070 bv_out.bit_sub(bv_gt);
2071}
2072
2073//----------------------------------------------------------------------------
2074
2075
2076template<typename SV, unsigned S_FACTOR>
2078 value_type value,
2079 bvector_type& bv_out,
2080 bool null_correct)
2081{
2082 unsigned char blist[64];
2083 unsigned bit_count_v;
2084
2085 if (sv.empty())
2086 {
2087 bv_out.clear();
2088 return; // nothing to do
2089 }
2090
2091 if (!value)
2092 {
2093 bvector_type bv_zero;
2094 find_zero(sv, bv_zero);
2095 auto sz = sv.size();
2096 bv_out.set_range(0, sz-1);
2097 bv_out.bit_sub(bv_zero); // all but zero
2098
2099 if constexpr (std::is_signed<value_type>::value)
2100 {
2101 if (const bvector_type* bv_sign = sv.get_slice(0)) // sign bvector
2102 bv_out.bit_sub(*bv_sign); // all but negatives
2103 }
2104 if (null_correct)
2105 correct_nulls(sv, bv_out);
2106 return;
2107 }
2108 bv_out.clear();
2109
2110 bvector_type gtz_bv; // all >= 0
2111
2112 if constexpr (std::is_signed<value_type>::value)
2113 {
2114 find_positive(sv, gtz_bv); // all positives are greater than all negs
2115 if (value == -1)
2116 {
2117 bv_out.swap(gtz_bv);
2118 if (null_correct)
2119 correct_nulls(sv, bv_out);
2120 return;
2121 }
2122 if (value == -2)
2123 {
2124 find_eq(sv, -1, bv_out);
2125 bv_out.bit_or(gtz_bv);
2126 if (null_correct)
2127 correct_nulls(sv, bv_out);
2128 return;
2129 }
2130
2131 auto uvalue = SV::s2u(value + bool(value < 0)); // turns it int LE expression
2132 uvalue &= ~1u; // drop the negative bit
2133 BM_ASSERT(uvalue);
2134 bit_count_v = bm::bitscan(uvalue, blist);
2135 }
2136 else
2137 bit_count_v = bm::bitscan(value, blist);
2138
2139 BM_ASSERT(bit_count_v);
2140
2141
2142 // aggregate all top bit-slices (surely greater)
2143 // TODO: use aggregator function
2144 bvector_type top_or_bv;
2145
2146 unsigned total_planes = sv.effective_slices();
2147 const bvector_type* bv_sign = sv.get_slice(0); (void)bv_sign;
2148
2149 agg_.reset();
2150 if constexpr (std::is_signed<value_type>::value)
2151 {
2152 if (value < 0)
2153 {
2154 if (!bv_sign) // no negatives at all
2155 {
2156 bv_out.swap(gtz_bv); // return all >= 0
2157 if (null_correct)
2158 correct_nulls(sv, bv_out);
2159 return;
2160 }
2161 aggregate_AND_OR_slices(top_or_bv, *bv_sign, sv, blist[bit_count_v-1]+1, total_planes);
2162 }
2163 else // value > 0
2164 {
2165 aggregate_OR_slices(top_or_bv, sv, blist[bit_count_v-1]+1, total_planes);
2166 }
2167 }
2168 else
2169 {
2170 if (total_planes < unsigned(blist[bit_count_v-1]+1))
2171 return; // number is greater than anything in this vector (empty set)
2172 aggregate_OR_slices(top_or_bv, sv, blist[bit_count_v-1]+1, total_planes);
2173 }
2174 agg_.reset();
2175
2176 bv_out.merge(top_or_bv);
2177
2178 // TODO: optimize FULL blocks
2179
2180 bvector_type and_eq_bv; // AND accum
2181 bool first = true; // flag for initial assignment
2182
2183 // GT search
2184 for (int i = int(bit_count_v)-1; i >= 0; --i)
2185 {
2186 int slice_idx = blist[i];
2187 if (slice_idx == gt_slice_limit()) // last plane
2188 break;
2189 const bvector_type* bv_base_plane = sv.get_slice(unsigned(slice_idx));
2190 if (!bv_base_plane)
2191 break;
2192 if (first)
2193 {
2194 first = false;
2195 if constexpr (std::is_signed<value_type>::value)
2196 {
2197 if (value < 0)
2198 and_eq_bv.bit_and(*bv_base_plane, *bv_sign); // use negatives for AND mask
2199 else // value > 0
2200 and_eq_bv.bit_and(*bv_base_plane, gtz_bv);
2201 }
2202 else // unsigned
2203 and_eq_bv = *bv_base_plane; // initial assignment
2204 }
2205 else
2206 and_eq_bv.bit_and(*bv_base_plane); // AND base to accumulator
2207
2208 int next_slice_idx = 0;
2209 if (i)
2210 {
2211 next_slice_idx = blist[i-1];
2212 if (slice_idx-1 == next_slice_idx)
2213 continue;
2214 ++next_slice_idx;
2215 }
2216
2217 // AND-OR the mid-planes
2218 //
2219 for (int j = slice_idx-1; j >= next_slice_idx; --j)
2220 {
2221 if constexpr (std::is_signed<value_type>::value)
2222 if (j == 0) // sign plane
2223 break; // do not process the sign plane at all
2224 if (const bvector_type* bv_sub_plane = sv.get_slice(unsigned(j)))
2225 bv_out.bit_or_and(and_eq_bv, *bv_sub_plane);
2226 } // for j
2227 } // for i
2228
2229 if constexpr (std::is_signed<value_type>::value)
2230 {
2231 if (value < 0)
2232 {
2233 // now we have negatives greter by abs value
2234 top_or_bv.set_range(0, sv.size()-1);
2235 top_or_bv.bit_sub(bv_out);
2236 bv_out.swap(top_or_bv);
2237 }
2238 else // value > 0
2239 {
2240 gtz_bv.resize(sv.size());
2241 gtz_bv.invert(); // now it is all < 0
2242 bv_out.bit_sub(gtz_bv); // exclude all negatives
2243 }
2244 }
2245
2246 decompress(sv, bv_out);
2247
2248 if (null_correct)
2249 {
2250 if constexpr (!std::is_signed<value_type>::value) // unsigned
2251 return; // NULL correction for positive values is not needed
2252 else // signed
2253 {
2254 if (value < 0)
2255 correct_nulls(sv, bv_out);
2256 }
2257 }
2258}
2259
2260//----------------------------------------------------------------------------
2261
2262template<typename SV, unsigned S_FACTOR>
2264 bvector_type& bv_target,
2265 const SV& sv,
2266 unsigned from, unsigned total_planes)
2267{
2268 for (unsigned i = from; i < total_planes; ++i)
2269 {
2270 if (const bvector_type* bv_slice = sv.get_slice(i))
2271 {
2272 BM_ASSERT(bv_slice != sv.get_null_bvector());
2273 agg_.add(bv_slice);
2274 }
2275 }
2276 agg_.combine_or(bv_target);
2277}
2278
2279//----------------------------------------------------------------------------
2280
2281template<typename SV, unsigned S_FACTOR>
2283 const bvector_type& bv_mask,
2284 const SV& sv,
2285 unsigned from, unsigned total_planes)
2286{
2287 for (unsigned i = from; i < total_planes; ++i)
2288 {
2289 if (const bvector_type* bv_slice = sv.get_slice(i))
2290 {
2291 BM_ASSERT(bv_slice != sv.get_null_bvector());
2292 bv_target.bit_or_and(bv_mask, *bv_slice);
2293 }
2294 }
2295}
2296
2297//----------------------------------------------------------------------------
2298
2299template<typename SV, unsigned S_FACTOR>
2301 const typename SV::value_type* str,
2302 typename SV::bvector_type& bv_out)
2303{
2304 return find_eq_str_impl(sv, str, bv_out, false);
2305}
2306
2307
2308//----------------------------------------------------------------------------
2309
2310template<typename SV, unsigned S_FACTOR>
2312 const typename SV::value_type* str,
2313 typename SV::size_type& pos)
2314{
2315 BM_ASSERT(bound_sv_);
2316 return this->find_eq_str(*bound_sv_, str, pos);
2317}
2318
2319//----------------------------------------------------------------------------
2320
2321template<typename SV, unsigned S_FACTOR>
2323 const SV& sv,
2324 const typename SV::value_type* str,
2325 typename SV::size_type& pos)
2326{
2327 bool found = false;
2328 if (sv.empty())
2329 return found;
2330 if (*str)
2331 {
2332 bool remaped = false;
2333 if constexpr (SV::is_remap_support::value) // test remapping trait
2334 {
2335 if (sv.is_remap() && str != remap_value_vect_.data())
2336 {
2337 auto str_len = sv.effective_vector_max();
2338 remap_value_vect_.resize(str_len);
2339 remaped = sv.remap_tosv(
2340 remap_value_vect_.data(), str_len, str);
2341 if (!remaped)
2342 return remaped;
2343 str = remap_value_vect_.data();
2344 }
2345 }
2346
2347 size_t in_len = ::strlen(str);
2348 size_type found_pos;
2349 found = find_first_eq(sv, str, in_len, found_pos, remaped, ~0u);
2350 if (found)
2351 {
2352 pos = found_pos;
2353 if constexpr (SV::is_rsc_support::value) // test rank/select trait
2354 {
2355 if constexpr (sv.is_compressed()) // if compressed vector - need rank translation
2356 found = sv.find_rank(found_pos + 1, pos);
2357 }
2358 }
2359 }
2360 else // search for zero(empty str) value
2361 {
2362 // TODO: implement optimized version which would work witout temp vector
2363 typename SV::bvector_type bv_zero;
2364 find_zero(sv, bv_zero);
2365 found = bv_zero.find(pos);
2366 }
2367 return found;
2368}
2369
2370//----------------------------------------------------------------------------
2371
2372template<typename SV, unsigned S_FACTOR>
2374 const typename SV::value_type* str,
2375 typename SV::bvector_type& bv_out)
2376{
2377 BM_ASSERT(bound_sv_);
2378 return find_eq_str(*bound_sv_, str, bv_out);
2379}
2380
2381//----------------------------------------------------------------------------
2382
2383template<typename SV, unsigned S_FACTOR>
2385 const SV& sv,
2386 const typename SV::value_type* str,
2387 typename SV::bvector_type& bv_out)
2388{
2389 return find_eq_str_impl(sv, str, bv_out, true);
2390}
2391
2392//----------------------------------------------------------------------------
2393
2394template<typename SV, unsigned S_FACTOR>
2396 remap_vector_type& remap_vect_target,
2397 const typename SV::value_type* str,
2398 const SV& sv)
2399{
2400 auto str_len = sv.effective_vector_max();
2401 remap_vect_target.resize(str_len);
2402 bool remaped = sv.remap_tosv(remap_vect_target.data(), str_len, str);
2403 return remaped;
2404}
2405
2406//----------------------------------------------------------------------------
2407
2408template<typename SV, unsigned S_FACTOR>
2410 const SV& sv,
2411 const typename SV::value_type* str,
2412 typename SV::bvector_type& bv_out,
2413 bool prefix_sub)
2414{
2415 bool found = false;
2416 bv_out.clear(true);
2417 if (sv.empty())
2418 return false; // nothing to do
2419 if constexpr (SV::is_rsc_support::value) // test rank/select trait
2420 {
2421 // search in RS compressed string vectors is not implemented
2422 BM_ASSERT(0);
2423 }
2424
2425 if (*str)
2426 {
2427 if constexpr (SV::is_remap_support::value) // test remapping trait
2428 {
2429 if (sv.is_remap() && (str != remap_value_vect_.data()))
2430 {
2431 bool remaped = remap_tosv(remap_value_vect_, str, sv);
2432 if (!remaped)
2433 return false; // not found because
2434 str = remap_value_vect_.data();
2435 }
2436 }
2437 /// aggregator search
2438 agg_.reset();
2439
2440 const unsigned common_prefix_len = 0;
2441 found = prepare_and_sub_aggregator(sv, str, common_prefix_len,
2442 prefix_sub);
2443 if (!found)
2444 return found;
2445 found = agg_.combine_and_sub(bv_out);
2446
2447 agg_.reset();
2448 }
2449 else // search for zero(empty str) value
2450 {
2451 find_zero(sv, bv_out);
2452 found = bv_out.any();
2453 }
2454 return found;
2455
2456}
2457
2458//----------------------------------------------------------------------------
2459
2460template<typename SV, unsigned S_FACTOR> template<class TPipe>
2462{
2463 if (pipe.bv_and_mask_)
2464 {
2465 size_type first, last;
2466 bool any = pipe.bv_and_mask_->find_range(first, last);
2467 if (!any)
2468 return;
2469 agg_.set_range_hint(first, last);
2470 }
2471 agg_.combine_and_sub(pipe.agg_pipe_);
2472 agg_.reset_range_hint();
2473}
2474
2475//----------------------------------------------------------------------------
2476
2477template<typename SV, unsigned S_FACTOR>
2478template<bool BOUND>
2480 const SV& sv,
2481 const typename SV::value_type* str,
2482 size_t in_len,
2483 bool remaped,
2484 typename SV::size_type& pos)
2485{
2486 bool found = false;
2487 if (sv.empty())
2488 return found;
2489
2490 unsigned prefix_len = ~0u;
2491
2492 if (in_len)
2493 {
2495
2496 size_type l, r;
2497 size_type found_pos;
2498
2499 if constexpr (BOUND)
2500 {
2501 found = range_idx_.bfind_range(str, in_len, l, r);
2502 if (!found)
2503 return found;
2504
2505 prefix_len =
2506 (unsigned) range_idx_.common_prefix_length(str, in_len, l, r);
2507
2508 if ((l == r) && (in_len == prefix_len))
2509 {
2510 range_idx_.recalc_range(str, l, r);
2511 pos = l;
2512 return found;
2513 }
2514
2515 range_idx_.recalc_range(str, l, r);
2516 set_search_range(l, r); // r := r-1 (may happen here) [l..r] interval
2517
2518 BM_ASSERT(this->compare_str<false>(sv, l, str) <= 0);
2519 // bad assert, because of the r = r-1 correction in recalc_range()
2520 //BM_ASSERT(this->compare_str<false>(sv, r, str) >= 0);
2521
2522 }
2523 else
2524 {
2525 // narrow down the search
2526 const unsigned min_distance_cutoff = bm::gap_max_bits + bm::gap_max_bits / 2;
2527 size_type dist;
2528 l = 0; r = sv.size()-1;
2529
2530 // binary search to narrow down the search window
2531 while (l <= r)
2532 {
2533 dist = r - l;
2534 if (dist < min_distance_cutoff)
2535 {
2536 // we are in an narrow <2 blocks window, but still may be in two
2537 // different neighboring blocks, lets try to narrow
2538 // to exactly one block
2539
2540 size_type nb_l = (l >> bm::set_block_shift);
2541 size_type nb_r = (r >> bm::set_block_shift);
2542 if (nb_l != nb_r)
2543 {
2544 size_type mid = nb_r * bm::gap_max_bits;
2545 if (mid < r)
2546 {
2547 int cmp = this->compare_str<BOUND>(sv, mid, str);
2548 if (cmp < 0) // mid < str
2549 l = mid;
2550 else
2551 r = mid-(cmp!=0); // branchless if (cmp==0) r=mid;
2552 BM_ASSERT(l < r);
2553 }
2554 nb_l = unsigned(l >> bm::set_block_shift);
2555 nb_r = unsigned(r >> bm::set_block_shift);
2556 }
2557
2558 if (nb_l == nb_r)
2559 {
2560 size_type max_nb = sv.size() >> bm::set_block_shift;
2561 if (nb_l != max_nb)
2562 {
2563 // linear scan to identify the sub-range
2565 for (unsigned i = 0; i < (sub_bfind_block_cnt-1);
2566 ++i, mid += sub_block_l1_size)
2567 {
2568 int cmp = this->compare_str<BOUND>(sv, mid, str);
2569 if (cmp < 0)
2570 l = mid;
2571 else
2572 {
2573 r = mid;
2574 break;
2575 }
2576 } // for i
2577 }
2578 set_search_range(l, r);
2579 break;
2580 }
2581 }
2582
2583 typename SV::size_type mid = dist/2+l;
2584 size_type nb = (mid >> bm::set_block_shift);
2585 mid = nb * bm::gap_max_bits;
2586 int cmp;
2587 if (mid <= l)
2588 {
2589 if (nb == 0 && r > bm::gap_max_bits)
2590 mid = bm::gap_max_bits;
2591 else
2592 {
2593 mid = dist / 2 + l;
2594 cmp = this->compare_str<false>(sv, mid, str);
2595 goto l1;
2596 }
2597 }
2598 BM_ASSERT(mid > l);
2599 cmp = this->compare_str<BOUND>(sv, mid, str);
2600 l1:
2601 if (cmp == 0)
2602 {
2603 found_pos = mid;
2604 //found = true;
2605 set_search_range(l, mid);
2606 break;
2607 }
2608 if (cmp < 0)
2609 l = mid+1;
2610 else
2611 r = mid-1;
2612 } // while
2613 }
2614
2615 // use linear search (range is set)
2616 found = find_first_eq(sv, str, in_len, found_pos, remaped, prefix_len);
2617 if (found)
2618 {
2619 pos = found_pos;
2620 if constexpr (SV::is_compressed()) // if compressed vector - need rank translation
2621 found = sv.find_rank(found_pos + 1, pos);
2622 }
2624 }
2625 else // search for zero value
2626 {
2627 // TODO: implement optimized version which would work without temp vector
2628 typename SV::bvector_type bv_zero;
2629 find_zero(sv, bv_zero);
2630 found = bv_zero.find(pos);
2631 }
2632 return found;
2633}
2634
2635//----------------------------------------------------------------------------
2636
2637template<typename SV, unsigned S_FACTOR>
2639 const value_type* str, size_type& pos)
2640{
2641 size_t len = ::strlen(str);
2642 effective_str_max_ = sv.effective_max_str();
2643 if (len > effective_str_max_)
2644 return false; // impossible value
2645
2647
2648 bool remaped = false;
2649 if constexpr (SV::is_remap_support::value)
2650 {
2651 if (sv.is_remap())
2652 {
2653 remap_value_vect_.resize_no_copy(len);
2654 remaped = sv.remap_tosv(remap_value_vect_.data(),
2655 effective_str_max_, str);
2656 if (!remaped)
2657 return remaped;
2658 }
2659 }
2660 return bfind_eq_str_impl<false>(sv, str, len, remaped, pos);
2661}
2662
2663//----------------------------------------------------------------------------
2664
2665template<typename SV, unsigned S_FACTOR>
2667 const typename SV::value_type* str,
2668 typename SV::size_type& pos)
2669{
2670 BM_ASSERT(bound_sv_); // this function needs prior bind()
2671 size_t len = ::strlen(str);
2672 if (len > effective_str_max_)
2673 return false; // impossible value
2674 bool remaped = false;
2675 if constexpr (SV::is_remap_support::value)
2676 {
2677 if (bound_sv_->is_remap())
2678 {
2679 remaped = bound_sv_->remap_tosv(remap_value_vect_.data(),
2680 effective_str_max_, str);
2681 if (!remaped)
2682 return remaped;
2683 }
2684 }
2685 return bfind_eq_str_impl<true>(*bound_sv_, str, len, remaped, pos);
2686}
2687
2688//----------------------------------------------------------------------------
2689
2690template<typename SV, unsigned S_FACTOR>
2692 const value_type* str, size_t in_len, size_type& pos)
2693{
2694 BM_ASSERT(str);
2695 BM_ASSERT(bound_sv_);
2696
2697 if (in_len > effective_str_max_)
2698 return false; // impossible value
2699
2700 value_type* s = value_vect_.data(); // copy to temp buffer, put zero end
2701
2702 bool remaped = false;
2703 // test search pre-condition based on remap tables
2704 if constexpr (SV::is_remap_support::value)
2705 {
2706 if (bound_sv_->is_remap())
2707 {
2708 remaped = bound_sv_->remap_n_tosv_2way(
2709 remap_value_vect_.data(),
2710 s,
2711 effective_str_max_,
2712 str,
2713 in_len);
2714 if (!remaped)
2715 return remaped;
2716 }
2717 }
2718 if (!remaped) // copy string, make sure it is zero terminated
2719 {
2720 for (size_t i = 0; i < in_len && *str; ++i)
2721 s[i] = str[i];
2722 s[in_len] = value_type(0);
2723 }
2724 return bfind_eq_str_impl<true>(*bound_sv_, s, in_len, remaped, pos);
2725}
2726
2727//----------------------------------------------------------------------------
2728
2729template<typename SV, unsigned S_FACTOR>
2731 const SV& sv,
2732 const typename SV::value_type val,
2733 typename SV::size_type& pos)
2734{
2735 int cmp;
2736 size_type l = 0, r = sv.size();
2737 if (l == r) // empty vector
2738 {
2739 pos = 0;
2740 return false;
2741 }
2742 --r;
2743
2744 // check initial boundary conditions if insert point is at tail/head
2745 cmp = this->compare(sv, l, val); // left (0) boundary check
2746 if (cmp > 0) // vect[x] > str
2747 {
2748 pos = 0;
2749 return false;
2750 }
2751 if (cmp == 0)
2752 {
2753 pos = 0;
2754 return true;
2755 }
2756 cmp = this->compare(sv, r, val); // right(size-1) boundary check
2757 if (cmp == 0)
2758 {
2759 pos = r;
2760 // back-scan to rewind all duplicates
2761 // TODO: adapt one-sided binary search to traverse large platos
2762 for (; r >= 0; --r)
2763 {
2764 cmp = this->compare(sv, r, val);
2765 if (cmp != 0)
2766 return true;
2767 pos = r;
2768 } // for i
2769 return true;
2770 }
2771 if (cmp < 0) // vect[x] < str
2772 {
2773 pos = r+1;
2774 return false;
2775 }
2776
2777 size_type dist = r - l;
2778 if (dist < linear_cutoff1)
2779 {
2780 for (; l <= r; ++l)
2781 {
2782 cmp = this->compare(sv, l, val);
2783 if (cmp == 0)
2784 {
2785 pos = l;
2786 return true;
2787 }
2788 if (cmp > 0)
2789 {
2790 pos = l;
2791 return false;
2792 }
2793 } // for
2794 }
2795
2796 while (l <= r)
2797 {
2798 size_type mid = (r-l)/2+l;
2799 cmp = this->compare(sv, mid, val);
2800 if (cmp == 0)
2801 {
2802 pos = mid;
2803 // back-scan to rewind all duplicates
2804 for (size_type i = mid-1; i >= 0; --i)
2805 {
2806 cmp = this->compare(sv, i, val);
2807 if (cmp != 0)
2808 return true;
2809 pos = i;
2810 } // for i
2811 pos = 0;
2812 return true;
2813 }
2814 if (cmp < 0)
2815 l = mid+1;
2816 else
2817 r = mid-1;
2818
2819 dist = r - l;
2820 if (dist < linear_cutoff2) // do linear scan here
2821 {
2822 typename SV::const_iterator it(&sv, l);
2823 for (; it.valid(); ++it, ++l)
2824 {
2825 typename SV::value_type sv_value = it.value();
2826 if (sv_value == val)
2827 {
2828 pos = l;
2829 return true;
2830 }
2831 if (sv_value > val) // vect[x] > val
2832 {
2833 pos = l;
2834 return false;
2835 }
2836 } // for it
2837 BM_ASSERT(0);
2838 pos = l;
2839 return false;
2840 }
2841 } // while
2842
2843 BM_ASSERT(0);
2844 return false;
2845}
2846
2847
2848//----------------------------------------------------------------------------
2849
2850template<typename SV, unsigned S_FACTOR>
2852 const SV& sv,
2853 const typename SV::value_type* str,
2854 typename SV::size_type& pos)
2855{
2856 int cmp;
2857 size_type l = 0, r = sv.size();
2858
2859 if (l == r) // empty vector
2860 {
2861 pos = 0;
2862 return false;
2863 }
2864 --r;
2865
2866 // check initial boundary conditions if insert point is at tail/head
2867 cmp = this->compare_str<false>(sv, l, str); // left (0) boundary check
2868 if (cmp > 0) // vect[x] > str
2869 {
2870 pos = 0;
2871 return false;
2872 }
2873 if (cmp == 0)
2874 {
2875 pos = 0;
2876 return true;
2877 }
2878 cmp = this->compare_str<false>(sv, r, str); // right(size-1) boundary check
2879 if (cmp == 0)
2880 {
2881 pos = r;
2882 // back-scan to rewind all duplicates
2883 // TODO: adapt one-sided binary search to traverse large platos
2884 for (; r >= 0; --r)
2885 {
2886 cmp = this->compare_str<false>(sv, r, str);
2887 if (cmp != 0)
2888 return true;
2889 pos = r;
2890 } // for i
2891 return true;
2892 }
2893 if (cmp < 0) // vect[x] < str
2894 {
2895 pos = r+1;
2896 return false;
2897 }
2898
2899 size_type dist = r - l;
2900 if (dist < linear_cutoff1)
2901 {
2902 for (; l <= r; ++l)
2903 {
2904 cmp = this->compare_str<false>(sv, l, str);
2905 if (cmp == 0)
2906 {
2907 pos = l;
2908 return true;
2909 }
2910 if (cmp > 0)
2911 {
2912 pos = l;
2913 return false;
2914 }
2915 } // for
2916 }
2917 while (l <= r)
2918 {
2919 size_type mid = (r-l)/2+l;
2920 cmp = this->compare_str<false>(sv, mid, str);
2921 if (cmp == 0)
2922 {
2923 pos = mid;
2924 // back-scan to rewind all duplicates
2925 for (size_type i = mid-1; i >= 0; --i)
2926 {
2927 cmp = this->compare_str<false>(sv, i, str);
2928 if (cmp != 0)
2929 return true;
2930 pos = i;
2931 } // for i
2932 pos = 0;
2933 return true;
2934 }
2935 if (cmp < 0)
2936 l = mid+1;
2937 else
2938 r = mid-1;
2939
2940 dist = r - l;
2941 if (dist < linear_cutoff2) // do linear scan here
2942 {
2943 if (!hmatr_.is_init())
2944 {
2945 unsigned max_str = (unsigned)sv.effective_max_str();
2946 hmatr_.resize(linear_cutoff2, max_str+1, false);
2947 }
2948
2949 dist = sv.decode(hmatr_, l, dist+1);
2950 for (unsigned i = 0; i < dist; ++i, ++l)
2951 {
2952 const typename SV::value_type* hm_str = hmatr_.row(i);
2953 cmp = ::strcmp(hm_str, str);
2954 if (cmp == 0)
2955 {
2956 pos = l;
2957 return true;
2958 }
2959 if (cmp > 0) // vect[x] > str
2960 {
2961 pos = l;
2962 return false;
2963 }
2964 }
2965 cmp = this->compare_str<false>(sv, l, str);
2966 if (cmp > 0) // vect[x] > str
2967 {
2968 pos = l;
2969 return false;
2970 }
2971 BM_ASSERT(0);
2972 pos = l;
2973 return false;
2974 }
2975 } // while
2976
2977 BM_ASSERT(0);
2978 return false;
2979}
2980
2981
2982//----------------------------------------------------------------------------
2983
2984template<typename SV, unsigned S_FACTOR>
2985template <bool BOUND>
2987 const SV& sv,
2988 size_type idx,
2989 const value_type* BMRESTRICT str
2990 ) const BMNOEXCEPT
2991{
2992#if 0
2993 if constexpr (BOUND)
2994 {
2995 BM_ASSERT(bound_sv_ == &sv);
2996
2997 size_type nb = (idx >> bm::set_block_shift);
2998 size_type nbit = (idx & bm::set_block_mask);
2999 int res = 0;
3000 const value_type* BMRESTRICT s0;
3001 /*
3002 if (!nbit) // access to sentinel, first block element
3003 s0 = block_l0_cache_.row(nb);
3004 else
3005 {
3006 BM_ASSERT(nbit % sub_block_l1_size == 0);
3007 size_type k =
3008 (nb * (sub_bfind_block_cnt-1)) + (nbit / sub_block_l1_size - 1);
3009 s0 = block_l1_cache_.row(k);
3010 }
3011 */
3012 // strcmp
3013 /*
3014 if constexpr (sizeof(void*) == 8) // TODO: improve for WASM
3015 {
3016 for (unsigned i = 0; true; i+=sizeof(bm::id64_t))
3017 {
3018 bm::id64_t o64, v64;
3019 ::memcpy(&o64, str+i, sizeof(o64));
3020 ::memcpy(&v64, s0+i, sizeof(v64));
3021
3022 if (o64 != v64 || bm::has_zero_byte_u64(o64)
3023 || bm::has_zero_byte_u64(v64))
3024 {
3025 do
3026 {
3027 char octet = str[i]; char value = s0[i];
3028 res = (value > octet) - (value < octet);
3029 if (res || !octet)
3030 return res;
3031 ++i;
3032 } while(1);
3033 }
3034 } // for i
3035 }
3036 else */
3037 {
3038 for (unsigned i = 0; true; ++i)
3039 {
3040 char octet = str[i]; char value = s0[i];
3041 res = (value > octet) - (value < octet);
3042 if (res || !octet)
3043 break;
3044 } // for i
3045 }
3046
3047 return res;
3048 }
3049 else
3050#endif
3051 {
3052 return sv.compare(idx, str);
3053 }
3054}
3055
3056//----------------------------------------------------------------------------
3057
3058template<typename SV, unsigned S_FACTOR>
3060 const SV& sv,
3061 size_type idx,
3062 const value_type val) BMNOEXCEPT
3063{
3064 // TODO: implement sentinel elements cache (similar to compare_str())
3065 return sv.compare(idx, val);
3066}
3067
3068//----------------------------------------------------------------------------
3069
3070template<typename SV, unsigned S_FACTOR>
3072 const SV& sv,
3073 typename SV::value_type value,
3074 typename SV::bvector_type& bv_out)
3075{
3076 if (sv.empty())
3077 {
3078 bv_out.clear();
3079 return; // nothing to do
3080 }
3081 if (!value)
3082 {
3083 find_zero(sv, bv_out);
3084 return;
3085 }
3086
3087 find_eq_with_nulls(sv, value, bv_out, 0);
3088
3089 decompress(sv, bv_out);
3090 correct_nulls(sv, bv_out);
3091}
3092
3093//----------------------------------------------------------------------------
3094
3095template<typename SV, unsigned S_FACTOR> template<typename BII>
3097 const SV& sv, value_type value, BII bi)
3098{
3099 static_assert(!SV::is_compressed(), "BM:find_eq on RSC vector not implemented");
3100
3101 if (sv.empty())
3102 return; // nothing to do
3103 if (!value)
3104 {
3105 // TODO: better implementation for 0 value seach
3106 typename SV::bvector_type bv_out;
3107 find_zero(sv, bv_out);
3108 typename SV::bvector_type::enumerator en = bv_out.get_enumerator(0);
3109 for (; en.valid(); ++en)
3110 *bi = *en;
3111 return;
3112 }
3113
3114 // search for value with aggregator
3115 //
3116 agg_.reset();
3117
3118 bool found = prepare_and_sub_aggregator(sv, value);
3119 if (!found)
3120 return; // impossible value
3121
3122 found = agg_.combine_and_sub_bi(bi);
3123 agg_.reset();
3124}
3125
3126
3127//----------------------------------------------------------------------------
3128
3129template<typename SV, unsigned S_FACTOR>
3131 const SV& sv,
3132 typename SV::value_type value,
3133 typename SV::size_type& pos)
3134{
3135 if (!value) // zero value - special case
3136 {
3137 bvector_type bv_zero;
3138 find_eq(sv, value, bv_zero);
3139 bool found = bv_zero.find(pos);
3140 return found;
3141 }
3142
3143 size_type found_pos;
3144 bool found = find_first_eq(sv, value, found_pos);
3145 if (found)
3146 {
3147 pos = found_pos;
3148 if (bm::conditional<SV::is_rsc_support::value>::test()) // test rank/select trait
3149 {
3150 if constexpr (SV::is_compressed()) // if compressed vector - need rank translation
3151 found = sv.find_rank(found_pos + 1, pos);
3152 }
3153 }
3154 return found;
3155}
3156
3157//----------------------------------------------------------------------------
3158
3159template<typename SV, unsigned S_FACTOR>
3161 const SV& sv,
3162 typename SV::bvector_type& bv_out)
3163{
3164 agg_.reset(); // in case if previous scan was interrupted
3165 auto sz = sv.effective_slices(); // sv.slices();
3166 for (unsigned i = 0; i < sz; ++i)
3167 agg_.add(sv.get_slice(i));
3168 agg_.combine_or(bv_out);
3169 agg_.reset();
3170}
3171
3172//----------------------------------------------------------------------------
3173
3174template<typename SV, unsigned S_FACTOR>
3176 const SV& sv,
3177 typename SV::bvector_type& bv_out)
3178{
3179 BM_ASSERT(sv.size());
3180 bv_out.set_range(0, sv.size()-1); // select all elements
3181 if constexpr (std::is_signed<value_type>::value)
3182 {
3183 if (const bvector_type* bv_sign = sv.get_slice(0)) // sign bvector
3184 bv_out.bit_sub(*bv_sign); // all MINUS negatives
3185 }
3186}
3187
3188//----------------------------------------------------------------------------
3189
3190template<typename SV, unsigned S_FACTOR>
3192 const SV& sv,
3193 typename SV::bvector_type& bv_out)
3194{
3195 if constexpr (SV::is_compressed())
3196 {
3197 const bvector_type* bv_non_null = sv.get_null_bvector();
3198 BM_ASSERT(bv_non_null);
3199
3200 // TODO: implement faster decompressor for small result-sets
3201 rank_compr_.decompress(bv_tmp_, *bv_non_null, bv_out);
3202 bv_out.swap(bv_tmp_);
3203 }
3204 else
3205 {
3206 (void)sv; (void)bv_out;
3207 }
3208}
3209
3210//----------------------------------------------------------------------------
3211
3212template<typename SV, unsigned S_FACTOR>
3215{
3216 BM_ASSERT(from <= to);
3217 mask_from_ = from; mask_to_ = to; mask_set_ = true;
3218}
3219
3220//----------------------------------------------------------------------------
3221
3222template<typename SV, unsigned S_FACTOR>
3227
3228
3229//----------------------------------------------------------------------------
3230// sparse_vector_scanner<SV, S_FACTOR>::pipeline<Opt>
3231//----------------------------------------------------------------------------
3232
3233template<typename SV, unsigned S_FACTOR> template<class Opt>
3234void
3236 const bvector_type* bv_mask) BMNOEXCEPT
3237{
3238 static_assert(Opt::is_masks(),
3239 "BM: Search masking needs to be enabled in template parameter options before function call. see bm::agg_run_options<> ");
3240 bv_and_mask_ = bv_mask;
3241}
3242
3243//----------------------------------------------------------------------------
3244
3245template<typename SV, unsigned S_FACTOR> template<class Opt>
3246void
3248 const typename SV::value_type* str)
3249{
3250 BM_ASSERT(str);
3251
3252 typename aggregator_type::arg_groups* arg = agg_pipe_.add();
3253 BM_ASSERT(arg);
3254
3255 if constexpr (SV::is_remap_support::value) // test remapping trait
3256 {
3257 if (sv_.is_remap() && (str != remap_value_vect_.data()))
3258 {
3259 bool remaped = remap_tosv(remap_value_vect_, str, sv_);
3260 if (!remaped)
3261 return; // not found (leaves the arg(arg_group) empty
3262 str = remap_value_vect_.data();
3263 }
3264 }
3265 int len = 0;
3266 for (; str[len] != 0; ++len)
3267 {}
3268 BM_ASSERT(len);
3269
3270 if constexpr(Opt::is_masks())
3271 arg->add(bv_and_mask_, 0);
3272 arg->add(sv_.get_null_bvector(), 0);
3273
3274 // use reverse order (faster for sorted arrays)
3275
3276 for (int octet_idx = len-1; octet_idx >= 0; --octet_idx)
3277 {
3278 //if (unsigned(octet_idx) < octet_start) // common prefix
3279 // break;
3280
3281 unsigned value = unsigned(str[octet_idx]) & 0xFF;
3282 BM_ASSERT(value != 0);
3283
3284 bm::id64_t planes_mask;
3285 #if (0)
3286 if (&sv == bound_sv_)
3287 planes_mask = vector_plane_masks_[unsigned(octet_idx)];
3288 else
3289 #endif
3290 planes_mask = sv_.get_slice_mask(unsigned(octet_idx));
3291
3292 if ((value & planes_mask) != value) // pre-screen for impossible values
3293 {
3294 arg->reset(); // reset is necessary to disable single plane AND
3295 return; // found non-existing plane
3296 }
3297
3299 *arg, sv_, octet_idx, planes_mask, value); // AND-SUB aggregator
3300 } // for octet_idx
3301
3302
3303 // add all vectors above string len to the SUB aggregation group
3304 //
3305 //if (prefix_sub)
3306 {
3307 unsigned plane_idx = unsigned(len * 8);
3308 // SUB group should not include not NULL bvector
3309 size_type planes = size_type(this->eff_slices_);
3310 for (; plane_idx < planes; ++plane_idx)
3311 {
3312 if (bvector_type_const_ptr bv = sv_.get_slice(plane_idx))
3313 arg->add(bv, 1); // agg to SUB group
3314 } // for
3315 }
3316
3317}
3318
3319//----------------------------------------------------------------------------
3320// sv_sample_index<SV>
3321//----------------------------------------------------------------------------
3322
3323template<typename SV>
3324void sv_sample_index<SV>::construct(const SV& sv, unsigned s_factor)
3325{
3326 BM_ASSERT(SV::is_str());
3327 s_factor_ = s_factor;
3328 sv_size_ = sv.size();
3329 if (!sv_size_)
3330 return;
3331
3332 // resize and init the cache matrix
3333 //
3334 auto effective_str_max = sv.effective_vector_max() + 1;
3335 size_type total_nb = (sv_size_ / bm::gap_max_bits) + 1;
3336 size_type idx_size = total_nb * s_factor + 1;
3337 s_cache_.init_resize(idx_size, effective_str_max);
3338 s_cache_.set_zero();
3339
3340 // build the index
3341 const size_type cols = size_type(s_cache_.cols());
3342 const size_type s_step = bm::gap_max_bits / s_factor;
3343 idx_size_ = 0;
3344 for(size_type i = 0; true; )
3345 {
3346 value_type* s_str = s_cache_.row(idx_size_);
3347 ++idx_size_;
3348 sv.get(i, s_str, cols);
3349
3350 if (i == sv_size_-1) // last element was aleady covered, break
3351 break;
3352 i += s_step;
3353 if (i >= sv_size_) // add the last sampled element
3354 {
3355 i = sv_size_-1;
3356 if (i)
3357 {
3358 s_str = s_cache_.row(idx_size_);
3359 ++idx_size_;
3360 sv.get(i, s_str, cols);
3361 }
3362 break;
3363 }
3364 } // for i
3365
3366 size_t min_len = 0;
3367 {
3368 const value_type* s = s_cache_.row(0);
3369 min_len = ::strlen(s);
3370 }
3371
3372 // find index duplicates, minimum key size, ...
3373 //
3374 idx_unique_ = true;
3375 const value_type* str_prev = s_cache_.row(0);
3376 for(size_type i = 1; i < idx_size_; ++i)
3377 {
3378 const value_type* str_curr = s_cache_.row(i);
3379 size_t curr_len = ::strlen(str_curr);
3380 if (curr_len < min_len)
3381 min_len = curr_len;
3382
3383 int cmp = SV::compare_str(str_prev, str_curr);
3384 BM_ASSERT(cmp <= 0);
3385 if (cmp == 0) // duplicate
3386 {
3387 idx_unique_ = false;
3388 break;
3389 }
3390 str_prev = str_curr;
3391 } // for i
3392
3393 min_key_len_ = min_len;
3394
3395}
3396
3397//----------------------------------------------------------------------------
3398
3399template<typename SV>
3401 size_t in_len,
3402 size_type& l,
3403 size_type& r) const BMNOEXCEPT
3404{
3405 const size_type linear_cutoff = 4;
3406 if (!idx_size_)
3407 return false;
3408 l = 0; r = idx_size_ - 1;
3409 int cmp;
3410
3411 size_t min_len = this->min_key_len_;
3412 if (in_len < min_len)
3413 min_len = in_len;
3414
3415 // check the left-right boundaries
3416 {
3417 const value_type* str = s_cache_.row(l);
3418 cmp = SV::compare_str(search_str, str, min_len);
3419 if (cmp < 0)
3420 return false;
3421
3422 str = s_cache_.row(r);
3423 cmp = SV::compare_str(search_str, str, min_len);
3424 if (cmp > 0)
3425 return false;
3426 }
3427
3428 while (l < r)
3429 {
3430 size_type dist = r - l;
3431 if (dist < linear_cutoff) // do linear scan here
3432 {
3433 for (size_type i = l+1; i < r; ++i)
3434 {
3435 const value_type* str_i = s_cache_.row(i);
3436 cmp = SV::compare_str(search_str, str_i, min_len);
3437 if (cmp > 0) // |----i-*--|----|
3438 { // |----*----|----|
3439 l = i;
3440 continue; // continue searching
3441 }
3442 /*
3443 if (cmp == 0) // |----*----|----|
3444 {
3445 l = r = i;
3446 return true;
3447 }
3448 */
3449 // |--*-i----|----|
3450 BM_ASSERT(i);
3451 r = i;
3452 break;
3453 } // for i
3454 return true;
3455 } // if linear scan
3456
3457 size_type mid = (r-l) / 2 + l;
3458 const value_type* str_m = s_cache_.row(mid);
3459 cmp = SV::compare_str(str_m, search_str, min_len);
3460 if (cmp <= 0) // str_m <= search_str
3461 l = mid;
3462 else // str_m > search_str
3463 r = mid;
3464 } // while
3465
3466 return true;
3467}
3468
3469//----------------------------------------------------------------------------
3470
3471template<typename SV>
3474 size_t in_len,
3475 size_type l,
3476 size_type r) const BMNOEXCEPT
3477{
3478 const value_type* str_l = s_cache_.row(l);
3479 const value_type* str_r = s_cache_.row(r);
3480
3481 size_t min_len = (in_len < min_key_len_) ? in_len : min_key_len_;
3482 size_type i = 0;
3483 if (min_len >= 4)
3484 {
3485 for (; i < min_len-3; i+=4)
3486 {
3487 unsigned i2, i1;
3488 ::memcpy(&i2, &str_l[i], sizeof(i2));
3489 ::memcpy(&i1, &str_r[i], sizeof(i1));
3491 bm::id64_t(i2) | (bm::id64_t(i1) << 32)));
3492 if (i1 != i2)
3493 break;
3494 ::memcpy(&i2, &str_s[i], sizeof(i2));
3496 bm::id64_t(i2) | (bm::id64_t(i1) << 32)));
3497 if (i1 != i2)
3498 break;
3499 } // for i
3500 }
3501
3502 for (; true; ++i)
3503 {
3504 auto ch1 = str_l[i]; auto ch2 = str_r[i];
3505 if (ch1 != ch2 || (!(ch1|ch2))) // chars not the same or both zero
3506 break;
3507 auto chs = str_s[i];
3508 if (ch1 != chs)
3509 break;
3510 } // for i
3511 return i;
3512}
3513
3514
3515//----------------------------------------------------------------------------
3516
3517template<typename SV>
3519 size_type& l,
3520 size_type& r) const BMNOEXCEPT
3521{
3522 BM_ASSERT(l <= r);
3523 BM_ASSERT(r < idx_size_);
3524
3525 // -1 correction here below is done to get it to the closed interval
3526 // [from..to] when possible, because it reduces search space
3527 // by one scan wave
3528
3529 const size_type s_step = bm::gap_max_bits / s_factor_;
3530 if (r == idx_size_-1) // last element
3531 {
3532 l *= s_step;
3533 if (l == r)
3534 {
3535 r = sv_size_-1;
3536 BM_ASSERT(l <= r);
3537 return;
3538 }
3539 r = sv_size_-1;
3540 if (l > r)
3541 l = 0;
3542 }
3543 else
3544 {
3545 if (l == r)
3546 {
3547 l *= s_step;
3548 r = l + s_step-1;
3549 if (r >= sv_size_)
3550 r = sv_size_-1;
3551 }
3552 else
3553 {
3554 const value_type* str = s_cache_.row(r);
3555 l *= s_step;
3556 r *= s_step;
3557 int cmp = SV::compare_str(search_str, str);
3558 BM_ASSERT(cmp <= 0);
3559 if (cmp != 0)
3560 r -= (r && idx_unique_); // -1 correct
3561 else
3562 if (idx_unique_)
3563 {
3564 l = r;
3565 }
3566 }
3567 }
3568 BM_ASSERT(r <= sv_size_);
3569 BM_ASSERT(l <= r);
3570}
3571
3572//----------------------------------------------------------------------------
3573
3574} // namespace bm
3575
3576
3577#endif
Algorithms for fast aggregation of N bvectors.
Algorithms for bvector<> (main include).
Definitions(internal).
#define BMRESTRICT
Definition bmdef.h:203
#define BMNOEXCEPT
Definition bmdef.h:82
#define BM_ASSERT
Definition bmdef.h:139
#define BM_VECT_ALIGN
Definition bmdef.h:320
Sparse constainer sparse_vector<> for integer types using bit-transposition transform.
Algorithms for fast aggregation of a group of bit-vectors.
bm::heap_vector< size_type, allocator_type, true > bv_count_vector_type
Constant iterator designed to enumerate "ON" bits.
Definition bm.h:603
bool valid() const BMNOEXCEPT
Checks if iterator is still valid.
Definition bm.h:283
bm::bvector< Alloc > & bit_or(const bm::bvector< Alloc > &bv1, const bm::bvector< Alloc > &bv2, typename bm::bvector< Alloc >::optmode opt_mode=opt_none)
3-operand OR : this := bv1 OR bv2
Definition bm.h:5906
@ opt_none
no optimization
Definition bm.h:134
bvector< Alloc > & invert()
Invert/NEG all bits It should be noted, invert is affected by size() if size is set - it only inverts...
Definition bm.h:3550
allocator_type::allocator_pool_type allocator_pool_type
Definition bm.h:118
void resize(size_type new_size)
Change size of the bvector.
Definition bm.h:2463
Alloc allocator_type
Definition bm.h:117
bm::bvector< Alloc > & bit_xor(const bm::bvector< Alloc > &bv1, const bm::bvector< Alloc > &bv2, typename bm::bvector< Alloc >::optmode opt_mode=opt_none)
3-operand XOR : this := bv1 XOR bv2
Definition bm.h:6005
Algorithms for rank compression of bit-vector.
Definition bmalgo.h:453
bvector_type bv_zero_
bit-vector for zero elements
bvector_type bv_product_
temp vector
void attach_sv(const SV *sv_brel, bool compute_stats=false)
Attach a translation table vector for remapping (Image function).
allocator_pool_type pool_
const SV * sv_ptr_
current translation table vector
void remap(const bvector_type &bv_in, bvector_type &bv_out)
Perform remapping (Image function) (based on attached translation table).
bool have_stats_
flag of statistics presense
gather_buffer_type * gb_
intermediate buffers
bvector_type::allocator_type::allocator_pool_type allocator_pool_type
void operator=(const set2set_11_transform &)=delete
void one_pass_run(const bvector_type &bv_in, const SV &sv_brel, bvector_type &bv_out)
gather_buffer< sv_g_size > gather_buffer_type
void run(const bvector_type &bv_in, const SV &sv_brel, bvector_type &bv_out)
Run remap transformation.
set2set_11_transform(const set2set_11_transform &)=delete
const bvector_type & get_bv_zero() const
Get read access to zero-elements vector Zero vector gets populated after attach_sv() is called or as ...
void set_search_count_limit(size_type limit) BMNOEXCEPT
Set search limit for results.
void complete()
Prepare pipeline for the execution (resize and init internal structures) Once complete,...
pipeline(const pipeline &)=delete
void set_search_mask(const bvector_type *bv_mask) BMNOEXCEPT
Set pipeline mask bit-vector to restrict the search space.
const run_options & get_options() const BMNOEXCEPT
Get pipeline run options.
aggregator_pipeline_type & get_aggregator_pipe() BMNOEXCEPT
get aggregator pipeline (access to compute internals)
void add(const typename SV::value_type *str)
Add new search string.
aggregator_pipeline_type agg_pipe_
traget aggregator pipeline
remap_vector_type remap_value_vect_
remap buffer
bvect_vector_type & get_bv_res_vector() BMNOEXCEPT
Return results vector used for pipeline execution.
size_type size() const BMNOEXCEPT
pipeline & operator=(const pipeline &)=delete
bv_count_vector_type & get_bv_count_vector() BMNOEXCEPT
Return results vector count used for pipeline execution.
aggregator_type::template pipeline< Opt > aggregator_pipeline_type
size_t eff_slices_
number of effectice NOT NULL value slices
run_options & options() BMNOEXCEPT
Set pipeline run options.
const SV & sv_
target sparse vector ref
void set_or_target(bvector_type *bv_or) BMNOEXCEPT
Attach OR (aggregator vector).
algorithms for sparse_vector scan/search
bm::dynamic_heap_matrix< value_type, allocator_type > matrix_search_buf_type
allocator_pool_type & get_bvector_alloc_pool() BMNOEXCEPT
Return allocator pool for blocks (Can be used to improve performance of repeated searches with the sa...
bm::dynamic_heap_matrix< value_type, allocator_type > heap_matrix_type
int compare(const SV &sv, size_type idx, const value_type val) BMNOEXCEPT
compare sv[idx] with input value
void operator=(const sparse_vector_scanner &)=delete
bool prepare_and_sub_aggregator(const SV &sv, value_type value)
Prepare aggregator for AND-SUB (EQ) search.
void find_eq(const SV &sv, value_type value, bvector_type &bv_out)
find all sparse vector elements EQ to search value
bool bfind(const SV &sv, const value_type val, size_type &pos)
binary search for position in the sorted sparse vector
void bind(const SV &sv, bool sorted)
bind sparse vector for all searches
void correct_nulls(const SV &sv, bvector_type &bv_out)
Exclude possible NULL values from the result vector.
int compare_str(const SV &sv, size_type idx, const value_type *str) const BMNOEXCEPT
compare sv[idx] with input str
void reset_binding() BMNOEXCEPT
reset sparse vector binding
void find_range(const SV &sv, value_type from, value_type to, bvector_type &bv_out)
find all elements sparse vector element in closed range [left..right] interval
bm::heap_vector< bm::id64_t, typename bvector_type::allocator_type, true > mask_vector_type
const bvector_type * bvector_type_const_ptr
void aggregate_OR_slices(bvector_type &bv_target, const SV &sv, unsigned from, unsigned total_planes)
aggregator_type::bv_count_vector_type bv_count_vector_type
bm::heap_vector< bvector_type *, allocator_type, true > bvect_vector_type
void decompress(const SV &sv, bvector_type &bv_out)
Rank-Select decompression for RSC vectors.
sparse_vector_scanner(const sparse_vector_scanner &)=delete
void find_gt(const SV &sv, value_type val, bvector_type &bv_out)
find all elements sparse vector element greater (>) than value
void set_search_range(size_type from, size_type to) BMNOEXCEPT
set search boundaries (hint for the aggregator)
bool find_eq_str_impl(const SV &sv, const value_type *str, bvector_type &bv_out, bool prefix_sub)
find EQ str / prefix impl
void find_zero(const SV &sv, bvector_type &bv_out, bool null_correct=true)
find all sparse vector elements EQ to 0
void reset_search_range() BMNOEXCEPT
reset (disable) search range
void find_eq_with_nulls_horizontal(const SV &sv, value_type value, bvector_type &bv_out)
For testing purposes only.
bm::heap_vector< value_type, typename bvector_type::allocator_type, true > remap_vector_type
void find_lt(const SV &sv, value_type val, bvector_type &bv_out)
find all elements sparse vector element less (<) than value
bool find_eq_with_nulls(const SV &sv, value_type value, bvector_type &bv_out, size_type search_limit=0)
find value (may include NULL indexes)
bool bfind_eq_str(const SV &sv, const value_type *str, size_type &pos)
binary find first sparse vector element (string).
bool bfind_eq_str_impl(const SV &sv, const value_type *str, size_t in_len, bool remaped, size_type &pos)
void find_gt_horizontal(const SV &sv, value_type value, bvector_type &bv_out, bool null_correct=true)
For testing purposes only.
static constexpr int gt_slice_limit() noexcept
Return the slice limit for signed/unsigned vector value types.
static bool remap_tosv(remap_vector_type &remap_vect_target, const value_type *str, const SV &sv)
Remap input value into SV char encodings.
static void add_agg_char(AGG &agg, const SV &sv, int octet_idx, bm::id64_t planes_mask, unsigned value)
Addd char to aggregator (AND-SUB).
bm::aggregator< bvector_type > aggregator_type
aggregator_type::run_options run_options
Scanner options to control execution.
bool lower_bound_str(const SV &sv, const value_type *str, size_type &pos)
lower bound search for an array position
void find_positive(const SV &sv, bvector_type &bv_out)
Find positive (greter than zero elements) Output vector is computed as a logical OR (join) of all pla...
bvector_type::allocator_type allocator_type
bool find_eq_str(const SV &sv, const value_type *str, bvector_type &bv_out)
find sparse vector elements (string)
void find_nonzero(const SV &sv, bvector_type &bv_out)
Find non-zero elements Output vector is computed as a logical OR (join) of all planes.
bool find_first_eq(const SV &sv, value_type value, size_type &idx)
find first value (may include NULL indexes)
void invert(const SV &sv, bvector_type &bv_out)
invert search result ("EQ" to "not EQ")
allocator_type::allocator_pool_type allocator_pool_type
bool find_eq_str_prefix(const SV &sv, const value_type *str, bvector_type &bv_out)
find sparse vector elements with a given prefix (string)
void find_le(const SV &sv, value_type val, bvector_type &bv_out)
find all elements sparse vector element less or equal (<=) than value
void find_ge(const SV &sv, value_type val, bvector_type &bv_out)
find all elements sparse vector element greater or equal (>=) than value
static void aggregate_AND_OR_slices(bvector_type &bv_target, const bvector_type &bv_mask, const SV &sv, unsigned from, unsigned total_planes)
Index for SV sorted vectors for approximate range queries.
size_type size() const BMNOEXCEPT
Index size (number of sampled elements).
size_type common_prefix_length(const value_type *search_str, size_t in_len, size_type l, size_type r) const BMNOEXCEPT
find common prefix between index elements and search string
SV::bvector_type bvector_type
SV::value_type value_type
bool bfind_range(const value_type *search_str, size_t in_len, size_type &l, size_type &r) const BMNOEXCEPT
find range (binary)
sv_sample_index(const SV &sv, unsigned s_factor)
bool is_unique() const BMNOEXCEPT
returns true if all index values are unique
size_type sv_size() const BMNOEXCEPT
Original SV size.
bm::dynamic_heap_matrix< value_type, allocator_type > heap_matrix_type
void recalc_range(const value_type *search_str, size_type &l, size_type &r) const BMNOEXCEPT
recalculate range into SV coordinates range [from..to)
void construct(const SV &sv, unsigned s_factor)
Build sampling index for the sorted sprase vector.
size_t get_min_len() const BMNOEXCEPT
Return length of minimal indexed string.
bvector_type::allocator_type allocator_type
BMFORCEINLINE bm::id_t word_bitcount(bm::id_t w) BMNOEXCEPT
Definition bmutil.h:582
unsigned short bitscan(V w, B *bits) BMNOEXCEPT
Templated Bitscan with dynamic dispatch for best type.
Definition bmfunc.h:769
bm::alloc_pool_guard< allocator_pool_type, bvector< Alloc > > mem_pool_guard
Definition bm.h:790
null_support
NULL-able value support.
Definition bmconst.h:229
@ BM_SORTED
input set is sorted (ascending order)
Definition bmconst.h:206
@ BM_SORTED_UNIFORM
sorted and in one block (internal!)
Definition bmconst.h:207
@ use_null
support "non-assigned" or "NULL" logic
Definition bmconst.h:230
@ no_null
do not support NULL values
Definition bmconst.h:231
bool sparse_vector_find_first_mismatch(const SV &sv1, const SV &sv2, typename SV::size_type &midx, bm::null_support null_proc=bm::use_null)
Find first mismatch (element which is different) between two sparse vectors (uses linear scan in bit-...
void dynamic_range_clip_low(SV &svect, unsigned low_bit)
Clip dynamic range for signal lower than specified (boost).
void dynamic_range_clip_high(SV &svect, unsigned high_bit)
Clip dynamic range for signal higher than specified.
void sparse_vector_find_mismatch(typename SV1::bvector_type &bv, const SV1 &sv1, const SV2 &sv2, bm::null_support null_proc)
Find mismatch vector, indicating positions of mismatch between two sparse vectors (uses linear scan i...
Definition bm.h:78
const unsigned id_max
Definition bmconst.h:109
const unsigned set_block_mask
Definition bmconst.h:57
BMFORCEINLINE bool has_zero_byte_u64(bm::id64_t v) BMNOEXCEPT
Returns true if INT64 contains 0 octet.
Definition bmutil.h:571
BMFORCEINLINE void xor_swap(W &x, W &y) BMNOEXCEPT
XOR swap two variables.
Definition bmutil.h:534
unsigned long long int id64_t
Definition bmconst.h:35
unsigned int id_t
Definition bmconst.h:38
const unsigned gap_max_bits
Definition bmconst.h:81
const unsigned set_block_shift
Definition bmconst.h:56
static bool test()
Definition bmutil.h:114
size_type BM_VECT_ALIGN gather_idx_[BSIZE] BM_VECT_ALIGN_ATTR
bm::aggregator< bm::bvector<> > aggregator_type
bm::bvector bvector_type