BitMagic-C++
strsvsample08.cpp
Go to the documentation of this file.
1/*
2Copyright(c) 2002-2022 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15
16For more information please visit: http://bitmagic.io
17*/
18
19/** \example strsvsample08.cpp
20 Succinct container binary search
21
22 \sa bm::str_sparse_vector
23 \sa bm::sparse_vector_scanner
24 \sa bm::sparse_vector_scanner::bind
25 \sa bm::sparse_vector_scanner::bfind_eq_str
26 \sa bm::str_sparse_vector::freeze
27 \sa bm::str_sparse_vector::remap
28 \sa bm::str_sparse_vector::optimize
29*/
30
31/*! \file strsvsample06.cpp
32 \brief Example: Succinct container binary search
33*/
34
35#include <iostream>
36#include <string>
37#include <vector>
38#include <algorithm>
39#include <random>
40#include <cassert>
41
42#include "bm.h"
43#include "bmstrsparsevec.h"
44#include "bmsparsevec_algo.h"
45#include "bmtimer.h"
46#include "bmundef.h" /* clear the pre-proc defines from BM */
47
48using namespace std;
49
52
53
54static
55void GenerateTestStrCollection(std::vector<string>& str_coll, unsigned max_coll)
56{
57 string prefix = "az";
58 string str;
59 for (unsigned i = 0; i < max_coll; ++i)
60 {
61 str = prefix;
62 str.append(to_string(i));
63 str_coll.emplace_back(str);
64
65 if (i % 1024 == 0) // generate new prefix
66 {
67 prefix.clear();
68 unsigned prefix_len = (unsigned)rand() % 5;
69 for (unsigned j = 0; j < prefix_len; ++j)
70 {
71 char cch = char('a' + (unsigned)rand() % 26);
72 prefix.push_back(cch);
73 } // for j
74 }
75 } // for i
76}
77
78
79int main(void)
80{
81 const unsigned max_coll = 30000000;
82
83 try
84 {
85 std::vector<string> str_coll;
86 str_sv_type str_sv;
87
88 cout << "Prepare test data" << endl;
89 cout << " generation ..." << endl;
90 GenerateTestStrCollection(str_coll, max_coll);
91
92 cout << " sort..." << endl;
93 std::sort(str_coll.begin(), str_coll.end());
94
95 str_sv_type str_sv_srt; // sorted succinct vector
96 // fill in using back-insert iterator (fastest mathod to fill in)
97 {
98 auto bi = str_sv_srt.get_back_inserter();
99 for (auto str : str_coll)
100 bi = str;
101 // important to explicitly flush
102 // ( because destruction flush is not exception safe )
103 bi.flush();
104 }
105 cout << " remap-optimize-freeze..." << endl;
106 str_sv_srt.remap(); // apply remapping compression
107 str_sv_srt.optimize(); // RLE compression
108 str_sv_srt.freeze(); // turn into read-only mode
109
110 // pick the test sets
111 std::vector<string> str_coll_test1;
112 {
113 const unsigned pick_factor = 5;
114 for (size_t i = 0; i < size_t(str_coll.size()); i+=pick_factor)
115 {
116 const string& s = str_coll[i];
117 str_coll_test1.push_back(s);
118 } // for i
119 std::random_device rd;
120 std::mt19937 g(rd());
121 std::shuffle(str_coll_test1.begin(), str_coll_test1.end(), g);
122 }
123
124 cout << "Run test" << endl;
125
126
127 size_t sum_pos4=0, sum_pos32=0;
128
129 // code below illustrates bm::sparse_vector_scanner<> search sampling
130 // parameters: 4 and 32 where 32 offers better performance.
131 //
132 // the use case here is to create a vector-scanner pair, bind then
133 // and repeat multiple searches to vector using its scanner
134 // (scanner maintains the sampled search index (for sorted searches).
135 //
136 // Sampling approach works well, because BM uses hybrid binary search
137 // first narrowing down the seach using uncomprssed samples O(Nlog(N))
138 // then at the end running a vector search using logical ops with
139 // search space prunning.
140 //
141 // all of the above implements fast search in sorted-compressed array
142 // without decompression
143 //
144
145 {
148 scanner.bind(str_sv_srt, true); // bind sorted vector
149
150 bm::chrono_taker tt(cout, "bm::sparse_vector_scanner<>::bfind_eq_str() [4] ", 1);
151 for (unsigned i = 0; i < unsigned(str_coll_test1.size()); ++i)
152 {
153 const string& s = str_coll_test1[i];
154 bool found = scanner.bfind_eq_str(s.c_str(), pos);
155 if (!found)
156 {
157 cerr << "String bfind_eq_str() failure!" << endl;
158 assert(0); exit(1);
159 }
160 sum_pos4 += pos;
161 } // for
162 }
163
164 {
167 scanner.bind(str_sv_srt, true); // bind sorted vector
168
169 bm::chrono_taker tt(cout, "bm::sparse_vector_scanner<>::bfind_eq_str() [32] ", 1);
170 for (unsigned i = 0; i < unsigned(str_coll_test1.size()); ++i)
171 {
172 const string& s = str_coll_test1[i];
173 bool found = scanner.bfind_eq_str(s.c_str(), pos);
174 if (!found)
175 {
176 cerr << "String bfind_eq_str() failure!" << endl;
177 assert(0); exit(1);
178 }
179 sum_pos32 += pos;
180 } // for
181 }
182
183 assert(sum_pos4 == sum_pos32);
184
185 }
186 catch(std::exception& ex)
187 {
188 std::cerr << ex.what() << std::endl;
189 return 1;
190 }
191
192
193 return 0;
194}
195
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
Algorithms for bm::sparse_vector.
string sparse vector based on bit-transposed matrix
Timing utilities for benchmarking (internal).
pre-processor un-defines to avoid global space pollution (internal)
Bitvector Bit-vector container with runtime compression of bits.
Definition bm.h:115
Utility class to collect performance measurements and statistics.
Definition bmtimer.h:41
algorithms for sparse_vector scan/search
void bind(const SV &sv, bool sorted)
bind sparse vector for all searches
bool bfind_eq_str(const SV &sv, const value_type *str, size_type &pos)
binary find first sparse vector element (string).
succinct sparse vector for strings with compression using bit-slicing ( transposition) method
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename str_sparse_vector< CharType, BV, STR_SIZE >::statistics *stat=0)
run memory optimization for all vector planes
void freeze()
Turn sparse vector into immutable mode Read-only (immutable) vector uses less memory and allows faste...
void remap()
Build remapping profile and re-load content to save memory.
back_insert_iterator get_back_inserter()
Provide back insert iterator Back insert iterator implements buffered insertion, which is faster,...
bm::str_sparse_vector< char, bvector_type, 32 > str_sv_type
static void GenerateTestStrCollection(std::vector< string > &str_coll, unsigned max_coll)
int main(void)
bm::bvector bvector_type