BitMagic-C++
strsvsample03.cpp
Go to the documentation of this file.
1/*
2Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15
16For more information please visit: http://bitmagic.io
17*/
18
19/** \example strsvsample03.cpp
20 Example of how to use bm::str_sparse_vector<> - succinct container for
21 bit-transposed string collections
22
23 \sa bm::str_sparse_vector
24*/
25
26/*! \file strsvsample03.cpp
27 \brief Example: str_sparse_vector<> back insert iterator example
28
29 This example loads sparse vector from an STL container uses character re-mapping
30 to compress, serialize and save container to disk.
31 Example also illustrates how to check memory footprint.
32*/
33
34#include <assert.h>
35#include <iostream>
36#include <string>
37#include <vector>
38#include <random>
39#include <algorithm>
40#include <fstream>
41
42
43#include "bm.h"
44#include "bmstrsparsevec.h"
45#include "bmsparsevec_serial.h"
46#include "bmundef.h" /* clear the pre-proc defines from BM */
47
48using namespace std;
49
51
52// define the sparse vector type for 'char' type using bvector as
53// a container of bits for bit-transposed planes
54// 32 - is maximum string length for this container.
55// Memory allocation is dynamic using sparse techniques, so this number
56// just defines the max capacity.
57//
59
60
61// generate collection of strings from integers and shuffle it
62//
63static
64void generate_string_set(vector<string>& str_vec)
65{
66 const unsigned max_coll = 50000;
67
68 str_vec.resize(0);
69 string str;
70 for (unsigned i = 10; i < max_coll; i += (unsigned)rand() % 3)
71 {
72 str = to_string(i);
73 str_vec.emplace_back(str);
74 } // for i
75
76 // shuffle the data set
77 //
78 std::random_device rd;
79 std::mt19937 g(rd());
80 std::shuffle(str_vec.begin(), str_vec.end(), g);
81}
82
83
84int main(void)
85{
86 try
87 {
88 str_sv_type str_sv;
89
90 vector<string> str_vec;
91 generate_string_set(str_vec);
92 std::sort(str_vec.begin(), str_vec.end()); // sort the input vector
93
94
95 // load sparse vector from an STL container
96 //
97 {
98 size_t vect_size = 0; // approx std::vector<string> memory usage
99 str_sv_type str_sv_tmp; // temp vector
100 {
101 str_sv_type::back_insert_iterator bi =
102 str_sv_tmp.get_back_inserter();
103 for (auto str : str_vec)
104 {
105 bi = str;
106
107 // some approximate estimate of std::string element cost
108 //
109 size_t str_size = str.size() + sizeof(str);
110 vect_size += str_size;
111 }
112
113 // it is important to use flush, because back inserter is
114 // buffering data. Of cause it flashes automatically on
115 // destruction but explicit flush is somewhat better
116 // because of possible exception is thrown here and not from
117 // destructor.
118 //
119
120 bi.flush();
121
122 cout << "STL vector<string> approx.memory consumption:"
123 << vect_size << endl;
124 }
125
126 // calculate memory footprint
127 //
128 str_sv_type::statistics st;
129 str_sv_tmp.calc_stat(&st);
130
131 cout << "Used memory: " << st.memory_used << std::endl;
132
133
134 // final step is re-mapping, which increses chances for
135 // good memory compression.
136 // A side-effect here is that remapping makes container
137 // effectively read-only.
138 //
139 str_sv.remap_from(str_sv_tmp);
140
142 str_sv.optimize(tb); // optimize the vector
143
144 str_sv.calc_stat(&st);
145 cout << "Used memory after remap and optimization: "
146 << st.memory_used
147 << std::endl;
148
149
150 // a slightly different way to do the reampped loading
151 // iterator in this case is used to collect character frequency
152 // tables and do remap on flush() (which is a bit faster)
153 //
154 // another option here is to turn vector into read-only mode
155 // which runs an embedded memory defragmentation algorithm
156 //
157 {
158 str_sv_type str_sv1;
159 {
160 str_sv_type::back_insert_iterator bi =
161 str_sv1.get_back_inserter();
162 bi.set_remap(true);
163 for (auto str : str_vec)
164 bi = str;
165 bi.flush();
166 }
167
168
169 str_sv1.optimize(tb);
170
171 // freeze is used to turn the vector into read-only (immutable)
172 //
173 str_sv1.freeze();
174
175 assert(str_sv1.is_ro());
176 bool eq = str_sv1.equal(str_sv);
177 assert(eq); (void)eq;
178
179 str_sv1.calc_stat(&st);
180 cout << "Used memory after remap / optimization / freeze: "
181 << st.memory_used
182 << std::endl;
183
184 // construct a vector with remap table derived from another
185 // we can just add the same data into the new vector
186 // or part of the same data (projection) or reorderd data.
187 //
188 // the key thing is that we have to guarantee that the
189 // remapping tables will be the same
190 //
191 {
193 {
194 unsigned cnt = 0;
195 str_sv_type::back_insert_iterator bi =
196 str_sv2.get_back_inserter();
197 for (auto str : str_vec)
198 {
199 bi = str;
200 if (++cnt >= 10)
201 break; // pick first 10
202 }
203 bi.flush();
204 }
205 str_sv2.optimize();
206 cout << "size2=" << str_sv2.size() << endl; // 10
207 }
208
209 }
210
211
212 }
213
214 // serialize and save
215 //
216 {
217 std::string fname = "test.sv";
219
221 bm::sparse_vector_serialize(str_sv, sv_lay, tb);
222
223 std::ofstream fout(fname.c_str(), std::ios::binary);
224 if (!fout.good())
225 {
226 return -1;
227 }
228 const char* buf = (char*)sv_lay.buf();
229 fout.write(buf, (unsigned)sv_lay.size());
230 if (!fout.good())
231 {
232 return -1;
233 }
234 fout.close();
235
236 cout << "Saved size: " << sv_lay.size() << endl;
237 }
238
239 }
240 catch(std::exception& ex)
241 {
242 std::cerr << ex.what() << std::endl;
243 return 1;
244 }
245
246
247 return 0;
248}
249
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
#define BM_DECLARE_TEMP_BLOCK(x)
Definition bm.h:47
Serialization for sparse_vector<>.
string sparse vector based on bit-transposed matrix
pre-processor un-defines to avoid global space pollution (internal)
Bitvector Bit-vector container with runtime compression of bits.
Definition bm.h:115
succinct sparse vector for strings with compression using bit-slicing ( transposition) method
bool is_ro() const BMNOEXCEPT
Returns true if vector is read-only.
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename str_sparse_vector< CharType, BV, STR_SIZE >::statistics *stat=0)
run memory optimization for all vector planes
void calc_stat(struct str_sparse_vector< CharType, BV, STR_SIZE >::statistics *st) const BMNOEXCEPT
Calculates memory statistics.
bool equal(const str_sparse_vector< CharType, BV, STR_SIZE > &sv, bm::null_support null_able=bm::use_null) const BMNOEXCEPT
check if another sparse vector has the same content and size
void remap_from(const str_sparse_vector &str_sv, octet_freq_matrix_type *omatrix=0)
Build remapping profile and load content from another sparse vector Remapped vector likely saves memo...
size_type size() const
return size of the vector
void freeze()
Turn sparse vector into immutable mode Read-only (immutable) vector uses less memory and allows faste...
back_insert_iterator get_back_inserter()
Provide back insert iterator Back insert iterator implements buffered insertion, which is faster,...
void sparse_vector_serialize(const SV &sv, sparse_vector_serial_layout< SV > &sv_layout, bm::word_t *temp_block=0)
Serialize sparse vector into a memory buffer(s) structure.
@ COPY_RTABLES
copy remap tables only (without data)
bm::str_sparse_vector< char, bvector_type, 32 > str_sv_type
int main(void)
static void generate_string_set(vector< string > &str_vec)
layout class for serialization buffer structure
const unsigned char * buf() const BMNOEXCEPT
Return serialization buffer pointer.
size_t size() const BMNOEXCEPT
return current serialized size
bm::bvector bvector_type