72#include "dna_finger.h"
121 std::ifstream fin(fname.c_str(), std::ios::in);
126 for (
unsigned i = 0; std::getline(fin, line); ++i)
131 for (std::string::iterator it = line.begin(); it != line.end(); ++it)
132 seq_vect.push_back(*it);
166 size_t pos,
unsigned k_size,
174 for (
size_t i = 0; i < k_size; ++i)
181 k_acc |= (dna_code << shift);
194 static char lut[] = {
'A',
'T',
'G',
'C',
'N',
'$' };
210 for (
size_t i = 0; i < k_size; ++i)
212 unsigned dna_code = unsigned(kmer_code & 3);
225 size_t pos,
unsigned k_size,
228 for (
size_t i = 0; i < k_size; ++i)
230 char bp = dna[pos+i];
231 unsigned dna_code = unsigned(k_mer & 3ull);
235 if (bp ==
'N' && bp_c !=
'A')
237 cerr << bp <<
" " << bp_c << endl;
238 cerr <<
"Error! N code mismatch at pos = " << pos+i
247 cerr <<
"Error! non-zero k-mer remainder at " << pos << endl;
255template<
typename VECT>
258 std::sort(vect.begin(), vect.end());
259 auto last = std::unique(vect.begin(), vect.end());
260 vect.erase(last, vect.end());
267template<
typename VECT,
typename COUNT_VECT>
272 std::sort(vect.begin(), vect.end());
273 typename VECT::value_type prev = vect[0];
274 typename COUNT_VECT::value_type cnt = 1;
275 auto vsize = vect.size();
277 for (; i < vsize; ++i)
285 cvect.inc_not_null(prev, cnt);
289 cvect.inc_not_null(prev, cnt);
290 assert(cvect.in_sync());
316 if (seq_vect.empty())
318 const char* dna_str = &seq_vect[0];
320 std::vector<bm::id64_t> k_buf;
321 k_buf.reserve(chunk_size);
325 vector_char_type::size_type dna_sz = seq_vect.size()-(k_size-1);
326 vector_char_type::size_type pos = 0;
328 for (; pos < dna_sz; ++pos)
333 k_buf.push_back(k_mer_code);
338 const unsigned k_shift = (k_size-1) * 2;
341 for (++pos; pos < dna_sz; ++pos)
344 valid =
get_DNA_code(dna_str[pos + (k_size - 1)], bp_code);
348 for (; pos < dna_sz; ++pos)
353 k_buf.push_back(k_mer_code);
360 k_mer_code = ((k_mer_code >> 2) | (bp_code << k_shift));
363 k_buf.push_back(k_mer_code);
371 assert(k_check == k_mer_code);
374 if (k_buf.size() >= chunk_size)
384 float pcnt = float(pos) / float(dna_sz);
386 cout <<
"\r" << unsigned(pcnt) <<
"% of " << dna_sz
387 <<
" (" << (pos+1) <<
") "
398 cout <<
"Unique k-mers: " << k_buf.size() << endl;
413 if (seq_vect.empty())
415 const char* dna_str = &seq_vect[0];
416 std::vector<bm::id64_t> k_buf;
417 k_buf.reserve(chunk_size);
420 vector_char_type::size_type dna_sz = seq_vect.size()-(k_size-1);
421 vector_char_type::size_type pos = 0;
423 for (; pos < dna_sz; ++pos)
428 k_buf.push_back(k_mer_code);
432 const unsigned k_shift = (k_size-1) * 2;
435 for (++pos; pos < dna_sz; ++pos)
438 valid =
get_DNA_code(dna_str[pos + (k_size - 1)], bp_code);
442 for (; pos < dna_sz; ++pos)
447 k_buf.push_back(k_mer_code);
454 k_mer_code = ((k_mer_code >> 2) | (bp_code << k_shift));
456 k_buf.push_back(k_mer_code);
458 if (k_buf.size() == chunk_size)
463 float pcnt = float(pos) / float(dna_sz);
465 cout <<
"\r" << unsigned(pcnt) <<
"% of " << dna_sz
466 <<
" (" << (pos+1) <<
") "
491 : m_seq_vect(seq_vect), m_k_size(k_size), m_from(from), m_to(to),
492 m_kmer_counts(kmer_counts)
496 : m_seq_vect(func.m_seq_vect), m_k_size(func.m_k_size),
497 m_from(func.m_from), m_to(func.m_to),
498 m_kmer_counts(func.m_kmer_counts)
504 const bvector_type* bv_null = m_kmer_counts.get_null_bvector();
506 kmer_counts_part.
sync();
509 if (m_seq_vect.empty())
512 const char* dna_str = &m_seq_vect[0];
513 std::vector<bm::id64_t> k_buf;
514 k_buf.reserve(chunk_size);
516 const auto k_size = m_k_size;
519 vector_char_type::size_type dna_sz = m_seq_vect.size()-(m_k_size-1);
520 vector_char_type::size_type pos = 0;
522 for (; pos < dna_sz; ++pos)
527 if (k_mer_code >= m_from && k_mer_code <= m_to)
528 k_buf.push_back(k_mer_code);
533 const unsigned k_shift = (k_size-1) * 2;
536 for (++pos; pos < dna_sz; ++pos)
539 valid =
get_DNA_code(dna_str[pos + (k_size - 1)], bp_code);
543 for (; pos < dna_sz; ++pos)
548 if (k_mer_code >= m_from && k_mer_code <= m_to)
549 k_buf.push_back(k_mer_code);
556 k_mer_code = ((k_mer_code >> 2) | (bp_code << k_shift));
558 if (k_mer_code >= m_from && k_mer_code <= m_to)
560 k_buf.push_back(k_mer_code);
561 if (k_buf.size() == chunk_size)
574 static std::mutex mtx_counts_lock;
575 std::lock_guard<std::mutex> guard(mtx_counts_lock);
579 m_kmer_counts.merge_not_null(kmer_counts_part);
598 unsigned concurrency)
601 typedef std::vector<std::pair<bv_size_type, bv_size_type> >
bv_ranges_vector;
608 if (split_rank < concurrency || concurrency < 2)
619 std::vector<std::future<void> > futures;
620 futures.reserve(concurrency);
622 for (
size_t k = 0; k < pair_vect.size(); ++k)
624 futures.emplace_back(std::async(std::launch::async,
633 for (
auto& e : futures)
638 std::future_status status = e.wait_for(std::chrono::seconds(60));
639 if (status == std::future_status::ready)
641 cout <<
"\r" << ++m <<
" min" << flush;
655 std::string kmer_str;
656 typename BV::size_type cnt = 0;
657 typename BV::enumerator en = bv_kmers.first();
658 for ( ;en.valid(); ++en, ++cnt)
660 auto k_mer_code = *en;
674 typename BV::size_type count =
dna_scanner.FindCount(kmer_str);
676 kmer_counts.
set(k_mer_code, (
unsigned)count);
678 if ((cnt % 1000) == 0)
679 cout <<
"\r" << cnt << flush;
694template<
typename DNA_Scan>
708 : m_parent_scanner(parent_scanner), m_bv_kmers(bv_kmers),
709 m_from(from), m_to(to), m_kmer_counts(kmer_counts)
714 : m_parent_scanner(func.m_parent_scanner), m_bv_kmers(func.m_bv_kmers),
715 m_from(func.m_from), m_to(func.m_to), m_kmer_counts(func.m_kmer_counts)
721 std::string kmer_str;
724 static std::mutex mtx_counts_lock;
728 for ( ;en.
valid(); ++en, ++cnt)
730 auto k_mer_code = *en;
731 if (k_mer_code > m_to)
738 m_Agg.set_compute_count(
true);
739 for (
size_t i = 0; i < kmer_str.size(); ++i)
741 const bvector_type& bv_mask = m_parent_scanner.GetVector(kmer_str[i]);
745 m_Agg.combine_shift_right_and(bv_search_res);
757 std::lock_guard<std::mutex> guard(mtx_counts_lock);
758 m_kmer_counts.set(k_mer_code,
unsigned(search_count));
759 assert(m_kmer_counts.in_sync());
768 const DNA_Scan& m_parent_scanner;
772 typename DNA_Scan::aggregator_type m_Agg;
783 unsigned concurrency)
786 typedef std::vector<std::pair<bv_size_type, bv_size_type> >
bv_ranges_vector;
793 if (split_rank < concurrency || concurrency < 2)
805 std::vector<std::future<void> > futures;
806 futures.reserve(concurrency);
808 for (
size_t k = 0; k < pair_vect.size(); ++k)
810 futures.emplace_back(std::async(std::launch::async,
819 for (
auto& e : futures)
821 unsigned long long c_prev = 0;
824 std::future_status status = e.wait_for(std::chrono::seconds(60));
825 if (status == std::future_status::ready)
831 auto delta = c - c_prev;
834 auto remain_cnt = cnt - c;
835 auto remain_min = remain_cnt / delta;
836 cout <<
"\r" << c <<
": progress per minute=" << delta;
837 if (remain_min < 120)
839 cout <<
" wait for " << remain_min <<
"m " << flush;
843 auto remain_h = remain_min / 60;
844 cout <<
" wait for " << remain_h <<
"h " << flush;
864 auto en = bv_null->
first();
865 for (; en.valid(); ++en)
867 auto kmer_code = *en;
868 auto kmer_count = kmer_counts.
get(kmer_code);
869 auto mit = hmap.find(kmer_count);
870 if (mit == hmap.end())
871 hmap[kmer_count] = 1;
884 outf.open(fname, ios::out | ios::trunc );
886 outf <<
"kmer count \t number of kmers\n";
888 auto it = hmap.rbegin();
auto it_end = hmap.rend();
889 for (; it != it_end; ++it)
891 outf << it->first <<
"\t" << it->second << endl;
924 auto total_kmers = bv_null->
count();
925 bm::id64_t target_f_count = (total_kmers * percent) / 100;
927 auto it = hmap.rbegin();
928 auto it_end = hmap.rend();
929 for (; it != it_end; ++it)
931 auto kmer_count = it->first;
933 scanner.
find_eq(kmer_counts, kmer_count, bv_found);
934 auto found_cnt = bv_found.count();
939 for (; en.
valid(); ++en)
941 auto kmer_code = *en;
942 unsigned k_count = kmer_counts.
get(kmer_code);
949 assert(k_count == kmer_count);
951 frequent_bv.set(kmer_code);
952 if (kmer_count >= target_f_count)
954 frequent_bv.optimize();
962 frequent_bv.optimize();
966int main(
int argc,
char *argv[])
976 cerr <<
"cmd-line parse error. " << endl;
989 std::cout <<
"FASTA sequence size=" << seq_vect.size() << std::endl;
994 cout <<
"k-mer generation for k=" <<
ik_size << endl;
998 cout <<
"Found " << bv_kmers.
count() <<
" k-mers." << endl;
1002 bm::print_bvector_stat(cout,bv_kmers);
1003 size_t blob_size = bm::compute_serialization_size(bv_kmers);
1004 cout <<
"DNA 2-bit coded FASTA size=" << seq_vect.size()/4 << endl;
1005 cout <<
" Compressed size=" << blob_size << endl;
1013 bm::SaveBVector(
ikd_name.c_str(), bv_kmers);
1016 if (seq_vect.size())
1022 if (seq_vect.size() &&
1026 rsc_kmer_counts.
sync();
1028 rsc_kmer_counts2.
sync();
1031 cout <<
" Searching for k-mer counts..." << endl;
1035 cout <<
" ... using bm::aggregator<>" << endl;
1045 cout <<
" ... using std::sort() and count" << endl;
1058 cerr <<
"Error: Count vector save failed!" << endl;
1065 bool eq = rsc_kmer_counts.
equal(rsc_kmer_counts2);
1072 auto v1 = rsc_kmer_counts.
get(idx);
1073 auto v2 = rsc_kmer_counts2.
get(idx);
1074 cerr <<
"Mismatch at idx=" << idx <<
" v1=" << v1 <<
" v2=" << v2 << endl;
1075 assert(found); (void)found;
1077 cerr <<
"Integrity check failed!" << endl;
1111 cout <<
"Found frequent k-mers: " << bv_freq.
count() << endl;
1126 std::cout << std::endl <<
"Performance:" << std::endl;
1131 catch(std::exception& ex)
1133 std::cerr << ex.what() << std::endl;
Algorithms for fast aggregation of N bvectors.
Algorithms for bvector<> (main include).
Serialization / compression of bvector<>. Set theoretical operations on compressed BLOBs.
Algorithms for bm::sparse_vector.
Compressed sparse container rsc_sparse_vector<> for integer types.
Timing utilities for benchmarking (internal).
pre-processor un-defines to avoid global space pollution (internal)
k-mer counting job functor class using bm::aggregator<>
DNA_Scan::bvector_type bvector_type
Counting_JobFunctor(const DNA_Scan &parent_scanner, const bvector_type &bv_kmers, size_type from, size_type to, rsc_sparse_vector_u32 &kmer_counts)
constructor
bvector_type::size_type size_type
Counting_JobFunctor(const Counting_JobFunctor &func)
copy-ctor
void operator()()
Main logic (functor).
Utility for keeping all DNA finger print vectors and search using various techniques.
Functor to process job batch (task).
void operator()()
Main logic (functor).
SortCounting_JobFunctor(const vector_char_type &seq_vect, unsigned k_size, size_type from, size_type to, rsc_sparse_vector_u32 &kmer_counts)
constructor
SortCounting_JobFunctor(const SortCounting_JobFunctor &func)
bvector_type::size_type size_type
Constant iterator designed to enumerate "ON" bits.
bool valid() const BMNOEXCEPT
Checks if iterator is still valid.
Bitvector Bit-vector container with runtime compression of bits.
size_type count() const BMNOEXCEPT
population count (count of ON bits)
bm::bvector< Alloc > & bit_sub(const bm::bvector< Alloc > &bv1, const bm::bvector< Alloc > &bv2, typename bm::bvector< Alloc >::optmode opt_mode=opt_none)
3-operand SUB : this := bv1 MINUS bv2 SUBtraction is also known as AND NOT
void optimize(bm::word_t *temp_block=0, optmode opt_mode=opt_compress, statistics *stat=0)
Optimize memory bitvector's memory allocation.
bvector_size_type size_type
enumerator first() const
Returns enumerator pointing on the first non-zero bit.
Utility class to collect performance measurements and statistics.
std::map< std::string, statistics > duration_map_type
test name to duration map
static void print_duration_map(TOut &tout, const duration_map_type &dmap, format fmt=ct_time)
Rank-Select compressed sparse vector.
sparse_vector_u32::size_type size_type
void set(size_type idx, value_type v)
set specified element with bounds checking and automatic resize
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, statistics *stat=0)
run memory optimization for all vector slices
void sync(bool force)
Re-calculate rank-select index for faster access to vector.
bool equal(const rsc_sparse_vector< Val, SV > &csv) const BMNOEXCEPT
check if another vector has the same content
value_type get(size_type idx) const BMNOEXCEPT
get specified element without bounds checking
const bvector_type * get_null_bvector() const BMNOEXCEPT
Get bit-vector of assigned values (or NULL).
sparse_vector_u32::bvector_type bvector_type
algorithms for sparse_vector scan/search
void find_eq(const SV &sv, value_type value, bvector_type &bv_out)
find all sparse vector elements EQ to search value
succinct sparse vector with runtime compression using bit-slicing / transposition method
@ BM_SORTED
input set is sorted (ascending order)
@ BM_GAP
GAP compression is ON.
void rank_range_split(const BV &bv, typename BV::size_type rank, PairVect &target_v)
Algorithm to identify bit-vector ranges (splits) for the rank.
bool sparse_vector_find_first_mismatch(const SV &sv1, const SV &sv2, typename SV::size_type &midx, bm::null_support null_proc=bm::use_null)
Find first mismatch (element which is different) between two sparse vectors (uses linear scan in bit-...
unsigned long long int id64_t
bm::chrono_taker ::duration_map_type timing_map
bm::bvector ::size_type bv_size_type
std::vector< std::pair< bv_size_type, bv_size_type > > bv_ranges_vector
bm::sparse_vector< unsigned, bm::bvector<> > sparse_vector_u32
static int parse_args(int argc, char *argv[])
bm::rsc_sparse_vector< unsigned, sparse_vector_u32 > rsc_sparse_vector_u32
std::vector< char > vector_char_type
DNA_FingerprintScanner< bm::bvector<> > dna_scanner_type
void validate_k_mer(const char *dna, size_t pos, unsigned k_size, bm::id64_t k_mer)
QA function to validate if reverse k-mer decode gives the same string.
void compute_frequent_kmers(BV &frequent_bv, const histogram_map_u32 &hmap, const rsc_sparse_vector_u32 &kmer_counts, unsigned percent, unsigned k_size)
Create vector, representing subset of k-mers of high frequency.
void generate_k_mer_bvector(BV &bv, const vector_char_type &seq_vect, unsigned k_size, bool check)
This function turns each k-mer into an integer number and encodes it in a bit-vector (presense vector...
void translate_kmer(std::string &dna, bm::id64_t kmer_code, unsigned k_size)
Translate k-mer code into ATGC DNA string.
static void report_hmap(const string &fname, const histogram_map_u32 &hmap)
Save TSV report of k-mer frequences (reverse sorted, most frequent k-mers first).
std::map< unsigned, unsigned > histogram_map_u32
static void compute_kmer_histogram(histogram_map_u32 &hmap, const rsc_sparse_vector_u32 &kmer_counts)
Compute a map of how often each k-mer frequency is observed in the k-mer counts vector.
void count_kmers(const vector_char_type &seq_vect, unsigned k_size, rsc_sparse_vector_u32 &kmer_counts)
k-mer counting algorithm using reference sequence, regenerates k-mer codes, sorts them and counts
dna_scanner_type dna_scanner
std::string ikd_freq_name
std::string ikd_counts_name
bool get_kmer_code(const char *dna, size_t pos, unsigned k_size, bm::id64_t &k_mer)
Calculate k-mer as an unsigned long integer.
std::vector< char > vector_char_type
bm::rsc_sparse_vector< unsigned, sparse_vector_u32 > rsc_sparse_vector_u32
void sort_unique(VECT &vect)
Auxiliary function to do sort+unique on a vactor of ints removes duplicate elements.
char int2DNA(unsigned code)
Translate integer code to DNA letter.
void count_kmers_parallel(const BV &bv_kmers, const vector_char_type &seq_vect, rsc_sparse_vector_u32 &kmer_counts, unsigned k_size, unsigned concurrency)
MT k-mer counting.
static int load_FASTA(const std::string &fname, vector_char_type &seq_vect)
really simple FASTA parser (one entry per file)
void sort_count(VECT &vect, COUNT_VECT &cvect)
Auxiliary function to do sort+unique on a vactor of ints and save results in a counts vector.
bool get_DNA_code(char bp, bm::id64_t &dna_code)
std::atomic_ullong k_mer_progress_count(0)