Lucene++ - a full-featured, C++ search engine
API Documentation


Loading...
Searching...
No Matches
CharTokenizer.h
Go to the documentation of this file.
1
2// Copyright (c) 2009-2014 Alan Wright. All rights reserved.
3// Distributable under the terms of either the Apache License (Version 2.0)
4// or the GNU Lesser General Public License.
6
7#ifndef CHARTOKENIZER_H
8#define CHARTOKENIZER_H
9
10#include "Tokenizer.h"
11
12namespace Lucene {
13
/// Abstract Tokenizer subclass: isTokenChar() is pure virtual, so a concrete subclass must
/// supply the rule that decides which characters belong to a token.
/// NOTE(review): this rendered listing skips source lines 17-19, 22 and 33-34. The page's member
/// index lists three CharTokenizer constructors and places termAtt / offsetAtt at lines 33 / 34,
/// so the declaration shown here is presumably incomplete - confirm against the real header.
15class LPPAPI CharTokenizer : public Tokenizer {
16public:
20 virtual ~CharTokenizer();
21
23
24protected:
/// Scanning state. Exact semantics are not visible in this header; presumably a running
/// character offset, the read position inside ioBuffer, and the number of valid characters
/// currently held in ioBuffer - TODO confirm against the implementation file.
25 int32_t offset;
26 int32_t bufferIndex;
27 int32_t dataLen;
28
/// Size limits; the values are defined outside this header. Presumably the maximum token
/// length and the capacity of ioBuffer - TODO confirm.
29 static const int32_t MAX_WORD_LEN;
30 static const int32_t IO_BUFFER_SIZE;
31
/// Character buffer (presumably filled from the tokenizer's input reader - TODO confirm).
32 CharArray ioBuffer;
35
36public:
/// Used by consumers to advance the stream to the next token.
37 virtual bool incrementToken();
/// Called by the consumer after the last token has been consumed.
38 virtual void end();
/// Reset the tokenizer to a new reader.
39 virtual void reset(const ReaderPtr& input);
40
41protected:
/// Returns true if a character should be included in a token. Pure virtual: subclasses
/// must implement it.
45 virtual bool isTokenChar(wchar_t c) = 0;
46
/// Called on each token character to normalize it before it is added to the token.
/// Subclasses may override it; the default behaviour is defined in the implementation file.
49 virtual wchar_t normalize(wchar_t c);
50};
51
52}
53
54#endif
#define LUCENE_CLASS(Name)
Definition LuceneObject.h:24
AttributeFactoryPtr factory
Definition AttributeSource.h:60
virtual void reset(const ReaderPtr &input)
Reset the tokenizer to a new reader. Typically, an analyzer (in its reusableTokenStream method) will ...
CharTokenizer(const AttributeSourcePtr &source, const ReaderPtr &input)
CharArray ioBuffer
Definition CharTokenizer.h:32
OffsetAttributePtr offsetAtt
Definition CharTokenizer.h:34
CharTokenizer(const AttributeFactoryPtr &factory, const ReaderPtr &input)
virtual void end()
This method is called by the consumer after the last token has been consumed, after incrementToken() ...
static const int32_t IO_BUFFER_SIZE
Definition CharTokenizer.h:30
virtual bool isTokenChar(wchar_t c)=0
Returns true if a character should be included in a token. This tokenizer generates as tokens adjacen...
TermAttributePtr termAtt
Definition CharTokenizer.h:33
static const int32_t MAX_WORD_LEN
Definition CharTokenizer.h:29
int32_t offset
Definition CharTokenizer.h:25
int32_t dataLen
Definition CharTokenizer.h:27
virtual wchar_t normalize(wchar_t c)
Called on each token character to normalize it before it is added to the token. The default implement...
virtual bool incrementToken()
Consumers (i.e., IndexWriter) use this method to advance the stream to the next token....
CharTokenizer(const ReaderPtr &input)
int32_t bufferIndex
Definition CharTokenizer.h:26
Tokenizer()
Construct a tokenizer with null input.
ReaderPtr input
The text source for this Tokenizer.
Definition Tokenizer.h:47
Definition AbstractAllTermDocs.h:12
boost::shared_ptr< Reader > ReaderPtr
Definition LuceneTypes.h:547
boost::shared_ptr< TermAttribute > TermAttributePtr
Definition LuceneTypes.h:58
boost::shared_ptr< AttributeFactory > AttributeFactoryPtr
Definition LuceneTypes.h:519
boost::shared_ptr< AttributeSource > AttributeSourcePtr
Definition LuceneTypes.h:520
boost::shared_ptr< OffsetAttribute > OffsetAttributePtr
Definition LuceneTypes.h:40

clucene.sourceforge.net