00001
00002
00003
00004
00005
00006
00007
00008 #ifndef BOOST_LOCALE_BOUNDARY_HPP_INCLUDED
00009 #define BOOST_LOCALE_BOUNDARY_HPP_INCLUDED
00010
00011 #include <boost/locale/config.hpp>
00012 #include <boost/cstdint.hpp>
00013 #ifdef BOOST_MSVC
00014 # pragma warning(push)
00015 # pragma warning(disable : 4275 4251 4231 4660)
00016 #endif
00017 #include <string>
00018 #include <locale>
00019 #include <vector>
00020 #include <iterator>
00021 #include <algorithm>
00022 #include <typeinfo>
00023 #include <iterator>
00024 #include <stdexcept>
00025
00026
00027
00028
00029 namespace boost {
00030
00031 namespace locale {
00032
00036 namespace boundary {
00044
///
/// \brief Kind of boundary analysis requested when a boundary index is built.
///
/// The value is passed as the first argument of the mapping functions and is
/// translated into a mark mask by boundary_mask().
///
enum boundary_type {
    character = 0,  ///< selects character_mask in boundary_mask()
    word      = 1,  ///< selects word_mask in boundary_mask()
    sentence  = 2,  ///< selects sentence_mask in boundary_mask()
    line      = 3   ///< selects line_mask in boundary_mask()
};
00054
///
/// \brief Bit flags for the mark attached to a word boundary.
///
/// Every basic category owns one hexadecimal digit; the composite values are
/// plain unions of the basic ones. The meaning of each category is suggested
/// by its name only - confirm against the backend that fills the marks.
///
enum word_type {
    word_none       = 0x0000F,
    word_number     = 0x000F0,
    word_letter     = 0x00F00,
    word_kana       = 0x0F000,
    word_ideo       = 0xF0000,
    /// Every category except word_none (0xFFFF0)
    word_any        = word_number | word_letter | word_kana | word_ideo,
    /// word_letter, word_kana and word_ideo together (0xFFF00)
    word_letters    = word_letter | word_kana | word_ideo,
    /// word_kana and word_ideo together (0xFF000)
    word_kana_ideo  = word_kana | word_ideo,
    /// All bits used by the word flags (0xFFFFF)
    word_mask       = word_none | word_any
};
///
/// \brief Bit flags for the mark attached to a line-break boundary.
///
/// line_soft and line_hard occupy separate hexadecimal digits; line_any and
/// line_mask both cover the two of them.
///
enum line_break_type {
    line_soft = 0x0F,
    line_hard = 0xF0,
    line_any  = line_soft | line_hard,  ///< 0xFF
    line_mask = line_soft | line_hard   ///< 0xFF, all bits used by the line flags
};
00078
///
/// \brief Bit flags for the mark attached to a sentence boundary.
///
/// sentence_term and sentence_sep occupy separate hexadecimal digits;
/// sentence_any and sentence_mask both cover the two of them.
///
enum sentence_break_type {
    sentence_term = 0x0F,
    sentence_sep  = 0xF0,
    sentence_any  = sentence_term | sentence_sep,  ///< 0xFF
    sentence_mask = sentence_term | sentence_sep   ///< 0xFF, all bits used by the sentence flags
};
00090
///
/// \brief Bit flags for the mark attached to a character boundary.
///
/// Only one category exists, so character_any and character_mask are equal.
///
typedef enum {
    character_any   = 0xF,
    // No comma after the last enumerator: a trailing comma is only accepted
    // from C++11 on, and the rest of this header is written for C++03.
    character_mask  = 0xF
} character_break_type;
00099
00103 inline unsigned boundary_mask(boundary_type t)
00104 {
00105 switch(t) {
00106 case character: return character_mask;
00107 case word: return word_mask;
00108 case sentence: return sentence_mask;
00109 case line: return line_mask;
00110 default: return 0;
00111 }
00112 }
00113
00115 namespace impl {
00116
///
/// \brief One entry of a boundary index: a position and the mark attached to it.
///
/// Entries are ordered by \a offset alone, so an index sorted by position can
/// be searched with a break_info built from a bare offset.
///
struct break_info {

    /// Entry at offset 0 with an empty mark
    break_info() : offset(0), mark(0) {}

    /// Entry at offset \a v with an empty mark (implicit conversion is intentional)
    break_info(unsigned v) : offset(v), mark(0) {}

    uint32_t offset;  ///< position of the break
    uint32_t mark;    ///< flags describing the break; ignored by operator<

    /// Order by position only
    bool operator<(break_info const &other) const
    {
        return other.offset > offset;
    }
};
00138
00139 typedef std::vector<break_info> index_type;
00140
00141 template<typename CharType>
00142 index_type map(boundary_type t,CharType const *begin,CharType const *end,std::locale const &loc=std::locale());
00143
00144 template<typename CharType>
00145 static index_type map(
00146 boundary_type t,
00147 std::basic_string<CharType> const &str,
00148 std::locale const &loc=std::locale())
00149 {
00150 return boost::locale::boundary::impl::map<CharType>(t,str.data(),str.data()+str.size(),loc);
00151 }
00152
00153 template<>
00154 BOOST_LOCALE_DECL index_type
00155 map(boundary_type t,char const *begin,char const *end,std::locale const &loc);
00156
00157 #ifndef BOOST_NO_STD_WSTRING
00158 template<>
00159 BOOST_LOCALE_DECL index_type
00160 map(boundary_type t,wchar_t const *begin,wchar_t const *end,std::locale const &loc);
00161 #endif
00162
00163 #ifdef BOOST_HAS_CHAR16_T
00164 template<>
00165 BOOST_LOCALE_DECL index_type
00166 map(boundary_type t,char16_t const *begin,char16_t const *end,std::locale const &loc);
00167 #endif
00168
00169 #ifdef BOOST_HAS_CHAR32_T
00170 template<>
00171 BOOST_LOCALE_DECL index_type
00172 map(boundary_type t,char32_t const *begin,char32_t const *end,std::locale const &loc);
00173 #endif
00174 }
00175
00176 namespace details {
00177
00178 template<typename IteratorType,typename CategoryType = typename std::iterator_traits<IteratorType>::iterator_category>
00179 struct mapping_traits {
00180 typedef typename std::iterator_traits<IteratorType>::value_type char_type;
00181 static impl::index_type map(boundary_type t,IteratorType b,IteratorType e,std::locale const &l)
00182 {
00183 std::basic_string<char_type> str(b,e);
00184 return impl::map(t,str,l);
00185 }
00186 };
00187
00188 template<typename IteratorType>
00189 struct mapping_traits<IteratorType,std::random_access_iterator_tag> {
00190 typedef typename std::iterator_traits<IteratorType>::value_type char_type;
00191
00192 static impl::index_type map(boundary_type t,IteratorType b,IteratorType e,std::locale const &l)
00193 {
00194 impl::index_type result;
00195
00196
00197
00198
00199
00200
00201
00202 if(
00203 (
00204 typeid(IteratorType) == typeid(typename std::basic_string<char_type>::iterator)
00205 || typeid(IteratorType) == typeid(typename std::basic_string<char_type>::const_iterator)
00206 || typeid(IteratorType) == typeid(typename std::vector<char_type>::iterator)
00207 || typeid(IteratorType) == typeid(typename std::vector<char_type>::const_iterator)
00208 || typeid(IteratorType) == typeid(char_type *)
00209 || typeid(IteratorType) == typeid(char_type const *)
00210 )
00211 &&
00212 b!=e
00213 )
00214 {
00215 char_type const *begin = &*b;
00216 char_type const *end = begin + (e-b);
00217 impl::index_type tmp=impl::map(t,begin,end,l);
00218 result.swap(tmp);
00219 }
00220 else{
00221 std::basic_string<char_type> str(b,e);
00222 impl::index_type tmp=impl::map(t,str,l);
00223 result.swap(tmp);
00224 }
00225 return result;
00226 }
00227 };
00228
00229 }
00230
00232
00233
00234
00235
00236
00237 template<typename I>
00238 class break_iterator;
00239 template<typename I,typename V>
00240 class token_iterator;
00241
00256
00257 template<class RangeIterator>
00258 class mapping {
00259 public:
00263 typedef RangeIterator iterator;
00267 typedef typename RangeIterator::base_iterator base_iterator;
00271 typedef typename std::iterator_traits<base_iterator>::value_type char_type;
00272
00276 mapping(boundary_type type,base_iterator begin,base_iterator end,std::locale const &loc = std::locale())
00277 {
00278 create_mapping(type,begin,end,loc,0xFFFFFFFFu);
00279 }
00280
00285 mapping(boundary_type type,base_iterator begin,base_iterator end,unsigned mask,std::locale const &loc = std::locale())
00286 {
00287 create_mapping(type,begin,end,loc,mask);
00288 }
00289
00293 void map(boundary_type type,base_iterator begin,base_iterator end,std::locale const &loc = std::locale())
00294 {
00295 create_mapping(type,begin,end,loc,0xFFFFFFFFu);
00296 }
00297
00301 void map(boundary_type type,base_iterator begin,base_iterator end,unsigned mask,std::locale const &loc = std::locale())
00302 {
00303 create_mapping(type,begin,end,loc,mask);
00304 }
00305
00309
00310 mapping()
00311 {
00312 mask_=0xFFFFFFFF;
00313 }
00314
00318 template<typename ORangeIterator>
00319 mapping(mapping<ORangeIterator> const &other) :
00320 index_(other.index_),
00321 begin_(other.begin_),
00322 end_(other.end_),
00323 mask_(other.mask_)
00324 {
00325 }
00326
00331 template<typename ORangeIterator>
00332 void swap(mapping<ORangeIterator> &other)
00333 {
00334 index_.swap(other.index_),
00335 std::swap(begin_,other.begin_);
00336 std::swap(end_,other.end_);
00337 std::swap(mask_,other.mask_);
00338 }
00339
00343 template<typename ORangeIterator>
00344 mapping const &operator=(mapping<ORangeIterator> const &other)
00345 {
00346 index_=other.index_;
00347 begin_=other.begin_;
00348 end_=other.end_;
00349 mask_=other.mask_;
00350 }
00351
00355 unsigned mask() const
00356 {
00357 return mask_;
00358 }
00380 void mask(unsigned u)
00381 {
00382 mask_ = u;
00383 }
00384
00388 RangeIterator begin() const
00389 {
00390 return RangeIterator(*this,true,mask_);
00391 }
00395 RangeIterator end() const
00396 {
00397 return RangeIterator(*this,false,mask_);
00398 }
00399
00400 private:
00401 void create_mapping(boundary_type type,base_iterator begin,base_iterator end,std::locale const &loc,unsigned mask)
00402 {
00403 impl::index_type idx=details::mapping_traits<base_iterator>::map(type,begin,end,loc);
00404 index_.swap(idx);
00405 begin_ = begin;
00406 end_ = end;
00407 mask_=mask;
00408 }
00409 template<typename I>
00410 friend class break_iterator;
00411 template<typename I,typename V>
00412 friend class token_iterator;
00413 template<typename I>
00414 friend class mapping;
00415
00416 base_iterator begin_,end_;
00417 impl::index_type index_;
00418 unsigned mask_;
00419 };
00420
00421
00433
00434 template<
00435 typename IteratorType,
00436 typename ValueType = std::basic_string<typename std::iterator_traits<IteratorType>::value_type>
00437 >
00438 class token_iterator : public std::iterator<std::bidirectional_iterator_tag,ValueType>
00439 {
00440 public:
00444 typedef typename std::iterator_traits<IteratorType>::value_type char_type;
00448 typedef IteratorType base_iterator;
00452 typedef mapping<token_iterator<IteratorType,ValueType> > mapping_type;
00453
00457 token_iterator() :
00458 map_(0),
00459 offset_(0),
00460 mask_(0xFFFFFFFF),
00461 full_select_(false)
00462 {
00463 }
00464
00478
00479 token_iterator const &operator=(IteratorType p)
00480 {
00481 unsigned dist=std::distance(map_->begin_,p);
00482 impl::index_type::const_iterator b=map_->index_.begin(),e=map_->index_.end();
00483 impl::index_type::const_iterator
00484 bound=std::upper_bound(b,e,impl::break_info(dist));
00485 while(bound != e && (bound->mark & mask_)==0)
00486 bound++;
00487 offset_ = bound - b;
00488 return *this;
00489 }
00490
00497 token_iterator(mapping_type const &map,bool begin,unsigned mask) :
00498 map_(&map),
00499 mask_(mask),
00500 full_select_(false)
00501 {
00502 if(begin) {
00503 offset_ = 0;
00504 next();
00505 }
00506 else
00507 offset_=map_->index_.size();
00508 }
00512 token_iterator(token_iterator const &other) :
00513 map_(other.map_),
00514 offset_(other.offset_),
00515 mask_(other.mask_),
00516 full_select_(other.full_select_)
00517 {
00518 }
00519
00523 token_iterator const &operator=(token_iterator const &other)
00524 {
00525 if(this!=&other) {
00526 map_ = other.map_;
00527 offset_ = other.offset_;
00528 mask_=other.mask_;
00529 full_select_ = other.full_select_;
00530 }
00531 return *this;
00532 }
00533
00540 ValueType operator*() const
00541 {
00542 if(offset_ < 1 || offset_ >= map_->index_.size())
00543 throw std::out_of_range("Invalid token iterator location");
00544 unsigned pos=offset_-1;
00545 if(full_select_)
00546 while(!valid_offset(pos))
00547 pos--;
00548 base_iterator b=map_->begin_;
00549 unsigned b_off = map_->index_[pos].offset;
00550 std::advance(b,b_off);
00551 base_iterator e=b;
00552 unsigned e_off = map_->index_[offset_].offset;
00553 std::advance(e,e_off-b_off);
00554 return ValueType(b,e);
00555 }
00556
00560 token_iterator &operator++()
00561 {
00562 next();
00563 return *this;
00564 }
00565
00569 token_iterator &operator--()
00570 {
00571 prev();
00572 return *this;
00573 }
00574
00578 token_iterator operator++(int unused)
00579 {
00580 token_iterator tmp(*this);
00581 next();
00582 return tmp;
00583 }
00584
00588 token_iterator operator--(int unused)
00589 {
00590 token_iterator tmp(*this);
00591 prev();
00592 return tmp;
00593 }
00594
00598 bool full_select() const
00599 {
00600 return full_select_;
00601 }
00605 void full_select(bool fs)
00606 {
00607 full_select_ = fs;
00608 }
00609
00613 bool operator==(token_iterator const &other) const
00614 {
00615 return map_ == other.map_
00616 && offset_==other.offset_
00617 && mask_ == other.mask_;
00618 }
00619
00623 bool operator!=(token_iterator const &other) const
00624 {
00625 return !(*this==other);
00626 }
00627
00631 unsigned mark() const
00632 {
00633 return map_->index_.at(offset_).mark;
00634 }
00635
00636 private:
00637
00638 bool valid_offset(unsigned offset) const
00639 {
00640 return offset == 0
00641 || offset == map_->index_.size()
00642 || (map_->index_[offset].mark & mask_)!=0;
00643 }
00644
00645 bool at_end() const
00646 {
00647 return !map_ || offset_>=map_->index_.size();
00648 }
00649
00650 void next()
00651 {
00652 while(offset_ < map_->index_.size()) {
00653 offset_++;
00654 if(valid_offset(offset_))
00655 break;
00656 }
00657 }
00658
00659 void prev()
00660 {
00661 while(offset_ > 0) {
00662 offset_ --;
00663 if(valid_offset(offset_))
00664 break;
00665 }
00666 }
00667
00668 mapping_type const * map_;
00669 size_t offset_;
00670 unsigned mask_;
00671 uint32_t full_select_ : 1;
00672 uint32_t reserved_ : 31;
00673 };
00674
00675
00688 template<typename IteratorType>
00689 class break_iterator : public std::iterator<std::bidirectional_iterator_tag,IteratorType>
00690 {
00691 public:
00695 typedef typename std::iterator_traits<IteratorType>::value_type char_type;
00699 typedef IteratorType base_iterator;
00703 typedef mapping<break_iterator<IteratorType> > mapping_type;
00704
00708 break_iterator() :
00709 map_(0),
00710 offset_(0),
00711 mask_(0xFFFFFFFF)
00712 {
00713 }
00714
00718 break_iterator(break_iterator const &other):
00719 map_(other.map_),
00720 offset_(other.offset_),
00721 mask_(other.mask_)
00722 {
00723 }
00724
00728 break_iterator const &operator=(break_iterator const &other)
00729 {
00730 if(this!=&other) {
00731 map_ = other.map_;
00732 offset_ = other.offset_;
00733 mask_=other.mask_;
00734 }
00735 return *this;
00736 }
00737
00744 break_iterator(mapping_type const &map,bool begin,unsigned mask) :
00745 map_(&map),
00746 mask_(mask)
00747 {
00748 if(begin)
00749 offset_ = 0;
00750 else
00751 offset_=map_->index_.size();
00752 }
00753
00757 bool operator==(break_iterator const &other) const
00758 {
00759 return map_ == other.map_
00760 && offset_==other.offset_
00761 && mask_==other.mask_;
00762 }
00763
00767 bool operator!=(break_iterator const &other) const
00768 {
00769 return !(*this==other);
00770 }
00771
00775 unsigned mark() const
00776 {
00777 return map_->index_.at(offset_).mark;
00778 }
00779
00793 break_iterator const &operator=(base_iterator p)
00794 {
00795 at_least(p);
00796 return *this;
00797 }
00798
00805 base_iterator operator*() const
00806 {
00807 if(offset_ >=map_->index_.size())
00808 throw std::out_of_range("Invalid position of break iterator");
00809 base_iterator p = map_->begin_;
00810 std::advance(p, map_->index_[offset_].offset);
00811 return p;
00812 }
00813
00817 break_iterator &operator++()
00818 {
00819 next();
00820 return *this;
00821 }
00822
00826 break_iterator &operator--()
00827 {
00828 prev();
00829 return *this;
00830 }
00831
00835 break_iterator operator++(int unused)
00836 {
00837 break_iterator tmp(*this);
00838 next();
00839 return tmp;
00840 }
00841
00845 break_iterator operator--(int unused)
00846 {
00847 break_iterator tmp(*this);
00848 prev();
00849 return tmp;
00850 }
00851
00852 private:
00853 bool valid_offset(unsigned offset) const
00854 {
00855 return offset == 0
00856 || offset + 1 >= map_->index_.size()
00857 || (map_->index_[offset].mark & mask_)!=0;
00858 }
00859
00860 bool at_end() const
00861 {
00862 return !map_ || offset_>=map_->index_.size();
00863 }
00864
00865 void next()
00866 {
00867 while(offset_ < map_->index_.size()) {
00868 offset_++;
00869 if(valid_offset(offset_))
00870 break;
00871 }
00872 }
00873 void prev()
00874 {
00875 while(offset_ > 0) {
00876 offset_ --;
00877 if(valid_offset(offset_))
00878 break;
00879 }
00880 }
00881
00882 void at_least(IteratorType p)
00883 {
00884 unsigned diff = std::distance(map_->begin_,p);
00885
00886 impl::index_type::const_iterator b=map_->index_.begin();
00887 impl::index_type::const_iterator e=map_->index_.end();
00888 impl::index_type::const_iterator ptr = std::lower_bound(b,e,impl::break_info(diff));
00889
00890 if(ptr==map_->index_.end())
00891 offset_=map_->index_.size()-1;
00892 else
00893 offset_=ptr - map_->index_.begin();
00894
00895 while(!valid_offset(offset_))
00896 offset_ ++;
00897 }
00898
00899 mapping_type const * map_;
00900 size_t offset_;
00901 unsigned mask_;
00902 uint32_t reserved_;
00903 };
00904
00908
00909 }
00910
00911 }
00912 }
00913
00920
00921 #ifdef BOOST_MSVC
00922 #pragma warning(pop)
00923 #endif
00924
00925 #endif
00926