99// except according to those terms.
1010
1111use core:: cmp;
12+ use core:: iter:: Filter ;
1213
1314// All of the logic for forward iteration over sentences
1415mod fwd {
@@ -40,6 +41,7 @@ mod fwd {
4041 StatePart :: Sot
4142 ] ) ;
4243
44+ #[ derive( Clone ) ]
4345 pub struct SentenceBreaks < ' a > {
4446 pub string : & ' a str ,
4547 pos : usize ,
@@ -256,13 +258,32 @@ mod fwd {
256258
257259}
258260
261+ /// An iterator over the substrings of a string which, after splitting the string on
262+ /// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
263+ /// contain any characters with the
264+ /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
265+ /// property, or with
266+ /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
267+ #[ derive( Clone ) ]
268+ pub struct UnicodeSentences < ' a > {
269+ inner : Filter < USentenceBounds < ' a > , fn ( & & str ) -> bool > ,
270+ }
271+
259272/// External iterator for a string's
260273/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
274+ #[ derive( Clone ) ]
261275pub struct USentenceBounds < ' a > {
262276 iter : fwd:: SentenceBreaks < ' a > ,
263277 sentence_start : Option < usize >
264278}
265279
280+ /// External iterator for sentence boundaries and byte offsets.
281+ #[ derive( Clone ) ]
282+ pub struct USentenceBoundIndices < ' a > {
283+ start_offset : usize ,
284+ iter : USentenceBounds < ' a > ,
285+ }
286+
266287#[ inline]
267288pub fn new_sentence_bounds < ' a > ( source : & ' a str ) -> USentenceBounds < ' a > {
268289 USentenceBounds {
@@ -271,6 +292,32 @@ pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
271292 }
272293}
273294
295+ #[ inline]
296+ pub fn new_sentence_bound_indices < ' a > ( source : & ' a str ) -> USentenceBoundIndices < ' a > {
297+ USentenceBoundIndices {
298+ start_offset : source. as_ptr ( ) as usize ,
299+ iter : new_sentence_bounds ( source)
300+ }
301+ }
302+
303+ #[ inline]
304+ pub fn new_unicode_sentences < ' b > ( s : & ' b str ) -> UnicodeSentences < ' b > {
305+ use super :: UnicodeSegmentation ;
306+ use tables:: util:: is_alphanumeric;
307+
308+ fn has_alphanumeric ( s : & & str ) -> bool { s. chars ( ) . any ( |c| is_alphanumeric ( c) ) }
309+ let has_alphanumeric: fn ( & & str ) -> bool = has_alphanumeric; // coerce to fn pointer
310+
311+ UnicodeSentences { inner : s. split_sentence_bounds ( ) . filter ( has_alphanumeric) }
312+ }
313+
314+ impl < ' a > Iterator for UnicodeSentences < ' a > {
315+ type Item = & ' a str ;
316+
317+ #[ inline]
318+ fn next ( & mut self ) -> Option < & ' a str > { self . inner . next ( ) }
319+ }
320+
274321impl < ' a > Iterator for USentenceBounds < ' a > {
275322 type Item = & ' a str ;
276323
@@ -300,3 +347,17 @@ impl<'a> Iterator for USentenceBounds<'a> {
300347 }
301348 }
302349}
350+
351+ impl < ' a > Iterator for USentenceBoundIndices < ' a > {
352+ type Item = ( usize , & ' a str ) ;
353+
354+ #[ inline]
355+ fn next ( & mut self ) -> Option < ( usize , & ' a str ) > {
356+ self . iter . next ( ) . map ( |s| ( s. as_ptr ( ) as usize - self . start_offset , s) )
357+ }
358+
359+ #[ inline]
360+ fn size_hint ( & self ) -> ( usize , Option < usize > ) {
361+ self . iter . size_hint ( )
362+ }
363+ }
0 commit comments