@@ -16,6 +16,8 @@ mod fwd {
1616 use tables:: sentence:: SentenceCat ;
1717 use core:: cmp;
1818
19+ // Describes a parsed part of the source string, as listed in this table:
20+ // https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
1921 #[ derive( Clone , Copy , PartialEq , Eq ) ]
2022 enum StatePart {
2123 Sot ,
@@ -49,6 +51,8 @@ mod fwd {
4951 }
5052
5153 impl SentenceBreaksState {
54+ // Attempt to advance the internal state by one part
55+ // Whitespace and some punctuation will be collapsed
5256 fn next ( & self , cat : SentenceCat ) -> SentenceBreaksState {
5357 let & SentenceBreaksState ( parts) = self ;
5458 let parts = match ( parts[ 3 ] , cat) {
@@ -85,27 +89,28 @@ mod fwd {
8589 ] )
8690 }
8791
92+ // Helper function to check if state head matches a single `StatePart`
// Returns `true` when `part` equals the entry at index 3 of the state's
// parts array (the slot the surrounding comments call the state head).
8893 fn match1 ( & self , part : StatePart ) -> bool {
// Destructure the tuple struct through the reference to get the parts array.
8994 let & SentenceBreaksState ( parts) = self ;
// Only index 3 is inspected; the other slots are ignored here.
9095 part == parts[ 3 ]
9196 }
9297
98+ // Helper function to check if the two `StatePart`s at the head of the state
99+ // (slots 2 and 3, in that order) match the given two
// Returns `true` only when `part1` equals the entry at index 2 AND `part2`
// equals the entry at index 3 of the state's parts array.
93100 fn match2 ( & self , part1 : StatePart , part2 : StatePart ) -> bool {
// Destructure the tuple struct through the reference to get the parts array.
94101 let & SentenceBreaksState ( parts) = self ;
// Argument order follows slot order: `part1` is checked against index 2,
// `part2` against index 3.
95102 part1 == parts[ 2 ] && part2 == parts[ 3 ]
96103 }
97104 }
98105
106+ // https://unicode.org/reports/tr29/#SB8
107+ // TODO cache this, it is currently quadratic
99108 fn match_sb8 ( state : & SentenceBreaksState , ahead : & str ) -> bool {
100- let aterm_part = {
101- // ATerm Close* Sp*
102- let & SentenceBreaksState ( parts) = state;
103- let mut idx = if parts[ 3 ] == StatePart :: SpPlus { 2 } else { 3 } ;
104- if parts[ idx] == StatePart :: ClosePlus { idx -= 1 }
105- parts[ idx]
106- } ;
109+ let & SentenceBreaksState ( parts) = state;
110+ let mut idx = if parts[ 3 ] == StatePart :: SpPlus { 2 } else { 3 } ;
111+ if parts[ idx] == StatePart :: ClosePlus { idx -= 1 }
107112
108- if aterm_part == StatePart :: ATerm {
113+ if parts [ idx ] == StatePart :: ATerm {
109114 use tables:: sentence as se;
110115
111116 for next_char in ahead. chars ( ) {
@@ -124,6 +129,7 @@ mod fwd {
124129 false
125130 }
126131
132+ // https://unicode.org/reports/tr29/#SB8a
127133 fn match_sb8a ( state : & SentenceBreaksState ) -> bool {
128134 // SATerm Close* Sp*
129135 let & SentenceBreaksState ( parts) = state;
@@ -132,13 +138,15 @@ mod fwd {
132138 parts[ idx] == StatePart :: STerm || parts[ idx] == StatePart :: ATerm
133139 }
134140
141+ // https://unicode.org/reports/tr29/#SB9
// Checks whether the recorded state ends in a sentence terminator optionally
// followed by a `ClosePlus` entry. Returns `true` when the inspected slot is
// `STerm` or `ATerm`, `false` otherwise.
135142 fn match_sb9 ( state : & SentenceBreaksState ) -> bool {
136143 // SATerm Close*
137144 let & SentenceBreaksState ( parts) = state;
// If index 3 holds `ClosePlus`, step back one slot and inspect index 2
// instead; otherwise inspect index 3 directly.
// NOTE(review): skipping a single slot presumably suffices because a run of
// Close characters is collapsed into one `ClosePlus` entry — confirm in `next`.
138145 let idx = if parts[ 3 ] == StatePart :: ClosePlus { 2 } else { 3 } ;
// Either terminator kind satisfies the "SATerm" part of the pattern.
139146 parts[ idx] == StatePart :: STerm || parts[ idx] == StatePart :: ATerm
140147 }
141148
149+ // https://unicode.org/reports/tr29/#SB11
142150 fn match_sb11 ( state : & SentenceBreaksState ) -> bool {
143151 // SATerm Close* Sp* ParaSep?
144152 let & SentenceBreaksState ( parts) = state;
@@ -180,67 +188,69 @@ mod fwd {
180188 self . state = self . state . next ( next_cat) ;
181189
182190 match next_cat {
183- // SB1
191+ // SB1 https://unicode.org/reports/tr29/#SB1
184192 _ if state_before. match1 ( StatePart :: Sot ) =>
185193 return Some ( position_before) ,
186194
187- // SB3
195+ // SB2 is handled when inner iterator (chars) is finished
196+
197+ // SB3 https://unicode.org/reports/tr29/#SB3
188198 SentenceCat :: SC_LF if state_before. match1 ( StatePart :: CR ) =>
189199 continue ,
190200
191- // SB4
201+ // SB4 https://unicode.org/reports/tr29/#SB4
192202 _ if state_before. match1 ( StatePart :: Sep )
193203 || state_before. match1 ( StatePart :: CR )
194204 || state_before. match1 ( StatePart :: LF )
195205 => return Some ( position_before) ,
196206
197- // SB5
207+ // SB5 https://unicode.org/reports/tr29/#SB5
198208 SentenceCat :: SC_Extend |
199209 SentenceCat :: SC_Format => self . state = state_before,
200210
201- // SB6
211+ // SB6 https://unicode.org/reports/tr29/#SB6
202212 SentenceCat :: SC_Numeric if state_before. match1 ( StatePart :: ATerm ) =>
203213 continue ,
204214
205- // SB7
215+ // SB7 https://unicode.org/reports/tr29/#SB7
206216 SentenceCat :: SC_Upper if state_before. match2 ( StatePart :: UpperLower , StatePart :: ATerm ) =>
207217 continue ,
208218
209- // SB8
219+ // SB8 https://unicode.org/reports/tr29/#SB8
210220 _ if match_sb8 ( & state_before, & self . string [ position_before..] ) =>
211221 continue ,
212222
213- // SB8a
223+ // SB8a https://unicode.org/reports/tr29/#SB8a
214224 SentenceCat :: SC_SContinue |
215225 SentenceCat :: SC_STerm |
216226 SentenceCat :: SC_ATerm if match_sb8a ( & state_before) =>
217227 continue ,
218228
219- // SB9
229+ // SB9 https://unicode.org/reports/tr29/#SB9
220230 SentenceCat :: SC_Close |
221231 SentenceCat :: SC_Sp |
222232 SentenceCat :: SC_Sep |
223233 SentenceCat :: SC_CR |
224234 SentenceCat :: SC_LF if match_sb9 ( & state_before) =>
225235 continue ,
226236
227- // SB10
237+ // SB10 https://unicode.org/reports/tr29/#SB10
228238 SentenceCat :: SC_Sp |
229239 SentenceCat :: SC_Sep |
230240 SentenceCat :: SC_CR |
231241 SentenceCat :: SC_LF if match_sb8a ( & state_before) =>
232242 continue ,
233243
234- // SB11
244+ // SB11 https://unicode.org/reports/tr29/#SB11
235245 _ if match_sb11 ( & state_before) =>
236246 return Some ( position_before) ,
237247
238- // SB998
248+ // SB998 https://unicode.org/reports/tr29/#SB998
239249 _ => continue
240250 }
241251 }
242252
243- // SB2
253+ // SB2 https://unicode.org/reports/tr29/#SB2
244254 if self . state . match1 ( StatePart :: Sot ) {
245255 None
246256 } else if self . state . match1 ( StatePart :: Eot ) {
0 commit comments