@@ -122,6 +122,11 @@ enum RegionalState {
122122 Unknown ,
123123}
124124
125+ fn is_emoji ( ch : char ) -> bool { // Returns true when `ch`'s emoji category is Extended_Pictographic.
126+ use tables:: emoji; // function-local import; keeps the `emoji` name out of the enclosing module scope
127+ emoji:: emoji_category ( ch) == emoji:: EmojiCat :: EC_Extended_Pictographic // category lookup compared against the single EP variant
128+ }
129+
125130impl < ' a > Iterator for UWordBounds < ' a > {
126131 type Item = & ' a str ;
127132
@@ -182,26 +187,18 @@ impl<'a> Iterator for UWordBounds<'a> {
182187 // WB4 makes all ZWJs collapse into the previous state
183188 // but you can still be in a Zwj state if you started with Zwj
184189 //
185- // This means that Zwj + Extend will collapse into Zwj, which is wrong,
186- // since Extend has a boundary with following EBG/GAZ chars but ZWJ doesn't,
187- // and that rule (WB3c) has higher priority
188- //
189- // Additionally, Emoji_Base+ZWJ+(EBG/GAZ) will collapse into Emoji_Base+EBG/GAZ
190- // which won't have a boundary even though EB+ZWJ+GAZ should have a boundary.
190+ // This means that an EP + Zwj will collapse into EP, which is wrong,
191+ // since EP+EP is a boundary but EP+ZWJ+EP is not (WB3c)
191192 //
192193 // Thus, we separately keep track of whether or not the last character
193194 // was a ZWJ. This is an additional bit of state tracked outside of the
194195 // state enum; the state enum represents the last non-zwj state encountered.
195196 // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
196197 // however we are in the previous state for the purposes of all other rules.
197198 if prev_zwj {
198- match cat {
199- wd:: WC_Glue_After_Zwj => continue ,
200- wd:: WC_E_Base_GAZ => {
201- state = Emoji ;
202- continue ;
203- } ,
204- _ => ( )
199+ if is_emoji ( ch) {
200+ state = Emoji ;
201+ continue ;
205202 }
206203 }
207204 // Don't use `continue` in this match without updating `cat`
@@ -222,7 +219,6 @@ impl<'a> Iterator for UWordBounds<'a> {
222219 wd:: WC_Regional_Indicator => Regional ( RegionalState :: Half ) , // rule WB13c
223220 wd:: WC_LF | wd:: WC_Newline => break , // rule WB3a
224221 wd:: WC_ZWJ => Zwj , // rule WB3c
225- wd:: WC_E_Base | wd:: WC_E_Base_GAZ => Emoji , // rule WB14
226222 _ => {
227223 if let Some ( ncat) = self . get_next_cat ( idx) { // rule WB4
228224 if ncat == wd:: WC_Format || ncat == wd:: WC_Extend || ncat == wd:: WC_ZWJ {
@@ -235,9 +231,7 @@ impl<'a> Iterator for UWordBounds<'a> {
235231 }
236232 } ,
237233 Zwj => {
238- // We already handle WB3c above. At this point,
239- // the current category is not GAZ or EBG,
240- // or the previous character was not actually a ZWJ
234+ // We already handle WB3c above.
241235 take_curr = false ;
242236 break ;
243237 }
@@ -313,12 +307,10 @@ impl<'a> Iterator for UWordBounds<'a> {
313307 }
314308 } ,
315309 Regional ( _) => unreachable ! ( "RegionalState::Unknown should not occur on forward iteration" ) ,
316- Emoji => match cat { // rule WB14
317- wd:: WC_E_Modifier => state,
318- _ => {
319- take_curr = false ;
320- break ;
321- }
310+ Emoji => {
311+ // We already handle WB3c above. If you've reached this point, the emoji sequence is over.
312+ take_curr = false ;
313+ break ;
322314 } ,
323315 FormatExtend ( t) => match t { // handle FormatExtends depending on what type
324316 RequireNumeric if cat == wd:: WC_Numeric => Numeric , // rule WB11
@@ -422,20 +414,19 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
422414 // Don't use `continue` in this match without updating `catb`
423415 state = match state {
424416 Start | FormatExtend ( AcceptAny ) => match cat {
417+ _ if is_emoji ( ch) => Zwj ,
425418 wd:: WC_ALetter => Letter , // rule WB5, WB7, WB10, WB13b
426419 wd:: WC_Hebrew_Letter => HLetter , // rule WB5, WB7, WB7c, WB10, WB13b
427420 wd:: WC_Numeric => Numeric , // rule WB8, WB9, WB11, WB13b
428421 wd:: WC_Katakana => Katakana , // rule WB13, WB13b
429422 wd:: WC_ExtendNumLet => ExtendNumLet , // rule WB13a
430423 wd:: WC_Regional_Indicator => Regional ( RegionalState :: Unknown ) , // rule WB13c
431- wd:: WC_Glue_After_Zwj | wd:: WC_E_Base_GAZ => Zwj , // rule WB3c
432424 // rule WB4:
433425 wd:: WC_Extend | wd:: WC_Format | wd:: WC_ZWJ => FormatExtend ( AcceptAny ) ,
434426 wd:: WC_Single_Quote => {
435427 saveidx = idx;
436428 FormatExtend ( AcceptQLetter ) // rule WB7a
437429 } ,
438- wd:: WC_E_Modifier => Emoji , // rule WB14
439430 wd:: WC_CR | wd:: WC_LF | wd:: WC_Newline => {
440431 if state == Start {
441432 if cat == wd:: WC_LF {
@@ -539,11 +530,10 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
539530 break ;
540531 }
541532 } ,
542- Emoji => match cat { // rule WB14
543- wd :: WC_E_Base | wd :: WC_E_Base_GAZ => {
533+ Emoji => {
534+ if is_emoji ( ch ) { // rule WB3c
544535 Zwj
545- } ,
546- _ => {
536+ } else {
547537 take_curr = false ;
548538 break ;
549539 }
0 commit comments