// supa_mdx_lint/utils/words.rs

1use bon::builder;
2use crop::RopeSlice;
3use log::trace;
4
/// Flag attached to every word produced by the word iterator, saying whether
/// the word sits in a position that calls for capitalization.
///
/// The parser seeds the flag from `WordIteratorOptions::initial_capitalize`
/// and recomputes it each time a word ends, based on the punctuation (if any)
/// that closed the word.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum Capitalize {
    /// The word is not in a capitalization-triggering position.
    False,
    /// The word follows capitalization-triggering punctuation, or is the first
    /// word of an iterator created with `initial_capitalize` set to `True`.
    True,
}
10
/// Selects which punctuation, in addition to the dash characters that always
/// split words (`–`, `—`, `―`), ends a word immediately when it appears inside
/// or directly after the word.
#[derive(Debug, Default, Eq, PartialEq)]
pub(crate) enum BreakOnPunctuation {
    /// No extra breaking punctuation: an ASCII hyphen stays inside the word.
    #[default]
    None,
    /// An ASCII hyphen (`-`) also splits the word.
    #[allow(dead_code)]
    Hyphen,
}
18
/// Selects which punctuation marks, when they close a word, flag the next
/// word for capitalization.
#[derive(Debug, Default, Eq, PartialEq)]
pub(crate) enum CapitalizeTriggerPunctuation {
    /// Only `!`, `?` and `.` trigger capitalization.
    #[default]
    Standard,
    /// The standard set plus the colon (`:`).
    PlusColon,
}
25
/// Iterator that splits a rope slice into words, yielding for each word its
/// byte offset, its text, and its capitalization flag.
#[derive(Debug)]
pub struct WordIterator<'rope> {
    /// The text being split into words.
    rope: RopeSlice<'rope>,
    /// Amount added to every yielded offset, so that offsets can be expressed
    /// relative to something larger than `rope`.
    offset_from_parent: usize,
    /// State machine that performs the actual word detection.
    parser: WordParser,
}
32
/// Configuration consumed by `WordIterator::new`.
pub(crate) struct WordIteratorOptions {
    /// Capitalization flag reported for the first word.
    pub(crate) initial_capitalize: Capitalize,
    /// Which additional punctuation splits a word immediately.
    pub(crate) break_on_punctuation: BreakOnPunctuation,
    /// Which word-closing punctuation flags the following word for
    /// capitalization.
    pub(crate) capitalize_trigger_punctuation: CapitalizeTriggerPunctuation,
}
38
39impl Default for WordIteratorOptions {
40    fn default() -> Self {
41        Self {
42            initial_capitalize: Capitalize::False,
43            break_on_punctuation: Default::default(),
44            capitalize_trigger_punctuation: Default::default(),
45        }
46    }
47}
48
49impl<'rope> WordIterator<'rope> {
50    pub fn new(
51        rope: RopeSlice<'rope>,
52        offset_from_parent: usize,
53        options: WordIteratorOptions,
54    ) -> Self {
55        Self {
56            rope,
57            offset_from_parent,
58            parser: WordParser::new(
59                options.initial_capitalize,
60                options.break_on_punctuation,
61                options.capitalize_trigger_punctuation,
62            ),
63        }
64    }
65
66    pub fn curr_index(&self) -> Option<usize> {
67        if let ParseState::Initial = self.parser.state {
68            assert!(self.parser.word_start_offset == self.parser.tracking_offset);
69            Some(self.parser.word_start_offset)
70        } else {
71            None
72        }
73    }
74
75    pub fn next_capitalize(&self) -> Option<Capitalize> {
76        if let ParseState::Initial = self.parser.state {
77            Some(self.parser.capitalize)
78        } else {
79            None
80        }
81    }
82
83    pub(crate) fn collect_remainder(self) -> Option<String> {
84        assert!(self.parser.word_start_offset == self.parser.tracking_offset);
85        if self.parser.word_start_offset == self.rope.byte_len() {
86            None
87        } else {
88            Some(
89                self.rope
90                    .byte_slice(self.parser.word_start_offset..)
91                    .to_string(),
92            )
93        }
94    }
95}
96
/// One word as produced by the word iterator: the word's byte offset, the
/// slice holding the word's text, and the word's capitalization flag.
pub(crate) type WordIteratorItem<'r> = (usize, RopeSlice<'r>, Capitalize);
98
99impl<'rope> Iterator for WordIterator<'rope> {
100    type Item = WordIteratorItem<'rope>;
101
102    fn next(&mut self) -> Option<Self::Item> {
103        let next_word_data = self.parser.parse(self.rope);
104
105        if let Some((offset, slice, capitalize)) = next_word_data {
106            Some((offset + self.offset_from_parent, slice, capitalize))
107        } else {
108            None
109        }
110    }
111}
112
/// Character-by-character state machine that finds word boundaries in a rope.
#[derive(Debug)]
struct WordParser {
    /// What kind of character was consumed last.
    state: ParseState,
    /// Byte offset at which the word currently being built starts.
    word_start_offset: usize,
    /// Byte offset just past the last consumed character.
    tracking_offset: usize,
    /// Capitalization flag that will be reported for the current word.
    capitalize: Capitalize,
    /// Which additional punctuation splits a word immediately.
    break_on_punctuation: BreakOnPunctuation,
    /// Which word-closing punctuation flags the next word for capitalization.
    capitalize_trigger_punctuation: CapitalizeTriggerPunctuation,
}
122
/// Describes the most recently consumed character, which decides how the next
/// character is handled.
#[derive(Debug, Default)]
enum ParseState {
    /// Nothing consumed yet for the next word; also the state right after a
    /// word has been emitted.
    #[default]
    Initial,
    /// The last character was an ASCII letter.
    AsciiAlphabetic,
    /// The last character was a non-ASCII alphabetic character.
    OtherAlphabetic,
    /// The last character was an ASCII digit.
    Numeric,
    /// Whitespace in front of a word is being skipped.
    Whitespace,
    /// The last character was a backslash; the next character is absorbed
    /// into the word whatever it is.
    Escape,
    /// The last character was the one absorbed right after a backslash.
    PostEscape,
    /// Punctuation seen before any word character. Holds the punctuation
    /// collected so far; it is excluded from the word.
    PunctuationLeading(String),
    /// Punctuation seen after word characters. Holds the punctuation collected
    /// so far; it stays in the word only if more word characters follow.
    PunctuationTrailing(String),
    /// The last character fell into none of the other categories.
    Other,
}
137
/// Outcome of feeding one character to the parser.
#[derive(Debug, Clone)]
enum ParserNext {
    /// The current word is not finished; keep consuming characters.
    Continue,
    /// A word is complete. Carries the word's start offset, its end offset
    /// (exclusive), and the capitalization flag to report for it.
    Break(usize, usize, Capitalize),
}
143
144impl WordParser {
145    fn new(
146        initial_capitalize: Capitalize,
147        break_on_punctuation: BreakOnPunctuation,
148        capitalize_trigger_punctuation: CapitalizeTriggerPunctuation,
149    ) -> Self {
150        Self {
151            state: ParseState::Initial,
152            word_start_offset: 0,
153            tracking_offset: 0,
154            capitalize: initial_capitalize,
155            break_on_punctuation,
156            capitalize_trigger_punctuation,
157        }
158    }
159
160    fn parse<'rope>(&mut self, rope: RopeSlice<'rope>) -> Option<WordIteratorItem<'rope>> {
161        assert!(self.word_start_offset == self.tracking_offset);
162        if self.word_start_offset >= rope.byte_len() {
163            return None;
164        }
165
166        let chars = rope.byte_slice(self.word_start_offset..).chars();
167        for c in chars {
168            trace!("Parser loop iteration:");
169            trace!("  state: {:?}", self.state);
170            trace!("  word_start_offset: {}", self.word_start_offset);
171            trace!("  tracking_offset: {}", self.tracking_offset);
172            trace!(
173                "  word so far: {}",
174                rope.byte_slice(self.word_start_offset..self.tracking_offset)
175            );
176            trace!("  char: {c}");
177
178            let next = match c {
179                c if c.is_ascii_alphabetic() => self.consume_ascii_alphabetic(),
180                '0'..='9' => self.consume_numeric(),
181                _ if c.is_alphabetic() => self.consume_other_alphabetic(c),
182                _ if c.is_whitespace() => self.consume_whitespace(c),
183                '\\' => self.consume_escape(),
184                _ if is_punctuation(&c) => self.consume_punctuation(c),
185                _ => self.consume_other(c),
186            };
187
188            if let ParserNext::Break(start, end, capitalize) = next {
189                trace!("Break parser at word end with start: {start}, end: {end}");
190                self.word_start_offset = self.tracking_offset;
191                return Some((start, rope.byte_slice(start..end), capitalize));
192            }
193        }
194
195        let saved_start_offset = self.word_start_offset;
196        let word_end_offset = self.calculate_final_word_end_offset();
197
198        // Reset state to parse next word
199        self.state = ParseState::Initial;
200        self.word_start_offset = self.tracking_offset;
201
202        if saved_start_offset == word_end_offset {
203            None
204        } else {
205            Some((
206                saved_start_offset,
207                rope.byte_slice(saved_start_offset..word_end_offset),
208                self.capitalize,
209            ))
210        }
211    }
212
213    fn consume_ascii_alphabetic(&mut self) -> ParserNext {
214        trace!("consume_ascii_alphabetic");
215        match &self.state {
216            ParseState::Escape => {
217                self.state = ParseState::PostEscape;
218                self.tracking_offset += 1;
219                ParserNext::Continue
220            }
221            _ => {
222                self.state = ParseState::AsciiAlphabetic;
223                self.tracking_offset += 1;
224                ParserNext::Continue
225            }
226        }
227    }
228
229    fn consume_other_alphabetic(&mut self, c: char) -> ParserNext {
230        trace!("consume_other_alphabetic: {c}");
231        match &self.state {
232            ParseState::Escape => {
233                self.state = ParseState::PostEscape;
234                self.tracking_offset += c.len_utf8();
235                ParserNext::Continue
236            }
237            _ => {
238                self.state = ParseState::OtherAlphabetic;
239                self.tracking_offset += c.len_utf8();
240                ParserNext::Continue
241            }
242        }
243    }
244
245    fn consume_numeric(&mut self) -> ParserNext {
246        trace!("consume_numeric");
247        match &self.state {
248            ParseState::Escape => {
249                self.state = ParseState::PostEscape;
250                self.tracking_offset += 1;
251                ParserNext::Continue
252            }
253            _ => {
254                self.state = ParseState::Numeric;
255                self.tracking_offset += 1;
256                ParserNext::Continue
257            }
258        }
259    }
260
261    fn consume_whitespace(&mut self, c: char) -> ParserNext {
262        trace!("consume_whitespace: {c}");
263        match &self.state {
264            ParseState::Initial | ParseState::PunctuationLeading(_) => {
265                self.state = ParseState::Whitespace;
266                self.word_start_offset += c.len_utf8();
267                self.tracking_offset += c.len_utf8();
268                ParserNext::Continue
269            }
270            ParseState::AsciiAlphabetic
271            | ParseState::OtherAlphabetic
272            | ParseState::Numeric
273            | ParseState::Other
274            | ParseState::PostEscape => {
275                let word_end_offset = self.tracking_offset;
276                let curr_capitalize = self.capitalize;
277
278                self.state = ParseState::Initial;
279                self.tracking_offset += c.len_utf8();
280                self.capitalize = Capitalize::False;
281
282                ParserNext::Break(self.word_start_offset, word_end_offset, curr_capitalize)
283            }
284            ParseState::Whitespace => {
285                self.word_start_offset += c.len_utf8();
286                self.tracking_offset += c.len_utf8();
287                ParserNext::Continue
288            }
289            ParseState::Escape => {
290                self.state = ParseState::PostEscape;
291                self.tracking_offset += c.len_utf8();
292                ParserNext::Continue
293            }
294            ParseState::PunctuationTrailing(punctuation) => {
295                // If the word ends with a hyphen, preserve the hyphen so we
296                // can capture bare prefixes like `pre-` and `post-`
297                let preserve_punctuation = punctuation == "-";
298
299                let word_end_offset = if preserve_punctuation {
300                    self.tracking_offset
301                } else {
302                    self.tracking_offset.saturating_sub(punctuation.len())
303                };
304                let curr_capitalize = self.capitalize;
305
306                if let Some(p) = punctuation.chars().last() {
307                    self.capitalize = self.punc_triggers_capitalization(&p);
308                }
309                self.state = ParseState::Initial;
310                self.tracking_offset += c.len_utf8();
311
312                ParserNext::Break(self.word_start_offset, word_end_offset, curr_capitalize)
313            }
314        }
315    }
316
317    fn consume_punctuation(&mut self, c: char) -> ParserNext {
318        trace!("consume_punctuation: {c}");
319        match &self.state {
320            ParseState::Initial | ParseState::Whitespace => {
321                self.state = ParseState::PunctuationLeading(c.to_string());
322                self.word_start_offset += c.len_utf8();
323                self.tracking_offset += c.len_utf8();
324                ParserNext::Continue
325            }
326            ParseState::AsciiAlphabetic
327            | ParseState::OtherAlphabetic
328            | ParseState::Numeric
329            | ParseState::Other
330            | ParseState::PostEscape => {
331                if self.break_word_immediately_on_puncutation(&c) {
332                    let word_end_offset = self.tracking_offset;
333                    let curr_capitalize = self.capitalize;
334
335                    self.capitalize = self.punc_triggers_capitalization(&c);
336                    self.state = ParseState::Initial;
337                    self.tracking_offset += c.len_utf8();
338
339                    ParserNext::Break(self.word_start_offset, word_end_offset, curr_capitalize)
340                } else {
341                    self.state = ParseState::PunctuationTrailing(c.to_string());
342                    self.tracking_offset += c.len_utf8();
343                    ParserNext::Continue
344                }
345            }
346            ParseState::Escape => {
347                self.state = ParseState::PostEscape;
348                self.tracking_offset += c.len_utf8();
349                ParserNext::Continue
350            }
351            ParseState::PunctuationLeading(punctuation) => {
352                self.state = ParseState::PunctuationLeading(format!("{punctuation}{c}"));
353                self.word_start_offset += c.len_utf8();
354                self.tracking_offset += c.len_utf8();
355                ParserNext::Continue
356            }
357            ParseState::PunctuationTrailing(punctuation) => {
358                if self.break_word_immediately_on_puncutation(&c) {
359                    let word_end_offset = self.tracking_offset.saturating_sub(punctuation.len());
360                    let curr_capitalize = self.capitalize;
361
362                    self.capitalize = self.punc_triggers_capitalization(&c);
363                    self.state = ParseState::Initial;
364                    self.tracking_offset += c.len_utf8();
365
366                    ParserNext::Break(self.word_start_offset, word_end_offset, curr_capitalize)
367                } else {
368                    self.state = ParseState::PunctuationTrailing(format!("{punctuation}{c}"));
369                    self.tracking_offset += c.len_utf8();
370                    ParserNext::Continue
371                }
372            }
373        }
374    }
375
376    fn consume_escape(&mut self) -> ParserNext {
377        trace!("consume_escape");
378        match &self.state {
379            ParseState::Escape => {
380                self.state = ParseState::PostEscape;
381                self.tracking_offset += 1;
382                ParserNext::Continue
383            }
384            _ => {
385                self.state = ParseState::Escape;
386                self.tracking_offset += 1;
387                ParserNext::Continue
388            }
389        }
390    }
391
392    fn consume_other(&mut self, c: char) -> ParserNext {
393        trace!("consume_other: {c}");
394        match &self.state {
395            ParseState::Escape => {
396                self.state = ParseState::PostEscape;
397                self.tracking_offset += c.len_utf8();
398                ParserNext::Continue
399            }
400            _ => {
401                self.state = ParseState::Other;
402                self.tracking_offset += c.len_utf8();
403                ParserNext::Continue
404            }
405        }
406    }
407
408    fn calculate_final_word_end_offset(&self) -> usize {
409        match &self.state {
410            ParseState::PunctuationTrailing(punctuation) => {
411                self.tracking_offset.saturating_sub(punctuation.len())
412            }
413            _ => self.tracking_offset,
414        }
415    }
416
417    fn punc_triggers_capitalization_std(c: &char) -> bool {
418        *c == '!' || *c == '?' || *c == '.'
419    }
420
421    fn punc_triggers_capitalization(&self, c: &char) -> Capitalize {
422        if Self::punc_triggers_capitalization_std(c)
423            || *c == ':'
424                && matches!(
425                    self.capitalize_trigger_punctuation,
426                    CapitalizeTriggerPunctuation::PlusColon
427                )
428        {
429            Capitalize::True
430        } else {
431            Capitalize::False
432        }
433    }
434
435    fn break_on_hyphens(&self) -> bool {
436        matches!(self.break_on_punctuation, BreakOnPunctuation::Hyphen)
437    }
438
439    fn break_word_immediately_on_puncutation(&self, c: &char) -> bool {
440        match c {
441            '–' | '—' | '―' => true,
442            '-' => self.break_on_hyphens(),
443            _ => false,
444        }
445    }
446}
447
/// Returns `true` if `c` is one of the punctuation characters the word parser
/// recognizes: dashes, brackets, quotes, and common sentence punctuation.
///
/// NOTE(review): the ellipsis `…` is not in this set even though it is listed
/// as sentence-ending punctuation elsewhere in this file — confirm intended.
pub fn is_punctuation(c: &char) -> bool {
    matches!(
        *c,
        // Dashes
        '-' | '–' | '—' | '―'
        // Brackets
        | '(' | ')' | '[' | ']' | '{' | '}'
        // Quotes
        | '\'' | '‘' | '’' | '“' | '”' | '"'
        // Sentence and clause punctuation
        | '!' | '?' | '.' | ',' | ':' | ';'
    )
}
472
/// Punctuation marks that can close a sentence.
const SENTENCE_ENDING_PUNCTUATION: &[char] = &['.', '!', '?', '…'];

/// Returns `true` if `c` is one of the sentence-closing punctuation marks.
fn is_sentence_ending_punctuation(c: &char) -> bool {
    SENTENCE_ENDING_PUNCTUATION
        .iter()
        .any(|ending| ending == c)
}
478
479#[builder]
480pub(crate) fn is_sentence_start(
481    slice: RopeSlice<'_>,
482    query_offset: usize,
483    #[builder(default = true)] count_beginning_as_sentence_start: bool,
484) -> bool {
485    #[cfg(debug_assertions)]
486    log::trace!("Checking if offset {query_offset} is at sentence start");
487
488    let mut iter = WordIterator::new(slice, 0, Default::default())
489        .enumerate()
490        .peekable();
491
492    let (preceding_offset, preceding_word, queried_offset, queried_word) = loop {
493        match (iter.next(), iter.peek()) {
494            (Some((0, _)), _) if query_offset == 0 && count_beginning_as_sentence_start => {
495                return count_beginning_as_sentence_start;
496            }
497            (
498                Some((_, (preceding_offset, preceding_word, _))),
499                Some((_, (next_word_offset, next_word, _))),
500            ) => {
501                if *next_word_offset == query_offset {
502                    break (
503                        preceding_offset,
504                        preceding_word,
505                        next_word_offset,
506                        next_word,
507                    );
508                }
509            }
510            _ => {
511                return false;
512            }
513        }
514    };
515
516    // A word in the middle of a text is at the start of a sentence if it is
517    // proceeded by a word immediately followed by punctuation. The punctuation
518    // _must_ include a sentence-closing punctuation mark, which may be
519    // surrounded by other punctuation. For example, `".)` would be a valid
520    // sentence-ending punctuation cluster.
521    //
522    // We're also going to check for capitalization to avoid false positives
523    // from punctuation clusters such as `(T.B.D.)`, though this will give us
524    // false negatives for some special cases of words that are allowed to
525    // be lowercase at sentence start. The number of these exceptions is
526    // relatively small, and for simplicity's sake we will ignore them.
527    if !(queried_word.is_char_boundary(0)
528        && queried_word
529            .chars()
530            .next()
531            .is_some_and(|c: char| c.is_uppercase()))
532    {
533        return false;
534    }
535
536    let between = slice
537        .byte_slice(preceding_offset + preceding_word.byte_len()..*queried_offset)
538        .chars();
539    #[cfg(debug_assertions)]
540    trace!(
541        "Parsing the between-sentence text: \"{}\"",
542        between.clone().collect::<String>()
543    );
544    between_sentence_parser::BetweenSentenceParser::new().parse(between)
545}
546
547mod between_sentence_parser {
548    #[cfg(debug_assertions)]
549    use log::trace;
550
551    #[derive(Debug)]
552    enum BetweenSentenceParserState {
553        Initial,
554        PrecedingPunctuation,
555        SentenceEndingPunctuation(EndingPunctuationType),
556        FollowingPunctuation,
557        Whitespace,
558        SentenceStartPunctuation,
559    }
560
561    #[derive(Debug)]
562    enum EndingPunctuationType {
563        Mixable,
564        NonMixable(char),
565    }
566
567    #[derive(Debug)]
568    pub(super) struct BetweenSentenceParser {
569        state: BetweenSentenceParserState,
570    }
571
572    impl BetweenSentenceParser {
573        pub(super) fn new() -> Self {
574            Self {
575                state: BetweenSentenceParserState::Initial,
576            }
577        }
578
579        pub(super) fn parse(&mut self, chars: impl Iterator<Item = char>) -> bool {
580            use BetweenSentenceParserState::*;
581
582            for char in chars {
583                #[cfg(debug_assertions)]
584                trace!("Parser state: {:?}", self.state);
585
586                match char {
587                    c if c.is_whitespace() => match self.state {
588                        SentenceEndingPunctuation(_) | FollowingPunctuation => {
589                            self.state = Whitespace;
590                        }
591                        Whitespace => {}
592                        _ => return false,
593                    },
594                    c if super::is_sentence_ending_punctuation(&c) => {
595                        let r#type = match c {
596                            '.' => EndingPunctuationType::NonMixable(c),
597                            _ => EndingPunctuationType::Mixable,
598                        };
599                        match self.state {
600                            Initial | PrecedingPunctuation => {
601                                self.state = SentenceEndingPunctuation(r#type);
602                            }
603                            SentenceEndingPunctuation(EndingPunctuationType::Mixable)
604                                if matches!(r#type, EndingPunctuationType::Mixable) => {}
605                            SentenceEndingPunctuation(EndingPunctuationType::NonMixable(old_c))
606                                if matches!(r#type, EndingPunctuationType::NonMixable(c) if c == old_c) =>
607                                {}
608                            _ => return false,
609                        }
610                    }
611                    c if super::is_punctuation(&c) => match self.state {
612                        Initial => {
613                            self.state = PrecedingPunctuation;
614                        }
615                        PrecedingPunctuation | FollowingPunctuation | SentenceStartPunctuation => {}
616                        SentenceEndingPunctuation(_) => {
617                            self.state = FollowingPunctuation;
618                        }
619                        Whitespace => self.state = SentenceStartPunctuation,
620                    },
621                    _ => return false,
622                }
623            }
624
625            matches!(self.state, Whitespace | SentenceStartPunctuation)
626        }
627    }
628}
629
630pub(crate) mod extras {
631    use std::collections::VecDeque;
632
633    use super::*;
634
635    pub(crate) struct WordIteratorExtension<'a, I> {
636        prefix: Option<I>,
637        inner: WordIterator<'a>,
638    }
639
640    impl<'a, I> From<WordIterator<'a>> for WordIteratorExtension<'a, I> {
641        fn from(inner: WordIterator<'a>) -> Self {
642            Self {
643                prefix: None,
644                inner,
645            }
646        }
647    }
648
649    impl<'a, I> WordIteratorExtension<'a, I>
650    where
651        I: Iterator<Item = WordIteratorItem<'a>>,
652    {
653        pub(crate) fn extend_on_prefix(self, prefix: I) -> Self {
654            Self {
655                prefix: Some(prefix),
656                inner: self.into_inner().1,
657            }
658        }
659
660        pub(crate) fn into_inner(self) -> (Option<I>, WordIterator<'a>) {
661            (self.prefix, self.inner)
662        }
663    }
664
665    impl<'a, I> Iterator for WordIteratorExtension<'a, I>
666    where
667        I: Iterator<Item = WordIteratorItem<'a>>,
668    {
669        type Item = WordIteratorItem<'a>;
670
671        fn next(&mut self) -> Option<Self::Item> {
672            match self.prefix {
673                Some(ref mut prefix) => prefix.next().or_else(|| self.inner.next()),
674                None => self.inner.next(),
675            }
676        }
677    }
678
679    #[cfg(test)]
680    pub(crate) struct UnitIterator<'a> {
681        _marker: std::marker::PhantomData<&'a ()>,
682    }
683
684    #[cfg(test)]
685    impl<'a> Iterator for UnitIterator<'a> {
686        type Item = WordIteratorItem<'a>;
687
688        fn next(&mut self) -> Option<Self::Item> {
689            None
690        }
691    }
692
693    pub(crate) struct WordIteratorPrefix<'a> {
694        inner: VecDeque<WordIteratorItem<'a>>,
695    }
696
697    impl<'a> WordIteratorPrefix<'a> {
698        pub(crate) fn new<I>(inner: I) -> Self
699        where
700            I: IntoIterator<Item = WordIteratorItem<'a>>,
701        {
702            Self {
703                inner: inner.into_iter().collect(),
704            }
705        }
706    }
707
708    impl<'a> Iterator for WordIteratorPrefix<'a> {
709        type Item = WordIteratorItem<'a>;
710
711        fn next(&mut self) -> Option<Self::Item> {
712            self.inner.pop_front()
713        }
714    }
715}
716
717#[cfg(test)]
718mod tests {
719    use super::*;
720    use crop::Rope;
721
722    #[test]
723    fn test_word_iterator_basic() {
724        let rope = Rope::from("hello world");
725        let slice = rope.byte_slice(..);
726        let mut iter = WordIterator::new(slice, 0, Default::default());
727
728        let (offset, word, cap) = iter.next().unwrap();
729        assert_eq!(offset, 0);
730        assert_eq!(word.to_string(), "hello");
731        assert_eq!(cap, Capitalize::False);
732
733        let (offset, word, cap) = iter.next().unwrap();
734        assert_eq!(offset, 6);
735        assert_eq!(word.to_string(), "world");
736        assert_eq!(cap, Capitalize::False);
737
738        assert!(iter.next().is_none());
739    }
740
741    #[test]
742    fn test_word_iterator_with_punctuation() {
743        let rope = Rope::from("hello, world!");
744        let slice = rope.byte_slice(..);
745        let mut iter = WordIterator::new(slice, 0, Default::default());
746
747        let (offset, word, cap) = iter.next().unwrap();
748        assert_eq!(offset, 0);
749        assert_eq!(word.to_string(), "hello");
750        assert_eq!(cap, Capitalize::False);
751
752        let (offset, word, cap) = iter.next().unwrap();
753        assert_eq!(offset, 7);
754        assert_eq!(word.to_string(), "world");
755        assert_eq!(cap, Capitalize::False);
756
757        assert!(iter.next().is_none());
758    }
759
760    #[test]
761    fn test_word_iterator_with_multiple_spaces() {
762        let rope = Rope::from("hello   world");
763        let slice = rope.byte_slice(..);
764        let mut iter = WordIterator::new(slice, 0, Default::default());
765
766        let (offset, word, cap) = iter.next().unwrap();
767        assert_eq!(offset, 0);
768        assert_eq!(word.to_string(), "hello");
769        assert_eq!(cap, Capitalize::False);
770
771        let (offset, word, cap) = iter.next().unwrap();
772        assert_eq!(offset, 8);
773        assert_eq!(word.to_string(), "world");
774        assert_eq!(cap, Capitalize::False);
775
776        assert!(iter.next().is_none());
777    }
778
779    #[test]
780    fn test_word_iterator_with_numbers() {
781        let rope = Rope::from("test123 456");
782        let slice = rope.byte_slice(..);
783        let mut iter = WordIterator::new(slice, 0, Default::default());
784
785        let (offset, word, cap) = iter.next().unwrap();
786        assert_eq!(offset, 0);
787        assert_eq!(word.to_string(), "test123");
788        assert_eq!(cap, Capitalize::False);
789
790        let (offset, word, cap) = iter.next().unwrap();
791        assert_eq!(offset, 8);
792        assert_eq!(word.to_string(), "456");
793        assert_eq!(cap, Capitalize::False);
794
795        assert!(iter.next().is_none());
796    }
797
798    #[test]
799    fn test_word_iterator_with_quotes() {
800        let rope = Rope::from("hello \"world\"");
801        let slice = rope.byte_slice(..);
802        let mut iter = WordIterator::new(slice, 0, Default::default());
803
804        let (offset, word, cap) = iter.next().unwrap();
805        assert_eq!(offset, 0);
806        assert_eq!(word.to_string(), "hello");
807        assert_eq!(cap, Capitalize::False);
808
809        let (offset, word, cap) = iter.next().unwrap();
810        assert_eq!(offset, 7);
811        assert_eq!(word.to_string(), "world");
812        assert_eq!(cap, Capitalize::False);
813
814        assert!(iter.next().is_none());
815    }
816
817    #[test]
818    fn test_word_iterator_include_hyphen_on_bare_prefixes() {
819        let rope = Rope::from("pre- and post-world");
820        let slice = rope.byte_slice(..);
821        let mut iter = WordIterator::new(slice, 0, Default::default());
822
823        let (offset, word, _cap) = iter.next().unwrap();
824        assert_eq!(offset, 0);
825        assert_eq!(word.to_string(), "pre-");
826
827        let (offset, word, _cap) = iter.nth(1).unwrap();
828        assert_eq!(offset, 9);
829        assert_eq!(word.to_string(), "post-world");
830    }
831
832    #[test]
833    fn test_word_iterator_with_emoji() {
834        let rope = Rope::from("hello 🤝 world");
835        let slice = rope.byte_slice(..);
836        let mut iter = WordIterator::new(slice, 0, Default::default());
837
838        let (offset, word, cap) = iter.next().unwrap();
839        assert_eq!(offset, 0);
840        assert_eq!(word.to_string(), "hello");
841        assert_eq!(cap, Capitalize::False);
842
843        let (offset, word, cap) = iter.next().unwrap();
844        assert_eq!(offset, 6);
845        assert_eq!(word.to_string(), "🤝");
846        assert_eq!(cap, Capitalize::False);
847
848        let (offset, word, cap) = iter.next().unwrap();
849        assert_eq!(offset, 11);
850        assert_eq!(word.to_string(), "world");
851        assert_eq!(cap, Capitalize::False);
852
853        assert!(iter.next().is_none());
854    }
855
856    #[test]
857    fn test_word_iterator_with_cjk() {
858        let rope = Rope::from("hello 你好 world");
859        let slice = rope.byte_slice(..);
860        let mut iter = WordIterator::new(slice, 0, Default::default());
861
862        let (offset, word, cap) = iter.next().unwrap();
863        assert_eq!(offset, 0);
864        assert_eq!(word.to_string(), "hello");
865        assert_eq!(cap, Capitalize::False);
866
867        let (offset, word, cap) = iter.next().unwrap();
868        assert_eq!(offset, 6);
869        assert_eq!(word.to_string(), "你好");
870        assert_eq!(cap, Capitalize::False);
871
872        let (offset, word, cap) = iter.next().unwrap();
873        assert_eq!(offset, 13);
874        assert_eq!(word.to_string(), "world");
875        assert_eq!(cap, Capitalize::False);
876
877        assert!(iter.next().is_none());
878    }
879
880    #[test]
881    fn test_word_iterator_initial_capitalization() {
882        let rope = Rope::from("hello world");
883        let slice = rope.byte_slice(..);
884        let mut iter = WordIterator::new(
885            slice,
886            0,
887            WordIteratorOptions {
888                initial_capitalize: Capitalize::True,
889                ..Default::default()
890            },
891        );
892
893        let (offset, word, cap) = iter.next().unwrap();
894        assert_eq!(offset, 0);
895        assert_eq!(word.to_string(), "hello");
896        assert_eq!(cap, Capitalize::True);
897
898        let (offset, word, cap) = iter.next().unwrap();
899        assert_eq!(offset, 6);
900        assert_eq!(word.to_string(), "world");
901        assert_eq!(cap, Capitalize::False);
902
903        assert!(iter.next().is_none());
904    }
905
/// Words that follow a period are reported with `Capitalize::True`.
#[test]
fn test_word_iterator_subsequent_capitalization() {
    let rope = Rope::from("some thing. Sentence. World.");
    let mut words = WordIterator::new(rope.byte_slice(..), 0, Default::default());

    // `nth(2)` skips "some" and "thing" and yields the first word after a period.
    let third = words.nth(2).unwrap();
    assert_eq!(third.0, 12);
    assert_eq!(third.1.to_string(), "Sentence");
    assert_eq!(third.2, Capitalize::True);

    let fourth = words.next().unwrap();
    assert_eq!(fourth.0, 22);
    assert_eq!(fourth.1.to_string(), "World");
    assert_eq!(fourth.2, Capitalize::True);

    assert!(words.next().is_none());
}
924
/// A hyphenated compound is one word under the default options and two words
/// when `BreakOnPunctuation::Hyphen` is selected.
#[test]
fn test_word_iterator_break_on_hyphens() {
    let rope = Rope::from("hello-world");

    // Default options: the compound is yielded whole.
    let mut unbroken = WordIterator::new(rope.byte_slice(..), 0, Default::default());
    let whole = unbroken.next().unwrap();
    assert_eq!(whole.1.to_string(), "hello-world");
    assert!(unbroken.next().is_none());

    // Hyphen breaking: each side of the hyphen is yielded separately.
    let options = WordIteratorOptions {
        break_on_punctuation: BreakOnPunctuation::Hyphen,
        ..Default::default()
    };
    let mut broken = WordIterator::new(rope.byte_slice(..), 0, options);
    for (expected_offset, expected_word) in [(0, "hello"), (6, "world")] {
        let (offset, word, _cap) = broken.next().unwrap();
        assert_eq!(offset, expected_offset);
        assert_eq!(word.to_string(), expected_word);
    }

    assert!(broken.next().is_none());
}
955
/// A colon triggers `Capitalize::True` on the next word only when
/// `CapitalizeTriggerPunctuation::PlusColon` is selected.
#[test]
fn test_word_iterator_capitalize_on_colons() {
    let rope = Rope::from("hello: world");

    // Default trigger punctuation: the word after the colon is not flagged.
    let mut standard = WordIterator::new(rope.byte_slice(..), 0, Default::default());
    for expected_word in ["hello", "world"] {
        let (_offset, word, cap) = standard.next().unwrap();
        assert_eq!(word.to_string(), expected_word);
        assert_eq!(cap, Capitalize::False);
    }

    // `PlusColon`: the word after the colon is flagged.
    let options = WordIteratorOptions {
        capitalize_trigger_punctuation: CapitalizeTriggerPunctuation::PlusColon,
        ..Default::default()
    };
    let mut with_colon = WordIterator::new(rope.byte_slice(..), 0, options);
    let expected = [("hello", Capitalize::False), ("world", Capitalize::True)];
    for (expected_word, expected_cap) in expected {
        let (_offset, word, cap) = with_colon.next().unwrap();
        assert_eq!(word.to_string(), expected_word);
        assert_eq!(cap, expected_cap);
    }
}
988
/// Runs the iterator over a realistic two-sentence paragraph (contractions,
/// a possessive, a free-standing hyphen and an emoji) and checks every
/// yielded offset, word and capitalization flag.
#[test]
fn test_word_iterator_complex_sentence() {
    let rope = Rope::from(
        "Each of these open source tools are amazing, but they all had a major drawback - we couldn't use Postgres as the server's datastore. If you haven't noticed yet, our team likes Postgres a lot 😉."
    );

    // Materialize every item once, then project out the individual columns.
    let parsed: Vec<(usize, String, Capitalize)> =
        WordIterator::new(rope.byte_slice(..), 0, Default::default())
            .map(|(offset, word, cap)| (offset, word.to_string(), cap))
            .collect();

    let offsets: Vec<usize> = parsed.iter().map(|(offset, _, _)| *offset).collect();
    assert_eq!(
        offsets,
        vec![
            0, 5, 8, 14, 19, 26, 32, 36, 45, 49, 54, 58, 62, 64, 70, 81, 84, 93, 97, 106, 109,
            113, 122, 133, 136, 140, 148, 156, 161, 165, 170, 176, 185, 187, 191
        ]
    );

    let words: Vec<&str> = parsed.iter().map(|(_, word, _)| word.as_str()).collect();
    assert_eq!(
        words,
        vec![
            "Each",
            "of",
            "these",
            "open",
            "source",
            "tools",
            "are",
            "amazing",
            "but",
            "they",
            "all",
            "had",
            "a",
            "major",
            "drawback",
            "we",
            "couldn't",
            "use",
            "Postgres",
            "as",
            "the",
            "server's",
            "datastore",
            "If",
            "you",
            "haven't",
            "noticed",
            "yet",
            "our",
            "team",
            "likes",
            "Postgres",
            "a",
            "lot",
            "😉"
        ]
    );

    // Index 23 is "If", the only word that directly follows a period.
    for (idx, (_, _, cap)) in parsed.iter().enumerate() {
        let expected_cap = if idx == 23 {
            Capitalize::True
        } else {
            Capitalize::False
        };
        assert_eq!(*cap, expected_cap);
    }
}
1065
/// After one word has been consumed, `collect_remainder` returns the rest of
/// the text as a single string.
#[test]
fn test_word_iterator_collect_remainder() {
    let rope = Rope::from("hello everybody in the world");
    let mut words = WordIterator::new(rope.byte_slice(..), 0, Default::default());

    // Consume "hello" so that only the tail of the sentence is left.
    let _ = words.next();

    let expected = Some(String::from("everybody in the world"));
    assert_eq!(words.collect_remainder(), expected);
}
1078
/// Once the only word has been consumed, `collect_remainder` yields `None`.
#[test]
fn test_word_iterator_no_remainder() {
    let rope = Rope::from("hello");
    let mut words = WordIterator::new(rope.byte_slice(..), 0, Default::default());

    // Consume the single word; nothing is left afterwards.
    let _ = words.next();

    let remainder = words.collect_remainder();
    assert!(remainder.is_none());
}
1088
/// A `WordIterator` converted into a `WordIteratorExtension` over
/// `UnitIterator` yields the same items as the plain iterator would.
#[test]
fn test_word_iterator_wrapper() {
    let rope = Rope::from("hello world");
    let plain = WordIterator::new(rope.byte_slice(..), 0, Default::default());
    let mut wrapped: extras::WordIteratorExtension<'_, extras::UnitIterator> = plain.into();

    for (expected_offset, expected_word) in [(0, "hello"), (6, "world")] {
        let (offset, word, cap) = wrapped.next().unwrap();
        assert_eq!(offset, expected_offset);
        assert_eq!(word.to_string(), expected_word);
        assert_eq!(cap, Capitalize::False);
    }

    assert!(wrapped.next().is_none());
}
1108
/// Items already consumed from an extension iterator can be put back with
/// `extend_on_prefix`; the resulting iterator replays them before continuing
/// with the words that were not yet consumed.
#[test]
fn test_word_iterator_wrapper_with_prefix() {
    let rope = Rope::from("hello world keep going");

    let mut orig_iter: extras::WordIteratorExtension<'_, extras::WordIteratorPrefix> =
        WordIterator::new(rope.byte_slice(..), 0, Default::default()).into();

    // Take "hello" and "world" off the front, in order.
    let consumed = vec![orig_iter.next().unwrap(), orig_iter.next().unwrap()];

    let mut new_iter = orig_iter.extend_on_prefix(extras::WordIteratorPrefix::new(consumed));

    let expected = [(0, "hello"), (6, "world"), (12, "keep"), (17, "going")];
    for (expected_offset, expected_word) in expected {
        let item = new_iter.next().unwrap();
        assert_eq!(item.0, expected_offset);
        assert_eq!(item.1.to_string(), expected_word);
    }

    assert!(new_iter.next().is_none());
}
1137
/// Probes `is_sentence_start` at several byte offsets of a three-sentence
/// text: the first letter of each sentence must be accepted, every other
/// probed offset rejected.
#[test]
fn test_is_sentence_start() {
    let rope = Rope::from("Hello world! What a wonderful day. What's up?");

    // First letters of "Hello", "What" and "What's".
    for offset in [0, 13, 35] {
        let is_start = is_sentence_start()
            .slice(rope.byte_slice(..))
            .query_offset(offset)
            .call();
        assert!(is_start);
    }

    // 6 is the `w` of "world", 11 the `!`, 12 the space after it,
    // and 40 the `s` inside "What's".
    for offset in [6, 11, 12, 40] {
        let is_start = is_sentence_start()
            .slice(rope.byte_slice(..))
            .query_offset(offset)
            .call();
        assert!(!is_start);
    }
}
1170
/// After an ellipsis, a lowercase word is expected not to count as a sentence
/// start, while a capitalized word at the same offset is.
#[test]
fn test_is_sentence_start_handles_ellipsis() {
    let lowercase = Rope::from("Hello... world!");
    let lowercase_is_start = is_sentence_start()
        .slice(lowercase.byte_slice(..))
        .query_offset(9)
        .call();
    assert!(!lowercase_is_start);

    let capitalized = Rope::from("Hello... World!");
    let capitalized_is_start = is_sentence_start()
        .slice(capitalized.byte_slice(..))
        .query_offset(9)
        .call();
    assert!(capitalized_is_start);
}
1185
/// Pins the expected results for words that follow a run of mixed
/// terminal punctuation.
#[test]
fn test_is_sentence_start_handles_mixed_punctuation() {
    // Offset 12 is the `W` of "World", after a run of `?` and `!`.
    let exclamatory = Rope::from("Hello?!?!?! World!");
    let after_exclamations = is_sentence_start()
        .slice(exclamatory.byte_slice(..))
        .query_offset(12)
        .call();
    assert!(after_exclamations);

    // Offset 10 is the `W` of "What", after the run `.!?.`; expected to be rejected.
    let dotted = Rope::from("Hello.!?. What?");
    let after_dotted_run = is_sentence_start()
        .slice(dotted.byte_slice(..))
        .query_offset(10)
        .call();
    assert!(!after_dotted_run);
}
1200
/// Querying an empty rope must return `false` rather than panic.
#[test]
fn test_is_sentence_start_gracefully_fails_on_empty_rope() {
    let empty = Rope::from("");
    let is_start = is_sentence_start()
        .slice(empty.byte_slice(..))
        .query_offset(0)
        .call();
    assert!(!is_start);
}
1208
/// Querying an offset far past the end of the text must return `false`
/// rather than panic.
#[test]
fn test_is_sentence_start_gracefully_fails_on_out_of_bounds() {
    let rope = Rope::from("Hello");
    let is_start = is_sentence_start()
        .slice(rope.byte_slice(..))
        .query_offset(1000)
        .call();
    assert!(!is_start);
}
1216}